From 31f61ac33032ee87ea404d6d996ba2c386502a36 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Tue, 14 Apr 2026 12:10:14 -0700 Subject: bpf: Refactor dynptr mutability tracking Redefine dynptr mutability and fix inconsistency in the verifier and kfunc signatures. Dynptr mutability is at two levels. The first is the bpf_dynptr structure and the second is the memory the dynptr points to. The verifer currently tracks the mutability of the bpf_dynptr struct through helper and kfunc prototypes, where "const struct bpf_dynptr *" means the structure itself is immutable. The second level is tracked in upper bit of bpf_dynptr->size in runtime and is not changed in this patch. There are two type of inconsistency in the verfier regarding the mutability of the bpf_dynptr struct. First, there are many existing kfuncs whose prototypes are wrong. For example, bpf_dynptr_adjust() mutates a dynptr's start and offset but marks the argument as a const pointer. At the same time many other kfuncs that does not mutate the dynptr but mark themselves as mutable. Second, the verifier currently does not honor the const qualifier in kfunc prototypes as it determines whether tagging the arg_type with MEM_RDONLY or not based on the register state. Since all the verifier care is to prevent CONST_PTR_TO_DYNPTR from being destroyed in callback and global subprogram, redefine the mutability at the bpf_dynptr level to just bpf_dynptr_kern->data. Then, explicitly prohibit passing CONST_PTR_TO_DYNPTR to an argument tagged with MEM_UNINIT or OBJ_RELEASE. The mutability of a dynptr's view is not really interesting so drop MEM_RDONLY annotation for dynptr from the helpers and kfuncs. Plus, if the mutability of the entire bpf_dynptr were to be done correctly, it would kill the bpf_dynptr_adjust() usage in callback and global subporgram. Implementation wise - First, make sure all kfunc arg are correctly tagged: Tag the dynptr argument of bpf_dynptr_file_discard() with OBJ_RELEASE. - Then, in process_dynptr_func(), make sure CONST_PTR_TO_DYNPTR cannot be passed to argument tagged with MEM_UNINIT or OBJ_RELEASE. For MEM_UNINIT, it is already checked by is_dynptr_reg_valid_uninit(). For OBJ_RELEASE, check against OBJ_RELEASE instead of MEM_RDONLY and drop a now identical check in unmark_stack_slots_dynptr(). - Remove the mutual exclusive check between MEM_UNINIT and MEM_RDONLY, but don't add a MEM_UNINIT and OBJ_RELEASE version as it is obviously wrong. Note that while this patch stops following the C semantic for the mutability of bpf_dynptr, the prototype of kfuncs are still fixed to maintain the correct C semantics in the implementation. Adding or removing the const qualifier does not break backward compatibility. In addition, fix kfuncs dropping the const qualifier when casting the opaque bpf_dynptr to bpf_dynptr_kern. In test_kfunc_dynptr_param.c, initialize dynptr to 0 to avoid -Wuninitialized-const-pointer warning. Signed-off-by: Amery Hung Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/bpf/20260414191014.1218567-1-ameryhung@gmail.com Signed-off-by: Kumar Kartikeya Dwivedi --- kernel/bpf/btf.c | 2 +- kernel/bpf/helpers.c | 36 +++++++++++++------------- kernel/bpf/verifier.c | 70 ++++++++++++++------------------------------------- 3 files changed, 38 insertions(+), 70 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index a62d78581207..3c2aaa3c5004 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -7973,7 +7973,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) bpf_log(log, "arg#%d has invalid combination of tags\n", i); return -EINVAL; } - sub->args[i].arg_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY; + sub->args[i].arg_type = ARG_PTR_TO_DYNPTR; continue; } if (tags & ARG_TAG_TRUSTED) { diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 2bb60200c266..baa12b24bb64 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1944,7 +1944,7 @@ static const struct bpf_func_proto bpf_dynptr_read_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, - .arg3_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY, + .arg3_type = ARG_PTR_TO_DYNPTR, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; @@ -2001,7 +2001,7 @@ static const struct bpf_func_proto bpf_dynptr_write_proto = { .func = bpf_dynptr_write, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY, + .arg1_type = ARG_PTR_TO_DYNPTR, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE_OR_ZERO, @@ -2044,7 +2044,7 @@ static const struct bpf_func_proto bpf_dynptr_data_proto = { .func = bpf_dynptr_data, .gpl_only = false, .ret_type = RET_PTR_TO_DYNPTR_MEM_OR_NULL, - .arg1_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY, + .arg1_type = ARG_PTR_TO_DYNPTR, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_CONST_ALLOC_SIZE_OR_ZERO, }; @@ -3072,7 +3072,7 @@ __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u64 offset, return bpf_dynptr_slice(p, offset, buffer__nullable, buffer__szk); } -__bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u64 start, u64 end) +__bpf_kfunc int bpf_dynptr_adjust(struct bpf_dynptr *p, u64 start, u64 end) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; u64 size; @@ -3093,14 +3093,14 @@ __bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u64 start, u64 end __bpf_kfunc bool bpf_dynptr_is_null(const struct bpf_dynptr *p) { - struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; + const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; return !ptr->data; } __bpf_kfunc bool bpf_dynptr_is_rdonly(const struct bpf_dynptr *p) { - struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; + const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; if (!ptr->data) return false; @@ -3110,7 +3110,7 @@ __bpf_kfunc bool bpf_dynptr_is_rdonly(const struct bpf_dynptr *p) __bpf_kfunc u64 bpf_dynptr_size(const struct bpf_dynptr *p) { - struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; + const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; if (!ptr->data) return -EINVAL; @@ -3122,7 +3122,7 @@ __bpf_kfunc int bpf_dynptr_clone(const struct bpf_dynptr *p, struct bpf_dynptr *clone__uninit) { struct bpf_dynptr_kern *clone = (struct bpf_dynptr_kern *)clone__uninit; - struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; + const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; if (!ptr->data) { bpf_dynptr_set_null(clone); @@ -3145,11 +3145,11 @@ __bpf_kfunc int bpf_dynptr_clone(const struct bpf_dynptr *p, * Copies data from source dynptr to destination dynptr. * Returns 0 on success; negative error, otherwise. */ -__bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u64 dst_off, - struct bpf_dynptr *src_ptr, u64 src_off, u64 size) +__bpf_kfunc int bpf_dynptr_copy(const struct bpf_dynptr *dst_ptr, u64 dst_off, + const struct bpf_dynptr *src_ptr, u64 src_off, u64 size) { - struct bpf_dynptr_kern *dst = (struct bpf_dynptr_kern *)dst_ptr; - struct bpf_dynptr_kern *src = (struct bpf_dynptr_kern *)src_ptr; + const struct bpf_dynptr_kern *dst = (struct bpf_dynptr_kern *)dst_ptr; + const struct bpf_dynptr_kern *src = (struct bpf_dynptr_kern *)src_ptr; void *src_slice, *dst_slice; char buf[256]; u64 off; @@ -3200,9 +3200,9 @@ __bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u64 dst_off, * at @offset with the constant byte @val. * Returns 0 on success; negative error, otherwise. */ -__bpf_kfunc int bpf_dynptr_memset(struct bpf_dynptr *p, u64 offset, u64 size, u8 val) +__bpf_kfunc int bpf_dynptr_memset(const struct bpf_dynptr *p, u64 offset, u64 size, u8 val) { - struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; + const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; u64 chunk_sz, write_off; char buf[256]; void* slice; @@ -4214,13 +4214,13 @@ __bpf_kfunc void bpf_key_put(struct bpf_key *bkey) * * Return: 0 on success, a negative value on error. */ -__bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p, - struct bpf_dynptr *sig_p, +__bpf_kfunc int bpf_verify_pkcs7_signature(const struct bpf_dynptr *data_p, + const struct bpf_dynptr *sig_p, struct bpf_key *trusted_keyring) { #ifdef CONFIG_SYSTEM_DATA_VERIFICATION - struct bpf_dynptr_kern *data_ptr = (struct bpf_dynptr_kern *)data_p; - struct bpf_dynptr_kern *sig_ptr = (struct bpf_dynptr_kern *)sig_p; + const struct bpf_dynptr_kern *data_ptr = (struct bpf_dynptr_kern *)data_p; + const struct bpf_dynptr_kern *sig_ptr = (struct bpf_dynptr_kern *)sig_p; const void *data, *sig; u32 data_len, sig_len; int ret; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 69d75515ed3f..185210b73385 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -717,15 +717,6 @@ static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_re struct bpf_func_state *state = bpf_func(env, reg); int spi, ref_obj_id, i; - /* - * This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot - * be released by any dynptr helper. Hence, unmark_stack_slots_dynptr - * is safe to do directly. - */ - if (reg->type == CONST_PTR_TO_DYNPTR) { - verifier_bug(env, "CONST_PTR_TO_DYNPTR cannot be released"); - return -EFAULT; - } spi = dynptr_get_spi(env, reg); if (spi < 0) return spi; @@ -7434,23 +7425,12 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, * bytes as STACK_DYNPTR in case of PTR_TO_STACK. In case of * CONST_PTR_TO_DYNPTR, we are guaranteed to get the beginning of the object. * - * Mutability of bpf_dynptr is at two levels, one is at the level of struct - * bpf_dynptr itself, i.e. whether the helper is receiving a pointer to struct - * bpf_dynptr or pointer to const struct bpf_dynptr. In the former case, it can - * mutate the view of the dynptr and also possibly destroy it. In the latter - * case, it cannot mutate the bpf_dynptr itself but it can still mutate the - * memory that dynptr points to. - * - * The verifier will keep track both levels of mutation (bpf_dynptr's in - * reg->type and the memory's in reg->dynptr.type), but there is no support for - * readonly dynptr view yet, hence only the first case is tracked and checked. - * - * This is consistent with how C applies the const modifier to a struct object, - * where the pointer itself inside bpf_dynptr becomes const but not what it - * points to. - * - * Helpers which do not mutate the bpf_dynptr set MEM_RDONLY in their argument - * type, and declare it as 'const struct bpf_dynptr *' in their prototype. + * Mutability of bpf_dynptr is at two levels: the dynptr and the memory the + * dynptr points to. At the first level, the verifier will make sure a + * CONST_PTR_TO_DYNPTR cannot be reinitialized or destroyed. The mutability of + * a dynptr's view (i.e., start and offset) is not tracked as there is not such + * use case. The second level is tracked using the upper bit of bpf_dynptr->size + * and checked dynamically during runtime. */ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn_idx, enum bpf_arg_type arg_type, int clone_ref_obj_id) @@ -7465,14 +7445,6 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn return -EINVAL; } - /* MEM_UNINIT and MEM_RDONLY are exclusive, when applied to an - * ARG_PTR_TO_DYNPTR (or ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_*): - */ - if ((arg_type & (MEM_UNINIT | MEM_RDONLY)) == (MEM_UNINIT | MEM_RDONLY)) { - verifier_bug(env, "misconfigured dynptr helper type flags"); - return -EFAULT; - } - /* MEM_UNINIT - Points to memory that is an appropriate candidate for * constructing a mutable bpf_dynptr object. * @@ -7480,13 +7452,12 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn * pointing to a region of at least 16 bytes which doesn't * contain an existing bpf_dynptr. * - * MEM_RDONLY - Points to a initialized bpf_dynptr that will not be - * mutated or destroyed. However, the memory it points to - * may be mutated. + * OBJ_RELEASE - Points to a initialized bpf_dynptr that will be + * destroyed. * - * None - Points to a initialized dynptr that can be mutated and - * destroyed, including mutation of the memory it points - * to. + * None - Points to a initialized dynptr that cannot be + * reinitialized or destroyed. However, the view of the + * dynptr and the memory it points to may be mutated. */ if (arg_type & MEM_UNINIT) { int i; @@ -7505,10 +7476,10 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn } err = mark_stack_slots_dynptr(env, reg, arg_type, insn_idx, clone_ref_obj_id); - } else /* MEM_RDONLY and None case from above */ { + } else /* OBJ_RELEASE and None case from above */ { /* For the reg->type == PTR_TO_STACK case, bpf_dynptr is never const */ - if (reg->type == CONST_PTR_TO_DYNPTR && !(arg_type & MEM_RDONLY)) { - verbose(env, "cannot pass pointer to const bpf_dynptr, the helper mutates it\n"); + if (reg->type == CONST_PTR_TO_DYNPTR && (arg_type & OBJ_RELEASE)) { + verbose(env, "CONST_PTR_TO_DYNPTR cannot be released\n"); return -EINVAL; } @@ -7519,8 +7490,8 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn return -EINVAL; } - /* Fold modifiers (in this case, MEM_RDONLY) when checking expected type */ - if (!is_dynptr_type_expected(env, reg, arg_type & ~MEM_RDONLY)) { + /* Fold modifiers (in this case, OBJ_RELEASE) when checking expected type */ + if (!is_dynptr_type_expected(env, reg, arg_type & ~OBJ_RELEASE)) { verbose(env, "Expected a dynptr of type %s as arg #%d\n", dynptr_type_str(arg_to_dynptr_type(arg_type)), regno - 1); @@ -9366,7 +9337,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, bpf_log(log, "R%d is not a pointer to arena or scalar.\n", regno); return -EINVAL; } - } else if (arg->arg_type == (ARG_PTR_TO_DYNPTR | MEM_RDONLY)) { + } else if (arg->arg_type == ARG_PTR_TO_DYNPTR) { ret = check_func_arg_reg_off(env, reg, regno, ARG_PTR_TO_DYNPTR); if (ret) return ret; @@ -12273,9 +12244,6 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ enum bpf_arg_type dynptr_arg_type = ARG_PTR_TO_DYNPTR; int clone_ref_obj_id = 0; - if (reg->type == CONST_PTR_TO_DYNPTR) - dynptr_arg_type |= MEM_RDONLY; - if (is_kfunc_arg_uninit(btf, &args[i])) dynptr_arg_type |= MEM_UNINIT; @@ -12288,7 +12256,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_file]) { dynptr_arg_type |= DYNPTR_TYPE_FILE; } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_file_discard]) { - dynptr_arg_type |= DYNPTR_TYPE_FILE; + dynptr_arg_type |= DYNPTR_TYPE_FILE | OBJ_RELEASE; meta->release_regno = regno; } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_clone] && (dynptr_arg_type & MEM_UNINIT)) { @@ -18745,7 +18713,7 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) } else if (arg->arg_type == ARG_ANYTHING) { reg->type = SCALAR_VALUE; mark_reg_unknown(env, regs, i); - } else if (arg->arg_type == (ARG_PTR_TO_DYNPTR | MEM_RDONLY)) { + } else if (arg->arg_type == ARG_PTR_TO_DYNPTR) { /* assume unspecial LOCAL dynptr type */ __mark_dynptr_reg(reg, BPF_DYNPTR_TYPE_LOCAL, true, ++env->id_gen); } else if (base_type(arg->arg_type) == ARG_PTR_TO_MEM) { -- cgit v1.2.3 From f7a6b9eaff3e6693ba3b19c5812e28538049bbf2 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Fri, 17 Apr 2026 15:30:18 +0100 Subject: bpf: Extend BTF UAPI vlen, kinds to use unused bits BTF maximum vlen is encoded using 16 bits with a maximum vlen of 65535. This has sufficed for structs, function parameters and enumerated type values. However, with upcoming BTF location information - in particular information about inline sites - this limit is surpassed. Use bits 16-23 - currently unused in BTF info - to extend to 24 bits, giving a max vlen of (2^24 - 1), or 16 million. Also extend BTF kind encoding from 5 to 7 bits, giving a maximum available number of kinds of 128. Since with the BTF location work we use another 3 kinds, we are fast approaching the current limit of 32. Convert BTF_MAX_* values to enums to allow them to be encoded in kernel BTF; this will allow us to detect if the running kernel supports a 24-bit vlen or not. Add one for max _possible_ (not used) kind. Fix up a few places in the kernel where a 16-bit vlen is assumed; remove BTF_INFO_MASK as now all bits are used. The vlen expansion was suggested by Andrii in [1]; the kind expansion is tackled here too as it may be needed also to support new kinds in BTF. [1] https://lore.kernel.org/bpf/CAEf4BzZx=X6vGqcA8SPU6D+v6k+TR=ZewebXMuXtpmML058piw@mail.gmail.com/ Suggested-by: Andrii Nakryiko Signed-off-by: Alan Maguire Acked-by: Mykyta Yatsenko Link: https://lore.kernel.org/r/20260417143023.1551481-2-alan.maguire@oracle.com Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 4 ++-- include/uapi/linux/btf.h | 26 ++++++++++++++------------ kernel/bpf/btf.c | 27 ++++++++++----------------- tools/include/uapi/linux/btf.h | 26 ++++++++++++++------------ 4 files changed, 40 insertions(+), 43 deletions(-) (limited to 'kernel/bpf') diff --git a/include/linux/btf.h b/include/linux/btf.h index 48108471c5b1..c82d0d689059 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -415,12 +415,12 @@ static inline bool btf_type_is_array(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY; } -static inline u16 btf_type_vlen(const struct btf_type *t) +static inline u32 btf_type_vlen(const struct btf_type *t) { return BTF_INFO_VLEN(t->info); } -static inline u16 btf_vlen(const struct btf_type *t) +static inline u32 btf_vlen(const struct btf_type *t) { return btf_type_vlen(t); } diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h index 638615ebddc2..618167cab4e6 100644 --- a/include/uapi/linux/btf.h +++ b/include/uapi/linux/btf.h @@ -33,20 +33,22 @@ struct btf_header { __u32 layout_len; /* length of layout section */ }; -/* Max # of type identifier */ -#define BTF_MAX_TYPE 0x000fffff -/* Max offset into the string section */ -#define BTF_MAX_NAME_OFFSET 0x00ffffff -/* Max # of struct/union/enum members or func args */ -#define BTF_MAX_VLEN 0xffff +enum btf_max { + /* Max possible kind */ + BTF_MAX_KIND = 0x0000007f, + /* Max # of type identifier */ + BTF_MAX_TYPE = 0x000fffff, + /* Max offset into the string section */ + BTF_MAX_NAME_OFFSET = 0x00ffffff, + /* Max # of struct/union/enum members or func args */ + BTF_MAX_VLEN = 0x00ffffff, +}; struct btf_type { __u32 name_off; /* "info" bits arrangement - * bits 0-15: vlen (e.g. # of struct's members) - * bits 16-23: unused - * bits 24-28: kind (e.g. int, ptr, array...etc) - * bits 29-30: unused + * bits 0-23: vlen (e.g. # of struct's members) + * bits 24-30: kind (e.g. int, ptr, array...etc) * bit 31: kind_flag, currently used by * struct, union, enum, fwd, enum64, * decl_tag and type_tag @@ -65,8 +67,8 @@ struct btf_type { }; }; -#define BTF_INFO_KIND(info) (((info) >> 24) & 0x1f) -#define BTF_INFO_VLEN(info) ((info) & 0xffff) +#define BTF_INFO_KIND(info) (((info) >> 24) & 0x7f) +#define BTF_INFO_VLEN(info) ((info) & 0xffffff) #define BTF_INFO_KFLAG(info) ((info) >> 31) enum { diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 3c2aaa3c5004..77af44d8a3ad 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -182,7 +182,6 @@ #define BITS_ROUNDUP_BYTES(bits) \ (BITS_ROUNDDOWN_BYTES(bits) + !!BITS_PER_BYTE_MASKED(bits)) -#define BTF_INFO_MASK 0x9f00ffff #define BTF_INT_MASK 0x0fffffff #define BTF_TYPE_ID_VALID(type_id) ((type_id) <= BTF_MAX_TYPE) #define BTF_STR_OFFSET_VALID(name_off) ((name_off) <= BTF_MAX_NAME_OFFSET) @@ -289,7 +288,7 @@ enum verifier_phase { struct resolve_vertex { const struct btf_type *t; u32 type_id; - u16 next_member; + u32 next_member; }; enum visit_state { @@ -2031,7 +2030,7 @@ static int env_stack_push(struct btf_verifier_env *env, } static void env_stack_set_next_member(struct btf_verifier_env *env, - u16 next_member) + u32 next_member) { env->stack[env->top_stack - 1].next_member = next_member; } @@ -3293,7 +3292,7 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, struct btf *btf = env->btf; u32 struct_size = t->size; u32 offset; - u16 i; + u32 i; meta_needed = btf_type_vlen(t) * sizeof(*member); if (meta_left < meta_needed) { @@ -3369,7 +3368,7 @@ static int btf_struct_resolve(struct btf_verifier_env *env, { const struct btf_member *member; int err; - u16 i; + u32 i; /* Before continue resolving the next_member, * ensure the last member is indeed resolved to a @@ -4447,7 +4446,7 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env, const struct btf_enum *enums = btf_type_enum(t); struct btf *btf = env->btf; const char *fmt_str; - u16 i, nr_enums; + u32 i, nr_enums; u32 meta_needed; nr_enums = btf_type_vlen(t); @@ -4555,7 +4554,7 @@ static s32 btf_enum64_check_meta(struct btf_verifier_env *env, const struct btf_enum64 *enums = btf_type_enum64(t); struct btf *btf = env->btf; const char *fmt_str; - u16 i, nr_enums; + u32 i, nr_enums; u32 meta_needed; nr_enums = btf_type_vlen(t); @@ -4683,7 +4682,7 @@ static void btf_func_proto_log(struct btf_verifier_env *env, const struct btf_type *t) { const struct btf_param *args = (const struct btf_param *)(t + 1); - u16 nr_args = btf_type_vlen(t), i; + u32 nr_args = btf_type_vlen(t), i; btf_verifier_log(env, "return=%u args=(", t->type); if (!nr_args) { @@ -4929,7 +4928,7 @@ static int btf_datasec_resolve(struct btf_verifier_env *env, { const struct btf_var_secinfo *vsi; struct btf *btf = env->btf; - u16 i; + u32 i; env->resolve_mode = RESOLVE_TBD; for_each_vsi_from(i, v->next_member, v->t, vsi) { @@ -5183,7 +5182,7 @@ static int btf_func_proto_check(struct btf_verifier_env *env, const struct btf_type *ret_type; const struct btf_param *args; const struct btf *btf; - u16 nr_args, i; + u32 nr_args, i; int err; btf = env->btf; @@ -5278,7 +5277,7 @@ static int btf_func_check(struct btf_verifier_env *env, const struct btf_type *proto_type; const struct btf_param *args; const struct btf *btf; - u16 nr_args, i; + u32 nr_args, i; btf = env->btf; proto_type = btf_type_by_id(btf, t->type); @@ -5336,12 +5335,6 @@ static s32 btf_check_meta(struct btf_verifier_env *env, } meta_left -= sizeof(*t); - if (t->info & ~BTF_INFO_MASK) { - btf_verifier_log(env, "[%u] Invalid btf_info:%x", - env->log_type_id, t->info); - return -EINVAL; - } - if (BTF_INFO_KIND(t->info) > BTF_KIND_MAX || BTF_INFO_KIND(t->info) == BTF_KIND_UNKN) { btf_verifier_log(env, "[%u] Invalid kind:%u", diff --git a/tools/include/uapi/linux/btf.h b/tools/include/uapi/linux/btf.h index 638615ebddc2..618167cab4e6 100644 --- a/tools/include/uapi/linux/btf.h +++ b/tools/include/uapi/linux/btf.h @@ -33,20 +33,22 @@ struct btf_header { __u32 layout_len; /* length of layout section */ }; -/* Max # of type identifier */ -#define BTF_MAX_TYPE 0x000fffff -/* Max offset into the string section */ -#define BTF_MAX_NAME_OFFSET 0x00ffffff -/* Max # of struct/union/enum members or func args */ -#define BTF_MAX_VLEN 0xffff +enum btf_max { + /* Max possible kind */ + BTF_MAX_KIND = 0x0000007f, + /* Max # of type identifier */ + BTF_MAX_TYPE = 0x000fffff, + /* Max offset into the string section */ + BTF_MAX_NAME_OFFSET = 0x00ffffff, + /* Max # of struct/union/enum members or func args */ + BTF_MAX_VLEN = 0x00ffffff, +}; struct btf_type { __u32 name_off; /* "info" bits arrangement - * bits 0-15: vlen (e.g. # of struct's members) - * bits 16-23: unused - * bits 24-28: kind (e.g. int, ptr, array...etc) - * bits 29-30: unused + * bits 0-23: vlen (e.g. # of struct's members) + * bits 24-30: kind (e.g. int, ptr, array...etc) * bit 31: kind_flag, currently used by * struct, union, enum, fwd, enum64, * decl_tag and type_tag @@ -65,8 +67,8 @@ struct btf_type { }; }; -#define BTF_INFO_KIND(info) (((info) >> 24) & 0x1f) -#define BTF_INFO_VLEN(info) ((info) & 0xffff) +#define BTF_INFO_KIND(info) (((info) >> 24) & 0x7f) +#define BTF_INFO_VLEN(info) ((info) & 0xffffff) #define BTF_INFO_KFLAG(info) ((info) >> 31) enum { -- cgit v1.2.3 From 8cfb77d3092052b52582e804e644202e2b10167a Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Wed, 22 Apr 2026 12:41:09 -0700 Subject: bpf: Verifier support for sleepable tracepoint programs Allow BPF_PROG_TYPE_RAW_TRACEPOINT, BPF_PROG_TYPE_TRACEPOINT, and BPF_TRACE_RAW_TP (tp_btf) programs to be sleepable by adding them to can_be_sleepable(). For BTF-based raw tracepoints (tp_btf), add a load-time check in bpf_check_attach_target() that rejects sleepable programs attaching to non-faultable tracepoints with a descriptive error message. For raw tracepoints (raw_tp), add an attach-time check in bpf_raw_tp_link_attach() that rejects sleepable programs on non-faultable tracepoints. The attach-time check is needed because the tracepoint name is not known at load time for raw_tp. The attach-time check for classic tracepoints (tp) in __perf_event_set_bpf_prog() was added in the previous patch. Replace the verbose error message that enumerates allowed program types with a generic "Program of this type cannot be sleepable" message, since the list of sleepable-capable types keeps growing. Signed-off-by: Mykyta Yatsenko Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/bpf/20260422-sleepable_tracepoints-v13-4-99005dff21ef@meta.com Signed-off-by: Kumar Kartikeya Dwivedi --- kernel/bpf/syscall.c | 5 +++++ kernel/bpf/verifier.c | 13 +++++++++++-- 2 files changed, 16 insertions(+), 2 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a3c0214ca934..3b1f0ba02f61 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4281,6 +4281,11 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog, if (!btp) return -ENOENT; + if (prog->sleepable && !tracepoint_is_faultable(btp->tp)) { + bpf_put_raw_tracepoint(btp); + return -EINVAL; + } + link = kzalloc_obj(*link, GFP_USER); if (!link) { err = -ENOMEM; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 185210b73385..5b4806fdb648 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -19267,6 +19267,12 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, btp = bpf_get_raw_tracepoint(tname); if (!btp) return -EINVAL; + if (prog->sleepable && !tracepoint_is_faultable(btp->tp)) { + bpf_log(log, "Sleepable program cannot attach to non-faultable tracepoint %s\n", + tname); + bpf_put_raw_tracepoint(btp); + return -EINVAL; + } fname = kallsyms_lookup((unsigned long)btp->bpf_func, NULL, NULL, NULL, trace_symbol); bpf_put_raw_tracepoint(btp); @@ -19483,6 +19489,7 @@ static bool can_be_sleepable(struct bpf_prog *prog) case BPF_MODIFY_RETURN: case BPF_TRACE_ITER: case BPF_TRACE_FSESSION: + case BPF_TRACE_RAW_TP: return true; default: return false; @@ -19490,7 +19497,9 @@ static bool can_be_sleepable(struct bpf_prog *prog) } return prog->type == BPF_PROG_TYPE_LSM || prog->type == BPF_PROG_TYPE_KPROBE /* only for uprobes */ || - prog->type == BPF_PROG_TYPE_STRUCT_OPS; + prog->type == BPF_PROG_TYPE_STRUCT_OPS || + prog->type == BPF_PROG_TYPE_RAW_TRACEPOINT || + prog->type == BPF_PROG_TYPE_TRACEPOINT; } static int check_attach_btf_id(struct bpf_verifier_env *env) @@ -19512,7 +19521,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) } if (prog->sleepable && !can_be_sleepable(prog)) { - verbose(env, "Only fentry/fexit/fsession/fmod_ret, lsm, iter, uprobe, and struct_ops programs can be sleepable\n"); + verbose(env, "Program of this type cannot be sleepable\n"); return -EINVAL; } -- cgit v1.2.3 From a7088176d8299ff74276a89dbdef3c5ce8748eeb Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 22 Apr 2026 20:34:30 -0700 Subject: bpf: Remove unused parameter from check_map_kptr_access() The parameter 'regno' in check_map_kptr_access() is unused. Remove it. Acked-by: Puranjay Mohan Acked-by: Kumar Kartikeya Dwivedi Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260423033430.2537615-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5b4806fdb648..6118743d87e6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4710,7 +4710,7 @@ static int mark_uptr_ld_reg(struct bpf_verifier_env *env, u32 regno, return 0; } -static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, +static int check_map_kptr_access(struct bpf_verifier_env *env, int value_regno, int insn_idx, struct btf_field *kptr_field) { @@ -6357,7 +6357,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn kptr_field = btf_record_find(reg->map_ptr->record, off + reg->var_off.value, BPF_KPTR | BPF_UPTR); if (kptr_field) { - err = check_map_kptr_access(env, regno, value_regno, insn_idx, kptr_field); + err = check_map_kptr_access(env, value_regno, insn_idx, kptr_field); } else if (t == BPF_READ && value_regno >= 0) { struct bpf_map *map = reg->map_ptr; -- cgit v1.2.3 From 54c27ea6dadbe932a955bf40b7837947c5c202e1 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 22 Apr 2026 20:34:35 -0700 Subject: bpf: Fix tail_call_reachable leak In check_max_stack_depth_subprog(), the local variable tail_call_reachable is set when entering a callee that has a tail call, but never reset when popping back to the parent. This causes the flag to leak across sibling subprogs in the DFS traversal. This results in unnecessary JIT overhead: the JIT emits tail call counter preservation code for subprogs that can never be reached via a tail call path. Fix this by resetting tail_call_reachable to the parent's actual per-subprog flag when popping a frame. If the parent was already marked tail_call_reachable by a previous sibling's traversal, the local variable stays true. Otherwise it resets to false, so subsequent siblings start with a clean state. Acked-by: Kumar Kartikeya Dwivedi Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260423033435.2538013-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6118743d87e6..26b6cdfd8613 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5490,6 +5490,9 @@ continue_func: frame = dinfo[idx].frame; i = dinfo[idx].ret_insn; + /* reset tail_call_reachable to the parent's actual state */ + tail_call_reachable = subprog[idx].tail_call_reachable; + goto continue_func; } -- cgit v1.2.3 From 8a16c5b2b22ff22c1a2e3ff92cc991990efceb38 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 22 Apr 2026 20:34:41 -0700 Subject: bpf: Remove WARN_ON_ONCE in check_kfunc_mem_size_reg() The warning is too late if it does happen. Remove it. Acked-by: Kumar Kartikeya Dwivedi Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260423033441.2538149-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 26b6cdfd8613..d123449f5552 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7134,8 +7134,6 @@ static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg struct bpf_call_arg_meta meta; int err; - WARN_ON_ONCE(regno < BPF_REG_2 || regno > BPF_REG_5); - memset(&meta, 0, sizeof(meta)); if (may_be_null) { -- cgit v1.2.3 From 6a581b856c9e633c615483ad53910cba8cacbf28 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 22 Apr 2026 20:34:46 -0700 Subject: bpf: Refactor to avoid redundant calculation of bpf_reg_state In many cases, once a bpf_reg_state is defined, it can pass to callee's. Otherwise, callee will need to get bpf_reg_state again based on regno. More importantly, this is needed for later stack arguments for kfuncs since the register state for stack arguments does not have a corresponding regno. So it makes sense to pass reg state for callee's. The following is the only change to avoid compilation warning: static int sanitize_check_bounds(struct bpf_verifier_env *env, const struct bpf_insn *insn, - const struct bpf_reg_state *dst_reg) + struct bpf_reg_state *dst_reg) Acked-by: Puranjay Mohan Acked-by: Kumar Kartikeya Dwivedi Reviewed-by: Amery Hung Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260423033446.2538321-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 223 ++++++++++++++++++++++---------------------------- 1 file changed, 98 insertions(+), 125 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d123449f5552..debae6133e4c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3908,7 +3908,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, return 0; } -/* Write the stack: 'stack[ptr_regno + off] = value_regno'. 'ptr_regno' is +/* Write the stack: 'stack[ptr_reg + off] = value_regno'. 'ptr_reg' is * known to contain a variable offset. * This function checks whether the write is permitted and conservatively * tracks the effects of the write, considering that each stack slot in the @@ -3929,13 +3929,13 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, static int check_stack_write_var_off(struct bpf_verifier_env *env, /* func where register points to */ struct bpf_func_state *state, - int ptr_regno, int off, int size, + struct bpf_reg_state *ptr_reg, int off, int size, int value_regno, int insn_idx) { struct bpf_func_state *cur; /* state of the current function */ int min_off, max_off; int i, err; - struct bpf_reg_state *ptr_reg = NULL, *value_reg = NULL; + struct bpf_reg_state *value_reg = NULL; struct bpf_insn *insn = &env->prog->insnsi[insn_idx]; bool writing_zero = false; /* set if the fact that we're writing a zero is used to let any @@ -3944,7 +3944,6 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env, bool zero_used = false; cur = env->cur_state->frame[env->cur_state->curframe]; - ptr_reg = &cur->regs[ptr_regno]; min_off = ptr_reg->smin_value + off; max_off = ptr_reg->smax_value + off + size; if (value_regno >= 0) @@ -4241,7 +4240,7 @@ enum bpf_access_src { ACCESS_HELPER = 2, /* the access is performed by a helper */ }; -static int check_stack_range_initialized(struct bpf_verifier_env *env, +static int check_stack_range_initialized(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int regno, int off, int access_size, bool zero_size_allowed, enum bpf_access_type type, @@ -4252,31 +4251,29 @@ static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno) return cur_regs(env) + regno; } -/* Read the stack at 'ptr_regno + off' and put the result into the register +/* Read the stack at 'reg + off' and put the result into the register * 'dst_regno'. - * 'off' includes the pointer register's fixed offset(i.e. 'ptr_regno.off'), + * 'off' includes the pointer register's fixed offset(i.e. 'reg->off'), * but not its variable offset. * 'size' is assumed to be <= reg size and the access is assumed to be aligned. * * As opposed to check_stack_read_fixed_off, this function doesn't deal with * filling registers (i.e. reads of spilled register cannot be detected when * the offset is not fixed). We conservatively mark 'dst_regno' as containing - * SCALAR_VALUE. That's why we assert that the 'ptr_regno' has a variable + * SCALAR_VALUE. That's why we assert that the 'reg' has a variable * offset; for a fixed offset check_stack_read_fixed_off should be used * instead. */ -static int check_stack_read_var_off(struct bpf_verifier_env *env, +static int check_stack_read_var_off(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int ptr_regno, int off, int size, int dst_regno) { - /* The state of the source register. */ - struct bpf_reg_state *reg = reg_state(env, ptr_regno); struct bpf_func_state *ptr_state = bpf_func(env, reg); int err; int min_off, max_off; /* Note that we pass a NULL meta, so raw access will not be permitted. */ - err = check_stack_range_initialized(env, ptr_regno, off, size, + err = check_stack_range_initialized(env, reg, ptr_regno, off, size, false, BPF_READ, NULL); if (err) return err; @@ -4298,10 +4295,9 @@ static int check_stack_read_var_off(struct bpf_verifier_env *env, * can be -1, meaning that the read value is not going to a register. */ static int check_stack_read(struct bpf_verifier_env *env, - int ptr_regno, int off, int size, + struct bpf_reg_state *reg, int ptr_regno, int off, int size, int dst_regno) { - struct bpf_reg_state *reg = reg_state(env, ptr_regno); struct bpf_func_state *state = bpf_func(env, reg); int err; /* Some accesses are only permitted with a static offset. */ @@ -4337,7 +4333,7 @@ static int check_stack_read(struct bpf_verifier_env *env, * than fixed offset ones. Note that dst_regno >= 0 on this * branch. */ - err = check_stack_read_var_off(env, ptr_regno, off, size, + err = check_stack_read_var_off(env, reg, ptr_regno, off, size, dst_regno); } return err; @@ -4347,17 +4343,16 @@ static int check_stack_read(struct bpf_verifier_env *env, /* check_stack_write dispatches to check_stack_write_fixed_off or * check_stack_write_var_off. * - * 'ptr_regno' is the register used as a pointer into the stack. + * 'reg' is the register used as a pointer into the stack. * 'value_regno' is the register whose value we're writing to the stack. It can * be -1, meaning that we're not writing from a register. * * The caller must ensure that the offset falls within the maximum stack size. */ static int check_stack_write(struct bpf_verifier_env *env, - int ptr_regno, int off, int size, + struct bpf_reg_state *reg, int off, int size, int value_regno, int insn_idx) { - struct bpf_reg_state *reg = reg_state(env, ptr_regno); struct bpf_func_state *state = bpf_func(env, reg); int err; @@ -4370,16 +4365,15 @@ static int check_stack_write(struct bpf_verifier_env *env, * than fixed offset ones. */ err = check_stack_write_var_off(env, state, - ptr_regno, off, size, + reg, off, size, value_regno, insn_idx); } return err; } -static int check_map_access_type(struct bpf_verifier_env *env, u32 regno, +static int check_map_access_type(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int off, int size, enum bpf_access_type type) { - struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_map *map = reg->map_ptr; u32 cap = bpf_map_flags_to_cap(map); @@ -4399,17 +4393,15 @@ static int check_map_access_type(struct bpf_verifier_env *env, u32 regno, } /* check read/write into memory region (e.g., map value, ringbuf sample, etc) */ -static int __check_mem_access(struct bpf_verifier_env *env, int regno, +static int __check_mem_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int regno, int off, int size, u32 mem_size, bool zero_size_allowed) { bool size_ok = size > 0 || (size == 0 && zero_size_allowed); - struct bpf_reg_state *reg; if (off >= 0 && size_ok && (u64)off + size <= mem_size) return 0; - reg = &cur_regs(env)[regno]; switch (reg->type) { case PTR_TO_MAP_KEY: verbose(env, "invalid access to map key, key_size=%d off=%d size=%d\n", @@ -4439,13 +4431,10 @@ static int __check_mem_access(struct bpf_verifier_env *env, int regno, } /* check read/write into a memory region with possible variable offset */ -static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno, +static int check_mem_region_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, int off, int size, u32 mem_size, bool zero_size_allowed) { - struct bpf_verifier_state *vstate = env->cur_state; - struct bpf_func_state *state = vstate->frame[vstate->curframe]; - struct bpf_reg_state *reg = &state->regs[regno]; int err; /* We may have adjusted the register pointing to memory region, so we @@ -4466,7 +4455,7 @@ static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno, regno); return -EACCES; } - err = __check_mem_access(env, regno, reg->smin_value + off, size, + err = __check_mem_access(env, reg, regno, reg->smin_value + off, size, mem_size, zero_size_allowed); if (err) { verbose(env, "R%d min value is outside of the allowed memory range\n", @@ -4483,7 +4472,7 @@ static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno, regno); return -EACCES; } - err = __check_mem_access(env, regno, reg->umax_value + off, size, + err = __check_mem_access(env, reg, regno, reg->umax_value + off, size, mem_size, zero_size_allowed); if (err) { verbose(env, "R%d max value is outside of the allowed memory range\n", @@ -4787,19 +4776,16 @@ static u32 map_mem_size(const struct bpf_map *map) } /* check read/write into a map element with possible variable offset */ -static int check_map_access(struct bpf_verifier_env *env, u32 regno, +static int check_map_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, int off, int size, bool zero_size_allowed, enum bpf_access_src src) { - struct bpf_verifier_state *vstate = env->cur_state; - struct bpf_func_state *state = vstate->frame[vstate->curframe]; - struct bpf_reg_state *reg = &state->regs[regno]; struct bpf_map *map = reg->map_ptr; u32 mem_size = map_mem_size(map); struct btf_record *rec; int err, i; - err = check_mem_region_access(env, regno, off, size, mem_size, zero_size_allowed); + err = check_mem_region_access(env, reg, regno, off, size, mem_size, zero_size_allowed); if (err) return err; @@ -4895,10 +4881,9 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, } } -static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, +static int check_packet_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, int off, int size, bool zero_size_allowed) { - struct bpf_reg_state *reg = reg_state(env, regno); int err; if (reg->range < 0) { @@ -4906,7 +4891,7 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, return -EINVAL; } - err = check_mem_region_access(env, regno, off, size, reg->range, zero_size_allowed); + err = check_mem_region_access(env, reg, regno, off, size, reg->range, zero_size_allowed); if (err) return err; @@ -4961,7 +4946,7 @@ static int __check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int of return -EACCES; } -static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, u32 regno, +static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, struct bpf_reg_state *reg, u32 regno, int off, int access_size, enum bpf_access_type t, struct bpf_insn_access_aux *info) { @@ -4971,12 +4956,10 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, u32 regn */ bool var_off_ok = is_var_ctx_off_allowed(env->prog); bool fixed_off_ok = !env->ops->convert_ctx_access; - struct bpf_reg_state *regs = cur_regs(env); - struct bpf_reg_state *reg = regs + regno; int err; if (var_off_ok) - err = check_mem_region_access(env, regno, off, access_size, U16_MAX, false); + err = check_mem_region_access(env, reg, regno, off, access_size, U16_MAX, false); else err = __check_ptr_off_reg(env, reg, regno, fixed_off_ok); if (err) @@ -5002,10 +4985,9 @@ static int check_flow_keys_access(struct bpf_verifier_env *env, int off, } static int check_sock_access(struct bpf_verifier_env *env, int insn_idx, - u32 regno, int off, int size, + struct bpf_reg_state *reg, u32 regno, int off, int size, enum bpf_access_type t) { - struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_insn_access_aux info = {}; bool valid; @@ -5971,12 +5953,11 @@ static bool type_is_trusted_or_null(struct bpf_verifier_env *env, } static int check_ptr_to_btf_access(struct bpf_verifier_env *env, - struct bpf_reg_state *regs, + struct bpf_reg_state *regs, struct bpf_reg_state *reg, int regno, int off, int size, enum bpf_access_type atype, int value_regno) { - struct bpf_reg_state *reg = regs + regno; const struct btf_type *t = btf_type_by_id(reg->btf, reg->btf_id); const char *tname = btf_name_by_offset(reg->btf, t->name_off); const char *field_name = NULL; @@ -6128,12 +6109,11 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, } static int check_ptr_to_map_access(struct bpf_verifier_env *env, - struct bpf_reg_state *regs, + struct bpf_reg_state *regs, struct bpf_reg_state *reg, int regno, int off, int size, enum bpf_access_type atype, int value_regno) { - struct bpf_reg_state *reg = regs + regno; struct bpf_map *map = reg->map_ptr; struct bpf_reg_state map_reg; enum bpf_type_flag flag = 0; @@ -6222,11 +6202,10 @@ static int check_stack_slot_within_bounds(struct bpf_verifier_env *env, * 'off' includes `regno->offset`, but not its dynamic part (if any). */ static int check_stack_access_within_bounds( - struct bpf_verifier_env *env, + struct bpf_verifier_env *env, struct bpf_reg_state *reg, int regno, int off, int access_size, enum bpf_access_type type) { - struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_func_state *state = bpf_func(env, reg); s64 min_off, max_off; int err; @@ -6314,12 +6293,11 @@ static void add_scalar_to_reg(struct bpf_reg_state *dst_reg, s64 val) * if t==write && value_regno==-1, some unknown value is stored into memory * if t==read && value_regno==-1, don't care what we read from memory */ -static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regno, +static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, struct bpf_reg_state *reg, u32 regno, int off, int bpf_size, enum bpf_access_type t, int value_regno, bool strict_alignment_once, bool is_ldsx) { struct bpf_reg_state *regs = cur_regs(env); - struct bpf_reg_state *reg = regs + regno; int size, err = 0; size = bpf_size_to_bytes(bpf_size); @@ -6336,7 +6314,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn return -EACCES; } - err = check_mem_region_access(env, regno, off, size, + err = check_mem_region_access(env, reg, regno, off, size, reg->map_ptr->key_size, false); if (err) return err; @@ -6350,10 +6328,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn verbose(env, "R%d leaks addr into map\n", value_regno); return -EACCES; } - err = check_map_access_type(env, regno, off, size, t); + err = check_map_access_type(env, reg, off, size, t); if (err) return err; - err = check_map_access(env, regno, off, size, false, ACCESS_DIRECT); + err = check_map_access(env, reg, regno, off, size, false, ACCESS_DIRECT); if (err) return err; if (tnum_is_const(reg->var_off)) @@ -6422,7 +6400,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn * instructions, hence no need to check bounds in that case. */ if (!rdonly_untrusted) - err = check_mem_region_access(env, regno, off, size, + err = check_mem_region_access(env, reg, regno, off, size, reg->mem_size, false); if (!err && value_regno >= 0 && (t == BPF_READ || rdonly_mem)) mark_reg_unknown(env, regs, value_regno); @@ -6440,7 +6418,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn return -EACCES; } - err = check_ctx_access(env, insn_idx, regno, off, size, t, &info); + err = check_ctx_access(env, insn_idx, reg, regno, off, size, t, &info); if (!err && t == BPF_READ && value_regno >= 0) { /* ctx access returns either a scalar, or a * PTR_TO_PACKET[_META,_END]. In the latter @@ -6477,15 +6455,15 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } else if (reg->type == PTR_TO_STACK) { /* Basic bounds checks. */ - err = check_stack_access_within_bounds(env, regno, off, size, t); + err = check_stack_access_within_bounds(env, reg, regno, off, size, t); if (err) return err; if (t == BPF_READ) - err = check_stack_read(env, regno, off, size, + err = check_stack_read(env, reg, regno, off, size, value_regno); else - err = check_stack_write(env, regno, off, size, + err = check_stack_write(env, reg, off, size, value_regno, insn_idx); } else if (reg_is_pkt_pointer(reg)) { if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) { @@ -6498,7 +6476,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn value_regno); return -EACCES; } - err = check_packet_access(env, regno, off, size, false); + err = check_packet_access(env, reg, regno, off, size, false); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_FLOW_KEYS) { @@ -6518,7 +6496,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn regno, reg_type_str(env, reg->type)); return -EACCES; } - err = check_sock_access(env, insn_idx, regno, off, size, t); + err = check_sock_access(env, insn_idx, reg, regno, off, size, t); if (!err && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_TP_BUFFER) { @@ -6527,10 +6505,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn mark_reg_unknown(env, regs, value_regno); } else if (base_type(reg->type) == PTR_TO_BTF_ID && !type_may_be_null(reg->type)) { - err = check_ptr_to_btf_access(env, regs, regno, off, size, t, + err = check_ptr_to_btf_access(env, regs, reg, regno, off, size, t, value_regno); } else if (reg->type == CONST_PTR_TO_MAP) { - err = check_ptr_to_map_access(env, regs, regno, off, size, t, + err = check_ptr_to_map_access(env, regs, reg, regno, off, size, t, value_regno); } else if (base_type(reg->type) == PTR_TO_BUF && !type_may_be_null(reg->type)) { @@ -6599,7 +6577,7 @@ static int check_load_mem(struct bpf_verifier_env *env, struct bpf_insn *insn, /* Check if (src_reg + off) is readable. The state of dst_reg will be * updated by this call. */ - err = check_mem_access(env, env->insn_idx, insn->src_reg, insn->off, + err = check_mem_access(env, env->insn_idx, regs + insn->src_reg, insn->src_reg, insn->off, BPF_SIZE(insn->code), BPF_READ, insn->dst_reg, strict_alignment_once, is_ldsx); err = err ?: save_aux_ptr_type(env, src_reg_type, @@ -6629,7 +6607,7 @@ static int check_store_reg(struct bpf_verifier_env *env, struct bpf_insn *insn, dst_reg_type = regs[insn->dst_reg].type; /* Check if (dst_reg + off) is writeable. */ - err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off, + err = check_mem_access(env, env->insn_idx, regs + insn->dst_reg, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, insn->src_reg, strict_alignment_once, false); err = err ?: save_aux_ptr_type(env, dst_reg_type, false); @@ -6640,6 +6618,7 @@ static int check_store_reg(struct bpf_verifier_env *env, struct bpf_insn *insn, static int check_atomic_rmw(struct bpf_verifier_env *env, struct bpf_insn *insn) { + struct bpf_reg_state *dst_reg; int load_reg; int err; @@ -6701,13 +6680,15 @@ static int check_atomic_rmw(struct bpf_verifier_env *env, load_reg = -1; } + dst_reg = cur_regs(env) + insn->dst_reg; + /* Check whether we can read the memory, with second call for fetch * case to simulate the register fill. */ - err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off, + err = check_mem_access(env, env->insn_idx, dst_reg, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_READ, -1, true, false); if (!err && load_reg >= 0) - err = check_mem_access(env, env->insn_idx, insn->dst_reg, + err = check_mem_access(env, env->insn_idx, dst_reg, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_READ, load_reg, true, false); if (err) @@ -6719,7 +6700,7 @@ static int check_atomic_rmw(struct bpf_verifier_env *env, return err; } /* Check whether we can write into the same memory. */ - err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off, + err = check_mem_access(env, env->insn_idx, dst_reg, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, -1, true, false); if (err) return err; @@ -6808,11 +6789,10 @@ static int check_atomic(struct bpf_verifier_env *env, struct bpf_insn *insn) * read offsets are marked as read. */ static int check_stack_range_initialized( - struct bpf_verifier_env *env, int regno, int off, + struct bpf_verifier_env *env, struct bpf_reg_state *reg, int regno, int off, int access_size, bool zero_size_allowed, enum bpf_access_type type, struct bpf_call_arg_meta *meta) { - struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_func_state *state = bpf_func(env, reg); int err, min_off, max_off, i, j, slot, spi; /* Some accesses can write anything into the stack, others are @@ -6834,7 +6814,7 @@ static int check_stack_range_initialized( return -EACCES; } - err = check_stack_access_within_bounds(env, regno, off, access_size, type); + err = check_stack_access_within_bounds(env, reg, regno, off, access_size, type); if (err) return err; @@ -6965,7 +6945,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, switch (base_type(reg->type)) { case PTR_TO_PACKET: case PTR_TO_PACKET_META: - return check_packet_access(env, regno, 0, access_size, + return check_packet_access(env, reg, regno, 0, access_size, zero_size_allowed); case PTR_TO_MAP_KEY: if (access_type == BPF_WRITE) { @@ -6973,12 +6953,12 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, reg_type_str(env, reg->type)); return -EACCES; } - return check_mem_region_access(env, regno, 0, access_size, + return check_mem_region_access(env, reg, regno, 0, access_size, reg->map_ptr->key_size, false); case PTR_TO_MAP_VALUE: - if (check_map_access_type(env, regno, 0, access_size, access_type)) + if (check_map_access_type(env, reg, 0, access_size, access_type)) return -EACCES; - return check_map_access(env, regno, 0, access_size, + return check_map_access(env, reg, regno, 0, access_size, zero_size_allowed, ACCESS_HELPER); case PTR_TO_MEM: if (type_is_rdonly_mem(reg->type)) { @@ -6988,7 +6968,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return -EACCES; } } - return check_mem_region_access(env, regno, 0, + return check_mem_region_access(env, reg, regno, 0, access_size, reg->mem_size, zero_size_allowed); case PTR_TO_BUF: @@ -7008,16 +6988,16 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, max_access); case PTR_TO_STACK: return check_stack_range_initialized( - env, + env, reg, regno, 0, access_size, zero_size_allowed, access_type, meta); case PTR_TO_BTF_ID: - return check_ptr_to_btf_access(env, regs, regno, 0, + return check_ptr_to_btf_access(env, regs, reg, regno, 0, access_size, BPF_READ, -1); case PTR_TO_CTX: /* Only permit reading or writing syscall context using helper calls. */ if (is_var_ctx_off_allowed(env->prog)) { - int err = check_mem_region_access(env, regno, 0, access_size, U16_MAX, + int err = check_mem_region_access(env, reg, regno, 0, access_size, U16_MAX, zero_size_allowed); if (err) return err; @@ -7178,11 +7158,10 @@ enum { * env->cur_state->active_locks remembers which map value element or allocated * object got locked and clears it after bpf_spin_unlock. */ -static int process_spin_lock(struct bpf_verifier_env *env, int regno, int flags) +static int process_spin_lock(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int regno, int flags) { bool is_lock = flags & PROCESS_SPIN_LOCK, is_res_lock = flags & PROCESS_RES_LOCK; const char *lock_str = is_res_lock ? "bpf_res_spin" : "bpf_spin"; - struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_verifier_state *cur = env->cur_state; bool is_const = tnum_is_const(reg->var_off); bool is_irq = flags & PROCESS_LOCK_IRQ; @@ -7295,11 +7274,10 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, int flags) } /* Check if @regno is a pointer to a specific field in a map value */ -static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno, +static int check_map_field_pointer(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, enum btf_field_type field_type, struct bpf_map_desc *map_desc) { - struct bpf_reg_state *reg = reg_state(env, regno); bool is_const = tnum_is_const(reg->var_off); struct bpf_map *map = reg->map_ptr; u64 val = reg->var_off.value; @@ -7349,26 +7327,26 @@ static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno, return 0; } -static int process_timer_func(struct bpf_verifier_env *env, int regno, +static int process_timer_func(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int regno, struct bpf_map_desc *map) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) { verbose(env, "bpf_timer cannot be used for PREEMPT_RT.\n"); return -EOPNOTSUPP; } - return check_map_field_pointer(env, regno, BPF_TIMER, map); + return check_map_field_pointer(env, reg, regno, BPF_TIMER, map); } -static int process_timer_helper(struct bpf_verifier_env *env, int regno, +static int process_timer_helper(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int regno, struct bpf_call_arg_meta *meta) { - return process_timer_func(env, regno, &meta->map); + return process_timer_func(env, reg, regno, &meta->map); } -static int process_timer_kfunc(struct bpf_verifier_env *env, int regno, +static int process_timer_kfunc(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int regno, struct bpf_kfunc_call_arg_meta *meta) { - return process_timer_func(env, regno, &meta->map); + return process_timer_func(env, reg, regno, &meta->map); } static int process_kptr_func(struct bpf_verifier_env *env, int regno, @@ -7433,10 +7411,9 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, * use case. The second level is tracked using the upper bit of bpf_dynptr->size * and checked dynamically during runtime. */ -static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn_idx, +static int process_dynptr_func(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int regno, int insn_idx, enum bpf_arg_type arg_type, int clone_ref_obj_id) { - struct bpf_reg_state *reg = reg_state(env, regno); int err; if (reg->type != PTR_TO_STACK && reg->type != CONST_PTR_TO_DYNPTR) { @@ -7470,7 +7447,7 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn /* we write BPF_DW bits (8 bytes) at a time */ for (i = 0; i < BPF_DYNPTR_SIZE; i += 8) { - err = check_mem_access(env, insn_idx, regno, + err = check_mem_access(env, insn_idx, reg, regno, i, BPF_DW, BPF_WRITE, -1, false, false); if (err) return err; @@ -7540,10 +7517,9 @@ static bool is_kfunc_arg_iter(struct bpf_kfunc_call_arg_meta *meta, int arg_idx, return btf_param_match_suffix(meta->btf, arg, "__iter"); } -static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_idx, +static int process_iter_arg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int regno, int insn_idx, struct bpf_kfunc_call_arg_meta *meta) { - struct bpf_reg_state *reg = reg_state(env, regno); const struct btf_type *t; int spi, err, i, nr_slots, btf_id; @@ -7575,7 +7551,7 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id } for (i = 0; i < nr_slots * 8; i += BPF_REG_SIZE) { - err = check_mem_access(env, insn_idx, regno, + err = check_mem_access(env, insn_idx, reg, regno, i, BPF_DW, BPF_WRITE, -1, false, false); if (err) return err; @@ -8014,12 +7990,11 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_DYNPTR] = &dynptr_types, }; -static int check_reg_type(struct bpf_verifier_env *env, u32 regno, +static int check_reg_type(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, enum bpf_arg_type arg_type, const u32 *arg_btf_id, struct bpf_call_arg_meta *meta) { - struct bpf_reg_state *reg = reg_state(env, regno); enum bpf_reg_type expected, type = reg->type; const struct bpf_reg_types *compatible; int i, j, err; @@ -8362,7 +8337,7 @@ static int check_reg_const_str(struct bpf_verifier_env *env, return -EACCES; } - err = check_map_access(env, regno, 0, + err = check_map_access(env, reg, regno, 0, map->value_size - reg->var_off.value, false, ACCESS_HELPER); if (err) @@ -8498,7 +8473,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, base_type(arg_type) == ARG_PTR_TO_SPIN_LOCK) arg_btf_id = fn->arg_btf_id[arg]; - err = check_reg_type(env, regno, arg_type, arg_btf_id, meta); + err = check_reg_type(env, reg, regno, arg_type, arg_btf_id, meta); if (err) return err; @@ -8636,11 +8611,11 @@ skip_type_check: return -EACCES; } if (meta->func_id == BPF_FUNC_spin_lock) { - err = process_spin_lock(env, regno, PROCESS_SPIN_LOCK); + err = process_spin_lock(env, reg, regno, PROCESS_SPIN_LOCK); if (err) return err; } else if (meta->func_id == BPF_FUNC_spin_unlock) { - err = process_spin_lock(env, regno, 0); + err = process_spin_lock(env, reg, regno, 0); if (err) return err; } else { @@ -8649,7 +8624,7 @@ skip_type_check: } break; case ARG_PTR_TO_TIMER: - err = process_timer_helper(env, regno, meta); + err = process_timer_helper(env, reg, regno, meta); if (err) return err; break; @@ -8684,7 +8659,7 @@ skip_type_check: true, meta); break; case ARG_PTR_TO_DYNPTR: - err = process_dynptr_func(env, regno, insn_idx, arg_type, 0); + err = process_dynptr_func(env, reg, regno, insn_idx, arg_type, 0); if (err) return err; break; @@ -9343,7 +9318,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, if (ret) return ret; - ret = process_dynptr_func(env, regno, -1, arg->arg_type, 0); + ret = process_dynptr_func(env, reg, regno, -1, arg->arg_type, 0); if (ret) return ret; } else if (base_type(arg->arg_type) == ARG_PTR_TO_BTF_ID) { @@ -9354,7 +9329,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, continue; memset(&meta, 0, sizeof(meta)); /* leave func_id as zero */ - err = check_reg_type(env, regno, arg->arg_type, &arg->btf_id, &meta); + err = check_reg_type(env, reg, regno, arg->arg_type, &arg->btf_id, &meta); err = err ?: check_func_arg_reg_off(env, reg, regno, arg->arg_type); if (err) return err; @@ -10312,18 +10287,18 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (err) return err; + regs = cur_regs(env); + /* Mark slots with STACK_MISC in case of raw mode, stack offset * is inferred from register state. */ for (i = 0; i < meta.access_size; i++) { - err = check_mem_access(env, insn_idx, meta.regno, i, BPF_B, + err = check_mem_access(env, insn_idx, regs + meta.regno, meta.regno, i, BPF_B, BPF_WRITE, -1, false, false); if (err) return err; } - regs = cur_regs(env); - if (meta.release_regno) { err = -EINVAL; if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1])) { @@ -11327,11 +11302,10 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta, const struct btf_type *t, const struct btf_type *ref_t, const char *ref_tname, const struct btf_param *args, - int argno, int nargs) + int argno, int nargs, struct bpf_reg_state *reg) { u32 regno = argno + 1; struct bpf_reg_state *regs = cur_regs(env); - struct bpf_reg_state *reg = ®s[regno]; bool arg_mem_size = false; if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] || @@ -11498,10 +11472,9 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env, return 0; } -static int process_irq_flag(struct bpf_verifier_env *env, int regno, +static int process_irq_flag(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int regno, struct bpf_kfunc_call_arg_meta *meta) { - struct bpf_reg_state *reg = reg_state(env, regno); int err, kfunc_class = IRQ_NATIVE_KFUNC; bool irq_save; @@ -11526,7 +11499,7 @@ static int process_irq_flag(struct bpf_verifier_env *env, int regno, return -EINVAL; } - err = check_mem_access(env, env->insn_idx, regno, 0, BPF_DW, BPF_WRITE, -1, false, false); + err = check_mem_access(env, env->insn_idx, reg, regno, 0, BPF_DW, BPF_WRITE, -1, false, false); if (err) return err; @@ -12114,7 +12087,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id); ref_tname = btf_name_by_offset(btf, ref_t->name_off); - kf_arg_type = get_kfunc_ptr_arg_type(env, meta, t, ref_t, ref_tname, args, i, nargs); + kf_arg_type = get_kfunc_ptr_arg_type(env, meta, t, ref_t, ref_tname, args, i, nargs, reg); if (kf_arg_type < 0) return kf_arg_type; @@ -12276,7 +12249,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } } - ret = process_dynptr_func(env, regno, insn_idx, dynptr_arg_type, clone_ref_obj_id); + ret = process_dynptr_func(env, reg, regno, insn_idx, dynptr_arg_type, clone_ref_obj_id); if (ret < 0) return ret; @@ -12301,7 +12274,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return -EINVAL; } } - ret = process_iter_arg(env, regno, insn_idx, meta); + ret = process_iter_arg(env, reg, regno, insn_idx, meta); if (ret < 0) return ret; break; @@ -12478,7 +12451,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ verbose(env, "arg#%d doesn't point to a map value\n", i); return -EINVAL; } - ret = check_map_field_pointer(env, regno, BPF_WORKQUEUE, &meta->map); + ret = check_map_field_pointer(env, reg, regno, BPF_WORKQUEUE, &meta->map); if (ret < 0) return ret; break; @@ -12487,7 +12460,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ verbose(env, "arg#%d doesn't point to a map value\n", i); return -EINVAL; } - ret = process_timer_kfunc(env, regno, meta); + ret = process_timer_kfunc(env, reg, regno, meta); if (ret < 0) return ret; break; @@ -12496,7 +12469,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ verbose(env, "arg#%d doesn't point to a map value\n", i); return -EINVAL; } - ret = check_map_field_pointer(env, regno, BPF_TASK_WORK, &meta->map); + ret = check_map_field_pointer(env, reg, regno, BPF_TASK_WORK, &meta->map); if (ret < 0) return ret; break; @@ -12505,7 +12478,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ verbose(env, "arg#%d doesn't point to an irq flag on stack\n", i); return -EINVAL; } - ret = process_irq_flag(env, regno, meta); + ret = process_irq_flag(env, reg, regno, meta); if (ret < 0) return ret; break; @@ -12526,7 +12499,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave] || meta->func_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore]) flags |= PROCESS_LOCK_IRQ; - ret = process_spin_lock(env, regno, flags); + ret = process_spin_lock(env, reg, regno, flags); if (ret < 0) return ret; break; @@ -13660,7 +13633,7 @@ static int check_stack_access_for_ptr_arithmetic( static int sanitize_check_bounds(struct bpf_verifier_env *env, const struct bpf_insn *insn, - const struct bpf_reg_state *dst_reg) + struct bpf_reg_state *dst_reg) { u32 dst = insn->dst_reg; @@ -13677,7 +13650,7 @@ static int sanitize_check_bounds(struct bpf_verifier_env *env, return -EACCES; break; case PTR_TO_MAP_VALUE: - if (check_map_access(env, dst, 0, 1, false, ACCESS_HELPER)) { + if (check_map_access(env, dst_reg, dst, 0, 1, false, ACCESS_HELPER)) { verbose(env, "R%d pointer arithmetic of map value goes out of range, " "prohibited for !root\n", dst); return -EACCES; @@ -17563,7 +17536,7 @@ static int do_check_insn(struct bpf_verifier_env *env, bool *do_print_state) dst_reg_type = cur_regs(env)[insn->dst_reg].type; - err = check_mem_access(env, env->insn_idx, insn->dst_reg, + err = check_mem_access(env, env->insn_idx, cur_regs(env) + insn->dst_reg, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, -1, false, false); if (err) -- cgit v1.2.3 From 024c0ab5db9459d114304d070936c440a61d3fd4 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 22 Apr 2026 20:34:51 -0700 Subject: bpf: Refactor to handle memory and size together Similar to the previous patch, try to pass bpf_reg_state from caller to callee. Both mem_reg and size_reg are passed to helper functions. This is important for stack arguments as they may be beyond registers 1-5. Acked-by: Puranjay Mohan Acked-by: Kumar Kartikeya Dwivedi Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260423033451.2539065-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 57 +++++++++++++++++++++++++-------------------------- 1 file changed, 28 insertions(+), 29 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index debae6133e4c..1b9a4918fa57 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6934,12 +6934,12 @@ mark: return 0; } -static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, +static int check_helper_mem_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int regno, int access_size, enum bpf_access_type access_type, bool zero_size_allowed, struct bpf_call_arg_meta *meta) { - struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + struct bpf_reg_state *regs = cur_regs(env); u32 *max_access; switch (base_type(reg->type)) { @@ -7022,12 +7022,12 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, /* verify arguments to helpers or kfuncs consisting of a pointer and an access * size. * - * @regno is the register containing the access size. regno-1 is the register - * containing the pointer. + * @mem_reg contains the pointer, @size_reg contains the access size. */ static int check_mem_size_reg(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, u32 regno, - enum bpf_access_type access_type, + struct bpf_reg_state *mem_reg, + struct bpf_reg_state *size_reg, u32 mem_regno, + u32 size_regno, enum bpf_access_type access_type, bool zero_size_allowed, struct bpf_call_arg_meta *meta) { @@ -7041,37 +7041,37 @@ static int check_mem_size_reg(struct bpf_verifier_env *env, * out. Only upper bounds can be learned because retval is an * int type and negative retvals are allowed. */ - meta->msize_max_value = reg->umax_value; + meta->msize_max_value = size_reg->umax_value; /* The register is SCALAR_VALUE; the access check happens using * its boundaries. For unprivileged variable accesses, disable * raw mode so that the program is required to initialize all * the memory that the helper could just partially fill up. */ - if (!tnum_is_const(reg->var_off)) + if (!tnum_is_const(size_reg->var_off)) meta = NULL; - if (reg->smin_value < 0) { + if (size_reg->smin_value < 0) { verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n", - regno); + size_regno); return -EACCES; } - if (reg->umin_value == 0 && !zero_size_allowed) { + if (size_reg->umin_value == 0 && !zero_size_allowed) { verbose(env, "R%d invalid zero-sized read: u64=[%lld,%lld]\n", - regno, reg->umin_value, reg->umax_value); + size_regno, size_reg->umin_value, size_reg->umax_value); return -EACCES; } - if (reg->umax_value >= BPF_MAX_VAR_SIZ) { + if (size_reg->umax_value >= BPF_MAX_VAR_SIZ) { verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n", - regno); + size_regno); return -EACCES; } - err = check_helper_mem_access(env, regno - 1, reg->umax_value, + err = check_helper_mem_access(env, mem_reg, mem_regno, size_reg->umax_value, access_type, zero_size_allowed, meta); if (!err) - err = mark_chain_precision(env, regno); + err = mark_chain_precision(env, size_regno); return err; } @@ -7096,8 +7096,8 @@ static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg int size = base_type(reg->type) == PTR_TO_STACK ? -(int)mem_size : mem_size; - err = check_helper_mem_access(env, regno, size, BPF_READ, true, NULL); - err = err ?: check_helper_mem_access(env, regno, size, BPF_WRITE, true, NULL); + err = check_helper_mem_access(env, reg, regno, size, BPF_READ, true, NULL); + err = err ?: check_helper_mem_access(env, reg, regno, size, BPF_WRITE, true, NULL); if (may_be_null) *reg = saved_reg; @@ -7105,10 +7105,9 @@ static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg return err; } -static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, - u32 regno) +static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *mem_reg, + struct bpf_reg_state *size_reg, u32 mem_regno, u32 size_regno) { - struct bpf_reg_state *mem_reg = &cur_regs(env)[regno - 1]; bool may_be_null = type_may_be_null(mem_reg->type); struct bpf_reg_state saved_reg; struct bpf_call_arg_meta meta; @@ -7121,8 +7120,8 @@ static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg mark_ptr_not_null_reg(mem_reg); } - err = check_mem_size_reg(env, reg, regno, BPF_READ, true, &meta); - err = err ?: check_mem_size_reg(env, reg, regno, BPF_WRITE, true, &meta); + err = check_mem_size_reg(env, mem_reg, size_reg, mem_regno, size_regno, BPF_READ, true, &meta); + err = err ?: check_mem_size_reg(env, mem_reg, size_reg, mem_regno, size_regno, BPF_WRITE, true, &meta); if (may_be_null) *mem_reg = saved_reg; @@ -8566,7 +8565,7 @@ skip_type_check: return -EFAULT; } key_size = meta->map.ptr->key_size; - err = check_helper_mem_access(env, regno, key_size, BPF_READ, false, NULL); + err = check_helper_mem_access(env, reg, regno, key_size, BPF_READ, false, NULL); if (err) return err; if (can_elide_value_nullness(meta->map.ptr->map_type)) { @@ -8593,7 +8592,7 @@ skip_type_check: return -EFAULT; } meta->raw_mode = arg_type & MEM_UNINIT; - err = check_helper_mem_access(env, regno, meta->map.ptr->value_size, + err = check_helper_mem_access(env, reg, regno, meta->map.ptr->value_size, arg_type & MEM_WRITE ? BPF_WRITE : BPF_READ, false, meta); break; @@ -8637,7 +8636,7 @@ skip_type_check: */ meta->raw_mode = arg_type & MEM_UNINIT; if (arg_type & MEM_FIXED_SIZE) { - err = check_helper_mem_access(env, regno, fn->arg_size[arg], + err = check_helper_mem_access(env, reg, regno, fn->arg_size[arg], arg_type & MEM_WRITE ? BPF_WRITE : BPF_READ, false, meta); if (err) @@ -8647,13 +8646,13 @@ skip_type_check: } break; case ARG_CONST_SIZE: - err = check_mem_size_reg(env, reg, regno, + err = check_mem_size_reg(env, reg_state(env, regno - 1), reg, regno - 1, regno, fn->arg_type[arg - 1] & MEM_WRITE ? BPF_WRITE : BPF_READ, false, meta); break; case ARG_CONST_SIZE_OR_ZERO: - err = check_mem_size_reg(env, reg, regno, + err = check_mem_size_reg(env, reg_state(env, regno - 1), reg, regno - 1, regno, fn->arg_type[arg - 1] & MEM_WRITE ? BPF_WRITE : BPF_READ, true, meta); @@ -12384,7 +12383,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ const struct btf_param *size_arg = &args[i + 1]; if (!bpf_register_is_null(buff_reg) || !is_kfunc_arg_nullable(meta->btf, buff_arg)) { - ret = check_kfunc_mem_size_reg(env, size_reg, regno + 1); + ret = check_kfunc_mem_size_reg(env, buff_reg, size_reg, regno, regno + 1); if (ret < 0) { verbose(env, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", i, i + 1); return ret; -- cgit v1.2.3 From 053a48cb2a5459659a5034ad20e7d9f28bc56f16 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 22 Apr 2026 20:34:56 -0700 Subject: bpf: Rename existing argno to arg To support stack arguments, in later patches, argno will represent both registers and stack arguments. To avoid confusion, rename existing argno to arg. Acked-by: Kumar Kartikeya Dwivedi Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260423033456.2539340-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 54 +++++++++++++++++++++++++-------------------------- 1 file changed, 27 insertions(+), 27 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1b9a4918fa57..81d77dfaaaf6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11301,9 +11301,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta, const struct btf_type *t, const struct btf_type *ref_t, const char *ref_tname, const struct btf_param *args, - int argno, int nargs, struct bpf_reg_state *reg) + int arg, int nargs, struct bpf_reg_state *reg) { - u32 regno = argno + 1; + u32 regno = arg + 1; struct bpf_reg_state *regs = cur_regs(env); bool arg_mem_size = false; @@ -11312,9 +11312,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, meta->func_id == special_kfunc_list[KF_bpf_session_cookie]) return KF_ARG_PTR_TO_CTX; - if (argno + 1 < nargs && - (is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], ®s[regno + 1]) || - is_kfunc_arg_const_mem_size(meta->btf, &args[argno + 1], ®s[regno + 1]))) + if (arg + 1 < nargs && + (is_kfunc_arg_mem_size(meta->btf, &args[arg + 1], ®s[regno + 1]) || + is_kfunc_arg_const_mem_size(meta->btf, &args[arg + 1], ®s[regno + 1]))) arg_mem_size = true; /* In this function, we verify the kfunc's BTF as per the argument type, @@ -11322,68 +11322,68 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, * type to our caller. When a set of conditions hold in the BTF type of * arguments, we resolve it to a known kfunc_ptr_arg_type. */ - if (btf_is_prog_ctx_type(&env->log, meta->btf, t, resolve_prog_type(env->prog), argno)) + if (btf_is_prog_ctx_type(&env->log, meta->btf, t, resolve_prog_type(env->prog), arg)) return KF_ARG_PTR_TO_CTX; - if (is_kfunc_arg_nullable(meta->btf, &args[argno]) && bpf_register_is_null(reg) && + if (is_kfunc_arg_nullable(meta->btf, &args[arg]) && bpf_register_is_null(reg) && !arg_mem_size) return KF_ARG_PTR_TO_NULL; - if (is_kfunc_arg_alloc_obj(meta->btf, &args[argno])) + if (is_kfunc_arg_alloc_obj(meta->btf, &args[arg])) return KF_ARG_PTR_TO_ALLOC_BTF_ID; - if (is_kfunc_arg_refcounted_kptr(meta->btf, &args[argno])) + if (is_kfunc_arg_refcounted_kptr(meta->btf, &args[arg])) return KF_ARG_PTR_TO_REFCOUNTED_KPTR; - if (is_kfunc_arg_dynptr(meta->btf, &args[argno])) + if (is_kfunc_arg_dynptr(meta->btf, &args[arg])) return KF_ARG_PTR_TO_DYNPTR; - if (is_kfunc_arg_iter(meta, argno, &args[argno])) + if (is_kfunc_arg_iter(meta, arg, &args[arg])) return KF_ARG_PTR_TO_ITER; - if (is_kfunc_arg_list_head(meta->btf, &args[argno])) + if (is_kfunc_arg_list_head(meta->btf, &args[arg])) return KF_ARG_PTR_TO_LIST_HEAD; - if (is_kfunc_arg_list_node(meta->btf, &args[argno])) + if (is_kfunc_arg_list_node(meta->btf, &args[arg])) return KF_ARG_PTR_TO_LIST_NODE; - if (is_kfunc_arg_rbtree_root(meta->btf, &args[argno])) + if (is_kfunc_arg_rbtree_root(meta->btf, &args[arg])) return KF_ARG_PTR_TO_RB_ROOT; - if (is_kfunc_arg_rbtree_node(meta->btf, &args[argno])) + if (is_kfunc_arg_rbtree_node(meta->btf, &args[arg])) return KF_ARG_PTR_TO_RB_NODE; - if (is_kfunc_arg_const_str(meta->btf, &args[argno])) + if (is_kfunc_arg_const_str(meta->btf, &args[arg])) return KF_ARG_PTR_TO_CONST_STR; - if (is_kfunc_arg_map(meta->btf, &args[argno])) + if (is_kfunc_arg_map(meta->btf, &args[arg])) return KF_ARG_PTR_TO_MAP; - if (is_kfunc_arg_wq(meta->btf, &args[argno])) + if (is_kfunc_arg_wq(meta->btf, &args[arg])) return KF_ARG_PTR_TO_WORKQUEUE; - if (is_kfunc_arg_timer(meta->btf, &args[argno])) + if (is_kfunc_arg_timer(meta->btf, &args[arg])) return KF_ARG_PTR_TO_TIMER; - if (is_kfunc_arg_task_work(meta->btf, &args[argno])) + if (is_kfunc_arg_task_work(meta->btf, &args[arg])) return KF_ARG_PTR_TO_TASK_WORK; - if (is_kfunc_arg_irq_flag(meta->btf, &args[argno])) + if (is_kfunc_arg_irq_flag(meta->btf, &args[arg])) return KF_ARG_PTR_TO_IRQ_FLAG; - if (is_kfunc_arg_res_spin_lock(meta->btf, &args[argno])) + if (is_kfunc_arg_res_spin_lock(meta->btf, &args[arg])) return KF_ARG_PTR_TO_RES_SPIN_LOCK; if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) { if (!btf_type_is_struct(ref_t)) { verbose(env, "kernel function %s args#%d pointer type %s %s is not supported\n", - meta->func_name, argno, btf_type_str(ref_t), ref_tname); + meta->func_name, arg, btf_type_str(ref_t), ref_tname); return -EINVAL; } return KF_ARG_PTR_TO_BTF_ID; } - if (is_kfunc_arg_callback(env, meta->btf, &args[argno])) + if (is_kfunc_arg_callback(env, meta->btf, &args[arg])) return KF_ARG_PTR_TO_CALLBACK; /* This is the catch all argument type of register types supported by @@ -11394,7 +11394,7 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (!btf_type_is_scalar(ref_t) && !__btf_type_is_scalar_struct(env, meta->btf, ref_t, 0) && (arg_mem_size ? !btf_type_is_void(ref_t) : 1)) { verbose(env, "arg#%d pointer type %s %s must point to %sscalar, or struct with scalar\n", - argno, btf_type_str(ref_t), ref_tname, arg_mem_size ? "void, " : ""); + arg, btf_type_str(ref_t), ref_tname, arg_mem_size ? "void, " : ""); return -EINVAL; } return arg_mem_size ? KF_ARG_PTR_TO_MEM_SIZE : KF_ARG_PTR_TO_MEM; @@ -11405,7 +11405,7 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env, const struct btf_type *ref_t, const char *ref_tname, u32 ref_id, struct bpf_kfunc_call_arg_meta *meta, - int argno) + int arg) { const struct btf_type *reg_ref_t; bool strict_type_match = false; @@ -11464,7 +11464,7 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env, taking_projection = btf_is_projection_of(ref_tname, reg_ref_tname); if (!taking_projection && !struct_same) { verbose(env, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n", - meta->func_name, argno, btf_type_str(ref_t), ref_tname, argno + 1, + meta->func_name, arg, btf_type_str(ref_t), ref_tname, arg + 1, btf_type_str(reg_ref_t), reg_ref_tname); return -EINVAL; } -- cgit v1.2.3 From 9b9f0b42703ceb88332bcb19453c4288c2683e34 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 22 Apr 2026 20:35:01 -0700 Subject: bpf: Prepare verifier logs for upcoming kfunc stack arguments This change prepares verifier log reporting for upcoming kfunc stack argument support. Currently verifier log code mostly assumes that an argument can be described directly by a register number. That works for arguments passed in `R1` to `R5`, but it does not work once kfunc arguments can also be passed on the stack. Introduce an opaque `argno_t` type that encodes both register-based and arg-based references. Four helpers form the interface: - argno_from_reg(regno): create from a register number - argno_from_arg(arg): create from a 1-based arg number - reg_from_argno(a): extract register number, or -1 - arg_from_argno(a): extract arg number, or -1 reg_arg_name() converts an argno_t to a human-readable string for verifier logs: "R%d" for register arguments, or "*(R11-off)" for stack arguments beyond R5. Update selftests accordingly. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260423033501.2539667-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c | 694 ++++++++++++--------- tools/testing/selftests/bpf/prog_tests/bpf_nf.c | 22 +- tools/testing/selftests/bpf/prog_tests/cb_refs.c | 2 +- .../testing/selftests/bpf/prog_tests/kfunc_call.c | 2 +- .../testing/selftests/bpf/prog_tests/linked_list.c | 4 +- .../selftests/bpf/progs/cgrp_kfunc_failure.c | 14 +- .../testing/selftests/bpf/progs/cpumask_failure.c | 10 +- tools/testing/selftests/bpf/progs/dynptr_fail.c | 22 +- .../testing/selftests/bpf/progs/file_reader_fail.c | 4 +- tools/testing/selftests/bpf/progs/irq.c | 4 +- tools/testing/selftests/bpf/progs/iters.c | 6 +- .../selftests/bpf/progs/iters_state_safety.c | 14 +- tools/testing/selftests/bpf/progs/iters_testmod.c | 4 +- .../selftests/bpf/progs/iters_testmod_seq.c | 4 +- tools/testing/selftests/bpf/progs/map_kptr_fail.c | 2 +- .../selftests/bpf/progs/percpu_alloc_fail.c | 4 +- tools/testing/selftests/bpf/progs/rbtree_fail.c | 6 +- .../selftests/bpf/progs/refcounted_kptr_fail.c | 2 +- tools/testing/selftests/bpf/progs/stream_fail.c | 2 +- .../selftests/bpf/progs/task_kfunc_failure.c | 18 +- tools/testing/selftests/bpf/progs/task_work_fail.c | 6 +- .../testing/selftests/bpf/progs/test_bpf_nf_fail.c | 8 +- .../selftests/bpf/progs/test_kfunc_dynptr_param.c | 2 +- .../bpf/progs/test_kfunc_param_nullable.c | 2 +- .../selftests/bpf/progs/verifier_bits_iter.c | 4 +- .../selftests/bpf/progs/verifier_ref_tracking.c | 6 +- .../selftests/bpf/progs/verifier_vfs_reject.c | 8 +- tools/testing/selftests/bpf/progs/wq_failures.c | 2 +- tools/testing/selftests/bpf/verifier/calls.c | 14 +- 30 files changed, 497 insertions(+), 396 deletions(-) (limited to 'kernel/bpf') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index b148f816f25b..d5b4303315dd 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -913,6 +913,7 @@ struct bpf_verifier_env { * e.g., in reg_type_str() to generate reg_type string */ char tmp_str_buf[TMP_STR_BUF_LEN]; + char tmp_arg_name[32]; struct bpf_insn insn_buf[INSN_BUF_SIZE]; struct bpf_insn epilogue_buf[INSN_BUF_SIZE]; struct bpf_scc_callchain callchain_buf; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 81d77dfaaaf6..ff6ff1c27517 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -261,6 +261,36 @@ struct bpf_kfunc_meta { struct btf *btf_vmlinux; +typedef struct argno { + int argno; +} argno_t; + +static argno_t argno_from_reg(u32 regno) +{ + return (argno_t){ .argno = regno }; +} + +static argno_t argno_from_arg(u32 arg) +{ + return (argno_t){ .argno = -arg }; +} + +static int reg_from_argno(argno_t a) +{ + if (a.argno >= 0) + return a.argno; + if (a.argno >= -MAX_BPF_FUNC_REG_ARGS) + return -a.argno; + return -1; +} + +static int arg_from_argno(argno_t a) +{ + if (a.argno < 0) + return -a.argno; + return -1; +} + static const char *btf_type_name(const struct btf *btf, u32 id) { return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off); @@ -1742,6 +1772,22 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, return &elem->st; } +static const char *reg_arg_name(struct bpf_verifier_env *env, argno_t argno) +{ + char *buf = env->tmp_arg_name; + int len = sizeof(env->tmp_arg_name); + int arg, regno = reg_from_argno(argno); + + if (regno >= 0) { + snprintf(buf, len, "R%d", regno); + } else { + arg = arg_from_argno(argno); + snprintf(buf, len, "*(R11-%u)", (arg - MAX_BPF_FUNC_REG_ARGS) * BPF_REG_SIZE); + } + + return buf; +} + static const int caller_saved[CALLER_SAVED_REGS] = { BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 }; @@ -4241,7 +4287,7 @@ enum bpf_access_src { }; static int check_stack_range_initialized(struct bpf_verifier_env *env, struct bpf_reg_state *reg, - int regno, int off, int access_size, + argno_t argno, int off, int access_size, bool zero_size_allowed, enum bpf_access_type type, struct bpf_call_arg_meta *meta); @@ -4265,7 +4311,7 @@ static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno) * instead. */ static int check_stack_read_var_off(struct bpf_verifier_env *env, struct bpf_reg_state *reg, - int ptr_regno, int off, int size, int dst_regno) + argno_t ptr_argno, int off, int size, int dst_regno) { struct bpf_func_state *ptr_state = bpf_func(env, reg); int err; @@ -4273,7 +4319,7 @@ static int check_stack_read_var_off(struct bpf_verifier_env *env, struct bpf_reg /* Note that we pass a NULL meta, so raw access will not be permitted. */ - err = check_stack_range_initialized(env, reg, ptr_regno, off, size, + err = check_stack_range_initialized(env, reg, ptr_argno, off, size, false, BPF_READ, NULL); if (err) return err; @@ -4295,7 +4341,7 @@ static int check_stack_read_var_off(struct bpf_verifier_env *env, struct bpf_reg * can be -1, meaning that the read value is not going to a register. */ static int check_stack_read(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, int ptr_regno, int off, int size, + struct bpf_reg_state *reg, argno_t ptr_argno, int off, int size, int dst_regno) { struct bpf_func_state *state = bpf_func(env, reg); @@ -4333,7 +4379,7 @@ static int check_stack_read(struct bpf_verifier_env *env, * than fixed offset ones. Note that dst_regno >= 0 on this * branch. */ - err = check_stack_read_var_off(env, reg, ptr_regno, off, size, + err = check_stack_read_var_off(env, reg, ptr_argno, off, size, dst_regno); } return err; @@ -4393,7 +4439,7 @@ static int check_map_access_type(struct bpf_verifier_env *env, struct bpf_reg_st } /* check read/write into memory region (e.g., map value, ringbuf sample, etc) */ -static int __check_mem_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int regno, +static int __check_mem_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, int off, int size, u32 mem_size, bool zero_size_allowed) { @@ -4414,8 +4460,8 @@ static int __check_mem_access(struct bpf_verifier_env *env, struct bpf_reg_state case PTR_TO_PACKET: case PTR_TO_PACKET_META: case PTR_TO_PACKET_END: - verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", - off, size, regno, reg->id, off, mem_size); + verbose(env, "invalid access to packet, off=%d size=%d, %s(id=%d,off=%d,r=%d)\n", + off, size, reg_arg_name(env, argno), reg->id, off, mem_size); break; case PTR_TO_CTX: verbose(env, "invalid access to context, ctx_size=%d off=%d size=%d\n", @@ -4431,7 +4477,7 @@ static int __check_mem_access(struct bpf_verifier_env *env, struct bpf_reg_state } /* check read/write into a memory region with possible variable offset */ -static int check_mem_region_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, +static int check_mem_region_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, int off, int size, u32 mem_size, bool zero_size_allowed) { @@ -4451,15 +4497,15 @@ static int check_mem_region_access(struct bpf_verifier_env *env, struct bpf_reg_ (reg->smin_value == S64_MIN || (off + reg->smin_value != (s64)(s32)(off + reg->smin_value)) || reg->smin_value + off < 0)) { - verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", - regno); + verbose(env, "%s min value is negative, either use unsigned index or do a if (index >=0) check.\n", + reg_arg_name(env, argno)); return -EACCES; } - err = __check_mem_access(env, reg, regno, reg->smin_value + off, size, + err = __check_mem_access(env, reg, argno, reg->smin_value + off, size, mem_size, zero_size_allowed); if (err) { - verbose(env, "R%d min value is outside of the allowed memory range\n", - regno); + verbose(env, "%s min value is outside of the allowed memory range\n", + reg_arg_name(env, argno)); return err; } @@ -4468,15 +4514,15 @@ static int check_mem_region_access(struct bpf_verifier_env *env, struct bpf_reg_ * If reg->umax_value + off could overflow, treat that as unbounded too. */ if (reg->umax_value >= BPF_MAX_VAR_OFF) { - verbose(env, "R%d unbounded memory access, make sure to bounds check any such access\n", - regno); + verbose(env, "%s unbounded memory access, make sure to bounds check any such access\n", + reg_arg_name(env, argno)); return -EACCES; } - err = __check_mem_access(env, reg, regno, reg->umax_value + off, size, + err = __check_mem_access(env, reg, argno, reg->umax_value + off, size, mem_size, zero_size_allowed); if (err) { - verbose(env, "R%d max value is outside of the allowed memory range\n", - regno); + verbose(env, "%s max value is outside of the allowed memory range\n", + reg_arg_name(env, argno)); return err; } @@ -4484,7 +4530,7 @@ static int check_mem_region_access(struct bpf_verifier_env *env, struct bpf_reg_ } static int __check_ptr_off_reg(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, int regno, + const struct bpf_reg_state *reg, argno_t argno, bool fixed_off_ok) { /* Access to this pointer-typed register or passing it to a helper @@ -4501,14 +4547,14 @@ static int __check_ptr_off_reg(struct bpf_verifier_env *env, } if (reg->smin_value < 0) { - verbose(env, "negative offset %s ptr R%d off=%lld disallowed\n", - reg_type_str(env, reg->type), regno, reg->var_off.value); + verbose(env, "negative offset %s ptr %s off=%lld disallowed\n", + reg_type_str(env, reg->type), reg_arg_name(env, argno), reg->var_off.value); return -EACCES; } if (!fixed_off_ok && reg->var_off.value != 0) { - verbose(env, "dereference of modified %s ptr R%d off=%lld disallowed\n", - reg_type_str(env, reg->type), regno, reg->var_off.value); + verbose(env, "dereference of modified %s ptr %s off=%lld disallowed\n", + reg_type_str(env, reg->type), reg_arg_name(env, argno), reg->var_off.value); return -EACCES; } @@ -4518,7 +4564,7 @@ static int __check_ptr_off_reg(struct bpf_verifier_env *env, static int check_ptr_off_reg(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int regno) { - return __check_ptr_off_reg(env, reg, regno, false); + return __check_ptr_off_reg(env, reg, argno_from_reg(regno), false); } static int map_kptr_match_type(struct bpf_verifier_env *env, @@ -4556,7 +4602,7 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, * Since ref_ptr cannot be accessed directly by BPF insns, check for * reg->ref_obj_id is not needed here. */ - if (__check_ptr_off_reg(env, reg, regno, true)) + if (__check_ptr_off_reg(env, reg, argno_from_reg(regno), true)) return -EACCES; /* A full type match is needed, as BTF can be vmlinux, module or prog BTF, and @@ -4776,7 +4822,7 @@ static u32 map_mem_size(const struct bpf_map *map) } /* check read/write into a map element with possible variable offset */ -static int check_map_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, +static int check_map_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, int off, int size, bool zero_size_allowed, enum bpf_access_src src) { @@ -4785,7 +4831,7 @@ static int check_map_access(struct bpf_verifier_env *env, struct bpf_reg_state * struct btf_record *rec; int err, i; - err = check_mem_region_access(env, reg, regno, off, size, mem_size, zero_size_allowed); + err = check_mem_region_access(env, reg, argno, off, size, mem_size, zero_size_allowed); if (err) return err; @@ -4881,17 +4927,17 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, } } -static int check_packet_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, int off, +static int check_packet_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, int off, int size, bool zero_size_allowed) { int err; if (reg->range < 0) { - verbose(env, "R%d offset is outside of the packet\n", regno); + verbose(env, "%s offset is outside of the packet\n", reg_arg_name(env, argno)); return -EINVAL; } - err = check_mem_region_access(env, reg, regno, off, size, reg->range, zero_size_allowed); + err = check_mem_region_access(env, reg, argno, off, size, reg->range, zero_size_allowed); if (err) return err; @@ -4946,7 +4992,7 @@ static int __check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int of return -EACCES; } -static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, struct bpf_reg_state *reg, u32 regno, +static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, struct bpf_reg_state *reg, argno_t argno, int off, int access_size, enum bpf_access_type t, struct bpf_insn_access_aux *info) { @@ -4959,9 +5005,9 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, struct b int err; if (var_off_ok) - err = check_mem_region_access(env, reg, regno, off, access_size, U16_MAX, false); + err = check_mem_region_access(env, reg, argno, off, access_size, U16_MAX, false); else - err = __check_ptr_off_reg(env, reg, regno, fixed_off_ok); + err = __check_ptr_off_reg(env, reg, argno, fixed_off_ok); if (err) return err; off += reg->umax_value; @@ -4985,15 +5031,15 @@ static int check_flow_keys_access(struct bpf_verifier_env *env, int off, } static int check_sock_access(struct bpf_verifier_env *env, int insn_idx, - struct bpf_reg_state *reg, u32 regno, int off, int size, + struct bpf_reg_state *reg, argno_t argno, int off, int size, enum bpf_access_type t) { struct bpf_insn_access_aux info = {}; bool valid; if (reg->smin_value < 0) { - verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", - regno); + verbose(env, "%s min value is negative, either use unsigned index or do a if (index >=0) check.\n", + reg_arg_name(env, argno)); return -EACCES; } @@ -5021,8 +5067,8 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx, return 0; } - verbose(env, "R%d invalid %s access off=%d size=%d\n", - regno, reg_type_str(env, reg->type), off, size); + verbose(env, "%s invalid %s access off=%d size=%d\n", + reg_arg_name(env, argno), reg_type_str(env, reg->type), off, size); return -EACCES; } @@ -5535,12 +5581,12 @@ static int check_max_stack_depth(struct bpf_verifier_env *env) static int __check_buffer_access(struct bpf_verifier_env *env, const char *buf_info, const struct bpf_reg_state *reg, - int regno, int off, int size) + argno_t argno, int off, int size) { if (off < 0) { verbose(env, - "R%d invalid %s buffer access: off=%d, size=%d\n", - regno, buf_info, off, size); + "%s invalid %s buffer access: off=%d, size=%d\n", + reg_arg_name(env, argno), buf_info, off, size); return -EACCES; } if (!tnum_is_const(reg->var_off)) { @@ -5548,8 +5594,8 @@ static int __check_buffer_access(struct bpf_verifier_env *env, tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); verbose(env, - "R%d invalid variable buffer offset: off=%d, var_off=%s\n", - regno, off, tn_buf); + "%s invalid variable buffer offset: off=%d, var_off=%s\n", + reg_arg_name(env, argno), off, tn_buf); return -EACCES; } @@ -5558,11 +5604,11 @@ static int __check_buffer_access(struct bpf_verifier_env *env, static int check_tp_buffer_access(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, - int regno, int off, int size) + argno_t argno, int off, int size) { int err; - err = __check_buffer_access(env, "tracepoint", reg, regno, off, size); + err = __check_buffer_access(env, "tracepoint", reg, argno, off, size); if (err) return err; @@ -5574,14 +5620,14 @@ static int check_tp_buffer_access(struct bpf_verifier_env *env, static int check_buffer_access(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, - int regno, int off, int size, + argno_t argno, int off, int size, bool zero_size_allowed, u32 *max_access) { const char *buf_info = type_is_rdonly_mem(reg->type) ? "rdonly" : "rdwr"; int err; - err = __check_buffer_access(env, buf_info, reg, regno, off, size); + err = __check_buffer_access(env, buf_info, reg, argno, off, size); if (err) return err; @@ -5954,7 +6000,7 @@ static bool type_is_trusted_or_null(struct bpf_verifier_env *env, static int check_ptr_to_btf_access(struct bpf_verifier_env *env, struct bpf_reg_state *regs, struct bpf_reg_state *reg, - int regno, int off, int size, + argno_t argno, int off, int size, enum bpf_access_type atype, int value_regno) { @@ -5983,8 +6029,8 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); verbose(env, - "R%d is ptr_%s invalid variable offset: off=%d, var_off=%s\n", - regno, tname, off, tn_buf); + "%s is ptr_%s invalid variable offset: off=%d, var_off=%s\n", + reg_arg_name(env, argno), tname, off, tn_buf); return -EACCES; } @@ -5992,22 +6038,22 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, if (off < 0) { verbose(env, - "R%d is ptr_%s invalid negative access: off=%d\n", - regno, tname, off); + "%s is ptr_%s invalid negative access: off=%d\n", + reg_arg_name(env, argno), tname, off); return -EACCES; } if (reg->type & MEM_USER) { verbose(env, - "R%d is ptr_%s access user memory: off=%d\n", - regno, tname, off); + "%s is ptr_%s access user memory: off=%d\n", + reg_arg_name(env, argno), tname, off); return -EACCES; } if (reg->type & MEM_PERCPU) { verbose(env, - "R%d is ptr_%s access percpu memory: off=%d\n", - regno, tname, off); + "%s is ptr_%s access percpu memory: off=%d\n", + reg_arg_name(env, argno), tname, off); return -EACCES; } @@ -6110,7 +6156,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, static int check_ptr_to_map_access(struct bpf_verifier_env *env, struct bpf_reg_state *regs, struct bpf_reg_state *reg, - int regno, int off, int size, + argno_t argno, int off, int size, enum bpf_access_type atype, int value_regno) { @@ -6144,8 +6190,8 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env, } if (off < 0) { - verbose(env, "R%d is %s invalid negative access: off=%d\n", - regno, tname, off); + verbose(env, "%s is %s invalid negative access: off=%d\n", + reg_arg_name(env, argno), tname, off); return -EACCES; } @@ -6203,7 +6249,7 @@ static int check_stack_slot_within_bounds(struct bpf_verifier_env *env, */ static int check_stack_access_within_bounds( struct bpf_verifier_env *env, struct bpf_reg_state *reg, - int regno, int off, int access_size, + argno_t argno, int off, int access_size, enum bpf_access_type type) { struct bpf_func_state *state = bpf_func(env, reg); @@ -6222,8 +6268,8 @@ static int check_stack_access_within_bounds( } else { if (reg->smax_value >= BPF_MAX_VAR_OFF || reg->smin_value <= -BPF_MAX_VAR_OFF) { - verbose(env, "invalid unbounded variable-offset%s stack R%d\n", - err_extra, regno); + verbose(env, "invalid unbounded variable-offset%s stack %s\n", + err_extra, reg_arg_name(env, argno)); return -EACCES; } min_off = reg->smin_value + off; @@ -6241,14 +6287,14 @@ static int check_stack_access_within_bounds( if (err) { if (tnum_is_const(reg->var_off)) { - verbose(env, "invalid%s stack R%d off=%lld size=%d\n", - err_extra, regno, min_off, access_size); + verbose(env, "invalid%s stack %s off=%lld size=%d\n", + err_extra, reg_arg_name(env, argno), min_off, access_size); } else { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "invalid variable-offset%s stack R%d var_off=%s off=%d size=%d\n", - err_extra, regno, tn_buf, off, access_size); + verbose(env, "invalid variable-offset%s stack %s var_off=%s off=%d size=%d\n", + err_extra, reg_arg_name(env, argno), tn_buf, off, access_size); } return err; } @@ -6293,7 +6339,7 @@ static void add_scalar_to_reg(struct bpf_reg_state *dst_reg, s64 val) * if t==write && value_regno==-1, some unknown value is stored into memory * if t==read && value_regno==-1, don't care what we read from memory */ -static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, struct bpf_reg_state *reg, u32 regno, +static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, struct bpf_reg_state *reg, argno_t argno, int off, int bpf_size, enum bpf_access_type t, int value_regno, bool strict_alignment_once, bool is_ldsx) { @@ -6310,11 +6356,12 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, struct b if (reg->type == PTR_TO_MAP_KEY) { if (t == BPF_WRITE) { - verbose(env, "write to change key R%d not allowed\n", regno); + verbose(env, "write to change key %s not allowed\n", + reg_arg_name(env, argno)); return -EACCES; } - err = check_mem_region_access(env, reg, regno, off, size, + err = check_mem_region_access(env, reg, argno, off, size, reg->map_ptr->key_size, false); if (err) return err; @@ -6331,7 +6378,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, struct b err = check_map_access_type(env, reg, off, size, t); if (err) return err; - err = check_map_access(env, reg, regno, off, size, false, ACCESS_DIRECT); + err = check_map_access(env, reg, argno, off, size, false, ACCESS_DIRECT); if (err) return err; if (tnum_is_const(reg->var_off)) @@ -6378,14 +6425,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, struct b bool rdonly_untrusted = rdonly_mem && (reg->type & PTR_UNTRUSTED); if (type_may_be_null(reg->type)) { - verbose(env, "R%d invalid mem access '%s'\n", regno, + verbose(env, "%s invalid mem access '%s'\n", reg_arg_name(env, argno), reg_type_str(env, reg->type)); return -EACCES; } if (t == BPF_WRITE && rdonly_mem) { - verbose(env, "R%d cannot write into %s\n", - regno, reg_type_str(env, reg->type)); + verbose(env, "%s cannot write into %s\n", + reg_arg_name(env, argno), reg_type_str(env, reg->type)); return -EACCES; } @@ -6400,7 +6447,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, struct b * instructions, hence no need to check bounds in that case. */ if (!rdonly_untrusted) - err = check_mem_region_access(env, reg, regno, off, size, + err = check_mem_region_access(env, reg, argno, off, size, reg->mem_size, false); if (!err && value_regno >= 0 && (t == BPF_READ || rdonly_mem)) mark_reg_unknown(env, regs, value_regno); @@ -6418,7 +6465,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, struct b return -EACCES; } - err = check_ctx_access(env, insn_idx, reg, regno, off, size, t, &info); + err = check_ctx_access(env, insn_idx, reg, argno, off, size, t, &info); if (!err && t == BPF_READ && value_regno >= 0) { /* ctx access returns either a scalar, or a * PTR_TO_PACKET[_META,_END]. In the latter @@ -6455,12 +6502,12 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, struct b } else if (reg->type == PTR_TO_STACK) { /* Basic bounds checks. */ - err = check_stack_access_within_bounds(env, reg, regno, off, size, t); + err = check_stack_access_within_bounds(env, reg, argno, off, size, t); if (err) return err; if (t == BPF_READ) - err = check_stack_read(env, reg, regno, off, size, + err = check_stack_read(env, reg, argno, off, size, value_regno); else err = check_stack_write(env, reg, off, size, @@ -6476,7 +6523,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, struct b value_regno); return -EACCES; } - err = check_packet_access(env, reg, regno, off, size, false); + err = check_packet_access(env, reg, argno, off, size, false); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_FLOW_KEYS) { @@ -6492,23 +6539,23 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, struct b mark_reg_unknown(env, regs, value_regno); } else if (type_is_sk_pointer(reg->type)) { if (t == BPF_WRITE) { - verbose(env, "R%d cannot write into %s\n", - regno, reg_type_str(env, reg->type)); + verbose(env, "%s cannot write into %s\n", + reg_arg_name(env, argno), reg_type_str(env, reg->type)); return -EACCES; } - err = check_sock_access(env, insn_idx, reg, regno, off, size, t); + err = check_sock_access(env, insn_idx, reg, argno, off, size, t); if (!err && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_TP_BUFFER) { - err = check_tp_buffer_access(env, reg, regno, off, size); + err = check_tp_buffer_access(env, reg, argno, off, size); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else if (base_type(reg->type) == PTR_TO_BTF_ID && !type_may_be_null(reg->type)) { - err = check_ptr_to_btf_access(env, regs, reg, regno, off, size, t, + err = check_ptr_to_btf_access(env, regs, reg, argno, off, size, t, value_regno); } else if (reg->type == CONST_PTR_TO_MAP) { - err = check_ptr_to_map_access(env, regs, reg, regno, off, size, t, + err = check_ptr_to_map_access(env, regs, reg, argno, off, size, t, value_regno); } else if (base_type(reg->type) == PTR_TO_BUF && !type_may_be_null(reg->type)) { @@ -6517,8 +6564,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, struct b if (rdonly_mem) { if (t == BPF_WRITE) { - verbose(env, "R%d cannot write into %s\n", - regno, reg_type_str(env, reg->type)); + verbose(env, "%s cannot write into %s\n", + reg_arg_name(env, argno), reg_type_str(env, reg->type)); return -EACCES; } max_access = &env->prog->aux->max_rdonly_access; @@ -6526,7 +6573,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, struct b max_access = &env->prog->aux->max_rdwr_access; } - err = check_buffer_access(env, reg, regno, off, size, false, + err = check_buffer_access(env, reg, argno, off, size, false, max_access); if (!err && value_regno >= 0 && (rdonly_mem || t == BPF_READ)) @@ -6535,7 +6582,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, struct b if (t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else { - verbose(env, "R%d invalid mem access '%s'\n", regno, + verbose(env, "%s invalid mem access '%s'\n", reg_arg_name(env, argno), reg_type_str(env, reg->type)); return -EACCES; } @@ -6577,7 +6624,7 @@ static int check_load_mem(struct bpf_verifier_env *env, struct bpf_insn *insn, /* Check if (src_reg + off) is readable. The state of dst_reg will be * updated by this call. */ - err = check_mem_access(env, env->insn_idx, regs + insn->src_reg, insn->src_reg, insn->off, + err = check_mem_access(env, env->insn_idx, regs + insn->src_reg, argno_from_reg(insn->src_reg), insn->off, BPF_SIZE(insn->code), BPF_READ, insn->dst_reg, strict_alignment_once, is_ldsx); err = err ?: save_aux_ptr_type(env, src_reg_type, @@ -6607,7 +6654,7 @@ static int check_store_reg(struct bpf_verifier_env *env, struct bpf_insn *insn, dst_reg_type = regs[insn->dst_reg].type; /* Check if (dst_reg + off) is writeable. */ - err = check_mem_access(env, env->insn_idx, regs + insn->dst_reg, insn->dst_reg, insn->off, + err = check_mem_access(env, env->insn_idx, regs + insn->dst_reg, argno_from_reg(insn->dst_reg), insn->off, BPF_SIZE(insn->code), BPF_WRITE, insn->src_reg, strict_alignment_once, false); err = err ?: save_aux_ptr_type(env, dst_reg_type, false); @@ -6685,10 +6732,10 @@ static int check_atomic_rmw(struct bpf_verifier_env *env, /* Check whether we can read the memory, with second call for fetch * case to simulate the register fill. */ - err = check_mem_access(env, env->insn_idx, dst_reg, insn->dst_reg, insn->off, + err = check_mem_access(env, env->insn_idx, dst_reg, argno_from_reg(insn->dst_reg), insn->off, BPF_SIZE(insn->code), BPF_READ, -1, true, false); if (!err && load_reg >= 0) - err = check_mem_access(env, env->insn_idx, dst_reg, insn->dst_reg, + err = check_mem_access(env, env->insn_idx, dst_reg, argno_from_reg(insn->dst_reg), insn->off, BPF_SIZE(insn->code), BPF_READ, load_reg, true, false); if (err) @@ -6700,7 +6747,7 @@ static int check_atomic_rmw(struct bpf_verifier_env *env, return err; } /* Check whether we can write into the same memory. */ - err = check_mem_access(env, env->insn_idx, dst_reg, insn->dst_reg, insn->off, + err = check_mem_access(env, env->insn_idx, dst_reg, argno_from_reg(insn->dst_reg), insn->off, BPF_SIZE(insn->code), BPF_WRITE, -1, true, false); if (err) return err; @@ -6789,7 +6836,7 @@ static int check_atomic(struct bpf_verifier_env *env, struct bpf_insn *insn) * read offsets are marked as read. */ static int check_stack_range_initialized( - struct bpf_verifier_env *env, struct bpf_reg_state *reg, int regno, int off, + struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, int off, int access_size, bool zero_size_allowed, enum bpf_access_type type, struct bpf_call_arg_meta *meta) { @@ -6814,7 +6861,7 @@ static int check_stack_range_initialized( return -EACCES; } - err = check_stack_access_within_bounds(env, reg, regno, off, access_size, type); + err = check_stack_access_within_bounds(env, reg, argno, off, access_size, type); if (err) return err; @@ -6831,8 +6878,8 @@ static int check_stack_range_initialized( char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "R%d variable offset stack access prohibited for !root, var_off=%s\n", - regno, tn_buf); + verbose(env, "%s variable offset stack access prohibited for !root, var_off=%s\n", + reg_arg_name(env, argno), tn_buf); return -EACCES; } /* Only initialized buffer on stack is allowed to be accessed @@ -6875,7 +6922,7 @@ static int check_stack_range_initialized( } } meta->access_size = access_size; - meta->regno = regno; + meta->regno = reg_from_argno(argno); return 0; } @@ -6915,17 +6962,17 @@ static int check_stack_range_initialized( if (*stype == STACK_POISON) { if (allow_poison) goto mark; - verbose(env, "reading from stack R%d off %d+%d size %d, slot poisoned by dead code elimination\n", - regno, min_off, i - min_off, access_size); + verbose(env, "reading from stack %s off %d+%d size %d, slot poisoned by dead code elimination\n", + reg_arg_name(env, argno), min_off, i - min_off, access_size); } else if (tnum_is_const(reg->var_off)) { - verbose(env, "invalid read from stack R%d off %d+%d size %d\n", - regno, min_off, i - min_off, access_size); + verbose(env, "invalid read from stack %s off %d+%d size %d\n", + reg_arg_name(env, argno), min_off, i - min_off, access_size); } else { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "invalid read from stack R%d var_off %s+%d size %d\n", - regno, tn_buf, i - min_off, access_size); + verbose(env, "invalid read from stack %s var_off %s+%d size %d\n", + reg_arg_name(env, argno), tn_buf, i - min_off, access_size); } return -EACCES; mark: @@ -6934,7 +6981,7 @@ mark: return 0; } -static int check_helper_mem_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int regno, +static int check_helper_mem_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, int access_size, enum bpf_access_type access_type, bool zero_size_allowed, struct bpf_call_arg_meta *meta) @@ -6945,37 +6992,37 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, struct bpf_reg_ switch (base_type(reg->type)) { case PTR_TO_PACKET: case PTR_TO_PACKET_META: - return check_packet_access(env, reg, regno, 0, access_size, + return check_packet_access(env, reg, argno, 0, access_size, zero_size_allowed); case PTR_TO_MAP_KEY: if (access_type == BPF_WRITE) { - verbose(env, "R%d cannot write into %s\n", regno, - reg_type_str(env, reg->type)); + verbose(env, "%s cannot write into %s\n", + reg_arg_name(env, argno), reg_type_str(env, reg->type)); return -EACCES; } - return check_mem_region_access(env, reg, regno, 0, access_size, + return check_mem_region_access(env, reg, argno, 0, access_size, reg->map_ptr->key_size, false); case PTR_TO_MAP_VALUE: if (check_map_access_type(env, reg, 0, access_size, access_type)) return -EACCES; - return check_map_access(env, reg, regno, 0, access_size, + return check_map_access(env, reg, argno, 0, access_size, zero_size_allowed, ACCESS_HELPER); case PTR_TO_MEM: if (type_is_rdonly_mem(reg->type)) { if (access_type == BPF_WRITE) { - verbose(env, "R%d cannot write into %s\n", regno, - reg_type_str(env, reg->type)); + verbose(env, "%s cannot write into %s\n", + reg_arg_name(env, argno), reg_type_str(env, reg->type)); return -EACCES; } } - return check_mem_region_access(env, reg, regno, 0, + return check_mem_region_access(env, reg, argno, 0, access_size, reg->mem_size, zero_size_allowed); case PTR_TO_BUF: if (type_is_rdonly_mem(reg->type)) { if (access_type == BPF_WRITE) { - verbose(env, "R%d cannot write into %s\n", regno, - reg_type_str(env, reg->type)); + verbose(env, "%s cannot write into %s\n", + reg_arg_name(env, argno), reg_type_str(env, reg->type)); return -EACCES; } @@ -6983,21 +7030,21 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, struct bpf_reg_ } else { max_access = &env->prog->aux->max_rdwr_access; } - return check_buffer_access(env, reg, regno, 0, + return check_buffer_access(env, reg, argno, 0, access_size, zero_size_allowed, max_access); case PTR_TO_STACK: return check_stack_range_initialized( env, reg, - regno, 0, access_size, + argno, 0, access_size, zero_size_allowed, access_type, meta); case PTR_TO_BTF_ID: - return check_ptr_to_btf_access(env, regs, reg, regno, 0, + return check_ptr_to_btf_access(env, regs, reg, argno, 0, access_size, BPF_READ, -1); case PTR_TO_CTX: /* Only permit reading or writing syscall context using helper calls. */ if (is_var_ctx_off_allowed(env->prog)) { - int err = check_mem_region_access(env, reg, regno, 0, access_size, U16_MAX, + int err = check_mem_region_access(env, reg, argno, 0, access_size, U16_MAX, zero_size_allowed); if (err) return err; @@ -7012,7 +7059,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, struct bpf_reg_ bpf_register_is_null(reg)) return 0; - verbose(env, "R%d type=%s ", regno, + verbose(env, "%s type=%s ", reg_arg_name(env, argno), reg_type_str(env, reg->type)); verbose(env, "expected=%s\n", reg_type_str(env, PTR_TO_STACK)); return -EACCES; @@ -7026,8 +7073,8 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, struct bpf_reg_ */ static int check_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *mem_reg, - struct bpf_reg_state *size_reg, u32 mem_regno, - u32 size_regno, enum bpf_access_type access_type, + struct bpf_reg_state *size_reg, argno_t mem_argno, + argno_t size_argno, enum bpf_access_type access_type, bool zero_size_allowed, struct bpf_call_arg_meta *meta) { @@ -7052,31 +7099,31 @@ static int check_mem_size_reg(struct bpf_verifier_env *env, meta = NULL; if (size_reg->smin_value < 0) { - verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n", - size_regno); + verbose(env, "%s min value is negative, either use unsigned or 'var &= const'\n", + reg_arg_name(env, size_argno)); return -EACCES; } if (size_reg->umin_value == 0 && !zero_size_allowed) { - verbose(env, "R%d invalid zero-sized read: u64=[%lld,%lld]\n", - size_regno, size_reg->umin_value, size_reg->umax_value); + verbose(env, "%s invalid zero-sized read: u64=[%lld,%lld]\n", + reg_arg_name(env, size_argno), size_reg->umin_value, size_reg->umax_value); return -EACCES; } if (size_reg->umax_value >= BPF_MAX_VAR_SIZ) { - verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n", - size_regno); + verbose(env, "%s unbounded memory access, use 'var &= const' or 'if (var < const)'\n", + reg_arg_name(env, size_argno)); return -EACCES; } - err = check_helper_mem_access(env, mem_reg, mem_regno, size_reg->umax_value, + err = check_helper_mem_access(env, mem_reg, mem_argno, size_reg->umax_value, access_type, zero_size_allowed, meta); if (!err) - err = mark_chain_precision(env, size_regno); + err = mark_chain_precision(env, reg_from_argno(size_argno)); return err; } static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, - u32 regno, u32 mem_size) + argno_t argno, u32 mem_size) { bool may_be_null = type_may_be_null(reg->type); struct bpf_reg_state saved_reg; @@ -7096,8 +7143,8 @@ static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg int size = base_type(reg->type) == PTR_TO_STACK ? -(int)mem_size : mem_size; - err = check_helper_mem_access(env, reg, regno, size, BPF_READ, true, NULL); - err = err ?: check_helper_mem_access(env, reg, regno, size, BPF_WRITE, true, NULL); + err = check_helper_mem_access(env, reg, argno, size, BPF_READ, true, NULL); + err = err ?: check_helper_mem_access(env, reg, argno, size, BPF_WRITE, true, NULL); if (may_be_null) *reg = saved_reg; @@ -7106,7 +7153,7 @@ static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg } static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *mem_reg, - struct bpf_reg_state *size_reg, u32 mem_regno, u32 size_regno) + struct bpf_reg_state *size_reg, argno_t mem_argno, argno_t size_argno) { bool may_be_null = type_may_be_null(mem_reg->type); struct bpf_reg_state saved_reg; @@ -7120,8 +7167,8 @@ static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg mark_ptr_not_null_reg(mem_reg); } - err = check_mem_size_reg(env, mem_reg, size_reg, mem_regno, size_regno, BPF_READ, true, &meta); - err = err ?: check_mem_size_reg(env, mem_reg, size_reg, mem_regno, size_regno, BPF_WRITE, true, &meta); + err = check_mem_size_reg(env, mem_reg, size_reg, mem_argno, size_argno, BPF_READ, true, &meta); + err = err ?: check_mem_size_reg(env, mem_reg, size_reg, mem_argno, size_argno, BPF_WRITE, true, &meta); if (may_be_null) *mem_reg = saved_reg; @@ -7157,7 +7204,7 @@ enum { * env->cur_state->active_locks remembers which map value element or allocated * object got locked and clears it after bpf_spin_unlock. */ -static int process_spin_lock(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int regno, int flags) +static int process_spin_lock(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, int flags) { bool is_lock = flags & PROCESS_SPIN_LOCK, is_res_lock = flags & PROCESS_RES_LOCK; const char *lock_str = is_res_lock ? "bpf_res_spin" : "bpf_spin"; @@ -7173,8 +7220,8 @@ static int process_spin_lock(struct bpf_verifier_env *env, struct bpf_reg_state if (!is_const) { verbose(env, - "R%d doesn't have constant offset. %s_lock has to be at the constant offset\n", - regno, lock_str); + "%s doesn't have constant offset. %s_lock has to be at the constant offset\n", + reg_arg_name(env, argno), lock_str); return -EINVAL; } if (reg->type == PTR_TO_MAP_VALUE) { @@ -7273,7 +7320,7 @@ static int process_spin_lock(struct bpf_verifier_env *env, struct bpf_reg_state } /* Check if @regno is a pointer to a specific field in a map value */ -static int check_map_field_pointer(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, +static int check_map_field_pointer(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, enum btf_field_type field_type, struct bpf_map_desc *map_desc) { @@ -7285,8 +7332,8 @@ static int check_map_field_pointer(struct bpf_verifier_env *env, struct bpf_reg_ if (!is_const) { verbose(env, - "R%d doesn't have constant offset. %s has to be at the constant offset\n", - regno, struct_name); + "%s doesn't have constant offset. %s has to be at the constant offset\n", + reg_arg_name(env, argno), struct_name); return -EINVAL; } if (!map->btf) { @@ -7326,26 +7373,26 @@ static int check_map_field_pointer(struct bpf_verifier_env *env, struct bpf_reg_ return 0; } -static int process_timer_func(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int regno, +static int process_timer_func(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, struct bpf_map_desc *map) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) { verbose(env, "bpf_timer cannot be used for PREEMPT_RT.\n"); return -EOPNOTSUPP; } - return check_map_field_pointer(env, reg, regno, BPF_TIMER, map); + return check_map_field_pointer(env, reg, argno, BPF_TIMER, map); } -static int process_timer_helper(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int regno, +static int process_timer_helper(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, struct bpf_call_arg_meta *meta) { - return process_timer_func(env, reg, regno, &meta->map); + return process_timer_func(env, reg, argno, &meta->map); } -static int process_timer_kfunc(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int regno, +static int process_timer_kfunc(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, struct bpf_kfunc_call_arg_meta *meta) { - return process_timer_func(env, reg, regno, &meta->map); + return process_timer_func(env, reg, argno, &meta->map); } static int process_kptr_func(struct bpf_verifier_env *env, int regno, @@ -7410,15 +7457,15 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, * use case. The second level is tracked using the upper bit of bpf_dynptr->size * and checked dynamically during runtime. */ -static int process_dynptr_func(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int regno, int insn_idx, +static int process_dynptr_func(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, int insn_idx, enum bpf_arg_type arg_type, int clone_ref_obj_id) { int err; if (reg->type != PTR_TO_STACK && reg->type != CONST_PTR_TO_DYNPTR) { verbose(env, - "arg#%d expected pointer to stack or const struct bpf_dynptr\n", - regno - 1); + "%s expected pointer to stack or const struct bpf_dynptr\n", + reg_arg_name(env, argno)); return -EINVAL; } @@ -7446,7 +7493,7 @@ static int process_dynptr_func(struct bpf_verifier_env *env, struct bpf_reg_stat /* we write BPF_DW bits (8 bytes) at a time */ for (i = 0; i < BPF_DYNPTR_SIZE; i += 8) { - err = check_mem_access(env, insn_idx, reg, regno, + err = check_mem_access(env, insn_idx, reg, argno, i, BPF_DW, BPF_WRITE, -1, false, false); if (err) return err; @@ -7461,17 +7508,17 @@ static int process_dynptr_func(struct bpf_verifier_env *env, struct bpf_reg_stat } if (!is_dynptr_reg_valid_init(env, reg)) { - verbose(env, - "Expected an initialized dynptr as arg #%d\n", - regno - 1); + verbose(env, "Expected an initialized dynptr as %s\n", + reg_arg_name(env, argno)); return -EINVAL; } /* Fold modifiers (in this case, OBJ_RELEASE) when checking expected type */ if (!is_dynptr_type_expected(env, reg, arg_type & ~OBJ_RELEASE)) { verbose(env, - "Expected a dynptr of type %s as arg #%d\n", - dynptr_type_str(arg_to_dynptr_type(arg_type)), regno - 1); + "Expected a dynptr of type %s as %s\n", + dynptr_type_str(arg_to_dynptr_type(arg_type)), + reg_arg_name(env, argno)); return -EINVAL; } @@ -7516,14 +7563,16 @@ static bool is_kfunc_arg_iter(struct bpf_kfunc_call_arg_meta *meta, int arg_idx, return btf_param_match_suffix(meta->btf, arg, "__iter"); } -static int process_iter_arg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int regno, int insn_idx, +static int process_iter_arg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, int insn_idx, struct bpf_kfunc_call_arg_meta *meta) { const struct btf_type *t; + u32 arg_idx = arg_from_argno(argno) - 1; int spi, err, i, nr_slots, btf_id; if (reg->type != PTR_TO_STACK) { - verbose(env, "arg#%d expected pointer to an iterator on stack\n", regno - 1); + verbose(env, "%s expected pointer to an iterator on stack\n", + reg_arg_name(env, argno)); return -EINVAL; } @@ -7533,9 +7582,10 @@ static int process_iter_arg(struct bpf_verifier_env *env, struct bpf_reg_state * * to any kfunc, if arg has "__iter" suffix, we need to be a bit more * conservative here. */ - btf_id = btf_check_iter_arg(meta->btf, meta->func_proto, regno - 1); + btf_id = btf_check_iter_arg(meta->btf, meta->func_proto, arg_idx); if (btf_id < 0) { - verbose(env, "expected valid iter pointer as arg #%d\n", regno - 1); + verbose(env, "expected valid iter pointer as %s\n", + reg_arg_name(env, argno)); return -EINVAL; } t = btf_type_by_id(meta->btf, btf_id); @@ -7544,13 +7594,13 @@ static int process_iter_arg(struct bpf_verifier_env *env, struct bpf_reg_state * if (is_iter_new_kfunc(meta)) { /* bpf_iter__new() expects pointer to uninit iter state */ if (!is_iter_reg_valid_uninit(env, reg, nr_slots)) { - verbose(env, "expected uninitialized iter_%s as arg #%d\n", - iter_type_str(meta->btf, btf_id), regno - 1); + verbose(env, "expected uninitialized iter_%s as %s\n", + iter_type_str(meta->btf, btf_id), reg_arg_name(env, argno)); return -EINVAL; } for (i = 0; i < nr_slots * 8; i += BPF_REG_SIZE) { - err = check_mem_access(env, insn_idx, reg, regno, + err = check_mem_access(env, insn_idx, reg, argno, i, BPF_DW, BPF_WRITE, -1, false, false); if (err) return err; @@ -7568,8 +7618,8 @@ static int process_iter_arg(struct bpf_verifier_env *env, struct bpf_reg_state * case 0: break; case -EINVAL: - verbose(env, "expected an initialized iter_%s as arg #%d\n", - iter_type_str(meta->btf, btf_id), regno - 1); + verbose(env, "expected an initialized iter_%s as %s\n", + iter_type_str(meta->btf, btf_id), reg_arg_name(env, argno)); return err; case -EPROTO: verbose(env, "expected an RCU CS when using %s\n", meta->func_name); @@ -7989,7 +8039,7 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_DYNPTR] = &dynptr_types, }; -static int check_reg_type(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, +static int check_reg_type(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, enum bpf_arg_type arg_type, const u32 *arg_btf_id, struct bpf_call_arg_meta *meta) @@ -8024,7 +8074,7 @@ static int check_reg_type(struct bpf_verifier_env *env, struct bpf_reg_state *re type &= ~DYNPTR_TYPE_FLAG_MASK; /* Local kptr types are allowed as the source argument of bpf_kptr_xchg */ - if (meta->func_id == BPF_FUNC_kptr_xchg && type_is_alloc(type) && regno == BPF_REG_2) { + if (meta->func_id == BPF_FUNC_kptr_xchg && type_is_alloc(type) && reg_from_argno(argno) == BPF_REG_2) { type &= ~MEM_ALLOC; type &= ~MEM_PERCPU; } @@ -8038,7 +8088,7 @@ static int check_reg_type(struct bpf_verifier_env *env, struct bpf_reg_state *re goto found; } - verbose(env, "R%d type=%s expected=", regno, reg_type_str(env, reg->type)); + verbose(env, "%s type=%s expected=", reg_arg_name(env, argno), reg_type_str(env, reg->type)); for (j = 0; j + 1 < i; j++) verbose(env, "%s, ", reg_type_str(env, compatible->types[j])); verbose(env, "%s\n", reg_type_str(env, compatible->types[j])); @@ -8051,9 +8101,9 @@ found: if (compatible == &mem_types) { if (!(arg_type & MEM_RDONLY)) { verbose(env, - "%s() may write into memory pointed by R%d type=%s\n", + "%s() may write into memory pointed by %s type=%s\n", func_id_name(meta->func_id), - regno, reg_type_str(env, reg->type)); + reg_arg_name(env, argno), reg_type_str(env, reg->type)); return -EACCES; } return 0; @@ -8076,7 +8126,8 @@ found: if (type_may_be_null(reg->type) && (!type_may_be_null(arg_type) || arg_type_is_release(arg_type))) { - verbose(env, "Possibly NULL pointer passed to helper arg%d\n", regno); + verbose(env, "Possibly NULL pointer passed to helper %s\n", + reg_arg_name(env, argno)); return -EACCES; } @@ -8089,25 +8140,26 @@ found: } if (meta->func_id == BPF_FUNC_kptr_xchg) { - if (map_kptr_match_type(env, meta->kptr_field, reg, regno)) + if (map_kptr_match_type(env, meta->kptr_field, reg, reg_from_argno(argno))) return -EACCES; } else { if (arg_btf_id == BPF_PTR_POISON) { verbose(env, "verifier internal error:"); - verbose(env, "R%d has non-overwritten BPF_PTR_POISON type\n", - regno); + verbose(env, "%s has non-overwritten BPF_PTR_POISON type\n", + reg_arg_name(env, argno)); return -EACCES; } - err = __check_ptr_off_reg(env, reg, regno, true); + err = __check_ptr_off_reg(env, reg, argno, true); if (err) return err; if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->var_off.value, btf_vmlinux, *arg_btf_id, strict_type_match)) { - verbose(env, "R%d is of type %s but %s is expected\n", - regno, btf_type_name(reg->btf, reg->btf_id), + verbose(env, "%s is of type %s but %s is expected\n", + reg_arg_name(env, argno), + btf_type_name(reg->btf, reg->btf_id), btf_type_name(btf_vmlinux, *arg_btf_id)); return -EACCES; } @@ -8124,8 +8176,11 @@ found: return -EFAULT; } /* Check if local kptr in src arg matches kptr in dst arg */ - if (meta->func_id == BPF_FUNC_kptr_xchg && regno == BPF_REG_2) { - if (map_kptr_match_type(env, meta->kptr_field, reg, regno)) + if (meta->func_id == BPF_FUNC_kptr_xchg) { + int regno = reg_from_argno(argno); + + if (regno == BPF_REG_2 && + map_kptr_match_type(env, meta->kptr_field, reg, regno)) return -EACCES; } break; @@ -8159,7 +8214,7 @@ reg_find_field_offset(const struct bpf_reg_state *reg, s32 off, u32 fields) } static int check_func_arg_reg_off(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, int regno, + const struct bpf_reg_state *reg, argno_t argno, enum bpf_arg_type arg_type) { u32 type = reg->type; @@ -8185,8 +8240,8 @@ static int check_func_arg_reg_off(struct bpf_verifier_env *env, * to give the user a better error message. */ if (!tnum_is_const(reg->var_off) || reg->var_off.value != 0) { - verbose(env, "R%d must have zero offset when passed to release func or trusted arg to kfunc\n", - regno); + verbose(env, "%s must have zero offset when passed to release func or trusted arg to kfunc\n", + reg_arg_name(env, argno)); return -EINVAL; } } @@ -8222,7 +8277,7 @@ static int check_func_arg_reg_off(struct bpf_verifier_env *env, * cases. var_off always must be 0 for PTR_TO_BTF_ID, hence we * still need to do checks instead of returning. */ - return __check_ptr_off_reg(env, reg, regno, true); + return __check_ptr_off_reg(env, reg, argno, true); case PTR_TO_CTX: /* * Allow fixed and variable offsets for syscall context, but @@ -8234,7 +8289,7 @@ static int check_func_arg_reg_off(struct bpf_verifier_env *env, return 0; fallthrough; default: - return __check_ptr_off_reg(env, reg, regno, false); + return __check_ptr_off_reg(env, reg, argno, false); } } @@ -8304,8 +8359,8 @@ static enum bpf_dynptr_type dynptr_get_type(struct bpf_verifier_env *env, return state->stack[spi].spilled_ptr.dynptr.type; } -static int check_reg_const_str(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, u32 regno) +static int check_arg_const_str(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, argno_t argno) { struct bpf_map *map = reg->map_ptr; int err; @@ -8317,17 +8372,18 @@ static int check_reg_const_str(struct bpf_verifier_env *env, return -EINVAL; if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY) { - verbose(env, "R%d points to insn_array map which cannot be used as const string\n", regno); + verbose(env, "%s points to insn_array map which cannot be used as const string\n", + reg_arg_name(env, argno)); return -EACCES; } if (!bpf_map_is_rdonly(map)) { - verbose(env, "R%d does not point to a readonly map'\n", regno); + verbose(env, "%s does not point to a readonly map'\n", reg_arg_name(env, argno)); return -EACCES; } if (!tnum_is_const(reg->var_off)) { - verbose(env, "R%d is not a constant address'\n", regno); + verbose(env, "%s is not a constant address'\n", reg_arg_name(env, argno)); return -EACCES; } @@ -8336,7 +8392,7 @@ static int check_reg_const_str(struct bpf_verifier_env *env, return -EACCES; } - err = check_map_access(env, reg, regno, 0, + err = check_map_access(env, reg, argno, 0, map->value_size - reg->var_off.value, false, ACCESS_HELPER); if (err) @@ -8472,11 +8528,11 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, base_type(arg_type) == ARG_PTR_TO_SPIN_LOCK) arg_btf_id = fn->arg_btf_id[arg]; - err = check_reg_type(env, reg, regno, arg_type, arg_btf_id, meta); + err = check_reg_type(env, reg, argno_from_reg(regno), arg_type, arg_btf_id, meta); if (err) return err; - err = check_func_arg_reg_off(env, reg, regno, arg_type); + err = check_func_arg_reg_off(env, reg, argno_from_reg(regno), arg_type); if (err) return err; @@ -8565,7 +8621,7 @@ skip_type_check: return -EFAULT; } key_size = meta->map.ptr->key_size; - err = check_helper_mem_access(env, reg, regno, key_size, BPF_READ, false, NULL); + err = check_helper_mem_access(env, reg, argno_from_reg(regno), key_size, BPF_READ, false, NULL); if (err) return err; if (can_elide_value_nullness(meta->map.ptr->map_type)) { @@ -8592,7 +8648,7 @@ skip_type_check: return -EFAULT; } meta->raw_mode = arg_type & MEM_UNINIT; - err = check_helper_mem_access(env, reg, regno, meta->map.ptr->value_size, + err = check_helper_mem_access(env, reg, argno_from_reg(regno), meta->map.ptr->value_size, arg_type & MEM_WRITE ? BPF_WRITE : BPF_READ, false, meta); break; @@ -8610,11 +8666,11 @@ skip_type_check: return -EACCES; } if (meta->func_id == BPF_FUNC_spin_lock) { - err = process_spin_lock(env, reg, regno, PROCESS_SPIN_LOCK); + err = process_spin_lock(env, reg, argno_from_reg(regno), PROCESS_SPIN_LOCK); if (err) return err; } else if (meta->func_id == BPF_FUNC_spin_unlock) { - err = process_spin_lock(env, reg, regno, 0); + err = process_spin_lock(env, reg, argno_from_reg(regno), 0); if (err) return err; } else { @@ -8623,7 +8679,7 @@ skip_type_check: } break; case ARG_PTR_TO_TIMER: - err = process_timer_helper(env, reg, regno, meta); + err = process_timer_helper(env, reg, argno_from_reg(regno), meta); if (err) return err; break; @@ -8636,7 +8692,7 @@ skip_type_check: */ meta->raw_mode = arg_type & MEM_UNINIT; if (arg_type & MEM_FIXED_SIZE) { - err = check_helper_mem_access(env, reg, regno, fn->arg_size[arg], + err = check_helper_mem_access(env, reg, argno_from_reg(regno), fn->arg_size[arg], arg_type & MEM_WRITE ? BPF_WRITE : BPF_READ, false, meta); if (err) @@ -8646,19 +8702,21 @@ skip_type_check: } break; case ARG_CONST_SIZE: - err = check_mem_size_reg(env, reg_state(env, regno - 1), reg, regno - 1, regno, + err = check_mem_size_reg(env, reg_state(env, regno - 1), reg, argno_from_reg(regno - 1), + argno_from_reg(regno), fn->arg_type[arg - 1] & MEM_WRITE ? BPF_WRITE : BPF_READ, false, meta); break; case ARG_CONST_SIZE_OR_ZERO: - err = check_mem_size_reg(env, reg_state(env, regno - 1), reg, regno - 1, regno, + err = check_mem_size_reg(env, reg_state(env, regno - 1), reg, argno_from_reg(regno - 1), + argno_from_reg(regno), fn->arg_type[arg - 1] & MEM_WRITE ? BPF_WRITE : BPF_READ, true, meta); break; case ARG_PTR_TO_DYNPTR: - err = process_dynptr_func(env, reg, regno, insn_idx, arg_type, 0); + err = process_dynptr_func(env, reg, argno_from_reg(regno), insn_idx, arg_type, 0); if (err) return err; break; @@ -8675,7 +8733,7 @@ skip_type_check: break; case ARG_PTR_TO_CONST_STR: { - err = check_reg_const_str(env, reg, regno); + err = check_arg_const_str(env, reg, argno_from_reg(regno)); if (err) return err; break; @@ -9264,13 +9322,14 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, * verifier sees. */ for (i = 0; i < sub->arg_cnt; i++) { + argno_t argno = argno_from_arg(i + 1); u32 regno = i + 1; struct bpf_reg_state *reg = ®s[regno]; struct bpf_subprog_arg_info *arg = &sub->args[i]; if (arg->arg_type == ARG_ANYTHING) { if (reg->type != SCALAR_VALUE) { - bpf_log(log, "R%d is not a scalar\n", regno); + bpf_log(log, "%s is not a scalar\n", reg_arg_name(env, argno)); return -EINVAL; } } else if (arg->arg_type & PTR_UNTRUSTED) { @@ -9280,24 +9339,26 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, * invalid memory access. */ } else if (arg->arg_type == ARG_PTR_TO_CTX) { - ret = check_func_arg_reg_off(env, reg, regno, ARG_PTR_TO_CTX); + ret = check_func_arg_reg_off(env, reg, argno, ARG_PTR_TO_CTX); if (ret < 0) return ret; /* If function expects ctx type in BTF check that caller * is passing PTR_TO_CTX. */ if (reg->type != PTR_TO_CTX) { - bpf_log(log, "arg#%d expects pointer to ctx\n", i); + bpf_log(log, "%s expects pointer to ctx\n", + reg_arg_name(env, argno)); return -EINVAL; } } else if (base_type(arg->arg_type) == ARG_PTR_TO_MEM) { - ret = check_func_arg_reg_off(env, reg, regno, ARG_DONTCARE); + ret = check_func_arg_reg_off(env, reg, argno, ARG_DONTCARE); if (ret < 0) return ret; - if (check_mem_reg(env, reg, regno, arg->mem_size)) + if (check_mem_reg(env, reg, argno, arg->mem_size)) return -EINVAL; if (!(arg->arg_type & PTR_MAYBE_NULL) && (reg->type & PTR_MAYBE_NULL)) { - bpf_log(log, "arg#%d is expected to be non-NULL\n", i); + bpf_log(log, "%s is expected to be non-NULL\n", + reg_arg_name(env, argno)); return -EINVAL; } } else if (base_type(arg->arg_type) == ARG_PTR_TO_ARENA) { @@ -9309,15 +9370,16 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, * run-time debug nightmare. */ if (reg->type != PTR_TO_ARENA && reg->type != SCALAR_VALUE) { - bpf_log(log, "R%d is not a pointer to arena or scalar.\n", regno); + bpf_log(log, "%s is not a pointer to arena or scalar.\n", + reg_arg_name(env, argno)); return -EINVAL; } } else if (arg->arg_type == ARG_PTR_TO_DYNPTR) { - ret = check_func_arg_reg_off(env, reg, regno, ARG_PTR_TO_DYNPTR); + ret = check_func_arg_reg_off(env, reg, argno, ARG_PTR_TO_DYNPTR); if (ret) return ret; - ret = process_dynptr_func(env, reg, regno, -1, arg->arg_type, 0); + ret = process_dynptr_func(env, reg, argno, -1, arg->arg_type, 0); if (ret) return ret; } else if (base_type(arg->arg_type) == ARG_PTR_TO_BTF_ID) { @@ -9328,12 +9390,13 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, continue; memset(&meta, 0, sizeof(meta)); /* leave func_id as zero */ - err = check_reg_type(env, reg, regno, arg->arg_type, &arg->btf_id, &meta); - err = err ?: check_func_arg_reg_off(env, reg, regno, arg->arg_type); + err = check_reg_type(env, reg, argno, arg->arg_type, &arg->btf_id, &meta); + err = err ?: check_func_arg_reg_off(env, reg, argno, arg->arg_type); if (err) return err; } else { - verifier_bug(env, "unrecognized arg#%d type %d", i, arg->arg_type); + verifier_bug(env, "unrecognized %s type %d", + reg_arg_name(env, argno), arg->arg_type); return -EFAULT; } } @@ -10292,7 +10355,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn * is inferred from register state. */ for (i = 0; i < meta.access_size; i++) { - err = check_mem_access(env, insn_idx, regs + meta.regno, meta.regno, i, BPF_B, + err = check_mem_access(env, insn_idx, regs + meta.regno, argno_from_reg(meta.regno), i, BPF_B, BPF_WRITE, -1, false, false); if (err) return err; @@ -11301,7 +11364,7 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta, const struct btf_type *t, const struct btf_type *ref_t, const char *ref_tname, const struct btf_param *args, - int arg, int nargs, struct bpf_reg_state *reg) + int arg, int nargs, argno_t argno, struct bpf_reg_state *reg) { u32 regno = arg + 1; struct bpf_reg_state *regs = cur_regs(env); @@ -11376,8 +11439,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) { if (!btf_type_is_struct(ref_t)) { - verbose(env, "kernel function %s args#%d pointer type %s %s is not supported\n", - meta->func_name, arg, btf_type_str(ref_t), ref_tname); + verbose(env, "kernel function %s %s pointer type %s %s is not supported\n", + meta->func_name, reg_arg_name(env, argno), + btf_type_str(ref_t), ref_tname); return -EINVAL; } return KF_ARG_PTR_TO_BTF_ID; @@ -11393,8 +11457,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, */ if (!btf_type_is_scalar(ref_t) && !__btf_type_is_scalar_struct(env, meta->btf, ref_t, 0) && (arg_mem_size ? !btf_type_is_void(ref_t) : 1)) { - verbose(env, "arg#%d pointer type %s %s must point to %sscalar, or struct with scalar\n", - arg, btf_type_str(ref_t), ref_tname, arg_mem_size ? "void, " : ""); + verbose(env, "%s pointer type %s %s must point to %sscalar, or struct with scalar\n", + reg_arg_name(env, argno), + btf_type_str(ref_t), ref_tname, arg_mem_size ? "void, " : ""); return -EINVAL; } return arg_mem_size ? KF_ARG_PTR_TO_MEM_SIZE : KF_ARG_PTR_TO_MEM; @@ -11405,7 +11470,7 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env, const struct btf_type *ref_t, const char *ref_tname, u32 ref_id, struct bpf_kfunc_call_arg_meta *meta, - int arg) + int arg, argno_t argno) { const struct btf_type *reg_ref_t; bool strict_type_match = false; @@ -11463,15 +11528,16 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env, */ taking_projection = btf_is_projection_of(ref_tname, reg_ref_tname); if (!taking_projection && !struct_same) { - verbose(env, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n", - meta->func_name, arg, btf_type_str(ref_t), ref_tname, arg + 1, + verbose(env, "kernel function %s %s expected pointer to %s %s but %s has a pointer to %s %s\n", + meta->func_name, reg_arg_name(env, argno), + btf_type_str(ref_t), ref_tname, reg_arg_name(env, argno), btf_type_str(reg_ref_t), reg_ref_tname); return -EINVAL; } return 0; } -static int process_irq_flag(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int regno, +static int process_irq_flag(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, struct bpf_kfunc_call_arg_meta *meta) { int err, kfunc_class = IRQ_NATIVE_KFUNC; @@ -11494,11 +11560,13 @@ static int process_irq_flag(struct bpf_verifier_env *env, struct bpf_reg_state * if (irq_save) { if (!is_irq_flag_reg_valid_uninit(env, reg)) { - verbose(env, "expected uninitialized irq flag as arg#%d\n", regno - 1); + verbose(env, "expected uninitialized irq flag as %s\n", + reg_arg_name(env, argno)); return -EINVAL; } - err = check_mem_access(env, env->insn_idx, reg, regno, 0, BPF_DW, BPF_WRITE, -1, false, false); + err = check_mem_access(env, env->insn_idx, reg, argno, 0, BPF_DW, + BPF_WRITE, -1, false, false); if (err) return err; @@ -11508,7 +11576,8 @@ static int process_irq_flag(struct bpf_verifier_env *env, struct bpf_reg_state * } else { err = is_irq_flag_reg_valid_init(env, reg); if (err) { - verbose(env, "expected an initialized irq flag as arg#%d\n", regno - 1); + verbose(env, "expected an initialized irq flag as %s\n", + reg_arg_name(env, argno)); return err; } @@ -11799,7 +11868,7 @@ static bool check_kfunc_is_graph_node_api(struct bpf_verifier_env *env, static int __process_kf_arg_ptr_to_graph_root(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, u32 regno, + struct bpf_reg_state *reg, argno_t argno, struct bpf_kfunc_call_arg_meta *meta, enum btf_field_type head_field_type, struct btf_field **head_field) @@ -11820,8 +11889,8 @@ __process_kf_arg_ptr_to_graph_root(struct bpf_verifier_env *env, head_type_name = btf_field_type_name(head_field_type); if (!tnum_is_const(reg->var_off)) { verbose(env, - "R%d doesn't have constant offset. %s has to be at the constant offset\n", - regno, head_type_name); + "%s doesn't have constant offset. %s has to be at the constant offset\n", + reg_arg_name(env, argno), head_type_name); return -EINVAL; } @@ -11849,24 +11918,24 @@ __process_kf_arg_ptr_to_graph_root(struct bpf_verifier_env *env, } static int process_kf_arg_ptr_to_list_head(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, u32 regno, + struct bpf_reg_state *reg, argno_t argno, struct bpf_kfunc_call_arg_meta *meta) { - return __process_kf_arg_ptr_to_graph_root(env, reg, regno, meta, BPF_LIST_HEAD, + return __process_kf_arg_ptr_to_graph_root(env, reg, argno, meta, BPF_LIST_HEAD, &meta->arg_list_head.field); } static int process_kf_arg_ptr_to_rbtree_root(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, u32 regno, + struct bpf_reg_state *reg, argno_t argno, struct bpf_kfunc_call_arg_meta *meta) { - return __process_kf_arg_ptr_to_graph_root(env, reg, regno, meta, BPF_RB_ROOT, + return __process_kf_arg_ptr_to_graph_root(env, reg, argno, meta, BPF_RB_ROOT, &meta->arg_rbtree_root.field); } static int __process_kf_arg_ptr_to_graph_node(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, u32 regno, + struct bpf_reg_state *reg, argno_t argno, struct bpf_kfunc_call_arg_meta *meta, enum btf_field_type head_field_type, enum btf_field_type node_field_type, @@ -11888,8 +11957,8 @@ __process_kf_arg_ptr_to_graph_node(struct bpf_verifier_env *env, node_type_name = btf_field_type_name(node_field_type); if (!tnum_is_const(reg->var_off)) { verbose(env, - "R%d doesn't have constant offset. %s has to be at the constant offset\n", - regno, node_type_name); + "%s doesn't have constant offset. %s has to be at the constant offset\n", + reg_arg_name(env, argno), node_type_name); return -EINVAL; } @@ -11930,19 +11999,19 @@ __process_kf_arg_ptr_to_graph_node(struct bpf_verifier_env *env, } static int process_kf_arg_ptr_to_list_node(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, u32 regno, + struct bpf_reg_state *reg, argno_t argno, struct bpf_kfunc_call_arg_meta *meta) { - return __process_kf_arg_ptr_to_graph_node(env, reg, regno, meta, + return __process_kf_arg_ptr_to_graph_node(env, reg, argno, meta, BPF_LIST_HEAD, BPF_LIST_NODE, &meta->arg_list_head.field); } static int process_kf_arg_ptr_to_rbtree_node(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, u32 regno, + struct bpf_reg_state *reg, argno_t argno, struct bpf_kfunc_call_arg_meta *meta) { - return __process_kf_arg_ptr_to_graph_node(env, reg, regno, meta, + return __process_kf_arg_ptr_to_graph_node(env, reg, argno, meta, BPF_RB_ROOT, BPF_RB_NODE, &meta->arg_rbtree_root.field); } @@ -11994,6 +12063,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[i + 1]; const struct btf_type *t, *ref_t, *resolve_ret; enum bpf_arg_type arg_type = ARG_DONTCARE; + argno_t argno = argno_from_arg(i + 1); u32 regno = i + 1, ref_id, type_size; bool is_ret_buf_sz = false; int kf_arg_type; @@ -12016,7 +12086,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (btf_type_is_scalar(t)) { if (reg->type != SCALAR_VALUE) { - verbose(env, "R%d is not a scalar\n", regno); + verbose(env, "%s is not a scalar\n", reg_arg_name(env, argno)); return -EINVAL; } @@ -12026,7 +12096,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return -EFAULT; } if (!tnum_is_const(reg->var_off)) { - verbose(env, "R%d must be a known constant\n", regno); + verbose(env, "%s must be a known constant\n", + reg_arg_name(env, argno)); return -EINVAL; } ret = mark_chain_precision(env, regno); @@ -12048,7 +12119,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } if (!tnum_is_const(reg->var_off)) { - verbose(env, "R%d is not a const\n", regno); + verbose(env, "%s is not a const\n", + reg_arg_name(env, argno)); return -EINVAL; } @@ -12061,20 +12133,22 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } if (!btf_type_is_ptr(t)) { - verbose(env, "Unrecognized arg#%d type %s\n", i, btf_type_str(t)); + verbose(env, "Unrecognized %s type %s\n", + reg_arg_name(env, argno), btf_type_str(t)); return -EINVAL; } if ((bpf_register_is_null(reg) || type_may_be_null(reg->type)) && !is_kfunc_arg_nullable(meta->btf, &args[i])) { - verbose(env, "Possibly NULL pointer passed to trusted arg%d\n", i); + verbose(env, "Possibly NULL pointer passed to trusted %s\n", + reg_arg_name(env, argno)); return -EACCES; } if (reg->ref_obj_id) { if (is_kfunc_release(meta) && meta->ref_obj_id) { - verifier_bug(env, "more than one arg with ref_obj_id R%d %u %u", - regno, reg->ref_obj_id, + verifier_bug(env, "more than one arg with ref_obj_id %s %u %u", + reg_arg_name(env, argno), reg->ref_obj_id, meta->ref_obj_id); return -EFAULT; } @@ -12086,7 +12160,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id); ref_tname = btf_name_by_offset(btf, ref_t->name_off); - kf_arg_type = get_kfunc_ptr_arg_type(env, meta, t, ref_t, ref_tname, args, i, nargs, reg); + kf_arg_type = get_kfunc_ptr_arg_type(env, meta, t, ref_t, ref_tname, args, i, nargs, argno, reg); if (kf_arg_type < 0) return kf_arg_type; @@ -12095,7 +12169,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ continue; case KF_ARG_PTR_TO_MAP: if (!reg->map_ptr) { - verbose(env, "pointer in R%d isn't map pointer\n", regno); + verbose(env, "pointer in %s isn't map pointer\n", + reg_arg_name(env, argno)); return -EINVAL; } if (meta->map.ptr && (reg->map_ptr->record->wq_off >= 0 || @@ -12133,11 +12208,13 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_BTF_ID: if (!is_trusted_reg(reg)) { if (!is_kfunc_rcu(meta)) { - verbose(env, "R%d must be referenced or trusted\n", regno); + verbose(env, "%s must be referenced or trusted\n", + reg_arg_name(env, argno)); return -EINVAL; } if (!is_rcu_reg(reg)) { - verbose(env, "R%d must be a rcu pointer\n", regno); + verbose(env, "%s must be a rcu pointer\n", + reg_arg_name(env, argno)); return -EINVAL; } } @@ -12169,15 +12246,15 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (is_kfunc_release(meta) && reg->ref_obj_id) arg_type |= OBJ_RELEASE; - ret = check_func_arg_reg_off(env, reg, regno, arg_type); + ret = check_func_arg_reg_off(env, reg, argno, arg_type); if (ret < 0) return ret; switch (kf_arg_type) { case KF_ARG_PTR_TO_CTX: if (reg->type != PTR_TO_CTX) { - verbose(env, "arg#%d expected pointer to ctx, but got %s\n", - i, reg_type_str(env, reg->type)); + verbose(env, "%s expected pointer to ctx, but got %s\n", + reg_arg_name(env, argno), reg_type_str(env, reg->type)); return -EINVAL; } @@ -12191,16 +12268,19 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_ALLOC_BTF_ID: if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC)) { if (!is_bpf_obj_drop_kfunc(meta->func_id)) { - verbose(env, "arg#%d expected for bpf_obj_drop()\n", i); + verbose(env, "%s expected for bpf_obj_drop()\n", + reg_arg_name(env, argno)); return -EINVAL; } } else if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC | MEM_PERCPU)) { if (!is_bpf_percpu_obj_drop_kfunc(meta->func_id)) { - verbose(env, "arg#%d expected for bpf_percpu_obj_drop()\n", i); + verbose(env, "%s expected for bpf_percpu_obj_drop()\n", + reg_arg_name(env, argno)); return -EINVAL; } } else { - verbose(env, "arg#%d expected pointer to allocated object\n", i); + verbose(env, "%s expected pointer to allocated object\n", + reg_arg_name(env, argno)); return -EINVAL; } if (!reg->ref_obj_id) { @@ -12248,7 +12328,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } } - ret = process_dynptr_func(env, reg, regno, insn_idx, dynptr_arg_type, clone_ref_obj_id); + ret = process_dynptr_func(env, reg, argno, insn_idx, + dynptr_arg_type, clone_ref_obj_id); if (ret < 0) return ret; @@ -12273,55 +12354,59 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return -EINVAL; } } - ret = process_iter_arg(env, reg, regno, insn_idx, meta); + ret = process_iter_arg(env, reg, argno, insn_idx, meta); if (ret < 0) return ret; break; case KF_ARG_PTR_TO_LIST_HEAD: if (reg->type != PTR_TO_MAP_VALUE && reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { - verbose(env, "arg#%d expected pointer to map value or allocated object\n", i); + verbose(env, "%s expected pointer to map value or allocated object\n", + reg_arg_name(env, argno)); return -EINVAL; } if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC) && !reg->ref_obj_id) { verbose(env, "allocated object must be referenced\n"); return -EINVAL; } - ret = process_kf_arg_ptr_to_list_head(env, reg, regno, meta); + ret = process_kf_arg_ptr_to_list_head(env, reg, argno, meta); if (ret < 0) return ret; break; case KF_ARG_PTR_TO_RB_ROOT: if (reg->type != PTR_TO_MAP_VALUE && reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { - verbose(env, "arg#%d expected pointer to map value or allocated object\n", i); + verbose(env, "%s expected pointer to map value or allocated object\n", + reg_arg_name(env, argno)); return -EINVAL; } if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC) && !reg->ref_obj_id) { verbose(env, "allocated object must be referenced\n"); return -EINVAL; } - ret = process_kf_arg_ptr_to_rbtree_root(env, reg, regno, meta); + ret = process_kf_arg_ptr_to_rbtree_root(env, reg, argno, meta); if (ret < 0) return ret; break; case KF_ARG_PTR_TO_LIST_NODE: if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { - verbose(env, "arg#%d expected pointer to allocated object\n", i); + verbose(env, "%s expected pointer to allocated object\n", + reg_arg_name(env, argno)); return -EINVAL; } if (!reg->ref_obj_id) { verbose(env, "allocated object must be referenced\n"); return -EINVAL; } - ret = process_kf_arg_ptr_to_list_node(env, reg, regno, meta); + ret = process_kf_arg_ptr_to_list_node(env, reg, argno, meta); if (ret < 0) return ret; break; case KF_ARG_PTR_TO_RB_NODE: if (is_bpf_rbtree_add_kfunc(meta->func_id)) { if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { - verbose(env, "arg#%d expected pointer to allocated object\n", i); + verbose(env, "%s expected pointer to allocated object\n", + reg_arg_name(env, argno)); return -EINVAL; } if (!reg->ref_obj_id) { @@ -12339,7 +12424,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } } - ret = process_kf_arg_ptr_to_rbtree_node(env, reg, regno, meta); + ret = process_kf_arg_ptr_to_rbtree_node(env, reg, argno, meta); if (ret < 0) return ret; break; @@ -12354,24 +12439,26 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if ((base_type(reg->type) != PTR_TO_BTF_ID || (bpf_type_has_unsafe_modifiers(reg->type) && !is_rcu_reg(reg))) && !reg2btf_ids[base_type(reg->type)]) { - verbose(env, "arg#%d is %s ", i, reg_type_str(env, reg->type)); + verbose(env, "%s is %s ", reg_arg_name(env, argno), + reg_type_str(env, reg->type)); verbose(env, "expected %s or socket\n", reg_type_str(env, base_type(reg->type) | (type_flag(reg->type) & BPF_REG_TRUSTED_MODIFIERS))); return -EINVAL; } - ret = process_kf_arg_ptr_to_btf_id(env, reg, ref_t, ref_tname, ref_id, meta, i); + ret = process_kf_arg_ptr_to_btf_id(env, reg, ref_t, ref_tname, ref_id, meta, i, argno); if (ret < 0) return ret; break; case KF_ARG_PTR_TO_MEM: resolve_ret = btf_resolve_size(btf, ref_t, &type_size); if (IS_ERR(resolve_ret)) { - verbose(env, "arg#%d reference type('%s %s') size cannot be determined: %ld\n", - i, btf_type_str(ref_t), ref_tname, PTR_ERR(resolve_ret)); + verbose(env, "%s reference type('%s %s') size cannot be determined: %ld\n", + reg_arg_name(env, argno), btf_type_str(ref_t), + ref_tname, PTR_ERR(resolve_ret)); return -EINVAL; } - ret = check_mem_reg(env, reg, regno, type_size); + ret = check_mem_reg(env, reg, argno, type_size); if (ret < 0) return ret; break; @@ -12381,11 +12468,15 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ const struct btf_param *buff_arg = &args[i]; struct bpf_reg_state *size_reg = ®s[regno + 1]; const struct btf_param *size_arg = &args[i + 1]; + argno_t next_argno = argno_from_arg(i + 2); if (!bpf_register_is_null(buff_reg) || !is_kfunc_arg_nullable(meta->btf, buff_arg)) { - ret = check_kfunc_mem_size_reg(env, buff_reg, size_reg, regno, regno + 1); + ret = check_kfunc_mem_size_reg(env, buff_reg, size_reg, + argno, next_argno); if (ret < 0) { - verbose(env, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", i, i + 1); + verbose(env, "%s and ", reg_arg_name(env, argno)); + verbose(env, "%s memory, len pair leads to invalid memory access\n", + reg_arg_name(env, next_argno)); return ret; } } @@ -12396,7 +12487,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return -EFAULT; } if (!tnum_is_const(size_reg->var_off)) { - verbose(env, "R%d must be a known constant\n", regno + 1); + verbose(env, "%s must be a known constant\n", + reg_arg_name(env, next_argno)); return -EINVAL; } meta->arg_constant.found = true; @@ -12409,14 +12501,15 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } case KF_ARG_PTR_TO_CALLBACK: if (reg->type != PTR_TO_FUNC) { - verbose(env, "arg%d expected pointer to func\n", i); + verbose(env, "%s expected pointer to func\n", reg_arg_name(env, argno)); return -EINVAL; } meta->subprogno = reg->subprogno; break; case KF_ARG_PTR_TO_REFCOUNTED_KPTR: if (!type_is_ptr_alloc_obj(reg->type)) { - verbose(env, "arg#%d is neither owning or non-owning ref\n", i); + verbose(env, "%s is neither owning or non-owning ref\n", + reg_arg_name(env, argno)); return -EINVAL; } if (!type_is_non_owning_ref(reg->type)) @@ -12429,7 +12522,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } if (rec->refcount_off < 0) { - verbose(env, "arg#%d doesn't point to a type with bpf_refcount field\n", i); + verbose(env, "%s doesn't point to a type with bpf_refcount field\n", + reg_arg_name(env, argno)); return -EINVAL; } @@ -12438,46 +12532,51 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ break; case KF_ARG_PTR_TO_CONST_STR: if (reg->type != PTR_TO_MAP_VALUE) { - verbose(env, "arg#%d doesn't point to a const string\n", i); + verbose(env, "%s doesn't point to a const string\n", + reg_arg_name(env, argno)); return -EINVAL; } - ret = check_reg_const_str(env, reg, regno); + ret = check_arg_const_str(env, reg, argno); if (ret) return ret; break; case KF_ARG_PTR_TO_WORKQUEUE: if (reg->type != PTR_TO_MAP_VALUE) { - verbose(env, "arg#%d doesn't point to a map value\n", i); + verbose(env, "%s doesn't point to a map value\n", + reg_arg_name(env, argno)); return -EINVAL; } - ret = check_map_field_pointer(env, reg, regno, BPF_WORKQUEUE, &meta->map); + ret = check_map_field_pointer(env, reg, argno, BPF_WORKQUEUE, &meta->map); if (ret < 0) return ret; break; case KF_ARG_PTR_TO_TIMER: if (reg->type != PTR_TO_MAP_VALUE) { - verbose(env, "arg#%d doesn't point to a map value\n", i); + verbose(env, "%s doesn't point to a map value\n", + reg_arg_name(env, argno)); return -EINVAL; } - ret = process_timer_kfunc(env, reg, regno, meta); + ret = process_timer_kfunc(env, reg, argno, meta); if (ret < 0) return ret; break; case KF_ARG_PTR_TO_TASK_WORK: if (reg->type != PTR_TO_MAP_VALUE) { - verbose(env, "arg#%d doesn't point to a map value\n", i); + verbose(env, "%s doesn't point to a map value\n", + reg_arg_name(env, argno)); return -EINVAL; } - ret = check_map_field_pointer(env, reg, regno, BPF_TASK_WORK, &meta->map); + ret = check_map_field_pointer(env, reg, argno, BPF_TASK_WORK, &meta->map); if (ret < 0) return ret; break; case KF_ARG_PTR_TO_IRQ_FLAG: if (reg->type != PTR_TO_STACK) { - verbose(env, "arg#%d doesn't point to an irq flag on stack\n", i); + verbose(env, "%s doesn't point to an irq flag on stack\n", + reg_arg_name(env, argno)); return -EINVAL; } - ret = process_irq_flag(env, reg, regno, meta); + ret = process_irq_flag(env, reg, argno, meta); if (ret < 0) return ret; break; @@ -12486,7 +12585,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ int flags = PROCESS_RES_LOCK; if (reg->type != PTR_TO_MAP_VALUE && reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { - verbose(env, "arg#%d doesn't point to map value or allocated object\n", i); + verbose(env, "%s doesn't point to map value or allocated object\n", + reg_arg_name(env, argno)); return -EINVAL; } @@ -12498,7 +12598,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave] || meta->func_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore]) flags |= PROCESS_LOCK_IRQ; - ret = process_spin_lock(env, reg, regno, flags); + ret = process_spin_lock(env, reg, argno, flags); if (ret < 0) return ret; break; @@ -13649,7 +13749,7 @@ static int sanitize_check_bounds(struct bpf_verifier_env *env, return -EACCES; break; case PTR_TO_MAP_VALUE: - if (check_map_access(env, dst_reg, dst, 0, 1, false, ACCESS_HELPER)) { + if (check_map_access(env, dst_reg, argno_from_reg(dst), 0, 1, false, ACCESS_HELPER)) { verbose(env, "R%d pointer arithmetic of map value goes out of range, " "prohibited for !root\n", dst); return -EACCES; @@ -16831,7 +16931,7 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char prog->aux->attach_func_proto->type, NULL); if (ret_type && ret_type == reg_type && reg->ref_obj_id) - return __check_ptr_off_reg(env, reg, regno, false); + return __check_ptr_off_reg(env, reg, argno_from_reg(regno), false); } /* eBPF calling convention is such that R0 is used @@ -17535,7 +17635,7 @@ static int do_check_insn(struct bpf_verifier_env *env, bool *do_print_state) dst_reg_type = cur_regs(env)[insn->dst_reg].type; - err = check_mem_access(env, env->insn_idx, cur_regs(env) + insn->dst_reg, insn->dst_reg, + err = check_mem_access(env, env->insn_idx, cur_regs(env) + insn->dst_reg, argno_from_reg(insn->dst_reg), insn->off, BPF_SIZE(insn->code), BPF_WRITE, -1, false, false); if (err) @@ -18714,7 +18814,7 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) mark_reg_unknown(env, regs, i); } else { verifier_bug(env, "unhandled arg#%d type %d", - i - BPF_REG_1, arg->arg_type); + i - BPF_REG_1 + 1, arg->arg_type); ret = -EFAULT; goto out; } diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_nf.c b/tools/testing/selftests/bpf/prog_tests/bpf_nf.c index 215878ea04de..b33dba4b126e 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_nf.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_nf.c @@ -11,18 +11,18 @@ struct { const char *prog_name; const char *err_msg; } test_bpf_nf_fail_tests[] = { - { "alloc_release", "kernel function bpf_ct_release args#0 expected pointer to STRUCT nf_conn but" }, - { "insert_insert", "kernel function bpf_ct_insert_entry args#0 expected pointer to STRUCT nf_conn___init but" }, - { "lookup_insert", "kernel function bpf_ct_insert_entry args#0 expected pointer to STRUCT nf_conn___init but" }, - { "set_timeout_after_insert", "kernel function bpf_ct_set_timeout args#0 expected pointer to STRUCT nf_conn___init but" }, - { "set_status_after_insert", "kernel function bpf_ct_set_status args#0 expected pointer to STRUCT nf_conn___init but" }, - { "change_timeout_after_alloc", "kernel function bpf_ct_change_timeout args#0 expected pointer to STRUCT nf_conn but" }, - { "change_status_after_alloc", "kernel function bpf_ct_change_status args#0 expected pointer to STRUCT nf_conn but" }, + { "alloc_release", "kernel function bpf_ct_release R1 expected pointer to STRUCT nf_conn but" }, + { "insert_insert", "kernel function bpf_ct_insert_entry R1 expected pointer to STRUCT nf_conn___init but" }, + { "lookup_insert", "kernel function bpf_ct_insert_entry R1 expected pointer to STRUCT nf_conn___init but" }, + { "set_timeout_after_insert", "kernel function bpf_ct_set_timeout R1 expected pointer to STRUCT nf_conn___init but" }, + { "set_status_after_insert", "kernel function bpf_ct_set_status R1 expected pointer to STRUCT nf_conn___init but" }, + { "change_timeout_after_alloc", "kernel function bpf_ct_change_timeout R1 expected pointer to STRUCT nf_conn but" }, + { "change_status_after_alloc", "kernel function bpf_ct_change_status R1 expected pointer to STRUCT nf_conn but" }, { "write_not_allowlisted_field", "no write support to nf_conn at off" }, - { "lookup_null_bpf_tuple", "Possibly NULL pointer passed to trusted arg1" }, - { "lookup_null_bpf_opts", "Possibly NULL pointer passed to trusted arg3" }, - { "xdp_lookup_null_bpf_tuple", "Possibly NULL pointer passed to trusted arg1" }, - { "xdp_lookup_null_bpf_opts", "Possibly NULL pointer passed to trusted arg3" }, + { "lookup_null_bpf_tuple", "Possibly NULL pointer passed to trusted R2" }, + { "lookup_null_bpf_opts", "Possibly NULL pointer passed to trusted R4" }, + { "xdp_lookup_null_bpf_tuple", "Possibly NULL pointer passed to trusted R2" }, + { "xdp_lookup_null_bpf_opts", "Possibly NULL pointer passed to trusted R4" }, }; enum { diff --git a/tools/testing/selftests/bpf/prog_tests/cb_refs.c b/tools/testing/selftests/bpf/prog_tests/cb_refs.c index c40df623a8f7..6300b67a3a84 100644 --- a/tools/testing/selftests/bpf/prog_tests/cb_refs.c +++ b/tools/testing/selftests/bpf/prog_tests/cb_refs.c @@ -12,7 +12,7 @@ struct { const char *err_msg; } cb_refs_tests[] = { { "underflow_prog", "must point to scalar, or struct with scalar" }, - { "leak_prog", "Possibly NULL pointer passed to helper arg2" }, + { "leak_prog", "Possibly NULL pointer passed to helper R2" }, { "nested_cb", "Unreleased reference id=4 alloc_insn=2" }, /* alloc_insn=2{4,5} */ { "non_cb_transfer_ref", "Unreleased reference id=4 alloc_insn=1" }, /* alloc_insn=1{1,2} */ }; diff --git a/tools/testing/selftests/bpf/prog_tests/kfunc_call.c b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c index 62f3fb79f5d1..3df07680f9e0 100644 --- a/tools/testing/selftests/bpf/prog_tests/kfunc_call.c +++ b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c @@ -68,7 +68,7 @@ static struct kfunc_test_params kfunc_tests[] = { TC_FAIL(kfunc_call_test_get_mem_fail_oob, 0, "min value is outside of the allowed memory range"), TC_FAIL(kfunc_call_test_get_mem_fail_not_const, 0, "is not a const"), TC_FAIL(kfunc_call_test_mem_acquire_fail, 0, "acquire kernel function does not return PTR_TO_BTF_ID"), - TC_FAIL(kfunc_call_test_pointer_arg_type_mismatch, 0, "arg#0 expected pointer to ctx, but got scalar"), + TC_FAIL(kfunc_call_test_pointer_arg_type_mismatch, 0, "R1 expected pointer to ctx, but got scalar"), /* success cases */ TC_TEST(kfunc_call_test1, 12), diff --git a/tools/testing/selftests/bpf/prog_tests/linked_list.c b/tools/testing/selftests/bpf/prog_tests/linked_list.c index 6f25b5f39a79..dbff099860ba 100644 --- a/tools/testing/selftests/bpf/prog_tests/linked_list.c +++ b/tools/testing/selftests/bpf/prog_tests/linked_list.c @@ -81,8 +81,8 @@ static struct { { "direct_write_node", "direct access to bpf_list_node is disallowed" }, { "use_after_unlock_push_front", "invalid mem access 'scalar'" }, { "use_after_unlock_push_back", "invalid mem access 'scalar'" }, - { "double_push_front", "arg#1 expected pointer to allocated object" }, - { "double_push_back", "arg#1 expected pointer to allocated object" }, + { "double_push_front", "R2 expected pointer to allocated object" }, + { "double_push_back", "R2 expected pointer to allocated object" }, { "no_node_value_type", "bpf_list_node not found at offset=0" }, { "incorrect_value_type", "operation on bpf_list_head expects arg#1 bpf_list_node at offset=48 in struct foo, " diff --git a/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c b/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c index 9fe9c4a4e8f6..a875ba8e5007 100644 --- a/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c +++ b/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c @@ -29,7 +29,7 @@ static struct __cgrps_kfunc_map_value *insert_lookup_cgrp(struct cgroup *cgrp) } SEC("tp_btf/cgroup_mkdir") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") int BPF_PROG(cgrp_kfunc_acquire_untrusted, struct cgroup *cgrp, const char *path) { struct cgroup *acquired; @@ -48,7 +48,7 @@ int BPF_PROG(cgrp_kfunc_acquire_untrusted, struct cgroup *cgrp, const char *path } SEC("tp_btf/cgroup_mkdir") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") int BPF_PROG(cgrp_kfunc_acquire_no_null_check, struct cgroup *cgrp, const char *path) { struct cgroup *acquired; @@ -64,7 +64,7 @@ int BPF_PROG(cgrp_kfunc_acquire_no_null_check, struct cgroup *cgrp, const char * } SEC("tp_btf/cgroup_mkdir") -__failure __msg("arg#0 pointer type STRUCT cgroup must point") +__failure __msg("R1 pointer type STRUCT cgroup must point") int BPF_PROG(cgrp_kfunc_acquire_fp, struct cgroup *cgrp, const char *path) { struct cgroup *acquired, *stack_cgrp = (struct cgroup *)&path; @@ -106,7 +106,7 @@ int BPF_PROG(cgrp_kfunc_acquire_trusted_walked, struct cgroup *cgrp, const char } SEC("tp_btf/cgroup_mkdir") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") int BPF_PROG(cgrp_kfunc_acquire_null, struct cgroup *cgrp, const char *path) { struct cgroup *acquired; @@ -175,7 +175,7 @@ int BPF_PROG(cgrp_kfunc_rcu_get_release, struct cgroup *cgrp, const char *path) } SEC("tp_btf/cgroup_mkdir") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") int BPF_PROG(cgrp_kfunc_release_untrusted, struct cgroup *cgrp, const char *path) { struct __cgrps_kfunc_map_value *v; @@ -191,7 +191,7 @@ int BPF_PROG(cgrp_kfunc_release_untrusted, struct cgroup *cgrp, const char *path } SEC("tp_btf/cgroup_mkdir") -__failure __msg("arg#0 pointer type STRUCT cgroup must point") +__failure __msg("R1 pointer type STRUCT cgroup must point") int BPF_PROG(cgrp_kfunc_release_fp, struct cgroup *cgrp, const char *path) { struct cgroup *acquired = (struct cgroup *)&path; @@ -203,7 +203,7 @@ int BPF_PROG(cgrp_kfunc_release_fp, struct cgroup *cgrp, const char *path) } SEC("tp_btf/cgroup_mkdir") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") int BPF_PROG(cgrp_kfunc_release_null, struct cgroup *cgrp, const char *path) { struct __cgrps_kfunc_map_value local, *v; diff --git a/tools/testing/selftests/bpf/progs/cpumask_failure.c b/tools/testing/selftests/bpf/progs/cpumask_failure.c index 61c32e91e8c3..4c45346fe6f7 100644 --- a/tools/testing/selftests/bpf/progs/cpumask_failure.c +++ b/tools/testing/selftests/bpf/progs/cpumask_failure.c @@ -45,7 +45,7 @@ int BPF_PROG(test_alloc_no_release, struct task_struct *task, u64 clone_flags) } SEC("tp_btf/task_newtask") -__failure __msg("NULL pointer passed to trusted arg0") +__failure __msg("NULL pointer passed to trusted R1") int BPF_PROG(test_alloc_double_release, struct task_struct *task, u64 clone_flags) { struct bpf_cpumask *cpumask; @@ -73,7 +73,7 @@ int BPF_PROG(test_acquire_wrong_cpumask, struct task_struct *task, u64 clone_fla } SEC("tp_btf/task_newtask") -__failure __msg("bpf_cpumask_set_cpu args#1 expected pointer to STRUCT bpf_cpumask") +__failure __msg("bpf_cpumask_set_cpu R2 expected pointer to STRUCT bpf_cpumask") int BPF_PROG(test_mutate_cpumask, struct task_struct *task, u64 clone_flags) { /* Can't set the CPU of a non-struct bpf_cpumask. */ @@ -107,7 +107,7 @@ int BPF_PROG(test_insert_remove_no_release, struct task_struct *task, u64 clone_ } SEC("tp_btf/task_newtask") -__failure __msg("NULL pointer passed to trusted arg0") +__failure __msg("NULL pointer passed to trusted R1") int BPF_PROG(test_cpumask_null, struct task_struct *task, u64 clone_flags) { /* NULL passed to kfunc. */ @@ -151,7 +151,7 @@ int BPF_PROG(test_global_mask_out_of_rcu, struct task_struct *task, u64 clone_fl } SEC("tp_btf/task_newtask") -__failure __msg("NULL pointer passed to trusted arg1") +__failure __msg("NULL pointer passed to trusted R2") int BPF_PROG(test_global_mask_no_null_check, struct task_struct *task, u64 clone_flags) { struct bpf_cpumask *local, *prev; @@ -179,7 +179,7 @@ int BPF_PROG(test_global_mask_no_null_check, struct task_struct *task, u64 clone } SEC("tp_btf/task_newtask") -__failure __msg("Possibly NULL pointer passed to helper arg2") +__failure __msg("Possibly NULL pointer passed to helper R2") int BPF_PROG(test_global_mask_rcu_no_null_check, struct task_struct *task, u64 clone_flags) { struct bpf_cpumask *prev, *curr; diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c index b62773ce5219..dbd97add5a5a 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_fail.c +++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c @@ -149,7 +149,7 @@ int ringbuf_release_uninit_dynptr(void *ctx) /* A dynptr can't be used after it has been invalidated */ SEC("?raw_tp") -__failure __msg("Expected an initialized dynptr as arg #2") +__failure __msg("Expected an initialized dynptr as R3") int use_after_invalid(void *ctx) { struct bpf_dynptr ptr; @@ -448,7 +448,7 @@ int invalid_helper2(void *ctx) /* A bpf_dynptr is invalidated if it's been written into */ SEC("?raw_tp") -__failure __msg("Expected an initialized dynptr as arg #0") +__failure __msg("Expected an initialized dynptr as R1") int invalid_write1(void *ctx) { struct bpf_dynptr ptr; @@ -1642,7 +1642,7 @@ int invalid_slice_rdwr_rdonly(struct __sk_buff *skb) /* bpf_dynptr_adjust can only be called on initialized dynptrs */ SEC("?raw_tp") -__failure __msg("Expected an initialized dynptr as arg #0") +__failure __msg("Expected an initialized dynptr as R1") int dynptr_adjust_invalid(void *ctx) { struct bpf_dynptr ptr = {}; @@ -1655,7 +1655,7 @@ int dynptr_adjust_invalid(void *ctx) /* bpf_dynptr_is_null can only be called on initialized dynptrs */ SEC("?raw_tp") -__failure __msg("Expected an initialized dynptr as arg #0") +__failure __msg("Expected an initialized dynptr as R1") int dynptr_is_null_invalid(void *ctx) { struct bpf_dynptr ptr = {}; @@ -1668,7 +1668,7 @@ int dynptr_is_null_invalid(void *ctx) /* bpf_dynptr_is_rdonly can only be called on initialized dynptrs */ SEC("?raw_tp") -__failure __msg("Expected an initialized dynptr as arg #0") +__failure __msg("Expected an initialized dynptr as R1") int dynptr_is_rdonly_invalid(void *ctx) { struct bpf_dynptr ptr = {}; @@ -1681,7 +1681,7 @@ int dynptr_is_rdonly_invalid(void *ctx) /* bpf_dynptr_size can only be called on initialized dynptrs */ SEC("?raw_tp") -__failure __msg("Expected an initialized dynptr as arg #0") +__failure __msg("Expected an initialized dynptr as R1") int dynptr_size_invalid(void *ctx) { struct bpf_dynptr ptr = {}; @@ -1694,7 +1694,7 @@ int dynptr_size_invalid(void *ctx) /* Only initialized dynptrs can be cloned */ SEC("?raw_tp") -__failure __msg("Expected an initialized dynptr as arg #0") +__failure __msg("Expected an initialized dynptr as R1") int clone_invalid1(void *ctx) { struct bpf_dynptr ptr1 = {}; @@ -1728,7 +1728,7 @@ int clone_invalid2(struct xdp_md *xdp) /* Invalidating a dynptr should invalidate its clones */ SEC("?raw_tp") -__failure __msg("Expected an initialized dynptr as arg #2") +__failure __msg("Expected an initialized dynptr as R3") int clone_invalidate1(void *ctx) { struct bpf_dynptr clone; @@ -1749,7 +1749,7 @@ int clone_invalidate1(void *ctx) /* Invalidating a dynptr should invalidate its parent */ SEC("?raw_tp") -__failure __msg("Expected an initialized dynptr as arg #2") +__failure __msg("Expected an initialized dynptr as R3") int clone_invalidate2(void *ctx) { struct bpf_dynptr ptr; @@ -1770,7 +1770,7 @@ int clone_invalidate2(void *ctx) /* Invalidating a dynptr should invalidate its siblings */ SEC("?raw_tp") -__failure __msg("Expected an initialized dynptr as arg #2") +__failure __msg("Expected an initialized dynptr as R3") int clone_invalidate3(void *ctx) { struct bpf_dynptr ptr; @@ -1981,7 +1981,7 @@ __noinline long global_call_bpf_dynptr(const struct bpf_dynptr *dynptr) } SEC("?raw_tp") -__failure __msg("arg#0 expected pointer to stack or const struct bpf_dynptr") +__failure __msg("R1 expected pointer to stack or const struct bpf_dynptr") int test_dynptr_reg_type(void *ctx) { struct task_struct *current = NULL; diff --git a/tools/testing/selftests/bpf/progs/file_reader_fail.c b/tools/testing/selftests/bpf/progs/file_reader_fail.c index 32fe28ed2439..0739620dea8a 100644 --- a/tools/testing/selftests/bpf/progs/file_reader_fail.c +++ b/tools/testing/selftests/bpf/progs/file_reader_fail.c @@ -30,7 +30,7 @@ int on_nanosleep_unreleased_ref(void *ctx) SEC("xdp") __failure -__msg("Expected a dynptr of type file as arg #0") +__msg("Expected a dynptr of type file as R1") int xdp_wrong_dynptr_type(struct xdp_md *xdp) { struct bpf_dynptr dynptr; @@ -42,7 +42,7 @@ int xdp_wrong_dynptr_type(struct xdp_md *xdp) SEC("xdp") __failure -__msg("Expected an initialized dynptr as arg #0") +__msg("Expected an initialized dynptr as R1") int xdp_no_dynptr_type(struct xdp_md *xdp) { struct bpf_dynptr dynptr; diff --git a/tools/testing/selftests/bpf/progs/irq.c b/tools/testing/selftests/bpf/progs/irq.c index e11e82d98904..a4a007866a33 100644 --- a/tools/testing/selftests/bpf/progs/irq.c +++ b/tools/testing/selftests/bpf/progs/irq.c @@ -15,7 +15,7 @@ struct bpf_res_spin_lock lockA __hidden SEC(".data.A"); struct bpf_res_spin_lock lockB __hidden SEC(".data.B"); SEC("?tc") -__failure __msg("arg#0 doesn't point to an irq flag on stack") +__failure __msg("R1 doesn't point to an irq flag on stack") int irq_save_bad_arg(struct __sk_buff *ctx) { bpf_local_irq_save(&global_flags); @@ -23,7 +23,7 @@ int irq_save_bad_arg(struct __sk_buff *ctx) } SEC("?tc") -__failure __msg("arg#0 doesn't point to an irq flag on stack") +__failure __msg("R1 doesn't point to an irq flag on stack") int irq_restore_bad_arg(struct __sk_buff *ctx) { bpf_local_irq_restore(&global_flags); diff --git a/tools/testing/selftests/bpf/progs/iters.c b/tools/testing/selftests/bpf/progs/iters.c index 86b74e3579d9..0fa70b133d93 100644 --- a/tools/testing/selftests/bpf/progs/iters.c +++ b/tools/testing/selftests/bpf/progs/iters.c @@ -1605,7 +1605,7 @@ int iter_subprog_check_stacksafe(const void *ctx) struct bpf_iter_num global_it; SEC("raw_tp") -__failure __msg("arg#0 expected pointer to an iterator on stack") +__failure __msg("R1 expected pointer to an iterator on stack") int iter_new_bad_arg(const void *ctx) { bpf_iter_num_new(&global_it, 0, 1); @@ -1613,7 +1613,7 @@ int iter_new_bad_arg(const void *ctx) } SEC("raw_tp") -__failure __msg("arg#0 expected pointer to an iterator on stack") +__failure __msg("R1 expected pointer to an iterator on stack") int iter_next_bad_arg(const void *ctx) { bpf_iter_num_next(&global_it); @@ -1621,7 +1621,7 @@ int iter_next_bad_arg(const void *ctx) } SEC("raw_tp") -__failure __msg("arg#0 expected pointer to an iterator on stack") +__failure __msg("R1 expected pointer to an iterator on stack") int iter_destroy_bad_arg(const void *ctx) { bpf_iter_num_destroy(&global_it); diff --git a/tools/testing/selftests/bpf/progs/iters_state_safety.c b/tools/testing/selftests/bpf/progs/iters_state_safety.c index d273b46dfc7c..af8f9ec1ea98 100644 --- a/tools/testing/selftests/bpf/progs/iters_state_safety.c +++ b/tools/testing/selftests/bpf/progs/iters_state_safety.c @@ -73,7 +73,7 @@ int create_and_forget_to_destroy_fail(void *ctx) } SEC("?raw_tp") -__failure __msg("expected an initialized iter_num as arg #0") +__failure __msg("expected an initialized iter_num as R1") int destroy_without_creating_fail(void *ctx) { /* init with zeros to stop verifier complaining about uninit stack */ @@ -91,7 +91,7 @@ int destroy_without_creating_fail(void *ctx) } SEC("?raw_tp") -__failure __msg("expected an initialized iter_num as arg #0") +__failure __msg("expected an initialized iter_num as R1") int compromise_iter_w_direct_write_fail(void *ctx) { struct bpf_iter_num iter; @@ -143,7 +143,7 @@ int compromise_iter_w_direct_write_and_skip_destroy_fail(void *ctx) } SEC("?raw_tp") -__failure __msg("expected an initialized iter_num as arg #0") +__failure __msg("expected an initialized iter_num as R1") int compromise_iter_w_helper_write_fail(void *ctx) { struct bpf_iter_num iter; @@ -230,7 +230,7 @@ int valid_stack_reuse(void *ctx) } SEC("?raw_tp") -__failure __msg("expected uninitialized iter_num as arg #0") +__failure __msg("expected uninitialized iter_num as R1") int double_create_fail(void *ctx) { struct bpf_iter_num iter; @@ -258,7 +258,7 @@ int double_create_fail(void *ctx) } SEC("?raw_tp") -__failure __msg("expected an initialized iter_num as arg #0") +__failure __msg("expected an initialized iter_num as R1") int double_destroy_fail(void *ctx) { struct bpf_iter_num iter; @@ -284,7 +284,7 @@ int double_destroy_fail(void *ctx) } SEC("?raw_tp") -__failure __msg("expected an initialized iter_num as arg #0") +__failure __msg("expected an initialized iter_num as R1") int next_without_new_fail(void *ctx) { struct bpf_iter_num iter; @@ -305,7 +305,7 @@ int next_without_new_fail(void *ctx) } SEC("?raw_tp") -__failure __msg("expected an initialized iter_num as arg #0") +__failure __msg("expected an initialized iter_num as R1") int next_after_destroy_fail(void *ctx) { struct bpf_iter_num iter; diff --git a/tools/testing/selftests/bpf/progs/iters_testmod.c b/tools/testing/selftests/bpf/progs/iters_testmod.c index 5379e9960ffd..76012dbbdb41 100644 --- a/tools/testing/selftests/bpf/progs/iters_testmod.c +++ b/tools/testing/selftests/bpf/progs/iters_testmod.c @@ -29,7 +29,7 @@ out: } SEC("raw_tp/sys_enter") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") int iter_next_trusted_or_null(const void *ctx) { struct task_struct *cur_task = bpf_get_current_task_btf(); @@ -67,7 +67,7 @@ out: } SEC("raw_tp/sys_enter") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") int iter_next_rcu_or_null(const void *ctx) { struct task_struct *cur_task = bpf_get_current_task_btf(); diff --git a/tools/testing/selftests/bpf/progs/iters_testmod_seq.c b/tools/testing/selftests/bpf/progs/iters_testmod_seq.c index 83791348bed5..9b760dac333e 100644 --- a/tools/testing/selftests/bpf/progs/iters_testmod_seq.c +++ b/tools/testing/selftests/bpf/progs/iters_testmod_seq.c @@ -79,7 +79,7 @@ int testmod_seq_truncated(const void *ctx) SEC("?raw_tp") __failure -__msg("expected an initialized iter_testmod_seq as arg #1") +__msg("expected an initialized iter_testmod_seq as R2") int testmod_seq_getter_before_bad(const void *ctx) { struct bpf_iter_testmod_seq it; @@ -89,7 +89,7 @@ int testmod_seq_getter_before_bad(const void *ctx) SEC("?raw_tp") __failure -__msg("expected an initialized iter_testmod_seq as arg #1") +__msg("expected an initialized iter_testmod_seq as R2") int testmod_seq_getter_after_bad(const void *ctx) { struct bpf_iter_testmod_seq it; diff --git a/tools/testing/selftests/bpf/progs/map_kptr_fail.c b/tools/testing/selftests/bpf/progs/map_kptr_fail.c index ee053b24e6ca..8f36e74fd8f9 100644 --- a/tools/testing/selftests/bpf/progs/map_kptr_fail.c +++ b/tools/testing/selftests/bpf/progs/map_kptr_fail.c @@ -364,7 +364,7 @@ int kptr_xchg_ref_state(struct __sk_buff *ctx) } SEC("?tc") -__failure __msg("Possibly NULL pointer passed to helper arg2") +__failure __msg("Possibly NULL pointer passed to helper R2") int kptr_xchg_possibly_null(struct __sk_buff *ctx) { struct prog_test_ref_kfunc *p; diff --git a/tools/testing/selftests/bpf/progs/percpu_alloc_fail.c b/tools/testing/selftests/bpf/progs/percpu_alloc_fail.c index 81813c724fa9..08379c3b6a03 100644 --- a/tools/testing/selftests/bpf/progs/percpu_alloc_fail.c +++ b/tools/testing/selftests/bpf/progs/percpu_alloc_fail.c @@ -110,7 +110,7 @@ int BPF_PROG(test_array_map_3) } SEC("?fentry.s/bpf_fentry_test1") -__failure __msg("arg#0 expected for bpf_percpu_obj_drop()") +__failure __msg("R1 expected for bpf_percpu_obj_drop()") int BPF_PROG(test_array_map_4) { struct val_t __percpu_kptr *p; @@ -124,7 +124,7 @@ int BPF_PROG(test_array_map_4) } SEC("?fentry.s/bpf_fentry_test1") -__failure __msg("arg#0 expected for bpf_obj_drop()") +__failure __msg("R1 expected for bpf_obj_drop()") int BPF_PROG(test_array_map_5) { struct val_t *p; diff --git a/tools/testing/selftests/bpf/progs/rbtree_fail.c b/tools/testing/selftests/bpf/progs/rbtree_fail.c index 70b7baf9304b..555379952dcc 100644 --- a/tools/testing/selftests/bpf/progs/rbtree_fail.c +++ b/tools/testing/selftests/bpf/progs/rbtree_fail.c @@ -134,7 +134,7 @@ unlock_err: } SEC("?tc") -__failure __msg("arg#1 expected pointer to allocated object") +__failure __msg("R2 expected pointer to allocated object") long rbtree_api_add_to_multiple_trees(void *ctx) { struct node_data *n; @@ -153,7 +153,7 @@ long rbtree_api_add_to_multiple_trees(void *ctx) } SEC("?tc") -__failure __msg("Possibly NULL pointer passed to trusted arg1") +__failure __msg("Possibly NULL pointer passed to trusted R2") long rbtree_api_use_unchecked_remove_retval(void *ctx) { struct bpf_rb_node *res; @@ -281,7 +281,7 @@ long add_with_cb(bool (cb)(struct bpf_rb_node *a, const struct bpf_rb_node *b)) } SEC("?tc") -__failure __msg("arg#1 expected pointer to allocated object") +__failure __msg("R2 expected pointer to allocated object") long rbtree_api_add_bad_cb_bad_fn_call_add(void *ctx) { return add_with_cb(less__bad_fn_call_add); diff --git a/tools/testing/selftests/bpf/progs/refcounted_kptr_fail.c b/tools/testing/selftests/bpf/progs/refcounted_kptr_fail.c index b2808bfcec29..7247a20c0a3b 100644 --- a/tools/testing/selftests/bpf/progs/refcounted_kptr_fail.c +++ b/tools/testing/selftests/bpf/progs/refcounted_kptr_fail.c @@ -54,7 +54,7 @@ long rbtree_refcounted_node_ref_escapes(void *ctx) } SEC("?tc") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") long refcount_acquire_maybe_null(void *ctx) { struct node_acquire *n, *m; diff --git a/tools/testing/selftests/bpf/progs/stream_fail.c b/tools/testing/selftests/bpf/progs/stream_fail.c index 8e8249f3521c..21428bb1ee59 100644 --- a/tools/testing/selftests/bpf/progs/stream_fail.c +++ b/tools/testing/selftests/bpf/progs/stream_fail.c @@ -23,7 +23,7 @@ int stream_vprintk_scalar_arg(void *ctx) } SEC("syscall") -__failure __msg("arg#1 doesn't point to a const string") +__failure __msg("R2 doesn't point to a const string") int stream_vprintk_string_arg(void *ctx) { bpf_stream_vprintk(BPF_STDOUT, ctx, NULL, 0); diff --git a/tools/testing/selftests/bpf/progs/task_kfunc_failure.c b/tools/testing/selftests/bpf/progs/task_kfunc_failure.c index 4c07ea193f72..41047d81ec42 100644 --- a/tools/testing/selftests/bpf/progs/task_kfunc_failure.c +++ b/tools/testing/selftests/bpf/progs/task_kfunc_failure.c @@ -28,7 +28,7 @@ static struct __tasks_kfunc_map_value *insert_lookup_task(struct task_struct *ta } SEC("tp_btf/task_newtask") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") int BPF_PROG(task_kfunc_acquire_untrusted, struct task_struct *task, u64 clone_flags) { struct task_struct *acquired; @@ -49,7 +49,7 @@ int BPF_PROG(task_kfunc_acquire_untrusted, struct task_struct *task, u64 clone_f } SEC("tp_btf/task_newtask") -__failure __msg("arg#0 pointer type STRUCT task_struct must point") +__failure __msg("R1 pointer type STRUCT task_struct must point") int BPF_PROG(task_kfunc_acquire_fp, struct task_struct *task, u64 clone_flags) { struct task_struct *acquired, *stack_task = (struct task_struct *)&clone_flags; @@ -100,7 +100,7 @@ int BPF_PROG(task_kfunc_acquire_unsafe_kretprobe_rcu, struct task_struct *task, } SEC("tp_btf/task_newtask") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") int BPF_PROG(task_kfunc_acquire_null, struct task_struct *task, u64 clone_flags) { struct task_struct *acquired; @@ -149,7 +149,7 @@ int BPF_PROG(task_kfunc_xchg_unreleased, struct task_struct *task, u64 clone_fla } SEC("tp_btf/task_newtask") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") int BPF_PROG(task_kfunc_acquire_release_no_null_check, struct task_struct *task, u64 clone_flags) { struct task_struct *acquired; @@ -162,7 +162,7 @@ int BPF_PROG(task_kfunc_acquire_release_no_null_check, struct task_struct *task, } SEC("tp_btf/task_newtask") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") int BPF_PROG(task_kfunc_release_untrusted, struct task_struct *task, u64 clone_flags) { struct __tasks_kfunc_map_value *v; @@ -178,7 +178,7 @@ int BPF_PROG(task_kfunc_release_untrusted, struct task_struct *task, u64 clone_f } SEC("tp_btf/task_newtask") -__failure __msg("arg#0 pointer type STRUCT task_struct must point") +__failure __msg("R1 pointer type STRUCT task_struct must point") int BPF_PROG(task_kfunc_release_fp, struct task_struct *task, u64 clone_flags) { struct task_struct *acquired = (struct task_struct *)&clone_flags; @@ -190,7 +190,7 @@ int BPF_PROG(task_kfunc_release_fp, struct task_struct *task, u64 clone_flags) } SEC("tp_btf/task_newtask") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") int BPF_PROG(task_kfunc_release_null, struct task_struct *task, u64 clone_flags) { struct __tasks_kfunc_map_value local, *v; @@ -234,7 +234,7 @@ int BPF_PROG(task_kfunc_release_unacquired, struct task_struct *task, u64 clone_ } SEC("tp_btf/task_newtask") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") int BPF_PROG(task_kfunc_from_pid_no_null_check, struct task_struct *task, u64 clone_flags) { struct task_struct *acquired; @@ -248,7 +248,7 @@ int BPF_PROG(task_kfunc_from_pid_no_null_check, struct task_struct *task, u64 cl } SEC("tp_btf/task_newtask") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") int BPF_PROG(task_kfunc_from_vpid_no_null_check, struct task_struct *task, u64 clone_flags) { struct task_struct *acquired; diff --git a/tools/testing/selftests/bpf/progs/task_work_fail.c b/tools/testing/selftests/bpf/progs/task_work_fail.c index 82e4b8913333..3186e7b4b24e 100644 --- a/tools/testing/selftests/bpf/progs/task_work_fail.c +++ b/tools/testing/selftests/bpf/progs/task_work_fail.c @@ -58,7 +58,7 @@ int mismatch_map(struct pt_regs *args) } SEC("perf_event") -__failure __msg("arg#1 doesn't point to a map value") +__failure __msg("R2 doesn't point to a map value") int no_map_task_work(struct pt_regs *args) { struct task_struct *task; @@ -70,7 +70,7 @@ int no_map_task_work(struct pt_regs *args) } SEC("perf_event") -__failure __msg("Possibly NULL pointer passed to trusted arg1") +__failure __msg("Possibly NULL pointer passed to trusted R2") int task_work_null(struct pt_regs *args) { struct task_struct *task; @@ -81,7 +81,7 @@ int task_work_null(struct pt_regs *args) } SEC("perf_event") -__failure __msg("Possibly NULL pointer passed to trusted arg2") +__failure __msg("Possibly NULL pointer passed to trusted R3") int map_null(struct pt_regs *args) { struct elem *work; diff --git a/tools/testing/selftests/bpf/progs/test_bpf_nf_fail.c b/tools/testing/selftests/bpf/progs/test_bpf_nf_fail.c index 2c156cd166af..332cda89caba 100644 --- a/tools/testing/selftests/bpf/progs/test_bpf_nf_fail.c +++ b/tools/testing/selftests/bpf/progs/test_bpf_nf_fail.c @@ -152,7 +152,7 @@ int change_status_after_alloc(struct __sk_buff *ctx) } SEC("?tc") -__failure __msg("Possibly NULL pointer passed to trusted arg1") +__failure __msg("Possibly NULL pointer passed to trusted R2") int lookup_null_bpf_tuple(struct __sk_buff *ctx) { struct bpf_ct_opts___local opts = {}; @@ -165,7 +165,7 @@ int lookup_null_bpf_tuple(struct __sk_buff *ctx) } SEC("?tc") -__failure __msg("Possibly NULL pointer passed to trusted arg3") +__failure __msg("Possibly NULL pointer passed to trusted R4") int lookup_null_bpf_opts(struct __sk_buff *ctx) { struct bpf_sock_tuple tup = {}; @@ -178,7 +178,7 @@ int lookup_null_bpf_opts(struct __sk_buff *ctx) } SEC("?xdp") -__failure __msg("Possibly NULL pointer passed to trusted arg1") +__failure __msg("Possibly NULL pointer passed to trusted R2") int xdp_lookup_null_bpf_tuple(struct xdp_md *ctx) { struct bpf_ct_opts___local opts = {}; @@ -191,7 +191,7 @@ int xdp_lookup_null_bpf_tuple(struct xdp_md *ctx) } SEC("?xdp") -__failure __msg("Possibly NULL pointer passed to trusted arg3") +__failure __msg("Possibly NULL pointer passed to trusted R4") int xdp_lookup_null_bpf_opts(struct xdp_md *ctx) { struct bpf_sock_tuple tup = {}; diff --git a/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c b/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c index 1c6cfd0888ba..bf48fc43c7ab 100644 --- a/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c +++ b/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c @@ -40,7 +40,7 @@ int BPF_PROG(not_valid_dynptr, int cmd, union bpf_attr *attr, unsigned int size, } SEC("?lsm.s/bpf") -__failure __msg("arg#0 expected pointer to stack or const struct bpf_dynptr") +__failure __msg("R1 expected pointer to stack or const struct bpf_dynptr") int BPF_PROG(not_ptr_to_stack, int cmd, union bpf_attr *attr, unsigned int size, bool kernel) { static struct bpf_dynptr val; diff --git a/tools/testing/selftests/bpf/progs/test_kfunc_param_nullable.c b/tools/testing/selftests/bpf/progs/test_kfunc_param_nullable.c index 967081bbcfe1..ca35b92ea095 100644 --- a/tools/testing/selftests/bpf/progs/test_kfunc_param_nullable.c +++ b/tools/testing/selftests/bpf/progs/test_kfunc_param_nullable.c @@ -29,7 +29,7 @@ int kfunc_dynptr_nullable_test2(struct __sk_buff *skb) } SEC("tc") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") int kfunc_dynptr_nullable_test3(struct __sk_buff *skb) { struct bpf_dynptr data; diff --git a/tools/testing/selftests/bpf/progs/verifier_bits_iter.c b/tools/testing/selftests/bpf/progs/verifier_bits_iter.c index 8bcddadfc4da..dd97f2027505 100644 --- a/tools/testing/selftests/bpf/progs/verifier_bits_iter.c +++ b/tools/testing/selftests/bpf/progs/verifier_bits_iter.c @@ -32,7 +32,7 @@ int BPF_PROG(no_destroy, struct bpf_iter_meta *meta, struct cgroup *cgrp) SEC("iter/cgroup") __description("uninitialized iter in ->next()") -__failure __msg("expected an initialized iter_bits as arg #0") +__failure __msg("expected an initialized iter_bits as R1") int BPF_PROG(next_uninit, struct bpf_iter_meta *meta, struct cgroup *cgrp) { struct bpf_iter_bits it = {}; @@ -43,7 +43,7 @@ int BPF_PROG(next_uninit, struct bpf_iter_meta *meta, struct cgroup *cgrp) SEC("iter/cgroup") __description("uninitialized iter in ->destroy()") -__failure __msg("expected an initialized iter_bits as arg #0") +__failure __msg("expected an initialized iter_bits as R1") int BPF_PROG(destroy_uninit, struct bpf_iter_meta *meta, struct cgroup *cgrp) { struct bpf_iter_bits it = {}; diff --git a/tools/testing/selftests/bpf/progs/verifier_ref_tracking.c b/tools/testing/selftests/bpf/progs/verifier_ref_tracking.c index 910365201f68..139f70bb3595 100644 --- a/tools/testing/selftests/bpf/progs/verifier_ref_tracking.c +++ b/tools/testing/selftests/bpf/progs/verifier_ref_tracking.c @@ -263,7 +263,7 @@ l0_%=: r0 = 0; \ SEC("lsm.s/bpf") __description("reference tracking: release user key reference without check") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") __naked void user_key_reference_without_check(void) { asm volatile (" \ @@ -282,7 +282,7 @@ __naked void user_key_reference_without_check(void) SEC("lsm.s/bpf") __description("reference tracking: release system key reference without check") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") __naked void system_key_reference_without_check(void) { asm volatile (" \ @@ -300,7 +300,7 @@ __naked void system_key_reference_without_check(void) SEC("lsm.s/bpf") __description("reference tracking: release with NULL key pointer") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") __naked void release_with_null_key_pointer(void) { asm volatile (" \ diff --git a/tools/testing/selftests/bpf/progs/verifier_vfs_reject.c b/tools/testing/selftests/bpf/progs/verifier_vfs_reject.c index 4b392c6c8fc4..0990de076844 100644 --- a/tools/testing/selftests/bpf/progs/verifier_vfs_reject.c +++ b/tools/testing/selftests/bpf/progs/verifier_vfs_reject.c @@ -13,7 +13,7 @@ static char buf[PATH_MAX]; SEC("lsm.s/file_open") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") int BPF_PROG(get_task_exe_file_kfunc_null) { struct file *acquired; @@ -28,7 +28,7 @@ int BPF_PROG(get_task_exe_file_kfunc_null) } SEC("lsm.s/inode_getxattr") -__failure __msg("arg#0 pointer type STRUCT task_struct must point to scalar, or struct with scalar") +__failure __msg("R1 pointer type STRUCT task_struct must point to scalar, or struct with scalar") int BPF_PROG(get_task_exe_file_kfunc_fp) { u64 x; @@ -89,7 +89,7 @@ int BPF_PROG(put_file_kfunc_unacquired, struct file *file) } SEC("lsm.s/file_open") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") int BPF_PROG(path_d_path_kfunc_null) { /* Can't pass NULL value to bpf_path_d_path() kfunc. */ @@ -128,7 +128,7 @@ int BPF_PROG(path_d_path_kfunc_untrusted_from_current) } SEC("lsm.s/file_open") -__failure __msg("kernel function bpf_path_d_path args#0 expected pointer to STRUCT path but R1 has a pointer to STRUCT file") +__failure __msg("kernel function bpf_path_d_path R1 expected pointer to STRUCT path but R1 has a pointer to STRUCT file") int BPF_PROG(path_d_path_kfunc_type_mismatch, struct file *file) { bpf_path_d_path((struct path *)&file->f_task_work, buf, sizeof(buf)); diff --git a/tools/testing/selftests/bpf/progs/wq_failures.c b/tools/testing/selftests/bpf/progs/wq_failures.c index 3767f5595bbc..32dc8827e128 100644 --- a/tools/testing/selftests/bpf/progs/wq_failures.c +++ b/tools/testing/selftests/bpf/progs/wq_failures.c @@ -98,7 +98,7 @@ __failure * is a correct bpf_wq pointer. */ __msg(": (85) call bpf_wq_set_callback#") /* anchor message */ -__msg("arg#0 doesn't point to a map value") +__msg("R1 doesn't point to a map value") long test_wrong_wq_pointer(void *ctx) { int key = 0; diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c index c3164b9b2be5..0bb4337552c8 100644 --- a/tools/testing/selftests/bpf/verifier/calls.c +++ b/tools/testing/selftests/bpf/verifier/calls.c @@ -31,7 +31,7 @@ }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = REJECT, - .errstr = "arg#0 pointer type STRUCT prog_test_fail1 must point to scalar", + .errstr = "R1 pointer type STRUCT prog_test_fail1 must point to scalar", .fixup_kfunc_btf_id = { { "bpf_kfunc_call_test_fail1", 2 }, }, @@ -46,7 +46,7 @@ }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = REJECT, - .errstr = "max struct nesting depth exceeded\narg#0 pointer type STRUCT prog_test_fail2", + .errstr = "max struct nesting depth exceeded\nR1 pointer type STRUCT prog_test_fail2", .fixup_kfunc_btf_id = { { "bpf_kfunc_call_test_fail2", 2 }, }, @@ -61,7 +61,7 @@ }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = REJECT, - .errstr = "arg#0 pointer type STRUCT prog_test_fail3 must point to scalar", + .errstr = "R1 pointer type STRUCT prog_test_fail3 must point to scalar", .fixup_kfunc_btf_id = { { "bpf_kfunc_call_test_fail3", 2 }, }, @@ -76,7 +76,7 @@ }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = REJECT, - .errstr = "arg#0 expected pointer to ctx, but got fp", + .errstr = "R1 expected pointer to ctx, but got fp", .fixup_kfunc_btf_id = { { "bpf_kfunc_call_test_pass_ctx", 2 }, }, @@ -91,7 +91,7 @@ }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = REJECT, - .errstr = "arg#0 pointer type UNKNOWN must point to scalar", + .errstr = "R1 pointer type UNKNOWN must point to scalar", .fixup_kfunc_btf_id = { { "bpf_kfunc_call_test_mem_len_fail1", 2 }, }, @@ -109,7 +109,7 @@ }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = REJECT, - .errstr = "Possibly NULL pointer passed to trusted arg0", + .errstr = "Possibly NULL pointer passed to trusted R1", .fixup_kfunc_btf_id = { { "bpf_kfunc_call_test_acquire", 3 }, { "bpf_kfunc_call_test_release", 5 }, @@ -152,7 +152,7 @@ }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = REJECT, - .errstr = "kernel function bpf_kfunc_call_memb1_release args#0 expected pointer", + .errstr = "kernel function bpf_kfunc_call_memb1_release R1 expected pointer", .fixup_kfunc_btf_id = { { "bpf_kfunc_call_memb_acquire", 1 }, { "bpf_kfunc_call_memb1_release", 5 }, -- cgit v1.2.3 From 246ad6e5ee259669692bdb7fb353e8c5d5bba628 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 22 Apr 2026 20:35:06 -0700 Subject: bpf: Introduce bpf register BPF_REG_PARAMS Introduce BPF_REG_PARAMS as a dedicated BPF register for stack argument accesses. It occupies the BPF register number 11 (R11), which is used as the base pointer for the stack argument area, keeping it separate from the R10-based (BPF_REG_FP) program stack. The kernel-internal hidden register BPF_REG_AX previously occupied slot 11 (MAX_BPF_REG). With BPF_REG_PARAMS taking that slot, BPF_REG_AX moves to slot 12 and MAX_BPF_EXT_REG increases accordingly. Acked-by: Puranjay Mohan Acked-by: Kumar Kartikeya Dwivedi Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260423033506.2542005-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 5 +- kernel/bpf/core.c | 4 +- .../testing/selftests/bpf/prog_tests/ctx_rewrite.c | 14 ++--- .../selftests/bpf/progs/verifier_bpf_fastcall.c | 24 ++++---- .../selftests/bpf/progs/verifier_may_goto_1.c | 12 ++-- tools/testing/selftests/bpf/progs/verifier_sdiv.c | 64 +++++++++++----------- 6 files changed, 62 insertions(+), 61 deletions(-) (limited to 'kernel/bpf') diff --git a/include/linux/filter.h b/include/linux/filter.h index 1ec6d5ba64cc..b77d0b06db6e 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -58,8 +58,9 @@ struct ctl_table_header; #define BPF_REG_H BPF_REG_9 /* hlen, callee-saved */ /* Kernel hidden auxiliary/helper register. */ -#define BPF_REG_AX MAX_BPF_REG -#define MAX_BPF_EXT_REG (MAX_BPF_REG + 1) +#define BPF_REG_PARAMS MAX_BPF_REG +#define BPF_REG_AX (MAX_BPF_REG + 1) +#define MAX_BPF_EXT_REG (MAX_BPF_REG + 2) #define MAX_BPF_JIT_REG MAX_BPF_EXT_REG /* unused opcode to mark special call to bpf_tail_call() helper */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 8b018ff48875..ae10b9ca018d 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1299,8 +1299,8 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from, u32 imm_rnd = get_random_u32(); s16 off; - BUILD_BUG_ON(BPF_REG_AX + 1 != MAX_BPF_JIT_REG); - BUILD_BUG_ON(MAX_BPF_REG + 1 != MAX_BPF_JIT_REG); + BUILD_BUG_ON(BPF_REG_PARAMS + 2 != MAX_BPF_JIT_REG); + BUILD_BUG_ON(BPF_REG_AX + 1 != MAX_BPF_JIT_REG); /* Constraints on AX register: * diff --git a/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c b/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c index 5064aeb8fe67..2c3124092b73 100644 --- a/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c +++ b/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c @@ -69,19 +69,19 @@ static struct test_case test_cases[] = { #if defined(__x86_64__) || defined(__aarch64__) { N(SCHED_CLS, struct __sk_buff, tstamp), - .read = "r11 = *(u8 *)($ctx + sk_buff::__mono_tc_offset);" - "if w11 & 0x4 goto pc+1;" + .read = "r12 = *(u8 *)($ctx + sk_buff::__mono_tc_offset);" + "if w12 & 0x4 goto pc+1;" "goto pc+4;" - "if w11 & 0x3 goto pc+1;" + "if w12 & 0x3 goto pc+1;" "goto pc+2;" "$dst = 0;" "goto pc+1;" "$dst = *(u64 *)($ctx + sk_buff::tstamp);", - .write = "r11 = *(u8 *)($ctx + sk_buff::__mono_tc_offset);" - "if w11 & 0x4 goto pc+1;" + .write = "r12 = *(u8 *)($ctx + sk_buff::__mono_tc_offset);" + "if w12 & 0x4 goto pc+1;" "goto pc+2;" - "w11 &= -4;" - "*(u8 *)($ctx + sk_buff::__mono_tc_offset) = r11;" + "w12 &= -4;" + "*(u8 *)($ctx + sk_buff::__mono_tc_offset) = r12;" "*(u64 *)($ctx + sk_buff::tstamp) = $src;", }, #endif diff --git a/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c b/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c index fb4fa465d67c..0d9e167555b5 100644 --- a/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c +++ b/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c @@ -630,13 +630,13 @@ __xlated("...") __xlated("4: r0 = &(void __percpu *)(r0)") __xlated("...") /* may_goto expansion starts */ -__xlated("6: r11 = *(u64 *)(r10 -24)") -__xlated("7: if r11 == 0x0 goto pc+6") -__xlated("8: r11 -= 1") -__xlated("9: if r11 != 0x0 goto pc+2") -__xlated("10: r11 = -24") +__xlated("6: r12 = *(u64 *)(r10 -24)") +__xlated("7: if r12 == 0x0 goto pc+6") +__xlated("8: r12 -= 1") +__xlated("9: if r12 != 0x0 goto pc+2") +__xlated("10: r12 = -24") __xlated("11: call unknown") -__xlated("12: *(u64 *)(r10 -24) = r11") +__xlated("12: *(u64 *)(r10 -24) = r12") /* may_goto expansion ends */ __xlated("13: *(u64 *)(r10 -8) = r1") __xlated("14: exit") @@ -668,13 +668,13 @@ __xlated("1: *(u64 *)(r10 -16) =") __xlated("2: r1 = 1") __xlated("3: call bpf_get_smp_processor_id") /* may_goto expansion starts */ -__xlated("4: r11 = *(u64 *)(r10 -24)") -__xlated("5: if r11 == 0x0 goto pc+6") -__xlated("6: r11 -= 1") -__xlated("7: if r11 != 0x0 goto pc+2") -__xlated("8: r11 = -24") +__xlated("4: r12 = *(u64 *)(r10 -24)") +__xlated("5: if r12 == 0x0 goto pc+6") +__xlated("6: r12 -= 1") +__xlated("7: if r12 != 0x0 goto pc+2") +__xlated("8: r12 = -24") __xlated("9: call unknown") -__xlated("10: *(u64 *)(r10 -24) = r11") +__xlated("10: *(u64 *)(r10 -24) = r12") /* may_goto expansion ends */ __xlated("11: *(u64 *)(r10 -8) = r1") __xlated("12: exit") diff --git a/tools/testing/selftests/bpf/progs/verifier_may_goto_1.c b/tools/testing/selftests/bpf/progs/verifier_may_goto_1.c index 6d1edaef9213..4bdf4256a41e 100644 --- a/tools/testing/selftests/bpf/progs/verifier_may_goto_1.c +++ b/tools/testing/selftests/bpf/progs/verifier_may_goto_1.c @@ -81,13 +81,13 @@ __arch_s390x __arch_arm64 __xlated("0: *(u64 *)(r10 -16) = 65535") __xlated("1: *(u64 *)(r10 -8) = 0") -__xlated("2: r11 = *(u64 *)(r10 -16)") -__xlated("3: if r11 == 0x0 goto pc+6") -__xlated("4: r11 -= 1") -__xlated("5: if r11 != 0x0 goto pc+2") -__xlated("6: r11 = -16") +__xlated("2: r12 = *(u64 *)(r10 -16)") +__xlated("3: if r12 == 0x0 goto pc+6") +__xlated("4: r12 -= 1") +__xlated("5: if r12 != 0x0 goto pc+2") +__xlated("6: r12 = -16") __xlated("7: call unknown") -__xlated("8: *(u64 *)(r10 -16) = r11") +__xlated("8: *(u64 *)(r10 -16) = r12") __xlated("9: r0 = 1") __xlated("10: r0 = 2") __xlated("11: exit") diff --git a/tools/testing/selftests/bpf/progs/verifier_sdiv.c b/tools/testing/selftests/bpf/progs/verifier_sdiv.c index fd59d57e8e37..95f3239ce228 100644 --- a/tools/testing/selftests/bpf/progs/verifier_sdiv.c +++ b/tools/testing/selftests/bpf/progs/verifier_sdiv.c @@ -778,10 +778,10 @@ __arch_x86_64 __xlated("0: r2 = 0x8000000000000000") __xlated("2: r3 = -1") __xlated("3: r4 = r2") -__xlated("4: r11 = r3") -__xlated("5: r11 += 1") -__xlated("6: if r11 > 0x1 goto pc+4") -__xlated("7: if r11 == 0x0 goto pc+1") +__xlated("4: r12 = r3") +__xlated("5: r12 += 1") +__xlated("6: if r12 > 0x1 goto pc+4") +__xlated("7: if r12 == 0x0 goto pc+1") __xlated("8: r2 = 0") __xlated("9: r2 = -r2") __xlated("10: goto pc+1") @@ -812,10 +812,10 @@ __success __retval(-5) __arch_x86_64 __xlated("0: r2 = 5") __xlated("1: r3 = -1") -__xlated("2: r11 = r3") -__xlated("3: r11 += 1") -__xlated("4: if r11 > 0x1 goto pc+4") -__xlated("5: if r11 == 0x0 goto pc+1") +__xlated("2: r12 = r3") +__xlated("3: r12 += 1") +__xlated("4: if r12 > 0x1 goto pc+4") +__xlated("5: if r12 == 0x0 goto pc+1") __xlated("6: r2 = 0") __xlated("7: r2 = -r2") __xlated("8: goto pc+1") @@ -890,10 +890,10 @@ __arch_x86_64 __xlated("0: w2 = -2147483648") __xlated("1: w3 = -1") __xlated("2: w4 = w2") -__xlated("3: r11 = r3") -__xlated("4: w11 += 1") -__xlated("5: if w11 > 0x1 goto pc+4") -__xlated("6: if w11 == 0x0 goto pc+1") +__xlated("3: r12 = r3") +__xlated("4: w12 += 1") +__xlated("5: if w12 > 0x1 goto pc+4") +__xlated("6: if w12 == 0x0 goto pc+1") __xlated("7: w2 = 0") __xlated("8: w2 = -w2") __xlated("9: goto pc+1") @@ -925,10 +925,10 @@ __arch_x86_64 __xlated("0: w2 = -5") __xlated("1: w3 = -1") __xlated("2: w4 = w2") -__xlated("3: r11 = r3") -__xlated("4: w11 += 1") -__xlated("5: if w11 > 0x1 goto pc+4") -__xlated("6: if w11 == 0x0 goto pc+1") +__xlated("3: r12 = r3") +__xlated("4: w12 += 1") +__xlated("5: if w12 > 0x1 goto pc+4") +__xlated("6: if w12 == 0x0 goto pc+1") __xlated("7: w2 = 0") __xlated("8: w2 = -w2") __xlated("9: goto pc+1") @@ -1004,10 +1004,10 @@ __arch_x86_64 __xlated("0: r2 = 0x8000000000000000") __xlated("2: r3 = -1") __xlated("3: r4 = r2") -__xlated("4: r11 = r3") -__xlated("5: r11 += 1") -__xlated("6: if r11 > 0x1 goto pc+3") -__xlated("7: if r11 == 0x1 goto pc+3") +__xlated("4: r12 = r3") +__xlated("5: r12 += 1") +__xlated("6: if r12 > 0x1 goto pc+3") +__xlated("7: if r12 == 0x1 goto pc+3") __xlated("8: w2 = 0") __xlated("9: goto pc+1") __xlated("10: r2 s%= r3") @@ -1034,10 +1034,10 @@ __arch_x86_64 __xlated("0: r2 = 5") __xlated("1: r3 = -1") __xlated("2: r4 = r2") -__xlated("3: r11 = r3") -__xlated("4: r11 += 1") -__xlated("5: if r11 > 0x1 goto pc+3") -__xlated("6: if r11 == 0x1 goto pc+3") +__xlated("3: r12 = r3") +__xlated("4: r12 += 1") +__xlated("5: if r12 > 0x1 goto pc+3") +__xlated("6: if r12 == 0x1 goto pc+3") __xlated("7: w2 = 0") __xlated("8: goto pc+1") __xlated("9: r2 s%= r3") @@ -1108,10 +1108,10 @@ __arch_x86_64 __xlated("0: w2 = -2147483648") __xlated("1: w3 = -1") __xlated("2: w4 = w2") -__xlated("3: r11 = r3") -__xlated("4: w11 += 1") -__xlated("5: if w11 > 0x1 goto pc+3") -__xlated("6: if w11 == 0x1 goto pc+4") +__xlated("3: r12 = r3") +__xlated("4: w12 += 1") +__xlated("5: if w12 > 0x1 goto pc+3") +__xlated("6: if w12 == 0x1 goto pc+4") __xlated("7: w2 = 0") __xlated("8: goto pc+1") __xlated("9: w2 s%= w3") @@ -1140,10 +1140,10 @@ __arch_x86_64 __xlated("0: w2 = -5") __xlated("1: w3 = -1") __xlated("2: w4 = w2") -__xlated("3: r11 = r3") -__xlated("4: w11 += 1") -__xlated("5: if w11 > 0x1 goto pc+3") -__xlated("6: if w11 == 0x1 goto pc+4") +__xlated("3: r12 = r3") +__xlated("4: w12 += 1") +__xlated("5: if w12 > 0x1 goto pc+3") +__xlated("6: if w12 == 0x1 goto pc+4") __xlated("7: w2 = 0") __xlated("8: goto pc+1") __xlated("9: w2 s%= w3") -- cgit v1.2.3 From 256f0071f9b61ae5028f749449fd3fdad015889d Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 24 Apr 2026 15:52:42 -0700 Subject: bpf: representation and basic operations on circular numbers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit adds basic definitions for cnum32/cnum64. This is a unified numeric range representation for signed and unsigned domains. Inspired by an old post from Shung-Hsi Yu [1] and paper [2]. Operations correctness is verified using cbmc model checker, tests source code can be found in a separate repo [3]. The cnum64_cnum32_intersect() function is notable, because it handled several cases verifier.c:deduce_bounds_64_from_32() does not. Given: - a is a 64-bit range - b is a 32-bit range - t is a refined 64-bit range, such that ∀ v ∈ a, (u32)v ∈ b: v ∈ t. cnum64_cnum32_intersect() makes the following deductions: (A): 'b' is a sub-range of the first or the last 32-bit sub-range of 'a': 64-bit number axis ---> N*2^32 (N+1)*2^32 (N+2)*2^32 (N+3)*2^32 ||------|---|=====|-------||----------|=====|-------||----------|=====|----|--|| | |< b >| |< b >| |< b >| | | | | | |<--+--------------------------- a ---------------------------+--->| | | |<-------------------------- t -------------------------->| (B) 'b' does not intersect with the first of the last 32-bit sub-range of 'a': N*2^32 (N+1)*2^32 (N+2)*2^32 (N+3)*2^32 ||--|=====|----|----------||--|=====|---------------||--|=====|------------|--|| |< b >| | |< b >| |< b >| | | | | | |<-------------+--------- a -------------------|----------->| | | |<-------- t ------------------>| (C) 'b' crosses 0/U32_MAX boundary: N*2^32 (N+1)*2^32 (N+2)*2^32 (N+3)*2^32 ||===|---------|------|===||===|----------------|===||===|---------|------|===|| |b >| | |< b||b >| |< b||b >| | |< b| | | | | |<-----+----------------- a --------------+-------->| | | |<---------------- t ------------->| Current implementation of deduce_bounds_64_from_32() only handles case (A). [1] https://lore.kernel.org/all/ZTZxoDJJbX9mrQ9w@u94a/ [2] https://jorgenavas.github.io/papers/ACM-TOPLAS-wrapped.pdf [3] https://github.com/eddyz87/cnum-verif/tree/master Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260424-cnums-everywhere-rfc-v1-v3-1-ca434b39a486@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/cnum.h | 80 +++++++++++++++++ kernel/bpf/Makefile | 2 +- kernel/bpf/cnum.c | 120 ++++++++++++++++++++++++++ kernel/bpf/cnum_defs.h | 230 +++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 431 insertions(+), 1 deletion(-) create mode 100644 include/linux/cnum.h create mode 100644 kernel/bpf/cnum.c create mode 100644 kernel/bpf/cnum_defs.h (limited to 'kernel/bpf') diff --git a/include/linux/cnum.h b/include/linux/cnum.h new file mode 100644 index 000000000000..a7259b105b45 --- /dev/null +++ b/include/linux/cnum.h @@ -0,0 +1,80 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ + +#ifndef _LINUX_CNUM_H +#define _LINUX_CNUM_H + +#include + +/* + * cnum32: a circular number. + * A unified representation for signed and unsigned ranges. + * + * Assume that a 32-bit range is a circle, with 0 being in the 12 o'clock + * position, numbers placed sequentially in clockwise order and U32_MAX + * in the 11 o'clock position. Signed values map onto the same circle: + * S32_MAX sits at 5 o'clock, S32_MIN sits at 6 o'clock (opposite 0), + * negative values occupy the left half and positive values the right half. + * + * @cnum32 represents an arc on this circle drawn clockwise. + * @base corresponds to the first value of the range. + * @size corresponds to the number of integers in the range excluding @base. + * (The @base is excluded to avoid integer overflow when representing the full + * 0..U32_MAX range, which corresponds to 2^32, which can't be stored in u32). + * + * For example: {U32_MAX, 1} corresponds to signed range [-1, 0], + * {S32_MAX, 1} corresponds to unsigned range [S32_MAX, S32_MIN]. + */ +struct cnum32 { + u32 base; + u32 size; +}; + +#define CNUM32_UNBOUNDED ((struct cnum32){ .base = 0, .size = U32_MAX }) +#define CNUM32_EMPTY ((struct cnum32){ .base = U32_MAX, .size = U32_MAX }) + +struct cnum32 cnum32_from_urange(u32 min, u32 max); +struct cnum32 cnum32_from_srange(s32 min, s32 max); +u32 cnum32_umin(struct cnum32 cnum); +u32 cnum32_umax(struct cnum32 cnum); +s32 cnum32_smin(struct cnum32 cnum); +s32 cnum32_smax(struct cnum32 cnum); +struct cnum32 cnum32_intersect(struct cnum32 a, struct cnum32 b); +void cnum32_intersect_with(struct cnum32 *dst, struct cnum32 src); +void cnum32_intersect_with_urange(struct cnum32 *dst, u32 min, u32 max); +void cnum32_intersect_with_srange(struct cnum32 *dst, s32 min, s32 max); +bool cnum32_contains(struct cnum32 cnum, u32 v); +bool cnum32_is_const(struct cnum32 cnum); +bool cnum32_is_empty(struct cnum32 cnum); +struct cnum32 cnum32_add(struct cnum32 a, struct cnum32 b); +struct cnum32 cnum32_negate(struct cnum32 a); + +/* Same as cnum32 but for 64-bit ranges */ +struct cnum64 { + u64 base; + u64 size; +}; + +#define CNUM64_UNBOUNDED ((struct cnum64){ .base = 0, .size = U64_MAX }) +#define CNUM64_EMPTY ((struct cnum64){ .base = U64_MAX, .size = U64_MAX }) + +struct cnum64 cnum64_from_urange(u64 min, u64 max); +struct cnum64 cnum64_from_srange(s64 min, s64 max); +u64 cnum64_umin(struct cnum64 cnum); +u64 cnum64_umax(struct cnum64 cnum); +s64 cnum64_smin(struct cnum64 cnum); +s64 cnum64_smax(struct cnum64 cnum); +struct cnum64 cnum64_intersect(struct cnum64 a, struct cnum64 b); +void cnum64_intersect_with(struct cnum64 *dst, struct cnum64 src); +void cnum64_intersect_with_urange(struct cnum64 *dst, u64 min, u64 max); +void cnum64_intersect_with_srange(struct cnum64 *dst, s64 min, s64 max); +bool cnum64_contains(struct cnum64 cnum, u64 v); +bool cnum64_is_const(struct cnum64 cnum); +bool cnum64_is_empty(struct cnum64 cnum); +struct cnum64 cnum64_add(struct cnum64 a, struct cnum64 b); +struct cnum64 cnum64_negate(struct cnum64 a); + +struct cnum32 cnum32_from_cnum64(struct cnum64 cnum); +struct cnum64 cnum64_cnum32_intersect(struct cnum64 a, struct cnum32 b); + +#endif /* _LINUX_CNUM_H */ diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 399007b67a92..4dc41bf5780c 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -6,7 +6,7 @@ cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse endif CFLAGS_core.o += -Wno-override-init $(cflags-nogcse-yy) -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o liveness.o const_fold.o +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o cnum.o log.o token.o liveness.o const_fold.o obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o bpf_insn_array.o diff --git a/kernel/bpf/cnum.c b/kernel/bpf/cnum.c new file mode 100644 index 000000000000..86142cb2aee5 --- /dev/null +++ b/kernel/bpf/cnum.c @@ -0,0 +1,120 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ + +#include + +#define T 32 +#include "cnum_defs.h" +#undef T + +#define T 64 +#include "cnum_defs.h" +#undef T + +struct cnum32 cnum32_from_cnum64(struct cnum64 cnum) +{ + if (cnum64_is_empty(cnum)) + return CNUM32_EMPTY; + + if (cnum.size >= U32_MAX) + return (struct cnum32){ .base = 0, .size = U32_MAX }; + else + return (struct cnum32){ .base = (u32)cnum.base, .size = cnum.size }; +} + +/* + * Suppose 'a' and 'b' are laid out as follows: + * + * 64-bit number axis ---> + * + * N*2^32 (N+1)*2^32 (N+2)*2^32 (N+3)*2^32 + * ||------|---|=====|-------||----------|=====|-------||----------|=====|----|--|| + * | |< b >| |< b >| |< b >| | + * | | | | + * |<--+--------------------------- a ---------------------------+--->| + * | | + * |<-------------------------- t -------------------------->| + * + * In such a case it is possible to infer a more tight representation t + * such that ∀ v ∈ a, (u32)v ∈ b: v ∈ t. + */ +struct cnum64 cnum64_cnum32_intersect(struct cnum64 a, struct cnum32 b) +{ + /* + * To simplify reasoning, rotate the circles so that [virtual] a1 starts + * at u32 boundary, b1 represents b in this new frame of reference. + */ + struct cnum32 b1 = { b.base - (u32)a.base, b.size }; + struct cnum64 t = a; + u64 d, b1_max; + + if (cnum64_is_empty(a) || cnum32_is_empty(b)) + return CNUM64_EMPTY; + + if (cnum32_urange_overflow(b1)) { + b1_max = (u32)b1.base + (u32)b1.size; /* overflow here is fine and necessary */ + if ((u32)a.size > b1_max && (u32)a.size < b1.base) { + /* + * N*2^32 (N+1)*2^32 + * ||=====|------------|=====||=====|---------|---|=====|| + * |b1 ->| |<- b1||b1 ->| | |<- b1| + * |<----------------- a1 ------------------>| + * |<-------------- t ------------>|<-- d -->| (after adjustment) + * ^ + * b1_max + */ + d = (u32)a.size - b1_max; + t.size -= d; + } else { + /* + * No adjustments possible in the following cases: + * + * ||=====|------------|=====||===|=|-------------|=|===|| + * |b1 ->| |<- b1||b1 +>| |<+ b1| + * |<----------------- a1 ------>| | + * |<----------------- (or) a1 ------------------->| + */ + } + } else { + if (t.size < b1.base) + /* + * N*2^32 (N+1)*2^32 + * ||----------|--|=======|--||------> + * |<-- a1 -->| |<- b ->| + */ + return CNUM64_EMPTY; + /* + * N*2^32 (N+1)*2^32 + * ||-------------|========|-||-----| -------|========|-|| + * | |<- b1 ->| | |<- b1 ->| + * |<------------+ a1 ------------>| + * |<------ t ------>| (after adjustment) + */ + t.base += b1.base; + t.size -= b1.base; + b1_max = b1.base + b1.size; + d = 0; + if ((u32)a.size < b1.base) + /* + * N*2^32 (N+1)*2^32 + * ||-------------|========|-||------|-------|========|-|| + * | |<- b1 ->| | |<- b1 ->| + * |<------------+-- a1 --+-------->| + * |<- t ->|<-- d -->| (after adjustment) + */ + d = (u32)a.size + (BIT_ULL(32) - b1_max); + else if ((u32)a.size >= b1_max) + /* + * N*2^32 (N+1)*2^32 + * ||--|========|------------||--|========|-------|-----|| + * | |<- b1 ->| |<- b1 ->| | + * |<-+------------------ a1 ------------+------>| + * |<-------------- t --------------->|<- d ->| (after adjustment) + */ + d = (u32)a.size - b1_max; + if (t.size < d) + return CNUM64_EMPTY; + t.size -= d; + } + return t; +} diff --git a/kernel/bpf/cnum_defs.h b/kernel/bpf/cnum_defs.h new file mode 100644 index 000000000000..3ebd8f723dbb --- /dev/null +++ b/kernel/bpf/cnum_defs.h @@ -0,0 +1,230 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ + +#ifndef T +#error "Define T (bit width: 32, 64) before including cnum_defs.h" +#endif + +#include +#include +#include +#include + +#define cnum_t __PASTE(cnum, T) +#define ut __PASTE(u, T) +#define st __PASTE(s, T) +#define UT_MAX __PASTE(__PASTE(U, T), _MAX) +#define ST_MAX __PASTE(__PASTE(S, T), _MAX) +#define ST_MIN __PASTE(__PASTE(S, T), _MIN) +#define EMPTY __PASTE(__PASTE(CNUM, T), _EMPTY) +#define FN(name) __PASTE(__PASTE(cnum, T), __PASTE(_, name)) + +struct cnum_t FN(from_urange)(ut min, ut max) +{ + return (struct cnum_t){ .base = min, .size = (ut)max - min }; +} + +struct cnum_t FN(from_srange)(st min, st max) +{ + ut size = (ut)max - (ut)min; + ut base = size == UT_MAX ? 0 : (ut)min; + + return (struct cnum_t){ .base = base, .size = size }; +} + +/* True if this cnum represents two unsigned ranges. */ +static inline bool FN(urange_overflow)(struct cnum_t cnum) +{ + /* Same as cnum.base + cnum.size > UT_MAX but avoids overflow */ + return cnum.size > UT_MAX - (ut)cnum.base; +} + +/* + * cnum{T}_umin / cnum{T}_umax query an unsigned range represented by this cnum. + * If cnum represents a range crossing the UT_MAX/0 boundary, the unbound range + * [0..UT_MAX] is returned. + */ +ut FN(umin)(struct cnum_t cnum) +{ + return FN(urange_overflow)(cnum) ? 0 : cnum.base; +} + +ut FN(umax)(struct cnum_t cnum) +{ + return FN(urange_overflow)(cnum) ? UT_MAX : cnum.base + cnum.size; +} + +/* True if this cnum represents two signed ranges. */ +static inline bool FN(srange_overflow)(struct cnum_t cnum) +{ + return FN(contains)(cnum, (ut)ST_MAX) && FN(contains)(cnum, (ut)ST_MIN); +} + +/* + * cnum{T}_smin / cnum{T}_smax query a signed range represented by this cnum. + * If cnum represents a range crossing the ST_MAX/ST_MIN boundary, the unbound range + * [ST_MIN..ST_MAX] is returned. + */ +st FN(smin)(struct cnum_t cnum) +{ + return FN(srange_overflow)(cnum) + ? ST_MIN + : min((st)cnum.base, (st)(cnum.base + cnum.size)); +} + +st FN(smax)(struct cnum_t cnum) +{ + return FN(srange_overflow)(cnum) + ? ST_MAX + : max((st)cnum.base, (st)(cnum.base + cnum.size)); +} + +/* + * Returns a possibly empty intersection of cnums 'a' and 'b'. + * If 'a' and 'b' intersect in two sub-arcs, the function over-approximates + * and returns either 'a' or 'b', whichever is smaller. + */ +struct cnum_t FN(intersect)(struct cnum_t a, struct cnum_t b) +{ + struct cnum_t b1; + ut dbase; + + if (FN(is_empty)(a) || FN(is_empty)(b)) + return EMPTY; + + if (a.base > b.base) + swap(a, b); + + /* + * Rotate frame of reference such that a.base is 0. + * 'b1' is 'b' in this frame of reference. + */ + dbase = b.base - a.base; + b1 = (struct cnum_t){ dbase, b.size }; + if (FN(urange_overflow)(b1)) { + if (b1.base <= a.size) { + /* + * Rotated frame (a.base at origin): + * + * 0 UT_MAX + * |--------------------------------------------| + * [=== a ==========================] | + * [= b1 tail =] [========= b1 main ==========>] + * ^-- b1.base <= a.size + * + * 'a' and 'b' intersect in two disjoint arcs, + * can't represent as single cnum, over-approximate + * the result. + */ + return a.size <= b.size ? a : b; + } else { + /* + * Rotated frame (a.base at origin): + * + * 0 UT_MAX + * |--------------------------------------------| + * [=== a =============] | | + * [= b1 tail =] [======= b1 main ====>] + * ^-- b1.base > a.size + * + * Only 'b' tail intersects 'a'. + */ + return (struct cnum_t) { + .base = a.base, + .size = min(a.size, (ut)(b1.base + b1.size)), + }; + } + } else if (a.size >= b1.base) { + /* + * Rotated frame (a.base at origin): + * + * 0 UT_MAX + * |--------------------------------------------------| + * [=== a ==================================] | + * [== b1 =====================] + * + * 0 UT_MAX + * |--------------------------------------------------| + * [=== a ==================================] | + * [== b1 ====] + * ^-- b1.base <= a.size + * |<-- a.size - dbase -->| + * + * 'a' and 'b' intersect as one cnum. + */ + return (struct cnum_t) { + .base = b.base, + .size = min((ut)(a.size - dbase), b.size), + }; + } else { + return EMPTY; + } +} + +void FN(intersect_with)(struct cnum_t *dst, struct cnum_t src) +{ + *dst = FN(intersect)(*dst, src); +} + +void FN(intersect_with_urange)(struct cnum_t *dst, ut min, ut max) +{ + FN(intersect_with)(dst, FN(from_urange)(min, max)); +} + +void FN(intersect_with_srange)(struct cnum_t *dst, st min, st max) +{ + FN(intersect_with)(dst, FN(from_srange)(min, max)); +} + +static inline struct cnum_t FN(normalize)(struct cnum_t cnum) +{ + if (cnum.size == UT_MAX && cnum.base != 0 && cnum.base != (ut)ST_MAX) + cnum.base = 0; + return cnum; +} + +struct cnum_t FN(add)(struct cnum_t a, struct cnum_t b) +{ + if (FN(is_empty)(a) || FN(is_empty)(b)) + return EMPTY; + if (a.size > UT_MAX - b.size) + return (struct cnum_t){ 0, (ut)UT_MAX }; + else + return FN(normalize)((struct cnum_t){ a.base + b.base, a.size + b.size }); +} + +struct cnum_t FN(negate)(struct cnum_t a) +{ + if (FN(is_empty)(a)) + return EMPTY; + return FN(normalize)((struct cnum_t){ -((ut)a.base + a.size), a.size }); +} + +bool FN(is_empty)(struct cnum_t cnum) +{ + return cnum.base == EMPTY.base && cnum.size == EMPTY.size; +} + +bool FN(contains)(struct cnum_t cnum, ut v) +{ + if (FN(is_empty)(cnum)) + return false; + if (FN(urange_overflow)(cnum)) + return v >= cnum.base || v <= (ut)cnum.base + cnum.size; + else + return v >= cnum.base && v <= (ut)cnum.base + cnum.size; +} + +bool FN(is_const)(struct cnum_t cnum) +{ + return cnum.size == 0; +} + +#undef EMPTY +#undef cnum_t +#undef ut +#undef st +#undef UT_MAX +#undef ST_MAX +#undef ST_MIN +#undef FN -- cgit v1.2.3 From b93f7180f0bc37336cb26b43aa4796973d84852e Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 24 Apr 2026 15:52:43 -0700 Subject: bpf: use accessor functions for bpf_reg_state min/max fields Replace direct access to bpf_reg_state->{smin,smax,umin,umax, s32_min,s32_max,u32_min,u32_max}_value with getter/setter inline functions, preparing for future switch to cnum-based internal representation. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260424-cnums-everywhere-rfc-v1-v3-2-ca434b39a486@gmail.com Signed-off-by: Alexei Starovoitov --- drivers/net/ethernet/netronome/nfp/bpf/verifier.c | 8 +- include/linux/bpf_verifier.h | 64 ++ kernel/bpf/log.c | 24 +- kernel/bpf/states.c | 16 +- kernel/bpf/verifier.c | 1233 ++++++++++----------- 5 files changed, 678 insertions(+), 667 deletions(-) (limited to 'kernel/bpf') diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c index 70368fe7c510..1caa87da72b5 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c @@ -561,10 +561,10 @@ nfp_bpf_check_alu(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, const struct bpf_reg_state *dreg = cur_regs(env) + meta->insn.dst_reg; - meta->umin_src = min(meta->umin_src, sreg->umin_value); - meta->umax_src = max(meta->umax_src, sreg->umax_value); - meta->umin_dst = min(meta->umin_dst, dreg->umin_value); - meta->umax_dst = max(meta->umax_dst, dreg->umax_value); + meta->umin_src = min(meta->umin_src, reg_umin(sreg)); + meta->umax_src = max(meta->umax_src, reg_umax(sreg)); + meta->umin_dst = min(meta->umin_dst, reg_umin(dreg)); + meta->umax_dst = max(meta->umax_dst, reg_umax(dreg)); /* NFP supports u16 and u32 multiplication. * diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index d5b4303315dd..bf3ffa56bbe5 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -209,6 +209,70 @@ struct bpf_reg_state { bool precise; }; +static inline s64 reg_smin(const struct bpf_reg_state *reg) +{ + return reg->smin_value; +} + +static inline s64 reg_smax(const struct bpf_reg_state *reg) +{ + return reg->smax_value; +} + +static inline u64 reg_umin(const struct bpf_reg_state *reg) +{ + return reg->umin_value; +} + +static inline u64 reg_umax(const struct bpf_reg_state *reg) +{ + return reg->umax_value; +} + +static inline s32 reg_s32_min(const struct bpf_reg_state *reg) +{ + return reg->s32_min_value; +} + +static inline s32 reg_s32_max(const struct bpf_reg_state *reg) +{ + return reg->s32_max_value; +} + +static inline u32 reg_u32_min(const struct bpf_reg_state *reg) +{ + return reg->u32_min_value; +} + +static inline u32 reg_u32_max(const struct bpf_reg_state *reg) +{ + return reg->u32_max_value; +} + +static inline void reg_set_srange32(struct bpf_reg_state *reg, s32 smin, s32 smax) +{ + reg->s32_min_value = smin; + reg->s32_max_value = smax; +} + +static inline void reg_set_urange32(struct bpf_reg_state *reg, u32 umin, u32 umax) +{ + reg->u32_min_value = umin; + reg->u32_max_value = umax; +} + +static inline void reg_set_srange64(struct bpf_reg_state *reg, s64 smin, s64 smax) +{ + reg->smin_value = smin; + reg->smax_value = smax; +} + +static inline void reg_set_urange64(struct bpf_reg_state *reg, u64 umin, u64 umax) +{ + reg->umin_value = umin; + reg->umax_value = umax; +} + enum bpf_stack_slot_type { STACK_INVALID, /* nothing was stored in this stack slot */ STACK_SPILL, /* register spilled into stack */ diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 011e4ec25acd..64566b86dd27 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -571,20 +571,20 @@ static void print_scalar_ranges(struct bpf_verifier_env *env, u64 val; bool omit; } minmaxs[] = { - {"smin", reg->smin_value, reg->smin_value == S64_MIN}, - {"smax", reg->smax_value, reg->smax_value == S64_MAX}, - {"umin", reg->umin_value, reg->umin_value == 0}, - {"umax", reg->umax_value, reg->umax_value == U64_MAX}, + {"smin", reg_smin(reg), reg_smin(reg) == S64_MIN}, + {"smax", reg_smax(reg), reg_smax(reg) == S64_MAX}, + {"umin", reg_umin(reg), reg_umin(reg) == 0}, + {"umax", reg_umax(reg), reg_umax(reg) == U64_MAX}, {"smin32", - is_snum_decimal((s64)reg->s32_min_value) - ? (s64)reg->s32_min_value - : (u32)reg->s32_min_value, reg->s32_min_value == S32_MIN}, + is_snum_decimal((s64)reg_s32_min(reg)) + ? (s64)reg_s32_min(reg) + : (u32)reg_s32_min(reg), reg_s32_min(reg) == S32_MIN}, {"smax32", - is_snum_decimal((s64)reg->s32_max_value) - ? (s64)reg->s32_max_value - : (u32)reg->s32_max_value, reg->s32_max_value == S32_MAX}, - {"umin32", reg->u32_min_value, reg->u32_min_value == 0}, - {"umax32", reg->u32_max_value, reg->u32_max_value == U32_MAX}, + is_snum_decimal((s64)reg_s32_max(reg)) + ? (s64)reg_s32_max(reg) + : (u32)reg_s32_max(reg), reg_s32_max(reg) == S32_MAX}, + {"umin32", reg_u32_min(reg), reg_u32_min(reg) == 0}, + {"umax32", reg_u32_max(reg), reg_u32_max(reg) == U32_MAX}, }, *m1, *m2, *mend = &minmaxs[ARRAY_SIZE(minmaxs)]; bool neg1, neg2; diff --git a/kernel/bpf/states.c b/kernel/bpf/states.c index 8478d2c6ed5b..a78ae891b743 100644 --- a/kernel/bpf/states.c +++ b/kernel/bpf/states.c @@ -301,14 +301,14 @@ int bpf_update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_s static bool range_within(const struct bpf_reg_state *old, const struct bpf_reg_state *cur) { - return old->umin_value <= cur->umin_value && - old->umax_value >= cur->umax_value && - old->smin_value <= cur->smin_value && - old->smax_value >= cur->smax_value && - old->u32_min_value <= cur->u32_min_value && - old->u32_max_value >= cur->u32_max_value && - old->s32_min_value <= cur->s32_min_value && - old->s32_max_value >= cur->s32_max_value; + return reg_umin(old) <= reg_umin(cur) && + reg_umax(old) >= reg_umax(cur) && + reg_smin(old) <= reg_smin(cur) && + reg_smax(old) >= reg_smax(cur) && + reg_u32_min(old) <= reg_u32_min(cur) && + reg_u32_max(old) >= reg_u32_max(cur) && + reg_s32_min(old) <= reg_s32_min(cur) && + reg_s32_max(old) >= reg_s32_max(cur); } /* If in the old state two registers had the same id, then they need to have diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ff6ff1c27517..b91d2789e7b9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -320,12 +320,12 @@ static void verbose_invalid_scalar(struct bpf_verifier_env *env, bool unknown = true; verbose(env, "%s the register %s has", ctx, reg_name); - if (reg->smin_value > S64_MIN) { - verbose(env, " smin=%lld", reg->smin_value); + if (reg_smin(reg) > S64_MIN) { + verbose(env, " smin=%lld", reg_smin(reg)); unknown = false; } - if (reg->smax_value < S64_MAX) { - verbose(env, " smax=%lld", reg->smax_value); + if (reg_smax(reg) < S64_MAX) { + verbose(env, " smax=%lld", reg_smax(reg)); unknown = false; } if (unknown) @@ -1796,15 +1796,10 @@ static const int caller_saved[CALLER_SAVED_REGS] = { static void ___mark_reg_known(struct bpf_reg_state *reg, u64 imm) { reg->var_off = tnum_const(imm); - reg->smin_value = (s64)imm; - reg->smax_value = (s64)imm; - reg->umin_value = imm; - reg->umax_value = imm; - - reg->s32_min_value = (s32)imm; - reg->s32_max_value = (s32)imm; - reg->u32_min_value = (u32)imm; - reg->u32_max_value = (u32)imm; + reg_set_srange64(reg, (s64)imm, (s64)imm); + reg_set_urange64(reg, imm, imm); + reg_set_srange32(reg, (s32)imm, (s32)imm); + reg_set_urange32(reg, (u32)imm, (u32)imm); } /* Mark the unknown part of a register (variable offset or scalar value) as @@ -1823,10 +1818,8 @@ static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm) static void __mark_reg32_known(struct bpf_reg_state *reg, u64 imm) { reg->var_off = tnum_const_subreg(reg->var_off, imm); - reg->s32_min_value = (s32)imm; - reg->s32_max_value = (s32)imm; - reg->u32_min_value = (u32)imm; - reg->u32_max_value = (u32)imm; + reg_set_srange32(reg, (s32)imm, (s32)imm); + reg_set_urange32(reg, (u32)imm, (u32)imm); } /* Mark the 'variable offset' part of a register as zero. This should be @@ -1937,34 +1930,25 @@ static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg, tnum_equals_const(reg->var_off, 0); } +static void __mark_reg32_unbounded(struct bpf_reg_state *reg) +{ + reg_set_srange32(reg, S32_MIN, S32_MAX); + reg_set_urange32(reg, 0, U32_MAX); +} + /* Reset the min/max bounds of a register */ static void __mark_reg_unbounded(struct bpf_reg_state *reg) { - reg->smin_value = S64_MIN; - reg->smax_value = S64_MAX; - reg->umin_value = 0; - reg->umax_value = U64_MAX; + reg_set_srange64(reg, S64_MIN, S64_MAX); + reg_set_urange64(reg, 0, U64_MAX); - reg->s32_min_value = S32_MIN; - reg->s32_max_value = S32_MAX; - reg->u32_min_value = 0; - reg->u32_max_value = U32_MAX; + __mark_reg32_unbounded(reg); } static void __mark_reg64_unbounded(struct bpf_reg_state *reg) { - reg->smin_value = S64_MIN; - reg->smax_value = S64_MAX; - reg->umin_value = 0; - reg->umax_value = U64_MAX; -} - -static void __mark_reg32_unbounded(struct bpf_reg_state *reg) -{ - reg->s32_min_value = S32_MIN; - reg->s32_max_value = S32_MAX; - reg->u32_min_value = 0; - reg->u32_max_value = U32_MAX; + reg_set_srange64(reg, S64_MIN, S64_MAX); + reg_set_urange64(reg, 0, U64_MAX); } static void reset_reg64_and_tnum(struct bpf_reg_state *reg) @@ -1983,15 +1967,14 @@ static void __update_reg32_bounds(struct bpf_reg_state *reg) { struct tnum var32_off = tnum_subreg(reg->var_off); - /* min signed is max(sign bit) | min(other bits) */ - reg->s32_min_value = max_t(s32, reg->s32_min_value, - var32_off.value | (var32_off.mask & S32_MIN)); - /* max signed is min(sign bit) | max(other bits) */ - reg->s32_max_value = min_t(s32, reg->s32_max_value, - var32_off.value | (var32_off.mask & S32_MAX)); - reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)var32_off.value); - reg->u32_max_value = min(reg->u32_max_value, - (u32)(var32_off.value | var32_off.mask)); + reg_set_srange32(reg, + /* min signed is max(sign bit) | min(other bits) */ + max_t(s32, reg_s32_min(reg), var32_off.value | (var32_off.mask & S32_MIN)), + /* max signed is min(sign bit) | max(other bits) */ + min_t(s32, reg_s32_max(reg), var32_off.value | (var32_off.mask & S32_MAX))); + reg_set_urange32(reg, + max_t(u32, reg_u32_min(reg), (u32)var32_off.value), + min(reg_u32_max(reg), (u32)(var32_off.value | var32_off.mask))); } static void __update_reg64_bounds(struct bpf_reg_state *reg) @@ -2000,25 +1983,27 @@ static void __update_reg64_bounds(struct bpf_reg_state *reg) bool umin_in_tnum; /* min signed is max(sign bit) | min(other bits) */ - reg->smin_value = max_t(s64, reg->smin_value, - reg->var_off.value | (reg->var_off.mask & S64_MIN)); /* max signed is min(sign bit) | max(other bits) */ - reg->smax_value = min_t(s64, reg->smax_value, - reg->var_off.value | (reg->var_off.mask & S64_MAX)); - reg->umin_value = max(reg->umin_value, reg->var_off.value); - reg->umax_value = min(reg->umax_value, - reg->var_off.value | reg->var_off.mask); + reg_set_srange64(reg, + max_t(s64, reg_smin(reg), + reg->var_off.value | (reg->var_off.mask & S64_MIN)), + min_t(s64, reg_smax(reg), + reg->var_off.value | (reg->var_off.mask & S64_MAX))); + reg_set_urange64(reg, + max(reg_umin(reg), reg->var_off.value), + min(reg_umax(reg), + reg->var_off.value | reg->var_off.mask)); /* Check if u64 and tnum overlap in a single value */ - tnum_next = tnum_step(reg->var_off, reg->umin_value); - umin_in_tnum = (reg->umin_value & ~reg->var_off.mask) == reg->var_off.value; + tnum_next = tnum_step(reg->var_off, reg_umin(reg)); + umin_in_tnum = (reg_umin(reg) & ~reg->var_off.mask) == reg->var_off.value; tmax = reg->var_off.value | reg->var_off.mask; - if (umin_in_tnum && tnum_next > reg->umax_value) { + if (umin_in_tnum && tnum_next > reg_umax(reg)) { /* The u64 range and the tnum only overlap in umin. * u64: ---[xxxxxx]----- * tnum: --xx----------x- */ - ___mark_reg_known(reg, reg->umin_value); + ___mark_reg_known(reg, reg_umin(reg)); } else if (!umin_in_tnum && tnum_next == tmax) { /* The u64 range and the tnum only overlap in the maximum value * represented by the tnum, called tmax. @@ -2026,8 +2011,8 @@ static void __update_reg64_bounds(struct bpf_reg_state *reg) * tnum: xx-----x-------- */ ___mark_reg_known(reg, tmax); - } else if (!umin_in_tnum && tnum_next <= reg->umax_value && - tnum_step(reg->var_off, tnum_next) > reg->umax_value) { + } else if (!umin_in_tnum && tnum_next <= reg_umax(reg) && + tnum_step(reg->var_off, tnum_next) > reg_umax(reg)) { /* The u64 range and the tnum only overlap in between umin * (excluded) and umax. * u64: ---[xxxxxx]----- @@ -2067,28 +2052,32 @@ static void deduce_bounds_32_from_64(struct bpf_reg_state *reg) * * So we use all these insights to derive bounds for subregisters here. */ - if ((reg->umin_value >> 32) == (reg->umax_value >> 32)) { + if ((reg_umin(reg) >> 32) == (reg_umax(reg) >> 32)) { /* u64 to u32 casting preserves validity of low 32 bits as * a range, if upper 32 bits are the same */ - reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)reg->umin_value); - reg->u32_max_value = min_t(u32, reg->u32_max_value, (u32)reg->umax_value); + reg_set_urange32(reg, + max_t(u32, reg_u32_min(reg), (u32)reg_umin(reg)), + min_t(u32, reg_u32_max(reg), (u32)reg_umax(reg))); - if ((s32)reg->umin_value <= (s32)reg->umax_value) { - reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->umin_value); - reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->umax_value); + if ((s32)reg_umin(reg) <= (s32)reg_umax(reg)) { + reg_set_srange32(reg, + max_t(s32, reg_s32_min(reg), (s32)reg_umin(reg)), + min_t(s32, reg_s32_max(reg), (s32)reg_umax(reg))); } } - if ((reg->smin_value >> 32) == (reg->smax_value >> 32)) { + if ((reg_smin(reg) >> 32) == (reg_smax(reg) >> 32)) { /* low 32 bits should form a proper u32 range */ - if ((u32)reg->smin_value <= (u32)reg->smax_value) { - reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)reg->smin_value); - reg->u32_max_value = min_t(u32, reg->u32_max_value, (u32)reg->smax_value); + if ((u32)reg_smin(reg) <= (u32)reg_smax(reg)) { + reg_set_urange32(reg, + max_t(u32, reg_u32_min(reg), (u32)reg_smin(reg)), + min_t(u32, reg_u32_max(reg), (u32)reg_smax(reg))); } /* low 32 bits should form a proper s32 range */ - if ((s32)reg->smin_value <= (s32)reg->smax_value) { - reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->smin_value); - reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->smax_value); + if ((s32)reg_smin(reg) <= (s32)reg_smax(reg)) { + reg_set_srange32(reg, + max_t(s32, reg_s32_min(reg), (s32)reg_smin(reg)), + min_t(s32, reg_s32_max(reg), (s32)reg_smax(reg))); } } /* Special case where upper bits form a small sequence of two @@ -2104,15 +2093,17 @@ static void deduce_bounds_32_from_64(struct bpf_reg_state *reg) * [0xfffffff0fffffff0; 0xfffffff100000010], forms a valid s32 range * [-16, 16] ([0xfffffff0; 0x00000010]) in its 32 bit subregister. */ - if ((u32)(reg->umin_value >> 32) + 1 == (u32)(reg->umax_value >> 32) && - (s32)reg->umin_value < 0 && (s32)reg->umax_value >= 0) { - reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->umin_value); - reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->umax_value); + if ((u32)(reg_umin(reg) >> 32) + 1 == (u32)(reg_umax(reg) >> 32) && + (s32)reg_umin(reg) < 0 && (s32)reg_umax(reg) >= 0) { + reg_set_srange32(reg, + max_t(s32, reg_s32_min(reg), (s32)reg_umin(reg)), + min_t(s32, reg_s32_max(reg), (s32)reg_umax(reg))); } - if ((u32)(reg->smin_value >> 32) + 1 == (u32)(reg->smax_value >> 32) && - (s32)reg->smin_value < 0 && (s32)reg->smax_value >= 0) { - reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->smin_value); - reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->smax_value); + if ((u32)(reg_smin(reg) >> 32) + 1 == (u32)(reg_smax(reg) >> 32) && + (s32)reg_smin(reg) < 0 && (s32)reg_smax(reg) >= 0) { + reg_set_srange32(reg, + max_t(s32, reg_s32_min(reg), (s32)reg_smin(reg)), + min_t(s32, reg_s32_max(reg), (s32)reg_smax(reg))); } } @@ -2121,19 +2112,21 @@ static void deduce_bounds_32_from_32(struct bpf_reg_state *reg) /* if u32 range forms a valid s32 range (due to matching sign bit), * try to learn from that */ - if ((s32)reg->u32_min_value <= (s32)reg->u32_max_value) { - reg->s32_min_value = max_t(s32, reg->s32_min_value, reg->u32_min_value); - reg->s32_max_value = min_t(s32, reg->s32_max_value, reg->u32_max_value); + if ((s32)reg_u32_min(reg) <= (s32)reg_u32_max(reg)) { + reg_set_srange32(reg, + max_t(s32, reg_s32_min(reg), reg_u32_min(reg)), + min_t(s32, reg_s32_max(reg), reg_u32_max(reg))); } /* If we cannot cross the sign boundary, then signed and unsigned bounds * are the same, so combine. This works even in the negative case, e.g. * -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff. */ - if ((u32)reg->s32_min_value <= (u32)reg->s32_max_value) { - reg->u32_min_value = max_t(u32, reg->s32_min_value, reg->u32_min_value); - reg->u32_max_value = min_t(u32, reg->s32_max_value, reg->u32_max_value); + if ((u32)reg_s32_min(reg) <= (u32)reg_s32_max(reg)) { + reg_set_urange32(reg, + max_t(u32, reg_s32_min(reg), reg_u32_min(reg)), + min_t(u32, reg_s32_max(reg), reg_u32_max(reg))); } else { - if (reg->u32_max_value < (u32)reg->s32_min_value) { + if (reg_u32_max(reg) < (u32)reg_s32_min(reg)) { /* See __reg64_deduce_bounds() for detailed explanation. * Refine ranges in the following situation: * @@ -2143,9 +2136,11 @@ static void deduce_bounds_32_from_32(struct bpf_reg_state *reg) * |xxxxx s32 range xxxxxxxxx] [xxxxxxx| * 0 S32_MAX S32_MIN -1 */ - reg->s32_min_value = (s32)reg->u32_min_value; - reg->u32_max_value = min_t(u32, reg->u32_max_value, reg->s32_max_value); - } else if ((u32)reg->s32_max_value < reg->u32_min_value) { + reg_set_srange32(reg, (s32)reg_u32_min(reg), reg_s32_max(reg)); + reg_set_urange32(reg, + reg_u32_min(reg), + min_t(u32, reg_u32_max(reg), reg_s32_max(reg))); + } else if ((u32)reg_s32_max(reg) < reg_u32_min(reg)) { /* * 0 U32_MAX * | [xxxxxxxxxxxxxx u32 range xxxxxxxxxxxxxx] | @@ -2153,8 +2148,10 @@ static void deduce_bounds_32_from_32(struct bpf_reg_state *reg) * |xxxxxxxxx] [xxxxxxxxxxxx s32 range | * 0 S32_MAX S32_MIN -1 */ - reg->s32_max_value = (s32)reg->u32_max_value; - reg->u32_min_value = max_t(u32, reg->u32_min_value, reg->s32_min_value); + reg_set_srange32(reg, reg_s32_min(reg), (s32)reg_u32_max(reg)); + reg_set_urange32(reg, + max_t(u32, reg_u32_min(reg), reg_s32_min(reg)), + reg_u32_max(reg)); } } } @@ -2228,17 +2225,19 @@ static void deduce_bounds_64_from_64(struct bpf_reg_state *reg) * casting umin/umax as smin/smax and checking if they form valid * range, and vice versa. Those are equivalent checks. */ - if ((s64)reg->umin_value <= (s64)reg->umax_value) { - reg->smin_value = max_t(s64, reg->smin_value, reg->umin_value); - reg->smax_value = min_t(s64, reg->smax_value, reg->umax_value); + if ((s64)reg_umin(reg) <= (s64)reg_umax(reg)) { + reg_set_srange64(reg, + max_t(s64, reg_smin(reg), reg_umin(reg)), + min_t(s64, reg_smax(reg), reg_umax(reg))); } /* If we cannot cross the sign boundary, then signed and unsigned bounds * are the same, so combine. This works even in the negative case, e.g. * -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff. */ - if ((u64)reg->smin_value <= (u64)reg->smax_value) { - reg->umin_value = max_t(u64, reg->smin_value, reg->umin_value); - reg->umax_value = min_t(u64, reg->smax_value, reg->umax_value); + if ((u64)reg_smin(reg) <= (u64)reg_smax(reg)) { + reg_set_urange64(reg, + max_t(u64, reg_smin(reg), reg_umin(reg)), + min_t(u64, reg_smax(reg), reg_umax(reg))); } else { /* If the s64 range crosses the sign boundary, then it's split * between the beginning and end of the U64 domain. In that @@ -2275,10 +2274,10 @@ static void deduce_bounds_64_from_64(struct bpf_reg_state *reg) * The first condition below corresponds to the first diagram * above. */ - if (reg->umax_value < (u64)reg->smin_value) { - reg->smin_value = (s64)reg->umin_value; - reg->umax_value = min_t(u64, reg->umax_value, reg->smax_value); - } else if ((u64)reg->smax_value < reg->umin_value) { + if (reg_umax(reg) < (u64)reg_smin(reg)) { + reg_set_srange64(reg, (s64)reg_umin(reg), reg_smax(reg)); + reg_set_urange64(reg, reg_umin(reg), min_t(u64, reg_umax(reg), reg_smax(reg))); + } else if ((u64)reg_smax(reg) < reg_umin(reg)) { /* This second condition considers the case where the u64 range * overlaps with the negative portion of the s64 range: * @@ -2288,8 +2287,8 @@ static void deduce_bounds_64_from_64(struct bpf_reg_state *reg) * |xxxxxxxxx] [xxxxxxxxxxxx s64 range | * 0 S64_MAX S64_MIN -1 */ - reg->smax_value = (s64)reg->umax_value; - reg->umin_value = max_t(u64, reg->umin_value, reg->smin_value); + reg_set_srange64(reg, reg_smin(reg), (s64)reg_umax(reg)); + reg_set_urange64(reg, max_t(u64, reg_umin(reg), reg_smin(reg)), reg_umax(reg)); } } } @@ -2312,15 +2311,17 @@ static void deduce_bounds_64_from_32(struct bpf_reg_state *reg) __s64 new_smin, new_smax; /* u32 -> u64 tightening, it's always well-formed */ - new_umin = (reg->umin_value & ~0xffffffffULL) | reg->u32_min_value; - new_umax = (reg->umax_value & ~0xffffffffULL) | reg->u32_max_value; - reg->umin_value = max_t(u64, reg->umin_value, new_umin); - reg->umax_value = min_t(u64, reg->umax_value, new_umax); + new_umin = (reg_umin(reg) & ~0xffffffffULL) | reg_u32_min(reg); + new_umax = (reg_umax(reg) & ~0xffffffffULL) | reg_u32_max(reg); + reg_set_urange64(reg, + max_t(u64, reg_umin(reg), new_umin), + min_t(u64, reg_umax(reg), new_umax)); /* u32 -> s64 tightening, u32 range embedded into s64 preserves range validity */ - new_smin = (reg->smin_value & ~0xffffffffULL) | reg->u32_min_value; - new_smax = (reg->smax_value & ~0xffffffffULL) | reg->u32_max_value; - reg->smin_value = max_t(s64, reg->smin_value, new_smin); - reg->smax_value = min_t(s64, reg->smax_value, new_smax); + new_smin = (reg_smin(reg) & ~0xffffffffULL) | reg_u32_min(reg); + new_smax = (reg_smax(reg) & ~0xffffffffULL) | reg_u32_max(reg); + reg_set_srange64(reg, + max_t(s64, reg_smin(reg), new_smin), + min_t(s64, reg_smax(reg), new_smax)); /* Here we would like to handle a special case after sign extending load, * when upper bits for a 64-bit range are all 1s or all 0s. @@ -2351,13 +2352,11 @@ static void deduce_bounds_64_from_32(struct bpf_reg_state *reg) * - 0x0000_0000_7fff_ffff == (s64)S32_MAX * These relations are used in the conditions below. */ - if (reg->s32_min_value >= 0 && reg->smin_value >= S32_MIN && reg->smax_value <= S32_MAX) { - reg->smin_value = reg->s32_min_value; - reg->smax_value = reg->s32_max_value; - reg->umin_value = reg->s32_min_value; - reg->umax_value = reg->s32_max_value; + if (reg_s32_min(reg) >= 0 && reg_smin(reg) >= S32_MIN && reg_smax(reg) <= S32_MAX) { + reg_set_srange64(reg, reg_s32_min(reg), reg_s32_max(reg)); + reg_set_urange64(reg, reg_s32_min(reg), reg_s32_max(reg)); reg->var_off = tnum_intersect(reg->var_off, - tnum_range(reg->smin_value, reg->smax_value)); + tnum_range(reg_smin(reg), reg_smax(reg))); } } @@ -2373,11 +2372,11 @@ static void __reg_deduce_bounds(struct bpf_reg_state *reg) static void __reg_bound_offset(struct bpf_reg_state *reg) { struct tnum var64_off = tnum_intersect(reg->var_off, - tnum_range(reg->umin_value, - reg->umax_value)); + tnum_range(reg_umin(reg), + reg_umax(reg))); struct tnum var32_off = tnum_intersect(tnum_subreg(var64_off), - tnum_range(reg->u32_min_value, - reg->u32_max_value)); + tnum_range(reg_u32_min(reg), + reg_u32_max(reg))); reg->var_off = tnum_or(tnum_clear_subreg(var64_off), var32_off); } @@ -2405,9 +2404,9 @@ static void reg_bounds_sync(struct bpf_reg_state *reg) static bool range_bounds_violation(struct bpf_reg_state *reg) { - return (reg->umin_value > reg->umax_value || reg->smin_value > reg->smax_value || - reg->u32_min_value > reg->u32_max_value || - reg->s32_min_value > reg->s32_max_value); + return (reg_umin(reg) > reg_umax(reg) || reg_smin(reg) > reg_smax(reg) || + reg_u32_min(reg) > reg_u32_max(reg) || + reg_s32_min(reg) > reg_s32_max(reg)); } static bool const_tnum_range_mismatch(struct bpf_reg_state *reg) @@ -2418,8 +2417,8 @@ static bool const_tnum_range_mismatch(struct bpf_reg_state *reg) if (!tnum_is_const(reg->var_off)) return false; - return reg->umin_value != uval || reg->umax_value != uval || - reg->smin_value != sval || reg->smax_value != sval; + return reg_umin(reg) != uval || reg_umax(reg) != uval || + reg_smin(reg) != sval || reg_smax(reg) != sval; } static bool const_tnum_range_mismatch_32(struct bpf_reg_state *reg) @@ -2430,8 +2429,8 @@ static bool const_tnum_range_mismatch_32(struct bpf_reg_state *reg) if (!tnum_subreg_is_const(reg->var_off)) return false; - return reg->u32_min_value != uval32 || reg->u32_max_value != uval32 || - reg->s32_min_value != sval32 || reg->s32_max_value != sval32; + return reg_u32_min(reg) != uval32 || reg_u32_max(reg) != uval32 || + reg_s32_min(reg) != sval32 || reg_s32_max(reg) != sval32; } static int reg_bounds_sanity_check(struct bpf_verifier_env *env, @@ -2458,10 +2457,10 @@ static int reg_bounds_sanity_check(struct bpf_verifier_env *env, out: verifier_bug(env, "REG INVARIANTS VIOLATION (%s): %s u64=[%#llx, %#llx] " "s64=[%#llx, %#llx] u32=[%#x, %#x] s32=[%#x, %#x] var_off=(%#llx, %#llx)", - ctx, msg, reg->umin_value, reg->umax_value, - reg->smin_value, reg->smax_value, - reg->u32_min_value, reg->u32_max_value, - reg->s32_min_value, reg->s32_max_value, + ctx, msg, reg_umin(reg), reg_umax(reg), + reg_smin(reg), reg_smax(reg), + reg_u32_min(reg), reg_u32_max(reg), + reg_s32_min(reg), reg_s32_max(reg), reg->var_off.value, reg->var_off.mask); if (env->test_reg_invariants) return -EFAULT; @@ -2476,21 +2475,17 @@ static bool __reg32_bound_s64(s32 a) static void __reg_assign_32_into_64(struct bpf_reg_state *reg) { - reg->umin_value = reg->u32_min_value; - reg->umax_value = reg->u32_max_value; + reg_set_urange64(reg, reg_u32_min(reg), reg_u32_max(reg)); /* Attempt to pull 32-bit signed bounds into 64-bit bounds but must * be positive otherwise set to worse case bounds and refine later * from tnum. */ - if (__reg32_bound_s64(reg->s32_min_value) && - __reg32_bound_s64(reg->s32_max_value)) { - reg->smin_value = reg->s32_min_value; - reg->smax_value = reg->s32_max_value; - } else { - reg->smin_value = 0; - reg->smax_value = U32_MAX; - } + if (__reg32_bound_s64(reg_s32_min(reg)) && + __reg32_bound_s64(reg_s32_max(reg))) + reg_set_srange64(reg, reg_s32_min(reg), reg_s32_max(reg)); + else + reg_set_srange64(reg, 0, U32_MAX); } /* Mark a register as having a completely unknown (scalar) value. */ @@ -2534,11 +2529,12 @@ static int __mark_reg_s32_range(struct bpf_verifier_env *env, { struct bpf_reg_state *reg = regs + regno; - reg->s32_min_value = max_t(s32, reg->s32_min_value, s32_min); - reg->s32_max_value = min_t(s32, reg->s32_max_value, s32_max); - - reg->smin_value = max_t(s64, reg->smin_value, s32_min); - reg->smax_value = min_t(s64, reg->smax_value, s32_max); + reg_set_srange32(reg, + max_t(s32, reg_s32_min(reg), s32_min), + min_t(s32, reg_s32_max(reg), s32_max)); + reg_set_srange64(reg, + max_t(s64, reg_smin(reg), s32_min), + min_t(s64, reg_smax(reg), s32_max)); reg_bounds_sync(reg); @@ -3801,7 +3797,7 @@ static bool is_bpf_st_mem(struct bpf_insn *insn) static int get_reg_width(struct bpf_reg_state *reg) { - return fls64(reg->umax_value); + return fls64(reg_umax(reg)); } /* See comment for mark_fastcall_pattern_for_call() */ @@ -3990,8 +3986,8 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env, bool zero_used = false; cur = env->cur_state->frame[env->cur_state->curframe]; - min_off = ptr_reg->smin_value + off; - max_off = ptr_reg->smax_value + off + size; + min_off = reg_smin(ptr_reg) + off; + max_off = reg_smax(ptr_reg) + off + size; if (value_regno >= 0) value_reg = &cur->regs[value_regno]; if ((value_reg && bpf_register_is_null(value_reg)) || @@ -4324,8 +4320,8 @@ static int check_stack_read_var_off(struct bpf_verifier_env *env, struct bpf_reg if (err) return err; - min_off = reg->smin_value + off; - max_off = reg->smax_value + off; + min_off = reg_smin(reg) + off; + max_off = reg_smax(reg) + off; mark_reg_stack_read(env, ptr_state, min_off, max_off + size, dst_regno); check_fastcall_stack_contract(env, ptr_state, env->insn_idx, min_off); return 0; @@ -4425,13 +4421,13 @@ static int check_map_access_type(struct bpf_verifier_env *env, struct bpf_reg_st if (type == BPF_WRITE && !(cap & BPF_MAP_CAN_WRITE)) { verbose(env, "write into map forbidden, value_size=%d off=%lld size=%d\n", - map->value_size, reg->smin_value + off, size); + map->value_size, reg_smin(reg) + off, size); return -EACCES; } if (type == BPF_READ && !(cap & BPF_MAP_CAN_READ)) { verbose(env, "read from map forbidden, value_size=%d off=%lld size=%d\n", - map->value_size, reg->smin_value + off, size); + map->value_size, reg_smin(reg) + off, size); return -EACCES; } @@ -4493,15 +4489,15 @@ static int check_mem_region_access(struct bpf_verifier_env *env, struct bpf_reg_ * index'es we need to make sure that whatever we use * will have a set floor within our range. */ - if (reg->smin_value < 0 && - (reg->smin_value == S64_MIN || - (off + reg->smin_value != (s64)(s32)(off + reg->smin_value)) || - reg->smin_value + off < 0)) { + if (reg_smin(reg) < 0 && + (reg_smin(reg) == S64_MIN || + (off + reg_smin(reg) != (s64)(s32)(off + reg_smin(reg))) || + reg_smin(reg) + off < 0)) { verbose(env, "%s min value is negative, either use unsigned index or do a if (index >=0) check.\n", reg_arg_name(env, argno)); return -EACCES; } - err = __check_mem_access(env, reg, argno, reg->smin_value + off, size, + err = __check_mem_access(env, reg, argno, reg_smin(reg) + off, size, mem_size, zero_size_allowed); if (err) { verbose(env, "%s min value is outside of the allowed memory range\n", @@ -4511,14 +4507,14 @@ static int check_mem_region_access(struct bpf_verifier_env *env, struct bpf_reg_ /* If we haven't set a max value then we need to bail since we can't be * sure we won't do bad things. - * If reg->umax_value + off could overflow, treat that as unbounded too. + * If reg_umax(reg) + off could overflow, treat that as unbounded too. */ - if (reg->umax_value >= BPF_MAX_VAR_OFF) { + if (reg_umax(reg) >= BPF_MAX_VAR_OFF) { verbose(env, "%s unbounded memory access, make sure to bounds check any such access\n", reg_arg_name(env, argno)); return -EACCES; } - err = __check_mem_access(env, reg, argno, reg->umax_value + off, size, + err = __check_mem_access(env, reg, argno, reg_umax(reg) + off, size, mem_size, zero_size_allowed); if (err) { verbose(env, "%s max value is outside of the allowed memory range\n", @@ -4546,7 +4542,7 @@ static int __check_ptr_off_reg(struct bpf_verifier_env *env, return -EACCES; } - if (reg->smin_value < 0) { + if (reg_smin(reg) < 0) { verbose(env, "negative offset %s ptr %s off=%lld disallowed\n", reg_type_str(env, reg->type), reg_arg_name(env, argno), reg->var_off.value); return -EACCES; @@ -4846,8 +4842,8 @@ static int check_map_access(struct bpf_verifier_env *env, struct bpf_reg_state * * this program. To check that [x1, x2) overlaps with [y1, y2), * it is sufficient to check x1 < y2 && y1 < x2. */ - if (reg->smin_value + off < p + field->size && - p < reg->umax_value + off + size) { + if (reg_smin(reg) + off < p + field->size && + p < reg_umax(reg) + off + size) { switch (field->type) { case BPF_KPTR_UNREF: case BPF_KPTR_REF: @@ -4942,14 +4938,14 @@ static int check_packet_access(struct bpf_verifier_env *env, struct bpf_reg_stat return err; /* __check_mem_access has made sure "off + size - 1" is within u16. - * reg->umax_value can't be bigger than MAX_PACKET_OFF which is 0xffff, + * reg_umax(reg) can't be bigger than MAX_PACKET_OFF which is 0xffff, * otherwise find_good_pkt_pointers would have refused to set range info * that __check_mem_access would have rejected this pkt access. - * Therefore, "off + reg->umax_value + size - 1" won't overflow u32. + * Therefore, "off + reg_umax(reg) + size - 1" won't overflow u32. */ env->prog->aux->max_pkt_offset = max_t(u32, env->prog->aux->max_pkt_offset, - off + reg->umax_value + size - 1); + off + reg_umax(reg) + size - 1); return 0; } @@ -5010,7 +5006,7 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, struct b err = __check_ptr_off_reg(env, reg, argno, fixed_off_ok); if (err) return err; - off += reg->umax_value; + off += reg_umax(reg); err = __check_ctx_access(env, insn_idx, off, access_size, t, info); if (err) @@ -5037,7 +5033,7 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn_access_aux info = {}; bool valid; - if (reg->smin_value < 0) { + if (reg_smin(reg) < 0) { verbose(env, "%s min value is negative, either use unsigned index or do a if (index >=0) check.\n", reg_arg_name(env, argno)); return -EACCES; @@ -5655,15 +5651,12 @@ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size) /* fix arithmetic bounds */ mask = ((u64)1 << (size * 8)) - 1; - if ((reg->umin_value & ~mask) == (reg->umax_value & ~mask)) { - reg->umin_value &= mask; - reg->umax_value &= mask; + if ((reg_umin(reg) & ~mask) == (reg_umax(reg) & ~mask)) { + reg_set_urange64(reg, reg_umin(reg) & mask, reg_umax(reg) & mask); } else { - reg->umin_value = 0; - reg->umax_value = mask; + reg_set_urange64(reg, 0, mask); } - reg->smin_value = reg->umin_value; - reg->smax_value = reg->umax_value; + reg_set_srange64(reg, reg_umin(reg), reg_umax(reg)); /* If size is smaller than 32bit register the 32bit register * values are also truncated so we push 64-bit bounds into @@ -5678,19 +5671,18 @@ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size) static void set_sext64_default_val(struct bpf_reg_state *reg, int size) { if (size == 1) { - reg->smin_value = reg->s32_min_value = S8_MIN; - reg->smax_value = reg->s32_max_value = S8_MAX; + reg_set_srange64(reg, S8_MIN, S8_MAX); + reg_set_srange32(reg, S8_MIN, S8_MAX); } else if (size == 2) { - reg->smin_value = reg->s32_min_value = S16_MIN; - reg->smax_value = reg->s32_max_value = S16_MAX; + reg_set_srange64(reg, S16_MIN, S16_MAX); + reg_set_srange32(reg, S16_MIN, S16_MAX); } else { /* size == 4 */ - reg->smin_value = reg->s32_min_value = S32_MIN; - reg->smax_value = reg->s32_max_value = S32_MAX; + reg_set_srange64(reg, S32_MIN, S32_MAX); + reg_set_srange32(reg, S32_MIN, S32_MAX); } - reg->umin_value = reg->u32_min_value = 0; - reg->umax_value = U64_MAX; - reg->u32_max_value = U32_MAX; + reg_set_urange64(reg, 0, U64_MAX); + reg_set_urange32(reg, 0, U32_MAX); reg->var_off = tnum_unknown; } @@ -5711,29 +5703,29 @@ static void coerce_reg_to_size_sx(struct bpf_reg_state *reg, int size) reg->var_off = tnum_const((s32)u64_cval); u64_cval = reg->var_off.value; - reg->smax_value = reg->smin_value = u64_cval; - reg->umax_value = reg->umin_value = u64_cval; - reg->s32_max_value = reg->s32_min_value = u64_cval; - reg->u32_max_value = reg->u32_min_value = u64_cval; + reg_set_srange64(reg, u64_cval, u64_cval); + reg_set_urange64(reg, u64_cval, u64_cval); + reg_set_srange32(reg, u64_cval, u64_cval); + reg_set_urange32(reg, u64_cval, u64_cval); return; } - top_smax_value = ((u64)reg->smax_value >> num_bits) << num_bits; - top_smin_value = ((u64)reg->smin_value >> num_bits) << num_bits; + top_smax_value = ((u64)reg_smax(reg) >> num_bits) << num_bits; + top_smin_value = ((u64)reg_smin(reg) >> num_bits) << num_bits; if (top_smax_value != top_smin_value) goto out; /* find the s64_min and s64_min after sign extension */ if (size == 1) { - init_s64_max = (s8)reg->smax_value; - init_s64_min = (s8)reg->smin_value; + init_s64_max = (s8)reg_smax(reg); + init_s64_min = (s8)reg_smin(reg); } else if (size == 2) { - init_s64_max = (s16)reg->smax_value; - init_s64_min = (s16)reg->smin_value; + init_s64_max = (s16)reg_smax(reg); + init_s64_min = (s16)reg_smin(reg); } else { - init_s64_max = (s32)reg->smax_value; - init_s64_min = (s32)reg->smin_value; + init_s64_max = (s32)reg_smax(reg); + init_s64_min = (s32)reg_smin(reg); } s64_max = max(init_s64_max, init_s64_min); @@ -5741,10 +5733,10 @@ static void coerce_reg_to_size_sx(struct bpf_reg_state *reg, int size) /* both of s64_max/s64_min positive or negative */ if ((s64_max >= 0) == (s64_min >= 0)) { - reg->s32_min_value = reg->smin_value = s64_min; - reg->s32_max_value = reg->smax_value = s64_max; - reg->u32_min_value = reg->umin_value = s64_min; - reg->u32_max_value = reg->umax_value = s64_max; + reg_set_srange64(reg, s64_min, s64_max); + reg_set_urange64(reg, s64_min, s64_max); + reg_set_srange32(reg, s64_min, s64_max); + reg_set_urange32(reg, s64_min, s64_max); reg->var_off = tnum_range(s64_min, s64_max); return; } @@ -5755,16 +5747,12 @@ out: static void set_sext32_default_val(struct bpf_reg_state *reg, int size) { - if (size == 1) { - reg->s32_min_value = S8_MIN; - reg->s32_max_value = S8_MAX; - } else { + if (size == 1) + reg_set_srange32(reg, S8_MIN, S8_MAX); + else /* size == 2 */ - reg->s32_min_value = S16_MIN; - reg->s32_max_value = S16_MAX; - } - reg->u32_min_value = 0; - reg->u32_max_value = U32_MAX; + reg_set_srange32(reg, S16_MIN, S16_MAX); + reg_set_urange32(reg, 0, U32_MAX); reg->var_off = tnum_subreg(tnum_unknown); } @@ -5782,34 +5770,32 @@ static void coerce_subreg_to_size_sx(struct bpf_reg_state *reg, int size) reg->var_off = tnum_const((s16)u32_val); u32_val = reg->var_off.value; - reg->s32_min_value = reg->s32_max_value = u32_val; - reg->u32_min_value = reg->u32_max_value = u32_val; + reg_set_srange32(reg, u32_val, u32_val); + reg_set_urange32(reg, u32_val, u32_val); return; } - top_smax_value = ((u32)reg->s32_max_value >> num_bits) << num_bits; - top_smin_value = ((u32)reg->s32_min_value >> num_bits) << num_bits; + top_smax_value = ((u32)reg_s32_max(reg) >> num_bits) << num_bits; + top_smin_value = ((u32)reg_s32_min(reg) >> num_bits) << num_bits; if (top_smax_value != top_smin_value) goto out; /* find the s32_min and s32_min after sign extension */ if (size == 1) { - init_s32_max = (s8)reg->s32_max_value; - init_s32_min = (s8)reg->s32_min_value; + init_s32_max = (s8)reg_s32_max(reg); + init_s32_min = (s8)reg_s32_min(reg); } else { /* size == 2 */ - init_s32_max = (s16)reg->s32_max_value; - init_s32_min = (s16)reg->s32_min_value; + init_s32_max = (s16)reg_s32_max(reg); + init_s32_min = (s16)reg_s32_min(reg); } s32_max = max(init_s32_max, init_s32_min); s32_min = min(init_s32_max, init_s32_min); if ((s32_min >= 0) == (s32_max >= 0)) { - reg->s32_min_value = s32_min; - reg->s32_max_value = s32_max; - reg->u32_min_value = (u32)s32_min; - reg->u32_max_value = (u32)s32_max; + reg_set_srange32(reg, s32_min, s32_max); + reg_set_urange32(reg, (u32)s32_min, (u32)s32_max); reg->var_off = tnum_subreg(tnum_range(s32_min, s32_max)); return; } @@ -6266,14 +6252,14 @@ static int check_stack_access_within_bounds( min_off = (s64)reg->var_off.value + off; max_off = min_off + access_size; } else { - if (reg->smax_value >= BPF_MAX_VAR_OFF || - reg->smin_value <= -BPF_MAX_VAR_OFF) { + if (reg_smax(reg) >= BPF_MAX_VAR_OFF || + reg_smin(reg) <= -BPF_MAX_VAR_OFF) { verbose(env, "invalid unbounded variable-offset%s stack %s\n", err_extra, reg_arg_name(env, argno)); return -EACCES; } - min_off = reg->smin_value + off; - max_off = reg->smax_value + off + access_size; + min_off = reg_smin(reg) + off; + max_off = reg_smax(reg) + off + access_size; } err = check_stack_slot_within_bounds(env, min_off, state, type); @@ -6891,8 +6877,8 @@ static int check_stack_range_initialized( if (meta && meta->raw_mode) meta = NULL; - min_off = reg->smin_value + off; - max_off = reg->smax_value + off; + min_off = reg_smin(reg) + off; + max_off = reg_smax(reg) + off; } if (meta && meta->raw_mode) { @@ -7048,8 +7034,8 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, struct bpf_reg_ zero_size_allowed); if (err) return err; - if (env->prog->aux->max_ctx_offset < reg->umax_value + access_size) - env->prog->aux->max_ctx_offset = reg->umax_value + access_size; + if (env->prog->aux->max_ctx_offset < reg_umax(reg) + access_size) + env->prog->aux->max_ctx_offset = reg_umax(reg) + access_size; return 0; } fallthrough; @@ -7088,7 +7074,7 @@ static int check_mem_size_reg(struct bpf_verifier_env *env, * out. Only upper bounds can be learned because retval is an * int type and negative retvals are allowed. */ - meta->msize_max_value = size_reg->umax_value; + meta->msize_max_value = reg_umax(size_reg); /* The register is SCALAR_VALUE; the access check happens using * its boundaries. For unprivileged variable accesses, disable @@ -7098,24 +7084,24 @@ static int check_mem_size_reg(struct bpf_verifier_env *env, if (!tnum_is_const(size_reg->var_off)) meta = NULL; - if (size_reg->smin_value < 0) { + if (reg_smin(size_reg) < 0) { verbose(env, "%s min value is negative, either use unsigned or 'var &= const'\n", reg_arg_name(env, size_argno)); return -EACCES; } - if (size_reg->umin_value == 0 && !zero_size_allowed) { + if (reg_umin(size_reg) == 0 && !zero_size_allowed) { verbose(env, "%s invalid zero-sized read: u64=[%lld,%lld]\n", - reg_arg_name(env, size_argno), size_reg->umin_value, size_reg->umax_value); + reg_arg_name(env, size_argno), reg_umin(size_reg), reg_umax(size_reg)); return -EACCES; } - if (size_reg->umax_value >= BPF_MAX_VAR_SIZ) { + if (reg_umax(size_reg) >= BPF_MAX_VAR_SIZ) { verbose(env, "%s unbounded memory access, use 'var &= const' or 'if (var < const)'\n", reg_arg_name(env, size_argno)); return -EACCES; } - err = check_helper_mem_access(env, mem_reg, mem_argno, size_reg->umax_value, + err = check_helper_mem_access(env, mem_reg, mem_argno, reg_umax(size_reg), access_type, zero_size_allowed, meta); if (!err) err = mark_chain_precision(env, reg_from_argno(size_argno)); @@ -9848,9 +9834,9 @@ static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env) static bool retval_range_within(struct bpf_retval_range range, const struct bpf_reg_state *reg) { if (range.return_32bit) - return range.minval <= reg->s32_min_value && reg->s32_max_value <= range.maxval; + return range.minval <= reg_s32_min(reg) && reg_s32_max(reg) <= range.maxval; else - return range.minval <= reg->smin_value && reg->smax_value <= range.maxval; + return range.minval <= reg_smin(reg) && reg_smax(reg) <= range.maxval; } static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) @@ -9959,21 +9945,15 @@ static int do_refine_retval_range(struct bpf_verifier_env *env, case BPF_FUNC_probe_read_str: case BPF_FUNC_probe_read_kernel_str: case BPF_FUNC_probe_read_user_str: - ret_reg->smax_value = meta->msize_max_value; - ret_reg->s32_max_value = meta->msize_max_value; - ret_reg->smin_value = -MAX_ERRNO; - ret_reg->s32_min_value = -MAX_ERRNO; + reg_set_srange64(ret_reg, -MAX_ERRNO, meta->msize_max_value); + reg_set_srange32(ret_reg, -MAX_ERRNO, meta->msize_max_value); reg_bounds_sync(ret_reg); break; case BPF_FUNC_get_smp_processor_id: - ret_reg->umax_value = nr_cpu_ids - 1; - ret_reg->u32_max_value = nr_cpu_ids - 1; - ret_reg->smax_value = nr_cpu_ids - 1; - ret_reg->s32_max_value = nr_cpu_ids - 1; - ret_reg->umin_value = 0; - ret_reg->u32_min_value = 0; - ret_reg->smin_value = 0; - ret_reg->s32_min_value = 0; + reg_set_urange64(ret_reg, 0, nr_cpu_ids - 1); + reg_set_urange32(ret_reg, 0, nr_cpu_ids - 1); + reg_set_srange64(ret_reg, 0, nr_cpu_ids - 1); + reg_set_srange32(ret_reg, 0, nr_cpu_ids - 1); reg_bounds_sync(ret_reg); break; } @@ -10438,7 +10418,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn err = mark_chain_precision(env, BPF_REG_1); if (err) return err; - if (cur_func(env)->callback_depth < regs[BPF_REG_1].umax_value) { + if (cur_func(env)->callback_depth < reg_umax(®s[BPF_REG_1])) { err = push_callback_call(env, insn, insn_idx, meta.subprogno, set_loop_callback_state); } else { @@ -13403,7 +13383,7 @@ static bool check_reg_sane_offset_scalar(struct bpf_verifier_env *env, { bool known = tnum_is_const(reg->var_off); s64 val = reg->var_off.value; - s64 smin = reg->smin_value; + s64 smin = reg_smin(reg); if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) { verbose(env, "math between %s pointer and %lld is not allowed\n", @@ -13432,7 +13412,7 @@ static bool check_reg_sane_offset_ptr(struct bpf_verifier_env *env, { bool known = tnum_is_const(reg->var_off); s64 val = reg->var_off.value; - s64 smin = reg->smin_value; + s64 smin = reg_smin(reg); if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) { verbose(env, "%s pointer offset %lld is not allowed\n", @@ -13474,7 +13454,7 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, break; case PTR_TO_MAP_VALUE: max = ptr_reg->map_ptr->value_size; - ptr_limit = mask_to_left ? ptr_reg->smin_value : ptr_reg->umax_value; + ptr_limit = mask_to_left ? reg_smin(ptr_reg) : reg_umax(ptr_reg); break; default: return REASON_TYPE; @@ -13563,7 +13543,7 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env, struct bpf_insn_aux_data *aux = commit_window ? cur_aux(env) : &info->aux; struct bpf_verifier_state *vstate = env->cur_state; bool off_is_imm = tnum_is_const(off_reg->var_off); - bool off_is_neg = off_reg->smin_value < 0; + bool off_is_neg = reg_smin(off_reg) < 0; bool ptr_is_dst_reg = ptr_reg == dst_reg; u8 opcode = BPF_OP(insn->code); u32 alu_state, alu_limit; @@ -13582,7 +13562,7 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env, if (!commit_window) { if (!tnum_is_const(off_reg->var_off) && - (off_reg->smin_value < 0) != (off_reg->smax_value < 0)) + (reg_smin(off_reg) < 0) != (reg_smax(off_reg) < 0)) return REASON_BOUNDS; info->mask_to_left = (opcode == BPF_ADD && off_is_neg) || @@ -13776,10 +13756,10 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *regs = state->regs, *dst_reg; bool known = tnum_is_const(off_reg->var_off); - s64 smin_val = off_reg->smin_value, smax_val = off_reg->smax_value, - smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value; - u64 umin_val = off_reg->umin_value, umax_val = off_reg->umax_value, - umin_ptr = ptr_reg->umin_value, umax_ptr = ptr_reg->umax_value; + s64 smin_val = reg_smin(off_reg), smax_val = reg_smax(off_reg), + smin_ptr = reg_smin(ptr_reg), smax_ptr = reg_smax(ptr_reg); + u64 umin_val = reg_umin(off_reg), umax_val = reg_umax(off_reg), + umin_ptr = reg_umin(ptr_reg), umax_ptr = reg_umax(ptr_reg); struct bpf_sanitize_info info = {}; u8 opcode = BPF_OP(insn->code); u32 dst = insn->dst_reg; @@ -13881,15 +13861,22 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, * added into the variable offset, and we copy the fixed offset * from ptr_reg. */ - if (check_add_overflow(smin_ptr, smin_val, &dst_reg->smin_value) || - check_add_overflow(smax_ptr, smax_val, &dst_reg->smax_value)) { - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; + { + s64 smin_res, smax_res; + u64 umin_res, umax_res; + + if (check_add_overflow(smin_ptr, smin_val, &smin_res) || + check_add_overflow(smax_ptr, smax_val, &smax_res)) { + reg_set_srange64(dst_reg, S64_MIN, S64_MAX); + } else { + reg_set_srange64(dst_reg, smin_res, smax_res); + } + if (check_add_overflow(umin_ptr, umin_val, &umin_res) || + check_add_overflow(umax_ptr, umax_val, &umax_res)) { + reg_set_urange64(dst_reg, 0, U64_MAX); + } else { + reg_set_urange64(dst_reg, umin_res, umax_res); } - if (check_add_overflow(umin_ptr, umin_val, &dst_reg->umin_value) || - check_add_overflow(umax_ptr, umax_val, &dst_reg->umax_value)) { - dst_reg->umin_value = 0; - dst_reg->umax_value = U64_MAX; } dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off); dst_reg->raw = ptr_reg->raw; @@ -13925,20 +13912,23 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, /* A new variable offset is created. If the subtrahend is known * nonnegative, then any reg->range we had before is still good. */ - if (check_sub_overflow(smin_ptr, smax_val, &dst_reg->smin_value) || - check_sub_overflow(smax_ptr, smin_val, &dst_reg->smax_value)) { + { + s64 smin_res, smax_res; + + if (check_sub_overflow(smin_ptr, smax_val, &smin_res) || + check_sub_overflow(smax_ptr, smin_val, &smax_res)) { /* Overflow possible, we know nothing */ - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; + reg_set_srange64(dst_reg, S64_MIN, S64_MAX); + } else { + reg_set_srange64(dst_reg, smin_res, smax_res); + } } if (umin_ptr < umax_val) { /* Overflow possible, we know nothing */ - dst_reg->umin_value = 0; - dst_reg->umax_value = U64_MAX; + reg_set_urange64(dst_reg, 0, U64_MAX); } else { /* Cannot overflow (as long as bounds are consistent) */ - dst_reg->umin_value = umin_ptr - umax_val; - dst_reg->umax_value = umax_ptr - umin_val; + reg_set_urange64(dst_reg, umin_ptr - umax_val, umax_ptr - umin_val); } dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off); dst_reg->raw = ptr_reg->raw; @@ -13996,18 +13986,18 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, static void scalar32_min_max_add(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - s32 *dst_smin = &dst_reg->s32_min_value; - s32 *dst_smax = &dst_reg->s32_max_value; - u32 *dst_umin = &dst_reg->u32_min_value; - u32 *dst_umax = &dst_reg->u32_max_value; - u32 umin_val = src_reg->u32_min_value; - u32 umax_val = src_reg->u32_max_value; + s32 smin = reg_s32_min(dst_reg); + s32 smax = reg_s32_max(dst_reg); + u32 umin = reg_u32_min(dst_reg); + u32 umax = reg_u32_max(dst_reg); + u32 umin_val = reg_u32_min(src_reg); + u32 umax_val = reg_u32_max(src_reg); bool min_overflow, max_overflow; - if (check_add_overflow(*dst_smin, src_reg->s32_min_value, dst_smin) || - check_add_overflow(*dst_smax, src_reg->s32_max_value, dst_smax)) { - *dst_smin = S32_MIN; - *dst_smax = S32_MAX; + if (check_add_overflow(smin, reg_s32_min(src_reg), &smin) || + check_add_overflow(smax, reg_s32_max(src_reg), &smax)) { + smin = S32_MIN; + smax = S32_MAX; } /* If either all additions overflow or no additions overflow, then @@ -14015,30 +14005,33 @@ static void scalar32_min_max_add(struct bpf_reg_state *dst_reg, * dst_umax + src_umax. Otherwise (some additions overflow), set * the output bounds to unbounded. */ - min_overflow = check_add_overflow(*dst_umin, umin_val, dst_umin); - max_overflow = check_add_overflow(*dst_umax, umax_val, dst_umax); + min_overflow = check_add_overflow(umin, umin_val, &umin); + max_overflow = check_add_overflow(umax, umax_val, &umax); if (!min_overflow && max_overflow) { - *dst_umin = 0; - *dst_umax = U32_MAX; + umin = 0; + umax = U32_MAX; } + + reg_set_srange32(dst_reg, smin, smax); + reg_set_urange32(dst_reg, umin, umax); } static void scalar_min_max_add(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - s64 *dst_smin = &dst_reg->smin_value; - s64 *dst_smax = &dst_reg->smax_value; - u64 *dst_umin = &dst_reg->umin_value; - u64 *dst_umax = &dst_reg->umax_value; - u64 umin_val = src_reg->umin_value; - u64 umax_val = src_reg->umax_value; + s64 smin = reg_smin(dst_reg); + s64 smax = reg_smax(dst_reg); + u64 umin = reg_umin(dst_reg); + u64 umax = reg_umax(dst_reg); + u64 umin_val = reg_umin(src_reg); + u64 umax_val = reg_umax(src_reg); bool min_overflow, max_overflow; - if (check_add_overflow(*dst_smin, src_reg->smin_value, dst_smin) || - check_add_overflow(*dst_smax, src_reg->smax_value, dst_smax)) { - *dst_smin = S64_MIN; - *dst_smax = S64_MAX; + if (check_add_overflow(smin, reg_smin(src_reg), &smin) || + check_add_overflow(smax, reg_smax(src_reg), &smax)) { + smin = S64_MIN; + smax = S64_MAX; } /* If either all additions overflow or no additions overflow, then @@ -14046,31 +14039,34 @@ static void scalar_min_max_add(struct bpf_reg_state *dst_reg, * dst_umax + src_umax. Otherwise (some additions overflow), set * the output bounds to unbounded. */ - min_overflow = check_add_overflow(*dst_umin, umin_val, dst_umin); - max_overflow = check_add_overflow(*dst_umax, umax_val, dst_umax); + min_overflow = check_add_overflow(umin, umin_val, &umin); + max_overflow = check_add_overflow(umax, umax_val, &umax); if (!min_overflow && max_overflow) { - *dst_umin = 0; - *dst_umax = U64_MAX; + umin = 0; + umax = U64_MAX; } + + reg_set_srange64(dst_reg, smin, smax); + reg_set_urange64(dst_reg, umin, umax); } static void scalar32_min_max_sub(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - s32 *dst_smin = &dst_reg->s32_min_value; - s32 *dst_smax = &dst_reg->s32_max_value; - u32 *dst_umin = &dst_reg->u32_min_value; - u32 *dst_umax = &dst_reg->u32_max_value; - u32 umin_val = src_reg->u32_min_value; - u32 umax_val = src_reg->u32_max_value; + s32 smin = reg_s32_min(dst_reg); + s32 smax = reg_s32_max(dst_reg); + u32 umin = reg_u32_min(dst_reg); + u32 umax = reg_u32_max(dst_reg); + u32 umin_val = reg_u32_min(src_reg); + u32 umax_val = reg_u32_max(src_reg); bool min_underflow, max_underflow; - if (check_sub_overflow(*dst_smin, src_reg->s32_max_value, dst_smin) || - check_sub_overflow(*dst_smax, src_reg->s32_min_value, dst_smax)) { + if (check_sub_overflow(smin, reg_s32_max(src_reg), &smin) || + check_sub_overflow(smax, reg_s32_min(src_reg), &smax)) { /* Overflow possible, we know nothing */ - *dst_smin = S32_MIN; - *dst_smax = S32_MAX; + smin = S32_MIN; + smax = S32_MAX; } /* If either all subtractions underflow or no subtractions @@ -14078,31 +14074,34 @@ static void scalar32_min_max_sub(struct bpf_reg_state *dst_reg, * dst_umax = dst_umax - src_umin. Otherwise (some subtractions * underflow), set the output bounds to unbounded. */ - min_underflow = check_sub_overflow(*dst_umin, umax_val, dst_umin); - max_underflow = check_sub_overflow(*dst_umax, umin_val, dst_umax); + min_underflow = check_sub_overflow(umin, umax_val, &umin); + max_underflow = check_sub_overflow(umax, umin_val, &umax); if (min_underflow && !max_underflow) { - *dst_umin = 0; - *dst_umax = U32_MAX; + umin = 0; + umax = U32_MAX; } + + reg_set_srange32(dst_reg, smin, smax); + reg_set_urange32(dst_reg, umin, umax); } static void scalar_min_max_sub(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - s64 *dst_smin = &dst_reg->smin_value; - s64 *dst_smax = &dst_reg->smax_value; - u64 *dst_umin = &dst_reg->umin_value; - u64 *dst_umax = &dst_reg->umax_value; - u64 umin_val = src_reg->umin_value; - u64 umax_val = src_reg->umax_value; + s64 smin = reg_smin(dst_reg); + s64 smax = reg_smax(dst_reg); + u64 umin = reg_umin(dst_reg); + u64 umax = reg_umax(dst_reg); + u64 umin_val = reg_umin(src_reg); + u64 umax_val = reg_umax(src_reg); bool min_underflow, max_underflow; - if (check_sub_overflow(*dst_smin, src_reg->smax_value, dst_smin) || - check_sub_overflow(*dst_smax, src_reg->smin_value, dst_smax)) { + if (check_sub_overflow(smin, reg_smax(src_reg), &smin) || + check_sub_overflow(smax, reg_smin(src_reg), &smax)) { /* Overflow possible, we know nothing */ - *dst_smin = S64_MIN; - *dst_smax = S64_MAX; + smin = S64_MIN; + smax = S64_MAX; } /* If either all subtractions underflow or no subtractions @@ -14110,113 +14109,116 @@ static void scalar_min_max_sub(struct bpf_reg_state *dst_reg, * dst_umax = dst_umax - src_umin. Otherwise (some subtractions * underflow), set the output bounds to unbounded. */ - min_underflow = check_sub_overflow(*dst_umin, umax_val, dst_umin); - max_underflow = check_sub_overflow(*dst_umax, umin_val, dst_umax); + min_underflow = check_sub_overflow(umin, umax_val, &umin); + max_underflow = check_sub_overflow(umax, umin_val, &umax); if (min_underflow && !max_underflow) { - *dst_umin = 0; - *dst_umax = U64_MAX; + umin = 0; + umax = U64_MAX; } + + reg_set_srange64(dst_reg, smin, smax); + reg_set_urange64(dst_reg, umin, umax); } static void scalar32_min_max_mul(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - s32 *dst_smin = &dst_reg->s32_min_value; - s32 *dst_smax = &dst_reg->s32_max_value; - u32 *dst_umin = &dst_reg->u32_min_value; - u32 *dst_umax = &dst_reg->u32_max_value; + s32 smin = reg_s32_min(dst_reg); + s32 smax = reg_s32_max(dst_reg); + u32 umin = reg_u32_min(dst_reg); + u32 umax = reg_u32_max(dst_reg); s32 tmp_prod[4]; - if (check_mul_overflow(*dst_umax, src_reg->u32_max_value, dst_umax) || - check_mul_overflow(*dst_umin, src_reg->u32_min_value, dst_umin)) { + if (check_mul_overflow(umax, reg_u32_max(src_reg), &umax) || + check_mul_overflow(umin, reg_u32_min(src_reg), &umin)) { /* Overflow possible, we know nothing */ - *dst_umin = 0; - *dst_umax = U32_MAX; + umin = 0; + umax = U32_MAX; } - if (check_mul_overflow(*dst_smin, src_reg->s32_min_value, &tmp_prod[0]) || - check_mul_overflow(*dst_smin, src_reg->s32_max_value, &tmp_prod[1]) || - check_mul_overflow(*dst_smax, src_reg->s32_min_value, &tmp_prod[2]) || - check_mul_overflow(*dst_smax, src_reg->s32_max_value, &tmp_prod[3])) { + if (check_mul_overflow(smin, reg_s32_min(src_reg), &tmp_prod[0]) || + check_mul_overflow(smin, reg_s32_max(src_reg), &tmp_prod[1]) || + check_mul_overflow(smax, reg_s32_min(src_reg), &tmp_prod[2]) || + check_mul_overflow(smax, reg_s32_max(src_reg), &tmp_prod[3])) { /* Overflow possible, we know nothing */ - *dst_smin = S32_MIN; - *dst_smax = S32_MAX; + smin = S32_MIN; + smax = S32_MAX; } else { - *dst_smin = min_array(tmp_prod, 4); - *dst_smax = max_array(tmp_prod, 4); + smin = min_array(tmp_prod, 4); + smax = max_array(tmp_prod, 4); } + + reg_set_srange32(dst_reg, smin, smax); + reg_set_urange32(dst_reg, umin, umax); } static void scalar_min_max_mul(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - s64 *dst_smin = &dst_reg->smin_value; - s64 *dst_smax = &dst_reg->smax_value; - u64 *dst_umin = &dst_reg->umin_value; - u64 *dst_umax = &dst_reg->umax_value; + s64 smin = reg_smin(dst_reg); + s64 smax = reg_smax(dst_reg); + u64 umin = reg_umin(dst_reg); + u64 umax = reg_umax(dst_reg); s64 tmp_prod[4]; - if (check_mul_overflow(*dst_umax, src_reg->umax_value, dst_umax) || - check_mul_overflow(*dst_umin, src_reg->umin_value, dst_umin)) { + if (check_mul_overflow(umax, reg_umax(src_reg), &umax) || + check_mul_overflow(umin, reg_umin(src_reg), &umin)) { /* Overflow possible, we know nothing */ - *dst_umin = 0; - *dst_umax = U64_MAX; + umin = 0; + umax = U64_MAX; } - if (check_mul_overflow(*dst_smin, src_reg->smin_value, &tmp_prod[0]) || - check_mul_overflow(*dst_smin, src_reg->smax_value, &tmp_prod[1]) || - check_mul_overflow(*dst_smax, src_reg->smin_value, &tmp_prod[2]) || - check_mul_overflow(*dst_smax, src_reg->smax_value, &tmp_prod[3])) { + if (check_mul_overflow(smin, reg_smin(src_reg), &tmp_prod[0]) || + check_mul_overflow(smin, reg_smax(src_reg), &tmp_prod[1]) || + check_mul_overflow(smax, reg_smin(src_reg), &tmp_prod[2]) || + check_mul_overflow(smax, reg_smax(src_reg), &tmp_prod[3])) { /* Overflow possible, we know nothing */ - *dst_smin = S64_MIN; - *dst_smax = S64_MAX; + smin = S64_MIN; + smax = S64_MAX; } else { - *dst_smin = min_array(tmp_prod, 4); - *dst_smax = max_array(tmp_prod, 4); + smin = min_array(tmp_prod, 4); + smax = max_array(tmp_prod, 4); } + + reg_set_srange64(dst_reg, smin, smax); + reg_set_urange64(dst_reg, umin, umax); } static void scalar32_min_max_udiv(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - u32 *dst_umin = &dst_reg->u32_min_value; - u32 *dst_umax = &dst_reg->u32_max_value; - u32 src_val = src_reg->u32_min_value; /* non-zero, const divisor */ + u32 src_val = reg_u32_min(src_reg); /* non-zero, const divisor */ - *dst_umin = *dst_umin / src_val; - *dst_umax = *dst_umax / src_val; + reg_set_urange32(dst_reg, reg_u32_min(dst_reg) / src_val, + reg_u32_max(dst_reg) / src_val); /* Reset other ranges/tnum to unbounded/unknown. */ - dst_reg->s32_min_value = S32_MIN; - dst_reg->s32_max_value = S32_MAX; + reg_set_srange32(dst_reg, S32_MIN, S32_MAX); reset_reg64_and_tnum(dst_reg); } static void scalar_min_max_udiv(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - u64 *dst_umin = &dst_reg->umin_value; - u64 *dst_umax = &dst_reg->umax_value; - u64 src_val = src_reg->umin_value; /* non-zero, const divisor */ + u64 src_val = reg_umin(src_reg); /* non-zero, const divisor */ - *dst_umin = div64_u64(*dst_umin, src_val); - *dst_umax = div64_u64(*dst_umax, src_val); + reg_set_urange64(dst_reg, div64_u64(reg_umin(dst_reg), src_val), + div64_u64(reg_umax(dst_reg), src_val)); /* Reset other ranges/tnum to unbounded/unknown. */ - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; + reg_set_srange64(dst_reg, S64_MIN, S64_MAX); reset_reg32_and_tnum(dst_reg); } static void scalar32_min_max_sdiv(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - s32 *dst_smin = &dst_reg->s32_min_value; - s32 *dst_smax = &dst_reg->s32_max_value; - s32 src_val = src_reg->s32_min_value; /* non-zero, const divisor */ + s32 smin = reg_s32_min(dst_reg); + s32 smax = reg_s32_max(dst_reg); + s32 src_val = reg_s32_min(src_reg); /* non-zero, const divisor */ s32 res1, res2; /* BPF div specification: S32_MIN / -1 = S32_MIN */ - if (*dst_smin == S32_MIN && src_val == -1) { + if (smin == S32_MIN && src_val == -1) { /* * If the dividend range contains more than just S32_MIN, * we cannot precisely track the result, so it becomes unbounded. @@ -14225,35 +14227,35 @@ static void scalar32_min_max_sdiv(struct bpf_reg_state *dst_reg, * = {S32_MIN} U [S32_MAX-9, S32_MAX] = [S32_MIN, S32_MAX] * Otherwise (if dividend is exactly S32_MIN), result remains S32_MIN. */ - if (*dst_smax != S32_MIN) { - *dst_smin = S32_MIN; - *dst_smax = S32_MAX; + if (smax != S32_MIN) { + smin = S32_MIN; + smax = S32_MAX; } goto reset; } - res1 = *dst_smin / src_val; - res2 = *dst_smax / src_val; - *dst_smin = min(res1, res2); - *dst_smax = max(res1, res2); + res1 = smin / src_val; + res2 = smax / src_val; + smin = min(res1, res2); + smax = max(res1, res2); reset: + reg_set_srange32(dst_reg, smin, smax); /* Reset other ranges/tnum to unbounded/unknown. */ - dst_reg->u32_min_value = 0; - dst_reg->u32_max_value = U32_MAX; + reg_set_urange32(dst_reg, 0, U32_MAX); reset_reg64_and_tnum(dst_reg); } static void scalar_min_max_sdiv(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - s64 *dst_smin = &dst_reg->smin_value; - s64 *dst_smax = &dst_reg->smax_value; - s64 src_val = src_reg->smin_value; /* non-zero, const divisor */ + s64 smin = reg_smin(dst_reg); + s64 smax = reg_smax(dst_reg); + s64 src_val = reg_smin(src_reg); /* non-zero, const divisor */ s64 res1, res2; /* BPF div specification: S64_MIN / -1 = S64_MIN */ - if (*dst_smin == S64_MIN && src_val == -1) { + if (smin == S64_MIN && src_val == -1) { /* * If the dividend range contains more than just S64_MIN, * we cannot precisely track the result, so it becomes unbounded. @@ -14262,79 +14264,69 @@ static void scalar_min_max_sdiv(struct bpf_reg_state *dst_reg, * = {S64_MIN} U [S64_MAX-9, S64_MAX] = [S64_MIN, S64_MAX] * Otherwise (if dividend is exactly S64_MIN), result remains S64_MIN. */ - if (*dst_smax != S64_MIN) { - *dst_smin = S64_MIN; - *dst_smax = S64_MAX; + if (smax != S64_MIN) { + smin = S64_MIN; + smax = S64_MAX; } goto reset; } - res1 = div64_s64(*dst_smin, src_val); - res2 = div64_s64(*dst_smax, src_val); - *dst_smin = min(res1, res2); - *dst_smax = max(res1, res2); + res1 = div64_s64(smin, src_val); + res2 = div64_s64(smax, src_val); + smin = min(res1, res2); + smax = max(res1, res2); reset: + reg_set_srange64(dst_reg, smin, smax); /* Reset other ranges/tnum to unbounded/unknown. */ - dst_reg->umin_value = 0; - dst_reg->umax_value = U64_MAX; + reg_set_urange64(dst_reg, 0, U64_MAX); reset_reg32_and_tnum(dst_reg); } static void scalar32_min_max_umod(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - u32 *dst_umin = &dst_reg->u32_min_value; - u32 *dst_umax = &dst_reg->u32_max_value; - u32 src_val = src_reg->u32_min_value; /* non-zero, const divisor */ + u32 src_val = reg_u32_min(src_reg); /* non-zero, const divisor */ u32 res_max = src_val - 1; /* * If dst_umax <= res_max, the result remains unchanged. * e.g., [2, 5] % 10 = [2, 5]. */ - if (*dst_umax <= res_max) + if (reg_u32_max(dst_reg) <= res_max) return; - *dst_umin = 0; - *dst_umax = min(*dst_umax, res_max); + reg_set_urange32(dst_reg, 0, min(reg_u32_max(dst_reg), res_max)); /* Reset other ranges/tnum to unbounded/unknown. */ - dst_reg->s32_min_value = S32_MIN; - dst_reg->s32_max_value = S32_MAX; + reg_set_srange32(dst_reg, S32_MIN, S32_MAX); reset_reg64_and_tnum(dst_reg); } static void scalar_min_max_umod(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - u64 *dst_umin = &dst_reg->umin_value; - u64 *dst_umax = &dst_reg->umax_value; - u64 src_val = src_reg->umin_value; /* non-zero, const divisor */ + u64 src_val = reg_umin(src_reg); /* non-zero, const divisor */ u64 res_max = src_val - 1; /* * If dst_umax <= res_max, the result remains unchanged. * e.g., [2, 5] % 10 = [2, 5]. */ - if (*dst_umax <= res_max) + if (reg_umax(dst_reg) <= res_max) return; - *dst_umin = 0; - *dst_umax = min(*dst_umax, res_max); + reg_set_urange64(dst_reg, 0, min(reg_umax(dst_reg), res_max)); /* Reset other ranges/tnum to unbounded/unknown. */ - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; + reg_set_srange64(dst_reg, S64_MIN, S64_MAX); reset_reg32_and_tnum(dst_reg); } static void scalar32_min_max_smod(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - s32 *dst_smin = &dst_reg->s32_min_value; - s32 *dst_smax = &dst_reg->s32_max_value; - s32 src_val = src_reg->s32_min_value; /* non-zero, const divisor */ + s32 src_val = reg_s32_min(src_reg); /* non-zero, const divisor */ /* * Safe absolute value calculation: @@ -14354,33 +14346,27 @@ static void scalar32_min_max_smod(struct bpf_reg_state *dst_reg, * If the dividend is already within the result range, * the result remains unchanged. e.g., [-2, 5] % 10 = [-2, 5]. */ - if (*dst_smin >= -res_max_abs && *dst_smax <= res_max_abs) + if (reg_s32_min(dst_reg) >= -res_max_abs && reg_s32_max(dst_reg) <= res_max_abs) return; /* General case: result has the same sign as the dividend. */ - if (*dst_smin >= 0) { - *dst_smin = 0; - *dst_smax = min(*dst_smax, res_max_abs); - } else if (*dst_smax <= 0) { - *dst_smax = 0; - *dst_smin = max(*dst_smin, -res_max_abs); + if (reg_s32_min(dst_reg) >= 0) { + reg_set_srange32(dst_reg, 0, min(reg_s32_max(dst_reg), res_max_abs)); + } else if (reg_s32_max(dst_reg) <= 0) { + reg_set_srange32(dst_reg, max(reg_s32_min(dst_reg), -res_max_abs), 0); } else { - *dst_smin = -res_max_abs; - *dst_smax = res_max_abs; + reg_set_srange32(dst_reg, -res_max_abs, res_max_abs); } /* Reset other ranges/tnum to unbounded/unknown. */ - dst_reg->u32_min_value = 0; - dst_reg->u32_max_value = U32_MAX; + reg_set_urange32(dst_reg, 0, U32_MAX); reset_reg64_and_tnum(dst_reg); } static void scalar_min_max_smod(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - s64 *dst_smin = &dst_reg->smin_value; - s64 *dst_smax = &dst_reg->smax_value; - s64 src_val = src_reg->smin_value; /* non-zero, const divisor */ + s64 src_val = reg_smin(src_reg); /* non-zero, const divisor */ /* * Safe absolute value calculation: @@ -14400,24 +14386,20 @@ static void scalar_min_max_smod(struct bpf_reg_state *dst_reg, * If the dividend is already within the result range, * the result remains unchanged. e.g., [-2, 5] % 10 = [-2, 5]. */ - if (*dst_smin >= -res_max_abs && *dst_smax <= res_max_abs) + if (reg_smin(dst_reg) >= -res_max_abs && reg_smax(dst_reg) <= res_max_abs) return; /* General case: result has the same sign as the dividend. */ - if (*dst_smin >= 0) { - *dst_smin = 0; - *dst_smax = min(*dst_smax, res_max_abs); - } else if (*dst_smax <= 0) { - *dst_smax = 0; - *dst_smin = max(*dst_smin, -res_max_abs); + if (reg_smin(dst_reg) >= 0) { + reg_set_srange64(dst_reg, 0, min(reg_smax(dst_reg), res_max_abs)); + } else if (reg_smax(dst_reg) <= 0) { + reg_set_srange64(dst_reg, max(reg_smin(dst_reg), -res_max_abs), 0); } else { - *dst_smin = -res_max_abs; - *dst_smax = res_max_abs; + reg_set_srange64(dst_reg, -res_max_abs, res_max_abs); } /* Reset other ranges/tnum to unbounded/unknown. */ - dst_reg->umin_value = 0; - dst_reg->umax_value = U64_MAX; + reg_set_urange64(dst_reg, 0, U64_MAX); reset_reg32_and_tnum(dst_reg); } @@ -14427,7 +14409,7 @@ static void scalar32_min_max_and(struct bpf_reg_state *dst_reg, bool src_known = tnum_subreg_is_const(src_reg->var_off); bool dst_known = tnum_subreg_is_const(dst_reg->var_off); struct tnum var32_off = tnum_subreg(dst_reg->var_off); - u32 umax_val = src_reg->u32_max_value; + u32 umax_val = reg_u32_max(src_reg); if (src_known && dst_known) { __mark_reg32_known(dst_reg, var32_off.value); @@ -14437,19 +14419,15 @@ static void scalar32_min_max_and(struct bpf_reg_state *dst_reg, /* We get our minimum from the var_off, since that's inherently * bitwise. Our maximum is the minimum of the operands' maxima. */ - dst_reg->u32_min_value = var32_off.value; - dst_reg->u32_max_value = min(dst_reg->u32_max_value, umax_val); + reg_set_urange32(dst_reg, var32_off.value, min(reg_u32_max(dst_reg), umax_val)); /* Safe to set s32 bounds by casting u32 result into s32 when u32 * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded. */ - if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) { - dst_reg->s32_min_value = dst_reg->u32_min_value; - dst_reg->s32_max_value = dst_reg->u32_max_value; - } else { - dst_reg->s32_min_value = S32_MIN; - dst_reg->s32_max_value = S32_MAX; - } + if ((s32)reg_u32_min(dst_reg) <= (s32)reg_u32_max(dst_reg)) + reg_set_srange32(dst_reg, reg_u32_min(dst_reg), reg_u32_max(dst_reg)); + else + reg_set_srange32(dst_reg, S32_MIN, S32_MAX); } static void scalar_min_max_and(struct bpf_reg_state *dst_reg, @@ -14457,7 +14435,7 @@ static void scalar_min_max_and(struct bpf_reg_state *dst_reg, { bool src_known = tnum_is_const(src_reg->var_off); bool dst_known = tnum_is_const(dst_reg->var_off); - u64 umax_val = src_reg->umax_value; + u64 umax_val = reg_umax(src_reg); if (src_known && dst_known) { __mark_reg_known(dst_reg, dst_reg->var_off.value); @@ -14467,19 +14445,15 @@ static void scalar_min_max_and(struct bpf_reg_state *dst_reg, /* We get our minimum from the var_off, since that's inherently * bitwise. Our maximum is the minimum of the operands' maxima. */ - dst_reg->umin_value = dst_reg->var_off.value; - dst_reg->umax_value = min(dst_reg->umax_value, umax_val); + reg_set_urange64(dst_reg, dst_reg->var_off.value, min(reg_umax(dst_reg), umax_val)); /* Safe to set s64 bounds by casting u64 result into s64 when u64 * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded. */ - if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) { - dst_reg->smin_value = dst_reg->umin_value; - dst_reg->smax_value = dst_reg->umax_value; - } else { - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; - } + if ((s64)reg_umin(dst_reg) <= (s64)reg_umax(dst_reg)) + reg_set_srange64(dst_reg, reg_umin(dst_reg), reg_umax(dst_reg)); + else + reg_set_srange64(dst_reg, S64_MIN, S64_MAX); /* We may learn something more from the var_off */ __update_reg_bounds(dst_reg); } @@ -14490,7 +14464,7 @@ static void scalar32_min_max_or(struct bpf_reg_state *dst_reg, bool src_known = tnum_subreg_is_const(src_reg->var_off); bool dst_known = tnum_subreg_is_const(dst_reg->var_off); struct tnum var32_off = tnum_subreg(dst_reg->var_off); - u32 umin_val = src_reg->u32_min_value; + u32 umin_val = reg_u32_min(src_reg); if (src_known && dst_known) { __mark_reg32_known(dst_reg, var32_off.value); @@ -14500,19 +14474,16 @@ static void scalar32_min_max_or(struct bpf_reg_state *dst_reg, /* We get our maximum from the var_off, and our minimum is the * maximum of the operands' minima */ - dst_reg->u32_min_value = max(dst_reg->u32_min_value, umin_val); - dst_reg->u32_max_value = var32_off.value | var32_off.mask; + reg_set_urange32(dst_reg, max(reg_u32_min(dst_reg), umin_val), + var32_off.value | var32_off.mask); /* Safe to set s32 bounds by casting u32 result into s32 when u32 * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded. */ - if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) { - dst_reg->s32_min_value = dst_reg->u32_min_value; - dst_reg->s32_max_value = dst_reg->u32_max_value; - } else { - dst_reg->s32_min_value = S32_MIN; - dst_reg->s32_max_value = S32_MAX; - } + if ((s32)reg_u32_min(dst_reg) <= (s32)reg_u32_max(dst_reg)) + reg_set_srange32(dst_reg, reg_u32_min(dst_reg), reg_u32_max(dst_reg)); + else + reg_set_srange32(dst_reg, S32_MIN, S32_MAX); } static void scalar_min_max_or(struct bpf_reg_state *dst_reg, @@ -14520,7 +14491,7 @@ static void scalar_min_max_or(struct bpf_reg_state *dst_reg, { bool src_known = tnum_is_const(src_reg->var_off); bool dst_known = tnum_is_const(dst_reg->var_off); - u64 umin_val = src_reg->umin_value; + u64 umin_val = reg_umin(src_reg); if (src_known && dst_known) { __mark_reg_known(dst_reg, dst_reg->var_off.value); @@ -14530,19 +14501,16 @@ static void scalar_min_max_or(struct bpf_reg_state *dst_reg, /* We get our maximum from the var_off, and our minimum is the * maximum of the operands' minima */ - dst_reg->umin_value = max(dst_reg->umin_value, umin_val); - dst_reg->umax_value = dst_reg->var_off.value | dst_reg->var_off.mask; + reg_set_urange64(dst_reg, max(reg_umin(dst_reg), umin_val), + dst_reg->var_off.value | dst_reg->var_off.mask); /* Safe to set s64 bounds by casting u64 result into s64 when u64 * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded. */ - if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) { - dst_reg->smin_value = dst_reg->umin_value; - dst_reg->smax_value = dst_reg->umax_value; - } else { - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; - } + if ((s64)reg_umin(dst_reg) <= (s64)reg_umax(dst_reg)) + reg_set_srange64(dst_reg, reg_umin(dst_reg), reg_umax(dst_reg)); + else + reg_set_srange64(dst_reg, S64_MIN, S64_MAX); /* We may learn something more from the var_off */ __update_reg_bounds(dst_reg); } @@ -14560,19 +14528,15 @@ static void scalar32_min_max_xor(struct bpf_reg_state *dst_reg, } /* We get both minimum and maximum from the var32_off. */ - dst_reg->u32_min_value = var32_off.value; - dst_reg->u32_max_value = var32_off.value | var32_off.mask; + reg_set_urange32(dst_reg, var32_off.value, var32_off.value | var32_off.mask); /* Safe to set s32 bounds by casting u32 result into s32 when u32 * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded. */ - if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) { - dst_reg->s32_min_value = dst_reg->u32_min_value; - dst_reg->s32_max_value = dst_reg->u32_max_value; - } else { - dst_reg->s32_min_value = S32_MIN; - dst_reg->s32_max_value = S32_MAX; - } + if ((s32)reg_u32_min(dst_reg) <= (s32)reg_u32_max(dst_reg)) + reg_set_srange32(dst_reg, reg_u32_min(dst_reg), reg_u32_max(dst_reg)); + else + reg_set_srange32(dst_reg, S32_MIN, S32_MAX); } static void scalar_min_max_xor(struct bpf_reg_state *dst_reg, @@ -14588,19 +14552,16 @@ static void scalar_min_max_xor(struct bpf_reg_state *dst_reg, } /* We get both minimum and maximum from the var_off. */ - dst_reg->umin_value = dst_reg->var_off.value; - dst_reg->umax_value = dst_reg->var_off.value | dst_reg->var_off.mask; + reg_set_urange64(dst_reg, dst_reg->var_off.value, + dst_reg->var_off.value | dst_reg->var_off.mask); /* Safe to set s64 bounds by casting u64 result into s64 when u64 * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded. */ - if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) { - dst_reg->smin_value = dst_reg->umin_value; - dst_reg->smax_value = dst_reg->umax_value; - } else { - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; - } + if ((s64)reg_umin(dst_reg) <= (s64)reg_umax(dst_reg)) + reg_set_srange64(dst_reg, reg_umin(dst_reg), reg_umax(dst_reg)); + else + reg_set_srange64(dst_reg, S64_MIN, S64_MAX); __update_reg_bounds(dst_reg); } @@ -14611,23 +14572,20 @@ static void __scalar32_min_max_lsh(struct bpf_reg_state *dst_reg, /* We lose all sign bit information (except what we can pick * up from var_off) */ - dst_reg->s32_min_value = S32_MIN; - dst_reg->s32_max_value = S32_MAX; + reg_set_srange32(dst_reg, S32_MIN, S32_MAX); /* If we might shift our top bit out, then we know nothing */ - if (umax_val > 31 || dst_reg->u32_max_value > 1ULL << (31 - umax_val)) { - dst_reg->u32_min_value = 0; - dst_reg->u32_max_value = U32_MAX; - } else { - dst_reg->u32_min_value <<= umin_val; - dst_reg->u32_max_value <<= umax_val; - } + if (umax_val > 31 || reg_u32_max(dst_reg) > 1ULL << (31 - umax_val)) + reg_set_urange32(dst_reg, 0, U32_MAX); + else + reg_set_urange32(dst_reg, reg_u32_min(dst_reg) << umin_val, + reg_u32_max(dst_reg) << umax_val); } static void scalar32_min_max_lsh(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - u32 umax_val = src_reg->u32_max_value; - u32 umin_val = src_reg->u32_min_value; + u32 umax_val = reg_u32_max(src_reg); + u32 umin_val = reg_u32_min(src_reg); /* u32 alu operation will zext upper bits */ struct tnum subreg = tnum_subreg(dst_reg->var_off); @@ -14649,29 +14607,25 @@ static void __scalar64_min_max_lsh(struct bpf_reg_state *dst_reg, * because s32 bounds don't flip sign when shifting to the left by * 32bits. */ - if (umin_val == 32 && umax_val == 32) { - dst_reg->smax_value = (s64)dst_reg->s32_max_value << 32; - dst_reg->smin_value = (s64)dst_reg->s32_min_value << 32; - } else { - dst_reg->smax_value = S64_MAX; - dst_reg->smin_value = S64_MIN; - } + if (umin_val == 32 && umax_val == 32) + reg_set_srange64(dst_reg, (s64)reg_s32_min(dst_reg) << 32, + (s64)reg_s32_max(dst_reg) << 32); + else + reg_set_srange64(dst_reg, S64_MIN, S64_MAX); /* If we might shift our top bit out, then we know nothing */ - if (dst_reg->umax_value > 1ULL << (63 - umax_val)) { - dst_reg->umin_value = 0; - dst_reg->umax_value = U64_MAX; - } else { - dst_reg->umin_value <<= umin_val; - dst_reg->umax_value <<= umax_val; - } + if (reg_umax(dst_reg) > 1ULL << (63 - umax_val)) + reg_set_urange64(dst_reg, 0, U64_MAX); + else + reg_set_urange64(dst_reg, reg_umin(dst_reg) << umin_val, + reg_umax(dst_reg) << umax_val); } static void scalar_min_max_lsh(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - u64 umax_val = src_reg->umax_value; - u64 umin_val = src_reg->umin_value; + u64 umax_val = reg_umax(src_reg); + u64 umin_val = reg_umin(src_reg); /* scalar64 calc uses 32bit unshifted bounds so must be called first */ __scalar64_min_max_lsh(dst_reg, umin_val, umax_val); @@ -14686,8 +14640,8 @@ static void scalar32_min_max_rsh(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { struct tnum subreg = tnum_subreg(dst_reg->var_off); - u32 umax_val = src_reg->u32_max_value; - u32 umin_val = src_reg->u32_min_value; + u32 umax_val = reg_u32_max(src_reg); + u32 umin_val = reg_u32_min(src_reg); /* BPF_RSH is an unsigned shift. If the value in dst_reg might * be negative, then either: @@ -14703,12 +14657,11 @@ static void scalar32_min_max_rsh(struct bpf_reg_state *dst_reg, * and rely on inferring new ones from the unsigned bounds and * var_off of the result. */ - dst_reg->s32_min_value = S32_MIN; - dst_reg->s32_max_value = S32_MAX; + reg_set_srange32(dst_reg, S32_MIN, S32_MAX); dst_reg->var_off = tnum_rshift(subreg, umin_val); - dst_reg->u32_min_value >>= umax_val; - dst_reg->u32_max_value >>= umin_val; + reg_set_urange32(dst_reg, reg_u32_min(dst_reg) >> umax_val, + reg_u32_max(dst_reg) >> umin_val); __mark_reg64_unbounded(dst_reg); __update_reg32_bounds(dst_reg); @@ -14717,8 +14670,8 @@ static void scalar32_min_max_rsh(struct bpf_reg_state *dst_reg, static void scalar_min_max_rsh(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - u64 umax_val = src_reg->umax_value; - u64 umin_val = src_reg->umin_value; + u64 umax_val = reg_umax(src_reg); + u64 umin_val = reg_umin(src_reg); /* BPF_RSH is an unsigned shift. If the value in dst_reg might * be negative, then either: @@ -14734,11 +14687,10 @@ static void scalar_min_max_rsh(struct bpf_reg_state *dst_reg, * and rely on inferring new ones from the unsigned bounds and * var_off of the result. */ - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; + reg_set_srange64(dst_reg, S64_MIN, S64_MAX); dst_reg->var_off = tnum_rshift(dst_reg->var_off, umin_val); - dst_reg->umin_value >>= umax_val; - dst_reg->umax_value >>= umin_val; + reg_set_urange64(dst_reg, reg_umin(dst_reg) >> umax_val, + reg_umax(dst_reg) >> umin_val); /* Its not easy to operate on alu32 bounds here because it depends * on bits being shifted in. Take easy way out and mark unbounded @@ -14751,21 +14703,21 @@ static void scalar_min_max_rsh(struct bpf_reg_state *dst_reg, static void scalar32_min_max_arsh(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - u64 umin_val = src_reg->u32_min_value; + u64 umin_val = reg_u32_min(src_reg); /* Upon reaching here, src_known is true and * umax_val is equal to umin_val. */ - dst_reg->s32_min_value = (u32)(((s32)dst_reg->s32_min_value) >> umin_val); - dst_reg->s32_max_value = (u32)(((s32)dst_reg->s32_max_value) >> umin_val); + reg_set_srange32(dst_reg, + (u32)(((s32)reg_s32_min(dst_reg)) >> umin_val), + (u32)(((s32)reg_s32_max(dst_reg)) >> umin_val)); dst_reg->var_off = tnum_arshift(tnum_subreg(dst_reg->var_off), umin_val, 32); /* blow away the dst_reg umin_value/umax_value and rely on * dst_reg var_off to refine the result. */ - dst_reg->u32_min_value = 0; - dst_reg->u32_max_value = U32_MAX; + reg_set_urange32(dst_reg, 0, U32_MAX); __mark_reg64_unbounded(dst_reg); __update_reg32_bounds(dst_reg); @@ -14774,21 +14726,20 @@ static void scalar32_min_max_arsh(struct bpf_reg_state *dst_reg, static void scalar_min_max_arsh(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - u64 umin_val = src_reg->umin_value; + u64 umin_val = reg_umin(src_reg); /* Upon reaching here, src_known is true and umax_val is equal * to umin_val. */ - dst_reg->smin_value >>= umin_val; - dst_reg->smax_value >>= umin_val; + reg_set_srange64(dst_reg, reg_smin(dst_reg) >> umin_val, + reg_smax(dst_reg) >> umin_val); dst_reg->var_off = tnum_arshift(dst_reg->var_off, umin_val, 64); /* blow away the dst_reg umin_value/umax_value and rely on * dst_reg var_off to refine the result. */ - dst_reg->umin_value = 0; - dst_reg->umax_value = U64_MAX; + reg_set_urange64(dst_reg, 0, U64_MAX); /* Its not easy to operate on alu32 bounds here because it depends * on bits being shifted in from upper 32-bits. Take easy way out @@ -14855,13 +14806,13 @@ static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn, if (insn_bitness == 32) { if (tnum_subreg_is_const(src_reg->var_off) - && src_reg->s32_min_value == src_reg->s32_max_value - && src_reg->u32_min_value == src_reg->u32_max_value) + && reg_s32_min(src_reg) == reg_s32_max(src_reg) + && reg_u32_min(src_reg) == reg_u32_max(src_reg)) src_is_const = true; } else { if (tnum_is_const(src_reg->var_off) - && src_reg->smin_value == src_reg->smax_value - && src_reg->umin_value == src_reg->umax_value) + && reg_smin(src_reg) == reg_smax(src_reg) + && reg_umin(src_reg) == reg_umax(src_reg)) src_is_const = true; } @@ -14891,7 +14842,7 @@ static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn, case BPF_LSH: case BPF_RSH: case BPF_ARSH: - return (src_is_const && src_reg->umax_value < insn_bitness); + return (src_is_const && reg_umax(src_reg) < insn_bitness); default: return false; } @@ -14904,9 +14855,9 @@ static int maybe_fork_scalars(struct bpf_verifier_env *env, struct bpf_insn *ins struct bpf_reg_state *regs; bool alu32; - if (dst_reg->smin_value == -1 && dst_reg->smax_value == 0) + if (reg_smin(dst_reg) == -1 && reg_smax(dst_reg) == 0) alu32 = false; - else if (dst_reg->s32_min_value == -1 && dst_reg->s32_max_value == 0) + else if (reg_s32_min(dst_reg) == -1 && reg_s32_max(dst_reg) == 0) alu32 = true; else return 0; @@ -14990,7 +14941,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, break; case BPF_DIV: /* BPF div specification: x / 0 = 0 */ - if ((alu32 && src_reg.u32_min_value == 0) || (!alu32 && src_reg.umin_value == 0)) { + if ((alu32 && reg_u32_min(&src_reg) == 0) || (!alu32 && reg_umin(&src_reg) == 0)) { ___mark_reg_known(dst_reg, 0); break; } @@ -15007,7 +14958,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, break; case BPF_MOD: /* BPF mod specification: x % 0 = x */ - if ((alu32 && src_reg.u32_min_value == 0) || (!alu32 && src_reg.umin_value == 0)) + if ((alu32 && reg_u32_min(&src_reg) == 0) || (!alu32 && reg_umin(&src_reg) == 0)) break; if (alu32) if (off == 1) @@ -15195,7 +15146,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, * umax_value before the ALU operation. After adjust_scalar_min_max_vals(), * alu32 ops will have zero-extended the result, making umax_value <= U32_MAX. */ - u64 dst_umax = dst_reg->umax_value; + u64 dst_umax = reg_umax(dst_reg); err = adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg); if (err) @@ -15337,7 +15288,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } else if (src_reg->type == SCALAR_VALUE) { bool no_sext; - no_sext = src_reg->umax_value < (1ULL << (insn->off - 1)); + no_sext = reg_umax(src_reg) < (1ULL << (insn->off - 1)); if (no_sext) assign_scalar_id_before_mov(env, src_reg); copy_register_state(dst_reg, src_reg); @@ -15372,7 +15323,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) dst_reg->subreg_def = env->insn_idx + 1; } else { /* case: W1 = (s8, s16)W2 */ - bool no_sext = src_reg->umax_value < (1ULL << (insn->off - 1)); + bool no_sext = reg_umax(src_reg) < (1ULL << (insn->off - 1)); if (no_sext) assign_scalar_id_before_mov(env, src_reg); @@ -15454,17 +15405,17 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, struct bpf_reg_state *reg; int new_range; - if (dst_reg->umax_value == 0 && range_right_open) + if (reg_umax(dst_reg) == 0 && range_right_open) /* This doesn't give us any range */ return; - if (dst_reg->umax_value > MAX_PACKET_OFF) + if (reg_umax(dst_reg) > MAX_PACKET_OFF) /* Risk of overflow. For instance, ptr + (1<<63) may be less * than pkt_end, but that's because it's also less than pkt. */ return; - new_range = dst_reg->umax_value; + new_range = reg_umax(dst_reg); if (range_right_open) new_range++; @@ -15513,7 +15464,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, /* If our ids match, then we must have the same max_value. And we * don't care about the other reg's fixed offset, since if it's too big * the range won't allow anything. - * dst_reg->umax_value is known < MAX_PACKET_OFF, therefore it fits in a u16. + * reg_umax(dst_reg) is known < MAX_PACKET_OFF, therefore it fits in a u16. */ bpf_for_each_reg_in_vstate(vstate, state, reg, ({ if (reg->type == type && reg->id == dst_reg->id) @@ -15569,14 +15520,14 @@ static int is_scalar_branch_taken(struct bpf_verifier_env *env, struct bpf_reg_s { struct tnum t1 = is_jmp32 ? tnum_subreg(reg1->var_off) : reg1->var_off; struct tnum t2 = is_jmp32 ? tnum_subreg(reg2->var_off) : reg2->var_off; - u64 umin1 = is_jmp32 ? (u64)reg1->u32_min_value : reg1->umin_value; - u64 umax1 = is_jmp32 ? (u64)reg1->u32_max_value : reg1->umax_value; - s64 smin1 = is_jmp32 ? (s64)reg1->s32_min_value : reg1->smin_value; - s64 smax1 = is_jmp32 ? (s64)reg1->s32_max_value : reg1->smax_value; - u64 umin2 = is_jmp32 ? (u64)reg2->u32_min_value : reg2->umin_value; - u64 umax2 = is_jmp32 ? (u64)reg2->u32_max_value : reg2->umax_value; - s64 smin2 = is_jmp32 ? (s64)reg2->s32_min_value : reg2->smin_value; - s64 smax2 = is_jmp32 ? (s64)reg2->s32_max_value : reg2->smax_value; + u64 umin1 = is_jmp32 ? (u64)reg_u32_min(reg1) : reg_umin(reg1); + u64 umax1 = is_jmp32 ? (u64)reg_u32_max(reg1) : reg_umax(reg1); + s64 smin1 = is_jmp32 ? (s64)reg_s32_min(reg1) : reg_smin(reg1); + s64 smax1 = is_jmp32 ? (s64)reg_s32_max(reg1) : reg_smax(reg1); + u64 umin2 = is_jmp32 ? (u64)reg_u32_min(reg2) : reg_umin(reg2); + u64 umax2 = is_jmp32 ? (u64)reg_u32_max(reg2) : reg_umax(reg2); + s64 smin2 = is_jmp32 ? (s64)reg_s32_min(reg2) : reg_smin(reg2); + s64 smax2 = is_jmp32 ? (s64)reg_s32_max(reg2) : reg_smax(reg2); if (reg1 == reg2) { switch (opcode) { @@ -15621,11 +15572,11 @@ static int is_scalar_branch_taken(struct bpf_verifier_env *env, struct bpf_reg_s * utilize 32-bit subrange knowledge to eliminate * branches that can't be taken a priori */ - if (reg1->u32_min_value > reg2->u32_max_value || - reg1->u32_max_value < reg2->u32_min_value) + if (reg_u32_min(reg1) > reg_u32_max(reg2) || + reg_u32_max(reg1) < reg_u32_min(reg2)) return 0; - if (reg1->s32_min_value > reg2->s32_max_value || - reg1->s32_max_value < reg2->s32_min_value) + if (reg_s32_min(reg1) > reg_s32_max(reg2) || + reg_s32_max(reg1) < reg_s32_min(reg2)) return 0; } break; @@ -15647,11 +15598,11 @@ static int is_scalar_branch_taken(struct bpf_verifier_env *env, struct bpf_reg_s * utilize 32-bit subrange knowledge to eliminate * branches that can't be taken a priori */ - if (reg1->u32_min_value > reg2->u32_max_value || - reg1->u32_max_value < reg2->u32_min_value) + if (reg_u32_min(reg1) > reg_u32_max(reg2) || + reg_u32_max(reg1) < reg_u32_min(reg2)) return 1; - if (reg1->s32_min_value > reg2->s32_max_value || - reg1->s32_max_value < reg2->s32_min_value) + if (reg_s32_min(reg1) > reg_s32_max(reg2) || + reg_s32_max(reg1) < reg_s32_min(reg2)) return 1; } break; @@ -15878,27 +15829,23 @@ static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state switch (opcode) { case BPF_JEQ: if (is_jmp32) { - reg1->u32_min_value = max(reg1->u32_min_value, reg2->u32_min_value); - reg1->u32_max_value = min(reg1->u32_max_value, reg2->u32_max_value); - reg1->s32_min_value = max(reg1->s32_min_value, reg2->s32_min_value); - reg1->s32_max_value = min(reg1->s32_max_value, reg2->s32_max_value); - reg2->u32_min_value = reg1->u32_min_value; - reg2->u32_max_value = reg1->u32_max_value; - reg2->s32_min_value = reg1->s32_min_value; - reg2->s32_max_value = reg1->s32_max_value; + reg_set_urange32(reg1, max(reg_u32_min(reg1), reg_u32_min(reg2)), + min(reg_u32_max(reg1), reg_u32_max(reg2))); + reg_set_srange32(reg1, max(reg_s32_min(reg1), reg_s32_min(reg2)), + min(reg_s32_max(reg1), reg_s32_max(reg2))); + reg_set_urange32(reg2, reg_u32_min(reg1), reg_u32_max(reg1)); + reg_set_srange32(reg2, reg_s32_min(reg1), reg_s32_max(reg1)); t = tnum_intersect(tnum_subreg(reg1->var_off), tnum_subreg(reg2->var_off)); reg1->var_off = tnum_with_subreg(reg1->var_off, t); reg2->var_off = tnum_with_subreg(reg2->var_off, t); } else { - reg1->umin_value = max(reg1->umin_value, reg2->umin_value); - reg1->umax_value = min(reg1->umax_value, reg2->umax_value); - reg1->smin_value = max(reg1->smin_value, reg2->smin_value); - reg1->smax_value = min(reg1->smax_value, reg2->smax_value); - reg2->umin_value = reg1->umin_value; - reg2->umax_value = reg1->umax_value; - reg2->smin_value = reg1->smin_value; - reg2->smax_value = reg1->smax_value; + reg_set_urange64(reg1, max(reg_umin(reg1), reg_umin(reg2)), + min(reg_umax(reg1), reg_umax(reg2))); + reg_set_srange64(reg1, max(reg_smin(reg1), reg_smin(reg2)), + min(reg_smax(reg1), reg_smax(reg2))); + reg_set_urange64(reg2, reg_umin(reg1), reg_umax(reg1)); + reg_set_srange64(reg2, reg_smin(reg1), reg_smax(reg1)); reg1->var_off = tnum_intersect(reg1->var_off, reg2->var_off); reg2->var_off = reg1->var_off; @@ -15915,8 +15862,8 @@ static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state */ val = reg_const_value(reg2, is_jmp32); if (is_jmp32) { - /* u32_min_value is not equal to 0xffffffff at this point, - * because otherwise u32_max_value is 0xffffffff as well, + /* u32_min is not equal to 0xffffffff at this point, + * because otherwise u32_max is 0xffffffff as well, * in such a case both reg1 and reg2 would be constants, * jump would be predicted and regs_refine_cond_op() * wouldn't be called. @@ -15924,23 +15871,23 @@ static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state * Same reasoning works for all {u,s}{min,max}{32,64} cases * below. */ - if (reg1->u32_min_value == (u32)val) - reg1->u32_min_value++; - if (reg1->u32_max_value == (u32)val) - reg1->u32_max_value--; - if (reg1->s32_min_value == (s32)val) - reg1->s32_min_value++; - if (reg1->s32_max_value == (s32)val) - reg1->s32_max_value--; + if (reg_u32_min(reg1) == (u32)val) + reg_set_urange32(reg1, reg_u32_min(reg1) + 1, reg_u32_max(reg1)); + if (reg_u32_max(reg1) == (u32)val) + reg_set_urange32(reg1, reg_u32_min(reg1), reg_u32_max(reg1) - 1); + if (reg_s32_min(reg1) == (s32)val) + reg_set_srange32(reg1, reg_s32_min(reg1) + 1, reg_s32_max(reg1)); + if (reg_s32_max(reg1) == (s32)val) + reg_set_srange32(reg1, reg_s32_min(reg1), reg_s32_max(reg1) - 1); } else { - if (reg1->umin_value == (u64)val) - reg1->umin_value++; - if (reg1->umax_value == (u64)val) - reg1->umax_value--; - if (reg1->smin_value == (s64)val) - reg1->smin_value++; - if (reg1->smax_value == (s64)val) - reg1->smax_value--; + if (reg_umin(reg1) == (u64)val) + reg_set_urange64(reg1, reg_umin(reg1) + 1, reg_umax(reg1)); + if (reg_umax(reg1) == (u64)val) + reg_set_urange64(reg1, reg_umin(reg1), reg_umax(reg1) - 1); + if (reg_smin(reg1) == (s64)val) + reg_set_srange64(reg1, reg_smin(reg1) + 1, reg_smax(reg1)); + if (reg_smax(reg1) == (s64)val) + reg_set_srange64(reg1, reg_smin(reg1), reg_smax(reg1) - 1); } break; case BPF_JSET: @@ -15987,38 +15934,38 @@ static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state break; case BPF_JLE: if (is_jmp32) { - reg1->u32_max_value = min(reg1->u32_max_value, reg2->u32_max_value); - reg2->u32_min_value = max(reg1->u32_min_value, reg2->u32_min_value); + reg_set_urange32(reg1, reg_u32_min(reg1), min(reg_u32_max(reg1), reg_u32_max(reg2))); + reg_set_urange32(reg2, max(reg_u32_min(reg1), reg_u32_min(reg2)), reg_u32_max(reg2)); } else { - reg1->umax_value = min(reg1->umax_value, reg2->umax_value); - reg2->umin_value = max(reg1->umin_value, reg2->umin_value); + reg_set_urange64(reg1, reg_umin(reg1), min(reg_umax(reg1), reg_umax(reg2))); + reg_set_urange64(reg2, max(reg_umin(reg1), reg_umin(reg2)), reg_umax(reg2)); } break; case BPF_JLT: if (is_jmp32) { - reg1->u32_max_value = min(reg1->u32_max_value, reg2->u32_max_value - 1); - reg2->u32_min_value = max(reg1->u32_min_value + 1, reg2->u32_min_value); + reg_set_urange32(reg1, reg_u32_min(reg1), min(reg_u32_max(reg1), reg_u32_max(reg2) - 1)); + reg_set_urange32(reg2, max(reg_u32_min(reg1) + 1, reg_u32_min(reg2)), reg_u32_max(reg2)); } else { - reg1->umax_value = min(reg1->umax_value, reg2->umax_value - 1); - reg2->umin_value = max(reg1->umin_value + 1, reg2->umin_value); + reg_set_urange64(reg1, reg_umin(reg1), min(reg_umax(reg1), reg_umax(reg2) - 1)); + reg_set_urange64(reg2, max(reg_umin(reg1) + 1, reg_umin(reg2)), reg_umax(reg2)); } break; case BPF_JSLE: if (is_jmp32) { - reg1->s32_max_value = min(reg1->s32_max_value, reg2->s32_max_value); - reg2->s32_min_value = max(reg1->s32_min_value, reg2->s32_min_value); + reg_set_srange32(reg1, reg_s32_min(reg1), min(reg_s32_max(reg1), reg_s32_max(reg2))); + reg_set_srange32(reg2, max(reg_s32_min(reg1), reg_s32_min(reg2)), reg_s32_max(reg2)); } else { - reg1->smax_value = min(reg1->smax_value, reg2->smax_value); - reg2->smin_value = max(reg1->smin_value, reg2->smin_value); + reg_set_srange64(reg1, reg_smin(reg1), min(reg_smax(reg1), reg_smax(reg2))); + reg_set_srange64(reg2, max(reg_smin(reg1), reg_smin(reg2)), reg_smax(reg2)); } break; case BPF_JSLT: if (is_jmp32) { - reg1->s32_max_value = min(reg1->s32_max_value, reg2->s32_max_value - 1); - reg2->s32_min_value = max(reg1->s32_min_value + 1, reg2->s32_min_value); + reg_set_srange32(reg1, reg_s32_min(reg1), min(reg_s32_max(reg1), reg_s32_max(reg2) - 1)); + reg_set_srange32(reg2, max(reg_s32_min(reg1) + 1, reg_s32_min(reg2)), reg_s32_max(reg2)); } else { - reg1->smax_value = min(reg1->smax_value, reg2->smax_value - 1); - reg2->smin_value = max(reg1->smin_value + 1, reg2->smin_value); + reg_set_srange64(reg1, reg_smin(reg1), min(reg_smax(reg1), reg_smax(reg2) - 1)); + reg_set_srange64(reg2, max(reg_smin(reg1) + 1, reg_smin(reg2)), reg_smax(reg2)); } break; default: @@ -17519,16 +17466,16 @@ static int indirect_jump_min_max_index(struct bpf_verifier_env *env, u32 *pmin_index, u32 *pmax_index) { struct bpf_reg_state *reg = reg_state(env, regno); - u64 min_index = reg->umin_value; - u64 max_index = reg->umax_value; + u64 min_index = reg_umin(reg); + u64 max_index = reg_umax(reg); const u32 size = 8; if (min_index > (u64) U32_MAX * size) { - verbose(env, "the sum of R%u umin_value %llu is too big\n", regno, reg->umin_value); + verbose(env, "the sum of R%u umin_value %llu is too big\n", regno, reg_umin(reg)); return -ERANGE; } if (max_index > (u64) U32_MAX * size) { - verbose(env, "the sum of R%u umax_value %llu is too big\n", regno, reg->umax_value); + verbose(env, "the sum of R%u umax_value %llu is too big\n", regno, reg_umax(reg)); return -ERANGE; } -- cgit v1.2.3 From bbc631085503a7fde9617be18b0657cc9a83910a Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 24 Apr 2026 15:52:44 -0700 Subject: bpf: replace min/max fields with struct cnum{32,64} Replace eight independent s64, u64, s32, u32 min/max fields in bpf_reg_state with two circular number fields: - cnum64 for a unified signed/unsigned 64-bit range tracking; - cnum32 for a unified signed/unsigned 32-bit range tracking. Each cnum represents a range as a single arc on the circular number line (base + size), from which signed and unsigned bounds are derived on demand via accessor functions introduced in the preceding commit. Notable changes: - Signed<->unsigned deductions in __reg_deduce_bounds() are removed. - 64<->32 bit deductions are replaced with: - reg->r32 = cnum32_intersect(reg->r32, cnum32_from_cnum64(reg->r64)); this is functionally equivalent to the old code. - reg->r64 = cnum64_cnum32_intersect(reg->r64, reg->r32); this handles a few additional cases, see commit message for "bpf: representation and basic operations on circular numbers". - regs_refine_cond_op() now computes results in terms of operations on sets, e.g. for JNE: /* Complement of the range [val, val] as cnum64. */ lo = (struct cnum64){ val + 1, U64_MAX - 1 }; reg1->r64 = cnum64_intersect(reg1->r64, lo); - For add, sub operations on scalars replace explicit bounds computations with cnum{32,64}_{add,negate}. - For add, sub operations on pointers deduplicate with arithmetic operations on scalars and use cnum{32,64}_{add,negate}. - For and, or, xor operations on scalars remove explicit signed bounds computations. - range_bounds_violation() reduces to checking cnum_is_empty(). - const_tnum_range_mismatch() reduces to checking cnum_is_const(). Selftest adjustments: a few existing tests are updated because a single cnum arc cannot always represent what the old system expressed as the intersection of independent signed and unsigned ranges. For example, if the old system tracked u64=[0, U64_MAX-U32_MAX+2] and s64=[S64_MIN+2, 2] independently, their intersection is a tight two-point set. A single cnum must pick the shorter arc, losing the other constraint. These cases are documented with comments in the adjusted tests. reg_bounds.c is updated with logic similar to cnum64_cnum32_intersect(). Instead of using cnums it inspects intersection between 'b' and first / last / next-after-first / previous-before-last sub-ranges of 'a'. reg_bounds.c is also updated to skip test cases that rely in signed and unsigned ranges intersecting in two intervals, as such cases are not representable by a single cnum. The following "crafted" test cases are affected: - reg_bounds_crafted/(s64)[0xffffffffffff8000; 0x7fff] (u32) [0; 0x1f] - reg_bounds_crafted/(s64)[0; 0x1f] (u32) [0xffffffffffffff80; 0x7f] - reg_bounds_crafted/(s64)[0xffffffffffffff80; 0x7f] (u32) [0; 0x1f] - reg_bounds_crafted/(u64)[0; 1] (s32) [1; 2147483648] - reg_bounds_crafted/(u64)[1; 2147483648] (s32) [0; 1] - reg_bounds_crafted/(u64)[0; 0xffffffff00000000] (s64) 0 - reg_bounds_crafted/(u64)0 (s64) [0; 0xffffffff00000000] - reg_bounds_crafted/(u64)[0; 0xffffffff00000000] (s32) 0 - reg_bounds_crafted/(u64)0 (s32) [0; 0xffffffff00000000] - reg_bounds_crafted/(s64)[S64_MIN; 0] (u64) S64_MIN - reg_bounds_crafted/(s64)S64_MIN (u64) [S64_MIN; 0] - reg_bounds_crafted/(s32)[S32_MIN; 0] (u32) S32_MIN - reg_bounds_crafted/(s32)S32_MIN (u32) [S32_MIN; 0] - reg_bounds_crafted/(s64)[0; 0x1f] (u32) [0xffffffff80000000; 0x7fffffff] - reg_bounds_crafted/(s64)[0xffffffff80000000; 0x7fffffff] (u32) [0; 0x1f] - reg_bounds_crafted/(s64)[0; 0x1f] (u32) [0xffffffffffff8000; 0x7fff] As well as some reg_bounds_roand_{consts,ranges}_A_B, where A and B differ in sign domain. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260424-cnums-everywhere-rfc-v1-v3-3-ca434b39a486@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 39 +- kernel/bpf/verifier.c | 843 +++------------------ .../testing/selftests/bpf/prog_tests/reg_bounds.c | 90 ++- .../testing/selftests/bpf/progs/verifier_bounds.c | 9 +- .../testing/selftests/bpf/progs/verifier_subreg.c | 6 +- 5 files changed, 218 insertions(+), 769 deletions(-) (limited to 'kernel/bpf') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index bf3ffa56bbe5..101ca6cc5424 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -8,6 +8,7 @@ #include /* for struct btf and btf_id() */ #include /* for MAX_BPF_STACK */ #include +#include /* Maximum variable offset umax_value permitted when resolving memory accesses. * In practice this is far bigger than any realistic pointer offset; this limit @@ -120,14 +121,8 @@ struct bpf_reg_state { * These refer to the same value as var_off, not necessarily the actual * contents of the register. */ - s64 smin_value; /* minimum possible (s64)value */ - s64 smax_value; /* maximum possible (s64)value */ - u64 umin_value; /* minimum possible (u64)value */ - u64 umax_value; /* maximum possible (u64)value */ - s32 s32_min_value; /* minimum possible (s32)value */ - s32 s32_max_value; /* maximum possible (s32)value */ - u32 u32_min_value; /* minimum possible (u32)value */ - u32 u32_max_value; /* maximum possible (u32)value */ + struct cnum64 r64; /* 64-bit range as circular number */ + struct cnum32 r32; /* 32-bit range as circular number */ /* For PTR_TO_PACKET, used to find other pointers with the same variable * offset, so they can share range knowledge. * For PTR_TO_MAP_VALUE_OR_NULL this is used to share which map value we @@ -211,66 +206,62 @@ struct bpf_reg_state { static inline s64 reg_smin(const struct bpf_reg_state *reg) { - return reg->smin_value; + return cnum64_smin(reg->r64); } static inline s64 reg_smax(const struct bpf_reg_state *reg) { - return reg->smax_value; + return cnum64_smax(reg->r64); } static inline u64 reg_umin(const struct bpf_reg_state *reg) { - return reg->umin_value; + return cnum64_umin(reg->r64); } static inline u64 reg_umax(const struct bpf_reg_state *reg) { - return reg->umax_value; + return cnum64_umax(reg->r64); } static inline s32 reg_s32_min(const struct bpf_reg_state *reg) { - return reg->s32_min_value; + return cnum32_smin(reg->r32); } static inline s32 reg_s32_max(const struct bpf_reg_state *reg) { - return reg->s32_max_value; + return cnum32_smax(reg->r32); } static inline u32 reg_u32_min(const struct bpf_reg_state *reg) { - return reg->u32_min_value; + return cnum32_umin(reg->r32); } static inline u32 reg_u32_max(const struct bpf_reg_state *reg) { - return reg->u32_max_value; + return cnum32_umax(reg->r32); } static inline void reg_set_srange32(struct bpf_reg_state *reg, s32 smin, s32 smax) { - reg->s32_min_value = smin; - reg->s32_max_value = smax; + reg->r32 = cnum32_from_srange(smin, smax); } static inline void reg_set_urange32(struct bpf_reg_state *reg, u32 umin, u32 umax) { - reg->u32_min_value = umin; - reg->u32_max_value = umax; + reg->r32 = cnum32_from_urange(umin, umax); } static inline void reg_set_srange64(struct bpf_reg_state *reg, s64 smin, s64 smax) { - reg->smin_value = smin; - reg->smax_value = smax; + reg->r64 = cnum64_from_srange(smin, smax); } static inline void reg_set_urange64(struct bpf_reg_state *reg, u64 umin, u64 umax) { - reg->umin_value = umin; - reg->umax_value = umax; + reg->r64 = cnum64_from_urange(umin, umax); } enum bpf_stack_slot_type { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b91d2789e7b9..03f9e16c2abe 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -1796,10 +1797,8 @@ static const int caller_saved[CALLER_SAVED_REGS] = { static void ___mark_reg_known(struct bpf_reg_state *reg, u64 imm) { reg->var_off = tnum_const(imm); - reg_set_srange64(reg, (s64)imm, (s64)imm); - reg_set_urange64(reg, imm, imm); - reg_set_srange32(reg, (s32)imm, (s32)imm); - reg_set_urange32(reg, (u32)imm, (u32)imm); + reg->r64 = cnum64_from_urange(imm, imm); + reg->r32 = cnum32_from_urange((u32)imm, (u32)imm); } /* Mark the unknown part of a register (variable offset or scalar value) as @@ -1818,8 +1817,7 @@ static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm) static void __mark_reg32_known(struct bpf_reg_state *reg, u64 imm) { reg->var_off = tnum_const_subreg(reg->var_off, imm); - reg_set_srange32(reg, (s32)imm, (s32)imm); - reg_set_urange32(reg, (u32)imm, (u32)imm); + reg->r32 = cnum32_from_urange((u32)imm, (u32)imm); } /* Mark the 'variable offset' part of a register as zero. This should be @@ -1932,23 +1930,19 @@ static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg, static void __mark_reg32_unbounded(struct bpf_reg_state *reg) { - reg_set_srange32(reg, S32_MIN, S32_MAX); - reg_set_urange32(reg, 0, U32_MAX); + reg->r32 = CNUM32_UNBOUNDED; } -/* Reset the min/max bounds of a register */ -static void __mark_reg_unbounded(struct bpf_reg_state *reg) +static void __mark_reg64_unbounded(struct bpf_reg_state *reg) { - reg_set_srange64(reg, S64_MIN, S64_MAX); - reg_set_urange64(reg, 0, U64_MAX); - - __mark_reg32_unbounded(reg); + reg->r64 = CNUM64_UNBOUNDED; } -static void __mark_reg64_unbounded(struct bpf_reg_state *reg) +/* Reset the min/max bounds of a register */ +static void __mark_reg_unbounded(struct bpf_reg_state *reg) { - reg_set_srange64(reg, S64_MIN, S64_MAX); - reg_set_urange64(reg, 0, U64_MAX); + __mark_reg64_unbounded(reg); + __mark_reg32_unbounded(reg); } static void reset_reg64_and_tnum(struct bpf_reg_state *reg) @@ -1963,18 +1957,32 @@ static void reset_reg32_and_tnum(struct bpf_reg_state *reg) reg->var_off = tnum_unknown; } -static void __update_reg32_bounds(struct bpf_reg_state *reg) +static struct cnum32 cnum32_from_tnum(struct tnum tnum) { - struct tnum var32_off = tnum_subreg(reg->var_off); + tnum = tnum_subreg(tnum); + if ((tnum.mask & S32_MIN) || (tnum.value & S32_MIN)) + /* min signed is max(sign bit) | min(other bits) */ + /* max signed is min(sign bit) | max(other bits) */ + return cnum32_from_srange(tnum.value | (tnum.mask & S32_MIN), + tnum.value | (tnum.mask & S32_MAX)); + else + return cnum32_from_urange(tnum.value, (tnum.value | tnum.mask)); +} - reg_set_srange32(reg, - /* min signed is max(sign bit) | min(other bits) */ - max_t(s32, reg_s32_min(reg), var32_off.value | (var32_off.mask & S32_MIN)), - /* max signed is min(sign bit) | max(other bits) */ - min_t(s32, reg_s32_max(reg), var32_off.value | (var32_off.mask & S32_MAX))); - reg_set_urange32(reg, - max_t(u32, reg_u32_min(reg), (u32)var32_off.value), - min(reg_u32_max(reg), (u32)(var32_off.value | var32_off.mask))); +static struct cnum64 cnum64_from_tnum(struct tnum tnum) +{ + if ((tnum.mask & S64_MIN) || (tnum.value & S64_MIN)) + /* min signed is max(sign bit) | min(other bits) */ + /* max signed is min(sign bit) | max(other bits) */ + return cnum64_from_srange(tnum.value | (tnum.mask & S64_MIN), + tnum.value | (tnum.mask & S64_MAX)); + else + return cnum64_from_urange(tnum.value, (tnum.value | tnum.mask)); +} + +static void __update_reg32_bounds(struct bpf_reg_state *reg) +{ + cnum32_intersect_with(®->r32, cnum32_from_tnum(reg->var_off)); } static void __update_reg64_bounds(struct bpf_reg_state *reg) @@ -1982,17 +1990,7 @@ static void __update_reg64_bounds(struct bpf_reg_state *reg) u64 tnum_next, tmax; bool umin_in_tnum; - /* min signed is max(sign bit) | min(other bits) */ - /* max signed is min(sign bit) | max(other bits) */ - reg_set_srange64(reg, - max_t(s64, reg_smin(reg), - reg->var_off.value | (reg->var_off.mask & S64_MIN)), - min_t(s64, reg_smax(reg), - reg->var_off.value | (reg->var_off.mask & S64_MAX))); - reg_set_urange64(reg, - max(reg_umin(reg), reg->var_off.value), - min(reg_umax(reg), - reg->var_off.value | reg->var_off.mask)); + cnum64_intersect_with(®->r64, cnum64_from_tnum(reg->var_off)); /* Check if u64 and tnum overlap in a single value */ tnum_next = tnum_step(reg->var_off, reg_umin(reg)); @@ -2028,343 +2026,19 @@ static void __update_reg_bounds(struct bpf_reg_state *reg) __update_reg64_bounds(reg); } -/* Uses signed min/max values to inform unsigned, and vice-versa */ static void deduce_bounds_32_from_64(struct bpf_reg_state *reg) { - /* If upper 32 bits of u64/s64 range don't change, we can use lower 32 - * bits to improve our u32/s32 boundaries. - * - * E.g., the case where we have upper 32 bits as zero ([10, 20] in - * u64) is pretty trivial, it's obvious that in u32 we'll also have - * [10, 20] range. But this property holds for any 64-bit range as - * long as upper 32 bits in that entire range of values stay the same. - * - * E.g., u64 range [0x10000000A, 0x10000000F] ([4294967306, 4294967311] - * in decimal) has the same upper 32 bits throughout all the values in - * that range. As such, lower 32 bits form a valid [0xA, 0xF] ([10, 15]) - * range. - * - * Note also, that [0xA, 0xF] is a valid range both in u32 and in s32, - * following the rules outlined below about u64/s64 correspondence - * (which equally applies to u32 vs s32 correspondence). In general it - * depends on actual hexadecimal values of 32-bit range. They can form - * only valid u32, or only valid s32 ranges in some cases. - * - * So we use all these insights to derive bounds for subregisters here. - */ - if ((reg_umin(reg) >> 32) == (reg_umax(reg) >> 32)) { - /* u64 to u32 casting preserves validity of low 32 bits as - * a range, if upper 32 bits are the same - */ - reg_set_urange32(reg, - max_t(u32, reg_u32_min(reg), (u32)reg_umin(reg)), - min_t(u32, reg_u32_max(reg), (u32)reg_umax(reg))); - - if ((s32)reg_umin(reg) <= (s32)reg_umax(reg)) { - reg_set_srange32(reg, - max_t(s32, reg_s32_min(reg), (s32)reg_umin(reg)), - min_t(s32, reg_s32_max(reg), (s32)reg_umax(reg))); - } - } - if ((reg_smin(reg) >> 32) == (reg_smax(reg) >> 32)) { - /* low 32 bits should form a proper u32 range */ - if ((u32)reg_smin(reg) <= (u32)reg_smax(reg)) { - reg_set_urange32(reg, - max_t(u32, reg_u32_min(reg), (u32)reg_smin(reg)), - min_t(u32, reg_u32_max(reg), (u32)reg_smax(reg))); - } - /* low 32 bits should form a proper s32 range */ - if ((s32)reg_smin(reg) <= (s32)reg_smax(reg)) { - reg_set_srange32(reg, - max_t(s32, reg_s32_min(reg), (s32)reg_smin(reg)), - min_t(s32, reg_s32_max(reg), (s32)reg_smax(reg))); - } - } - /* Special case where upper bits form a small sequence of two - * sequential numbers (in 32-bit unsigned space, so 0xffffffff to - * 0x00000000 is also valid), while lower bits form a proper s32 range - * going from negative numbers to positive numbers. E.g., let's say we - * have s64 range [-1, 1] ([0xffffffffffffffff, 0x0000000000000001]). - * Possible s64 values are {-1, 0, 1} ({0xffffffffffffffff, - * 0x0000000000000000, 0x00000000000001}). Ignoring upper 32 bits, - * we still get a valid s32 range [-1, 1] ([0xffffffff, 0x00000001]). - * Note that it doesn't have to be 0xffffffff going to 0x00000000 in - * upper 32 bits. As a random example, s64 range - * [0xfffffff0fffffff0; 0xfffffff100000010], forms a valid s32 range - * [-16, 16] ([0xfffffff0; 0x00000010]) in its 32 bit subregister. - */ - if ((u32)(reg_umin(reg) >> 32) + 1 == (u32)(reg_umax(reg) >> 32) && - (s32)reg_umin(reg) < 0 && (s32)reg_umax(reg) >= 0) { - reg_set_srange32(reg, - max_t(s32, reg_s32_min(reg), (s32)reg_umin(reg)), - min_t(s32, reg_s32_max(reg), (s32)reg_umax(reg))); - } - if ((u32)(reg_smin(reg) >> 32) + 1 == (u32)(reg_smax(reg) >> 32) && - (s32)reg_smin(reg) < 0 && (s32)reg_smax(reg) >= 0) { - reg_set_srange32(reg, - max_t(s32, reg_s32_min(reg), (s32)reg_smin(reg)), - min_t(s32, reg_s32_max(reg), (s32)reg_smax(reg))); - } -} - -static void deduce_bounds_32_from_32(struct bpf_reg_state *reg) -{ - /* if u32 range forms a valid s32 range (due to matching sign bit), - * try to learn from that - */ - if ((s32)reg_u32_min(reg) <= (s32)reg_u32_max(reg)) { - reg_set_srange32(reg, - max_t(s32, reg_s32_min(reg), reg_u32_min(reg)), - min_t(s32, reg_s32_max(reg), reg_u32_max(reg))); - } - /* If we cannot cross the sign boundary, then signed and unsigned bounds - * are the same, so combine. This works even in the negative case, e.g. - * -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff. - */ - if ((u32)reg_s32_min(reg) <= (u32)reg_s32_max(reg)) { - reg_set_urange32(reg, - max_t(u32, reg_s32_min(reg), reg_u32_min(reg)), - min_t(u32, reg_s32_max(reg), reg_u32_max(reg))); - } else { - if (reg_u32_max(reg) < (u32)reg_s32_min(reg)) { - /* See __reg64_deduce_bounds() for detailed explanation. - * Refine ranges in the following situation: - * - * 0 U32_MAX - * | [xxxxxxxxxxxxxx u32 range xxxxxxxxxxxxxx] | - * |----------------------------|----------------------------| - * |xxxxx s32 range xxxxxxxxx] [xxxxxxx| - * 0 S32_MAX S32_MIN -1 - */ - reg_set_srange32(reg, (s32)reg_u32_min(reg), reg_s32_max(reg)); - reg_set_urange32(reg, - reg_u32_min(reg), - min_t(u32, reg_u32_max(reg), reg_s32_max(reg))); - } else if ((u32)reg_s32_max(reg) < reg_u32_min(reg)) { - /* - * 0 U32_MAX - * | [xxxxxxxxxxxxxx u32 range xxxxxxxxxxxxxx] | - * |----------------------------|----------------------------| - * |xxxxxxxxx] [xxxxxxxxxxxx s32 range | - * 0 S32_MAX S32_MIN -1 - */ - reg_set_srange32(reg, reg_s32_min(reg), (s32)reg_u32_max(reg)); - reg_set_urange32(reg, - max_t(u32, reg_u32_min(reg), reg_s32_min(reg)), - reg_u32_max(reg)); - } - } -} - -static void deduce_bounds_64_from_64(struct bpf_reg_state *reg) -{ - /* If u64 range forms a valid s64 range (due to matching sign bit), - * try to learn from that. Let's do a bit of ASCII art to see when - * this is happening. Let's take u64 range first: - * - * 0 0x7fffffffffffffff 0x8000000000000000 U64_MAX - * |-------------------------------|--------------------------------| - * - * Valid u64 range is formed when umin and umax are anywhere in the - * range [0, U64_MAX], and umin <= umax. u64 case is simple and - * straightforward. Let's see how s64 range maps onto the same range - * of values, annotated below the line for comparison: - * - * 0 0x7fffffffffffffff 0x8000000000000000 U64_MAX - * |-------------------------------|--------------------------------| - * 0 S64_MAX S64_MIN -1 - * - * So s64 values basically start in the middle and they are logically - * contiguous to the right of it, wrapping around from -1 to 0, and - * then finishing as S64_MAX (0x7fffffffffffffff) right before - * S64_MIN. We can try drawing the continuity of u64 vs s64 values - * more visually as mapped to sign-agnostic range of hex values. - * - * u64 start u64 end - * _______________________________________________________________ - * / \ - * 0 0x7fffffffffffffff 0x8000000000000000 U64_MAX - * |-------------------------------|--------------------------------| - * 0 S64_MAX S64_MIN -1 - * / \ - * >------------------------------ -------------------------------> - * s64 continues... s64 end s64 start s64 "midpoint" - * - * What this means is that, in general, we can't always derive - * something new about u64 from any random s64 range, and vice versa. - * - * But we can do that in two particular cases. One is when entire - * u64/s64 range is *entirely* contained within left half of the above - * diagram or when it is *entirely* contained in the right half. I.e.: - * - * |-------------------------------|--------------------------------| - * ^ ^ ^ ^ - * A B C D - * - * [A, B] and [C, D] are contained entirely in their respective halves - * and form valid contiguous ranges as both u64 and s64 values. [A, B] - * will be non-negative both as u64 and s64 (and in fact it will be - * identical ranges no matter the signedness). [C, D] treated as s64 - * will be a range of negative values, while in u64 it will be - * non-negative range of values larger than 0x8000000000000000. - * - * Now, any other range here can't be represented in both u64 and s64 - * simultaneously. E.g., [A, C], [A, D], [B, C], [B, D] are valid - * contiguous u64 ranges, but they are discontinuous in s64. [B, C] - * in s64 would be properly presented as [S64_MIN, C] and [B, S64_MAX], - * for example. Similarly, valid s64 range [D, A] (going from negative - * to positive values), would be two separate [D, U64_MAX] and [0, A] - * ranges as u64. Currently reg_state can't represent two segments per - * numeric domain, so in such situations we can only derive maximal - * possible range ([0, U64_MAX] for u64, and [S64_MIN, S64_MAX] for s64). - * - * So we use these facts to derive umin/umax from smin/smax and vice - * versa only if they stay within the same "half". This is equivalent - * to checking sign bit: lower half will have sign bit as zero, upper - * half have sign bit 1. Below in code we simplify this by just - * casting umin/umax as smin/smax and checking if they form valid - * range, and vice versa. Those are equivalent checks. - */ - if ((s64)reg_umin(reg) <= (s64)reg_umax(reg)) { - reg_set_srange64(reg, - max_t(s64, reg_smin(reg), reg_umin(reg)), - min_t(s64, reg_smax(reg), reg_umax(reg))); - } - /* If we cannot cross the sign boundary, then signed and unsigned bounds - * are the same, so combine. This works even in the negative case, e.g. - * -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff. - */ - if ((u64)reg_smin(reg) <= (u64)reg_smax(reg)) { - reg_set_urange64(reg, - max_t(u64, reg_smin(reg), reg_umin(reg)), - min_t(u64, reg_smax(reg), reg_umax(reg))); - } else { - /* If the s64 range crosses the sign boundary, then it's split - * between the beginning and end of the U64 domain. In that - * case, we can derive new bounds if the u64 range overlaps - * with only one end of the s64 range. - * - * In the following example, the u64 range overlaps only with - * positive portion of the s64 range. - * - * 0 U64_MAX - * | [xxxxxxxxxxxxxx u64 range xxxxxxxxxxxxxx] | - * |----------------------------|----------------------------| - * |xxxxx s64 range xxxxxxxxx] [xxxxxxx| - * 0 S64_MAX S64_MIN -1 - * - * We can thus derive the following new s64 and u64 ranges. - * - * 0 U64_MAX - * | [xxxxxx u64 range xxxxx] | - * |----------------------------|----------------------------| - * | [xxxxxx s64 range xxxxx] | - * 0 S64_MAX S64_MIN -1 - * - * If they overlap in two places, we can't derive anything - * because reg_state can't represent two ranges per numeric - * domain. - * - * 0 U64_MAX - * | [xxxxxxxxxxxxxxxxx u64 range xxxxxxxxxxxxxxxxx] | - * |----------------------------|----------------------------| - * |xxxxx s64 range xxxxxxxxx] [xxxxxxxxxx| - * 0 S64_MAX S64_MIN -1 - * - * The first condition below corresponds to the first diagram - * above. - */ - if (reg_umax(reg) < (u64)reg_smin(reg)) { - reg_set_srange64(reg, (s64)reg_umin(reg), reg_smax(reg)); - reg_set_urange64(reg, reg_umin(reg), min_t(u64, reg_umax(reg), reg_smax(reg))); - } else if ((u64)reg_smax(reg) < reg_umin(reg)) { - /* This second condition considers the case where the u64 range - * overlaps with the negative portion of the s64 range: - * - * 0 U64_MAX - * | [xxxxxxxxxxxxxx u64 range xxxxxxxxxxxxxx] | - * |----------------------------|----------------------------| - * |xxxxxxxxx] [xxxxxxxxxxxx s64 range | - * 0 S64_MAX S64_MIN -1 - */ - reg_set_srange64(reg, reg_smin(reg), (s64)reg_umax(reg)); - reg_set_urange64(reg, max_t(u64, reg_umin(reg), reg_smin(reg)), reg_umax(reg)); - } - } + cnum32_intersect_with(®->r32, cnum32_from_cnum64(reg->r64)); } static void deduce_bounds_64_from_32(struct bpf_reg_state *reg) { - /* Try to tighten 64-bit bounds from 32-bit knowledge, using 32-bit - * values on both sides of 64-bit range in hope to have tighter range. - * E.g., if r1 is [0x1'00000000, 0x3'80000000], and we learn from - * 32-bit signed > 0 operation that s32 bounds are now [1; 0x7fffffff]. - * With this, we can substitute 1 as low 32-bits of _low_ 64-bit bound - * (0x100000000 -> 0x100000001) and 0x7fffffff as low 32-bits of - * _high_ 64-bit bound (0x380000000 -> 0x37fffffff) and arrive at a - * better overall bounds for r1 as [0x1'000000001; 0x3'7fffffff]. - * We just need to make sure that derived bounds we are intersecting - * with are well-formed ranges in respective s64 or u64 domain, just - * like we do with similar kinds of 32-to-64 or 64-to-32 adjustments. - */ - __u64 new_umin, new_umax; - __s64 new_smin, new_smax; - - /* u32 -> u64 tightening, it's always well-formed */ - new_umin = (reg_umin(reg) & ~0xffffffffULL) | reg_u32_min(reg); - new_umax = (reg_umax(reg) & ~0xffffffffULL) | reg_u32_max(reg); - reg_set_urange64(reg, - max_t(u64, reg_umin(reg), new_umin), - min_t(u64, reg_umax(reg), new_umax)); - /* u32 -> s64 tightening, u32 range embedded into s64 preserves range validity */ - new_smin = (reg_smin(reg) & ~0xffffffffULL) | reg_u32_min(reg); - new_smax = (reg_smax(reg) & ~0xffffffffULL) | reg_u32_max(reg); - reg_set_srange64(reg, - max_t(s64, reg_smin(reg), new_smin), - min_t(s64, reg_smax(reg), new_smax)); - - /* Here we would like to handle a special case after sign extending load, - * when upper bits for a 64-bit range are all 1s or all 0s. - * - * Upper bits are all 1s when register is in a range: - * [0xffff_ffff_0000_0000, 0xffff_ffff_ffff_ffff] - * Upper bits are all 0s when register is in a range: - * [0x0000_0000_0000_0000, 0x0000_0000_ffff_ffff] - * Together this forms are continuous range: - * [0xffff_ffff_0000_0000, 0x0000_0000_ffff_ffff] - * - * Now, suppose that register range is in fact tighter: - * [0xffff_ffff_8000_0000, 0x0000_0000_ffff_ffff] (R) - * Also suppose that it's 32-bit range is positive, - * meaning that lower 32-bits of the full 64-bit register - * are in the range: - * [0x0000_0000, 0x7fff_ffff] (W) - * - * If this happens, then any value in a range: - * [0xffff_ffff_0000_0000, 0xffff_ffff_7fff_ffff] - * is smaller than a lowest bound of the range (R): - * 0xffff_ffff_8000_0000 - * which means that upper bits of the full 64-bit register - * can't be all 1s, when lower bits are in range (W). - * - * Note that: - * - 0xffff_ffff_8000_0000 == (s64)S32_MIN - * - 0x0000_0000_7fff_ffff == (s64)S32_MAX - * These relations are used in the conditions below. - */ - if (reg_s32_min(reg) >= 0 && reg_smin(reg) >= S32_MIN && reg_smax(reg) <= S32_MAX) { - reg_set_srange64(reg, reg_s32_min(reg), reg_s32_max(reg)); - reg_set_urange64(reg, reg_s32_min(reg), reg_s32_max(reg)); - reg->var_off = tnum_intersect(reg->var_off, - tnum_range(reg_smin(reg), reg_smax(reg))); - } + reg->r64 = cnum64_cnum32_intersect(reg->r64, reg->r32); } static void __reg_deduce_bounds(struct bpf_reg_state *reg) { - deduce_bounds_64_from_64(reg); deduce_bounds_32_from_64(reg); - deduce_bounds_32_from_32(reg); deduce_bounds_64_from_32(reg); } @@ -2402,35 +2076,25 @@ static void reg_bounds_sync(struct bpf_reg_state *reg) __update_reg_bounds(reg); } -static bool range_bounds_violation(struct bpf_reg_state *reg) -{ - return (reg_umin(reg) > reg_umax(reg) || reg_smin(reg) > reg_smax(reg) || - reg_u32_min(reg) > reg_u32_max(reg) || - reg_s32_min(reg) > reg_s32_max(reg)); -} - static bool const_tnum_range_mismatch(struct bpf_reg_state *reg) { - u64 uval = reg->var_off.value; - s64 sval = (s64)uval; - if (!tnum_is_const(reg->var_off)) return false; - return reg_umin(reg) != uval || reg_umax(reg) != uval || - reg_smin(reg) != sval || reg_smax(reg) != sval; + return !cnum64_is_const(reg->r64) || reg->r64.base != reg->var_off.value; } static bool const_tnum_range_mismatch_32(struct bpf_reg_state *reg) { - u32 uval32 = tnum_subreg(reg->var_off).value; - s32 sval32 = (s32)uval32; - if (!tnum_subreg_is_const(reg->var_off)) return false; - return reg_u32_min(reg) != uval32 || reg_u32_max(reg) != uval32 || - reg_s32_min(reg) != sval32 || reg_s32_max(reg) != sval32; + return !cnum32_is_const(reg->r32) || reg->r32.base != tnum_subreg(reg->var_off).value; +} + +static bool range_bounds_violation(struct bpf_reg_state *reg) +{ + return cnum32_is_empty(reg->r32) || cnum64_is_empty(reg->r64); } static int reg_bounds_sanity_check(struct bpf_verifier_env *env, @@ -2455,12 +2119,11 @@ static int reg_bounds_sanity_check(struct bpf_verifier_env *env, return 0; out: - verifier_bug(env, "REG INVARIANTS VIOLATION (%s): %s u64=[%#llx, %#llx] " - "s64=[%#llx, %#llx] u32=[%#x, %#x] s32=[%#x, %#x] var_off=(%#llx, %#llx)", - ctx, msg, reg_umin(reg), reg_umax(reg), - reg_smin(reg), reg_smax(reg), - reg_u32_min(reg), reg_u32_max(reg), - reg_s32_min(reg), reg_s32_max(reg), + verifier_bug(env, "REG INVARIANTS VIOLATION (%s): %s r64={.base=%#llx, .size=%#llx} " + "r32={.base=%#x, .size=%#x} var_off=(%#llx, %#llx)", + ctx, msg, + reg->r64.base, reg->r64.size, + reg->r32.base, reg->r32.size, reg->var_off.value, reg->var_off.mask); if (env->test_reg_invariants) return -EFAULT; @@ -2468,26 +2131,6 @@ out: return 0; } -static bool __reg32_bound_s64(s32 a) -{ - return a >= 0 && a <= S32_MAX; -} - -static void __reg_assign_32_into_64(struct bpf_reg_state *reg) -{ - reg_set_urange64(reg, reg_u32_min(reg), reg_u32_max(reg)); - - /* Attempt to pull 32-bit signed bounds into 64-bit bounds but must - * be positive otherwise set to worse case bounds and refine later - * from tnum. - */ - if (__reg32_bound_s64(reg_s32_min(reg)) && - __reg32_bound_s64(reg_s32_max(reg))) - reg_set_srange64(reg, reg_s32_min(reg), reg_s32_max(reg)); - else - reg_set_srange64(reg, 0, U32_MAX); -} - /* Mark a register as having a completely unknown (scalar) value. */ void bpf_mark_reg_unknown_imprecise(struct bpf_reg_state *reg) { @@ -5636,7 +5279,7 @@ static int check_buffer_access(struct bpf_verifier_env *env, static void zext_32_to_64(struct bpf_reg_state *reg) { reg->var_off = tnum_subreg(reg->var_off); - __reg_assign_32_into_64(reg); + reg_set_urange64(reg, reg_u32_min(reg), reg_u32_max(reg)); } /* truncate register to smaller size (in bytes) @@ -5651,12 +5294,10 @@ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size) /* fix arithmetic bounds */ mask = ((u64)1 << (size * 8)) - 1; - if ((reg_umin(reg) & ~mask) == (reg_umax(reg) & ~mask)) { + if ((reg_umin(reg) & ~mask) == (reg_umax(reg) & ~mask)) reg_set_urange64(reg, reg_umin(reg) & mask, reg_umax(reg) & mask); - } else { + else reg_set_urange64(reg, 0, mask); - } - reg_set_srange64(reg, reg_umin(reg), reg_umax(reg)); /* If size is smaller than 32bit register the 32bit register * values are also truncated so we push 64-bit bounds into @@ -5681,8 +5322,6 @@ static void set_sext64_default_val(struct bpf_reg_state *reg, int size) reg_set_srange64(reg, S32_MIN, S32_MAX); reg_set_srange32(reg, S32_MIN, S32_MAX); } - reg_set_urange64(reg, 0, U64_MAX); - reg_set_urange32(reg, 0, U32_MAX); reg->var_off = tnum_unknown; } @@ -5703,10 +5342,8 @@ static void coerce_reg_to_size_sx(struct bpf_reg_state *reg, int size) reg->var_off = tnum_const((s32)u64_cval); u64_cval = reg->var_off.value; - reg_set_srange64(reg, u64_cval, u64_cval); - reg_set_urange64(reg, u64_cval, u64_cval); - reg_set_srange32(reg, u64_cval, u64_cval); - reg_set_urange32(reg, u64_cval, u64_cval); + reg->r64 = cnum64_from_urange(u64_cval, u64_cval); + reg->r32 = cnum32_from_urange((u32)u64_cval, (u32)u64_cval); return; } @@ -5734,9 +5371,7 @@ static void coerce_reg_to_size_sx(struct bpf_reg_state *reg, int size) /* both of s64_max/s64_min positive or negative */ if ((s64_max >= 0) == (s64_min >= 0)) { reg_set_srange64(reg, s64_min, s64_max); - reg_set_urange64(reg, s64_min, s64_max); reg_set_srange32(reg, s64_min, s64_max); - reg_set_urange32(reg, s64_min, s64_max); reg->var_off = tnum_range(s64_min, s64_max); return; } @@ -5752,7 +5387,6 @@ static void set_sext32_default_val(struct bpf_reg_state *reg, int size) else /* size == 2 */ reg_set_srange32(reg, S16_MIN, S16_MAX); - reg_set_urange32(reg, 0, U32_MAX); reg->var_off = tnum_subreg(tnum_unknown); } @@ -5771,7 +5405,6 @@ static void coerce_subreg_to_size_sx(struct bpf_reg_state *reg, int size) u32_val = reg->var_off.value; reg_set_srange32(reg, u32_val, u32_val); - reg_set_urange32(reg, u32_val, u32_val); return; } @@ -5795,7 +5428,6 @@ static void coerce_subreg_to_size_sx(struct bpf_reg_state *reg, int size) if ((s32_min >= 0) == (s32_max >= 0)) { reg_set_srange32(reg, s32_min, s32_max); - reg_set_urange32(reg, (u32)s32_min, (u32)s32_max); reg->var_off = tnum_subreg(tnum_range(s32_min, s32_max)); return; } @@ -9952,8 +9584,6 @@ static int do_refine_retval_range(struct bpf_verifier_env *env, case BPF_FUNC_get_smp_processor_id: reg_set_urange64(ret_reg, 0, nr_cpu_ids - 1); reg_set_urange32(ret_reg, 0, nr_cpu_ids - 1); - reg_set_srange64(ret_reg, 0, nr_cpu_ids - 1); - reg_set_srange32(ret_reg, 0, nr_cpu_ids - 1); reg_bounds_sync(ret_reg); break; } @@ -13756,10 +13386,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *regs = state->regs, *dst_reg; bool known = tnum_is_const(off_reg->var_off); - s64 smin_val = reg_smin(off_reg), smax_val = reg_smax(off_reg), - smin_ptr = reg_smin(ptr_reg), smax_ptr = reg_smax(ptr_reg); - u64 umin_val = reg_umin(off_reg), umax_val = reg_umax(off_reg), - umin_ptr = reg_umin(ptr_reg), umax_ptr = reg_umax(ptr_reg); + s64 smin_val = reg_smin(off_reg), smax_val = reg_smax(off_reg); + u64 umin_val = reg_umin(off_reg), umax_val = reg_umax(off_reg); struct bpf_sanitize_info info = {}; u8 opcode = BPF_OP(insn->code); u32 dst = insn->dst_reg; @@ -13861,23 +13489,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, * added into the variable offset, and we copy the fixed offset * from ptr_reg. */ - { - s64 smin_res, smax_res; - u64 umin_res, umax_res; - - if (check_add_overflow(smin_ptr, smin_val, &smin_res) || - check_add_overflow(smax_ptr, smax_val, &smax_res)) { - reg_set_srange64(dst_reg, S64_MIN, S64_MAX); - } else { - reg_set_srange64(dst_reg, smin_res, smax_res); - } - if (check_add_overflow(umin_ptr, umin_val, &umin_res) || - check_add_overflow(umax_ptr, umax_val, &umax_res)) { - reg_set_urange64(dst_reg, 0, U64_MAX); - } else { - reg_set_urange64(dst_reg, umin_res, umax_res); - } - } + dst_reg->r64 = cnum64_add(ptr_reg->r64, off_reg->r64); dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off); dst_reg->raw = ptr_reg->raw; if (reg_is_pkt_pointer(ptr_reg)) { @@ -13909,27 +13521,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst); return -EACCES; } - /* A new variable offset is created. If the subtrahend is known - * nonnegative, then any reg->range we had before is still good. - */ - { - s64 smin_res, smax_res; - - if (check_sub_overflow(smin_ptr, smax_val, &smin_res) || - check_sub_overflow(smax_ptr, smin_val, &smax_res)) { - /* Overflow possible, we know nothing */ - reg_set_srange64(dst_reg, S64_MIN, S64_MAX); - } else { - reg_set_srange64(dst_reg, smin_res, smax_res); - } - } - if (umin_ptr < umax_val) { - /* Overflow possible, we know nothing */ - reg_set_urange64(dst_reg, 0, U64_MAX); - } else { - /* Cannot overflow (as long as bounds are consistent) */ - reg_set_urange64(dst_reg, umin_ptr - umax_val, umax_ptr - umin_val); - } + dst_reg->r64 = cnum64_add(ptr_reg->r64, cnum64_negate(off_reg->r64)); dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off); dst_reg->raw = ptr_reg->raw; if (reg_is_pkt_pointer(ptr_reg)) { @@ -13986,139 +13578,25 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, static void scalar32_min_max_add(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - s32 smin = reg_s32_min(dst_reg); - s32 smax = reg_s32_max(dst_reg); - u32 umin = reg_u32_min(dst_reg); - u32 umax = reg_u32_max(dst_reg); - u32 umin_val = reg_u32_min(src_reg); - u32 umax_val = reg_u32_max(src_reg); - bool min_overflow, max_overflow; - - if (check_add_overflow(smin, reg_s32_min(src_reg), &smin) || - check_add_overflow(smax, reg_s32_max(src_reg), &smax)) { - smin = S32_MIN; - smax = S32_MAX; - } - - /* If either all additions overflow or no additions overflow, then - * it is okay to set: dst_umin = dst_umin + src_umin, dst_umax = - * dst_umax + src_umax. Otherwise (some additions overflow), set - * the output bounds to unbounded. - */ - min_overflow = check_add_overflow(umin, umin_val, &umin); - max_overflow = check_add_overflow(umax, umax_val, &umax); - - if (!min_overflow && max_overflow) { - umin = 0; - umax = U32_MAX; - } - - reg_set_srange32(dst_reg, smin, smax); - reg_set_urange32(dst_reg, umin, umax); + dst_reg->r32 = cnum32_add(dst_reg->r32, src_reg->r32); } static void scalar_min_max_add(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - s64 smin = reg_smin(dst_reg); - s64 smax = reg_smax(dst_reg); - u64 umin = reg_umin(dst_reg); - u64 umax = reg_umax(dst_reg); - u64 umin_val = reg_umin(src_reg); - u64 umax_val = reg_umax(src_reg); - bool min_overflow, max_overflow; - - if (check_add_overflow(smin, reg_smin(src_reg), &smin) || - check_add_overflow(smax, reg_smax(src_reg), &smax)) { - smin = S64_MIN; - smax = S64_MAX; - } - - /* If either all additions overflow or no additions overflow, then - * it is okay to set: dst_umin = dst_umin + src_umin, dst_umax = - * dst_umax + src_umax. Otherwise (some additions overflow), set - * the output bounds to unbounded. - */ - min_overflow = check_add_overflow(umin, umin_val, &umin); - max_overflow = check_add_overflow(umax, umax_val, &umax); - - if (!min_overflow && max_overflow) { - umin = 0; - umax = U64_MAX; - } - - reg_set_srange64(dst_reg, smin, smax); - reg_set_urange64(dst_reg, umin, umax); + dst_reg->r64 = cnum64_add(dst_reg->r64, src_reg->r64); } static void scalar32_min_max_sub(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - s32 smin = reg_s32_min(dst_reg); - s32 smax = reg_s32_max(dst_reg); - u32 umin = reg_u32_min(dst_reg); - u32 umax = reg_u32_max(dst_reg); - u32 umin_val = reg_u32_min(src_reg); - u32 umax_val = reg_u32_max(src_reg); - bool min_underflow, max_underflow; - - if (check_sub_overflow(smin, reg_s32_max(src_reg), &smin) || - check_sub_overflow(smax, reg_s32_min(src_reg), &smax)) { - /* Overflow possible, we know nothing */ - smin = S32_MIN; - smax = S32_MAX; - } - - /* If either all subtractions underflow or no subtractions - * underflow, it is okay to set: dst_umin = dst_umin - src_umax, - * dst_umax = dst_umax - src_umin. Otherwise (some subtractions - * underflow), set the output bounds to unbounded. - */ - min_underflow = check_sub_overflow(umin, umax_val, &umin); - max_underflow = check_sub_overflow(umax, umin_val, &umax); - - if (min_underflow && !max_underflow) { - umin = 0; - umax = U32_MAX; - } - - reg_set_srange32(dst_reg, smin, smax); - reg_set_urange32(dst_reg, umin, umax); + dst_reg->r32 = cnum32_add(dst_reg->r32, cnum32_negate(src_reg->r32)); } static void scalar_min_max_sub(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - s64 smin = reg_smin(dst_reg); - s64 smax = reg_smax(dst_reg); - u64 umin = reg_umin(dst_reg); - u64 umax = reg_umax(dst_reg); - u64 umin_val = reg_umin(src_reg); - u64 umax_val = reg_umax(src_reg); - bool min_underflow, max_underflow; - - if (check_sub_overflow(smin, reg_smax(src_reg), &smin) || - check_sub_overflow(smax, reg_smin(src_reg), &smax)) { - /* Overflow possible, we know nothing */ - smin = S64_MIN; - smax = S64_MAX; - } - - /* If either all subtractions underflow or no subtractions - * underflow, it is okay to set: dst_umin = dst_umin - src_umax, - * dst_umax = dst_umax - src_umin. Otherwise (some subtractions - * underflow), set the output bounds to unbounded. - */ - min_underflow = check_sub_overflow(umin, umax_val, &umin); - max_underflow = check_sub_overflow(umax, umin_val, &umax); - - if (min_underflow && !max_underflow) { - umin = 0; - umax = U64_MAX; - } - - reg_set_srange64(dst_reg, smin, smax); - reg_set_urange64(dst_reg, umin, umax); + dst_reg->r64 = cnum64_add(dst_reg->r64, cnum64_negate(src_reg->r64)); } static void scalar32_min_max_mul(struct bpf_reg_state *dst_reg, @@ -14148,8 +13626,8 @@ static void scalar32_min_max_mul(struct bpf_reg_state *dst_reg, smax = max_array(tmp_prod, 4); } - reg_set_srange32(dst_reg, smin, smax); - reg_set_urange32(dst_reg, umin, umax); + dst_reg->r32 = cnum32_intersect(cnum32_from_urange(umin, umax), + cnum32_from_srange(smin, smax)); } static void scalar_min_max_mul(struct bpf_reg_state *dst_reg, @@ -14179,8 +13657,8 @@ static void scalar_min_max_mul(struct bpf_reg_state *dst_reg, smax = max_array(tmp_prod, 4); } - reg_set_srange64(dst_reg, smin, smax); - reg_set_urange64(dst_reg, umin, umax); + dst_reg->r64 = cnum64_intersect(cnum64_from_urange(umin, umax), + cnum64_from_srange(smin, smax)); } static void scalar32_min_max_udiv(struct bpf_reg_state *dst_reg, @@ -14192,7 +13670,6 @@ static void scalar32_min_max_udiv(struct bpf_reg_state *dst_reg, reg_u32_max(dst_reg) / src_val); /* Reset other ranges/tnum to unbounded/unknown. */ - reg_set_srange32(dst_reg, S32_MIN, S32_MAX); reset_reg64_and_tnum(dst_reg); } @@ -14205,7 +13682,6 @@ static void scalar_min_max_udiv(struct bpf_reg_state *dst_reg, div64_u64(reg_umax(dst_reg), src_val)); /* Reset other ranges/tnum to unbounded/unknown. */ - reg_set_srange64(dst_reg, S64_MIN, S64_MAX); reset_reg32_and_tnum(dst_reg); } @@ -14242,7 +13718,6 @@ static void scalar32_min_max_sdiv(struct bpf_reg_state *dst_reg, reset: reg_set_srange32(dst_reg, smin, smax); /* Reset other ranges/tnum to unbounded/unknown. */ - reg_set_urange32(dst_reg, 0, U32_MAX); reset_reg64_and_tnum(dst_reg); } @@ -14279,7 +13754,6 @@ static void scalar_min_max_sdiv(struct bpf_reg_state *dst_reg, reset: reg_set_srange64(dst_reg, smin, smax); /* Reset other ranges/tnum to unbounded/unknown. */ - reg_set_urange64(dst_reg, 0, U64_MAX); reset_reg32_and_tnum(dst_reg); } @@ -14299,7 +13773,6 @@ static void scalar32_min_max_umod(struct bpf_reg_state *dst_reg, reg_set_urange32(dst_reg, 0, min(reg_u32_max(dst_reg), res_max)); /* Reset other ranges/tnum to unbounded/unknown. */ - reg_set_srange32(dst_reg, S32_MIN, S32_MAX); reset_reg64_and_tnum(dst_reg); } @@ -14319,7 +13792,6 @@ static void scalar_min_max_umod(struct bpf_reg_state *dst_reg, reg_set_urange64(dst_reg, 0, min(reg_umax(dst_reg), res_max)); /* Reset other ranges/tnum to unbounded/unknown. */ - reg_set_srange64(dst_reg, S64_MIN, S64_MAX); reset_reg32_and_tnum(dst_reg); } @@ -14359,7 +13831,6 @@ static void scalar32_min_max_smod(struct bpf_reg_state *dst_reg, } /* Reset other ranges/tnum to unbounded/unknown. */ - reg_set_urange32(dst_reg, 0, U32_MAX); reset_reg64_and_tnum(dst_reg); } @@ -14399,7 +13870,6 @@ static void scalar_min_max_smod(struct bpf_reg_state *dst_reg, } /* Reset other ranges/tnum to unbounded/unknown. */ - reg_set_urange64(dst_reg, 0, U64_MAX); reset_reg32_and_tnum(dst_reg); } @@ -14419,15 +13889,9 @@ static void scalar32_min_max_and(struct bpf_reg_state *dst_reg, /* We get our minimum from the var_off, since that's inherently * bitwise. Our maximum is the minimum of the operands' maxima. */ - reg_set_urange32(dst_reg, var32_off.value, min(reg_u32_max(dst_reg), umax_val)); - - /* Safe to set s32 bounds by casting u32 result into s32 when u32 - * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded. - */ - if ((s32)reg_u32_min(dst_reg) <= (s32)reg_u32_max(dst_reg)) - reg_set_srange32(dst_reg, reg_u32_min(dst_reg), reg_u32_max(dst_reg)); - else - reg_set_srange32(dst_reg, S32_MIN, S32_MAX); + reg_set_urange32(dst_reg, + var32_off.value, + min(reg_u32_max(dst_reg), umax_val)); } static void scalar_min_max_and(struct bpf_reg_state *dst_reg, @@ -14445,15 +13909,10 @@ static void scalar_min_max_and(struct bpf_reg_state *dst_reg, /* We get our minimum from the var_off, since that's inherently * bitwise. Our maximum is the minimum of the operands' maxima. */ - reg_set_urange64(dst_reg, dst_reg->var_off.value, min(reg_umax(dst_reg), umax_val)); + reg_set_urange64(dst_reg, + dst_reg->var_off.value, + min(reg_umax(dst_reg), umax_val)); - /* Safe to set s64 bounds by casting u64 result into s64 when u64 - * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded. - */ - if ((s64)reg_umin(dst_reg) <= (s64)reg_umax(dst_reg)) - reg_set_srange64(dst_reg, reg_umin(dst_reg), reg_umax(dst_reg)); - else - reg_set_srange64(dst_reg, S64_MIN, S64_MAX); /* We may learn something more from the var_off */ __update_reg_bounds(dst_reg); } @@ -14474,16 +13933,9 @@ static void scalar32_min_max_or(struct bpf_reg_state *dst_reg, /* We get our maximum from the var_off, and our minimum is the * maximum of the operands' minima */ - reg_set_urange32(dst_reg, max(reg_u32_min(dst_reg), umin_val), + reg_set_urange32(dst_reg, + max(reg_u32_min(dst_reg), umin_val), var32_off.value | var32_off.mask); - - /* Safe to set s32 bounds by casting u32 result into s32 when u32 - * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded. - */ - if ((s32)reg_u32_min(dst_reg) <= (s32)reg_u32_max(dst_reg)) - reg_set_srange32(dst_reg, reg_u32_min(dst_reg), reg_u32_max(dst_reg)); - else - reg_set_srange32(dst_reg, S32_MIN, S32_MAX); } static void scalar_min_max_or(struct bpf_reg_state *dst_reg, @@ -14501,16 +13953,10 @@ static void scalar_min_max_or(struct bpf_reg_state *dst_reg, /* We get our maximum from the var_off, and our minimum is the * maximum of the operands' minima */ - reg_set_urange64(dst_reg, max(reg_umin(dst_reg), umin_val), + reg_set_urange64(dst_reg, + max(reg_umin(dst_reg), umin_val), dst_reg->var_off.value | dst_reg->var_off.mask); - /* Safe to set s64 bounds by casting u64 result into s64 when u64 - * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded. - */ - if ((s64)reg_umin(dst_reg) <= (s64)reg_umax(dst_reg)) - reg_set_srange64(dst_reg, reg_umin(dst_reg), reg_umax(dst_reg)); - else - reg_set_srange64(dst_reg, S64_MIN, S64_MAX); /* We may learn something more from the var_off */ __update_reg_bounds(dst_reg); } @@ -14529,14 +13975,6 @@ static void scalar32_min_max_xor(struct bpf_reg_state *dst_reg, /* We get both minimum and maximum from the var32_off. */ reg_set_urange32(dst_reg, var32_off.value, var32_off.value | var32_off.mask); - - /* Safe to set s32 bounds by casting u32 result into s32 when u32 - * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded. - */ - if ((s32)reg_u32_min(dst_reg) <= (s32)reg_u32_max(dst_reg)) - reg_set_srange32(dst_reg, reg_u32_min(dst_reg), reg_u32_max(dst_reg)); - else - reg_set_srange32(dst_reg, S32_MIN, S32_MAX); } static void scalar_min_max_xor(struct bpf_reg_state *dst_reg, @@ -14552,31 +13990,21 @@ static void scalar_min_max_xor(struct bpf_reg_state *dst_reg, } /* We get both minimum and maximum from the var_off. */ - reg_set_urange64(dst_reg, dst_reg->var_off.value, + reg_set_urange64(dst_reg, + dst_reg->var_off.value, dst_reg->var_off.value | dst_reg->var_off.mask); - - /* Safe to set s64 bounds by casting u64 result into s64 when u64 - * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded. - */ - if ((s64)reg_umin(dst_reg) <= (s64)reg_umax(dst_reg)) - reg_set_srange64(dst_reg, reg_umin(dst_reg), reg_umax(dst_reg)); - else - reg_set_srange64(dst_reg, S64_MIN, S64_MAX); - - __update_reg_bounds(dst_reg); } static void __scalar32_min_max_lsh(struct bpf_reg_state *dst_reg, u64 umin_val, u64 umax_val) { - /* We lose all sign bit information (except what we can pick - * up from var_off) - */ - reg_set_srange32(dst_reg, S32_MIN, S32_MAX); /* If we might shift our top bit out, then we know nothing */ if (umax_val > 31 || reg_u32_max(dst_reg) > 1ULL << (31 - umax_val)) reg_set_urange32(dst_reg, 0, U32_MAX); else + /* We lose all sign bit information (except what we can pick + * up from var_off) + */ reg_set_urange32(dst_reg, reg_u32_min(dst_reg) << umin_val, reg_u32_max(dst_reg) << umax_val); } @@ -14602,23 +14030,27 @@ static void scalar32_min_max_lsh(struct bpf_reg_state *dst_reg, static void __scalar64_min_max_lsh(struct bpf_reg_state *dst_reg, u64 umin_val, u64 umax_val) { + struct cnum64 u, s; + /* Special case <<32 because it is a common compiler pattern to sign * extend subreg by doing <<32 s>>32. smin/smax assignments are correct * because s32 bounds don't flip sign when shifting to the left by * 32bits. */ if (umin_val == 32 && umax_val == 32) - reg_set_srange64(dst_reg, (s64)reg_s32_min(dst_reg) << 32, - (s64)reg_s32_max(dst_reg) << 32); + s = cnum64_from_srange((s64)reg_s32_min(dst_reg) << 32, + (s64)reg_s32_max(dst_reg) << 32); else - reg_set_srange64(dst_reg, S64_MIN, S64_MAX); + s = CNUM64_UNBOUNDED; /* If we might shift our top bit out, then we know nothing */ if (reg_umax(dst_reg) > 1ULL << (63 - umax_val)) - reg_set_urange64(dst_reg, 0, U64_MAX); + u = CNUM64_UNBOUNDED; else - reg_set_urange64(dst_reg, reg_umin(dst_reg) << umin_val, - reg_umax(dst_reg) << umax_val); + u = cnum64_from_urange(reg_umin(dst_reg) << umin_val, + reg_umax(dst_reg) << umax_val); + + dst_reg->r64 = cnum64_intersect(u, s); } static void scalar_min_max_lsh(struct bpf_reg_state *dst_reg, @@ -14657,7 +14089,6 @@ static void scalar32_min_max_rsh(struct bpf_reg_state *dst_reg, * and rely on inferring new ones from the unsigned bounds and * var_off of the result. */ - reg_set_srange32(dst_reg, S32_MIN, S32_MAX); dst_reg->var_off = tnum_rshift(subreg, umin_val); reg_set_urange32(dst_reg, reg_u32_min(dst_reg) >> umax_val, @@ -14687,7 +14118,6 @@ static void scalar_min_max_rsh(struct bpf_reg_state *dst_reg, * and rely on inferring new ones from the unsigned bounds and * var_off of the result. */ - reg_set_srange64(dst_reg, S64_MIN, S64_MAX); dst_reg->var_off = tnum_rshift(dst_reg->var_off, umin_val); reg_set_urange64(dst_reg, reg_umin(dst_reg) >> umax_val, reg_umax(dst_reg) >> umin_val); @@ -14707,6 +14137,8 @@ static void scalar32_min_max_arsh(struct bpf_reg_state *dst_reg, /* Upon reaching here, src_known is true and * umax_val is equal to umin_val. + * Blow away the dst_reg umin_value/umax_value and rely on + * dst_reg var_off to refine the result. */ reg_set_srange32(dst_reg, (u32)(((s32)reg_s32_min(dst_reg)) >> umin_val), @@ -14714,11 +14146,6 @@ static void scalar32_min_max_arsh(struct bpf_reg_state *dst_reg, dst_reg->var_off = tnum_arshift(tnum_subreg(dst_reg->var_off), umin_val, 32); - /* blow away the dst_reg umin_value/umax_value and rely on - * dst_reg var_off to refine the result. - */ - reg_set_urange32(dst_reg, 0, U32_MAX); - __mark_reg64_unbounded(dst_reg); __update_reg32_bounds(dst_reg); } @@ -14736,11 +14163,6 @@ static void scalar_min_max_arsh(struct bpf_reg_state *dst_reg, dst_reg->var_off = tnum_arshift(dst_reg->var_off, umin_val, 64); - /* blow away the dst_reg umin_value/umax_value and rely on - * dst_reg var_off to refine the result. - */ - reg_set_urange64(dst_reg, 0, U64_MAX); - /* Its not easy to operate on alu32 bounds here because it depends * on bits being shifted in from upper 32-bits. Take easy way out * and mark unbounded so we can recalculate later from tnum. @@ -15829,23 +15251,15 @@ static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state switch (opcode) { case BPF_JEQ: if (is_jmp32) { - reg_set_urange32(reg1, max(reg_u32_min(reg1), reg_u32_min(reg2)), - min(reg_u32_max(reg1), reg_u32_max(reg2))); - reg_set_srange32(reg1, max(reg_s32_min(reg1), reg_s32_min(reg2)), - min(reg_s32_max(reg1), reg_s32_max(reg2))); - reg_set_urange32(reg2, reg_u32_min(reg1), reg_u32_max(reg1)); - reg_set_srange32(reg2, reg_s32_min(reg1), reg_s32_max(reg1)); + reg1->r32 = cnum32_intersect(reg1->r32, reg2->r32); + reg2->r32 = reg1->r32; t = tnum_intersect(tnum_subreg(reg1->var_off), tnum_subreg(reg2->var_off)); reg1->var_off = tnum_with_subreg(reg1->var_off, t); reg2->var_off = tnum_with_subreg(reg2->var_off, t); } else { - reg_set_urange64(reg1, max(reg_umin(reg1), reg_umin(reg2)), - min(reg_umax(reg1), reg_umax(reg2))); - reg_set_srange64(reg1, max(reg_smin(reg1), reg_smin(reg2)), - min(reg_smax(reg1), reg_smax(reg2))); - reg_set_urange64(reg2, reg_umin(reg1), reg_umax(reg1)); - reg_set_srange64(reg2, reg_smin(reg1), reg_smax(reg1)); + reg1->r64 = cnum64_intersect(reg1->r64, reg2->r64); + reg2->r64 = reg1->r64; reg1->var_off = tnum_intersect(reg1->var_off, reg2->var_off); reg2->var_off = reg1->var_off; @@ -15862,32 +15276,11 @@ static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state */ val = reg_const_value(reg2, is_jmp32); if (is_jmp32) { - /* u32_min is not equal to 0xffffffff at this point, - * because otherwise u32_max is 0xffffffff as well, - * in such a case both reg1 and reg2 would be constants, - * jump would be predicted and regs_refine_cond_op() - * wouldn't be called. - * - * Same reasoning works for all {u,s}{min,max}{32,64} cases - * below. - */ - if (reg_u32_min(reg1) == (u32)val) - reg_set_urange32(reg1, reg_u32_min(reg1) + 1, reg_u32_max(reg1)); - if (reg_u32_max(reg1) == (u32)val) - reg_set_urange32(reg1, reg_u32_min(reg1), reg_u32_max(reg1) - 1); - if (reg_s32_min(reg1) == (s32)val) - reg_set_srange32(reg1, reg_s32_min(reg1) + 1, reg_s32_max(reg1)); - if (reg_s32_max(reg1) == (s32)val) - reg_set_srange32(reg1, reg_s32_min(reg1), reg_s32_max(reg1) - 1); + /* Complement of the range [val, val] as cnum32. */ + cnum32_intersect_with(®1->r32, (struct cnum32){ val + 1, U32_MAX - 1 }); } else { - if (reg_umin(reg1) == (u64)val) - reg_set_urange64(reg1, reg_umin(reg1) + 1, reg_umax(reg1)); - if (reg_umax(reg1) == (u64)val) - reg_set_urange64(reg1, reg_umin(reg1), reg_umax(reg1) - 1); - if (reg_smin(reg1) == (s64)val) - reg_set_srange64(reg1, reg_smin(reg1) + 1, reg_smax(reg1)); - if (reg_smax(reg1) == (s64)val) - reg_set_srange64(reg1, reg_smin(reg1), reg_smax(reg1) - 1); + /* Complement of the range [val, val] as cnum64. */ + cnum64_intersect_with(®1->r64, (struct cnum64){ val + 1, U64_MAX - 1 }); } break; case BPF_JSET: @@ -15934,38 +15327,38 @@ static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state break; case BPF_JLE: if (is_jmp32) { - reg_set_urange32(reg1, reg_u32_min(reg1), min(reg_u32_max(reg1), reg_u32_max(reg2))); - reg_set_urange32(reg2, max(reg_u32_min(reg1), reg_u32_min(reg2)), reg_u32_max(reg2)); + cnum32_intersect_with_urange(®1->r32, 0, reg_u32_max(reg2)); + cnum32_intersect_with_urange(®2->r32, reg_u32_min(reg1), U32_MAX); } else { - reg_set_urange64(reg1, reg_umin(reg1), min(reg_umax(reg1), reg_umax(reg2))); - reg_set_urange64(reg2, max(reg_umin(reg1), reg_umin(reg2)), reg_umax(reg2)); + cnum64_intersect_with_urange(®1->r64, 0, reg_umax(reg2)); + cnum64_intersect_with_urange(®2->r64, reg_umin(reg1), U64_MAX); } break; case BPF_JLT: if (is_jmp32) { - reg_set_urange32(reg1, reg_u32_min(reg1), min(reg_u32_max(reg1), reg_u32_max(reg2) - 1)); - reg_set_urange32(reg2, max(reg_u32_min(reg1) + 1, reg_u32_min(reg2)), reg_u32_max(reg2)); + cnum32_intersect_with_urange(®1->r32, 0, reg_u32_max(reg2) - 1); + cnum32_intersect_with_urange(®2->r32, reg_u32_min(reg1) + 1, U32_MAX); } else { - reg_set_urange64(reg1, reg_umin(reg1), min(reg_umax(reg1), reg_umax(reg2) - 1)); - reg_set_urange64(reg2, max(reg_umin(reg1) + 1, reg_umin(reg2)), reg_umax(reg2)); + cnum64_intersect_with_urange(®1->r64, 0, reg_umax(reg2) - 1); + cnum64_intersect_with_urange(®2->r64, reg_umin(reg1) + 1, U64_MAX); } break; case BPF_JSLE: if (is_jmp32) { - reg_set_srange32(reg1, reg_s32_min(reg1), min(reg_s32_max(reg1), reg_s32_max(reg2))); - reg_set_srange32(reg2, max(reg_s32_min(reg1), reg_s32_min(reg2)), reg_s32_max(reg2)); + cnum32_intersect_with_srange(®1->r32, S32_MIN, reg_s32_max(reg2)); + cnum32_intersect_with_srange(®2->r32, reg_s32_min(reg1), S32_MAX); } else { - reg_set_srange64(reg1, reg_smin(reg1), min(reg_smax(reg1), reg_smax(reg2))); - reg_set_srange64(reg2, max(reg_smin(reg1), reg_smin(reg2)), reg_smax(reg2)); + cnum64_intersect_with_srange(®1->r64, S64_MIN, reg_smax(reg2)); + cnum64_intersect_with_srange(®2->r64, reg_smin(reg1), S64_MAX); } break; case BPF_JSLT: if (is_jmp32) { - reg_set_srange32(reg1, reg_s32_min(reg1), min(reg_s32_max(reg1), reg_s32_max(reg2) - 1)); - reg_set_srange32(reg2, max(reg_s32_min(reg1) + 1, reg_s32_min(reg2)), reg_s32_max(reg2)); + cnum32_intersect_with_srange(®1->r32, S32_MIN, reg_s32_max(reg2) - 1); + cnum32_intersect_with_srange(®2->r32, reg_s32_min(reg1) + 1, S32_MAX); } else { - reg_set_srange64(reg1, reg_smin(reg1), min(reg_smax(reg1), reg_smax(reg2) - 1)); - reg_set_srange64(reg2, max(reg_smin(reg1) + 1, reg_smin(reg2)), reg_smax(reg2)); + cnum64_intersect_with_srange(®1->r64, S64_MIN, reg_smax(reg2) - 1); + cnum64_intersect_with_srange(®2->r64, reg_smin(reg1) + 1, S64_MAX); } break; default: diff --git a/tools/testing/selftests/bpf/prog_tests/reg_bounds.c b/tools/testing/selftests/bpf/prog_tests/reg_bounds.c index 71f5240cc5b7..7f170a69d1d8 100644 --- a/tools/testing/selftests/bpf/prog_tests/reg_bounds.c +++ b/tools/testing/selftests/bpf/prog_tests/reg_bounds.c @@ -478,6 +478,52 @@ static struct range range_refine_in_halves(enum num_t x_t, struct range x, } +static __always_inline u64 next_u32_block(u64 x) { return x + (1ULL << 32); } +static __always_inline u64 prev_u32_block(u64 x) { return x - (1ULL << 32); } + +/* Is v within the circular u64 range [base, base + len]? */ +static __always_inline bool u64_range_contains(u64 v, u64 base, u64 len) +{ + return v - base <= len; +} + +/* Is v within the circular u32 range [base, base + len]? */ +static __always_inline bool u32_range_contains(u32 v, u32 base, u32 len) +{ + return v - base <= len; +} + +static bool range64_range32_intersect(enum num_t a_t, + struct range a /* 64 */, + struct range b /* 32 */, + struct range *out /* 64 */) +{ + u64 b_len = (u32)(b.b - b.a); + u64 a_len = a.b - a.a; + u64 lo, hi; + + if (u32_range_contains((u32)a.a, (u32)b.a, b_len)) { + lo = a.a; + } else { + lo = swap_low32(a.a, (u32)b.a); + if (!u64_range_contains(lo, a.a, a_len)) + lo = next_u32_block(lo); + if (!u64_range_contains(lo, a.a, a_len)) + return false; + } + if (u32_range_contains(a.b, (u32)b.a, b_len)) { + hi = a.b; + } else { + hi = swap_low32(a.b, (u32)b.b); + if (!u64_range_contains(hi, a.a, a_len)) + hi = prev_u32_block(hi); + if (!u64_range_contains(hi, a.a, a_len)) + return false; + } + *out = range(a_t, lo, hi); + return true; +} + static struct range range_refine(enum num_t x_t, struct range x, enum num_t y_t, struct range y) { struct range y_cast; @@ -533,23 +579,12 @@ static struct range range_refine(enum num_t x_t, struct range x, enum num_t y_t, } } - /* the case when new range knowledge, *y*, is a 32-bit subregister - * range, while previous range knowledge, *x*, is a full register - * 64-bit range, needs special treatment to take into account upper 32 - * bits of full register range - */ if (t_is_32(y_t) && !t_is_32(x_t)) { - struct range x_swap; + struct range x1; - /* some combinations of upper 32 bits and sign bit can lead to - * invalid ranges, in such cases it's easier to detect them - * after cast/swap than try to enumerate all the conditions - * under which transformation and knowledge transfer is valid - */ - x_swap = range(x_t, swap_low32(x.a, y_cast.a), swap_low32(x.b, y_cast.b)); - if (!is_valid_range(x_t, x_swap)) - return x; - return range_intersection(x_t, x, x_swap); + if (range64_range32_intersect(x_t, x, y, &x1)) + return x1; + return x; } /* otherwise, plain range cast and intersection works */ @@ -1300,6 +1335,26 @@ static bool assert_range_eq(enum num_t t, struct range x, struct range y, return false; } +/* For a pair of signed/unsigned t1/t2 checks if r1/r2 intersect in two intervals. */ +static bool needs_two_arcs(enum num_t t1, struct range r1, + enum num_t t2, struct range r2) +{ + u64 lo = cast_t(t1, r2.a); + u64 hi = cast_t(t1, r2.b); + + /* does r2 wrap in t1's domain: [0, hi] ∪ [lo, MAX]? */ + return lo > hi && r1.a <= hi && r1.b >= lo; +} + +static bool reg_state_needs_two_arcs(struct reg_state *s) +{ + if (!s->valid) + return false; + + return needs_two_arcs(U64, s->r[U64], S64, s->r[S64]) || + needs_two_arcs(U32, s->r[U32], S32, s->r[S32]); +} + /* Validate that register states match, and print details if they don't */ static bool assert_reg_state_eq(struct reg_state *r, struct reg_state *e, const char *ctx) { @@ -1524,6 +1579,11 @@ static int verify_case_op(enum num_t init_t, enum num_t cond_t, !assert_reg_state_eq(&fr2, &fe2, "false_reg2") || !assert_reg_state_eq(&tr1, &te1, "true_reg1") || !assert_reg_state_eq(&tr2, &te2, "true_reg2")) { + if (reg_state_needs_two_arcs(&fe1) || reg_state_needs_two_arcs(&fe2) || + reg_state_needs_two_arcs(&te1) || reg_state_needs_two_arcs(&te2)) { + test__skip(); + return 0; + } failed = true; } diff --git a/tools/testing/selftests/bpf/progs/verifier_bounds.c b/tools/testing/selftests/bpf/progs/verifier_bounds.c index c1ae013dee29..f0b3fbbbb627 100644 --- a/tools/testing/selftests/bpf/progs/verifier_bounds.c +++ b/tools/testing/selftests/bpf/progs/verifier_bounds.c @@ -1239,7 +1239,8 @@ l0_%=: r0 = 0; \ SEC("tc") __description("multiply mixed sign bounds. test 1") __success __log_level(2) -__msg("r6 *= r7 {{.*}}; R6=scalar(smin=umin=0x1bc16d5cd4927ee1,smax=umax=0x1bc16d674ec80000,smax32=0x7ffffeff,umax32=0xfffffeff,var_off=(0x1bc16d4000000000; 0x3ffffffeff))") +__msg("r6 *= r7 {{.*}}; R6=scalar(smin=umin=0x1bc16d5cd4927ee1,smax=umax=0x1bc16d674ec80000,smax32=0x7ffffeff,var_off=(0x1bc16d4000000000; 0x3ffffffeff))") +/* cnum can't represent both [0, 0xffff_feff] and [0x8000_0000, 0x7fff_feff], so it picks one */ __naked void mult_mixed0_sign(void) { asm volatile ( @@ -1648,7 +1649,8 @@ l0_%=: r0 = 0; \ SEC("socket") __description("bounds deduction cross sign boundary, two overlaps") __failure -__msg("3: (2d) if r0 > r1 {{.*}} R0=scalar(smin=smin32=-128,smax=smax32=127,umax=0xffffffffffffff80)") +__msg("3: (2d) if r0 > r1 {{.*}} R0=scalar(smin=smin32=-128,smax=smax32=127)") +/* smin=-128 includes point 0xffffffffffffff80 */ __msg("frame pointer is read only") __naked void bounds_deduct_two_overlaps(void) { @@ -2043,7 +2045,8 @@ __naked void signed_unsigned_intersection32_case2(void *ctx) */ SEC("socket") __description("bounds refinement: 64bits ranges not overwritten by 32bits ranges") -__msg("3: (65) if r0 s> 0x2 {{.*}} R0=scalar(smin=0x8000000000000002,smax=2,umin=smin32=umin32=2,umax=0xffffffff00000003,smax32=umax32=3") +__msg("3: (65) if r0 s> 0x2 {{.*}} R0=scalar(smin=0x8000000000000002,smax=2,smin32=umin32=2,smax32=umax32=3,var_off{{.*}}))") +/* Can't represent both [S64_MIN+2, 2] and [2, U64_MAX - U32_MAX + 2] at the same time, picks shorter interval */ __msg("4: (25) if r0 > 0x13 {{.*}} R0=2") __success __log_level(2) __naked void refinement_32bounds_not_overwriting_64bounds(void *ctx) diff --git a/tools/testing/selftests/bpf/progs/verifier_subreg.c b/tools/testing/selftests/bpf/progs/verifier_subreg.c index 31832a306f91..73b5b0cf6706 100644 --- a/tools/testing/selftests/bpf/progs/verifier_subreg.c +++ b/tools/testing/selftests/bpf/progs/verifier_subreg.c @@ -558,7 +558,8 @@ __description("arsh32 imm sign negative extend check") __success __retval(0) __log_level(2) __msg("3: (17) r6 -= 4095 ; R6=scalar(smin=smin32=-4095,smax=smax32=0)") -__msg("4: (67) r6 <<= 32 ; R6=scalar(smin=0xfffff00100000000,smax=smax32=umax32=0,umax=0xffffffff00000000,smin32=0,var_off=(0x0; 0xffffffff00000000))") +__msg("4: (67) r6 <<= 32 ; R6=scalar(smin=0xfffff00100000000,smax=smax32=umax32=0,smin32=0,var_off=(0x0; 0xffffffff00000000))") +/* represents shorter of signed / unsigned 64-bit ranges */ __msg("5: (c7) r6 s>>= 32 ; R6=scalar(smin=smin32=-4095,smax=smax32=0)") __naked void arsh32_imm_sign_extend_negative_check(void) { @@ -581,7 +582,8 @@ __description("arsh32 imm sign extend check") __success __retval(0) __log_level(2) __msg("3: (17) r6 -= 2047 ; R6=scalar(smin=smin32=-2047,smax=smax32=2048)") -__msg("4: (67) r6 <<= 32 ; R6=scalar(smin=0xfffff80100000000,smax=0x80000000000,umax=0xffffffff00000000,smin32=0,smax32=umax32=0,var_off=(0x0; 0xffffffff00000000))") +__msg("4: (67) r6 <<= 32 ; R6=scalar(smin=0xfffff80100000000,smax=0x80000000000,smin32=0,smax32=umax32=0,var_off=(0x0; 0xffffffff00000000))") +/* represents shorter of signed / unsigned 64-bit ranges */ __msg("5: (c7) r6 s>>= 32 ; R6=scalar(smin=smin32=-2047,smax=smax32=2048)") __naked void arsh32_imm_sign_extend_check(void) { -- cgit v1.2.3 From cd5b460ed1eca9e48f3eb07db1ee0a522c0eaa23 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Sat, 25 Apr 2026 15:48:23 -0700 Subject: bpf: range_within() must check cnum ranges instead of min/max pairs states.c:range_within() must be updated to properly check if cnum-based range in an old state is a superset of a range in the cur state. Currently it makes the decision using min/max accessors: reg_umin(old) <= reg_umin(cur) <= reg_umax(old) This is wrong for cnums that cross both UT_MAX/0 and ST_MAX/ST_MIN boundaries. Consider cnum32{base=0x7FFFFFF0, size=0x80000020}, which represents values [0x7FFFFFF0, ..., U32_MAX, 0, ..., 0x10]. Its projections are u32_min/max=0/U32_MAX, s32_min/max=S32_MIN/MAX. A register with range [0x100, 0x200] (which lies entirely in the gap of the wrapping range) would pass the min/max check despite having no overlap with the actual cnum arc. This commit replaces min/max comparison with cnum{32,64}_is_subset() operation. The operation implementation is verified using cbmc model checker in [1]. [1] https://github.com/eddyz87/cnum-verif/ Fixes: bbc631085503 ("bpf: replace min/max fields with struct cnum{32,64}") Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260425-cnum-range-within-v1-1-2fdca70cb09d@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/cnum.h | 2 ++ kernel/bpf/cnum_defs.h | 14 ++++++++++++++ kernel/bpf/states.c | 11 +++-------- 3 files changed, 19 insertions(+), 8 deletions(-) (limited to 'kernel/bpf') diff --git a/include/linux/cnum.h b/include/linux/cnum.h index a7259b105b45..49b7d0c7645d 100644 --- a/include/linux/cnum.h +++ b/include/linux/cnum.h @@ -48,6 +48,7 @@ bool cnum32_is_const(struct cnum32 cnum); bool cnum32_is_empty(struct cnum32 cnum); struct cnum32 cnum32_add(struct cnum32 a, struct cnum32 b); struct cnum32 cnum32_negate(struct cnum32 a); +bool cnum32_is_subset(struct cnum32 outer, struct cnum32 inner); /* Same as cnum32 but for 64-bit ranges */ struct cnum64 { @@ -73,6 +74,7 @@ bool cnum64_is_const(struct cnum64 cnum); bool cnum64_is_empty(struct cnum64 cnum); struct cnum64 cnum64_add(struct cnum64 a, struct cnum64 b); struct cnum64 cnum64_negate(struct cnum64 a); +bool cnum64_is_subset(struct cnum64 outer, struct cnum64 inner); struct cnum32 cnum32_from_cnum64(struct cnum64 cnum); struct cnum64 cnum64_cnum32_intersect(struct cnum64 a, struct cnum32 b); diff --git a/kernel/bpf/cnum_defs.h b/kernel/bpf/cnum_defs.h index 3ebd8f723dbb..1f232138b6e9 100644 --- a/kernel/bpf/cnum_defs.h +++ b/kernel/bpf/cnum_defs.h @@ -220,6 +220,20 @@ bool FN(is_const)(struct cnum_t cnum) return cnum.size == 0; } +bool FN(is_subset)(struct cnum_t bigger, struct cnum_t smaller) +{ + if (FN(is_empty(smaller))) + return true; + if (FN(is_empty(bigger))) + return false; + /* rotate both arcs such that 'bigger' starts at origin, hence does not overflow */ + smaller.base -= bigger.base; + bigger.base = 0; + if (FN(urange_overflow)(smaller) && bigger.size < UT_MAX) + return false; + return smaller.base + smaller.size <= bigger.size; +} + #undef EMPTY #undef cnum_t #undef ut diff --git a/kernel/bpf/states.c b/kernel/bpf/states.c index a78ae891b743..bd9c22945050 100644 --- a/kernel/bpf/states.c +++ b/kernel/bpf/states.c @@ -2,6 +2,7 @@ /* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ #include #include +#include #include #define verbose(env, fmt, args...) bpf_verifier_log_write(env, fmt, ##args) @@ -301,14 +302,8 @@ int bpf_update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_s static bool range_within(const struct bpf_reg_state *old, const struct bpf_reg_state *cur) { - return reg_umin(old) <= reg_umin(cur) && - reg_umax(old) >= reg_umax(cur) && - reg_smin(old) <= reg_smin(cur) && - reg_smax(old) >= reg_smax(cur) && - reg_u32_min(old) <= reg_u32_min(cur) && - reg_u32_max(old) >= reg_u32_max(cur) && - reg_s32_min(old) <= reg_s32_min(cur) && - reg_s32_max(old) >= reg_s32_max(cur); + return cnum64_is_subset(old->r64, cur->r64) && + cnum32_is_subset(old->r32, cur->r32); } /* If in the old state two registers had the same id, then they need to have -- cgit v1.2.3 From 79b8ebcbe483fee401e1b91dd32470348d9aa5b8 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Mon, 27 Apr 2026 12:22:05 +0100 Subject: bpf: Export cnum_umin/umax() helpers for netronome driver ERROR: modpost: "cnum64_umin" [drivers/net/ethernet/netronome/nfp/nfp.ko] undefined! ERROR: modpost: "cnum64_umax" [drivers/net/ethernet/netronome/nfp/nfp.ko] undefined! Export symbols for these references. Reported-by: Kaitao Cheng Fixes: bbc631085503 ("bpf: replace min/max fields with struct cnum{32,64}") Signed-off-by: Alan Maguire Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260427112205.1346733-1-alan.maguire@oracle.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/cnum_defs.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/cnum_defs.h b/kernel/bpf/cnum_defs.h index 1f232138b6e9..a90e317e3578 100644 --- a/kernel/bpf/cnum_defs.h +++ b/kernel/bpf/cnum_defs.h @@ -6,6 +6,7 @@ #endif #include +#include #include #include #include @@ -48,11 +49,13 @@ ut FN(umin)(struct cnum_t cnum) { return FN(urange_overflow)(cnum) ? 0 : cnum.base; } +EXPORT_SYMBOL_GPL(FN(umin)); ut FN(umax)(struct cnum_t cnum) { return FN(urange_overflow)(cnum) ? UT_MAX : cnum.base + cnum.size; } +EXPORT_SYMBOL_GPL(FN(umax)); /* True if this cnum represents two signed ranges. */ static inline bool FN(srange_overflow)(struct cnum_t cnum) -- cgit v1.2.3 From cfeddb4244268c246d67cbe50269a9475cb112fc Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 24 Apr 2026 17:39:05 +0200 Subject: bpf: Remove obsolete WARN_ON call The WARN_ON call in bpf_trampoline_update could never hit, because we direct the code path with (total == 0) to out label, which effectively skips the WARN_ON call. The WARN_ON made sense back then when it checked tr->selector, but now with total being set just inside the function it's useless. Signed-off-by: Jiri Olsa Acked-by: Song Liu Link: https://lore.kernel.org/r/20260424153905.354922-2-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/trampoline.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index f02254a21585..a4298a25d4ba 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -685,7 +685,6 @@ again: if (err) goto out_free; - WARN_ON(tr->cur_image && total == 0); if (tr->cur_image) /* progs already running at this address */ err = modify_fentry(tr, orig_flags, tr->cur_image->image, -- cgit v1.2.3 From f603e84ab7918db6470c0b06b46ece7fbdb71e9a Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Thu, 30 Apr 2026 10:44:28 +0200 Subject: bpf: Print breakdown of insns processed by subprogs When using global functions (i.e. subprogs), the verifier performs function-by-function verification. In that case, the sum of the instructions processed in each global function and in the main program counts towards the 1 million instructions limit. Only that sum is reported in the verifier logs. While starting to use global functions in Cilium (finally!), we found it can be useful to have the breakdown per global function, to understand exactly where the budget is currently spent. This patch implements this breakdown, under BPF_LOG_STATS, as done for the stack depths. When iterating over subprogs, we need to skip the hidden subprogs at the end because they don't have a corresponding func_info_aux entry and calling bpf_subprog_is_global() would result in an OOB access. Signed-off-by: Paul Chaignon Link: https://lore.kernel.org/bpf/5590f9c67e614ec9054d0c7e74e87cc690a52c56.1777538384.git.paul.chaignon@gmail.com Signed-off-by: Kumar Kartikeya Dwivedi --- include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c | 25 ++++++++++++++++--------- 2 files changed, 17 insertions(+), 9 deletions(-) (limited to 'kernel/bpf') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 101ca6cc5424..976e2b2f40e8 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -779,6 +779,7 @@ struct bpf_subprog_info { u32 exit_idx; /* Index of one of the BPF_EXIT instructions in this subprogram */ u16 stack_depth; /* max. stack depth used by this function */ u16 stack_extra; + u32 insn_processed; /* offsets in range [stack_depth .. fastcall_stack_off) * are used for bpf_fastcall spills and fills. */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 03f9e16c2abe..11054ad89c14 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -18215,6 +18215,7 @@ static int do_check_subprogs(struct bpf_verifier_env *env) struct bpf_prog_aux *aux = env->prog->aux; struct bpf_func_info_aux *sub_aux; int i, ret, new_cnt; + u32 insn_processed; if (!aux->func_info) return 0; @@ -18229,6 +18230,8 @@ again: if (!bpf_subprog_is_global(env, i)) continue; + insn_processed = env->insn_processed; + sub_aux = subprog_aux(env, i); if (!sub_aux->called || sub_aux->verified) continue; @@ -18236,6 +18239,7 @@ again: env->insn_idx = env->subprog_info[i].start; WARN_ON_ONCE(env->insn_idx == 0); ret = do_check_common(env, i); + env->subprog_info[i].insn_processed = env->insn_processed - insn_processed; if (ret) { return ret; } else if (env->log.level & BPF_LOG_LEVEL) { @@ -18262,10 +18266,12 @@ again: static int do_check_main(struct bpf_verifier_env *env) { + u32 insn_processed = env->insn_processed; int ret; env->insn_idx = 0; ret = do_check_common(env, 0); + env->subprog_info[0].insn_processed = env->insn_processed - insn_processed; if (!ret) env->prog->aux->stack_depth = env->subprog_info[0].stack_depth; return ret; @@ -18274,19 +18280,20 @@ static int do_check_main(struct bpf_verifier_env *env) static void print_verification_stats(struct bpf_verifier_env *env) { - int i; + /* Skip over hidden subprogs which are not verified. */ + int i, subprog_cnt = env->subprog_cnt - env->hidden_subprog_cnt; if (env->log.level & BPF_LOG_STATS) { verbose(env, "verification time %lld usec\n", div_u64(env->verification_time, 1000)); - verbose(env, "stack depth "); - for (i = 0; i < env->subprog_cnt; i++) { - u32 depth = env->subprog_info[i].stack_depth; - - verbose(env, "%d", depth); - if (i + 1 < env->subprog_cnt) - verbose(env, "+"); - } + verbose(env, "stack depth %d", env->subprog_info[0].stack_depth); + for (i = 1; i < subprog_cnt; i++) + verbose(env, "+%d", env->subprog_info[i].stack_depth); + verbose(env, "\n"); + verbose(env, "insns processed %d", env->subprog_info[0].insn_processed); + for (i = 1; i < subprog_cnt; i++) + if (bpf_subprog_is_global(env, i)) + verbose(env, "+%d", env->subprog_info[i].insn_processed); verbose(env, "\n"); } verbose(env, "processed %d insns (limit %d) max_states_per_insn %d " -- cgit v1.2.3 From f28771c0691bcb7f477a0f35550b17b88c32dea8 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Tue, 12 May 2026 23:31:50 +0800 Subject: bpf: Extend BPF syscall with common attributes support Add generic BPF syscall support for passing common attributes. The initial set of common attributes includes: 1. 'log_buf': User-provided buffer for storing logs. 2. 'log_size': Size of the log buffer. 3. 'log_level': Log verbosity level. 4. 'log_true_size': Actual log size reported by kernel. The common-attribute pointer and its size are passed as the 4th and 5th syscall arguments. A new command bit, 'BPF_COMMON_ATTRS' ('1 << 16'), indicates that common attributes are supplied. This commit adds syscall and uapi plumbing. Command-specific handling is added in follow-up patches. Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260512153157.28382-2-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/syscalls.h | 3 ++- include/uapi/linux/bpf.h | 8 ++++++++ kernel/bpf/syscall.c | 25 +++++++++++++++++++++---- tools/include/uapi/linux/bpf.h | 8 ++++++++ 4 files changed, 39 insertions(+), 5 deletions(-) (limited to 'kernel/bpf') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index f5639d5ac331..50055ab73649 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -936,7 +936,8 @@ asmlinkage long sys_seccomp(unsigned int op, unsigned int flags, asmlinkage long sys_getrandom(char __user *buf, size_t count, unsigned int flags); asmlinkage long sys_memfd_create(const char __user *uname_ptr, unsigned int flags); -asmlinkage long sys_bpf(int cmd, union bpf_attr __user *attr, unsigned int size); +asmlinkage long sys_bpf(int cmd, union bpf_attr __user *attr, unsigned int size, + struct bpf_common_attr __user *attr_common, unsigned int size_common); asmlinkage long sys_execveat(int dfd, const char __user *filename, const char __user *const __user *argv, const char __user *const __user *envp, int flags); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 552bc5d9afbd..aec171ccb6ef 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -994,6 +994,7 @@ enum bpf_cmd { BPF_PROG_STREAM_READ_BY_FD, BPF_PROG_ASSOC_STRUCT_OPS, __MAX_BPF_CMD, + BPF_COMMON_ATTRS = 1 << 16, /* Indicate carrying syscall common attrs. */ }; enum bpf_map_type { @@ -1500,6 +1501,13 @@ struct bpf_stack_build_id { }; }; +struct bpf_common_attr { + __aligned_u64 log_buf; + __u32 log_size; + __u32 log_level; + __u32 log_true_size; +}; + #define BPF_OBJ_NAME_LEN 16U enum { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 3b1f0ba02f61..354f6f471a08 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -6211,8 +6211,10 @@ put_prog: return ret; } -static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size) +static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size, + bpfptr_t uattr_common, unsigned int size_common) { + struct bpf_common_attr attr_common; union bpf_attr attr; int err; @@ -6226,6 +6228,20 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size) if (copy_from_bpfptr(&attr, uattr, size) != 0) return -EFAULT; + memset(&attr_common, 0, sizeof(attr_common)); + if (cmd & BPF_COMMON_ATTRS) { + err = bpf_check_uarg_tail_zero(uattr_common, sizeof(attr_common), size_common); + if (err) + return err; + + cmd &= ~BPF_COMMON_ATTRS; + size_common = min_t(u32, size_common, sizeof(attr_common)); + if (copy_from_bpfptr(&attr_common, uattr_common, size_common) != 0) + return -EFAULT; + } else { + size_common = 0; + } + err = security_bpf(cmd, &attr, size, uattr.is_kernel); if (err < 0) return err; @@ -6361,9 +6377,10 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size) return err; } -SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) +SYSCALL_DEFINE5(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size, + struct bpf_common_attr __user *, uattr_common, unsigned int, size_common) { - return __sys_bpf(cmd, USER_BPFPTR(uattr), size); + return __sys_bpf(cmd, USER_BPFPTR(uattr), size, USER_BPFPTR(uattr_common), size_common); } static bool syscall_prog_is_valid_access(int off, int size, @@ -6393,7 +6410,7 @@ BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size) default: return -EINVAL; } - return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size); + return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size, KERNEL_BPFPTR(NULL), 0); } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 677be9a47347..37142e6d911a 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -994,6 +994,7 @@ enum bpf_cmd { BPF_PROG_STREAM_READ_BY_FD, BPF_PROG_ASSOC_STRUCT_OPS, __MAX_BPF_CMD, + BPF_COMMON_ATTRS = 1 << 16, /* Indicate carrying syscall common attrs. */ }; enum bpf_map_type { @@ -1500,6 +1501,13 @@ struct bpf_stack_build_id { }; }; +struct bpf_common_attr { + __aligned_u64 log_buf; + __u32 log_size; + __u32 log_level; + __u32 log_true_size; +}; + #define BPF_OBJ_NAME_LEN 16U enum { -- cgit v1.2.3 From 503c039ffeca7530ce9d6446a07b4bb776180b45 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Tue, 12 May 2026 23:31:52 +0800 Subject: bpf: Refactor reporting log_true_size for prog_load The next commit will add support for reporting logs via extended common attributes, including 'log_true_size'. To prepare for that, refactor the 'log_true_size' reporting logic by introducing a new struct bpf_log_attr to encapsulate log-related behavior: * bpf_log_attr_init(): initialize log fields, which will support extended common attributes in the next commit. * bpf_log_attr_finalize(): handle log finalization and write back 'log_true_size' to userspace. Acked-by: Andrii Nakryiko Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260512153157.28382-4-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 4 +++- include/linux/bpf_verifier.h | 12 ++++++++++++ kernel/bpf/log.c | 29 +++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 12 +++++++++--- kernel/bpf/verifier.c | 17 ++++------------- 5 files changed, 57 insertions(+), 17 deletions(-) (limited to 'kernel/bpf') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 14759972f148..9e16e91647d3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2919,7 +2919,9 @@ int bpf_check_uarg_tail_zero(bpfptr_t uaddr, size_t expected_size, size_t actual_size); /* verify correctness of eBPF program */ -int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size); +struct bpf_log_attr; +int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, bpfptr_t uattr, + struct bpf_log_attr *attr_log); #ifndef CONFIG_BPF_JIT_ALWAYS_ON void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth); diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 976e2b2f40e8..8d27ad1f9f94 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -755,6 +755,18 @@ static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) return log && log->level; } +struct bpf_log_attr { + char __user *ubuf; + u32 size; + u32 level; + u32 offsetof_true_size; + bpfptr_t uattr; +}; + +int bpf_log_attr_init(struct bpf_log_attr *log, u64 log_buf, u32 log_size, u32 log_level, + u32 offsetof_log_true_size, bpfptr_t uattr); +int bpf_log_attr_finalize(struct bpf_log_attr *attr, struct bpf_verifier_log *log); + #define BPF_MAX_SUBPROGS 256 struct bpf_subprog_arg_info { diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 64566b86dd27..1b1efe75398b 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -825,3 +825,32 @@ void print_insn_state(struct bpf_verifier_env *env, const struct bpf_verifier_st } print_verifier_state(env, vstate, frameno, false); } + +int bpf_log_attr_init(struct bpf_log_attr *log, u64 log_buf, u32 log_size, u32 log_level, + u32 offsetof_log_true_size, bpfptr_t uattr) +{ + char __user *ubuf = u64_to_user_ptr(log_buf); + + memset(log, 0, sizeof(*log)); + log->ubuf = ubuf; + log->size = log_size; + log->level = log_level; + log->offsetof_true_size = offsetof_log_true_size; + log->uattr = uattr; + return 0; +} + +int bpf_log_attr_finalize(struct bpf_log_attr *attr, struct bpf_verifier_log *log) +{ + u32 log_true_size; + int err; + + err = bpf_vlog_finalize(log, &log_true_size); + + if (attr->offsetof_true_size && + copy_to_bpfptr_offset(attr->uattr, attr->offsetof_true_size, &log_true_size, + sizeof(log_true_size))) + return -EFAULT; + + return err; +} diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 354f6f471a08..70b78ddcdedb 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2861,7 +2861,7 @@ static int bpf_prog_mark_insn_arrays_ready(struct bpf_prog *prog) /* last field in 'union bpf_attr' used by this command */ #define BPF_PROG_LOAD_LAST_FIELD keyring_id -static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) +static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, struct bpf_log_attr *attr_log) { enum bpf_prog_type type = attr->prog_type; struct bpf_prog *prog, *dst_prog = NULL; @@ -3079,7 +3079,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) goto free_prog_sec; /* run eBPF verifier */ - err = bpf_check(&prog, attr, uattr, uattr_size); + err = bpf_check(&prog, attr, uattr, attr_log); if (err < 0) goto free_used_maps; @@ -6215,6 +6215,8 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size, bpfptr_t uattr_common, unsigned int size_common) { struct bpf_common_attr attr_common; + u32 offsetof_log_true_size = 0; + struct bpf_log_attr attr_log; union bpf_attr attr; int err; @@ -6266,7 +6268,11 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size, err = map_freeze(&attr); break; case BPF_PROG_LOAD: - err = bpf_prog_load(&attr, uattr, size); + if (size >= offsetofend(union bpf_attr, log_true_size)) + offsetof_log_true_size = offsetof(union bpf_attr, log_true_size); + err = bpf_log_attr_init(&attr_log, attr.log_buf, attr.log_size, attr.log_level, + offsetof_log_true_size, uattr); + err = err ?: bpf_prog_load(&attr, uattr, &attr_log); break; case BPF_OBJ_PIN: err = bpf_obj_pin(&attr); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 11054ad89c14..0e654ef01ae0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -19294,12 +19294,12 @@ int bpf_fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return 0; } -int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size) +int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, + struct bpf_log_attr *attr_log) { u64 start_time = ktime_get_ns(); struct bpf_verifier_env *env; int i, len, ret = -EINVAL, err; - u32 log_true_size; bool is_priv; BTF_TYPE_EMIT(enum bpf_features); @@ -19346,9 +19346,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 /* user could have requested verbose verifier output * and supplied buffer to store the verification trace */ - ret = bpf_vlog_init(&env->log, attr->log_level, - (char __user *) (unsigned long) attr->log_buf, - attr->log_size); + ret = bpf_vlog_init(&env->log, attr_log->level, attr_log->ubuf, attr_log->size); if (ret) goto err_unlock; @@ -19510,17 +19508,10 @@ skip_full_check: env->prog->aux->verified_insns = env->insn_processed; /* preserve original error even if log finalization is successful */ - err = bpf_vlog_finalize(&env->log, &log_true_size); + err = bpf_log_attr_finalize(attr_log, &env->log); if (err) ret = err; - if (uattr_size >= offsetofend(union bpf_attr, log_true_size) && - copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, log_true_size), - &log_true_size, sizeof(log_true_size))) { - ret = -EFAULT; - goto err_release_maps; - } - if (ret) goto err_release_maps; -- cgit v1.2.3 From ac89d33fdd8183df39fe92ffa525be7af6feb9d1 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Tue, 12 May 2026 23:31:53 +0800 Subject: bpf: Add syscall common attributes support for prog_load BPF_PROG_LOAD can now take log parameters from both union bpf_attr and struct bpf_common_attr. The merge rules are: - if both sides provide a complete log tuple (buf/size/level) and they match, use it; - if only one side provides log parameters, use that one; - if both sides provide complete tuples but they differ, return -EINVAL. Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260512153157.28382-5-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 3 ++- kernel/bpf/log.c | 34 +++++++++++++++++++++++++++------- kernel/bpf/syscall.c | 3 ++- 3 files changed, 31 insertions(+), 9 deletions(-) (limited to 'kernel/bpf') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 8d27ad1f9f94..8433430dedb7 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -764,7 +764,8 @@ struct bpf_log_attr { }; int bpf_log_attr_init(struct bpf_log_attr *log, u64 log_buf, u32 log_size, u32 log_level, - u32 offsetof_log_true_size, bpfptr_t uattr); + u32 offsetof_log_true_size, bpfptr_t uattr, struct bpf_common_attr *common, + bpfptr_t uattr_common, u32 size_common); int bpf_log_attr_finalize(struct bpf_log_attr *attr, struct bpf_verifier_log *log); #define BPF_MAX_SUBPROGS 256 diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 1b1efe75398b..fd12ad5a0338 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -13,17 +13,17 @@ #define verbose(env, fmt, args...) bpf_verifier_log_write(env, fmt, ##args) -static bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log) +static bool bpf_verifier_log_attr_valid(u32 log_level, char __user *log_buf, u32 log_size) { /* ubuf and len_total should both be specified (or not) together */ - if (!!log->ubuf != !!log->len_total) + if (!!log_buf != !!log_size) return false; /* log buf without log_level is meaningless */ - if (log->ubuf && log->level == 0) + if (log_buf && log_level == 0) return false; - if (log->level & ~BPF_LOG_MASK) + if (log_level & ~BPF_LOG_MASK) return false; - if (log->len_total > UINT_MAX >> 2) + if (log_size > UINT_MAX >> 2) return false; return true; } @@ -36,7 +36,7 @@ int bpf_vlog_init(struct bpf_verifier_log *log, u32 log_level, log->len_total = log_size; /* log attributes have to be sane */ - if (!bpf_verifier_log_attr_valid(log)) + if (!bpf_verifier_log_attr_valid(log_level, log_buf, log_size)) return -EINVAL; return 0; @@ -827,16 +827,36 @@ void print_insn_state(struct bpf_verifier_env *env, const struct bpf_verifier_st } int bpf_log_attr_init(struct bpf_log_attr *log, u64 log_buf, u32 log_size, u32 log_level, - u32 offsetof_log_true_size, bpfptr_t uattr) + u32 offsetof_log_true_size, bpfptr_t uattr, struct bpf_common_attr *common, + bpfptr_t uattr_common, u32 size_common) { + char __user *ubuf_common = u64_to_user_ptr(common->log_buf); char __user *ubuf = u64_to_user_ptr(log_buf); + if (!bpf_verifier_log_attr_valid(common->log_level, ubuf_common, common->log_size) || + !bpf_verifier_log_attr_valid(log_level, ubuf, log_size)) + return -EINVAL; + + if (ubuf && ubuf_common && (ubuf != ubuf_common || log_size != common->log_size || + log_level != common->log_level)) + return -EINVAL; + memset(log, 0, sizeof(*log)); log->ubuf = ubuf; log->size = log_size; log->level = log_level; log->offsetof_true_size = offsetof_log_true_size; log->uattr = uattr; + + if (!ubuf && ubuf_common) { + log->ubuf = ubuf_common; + log->size = common->log_size; + log->level = common->log_level; + log->uattr = uattr_common; + log->offsetof_true_size = 0; + if (size_common >= offsetofend(struct bpf_common_attr, log_true_size)) + log->offsetof_true_size = offsetof(struct bpf_common_attr, log_true_size); + } return 0; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 70b78ddcdedb..db893cae826c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -6271,7 +6271,8 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size, if (size >= offsetofend(union bpf_attr, log_true_size)) offsetof_log_true_size = offsetof(union bpf_attr, log_true_size); err = bpf_log_attr_init(&attr_log, attr.log_buf, attr.log_size, attr.log_level, - offsetof_log_true_size, uattr); + offsetof_log_true_size, uattr, &attr_common, uattr_common, + size_common); err = err ?: bpf_prog_load(&attr, uattr, &attr_log); break; case BPF_OBJ_PIN: -- cgit v1.2.3 From ceeb7eda94a3548958b30818495ef7eb12898727 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Tue, 12 May 2026 23:31:54 +0800 Subject: bpf: Add syscall common attributes support for btf_load BPF_BTF_LOAD can now take log parameters from both union bpf_attr and struct bpf_common_attr, with the same merge rules as BPF_PROG_LOAD: - if both sides provide a complete log tuple (buf/size/level) and they match, use it; - if only one side provides log parameters, use that one; - if both sides provide complete tuples but they differ, return -EINVAL. Acked-by: Andrii Nakryiko Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260512153157.28382-6-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 3 ++- kernel/bpf/btf.c | 30 +++++++----------------------- kernel/bpf/syscall.c | 11 ++++++++--- 3 files changed, 17 insertions(+), 27 deletions(-) (limited to 'kernel/bpf') diff --git a/include/linux/btf.h b/include/linux/btf.h index c82d0d689059..240401d9b25b 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -145,7 +145,8 @@ const char *btf_get_name(const struct btf *btf); void btf_get(struct btf *btf); void btf_put(struct btf *btf); const struct btf_header *btf_header(const struct btf *btf); -int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_sz); +struct bpf_log_attr; +int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr, struct bpf_log_attr *attr_log); struct btf *btf_get_by_fd(int fd); int btf_get_info_by_fd(const struct btf *btf, const union bpf_attr *attr, diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 77af44d8a3ad..a6bf4781943c 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5907,25 +5907,10 @@ static int btf_check_type_tags(struct btf_verifier_env *env, return 0; } -static int finalize_log(struct bpf_verifier_log *log, bpfptr_t uattr, u32 uattr_size) -{ - u32 log_true_size; - int err; - - err = bpf_vlog_finalize(log, &log_true_size); - - if (uattr_size >= offsetofend(union bpf_attr, btf_log_true_size) && - copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, btf_log_true_size), - &log_true_size, sizeof(log_true_size))) - err = -EFAULT; - - return err; -} - -static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) +static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, + struct bpf_log_attr *attr_log) { bpfptr_t btf_data = make_bpfptr(attr->btf, uattr.is_kernel); - char __user *log_ubuf = u64_to_user_ptr(attr->btf_log_buf); struct btf_struct_metas *struct_meta_tab; struct btf_verifier_env *env = NULL; struct btf *btf = NULL; @@ -5942,8 +5927,7 @@ static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uat /* user could have requested verbose verifier output * and supplied buffer to store the verification trace */ - err = bpf_vlog_init(&env->log, attr->btf_log_level, - log_ubuf, attr->btf_log_size); + err = bpf_vlog_init(&env->log, attr_log->level, attr_log->ubuf, attr_log->size); if (err) goto errout_free; @@ -6008,7 +5992,7 @@ static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uat } } - err = finalize_log(&env->log, uattr, uattr_size); + err = bpf_log_attr_finalize(attr_log, &env->log); if (err) goto errout_free; @@ -6020,7 +6004,7 @@ errout_meta: btf_free_struct_meta_tab(btf); errout: /* overwrite err with -ENOSPC or -EFAULT */ - ret = finalize_log(&env->log, uattr, uattr_size); + ret = bpf_log_attr_finalize(attr_log, &env->log); if (ret) err = ret; errout_free: @@ -8189,12 +8173,12 @@ static int __btf_new_fd(struct btf *btf) return anon_inode_getfd("btf", &btf_fops, btf, O_RDONLY | O_CLOEXEC); } -int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) +int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr, struct bpf_log_attr *attr_log) { struct btf *btf; int ret; - btf = btf_parse(attr, uattr, uattr_size); + btf = btf_parse(attr, uattr, attr_log); if (IS_ERR(btf)) return PTR_ERR(btf); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index db893cae826c..2fa05ba8f161 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -5474,7 +5474,7 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, #define BPF_BTF_LOAD_LAST_FIELD btf_token_fd -static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size) +static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, struct bpf_log_attr *attr_log) { struct bpf_token *token = NULL; @@ -5501,7 +5501,7 @@ static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_ bpf_token_put(token); - return btf_new_fd(attr, uattr, uattr_size); + return btf_new_fd(attr, uattr, attr_log); } #define BPF_BTF_GET_FD_BY_ID_LAST_FIELD fd_by_id_token_fd @@ -6318,7 +6318,12 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size, err = bpf_raw_tracepoint_open(&attr); break; case BPF_BTF_LOAD: - err = bpf_btf_load(&attr, uattr, size); + if (size >= offsetofend(union bpf_attr, btf_log_true_size)) + offsetof_log_true_size = offsetof(union bpf_attr, btf_log_true_size); + err = bpf_log_attr_init(&attr_log, attr.btf_log_buf, attr.btf_log_size, + attr.btf_log_level, offsetof_log_true_size, uattr, + &attr_common, uattr_common, size_common); + err = err ?: bpf_btf_load(&attr, uattr, &attr_log); break; case BPF_BTF_GET_FD_BY_ID: err = bpf_btf_get_fd_by_id(&attr); -- cgit v1.2.3 From 49f9b2b2a18c5ce06b21fc2b3399352d80dee0c6 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Tue, 12 May 2026 23:31:55 +0800 Subject: bpf: Add syscall common attributes support for map_create Many BPF_MAP_CREATE validation failures currently return -EINVAL without any explanation to userspace. Plumb common syscall log attributes into map_create(), create a verifier log from bpf_common_attr::log_buf/log_size/log_level, and report map-creation failure reasons through that buffer. This improves debuggability by allowing userspace to inspect why map creation failed and read back log_true_size from common attributes. Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260512153157.28382-7-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 3 ++ kernel/bpf/log.c | 29 +++++++++++++++++++ kernel/bpf/syscall.c | 66 +++++++++++++++++++++++++++++++++++++------- 3 files changed, 88 insertions(+), 10 deletions(-) (limited to 'kernel/bpf') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 8433430dedb7..c15a4c26a43b 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -766,6 +766,9 @@ struct bpf_log_attr { int bpf_log_attr_init(struct bpf_log_attr *log, u64 log_buf, u32 log_size, u32 log_level, u32 offsetof_log_true_size, bpfptr_t uattr, struct bpf_common_attr *common, bpfptr_t uattr_common, u32 size_common); +struct bpf_verifier_log *bpf_log_attr_create_vlog(struct bpf_log_attr *attr_log, + struct bpf_common_attr *common, bpfptr_t uattr, + u32 size); int bpf_log_attr_finalize(struct bpf_log_attr *attr, struct bpf_verifier_log *log); #define BPF_MAX_SUBPROGS 256 diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index fd12ad5a0338..62fe6ed18374 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -860,6 +860,35 @@ int bpf_log_attr_init(struct bpf_log_attr *log, u64 log_buf, u32 log_size, u32 l return 0; } +struct bpf_verifier_log *bpf_log_attr_create_vlog(struct bpf_log_attr *attr_log, + struct bpf_common_attr *common, bpfptr_t uattr, + u32 size) +{ + struct bpf_verifier_log *log; + int err; + + memset(attr_log, 0, sizeof(*attr_log)); + attr_log->uattr = uattr; + if (size >= offsetofend(struct bpf_common_attr, log_true_size)) + attr_log->offsetof_true_size = offsetof(struct bpf_common_attr, log_true_size); + + if (!size) + return NULL; + + log = kzalloc_obj(*log, GFP_KERNEL); + if (!log) + return ERR_PTR(-ENOMEM); + + err = bpf_vlog_init(log, common->log_level, u64_to_user_ptr(common->log_buf), + common->log_size); + if (err) { + kfree(log); + return ERR_PTR(err); + } + + return log; +} + int bpf_log_attr_finalize(struct bpf_log_attr *attr, struct bpf_verifier_log *log) { u32 log_true_size; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2fa05ba8f161..6600e126fbfb 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1359,7 +1359,7 @@ free_map_tab: #define BPF_MAP_CREATE_LAST_FIELD excl_prog_hash_size /* called via syscall */ -static int map_create(union bpf_attr *attr, bpfptr_t uattr) +static int __map_create(union bpf_attr *attr, bpfptr_t uattr, struct bpf_verifier_log *log) { const struct bpf_map_ops *ops; struct bpf_token *token = NULL; @@ -1371,8 +1371,10 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr) int err; err = CHECK_ATTR(BPF_MAP_CREATE); - if (err) + if (err) { + bpf_log(log, "Invalid attr.\n"); return -EINVAL; + } /* check BPF_F_TOKEN_FD flag, remember if it's set, and then clear it * to avoid per-map type checks tripping on unknown flag @@ -1381,17 +1383,25 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr) attr->map_flags &= ~BPF_F_TOKEN_FD; if (attr->btf_vmlinux_value_type_id) { - if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS || - attr->btf_key_type_id || attr->btf_value_type_id) + if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS) { + bpf_log(log, "btf_vmlinux_value_type_id can only be used with struct_ops maps.\n"); return -EINVAL; + } + if (attr->btf_key_type_id || attr->btf_value_type_id) { + bpf_log(log, "btf_vmlinux_value_type_id is mutually exclusive with btf_key_type_id and btf_value_type_id.\n"); + return -EINVAL; + } } else if (attr->btf_key_type_id && !attr->btf_value_type_id) { + bpf_log(log, "Invalid btf_value_type_id.\n"); return -EINVAL; } if (attr->map_type != BPF_MAP_TYPE_BLOOM_FILTER && attr->map_type != BPF_MAP_TYPE_ARENA && - attr->map_extra != 0) + attr->map_extra != 0) { + bpf_log(log, "Invalid map_extra.\n"); return -EINVAL; + } f_flags = bpf_get_file_flag(attr->map_flags); if (f_flags < 0) @@ -1399,13 +1409,17 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr) if (numa_node != NUMA_NO_NODE && ((unsigned int)numa_node >= nr_node_ids || - !node_online(numa_node))) + !node_online(numa_node))) { + bpf_log(log, "Invalid numa_node.\n"); return -EINVAL; + } /* find map type and init map: hashtable vs rbtree vs bloom vs ... */ map_type = attr->map_type; - if (map_type >= ARRAY_SIZE(bpf_map_types)) + if (map_type >= ARRAY_SIZE(bpf_map_types)) { + bpf_log(log, "Invalid map_type.\n"); return -EINVAL; + } map_type = array_index_nospec(map_type, ARRAY_SIZE(bpf_map_types)); ops = bpf_map_types[map_type]; if (!ops) @@ -1423,8 +1437,10 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr) if (token_flag) { token = bpf_token_get_from_fd(attr->map_token_fd); - if (IS_ERR(token)) + if (IS_ERR(token)) { + bpf_log(log, "Invalid map_token_fd.\n"); return PTR_ERR(token); + } /* if current token doesn't grant map creation permissions, * then we can't use this token, so ignore it and rely on @@ -1507,8 +1523,10 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr) err = bpf_obj_name_cpy(map->name, attr->map_name, sizeof(attr->map_name)); - if (err < 0) + if (err < 0) { + bpf_log(log, "Invalid map_name.\n"); goto free_map; + } preempt_disable(); map->cookie = gen_cookie_next(&bpf_map_cookie); @@ -1531,6 +1549,7 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr) btf = btf_get_by_fd(attr->btf_fd); if (IS_ERR(btf)) { + bpf_log(log, "Invalid btf_fd.\n"); err = PTR_ERR(btf); goto free_map; } @@ -1558,6 +1577,7 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr) bpfptr_t uprog_hash = make_bpfptr(attr->excl_prog_hash, uattr.is_kernel); if (attr->excl_prog_hash_size != SHA256_DIGEST_SIZE) { + bpf_log(log, "Invalid excl_prog_hash_size.\n"); err = -EINVAL; goto free_map; } @@ -1573,6 +1593,7 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr) goto free_map; } } else if (attr->excl_prog_hash_size) { + bpf_log(log, "Invalid excl_prog_hash_size.\n"); err = -EINVAL; goto free_map; } @@ -1611,6 +1632,31 @@ put_token: return err; } +static int map_create(union bpf_attr *attr, bpfptr_t uattr, struct bpf_common_attr *attr_common, + bpfptr_t uattr_common, u32 size_common) +{ + struct bpf_verifier_log *log; + struct bpf_log_attr attr_log; + int err, ret; + + log = bpf_log_attr_create_vlog(&attr_log, attr_common, uattr_common, size_common); + if (IS_ERR(log)) + return PTR_ERR(log); + + err = __map_create(attr, uattr, log); + + /* preserve original error even if log finalization is successful */ + ret = bpf_log_attr_finalize(&attr_log, log); + if (ret) { + if (err >= 0) + close_fd(err); + err = ret; + } + + kfree(log); + return err; +} + void bpf_map_inc(struct bpf_map *map) { atomic64_inc(&map->refcnt); @@ -6250,7 +6296,7 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size, switch (cmd) { case BPF_MAP_CREATE: - err = map_create(&attr, uattr); + err = map_create(&attr, uattr, &attr_common, uattr_common, size_common); break; case BPF_MAP_LOOKUP_ELEM: err = map_lookup_elem(&attr); -- cgit v1.2.3 From 1cb229a54af1e36f19f7e8359692fa0d76fbc360 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 12 May 2026 21:50:00 -0700 Subject: bpf: Remove copy_register_state wrapper function Remove the copy_register_state() helper which was just a plain struct assignment wrapper and replace all call sites with direct struct assignment. This simplifies the code in preparation for upcoming stack argument support. No functional change. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260513045000.2382933-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 44 +++++++++++++++++++------------------------- 1 file changed, 19 insertions(+), 25 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0e654ef01ae0..1dd9736c2a13 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3410,12 +3410,6 @@ static void assign_scalar_id_before_mov(struct bpf_verifier_env *env, src_reg->id = ++env->id_gen; } -/* Copy src state preserving dst->parent and dst->live fields */ -static void copy_register_state(struct bpf_reg_state *dst, const struct bpf_reg_state *src) -{ - *dst = *src; -} - static void save_register_state(struct bpf_verifier_env *env, struct bpf_func_state *state, int spi, struct bpf_reg_state *reg, @@ -3423,7 +3417,7 @@ static void save_register_state(struct bpf_verifier_env *env, { int i; - copy_register_state(&state->stack[spi].spilled_ptr, reg); + state->stack[spi].spilled_ptr = *reg; for (i = BPF_REG_SIZE; i > BPF_REG_SIZE - size; i--) state->stack[spi].slot_type[i - 1] = STACK_SPILL; @@ -3822,7 +3816,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, * with the destination register on fill. */ assign_scalar_id_before_mov(env, reg); - copy_register_state(&state->regs[dst_regno], reg); + state->regs[dst_regno] = *reg; state->regs[dst_regno].subreg_def = subreg_def; /* Break the relation on a narrowing fill. @@ -3877,7 +3871,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, * with the destination register on fill. */ assign_scalar_id_before_mov(env, reg); - copy_register_state(&state->regs[dst_regno], reg); + state->regs[dst_regno] = *reg; /* mark reg as written since spilled pointer state likely * has its liveness marks cleared by is_state_visited() * which resets stack/reg liveness for state transitions @@ -6031,7 +6025,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, struct b size); return -EACCES; } - copy_register_state(®s[value_regno], reg); + regs[value_regno] = *reg; add_scalar_to_reg(®s[value_regno], off); regs[value_regno].type = PTR_TO_INSN; } else { @@ -13248,7 +13242,7 @@ do_sim: */ if (!ptr_is_dst_reg) { tmp = *dst_reg; - copy_register_state(dst_reg, ptr_reg); + *dst_reg = *ptr_reg; } err = sanitize_speculative_path(env, NULL, env->insn_idx + 1, env->insn_idx); if (err < 0) @@ -14698,7 +14692,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) * copy register state to dest reg */ assign_scalar_id_before_mov(env, src_reg); - copy_register_state(dst_reg, src_reg); + *dst_reg = *src_reg; dst_reg->subreg_def = DEF_NOT_SUBREG; } else { /* case: R1 = (s8, s16 s32)R2 */ @@ -14713,7 +14707,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) no_sext = reg_umax(src_reg) < (1ULL << (insn->off - 1)); if (no_sext) assign_scalar_id_before_mov(env, src_reg); - copy_register_state(dst_reg, src_reg); + *dst_reg = *src_reg; if (!no_sext) clear_scalar_id(dst_reg); coerce_reg_to_size_sx(dst_reg, insn->off >> 3); @@ -14735,7 +14729,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if (is_src_reg_u32) assign_scalar_id_before_mov(env, src_reg); - copy_register_state(dst_reg, src_reg); + *dst_reg = *src_reg; /* Make sure ID is cleared if src_reg is not in u32 * range otherwise dst_reg min/max could be incorrectly * propagated into src_reg by sync_linked_regs() @@ -14749,7 +14743,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if (no_sext) assign_scalar_id_before_mov(env, src_reg); - copy_register_state(dst_reg, src_reg); + *dst_reg = *src_reg; if (!no_sext) clear_scalar_id(dst_reg); dst_reg->subreg_def = env->insn_idx + 1; @@ -15629,7 +15623,7 @@ static void sync_linked_regs(struct bpf_verifier_env *env, struct bpf_verifier_s reg->delta == known_reg->delta) { s32 saved_subreg_def = reg->subreg_def; - copy_register_state(reg, known_reg); + *reg = *known_reg; reg->subreg_def = saved_subreg_def; } else { s32 saved_subreg_def = reg->subreg_def; @@ -15640,7 +15634,7 @@ static void sync_linked_regs(struct bpf_verifier_env *env, struct bpf_verifier_s __mark_reg_known(&fake_reg, (s64)reg->delta - (s64)known_reg->delta); /* reg = known_reg; reg += delta */ - copy_register_state(reg, known_reg); + *reg = *known_reg; /* * Must preserve off, id and subreg_def flag, * otherwise another sync_linked_regs() will be incorrect. @@ -15743,10 +15737,10 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, } is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32; - copy_register_state(&env->false_reg1, dst_reg); - copy_register_state(&env->false_reg2, src_reg); - copy_register_state(&env->true_reg1, dst_reg); - copy_register_state(&env->true_reg2, src_reg); + env->false_reg1 = *dst_reg; + env->false_reg2 = *src_reg; + env->true_reg1 = *dst_reg; + env->true_reg2 = *src_reg; pred = is_branch_taken(env, dst_reg, src_reg, opcode, is_jmp32); if (pred >= 0) { /* If we get here with a dst_reg pointer type it is because @@ -15815,11 +15809,11 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, if (err) return err; - copy_register_state(dst_reg, &env->false_reg1); - copy_register_state(src_reg, &env->false_reg2); - copy_register_state(&other_branch_regs[insn->dst_reg], &env->true_reg1); + *dst_reg = env->false_reg1; + *src_reg = env->false_reg2; + other_branch_regs[insn->dst_reg] = env->true_reg1; if (BPF_SRC(insn->code) == BPF_X) - copy_register_state(&other_branch_regs[insn->src_reg], &env->true_reg2); + other_branch_regs[insn->src_reg] = env->true_reg2; if (BPF_SRC(insn->code) == BPF_X && src_reg->type == SCALAR_VALUE && src_reg->id && -- cgit v1.2.3 From 3ab5bd317ee280b198b00ea2114adaad7a458ef8 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 12 May 2026 21:50:10 -0700 Subject: bpf: Set sub->arg_cnt earlier in btf_prepare_func_args() Move the "sub->arg_cnt = nargs" assignment to immediately after nargs is computed from btf_type_vlen(), instead of at the end of btf_prepare_func_args(). btf_prepare_func_args() can return -EINVAL early in several cases, e.g. when a static function has some non-int/enum arguments. Since -EINVAL from btf_prepare_func_args() does not immediately reject verification, arg_cnt remains zero after the early return. This causes later stack argument based load/store insns to incorrectly assume the function has no arguments. Setting arg_cnt right after nargs ensures it is available regardless of which path btf_prepare_func_args() takes. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260513045010.2384635-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index a6bf4781943c..099d7ca5a980 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -7864,6 +7864,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) } args = (const struct btf_param *)(t + 1); nargs = btf_type_vlen(t); + sub->arg_cnt = nargs; if (nargs > MAX_BPF_FUNC_REG_ARGS) { if (!is_global) return -EINVAL; @@ -8051,7 +8052,6 @@ skip_pointer: return -EINVAL; } - sub->arg_cnt = nargs; sub->args_cached = true; return 0; -- cgit v1.2.3 From 0f6bd5e7a804af27e7f34b8306afde7a6b269318 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 12 May 2026 21:50:15 -0700 Subject: bpf: Support stack arguments for bpf functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently BPF functions (subprogs) are limited to 5 register arguments. With [1], the compiler can emit code that passes additional arguments via a dedicated stack area through bpf register BPF_REG_PARAMS (r11), introduced in an earlier patch ([2]). The compiler uses positive r11 offsets for incoming (callee-side) args and negative r11 offsets for outgoing (caller-side) args, following the x86_64/arm64 calling convention direction. There is an 8-byte gap at offset 0 separating two regions: Incoming (callee reads): r11+8 (arg6), r11+16 (arg7), ... Outgoing (caller writes): r11-8 (arg6), r11-16 (arg7), ... The following is an example to show how stack arguments are saved and transferred between caller and callee: int foo(int a1, int a2, int a3, int a4, int a5, int a6, int a7) { ... bar(a1, a2, a3, a4, a5, a6, a7, a8); ... } Caller (foo) Callee (bar) ============ ============ Incoming (positive offsets): Incoming (positive offsets): r11+8: [incoming arg 6] r11+8: [incoming arg 6] <-+ r11+16: [incoming arg 7] r11+16: [incoming arg 7] <-|+ r11+24: [incoming arg 8] <-||+ Outgoing (negative offsets): ||| r11-8: [outgoing arg 6 to bar] -------->-------------------------+|| r11-16: [outgoing arg 7 to bar] -------->--------------------------+| r11-24: [outgoing arg 8 to bar] -------->---------------------------+ If the bpf function has more than one call: int foo(int a1, int a2, int a3, int a4, int a5, int a6, int a7) { ... bar1(a1, a2, a3, a4, a5, a6, a7, a8); ... bar2(a1, a2, a3, a4, a5, a6, a7, a8, a9); ... } Caller (foo) Callee (bar2) ============ ============== Incoming (positive offsets): Incoming (positive offsets): r11+8: [incoming arg 6] r11+8: [incoming arg 6] <+ r11+16: [incoming arg 7] r11+16: [incoming arg 7] <|+ r11+24: [incoming arg 8] <||+ Outgoing for bar2 (negative offsets): r11+32: [incoming arg 9] <|||+ r11-8: [outgoing arg 6] ---->----------->-------------------------+||| r11-16: [outgoing arg 7] ---->----------->--------------------------+|| r11-24: [outgoing arg 8] ---->----------->---------------------------+| r11-32: [outgoing arg 9] ---->----------->----------------------------+ The verifier tracks outgoing stack arguments in stack_arg_regs[] and out_stack_arg_cnt in bpf_func_state, separately from the regular r10 stack. The callee does not copy incoming args — it reads them directly from the caller's outgoing slots at positive r11 offsets. Similar to stacksafe(), introduce stack_arg_safe() to do pruning check. Outgoing stack arg slots are invalidated when the callee returns (e.g. in prepare_func_exit), not at call time. This allows the callee to read incoming args from the caller's outgoing slots during verification. The following are a few examples. Example 1: *(u64 *)(r11 - 8) = r6; *(u64 *)(r11 - 16) = r7; call bar1; // arg6 = r6, arg7 = r7 call bar2; // expected with 2 stack arguments, failed Example 2: To fix the Example 1: *(u64 *)(r11 - 8) = r6; *(u64 *)(r11 - 16) = r7; call bar1; // arg6 = r6, arg7 = r7 *(u64 *)(r11 - 8) = r8; *(u64 *)(r11 - 16) = r9; call bar2; // arg6 = r8, arg7 = r9 Example 3: The compiler can hoist the shared stack arg stores above the branch: *(u64 *)(r11 - 16) = r7; if cond goto else; *(u64 *)(r11 - 8) = r8; call bar1; // arg6 = r8, arg7 = r7 goto end; else: *(u64 *)(r11 - 8) = r9; call bar2; // arg6 = r9, arg7 = r7 end: Example 4: Within a loop: loop: *(u64 *)(r11 - 8) = r6; // arg6, before loop call bar; // reuses arg6 each iteration if ... goto loop; A separate max_out_stack_arg_cnt field in bpf_subprog_info tracks the deepest outgoing slot actually written. This intends to reject programs that write to slots beyond what any callee expects. It is necessary for JIT. Similar to typical compiler generated code, enforce the following orderings: - all stack arg reads must be ahead of any stack arg write - all stack arg reads must be before any bpf func, kfunc and helpers This is needed as JIT may emit 'mov' insns for read/write with the same register and bpf function, kfunc and helper will invalidate all arguments immediately after the call. Callback functions with stack arguments need kernel setup parameter types (including stack parameters) properly and then callback function can retrieve such information for verification purpose. Global subprogs and freplace with >5 args are not yet supported. [1] https://github.com/llvm/llvm-project/pull/189060 [2] https://lore.kernel.org/bpf/20260423033506.2542005-1-yonghong.song@linux.dev/ Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260513045015.2385013-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 43 ++++++++- kernel/bpf/btf.c | 14 ++- kernel/bpf/fixups.c | 16 +++- kernel/bpf/states.c | 32 +++++++ kernel/bpf/verifier.c | 203 ++++++++++++++++++++++++++++++++++++++++++- 5 files changed, 294 insertions(+), 14 deletions(-) (limited to 'kernel/bpf') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 203fb751eeae..5398a02a1280 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -402,6 +402,7 @@ struct bpf_func_state { bool in_callback_fn; bool in_async_callback_fn; bool in_exception_callback_fn; + bool no_stack_arg_load; /* For callback calling functions that limit number of possible * callback executions (e.g. bpf_loop) keeps track of current * simulated iteration number. @@ -427,6 +428,9 @@ struct bpf_func_state { * `stack`. allocated_stack is always a multiple of BPF_REG_SIZE. */ int allocated_stack; + + u16 out_stack_arg_cnt; /* Number of outgoing on-stack argument slots */ + struct bpf_reg_state *stack_arg_regs; /* Outgoing on-stack arguments */ }; #define MAX_CALL_FRAMES 8 @@ -465,8 +469,10 @@ struct bpf_jmp_history_entry { u64 linked_regs; }; -/* Maximum number of register states that can exist at once */ -#define BPF_ID_MAP_SIZE ((MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE) * MAX_CALL_FRAMES) +/* Maximum number of bpf_reg_state objects that can exist at once */ +#define MAX_STACK_ARG_SLOTS (MAX_BPF_FUNC_ARGS - MAX_BPF_FUNC_REG_ARGS) +#define BPF_ID_MAP_SIZE ((MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE + \ + MAX_STACK_ARG_SLOTS) * MAX_CALL_FRAMES) struct bpf_verifier_state { /* call stack tracking */ struct bpf_func_state *frame[MAX_CALL_FRAMES]; @@ -561,12 +567,27 @@ bpf_get_spilled_reg(int slot, struct bpf_func_state *frame, u32 mask) return NULL; } +static inline struct bpf_reg_state * +bpf_get_spilled_stack_arg(int slot, struct bpf_func_state *frame) +{ + if (slot < frame->out_stack_arg_cnt && + frame->stack_arg_regs[slot].type != NOT_INIT) + return &frame->stack_arg_regs[slot]; + return NULL; +} + /* Iterate over 'frame', setting 'reg' to either NULL or a spilled register. */ #define bpf_for_each_spilled_reg(iter, frame, reg, mask) \ for (iter = 0, reg = bpf_get_spilled_reg(iter, frame, mask); \ iter < frame->allocated_stack / BPF_REG_SIZE; \ iter++, reg = bpf_get_spilled_reg(iter, frame, mask)) +/* Iterate over 'frame', setting 'reg' to either NULL or a spilled stack arg. */ +#define bpf_for_each_spilled_stack_arg(iter, frame, reg) \ + for (iter = 0, reg = bpf_get_spilled_stack_arg(iter, frame); \ + iter < frame->out_stack_arg_cnt; \ + iter++, reg = bpf_get_spilled_stack_arg(iter, frame)) + #define bpf_for_each_reg_in_vstate_mask(__vst, __state, __reg, __mask, __expr) \ ({ \ struct bpf_verifier_state *___vstate = __vst; \ @@ -584,6 +605,11 @@ bpf_get_spilled_reg(int slot, struct bpf_func_state *frame, u32 mask) continue; \ (void)(__expr); \ } \ + bpf_for_each_spilled_stack_arg(___j, __state, __reg) { \ + if (!__reg) \ + continue; \ + (void)(__expr); \ + } \ } \ }) @@ -815,12 +841,21 @@ struct bpf_subprog_info { bool keep_fastcall_stack: 1; bool changes_pkt_data: 1; bool might_sleep: 1; - u8 arg_cnt:3; + u8 arg_cnt:4; enum priv_stack_mode priv_stack_mode; - struct bpf_subprog_arg_info args[MAX_BPF_FUNC_REG_ARGS]; + struct bpf_subprog_arg_info args[MAX_BPF_FUNC_ARGS]; + u16 stack_arg_cnt; /* incoming + max outgoing */ + u16 max_out_stack_arg_cnt; }; +static inline u16 bpf_in_stack_arg_cnt(const struct bpf_subprog_info *sub) +{ + if (sub->arg_cnt > MAX_BPF_FUNC_REG_ARGS) + return sub->arg_cnt - MAX_BPF_FUNC_REG_ARGS; + return 0; +} + struct bpf_verifier_env; struct backtrack_state { diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 099d7ca5a980..4fb8641546b8 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -7865,10 +7865,16 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) args = (const struct btf_param *)(t + 1); nargs = btf_type_vlen(t); sub->arg_cnt = nargs; - if (nargs > MAX_BPF_FUNC_REG_ARGS) { - if (!is_global) - return -EINVAL; - bpf_log(log, "Global function %s() with %d > %d args. Buggy compiler.\n", + if (nargs > MAX_BPF_FUNC_ARGS) { + bpf_log(log, "kernel supports at most %d parameters, function %s has %d\n", + MAX_BPF_FUNC_ARGS, tname, nargs); + return -EFAULT; + } + if (nargs > MAX_BPF_FUNC_REG_ARGS) + sub->stack_arg_cnt = nargs - MAX_BPF_FUNC_REG_ARGS; + + if (is_global && nargs > MAX_BPF_FUNC_REG_ARGS) { + bpf_log(log, "global function %s has %d > %d args, stack args not supported\n", tname, nargs, MAX_BPF_FUNC_REG_ARGS); return -EINVAL; } diff --git a/kernel/bpf/fixups.c b/kernel/bpf/fixups.c index fba9e8c00878..ba86039789fd 100644 --- a/kernel/bpf/fixups.c +++ b/kernel/bpf/fixups.c @@ -1378,9 +1378,21 @@ int bpf_fixup_call_args(struct bpf_verifier_env *env) struct bpf_prog *prog = env->prog; struct bpf_insn *insn = prog->insnsi; bool has_kfunc_call = bpf_prog_has_kfunc_call(prog); - int i, depth; + int depth; #endif - int err = 0; + int i, err = 0; + + for (i = 0; i < env->subprog_cnt; i++) { + struct bpf_subprog_info *subprog = &env->subprog_info[i]; + u16 outgoing = subprog->stack_arg_cnt - bpf_in_stack_arg_cnt(subprog); + + if (subprog->max_out_stack_arg_cnt > outgoing) { + verbose(env, + "func#%d writes %u stack arg slots, but calls only require %u\n", + i, subprog->max_out_stack_arg_cnt, outgoing); + return -EINVAL; + } + } if (env->prog->jit_requested && !bpf_prog_is_offloaded(env->prog->aux)) { diff --git a/kernel/bpf/states.c b/kernel/bpf/states.c index bd9c22945050..3ce6d2652b27 100644 --- a/kernel/bpf/states.c +++ b/kernel/bpf/states.c @@ -833,6 +833,32 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, return true; } +/* + * Compare stack arg slots between old and current states. + * Outgoing stack args are path-local state and must agree for pruning. + */ +static bool stack_arg_safe(struct bpf_verifier_env *env, struct bpf_func_state *old, + struct bpf_func_state *cur, struct bpf_idmap *idmap, + enum exact_level exact) +{ + int i, nslots; + + nslots = max(old->out_stack_arg_cnt, cur->out_stack_arg_cnt); + for (i = 0; i < nslots; i++) { + struct bpf_reg_state *old_arg, *cur_arg; + struct bpf_reg_state not_init = { .type = NOT_INIT }; + + old_arg = i < old->out_stack_arg_cnt ? + &old->stack_arg_regs[i] : ¬_init; + cur_arg = i < cur->out_stack_arg_cnt ? + &cur->stack_arg_regs[i] : ¬_init; + if (!regsafe(env, old_arg, cur_arg, idmap, exact)) + return false; + } + + return true; +} + static bool refsafe(struct bpf_verifier_state *old, struct bpf_verifier_state *cur, struct bpf_idmap *idmap) { @@ -915,6 +941,9 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat if (old->callback_depth > cur->callback_depth) return false; + if (!old->no_stack_arg_load && cur->no_stack_arg_load) + return false; + for (i = 0; i < MAX_BPF_REG; i++) if (((1 << i) & live_regs) && !regsafe(env, &old->regs[i], &cur->regs[i], @@ -924,6 +953,9 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat if (!stacksafe(env, old, cur, &env->idmap_scratch, exact)) return false; + if (!stack_arg_safe(env, old, cur, &env->idmap_scratch, exact)) + return false; + return true; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1dd9736c2a13..a29b3003cbec 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1362,6 +1362,18 @@ static int copy_stack_state(struct bpf_func_state *dst, const struct bpf_func_st return -ENOMEM; dst->allocated_stack = src->allocated_stack; + + /* copy stack args state */ + n = src->out_stack_arg_cnt; + if (n) { + dst->stack_arg_regs = copy_array(dst->stack_arg_regs, src->stack_arg_regs, n, + sizeof(struct bpf_reg_state), + GFP_KERNEL_ACCOUNT); + if (!dst->stack_arg_regs) + return -ENOMEM; + } + + dst->out_stack_arg_cnt = src->out_stack_arg_cnt; return 0; } @@ -1403,6 +1415,23 @@ static int grow_stack_state(struct bpf_verifier_env *env, struct bpf_func_state return 0; } +static int grow_stack_arg_slots(struct bpf_verifier_env *env, + struct bpf_func_state *state, int cnt) +{ + size_t old_n = state->out_stack_arg_cnt; + + if (old_n >= cnt) + return 0; + + state->stack_arg_regs = realloc_array(state->stack_arg_regs, old_n, cnt, + sizeof(struct bpf_reg_state)); + if (!state->stack_arg_regs) + return -ENOMEM; + + state->out_stack_arg_cnt = cnt; + return 0; +} + /* Acquire a pointer id from the env and update the state->refs to include * this new pointer reference. * On success, returns a valid pointer id to associate with the register @@ -1565,6 +1594,7 @@ static void free_func_state(struct bpf_func_state *state) { if (!state) return; + kfree(state->stack_arg_regs); kfree(state->stack); kfree(state); } @@ -4050,6 +4080,103 @@ static int check_stack_write(struct bpf_verifier_env *env, return err; } +/* + * Write a value to the outgoing stack arg area. + * off is a negative offset from r11 (e.g. -8 for arg6, -16 for arg7). + */ +static int check_stack_arg_write(struct bpf_verifier_env *env, struct bpf_func_state *state, + int off, struct bpf_reg_state *value_reg) +{ + int max_stack_arg_regs = MAX_BPF_FUNC_ARGS - MAX_BPF_FUNC_REG_ARGS; + struct bpf_subprog_info *subprog = &env->subprog_info[state->subprogno]; + int spi = -off / BPF_REG_SIZE - 1; + struct bpf_reg_state *arg; + int err; + + if (spi >= max_stack_arg_regs) { + verbose(env, "stack arg write offset %d exceeds max %d stack args\n", + off, max_stack_arg_regs); + return -EINVAL; + } + + err = grow_stack_arg_slots(env, state, spi + 1); + if (err) + return err; + + /* Track the max outgoing stack arg slot count. */ + if (spi + 1 > subprog->max_out_stack_arg_cnt) + subprog->max_out_stack_arg_cnt = spi + 1; + + if (value_reg) { + state->stack_arg_regs[spi] = *value_reg; + } else { + /* BPF_ST: store immediate, treat as scalar */ + arg = &state->stack_arg_regs[spi]; + arg->type = SCALAR_VALUE; + __mark_reg_known(arg, env->prog->insnsi[env->insn_idx].imm); + } + state->no_stack_arg_load = true; + return 0; +} + +/* + * Read a value from the incoming stack arg area. + * off is a positive offset from r11 (e.g. +8 for arg6, +16 for arg7). + */ +static int check_stack_arg_read(struct bpf_verifier_env *env, struct bpf_func_state *state, + int off, int dst_regno) +{ + struct bpf_subprog_info *subprog = &env->subprog_info[state->subprogno]; + struct bpf_verifier_state *vstate = env->cur_state; + int spi = off / BPF_REG_SIZE - 1; + struct bpf_func_state *caller, *cur; + struct bpf_reg_state *arg; + + if (state->no_stack_arg_load) { + verbose(env, "r11 load must be before any r11 store or call insn\n"); + return -EINVAL; + } + + if (spi + 1 > bpf_in_stack_arg_cnt(subprog)) { + verbose(env, "invalid read from stack arg off %d depth %d\n", + off, bpf_in_stack_arg_cnt(subprog) * BPF_REG_SIZE); + return -EACCES; + } + + caller = vstate->frame[vstate->curframe - 1]; + arg = &caller->stack_arg_regs[spi]; + cur = vstate->frame[vstate->curframe]; + cur->regs[dst_regno] = *arg; + return 0; +} + +static int check_outgoing_stack_args(struct bpf_verifier_env *env, struct bpf_func_state *caller, + int nargs) +{ + int i, spi; + + for (i = MAX_BPF_FUNC_REG_ARGS; i < nargs; i++) { + spi = i - MAX_BPF_FUNC_REG_ARGS; + if (spi >= caller->out_stack_arg_cnt || + caller->stack_arg_regs[spi].type == NOT_INIT) { + verbose(env, "callee expects %d args, stack arg%d is not initialized\n", + nargs, spi + 1); + return -EFAULT; + } + } + + return 0; +} + +static struct bpf_reg_state *get_func_arg_reg(struct bpf_func_state *caller, + struct bpf_reg_state *regs, int arg) +{ + if (arg < MAX_BPF_FUNC_REG_ARGS) + return ®s[arg + 1]; + + return &caller->stack_arg_regs[arg - MAX_BPF_FUNC_REG_ARGS]; +} + static int check_map_access_type(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int off, int size, enum bpf_access_type type) { @@ -6217,10 +6344,20 @@ static int check_load_mem(struct bpf_verifier_env *env, struct bpf_insn *insn, bool strict_alignment_once, bool is_ldsx, bool allow_trust_mismatch, const char *ctx) { + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *regs = cur_regs(env); enum bpf_reg_type src_reg_type; int err; + /* Handle stack arg read */ + if (is_stack_arg_ldx(insn)) { + err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK); + if (err) + return err; + return check_stack_arg_read(env, state, insn->off, insn->dst_reg); + } + /* check src operand */ err = check_reg_arg(env, insn->src_reg, SRC_OP); if (err) @@ -6249,10 +6386,20 @@ static int check_load_mem(struct bpf_verifier_env *env, struct bpf_insn *insn, static int check_store_reg(struct bpf_verifier_env *env, struct bpf_insn *insn, bool strict_alignment_once) { + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *regs = cur_regs(env); enum bpf_reg_type dst_reg_type; int err; + /* Handle stack arg write */ + if (is_stack_arg_stx(insn)) { + err = check_reg_arg(env, insn->src_reg, SRC_OP); + if (err) + return err; + return check_stack_arg_write(env, state, insn->off, regs + insn->src_reg); + } + /* check src1 operand */ err = check_reg_arg(env, insn->src_reg, SRC_OP); if (err) @@ -8860,6 +9007,15 @@ static void clear_caller_saved_regs(struct bpf_verifier_env *env, } } +static void invalidate_outgoing_stack_args(const struct bpf_verifier_env *env, + struct bpf_func_state *state) +{ + int i, nslots = state->out_stack_arg_cnt; + + for (i = 0; i < nslots; i++) + bpf_mark_reg_not_init(env, &state->stack_arg_regs[i]); +} + typedef int (*set_callee_state_fn)(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee, @@ -8922,6 +9078,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *regs) { struct bpf_subprog_info *sub = subprog_info(env, subprog); + struct bpf_func_state *caller = cur_func(env); struct bpf_verifier_log *log = &env->log; u32 i; int ret; @@ -8930,13 +9087,16 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, if (ret) return ret; + ret = check_outgoing_stack_args(env, caller, sub->arg_cnt); + if (ret) + return ret; + /* check that BTF function arguments match actual types that the * verifier sees. */ for (i = 0; i < sub->arg_cnt; i++) { argno_t argno = argno_from_arg(i + 1); - u32 regno = i + 1; - struct bpf_reg_state *reg = ®s[regno]; + struct bpf_reg_state *reg = get_func_arg_reg(caller, regs, i); struct bpf_subprog_arg_info *arg = &sub->args[i]; if (arg->arg_type == ARG_ANYTHING) { @@ -9124,6 +9284,8 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx) { struct bpf_verifier_state *state = env->cur_state; + struct bpf_subprog_info *caller_info; + u16 callee_incoming, stack_arg_cnt; struct bpf_func_state *caller; int err, subprog, target_insn; @@ -9166,6 +9328,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, /* mark global subprog for verifying after main prog */ subprog_aux(env, subprog)->called = true; clear_caller_saved_regs(env, caller->regs); + invalidate_outgoing_stack_args(env, cur_func(env)); /* All non-void global functions return a 64-bit SCALAR_VALUE. */ if (!subprog_returns_void(env, subprog)) { @@ -9177,6 +9340,16 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return 0; } + /* + * Track caller's total stack arg count (incoming + max outgoing). + * This is needed so the JIT knows how much stack arg space to allocate. + */ + caller_info = &env->subprog_info[caller->subprogno]; + callee_incoming = bpf_in_stack_arg_cnt(&env->subprog_info[subprog]); + stack_arg_cnt = bpf_in_stack_arg_cnt(caller_info) + callee_incoming; + if (stack_arg_cnt > caller_info->stack_arg_cnt) + caller_info->stack_arg_cnt = stack_arg_cnt; + /* for regular function entry setup new frame and continue * from that frame. */ @@ -9534,6 +9707,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) * bpf_throw, this will be done by copy_verifier_state for extra frames. */ free_func_state(callee); state->frame[state->curframe--] = NULL; + invalidate_outgoing_stack_args(env, caller); /* for callbacks widen imprecise scalars to make programs like below verify: * @@ -10160,6 +10334,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn bpf_mark_reg_not_init(env, ®s[caller_saved[i]]); check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); } + invalidate_outgoing_stack_args(env, cur_func(env)); /* helper call returns 64-bit value. */ regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG; @@ -12842,6 +13017,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, bpf_mark_reg_not_init(env, ®s[regno]); regs[regno].subreg_def = DEF_NOT_SUBREG; } + invalidate_outgoing_stack_args(env, cur_func(env)); /* Check return type */ t = btf_type_skip_modifiers(desc_btf, meta.func_proto->type, NULL); @@ -16961,6 +17137,14 @@ static int do_check_insn(struct bpf_verifier_env *env, bool *do_print_state) return check_store_reg(env, insn, false); case BPF_ST: { + /* Handle stack arg write (store immediate) */ + if (is_stack_arg_st(insn)) { + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; + + return check_stack_arg_write(env, state, insn->off, NULL); + } + enum bpf_reg_type dst_reg_type; err = check_reg_arg(env, insn->dst_reg, SRC_OP); @@ -16995,6 +17179,8 @@ static int do_check_insn(struct bpf_verifier_env *env, bool *do_print_state) } } mark_reg_scratched(env, BPF_REG_0); + if (bpf_in_stack_arg_cnt(&env->subprog_info[cur_func(env)->subprogno])) + cur_func(env)->no_stack_arg_load = true; if (insn->src_reg == BPF_PSEUDO_CALL) return check_func_call(env, insn, &env->insn_idx); if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) @@ -18110,7 +18296,7 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) goto out; } } - for (i = BPF_REG_1; i <= sub->arg_cnt; i++) { + for (i = BPF_REG_1; i <= min_t(u32, sub->arg_cnt, MAX_BPF_FUNC_REG_ARGS); i++) { arg = &sub->args[i - BPF_REG_1]; reg = ®s[i]; @@ -18153,6 +18339,12 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) goto out; } } + if (env->prog->type == BPF_PROG_TYPE_EXT && sub->arg_cnt > MAX_BPF_FUNC_REG_ARGS) { + verbose(env, "freplace programs with >%d args not supported yet\n", + MAX_BPF_FUNC_REG_ARGS); + ret = -EINVAL; + goto out; + } } else { /* if main BPF program has associated BTF info, validate that * it's matching expected signature, and otherwise mark BTF @@ -18160,8 +18352,11 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) */ if (env->prog->aux->func_info_aux) { ret = btf_prepare_func_args(env, 0); - if (ret || sub->arg_cnt != 1 || sub->args[0].arg_type != ARG_PTR_TO_CTX) + if (ret || sub->arg_cnt != 1 || sub->args[0].arg_type != ARG_PTR_TO_CTX) { env->prog->aux->func_info_aux[0].unreliable = true; + sub->arg_cnt = 1; + sub->stack_arg_cnt = 0; + } } /* 1st arg to a function */ -- cgit v1.2.3 From 3a656670fd6da624f6241038ca4cf350f24fd5e8 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 12 May 2026 21:50:20 -0700 Subject: bpf: Refactor jmp history to use dedicated spi/frame fields Move stack slot index (spi) and frame number out of the flags field in bpf_jmp_history_entry into dedicated bitfields. This simplifies the encoding and makes room for new flags. Previously, spi and frame were packed into the lower 9 bits of the 12-bit flags field (3 bits frame + 6 bits spi), with INSN_F_STACK_ACCESS at BIT(9) and INSN_F_DST/SRC_REG_STACK at BIT(10)/BIT(11). But this has no room for an INSN_F_* flag for stack arguments. To resolve this issue, bpf_jmp_history_entry field idx is narrowed to 20 bits (sufficient for insn indices up to 1M), and the freed bits hold spi (6 bits) and frame (3 bits) as dedicated struct fields. The flags enum is simplified accordingly: INSN_F_STACK_ACCESS -> BIT(0) INSN_F_DST_REG_STACK -> BIT(1) INSN_F_SRC_REG_STACK -> BIT(2) which allows more room for additional INSN_F_* flags. bpf_push_jmp_history() now takes explicit spi and frame parameters instead of encoding them into flags. The insn_stack_access_flags(), insn_stack_access_spi(), and insn_stack_access_frameno() helpers are removed. No functional change. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260513045020.2385962-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 37 ++++++++++++++++--------------------- kernel/bpf/backtrack.c | 24 +++++++++--------------- kernel/bpf/states.c | 2 +- kernel/bpf/verifier.c | 23 +++++++++++------------ 4 files changed, 37 insertions(+), 49 deletions(-) (limited to 'kernel/bpf') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 5398a02a1280..3ec338169981 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -435,40 +435,35 @@ struct bpf_func_state { #define MAX_CALL_FRAMES 8 -/* instruction history flags, used in bpf_jmp_history_entry.flags field */ +/* instruction history flags, used in bpf_jmp_history_entry.flags field. + * Frame number and SPI are stored in dedicated fields of bpf_jmp_history_entry. + */ enum { - /* instruction references stack slot through PTR_TO_STACK register; - * we also store stack's frame number in lower 3 bits (MAX_CALL_FRAMES is 8) - * and accessed stack slot's index in next 6 bits (MAX_BPF_STACK is 512, - * 8 bytes per slot, so slot index (spi) is [0, 63]) - */ - INSN_F_FRAMENO_MASK = 0x7, /* 3 bits */ - - INSN_F_SPI_MASK = 0x3f, /* 6 bits */ - INSN_F_SPI_SHIFT = 3, /* shifted 3 bits to the left */ + INSN_F_STACK_ACCESS = BIT(0), - INSN_F_STACK_ACCESS = BIT(9), - - INSN_F_DST_REG_STACK = BIT(10), /* dst_reg is PTR_TO_STACK */ - INSN_F_SRC_REG_STACK = BIT(11), /* src_reg is PTR_TO_STACK */ - /* total 12 bits are used now. */ + INSN_F_DST_REG_STACK = BIT(1), /* dst_reg is PTR_TO_STACK */ + INSN_F_SRC_REG_STACK = BIT(2), /* src_reg is PTR_TO_STACK */ }; -static_assert(INSN_F_FRAMENO_MASK + 1 >= MAX_CALL_FRAMES); -static_assert(INSN_F_SPI_MASK + 1 >= MAX_BPF_STACK / 8); - struct bpf_jmp_history_entry { - u32 idx; /* insn idx can't be bigger than 1 million */ + u32 idx : 20; + u32 frame : 3; /* stack access frame number */ + u32 spi : 6; /* stack slot index (0..63) */ + u32 : 3; u32 prev_idx : 20; /* special INSN_F_xxx flags */ - u32 flags : 12; + u32 flags : 4; + u32 : 8; /* additional registers that need precision tracking when this * jump is backtracked, vector of six 10-bit records */ u64 linked_regs; }; +static_assert(MAX_CALL_FRAMES <= (1 << 3)); +static_assert(MAX_BPF_STACK / 8 <= (1 << 6)); + /* Maximum number of bpf_reg_state objects that can exist at once */ #define MAX_STACK_ARG_SLOTS (MAX_BPF_FUNC_ARGS - MAX_BPF_FUNC_REG_ARGS) #define BPF_ID_MAP_SIZE ((MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE + \ @@ -1198,7 +1193,7 @@ struct list_head *bpf_explored_state(struct bpf_verifier_env *env, int idx); void bpf_free_verifier_state(struct bpf_verifier_state *state, bool free_self); void bpf_free_backedges(struct bpf_scc_visit *visit); int bpf_push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur, - int insn_flags, u64 linked_regs); + int insn_flags, int spi, int frame, u64 linked_regs); void bpf_bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_jmp_history_entry *hist); void bpf_mark_reg_not_init(const struct bpf_verifier_env *env, struct bpf_reg_state *reg); diff --git a/kernel/bpf/backtrack.c b/kernel/bpf/backtrack.c index 854731dc93fe..5e93e57fb7ae 100644 --- a/kernel/bpf/backtrack.c +++ b/kernel/bpf/backtrack.c @@ -9,7 +9,7 @@ /* for any branch, call, exit record the history of jmps in the given state */ int bpf_push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur, - int insn_flags, u64 linked_regs) + int insn_flags, int spi, int frame, u64 linked_regs) { u32 cnt = cur->jmp_history_cnt; struct bpf_jmp_history_entry *p; @@ -25,6 +25,8 @@ int bpf_push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state env, "insn history: insn_idx %d cur flags %x new flags %x", env->insn_idx, env->cur_hist_ent->flags, insn_flags); env->cur_hist_ent->flags |= insn_flags; + env->cur_hist_ent->spi = spi; + env->cur_hist_ent->frame = frame; verifier_bug_if(env->cur_hist_ent->linked_regs != 0, env, "insn history: insn_idx %d linked_regs: %#llx", env->insn_idx, env->cur_hist_ent->linked_regs); @@ -43,6 +45,8 @@ int bpf_push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state p->idx = env->insn_idx; p->prev_idx = env->prev_insn_idx; p->flags = insn_flags; + p->spi = spi; + p->frame = frame; p->linked_regs = linked_regs; cur->jmp_history_cnt = cnt; env->cur_hist_ent = p; @@ -64,16 +68,6 @@ static bool is_atomic_fetch_insn(const struct bpf_insn *insn) (insn->imm & BPF_FETCH); } -static int insn_stack_access_spi(int insn_flags) -{ - return (insn_flags >> INSN_F_SPI_SHIFT) & INSN_F_SPI_MASK; -} - -static int insn_stack_access_frameno(int insn_flags) -{ - return insn_flags & INSN_F_FRAMENO_MASK; -} - /* Backtrack one insn at a time. If idx is not at the top of recorded * history then previous instruction came from straight line execution. * Return -ENOENT if we exhausted all instructions within given state. @@ -353,8 +347,8 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, * that [fp - off] slot contains scalar that needs to be * tracked with precision */ - spi = insn_stack_access_spi(hist->flags); - fr = insn_stack_access_frameno(hist->flags); + spi = hist->spi; + fr = hist->frame; bpf_bt_set_frame_slot(bt, fr, spi); } else if (class == BPF_STX || class == BPF_ST) { if (bt_is_reg_set(bt, dreg)) @@ -366,8 +360,8 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, /* scalars can only be spilled into stack */ if (!hist || !(hist->flags & INSN_F_STACK_ACCESS)) return 0; - spi = insn_stack_access_spi(hist->flags); - fr = insn_stack_access_frameno(hist->flags); + spi = hist->spi; + fr = hist->frame; if (!bt_is_frame_slot_set(bt, fr, spi)) return 0; bt_clear_frame_slot(bt, fr, spi); diff --git a/kernel/bpf/states.c b/kernel/bpf/states.c index 3ce6d2652b27..877338136009 100644 --- a/kernel/bpf/states.c +++ b/kernel/bpf/states.c @@ -1403,7 +1403,7 @@ hit: */ err = 0; if (bpf_is_jmp_point(env, env->insn_idx)) - err = bpf_push_jmp_history(env, cur, 0, 0); + err = bpf_push_jmp_history(env, cur, 0, 0, 0, 0); err = err ? : propagate_precision(env, &sl->state, cur, NULL); if (err) return err; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a29b3003cbec..d15aef2fe4a1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3198,11 +3198,6 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, return __check_reg_arg(env, state->regs, regno, t); } -static int insn_stack_access_flags(int frameno, int spi) -{ - return INSN_F_STACK_ACCESS | (spi << INSN_F_SPI_SHIFT) | frameno; -} - static void mark_indirect_target(struct bpf_verifier_env *env, int idx) { env->insn_aux_data[idx].indirect_target = true; @@ -3517,7 +3512,8 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err; struct bpf_insn *insn = &env->prog->insnsi[insn_idx]; struct bpf_reg_state *reg = NULL; - int insn_flags = insn_stack_access_flags(state->frameno, spi); + int insn_flags = INSN_F_STACK_ACCESS; + int hist_spi = spi, hist_frame = state->frameno; /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, * so it's aligned access and [off, off + size) are within stack limits @@ -3613,7 +3609,8 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, } if (insn_flags) - return bpf_push_jmp_history(env, env->cur_state, insn_flags, 0); + return bpf_push_jmp_history(env, env->cur_state, insn_flags, + hist_spi, hist_frame, 0); return 0; } @@ -3809,7 +3806,8 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, int i, slot = -off - 1, spi = slot / BPF_REG_SIZE; struct bpf_reg_state *reg; u8 *stype, type; - int insn_flags = insn_stack_access_flags(reg_state->frameno, spi); + int insn_flags = INSN_F_STACK_ACCESS; + int hist_spi = spi, hist_frame = reg_state->frameno; stype = reg_state->stack[spi].slot_type; reg = ®_state->stack[spi].spilled_ptr; @@ -3940,7 +3938,8 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, insn_flags = 0; /* we are not restoring spilled register */ } if (insn_flags) - return bpf_push_jmp_history(env, env->cur_state, insn_flags, 0); + return bpf_push_jmp_history(env, env->cur_state, insn_flags, + hist_spi, hist_frame, 0); return 0; } @@ -15907,7 +15906,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, } if (insn_flags) { - err = bpf_push_jmp_history(env, this_branch, insn_flags, 0); + err = bpf_push_jmp_history(env, this_branch, insn_flags, 0, 0, 0); if (err) return err; } @@ -15971,7 +15970,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, if (dst_reg->type == SCALAR_VALUE && dst_reg->id) collect_linked_regs(env, this_branch, dst_reg->id, &linked_regs); if (linked_regs.cnt > 1) { - err = bpf_push_jmp_history(env, this_branch, 0, linked_regs_pack(&linked_regs)); + err = bpf_push_jmp_history(env, this_branch, 0, 0, 0, linked_regs_pack(&linked_regs)); if (err) return err; } @@ -17278,7 +17277,7 @@ static int do_check(struct bpf_verifier_env *env) } if (bpf_is_jmp_point(env, env->insn_idx)) { - err = bpf_push_jmp_history(env, state, 0, 0); + err = bpf_push_jmp_history(env, state, 0, 0, 0, 0); if (err) return err; } -- cgit v1.2.3 From 0a0fdc64b68c28dab40f9deb0cffdf544e04b0ba Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 12 May 2026 21:50:25 -0700 Subject: bpf: Add precision marking and backtracking for stack argument slots Extend the precision marking and backtracking infrastructure to support stack argument slots (r11-based accesses). Without this, precision demands for scalar values passed through stack arguments are silently dropped, which could allow the verifier to incorrectly prune states with different constant values in stack arg slots. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260513045025.2387526-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 8 ++++++ kernel/bpf/backtrack.c | 58 +++++++++++++++++++++++++++++++++++++++++++- kernel/bpf/verifier.c | 32 ++++++++++++++++++++---- 3 files changed, 92 insertions(+), 6 deletions(-) (limited to 'kernel/bpf') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 3ec338169981..6f12fc40b682 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -443,6 +443,8 @@ enum { INSN_F_DST_REG_STACK = BIT(1), /* dst_reg is PTR_TO_STACK */ INSN_F_SRC_REG_STACK = BIT(2), /* src_reg is PTR_TO_STACK */ + + INSN_F_STACK_ARG_ACCESS = BIT(3), }; struct bpf_jmp_history_entry { @@ -858,6 +860,7 @@ struct backtrack_state { u32 frame; u32 reg_masks[MAX_CALL_FRAMES]; u64 stack_masks[MAX_CALL_FRAMES]; + u8 stack_arg_masks[MAX_CALL_FRAMES]; }; struct bpf_id_pair { @@ -1256,6 +1259,11 @@ static inline void bpf_bt_set_frame_slot(struct backtrack_state *bt, u32 frame, bt->stack_masks[frame] |= 1ull << slot; } +static inline void bt_set_frame_stack_arg_slot(struct backtrack_state *bt, u32 frame, u32 slot) +{ + bt->stack_arg_masks[frame] |= 1 << slot; +} + static inline bool bt_is_frame_reg_set(struct backtrack_state *bt, u32 frame, u32 reg) { return bt->reg_masks[frame] & (1 << reg); diff --git a/kernel/bpf/backtrack.c b/kernel/bpf/backtrack.c index 5e93e57fb7ae..2e4ae0ef0860 100644 --- a/kernel/bpf/backtrack.c +++ b/kernel/bpf/backtrack.c @@ -129,11 +129,21 @@ static inline u32 bt_empty(struct backtrack_state *bt) int i; for (i = 0; i <= bt->frame; i++) - mask |= bt->reg_masks[i] | bt->stack_masks[i]; + mask |= bt->reg_masks[i] | bt->stack_masks[i] | bt->stack_arg_masks[i]; return mask == 0; } +static inline void bt_clear_frame_stack_arg_slot(struct backtrack_state *bt, u32 frame, u32 slot) +{ + bt->stack_arg_masks[frame] &= ~(1 << slot); +} + +static inline bool bt_is_frame_stack_arg_slot_set(struct backtrack_state *bt, u32 frame, u32 slot) +{ + return bt->stack_arg_masks[frame] & (1 << slot); +} + static inline int bt_subprog_enter(struct backtrack_state *bt) { if (bt->frame == MAX_CALL_FRAMES - 1) { @@ -194,6 +204,11 @@ static inline u64 bt_stack_mask(struct backtrack_state *bt) return bt->stack_masks[bt->frame]; } +static inline u8 bt_stack_arg_mask(struct backtrack_state *bt) +{ + return bt->stack_arg_masks[bt->frame]; +} + static inline bool bt_is_reg_set(struct backtrack_state *bt, u32 reg) { return bt->reg_masks[bt->frame] & (1 << reg); @@ -335,6 +350,19 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, return 0; bt_clear_reg(bt, load_reg); + if (hist && hist->flags & INSN_F_STACK_ARG_ACCESS) { + spi = hist->spi; + /* + * Stack arg read: callee reads from r11+off, but + * the data lives in the caller's stack_arg_regs. + * Set the mask in the caller frame so precision + * is marked in the caller's slot at the callee + * entry checkpoint. + */ + bt_set_frame_stack_arg_slot(bt, bt->frame - 1, spi); + return 0; + } + /* scalars can only be spilled into stack w/o losing precision. * Load from any other memory can be zero extended. * The desire to keep that precision is already indicated @@ -357,6 +385,17 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, * encountered a case of pointer subtraction. */ return -ENOTSUPP; + + if (hist && hist->flags & INSN_F_STACK_ARG_ACCESS) { + spi = hist->spi; + if (!bt_is_frame_stack_arg_slot_set(bt, bt->frame, spi)) + return 0; + bt_clear_frame_stack_arg_slot(bt, bt->frame, spi); + if (class == BPF_STX) + bt_set_reg(bt, sreg); + return 0; + } + /* scalars can only be spilled into stack */ if (!hist || !(hist->flags & INSN_F_STACK_ACCESS)) return 0; @@ -425,6 +464,12 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, bpf_bt_set_frame_reg(bt, bt->frame - 1, i); } } + if (bt_stack_arg_mask(bt)) { + verifier_bug(env, + "static subprog leftover stack arg slots %x", + bt_stack_arg_mask(bt)); + return -EFAULT; + } if (bt_subprog_exit(bt)) return -EFAULT; return 0; @@ -895,6 +940,17 @@ int bpf_mark_chain_precision(struct bpf_verifier_env *env, *changed = true; } } + for (i = 0; i < func->out_stack_arg_cnt; i++) { + if (!bt_is_frame_stack_arg_slot_set(bt, fr, i)) + continue; + reg = &func->stack_arg_regs[i]; + if (reg->type != SCALAR_VALUE || reg->precise) { + bt_clear_frame_stack_arg_slot(bt, fr, i); + } else { + reg->precise = true; + *changed = true; + } + } if (env->log.level & BPF_LOG_LEVEL2) { fmt_reg_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_frame_reg_mask(bt, fr)); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d15aef2fe4a1..ebd13661933e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -292,6 +292,11 @@ static int arg_from_argno(argno_t a) return -1; } +static int arg_idx_from_argno(argno_t a) +{ + return arg_from_argno(a) - 1; +} + static const char *btf_type_name(const struct btf *btf, u32 id) { return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off); @@ -4115,7 +4120,8 @@ static int check_stack_arg_write(struct bpf_verifier_env *env, struct bpf_func_s __mark_reg_known(arg, env->prog->insnsi[env->insn_idx].imm); } state->no_stack_arg_load = true; - return 0; + return bpf_push_jmp_history(env, env->cur_state, + INSN_F_STACK_ARG_ACCESS, spi, 0, 0); } /* @@ -4146,7 +4152,17 @@ static int check_stack_arg_read(struct bpf_verifier_env *env, struct bpf_func_st arg = &caller->stack_arg_regs[spi]; cur = vstate->frame[vstate->curframe]; cur->regs[dst_regno] = *arg; - return 0; + return bpf_push_jmp_history(env, env->cur_state, + INSN_F_STACK_ARG_ACCESS, spi, 0, 0); +} + +static int mark_stack_arg_precision(struct bpf_verifier_env *env, int arg_idx) +{ + struct bpf_func_state *caller = cur_func(env); + int spi = arg_idx - MAX_BPF_FUNC_REG_ARGS; + + bt_set_frame_stack_arg_slot(&env->bt, caller->frameno, spi); + return mark_chain_precision_batch(env, env->cur_state); } static int check_outgoing_stack_args(struct bpf_verifier_env *env, struct bpf_func_state *caller, @@ -6875,8 +6891,14 @@ static int check_mem_size_reg(struct bpf_verifier_env *env, } err = check_helper_mem_access(env, mem_reg, mem_argno, reg_umax(size_reg), access_type, zero_size_allowed, meta); - if (!err) - err = mark_chain_precision(env, reg_from_argno(size_argno)); + if (!err) { + int regno = reg_from_argno(size_argno); + + if (regno >= 0) + err = mark_chain_precision(env, regno); + else + err = mark_stack_arg_precision(env, arg_idx_from_argno(size_argno)); + } return err; } @@ -7325,7 +7347,7 @@ static int process_iter_arg(struct bpf_verifier_env *env, struct bpf_reg_state * struct bpf_kfunc_call_arg_meta *meta) { const struct btf_type *t; - u32 arg_idx = arg_from_argno(argno) - 1; + u32 arg_idx = arg_idx_from_argno(argno); int spi, err, i, nr_slots, btf_id; if (reg->type != PTR_TO_STACK) { -- cgit v1.2.3 From 84dd7df76efef9fecb6f3e0defe2ea3ad89cd3cb Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 12 May 2026 21:50:30 -0700 Subject: bpf: Refactor record_call_access() to extract per-arg logic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extract the per-argument FP-derived pointer handling from record_call_access() into a new record_arg_access() helper. The existing loop body — checking arg_is_fp, querying stack access bytes, and calling record_stack_access/record_imprecise — will be reused for stack argument slots in the next patch. Factoring it out now avoids duplicating the logic. No functional change. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260513045030.2388067-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/liveness.c | 65 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 38 insertions(+), 27 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/liveness.c b/kernel/bpf/liveness.c index 58197d73b120..c81337dfbfc7 100644 --- a/kernel/bpf/liveness.c +++ b/kernel/bpf/liveness.c @@ -1343,6 +1343,42 @@ static int record_load_store_access(struct bpf_verifier_env *env, return 0; } +static int record_arg_access(struct bpf_verifier_env *env, + struct func_instance *instance, + struct bpf_insn *insn, + struct arg_track *at, int arg_idx, + int insn_idx) +{ + int depth = instance->depth; + int frame = at->frame; + int err = 0; + s64 bytes; + + if (!arg_is_fp(at)) + return 0; + + if (bpf_helper_call(insn)) { + bytes = bpf_helper_stack_access_bytes(env, insn, arg_idx, insn_idx); + } else if (bpf_pseudo_kfunc_call(insn)) { + bytes = bpf_kfunc_stack_access_bytes(env, insn, arg_idx, insn_idx); + } else { + for (int f = 0; f <= depth; f++) { + err = mark_stack_read(instance, f, insn_idx, SPIS_ALL); + if (err) + return err; + } + return 0; + } + if (bytes == 0) + return 0; + + if (frame >= 0 && frame <= depth) + err = record_stack_access(instance, at, bytes, frame, insn_idx); + else if (frame == ARG_IMPRECISE) + err = record_imprecise(instance, at->mask, insn_idx); + return err; +} + /* Record stack access for a given 'at' state of helper/kfunc 'insn' */ static int record_call_access(struct bpf_verifier_env *env, struct func_instance *instance, @@ -1350,9 +1386,8 @@ static int record_call_access(struct bpf_verifier_env *env, int insn_idx) { struct bpf_insn *insn = &env->prog->insnsi[insn_idx]; - int depth = instance->depth; struct bpf_call_summary cs; - int r, err = 0, num_params = 5; + int r, err, num_params = 5; if (bpf_pseudo_call(insn)) return 0; @@ -1361,31 +1396,7 @@ static int record_call_access(struct bpf_verifier_env *env, num_params = cs.num_params; for (r = BPF_REG_1; r < BPF_REG_1 + num_params; r++) { - int frame = at[r].frame; - s64 bytes; - - if (!arg_is_fp(&at[r])) - continue; - - if (bpf_helper_call(insn)) { - bytes = bpf_helper_stack_access_bytes(env, insn, r - 1, insn_idx); - } else if (bpf_pseudo_kfunc_call(insn)) { - bytes = bpf_kfunc_stack_access_bytes(env, insn, r - 1, insn_idx); - } else { - for (int f = 0; f <= depth; f++) { - err = mark_stack_read(instance, f, insn_idx, SPIS_ALL); - if (err) - return err; - } - return 0; - } - if (bytes == 0) - continue; - - if (frame >= 0 && frame <= depth) - err = record_stack_access(instance, &at[r], bytes, frame, insn_idx); - else if (frame == ARG_IMPRECISE) - err = record_imprecise(instance, at[r].mask, insn_idx); + err = record_arg_access(env, instance, insn, &at[r], r - 1, insn_idx); if (err) return err; } -- cgit v1.2.3 From f44e815e65a30f465fe4dd793df8cb98f1f9c0b1 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 12 May 2026 21:50:35 -0700 Subject: bpf: Use arg_is_fp() in has_fp_args() Replace "frame != ARG_NONE" with arg_is_fp() in has_fp_args(). The function's purpose is to check whether any argument is derived from a frame pointer, which is exactly what arg_is_fp() tests (frame >= 0 || frame == ARG_IMPRECISE). Using the dedicated predicate is clearer and more consistent with the rest of the file. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260513045035.2388671-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/liveness.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/liveness.c b/kernel/bpf/liveness.c index c81337dfbfc7..13dc5ae44d2b 100644 --- a/kernel/bpf/liveness.c +++ b/kernel/bpf/liveness.c @@ -1689,7 +1689,7 @@ err_free: static bool has_fp_args(struct arg_track *args) { for (int r = BPF_REG_1; r <= BPF_REG_5; r++) - if (args[r].frame != ARG_NONE) + if (arg_is_fp(&args[r])) return true; return false; } -- cgit v1.2.3 From 2af4e792773f9fc05e5dbd5f297707cfe15cd817 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 12 May 2026 21:50:40 -0700 Subject: bpf: Extend liveness analysis to track stack argument slots BPF_REG_PARAMS (R11) is at index MAX_BPF_REG, which is beyond the register tracking arrays in const_fold.c and liveness.c. Handle it explicitly to avoid out-of-bounds accesses. Extend the arg tracking dataflow to cover stack arg slots. Otherwise, pointers passed through stack args are invisible to liveness, causing the pointed-to stack slots to be incorrectly poisoned. Extend the at_out tracking array to MAX_AT_TRACK_REGS (registers plus stack arg slots) so that outgoing stack arg stores are tracked alongside registers. Add a separate at_stack_arg_entry array in compute_subprog_args(), passed to arg_track_xfer(), to restore FP-derived values on incoming stack arg reads. Extend record_call_access() to check stack arg slots for FP-derived pointers at kfunc call sites, reusing the record_arg_access() helper extracted in the previous patch. Pass stack arg state from caller to callee in analyze_subprog() so that callees can track pointers received through stack args, hence avoid poisoning. Skip stack arg instructions in record_load_store_access(). Stack arg STX uses dst_reg=BPF_REG_PARAMS (index 11), but at[11] is repurposed to track the value stored in stack arg slot 0. Without the skip, if a prior stack arg STX stored an FP-derived pointer (e.g., fp-64) into slot 0, a subsequent stack arg STX would read that FP-derived value as the base pointer and spuriously mark a regular stack slot (e.g., fp-72 from -64 + -8) as accessed in the liveness bitmap. Extend arg_track_log() to log state transitions for outgoing stack arg slots at indices MAX_BPF_REG through MAX_AT_TRACK_REGS-1. Without this, changes to at_out[11..17] caused by stack arg store instructions are silently omitted from BPF_LOG_LEVEL2 output. For example, when a caller passes fp-64 through a stack argument: subprog#0: 10: (bf) r6 = r10 11: (07) r6 += -64 12: (7b) *(u64 *)(r11 -8) = r6 sa0: none -> fp0-64 13: (85) call pc+5 Without the fix, the "sa0: none -> fp0-64" transition at insn 12 would not appear. Extend print_subprog_arg_access() to include stack arg slots in the per-instruction FP-derived state dump. For example: subprog#0: 12: (7b) *(u64 *)(r11 - 8) = r6 // r6=fp0-64 13: (85) call pc+5 // r6=fp0-64 sa0=fp0-64 Without the fix, the "sa0=fp0-64" annotation at insn 13 would not appear, making it harder to debug liveness analysis for programs that pass FP-derived pointers through stack arguments. Extend has_fp_args() to also check stack arg slots for FP-derived pointers, so that callees receiving pointers only through stack args are still recursively analyzed. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260513045043.2389049-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/const_fold.c | 8 ++++ kernel/bpf/liveness.c | 114 +++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 106 insertions(+), 16 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/const_fold.c b/kernel/bpf/const_fold.c index db73c4740b1e..b2a19acadb91 100644 --- a/kernel/bpf/const_fold.c +++ b/kernel/bpf/const_fold.c @@ -58,6 +58,14 @@ static void const_reg_xfer(struct bpf_verifier_env *env, struct const_arg_info * u8 opcode = BPF_OP(insn->code) | BPF_SRC(insn->code); int r; + /* Stack arg stores (r11-based) are outside the tracked register set. */ + if (is_stack_arg_st(insn) || is_stack_arg_stx(insn)) + return; + if (is_stack_arg_ldx(insn)) { + ci_out[insn->dst_reg] = unknown; + return; + } + switch (class) { case BPF_ALU: case BPF_ALU64: diff --git a/kernel/bpf/liveness.c b/kernel/bpf/liveness.c index 13dc5ae44d2b..7f4a0e4c2c49 100644 --- a/kernel/bpf/liveness.c +++ b/kernel/bpf/liveness.c @@ -610,6 +610,21 @@ enum arg_track_state { /* Track callee stack slots fp-8 through fp-512 (64 slots of 8 bytes each) */ #define MAX_ARG_SPILL_SLOTS 64 +/* + * Combined register + stack arg tracking: R0-R10 at indices 0-10, + * outgoing stack arg slots at indices MAX_BPF_REG..MAX_BPF_REG+6. + */ +#define MAX_AT_TRACK_REGS (MAX_BPF_REG + MAX_STACK_ARG_SLOTS) + +static int stack_arg_off_to_slot(s16 off) +{ + int aoff = off < 0 ? -off : off; + + if (aoff / 8 > MAX_STACK_ARG_SLOTS) + return -1; + return aoff / 8 - 1; +} + static bool arg_is_visited(const struct arg_track *at) { return at->frame != ARG_UNVISITED; @@ -1032,6 +1047,21 @@ static void arg_track_log(struct bpf_verifier_env *env, struct bpf_insn *insn, i verbose(env, "\tr%d: ", i); verbose_arg_track(env, &at_in[i]); verbose(env, " -> "); verbose_arg_track(env, &at_out[i]); } + /* Log outgoing stack arg slot transitions at indices MAX_BPF_REG..MAX_AT_TRACK_REGS-1 */ + for (i = 0; i < MAX_STACK_ARG_SLOTS; i++) { + int ai = MAX_BPF_REG + i; + + if (arg_track_eq(&at_out[ai], &at_in[ai])) + continue; + if (!printed) { + verbose(env, "%3d: ", idx); + bpf_verbose_insn(env, insn); + bpf_vlog_reset(&env->log, env->log.end_pos - 1); + printed = true; + } + verbose(env, "\tsa%d: ", i); verbose_arg_track(env, &at_in[ai]); + verbose(env, " -> "); verbose_arg_track(env, &at_out[ai]); + } for (i = 0; i < MAX_ARG_SPILL_SLOTS; i++) { if (arg_track_eq(&at_stack_out[i], &at_stack_in[i])) continue; @@ -1062,6 +1092,7 @@ static bool can_be_local_fp(int depth, int regno, struct arg_track *at) static void arg_track_xfer(struct bpf_verifier_env *env, struct bpf_insn *insn, int insn_idx, struct arg_track *at_out, struct arg_track *at_stack_out, + const struct arg_track *at_stack_arg_entry, struct func_instance *instance, u32 *callsites) { @@ -1071,9 +1102,21 @@ static void arg_track_xfer(struct bpf_verifier_env *env, struct bpf_insn *insn, struct arg_track *dst = &at_out[insn->dst_reg]; struct arg_track *src = &at_out[insn->src_reg]; struct arg_track none = { .frame = ARG_NONE }; - int r; - - if (class == BPF_ALU64 && BPF_SRC(insn->code) == BPF_K) { + int r, slot; + + /* Handle stack arg stores and loads. */ + if (is_stack_arg_st(insn) || is_stack_arg_stx(insn)) { + slot = stack_arg_off_to_slot(insn->off); + if (slot >= 0) { + if (is_stack_arg_stx(insn)) + at_out[MAX_BPF_REG + slot] = at_out[insn->src_reg]; + else + at_out[MAX_BPF_REG + slot] = none; + } + } else if (is_stack_arg_ldx(insn)) { + slot = stack_arg_off_to_slot(insn->off); + at_out[insn->dst_reg] = (slot >= 0) ? at_stack_arg_entry[slot] : none; + } else if (class == BPF_ALU64 && BPF_SRC(insn->code) == BPF_K) { if (code == BPF_MOV) { *dst = none; } else if (dst->frame >= 0) { @@ -1297,6 +1340,16 @@ static int record_load_store_access(struct bpf_verifier_env *env, struct arg_track resolved, *ptr; int oi; + /* + * Stack arg insns use dst_reg/src_reg=BPF_REG_PARAMS(11). Since at[] + * is extended to MAX_AT_TRACK_REGS, at[11] holds the arg_track for + * outgoing stack arg slot 0 — not the pointer used for the memory + * access. Skip so the slot's tracked value isn't confused with the + * base register that record_stack_access() expects. + */ + if (is_stack_arg_stx(insn) || is_stack_arg_st(insn) || is_stack_arg_ldx(insn)) + return 0; + switch (class) { case BPF_LDX: ptr = &at[insn->src_reg]; @@ -1395,11 +1448,18 @@ static int record_call_access(struct bpf_verifier_env *env, if (bpf_get_call_summary(env, insn, &cs)) num_params = cs.num_params; - for (r = BPF_REG_1; r < BPF_REG_1 + num_params; r++) { + for (r = BPF_REG_1; r < BPF_REG_1 + min(num_params, MAX_BPF_FUNC_REG_ARGS); r++) { err = record_arg_access(env, instance, insn, &at[r], r - 1, insn_idx); if (err) return err; } + + for (r = 0; r < MAX_STACK_ARG_SLOTS && r < num_params - MAX_BPF_FUNC_REG_ARGS; r++) { + err = record_arg_access(env, instance, insn, &at[MAX_BPF_REG + r], + r + MAX_BPF_FUNC_REG_ARGS, insn_idx); + if (err) + return err; + } return 0; } @@ -1456,7 +1516,7 @@ static int find_callback_subprog(struct bpf_verifier_env *env, /* Per-subprog intermediate state kept alive across analysis phases */ struct subprog_at_info { - struct arg_track (*at_in)[MAX_BPF_REG]; + struct arg_track (*at_in)[MAX_AT_TRACK_REGS]; int len; }; @@ -1490,6 +1550,9 @@ static void print_subprog_arg_access(struct bpf_verifier_env *env, for (r = 0; r < MAX_BPF_REG - 1; r++) if (arg_is_fp(&info->at_in[i][r])) has_extra = true; + for (r = 0; r < MAX_STACK_ARG_SLOTS; r++) + if (arg_is_fp(&info->at_in[i][MAX_BPF_REG + r])) + has_extra = true; } if (is_ldx_stx_call) { for (r = 0; r < MAX_ARG_SPILL_SLOTS; r++) @@ -1514,6 +1577,12 @@ static void print_subprog_arg_access(struct bpf_verifier_env *env, verbose(env, " r%d=", r); verbose_arg_track(env, &info->at_in[i][r]); } + for (r = 0; r < MAX_STACK_ARG_SLOTS; r++) { + if (!arg_is_fp(&info->at_in[i][MAX_BPF_REG + r])) + continue; + verbose(env, " sa%d=", r); + verbose_arg_track(env, &info->at_in[i][MAX_BPF_REG + r]); + } } if (is_ldx_stx_call) { @@ -1536,7 +1605,7 @@ static void print_subprog_arg_access(struct bpf_verifier_env *env, * Runs forward fixed-point with arg_track_xfer(), then records * memory accesses in a single linear pass over converged state. * - * @callee_entry: pre-populated entry state for R1-R5 + * @callee_entry: pre-populated entry state for R1-R5 and stack args * NULL for main (subprog 0). * @info: stores at_in, len for debug printing. */ @@ -1554,10 +1623,11 @@ static int compute_subprog_args(struct bpf_verifier_env *env, int end = env->subprog_info[subprog + 1].start; int po_end = env->subprog_info[subprog + 1].postorder_start; int len = end - start; - struct arg_track (*at_in)[MAX_BPF_REG] = NULL; - struct arg_track at_out[MAX_BPF_REG]; + struct arg_track (*at_in)[MAX_AT_TRACK_REGS] = NULL; + struct arg_track at_out[MAX_AT_TRACK_REGS]; struct arg_track (*at_stack_in)[MAX_ARG_SPILL_SLOTS] = NULL; struct arg_track *at_stack_out = NULL; + struct arg_track at_stack_arg_entry[MAX_STACK_ARG_SLOTS]; struct arg_track unvisited = { .frame = ARG_UNVISITED }; struct arg_track none = { .frame = ARG_NONE }; bool changed; @@ -1576,13 +1646,13 @@ static int compute_subprog_args(struct bpf_verifier_env *env, goto err_free; for (i = 0; i < len; i++) { - for (r = 0; r < MAX_BPF_REG; r++) + for (r = 0; r < MAX_AT_TRACK_REGS; r++) at_in[i][r] = unvisited; for (r = 0; r < MAX_ARG_SPILL_SLOTS; r++) at_stack_in[i][r] = unvisited; } - for (r = 0; r < MAX_BPF_REG; r++) + for (r = 0; r < MAX_AT_TRACK_REGS; r++) at_in[0][r] = none; /* Entry: R10 is always precisely the current frame's FP */ @@ -1598,6 +1668,10 @@ static int compute_subprog_args(struct bpf_verifier_env *env, for (r = 0; r < MAX_ARG_SPILL_SLOTS; r++) at_stack_in[0][r] = none; + /* Entry: incoming stack args from caller, or ARG_NONE for main */ + for (r = 0; r < MAX_STACK_ARG_SLOTS; r++) + at_stack_arg_entry[r] = callee_entry ? callee_entry[MAX_BPF_REG + r] : none; + if (env->log.level & BPF_LOG_LEVEL2) verbose(env, "subprog#%d: analyzing (depth %d)...\n", subprog, depth); @@ -1616,7 +1690,8 @@ redo: memcpy(at_out, at_in[i], sizeof(at_out)); memcpy(at_stack_out, at_stack_in[i], MAX_ARG_SPILL_SLOTS * sizeof(*at_stack_out)); - arg_track_xfer(env, insn, idx, at_out, at_stack_out, instance, callsites); + arg_track_xfer(env, insn, idx, at_out, at_stack_out, + at_stack_arg_entry, instance, callsites); arg_track_log(env, insn, idx, at_in[i], at_stack_in[i], at_out, at_stack_out); /* Propagate to successors within this subprogram */ @@ -1630,7 +1705,7 @@ redo: continue; ti = target - start; - for (r = 0; r < MAX_BPF_REG; r++) + for (r = 0; r < MAX_AT_TRACK_REGS; r++) changed |= arg_track_join(env, idx, target, r, &at_in[ti][r], at_out[r]); @@ -1685,12 +1760,15 @@ err_free: return err; } -/* Return true if any of R1-R5 is derived from a frame pointer. */ +/* Return true if any of R1-R5 or stack args is derived from a frame pointer. */ static bool has_fp_args(struct arg_track *args) { for (int r = BPF_REG_1; r <= BPF_REG_5; r++) if (arg_is_fp(&args[r])) return true; + for (int r = 0; r < MAX_STACK_ARG_SLOTS; r++) + if (arg_is_fp(&args[MAX_BPF_REG + r])) + return true; return false; } @@ -1814,7 +1892,7 @@ static int analyze_subprog(struct bpf_verifier_env *env, /* For each reachable call site in the subprog, recurse into callees */ for (int p = po_start; p < po_end; p++) { int idx = env->cfg.insn_postorder[p]; - struct arg_track callee_args[BPF_REG_5 + 1]; + struct arg_track callee_args[MAX_AT_TRACK_REGS] = {}; struct arg_track none = { .frame = ARG_NONE }; struct bpf_insn *insn = &insns[idx]; struct func_instance *callee_instance; @@ -1829,9 +1907,11 @@ static int analyze_subprog(struct bpf_verifier_env *env, if (callee < 0) continue; - /* Build entry args: R1-R5 from at_in at call site */ + /* Build entry args: R1-R5 and stack args from at_in at call site */ for (int r = BPF_REG_1; r <= BPF_REG_5; r++) callee_args[r] = info[subprog].at_in[j][r]; + for (int r = 0; r < MAX_STACK_ARG_SLOTS; r++) + callee_args[MAX_BPF_REG + r] = info[subprog].at_in[j][MAX_BPF_REG + r]; } else if (bpf_calls_callback(env, idx)) { callee = find_callback_subprog(env, insn, idx, &caller_reg, &cb_callee_reg); if (callee == -2) { @@ -1853,6 +1933,8 @@ static int analyze_subprog(struct bpf_verifier_env *env, for (int r = BPF_REG_1; r <= BPF_REG_5; r++) callee_args[r] = none; + for (int r = 0; r < MAX_STACK_ARG_SLOTS; r++) + callee_args[MAX_BPF_REG + r] = none; callee_args[cb_callee_reg] = info[subprog].at_in[j][caller_reg]; } else { continue; @@ -2096,7 +2178,7 @@ static void compute_insn_live_regs(struct bpf_verifier_env *env, def = ALL_CALLER_SAVED_REGS; use = def & ~BIT(BPF_REG_0); if (bpf_get_call_summary(env, insn, &cs)) - use = GENMASK(cs.num_params, 1); + use = GENMASK(min_t(u8, cs.num_params, MAX_BPF_FUNC_REG_ARGS), 1); break; default: def = 0; -- cgit v1.2.3 From dc8f1cf6787c4bb1d8cabfac1e44d2d0ab435caa Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 12 May 2026 21:50:49 -0700 Subject: bpf: Reject stack arguments in non-JITed programs The interpreter does not understand the bpf register r11 (BPF_REG_PARAMS) used for stack arguments. So reject interpreter usage if stack arguments are used either in the main program or any subprogram. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260513045049.2390444-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 2 +- kernel/bpf/fixups.c | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ae10b9ca018d..958d86f0beac 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2599,7 +2599,7 @@ struct bpf_prog *__bpf_prog_select_runtime(struct bpf_verifier_env *env, struct goto finalize; if (IS_ENABLED(CONFIG_BPF_JIT_ALWAYS_ON) || - bpf_prog_has_kfunc_call(fp)) + bpf_prog_has_kfunc_call(fp) || (env && env->subprog_info[0].stack_arg_cnt)) jit_needed = true; if (!bpf_prog_select_interpreter(fp)) diff --git a/kernel/bpf/fixups.c b/kernel/bpf/fixups.c index ba86039789fd..19056016eed8 100644 --- a/kernel/bpf/fixups.c +++ b/kernel/bpf/fixups.c @@ -1407,6 +1407,12 @@ int bpf_fixup_call_args(struct bpf_verifier_env *env) verbose(env, "calling kernel functions are not allowed in non-JITed programs\n"); return -EINVAL; } + for (i = 1; i < env->subprog_cnt; i++) { + if (bpf_in_stack_arg_cnt(&env->subprog_info[i])) { + verbose(env, "stack args are not supported in non-JITed programs\n"); + return -EINVAL; + } + } if (env->subprog_cnt > 1 && env->prog->aux->tail_call_reachable) { /* When JIT fails the progs with bpf2bpf calls and tail_calls * have to be rejected, since interpreter doesn't support them yet. -- cgit v1.2.3 From 848d624acf668ae0d71b128f163d1d18d2ac6b90 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 12 May 2026 21:50:54 -0700 Subject: bpf: Prepare architecture JIT support for stack arguments Add bpf_jit_supports_stack_args() as a weak function defaulting to false. Architectures that implement JIT support for stack arguments override it to return true. Reject BPF functions with more than 5 parameters at verification time if the architecture does not support stack arguments. Acked-by: Puranjay Mohan Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260513045054.2390945-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 1 + kernel/bpf/btf.c | 8 +++++++- kernel/bpf/core.c | 5 +++++ 3 files changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/include/linux/filter.h b/include/linux/filter.h index 918d9b34eac6..a515a9769078 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1184,6 +1184,7 @@ bool bpf_jit_inlines_helper_call(s32 imm); bool bpf_jit_supports_subprog_tailcalls(void); bool bpf_jit_supports_percpu_insn(void); bool bpf_jit_supports_kfunc_call(void); +bool bpf_jit_supports_stack_args(void); bool bpf_jit_supports_far_kfunc_call(void); bool bpf_jit_supports_exceptions(void); bool bpf_jit_supports_ptr_xchg(void); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 4fb8641546b8..17d4ab0a8206 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -7870,8 +7870,14 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) MAX_BPF_FUNC_ARGS, tname, nargs); return -EFAULT; } - if (nargs > MAX_BPF_FUNC_REG_ARGS) + if (nargs > MAX_BPF_FUNC_REG_ARGS) { + if (!bpf_jit_supports_stack_args()) { + bpf_log(log, "JIT does not support function %s() with %d args\n", + tname, nargs); + return -EFAULT; + } sub->stack_arg_cnt = nargs - MAX_BPF_FUNC_REG_ARGS; + } if (is_global && nargs > MAX_BPF_FUNC_REG_ARGS) { bpf_log(log, "global function %s has %d > %d args, stack args not supported\n", diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 958d86f0beac..e6b836f846eb 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -3217,6 +3217,11 @@ bool __weak bpf_jit_supports_kfunc_call(void) return false; } +bool __weak bpf_jit_supports_stack_args(void) +{ + return false; +} + bool __weak bpf_jit_supports_far_kfunc_call(void) { return false; -- cgit v1.2.3 From 9fae4cba3bfd583198afe15ed4b4433eafafd11c Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 12 May 2026 21:50:59 -0700 Subject: bpf: Enable r11 based insns BPF_REG_PARAMS (r11) is used for stack argument accesses and the following are only insns with r11 presence: - load incoming stack arg - store register to outgoing stack arg - store immediate to outgoing stack arg The detailed insn format can be found in is_stack_arg_ldx/st/stx() helpers. After this patch, stack arg ldx/st/stx insns become valid for kernel and these insns can be properly checked by verifier. The LLVM compiler [1] implemented the above BPF_REG_PARAMS insns. [1] https://github.com/llvm/llvm-project/pull/189060 Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260513045059.2391192-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ebd13661933e..b0d3c2d179e4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -18006,11 +18006,12 @@ static int check_and_resolve_insns(struct bpf_verifier_env *env) return err; for (i = 0; i < insn_cnt; i++, insn++) { - if (insn->dst_reg >= MAX_BPF_REG) { + if (insn->dst_reg >= MAX_BPF_REG && + !is_stack_arg_st(insn) && !is_stack_arg_stx(insn)) { verbose(env, "R%d is invalid\n", insn->dst_reg); return -EINVAL; } - if (insn->src_reg >= MAX_BPF_REG) { + if (insn->src_reg >= MAX_BPF_REG && !is_stack_arg_ldx(insn)) { verbose(env, "R%d is invalid\n", insn->src_reg); return -EINVAL; } -- cgit v1.2.3 From e0b7b91c72db6dae0392dd90db3b866218a7870b Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 12 May 2026 21:51:04 -0700 Subject: bpf: Support stack arguments for kfunc calls Extend the stack argument mechanism to kfunc calls, allowing kfuncs with more than 5 parameters to receive additional arguments via the r11-based stack arg area. For kfuncs, the caller is a BPF program and the callee is a kernel function. The BPF program writes outgoing args at negative r11 offsets, following the same convention as BPF-to-BPF calls: Outgoing: r11 - 8 (arg6), ..., r11 - N*8 (last arg) The following is an example: int foo(int a1, int a2, int a3, int a4, int a5, int a6, int a7) { ... kfunc1(a1, a2, a3, a4, a5, a6, a7, a8); ... kfunc2(a1, a2, a3, a4, a5, a6, a7, a8, a9); ... } Caller (foo), generated by llvm =============================== Incoming (positive offsets): r11+8: [incoming arg 6] r11+16: [incoming arg 7] Outgoing for kfunc1 (negative offsets): r11-8: [outgoing arg 6] r11-16: [outgoing arg 7] r11-24: [outgoing arg 8] Outgoing for kfunc2 (negative offsets): r11-8: [outgoing arg 6] r11-16: [outgoing arg 7] r11-24: [outgoing arg 8] r11-32: [outgoing arg 9] Later JIT will marshal outgoing arguments to the native calling convention for kfunc1() and kfunc2(). For kfunc calls where stack args are used as constant or size parameters, a mark_stack_arg_precision() helper is used to propagate precision and do proper backtracking. There are two places where meta->release_regno needs to keep regno for later releasing the reference. Also, 'cur_aux(env)->arg_prog = regno' is also keeping regno for later fixup. Since stack arguments don't have a valid register number (regno is negative), these three cases are rejected for now if the argument is on the stack. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260513045104.2391543-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 77 +++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 60 insertions(+), 17 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b0d3c2d179e4..1a734ab91a31 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11160,14 +11160,12 @@ bool bpf_is_kfunc_pkt_changing(struct bpf_kfunc_call_arg_meta *meta) } static enum kfunc_ptr_arg_type -get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, - struct bpf_kfunc_call_arg_meta *meta, +get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, struct bpf_func_state *caller, + struct bpf_reg_state *regs, struct bpf_kfunc_call_arg_meta *meta, const struct btf_type *t, const struct btf_type *ref_t, const char *ref_tname, const struct btf_param *args, int arg, int nargs, argno_t argno, struct bpf_reg_state *reg) { - u32 regno = arg + 1; - struct bpf_reg_state *regs = cur_regs(env); bool arg_mem_size = false; if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] || @@ -11176,8 +11174,8 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, return KF_ARG_PTR_TO_CTX; if (arg + 1 < nargs && - (is_kfunc_arg_mem_size(meta->btf, &args[arg + 1], ®s[regno + 1]) || - is_kfunc_arg_const_mem_size(meta->btf, &args[arg + 1], ®s[regno + 1]))) + (is_kfunc_arg_mem_size(meta->btf, &args[arg + 1], get_func_arg_reg(caller, regs, arg + 1)) || + is_kfunc_arg_const_mem_size(meta->btf, &args[arg + 1], get_func_arg_reg(caller, regs, arg + 1)))) arg_mem_size = true; /* In this function, we verify the kfunc's BTF as per the argument type, @@ -11842,6 +11840,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ int insn_idx) { const char *func_name = meta->func_name, *ref_tname; + struct bpf_func_state *caller = cur_func(env); + struct bpf_reg_state *regs = cur_regs(env); const struct btf *btf = meta->btf; const struct btf_param *args; struct btf_record *rec; @@ -11850,21 +11850,31 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ args = (const struct btf_param *)(meta->func_proto + 1); nargs = btf_type_vlen(meta->func_proto); - if (nargs > MAX_BPF_FUNC_REG_ARGS) { + if (nargs > MAX_BPF_FUNC_ARGS) { verbose(env, "Function %s has %d > %d args\n", func_name, nargs, - MAX_BPF_FUNC_REG_ARGS); + MAX_BPF_FUNC_ARGS); return -EINVAL; } + if (nargs > MAX_BPF_FUNC_REG_ARGS && !bpf_jit_supports_stack_args()) { + verbose(env, "JIT does not support kfunc %s() with %d args\n", + func_name, nargs); + return -ENOTSUPP; + } + + ret = check_outgoing_stack_args(env, caller, nargs); + if (ret) + return ret; /* Check that BTF function arguments match actual types that the * verifier sees. */ for (i = 0; i < nargs; i++) { - struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[i + 1]; + struct bpf_reg_state *reg = get_func_arg_reg(caller, regs, i); const struct btf_type *t, *ref_t, *resolve_ret; enum bpf_arg_type arg_type = ARG_DONTCARE; argno_t argno = argno_from_arg(i + 1); - u32 regno = i + 1, ref_id, type_size; + int regno = reg_from_argno(argno); + u32 ref_id, type_size; bool is_ret_buf_sz = false; int kf_arg_type; @@ -11874,6 +11884,11 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ verifier_bug(env, "Only 1 prog->aux argument supported per-kfunc"); return -EFAULT; } + if (regno < 0) { + verbose(env, "%s prog->aux cannot be a stack argument\n", + reg_arg_name(env, argno)); + return -EINVAL; + } meta->arg_prog = true; cur_aux(env)->arg_prog = regno; continue; @@ -11900,7 +11915,10 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ reg_arg_name(env, argno)); return -EINVAL; } - ret = mark_chain_precision(env, regno); + if (regno >= 0) + ret = mark_chain_precision(env, regno); + else + ret = mark_stack_arg_precision(env, i); if (ret < 0) return ret; meta->arg_constant.found = true; @@ -11925,7 +11943,10 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } meta->r0_size = reg->var_off.value; - ret = mark_chain_precision(env, regno); + if (regno >= 0) + ret = mark_chain_precision(env, regno); + else + ret = mark_stack_arg_precision(env, i); if (ret) return ret; } @@ -11953,14 +11974,21 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return -EFAULT; } meta->ref_obj_id = reg->ref_obj_id; - if (is_kfunc_release(meta)) + if (is_kfunc_release(meta)) { + if (regno < 0) { + verbose(env, "%s release arg cannot be a stack argument\n", + reg_arg_name(env, argno)); + return -EINVAL; + } meta->release_regno = regno; + } } ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id); ref_tname = btf_name_by_offset(btf, ref_t->name_off); - kf_arg_type = get_kfunc_ptr_arg_type(env, meta, t, ref_t, ref_tname, args, i, nargs, argno, reg); + kf_arg_type = get_kfunc_ptr_arg_type(env, caller, regs, meta, t, ref_t, ref_tname, + args, i, nargs, argno, reg); if (kf_arg_type < 0) return kf_arg_type; @@ -12110,6 +12138,11 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ dynptr_arg_type |= DYNPTR_TYPE_FILE; } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_file_discard]) { dynptr_arg_type |= DYNPTR_TYPE_FILE | OBJ_RELEASE; + if (regno < 0) { + verbose(env, "%s release arg cannot be a stack argument\n", + reg_arg_name(env, argno)); + return -EINVAL; + } meta->release_regno = regno; } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_clone] && (dynptr_arg_type & MEM_UNINIT)) { @@ -12264,9 +12297,9 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ break; case KF_ARG_PTR_TO_MEM_SIZE: { - struct bpf_reg_state *buff_reg = ®s[regno]; + struct bpf_reg_state *buff_reg = reg; const struct btf_param *buff_arg = &args[i]; - struct bpf_reg_state *size_reg = ®s[regno + 1]; + struct bpf_reg_state *size_reg = get_func_arg_reg(caller, regs, i + 1); const struct btf_param *size_arg = &args[i + 1]; argno_t next_argno = argno_from_arg(i + 2); @@ -13171,8 +13204,18 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, clear_all_pkt_pointers(env); nargs = btf_type_vlen(meta.func_proto); + if (nargs > MAX_BPF_FUNC_REG_ARGS) { + struct bpf_func_state *caller = cur_func(env); + struct bpf_subprog_info *caller_info = &env->subprog_info[caller->subprogno]; + u16 out_stack_arg_cnt = nargs - MAX_BPF_FUNC_REG_ARGS; + u16 stack_arg_cnt = bpf_in_stack_arg_cnt(caller_info) + out_stack_arg_cnt; + + if (stack_arg_cnt > caller_info->stack_arg_cnt) + caller_info->stack_arg_cnt = stack_arg_cnt; + } + args = (const struct btf_param *)(meta.func_proto + 1); - for (i = 0; i < nargs; i++) { + for (i = 0; i < min_t(int, nargs, MAX_BPF_FUNC_REG_ARGS); i++) { u32 regno = i + 1; t = btf_type_skip_modifiers(desc_btf, args[i].type, NULL); -- cgit v1.2.3 From 35b78733160c120767332d924a0447a87109bbde Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 12 May 2026 21:51:09 -0700 Subject: bpf: Reject stack arguments if tail call reachable Tail calls are deprecated and will be replaced by indirect calls in the future. Reject programs that combine tail calls with stack arguments rather than adding complexity for a deprecated feature. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260513045109.2392108-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1a734ab91a31..a10cc045057d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5267,14 +5267,23 @@ continue_func: * this info will be utilized by JIT so that we will be preserving the * tail call counter throughout bpf2bpf calls combined with tailcalls */ - if (tail_call_reachable) + if (tail_call_reachable) { for (tmp = idx; tmp >= 0; tmp = dinfo[tmp].caller) { if (subprog[tmp].is_exception_cb) { verbose(env, "cannot tail call within exception cb\n"); return -EINVAL; } + if (subprog[tmp].stack_arg_cnt) { + verbose(env, "tail_calls are not allowed in programs with stack args\n"); + return -EINVAL; + } subprog[tmp].tail_call_reachable = true; } + } else if (!idx && subprog[0].has_tail_call && subprog[0].stack_arg_cnt) { + verbose(env, "tail_calls are not allowed in programs with stack args\n"); + return -EINVAL; + } + if (subprog[0].tail_call_reachable) env->prog->aux->tail_call_reachable = true; -- cgit v1.2.3 From cb6af5314056cb06456cfa8774aa158d61929bcd Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 12 May 2026 21:51:14 -0700 Subject: bpf: Disable private stack for x86_64 if stack arguments used Other architectures like arm64, riscv, etc. have enough register and for them private stack can be used together with stack arguments. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260513045114.2392291-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a10cc045057d..82b9531f87f6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5163,7 +5163,10 @@ process_func: } subprog_depth = round_up_stack_depth(env, subprog[idx].stack_depth); - if (priv_stack_supported) { + if (IS_ENABLED(CONFIG_X86_64) && subprog[idx].stack_arg_cnt) { + /* x86-64 uses R9 for both private stack frame pointer and arg6. */ + subprog[idx].priv_stack_mode = NO_PRIV_STACK; + } else if (priv_stack_supported) { /* Request private stack support only if the subprog stack * depth is no less than BPF_PRIV_STACK_MIN_SIZE. This is to * avoid jit penalty if the stack usage is small. -- cgit v1.2.3 From 324c3ca6eed6fb7ec4e50f31d537953038b13c5f Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 12 May 2026 21:51:19 -0700 Subject: bpf,x86: Implement JIT support for stack arguments Add x86_64 JIT support for BPF functions and kfuncs with more than 5 arguments. The extra arguments are passed through a stack area addressed by register r11 (BPF_REG_PARAMS) in BPF bytecode, which the JIT translates to native code. The JIT follows the x86-64 calling convention for both BPF-to-BPF and kfunc calls: - Arg 6 is passed in the R9 register - Args 7+ are passed on the stack Incoming arg 6 (BPF r11+8) is translated to a MOV from R9 rather than a memory load. Incoming args 7+ (BPF r11+16, r11+24, ...) map directly to [rbp + 16], [rbp + 24], ..., matching the x86-64 stack layout after CALL + PUSH RBP, so no offset adjustment is needed. tail_call_reachable is rejected by the verifier and priv_stack is disabled by the JIT when stack args exist, so R9 is always available. When BPF bytecode writes to the arg-6 stack slot (offset -8), the JIT emits a MOV into R9 instead of a memory store. Outgoing args 7+ are placed at [rsp] in a pre-allocated area below callee-saved registers, using: native_off = outgoing_arg_base - outgoing_rsp - bpf_off - 16 The native x86_64 stack layout with stack arguments: high address +-------------------------+ | incoming stack arg N | [rbp + 16 + (N-7)*8] (from caller) | ... | | incoming stack arg 7 | [rbp + 16] +-------------------------+ | return address | [rbp + 8] | saved rbp | [rbp] +-------------------------+ | BPF program stack | (round_up(stack_depth, 8) bytes) +-------------------------+ | callee-saved regs | (r12, rbx, r13, r14, r15 as needed) +-------------------------+ | outgoing arg M | [rsp + (M-7)*8] | ... | | outgoing arg 7 | [rsp] +-------------------------+ rsp low address Acked-by: Puranjay Mohan Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260513045122.2393118-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 149 ++++++++++++++++++++++++++++++++++++++++++-- include/linux/bpf.h | 1 + kernel/bpf/core.c | 10 +++ 3 files changed, 154 insertions(+), 6 deletions(-) (limited to 'kernel/bpf') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index ea9e707e8abf..ceefefb4da21 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -390,6 +391,34 @@ static void pop_callee_regs(u8 **pprog, bool *callee_regs_used) *pprog = prog; } +/* add rsp, depth */ +static void emit_add_rsp(u8 **pprog, u16 depth) +{ + u8 *prog = *pprog; + + if (!depth) + return; + if (is_imm8(depth)) + EMIT4(0x48, 0x83, 0xC4, depth); /* add rsp, imm8 */ + else + EMIT3_off32(0x48, 0x81, 0xC4, depth); /* add rsp, imm32 */ + *pprog = prog; +} + +/* sub rsp, depth */ +static void emit_sub_rsp(u8 **pprog, u16 depth) +{ + u8 *prog = *pprog; + + if (!depth) + return; + if (is_imm8(depth)) + EMIT4(0x48, 0x83, 0xEC, depth); /* sub rsp, imm8 */ + else + EMIT3_off32(0x48, 0x81, 0xEC, depth); /* sub rsp, imm32 */ + *pprog = prog; +} + static void emit_nops(u8 **pprog, int len) { u8 *prog = *pprog; @@ -1659,21 +1688,47 @@ static int do_jit(struct bpf_verifier_env *env, struct bpf_prog *bpf_prog, int * bool seen_exit = false; u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY]; void __percpu *priv_frame_ptr = NULL; + u16 out_stack_arg_cnt, outgoing_rsp; u64 arena_vm_start, user_vm_start; void __percpu *priv_stack_ptr; int i, excnt = 0; int ilen, proglen = 0; u8 *ip, *prog = temp; u32 stack_depth; + int callee_saved_size; + s32 outgoing_arg_base; int err; stack_depth = bpf_prog->aux->stack_depth; + out_stack_arg_cnt = bpf_out_stack_arg_cnt(env, bpf_prog); priv_stack_ptr = bpf_prog->aux->priv_stack_ptr; if (priv_stack_ptr) { priv_frame_ptr = priv_stack_ptr + PRIV_STACK_GUARD_SZ + round_up(stack_depth, 8); stack_depth = 0; } + /* + * Follow x86-64 calling convention for both BPF-to-BPF and + * kfunc calls: + * - Arg 6 is passed in R9 register + * - Args 7+ are passed on the stack at [rsp] + * + * Incoming arg 6 is read from R9 (BPF r11+8 → MOV from R9). + * Incoming args 7+ are read from [rbp + 16], [rbp + 24], ... + * (BPF r11+16, r11+24, ... map directly with no offset change). + * + * tail_call_reachable is rejected by the verifier and priv_stack + * is disabled by the JIT when stack args exist, so R9 is always + * available. + * + * Stack layout (high to low): + * [rbp + 16 + ...] incoming stack args 7+ (from caller) + * [rbp + 8] return address + * [rbp] saved rbp + * [rbp - prog_stack] program stack + * [below] callee-saved regs + * [below] outgoing args 7+ (= rsp) + */ arena_vm_start = bpf_arena_get_kern_vm_start(bpf_prog->aux->arena); user_vm_start = bpf_arena_get_user_vm_start(bpf_prog->aux->arena); @@ -1700,6 +1755,42 @@ static int do_jit(struct bpf_verifier_env *env, struct bpf_prog *bpf_prog, int * push_r12(&prog); push_callee_regs(&prog, callee_regs_used); } + + /* Compute callee-saved register area size. */ + callee_saved_size = 0; + if (bpf_prog->aux->exception_boundary || arena_vm_start) + callee_saved_size += 8; /* r12 */ + if (bpf_prog->aux->exception_boundary) { + callee_saved_size += 4 * 8; /* rbx, r13, r14, r15 */ + } else { + int j; + + for (j = 0; j < 4; j++) + if (callee_regs_used[j]) + callee_saved_size += 8; + } + /* + * Base offset from rbp for translating BPF outgoing args 7+ + * to native offsets. BPF uses negative offsets from r11 + * (r11-8 for arg6, r11-16 for arg7, ...) while x86 uses + * positive offsets from rsp ([rsp+0] for arg7, [rsp+8] for + * arg8, ...). Arg 6 goes to R9 directly. + * + * The translation reverses direction: + * native_off = outgoing_arg_base - outgoing_rsp - bpf_off - 16 + * + * Note that tail_call_reachable is guaranteed to be false when + * stack args exist, so tcc pushes need not be accounted for. + */ + outgoing_arg_base = -(round_up(stack_depth, 8) + callee_saved_size); + + /* + * Allocate outgoing stack arg area for args 7+ only. + * Arg 6 goes into r9 register, not on stack. + */ + outgoing_rsp = out_stack_arg_cnt > 1 ? (out_stack_arg_cnt - 1) * 8 : 0; + emit_sub_rsp(&prog, outgoing_rsp); + if (arena_vm_start) emit_mov_imm64(&prog, X86_REG_R12, arena_vm_start >> 32, (u32) arena_vm_start); @@ -1721,7 +1812,7 @@ static int do_jit(struct bpf_verifier_env *env, struct bpf_prog *bpf_prog, int * u8 b2 = 0, b3 = 0; u8 *start_of_ldx; s64 jmp_offset; - s16 insn_off; + s32 insn_off; u8 jmp_cond; u8 *func; int nops; @@ -2134,12 +2225,27 @@ static int do_jit(struct bpf_verifier_env *env, struct bpf_prog *bpf_prog, int * EMIT1(0xC7); goto st; case BPF_ST | BPF_MEM | BPF_DW: + if (dst_reg == BPF_REG_PARAMS && insn->off == -8) { + /* Arg 6: store immediate in r9 register */ + emit_mov_imm64(&prog, X86_REG_R9, imm32 >> 31, (u32)imm32); + break; + } EMIT2(add_1mod(0x48, dst_reg), 0xC7); -st: if (is_imm8(insn->off)) - EMIT2(add_1reg(0x40, dst_reg), insn->off); +st: insn_off = insn->off; + if (dst_reg == BPF_REG_PARAMS) { + /* + * Args 7+: reverse BPF negative offsets to + * x86 positive rsp offsets. + * BPF off=-16 → [rsp+0], off=-24 → [rsp+8], ... + */ + insn_off = outgoing_arg_base - outgoing_rsp - insn_off - 16; + dst_reg = BPF_REG_FP; + } + if (is_imm8(insn_off)) + EMIT2(add_1reg(0x40, dst_reg), insn_off); else - EMIT1_off32(add_1reg(0x80, dst_reg), insn->off); + EMIT1_off32(add_1reg(0x80, dst_reg), insn_off); EMIT(imm32, bpf_size_to_x86_bytes(BPF_SIZE(insn->code))); break; @@ -2149,7 +2255,17 @@ st: if (is_imm8(insn->off)) case BPF_STX | BPF_MEM | BPF_H: case BPF_STX | BPF_MEM | BPF_W: case BPF_STX | BPF_MEM | BPF_DW: - emit_stx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off); + if (dst_reg == BPF_REG_PARAMS && insn->off == -8) { + /* Arg 6: store register value in r9 */ + EMIT_mov(X86_REG_R9, src_reg); + break; + } + insn_off = insn->off; + if (dst_reg == BPF_REG_PARAMS) { + insn_off = outgoing_arg_base - outgoing_rsp - insn_off - 16; + dst_reg = BPF_REG_FP; + } + emit_stx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn_off); break; case BPF_ST | BPF_PROBE_MEM32 | BPF_B: @@ -2248,6 +2364,19 @@ populate_extable: case BPF_LDX | BPF_PROBE_MEMSX | BPF_H: case BPF_LDX | BPF_PROBE_MEMSX | BPF_W: insn_off = insn->off; + if (src_reg == BPF_REG_PARAMS) { + if (insn_off == 8) { + /* Incoming arg 6: read from r9 */ + EMIT_mov(dst_reg, X86_REG_R9); + break; + } + src_reg = BPF_REG_FP; + /* + * Incoming args 7+: native_off == bpf_off + * (r11+16 → [rbp+16], r11+24 → [rbp+24], ...) + * No offset adjustment needed. + */ + } if (BPF_MODE(insn->code) == BPF_PROBE_MEM || BPF_MODE(insn->code) == BPF_PROBE_MEMSX) { @@ -2736,6 +2865,8 @@ emit_jmp: if (emit_spectre_bhb_barrier(&prog, ip, bpf_prog)) return -EINVAL; } + /* Deallocate outgoing args 7+ area. */ + emit_add_rsp(&prog, outgoing_rsp); if (bpf_prog->aux->exception_boundary) { pop_callee_regs(&prog, all_callee_regs_used); pop_r12(&prog); @@ -3793,7 +3924,8 @@ skip_init_addrs: for (pass = 0; pass < MAX_PASSES || image; pass++) { if (!padding && pass >= PADDING_PASSES) padding = true; - proglen = do_jit(env, prog, addrs, image, rw_image, oldproglen, &ctx, padding); + proglen = do_jit(env, prog, addrs, image, rw_image, oldproglen, + &ctx, padding); if (proglen <= 0) { out_image: image = NULL; @@ -3910,6 +4042,11 @@ bool bpf_jit_supports_kfunc_call(void) return true; } +bool bpf_jit_supports_stack_args(void) +{ + return true; +} + void *bpf_arch_text_copy(void *dst, void *src, size_t len) { if (text_poke_copy(dst, src, len) == NULL) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9e16e91647d3..242f9597d9ab 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1548,6 +1548,7 @@ void bpf_jit_uncharge_modmem(u32 size); bool bpf_prog_has_trampoline(const struct bpf_prog *prog); bool bpf_insn_is_indirect_target(const struct bpf_verifier_env *env, const struct bpf_prog *prog, int insn_idx); +u16 bpf_out_stack_arg_cnt(const struct bpf_verifier_env *env, const struct bpf_prog *prog); #else static inline int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr, diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index e6b836f846eb..427a6d828e01 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1582,6 +1582,16 @@ bool bpf_insn_is_indirect_target(const struct bpf_verifier_env *env, const struc insn_idx += prog->aux->subprog_start; return env->insn_aux_data[insn_idx].indirect_target; } + +u16 bpf_out_stack_arg_cnt(const struct bpf_verifier_env *env, const struct bpf_prog *prog) +{ + const struct bpf_subprog_info *sub; + + if (!env) + return 0; + sub = &env->subprog_info[prog->aux->func_idx]; + return sub->stack_arg_cnt - bpf_in_stack_arg_cnt(sub); +} #endif /* CONFIG_BPF_JIT */ /* Base function for offset calculation. Needs to go into .text section, -- cgit v1.2.3 From f41f34ec64748e16e5a90ab391cec39e30942f32 Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Wed, 13 May 2026 21:34:50 +0200 Subject: bpf: Report maximum combined stack depth We've hit the 512 bytes limit on stack depth a few times in Cilium recently. As a result, we started reporting in CI our current maximum stack depth across all configurations for each BPF program. Unfortunately, that is not trivial to compute in userspace. The verifier reports the stack depths of individual subprogs at the end of the logs. However the maximum combined stack depth also depends on the callgraph of those subprogs (the max combined stack depth is the height of the callgraph weighted by per-subprog stack depths). We can compute a callgraph in userspace from the loaded instructions, but it often doesn't match the verifier's own callgraph because of dead code elimination. Our current approach relies on dumping the BPF_LOG_LEVEL2 logs, but this feels overkill considering the verifier already has the information we need. The patch lets the verifier dump the maximum combined stack depth in the logs, on the same line as the per-subprog stack depths: stack depth 16+256 max 272 The per-subprog stack depths and the new max stack depth are not directly comparable. The former is sometimes updated during fixups, while the latter is not. As a result, even with a single subprog, we may end up with two slightly different values. The aim of the new max value is to be closest to what is actually enforced by the verifier. Signed-off-by: Paul Chaignon Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/d3d23a0410f87f116f3bbaa98a815dbae113bda2.1778700777.git.paul.chaignon@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 2 ++ kernel/bpf/verifier.c | 6 +++++- 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 6f12fc40b682..20c421b43849 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -989,6 +989,8 @@ struct bpf_verifier_env { u32 prev_insn_processed, insn_processed; /* number of jmps, calls, exits analyzed so far */ u32 prev_jmps_processed, jmps_processed; + /* maximum combined stack depth */ + u32 max_stack_depth; /* total verification time */ u64 verification_time; /* maximum number of verifier states kept in 'branching' instructions */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 82b9531f87f6..76a07f09ab64 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5177,6 +5177,8 @@ process_func: } if (subprog[idx].priv_stack_mode == PRIV_STACK_ADAPTIVE) { + if (subprog_depth > env->max_stack_depth) + env->max_stack_depth = subprog_depth; if (subprog_depth > MAX_BPF_STACK) { verbose(env, "stack size of subprog %d is %d. Too large\n", idx, subprog_depth); @@ -5184,6 +5186,8 @@ process_func: } } else { depth += subprog_depth; + if (depth > env->max_stack_depth) + env->max_stack_depth = depth; if (depth > MAX_BPF_STACK) { total = 0; for (tmp = idx; tmp >= 0; tmp = dinfo[tmp].caller) @@ -18555,7 +18559,7 @@ static void print_verification_stats(struct bpf_verifier_env *env) verbose(env, "stack depth %d", env->subprog_info[0].stack_depth); for (i = 1; i < subprog_cnt; i++) verbose(env, "+%d", env->subprog_info[i].stack_depth); - verbose(env, "\n"); + verbose(env, " max %d\n", env->max_stack_depth); verbose(env, "insns processed %d", env->subprog_info[0].insn_processed); for (i = 1; i < subprog_cnt; i++) if (bpf_subprog_is_global(env, i)) -- cgit v1.2.3 From 4286f5deee14b26a9f0447b566d4c7cb7e2e2702 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 15 May 2026 15:50:40 -0700 Subject: bpf: Validate outgoing stack args when btf_prepare_func_args fails btf_prepare_func_args() sets sub->arg_cnt before validating arg types. If validation fails (e.g. unsupported pointer type in a static subprog), check_outgoing_stack_args() is skipped because btf_check_func_arg_match() returns early. For static subprogs, check_func_call() ignores non-EFAULT errors and proceeds with the call. This causes the callee to read stack arg slots that the caller never stored or not initialized, potentially dereferencing NULL caller->stack_arg_regs or getting no-initialized value. To fix the issue, when btf_prepare_func_args() fails and the subprog expects stack args, call check_outgoing_stack_args() to verify the caller initialized the slots. Return -EFAULT on failure so the error is not ignored. Fixes: 3ab5bd317ee2 ("bpf: Set sub->arg_cnt earlier in btf_prepare_func_args()") Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260515225040.821515-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 76a07f09ab64..8dd79b735a69 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9118,11 +9118,17 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, struct bpf_func_state *caller = cur_func(env); struct bpf_verifier_log *log = &env->log; u32 i; - int ret; + int ret, err; ret = btf_prepare_func_args(env, subprog); - if (ret) + if (ret) { + if (bpf_in_stack_arg_cnt(sub) > 0) { + err = check_outgoing_stack_args(env, caller, sub->arg_cnt); + if (err) + return err; + } return ret; + } ret = check_outgoing_stack_args(env, caller, sub->arg_cnt); if (ret) -- cgit v1.2.3 From d1dbe443a0abb4ea3ec35a16e36efe6d3bbf72f6 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 15 May 2026 15:50:56 -0700 Subject: bpf: Fix arg_track_join log to use sa prefix for stack arg slots arg_track_join() logs state transitions at CFG merge points. For stack arg slots (r >= MAX_BPF_REG), it printed "r11:", "r12:", etc., which is misleading since r11 is a special register (BPF_REG_PARAMS) not meaningful to the user. Fix it to print "sa0:", "sa1:", etc., matching the per-instruction transition log in arg_track_log() which already uses the "sa" prefix. Update the existing stack_arg_pruning_type_mismatch selftest to expect the corrected format. Fixes: 2af4e792773f ("bpf: Extend liveness analysis to track stack argument slots") Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260515225056.823086-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/liveness.c | 4 +++- tools/testing/selftests/bpf/progs/verifier_stack_arg.c | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/liveness.c b/kernel/bpf/liveness.c index 7f4a0e4c2c49..0aadfbae0acc 100644 --- a/kernel/bpf/liveness.c +++ b/kernel/bpf/liveness.c @@ -806,7 +806,9 @@ static bool arg_track_join(struct bpf_verifier_env *env, int idx, int target, in return true; verbose(env, "arg JOIN insn %d -> %d ", idx, target); - if (r >= 0) + if (r >= MAX_BPF_REG) + verbose(env, "sa%d: ", r - MAX_BPF_REG); + else if (r >= 0) verbose(env, "r%d: ", r); else verbose(env, "fp%+d: ", r * 8); diff --git a/tools/testing/selftests/bpf/progs/verifier_stack_arg.c b/tools/testing/selftests/bpf/progs/verifier_stack_arg.c index df0c3438529e..7e0ce5db28a0 100644 --- a/tools/testing/selftests/bpf/progs/verifier_stack_arg.c +++ b/tools/testing/selftests/bpf/progs/verifier_stack_arg.c @@ -117,7 +117,7 @@ __description("stack_arg: pruning with different stack arg types") __failure __log_level(2) __flag(BPF_F_TEST_STATE_FREQ) __msg("arg JOIN insn 9 -> 10 r1: fp0-8 + _ => fp0-8|fp0+0") -__msg("arg JOIN insn 9 -> 10 r11: fp0-8 + _ => fp0-8|fp0+0") +__msg("arg JOIN insn 9 -> 10 sa0: fp0-8 + _ => fp0-8|fp0+0") __msg("R{{[0-9]}} invalid mem access 'scalar'") __naked void stack_arg_pruning_type_mismatch(void) { -- cgit v1.2.3 From 98540a12823a016e2e1fa0db15543b22ac1fa056 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 15 May 2026 15:51:01 -0700 Subject: bpf: Clean up redundant stack arg checks for non-JITed programs Remove a redundant stack_arg_cnt check in __bpf_prog_select_runtime() and start the stack arg loop from index 0 in bpf_fixup_call_args(). Both changes are no-ops that simplify the code: In __bpf_prog_select_runtime(), the subprog_info[0].stack_arg_cnt check is unreachable: - when there is only a main program (no bpf-to-bpf calls), subprog_info[0].stack_arg_cnt is always 0 because the main program's arg_cnt is forced to 1 - when bpf-to-bpf calls use stack args and JIT succeeds, fp->bpf_func is set and this code is skipped - when JIT fails, bpf_fixup_call_args() rejects the program before we get to __bpf_prog_select_runtime(). In bpf_fixup_call_args(), starting the loop at i=1 skipped subprog 0, which is safe since the main program always has arg_cnt=1 and thus bpf_in_stack_arg_cnt() returns 0. Starting at i=0 removes the need to reason about this invariant. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260515225101.824054-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 2 +- kernel/bpf/fixups.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 427a6d828e01..cdbe9fdf474f 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2609,7 +2609,7 @@ struct bpf_prog *__bpf_prog_select_runtime(struct bpf_verifier_env *env, struct goto finalize; if (IS_ENABLED(CONFIG_BPF_JIT_ALWAYS_ON) || - bpf_prog_has_kfunc_call(fp) || (env && env->subprog_info[0].stack_arg_cnt)) + bpf_prog_has_kfunc_call(fp)) jit_needed = true; if (!bpf_prog_select_interpreter(fp)) diff --git a/kernel/bpf/fixups.c b/kernel/bpf/fixups.c index 19056016eed8..2cec4e8cd4a0 100644 --- a/kernel/bpf/fixups.c +++ b/kernel/bpf/fixups.c @@ -1407,7 +1407,7 @@ int bpf_fixup_call_args(struct bpf_verifier_env *env) verbose(env, "calling kernel functions are not allowed in non-JITed programs\n"); return -EINVAL; } - for (i = 1; i < env->subprog_cnt; i++) { + for (i = 0; i < env->subprog_cnt; i++) { if (bpf_in_stack_arg_cnt(&env->subprog_info[i])) { verbose(env, "stack args are not supported in non-JITed programs\n"); return -EINVAL; -- cgit v1.2.3 From 18a37465b0ab5237a1d0ebf93a2a3b6a2da540b3 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sun, 17 May 2026 08:07:02 -0700 Subject: bpf,x86: Fix exception unwinding with outgoing stack arguments When a main program with exception_boundary has outgoing stack arguments (e.g. from calling subprogs with >5 args), bpf_throw() fails to correctly restore callee-saved registers, causing a kernel crash. The x86 JIT allocates the outgoing stack arg area below the callee-saved registers via 'sub rsp, outgoing_rsp' in the prologue. When bpf_throw() unwinds, it captures the main program's sp (which includes this outgoing area) and passes it to the exception callback. The callback gets rsp and rbp, followed by pop_callee_regs, but rsp points into the outgoing arg area rather than the callee-saved registers, so the pops restore garbage values. Returning to the kernel with corrupted callee-saved registers causes a crash. Fix this by adjusting the sp (adding stack_arg_sp_adjust) passed to the exception callback, so it points to the bottom of the callee-saved registers instead of the outgoing arg area. When stack_arg_sp_adjust is 0 (the common case), this is a no-op. Fixes: 324c3ca6eed6 ("bpf,x86: Implement JIT support for stack arguments") Acked-by: Kumar Kartikeya Dwivedi Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260517150702.288031-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 2 ++ include/linux/bpf.h | 1 + kernel/bpf/fixups.c | 1 + kernel/bpf/helpers.c | 2 +- 4 files changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index ceefefb4da21..a0c541a441cf 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1789,6 +1789,8 @@ static int do_jit(struct bpf_verifier_env *env, struct bpf_prog *bpf_prog, int * * Arg 6 goes into r9 register, not on stack. */ outgoing_rsp = out_stack_arg_cnt > 1 ? (out_stack_arg_cnt - 1) * 8 : 0; + if (bpf_prog->aux->exception_boundary) + bpf_prog->aux->stack_arg_sp_adjust = outgoing_rsp; emit_sub_rsp(&prog, outgoing_rsp); if (arena_vm_start) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 242f9597d9ab..1b28cacc3075 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1736,6 +1736,7 @@ struct bpf_prog_aux { struct bpf_map *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]; char name[BPF_OBJ_NAME_LEN]; u64 (*bpf_exception_cb)(u64 cookie, u64 sp, u64 bp, u64, u64); + u16 stack_arg_sp_adjust; #ifdef CONFIG_SECURITY void *security; #endif diff --git a/kernel/bpf/fixups.c b/kernel/bpf/fixups.c index 2cec4e8cd4a0..52535671cb9a 100644 --- a/kernel/bpf/fixups.c +++ b/kernel/bpf/fixups.c @@ -1265,6 +1265,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) prog->aux->real_func_cnt = env->subprog_cnt; prog->aux->bpf_exception_cb = (void *)func[env->exception_callback_subprog]->bpf_func; prog->aux->exception_boundary = func[0]->aux->exception_boundary; + prog->aux->stack_arg_sp_adjust = func[0]->aux->stack_arg_sp_adjust; bpf_prog_jit_attempt_done(prog); return 0; out_free: diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index baa12b24bb64..07de26e7314c 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -3301,7 +3301,7 @@ __bpf_kfunc void bpf_throw(u64 cookie) * which skips compiler generated instrumentation to do the same. */ kasan_unpoison_task_stack_below((void *)(long)ctx.sp); - ctx.aux->bpf_exception_cb(cookie, ctx.sp, ctx.bp, 0, 0); + ctx.aux->bpf_exception_cb(cookie, ctx.sp + ctx.aux->stack_arg_sp_adjust, ctx.bp, 0, 0); WARN(1, "A call to BPF exception callback should never return\n"); } -- cgit v1.2.3 From f05ddc6771c4a2eb1801dfdd0f7a212a78fa18a7 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Mon, 18 May 2026 22:54:42 +0800 Subject: bpf: Check tail zero of bpf_common_attr using offsetofend Because of the 8-byte alignment, the compiler will pad struct bpf_common_attr to 24 bytes. That said, sizeof(attr_common) is 24 instead of 20. When check tail zero using sizeof(attr_common) in bpf_check_uarg_tail_zero(), there will be 4 bytes that won't be checked. To also check the padding 4 bytes, replace sizeof(attr_common) with offsetofend(struct bpf_common_attr, log_true_size). Fixes: f28771c0691b ("bpf: Extend BPF syscall with common attributes support") Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260518145446.6794-2-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 6600e126fbfb..83de8fb9b9aa 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -6278,7 +6278,9 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size, memset(&attr_common, 0, sizeof(attr_common)); if (cmd & BPF_COMMON_ATTRS) { - err = bpf_check_uarg_tail_zero(uattr_common, sizeof(attr_common), size_common); + err = bpf_check_uarg_tail_zero(uattr_common, + offsetofend(struct bpf_common_attr, log_true_size), + size_common); if (err) return err; -- cgit v1.2.3 From cb339ac61d72f7fb7f57bfc0516b7b2b65bc1bad Mon Sep 17 00:00:00 2001 From: Kaitao Cheng Date: Thu, 21 May 2026 11:22:59 +0800 Subject: bpf: refactor __bpf_list_del to take list node pointer Refactor __bpf_list_del to accept (head, struct list_head *n) instead of (head, bool tail). The caller now passes the specific node to remove: bpf_list_pop_front passes h->next, bpf_list_pop_back passes h->prev. Prepares for introducing bpf_list_del(head, node) kfunc to remove an arbitrary node when the user holds ownership. Signed-off-by: Kaitao Cheng Reviewed-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260521032306.97118-2-kaitao.cheng@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 07de26e7314c..094457c3e6d3 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2550,37 +2550,44 @@ __bpf_kfunc int bpf_list_push_back_impl(struct bpf_list_head *head, return bpf_list_push_back(head, node, meta__ign, off); } -static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, bool tail) +static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, + struct list_head *n) { - struct list_head *n, *h = (void *)head; + struct list_head *h = (void *)head; struct bpf_list_node_kern *node; /* If list_head was 0-initialized by map, bpf_obj_init_field wasn't * called on its fields, so init here */ - if (unlikely(!h->next)) + if (unlikely(!h->next)) { INIT_LIST_HEAD(h); + return NULL; + } if (list_empty(h)) return NULL; - n = tail ? h->prev : h->next; node = container_of(n, struct bpf_list_node_kern, list_head); - if (WARN_ON_ONCE(READ_ONCE(node->owner) != head)) + if (unlikely(READ_ONCE(node->owner) != head)) return NULL; list_del_init(n); - WRITE_ONCE(node->owner, NULL); + /* Ensure __bpf_list_add() sees the node as unlinked. */ + smp_store_release(&node->owner, NULL); return (struct bpf_list_node *)n; } __bpf_kfunc struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head) { - return __bpf_list_del(head, false); + struct list_head *h = (void *)head; + + return __bpf_list_del(head, h->next); } __bpf_kfunc struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head) { - return __bpf_list_del(head, true); + struct list_head *h = (void *)head; + + return __bpf_list_del(head, h->prev); } __bpf_kfunc struct bpf_list_node *bpf_list_front(struct bpf_list_head *head) -- cgit v1.2.3 From cfa6afa4b931aed08288454943e5077f114fd7f3 Mon Sep 17 00:00:00 2001 From: Kaitao Cheng Date: Thu, 21 May 2026 11:23:00 +0800 Subject: bpf: clear list node owner and unlink before drop The issue only becomes exposed once bpf_list_del() is available: callers can pass an arbitrary bpf_list_head and bpf_list_node pair, including nodes that are not actually linked to the supplied head, or nodes that outlive their original head after refcount-based retention. This was not practically reachable for callers restricted to pop-style helpers alone; bpf_list_del() widens the API surface. A failure mode appears when bpf_list_head_free() runs while a program still holds an independent refcount on a node (for example via bpf_refcount_acquire()). The list head value embedded in map memory can go away while the node object survives. If node->owner is left pointing at the old head address until drop completes, that pointer becomes stale. If a new bpf_list_head is later allocated at the same address and the stale node is passed to bpf_list_del(), the owner comparison can succeed even though the node is not really linked to the new head, and list_del_init() will follow bogus next/prev pointers with the risk of memory corruption. When draining a bpf_list_head, mark each node owner with BPF_PTR_POISON under the map spinlock while moving it to a private drain list, then list_del_init() the node and clear owner to NULL before calling __bpf_obj_drop_impl(). Concurrent readers therefore never observe a node that appears linked to a head while its list_head is inconsistent, and surviving refcounted nodes never retain a stale non-NULL owner. Signed-off-by: Kaitao Cheng Link: https://lore.kernel.org/r/20260521032306.97118-3-kaitao.cheng@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 094457c3e6d3..59855b434f0b 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2247,10 +2247,11 @@ EXPORT_SYMBOL_GPL(bpf_base_func_proto); void bpf_list_head_free(const struct btf_field *field, void *list_head, struct bpf_spin_lock *spin_lock) { - struct list_head *head = list_head, *orig_head = list_head; + struct list_head *head = list_head, drain, *pos, *n; BUILD_BUG_ON(sizeof(struct list_head) > sizeof(struct bpf_list_head)); BUILD_BUG_ON(__alignof__(struct list_head) > __alignof__(struct bpf_list_head)); + INIT_LIST_HEAD(&drain); /* Do the actual list draining outside the lock to not hold the lock for * too long, and also prevent deadlocks if tracing programs end up @@ -2261,20 +2262,30 @@ void bpf_list_head_free(const struct btf_field *field, void *list_head, __bpf_spin_lock_irqsave(spin_lock); if (!head->next || list_empty(head)) goto unlock; - head = head->next; + list_for_each_safe(pos, n, head) { + struct bpf_list_node_kern *node; + + node = container_of(pos, struct bpf_list_node_kern, list_head); + WRITE_ONCE(node->owner, BPF_PTR_POISON); + list_move_tail(pos, &drain); + } unlock: - INIT_LIST_HEAD(orig_head); + INIT_LIST_HEAD(head); __bpf_spin_unlock_irqrestore(spin_lock); - while (head != orig_head) { - void *obj = head; + while (!list_empty(&drain)) { + struct bpf_list_node_kern *node; - obj -= field->graph_root.node_offset; - head = head->next; + pos = drain.next; + node = container_of(pos, struct bpf_list_node_kern, list_head); + list_del_init(pos); + /* Ensure __bpf_list_add() sees the node as unlinked. */ + smp_store_release(&node->owner, NULL); /* The contained type can also have resources, including a * bpf_list_head which needs to be freed. */ - __bpf_obj_drop_impl(obj, field->graph_root.value_rec, false); + __bpf_obj_drop_impl((char *)pos - field->graph_root.node_offset, + field->graph_root.value_rec, false); } } -- cgit v1.2.3 From 7c8c71591b768284caa81e92cb47a6912952d3f2 Mon Sep 17 00:00:00 2001 From: Kaitao Cheng Date: Thu, 21 May 2026 11:23:01 +0800 Subject: bpf: allow non-owning list-node args via __nonown_allowed KF_ARG_PTR_TO_LIST_NODE normally requires an owning reference (PTR_TO_BTF_ID | MEM_ALLOC with ref_obj_id). Introduce the __nonown_allowed annotation on selected list-node arguments so non-owning references with ref_obj_id==0 are accepted as well. This patch only adds the generic verifier support and documents the annotation. Later patches in the series will apply it to bpf_list_add /del(), and bpf_list_is_first/last(), allowing bpf_list_front/back() results to be used as the insertion point, deletion target, or query target for those kfuncs. Verifier keeps existing owning-ref checks by default; only arguments annotated with __nonown_allowed bypass MEM_ALLOC/ref_obj_id checks and then follow the same list-node validation path. Signed-off-by: Kaitao Cheng Reviewed-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260521032306.97118-4-kaitao.cheng@linux.dev Signed-off-by: Alexei Starovoitov --- Documentation/bpf/kfuncs.rst | 22 ++++++++++++++++++++-- kernel/bpf/verifier.c | 13 +++++++++++++ 2 files changed, 33 insertions(+), 2 deletions(-) (limited to 'kernel/bpf') diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst index 75e6c078e0e7..3a9db1108b95 100644 --- a/Documentation/bpf/kfuncs.rst +++ b/Documentation/bpf/kfuncs.rst @@ -207,8 +207,26 @@ Here, the buffer may be NULL. If the buffer is not NULL, it must be at least buffer__szk bytes in size. The kfunc is responsible for checking if the buffer is NULL before using it. -2.3.5 __str Annotation ----------------------------- +2.3.5 __nonown_allowed Annotation +--------------------------------- + +This annotation is used to indicate that the parameter may be a non-owning reference. + +An example is given below:: + + __bpf_kfunc int bpf_list_add(..., struct bpf_list_node + *prev__nonown_allowed, ...) + { + ... + } + +For the ``prev__nonown_allowed`` parameter (resolved as ``KF_ARG_PTR_TO_LIST_NODE``), +suffix ``__nonown_allowed`` retains the usual owning-pointer rules and also +permits a non-owning reference with no ref_obj_id (e.g. the return value of +bpf_list_front() / bpf_list_back()). + +2.3.6 __str Annotation +---------------------- This annotation is used to indicate that the argument is a constant string. An example is given below:: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8dd79b735a69..f3cf8d85bea0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10714,6 +10714,11 @@ static bool is_kfunc_arg_nullable(const struct btf *btf, const struct btf_param return btf_param_match_suffix(btf, arg, "__nullable"); } +static bool is_kfunc_arg_nonown_allowed(const struct btf *btf, const struct btf_param *arg) +{ + return btf_param_match_suffix(btf, arg, "__nonown_allowed"); +} + static bool is_kfunc_arg_const_str(const struct btf *btf, const struct btf_param *arg) { return btf_param_match_suffix(btf, arg, "__str"); @@ -12244,6 +12249,13 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return ret; break; case KF_ARG_PTR_TO_LIST_NODE: + if (is_kfunc_arg_nonown_allowed(btf, &args[i]) && + type_is_non_owning_ref(reg->type) && !reg->ref_obj_id) { + /* Allow bpf_list_front/back return value for + * __nonown_allowed list-node arguments. + */ + goto check_ok; + } if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { verbose(env, "%s expected pointer to allocated object\n", reg_arg_name(env, argno)); @@ -12253,6 +12265,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ verbose(env, "allocated object must be referenced\n"); return -EINVAL; } +check_ok: ret = process_kf_arg_ptr_to_list_node(env, reg, argno, meta); if (ret < 0) return ret; -- cgit v1.2.3 From 187baa10963ac9f1db5123fa2ab761ab34ea06b9 Mon Sep 17 00:00:00 2001 From: Kaitao Cheng Date: Thu, 21 May 2026 11:23:02 +0800 Subject: bpf: Introduce the bpf_list_del kfunc. Allow users to remove any node from a linked list. We have added an additional parameter bpf_list_head *head to bpf_list_del, as the verifier requires the head parameter to check whether the lock is being held. Signed-off-by: Kaitao Cheng Reviewed-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260521032306.97118-5-kaitao.cheng@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 10 ++++++++++ kernel/bpf/verifier.c | 6 +++++- 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 59855b434f0b..804c201c28f3 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2601,6 +2601,15 @@ __bpf_kfunc struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head) return __bpf_list_del(head, h->prev); } +__bpf_kfunc struct bpf_list_node *bpf_list_del(struct bpf_list_head *head, + struct bpf_list_node *node__nonown_allowed) +{ + struct bpf_list_node_kern *kn = (void *)node__nonown_allowed; + + /* verifier guarantees node is a list node rather than list head */ + return __bpf_list_del(head, &kn->list_head); +} + __bpf_kfunc struct bpf_list_node *bpf_list_front(struct bpf_list_head *head) { struct list_head *h = (struct list_head *)head; @@ -4733,6 +4742,7 @@ BTF_ID_FLAGS(func, bpf_list_push_back, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_list_push_back_impl) BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_list_del, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_list_front, KF_RET_NULL) BTF_ID_FLAGS(func, bpf_list_back, KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f3cf8d85bea0..35eebb5e7769 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10961,6 +10961,7 @@ enum special_kfunc_type { KF_bpf_list_push_back, KF_bpf_list_pop_front, KF_bpf_list_pop_back, + KF_bpf_list_del, KF_bpf_list_front, KF_bpf_list_back, KF_bpf_cast_to_kern_ctx, @@ -11029,6 +11030,7 @@ BTF_ID(func, bpf_list_push_back_impl) BTF_ID(func, bpf_list_push_back) BTF_ID(func, bpf_list_pop_front) BTF_ID(func, bpf_list_pop_back) +BTF_ID(func, bpf_list_del) BTF_ID(func, bpf_list_front) BTF_ID(func, bpf_list_back) BTF_ID(func, bpf_cast_to_kern_ctx) @@ -11549,6 +11551,7 @@ static bool is_bpf_list_api_kfunc(u32 btf_id) return is_bpf_list_push_kfunc(btf_id) || btf_id == special_kfunc_list[KF_bpf_list_pop_front] || btf_id == special_kfunc_list[KF_bpf_list_pop_back] || + btf_id == special_kfunc_list[KF_bpf_list_del] || btf_id == special_kfunc_list[KF_bpf_list_front] || btf_id == special_kfunc_list[KF_bpf_list_back]; } @@ -11671,7 +11674,8 @@ static bool check_kfunc_is_graph_node_api(struct bpf_verifier_env *env, switch (node_field_type) { case BPF_LIST_NODE: - ret = is_bpf_list_push_kfunc(kfunc_btf_id); + ret = is_bpf_list_push_kfunc(kfunc_btf_id) || + kfunc_btf_id == special_kfunc_list[KF_bpf_list_del]; break; case BPF_RB_NODE: ret = (is_bpf_rbtree_add_kfunc(kfunc_btf_id) || -- cgit v1.2.3 From e6919ff67c1e612ec1ce3be51dba6e7ffc47997a Mon Sep 17 00:00:00 2001 From: Kaitao Cheng Date: Thu, 21 May 2026 11:23:03 +0800 Subject: bpf: refactor __bpf_list_add to take insertion point via **prev_ptr Refactor __bpf_list_add to accept (node, head, struct list_head **prev_ptr, ..) instead of (node, head, bool tail, ..). Load prev from *prev_ptr after INIT_LIST_HEAD(h), so we never dereference an uninitialized h->prev when head was 0-initialized (e.g. push_back passes &h->prev). When prev is not the list head, validate that prev is in the list via its owner. Prepares for bpf_list_add(head, new, prev, ..) to insert after a given list node. Signed-off-by: Kaitao Cheng Reviewed-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260521032306.97118-6-kaitao.cheng@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 36 ++++++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 10 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 804c201c28f3..1c69476c8a09 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2478,9 +2478,11 @@ __bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta static int __bpf_list_add(struct bpf_list_node_kern *node, struct bpf_list_head *head, - bool tail, struct btf_record *rec, u64 off) + struct list_head **prev_ptr, + struct btf_record *rec, u64 off) { struct list_head *n = &node->list_head, *h = (void *)head; + struct list_head *prev; /* If list_head was 0-initialized by map, bpf_obj_init_field wasn't * called on its fields, so init here @@ -2488,19 +2490,31 @@ static int __bpf_list_add(struct bpf_list_node_kern *node, if (unlikely(!h->next)) INIT_LIST_HEAD(h); + prev = *prev_ptr; + + /* When prev is not the list head, it must be a node in this list. */ + if (prev != h) { + struct bpf_list_node_kern *prev_kn = + container_of(prev, struct bpf_list_node_kern, list_head); + + if (unlikely(READ_ONCE(prev_kn->owner) != head)) + goto fail; + } + /* node->owner != NULL implies !list_empty(n), no need to separately * check the latter */ - if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) { - /* Only called from BPF prog, no need to migrate_disable */ - __bpf_obj_drop_impl((void *)n - off, rec, false); - return -EINVAL; - } + if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) + goto fail; - tail ? list_add_tail(n, h) : list_add(n, h); + list_add(n, prev); WRITE_ONCE(node->owner, head); - return 0; + +fail: + /* Only called from BPF prog, no need to migrate_disable */ + __bpf_obj_drop_impl((void *)n - off, rec, false); + return -EINVAL; } /** @@ -2521,8 +2535,9 @@ __bpf_kfunc int bpf_list_push_front(struct bpf_list_head *head, u64 off) { struct bpf_list_node_kern *n = (void *)node; + struct list_head *h = (void *)head; - return __bpf_list_add(n, head, false, meta ? meta->record : NULL, off); + return __bpf_list_add(n, head, &h, meta ? meta->record : NULL, off); } __bpf_kfunc int bpf_list_push_front_impl(struct bpf_list_head *head, @@ -2550,8 +2565,9 @@ __bpf_kfunc int bpf_list_push_back(struct bpf_list_head *head, u64 off) { struct bpf_list_node_kern *n = (void *)node; + struct list_head *h = (void *)head; - return __bpf_list_add(n, head, true, meta ? meta->record : NULL, off); + return __bpf_list_add(n, head, &h->prev, meta ? meta->record : NULL, off); } __bpf_kfunc int bpf_list_push_back_impl(struct bpf_list_head *head, -- cgit v1.2.3 From a3493ca504f16877bf29a123f27835c3f841a05f Mon Sep 17 00:00:00 2001 From: Kaitao Cheng Date: Thu, 21 May 2026 11:23:04 +0800 Subject: bpf: Add bpf_list_add to insert node after a given list node Add a new kfunc bpf_list_add(head, new, prev, meta, off) that inserts 'new' after 'prev' in the BPF linked list. Both must be in the same list; 'prev' must already be in the list. The new node must be an owning reference (e.g. from bpf_obj_new); the kfunc consumes that reference and the node becomes non-owning once inserted. We have added an additional parameter bpf_list_head *head to bpf_list_add, as the verifier requires the head parameter to check whether the lock is being held. Returns 0 on success, -EINVAL if 'prev' is not in a list or 'new' is already in a list (or duplicate insertion). On failure, the kernel drops the passed-in node. Signed-off-by: Kaitao Cheng Reviewed-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260521032306.97118-7-kaitao.cheng@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 11 +++++++++++ kernel/bpf/verifier.c | 12 +++++++++--- 2 files changed, 20 insertions(+), 3 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 1c69476c8a09..89579165ef4d 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2577,6 +2577,16 @@ __bpf_kfunc int bpf_list_push_back_impl(struct bpf_list_head *head, return bpf_list_push_back(head, node, meta__ign, off); } +__bpf_kfunc int bpf_list_add(struct bpf_list_head *head, struct bpf_list_node *new, + struct bpf_list_node *prev__nonown_allowed, + struct btf_struct_meta *meta, u64 off) +{ + struct bpf_list_node_kern *n = (void *)new, *p = (void *)prev__nonown_allowed; + struct list_head *prev_ptr = &p->list_head; + + return __bpf_list_add(n, head, &prev_ptr, meta ? meta->record : NULL, off); +} + static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, struct list_head *n) { @@ -4756,6 +4766,7 @@ BTF_ID_FLAGS(func, bpf_list_push_front, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_list_push_front_impl) BTF_ID_FLAGS(func, bpf_list_push_back, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_list_push_back_impl) +BTF_ID_FLAGS(func, bpf_list_add, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_list_del, KF_ACQUIRE | KF_RET_NULL) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 35eebb5e7769..662ad7312697 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10959,6 +10959,7 @@ enum special_kfunc_type { KF_bpf_list_push_front, KF_bpf_list_push_back_impl, KF_bpf_list_push_back, + KF_bpf_list_add, KF_bpf_list_pop_front, KF_bpf_list_pop_back, KF_bpf_list_del, @@ -11028,6 +11029,7 @@ BTF_ID(func, bpf_list_push_front_impl) BTF_ID(func, bpf_list_push_front) BTF_ID(func, bpf_list_push_back_impl) BTF_ID(func, bpf_list_push_back) +BTF_ID(func, bpf_list_add) BTF_ID(func, bpf_list_pop_front) BTF_ID(func, bpf_list_pop_back) BTF_ID(func, bpf_list_del) @@ -11140,7 +11142,8 @@ static bool is_bpf_list_push_kfunc(u32 func_id) return func_id == special_kfunc_list[KF_bpf_list_push_front] || func_id == special_kfunc_list[KF_bpf_list_push_front_impl] || func_id == special_kfunc_list[KF_bpf_list_push_back] || - func_id == special_kfunc_list[KF_bpf_list_push_back_impl]; + func_id == special_kfunc_list[KF_bpf_list_push_back_impl] || + func_id == special_kfunc_list[KF_bpf_list_add]; } static bool is_bpf_rbtree_add_kfunc(u32 func_id) @@ -19524,8 +19527,11 @@ int bpf_fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int struct_meta_reg = BPF_REG_3; int node_offset_reg = BPF_REG_4; - /* rbtree_add has extra 'less' arg, so args-to-fixup are in diff regs */ - if (is_bpf_rbtree_add_kfunc(desc->func_id)) { + /* list_add/rbtree_add have an extra arg (prev/less), + * so args-to-fixup are in diff regs. + */ + if (desc->func_id == special_kfunc_list[KF_bpf_list_add] || + is_bpf_rbtree_add_kfunc(desc->func_id)) { struct_meta_reg = BPF_REG_4; node_offset_reg = BPF_REG_5; } -- cgit v1.2.3 From 745515d386eb5e6891d9f91a92ad15dace3a33ef Mon Sep 17 00:00:00 2001 From: Kaitao Cheng Date: Thu, 21 May 2026 11:23:05 +0800 Subject: bpf: add bpf_list_is_first/last/empty kfuncs Add three kfuncs for BPF linked list queries: - bpf_list_is_first(head, node): true if node is the first in the list. - bpf_list_is_last(head, node): true if node is the last in the list. - bpf_list_empty(head): true if the list has no entries. Currently, without these kfuncs, to implement the above functionality it is necessary to first call bpf_list_pop_front/back to retrieve the first or last node before checking whether the passed-in node was the first or last one. After the check, the node had to be pushed back into the list using bpf_list_push_front/back, which was very inefficient. Now, with the bpf_list_is_first/last/empty kfuncs, we can directly check whether a node is the first, last, or whether the list is empty, without having to first retrieve the node. Signed-off-by: Kaitao Cheng Reviewed-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260521032306.97118-8-kaitao.cheng@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 40 ++++++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 15 +++++++++++++-- 2 files changed, 53 insertions(+), 2 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 89579165ef4d..b6c3d02d5593 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2656,6 +2656,43 @@ __bpf_kfunc struct bpf_list_node *bpf_list_back(struct bpf_list_head *head) return (struct bpf_list_node *)h->prev; } +__bpf_kfunc bool bpf_list_is_first(struct bpf_list_head *head, + struct bpf_list_node *node__nonown_allowed) +{ + struct list_head *h = (struct list_head *)head; + struct bpf_list_node_kern *kn = (struct bpf_list_node_kern *)node__nonown_allowed; + + if (READ_ONCE(kn->owner) != head) + return false; + + return list_is_first(&kn->list_head, h); +} + +__bpf_kfunc bool bpf_list_is_last(struct bpf_list_head *head, + struct bpf_list_node *node__nonown_allowed) +{ + struct list_head *h = (struct list_head *)head; + struct bpf_list_node_kern *kn = (struct bpf_list_node_kern *)node__nonown_allowed; + + if (READ_ONCE(kn->owner) != head) + return false; + + return list_is_last(&kn->list_head, h); +} + +__bpf_kfunc bool bpf_list_empty(struct bpf_list_head *head) +{ + struct list_head *h = (struct list_head *)head; + + /* If list_head was 0-initialized by map, bpf_obj_init_field wasn't + * called on its fields, so init here + */ + if (unlikely(!h->next)) + INIT_LIST_HEAD(h); + + return list_empty(h); +} + __bpf_kfunc struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root, struct bpf_rb_node *node) { @@ -4772,6 +4809,9 @@ BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_list_del, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_list_front, KF_RET_NULL) BTF_ID_FLAGS(func, bpf_list_back, KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_list_is_first) +BTF_ID_FLAGS(func, bpf_list_is_last) +BTF_ID_FLAGS(func, bpf_list_empty) BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_rbtree_remove, KF_ACQUIRE | KF_RET_NULL) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 662ad7312697..d9bdc3b32c05 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10965,6 +10965,9 @@ enum special_kfunc_type { KF_bpf_list_del, KF_bpf_list_front, KF_bpf_list_back, + KF_bpf_list_is_first, + KF_bpf_list_is_last, + KF_bpf_list_empty, KF_bpf_cast_to_kern_ctx, KF_bpf_rdonly_cast, KF_bpf_rcu_read_lock, @@ -11035,6 +11038,9 @@ BTF_ID(func, bpf_list_pop_back) BTF_ID(func, bpf_list_del) BTF_ID(func, bpf_list_front) BTF_ID(func, bpf_list_back) +BTF_ID(func, bpf_list_is_first) +BTF_ID(func, bpf_list_is_last) +BTF_ID(func, bpf_list_empty) BTF_ID(func, bpf_cast_to_kern_ctx) BTF_ID(func, bpf_rdonly_cast) BTF_ID(func, bpf_rcu_read_lock) @@ -11556,7 +11562,10 @@ static bool is_bpf_list_api_kfunc(u32 btf_id) btf_id == special_kfunc_list[KF_bpf_list_pop_back] || btf_id == special_kfunc_list[KF_bpf_list_del] || btf_id == special_kfunc_list[KF_bpf_list_front] || - btf_id == special_kfunc_list[KF_bpf_list_back]; + btf_id == special_kfunc_list[KF_bpf_list_back] || + btf_id == special_kfunc_list[KF_bpf_list_is_first] || + btf_id == special_kfunc_list[KF_bpf_list_is_last] || + btf_id == special_kfunc_list[KF_bpf_list_empty]; } static bool is_bpf_rbtree_api_kfunc(u32 btf_id) @@ -11678,7 +11687,9 @@ static bool check_kfunc_is_graph_node_api(struct bpf_verifier_env *env, switch (node_field_type) { case BPF_LIST_NODE: ret = is_bpf_list_push_kfunc(kfunc_btf_id) || - kfunc_btf_id == special_kfunc_list[KF_bpf_list_del]; + kfunc_btf_id == special_kfunc_list[KF_bpf_list_del] || + kfunc_btf_id == special_kfunc_list[KF_bpf_list_is_first] || + kfunc_btf_id == special_kfunc_list[KF_bpf_list_is_last]; break; case BPF_RB_NODE: ret = (is_bpf_rbtree_add_kfunc(kfunc_btf_id) || -- cgit v1.2.3 From dc11a4dba2464e5144c318ffaf7fb16b1a5c74d6 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 22 May 2026 07:22:13 -1000 Subject: bpf: Recover arena kernel faults with scratch page BPF arena usage is becoming more prevalent, but kernel <-> BPF communication over arena memory is awkward today. Data has to be staged through a trusted kernel pointer with extra code and copying on the BPF side. While reads through arena pointers can use a fault-safe helper, writes don't have a good solution. The in-line alternative would need instruction emulation or asm fixup labels. Enable direct kernel-side reads and writes within GUARD_SZ / 2 of any handed-in arena pointer, without bounds checking. A per-arena scratch page is installed by the arch fault path into empty arena kernel PTEs - x86 from page_fault_oops() for not-present faults, arm64 from __do_kernel_fault() for translation faults, both after the existing exception-table and KFENCE handling. The faulting instruction retries and the access is also reported through the program's BPF stream, preserving error reporting. bpf_prog_find_from_stack() resolves the current BPF program (and its arena) from the kernel stack - no new bpf_run_ctx state is added. Recovery covers the 4 GiB arena plus the upper half-guard (GUARD_SZ / 2). The lower half-guard is excluded because well-behaved kfuncs only access forward from arena pointers. The kfunc-author contract - access at most GUARD_SZ / 2 past a handed-in pointer - is documented in Documentation/bpf/kfuncs.rst. The install is lock-free via ptep_try_set(). On race-loss the winning installer's PTE is already valid, so the access retry succeeds. The arena clear path uses ptep_get_and_clear() so installer and clearer race through atomic accessors. No flush_tlb_kernel_range() afterwards. Stale "not mapped" entries just cause one extra re-fault, cheaper than a global IPI on every install. Scratch exists only to keep the kernel from oopsing on an in-line arena access. Its presence at a PTE means the BPF program has already malfunctioned, and the violation is reported through the program's BPF stream. The only requirement for behavior on a scratched PTE is that the kernel doesn't crash. In particular, any user-side access through such a PTE may segfault. The shared scratch page is freed once during map destruction. BPF instruction faults continue to use the existing JIT exception-table path. This patch changes only the kernel-text fault path. No UAPI flag is added. The new behavior is the default. v2: Use ptep_get_and_clear() in apply_range_clear_cb(). (David) v3: Stub bpf_arena_handle_page_fault() for !CONFIG_BPF_SYSCALL. (lkp) Suggested-by: Alexei Starovoitov Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Tejun Heo Reviewed-by: Emil Tsalapatis Cc: David Hildenbrand Link: https://lore.kernel.org/r/20260522172219.1423324-3-tj@kernel.org Signed-off-by: Alexei Starovoitov --- Documentation/bpf/kfuncs.rst | 14 ++++ arch/arm64/mm/fault.c | 10 ++- arch/x86/mm/fault.c | 12 ++- include/linux/bpf.h | 1 + include/linux/bpf_defs.h | 19 +++++ kernel/bpf/arena.c | 177 +++++++++++++++++++++++++++++++++---------- kernel/bpf/core.c | 5 ++ 7 files changed, 191 insertions(+), 47 deletions(-) create mode 100644 include/linux/bpf_defs.h (limited to 'kernel/bpf') diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst index 75e6c078e0e7..6d497e720998 100644 --- a/Documentation/bpf/kfuncs.rst +++ b/Documentation/bpf/kfuncs.rst @@ -462,6 +462,20 @@ In order to accommodate such requirements, the verifier will enforce strict PTR_TO_BTF_ID type matching if two types have the exact same name, with one being suffixed with ``___init``. +2.8 Accessing arena memory through kfunc arguments +-------------------------------------------------- + +A read or write at any address inside an arena does not oops the kernel. +Unallocated arena pages are lazily backed by a scratch page and the +access is reported through the program's BPF stream as an error. Only +the BPF program's correctness is affected; the kernel itself remains +intact. + +The arena is followed by a ``GUARD_SZ / 2`` (32 KiB) guard region that +is also covered by this recovery. A kfunc handed an arena pointer may +therefore access up to ``GUARD_SZ / 2`` past it without bounds-checking +against the arena. Larger accesses must verify the range explicitly. + .. _BPF_kfunc_lifecycle_expectations: 3. kfunc lifecycle expectations diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 0f3c5c7ca054..b4290d16ff92 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -9,6 +9,7 @@ #include #include +#include #include #include #include @@ -436,9 +437,12 @@ static void __do_kernel_fault(unsigned long addr, unsigned long esr, } else if (is_pkvm_stage2_abort(esr)) { msg = "access to hypervisor-protected memory"; } else { - if (esr_fsc_is_translation_fault(esr) && - kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs)) - return; + if (esr_fsc_is_translation_fault(esr)) { + if (kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs)) + return; + if (bpf_arena_handle_page_fault(addr, esr & ESR_ELx_WNR, regs->pc)) + return; + } msg = "paging request"; } diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 63de8e8684f2..7ea6a9362173 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -8,6 +8,7 @@ #include /* task_stack_*(), ... */ #include /* oops_begin/end, ... */ #include /* max_low_pfn */ +#include /* bpf_arena_handle_page_fault */ #include /* kfence_handle_page_fault */ #include /* NOKPROBE_SYMBOL, ... */ #include /* kmmio_handler, ... */ @@ -688,10 +689,13 @@ page_fault_oops(struct pt_regs *regs, unsigned long error_code, if (IS_ENABLED(CONFIG_EFI)) efi_crash_gracefully_on_page_fault(address, regs); - /* Only not-present faults should be handled by KFENCE. */ - if (!(error_code & X86_PF_PROT) && - kfence_handle_page_fault(address, error_code & X86_PF_WRITE, regs)) - return; + /* Only not-present faults should be handled by KFENCE or BPF arena. */ + if (!(error_code & X86_PF_PROT)) { + if (kfence_handle_page_fault(address, error_code & X86_PF_WRITE, regs)) + return; + if (bpf_arena_handle_page_fault(address, error_code & X86_PF_WRITE, regs->ip)) + return; + } oops: /* diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 01e203964892..bb4261a5df64 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -6,6 +6,7 @@ #include #include +#include #include #include diff --git a/include/linux/bpf_defs.h b/include/linux/bpf_defs.h new file mode 100644 index 000000000000..2185cd3966d4 --- /dev/null +++ b/include/linux/bpf_defs.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Subset of bpf.h declarations, split out so files that need only these + * declarations can avoid bpf.h's full include cost. + */ +#ifndef _LINUX_BPF_DEFS_H +#define _LINUX_BPF_DEFS_H + +#ifdef CONFIG_BPF_SYSCALL +bool bpf_arena_handle_page_fault(unsigned long addr, bool is_write, unsigned long fault_ip); +#else +static inline bool bpf_arena_handle_page_fault(unsigned long addr, bool is_write, + unsigned long fault_ip) +{ + return false; +} +#endif + +#endif /* _LINUX_BPF_DEFS_H */ diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 49a8f7b1beef..2da2c275cff6 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -53,6 +53,7 @@ struct bpf_arena { u64 user_vm_start; u64 user_vm_end; struct vm_struct *kern_vm; + struct page *scratch_page; struct range_tree rt; /* protects rt */ rqspinlock_t spinlock; @@ -118,6 +119,11 @@ struct apply_range_data { int i; }; +struct clear_range_data { + struct llist_head *free_pages; + struct page *scratch_page; +}; + static int apply_range_set_cb(pte_t *pte, unsigned long addr, void *data) { struct apply_range_data *d = data; @@ -144,33 +150,59 @@ static void flush_vmap_cache(unsigned long start, unsigned long size) flush_cache_vmap(start, start + size); } -static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *free_pages) +static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *data) { + struct clear_range_data *d = data; pte_t old_pte; struct page *page; - /* sanity check */ - old_pte = ptep_get(pte); + /* + * Pairs with ptep_try_set() in the kernel-fault scratch installer. + * Both sides must be atomic. + */ + old_pte = ptep_get_and_clear(&init_mm, addr, pte); if (pte_none(old_pte) || !pte_present(old_pte)) - return 0; /* nothing to do */ + return 0; page = pte_page(old_pte); if (WARN_ON_ONCE(!page)) return -EINVAL; - pte_clear(&init_mm, addr, pte); + /* + * Skip the per-arena scratch page. A kernel fault on an unallocated uaddr + * scratches its PTE. A later bpf_arena_free_pages() over that range walks + * here. Without the skip, scratch_page would be freed. + */ + if (page == d->scratch_page) + return 0; + + __llist_add(&page->pcp_llist, d->free_pages); + return 0; +} - /* Add page to the list so it is freed later */ - if (free_pages) - __llist_add(&page->pcp_llist, free_pages); +static int apply_range_set_scratch_cb(pte_t *pte, unsigned long addr, void *data) +{ + struct page *scratch_page = data; + if (!pte_none(ptep_get(pte))) + return 0; + /* + * Best-effort install. ptep_try_set() returns false only if another + * installer (real allocation or concurrent fault) won the cmpxchg. + * Their PTE is already valid, so the access retry succeeds. + * + * No flush_tlb_kernel_range() needed. Stale "not mapped" entries just + * cause one extra re-fault through this same path. + */ + ptep_try_set(pte, mk_pte(scratch_page, PAGE_KERNEL)); return 0; } static int populate_pgtable_except_pte(struct bpf_arena *arena) { + /* Populate intermediates for the recovery range (4 GiB + upper half-guard). */ return apply_to_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena), - KERN_VM_SZ - GUARD_SZ, apply_range_set_cb, NULL); + SZ_4G + GUARD_SZ / 2, apply_range_set_cb, NULL); } static struct bpf_map *arena_map_alloc(union bpf_attr *attr) @@ -221,22 +253,29 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr) init_irq_work(&arena->free_irq, arena_free_irq); INIT_WORK(&arena->free_work, arena_free_worker); bpf_map_init_from_attr(&arena->map, attr); + + err = bpf_map_alloc_pages(&arena->map, NUMA_NO_NODE, 1, &arena->scratch_page); + if (err) + goto err_free_arena; + range_tree_init(&arena->rt); err = range_tree_set(&arena->rt, 0, attr->max_entries); - if (err) { - bpf_map_area_free(arena); - goto err; - } + if (err) + goto err_free_scratch; mutex_init(&arena->lock); raw_res_spin_lock_init(&arena->spinlock); err = populate_pgtable_except_pte(arena); - if (err) { - range_tree_destroy(&arena->rt); - bpf_map_area_free(arena); - goto err; - } + if (err) + goto err_destroy_rt; return &arena->map; + +err_destroy_rt: + range_tree_destroy(&arena->rt); +err_free_scratch: + __free_page(arena->scratch_page); +err_free_arena: + bpf_map_area_free(arena); err: free_vm_area(kern_vm); return ERR_PTR(err); @@ -244,6 +283,7 @@ err: static int existing_page_cb(pte_t *ptep, unsigned long addr, void *data) { + struct bpf_arena *arena = data; struct page *page; pte_t pte; @@ -251,6 +291,12 @@ static int existing_page_cb(pte_t *ptep, unsigned long addr, void *data) if (!pte_present(pte)) /* sanity check */ return 0; page = pte_page(pte); + /* + * Skip the scratch page. The walk is page-table-driven, not range-tree-driven, + * so it can visit scratch PTEs at uaddrs the BPF program never allocated. + */ + if (page == arena->scratch_page) + return 0; /* * We do not update pte here: * 1. Nobody should be accessing bpf_arena's range outside of a kernel bug @@ -286,9 +332,10 @@ static void arena_map_free(struct bpf_map *map) * free those pages. */ apply_to_existing_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena), - KERN_VM_SZ - GUARD_SZ, existing_page_cb, NULL); + SZ_4G + GUARD_SZ / 2, existing_page_cb, arena); free_vm_area(arena->kern_vm); range_tree_destroy(&arena->rt); + __free_page(arena->scratch_page); bpf_map_area_free(arena); } @@ -384,33 +431,37 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf) return VM_FAULT_RETRY; page = vmalloc_to_page((void *)kaddr); - if (page) + if (page) { + if (page == arena->scratch_page) + /* BPF triggered scratch here; don't lazy-alloc over it */ + goto out_sigsegv; /* already have a page vmap-ed */ goto out; + } bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg); if (arena->map.map_flags & BPF_F_SEGV_ON_FAULT) /* User space requested to segfault when page is not allocated by bpf prog */ - goto out_unlock_sigsegv; + goto out_sigsegv_memcg; ret = range_tree_clear(&arena->rt, vmf->pgoff, 1); if (ret) - goto out_unlock_sigsegv; + goto out_sigsegv_memcg; struct apply_range_data data = { .pages = &page, .i = 0 }; /* Account into memcg of the process that created bpf_arena */ ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page); if (ret) { range_tree_set(&arena->rt, vmf->pgoff, 1); - goto out_unlock_sigsegv; + goto out_sigsegv_memcg; } ret = apply_to_page_range(&init_mm, kaddr, PAGE_SIZE, apply_range_set_cb, &data); if (ret) { range_tree_set(&arena->rt, vmf->pgoff, 1); free_pages_nolock(page, 0); - goto out_unlock_sigsegv; + goto out_sigsegv_memcg; } flush_vmap_cache(kaddr, PAGE_SIZE); bpf_map_memcg_exit(old_memcg, new_memcg); @@ -419,8 +470,9 @@ out: raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); vmf->page = page; return 0; -out_unlock_sigsegv: +out_sigsegv_memcg: bpf_map_memcg_exit(old_memcg, new_memcg); +out_sigsegv: raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); return VM_FAULT_SIGSEGV; } @@ -685,6 +737,7 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, struct llist_head free_pages; struct llist_node *pos, *t; struct arena_free_span *s; + struct clear_range_data cdata; unsigned long flags; int ret = 0; @@ -713,9 +766,11 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, range_tree_set(&arena->rt, pgoff, page_cnt); init_llist_head(&free_pages); + cdata.free_pages = &free_pages; + cdata.scratch_page = arena->scratch_page; /* clear ptes and collect struct pages */ apply_to_existing_page_range(&init_mm, kaddr, page_cnt << PAGE_SHIFT, - apply_range_clear_cb, &free_pages); + apply_range_clear_cb, &cdata); /* drop the lock to do the tlb flush and zap pages */ raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); @@ -805,6 +860,7 @@ static void arena_free_worker(struct work_struct *work) struct arena_free_span *s; u64 arena_vm_start, user_vm_start; struct llist_head free_pages; + struct clear_range_data cdata; struct page *page; unsigned long full_uaddr; long kaddr, page_cnt, pgoff; @@ -818,6 +874,8 @@ static void arena_free_worker(struct work_struct *work) bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg); init_llist_head(&free_pages); + cdata.free_pages = &free_pages; + cdata.scratch_page = arena->scratch_page; arena_vm_start = bpf_arena_get_kern_vm_start(arena); user_vm_start = bpf_arena_get_user_vm_start(arena); @@ -830,7 +888,7 @@ static void arena_free_worker(struct work_struct *work) /* clear ptes and collect pages in free_pages llist */ apply_to_existing_page_range(&init_mm, kaddr, page_cnt << PAGE_SHIFT, - apply_range_clear_cb, &free_pages); + apply_range_clear_cb, &cdata); range_tree_set(&arena->rt, pgoff, page_cnt); } @@ -945,23 +1003,12 @@ static int __init kfunc_init(void) } late_initcall(kfunc_init); -void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip) +static void __bpf_prog_report_arena_violation(struct bpf_prog *prog, bool write, + unsigned long addr, unsigned long fault_ip) { struct bpf_stream_stage ss; - struct bpf_prog *prog; u64 user_vm_start; - /* - * The RCU read lock is held to safely traverse the latch tree, but we - * don't need its protection when accessing the prog, since it will not - * disappear while we are handling the fault. - */ - rcu_read_lock(); - prog = bpf_prog_ksym_find(fault_ip); - rcu_read_unlock(); - if (!prog) - return; - /* Use main prog for stream access */ prog = prog->aux->main_prog_aux->prog; @@ -974,3 +1021,53 @@ void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned lo bpf_stream_dump_stack(ss); })); } + +bool bpf_arena_handle_page_fault(unsigned long addr, bool is_write, unsigned long fault_ip) +{ + struct bpf_arena *arena; + struct bpf_prog *prog; + unsigned long kbase; + unsigned long page_addr = addr & PAGE_MASK; + + prog = bpf_prog_find_from_stack(); + if (!prog) + return false; + + arena = prog->aux->arena; + /* a prog not using arena may be on stack, so arena can be NULL */ + if (!arena) + return false; + + kbase = bpf_arena_get_kern_vm_start(arena); + + /* + * Recovery covers the 4 GiB mappable band plus the upper half-guard. + * Lower guard is unreachable from kfuncs; an address there indicates + * a different bug class - leave it to the regular kernel oops path. + */ + if (page_addr < kbase || page_addr >= kbase + SZ_4G + GUARD_SZ / 2) + return false; + + apply_to_page_range(&init_mm, page_addr, PAGE_SIZE, + apply_range_set_scratch_cb, arena->scratch_page); + flush_vmap_cache(page_addr, PAGE_SIZE); + __bpf_prog_report_arena_violation(prog, is_write, page_addr - kbase, fault_ip); + return true; +} + +void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip) +{ + struct bpf_prog *prog; + + /* + * The RCU read lock is held to safely traverse the latch tree, but we + * don't need its protection when accessing the prog, since it will not + * disappear while we are handling the fault. + */ + rcu_read_lock(); + prog = bpf_prog_ksym_find(fault_ip); + rcu_read_unlock(); + if (!prog) + return; + __bpf_prog_report_arena_violation(prog, write, addr, fault_ip); +} diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 8b018ff48875..fc3ee67486ce 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -3350,6 +3350,11 @@ __weak u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena) { return 0; } +__weak bool bpf_arena_handle_page_fault(unsigned long addr, bool is_write, + unsigned long fault_ip) +{ + return false; +} #ifdef CONFIG_BPF_SYSCALL static int __init bpf_global_ma_init(void) -- cgit v1.2.3 From f211c81ddc368e5cc6ad69d171bca0fa52e71ad7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2026 07:22:14 -1000 Subject: bpf: Add sleepable variant of bpf_arena_alloc_pages for kernel callers The existing kernel-side export of bpf_arena_alloc_pages is _non_sleepable only - it's used by the verifier to inline the kfunc when the call site is non-sleepable. There is no sleepable equivalent for kernel callers. The kfunc bpf_arena_alloc_pages itself is BPF-only. sched_ext needs sleepable kernel-side allocs for its arena pool init/grow paths. Add bpf_arena_alloc_pages_sleepable() mirroring the _non_sleepable wrapper but passing sleepable=true to arena_alloc_pages(). Signed-off-by: Tejun Heo Reviewed-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260522172219.1423324-4-tj@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 8 ++++++++ kernel/bpf/arena.c | 13 +++++++++++++ 2 files changed, 21 insertions(+) (limited to 'kernel/bpf') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index bb4261a5df64..c00be24e7244 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -679,6 +679,8 @@ int bpf_dynptr_from_file_sleepable(struct file *file, u32 flags, void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 page_cnt, int node_id, u64 flags); void bpf_arena_free_pages_non_sleepable(void *p__map, void *ptr__ign, u32 page_cnt); +void *bpf_arena_alloc_pages_sleepable(void *p__map, void *addr__ign, u32 page_cnt, int node_id, + u64 flags); #else static inline void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 page_cnt, int node_id, u64 flags) @@ -689,6 +691,12 @@ static inline void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr static inline void bpf_arena_free_pages_non_sleepable(void *p__map, void *ptr__ign, u32 page_cnt) { } + +static inline void *bpf_arena_alloc_pages_sleepable(void *p__map, void *addr__ign, u32 page_cnt, + int node_id, u64 flags) +{ + return NULL; +} #endif extern const struct bpf_map_ops bpf_map_offload_ops; diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 2da2c275cff6..9e379ef27d41 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -951,6 +951,19 @@ void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 pag return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, false); } + +void *bpf_arena_alloc_pages_sleepable(void *p__map, void *addr__ign, u32 page_cnt, + int node_id, u64 flags) +{ + struct bpf_map *map = p__map; + struct bpf_arena *arena = container_of(map, struct bpf_arena, map); + + if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt) + return NULL; + + return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, true); +} + __bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt) { struct bpf_map *map = p__map; -- cgit v1.2.3 From 7c48a28c1bbe26e272bc978a42adb757fc6aa639 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2026 07:22:15 -1000 Subject: bpf: Add bpf_struct_ops_for_each_prog() Add a helper that walks the member progs of the struct_ops map containing a given @kdata vmtable. struct_ops ->reg() callbacks (and similar) sometimes need to inspect the loaded BPF programs, e.g. to discover maps they reference via prog->aux->used_maps. The implementation mirrors bpf_struct_ops_id(): container_of @kdata to recover the bpf_struct_ops_map, then iterate st_map->links[i]->prog for i in [0, funcs_cnt). Same access pattern, no new locking - by the time ->reg() fires st_map is fully populated and stable. A sched_ext follow-up walks the member progs of a cid-form scheduler's struct_ops map, reads prog->aux->arena directly, and requires all member progs to reference exactly one arena, without requiring the BPF program to call a registration kfunc. Signed-off-by: Tejun Heo Reviewed-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260522172219.1423324-5-tj@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 +++ kernel/bpf/bpf_struct_ops.c | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+) (limited to 'kernel/bpf') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c00be24e7244..491cc6750504 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2131,6 +2131,9 @@ int bpf_prog_assoc_struct_ops(struct bpf_prog *prog, struct bpf_map *map); void bpf_prog_disassoc_struct_ops(struct bpf_prog *prog); void *bpf_prog_get_assoc_struct_ops(const struct bpf_prog_aux *aux); u32 bpf_struct_ops_id(const void *kdata); +int bpf_struct_ops_for_each_prog(const void *kdata, + int (*cb)(struct bpf_prog *prog, void *data), + void *data); #ifdef CONFIG_NET /* Define it here to avoid the use of forward declaration */ diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 521cb9d7e8c7..5e51c1211673 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -1204,6 +1204,42 @@ u32 bpf_struct_ops_id(const void *kdata) } EXPORT_SYMBOL_GPL(bpf_struct_ops_id); +/** + * bpf_struct_ops_for_each_prog - Invoke @cb for each member prog + * @kdata: kernel-side struct_ops vmtable (the @kdata arg to ->reg/->update/->unreg) + * @cb: callback invoked once per member prog; non-zero return stops iteration + * @data: opaque argument passed to @cb + * + * Walks the struct_ops member progs registered on the map containing @kdata. + * Intended for use from struct_ops ->reg() callbacks (and similar) that need to + * inspect the loaded BPF programs (for example to discover maps they reference + * via @prog->aux->used_maps). + * + * Return 0 if iteration completed, otherwise the first non-zero @cb return. + */ +int bpf_struct_ops_for_each_prog(const void *kdata, + int (*cb)(struct bpf_prog *prog, void *data), + void *data) +{ + struct bpf_struct_ops_value *kvalue; + struct bpf_struct_ops_map *st_map; + u32 i; + int ret; + + kvalue = container_of(kdata, struct bpf_struct_ops_value, data); + st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue); + + for (i = 0; i < st_map->funcs_cnt; i++) { + if (!st_map->links[i]) + continue; + ret = cb(st_map->links[i]->prog, data); + if (ret) + return ret; + } + return 0; +} +EXPORT_SYMBOL_GPL(bpf_struct_ops_for_each_prog); + static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map) { struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; -- cgit v1.2.3 From 53cc12a2dc88c2c6f62f507548640885a70a56a8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2026 07:22:16 -1000 Subject: bpf/arena: Add bpf_arena_map_kern_vm_start() and bpf_prog_arena() struct bpf_arena is opaque to callers outside arena.c. Add two helpers for struct_ops subsystems that need to reach into an arena: bpf_arena_map_kern_vm_start(struct bpf_map *map) returns @map's kern_vm_start. A sched_ext follow-up needs this to translate kern_va <-> uaddr. bpf_prog_arena(struct bpf_prog *prog) returns the bpf_map of the arena referenced by @prog (NULL if @prog references no arena). The verifier enforces at most one arena per program. Used by struct_ops callers that auto-discover an arena from a member prog and need to take a map reference. Suggested-by: Kumar Kartikeya Dwivedi Signed-off-by: Tejun Heo Reviewed-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260522172219.1423324-6-tj@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ kernel/bpf/arena.c | 26 ++++++++++++++++++++++++++ 2 files changed, 28 insertions(+) (limited to 'kernel/bpf') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 491cc6750504..c323b3e027fe 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -618,6 +618,8 @@ void bpf_rb_root_free(const struct btf_field *field, void *rb_root, struct bpf_spin_lock *spin_lock); u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena); u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena); +u64 bpf_arena_map_kern_vm_start(struct bpf_map *map); +struct bpf_map *bpf_prog_arena(struct bpf_prog *prog); int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size); struct bpf_offload_dev; diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 9e379ef27d41..1727503b25d8 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -84,6 +84,32 @@ u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena) return arena ? arena->user_vm_start : 0; } +/** + * bpf_arena_map_kern_vm_start - kern_vm_start lookup by struct bpf_map * + * @map: a BPF_MAP_TYPE_ARENA map + * + * Return @map's kern_vm_start. + */ +u64 bpf_arena_map_kern_vm_start(struct bpf_map *map) +{ + return bpf_arena_get_kern_vm_start(container_of(map, struct bpf_arena, map)); +} + +/** + * bpf_prog_arena - return the bpf_map of the arena referenced by @prog + * @prog: a loaded BPF program + * + * The verifier enforces at most one arena per program and stores it in + * prog->aux->arena. Return that arena's underlying bpf_map, or NULL if + * @prog does not reference an arena. + */ +struct bpf_map *bpf_prog_arena(struct bpf_prog *prog) +{ + struct bpf_arena *arena = prog->aux->arena; + + return arena ? &arena->map : NULL; +} + static long arena_map_peek_elem(struct bpf_map *map, void *value) { return -EOPNOTSUPP; -- cgit v1.2.3 From e42e53ae23b7d41df22ccd7788192bf578f24da2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 27 May 2026 09:26:32 -1000 Subject: bpf: Fix bpf_arena_handle_page_fault() redefinition without CONFIG_BPF_SYSCALL On configs with CONFIG_BPF=y but CONFIG_BPF_SYSCALL=n (e.g. arm multi_v7_defconfig), kernel/bpf/core.c defines a __weak bpf_arena_handle_page_fault() while bpf_defs.h already supplies a static inline stub for it, causing a redefinition error. Build the __weak definition only under CONFIG_BPF_SYSCALL, matching the bpf_defs.h declaration and the CONFIG_BPF_SYSCALL-gated strong definition in arena.c. Fixes: dc11a4dba246 ("bpf: Recover arena kernel faults with scratch page") Reported-by: Mark Brown Signed-off-by: Tejun Heo Acked-by: Song Liu Link: https://lore.kernel.org/r/20260527192632.2109419-1-tj@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 8ecba2989d88..a656a8572bdb 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -3376,13 +3376,14 @@ __weak u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena) { return 0; } + +#ifdef CONFIG_BPF_SYSCALL __weak bool bpf_arena_handle_page_fault(unsigned long addr, bool is_write, unsigned long fault_ip) { return false; } -#ifdef CONFIG_BPF_SYSCALL static int __init bpf_global_ma_init(void) { int ret; -- cgit v1.2.3 From fc99547a8bda22a6a489284641385d8dcfb3ecd8 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Mon, 25 May 2026 15:39:46 -0700 Subject: bpf: Factor out stack_map build ID helpers Factor out helpers from stack_map_get_build_id_offset() in preparation for adding a sleepable build ID resolution path: stack_map_build_id_set_ip(), stack_map_build_id_offset(), and stack_map_build_id_set_valid(). While here, refactor stack_map_get_build_id_offset(): * use continue-driven control flow in the main loop and remove build_id_valid label * update prev_vma and prev_build_id on the fall-back-to-IP branch so the cache reflects the actual VMA seen on the previous IP [1] * guard fetch_build_id() with vma_is_anonymous() [2] to skip parse attempts that would otherwise fail the ELF magic check [1] https://lore.kernel.org/bpf/CAEf4Bzac9uWWqBvzH0iFzKvJcq3vxscZ3pKm0sUHmN-F-z9wVQ@mail.gmail.com/ [2] https://lore.kernel.org/bpf/226398c1ff3f2b686c0aeb010408d85fb15df13f9ff60a045bee31e79b9e41e9@mail.kernel.org/ Signed-off-by: Ihor Solodrai Signed-off-by: Andrii Nakryiko Acked-by: Mykyta Yatsenko Link: https://lore.kernel.org/bpf/20260525223948.1920986-2-ihor.solodrai@linux.dev --- kernel/bpf/stackmap.c | 57 ++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 40 insertions(+), 17 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index da3d328f5c15..e23be7d44503 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -152,6 +152,28 @@ static int fetch_build_id(struct vm_area_struct *vma, unsigned char *build_id, b : build_id_parse_nofault(vma, build_id, NULL); } +static inline void stack_map_build_id_set_ip(struct bpf_stack_build_id *id) +{ + id->status = BPF_STACK_BUILD_ID_IP; + memset(id->build_id, 0, BUILD_ID_SIZE_MAX); +} + +static inline u64 stack_map_build_id_offset(unsigned long vm_pgoff, + unsigned long vm_start, u64 ip) +{ + return (vm_pgoff << PAGE_SHIFT) + ip - vm_start; +} + +static inline void stack_map_build_id_set_valid(struct bpf_stack_build_id *id, + u64 offset, + const unsigned char *build_id) +{ + id->status = BPF_STACK_BUILD_ID_VALID; + id->offset = offset; + if (id->build_id != build_id) + memcpy(id->build_id, build_id, BUILD_ID_SIZE_MAX); +} + /* * Expects all id_offs[i].ip values to be set to correct initial IPs. * They will be subsequently: @@ -165,44 +187,45 @@ static int fetch_build_id(struct vm_area_struct *vma, unsigned char *build_id, b static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, u32 trace_nr, bool user, bool may_fault) { - int i; struct mmap_unlock_irq_work *work = NULL; bool irq_work_busy = bpf_mmap_unlock_get_irq_work(&work); + bool has_user_ctx = user && current && current->mm; struct vm_area_struct *vma, *prev_vma = NULL; - const char *prev_build_id; + const unsigned char *prev_build_id = NULL; + int i; /* If the irq_work is in use, fall back to report ips. Same * fallback is used for kernel stack (!user) on a stackmap with * build_id. */ - if (!user || !current || !current->mm || irq_work_busy || - !mmap_read_trylock(current->mm)) { + if (!has_user_ctx || irq_work_busy || !mmap_read_trylock(current->mm)) { /* cannot access current->mm, fall back to ips */ - for (i = 0; i < trace_nr; i++) { - id_offs[i].status = BPF_STACK_BUILD_ID_IP; - memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX); - } + for (i = 0; i < trace_nr; i++) + stack_map_build_id_set_ip(&id_offs[i]); return; } for (i = 0; i < trace_nr; i++) { u64 ip = READ_ONCE(id_offs[i].ip); + u64 offset; - if (range_in_vma(prev_vma, ip, ip)) { + if (prev_build_id && range_in_vma(prev_vma, ip, ip)) { vma = prev_vma; - memcpy(id_offs[i].build_id, prev_build_id, BUILD_ID_SIZE_MAX); - goto build_id_valid; + offset = stack_map_build_id_offset(vma->vm_pgoff, vma->vm_start, ip); + stack_map_build_id_set_valid(&id_offs[i], offset, prev_build_id); + continue; } vma = find_vma(current->mm, ip); - if (!vma || fetch_build_id(vma, id_offs[i].build_id, may_fault)) { + if (!vma || vma_is_anonymous(vma) || + fetch_build_id(vma, id_offs[i].build_id, may_fault)) { /* per entry fall back to ips */ - id_offs[i].status = BPF_STACK_BUILD_ID_IP; - memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX); + stack_map_build_id_set_ip(&id_offs[i]); + prev_vma = vma; + prev_build_id = NULL; continue; } -build_id_valid: - id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ip - vma->vm_start; - id_offs[i].status = BPF_STACK_BUILD_ID_VALID; + offset = stack_map_build_id_offset(vma->vm_pgoff, vma->vm_start, ip); + stack_map_build_id_set_valid(&id_offs[i], offset, id_offs[i].build_id); prev_vma = vma; prev_build_id = id_offs[i].build_id; } -- cgit v1.2.3 From fad3021faf7b0b64e9daea41c5662b65c8ad7379 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Mon, 25 May 2026 15:39:47 -0700 Subject: bpf: Avoid faultable build ID reads under mm locks Sleepable build ID parsing can block in __kernel_read() [1], so the stackmap sleepable path must not call it while holding mmap_lock or a per-VMA read lock. The issue and the fix are conceptually similar to a recent procfs patch [2]. A similar VMA locking pattern has already been used in PROCMAP_QUERY [3]. Resolve each covered VMA with a stable read-side reference, preferring lock_vma_under_rcu() and falling back to mmap_read_trylock() only long enough to acquire the VMA read lock. Take a reference to the backing file, drop the VMA lock, and then parse the build ID through (sleepable) build_id_parse_file(). We have to use mmap_read_trylock() (and give up on failure) in this context because taking mmap_read_lock() is generally unsafe on code paths reachable from BPF programs [4], and may lead to deadlocks. [1] https://lore.kernel.org/all/20251218005818.614819-1-shakeel.butt@linux.dev/ [2] https://lore.kernel.org/all/20260128183232.2854138-1-andrii@kernel.org/ [3] https://lore.kernel.org/all/20250808152850.2580887-1-surenb@google.com/ [4] https://lore.kernel.org/bpf/2895ecd8-df1e-4cc0-b9f9-aef893dc2360@linux.dev/ Fixes: d4dd9775ec24 ("bpf: wire up sleepable bpf_get_stack() and bpf_get_task_stack() helpers") Suggested-by: Puranjay Mohan Signed-off-by: Ihor Solodrai Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260525223948.1920986-3-ihor.solodrai@linux.dev --- kernel/bpf/stackmap.c | 109 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index e23be7d44503..c53cfd9a67cf 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "percpu_freelist.h" #include "mmap_unlock_work.h" @@ -174,6 +175,109 @@ static inline void stack_map_build_id_set_valid(struct bpf_stack_build_id *id, memcpy(id->build_id, build_id, BUILD_ID_SIZE_MAX); } +struct stack_map_vma_lock { + struct vm_area_struct *vma; + struct mm_struct *mm; +}; + +/* + * Acquire a stable read-side reference on the VMA covering @ip. + * + * With CONFIG_PER_VMA_LOCK=y this returns a VMA with its per-VMA read + * lock held and mmap_lock dropped, so the caller may sleep. + * + * With CONFIG_PER_VMA_LOCK=n it returns a VMA with mmap_lock still + * held; the caller must snapshot any fields it needs and pin vm_file + * with get_file() before stack_map_unlock_vma() drops mmap_lock, as + * the VMA may be split, merged, or freed after that. + * + * Returns NULL on failure, in which case no lock is held. + */ +static struct vm_area_struct * +stack_map_lock_vma(struct stack_map_vma_lock *lock, unsigned long ip) +{ + struct mm_struct *mm = lock->mm; + struct vm_area_struct *vma; + + /* noop under !CONFIG_PER_VMA_LOCK */ + vma = lock_vma_under_rcu(mm, ip); + if (vma) { + lock->vma = vma; + return vma; + } + + /* + * Taking mmap_read_lock() is unsafe here, because the caller BPF + * program might already hold it, causing a deadlock. + */ + if (!mmap_read_trylock(mm)) + return NULL; + + vma = vma_lookup(mm, ip); + if (!vma) { + mmap_read_unlock(mm); + return NULL; + } + +#ifdef CONFIG_PER_VMA_LOCK + if (!vma_start_read_locked(vma)) { + mmap_read_unlock(mm); + return NULL; + } + mmap_read_unlock(mm); +#endif + + lock->vma = vma; + return vma; +} + +static void stack_map_unlock_vma(struct stack_map_vma_lock *lock) +{ +#ifdef CONFIG_PER_VMA_LOCK + vma_end_read(lock->vma); +#else + mmap_read_unlock(lock->mm); +#endif + lock->vma = NULL; +} + +static void stack_map_get_build_id_offset_sleepable(struct bpf_stack_build_id *id_offs, + u32 trace_nr) +{ + struct mm_struct *mm = current->mm; + struct stack_map_vma_lock lock = { .mm = mm }; + struct vm_area_struct *vma; + struct file *file; + u64 offset; + u64 ip; + + for (u32 i = 0; i < trace_nr; i++) { + ip = READ_ONCE(id_offs[i].ip); + + vma = stack_map_lock_vma(&lock, ip); + if (!vma) { + stack_map_build_id_set_ip(&id_offs[i]); + continue; + } + if (vma_is_anonymous(vma) || !vma->vm_file) { + stack_map_build_id_set_ip(&id_offs[i]); + stack_map_unlock_vma(&lock); + continue; + } + + file = get_file(vma->vm_file); + offset = stack_map_build_id_offset(vma->vm_pgoff, vma->vm_start, ip); + stack_map_unlock_vma(&lock); + + /* build_id_parse_file() may block on filesystem reads */ + if (build_id_parse_file(file, id_offs[i].build_id, NULL)) + stack_map_build_id_set_ip(&id_offs[i]); + else + stack_map_build_id_set_valid(&id_offs[i], offset, id_offs[i].build_id); + fput(file); + } +} + /* * Expects all id_offs[i].ip values to be set to correct initial IPs. * They will be subsequently: @@ -194,6 +298,11 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, const unsigned char *prev_build_id = NULL; int i; + if (may_fault && has_user_ctx) { + stack_map_get_build_id_offset_sleepable(id_offs, trace_nr); + return; + } + /* If the irq_work is in use, fall back to report ips. Same * fallback is used for kernel stack (!user) on a stackmap with * build_id. -- cgit v1.2.3 From 5e9099d8ff24ec32aa5af872a4d84d086bd70579 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Mon, 25 May 2026 15:39:48 -0700 Subject: bpf: Cache build IDs in sleepable stackmap path Stack traces often contain adjacent IPs from the same VMA or from different VMAs backed by the same ELF file. Cache the last successfully parsed build id together with the resolved VMA range and backing file so the sleepable build id path can avoid repeated VMA locking and file parsing in common cases. Suggested-by: Mykyta Yatsenko Signed-off-by: Ihor Solodrai Signed-off-by: Andrii Nakryiko Acked-by: Mykyta Yatsenko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260525223948.1920986-4-ihor.solodrai@linux.dev --- kernel/bpf/stackmap.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 55 insertions(+), 6 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index c53cfd9a67cf..77ba03216c09 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -246,6 +246,14 @@ static void stack_map_get_build_id_offset_sleepable(struct bpf_stack_build_id *i { struct mm_struct *mm = current->mm; struct stack_map_vma_lock lock = { .mm = mm }; + struct { + struct file *file; + const unsigned char *build_id; + unsigned long vm_start; + unsigned long vm_end; + unsigned long vm_pgoff; + } cache = {}; + unsigned long vm_pgoff, vm_start, vm_end; struct vm_area_struct *vma; struct file *file; u64 offset; @@ -254,6 +262,17 @@ static void stack_map_get_build_id_offset_sleepable(struct bpf_stack_build_id *i for (u32 i = 0; i < trace_nr; i++) { ip = READ_ONCE(id_offs[i].ip); + /* + * Range cache fast path: if ip falls within the previously + * resolved VMA range, reuse the cache build_id without + * re-acquiring the VMA lock. + */ + if (cache.build_id && ip >= cache.vm_start && ip < cache.vm_end) { + offset = stack_map_build_id_offset(cache.vm_pgoff, cache.vm_start, ip); + stack_map_build_id_set_valid(&id_offs[i], offset, cache.build_id); + continue; + } + vma = stack_map_lock_vma(&lock, ip); if (!vma) { stack_map_build_id_set_ip(&id_offs[i]); @@ -265,17 +284,47 @@ static void stack_map_get_build_id_offset_sleepable(struct bpf_stack_build_id *i continue; } - file = get_file(vma->vm_file); - offset = stack_map_build_id_offset(vma->vm_pgoff, vma->vm_start, ip); + file = vma->vm_file; + vm_pgoff = vma->vm_pgoff; + vm_start = vma->vm_start; + vm_end = vma->vm_end; + offset = stack_map_build_id_offset(vm_pgoff, vm_start, ip); + + /* + * Same backing file as previous (e.g. different VMAs + * of the same ELF binary). Reuse the cache build_id. + */ + if (file == cache.file) { + stack_map_unlock_vma(&lock); + stack_map_build_id_set_valid(&id_offs[i], offset, cache.build_id); + cache.vm_start = vm_start; + cache.vm_end = vm_end; + cache.vm_pgoff = vm_pgoff; + continue; + } + + file = get_file(file); stack_map_unlock_vma(&lock); /* build_id_parse_file() may block on filesystem reads */ - if (build_id_parse_file(file, id_offs[i].build_id, NULL)) + if (build_id_parse_file(file, id_offs[i].build_id, NULL)) { stack_map_build_id_set_ip(&id_offs[i]); - else - stack_map_build_id_set_valid(&id_offs[i], offset, id_offs[i].build_id); - fput(file); + fput(file); + continue; + } + + stack_map_build_id_set_valid(&id_offs[i], offset, id_offs[i].build_id); + if (cache.file) + fput(cache.file); + cache.file = file; + cache.build_id = id_offs[i].build_id; + cache.vm_start = vm_start; + cache.vm_end = vm_end; + cache.vm_pgoff = vm_pgoff; } + + if (cache.file) + fput(cache.file); } /* -- cgit v1.2.3 From 9b435d23f51e55b62dda3a345a9f8931248ca514 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Thu, 21 May 2026 22:29:09 +0800 Subject: bpf: Fix race between bpf_map_new_fd() and close_fd() Because there is time gap between bpf_map_new_fd() and close_fd(), a concurrent thread is able to close the new fd and opens a new, unrelated file with the exact same fd number. Thereafter, this close_fd() might inadvertently close the unrelated file. To avoid such regression, do finalize log before security_bpf_map_create(). However, in order to achieve it, move bpf_get_file_flag(), security_bpf_map_create(), bpf_map_alloc_id(), and bpf_map_new_fd() from __map_create() to map_create(). And, rename __map_create() to map_create_alloc() meanwhile. Then, in order to reuse the map and token when all checks pass in map_create_alloc(), pass "struct bpf_map **" and "struct bpf_token **" to map_create_alloc(). Fixes: 49f9b2b2a18c ("bpf: Add syscall common attributes support for map_create") Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260521142909.95818-1-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 80 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 47 insertions(+), 33 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8d4ea700aac6..9e91fb2fb492 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1359,7 +1359,8 @@ free_map_tab: #define BPF_MAP_CREATE_LAST_FIELD excl_prog_hash_size /* called via syscall */ -static int __map_create(union bpf_attr *attr, bpfptr_t uattr, struct bpf_verifier_log *log) +static int map_create_alloc(union bpf_attr *attr, bpfptr_t uattr, struct bpf_verifier_log *log, + struct bpf_map **mapp, struct bpf_token **tokenp) { const struct bpf_map_ops *ops; struct bpf_token *token = NULL; @@ -1367,7 +1368,6 @@ static int __map_create(union bpf_attr *attr, bpfptr_t uattr, struct bpf_verifie u32 map_type = attr->map_type; struct bpf_map *map; bool token_flag; - int f_flags; int err; err = CHECK_ATTR(BPF_MAP_CREATE); @@ -1403,10 +1403,6 @@ static int __map_create(union bpf_attr *attr, bpfptr_t uattr, struct bpf_verifie return -EINVAL; } - f_flags = bpf_get_file_flag(attr->map_flags); - if (f_flags < 0) - return f_flags; - if (numa_node != NUMA_NO_NODE && ((unsigned int)numa_node >= nr_node_ids || !node_online(numa_node))) { @@ -1598,6 +1594,49 @@ static int __map_create(union bpf_attr *attr, bpfptr_t uattr, struct bpf_verifie goto free_map; } + *mapp = map; + *tokenp = token; + return 0; + +free_map: + bpf_map_free(map); +put_token: + bpf_token_put(token); + return err; +} + +static int map_create(union bpf_attr *attr, bpfptr_t uattr, struct bpf_common_attr *attr_common, + bpfptr_t uattr_common, u32 size_common) +{ + struct bpf_token *token = NULL; + struct bpf_verifier_log *log; + struct bpf_log_attr attr_log; + struct bpf_map *map = NULL; + int err, ret; + int f_flags; + + log = bpf_log_attr_create_vlog(&attr_log, attr_common, uattr_common, size_common); + if (IS_ERR(log)) + return PTR_ERR(log); + + err = map_create_alloc(attr, uattr, log, &map, &token); + + /* preserve original error even if log finalization is successful */ + ret = bpf_log_attr_finalize(&attr_log, log); + if (ret) + err = ret; + + kfree(log); + + if (err) + goto free_map; + + f_flags = bpf_get_file_flag(attr->map_flags); + if (f_flags < 0) { + err = f_flags; + goto free_map; + } + err = security_bpf_map_create(map, attr, token, uattr.is_kernel); if (err) goto free_map_sec; @@ -1626,37 +1665,12 @@ static int __map_create(union bpf_attr *attr, bpfptr_t uattr, struct bpf_verifie free_map_sec: security_bpf_map_free(map); free_map: - bpf_map_free(map); -put_token: + if (map) + bpf_map_free(map); bpf_token_put(token); return err; } -static int map_create(union bpf_attr *attr, bpfptr_t uattr, struct bpf_common_attr *attr_common, - bpfptr_t uattr_common, u32 size_common) -{ - struct bpf_verifier_log *log; - struct bpf_log_attr attr_log; - int err, ret; - - log = bpf_log_attr_create_vlog(&attr_log, attr_common, uattr_common, size_common); - if (IS_ERR(log)) - return PTR_ERR(log); - - err = __map_create(attr, uattr, log); - - /* preserve original error even if log finalization is successful */ - ret = bpf_log_attr_finalize(&attr_log, log); - if (ret) { - if (err >= 0) - close_fd(err); - err = ret; - } - - kfree(log); - return err; -} - void bpf_map_inc(struct bpf_map *map) { atomic64_inc(&map->refcnt); -- cgit v1.2.3 From 21c4b99b27f3f85b89256e81b3e997dec0a460d0 Mon Sep 17 00:00:00 2001 From: Yuyang Huang Date: Sun, 31 May 2026 15:55:59 +0800 Subject: bpf: fix BPF_PROG_QUERY OOB write and cgroup backward compat MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BPF_PROG_QUERY writes back the 'query.revision' field unconditionally to userspace. If userspace passes a smaller 'bpf_attr' structure (e.g. 40 bytes, which was the layout before the addition of 'query.revision'), the kernel performs an out-of-bounds write. Fix this by propagating the user-provided attribute size 'uattr_size' down to the cgroup query handlers, and conditionally skipping writing the revision field to userspace when the provided buffer size is insufficient. query.revision in bpf_mprog_query is structurally identical to the cgroup case: a late tail field, written unconditionally. But the backward-compat hazard is not the same. The min-historical-size test is per command, and bpf_mprog_query only serves attach types that were born with revision in the struct: - tcx_prog_query -> BPF_TCX_INGRESS/EGRESS - netkit_prog_query -> BPF_NETKIT_PRIMARY/PEER tcx, netkit, the revision field, and bpf_mprog_query itself all landed in the same v6.6 merge window (053c8e1f235d added the mprog query API + revision; tcx in e420bed02507, netkit in 35dfaad7188c). There has never been a tcx/netkit BPF_PROG_QUERY userspace that doesn't know about revision. So for these commands the minimum legitimate struct already covers offset 56-64 — no old binary can be broken here. Contrast with cgroup: BPF_PROG_QUERY on cgroup attach types shipped in 2017; revision write-back was bolted on years later (120933984460). That path has a real population of pre-revision callers. Fixes: 120933984460 ("bpf: Implement mprog API on top of existing cgroup progs") Cc: Maciej Żenczykowski Cc: Lorenzo Colitti Signed-off-by: Yuyang Huang Link: https://lore.kernel.org/r/20260531075600.4058207-2-yuyanghuang@google.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf-cgroup.h | 5 +++-- kernel/bpf/cgroup.c | 13 +++++++------ kernel/bpf/syscall.c | 6 +++--- 3 files changed, 13 insertions(+), 11 deletions(-) (limited to 'kernel/bpf') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index b2e79c2b41d5..4d0cc65976a1 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -421,7 +421,7 @@ int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype); int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); int cgroup_bpf_prog_query(const union bpf_attr *attr, - union bpf_attr __user *uattr); + union bpf_attr __user *uattr, u32 uattr_size); const struct bpf_func_proto * cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog); @@ -452,7 +452,8 @@ static inline int cgroup_bpf_link_attach(const union bpf_attr *attr, } static inline int cgroup_bpf_prog_query(const union bpf_attr *attr, - union bpf_attr __user *uattr) + union bpf_attr __user *uattr, + u32 uattr_size) { return -EINVAL; } diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 876f6a81a9b6..2c2bdaa86aa7 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1208,7 +1208,7 @@ static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, /* Must be called with cgroup_mutex held to avoid races. */ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, - union bpf_attr __user *uattr) + union bpf_attr __user *uattr, u32 uattr_size) { __u32 __user *prog_attach_flags = u64_to_user_ptr(attr->query.prog_attach_flags); bool effective_query = attr->query.query_flags & BPF_F_QUERY_EFFECTIVE; @@ -1259,7 +1259,8 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, return -EFAULT; if (!effective_query && from_atype == to_atype) revision = cgrp->bpf.revisions[from_atype]; - if (copy_to_user(&uattr->query.revision, &revision, sizeof(revision))) + if (uattr_size >= offsetofend(union bpf_attr, query.revision) && + copy_to_user(&uattr->query.revision, &revision, sizeof(revision))) return -EFAULT; if (attr->query.prog_cnt == 0 || !prog_ids || !total_cnt) /* return early if user requested only program count + flags */ @@ -1312,12 +1313,12 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, } static int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, - union bpf_attr __user *uattr) + union bpf_attr __user *uattr, u32 uattr_size) { int ret; cgroup_lock(); - ret = __cgroup_bpf_query(cgrp, attr, uattr); + ret = __cgroup_bpf_query(cgrp, attr, uattr, uattr_size); cgroup_unlock(); return ret; } @@ -1520,7 +1521,7 @@ out_put_cgroup: } int cgroup_bpf_prog_query(const union bpf_attr *attr, - union bpf_attr __user *uattr) + union bpf_attr __user *uattr, u32 uattr_size) { struct cgroup *cgrp; int ret; @@ -1529,7 +1530,7 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr, if (IS_ERR(cgrp)) return PTR_ERR(cgrp); - ret = cgroup_bpf_query(cgrp, attr, uattr); + ret = cgroup_bpf_query(cgrp, attr, uattr, uattr_size); cgroup_put(cgrp); return ret; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 9e91fb2fb492..93bbbe610a7a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4719,7 +4719,7 @@ static int bpf_prog_detach(const union bpf_attr *attr) #define BPF_PROG_QUERY_LAST_FIELD query.revision static int bpf_prog_query(const union bpf_attr *attr, - union bpf_attr __user *uattr) + union bpf_attr __user *uattr, u32 uattr_size) { if (!bpf_net_capable()) return -EPERM; @@ -4758,7 +4758,7 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_GETSOCKOPT: case BPF_CGROUP_SETSOCKOPT: case BPF_LSM_CGROUP: - return cgroup_bpf_prog_query(attr, uattr); + return cgroup_bpf_prog_query(attr, uattr, uattr_size); case BPF_LIRC_MODE2: return lirc_prog_query(attr, uattr); case BPF_FLOW_DISSECTOR: @@ -6376,7 +6376,7 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size, err = bpf_prog_detach(&attr); break; case BPF_PROG_QUERY: - err = bpf_prog_query(&attr, uattr.user); + err = bpf_prog_query(&attr, uattr.user, size); break; case BPF_PROG_TEST_RUN: err = bpf_prog_test_run(&attr, uattr.user); -- cgit v1.2.3 From de36adca634634c205a9eb8b56a28175ab7abf5f Mon Sep 17 00:00:00 2001 From: Taegu Ha Date: Thu, 28 May 2026 15:21:55 +0900 Subject: bpf: reject overlarge global subprog argument sizes Global subprogram argument checking derives generic pointer sizes from BTF and passes the resolved size to check_mem_reg() as a u32. The access-size validation path then uses a signed int, and stack pointers negate the value before calling check_helper_mem_access(). This creates a wrap when BTF describes a pointee size larger than S32_MAX. For example, a global subprogram argument of type: int (*p)[0x3fffffff] has a BTF-resolved pointee size of 0xfffffffc bytes. At a call site the caller can pass a pointer to a 4-byte stack slot at fp-4. The current PTR_TO_STACK path computes: size = -(int)mem_size so 0xfffffffc becomes -4 as a signed int and the negation validates only a 4-byte stack range. That range is covered by the caller's stack slot, so the call is accepted. The callee is then verified independently with R1 as PTR_TO_MEM and mem_size 0xfffffffc. A small instruction such as: r0 = *(u32 *)(r1 + 4) is accepted as being inside that BTF-described memory region. At run time, however, the actual argument value is still fp-4, so r1 + 4 addresses fp+0, outside the 4-byte object that the caller provided. Reject sizes that cannot be represented by the verifier's signed access-size API before the stack-specific negation. Add a verifier regression test for the oversized BTF argument. Fixes: 2cb27158adb3 ("bpf: poison dead stack slots") Signed-off-by: Taegu Ha Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20260528062155.3988156-1-hataegu0826@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 6 ++++++ .../selftests/bpf/progs/verifier_global_subprogs.c | 17 +++++++++++++++++ 2 files changed, 23 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c8d980fdd709..3a270bc485c2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6927,6 +6927,12 @@ static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg if (bpf_register_is_null(reg)) return 0; + if (mem_size > S32_MAX) { + verbose(env, "%s memory size %u is too large\n", + reg_arg_name(env, argno), mem_size); + return -EACCES; + } + /* Assuming that the register contains a value check if the memory * access is safe. Temporarily save and restore the register's state as * the conversion shouldn't be visible to a caller. diff --git a/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c b/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c index dc09d0e2d8ad..75a2e3f48d0f 100644 --- a/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c +++ b/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c @@ -152,6 +152,23 @@ int anon_user_mem_valid(void *ctx) return subprog_user_anon_mem(&t); } +__noinline __weak int subprog_user_anon_mem_huge(int (*p)[0x3fffffff]) +{ + return p ? (*p)[1] : 0; +} + +SEC("?tracepoint") +__failure __log_level(2) +__msg("R1 memory size 4294967292 is too large") +int anon_user_mem_huge_size_invalid(void *ctx) +{ + int (*p)[0x3fffffff]; + int tiny = 42; + + p = (void *)&tiny; + return subprog_user_anon_mem_huge(p) + tiny; +} + __noinline __weak int subprog_nonnull_ptr_good(int *p1 __arg_nonnull, int *p2 __arg_nonnull) { return (*p1) * (*p2); /* good, no need for NULL checks */ -- cgit v1.2.3 From 868d43cf8f970b456fd93334bee40f792cf27e4d Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Sat, 23 May 2026 12:00:26 -0400 Subject: bpf: Fix security_bpf_prog_load() error handling If security_bpf_prog_load() fails there is no need to call into security_bpf_prog_free() as the LSM will handle the cleanup of any partial LSM state before returning to the caller with an error. Thankfully this isn't an issue with any of the existing code as the LSMs which currently provide BPF hook callback implementations don't allocate any internal state, but this is something we want to fix for potential future users. Signed-off-by: Paul Moore Link: https://lore.kernel.org/r/20260523160025.16363-2-paul@paul-moore.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 93bbbe610a7a..2aafd2131983 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3136,7 +3136,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, struct bpf_log_at err = security_bpf_prog_load(prog, attr, token, uattr.is_kernel); if (err) - goto free_prog_sec; + goto free_prog; /* run eBPF verifier */ err = bpf_check(&prog, attr, uattr, attr_log); @@ -3182,8 +3182,6 @@ free_used_maps: __bpf_prog_put_noref(prog, prog->aux->real_func_cnt); return err; -free_prog_sec: - security_bpf_prog_free(prog); free_prog: free_uid(prog->aux->user); if (prog->aux->attach_btf) -- cgit v1.2.3 From bab08d1f70438cf06de9ab438313b7ef3099514c Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 28 May 2026 18:49:24 -0700 Subject: bpf: Simplify mark_stack_slot_obj_read() and callers Rename mark_stack_slot_obj_read() as mark_stack_slots_scratched() and directly call it from functions processing iter, dynptr and irq_flag. Commit 6762e3a0bce5 ("bpf: simplify liveness to use (callsite, depth) keyed func_instances") has removed the dynamic liveness component in mark_stack_slot_obj_read(). The function effectively only marks stack slots as scratched and always succeed. Therefore, return void, drop the unused bpf_reg_state argument and rename it to mark_stack_slots_scratched() to reflect what it does now. In addition, to prepare for unifying dynptr handling, dynptr_get_spi() will be moved out of mark_dynptr_read(). As mark_dynptr_read() would join mark_iter_read() as a thin wrapper of mark_stack_slots_scratched(), just open code these helpers. Acked-by: Eduard Zingerman Signed-off-by: Amery Hung Link: https://lore.kernel.org/r/20260529014936.2811085-2-ameryhung@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 69 ++++++++++++++++----------------------------------- 1 file changed, 21 insertions(+), 48 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3a270bc485c2..d048f3033220 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3006,50 +3006,13 @@ out: return ret; } -static int mark_stack_slot_obj_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg, - int spi, int nr_slots) +static void mark_stack_slots_scratched(struct bpf_verifier_env *env, + int spi, int nr_slots) { int i; for (i = 0; i < nr_slots; i++) mark_stack_slot_scratched(env, spi - i); - return 0; -} - -static int mark_dynptr_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg) -{ - int spi; - - /* For CONST_PTR_TO_DYNPTR, it must have already been done by - * check_reg_arg in check_helper_call and mark_btf_func_reg_size in - * check_kfunc_call. - */ - if (reg->type == CONST_PTR_TO_DYNPTR) - return 0; - spi = dynptr_get_spi(env, reg); - if (spi < 0) - return spi; - /* Caller ensures dynptr is valid and initialized, which means spi is in - * bounds and spi is the first dynptr slot. Simply mark stack slot as - * read. - */ - return mark_stack_slot_obj_read(env, reg, spi, BPF_DYNPTR_NR_SLOTS); -} - -static int mark_iter_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg, - int spi, int nr_slots) -{ - return mark_stack_slot_obj_read(env, reg, spi, nr_slots); -} - -static int mark_irq_flag_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg) -{ - int spi; - - spi = irq_flag_get_spi(env, reg); - if (spi < 0) - return spi; - return mark_stack_slot_obj_read(env, reg, spi, 1); } /* This function is supposed to be used by the following 32-bit optimization @@ -7261,7 +7224,7 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, static int process_dynptr_func(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, int insn_idx, enum bpf_arg_type arg_type, int clone_ref_obj_id) { - int err; + int spi, err = 0; if (reg->type != PTR_TO_STACK && reg->type != CONST_PTR_TO_DYNPTR) { verbose(env, @@ -7323,7 +7286,17 @@ static int process_dynptr_func(struct bpf_verifier_env *env, struct bpf_reg_stat return -EINVAL; } - err = mark_dynptr_read(env, reg); + if (reg->type != CONST_PTR_TO_DYNPTR) { + spi = dynptr_get_spi(env, reg); + if (spi < 0) + return spi; + + /* + * For CONST_PTR_TO_DYNPTR, reg is already scratched by check_reg_arg + * in check_helper_call and mark_btf_func_reg_size in check_kfunc_call. + */ + mark_stack_slots_scratched(env, spi, BPF_DYNPTR_NR_SLOTS); + } } return err; } @@ -7433,9 +7406,7 @@ static int process_iter_arg(struct bpf_verifier_env *env, struct bpf_reg_state * if (spi < 0) return spi; - err = mark_iter_read(env, reg, spi, nr_slots); - if (err) - return err; + mark_stack_slots_scratched(env, spi, nr_slots); /* remember meta->iter info for process_iter_next_call() */ meta->iter.spi = spi; @@ -11399,7 +11370,7 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env, static int process_irq_flag(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, struct bpf_kfunc_call_arg_meta *meta) { - int err, kfunc_class = IRQ_NATIVE_KFUNC; + int err, spi, kfunc_class = IRQ_NATIVE_KFUNC; bool irq_save; if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_save] || @@ -11440,9 +11411,11 @@ static int process_irq_flag(struct bpf_verifier_env *env, struct bpf_reg_state * return err; } - err = mark_irq_flag_read(env, reg); - if (err) - return err; + spi = irq_flag_get_spi(env, reg); + if (spi < 0) + return spi; + + mark_stack_slots_scratched(env, spi, 1); err = unmark_stack_slot_irq_flag(env, reg, kfunc_class); if (err) -- cgit v1.2.3 From b5c0a07eb2c23bfd0c42ad6b461e6881b4b0995b Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 28 May 2026 18:49:25 -0700 Subject: bpf: Unify dynptr handling in the verifier Simplify dynptr checking for helper and kfunc by unifying it. Remember the initialized dynptr (i.e.,g !(arg_type |= MEM_UNINIT)) pass to a dynptr kfunc during process_dynptr_func() so that we can easily retrieve the information for verification later. By saving it in meta->dynptr, there is no need to call dynptr helpers such as dynptr_id(), dynptr_ref_obj_id() and dynptr_type() in check_func_arg(). Remove and open code the helpers in process_dynptr_func() when saving id, ref_obj_id, and type. Besides, since dynptr ref_obj_id information is now pass around in meta->bpf_dynptr_desc, drop the check in helper_multiple_ref_obj_use. Acked-by: Eduard Zingerman Acked-by: Mykyta Yatsenko Signed-off-by: Amery Hung Link: https://lore.kernel.org/r/20260529014936.2811085-3-ameryhung@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 13 ++-- kernel/bpf/verifier.c | 178 ++++++++----------------------------------- 2 files changed, 40 insertions(+), 151 deletions(-) (limited to 'kernel/bpf') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 5cbad3b64130..3a5c226bf1c3 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -1438,6 +1438,13 @@ struct bpf_map_desc { int uid; }; +/* The last initialized dynptr; Populated by process_dynptr_func() */ +struct bpf_dynptr_desc { + enum bpf_dynptr_type type; + u32 id; + u32 ref_obj_id; +}; + struct bpf_kfunc_call_arg_meta { /* In parameters */ struct btf *btf; @@ -1478,16 +1485,12 @@ struct bpf_kfunc_call_arg_meta { struct { struct btf_field *field; } arg_rbtree_root; - struct { - enum bpf_dynptr_type type; - u32 id; - u32 ref_obj_id; - } initialized_dynptr; struct { u8 spi; u8 frameno; } iter; struct bpf_map_desc map; + struct bpf_dynptr_desc dynptr; u64 mem_size; }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d048f3033220..0c9792d42668 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -233,6 +233,7 @@ static void bpf_map_key_store(struct bpf_insn_aux_data *aux, u64 state) struct bpf_call_arg_meta { struct bpf_map_desc map; + struct bpf_dynptr_desc dynptr; bool raw_mode; bool pkt_access; u8 release_regno; @@ -241,7 +242,6 @@ struct bpf_call_arg_meta { int mem_size; u64 msize_max_value; int ref_obj_id; - int dynptr_id; int func_id; struct btf *btf; u32 btf_id; @@ -470,11 +470,6 @@ static bool is_ptr_cast_function(enum bpf_func_id func_id) func_id == BPF_FUNC_skc_to_tcp_request_sock; } -static bool is_dynptr_ref_function(enum bpf_func_id func_id) -{ - return func_id == BPF_FUNC_dynptr_data; -} - static bool is_sync_callback_calling_kfunc(u32 btf_id); static bool is_async_callback_calling_kfunc(u32 btf_id); static bool is_callback_calling_kfunc(u32 btf_id); @@ -542,8 +537,6 @@ static bool helper_multiple_ref_obj_use(enum bpf_func_id func_id, ref_obj_uses++; if (is_acquire_function(func_id, map)) ref_obj_uses++; - if (is_dynptr_ref_function(func_id)) - ref_obj_uses++; return ref_obj_uses > 1; } @@ -7221,8 +7214,9 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, * use case. The second level is tracked using the upper bit of bpf_dynptr->size * and checked dynamically during runtime. */ -static int process_dynptr_func(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, int insn_idx, - enum bpf_arg_type arg_type, int clone_ref_obj_id) +static int process_dynptr_func(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + argno_t argno, int insn_idx, enum bpf_arg_type arg_type, + int clone_ref_obj_id, struct bpf_dynptr_desc *dynptr) { int spi, err = 0; @@ -7287,6 +7281,8 @@ static int process_dynptr_func(struct bpf_verifier_env *env, struct bpf_reg_stat } if (reg->type != CONST_PTR_TO_DYNPTR) { + struct bpf_func_state *state = bpf_func(env, reg); + spi = dynptr_get_spi(env, reg); if (spi < 0) return spi; @@ -7296,6 +7292,14 @@ static int process_dynptr_func(struct bpf_verifier_env *env, struct bpf_reg_stat * in check_helper_call and mark_btf_func_reg_size in check_kfunc_call. */ mark_stack_slots_scratched(env, spi, BPF_DYNPTR_NR_SLOTS); + + reg = &state->stack[spi].spilled_ptr; + } + + if (dynptr) { + dynptr->type = reg->dynptr.type; + dynptr->id = reg->id; + dynptr->ref_obj_id = reg->ref_obj_id; } } return err; @@ -8065,72 +8069,6 @@ static int check_func_arg_reg_off(struct bpf_verifier_env *env, } } -static struct bpf_reg_state *get_dynptr_arg_reg(struct bpf_verifier_env *env, - const struct bpf_func_proto *fn, - struct bpf_reg_state *regs) -{ - struct bpf_reg_state *state = NULL; - int i; - - for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) - if (arg_type_is_dynptr(fn->arg_type[i])) { - if (state) { - verbose(env, "verifier internal error: multiple dynptr args\n"); - return NULL; - } - state = ®s[BPF_REG_1 + i]; - } - - if (!state) - verbose(env, "verifier internal error: no dynptr arg found\n"); - - return state; -} - -static int dynptr_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg) -{ - struct bpf_func_state *state = bpf_func(env, reg); - int spi; - - if (reg->type == CONST_PTR_TO_DYNPTR) - return reg->id; - spi = dynptr_get_spi(env, reg); - if (spi < 0) - return spi; - return state->stack[spi].spilled_ptr.id; -} - -static int dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg) -{ - struct bpf_func_state *state = bpf_func(env, reg); - int spi; - - if (reg->type == CONST_PTR_TO_DYNPTR) - return reg->ref_obj_id; - spi = dynptr_get_spi(env, reg); - if (spi < 0) - return spi; - return state->stack[spi].spilled_ptr.ref_obj_id; -} - -static enum bpf_dynptr_type dynptr_get_type(struct bpf_verifier_env *env, - struct bpf_reg_state *reg) -{ - struct bpf_func_state *state = bpf_func(env, reg); - int spi; - - if (reg->type == CONST_PTR_TO_DYNPTR) - return reg->dynptr.type; - - spi = bpf_get_spi(reg->var_off.value); - if (spi < 0) { - verbose(env, "verifier internal error: invalid spi when querying dynptr type\n"); - return BPF_DYNPTR_TYPE_INVALID; - } - - return state->stack[spi].spilled_ptr.dynptr.type; -} - static int check_arg_const_str(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno) { @@ -8488,7 +8426,8 @@ skip_type_check: true, meta); break; case ARG_PTR_TO_DYNPTR: - err = process_dynptr_func(env, reg, argno_from_reg(regno), insn_idx, arg_type, 0); + err = process_dynptr_func(env, reg, argno_from_reg(regno), insn_idx, arg_type, 0, + &meta->dynptr); if (err) return err; break; @@ -9170,7 +9109,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, if (ret) return ret; - ret = process_dynptr_func(env, reg, argno, -1, arg->arg_type, 0); + ret = process_dynptr_func(env, reg, argno, -1, arg->arg_type, 0, NULL); if (ret) return ret; } else if (base_type(arg->arg_type) == ARG_PTR_TO_BTF_ID) { @@ -10278,52 +10217,10 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn } } break; - case BPF_FUNC_dynptr_data: - { - struct bpf_reg_state *reg; - int id, ref_obj_id; - - reg = get_dynptr_arg_reg(env, fn, regs); - if (!reg) - return -EFAULT; - - - if (meta.dynptr_id) { - verifier_bug(env, "meta.dynptr_id already set"); - return -EFAULT; - } - if (meta.ref_obj_id) { - verifier_bug(env, "meta.ref_obj_id already set"); - return -EFAULT; - } - - id = dynptr_id(env, reg); - if (id < 0) { - verifier_bug(env, "failed to obtain dynptr id"); - return id; - } - - ref_obj_id = dynptr_ref_obj_id(env, reg); - if (ref_obj_id < 0) { - verifier_bug(env, "failed to obtain dynptr ref_obj_id"); - return ref_obj_id; - } - - meta.dynptr_id = id; - meta.ref_obj_id = ref_obj_id; - - break; - } case BPF_FUNC_dynptr_write: { - enum bpf_dynptr_type dynptr_type; - struct bpf_reg_state *reg; + enum bpf_dynptr_type dynptr_type = meta.dynptr.type; - reg = get_dynptr_arg_reg(env, fn, regs); - if (!reg) - return -EFAULT; - - dynptr_type = dynptr_get_type(env, reg); if (dynptr_type == BPF_DYNPTR_TYPE_INVALID) return -EFAULT; @@ -10515,10 +10412,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return -EFAULT; } - if (is_dynptr_ref_function(func_id)) - regs[BPF_REG_0].dynptr_id = meta.dynptr_id; - - if (is_ptr_cast_function(func_id) || is_dynptr_ref_function(func_id)) { + if (is_ptr_cast_function(func_id)) { /* For release_reference() */ regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; } else if (is_acquire_function(func_id, meta.map.ptr)) { @@ -10532,6 +10426,11 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn regs[BPF_REG_0].ref_obj_id = id; } + if (func_id == BPF_FUNC_dynptr_data) { + regs[BPF_REG_0].dynptr_id = meta.dynptr.id; + regs[BPF_REG_0].ref_obj_id = meta.dynptr.ref_obj_id; + } + err = do_refine_retval_range(env, regs, fn->ret_type, func_id, &meta); if (err) return err; @@ -12187,7 +12086,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ meta->release_regno = regno; } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_clone] && (dynptr_arg_type & MEM_UNINIT)) { - enum bpf_dynptr_type parent_type = meta->initialized_dynptr.type; + enum bpf_dynptr_type parent_type = meta->dynptr.type; if (parent_type == BPF_DYNPTR_TYPE_INVALID) { verifier_bug(env, "no dynptr type for parent of clone"); @@ -12195,30 +12094,17 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } dynptr_arg_type |= (unsigned int)get_dynptr_type_flag(parent_type); - clone_ref_obj_id = meta->initialized_dynptr.ref_obj_id; + clone_ref_obj_id = meta->dynptr.ref_obj_id; if (dynptr_type_refcounted(parent_type) && !clone_ref_obj_id) { verifier_bug(env, "missing ref obj id for parent of clone"); return -EFAULT; } } - ret = process_dynptr_func(env, reg, argno, insn_idx, - dynptr_arg_type, clone_ref_obj_id); + ret = process_dynptr_func(env, reg, argno, insn_idx, dynptr_arg_type, + clone_ref_obj_id, &meta->dynptr); if (ret < 0) return ret; - - if (!(dynptr_arg_type & MEM_UNINIT)) { - int id = dynptr_id(env, reg); - - if (id < 0) { - verifier_bug(env, "failed to obtain dynptr id"); - return id; - } - meta->initialized_dynptr.id = id; - meta->initialized_dynptr.type = dynptr_get_type(env, reg); - meta->initialized_dynptr.ref_obj_id = dynptr_ref_obj_id(env, reg); - } - break; } case KF_ARG_PTR_TO_ITER: @@ -12849,7 +12735,7 @@ static int check_special_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_ca } } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_slice] || meta->func_id == special_kfunc_list[KF_bpf_dynptr_slice_rdwr]) { - enum bpf_type_flag type_flag = get_dynptr_type_flag(meta->initialized_dynptr.type); + enum bpf_type_flag type_flag = get_dynptr_type_flag(meta->dynptr.type); mark_reg_known_zero(env, regs, BPF_REG_0); @@ -12873,11 +12759,11 @@ static int check_special_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_ca } } - if (!meta->initialized_dynptr.id) { + if (!meta->dynptr.id) { verifier_bug(env, "no dynptr id"); return -EFAULT; } - regs[BPF_REG_0].dynptr_id = meta->initialized_dynptr.id; + regs[BPF_REG_0].dynptr_id = meta->dynptr.id; /* we don't need to set BPF_REG_0's ref obj id * because packet slices are not refcounted (see @@ -13063,7 +12949,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (meta.release_regno) { struct bpf_reg_state *reg = ®s[meta.release_regno]; - if (meta.initialized_dynptr.ref_obj_id) { + if (meta.dynptr.ref_obj_id) { err = unmark_stack_slots_dynptr(env, reg); } else { err = release_reference(env, reg->ref_obj_id); -- cgit v1.2.3 From 94ac7553361f3f20b0a60c13f9e7d0a859c73c12 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 28 May 2026 18:49:26 -0700 Subject: bpf: Assign reg->id when getting referenced kptr from ctx Assign reg->id when getting referenced kptr from read program context to be consistent with R0 of KF_ACQUIRE kfunc. skb dynptr will track the referenced skb in qdisc programs using a new field reg->parent_id in a later patch. Acked-by: Andrii Nakryiko Acked-by: Eduard Zingerman Signed-off-by: Amery Hung Link: https://lore.kernel.org/r/20260529014936.2811085-4-ameryhung@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0c9792d42668..a6974ff50514 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6208,8 +6208,6 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, struct b } else { mark_reg_known_zero(env, regs, value_regno); - if (type_may_be_null(info.reg_type)) - regs[value_regno].id = ++env->id_gen; /* A load of ctx field could have different * actual load size with the one encoded in the * insn. When the dst is PTR, it is for sure not @@ -6219,8 +6217,11 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, struct b if (base_type(info.reg_type) == PTR_TO_BTF_ID) { regs[value_regno].btf = info.btf; regs[value_regno].btf_id = info.btf_id; + regs[value_regno].id = info.ref_obj_id; regs[value_regno].ref_obj_id = info.ref_obj_id; } + if (type_may_be_null(info.reg_type) && !regs[value_regno].id) + regs[value_regno].id = ++env->id_gen; } regs[value_regno].type = info.reg_type; } -- cgit v1.2.3 From 06d518a5583fa681dc5c204d578a894f5b11ffa5 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 28 May 2026 18:49:27 -0700 Subject: bpf: Preserve reg->id of pointer objects after null-check Preserve reg->id of pointer objects after null-checking the register so that children objects derived from it can still refer to it in the new object relationship tracking mechanism introduced in a later patch. This change incurs a slight increase in the number of states in one selftest bpf object, rbtree_search.bpf.o. For Meta bpf objects, the increase of states is also negligible. Selftest BPF objects with insns_diff > 0 Program Insns (A) Insns (B) Insns (DIFF) States (A) States (B) States (DIFF) ------------------------ --------- --------- -------------- ---------- ---------- ------------- rbtree_search 6820 7326 +506 (+7.42%) 379 398 +19 (+5.01%) Meta BPF objects with insns_diff > 0 Program Insns (A) Insns (B) Insns (DIFF) States (A) States (B) States (DIFF) ------------------------ --------- --------- -------------- ---------- ---------- ------------- ned_imex_be_tclass 52 57 +5 (+9.62%) 5 6 +1 (+20.00%) ned_imex_be_tclass 52 57 +5 (+9.62%) 5 6 +1 (+20.00%) ned_skop_auto_flowlabel 523 526 +3 (+0.57%) 39 40 +1 (+2.56%) ned_skop_mss 289 292 +3 (+1.04%) 20 20 +0 (+0.00%) ned_skopt_bet_classifier 78 82 +4 (+5.13%) 8 8 +0 (+0.00%) dctcp_update_alpha 252 320 +68 (+26.98%) 21 27 +6 (+28.57%) dctcp_update_alpha 252 320 +68 (+26.98%) 21 27 +6 (+28.57%) ned_ts_func 119 126 +7 (+5.88%) 6 7 +1 (+16.67%) tw_egress 1119 1128 +9 (+0.80%) 95 96 +1 (+1.05%) tw_ingress 1128 1137 +9 (+0.80%) 95 96 +1 (+1.05%) tw_tproxy_router 4380 4465 +85 (+1.94%) 114 118 +4 (+3.51%) tw_tproxy_router4 3093 3170 +77 (+2.49%) 83 88 +5 (+6.02%) ttls_tc_ingress 34656 35717 +1061 (+3.06%) 936 970 +34 (+3.63%) tw_twfw_egress 222327 222338 +11 (+0.00%) 10563 10564 +1 (+0.01%) tw_twfw_ingress 78295 78299 +4 (+0.01%) 3825 3826 +1 (+0.03%) tw_twfw_tc_eg 222839 222859 +20 (+0.01%) 10584 10585 +1 (+0.01%) tw_twfw_tc_in 78295 78299 +4 (+0.01%) 3825 3826 +1 (+0.03%) tw_twfw_egress 8080 8085 +5 (+0.06%) 456 456 +0 (+0.00%) tw_twfw_ingress 8053 8056 +3 (+0.04%) 454 454 +0 (+0.00%) tw_twfw_tc_eg 8154 8174 +20 (+0.25%) 456 457 +1 (+0.22%) tw_twfw_tc_in 8060 8063 +3 (+0.04%) 455 455 +0 (+0.00%) tw_twfw_egress 222327 222338 +11 (+0.00%) 10563 10564 +1 (+0.01%) tw_twfw_ingress 78295 78299 +4 (+0.01%) 3825 3826 +1 (+0.03%) tw_twfw_tc_eg 222839 222859 +20 (+0.01%) 10584 10585 +1 (+0.01%) tw_twfw_tc_in 78295 78299 +4 (+0.01%) 3825 3826 +1 (+0.03%) tw_twfw_egress 8080 8085 +5 (+0.06%) 456 456 +0 (+0.00%) tw_twfw_ingress 8053 8056 +3 (+0.04%) 454 454 +0 (+0.00%) tw_twfw_tc_eg 8154 8174 +20 (+0.25%) 456 457 +1 (+0.22%) tw_twfw_tc_in 8060 8063 +3 (+0.04%) 455 455 +0 (+0.00%) Looking into rbtree_search, the reason for such increase is that the verifier has to explore the main loop shown below for one more iteration until state pruning decides the current state is safe. long rbtree_search(void *ctx) { ... bpf_spin_lock(&glock0); rb_n = bpf_rbtree_root(&groot0); while (can_loop) { if (!rb_n) { bpf_spin_unlock(&glock0); return __LINE__; } n = rb_entry(rb_n, struct node_data, r0); if (lookup_key == n->key0) break; if (nr_gc < NR_NODES) gc_ns[nr_gc++] = rb_n; if (lookup_key < n->key0) rb_n = bpf_rbtree_left(&groot0, rb_n); else rb_n = bpf_rbtree_right(&groot0, rb_n); } ... } Below is what the verifier sees at the start of each iteration (65: may_goto) after preserving id of rb_n. Without id of rb_n, the verifier stops exploring the loop at iter 16. rb_n gc_ns[15] iter 15 257 257 iter 16 290 257 rb_n: idmap add 257->290 gc_ns[15]: check 257 != 290 --> state not equal iter 17 325 257 rb_n: idmap add 290->325 gc_ns[15]: idmap add 257->257 --> state safe Acked-by: Andrii Nakryiko Acked-by: Eduard Zingerman Signed-off-by: Amery Hung Link: https://lore.kernel.org/r/20260529014936.2811085-5-ameryhung@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a6974ff50514..0d8be0b68bd8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -15576,15 +15576,10 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, mark_ptr_not_null_reg(reg); - if (!reg_may_point_to_spin_lock(reg)) { - /* For not-NULL ptr, reg->ref_obj_id will be reset - * in release_reference(). - * - * reg->id is still used by spin_lock ptr. Other - * than spin_lock ptr type, reg->id can be reset. - */ - reg->id = 0; - } + /* + * reg->id is preserved for object relationship tracking + * and spin_lock lock state tracking + */ } } -- cgit v1.2.3 From 308c7a0ae8859b34d9d90a3dff953b2d14242145 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 28 May 2026 18:49:28 -0700 Subject: bpf: Refactor object relationship tracking and fix dynptr UAF bug Refactor object relationship tracking in the verifier and fix a dynptr use-after-free bug where file/skb dynptrs are not invalidated when the parent referenced object is freed. Add parent_id to bpf_reg_state to precisely track child-parent relationships. A child object's parent_id points to the parent object's id. This replaces the PTR_TO_MEM-specific dynptr_id. Remove ref_obj_id from bpf_reg_state by folding its role into the existing id field. Previously, id tracked pointer identity for null checking while ref_obj_id tracked the owning reference for lifetime management. These are now unified: acquire helpers and kfuncs set id to the acquired reference id, and release paths use id directly. Add reg_is_referenced() which checks if a register is referenced by looking up its id in the reference array. This replaces all former ref_obj_id checks. For release_reference(), invalidating an object now also invalidates all descendants by traversing the object tree. This is done using stack-based DFS to avoid recursive call chains of release_reference() -> unmark_stack_slots_dynptr() -> release_reference(). Referenced objects encountered during tree traversal are reported as leaked references. Add parent_id to bpf_reference_state to enable hierarchical reference tracking. When acquiring a reference, a parent_id can be specified to link the new reference to an existing one (e.g., referenced dynptrs acquire a reference with parent_id linking to the parent object's reference). Pointer casting: For pointer casting helpers (bpf_sk_fullsock, bpf_tcp_sock), instead of propagating ref_obj_id, the cast result reuses the same reference id as the source pointer. Since the cast may return NULL for a non-NULL input, the NULL case is explored as a separate verifier branch. This allows releasing any of the original or cast pointers to invalidate all others. Referenced dynptrs: When constructing a referenced dynptr, acquire a intermediate reference with parent_id linking to the parent referenced object. The dynptr and all clones share the same parent_id (pointing to the intermediate ref) but get unique ids for independent slice tracking. Releasing a referenced dynptr releases the parent reference, which in turn invalidates all clones and their derived slices. Owning to non-owning reference conversion: After converting owning to non-owning by clearing id (e.g., object(id=1) -> object(id=0)), the verifier releases the reference state via release_reference_nomark(). Note that the error message "reference has not been acquired before" in the helper and kfunc release paths is removed. This message was already unreachable. The verifier only calls release_reference() after confirming the reference is valid, so the condition could never trigger in practice. Fixes: 870c28588afa ("bpf: net_sched: Add basic bpf qdisc kfuncs") Signed-off-by: Amery Hung Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260529014936.2811085-6-ameryhung@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 4 +- include/linux/bpf_verifier.h | 77 ++- kernel/bpf/btf.c | 2 +- kernel/bpf/fixups.c | 2 +- kernel/bpf/log.c | 18 +- kernel/bpf/states.c | 11 +- kernel/bpf/verifier.c | 560 ++++++++++----------- tools/testing/selftests/bpf/prog_tests/spin_lock.c | 4 +- tools/testing/selftests/bpf/progs/dynptr_fail.c | 4 +- .../selftests/bpf/progs/iters_state_safety.c | 4 +- .../selftests/bpf/progs/iters_testmod_seq.c | 12 +- 11 files changed, 338 insertions(+), 360 deletions(-) (limited to 'kernel/bpf') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 1c6863ce89e0..d1a17c118316 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1062,7 +1062,7 @@ struct bpf_insn_access_aux { struct { struct btf *btf; u32 btf_id; - u32 ref_obj_id; + u32 ref_id; }; }; struct bpf_verifier_log *log; /* for verbose logs */ @@ -1631,7 +1631,7 @@ struct bpf_ctx_arg_aux { enum bpf_reg_type reg_type; struct btf *btf; u32 btf_id; - u32 ref_obj_id; + u32 ref_id; bool refcounted; }; diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 3a5c226bf1c3..75b287d8d92f 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -66,7 +66,6 @@ struct bpf_reg_state { struct { /* for PTR_TO_MEM | PTR_TO_MEM_OR_NULL */ u32 mem_size; - u32 dynptr_id; /* for dynptr slices */ }; /* For dynptr stack slots */ @@ -148,46 +147,14 @@ struct bpf_reg_state { #define BPF_ADD_CONST32 (1U << 30) #define BPF_ADD_CONST (BPF_ADD_CONST64 | BPF_ADD_CONST32) u32 id; - /* PTR_TO_SOCKET and PTR_TO_TCP_SOCK could be a ptr returned - * from a pointer-cast helper, bpf_sk_fullsock() and - * bpf_tcp_sock(). - * - * Consider the following where "sk" is a reference counted - * pointer returned from "sk = bpf_sk_lookup_tcp();": - * - * 1: sk = bpf_sk_lookup_tcp(); - * 2: if (!sk) { return 0; } - * 3: fullsock = bpf_sk_fullsock(sk); - * 4: if (!fullsock) { bpf_sk_release(sk); return 0; } - * 5: tp = bpf_tcp_sock(fullsock); - * 6: if (!tp) { bpf_sk_release(sk); return 0; } - * 7: bpf_sk_release(sk); - * 8: snd_cwnd = tp->snd_cwnd; // verifier will complain - * - * After bpf_sk_release(sk) at line 7, both "fullsock" ptr and - * "tp" ptr should be invalidated also. In order to do that, - * the reg holding "fullsock" and "sk" need to remember - * the original refcounted ptr id (i.e. sk_reg->id) in ref_obj_id - * such that the verifier can reset all regs which have - * ref_obj_id matching the sk_reg->id. - * - * sk_reg->ref_obj_id is set to sk_reg->id at line 1. - * sk_reg->id will stay as NULL-marking purpose only. - * After NULL-marking is done, sk_reg->id can be reset to 0. - * - * After "fullsock = bpf_sk_fullsock(sk);" at line 3, - * fullsock_reg->ref_obj_id is set to sk_reg->ref_obj_id. - * - * After "tp = bpf_tcp_sock(fullsock);" at line 5, - * tp_reg->ref_obj_id is set to fullsock_reg->ref_obj_id - * which is the same as sk_reg->ref_obj_id. - * - * From the verifier perspective, if sk, fullsock and tp - * are not NULL, they are the same ptr with different - * reg->type. In particular, bpf_sk_release(tp) is also - * allowed and has the same effect as bpf_sk_release(sk). + /* + * Tracks the parent object this register was derived from. + * Used for cascading invalidation: when the parent object is + * released or invalidated, all registers with matching parent_id + * are also invalidated. For example, a slice from bpf_dynptr_data() + * gets parent_id set to the dynptr's id. */ - u32 ref_obj_id; + u32 parent_id; /* Inside the callee two registers can be both PTR_TO_STACK like * R1=fp-8 and R2=fp-8, but one of them points to this function stack * while another to the caller's stack. To differentiate them 'frameno' @@ -364,10 +331,14 @@ struct bpf_reference_state { * is used purely to inform the user of a reference leak. */ int insn_idx; - /* Use to keep track of the source object of a lock, to ensure - * it matches on unlock. - */ - void *ptr; + union { + /* For REF_TYPE_PTR */ + int parent_id; + /* Use to keep track of the source object of a lock, to ensure + * it matches on unlock. + */ + void *ptr; + }; }; struct bpf_retval_range { @@ -585,7 +556,7 @@ bpf_get_spilled_stack_arg(int slot, struct bpf_func_state *frame) iter < frame->out_stack_arg_cnt; \ iter++, reg = bpf_get_spilled_stack_arg(iter, frame)) -#define bpf_for_each_reg_in_vstate_mask(__vst, __state, __reg, __mask, __expr) \ +#define bpf_for_each_reg_in_vstate_mask(__vst, __state, __reg, __stack, __mask, __expr) \ ({ \ struct bpf_verifier_state *___vstate = __vst; \ int ___i, ___j; \ @@ -593,6 +564,7 @@ bpf_get_spilled_stack_arg(int slot, struct bpf_func_state *frame) struct bpf_reg_state *___regs; \ __state = ___vstate->frame[___i]; \ ___regs = __state->regs; \ + __stack = NULL; \ for (___j = 0; ___j < MAX_BPF_REG; ___j++) { \ __reg = &___regs[___j]; \ (void)(__expr); \ @@ -600,8 +572,10 @@ bpf_get_spilled_stack_arg(int slot, struct bpf_func_state *frame) bpf_for_each_spilled_reg(___j, __state, __reg, __mask) { \ if (!__reg) \ continue; \ + __stack = &__state->stack[___j]; \ (void)(__expr); \ } \ + __stack = NULL; \ bpf_for_each_spilled_stack_arg(___j, __state, __reg) { \ if (!__reg) \ continue; \ @@ -611,8 +585,13 @@ bpf_get_spilled_stack_arg(int slot, struct bpf_func_state *frame) }) /* Invoke __expr over regsiters in __vst, setting __state and __reg */ -#define bpf_for_each_reg_in_vstate(__vst, __state, __reg, __expr) \ - bpf_for_each_reg_in_vstate_mask(__vst, __state, __reg, 1 << STACK_SPILL, __expr) +#define bpf_for_each_reg_in_vstate(__vst, __state, __reg, __expr) \ + ({ \ + struct bpf_stack_state * ___stack; \ + (void)___stack; \ + bpf_for_each_reg_in_vstate_mask(__vst, __state, __reg, ___stack,\ + 1 << STACK_SPILL, __expr); \ + }) /* linked list of verifier states used to prune search */ struct bpf_verifier_state_list { @@ -1442,7 +1421,7 @@ struct bpf_map_desc { struct bpf_dynptr_desc { enum bpf_dynptr_type type; u32 id; - u32 ref_obj_id; + u32 parent_id; }; struct bpf_kfunc_call_arg_meta { @@ -1453,7 +1432,7 @@ struct bpf_kfunc_call_arg_meta { const struct btf_type *func_proto; const char *func_name; /* Out parameters */ - u32 ref_obj_id; + u32 id; u8 release_regno; bool r0_rdonly; u32 ret_btf_id; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 17d4ab0a8206..f429f6f58cb2 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6957,7 +6957,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, info->reg_type = ctx_arg_info->reg_type; info->btf = ctx_arg_info->btf ? : btf_vmlinux; info->btf_id = ctx_arg_info->btf_id; - info->ref_obj_id = ctx_arg_info->ref_obj_id; + info->ref_id = ctx_arg_info->ref_id; return true; } } diff --git a/kernel/bpf/fixups.c b/kernel/bpf/fixups.c index 12739add2dda..5aa3f7d99ac9 100644 --- a/kernel/bpf/fixups.c +++ b/kernel/bpf/fixups.c @@ -870,7 +870,7 @@ int bpf_convert_ctx_accesses(struct bpf_verifier_env *env) case PTR_TO_BTF_ID: case PTR_TO_BTF_ID | PTR_UNTRUSTED: /* PTR_TO_BTF_ID | MEM_ALLOC always has a valid lifetime, unlike - * PTR_TO_BTF_ID, and an active ref_obj_id, but the same cannot + * PTR_TO_BTF_ID, and an active referenced id, but the same cannot * be said once it is marked PTR_UNTRUSTED, hence we must handle * any faults for loads into such types. BPF_WRITE is disallowed * for this case. diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 62fe6ed18374..b740fa73ee26 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -665,8 +665,8 @@ static void print_reg_state(struct bpf_verifier_env *env, verbose_a("id=%d", reg->id & ~BPF_ADD_CONST); if (reg->id & BPF_ADD_CONST) verbose(env, "%+d", reg->delta); - if (reg->ref_obj_id) - verbose_a("ref_obj_id=%d", reg->ref_obj_id); + if (reg->parent_id) + verbose_a("parent_id=%d", reg->parent_id); if (type_is_non_owning_ref(reg->type)) verbose_a("%s", "non_own_ref"); if (type_is_map_ptr(t)) { @@ -768,21 +768,19 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie verbose(env, "=dynptr_%s(", dynptr_type_str(reg->dynptr.type)); if (reg->id) verbose_a("id=%d", reg->id); - if (reg->ref_obj_id) - verbose_a("ref_id=%d", reg->ref_obj_id); - if (reg->dynptr_id) - verbose_a("dynptr_id=%d", reg->dynptr_id); + if (reg->parent_id) + verbose_a("parent_id=%d", reg->parent_id); verbose(env, ")"); break; case STACK_ITER: - /* only main slot has ref_obj_id set; skip others */ - if (!reg->ref_obj_id) + /* only main slot has id set; skip others */ + if (!reg->id) continue; - verbose(env, " fp%d=iter_%s(ref_id=%d,state=%s,depth=%u)", + verbose(env, " fp%d=iter_%s(id=%d,state=%s,depth=%u)", (-i - 1) * BPF_REG_SIZE, iter_type_str(reg->iter.btf, reg->iter.btf_id), - reg->ref_obj_id, iter_state_str(reg->iter.state), + reg->id, iter_state_str(reg->iter.state), reg->iter.depth); break; case STACK_MISC: diff --git a/kernel/bpf/states.c b/kernel/bpf/states.c index 877338136009..5945956a7573 100644 --- a/kernel/bpf/states.c +++ b/kernel/bpf/states.c @@ -489,7 +489,7 @@ static bool regs_exact(const struct bpf_reg_state *rold, { return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 && check_ids(rold->id, rcur->id, idmap) && - check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap); + check_ids(rold->parent_id, rcur->parent_id, idmap); } enum exact_level { @@ -614,7 +614,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, range_within(rold, rcur) && tnum_in(rold->var_off, rcur->var_off) && check_ids(rold->id, rcur->id, idmap) && - check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap); + check_ids(rold->parent_id, rcur->parent_id, idmap); case PTR_TO_PACKET_META: case PTR_TO_PACKET: /* We must have at least as much range as the old ptr @@ -794,7 +794,8 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, cur_reg = &cur->stack[spi].spilled_ptr; if (old_reg->dynptr.type != cur_reg->dynptr.type || old_reg->dynptr.first_slot != cur_reg->dynptr.first_slot || - !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap)) + !check_ids(old_reg->id, cur_reg->id, idmap) || + !check_ids(old_reg->parent_id, cur_reg->parent_id, idmap)) return false; break; case STACK_ITER: @@ -810,13 +811,13 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, old_reg->iter.btf_id != cur_reg->iter.btf_id || old_reg->iter.state != cur_reg->iter.state || /* ignore {old_reg,cur_reg}->iter.depth, see above */ - !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap)) + !check_ids(old_reg->id, cur_reg->id, idmap)) return false; break; case STACK_IRQ_FLAG: old_reg = &old->stack[spi].spilled_ptr; cur_reg = &cur->stack[spi].spilled_ptr; - if (!check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap) || + if (!check_ids(old_reg->id, cur_reg->id, idmap) || old_reg->irq.kfunc_class != cur_reg->irq.kfunc_class) return false; break; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0d8be0b68bd8..6d82ca5acacb 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -200,14 +200,14 @@ struct bpf_verifier_stack_elem { #define BPF_PRIV_STACK_MIN_SIZE 64 -static int acquire_reference(struct bpf_verifier_env *env, int insn_idx); -static int release_reference_nomark(struct bpf_verifier_state *state, int ref_obj_id); -static int release_reference(struct bpf_verifier_env *env, int ref_obj_id); +static int acquire_reference(struct bpf_verifier_env *env, int insn_idx, int parent_id); +static int release_reference_nomark(struct bpf_verifier_state *state, int id); +static int release_reference(struct bpf_verifier_env *env, int id); static void invalidate_non_owning_refs(struct bpf_verifier_env *env); static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env); static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg); -static bool is_trusted_reg(const struct bpf_reg_state *reg); +static bool is_trusted_reg(struct bpf_verifier_env *env, const struct bpf_reg_state *reg); static inline bool in_sleepable_context(struct bpf_verifier_env *env); static const char *non_sleepable_context_description(struct bpf_verifier_env *env); static void scalar32_min_max_add(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg); @@ -241,7 +241,7 @@ struct bpf_call_arg_meta { int access_size; int mem_size; u64 msize_max_value; - int ref_obj_id; + u32 id; int func_id; struct btf *btf; u32 btf_id; @@ -339,7 +339,7 @@ static void verbose_invalid_scalar(struct bpf_verifier_env *env, verbose(env, " should have been in [%d, %d]\n", range.minval, range.maxval); } -static bool reg_not_null(const struct bpf_reg_state *reg) +static bool reg_not_null(struct bpf_verifier_env *env, const struct bpf_reg_state *reg) { enum bpf_reg_type type; @@ -353,7 +353,7 @@ static bool reg_not_null(const struct bpf_reg_state *reg) type == PTR_TO_MAP_VALUE || type == PTR_TO_MAP_KEY || type == PTR_TO_SOCK_COMMON || - (type == PTR_TO_BTF_ID && is_trusted_reg(reg)) || + (type == PTR_TO_BTF_ID && is_trusted_reg(env, reg)) || (type == PTR_TO_MEM && !(reg->type & PTR_UNTRUSTED)) || type == CONST_PTR_TO_MAP; } @@ -638,43 +638,44 @@ static enum bpf_type_flag get_dynptr_type_flag(enum bpf_dynptr_type type) } } -static bool dynptr_type_refcounted(enum bpf_dynptr_type type) +static bool dynptr_type_referenced(enum bpf_dynptr_type type) { return type == BPF_DYNPTR_TYPE_RINGBUF || type == BPF_DYNPTR_TYPE_FILE; } static void __mark_dynptr_reg(struct bpf_reg_state *reg, enum bpf_dynptr_type type, - bool first_slot, int dynptr_id); + bool first_slot, int id, int parent_id); static void mark_dynptr_stack_regs(struct bpf_verifier_env *env, struct bpf_reg_state *sreg1, struct bpf_reg_state *sreg2, - enum bpf_dynptr_type type) + enum bpf_dynptr_type type, int parent_id) { int id = ++env->id_gen; - __mark_dynptr_reg(sreg1, type, true, id); - __mark_dynptr_reg(sreg2, type, false, id); + __mark_dynptr_reg(sreg1, type, true, id, parent_id); + __mark_dynptr_reg(sreg2, type, false, id, parent_id); } static void mark_dynptr_cb_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, enum bpf_dynptr_type type) { - __mark_dynptr_reg(reg, type, true, ++env->id_gen); + __mark_dynptr_reg(reg, type, true, ++env->id_gen, 0); } static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env, struct bpf_func_state *state, int spi); static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg, - enum bpf_arg_type arg_type, int insn_idx, int clone_ref_obj_id) + enum bpf_arg_type arg_type, int insn_idx, int parent_id, + struct bpf_dynptr_desc *dynptr) { struct bpf_func_state *state = bpf_func(env, reg); - enum bpf_dynptr_type type; int spi, i, err; + enum bpf_dynptr_type type; spi = dynptr_get_spi(env, reg); if (spi < 0) @@ -705,85 +706,62 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_ if (type == BPF_DYNPTR_TYPE_INVALID) return -EINVAL; - mark_dynptr_stack_regs(env, &state->stack[spi].spilled_ptr, - &state->stack[spi - 1].spilled_ptr, type); - - if (dynptr_type_refcounted(type)) { - /* The id is used to track proper releasing */ - int id; + if (dynptr->type == BPF_DYNPTR_TYPE_INVALID) { /* dynptr constructors */ + if (dynptr_type_referenced(type)) { + int id; - if (clone_ref_obj_id) - id = clone_ref_obj_id; - else - id = acquire_reference(env, insn_idx); - - if (id < 0) - return id; + /* + * Create an intermediate reference that tracks the referenced + * object for the referenced dynptr. Freeing a referenced dynptr + * through helpers/kfuncs will invalidate all clones. + */ + id = acquire_reference(env, insn_idx, parent_id); + if (id < 0) + return id; - state->stack[spi].spilled_ptr.ref_obj_id = id; - state->stack[spi - 1].spilled_ptr.ref_obj_id = id; + parent_id = id; + } + } else { /* bpf_dynptr_clone() */ + parent_id = dynptr->parent_id; } + mark_dynptr_stack_regs(env, &state->stack[spi].spilled_ptr, + &state->stack[spi - 1].spilled_ptr, type, parent_id); + return 0; } -static void invalidate_dynptr(struct bpf_verifier_env *env, struct bpf_func_state *state, int spi) +static void invalidate_dynptr(struct bpf_verifier_env *env, struct bpf_stack_state *stack) { int i; for (i = 0; i < BPF_REG_SIZE; i++) { - state->stack[spi].slot_type[i] = STACK_INVALID; - state->stack[spi - 1].slot_type[i] = STACK_INVALID; + stack[0].slot_type[i] = STACK_INVALID; + stack[1].slot_type[i] = STACK_INVALID; } - bpf_mark_reg_not_init(env, &state->stack[spi].spilled_ptr); - bpf_mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr); + bpf_mark_reg_not_init(env, &stack[0].spilled_ptr); + bpf_mark_reg_not_init(env, &stack[1].spilled_ptr); } static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { struct bpf_func_state *state = bpf_func(env, reg); - int spi, ref_obj_id, i; + int spi; spi = dynptr_get_spi(env, reg); if (spi < 0) return spi; - if (!dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) { - invalidate_dynptr(env, state, spi); - return 0; - } - - ref_obj_id = state->stack[spi].spilled_ptr.ref_obj_id; - - /* If the dynptr has a ref_obj_id, then we need to invalidate - * two things: - * - * 1) Any dynptrs with a matching ref_obj_id (clones) - * 2) Any slices derived from this dynptr. + /* + * For referenced dynptr, release the parent ref which cascades to + * all clones and derived slices. For non-referenced dynptr, only + * the dynptr and slices derived from it will be invalidated. */ - - /* Invalidate any slices associated with this dynptr */ - WARN_ON_ONCE(release_reference(env, ref_obj_id)); - - /* Invalidate any dynptr clones */ - for (i = 1; i < state->allocated_stack / BPF_REG_SIZE; i++) { - if (state->stack[i].spilled_ptr.ref_obj_id != ref_obj_id) - continue; - - /* it should always be the case that if the ref obj id - * matches then the stack slot also belongs to a - * dynptr - */ - if (state->stack[i].slot_type[0] != STACK_DYNPTR) { - verifier_bug(env, "misconfigured ref_obj_id"); - return -EFAULT; - } - if (state->stack[i].spilled_ptr.dynptr.first_slot) - invalidate_dynptr(env, state, i); - } - - return 0; + reg = &state->stack[spi].spilled_ptr; + return release_reference(env, dynptr_type_referenced(reg->dynptr.type) + ? reg->parent_id + : reg->id); } static void __mark_reg_unknown(const struct bpf_verifier_env *env, @@ -800,9 +778,7 @@ static void mark_reg_invalid(const struct bpf_verifier_env *env, struct bpf_reg_ static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env, struct bpf_func_state *state, int spi) { - struct bpf_func_state *fstate; - struct bpf_reg_state *dreg; - int i, dynptr_id; + int i, err = 0; /* We always ensure that STACK_DYNPTR is never set partially, * hence just checking for slot_type[0] is enough. This is @@ -816,13 +792,13 @@ static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env, if (!state->stack[spi].spilled_ptr.dynptr.first_slot) spi = spi + 1; - if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) { - int ref_obj_id = state->stack[spi].spilled_ptr.ref_obj_id; + if (dynptr_type_referenced(state->stack[spi].spilled_ptr.dynptr.type)) { + int v_parent_id = state->stack[spi].spilled_ptr.parent_id; int ref_cnt = 0; /* * A referenced dynptr can be overwritten only if there is at - * least one other dynptr sharing the same ref_obj_id, + * least one other dynptr sharing the same virtual ref parent, * ensuring the reference can still be properly released. */ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { @@ -830,7 +806,7 @@ static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env, continue; if (!state->stack[i].spilled_ptr.dynptr.first_slot) continue; - if (state->stack[i].spilled_ptr.ref_obj_id == ref_obj_id) + if (state->stack[i].spilled_ptr.parent_id == v_parent_id) ref_cnt++; } @@ -840,32 +816,14 @@ static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env, } } - mark_stack_slot_scratched(env, spi); - mark_stack_slot_scratched(env, spi - 1); - - /* Writing partially to one dynptr stack slot destroys both. */ - for (i = 0; i < BPF_REG_SIZE; i++) { - state->stack[spi].slot_type[i] = STACK_INVALID; - state->stack[spi - 1].slot_type[i] = STACK_INVALID; + /* Invalidate the dynptr and any derived slices */ + err = release_reference(env, state->stack[spi].spilled_ptr.id); + if (!err) { + mark_stack_slot_scratched(env, spi); + mark_stack_slot_scratched(env, spi - 1); } - dynptr_id = state->stack[spi].spilled_ptr.id; - /* Invalidate any slices associated with this dynptr */ - bpf_for_each_reg_in_vstate(env->cur_state, fstate, dreg, ({ - /* Dynptr slices are only PTR_TO_MEM_OR_NULL and PTR_TO_MEM */ - if (dreg->type != (PTR_TO_MEM | PTR_MAYBE_NULL) && dreg->type != PTR_TO_MEM) - continue; - if (dreg->dynptr_id == dynptr_id) - mark_reg_invalid(env, dreg); - })); - - /* Do not release reference state, we are destroying dynptr on stack, - * not using some helper to release it. Just reset register. - */ - bpf_mark_reg_not_init(env, &state->stack[spi].spilled_ptr); - bpf_mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr); - - return 0; + return err; } static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg) @@ -965,7 +923,7 @@ static int mark_stack_slots_iter(struct bpf_verifier_env *env, if (spi < 0) return spi; - id = acquire_reference(env, insn_idx); + id = acquire_reference(env, insn_idx, 0); if (id < 0) return id; @@ -981,7 +939,7 @@ static int mark_stack_slots_iter(struct bpf_verifier_env *env, else st->type |= PTR_UNTRUSTED; } - st->ref_obj_id = i == 0 ? id : 0; + st->id = i == 0 ? id : 0; st->iter.btf = btf; st->iter.btf_id = btf_id; st->iter.state = BPF_ITER_STATE_ACTIVE; @@ -1011,7 +969,7 @@ static int unmark_stack_slots_iter(struct bpf_verifier_env *env, struct bpf_reg_state *st = &slot->spilled_ptr; if (i == 0) - WARN_ON_ONCE(release_reference(env, st->ref_obj_id)); + WARN_ON_ONCE(release_reference(env, st->id)); bpf_mark_reg_not_init(env, st); @@ -1067,10 +1025,10 @@ static int is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_s if (st->type & PTR_UNTRUSTED) return -EPROTO; - /* only main (first) slot has ref_obj_id set */ - if (i == 0 && !st->ref_obj_id) + /* only main (first) slot has id set */ + if (i == 0 && !st->id) return -EINVAL; - if (i != 0 && st->ref_obj_id) + if (i != 0 && st->id) return -EINVAL; if (st->iter.btf != btf || st->iter.btf_id != btf_id) return -EINVAL; @@ -1109,7 +1067,7 @@ static int mark_stack_slot_irq_flag(struct bpf_verifier_env *env, __mark_reg_known_zero(st); st->type = PTR_TO_STACK; /* we don't have dedicated reg type */ - st->ref_obj_id = id; + st->id = id; st->irq.kfunc_class = kfunc_class; for (i = 0; i < BPF_REG_SIZE; i++) @@ -1143,7 +1101,7 @@ static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_r return -EINVAL; } - err = release_irq_state(env->cur_state, st->ref_obj_id); + err = release_irq_state(env->cur_state, st->id); WARN_ON_ONCE(err && err != -EACCES); if (err) { int insn_idx = 0; @@ -1207,7 +1165,7 @@ static int is_irq_flag_reg_valid_init(struct bpf_verifier_env *env, struct bpf_r slot = &state->stack[spi]; st = &slot->spilled_ptr; - if (!st->ref_obj_id) + if (!st->id) return -EINVAL; for (i = 0; i < BPF_REG_SIZE; i++) @@ -1448,7 +1406,7 @@ static struct bpf_reference_state *acquire_reference_state(struct bpf_verifier_e return &state->refs[new_ofs]; } -static int acquire_reference(struct bpf_verifier_env *env, int insn_idx) +static int acquire_reference(struct bpf_verifier_env *env, int insn_idx, int parent_id) { struct bpf_reference_state *s; @@ -1457,6 +1415,7 @@ static int acquire_reference(struct bpf_verifier_env *env, int insn_idx) return -ENOMEM; s->type = REF_TYPE_PTR; s->id = ++env->id_gen; + s->parent_id = parent_id; return s->id; } @@ -1513,17 +1472,25 @@ static void release_reference_state(struct bpf_verifier_state *state, int idx) return; } -static bool find_reference_state(struct bpf_verifier_state *state, int ptr_id) +static bool find_reference_state(struct bpf_verifier_state *state, int id) { int i; - for (i = 0; i < state->acquired_refs; i++) - if (state->refs[i].id == ptr_id) + for (i = 0; i < state->acquired_refs; i++) { + if (state->refs[i].type != REF_TYPE_PTR) + continue; + if (state->refs[i].id == id) return true; + } return false; } +static bool reg_is_referenced(struct bpf_verifier_env *env, const struct bpf_reg_state *reg) +{ + return find_reference_state(env->cur_state, reg->id); +} + static int release_lock_state(struct bpf_verifier_state *state, int type, int id, void *ptr) { void *prev_ptr = NULL; @@ -1837,7 +1804,7 @@ static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm) memset(((u8 *)reg) + sizeof(reg->type), 0, offsetof(struct bpf_reg_state, var_off) - sizeof(reg->type)); reg->id = 0; - reg->ref_obj_id = 0; + reg->parent_id = 0; ___mark_reg_known(reg, imm); } @@ -1872,7 +1839,7 @@ static void mark_reg_known_zero(struct bpf_verifier_env *env, } static void __mark_dynptr_reg(struct bpf_reg_state *reg, enum bpf_dynptr_type type, - bool first_slot, int dynptr_id) + bool first_slot, int id, int parent_id) { /* reg->type has no meaning for STACK_DYNPTR, but when we set reg for * callback arguments, it does need to be CONST_PTR_TO_DYNPTR, so simply @@ -1881,7 +1848,8 @@ static void __mark_dynptr_reg(struct bpf_reg_state *reg, enum bpf_dynptr_type ty __mark_reg_known_zero(reg); reg->type = CONST_PTR_TO_DYNPTR; /* Give each dynptr a unique id to uniquely associate slices to it. */ - reg->id = dynptr_id; + reg->id = id; + reg->parent_id = parent_id; reg->dynptr.type = type; reg->dynptr.first_slot = first_slot; } @@ -2161,17 +2129,12 @@ out: /* Mark a register as having a completely unknown (scalar) value. */ void bpf_mark_reg_unknown_imprecise(struct bpf_reg_state *reg) { - /* - * Clear type, off, and union(map_ptr, range) and - * padding between 'type' and union - */ - memset(reg, 0, offsetof(struct bpf_reg_state, var_off)); + s32 subreg_def = reg->subreg_def; + + memset(reg, 0, sizeof(*reg)); reg->type = SCALAR_VALUE; - reg->id = 0; - reg->ref_obj_id = 0; reg->var_off = tnum_unknown; - reg->frameno = 0; - reg->precise = false; + reg->subreg_def = subreg_def; __mark_reg_unbounded(reg); } @@ -4330,7 +4293,7 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, * referenced PTR_TO_BTF_ID, and that its fixed offset is 0. For the * normal store of unreferenced kptr, we must ensure var_off is zero. * Since ref_ptr cannot be accessed directly by BPF insns, check for - * reg->ref_obj_id is not needed here. + * reg->id is not needed here. */ if (__check_ptr_off_reg(env, reg, argno_from_reg(regno), true)) return -EACCES; @@ -4703,8 +4666,8 @@ static int __check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int of * type of narrower access. */ if (base_type(info->reg_type) == PTR_TO_BTF_ID) { - if (info->ref_obj_id && - !find_reference_state(env->cur_state, info->ref_obj_id)) { + if (info->ref_id && + !find_reference_state(env->cur_state, info->ref_id)) { verbose(env, "invalid bpf_context access off=%d. Reference may already be released\n", off); return -EACCES; @@ -4873,10 +4836,10 @@ static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = { [CONST_PTR_TO_MAP] = btf_bpf_map_id, }; -static bool is_trusted_reg(const struct bpf_reg_state *reg) +static bool is_trusted_reg(struct bpf_verifier_env *env, const struct bpf_reg_state *reg) { /* A referenced register is always trusted. */ - if (reg->ref_obj_id) + if (reg_is_referenced(env, reg)) return true; /* Types listed in the reg2btf_ids are always trusted */ @@ -5790,7 +5753,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, ret = env->ops->btf_struct_access(&env->log, reg, off, size); } else { /* Writes are permitted with default btf_struct_access for - * program allocated objects (which always have ref_obj_id > 0), + * program allocated objects (which always have id > 0), * but not for untrusted PTR_TO_BTF_ID | MEM_ALLOC. */ if (atype != BPF_READ && !type_is_ptr_alloc_obj(reg->type)) { @@ -5799,8 +5762,8 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, } if (type_is_alloc(reg->type) && !type_is_non_owning_ref(reg->type) && - !(reg->type & MEM_RCU) && !reg->ref_obj_id) { - verifier_bug(env, "ref_obj_id for allocated object must be non-zero"); + !(reg->type & MEM_RCU) && !reg_is_referenced(env, reg)) { + verifier_bug(env, "allocated object must have a referenced id"); return -EFAULT; } @@ -5819,7 +5782,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, */ flag = PTR_UNTRUSTED; - } else if (is_trusted_reg(reg) || is_rcu_reg(reg)) { + } else if (is_trusted_reg(env, reg) || is_rcu_reg(reg)) { /* By default any pointer obtained from walking a trusted pointer is no * longer trusted, unless the field being accessed has explicitly been * marked as inheriting its parent's state of trust (either full or RCU). @@ -6217,8 +6180,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, struct b if (base_type(info.reg_type) == PTR_TO_BTF_ID) { regs[value_regno].btf = info.btf; regs[value_regno].btf_id = info.btf_id; - regs[value_regno].id = info.ref_obj_id; - regs[value_regno].ref_obj_id = info.ref_obj_id; + regs[value_regno].id = info.ref_id; } if (type_may_be_null(info.reg_type) && !regs[value_regno].id) regs[value_regno].id = ++env->id_gen; @@ -7201,7 +7163,16 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, return 0; } -/* There are two register types representing a bpf_dynptr, one is PTR_TO_STACK +/* + * Validate dynptr arguments for helper, kfunc and subprog. + * + * @dynptr is both input and output. It is populated when the argument is + * tagged with MEM_UNINIT (i.e., the dynptr argument that will be constructed) + * and consumed when the argument is expecting to be an initialized dynptr. + * @parent_id is used to track the referenced parent object (e.g., file or skb in + * qdisc program) when constructing a dynptr. + * + * There are two register types representing a bpf_dynptr, one is PTR_TO_STACK * which points to a stack slot, and the other is CONST_PTR_TO_DYNPTR. * * In both cases we deal with the first 8 bytes, but need to mark the next 8 @@ -7217,7 +7188,7 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, */ static int process_dynptr_func(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, int insn_idx, enum bpf_arg_type arg_type, - int clone_ref_obj_id, struct bpf_dynptr_desc *dynptr) + int parent_id, struct bpf_dynptr_desc *dynptr) { int spi, err = 0; @@ -7258,7 +7229,7 @@ static int process_dynptr_func(struct bpf_verifier_env *env, struct bpf_reg_stat return err; } - err = mark_stack_slots_dynptr(env, reg, arg_type, insn_idx, clone_ref_obj_id); + err = mark_stack_slots_dynptr(env, reg, arg_type, insn_idx, parent_id, dynptr); } else /* OBJ_RELEASE and None case from above */ { /* For the reg->type == PTR_TO_STACK case, bpf_dynptr is never const */ if (reg->type == CONST_PTR_TO_DYNPTR && (arg_type & OBJ_RELEASE)) { @@ -7300,17 +7271,17 @@ static int process_dynptr_func(struct bpf_verifier_env *env, struct bpf_reg_stat if (dynptr) { dynptr->type = reg->dynptr.type; dynptr->id = reg->id; - dynptr->ref_obj_id = reg->ref_obj_id; + dynptr->parent_id = reg->parent_id; } } return err; } -static u32 iter_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int spi) +static u32 iter_ref_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int spi) { struct bpf_func_state *state = bpf_func(env, reg); - return state->stack[spi].spilled_ptr.ref_obj_id; + return state->stack[spi].spilled_ptr.id; } static bool is_iter_kfunc(struct bpf_kfunc_call_arg_meta *meta) @@ -7416,7 +7387,7 @@ static int process_iter_arg(struct bpf_verifier_env *env, struct bpf_reg_state * /* remember meta->iter info for process_iter_next_call() */ meta->iter.spi = spi; meta->iter.frameno = reg->frameno; - meta->ref_obj_id = iter_ref_obj_id(env, reg, spi); + meta->id = iter_ref_id(env, reg, spi); if (is_iter_destroy_kfunc(meta)) { err = unmark_stack_slots_iter(env, reg, nr_slots); @@ -7999,7 +7970,7 @@ static int check_func_arg_reg_off(struct bpf_verifier_env *env, /* When referenced register is passed to release function, its fixed * offset must be 0. * - * We will check arg_type_is_release reg has ref_obj_id when storing + * We will check arg_type_is_release reg has id when storing * meta->release_regno. */ if (arg_type_is_release(arg_type)) { @@ -8260,7 +8231,7 @@ skip_type_check: */ if (reg->type == PTR_TO_STACK) { spi = dynptr_get_spi(env, reg); - if (spi < 0 || !state->stack[spi].spilled_ptr.ref_obj_id) { + if (spi < 0 || !state->stack[spi].spilled_ptr.id) { verbose(env, "arg %d is an unacquired reference\n", regno); return -EINVAL; } @@ -8268,7 +8239,7 @@ skip_type_check: verbose(env, "cannot release unowned const bpf_dynptr\n"); return -EINVAL; } - } else if (!reg->ref_obj_id && !bpf_register_is_null(reg)) { + } else if (!reg_is_referenced(env, reg) && !bpf_register_is_null(reg)) { verbose(env, "R%d must be referenced when passed to release function\n", regno); return -EINVAL; @@ -8280,14 +8251,14 @@ skip_type_check: meta->release_regno = regno; } - if (reg->ref_obj_id && base_type(arg_type) != ARG_KPTR_XCHG_DEST) { - if (meta->ref_obj_id) { - verbose(env, "more than one arg with ref_obj_id R%d %u %u", - regno, reg->ref_obj_id, - meta->ref_obj_id); + if (reg_is_referenced(env, reg) && base_type(arg_type) != ARG_KPTR_XCHG_DEST) { + if (meta->id) { + verbose(env, "more than one arg with referenced id R%d %u %u", + regno, reg->id, + meta->id); return -EACCES; } - meta->ref_obj_id = reg->ref_obj_id; + meta->id = reg->id; } switch (base_type(arg_type)) { @@ -8898,14 +8869,14 @@ static void mark_pkt_end(struct bpf_verifier_state *vstate, int regn, bool range reg->range = AT_PKT_END; } -static int release_reference_nomark(struct bpf_verifier_state *state, int ref_obj_id) +static int release_reference_nomark(struct bpf_verifier_state *state, int id) { int i; for (i = 0; i < state->acquired_refs; i++) { if (state->refs[i].type != REF_TYPE_PTR) continue; - if (state->refs[i].id == ref_obj_id) { + if (state->refs[i].id == id) { release_reference_state(state, i); return 0; } @@ -8913,26 +8884,83 @@ static int release_reference_nomark(struct bpf_verifier_state *state, int ref_ob return -EINVAL; } -/* The pointer with the specified id has released its reference to kernel - * resources. Identify all copies of the same pointer and clear the reference. - * - * This is the release function corresponding to acquire_reference(). Idempotent. - */ -static int release_reference(struct bpf_verifier_env *env, int ref_obj_id) +static int idstack_push(struct bpf_idmap *idmap, u32 id) +{ + int i; + + if (!id) + return 0; + + for (i = 0; i < idmap->cnt; i++) + if (idmap->map[i].old == id) + return 0; + + if (WARN_ON_ONCE(idmap->cnt >= BPF_ID_MAP_SIZE)) + return -EFAULT; + + idmap->map[idmap->cnt++].old = id; + return 0; +} + +static int idstack_pop(struct bpf_idmap *idmap) { + if (!idmap->cnt) + return 0; + + return idmap->map[--idmap->cnt].old; +} + +/* Release id and objects derived from it iteratively in a DFS manner */ +static int release_reference(struct bpf_verifier_env *env, int id) +{ + u32 mask = (1 << STACK_SPILL) | (1 << STACK_DYNPTR); struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_idmap *idstack = &env->idmap_scratch; + struct bpf_stack_state *stack; struct bpf_func_state *state; struct bpf_reg_state *reg; - int err; + int i, err; - err = release_reference_nomark(vstate, ref_obj_id); + idstack->cnt = 0; + err = idstack_push(idstack, id); if (err) return err; - bpf_for_each_reg_in_vstate(vstate, state, reg, ({ - if (reg->ref_obj_id == ref_obj_id) - mark_reg_invalid(env, reg); - })); + if (find_reference_state(vstate, id)) + WARN_ON_ONCE(release_reference_nomark(vstate, id)); + + while ((id = idstack_pop(idstack))) { + /* + * Child references are inaccessible after parent is released, + * any child references that exist at this point are a leak. + */ + for (i = 0; i < vstate->acquired_refs; i++) { + if (vstate->refs[i].type != REF_TYPE_PTR) + continue; + if (vstate->refs[i].parent_id != id) + continue; + verbose(env, "Leaking reference id=%d alloc_insn=%d. Release it first.\n", + vstate->refs[i].id, vstate->refs[i].insn_idx); + return -EINVAL; + } + + bpf_for_each_reg_in_vstate_mask(vstate, state, reg, stack, mask, ({ + if (reg->id != id && reg->parent_id != id) + continue; + + /* Free objects derived from the current object */ + if (reg->parent_id == id) { + err = idstack_push(idstack, reg->id); + if (err) + return err; + } + + if (!stack || stack->slot_type[BPF_REG_SIZE - 1] == STACK_SPILL) + mark_reg_invalid(env, reg); + else if (stack->slot_type[BPF_REG_SIZE - 1] == STACK_DYNPTR) + invalidate_dynptr(env, stack); + })); + } return 0; } @@ -9833,7 +9861,7 @@ static int check_reference_leak(struct bpf_verifier_env *env, bool exception_exi * kernel. Type checks are performed later in check_return_code. */ if (type == BPF_PROG_TYPE_STRUCT_OPS && !exception_exit && - reg->ref_obj_id == state->refs[i].id) + reg->id == state->refs[i].id) continue; verbose(env, "Unreleased reference id=%d alloc_insn=%d\n", state->refs[i].id, state->refs[i].insn_idx); @@ -10116,18 +10144,18 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn err = -EINVAL; if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1])) { err = unmark_stack_slots_dynptr(env, ®s[meta.release_regno]); - } else if (func_id == BPF_FUNC_kptr_xchg && meta.ref_obj_id) { - u32 ref_obj_id = meta.ref_obj_id; + } else if (func_id == BPF_FUNC_kptr_xchg && meta.id) { + u32 id = meta.id; bool in_rcu = in_rcu_cs(env); struct bpf_func_state *state; struct bpf_reg_state *reg; - err = release_reference_nomark(env->cur_state, ref_obj_id); + err = release_reference_nomark(env->cur_state, id); if (!err) { bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({ - if (reg->ref_obj_id == ref_obj_id) { + if (reg->id == id) { if (in_rcu && (reg->type & MEM_ALLOC) && (reg->type & MEM_PERCPU)) { - reg->ref_obj_id = 0; + reg->id = 0; reg->type &= ~MEM_ALLOC; reg->type |= MEM_RCU; } else { @@ -10136,19 +10164,16 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn } })); } - } else if (meta.ref_obj_id) { - err = release_reference(env, meta.ref_obj_id); + } else if (meta.id) { + err = release_reference(env, meta.id); } else if (bpf_register_is_null(®s[meta.release_regno])) { - /* meta.ref_obj_id can only be 0 if register that is meant to be + /* meta.id can only be 0 if register that is meant to be * released is NULL, which must be > R0. */ err = 0; } - if (err) { - verbose(env, "func %s#%d reference has not been acquired before\n", - func_id_name(func_id), func_id); + if (err) return err; - } } switch (func_id) { @@ -10413,24 +10438,40 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return -EFAULT; } - if (is_ptr_cast_function(func_id)) { - /* For release_reference() */ - regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; + if (is_ptr_cast_function(func_id) && + find_reference_state(env->cur_state, meta.id)) { + struct bpf_verifier_state *branch; + struct bpf_reg_state *r0; + + /* + * In order for a release of any of the original or cast pointers + * to invalidate all other pointers, reuse the same reference id for + * the cast result. + * This reference id can't be used for nullness propagation, + * as cast might return NULL for a non-NULL input. + * Hence, explore the NULL case as a separate branch. + */ + branch = push_stack(env, env->insn_idx + 1, env->insn_idx, false); + if (IS_ERR(branch)) + return PTR_ERR(branch); + + r0 = &branch->frame[branch->curframe]->regs[BPF_REG_0]; + __mark_reg_known_zero(r0); + r0->type = SCALAR_VALUE; + + regs[BPF_REG_0].type &= ~PTR_MAYBE_NULL; + regs[BPF_REG_0].id = meta.id; } else if (is_acquire_function(func_id, meta.map.ptr)) { - int id = acquire_reference(env, insn_idx); + int id = acquire_reference(env, insn_idx, 0); if (id < 0) return id; - /* For mark_ptr_or_null_reg() */ + regs[BPF_REG_0].id = id; - /* For release_reference() */ - regs[BPF_REG_0].ref_obj_id = id; } - if (func_id == BPF_FUNC_dynptr_data) { - regs[BPF_REG_0].dynptr_id = meta.dynptr.id; - regs[BPF_REG_0].ref_obj_id = meta.dynptr.ref_obj_id; - } + if (func_id == BPF_FUNC_dynptr_data) + regs[BPF_REG_0].parent_id = meta.dynptr.id; err = do_refine_retval_range(env, regs, fn->ret_type, func_id, &meta); if (err) @@ -11242,7 +11283,7 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env, * btf_struct_ids_match() to walk the struct at the 0th offset, and * resolve types. */ - if ((is_kfunc_release(meta) && reg->ref_obj_id) || + if ((is_kfunc_release(meta) && reg_is_referenced(env, reg)) || btf_type_ids_nocast_alias(&env->log, reg_btf, reg_ref_id, meta->btf, ref_id)) strict_type_match = true; @@ -11346,36 +11387,21 @@ static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state return 0; } -static int ref_convert_owning_non_owning(struct bpf_verifier_env *env, u32 ref_obj_id) +static void ref_convert_owning_non_owning(struct bpf_verifier_env *env, u32 id) { - struct bpf_verifier_state *state = env->cur_state; struct bpf_func_state *unused; struct bpf_reg_state *reg; - int i; - if (!ref_obj_id) { - verifier_bug(env, "ref_obj_id is zero for owning -> non-owning conversion"); - return -EFAULT; - } + WARN_ON_ONCE(release_reference_nomark(env->cur_state, id)); - for (i = 0; i < state->acquired_refs; i++) { - if (state->refs[i].id != ref_obj_id) - continue; - - /* Clear ref_obj_id here so release_reference doesn't clobber - * the whole reg - */ - bpf_for_each_reg_in_vstate(env->cur_state, unused, reg, ({ - if (reg->ref_obj_id == ref_obj_id) { - reg->ref_obj_id = 0; - ref_set_non_owning(env, reg); - } - })); - return 0; - } + bpf_for_each_reg_in_vstate(env->cur_state, unused, reg, ({ + if (reg->id == id) { + reg->id = 0; + ref_set_non_owning(env, reg); + } + })); - verifier_bug(env, "ref state missing for ref_obj_id"); - return -EFAULT; + return; } /* Implementation details: @@ -11907,14 +11933,14 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return -EACCES; } - if (reg->ref_obj_id) { - if (is_kfunc_release(meta) && meta->ref_obj_id) { - verifier_bug(env, "more than one arg with ref_obj_id %s %u %u", - reg_arg_name(env, argno), reg->ref_obj_id, - meta->ref_obj_id); + if (reg_is_referenced(env, reg)) { + if (is_kfunc_release(meta) && meta->id) { + verifier_bug(env, "more than one arg with referenced id %s %u %u", + reg_arg_name(env, argno), reg->id, + meta->id); return -EFAULT; } - meta->ref_obj_id = reg->ref_obj_id; + meta->id = reg->id; if (is_kfunc_release(meta)) { if (regno < 0) { verbose(env, "%s release arg cannot be a stack argument\n", @@ -11975,7 +12001,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ fallthrough; case KF_ARG_PTR_TO_ALLOC_BTF_ID: case KF_ARG_PTR_TO_BTF_ID: - if (!is_trusted_reg(reg)) { + if (!is_trusted_reg(env, reg)) { if (!is_kfunc_rcu(meta)) { verbose(env, "%s must be referenced or trusted\n", reg_arg_name(env, argno)); @@ -12013,7 +12039,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return -EFAULT; } - if (is_kfunc_release(meta) && reg->ref_obj_id) + if (is_kfunc_release(meta) && reg_is_referenced(env, reg)) arg_type |= OBJ_RELEASE; ret = check_func_arg_reg_off(env, reg, argno, arg_type); if (ret < 0) @@ -12052,7 +12078,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ reg_arg_name(env, argno)); return -EINVAL; } - if (!reg->ref_obj_id) { + if (!reg_is_referenced(env, reg)) { verbose(env, "allocated object must be referenced\n"); return -EINVAL; } @@ -12064,7 +12090,6 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_DYNPTR: { enum bpf_arg_type dynptr_arg_type = ARG_PTR_TO_DYNPTR; - int clone_ref_obj_id = 0; if (is_kfunc_arg_uninit(btf, &args[i])) dynptr_arg_type |= MEM_UNINIT; @@ -12095,15 +12120,10 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } dynptr_arg_type |= (unsigned int)get_dynptr_type_flag(parent_type); - clone_ref_obj_id = meta->dynptr.ref_obj_id; - if (dynptr_type_refcounted(parent_type) && !clone_ref_obj_id) { - verifier_bug(env, "missing ref obj id for parent of clone"); - return -EFAULT; - } } ret = process_dynptr_func(env, reg, argno, insn_idx, dynptr_arg_type, - clone_ref_obj_id, &meta->dynptr); + meta->id, &meta->dynptr); if (ret < 0) return ret; break; @@ -12126,7 +12146,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ reg_arg_name(env, argno)); return -EINVAL; } - if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC) && !reg->ref_obj_id) { + if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC) && + !reg_is_referenced(env, reg)) { verbose(env, "allocated object must be referenced\n"); return -EINVAL; } @@ -12141,7 +12162,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ reg_arg_name(env, argno)); return -EINVAL; } - if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC) && !reg->ref_obj_id) { + if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC) && + !reg_is_referenced(env, reg)) { verbose(env, "allocated object must be referenced\n"); return -EINVAL; } @@ -12151,7 +12173,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ break; case KF_ARG_PTR_TO_LIST_NODE: if (is_kfunc_arg_nonown_allowed(btf, &args[i]) && - type_is_non_owning_ref(reg->type) && !reg->ref_obj_id) { + type_is_non_owning_ref(reg->type) && !reg_is_referenced(env, reg)) { /* Allow bpf_list_front/back return value for * __nonown_allowed list-node arguments. */ @@ -12162,7 +12184,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ reg_arg_name(env, argno)); return -EINVAL; } - if (!reg->ref_obj_id) { + if (!reg_is_referenced(env, reg)) { verbose(env, "allocated object must be referenced\n"); return -EINVAL; } @@ -12178,12 +12200,13 @@ check_ok: reg_arg_name(env, argno)); return -EINVAL; } - if (!reg->ref_obj_id) { + if (!reg_is_referenced(env, reg)) { verbose(env, "allocated object must be referenced\n"); return -EINVAL; } } else { - if (!type_is_non_owning_ref(reg->type) && !reg->ref_obj_id) { + if (!type_is_non_owning_ref(reg->type) && + !reg_is_referenced(env, reg)) { verbose(env, "%s can only take non-owning or refcounted bpf_rb_node pointer\n", func_name); return -EINVAL; } @@ -12764,12 +12787,7 @@ static int check_special_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_ca verifier_bug(env, "no dynptr id"); return -EFAULT; } - regs[BPF_REG_0].dynptr_id = meta->dynptr.id; - - /* we don't need to set BPF_REG_0's ref obj id - * because packet slices are not refcounted (see - * dynptr_type_refcounted) - */ + regs[BPF_REG_0].parent_id = meta->dynptr.id; } else { return 0; } @@ -12783,13 +12801,13 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx_p) { bool sleepable, rcu_lock, rcu_unlock, preempt_disable, preempt_enable; - u32 i, nargs, ptr_type_id, release_ref_obj_id; struct bpf_reg_state *regs = cur_regs(env); const char *func_name, *ptr_type_name; const struct btf_type *t, *ptr_type; struct bpf_kfunc_call_arg_meta meta; struct bpf_insn_aux_data *insn_aux; int err, insn_idx = *insn_idx_p; + u32 i, nargs, ptr_type_id, id; const struct btf_param *args; struct btf *desc_btf; @@ -12902,6 +12920,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (rcu_lock) { env->cur_state->active_rcu_locks++; } else if (rcu_unlock) { + struct bpf_stack_state *stack; struct bpf_func_state *state; struct bpf_reg_state *reg; u32 clear_mask = (1 << STACK_SPILL) | (1 << STACK_ITER); @@ -12911,7 +12930,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return -EINVAL; } if (--env->cur_state->active_rcu_locks == 0) { - bpf_for_each_reg_in_vstate_mask(env->cur_state, state, reg, clear_mask, ({ + bpf_for_each_reg_in_vstate_mask(env->cur_state, state, reg, stack, clear_mask, ({ if (reg->type & MEM_RCU) { reg->type &= ~(MEM_RCU | PTR_MAYBE_NULL); reg->type |= PTR_UNTRUSTED; @@ -12950,35 +12969,20 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (meta.release_regno) { struct bpf_reg_state *reg = ®s[meta.release_regno]; - if (meta.dynptr.ref_obj_id) { + if (meta.dynptr.id) { err = unmark_stack_slots_dynptr(env, reg); } else { - err = release_reference(env, reg->ref_obj_id); - if (err) - verbose(env, "kfunc %s#%d reference has not been acquired before\n", - func_name, meta.func_id); + err = release_reference(env, reg->id); } if (err) return err; } if (is_bpf_list_push_kfunc(meta.func_id) || is_bpf_rbtree_add_kfunc(meta.func_id)) { - release_ref_obj_id = regs[BPF_REG_2].ref_obj_id; + id = regs[BPF_REG_2].id; insn_aux->insert_off = regs[BPF_REG_2].var_off.value; insn_aux->kptr_struct_meta = btf_find_struct_meta(meta.arg_btf, meta.arg_btf_id); - err = ref_convert_owning_non_owning(env, release_ref_obj_id); - if (err) { - verbose(env, "kfunc %s#%d conversion of owning ref to non-owning failed\n", - func_name, meta.func_id); - return err; - } - - err = release_reference(env, release_ref_obj_id); - if (err) { - verbose(env, "kfunc %s#%d reference has not been acquired before\n", - func_name, meta.func_id); - return err; - } + ref_convert_owning_non_owning(env, id); } if (meta.func_id == special_kfunc_list[KF_bpf_throw]) { @@ -13063,8 +13067,8 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, regs[BPF_REG_0].type |= MEM_RDONLY; /* Ensures we don't access the memory after a release_reference() */ - if (meta.ref_obj_id) - regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; + if (meta.id) + regs[BPF_REG_0].parent_id = meta.id; if (is_kfunc_rcu_protected(&meta)) regs[BPF_REG_0].type |= MEM_RCU; @@ -13110,13 +13114,10 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } mark_btf_func_reg_size(env, BPF_REG_0, sizeof(void *)); if (is_kfunc_acquire(&meta)) { - int id = acquire_reference(env, insn_idx); - + id = acquire_reference(env, insn_idx, 0); if (id < 0) return id; - if (is_kfunc_ret_null(&meta)) - regs[BPF_REG_0].id = id; - regs[BPF_REG_0].ref_obj_id = id; + regs[BPF_REG_0].id = id; } else if (is_rbtree_node_type(ptr_type) || is_list_node_type(ptr_type)) { ref_set_non_owning(env, ®s[BPF_REG_0]); } @@ -15347,7 +15348,7 @@ static int is_branch_taken(struct bpf_verifier_env *env, struct bpf_reg_state *r if (!is_reg_const(reg2, is_jmp32)) return -1; - if (!reg_not_null(reg1)) + if (!reg_not_null(env, reg1)) return -1; /* If pointer is valid tests against zero will fail so we can @@ -15564,7 +15565,7 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, WARN_ON_ONCE(!tnum_equals_const(reg->var_off, 0))) return; if (is_null) { - /* We don't need id and ref_obj_id from this point + /* We don't need id from this point * onwards anymore, thus we should better reset it, * so that state pruning has chances to take effect. */ @@ -15591,10 +15592,9 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno, { struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *regs = state->regs, *reg; - u32 ref_obj_id = regs[regno].ref_obj_id; u32 id = regs[regno].id; - if (ref_obj_id && ref_obj_id == id && is_null) + if (is_null && find_reference_state(vstate, id)) /* regs[regno] is in the " == NULL" branch. * No one could have freed the reference state before * doing the NULL check. @@ -16433,7 +16433,7 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char ret_type = btf_type_resolve_ptr(prog->aux->attach_btf, prog->aux->attach_func_proto->type, NULL); - if (ret_type && ret_type == reg_type && reg->ref_obj_id) + if (ret_type && ret_type == reg_type && reg_is_referenced(env, reg)) return __check_ptr_off_reg(env, reg, argno_from_reg(regno), false); } @@ -18302,7 +18302,7 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) mark_reg_unknown(env, regs, i); } else if (arg->arg_type == ARG_PTR_TO_DYNPTR) { /* assume unspecial LOCAL dynptr type */ - __mark_dynptr_reg(reg, BPF_DYNPTR_TYPE_LOCAL, true, ++env->id_gen); + __mark_dynptr_reg(reg, BPF_DYNPTR_TYPE_LOCAL, true, ++env->id_gen, 0); } else if (base_type(arg->arg_type) == ARG_PTR_TO_MEM) { reg->type = PTR_TO_MEM; reg->type |= arg->arg_type & @@ -18361,8 +18361,8 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) /* Acquire references for struct_ops program arguments tagged with "__ref" */ if (!subprog && env->prog->type == BPF_PROG_TYPE_STRUCT_OPS) { for (i = 0; i < aux->ctx_arg_info_size; i++) - aux->ctx_arg_info[i].ref_obj_id = aux->ctx_arg_info[i].refcounted ? - acquire_reference(env, 0) : 0; + aux->ctx_arg_info[i].ref_id = aux->ctx_arg_info[i].refcounted ? + acquire_reference(env, 0, 0) : 0; } ret = do_check(env); diff --git a/tools/testing/selftests/bpf/prog_tests/spin_lock.c b/tools/testing/selftests/bpf/prog_tests/spin_lock.c index bbe476f4c47d..5c3579438427 100644 --- a/tools/testing/selftests/bpf/prog_tests/spin_lock.c +++ b/tools/testing/selftests/bpf/prog_tests/spin_lock.c @@ -13,8 +13,8 @@ static struct { const char *err_msg; } spin_lock_fail_tests[] = { { "lock_id_kptr_preserve", - "[0-9]\\+: (bf) r1 = r0 ; R0=ptr_foo(id=2,ref_obj_id=2)" - " R1=ptr_foo(id=2,ref_obj_id=2) refs=2\n" + "[0-9]\\+: (bf) r1 = r0 ; R0=ptr_foo(id=2)" + " R1=ptr_foo(id=2) refs=2\n" "[0-9]\\+: (85) call bpf_this_cpu_ptr#154\n" "R1 type=ptr_ expected=percpu_ptr_" }, { "lock_id_global_zero", diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c index dbd97add5a5a..fa0beeaad1be 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_fail.c +++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c @@ -78,7 +78,7 @@ static int get_map_val_dynptr(struct bpf_dynptr *ptr) * bpf_ringbuf_submit/discard_dynptr call */ SEC("?raw_tp") -__failure __msg("Unreleased reference id=2") +__failure __msg("Unreleased reference id=1") int ringbuf_missing_release1(void *ctx) { struct bpf_dynptr ptr = {}; @@ -91,7 +91,7 @@ int ringbuf_missing_release1(void *ctx) } SEC("?raw_tp") -__failure __msg("Unreleased reference id=4") +__failure __msg("Unreleased reference id=3") int ringbuf_missing_release2(void *ctx) { struct bpf_dynptr ptr1, ptr2; diff --git a/tools/testing/selftests/bpf/progs/iters_state_safety.c b/tools/testing/selftests/bpf/progs/iters_state_safety.c index af8f9ec1ea98..646026430e9b 100644 --- a/tools/testing/selftests/bpf/progs/iters_state_safety.c +++ b/tools/testing/selftests/bpf/progs/iters_state_safety.c @@ -30,7 +30,7 @@ int force_clang_to_emit_btf_for_externs(void *ctx) SEC("?raw_tp") __success __log_level(2) -__msg("fp-8=iter_num(ref_id=1,state=active,depth=0)") +__msg("fp-8=iter_num(id=1,state=active,depth=0)") int create_and_destroy(void *ctx) { struct bpf_iter_num iter; @@ -196,7 +196,7 @@ int leak_iter_from_subprog_fail(void *ctx) SEC("?raw_tp") __success __log_level(2) -__msg("fp-8=iter_num(ref_id=1,state=active,depth=0)") +__msg("fp-8=iter_num(id=1,state=active,depth=0)") int valid_stack_reuse(void *ctx) { struct bpf_iter_num iter; diff --git a/tools/testing/selftests/bpf/progs/iters_testmod_seq.c b/tools/testing/selftests/bpf/progs/iters_testmod_seq.c index 9b760dac333e..d00888f6687a 100644 --- a/tools/testing/selftests/bpf/progs/iters_testmod_seq.c +++ b/tools/testing/selftests/bpf/progs/iters_testmod_seq.c @@ -20,8 +20,8 @@ __s64 res_empty; SEC("raw_tp/sys_enter") __success __log_level(2) -__msg("fp-16=iter_testmod_seq(ref_id=1,state=active,depth=0)") -__msg("fp-16=iter_testmod_seq(ref_id=1,state=drained,depth=0)") +__msg("fp-16=iter_testmod_seq(id=1,state=active,depth=0)") +__msg("fp-16=iter_testmod_seq(id=1,state=drained,depth=0)") __msg("call bpf_iter_testmod_seq_destroy") int testmod_seq_empty(const void *ctx) { @@ -38,8 +38,8 @@ __s64 res_full; SEC("raw_tp/sys_enter") __success __log_level(2) -__msg("fp-16=iter_testmod_seq(ref_id=1,state=active,depth=0)") -__msg("fp-16=iter_testmod_seq(ref_id=1,state=drained,depth=0)") +__msg("fp-16=iter_testmod_seq(id=1,state=active,depth=0)") +__msg("fp-16=iter_testmod_seq(id=1,state=drained,depth=0)") __msg("call bpf_iter_testmod_seq_destroy") int testmod_seq_full(const void *ctx) { @@ -58,8 +58,8 @@ static volatile int zero = 0; SEC("raw_tp/sys_enter") __success __log_level(2) -__msg("fp-16=iter_testmod_seq(ref_id=1,state=active,depth=0)") -__msg("fp-16=iter_testmod_seq(ref_id=1,state=drained,depth=0)") +__msg("fp-16=iter_testmod_seq(id=1,state=active,depth=0)") +__msg("fp-16=iter_testmod_seq(id=1,state=drained,depth=0)") __msg("call bpf_iter_testmod_seq_destroy") int testmod_seq_truncated(const void *ctx) { -- cgit v1.2.3 From 92d681b42746d4497dcc8afb45edd4af5737542f Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 28 May 2026 18:49:29 -0700 Subject: bpf: Remove redundant dynptr arg check for helper unmark_stack_slots_dynptr() already makes sure that CONST_PTR_TO_DYNPTR cannot be released. process_dynptr_func() also prevents passing uninitialized dynptr to helpers expecting initialized dynptr. Now that unmark_stack_slots_dynptr() also reports error returned from release_reference(), there should be no reason to keep these redundant checks. Acked-by: Eduard Zingerman Signed-off-by: Amery Hung Link: https://lore.kernel.org/r/20260529014936.2811085-7-ameryhung@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 21 +-------------------- tools/testing/selftests/bpf/progs/dynptr_fail.c | 6 +++--- .../testing/selftests/bpf/progs/user_ringbuf_fail.c | 4 ++-- 3 files changed, 6 insertions(+), 25 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6d82ca5acacb..4f75e5f95d27 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8220,26 +8220,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, skip_type_check: if (arg_type_is_release(arg_type)) { - if (arg_type_is_dynptr(arg_type)) { - struct bpf_func_state *state = bpf_func(env, reg); - int spi; - - /* Only dynptr created on stack can be released, thus - * the get_spi and stack state checks for spilled_ptr - * should only be done before process_dynptr_func for - * PTR_TO_STACK. - */ - if (reg->type == PTR_TO_STACK) { - spi = dynptr_get_spi(env, reg); - if (spi < 0 || !state->stack[spi].spilled_ptr.id) { - verbose(env, "arg %d is an unacquired reference\n", regno); - return -EINVAL; - } - } else { - verbose(env, "cannot release unowned const bpf_dynptr\n"); - return -EINVAL; - } - } else if (!reg_is_referenced(env, reg) && !bpf_register_is_null(reg)) { + if (!arg_type_is_dynptr(arg_type) && !reg_is_referenced(env, reg) && !bpf_register_is_null(reg)) { verbose(env, "R%d must be referenced when passed to release function\n", regno); return -EINVAL; diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c index fa0beeaad1be..40a14a5174a5 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_fail.c +++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c @@ -136,7 +136,7 @@ int ringbuf_missing_release_callback(void *ctx) /* Can't call bpf_ringbuf_submit/discard_dynptr on a non-initialized dynptr */ SEC("?raw_tp") -__failure __msg("arg 1 is an unacquired reference") +__failure __msg("Expected an initialized dynptr as R1") int ringbuf_release_uninit_dynptr(void *ctx) { struct bpf_dynptr ptr; @@ -650,7 +650,7 @@ int invalid_offset(void *ctx) /* Can't release a dynptr twice */ SEC("?raw_tp") -__failure __msg("arg 1 is an unacquired reference") +__failure __msg("Expected an initialized dynptr as R1") int release_twice(void *ctx) { struct bpf_dynptr ptr; @@ -677,7 +677,7 @@ static int release_twice_callback_fn(__u32 index, void *data) * within a callback function, fails */ SEC("?raw_tp") -__failure __msg("arg 1 is an unacquired reference") +__failure __msg("Expected an initialized dynptr as R1") int release_twice_callback(void *ctx) { struct bpf_dynptr ptr; diff --git a/tools/testing/selftests/bpf/progs/user_ringbuf_fail.c b/tools/testing/selftests/bpf/progs/user_ringbuf_fail.c index 54de0389f878..c0d0422b8030 100644 --- a/tools/testing/selftests/bpf/progs/user_ringbuf_fail.c +++ b/tools/testing/selftests/bpf/progs/user_ringbuf_fail.c @@ -146,7 +146,7 @@ try_discard_dynptr(struct bpf_dynptr *dynptr, void *context) * not be able to read past the end of the pointer. */ SEC("?raw_tp") -__failure __msg("cannot release unowned const bpf_dynptr") +__failure __msg("CONST_PTR_TO_DYNPTR cannot be released") int user_ringbuf_callback_discard_dynptr(void *ctx) { bpf_user_ringbuf_drain(&user_ringbuf, try_discard_dynptr, NULL, 0); @@ -166,7 +166,7 @@ try_submit_dynptr(struct bpf_dynptr *dynptr, void *context) * not be able to read past the end of the pointer. */ SEC("?raw_tp") -__failure __msg("cannot release unowned const bpf_dynptr") +__failure __msg("CONST_PTR_TO_DYNPTR cannot be released") int user_ringbuf_callback_submit_dynptr(void *ctx) { bpf_user_ringbuf_drain(&user_ringbuf, try_submit_dynptr, NULL, 0); -- cgit v1.2.3 From b7dd2b388657d99689161e82ed13515505838232 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 28 May 2026 18:49:30 -0700 Subject: bpf: Unify referenced object tracking in verifier Helpers and kfuncs independently tracked referenced object metadata using standalone id fields in their respective arg_meta structs. This led to duplicated logic and inconsistent error handling between the two paths. Introduce struct ref_obj_desc to consolidate id and parent_id along with a count of how many arguments carry a reference. Add update_ref_obj() to populate it from a bpf_reg_state, replacing open-coded assignments in check_func_arg(), check_kfunc_args(), and process_iter_arg(). Add validate_ref_obj() to check for ambiguous ref_obj before using it. For ref_obj releasing helpers and kfuncs, keep checking it before calling update_ref_obj() for now. A later patch will make these functions not depending on ref_obj. For other users of ref_obj, move the checks to the use locations. For helper, this means moving the checks inside helper_multiple_ref_obj_use() to use locations. is_acquire_function() is dropped as ref_obj is never used. Pass ref_obj_desc into process_dynptr_func()/mark_stack_slots_dynptr() instead of a bare parent_id to make it less confusing. Drop the selftest introduced in 7ec899ac90a2 ("selftests/bpf: Negative test case for ref_obj_id in args") since the verifier no longer complains about ambiguous ref_obj if it is not used. Acked-by: Eduard Zingerman Signed-off-by: Amery Hung Link: https://lore.kernel.org/r/20260529014936.2811085-8-ameryhung@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 14 ++- kernel/bpf/verifier.c | 122 +++++++++++---------- .../selftests/bpf/progs/test_ringbuf_map_key.c | 11 +- tools/testing/selftests/bpf/verifier/calls.c | 24 ---- 4 files changed, 78 insertions(+), 93 deletions(-) (limited to 'kernel/bpf') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 75b287d8d92f..b0521ba7787a 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -1424,6 +1424,18 @@ struct bpf_dynptr_desc { u32 parent_id; }; +/* + * The last seen rereferenced object; Updated by update_ref_obj() when a register refers to a + * referenced object. Used when the helper or kfunc is releasing a referenced object, casting + * a referenced object, returning allocated memory derived from referenced object or creating + * a dynptr with a referenced object as parent. + */ +struct ref_obj_desc { + u32 id; + u32 parent_id; + u8 cnt; +}; + struct bpf_kfunc_call_arg_meta { /* In parameters */ struct btf *btf; @@ -1432,7 +1444,6 @@ struct bpf_kfunc_call_arg_meta { const struct btf_type *func_proto; const char *func_name; /* Out parameters */ - u32 id; u8 release_regno; bool r0_rdonly; u32 ret_btf_id; @@ -1470,6 +1481,7 @@ struct bpf_kfunc_call_arg_meta { } iter; struct bpf_map_desc map; struct bpf_dynptr_desc dynptr; + struct ref_obj_desc ref_obj; u64 mem_size; }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4f75e5f95d27..bc8a09c858d8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -231,9 +231,28 @@ static void bpf_map_key_store(struct bpf_insn_aux_data *aux, u64 state) (poisoned ? BPF_MAP_KEY_POISON : 0ULL); } +static void update_ref_obj(struct ref_obj_desc *ref_obj, struct bpf_reg_state *reg) +{ + ref_obj->id = reg->id; + ref_obj->parent_id = reg->parent_id; + ref_obj->cnt++; +} + +static int validate_ref_obj(struct bpf_verifier_env *env, struct ref_obj_desc *ref_obj) +{ + if (ref_obj->cnt > 1) { + verifier_bug(env, "function expects only one referenced object but got %d\n", + ref_obj->cnt); + return -EFAULT; + } + + return 0; +} + struct bpf_call_arg_meta { struct bpf_map_desc map; struct bpf_dynptr_desc dynptr; + struct ref_obj_desc ref_obj; bool raw_mode; bool pkt_access; u8 release_regno; @@ -241,7 +260,6 @@ struct bpf_call_arg_meta { int access_size; int mem_size; u64 msize_max_value; - u32 id; int func_id; struct btf *btf; u32 btf_id; @@ -528,20 +546,6 @@ bool bpf_is_may_goto_insn(struct bpf_insn *insn) return insn->code == (BPF_JMP | BPF_JCOND) && insn->src_reg == BPF_MAY_GOTO; } -static bool helper_multiple_ref_obj_use(enum bpf_func_id func_id, - const struct bpf_map *map) -{ - int ref_obj_uses = 0; - - if (is_ptr_cast_function(func_id)) - ref_obj_uses++; - if (is_acquire_function(func_id, map)) - ref_obj_uses++; - - return ref_obj_uses > 1; -} - - static bool is_spi_bounds_valid(struct bpf_func_state *state, int spi, int nr_slots) { int allocated_slots = state->allocated_stack / BPF_REG_SIZE; @@ -670,11 +674,11 @@ static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env, struct bpf_func_state *state, int spi); static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg, - enum bpf_arg_type arg_type, int insn_idx, int parent_id, - struct bpf_dynptr_desc *dynptr) + enum bpf_arg_type arg_type, int insn_idx, + struct ref_obj_desc *ref_obj, struct bpf_dynptr_desc *dynptr) { struct bpf_func_state *state = bpf_func(env, reg); - int spi, i, err; + int spi, i, err, parent_id = 0; enum bpf_dynptr_type type; spi = dynptr_get_spi(env, reg); @@ -707,6 +711,13 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_ return -EINVAL; if (dynptr->type == BPF_DYNPTR_TYPE_INVALID) { /* dynptr constructors */ + err = validate_ref_obj(env, ref_obj); + if (err) + return err; + + /* Track parent's id if the parent is a referenced object */ + parent_id = ref_obj->id; + if (dynptr_type_referenced(type)) { int id; @@ -7188,7 +7199,7 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, */ static int process_dynptr_func(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, int insn_idx, enum bpf_arg_type arg_type, - int parent_id, struct bpf_dynptr_desc *dynptr) + struct ref_obj_desc *ref_obj, struct bpf_dynptr_desc *dynptr) { int spi, err = 0; @@ -7229,7 +7240,7 @@ static int process_dynptr_func(struct bpf_verifier_env *env, struct bpf_reg_stat return err; } - err = mark_stack_slots_dynptr(env, reg, arg_type, insn_idx, parent_id, dynptr); + err = mark_stack_slots_dynptr(env, reg, arg_type, insn_idx, ref_obj, dynptr); } else /* OBJ_RELEASE and None case from above */ { /* For the reg->type == PTR_TO_STACK case, bpf_dynptr is never const */ if (reg->type == CONST_PTR_TO_DYNPTR && (arg_type & OBJ_RELEASE)) { @@ -7277,13 +7288,6 @@ static int process_dynptr_func(struct bpf_verifier_env *env, struct bpf_reg_stat return err; } -static u32 iter_ref_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int spi) -{ - struct bpf_func_state *state = bpf_func(env, reg); - - return state->stack[spi].spilled_ptr.id; -} - static bool is_iter_kfunc(struct bpf_kfunc_call_arg_meta *meta) { return meta->kfunc_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY); @@ -7316,6 +7320,7 @@ static bool is_kfunc_arg_iter(struct bpf_kfunc_call_arg_meta *meta, int arg_idx, static int process_iter_arg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, int insn_idx, struct bpf_kfunc_call_arg_meta *meta) { + struct bpf_func_state *state = bpf_func(env, reg); const struct btf_type *t; u32 arg_idx = arg_idx_from_argno(argno); int spi, err, i, nr_slots, btf_id; @@ -7387,7 +7392,7 @@ static int process_iter_arg(struct bpf_verifier_env *env, struct bpf_reg_state * /* remember meta->iter info for process_iter_next_call() */ meta->iter.spi = spi; meta->iter.frameno = reg->frameno; - meta->id = iter_ref_id(env, reg, spi); + update_ref_obj(&meta->ref_obj, &state->stack[spi].spilled_ptr); if (is_iter_destroy_kfunc(meta)) { err = unmark_stack_slots_iter(env, reg, nr_slots); @@ -8166,6 +8171,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, u32 regno = BPF_REG_1 + arg; struct bpf_reg_state *reg = reg_state(env, regno); enum bpf_arg_type arg_type = fn->arg_type[arg]; + argno_t argno = argno_from_arg(arg + 1); enum bpf_reg_type type = reg->type; u32 *arg_btf_id = NULL; u32 key_size; @@ -8232,15 +8238,8 @@ skip_type_check: meta->release_regno = regno; } - if (reg_is_referenced(env, reg) && base_type(arg_type) != ARG_KPTR_XCHG_DEST) { - if (meta->id) { - verbose(env, "more than one arg with referenced id R%d %u %u", - regno, reg->id, - meta->id); - return -EACCES; - } - meta->id = reg->id; - } + if (reg_is_referenced(env, reg)) + update_ref_obj(&meta->ref_obj, reg); switch (base_type(arg_type)) { case ARG_CONST_MAP_PTR: @@ -8379,7 +8378,7 @@ skip_type_check: true, meta); break; case ARG_PTR_TO_DYNPTR: - err = process_dynptr_func(env, reg, argno_from_reg(regno), insn_idx, arg_type, 0, + err = process_dynptr_func(env, reg, argno_from_reg(regno), insn_idx, arg_type, &meta->ref_obj, &meta->dynptr); if (err) return err; @@ -9042,6 +9041,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, struct bpf_subprog_info *sub = subprog_info(env, subprog); struct bpf_func_state *caller = cur_func(env); struct bpf_verifier_log *log = &env->log; + struct ref_obj_desc ref_obj = {}; u32 i; int ret, err; @@ -9119,7 +9119,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, if (ret) return ret; - ret = process_dynptr_func(env, reg, argno, -1, arg->arg_type, 0, NULL); + ret = process_dynptr_func(env, reg, argno, -1, arg->arg_type, &ref_obj, NULL); if (ret) return ret; } else if (base_type(arg->arg_type) == ARG_PTR_TO_BTF_ID) { @@ -10125,8 +10125,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn err = -EINVAL; if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1])) { err = unmark_stack_slots_dynptr(env, ®s[meta.release_regno]); - } else if (func_id == BPF_FUNC_kptr_xchg && meta.id) { - u32 id = meta.id; + } else if (func_id == BPF_FUNC_kptr_xchg && meta.ref_obj.id) { + u32 id = meta.ref_obj.id; bool in_rcu = in_rcu_cs(env); struct bpf_func_state *state; struct bpf_reg_state *reg; @@ -10145,10 +10145,10 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn } })); } - } else if (meta.id) { - err = release_reference(env, meta.id); + } else if (meta.ref_obj.id) { + err = release_reference(env, meta.ref_obj.id); } else if (bpf_register_is_null(®s[meta.release_regno])) { - /* meta.id can only be 0 if register that is meant to be + /* meta.ref_obj.id can only be 0 if register that is meant to be * released is NULL, which must be > R0. */ err = 0; @@ -10413,17 +10413,15 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (type_may_be_null(regs[BPF_REG_0].type)) regs[BPF_REG_0].id = ++env->id_gen; - if (helper_multiple_ref_obj_use(func_id, meta.map.ptr)) { - verifier_bug(env, "func %s#%d sets ref_obj_id more than once", - func_id_name(func_id), func_id); - return -EFAULT; - } - if (is_ptr_cast_function(func_id) && - find_reference_state(env->cur_state, meta.id)) { + find_reference_state(env->cur_state, meta.ref_obj.id)) { struct bpf_verifier_state *branch; struct bpf_reg_state *r0; + err = validate_ref_obj(env, &meta.ref_obj); + if (err) + return err; + /* * In order for a release of any of the original or cast pointers * to invalidate all other pointers, reuse the same reference id for @@ -10441,7 +10439,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn r0->type = SCALAR_VALUE; regs[BPF_REG_0].type &= ~PTR_MAYBE_NULL; - regs[BPF_REG_0].id = meta.id; + regs[BPF_REG_0].id = meta.ref_obj.id; } else if (is_acquire_function(func_id, meta.map.ptr)) { int id = acquire_reference(env, insn_idx, 0); @@ -11915,13 +11913,13 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } if (reg_is_referenced(env, reg)) { - if (is_kfunc_release(meta) && meta->id) { - verifier_bug(env, "more than one arg with referenced id %s %u %u", - reg_arg_name(env, argno), reg->id, - meta->id); + if (is_kfunc_release(meta) && meta->ref_obj.cnt) { + verbose(env, "more than one arg with referenced id %s %u %u", + reg_arg_name(env, argno), reg->id, + meta->ref_obj.id); return -EFAULT; } - meta->id = reg->id; + update_ref_obj(&meta->ref_obj, reg); if (is_kfunc_release(meta)) { if (regno < 0) { verbose(env, "%s release arg cannot be a stack argument\n", @@ -12104,7 +12102,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } ret = process_dynptr_func(env, reg, argno, insn_idx, dynptr_arg_type, - meta->id, &meta->dynptr); + &meta->ref_obj, &meta->dynptr); if (ret < 0) return ret; break; @@ -13048,8 +13046,12 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, regs[BPF_REG_0].type |= MEM_RDONLY; /* Ensures we don't access the memory after a release_reference() */ - if (meta.id) - regs[BPF_REG_0].parent_id = meta.id; + if (meta.ref_obj.id) { + err = validate_ref_obj(env, &meta.ref_obj); + if (err) + return err; + regs[BPF_REG_0].parent_id = meta.ref_obj.id; + } if (is_kfunc_rcu_protected(&meta)) regs[BPF_REG_0].type |= MEM_RCU; diff --git a/tools/testing/selftests/bpf/progs/test_ringbuf_map_key.c b/tools/testing/selftests/bpf/progs/test_ringbuf_map_key.c index 21bb7da90ea5..0efafa927a3d 100644 --- a/tools/testing/selftests/bpf/progs/test_ringbuf_map_key.c +++ b/tools/testing/selftests/bpf/progs/test_ringbuf_map_key.c @@ -35,7 +35,7 @@ SEC("fentry/" SYS_PREFIX "sys_getpgid") int test_ringbuf_mem_map_key(void *ctx) { int cur_pid = bpf_get_current_pid_tgid() >> 32; - struct sample *sample, sample_copy; + struct sample *sample; int *lookup_val; if (cur_pid != pid) @@ -55,16 +55,11 @@ int test_ringbuf_mem_map_key(void *ctx) lookup_val = (int *)bpf_map_lookup_elem(&hash_map, sample); __sink(lookup_val); - /* workaround - memcpy is necessary so that verifier doesn't - * complain with: - * verifier internal error: more than one arg with ref_obj_id R3 - * when trying to do bpf_map_update_elem(&hash_map, sample, &sample->seq, BPF_ANY); - * + /* * Since bpf_map_lookup_elem above uses 'sample' as key, test using * sample field as value below */ - __builtin_memcpy(&sample_copy, sample, sizeof(struct sample)); - bpf_map_update_elem(&hash_map, &sample_copy, &sample->seq, BPF_ANY); + bpf_map_update_elem(&hash_map, sample, &sample->seq, BPF_ANY); bpf_ringbuf_submit(sample, 0); return 0; diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c index 0bb4337552c8..42d523a21a43 100644 --- a/tools/testing/selftests/bpf/verifier/calls.c +++ b/tools/testing/selftests/bpf/verifier/calls.c @@ -2410,27 +2410,3 @@ .errstr_unpriv = "", .prog_type = BPF_PROG_TYPE_CGROUP_SKB, }, -{ - "calls: several args with ref_obj_id", - .insns = { - /* Reserve at least sizeof(struct iphdr) bytes in the ring buffer. - * With a smaller size, the verifier would reject the call to - * bpf_tcp_raw_gen_syncookie_ipv4 before we can reach the - * ref_obj_id error. - */ - BPF_MOV64_IMM(BPF_REG_2, 20), - BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_LD_MAP_FD(BPF_REG_1, 0), - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_reserve), - /* if r0 == 0 goto */ - BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3), - BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), - BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_tcp_raw_gen_syncookie_ipv4), - BPF_EXIT_INSN(), - }, - .fixup_map_ringbuf = { 2 }, - .result = REJECT, - .errstr = "more than one arg with ref_obj_id", - .prog_type = BPF_PROG_TYPE_SCHED_CLS, -}, -- cgit v1.2.3 From bcfcb15fde94ed39068eb1d6e4b9b37d27111965 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 28 May 2026 18:49:31 -0700 Subject: bpf: Unify release handling for helpers and kfuncs Introduce release_reg() to consolidate the release logic shared by both helpers and kfuncs: dynptr release, kptr_xchg percpu-to-RCU conversion, regular reference release, and NULL pass-through. NULL pass-through is only allowed if the prototype indicates the argument may be null. Determine release_regno from the function prototype/metadata before argument checking, rather than discovering it dynamically during argument processing. For helpers, scan the arg_type array in check_func_proto() via check_proto_release_reg(). For kfuncs, set release_regno to BPF_REG_1 in bpf_fetch_kfunc_arg_meta() when KF_RELEASE is set. In the future when we start adding decl_tag to kfunc arguments, we can just look at the function prototype instead of a release_regno. Extract ref_convert_alloc_rcu_protected() and invalidate_rcu_protected_refs() to make it more clear what the code is doing. For ref_convert_alloc_rcu_protected(), it pre-converts MEM_ALLOC | MEM_PERCPU registers to MEM_RCU (clearing id so they survive), then calls release_reference() to invalidate the remaining registers and release the reference state. Add KF_RELEASE to bpf_dynptr_file_discard() so its release_regno is set via fetch_kfunc_meta rather than being assigned manually in the dynptr argument processing. Set arg_type to ARG_PTR_TO_DYNPTR for KF_ARG_PTR_TO_DYNPTR so that check_func_arg_reg_off() correctly allows non-zero stack offsets for dynptr release arguments same as helper. Acked-by: Eduard Zingerman Signed-off-by: Amery Hung Link: https://lore.kernel.org/r/20260529014936.2811085-9-ameryhung@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 6 +- kernel/bpf/helpers.c | 2 +- kernel/bpf/verifier.c | 198 +++++++++++---------- tools/testing/selftests/bpf/prog_tests/cb_refs.c | 2 +- .../selftests/bpf/progs/cgrp_kfunc_failure.c | 6 +- tools/testing/selftests/bpf/progs/map_kptr_fail.c | 2 +- .../selftests/bpf/progs/task_kfunc_failure.c | 6 +- .../selftests/bpf/progs/verifier_global_ptr_args.c | 2 +- .../selftests/bpf/progs/verifier_ref_tracking.c | 2 +- tools/testing/selftests/bpf/progs/verifier_sock.c | 6 +- .../selftests/bpf/progs/verifier_vfs_reject.c | 2 +- .../selftests/bpf/progs/wakeup_source_fail.c | 2 +- 12 files changed, 122 insertions(+), 114 deletions(-) (limited to 'kernel/bpf') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index b0521ba7787a..3dd2d21230af 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -1426,9 +1426,9 @@ struct bpf_dynptr_desc { /* * The last seen rereferenced object; Updated by update_ref_obj() when a register refers to a - * referenced object. Used when the helper or kfunc is releasing a referenced object, casting - * a referenced object, returning allocated memory derived from referenced object or creating - * a dynptr with a referenced object as parent. + * referenced object. Used when the helper or kfunc is casting a referenced object, returning + * allocated memory derived from referenced object or creating a dynptr with a referenced + * object as parent. */ struct ref_obj_desc { u32 id; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 9ca195104667..03004e4451f5 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -4957,7 +4957,7 @@ BTF_ID_FLAGS(func, bpf_stream_print_stack, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_task_work_schedule_signal, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_task_work_schedule_resume, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_dynptr_from_file) -BTF_ID_FLAGS(func, bpf_dynptr_file_discard) +BTF_ID_FLAGS(func, bpf_dynptr_file_discard, KF_RELEASE) BTF_ID_FLAGS(func, bpf_timer_cancel_async) BTF_KFUNCS_END(common_btf_ids) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bc8a09c858d8..caa455fad877 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8225,17 +8225,11 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, return err; skip_type_check: - if (arg_type_is_release(arg_type)) { - if (!arg_type_is_dynptr(arg_type) && !reg_is_referenced(env, reg) && !bpf_register_is_null(reg)) { - verbose(env, "R%d must be referenced when passed to release function\n", - regno); - return -EINVAL; - } - if (meta->release_regno) { - verifier_bug(env, "more than one release argument"); - return -EFAULT; - } - meta->release_regno = regno; + if (arg_type_is_release(arg_type) && !arg_type_is_dynptr(arg_type) && + !reg_is_referenced(env, reg) && !bpf_register_is_null(reg)) { + verbose(env, "release helper %s expects referenced PTR_TO_BTF_ID passed to %s\n", + func_id_name(meta->func_id), reg_arg_name(env, argno)); + return -EINVAL; } if (reg_is_referenced(env, reg)) @@ -8798,11 +8792,29 @@ static bool check_mem_arg_rw_flag_ok(const struct bpf_func_proto *fn) return true; } -static int check_func_proto(const struct bpf_func_proto *fn) +static bool check_proto_release_reg(const struct bpf_func_proto *fn, struct bpf_call_arg_meta *meta) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(fn->arg_type); i++) { + enum bpf_arg_type arg_type = fn->arg_type[i]; + + if (arg_type_is_release(arg_type)) { + if (meta->release_regno) + return false; + meta->release_regno = i + 1; + } + } + + return true; +} + +static int check_func_proto(const struct bpf_func_proto *fn, struct bpf_call_arg_meta *meta) { return check_raw_mode_ok(fn) && check_arg_pair_ok(fn) && check_mem_arg_rw_flag_ok(fn) && + check_proto_release_reg(fn, meta) && check_btf_id_ok(fn) ? 0 : -EINVAL; } @@ -8956,6 +8968,42 @@ static void invalidate_non_owning_refs(struct bpf_verifier_env *env) })); } +static void invalidate_rcu_protected_refs(struct bpf_verifier_env *env) +{ + struct bpf_stack_state *stack; + struct bpf_func_state *state; + struct bpf_reg_state *reg; + u32 clear_mask = (1 << STACK_SPILL) | (1 << STACK_ITER); + + bpf_for_each_reg_in_vstate_mask(env->cur_state, state, reg, stack, clear_mask, ({ + if (reg->type & MEM_RCU) { + reg->type &= ~(MEM_RCU | PTR_MAYBE_NULL); + reg->type |= PTR_UNTRUSTED; + } + })); +} + +static int ref_convert_alloc_rcu_protected(struct bpf_verifier_env *env, u32 id) +{ + struct bpf_func_state *state; + struct bpf_reg_state *reg; + int err; + + err = release_reference_nomark(env->cur_state, id); + + bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({ + if (reg->id != id) + continue; + if ((reg->type & MEM_ALLOC) && (reg->type & MEM_PERCPU)) { + reg->id = 0; + reg->type &= ~MEM_ALLOC; + reg->type |= MEM_RCU; + } + })); + + return err; +} + static void clear_caller_saved_regs(struct bpf_verifier_env *env, struct bpf_reg_state *regs) { @@ -10028,6 +10076,24 @@ static const char *non_sleepable_context_description(struct bpf_verifier_env *en return "non-sleepable prog"; } +static int release_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + bool convert_rcu, bool release_dynptr) +{ + int err = -EINVAL; + + if (bpf_register_is_null(reg)) + return 0; + + if (release_dynptr) + err = unmark_stack_slots_dynptr(env, reg); + else if (convert_rcu) + err = ref_convert_alloc_rcu_protected(env, reg->id); + else if (reg_is_referenced(env, reg)) + err = release_reference(env, reg->id); + + return err; +} + static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx_p) { @@ -10077,7 +10143,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn memset(&meta, 0, sizeof(meta)); meta.pkt_access = fn->pkt_access; - err = check_func_proto(fn); + err = check_func_proto(fn, &meta); if (err) { verifier_bug(env, "incorrect func proto %s#%d", func_id_name(func_id), func_id); return err; @@ -10122,37 +10188,11 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn } if (meta.release_regno) { - err = -EINVAL; - if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1])) { - err = unmark_stack_slots_dynptr(env, ®s[meta.release_regno]); - } else if (func_id == BPF_FUNC_kptr_xchg && meta.ref_obj.id) { - u32 id = meta.ref_obj.id; - bool in_rcu = in_rcu_cs(env); - struct bpf_func_state *state; - struct bpf_reg_state *reg; - - err = release_reference_nomark(env->cur_state, id); - if (!err) { - bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({ - if (reg->id == id) { - if (in_rcu && (reg->type & MEM_ALLOC) && (reg->type & MEM_PERCPU)) { - reg->id = 0; - reg->type &= ~MEM_ALLOC; - reg->type |= MEM_RCU; - } else { - mark_reg_invalid(env, reg); - } - } - })); - } - } else if (meta.ref_obj.id) { - err = release_reference(env, meta.ref_obj.id); - } else if (bpf_register_is_null(®s[meta.release_regno])) { - /* meta.ref_obj.id can only be 0 if register that is meant to be - * released is NULL, which must be > R0. - */ - err = 0; - } + struct bpf_reg_state *reg = ®s[meta.release_regno]; + bool convert_rcu = (func_id == BPF_FUNC_kptr_xchg) && in_rcu_cs(env) && + (reg->type & MEM_ALLOC) && (reg->type & MEM_PERCPU); + + err = release_reg(env, reg, convert_rcu, !!meta.dynptr.id); if (err) return err; } @@ -10547,7 +10587,6 @@ static bool is_kfunc_release(struct bpf_kfunc_call_arg_meta *meta) return meta->kfunc_flags & KF_RELEASE; } - static bool is_kfunc_destructive(struct bpf_kfunc_call_arg_meta *meta) { return meta->kfunc_flags & KF_DESTRUCTIVE; @@ -11912,24 +11951,16 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return -EACCES; } - if (reg_is_referenced(env, reg)) { - if (is_kfunc_release(meta) && meta->ref_obj.cnt) { - verbose(env, "more than one arg with referenced id %s %u %u", - reg_arg_name(env, argno), reg->id, - meta->ref_obj.id); - return -EFAULT; - } - update_ref_obj(&meta->ref_obj, reg); - if (is_kfunc_release(meta)) { - if (regno < 0) { - verbose(env, "%s release arg cannot be a stack argument\n", - reg_arg_name(env, argno)); - return -EINVAL; - } - meta->release_regno = regno; - } + if (regno == meta->release_regno && !is_kfunc_arg_dynptr(meta->btf, &args[i]) && + !reg_is_referenced(env, reg) && !bpf_register_is_null(reg)) { + verbose(env, "release kfunc %s expects referenced PTR_TO_BTF_ID passed to %s\n", + func_name, reg_arg_name(env, argno)); + return -EINVAL; } + if (reg_is_referenced(env, reg)) + update_ref_obj(&meta->ref_obj, reg); + ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id); ref_tname = btf_name_by_offset(btf, ref_t->name_off); @@ -11993,7 +12024,6 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } } fallthrough; - case KF_ARG_PTR_TO_DYNPTR: case KF_ARG_PTR_TO_ITER: case KF_ARG_PTR_TO_LIST_HEAD: case KF_ARG_PTR_TO_LIST_NODE: @@ -12010,6 +12040,9 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_IRQ_FLAG: case KF_ARG_PTR_TO_RES_SPIN_LOCK: break; + case KF_ARG_PTR_TO_DYNPTR: + arg_type = ARG_PTR_TO_DYNPTR; + break; case KF_ARG_PTR_TO_CTX: arg_type = ARG_PTR_TO_CTX; break; @@ -12018,7 +12051,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return -EFAULT; } - if (is_kfunc_release(meta) && reg_is_referenced(env, reg)) + if (regno == meta->release_regno) arg_type |= OBJ_RELEASE; ret = check_func_arg_reg_off(env, reg, argno, arg_type); if (ret < 0) @@ -12083,12 +12116,6 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ dynptr_arg_type |= DYNPTR_TYPE_FILE; } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_file_discard]) { dynptr_arg_type |= DYNPTR_TYPE_FILE | OBJ_RELEASE; - if (regno < 0) { - verbose(env, "%s release arg cannot be a stack argument\n", - reg_arg_name(env, argno)); - return -EINVAL; - } - meta->release_regno = regno; } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_clone] && (dynptr_arg_type & MEM_UNINIT)) { enum bpf_dynptr_type parent_type = meta->dynptr.type; @@ -12377,12 +12404,6 @@ check_ok: } } - if (is_kfunc_release(meta) && !meta->release_regno) { - verbose(env, "release kernel function %s expects refcounted PTR_TO_BTF_ID\n", - func_name); - return -EINVAL; - } - return 0; } @@ -12409,6 +12430,10 @@ int bpf_fetch_kfunc_arg_meta(struct bpf_verifier_env *env, meta->kfunc_flags = *kfunc.flags; + /* Only support release referenced argument passed by register */ + if (is_kfunc_release(meta)) + meta->release_regno = BPF_REG_1; + return 0; } @@ -12899,23 +12924,12 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (rcu_lock) { env->cur_state->active_rcu_locks++; } else if (rcu_unlock) { - struct bpf_stack_state *stack; - struct bpf_func_state *state; - struct bpf_reg_state *reg; - u32 clear_mask = (1 << STACK_SPILL) | (1 << STACK_ITER); - if (env->cur_state->active_rcu_locks == 0) { verbose(env, "unmatched rcu read unlock (kernel function %s)\n", func_name); return -EINVAL; } - if (--env->cur_state->active_rcu_locks == 0) { - bpf_for_each_reg_in_vstate_mask(env->cur_state, state, reg, stack, clear_mask, ({ - if (reg->type & MEM_RCU) { - reg->type &= ~(MEM_RCU | PTR_MAYBE_NULL); - reg->type |= PTR_UNTRUSTED; - } - })); - } + if (--env->cur_state->active_rcu_locks == 0) + invalidate_rcu_protected_refs(env); } else if (preempt_disable) { env->cur_state->active_preempt_locks++; } else if (preempt_enable) { @@ -12946,13 +12960,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, * PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now. */ if (meta.release_regno) { - struct bpf_reg_state *reg = ®s[meta.release_regno]; - - if (meta.dynptr.id) { - err = unmark_stack_slots_dynptr(env, reg); - } else { - err = release_reference(env, reg->id); - } + err = release_reg(env, ®s[meta.release_regno], false, !!meta.dynptr.id); if (err) return err; } diff --git a/tools/testing/selftests/bpf/prog_tests/cb_refs.c b/tools/testing/selftests/bpf/prog_tests/cb_refs.c index 6300b67a3a84..78566b817fd7 100644 --- a/tools/testing/selftests/bpf/prog_tests/cb_refs.c +++ b/tools/testing/selftests/bpf/prog_tests/cb_refs.c @@ -11,7 +11,7 @@ struct { const char *prog_name; const char *err_msg; } cb_refs_tests[] = { - { "underflow_prog", "must point to scalar, or struct with scalar" }, + { "underflow_prog", "release kfunc bpf_kfunc_call_test_release expects referenced PTR_TO_BTF_ID passed to R1" }, { "leak_prog", "Possibly NULL pointer passed to helper R2" }, { "nested_cb", "Unreleased reference id=4 alloc_insn=2" }, /* alloc_insn=2{4,5} */ { "non_cb_transfer_ref", "Unreleased reference id=4 alloc_insn=1" }, /* alloc_insn=1{1,2} */ diff --git a/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c b/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c index a875ba8e5007..d0d65d6d450c 100644 --- a/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c +++ b/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c @@ -154,7 +154,7 @@ int BPF_PROG(cgrp_kfunc_xchg_unreleased, struct cgroup *cgrp, const char *path) } SEC("tp_btf/cgroup_mkdir") -__failure __msg("must be referenced or trusted") +__failure __msg("release kfunc bpf_cgroup_release expects referenced PTR_TO_BTF_ID passed to R1") int BPF_PROG(cgrp_kfunc_rcu_get_release, struct cgroup *cgrp, const char *path) { struct cgroup *kptr; @@ -191,7 +191,7 @@ int BPF_PROG(cgrp_kfunc_release_untrusted, struct cgroup *cgrp, const char *path } SEC("tp_btf/cgroup_mkdir") -__failure __msg("R1 pointer type STRUCT cgroup must point") +__failure __msg("release kfunc bpf_cgroup_release expects referenced PTR_TO_BTF_ID passed to R1") int BPF_PROG(cgrp_kfunc_release_fp, struct cgroup *cgrp, const char *path) { struct cgroup *acquired = (struct cgroup *)&path; @@ -237,7 +237,7 @@ int BPF_PROG(cgrp_kfunc_release_null, struct cgroup *cgrp, const char *path) } SEC("tp_btf/cgroup_mkdir") -__failure __msg("release kernel function bpf_cgroup_release expects") +__failure __msg("release kfunc bpf_cgroup_release expects referenced PTR_TO_BTF_ID passed to R1") int BPF_PROG(cgrp_kfunc_release_unacquired, struct cgroup *cgrp, const char *path) { /* Cannot release trusted cgroup pointer which was not acquired. */ diff --git a/tools/testing/selftests/bpf/progs/map_kptr_fail.c b/tools/testing/selftests/bpf/progs/map_kptr_fail.c index 8f36e74fd8f9..f11848dfa78f 100644 --- a/tools/testing/selftests/bpf/progs/map_kptr_fail.c +++ b/tools/testing/selftests/bpf/progs/map_kptr_fail.c @@ -252,7 +252,7 @@ int reject_untrusted_store_to_ref(struct __sk_buff *ctx) } SEC("?tc") -__failure __msg("R2 must be referenced") +__failure __msg("release helper bpf_kptr_xchg expects referenced PTR_TO_BTF_ID passed to R2") int reject_untrusted_xchg(struct __sk_buff *ctx) { struct prog_test_ref_kfunc *p; diff --git a/tools/testing/selftests/bpf/progs/task_kfunc_failure.c b/tools/testing/selftests/bpf/progs/task_kfunc_failure.c index 41047d81ec42..8e947d445f8e 100644 --- a/tools/testing/selftests/bpf/progs/task_kfunc_failure.c +++ b/tools/testing/selftests/bpf/progs/task_kfunc_failure.c @@ -178,7 +178,7 @@ int BPF_PROG(task_kfunc_release_untrusted, struct task_struct *task, u64 clone_f } SEC("tp_btf/task_newtask") -__failure __msg("R1 pointer type STRUCT task_struct must point") +__failure __msg("release kfunc bpf_task_release expects referenced PTR_TO_BTF_ID passed to R1") int BPF_PROG(task_kfunc_release_fp, struct task_struct *task, u64 clone_flags) { struct task_struct *acquired = (struct task_struct *)&clone_flags; @@ -224,7 +224,7 @@ int BPF_PROG(task_kfunc_release_null, struct task_struct *task, u64 clone_flags) } SEC("tp_btf/task_newtask") -__failure __msg("release kernel function bpf_task_release expects") +__failure __msg("release kfunc bpf_task_release expects referenced PTR_TO_BTF_ID passed to R1") int BPF_PROG(task_kfunc_release_unacquired, struct task_struct *task, u64 clone_flags) { /* Cannot release trusted task pointer which was not acquired. */ @@ -313,7 +313,7 @@ int BPF_PROG(task_access_comm4, struct task_struct *task, const char *buf, bool } SEC("tp_btf/task_newtask") -__failure __msg("R1 must be referenced or trusted") +__failure __msg("release kfunc bpf_task_release expects referenced PTR_TO_BTF_ID passed to R1") int BPF_PROG(task_kfunc_release_in_map, struct task_struct *task, u64 clone_flags) { struct task_struct *local; diff --git a/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c b/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c index e7dae0cf9c17..ea273e152209 100644 --- a/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c +++ b/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c @@ -153,7 +153,7 @@ __weak int subprog_trusted_destroy(struct task_struct *task __arg_trusted) SEC("?tp_btf/task_newtask") __failure __log_level(2) -__msg("release kernel function bpf_task_release expects refcounted PTR_TO_BTF_ID") +__msg("release kfunc bpf_task_release expects referenced PTR_TO_BTF_ID passed to R1") int BPF_PROG(trusted_destroy_fail, struct task_struct *task, u64 clone_flags) { return subprog_trusted_destroy(task); diff --git a/tools/testing/selftests/bpf/progs/verifier_ref_tracking.c b/tools/testing/selftests/bpf/progs/verifier_ref_tracking.c index 139f70bb3595..199ad18f8eb5 100644 --- a/tools/testing/selftests/bpf/progs/verifier_ref_tracking.c +++ b/tools/testing/selftests/bpf/progs/verifier_ref_tracking.c @@ -1288,7 +1288,7 @@ l1_%=: r1 = r6; \ SEC("tc") __description("reference tracking: bpf_sk_release(listen_sk)") -__failure __msg("R1 must be referenced when passed to release function") +__failure __msg("release helper bpf_sk_release expects referenced PTR_TO_BTF_ID passed to R1") __naked void bpf_sk_release_listen_sk(void) { asm volatile ( diff --git a/tools/testing/selftests/bpf/progs/verifier_sock.c b/tools/testing/selftests/bpf/progs/verifier_sock.c index a2132c72d3b8..9f680cf44512 100644 --- a/tools/testing/selftests/bpf/progs/verifier_sock.c +++ b/tools/testing/selftests/bpf/progs/verifier_sock.c @@ -603,7 +603,7 @@ l2_%=: r0 = *(u32*)(r0 + %[bpf_tcp_sock_snd_cwnd]); \ SEC("tc") __description("bpf_sk_release(skb->sk)") -__failure __msg("R1 must be referenced when passed to release function") +__failure __msg("release helper bpf_sk_release expects referenced PTR_TO_BTF_ID passed to R1") __naked void bpf_sk_release_skb_sk(void) { asm volatile (" \ @@ -620,7 +620,7 @@ l0_%=: r0 = 0; \ SEC("tc") __description("bpf_sk_release(bpf_sk_fullsock(skb->sk))") -__failure __msg("R1 must be referenced when passed to release function") +__failure __msg("release helper bpf_sk_release expects referenced PTR_TO_BTF_ID passed to R1") __naked void bpf_sk_fullsock_skb_sk(void) { asm volatile (" \ @@ -644,7 +644,7 @@ l1_%=: r1 = r0; \ SEC("tc") __description("bpf_sk_release(bpf_tcp_sock(skb->sk))") -__failure __msg("R1 must be referenced when passed to release function") +__failure __msg("release helper bpf_sk_release expects referenced PTR_TO_BTF_ID passed to R1") __naked void bpf_tcp_sock_skb_sk(void) { asm volatile (" \ diff --git a/tools/testing/selftests/bpf/progs/verifier_vfs_reject.c b/tools/testing/selftests/bpf/progs/verifier_vfs_reject.c index 0990de076844..2870738d93f7 100644 --- a/tools/testing/selftests/bpf/progs/verifier_vfs_reject.c +++ b/tools/testing/selftests/bpf/progs/verifier_vfs_reject.c @@ -80,7 +80,7 @@ int BPF_PROG(get_task_exe_file_kfunc_unreleased) } SEC("lsm.s/file_open") -__failure __msg("release kernel function bpf_put_file expects") +__failure __msg("release kfunc bpf_put_file expects referenced PTR_TO_BTF_ID passed to R1") int BPF_PROG(put_file_kfunc_unacquired, struct file *file) { /* Can't release an unacquired pointer. */ diff --git a/tools/testing/selftests/bpf/progs/wakeup_source_fail.c b/tools/testing/selftests/bpf/progs/wakeup_source_fail.c index b8bbb61d4d4e..d4d0f1610853 100644 --- a/tools/testing/selftests/bpf/progs/wakeup_source_fail.c +++ b/tools/testing/selftests/bpf/progs/wakeup_source_fail.c @@ -42,7 +42,7 @@ int wakeup_source_access_lock_fields(void *ctx) } SEC("syscall") -__failure __msg("type=scalar expected=fp") +__failure __msg("release kfunc bpf_wakeup_sources_read_unlock expects referenced PTR_TO_BTF_ID passed to R1") int wakeup_source_unlock_no_lock(void *ctx) { struct bpf_ws_lock *lock = (void *)0x1; -- cgit v1.2.3 From 2eee6fe8ac2cd21e931c009d475e5a8407d42d76 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 28 May 2026 18:49:32 -0700 Subject: bpf: Fix dynptr ref counting to scan all call frames When checking whether a referenced dynptr can be overwritten, destroy_if_dynptr_stack_slot only counted sibling dynptrs in the current call frame. If a clone sharing the same virtual ref parent existed in a different frame (e.g., passed to a subprog), it would not be counted, causing the verifier to incorrectly reject the overwrite with "cannot overwrite referenced dynptr". Fix by extracting the counting into dynptr_ref_cnt() which uses bpf_for_each_reg_in_vstate_mask() to scan dynptr stack slots across all call frames. Fixes: 017f5c4ef73c ("bpf: Allow overwriting referenced dynptr when refcnt > 1") Reported-by: Eduard Zingerman Acked-by: Eduard Zingerman Signed-off-by: Amery Hung Link: https://lore.kernel.org/r/20260529014936.2811085-10-ameryhung@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 52 ++++++++++++++++++++++++++++----------------------- 1 file changed, 29 insertions(+), 23 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index caa455fad877..5d8f2656dbfd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -786,10 +786,29 @@ static void mark_reg_invalid(const struct bpf_verifier_env *env, struct bpf_reg_ __mark_reg_unknown(env, reg); } +static int dynptr_ref_cnt(struct bpf_verifier_env *env, int v_parent_id) +{ + struct bpf_stack_state *stack; + struct bpf_func_state *state; + struct bpf_reg_state *reg; + int ref_cnt = 0; + + bpf_for_each_reg_in_vstate_mask(env->cur_state, state, reg, stack, 1 << STACK_DYNPTR, ({ + if (!stack || stack->slot_type[0] != STACK_DYNPTR) + continue; + if (!stack->spilled_ptr.dynptr.first_slot) + continue; + if (stack->spilled_ptr.parent_id == v_parent_id) + ref_cnt++; + })); + + return ref_cnt; +} + static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env, struct bpf_func_state *state, int spi) { - int i, err = 0; + int err = 0; /* We always ensure that STACK_DYNPTR is never set partially, * hence just checking for slot_type[0] is enough. This is @@ -803,28 +822,15 @@ static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env, if (!state->stack[spi].spilled_ptr.dynptr.first_slot) spi = spi + 1; - if (dynptr_type_referenced(state->stack[spi].spilled_ptr.dynptr.type)) { - int v_parent_id = state->stack[spi].spilled_ptr.parent_id; - int ref_cnt = 0; - - /* - * A referenced dynptr can be overwritten only if there is at - * least one other dynptr sharing the same virtual ref parent, - * ensuring the reference can still be properly released. - */ - for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { - if (state->stack[i].slot_type[0] != STACK_DYNPTR) - continue; - if (!state->stack[i].spilled_ptr.dynptr.first_slot) - continue; - if (state->stack[i].spilled_ptr.parent_id == v_parent_id) - ref_cnt++; - } - - if (ref_cnt <= 1) { - verbose(env, "cannot overwrite referenced dynptr\n"); - return -EINVAL; - } + /* + * A referenced dynptr can be overwritten only if there is at + * least one other dynptr sharing the same virtual ref parent, + * ensuring the reference can still be properly released. + */ + if (dynptr_type_referenced(state->stack[spi].spilled_ptr.dynptr.type) && + dynptr_ref_cnt(env, state->stack[spi].spilled_ptr.parent_id) <= 1) { + verbose(env, "cannot overwrite referenced dynptr\n"); + return -EINVAL; } /* Invalidate the dynptr and any derived slices */ -- cgit v1.2.3 From 9a3c3c49c333760c8944dadacbe114c1884546ef Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 1 Jun 2026 17:02:42 +0200 Subject: bpf: Reject exclusive maps as inner maps in map-in-map An exclusive map (created with excl_prog_hash) is bound to a single program by hash: check_map_prog_compatibility() refuses to load any program whose digest does not match map->excl_prog_sha. That check only runs for maps a program references directly, i.e. its used_maps. A map reached at runtime through a map-of-maps is never in used_maps, and bpf_map_meta_equal() does not consider excl_prog_sha, so an exclusive map can be inserted into a non-exclusive outer map and then looked up and mutated by an unrelated program, bypassing the exclusivity guarantee. For the signed loader this defeats the metadata map exclusivity check added in the signed loader: the cached map->sha[] is validated against the signed hash while another program on a hostile host rewrites the frozen map's contents through the outer map. Fixes: baefdbdf6812 ("bpf: Implement exclusive map creation") Reported-by: sashiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/r/20260601150248.394863-2-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- kernel/bpf/map_in_map.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 645bd30bc9a9..d2cbab4bdf64 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -20,7 +20,8 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) /* Does not support >1 level map-in-map */ if (inner_map->inner_map_meta) return ERR_PTR(-EINVAL); - + if (inner_map->excl_prog_sha) + return ERR_PTR(-ENOTSUPP); if (!inner_map->ops->map_meta_equal) return ERR_PTR(-ENOTSUPP); @@ -101,6 +102,8 @@ void *bpf_map_fd_get_ptr(struct bpf_map *map, inner_map = __bpf_map_get(f); if (IS_ERR(inner_map)) return inner_map; + if (inner_map->excl_prog_sha) + return ERR_PTR(-ENOTSUPP); inner_map_meta = map->inner_map_meta; if (inner_map_meta->ops->map_meta_equal(inner_map_meta, inner_map)) -- cgit v1.2.3 From c48c3a7e7d5bed644208ed443d63bb6a6f411676 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 1 Jun 2026 17:02:43 +0200 Subject: bpf: Drop redundant hash_buf from map_get_hash operation bpf_map_get_info_by_fd() is the only caller of the ->map_get_hash and always invokes it with hash_buf == map->sha and hash_buf_size of SHA256_DIGEST_SIZE. array_map_get_hash() in turn lets sha256() write the digest directly into that buffer (map->sha) and then performs a trailing memcpy(), which evaluates to memcpy(map->sha, map->sha, 32): a redundant self-copy. The hash_buf_size argument was never used at all. Simplify this a bit, no functional change. Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/r/20260601150248.394863-3-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 +- kernel/bpf/arraymap.c | 6 ++---- kernel/bpf/syscall.c | 8 +++----- 3 files changed, 6 insertions(+), 10 deletions(-) (limited to 'kernel/bpf') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d1a17c118316..c0510d223685 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -111,7 +111,7 @@ struct bpf_map_ops { long (*map_pop_elem)(struct bpf_map *map, void *value); long (*map_peek_elem)(struct bpf_map *map, void *value); void *(*map_lookup_percpu_elem)(struct bpf_map *map, void *key, u32 cpu); - int (*map_get_hash)(struct bpf_map *map, u32 hash_buf_size, void *hash_buf); + int (*map_get_hash)(struct bpf_map *map); /* funcs called by prog_array and perf_event_array map */ void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file, diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index dfb2110ab733..e6271a2bf6d6 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -175,14 +175,12 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key) return array->value + (u64)array->elem_size * (index & array->index_mask); } -static int array_map_get_hash(struct bpf_map *map, u32 hash_buf_size, - void *hash_buf) +static int array_map_get_hash(struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); sha256(array->value, (u64)array->elem_size * array->map.max_entries, - hash_buf); - memcpy(array->map.sha, hash_buf, sizeof(array->map.sha)); + array->map.sha); return 0; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2aafd2131983..a27fa2b9b405 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -5434,18 +5434,16 @@ static int bpf_map_get_info_by_fd(struct file *file, if (!map->ops->map_get_hash) return -EINVAL; - - if (info.hash_size != SHA256_DIGEST_SIZE) + if (info.hash_size != sizeof(map->sha)) return -EINVAL; - if (!READ_ONCE(map->frozen)) return -EPERM; - err = map->ops->map_get_hash(map, SHA256_DIGEST_SIZE, map->sha); + err = map->ops->map_get_hash(map); if (err != 0) return err; - if (copy_to_user(uhash, map->sha, SHA256_DIGEST_SIZE) != 0) + if (copy_to_user(uhash, map->sha, sizeof(map->sha)) != 0) return -EFAULT; } else if (info.hash_size) { return -EINVAL; -- cgit v1.2.3 From 0fb6c9ed6493b4af01be8bb0a384574eba7df636 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Mon, 1 Jun 2026 17:02:44 +0200 Subject: libbpf: Reject non-exclusive metadata maps in the signed loader The loader verifies map->sha against the metadata hash in its instructions. map->sha is calculated when BPF_OBJ_GET_INFO_BY_FD is called on the frozen map. While the map is frozen, the /signed loader/ must also ensure the map is exclusive, as, without exclusivity (which a hostile host could just omit when loading the loader), another BPF program with map access can mutate the contents afterwards, so the check passes on stale data. With the extra check as part of the signed loader, it now refuses to move on with map->sha validation if the host set it up wrongly. Fixes: fb2b0e290147 ("libbpf: Update light skeleton for signing") Signed-off-by: KP Singh Co-developed-by: Daniel Borkmann Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/r/20260601150248.394863-4-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + kernel/bpf/syscall.c | 7 +++++++ tools/lib/bpf/gen_loader.c | 17 +++++++++++++++++ 3 files changed, 25 insertions(+) (limited to 'kernel/bpf') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c0510d223685..8599b451dd7a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -296,6 +296,7 @@ struct bpf_map_owner { struct bpf_map { u8 sha[SHA256_DIGEST_SIZE]; + u32 excl; const struct bpf_map_ops *ops; struct bpf_map *inner_map_meta; #ifdef CONFIG_SECURITY diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a27fa2b9b405..625a4366fe6d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1588,6 +1588,13 @@ static int map_create_alloc(union bpf_attr *attr, bpfptr_t uattr, struct bpf_ver err = -EFAULT; goto free_map; } + + /* See libbpf: emit_signature_match() */ + BUILD_BUG_ON(offsetof(struct bpf_map, excl) != SHA256_DIGEST_SIZE); + BUILD_BUG_ON(!__same_type(map->excl, u32)); + BUILD_BUG_ON(offsetof(struct bpf_map, sha) != 0); + BUILD_BUG_ON(!__same_type(map->sha, u8[SHA256_DIGEST_SIZE])); + map->excl = 1; } else if (attr->excl_prog_hash_size) { bpf_log(log, "Invalid excl_prog_hash_size.\n"); err = -EINVAL; diff --git a/tools/lib/bpf/gen_loader.c b/tools/lib/bpf/gen_loader.c index 3702c5944bc0..66a02039da8c 100644 --- a/tools/lib/bpf/gen_loader.c +++ b/tools/lib/bpf/gen_loader.c @@ -586,6 +586,23 @@ static void emit_signature_match(struct bpf_gen *gen) __s64 off; int i; + /* + * Reject if the metadata map is not exclusive. Without exclusivity + * the cached map->sha[] verified above can be stale: another BPF + * program with map access could have mutated the contents between + * BPF_OBJ_GET_INFO_BY_FD and loader execution. + */ + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_IDX, + 0, 0, 0, 0)); + emit(gen, BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, SHA256_DIGEST_LENGTH)); + off = -(gen->insn_cur - gen->insn_start - gen->cleanup_label) / 8 - 2; + if (is_simm16(off)) { + emit(gen, BPF_MOV64_IMM(BPF_REG_7, -EINVAL)); + emit(gen, BPF_JMP_IMM(BPF_JNE, BPF_REG_2, 1, off)); + } else { + gen->error = -ERANGE; + } + for (i = 0; i < SHA256_DWORD_SIZE; i++) { emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_IDX, 0, 0, 0, 0)); -- cgit v1.2.3 From 5ab4bc67d818ba27388d64cb9c52cb0c3bdac254 Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Mon, 1 Jun 2026 20:41:16 -0400 Subject: verifier: parse BTF type tags for function arguments The BTF parsing logic for function arguments goes through the arguments' decl tags, but does not go into their type tags. Add type tag parsing for function arguments. Acked-by: Eduard Zingerman Signed-off-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260602004120.17087-3-emil@etsalapatis.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 120 +++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 85 insertions(+), 35 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index f429f6f58cb2..6fb3461e3ac2 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -7802,6 +7802,84 @@ enum btf_arg_tag { ARG_TAG_ARENA = BIT_ULL(5), }; +static int btf_scan_decl_tags(struct bpf_verifier_env *env, + const struct btf *btf, + const struct btf_type *fn_t, + u32 arg_idx, bool is_global, u32 *tags) +{ + int id = btf_named_start_id(btf, false) - 1; + + /* + * The 'arg:' decl_tag takes precedence over the derivation + * of the register type from the BTF type itself. + */ + while ((id = btf_find_next_decl_tag(btf, fn_t, arg_idx, "arg:", id)) > 0) { + const struct btf_type *tag_t = btf_type_by_id(btf, id); + const char *tag = __btf_name_by_offset(btf, tag_t->name_off) + 4; + + /* disallow arg tags in static subprogs */ + if (!is_global) { + bpf_log(&env->log, + "arg#%d type tag is not supported in static functions\n", + arg_idx); + return -EOPNOTSUPP; + } + + if (strcmp(tag, "ctx") == 0) { + *tags |= ARG_TAG_CTX; + } else if (strcmp(tag, "trusted") == 0) { + *tags |= ARG_TAG_TRUSTED; + } else if (strcmp(tag, "untrusted") == 0) { + *tags |= ARG_TAG_UNTRUSTED; + } else if (strcmp(tag, "nonnull") == 0) { + *tags |= ARG_TAG_NONNULL; + } else if (strcmp(tag, "nullable") == 0) { + *tags |= ARG_TAG_NULLABLE; + } else if (strcmp(tag, "arena") == 0) { + *tags |= ARG_TAG_ARENA; + } else { + bpf_log(&env->log, "arg#%d has unsupported set of tags\n", arg_idx); + return -EOPNOTSUPP; + } + } + if (id != -ENOENT) { + bpf_log(&env->log, "arg#%d type tag fetching failure: %d\n", arg_idx, id); + return id; + } + + return 0; +} + +static int btf_scan_type_tags(struct bpf_verifier_env *env, + const struct btf *btf, u32 type_id, + u32 *tags) +{ + const struct btf_type *t; + + /* Find the first pointer type in the chain. */ + t = btf_type_skip_modifiers(btf, type_id, NULL); + if (!t || !btf_type_is_ptr(t)) + return 0; + + /* We got a pointer, get all associated type tags. */ + t = btf_type_by_id(btf, t->type); + while (t && btf_type_is_type_tag(t)) { + const char *tag = __btf_name_by_offset(btf, t->name_off); + + if (strcmp(tag, "arena") == 0) { + *tags |= ARG_TAG_ARENA; + } else { + bpf_log(&env->log, "function signature member has unsupported type tag '%s'\n", + tag); + return -EOPNOTSUPP; + } + + t = btf_type_by_id(btf, t->type); + } + + return 0; +} + /* Process BTF of a function to produce high-level expectation of function * arguments (like ARG_PTR_TO_CTX, or ARG_PTR_TO_MEM, etc). This information * is cached in subprog info for reuse. @@ -7820,6 +7898,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) struct btf *btf = prog->aux->btf; const struct btf_param *args; const struct btf_type *t, *ref_t, *fn_t; + int err; u32 i, nargs, btf_id; const char *tname; @@ -7903,42 +7982,13 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) */ for (i = 0; i < nargs; i++) { u32 tags = 0; - int id = btf_named_start_id(btf, false) - 1; - - /* 'arg:' decl_tag takes precedence over derivation of - * register type from BTF type itself - */ - while ((id = btf_find_next_decl_tag(btf, fn_t, i, "arg:", id)) > 0) { - const struct btf_type *tag_t = btf_type_by_id(btf, id); - const char *tag = __btf_name_by_offset(btf, tag_t->name_off) + 4; - - /* disallow arg tags in static subprogs */ - if (!is_global) { - bpf_log(log, "arg#%d type tag is not supported in static functions\n", i); - return -EOPNOTSUPP; - } + err = btf_scan_decl_tags(env, btf, fn_t, i, is_global, &tags); + if (err) + return err; - if (strcmp(tag, "ctx") == 0) { - tags |= ARG_TAG_CTX; - } else if (strcmp(tag, "trusted") == 0) { - tags |= ARG_TAG_TRUSTED; - } else if (strcmp(tag, "untrusted") == 0) { - tags |= ARG_TAG_UNTRUSTED; - } else if (strcmp(tag, "nonnull") == 0) { - tags |= ARG_TAG_NONNULL; - } else if (strcmp(tag, "nullable") == 0) { - tags |= ARG_TAG_NULLABLE; - } else if (strcmp(tag, "arena") == 0) { - tags |= ARG_TAG_ARENA; - } else { - bpf_log(log, "arg#%d has unsupported set of tags\n", i); - return -EOPNOTSUPP; - } - } - if (id != -ENOENT) { - bpf_log(log, "arg#%d type tag fetching failure: %d\n", i, id); - return id; - } + err = btf_scan_type_tags(env, btf, args[i].type, &tags); + if (err) + return err; t = btf_type_by_id(btf, args[i].type); while (btf_type_is_modifier(t)) -- cgit v1.2.3 From 3e924e9272c80939677aa6902aced311c85fe48c Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Mon, 1 Jun 2026 20:41:17 -0400 Subject: bpf: Allow subprogs to return arena pointers BPF subprogs currently only return void or scalar values. However, it is also safe to return arena pointers between subprogs in the same BPF program: Arena pointers are guaranteed to be safe for both programs at any point. Expand the verifier to permit returning an arena pointer to the caller. The main subprog is still not allowed to return an arena pointer because arena pointers are internal to the BPF program, and the return values permitted for each main subprog depend on the program type anyway. Acked-by: Eduard Zingerman Signed-off-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260602004120.17087-4-emil@etsalapatis.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 66 ++++++++++++++++++++++++++++++++++++++------------- kernel/bpf/verifier.c | 4 ++++ 2 files changed, 54 insertions(+), 16 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 6fb3461e3ac2..68921d9172b5 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -7858,12 +7858,22 @@ static int btf_scan_type_tags(struct bpf_verifier_env *env, /* Find the first pointer type in the chain. */ t = btf_type_skip_modifiers(btf, type_id, NULL); + + /* + * We currently reject type tags on non-pointer types, + * which neither LLVM nor GCC support anyway. + */ if (!t || !btf_type_is_ptr(t)) return 0; /* We got a pointer, get all associated type tags. */ - t = btf_type_by_id(btf, t->type); - while (t && btf_type_is_type_tag(t)) { + for (t = btf_type_by_id(btf, t->type); t && btf_type_is_modifier(t); + t = btf_type_by_id(btf, t->type)) { + + /* Skip non-type tag modifiers. */ + if (!btf_type_is_type_tag(t)) + continue; + const char *tag = __btf_name_by_offset(btf, t->name_off); if (strcmp(tag, "arena") == 0) { @@ -7873,13 +7883,39 @@ static int btf_scan_type_tags(struct bpf_verifier_env *env, tag); return -EOPNOTSUPP; } - - t = btf_type_by_id(btf, t->type); } return 0; } +/* Check whether the type is a valid return type. */ +static int btf_validate_return_type(struct bpf_verifier_env *env, struct btf *btf, + const struct btf_type *t, int subprog) +{ + u32 tags = 0; + int err; + + err = btf_scan_type_tags(env, btf, t->type, &tags); + if (err) + return err; + + t = btf_type_skip_modifiers(btf, t->type, NULL); + + /* + * We allow all subprogs except for the main one to return any kind of arena pointer. + * General arena variables are not allowed, since it makes no sense to return by value + * a variable that's on the heap in the first place. + */ + if (subprog && (tags & ARG_TAG_ARENA) && btf_type_is_ptr(t)) + return 0; + + /* We always accept void or scalars. */ + if (btf_type_is_void(t) || btf_type_is_int(t) || btf_is_any_enum(t)) + return 0; + + return -EOPNOTSUPP; +} + /* Process BTF of a function to produce high-level expectation of function * arguments (like ARG_PTR_TO_CTX, or ARG_PTR_TO_MEM, etc). This information * is cached in subprog info for reuse. @@ -7963,18 +7999,16 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) tname, nargs, MAX_BPF_FUNC_REG_ARGS); return -EINVAL; } - /* check that function is void or returns int, exception cb also requires this */ - t = btf_type_by_id(btf, t->type); - while (btf_type_is_modifier(t)) - t = btf_type_by_id(btf, t->type); - if (!btf_type_is_void(t) && !btf_type_is_int(t) && !btf_is_any_enum(t)) { - if (!is_global) - return -EINVAL; - bpf_log(log, - "Global function %s() return value not void or scalar. " - "Only those are supported.\n", - tname); - return -EINVAL; + + err = btf_validate_return_type(env, btf, t, subprog); + if (err) { + if (is_global) { + bpf_log(log, + "Global function %s() return value not void or scalar. " + "Only those are supported.\n", + tname); + } + return err; } /* Convert BTF function arguments into verifier types. diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5d8f2656dbfd..8ed484cb1a8a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -16503,6 +16503,10 @@ static int check_global_subprog_return_code(struct bpf_verifier_env *env) if (err) return err; + /* Pointers to arena are safe to pass between subprograms. */ + if (is_arena_reg(env, BPF_REG_0)) + return 0; + if (is_pointer_value(env, BPF_REG_0)) { verbose(env, "R%d leaks addr as return value\n", BPF_REG_0); return -EACCES; -- cgit v1.2.3 From b93c55b4932dd7e32dca8cf34a3443cc87a02906 Mon Sep 17 00:00:00 2001 From: Deepanshu Kartikey Date: Tue, 2 Jun 2026 08:22:49 +0530 Subject: bpf: fix UAF by restoring RCU-delayed inode freeing in bpffs commit 4f375ade6aa9 ("bpf: Avoid RCU context warning when unpinning htab with internal structs") moved inode cleanup from ->free_inode() into ->destroy_inode() to avoid sleeping in RCU context when calling bpf_any_put(). However this removed the RCU delay on freeing the inode itself and the cached symlink body (i_link), both of which can be accessed by RCU pathwalk (pick_link, may_lookup etc.). This causes a use-after-free when a concurrent unlinkat() drops the last inode reference and destroy_inode() frees the inode immediately, while another task is still walking the path in RCU mode and reads inode->i_opflags (offset +2) inside current_time() -> is_mgtime(). KASAN reports: BUG: KASAN: slab-use-after-free in is_mgtime include/linux/fs.h:2313 Read of size 2 at addr ffff8880407e4282 (offset +2 = i_opflags) The rules (per Al Viro): ->destroy_inode() called immediately, can sleep, use for blocking cleanup e.g. bpf_any_put() ->free_inode() called after RCU grace period, use for freeing inode and anything RCU-accessible e.g. i_link Fix: split the two concerns properly: - keep bpf_any_put() in bpf_destroy_inode() since it is blocking and needs to run promptly - introduce bpf_free_inode() to handle kfree(i_link) and free_inode_nonrcu() with proper RCU delay, preventing the UAF Fixes: 4f375ade6aa9 ("bpf: Avoid RCU context warning when unpinning htab with internal structs") Reported-by: syzbot+36e50496c8ac4bcde3f9@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=36e50496c8ac4bcde3f9 Suggested-by: Al Viro Link: https://lore.kernel.org/all/20260423043906.GN3518998@ZenIV/ Link: https://lore.kernel.org/all/20260602002607.110866-1-kartikey406@gmail.com/T/ [v1] Signed-off-by: Deepanshu Kartikey Acked-by: Al Viro Link: https://lore.kernel.org/r/20260602025249.113828-1-kartikey406@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/inode.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 25c06a011825..188c774a469c 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -766,10 +766,18 @@ static void bpf_destroy_inode(struct inode *inode) { enum bpf_type type; - if (S_ISLNK(inode->i_mode)) - kfree(inode->i_link); if (!bpf_inode_type(inode, &type)) bpf_any_put(inode->i_private, type); +} + +/* + * Called after RCU grace period - safe to free inode and anything + * that might be accessed by RCU pathwalk (inode fields, i_link). + */ +static void bpf_free_inode(struct inode *inode) +{ + if (S_ISLNK(inode->i_mode)) + kfree(inode->i_link); free_inode_nonrcu(inode); } @@ -778,6 +786,7 @@ const struct super_operations bpf_super_ops = { .drop_inode = inode_just_drop, .show_options = bpf_show_options, .destroy_inode = bpf_destroy_inode, + .free_inode = bpf_free_inode, }; enum { -- cgit v1.2.3 From 3c56ee343f9412d81918635c3e25e22a5dd6d87e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 2 Jun 2026 15:30:49 +0200 Subject: bpf: Reject exclusive maps for bpf_map_elem iterators Exclusive maps (aka excl_prog_hash) are meant to be reachable only from the single program whose hash matches. This is enforced by check_map_prog_compatibility() when the map is referenced from a program such as signed BPF loaders. A bpf_map_elem iterator, however, binds its target map at attach time in bpf_iter_attach_map() instead of referencing it from the program, so the exclusivity check is never reached. On top of that, the iterator exposes the map value as a writable buffer. Fixes: baefdbdf6812 ("bpf: Implement exclusive map creation") Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/r/20260602133052.423725-2-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- kernel/bpf/map_iter.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index 261a03ea73d3..ae0741a09c6d 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -112,6 +112,10 @@ static int bpf_iter_attach_map(struct bpf_prog *prog, map = bpf_map_get_with_uref(linfo->map.map_fd); if (IS_ERR(map)) return PTR_ERR(map); + if (map->excl_prog_sha) { + err = -EPERM; + goto put_map; + } if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || -- cgit v1.2.3 From fbd6dc50d9aedc594ec3196211a190170a275ab6 Mon Sep 17 00:00:00 2001 From: Matt Bobrowski Date: Wed, 3 Jun 2026 20:18:22 +0000 Subject: bpf: clean up btf_scan_decl_tags() Refactor the newly introduced btf_scan_decl_tags() to improve readability and maintainability. The current implementation uses a manual if-else chain and a magic number offset to strip the "arg:" prefix from declaration tags. Replace the if-else logic with a table-driven approach using a static const array. This separates the tag data from the scanning logic, making the helper more extensible for future tags. Additionally, replace the magic number '4' with a sizeof-based calculation on the prefix string to ensure the offset remains synchronized with the search key. Finally, optimize the loop by moving the is_global check to the top of the block. This allows the verifier to fail-fast on static subprograms without performing unnecessary BTF string and type lookups. Signed-off-by: Matt Bobrowski Reviewed-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260603201822.770596-1-mattbobrowski@google.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 46 ++++++++++++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 16 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 68921d9172b5..55aa3ba1b1e0 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -7808,14 +7808,28 @@ static int btf_scan_decl_tags(struct bpf_verifier_env *env, u32 arg_idx, bool is_global, u32 *tags) { int id = btf_named_start_id(btf, false) - 1; + const char tag_key[] = "arg:"; + static const struct { + const char *tag_value; + enum btf_arg_tag arg_tag; + } tag_values[] = { + { "ctx", ARG_TAG_CTX }, + { "trusted", ARG_TAG_TRUSTED }, + { "untrusted", ARG_TAG_UNTRUSTED }, + { "nonnull", ARG_TAG_NONNULL }, + { "nullable", ARG_TAG_NULLABLE }, + { "arena", ARG_TAG_ARENA }, + }; /* * The 'arg:' decl_tag takes precedence over the derivation * of the register type from the BTF type itself. */ - while ((id = btf_find_next_decl_tag(btf, fn_t, arg_idx, "arg:", id)) > 0) { - const struct btf_type *tag_t = btf_type_by_id(btf, id); - const char *tag = __btf_name_by_offset(btf, tag_t->name_off) + 4; + while ((id = btf_find_next_decl_tag(btf, fn_t, arg_idx, tag_key, id)) > 0) { + const struct btf_type *tag_t; + const char *tag; + int i; + bool found; /* disallow arg tags in static subprogs */ if (!is_global) { @@ -7825,19 +7839,19 @@ static int btf_scan_decl_tags(struct bpf_verifier_env *env, return -EOPNOTSUPP; } - if (strcmp(tag, "ctx") == 0) { - *tags |= ARG_TAG_CTX; - } else if (strcmp(tag, "trusted") == 0) { - *tags |= ARG_TAG_TRUSTED; - } else if (strcmp(tag, "untrusted") == 0) { - *tags |= ARG_TAG_UNTRUSTED; - } else if (strcmp(tag, "nonnull") == 0) { - *tags |= ARG_TAG_NONNULL; - } else if (strcmp(tag, "nullable") == 0) { - *tags |= ARG_TAG_NULLABLE; - } else if (strcmp(tag, "arena") == 0) { - *tags |= ARG_TAG_ARENA; - } else { + tag_t = btf_type_by_id(btf, id); + tag = __btf_name_by_offset(btf, tag_t->name_off) + (sizeof(tag_key) - 1); + + found = false; + for (i = 0; i < ARRAY_SIZE(tag_values); ++i) { + if (!strcmp(tag, tag_values[i].tag_value)) { + *tags |= tag_values[i].arg_tag; + found = true; + break; + } + } + + if (!found) { bpf_log(&env->log, "arg#%d has unsupported set of tags\n", arg_idx); return -EOPNOTSUPP; } -- cgit v1.2.3 From 80b89d0226a05e8b67969de99c31b51fcd54f76a Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 28 May 2026 15:20:14 -0700 Subject: bpf: Take mmap_lock in zap_pages() zap_vma_range() requires the owning mm's mmap_lock to be held. Taking mmap_read_lock under arena->lock would AB-BA against arena_vm_close() and arena_map_mmap(), both of which run with mmap_write_lock held and then acquire arena->lock. Instead drop arena->lock, mmget_not_zero() the vma's mm, take mmap_read_lock, and re-resolve the vma via find_vma() since it may have been unmapped or replaced while waiting. Track processed vmls with a per-call generation in vml->zap_gen and serialize zap_pages() callers with a new arena->zap_mutex so concurrent callers on different uaddr ranges do not mark each other's vmls processed before the zap is done. Reported-by: David Hildenbrand Fixes: 317460317a02 ("bpf: Introduce bpf_arena.") Signed-off-by: Alexei Starovoitov Reviewed-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260528222014.38980-1-alexei.starovoitov@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/arena.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 57 insertions(+), 4 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 1727503b25d8..9b2dea229b38 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -60,6 +60,8 @@ struct bpf_arena { struct list_head vma_list; /* protects vma_list */ struct mutex lock; + u64 zap_gen; + struct mutex zap_mutex; struct irq_work free_irq; struct work_struct free_work; struct llist_head free_spans; @@ -289,6 +291,7 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr) if (err) goto err_free_scratch; mutex_init(&arena->lock); + mutex_init(&arena->zap_mutex); raw_res_spin_lock_init(&arena->spinlock); err = populate_pgtable_except_pte(arena); if (err) @@ -391,6 +394,7 @@ struct vma_list { struct vm_area_struct *vma; struct list_head head; refcount_t mmap_count; + u64 zap_gen; }; static int remember_vma(struct bpf_arena *arena, struct vm_area_struct *vma) @@ -403,6 +407,7 @@ static int remember_vma(struct bpf_arena *arena, struct vm_area_struct *vma) refcount_set(&vml->mmap_count, 1); vma->vm_private_data = vml; vml->vma = vma; + vml->zap_gen = 0; list_add(&vml->head, &arena->vma_list); return 0; } @@ -746,12 +751,60 @@ out_free_pages: */ static void zap_pages(struct bpf_arena *arena, long uaddr, long page_cnt) { + unsigned long size = (unsigned long)page_cnt << PAGE_SHIFT; + struct vm_area_struct *vma; + struct mm_struct *mm; struct vma_list *vml; + unsigned long vm_start; + u64 my_gen; - guard(mutex)(&arena->lock); - /* iterate link list under lock */ - list_for_each_entry(vml, &arena->vma_list, head) - zap_vma_range(vml->vma, uaddr, PAGE_SIZE * page_cnt); + /* + * Taking mmap_read_lock() under arena->lock would deadlock against + * arena_vm_close(), which runs with mmap_write_lock held and then + * acquires arena->lock. Drop arena->lock for mmap_read_lock(). + * + * Use per-call my_gen, recorded in vml->zap_gen, to remember which + * vmls this invocation has already processed across the lock drop. + * Hold zap_mutex around the whole walk so concurrent zap_pages() + * callers cannot overwrite each other's marks on shared vmls -- + * otherwise call B's mark would make call A skip a vml that A has + * not yet zapped for A's uaddr range. + */ + mutex_lock(&arena->zap_mutex); + mutex_lock(&arena->lock); + my_gen = ++arena->zap_gen; + for (;;) { + mm = NULL; + list_for_each_entry(vml, &arena->vma_list, head) { + if (vml->zap_gen >= my_gen) + continue; + vml->zap_gen = my_gen; + if (!mmget_not_zero(vml->vma->vm_mm)) + continue; + mm = vml->vma->vm_mm; + vm_start = vml->vma->vm_start; + break; + } + if (!mm) + break; + mutex_unlock(&arena->lock); + + mmap_read_lock(mm); + /* + * Re-resolve: while we waited the VMA could have been unmapped + * and a different mapping installed at the same address. + */ + vma = find_vma(mm, vm_start); + if (vma && vma->vm_start == vm_start && + vma->vm_file && vma->vm_file->private_data == &arena->map) + zap_vma_range(vma, uaddr, size); + mmap_read_unlock(mm); + mmput(mm); + + mutex_lock(&arena->lock); + } + mutex_unlock(&arena->lock); + mutex_unlock(&arena->zap_mutex); } static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, bool sleepable) -- cgit v1.2.3 From 16b4d3e2fb24aac3e68a8d86e3bc5e302e1b5cb7 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Fri, 5 Jun 2026 04:41:21 -0700 Subject: bpf: Implement resizable hashmap basic functions Use rhashtable_lookup_likely() for lookups, rhashtable_remove_fast() for deletes, and rhashtable_lookup_get_insert_fast() for inserts. Updates modify values in place under RCU rather than allocating a new element and swapping the pointer (as regular htab does). This trades read consistency for performance: concurrent readers may see partial updates. BPF_F_LOCK support and special-field handling (timers, kptrs, etc.) follow in a later commit. Initialize rhashtable with bpf_mem_alloc element cache. Require BPF_F_NO_PREALLOC. Limit max_entries to 2^31. Free elements via rhashtable_free_and_destroy(). Signed-off-by: Mykyta Yatsenko Link: https://lore.kernel.org/r/20260605-rhash-v7-4-5b8e05f8630d@meta.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_types.h | 1 + include/uapi/linux/bpf.h | 6 + kernel/bpf/hashtab.c | 311 +++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 3 + kernel/bpf/verifier.c | 1 + tools/include/uapi/linux/bpf.h | 6 + 6 files changed, 328 insertions(+) (limited to 'kernel/bpf') diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index b13de31e163f..56e4c3f983d3 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -134,6 +134,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_BLOOM_FILTER, bloom_filter_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_USER_RINGBUF, user_ringbuf_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_ARENA, arena_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_INSN_ARRAY, insn_array_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_RHASH, rhtab_map_ops) BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint) BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index aec171ccb6ef..bed9b1b4d5ef 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1047,6 +1047,7 @@ enum bpf_map_type { BPF_MAP_TYPE_CGRP_STORAGE, BPF_MAP_TYPE_ARENA, BPF_MAP_TYPE_INSN_ARRAY, + BPF_MAP_TYPE_RHASH, __MAX_BPF_MAP_TYPE }; @@ -1545,6 +1546,11 @@ union bpf_attr { * * BPF_MAP_TYPE_ARENA - contains the address where user space * is going to mmap() the arena. It has to be page aligned. + * + * BPF_MAP_TYPE_RHASH - initial table size hint + * (nelem_hint). 0 = use rhashtable default. Must be + * <= min(max_entries, U16_MAX). Upper 32 bits reserved, + * must be zero. */ __u64 map_extra; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 3dd9b4924ae4..10f3a058747b 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -2739,3 +2740,313 @@ const struct bpf_map_ops htab_of_maps_map_ops = { BATCH_OPS(htab), .map_btf_id = &htab_map_btf_ids[0], }; + +struct rhtab_elem { + struct rhash_head node; + /* key bytes, then value bytes follow */ + u8 data[] __aligned(8); +}; + +struct bpf_rhtab { + struct bpf_map map; + struct rhashtable ht; + struct bpf_mem_alloc ma; + u32 elem_size; +}; + +static const struct rhashtable_params rhtab_params = { + .head_offset = offsetof(struct rhtab_elem, node), + .key_offset = offsetof(struct rhtab_elem, data), +}; + +static inline void *rhtab_elem_value(struct rhtab_elem *l, u32 key_size) +{ + return l->data + round_up(key_size, 8); +} + +static struct bpf_map *rhtab_map_alloc(union bpf_attr *attr) +{ + struct rhashtable_params params; + struct bpf_rhtab *rhtab; + int err = 0; + + rhtab = bpf_map_area_alloc(sizeof(*rhtab), NUMA_NO_NODE); + if (!rhtab) + return ERR_PTR(-ENOMEM); + + bpf_map_init_from_attr(&rhtab->map, attr); + + if (rhtab->map.max_entries > 1UL << 31) { + err = -E2BIG; + goto free_rhtab; + } + + rhtab->elem_size = sizeof(struct rhtab_elem) + round_up(rhtab->map.key_size, 8) + + round_up(rhtab->map.value_size, 8); + + params = rhtab_params; + params.key_len = rhtab->map.key_size; + params.nelem_hint = (u32)attr->map_extra; + params.automatic_shrinking = true; + + err = rhashtable_init(&rhtab->ht, ¶ms); + if (err) + goto free_rhtab; + + /* Set max_elems after rhashtable_init() since init zeroes the struct */ + rhtab->ht.max_elems = rhtab->map.max_entries; + + err = bpf_mem_alloc_init(&rhtab->ma, rhtab->elem_size, false); + if (err) + goto destroy_rhtab; + + return &rhtab->map; + +destroy_rhtab: + rhashtable_destroy(&rhtab->ht); +free_rhtab: + bpf_map_area_free(rhtab); + return ERR_PTR(err); +} + +static int rhtab_map_alloc_check(union bpf_attr *attr) +{ + if (!(attr->map_flags & BPF_F_NO_PREALLOC)) + return -EINVAL; + + if (attr->map_flags & BPF_F_ZERO_SEED) + return -EINVAL; + + if (attr->key_size > U16_MAX) + return -E2BIG; + + if (attr->map_extra >> 32) + return -EINVAL; + + if ((u32)attr->map_extra > U16_MAX) + return -E2BIG; + + if ((u32)attr->map_extra > attr->max_entries) + return -EINVAL; + + return htab_map_alloc_check(attr); +} + +static void rhtab_free_elem(void *ptr, void *arg) +{ + struct bpf_rhtab *rhtab = arg; + struct rhtab_elem *elem = ptr; + + bpf_mem_cache_free_rcu(&rhtab->ma, elem); +} + +static void rhtab_map_free(struct bpf_map *map) +{ + struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map); + + rhashtable_free_and_destroy(&rhtab->ht, rhtab_free_elem, rhtab); + bpf_mem_alloc_destroy(&rhtab->ma); + bpf_map_area_free(rhtab); +} + +static void *rhtab_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map); + + /* Hold RCU lock in case sleepable program calls via gen_lookup */ + guard(rcu)(); + + return rhashtable_lookup_likely(&rhtab->ht, key, rhtab_params); +} + +static void *rhtab_map_lookup_elem(struct bpf_map *map, void *key) __must_hold(RCU) +{ + struct rhtab_elem *l; + + l = rhtab_lookup_elem(map, key); + return l ? rhtab_elem_value(l, map->key_size) : NULL; +} + +static void rhtab_read_elem_value(struct bpf_map *map, void *dst, struct rhtab_elem *elem, + u64 flags) +{ + void *src = rhtab_elem_value(elem, map->key_size); + + if (flags & BPF_F_LOCK) + copy_map_value_locked(map, dst, src, true); + else + copy_map_value(map, dst, src); +} + +static int rhtab_delete_elem(struct bpf_rhtab *rhtab, struct rhtab_elem *elem, void *copy, + u64 flags) +{ + int err; + + /* + * disable_instrumentation() mitigates the deadlock for programs running in NMI context. + * rhashtable locks bucket with local_irq_save(). Only NMI programs may reenter + * rhashtable code, bpf_disable_instrumentation() disables programs running in NMI, except + * raw tracepoints, which we don't have in rhashtable. + */ + bpf_disable_instrumentation(); + err = rhashtable_remove_fast(&rhtab->ht, &elem->node, rhtab_params); + bpf_enable_instrumentation(); + + if (err) + return err; + + if (copy) { + rhtab_read_elem_value(&rhtab->map, copy, elem, flags); + check_and_init_map_value(&rhtab->map, copy); + } + + bpf_mem_cache_free_rcu(&rhtab->ma, elem); + return 0; +} + + +static long rhtab_map_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map); + struct rhtab_elem *elem; + + guard(rcu)(); + + elem = rhtab_lookup_elem(map, key); + if (!elem) + return -ENOENT; + + return rhtab_delete_elem(rhtab, elem, NULL, 0); +} + +static int rhtab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, void *value, u64 flags) +{ + struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map); + struct rhtab_elem *elem; + int err; + + err = bpf_map_check_op_flags(map, flags, BPF_F_LOCK); + if (err) + return err; + + guard(rcu)(); + + elem = rhtab_lookup_elem(map, key); + if (!elem) + return -ENOENT; + + return rhtab_delete_elem(rhtab, elem, value, flags); +} + +static long rhtab_map_update_existing(struct bpf_map *map, struct rhtab_elem *elem, void *value, + u64 map_flags) +{ + void *old_val = rhtab_elem_value(elem, map->key_size); + + if (map_flags & BPF_NOEXIST) + return -EEXIST; + + if (map_flags & BPF_F_LOCK) + copy_map_value_locked(map, old_val, value, false); + else + copy_map_value(map, old_val, value); + return 0; +} + +static long rhtab_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) +{ + struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map); + struct rhtab_elem *elem, *tmp; + + if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST)) + return -EINVAL; + + if ((map_flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK)) + return -EINVAL; + + guard(rcu)(); + elem = rhtab_lookup_elem(map, key); + if (elem) + return rhtab_map_update_existing(map, elem, value, map_flags); + + if (map_flags & BPF_EXIST) + return -ENOENT; + + /* Check max_entries limit before inserting new element */ + if (atomic_read(&rhtab->ht.nelems) >= map->max_entries) + return -E2BIG; + + elem = bpf_mem_cache_alloc(&rhtab->ma); + if (!elem) + return -ENOMEM; + + memcpy(elem->data, key, map->key_size); + copy_map_value(map, rhtab_elem_value(elem, map->key_size), value); + + /* Prevent deadlock for NMI programs attempting to take bucket lock */ + bpf_disable_instrumentation(); + tmp = rhashtable_lookup_get_insert_fast(&rhtab->ht, &elem->node, rhtab_params); + bpf_enable_instrumentation(); + + if (tmp) { + bpf_mem_cache_free(&rhtab->ma, elem); + if (IS_ERR(tmp)) + return PTR_ERR(tmp); + + return rhtab_map_update_existing(map, tmp, value, map_flags); + } + + return 0; +} + +static int rhtab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) +{ + struct bpf_insn *insn = insn_buf; + const int ret = BPF_REG_0; + + BUILD_BUG_ON(!__same_type(&rhtab_lookup_elem, + (void *(*)(struct bpf_map *map, void *key)) NULL)); + *insn++ = BPF_EMIT_CALL(rhtab_lookup_elem); + *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1); + *insn++ = BPF_ALU64_IMM(BPF_ADD, ret, + offsetof(struct rhtab_elem, data) + round_up(map->key_size, 8)); + + return insn - insn_buf; +} + +static void rhtab_map_free_internal_structs(struct bpf_map *map) +{ +} + +static int rhtab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + return -EOPNOTSUPP; +} + +static u64 rhtab_map_mem_usage(const struct bpf_map *map) +{ + struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map); + u64 num_entries; + + /* Excludes rhashtable bucket overhead (~ nelems * sizeof(void *) at 75% load). */ + num_entries = atomic_read(&rhtab->ht.nelems); + return sizeof(struct bpf_rhtab) + rhtab->elem_size * num_entries; +} + +BTF_ID_LIST_SINGLE(rhtab_map_btf_ids, struct, bpf_rhtab) +const struct bpf_map_ops rhtab_map_ops = { + .map_meta_equal = bpf_map_meta_equal, + .map_alloc_check = rhtab_map_alloc_check, + .map_alloc = rhtab_map_alloc, + .map_free = rhtab_map_free, + .map_get_next_key = rhtab_map_get_next_key, + .map_release_uref = rhtab_map_free_internal_structs, + .map_lookup_elem = rhtab_map_lookup_elem, + .map_lookup_and_delete_elem = rhtab_map_lookup_and_delete_elem, + .map_update_elem = rhtab_map_update_elem, + .map_delete_elem = rhtab_map_delete_elem, + .map_gen_lookup = rhtab_map_gen_lookup, + .map_mem_usage = rhtab_map_mem_usage, + .map_btf_id = &rhtab_map_btf_ids[0], +}; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 625a4366fe6d..1faae184de48 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1398,6 +1398,7 @@ static int map_create_alloc(union bpf_attr *attr, bpfptr_t uattr, struct bpf_ver if (attr->map_type != BPF_MAP_TYPE_BLOOM_FILTER && attr->map_type != BPF_MAP_TYPE_ARENA && + attr->map_type != BPF_MAP_TYPE_RHASH && attr->map_extra != 0) { bpf_log(log, "Invalid map_extra.\n"); return -EINVAL; @@ -1469,6 +1470,7 @@ static int map_create_alloc(union bpf_attr *attr, bpfptr_t uattr, struct bpf_ver case BPF_MAP_TYPE_CGROUP_ARRAY: case BPF_MAP_TYPE_ARRAY_OF_MAPS: case BPF_MAP_TYPE_HASH: + case BPF_MAP_TYPE_RHASH: case BPF_MAP_TYPE_PERCPU_HASH: case BPF_MAP_TYPE_HASH_OF_MAPS: case BPF_MAP_TYPE_RINGBUF: @@ -2259,6 +2261,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_RHASH || map->map_type == BPF_MAP_TYPE_STACK_TRACE) { if (!bpf_map_is_offloaded(map)) { bpf_disable_instrumentation(); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8ed484cb1a8a..7d27ba396d32 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -17657,6 +17657,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, if (prog->sleepable) switch (map->map_type) { case BPF_MAP_TYPE_HASH: + case BPF_MAP_TYPE_RHASH: case BPF_MAP_TYPE_LRU_HASH: case BPF_MAP_TYPE_ARRAY: case BPF_MAP_TYPE_PERCPU_HASH: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 37142e6d911a..7d0b282ba674 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1047,6 +1047,7 @@ enum bpf_map_type { BPF_MAP_TYPE_CGRP_STORAGE, BPF_MAP_TYPE_ARENA, BPF_MAP_TYPE_INSN_ARRAY, + BPF_MAP_TYPE_RHASH, __MAX_BPF_MAP_TYPE }; @@ -1545,6 +1546,11 @@ union bpf_attr { * * BPF_MAP_TYPE_ARENA - contains the address where user space * is going to mmap() the arena. It has to be page aligned. + * + * BPF_MAP_TYPE_RHASH - initial table size hint + * (nelem_hint). 0 = use rhashtable default. Must be + * <= min(max_entries, U16_MAX). Upper 32 bits reserved, + * must be zero. */ __u64 map_extra; -- cgit v1.2.3 From 818e0084822742fc00eacbf5df3476a5e72c7d0e Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Fri, 5 Jun 2026 04:41:22 -0700 Subject: bpf: Implement iteration ops for resizable hashtab MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement get_next_key, batch lookup/lookup-and-delete, for_each_map_elem callback, and the seq_file BPF iterator for BPF_MAP_TYPE_RHASH. get_next_key() and batch use rhashtable_next_key() — stateless, matches the syscall UAPI shape (no kernel-side iterator state). get_next_key falls back to the first key when prev_key was concurrently deleted (matches htab semantics). Batch reports cursor loss as -EAGAIN so userspace can distinguish it from end-of-iteration (-ENOENT) and restart from NULL. The seq_file BPF iterator uses rhashtable_walk_* instead. It runs only from read() syscall context, so the walker's spin_lock is safe, and seq_file's per-fd state lets the walker handle rehash correctly (retry on -EAGAIN) for stronger coverage than the stateless API can provide. Signed-off-by: Mykyta Yatsenko Link: https://lore.kernel.org/r/20260605-rhash-v7-5-5b8e05f8630d@meta.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/hashtab.c | 347 +++++++++++++++++++++++++++++++++++++++++++++++++- kernel/bpf/map_iter.c | 3 +- 2 files changed, 348 insertions(+), 2 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 10f3a058747b..a149713d0953 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -3020,8 +3020,79 @@ static void rhtab_map_free_internal_structs(struct bpf_map *map) } static int rhtab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) + __must_hold_shared(RCU) { - return -EOPNOTSUPP; + struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map); + struct rhtab_elem *elem; + + elem = rhashtable_next_key(&rhtab->ht, key); + + /* if not found, return the first key */ + if (PTR_ERR(elem) == -ENOENT) + elem = rhashtable_next_key(&rhtab->ht, NULL); + + if (IS_ERR(elem)) + return PTR_ERR(elem); + if (!elem) + return -ENOENT; + + memcpy(next_key, elem->data, map->key_size); + return 0; +} + +static void rhtab_map_seq_show_elem(struct bpf_map *map, void *key, struct seq_file *m) +{ + void *value; + + /* Guarantee that hashtab value is not freed */ + guard(rcu)(); + + value = rhtab_map_lookup_elem(map, key); + if (!value) + return; + + btf_type_seq_show(map->btf, map->btf_key_type_id, key, m); + seq_puts(m, ": "); + btf_type_seq_show(map->btf, map->btf_value_type_id, value, m); + seq_putc(m, '\n'); +} + +static long bpf_each_rhash_elem(struct bpf_map *map, bpf_callback_t callback_fn, + void *callback_ctx, u64 flags) +{ + struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map); + void *prev_key = NULL; + struct rhtab_elem *elem; + int num_elems = 0; + u64 ret = 0; + + cant_migrate(); + + if (flags != 0) + return -EINVAL; + + rcu_read_lock(); + /* + * Best-effort iteration: if rhashtable is concurrently resized or + * elements are deleted/inserted, there may be missed or duplicate + * elements visited. + */ + while ((elem = rhashtable_next_key(&rhtab->ht, prev_key))) { + if (IS_ERR(elem)) + break; + num_elems++; + ret = callback_fn((u64)(long)map, + (u64)(long)elem->data, + (u64)(long)rhtab_elem_value(elem, map->key_size), + (u64)(long)callback_ctx, 0); + if (ret) + break; + + prev_key = elem->data; /* valid while RCU held */ + } + rcu_read_unlock(); + + return num_elems; } static u64 rhtab_map_mem_usage(const struct bpf_map *map) @@ -3034,6 +3105,275 @@ static u64 rhtab_map_mem_usage(const struct bpf_map *map) return sizeof(struct bpf_rhtab) + rhtab->elem_size * num_entries; } +static int __rhtab_map_lookup_and_delete_batch(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr, + bool do_delete) +{ + struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map); + void __user *uvalues = u64_to_user_ptr(attr->batch.values); + void __user *ukeys = u64_to_user_ptr(attr->batch.keys); + void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch); + void *cursor = NULL, *keys = NULL, *values = NULL, *dst_key, *dst_val; + struct rhtab_elem **del_elems = NULL; + u32 max_count, total, key_size, value_size, i; + bool has_next_cursor = false; + struct rhtab_elem *elem; + u64 elem_map_flags, map_flags; + int ret = 0; + + elem_map_flags = attr->batch.elem_flags; + ret = bpf_map_check_op_flags(map, elem_map_flags, BPF_F_LOCK); + if (ret) + return ret; + + map_flags = attr->batch.flags; + if (map_flags) + return -EINVAL; + + max_count = attr->batch.count; + if (!max_count) + return 0; + + if (put_user(0, &uattr->batch.count)) + return -EFAULT; + + key_size = map->key_size; + value_size = map->value_size; + + keys = kvmalloc_array(max_count, key_size, GFP_USER | __GFP_NOWARN); + values = kvmalloc_array(max_count, value_size, GFP_USER | __GFP_NOWARN); + if (do_delete) + del_elems = kvmalloc_array(max_count, sizeof(void *), + GFP_USER | __GFP_NOWARN); + cursor = kmalloc(key_size, GFP_USER | __GFP_NOWARN); + + if (!keys || !values || !cursor || (do_delete && !del_elems)) { + ret = -ENOMEM; + goto free; + } + + if (ubatch && copy_from_user(cursor, ubatch, key_size)) { + ret = -EFAULT; + goto free; + } + + dst_key = keys; + dst_val = values; + total = 0; + + rcu_read_lock(); + + /* + * Cursor stores the key of the next-to-process element (stashed by + * the previous batch). Look it up directly so the element is included + * here rather than skipped by next_key(). If the cursor was deleted + * concurrently (or by the previous do_delete batch), return -EAGAIN + * so userspace can distinguish a lost cursor from end-of-iteration + * (-ENOENT) and restart from a NULL cursor. + */ + if (ubatch) { + elem = rhtab_lookup_elem(map, cursor); + if (!elem) { + rcu_read_unlock(); + ret = -EAGAIN; + goto free; + } + } else { + elem = rhashtable_next_key(&rhtab->ht, NULL); + } + + while (elem && !IS_ERR(elem) && total < max_count) { + memcpy(dst_key, elem->data, key_size); + rhtab_read_elem_value(map, dst_val, elem, elem_map_flags); + check_and_init_map_value(map, dst_val); + + if (do_delete) + del_elems[total] = elem; + + elem = rhashtable_next_key(&rhtab->ht, dst_key); + dst_key += key_size; + dst_val += value_size; + total++; + + /* Bail to userspace to avoid stalls. */ + if (need_resched()) + break; + } + + if (elem && !IS_ERR(elem)) { + /* Stash next-to-process key as cursor for the next batch. */ + memcpy(cursor, elem->data, key_size); + has_next_cursor = true; + } + + if (do_delete) { + for (i = 0; i < total; i++) + rhtab_delete_elem(rhtab, del_elems[i], NULL, 0); + } + + rcu_read_unlock(); + + if (total == 0) { + ret = -ENOENT; + goto free; + } + + /* No more elements after this batch. */ + if (!has_next_cursor) + ret = -ENOENT; + + if (copy_to_user(ukeys, keys, (size_t)total * key_size) || + copy_to_user(uvalues, values, (size_t)total * value_size) || + put_user(total, &uattr->batch.count) || + (has_next_cursor && + copy_to_user(u64_to_user_ptr(attr->batch.out_batch), + cursor, key_size))) { + ret = -EFAULT; + goto free; + } + +free: + kfree(cursor); + kvfree(keys); + kvfree(values); + kvfree(del_elems); + return ret; +} + +static int rhtab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return __rhtab_map_lookup_and_delete_batch(map, attr, uattr, false); +} + +static int rhtab_map_lookup_and_delete_batch(struct bpf_map *map, const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return __rhtab_map_lookup_and_delete_batch(map, attr, uattr, true); +} + +struct bpf_iter_seq_rhash_map_info { + struct bpf_map *map; + struct bpf_rhtab *rhtab; + struct rhashtable_iter iter; +}; + +static void *bpf_rhash_map_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(RCU) +{ + struct bpf_iter_seq_rhash_map_info *info = seq->private; + struct rhtab_elem *elem; + + rhashtable_walk_start(&info->iter); + /* + * Re-deliver the element returned by walk_next() at the end of the + * previous read() — bpf_seq_read may have stopped before show() + * consumed it. Rehash rewinds the walker; retry on -EAGAIN. + */ + do { + elem = rhashtable_walk_peek(&info->iter); + } while (PTR_ERR(elem) == -EAGAIN); + + if (IS_ERR(elem)) + return NULL; + + if (elem && *pos == 0) + ++*pos; + return elem; +} + +static void *bpf_rhash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct bpf_iter_seq_rhash_map_info *info = seq->private; + struct rhtab_elem *elem; + + ++*pos; + + /* Rehash rewinds the walker; retry until it stops returning -EAGAIN. */ + do { + elem = rhashtable_walk_next(&info->iter); + } while (PTR_ERR(elem) == -EAGAIN); + + if (IS_ERR(elem)) + return NULL; + return elem; +} + +static int __bpf_rhash_map_seq_show(struct seq_file *seq, + struct rhtab_elem *elem) +{ + struct bpf_iter_seq_rhash_map_info *info = seq->private; + struct bpf_iter__bpf_map_elem ctx = {}; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + int ret = 0; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, elem == NULL); + if (prog) { + ctx.meta = &meta; + ctx.map = info->map; + if (elem) { + ctx.key = elem->data; + ctx.value = rhtab_elem_value(elem, info->map->key_size); + } + ret = bpf_iter_run_prog(prog, &ctx); + } + + return ret; +} + +static int bpf_rhash_map_seq_show(struct seq_file *seq, void *v) +{ + return __bpf_rhash_map_seq_show(seq, v); +} + +static void bpf_rhash_map_seq_stop(struct seq_file *seq, void *v) + __releases(RCU) +{ + struct bpf_iter_seq_rhash_map_info *info = seq->private; + + if (!v) + (void)__bpf_rhash_map_seq_show(seq, NULL); + + rhashtable_walk_stop(&info->iter); +} + +static int bpf_iter_init_rhash_map(void *priv_data, struct bpf_iter_aux_info *aux) +{ + struct bpf_iter_seq_rhash_map_info *info = priv_data; + struct bpf_map *map = aux->map; + + bpf_map_inc_with_uref(map); + info->map = map; + info->rhtab = container_of(map, struct bpf_rhtab, map); + rhashtable_walk_enter(&info->rhtab->ht, &info->iter); + return 0; +} + +static void bpf_iter_fini_rhash_map(void *priv_data) +{ + struct bpf_iter_seq_rhash_map_info *info = priv_data; + + rhashtable_walk_exit(&info->iter); + bpf_map_put_with_uref(info->map); +} + +static const struct seq_operations bpf_rhash_map_seq_ops = { + .start = bpf_rhash_map_seq_start, + .next = bpf_rhash_map_seq_next, + .stop = bpf_rhash_map_seq_stop, + .show = bpf_rhash_map_seq_show, +}; + +static const struct bpf_iter_seq_info rhash_iter_seq_info = { + .seq_ops = &bpf_rhash_map_seq_ops, + .init_seq_private = bpf_iter_init_rhash_map, + .fini_seq_private = bpf_iter_fini_rhash_map, + .seq_priv_size = sizeof(struct bpf_iter_seq_rhash_map_info), +}; + BTF_ID_LIST_SINGLE(rhtab_map_btf_ids, struct, bpf_rhtab) const struct bpf_map_ops rhtab_map_ops = { .map_meta_equal = bpf_map_meta_equal, @@ -3047,6 +3387,11 @@ const struct bpf_map_ops rhtab_map_ops = { .map_update_elem = rhtab_map_update_elem, .map_delete_elem = rhtab_map_delete_elem, .map_gen_lookup = rhtab_map_gen_lookup, + .map_seq_show_elem = rhtab_map_seq_show_elem, + .map_set_for_each_callback_args = map_set_for_each_callback_args, + .map_for_each_callback = bpf_each_rhash_elem, .map_mem_usage = rhtab_map_mem_usage, + BATCH_OPS(rhtab), .map_btf_id = &rhtab_map_btf_ids[0], + .iter_seq_info = &rhash_iter_seq_info, }; diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index ae0741a09c6d..c19b360bad9e 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -123,7 +123,8 @@ static int bpf_iter_attach_map(struct bpf_prog *prog, is_percpu = true; else if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_LRU_HASH && - map->map_type != BPF_MAP_TYPE_ARRAY) + map->map_type != BPF_MAP_TYPE_ARRAY && + map->map_type != BPF_MAP_TYPE_RHASH) goto put_map; key_acc_size = prog->aux->max_rdonly_access; -- cgit v1.2.3 From 6905f8601298ecd2d1932a4b4849bf265201118e Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Fri, 5 Jun 2026 04:41:23 -0700 Subject: bpf: Allow special fields in resizable hashtab Add support for timers, workqueues, task work, spin locks and kptrs. Without this, users needing deferred callbacks, BPF_F_LOCK, or refcounted kernel pointers in a dynamically-sized map have no option - fixed-size htab is the only map supporting these field types. Resizable hashtab should offer the same capability. kptr semantics under in-place updates are identical to array map. Properly clean up BTF record fields on element delete and map teardown by wiring up bpf_obj_free_fields through a memory allocator destructor, matching the pattern used by htab for non-prealloc maps. Signed-off-by: Mykyta Yatsenko Link: https://lore.kernel.org/r/20260605-rhash-v7-6-5b8e05f8630d@meta.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/hashtab.c | 111 ++++++++++++++++++++++++++++++++++++++++++++++----- kernel/bpf/syscall.c | 3 ++ 2 files changed, 104 insertions(+), 10 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index a149713d0953..7b9408b8320c 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -497,28 +497,26 @@ static void htab_dtor_ctx_free(void *ctx) kfree(ctx); } -static int htab_set_dtor(struct bpf_htab *htab, void (*dtor)(void *, void *)) +static int bpf_ma_set_dtor(struct bpf_map *map, struct bpf_mem_alloc *ma, + void (*dtor)(void *, void *)) { - u32 key_size = htab->map.key_size; - struct bpf_mem_alloc *ma; struct htab_btf_record *hrec; int err; /* No need for dtors. */ - if (IS_ERR_OR_NULL(htab->map.record)) + if (IS_ERR_OR_NULL(map->record)) return 0; hrec = kzalloc(sizeof(*hrec), GFP_KERNEL); if (!hrec) return -ENOMEM; - hrec->key_size = key_size; - hrec->record = btf_record_dup(htab->map.record); + hrec->key_size = map->key_size; + hrec->record = btf_record_dup(map->record); if (IS_ERR(hrec->record)) { err = PTR_ERR(hrec->record); kfree(hrec); return err; } - ma = htab_is_percpu(htab) ? &htab->pcpu_ma : &htab->ma; bpf_mem_alloc_set_dtor(ma, dtor, htab_dtor_ctx_free, hrec); return 0; } @@ -535,9 +533,9 @@ static int htab_map_check_btf(struct bpf_map *map, const struct btf *btf, * populated in htab_map_alloc(), so it will always appear as NULL. */ if (htab_is_percpu(htab)) - return htab_set_dtor(htab, htab_pcpu_mem_dtor); + return bpf_ma_set_dtor(map, &htab->pcpu_ma, htab_pcpu_mem_dtor); else - return htab_set_dtor(htab, htab_mem_dtor); + return bpf_ma_set_dtor(map, &htab->ma, htab_mem_dtor); } static struct bpf_map *htab_map_alloc(union bpf_attr *attr) @@ -2752,6 +2750,7 @@ struct bpf_rhtab { struct rhashtable ht; struct bpf_mem_alloc ma; u32 elem_size; + bool freeing_internal; }; static const struct rhashtable_params rhtab_params = { @@ -2832,11 +2831,34 @@ static int rhtab_map_alloc_check(union bpf_attr *attr) return htab_map_alloc_check(attr); } +static void rhtab_check_and_free_fields(struct bpf_rhtab *rhtab, + struct rhtab_elem *elem) +{ + if (IS_ERR_OR_NULL(rhtab->map.record)) + return; + + bpf_obj_free_fields(rhtab->map.record, + rhtab_elem_value(elem, rhtab->map.key_size)); +} + +static void rhtab_mem_dtor(void *obj, void *ctx) +{ + struct htab_btf_record *hrec = ctx; + struct rhtab_elem *elem = obj; + + if (IS_ERR_OR_NULL(hrec->record)) + return; + + bpf_obj_free_fields(hrec->record, + rhtab_elem_value(elem, hrec->key_size)); +} + static void rhtab_free_elem(void *ptr, void *arg) { struct bpf_rhtab *rhtab = arg; struct rhtab_elem *elem = ptr; + bpf_map_free_internal_structs(&rhtab->map, rhtab_elem_value(elem, rhtab->map.key_size)); bpf_mem_cache_free_rcu(&rhtab->ma, elem); } @@ -2900,7 +2922,8 @@ static int rhtab_delete_elem(struct bpf_rhtab *rhtab, struct rhtab_elem *elem, v rhtab_read_elem_value(&rhtab->map, copy, elem, flags); check_and_init_map_value(&rhtab->map, copy); } - + /* Release internal structs: kptr, bpf_timer, task_work, wq */ + rhtab_check_and_free_fields(rhtab, elem); bpf_mem_cache_free_rcu(&rhtab->ma, elem); return 0; } @@ -2942,6 +2965,7 @@ static int rhtab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, void static long rhtab_map_update_existing(struct bpf_map *map, struct rhtab_elem *elem, void *value, u64 map_flags) { + struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map); void *old_val = rhtab_elem_value(elem, map->key_size); if (map_flags & BPF_NOEXIST) @@ -2951,6 +2975,17 @@ static long rhtab_map_update_existing(struct bpf_map *map, struct rhtab_elem *el copy_map_value_locked(map, old_val, value, false); else copy_map_value(map, old_val, value); + + /* + * Torn reads: a concurrent reader without BPF_F_LOCK may observe + * the value mid-copy. Callers requiring consistent reads must use + * BPF_F_LOCK, matching arraymap semantics. + * + * copy_map_value() skips special-field offsets, so old timers/ + * kptrs/etc. still sit in the slot. Cancel them after the copy + * to match arraymap's update semantics. + */ + rhtab_check_and_free_fields(rhtab, elem); return 0; } @@ -2973,6 +3008,14 @@ static long rhtab_map_update_elem(struct bpf_map *map, void *key, void *value, u if (map_flags & BPF_EXIST) return -ENOENT; + /* + * Reject new insertions while map_release_uref cleanup walks the + * table. Without this, new elements could keep triggering rehash + * and prevent the walk from terminating. + */ + if (READ_ONCE(rhtab->freeing_internal)) + return -EBUSY; + /* Check max_entries limit before inserting new element */ if (atomic_read(&rhtab->ht.nelems) >= map->max_entries) return -E2BIG; @@ -2983,6 +3026,7 @@ static long rhtab_map_update_elem(struct bpf_map *map, void *key, void *value, u memcpy(elem->data, key, map->key_size); copy_map_value(map, rhtab_elem_value(elem, map->key_size), value); + check_and_init_map_value(map, rhtab_elem_value(elem, map->key_size)); /* Prevent deadlock for NMI programs attempting to take bucket lock */ bpf_disable_instrumentation(); @@ -3015,8 +3059,54 @@ static int rhtab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) return insn - insn_buf; } +static int rhtab_map_check_btf(struct bpf_map *map, const struct btf *btf, + const struct btf_type *key_type, + const struct btf_type *value_type) +{ + struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map); + + return bpf_ma_set_dtor(map, &rhtab->ma, rhtab_mem_dtor); +} + static void rhtab_map_free_internal_structs(struct bpf_map *map) { + struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map); + struct rhashtable_iter iter; + struct rhtab_elem *elem; + + if (!bpf_map_has_internal_structs(map)) + return; + + /* + * Block new insertions. Once observed, no new growth is triggered, + * so any in-flight rehash will drain and the walker is guaranteed + * to stop returning -EAGAIN. Treat -EAGAIN as "rehash in progress, + * retry"; do not wait for the worker. + */ + WRITE_ONCE(rhtab->freeing_internal, true); + + rhashtable_walk_enter(&rhtab->ht, &iter); + rhashtable_walk_start(&iter); + + while ((elem = rhashtable_walk_next(&iter))) { + if (IS_ERR(elem)) { + if (PTR_ERR(elem) == -EAGAIN) + continue; + break; + } + + bpf_map_free_internal_structs(map, rhtab_elem_value(elem, map->key_size)); + + if (need_resched()) { /* Avoid stalls on large maps */ + rhashtable_walk_stop(&iter); + cond_resched(); + rhashtable_walk_start(&iter); + } + } + + rhashtable_walk_stop(&iter); + rhashtable_walk_exit(&iter); + WRITE_ONCE(rhtab->freeing_internal, false); } static int rhtab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) @@ -3382,6 +3472,7 @@ const struct bpf_map_ops rhtab_map_ops = { .map_free = rhtab_map_free, .map_get_next_key = rhtab_map_get_next_key, .map_release_uref = rhtab_map_free_internal_structs, + .map_check_btf = rhtab_map_check_btf, .map_lookup_elem = rhtab_map_lookup_elem, .map_lookup_and_delete_elem = rhtab_map_lookup_and_delete_elem, .map_update_elem = rhtab_map_update_elem, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 1faae184de48..31a3b70a0b5d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1280,6 +1280,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, case BPF_SPIN_LOCK: case BPF_RES_SPIN_LOCK: if (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_RHASH && map->map_type != BPF_MAP_TYPE_ARRAY && map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && map->map_type != BPF_MAP_TYPE_SK_STORAGE && @@ -1294,6 +1295,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, case BPF_WORKQUEUE: case BPF_TASK_WORK: if (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_RHASH && map->map_type != BPF_MAP_TYPE_LRU_HASH && map->map_type != BPF_MAP_TYPE_ARRAY) { ret = -EOPNOTSUPP; @@ -1305,6 +1307,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, case BPF_KPTR_PERCPU: case BPF_REFCOUNT: if (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_RHASH && map->map_type != BPF_MAP_TYPE_PERCPU_HASH && map->map_type != BPF_MAP_TYPE_LRU_HASH && map->map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH && -- cgit v1.2.3 From 9dcbb5045fe5a00f04c99aede5a10726f8bb937b Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Fri, 5 Jun 2026 04:41:24 -0700 Subject: bpf: Optimize word-sized keys for resizable hashtable Specialize the lookup/update/delete paths for keys whose size matches sizeof(long) (4 bytes on 32-bit, 8 bytes on 64-bit). A static-const rhashtable_params lets the compiler inline a custom XOR-fold hashfn and a single-word equality cmpfn, eliminating the indirect jhash dispatch. The same hashfn and cmpfn are installed into rhashtable's stored params at rhashtable_init time, so the rehash worker, slow-path inserts, and rhashtable_next_key() all agree with the inlined fast paths. The seq_file BPF iterator uses rhashtable_walk_* and is unaffected. Signed-off-by: Mykyta Yatsenko Link: https://lore.kernel.org/r/20260605-rhash-v7-7-5b8e05f8630d@meta.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/hashtab.c | 47 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 45 insertions(+), 2 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 7b9408b8320c..b4366cad3cfa 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -2763,6 +2763,31 @@ static inline void *rhtab_elem_value(struct rhtab_elem *l, u32 key_size) return l->data + round_up(key_size, 8); } +/* Specialize hash function and objcmp for long sized key */ +static __always_inline int rhtab_key_cmp_long(struct rhashtable_compare_arg *arg, + const void *ptr) +{ + const unsigned long key1 = *(const unsigned long *)arg->key; + const struct rhtab_elem *key2 = ptr; + + return key1 != *(const unsigned long *)key2->data; +} + +static __always_inline u32 rhtab_hashfn_long(const void *data, u32 len, u32 seed) +{ + u64 k = *(const unsigned long *)data; + + return (u32)(k ^ (k >> 32)) ^ seed; +} + +static const struct rhashtable_params rhtab_params_long = { + .head_offset = offsetof(struct rhtab_elem, node), + .key_offset = offsetof(struct rhtab_elem, data), + .key_len = sizeof(long), + .hashfn = rhtab_hashfn_long, + .obj_cmpfn = rhtab_key_cmp_long, +}; + static struct bpf_map *rhtab_map_alloc(union bpf_attr *attr) { struct rhashtable_params params; @@ -2788,6 +2813,11 @@ static struct bpf_map *rhtab_map_alloc(union bpf_attr *attr) params.nelem_hint = (u32)attr->map_extra; params.automatic_shrinking = true; + if (rhtab->map.key_size == sizeof(long)) { + params.hashfn = rhtab_hashfn_long; + params.obj_cmpfn = rhtab_key_cmp_long; + } + err = rhashtable_init(&rhtab->ht, ¶ms); if (err) goto free_rhtab; @@ -2878,6 +2908,9 @@ static void *rhtab_lookup_elem(struct bpf_map *map, void *key) /* Hold RCU lock in case sleepable program calls via gen_lookup */ guard(rcu)(); + if (map->key_size == sizeof(long)) + return rhashtable_lookup_likely(&rhtab->ht, key, rhtab_params_long); + return rhashtable_lookup_likely(&rhtab->ht, key, rhtab_params); } @@ -2912,7 +2945,12 @@ static int rhtab_delete_elem(struct bpf_rhtab *rhtab, struct rhtab_elem *elem, v * raw tracepoints, which we don't have in rhashtable. */ bpf_disable_instrumentation(); - err = rhashtable_remove_fast(&rhtab->ht, &elem->node, rhtab_params); + + if (rhtab->map.key_size == sizeof(long)) + err = rhashtable_remove_fast(&rhtab->ht, &elem->node, rhtab_params_long); + else + err = rhashtable_remove_fast(&rhtab->ht, &elem->node, rhtab_params); + bpf_enable_instrumentation(); if (err) @@ -3030,7 +3068,12 @@ static long rhtab_map_update_elem(struct bpf_map *map, void *key, void *value, u /* Prevent deadlock for NMI programs attempting to take bucket lock */ bpf_disable_instrumentation(); - tmp = rhashtable_lookup_get_insert_fast(&rhtab->ht, &elem->node, rhtab_params); + + if (map->key_size == sizeof(long)) + tmp = rhashtable_lookup_get_insert_fast(&rhtab->ht, &elem->node, rhtab_params_long); + else + tmp = rhashtable_lookup_get_insert_fast(&rhtab->ht, &elem->node, rhtab_params); + bpf_enable_instrumentation(); if (tmp) { -- cgit v1.2.3 From 27ffbfd14d774adfc64ae1f8f76aa6195411087a Mon Sep 17 00:00:00 2001 From: Song Chen Date: Wed, 3 Jun 2026 17:19:10 +0800 Subject: bpf: Reject registration of duplicated kfunc Search for duplicated kfunc in btf_vmlinux and btf_modules before a kernel module attempts to register a kfunc. If kfunc would shadow existing kfunc then pr_err() and reject module loading. Reviewed-by: Yonghong Song Signed-off-by: Song Chen Link: https://lore.kernel.org/r/20260603091910.7212-1-chensong_2000@126.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 55aa3ba1b1e0..ef4402274786 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -8771,6 +8771,39 @@ static int btf_check_iter_kfuncs(struct btf *btf, const char *func_name, return 0; } +static int btf_check_kfunc_name(struct btf *btf, const char *func_name, u32 kind) +{ +#ifdef CONFIG_DEBUG_INFO_BTF_MODULES + struct btf_module *btf_mod, *tmp; +#endif + s32 id; + + if (!btf_is_module(btf)) + return 0; + + id = btf_find_by_name_kind(bpf_get_btf_vmlinux(), func_name, kind); + if (id >= 0) { + pr_err("kfunc %s (id: %d) is already present in vmlinux.\n", + func_name, id); + return -EINVAL; + } + +#ifdef CONFIG_DEBUG_INFO_BTF_MODULES + guard(mutex)(&btf_module_mutex); + list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) { + if (btf_mod->btf == btf) + continue; + id = btf_find_by_name_kind(btf_mod->btf, func_name, kind); + if (id >= 0) { + pr_err("kfunc %s (id: %d) is already present in module %s.\n", + func_name, id, btf_mod->module->name); + return -EINVAL; + } + } +#endif + return 0; +} + static int btf_check_kfunc_protos(struct btf *btf, u32 func_id, u32 func_flags) { const struct btf_type *func; @@ -8784,7 +8817,8 @@ static int btf_check_kfunc_protos(struct btf *btf, u32 func_id, u32 func_flags) /* sanity check kfunc name */ func_name = btf_name_by_offset(btf, func->name_off); - if (!func_name || !func_name[0]) + if (!func_name || !func_name[0] || + btf_check_kfunc_name(btf, func_name, BTF_INFO_KIND(func->info))) return -EINVAL; func = btf_type_by_id(btf, func->type); -- cgit v1.2.3 From aa496720618f1a6054f1c870bf10b4f6c99bf656 Mon Sep 17 00:00:00 2001 From: Zhao Zhang Date: Tue, 2 Jun 2026 16:43:33 +0800 Subject: bpf: Reject fragmented frames in devmap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Devmap broadcast redirects clone the packet for all but the last destination. For native XDP, that clone path copies only the linear xdp_frame data, while fragmented frames keep skb_shared_info in tailroom outside the linear area. Cloning such a frame leaves XDP_FLAGS_HAS_FRAGS set but without valid frag metadata, and the later free path can interpret uninitialized tail data as skb_shared_info, leading to an out-of-bounds access during frame return. Reject fragmented native XDP frames in dev_map_enqueue_clone(). Add the same restriction to the generic XDP clone path in dev_map_redirect_clone(). Generic XDP represents fragmented packets as nonlinear skbs, and rejecting them here keeps clone-based broadcast support aligned between native and generic XDP. Fixes: e624d4ed4aa8 ("xdp: Extend xdp_redirect_map with broadcast support") Cc: stable@kernel.org Reported-by: Yuan Tan Reported-by: Zhengchuan Liang Reported-by: Xin Liu Assisted-by: Codex:GPT-5.4 Signed-off-by: Zhao Zhang Signed-off-by: Ren Wei Reviewed-by: Emil Tsalapatis Reviewed-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/21c2d153dd25603d359069a02bf06779b51f6423.1780385378.git.zzhan461@ucr.edu Signed-off-by: Alexei Starovoitov --- kernel/bpf/devmap.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index cc0a43ebab6b..5b9eac5342a9 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -581,6 +581,10 @@ static int dev_map_enqueue_clone(struct bpf_dtab_netdev *obj, { struct xdp_frame *nxdpf; + /* Frags live outside the linear frame and cannot be cloned safely. */ + if (unlikely(xdp_frame_has_frags(xdpf))) + return -EOPNOTSUPP; + nxdpf = xdpf_clone(xdpf); if (!nxdpf) return -ENOMEM; @@ -726,6 +730,9 @@ static int dev_map_redirect_clone(struct bpf_dtab_netdev *dst, struct sk_buff *nskb; int err; + if (unlikely(skb_is_nonlinear(skb))) + return -EOPNOTSUPP; + nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) return -ENOMEM; -- cgit v1.2.3 From f64c723741c911544cca4c838d7a291b06b3ad1d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 1 Jun 2026 08:37:28 -1000 Subject: bpf: Replace scratch PTE atomically when allocating arena pages apply_range_set_cb() maps the pages for a new arena allocation and returned -EBUSY when the target PTE was already populated. Kernel-fault recovery leaves the per-arena scratch page in unallocated arena PTEs, so a later bpf_arena_alloc_pages() over such a page hits that -EBUSY, and every subsequent allocation of it fails the same way. Allocation must install the real page over scratch instead. Overwriting the scratch PTE in place is a valid->valid change, which arm64 forbids without break-before-make. Route through an invalid entry instead: ptep_try_set() fills only a none slot, so the PTE goes scratch->none->page. On finding scratch, clear it and flush_tlb_before_set() before retrying. The new flush_tlb_before_set() is a no-op except on arches like arm64 that need the break-before-make TLB invalidate. The loop also copes with a concurrent fault re-scratching the slot. Arches without ptep_try_set() never install the scratch page, so keep the must-be-empty check and set_pte_at() for them. Fixes: dc11a4dba246 ("bpf: Recover arena kernel faults with scratch page") Signed-off-by: Tejun Heo Cc: Alexei Starovoitov Cc: David Hildenbrand Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20260601183728.1800490-1-tj@kernel.org Signed-off-by: Alexei Starovoitov --- arch/arm64/include/asm/pgtable.h | 11 +++++++++++ include/linux/pgtable.h | 18 ++++++++++++++++++ kernel/bpf/arena.c | 38 +++++++++++++++++++++++++++++++++----- 3 files changed, 62 insertions(+), 5 deletions(-) (limited to 'kernel/bpf') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 984f0502c9d0..3ce0f2a6cab6 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1842,6 +1842,17 @@ static inline bool ptep_try_set(pte_t *ptep, pte_t new_pte) } #define ptep_try_set ptep_try_set +/* + * arm64 mandates break-before-make: a cleared kernel PTE must have its TLB + * invalidated before a different page is installed in its place. The broadcast + * TLBI is an instruction, not an IPI, so this is safe with interrupts disabled. + */ +static inline void flush_tlb_before_set(unsigned long addr) +{ + flush_tlb_kernel_range(addr, addr + PAGE_SIZE); +} +#define flush_tlb_before_set flush_tlb_before_set + #define test_and_clear_young_ptes test_and_clear_young_ptes static inline bool test_and_clear_young_ptes(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, unsigned int nr) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index b5739bb99fc1..4c6c4081ef71 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1061,6 +1061,24 @@ static inline bool ptep_try_set(pte_t *ptep, pte_t new_pte) } #endif +#ifndef flush_tlb_before_set +/** + * flush_tlb_before_set - invalidate a kernel PTE's TLB before re-setting it + * @addr: kernel virtual address whose PTE was just cleared + * + * Some architectures (e.g. arm64) do not allow a live page-table entry to be + * repointed at a different page in one step. The old entry must first be made + * invalid and its translation flushed from every TLB, and only then may the new + * entry be written. + * + * This is only for the lockless atomic kernel-PTE installers (ptep_try_set()). + * It must be callable with interrupts disabled. + */ +static inline void flush_tlb_before_set(unsigned long addr) +{ +} +#endif + #ifndef wrprotect_ptes /** * wrprotect_ptes - Write-protect PTEs that map consecutive pages of the same diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 9b2dea229b38..af49c154473d 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -144,6 +144,7 @@ static long compute_pgoff(struct bpf_arena *arena, long uaddr) struct apply_range_data { struct page **pages; + struct page *scratch_page; int i; }; @@ -156,19 +157,44 @@ static int apply_range_set_cb(pte_t *pte, unsigned long addr, void *data) { struct apply_range_data *d = data; struct page *page; + pte_t pteval; if (!data) return 0; - /* sanity check */ - if (unlikely(!pte_none(ptep_get(pte)))) - return -EBUSY; page = d->pages[d->i]; /* paranoia, similar to vmap_pages_pte_range() */ if (WARN_ON_ONCE(!pfn_valid(page_to_pfn(page)))) return -EINVAL; - set_pte_at(&init_mm, addr, pte, mk_pte(page, PAGE_KERNEL)); + pteval = mk_pte(page, PAGE_KERNEL); +#ifdef ptep_try_set + /* + * Kernel-fault recovery may have installed the scratch page here, and + * some architectures (arm64) prohibit valid->valid PTE transitions. + * Install atomically into a none slot. If scratch is present, clear it + * and flush_tlb_before_set() (break-before-make) before retrying. + */ + while (!ptep_try_set(pte, pteval)) { + pte_t old = ptep_get(pte); + + if (pte_none(old)) + continue; + if (WARN_ON_ONCE(pte_page(old) != d->scratch_page)) + return -EBUSY; + ptep_get_and_clear(&init_mm, addr, pte); + flush_tlb_before_set(addr); + } +#else + /* + * Without ptep_try_set() there is no atomic installer, but such arches + * also do not wire up bpf_arena_handle_page_fault(), so no scratch page + * is ever installed and the slot is always none here. + */ + if (unlikely(!pte_none(ptep_get(pte)))) + return -EBUSY; + set_pte_at(&init_mm, addr, pte, pteval); +#endif d->i++; return 0; } @@ -480,7 +506,8 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf) if (ret) goto out_sigsegv_memcg; - struct apply_range_data data = { .pages = &page, .i = 0 }; + struct apply_range_data data = { .pages = &page, .i = 0, + .scratch_page = arena->scratch_page }; /* Account into memcg of the process that created bpf_arena */ ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page); if (ret) { @@ -670,6 +697,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt return 0; } data.pages = pages; + data.scratch_page = arena->scratch_page; if (raw_res_spin_lock_irqsave(&arena->spinlock, flags)) goto out_free_pages; -- cgit v1.2.3 From a3863aa4f55e5c17f32e1fd64a0a64adf2af16d9 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Fri, 5 Jun 2026 13:20:52 -0700 Subject: bpf: Fix dead error check on acquire_reference() in check_kfunc_call acquire_reference() returns a signed int that may be a negative errno but was converted to unsigned, which makes the subsequent error check deadcode. Fix it by declaring 'id' as int so the error path is taken correctly. Fixes: 308c7a0ae885 ("bpf: Refactor object relationship tracking and fix dynptr UAF bug") Acked-by: Eduard Zingerman Signed-off-by: Amery Hung Link: https://lore.kernel.org/r/20260605202056.1780352-2-ameryhung@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7d27ba396d32..a741bf447931 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12817,9 +12817,10 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, struct bpf_kfunc_call_arg_meta meta; struct bpf_insn_aux_data *insn_aux; int err, insn_idx = *insn_idx_p; - u32 i, nargs, ptr_type_id, id; const struct btf_param *args; + u32 i, nargs, ptr_type_id; struct btf *desc_btf; + int id; /* skip for now, but return error when we find this in fixup_kfunc_call */ if (!insn->imm) -- cgit v1.2.3 From 73d475dc6c13177fce0d9d892bff33299c8ad56a Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Fri, 5 Jun 2026 13:20:53 -0700 Subject: bpf: Check acquire_reference() error for "__ref" struct_ops arguments When acquiring references for struct_ops program arguments tagged with "__ref", the return value of acquire_reference() was stored directly into u32 ctx_arg_info[i].ref_id without checking for failure. acquire_reference() returns -ENOMEM when acquire_reference_state() fails to allocate, so the error was silently stored as a ref_id instead of aborting verification. Fix it by checking the return. Fixes: a687df2008f6 ("bpf: Support getting referenced kptr from struct_ops argument") Signed-off-by: Amery Hung Link: https://lore.kernel.org/r/20260605202056.1780352-3-ameryhung@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a741bf447931..3b874bbbaac0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -18363,9 +18363,13 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) /* Acquire references for struct_ops program arguments tagged with "__ref" */ if (!subprog && env->prog->type == BPF_PROG_TYPE_STRUCT_OPS) { - for (i = 0; i < aux->ctx_arg_info_size; i++) - aux->ctx_arg_info[i].ref_id = aux->ctx_arg_info[i].refcounted ? - acquire_reference(env, 0, 0) : 0; + for (i = 0; i < aux->ctx_arg_info_size; i++) { + ret = aux->ctx_arg_info[i].refcounted ? acquire_reference(env, 0, 0) : 0; + if (ret < 0) + goto out; + + aux->ctx_arg_info[i].ref_id = ret; + } } ret = do_check(env); -- cgit v1.2.3 From 41025f441fe6addd93d2c333a3a184331e8ef6cf Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Fri, 5 Jun 2026 13:20:54 -0700 Subject: bpf: Compare parent_id in refsafe() for REF_TYPE_PTR refsafe() compared each reference's id and type but not its parent_id, so two states whose PTR references differ only in the parent object they were derived from could be wrongly treated as equivalent and pruned. Fix it by checking parent_id too. Fixes: 308c7a0ae885 ("bpf: Refactor object relationship tracking and fix dynptr UAF bug") Signed-off-by: Amery Hung Link: https://lore.kernel.org/r/20260605202056.1780352-4-ameryhung@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/states.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/states.c b/kernel/bpf/states.c index 5945956a7573..06d9ae24f006 100644 --- a/kernel/bpf/states.c +++ b/kernel/bpf/states.c @@ -890,6 +890,9 @@ static bool refsafe(struct bpf_verifier_state *old, struct bpf_verifier_state *c return false; switch (old->refs[i].type) { case REF_TYPE_PTR: + if (!check_ids(old->refs[i].parent_id, cur->refs[i].parent_id, idmap)) + return false; + break; case REF_TYPE_IRQ: break; case REF_TYPE_LOCK: -- cgit v1.2.3 From ac7f6c9da6b6b46bba34a45c51603c81e7d42eb2 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Fri, 5 Jun 2026 13:20:55 -0700 Subject: bpf: Remove WARN_ON_ONCE in check_ids() check_ids() warned when it ran out of idmap slots, assuming this was impossible because the slots are bounded by the number of registers and stack slots. That assumption no longer holds: referenced dynptrs acquire an intermediate reference that lives in refs[] but is not backed by any register or stack slot [0], so a program can accumulate more reference ids than the idmap can hold and exhaust it. Exhaustion is fine for verification correctness. check_ids() already returns false, which makes the states compare as not equivalent and prevents unsound pruning. The only effect of the WARN_ON_ONCE() is log noise, or a panic under panic_on_warn. Drop the warning and keep returning false. [0] 308c7a0ae885 ("bpf: Refactor object relationship tracking and fix dynptr UAF bug") Signed-off-by: Amery Hung Link: https://lore.kernel.org/r/20260605202056.1780352-5-ameryhung@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/states.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/states.c b/kernel/bpf/states.c index 06d9ae24f006..32f346ce3ffc 100644 --- a/kernel/bpf/states.c +++ b/kernel/bpf/states.c @@ -343,8 +343,12 @@ static bool check_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap) return true; } - /* We ran out of idmap slots, which should be impossible */ - WARN_ON_ONCE(1); + /* + * idmap slots are bounded by the number of registers and stack slots. + * Since referenced dynptrs acquire intermediate references that do + * not live in either, so the map can be exhausted. Since it is unlikely, + * fail the verification by treating the states as not equivalent. + */ return false; } -- cgit v1.2.3 From 4a7910ee060d8ce55612f5b3cc267f3a265a3cec Mon Sep 17 00:00:00 2001 From: Kaitao Cheng Date: Fri, 5 Jun 2026 17:41:43 +0800 Subject: bpf: Clear rb node linkage when freeing bpf_rb_root bpf_rb_root_free() detaches the root by copying the current rb_root_cached and then replacing the live root with RB_ROOT_CACHED. It then walks the copied root and drops each object contained in the tree. This leaves the rb node state intact while dropping the object. If the object is refcounted and survives the drop, its bpf_rb_node_kern still contains an owner pointer to the freed root and stale rb tree linkage. If a later bpf_rb_root allocation reuses the same address, bpf_rbtree_remove() can incorrectly pass the owner check and call rb_erase_cached() on a node whose rb pointers belong to the old tree. Mirror the list draining behavior by marking nodes as busy while the root is being detached, then clear the rb node and release the owner before dropping the containing object. This makes surviving nodes unowned and safe to reject from remove or accept for a later add. Fixes: 9c395c1b99bd ("bpf: Add basic bpf_rb_{root,node} support") Signed-off-by: Kaitao Cheng Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20260605094143.5509-1-kaitao.cheng@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 03004e4451f5..8ba2b8965caf 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2306,6 +2306,7 @@ void bpf_rb_root_free(const struct btf_field *field, void *rb_root, struct bpf_spin_lock *spin_lock) { struct rb_root_cached orig_root, *root = rb_root; + struct bpf_rb_node_kern *node; struct rb_node *pos, *n; void *obj; @@ -2314,14 +2315,20 @@ void bpf_rb_root_free(const struct btf_field *field, void *rb_root, __bpf_spin_lock_irqsave(spin_lock); orig_root = *root; + bpf_rbtree_postorder_for_each_entry_safe(pos, n, &orig_root.rb_root) { + node = rb_entry(pos, struct bpf_rb_node_kern, rb_node); + WRITE_ONCE(node->owner, BPF_PTR_POISON); + } *root = RB_ROOT_CACHED; __bpf_spin_unlock_irqrestore(spin_lock); bpf_rbtree_postorder_for_each_entry_safe(pos, n, &orig_root.rb_root) { obj = pos; obj -= field->graph_root.node_offset; - - + node = rb_entry(pos, struct bpf_rb_node_kern, rb_node); + RB_CLEAR_NODE(pos); + /* Ensure __bpf_rbtree_add() sees the node as unlinked. */ + smp_store_release(&node->owner, NULL); __bpf_obj_drop_impl(obj, field->graph_root.value_rec, false); } } -- cgit v1.2.3 From e2a49fdb1beed150125b4104c90eb2a96ec7f63a Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Fri, 5 Jun 2026 23:52:47 +0800 Subject: bpf: Check tail zero of bpf_map_info Since there're 4 bytes padding at the end of struct bpf_map_info, they won't be checked by bpf_check_uarg_tail_zero(). pahole -C bpf_map_info ./vmlinux struct bpf_map_info { ... __u64 hash __attribute__((__aligned__(8))); /* 88 8 */ __u32 hash_size; /* 96 4 */ /* size: 104, cachelines: 2, members: 18 */ /* padding: 4 */ /* forced alignments: 1 */ /* last cacheline: 40 bytes */ } __attribute__((__aligned__(8))); If a future kernel extension adds a new 4-byte field, older userspace programs allocating this structure on the stack might inadvertently pass uninitialized stack garbage into the new field, permanently breaking backward compatibility. -- sashiko [1] Fix it by changing sizeof(info) to offsetofend(struct bpf_map_info, hash_size). And, add "__u32 :32" to the tail of struct bpf_map_info. [1] https://lore.kernel.org/bpf/20260513224823.6494FC19425@smtp.kernel.org/ Fixes: ea2e6467ac36 ("bpf: Return hashes of maps in BPF_OBJ_GET_INFO_BY_FD") Acked-by: Mykyta Yatsenko Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260605155249.20772-2-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 1 + kernel/bpf/syscall.c | 5 +++-- tools/include/uapi/linux/bpf.h | 1 + 3 files changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel/bpf') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bed9b1b4d5ef..e1730f449d9e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -6733,6 +6733,7 @@ struct bpf_map_info { __u64 map_extra; __aligned_u64 hash; __u32 hash_size; + __u32 :32; } __attribute__((aligned(8))); struct bpf_btf_info { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 31a3b70a0b5d..89f020a44fc9 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -5406,10 +5406,11 @@ static int bpf_map_get_info_by_fd(struct file *file, { struct bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info); struct bpf_map_info info; - u32 info_len = attr->info.info_len; + u32 info_len = attr->info.info_len, len; int err; - err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len); + len = offsetofend(struct bpf_map_info, hash_size); + err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), len, info_len); if (err) return err; info_len = min_t(u32, sizeof(info), info_len); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 7d0b282ba674..7caf667e86fe 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -6733,6 +6733,7 @@ struct bpf_map_info { __u64 map_extra; __aligned_u64 hash; __u32 hash_size; + __u32 :32; } __attribute__((aligned(8))); struct bpf_btf_info { -- cgit v1.2.3 From 786be2b05980a5828e67fc564ad7517e2adbe9bd Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Fri, 5 Jun 2026 23:52:48 +0800 Subject: bpf: Check tail zero of bpf_prog_info Since there're 4 bytes padding at the end of struct bpf_prog_info, they won't be checked by bpf_check_uarg_tail_zero(). pahole -C bpf_prog_info ./vmlinux struct bpf_prog_info { ... __u32 attach_btf_obj_id; /* 220 4 */ __u32 attach_btf_id; /* 224 4 */ /* size: 232, cachelines: 4, members: 38 */ /* sum members: 224 */ /* sum bitfield members: 1 bits, bit holes: 1, sum bit holes: 31 bits */ /* padding: 4 */ /* forced alignments: 9 */ /* last cacheline: 40 bytes */ } __attribute__((__aligned__(8))); If a future kernel extension adds a new 4-byte field, older userspace programs allocating this structure on the stack might inadvertently pass uninitialized stack garbage into the new field, permanently breaking backward compatibility. -- sashiko [1] Fix it by changing sizeof(info) to offsetofend(struct bpf_prog_info, attach_btf_id). And, add "__u32 :32" to the tail of struct bpf_prog_info. [1] https://lore.kernel.org/bpf/20260513224823.6494FC19425@smtp.kernel.org/ Fixes: aba64c7da983 ("bpf: Add verified_insns to bpf_prog_info and fdinfo") Acked-by: Mykyta Yatsenko Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260605155249.20772-3-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 1 + kernel/bpf/syscall.c | 5 +++-- tools/include/uapi/linux/bpf.h | 1 + 3 files changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel/bpf') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e1730f449d9e..d5238df5e5eb 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -6712,6 +6712,7 @@ struct bpf_prog_info { __u32 verified_insns; __u32 attach_btf_obj_id; __u32 attach_btf_id; + __u32 :32; } __attribute__((aligned(8))); struct bpf_map_info { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 89f020a44fc9..c5d4ae957e87 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -5121,10 +5121,11 @@ static int bpf_prog_get_info_by_fd(struct file *file, u32 info_len = attr->info.info_len; struct bpf_prog_kstats stats; char __user *uinsns; - u32 ulen; + u32 ulen, len; int err; - err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len); + len = offsetofend(struct bpf_prog_info, attach_btf_id); + err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), len, info_len); if (err) return err; info_len = min_t(u32, sizeof(info), info_len); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 7caf667e86fe..3829db087449 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -6712,6 +6712,7 @@ struct bpf_prog_info { __u32 verified_insns; __u32 attach_btf_obj_id; __u32 attach_btf_id; + __u32 :32; } __attribute__((aligned(8))); struct bpf_map_info { -- cgit v1.2.3 From a66e3b5bacf38d6ab29fa05a9754f7a114485605 Mon Sep 17 00:00:00 2001 From: Dawei Feng Date: Wed, 3 Jun 2026 18:53:15 +0800 Subject: bpf: NUL-terminate replaced sysctl value When writing to sysctls, proc_sys_call_handler() guarantees that the buffer passed to proc handlers is NUL-terminated. If bpf_sysctl_set_new_value() replaces the pending sysctl value, it can hand a replacement buffer directly to proc handlers. However, the helper currently copies only buf_len bytes into that buffer without appending a NUL terminator, leaving downstream parsers vulnerable to out-of-bounds access. Fix this by appending a '\0' after the replaced value to restore the expected sysctl semantics. Since the helper already rejects buf_len greater than PAGE_SIZE - 1, there is always room for the extra byte. Reproduced in a QEMU x86_64 guest booted with KASAN while exercising the sysctl replacement path with a cgroup/sysctl BPF program. The reproducer targets `/proc/sys/net/core/flow_limit_cpu_bitmap`, fills the original user write buffer with non-zero bytes, and overrides the sysctl value so the replacement buffer lacks a terminating NUL. Under that setup, the pre-fix kernel reported: BUG: KASAN: slab-out-of-bounds in strnchrnul+0x72/0x90 Read of size 1 at addr ffff88800de57000 by task repro_patch3/66 CPU: 0 UID: 0 PID: 66 Comm: repro_patch3 Not tainted 7.1.0-rc3-00269-g8370ca1f87cc #6 PREEMPT(lazy) Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 Call Trace: dump_stack_lvl+0x68/0xa0 print_report+0xcb/0x5e0 ? __virt_addr_valid+0x21d/0x3f0 ? strnchrnul+0x72/0x90 ? strnchrnul+0x72/0x90 kasan_report+0xca/0x100 ? strnchrnul+0x72/0x90 strnchrnul+0x72/0x90 bitmap_parse+0x37/0x2e0 flow_limit_cpu_sysctl+0xc6/0x840 ? __pfx_flow_limit_cpu_sysctl+0x10/0x10 ? __kvmalloc_node_noprof+0x5ba/0x870 proc_sys_call_handler+0x31d/0x480 ? __pfx_proc_sys_call_handler+0x10/0x10 ? selinux_file_permission+0x39f/0x500 ? lock_is_held_type+0x9e/0x120 vfs_write+0x98e/0x1000 ... The buggy address is located 0 bytes to the right of allocated 4096-byte region [ffff88800de56000, ffff88800de57000) With this fix applied, rerunning the same sysctl-targeted path yields no corresponding KASAN reports. Signed-off-by: Zilin Guan Signed-off-by: Dawei Feng Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20260603105317.944304-2-dawei.feng@seu.edu.cn Signed-off-by: Alexei Starovoitov --- kernel/bpf/cgroup.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 2c2bdaa86aa7..3fe87b9d368e 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -2343,6 +2343,7 @@ BPF_CALL_3(bpf_sysctl_set_new_value, struct bpf_sysctl_kern *, ctx, return -E2BIG; memcpy(ctx->new_val, buf, buf_len); + ((char *)ctx->new_val)[buf_len] = '\0'; ctx->new_len = buf_len; ctx->new_updated = 1; -- cgit v1.2.3 From 4c21b5927d4364bfe7365f2700da5fea0ed0d004 Mon Sep 17 00:00:00 2001 From: Dawei Feng Date: Wed, 3 Jun 2026 18:53:16 +0800 Subject: bpf: use kvfree() for replaced sysctl write buffer proc_sys_call_handler() allocates its temporary sysctl buffer with kvzalloc() and passes it to __cgroup_bpf_run_filter_sysctl(). Since kvzalloc() may fall back to vmalloc() for large allocations, freeing that buffer with kfree() is wrong and can corrupt memory. Use kvfree() to safely handle both kmalloc and kvzalloc()/vmalloc allocations. The bug was first flagged by an experimental analysis tool we are developing for kernel memory-management bugs while analyzing v6.13-rc1. The tool is still under development and is not yet publicly available. Manual inspection confirms that the bug is still present in v7.1-rc5. Reproduced the bug based on v7.1-rc4 in a QEMU x86_64 guest booted with KASAN and CONFIG_FAILSLAB enabled. To exercise the replacement path, the test tree also included the accompanying fix for the stale ret == 1 check in __cgroup_bpf_run_filter_sysctl(). The reproducer confines failslab injections to the proc_sys_call_handler() range, uses stacktrace-depth=32, and injects fail-nth=1 while writing 8191 bytes to /proc/sys/kernel/domainname from a task in the target cgroup. Under that setup, fail-nth=1 triggered the fault: BUG: unable to handle page fault for address: ffffeb0200024d48 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: Oops: 0000 SMP KASAN NOPTI CPU: 2 UID: 0 PID: 209 Comm: repro_proc_sys_ Not tainted 7.1.0-rc4-00686-g97625979a5d4 PREEMPT(lazy) Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.15.0-1 04/01/2014 RIP: 0010:kfree+0x6e/0x510 ... Call Trace: ? __cgroup_bpf_run_filter_sysctl+0x626/0xc30 __cgroup_bpf_run_filter_sysctl+0x74d/0xc30 ? __pfx___cgroup_bpf_run_filter_sysctl+0x10/0x10 ? srso_return_thunk+0x5/0x5f ? __kvmalloc_node_noprof+0x345/0x870 ? proc_sys_call_handler+0x250/0x480 ? srso_return_thunk+0x5/0x5f proc_sys_call_handler+0x3a2/0x480 ? __pfx_proc_sys_call_handler+0x10/0x10 ? srso_return_thunk+0x5/0x5f ? selinux_file_permission+0x39f/0x500 ? srso_return_thunk+0x5/0x5f ? lock_is_held_type+0x9e/0x120 vfs_write+0x98e/0x1000 ... With this fix applied on top of the same test setup, rerunning the reproducer with fail-nth=1 yields no corresponding Oops reports. Fixes: 4508943794ef ("proc: use kvzalloc for our kernel buffer") Cc: stable@vger.kernel.org Reviewed-by: Emil Tsalapatis Reviewed-by: Jiayuan Chen Acked-by: Yonghong Song Signed-off-by: Zilin Guan Signed-off-by: Dawei Feng Link: https://lore.kernel.org/r/20260603105317.944304-3-dawei.feng@seu.edu.cn Signed-off-by: Alexei Starovoitov --- kernel/bpf/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 3fe87b9d368e..4bf0ec94e719 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1937,7 +1937,7 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, kfree(ctx.cur_val); if (ret == 1 && ctx.new_updated) { - kfree(*buf); + kvfree(*buf); *buf = ctx.new_val; *pcount = ctx.new_len; } else { -- cgit v1.2.3 From 2566c3b24219c5b30e35205cba029ff34ff7c78b Mon Sep 17 00:00:00 2001 From: Dawei Feng Date: Wed, 3 Jun 2026 18:53:17 +0800 Subject: bpf: Restore sysctl new-value from 1 to 0 Commit 4e63acdff864 ("bpf: Introduce bpf_sysctl_{get,set}_new_value helpers") changed the success return value to 0, but failed to update the corresponding check in __cgroup_bpf_run_filter_sysctl(). Since bpf_prog_run_array_cg() now returns 0 on success, the legacy ret == 1 condition is never satisfied. As a result, the modified value is ignored, and bpf_sysctl_set_new_value() fails to replace the write buffer. Fix this by checking for a return value of 0 instead, so cgroup/sysctl programs can correctly replace the pending sysctl buffer. This bug was discovered during a manual code review. Tested via a cgroup/sysctl BPF reproducer overriding writes to a target sysctl. Pre-fix, bpf_sysctl_set_new_value("foo") was silently ignored: the write returned 8192 and the value remained "600". Post-fix, the BPF replacement buffer properly propagates: the write returns 3 and the value updates to "foo". Fixes: f10d05966196 ("bpf: Make BPF_PROG_RUN_ARRAY return -err instead of allow boolean") Cc: stable@vger.kernel.org Acked-by: Yonghong Song Signed-off-by: Zilin Guan Signed-off-by: Dawei Feng Reviewed-by: Jiayuan Chen Acked-by: Xu Kuohai Link: https://lore.kernel.org/r/20260603105317.944304-4-dawei.feng@seu.edu.cn Signed-off-by: Alexei Starovoitov --- kernel/bpf/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 4bf0ec94e719..35d1f1428ef3 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1936,7 +1936,7 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, kfree(ctx.cur_val); - if (ret == 1 && ctx.new_updated) { + if (!ret && ctx.new_updated) { kvfree(*buf); *buf = ctx.new_val; *pcount = ctx.new_len; -- cgit v1.2.3 From b1f7f67b74c2e29e9c83f7952290a3c7f156bf9a Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Fri, 5 Jun 2026 14:02:42 +0000 Subject: bpf: Add validation for bpf_set_retval argument The bpf_set_retval() helper is used by cgroup BPF programs to set the return value of the target hook. The argument type for this helper is ARG_ANYTHING. This allows setting a positive value, which no cgroup hook expects and can cause issues, such as: - BPF_LSM_CGROUP: a positive value from bpf_lsm_socket_create bypasses the err < 0 check in __sock_create(), leaving the socket object unallocated. The positive return value is then propagated to the syscall entry __sys_socket(), which also bypasses the IS_ERR() guard and ultimately causes a NULL pointer dereference. - BPF_CGROUP_DEVICE: a positive value can be returned through cgroup device bpf prog -> devcgroup_check_permission() -> bdev_permission() -> bdev_file_open_by_dev(), where ERR_PTR(positive) produces a pointer that IS_ERR() does not catch, leading to a wild pointer dereference. - BPF_CGROUP_SOCK: a positive value can be returned through cgroup sock bpf prog -> __cgroup_bpf_run_filter_sk() -> inet_create() -> __sock_create(), where inet_create() frees the newly allocated sk via sk_common_release() and sets sock->sk = NULL on the non-zero return, but __sock_create() only checks err < 0 for cleanup, so a positive retval bypasses cleanup and returns a socket with NULL sk to userspace, triggering a NULL pointer dereference on subsequent socket operations. - BPF_CGROUP_SYSCTL: a positive value can be returned through the cgroup bpf prog -> __cgroup_bpf_run_filter_sysctl() -> proc_sys_call_handler(), where a non-zero return bypasses the normal sysctl proc_handler and is returned directly to userspace as return value of read() or write() syscall. So add validation for the argument of the bpf_set_retval() helper. For BPF_LSM_CGROUP, enforce the LSM hook specific range returned by bpf_lsm_get_retval_range(). For all other cgroup program types, restrict the argument to [-MAX_ERRNO, 0], which matches the kernel convention of 0 for success and negative errno for error. BPF_CGROUP_GETSOCKOPT is an exception, since valid getsockopt implementations may return positive values, as allowed by commit c4dcfdd406aa ("bpf: Move getsockopt retval to struct bpf_cg_run_ctx"). Also refine the return value range of bpf_get_retval() so that values returned by bpf_get_retval() can be passed directly to bpf_set_retval() without extra manual bounds checking. Fixes: b44123b4a3dc ("bpf: Add cgroup helpers bpf_{get,set}_retval to get/set syscall return value") Fixes: 69fd337a975c ("bpf: per-cgroup lsm flavor") Reported-by: Quan Sun <2022090917019@std.uestc.edu.cn> Closes: https://lore.kernel.org/all/567d3206-74a5-44e5-99c6-779c425f399e@std.uestc.edu.cn Signed-off-by: Xu Kuohai Link: https://lore.kernel.org/r/20260605140243.664590-3-xukuohai@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3b874bbbaac0..935595138aa0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9770,7 +9770,9 @@ static int do_refine_retval_range(struct bpf_verifier_env *env, int func_id, struct bpf_call_arg_meta *meta) { + struct bpf_retval_range range; struct bpf_reg_state *ret_reg = ®s[BPF_REG_0]; + enum bpf_prog_type prog_type = resolve_prog_type(env->prog); if (ret_type != RET_INTEGER) return 0; @@ -9790,6 +9792,29 @@ static int do_refine_retval_range(struct bpf_verifier_env *env, reg_set_urange32(ret_reg, 0, nr_cpu_ids - 1); reg_bounds_sync(ret_reg); break; + case BPF_FUNC_get_retval: + /* + * bpf_get_retval may see arbitrary value passed by bpf_prog_run_array_cg for + * CGROUP_GETSOCKOPT type. + */ + if (prog_type == BPF_PROG_TYPE_CGROUP_SOCKOPT && + env->prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT) + break; + + if (prog_type == BPF_PROG_TYPE_LSM && + env->prog->expected_attach_type == BPF_LSM_CGROUP) { + if (!env->prog->aux->attach_func_proto->type) + break; + bpf_lsm_get_retval_range(env->prog, &range); + } else { + range.minval = -MAX_ERRNO; + range.maxval = 0; + } + + reg_set_srange64(ret_reg, range.minval, range.maxval); + reg_set_srange32(ret_reg, range.minval, range.maxval); + reg_bounds_sync(ret_reg); + break; } return reg_bounds_sanity_check(env, ret_reg, "retval"); @@ -10259,6 +10284,24 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn } break; case BPF_FUNC_set_retval: + { + struct bpf_retval_range range = { + .minval = -MAX_ERRNO, + .maxval = 0, + .return_32bit = true + }; + struct bpf_reg_state *r1 = ®s[BPF_REG_1]; + + if (r1->type != SCALAR_VALUE) { + verbose(env, "R1 is not a scalar\n"); + return -EINVAL; + } + + /* CGROUP_GETSOCKOPT is allowed to return arbitrary value */ + if (prog_type == BPF_PROG_TYPE_CGROUP_SOCKOPT && + env->prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT) + break; + if (prog_type == BPF_PROG_TYPE_LSM && env->prog->expected_attach_type == BPF_LSM_CGROUP) { if (!env->prog->aux->attach_func_proto->type) { @@ -10268,8 +10311,20 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn verbose(env, "BPF_LSM_CGROUP that attach to void LSM hooks can't modify return value!\n"); return -EINVAL; } + bpf_lsm_get_retval_range(env->prog, &range); } + + err = mark_chain_precision(env, BPF_REG_1); + if (err) + return err; + + if (!retval_range_within(range, r1)) { + verbose_invalid_scalar(env, r1, range, "At bpf_set_retval", "R1"); + return -EINVAL; + } + break; + } case BPF_FUNC_dynptr_write: { enum bpf_dynptr_type dynptr_type = meta.dynptr.type; -- cgit v1.2.3 From 63a673e8a4112af267106264f50584947786845a Mon Sep 17 00:00:00 2001 From: KP Singh Date: Fri, 5 Jun 2026 23:35:17 +0200 Subject: bpf: Expose signature verdict via bpf_prog_aux BPF_PROG_LOAD verifies the loader signature but does not record the outcome on the BPF program. [BPF] LSMs and audit can read attr->signature and attr->keyring_id to infer "was this signed, and if so, against which keyring". Add prog->aux->sig (verdict + keyring_{type,serial}), populated by bpf_prog_load before the LSM hook. keyring_type classifies the keyring the load referenced (builtin, secondary, platform or user), while keyring_serial records the serial of the keyring the signature was actually validated against. System keyrings carry a pseudo key pointer with no user-visible serial and are reported as 0, as are unsigned loads. Failed verifications reject the load before the hook runs, so it observes only either UNSIGNED or VERIFIED. Signed-off-by: KP Singh Co-developed-by: Daniel Borkmann Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/r/20260605213518.544262-1-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 45 +++++++++++++++++++++++++++++++++++---------- kernel/bpf/syscall.c | 29 ++++++++++++++++++++++++----- 2 files changed, 59 insertions(+), 15 deletions(-) (limited to 'kernel/bpf') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8599b451dd7a..f615b56730d2 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -32,6 +32,7 @@ #include #include #include +#include #include struct bpf_verifier_env; @@ -1674,6 +1675,19 @@ struct bpf_stream_stage { int len; }; +enum bpf_sig_verdict { + BPF_SIG_UNSIGNED = 0, + BPF_SIG_VERIFIED, +}; + +enum bpf_sig_keyring { + BPF_SIG_KEYRING_NONE = 0, + BPF_SIG_KEYRING_BUILTIN, + BPF_SIG_KEYRING_SECONDARY, + BPF_SIG_KEYRING_PLATFORM, + BPF_SIG_KEYRING_USER, +}; + struct bpf_prog_aux { atomic64_t refcnt; u32 used_map_cnt; @@ -1716,6 +1730,11 @@ struct bpf_prog_aux { bool changes_pkt_data; bool might_sleep; bool kprobe_write_ctx; + struct { + s32 keyring_serial; + u8 keyring_type; + u8 verdict; + } sig; u64 prog_array_member_cnt; /* counts how many times as member of prog_array */ struct mutex ext_mutex; /* mutex for is_extended and prog_array_member_cnt */ struct bpf_arena *arena; @@ -3697,8 +3716,14 @@ static inline int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, #endif /* CONFIG_BPF_SYSCALL */ #endif /* defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) */ -#if defined(CONFIG_KEYS) && defined(CONFIG_BPF_SYSCALL) +#ifdef CONFIG_KEYS +struct bpf_key { + struct key *key; + bool has_ref; +}; +#endif /* CONFIG_KEYS */ +#if defined(CONFIG_KEYS) && defined(CONFIG_BPF_SYSCALL) struct bpf_key *bpf_lookup_user_key(s32 serial, u64 flags); struct bpf_key *bpf_lookup_system_key(u64 id); void bpf_key_put(struct bpf_key *bkey); @@ -3706,6 +3731,10 @@ int bpf_verify_pkcs7_signature(const struct bpf_dynptr *data_p, const struct bpf_dynptr *sig_p, struct bpf_key *trusted_keyring); +static inline s32 bpf_key_serial(const struct bpf_key *key) +{ + return key->has_ref ? key->key->serial : 0; +} #else static inline struct bpf_key *bpf_lookup_user_key(u32 serial, u64 flags) { @@ -3727,6 +3756,11 @@ static inline int bpf_verify_pkcs7_signature(const struct bpf_dynptr *data_p, { return -EOPNOTSUPP; } + +static inline s32 bpf_key_serial(const struct bpf_key *key) +{ + return 0; +} #endif /* defined(CONFIG_KEYS) && defined(CONFIG_BPF_SYSCALL) */ /* verifier prototypes for helper functions called from eBPF programs */ @@ -4002,15 +4036,6 @@ static inline void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype) {} static inline void bpf_cgroup_atype_put(int cgroup_atype) {} #endif /* CONFIG_BPF_LSM */ -struct key; - -#ifdef CONFIG_KEYS -struct bpf_key { - struct key *key; - bool has_ref; -}; -#endif /* CONFIG_KEYS */ - static inline bool type_is_alloc(u32 type) { return type & MEM_ALLOC; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c5d4ae957e87..5fcfc32c7cb4 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2871,8 +2871,22 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type) } } +static enum bpf_sig_keyring bpf_classify_keyring(s32 keyring_id) +{ + switch (keyring_id) { + case 0: + return BPF_SIG_KEYRING_BUILTIN; + case (s32)(unsigned long)VERIFY_USE_SECONDARY_KEYRING: + return BPF_SIG_KEYRING_SECONDARY; + case (s32)(unsigned long)VERIFY_USE_PLATFORM_KEYRING: + return BPF_SIG_KEYRING_PLATFORM; + default: + return BPF_SIG_KEYRING_USER; + } +} + static int bpf_prog_verify_signature(struct bpf_prog *prog, union bpf_attr *attr, - bool is_kernel) + bool is_kernel, s32 *keyring_serial) { bpfptr_t usig = make_bpfptr(attr->signature, is_kernel); struct bpf_dynptr_kern sig_ptr, insns_ptr; @@ -2908,7 +2922,8 @@ static int bpf_prog_verify_signature(struct bpf_prog *prog, union bpf_attr *attr err = bpf_verify_pkcs7_signature((struct bpf_dynptr *)&insns_ptr, (struct bpf_dynptr *)&sig_ptr, key); - + if (!err) + *keyring_serial = bpf_key_serial(key); bpf_key_put(key); kvfree(sig); return err; @@ -3095,13 +3110,17 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, struct bpf_log_at /* eBPF programs must be GPL compatible to use GPL-ed functions */ prog->gpl_compatible = license_is_gpl_compatible(license) ? 1 : 0; - if (attr->signature) { - err = bpf_prog_verify_signature(prog, attr, uattr.is_kernel); + err = bpf_prog_verify_signature(prog, attr, uattr.is_kernel, + &prog->aux->sig.keyring_serial); if (err) goto free_prog; + prog->aux->sig.keyring_type = bpf_classify_keyring(attr->keyring_id); + prog->aux->sig.verdict = BPF_SIG_VERIFIED; + } else { + prog->aux->sig.keyring_type = BPF_SIG_KEYRING_NONE; + prog->aux->sig.verdict = BPF_SIG_UNSIGNED; } - prog->orig_prog = NULL; prog->jited = 0; -- cgit v1.2.3 From 37363191cbe8f83586ad6a818460d010070ead00 Mon Sep 17 00:00:00 2001 From: Nuoqi Gui Date: Sat, 6 Jun 2026 18:50:37 +0800 Subject: bpf: Fold reg->var_off into PTR_TO_FLOW_KEYS bounds check Constant pointer arithmetic on a PTR_TO_FLOW_KEYS register lands the constant in reg->var_off (e.g. flow_keys(imm=4096)), but the PTR_TO_FLOW_KEYS path in check_mem_access() passes only insn->off to check_flow_keys_access() and never folds reg->var_off.value. The verifier therefore accepts an access that, at runtime, dereferences past struct bpf_flow_keys -- a verifier/runtime divergence that yields an out-of-bounds read and write of kernel stack memory. Commit 022ac0750883 ("bpf: use reg->var_off instead of reg->off for pointers") removed the generic "off += reg->off" that check_mem_access() applied before the per-type dispatch and replaced it with per-path folding of reg->var_off.value (for example the ctx path now folds the register offset via check_ctx_access()). The PTR_TO_FLOW_KEYS path was not given the equivalent fold, so a constant offset that used to be folded and rejected is now silently accepted: before 022ac0750883: the offset stays in reg->off and is folded generically, so the access is checked with off=4096 and rejected. after 022ac0750883: the offset lands in reg->var_off, the flow_keys path checks off=0 and accepts; at runtime the access dereferences base + 0x1000. For a BPF_PROG_TYPE_FLOW_DISSECTOR program the following is accepted: r2 = *(u64 *)(r1 + 144) ; R2=flow_keys (PTR_TO_FLOW_KEYS) r2 += 0x1000 ; R2=flow_keys(imm=4096), accepted r0 = *(u64 *)(r2 + 0) ; accepted, var_off.value=0x1000 ignored while the equivalent insn->off form r0 = *(u64 *)(r2 + 0x1000) has the same effective offset but is correctly rejected with "invalid access to flow keys off=4096 size=8", which isolates the defect to the missing var_off fold. Once attached as a flow dissector, the accepted program reads kernel stack past struct bpf_flow_keys (a kernel-stack / KASLR information leak) and can likewise write past it, corrupting kernel memory. Fix it by folding reg->var_off.value into the offset before the bounds check and rejecting non-constant offsets, mirroring the other pointer types (e.g. check_ctx_access()). Fixes: 022ac0750883 ("bpf: use reg->var_off instead of reg->off for pointers") Signed-off-by: Nuoqi Gui Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260606-c3-01-v3-v3-1-97c51f592f15@mails.tsinghua.edu.cn Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 935595138aa0..68ddd465584c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4728,9 +4728,21 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, struct b return err; } -static int check_flow_keys_access(struct bpf_verifier_env *env, int off, - int size) +static int check_flow_keys_access(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, argno_t argno, + int off, int size) { + /* Only a constant offset is allowed here; fold it into off. */ + if (!tnum_is_const(reg->var_off)) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "%s invalid variable offset to flow keys: off=%d, var_off=%s\n", + reg_arg_name(env, argno), off, tn_buf); + return -EACCES; + } + off += reg->var_off.value; + if (size < 0 || off < 0 || (u64)off + size > sizeof(struct bpf_flow_keys)) { verbose(env, "invalid access to flow keys off=%d size=%d\n", @@ -6239,7 +6251,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, struct b return -EACCES; } - err = check_flow_keys_access(env, off, size); + err = check_flow_keys_access(env, reg, argno, off, size); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else if (type_is_sk_pointer(reg->type)) { -- cgit v1.2.3 From 5b038319be442c620f774e6fc9e9283deeca1c75 Mon Sep 17 00:00:00 2001 From: David Windsor Date: Fri, 5 Jun 2026 10:57:07 -0400 Subject: bpf: Reject sleepable BPF_LSM_CGROUP programs at load time The cgroup shim runs under rcu_read_lock_dont_migrate(), so we should not attach any sleepable BPF programs there. Add support to the verifier to explicitly reject attempts to load sleepable BPF programs destined for LSM cgroup attachment. Without this, we get the following splat from a BPF_LSM_CGROUP program marked BPF_F_SLEEPABLE attached to file_open when it calls bpf_get_dentry_xattr(): BUG: sleeping function called from invalid context at kernel/locking/rwsem.c:1567 in_atomic(): 0, irqs_disabled(): 0, non_block: 0, pid: 34317, name: load preempt_count: 0, expected: 0 RCU nest depth: 2, expected: 0 Call Trace: down_read+0x76/0x480 ext4_xattr_get+0x11f/0x700 __vfs_getxattr+0xf0/0x150 bpf_get_dentry_xattr+0xbb/0xf0 bpf_prog_e76a298dac9218c6_test_open+0x6a/0x85 __cgroup_bpf_run_lsm_current+0x326/0x840 bpf_trampoline_6442534646+0x62/0x14d security_file_open+0x34/0x60 do_dentry_open+0x340/0x1260 vfs_open+0x7a/0x440 path_openat+0x1bac/0x30a0 libbpf provides a .s named section variant for every sleepable program type except lsm_cgroup, reflecting that per-cgroup LSM programs are intended to only run in a non-sleepable context. The above splat was obtained by bypassing libbpf by using bpf(2) directly. Fixes: 69fd337a975c ("bpf: per-cgroup lsm flavor") Signed-off-by: David Windsor Acked-by: Yonghong Song Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20260605145707.608579-1-dwindsor@gmail.com Signed-off-by: Kumar Kartikeya Dwivedi --- kernel/bpf/verifier.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 68ddd465584c..926ff63a0b61 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -19172,8 +19172,10 @@ static bool can_be_sleepable(struct bpf_prog *prog) return false; } } - return prog->type == BPF_PROG_TYPE_LSM || - prog->type == BPF_PROG_TYPE_KPROBE /* only for uprobes */ || + if (prog->type == BPF_PROG_TYPE_LSM) + return prog->expected_attach_type != BPF_LSM_CGROUP; + + return prog->type == BPF_PROG_TYPE_KPROBE /* only for uprobes */ || prog->type == BPF_PROG_TYPE_STRUCT_OPS || prog->type == BPF_PROG_TYPE_RAW_TRACEPOINT || prog->type == BPF_PROG_TYPE_TRACEPOINT; -- cgit v1.2.3 From e6abd4cd157bf63cd89c74f8f10abae76e7b0359 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 6 Jun 2026 14:39:29 +0200 Subject: bpf: Use mutex lock pool for bpf trampolines Adding mutex lock pool that replaces bpf trampolines mutex. For tracing_multi link coming in following changes we need to lock all the involved trampolines during the attachment. This could mean thousands of mutex locks, which is not convenient. As suggested by Andrii we can replace bpf trampolines mutex with mutex pool, where each trampoline is hash-ed to one of the locks from the pool. It's better to lock all the pool mutexes (32 at the moment) than thousands of them. There is 48 (MAX_LOCK_DEPTH) lock limit allowed to be simultaneously held by task, so we need to keep 32 mutexes (5 bits) in the pool, so when we lock them all in following changes the lockdep won't scream. Removing the mutex_is_locked in bpf_trampoline_put, because we removed the mutex from bpf_trampoline. Suggested-by: Andrii Nakryiko Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20260606123955.345967-5-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 -- kernel/bpf/trampoline.c | 77 ++++++++++++++++++++++++++++++++++--------------- 2 files changed, 53 insertions(+), 26 deletions(-) (limited to 'kernel/bpf') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f615b56730d2..f6056bab6f23 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1353,8 +1353,6 @@ struct bpf_trampoline { /* hlist for trampoline_ip_table */ struct hlist_node hlist_ip; struct ftrace_ops *fops; - /* serializes access to fields of this trampoline */ - struct mutex mutex; refcount_t refcnt; u32 flags; u64 key; diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index a4298a25d4ba..c0b4732627be 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -30,6 +30,35 @@ static struct hlist_head trampoline_ip_table[TRAMPOLINE_TABLE_SIZE]; /* serializes access to trampoline tables */ static DEFINE_MUTEX(trampoline_mutex); +/* + * Keep 32 trampoline locks (5 bits) in the pool so trampoline_lock_all() + * stays below MAX_LOCK_DEPTH. Each pool slot has a distinct lockdep + * class because trampoline_lock_all() takes all pool mutexes at once; + * otherwise lockdep would report recursive locking on same-class mutexes. + */ +#define TRAMPOLINE_LOCKS_BITS 5 +#define TRAMPOLINE_LOCKS_TABLE_SIZE (1 << TRAMPOLINE_LOCKS_BITS) + +static struct { + struct mutex mutex; + struct lock_class_key key; +} trampoline_locks[TRAMPOLINE_LOCKS_TABLE_SIZE]; + +static struct mutex *select_trampoline_lock(struct bpf_trampoline *tr) +{ + return &trampoline_locks[hash_ptr(tr, TRAMPOLINE_LOCKS_BITS)].mutex; +} + +static void trampoline_lock(struct bpf_trampoline *tr) +{ + mutex_lock(select_trampoline_lock(tr)); +} + +static void trampoline_unlock(struct bpf_trampoline *tr) +{ + mutex_unlock(select_trampoline_lock(tr)); +} + #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex); @@ -69,9 +98,9 @@ static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, unsigned long ip, if (cmd == FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF) { /* This is called inside register_ftrace_direct_multi(), so - * tr->mutex is already locked. + * trampoline's mutex is already locked. */ - lockdep_assert_held_once(&tr->mutex); + lockdep_assert_held_once(select_trampoline_lock(tr)); /* Instead of updating the trampoline here, we propagate * -EAGAIN to register_ftrace_direct(). Then we can @@ -91,7 +120,7 @@ static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, unsigned long ip, } /* The normal locking order is - * tr->mutex => direct_mutex (ftrace.c) => ftrace_lock (ftrace.c) + * select_trampoline_lock(tr) => direct_mutex (ftrace.c) => ftrace_lock (ftrace.c) * * The following two commands are called from * @@ -99,12 +128,12 @@ static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, unsigned long ip, * cleanup_direct_functions_after_ipmodify * * In both cases, direct_mutex is already locked. Use - * mutex_trylock(&tr->mutex) to avoid deadlock in race condition - * (something else is making changes to this same trampoline). + * mutex_trylock(select_trampoline_lock(tr)) to avoid deadlock in race condition + * (something else holds the same pool lock). */ - if (!mutex_trylock(&tr->mutex)) { - /* sleep 1 ms to make sure whatever holding tr->mutex makes - * some progress. + if (!mutex_trylock(select_trampoline_lock(tr))) { + /* sleep 1 ms to make sure whatever holding select_trampoline_lock(tr) + * makes some progress. */ msleep(1); return -EAGAIN; @@ -129,7 +158,7 @@ static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, unsigned long ip, break; } - mutex_unlock(&tr->mutex); + trampoline_unlock(tr); return ret; } #endif @@ -359,7 +388,6 @@ static struct bpf_trampoline *bpf_trampoline_lookup(u64 key, unsigned long ip) head = &trampoline_ip_table[hash_64(tr->ip, TRAMPOLINE_HASH_BITS)]; hlist_add_head(&tr->hlist_ip, head); refcount_set(&tr->refcnt, 1); - mutex_init(&tr->mutex); for (i = 0; i < BPF_TRAMP_MAX; i++) INIT_HLIST_HEAD(&tr->progs_hlist[i]); out: @@ -843,9 +871,9 @@ int bpf_trampoline_link_prog(struct bpf_tramp_link *link, { int err; - mutex_lock(&tr->mutex); + trampoline_lock(tr); err = __bpf_trampoline_link_prog(link, tr, tgt_prog); - mutex_unlock(&tr->mutex); + trampoline_unlock(tr); return err; } @@ -886,9 +914,9 @@ int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, { int err; - mutex_lock(&tr->mutex); + trampoline_lock(tr); err = __bpf_trampoline_unlink_prog(link, tr, tgt_prog); - mutex_unlock(&tr->mutex); + trampoline_unlock(tr); return err; } @@ -998,12 +1026,12 @@ int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog, if (!tr) return -ENOMEM; - mutex_lock(&tr->mutex); + trampoline_lock(tr); shim_link = cgroup_shim_find(tr, bpf_func); if (shim_link && !IS_ERR(bpf_link_inc_not_zero(&shim_link->link.link))) { /* Reusing existing shim attached by the other program. */ - mutex_unlock(&tr->mutex); + trampoline_unlock(tr); bpf_trampoline_put(tr); /* bpf_trampoline_get above */ return 0; } @@ -1023,16 +1051,16 @@ int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog, shim_link->trampoline = tr; /* note, we're still holding tr refcnt from above */ - mutex_unlock(&tr->mutex); + trampoline_unlock(tr); return 0; err: - mutex_unlock(&tr->mutex); + trampoline_unlock(tr); if (shim_link) bpf_link_put(&shim_link->link.link); - /* have to release tr while _not_ holding its mutex */ + /* have to release tr while _not_ holding pool mutex for trampoline */ bpf_trampoline_put(tr); /* bpf_trampoline_get above */ return err; @@ -1053,9 +1081,9 @@ void bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog) if (WARN_ON_ONCE(!tr)) return; - mutex_lock(&tr->mutex); + trampoline_lock(tr); shim_link = cgroup_shim_find(tr, bpf_func); - mutex_unlock(&tr->mutex); + trampoline_unlock(tr); if (shim_link) bpf_link_put(&shim_link->link.link); @@ -1073,14 +1101,14 @@ struct bpf_trampoline *bpf_trampoline_get(u64 key, if (!tr) return NULL; - mutex_lock(&tr->mutex); + trampoline_lock(tr); if (tr->func.addr) goto out; memcpy(&tr->func.model, &tgt_info->fmodel, sizeof(tgt_info->fmodel)); tr->func.addr = (void *)tgt_info->tgt_addr; out: - mutex_unlock(&tr->mutex); + trampoline_unlock(tr); return tr; } @@ -1093,7 +1121,6 @@ void bpf_trampoline_put(struct bpf_trampoline *tr) mutex_lock(&trampoline_mutex); if (!refcount_dec_and_test(&tr->refcnt)) goto out; - WARN_ON_ONCE(mutex_is_locked(&tr->mutex)); for (i = 0; i < BPF_TRAMP_MAX; i++) if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[i]))) @@ -1379,6 +1406,8 @@ static int __init init_trampolines(void) INIT_HLIST_HEAD(&trampoline_key_table[i]); for (i = 0; i < TRAMPOLINE_TABLE_SIZE; i++) INIT_HLIST_HEAD(&trampoline_ip_table[i]); + for (i = 0; i < TRAMPOLINE_LOCKS_TABLE_SIZE; i++) + __mutex_init(&trampoline_locks[i].mutex, "trampoline_lock", &trampoline_locks[i].key); return 0; } late_initcall(init_trampolines); -- cgit v1.2.3 From 8a35e8db740f96ec17b85db5a0f83c028c707a3e Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 6 Jun 2026 14:39:30 +0200 Subject: bpf: Add struct bpf_trampoline_ops object In following changes we will need to override ftrace direct attachment behaviour. In order to do that we are adding struct bpf_trampoline_ops object that defines callbacks for ftrace direct attachment: register_fentry unregister_fentry modify_fentry The new struct bpf_trampoline_ops object is passed as an argument to __bpf_trampoline_link/unlink_prog functions. At the moment the default trampoline_ops is set to the current ftrace direct attachment functions, so there's no functional change for the current code. Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20260606123955.345967-6-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/trampoline.c | 59 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 41 insertions(+), 18 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index c0b4732627be..5c943832fb9d 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -59,8 +59,18 @@ static void trampoline_unlock(struct bpf_trampoline *tr) mutex_unlock(select_trampoline_lock(tr)); } +struct bpf_trampoline_ops { + int (*register_fentry)(struct bpf_trampoline *tr, void *new_addr, void *data); + int (*unregister_fentry)(struct bpf_trampoline *tr, u32 orig_flags, void *old_addr, + void *data); + int (*modify_fentry)(struct bpf_trampoline *tr, u32 orig_flags, void *old_addr, + void *new_addr, bool lock_direct_mutex, void *data); +}; + #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS -static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex); +static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex, + const struct bpf_trampoline_ops *ops, void *data); +static const struct bpf_trampoline_ops trampoline_ops; #ifdef CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS static struct bpf_trampoline *direct_ops_ip_lookup(struct ftrace_ops *ops, unsigned long ip) @@ -145,13 +155,15 @@ static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, unsigned long ip, if ((tr->flags & BPF_TRAMP_F_CALL_ORIG) && !(tr->flags & BPF_TRAMP_F_ORIG_STACK)) - ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */); + ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */, + &trampoline_ops, NULL); break; case FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY_PEER: tr->flags &= ~BPF_TRAMP_F_SHARE_IPMODIFY; if (tr->flags & BPF_TRAMP_F_ORIG_STACK) - ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */); + ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */, + &trampoline_ops, NULL); break; default: ret = -EINVAL; @@ -415,7 +427,7 @@ static int bpf_trampoline_update_fentry(struct bpf_trampoline *tr, u32 orig_flag } static int unregister_fentry(struct bpf_trampoline *tr, u32 orig_flags, - void *old_addr) + void *old_addr, void *data __maybe_unused) { int ret; @@ -429,7 +441,7 @@ static int unregister_fentry(struct bpf_trampoline *tr, u32 orig_flags, static int modify_fentry(struct bpf_trampoline *tr, u32 orig_flags, void *old_addr, void *new_addr, - bool lock_direct_mutex) + bool lock_direct_mutex, void *data __maybe_unused) { int ret; @@ -443,7 +455,7 @@ static int modify_fentry(struct bpf_trampoline *tr, u32 orig_flags, } /* first time registering */ -static int register_fentry(struct bpf_trampoline *tr, void *new_addr) +static int register_fentry(struct bpf_trampoline *tr, void *new_addr, void *data __maybe_unused) { void *ip = tr->func.addr; unsigned long faddr; @@ -465,6 +477,12 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr) return ret; } +static const struct bpf_trampoline_ops trampoline_ops = { + .register_fentry = register_fentry, + .unregister_fentry = unregister_fentry, + .modify_fentry = modify_fentry, +}; + static struct bpf_tramp_links * bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total, bool *ip_arg) { @@ -632,7 +650,8 @@ out: return ERR_PTR(err); } -static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex) +static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex, + const struct bpf_trampoline_ops *ops, void *data) { struct bpf_tramp_image *im; struct bpf_tramp_links *tlinks; @@ -645,7 +664,7 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut return PTR_ERR(tlinks); if (total == 0) { - err = unregister_fentry(tr, orig_flags, tr->cur_image->image); + err = ops->unregister_fentry(tr, orig_flags, tr->cur_image->image, data); bpf_tramp_image_put(tr->cur_image); tr->cur_image = NULL; goto out; @@ -715,11 +734,11 @@ again: if (tr->cur_image) /* progs already running at this address */ - err = modify_fentry(tr, orig_flags, tr->cur_image->image, - im->image, lock_direct_mutex); + err = ops->modify_fentry(tr, orig_flags, tr->cur_image->image, + im->image, lock_direct_mutex, data); else /* first time registering */ - err = register_fentry(tr, im->image); + err = ops->register_fentry(tr, im->image, data); #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS if (err == -EAGAIN) { @@ -793,7 +812,9 @@ static int bpf_freplace_check_tgt_prog(struct bpf_prog *tgt_prog) static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr, - struct bpf_prog *tgt_prog) + struct bpf_prog *tgt_prog, + const struct bpf_trampoline_ops *ops, + void *data) { struct bpf_fsession_link *fslink = NULL; enum bpf_tramp_prog_type kind; @@ -851,7 +872,7 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, } else { tr->progs_cnt[kind]++; } - err = bpf_trampoline_update(tr, true /* lock_direct_mutex */); + err = bpf_trampoline_update(tr, true /* lock_direct_mutex */, ops, data); if (err) { hlist_del_init(&link->tramp_hlist); if (kind == BPF_TRAMP_FSESSION) { @@ -872,14 +893,16 @@ int bpf_trampoline_link_prog(struct bpf_tramp_link *link, int err; trampoline_lock(tr); - err = __bpf_trampoline_link_prog(link, tr, tgt_prog); + err = __bpf_trampoline_link_prog(link, tr, tgt_prog, &trampoline_ops, NULL); trampoline_unlock(tr); return err; } static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr, - struct bpf_prog *tgt_prog) + struct bpf_prog *tgt_prog, + const struct bpf_trampoline_ops *ops, + void *data) { enum bpf_tramp_prog_type kind; int err; @@ -904,7 +927,7 @@ static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, } hlist_del_init(&link->tramp_hlist); tr->progs_cnt[kind]--; - return bpf_trampoline_update(tr, true /* lock_direct_mutex */); + return bpf_trampoline_update(tr, true /* lock_direct_mutex */, ops, data); } /* bpf_trampoline_unlink_prog() should never fail. */ @@ -915,7 +938,7 @@ int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, int err; trampoline_lock(tr); - err = __bpf_trampoline_unlink_prog(link, tr, tgt_prog); + err = __bpf_trampoline_unlink_prog(link, tr, tgt_prog, &trampoline_ops, NULL); trampoline_unlock(tr); return err; } @@ -1044,7 +1067,7 @@ int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog, goto err; } - err = __bpf_trampoline_link_prog(&shim_link->link, tr, NULL); + err = __bpf_trampoline_link_prog(&shim_link->link, tr, NULL, &trampoline_ops, NULL); if (err) goto err; -- cgit v1.2.3 From bf4bc3e11c4195123055780b84dccfb8d2569535 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 6 Jun 2026 14:39:31 +0200 Subject: bpf: Move trampoline image setup into bpf_trampoline_ops callbacks Moving trampoline image setup into bpf_trampoline_ops callbacks, so we can have different image handling for multi attachment which is coming in following changes. Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20260606123955.345967-7-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/trampoline.c | 66 ++++++++++++++++++++++++++++--------------------- 1 file changed, 38 insertions(+), 28 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 5c943832fb9d..1006031ea021 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -60,11 +60,10 @@ static void trampoline_unlock(struct bpf_trampoline *tr) } struct bpf_trampoline_ops { - int (*register_fentry)(struct bpf_trampoline *tr, void *new_addr, void *data); - int (*unregister_fentry)(struct bpf_trampoline *tr, u32 orig_flags, void *old_addr, - void *data); - int (*modify_fentry)(struct bpf_trampoline *tr, u32 orig_flags, void *old_addr, - void *new_addr, bool lock_direct_mutex, void *data); + int (*register_fentry)(struct bpf_trampoline *tr, struct bpf_tramp_image *im, void *data); + int (*unregister_fentry)(struct bpf_trampoline *tr, u32 orig_flags, void *data); + int (*modify_fentry)(struct bpf_trampoline *tr, u32 orig_flags, struct bpf_tramp_image *im, + bool lock_direct_mutex, void *data); }; #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS @@ -426,9 +425,11 @@ static int bpf_trampoline_update_fentry(struct bpf_trampoline *tr, u32 orig_flag return bpf_arch_text_poke(ip, old_t, new_t, old_addr, new_addr); } -static int unregister_fentry(struct bpf_trampoline *tr, u32 orig_flags, - void *old_addr, void *data __maybe_unused) +static void bpf_tramp_image_put(struct bpf_tramp_image *im); + +static int unregister_fentry(struct bpf_trampoline *tr, u32 orig_flags, void *data __maybe_unused) { + void *old_addr = tr->cur_image->image; int ret; if (tr->func.ftrace_managed) @@ -436,13 +437,19 @@ static int unregister_fentry(struct bpf_trampoline *tr, u32 orig_flags, else ret = bpf_trampoline_update_fentry(tr, orig_flags, old_addr, NULL); - return ret; + if (ret) + return ret; + + bpf_tramp_image_put(tr->cur_image); + tr->cur_image = NULL; + return 0; } -static int modify_fentry(struct bpf_trampoline *tr, u32 orig_flags, - void *old_addr, void *new_addr, +static int modify_fentry(struct bpf_trampoline *tr, u32 orig_flags, struct bpf_tramp_image *im, bool lock_direct_mutex, void *data __maybe_unused) { + void *old_addr = tr->cur_image->image; + void *new_addr = im->image; int ret; if (tr->func.ftrace_managed) { @@ -451,12 +458,20 @@ static int modify_fentry(struct bpf_trampoline *tr, u32 orig_flags, ret = bpf_trampoline_update_fentry(tr, orig_flags, old_addr, new_addr); } - return ret; + + if (ret) + return ret; + + bpf_tramp_image_put(tr->cur_image); + tr->cur_image = im; + return 0; } /* first time registering */ -static int register_fentry(struct bpf_trampoline *tr, void *new_addr, void *data __maybe_unused) +static int register_fentry(struct bpf_trampoline *tr, struct bpf_tramp_image *im, + void *data __maybe_unused) { + void *new_addr = im->image; void *ip = tr->func.addr; unsigned long faddr; int ret; @@ -474,7 +489,11 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr, void *data ret = bpf_trampoline_update_fentry(tr, 0, NULL, new_addr); } - return ret; + if (ret) + return ret; + + tr->cur_image = im; + return 0; } static const struct bpf_trampoline_ops trampoline_ops = { @@ -664,9 +683,7 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut return PTR_ERR(tlinks); if (total == 0) { - err = ops->unregister_fentry(tr, orig_flags, tr->cur_image->image, data); - bpf_tramp_image_put(tr->cur_image); - tr->cur_image = NULL; + err = ops->unregister_fentry(tr, orig_flags, data); goto out; } @@ -734,11 +751,10 @@ again: if (tr->cur_image) /* progs already running at this address */ - err = ops->modify_fentry(tr, orig_flags, tr->cur_image->image, - im->image, lock_direct_mutex, data); + err = ops->modify_fentry(tr, orig_flags, im, lock_direct_mutex, data); else /* first time registering */ - err = ops->register_fentry(tr, im->image, data); + err = ops->register_fentry(tr, im, data); #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS if (err == -EAGAIN) { @@ -750,22 +766,16 @@ again: goto again; } #endif - if (err) - goto out_free; - if (tr->cur_image) - bpf_tramp_image_put(tr->cur_image); - tr->cur_image = im; +out_free: + if (err) + bpf_tramp_image_free(im); out: /* If any error happens, restore previous flags */ if (err) tr->flags = orig_flags; kfree(tlinks); return err; - -out_free: - bpf_tramp_image_free(im); - goto out; } static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog) -- cgit v1.2.3 From e6cc9ed677e622265bbd015892be58a1eece6238 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 6 Jun 2026 14:39:32 +0200 Subject: bpf: Add bpf_trampoline_add/remove_prog functions Separate bpf_trampoline_add/remove_prog functions from __bpf_trampoline_link/unlink functions to be able to add/remove trampoline programs without the image being updated in following changes. No functional change is intended. Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20260606123955.345967-8-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/trampoline.c | 108 +++++++++++++++++++++++++++--------------------- 1 file changed, 61 insertions(+), 47 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 1006031ea021..701138ef424a 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -820,41 +820,16 @@ static int bpf_freplace_check_tgt_prog(struct bpf_prog *tgt_prog) return 0; } -static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, - struct bpf_trampoline *tr, - struct bpf_prog *tgt_prog, - const struct bpf_trampoline_ops *ops, - void *data) +static int bpf_trampoline_add_prog(struct bpf_trampoline *tr, + struct bpf_tramp_link *link, + int cnt) { struct bpf_fsession_link *fslink = NULL; enum bpf_tramp_prog_type kind; struct bpf_tramp_link *link_exiting; struct hlist_head *prog_list; - int err = 0; - int cnt = 0, i; kind = bpf_attach_type_to_tramp(link->link.prog); - if (tr->extension_prog) - /* cannot attach fentry/fexit if extension prog is attached. - * cannot overwrite extension prog either. - */ - return -EBUSY; - - for (i = 0; i < BPF_TRAMP_MAX; i++) - cnt += tr->progs_cnt[i]; - - if (kind == BPF_TRAMP_REPLACE) { - /* Cannot attach extension if fentry/fexit are in use. */ - if (cnt) - return -EBUSY; - err = bpf_freplace_check_tgt_prog(tgt_prog); - if (err) - return err; - tr->extension_prog = link->link.prog; - return bpf_arch_text_poke(tr->func.addr, BPF_MOD_NOP, - BPF_MOD_JUMP, NULL, - link->link.prog->bpf_func); - } if (kind == BPF_TRAMP_FSESSION) { prog_list = &tr->progs_hlist[BPF_TRAMP_FENTRY]; cnt++; @@ -882,17 +857,64 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, } else { tr->progs_cnt[kind]++; } - err = bpf_trampoline_update(tr, true /* lock_direct_mutex */, ops, data); - if (err) { - hlist_del_init(&link->tramp_hlist); - if (kind == BPF_TRAMP_FSESSION) { - tr->progs_cnt[BPF_TRAMP_FENTRY]--; - hlist_del_init(&fslink->fexit.tramp_hlist); - tr->progs_cnt[BPF_TRAMP_FEXIT]--; - } else { - tr->progs_cnt[kind]--; - } + return 0; +} + +static void bpf_trampoline_remove_prog(struct bpf_trampoline *tr, + struct bpf_tramp_link *link) +{ + struct bpf_fsession_link *fslink; + enum bpf_tramp_prog_type kind; + + kind = bpf_attach_type_to_tramp(link->link.prog); + if (kind == BPF_TRAMP_FSESSION) { + fslink = container_of(link, struct bpf_fsession_link, link.link); + hlist_del_init(&fslink->fexit.tramp_hlist); + tr->progs_cnt[BPF_TRAMP_FEXIT]--; + kind = BPF_TRAMP_FENTRY; + } + hlist_del_init(&link->tramp_hlist); + tr->progs_cnt[kind]--; +} + +static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, + struct bpf_trampoline *tr, + struct bpf_prog *tgt_prog, + const struct bpf_trampoline_ops *ops, + void *data) +{ + enum bpf_tramp_prog_type kind; + int err = 0; + int cnt = 0, i; + + kind = bpf_attach_type_to_tramp(link->link.prog); + if (tr->extension_prog) + /* cannot attach fentry/fexit if extension prog is attached. + * cannot overwrite extension prog either. + */ + return -EBUSY; + + for (i = 0; i < BPF_TRAMP_MAX; i++) + cnt += tr->progs_cnt[i]; + + if (kind == BPF_TRAMP_REPLACE) { + /* Cannot attach extension if fentry/fexit are in use. */ + if (cnt) + return -EBUSY; + err = bpf_freplace_check_tgt_prog(tgt_prog); + if (err) + return err; + tr->extension_prog = link->link.prog; + return bpf_arch_text_poke(tr->func.addr, BPF_MOD_NOP, + BPF_MOD_JUMP, NULL, + link->link.prog->bpf_func); } + err = bpf_trampoline_add_prog(tr, link, cnt); + if (err) + return err; + err = bpf_trampoline_update(tr, true /* lock_direct_mutex */, ops, data); + if (err) + bpf_trampoline_remove_prog(tr, link); return err; } @@ -927,16 +949,8 @@ static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, guard(mutex)(&tgt_prog->aux->ext_mutex); tgt_prog->aux->is_extended = false; return err; - } else if (kind == BPF_TRAMP_FSESSION) { - struct bpf_fsession_link *fslink = - container_of(link, struct bpf_fsession_link, link.link); - - hlist_del_init(&fslink->fexit.tramp_hlist); - tr->progs_cnt[BPF_TRAMP_FEXIT]--; - kind = BPF_TRAMP_FENTRY; } - hlist_del_init(&link->tramp_hlist); - tr->progs_cnt[kind]--; + bpf_trampoline_remove_prog(tr, link); return bpf_trampoline_update(tr, true /* lock_direct_mutex */, ops, data); } -- cgit v1.2.3 From 65499074efaf574fef6365ac63b785a3ec98913d Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 6 Jun 2026 14:39:33 +0200 Subject: bpf: Add struct bpf_tramp_node object Adding struct bpf_tramp_node to decouple the link out of the trampoline attachment info. At the moment the object for attaching bpf program to the trampoline is 'struct bpf_tramp_link': struct bpf_tramp_link { struct bpf_link link; struct hlist_node tramp_hlist; u64 cookie; } The link holds the bpf_prog pointer and forces one link - one program binding logic. In following changes we want to attach program to multiple trampolines but we want to keep just one bpf_link object. Splitting struct bpf_tramp_link into: struct bpf_tramp_link { struct bpf_link link; struct bpf_tramp_node node; }; struct bpf_tramp_node { struct bpf_link *link; struct hlist_node tramp_hlist; u64 cookie; }; The 'struct bpf_tramp_link' defines standard single trampoline link and 'struct bpf_tramp_node' is the attachment trampoline object with pointer to the bpf_link object. This will allow us to define link for multiple trampolines, like: struct bpf_tracing_multi_link { struct bpf_link link; ... int nodes_cnt; struct bpf_tracing_multi_node nodes[] __counted_by(nodes_cnt); }; Cc: Hengqi Chen Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20260606123955.345967-9-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- arch/arm64/net/bpf_jit_comp.c | 58 ++++++++++---------- arch/loongarch/net/bpf_jit.c | 52 +++++++++--------- arch/powerpc/net/bpf_jit_comp.c | 54 +++++++++---------- arch/riscv/net/bpf_jit_comp64.c | 52 +++++++++--------- arch/s390/net/bpf_jit_comp.c | 44 +++++++-------- arch/x86/net/bpf_jit_comp.c | 54 +++++++++---------- include/linux/bpf.h | 60 +++++++++++++-------- kernel/bpf/bpf_struct_ops.c | 27 +++++----- kernel/bpf/syscall.c | 39 ++++++++------ kernel/bpf/trampoline.c | 115 ++++++++++++++++++++-------------------- net/bpf/bpf_dummy_struct_ops.c | 14 ++--- 11 files changed, 294 insertions(+), 275 deletions(-) (limited to 'kernel/bpf') diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index b4abc3138f37..f6bcc0e1a950 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -2335,24 +2335,24 @@ bool bpf_jit_supports_subprog_tailcalls(void) return true; } -static void invoke_bpf_prog(struct jit_ctx *ctx, struct bpf_tramp_link *l, +static void invoke_bpf_prog(struct jit_ctx *ctx, struct bpf_tramp_node *node, int bargs_off, int retval_off, int run_ctx_off, bool save_ret) { __le32 *branch; u64 enter_prog; u64 exit_prog; - struct bpf_prog *p = l->link.prog; + struct bpf_prog *p = node->link->prog; int cookie_off = offsetof(struct bpf_tramp_run_ctx, bpf_cookie); enter_prog = (u64)bpf_trampoline_enter(p); exit_prog = (u64)bpf_trampoline_exit(p); - if (l->cookie == 0) { + if (node->cookie == 0) { /* if cookie is zero, one instruction is enough to store it */ emit(A64_STR64I(A64_ZR, A64_SP, run_ctx_off + cookie_off), ctx); } else { - emit_a64_mov_i64(A64_R(10), l->cookie, ctx); + emit_a64_mov_i64(A64_R(10), node->cookie, ctx); emit(A64_STR64I(A64_R(10), A64_SP, run_ctx_off + cookie_off), ctx); } @@ -2402,7 +2402,7 @@ static void invoke_bpf_prog(struct jit_ctx *ctx, struct bpf_tramp_link *l, emit_call(exit_prog, ctx); } -static void invoke_bpf_mod_ret(struct jit_ctx *ctx, struct bpf_tramp_links *tl, +static void invoke_bpf_mod_ret(struct jit_ctx *ctx, struct bpf_tramp_nodes *tn, int bargs_off, int retval_off, int run_ctx_off, __le32 **branches) { @@ -2412,8 +2412,8 @@ static void invoke_bpf_mod_ret(struct jit_ctx *ctx, struct bpf_tramp_links *tl, * Set this to 0 to avoid confusing the program. */ emit(A64_STR64I(A64_ZR, A64_SP, retval_off), ctx); - for (i = 0; i < tl->nr_links; i++) { - invoke_bpf_prog(ctx, tl->links[i], bargs_off, retval_off, + for (i = 0; i < tn->nr_nodes; i++) { + invoke_bpf_prog(ctx, tn->nodes[i], bargs_off, retval_off, run_ctx_off, true); /* if (*(u64 *)(sp + retval_off) != 0) * goto do_fexit; @@ -2544,10 +2544,10 @@ static void restore_args(struct jit_ctx *ctx, int bargs_off, int nregs) } } -static bool is_struct_ops_tramp(const struct bpf_tramp_links *fentry_links) +static bool is_struct_ops_tramp(const struct bpf_tramp_nodes *fentry_nodes) { - return fentry_links->nr_links == 1 && - fentry_links->links[0]->link.type == BPF_LINK_TYPE_STRUCT_OPS; + return fentry_nodes->nr_nodes == 1 && + fentry_nodes->nodes[0]->link->type == BPF_LINK_TYPE_STRUCT_OPS; } static void store_func_meta(struct jit_ctx *ctx, u64 func_meta, int func_meta_off) @@ -2568,7 +2568,7 @@ static void store_func_meta(struct jit_ctx *ctx, u64 func_meta, int func_meta_of * */ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, - struct bpf_tramp_links *tlinks, void *func_addr, + struct bpf_tramp_nodes *tnodes, void *func_addr, const struct btf_func_model *m, const struct arg_aux *a, u32 flags) @@ -2584,14 +2584,14 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, int run_ctx_off; int oargs_off; int nfuncargs; - struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY]; - struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT]; - struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN]; + struct bpf_tramp_nodes *fentry = &tnodes[BPF_TRAMP_FENTRY]; + struct bpf_tramp_nodes *fexit = &tnodes[BPF_TRAMP_FEXIT]; + struct bpf_tramp_nodes *fmod_ret = &tnodes[BPF_TRAMP_MODIFY_RETURN]; bool save_ret; __le32 **branches = NULL; bool is_struct_ops = is_struct_ops_tramp(fentry); int cookie_off, cookie_cnt, cookie_bargs_off; - int fsession_cnt = bpf_fsession_cnt(tlinks); + int fsession_cnt = bpf_fsession_cnt(tnodes); u64 func_meta; /* trampoline stack layout: @@ -2637,7 +2637,7 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, cookie_off = stack_size; /* room for session cookies */ - cookie_cnt = bpf_fsession_cookie_cnt(tlinks); + cookie_cnt = bpf_fsession_cookie_cnt(tnodes); stack_size += cookie_cnt * 8; ip_off = stack_size; @@ -2734,20 +2734,20 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, } cookie_bargs_off = (bargs_off - cookie_off) / 8; - for (i = 0; i < fentry->nr_links; i++) { - if (bpf_prog_calls_session_cookie(fentry->links[i])) { + for (i = 0; i < fentry->nr_nodes; i++) { + if (bpf_prog_calls_session_cookie(fentry->nodes[i])) { u64 meta = func_meta | (cookie_bargs_off << BPF_TRAMP_COOKIE_INDEX_SHIFT); store_func_meta(ctx, meta, func_meta_off); cookie_bargs_off--; } - invoke_bpf_prog(ctx, fentry->links[i], bargs_off, + invoke_bpf_prog(ctx, fentry->nodes[i], bargs_off, retval_off, run_ctx_off, flags & BPF_TRAMP_F_RET_FENTRY_RET); } - if (fmod_ret->nr_links) { - branches = kcalloc(fmod_ret->nr_links, sizeof(__le32 *), + if (fmod_ret->nr_nodes) { + branches = kcalloc(fmod_ret->nr_nodes, sizeof(__le32 *), GFP_KERNEL); if (!branches) return -ENOMEM; @@ -2771,7 +2771,7 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, } /* update the branches saved in invoke_bpf_mod_ret with cbnz */ - for (i = 0; i < fmod_ret->nr_links && ctx->image != NULL; i++) { + for (i = 0; i < fmod_ret->nr_nodes && ctx->image != NULL; i++) { int offset = &ctx->image[ctx->idx] - branches[i]; *branches[i] = cpu_to_le32(A64_CBNZ(1, A64_R(10), offset)); } @@ -2782,14 +2782,14 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, store_func_meta(ctx, func_meta, func_meta_off); cookie_bargs_off = (bargs_off - cookie_off) / 8; - for (i = 0; i < fexit->nr_links; i++) { - if (bpf_prog_calls_session_cookie(fexit->links[i])) { + for (i = 0; i < fexit->nr_nodes; i++) { + if (bpf_prog_calls_session_cookie(fexit->nodes[i])) { u64 meta = func_meta | (cookie_bargs_off << BPF_TRAMP_COOKIE_INDEX_SHIFT); store_func_meta(ctx, meta, func_meta_off); cookie_bargs_off--; } - invoke_bpf_prog(ctx, fexit->links[i], bargs_off, retval_off, + invoke_bpf_prog(ctx, fexit->nodes[i], bargs_off, retval_off, run_ctx_off, false); } @@ -2847,7 +2847,7 @@ bool bpf_jit_supports_fsession(void) } int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, - struct bpf_tramp_links *tlinks, void *func_addr) + struct bpf_tramp_nodes *tnodes, void *func_addr) { struct jit_ctx ctx = { .image = NULL, @@ -2861,7 +2861,7 @@ int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, if (ret < 0) return ret; - ret = prepare_trampoline(&ctx, &im, tlinks, func_addr, m, &aaux, flags); + ret = prepare_trampoline(&ctx, &im, tnodes, func_addr, m, &aaux, flags); if (ret < 0) return ret; @@ -2885,7 +2885,7 @@ int arch_protect_bpf_trampoline(void *image, unsigned int size) int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *ro_image, void *ro_image_end, const struct btf_func_model *m, - u32 flags, struct bpf_tramp_links *tlinks, + u32 flags, struct bpf_tramp_nodes *tnodes, void *func_addr) { u32 size = ro_image_end - ro_image; @@ -2912,7 +2912,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *ro_image, ret = calc_arg_aux(m, &aaux); if (ret) goto out; - ret = prepare_trampoline(&ctx, im, tlinks, func_addr, m, &aaux, flags); + ret = prepare_trampoline(&ctx, im, tnodes, func_addr, m, &aaux, flags); if (ret > 0 && validate_code(&ctx) < 0) { ret = -EINVAL; diff --git a/arch/loongarch/net/bpf_jit.c b/arch/loongarch/net/bpf_jit.c index 24913dc7f4e8..058ffbbaad85 100644 --- a/arch/loongarch/net/bpf_jit.c +++ b/arch/loongarch/net/bpf_jit.c @@ -1674,17 +1674,17 @@ static void restore_stk_args(struct jit_ctx *ctx, int nr_stk_args, int args_off, } } -static int invoke_bpf_prog(struct jit_ctx *ctx, struct bpf_tramp_link *l, +static int invoke_bpf_prog(struct jit_ctx *ctx, struct bpf_tramp_node *n, int args_off, int retval_off, int run_ctx_off, bool save_ret) { int ret; u32 *branch; - struct bpf_prog *p = l->link.prog; + struct bpf_prog *p = n->link->prog; int cookie_off = offsetof(struct bpf_tramp_run_ctx, bpf_cookie); - if (l->cookie) + if (n->cookie) emit_store_stack_imm64(ctx, LOONGARCH_GPR_T1, - -run_ctx_off + cookie_off, l->cookie); + -run_ctx_off + cookie_off, n->cookie); else emit_insn(ctx, std, LOONGARCH_GPR_ZERO, LOONGARCH_GPR_FP, -run_ctx_off + cookie_off); @@ -1737,22 +1737,22 @@ static int invoke_bpf_prog(struct jit_ctx *ctx, struct bpf_tramp_link *l, return ret; } -static int invoke_bpf(struct jit_ctx *ctx, struct bpf_tramp_links *tl, +static int invoke_bpf(struct jit_ctx *ctx, struct bpf_tramp_nodes *tn, int args_off, int retval_off, int run_ctx_off, int func_meta_off, bool save_ret, u64 func_meta, int cookie_off) { int i, cur_cookie = (cookie_off - args_off) / 8; - for (i = 0; i < tl->nr_links; i++) { + for (i = 0; i < tn->nr_nodes; i++) { int err; - if (bpf_prog_calls_session_cookie(tl->links[i])) { + if (bpf_prog_calls_session_cookie(tn->nodes[i])) { u64 meta = func_meta | ((u64)cur_cookie << BPF_TRAMP_COOKIE_INDEX_SHIFT); emit_store_stack_imm64(ctx, LOONGARCH_GPR_T1, -func_meta_off, meta); cur_cookie--; } - err = invoke_bpf_prog(ctx, tl->links[i], args_off, retval_off, run_ctx_off, save_ret); + err = invoke_bpf_prog(ctx, tn->nodes[i], args_off, retval_off, run_ctx_off, save_ret); if (err) return err; } @@ -1807,7 +1807,7 @@ static void sign_extend(struct jit_ctx *ctx, int rd, int rj, u8 size, bool sign) } static int __arch_prepare_bpf_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, - const struct btf_func_model *m, struct bpf_tramp_links *tlinks, + const struct btf_func_model *m, struct bpf_tramp_nodes *tnodes, void *func_addr, u32 flags) { int i, ret, save_ret; @@ -1817,9 +1817,9 @@ static int __arch_prepare_bpf_trampoline(struct jit_ctx *ctx, struct bpf_tramp_i unsigned long long func_meta; bool is_struct_ops = flags & BPF_TRAMP_F_INDIRECT; void *orig_call = func_addr; - struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY]; - struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT]; - struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN]; + struct bpf_tramp_nodes *fentry = &tnodes[BPF_TRAMP_FENTRY]; + struct bpf_tramp_nodes *fexit = &tnodes[BPF_TRAMP_FEXIT]; + struct bpf_tramp_nodes *fmod_ret = &tnodes[BPF_TRAMP_MODIFY_RETURN]; u32 **branches = NULL; /* @@ -1898,7 +1898,7 @@ static int __arch_prepare_bpf_trampoline(struct jit_ctx *ctx, struct bpf_tramp_i ip_off = stack_size; } - cookie_cnt = bpf_fsession_cookie_cnt(tlinks); + cookie_cnt = bpf_fsession_cookie_cnt(tnodes); /* Room for session cookies */ stack_size += cookie_cnt * 8; @@ -1969,7 +1969,7 @@ static int __arch_prepare_bpf_trampoline(struct jit_ctx *ctx, struct bpf_tramp_i store_args(ctx, nr_arg_slots, args_off); - if (bpf_fsession_cnt(tlinks)) { + if (bpf_fsession_cnt(tnodes)) { /* clear all session cookies' value */ for (i = 0; i < cookie_cnt; i++) emit_insn(ctx, std, LOONGARCH_GPR_ZERO, LOONGARCH_GPR_FP, -cookie_off + 8 * i); @@ -1994,20 +1994,20 @@ static int __arch_prepare_bpf_trampoline(struct jit_ctx *ctx, struct bpf_tramp_i return ret; } - if (fentry->nr_links) { + if (fentry->nr_nodes) { ret = invoke_bpf(ctx, fentry, args_off, retval_off, run_ctx_off, func_meta_off, flags & BPF_TRAMP_F_RET_FENTRY_RET, func_meta, cookie_off); if (ret) return ret; } - if (fmod_ret->nr_links) { - branches = kcalloc(fmod_ret->nr_links, sizeof(u32 *), GFP_KERNEL); + if (fmod_ret->nr_nodes) { + branches = kcalloc(fmod_ret->nr_nodes, sizeof(u32 *), GFP_KERNEL); if (!branches) return -ENOMEM; emit_insn(ctx, std, LOONGARCH_GPR_ZERO, LOONGARCH_GPR_FP, -retval_off); - for (i = 0; i < fmod_ret->nr_links; i++) { - ret = invoke_bpf_prog(ctx, fmod_ret->links[i], + for (i = 0; i < fmod_ret->nr_nodes; i++) { + ret = invoke_bpf_prog(ctx, fmod_ret->nodes[i], args_off, retval_off, run_ctx_off, true); if (ret) goto out; @@ -2035,17 +2035,17 @@ static int __arch_prepare_bpf_trampoline(struct jit_ctx *ctx, struct bpf_tramp_i emit_insn(ctx, nop); } - for (i = 0; ctx->image && i < fmod_ret->nr_links; i++) { + for (i = 0; ctx->image && i < fmod_ret->nr_nodes; i++) { int offset = (void *)(&ctx->image[ctx->idx]) - (void *)branches[i]; *branches[i] = larch_insn_gen_bne(LOONGARCH_GPR_T1, LOONGARCH_GPR_ZERO, offset); } /* Set "is_return" flag for fsession */ func_meta |= (1ULL << BPF_TRAMP_IS_RETURN_SHIFT); - if (bpf_fsession_cnt(tlinks)) + if (bpf_fsession_cnt(tnodes)) emit_store_stack_imm64(ctx, LOONGARCH_GPR_T1, -func_meta_off, func_meta); - if (fexit->nr_links) { + if (fexit->nr_nodes) { ret = invoke_bpf(ctx, fexit, args_off, retval_off, run_ctx_off, func_meta_off, false, func_meta, cookie_off); if (ret) @@ -2115,7 +2115,7 @@ out: int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *ro_image, void *ro_image_end, const struct btf_func_model *m, - u32 flags, struct bpf_tramp_links *tlinks, void *func_addr) + u32 flags, struct bpf_tramp_nodes *tnodes, void *func_addr) { int ret, size; void *image, *tmp; @@ -2131,7 +2131,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *ro_image, ctx.idx = 0; jit_fill_hole(image, (unsigned int)(ro_image_end - ro_image)); - ret = __arch_prepare_bpf_trampoline(&ctx, im, m, tlinks, func_addr, flags); + ret = __arch_prepare_bpf_trampoline(&ctx, im, m, tnodes, func_addr, flags); if (ret < 0) goto out; @@ -2152,7 +2152,7 @@ out: } int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, - struct bpf_tramp_links *tlinks, void *func_addr) + struct bpf_tramp_nodes *tnodes, void *func_addr) { int ret; struct jit_ctx ctx; @@ -2161,7 +2161,7 @@ int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, ctx.image = NULL; ctx.idx = 0; - ret = __arch_prepare_bpf_trampoline(&ctx, &im, m, tlinks, func_addr, flags); + ret = __arch_prepare_bpf_trampoline(&ctx, &im, m, tnodes, func_addr, flags); return ret < 0 ? ret : ret * LOONGARCH_INSN_SIZE; } diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index 53ab97ad6074..6351a187ca61 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -597,22 +597,22 @@ int arch_protect_bpf_trampoline(void *image, unsigned int size) } static int invoke_bpf_prog(u32 *image, u32 *ro_image, struct codegen_context *ctx, - struct bpf_tramp_link *l, int regs_off, int retval_off, + struct bpf_tramp_node *n, int regs_off, int retval_off, int run_ctx_off, bool save_ret) { - struct bpf_prog *p = l->link.prog; + struct bpf_prog *p = n->link->prog; ppc_inst_t branch_insn; u32 jmp_idx; int ret = 0; /* Save cookie */ if (IS_ENABLED(CONFIG_PPC64)) { - PPC_LI64(_R3, l->cookie); + PPC_LI64(_R3, n->cookie); EMIT(PPC_RAW_STD(_R3, _R1, run_ctx_off + offsetof(struct bpf_tramp_run_ctx, bpf_cookie))); } else { - PPC_LI32(_R3, l->cookie >> 32); - PPC_LI32(_R4, l->cookie); + PPC_LI32(_R3, n->cookie >> 32); + PPC_LI32(_R4, n->cookie); EMIT(PPC_RAW_STW(_R3, _R1, run_ctx_off + offsetof(struct bpf_tramp_run_ctx, bpf_cookie))); EMIT(PPC_RAW_STW(_R4, _R1, @@ -679,7 +679,7 @@ static int invoke_bpf_prog(u32 *image, u32 *ro_image, struct codegen_context *ct } static int invoke_bpf_mod_ret(u32 *image, u32 *ro_image, struct codegen_context *ctx, - struct bpf_tramp_links *tl, int regs_off, int retval_off, + struct bpf_tramp_nodes *tn, int regs_off, int retval_off, int run_ctx_off, u32 *branches) { int i; @@ -690,8 +690,8 @@ static int invoke_bpf_mod_ret(u32 *image, u32 *ro_image, struct codegen_context */ EMIT(PPC_RAW_LI(_R3, 0)); EMIT(PPC_RAW_STL(_R3, _R1, retval_off)); - for (i = 0; i < tl->nr_links; i++) { - if (invoke_bpf_prog(image, ro_image, ctx, tl->links[i], regs_off, retval_off, + for (i = 0; i < tn->nr_nodes; i++) { + if (invoke_bpf_prog(image, ro_image, ctx, tn->nodes[i], regs_off, retval_off, run_ctx_off, true)) return -EINVAL; @@ -807,18 +807,18 @@ static void bpf_trampoline_restore_args_stack(u32 *image, struct codegen_context static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_image, void *rw_image_end, void *ro_image, const struct btf_func_model *m, u32 flags, - struct bpf_tramp_links *tlinks, + struct bpf_tramp_nodes *tnodes, void *func_addr) { int regs_off, func_meta_off, ip_off, run_ctx_off, retval_off; int nvr_off, alt_lr_off, r4_off = 0; - struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN]; - struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY]; - struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT]; + struct bpf_tramp_nodes *fmod_ret = &tnodes[BPF_TRAMP_MODIFY_RETURN]; + struct bpf_tramp_nodes *fentry = &tnodes[BPF_TRAMP_FENTRY]; + struct bpf_tramp_nodes *fexit = &tnodes[BPF_TRAMP_FEXIT]; int i, ret, nr_regs, retaddr_off, bpf_frame_size = 0; struct codegen_context codegen_ctx, *ctx; int cookie_off, cookie_cnt, cookie_ctx_off; - int fsession_cnt = bpf_fsession_cnt(tlinks); + int fsession_cnt = bpf_fsession_cnt(tnodes); u64 func_meta; u32 *image = (u32 *)rw_image; ppc_inst_t branch_insn; @@ -893,7 +893,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im /* room for session cookies */ cookie_off = bpf_frame_size; - cookie_cnt = bpf_fsession_cookie_cnt(tlinks); + cookie_cnt = bpf_fsession_cookie_cnt(tnodes); bpf_frame_size += cookie_cnt * 8; /* Room for IP address argument */ @@ -1030,21 +1030,21 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im cookie_ctx_off = (regs_off - cookie_off) / 8; - for (i = 0; i < fentry->nr_links; i++) { - if (bpf_prog_calls_session_cookie(fentry->links[i])) { + for (i = 0; i < fentry->nr_nodes; i++) { + if (bpf_prog_calls_session_cookie(fentry->nodes[i])) { u64 meta = func_meta | (cookie_ctx_off << BPF_TRAMP_COOKIE_INDEX_SHIFT); store_func_meta(image, ctx, meta, func_meta_off); cookie_ctx_off--; } - if (invoke_bpf_prog(image, ro_image, ctx, fentry->links[i], regs_off, retval_off, + if (invoke_bpf_prog(image, ro_image, ctx, fentry->nodes[i], regs_off, retval_off, run_ctx_off, flags & BPF_TRAMP_F_RET_FENTRY_RET)) return -EINVAL; } - if (fmod_ret->nr_links) { - branches = kcalloc(fmod_ret->nr_links, sizeof(u32), GFP_KERNEL); + if (fmod_ret->nr_nodes) { + branches = kcalloc(fmod_ret->nr_nodes, sizeof(u32), GFP_KERNEL); if (!branches) return -ENOMEM; @@ -1093,7 +1093,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im } /* Update branches saved in invoke_bpf_mod_ret with address of do_fexit */ - for (i = 0; i < fmod_ret->nr_links && image; i++) { + for (i = 0; i < fmod_ret->nr_nodes && image; i++) { if (create_cond_branch(&branch_insn, &image[branches[i]], (unsigned long)&image[ctx->idx], COND_NE << 16)) { ret = -EINVAL; @@ -1110,15 +1110,15 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im cookie_ctx_off = (regs_off - cookie_off) / 8; - for (i = 0; i < fexit->nr_links; i++) { - if (bpf_prog_calls_session_cookie(fexit->links[i])) { + for (i = 0; i < fexit->nr_nodes; i++) { + if (bpf_prog_calls_session_cookie(fexit->nodes[i])) { u64 meta = func_meta | (cookie_ctx_off << BPF_TRAMP_COOKIE_INDEX_SHIFT); store_func_meta(image, ctx, meta, func_meta_off); cookie_ctx_off--; } - if (invoke_bpf_prog(image, ro_image, ctx, fexit->links[i], regs_off, retval_off, + if (invoke_bpf_prog(image, ro_image, ctx, fexit->nodes[i], regs_off, retval_off, run_ctx_off, false)) { ret = -EINVAL; goto cleanup; @@ -1185,18 +1185,18 @@ cleanup: } int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, - struct bpf_tramp_links *tlinks, void *func_addr) + struct bpf_tramp_nodes *tnodes, void *func_addr) { struct bpf_tramp_image im; int ret; - ret = __arch_prepare_bpf_trampoline(&im, NULL, NULL, NULL, m, flags, tlinks, func_addr); + ret = __arch_prepare_bpf_trampoline(&im, NULL, NULL, NULL, m, flags, tnodes, func_addr); return ret; } int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end, const struct btf_func_model *m, u32 flags, - struct bpf_tramp_links *tlinks, + struct bpf_tramp_nodes *tnodes, void *func_addr) { u32 size = image_end - image; @@ -1212,7 +1212,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i return -ENOMEM; ret = __arch_prepare_bpf_trampoline(im, rw_image, rw_image + size, image, m, - flags, tlinks, func_addr); + flags, tnodes, func_addr); if (ret < 0) goto out; diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index e2c70c70cca8..c03c1de16b79 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -934,15 +934,15 @@ static void emit_store_stack_imm64(u8 reg, int stack_off, u64 imm64, emit_sd(RV_REG_FP, stack_off, reg, ctx); } -static int invoke_bpf_prog(struct bpf_tramp_link *l, int args_off, int retval_off, +static int invoke_bpf_prog(struct bpf_tramp_node *node, int args_off, int retval_off, int run_ctx_off, bool save_ret, struct rv_jit_context *ctx) { int ret, branch_off; - struct bpf_prog *p = l->link.prog; + struct bpf_prog *p = node->link->prog; int cookie_off = offsetof(struct bpf_tramp_run_ctx, bpf_cookie); - if (l->cookie) - emit_store_stack_imm64(RV_REG_T1, -run_ctx_off + cookie_off, l->cookie, ctx); + if (node->cookie) + emit_store_stack_imm64(RV_REG_T1, -run_ctx_off + cookie_off, node->cookie, ctx); else emit_sd(RV_REG_FP, -run_ctx_off + cookie_off, RV_REG_ZERO, ctx); @@ -996,22 +996,22 @@ static int invoke_bpf_prog(struct bpf_tramp_link *l, int args_off, int retval_of return ret; } -static int invoke_bpf(struct bpf_tramp_links *tl, int args_off, int retval_off, +static int invoke_bpf(struct bpf_tramp_nodes *tn, int args_off, int retval_off, int run_ctx_off, int func_meta_off, bool save_ret, u64 func_meta, int cookie_off, struct rv_jit_context *ctx) { int i, cur_cookie = (cookie_off - args_off) / 8; - for (i = 0; i < tl->nr_links; i++) { + for (i = 0; i < tn->nr_nodes; i++) { int err; - if (bpf_prog_calls_session_cookie(tl->links[i])) { + if (bpf_prog_calls_session_cookie(tn->nodes[i])) { u64 meta = func_meta | ((u64)cur_cookie << BPF_TRAMP_COOKIE_INDEX_SHIFT); emit_store_stack_imm64(RV_REG_T1, -func_meta_off, meta, ctx); cur_cookie--; } - err = invoke_bpf_prog(tl->links[i], args_off, retval_off, run_ctx_off, + err = invoke_bpf_prog(tn->nodes[i], args_off, retval_off, run_ctx_off, save_ret, ctx); if (err) return err; @@ -1021,7 +1021,7 @@ static int invoke_bpf(struct bpf_tramp_links *tl, int args_off, int retval_off, static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, const struct btf_func_model *m, - struct bpf_tramp_links *tlinks, + struct bpf_tramp_nodes *tnodes, void *func_addr, u32 flags, struct rv_jit_context *ctx) { @@ -1030,9 +1030,9 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, int stack_size = 0, nr_arg_slots = 0; int retval_off, args_off, func_meta_off, ip_off, run_ctx_off, sreg_off, stk_arg_off; int cookie_off, cookie_cnt; - struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY]; - struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT]; - struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN]; + struct bpf_tramp_nodes *fentry = &tnodes[BPF_TRAMP_FENTRY]; + struct bpf_tramp_nodes *fexit = &tnodes[BPF_TRAMP_FEXIT]; + struct bpf_tramp_nodes *fmod_ret = &tnodes[BPF_TRAMP_MODIFY_RETURN]; bool is_struct_ops = flags & BPF_TRAMP_F_INDIRECT; void *orig_call = func_addr; bool save_ret; @@ -1115,7 +1115,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, ip_off = stack_size; } - cookie_cnt = bpf_fsession_cookie_cnt(tlinks); + cookie_cnt = bpf_fsession_cookie_cnt(tnodes); /* room for session cookies */ stack_size += cookie_cnt * 8; cookie_off = stack_size; @@ -1172,7 +1172,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, store_args(nr_arg_slots, args_off, ctx); - if (bpf_fsession_cnt(tlinks)) { + if (bpf_fsession_cnt(tnodes)) { /* clear all session cookies' value */ for (i = 0; i < cookie_cnt; i++) emit_sd(RV_REG_FP, -cookie_off + 8 * i, RV_REG_ZERO, ctx); @@ -1187,22 +1187,22 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, return ret; } - if (fentry->nr_links) { + if (fentry->nr_nodes) { ret = invoke_bpf(fentry, args_off, retval_off, run_ctx_off, func_meta_off, flags & BPF_TRAMP_F_RET_FENTRY_RET, func_meta, cookie_off, ctx); if (ret) return ret; } - if (fmod_ret->nr_links) { - branches_off = kzalloc_objs(int, fmod_ret->nr_links); + if (fmod_ret->nr_nodes) { + branches_off = kzalloc_objs(int, fmod_ret->nr_nodes); if (!branches_off) return -ENOMEM; /* cleanup to avoid garbage return value confusion */ emit_sd(RV_REG_FP, -retval_off, RV_REG_ZERO, ctx); - for (i = 0; i < fmod_ret->nr_links; i++) { - ret = invoke_bpf_prog(fmod_ret->links[i], args_off, retval_off, + for (i = 0; i < fmod_ret->nr_nodes; i++) { + ret = invoke_bpf_prog(fmod_ret->nodes[i], args_off, retval_off, run_ctx_off, true, ctx); if (ret) goto out; @@ -1230,7 +1230,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, } /* update branches saved in invoke_bpf_mod_ret with bnez */ - for (i = 0; ctx->insns && i < fmod_ret->nr_links; i++) { + for (i = 0; ctx->insns && i < fmod_ret->nr_nodes; i++) { offset = ninsns_rvoff(ctx->ninsns - branches_off[i]); insn = rv_bne(RV_REG_T1, RV_REG_ZERO, offset >> 1); *(u32 *)(ctx->insns + branches_off[i]) = insn; @@ -1238,10 +1238,10 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, /* set "is_return" flag for fsession */ func_meta |= (1ULL << BPF_TRAMP_IS_RETURN_SHIFT); - if (bpf_fsession_cnt(tlinks)) + if (bpf_fsession_cnt(tnodes)) emit_store_stack_imm64(RV_REG_T1, -func_meta_off, func_meta, ctx); - if (fexit->nr_links) { + if (fexit->nr_nodes) { ret = invoke_bpf(fexit, args_off, retval_off, run_ctx_off, func_meta_off, false, func_meta, cookie_off, ctx); if (ret) @@ -1305,7 +1305,7 @@ out: } int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, - struct bpf_tramp_links *tlinks, void *func_addr) + struct bpf_tramp_nodes *tnodes, void *func_addr) { struct bpf_tramp_image im; struct rv_jit_context ctx; @@ -1314,7 +1314,7 @@ int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, ctx.ninsns = 0; ctx.insns = NULL; ctx.ro_insns = NULL; - ret = __arch_prepare_bpf_trampoline(&im, m, tlinks, func_addr, flags, &ctx); + ret = __arch_prepare_bpf_trampoline(&im, m, tnodes, func_addr, flags, &ctx); return ret < 0 ? ret : ninsns_rvoff(ctx.ninsns); } @@ -1331,7 +1331,7 @@ void arch_free_bpf_trampoline(void *image, unsigned int size) int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *ro_image, void *ro_image_end, const struct btf_func_model *m, - u32 flags, struct bpf_tramp_links *tlinks, + u32 flags, struct bpf_tramp_nodes *tnodes, void *func_addr) { int ret; @@ -1346,7 +1346,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *ro_image, ctx.ninsns = 0; ctx.insns = image; ctx.ro_insns = ro_image; - ret = __arch_prepare_bpf_trampoline(im, m, tlinks, func_addr, flags, &ctx); + ret = __arch_prepare_bpf_trampoline(im, m, tnodes, func_addr, flags, &ctx); if (ret < 0) goto out; diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 14eaaa5b2185..31749c0362ca 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -2537,19 +2537,19 @@ static void emit_store_stack_imm64(struct bpf_jit *jit, int tmp_reg, int stack_o static int invoke_bpf_prog(struct bpf_tramp_jit *tjit, const struct btf_func_model *m, - struct bpf_tramp_link *tlink, bool save_ret) + struct bpf_tramp_node *node, bool save_ret) { struct bpf_jit *jit = &tjit->common; int cookie_off = tjit->run_ctx_off + offsetof(struct bpf_tramp_run_ctx, bpf_cookie); - struct bpf_prog *p = tlink->link.prog; + struct bpf_prog *p = node->link->prog; int patch; /* - * run_ctx.cookie = tlink->cookie; + * run_ctx.cookie = node->cookie; */ - emit_store_stack_imm64(jit, REG_W0, cookie_off, tlink->cookie); + emit_store_stack_imm64(jit, REG_W0, cookie_off, node->cookie); /* * if ((start = __bpf_prog_enter(p, &run_ctx)) == 0) @@ -2609,20 +2609,20 @@ static int invoke_bpf_prog(struct bpf_tramp_jit *tjit, static int invoke_bpf(struct bpf_tramp_jit *tjit, const struct btf_func_model *m, - struct bpf_tramp_links *tl, bool save_ret, + struct bpf_tramp_nodes *tn, bool save_ret, u64 func_meta, int cookie_off) { int i, cur_cookie = (tjit->bpf_args_off - cookie_off) / sizeof(u64); struct bpf_jit *jit = &tjit->common; - for (i = 0; i < tl->nr_links; i++) { - if (bpf_prog_calls_session_cookie(tl->links[i])) { + for (i = 0; i < tn->nr_nodes; i++) { + if (bpf_prog_calls_session_cookie(tn->nodes[i])) { u64 meta = func_meta | ((u64)cur_cookie << BPF_TRAMP_COOKIE_INDEX_SHIFT); emit_store_stack_imm64(jit, REG_0, tjit->func_meta_off, meta); cur_cookie--; } - if (invoke_bpf_prog(tjit, m, tl->links[i], save_ret)) + if (invoke_bpf_prog(tjit, m, tn->nodes[i], save_ret)) return -EINVAL; } @@ -2651,12 +2651,12 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, struct bpf_tramp_jit *tjit, const struct btf_func_model *m, u32 flags, - struct bpf_tramp_links *tlinks, + struct bpf_tramp_nodes *tnodes, void *func_addr) { - struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN]; - struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY]; - struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT]; + struct bpf_tramp_nodes *fmod_ret = &tnodes[BPF_TRAMP_MODIFY_RETURN]; + struct bpf_tramp_nodes *fentry = &tnodes[BPF_TRAMP_FENTRY]; + struct bpf_tramp_nodes *fexit = &tnodes[BPF_TRAMP_FEXIT]; int nr_bpf_args, nr_reg_args, nr_stack_args; int cookie_cnt, cookie_off, fsession_cnt; struct bpf_jit *jit = &tjit->common; @@ -2693,8 +2693,8 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, return -ENOTSUPP; } - cookie_cnt = bpf_fsession_cookie_cnt(tlinks); - fsession_cnt = bpf_fsession_cnt(tlinks); + cookie_cnt = bpf_fsession_cookie_cnt(tnodes); + fsession_cnt = bpf_fsession_cnt(tnodes); /* * Calculate the stack layout. @@ -2829,7 +2829,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, func_meta, cookie_off)) return -EINVAL; - if (fmod_ret->nr_links) { + if (fmod_ret->nr_nodes) { /* * retval = 0; */ @@ -2838,8 +2838,8 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, _EMIT6(0xd707f000 | tjit->retval_off, 0xf000 | tjit->retval_off); - for (i = 0; i < fmod_ret->nr_links; i++) { - if (invoke_bpf_prog(tjit, m, fmod_ret->links[i], true)) + for (i = 0; i < fmod_ret->nr_nodes; i++) { + if (invoke_bpf_prog(tjit, m, fmod_ret->nodes[i], true)) return -EINVAL; /* @@ -2964,7 +2964,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, } int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, - struct bpf_tramp_links *tlinks, void *orig_call) + struct bpf_tramp_nodes *tnodes, void *orig_call) { struct bpf_tramp_image im; struct bpf_tramp_jit tjit; @@ -2973,14 +2973,14 @@ int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, memset(&tjit, 0, sizeof(tjit)); ret = __arch_prepare_bpf_trampoline(&im, &tjit, m, flags, - tlinks, orig_call); + tnodes, orig_call); return ret < 0 ? ret : tjit.common.prg; } int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end, const struct btf_func_model *m, - u32 flags, struct bpf_tramp_links *tlinks, + u32 flags, struct bpf_tramp_nodes *tnodes, void *func_addr) { struct bpf_tramp_jit tjit; @@ -2989,7 +2989,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, /* Compute offsets, check whether the code fits. */ memset(&tjit, 0, sizeof(tjit)); ret = __arch_prepare_bpf_trampoline(im, &tjit, m, flags, - tlinks, func_addr); + tnodes, func_addr); if (ret < 0) return ret; @@ -3003,7 +3003,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, tjit.common.prg = 0; tjit.common.prg_buf = image; ret = __arch_prepare_bpf_trampoline(im, &tjit, m, flags, - tlinks, func_addr); + tnodes, func_addr); return ret < 0 ? ret : tjit.common.prg; } diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index a0c541a441cf..054e043ffcd2 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -3104,15 +3104,15 @@ static void restore_regs(const struct btf_func_model *m, u8 **prog, } static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, - struct bpf_tramp_link *l, int stack_size, + struct bpf_tramp_node *node, int stack_size, int run_ctx_off, bool save_ret, void *image, void *rw_image) { u8 *prog = *pprog; u8 *jmp_insn; int ctx_cookie_off = offsetof(struct bpf_tramp_run_ctx, bpf_cookie); - struct bpf_prog *p = l->link.prog; - u64 cookie = l->cookie; + struct bpf_prog *p = node->link->prog; + u64 cookie = node->cookie; /* mov rdi, cookie */ emit_mov_imm64(&prog, BPF_REG_1, (long) cookie >> 32, (u32) (long) cookie); @@ -3219,7 +3219,7 @@ static int emit_cond_near_jump(u8 **pprog, void *func, void *ip, u8 jmp_cond) } static int invoke_bpf(const struct btf_func_model *m, u8 **pprog, - struct bpf_tramp_links *tl, int stack_size, + struct bpf_tramp_nodes *tl, int stack_size, int run_ctx_off, int func_meta_off, bool save_ret, void *image, void *rw_image, u64 func_meta, int cookie_off) @@ -3227,13 +3227,13 @@ static int invoke_bpf(const struct btf_func_model *m, u8 **pprog, int i, cur_cookie = (cookie_off - stack_size) / 8; u8 *prog = *pprog; - for (i = 0; i < tl->nr_links; i++) { - if (tl->links[i]->link.prog->call_session_cookie) { + for (i = 0; i < tl->nr_nodes; i++) { + if (tl->nodes[i]->link->prog->call_session_cookie) { emit_store_stack_imm64(&prog, BPF_REG_0, -func_meta_off, func_meta | (cur_cookie << BPF_TRAMP_COOKIE_INDEX_SHIFT)); cur_cookie--; } - if (invoke_bpf_prog(m, &prog, tl->links[i], stack_size, + if (invoke_bpf_prog(m, &prog, tl->nodes[i], stack_size, run_ctx_off, save_ret, image, rw_image)) return -EINVAL; } @@ -3242,7 +3242,7 @@ static int invoke_bpf(const struct btf_func_model *m, u8 **pprog, } static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog, - struct bpf_tramp_links *tl, int stack_size, + struct bpf_tramp_nodes *tl, int stack_size, int run_ctx_off, u8 **branches, void *image, void *rw_image) { @@ -3254,8 +3254,8 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog, */ emit_mov_imm32(&prog, false, BPF_REG_0, 0); emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); - for (i = 0; i < tl->nr_links; i++) { - if (invoke_bpf_prog(m, &prog, tl->links[i], stack_size, run_ctx_off, true, + for (i = 0; i < tl->nr_nodes; i++) { + if (invoke_bpf_prog(m, &prog, tl->nodes[i], stack_size, run_ctx_off, true, image, rw_image)) return -EINVAL; @@ -3346,14 +3346,14 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog, static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_image, void *rw_image_end, void *image, const struct btf_func_model *m, u32 flags, - struct bpf_tramp_links *tlinks, + struct bpf_tramp_nodes *tnodes, void *func_addr) { int i, ret, nr_regs = m->nr_args, stack_size = 0; int regs_off, func_meta_off, ip_off, run_ctx_off, arg_stack_off, rbx_off; - struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY]; - struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT]; - struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN]; + struct bpf_tramp_nodes *fentry = &tnodes[BPF_TRAMP_FENTRY]; + struct bpf_tramp_nodes *fexit = &tnodes[BPF_TRAMP_FEXIT]; + struct bpf_tramp_nodes *fmod_ret = &tnodes[BPF_TRAMP_MODIFY_RETURN]; void *orig_call = func_addr; int cookie_off, cookie_cnt; u8 **branches = NULL; @@ -3425,7 +3425,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im ip_off = stack_size; - cookie_cnt = bpf_fsession_cookie_cnt(tlinks); + cookie_cnt = bpf_fsession_cookie_cnt(tnodes); /* room for session cookies */ stack_size += cookie_cnt * 8; cookie_off = stack_size; @@ -3518,7 +3518,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im } } - if (bpf_fsession_cnt(tlinks)) { + if (bpf_fsession_cnt(tnodes)) { /* clear all the session cookies' value */ for (int i = 0; i < cookie_cnt; i++) emit_store_stack_imm64(&prog, BPF_REG_0, -cookie_off + 8 * i, 0); @@ -3526,15 +3526,15 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im emit_store_stack_imm64(&prog, BPF_REG_0, -8, 0); } - if (fentry->nr_links) { + if (fentry->nr_nodes) { if (invoke_bpf(m, &prog, fentry, regs_off, run_ctx_off, func_meta_off, flags & BPF_TRAMP_F_RET_FENTRY_RET, image, rw_image, func_meta, cookie_off)) return -EINVAL; } - if (fmod_ret->nr_links) { - branches = kcalloc(fmod_ret->nr_links, sizeof(u8 *), + if (fmod_ret->nr_nodes) { + branches = kcalloc(fmod_ret->nr_nodes, sizeof(u8 *), GFP_KERNEL); if (!branches) return -ENOMEM; @@ -3573,7 +3573,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im emit_nops(&prog, X86_PATCH_SIZE); } - if (fmod_ret->nr_links) { + if (fmod_ret->nr_nodes) { /* From Intel 64 and IA-32 Architectures Optimization * Reference Manual, 3.4.1.4 Code Alignment, Assembly/Compiler * Coding Rule 11: All branch targets should be 16-byte @@ -3583,7 +3583,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im /* Update the branches saved in invoke_bpf_mod_ret with the * aligned address of do_fexit. */ - for (i = 0; i < fmod_ret->nr_links; i++) { + for (i = 0; i < fmod_ret->nr_nodes; i++) { emit_cond_near_jump(&branches[i], image + (prog - (u8 *)rw_image), image + (branches[i] - (u8 *)rw_image), X86_JNE); } @@ -3591,10 +3591,10 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im /* set the "is_return" flag for fsession */ func_meta |= (1ULL << BPF_TRAMP_IS_RETURN_SHIFT); - if (bpf_fsession_cnt(tlinks)) + if (bpf_fsession_cnt(tnodes)) emit_store_stack_imm64(&prog, BPF_REG_0, -func_meta_off, func_meta); - if (fexit->nr_links) { + if (fexit->nr_nodes) { if (invoke_bpf(m, &prog, fexit, regs_off, run_ctx_off, func_meta_off, false, image, rw_image, func_meta, cookie_off)) { ret = -EINVAL; @@ -3668,7 +3668,7 @@ int arch_protect_bpf_trampoline(void *image, unsigned int size) int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end, const struct btf_func_model *m, u32 flags, - struct bpf_tramp_links *tlinks, + struct bpf_tramp_nodes *tnodes, void *func_addr) { void *rw_image, *tmp; @@ -3683,7 +3683,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i return -ENOMEM; ret = __arch_prepare_bpf_trampoline(im, rw_image, rw_image + size, image, m, - flags, tlinks, func_addr); + flags, tnodes, func_addr); if (ret < 0) goto out; @@ -3696,7 +3696,7 @@ out: } int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, - struct bpf_tramp_links *tlinks, void *func_addr) + struct bpf_tramp_nodes *tnodes, void *func_addr) { struct bpf_tramp_image im; void *image; @@ -3714,7 +3714,7 @@ int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, return -ENOMEM; ret = __arch_prepare_bpf_trampoline(&im, image, image + PAGE_SIZE, image, - m, flags, tlinks, func_addr); + m, flags, tnodes, func_addr); bpf_jit_free_exec(image); return ret; } diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f6056bab6f23..6ff35491d9c0 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1251,9 +1251,9 @@ enum { #define BPF_TRAMP_COOKIE_INDEX_SHIFT 8 #define BPF_TRAMP_IS_RETURN_SHIFT 63 -struct bpf_tramp_links { - struct bpf_tramp_link *links[BPF_MAX_TRAMP_LINKS]; - int nr_links; +struct bpf_tramp_nodes { + struct bpf_tramp_node *nodes[BPF_MAX_TRAMP_LINKS]; + int nr_nodes; }; struct bpf_tramp_run_ctx; @@ -1281,13 +1281,13 @@ struct bpf_tramp_run_ctx; struct bpf_tramp_image; int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end, const struct btf_func_model *m, u32 flags, - struct bpf_tramp_links *tlinks, + struct bpf_tramp_nodes *tnodes, void *func_addr); void *arch_alloc_bpf_trampoline(unsigned int size); void arch_free_bpf_trampoline(void *image, unsigned int size); int __must_check arch_protect_bpf_trampoline(void *image, unsigned int size); int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, - struct bpf_tramp_links *tlinks, void *func_addr); + struct bpf_tramp_nodes *tnodes, void *func_addr); u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx); @@ -1471,10 +1471,10 @@ static inline int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u6 } #ifdef CONFIG_BPF_JIT -int bpf_trampoline_link_prog(struct bpf_tramp_link *link, +int bpf_trampoline_link_prog(struct bpf_tramp_node *node, struct bpf_trampoline *tr, struct bpf_prog *tgt_prog); -int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, +int bpf_trampoline_unlink_prog(struct bpf_tramp_node *node, struct bpf_trampoline *tr, struct bpf_prog *tgt_prog); struct bpf_trampoline *bpf_trampoline_get(u64 key, @@ -1561,13 +1561,13 @@ bool bpf_insn_is_indirect_target(const struct bpf_verifier_env *env, const struc int insn_idx); u16 bpf_out_stack_arg_cnt(const struct bpf_verifier_env *env, const struct bpf_prog *prog); #else -static inline int bpf_trampoline_link_prog(struct bpf_tramp_link *link, +static inline int bpf_trampoline_link_prog(struct bpf_tramp_node *node, struct bpf_trampoline *tr, struct bpf_prog *tgt_prog) { return -ENOTSUPP; } -static inline int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, +static inline int bpf_trampoline_unlink_prog(struct bpf_tramp_node *node, struct bpf_trampoline *tr, struct bpf_prog *tgt_prog) { @@ -1909,12 +1909,17 @@ struct bpf_link_ops { __poll_t (*poll)(struct file *file, struct poll_table_struct *pts); }; -struct bpf_tramp_link { - struct bpf_link link; +struct bpf_tramp_node { + struct bpf_link *link; struct hlist_node tramp_hlist; u64 cookie; }; +struct bpf_tramp_link { + struct bpf_link link; + struct bpf_tramp_node node; +}; + struct bpf_shim_tramp_link { struct bpf_tramp_link link; struct bpf_trampoline *trampoline; @@ -2132,8 +2137,8 @@ void bpf_struct_ops_put(const void *kdata); int bpf_struct_ops_supported(const struct bpf_struct_ops *st_ops, u32 moff); int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key, void *value); -int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, - struct bpf_tramp_link *link, +int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_nodes *tnodes, + struct bpf_tramp_node *node, const struct btf_func_model *model, void *stub_func, void **image, u32 *image_off, @@ -2228,31 +2233,31 @@ static inline void bpf_struct_ops_desc_release(struct bpf_struct_ops_desc *st_op #endif -static inline int bpf_fsession_cnt(struct bpf_tramp_links *links) +static inline int bpf_fsession_cnt(struct bpf_tramp_nodes *nodes) { - struct bpf_tramp_links fentries = links[BPF_TRAMP_FENTRY]; + struct bpf_tramp_nodes fentries = nodes[BPF_TRAMP_FENTRY]; int cnt = 0; - for (int i = 0; i < links[BPF_TRAMP_FENTRY].nr_links; i++) { - if (fentries.links[i]->link.prog->expected_attach_type == BPF_TRACE_FSESSION) + for (int i = 0; i < nodes[BPF_TRAMP_FENTRY].nr_nodes; i++) { + if (fentries.nodes[i]->link->prog->expected_attach_type == BPF_TRACE_FSESSION) cnt++; } return cnt; } -static inline bool bpf_prog_calls_session_cookie(struct bpf_tramp_link *link) +static inline bool bpf_prog_calls_session_cookie(struct bpf_tramp_node *node) { - return link->link.prog->call_session_cookie; + return node->link->prog->call_session_cookie; } -static inline int bpf_fsession_cookie_cnt(struct bpf_tramp_links *links) +static inline int bpf_fsession_cookie_cnt(struct bpf_tramp_nodes *nodes) { - struct bpf_tramp_links fentries = links[BPF_TRAMP_FENTRY]; + struct bpf_tramp_nodes fentries = nodes[BPF_TRAMP_FENTRY]; int cnt = 0; - for (int i = 0; i < links[BPF_TRAMP_FENTRY].nr_links; i++) { - if (bpf_prog_calls_session_cookie(fentries.links[i])) + for (int i = 0; i < nodes[BPF_TRAMP_FENTRY].nr_nodes; i++) { + if (bpf_prog_calls_session_cookie(fentries.nodes[i])) cnt++; } @@ -2800,6 +2805,9 @@ void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, void bpf_link_init_sleepable(struct bpf_link *link, enum bpf_link_type type, const struct bpf_link_ops *ops, struct bpf_prog *prog, enum bpf_attach_type attach_type, bool sleepable); +void bpf_tramp_link_init(struct bpf_tramp_link *link, enum bpf_link_type type, + const struct bpf_link_ops *ops, struct bpf_prog *prog, + enum bpf_attach_type attach_type, u64 cookie); int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer); int bpf_link_settle(struct bpf_link_primer *primer); void bpf_link_cleanup(struct bpf_link_primer *primer); @@ -3223,6 +3231,12 @@ static inline void bpf_link_init_sleepable(struct bpf_link *link, enum bpf_link_ { } +static inline void bpf_tramp_link_init(struct bpf_tramp_link *link, enum bpf_link_type type, + const struct bpf_link_ops *ops, struct bpf_prog *prog, + enum bpf_attach_type attach_type, u64 cookie) +{ +} + static inline int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer) { diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 5e51c1211673..51b16e5f5534 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -594,8 +594,8 @@ const struct bpf_link_ops bpf_struct_ops_link_lops = { .dealloc = bpf_struct_ops_link_dealloc, }; -int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, - struct bpf_tramp_link *link, +int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_nodes *tnodes, + struct bpf_tramp_node *node, const struct btf_func_model *model, void *stub_func, void **_image, u32 *_image_off, @@ -605,13 +605,13 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, void *image = *_image; int size; - tlinks[BPF_TRAMP_FENTRY].links[0] = link; - tlinks[BPF_TRAMP_FENTRY].nr_links = 1; + tnodes[BPF_TRAMP_FENTRY].nodes[0] = node; + tnodes[BPF_TRAMP_FENTRY].nr_nodes = 1; if (model->ret_size > 0) flags |= BPF_TRAMP_F_RET_FENTRY_RET; - size = arch_bpf_trampoline_size(model, flags, tlinks, stub_func); + size = arch_bpf_trampoline_size(model, flags, tnodes, stub_func); if (size <= 0) return size ? : -EFAULT; @@ -628,7 +628,7 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, size = arch_prepare_bpf_trampoline(NULL, image + image_off, image + image_off + size, - model, flags, tlinks, stub_func); + model, flags, tnodes, stub_func); if (size <= 0) { if (image != *_image) bpf_struct_ops_image_free(image); @@ -693,7 +693,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, const struct btf_type *module_type; const struct btf_member *member; const struct btf_type *t = st_ops_desc->type; - struct bpf_tramp_links *tlinks; + struct bpf_tramp_nodes *tnodes; void *udata, *kdata; int prog_fd, err; u32 i, trampoline_start, image_off = 0; @@ -720,8 +720,8 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, if (uvalue->common.state || refcount_read(&uvalue->common.refcnt)) return -EINVAL; - tlinks = kzalloc_objs(*tlinks, BPF_TRAMP_MAX); - if (!tlinks) + tnodes = kzalloc_objs(*tnodes, BPF_TRAMP_MAX); + if (!tnodes) return -ENOMEM; uvalue = (struct bpf_struct_ops_value *)st_map->uvalue; @@ -817,8 +817,9 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, err = -ENOMEM; goto reset_unlock; } - bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, - &bpf_struct_ops_link_lops, prog, prog->expected_attach_type); + bpf_tramp_link_init(link, BPF_LINK_TYPE_STRUCT_OPS, + &bpf_struct_ops_link_lops, prog, prog->expected_attach_type, 0); + *plink++ = &link->link; /* Poison pointer on error instead of return for backward compatibility */ @@ -832,7 +833,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, *pksym++ = ksym; trampoline_start = image_off; - err = bpf_struct_ops_prepare_trampoline(tlinks, link, + err = bpf_struct_ops_prepare_trampoline(tnodes, &link->node, &st_ops->func_models[i], *(void **)(st_ops->cfi_stubs + moff), &image, &image_off, @@ -911,7 +912,7 @@ reset_unlock: memset(uvalue, 0, map->value_size); memset(kvalue, 0, map->value_size); unlock: - kfree(tlinks); + kfree(tnodes); mutex_unlock(&st_map->lock); if (!err) bpf_struct_ops_map_add_ksyms(st_map); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5fcfc32c7cb4..fd69fdb9290b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3288,6 +3288,15 @@ void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, bpf_link_init_sleepable(link, type, ops, prog, attach_type, false); } +void bpf_tramp_link_init(struct bpf_tramp_link *link, enum bpf_link_type type, + const struct bpf_link_ops *ops, struct bpf_prog *prog, + enum bpf_attach_type attach_type, u64 cookie) +{ + bpf_link_init(&link->link, type, ops, prog, attach_type); + link->node.link = &link->link; + link->node.cookie = cookie; +} + static void bpf_link_free_id(int id) { if (!id) @@ -3595,7 +3604,7 @@ static void bpf_tracing_link_release(struct bpf_link *link) struct bpf_tracing_link *tr_link = container_of(link, struct bpf_tracing_link, link.link); - WARN_ON_ONCE(bpf_trampoline_unlink_prog(&tr_link->link, + WARN_ON_ONCE(bpf_trampoline_unlink_prog(&tr_link->link.node, tr_link->trampoline, tr_link->tgt_prog)); @@ -3608,8 +3617,7 @@ static void bpf_tracing_link_release(struct bpf_link *link) static void bpf_tracing_link_dealloc(struct bpf_link *link) { - struct bpf_tracing_link *tr_link = - container_of(link, struct bpf_tracing_link, link.link); + struct bpf_tracing_link *tr_link = container_of(link, struct bpf_tracing_link, link.link); kfree(tr_link); } @@ -3617,8 +3625,8 @@ static void bpf_tracing_link_dealloc(struct bpf_link *link) static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link, struct seq_file *seq) { - struct bpf_tracing_link *tr_link = - container_of(link, struct bpf_tracing_link, link.link); + struct bpf_tracing_link *tr_link = container_of(link, struct bpf_tracing_link, link.link); + u32 target_btf_id, target_obj_id; bpf_trampoline_unpack_key(tr_link->trampoline->key, @@ -3631,17 +3639,16 @@ static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link, link->attach_type, target_obj_id, target_btf_id, - tr_link->link.cookie); + tr_link->link.node.cookie); } static int bpf_tracing_link_fill_link_info(const struct bpf_link *link, struct bpf_link_info *info) { - struct bpf_tracing_link *tr_link = - container_of(link, struct bpf_tracing_link, link.link); + struct bpf_tracing_link *tr_link = container_of(link, struct bpf_tracing_link, link.link); info->tracing.attach_type = link->attach_type; - info->tracing.cookie = tr_link->link.cookie; + info->tracing.cookie = tr_link->link.node.cookie; bpf_trampoline_unpack_key(tr_link->trampoline->key, &info->tracing.target_obj_id, &info->tracing.target_btf_id); @@ -3728,9 +3735,9 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, fslink = kzalloc_obj(*fslink, GFP_USER); if (fslink) { - bpf_link_init(&fslink->fexit.link, BPF_LINK_TYPE_TRACING, - &bpf_tracing_link_lops, prog, attach_type); - fslink->fexit.cookie = bpf_cookie; + bpf_tramp_link_init(&fslink->fexit, BPF_LINK_TYPE_TRACING, + &bpf_tracing_link_lops, prog, attach_type, + bpf_cookie); link = &fslink->link; } else { link = NULL; @@ -3742,10 +3749,8 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, err = -ENOMEM; goto out_put_prog; } - bpf_link_init(&link->link.link, BPF_LINK_TYPE_TRACING, - &bpf_tracing_link_lops, prog, attach_type); - - link->link.cookie = bpf_cookie; + bpf_tramp_link_init(&link->link, BPF_LINK_TYPE_TRACING, + &bpf_tracing_link_lops, prog, attach_type, bpf_cookie); mutex_lock(&prog->aux->dst_mutex); @@ -3848,7 +3853,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, if (err) goto out_unlock; - err = bpf_trampoline_link_prog(&link->link, tr, tgt_prog); + err = bpf_trampoline_link_prog(&link->link.node, tr, tgt_prog); if (err) { bpf_link_cleanup(&link_primer); link = NULL; diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 701138ef424a..6a45c09fc0d8 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -502,30 +502,29 @@ static const struct bpf_trampoline_ops trampoline_ops = { .modify_fentry = modify_fentry, }; -static struct bpf_tramp_links * +static struct bpf_tramp_nodes * bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total, bool *ip_arg) { - struct bpf_tramp_link *link; - struct bpf_tramp_links *tlinks; - struct bpf_tramp_link **links; + struct bpf_tramp_node *node, **nodes; + struct bpf_tramp_nodes *tnodes; int kind; *total = 0; - tlinks = kzalloc_objs(*tlinks, BPF_TRAMP_MAX); - if (!tlinks) + tnodes = kzalloc_objs(*tnodes, BPF_TRAMP_MAX); + if (!tnodes) return ERR_PTR(-ENOMEM); for (kind = 0; kind < BPF_TRAMP_MAX; kind++) { - tlinks[kind].nr_links = tr->progs_cnt[kind]; + tnodes[kind].nr_nodes = tr->progs_cnt[kind]; *total += tr->progs_cnt[kind]; - links = tlinks[kind].links; + nodes = tnodes[kind].nodes; - hlist_for_each_entry(link, &tr->progs_hlist[kind], tramp_hlist) { - *ip_arg |= link->link.prog->call_get_func_ip; - *links++ = link; + hlist_for_each_entry(node, &tr->progs_hlist[kind], tramp_hlist) { + *ip_arg |= node->link->prog->call_get_func_ip; + *nodes++ = node; } } - return tlinks; + return tnodes; } static void bpf_tramp_image_free(struct bpf_tramp_image *im) @@ -673,14 +672,14 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut const struct bpf_trampoline_ops *ops, void *data) { struct bpf_tramp_image *im; - struct bpf_tramp_links *tlinks; + struct bpf_tramp_nodes *tnodes; u32 orig_flags = tr->flags; bool ip_arg = false; int err, total, size; - tlinks = bpf_trampoline_get_progs(tr, &total, &ip_arg); - if (IS_ERR(tlinks)) - return PTR_ERR(tlinks); + tnodes = bpf_trampoline_get_progs(tr, &total, &ip_arg); + if (IS_ERR(tnodes)) + return PTR_ERR(tnodes); if (total == 0) { err = ops->unregister_fentry(tr, orig_flags, data); @@ -690,8 +689,8 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut /* clear all bits except SHARE_IPMODIFY and TAIL_CALL_CTX */ tr->flags &= (BPF_TRAMP_F_SHARE_IPMODIFY | BPF_TRAMP_F_TAIL_CALL_CTX); - if (tlinks[BPF_TRAMP_FEXIT].nr_links || - tlinks[BPF_TRAMP_MODIFY_RETURN].nr_links) { + if (tnodes[BPF_TRAMP_FEXIT].nr_nodes || + tnodes[BPF_TRAMP_MODIFY_RETURN].nr_nodes) { /* NOTE: BPF_TRAMP_F_RESTORE_REGS and BPF_TRAMP_F_SKIP_FRAME * should not be set together. */ @@ -722,7 +721,7 @@ again: #endif size = arch_bpf_trampoline_size(&tr->func.model, tr->flags, - tlinks, tr->func.addr); + tnodes, tr->func.addr); if (size < 0) { err = size; goto out; @@ -740,7 +739,7 @@ again: } err = arch_prepare_bpf_trampoline(im, im->image, im->image + size, - &tr->func.model, tr->flags, tlinks, + &tr->func.model, tr->flags, tnodes, tr->func.addr); if (err < 0) goto out_free; @@ -774,7 +773,7 @@ out: /* If any error happens, restore previous flags */ if (err) tr->flags = orig_flags; - kfree(tlinks); + kfree(tnodes); return err; } @@ -821,15 +820,15 @@ static int bpf_freplace_check_tgt_prog(struct bpf_prog *tgt_prog) } static int bpf_trampoline_add_prog(struct bpf_trampoline *tr, - struct bpf_tramp_link *link, + struct bpf_tramp_node *node, int cnt) { struct bpf_fsession_link *fslink = NULL; enum bpf_tramp_prog_type kind; - struct bpf_tramp_link *link_exiting; + struct bpf_tramp_node *node_existing; struct hlist_head *prog_list; - kind = bpf_attach_type_to_tramp(link->link.prog); + kind = bpf_attach_type_to_tramp(node->link->prog); if (kind == BPF_TRAMP_FSESSION) { prog_list = &tr->progs_hlist[BPF_TRAMP_FENTRY]; cnt++; @@ -838,21 +837,21 @@ static int bpf_trampoline_add_prog(struct bpf_trampoline *tr, } if (cnt >= BPF_MAX_TRAMP_LINKS) return -E2BIG; - if (!hlist_unhashed(&link->tramp_hlist)) + if (!hlist_unhashed(&node->tramp_hlist)) /* prog already linked */ return -EBUSY; - hlist_for_each_entry(link_exiting, prog_list, tramp_hlist) { - if (link_exiting->link.prog != link->link.prog) + hlist_for_each_entry(node_existing, prog_list, tramp_hlist) { + if (node_existing->link->prog != node->link->prog) continue; /* prog already linked */ return -EBUSY; } - hlist_add_head(&link->tramp_hlist, prog_list); + hlist_add_head(&node->tramp_hlist, prog_list); if (kind == BPF_TRAMP_FSESSION) { tr->progs_cnt[BPF_TRAMP_FENTRY]++; - fslink = container_of(link, struct bpf_fsession_link, link.link); - hlist_add_head(&fslink->fexit.tramp_hlist, &tr->progs_hlist[BPF_TRAMP_FEXIT]); + fslink = container_of(node, struct bpf_fsession_link, link.link.node); + hlist_add_head(&fslink->fexit.node.tramp_hlist, &tr->progs_hlist[BPF_TRAMP_FEXIT]); tr->progs_cnt[BPF_TRAMP_FEXIT]++; } else { tr->progs_cnt[kind]++; @@ -861,23 +860,23 @@ static int bpf_trampoline_add_prog(struct bpf_trampoline *tr, } static void bpf_trampoline_remove_prog(struct bpf_trampoline *tr, - struct bpf_tramp_link *link) + struct bpf_tramp_node *node) { struct bpf_fsession_link *fslink; enum bpf_tramp_prog_type kind; - kind = bpf_attach_type_to_tramp(link->link.prog); + kind = bpf_attach_type_to_tramp(node->link->prog); if (kind == BPF_TRAMP_FSESSION) { - fslink = container_of(link, struct bpf_fsession_link, link.link); - hlist_del_init(&fslink->fexit.tramp_hlist); + fslink = container_of(node, struct bpf_fsession_link, link.link.node); + hlist_del_init(&fslink->fexit.node.tramp_hlist); tr->progs_cnt[BPF_TRAMP_FEXIT]--; kind = BPF_TRAMP_FENTRY; } - hlist_del_init(&link->tramp_hlist); + hlist_del_init(&node->tramp_hlist); tr->progs_cnt[kind]--; } -static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, +static int __bpf_trampoline_link_prog(struct bpf_tramp_node *node, struct bpf_trampoline *tr, struct bpf_prog *tgt_prog, const struct bpf_trampoline_ops *ops, @@ -887,7 +886,7 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, int err = 0; int cnt = 0, i; - kind = bpf_attach_type_to_tramp(link->link.prog); + kind = bpf_attach_type_to_tramp(node->link->prog); if (tr->extension_prog) /* cannot attach fentry/fexit if extension prog is attached. * cannot overwrite extension prog either. @@ -904,33 +903,33 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, err = bpf_freplace_check_tgt_prog(tgt_prog); if (err) return err; - tr->extension_prog = link->link.prog; + tr->extension_prog = node->link->prog; return bpf_arch_text_poke(tr->func.addr, BPF_MOD_NOP, BPF_MOD_JUMP, NULL, - link->link.prog->bpf_func); + node->link->prog->bpf_func); } - err = bpf_trampoline_add_prog(tr, link, cnt); + err = bpf_trampoline_add_prog(tr, node, cnt); if (err) return err; err = bpf_trampoline_update(tr, true /* lock_direct_mutex */, ops, data); if (err) - bpf_trampoline_remove_prog(tr, link); + bpf_trampoline_remove_prog(tr, node); return err; } -int bpf_trampoline_link_prog(struct bpf_tramp_link *link, +int bpf_trampoline_link_prog(struct bpf_tramp_node *node, struct bpf_trampoline *tr, struct bpf_prog *tgt_prog) { int err; trampoline_lock(tr); - err = __bpf_trampoline_link_prog(link, tr, tgt_prog, &trampoline_ops, NULL); + err = __bpf_trampoline_link_prog(node, tr, tgt_prog, &trampoline_ops, NULL); trampoline_unlock(tr); return err; } -static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, +static int __bpf_trampoline_unlink_prog(struct bpf_tramp_node *node, struct bpf_trampoline *tr, struct bpf_prog *tgt_prog, const struct bpf_trampoline_ops *ops, @@ -939,7 +938,7 @@ static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, enum bpf_tramp_prog_type kind; int err; - kind = bpf_attach_type_to_tramp(link->link.prog); + kind = bpf_attach_type_to_tramp(node->link->prog); if (kind == BPF_TRAMP_REPLACE) { WARN_ON_ONCE(!tr->extension_prog); err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, @@ -950,19 +949,19 @@ static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, tgt_prog->aux->is_extended = false; return err; } - bpf_trampoline_remove_prog(tr, link); + bpf_trampoline_remove_prog(tr, node); return bpf_trampoline_update(tr, true /* lock_direct_mutex */, ops, data); } /* bpf_trampoline_unlink_prog() should never fail. */ -int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, +int bpf_trampoline_unlink_prog(struct bpf_tramp_node *node, struct bpf_trampoline *tr, struct bpf_prog *tgt_prog) { int err; trampoline_lock(tr); - err = __bpf_trampoline_unlink_prog(link, tr, tgt_prog, &trampoline_ops, NULL); + err = __bpf_trampoline_unlink_prog(node, tr, tgt_prog, &trampoline_ops, NULL); trampoline_unlock(tr); return err; } @@ -977,7 +976,7 @@ static void bpf_shim_tramp_link_release(struct bpf_link *link) if (!shim_link->trampoline) return; - WARN_ON_ONCE(bpf_trampoline_unlink_prog(&shim_link->link, shim_link->trampoline, NULL)); + WARN_ON_ONCE(bpf_trampoline_unlink_prog(&shim_link->link.node, shim_link->trampoline, NULL)); bpf_trampoline_put(shim_link->trampoline); } @@ -1023,8 +1022,8 @@ static struct bpf_shim_tramp_link *cgroup_shim_alloc(const struct bpf_prog *prog p->type = BPF_PROG_TYPE_LSM; p->expected_attach_type = BPF_LSM_MAC; bpf_prog_inc(p); - bpf_link_init(&shim_link->link.link, BPF_LINK_TYPE_UNSPEC, - &bpf_shim_tramp_link_lops, p, attach_type); + bpf_tramp_link_init(&shim_link->link, BPF_LINK_TYPE_UNSPEC, + &bpf_shim_tramp_link_lops, p, attach_type, 0); bpf_cgroup_atype_get(p->aux->attach_btf_id, cgroup_atype); return shim_link; @@ -1033,15 +1032,15 @@ static struct bpf_shim_tramp_link *cgroup_shim_alloc(const struct bpf_prog *prog static struct bpf_shim_tramp_link *cgroup_shim_find(struct bpf_trampoline *tr, bpf_func_t bpf_func) { - struct bpf_tramp_link *link; + struct bpf_tramp_node *node; int kind; for (kind = 0; kind < BPF_TRAMP_MAX; kind++) { - hlist_for_each_entry(link, &tr->progs_hlist[kind], tramp_hlist) { - struct bpf_prog *p = link->link.prog; + hlist_for_each_entry(node, &tr->progs_hlist[kind], tramp_hlist) { + struct bpf_prog *p = node->link->prog; if (p->bpf_func == bpf_func) - return container_of(link, struct bpf_shim_tramp_link, link); + return container_of(node, struct bpf_shim_tramp_link, link.node); } } @@ -1091,7 +1090,7 @@ int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog, goto err; } - err = __bpf_trampoline_link_prog(&shim_link->link, tr, NULL, &trampoline_ops, NULL); + err = __bpf_trampoline_link_prog(&shim_link->link.node, tr, NULL, &trampoline_ops, NULL); if (err) goto err; @@ -1406,7 +1405,7 @@ bpf_trampoline_exit_t bpf_trampoline_exit(const struct bpf_prog *prog) int __weak arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end, const struct btf_func_model *m, u32 flags, - struct bpf_tramp_links *tlinks, + struct bpf_tramp_nodes *tnodes, void *func_addr) { return -ENOTSUPP; @@ -1440,7 +1439,7 @@ int __weak arch_protect_bpf_trampoline(void *image, unsigned int size) } int __weak arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, - struct bpf_tramp_links *tlinks, void *func_addr) + struct bpf_tramp_nodes *tnodes, void *func_addr) { return -ENOTSUPP; } diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c index ae5a54c350b9..191a6b3ee254 100644 --- a/net/bpf/bpf_dummy_struct_ops.c +++ b/net/bpf/bpf_dummy_struct_ops.c @@ -132,7 +132,7 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, const struct bpf_struct_ops *st_ops = &bpf_bpf_dummy_ops; const struct btf_type *func_proto; struct bpf_dummy_ops_test_args *args; - struct bpf_tramp_links *tlinks = NULL; + struct bpf_tramp_nodes *tnodes = NULL; struct bpf_tramp_link *link = NULL; void *image = NULL; unsigned int op_idx; @@ -158,8 +158,8 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, if (err) goto out; - tlinks = kzalloc_objs(*tlinks, BPF_TRAMP_MAX); - if (!tlinks) { + tnodes = kzalloc_objs(*tnodes, BPF_TRAMP_MAX); + if (!tnodes) { err = -ENOMEM; goto out; } @@ -171,11 +171,11 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, } /* prog doesn't take the ownership of the reference from caller */ bpf_prog_inc(prog); - bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_link_lops, prog, - prog->expected_attach_type); + bpf_tramp_link_init(link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_link_lops, + prog, prog->expected_attach_type, 0); op_idx = prog->expected_attach_type; - err = bpf_struct_ops_prepare_trampoline(tlinks, link, + err = bpf_struct_ops_prepare_trampoline(tnodes, &link->node, &st_ops->func_models[op_idx], &dummy_ops_test_ret_function, &image, &image_off, @@ -198,7 +198,7 @@ out: bpf_struct_ops_image_free(image); if (link) bpf_link_put(&link->link); - kfree(tlinks); + kfree(tnodes); return err; } -- cgit v1.2.3 From 880db5d4abb29e931d82b9feefb4382f76fcf9e5 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 6 Jun 2026 14:39:34 +0200 Subject: bpf: Factor fsession link to use struct bpf_tramp_node Now that we split trampoline attachment object (bpf_tramp_node) from the link object (bpf_tramp_link) we can use bpf_tramp_node as fsession's fexit attachment object and get rid of the bpf_fsession_link object. Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20260606123955.345967-10-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 +----- kernel/bpf/syscall.c | 21 ++++++--------------- kernel/bpf/trampoline.c | 12 ++++++------ 3 files changed, 13 insertions(+), 26 deletions(-) (limited to 'kernel/bpf') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6ff35491d9c0..428789a9e736 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1927,15 +1927,11 @@ struct bpf_shim_tramp_link { struct bpf_tracing_link { struct bpf_tramp_link link; + struct bpf_tramp_node fexit; struct bpf_trampoline *trampoline; struct bpf_prog *tgt_prog; }; -struct bpf_fsession_link { - struct bpf_tracing_link link; - struct bpf_tramp_link fexit; -}; - struct bpf_raw_tp_link { struct bpf_link link; struct bpf_raw_event_map *btp; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index fd69fdb9290b..0cfc8bcb3dc9 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3730,21 +3730,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, key = bpf_trampoline_compute_key(tgt_prog, NULL, btf_id); } - if (prog->expected_attach_type == BPF_TRACE_FSESSION) { - struct bpf_fsession_link *fslink; - - fslink = kzalloc_obj(*fslink, GFP_USER); - if (fslink) { - bpf_tramp_link_init(&fslink->fexit, BPF_LINK_TYPE_TRACING, - &bpf_tracing_link_lops, prog, attach_type, - bpf_cookie); - link = &fslink->link; - } else { - link = NULL; - } - } else { - link = kzalloc_obj(*link, GFP_USER); - } + link = kzalloc_obj(*link, GFP_USER); if (!link) { err = -ENOMEM; goto out_put_prog; @@ -3752,6 +3738,11 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, bpf_tramp_link_init(&link->link, BPF_LINK_TYPE_TRACING, &bpf_tracing_link_lops, prog, attach_type, bpf_cookie); + if (prog->expected_attach_type == BPF_TRACE_FSESSION) { + link->fexit.link = &link->link.link; + link->fexit.cookie = bpf_cookie; + } + mutex_lock(&prog->aux->dst_mutex); /* There are a few possible cases here: diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 6a45c09fc0d8..5776d2b8e36e 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -823,7 +823,7 @@ static int bpf_trampoline_add_prog(struct bpf_trampoline *tr, struct bpf_tramp_node *node, int cnt) { - struct bpf_fsession_link *fslink = NULL; + struct bpf_tracing_link *tr_link = NULL; enum bpf_tramp_prog_type kind; struct bpf_tramp_node *node_existing; struct hlist_head *prog_list; @@ -850,8 +850,8 @@ static int bpf_trampoline_add_prog(struct bpf_trampoline *tr, hlist_add_head(&node->tramp_hlist, prog_list); if (kind == BPF_TRAMP_FSESSION) { tr->progs_cnt[BPF_TRAMP_FENTRY]++; - fslink = container_of(node, struct bpf_fsession_link, link.link.node); - hlist_add_head(&fslink->fexit.node.tramp_hlist, &tr->progs_hlist[BPF_TRAMP_FEXIT]); + tr_link = container_of(node, struct bpf_tracing_link, link.node); + hlist_add_head(&tr_link->fexit.tramp_hlist, &tr->progs_hlist[BPF_TRAMP_FEXIT]); tr->progs_cnt[BPF_TRAMP_FEXIT]++; } else { tr->progs_cnt[kind]++; @@ -862,13 +862,13 @@ static int bpf_trampoline_add_prog(struct bpf_trampoline *tr, static void bpf_trampoline_remove_prog(struct bpf_trampoline *tr, struct bpf_tramp_node *node) { - struct bpf_fsession_link *fslink; + struct bpf_tracing_link *tr_link; enum bpf_tramp_prog_type kind; kind = bpf_attach_type_to_tramp(node->link->prog); if (kind == BPF_TRAMP_FSESSION) { - fslink = container_of(node, struct bpf_fsession_link, link.link.node); - hlist_del_init(&fslink->fexit.node.tramp_hlist); + tr_link = container_of(node, struct bpf_tracing_link, link.node); + hlist_del_init(&tr_link->fexit.tramp_hlist); tr->progs_cnt[BPF_TRAMP_FEXIT]--; kind = BPF_TRAMP_FENTRY; } -- cgit v1.2.3 From d14e6b4346bf397eca7cb5f4b7b0b8054be632d8 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 6 Jun 2026 14:39:35 +0200 Subject: bpf: Add multi tracing attach types Adding new program attach types multi tracing attachment: BPF_TRACE_FENTRY_MULTI BPF_TRACE_FEXIT_MULTI and their base support in verifier code. Programs with such attach type will use specific link attachment interface coming in following changes. This was suggested by Andrii some (long) time ago and turned out to be easier than having special program flag for that. Bpf programs with such types have 'bpf_multi_func' function set as their attach_btf_id and keep module reference when it's specified by attach_prog_fd. They are also accepted as sleepable programs during verification, and the real validation for specific BTF_IDs/functions will happen during the multi link attachment in following changes. Suggested-by: Andrii Nakryiko Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20260606123955.345967-11-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 5 +++++ include/linux/btf_ids.h | 1 + include/uapi/linux/bpf.h | 2 ++ kernel/bpf/fixups.c | 1 + kernel/bpf/syscall.c | 28 ++++++++++++++++++++++++---- kernel/bpf/trampoline.c | 5 ++++- kernel/bpf/verifier.c | 40 +++++++++++++++++++++++++++++++++++++++- net/bpf/test_run.c | 2 ++ tools/include/uapi/linux/bpf.h | 2 ++ tools/lib/bpf/libbpf.c | 2 ++ 10 files changed, 82 insertions(+), 6 deletions(-) (limited to 'kernel/bpf') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 428789a9e736..b52dc64ec92d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2113,6 +2113,11 @@ static inline void bpf_prog_put_recursion_context(struct bpf_prog *prog) #endif } +static inline bool is_tracing_multi(enum bpf_attach_type type) +{ + return type == BPF_TRACE_FENTRY_MULTI || type == BPF_TRACE_FEXIT_MULTI; +} + #if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL) /* This macro helps developer to register a struct_ops type and generate * type information correctly. Developers should use this macro to register diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index af011db39ab3..8b5a9ee92513 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -284,5 +284,6 @@ extern u32 bpf_cgroup_btf_id[]; extern u32 bpf_local_storage_map_btf_id[]; extern u32 btf_bpf_map_id[]; extern u32 bpf_kmem_cache_btf_id[]; +extern u32 bpf_multi_func_btf_id[]; #endif diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d5238df5e5eb..28d127e5040a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1156,6 +1156,8 @@ enum bpf_attach_type { BPF_TRACE_KPROBE_SESSION, BPF_TRACE_UPROBE_SESSION, BPF_TRACE_FSESSION, + BPF_TRACE_FENTRY_MULTI, + BPF_TRACE_FEXIT_MULTI, __MAX_BPF_ATTACH_TYPE }; diff --git a/kernel/bpf/fixups.c b/kernel/bpf/fixups.c index 5aa3f7d99ac9..0cf9735929f5 100644 --- a/kernel/bpf/fixups.c +++ b/kernel/bpf/fixups.c @@ -2186,6 +2186,7 @@ patch_map_ops_generic: insn->imm == BPF_FUNC_get_func_ret) { if (eatype == BPF_TRACE_FEXIT || eatype == BPF_TRACE_FSESSION || + eatype == BPF_TRACE_FEXIT_MULTI || eatype == BPF_MODIFY_RETURN) { /* Load nr_args from ctx - 8 */ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0cfc8bcb3dc9..efdd6639a598 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -2719,7 +2720,8 @@ static int bpf_prog_load_check_attach(enum bpf_prog_type prog_type, enum bpf_attach_type expected_attach_type, struct btf *attach_btf, u32 btf_id, - struct bpf_prog *dst_prog) + struct bpf_prog *dst_prog, + bool multi_func) { if (btf_id) { if (btf_id > BTF_MAX_TYPE) @@ -2739,6 +2741,14 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, } } + if (multi_func) { + if (prog_type != BPF_PROG_TYPE_TRACING) + return -EINVAL; + if (!attach_btf || btf_id) + return -EINVAL; + return 0; + } + if (attach_btf && (!btf_id || dst_prog)) return -EINVAL; @@ -2946,6 +2956,11 @@ static int bpf_prog_mark_insn_arrays_ready(struct bpf_prog *prog) return 0; } +extern int bpf_multi_func(void); +int __init __used bpf_multi_func(void) { return 0; } + +BTF_ID_LIST_GLOBAL_SINGLE(bpf_multi_func_btf_id, func, bpf_multi_func) + /* last field in 'union bpf_attr' used by this command */ #define BPF_PROG_LOAD_LAST_FIELD keyring_id @@ -2958,6 +2973,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, struct bpf_log_at bool bpf_cap; int err; char license[128]; + bool multi_func; if (CHECK_ATTR(BPF_PROG_LOAD)) return -EINVAL; @@ -3024,6 +3040,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, struct bpf_log_at if (is_perfmon_prog_type(type) && !bpf_token_capable(token, CAP_PERFMON)) goto put_token; + multi_func = is_tracing_multi(attr->expected_attach_type); + /* attach_prog_fd/attach_btf_obj_fd can specify fd of either bpf_prog * or btf, we need to check which one it is */ @@ -3045,7 +3063,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, struct bpf_log_at goto put_token; } } - } else if (attr->attach_btf_id) { + } else if (attr->attach_btf_id || multi_func) { /* fall back to vmlinux BTF, if BTF type ID is specified */ attach_btf = bpf_get_btf_vmlinux(); if (IS_ERR(attach_btf)) { @@ -3061,7 +3079,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, struct bpf_log_at if (bpf_prog_load_check_attach(type, attr->expected_attach_type, attach_btf, attr->attach_btf_id, - dst_prog)) { + dst_prog, multi_func)) { if (dst_prog) bpf_prog_put(dst_prog); if (attach_btf) @@ -3084,7 +3102,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, struct bpf_log_at prog->expected_attach_type = attr->expected_attach_type; prog->sleepable = !!(attr->prog_flags & BPF_F_SLEEPABLE); prog->aux->attach_btf = attach_btf; - prog->aux->attach_btf_id = attr->attach_btf_id; + prog->aux->attach_btf_id = multi_func ? bpf_multi_func_btf_id[0] : attr->attach_btf_id; prog->aux->dst_prog = dst_prog; prog->aux->dev_bound = !!attr->prog_ifindex; prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS; @@ -4480,6 +4498,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: case BPF_TRACE_FSESSION: + case BPF_TRACE_FENTRY_MULTI: + case BPF_TRACE_FEXIT_MULTI: case BPF_MODIFY_RETURN: return BPF_PROG_TYPE_TRACING; case BPF_LSM_MAC: diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 5776d2b8e36e..ae7e4fdfe2a3 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -182,7 +182,8 @@ bool bpf_prog_has_trampoline(const struct bpf_prog *prog) switch (ptype) { case BPF_PROG_TYPE_TRACING: if (eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT || - eatype == BPF_MODIFY_RETURN || eatype == BPF_TRACE_FSESSION) + eatype == BPF_MODIFY_RETURN || eatype == BPF_TRACE_FSESSION || + eatype == BPF_TRACE_FENTRY_MULTI || eatype == BPF_TRACE_FEXIT_MULTI) return true; return false; case BPF_PROG_TYPE_LSM: @@ -781,10 +782,12 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog) { switch (prog->expected_attach_type) { case BPF_TRACE_FENTRY: + case BPF_TRACE_FENTRY_MULTI: return BPF_TRAMP_FENTRY; case BPF_MODIFY_RETURN: return BPF_TRAMP_MODIFY_RETURN; case BPF_TRACE_FEXIT: + case BPF_TRACE_FEXIT_MULTI: return BPF_TRAMP_FEXIT; case BPF_TRACE_FSESSION: return BPF_TRAMP_FSESSION; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 926ff63a0b61..0e593f3335e9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -16382,6 +16382,8 @@ static bool return_retval_range(struct bpf_verifier_env *env, struct bpf_retval_ case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: case BPF_TRACE_FSESSION: + case BPF_TRACE_FENTRY_MULTI: + case BPF_TRACE_FEXIT_MULTI: *range = retval_range(0, 0); break; case BPF_TRACE_RAW_TP: @@ -18772,6 +18774,11 @@ static int check_attach_modify_return(unsigned long addr, const char *func_name) #endif /* CONFIG_FUNCTION_ERROR_INJECTION */ +static bool is_tracing_multi_id(const struct bpf_prog *prog, u32 btf_id) +{ + return is_tracing_multi(prog->expected_attach_type) && bpf_multi_func_btf_id[0] == btf_id; +} + int bpf_check_attach_target(struct bpf_verifier_log *log, const struct bpf_prog *prog, const struct bpf_prog *tgt_prog, @@ -18894,6 +18901,8 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, prog_extension && (tgt_prog->expected_attach_type == BPF_TRACE_FENTRY || tgt_prog->expected_attach_type == BPF_TRACE_FEXIT || + tgt_prog->expected_attach_type == BPF_TRACE_FENTRY_MULTI || + tgt_prog->expected_attach_type == BPF_TRACE_FEXIT_MULTI || tgt_prog->expected_attach_type == BPF_TRACE_FSESSION)) { /* Program extensions can extend all program types * except fentry/fexit. The reason is the following. @@ -19000,6 +19009,8 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: case BPF_TRACE_FSESSION: + case BPF_TRACE_FENTRY_MULTI: + case BPF_TRACE_FEXIT_MULTI: if (prog->expected_attach_type == BPF_TRACE_FSESSION && !bpf_jit_supports_fsession()) { bpf_log(log, "JIT does not support fsession\n"); @@ -19029,7 +19040,18 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, if (ret < 0) return ret; - if (tgt_prog) { + /* + * *.multi programs don't need an address during program + * verification, we just take the module ref if needed. + */ + if (is_tracing_multi_id(prog, btf_id)) { + if (btf_is_module(btf)) { + mod = btf_try_get_module(btf); + if (!mod) + return -ENOENT; + } + addr = 0; + } else if (tgt_prog) { if (subprog == 0) addr = (long) tgt_prog->bpf_func; else @@ -19057,6 +19079,12 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, ret = -EINVAL; switch (prog->type) { case BPF_PROG_TYPE_TRACING: + /* *.multi sleepable programs will pass initial sleepable check, + * the actual attached btf ids are checked later during the link + * attachment. + */ + if (is_tracing_multi_id(prog, btf_id)) + ret = 0; if (!check_attach_sleepable(btf_id, addr, tname)) ret = 0; /* fentry/fexit/fmod_ret progs can also be sleepable if they are @@ -19167,6 +19195,8 @@ static bool can_be_sleepable(struct bpf_prog *prog) case BPF_TRACE_ITER: case BPF_TRACE_FSESSION: case BPF_TRACE_RAW_TP: + case BPF_TRACE_FENTRY_MULTI: + case BPF_TRACE_FEXIT_MULTI: return true; default: return false; @@ -19260,6 +19290,14 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) return -EINVAL; } + /* + * We don't get trampoline for tracing_multi programs at this point, + * it's done when tracing_multi link is created. + */ + if (prog->type == BPF_PROG_TYPE_TRACING && + is_tracing_multi(prog->expected_attach_type)) + return 0; + key = bpf_trampoline_compute_key(tgt_prog, prog->aux->attach_btf, btf_id); tr = bpf_trampoline_get(key, &tgt_info); if (!tr) diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index c9aea7052ba7..67769c700cae 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -703,6 +703,8 @@ int bpf_prog_test_run_tracing(struct bpf_prog *prog, case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: case BPF_TRACE_FSESSION: + case BPF_TRACE_FENTRY_MULTI: + case BPF_TRACE_FEXIT_MULTI: if (bpf_fentry_test1(1) != 2 || bpf_fentry_test2(2, 3) != 5 || bpf_fentry_test3(4, 5, 6) != 15 || diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 3829db087449..1b9aacf468e5 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1156,6 +1156,8 @@ enum bpf_attach_type { BPF_TRACE_KPROBE_SESSION, BPF_TRACE_UPROBE_SESSION, BPF_TRACE_FSESSION, + BPF_TRACE_FENTRY_MULTI, + BPF_TRACE_FEXIT_MULTI, __MAX_BPF_ATTACH_TYPE }; diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 1354bcbc8b30..1b09381d16ff 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -136,6 +136,8 @@ static const char * const attach_type_name[] = { [BPF_NETKIT_PEER] = "netkit_peer", [BPF_TRACE_KPROBE_SESSION] = "trace_kprobe_session", [BPF_TRACE_UPROBE_SESSION] = "trace_uprobe_session", + [BPF_TRACE_FENTRY_MULTI] = "trace_fentry_multi", + [BPF_TRACE_FEXIT_MULTI] = "trace_fexit_multi", }; static const char * const link_type_name[] = { -- cgit v1.2.3 From bd06659d3b8abe7a79ae473209ee89bf3a23af36 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 6 Jun 2026 14:39:36 +0200 Subject: bpf: Move sleepable verification code to btf_id_allow_sleepable Move sleepable verification code to btf_id_allow_sleepable function. It will be used in following changes. Adding code to retrieve type's name instead of passing it from bpf_check_attach_target function, because this function will be called from another place in following changes and it's easier to retrieve the name directly in here. Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20260606123955.345967-12-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 82 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 50 insertions(+), 32 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0e593f3335e9..df21592fc560 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -18779,6 +18779,55 @@ static bool is_tracing_multi_id(const struct bpf_prog *prog, u32 btf_id) return is_tracing_multi(prog->expected_attach_type) && bpf_multi_func_btf_id[0] == btf_id; } +static int btf_id_allow_sleepable(u32 btf_id, unsigned long addr, const struct bpf_prog *prog, + const struct btf *btf) +{ + const struct btf_type *t; + const char *tname; + + switch (prog->type) { + case BPF_PROG_TYPE_TRACING: + t = btf_type_by_id(btf, btf_id); + if (!t) + return -EINVAL; + tname = btf_name_by_offset(btf, t->name_off); + if (!tname) + return -EINVAL; + + /* + * *.multi sleepable programs will pass initial sleepable check, + * the actual attached btf ids are checked later during the link + * attachment. + */ + if (is_tracing_multi_id(prog, btf_id)) + return 0; + if (!check_attach_sleepable(btf_id, addr, tname)) + return 0; + /* + * fentry/fexit/fmod_ret progs can also be sleepable if they are + * in the fmodret id set with the KF_SLEEPABLE flag. + */ + else { + u32 *flags = btf_kfunc_is_modify_return(btf, btf_id, prog); + + if (flags && (*flags & KF_SLEEPABLE)) + return 0; + } + break; + case BPF_PROG_TYPE_LSM: + /* + * LSM progs check that they are attached to bpf_lsm_*() funcs. + * Only some of them are sleepable. + */ + if (bpf_lsm_is_sleepable_hook(btf_id)) + return 0; + break; + default: + break; + } + return -EINVAL; +} + int bpf_check_attach_target(struct bpf_verifier_log *log, const struct bpf_prog *prog, const struct bpf_prog *tgt_prog, @@ -19076,38 +19125,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, } if (prog->sleepable) { - ret = -EINVAL; - switch (prog->type) { - case BPF_PROG_TYPE_TRACING: - /* *.multi sleepable programs will pass initial sleepable check, - * the actual attached btf ids are checked later during the link - * attachment. - */ - if (is_tracing_multi_id(prog, btf_id)) - ret = 0; - if (!check_attach_sleepable(btf_id, addr, tname)) - ret = 0; - /* fentry/fexit/fmod_ret progs can also be sleepable if they are - * in the fmodret id set with the KF_SLEEPABLE flag. - */ - else { - u32 *flags = btf_kfunc_is_modify_return(btf, btf_id, - prog); - - if (flags && (*flags & KF_SLEEPABLE)) - ret = 0; - } - break; - case BPF_PROG_TYPE_LSM: - /* LSM progs check that they are attached to bpf_lsm_*() funcs. - * Only some of them are sleepable. - */ - if (bpf_lsm_is_sleepable_hook(btf_id)) - ret = 0; - break; - default: - break; - } + ret = btf_id_allow_sleepable(btf_id, addr, prog, btf); if (ret) { module_put(mod); bpf_log(log, "%s is not sleepable\n", tname); -- cgit v1.2.3 From aef4dfa790b22d8052cfb78044eadbe03c876c39 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 6 Jun 2026 14:39:37 +0200 Subject: bpf: Add bpf_trampoline_multi_attach/detach functions Adding bpf_trampoline_multi_attach/detach functions that allows to attach/detach tracing program to multiple functions/trampolines. The attachment is defined with bpf_program and array of BTF ids of functions to attach the bpf program to. Adding bpf_tracing_multi_link object that holds all the attached trampolines and is initialized in attach and used in detach. The attachment allocates or uses currently existing trampoline for each function to attach and links it with the bpf program. The attach works as follows: - we get all the needed trampolines - lock them and add the bpf program to each (__bpf_trampoline_link_prog) - the trampoline_multi_ops passed in __bpf_trampoline_link_prog gathers ftrace_hash (ip -> trampoline) objects - we call update_ftrace_direct_add/mod to update needed locations - we unlock all the trampolines The detach works as follows: - we lock all the needed trampolines - remove the program from each (__bpf_trampoline_unlink_prog) - the trampoline_multi_ops passed in __bpf_trampoline_unlink_prog gathers ftrace_hash (ip -> trampoline) objects - we call update_ftrace_direct_del/mod to update needed locations - we unlock and put all the trampolines We store the old image/flags in the trampoline before the update and use it in case we need to rollback the attachment. We keep the ftrace_hash objects allocated during attach in the link so they can be used for detach as well. Adding trampoline_(un)lock_all functions to (un)lock all trampolines to gate the tracing_multi attachment. Note this is supported only for archs (x86_64) with ftrace direct and have single ops support. CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS && CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS It also needs CONFIG_BPF_SYSCALL enabled. Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20260606123955.345967-13-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 43 +++++++ include/linux/bpf_verifier.h | 4 + kernel/bpf/trampoline.c | 271 +++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 55 +++++++++ 4 files changed, 373 insertions(+) (limited to 'kernel/bpf') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b52dc64ec92d..bcf70f810d2c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -33,6 +33,7 @@ #include #include #include +#include #include struct bpf_verifier_env; @@ -1373,6 +1374,11 @@ struct bpf_trampoline { int progs_cnt[BPF_TRAMP_MAX]; /* Executable image of trampoline */ struct bpf_tramp_image *cur_image; + /* Used as temporary old image storage for multi_attach */ + struct { + struct bpf_tramp_image *old_image; + u32 old_flags; + } multi_attach; }; struct bpf_attach_target_info { @@ -1470,6 +1476,8 @@ static inline int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u6 return 0; } +struct bpf_tracing_multi_link; + #ifdef CONFIG_BPF_JIT int bpf_trampoline_link_prog(struct bpf_tramp_node *node, struct bpf_trampoline *tr, @@ -1482,6 +1490,11 @@ struct bpf_trampoline *bpf_trampoline_get(u64 key, void bpf_trampoline_put(struct bpf_trampoline *tr); int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_funcs); +int bpf_trampoline_multi_attach(struct bpf_prog *prog, u32 *ids, + struct bpf_tracing_multi_link *link); +int bpf_trampoline_multi_detach(struct bpf_prog *prog, + struct bpf_tracing_multi_link *link); + /* * When the architecture supports STATIC_CALL replace the bpf_dispatcher_fn * indirection with a direct call to the bpf program. If the architecture does @@ -1594,6 +1607,16 @@ static inline bool bpf_prog_has_trampoline(const struct bpf_prog *prog) { return false; } +static inline int bpf_trampoline_multi_attach(struct bpf_prog *prog, u32 *ids, + struct bpf_tracing_multi_link *link) +{ + return -ENOTSUPP; +} +static inline int bpf_trampoline_multi_detach(struct bpf_prog *prog, + struct bpf_tracing_multi_link *link) +{ + return -ENOTSUPP; +} #endif struct bpf_func_info_aux { @@ -1932,6 +1955,26 @@ struct bpf_tracing_link { struct bpf_prog *tgt_prog; }; +struct bpf_tracing_multi_node { + struct bpf_tramp_node node; + struct bpf_trampoline *trampoline; + struct ftrace_func_entry entry; +}; + +struct bpf_tracing_multi_data { + struct ftrace_hash *unreg; + struct ftrace_hash *modify; + struct ftrace_hash *reg; + struct ftrace_func_entry *entry; +}; + +struct bpf_tracing_multi_link { + struct bpf_link link; + struct bpf_tracing_multi_data data; + int nodes_cnt; + struct bpf_tracing_multi_node nodes[] __counted_by(nodes_cnt); +}; + struct bpf_raw_tp_link { struct bpf_link link; struct bpf_raw_event_map *btp; diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index c248ff41f42a..d57b339a8cb8 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -1591,6 +1591,10 @@ int bpf_add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, u16 offset); int bpf_fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, struct bpf_insn *insn_buf, int insn_idx, int *cnt); +/* Functions exported from verifier.c, used by trampoline.c */ +int bpf_check_attach_btf_id_multi(struct btf *btf, struct bpf_prog *prog, u32 btf_id, + struct bpf_attach_target_info *tgt_info); + /* Functions in fixups.c, called from bpf_check() */ int bpf_remove_fastcall_spills_fills(struct bpf_verifier_env *env); int bpf_optimize_bpf_loop(struct bpf_verifier_env *env); diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index ae7e4fdfe2a3..957e5d7f9554 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -1447,6 +1447,277 @@ int __weak arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, return -ENOTSUPP; } +#if defined(CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS) && \ + defined(CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS) && \ + defined(CONFIG_BPF_SYSCALL) + +static void trampoline_lock_all(void) +{ + int i; + + for (i = 0; i < TRAMPOLINE_LOCKS_TABLE_SIZE; i++) + mutex_lock(&trampoline_locks[i].mutex); +} + +static void trampoline_unlock_all(void) +{ + int i; + + for (i = 0; i < TRAMPOLINE_LOCKS_TABLE_SIZE; i++) + mutex_unlock(&trampoline_locks[i].mutex); +} + +static void remove_tracing_multi_data(struct bpf_tracing_multi_data *data) +{ + ftrace_hash_remove(data->reg); + ftrace_hash_remove(data->unreg); + ftrace_hash_remove(data->modify); +} + +static void clear_tracing_multi_data(struct bpf_tracing_multi_data *data) +{ + remove_tracing_multi_data(data); + + free_ftrace_hash(data->reg); + free_ftrace_hash(data->unreg); + free_ftrace_hash(data->modify); +} + +static int init_tracing_multi_data(struct bpf_tracing_multi_data *data) +{ + data->reg = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS); + data->unreg = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS); + data->modify = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS); + + if (!data->reg || !data->unreg || !data->modify) { + clear_tracing_multi_data(data); + return -ENOMEM; + } + return 0; +} + +static void ftrace_hash_add(struct ftrace_hash *hash, struct ftrace_func_entry *entry, + unsigned long ip, unsigned long direct) +{ + entry->ip = ip; + entry->direct = direct; + add_ftrace_hash_entry(hash, entry); +} + +static int register_fentry_multi(struct bpf_trampoline *tr, struct bpf_tramp_image *im, void *ptr) +{ + unsigned long addr = (unsigned long) im->image; + unsigned long ip = ftrace_location(tr->ip); + struct bpf_tracing_multi_data *data = ptr; + + if (bpf_trampoline_use_jmp(tr->flags)) + addr = ftrace_jmp_set(addr); + + ftrace_hash_add(data->reg, data->entry, ip, addr); + tr->cur_image = im; + return 0; +} + +static int unregister_fentry_multi(struct bpf_trampoline *tr, u32 orig_flags, void *ptr) +{ + unsigned long addr = (unsigned long) tr->cur_image->image; + unsigned long ip = ftrace_location(tr->ip); + struct bpf_tracing_multi_data *data = ptr; + + if (bpf_trampoline_use_jmp(tr->flags)) + addr = ftrace_jmp_set(addr); + + ftrace_hash_add(data->unreg, data->entry, ip, addr); + tr->cur_image = NULL; + return 0; +} + +static int modify_fentry_multi(struct bpf_trampoline *tr, u32 orig_flags, struct bpf_tramp_image *im, + bool lock_direct_mutex, void *ptr) +{ + unsigned long addr = (unsigned long) im->image; + unsigned long ip = ftrace_location(tr->ip); + struct bpf_tracing_multi_data *data = ptr; + + if (bpf_trampoline_use_jmp(tr->flags)) + addr = ftrace_jmp_set(addr); + + ftrace_hash_add(data->modify, data->entry, ip, addr); + tr->cur_image = im; + return 0; +} + +static const struct bpf_trampoline_ops trampoline_multi_ops = { + .register_fentry = register_fentry_multi, + .unregister_fentry = unregister_fentry_multi, + .modify_fentry = modify_fentry_multi, +}; + +static void bpf_trampoline_multi_attach_init(struct bpf_trampoline *tr) +{ + tr->multi_attach.old_image = tr->cur_image; + tr->multi_attach.old_flags = tr->flags; +} + +static void bpf_trampoline_multi_attach_free(struct bpf_trampoline *tr) +{ + if (tr->multi_attach.old_image) + bpf_tramp_image_put(tr->multi_attach.old_image); + + tr->multi_attach.old_image = NULL; + tr->multi_attach.old_flags = 0; +} + +static void bpf_trampoline_multi_attach_rollback(struct bpf_trampoline *tr) +{ + if (tr->cur_image) + bpf_tramp_image_put(tr->cur_image); + tr->cur_image = tr->multi_attach.old_image; + tr->flags = tr->multi_attach.old_flags; + + tr->multi_attach.old_image = NULL; + tr->multi_attach.old_flags = 0; +} + +#define for_each_mnode_cnt(mnode, link, cnt) \ + for (i = 0, mnode = &link->nodes[i]; i < cnt; i++, mnode = &link->nodes[i]) + +#define for_each_mnode(mnode, link) \ + for_each_mnode_cnt(mnode, link, link->nodes_cnt) + +int bpf_trampoline_multi_attach(struct bpf_prog *prog, u32 *ids, + struct bpf_tracing_multi_link *link) +{ + struct bpf_tracing_multi_data *data = &link->data; + struct bpf_attach_target_info tgt_info = {}; + struct btf *btf = prog->aux->attach_btf; + struct bpf_tracing_multi_node *mnode; + struct bpf_trampoline *tr; + int i, err, rollback_cnt; + u64 key; + + for_each_mnode(mnode, link) { + rollback_cnt = i; + + err = bpf_check_attach_btf_id_multi(btf, prog, ids[i], &tgt_info); + if (err) + goto rollback_put; + + key = bpf_trampoline_compute_key(NULL, btf, ids[i]); + + tr = bpf_trampoline_get(key, &tgt_info); + if (!tr) { + err = -ENOMEM; + goto rollback_put; + } + + mnode->trampoline = tr; + mnode->node.link = &link->link; + + cond_resched(); + } + + err = init_tracing_multi_data(data); + if (err) { + rollback_cnt = link->nodes_cnt; + goto rollback_put; + } + + trampoline_lock_all(); + + for_each_mnode(mnode, link) { + bpf_trampoline_multi_attach_init(mnode->trampoline); + + data->entry = &mnode->entry; + err = __bpf_trampoline_link_prog(&mnode->node, mnode->trampoline, NULL, + &trampoline_multi_ops, data); + if (err) { + rollback_cnt = i; + goto rollback_unlink; + } + } + + rollback_cnt = link->nodes_cnt; + if (ftrace_hash_count(data->reg)) { + err = update_ftrace_direct_add(&direct_ops, data->reg); + if (err) + goto rollback_unlink; + } + + if (ftrace_hash_count(data->modify)) { + err = update_ftrace_direct_mod(&direct_ops, data->modify, true); + if (err) { + if (ftrace_hash_count(data->reg)) + WARN_ON_ONCE(update_ftrace_direct_del(&direct_ops, data->reg)); + goto rollback_unlink; + } + } + + for_each_mnode(mnode, link) + bpf_trampoline_multi_attach_free(mnode->trampoline); + + trampoline_unlock_all(); + + remove_tracing_multi_data(data); + return 0; + +rollback_unlink: + for_each_mnode_cnt(mnode, link, rollback_cnt) { + bpf_trampoline_remove_prog(mnode->trampoline, &mnode->node); + bpf_trampoline_multi_attach_rollback(mnode->trampoline); + } + + trampoline_unlock_all(); + + clear_tracing_multi_data(data); + rollback_cnt = link->nodes_cnt; + +rollback_put: + for_each_mnode_cnt(mnode, link, rollback_cnt) + bpf_trampoline_put(mnode->trampoline); + + return err; +} + +int bpf_trampoline_multi_detach(struct bpf_prog *prog, struct bpf_tracing_multi_link *link) +{ + struct bpf_tracing_multi_data *data = &link->data; + struct bpf_tracing_multi_node *mnode; + int i; + + trampoline_lock_all(); + + for_each_mnode(mnode, link) { + data->entry = &mnode->entry; + bpf_trampoline_multi_attach_init(mnode->trampoline); + WARN_ON_ONCE(__bpf_trampoline_unlink_prog(&mnode->node, mnode->trampoline, + NULL, &trampoline_multi_ops, data)); + } + + if (ftrace_hash_count(data->unreg)) + WARN_ON_ONCE(update_ftrace_direct_del(&direct_ops, data->unreg)); + if (ftrace_hash_count(data->modify)) + WARN_ON_ONCE(update_ftrace_direct_mod(&direct_ops, data->modify, true)); + + for_each_mnode(mnode, link) + bpf_trampoline_multi_attach_free(mnode->trampoline); + + trampoline_unlock_all(); + + for_each_mnode(mnode, link) + bpf_trampoline_put(mnode->trampoline); + + clear_tracing_multi_data(data); + return 0; +} + +#undef for_each_mnode_cnt +#undef for_each_mnode + +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS && + CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS && + CONFIG_BPF_SYSCALL */ + static int __init init_trampolines(void) { int i; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index df21592fc560..5c594047ff0a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -19328,6 +19328,61 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) return 0; } +int bpf_check_attach_btf_id_multi(struct btf *btf, struct bpf_prog *prog, u32 btf_id, + struct bpf_attach_target_info *tgt_info) +{ + const struct btf_type *t; + unsigned long addr; + const char *tname; + int err; + + if (!btf_id || !btf) + return -EINVAL; + + /* Check noreturn attachment. */ + if (prog->expected_attach_type == BPF_TRACE_FEXIT_MULTI && + btf_id_set_contains(&noreturn_deny, btf_id)) + return -EINVAL; + /* Check denied attachment. */ + if (btf_id_set_contains(&btf_id_deny, btf_id)) + return -EINVAL; + + /* Check and get function target data. */ + t = btf_type_by_id(btf, btf_id); + if (!t) + return -EINVAL; + tname = btf_name_by_offset(btf, t->name_off); + if (!tname) + return -EINVAL; + if (!btf_type_is_func(t)) + return -EINVAL; + t = btf_type_by_id(btf, t->type); + if (!btf_type_is_func_proto(t)) + return -EINVAL; + err = btf_distill_func_proto(NULL, btf, t, tname, &tgt_info->fmodel); + if (err < 0) + return err; + if (btf_is_module(btf)) { + /* The bpf program already holds reference to module. */ + if (WARN_ON_ONCE(!prog->aux->mod)) + return -EINVAL; + addr = find_kallsyms_symbol_value(prog->aux->mod, tname); + } else { + addr = kallsyms_lookup_name(tname); + } + if (!addr || !ftrace_location(addr)) + return -ENOENT; + + /* Check sleepable program attachment. */ + if (prog->sleepable) { + err = btf_id_allow_sleepable(btf_id, addr, prog, btf); + if (err) + return err; + } + tgt_info->tgt_addr = addr; + return 0; +} + struct btf *bpf_get_btf_vmlinux(void) { if (!btf_vmlinux && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) { -- cgit v1.2.3 From c1d32dea5d4694c1a6c14d1d1c3192d0e18ffc7b Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 6 Jun 2026 14:39:38 +0200 Subject: bpf: Add support for tracing multi link Adding new link to allow to attach program to multiple function BTF IDs. The link is represented by struct bpf_tracing_multi_link. To configure the link, new fields are added to bpf_attr::link_create to pass array of BTF IDs; struct { __aligned_u64 ids; __u32 cnt; } tracing_multi; Each BTF ID represents function (BTF_KIND_FUNC) that the link will attach bpf program to. We use previously added bpf_trampoline_multi_attach/detach functions to attach/detach the link. The linkinfo/fdinfo callbacks will be implemented in following changes. Note this is supported only for archs (x86_64) with ftrace direct and have single ops support. CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS && CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS Note using sort_r (instead of plain sort) in check_dup_ids, because we will use the swap callback in following changes. Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20260606123955.345967-14-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_types.h | 1 + include/linux/trace_events.h | 6 ++ include/uapi/linux/bpf.h | 5 ++ kernel/bpf/syscall.c | 2 + kernel/trace/bpf_trace.c | 130 +++++++++++++++++++++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 6 ++ tools/lib/bpf/libbpf.c | 1 + 7 files changed, 151 insertions(+) (limited to 'kernel/bpf') diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 56e4c3f983d3..e5906829aa6f 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -156,3 +156,4 @@ BPF_LINK_TYPE(BPF_LINK_TYPE_PERF_EVENT, perf) BPF_LINK_TYPE(BPF_LINK_TYPE_KPROBE_MULTI, kprobe_multi) BPF_LINK_TYPE(BPF_LINK_TYPE_STRUCT_OPS, struct_ops) BPF_LINK_TYPE(BPF_LINK_TYPE_UPROBE_MULTI, uprobe_multi) +BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING_MULTI, tracing_multi) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index d49338c44014..308c76b57d13 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -787,6 +787,7 @@ int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, unsigned long *missed); int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); +int bpf_tracing_multi_attach(struct bpf_prog *prog, const union bpf_attr *attr); #else static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) { @@ -844,6 +845,11 @@ bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { return -EOPNOTSUPP; } +static inline int +bpf_tracing_multi_attach(struct bpf_prog *prog, const union bpf_attr *attr) +{ + return -EOPNOTSUPP; +} #endif enum { diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 28d127e5040a..9f603731d267 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1182,6 +1182,7 @@ enum bpf_link_type { BPF_LINK_TYPE_UPROBE_MULTI = 12, BPF_LINK_TYPE_NETKIT = 13, BPF_LINK_TYPE_SOCKMAP = 14, + BPF_LINK_TYPE_TRACING_MULTI = 15, __MAX_BPF_LINK_TYPE, }; @@ -1877,6 +1878,10 @@ union bpf_attr { }; __u64 expected_revision; } cgroup; + struct { + __aligned_u64 ids; + __u32 cnt; + } tracing_multi; }; } link_create; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index efdd6639a598..d551b9da0cfb 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -5885,6 +5885,8 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) ret = bpf_iter_link_attach(attr, uattr, prog); else if (prog->expected_attach_type == BPF_LSM_CGROUP) ret = cgroup_bpf_link_attach(attr, prog); + else if (is_tracing_multi(prog->expected_attach_type)) + ret = bpf_tracing_multi_attach(prog, attr); else ret = bpf_tracing_prog_attach(prog, attr->link_create.target_fd, diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index d853f97bd154..9e3cb547651e 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -42,6 +42,7 @@ #define MAX_UPROBE_MULTI_CNT (1U << 20) #define MAX_KPROBE_MULTI_CNT (1U << 20) +#define MAX_TRACING_MULTI_CNT (1U << 20) #ifdef CONFIG_MODULES struct bpf_trace_module { @@ -3641,3 +3642,132 @@ __bpf_kfunc int bpf_copy_from_user_task_str_dynptr(const struct bpf_dynptr *dptr } __bpf_kfunc_end_defs(); + +#if defined(CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS) && \ + defined(CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS) + +static void bpf_tracing_multi_link_release(struct bpf_link *link) +{ + struct bpf_tracing_multi_link *tr_link = + container_of(link, struct bpf_tracing_multi_link, link); + + WARN_ON_ONCE(bpf_trampoline_multi_detach(link->prog, tr_link)); +} + +static void bpf_tracing_multi_link_dealloc(struct bpf_link *link) +{ + struct bpf_tracing_multi_link *tr_link = + container_of(link, struct bpf_tracing_multi_link, link); + + kvfree(tr_link); +} + +static const struct bpf_link_ops bpf_tracing_multi_link_lops = { + .release = bpf_tracing_multi_link_release, + .dealloc_deferred = bpf_tracing_multi_link_dealloc, +}; + +static int ids_cmp_r(const void *pa, const void *pb, const void *priv __maybe_unused) +{ + u32 a = *(u32 *) pa; + u32 b = *(u32 *) pb; + + return (a > b) - (a < b); +} + +static void ids_swap_r(void *a, void *b, int size __maybe_unused, + const void *priv __maybe_unused) +{ + u32 *id_a = a, *id_b = b; + + swap(*id_a, *id_b); +} + +static int check_dup_ids(u32 *ids, u32 cnt) +{ + int err = 0; + + /* + * Sort ids array (together with cookies array if defined) + * and check it for duplicates. The ids and cookies arrays + * are left sorted. + */ + sort_r_nonatomic(ids, cnt, sizeof(ids[0]), ids_cmp_r, ids_swap_r, NULL); + + for (int i = 1; i < cnt; i++) { + if (ids[i] == ids[i - 1]) { + err = -EINVAL; + break; + } + } + return err; +} + +int bpf_tracing_multi_attach(struct bpf_prog *prog, const union bpf_attr *attr) +{ + struct bpf_tracing_multi_link *link = NULL; + struct bpf_link_primer link_primer; + u32 cnt, *ids = NULL; + u32 __user *uids; + int err; + + uids = u64_to_user_ptr(attr->link_create.tracing_multi.ids); + cnt = attr->link_create.tracing_multi.cnt; + + if (!cnt || !uids) + return -EINVAL; + if (cnt > MAX_TRACING_MULTI_CNT) + return -E2BIG; + if (attr->link_create.flags || attr->link_create.target_fd) + return -EINVAL; + + ids = kvmalloc_objs(*ids, cnt); + if (!ids) + return -ENOMEM; + + if (copy_from_user(ids, uids, cnt * sizeof(*ids))) { + err = -EFAULT; + goto error; + } + + err = check_dup_ids(ids, cnt); + if (err) + goto error; + + link = kvzalloc_flex(*link, nodes, cnt); + if (!link) { + err = -ENOMEM; + goto error; + } + + bpf_link_init(&link->link, BPF_LINK_TYPE_TRACING_MULTI, + &bpf_tracing_multi_link_lops, prog, prog->expected_attach_type); + + err = bpf_link_prime(&link->link, &link_primer); + if (err) + goto error; + + link->nodes_cnt = cnt; + + err = bpf_trampoline_multi_attach(prog, ids, link); + kvfree(ids); + if (err) { + bpf_link_cleanup(&link_primer); + return err; + } + return bpf_link_settle(&link_primer); + +error: + kvfree(ids); + kvfree(link); + return err; +} + +#else + +int bpf_tracing_multi_attach(struct bpf_prog *prog, const union bpf_attr *attr) +{ + return -EOPNOTSUPP; +} + +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS && CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS */ diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 1b9aacf468e5..9f603731d267 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1182,6 +1182,7 @@ enum bpf_link_type { BPF_LINK_TYPE_UPROBE_MULTI = 12, BPF_LINK_TYPE_NETKIT = 13, BPF_LINK_TYPE_SOCKMAP = 14, + BPF_LINK_TYPE_TRACING_MULTI = 15, __MAX_BPF_LINK_TYPE, }; @@ -1877,6 +1878,10 @@ union bpf_attr { }; __u64 expected_revision; } cgroup; + struct { + __aligned_u64 ids; + __u32 cnt; + } tracing_multi; }; } link_create; @@ -7254,6 +7259,7 @@ enum { TCP_BPF_SOCK_OPS_CB_FLAGS = 1008, /* Get or Set TCP sock ops flags */ SK_BPF_CB_FLAGS = 1009, /* Get or set sock ops flags in socket */ SK_BPF_BYPASS_PROT_MEM = 1010, /* Get or Set sk->sk_bypass_prot_mem */ + }; enum { diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 1b09381d16ff..59405d318624 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -156,6 +156,7 @@ static const char * const link_type_name[] = { [BPF_LINK_TYPE_UPROBE_MULTI] = "uprobe_multi", [BPF_LINK_TYPE_NETKIT] = "netkit", [BPF_LINK_TYPE_SOCKMAP] = "sockmap", + [BPF_LINK_TYPE_TRACING_MULTI] = "tracing_multi", }; static const char * const map_type_name[] = { -- cgit v1.2.3 From 46b42af27d40021a97c147d23de8cb29eb5020df Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 6 Jun 2026 14:39:39 +0200 Subject: bpf: Add support for tracing_multi link cookies Add support to specify cookies for tracing_multi link. Cookies are provided in array where each value is paired with provided BTF ID value with the same array index. Such cookie can be retrieved by bpf program with bpf_get_attach_cookie helper call. We need to sort cookies array together with ids array in check_dup_ids, to keep the id->cookie relation. Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20260606123955.345967-15-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 1 + kernel/bpf/trampoline.c | 1 + kernel/trace/bpf_trace.c | 37 +++++++++++++++++++++++++++++++++---- tools/include/uapi/linux/bpf.h | 1 + 5 files changed, 37 insertions(+), 4 deletions(-) (limited to 'kernel/bpf') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index bcf70f810d2c..e9d2b42a3981 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1971,6 +1971,7 @@ struct bpf_tracing_multi_data { struct bpf_tracing_multi_link { struct bpf_link link; struct bpf_tracing_multi_data data; + u64 *cookies; int nodes_cnt; struct bpf_tracing_multi_node nodes[] __counted_by(nodes_cnt); }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 9f603731d267..569c15e1cae3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1880,6 +1880,7 @@ union bpf_attr { } cgroup; struct { __aligned_u64 ids; + __aligned_u64 cookies; __u32 cnt; } tracing_multi; }; diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 957e5d7f9554..a3537fda50cf 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -1613,6 +1613,7 @@ int bpf_trampoline_multi_attach(struct bpf_prog *prog, u32 *ids, mnode->trampoline = tr; mnode->node.link = &link->link; + mnode->node.cookie = link->cookies ? link->cookies[i] : 0; cond_resched(); } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 9e3cb547651e..e33492739ed1 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -3659,6 +3659,7 @@ static void bpf_tracing_multi_link_dealloc(struct bpf_link *link) struct bpf_tracing_multi_link *tr_link = container_of(link, struct bpf_tracing_multi_link, link); + kvfree(tr_link->cookies); kvfree(tr_link); } @@ -3678,13 +3679,24 @@ static int ids_cmp_r(const void *pa, const void *pb, const void *priv __maybe_un static void ids_swap_r(void *a, void *b, int size __maybe_unused, const void *priv __maybe_unused) { - u32 *id_a = a, *id_b = b; + u64 *cookie_a, *cookie_b, *cookies; + u32 *id_a = a, *id_b = b, *ids; + void **data = (void **) priv; + ids = data[0]; + cookies = data[1]; + + if (cookies) { + cookie_a = cookies + (id_a - ids); + cookie_b = cookies + (id_b - ids); + swap(*cookie_a, *cookie_b); + } swap(*id_a, *id_b); } -static int check_dup_ids(u32 *ids, u32 cnt) +static int check_dup_ids(u32 *ids, u64 *cookies, u32 cnt) { + void *data[2] = { ids, cookies }; int err = 0; /* @@ -3692,7 +3704,7 @@ static int check_dup_ids(u32 *ids, u32 cnt) * and check it for duplicates. The ids and cookies arrays * are left sorted. */ - sort_r_nonatomic(ids, cnt, sizeof(ids[0]), ids_cmp_r, ids_swap_r, NULL); + sort_r_nonatomic(ids, cnt, sizeof(ids[0]), ids_cmp_r, ids_swap_r, data); for (int i = 1; i < cnt; i++) { if (ids[i] == ids[i - 1]) { @@ -3708,6 +3720,8 @@ int bpf_tracing_multi_attach(struct bpf_prog *prog, const union bpf_attr *attr) struct bpf_tracing_multi_link *link = NULL; struct bpf_link_primer link_primer; u32 cnt, *ids = NULL; + u64 __user *ucookies; + u64 *cookies = NULL; u32 __user *uids; int err; @@ -3730,7 +3744,20 @@ int bpf_tracing_multi_attach(struct bpf_prog *prog, const union bpf_attr *attr) goto error; } - err = check_dup_ids(ids, cnt); + ucookies = u64_to_user_ptr(attr->link_create.tracing_multi.cookies); + if (ucookies) { + cookies = kvmalloc_objs(*cookies, cnt); + if (!cookies) { + err = -ENOMEM; + goto error; + } + if (copy_from_user(cookies, ucookies, cnt * sizeof(*cookies))) { + err = -EFAULT; + goto error; + } + } + + err = check_dup_ids(ids, cookies, cnt); if (err) goto error; @@ -3748,6 +3775,7 @@ int bpf_tracing_multi_attach(struct bpf_prog *prog, const union bpf_attr *attr) goto error; link->nodes_cnt = cnt; + link->cookies = cookies; err = bpf_trampoline_multi_attach(prog, ids, link); kvfree(ids); @@ -3758,6 +3786,7 @@ int bpf_tracing_multi_attach(struct bpf_prog *prog, const union bpf_attr *attr) return bpf_link_settle(&link_primer); error: + kvfree(cookies); kvfree(ids); kvfree(link); return err; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 9f603731d267..569c15e1cae3 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1880,6 +1880,7 @@ union bpf_attr { } cgroup; struct { __aligned_u64 ids; + __aligned_u64 cookies; __u32 cnt; } tracing_multi; }; -- cgit v1.2.3 From ba042ed6446fc524c1d804227765b45616f9cba3 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 6 Jun 2026 14:39:40 +0200 Subject: bpf: Add support for tracing_multi link session Adding support to use session attachment with tracing_multi link. Adding new BPF_TRACE_FSESSION_MULTI program attach type, that follows the BPF_TRACE_FSESSION behaviour but on the tracing_multi link. Such program is called on entry and exit of the attached function and allows to pass cookie value from entry to exit execution. Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20260606123955.345967-16-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 +++++- include/uapi/linux/bpf.h | 1 + kernel/bpf/fixups.c | 1 + kernel/bpf/syscall.c | 1 + kernel/bpf/trampoline.c | 44 ++++++++++++++++++++++++++++++++++-------- kernel/bpf/verifier.c | 20 ++++++++++++++----- kernel/trace/bpf_trace.c | 15 +++++++++++++- net/bpf/test_run.c | 1 + tools/include/uapi/linux/bpf.h | 1 + tools/lib/bpf/libbpf.c | 1 + 10 files changed, 76 insertions(+), 15 deletions(-) (limited to 'kernel/bpf') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e9d2b42a3981..62bba7a4876f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1972,6 +1972,7 @@ struct bpf_tracing_multi_link { struct bpf_link link; struct bpf_tracing_multi_data data; u64 *cookies; + struct bpf_tramp_node *fexits; int nodes_cnt; struct bpf_tracing_multi_node nodes[] __counted_by(nodes_cnt); }; @@ -2159,7 +2160,8 @@ static inline void bpf_prog_put_recursion_context(struct bpf_prog *prog) static inline bool is_tracing_multi(enum bpf_attach_type type) { - return type == BPF_TRACE_FENTRY_MULTI || type == BPF_TRACE_FEXIT_MULTI; + return type == BPF_TRACE_FENTRY_MULTI || type == BPF_TRACE_FEXIT_MULTI || + type == BPF_TRACE_FSESSION_MULTI; } #if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL) @@ -2286,6 +2288,8 @@ static inline int bpf_fsession_cnt(struct bpf_tramp_nodes *nodes) for (int i = 0; i < nodes[BPF_TRAMP_FENTRY].nr_nodes; i++) { if (fentries.nodes[i]->link->prog->expected_attach_type == BPF_TRACE_FSESSION) cnt++; + if (fentries.nodes[i]->link->prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI) + cnt++; } return cnt; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 569c15e1cae3..11dd610fa5fa 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1158,6 +1158,7 @@ enum bpf_attach_type { BPF_TRACE_FSESSION, BPF_TRACE_FENTRY_MULTI, BPF_TRACE_FEXIT_MULTI, + BPF_TRACE_FSESSION_MULTI, __MAX_BPF_ATTACH_TYPE }; diff --git a/kernel/bpf/fixups.c b/kernel/bpf/fixups.c index 0cf9735929f5..3cf2cc6e3ab6 100644 --- a/kernel/bpf/fixups.c +++ b/kernel/bpf/fixups.c @@ -2187,6 +2187,7 @@ patch_map_ops_generic: if (eatype == BPF_TRACE_FEXIT || eatype == BPF_TRACE_FSESSION || eatype == BPF_TRACE_FEXIT_MULTI || + eatype == BPF_TRACE_FSESSION_MULTI || eatype == BPF_MODIFY_RETURN) { /* Load nr_args from ctx - 8 */ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index d551b9da0cfb..d4188a992bd8 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4498,6 +4498,7 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: case BPF_TRACE_FSESSION: + case BPF_TRACE_FSESSION_MULTI: case BPF_TRACE_FENTRY_MULTI: case BPF_TRACE_FEXIT_MULTI: case BPF_MODIFY_RETURN: diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index a3537fda50cf..1a721fc4bef5 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -183,7 +183,8 @@ bool bpf_prog_has_trampoline(const struct bpf_prog *prog) case BPF_PROG_TYPE_TRACING: if (eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT || eatype == BPF_MODIFY_RETURN || eatype == BPF_TRACE_FSESSION || - eatype == BPF_TRACE_FENTRY_MULTI || eatype == BPF_TRACE_FEXIT_MULTI) + eatype == BPF_TRACE_FENTRY_MULTI || eatype == BPF_TRACE_FEXIT_MULTI || + eatype == BPF_TRACE_FSESSION_MULTI) return true; return false; case BPF_PROG_TYPE_LSM: @@ -790,6 +791,7 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog) case BPF_TRACE_FEXIT_MULTI: return BPF_TRAMP_FEXIT; case BPF_TRACE_FSESSION: + case BPF_TRACE_FSESSION_MULTI: return BPF_TRAMP_FSESSION; case BPF_LSM_MAC: if (!prog->aux->attach_func_proto->type) @@ -822,13 +824,30 @@ static int bpf_freplace_check_tgt_prog(struct bpf_prog *tgt_prog) return 0; } +static struct bpf_tramp_node *fsession_exit(struct bpf_tramp_node *node) +{ + if (node->link->type == BPF_LINK_TYPE_TRACING) { + struct bpf_tracing_link *link; + + link = container_of(node->link, struct bpf_tracing_link, link.link); + return &link->fexit; + } else if (node->link->type == BPF_LINK_TYPE_TRACING_MULTI) { + struct bpf_tracing_multi_link *link; + struct bpf_tracing_multi_node *mnode; + + link = container_of(node->link, struct bpf_tracing_multi_link, link); + mnode = container_of(node, struct bpf_tracing_multi_node, node); + return &link->fexits[mnode - link->nodes]; + } + return NULL; +} + static int bpf_trampoline_add_prog(struct bpf_trampoline *tr, struct bpf_tramp_node *node, int cnt) { - struct bpf_tracing_link *tr_link = NULL; enum bpf_tramp_prog_type kind; - struct bpf_tramp_node *node_existing; + struct bpf_tramp_node *node_existing, *fexit; struct hlist_head *prog_list; kind = bpf_attach_type_to_tramp(node->link->prog); @@ -853,8 +872,10 @@ static int bpf_trampoline_add_prog(struct bpf_trampoline *tr, hlist_add_head(&node->tramp_hlist, prog_list); if (kind == BPF_TRAMP_FSESSION) { tr->progs_cnt[BPF_TRAMP_FENTRY]++; - tr_link = container_of(node, struct bpf_tracing_link, link.node); - hlist_add_head(&tr_link->fexit.tramp_hlist, &tr->progs_hlist[BPF_TRAMP_FEXIT]); + fexit = fsession_exit(node); + if (WARN_ON_ONCE(!fexit)) + return -EINVAL; + hlist_add_head(&fexit->tramp_hlist, &tr->progs_hlist[BPF_TRAMP_FEXIT]); tr->progs_cnt[BPF_TRAMP_FEXIT]++; } else { tr->progs_cnt[kind]++; @@ -865,13 +886,15 @@ static int bpf_trampoline_add_prog(struct bpf_trampoline *tr, static void bpf_trampoline_remove_prog(struct bpf_trampoline *tr, struct bpf_tramp_node *node) { - struct bpf_tracing_link *tr_link; enum bpf_tramp_prog_type kind; + struct bpf_tramp_node *fexit; kind = bpf_attach_type_to_tramp(node->link->prog); if (kind == BPF_TRAMP_FSESSION) { - tr_link = container_of(node, struct bpf_tracing_link, link.node); - hlist_del_init(&tr_link->fexit.tramp_hlist); + fexit = fsession_exit(node); + if (WARN_ON_ONCE(!fexit)) + return; + hlist_del_init(&fexit->tramp_hlist); tr->progs_cnt[BPF_TRAMP_FEXIT]--; kind = BPF_TRAMP_FENTRY; } @@ -1615,6 +1638,11 @@ int bpf_trampoline_multi_attach(struct bpf_prog *prog, u32 *ids, mnode->node.link = &link->link; mnode->node.cookie = link->cookies ? link->cookies[i] : 0; + if (prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI) { + link->fexits[i].link = &link->link; + link->fexits[i].cookie = link->cookies ? link->cookies[i] : 0; + } + cond_resched(); } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5c594047ff0a..0c1cf506c219 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -16384,6 +16384,7 @@ static bool return_retval_range(struct bpf_verifier_env *env, struct bpf_retval_ case BPF_TRACE_FSESSION: case BPF_TRACE_FENTRY_MULTI: case BPF_TRACE_FEXIT_MULTI: + case BPF_TRACE_FSESSION_MULTI: *range = retval_range(0, 0); break; case BPF_TRACE_RAW_TP: @@ -18952,7 +18953,8 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, tgt_prog->expected_attach_type == BPF_TRACE_FEXIT || tgt_prog->expected_attach_type == BPF_TRACE_FENTRY_MULTI || tgt_prog->expected_attach_type == BPF_TRACE_FEXIT_MULTI || - tgt_prog->expected_attach_type == BPF_TRACE_FSESSION)) { + tgt_prog->expected_attach_type == BPF_TRACE_FSESSION || + tgt_prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI)) { /* Program extensions can extend all program types * except fentry/fexit. The reason is the following. * The fentry/fexit programs are used for performance @@ -19058,9 +19060,11 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: case BPF_TRACE_FSESSION: + case BPF_TRACE_FSESSION_MULTI: case BPF_TRACE_FENTRY_MULTI: case BPF_TRACE_FEXIT_MULTI: - if (prog->expected_attach_type == BPF_TRACE_FSESSION && + if ((prog->expected_attach_type == BPF_TRACE_FSESSION || + prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI) && !bpf_jit_supports_fsession()) { bpf_log(log, "JIT does not support fsession\n"); return -EOPNOTSUPP; @@ -19215,6 +19219,7 @@ static bool can_be_sleepable(struct bpf_prog *prog) case BPF_TRACE_RAW_TP: case BPF_TRACE_FENTRY_MULTI: case BPF_TRACE_FEXIT_MULTI: + case BPF_TRACE_FSESSION_MULTI: return true; default: return false; @@ -19301,6 +19306,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) return -EINVAL; } else if ((prog->expected_attach_type == BPF_TRACE_FEXIT || prog->expected_attach_type == BPF_TRACE_FSESSION || + prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI || prog->expected_attach_type == BPF_MODIFY_RETURN) && btf_id_set_contains(&noreturn_deny, btf_id)) { verbose(env, "Attaching fexit/fsession/fmod_ret to __noreturn function '%s' is rejected.\n", @@ -19340,7 +19346,8 @@ int bpf_check_attach_btf_id_multi(struct btf *btf, struct bpf_prog *prog, u32 bt return -EINVAL; /* Check noreturn attachment. */ - if (prog->expected_attach_type == BPF_TRACE_FEXIT_MULTI && + if ((prog->expected_attach_type == BPF_TRACE_FEXIT_MULTI || + prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI) && btf_id_set_contains(&noreturn_deny, btf_id)) return -EINVAL; /* Check denied attachment. */ @@ -19623,7 +19630,9 @@ int bpf_fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1); *cnt = 1; } else if (desc->func_id == special_kfunc_list[KF_bpf_session_is_return] && - env->prog->expected_attach_type == BPF_TRACE_FSESSION) { + (env->prog->expected_attach_type == BPF_TRACE_FSESSION || + env->prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI)) { + /* * inline the bpf_session_is_return() for fsession: * bool bpf_session_is_return(void *ctx) @@ -19636,7 +19645,8 @@ int bpf_fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, insn_buf[2] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1); *cnt = 3; } else if (desc->func_id == special_kfunc_list[KF_bpf_session_cookie] && - env->prog->expected_attach_type == BPF_TRACE_FSESSION) { + (env->prog->expected_attach_type == BPF_TRACE_FSESSION || + env->prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI)) { /* * inline bpf_session_cookie() for fsession: * __u64 *bpf_session_cookie(void *ctx) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index e33492739ed1..a0d688fffc5a 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1334,7 +1334,8 @@ static inline bool is_uprobe_session(const struct bpf_prog *prog) static inline bool is_trace_fsession(const struct bpf_prog *prog) { return prog->type == BPF_PROG_TYPE_TRACING && - prog->expected_attach_type == BPF_TRACE_FSESSION; + (prog->expected_attach_type == BPF_TRACE_FSESSION || + prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI); } static const struct bpf_func_proto * @@ -3659,6 +3660,7 @@ static void bpf_tracing_multi_link_dealloc(struct bpf_link *link) struct bpf_tracing_multi_link *tr_link = container_of(link, struct bpf_tracing_multi_link, link); + kvfree(tr_link->fexits); kvfree(tr_link->cookies); kvfree(tr_link); } @@ -3718,6 +3720,7 @@ static int check_dup_ids(u32 *ids, u64 *cookies, u32 cnt) int bpf_tracing_multi_attach(struct bpf_prog *prog, const union bpf_attr *attr) { struct bpf_tracing_multi_link *link = NULL; + struct bpf_tramp_node *fexits = NULL; struct bpf_link_primer link_primer; u32 cnt, *ids = NULL; u64 __user *ucookies; @@ -3761,6 +3764,14 @@ int bpf_tracing_multi_attach(struct bpf_prog *prog, const union bpf_attr *attr) if (err) goto error; + if (prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI) { + fexits = kvmalloc_objs(*fexits, cnt); + if (!fexits) { + err = -ENOMEM; + goto error; + } + } + link = kvzalloc_flex(*link, nodes, cnt); if (!link) { err = -ENOMEM; @@ -3776,6 +3787,7 @@ int bpf_tracing_multi_attach(struct bpf_prog *prog, const union bpf_attr *attr) link->nodes_cnt = cnt; link->cookies = cookies; + link->fexits = fexits; err = bpf_trampoline_multi_attach(prog, ids, link); kvfree(ids); @@ -3786,6 +3798,7 @@ int bpf_tracing_multi_attach(struct bpf_prog *prog, const union bpf_attr *attr) return bpf_link_settle(&link_primer); error: + kvfree(fexits); kvfree(cookies); kvfree(ids); kvfree(link); diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 67769c700cae..a831682ee982 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -705,6 +705,7 @@ int bpf_prog_test_run_tracing(struct bpf_prog *prog, case BPF_TRACE_FSESSION: case BPF_TRACE_FENTRY_MULTI: case BPF_TRACE_FEXIT_MULTI: + case BPF_TRACE_FSESSION_MULTI: if (bpf_fentry_test1(1) != 2 || bpf_fentry_test2(2, 3) != 5 || bpf_fentry_test3(4, 5, 6) != 15 || diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 569c15e1cae3..11dd610fa5fa 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1158,6 +1158,7 @@ enum bpf_attach_type { BPF_TRACE_FSESSION, BPF_TRACE_FENTRY_MULTI, BPF_TRACE_FEXIT_MULTI, + BPF_TRACE_FSESSION_MULTI, __MAX_BPF_ATTACH_TYPE }; diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 59405d318624..62f088359c5e 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -138,6 +138,7 @@ static const char * const attach_type_name[] = { [BPF_TRACE_UPROBE_SESSION] = "trace_uprobe_session", [BPF_TRACE_FENTRY_MULTI] = "trace_fentry_multi", [BPF_TRACE_FEXIT_MULTI] = "trace_fexit_multi", + [BPF_TRACE_FSESSION_MULTI] = "trace_fsession_multi", }; static const char * const link_type_name[] = { -- cgit v1.2.3 From 89edbdfc5d0308cef57b71359331de5c4ddbf763 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Sun, 7 Jun 2026 13:30:41 -0700 Subject: bpf: Fix NMI/tracepoint re-entry deadlock on lru locks NMI and tracepoint BPF programs can re-enter the per-CPU or global LRU lock that bpf_lru_pop_free()/push_free() already hold on the same CPU, AA-deadlocking. Lockdep reports "inconsistent {INITIAL USE} -> {IN-NMI}" on &l->lock (syzbot c69a0a2c816716f1e0d5) and "possible recursive locking detected" on &loc_l->lock (syzbot 18b26edb69b2e19f3b33). Prior trylock and rqspinlock based fixes (see links) were nacked because compromised on reliability. This patch converts every LRU lock site to rqspinlock_t and adds a recovery path for some failure windows to avoid node leaks. Failure recovery: - *_pop_free top-level: return NULL; prealloc_lru_pop() already treats that as no-free-element (-ENOMEM). - Cross-CPU steal: skip the victim's locked loc_l, try next CPU. - Post-steal local lock fail: publish stolen node to lockless per-CPU free_llist; next pop on this CPU picks it up. - push_free fail: mark node pending_free=1. __local_list_flush(), __local_list_pop_pending() reclaim the node from pending_list. __bpf_lru_list_shrink_inactive() reclaims the node from inactive list. Nodes from active list are reclaimed by __bpf_lru_list_shrink() or after __bpf_lru_list_rotate_active() demotes it to the inactive. Fixes: 3a08c2fd7634 ("bpf: LRU List") Reported-by: syzbot+c69a0a2c816716f1e0d5@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=c69a0a2c816716f1e0d5 Reported-by: syzbot+18b26edb69b2e19f3b33@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=18b26edb69b2e19f3b33 Link: https://lore.kernel.org/bpf/CAPPBnEYO4R+m+SpVc2gNj_x31R6fo1uJvj2bK2YS1P09GWT6kQ@mail.gmail.com/ Link: https://lore.kernel.org/bpf/CAPPBnEZmFA3ab8Uc=PEm0bdojZy=7T_F5_+eyZSHyZR3MBG4Vw@mail.gmail.com/ Link: https://lore.kernel.org/bpf/20251030030010.95352-1-dongml2@chinatelecom.cn/ Link: https://lore.kernel.org/bpf/20260119142120.28170-1-leon.hwang@linux.dev/ Signed-off-by: Mykyta Yatsenko Link: https://lore.kernel.org/r/20260607-lru_map_spin-v3-1-bcd9332e911b@meta.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_lru_list.c | 165 ++++++++++++++++++++++++++++------------------ kernel/bpf/bpf_lru_list.h | 25 +++++-- 2 files changed, 119 insertions(+), 71 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c index e7a2fc60523f..5ed7cb4b98c0 100644 --- a/kernel/bpf/bpf_lru_list.c +++ b/kernel/bpf/bpf_lru_list.c @@ -13,23 +13,8 @@ #define PERCPU_FREE_TARGET (4) #define PERCPU_NR_SCANS PERCPU_FREE_TARGET -/* Helpers to get the local list index */ -#define LOCAL_LIST_IDX(t) ((t) - BPF_LOCAL_LIST_T_OFFSET) -#define LOCAL_FREE_LIST_IDX LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_FREE) -#define LOCAL_PENDING_LIST_IDX LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_PENDING) #define IS_LOCAL_LIST_TYPE(t) ((t) >= BPF_LOCAL_LIST_T_OFFSET) -/* Local list helpers */ -static struct list_head *local_free_list(struct bpf_lru_locallist *loc_l) -{ - return &loc_l->lists[LOCAL_FREE_LIST_IDX]; -} - -static struct list_head *local_pending_list(struct bpf_lru_locallist *loc_l) -{ - return &loc_l->lists[LOCAL_PENDING_LIST_IDX]; -} - /* bpf_lru_node helpers */ static bool bpf_lru_node_is_ref(const struct bpf_lru_node *node) { @@ -72,6 +57,7 @@ static void __bpf_lru_node_move_to_free(struct bpf_lru_list *l, bpf_lru_list_count_dec(l, node->type); node->type = tgt_free_type; + WRITE_ONCE(node->pending_free, 0); list_move(&node->list, free_list); } @@ -87,6 +73,9 @@ static void __bpf_lru_node_move_in(struct bpf_lru_list *l, bpf_lru_list_count_inc(l, tgt_type); node->type = tgt_type; bpf_lru_node_clear_ref(node); + /* Reset pending_free only when moving to the free list */ + if (tgt_type == BPF_LRU_LIST_T_FREE) + WRITE_ONCE(node->pending_free, 0); list_move(&node->list, &l->lists[tgt_type]); } @@ -212,9 +201,11 @@ __bpf_lru_list_shrink_inactive(struct bpf_lru *lru, unsigned int i = 0; list_for_each_entry_safe_reverse(node, tmp_node, inactive, list) { - if (bpf_lru_node_is_ref(node)) { + if (bpf_lru_node_is_ref(node) && + !READ_ONCE(node->pending_free)) { __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE); - } else if (lru->del_from_htab(lru->del_arg, node)) { + } else if (READ_ONCE(node->pending_free) || + lru->del_from_htab(lru->del_arg, node)) { __bpf_lru_node_move_to_free(l, node, free_list, tgt_free_type); if (++nshrinked == tgt_nshrink) @@ -273,7 +264,8 @@ static unsigned int __bpf_lru_list_shrink(struct bpf_lru *lru, list_for_each_entry_safe_reverse(node, tmp_node, force_shrink_list, list) { - if (lru->del_from_htab(lru->del_arg, node)) { + if (READ_ONCE(node->pending_free) || + lru->del_from_htab(lru->del_arg, node)) { __bpf_lru_node_move_to_free(l, node, free_list, tgt_free_type); return 1; @@ -290,8 +282,10 @@ static void __local_list_flush(struct bpf_lru_list *l, struct bpf_lru_node *node, *tmp_node; list_for_each_entry_safe_reverse(node, tmp_node, - local_pending_list(loc_l), list) { - if (bpf_lru_node_is_ref(node)) + &loc_l->pending_list, list) { + if (READ_ONCE(node->pending_free)) + __bpf_lru_node_move_in(l, node, BPF_LRU_LIST_T_FREE); + else if (bpf_lru_node_is_ref(node)) __bpf_lru_node_move_in(l, node, BPF_LRU_LIST_T_ACTIVE); else __bpf_lru_node_move_in(l, node, @@ -307,9 +301,12 @@ static void bpf_lru_list_push_free(struct bpf_lru_list *l, if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type))) return; - raw_spin_lock_irqsave(&l->lock, flags); + if (raw_res_spin_lock_irqsave(&l->lock, flags)) { + WRITE_ONCE(node->pending_free, 1); + return; + } __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE); - raw_spin_unlock_irqrestore(&l->lock, flags); + raw_res_spin_unlock_irqrestore(&l->lock, flags); } static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru, @@ -318,8 +315,10 @@ static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru, struct bpf_lru_list *l = &lru->common_lru.lru_list; struct bpf_lru_node *node, *tmp_node; unsigned int nfree = 0; + LIST_HEAD(tmp_free); - raw_spin_lock(&l->lock); + if (raw_res_spin_lock(&l->lock)) + return; __local_list_flush(l, loc_l); @@ -327,7 +326,7 @@ static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru, list_for_each_entry_safe(node, tmp_node, &l->lists[BPF_LRU_LIST_T_FREE], list) { - __bpf_lru_node_move_to_free(l, node, local_free_list(loc_l), + __bpf_lru_node_move_to_free(l, node, &tmp_free, BPF_LRU_LOCAL_LIST_T_FREE); if (++nfree == lru->target_free) break; @@ -335,10 +334,19 @@ static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru, if (nfree < lru->target_free) __bpf_lru_list_shrink(lru, l, lru->target_free - nfree, - local_free_list(loc_l), + &tmp_free, BPF_LRU_LOCAL_LIST_T_FREE); - raw_spin_unlock(&l->lock); + raw_res_spin_unlock(&l->lock); + + /* + * Transfer the harvested nodes from the temporary list_head into + * the lockless per-CPU free llist. + */ + list_for_each_entry_safe(node, tmp_node, &tmp_free, list) { + list_del(&node->list); + llist_add(&node->llist, &loc_l->free_llist); + } } static void __local_list_add_pending(struct bpf_lru *lru, @@ -350,22 +358,21 @@ static void __local_list_add_pending(struct bpf_lru *lru, *(u32 *)((void *)node + lru->hash_offset) = hash; node->cpu = cpu; node->type = BPF_LRU_LOCAL_LIST_T_PENDING; + WRITE_ONCE(node->pending_free, 0); bpf_lru_node_clear_ref(node); - list_add(&node->list, local_pending_list(loc_l)); + list_add(&node->list, &loc_l->pending_list); } static struct bpf_lru_node * __local_list_pop_free(struct bpf_lru_locallist *loc_l) { - struct bpf_lru_node *node; + struct llist_node *llnode; - node = list_first_entry_or_null(local_free_list(loc_l), - struct bpf_lru_node, - list); - if (node) - list_del(&node->list); + llnode = llist_del_first(&loc_l->free_llist); + if (!llnode) + return NULL; - return node; + return container_of(llnode, struct bpf_lru_node, llist); } static struct bpf_lru_node * @@ -376,10 +383,10 @@ __local_list_pop_pending(struct bpf_lru *lru, struct bpf_lru_locallist *loc_l) ignore_ref: /* Get from the tail (i.e. older element) of the pending list. */ - list_for_each_entry_reverse(node, local_pending_list(loc_l), - list) { + list_for_each_entry_reverse(node, &loc_l->pending_list, list) { if ((!bpf_lru_node_is_ref(node) || force) && - lru->del_from_htab(lru->del_arg, node)) { + (READ_ONCE(node->pending_free) || + lru->del_from_htab(lru->del_arg, node))) { list_del(&node->list); return node; } @@ -404,7 +411,8 @@ static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru, l = per_cpu_ptr(lru->percpu_lru, cpu); - raw_spin_lock_irqsave(&l->lock, flags); + if (raw_res_spin_lock_irqsave(&l->lock, flags)) + return NULL; __bpf_lru_list_rotate(lru, l); @@ -420,7 +428,7 @@ static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru, __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE); } - raw_spin_unlock_irqrestore(&l->lock, flags); + raw_res_spin_unlock_irqrestore(&l->lock, flags); return node; } @@ -437,7 +445,8 @@ static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru, loc_l = per_cpu_ptr(clru->local_list, cpu); - raw_spin_lock_irqsave(&loc_l->lock, flags); + if (raw_res_spin_lock_irqsave(&loc_l->lock, flags)) + return NULL; node = __local_list_pop_free(loc_l); if (!node) { @@ -448,17 +457,22 @@ static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru, if (node) __local_list_add_pending(lru, loc_l, cpu, node, hash); - raw_spin_unlock_irqrestore(&loc_l->lock, flags); + raw_res_spin_unlock_irqrestore(&loc_l->lock, flags); if (node) return node; - /* No free nodes found from the local free list and + /* + * No free nodes found from the local free list and * the global LRU list. * * Steal from the local free/pending list of the * current CPU and remote CPU in RR. It starts * with the loc_l->next_steal CPU. + * + * Acquire the victim's lock before touching either list. On + * acquisition failure (rqspinlock AA or timeout) skip the victim + * and try the next CPU. */ first_steal = loc_l->next_steal; @@ -466,24 +480,36 @@ static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru, do { steal_loc_l = per_cpu_ptr(clru->local_list, steal); - raw_spin_lock_irqsave(&steal_loc_l->lock, flags); - - node = __local_list_pop_free(steal_loc_l); - if (!node) - node = __local_list_pop_pending(lru, steal_loc_l); - - raw_spin_unlock_irqrestore(&steal_loc_l->lock, flags); + if (!raw_res_spin_lock_irqsave(&steal_loc_l->lock, flags)) { + node = __local_list_pop_free(steal_loc_l); + if (!node) + node = __local_list_pop_pending(lru, steal_loc_l); + raw_res_spin_unlock_irqrestore(&steal_loc_l->lock, flags); + } steal = cpumask_next_wrap(steal, cpu_possible_mask); } while (!node && steal != first_steal); loc_l->next_steal = steal; - if (node) { - raw_spin_lock_irqsave(&loc_l->lock, flags); - __local_list_add_pending(lru, loc_l, cpu, node, hash); - raw_spin_unlock_irqrestore(&loc_l->lock, flags); + if (!node) + return NULL; + + if (raw_res_spin_lock_irqsave(&loc_l->lock, flags)) { + /* + * The local pending lock can't be acquired (rqspinlock AA + * or timeout). Return the stolen node to the per-CPU + * free_llist instead of orphaning it; the next pop_free on + * this CPU will pick it up. + */ + node->type = BPF_LRU_LOCAL_LIST_T_FREE; + bpf_lru_node_clear_ref(node); + WRITE_ONCE(node->pending_free, 0); + llist_add(&node->llist, &loc_l->free_llist); + return NULL; } + __local_list_add_pending(lru, loc_l, cpu, node, hash); + raw_res_spin_unlock_irqrestore(&loc_l->lock, flags); return node; } @@ -511,18 +537,24 @@ static void bpf_common_lru_push_free(struct bpf_lru *lru, loc_l = per_cpu_ptr(lru->common_lru.local_list, node->cpu); - raw_spin_lock_irqsave(&loc_l->lock, flags); + if (raw_res_spin_lock_irqsave(&loc_l->lock, flags)) { + WRITE_ONCE(node->pending_free, 1); + return; + } if (unlikely(node->type != BPF_LRU_LOCAL_LIST_T_PENDING)) { - raw_spin_unlock_irqrestore(&loc_l->lock, flags); + raw_res_spin_unlock_irqrestore(&loc_l->lock, + flags); goto check_lru_list; } node->type = BPF_LRU_LOCAL_LIST_T_FREE; bpf_lru_node_clear_ref(node); - list_move(&node->list, local_free_list(loc_l)); + list_del(&node->list); + + raw_res_spin_unlock_irqrestore(&loc_l->lock, flags); - raw_spin_unlock_irqrestore(&loc_l->lock, flags); + llist_add(&node->llist, &loc_l->free_llist); return; } @@ -538,11 +570,14 @@ static void bpf_percpu_lru_push_free(struct bpf_lru *lru, l = per_cpu_ptr(lru->percpu_lru, node->cpu); - raw_spin_lock_irqsave(&l->lock, flags); + if (raw_res_spin_lock_irqsave(&l->lock, flags)) { + WRITE_ONCE(node->pending_free, 1); + return; + } __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE); - raw_spin_unlock_irqrestore(&l->lock, flags); + raw_res_spin_unlock_irqrestore(&l->lock, flags); } void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node) @@ -565,6 +600,7 @@ static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf, node = (struct bpf_lru_node *)(buf + node_offset); node->type = BPF_LRU_LIST_T_FREE; + node->pending_free = 0; bpf_lru_node_clear_ref(node); list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]); buf += elem_size; @@ -594,6 +630,7 @@ again: node = (struct bpf_lru_node *)(buf + node_offset); node->cpu = cpu; node->type = BPF_LRU_LIST_T_FREE; + node->pending_free = 0; bpf_lru_node_clear_ref(node); list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]); i++; @@ -618,14 +655,12 @@ void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, static void bpf_lru_locallist_init(struct bpf_lru_locallist *loc_l, int cpu) { - int i; - - for (i = 0; i < NR_BPF_LRU_LOCAL_LIST_T; i++) - INIT_LIST_HEAD(&loc_l->lists[i]); + INIT_LIST_HEAD(&loc_l->pending_list); + init_llist_head(&loc_l->free_llist); loc_l->next_steal = cpu; - raw_spin_lock_init(&loc_l->lock); + raw_res_spin_lock_init(&loc_l->lock); } static void bpf_lru_list_init(struct bpf_lru_list *l) @@ -640,7 +675,7 @@ static void bpf_lru_list_init(struct bpf_lru_list *l) l->next_inactive_rotation = &l->lists[BPF_LRU_LIST_T_INACTIVE]; - raw_spin_lock_init(&l->lock); + raw_res_spin_lock_init(&l->lock); } int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset, diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h index fe2661a58ea9..8d0ee61622af 100644 --- a/kernel/bpf/bpf_lru_list.h +++ b/kernel/bpf/bpf_lru_list.h @@ -6,11 +6,11 @@ #include #include -#include +#include +#include #define NR_BPF_LRU_LIST_T (3) #define NR_BPF_LRU_LIST_COUNT (2) -#define NR_BPF_LRU_LOCAL_LIST_T (2) #define BPF_LOCAL_LIST_T_OFFSET NR_BPF_LRU_LIST_T enum bpf_lru_list_type { @@ -22,10 +22,22 @@ enum bpf_lru_list_type { }; struct bpf_lru_node { - struct list_head list; + /* + * A node is in at most one list at a time. The free path on the + * per-CPU locallist uses an llist, so share storage via a union. + */ + union { + struct list_head list; + struct llist_node llist; + }; u16 cpu; u8 type; u8 ref; + /* + * Marks nodes whose *_push_free() lock acquire failed; reclaimed + * by flush/shrink which honor the flag instead of del_from_htab(). + */ + u8 pending_free; }; struct bpf_lru_list { @@ -34,13 +46,14 @@ struct bpf_lru_list { /* The next inactive list rotation starts from here */ struct list_head *next_inactive_rotation; - raw_spinlock_t lock ____cacheline_aligned_in_smp; + rqspinlock_t lock ____cacheline_aligned_in_smp; }; struct bpf_lru_locallist { - struct list_head lists[NR_BPF_LRU_LOCAL_LIST_T]; + struct list_head pending_list; + struct llist_head free_llist; u16 next_steal; - raw_spinlock_t lock; + rqspinlock_t lock; }; struct bpf_common_lru { -- cgit v1.2.3 From 53040a81ae57cdca8af8ac36fe4e661730cf7c6b Mon Sep 17 00:00:00 2001 From: Nuoqi Gui Date: Sun, 7 Jun 2026 21:24:13 +0800 Subject: bpf: Keep dynamic inner array lookups nullable An ARRAY_OF_MAPS can use an array created with BPF_F_INNER_MAP as its inner map template. A concrete inner array with a different max_entries value can then replace the template. After a successful outer map lookup, the verifier represents the resulting map pointer using the inner map template. Const-key lookup nullness elision consequently uses the template max_entries even though the runtime helper uses the concrete inner map max_entries. Do not elide lookup result nullness for maps marked with BPF_F_INNER_MAP, because the template max_entries does not prove that the key is in bounds for the concrete runtime map. Fixes: d2102f2f5d75 ("bpf: verifier: Support eliding map lookup nullness") Signed-off-by: Nuoqi Gui Acked-by: Eduard Zingerman Acked-by: Jiri Olsa Cc: stable@vger.kernel.org Link: https://lore.kernel.org/bpf/20260607-f01-v2-v2-1-da48453146e8@mails.tsinghua.edu.cn Signed-off-by: Kumar Kartikeya Dwivedi --- kernel/bpf/verifier.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0c1cf506c219..ed7ba0e6a9ce 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8179,7 +8179,7 @@ static int get_constant_map_key(struct bpf_verifier_env *env, return 0; } -static bool can_elide_value_nullness(enum bpf_map_type type); +static bool can_elide_value_nullness(const struct bpf_map *map); static int check_func_arg(struct bpf_verifier_env *env, u32 arg, struct bpf_call_arg_meta *meta, @@ -8298,7 +8298,7 @@ skip_type_check: err = check_helper_mem_access(env, reg, argno_from_reg(regno), key_size, BPF_READ, false, NULL); if (err) return err; - if (can_elide_value_nullness(meta->map.ptr->map_type)) { + if (can_elide_value_nullness(meta->map.ptr)) { err = get_constant_map_key(env, reg, key_size, &meta->const_map_key); if (err < 0) { meta->const_map_key = -1; @@ -10068,13 +10068,16 @@ static void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno state->callback_subprogno == subprogno); } -/* Returns whether or not the given map type can potentially elide +/* Returns whether or not the given map can potentially elide * lookup return value nullness check. This is possible if the key * is statically known. */ -static bool can_elide_value_nullness(enum bpf_map_type type) +static bool can_elide_value_nullness(const struct bpf_map *map) { - switch (type) { + if (map->map_flags & BPF_F_INNER_MAP) + return false; + + switch (map->map_type) { case BPF_MAP_TYPE_ARRAY: case BPF_MAP_TYPE_PERCPU_ARRAY: return true; @@ -10414,7 +10417,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn } if (func_id == BPF_FUNC_map_lookup_elem && - can_elide_value_nullness(meta.map.ptr->map_type) && + can_elide_value_nullness(meta.map.ptr) && meta.const_map_key >= 0 && meta.const_map_key < meta.map.ptr->max_entries) ret_flag &= ~PTR_MAYBE_NULL; -- cgit v1.2.3 From 50dff00615522f3ec03449680ca23beb4cfc549c Mon Sep 17 00:00:00 2001 From: Sechang Lim Date: Mon, 8 Jun 2026 05:00:00 +0000 Subject: bpf: Fix NULL pointer dereference in bpf_task_from_vpid() bpf_task_from_vpid() looks up a task in the pid namespace of the current task, via find_task_by_vpid(): find_task_by_vpid(vpid) find_task_by_pid_ns(vpid, task_active_pid_ns(current)) find_pid_ns(nr, ns) -> idr_find(&ns->idr, nr) cgroup_skb programs run in softirq, which may interrupt a task that is itself in do_exit(). Once that task has passed exit_notify() -> release_task() -> __unhash_process(), its thread_pid is cleared, so task_active_pid_ns(current) returns NULL and find_pid_ns() dereferences &NULL->idr: BUG: kernel NULL pointer dereference, address: 0000000000000050 RIP: 0010:idr_find+0x11/0x30 lib/idr.c:176 Call Trace: find_pid_ns kernel/pid.c:370 [inline] find_task_by_pid_ns+0x3b/0xe0 kernel/pid.c:485 bpf_task_from_vpid+0x5b/0x200 kernel/bpf/helpers.c:2916 bpf_prog_run_array_cg+0x17e/0x530 kernel/bpf/cgroup.c:81 __cgroup_bpf_run_filter_skb+0x12b/0x250 kernel/bpf/cgroup.c:1612 sk_filter_trim_cap+0x1dc/0x4c0 net/core/filter.c:148 tcp_v4_rcv+0x18d1/0x2200 net/ipv4/tcp_ipv4.c:2223 do_exit+0xa63/0x1270 kernel/exit.c:1010 get_signal+0x141c/0x1530 kernel/signal.c:3037 Bail out when current has no pid namespace. Fixes: 675c3596ff32 ("bpf: Add bpf_task_from_vpid() kfunc") Signed-off-by: Sechang Lim Acked-by: Leon Hwang Link: https://lore.kernel.org/bpf/20260608050001.2545245-1-rhkrqnwk98@gmail.com Signed-off-by: Kumar Kartikeya Dwivedi --- kernel/bpf/helpers.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 8ba2b8965caf..8e196c9b7c50 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -3009,11 +3009,13 @@ __bpf_kfunc struct task_struct *bpf_task_from_vpid(s32 vpid) { struct task_struct *p; - rcu_read_lock(); + guard(rcu)(); + if (!task_active_pid_ns(current)) + return NULL; + p = find_task_by_vpid(vpid); if (p) p = bpf_task_acquire(p); - rcu_read_unlock(); return p; } -- cgit v1.2.3 From b9452b594fd3aecbfd4aa0a6a1f741330a37dab7 Mon Sep 17 00:00:00 2001 From: Paul Moses Date: Fri, 5 Jun 2026 23:43:09 +0000 Subject: bpf: Validate BTF repeated field counts before expansion btf_parse_struct_metas() walks user-supplied BTF during BPF_BTF_LOAD, and btf_repeat_fields() expands repeatable fields from array elements into the fixed BTF_FIELDS_MAX scratch array used by btf_parse_fields(). The remaining-capacity check performs the expanded field count calculation in u32. A malformed BTF can wrap that calculation, causing the check to pass even when the expanded field count exceeds the scratch array capacity. The following memcpy() can then write past the end of the array. Use checked addition and multiplication before copying repeated fields and reject impossible counts. Fixes: 797d73ee232d ("bpf: Check the remaining info_cnt before repeating btf fields") Cc: stable@vger.kernel.org Signed-off-by: Paul Moses Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20260605234301.1109063-1-p@1g4.org Signed-off-by: Kumar Kartikeya Dwivedi --- kernel/bpf/btf.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index ef4402274786..15ae7c43f594 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3667,7 +3667,7 @@ end: static int btf_repeat_fields(struct btf_field_info *info, int info_cnt, u32 field_cnt, u32 repeat_cnt, u32 elem_size) { - u32 i, j; + u32 i, j, total_cnt, total_repeats; u32 cur; /* Ensure not repeating fields that should not be repeated. */ @@ -3685,10 +3685,9 @@ static int btf_repeat_fields(struct btf_field_info *info, int info_cnt, } } - /* The type of struct size or variable size is u32, - * so the multiplication will not overflow. - */ - if (field_cnt * (repeat_cnt + 1) > info_cnt) + if (check_add_overflow(repeat_cnt, 1, &total_repeats) || + check_mul_overflow(field_cnt, total_repeats, &total_cnt) || + total_cnt > (u32)info_cnt) return -E2BIG; cur = field_cnt; -- cgit v1.2.3 From fa75b7c85b0d2b6ab1c3ee0f06d35e2b98078c45 Mon Sep 17 00:00:00 2001 From: Nuoqi Gui Date: Tue, 9 Jun 2026 22:43:50 +0800 Subject: bpf: Enforce write checks for BTF pointer helper access check_mem_reg() verifies both read and write access for global subprogram memory arguments. When the caller register is PTR_TO_BTF_ID, check_helper_mem_access() currently forwards the access to check_ptr_to_btf_access() as BPF_READ regardless of the requested access type. This lets a BTF-backed kernel object field pointer pass the caller-side writable memory check for a global subprogram argument. The callee is then validated with a generic writable PTR_TO_MEM argument and can store through it, even though an equivalent direct BTF field store is rejected with "only read is supported". Forward the requested access type to check_ptr_to_btf_access(). This enforces existing BTF write restrictions for global subprogram memory arguments as well. Fixes: 3e30be4288b3 ("bpf: Allow helpers access trusted PTR_TO_BTF_ID.") Signed-off-by: Nuoqi Gui Link: https://lore.kernel.org/bpf/20260609-f01-04-btf-writable-arg-v1-1-f449cd970669@mails.tsinghua.edu.cn Signed-off-by: Kumar Kartikeya Dwivedi --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ed7ba0e6a9ce..cdff3e6eb96e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6777,7 +6777,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, struct bpf_reg_ zero_size_allowed, access_type, meta); case PTR_TO_BTF_ID: return check_ptr_to_btf_access(env, regs, reg, argno, 0, - access_size, BPF_READ, -1); + access_size, access_type, -1); case PTR_TO_CTX: /* Only permit reading or writing syscall context using helper calls. */ if (is_var_ctx_off_allowed(env->prog)) { -- cgit v1.2.3 From 2f884d371fafea137afea504d49ee4a7c8d7985b Mon Sep 17 00:00:00 2001 From: Vlad Poenaru Date: Tue, 9 Jun 2026 06:55:57 -0700 Subject: bpf: Allow LPM map access from sleepable BPF programs trie_lookup_elem() annotates its rcu_dereference_check() walks with only rcu_read_lock_bh_held(). Because rcu_dereference_check(p, c) resolves to "c || rcu_read_lock_held()", this passes for XDP/NAPI and classic RCU readers but fails for sleepable BPF programs, which enter via __bpf_prog_enter_sleepable() and hold only rcu_read_lock_trace(). trie_update_elem() and trie_delete_elem() have the same problem in a different form: they walk the trie with plain rcu_dereference(), which asserts rcu_read_lock_held() unconditionally. Both are reachable from sleepable BPF programs via the bpf_map_update_elem / bpf_map_delete_elem helpers, and from the syscall path under classic rcu_read_lock(). In the writer paths the trie is actually protected by trie->lock (an rqspinlock taken across the walk); we never relied on the RCU read-side lock to keep nodes alive there. A sleepable LSM hook that ends up touching an LPM trie therefore triggers lockdep on debug kernels: ============================= WARNING: suspicious RCU usage 7.1.0-... Tainted: G E ----------------------------- kernel/bpf/lpm_trie.c:249 suspicious rcu_dereference_check() usage! 1 lock held by net_tests/540: #0: (rcu_tasks_trace_srcu_struct){....}-{0:0}, at: __bpf_prog_enter_sleepable+0x26/0x280 Call Trace: dump_stack_lvl lockdep_rcu_suspicious trie_lookup_elem bpf_prog_..._enforce_security_socket_connect bpf_trampoline_... security_socket_connect __sys_connect do_syscall_64 This is lockdep-only -- no UAF, since Tasks Trace RCU does serialize against the trie's reclaim path -- but it spams the console once per distinct callsite on every debug kernel running a sleepable BPF LSM that touches an LPM trie, which is increasingly common. For the lookup path, switch the rcu_dereference_check() annotation from rcu_read_lock_bh_held() to bpf_rcu_lock_held(), which accepts all three contexts (classic, BH, Tasks Trace). Other map types already follow this convention. For trie_update_elem() and trie_delete_elem(), annotate the walks as rcu_dereference_protected(*p, 1) -- matching trie_free() in the same file -- since trie->lock is held across the walk. rqspinlock has no lockdep_map, so the predicate degenerates to '1' rather than lockdep_is_held(&trie->lock); the protection is real but not machine-verifiable. trie_get_next_key() also uses bare rcu_dereference() but is reachable only from the BPF syscall, which holds classic rcu_read_lock() before dispatching, so it is left untouched. Fixes: 694cea395fde ("bpf: Allow RCU-protected lookups to happen from bh context") Cc: stable@vger.kernel.org Signed-off-by: Vlad Poenaru Reviewed-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260609135558.193287-2-vlad.wing@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/lpm_trie.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 0f57608b385d..4d6f25db9ba1 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -246,7 +246,7 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key) /* Start walking the trie from the root node ... */ - for (node = rcu_dereference_check(trie->root, rcu_read_lock_bh_held()); + for (node = rcu_dereference_check(trie->root, bpf_rcu_lock_held()); node;) { unsigned int next_bit; size_t matchlen; @@ -280,7 +280,7 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key) */ next_bit = extract_bit(key->data, node->prefixlen); node = rcu_dereference_check(node->child[next_bit], - rcu_read_lock_bh_held()); + bpf_rcu_lock_held()); } if (!found) @@ -359,7 +359,7 @@ static long trie_update_elem(struct bpf_map *map, */ slot = &trie->root; - while ((node = rcu_dereference(*slot))) { + while ((node = rcu_dereference_protected(*slot, 1))) { matchlen = longest_prefix_match(trie, node, key); if (node->prefixlen != matchlen || @@ -482,7 +482,7 @@ static long trie_delete_elem(struct bpf_map *map, void *_key) trim = &trie->root; trim2 = trim; parent = NULL; - while ((node = rcu_dereference(*trim))) { + while ((node = rcu_dereference_protected(*trim, 1))) { matchlen = longest_prefix_match(trie, node, key); if (node->prefixlen != matchlen || -- cgit v1.2.3 From a3d76e27bbbf91d1025ce99eb55068ae0aa14322 Mon Sep 17 00:00:00 2001 From: Vlad Poenaru Date: Tue, 9 Jun 2026 06:55:58 -0700 Subject: bpf: Allow sleepable programs to use LPM trie maps directly The previous change relaxed the rcu_dereference annotations in lpm_trie.c so the trie walks no longer trip lockdep when reached from a sleepable BPF program holding only rcu_read_lock_trace(). By itself that only helps tries reached as the inner map of a map-of-maps, or from the classic-RCU syscall path: a sleepable program that references an LPM trie directly is still rejected at load time by check_map_prog_compatibility(), whose sleepable whitelist omits BPF_MAP_TYPE_LPM_TRIE: Sleepable programs can only use array, hash, ringbuf and local storage maps LPM trie nodes are allocated from a bpf_mem_alloc (trie->ma) and freed with bpf_mem_cache_free_rcu(), which chains a regular RCU grace period into a Tasks Trace grace period before the node -- and the value embedded in it that trie_lookup_elem() returns to the program -- is released. That is the same reclaim discipline BPF_MAP_TYPE_HASH relies on for sleepable access, so a value handed to a sleepable reader cannot be freed while the program is still running under rcu_read_lock_trace(). The writer paths take trie->lock across the walk and never relied on the RCU read-side lock to keep nodes alive. Add BPF_MAP_TYPE_LPM_TRIE to the sleepable map whitelist so these programs can use LPM tries directly. Signed-off-by: Vlad Poenaru Reviewed-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260609135558.193287-3-vlad.wing@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index cdff3e6eb96e..954b85609f32 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -17737,6 +17737,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, case BPF_MAP_TYPE_PERCPU_HASH: case BPF_MAP_TYPE_PERCPU_ARRAY: case BPF_MAP_TYPE_LRU_PERCPU_HASH: + case BPF_MAP_TYPE_LPM_TRIE: case BPF_MAP_TYPE_ARRAY_OF_MAPS: case BPF_MAP_TYPE_HASH_OF_MAPS: case BPF_MAP_TYPE_RINGBUF: -- cgit v1.2.3 From 94c8d1c21be40a845357854f98ec07e21bb14bc9 Mon Sep 17 00:00:00 2001 From: Justin Suess Date: Tue, 9 Jun 2026 22:25:43 +0200 Subject: bpf: Reject bpf_obj_drop() from tracing progs bpf_obj_drop() runs bpf_obj_free_fields() synchronously for program-allocated objects. When such an object contains NMI unsafe fields, tracing programs that can run from arbitrary instrumented context can reach that destruction from unsafe contexts, including NMI. NMI is likely one instance of this problem, and other instances would include possible unsafe reentrancy. Deferring bpf_obj_drop() is not appealing either: it would add delayed-free machinery to a release operation that otherwise has straightforward synchronous ownership semantics. Reject bpf_obj_drop() and bpf_percpu_obj_drop() from tracing programs that may run from unsafe contexts unless every field in the object's BTF record is explicitly NMI safe. Do not reject sleepable BPF_PROG_TYPE_TRACING programs, since they are not the arbitrary/NMI contexts that motivate the restriction. Note that while bpf_rb_root and bpf_list_head would be NMI safe on their own to free, the objects recursively held by them may not be; be conservative and just mark them as not NMI safe for now. Use a whitelist for the NMI-safe field set instead of listing only known NMI unsafe fields. Locks, async fields, unreferenced kptrs, and refcounts are known to be NMI safe because their destruction is either a no-op, simple state reset, or async cancellation. Referenced kptrs, percpu referenced kptrs, uptrs, graph roots, graph nodes, and any future field type are rejected until audited for arbitrary tracing and NMI contexts. This is less susceptible to future changes in fields that were previously safe by exclusion, and to new fields being added without updating this check. Convert the existing recursive local-object drop success case to a syscall program in the same commit, since this verifier change makes the old tracing program form invalid. The test still exercises bpf_obj_drop() releasing a referenced task kptr from a safe program type. Fixes: ac9f06050a35 ("bpf: Introduce bpf_obj_drop") Signed-off-by: Justin Suess Co-developed-by: Kumar Kartikeya Dwivedi Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20260609202548.3571690-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 29 +++++++++++++++ kernel/bpf/verifier.c | 17 +++++++++ .../testing/selftests/bpf/prog_tests/task_kfunc.c | 42 +++++++++++++++++++++- .../selftests/bpf/progs/task_kfunc_success.c | 13 ++++--- 4 files changed, 93 insertions(+), 8 deletions(-) (limited to 'kernel/bpf') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 62bba7a4876f..0654d2ffadc1 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -492,6 +492,35 @@ static inline bool btf_record_has_field(const struct btf_record *rec, enum btf_f return rec->field_mask & type; } +static inline bool btf_field_is_nmi_safe(enum btf_field_type type) +{ + switch (type) { + case BPF_SPIN_LOCK: + case BPF_RES_SPIN_LOCK: + case BPF_TIMER: + case BPF_WORKQUEUE: + case BPF_TASK_WORK: + case BPF_KPTR_UNREF: + case BPF_REFCOUNT: + return true; + default: + return false; + } +} + +static inline bool btf_record_has_nmi_unsafe_fields(const struct btf_record *rec) +{ + int i; + + if (IS_ERR_OR_NULL(rec)) + return false; + for (i = 0; i < rec->cnt; i++) { + if (!btf_field_is_nmi_safe(rec->fields[i].type)) + return true; + } + return false; +} + static inline void bpf_obj_init(const struct btf_record *rec, void *obj) { int i; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 954b85609f32..eb46a81a8c51 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -205,6 +205,7 @@ static int release_reference_nomark(struct bpf_verifier_state *state, int id); static int release_reference(struct bpf_verifier_env *env, int id); static void invalidate_non_owning_refs(struct bpf_verifier_env *env); static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env); +static bool is_tracing_prog_type(enum bpf_prog_type type); static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg); static bool is_trusted_reg(struct bpf_verifier_env *env, const struct bpf_reg_state *reg); @@ -12881,6 +12882,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx_p) { bool sleepable, rcu_lock, rcu_unlock, preempt_disable, preempt_enable; + enum bpf_prog_type prog_type = resolve_prog_type(env->prog); struct bpf_reg_state *regs = cur_regs(env); const char *func_name, *ptr_type_name; const struct btf_type *t, *ptr_type; @@ -12957,6 +12959,21 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (err < 0) return err; + if ((is_bpf_obj_drop_kfunc(meta.func_id) || + is_bpf_percpu_obj_drop_kfunc(meta.func_id)) && (is_tracing_prog_type(prog_type) || + /* is_tracing_prog_type() for now doesn't cover non-iterator tracing progs. */ + (prog_type == BPF_PROG_TYPE_TRACING && env->prog->expected_attach_type != BPF_TRACE_ITER + && !env->prog->sleepable))) { + struct btf_struct_meta *struct_meta; + + struct_meta = btf_find_struct_meta(meta.arg_btf, meta.arg_btf_id); + if (struct_meta && btf_record_has_nmi_unsafe_fields(struct_meta->record)) { + verbose(env, "%s cannot be used in tracing programs on types with NMI unsafe fields\n", + func_name); + return -EINVAL; + } + } + if (is_bpf_rbtree_add_kfunc(meta.func_id)) { err = push_callback_call(env, insn, insn_idx, meta.subprogno, set_rbtree_add_callback_state); diff --git a/tools/testing/selftests/bpf/prog_tests/task_kfunc.c b/tools/testing/selftests/bpf/prog_tests/task_kfunc.c index 83b90335967a..e6e95c1416e6 100644 --- a/tools/testing/selftests/bpf/prog_tests/task_kfunc.c +++ b/tools/testing/selftests/bpf/prog_tests/task_kfunc.c @@ -68,6 +68,36 @@ cleanup: task_kfunc_success__destroy(skel); } +static void run_syscall_success_test(const char *prog_name) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts); + struct task_kfunc_success *skel; + struct bpf_program *prog; + int err; + + skel = open_load_task_kfunc_skel(); + if (!ASSERT_OK_PTR(skel, "open_load_skel")) + return; + + if (!ASSERT_OK(skel->bss->err, "pre_run_err")) + goto cleanup; + + prog = bpf_object__find_program_by_name(skel->obj, prog_name); + if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name")) + goto cleanup; + + err = bpf_prog_test_run_opts(bpf_program__fd(prog), &opts); + if (!ASSERT_OK(err, "bpf_prog_test_run_opts")) + goto cleanup; + if (!ASSERT_EQ(opts.retval, 0, "retval")) + goto cleanup; + + ASSERT_OK(skel->bss->err, "post_run_err"); + +cleanup: + task_kfunc_success__destroy(skel); +} + static int run_vpid_test(void *prog_name) { struct task_kfunc_success *skel; @@ -140,7 +170,6 @@ static const char * const success_tests[] = { "test_task_acquire_release_argument", "test_task_acquire_release_current", "test_task_acquire_leave_in_map", - "test_task_xchg_release", "test_task_map_acquire_release", "test_task_current_acquire_release", "test_task_from_pid_arg", @@ -151,6 +180,10 @@ static const char * const success_tests[] = { "test_task_kfunc_flavor_relo_not_found", }; +static const char * const syscall_success_tests[] = { + "test_task_xchg_release", +}; + static const char * const vpid_success_tests[] = { "test_task_from_vpid_current", "test_task_from_vpid_invalid", @@ -167,6 +200,13 @@ void test_task_kfunc(void) run_success_test(success_tests[i]); } + for (i = 0; i < ARRAY_SIZE(syscall_success_tests); i++) { + if (!test__start_subtest(syscall_success_tests[i])) + continue; + + run_syscall_success_test(syscall_success_tests[i]); + } + for (i = 0; i < ARRAY_SIZE(vpid_success_tests); i++) { if (!test__start_subtest(vpid_success_tests[i])) continue; diff --git a/tools/testing/selftests/bpf/progs/task_kfunc_success.c b/tools/testing/selftests/bpf/progs/task_kfunc_success.c index 5fb4fc19d26a..d63a79ee33dc 100644 --- a/tools/testing/selftests/bpf/progs/task_kfunc_success.c +++ b/tools/testing/selftests/bpf/progs/task_kfunc_success.c @@ -140,17 +140,17 @@ int BPF_PROG(test_task_acquire_leave_in_map, struct task_struct *task, u64 clone return 0; } -SEC("tp_btf/task_newtask") -int BPF_PROG(test_task_xchg_release, struct task_struct *task, u64 clone_flags) +SEC("syscall") +int test_task_xchg_release(const void *ctx) { - struct task_struct *kptr, *acquired; + struct task_struct *task, *kptr, *acquired; struct __tasks_kfunc_map_value *v, *local; int refcnt, refcnt_after_drop; long status; - if (!is_test_kfunc_task()) - return 0; + (void)ctx; + task = bpf_get_current_task_btf(); status = tasks_kfunc_map_insert(task); if (status) { err = 1; @@ -191,7 +191,7 @@ int BPF_PROG(test_task_xchg_release, struct task_struct *task, u64 clone_flags) return 0; } - /* Stash a copy into local kptr and check if it is released recursively */ + /* Stash a copy into local kptr and check if it is released recursively. */ acquired = bpf_task_acquire(kptr); if (!acquired) { err = 7; @@ -220,7 +220,6 @@ int BPF_PROG(test_task_xchg_release, struct task_struct *task, u64 clone_flags) } bpf_task_release(kptr); - return 0; } -- cgit v1.2.3 From a3a81d247651218e47153f2d2afd7aee236726fd Mon Sep 17 00:00:00 2001 From: Justin Suess Date: Tue, 9 Jun 2026 22:25:44 +0200 Subject: bpf: Cancel special fields on map value recycle Map update and delete paths currently call bpf_obj_free_fields() when a value is being replaced or recycled. That makes field destruction depend on the context of the update/delete operation. For tracing programs this can include NMI context, where referenced kptr destructors, uptr unpinning, and graph root destruction are not generally safe. Introduce bpf_obj_cancel_fields() for the reusable-value path. It only performs NMI-safe cleanup for timer, workqueue, and task_work fields. Fields that need full destruction are left attached to the recycled value and are destroyed by the final cleanup path instead. Switch array and hashtab update/delete/recycle paths to this cancel helper. Keep bpf_obj_free_fields() for final map destruction and for bpf_mem_alloc destructors. Preallocated hashtabs do not have allocator destructors, so teardown continues to walk the normal and extra elements and fully destroy their fields. This deliberately relaxes the eager-free semantics of map update/delete for special fields. Programs that relied on a recycled map slot becoming empty immediately after update/delete were relying on behavior that cannot be implemented safely from every BPF execution context without offloading arbitrary destructors. There is a chance this change breaks programs making assumptions regarding the eager freeing of fields. If so, we can relax semantics to cancellation only when irqs_disabled() is true in the future. However, theoretically, map values that get reused eagerly already have weaker guarantees as parallel users can recreate freed fields before the new element becomes visible again. Fixes: 14a324f6a67e ("bpf: Wire up freeing of referenced kptr") Signed-off-by: Justin Suess Co-developed-by: Kumar Kartikeya Dwivedi Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20260609202548.3571690-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + kernel/bpf/arraymap.c | 8 +-- kernel/bpf/hashtab.c | 32 +++++----- kernel/bpf/syscall.c | 5 ++ .../testing/selftests/bpf/prog_tests/htab_update.c | 4 +- .../testing/selftests/bpf/prog_tests/linked_list.c | 33 +++++----- tools/testing/selftests/bpf/prog_tests/map_kptr.c | 10 +-- .../selftests/bpf/prog_tests/refcounted_kptr.c | 8 ++- tools/testing/selftests/bpf/progs/htab_update.c | 4 +- tools/testing/selftests/bpf/progs/linked_list.c | 71 ++++++++++++++++++++++ .../testing/selftests/bpf/progs/refcounted_kptr.c | 20 +++++- 11 files changed, 146 insertions(+), 50 deletions(-) (limited to 'kernel/bpf') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 0654d2ffadc1..56f5da2b437f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2717,6 +2717,7 @@ bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *r void bpf_obj_free_timer(const struct btf_record *rec, void *obj); void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj); void bpf_obj_free_task_work(const struct btf_record *rec, void *obj); +void bpf_obj_cancel_fields(struct bpf_map *map, void *obj); void bpf_obj_free_fields(const struct btf_record *rec, void *obj); void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu); diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index e6271a2bf6d6..248b4818178c 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -384,7 +384,7 @@ static long array_map_update_elem(struct bpf_map *map, void *key, void *value, if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { val = this_cpu_ptr(array->pptrs[index & array->index_mask]); copy_map_value(map, val, value); - bpf_obj_free_fields(array->map.record, val); + bpf_obj_cancel_fields(map, val); } else { val = array->value + (u64)array->elem_size * (index & array->index_mask); @@ -392,7 +392,7 @@ static long array_map_update_elem(struct bpf_map *map, void *key, void *value, copy_map_value_locked(map, val, value, false); else copy_map_value(map, val, value); - bpf_obj_free_fields(array->map.record, val); + bpf_obj_cancel_fields(map, val); } return 0; } @@ -432,14 +432,14 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, cpu = map_flags >> 32; ptr = per_cpu_ptr(pptr, cpu); copy_map_value(map, ptr, value); - bpf_obj_free_fields(array->map.record, ptr); + bpf_obj_cancel_fields(map, ptr); goto unlock; } for_each_possible_cpu(cpu) { ptr = per_cpu_ptr(pptr, cpu); val = (map_flags & BPF_F_ALL_CPUS) ? value : value + size * cpu; copy_map_value(map, ptr, val); - bpf_obj_free_fields(array->map.record, ptr); + bpf_obj_cancel_fields(map, ptr); } unlock: rcu_read_unlock(); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index b4366cad3cfa..9f394e1aa2e8 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -243,6 +243,10 @@ static void htab_free_prealloced_fields(struct bpf_htab *htab) if (IS_ERR_OR_NULL(htab->map.record)) return; + /* + * Preallocated maps do not have a bpf_mem_alloc destructor, so fully + * destroy every element, including the extra elements. + */ if (htab_has_extra_elems(htab)) num_entries += num_possible_cpus(); for (i = 0; i < num_entries; i++) { @@ -833,8 +837,8 @@ static int htab_lru_map_gen_lookup(struct bpf_map *map, return insn - insn_buf; } -static void check_and_free_fields(struct bpf_htab *htab, - struct htab_elem *elem) +static void check_and_cancel_fields(struct bpf_htab *htab, + struct htab_elem *elem) { if (IS_ERR_OR_NULL(htab->map.record)) return; @@ -844,11 +848,11 @@ static void check_and_free_fields(struct bpf_htab *htab, int cpu; for_each_possible_cpu(cpu) - bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu)); + bpf_obj_cancel_fields(&htab->map, per_cpu_ptr(pptr, cpu)); } else { void *map_value = htab_elem_value(elem, htab->map.key_size); - bpf_obj_free_fields(htab->map.record, map_value); + bpf_obj_cancel_fields(&htab->map, map_value); } } @@ -883,7 +887,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) htab_unlock_bucket(b, flags); if (l == tgt_l) - check_and_free_fields(htab, l); + check_and_cancel_fields(htab, l); return l == tgt_l; } @@ -948,7 +952,7 @@ find_first_elem: static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l) { - check_and_free_fields(htab, l); + check_and_cancel_fields(htab, l); if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr); @@ -1001,7 +1005,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) if (htab_is_prealloc(htab)) { bpf_map_dec_elem_count(&htab->map); - check_and_free_fields(htab, l); + check_and_cancel_fields(htab, l); pcpu_freelist_push(&htab->freelist, &l->fnode); } else { dec_elem_count(htab); @@ -1018,7 +1022,7 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, /* copy true value_size bytes */ ptr = this_cpu_ptr(pptr); copy_map_value(&htab->map, ptr, value); - bpf_obj_free_fields(htab->map.record, ptr); + bpf_obj_cancel_fields(&htab->map, ptr); } else { u32 size = round_up(htab->map.value_size, 8); void *val; @@ -1028,7 +1032,7 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, cpu = map_flags >> 32; ptr = per_cpu_ptr(pptr, cpu); copy_map_value(&htab->map, ptr, value); - bpf_obj_free_fields(htab->map.record, ptr); + bpf_obj_cancel_fields(&htab->map, ptr); return; } @@ -1036,7 +1040,7 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, ptr = per_cpu_ptr(pptr, cpu); val = (map_flags & BPF_F_ALL_CPUS) ? value : value + size * cpu; copy_map_value(&htab->map, ptr, val); - bpf_obj_free_fields(htab->map.record, ptr); + bpf_obj_cancel_fields(&htab->map, ptr); } } } @@ -1252,11 +1256,11 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, if (l_old) { hlist_nulls_del_rcu(&l_old->hash_node); - /* l_old has already been stashed in htab->extra_elems, free - * its special fields before it is available for reuse. + /* l_old has already been stashed in htab->extra_elems, cancel + * its reusable special fields before it is available for reuse. */ if (htab_is_prealloc(htab)) - check_and_free_fields(htab, l_old); + check_and_cancel_fields(htab, l_old); } htab_unlock_bucket(b, flags); if (l_old && !htab_is_prealloc(htab)) @@ -1269,7 +1273,7 @@ err: static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem) { - check_and_free_fields(htab, elem); + check_and_cancel_fields(htab, elem); bpf_map_dec_elem_count(&htab->map); bpf_lru_push_free(&htab->lru, &elem->lru_node); } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index d4188a992bd8..7ed949f70f82 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -808,6 +808,11 @@ void bpf_obj_free_task_work(const struct btf_record *rec, void *obj) bpf_task_work_cancel_and_free(obj + rec->task_work_off); } +void bpf_obj_cancel_fields(struct bpf_map *map, void *obj) +{ + bpf_map_free_internal_structs(map, obj); +} + void bpf_obj_free_fields(const struct btf_record *rec, void *obj) { const struct btf_field *fields; diff --git a/tools/testing/selftests/bpf/prog_tests/htab_update.c b/tools/testing/selftests/bpf/prog_tests/htab_update.c index ea1a6766fbe9..0a28d4346924 100644 --- a/tools/testing/selftests/bpf/prog_tests/htab_update.c +++ b/tools/testing/selftests/bpf/prog_tests/htab_update.c @@ -23,7 +23,7 @@ static void test_reenter_update(void) if (!ASSERT_OK_PTR(skel, "htab_update__open")) return; - bpf_program__set_autoload(skel->progs.bpf_obj_free_fields, true); + bpf_program__set_autoload(skel->progs.bpf_obj_cancel_fields, true); err = htab_update__load(skel); if (!ASSERT_TRUE(!err, "htab_update__load") || err) goto out; @@ -50,7 +50,7 @@ static void test_reenter_update(void) /* * Second update: replace existing element with same key and trigger * the reentrancy of bpf_map_update_elem(). - * check_and_free_fields() calls bpf_obj_free_fields() on the old + * check_and_cancel_fields() calls bpf_obj_cancel_fields() on the old * value, which is where fentry program runs and performs a nested * bpf_map_update_elem(), triggering -EDEADLK. */ diff --git a/tools/testing/selftests/bpf/prog_tests/linked_list.c b/tools/testing/selftests/bpf/prog_tests/linked_list.c index dbff099860ba..8defea0253ed 100644 --- a/tools/testing/selftests/bpf/prog_tests/linked_list.c +++ b/tools/testing/selftests/bpf/prog_tests/linked_list.c @@ -131,13 +131,14 @@ end: linked_list_fail__destroy(skel); } -static void clear_fields(struct bpf_map *map) +static void clear_fields(struct bpf_program *prog) { - char buf[24]; - int key = 0; + LIBBPF_OPTS(bpf_test_run_opts, opts); + int ret; - memset(buf, 0xff, sizeof(buf)); - ASSERT_OK(bpf_map__update_elem(map, &key, sizeof(key), buf, sizeof(buf), 0), "check_and_free_fields"); + ret = bpf_prog_test_run_opts(bpf_program__fd(prog), &opts); + ASSERT_OK(ret, "clear_fields"); + ASSERT_OK(opts.retval, "clear_fields retval"); } enum { @@ -170,31 +171,31 @@ static void test_linked_list_success(int mode, bool leave_in_map) ASSERT_OK(ret, "map_list_push_pop"); ASSERT_OK(opts.retval, "map_list_push_pop retval"); if (!leave_in_map) - clear_fields(skel->maps.array_map); + clear_fields(skel->progs.clear_map_list); ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.inner_map_list_push_pop), &opts); ASSERT_OK(ret, "inner_map_list_push_pop"); ASSERT_OK(opts.retval, "inner_map_list_push_pop retval"); if (!leave_in_map) - clear_fields(skel->maps.inner_map); + clear_fields(skel->progs.clear_inner_map_list); ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.global_list_push_pop), &opts); ASSERT_OK(ret, "global_list_push_pop"); ASSERT_OK(opts.retval, "global_list_push_pop retval"); if (!leave_in_map) - clear_fields(skel->maps.bss_A); + clear_fields(skel->progs.clear_global_list); ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.global_list_push_pop_nested), &opts); ASSERT_OK(ret, "global_list_push_pop_nested"); ASSERT_OK(opts.retval, "global_list_push_pop_nested retval"); if (!leave_in_map) - clear_fields(skel->maps.bss_A); + clear_fields(skel->progs.clear_global_nested_list); ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.global_list_array_push_pop), &opts); ASSERT_OK(ret, "global_list_array_push_pop"); ASSERT_OK(opts.retval, "global_list_array_push_pop retval"); if (!leave_in_map) - clear_fields(skel->maps.bss_A); + clear_fields(skel->progs.clear_global_array_list); if (mode == PUSH_POP) goto end; @@ -204,19 +205,19 @@ ppm: ASSERT_OK(ret, "map_list_push_pop_multiple"); ASSERT_OK(opts.retval, "map_list_push_pop_multiple retval"); if (!leave_in_map) - clear_fields(skel->maps.array_map); + clear_fields(skel->progs.clear_map_list); ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.inner_map_list_push_pop_multiple), &opts); ASSERT_OK(ret, "inner_map_list_push_pop_multiple"); ASSERT_OK(opts.retval, "inner_map_list_push_pop_multiple retval"); if (!leave_in_map) - clear_fields(skel->maps.inner_map); + clear_fields(skel->progs.clear_inner_map_list); ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.global_list_push_pop_multiple), &opts); ASSERT_OK(ret, "global_list_push_pop_multiple"); ASSERT_OK(opts.retval, "global_list_push_pop_multiple retval"); if (!leave_in_map) - clear_fields(skel->maps.bss_A); + clear_fields(skel->progs.clear_global_list); if (mode == PUSH_POP_MULT) goto end; @@ -226,19 +227,19 @@ lil: ASSERT_OK(ret, "map_list_in_list"); ASSERT_OK(opts.retval, "map_list_in_list retval"); if (!leave_in_map) - clear_fields(skel->maps.array_map); + clear_fields(skel->progs.clear_map_list); ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.inner_map_list_in_list), &opts); ASSERT_OK(ret, "inner_map_list_in_list"); ASSERT_OK(opts.retval, "inner_map_list_in_list retval"); if (!leave_in_map) - clear_fields(skel->maps.inner_map); + clear_fields(skel->progs.clear_inner_map_list); ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.global_list_in_list), &opts); ASSERT_OK(ret, "global_list_in_list"); ASSERT_OK(opts.retval, "global_list_in_list retval"); if (!leave_in_map) - clear_fields(skel->maps.bss_A); + clear_fields(skel->progs.clear_global_list); end: linked_list__destroy(skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/map_kptr.c b/tools/testing/selftests/bpf/prog_tests/map_kptr.c index 03b46f17cf53..ec6f2f2e8308 100644 --- a/tools/testing/selftests/bpf/prog_tests/map_kptr.c +++ b/tools/testing/selftests/bpf/prog_tests/map_kptr.c @@ -51,7 +51,6 @@ static void test_map_kptr_success(bool test_run) ret = bpf_map__update_elem(skel->maps.array_map, &key, sizeof(key), buf, sizeof(buf), 0); ASSERT_OK(ret, "array_map update"); - skel->data->ref--; ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_map_kptr_ref3), &opts); ASSERT_OK(ret, "test_map_kptr_ref3 refcount"); ASSERT_OK(opts.retval, "test_map_kptr_ref3 retval"); @@ -59,49 +58,42 @@ static void test_map_kptr_success(bool test_run) ret = bpf_map__update_elem(skel->maps.pcpu_array_map, &key, sizeof(key), pbuf, cpu * sizeof(buf), 0); ASSERT_OK(ret, "pcpu_array_map update"); - skel->data->ref--; ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_map_kptr_ref3), &opts); ASSERT_OK(ret, "test_map_kptr_ref3 refcount"); ASSERT_OK(opts.retval, "test_map_kptr_ref3 retval"); ret = bpf_map__delete_elem(skel->maps.hash_map, &key, sizeof(key), 0); ASSERT_OK(ret, "hash_map delete"); - skel->data->ref--; ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_map_kptr_ref3), &opts); ASSERT_OK(ret, "test_map_kptr_ref3 refcount"); ASSERT_OK(opts.retval, "test_map_kptr_ref3 retval"); ret = bpf_map__delete_elem(skel->maps.pcpu_hash_map, &key, sizeof(key), 0); ASSERT_OK(ret, "pcpu_hash_map delete"); - skel->data->ref--; ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_map_kptr_ref3), &opts); ASSERT_OK(ret, "test_map_kptr_ref3 refcount"); ASSERT_OK(opts.retval, "test_map_kptr_ref3 retval"); ret = bpf_map__delete_elem(skel->maps.hash_malloc_map, &key, sizeof(key), 0); ASSERT_OK(ret, "hash_malloc_map delete"); - skel->data->ref--; ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_map_kptr_ref3), &opts); ASSERT_OK(ret, "test_map_kptr_ref3 refcount"); ASSERT_OK(opts.retval, "test_map_kptr_ref3 retval"); ret = bpf_map__delete_elem(skel->maps.pcpu_hash_malloc_map, &key, sizeof(key), 0); ASSERT_OK(ret, "pcpu_hash_malloc_map delete"); - skel->data->ref--; ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_map_kptr_ref3), &opts); ASSERT_OK(ret, "test_map_kptr_ref3 refcount"); ASSERT_OK(opts.retval, "test_map_kptr_ref3 retval"); ret = bpf_map__delete_elem(skel->maps.lru_hash_map, &key, sizeof(key), 0); ASSERT_OK(ret, "lru_hash_map delete"); - skel->data->ref--; ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_map_kptr_ref3), &opts); ASSERT_OK(ret, "test_map_kptr_ref3 refcount"); ASSERT_OK(opts.retval, "test_map_kptr_ref3 retval"); ret = bpf_map__delete_elem(skel->maps.lru_pcpu_hash_map, &key, sizeof(key), 0); ASSERT_OK(ret, "lru_pcpu_hash_map delete"); - skel->data->ref--; ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_map_kptr_ref3), &opts); ASSERT_OK(ret, "test_map_kptr_ref3 refcount"); ASSERT_OK(opts.retval, "test_map_kptr_ref3 retval"); @@ -175,7 +167,7 @@ void serial_test_map_kptr(void) ASSERT_OK(kern_sync_rcu(), "sync rcu"); wait_for_map_release(); - /* Observe refcount dropping to 1 on synchronous delete elem */ + /* Observe refcount dropping to 1 on map release. */ test_map_kptr_success(true); } diff --git a/tools/testing/selftests/bpf/prog_tests/refcounted_kptr.c b/tools/testing/selftests/bpf/prog_tests/refcounted_kptr.c index d2c0542716a8..1737eba34323 100644 --- a/tools/testing/selftests/bpf/prog_tests/refcounted_kptr.c +++ b/tools/testing/selftests/bpf/prog_tests/refcounted_kptr.c @@ -57,6 +57,7 @@ void test_percpu_hash_refcounted_kptr_refcount_leak(void) .data_size_in = sizeof(pkt_v4), .repeat = 1, ); + LIBBPF_OPTS(bpf_test_run_opts, syscall_opts); cpu_nr = libbpf_num_possible_cpus(); if (!ASSERT_GT(cpu_nr, 0, "libbpf_num_possible_cpus")) @@ -87,8 +88,11 @@ void test_percpu_hash_refcounted_kptr_refcount_leak(void) if (!ASSERT_EQ(opts.retval, 2, "opts.retval")) goto out; - err = bpf_map__update_elem(map, &key, sizeof(key), values, values_sz, 0); - if (!ASSERT_OK(err, "bpf_map__update_elem")) + fd = bpf_program__fd(skel->progs.clear_percpu_hash_kptr); + err = bpf_prog_test_run_opts(fd, &syscall_opts); + if (!ASSERT_OK(err, "bpf_prog_test_run_opts")) + goto out; + if (!ASSERT_EQ(syscall_opts.retval, 1, "syscall_opts.retval")) goto out; fd = bpf_program__fd(skel->progs.check_percpu_hash_refcount); diff --git a/tools/testing/selftests/bpf/progs/htab_update.c b/tools/testing/selftests/bpf/progs/htab_update.c index 195d3b2fba00..62c1b1325ec2 100644 --- a/tools/testing/selftests/bpf/progs/htab_update.c +++ b/tools/testing/selftests/bpf/progs/htab_update.c @@ -22,8 +22,8 @@ struct { int pid = 0; int update_err = 0; -SEC("?fentry/bpf_obj_free_fields") -int bpf_obj_free_fields(void *ctx) +SEC("?fentry/bpf_obj_cancel_fields") +int bpf_obj_cancel_fields(void *ctx) { __u32 key = 0; struct val value = { .payload = 1 }; diff --git a/tools/testing/selftests/bpf/progs/linked_list.c b/tools/testing/selftests/bpf/progs/linked_list.c index 421f40835acd..fa97faa5358b 100644 --- a/tools/testing/selftests/bpf/progs/linked_list.c +++ b/tools/testing/selftests/bpf/progs/linked_list.c @@ -290,6 +290,77 @@ int test_list_in_list(struct bpf_spin_lock *lock, struct bpf_list_head *head) return list_in_list(lock, head, true); } +#define MAX_LIST_CLEAR_NODES 256 + +static __always_inline +int clear_list(struct bpf_spin_lock *lock, struct bpf_list_head *head) +{ + struct bpf_list_node *n; + int i; + + for (i = 0; i < MAX_LIST_CLEAR_NODES; i++) { + bpf_spin_lock(lock); + n = bpf_list_pop_front(head); + bpf_spin_unlock(lock); + if (!n) + return 0; + bpf_obj_drop(container_of(n, struct foo, node2)); + } + return 1; +} + +SEC("syscall") +int clear_map_list(void *ctx) +{ + struct map_value *v; + + v = bpf_map_lookup_elem(&array_map, &(int){0}); + if (!v) + return 1; + return clear_list(&v->lock, &v->head); +} + +SEC("syscall") +int clear_inner_map_list(void *ctx) +{ + struct map_value *v; + void *map; + + map = bpf_map_lookup_elem(&map_of_maps, &(int){0}); + if (!map) + return 1; + v = bpf_map_lookup_elem(map, &(int){0}); + if (!v) + return 1; + return clear_list(&v->lock, &v->head); +} + +SEC("syscall") +int clear_global_list(void *ctx) +{ + return clear_list(&glock, &ghead); +} + +SEC("syscall") +int clear_global_nested_list(void *ctx) +{ + return clear_list(&ghead_nested.inner.lock, &ghead_nested.inner.head); +} + +SEC("syscall") +int clear_global_array_list(void *ctx) +{ + int ret; + + ret = clear_list(&glock_c, &ghead_array[0]); + if (ret) + return ret; + ret = clear_list(&glock_c, &ghead_array[1]); + if (ret) + return ret; + return clear_list(&glock_c, &ghead_array_one[0]); +} + SEC("tc") int map_list_push_pop(void *ctx) { diff --git a/tools/testing/selftests/bpf/progs/refcounted_kptr.c b/tools/testing/selftests/bpf/progs/refcounted_kptr.c index 13de169ad68f..61906f48025c 100644 --- a/tools/testing/selftests/bpf/progs/refcounted_kptr.c +++ b/tools/testing/selftests/bpf/progs/refcounted_kptr.c @@ -1036,13 +1036,31 @@ int percpu_hash_refcount_leak(void *ctx) struct map_value *v; int key = 0; - v = bpf_map_lookup_elem(&percpu_hash, &key); + v = bpf_map_lookup_percpu_elem(&percpu_hash, &key, 0); if (!v) return 0; return __insert_in_list(&head, &lock, &v->node); } +SEC("syscall") +int clear_percpu_hash_kptr(void *ctx) +{ + struct node_data *n; + struct map_value *v; + int key = 0; + + v = bpf_map_lookup_percpu_elem(&percpu_hash, &key, 0); + if (!v) + return 0; + + n = bpf_kptr_xchg(&v->node, NULL); + if (!n) + return 0; + bpf_obj_drop(n); + return probe_read_refcount(); +} + SEC("tc") int check_percpu_hash_refcount(void *ctx) { -- cgit v1.2.3 From 10627ddc0167aab5c1c390a10ef461e9937aba08 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 10 Jun 2026 12:55:38 +0200 Subject: bpf: Tighten cgroup storage cookie checks for prog arrays The fix in commit abad3d0bad72 ("bpf: Fix oob access in cgroup local storage") is still incomplete. The prog-array compatibility check treats a program with no cgroup storage as compatible with any stored storage cookie. This allows a storage-less program to bridge a tail call chain between an entry program and a storage-using callee even though cgroup local storage at runtime still follows the caller's context, that is, A -> B(no storage) -> C(storage) path. Requiring exact cookie equality would break the legitimate case of a storage-less leaf program being tail called from a storage-using one. Instead, only accept a zero storage cookie if the program cannot perform tail calls itself. This keeps A -> B(no storage) working while rejecting the A -> B(no storage) -> C(storage) bridge. Fixes: abad3d0bad72 ("bpf: Fix oob access in cgroup local storage") Reported-by: Lin Ma Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20260610105539.705887-1-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index a656a8572bdb..649cce41e13f 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2481,7 +2481,7 @@ static bool __bpf_prog_map_compatible(struct bpf_map *map, cookie = aux->cgroup_storage[i] ? aux->cgroup_storage[i]->cookie : 0; ret = map->owner->storage_cookie[i] == cookie || - !cookie; + (!cookie && !aux->tail_call_reachable); } if (ret && map->owner->attach_func_proto != aux->attach_func_proto) { -- cgit v1.2.3 From 6001896f00984d317fb75160ba05c4a885fbe2a0 Mon Sep 17 00:00:00 2001 From: Sun Jian Date: Fri, 12 Jun 2026 19:40:31 +0800 Subject: bpf: Run generic devmap egress prog on private skb MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Generic XDP devmap multi redirect uses skb_clone() for intermediate destinations and sends the last destination with the original skb. This can leave multiple destinations sharing the same packet data. This becomes visible after generic devmap egress-program support was added: a devmap egress program may mutate packet data, and another destination sharing the same data can observe that mutation. Native XDP broadcast redirect does not have this issue because xdpf_clone() copies the frame data for each destination. Generic XDP should provide the same per-destination isolation before running a devmap egress program. Fix this by making cloned skbs private before running the generic devmap egress program. Use skb_copy() instead of skb_unshare() so allocation failure does not consume the skb and the existing caller error paths keep their ownership semantics. Fixes: 2ea5eabaf04a ("bpf: devmap: Implement devmap prog execution for generic XDP") Suggested-by: Jiayuan Chen Suggested-by: Jakub Kicinski Reviewed-by: Toke Høiland-Jørgensen Signed-off-by: Sun Jian Link: https://lore.kernel.org/r/20260612114032.244616-2-sun.jian.kdev@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/devmap.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel/bpf') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 5b9eac5342a9..dc7b859e8bbf 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -710,6 +710,18 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, if (unlikely(err)) return err; + if (dst->xdp_prog && skb_cloned(skb)) { + struct sk_buff *nskb; + + nskb = skb_copy(skb, GFP_ATOMIC); + if (!nskb) + return -ENOMEM; + + nskb->mac_len = skb->mac_len; + consume_skb(skb); + skb = nskb; + } + /* Redirect has already succeeded semantically at this point, so we just * return 0 even if packet is dropped. Helper below takes care of * freeing skb. -- cgit v1.2.3 From 4c71303c837449158815c521fcee4ec3b8721dbd Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Wed, 10 Jun 2026 20:17:23 +0000 Subject: bpf: Fix setting retval to -EPERM for cgroup hooks not returning errno When a cgroup BPF program exits with 0, bpf_prog_run_array_cg() sets the hook return value to -EPERM if it is not a valid errno. This is correct for errno-based hooks, which return 0 on success and negative errno on failure, but wrong for boolean and void LSM hooks. Boolean LSM hooks should only return true or false, and void LSM hooks have no return value at all. Fix it by skipping setting -EPERM for hooks not returning errno. Fixes: 69fd337a975c ("bpf: per-cgroup lsm flavor") Signed-off-by: Xu Kuohai Link: https://lore.kernel.org/r/20260610201724.733943-2-xukuohai@huaweicloud.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_lsm.h | 6 ++++++ kernel/bpf/bpf_lsm.c | 20 ++++++++++++++++++++ kernel/bpf/cgroup.c | 47 ++++++++++++++++++++++++++++++++++------------- 3 files changed, 60 insertions(+), 13 deletions(-) (limited to 'kernel/bpf') diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h index 643809cc78c3..143775a27a2a 100644 --- a/include/linux/bpf_lsm.h +++ b/include/linux/bpf_lsm.h @@ -52,6 +52,7 @@ int bpf_set_dentry_xattr_locked(struct dentry *dentry, const char *name__str, const struct bpf_dynptr *value_p, int flags); int bpf_remove_dentry_xattr_locked(struct dentry *dentry, const char *name__str); bool bpf_lsm_has_d_inode_locked(const struct bpf_prog *prog); +bool bpf_lsm_hook_returns_errno(u32 btf_id); #else /* !CONFIG_BPF_LSM */ @@ -104,6 +105,11 @@ static inline bool bpf_lsm_has_d_inode_locked(const struct bpf_prog *prog) { return false; } + +static inline bool bpf_lsm_hook_returns_errno(u32 btf_id) +{ + return true; +} #endif /* CONFIG_BPF_LSM */ #endif /* _LINUX_BPF_LSM_H */ diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index c5c925f00202..564071a92d7d 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -427,6 +427,26 @@ BTF_ID(func, bpf_lsm_audit_rule_known) BTF_ID(func, bpf_lsm_inode_xattr_skipcap) BTF_SET_END(bool_lsm_hooks) +/* hooks returning void */ +#define LSM_HOOK_void(DEFAULT, NAME, ...) BTF_ID(func, bpf_lsm_##NAME) +#define LSM_HOOK_int(DEFAULT, NAME, ...) /* nothing */ +#define LSM_HOOK(RET, DEFAULT, NAME, ...) LSM_HOOK_##RET(DEFAULT, NAME, __VA_ARGS__) +BTF_SET_START(void_lsm_hooks) +#include +#undef LSM_HOOK +#undef LSM_HOOK_void +#undef LSM_HOOK_int +BTF_SET_END(void_lsm_hooks) + +bool bpf_lsm_hook_returns_errno(u32 btf_id) +{ + if (btf_id_set_contains(&bool_lsm_hooks, btf_id)) + return false; + if (btf_id_set_contains(&void_lsm_hooks, btf_id)) + return false; + return true; +} + int bpf_lsm_get_retval_range(const struct bpf_prog *prog, struct bpf_retval_range *retval_range) { diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 35d1f1428ef3..83ce66296ac1 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -55,6 +55,28 @@ void __init cgroup_bpf_lifetime_notifier_init(void) &cgroup_bpf_lifetime_nb)); } +#ifdef CONFIG_BPF_LSM +struct cgroup_lsm_atype { + u32 attach_btf_id; + int refcnt; + bool returns_errno; +}; + +static struct cgroup_lsm_atype cgroup_lsm_atype[CGROUP_LSM_NUM]; + +static bool cgroup_bpf_hook_returns_errno(enum cgroup_bpf_attach_type atype) +{ + if (atype >= CGROUP_LSM_START && atype <= CGROUP_LSM_END) + return READ_ONCE(cgroup_lsm_atype[atype - CGROUP_LSM_START].returns_errno); + return true; +} +#else +static bool cgroup_bpf_hook_returns_errno(enum cgroup_bpf_attach_type atype) +{ + return true; +} +#endif + /* __always_inline is necessary to prevent indirect call through run_prog * function pointer. */ @@ -83,7 +105,8 @@ bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp, *(ret_flags) |= (func_ret >> 1); func_ret &= 1; } - if (!func_ret && !IS_ERR_VALUE((long)run_ctx.retval)) + if (!func_ret && cgroup_bpf_hook_returns_errno(atype) && + !IS_ERR_VALUE((long)run_ctx.retval)) run_ctx.retval = -EPERM; item++; } @@ -156,13 +179,6 @@ unsigned int __cgroup_bpf_run_lsm_current(const void *ctx, } #ifdef CONFIG_BPF_LSM -struct cgroup_lsm_atype { - u32 attach_btf_id; - int refcnt; -}; - -static struct cgroup_lsm_atype cgroup_lsm_atype[CGROUP_LSM_NUM]; - static enum cgroup_bpf_attach_type bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id) { @@ -191,10 +207,13 @@ void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype) lockdep_assert_held(&cgroup_mutex); - WARN_ON_ONCE(cgroup_lsm_atype[i].attach_btf_id && - cgroup_lsm_atype[i].attach_btf_id != attach_btf_id); - - cgroup_lsm_atype[i].attach_btf_id = attach_btf_id; + if (!cgroup_lsm_atype[i].attach_btf_id) { + cgroup_lsm_atype[i].attach_btf_id = attach_btf_id; + WRITE_ONCE(cgroup_lsm_atype[i].returns_errno, + bpf_lsm_hook_returns_errno(attach_btf_id)); + } else { + WARN_ON_ONCE(cgroup_lsm_atype[i].attach_btf_id != attach_btf_id); + } cgroup_lsm_atype[i].refcnt++; } @@ -203,8 +222,10 @@ void bpf_cgroup_atype_put(int cgroup_atype) int i = cgroup_atype - CGROUP_LSM_START; cgroup_lock(); - if (--cgroup_lsm_atype[i].refcnt <= 0) + if (--cgroup_lsm_atype[i].refcnt <= 0) { + WRITE_ONCE(cgroup_lsm_atype[i].returns_errno, true); cgroup_lsm_atype[i].attach_btf_id = 0; + } WARN_ON_ONCE(cgroup_lsm_atype[i].refcnt < 0); cgroup_unlock(); } -- cgit v1.2.3 From 2148794eeaf2a898adc791e9472eb80ea55984da Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Sat, 13 Jun 2026 11:07:55 -0700 Subject: bpf: Raise maximum call chain depth to 16 frames Bump MAX_CALL_FRAMES from 8 to 16 to allow deeper call chains that Rust-BPF requires and update selftests. Link: https://lore.kernel.org/r/20260613180755.29671-1-alexei.starovoitov@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 13 +++--- kernel/bpf/verifier.c | 15 ++++--- .../selftests/bpf/progs/test_global_func3.c | 52 +++++++++++++++++++++- .../selftests/bpf/progs/verifier_liveness_exp.c | 2 +- .../selftests/bpf/progs/verifier_scalar_ids.c | 25 +++++------ tools/testing/selftests/bpf/verifier/calls.c | 48 ++++++++++++++++++++ 6 files changed, 128 insertions(+), 27 deletions(-) (limited to 'kernel/bpf') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index d57b339a8cb8..39a851e690ec 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -404,7 +404,7 @@ struct bpf_func_state { struct bpf_reg_state *stack_arg_regs; /* Outgoing on-stack arguments */ }; -#define MAX_CALL_FRAMES 8 +#define MAX_CALL_FRAMES 16 /* instruction history flags, used in bpf_jmp_history_entry.flags field. * Frame number and SPI are stored in dedicated fields of bpf_jmp_history_entry. @@ -421,20 +421,21 @@ enum { struct bpf_jmp_history_entry { /* insn idx can't be bigger than 1 million */ u32 idx : 20; - u32 frame : 3; /* stack access frame number */ + u32 frame : 4; /* stack access frame number */ u32 spi : 6; /* stack slot index (0..63) */ - u32 : 3; + u32 : 2; u32 prev_idx : 20; /* special INSN_F_xxx flags */ u32 flags : 4; u32 : 8; - /* additional registers that need precision tracking when this - * jump is backtracked, vector of six 10-bit records + /* + * additional registers that need precision tracking when this + * jump is backtracked, vector of five 11-bit records */ u64 linked_regs; }; -static_assert(MAX_CALL_FRAMES <= (1 << 3)); +static_assert(MAX_CALL_FRAMES <= (1 << 4)); static_assert(MAX_BPF_STACK / 8 <= (1 << 6)); /* Maximum number of bpf_reg_state objects that can exist at once */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index eb46a81a8c51..2abc79dbf281 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3144,7 +3144,7 @@ static void mark_indirect_target(struct bpf_verifier_env *env, int idx) env->insn_aux_data[idx].indirect_target = true; } -#define LR_FRAMENO_BITS 3 +#define LR_FRAMENO_BITS 4 #define LR_SPI_BITS 6 #define LR_ENTRY_BITS (LR_SPI_BITS + LR_FRAMENO_BITS + 1) #define LR_SIZE_BITS 4 @@ -3153,7 +3153,11 @@ static void mark_indirect_target(struct bpf_verifier_env *env, int idx) #define LR_SIZE_MASK ((1ull << LR_SIZE_BITS) - 1) #define LR_SPI_OFF LR_FRAMENO_BITS #define LR_IS_REG_OFF (LR_SPI_BITS + LR_FRAMENO_BITS) -#define LINKED_REGS_MAX 6 +#define LINKED_REGS_MAX 5 + +static_assert(MAX_CALL_FRAMES <= (1 << LR_FRAMENO_BITS)); +static_assert(LINKED_REGS_MAX < (1 << LR_SIZE_BITS)); +static_assert(LINKED_REGS_MAX * LR_ENTRY_BITS + LR_SIZE_BITS <= 64); struct linked_reg { u8 frameno; @@ -3177,10 +3181,11 @@ static struct linked_reg *linked_regs_push(struct linked_regs *s) return NULL; } -/* Use u64 as a vector of 6 10-bit values, use first 4-bits to track +/* + * Use u64 as a vector of 5 11-bit values, use first 4-bits to track * number of elements currently in stack. - * Pack one history entry for linked registers as 10 bits in the following format: - * - 3-bits frameno + * Pack one history entry for linked registers as 11 bits in the following format: + * - 4-bits frameno * - 6-bits spi_or_reg * - 1-bit is_reg */ diff --git a/tools/testing/selftests/bpf/progs/test_global_func3.c b/tools/testing/selftests/bpf/progs/test_global_func3.c index 974fd8c19561..b66abb350fb0 100644 --- a/tools/testing/selftests/bpf/progs/test_global_func3.c +++ b/tools/testing/selftests/bpf/progs/test_global_func3.c @@ -53,9 +53,57 @@ int f8(struct __sk_buff *skb) return f7(skb); } +static __attribute__ ((noinline)) +int f9(struct __sk_buff *skb) +{ + return f8(skb); +} + +static __attribute__ ((noinline)) +int f10(struct __sk_buff *skb) +{ + return f9(skb); +} + +static __attribute__ ((noinline)) +int f11(struct __sk_buff *skb) +{ + return f10(skb); +} + +static __attribute__ ((noinline)) +int f12(struct __sk_buff *skb) +{ + return f11(skb); +} + +static __attribute__ ((noinline)) +int f13(struct __sk_buff *skb) +{ + return f12(skb); +} + +static __attribute__ ((noinline)) +int f14(struct __sk_buff *skb) +{ + return f13(skb); +} + +static __attribute__ ((noinline)) +int f15(struct __sk_buff *skb) +{ + return f14(skb); +} + +static __attribute__ ((noinline)) +int f16(struct __sk_buff *skb) +{ + return f15(skb); +} + SEC("tc") -__failure __msg("the call stack of 9 frames") +__failure __msg("the call stack of 17 frames") int global_func3(struct __sk_buff *skb) { - return f8(skb); + return f16(skb); } diff --git a/tools/testing/selftests/bpf/progs/verifier_liveness_exp.c b/tools/testing/selftests/bpf/progs/verifier_liveness_exp.c index b058de623200..72646fa2745e 100644 --- a/tools/testing/selftests/bpf/progs/verifier_liveness_exp.c +++ b/tools/testing/selftests/bpf/progs/verifier_liveness_exp.c @@ -15,7 +15,7 @@ * FP offset at each call site. arg_track keys on (frame, off[]), so * r1=fp-8, r1=fp-16, ... r1=fp-400 produce 50 unique cache keys per level. * - * This test chains 8 subprograms (the MAX_CALL_FRAMES limit). Each + * This test chains 8 subprograms (within the MAX_CALL_FRAMES limit). Each * intermediate function calls the next one 50 times, each time with a * different FP-relative offset in r1. * diff --git a/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c b/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c index 70ae14d6084f..e38f102da45f 100644 --- a/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c +++ b/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c @@ -372,37 +372,36 @@ __naked void precision_two_ids(void) SEC("socket") __success __log_level(2) __flag(BPF_F_TEST_STATE_FREQ) -/* check that r0 and r6 have different IDs after 'if', - * collect_linked_regs() can't tie more than 6 registers for a single insn. +/* + * check that r0 and r5 have different IDs after 'if', + * collect_linked_regs() can't tie more than 5 registers for a single insn. */ -__msg("8: (25) if r0 > 0x7 goto pc+0 ; R0=scalar(id=1") -__msg("14: (bf) r6 = r6 ; R6=scalar(id=2") -/* check that r{0-5} are marked precise after 'if' */ -__msg("frame0: regs=r0 stack= before 8: (25) if r0 > 0x7 goto pc+0") -__msg("frame0: parent state regs=r0,r1,r2,r3,r4,r5 stack=:") +__msg("7: (25) if r0 > 0x7 goto pc+0 ; R0=scalar(id=1") +__msg("12: (bf) r5 = r5 ; R5=scalar(id=2") +/* check that r{0-4} are marked precise after 'if' */ +__msg("frame0: regs=r0 stack= before 7: (25) if r0 > 0x7 goto pc+0") +__msg("frame0: parent state regs=r0,r1,r2,r3,r4 stack=:") __naked void linked_regs_too_many_regs(void) { asm volatile ( /* r0 = random number up to 0xff */ "call %[bpf_ktime_get_ns];" "r0 &= 0xff;" - /* tie r{0-6} IDs */ + /* tie r{0-5} IDs */ "r1 = r0;" "r2 = r0;" "r3 = r0;" "r4 = r0;" "r5 = r0;" - "r6 = r0;" - /* propagate range for r{0-6} */ + /* propagate range for r{0-5} */ "if r0 > 7 goto +0;" - /* keep r{1-5} live */ + /* keep r{1-4} live */ "r1 = r1;" "r2 = r2;" "r3 = r3;" "r4 = r4;" + /* make r5 appear in the log */ "r5 = r5;" - /* make r6 appear in the log */ - "r6 = r6;" /* force r0 to be precise, * this would cause r{0-4} to be precise because of shared IDs */ diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c index 42d523a21a43..302d712e0d7e 100644 --- a/tools/testing/selftests/bpf/verifier/calls.c +++ b/tools/testing/selftests/bpf/verifier/calls.c @@ -1219,6 +1219,30 @@ BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call H */ BPF_EXIT_INSN(), /* H */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call I */ + BPF_EXIT_INSN(), + /* I */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call J */ + BPF_EXIT_INSN(), + /* J */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call K */ + BPF_EXIT_INSN(), + /* K */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call L */ + BPF_EXIT_INSN(), + /* L */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call M */ + BPF_EXIT_INSN(), + /* M */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call N */ + BPF_EXIT_INSN(), + /* N */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call O */ + BPF_EXIT_INSN(), + /* O */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call P */ + BPF_EXIT_INSN(), + /* P */ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -1257,6 +1281,30 @@ BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call H */ BPF_EXIT_INSN(), /* H */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call I */ + BPF_EXIT_INSN(), + /* I */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call J */ + BPF_EXIT_INSN(), + /* J */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call K */ + BPF_EXIT_INSN(), + /* K */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call L */ + BPF_EXIT_INSN(), + /* L */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call M */ + BPF_EXIT_INSN(), + /* M */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call N */ + BPF_EXIT_INSN(), + /* N */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call O */ + BPF_EXIT_INSN(), + /* O */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call P */ + BPF_EXIT_INSN(), + /* P */ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, -- cgit v1.2.3 From 26330a9226417c9a3395db9fdb403f7d7371e6b7 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 11 Jun 2026 13:42:26 +0200 Subject: bpf: Add support to specify uprobe_multi target via file descriptor Allow uprobe_multi link to identify the target binary by an already opened file descriptor. Adding new BPF_F_UPROBE_MULTI_PATH_FD flag and the path_fd field for the attr.link_create.uprobe_multi struct. When the flag is set, we resolve the target from path_fd, without the flag, we keep the existing string path behavior. I don't see a use case for supporting O_PATH file descriptors, because we need to read the binary first to get probes offsets, so I'm using the CLASS(fd, f), which fails for O_PATH fds. Assisted-by: Codex:GPT-5.4 Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20260611114230.950379-4-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 7 ++++++- kernel/bpf/syscall.c | 4 ++-- kernel/trace/bpf_trace.c | 43 ++++++++++++++++++++++++++++++++++++------ tools/include/uapi/linux/bpf.h | 7 ++++++- 4 files changed, 51 insertions(+), 10 deletions(-) (limited to 'kernel/bpf') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 11dd610fa5fa..89b36de5fdbb 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1327,7 +1327,11 @@ enum { * BPF_TRACE_UPROBE_MULTI attach type to create return probe. */ enum { - BPF_F_UPROBE_MULTI_RETURN = (1U << 0) + /* Get return uprobe. */ + BPF_F_UPROBE_MULTI_RETURN = (1U << 0), + + /* Get path from provided path_fd. */ + BPF_F_UPROBE_MULTI_PATH_FD = (1U << 1), }; /* link_create.netfilter.flags used in LINK_CREATE command for @@ -1864,6 +1868,7 @@ union bpf_attr { __u32 cnt; __u32 flags; __u32 pid; + __u32 path_fd; } uprobe_multi; struct { union { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7ed949f70f82..b44106c8ea75 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3480,7 +3480,7 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) seq_printf(m, "link_type:\t%s\n", link->flags == BPF_F_KPROBE_MULTI_RETURN ? "kretprobe_multi" : "kprobe_multi"); else if (link->type == BPF_LINK_TYPE_UPROBE_MULTI) - seq_printf(m, "link_type:\t%s\n", link->flags == BPF_F_UPROBE_MULTI_RETURN ? + seq_printf(m, "link_type:\t%s\n", link->flags & BPF_F_UPROBE_MULTI_RETURN ? "uretprobe_multi" : "uprobe_multi"); else seq_printf(m, "link_type:\t%s\n", bpf_link_type_strs[type]); @@ -5840,7 +5840,7 @@ err_put: return err; } -#define BPF_LINK_CREATE_LAST_FIELD link_create.uprobe_multi.pid +#define BPF_LINK_CREATE_LAST_FIELD link_create.uprobe_multi.path_fd static int link_create(union bpf_attr *attr, bpfptr_t uattr) { struct bpf_prog *prog; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index f8990bc6b64c..82f8feea6931 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -23,6 +23,7 @@ #include #include #include +#include #include @@ -3214,6 +3215,38 @@ static u64 bpf_uprobe_multi_cookie(struct bpf_run_ctx *ctx) return run_ctx->uprobe->cookie; } +static int bpf_uprobe_multi_get_path(const union bpf_attr *attr, struct path *path) +{ + void __user *upath = u64_to_user_ptr(attr->link_create.uprobe_multi.path); + u32 path_fd = attr->link_create.uprobe_multi.path_fd; + u32 flags = attr->link_create.uprobe_multi.flags; + + if (flags & BPF_F_UPROBE_MULTI_PATH_FD) { + /* + * When BPF_F_UPROBE_MULTI_PATH_FD is set, the executable is + * identified by path_fd, upath must be NULL. + */ + if (upath) + return -EINVAL; + + CLASS(fd, f)(path_fd); + if (fd_empty(f)) + return -EBADF; + *path = fd_file(f)->f_path; + path_get(path); + return 0; + } + + /* + * When BPF_F_UPROBE_MULTI_PATH_FD is not set, the path is resolved + * relative to the cwd (AT_FDCWD) or absolute using the upath string. + */ + if (!upath || path_fd) + return -EINVAL; + + return user_path_at(AT_FDCWD, upath, LOOKUP_FOLLOW, path); +} + int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct bpf_uprobe_multi_link *link = NULL; @@ -3223,7 +3256,6 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr struct task_struct *task = NULL; unsigned long __user *uoffsets; u64 __user *ucookies; - void __user *upath; unsigned long size; u32 flags, cnt, i; struct path path; @@ -3241,19 +3273,18 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr return -EINVAL; flags = attr->link_create.uprobe_multi.flags; - if (flags & ~BPF_F_UPROBE_MULTI_RETURN) + if (flags & ~(BPF_F_UPROBE_MULTI_RETURN | BPF_F_UPROBE_MULTI_PATH_FD)) return -EINVAL; /* - * path, offsets and cnt are mandatory, + * offsets and cnt are mandatory, * ref_ctr_offsets and cookies are optional */ - upath = u64_to_user_ptr(attr->link_create.uprobe_multi.path); uoffsets = u64_to_user_ptr(attr->link_create.uprobe_multi.offsets); cnt = attr->link_create.uprobe_multi.cnt; pid = attr->link_create.uprobe_multi.pid; - if (!upath || !uoffsets || !cnt || pid < 0) + if (!uoffsets || !cnt || pid < 0) return -EINVAL; if (cnt > MAX_UPROBE_MULTI_CNT) return -E2BIG; @@ -3271,7 +3302,7 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr !access_ok(ucookies, size)) return -EFAULT; - err = user_path_at(AT_FDCWD, upath, LOOKUP_FOLLOW, &path); + err = bpf_uprobe_multi_get_path(attr, &path); if (err) return err; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 11dd610fa5fa..89b36de5fdbb 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1327,7 +1327,11 @@ enum { * BPF_TRACE_UPROBE_MULTI attach type to create return probe. */ enum { - BPF_F_UPROBE_MULTI_RETURN = (1U << 0) + /* Get return uprobe. */ + BPF_F_UPROBE_MULTI_RETURN = (1U << 0), + + /* Get path from provided path_fd. */ + BPF_F_UPROBE_MULTI_PATH_FD = (1U << 1), }; /* link_create.netfilter.flags used in LINK_CREATE command for @@ -1864,6 +1868,7 @@ union bpf_attr { __u32 cnt; __u32 flags; __u32 pid; + __u32 path_fd; } uprobe_multi; struct { union { -- cgit v1.2.3