From 720e6a435194fb5237833a4a7ec6aa60a78964a8 Mon Sep 17 00:00:00 2001 From: Yonghong Song <yhs@fb.com> Date: Wed, 31 Aug 2022 08:26:46 -0700 Subject: bpf: Allow struct argument in trampoline based programs Allow struct argument in trampoline based programs where the struct size should be <= 16 bytes. In such cases, the argument will be put into up to 2 registers for bpf, x86_64 and arm64 architectures. To support arch-specific trampoline manipulation, add arg_flags for additional struct information about arguments in btf_func_model. Such information will be used in arch specific function arch_prepare_bpf_trampoline() to prepare argument access properly in trampoline. Signed-off-by: Yonghong Song <yhs@fb.com> Link: https://lore.kernel.org/r/20220831152646.2078089-1-yhs@fb.com Signed-off-by: Alexei Starovoitov <ast@kernel.org> --- include/linux/bpf.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9c1674973e03..4d32f125f4af 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -727,10 +727,14 @@ enum bpf_cgroup_storage_type { */ #define MAX_BPF_FUNC_REG_ARGS 5 +/* The argument is a structure. */ +#define BTF_FMODEL_STRUCT_ARG BIT(0) + struct btf_func_model { u8 ret_size; u8 nr_args; u8 arg_size[MAX_BPF_FUNC_ARGS]; + u8 arg_flags[MAX_BPF_FUNC_ARGS]; }; /* Restore arguments before returning from trampoline to let original function -- cgit v1.2.3 From 95f2f26f3cac06cfc046d2b29e60719d7848ea54 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires <benjamin.tissoires@redhat.com> Date: Tue, 6 Sep 2022 17:12:58 +0200 Subject: bpf: split btf_check_subprog_arg_match in two btf_check_subprog_arg_match() was used twice in verifier.c: - when checking for the type mismatches between a (sub)prog declaration and BTF - when checking the call of a subprog to see if the provided arguments are correct and valid This is problematic when we check if the first argument of a program (pointer to ctx) is correctly accessed: To be able to ensure we access a valid memory in the ctx, the verifier assumes the pointer to context is not null. This has the side effect of marking the program accessing the entire context, even if the context is never dereferenced. For example, by checking the context access with the current code, the following eBPF program would fail with -EINVAL if the ctx is set to null from the userspace: ``` SEC("syscall") int prog(struct my_ctx *args) { return 0; } ``` In that particular case, we do not want to actually check that the memory is correct while checking for the BTF validity, but we just want to ensure that the (sub)prog definition matches the BTF we have. So split btf_check_subprog_arg_match() in two so we can actually check for the memory used when in a call, and ignore that part when not. Note that a further patch is in preparation to disentangled btf_check_func_arg_match() from these two purposes, and so right now we just add a new hack around that by adding a boolean to this function. Signed-off-by: Benjamin Tissoires <benjamin.tissoires@redhat.com> Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com> Link: https://lore.kernel.org/r/20220906151303.2780789-3-benjamin.tissoires@redhat.com Signed-off-by: Alexei Starovoitov <ast@kernel.org> --- include/linux/bpf.h | 2 ++ kernel/bpf/btf.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++----- kernel/bpf/verifier.c | 2 +- 3 files changed, 52 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4d32f125f4af..3cf161cfd396 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1947,6 +1947,8 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, struct bpf_reg_state; int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *regs); +int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, + struct bpf_reg_state *regs); int btf_check_kfunc_arg_match(struct bpf_verifier_env *env, const struct btf *btf, u32 func_id, struct bpf_reg_state *regs, diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index ea94527e5d70..9291e2b2c950 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6203,7 +6203,8 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, const struct btf *btf, u32 func_id, struct bpf_reg_state *regs, bool ptr_to_mem_ok, - u32 kfunc_flags) + u32 kfunc_flags, + bool processing_call) { enum bpf_prog_type prog_type = resolve_prog_type(env->prog); bool rel = false, kptr_get = false, trusted_arg = false; @@ -6389,7 +6390,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, reg_ref_tname); return -EINVAL; } - } else if (ptr_to_mem_ok) { + } else if (ptr_to_mem_ok && processing_call) { const struct btf_type *resolve_ret; u32 type_size; @@ -6464,7 +6465,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, return rel ? ref_regno : 0; } -/* Compare BTF of a function with given bpf_reg_state. +/* Compare BTF of a function declaration with given bpf_reg_state. * Returns: * EFAULT - there is a verifier bug. Abort verification. * EINVAL - there is a type mismatch or BTF is not available. @@ -6491,7 +6492,50 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, return -EINVAL; is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL; - err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, 0); + err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, 0, false); + + /* Compiler optimizations can remove arguments from static functions + * or mismatched type can be passed into a global function. + * In such cases mark the function as unreliable from BTF point of view. + */ + if (err) + prog->aux->func_info_aux[subprog].unreliable = true; + return err; +} + +/* Compare BTF of a function call with given bpf_reg_state. + * Returns: + * EFAULT - there is a verifier bug. Abort verification. + * EINVAL - there is a type mismatch or BTF is not available. + * 0 - BTF matches with what bpf_reg_state expects. + * Only PTR_TO_CTX and SCALAR_VALUE states are recognized. + * + * NOTE: the code is duplicated from btf_check_subprog_arg_match() + * because btf_check_func_arg_match() is still doing both. Once that + * function is split in 2, we can call from here btf_check_subprog_arg_match() + * first, and then treat the calling part in a new code path. + */ +int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, + struct bpf_reg_state *regs) +{ + struct bpf_prog *prog = env->prog; + struct btf *btf = prog->aux->btf; + bool is_global; + u32 btf_id; + int err; + + if (!prog->aux->func_info) + return -EINVAL; + + btf_id = prog->aux->func_info[subprog].type_id; + if (!btf_id) + return -EFAULT; + + if (prog->aux->func_info_aux[subprog].unreliable) + return -EINVAL; + + is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL; + err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, 0, true); /* Compiler optimizations can remove arguments from static functions * or mismatched type can be passed into a global function. @@ -6507,7 +6551,7 @@ int btf_check_kfunc_arg_match(struct bpf_verifier_env *env, struct bpf_reg_state *regs, u32 kfunc_flags) { - return btf_check_func_arg_match(env, btf, func_id, regs, true, kfunc_flags); + return btf_check_func_arg_match(env, btf, func_id, regs, true, kfunc_flags, true); } /* Convert BTF of a function into bpf_reg_state if possible diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 003f7ba19558..7d9a2e18ca8a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6629,7 +6629,7 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn func_info_aux = env->prog->aux->func_info_aux; if (func_info_aux) is_global = func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL; - err = btf_check_subprog_arg_match(env, subprog, caller->regs); + err = btf_check_subprog_call(env, subprog, caller->regs); if (err == -EFAULT) return err; if (is_global) { -- cgit v1.2.3 From eb1f7f71c126c8fd50ea81af98f97c4b581ea4ae Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires <benjamin.tissoires@redhat.com> Date: Tue, 6 Sep 2022 17:13:02 +0200 Subject: bpf/verifier: allow kfunc to return an allocated mem For drivers (outside of network), the incoming data is not statically defined in a struct. Most of the time the data buffer is kzalloc-ed and thus we can not rely on eBPF and BTF to explore the data. This commit allows to return an arbitrary memory, previously allocated by the driver. An interesting extra point is that the kfunc can mark the exported memory region as read only or read/write. So, when a kfunc is not returning a pointer to a struct but to a plain type, we can consider it is a valid allocated memory assuming that: - one of the arguments is either called rdonly_buf_size or rdwr_buf_size - and this argument is a const from the caller point of view We can then use this parameter as the size of the allocated memory. The memory is either read-only or read-write based on the name of the size parameter. Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com> Signed-off-by: Benjamin Tissoires <benjamin.tissoires@redhat.com> Link: https://lore.kernel.org/r/20220906151303.2780789-7-benjamin.tissoires@redhat.com Signed-off-by: Alexei Starovoitov <ast@kernel.org> --- include/linux/bpf.h | 9 +++- include/linux/bpf_verifier.h | 2 + include/linux/btf.h | 10 +++++ kernel/bpf/btf.c | 101 ++++++++++++++++++++++++++++++++++--------- kernel/bpf/verifier.c | 45 +++++++++++++------ 5 files changed, 133 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3cf161cfd396..79883f883ff3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1944,6 +1944,13 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, const char *func_name, struct btf_func_model *m); +struct bpf_kfunc_arg_meta { + u64 r0_size; + bool r0_rdonly; + int ref_obj_id; + u32 flags; +}; + struct bpf_reg_state; int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *regs); @@ -1952,7 +1959,7 @@ int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, int btf_check_kfunc_arg_match(struct bpf_verifier_env *env, const struct btf *btf, u32 func_id, struct bpf_reg_state *regs, - u32 kfunc_flags); + struct bpf_kfunc_arg_meta *meta); int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *reg); int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog, diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 1fdddbf3546b..8fbc1d05281e 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -598,6 +598,8 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, struct bpf_attach_target_info *tgt_info); void bpf_free_kfunc_btf_tab(struct bpf_kfunc_btf_tab *tab); +int mark_chain_precision(struct bpf_verifier_env *env, int regno); + #define BPF_BASE_TYPE_MASK GENMASK(BPF_BASE_TYPE_BITS - 1, 0) /* extract base type from bpf_{arg, return, reg}_type. */ diff --git a/include/linux/btf.h b/include/linux/btf.h index ad93c2d9cc1c..1fcc833a8690 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -441,4 +441,14 @@ static inline int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dt } #endif +static inline bool btf_type_is_struct_ptr(struct btf *btf, const struct btf_type *t) +{ + if (!btf_type_is_ptr(t)) + return false; + + t = btf_type_skip_modifiers(btf, t->type, NULL); + + return btf_type_is_struct(t); +} + #endif diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 2c2d8190ca4a..9d12212fcd61 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6199,11 +6199,36 @@ static bool is_kfunc_arg_mem_size(const struct btf *btf, return true; } +static bool btf_is_kfunc_arg_mem_size(const struct btf *btf, + const struct btf_param *arg, + const struct bpf_reg_state *reg, + const char *name) +{ + int len, target_len = strlen(name); + const struct btf_type *t; + const char *param_name; + + t = btf_type_skip_modifiers(btf, arg->type, NULL); + if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE) + return false; + + param_name = btf_name_by_offset(btf, arg->name_off); + if (str_is_empty(param_name)) + return false; + len = strlen(param_name); + if (len != target_len) + return false; + if (strcmp(param_name, name)) + return false; + + return true; +} + static int btf_check_func_arg_match(struct bpf_verifier_env *env, const struct btf *btf, u32 func_id, struct bpf_reg_state *regs, bool ptr_to_mem_ok, - u32 kfunc_flags, + struct bpf_kfunc_arg_meta *kfunc_meta, bool processing_call) { enum bpf_prog_type prog_type = resolve_prog_type(env->prog); @@ -6241,12 +6266,12 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, return -EINVAL; } - if (is_kfunc) { + if (is_kfunc && kfunc_meta) { /* Only kfunc can be release func */ - rel = kfunc_flags & KF_RELEASE; - kptr_get = kfunc_flags & KF_KPTR_GET; - trusted_arg = kfunc_flags & KF_TRUSTED_ARGS; - sleepable = kfunc_flags & KF_SLEEPABLE; + rel = kfunc_meta->flags & KF_RELEASE; + kptr_get = kfunc_meta->flags & KF_KPTR_GET; + trusted_arg = kfunc_meta->flags & KF_TRUSTED_ARGS; + sleepable = kfunc_meta->flags & KF_SLEEPABLE; } /* check that BTF function arguments match actual types that the @@ -6259,6 +6284,38 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, t = btf_type_skip_modifiers(btf, args[i].type, NULL); if (btf_type_is_scalar(t)) { + if (is_kfunc && kfunc_meta) { + bool is_buf_size = false; + + /* check for any const scalar parameter of name "rdonly_buf_size" + * or "rdwr_buf_size" + */ + if (btf_is_kfunc_arg_mem_size(btf, &args[i], reg, + "rdonly_buf_size")) { + kfunc_meta->r0_rdonly = true; + is_buf_size = true; + } else if (btf_is_kfunc_arg_mem_size(btf, &args[i], reg, + "rdwr_buf_size")) + is_buf_size = true; + + if (is_buf_size) { + if (kfunc_meta->r0_size) { + bpf_log(log, "2 or more rdonly/rdwr_buf_size parameters for kfunc"); + return -EINVAL; + } + + if (!tnum_is_const(reg->var_off)) { + bpf_log(log, "R%d is not a const\n", regno); + return -EINVAL; + } + + kfunc_meta->r0_size = reg->var_off.value; + ret = mark_chain_precision(env, regno); + if (ret) + return ret; + } + } + if (reg->type == SCALAR_VALUE) continue; bpf_log(log, "R%d is not a scalar\n", regno); @@ -6289,6 +6346,17 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, if (ret < 0) return ret; + if (is_kfunc && reg->ref_obj_id) { + /* Ensure only one argument is referenced PTR_TO_BTF_ID */ + if (ref_obj_id) { + bpf_log(log, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", + regno, reg->ref_obj_id, ref_obj_id); + return -EFAULT; + } + ref_regno = regno; + ref_obj_id = reg->ref_obj_id; + } + /* kptr_get is only true for kfunc */ if (i == 0 && kptr_get) { struct bpf_map_value_off_desc *off_desc; @@ -6361,16 +6429,6 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, if (reg->type == PTR_TO_BTF_ID) { reg_btf = reg->btf; reg_ref_id = reg->btf_id; - /* Ensure only one argument is referenced PTR_TO_BTF_ID */ - if (reg->ref_obj_id) { - if (ref_obj_id) { - bpf_log(log, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", - regno, reg->ref_obj_id, ref_obj_id); - return -EFAULT; - } - ref_regno = regno; - ref_obj_id = reg->ref_obj_id; - } } else { reg_btf = btf_vmlinux; reg_ref_id = *reg2btf_ids[base_type(reg->type)]; @@ -6461,6 +6519,9 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, return -EINVAL; } + if (kfunc_meta && ref_obj_id) + kfunc_meta->ref_obj_id = ref_obj_id; + /* returns argument register number > 0 in case of reference release kfunc */ return rel ? ref_regno : 0; } @@ -6492,7 +6553,7 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, return -EINVAL; is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL; - err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, 0, false); + err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, NULL, false); /* Compiler optimizations can remove arguments from static functions * or mismatched type can be passed into a global function. @@ -6535,7 +6596,7 @@ int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, return -EINVAL; is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL; - err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, 0, true); + err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, NULL, true); /* Compiler optimizations can remove arguments from static functions * or mismatched type can be passed into a global function. @@ -6549,9 +6610,9 @@ int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, int btf_check_kfunc_arg_match(struct bpf_verifier_env *env, const struct btf *btf, u32 func_id, struct bpf_reg_state *regs, - u32 kfunc_flags) + struct bpf_kfunc_arg_meta *meta) { - return btf_check_func_arg_match(env, btf, func_id, regs, true, kfunc_flags, true); + return btf_check_func_arg_match(env, btf, func_id, regs, true, meta, true); } /* Convert BTF of a function into bpf_reg_state if possible diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3cfe60206de6..f3344a86d88d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2908,7 +2908,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, return 0; } -static int mark_chain_precision(struct bpf_verifier_env *env, int regno) +int mark_chain_precision(struct bpf_verifier_env *env, int regno) { return __mark_chain_precision(env, regno, -1); } @@ -7595,6 +7595,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, { const struct btf_type *t, *func, *func_proto, *ptr_type; struct bpf_reg_state *regs = cur_regs(env); + struct bpf_kfunc_arg_meta meta = { 0 }; const char *func_name, *ptr_type_name; u32 i, nargs, func_id, ptr_type_id; int err, insn_idx = *insn_idx_p; @@ -7629,8 +7630,10 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, acq = *kfunc_flags & KF_ACQUIRE; + meta.flags = *kfunc_flags; + /* Check the arguments */ - err = btf_check_kfunc_arg_match(env, desc_btf, func_id, regs, *kfunc_flags); + err = btf_check_kfunc_arg_match(env, desc_btf, func_id, regs, &meta); if (err < 0) return err; /* In case of release function, we get register number of refcounted @@ -7651,7 +7654,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, /* Check return type */ t = btf_type_skip_modifiers(desc_btf, func_proto->type, NULL); - if (acq && !btf_type_is_ptr(t)) { + if (acq && !btf_type_is_struct_ptr(desc_btf, t)) { verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n"); return -EINVAL; } @@ -7663,17 +7666,33 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, ptr_type = btf_type_skip_modifiers(desc_btf, t->type, &ptr_type_id); if (!btf_type_is_struct(ptr_type)) { - ptr_type_name = btf_name_by_offset(desc_btf, - ptr_type->name_off); - verbose(env, "kernel function %s returns pointer type %s %s is not supported\n", - func_name, btf_type_str(ptr_type), - ptr_type_name); - return -EINVAL; + if (!meta.r0_size) { + ptr_type_name = btf_name_by_offset(desc_btf, + ptr_type->name_off); + verbose(env, + "kernel function %s returns pointer type %s %s is not supported\n", + func_name, + btf_type_str(ptr_type), + ptr_type_name); + return -EINVAL; + } + + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_MEM; + regs[BPF_REG_0].mem_size = meta.r0_size; + + if (meta.r0_rdonly) + regs[BPF_REG_0].type |= MEM_RDONLY; + + /* Ensures we don't access the memory after a release_reference() */ + if (meta.ref_obj_id) + regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; + } else { + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].btf = desc_btf; + regs[BPF_REG_0].type = PTR_TO_BTF_ID; + regs[BPF_REG_0].btf_id = ptr_type_id; } - mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].btf = desc_btf; - regs[BPF_REG_0].type = PTR_TO_BTF_ID; - regs[BPF_REG_0].btf_id = ptr_type_id; if (*kfunc_flags & KF_RET_NULL) { regs[BPF_REG_0].type |= PTR_MAYBE_NULL; /* For mark_ptr_or_null_reg, see 93c230e3f5bd6 */ -- cgit v1.2.3 From 448325199f574d33824dbf9121efb03558412966 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi <memxor@gmail.com> Date: Sun, 4 Sep 2022 22:41:14 +0200 Subject: bpf: Add copy_map_value_long to copy to remote percpu memory bpf_long_memcpy is used while copying to remote percpu regions from BPF syscall and helpers, so that the copy is atomic at word size granularity. This might not be possible when you copy from map value hosting kptrs from or to percpu maps, as the alignment or size in disjoint regions may not be multiple of word size. Hence, to avoid complicating the copy loop, we only use bpf_long_memcpy when special fields are not present, otherwise use normal memcpy to copy the disjoint regions. Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com> Link: https://lore.kernel.org/r/20220904204145.3089-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov <ast@kernel.org> --- include/linux/bpf.h | 52 +++++++++++++++++++++++++++++++++------------------- 1 file changed, 33 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 79883f883ff3..6a73e94821c4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -280,14 +280,33 @@ static inline void check_and_init_map_value(struct bpf_map *map, void *dst) } } -/* copy everything but bpf_spin_lock and bpf_timer. There could be one of each. */ -static inline void copy_map_value(struct bpf_map *map, void *dst, void *src) +/* memcpy that is used with 8-byte aligned pointers, power-of-8 size and + * forced to use 'long' read/writes to try to atomically copy long counters. + * Best-effort only. No barriers here, since it _will_ race with concurrent + * updates from BPF programs. Called from bpf syscall and mostly used with + * size 8 or 16 bytes, so ask compiler to inline it. + */ +static inline void bpf_long_memcpy(void *dst, const void *src, u32 size) +{ + const long *lsrc = src; + long *ldst = dst; + + size /= sizeof(long); + while (size--) + *ldst++ = *lsrc++; +} + +/* copy everything but bpf_spin_lock, bpf_timer, and kptrs. There could be one of each. */ +static inline void __copy_map_value(struct bpf_map *map, void *dst, void *src, bool long_memcpy) { u32 curr_off = 0; int i; if (likely(!map->off_arr)) { - memcpy(dst, src, map->value_size); + if (long_memcpy) + bpf_long_memcpy(dst, src, round_up(map->value_size, 8)); + else + memcpy(dst, src, map->value_size); return; } @@ -299,6 +318,17 @@ static inline void copy_map_value(struct bpf_map *map, void *dst, void *src) } memcpy(dst + curr_off, src + curr_off, map->value_size - curr_off); } + +static inline void copy_map_value(struct bpf_map *map, void *dst, void *src) +{ + __copy_map_value(map, dst, src, false); +} + +static inline void copy_map_value_long(struct bpf_map *map, void *dst, void *src) +{ + __copy_map_value(map, dst, src, true); +} + void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, bool lock_src); void bpf_timer_cancel_and_free(void *timer); @@ -1827,22 +1857,6 @@ int bpf_get_file_flag(int flags); int bpf_check_uarg_tail_zero(bpfptr_t uaddr, size_t expected_size, size_t actual_size); -/* memcpy that is used with 8-byte aligned pointers, power-of-8 size and - * forced to use 'long' read/writes to try to atomically copy long counters. - * Best-effort only. No barriers here, since it _will_ race with concurrent - * updates from BPF programs. Called from bpf syscall and mostly used with - * size 8 or 16 bytes, so ask compiler to inline it. - */ -static inline void bpf_long_memcpy(void *dst, const void *src, u32 size) -{ - const long *lsrc = src; - long *ldst = dst; - - size /= sizeof(long); - while (size--) - *ldst++ = *lsrc++; -} - /* verify correctness of eBPF program */ int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, bpfptr_t uattr); -- cgit v1.2.3 From cc48755808c646666436745b35629c3f0d05e165 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi <memxor@gmail.com> Date: Sun, 4 Sep 2022 22:41:16 +0200 Subject: bpf: Add zero_map_value to zero map value with special fields We need this helper to skip over special fields (bpf_spin_lock, bpf_timer, kptrs) while zeroing a map value. Use the same logic as copy_map_value but memset instead of memcpy. Currently, the code zeroing map value memory does not have to deal with special fields, hence this is a prerequisite for introducing such support. Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com> Link: https://lore.kernel.org/r/20220904204145.3089-4-memxor@gmail.com Signed-off-by: Alexei Starovoitov <ast@kernel.org> --- include/linux/bpf.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6a73e94821c4..48ae05099f36 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -329,6 +329,25 @@ static inline void copy_map_value_long(struct bpf_map *map, void *dst, void *src __copy_map_value(map, dst, src, true); } +static inline void zero_map_value(struct bpf_map *map, void *dst) +{ + u32 curr_off = 0; + int i; + + if (likely(!map->off_arr)) { + memset(dst, 0, map->value_size); + return; + } + + for (i = 0; i < map->off_arr->cnt; i++) { + u32 next_off = map->off_arr->field_off[i]; + + memset(dst + curr_off, 0, next_off - curr_off); + curr_off += map->off_arr->field_sz[i]; + } + memset(dst + curr_off, 0, map->value_size - curr_off); +} + void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, bool lock_src); void bpf_timer_cancel_and_free(void *timer); -- cgit v1.2.3 From b239da34203f49c40b5d656220c39647c3ff0b3c Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi <memxor@gmail.com> Date: Sun, 4 Sep 2022 22:41:28 +0200 Subject: bpf: Add helper macro bpf_for_each_reg_in_vstate For a lot of use cases in future patches, we will want to modify the state of registers part of some same 'group' (e.g. same ref_obj_id). It won't just be limited to releasing reference state, but setting a type flag dynamically based on certain actions, etc. Hence, we need a way to easily pass a callback to the function that iterates over all registers in current bpf_verifier_state in all frames upto (and including) the curframe. While in C++ we would be able to easily use a lambda to pass state and the callback together, sadly we aren't using C++ in the kernel. The next best thing to avoid defining a function for each case seems like statement expressions in GNU C. The kernel already uses them heavily, hence they can passed to the macro in the style of a lambda. The statement expression will then be substituted in the for loop bodies. Variables __state and __reg are set to current bpf_func_state and reg for each invocation of the expression inside the passed in verifier state. Then, convert mark_ptr_or_null_regs, clear_all_pkt_pointers, release_reference, find_good_pkt_pointers, find_equal_scalars to use bpf_for_each_reg_in_vstate. Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com> Link: https://lore.kernel.org/r/20220904204145.3089-16-memxor@gmail.com Signed-off-by: Alexei Starovoitov <ast@kernel.org> --- include/linux/bpf_verifier.h | 21 +++++++ kernel/bpf/verifier.c | 135 +++++++++---------------------------------- 2 files changed, 49 insertions(+), 107 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 8fbc1d05281e..b49a349cc6ae 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -348,6 +348,27 @@ struct bpf_verifier_state { iter < frame->allocated_stack / BPF_REG_SIZE; \ iter++, reg = bpf_get_spilled_reg(iter, frame)) +/* Invoke __expr over regsiters in __vst, setting __state and __reg */ +#define bpf_for_each_reg_in_vstate(__vst, __state, __reg, __expr) \ + ({ \ + struct bpf_verifier_state *___vstate = __vst; \ + int ___i, ___j; \ + for (___i = 0; ___i <= ___vstate->curframe; ___i++) { \ + struct bpf_reg_state *___regs; \ + __state = ___vstate->frame[___i]; \ + ___regs = __state->regs; \ + for (___j = 0; ___j < MAX_BPF_REG; ___j++) { \ + __reg = &___regs[___j]; \ + (void)(__expr); \ + } \ + bpf_for_each_spilled_reg(___j, __state, __reg) { \ + if (!__reg) \ + continue; \ + (void)(__expr); \ + } \ + } \ + }) + /* linked list of verifier states used to prune search */ struct bpf_verifier_state_list { struct bpf_verifier_state state; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f3344a86d88d..c0f175ac187a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6513,31 +6513,15 @@ static int check_func_proto(const struct bpf_func_proto *fn, int func_id) /* Packet data might have moved, any old PTR_TO_PACKET[_META,_END] * are now invalid, so turn them into unknown SCALAR_VALUE. */ -static void __clear_all_pkt_pointers(struct bpf_verifier_env *env, - struct bpf_func_state *state) +static void clear_all_pkt_pointers(struct bpf_verifier_env *env) { - struct bpf_reg_state *regs = state->regs, *reg; - int i; - - for (i = 0; i < MAX_BPF_REG; i++) - if (reg_is_pkt_pointer_any(®s[i])) - mark_reg_unknown(env, regs, i); + struct bpf_func_state *state; + struct bpf_reg_state *reg; - bpf_for_each_spilled_reg(i, state, reg) { - if (!reg) - continue; + bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({ if (reg_is_pkt_pointer_any(reg)) __mark_reg_unknown(env, reg); - } -} - -static void clear_all_pkt_pointers(struct bpf_verifier_env *env) -{ - struct bpf_verifier_state *vstate = env->cur_state; - int i; - - for (i = 0; i <= vstate->curframe; i++) - __clear_all_pkt_pointers(env, vstate->frame[i]); + })); } enum { @@ -6566,41 +6550,24 @@ static void mark_pkt_end(struct bpf_verifier_state *vstate, int regn, bool range reg->range = AT_PKT_END; } -static void release_reg_references(struct bpf_verifier_env *env, - struct bpf_func_state *state, - int ref_obj_id) -{ - struct bpf_reg_state *regs = state->regs, *reg; - int i; - - for (i = 0; i < MAX_BPF_REG; i++) - if (regs[i].ref_obj_id == ref_obj_id) - mark_reg_unknown(env, regs, i); - - bpf_for_each_spilled_reg(i, state, reg) { - if (!reg) - continue; - if (reg->ref_obj_id == ref_obj_id) - __mark_reg_unknown(env, reg); - } -} - /* The pointer with the specified id has released its reference to kernel * resources. Identify all copies of the same pointer and clear the reference. */ static int release_reference(struct bpf_verifier_env *env, int ref_obj_id) { - struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state; + struct bpf_reg_state *reg; int err; - int i; err = release_reference_state(cur_func(env), ref_obj_id); if (err) return err; - for (i = 0; i <= vstate->curframe; i++) - release_reg_references(env, vstate->frame[i], ref_obj_id); + bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({ + if (reg->ref_obj_id == ref_obj_id) + __mark_reg_unknown(env, reg); + })); return 0; } @@ -9335,34 +9302,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return 0; } -static void __find_good_pkt_pointers(struct bpf_func_state *state, - struct bpf_reg_state *dst_reg, - enum bpf_reg_type type, int new_range) -{ - struct bpf_reg_state *reg; - int i; - - for (i = 0; i < MAX_BPF_REG; i++) { - reg = &state->regs[i]; - if (reg->type == type && reg->id == dst_reg->id) - /* keep the maximum range already checked */ - reg->range = max(reg->range, new_range); - } - - bpf_for_each_spilled_reg(i, state, reg) { - if (!reg) - continue; - if (reg->type == type && reg->id == dst_reg->id) - reg->range = max(reg->range, new_range); - } -} - static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, struct bpf_reg_state *dst_reg, enum bpf_reg_type type, bool range_right_open) { - int new_range, i; + struct bpf_func_state *state; + struct bpf_reg_state *reg; + int new_range; if (dst_reg->off < 0 || (dst_reg->off == 0 && range_right_open)) @@ -9427,9 +9374,11 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, * the range won't allow anything. * dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16. */ - for (i = 0; i <= vstate->curframe; i++) - __find_good_pkt_pointers(vstate->frame[i], dst_reg, type, - new_range); + bpf_for_each_reg_in_vstate(vstate, state, reg, ({ + if (reg->type == type && reg->id == dst_reg->id) + /* keep the maximum range already checked */ + reg->range = max(reg->range, new_range); + })); } static int is_branch32_taken(struct bpf_reg_state *reg, u32 val, u8 opcode) @@ -9918,7 +9867,7 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, if (!reg_may_point_to_spin_lock(reg)) { /* For not-NULL ptr, reg->ref_obj_id will be reset - * in release_reg_references(). + * in release_reference(). * * reg->id is still used by spin_lock ptr. Other * than spin_lock ptr type, reg->id can be reset. @@ -9928,22 +9877,6 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, } } -static void __mark_ptr_or_null_regs(struct bpf_func_state *state, u32 id, - bool is_null) -{ - struct bpf_reg_state *reg; - int i; - - for (i = 0; i < MAX_BPF_REG; i++) - mark_ptr_or_null_reg(state, &state->regs[i], id, is_null); - - bpf_for_each_spilled_reg(i, state, reg) { - if (!reg) - continue; - mark_ptr_or_null_reg(state, reg, id, is_null); - } -} - /* The logic is similar to find_good_pkt_pointers(), both could eventually * be folded together at some point. */ @@ -9951,10 +9884,9 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno, bool is_null) { struct bpf_func_state *state = vstate->frame[vstate->curframe]; - struct bpf_reg_state *regs = state->regs; + struct bpf_reg_state *regs = state->regs, *reg; u32 ref_obj_id = regs[regno].ref_obj_id; u32 id = regs[regno].id; - int i; if (ref_obj_id && ref_obj_id == id && is_null) /* regs[regno] is in the " == NULL" branch. @@ -9963,8 +9895,9 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno, */ WARN_ON_ONCE(release_reference_state(state, id)); - for (i = 0; i <= vstate->curframe; i++) - __mark_ptr_or_null_regs(vstate->frame[i], id, is_null); + bpf_for_each_reg_in_vstate(vstate, state, reg, ({ + mark_ptr_or_null_reg(state, reg, id, is_null); + })); } static bool try_match_pkt_pointers(const struct bpf_insn *insn, @@ -10077,23 +10010,11 @@ static void find_equal_scalars(struct bpf_verifier_state *vstate, { struct bpf_func_state *state; struct bpf_reg_state *reg; - int i, j; - for (i = 0; i <= vstate->curframe; i++) { - state = vstate->frame[i]; - for (j = 0; j < MAX_BPF_REG; j++) { - reg = &state->regs[j]; - if (reg->type == SCALAR_VALUE && reg->id == known_reg->id) - *reg = *known_reg; - } - - bpf_for_each_spilled_reg(j, state, reg) { - if (!reg) - continue; - if (reg->type == SCALAR_VALUE && reg->id == known_reg->id) - *reg = *known_reg; - } - } + bpf_for_each_reg_in_vstate(vstate, state, reg, ({ + if (reg->type == SCALAR_VALUE && reg->id == known_reg->id) + *reg = *known_reg; + })); } static int check_cond_jmp_op(struct bpf_verifier_env *env, -- cgit v1.2.3 From d4f7bdb2ed7bf320a8772258fdc257655d225afb Mon Sep 17 00:00:00 2001 From: Daniel Xu <dxu@dxuuu.xyz> Date: Wed, 7 Sep 2022 10:40:37 -0600 Subject: bpf: Add stub for btf_struct_access() Add corresponding unimplemented stub for when CONFIG_BPF_SYSCALL=n Signed-off-by: Daniel Xu <dxu@dxuuu.xyz> Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com> Link: https://lore.kernel.org/r/4021398e884433b1fef57a4d28361bb9fcf1bd05.1662568410.git.dxu@dxuuu.xyz Signed-off-by: Alexei Starovoitov <ast@kernel.org> --- include/linux/bpf.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 48ae05099f36..54178b9e9c3a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2211,6 +2211,15 @@ static inline struct bpf_prog *bpf_prog_by_id(u32 id) return ERR_PTR(-ENOTSUPP); } +static inline int btf_struct_access(struct bpf_verifier_log *log, + const struct btf *btf, + const struct btf_type *t, int off, int size, + enum bpf_access_type atype, + u32 *next_btf_id, enum bpf_type_flag *flag) +{ + return -EACCES; +} + static inline const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) { -- cgit v1.2.3 From 1bfe26fb082724be453e4d7fd9bb358e3ba669b2 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky <davemarchevsky@fb.com> Date: Thu, 8 Sep 2022 16:07:16 -0700 Subject: bpf: Add verifier support for custom callback return range Verifier logic to confirm that a callback function returns 0 or 1 was added in commit 69c087ba6225b ("bpf: Add bpf_for_each_map_elem() helper"). At the time, callback return value was only used to continue or stop iteration. In order to support callbacks with a broader return value range, such as those added in rbtree series[0] and others, add a callback_ret_range to bpf_func_state. Verifier's helpers which set in_callback_fn will also set the new field, which the verifier will later use to check return value bounds. Default to tnum_range(0, 0) instead of using tnum_unknown as a sentinel value as the latter would prevent the valid range (0, U64_MAX) being used. Previous global default tnum_range(0, 1) is explicitly set for extant callback helpers. The change to global default was made after discussion around this patch in rbtree series [1], goal here is to make it more obvious that callback_ret_range should be explicitly set. [0]: lore.kernel.org/bpf/20220830172759.4069786-1-davemarchevsky@fb.com/ [1]: lore.kernel.org/bpf/20220830172759.4069786-2-davemarchevsky@fb.com/ Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com> Reviewed-by: Stanislav Fomichev <sdf@google.com> Link: https://lore.kernel.org/r/20220908230716.2751723-1-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov <ast@kernel.org> --- include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c | 7 ++++++- 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index b49a349cc6ae..e197f8fb27e2 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -248,6 +248,7 @@ struct bpf_func_state { */ u32 async_entry_cnt; bool in_callback_fn; + struct tnum callback_ret_range; bool in_async_callback_fn; /* The following fields should be last. See copy_func_state() */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9109e07b759a..c259d734f863 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1750,6 +1750,7 @@ static void init_func_state(struct bpf_verifier_env *env, state->callsite = callsite; state->frameno = frameno; state->subprogno = subprogno; + state->callback_ret_range = tnum_range(0, 0); init_reg_state(env, state); mark_verifier_state_scratched(env); } @@ -6790,6 +6791,7 @@ static int set_map_elem_callback_state(struct bpf_verifier_env *env, return err; callee->in_callback_fn = true; + callee->callback_ret_range = tnum_range(0, 1); return 0; } @@ -6811,6 +6813,7 @@ static int set_loop_callback_state(struct bpf_verifier_env *env, __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); callee->in_callback_fn = true; + callee->callback_ret_range = tnum_range(0, 1); return 0; } @@ -6840,6 +6843,7 @@ static int set_timer_callback_state(struct bpf_verifier_env *env, __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); callee->in_async_callback_fn = true; + callee->callback_ret_range = tnum_range(0, 1); return 0; } @@ -6867,6 +6871,7 @@ static int set_find_vma_callback_state(struct bpf_verifier_env *env, __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); callee->in_callback_fn = true; + callee->callback_ret_range = tnum_range(0, 1); return 0; } @@ -6894,7 +6899,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) caller = state->frame[state->curframe]; if (callee->in_callback_fn) { /* enforce R0 return value range [0, 1]. */ - struct tnum range = tnum_range(0, 1); + struct tnum range = callee->callback_ret_range; if (r0->type != SCALAR_VALUE) { verbose(env, "R0 not a scalar value\n"); -- cgit v1.2.3 From 47e34cb74d376ddfeaef94abb1d6dfb3c905ee51 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky <davemarchevsky@fb.com> Date: Mon, 12 Sep 2022 08:45:44 -0700 Subject: bpf: Add verifier check for BPF_PTR_POISON retval and arg BPF_PTR_POISON was added in commit c0a5a21c25f37 ("bpf: Allow storing referenced kptr in map") to denote a bpf_func_proto btf_id which the verifier will replace with a dynamically-determined btf_id at verification time. This patch adds verifier 'poison' functionality to BPF_PTR_POISON in order to prepare for expanded use of the value to poison ret- and arg-btf_id in ongoing work, namely rbtree and linked list patchsets [0, 1]. Specifically, when the verifier checks helper calls, it assumes that BPF_PTR_POISON'ed ret type will be replaced with a valid type before - or in lieu of - the default ret_btf_id logic. Similarly for arg btf_id. If poisoned btf_id reaches default handling block for either, consider this a verifier internal error and fail verification. Otherwise a helper w/ poisoned btf_id but no verifier logic replacing the type will cause a crash as the invalid pointer is dereferenced. Also move BPF_PTR_POISON to existing include/linux/posion.h header and remove unnecessary shift. [0]: lore.kernel.org/bpf/20220830172759.4069786-1-davemarchevsky@fb.com [1]: lore.kernel.org/bpf/20220904204145.3089-1-memxor@gmail.com Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com> Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com> Link: https://lore.kernel.org/r/20220912154544.1398199-1-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov <ast@kernel.org> --- include/linux/poison.h | 3 +++ kernel/bpf/helpers.c | 6 +++--- kernel/bpf/verifier.c | 30 +++++++++++++++++++++++------- 3 files changed, 29 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/poison.h b/include/linux/poison.h index d62ef5a6b4e9..2d3249eb0e62 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -81,4 +81,7 @@ /********** net/core/page_pool.c **********/ #define PP_SIGNATURE (0x40 + POISON_POINTER_DELTA) +/********** kernel/bpf/ **********/ +#define BPF_PTR_POISON ((void *)(0xeB9FUL + POISON_POINTER_DELTA)) + #endif diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index fc08035f14ed..41aeaf3862ec 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -15,6 +15,7 @@ #include <linux/ctype.h> #include <linux/jiffies.h> #include <linux/pid_namespace.h> +#include <linux/poison.h> #include <linux/proc_ns.h> #include <linux/security.h> #include <linux/btf_ids.h> @@ -1376,10 +1377,9 @@ BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr) } /* Unlike other PTR_TO_BTF_ID helpers the btf_id in bpf_kptr_xchg() - * helper is determined dynamically by the verifier. + * helper is determined dynamically by the verifier. Use BPF_PTR_POISON to + * denote type that verifier will determine. */ -#define BPF_PTR_POISON ((void *)((0xeB9FUL << 2) + POISON_POINTER_DELTA)) - static const struct bpf_func_proto bpf_kptr_xchg_proto = { .func = bpf_kptr_xchg, .gpl_only = false, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c259d734f863..8c6fbcd0afaf 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -23,6 +23,7 @@ #include <linux/error-injection.h> #include <linux/bpf_lsm.h> #include <linux/btf_ids.h> +#include <linux/poison.h> #include "disasm.h" @@ -5782,13 +5783,22 @@ found: if (meta->func_id == BPF_FUNC_kptr_xchg) { if (map_kptr_match_type(env, meta->kptr_off_desc, reg, regno)) return -EACCES; - } else if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off, - btf_vmlinux, *arg_btf_id, - strict_type_match)) { - verbose(env, "R%d is of type %s but %s is expected\n", - regno, kernel_type_name(reg->btf, reg->btf_id), - kernel_type_name(btf_vmlinux, *arg_btf_id)); - return -EACCES; + } else { + if (arg_btf_id == BPF_PTR_POISON) { + verbose(env, "verifier internal error:"); + verbose(env, "R%d has non-overwritten BPF_PTR_POISON type\n", + regno); + return -EACCES; + } + + if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off, + btf_vmlinux, *arg_btf_id, + strict_type_match)) { + verbose(env, "R%d is of type %s but %s is expected\n", + regno, kernel_type_name(reg->btf, reg->btf_id), + kernel_type_name(btf_vmlinux, *arg_btf_id)); + return -EACCES; + } } } @@ -7457,6 +7467,12 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn ret_btf = meta.kptr_off_desc->kptr.btf; ret_btf_id = meta.kptr_off_desc->kptr.btf_id; } else { + if (fn->ret_btf_id == BPF_PTR_POISON) { + verbose(env, "verifier internal error:"); + verbose(env, "func %s has non-overwritten BPF_PTR_POISON return type\n", + func_id_name(func_id)); + return -EINVAL; + } ret_btf = btf_vmlinux; ret_btf_id = *fn->ret_btf_id; } -- cgit v1.2.3 From bfeb7e399bacae4ee46ad978f5fce3e47f0978d6 Mon Sep 17 00:00:00 2001 From: Yauheni Kaliuta <ykaliuta@redhat.com> Date: Mon, 5 Sep 2022 12:01:49 +0300 Subject: bpf: Use bpf_capable() instead of CAP_SYS_ADMIN for blinding decision The full CAP_SYS_ADMIN requirement for blinding looks too strict nowadays. These days given unprivileged BPF is disabled by default, the main users for constant blinding coming from unprivileged in particular via cBPF -> eBPF migration (e.g. old-style socket filters). Signed-off-by: Yauheni Kaliuta <ykaliuta@redhat.com> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Link: https://lore.kernel.org/bpf/20220831090655.156434-1-ykaliuta@redhat.com Link: https://lore.kernel.org/bpf/20220905090149.61221-1-ykaliuta@redhat.com --- Documentation/admin-guide/sysctl/net.rst | 3 +++ include/linux/filter.h | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/Documentation/admin-guide/sysctl/net.rst b/Documentation/admin-guide/sysctl/net.rst index 555681ef6195..6394f5dc2303 100644 --- a/Documentation/admin-guide/sysctl/net.rst +++ b/Documentation/admin-guide/sysctl/net.rst @@ -102,6 +102,9 @@ Values: - 1 - enable JIT hardening for unprivileged users only - 2 - enable JIT hardening for all users +where "privileged user" in this context means a process having +CAP_BPF or CAP_SYS_ADMIN in the root user name space. + bpf_jit_kallsyms ---------------- diff --git a/include/linux/filter.h b/include/linux/filter.h index 527ae1d64e27..75335432fcbc 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1099,7 +1099,7 @@ static inline bool bpf_jit_blinding_enabled(struct bpf_prog *prog) return false; if (!bpf_jit_harden) return false; - if (bpf_jit_harden == 1 && capable(CAP_SYS_ADMIN)) + if (bpf_jit_harden == 1 && bpf_capable()) return false; return true; -- cgit v1.2.3 From ceea991a019c57a1fb0edd12a5f836a0fa431aee Mon Sep 17 00:00:00 2001 From: Jiri Olsa <jolsa@kernel.org> Date: Sat, 3 Sep 2022 15:11:54 +0200 Subject: bpf: Move bpf_dispatcher function out of ftrace locations The dispatcher function is attached/detached to trampoline by dispatcher update function. At the same time it's available as ftrace attachable function. After discussion [1] the proposed solution is to use compiler attributes to alter bpf_dispatcher_##name##_func function: - remove it from being instrumented with __no_instrument_function__ attribute, so ftrace has no track of it - but still generate 5 nop instructions with patchable_function_entry(5) attribute, which are expected by bpf_arch_text_poke used by dispatcher update function Enabling HAVE_DYNAMIC_FTRACE_NO_PATCHABLE option for x86, so __patchable_function_entries functions are not part of ftrace/mcount locations. Adding attributes to bpf_dispatcher_XXX function on x86_64 so it's kept out of ftrace locations and has 5 byte nop generated at entry. These attributes need to be arch specific as pointed out by Ilya Leoshkevic in here [2]. The dispatcher image is generated only for x86_64 arch, so the code can stay as is for other archs. [1] https://lore.kernel.org/bpf/20220722110811.124515-1-jolsa@kernel.org/ [2] https://lore.kernel.org/bpf/969a14281a7791c334d476825863ee449964dd0c.camel@linux.ibm.com/ Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org> Signed-off-by: Jiri Olsa <jolsa@kernel.org> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Link: https://lore.kernel.org/bpf/20220903131154.420467-3-jolsa@kernel.org --- arch/x86/Kconfig | 1 + include/linux/bpf.h | 7 +++++++ 2 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f9920f1341c8..089c20cefd2b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -284,6 +284,7 @@ config X86 select PROC_PID_ARCH_STATUS if PROC_FS select HAVE_ARCH_NODE_DEV_GROUP if X86_SGX imply IMA_SECURE_AND_OR_TRUSTED_BOOT if EFI + select HAVE_DYNAMIC_FTRACE_NO_PATCHABLE config INSTRUCTION_DECODER def_bool y diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 54178b9e9c3a..e0dbe0c0a17e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -977,7 +977,14 @@ int arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs); }, \ } +#ifdef CONFIG_X86_64 +#define BPF_DISPATCHER_ATTRIBUTES __attribute__((patchable_function_entry(5))) +#else +#define BPF_DISPATCHER_ATTRIBUTES +#endif + #define DEFINE_BPF_DISPATCHER(name) \ + notrace BPF_DISPATCHER_ATTRIBUTES \ noinline __nocfi unsigned int bpf_dispatcher_##name##_func( \ const void *ctx, \ const struct bpf_insn *insnsi, \ -- cgit v1.2.3 From fdf214978a71b2749d26f6da2b1d51d9ac23831d Mon Sep 17 00:00:00 2001 From: Daniel Xu <dxu@dxuuu.xyz> Date: Tue, 20 Sep 2022 08:15:24 -0600 Subject: bpf: Move nf_conn extern declarations to filter.h We're seeing the following new warnings on netdev/build_32bit and netdev/build_allmodconfig_warn CI jobs: ../net/core/filter.c:8608:1: warning: symbol 'nf_conn_btf_access_lock' was not declared. Should it be static? ../net/core/filter.c:8611:5: warning: symbol 'nfct_bsa' was not declared. Should it be static? Fix by ensuring extern declaration is present while compiling filter.o. Fixes: 864b656f82cc ("bpf: Add support for writing to nf_conn:mark") Signed-off-by: Daniel Xu <dxu@dxuuu.xyz> Link: https://lore.kernel.org/r/2bd2e0283df36d8a4119605878edb1838d144174.1663683114.git.dxu@dxuuu.xyz Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org> --- include/linux/filter.h | 6 ++++++ include/net/netfilter/nf_conntrack_bpf.h | 7 ------- net/netfilter/nf_conntrack_bpf.c | 1 + 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 75335432fcbc..98e28126c24b 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -567,6 +567,12 @@ struct sk_filter { DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key); +extern struct mutex nf_conn_btf_access_lock; +extern int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, const struct btf *btf, + const struct btf_type *t, int off, int size, + enum bpf_access_type atype, u32 *next_btf_id, + enum bpf_type_flag *flag); + typedef unsigned int (*bpf_dispatcher_fn)(const void *ctx, const struct bpf_insn *insnsi, unsigned int (*bpf_func)(const void *, diff --git a/include/net/netfilter/nf_conntrack_bpf.h b/include/net/netfilter/nf_conntrack_bpf.h index 1199d4f8e019..c8b80add1142 100644 --- a/include/net/netfilter/nf_conntrack_bpf.h +++ b/include/net/netfilter/nf_conntrack_bpf.h @@ -4,7 +4,6 @@ #define _NF_CONNTRACK_BPF_H #include <linux/kconfig.h> -#include <linux/mutex.h> #if (IS_BUILTIN(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \ (IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) @@ -12,12 +11,6 @@ extern int register_nf_conntrack_bpf(void); extern void cleanup_nf_conntrack_bpf(void); -extern struct mutex nf_conn_btf_access_lock; -extern int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, const struct btf *btf, - const struct btf_type *t, int off, int size, - enum bpf_access_type atype, u32 *next_btf_id, - enum bpf_type_flag *flag); - #else static inline int register_nf_conntrack_bpf(void) diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c index 29c4efb3da5e..67df64283aef 100644 --- a/net/netfilter/nf_conntrack_bpf.c +++ b/net/netfilter/nf_conntrack_bpf.c @@ -9,6 +9,7 @@ #include <linux/bpf_verifier.h> #include <linux/bpf.h> #include <linux/btf.h> +#include <linux/filter.h> #include <linux/mutex.h> #include <linux/types.h> #include <linux/btf_ids.h> -- cgit v1.2.3 From 583c1f420173f7d84413a1a1fbf5109d798b4faa Mon Sep 17 00:00:00 2001 From: David Vernet <void@manifault.com> Date: Mon, 19 Sep 2022 19:00:57 -0500 Subject: bpf: Define new BPF_MAP_TYPE_USER_RINGBUF map type We want to support a ringbuf map type where samples are published from user-space, to be consumed by BPF programs. BPF currently supports a kernel -> user-space circular ring buffer via the BPF_MAP_TYPE_RINGBUF map type. We'll need to define a new map type for user-space -> kernel, as none of the helpers exported for BPF_MAP_TYPE_RINGBUF will apply to a user-space producer ring buffer, and we'll want to add one or more helper functions that would not apply for a kernel-producer ring buffer. This patch therefore adds a new BPF_MAP_TYPE_USER_RINGBUF map type definition. The map type is useless in its current form, as there is no way to access or use it for anything until we one or more BPF helpers. A follow-on patch will therefore add a new helper function that allows BPF programs to run callbacks on samples that are published to the ring buffer. Signed-off-by: David Vernet <void@manifault.com> Signed-off-by: Andrii Nakryiko <andrii@kernel.org> Acked-by: Andrii Nakryiko <andrii@kernel.org> Link: https://lore.kernel.org/bpf/20220920000100.477320-2-void@manifault.com --- include/linux/bpf_types.h | 1 + include/uapi/linux/bpf.h | 1 + kernel/bpf/ringbuf.c | 62 ++++++++++++++++++++++--- kernel/bpf/verifier.c | 3 ++ tools/bpf/bpftool/Documentation/bpftool-map.rst | 2 +- tools/bpf/bpftool/map.c | 2 +- tools/include/uapi/linux/bpf.h | 1 + tools/lib/bpf/libbpf.c | 1 + 8 files changed, 65 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 2b9112b80171..2c6a4f2562a7 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -126,6 +126,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_STRUCT_OPS, bpf_struct_ops_map_ops) #endif BPF_MAP_TYPE(BPF_MAP_TYPE_RINGBUF, ringbuf_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_BLOOM_FILTER, bloom_filter_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_USER_RINGBUF, user_ringbuf_map_ops) BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint) BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3df78c56c1bf..e18c85324db6 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -928,6 +928,7 @@ enum bpf_map_type { BPF_MAP_TYPE_INODE_STORAGE, BPF_MAP_TYPE_TASK_STORAGE, BPF_MAP_TYPE_BLOOM_FILTER, + BPF_MAP_TYPE_USER_RINGBUF, }; /* Note that tracing related programs such as diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index b483aea35f41..754e915748fb 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -38,10 +38,27 @@ struct bpf_ringbuf { struct page **pages; int nr_pages; spinlock_t spinlock ____cacheline_aligned_in_smp; - /* Consumer and producer counters are put into separate pages to allow - * mapping consumer page as r/w, but restrict producer page to r/o. - * This protects producer position from being modified by user-space - * application and ruining in-kernel position tracking. + /* Consumer and producer counters are put into separate pages to + * allow each position to be mapped with different permissions. + * This prevents a user-space application from modifying the + * position and ruining in-kernel tracking. The permissions of the + * pages depend on who is producing samples: user-space or the + * kernel. + * + * Kernel-producer + * --------------- + * The producer position and data pages are mapped as r/o in + * userspace. For this approach, bits in the header of samples are + * used to signal to user-space, and to other producers, whether a + * sample is currently being written. + * + * User-space producer + * ------------------- + * Only the page containing the consumer position is mapped r/o in + * user-space. User-space producers also use bits of the header to + * communicate to the kernel, but the kernel must carefully check and + * validate each sample to ensure that they're correctly formatted, and + * fully contained within the ring buffer. */ unsigned long consumer_pos __aligned(PAGE_SIZE); unsigned long producer_pos __aligned(PAGE_SIZE); @@ -224,7 +241,7 @@ static int ringbuf_map_get_next_key(struct bpf_map *map, void *key, return -ENOTSUPP; } -static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) +static int ringbuf_map_mmap_kern(struct bpf_map *map, struct vm_area_struct *vma) { struct bpf_ringbuf_map *rb_map; @@ -242,6 +259,26 @@ static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) vma->vm_pgoff + RINGBUF_PGOFF); } +static int ringbuf_map_mmap_user(struct bpf_map *map, struct vm_area_struct *vma) +{ + struct bpf_ringbuf_map *rb_map; + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + + if (vma->vm_flags & VM_WRITE) { + if (vma->vm_pgoff == 0) + /* Disallow writable mappings to the consumer pointer, + * and allow writable mappings to both the producer + * position, and the ring buffer data itself. + */ + return -EPERM; + } else { + vma->vm_flags &= ~VM_MAYWRITE; + } + /* remap_vmalloc_range() checks size and offset constraints */ + return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF); +} + static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb) { unsigned long cons_pos, prod_pos; @@ -269,7 +306,7 @@ const struct bpf_map_ops ringbuf_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = ringbuf_map_alloc, .map_free = ringbuf_map_free, - .map_mmap = ringbuf_map_mmap, + .map_mmap = ringbuf_map_mmap_kern, .map_poll = ringbuf_map_poll, .map_lookup_elem = ringbuf_map_lookup_elem, .map_update_elem = ringbuf_map_update_elem, @@ -278,6 +315,19 @@ const struct bpf_map_ops ringbuf_map_ops = { .map_btf_id = &ringbuf_map_btf_ids[0], }; +BTF_ID_LIST_SINGLE(user_ringbuf_map_btf_ids, struct, bpf_ringbuf_map) +const struct bpf_map_ops user_ringbuf_map_ops = { + .map_meta_equal = bpf_map_meta_equal, + .map_alloc = ringbuf_map_alloc, + .map_free = ringbuf_map_free, + .map_mmap = ringbuf_map_mmap_user, + .map_lookup_elem = ringbuf_map_lookup_elem, + .map_update_elem = ringbuf_map_update_elem, + .map_delete_elem = ringbuf_map_delete_elem, + .map_get_next_key = ringbuf_map_get_next_key, + .map_btf_id = &user_ringbuf_map_btf_ids[0], +}; + /* Given pointer to ring buffer record metadata and struct bpf_ringbuf itself, * calculate offset from record metadata to ring buffer in pages, rounded * down. This page offset is stored as part of record metadata and allows to diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8c6fbcd0afaf..83710b60e708 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6240,6 +6240,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_ringbuf_discard_dynptr) goto error; break; + case BPF_MAP_TYPE_USER_RINGBUF: + goto error; case BPF_MAP_TYPE_STACK_TRACE: if (func_id != BPF_FUNC_get_stackid) goto error; @@ -12635,6 +12637,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, case BPF_MAP_TYPE_ARRAY_OF_MAPS: case BPF_MAP_TYPE_HASH_OF_MAPS: case BPF_MAP_TYPE_RINGBUF: + case BPF_MAP_TYPE_USER_RINGBUF: case BPF_MAP_TYPE_INODE_STORAGE: case BPF_MAP_TYPE_SK_STORAGE: case BPF_MAP_TYPE_TASK_STORAGE: diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst index 7c188a598444..7f3b67a8b48f 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-map.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst @@ -55,7 +55,7 @@ MAP COMMANDS | | **devmap** | **devmap_hash** | **sockmap** | **cpumap** | **xskmap** | **sockhash** | | **cgroup_storage** | **reuseport_sockarray** | **percpu_cgroup_storage** | | **queue** | **stack** | **sk_storage** | **struct_ops** | **ringbuf** | **inode_storage** -| | **task_storage** | **bloom_filter** } +| | **task_storage** | **bloom_filter** | **user_ringbuf** } DESCRIPTION =========== diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index 38b6bc9c26c3..9a6ca9f31133 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -1459,7 +1459,7 @@ static int do_help(int argc, char **argv) " devmap | devmap_hash | sockmap | cpumap | xskmap | sockhash |\n" " cgroup_storage | reuseport_sockarray | percpu_cgroup_storage |\n" " queue | stack | sk_storage | struct_ops | ringbuf | inode_storage |\n" - " task_storage | bloom_filter }\n" + " task_storage | bloom_filter | user_ringbuf }\n" " " HELP_SPEC_OPTIONS " |\n" " {-f|--bpffs} | {-n|--nomount} }\n" "", diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 3df78c56c1bf..e18c85324db6 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -928,6 +928,7 @@ enum bpf_map_type { BPF_MAP_TYPE_INODE_STORAGE, BPF_MAP_TYPE_TASK_STORAGE, BPF_MAP_TYPE_BLOOM_FILTER, + BPF_MAP_TYPE_USER_RINGBUF, }; /* Note that tracing related programs such as diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 2ca30ccc774c..d480da05b6de 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -163,6 +163,7 @@ static const char * const map_type_name[] = { [BPF_MAP_TYPE_INODE_STORAGE] = "inode_storage", [BPF_MAP_TYPE_TASK_STORAGE] = "task_storage", [BPF_MAP_TYPE_BLOOM_FILTER] = "bloom_filter", + [BPF_MAP_TYPE_USER_RINGBUF] = "user_ringbuf", }; static const char * const prog_type_name[] = { -- cgit v1.2.3 From 20571567384428dfc9fe5cf9f2e942e1df13c2dd Mon Sep 17 00:00:00 2001 From: David Vernet <void@manifault.com> Date: Mon, 19 Sep 2022 19:00:58 -0500 Subject: bpf: Add bpf_user_ringbuf_drain() helper In a prior change, we added a new BPF_MAP_TYPE_USER_RINGBUF map type which will allow user-space applications to publish messages to a ring buffer that is consumed by a BPF program in kernel-space. In order for this map-type to be useful, it will require a BPF helper function that BPF programs can invoke to drain samples from the ring buffer, and invoke callbacks on those samples. This change adds that capability via a new BPF helper function: bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn, void *ctx, u64 flags) BPF programs may invoke this function to run callback_fn() on a series of samples in the ring buffer. callback_fn() has the following signature: long callback_fn(struct bpf_dynptr *dynptr, void *context); Samples are provided to the callback in the form of struct bpf_dynptr *'s, which the program can read using BPF helper functions for querying struct bpf_dynptr's. In order to support bpf_ringbuf_drain(), a new PTR_TO_DYNPTR register type is added to the verifier to reflect a dynptr that was allocated by a helper function and passed to a BPF program. Unlike PTR_TO_STACK dynptrs which are allocated on the stack by a BPF program, PTR_TO_DYNPTR dynptrs need not use reference tracking, as the BPF helper is trusted to properly free the dynptr before returning. The verifier currently only supports PTR_TO_DYNPTR registers that are also DYNPTR_TYPE_LOCAL. Note that while the corresponding user-space libbpf logic will be added in a subsequent patch, this patch does contain an implementation of the .map_poll() callback for BPF_MAP_TYPE_USER_RINGBUF maps. This .map_poll() callback guarantees that an epoll-waiting user-space producer will receive at least one event notification whenever at least one sample is drained in an invocation of bpf_user_ringbuf_drain(), provided that the function is not invoked with the BPF_RB_NO_WAKEUP flag. If the BPF_RB_FORCE_WAKEUP flag is provided, a wakeup notification is sent even if no sample was drained. Signed-off-by: David Vernet <void@manifault.com> Signed-off-by: Andrii Nakryiko <andrii@kernel.org> Link: https://lore.kernel.org/bpf/20220920000100.477320-3-void@manifault.com --- include/linux/bpf.h | 11 ++- include/uapi/linux/bpf.h | 38 +++++++++ kernel/bpf/helpers.c | 2 + kernel/bpf/ringbuf.c | 181 +++++++++++++++++++++++++++++++++++++++-- kernel/bpf/verifier.c | 61 +++++++++++++- tools/include/uapi/linux/bpf.h | 38 +++++++++ 6 files changed, 320 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e0dbe0c0a17e..33e543b86e1a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -451,7 +451,7 @@ enum bpf_type_flag { /* DYNPTR points to memory local to the bpf program. */ DYNPTR_TYPE_LOCAL = BIT(8 + BPF_BASE_TYPE_BITS), - /* DYNPTR points to a ringbuf record. */ + /* DYNPTR points to a kernel-produced ringbuf record. */ DYNPTR_TYPE_RINGBUF = BIT(9 + BPF_BASE_TYPE_BITS), /* Size is known at compile time. */ @@ -656,6 +656,7 @@ enum bpf_reg_type { PTR_TO_MEM, /* reg points to valid memory region */ PTR_TO_BUF, /* reg points to a read/write buffer */ PTR_TO_FUNC, /* reg points to a bpf program function */ + PTR_TO_DYNPTR, /* reg points to a dynptr */ __BPF_REG_TYPE_MAX, /* Extended reg_types. */ @@ -1394,6 +1395,11 @@ struct bpf_array { #define BPF_MAP_CAN_READ BIT(0) #define BPF_MAP_CAN_WRITE BIT(1) +/* Maximum number of user-producer ring buffer samples that can be drained in + * a call to bpf_user_ringbuf_drain(). + */ +#define BPF_MAX_USER_RINGBUF_SAMPLES (128 * 1024) + static inline u32 bpf_map_flags_to_cap(struct bpf_map *map) { u32 access_flags = map->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG); @@ -2495,6 +2501,7 @@ extern const struct bpf_func_proto bpf_loop_proto; extern const struct bpf_func_proto bpf_copy_from_user_task_proto; extern const struct bpf_func_proto bpf_set_retval_proto; extern const struct bpf_func_proto bpf_get_retval_proto; +extern const struct bpf_func_proto bpf_user_ringbuf_drain_proto; const struct bpf_func_proto *tracing_prog_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); @@ -2639,7 +2646,7 @@ enum bpf_dynptr_type { BPF_DYNPTR_TYPE_INVALID, /* Points to memory that is local to the bpf program */ BPF_DYNPTR_TYPE_LOCAL, - /* Underlying data is a ringbuf record */ + /* Underlying data is a kernel-produced ringbuf record */ BPF_DYNPTR_TYPE_RINGBUF, }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e18c85324db6..ead35f39f185 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5388,6 +5388,43 @@ union bpf_attr { * Return * Current *ktime*. * + * long bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn, void *ctx, u64 flags) + * Description + * Drain samples from the specified user ring buffer, and invoke + * the provided callback for each such sample: + * + * long (\*callback_fn)(struct bpf_dynptr \*dynptr, void \*ctx); + * + * If **callback_fn** returns 0, the helper will continue to try + * and drain the next sample, up to a maximum of + * BPF_MAX_USER_RINGBUF_SAMPLES samples. If the return value is 1, + * the helper will skip the rest of the samples and return. Other + * return values are not used now, and will be rejected by the + * verifier. + * Return + * The number of drained samples if no error was encountered while + * draining samples, or 0 if no samples were present in the ring + * buffer. If a user-space producer was epoll-waiting on this map, + * and at least one sample was drained, they will receive an event + * notification notifying them of available space in the ring + * buffer. If the BPF_RB_NO_WAKEUP flag is passed to this + * function, no wakeup notification will be sent. If the + * BPF_RB_FORCE_WAKEUP flag is passed, a wakeup notification will + * be sent even if no sample was drained. + * + * On failure, the returned value is one of the following: + * + * **-EBUSY** if the ring buffer is contended, and another calling + * context was concurrently draining the ring buffer. + * + * **-EINVAL** if user-space is not properly tracking the ring + * buffer due to the producer position not being aligned to 8 + * bytes, a sample not being aligned to 8 bytes, or the producer + * position not matching the advertised length of a sample. + * + * **-E2BIG** if user-space has tried to publish a sample which is + * larger than the size of the ring buffer, or which cannot fit + * within a struct bpf_dynptr. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5599,6 +5636,7 @@ union bpf_attr { FN(tcp_raw_check_syncookie_ipv4), \ FN(tcp_raw_check_syncookie_ipv6), \ FN(ktime_get_tai_ns), \ + FN(user_ringbuf_drain), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 41aeaf3862ec..cb5564c77482 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1659,6 +1659,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_for_each_map_elem_proto; case BPF_FUNC_loop: return &bpf_loop_proto; + case BPF_FUNC_user_ringbuf_drain: + return &bpf_user_ringbuf_drain_proto; default: break; } diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 754e915748fb..9e832acf4692 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -38,6 +38,22 @@ struct bpf_ringbuf { struct page **pages; int nr_pages; spinlock_t spinlock ____cacheline_aligned_in_smp; + /* For user-space producer ring buffers, an atomic_t busy bit is used + * to synchronize access to the ring buffers in the kernel, rather than + * the spinlock that is used for kernel-producer ring buffers. This is + * done because the ring buffer must hold a lock across a BPF program's + * callback: + * + * __bpf_user_ringbuf_peek() // lock acquired + * -> program callback_fn() + * -> __bpf_user_ringbuf_sample_release() // lock released + * + * It is unsafe and incorrect to hold an IRQ spinlock across what could + * be a long execution window, so we instead simply disallow concurrent + * access to the ring buffer by kernel consumers, and return -EBUSY from + * __bpf_user_ringbuf_peek() if the busy bit is held by another task. + */ + atomic_t busy ____cacheline_aligned_in_smp; /* Consumer and producer counters are put into separate pages to * allow each position to be mapped with different permissions. * This prevents a user-space application from modifying the @@ -153,6 +169,7 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) return NULL; spin_lock_init(&rb->spinlock); + atomic_set(&rb->busy, 0); init_waitqueue_head(&rb->waitq); init_irq_work(&rb->work, bpf_ringbuf_notify); @@ -288,8 +305,13 @@ static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb) return prod_pos - cons_pos; } -static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp, - struct poll_table_struct *pts) +static u32 ringbuf_total_data_sz(const struct bpf_ringbuf *rb) +{ + return rb->mask + 1; +} + +static __poll_t ringbuf_map_poll_kern(struct bpf_map *map, struct file *filp, + struct poll_table_struct *pts) { struct bpf_ringbuf_map *rb_map; @@ -301,13 +323,26 @@ static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp, return 0; } +static __poll_t ringbuf_map_poll_user(struct bpf_map *map, struct file *filp, + struct poll_table_struct *pts) +{ + struct bpf_ringbuf_map *rb_map; + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + poll_wait(filp, &rb_map->rb->waitq, pts); + + if (ringbuf_avail_data_sz(rb_map->rb) < ringbuf_total_data_sz(rb_map->rb)) + return EPOLLOUT | EPOLLWRNORM; + return 0; +} + BTF_ID_LIST_SINGLE(ringbuf_map_btf_ids, struct, bpf_ringbuf_map) const struct bpf_map_ops ringbuf_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = ringbuf_map_alloc, .map_free = ringbuf_map_free, .map_mmap = ringbuf_map_mmap_kern, - .map_poll = ringbuf_map_poll, + .map_poll = ringbuf_map_poll_kern, .map_lookup_elem = ringbuf_map_lookup_elem, .map_update_elem = ringbuf_map_update_elem, .map_delete_elem = ringbuf_map_delete_elem, @@ -321,6 +356,7 @@ const struct bpf_map_ops user_ringbuf_map_ops = { .map_alloc = ringbuf_map_alloc, .map_free = ringbuf_map_free, .map_mmap = ringbuf_map_mmap_user, + .map_poll = ringbuf_map_poll_user, .map_lookup_elem = ringbuf_map_lookup_elem, .map_update_elem = ringbuf_map_update_elem, .map_delete_elem = ringbuf_map_delete_elem, @@ -362,7 +398,7 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) return NULL; len = round_up(size + BPF_RINGBUF_HDR_SZ, 8); - if (len > rb->mask + 1) + if (len > ringbuf_total_data_sz(rb)) return NULL; cons_pos = smp_load_acquire(&rb->consumer_pos); @@ -509,7 +545,7 @@ BPF_CALL_2(bpf_ringbuf_query, struct bpf_map *, map, u64, flags) case BPF_RB_AVAIL_DATA: return ringbuf_avail_data_sz(rb); case BPF_RB_RING_SIZE: - return rb->mask + 1; + return ringbuf_total_data_sz(rb); case BPF_RB_CONS_POS: return smp_load_acquire(&rb->consumer_pos); case BPF_RB_PROD_POS: @@ -603,3 +639,138 @@ const struct bpf_func_proto bpf_ringbuf_discard_dynptr_proto = { .arg1_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | OBJ_RELEASE, .arg2_type = ARG_ANYTHING, }; + +static int __bpf_user_ringbuf_peek(struct bpf_ringbuf *rb, void **sample, u32 *size) +{ + int err; + u32 hdr_len, sample_len, total_len, flags, *hdr; + u64 cons_pos, prod_pos; + + /* Synchronizes with smp_store_release() in user-space producer. */ + prod_pos = smp_load_acquire(&rb->producer_pos); + if (prod_pos % 8) + return -EINVAL; + + /* Synchronizes with smp_store_release() in __bpf_user_ringbuf_sample_release() */ + cons_pos = smp_load_acquire(&rb->consumer_pos); + if (cons_pos >= prod_pos) + return -ENODATA; + + hdr = (u32 *)((uintptr_t)rb->data + (uintptr_t)(cons_pos & rb->mask)); + /* Synchronizes with smp_store_release() in user-space producer. */ + hdr_len = smp_load_acquire(hdr); + flags = hdr_len & (BPF_RINGBUF_BUSY_BIT | BPF_RINGBUF_DISCARD_BIT); + sample_len = hdr_len & ~flags; + total_len = round_up(sample_len + BPF_RINGBUF_HDR_SZ, 8); + + /* The sample must fit within the region advertised by the producer position. */ + if (total_len > prod_pos - cons_pos) + return -EINVAL; + + /* The sample must fit within the data region of the ring buffer. */ + if (total_len > ringbuf_total_data_sz(rb)) + return -E2BIG; + + /* The sample must fit into a struct bpf_dynptr. */ + err = bpf_dynptr_check_size(sample_len); + if (err) + return -E2BIG; + + if (flags & BPF_RINGBUF_DISCARD_BIT) { + /* If the discard bit is set, the sample should be skipped. + * + * Update the consumer pos, and return -EAGAIN so the caller + * knows to skip this sample and try to read the next one. + */ + smp_store_release(&rb->consumer_pos, cons_pos + total_len); + return -EAGAIN; + } + + if (flags & BPF_RINGBUF_BUSY_BIT) + return -ENODATA; + + *sample = (void *)((uintptr_t)rb->data + + (uintptr_t)((cons_pos + BPF_RINGBUF_HDR_SZ) & rb->mask)); + *size = sample_len; + return 0; +} + +static void __bpf_user_ringbuf_sample_release(struct bpf_ringbuf *rb, size_t size, u64 flags) +{ + u64 consumer_pos; + u32 rounded_size = round_up(size + BPF_RINGBUF_HDR_SZ, 8); + + /* Using smp_load_acquire() is unnecessary here, as the busy-bit + * prevents another task from writing to consumer_pos after it was read + * by this task with smp_load_acquire() in __bpf_user_ringbuf_peek(). + */ + consumer_pos = rb->consumer_pos; + /* Synchronizes with smp_load_acquire() in user-space producer. */ + smp_store_release(&rb->consumer_pos, consumer_pos + rounded_size); +} + +BPF_CALL_4(bpf_user_ringbuf_drain, struct bpf_map *, map, + void *, callback_fn, void *, callback_ctx, u64, flags) +{ + struct bpf_ringbuf *rb; + long samples, discarded_samples = 0, ret = 0; + bpf_callback_t callback = (bpf_callback_t)callback_fn; + u64 wakeup_flags = BPF_RB_NO_WAKEUP | BPF_RB_FORCE_WAKEUP; + int busy = 0; + + if (unlikely(flags & ~wakeup_flags)) + return -EINVAL; + + rb = container_of(map, struct bpf_ringbuf_map, map)->rb; + + /* If another consumer is already consuming a sample, wait for them to finish. */ + if (!atomic_try_cmpxchg(&rb->busy, &busy, 1)) + return -EBUSY; + + for (samples = 0; samples < BPF_MAX_USER_RINGBUF_SAMPLES && ret == 0; samples++) { + int err; + u32 size; + void *sample; + struct bpf_dynptr_kern dynptr; + + err = __bpf_user_ringbuf_peek(rb, &sample, &size); + if (err) { + if (err == -ENODATA) { + break; + } else if (err == -EAGAIN) { + discarded_samples++; + continue; + } else { + ret = err; + goto schedule_work_return; + } + } + + bpf_dynptr_init(&dynptr, sample, BPF_DYNPTR_TYPE_LOCAL, 0, size); + ret = callback((uintptr_t)&dynptr, (uintptr_t)callback_ctx, 0, 0, 0); + __bpf_user_ringbuf_sample_release(rb, size, flags); + } + ret = samples - discarded_samples; + +schedule_work_return: + /* Prevent the clearing of the busy-bit from being reordered before the + * storing of any rb consumer or producer positions. + */ + smp_mb__before_atomic(); + atomic_set(&rb->busy, 0); + + if (flags & BPF_RB_FORCE_WAKEUP) + irq_work_queue(&rb->work); + else if (!(flags & BPF_RB_NO_WAKEUP) && samples > 0) + irq_work_queue(&rb->work); + return ret; +} + +const struct bpf_func_proto bpf_user_ringbuf_drain_proto = { + .func = bpf_user_ringbuf_drain, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_FUNC, + .arg3_type = ARG_PTR_TO_STACK_OR_NULL, + .arg4_type = ARG_ANYTHING, +}; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 83710b60e708..c76fa45a5906 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -563,6 +563,7 @@ static const char *reg_type_str(struct bpf_verifier_env *env, [PTR_TO_BUF] = "buf", [PTR_TO_FUNC] = "func", [PTR_TO_MAP_KEY] = "map_key", + [PTR_TO_DYNPTR] = "dynptr_ptr", }; if (type & PTR_MAYBE_NULL) { @@ -5688,6 +5689,12 @@ static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } }; static const struct bpf_reg_types timer_types = { .types = { PTR_TO_MAP_VALUE } }; static const struct bpf_reg_types kptr_types = { .types = { PTR_TO_MAP_VALUE } }; +static const struct bpf_reg_types dynptr_types = { + .types = { + PTR_TO_STACK, + PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL, + } +}; static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_MAP_KEY] = &map_key_value_types, @@ -5714,7 +5721,7 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_CONST_STR] = &const_str_ptr_types, [ARG_PTR_TO_TIMER] = &timer_types, [ARG_PTR_TO_KPTR] = &kptr_types, - [ARG_PTR_TO_DYNPTR] = &stack_ptr_types, + [ARG_PTR_TO_DYNPTR] = &dynptr_types, }; static int check_reg_type(struct bpf_verifier_env *env, u32 regno, @@ -6066,6 +6073,13 @@ skip_type_check: err = check_mem_size_reg(env, reg, regno, true, meta); break; case ARG_PTR_TO_DYNPTR: + /* We only need to check for initialized / uninitialized helper + * dynptr args if the dynptr is not PTR_TO_DYNPTR, as the + * assumption is that if it is, that a helper function + * initialized the dynptr on behalf of the BPF program. + */ + if (base_type(reg->type) == PTR_TO_DYNPTR) + break; if (arg_type & MEM_UNINIT) { if (!is_dynptr_reg_valid_uninit(env, reg)) { verbose(env, "Dynptr has to be an uninitialized dynptr\n"); @@ -6241,7 +6255,9 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, goto error; break; case BPF_MAP_TYPE_USER_RINGBUF: - goto error; + if (func_id != BPF_FUNC_user_ringbuf_drain) + goto error; + break; case BPF_MAP_TYPE_STACK_TRACE: if (func_id != BPF_FUNC_get_stackid) goto error; @@ -6361,6 +6377,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (map->map_type != BPF_MAP_TYPE_RINGBUF) goto error; break; + case BPF_FUNC_user_ringbuf_drain: + if (map->map_type != BPF_MAP_TYPE_USER_RINGBUF) + goto error; + break; case BPF_FUNC_get_stackid: if (map->map_type != BPF_MAP_TYPE_STACK_TRACE) goto error; @@ -6887,6 +6907,29 @@ static int set_find_vma_callback_state(struct bpf_verifier_env *env, return 0; } +static int set_user_ringbuf_callback_state(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee, + int insn_idx) +{ + /* bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn, void + * callback_ctx, u64 flags); + * callback_fn(struct bpf_dynptr_t* dynptr, void *callback_ctx); + */ + __mark_reg_not_init(env, &callee->regs[BPF_REG_0]); + callee->regs[BPF_REG_1].type = PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL; + __mark_reg_known_zero(&callee->regs[BPF_REG_1]); + callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3]; + + /* unused */ + __mark_reg_not_init(env, &callee->regs[BPF_REG_3]); + __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); + __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); + + callee->in_callback_fn = true; + return 0; +} + static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) { struct bpf_verifier_state *state = env->cur_state; @@ -7346,12 +7389,18 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn case BPF_FUNC_dynptr_data: for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) { if (arg_type_is_dynptr(fn->arg_type[i])) { + struct bpf_reg_state *reg = ®s[BPF_REG_1 + i]; + if (meta.ref_obj_id) { verbose(env, "verifier internal error: meta.ref_obj_id already set\n"); return -EFAULT; } - /* Find the id of the dynptr we're tracking the reference of */ - meta.ref_obj_id = stack_slot_get_id(env, ®s[BPF_REG_1 + i]); + + if (base_type(reg->type) != PTR_TO_DYNPTR) + /* Find the id of the dynptr we're + * tracking the reference of + */ + meta.ref_obj_id = stack_slot_get_id(env, reg); break; } } @@ -7360,6 +7409,10 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return -EFAULT; } break; + case BPF_FUNC_user_ringbuf_drain: + err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, + set_user_ringbuf_callback_state); + break; } if (err) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index e18c85324db6..ead35f39f185 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5388,6 +5388,43 @@ union bpf_attr { * Return * Current *ktime*. * + * long bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn, void *ctx, u64 flags) + * Description + * Drain samples from the specified user ring buffer, and invoke + * the provided callback for each such sample: + * + * long (\*callback_fn)(struct bpf_dynptr \*dynptr, void \*ctx); + * + * If **callback_fn** returns 0, the helper will continue to try + * and drain the next sample, up to a maximum of + * BPF_MAX_USER_RINGBUF_SAMPLES samples. If the return value is 1, + * the helper will skip the rest of the samples and return. Other + * return values are not used now, and will be rejected by the + * verifier. + * Return + * The number of drained samples if no error was encountered while + * draining samples, or 0 if no samples were present in the ring + * buffer. If a user-space producer was epoll-waiting on this map, + * and at least one sample was drained, they will receive an event + * notification notifying them of available space in the ring + * buffer. If the BPF_RB_NO_WAKEUP flag is passed to this + * function, no wakeup notification will be sent. If the + * BPF_RB_FORCE_WAKEUP flag is passed, a wakeup notification will + * be sent even if no sample was drained. + * + * On failure, the returned value is one of the following: + * + * **-EBUSY** if the ring buffer is contended, and another calling + * context was concurrently draining the ring buffer. + * + * **-EINVAL** if user-space is not properly tracking the ring + * buffer due to the producer position not being aligned to 8 + * bytes, a sample not being aligned to 8 bytes, or the producer + * position not matching the advertised length of a sample. + * + * **-E2BIG** if user-space has tried to publish a sample which is + * larger than the size of the ring buffer, or which cannot fit + * within a struct bpf_dynptr. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5599,6 +5636,7 @@ union bpf_attr { FN(tcp_raw_check_syncookie_ipv4), \ FN(tcp_raw_check_syncookie_ipv6), \ FN(ktime_get_tai_ns), \ + FN(user_ringbuf_drain), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From b8d31762a0ae6861e1115302ee338560d853e317 Mon Sep 17 00:00:00 2001 From: Roberto Sassu <roberto.sassu@huawei.com> Date: Tue, 20 Sep 2022 09:59:42 +0200 Subject: btf: Allow dynamic pointer parameters in kfuncs Allow dynamic pointers (struct bpf_dynptr_kern *) to be specified as parameters in kfuncs. Also, ensure that dynamic pointers passed as argument are valid and initialized, are a pointer to the stack, and of the type local. More dynamic pointer types can be supported in the future. To properly detect whether a parameter is of the desired type, introduce the stringify_struct() macro to compare the returned structure name with the desired name. In addition, protect against structure renames, by halting the build with BUILD_BUG_ON(), so that developers have to revisit the code. To check if a dynamic pointer passed to the kfunc is valid and initialized, and if its type is local, export the existing functions is_dynptr_reg_valid_init() and is_dynptr_type_expected(). Cc: Joanne Koong <joannelkoong@gmail.com> Cc: Kumar Kartikeya Dwivedi <memxor@gmail.com> Signed-off-by: Roberto Sassu <roberto.sassu@huawei.com> Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com> Link: https://lore.kernel.org/r/20220920075951.929132-5-roberto.sassu@huaweicloud.com Signed-off-by: Alexei Starovoitov <ast@kernel.org> --- include/linux/bpf_verifier.h | 5 +++++ include/linux/btf.h | 9 +++++++++ kernel/bpf/btf.c | 33 +++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 10 +++++----- 4 files changed, 52 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index e197f8fb27e2..9e1e6965f407 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -593,6 +593,11 @@ int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state u32 regno); int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, u32 mem_size); +bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, + struct bpf_reg_state *reg); +bool is_dynptr_type_expected(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, + enum bpf_arg_type arg_type); /* this lives here instead of in bpf.h because it needs to dereference tgt_prog */ static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog, diff --git a/include/linux/btf.h b/include/linux/btf.h index 1fcc833a8690..f9aababc5d78 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -52,6 +52,15 @@ #define KF_SLEEPABLE (1 << 5) /* kfunc may sleep */ #define KF_DESTRUCTIVE (1 << 6) /* kfunc performs destructive actions */ +/* + * Return the name of the passed struct, if exists, or halt the build if for + * example the structure gets renamed. In this way, developers have to revisit + * the code using that structure name, and update it accordingly. + */ +#define stringify_struct(x) \ + ({ BUILD_BUG_ON(sizeof(struct x) < 0); \ + __stringify(x); }) + struct btf; struct btf_member; struct btf_type; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index dbcf020c4124..13faede0f2b4 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6449,15 +6449,20 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, if (is_kfunc) { bool arg_mem_size = i + 1 < nargs && is_kfunc_arg_mem_size(btf, &args[i + 1], ®s[regno + 1]); + bool arg_dynptr = btf_type_is_struct(ref_t) && + !strcmp(ref_tname, + stringify_struct(bpf_dynptr_kern)); /* Permit pointer to mem, but only when argument * type is pointer to scalar, or struct composed * (recursively) of scalars. * When arg_mem_size is true, the pointer can be * void *. + * Also permit initialized local dynamic pointers. */ if (!btf_type_is_scalar(ref_t) && !__btf_type_is_scalar_struct(log, btf, ref_t, 0) && + !arg_dynptr && (arg_mem_size ? !btf_type_is_void(ref_t) : 1)) { bpf_log(log, "arg#%d pointer type %s %s must point to %sscalar, or struct with scalar\n", @@ -6465,6 +6470,34 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, return -EINVAL; } + if (arg_dynptr) { + if (reg->type != PTR_TO_STACK) { + bpf_log(log, "arg#%d pointer type %s %s not to stack\n", + i, btf_type_str(ref_t), + ref_tname); + return -EINVAL; + } + + if (!is_dynptr_reg_valid_init(env, reg)) { + bpf_log(log, + "arg#%d pointer type %s %s must be valid and initialized\n", + i, btf_type_str(ref_t), + ref_tname); + return -EINVAL; + } + + if (!is_dynptr_type_expected(env, reg, + ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL)) { + bpf_log(log, + "arg#%d pointer type %s %s points to unsupported dynamic pointer type\n", + i, btf_type_str(ref_t), + ref_tname); + return -EINVAL; + } + + continue; + } + /* Check for mem, len pair */ if (arg_mem_size) { if (check_kfunc_mem_size_reg(env, ®s[regno + 1], regno + 1)) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c08dde19eb67..6f6d2d511c06 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -782,8 +782,8 @@ static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_ return true; } -static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, - struct bpf_reg_state *reg) +bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, + struct bpf_reg_state *reg) { struct bpf_func_state *state = func(env, reg); int spi = get_spi(reg->off); @@ -802,9 +802,9 @@ static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, return true; } -static bool is_dynptr_type_expected(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, - enum bpf_arg_type arg_type) +bool is_dynptr_type_expected(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, + enum bpf_arg_type arg_type) { struct bpf_func_state *state = func(env, reg); enum bpf_dynptr_type dynptr_type; -- cgit v1.2.3 From 51df4865718540f51bb5d3e552c50dc88e1333d6 Mon Sep 17 00:00:00 2001 From: Roberto Sassu <roberto.sassu@huawei.com> Date: Tue, 20 Sep 2022 09:59:43 +0200 Subject: bpf: Export bpf_dynptr_get_size() Export bpf_dynptr_get_size(), so that kernel code dealing with eBPF dynamic pointers can obtain the real size of data carried by this data structure. Signed-off-by: Roberto Sassu <roberto.sassu@huawei.com> Reviewed-by: Joanne Koong <joannelkoong@gmail.com> Acked-by: KP Singh <kpsingh@kernel.org> Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com> Link: https://lore.kernel.org/r/20220920075951.929132-6-roberto.sassu@huaweicloud.com Signed-off-by: Alexei Starovoitov <ast@kernel.org> --- include/linux/bpf.h | 1 + kernel/bpf/helpers.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 33e543b86e1a..6535fb1e21b9 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2654,6 +2654,7 @@ void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, enum bpf_dynptr_type type, u32 offset, u32 size); void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr); int bpf_dynptr_check_size(u32 size); +u32 bpf_dynptr_get_size(struct bpf_dynptr_kern *ptr); #ifdef CONFIG_BPF_LSM void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 6d69e30f42d8..b069517a3da0 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1408,7 +1408,7 @@ static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_typ ptr->size |= type << DYNPTR_TYPE_SHIFT; } -static u32 bpf_dynptr_get_size(struct bpf_dynptr_kern *ptr) +u32 bpf_dynptr_get_size(struct bpf_dynptr_kern *ptr) { return ptr->size & DYNPTR_SIZE_MASK; } -- cgit v1.2.3 From 90fd8f26edd47942203639bf3a5dde8fa1931a0e Mon Sep 17 00:00:00 2001 From: Roberto Sassu <roberto.sassu@huawei.com> Date: Tue, 20 Sep 2022 09:59:44 +0200 Subject: KEYS: Move KEY_LOOKUP_ to include/linux/key.h and define KEY_LOOKUP_ALL In preparation for the patch that introduces the bpf_lookup_user_key() eBPF kfunc, move KEY_LOOKUP_ definitions to include/linux/key.h, to be able to validate the kfunc parameters. Add them to enum key_lookup_flag, so that all the current ones and the ones defined in the future are automatically exported through BTF and available to eBPF programs. Also, add KEY_LOOKUP_ALL to the enum, with the logical OR of currently defined flags as value, to facilitate checking whether a variable contains only those flags. Signed-off-by: Roberto Sassu <roberto.sassu@huawei.com> Acked-by: Jarkko Sakkinen <jarkko@kernel.org> Link: https://lore.kernel.org/r/20220920075951.929132-7-roberto.sassu@huaweicloud.com Signed-off-by: Alexei Starovoitov <ast@kernel.org> --- include/linux/key.h | 6 ++++++ security/keys/internal.h | 2 -- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/key.h b/include/linux/key.h index 7febc4881363..d27477faf00d 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -88,6 +88,12 @@ enum key_need_perm { KEY_DEFER_PERM_CHECK, /* Special: permission check is deferred */ }; +enum key_lookup_flag { + KEY_LOOKUP_CREATE = 0x01, + KEY_LOOKUP_PARTIAL = 0x02, + KEY_LOOKUP_ALL = (KEY_LOOKUP_CREATE | KEY_LOOKUP_PARTIAL), +}; + struct seq_file; struct user_struct; struct signal_struct; diff --git a/security/keys/internal.h b/security/keys/internal.h index 9b9cf3b6fcbb..3c1e7122076b 100644 --- a/security/keys/internal.h +++ b/security/keys/internal.h @@ -165,8 +165,6 @@ extern struct key *request_key_and_link(struct key_type *type, extern bool lookup_user_key_possessed(const struct key *key, const struct key_match_data *match_data); -#define KEY_LOOKUP_CREATE 0x01 -#define KEY_LOOKUP_PARTIAL 0x02 extern long join_session_keyring(const char *name); extern void key_change_session_keyring(struct callback_head *twork); -- cgit v1.2.3 From f3cf4134c5c6c47b9b5c7aa3cb2d67e107887a7b Mon Sep 17 00:00:00 2001 From: Roberto Sassu <roberto.sassu@huawei.com> Date: Tue, 20 Sep 2022 09:59:45 +0200 Subject: bpf: Add bpf_lookup_*_key() and bpf_key_put() kfuncs Add the bpf_lookup_user_key(), bpf_lookup_system_key() and bpf_key_put() kfuncs, to respectively search a key with a given key handle serial number and flags, obtain a key from a pre-determined ID defined in include/linux/verification.h, and cleanup. Introduce system_keyring_id_check() to validate the keyring ID parameter of bpf_lookup_system_key(). Signed-off-by: Roberto Sassu <roberto.sassu@huawei.com> Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com> Acked-by: Song Liu <song@kernel.org> Link: https://lore.kernel.org/r/20220920075951.929132-8-roberto.sassu@huaweicloud.com Signed-off-by: Alexei Starovoitov <ast@kernel.org> --- include/linux/bpf.h | 8 +++ include/linux/verification.h | 8 +++ kernel/trace/bpf_trace.c | 135 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 151 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6535fb1e21b9..a1435b019aca 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2664,4 +2664,12 @@ static inline void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype) {} static inline void bpf_cgroup_atype_put(int cgroup_atype) {} #endif /* CONFIG_BPF_LSM */ +struct key; + +#ifdef CONFIG_KEYS +struct bpf_key { + struct key *key; + bool has_ref; +}; +#endif /* CONFIG_KEYS */ #endif /* _LINUX_BPF_H */ diff --git a/include/linux/verification.h b/include/linux/verification.h index a655923335ae..f34e50ebcf60 100644 --- a/include/linux/verification.h +++ b/include/linux/verification.h @@ -17,6 +17,14 @@ #define VERIFY_USE_SECONDARY_KEYRING ((struct key *)1UL) #define VERIFY_USE_PLATFORM_KEYRING ((struct key *)2UL) +static inline int system_keyring_id_check(u64 id) +{ + if (id > (unsigned long)VERIFY_USE_PLATFORM_KEYRING) + return -EINVAL; + + return 0; +} + /* * The use to which an asymmetric key is being put. */ diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 68e5cdd24cef..ab183dbaa8d1 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -20,6 +20,8 @@ #include <linux/fprobe.h> #include <linux/bsearch.h> #include <linux/sort.h> +#include <linux/key.h> +#include <linux/verification.h> #include <net/bpf_sk_storage.h> @@ -1181,6 +1183,139 @@ static const struct bpf_func_proto bpf_get_func_arg_cnt_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +#ifdef CONFIG_KEYS +__diag_push(); +__diag_ignore_all("-Wmissing-prototypes", + "kfuncs which will be used in BPF programs"); + +/** + * bpf_lookup_user_key - lookup a key by its serial + * @serial: key handle serial number + * @flags: lookup-specific flags + * + * Search a key with a given *serial* and the provided *flags*. + * If found, increment the reference count of the key by one, and + * return it in the bpf_key structure. + * + * The bpf_key structure must be passed to bpf_key_put() when done + * with it, so that the key reference count is decremented and the + * bpf_key structure is freed. + * + * Permission checks are deferred to the time the key is used by + * one of the available key-specific kfuncs. + * + * Set *flags* with KEY_LOOKUP_CREATE, to attempt creating a requested + * special keyring (e.g. session keyring), if it doesn't yet exist. + * Set *flags* with KEY_LOOKUP_PARTIAL, to lookup a key without waiting + * for the key construction, and to retrieve uninstantiated keys (keys + * without data attached to them). + * + * Return: a bpf_key pointer with a valid key pointer if the key is found, a + * NULL pointer otherwise. + */ +struct bpf_key *bpf_lookup_user_key(u32 serial, u64 flags) +{ + key_ref_t key_ref; + struct bpf_key *bkey; + + if (flags & ~KEY_LOOKUP_ALL) + return NULL; + + /* + * Permission check is deferred until the key is used, as the + * intent of the caller is unknown here. + */ + key_ref = lookup_user_key(serial, flags, KEY_DEFER_PERM_CHECK); + if (IS_ERR(key_ref)) + return NULL; + + bkey = kmalloc(sizeof(*bkey), GFP_KERNEL); + if (!bkey) { + key_put(key_ref_to_ptr(key_ref)); + return NULL; + } + + bkey->key = key_ref_to_ptr(key_ref); + bkey->has_ref = true; + + return bkey; +} + +/** + * bpf_lookup_system_key - lookup a key by a system-defined ID + * @id: key ID + * + * Obtain a bpf_key structure with a key pointer set to the passed key ID. + * The key pointer is marked as invalid, to prevent bpf_key_put() from + * attempting to decrement the key reference count on that pointer. The key + * pointer set in such way is currently understood only by + * verify_pkcs7_signature(). + * + * Set *id* to one of the values defined in include/linux/verification.h: + * 0 for the primary keyring (immutable keyring of system keys); + * VERIFY_USE_SECONDARY_KEYRING for both the primary and secondary keyring + * (where keys can be added only if they are vouched for by existing keys + * in those keyrings); VERIFY_USE_PLATFORM_KEYRING for the platform + * keyring (primarily used by the integrity subsystem to verify a kexec'ed + * kerned image and, possibly, the initramfs signature). + * + * Return: a bpf_key pointer with an invalid key pointer set from the + * pre-determined ID on success, a NULL pointer otherwise + */ +struct bpf_key *bpf_lookup_system_key(u64 id) +{ + struct bpf_key *bkey; + + if (system_keyring_id_check(id) < 0) + return NULL; + + bkey = kmalloc(sizeof(*bkey), GFP_ATOMIC); + if (!bkey) + return NULL; + + bkey->key = (struct key *)(unsigned long)id; + bkey->has_ref = false; + + return bkey; +} + +/** + * bpf_key_put - decrement key reference count if key is valid and free bpf_key + * @bkey: bpf_key structure + * + * Decrement the reference count of the key inside *bkey*, if the pointer + * is valid, and free *bkey*. + */ +void bpf_key_put(struct bpf_key *bkey) +{ + if (bkey->has_ref) + key_put(bkey->key); + + kfree(bkey); +} + +__diag_pop(); + +BTF_SET8_START(key_sig_kfunc_set) +BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_lookup_system_key, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_key_put, KF_RELEASE) +BTF_SET8_END(key_sig_kfunc_set) + +static const struct btf_kfunc_id_set bpf_key_sig_kfunc_set = { + .owner = THIS_MODULE, + .set = &key_sig_kfunc_set, +}; + +static int __init bpf_key_sig_kfuncs_init(void) +{ + return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, + &bpf_key_sig_kfunc_set); +} + +late_initcall(bpf_key_sig_kfuncs_init); +#endif /* CONFIG_KEYS */ + static const struct bpf_func_proto * bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { -- cgit v1.2.3 From 05b24ff9b2cfabfcfd951daaa915a036ab53c9e1 Mon Sep 17 00:00:00 2001 From: Jiri Olsa <jolsa@kernel.org> Date: Fri, 16 Sep 2022 09:19:14 +0200 Subject: bpf: Prevent bpf program recursion for raw tracepoint probes We got report from sysbot [1] about warnings that were caused by bpf program attached to contention_begin raw tracepoint triggering the same tracepoint by using bpf_trace_printk helper that takes trace_printk_lock lock. Call Trace: <TASK> ? trace_event_raw_event_bpf_trace_printk+0x5f/0x90 bpf_trace_printk+0x2b/0xe0 bpf_prog_a9aec6167c091eef_prog+0x1f/0x24 bpf_trace_run2+0x26/0x90 native_queued_spin_lock_slowpath+0x1c6/0x2b0 _raw_spin_lock_irqsave+0x44/0x50 bpf_trace_printk+0x3f/0xe0 bpf_prog_a9aec6167c091eef_prog+0x1f/0x24 bpf_trace_run2+0x26/0x90 native_queued_spin_lock_slowpath+0x1c6/0x2b0 _raw_spin_lock_irqsave+0x44/0x50 bpf_trace_printk+0x3f/0xe0 bpf_prog_a9aec6167c091eef_prog+0x1f/0x24 bpf_trace_run2+0x26/0x90 native_queued_spin_lock_slowpath+0x1c6/0x2b0 _raw_spin_lock_irqsave+0x44/0x50 bpf_trace_printk+0x3f/0xe0 bpf_prog_a9aec6167c091eef_prog+0x1f/0x24 bpf_trace_run2+0x26/0x90 native_queued_spin_lock_slowpath+0x1c6/0x2b0 _raw_spin_lock_irqsave+0x44/0x50 __unfreeze_partials+0x5b/0x160 ... The can be reproduced by attaching bpf program as raw tracepoint on contention_begin tracepoint. The bpf prog calls bpf_trace_printk helper. Then by running perf bench the spin lock code is forced to take slow path and call contention_begin tracepoint. Fixing this by skipping execution of the bpf program if it's already running, Using bpf prog 'active' field, which is being currently used by trampoline programs for the same reason. Moving bpf_prog_inc_misses_counter to syscall.c because trampoline.c is compiled in just for CONFIG_BPF_JIT option. Reviewed-by: Stanislav Fomichev <sdf@google.com> Reported-by: syzbot+2251879aa068ad9c960d@syzkaller.appspotmail.com [1] https://lore.kernel.org/bpf/YxhFe3EwqchC%2FfYf@krava/T/#t Signed-off-by: Jiri Olsa <jolsa@kernel.org> Link: https://lore.kernel.org/r/20220916071914.7156-1-jolsa@kernel.org Signed-off-by: Alexei Starovoitov <ast@kernel.org> --- include/linux/bpf.h | 6 ++++++ kernel/bpf/syscall.c | 11 +++++++++++ kernel/bpf/trampoline.c | 15 ++------------- kernel/trace/bpf_trace.c | 6 ++++++ 4 files changed, 25 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a1435b019aca..edd43edb27d6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2042,6 +2042,8 @@ static inline bool has_current_bpf_ctx(void) { return !!current->bpf_ctx; } + +void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog); #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { @@ -2264,6 +2266,10 @@ static inline bool has_current_bpf_ctx(void) { return false; } + +static inline void bpf_prog_inc_misses_counter(struct bpf_prog *prog) +{ +} #endif /* CONFIG_BPF_SYSCALL */ void __bpf_free_used_btfs(struct bpf_prog_aux *aux, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index dab156f09f8d..372fad5ef3d3 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2093,6 +2093,17 @@ struct bpf_prog_kstats { u64 misses; }; +void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog) +{ + struct bpf_prog_stats *stats; + unsigned int flags; + + stats = this_cpu_ptr(prog->stats); + flags = u64_stats_update_begin_irqsave(&stats->syncp); + u64_stats_inc(&stats->misses); + u64_stats_update_end_irqrestore(&stats->syncp, flags); +} + static void bpf_prog_get_stats(const struct bpf_prog *prog, struct bpf_prog_kstats *stats) { diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index ad76940b02cc..41b67eb83ab3 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -863,17 +863,6 @@ static __always_inline u64 notrace bpf_prog_start_time(void) return start; } -static void notrace inc_misses_counter(struct bpf_prog *prog) -{ - struct bpf_prog_stats *stats; - unsigned int flags; - - stats = this_cpu_ptr(prog->stats); - flags = u64_stats_update_begin_irqsave(&stats->syncp); - u64_stats_inc(&stats->misses); - u64_stats_update_end_irqrestore(&stats->syncp, flags); -} - /* The logic is similar to bpf_prog_run(), but with an explicit * rcu_read_lock() and migrate_disable() which are required * for the trampoline. The macro is split into @@ -896,7 +885,7 @@ u64 notrace __bpf_prog_enter(struct bpf_prog *prog, struct bpf_tramp_run_ctx *ru run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { - inc_misses_counter(prog); + bpf_prog_inc_misses_counter(prog); return 0; } return bpf_prog_start_time(); @@ -967,7 +956,7 @@ u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, struct bpf_tramp_r might_fault(); if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { - inc_misses_counter(prog); + bpf_prog_inc_misses_counter(prog); return 0; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 9df53c40cffd..b05f0310dbd3 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2222,9 +2222,15 @@ static __always_inline void __bpf_trace_run(struct bpf_prog *prog, u64 *args) { cant_sleep(); + if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { + bpf_prog_inc_misses_counter(prog); + goto out; + } rcu_read_lock(); (void) bpf_prog_run(prog, args); rcu_read_unlock(); +out: + this_cpu_dec(*(prog->active)); } #define UNPACK(...) __VA_ARGS__ -- cgit v1.2.3 From bf7a87f1075f67c286f794519f0fedfa8b0b18cc Mon Sep 17 00:00:00 2001 From: Jiri Olsa <jolsa@kernel.org> Date: Mon, 26 Sep 2022 17:33:35 +0200 Subject: kprobes: Add new KPROBE_FLAG_ON_FUNC_ENTRY kprobe flag Adding KPROBE_FLAG_ON_FUNC_ENTRY kprobe flag to indicate that attach address is on function entry. This is used in following changes in get_func_ip helper to return correct function address. Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org> Signed-off-by: Jiri Olsa <jolsa@kernel.org> Link: https://lore.kernel.org/r/20220926153340.1621984-2-jolsa@kernel.org Signed-off-by: Alexei Starovoitov <ast@kernel.org> --- include/linux/kprobes.h | 1 + kernel/kprobes.c | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 55041d2f884d..a0b92be98984 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -103,6 +103,7 @@ struct kprobe { * this flag is only for optimized_kprobe. */ #define KPROBE_FLAG_FTRACE 8 /* probe is using ftrace */ +#define KPROBE_FLAG_ON_FUNC_ENTRY 16 /* probe is on the function entry */ /* Has this kprobe gone ? */ static inline bool kprobe_gone(struct kprobe *p) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 08350e35aba2..51adc3c94503 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1606,9 +1606,10 @@ int register_kprobe(struct kprobe *p) struct kprobe *old_p; struct module *probed_mod; kprobe_opcode_t *addr; + bool on_func_entry; /* Adjust probe address from symbol */ - addr = kprobe_addr(p); + addr = _kprobe_addr(p->addr, p->symbol_name, p->offset, &on_func_entry); if (IS_ERR(addr)) return PTR_ERR(addr); p->addr = addr; @@ -1628,6 +1629,9 @@ int register_kprobe(struct kprobe *p) mutex_lock(&kprobe_mutex); + if (on_func_entry) + p->flags |= KPROBE_FLAG_ON_FUNC_ENTRY; + old_p = get_kprobe(p->addr); if (old_p) { /* Since this may unoptimize 'old_p', locking 'text_mutex'. */ -- cgit v1.2.3 From 19c02415da2345d0dda2b5c4495bc17cc14b18b5 Mon Sep 17 00:00:00 2001 From: Song Liu <song@kernel.org> Date: Mon, 26 Sep 2022 11:47:38 -0700 Subject: bpf: use bpf_prog_pack for bpf_dispatcher Allocate bpf_dispatcher with bpf_prog_pack_alloc so that bpf_dispatcher can share pages with bpf programs. arch_prepare_bpf_dispatcher() is updated to provide a RW buffer as working area for arch code to write to. This also fixes CPA W^X warnning like: CPA refuse W^X violation: 8000000000000163 -> 0000000000000163 range: ... Signed-off-by: Song Liu <song@kernel.org> Link: https://lore.kernel.org/r/20220926184739.3512547-2-song@kernel.org Signed-off-by: Alexei Starovoitov <ast@kernel.org> --- arch/x86/net/bpf_jit_comp.c | 16 ++++++++-------- include/linux/bpf.h | 3 ++- include/linux/filter.h | 5 +++++ kernel/bpf/core.c | 9 +++++++-- kernel/bpf/dispatcher.c | 27 +++++++++++++++++++++------ 5 files changed, 43 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index d4a6183197e9..35796db58116 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -2242,7 +2242,7 @@ cleanup: return ret; } -static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs) +static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs, u8 *image, u8 *buf) { u8 *jg_reloc, *prog = *pprog; int pivot, err, jg_bytes = 1; @@ -2258,12 +2258,12 @@ static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs) EMIT2_off32(0x81, add_1reg(0xF8, BPF_REG_3), progs[a]); err = emit_cond_near_jump(&prog, /* je func */ - (void *)progs[a], prog, + (void *)progs[a], image + (prog - buf), X86_JE); if (err) return err; - emit_indirect_jump(&prog, 2 /* rdx */, prog); + emit_indirect_jump(&prog, 2 /* rdx */, image + (prog - buf)); *pprog = prog; return 0; @@ -2288,7 +2288,7 @@ static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs) jg_reloc = prog; err = emit_bpf_dispatcher(&prog, a, a + pivot, /* emit lower_part */ - progs); + progs, image, buf); if (err) return err; @@ -2302,7 +2302,7 @@ static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs) emit_code(jg_reloc - jg_bytes, jg_offset, jg_bytes); err = emit_bpf_dispatcher(&prog, a + pivot + 1, /* emit upper_part */ - b, progs); + b, progs, image, buf); if (err) return err; @@ -2322,12 +2322,12 @@ static int cmp_ips(const void *a, const void *b) return 0; } -int arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs) +int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_funcs) { - u8 *prog = image; + u8 *prog = buf; sort(funcs, num_funcs, sizeof(funcs[0]), cmp_ips, NULL); - return emit_bpf_dispatcher(&prog, 0, num_funcs - 1, funcs); + return emit_bpf_dispatcher(&prog, 0, num_funcs - 1, funcs, image, buf); } struct x64_jit_data { diff --git a/include/linux/bpf.h b/include/linux/bpf.h index edd43edb27d6..9ae155c75014 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -946,6 +946,7 @@ struct bpf_dispatcher { struct bpf_dispatcher_prog progs[BPF_DISPATCHER_MAX]; int num_progs; void *image; + void *rw_image; u32 image_off; struct bpf_ksym ksym; }; @@ -964,7 +965,7 @@ int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampolin struct bpf_trampoline *bpf_trampoline_get(u64 key, struct bpf_attach_target_info *tgt_info); void bpf_trampoline_put(struct bpf_trampoline *tr); -int arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs); +int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_funcs); #define BPF_DISPATCHER_INIT(_name) { \ .mutex = __MUTEX_INITIALIZER(_name.mutex), \ .func = &_name##_func, \ diff --git a/include/linux/filter.h b/include/linux/filter.h index 98e28126c24b..efc42a6e3aed 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1023,6 +1023,8 @@ extern long bpf_jit_limit_max; typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size); +void bpf_jit_fill_hole_with_zero(void *area, unsigned int size); + struct bpf_binary_header * bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, unsigned int alignment, @@ -1035,6 +1037,9 @@ void bpf_jit_free(struct bpf_prog *fp); struct bpf_binary_header * bpf_jit_binary_pack_hdr(const struct bpf_prog *fp); +void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns); +void bpf_prog_pack_free(struct bpf_binary_header *hdr); + static inline bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp) { return list_empty(&fp->aux->ksym.lnode) || diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index d1be78c28619..711fd293b6de 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -825,6 +825,11 @@ struct bpf_prog_pack { unsigned long bitmap[]; }; +void bpf_jit_fill_hole_with_zero(void *area, unsigned int size) +{ + memset(area, 0, size); +} + #define BPF_PROG_SIZE_TO_NBITS(size) (round_up(size, BPF_PROG_CHUNK_SIZE) / BPF_PROG_CHUNK_SIZE) static DEFINE_MUTEX(pack_mutex); @@ -864,7 +869,7 @@ static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_ins return pack; } -static void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns) +void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns) { unsigned int nbits = BPF_PROG_SIZE_TO_NBITS(size); struct bpf_prog_pack *pack; @@ -905,7 +910,7 @@ out: return ptr; } -static void bpf_prog_pack_free(struct bpf_binary_header *hdr) +void bpf_prog_pack_free(struct bpf_binary_header *hdr) { struct bpf_prog_pack *pack = NULL, *tmp; unsigned int nbits; diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c index 2444bd15cc2d..fa64b80b8bca 100644 --- a/kernel/bpf/dispatcher.c +++ b/kernel/bpf/dispatcher.c @@ -85,12 +85,12 @@ static bool bpf_dispatcher_remove_prog(struct bpf_dispatcher *d, return false; } -int __weak arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs) +int __weak arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_funcs) { return -ENOTSUPP; } -static int bpf_dispatcher_prepare(struct bpf_dispatcher *d, void *image) +static int bpf_dispatcher_prepare(struct bpf_dispatcher *d, void *image, void *buf) { s64 ips[BPF_DISPATCHER_MAX] = {}, *ipsp = &ips[0]; int i; @@ -99,12 +99,12 @@ static int bpf_dispatcher_prepare(struct bpf_dispatcher *d, void *image) if (d->progs[i].prog) *ipsp++ = (s64)(uintptr_t)d->progs[i].prog->bpf_func; } - return arch_prepare_bpf_dispatcher(image, &ips[0], d->num_progs); + return arch_prepare_bpf_dispatcher(image, buf, &ips[0], d->num_progs); } static void bpf_dispatcher_update(struct bpf_dispatcher *d, int prev_num_progs) { - void *old, *new; + void *old, *new, *tmp; u32 noff; int err; @@ -117,8 +117,14 @@ static void bpf_dispatcher_update(struct bpf_dispatcher *d, int prev_num_progs) } new = d->num_progs ? d->image + noff : NULL; + tmp = d->num_progs ? d->rw_image + noff : NULL; if (new) { - if (bpf_dispatcher_prepare(d, new)) + /* Prepare the dispatcher in d->rw_image. Then use + * bpf_arch_text_copy to update d->image, which is RO+X. + */ + if (bpf_dispatcher_prepare(d, new, tmp)) + return; + if (IS_ERR(bpf_arch_text_copy(new, tmp, PAGE_SIZE / 2))) return; } @@ -140,9 +146,18 @@ void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, mutex_lock(&d->mutex); if (!d->image) { - d->image = bpf_jit_alloc_exec_page(); + d->image = bpf_prog_pack_alloc(PAGE_SIZE, bpf_jit_fill_hole_with_zero); if (!d->image) goto out; + d->rw_image = bpf_jit_alloc_exec(PAGE_SIZE); + if (!d->rw_image) { + u32 size = PAGE_SIZE; + + bpf_arch_text_copy(d->image, &size, sizeof(size)); + bpf_prog_pack_free((struct bpf_binary_header *)d->image); + d->image = NULL; + goto out; + } bpf_image_ksym_add(d->image, &d->ksym); } -- cgit v1.2.3 From 5b0d1c7bd5722467960829af51d523f5a6ffd848 Mon Sep 17 00:00:00 2001 From: Song Liu <song@kernel.org> Date: Mon, 26 Sep 2022 11:47:39 -0700 Subject: bpf: Enforce W^X for bpf trampoline Mark the trampoline as RO+X after arch_prepare_bpf_trampoline, so that the trampoine follows W^X rule strictly. This will turn off warnings like CPA refuse W^X violation: 8000000000000163 -> 0000000000000163 range: ... Also remove bpf_jit_alloc_exec_page(), since it is not used any more. Signed-off-by: Song Liu <song@kernel.org> Link: https://lore.kernel.org/r/20220926184739.3512547-3-song@kernel.org Signed-off-by: Alexei Starovoitov <ast@kernel.org> --- include/linux/bpf.h | 1 - kernel/bpf/trampoline.c | 22 +++++----------------- 2 files changed, 5 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9ae155c75014..5161fac0513f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1008,7 +1008,6 @@ int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_func void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, struct bpf_prog *to); /* Called only from JIT-enabled code, so there's no need for stubs. */ -void *bpf_jit_alloc_exec_page(void); void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym); void bpf_image_ksym_del(struct bpf_ksym *ksym); void bpf_ksym_add(struct bpf_ksym *ksym); diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 41b67eb83ab3..6f7b939321d6 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -116,22 +116,6 @@ bool bpf_prog_has_trampoline(const struct bpf_prog *prog) (ptype == BPF_PROG_TYPE_LSM && eatype == BPF_LSM_MAC); } -void *bpf_jit_alloc_exec_page(void) -{ - void *image; - - image = bpf_jit_alloc_exec(PAGE_SIZE); - if (!image) - return NULL; - - set_vm_flush_reset_perms(image); - /* Keep image as writeable. The alternative is to keep flipping ro/rw - * every time new program is attached or detached. - */ - set_memory_x((long)image, 1); - return image; -} - void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym) { ksym->start = (unsigned long) data; @@ -404,9 +388,10 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx) goto out_free_im; err = -ENOMEM; - im->image = image = bpf_jit_alloc_exec_page(); + im->image = image = bpf_jit_alloc_exec(PAGE_SIZE); if (!image) goto out_uncharge; + set_vm_flush_reset_perms(image); err = percpu_ref_init(&im->pcref, __bpf_tramp_image_release, 0, GFP_KERNEL); if (err) @@ -483,6 +468,9 @@ again: if (err < 0) goto out; + set_memory_ro((long)im->image, 1); + set_memory_x((long)im->image, 1); + WARN_ON(tr->cur_image && tr->selector == 0); WARN_ON(!tr->cur_image && tr->selector); if (tr->cur_image) -- cgit v1.2.3 From f0d74c4da1f060d2a66976193712a5e6abd361f5 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee <kuifeng@fb.com> Date: Mon, 26 Sep 2022 11:49:53 -0700 Subject: bpf: Parameterize task iterators. Allow creating an iterator that loops through resources of one thread/process. People could only create iterators to loop through all resources of files, vma, and tasks in the system, even though they were interested in only the resources of a specific task or process. Passing the additional parameters, people can now create an iterator to go through all resources or only the resources of a task. Signed-off-by: Kui-Feng Lee <kuifeng@fb.com> Signed-off-by: Andrii Nakryiko <andrii@kernel.org> Acked-by: Yonghong Song <yhs@fb.com> Acked-by: Martin KaFai Lau <martin.lau@kernel.org> Link: https://lore.kernel.org/bpf/20220926184957.208194-2-kuifeng@fb.com --- include/linux/bpf.h | 25 ++++++ include/uapi/linux/bpf.h | 6 ++ kernel/bpf/task_iter.c | 188 ++++++++++++++++++++++++++++++++++++----- tools/include/uapi/linux/bpf.h | 6 ++ 4 files changed, 203 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5161fac0513f..0f3eaf3ed98c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1796,6 +1796,27 @@ int bpf_obj_get_user(const char __user *pathname, int flags); extern int bpf_iter_ ## target(args); \ int __init bpf_iter_ ## target(args) { return 0; } +/* + * The task type of iterators. + * + * For BPF task iterators, they can be parameterized with various + * parameters to visit only some of tasks. + * + * BPF_TASK_ITER_ALL (default) + * Iterate over resources of every task. + * + * BPF_TASK_ITER_TID + * Iterate over resources of a task/tid. + * + * BPF_TASK_ITER_TGID + * Iterate over resources of every task of a process / task group. + */ +enum bpf_iter_task_type { + BPF_TASK_ITER_ALL = 0, + BPF_TASK_ITER_TID, + BPF_TASK_ITER_TGID, +}; + struct bpf_iter_aux_info { /* for map_elem iter */ struct bpf_map *map; @@ -1805,6 +1826,10 @@ struct bpf_iter_aux_info { struct cgroup *start; /* starting cgroup */ enum bpf_cgroup_iter_order order; } cgroup; + struct { + enum bpf_iter_task_type type; + u32 pid; + } task; }; typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d6bd10759eaf..455b21a53aac 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -110,6 +110,12 @@ union bpf_iter_link_info { __u32 cgroup_fd; __u64 cgroup_id; } cgroup; + /* Parameters of task iterators. */ + struct { + __u32 tid; + __u32 pid; + __u32 pid_fd; + } task; }; /* BPF syscall commands, see bpf(2) man-page for more details. */ diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 8c921799def4..8b2f47e7139d 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -12,6 +12,9 @@ struct bpf_iter_seq_task_common { struct pid_namespace *ns; + enum bpf_iter_task_type type; + u32 pid; + u32 pid_visiting; }; struct bpf_iter_seq_task_info { @@ -22,18 +25,115 @@ struct bpf_iter_seq_task_info { u32 tid; }; -static struct task_struct *task_seq_get_next(struct pid_namespace *ns, +static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_common *common, + u32 *tid, + bool skip_if_dup_files) +{ + struct task_struct *task, *next_task; + struct pid *pid; + u32 saved_tid; + + if (!*tid) { + /* The first time, the iterator calls this function. */ + pid = find_pid_ns(common->pid, common->ns); + if (!pid) + return NULL; + + task = get_pid_task(pid, PIDTYPE_TGID); + if (!task) + return NULL; + + *tid = common->pid; + common->pid_visiting = common->pid; + + return task; + } + + /* If the control returns to user space and comes back to the + * kernel again, *tid and common->pid_visiting should be the + * same for task_seq_start() to pick up the correct task. + */ + if (*tid == common->pid_visiting) { + pid = find_pid_ns(common->pid_visiting, common->ns); + task = get_pid_task(pid, PIDTYPE_PID); + + return task; + } + + pid = find_pid_ns(common->pid_visiting, common->ns); + if (!pid) + return NULL; + + task = get_pid_task(pid, PIDTYPE_PID); + if (!task) + return NULL; + +retry: + if (!pid_alive(task)) { + put_task_struct(task); + return NULL; + } + + next_task = next_thread(task); + put_task_struct(task); + if (!next_task) + return NULL; + + saved_tid = *tid; + *tid = __task_pid_nr_ns(next_task, PIDTYPE_PID, common->ns); + if (!*tid || *tid == common->pid) { + /* Run out of tasks of a process. The tasks of a + * thread_group are linked as circular linked list. + */ + *tid = saved_tid; + return NULL; + } + + get_task_struct(next_task); + common->pid_visiting = *tid; + + if (skip_if_dup_files && task->files == task->group_leader->files) { + task = next_task; + goto retry; + } + + return next_task; +} + +static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *common, u32 *tid, bool skip_if_dup_files) { struct task_struct *task = NULL; struct pid *pid; + if (common->type == BPF_TASK_ITER_TID) { + if (*tid && *tid != common->pid) + return NULL; + rcu_read_lock(); + pid = find_pid_ns(common->pid, common->ns); + if (pid) { + task = get_pid_task(pid, PIDTYPE_TGID); + *tid = common->pid; + } + rcu_read_unlock(); + + return task; + } + + if (common->type == BPF_TASK_ITER_TGID) { + rcu_read_lock(); + task = task_group_seq_get_next(common, tid, skip_if_dup_files); + rcu_read_unlock(); + + return task; + } + rcu_read_lock(); retry: - pid = find_ge_pid(*tid, ns); + pid = find_ge_pid(*tid, common->ns); if (pid) { - *tid = pid_nr_ns(pid, ns); + *tid = pid_nr_ns(pid, common->ns); task = get_pid_task(pid, PIDTYPE_PID); if (!task) { ++*tid; @@ -56,7 +156,7 @@ static void *task_seq_start(struct seq_file *seq, loff_t *pos) struct bpf_iter_seq_task_info *info = seq->private; struct task_struct *task; - task = task_seq_get_next(info->common.ns, &info->tid, false); + task = task_seq_get_next(&info->common, &info->tid, false); if (!task) return NULL; @@ -73,7 +173,7 @@ static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos) ++*pos; ++info->tid; put_task_struct((struct task_struct *)v); - task = task_seq_get_next(info->common.ns, &info->tid, false); + task = task_seq_get_next(&info->common, &info->tid, false); if (!task) return NULL; @@ -117,6 +217,41 @@ static void task_seq_stop(struct seq_file *seq, void *v) put_task_struct((struct task_struct *)v); } +static int bpf_iter_attach_task(struct bpf_prog *prog, + union bpf_iter_link_info *linfo, + struct bpf_iter_aux_info *aux) +{ + unsigned int flags; + struct pid *pid; + pid_t tgid; + + if ((!!linfo->task.tid + !!linfo->task.pid + !!linfo->task.pid_fd) > 1) + return -EINVAL; + + aux->task.type = BPF_TASK_ITER_ALL; + if (linfo->task.tid != 0) { + aux->task.type = BPF_TASK_ITER_TID; + aux->task.pid = linfo->task.tid; + } + if (linfo->task.pid != 0) { + aux->task.type = BPF_TASK_ITER_TGID; + aux->task.pid = linfo->task.pid; + } + if (linfo->task.pid_fd != 0) { + aux->task.type = BPF_TASK_ITER_TGID; + + pid = pidfd_get_pid(linfo->task.pid_fd, &flags); + if (IS_ERR(pid)) + return PTR_ERR(pid); + + tgid = pid_nr_ns(pid, task_active_pid_ns(current)); + aux->task.pid = tgid; + put_pid(pid); + } + + return 0; +} + static const struct seq_operations task_seq_ops = { .start = task_seq_start, .next = task_seq_next, @@ -137,8 +272,7 @@ struct bpf_iter_seq_task_file_info { static struct file * task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info) { - struct pid_namespace *ns = info->common.ns; - u32 curr_tid = info->tid; + u32 saved_tid = info->tid; struct task_struct *curr_task; unsigned int curr_fd = info->fd; @@ -151,21 +285,18 @@ again: curr_task = info->task; curr_fd = info->fd; } else { - curr_task = task_seq_get_next(ns, &curr_tid, true); + curr_task = task_seq_get_next(&info->common, &info->tid, true); if (!curr_task) { info->task = NULL; - info->tid = curr_tid; return NULL; } - /* set info->task and info->tid */ + /* set info->task */ info->task = curr_task; - if (curr_tid == info->tid) { + if (saved_tid == info->tid) curr_fd = info->fd; - } else { - info->tid = curr_tid; + else curr_fd = 0; - } } rcu_read_lock(); @@ -186,9 +317,15 @@ again: /* the current task is done, go to the next task */ rcu_read_unlock(); put_task_struct(curr_task); + + if (info->common.type == BPF_TASK_ITER_TID) { + info->task = NULL; + return NULL; + } + info->task = NULL; info->fd = 0; - curr_tid = ++(info->tid); + saved_tid = ++(info->tid); goto again; } @@ -269,6 +406,9 @@ static int init_seq_pidns(void *priv_data, struct bpf_iter_aux_info *aux) struct bpf_iter_seq_task_common *common = priv_data; common->ns = get_pid_ns(task_active_pid_ns(current)); + common->type = aux->task.type; + common->pid = aux->task.pid; + return 0; } @@ -307,11 +447,10 @@ enum bpf_task_vma_iter_find_op { static struct vm_area_struct * task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info) { - struct pid_namespace *ns = info->common.ns; enum bpf_task_vma_iter_find_op op; struct vm_area_struct *curr_vma; struct task_struct *curr_task; - u32 curr_tid = info->tid; + u32 saved_tid = info->tid; /* If this function returns a non-NULL vma, it holds a reference to * the task_struct, and holds read lock on vma->mm->mmap_lock. @@ -371,14 +510,13 @@ task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info) } } else { again: - curr_task = task_seq_get_next(ns, &curr_tid, true); + curr_task = task_seq_get_next(&info->common, &info->tid, true); if (!curr_task) { - info->tid = curr_tid + 1; + info->tid++; goto finish; } - if (curr_tid != info->tid) { - info->tid = curr_tid; + if (saved_tid != info->tid) { /* new task, process the first vma */ op = task_vma_iter_first_vma; } else { @@ -430,9 +568,12 @@ again: return curr_vma; next_task: + if (info->common.type == BPF_TASK_ITER_TID) + goto finish; + put_task_struct(curr_task); info->task = NULL; - curr_tid++; + info->tid++; goto again; finish: @@ -533,6 +674,7 @@ static const struct bpf_iter_seq_info task_seq_info = { static struct bpf_iter_reg task_reg_info = { .target = "task", + .attach_target = bpf_iter_attach_task, .feature = BPF_ITER_RESCHED, .ctx_arg_info_size = 1, .ctx_arg_info = { @@ -551,6 +693,7 @@ static const struct bpf_iter_seq_info task_file_seq_info = { static struct bpf_iter_reg task_file_reg_info = { .target = "task_file", + .attach_target = bpf_iter_attach_task, .feature = BPF_ITER_RESCHED, .ctx_arg_info_size = 2, .ctx_arg_info = { @@ -571,6 +714,7 @@ static const struct bpf_iter_seq_info task_vma_seq_info = { static struct bpf_iter_reg task_vma_reg_info = { .target = "task_vma", + .attach_target = bpf_iter_attach_task, .feature = BPF_ITER_RESCHED, .ctx_arg_info_size = 2, .ctx_arg_info = { diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index d6bd10759eaf..455b21a53aac 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -110,6 +110,12 @@ union bpf_iter_link_info { __u32 cgroup_fd; __u64 cgroup_id; } cgroup; + /* Parameters of task iterators. */ + struct { + __u32 tid; + __u32 pid; + __u32 pid_fd; + } task; }; /* BPF syscall commands, see bpf(2) man-page for more details. */ -- cgit v1.2.3 From 64696c40d03c01e0ea2e3e9aa1c490a7b6a1b6be Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau <martin.lau@kernel.org> Date: Thu, 29 Sep 2022 00:04:03 -0700 Subject: bpf: Add __bpf_prog_{enter,exit}_struct_ops for struct_ops trampoline The struct_ops prog is to allow using bpf to implement the functions in a struct (eg. kernel module). The current usage is to implement the tcp_congestion. The kernel does not call the tcp-cc's ops (ie. the bpf prog) in a recursive way. The struct_ops is sharing the tracing-trampoline's enter/exit function which tracks prog->active to avoid recursion. It is needed for tracing prog. However, it turns out the struct_ops bpf prog will hit this prog->active and unnecessarily skipped running the struct_ops prog. eg. The '.ssthresh' may run in_task() and then interrupted by softirq that runs the same '.ssthresh'. Skip running the '.ssthresh' will end up returning random value to the caller. The patch adds __bpf_prog_{enter,exit}_struct_ops for the struct_ops trampoline. They do not track the prog->active to detect recursion. One exception is when the tcp_congestion's '.init' ops is doing bpf_setsockopt(TCP_CONGESTION) and then recurs to the same '.init' ops. This will be addressed in the following patches. Fixes: ca06f55b9002 ("bpf: Add per-program recursion prevention mechanism") Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org> Link: https://lore.kernel.org/r/20220929070407.965581-2-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov <ast@kernel.org> --- arch/x86/net/bpf_jit_comp.c | 3 +++ include/linux/bpf.h | 4 ++++ kernel/bpf/trampoline.c | 23 +++++++++++++++++++++++ 3 files changed, 30 insertions(+) (limited to 'include/linux') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 35796db58116..5b6230779cf3 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1836,6 +1836,9 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, if (p->aux->sleepable) { enter = __bpf_prog_enter_sleepable; exit = __bpf_prog_exit_sleepable; + } else if (p->type == BPF_PROG_TYPE_STRUCT_OPS) { + enter = __bpf_prog_enter_struct_ops; + exit = __bpf_prog_exit_struct_ops; } else if (p->expected_attach_type == BPF_LSM_CGROUP) { enter = __bpf_prog_enter_lsm_cgroup; exit = __bpf_prog_exit_lsm_cgroup; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 0f3eaf3ed98c..9e7d46d16032 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -864,6 +864,10 @@ u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx); void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx); +u64 notrace __bpf_prog_enter_struct_ops(struct bpf_prog *prog, + struct bpf_tramp_run_ctx *run_ctx); +void notrace __bpf_prog_exit_struct_ops(struct bpf_prog *prog, u64 start, + struct bpf_tramp_run_ctx *run_ctx); void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr); void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr); diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 6f7b939321d6..bf0906e1e2b9 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -964,6 +964,29 @@ void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start, rcu_read_unlock_trace(); } +u64 notrace __bpf_prog_enter_struct_ops(struct bpf_prog *prog, + struct bpf_tramp_run_ctx *run_ctx) + __acquires(RCU) +{ + rcu_read_lock(); + migrate_disable(); + + run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); + + return bpf_prog_start_time(); +} + +void notrace __bpf_prog_exit_struct_ops(struct bpf_prog *prog, u64 start, + struct bpf_tramp_run_ctx *run_ctx) + __releases(RCU) +{ + bpf_reset_run_ctx(run_ctx->saved_run_ctx); + + update_prog_stats(prog, start); + migrate_enable(); + rcu_read_unlock(); +} + void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr) { percpu_ref_get(&tr->pcref); -- cgit v1.2.3 From 061ff040710e9f6f043d1fa80b1b362d2845b17a Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau <martin.lau@kernel.org> Date: Thu, 29 Sep 2022 00:04:06 -0700 Subject: bpf: tcp: Stop bpf_setsockopt(TCP_CONGESTION) in init ops to recur itself When a bad bpf prog '.init' calls bpf_setsockopt(TCP_CONGESTION, "itself"), it will trigger this loop: .init => bpf_setsockopt(tcp_cc) => .init => bpf_setsockopt(tcp_cc) ... ... => .init => bpf_setsockopt(tcp_cc). It was prevented by the prog->active counter before but the prog->active detection cannot be used in struct_ops as explained in the earlier patch of the set. In this patch, the second bpf_setsockopt(tcp_cc) is not allowed in order to break the loop. This is done by using a bit of an existing 1 byte hole in tcp_sock to check if there is on-going bpf_setsockopt(TCP_CONGESTION) in this tcp_sock. Note that this essentially limits only the first '.init' can call bpf_setsockopt(TCP_CONGESTION) to pick a fallback cc (eg. peer does not support ECN) and the second '.init' cannot fallback to another cc. This applies even the second bpf_setsockopt(TCP_CONGESTION) will not cause a loop. Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org> Reviewed-by: Eric Dumazet <edumazet@google.com> Link: https://lore.kernel.org/r/20220929070407.965581-5-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov <ast@kernel.org> --- include/linux/tcp.h | 6 ++++++ net/core/filter.c | 28 +++++++++++++++++++++++++++- net/ipv4/tcp_minisocks.c | 1 + 3 files changed, 34 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index a9fbe22732c3..3bdf687e2fb3 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -388,6 +388,12 @@ struct tcp_sock { u8 bpf_sock_ops_cb_flags; /* Control calling BPF programs * values defined in uapi/linux/tcp.h */ + u8 bpf_chg_cc_inprogress:1; /* In the middle of + * bpf_setsockopt(TCP_CONGESTION), + * it is to avoid the bpf_tcp_cc->init() + * to recur itself by calling + * bpf_setsockopt(TCP_CONGESTION, "itself"). + */ #define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) (TP->bpf_sock_ops_cb_flags & ARG) #else #define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) 0 diff --git a/net/core/filter.c b/net/core/filter.c index 96f2f7a65e65..ac4c45c02da5 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5105,6 +5105,9 @@ static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname, static int sol_tcp_sockopt_congestion(struct sock *sk, char *optval, int *optlen, bool getopt) { + struct tcp_sock *tp; + int ret; + if (*optlen < 2) return -EINVAL; @@ -5125,8 +5128,31 @@ static int sol_tcp_sockopt_congestion(struct sock *sk, char *optval, if (*optlen >= sizeof("cdg") - 1 && !strncmp("cdg", optval, *optlen)) return -ENOTSUPP; - return do_tcp_setsockopt(sk, SOL_TCP, TCP_CONGESTION, + /* It stops this looping + * + * .init => bpf_setsockopt(tcp_cc) => .init => + * bpf_setsockopt(tcp_cc)" => .init => .... + * + * The second bpf_setsockopt(tcp_cc) is not allowed + * in order to break the loop when both .init + * are the same bpf prog. + * + * This applies even the second bpf_setsockopt(tcp_cc) + * does not cause a loop. This limits only the first + * '.init' can call bpf_setsockopt(TCP_CONGESTION) to + * pick a fallback cc (eg. peer does not support ECN) + * and the second '.init' cannot fallback to + * another. + */ + tp = tcp_sk(sk); + if (tp->bpf_chg_cc_inprogress) + return -EBUSY; + + tp->bpf_chg_cc_inprogress = 1; + ret = do_tcp_setsockopt(sk, SOL_TCP, TCP_CONGESTION, KERNEL_SOCKPTR(optval), *optlen); + tp->bpf_chg_cc_inprogress = 0; + return ret; } static int sol_tcp_sockopt(struct sock *sk, int optname, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index cb95d88497ae..ddcdc2bc4c04 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -541,6 +541,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->fastopen_req = NULL; RCU_INIT_POINTER(newtp->fastopen_rsk, NULL); + newtp->bpf_chg_cc_inprogress = 0; tcp_bpf_clone(sk, newsk); __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS); -- cgit v1.2.3