From a171fbec88a2c730b108c7147ac5e7b2f5a02b47 Mon Sep 17 00:00:00 2001 From: Yan Zhai Date: Thu, 17 Aug 2023 19:58:14 -0700 Subject: lwt: Check LWTUNNEL_XMIT_CONTINUE strictly LWTUNNEL_XMIT_CONTINUE is implicitly assumed in ip(6)_finish_output2, such that any positive return value from a xmit hook could cause unexpected continue behavior, despite that related skb may have been freed. This could be error-prone for future xmit hook ops. One of the possible errors is to return statuses of dst_output directly. To make the code safer, redefine LWTUNNEL_XMIT_CONTINUE value to distinguish from dst_output statuses and check the continue condition explicitly. Fixes: 3a0af8fd61f9 ("bpf: BPF for lightweight tunnel infrastructure") Suggested-by: Dan Carpenter Signed-off-by: Yan Zhai Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/96b939b85eda00e8df4f7c080f770970a4c5f698.1692326837.git.yan@cloudflare.com --- include/net/lwtunnel.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h index 6f15e6fa154e..53bd2d02a4f0 100644 --- a/include/net/lwtunnel.h +++ b/include/net/lwtunnel.h @@ -16,9 +16,12 @@ #define LWTUNNEL_STATE_INPUT_REDIRECT BIT(1) #define LWTUNNEL_STATE_XMIT_REDIRECT BIT(2) +/* LWTUNNEL_XMIT_CONTINUE should be distinguishable from dst_output return + * values (NET_XMIT_xxx and NETDEV_TX_xxx in linux/netdevice.h) for safety. + */ enum { LWTUNNEL_XMIT_DONE, - LWTUNNEL_XMIT_CONTINUE, + LWTUNNEL_XMIT_CONTINUE = 0x100, }; -- cgit v1.2.3 From c5487f8d91868eeab17a59cf4d164ea113f90252 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 9 Aug 2023 10:34:13 +0200 Subject: bpf: Switch BPF_F_KPROBE_MULTI_RETURN macro to enum Switching BPF_F_KPROBE_MULTI_RETURN macro to anonymous enum, so it'd show up in vmlinux.h. There's not functional change compared to having this as macro. Acked-by: Yafang Shao Suggested-by: Andrii Nakryiko Signed-off-by: Jiri Olsa Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20230809083440.3209381-2-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 4 +++- tools/include/uapi/linux/bpf.h | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d21deb46f49f..a4e55e5e84a7 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1186,7 +1186,9 @@ enum bpf_perf_event_type { /* link_create.kprobe_multi.flags used in LINK_CREATE command for * BPF_TRACE_KPROBE_MULTI attach type to create return probe. */ -#define BPF_F_KPROBE_MULTI_RETURN (1U << 0) +enum { + BPF_F_KPROBE_MULTI_RETURN = (1U << 0) +}; /* link_create.netfilter.flags used in LINK_CREATE command for * BPF_PROG_TYPE_NETFILTER to enable IP packet defragmentation. diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index d21deb46f49f..a4e55e5e84a7 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1186,7 +1186,9 @@ enum bpf_perf_event_type { /* link_create.kprobe_multi.flags used in LINK_CREATE command for * BPF_TRACE_KPROBE_MULTI attach type to create return probe. */ -#define BPF_F_KPROBE_MULTI_RETURN (1U << 0) +enum { + BPF_F_KPROBE_MULTI_RETURN = (1U << 0) +}; /* link_create.netfilter.flags used in LINK_CREATE command for * BPF_PROG_TYPE_NETFILTER to enable IP packet defragmentation. -- cgit v1.2.3 From 89ae89f53d201143560f1e9ed4bfa62eee34f88e Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 9 Aug 2023 10:34:15 +0200 Subject: bpf: Add multi uprobe link Adding new multi uprobe link that allows to attach bpf program to multiple uprobes. Uprobes to attach are specified via new link_create uprobe_multi union: struct { __aligned_u64 path; __aligned_u64 offsets; __aligned_u64 ref_ctr_offsets; __u32 cnt; __u32 flags; } uprobe_multi; Uprobes are defined for single binary specified in path and multiple calling sites specified in offsets array with optional reference counters specified in ref_ctr_offsets array. All specified arrays have length of 'cnt'. The 'flags' supports single bit for now that marks the uprobe as return probe. Acked-by: Andrii Nakryiko Acked-by: Yafang Shao Signed-off-by: Jiri Olsa Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20230809083440.3209381-4-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/trace_events.h | 6 ++ include/uapi/linux/bpf.h | 16 +++ kernel/bpf/syscall.c | 14 ++- kernel/trace/bpf_trace.c | 233 +++++++++++++++++++++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 16 +++ 5 files changed, 282 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index e66d04dbe56a..5b85cf18c350 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -752,6 +752,7 @@ int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, u32 *fd_type, const char **buf, u64 *probe_offset, u64 *probe_addr); int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); +int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); #else static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) { @@ -798,6 +799,11 @@ bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { return -EOPNOTSUPP; } +static inline int +bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + return -EOPNOTSUPP; +} #endif enum { diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a4e55e5e84a7..e48780951fc7 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1039,6 +1039,7 @@ enum bpf_attach_type { BPF_NETFILTER, BPF_TCX_INGRESS, BPF_TCX_EGRESS, + BPF_TRACE_UPROBE_MULTI, __MAX_BPF_ATTACH_TYPE }; @@ -1057,6 +1058,7 @@ enum bpf_link_type { BPF_LINK_TYPE_STRUCT_OPS = 9, BPF_LINK_TYPE_NETFILTER = 10, BPF_LINK_TYPE_TCX = 11, + BPF_LINK_TYPE_UPROBE_MULTI = 12, MAX_BPF_LINK_TYPE, }; @@ -1190,6 +1192,13 @@ enum { BPF_F_KPROBE_MULTI_RETURN = (1U << 0) }; +/* link_create.uprobe_multi.flags used in LINK_CREATE command for + * BPF_TRACE_UPROBE_MULTI attach type to create return probe. + */ +enum { + BPF_F_UPROBE_MULTI_RETURN = (1U << 0) +}; + /* link_create.netfilter.flags used in LINK_CREATE command for * BPF_PROG_TYPE_NETFILTER to enable IP packet defragmentation. */ @@ -1626,6 +1635,13 @@ union bpf_attr { }; __u64 expected_revision; } tcx; + struct { + __aligned_u64 path; + __aligned_u64 offsets; + __aligned_u64 ref_ctr_offsets; + __u32 cnt; + __u32 flags; + } uprobe_multi; }; } link_create; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b14036cccd08..fe291b000814 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2815,10 +2815,12 @@ static void bpf_link_free_id(int id) /* Clean up bpf_link and corresponding anon_inode file and FD. After * anon_inode is created, bpf_link can't be just kfree()'d due to deferred - * anon_inode's release() call. This helper marksbpf_link as + * anon_inode's release() call. This helper marks bpf_link as * defunct, releases anon_inode file and puts reserved FD. bpf_prog's refcnt * is not decremented, it's the responsibility of a calling code that failed * to complete bpf_link initialization. + * This helper eventually calls link's dealloc callback, but does not call + * link's release callback. */ void bpf_link_cleanup(struct bpf_link_primer *primer) { @@ -3756,8 +3758,12 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI && attach_type != BPF_TRACE_KPROBE_MULTI) return -EINVAL; + if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI && + attach_type != BPF_TRACE_UPROBE_MULTI) + return -EINVAL; if (attach_type != BPF_PERF_EVENT && - attach_type != BPF_TRACE_KPROBE_MULTI) + attach_type != BPF_TRACE_KPROBE_MULTI && + attach_type != BPF_TRACE_UPROBE_MULTI) return -EINVAL; return 0; case BPF_PROG_TYPE_SCHED_CLS: @@ -4953,8 +4959,10 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) case BPF_PROG_TYPE_KPROBE: if (attr->link_create.attach_type == BPF_PERF_EVENT) ret = bpf_perf_link_attach(attr, prog); - else + else if (attr->link_create.attach_type == BPF_TRACE_KPROBE_MULTI) ret = bpf_kprobe_multi_link_attach(attr, prog); + else if (attr->link_create.attach_type == BPF_TRACE_UPROBE_MULTI) + ret = bpf_uprobe_multi_link_attach(attr, prog); break; default: ret = -EINVAL; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 792445e1f3f0..f0b54a3480f8 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -23,6 +23,7 @@ #include #include #include +#include #include @@ -2970,3 +2971,235 @@ static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx) return 0; } #endif + +#ifdef CONFIG_UPROBES +struct bpf_uprobe_multi_link; + +struct bpf_uprobe { + struct bpf_uprobe_multi_link *link; + loff_t offset; + struct uprobe_consumer consumer; +}; + +struct bpf_uprobe_multi_link { + struct path path; + struct bpf_link link; + u32 cnt; + struct bpf_uprobe *uprobes; +}; + +struct bpf_uprobe_multi_run_ctx { + struct bpf_run_ctx run_ctx; + unsigned long entry_ip; +}; + +static void bpf_uprobe_unregister(struct path *path, struct bpf_uprobe *uprobes, + u32 cnt) +{ + u32 i; + + for (i = 0; i < cnt; i++) { + uprobe_unregister(d_real_inode(path->dentry), uprobes[i].offset, + &uprobes[i].consumer); + } +} + +static void bpf_uprobe_multi_link_release(struct bpf_link *link) +{ + struct bpf_uprobe_multi_link *umulti_link; + + umulti_link = container_of(link, struct bpf_uprobe_multi_link, link); + bpf_uprobe_unregister(&umulti_link->path, umulti_link->uprobes, umulti_link->cnt); +} + +static void bpf_uprobe_multi_link_dealloc(struct bpf_link *link) +{ + struct bpf_uprobe_multi_link *umulti_link; + + umulti_link = container_of(link, struct bpf_uprobe_multi_link, link); + path_put(&umulti_link->path); + kvfree(umulti_link->uprobes); + kfree(umulti_link); +} + +static const struct bpf_link_ops bpf_uprobe_multi_link_lops = { + .release = bpf_uprobe_multi_link_release, + .dealloc = bpf_uprobe_multi_link_dealloc, +}; + +static int uprobe_prog_run(struct bpf_uprobe *uprobe, + unsigned long entry_ip, + struct pt_regs *regs) +{ + struct bpf_uprobe_multi_link *link = uprobe->link; + struct bpf_uprobe_multi_run_ctx run_ctx = { + .entry_ip = entry_ip, + }; + struct bpf_prog *prog = link->link.prog; + bool sleepable = prog->aux->sleepable; + struct bpf_run_ctx *old_run_ctx; + int err = 0; + + if (sleepable) + rcu_read_lock_trace(); + else + rcu_read_lock(); + + migrate_disable(); + + old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); + err = bpf_prog_run(link->link.prog, regs); + bpf_reset_run_ctx(old_run_ctx); + + migrate_enable(); + + if (sleepable) + rcu_read_unlock_trace(); + else + rcu_read_unlock(); + return err; +} + +static int +uprobe_multi_link_handler(struct uprobe_consumer *con, struct pt_regs *regs) +{ + struct bpf_uprobe *uprobe; + + uprobe = container_of(con, struct bpf_uprobe, consumer); + return uprobe_prog_run(uprobe, instruction_pointer(regs), regs); +} + +static int +uprobe_multi_link_ret_handler(struct uprobe_consumer *con, unsigned long func, struct pt_regs *regs) +{ + struct bpf_uprobe *uprobe; + + uprobe = container_of(con, struct bpf_uprobe, consumer); + return uprobe_prog_run(uprobe, func, regs); +} + +int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct bpf_uprobe_multi_link *link = NULL; + unsigned long __user *uref_ctr_offsets; + unsigned long *ref_ctr_offsets = NULL; + struct bpf_link_primer link_primer; + struct bpf_uprobe *uprobes = NULL; + unsigned long __user *uoffsets; + void __user *upath; + u32 flags, cnt, i; + struct path path; + char *name; + int err; + + /* no support for 32bit archs yet */ + if (sizeof(u64) != sizeof(void *)) + return -EOPNOTSUPP; + + if (prog->expected_attach_type != BPF_TRACE_UPROBE_MULTI) + return -EINVAL; + + flags = attr->link_create.uprobe_multi.flags; + if (flags & ~BPF_F_UPROBE_MULTI_RETURN) + return -EINVAL; + + /* + * path, offsets and cnt are mandatory, + * ref_ctr_offsets is optional + */ + upath = u64_to_user_ptr(attr->link_create.uprobe_multi.path); + uoffsets = u64_to_user_ptr(attr->link_create.uprobe_multi.offsets); + cnt = attr->link_create.uprobe_multi.cnt; + + if (!upath || !uoffsets || !cnt) + return -EINVAL; + + uref_ctr_offsets = u64_to_user_ptr(attr->link_create.uprobe_multi.ref_ctr_offsets); + + name = strndup_user(upath, PATH_MAX); + if (IS_ERR(name)) { + err = PTR_ERR(name); + return err; + } + + err = kern_path(name, LOOKUP_FOLLOW, &path); + kfree(name); + if (err) + return err; + + if (!d_is_reg(path.dentry)) { + err = -EBADF; + goto error_path_put; + } + + err = -ENOMEM; + + link = kzalloc(sizeof(*link), GFP_KERNEL); + uprobes = kvcalloc(cnt, sizeof(*uprobes), GFP_KERNEL); + + if (!uprobes || !link) + goto error_free; + + if (uref_ctr_offsets) { + ref_ctr_offsets = kvcalloc(cnt, sizeof(*ref_ctr_offsets), GFP_KERNEL); + if (!ref_ctr_offsets) + goto error_free; + } + + for (i = 0; i < cnt; i++) { + if (uref_ctr_offsets && __get_user(ref_ctr_offsets[i], uref_ctr_offsets + i)) { + err = -EFAULT; + goto error_free; + } + if (__get_user(uprobes[i].offset, uoffsets + i)) { + err = -EFAULT; + goto error_free; + } + + uprobes[i].link = link; + + if (flags & BPF_F_UPROBE_MULTI_RETURN) + uprobes[i].consumer.ret_handler = uprobe_multi_link_ret_handler; + else + uprobes[i].consumer.handler = uprobe_multi_link_handler; + } + + link->cnt = cnt; + link->uprobes = uprobes; + link->path = path; + + bpf_link_init(&link->link, BPF_LINK_TYPE_UPROBE_MULTI, + &bpf_uprobe_multi_link_lops, prog); + + for (i = 0; i < cnt; i++) { + err = uprobe_register_refctr(d_real_inode(link->path.dentry), + uprobes[i].offset, + ref_ctr_offsets ? ref_ctr_offsets[i] : 0, + &uprobes[i].consumer); + if (err) { + bpf_uprobe_unregister(&path, uprobes, i); + goto error_free; + } + } + + err = bpf_link_prime(&link->link, &link_primer); + if (err) + goto error_free; + + kvfree(ref_ctr_offsets); + return bpf_link_settle(&link_primer); + +error_free: + kvfree(ref_ctr_offsets); + kvfree(uprobes); + kfree(link); +error_path_put: + path_put(&path); + return err; +} +#else /* !CONFIG_UPROBES */ +int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + return -EOPNOTSUPP; +} +#endif /* CONFIG_UPROBES */ diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index a4e55e5e84a7..e48780951fc7 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1039,6 +1039,7 @@ enum bpf_attach_type { BPF_NETFILTER, BPF_TCX_INGRESS, BPF_TCX_EGRESS, + BPF_TRACE_UPROBE_MULTI, __MAX_BPF_ATTACH_TYPE }; @@ -1057,6 +1058,7 @@ enum bpf_link_type { BPF_LINK_TYPE_STRUCT_OPS = 9, BPF_LINK_TYPE_NETFILTER = 10, BPF_LINK_TYPE_TCX = 11, + BPF_LINK_TYPE_UPROBE_MULTI = 12, MAX_BPF_LINK_TYPE, }; @@ -1190,6 +1192,13 @@ enum { BPF_F_KPROBE_MULTI_RETURN = (1U << 0) }; +/* link_create.uprobe_multi.flags used in LINK_CREATE command for + * BPF_TRACE_UPROBE_MULTI attach type to create return probe. + */ +enum { + BPF_F_UPROBE_MULTI_RETURN = (1U << 0) +}; + /* link_create.netfilter.flags used in LINK_CREATE command for * BPF_PROG_TYPE_NETFILTER to enable IP packet defragmentation. */ @@ -1626,6 +1635,13 @@ union bpf_attr { }; __u64 expected_revision; } tcx; + struct { + __aligned_u64 path; + __aligned_u64 offsets; + __aligned_u64 ref_ctr_offsets; + __u32 cnt; + __u32 flags; + } uprobe_multi; }; } link_create; -- cgit v1.2.3 From 0b779b61f651851df5c5c42938a6c441eb1b5100 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 9 Aug 2023 10:34:16 +0200 Subject: bpf: Add cookies support for uprobe_multi link Adding support to specify cookies array for uprobe_multi link. The cookies array share indexes and length with other uprobe_multi arrays (offsets/ref_ctr_offsets). The cookies[i] value defines cookie for i-the uprobe and will be returned by bpf_get_attach_cookie helper when called from ebpf program hooked to that specific uprobe. Acked-by: Andrii Nakryiko Acked-by: Yafang Shao Signed-off-by: Jiri Olsa Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20230809083440.3209381-5-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 1 + kernel/bpf/syscall.c | 2 +- kernel/trace/bpf_trace.c | 45 ++++++++++++++++++++++++++++++++++++++---- tools/include/uapi/linux/bpf.h | 1 + 4 files changed, 44 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e48780951fc7..d7f4f50b1e58 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1639,6 +1639,7 @@ union bpf_attr { __aligned_u64 path; __aligned_u64 offsets; __aligned_u64 ref_ctr_offsets; + __aligned_u64 cookies; __u32 cnt; __u32 flags; } uprobe_multi; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index fe291b000814..ef17cd05262e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4882,7 +4882,7 @@ err_put: return err; } -#define BPF_LINK_CREATE_LAST_FIELD link_create.kprobe_multi.cookies +#define BPF_LINK_CREATE_LAST_FIELD link_create.uprobe_multi.flags static int link_create(union bpf_attr *attr, bpfptr_t uattr) { struct bpf_prog *prog; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index f0b54a3480f8..0d59cac30c7e 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -87,6 +87,8 @@ static int bpf_btf_printf_prepare(struct btf_ptr *ptr, u32 btf_ptr_size, static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx); static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx); +static u64 bpf_uprobe_multi_cookie(struct bpf_run_ctx *ctx); + /** * trace_call_bpf - invoke BPF program * @call: tracepoint event @@ -1104,6 +1106,18 @@ static const struct bpf_func_proto bpf_get_attach_cookie_proto_kmulti = { .arg1_type = ARG_PTR_TO_CTX, }; +BPF_CALL_1(bpf_get_attach_cookie_uprobe_multi, struct pt_regs *, regs) +{ + return bpf_uprobe_multi_cookie(current->bpf_ctx); +} + +static const struct bpf_func_proto bpf_get_attach_cookie_proto_umulti = { + .func = bpf_get_attach_cookie_uprobe_multi, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + BPF_CALL_1(bpf_get_attach_cookie_trace, void *, ctx) { struct bpf_trace_run_ctx *run_ctx; @@ -1550,9 +1564,11 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) &bpf_get_func_ip_proto_kprobe_multi : &bpf_get_func_ip_proto_kprobe; case BPF_FUNC_get_attach_cookie: - return prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI ? - &bpf_get_attach_cookie_proto_kmulti : - &bpf_get_attach_cookie_proto_trace; + if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI) + return &bpf_get_attach_cookie_proto_kmulti; + if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI) + return &bpf_get_attach_cookie_proto_umulti; + return &bpf_get_attach_cookie_proto_trace; default: return bpf_tracing_func_proto(func_id, prog); } @@ -2978,6 +2994,7 @@ struct bpf_uprobe_multi_link; struct bpf_uprobe { struct bpf_uprobe_multi_link *link; loff_t offset; + u64 cookie; struct uprobe_consumer consumer; }; @@ -2991,6 +3008,7 @@ struct bpf_uprobe_multi_link { struct bpf_uprobe_multi_run_ctx { struct bpf_run_ctx run_ctx; unsigned long entry_ip; + struct bpf_uprobe *uprobe; }; static void bpf_uprobe_unregister(struct path *path, struct bpf_uprobe *uprobes, @@ -3034,6 +3052,7 @@ static int uprobe_prog_run(struct bpf_uprobe *uprobe, struct bpf_uprobe_multi_link *link = uprobe->link; struct bpf_uprobe_multi_run_ctx run_ctx = { .entry_ip = entry_ip, + .uprobe = uprobe, }; struct bpf_prog *prog = link->link.prog; bool sleepable = prog->aux->sleepable; @@ -3078,6 +3097,14 @@ uprobe_multi_link_ret_handler(struct uprobe_consumer *con, unsigned long func, s return uprobe_prog_run(uprobe, func, regs); } +static u64 bpf_uprobe_multi_cookie(struct bpf_run_ctx *ctx) +{ + struct bpf_uprobe_multi_run_ctx *run_ctx; + + run_ctx = container_of(current->bpf_ctx, struct bpf_uprobe_multi_run_ctx, run_ctx); + return run_ctx->uprobe->cookie; +} + int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct bpf_uprobe_multi_link *link = NULL; @@ -3086,6 +3113,7 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr struct bpf_link_primer link_primer; struct bpf_uprobe *uprobes = NULL; unsigned long __user *uoffsets; + u64 __user *ucookies; void __user *upath; u32 flags, cnt, i; struct path path; @@ -3105,7 +3133,7 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr /* * path, offsets and cnt are mandatory, - * ref_ctr_offsets is optional + * ref_ctr_offsets and cookies are optional */ upath = u64_to_user_ptr(attr->link_create.uprobe_multi.path); uoffsets = u64_to_user_ptr(attr->link_create.uprobe_multi.offsets); @@ -3115,6 +3143,7 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr return -EINVAL; uref_ctr_offsets = u64_to_user_ptr(attr->link_create.uprobe_multi.ref_ctr_offsets); + ucookies = u64_to_user_ptr(attr->link_create.uprobe_multi.cookies); name = strndup_user(upath, PATH_MAX); if (IS_ERR(name)) { @@ -3147,6 +3176,10 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr } for (i = 0; i < cnt; i++) { + if (ucookies && __get_user(uprobes[i].cookie, ucookies + i)) { + err = -EFAULT; + goto error_free; + } if (uref_ctr_offsets && __get_user(ref_ctr_offsets[i], uref_ctr_offsets + i)) { err = -EFAULT; goto error_free; @@ -3202,4 +3235,8 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr { return -EOPNOTSUPP; } +static u64 bpf_uprobe_multi_cookie(struct bpf_run_ctx *ctx) +{ + return 0; +} #endif /* CONFIG_UPROBES */ diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index e48780951fc7..d7f4f50b1e58 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1639,6 +1639,7 @@ union bpf_attr { __aligned_u64 path; __aligned_u64 offsets; __aligned_u64 ref_ctr_offsets; + __aligned_u64 cookies; __u32 cnt; __u32 flags; } uprobe_multi; -- cgit v1.2.3 From b733eeade4204423711793595c3c8d78a2fa8b2e Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 9 Aug 2023 10:34:17 +0200 Subject: bpf: Add pid filter support for uprobe_multi link Adding support to specify pid for uprobe_multi link and the uprobes are created only for task with given pid value. Using the consumer.filter filter callback for that, so the task gets filtered during the uprobe installation. We still need to check the task during runtime in the uprobe handler, because the handler could get executed if there's another system wide consumer on the same uprobe (thanks Oleg for the insight). Cc: Oleg Nesterov Reviewed-by: Oleg Nesterov Signed-off-by: Jiri Olsa Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20230809083440.3209381-6-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 1 + kernel/bpf/syscall.c | 2 +- kernel/trace/bpf_trace.c | 33 +++++++++++++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 1 + 4 files changed, 36 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d7f4f50b1e58..8790b3962e4b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1642,6 +1642,7 @@ union bpf_attr { __aligned_u64 cookies; __u32 cnt; __u32 flags; + __u32 pid; } uprobe_multi; }; } link_create; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ef17cd05262e..10666d17b9e3 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4882,7 +4882,7 @@ err_put: return err; } -#define BPF_LINK_CREATE_LAST_FIELD link_create.uprobe_multi.flags +#define BPF_LINK_CREATE_LAST_FIELD link_create.uprobe_multi.pid static int link_create(union bpf_attr *attr, bpfptr_t uattr) { struct bpf_prog *prog; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0d59cac30c7e..e45aebed62f5 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -3003,6 +3003,7 @@ struct bpf_uprobe_multi_link { struct bpf_link link; u32 cnt; struct bpf_uprobe *uprobes; + struct task_struct *task; }; struct bpf_uprobe_multi_run_ctx { @@ -3035,6 +3036,8 @@ static void bpf_uprobe_multi_link_dealloc(struct bpf_link *link) struct bpf_uprobe_multi_link *umulti_link; umulti_link = container_of(link, struct bpf_uprobe_multi_link, link); + if (umulti_link->task) + put_task_struct(umulti_link->task); path_put(&umulti_link->path); kvfree(umulti_link->uprobes); kfree(umulti_link); @@ -3059,6 +3062,9 @@ static int uprobe_prog_run(struct bpf_uprobe *uprobe, struct bpf_run_ctx *old_run_ctx; int err = 0; + if (link->task && current != link->task) + return 0; + if (sleepable) rcu_read_lock_trace(); else @@ -3079,6 +3085,16 @@ static int uprobe_prog_run(struct bpf_uprobe *uprobe, return err; } +static bool +uprobe_multi_link_filter(struct uprobe_consumer *con, enum uprobe_filter_ctx ctx, + struct mm_struct *mm) +{ + struct bpf_uprobe *uprobe; + + uprobe = container_of(con, struct bpf_uprobe, consumer); + return uprobe->link->task->mm == mm; +} + static int uprobe_multi_link_handler(struct uprobe_consumer *con, struct pt_regs *regs) { @@ -3112,12 +3128,14 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr unsigned long *ref_ctr_offsets = NULL; struct bpf_link_primer link_primer; struct bpf_uprobe *uprobes = NULL; + struct task_struct *task = NULL; unsigned long __user *uoffsets; u64 __user *ucookies; void __user *upath; u32 flags, cnt, i; struct path path; char *name; + pid_t pid; int err; /* no support for 32bit archs yet */ @@ -3161,6 +3179,15 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr goto error_path_put; } + pid = attr->link_create.uprobe_multi.pid; + if (pid) { + rcu_read_lock(); + task = get_pid_task(find_vpid(pid), PIDTYPE_PID); + rcu_read_unlock(); + if (!task) + goto error_path_put; + } + err = -ENOMEM; link = kzalloc(sizeof(*link), GFP_KERNEL); @@ -3195,11 +3222,15 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr uprobes[i].consumer.ret_handler = uprobe_multi_link_ret_handler; else uprobes[i].consumer.handler = uprobe_multi_link_handler; + + if (pid) + uprobes[i].consumer.filter = uprobe_multi_link_filter; } link->cnt = cnt; link->uprobes = uprobes; link->path = path; + link->task = task; bpf_link_init(&link->link, BPF_LINK_TYPE_UPROBE_MULTI, &bpf_uprobe_multi_link_lops, prog); @@ -3226,6 +3257,8 @@ error_free: kvfree(ref_ctr_offsets); kvfree(uprobes); kfree(link); + if (task) + put_task_struct(task); error_path_put: path_put(&path); return err; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index d7f4f50b1e58..8790b3962e4b 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1642,6 +1642,7 @@ union bpf_attr { __aligned_u64 cookies; __u32 cnt; __u32 flags; + __u32 pid; } uprobe_multi; }; } link_create; -- cgit v1.2.3 From 2a6d50b50d6d589d43a90d6ca990b8b811e67701 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Mon, 21 Aug 2023 12:33:06 -0700 Subject: bpf: Consider non-owning refs trusted Recent discussions around default kptr "trustedness" led to changes such as commit 6fcd486b3a0a ("bpf: Refactor RCU enforcement in the verifier."). One of the conclusions of those discussions, as expressed in code and comments in that patch, is that we'd like to move away from 'raw' PTR_TO_BTF_ID without some type flag or other register state indicating trustedness. Although PTR_TRUSTED and PTR_UNTRUSTED flags mark this state explicitly, the verifier currently considers trustedness implied by other register state. For example, owning refs to graph collection nodes must have a nonzero ref_obj_id, so they pass the is_trusted_reg check despite having no explicit PTR_{UN}TRUSTED flag. This patch makes trustedness of non-owning refs to graph collection nodes explicit as well. By definition, non-owning refs are currently trusted. Although the ref has no control over pointee lifetime, due to non-owning ref clobbering rules (see invalidate_non_owning_refs) dereferencing a non-owning ref is safe in the critical section controlled by bpf_spin_lock associated with its owning collection. Note that the previous statement does not hold true for nodes with shared ownership due to the use-after-free issue that this series is addressing. True shared ownership was disabled by commit 7deca5eae833 ("bpf: Disable bpf_refcount_acquire kfunc calls until race conditions are fixed"), though, so the statement holds for now. Further patches in the series will change the trustedness state of non-owning refs before re-enabling bpf_refcount_acquire. Let's add NON_OWN_REF type flag to BPF_REG_TRUSTED_MODIFIERS such that a non-owning ref reg state would pass is_trusted_reg check. Somewhat surprisingly, this doesn't result in any change to user-visible functionality elsewhere in the verifier: graph collection nodes are all marked MEM_ALLOC, which tends to be handled in separate codepaths from "raw" PTR_TO_BTF_ID. Regardless, let's be explicit here and document the current state of things before changing it elsewhere in the series. Signed-off-by: Dave Marchevsky Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20230821193311.3290257-3-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index f70f9ac884d2..b6e58dab8e27 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -745,7 +745,7 @@ static inline bool bpf_prog_check_recur(const struct bpf_prog *prog) } } -#define BPF_REG_TRUSTED_MODIFIERS (MEM_ALLOC | PTR_TRUSTED) +#define BPF_REG_TRUSTED_MODIFIERS (MEM_ALLOC | PTR_TRUSTED | NON_OWN_REF) static inline bool bpf_type_has_unsafe_modifiers(u32 type) { -- cgit v1.2.3 From 0816b8c6bf7fc87cec4273dc199e8f0764b9e7b1 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Mon, 21 Aug 2023 12:33:09 -0700 Subject: bpf: Consider non-owning refs to refcounted nodes RCU protected An earlier patch in the series ensures that the underlying memory of nodes with bpf_refcount - which can have multiple owners - is not reused until RCU grace period has elapsed. This prevents use-after-free with non-owning references that may point to recently-freed memory. While RCU read lock is held, it's safe to dereference such a non-owning ref, as by definition RCU GP couldn't have elapsed and therefore underlying memory couldn't have been reused. From the perspective of verifier "trustedness" non-owning refs to refcounted nodes are now trusted only in RCU CS and therefore should no longer pass is_trusted_reg, but rather is_rcu_reg. Let's mark them MEM_RCU in order to reflect this new state. Signed-off-by: Dave Marchevsky Link: https://lore.kernel.org/r/20230821193311.3290257-6-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 ++- kernel/bpf/verifier.c | 13 ++++++++++++- 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index eced6400f778..12596af59c00 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -653,7 +653,8 @@ enum bpf_type_flag { MEM_RCU = BIT(13 + BPF_BASE_TYPE_BITS), /* Used to tag PTR_TO_BTF_ID | MEM_ALLOC references which are non-owning. - * Currently only valid for linked-list and rbtree nodes. + * Currently only valid for linked-list and rbtree nodes. If the nodes + * have a bpf_refcount_field, they must be tagged MEM_RCU as well. */ NON_OWN_REF = BIT(14 + BPF_BASE_TYPE_BITS), diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b875f511c3b7..4b638eb1bdad 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8007,6 +8007,7 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, case PTR_TO_BTF_ID | PTR_TRUSTED: case PTR_TO_BTF_ID | MEM_RCU: case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF: + case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF | MEM_RCU: /* When referenced PTR_TO_BTF_ID is passed to release function, * its fixed offset must be 0. In the other cases, fixed offset * can be non-zero. This was already checked above. So pass @@ -10473,6 +10474,7 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env, static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { struct bpf_verifier_state *state = env->cur_state; + struct btf_record *rec = reg_btf_record(reg); if (!state->active_lock.ptr) { verbose(env, "verifier internal error: ref_set_non_owning w/o active lock\n"); @@ -10485,6 +10487,9 @@ static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state } reg->type |= NON_OWN_REF; + if (rec->refcount_off >= 0) + reg->type |= MEM_RCU; + return 0; } @@ -11322,6 +11327,11 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, struct bpf_func_state *state; struct bpf_reg_state *reg; + if (in_rbtree_lock_required_cb(env) && (rcu_lock || rcu_unlock)) { + verbose(env, "Calling bpf_rcu_read_{lock,unlock} in unnecessary rbtree callback\n"); + return -EACCES; + } + if (rcu_lock) { verbose(env, "nested rcu read lock (kernel function %s)\n", func_name); return -EINVAL; @@ -16684,7 +16694,8 @@ static int do_check(struct bpf_verifier_env *env) return -EINVAL; } - if (env->cur_state->active_rcu_lock) { + if (env->cur_state->active_rcu_lock && + !in_rbtree_lock_required_cb(env)) { verbose(env, "bpf_rcu_read_unlock is missing\n"); return -EINVAL; } -- cgit v1.2.3