From 324bda9e6c5add86ba2e1066476481c48132aca0 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 2 Oct 2017 22:50:21 -0700 Subject: bpf: multi program support for cgroup+bpf introduce BPF_F_ALLOW_MULTI flag that can be used to attach multiple bpf programs to a cgroup. The difference between three possible flags for BPF_PROG_ATTACH command: - NONE(default): No further bpf programs allowed in the subtree. - BPF_F_ALLOW_OVERRIDE: If a sub-cgroup installs some bpf program, the program in this cgroup yields to sub-cgroup program. - BPF_F_ALLOW_MULTI: If a sub-cgroup installs some bpf program, that cgroup program gets run in addition to the program in this cgroup. NONE and BPF_F_ALLOW_OVERRIDE existed before. This patch doesn't change their behavior. It only clarifies the semantics in relation to new flag. Only one program is allowed to be attached to a cgroup with NONE or BPF_F_ALLOW_OVERRIDE flag. Multiple programs are allowed to be attached to a cgroup with BPF_F_ALLOW_MULTI flag. They are executed in FIFO order (those that were attached first, run first) The programs of sub-cgroup are executed first, then programs of this cgroup and then programs of parent cgroup. All eligible programs are executed regardless of return code from earlier programs. To allow efficient execution of multiple programs attached to a cgroup and to avoid penalizing cgroups without any programs attached introduce 'struct bpf_prog_array' which is RCU protected array of pointers to bpf programs. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Acked-by: Martin KaFai Lau for cgroup bits Acked-by: Tejun Heo Signed-off-by: David S. Miller --- kernel/bpf/cgroup.c | 467 +++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 336 insertions(+), 131 deletions(-) (limited to 'kernel/bpf/cgroup.c') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 546113430049..6b7500bbdb53 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -27,129 +27,361 @@ void cgroup_bpf_put(struct cgroup *cgrp) { unsigned int type; - for (type = 0; type < ARRAY_SIZE(cgrp->bpf.prog); type++) { - struct bpf_prog *prog = cgrp->bpf.prog[type]; - - if (prog) { - bpf_prog_put(prog); + for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) { + struct list_head *progs = &cgrp->bpf.progs[type]; + struct bpf_prog_list *pl, *tmp; + + list_for_each_entry_safe(pl, tmp, progs, node) { + list_del(&pl->node); + bpf_prog_put(pl->prog); + kfree(pl); static_branch_dec(&cgroup_bpf_enabled_key); } + bpf_prog_array_free(cgrp->bpf.effective[type]); + } +} + +/* count number of elements in the list. + * it's slow but the list cannot be long + */ +static u32 prog_list_length(struct list_head *head) +{ + struct bpf_prog_list *pl; + u32 cnt = 0; + + list_for_each_entry(pl, head, node) { + if (!pl->prog) + continue; + cnt++; } + return cnt; +} + +/* if parent has non-overridable prog attached, + * disallow attaching new programs to the descendent cgroup. + * if parent has overridable or multi-prog, allow attaching + */ +static bool hierarchy_allows_attach(struct cgroup *cgrp, + enum bpf_attach_type type, + u32 new_flags) +{ + struct cgroup *p; + + p = cgroup_parent(cgrp); + if (!p) + return true; + do { + u32 flags = p->bpf.flags[type]; + u32 cnt; + + if (flags & BPF_F_ALLOW_MULTI) + return true; + cnt = prog_list_length(&p->bpf.progs[type]); + WARN_ON_ONCE(cnt > 1); + if (cnt == 1) + return !!(flags & BPF_F_ALLOW_OVERRIDE); + p = cgroup_parent(p); + } while (p); + return true; +} + +/* compute a chain of effective programs for a given cgroup: + * start from the list of programs in this cgroup and add + * all parent programs. + * Note that parent's F_ALLOW_OVERRIDE-type program is yielding + * to programs in this cgroup + */ +static int compute_effective_progs(struct cgroup *cgrp, + enum bpf_attach_type type, + struct bpf_prog_array __rcu **array) +{ + struct bpf_prog_array __rcu *progs; + struct bpf_prog_list *pl; + struct cgroup *p = cgrp; + int cnt = 0; + + /* count number of effective programs by walking parents */ + do { + if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI)) + cnt += prog_list_length(&p->bpf.progs[type]); + p = cgroup_parent(p); + } while (p); + + progs = bpf_prog_array_alloc(cnt, GFP_KERNEL); + if (!progs) + return -ENOMEM; + + /* populate the array with effective progs */ + cnt = 0; + p = cgrp; + do { + if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI)) + list_for_each_entry(pl, + &p->bpf.progs[type], node) { + if (!pl->prog) + continue; + rcu_dereference_protected(progs, 1)-> + progs[cnt++] = pl->prog; + } + p = cgroup_parent(p); + } while (p); + + *array = progs; + return 0; +} + +static void activate_effective_progs(struct cgroup *cgrp, + enum bpf_attach_type type, + struct bpf_prog_array __rcu *array) +{ + struct bpf_prog_array __rcu *old_array; + + old_array = xchg(&cgrp->bpf.effective[type], array); + /* free prog array after grace period, since __cgroup_bpf_run_*() + * might be still walking the array + */ + bpf_prog_array_free(old_array); } /** * cgroup_bpf_inherit() - inherit effective programs from parent * @cgrp: the cgroup to modify - * @parent: the parent to inherit from */ -void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent) +int cgroup_bpf_inherit(struct cgroup *cgrp) { - unsigned int type; +/* has to use marco instead of const int, since compiler thinks + * that array below is variable length + */ +#define NR ARRAY_SIZE(cgrp->bpf.effective) + struct bpf_prog_array __rcu *arrays[NR] = {}; + int i; - for (type = 0; type < ARRAY_SIZE(cgrp->bpf.effective); type++) { - struct bpf_prog *e; + for (i = 0; i < NR; i++) + INIT_LIST_HEAD(&cgrp->bpf.progs[i]); - e = rcu_dereference_protected(parent->bpf.effective[type], - lockdep_is_held(&cgroup_mutex)); - rcu_assign_pointer(cgrp->bpf.effective[type], e); - cgrp->bpf.disallow_override[type] = parent->bpf.disallow_override[type]; - } + for (i = 0; i < NR; i++) + if (compute_effective_progs(cgrp, i, &arrays[i])) + goto cleanup; + + for (i = 0; i < NR; i++) + activate_effective_progs(cgrp, i, arrays[i]); + + return 0; +cleanup: + for (i = 0; i < NR; i++) + bpf_prog_array_free(arrays[i]); + return -ENOMEM; } +#define BPF_CGROUP_MAX_PROGS 64 + /** - * __cgroup_bpf_update() - Update the pinned program of a cgroup, and + * __cgroup_bpf_attach() - Attach the program to a cgroup, and * propagate the change to descendants * @cgrp: The cgroup which descendants to traverse - * @parent: The parent of @cgrp, or %NULL if @cgrp is the root - * @prog: A new program to pin - * @type: Type of pinning operation (ingress/egress) - * - * Each cgroup has a set of two pointers for bpf programs; one for eBPF - * programs it owns, and which is effective for execution. - * - * If @prog is not %NULL, this function attaches a new program to the cgroup - * and releases the one that is currently attached, if any. @prog is then made - * the effective program of type @type in that cgroup. - * - * If @prog is %NULL, the currently attached program of type @type is released, - * and the effective program of the parent cgroup (if any) is inherited to - * @cgrp. - * - * Then, the descendants of @cgrp are walked and the effective program for - * each of them is set to the effective program of @cgrp unless the - * descendant has its own program attached, in which case the subbranch is - * skipped. This ensures that delegated subcgroups with own programs are left - * untouched. + * @prog: A program to attach + * @type: Type of attach operation * * Must be called with cgroup_mutex held. */ -int __cgroup_bpf_update(struct cgroup *cgrp, struct cgroup *parent, - struct bpf_prog *prog, enum bpf_attach_type type, - bool new_overridable) +int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, + enum bpf_attach_type type, u32 flags) { - struct bpf_prog *old_prog, *effective = NULL; - struct cgroup_subsys_state *pos; - bool overridable = true; - - if (parent) { - overridable = !parent->bpf.disallow_override[type]; - effective = rcu_dereference_protected(parent->bpf.effective[type], - lockdep_is_held(&cgroup_mutex)); - } - - if (prog && effective && !overridable) - /* if parent has non-overridable prog attached, disallow - * attaching new programs to descendent cgroup - */ + struct list_head *progs = &cgrp->bpf.progs[type]; + struct bpf_prog *old_prog = NULL; + struct cgroup_subsys_state *css; + struct bpf_prog_list *pl; + bool pl_was_allocated; + u32 old_flags; + int err; + + if ((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) + /* invalid combination */ + return -EINVAL; + + if (!hierarchy_allows_attach(cgrp, type, flags)) return -EPERM; - if (prog && effective && overridable != new_overridable) - /* if parent has overridable prog attached, only - * allow overridable programs in descendent cgroup + if (!list_empty(progs) && cgrp->bpf.flags[type] != flags) + /* Disallow attaching non-overridable on top + * of existing overridable in this cgroup. + * Disallow attaching multi-prog if overridable or none */ return -EPERM; - old_prog = cgrp->bpf.prog[type]; - - if (prog) { - overridable = new_overridable; - effective = prog; - if (old_prog && - cgrp->bpf.disallow_override[type] == new_overridable) - /* disallow attaching non-overridable on top - * of existing overridable in this cgroup - * and vice versa - */ - return -EPERM; + if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS) + return -E2BIG; + + if (flags & BPF_F_ALLOW_MULTI) { + list_for_each_entry(pl, progs, node) + if (pl->prog == prog) + /* disallow attaching the same prog twice */ + return -EINVAL; + + pl = kmalloc(sizeof(*pl), GFP_KERNEL); + if (!pl) + return -ENOMEM; + pl_was_allocated = true; + pl->prog = prog; + list_add_tail(&pl->node, progs); + } else { + if (list_empty(progs)) { + pl = kmalloc(sizeof(*pl), GFP_KERNEL); + if (!pl) + return -ENOMEM; + pl_was_allocated = true; + list_add_tail(&pl->node, progs); + } else { + pl = list_first_entry(progs, typeof(*pl), node); + old_prog = pl->prog; + pl_was_allocated = false; + } + pl->prog = prog; } - if (!prog && !old_prog) - /* report error when trying to detach and nothing is attached */ - return -ENOENT; + old_flags = cgrp->bpf.flags[type]; + cgrp->bpf.flags[type] = flags; - cgrp->bpf.prog[type] = prog; + /* allocate and recompute effective prog arrays */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); - css_for_each_descendant_pre(pos, &cgrp->self) { - struct cgroup *desc = container_of(pos, struct cgroup, self); - - /* skip the subtree if the descendant has its own program */ - if (desc->bpf.prog[type] && desc != cgrp) { - pos = css_rightmost_descendant(pos); - } else { - rcu_assign_pointer(desc->bpf.effective[type], - effective); - desc->bpf.disallow_override[type] = !overridable; - } + err = compute_effective_progs(desc, type, &desc->bpf.inactive); + if (err) + goto cleanup; } - if (prog) - static_branch_inc(&cgroup_bpf_enabled_key); + /* all allocations were successful. Activate all prog arrays */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + activate_effective_progs(desc, type, desc->bpf.inactive); + desc->bpf.inactive = NULL; + } + + static_branch_inc(&cgroup_bpf_enabled_key); if (old_prog) { bpf_prog_put(old_prog); static_branch_dec(&cgroup_bpf_enabled_key); } return 0; + +cleanup: + /* oom while computing effective. Free all computed effective arrays + * since they were not activated + */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + bpf_prog_array_free(desc->bpf.inactive); + desc->bpf.inactive = NULL; + } + + /* and cleanup the prog list */ + pl->prog = old_prog; + if (pl_was_allocated) { + list_del(&pl->node); + kfree(pl); + } + return err; +} + +/** + * __cgroup_bpf_detach() - Detach the program from a cgroup, and + * propagate the change to descendants + * @cgrp: The cgroup which descendants to traverse + * @prog: A program to detach or NULL + * @type: Type of detach operation + * + * Must be called with cgroup_mutex held. + */ +int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, + enum bpf_attach_type type, u32 unused_flags) +{ + struct list_head *progs = &cgrp->bpf.progs[type]; + u32 flags = cgrp->bpf.flags[type]; + struct bpf_prog *old_prog = NULL; + struct cgroup_subsys_state *css; + struct bpf_prog_list *pl; + int err; + + if (flags & BPF_F_ALLOW_MULTI) { + if (!prog) + /* to detach MULTI prog the user has to specify valid FD + * of the program to be detached + */ + return -EINVAL; + } else { + if (list_empty(progs)) + /* report error when trying to detach and nothing is attached */ + return -ENOENT; + } + + if (flags & BPF_F_ALLOW_MULTI) { + /* find the prog and detach it */ + list_for_each_entry(pl, progs, node) { + if (pl->prog != prog) + continue; + old_prog = prog; + /* mark it deleted, so it's ignored while + * recomputing effective + */ + pl->prog = NULL; + break; + } + if (!old_prog) + return -ENOENT; + } else { + /* to maintain backward compatibility NONE and OVERRIDE cgroups + * allow detaching with invalid FD (prog==NULL) + */ + pl = list_first_entry(progs, typeof(*pl), node); + old_prog = pl->prog; + pl->prog = NULL; + } + + /* allocate and recompute effective prog arrays */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + err = compute_effective_progs(desc, type, &desc->bpf.inactive); + if (err) + goto cleanup; + } + + /* all allocations were successful. Activate all prog arrays */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + activate_effective_progs(desc, type, desc->bpf.inactive); + desc->bpf.inactive = NULL; + } + + /* now can actually delete it from this cgroup list */ + list_del(&pl->node); + kfree(pl); + if (list_empty(progs)) + /* last program was detached, reset flags to zero */ + cgrp->bpf.flags[type] = 0; + + bpf_prog_put(old_prog); + static_branch_dec(&cgroup_bpf_enabled_key); + return 0; + +cleanup: + /* oom while computing effective. Free all computed effective arrays + * since they were not activated + */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + bpf_prog_array_free(desc->bpf.inactive); + desc->bpf.inactive = NULL; + } + + /* and restore back old_prog */ + pl->prog = old_prog; + return err; } /** @@ -171,36 +403,26 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, struct sk_buff *skb, enum bpf_attach_type type) { - struct bpf_prog *prog; + unsigned int offset = skb->data - skb_network_header(skb); + struct sock *save_sk; struct cgroup *cgrp; - int ret = 0; + int ret; if (!sk || !sk_fullsock(sk)) return 0; - if (sk->sk_family != AF_INET && - sk->sk_family != AF_INET6) + if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) return 0; cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - - rcu_read_lock(); - - prog = rcu_dereference(cgrp->bpf.effective[type]); - if (prog) { - unsigned int offset = skb->data - skb_network_header(skb); - struct sock *save_sk = skb->sk; - - skb->sk = sk; - __skb_push(skb, offset); - ret = bpf_prog_run_save_cb(prog, skb) == 1 ? 0 : -EPERM; - __skb_pull(skb, offset); - skb->sk = save_sk; - } - - rcu_read_unlock(); - - return ret; + save_sk = skb->sk; + skb->sk = sk; + __skb_push(skb, offset); + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb, + bpf_prog_run_save_cb); + __skb_pull(skb, offset); + skb->sk = save_sk; + return ret == 1 ? 0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb); @@ -221,19 +443,10 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk, enum bpf_attach_type type) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - struct bpf_prog *prog; - int ret = 0; - - - rcu_read_lock(); - - prog = rcu_dereference(cgrp->bpf.effective[type]); - if (prog) - ret = BPF_PROG_RUN(prog, sk) == 1 ? 0 : -EPERM; + int ret; - rcu_read_unlock(); - - return ret; + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sk, BPF_PROG_RUN); + return ret == 1 ? 0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); @@ -258,18 +471,10 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, enum bpf_attach_type type) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - struct bpf_prog *prog; - int ret = 0; - - - rcu_read_lock(); - - prog = rcu_dereference(cgrp->bpf.effective[type]); - if (prog) - ret = BPF_PROG_RUN(prog, sock_ops) == 1 ? 0 : -EPERM; - - rcu_read_unlock(); + int ret; - return ret; + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sock_ops, + BPF_PROG_RUN); + return ret == 1 ? 0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops); -- cgit v1.2.3 From 468e2f64d220fe2dc11caa2bcb9b3a1e50fc7321 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 2 Oct 2017 22:50:22 -0700 Subject: bpf: introduce BPF_PROG_QUERY command introduce BPF_PROG_QUERY command to retrieve a set of either attached programs to given cgroup or a set of effective programs that will execute for events within a cgroup Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Acked-by: Martin KaFai Lau for cgroup bits Acked-by: Tejun Heo Signed-off-by: David S. Miller --- include/linux/bpf-cgroup.h | 4 ++++ include/linux/bpf.h | 3 +++ include/uapi/linux/bpf.h | 13 +++++++++++++ kernel/bpf/cgroup.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/core.c | 38 ++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 34 ++++++++++++++++++++++++++++++++++ kernel/cgroup/cgroup.c | 10 ++++++++++ 7 files changed, 148 insertions(+) (limited to 'kernel/bpf/cgroup.c') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 102e56fbb6de..359b6f5d3d90 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -44,12 +44,16 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type, u32 flags); int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type, u32 flags); +int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, + union bpf_attr __user *uattr); /* Wrapper for __cgroup_bpf_*() protected by cgroup_mutex */ int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type, u32 flags); int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type, u32 flags); +int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, + union bpf_attr __user *uattr); int __cgroup_bpf_run_filter_skb(struct sock *sk, struct sk_buff *skb, diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a6964b75f070..a67daea731ab 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -260,6 +260,9 @@ struct bpf_prog_array { struct bpf_prog_array __rcu *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags); void bpf_prog_array_free(struct bpf_prog_array __rcu *progs); +int bpf_prog_array_length(struct bpf_prog_array __rcu *progs); +int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, + __u32 __user *prog_ids, u32 cnt); #define BPF_PROG_RUN_ARRAY(array, ctx, func) \ ({ \ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 762f74bc6c47..cb2b9f95160a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -92,6 +92,7 @@ enum bpf_cmd { BPF_PROG_GET_FD_BY_ID, BPF_MAP_GET_FD_BY_ID, BPF_OBJ_GET_INFO_BY_FD, + BPF_PROG_QUERY, }; enum bpf_map_type { @@ -211,6 +212,9 @@ enum bpf_attach_type { /* Specify numa node during map creation */ #define BPF_F_NUMA_NODE (1U << 2) +/* flags for BPF_PROG_QUERY */ +#define BPF_F_QUERY_EFFECTIVE (1U << 0) + #define BPF_OBJ_NAME_LEN 16U union bpf_attr { @@ -289,6 +293,15 @@ union bpf_attr { __u32 info_len; __aligned_u64 info; } info; + + struct { /* anonymous struct used by BPF_PROG_QUERY command */ + __u32 target_fd; /* container object to query */ + __u32 attach_type; + __u32 query_flags; + __u32 attach_flags; + __aligned_u64 prog_ids; + __u32 prog_cnt; + } query; } __attribute__((aligned(8))); /* BPF helper function descriptions: diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 6b7500bbdb53..e88abc0865d5 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -384,6 +384,52 @@ cleanup: return err; } +/* Must be called with cgroup_mutex held to avoid races. */ +int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); + enum bpf_attach_type type = attr->query.attach_type; + struct list_head *progs = &cgrp->bpf.progs[type]; + u32 flags = cgrp->bpf.flags[type]; + int cnt, ret = 0, i; + + if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) + cnt = bpf_prog_array_length(cgrp->bpf.effective[type]); + else + cnt = prog_list_length(progs); + + if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) + return -EFAULT; + if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt))) + return -EFAULT; + if (attr->query.prog_cnt == 0 || !prog_ids || !cnt) + /* return early if user requested only program count + flags */ + return 0; + if (attr->query.prog_cnt < cnt) { + cnt = attr->query.prog_cnt; + ret = -ENOSPC; + } + + if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) { + return bpf_prog_array_copy_to_user(cgrp->bpf.effective[type], + prog_ids, cnt); + } else { + struct bpf_prog_list *pl; + u32 id; + + i = 0; + list_for_each_entry(pl, progs, node) { + id = pl->prog->aux->id; + if (copy_to_user(prog_ids + i, &id, sizeof(id))) + return -EFAULT; + if (++i == cnt) + break; + } + } + return ret; +} + /** * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering * @sk: The socket sending or receiving traffic diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 6b49e1991ae7..eba966c09053 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1412,6 +1412,44 @@ void bpf_prog_array_free(struct bpf_prog_array __rcu *progs) kfree_rcu(progs, rcu); } +int bpf_prog_array_length(struct bpf_prog_array __rcu *progs) +{ + struct bpf_prog **prog; + u32 cnt = 0; + + rcu_read_lock(); + prog = rcu_dereference(progs)->progs; + for (; *prog; prog++) + cnt++; + rcu_read_unlock(); + return cnt; +} + +int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, + __u32 __user *prog_ids, u32 cnt) +{ + struct bpf_prog **prog; + u32 i = 0, id; + + rcu_read_lock(); + prog = rcu_dereference(progs)->progs; + for (; *prog; prog++) { + id = (*prog)->aux->id; + if (copy_to_user(prog_ids + i, &id, sizeof(id))) { + rcu_read_unlock(); + return -EFAULT; + } + if (++i == cnt) { + prog++; + break; + } + } + rcu_read_unlock(); + if (*prog) + return -ENOSPC; + return 0; +} + static void bpf_prog_free_deferred(struct work_struct *work) { struct bpf_prog_aux *aux; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 51bee695d32c..0048cb24ba7b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1272,6 +1272,37 @@ static int bpf_prog_detach(const union bpf_attr *attr) return ret; } +#define BPF_PROG_QUERY_LAST_FIELD query.prog_cnt + +static int bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct cgroup *cgrp; + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (CHECK_ATTR(BPF_PROG_QUERY)) + return -EINVAL; + if (attr->query.query_flags & ~BPF_F_QUERY_EFFECTIVE) + return -EINVAL; + + switch (attr->query.attach_type) { + case BPF_CGROUP_INET_INGRESS: + case BPF_CGROUP_INET_EGRESS: + case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_SOCK_OPS: + break; + default: + return -EINVAL; + } + cgrp = cgroup_get_from_fd(attr->query.target_fd); + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + ret = cgroup_bpf_query(cgrp, attr, uattr); + cgroup_put(cgrp); + return ret; +} #endif /* CONFIG_CGROUP_BPF */ #define BPF_PROG_TEST_RUN_LAST_FIELD test.duration @@ -1568,6 +1599,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_PROG_DETACH: err = bpf_prog_detach(&attr); break; + case BPF_PROG_QUERY: + err = bpf_prog_query(&attr, uattr); + break; #endif case BPF_PROG_TEST_RUN: err = bpf_prog_test_run(&attr, uattr); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 57eb866ae78d..269512b94a94 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5761,4 +5761,14 @@ int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, mutex_unlock(&cgroup_mutex); return ret; } +int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + int ret; + + mutex_lock(&cgroup_mutex); + ret = __cgroup_bpf_query(cgrp, attr, uattr); + mutex_unlock(&cgroup_mutex); + return ret; +} #endif /* CONFIG_CGROUP_BPF */ -- cgit v1.2.3 From 952925dec0f276b407b2abce4aee82cba7c700c3 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 11 Oct 2017 11:56:23 +0100 Subject: bpf: remove redundant variable old_flags Variable old_flags is being assigned but is never read; it is redundant and can be removed. Cleans up clang warning: Value stored to 'old_flags' is never read Signed-off-by: Colin Ian King Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/cgroup.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel/bpf/cgroup.c') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index e88abc0865d5..3db5a17fcfe8 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -192,7 +192,6 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, struct cgroup_subsys_state *css; struct bpf_prog_list *pl; bool pl_was_allocated; - u32 old_flags; int err; if ((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) @@ -239,7 +238,6 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, pl->prog = prog; } - old_flags = cgrp->bpf.flags[type]; cgrp->bpf.flags[type] = flags; /* allocate and recompute effective prog arrays */ -- cgit v1.2.3 From ebc614f687369f9df99828572b1d85a7c2de3d92 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Sun, 5 Nov 2017 08:15:32 -0500 Subject: bpf, cgroup: implement eBPF-based device controller for cgroup v2 Cgroup v2 lacks the device controller, provided by cgroup v1. This patch adds a new eBPF program type, which in combination of previously added ability to attach multiple eBPF programs to a cgroup, will provide a similar functionality, but with some additional flexibility. This patch introduces a BPF_PROG_TYPE_CGROUP_DEVICE program type. A program takes major and minor device numbers, device type (block/character) and access type (mknod/read/write) as parameters and returns an integer which defines if the operation should be allowed or terminated with -EPERM. Signed-off-by: Roman Gushchin Acked-by: Alexei Starovoitov Acked-by: Tejun Heo Cc: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf-cgroup.h | 15 ++++++++++ include/linux/bpf_types.h | 3 ++ include/linux/device_cgroup.h | 8 ++++- include/uapi/linux/bpf.h | 15 ++++++++++ kernel/bpf/cgroup.c | 67 ++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 7 +++++ kernel/bpf/verifier.c | 1 + tools/include/uapi/linux/bpf.h | 15 ++++++++++ 8 files changed, 130 insertions(+), 1 deletion(-) (limited to 'kernel/bpf/cgroup.c') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 87a7db9feb38..a7f16e0f8d68 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -67,6 +67,9 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, struct bpf_sock_ops_kern *sock_ops, enum bpf_attach_type type); +int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, + short access, enum bpf_attach_type type); + /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */ #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \ ({ \ @@ -112,6 +115,17 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, } \ __ret; \ }) + +#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type, major, minor, access) \ +({ \ + int __ret = 0; \ + if (cgroup_bpf_enabled) \ + __ret = __cgroup_bpf_check_dev_permission(type, major, minor, \ + access, \ + BPF_CGROUP_DEVICE); \ + \ + __ret; \ +}) #else struct cgroup_bpf {}; @@ -122,6 +136,7 @@ static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; }) #endif /* CONFIG_CGROUP_BPF */ diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 53c5b9ad7220..978c1d9c9383 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -19,6 +19,9 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe) BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint) BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event) #endif +#ifdef CONFIG_CGROUP_BPF +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev) +#endif BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops) diff --git a/include/linux/device_cgroup.h b/include/linux/device_cgroup.h index 2d93d7ecd479..8557efe096dc 100644 --- a/include/linux/device_cgroup.h +++ b/include/linux/device_cgroup.h @@ -1,5 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include +#include #define DEVCG_ACC_MKNOD 1 #define DEVCG_ACC_READ 2 @@ -19,10 +20,15 @@ static inline int __devcgroup_check_permission(short type, u32 major, u32 minor, { return 0; } #endif -#ifdef CONFIG_CGROUP_DEVICE +#if defined(CONFIG_CGROUP_DEVICE) || defined(CONFIG_CGROUP_BPF) static inline int devcgroup_check_permission(short type, u32 major, u32 minor, short access) { + int rc = BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type, major, minor, access); + + if (rc) + return -EPERM; + return __devcgroup_check_permission(type, major, minor, access); } diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4455dd195201..e880ae6434ee 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -132,6 +132,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_LWT_XMIT, BPF_PROG_TYPE_SOCK_OPS, BPF_PROG_TYPE_SK_SKB, + BPF_PROG_TYPE_CGROUP_DEVICE, }; enum bpf_attach_type { @@ -141,6 +142,7 @@ enum bpf_attach_type { BPF_CGROUP_SOCK_OPS, BPF_SK_SKB_STREAM_PARSER, BPF_SK_SKB_STREAM_VERDICT, + BPF_CGROUP_DEVICE, __MAX_BPF_ATTACH_TYPE }; @@ -991,4 +993,17 @@ struct bpf_perf_event_value { __u64 running; }; +#define BPF_DEVCG_ACC_MKNOD (1ULL << 0) +#define BPF_DEVCG_ACC_READ (1ULL << 1) +#define BPF_DEVCG_ACC_WRITE (1ULL << 2) + +#define BPF_DEVCG_DEV_BLOCK (1ULL << 0) +#define BPF_DEVCG_DEV_CHAR (1ULL << 1) + +struct bpf_cgroup_dev_ctx { + __u32 access_type; /* (access << 16) | type */ + __u32 major; + __u32 minor; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 3db5a17fcfe8..b789ab78d28f 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -522,3 +522,70 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, return ret == 1 ? 0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops); + +int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, + short access, enum bpf_attach_type type) +{ + struct cgroup *cgrp; + struct bpf_cgroup_dev_ctx ctx = { + .access_type = (access << 16) | dev_type, + .major = major, + .minor = minor, + }; + int allow = 1; + + rcu_read_lock(); + cgrp = task_dfl_cgroup(current); + allow = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, + BPF_PROG_RUN); + rcu_read_unlock(); + + return !allow; +} +EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission); + +static const struct bpf_func_proto * +cgroup_dev_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_map_lookup_elem: + return &bpf_map_lookup_elem_proto; + case BPF_FUNC_map_update_elem: + return &bpf_map_update_elem_proto; + case BPF_FUNC_map_delete_elem: + return &bpf_map_delete_elem_proto; + case BPF_FUNC_get_current_uid_gid: + return &bpf_get_current_uid_gid_proto; + case BPF_FUNC_trace_printk: + if (capable(CAP_SYS_ADMIN)) + return bpf_get_trace_printk_proto(); + default: + return NULL; + } +} + +static bool cgroup_dev_is_valid_access(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + if (type == BPF_WRITE) + return false; + + if (off < 0 || off + size > sizeof(struct bpf_cgroup_dev_ctx)) + return false; + /* The verifier guarantees that size > 0. */ + if (off % size != 0) + return false; + if (size != sizeof(__u32)) + return false; + + return true; +} + +const struct bpf_prog_ops cg_dev_prog_ops = { +}; + +const struct bpf_verifier_ops cg_dev_verifier_ops = { + .get_func_proto = cgroup_dev_func_proto, + .is_valid_access = cgroup_dev_is_valid_access, +}; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 416d70cdfc76..09badc37e864 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1326,6 +1326,9 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_CGROUP_SOCK_OPS: ptype = BPF_PROG_TYPE_SOCK_OPS; break; + case BPF_CGROUP_DEVICE: + ptype = BPF_PROG_TYPE_CGROUP_DEVICE; + break; case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: return sockmap_get_from_fd(attr, true); @@ -1378,6 +1381,9 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_CGROUP_SOCK_OPS: ptype = BPF_PROG_TYPE_SOCK_OPS; break; + case BPF_CGROUP_DEVICE: + ptype = BPF_PROG_TYPE_CGROUP_DEVICE; + break; case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: return sockmap_get_from_fd(attr, false); @@ -1420,6 +1426,7 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_INET_EGRESS: case BPF_CGROUP_INET_SOCK_CREATE: case BPF_CGROUP_SOCK_OPS: + case BPF_CGROUP_DEVICE: break; default: return -EINVAL; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index add845fe788a..4a942e2e753d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3124,6 +3124,7 @@ static int check_return_code(struct bpf_verifier_env *env) case BPF_PROG_TYPE_CGROUP_SKB: case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_SOCK_OPS: + case BPF_PROG_TYPE_CGROUP_DEVICE: break; default: return 0; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index e92f62cf933a..b280f37cd057 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -131,6 +131,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_LWT_XMIT, BPF_PROG_TYPE_SOCK_OPS, BPF_PROG_TYPE_SK_SKB, + BPF_PROG_TYPE_CGROUP_DEVICE, }; enum bpf_attach_type { @@ -140,6 +141,7 @@ enum bpf_attach_type { BPF_CGROUP_SOCK_OPS, BPF_SK_SKB_STREAM_PARSER, BPF_SK_SKB_STREAM_VERDICT, + BPF_CGROUP_DEVICE, __MAX_BPF_ATTACH_TYPE }; @@ -990,4 +992,17 @@ struct bpf_perf_event_value { __u64 running; }; +#define BPF_DEVCG_ACC_MKNOD (1ULL << 0) +#define BPF_DEVCG_ACC_READ (1ULL << 1) +#define BPF_DEVCG_ACC_WRITE (1ULL << 2) + +#define BPF_DEVCG_DEV_BLOCK (1ULL << 0) +#define BPF_DEVCG_DEV_CHAR (1ULL << 1) + +struct bpf_cgroup_dev_ctx { + __u32 access_type; /* (access << 16) | type */ + __u32 major; + __u32 minor; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ -- cgit v1.2.3