diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2020-03-31 17:29:33 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2020-03-31 17:29:33 -0700 |
commit | 29d9f30d4ce6c7a38745a54a8cddface10013490 (patch) | |
tree | 85649ba6a7b39203584d8db9365e03f64e62c136 /kernel | |
parent | 56a451b780676bc1cdac011735fe2869fa2e9abf (diff) | |
parent | 7f80ccfe996871ca69648efee74a60ae7ad0dcd9 (diff) | |
download | lwn-29d9f30d4ce6c7a38745a54a8cddface10013490.tar.gz lwn-29d9f30d4ce6c7a38745a54a8cddface10013490.zip |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from David Miller:
"Highlights:
1) Fix the iwlwifi regression, from Johannes Berg.
2) Support BSS coloring and 802.11 encapsulation offloading in
hardware, from John Crispin.
3) Fix some potential Spectre issues in qtnfmac, from Sergey
Matyukevich.
4) Add TTL decrement action to openvswitch, from Matteo Croce.
5) Allow paralleization through flow_action setup by not taking the
RTNL mutex, from Vlad Buslov.
6) A lot of zero-length array to flexible-array conversions, from
Gustavo A. R. Silva.
7) Align XDP statistics names across several drivers for consistency,
from Lorenzo Bianconi.
8) Add various pieces of infrastructure for offloading conntrack, and
make use of it in mlx5 driver, from Paul Blakey.
9) Allow using listening sockets in BPF sockmap, from Jakub Sitnicki.
10) Lots of parallelization improvements during configuration changes
in mlxsw driver, from Ido Schimmel.
11) Add support to devlink for generic packet traps, which report
packets dropped during ACL processing. And use them in mlxsw
driver. From Jiri Pirko.
12) Support bcmgenet on ACPI, from Jeremy Linton.
13) Make BPF compatible with RT, from Thomas Gleixnet, Alexei
Starovoitov, and your's truly.
14) Support XDP meta-data in virtio_net, from Yuya Kusakabe.
15) Fix sysfs permissions when network devices change namespaces, from
Christian Brauner.
16) Add a flags element to ethtool_ops so that drivers can more simply
indicate which coalescing parameters they actually support, and
therefore the generic layer can validate the user's ethtool
request. Use this in all drivers, from Jakub Kicinski.
17) Offload FIFO qdisc in mlxsw, from Petr Machata.
18) Support UDP sockets in sockmap, from Lorenz Bauer.
19) Fix stretch ACK bugs in several TCP congestion control modules,
from Pengcheng Yang.
20) Support virtual functiosn in octeontx2 driver, from Tomasz
Duszynski.
21) Add region operations for devlink and use it in ice driver to dump
NVM contents, from Jacob Keller.
22) Add support for hw offload of MACSEC, from Antoine Tenart.
23) Add support for BPF programs that can be attached to LSM hooks,
from KP Singh.
24) Support for multiple paths, path managers, and counters in MPTCP.
From Peter Krystad, Paolo Abeni, Florian Westphal, Davide Caratti,
and others.
25) More progress on adding the netlink interface to ethtool, from
Michal Kubecek"
* git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (2121 commits)
net: ipv6: rpl_iptunnel: Fix potential memory leak in rpl_do_srh_inline
cxgb4/chcr: nic-tls stats in ethtool
net: dsa: fix oops while probing Marvell DSA switches
net/bpfilter: remove superfluous testing message
net: macb: Fix handling of fixed-link node
net: dsa: ksz: Select KSZ protocol tag
netdevsim: dev: Fix memory leak in nsim_dev_take_snapshot_write
net: stmmac: add EHL 2.5Gbps PCI info and PCI ID
net: stmmac: add EHL PSE0 & PSE1 1Gbps PCI info and PCI ID
net: stmmac: create dwmac-intel.c to contain all Intel platform
net: dsa: bcm_sf2: Support specifying VLAN tag egress rule
net: dsa: bcm_sf2: Add support for matching VLAN TCI
net: dsa: bcm_sf2: Move writing of CFP_DATA(5) into slicing functions
net: dsa: bcm_sf2: Check earlier for FLOW_EXT and FLOW_MAC_EXT
net: dsa: bcm_sf2: Disable learning for ASP port
net: dsa: b53: Deny enslaving port 7 for 7278 into a bridge
net: dsa: b53: Prevent tagged VLAN on port 7 for 7278
net: dsa: b53: Restore VLAN entries upon (re)configuration
net: dsa: bcm_sf2: Fix overflow checks
hv_netvsc: Remove unnecessary round_up for recv_completion_cnt
...
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/bpf/Makefile | 1 | ||||
-rw-r--r-- | kernel/bpf/bpf_lsm.c | 54 | ||||
-rw-r--r-- | kernel/bpf/bpf_struct_ops.c | 12 | ||||
-rw-r--r-- | kernel/bpf/btf.c | 62 | ||||
-rw-r--r-- | kernel/bpf/cgroup.c | 505 | ||||
-rw-r--r-- | kernel/bpf/core.c | 122 | ||||
-rw-r--r-- | kernel/bpf/dispatcher.c | 5 | ||||
-rw-r--r-- | kernel/bpf/hashtab.c | 174 | ||||
-rw-r--r-- | kernel/bpf/helpers.c | 63 | ||||
-rw-r--r-- | kernel/bpf/inode.c | 42 | ||||
-rw-r--r-- | kernel/bpf/lpm_trie.c | 14 | ||||
-rw-r--r-- | kernel/bpf/percpu_freelist.c | 20 | ||||
-rw-r--r-- | kernel/bpf/reuseport_array.c | 5 | ||||
-rw-r--r-- | kernel/bpf/stackmap.c | 18 | ||||
-rw-r--r-- | kernel/bpf/syscall.c | 656 | ||||
-rw-r--r-- | kernel/bpf/sysfs_btf.c | 11 | ||||
-rw-r--r-- | kernel/bpf/tnum.c | 15 | ||||
-rw-r--r-- | kernel/bpf/trampoline.c | 178 | ||||
-rw-r--r-- | kernel/bpf/verifier.c | 1649 | ||||
-rw-r--r-- | kernel/cgroup/cgroup.c | 41 | ||||
-rw-r--r-- | kernel/events/core.c | 11 | ||||
-rw-r--r-- | kernel/extable.c | 2 | ||||
-rw-r--r-- | kernel/seccomp.c | 4 | ||||
-rw-r--r-- | kernel/trace/bpf_trace.c | 77 | ||||
-rw-r--r-- | kernel/trace/trace_uprobe.c | 11 |
25 files changed, 2676 insertions, 1076 deletions
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 046ce5d98033..f2d7be596966 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -29,4 +29,5 @@ obj-$(CONFIG_DEBUG_INFO_BTF) += sysfs_btf.o endif ifeq ($(CONFIG_BPF_JIT),y) obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o +obj-${CONFIG_BPF_LSM} += bpf_lsm.o endif diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c new file mode 100644 index 000000000000..19636703b24e --- /dev/null +++ b/kernel/bpf/bpf_lsm.c @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (C) 2020 Google LLC. + */ + +#include <linux/filter.h> +#include <linux/bpf.h> +#include <linux/btf.h> +#include <linux/lsm_hooks.h> +#include <linux/bpf_lsm.h> +#include <linux/kallsyms.h> +#include <linux/bpf_verifier.h> + +/* For every LSM hook that allows attachment of BPF programs, declare a nop + * function where a BPF program can be attached. + */ +#define LSM_HOOK(RET, DEFAULT, NAME, ...) \ +noinline RET bpf_lsm_##NAME(__VA_ARGS__) \ +{ \ + return DEFAULT; \ +} + +#include <linux/lsm_hook_defs.h> +#undef LSM_HOOK + +#define BPF_LSM_SYM_PREFX "bpf_lsm_" + +int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, + const struct bpf_prog *prog) +{ + if (!prog->gpl_compatible) { + bpf_log(vlog, + "LSM programs must have a GPL compatible license\n"); + return -EINVAL; + } + + if (strncmp(BPF_LSM_SYM_PREFX, prog->aux->attach_func_name, + sizeof(BPF_LSM_SYM_PREFX) - 1)) { + bpf_log(vlog, "attach_btf_id %u points to wrong type name %s\n", + prog->aux->attach_btf_id, prog->aux->attach_func_name); + return -EINVAL; + } + + return 0; +} + +const struct bpf_prog_ops lsm_prog_ops = { +}; + +const struct bpf_verifier_ops lsm_verifier_ops = { + .get_func_proto = bpf_tracing_func_proto, + .is_valid_access = btf_ctx_access, +}; diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 68a89a9f7ccd..26cb51f2db72 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -23,7 +23,7 @@ enum bpf_struct_ops_state { struct bpf_struct_ops_value { BPF_STRUCT_OPS_COMMON_VALUE; - char data[0] ____cacheline_aligned_in_smp; + char data[] ____cacheline_aligned_in_smp; }; struct bpf_struct_ops_map { @@ -320,6 +320,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, struct bpf_struct_ops_value *uvalue, *kvalue; const struct btf_member *member; const struct btf_type *t = st_ops->type; + struct bpf_tramp_progs *tprogs = NULL; void *udata, *kdata; int prog_fd, err = 0; void *image; @@ -343,6 +344,10 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, if (uvalue->state || refcount_read(&uvalue->refcnt)) return -EINVAL; + tprogs = kcalloc(BPF_TRAMP_MAX, sizeof(*tprogs), GFP_KERNEL); + if (!tprogs) + return -ENOMEM; + uvalue = (struct bpf_struct_ops_value *)st_map->uvalue; kvalue = (struct bpf_struct_ops_value *)&st_map->kvalue; @@ -425,10 +430,12 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, goto reset_unlock; } + tprogs[BPF_TRAMP_FENTRY].progs[0] = prog; + tprogs[BPF_TRAMP_FENTRY].nr_progs = 1; err = arch_prepare_bpf_trampoline(image, st_map->image + PAGE_SIZE, &st_ops->func_models[i], 0, - &prog, 1, NULL, 0, NULL); + tprogs, NULL); if (err < 0) goto reset_unlock; @@ -469,6 +476,7 @@ reset_unlock: memset(uvalue, 0, map->value_size); memset(kvalue, 0, map->value_size); unlock: + kfree(tprogs); mutex_unlock(&st_map->lock); return err; } diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 7787bdcb5d68..d65c6912bdaf 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3477,8 +3477,8 @@ errout: return ERR_PTR(err); } -extern char __weak _binary__btf_vmlinux_bin_start[]; -extern char __weak _binary__btf_vmlinux_bin_end[]; +extern char __weak __start_BTF[]; +extern char __weak __stop_BTF[]; extern struct btf *btf_vmlinux; #define BPF_MAP_TYPE(_id, _ops) @@ -3605,9 +3605,8 @@ struct btf *btf_parse_vmlinux(void) } env->btf = btf; - btf->data = _binary__btf_vmlinux_bin_start; - btf->data_size = _binary__btf_vmlinux_bin_end - - _binary__btf_vmlinux_bin_start; + btf->data = __start_BTF; + btf->data_size = __stop_BTF - __start_BTF; err = btf_parse_hdr(env); if (err) @@ -3710,23 +3709,60 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, nr_args--; } - if (prog->expected_attach_type == BPF_TRACE_FEXIT && - arg == nr_args) { - if (!t) - /* Default prog with 5 args. 6th arg is retval. */ - return true; - /* function return type */ - t = btf_type_by_id(btf, t->type); - } else if (arg >= nr_args) { + if (arg > nr_args) { bpf_log(log, "func '%s' doesn't have %d-th argument\n", tname, arg + 1); return false; + } + + if (arg == nr_args) { + switch (prog->expected_attach_type) { + case BPF_LSM_MAC: + case BPF_TRACE_FEXIT: + /* When LSM programs are attached to void LSM hooks + * they use FEXIT trampolines and when attached to + * int LSM hooks, they use MODIFY_RETURN trampolines. + * + * While the LSM programs are BPF_MODIFY_RETURN-like + * the check: + * + * if (ret_type != 'int') + * return -EINVAL; + * + * is _not_ done here. This is still safe as LSM hooks + * have only void and int return types. + */ + if (!t) + return true; + t = btf_type_by_id(btf, t->type); + break; + case BPF_MODIFY_RETURN: + /* For now the BPF_MODIFY_RETURN can only be attached to + * functions that return an int. + */ + if (!t) + return false; + + t = btf_type_skip_modifiers(btf, t->type, NULL); + if (!btf_type_is_int(t)) { + bpf_log(log, + "ret type %s not allowed for fmod_ret\n", + btf_kind_str[BTF_INFO_KIND(t->info)]); + return false; + } + break; + default: + bpf_log(log, "func '%s' doesn't have %d-th argument\n", + tname, arg + 1); + return false; + } } else { if (!t) /* Default prog with 5 args */ return true; t = btf_type_by_id(btf, args[arg].type); } + /* skip modifiers */ while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 4f1472409ef8..cb305e71e7de 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -28,6 +28,69 @@ void cgroup_bpf_offline(struct cgroup *cgrp) percpu_ref_kill(&cgrp->bpf.refcnt); } +static void bpf_cgroup_storages_free(struct bpf_cgroup_storage *storages[]) +{ + enum bpf_cgroup_storage_type stype; + + for_each_cgroup_storage_type(stype) + bpf_cgroup_storage_free(storages[stype]); +} + +static int bpf_cgroup_storages_alloc(struct bpf_cgroup_storage *storages[], + struct bpf_prog *prog) +{ + enum bpf_cgroup_storage_type stype; + + for_each_cgroup_storage_type(stype) { + storages[stype] = bpf_cgroup_storage_alloc(prog, stype); + if (IS_ERR(storages[stype])) { + storages[stype] = NULL; + bpf_cgroup_storages_free(storages); + return -ENOMEM; + } + } + + return 0; +} + +static void bpf_cgroup_storages_assign(struct bpf_cgroup_storage *dst[], + struct bpf_cgroup_storage *src[]) +{ + enum bpf_cgroup_storage_type stype; + + for_each_cgroup_storage_type(stype) + dst[stype] = src[stype]; +} + +static void bpf_cgroup_storages_link(struct bpf_cgroup_storage *storages[], + struct cgroup* cgrp, + enum bpf_attach_type attach_type) +{ + enum bpf_cgroup_storage_type stype; + + for_each_cgroup_storage_type(stype) + bpf_cgroup_storage_link(storages[stype], cgrp, attach_type); +} + +static void bpf_cgroup_storages_unlink(struct bpf_cgroup_storage *storages[]) +{ + enum bpf_cgroup_storage_type stype; + + for_each_cgroup_storage_type(stype) + bpf_cgroup_storage_unlink(storages[stype]); +} + +/* Called when bpf_cgroup_link is auto-detached from dying cgroup. + * It drops cgroup and bpf_prog refcounts, and marks bpf_link as defunct. It + * doesn't free link memory, which will eventually be done by bpf_link's + * release() callback, when its last FD is closed. + */ +static void bpf_cgroup_link_auto_detach(struct bpf_cgroup_link *link) +{ + cgroup_put(link->cgroup); + link->cgroup = NULL; +} + /** * cgroup_bpf_release() - put references of all bpf programs and * release all cgroup bpf data @@ -37,7 +100,6 @@ static void cgroup_bpf_release(struct work_struct *work) { struct cgroup *p, *cgrp = container_of(work, struct cgroup, bpf.release_work); - enum bpf_cgroup_storage_type stype; struct bpf_prog_array *old_array; unsigned int type; @@ -49,11 +111,12 @@ static void cgroup_bpf_release(struct work_struct *work) list_for_each_entry_safe(pl, tmp, progs, node) { list_del(&pl->node); - bpf_prog_put(pl->prog); - for_each_cgroup_storage_type(stype) { - bpf_cgroup_storage_unlink(pl->storage[stype]); - bpf_cgroup_storage_free(pl->storage[stype]); - } + if (pl->prog) + bpf_prog_put(pl->prog); + if (pl->link) + bpf_cgroup_link_auto_detach(pl->link); + bpf_cgroup_storages_unlink(pl->storage); + bpf_cgroup_storages_free(pl->storage); kfree(pl); static_branch_dec(&cgroup_bpf_enabled_key); } @@ -85,6 +148,18 @@ static void cgroup_bpf_release_fn(struct percpu_ref *ref) queue_work(system_wq, &cgrp->bpf.release_work); } +/* Get underlying bpf_prog of bpf_prog_list entry, regardless if it's through + * link or direct prog. + */ +static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl) +{ + if (pl->prog) + return pl->prog; + if (pl->link) + return pl->link->link.prog; + return NULL; +} + /* count number of elements in the list. * it's slow but the list cannot be long */ @@ -94,7 +169,7 @@ static u32 prog_list_length(struct list_head *head) u32 cnt = 0; list_for_each_entry(pl, head, node) { - if (!pl->prog) + if (!prog_list_prog(pl)) continue; cnt++; } @@ -138,7 +213,7 @@ static int compute_effective_progs(struct cgroup *cgrp, enum bpf_attach_type type, struct bpf_prog_array **array) { - enum bpf_cgroup_storage_type stype; + struct bpf_prog_array_item *item; struct bpf_prog_array *progs; struct bpf_prog_list *pl; struct cgroup *p = cgrp; @@ -163,13 +238,13 @@ static int compute_effective_progs(struct cgroup *cgrp, continue; list_for_each_entry(pl, &p->bpf.progs[type], node) { - if (!pl->prog) + if (!prog_list_prog(pl)) continue; - progs->items[cnt].prog = pl->prog; - for_each_cgroup_storage_type(stype) - progs->items[cnt].cgroup_storage[stype] = - pl->storage[stype]; + item = &progs->items[cnt]; + item->prog = prog_list_prog(pl); + bpf_cgroup_storages_assign(item->cgroup_storage, + pl->storage); cnt++; } } while ((p = cgroup_parent(p))); @@ -287,19 +362,60 @@ cleanup: #define BPF_CGROUP_MAX_PROGS 64 +static struct bpf_prog_list *find_attach_entry(struct list_head *progs, + struct bpf_prog *prog, + struct bpf_cgroup_link *link, + struct bpf_prog *replace_prog, + bool allow_multi) +{ + struct bpf_prog_list *pl; + + /* single-attach case */ + if (!allow_multi) { + if (list_empty(progs)) + return NULL; + return list_first_entry(progs, typeof(*pl), node); + } + + list_for_each_entry(pl, progs, node) { + if (prog && pl->prog == prog) + /* disallow attaching the same prog twice */ + return ERR_PTR(-EINVAL); + if (link && pl->link == link) + /* disallow attaching the same link twice */ + return ERR_PTR(-EINVAL); + } + + /* direct prog multi-attach w/ replacement case */ + if (replace_prog) { + list_for_each_entry(pl, progs, node) { + if (pl->prog == replace_prog) + /* a match found */ + return pl; + } + /* prog to replace not found for cgroup */ + return ERR_PTR(-ENOENT); + } + + return NULL; +} + /** - * __cgroup_bpf_attach() - Attach the program to a cgroup, and + * __cgroup_bpf_attach() - Attach the program or the link to a cgroup, and * propagate the change to descendants * @cgrp: The cgroup which descendants to traverse * @prog: A program to attach + * @link: A link to attach * @replace_prog: Previously attached program to replace if BPF_F_REPLACE is set * @type: Type of attach operation * @flags: Option flags * + * Exactly one of @prog or @link can be non-null. * Must be called with cgroup_mutex held. */ -int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, - struct bpf_prog *replace_prog, +int __cgroup_bpf_attach(struct cgroup *cgrp, + struct bpf_prog *prog, struct bpf_prog *replace_prog, + struct bpf_cgroup_link *link, enum bpf_attach_type type, u32 flags) { u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)); @@ -307,14 +423,19 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, struct bpf_prog *old_prog = NULL; struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; struct bpf_cgroup_storage *old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; - struct bpf_prog_list *pl, *replace_pl = NULL; - enum bpf_cgroup_storage_type stype; + struct bpf_prog_list *pl; int err; if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) || ((flags & BPF_F_REPLACE) && !(flags & BPF_F_ALLOW_MULTI))) /* invalid combination */ return -EINVAL; + if (link && (prog || replace_prog)) + /* only either link or prog/replace_prog can be specified */ + return -EINVAL; + if (!!replace_prog != !!(flags & BPF_F_REPLACE)) + /* replace_prog implies BPF_F_REPLACE, and vice versa */ + return -EINVAL; if (!hierarchy_allows_attach(cgrp, type)) return -EPERM; @@ -329,140 +450,203 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS) return -E2BIG; - if (flags & BPF_F_ALLOW_MULTI) { - list_for_each_entry(pl, progs, node) { - if (pl->prog == prog) - /* disallow attaching the same prog twice */ - return -EINVAL; - if (pl->prog == replace_prog) - replace_pl = pl; - } - if ((flags & BPF_F_REPLACE) && !replace_pl) - /* prog to replace not found for cgroup */ - return -ENOENT; - } else if (!list_empty(progs)) { - replace_pl = list_first_entry(progs, typeof(*pl), node); - } + pl = find_attach_entry(progs, prog, link, replace_prog, + flags & BPF_F_ALLOW_MULTI); + if (IS_ERR(pl)) + return PTR_ERR(pl); - for_each_cgroup_storage_type(stype) { - storage[stype] = bpf_cgroup_storage_alloc(prog, stype); - if (IS_ERR(storage[stype])) { - storage[stype] = NULL; - for_each_cgroup_storage_type(stype) - bpf_cgroup_storage_free(storage[stype]); - return -ENOMEM; - } - } + if (bpf_cgroup_storages_alloc(storage, prog ? : link->link.prog)) + return -ENOMEM; - if (replace_pl) { - pl = replace_pl; + if (pl) { old_prog = pl->prog; - for_each_cgroup_storage_type(stype) { - old_storage[stype] = pl->storage[stype]; - bpf_cgroup_storage_unlink(old_storage[stype]); - } + bpf_cgroup_storages_unlink(pl->storage); + bpf_cgroup_storages_assign(old_storage, pl->storage); } else { pl = kmalloc(sizeof(*pl), GFP_KERNEL); if (!pl) { - for_each_cgroup_storage_type(stype) - bpf_cgroup_storage_free(storage[stype]); + bpf_cgroup_storages_free(storage); return -ENOMEM; } list_add_tail(&pl->node, progs); } pl->prog = prog; - for_each_cgroup_storage_type(stype) - pl->storage[stype] = storage[stype]; - + pl->link = link; + bpf_cgroup_storages_assign(pl->storage, storage); cgrp->bpf.flags[type] = saved_flags; err = update_effective_progs(cgrp, type); if (err) goto cleanup; - static_branch_inc(&cgroup_bpf_enabled_key); - for_each_cgroup_storage_type(stype) { - if (!old_storage[stype]) - continue; - bpf_cgroup_storage_free(old_storage[stype]); - } - if (old_prog) { + bpf_cgroup_storages_free(old_storage); + if (old_prog) bpf_prog_put(old_prog); - static_branch_dec(&cgroup_bpf_enabled_key); - } - for_each_cgroup_storage_type(stype) - bpf_cgroup_storage_link(storage[stype], cgrp, type); + else + static_branch_inc(&cgroup_bpf_enabled_key); + bpf_cgroup_storages_link(pl->storage, cgrp, type); return 0; cleanup: - /* and cleanup the prog list */ - pl->prog = old_prog; - for_each_cgroup_storage_type(stype) { - bpf_cgroup_storage_free(pl->storage[stype]); - pl->storage[stype] = old_storage[stype]; - bpf_cgroup_storage_link(old_storage[stype], cgrp, type); + if (old_prog) { + pl->prog = old_prog; + pl->link = NULL; } - if (!replace_pl) { + bpf_cgroup_storages_free(pl->storage); + bpf_cgroup_storages_assign(pl->storage, old_storage); + bpf_cgroup_storages_link(pl->storage, cgrp, type); + if (!old_prog) { list_del(&pl->node); kfree(pl); } return err; } +/* Swap updated BPF program for given link in effective program arrays across + * all descendant cgroups. This function is guaranteed to succeed. + */ +static void replace_effective_prog(struct cgroup *cgrp, + enum bpf_attach_type type, + struct bpf_cgroup_link *link) +{ + struct bpf_prog_array_item *item; + struct cgroup_subsys_state *css; + struct bpf_prog_array *progs; + struct bpf_prog_list *pl; + struct list_head *head; + struct cgroup *cg; + int pos; + + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + if (percpu_ref_is_zero(&desc->bpf.refcnt)) + continue; + + /* find position of link in effective progs array */ + for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) { + if (pos && !(cg->bpf.flags[type] & BPF_F_ALLOW_MULTI)) + continue; + + head = &cg->bpf.progs[type]; + list_for_each_entry(pl, head, node) { + if (!prog_list_prog(pl)) + continue; + if (pl->link == link) + goto found; + pos++; + } + } +found: + BUG_ON(!cg); + progs = rcu_dereference_protected( + desc->bpf.effective[type], + lockdep_is_held(&cgroup_mutex)); + item = &progs->items[pos]; + WRITE_ONCE(item->prog, link->link.prog); + } +} + /** - * __cgroup_bpf_detach() - Detach the program from a cgroup, and + * __cgroup_bpf_replace() - Replace link's program and propagate the change + * to descendants + * @cgrp: The cgroup which descendants to traverse + * @link: A link for which to replace BPF program + * @type: Type of attach operation + * + * Must be called with cgroup_mutex held. + */ +int __cgroup_bpf_replace(struct cgroup *cgrp, struct bpf_cgroup_link *link, + struct bpf_prog *new_prog) +{ + struct list_head *progs = &cgrp->bpf.progs[link->type]; + struct bpf_prog *old_prog; + struct bpf_prog_list *pl; + bool found = false; + + if (link->link.prog->type != new_prog->type) + return -EINVAL; + + list_for_each_entry(pl, progs, node) { + if (pl->link == link) { + found = true; + break; + } + } + if (!found) + return -ENOENT; + + old_prog = xchg(&link->link.prog, new_prog); + replace_effective_prog(cgrp, link->type, link); + bpf_prog_put(old_prog); + return 0; +} + +static struct bpf_prog_list *find_detach_entry(struct list_head *progs, + struct bpf_prog *prog, + struct bpf_cgroup_link *link, + bool allow_multi) +{ + struct bpf_prog_list *pl; + + if (!allow_multi) { + if (list_empty(progs)) + /* report error when trying to detach and nothing is attached */ + return ERR_PTR(-ENOENT); + + /* to maintain backward compatibility NONE and OVERRIDE cgroups + * allow detaching with invalid FD (prog==NULL) in legacy mode + */ + return list_first_entry(progs, typeof(*pl), node); + } + + if (!prog && !link) + /* to detach MULTI prog the user has to specify valid FD + * of the program or link to be detached + */ + return ERR_PTR(-EINVAL); + + /* find the prog or link and detach it */ + list_for_each_entry(pl, progs, node) { + if (pl->prog == prog && pl->link == link) + return pl; + } + return ERR_PTR(-ENOENT); +} + +/** + * __cgroup_bpf_detach() - Detach the program or link from a cgroup, and * propagate the change to descendants * @cgrp: The cgroup which descendants to traverse * @prog: A program to detach or NULL + * @prog: A link to detach or NULL * @type: Type of detach operation * + * At most one of @prog or @link can be non-NULL. * Must be called with cgroup_mutex held. */ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, - enum bpf_attach_type type) + struct bpf_cgroup_link *link, enum bpf_attach_type type) { struct list_head *progs = &cgrp->bpf.progs[type]; - enum bpf_cgroup_storage_type stype; u32 flags = cgrp->bpf.flags[type]; - struct bpf_prog *old_prog = NULL; struct bpf_prog_list *pl; + struct bpf_prog *old_prog; int err; - if (flags & BPF_F_ALLOW_MULTI) { - if (!prog) - /* to detach MULTI prog the user has to specify valid FD - * of the program to be detached - */ - return -EINVAL; - } else { - if (list_empty(progs)) - /* report error when trying to detach and nothing is attached */ - return -ENOENT; - } + if (prog && link) + /* only one of prog or link can be specified */ + return -EINVAL; - if (flags & BPF_F_ALLOW_MULTI) { - /* find the prog and detach it */ - list_for_each_entry(pl, progs, node) { - if (pl->prog != prog) - continue; - old_prog = prog; - /* mark it deleted, so it's ignored while - * recomputing effective - */ - pl->prog = NULL; - break; - } - if (!old_prog) - return -ENOENT; - } else { - /* to maintain backward compatibility NONE and OVERRIDE cgroups - * allow detaching with invalid FD (prog==NULL) - */ - pl = list_first_entry(progs, typeof(*pl), node); - old_prog = pl->prog; - pl->prog = NULL; - } + pl = find_detach_entry(progs, prog, link, flags & BPF_F_ALLOW_MULTI); + if (IS_ERR(pl)) + return PTR_ERR(pl); + + /* mark it deleted, so it's ignored while recomputing effective */ + old_prog = pl->prog; + pl->prog = NULL; + pl->link = NULL; err = update_effective_progs(cgrp, type); if (err) @@ -470,22 +654,21 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, /* now can actually delete it from this cgroup list */ list_del(&pl->node); - for_each_cgroup_storage_type(stype) { - bpf_cgroup_storage_unlink(pl->storage[stype]); - bpf_cgroup_storage_free(pl->storage[stype]); - } + bpf_cgroup_storages_unlink(pl->storage); + bpf_cgroup_storages_free(pl->storage); kfree(pl); if (list_empty(progs)) /* last program was detached, reset flags to zero */ cgrp->bpf.flags[type] = 0; - - bpf_prog_put(old_prog); + if (old_prog) + bpf_prog_put(old_prog); static_branch_dec(&cgroup_bpf_enabled_key); return 0; cleanup: - /* and restore back old_prog */ + /* restore back prog or link */ pl->prog = old_prog; + pl->link = link; return err; } @@ -498,6 +681,7 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, struct list_head *progs = &cgrp->bpf.progs[type]; u32 flags = cgrp->bpf.flags[type]; struct bpf_prog_array *effective; + struct bpf_prog *prog; int cnt, ret = 0, i; effective = rcu_dereference_protected(cgrp->bpf.effective[type], @@ -528,7 +712,8 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, i = 0; list_for_each_entry(pl, progs, node) { - id = pl->prog->aux->id; + prog = prog_list_prog(pl); + id = prog->aux->id; if (copy_to_user(prog_ids + i, &id, sizeof(id))) return -EFAULT; if (++i == cnt) @@ -558,8 +743,8 @@ int cgroup_bpf_prog_attach(const union bpf_attr *attr, } } - ret = cgroup_bpf_attach(cgrp, prog, replace_prog, attr->attach_type, - attr->attach_flags); + ret = cgroup_bpf_attach(cgrp, prog, replace_prog, NULL, + attr->attach_type, attr->attach_flags); if (replace_prog) bpf_prog_put(replace_prog); @@ -581,7 +766,7 @@ int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) if (IS_ERR(prog)) prog = NULL; - ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0); + ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type); if (prog) bpf_prog_put(prog); @@ -589,6 +774,90 @@ int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) return ret; } +static void bpf_cgroup_link_release(struct bpf_link *link) +{ + struct bpf_cgroup_link *cg_link = + container_of(link, struct bpf_cgroup_link, link); + + /* link might have been auto-detached by dying cgroup already, + * in that case our work is done here + */ + if (!cg_link->cgroup) + return; + + mutex_lock(&cgroup_mutex); + + /* re-check cgroup under lock again */ + if (!cg_link->cgroup) { + mutex_unlock(&cgroup_mutex); + return; + } + + WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link, + cg_link->type)); + + mutex_unlock(&cgroup_mutex); + cgroup_put(cg_link->cgroup); +} + +static void bpf_cgroup_link_dealloc(struct bpf_link *link) +{ + struct bpf_cgroup_link *cg_link = + container_of(link, struct bpf_cgroup_link, link); + + kfree(cg_link); +} + +const struct bpf_link_ops bpf_cgroup_link_lops = { + .release = bpf_cgroup_link_release, + .dealloc = bpf_cgroup_link_dealloc, +}; + +int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct bpf_cgroup_link *link; + struct file *link_file; + struct cgroup *cgrp; + int err, link_fd; + + if (attr->link_create.flags) + return -EINVAL; + + cgrp = cgroup_get_from_fd(attr->link_create.target_fd); + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + + link = kzalloc(sizeof(*link), GFP_USER); + if (!link) { + err = -ENOMEM; + goto out_put_cgroup; + } + bpf_link_init(&link->link, &bpf_cgroup_link_lops, prog); + link->cgroup = cgrp; + link->type = attr->link_create.attach_type; + + link_file = bpf_link_new_file(&link->link, &link_fd); + if (IS_ERR(link_file)) { + kfree(link); + err = PTR_ERR(link_file); + goto out_put_cgroup; + } + + err = cgroup_bpf_attach(cgrp, NULL, NULL, link, link->type, + BPF_F_ALLOW_MULTI); + if (err) { + bpf_link_cleanup(&link->link, link_file, link_fd); + goto out_put_cgroup; + } + + fd_install(link_fd, link_file); + return link_fd; + +out_put_cgroup: + cgroup_put(cgrp); + return err; +} + int cgroup_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 973a20d49749..916f5132a984 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -97,7 +97,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag fp->aux->prog = fp; fp->jit_requested = ebpf_jit_enabled(); - INIT_LIST_HEAD_RCU(&fp->aux->ksym_lnode); + INIT_LIST_HEAD_RCU(&fp->aux->ksym.lnode); return fp; } @@ -523,22 +523,22 @@ int bpf_jit_kallsyms __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON); int bpf_jit_harden __read_mostly; long bpf_jit_limit __read_mostly; -static __always_inline void -bpf_get_prog_addr_region(const struct bpf_prog *prog, - unsigned long *symbol_start, - unsigned long *symbol_end) +static void +bpf_prog_ksym_set_addr(struct bpf_prog *prog) { const struct bpf_binary_header *hdr = bpf_jit_binary_hdr(prog); unsigned long addr = (unsigned long)hdr; WARN_ON_ONCE(!bpf_prog_ebpf_jited(prog)); - *symbol_start = addr; - *symbol_end = addr + hdr->pages * PAGE_SIZE; + prog->aux->ksym.start = (unsigned long) prog->bpf_func; + prog->aux->ksym.end = addr + hdr->pages * PAGE_SIZE; } -void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) +static void +bpf_prog_ksym_set_name(struct bpf_prog *prog) { + char *sym = prog->aux->ksym.name; const char *end = sym + KSYM_NAME_LEN; const struct btf_type *type; const char *func_name; @@ -572,36 +572,27 @@ void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) *sym = 0; } -static __always_inline unsigned long -bpf_get_prog_addr_start(struct latch_tree_node *n) +static unsigned long bpf_get_ksym_start(struct latch_tree_node *n) { - unsigned long symbol_start, symbol_end; - const struct bpf_prog_aux *aux; - - aux = container_of(n, struct bpf_prog_aux, ksym_tnode); - bpf_get_prog_addr_region(aux->prog, &symbol_start, &symbol_end); - - return symbol_start; + return container_of(n, struct bpf_ksym, tnode)->start; } static __always_inline bool bpf_tree_less(struct latch_tree_node *a, struct latch_tree_node *b) { - return bpf_get_prog_addr_start(a) < bpf_get_prog_addr_start(b); + return bpf_get_ksym_start(a) < bpf_get_ksym_start(b); } static __always_inline int bpf_tree_comp(void *key, struct latch_tree_node *n) { unsigned long val = (unsigned long)key; - unsigned long symbol_start, symbol_end; - const struct bpf_prog_aux *aux; + const struct bpf_ksym *ksym; - aux = container_of(n, struct bpf_prog_aux, ksym_tnode); - bpf_get_prog_addr_region(aux->prog, &symbol_start, &symbol_end); + ksym = container_of(n, struct bpf_ksym, tnode); - if (val < symbol_start) + if (val < ksym->start) return -1; - if (val >= symbol_end) + if (val >= ksym->end) return 1; return 0; @@ -616,20 +607,29 @@ static DEFINE_SPINLOCK(bpf_lock); static LIST_HEAD(bpf_kallsyms); static struct latch_tree_root bpf_tree __cacheline_aligned; -static void bpf_prog_ksym_node_add(struct bpf_prog_aux *aux) +void bpf_ksym_add(struct bpf_ksym *ksym) { - WARN_ON_ONCE(!list_empty(&aux->ksym_lnode)); - list_add_tail_rcu(&aux->ksym_lnode, &bpf_kallsyms); - latch_tree_insert(&aux->ksym_tnode, &bpf_tree, &bpf_tree_ops); + spin_lock_bh(&bpf_lock); + WARN_ON_ONCE(!list_empty(&ksym->lnode)); + list_add_tail_rcu(&ksym->lnode, &bpf_kallsyms); + latch_tree_insert(&ksym->tnode, &bpf_tree, &bpf_tree_ops); + spin_unlock_bh(&bpf_lock); } -static void bpf_prog_ksym_node_del(struct bpf_prog_aux *aux) +static void __bpf_ksym_del(struct bpf_ksym *ksym) { - if (list_empty(&aux->ksym_lnode)) + if (list_empty(&ksym->lnode)) return; - latch_tree_erase(&aux->ksym_tnode, &bpf_tree, &bpf_tree_ops); - list_del_rcu(&aux->ksym_lnode); + latch_tree_erase(&ksym->tnode, &bpf_tree, &bpf_tree_ops); + list_del_rcu(&ksym->lnode); +} + +void bpf_ksym_del(struct bpf_ksym *ksym) +{ + spin_lock_bh(&bpf_lock); + __bpf_ksym_del(ksym); + spin_unlock_bh(&bpf_lock); } static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp) @@ -639,8 +639,8 @@ static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp) static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp) { - return list_empty(&fp->aux->ksym_lnode) || - fp->aux->ksym_lnode.prev == LIST_POISON2; + return list_empty(&fp->aux->ksym.lnode) || + fp->aux->ksym.lnode.prev == LIST_POISON2; } void bpf_prog_kallsyms_add(struct bpf_prog *fp) @@ -649,9 +649,11 @@ void bpf_prog_kallsyms_add(struct bpf_prog *fp) !capable(CAP_SYS_ADMIN)) return; - spin_lock_bh(&bpf_lock); - bpf_prog_ksym_node_add(fp->aux); - spin_unlock_bh(&bpf_lock); + bpf_prog_ksym_set_addr(fp); + bpf_prog_ksym_set_name(fp); + fp->aux->ksym.prog = true; + + bpf_ksym_add(&fp->aux->ksym); } void bpf_prog_kallsyms_del(struct bpf_prog *fp) @@ -659,33 +661,30 @@ void bpf_prog_kallsyms_del(struct bpf_prog *fp) if (!bpf_prog_kallsyms_candidate(fp)) return; - spin_lock_bh(&bpf_lock); - bpf_prog_ksym_node_del(fp->aux); - spin_unlock_bh(&bpf_lock); + bpf_ksym_del(&fp->aux->ksym); } -static struct bpf_prog *bpf_prog_kallsyms_find(unsigned long addr) +static struct bpf_ksym *bpf_ksym_find(unsigned long addr) { struct latch_tree_node *n; n = latch_tree_find((void *)addr, &bpf_tree, &bpf_tree_ops); - return n ? - container_of(n, struct bpf_prog_aux, ksym_tnode)->prog : - NULL; + return n ? container_of(n, struct bpf_ksym, tnode) : NULL; } const char *__bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char *sym) { - unsigned long symbol_start, symbol_end; - struct bpf_prog *prog; + struct bpf_ksym *ksym; char *ret = NULL; rcu_read_lock(); - prog = bpf_prog_kallsyms_find(addr); - if (prog) { - bpf_get_prog_addr_region(prog, &symbol_start, &symbol_end); - bpf_get_prog_name(prog, sym); + ksym = bpf_ksym_find(addr); + if (ksym) { + unsigned long symbol_start = ksym->start; + unsigned long symbol_end = ksym->end; + + strncpy(sym, ksym->name, KSYM_NAME_LEN); ret = sym; if (size) @@ -703,19 +702,28 @@ bool is_bpf_text_address(unsigned long addr) bool ret; rcu_read_lock(); - ret = bpf_prog_kallsyms_find(addr) != NULL; + ret = bpf_ksym_find(addr) != NULL; rcu_read_unlock(); return ret; } +static struct bpf_prog *bpf_prog_ksym_find(unsigned long addr) +{ + struct bpf_ksym *ksym = bpf_ksym_find(addr); + + return ksym && ksym->prog ? + container_of(ksym, struct bpf_prog_aux, ksym)->prog : + NULL; +} + const struct exception_table_entry *search_bpf_extables(unsigned long addr) { const struct exception_table_entry *e = NULL; struct bpf_prog *prog; rcu_read_lock(); - prog = bpf_prog_kallsyms_find(addr); + prog = bpf_prog_ksym_find(addr); if (!prog) goto out; if (!prog->aux->num_exentries) @@ -730,7 +738,7 @@ out: int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, char *sym) { - struct bpf_prog_aux *aux; + struct bpf_ksym *ksym; unsigned int it = 0; int ret = -ERANGE; @@ -738,13 +746,13 @@ int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, return ret; rcu_read_lock(); - list_for_each_entry_rcu(aux, &bpf_kallsyms, ksym_lnode) { + list_for_each_entry_rcu(ksym, &bpf_kallsyms, lnode) { if (it++ != symnum) continue; - bpf_get_prog_name(aux->prog, sym); + strncpy(sym, ksym->name, KSYM_NAME_LEN); - *value = (unsigned long)aux->prog->bpf_func; + *value = ksym->start; *type = BPF_SYM_ELF_TYPE; ret = 0; @@ -2148,7 +2156,9 @@ const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; const struct bpf_func_proto bpf_get_current_comm_proto __weak; const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak; +const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto __weak; const struct bpf_func_proto bpf_get_local_storage_proto __weak; +const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto __weak; const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) { diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c index b3e5b214fed8..2444bd15cc2d 100644 --- a/kernel/bpf/dispatcher.c +++ b/kernel/bpf/dispatcher.c @@ -113,7 +113,7 @@ static void bpf_dispatcher_update(struct bpf_dispatcher *d, int prev_num_progs) noff = 0; } else { old = d->image + d->image_off; - noff = d->image_off ^ (BPF_IMAGE_SIZE / 2); + noff = d->image_off ^ (PAGE_SIZE / 2); } new = d->num_progs ? d->image + noff : NULL; @@ -140,9 +140,10 @@ void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, mutex_lock(&d->mutex); if (!d->image) { - d->image = bpf_image_alloc(); + d->image = bpf_jit_alloc_exec_page(); if (!d->image) goto out; + bpf_image_ksym_add(d->image, &d->ksym); } prev_num_progs = d->num_progs; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index a1468e3f5af2..d541c8486c95 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -27,9 +27,62 @@ .map_delete_batch = \ generic_map_delete_batch +/* + * The bucket lock has two protection scopes: + * + * 1) Serializing concurrent operations from BPF programs on differrent + * CPUs + * + * 2) Serializing concurrent operations from BPF programs and sys_bpf() + * + * BPF programs can execute in any context including perf, kprobes and + * tracing. As there are almost no limits where perf, kprobes and tracing + * can be invoked from the lock operations need to be protected against + * deadlocks. Deadlocks can be caused by recursion and by an invocation in + * the lock held section when functions which acquire this lock are invoked + * from sys_bpf(). BPF recursion is prevented by incrementing the per CPU + * variable bpf_prog_active, which prevents BPF programs attached to perf + * events, kprobes and tracing to be invoked before the prior invocation + * from one of these contexts completed. sys_bpf() uses the same mechanism + * by pinning the task to the current CPU and incrementing the recursion + * protection accross the map operation. + * + * This has subtle implications on PREEMPT_RT. PREEMPT_RT forbids certain + * operations like memory allocations (even with GFP_ATOMIC) from atomic + * contexts. This is required because even with GFP_ATOMIC the memory + * allocator calls into code pathes which acquire locks with long held lock + * sections. To ensure the deterministic behaviour these locks are regular + * spinlocks, which are converted to 'sleepable' spinlocks on RT. The only + * true atomic contexts on an RT kernel are the low level hardware + * handling, scheduling, low level interrupt handling, NMIs etc. None of + * these contexts should ever do memory allocations. + * + * As regular device interrupt handlers and soft interrupts are forced into + * thread context, the existing code which does + * spin_lock*(); alloc(GPF_ATOMIC); spin_unlock*(); + * just works. + * + * In theory the BPF locks could be converted to regular spinlocks as well, + * but the bucket locks and percpu_freelist locks can be taken from + * arbitrary contexts (perf, kprobes, tracepoints) which are required to be + * atomic contexts even on RT. These mechanisms require preallocated maps, + * so there is no need to invoke memory allocations within the lock held + * sections. + * + * BPF maps which need dynamic allocation are only used from (forced) + * thread context on RT and can therefore use regular spinlocks which in + * turn allows to invoke memory allocations from the lock held section. + * + * On a non RT kernel this distinction is neither possible nor required. + * spinlock maps to raw_spinlock and the extra code is optimized out by the + * compiler. + */ struct bucket { struct hlist_nulls_head head; - raw_spinlock_t lock; + union { + raw_spinlock_t raw_lock; + spinlock_t lock; + }; }; struct bpf_htab { @@ -65,9 +118,54 @@ struct htab_elem { struct bpf_lru_node lru_node; }; u32 hash; - char key[0] __aligned(8); + char key[] __aligned(8); }; +static inline bool htab_is_prealloc(const struct bpf_htab *htab) +{ + return !(htab->map.map_flags & BPF_F_NO_PREALLOC); +} + +static inline bool htab_use_raw_lock(const struct bpf_htab *htab) +{ + return (!IS_ENABLED(CONFIG_PREEMPT_RT) || htab_is_prealloc(htab)); +} + +static void htab_init_buckets(struct bpf_htab *htab) +{ + unsigned i; + + for (i = 0; i < htab->n_buckets; i++) { + INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i); + if (htab_use_raw_lock(htab)) + raw_spin_lock_init(&htab->buckets[i].raw_lock); + else + spin_lock_init(&htab->buckets[i].lock); + } +} + +static inline unsigned long htab_lock_bucket(const struct bpf_htab *htab, + struct bucket *b) +{ + unsigned long flags; + + if (htab_use_raw_lock(htab)) + raw_spin_lock_irqsave(&b->raw_lock, flags); + else + spin_lock_irqsave(&b->lock, flags); + return flags; +} + +static inline void htab_unlock_bucket(const struct bpf_htab *htab, + struct bucket *b, + unsigned long flags) +{ + if (htab_use_raw_lock(htab)) + raw_spin_unlock_irqrestore(&b->raw_lock, flags); + else + spin_unlock_irqrestore(&b->lock, flags); +} + static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node); static bool htab_is_lru(const struct bpf_htab *htab) @@ -82,11 +180,6 @@ static bool htab_is_percpu(const struct bpf_htab *htab) htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; } -static bool htab_is_prealloc(const struct bpf_htab *htab) -{ - return !(htab->map.map_flags & BPF_F_NO_PREALLOC); -} - static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size, void __percpu *pptr) { @@ -328,8 +421,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); struct bpf_htab *htab; - int err, i; u64 cost; + int err; htab = kzalloc(sizeof(*htab), GFP_USER); if (!htab) @@ -391,10 +484,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) else htab->hashrnd = get_random_int(); - for (i = 0; i < htab->n_buckets; i++) { - INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i); - raw_spin_lock_init(&htab->buckets[i].lock); - } + htab_init_buckets(htab); if (prealloc) { err = prealloc_init(htab); @@ -602,7 +692,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) b = __select_bucket(htab, tgt_l->hash); head = &b->head; - raw_spin_lock_irqsave(&b->lock, flags); + flags = htab_lock_bucket(htab, b); hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) if (l == tgt_l) { @@ -610,7 +700,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) break; } - raw_spin_unlock_irqrestore(&b->lock, flags); + htab_unlock_bucket(htab, b, flags); return l == tgt_l; } @@ -686,15 +776,7 @@ static void htab_elem_free_rcu(struct rcu_head *head) struct htab_elem *l = container_of(head, struct htab_elem, rcu); struct bpf_htab *htab = l->htab; - /* must increment bpf_prog_active to avoid kprobe+bpf triggering while - * we're calling kfree, otherwise deadlock is possible if kprobes - * are placed somewhere inside of slub - */ - preempt_disable(); - __this_cpu_inc(bpf_prog_active); htab_elem_free(htab, l); - __this_cpu_dec(bpf_prog_active); - preempt_enable(); } static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) @@ -884,8 +966,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, */ } - /* bpf_map_update_elem() can be called in_irq() */ - raw_spin_lock_irqsave(&b->lock, flags); + flags = htab_lock_bucket(htab, b); l_old = lookup_elem_raw(head, hash, key, key_size); @@ -926,7 +1007,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, } ret = 0; err: - raw_spin_unlock_irqrestore(&b->lock, flags); + htab_unlock_bucket(htab, b, flags); return ret; } @@ -964,8 +1045,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, return -ENOMEM; memcpy(l_new->key + round_up(map->key_size, 8), value, map->value_size); - /* bpf_map_update_elem() can be called in_irq() */ - raw_spin_lock_irqsave(&b->lock, flags); + flags = htab_lock_bucket(htab, b); l_old = lookup_elem_raw(head, hash, key, key_size); @@ -984,7 +1064,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, ret = 0; err: - raw_spin_unlock_irqrestore(&b->lock, flags); + htab_unlock_bucket(htab, b, flags); if (ret) bpf_lru_push_free(&htab->lru, &l_new->lru_node); @@ -1019,8 +1099,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, b = __select_bucket(htab, hash); head = &b->head; - /* bpf_map_update_elem() can be called in_irq() */ - raw_spin_lock_irqsave(&b->lock, flags); + flags = htab_lock_bucket(htab, b); l_old = lookup_elem_raw(head, hash, key, key_size); @@ -1043,7 +1122,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, } ret = 0; err: - raw_spin_unlock_irqrestore(&b->lock, flags); + htab_unlock_bucket(htab, b, flags); return ret; } @@ -1083,8 +1162,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, return -ENOMEM; } - /* bpf_map_update_elem() can be called in_irq() */ - raw_spin_lock_irqsave(&b->lock, flags); + flags = htab_lock_bucket(htab, b); l_old = lookup_elem_raw(head, hash, key, key_size); @@ -1106,7 +1184,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, } ret = 0; err: - raw_spin_unlock_irqrestore(&b->lock, flags); + htab_unlock_bucket(htab, b, flags); if (l_new) bpf_lru_push_free(&htab->lru, &l_new->lru_node); return ret; @@ -1144,7 +1222,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) b = __select_bucket(htab, hash); head = &b->head; - raw_spin_lock_irqsave(&b->lock, flags); + flags = htab_lock_bucket(htab, b); l = lookup_elem_raw(head, hash, key, key_size); @@ -1154,7 +1232,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) ret = 0; } - raw_spin_unlock_irqrestore(&b->lock, flags); + htab_unlock_bucket(htab, b, flags); return ret; } @@ -1176,7 +1254,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key) b = __select_bucket(htab, hash); head = &b->head; - raw_spin_lock_irqsave(&b->lock, flags); + flags = htab_lock_bucket(htab, b); l = lookup_elem_raw(head, hash, key, key_size); @@ -1185,7 +1263,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key) ret = 0; } - raw_spin_unlock_irqrestore(&b->lock, flags); + htab_unlock_bucket(htab, b, flags); if (l) bpf_lru_push_free(&htab->lru, &l->lru_node); return ret; @@ -1325,8 +1403,7 @@ alloc: } again: - preempt_disable(); - this_cpu_inc(bpf_prog_active); + bpf_disable_instrumentation(); rcu_read_lock(); again_nocopy: dst_key = keys; @@ -1335,7 +1412,7 @@ again_nocopy: head = &b->head; /* do not grab the lock unless need it (bucket_cnt > 0). */ if (locked) - raw_spin_lock_irqsave(&b->lock, flags); + flags = htab_lock_bucket(htab, b); bucket_cnt = 0; hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) @@ -1352,10 +1429,9 @@ again_nocopy: /* Note that since bucket_cnt > 0 here, it is implicit * that the locked was grabbed, so release it. */ - raw_spin_unlock_irqrestore(&b->lock, flags); + htab_unlock_bucket(htab, b, flags); rcu_read_unlock(); - this_cpu_dec(bpf_prog_active); - preempt_enable(); + bpf_enable_instrumentation(); goto after_loop; } @@ -1364,10 +1440,9 @@ again_nocopy: /* Note that since bucket_cnt > 0 here, it is implicit * that the locked was grabbed, so release it. */ - raw_spin_unlock_irqrestore(&b->lock, flags); + htab_unlock_bucket(htab, b, flags); rcu_read_unlock(); - this_cpu_dec(bpf_prog_active); - preempt_enable(); + bpf_enable_instrumentation(); kvfree(keys); kvfree(values); goto alloc; @@ -1418,7 +1493,7 @@ again_nocopy: dst_val += value_size; } - raw_spin_unlock_irqrestore(&b->lock, flags); + htab_unlock_bucket(htab, b, flags); locked = false; while (node_to_free) { @@ -1437,8 +1512,7 @@ next_batch: } rcu_read_unlock(); - this_cpu_dec(bpf_prog_active); - preempt_enable(); + bpf_enable_instrumentation(); if (bucket_cnt && (copy_to_user(ukeys + total * key_size, keys, key_size * bucket_cnt) || copy_to_user(uvalues + total * value_size, values, diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index d8b7b110a1c5..bafc53ddd350 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -12,6 +12,8 @@ #include <linux/filter.h> #include <linux/ctype.h> #include <linux/jiffies.h> +#include <linux/pid_namespace.h> +#include <linux/proc_ns.h> #include "../../lib/kstrtox.h" @@ -338,6 +340,24 @@ const struct bpf_func_proto bpf_get_current_cgroup_id_proto = { .ret_type = RET_INTEGER, }; +BPF_CALL_1(bpf_get_current_ancestor_cgroup_id, int, ancestor_level) +{ + struct cgroup *cgrp = task_dfl_cgroup(current); + struct cgroup *ancestor; + + ancestor = cgroup_ancestor(cgrp, ancestor_level); + if (!ancestor) + return 0; + return cgroup_id(ancestor); +} + +const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = { + .func = bpf_get_current_ancestor_cgroup_id, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, +}; + #ifdef CONFIG_CGROUP_BPF DECLARE_PER_CPU(struct bpf_cgroup_storage*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); @@ -499,3 +519,46 @@ const struct bpf_func_proto bpf_strtoul_proto = { .arg4_type = ARG_PTR_TO_LONG, }; #endif + +BPF_CALL_4(bpf_get_ns_current_pid_tgid, u64, dev, u64, ino, + struct bpf_pidns_info *, nsdata, u32, size) +{ + struct task_struct *task = current; + struct pid_namespace *pidns; + int err = -EINVAL; + + if (unlikely(size != sizeof(struct bpf_pidns_info))) + goto clear; + + if (unlikely((u64)(dev_t)dev != dev)) + goto clear; + + if (unlikely(!task)) + goto clear; + + pidns = task_active_pid_ns(task); + if (unlikely(!pidns)) { + err = -ENOENT; + goto clear; + } + + if (!ns_match(&pidns->ns, (dev_t)dev, ino)) + goto clear; + + nsdata->pid = task_pid_nr_ns(task, pidns); + nsdata->tgid = task_tgid_nr_ns(task, pidns); + return 0; +clear: + memset((void *)nsdata, 0, (size_t) size); + return err; +} + +const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto = { + .func = bpf_get_ns_current_pid_tgid, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, +}; diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 5e40e7fccc21..95087d9f4ed3 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -25,6 +25,7 @@ enum bpf_type { BPF_TYPE_UNSPEC = 0, BPF_TYPE_PROG, BPF_TYPE_MAP, + BPF_TYPE_LINK, }; static void *bpf_any_get(void *raw, enum bpf_type type) @@ -36,6 +37,9 @@ static void *bpf_any_get(void *raw, enum bpf_type type) case BPF_TYPE_MAP: bpf_map_inc_with_uref(raw); break; + case BPF_TYPE_LINK: + bpf_link_inc(raw); + break; default: WARN_ON_ONCE(1); break; @@ -53,6 +57,9 @@ static void bpf_any_put(void *raw, enum bpf_type type) case BPF_TYPE_MAP: bpf_map_put_with_uref(raw); break; + case BPF_TYPE_LINK: + bpf_link_put(raw); + break; default: WARN_ON_ONCE(1); break; @@ -63,20 +70,32 @@ static void *bpf_fd_probe_obj(u32 ufd, enum bpf_type *type) { void *raw; - *type = BPF_TYPE_MAP; raw = bpf_map_get_with_uref(ufd); - if (IS_ERR(raw)) { + if (!IS_ERR(raw)) { + *type = BPF_TYPE_MAP; + return raw; + } + + raw = bpf_prog_get(ufd); + if (!IS_ERR(raw)) { *type = BPF_TYPE_PROG; - raw = bpf_prog_get(ufd); + return raw; } - return raw; + raw = bpf_link_get_from_fd(ufd); + if (!IS_ERR(raw)) { + *type = BPF_TYPE_LINK; + return raw; + } + + return ERR_PTR(-EINVAL); } static const struct inode_operations bpf_dir_iops; static const struct inode_operations bpf_prog_iops = { }; static const struct inode_operations bpf_map_iops = { }; +static const struct inode_operations bpf_link_iops = { }; static struct inode *bpf_get_inode(struct super_block *sb, const struct inode *dir, @@ -114,6 +133,8 @@ static int bpf_inode_type(const struct inode *inode, enum bpf_type *type) *type = BPF_TYPE_PROG; else if (inode->i_op == &bpf_map_iops) *type = BPF_TYPE_MAP; + else if (inode->i_op == &bpf_link_iops) + *type = BPF_TYPE_LINK; else return -EACCES; @@ -335,6 +356,12 @@ static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg) &bpffs_map_fops : &bpffs_obj_fops); } +static int bpf_mklink(struct dentry *dentry, umode_t mode, void *arg) +{ + return bpf_mkobj_ops(dentry, mode, arg, &bpf_link_iops, + &bpffs_obj_fops); +} + static struct dentry * bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags) { @@ -411,6 +438,9 @@ static int bpf_obj_do_pin(const char __user *pathname, void *raw, case BPF_TYPE_MAP: ret = vfs_mkobj(dentry, mode, bpf_mkmap, raw); break; + case BPF_TYPE_LINK: + ret = vfs_mkobj(dentry, mode, bpf_mklink, raw); + break; default: ret = -EPERM; } @@ -487,6 +517,8 @@ int bpf_obj_get_user(const char __user *pathname, int flags) ret = bpf_prog_new_fd(raw); else if (type == BPF_TYPE_MAP) ret = bpf_map_new_fd(raw, f_flags); + else if (type == BPF_TYPE_LINK) + ret = bpf_link_new_fd(raw); else return -ENOENT; @@ -504,6 +536,8 @@ static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type if (inode->i_op == &bpf_map_iops) return ERR_PTR(-EINVAL); + if (inode->i_op == &bpf_link_iops) + return ERR_PTR(-EINVAL); if (inode->i_op != &bpf_prog_iops) return ERR_PTR(-EACCES); diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 56e6c75d354d..65c236cf341e 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -25,7 +25,7 @@ struct lpm_trie_node { struct lpm_trie_node __rcu *child[2]; u32 prefixlen; u32 flags; - u8 data[0]; + u8 data[]; }; struct lpm_trie { @@ -34,7 +34,7 @@ struct lpm_trie { size_t n_entries; size_t max_prefixlen; size_t data_size; - raw_spinlock_t lock; + spinlock_t lock; }; /* This trie implements a longest prefix match algorithm that can be used to @@ -315,7 +315,7 @@ static int trie_update_elem(struct bpf_map *map, if (key->prefixlen > trie->max_prefixlen) return -EINVAL; - raw_spin_lock_irqsave(&trie->lock, irq_flags); + spin_lock_irqsave(&trie->lock, irq_flags); /* Allocate and fill a new node */ @@ -422,7 +422,7 @@ out: kfree(im_node); } - raw_spin_unlock_irqrestore(&trie->lock, irq_flags); + spin_unlock_irqrestore(&trie->lock, irq_flags); return ret; } @@ -442,7 +442,7 @@ static int trie_delete_elem(struct bpf_map *map, void *_key) if (key->prefixlen > trie->max_prefixlen) return -EINVAL; - raw_spin_lock_irqsave(&trie->lock, irq_flags); + spin_lock_irqsave(&trie->lock, irq_flags); /* Walk the tree looking for an exact key/length match and keeping * track of the path we traverse. We will need to know the node @@ -518,7 +518,7 @@ static int trie_delete_elem(struct bpf_map *map, void *_key) kfree_rcu(node, rcu); out: - raw_spin_unlock_irqrestore(&trie->lock, irq_flags); + spin_unlock_irqrestore(&trie->lock, irq_flags); return ret; } @@ -575,7 +575,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) if (ret) goto out_err; - raw_spin_lock_init(&trie->lock); + spin_lock_init(&trie->lock); return &trie->map; out_err: diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c index 6e090140b924..b367430e611c 100644 --- a/kernel/bpf/percpu_freelist.c +++ b/kernel/bpf/percpu_freelist.c @@ -25,12 +25,18 @@ void pcpu_freelist_destroy(struct pcpu_freelist *s) free_percpu(s->freelist); } +static inline void pcpu_freelist_push_node(struct pcpu_freelist_head *head, + struct pcpu_freelist_node *node) +{ + node->next = head->first; + head->first = node; +} + static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head, struct pcpu_freelist_node *node) { raw_spin_lock(&head->lock); - node->next = head->first; - head->first = node; + pcpu_freelist_push_node(head, node); raw_spin_unlock(&head->lock); } @@ -56,21 +62,16 @@ void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size, u32 nr_elems) { struct pcpu_freelist_head *head; - unsigned long flags; int i, cpu, pcpu_entries; pcpu_entries = nr_elems / num_possible_cpus() + 1; i = 0; - /* disable irq to workaround lockdep false positive - * in bpf usage pcpu_freelist_populate() will never race - * with pcpu_freelist_push() - */ - local_irq_save(flags); for_each_possible_cpu(cpu) { again: head = per_cpu_ptr(s->freelist, cpu); - ___pcpu_freelist_push(head, buf); + /* No locking required as this is not visible yet. */ + pcpu_freelist_push_node(head, buf); i++; buf += elem_size; if (i == nr_elems) @@ -78,7 +79,6 @@ again: if (i % pcpu_entries) goto again; } - local_irq_restore(flags); } struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s) diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index 50c083ba978c..01badd3eda7a 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -305,11 +305,6 @@ int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, if (err) goto put_file_unlock; - /* Ensure reuse->reuseport_id is set */ - err = reuseport_get_id(reuse); - if (err < 0) - goto put_file_unlock; - WRITE_ONCE(nsk->sk_user_data, &array->ptrs[index]); rcu_assign_pointer(array->ptrs[index], nsk); free_osk = osk; diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 3f958b90d914..db76339fe358 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -40,6 +40,9 @@ static void do_up_read(struct irq_work *entry) { struct stack_map_irq_work *work; + if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT))) + return; + work = container_of(entry, struct stack_map_irq_work, irq_work); up_read_non_owner(work->sem); work->sem = NULL; @@ -288,10 +291,19 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, struct stack_map_irq_work *work = NULL; if (irqs_disabled()) { - work = this_cpu_ptr(&up_read_work); - if (atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY) - /* cannot queue more up_read, fallback */ + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { + work = this_cpu_ptr(&up_read_work); + if (atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY) { + /* cannot queue more up_read, fallback */ + irq_work_busy = true; + } + } else { + /* + * PREEMPT_RT does not allow to trylock mmap sem in + * interrupt disabled context. Force the fallback code. + */ irq_work_busy = true; + } } /* diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 966b7b34cde0..64783da34202 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -25,6 +25,7 @@ #include <linux/nospec.h> #include <linux/audit.h> #include <uapi/linux/btf.h> +#include <linux/bpf_lsm.h> #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ @@ -171,11 +172,7 @@ static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key, flags); } - /* must increment bpf_prog_active to avoid kprobe+bpf triggering from - * inside bpf map update or delete otherwise deadlocks are possible - */ - preempt_disable(); - __this_cpu_inc(bpf_prog_active); + bpf_disable_instrumentation(); if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { err = bpf_percpu_hash_update(map, key, value, flags); @@ -206,8 +203,7 @@ static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key, err = map->ops->map_update_elem(map, key, value, flags); rcu_read_unlock(); } - __this_cpu_dec(bpf_prog_active); - preempt_enable(); + bpf_enable_instrumentation(); maybe_wait_bpf_programs(map); return err; @@ -222,8 +218,7 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value, if (bpf_map_is_dev_bound(map)) return bpf_map_offload_lookup_elem(map, key, value); - preempt_disable(); - this_cpu_inc(bpf_prog_active); + bpf_disable_instrumentation(); if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { err = bpf_percpu_hash_copy(map, key, value); @@ -268,8 +263,7 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value, rcu_read_unlock(); } - this_cpu_dec(bpf_prog_active); - preempt_enable(); + bpf_enable_instrumentation(); maybe_wait_bpf_programs(map); return err; @@ -911,6 +905,21 @@ void bpf_map_inc_with_uref(struct bpf_map *map) } EXPORT_SYMBOL_GPL(bpf_map_inc_with_uref); +struct bpf_map *bpf_map_get(u32 ufd) +{ + struct fd f = fdget(ufd); + struct bpf_map *map; + + map = __bpf_map_get(f); + if (IS_ERR(map)) + return map; + + bpf_map_inc(map); + fdput(f); + + return map; +} + struct bpf_map *bpf_map_get_with_uref(u32 ufd) { struct fd f = fdget(ufd); @@ -1138,13 +1147,11 @@ static int map_delete_elem(union bpf_attr *attr) goto out; } - preempt_disable(); - __this_cpu_inc(bpf_prog_active); + bpf_disable_instrumentation(); rcu_read_lock(); err = map->ops->map_delete_elem(map, key); rcu_read_unlock(); - __this_cpu_dec(bpf_prog_active); - preempt_enable(); + bpf_enable_instrumentation(); maybe_wait_bpf_programs(map); out: kfree(key); @@ -1256,13 +1263,11 @@ int generic_map_delete_batch(struct bpf_map *map, break; } - preempt_disable(); - __this_cpu_inc(bpf_prog_active); + bpf_disable_instrumentation(); rcu_read_lock(); err = map->ops->map_delete_elem(map, key); rcu_read_unlock(); - __this_cpu_dec(bpf_prog_active); - preempt_enable(); + bpf_enable_instrumentation(); maybe_wait_bpf_programs(map); if (err) break; @@ -1938,6 +1943,7 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, switch (prog_type) { case BPF_PROG_TYPE_TRACING: + case BPF_PROG_TYPE_LSM: case BPF_PROG_TYPE_STRUCT_OPS: case BPF_PROG_TYPE_EXT: break; @@ -2177,84 +2183,288 @@ static int bpf_obj_get(const union bpf_attr *attr) attr->file_flags); } -static int bpf_tracing_prog_release(struct inode *inode, struct file *filp) +void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops, + struct bpf_prog *prog) { - struct bpf_prog *prog = filp->private_data; + atomic64_set(&link->refcnt, 1); + link->ops = ops; + link->prog = prog; +} - WARN_ON_ONCE(bpf_trampoline_unlink_prog(prog)); - bpf_prog_put(prog); +/* Clean up bpf_link and corresponding anon_inode file and FD. After + * anon_inode is created, bpf_link can't be just kfree()'d due to deferred + * anon_inode's release() call. This helper manages marking bpf_link as + * defunct, releases anon_inode file and puts reserved FD. + */ +void bpf_link_cleanup(struct bpf_link *link, struct file *link_file, + int link_fd) +{ + link->prog = NULL; + fput(link_file); + put_unused_fd(link_fd); +} + +void bpf_link_inc(struct bpf_link *link) +{ + atomic64_inc(&link->refcnt); +} + +/* bpf_link_free is guaranteed to be called from process context */ +static void bpf_link_free(struct bpf_link *link) +{ + if (link->prog) { + /* detach BPF program, clean up used resources */ + link->ops->release(link); + bpf_prog_put(link->prog); + } + /* free bpf_link and its containing memory */ + link->ops->dealloc(link); +} + +static void bpf_link_put_deferred(struct work_struct *work) +{ + struct bpf_link *link = container_of(work, struct bpf_link, work); + + bpf_link_free(link); +} + +/* bpf_link_put can be called from atomic context, but ensures that resources + * are freed from process context + */ +void bpf_link_put(struct bpf_link *link) +{ + if (!atomic64_dec_and_test(&link->refcnt)) + return; + + if (in_atomic()) { + INIT_WORK(&link->work, bpf_link_put_deferred); + schedule_work(&link->work); + } else { + bpf_link_free(link); + } +} + +static int bpf_link_release(struct inode *inode, struct file *filp) +{ + struct bpf_link *link = filp->private_data; + + bpf_link_put(link); return 0; } -static const struct file_operations bpf_tracing_prog_fops = { - .release = bpf_tracing_prog_release, +#ifdef CONFIG_PROC_FS +static const struct bpf_link_ops bpf_raw_tp_lops; +static const struct bpf_link_ops bpf_tracing_link_lops; + +static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) +{ + const struct bpf_link *link = filp->private_data; + const struct bpf_prog *prog = link->prog; + char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; + const char *link_type; + + if (link->ops == &bpf_raw_tp_lops) + link_type = "raw_tracepoint"; + else if (link->ops == &bpf_tracing_link_lops) + link_type = "tracing"; +#ifdef CONFIG_CGROUP_BPF + else if (link->ops == &bpf_cgroup_link_lops) + link_type = "cgroup"; +#endif + else + link_type = "unknown"; + + bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); + seq_printf(m, + "link_type:\t%s\n" + "prog_tag:\t%s\n" + "prog_id:\t%u\n", + link_type, + prog_tag, + prog->aux->id); +} +#endif + +const struct file_operations bpf_link_fops = { +#ifdef CONFIG_PROC_FS + .show_fdinfo = bpf_link_show_fdinfo, +#endif + .release = bpf_link_release, .read = bpf_dummy_read, .write = bpf_dummy_write, }; +int bpf_link_new_fd(struct bpf_link *link) +{ + return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC); +} + +/* Similar to bpf_link_new_fd, create anon_inode for given bpf_link, but + * instead of immediately installing fd in fdtable, just reserve it and + * return. Caller then need to either install it with fd_install(fd, file) or + * release with put_unused_fd(fd). + * This is useful for cases when bpf_link attachment/detachment are + * complicated and expensive operations and should be delayed until all the fd + * reservation and anon_inode creation succeeds. + */ +struct file *bpf_link_new_file(struct bpf_link *link, int *reserved_fd) +{ + struct file *file; + int fd; + + fd = get_unused_fd_flags(O_CLOEXEC); + if (fd < 0) + return ERR_PTR(fd); + + file = anon_inode_getfile("bpf_link", &bpf_link_fops, link, O_CLOEXEC); + if (IS_ERR(file)) { + put_unused_fd(fd); + return file; + } + + *reserved_fd = fd; + return file; +} + +struct bpf_link *bpf_link_get_from_fd(u32 ufd) +{ + struct fd f = fdget(ufd); + struct bpf_link *link; + + if (!f.file) + return ERR_PTR(-EBADF); + if (f.file->f_op != &bpf_link_fops) { + fdput(f); + return ERR_PTR(-EINVAL); + } + + link = f.file->private_data; + bpf_link_inc(link); + fdput(f); + + return link; +} + +struct bpf_tracing_link { + struct bpf_link link; +}; + +static void bpf_tracing_link_release(struct bpf_link *link) +{ + WARN_ON_ONCE(bpf_trampoline_unlink_prog(link->prog)); +} + +static void bpf_tracing_link_dealloc(struct bpf_link *link) +{ + struct bpf_tracing_link *tr_link = + container_of(link, struct bpf_tracing_link, link); + + kfree(tr_link); +} + +static const struct bpf_link_ops bpf_tracing_link_lops = { + .release = bpf_tracing_link_release, + .dealloc = bpf_tracing_link_dealloc, +}; + static int bpf_tracing_prog_attach(struct bpf_prog *prog) { - int tr_fd, err; + struct bpf_tracing_link *link; + struct file *link_file; + int link_fd, err; - if (prog->expected_attach_type != BPF_TRACE_FENTRY && - prog->expected_attach_type != BPF_TRACE_FEXIT && - prog->type != BPF_PROG_TYPE_EXT) { + switch (prog->type) { + case BPF_PROG_TYPE_TRACING: + if (prog->expected_attach_type != BPF_TRACE_FENTRY && + prog->expected_attach_type != BPF_TRACE_FEXIT && + prog->expected_attach_type != BPF_MODIFY_RETURN) { + err = -EINVAL; + goto out_put_prog; + } + break; + case BPF_PROG_TYPE_EXT: + if (prog->expected_attach_type != 0) { + err = -EINVAL; + goto out_put_prog; + } + break; + case BPF_PROG_TYPE_LSM: + if (prog->expected_attach_type != BPF_LSM_MAC) { + err = -EINVAL; + goto out_put_prog; + } + break; + default: err = -EINVAL; goto out_put_prog; } - err = bpf_trampoline_link_prog(prog); - if (err) + link = kzalloc(sizeof(*link), GFP_USER); + if (!link) { + err = -ENOMEM; + goto out_put_prog; + } + bpf_link_init(&link->link, &bpf_tracing_link_lops, prog); + + link_file = bpf_link_new_file(&link->link, &link_fd); + if (IS_ERR(link_file)) { + kfree(link); + err = PTR_ERR(link_file); goto out_put_prog; + } - tr_fd = anon_inode_getfd("bpf-tracing-prog", &bpf_tracing_prog_fops, - prog, O_CLOEXEC); - if (tr_fd < 0) { - WARN_ON_ONCE(bpf_trampoline_unlink_prog(prog)); - err = tr_fd; + err = bpf_trampoline_link_prog(prog); + if (err) { + bpf_link_cleanup(&link->link, link_file, link_fd); goto out_put_prog; } - return tr_fd; + + fd_install(link_fd, link_file); + return link_fd; out_put_prog: bpf_prog_put(prog); return err; } -struct bpf_raw_tracepoint { +struct bpf_raw_tp_link { + struct bpf_link link; struct bpf_raw_event_map *btp; - struct bpf_prog *prog; }; -static int bpf_raw_tracepoint_release(struct inode *inode, struct file *filp) +static void bpf_raw_tp_link_release(struct bpf_link *link) { - struct bpf_raw_tracepoint *raw_tp = filp->private_data; + struct bpf_raw_tp_link *raw_tp = + container_of(link, struct bpf_raw_tp_link, link); - if (raw_tp->prog) { - bpf_probe_unregister(raw_tp->btp, raw_tp->prog); - bpf_prog_put(raw_tp->prog); - } + bpf_probe_unregister(raw_tp->btp, raw_tp->link.prog); bpf_put_raw_tracepoint(raw_tp->btp); +} + +static void bpf_raw_tp_link_dealloc(struct bpf_link *link) +{ + struct bpf_raw_tp_link *raw_tp = + container_of(link, struct bpf_raw_tp_link, link); + kfree(raw_tp); - return 0; } -static const struct file_operations bpf_raw_tp_fops = { - .release = bpf_raw_tracepoint_release, - .read = bpf_dummy_read, - .write = bpf_dummy_write, +static const struct bpf_link_ops bpf_raw_tp_lops = { + .release = bpf_raw_tp_link_release, + .dealloc = bpf_raw_tp_link_dealloc, }; #define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd static int bpf_raw_tracepoint_open(const union bpf_attr *attr) { - struct bpf_raw_tracepoint *raw_tp; + struct bpf_raw_tp_link *link; struct bpf_raw_event_map *btp; + struct file *link_file; struct bpf_prog *prog; const char *tp_name; char buf[128]; - int tp_fd, err; + int link_fd, err; if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN)) return -EINVAL; @@ -2263,16 +2473,10 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) if (IS_ERR(prog)) return PTR_ERR(prog); - if (prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT && - prog->type != BPF_PROG_TYPE_TRACING && - prog->type != BPF_PROG_TYPE_EXT && - prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE) { - err = -EINVAL; - goto out_put_prog; - } - - if (prog->type == BPF_PROG_TYPE_TRACING || - prog->type == BPF_PROG_TYPE_EXT) { + switch (prog->type) { + case BPF_PROG_TYPE_TRACING: + case BPF_PROG_TYPE_EXT: + case BPF_PROG_TYPE_LSM: if (attr->raw_tracepoint.name) { /* The attach point for this category of programs * should be specified via btf_id during program load. @@ -2280,11 +2484,14 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) err = -EINVAL; goto out_put_prog; } - if (prog->expected_attach_type == BPF_TRACE_RAW_TP) + if (prog->type == BPF_PROG_TYPE_TRACING && + prog->expected_attach_type == BPF_TRACE_RAW_TP) { tp_name = prog->aux->attach_func_name; - else - return bpf_tracing_prog_attach(prog); - } else { + break; + } + return bpf_tracing_prog_attach(prog); + case BPF_PROG_TYPE_RAW_TRACEPOINT: + case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: if (strncpy_from_user(buf, u64_to_user_ptr(attr->raw_tracepoint.name), sizeof(buf) - 1) < 0) { @@ -2293,6 +2500,10 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) } buf[sizeof(buf) - 1] = 0; tp_name = buf; + break; + default: + err = -EINVAL; + goto out_put_prog; } btp = bpf_get_raw_tracepoint(tp_name); @@ -2301,29 +2512,30 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) goto out_put_prog; } - raw_tp = kzalloc(sizeof(*raw_tp), GFP_USER); - if (!raw_tp) { + link = kzalloc(sizeof(*link), GFP_USER); + if (!link) { err = -ENOMEM; goto out_put_btp; } - raw_tp->btp = btp; - raw_tp->prog = prog; + bpf_link_init(&link->link, &bpf_raw_tp_lops, prog); + link->btp = btp; - err = bpf_probe_register(raw_tp->btp, prog); - if (err) - goto out_free_tp; + link_file = bpf_link_new_file(&link->link, &link_fd); + if (IS_ERR(link_file)) { + kfree(link); + err = PTR_ERR(link_file); + goto out_put_btp; + } - tp_fd = anon_inode_getfd("bpf-raw-tracepoint", &bpf_raw_tp_fops, raw_tp, - O_CLOEXEC); - if (tp_fd < 0) { - bpf_probe_unregister(raw_tp->btp, prog); - err = tp_fd; - goto out_free_tp; + err = bpf_probe_register(link->btp, prog); + if (err) { + bpf_link_cleanup(&link->link, link_file, link_fd); + goto out_put_btp; } - return tp_fd; -out_free_tp: - kfree(raw_tp); + fd_install(link_fd, link_file); + return link_fd; + out_put_btp: bpf_put_raw_tracepoint(btp); out_put_prog: @@ -2348,36 +2560,18 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, } } -#define BPF_PROG_ATTACH_LAST_FIELD replace_bpf_fd - -#define BPF_F_ATTACH_MASK \ - (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI | BPF_F_REPLACE) - -static int bpf_prog_attach(const union bpf_attr *attr) +static enum bpf_prog_type +attach_type_to_prog_type(enum bpf_attach_type attach_type) { - enum bpf_prog_type ptype; - struct bpf_prog *prog; - int ret; - - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - - if (CHECK_ATTR(BPF_PROG_ATTACH)) - return -EINVAL; - - if (attr->attach_flags & ~BPF_F_ATTACH_MASK) - return -EINVAL; - - switch (attr->attach_type) { + switch (attach_type) { case BPF_CGROUP_INET_INGRESS: case BPF_CGROUP_INET_EGRESS: - ptype = BPF_PROG_TYPE_CGROUP_SKB; + return BPF_PROG_TYPE_CGROUP_SKB; break; case BPF_CGROUP_INET_SOCK_CREATE: case BPF_CGROUP_INET4_POST_BIND: case BPF_CGROUP_INET6_POST_BIND: - ptype = BPF_PROG_TYPE_CGROUP_SOCK; - break; + return BPF_PROG_TYPE_CGROUP_SOCK; case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: @@ -2386,37 +2580,53 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: case BPF_CGROUP_UDP6_RECVMSG: - ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; - break; + return BPF_PROG_TYPE_CGROUP_SOCK_ADDR; case BPF_CGROUP_SOCK_OPS: - ptype = BPF_PROG_TYPE_SOCK_OPS; - break; + return BPF_PROG_TYPE_SOCK_OPS; case BPF_CGROUP_DEVICE: - ptype = BPF_PROG_TYPE_CGROUP_DEVICE; - break; + return BPF_PROG_TYPE_CGROUP_DEVICE; case BPF_SK_MSG_VERDICT: - ptype = BPF_PROG_TYPE_SK_MSG; - break; + return BPF_PROG_TYPE_SK_MSG; case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: - ptype = BPF_PROG_TYPE_SK_SKB; - break; + return BPF_PROG_TYPE_SK_SKB; case BPF_LIRC_MODE2: - ptype = BPF_PROG_TYPE_LIRC_MODE2; - break; + return BPF_PROG_TYPE_LIRC_MODE2; case BPF_FLOW_DISSECTOR: - ptype = BPF_PROG_TYPE_FLOW_DISSECTOR; - break; + return BPF_PROG_TYPE_FLOW_DISSECTOR; case BPF_CGROUP_SYSCTL: - ptype = BPF_PROG_TYPE_CGROUP_SYSCTL; - break; + return BPF_PROG_TYPE_CGROUP_SYSCTL; case BPF_CGROUP_GETSOCKOPT: case BPF_CGROUP_SETSOCKOPT: - ptype = BPF_PROG_TYPE_CGROUP_SOCKOPT; - break; + return BPF_PROG_TYPE_CGROUP_SOCKOPT; default: - return -EINVAL; + return BPF_PROG_TYPE_UNSPEC; } +} + +#define BPF_PROG_ATTACH_LAST_FIELD replace_bpf_fd + +#define BPF_F_ATTACH_MASK \ + (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI | BPF_F_REPLACE) + +static int bpf_prog_attach(const union bpf_attr *attr) +{ + enum bpf_prog_type ptype; + struct bpf_prog *prog; + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (CHECK_ATTR(BPF_PROG_ATTACH)) + return -EINVAL; + + if (attr->attach_flags & ~BPF_F_ATTACH_MASK) + return -EINVAL; + + ptype = attach_type_to_prog_type(attr->attach_type); + if (ptype == BPF_PROG_TYPE_UNSPEC) + return -EINVAL; prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); if (IS_ERR(prog)) @@ -2438,8 +2648,17 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_PROG_TYPE_FLOW_DISSECTOR: ret = skb_flow_dissector_bpf_prog_attach(attr, prog); break; - default: + case BPF_PROG_TYPE_CGROUP_DEVICE: + case BPF_PROG_TYPE_CGROUP_SKB: + case BPF_PROG_TYPE_CGROUP_SOCK: + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + case BPF_PROG_TYPE_CGROUP_SOCKOPT: + case BPF_PROG_TYPE_CGROUP_SYSCTL: + case BPF_PROG_TYPE_SOCK_OPS: ret = cgroup_bpf_prog_attach(attr, ptype, prog); + break; + default: + ret = -EINVAL; } if (ret) @@ -2459,53 +2678,27 @@ static int bpf_prog_detach(const union bpf_attr *attr) if (CHECK_ATTR(BPF_PROG_DETACH)) return -EINVAL; - switch (attr->attach_type) { - case BPF_CGROUP_INET_INGRESS: - case BPF_CGROUP_INET_EGRESS: - ptype = BPF_PROG_TYPE_CGROUP_SKB; - break; - case BPF_CGROUP_INET_SOCK_CREATE: - case BPF_CGROUP_INET4_POST_BIND: - case BPF_CGROUP_INET6_POST_BIND: - ptype = BPF_PROG_TYPE_CGROUP_SOCK; - break; - case BPF_CGROUP_INET4_BIND: - case BPF_CGROUP_INET6_BIND: - case BPF_CGROUP_INET4_CONNECT: - case BPF_CGROUP_INET6_CONNECT: - case BPF_CGROUP_UDP4_SENDMSG: - case BPF_CGROUP_UDP6_SENDMSG: - case BPF_CGROUP_UDP4_RECVMSG: - case BPF_CGROUP_UDP6_RECVMSG: - ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; - break; - case BPF_CGROUP_SOCK_OPS: - ptype = BPF_PROG_TYPE_SOCK_OPS; - break; - case BPF_CGROUP_DEVICE: - ptype = BPF_PROG_TYPE_CGROUP_DEVICE; - break; - case BPF_SK_MSG_VERDICT: - return sock_map_get_from_fd(attr, NULL); - case BPF_SK_SKB_STREAM_PARSER: - case BPF_SK_SKB_STREAM_VERDICT: + ptype = attach_type_to_prog_type(attr->attach_type); + + switch (ptype) { + case BPF_PROG_TYPE_SK_MSG: + case BPF_PROG_TYPE_SK_SKB: return sock_map_get_from_fd(attr, NULL); - case BPF_LIRC_MODE2: + case BPF_PROG_TYPE_LIRC_MODE2: return lirc_prog_detach(attr); - case BPF_FLOW_DISSECTOR: + case BPF_PROG_TYPE_FLOW_DISSECTOR: return skb_flow_dissector_bpf_prog_detach(attr); - case BPF_CGROUP_SYSCTL: - ptype = BPF_PROG_TYPE_CGROUP_SYSCTL; - break; - case BPF_CGROUP_GETSOCKOPT: - case BPF_CGROUP_SETSOCKOPT: - ptype = BPF_PROG_TYPE_CGROUP_SOCKOPT; - break; + case BPF_PROG_TYPE_CGROUP_DEVICE: + case BPF_PROG_TYPE_CGROUP_SKB: + case BPF_PROG_TYPE_CGROUP_SOCK: + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + case BPF_PROG_TYPE_CGROUP_SOCKOPT: + case BPF_PROG_TYPE_CGROUP_SYSCTL: + case BPF_PROG_TYPE_SOCK_OPS: + return cgroup_bpf_prog_detach(attr, ptype); default: return -EINVAL; } - - return cgroup_bpf_prog_detach(attr, ptype); } #define BPF_PROG_QUERY_LAST_FIELD query.prog_cnt @@ -2539,7 +2732,7 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_SYSCTL: case BPF_CGROUP_GETSOCKOPT: case BPF_CGROUP_SETSOCKOPT: - break; + return cgroup_bpf_prog_query(attr, uattr); case BPF_LIRC_MODE2: return lirc_prog_query(attr, uattr); case BPF_FLOW_DISSECTOR: @@ -2547,8 +2740,6 @@ static int bpf_prog_query(const union bpf_attr *attr, default: return -EINVAL; } - - return cgroup_bpf_prog_query(attr, uattr); } #define BPF_PROG_TEST_RUN_LAST_FIELD test.ctx_out @@ -3272,15 +3463,21 @@ static int bpf_task_fd_query(const union bpf_attr *attr, if (err) goto out; - if (file->f_op == &bpf_raw_tp_fops) { - struct bpf_raw_tracepoint *raw_tp = file->private_data; - struct bpf_raw_event_map *btp = raw_tp->btp; + if (file->f_op == &bpf_link_fops) { + struct bpf_link *link = file->private_data; - err = bpf_task_fd_query_copy(attr, uattr, - raw_tp->prog->aux->id, - BPF_FD_TYPE_RAW_TRACEPOINT, - btp->tp->name, 0, 0); - goto put_file; + if (link->ops == &bpf_raw_tp_lops) { + struct bpf_raw_tp_link *raw_tp = + container_of(link, struct bpf_raw_tp_link, link); + struct bpf_raw_event_map *btp = raw_tp->btp; + + err = bpf_task_fd_query_copy(attr, uattr, + raw_tp->link.prog->aux->id, + BPF_FD_TYPE_RAW_TRACEPOINT, + btp->tp->name, 0, 0); + goto put_file; + } + goto out_not_supp; } event = perf_get_event(file); @@ -3300,6 +3497,7 @@ static int bpf_task_fd_query(const union bpf_attr *attr, goto put_file; } +out_not_supp: err = -ENOTSUPP; put_file: fput(file); @@ -3362,6 +3560,104 @@ err_put: return err; } +#define BPF_LINK_CREATE_LAST_FIELD link_create.flags +static int link_create(union bpf_attr *attr) +{ + enum bpf_prog_type ptype; + struct bpf_prog *prog; + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (CHECK_ATTR(BPF_LINK_CREATE)) + return -EINVAL; + + ptype = attach_type_to_prog_type(attr->link_create.attach_type); + if (ptype == BPF_PROG_TYPE_UNSPEC) + return -EINVAL; + + prog = bpf_prog_get_type(attr->link_create.prog_fd, ptype); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + ret = bpf_prog_attach_check_attach_type(prog, + attr->link_create.attach_type); + if (ret) + goto err_out; + + switch (ptype) { + case BPF_PROG_TYPE_CGROUP_SKB: + case BPF_PROG_TYPE_CGROUP_SOCK: + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + case BPF_PROG_TYPE_SOCK_OPS: + case BPF_PROG_TYPE_CGROUP_DEVICE: + case BPF_PROG_TYPE_CGROUP_SYSCTL: + case BPF_PROG_TYPE_CGROUP_SOCKOPT: + ret = cgroup_bpf_link_attach(attr, prog); + break; + default: + ret = -EINVAL; + } + +err_out: + if (ret < 0) + bpf_prog_put(prog); + return ret; +} + +#define BPF_LINK_UPDATE_LAST_FIELD link_update.old_prog_fd + +static int link_update(union bpf_attr *attr) +{ + struct bpf_prog *old_prog = NULL, *new_prog; + struct bpf_link *link; + u32 flags; + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (CHECK_ATTR(BPF_LINK_UPDATE)) + return -EINVAL; + + flags = attr->link_update.flags; + if (flags & ~BPF_F_REPLACE) + return -EINVAL; + + link = bpf_link_get_from_fd(attr->link_update.link_fd); + if (IS_ERR(link)) + return PTR_ERR(link); + + new_prog = bpf_prog_get(attr->link_update.new_prog_fd); + if (IS_ERR(new_prog)) + return PTR_ERR(new_prog); + + if (flags & BPF_F_REPLACE) { + old_prog = bpf_prog_get(attr->link_update.old_prog_fd); + if (IS_ERR(old_prog)) { + ret = PTR_ERR(old_prog); + old_prog = NULL; + goto out_put_progs; + } + } + +#ifdef CONFIG_CGROUP_BPF + if (link->ops == &bpf_cgroup_link_lops) { + ret = cgroup_bpf_replace(link, old_prog, new_prog); + goto out_put_progs; + } +#endif + ret = -EINVAL; + +out_put_progs: + if (old_prog) + bpf_prog_put(old_prog); + if (ret) + bpf_prog_put(new_prog); + return ret; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr; @@ -3473,6 +3769,12 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_MAP_DELETE_BATCH: err = bpf_map_do_batch(&attr, uattr, BPF_MAP_DELETE_BATCH); break; + case BPF_LINK_CREATE: + err = link_create(&attr); + break; + case BPF_LINK_UPDATE: + err = link_update(&attr); + break; default: err = -EINVAL; break; diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c index 7ae5dddd1fe6..3b495773de5a 100644 --- a/kernel/bpf/sysfs_btf.c +++ b/kernel/bpf/sysfs_btf.c @@ -9,15 +9,15 @@ #include <linux/sysfs.h> /* See scripts/link-vmlinux.sh, gen_btf() func for details */ -extern char __weak _binary__btf_vmlinux_bin_start[]; -extern char __weak _binary__btf_vmlinux_bin_end[]; +extern char __weak __start_BTF[]; +extern char __weak __stop_BTF[]; static ssize_t btf_vmlinux_read(struct file *file, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t off, size_t len) { - memcpy(buf, _binary__btf_vmlinux_bin_start + off, len); + memcpy(buf, __start_BTF + off, len); return len; } @@ -30,15 +30,14 @@ static struct kobject *btf_kobj; static int __init btf_vmlinux_init(void) { - if (!_binary__btf_vmlinux_bin_start) + if (!__start_BTF) return 0; btf_kobj = kobject_create_and_add("btf", kernel_kobj); if (!btf_kobj) return -ENOMEM; - bin_attr_btf_vmlinux.size = _binary__btf_vmlinux_bin_end - - _binary__btf_vmlinux_bin_start; + bin_attr_btf_vmlinux.size = __stop_BTF - __start_BTF; return sysfs_create_bin_file(btf_kobj, &bin_attr_btf_vmlinux); } diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c index d4f335a9a899..ceac5281bd31 100644 --- a/kernel/bpf/tnum.c +++ b/kernel/bpf/tnum.c @@ -194,3 +194,18 @@ int tnum_sbin(char *str, size_t size, struct tnum a) str[min(size - 1, (size_t)64)] = 0; return 64; } + +struct tnum tnum_subreg(struct tnum a) +{ + return tnum_cast(a, 4); +} + +struct tnum tnum_clear_subreg(struct tnum a) +{ + return tnum_lshift(tnum_rshift(a, 32), 32); +} + +struct tnum tnum_const_subreg(struct tnum a, u32 value) +{ + return tnum_or(tnum_clear_subreg(a), tnum_const(value)); +} diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 6b264a92064b..9be85aa4ec5f 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -5,6 +5,8 @@ #include <linux/filter.h> #include <linux/ftrace.h> #include <linux/rbtree_latch.h> +#include <linux/perf_event.h> +#include <linux/btf.h> /* dummy _ops. The verifier will operate on target program's ops. */ const struct bpf_verifier_ops bpf_extension_verifier_ops = { @@ -17,12 +19,11 @@ const struct bpf_prog_ops bpf_extension_prog_ops = { #define TRAMPOLINE_TABLE_SIZE (1 << TRAMPOLINE_HASH_BITS) static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE]; -static struct latch_tree_root image_tree __cacheline_aligned; -/* serializes access to trampoline_table and image_tree */ +/* serializes access to trampoline_table */ static DEFINE_MUTEX(trampoline_mutex); -static void *bpf_jit_alloc_exec_page(void) +void *bpf_jit_alloc_exec_page(void) { void *image; @@ -38,62 +39,28 @@ static void *bpf_jit_alloc_exec_page(void) return image; } -static __always_inline bool image_tree_less(struct latch_tree_node *a, - struct latch_tree_node *b) +void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym) { - struct bpf_image *ia = container_of(a, struct bpf_image, tnode); - struct bpf_image *ib = container_of(b, struct bpf_image, tnode); - - return ia < ib; -} - -static __always_inline int image_tree_comp(void *addr, struct latch_tree_node *n) -{ - void *image = container_of(n, struct bpf_image, tnode); - - if (addr < image) - return -1; - if (addr >= image + PAGE_SIZE) - return 1; - - return 0; -} - -static const struct latch_tree_ops image_tree_ops = { - .less = image_tree_less, - .comp = image_tree_comp, -}; - -static void *__bpf_image_alloc(bool lock) -{ - struct bpf_image *image; - - image = bpf_jit_alloc_exec_page(); - if (!image) - return NULL; - - if (lock) - mutex_lock(&trampoline_mutex); - latch_tree_insert(&image->tnode, &image_tree, &image_tree_ops); - if (lock) - mutex_unlock(&trampoline_mutex); - return image->data; + ksym->start = (unsigned long) data; + ksym->end = ksym->start + PAGE_SIZE; + bpf_ksym_add(ksym); + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, ksym->start, + PAGE_SIZE, false, ksym->name); } -void *bpf_image_alloc(void) +void bpf_image_ksym_del(struct bpf_ksym *ksym) { - return __bpf_image_alloc(true); + bpf_ksym_del(ksym); + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, ksym->start, + PAGE_SIZE, true, ksym->name); } -bool is_bpf_image_address(unsigned long addr) +static void bpf_trampoline_ksym_add(struct bpf_trampoline *tr) { - bool ret; - - rcu_read_lock(); - ret = latch_tree_find((void *) addr, &image_tree, &image_tree_ops) != NULL; - rcu_read_unlock(); + struct bpf_ksym *ksym = &tr->ksym; - return ret; + snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu", tr->key); + bpf_image_ksym_add(tr->image, ksym); } struct bpf_trampoline *bpf_trampoline_lookup(u64 key) @@ -116,7 +83,7 @@ struct bpf_trampoline *bpf_trampoline_lookup(u64 key) goto out; /* is_root was checked earlier. No need for bpf_jit_charge_modmem() */ - image = __bpf_image_alloc(false); + image = bpf_jit_alloc_exec_page(); if (!image) { kfree(tr); tr = NULL; @@ -131,6 +98,8 @@ struct bpf_trampoline *bpf_trampoline_lookup(u64 key) for (i = 0; i < BPF_TRAMP_MAX; i++) INIT_HLIST_HEAD(&tr->progs_hlist[i]); tr->image = image; + INIT_LIST_HEAD_RCU(&tr->ksym.lnode); + bpf_trampoline_ksym_add(tr); out: mutex_unlock(&trampoline_mutex); return tr; @@ -190,40 +159,50 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr) return ret; } -/* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50 - * bytes on x86. Pick a number to fit into BPF_IMAGE_SIZE / 2 - */ -#define BPF_MAX_TRAMP_PROGS 40 +static struct bpf_tramp_progs * +bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total) +{ + const struct bpf_prog_aux *aux; + struct bpf_tramp_progs *tprogs; + struct bpf_prog **progs; + int kind; + + *total = 0; + tprogs = kcalloc(BPF_TRAMP_MAX, sizeof(*tprogs), GFP_KERNEL); + if (!tprogs) + return ERR_PTR(-ENOMEM); + + for (kind = 0; kind < BPF_TRAMP_MAX; kind++) { + tprogs[kind].nr_progs = tr->progs_cnt[kind]; + *total += tr->progs_cnt[kind]; + progs = tprogs[kind].progs; + + hlist_for_each_entry(aux, &tr->progs_hlist[kind], tramp_hlist) + *progs++ = aux->prog; + } + return tprogs; +} static int bpf_trampoline_update(struct bpf_trampoline *tr) { - void *old_image = tr->image + ((tr->selector + 1) & 1) * BPF_IMAGE_SIZE/2; - void *new_image = tr->image + (tr->selector & 1) * BPF_IMAGE_SIZE/2; - struct bpf_prog *progs_to_run[BPF_MAX_TRAMP_PROGS]; - int fentry_cnt = tr->progs_cnt[BPF_TRAMP_FENTRY]; - int fexit_cnt = tr->progs_cnt[BPF_TRAMP_FEXIT]; - struct bpf_prog **progs, **fentry, **fexit; + void *old_image = tr->image + ((tr->selector + 1) & 1) * PAGE_SIZE/2; + void *new_image = tr->image + (tr->selector & 1) * PAGE_SIZE/2; + struct bpf_tramp_progs *tprogs; u32 flags = BPF_TRAMP_F_RESTORE_REGS; - struct bpf_prog_aux *aux; - int err; + int err, total; + + tprogs = bpf_trampoline_get_progs(tr, &total); + if (IS_ERR(tprogs)) + return PTR_ERR(tprogs); - if (fentry_cnt + fexit_cnt == 0) { + if (total == 0) { err = unregister_fentry(tr, old_image); tr->selector = 0; goto out; } - /* populate fentry progs */ - fentry = progs = progs_to_run; - hlist_for_each_entry(aux, &tr->progs_hlist[BPF_TRAMP_FENTRY], tramp_hlist) - *progs++ = aux->prog; - - /* populate fexit progs */ - fexit = progs; - hlist_for_each_entry(aux, &tr->progs_hlist[BPF_TRAMP_FEXIT], tramp_hlist) - *progs++ = aux->prog; - - if (fexit_cnt) + if (tprogs[BPF_TRAMP_FEXIT].nr_progs || + tprogs[BPF_TRAMP_MODIFY_RETURN].nr_progs) flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME; /* Though the second half of trampoline page is unused a task could be @@ -232,12 +211,11 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) * preempted task. Hence wait for tasks to voluntarily schedule or go * to userspace. */ + synchronize_rcu_tasks(); - err = arch_prepare_bpf_trampoline(new_image, new_image + BPF_IMAGE_SIZE / 2, - &tr->func.model, flags, - fentry, fentry_cnt, - fexit, fexit_cnt, + err = arch_prepare_bpf_trampoline(new_image, new_image + PAGE_SIZE / 2, + &tr->func.model, flags, tprogs, tr->func.addr); if (err < 0) goto out; @@ -252,16 +230,27 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) goto out; tr->selector++; out: + kfree(tprogs); return err; } -static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(enum bpf_attach_type t) +static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog) { - switch (t) { + switch (prog->expected_attach_type) { case BPF_TRACE_FENTRY: return BPF_TRAMP_FENTRY; + case BPF_MODIFY_RETURN: + return BPF_TRAMP_MODIFY_RETURN; case BPF_TRACE_FEXIT: return BPF_TRAMP_FEXIT; + case BPF_LSM_MAC: + if (!prog->aux->attach_func_proto->type) + /* The function returns void, we cannot modify its + * return value. + */ + return BPF_TRAMP_FEXIT; + else + return BPF_TRAMP_MODIFY_RETURN; default: return BPF_TRAMP_REPLACE; } @@ -275,7 +264,7 @@ int bpf_trampoline_link_prog(struct bpf_prog *prog) int cnt; tr = prog->aux->trampoline; - kind = bpf_attach_type_to_tramp(prog->expected_attach_type); + kind = bpf_attach_type_to_tramp(prog); mutex_lock(&tr->mutex); if (tr->extension_prog) { /* cannot attach fentry/fexit if extension prog is attached. @@ -325,7 +314,7 @@ int bpf_trampoline_unlink_prog(struct bpf_prog *prog) int err; tr = prog->aux->trampoline; - kind = bpf_attach_type_to_tramp(prog->expected_attach_type); + kind = bpf_attach_type_to_tramp(prog); mutex_lock(&tr->mutex); if (kind == BPF_TRAMP_REPLACE) { WARN_ON_ONCE(!tr->extension_prog); @@ -344,8 +333,6 @@ out: void bpf_trampoline_put(struct bpf_trampoline *tr) { - struct bpf_image *image; - if (!tr) return; mutex_lock(&trampoline_mutex); @@ -356,35 +343,37 @@ void bpf_trampoline_put(struct bpf_trampoline *tr) goto out; if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT]))) goto out; - image = container_of(tr->image, struct bpf_image, data); - latch_tree_erase(&image->tnode, &image_tree, &image_tree_ops); + bpf_image_ksym_del(&tr->ksym); /* wait for tasks to get out of trampoline before freeing it */ synchronize_rcu_tasks(); - bpf_jit_free_exec(image); + bpf_jit_free_exec(tr->image); hlist_del(&tr->hlist); kfree(tr); out: mutex_unlock(&trampoline_mutex); } -/* The logic is similar to BPF_PROG_RUN, but with explicit rcu and preempt that - * are needed for trampoline. The macro is split into +/* The logic is similar to BPF_PROG_RUN, but with an explicit + * rcu_read_lock() and migrate_disable() which are required + * for the trampoline. The macro is split into * call _bpf_prog_enter * call prog->bpf_func * call __bpf_prog_exit */ u64 notrace __bpf_prog_enter(void) + __acquires(RCU) { u64 start = 0; rcu_read_lock(); - preempt_disable(); + migrate_disable(); if (static_branch_unlikely(&bpf_stats_enabled_key)) start = sched_clock(); return start; } void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start) + __releases(RCU) { struct bpf_prog_stats *stats; @@ -401,15 +390,14 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start) stats->nsecs += sched_clock() - start; u64_stats_update_end(&stats->syncp); } - preempt_enable(); + migrate_enable(); rcu_read_unlock(); } int __weak arch_prepare_bpf_trampoline(void *image, void *image_end, const struct btf_func_model *m, u32 flags, - struct bpf_prog **fentry_progs, int fentry_cnt, - struct bpf_prog **fexit_progs, int fexit_cnt, + struct bpf_tramp_progs *tprogs, void *orig_call) { return -ENOTSUPP; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1cc945daa9c8..04c6630cc18f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -19,6 +19,8 @@ #include <linux/sort.h> #include <linux/perf_event.h> #include <linux/ctype.h> +#include <linux/error-injection.h> +#include <linux/bpf_lsm.h> #include "disasm.h" @@ -227,8 +229,7 @@ struct bpf_call_arg_meta { bool pkt_access; int regno; int access_size; - s64 msize_smax_value; - u64 msize_umax_value; + u64 msize_max_value; int ref_obj_id; int func_id; u32 btf_id; @@ -549,6 +550,22 @@ static void print_verifier_state(struct bpf_verifier_env *env, tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); verbose(env, ",var_off=%s", tn_buf); } + if (reg->s32_min_value != reg->smin_value && + reg->s32_min_value != S32_MIN) + verbose(env, ",s32_min_value=%d", + (int)(reg->s32_min_value)); + if (reg->s32_max_value != reg->smax_value && + reg->s32_max_value != S32_MAX) + verbose(env, ",s32_max_value=%d", + (int)(reg->s32_max_value)); + if (reg->u32_min_value != reg->umin_value && + reg->u32_min_value != U32_MIN) + verbose(env, ",u32_min_value=%d", + (int)(reg->u32_min_value)); + if (reg->u32_max_value != reg->umax_value && + reg->u32_max_value != U32_MAX) + verbose(env, ",u32_max_value=%d", + (int)(reg->u32_max_value)); } verbose(env, ")"); } @@ -923,6 +940,20 @@ static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm) reg->smax_value = (s64)imm; reg->umin_value = imm; reg->umax_value = imm; + + reg->s32_min_value = (s32)imm; + reg->s32_max_value = (s32)imm; + reg->u32_min_value = (u32)imm; + reg->u32_max_value = (u32)imm; +} + +static void __mark_reg32_known(struct bpf_reg_state *reg, u64 imm) +{ + reg->var_off = tnum_const_subreg(reg->var_off, imm); + reg->s32_min_value = (s32)imm; + reg->s32_max_value = (s32)imm; + reg->u32_min_value = (u32)imm; + reg->u32_max_value = (u32)imm; } /* Mark the 'variable offset' part of a register as zero. This should be @@ -977,8 +1008,52 @@ static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg, tnum_equals_const(reg->var_off, 0); } -/* Attempts to improve min/max values based on var_off information */ -static void __update_reg_bounds(struct bpf_reg_state *reg) +/* Reset the min/max bounds of a register */ +static void __mark_reg_unbounded(struct bpf_reg_state *reg) +{ + reg->smin_value = S64_MIN; + reg->smax_value = S64_MAX; + reg->umin_value = 0; + reg->umax_value = U64_MAX; + + reg->s32_min_value = S32_MIN; + reg->s32_max_value = S32_MAX; + reg->u32_min_value = 0; + reg->u32_max_value = U32_MAX; +} + +static void __mark_reg64_unbounded(struct bpf_reg_state *reg) +{ + reg->smin_value = S64_MIN; + reg->smax_value = S64_MAX; + reg->umin_value = 0; + reg->umax_value = U64_MAX; +} + +static void __mark_reg32_unbounded(struct bpf_reg_state *reg) +{ + reg->s32_min_value = S32_MIN; + reg->s32_max_value = S32_MAX; + reg->u32_min_value = 0; + reg->u32_max_value = U32_MAX; +} + +static void __update_reg32_bounds(struct bpf_reg_state *reg) +{ + struct tnum var32_off = tnum_subreg(reg->var_off); + + /* min signed is max(sign bit) | min(other bits) */ + reg->s32_min_value = max_t(s32, reg->s32_min_value, + var32_off.value | (var32_off.mask & S32_MIN)); + /* max signed is min(sign bit) | max(other bits) */ + reg->s32_max_value = min_t(s32, reg->s32_max_value, + var32_off.value | (var32_off.mask & S32_MAX)); + reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)var32_off.value); + reg->u32_max_value = min(reg->u32_max_value, + (u32)(var32_off.value | var32_off.mask)); +} + +static void __update_reg64_bounds(struct bpf_reg_state *reg) { /* min signed is max(sign bit) | min(other bits) */ reg->smin_value = max_t(s64, reg->smin_value, @@ -991,8 +1066,48 @@ static void __update_reg_bounds(struct bpf_reg_state *reg) reg->var_off.value | reg->var_off.mask); } +static void __update_reg_bounds(struct bpf_reg_state *reg) +{ + __update_reg32_bounds(reg); + __update_reg64_bounds(reg); +} + /* Uses signed min/max values to inform unsigned, and vice-versa */ -static void __reg_deduce_bounds(struct bpf_reg_state *reg) +static void __reg32_deduce_bounds(struct bpf_reg_state *reg) +{ + /* Learn sign from signed bounds. + * If we cannot cross the sign boundary, then signed and unsigned bounds + * are the same, so combine. This works even in the negative case, e.g. + * -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff. + */ + if (reg->s32_min_value >= 0 || reg->s32_max_value < 0) { + reg->s32_min_value = reg->u32_min_value = + max_t(u32, reg->s32_min_value, reg->u32_min_value); + reg->s32_max_value = reg->u32_max_value = + min_t(u32, reg->s32_max_value, reg->u32_max_value); + return; + } + /* Learn sign from unsigned bounds. Signed bounds cross the sign + * boundary, so we must be careful. + */ + if ((s32)reg->u32_max_value >= 0) { + /* Positive. We can't learn anything from the smin, but smax + * is positive, hence safe. + */ + reg->s32_min_value = reg->u32_min_value; + reg->s32_max_value = reg->u32_max_value = + min_t(u32, reg->s32_max_value, reg->u32_max_value); + } else if ((s32)reg->u32_min_value < 0) { + /* Negative. We can't learn anything from the smax, but smin + * is negative, hence safe. + */ + reg->s32_min_value = reg->u32_min_value = + max_t(u32, reg->s32_min_value, reg->u32_min_value); + reg->s32_max_value = reg->u32_max_value; + } +} + +static void __reg64_deduce_bounds(struct bpf_reg_state *reg) { /* Learn sign from signed bounds. * If we cannot cross the sign boundary, then signed and unsigned bounds @@ -1026,32 +1141,106 @@ static void __reg_deduce_bounds(struct bpf_reg_state *reg) } } +static void __reg_deduce_bounds(struct bpf_reg_state *reg) +{ + __reg32_deduce_bounds(reg); + __reg64_deduce_bounds(reg); +} + /* Attempts to improve var_off based on unsigned min/max information */ static void __reg_bound_offset(struct bpf_reg_state *reg) { - reg->var_off = tnum_intersect(reg->var_off, - tnum_range(reg->umin_value, - reg->umax_value)); + struct tnum var64_off = tnum_intersect(reg->var_off, + tnum_range(reg->umin_value, + reg->umax_value)); + struct tnum var32_off = tnum_intersect(tnum_subreg(reg->var_off), + tnum_range(reg->u32_min_value, + reg->u32_max_value)); + + reg->var_off = tnum_or(tnum_clear_subreg(var64_off), var32_off); +} + +static void __reg_assign_32_into_64(struct bpf_reg_state *reg) +{ + reg->umin_value = reg->u32_min_value; + reg->umax_value = reg->u32_max_value; + /* Attempt to pull 32-bit signed bounds into 64-bit bounds + * but must be positive otherwise set to worse case bounds + * and refine later from tnum. + */ + if (reg->s32_min_value > 0) + reg->smin_value = reg->s32_min_value; + else + reg->smin_value = 0; + if (reg->s32_max_value > 0) + reg->smax_value = reg->s32_max_value; + else + reg->smax_value = U32_MAX; } -static void __reg_bound_offset32(struct bpf_reg_state *reg) +static void __reg_combine_32_into_64(struct bpf_reg_state *reg) { - u64 mask = 0xffffFFFF; - struct tnum range = tnum_range(reg->umin_value & mask, - reg->umax_value & mask); - struct tnum lo32 = tnum_cast(reg->var_off, 4); - struct tnum hi32 = tnum_lshift(tnum_rshift(reg->var_off, 32), 32); + /* special case when 64-bit register has upper 32-bit register + * zeroed. Typically happens after zext or <<32, >>32 sequence + * allowing us to use 32-bit bounds directly, + */ + if (tnum_equals_const(tnum_clear_subreg(reg->var_off), 0)) { + __reg_assign_32_into_64(reg); + } else { + /* Otherwise the best we can do is push lower 32bit known and + * unknown bits into register (var_off set from jmp logic) + * then learn as much as possible from the 64-bit tnum + * known and unknown bits. The previous smin/smax bounds are + * invalid here because of jmp32 compare so mark them unknown + * so they do not impact tnum bounds calculation. + */ + __mark_reg64_unbounded(reg); + __update_reg_bounds(reg); + } - reg->var_off = tnum_or(hi32, tnum_intersect(lo32, range)); + /* Intersecting with the old var_off might have improved our bounds + * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc), + * then new var_off is (0; 0x7f...fc) which improves our umax. + */ + __reg_deduce_bounds(reg); + __reg_bound_offset(reg); + __update_reg_bounds(reg); } -/* Reset the min/max bounds of a register */ -static void __mark_reg_unbounded(struct bpf_reg_state *reg) +static bool __reg64_bound_s32(s64 a) { - reg->smin_value = S64_MIN; - reg->smax_value = S64_MAX; - reg->umin_value = 0; - reg->umax_value = U64_MAX; + if (a > S32_MIN && a < S32_MAX) + return true; + return false; +} + +static bool __reg64_bound_u32(u64 a) +{ + if (a > U32_MIN && a < U32_MAX) + return true; + return false; +} + +static void __reg_combine_64_into_32(struct bpf_reg_state *reg) +{ + __mark_reg32_unbounded(reg); + + if (__reg64_bound_s32(reg->smin_value)) + reg->s32_min_value = (s32)reg->smin_value; + if (__reg64_bound_s32(reg->smax_value)) + reg->s32_max_value = (s32)reg->smax_value; + if (__reg64_bound_u32(reg->umin_value)) + reg->u32_min_value = (u32)reg->umin_value; + if (__reg64_bound_u32(reg->umax_value)) + reg->u32_max_value = (u32)reg->umax_value; + + /* Intersecting with the old var_off might have improved our bounds + * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc), + * then new var_off is (0; 0x7f...fc) which improves our umax. + */ + __reg_deduce_bounds(reg); + __reg_bound_offset(reg); + __update_reg_bounds(reg); } /* Mark a register as having a completely unknown (scalar) value. */ @@ -2784,6 +2973,12 @@ static int check_tp_buffer_access(struct bpf_verifier_env *env, return 0; } +/* BPF architecture zero extends alu32 ops into 64-bit registesr */ +static void zext_32_to_64(struct bpf_reg_state *reg) +{ + reg->var_off = tnum_subreg(reg->var_off); + __reg_assign_32_into_64(reg); +} /* truncate register to smaller size (in bytes) * must be called with size < BPF_REG_SIZE @@ -2806,6 +3001,14 @@ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size) } reg->smin_value = reg->umin_value; reg->smax_value = reg->umax_value; + + /* If size is smaller than 32bit register the 32bit register + * values are also truncated so we push 64-bit bounds into + * 32-bit bounds. Above were truncated < 32-bits already. + */ + if (size >= 4) + return; + __reg_combine_64_into_32(reg); } static bool bpf_map_is_rdonly(const struct bpf_map *map) @@ -3460,13 +3663,17 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, expected_type = CONST_PTR_TO_MAP; if (type != expected_type) goto err_type; - } else if (arg_type == ARG_PTR_TO_CTX) { + } else if (arg_type == ARG_PTR_TO_CTX || + arg_type == ARG_PTR_TO_CTX_OR_NULL) { expected_type = PTR_TO_CTX; - if (type != expected_type) - goto err_type; - err = check_ctx_reg(env, reg, regno); - if (err < 0) - return err; + if (!(register_is_null(reg) && + arg_type == ARG_PTR_TO_CTX_OR_NULL)) { + if (type != expected_type) + goto err_type; + err = check_ctx_reg(env, reg, regno); + if (err < 0) + return err; + } } else if (arg_type == ARG_PTR_TO_SOCK_COMMON) { expected_type = PTR_TO_SOCK_COMMON; /* Any sk pointer can be ARG_PTR_TO_SOCK_COMMON */ @@ -3576,11 +3783,15 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, } else if (arg_type_is_mem_size(arg_type)) { bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO); - /* remember the mem_size which may be used later - * to refine return values. + /* This is used to refine r0 return value bounds for helpers + * that enforce this value as an upper bound on return values. + * See do_refine_retval_range() for helpers that can refine + * the return value. C type of helper is u32 so we pull register + * bound from umax_value however, if negative verifier errors + * out. Only upper bounds can be learned because retval is an + * int type and negative retvals are allowed. */ - meta->msize_smax_value = reg->smax_value; - meta->msize_umax_value = reg->umax_value; + meta->msize_max_value = reg->umax_value; /* The register is SCALAR_VALUE; the access check * happens using its boundaries. @@ -3649,7 +3860,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (func_id != BPF_FUNC_perf_event_read && func_id != BPF_FUNC_perf_event_output && func_id != BPF_FUNC_skb_output && - func_id != BPF_FUNC_perf_event_read_value) + func_id != BPF_FUNC_perf_event_read_value && + func_id != BPF_FUNC_xdp_output) goto error; break; case BPF_MAP_TYPE_STACK_TRACE: @@ -3693,14 +3905,16 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (func_id != BPF_FUNC_sk_redirect_map && func_id != BPF_FUNC_sock_map_update && func_id != BPF_FUNC_map_delete_elem && - func_id != BPF_FUNC_msg_redirect_map) + func_id != BPF_FUNC_msg_redirect_map && + func_id != BPF_FUNC_sk_select_reuseport) goto error; break; case BPF_MAP_TYPE_SOCKHASH: if (func_id != BPF_FUNC_sk_redirect_hash && func_id != BPF_FUNC_sock_hash_update && func_id != BPF_FUNC_map_delete_elem && - func_id != BPF_FUNC_msg_redirect_hash) + func_id != BPF_FUNC_msg_redirect_hash && + func_id != BPF_FUNC_sk_select_reuseport) goto error; break; case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY: @@ -3737,6 +3951,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_FUNC_perf_event_output: case BPF_FUNC_perf_event_read_value: case BPF_FUNC_skb_output: + case BPF_FUNC_xdp_output: if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY) goto error; break; @@ -3774,7 +3989,9 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, goto error; break; case BPF_FUNC_sk_select_reuseport: - if (map->map_type != BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) + if (map->map_type != BPF_MAP_TYPE_REUSEPORT_SOCKARRAY && + map->map_type != BPF_MAP_TYPE_SOCKMAP && + map->map_type != BPF_MAP_TYPE_SOCKHASH) goto error; break; case BPF_FUNC_map_peek_elem: @@ -4117,10 +4334,11 @@ static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type, func_id != BPF_FUNC_probe_read_str)) return; - ret_reg->smax_value = meta->msize_smax_value; - ret_reg->umax_value = meta->msize_umax_value; + ret_reg->smax_value = meta->msize_max_value; + ret_reg->s32_max_value = meta->msize_max_value; __reg_deduce_bounds(ret_reg); __reg_bound_offset(ret_reg); + __update_reg_bounds(ret_reg); } static int @@ -4427,7 +4645,17 @@ static bool signed_add_overflows(s64 a, s64 b) return res < a; } -static bool signed_sub_overflows(s64 a, s64 b) +static bool signed_add32_overflows(s64 a, s64 b) +{ + /* Do the add in u32, where overflow is well-defined */ + s32 res = (s32)((u32)a + (u32)b); + + if (b < 0) + return res > a; + return res < a; +} + +static bool signed_sub_overflows(s32 a, s32 b) { /* Do the sub in u64, where overflow is well-defined */ s64 res = (s64)((u64)a - (u64)b); @@ -4437,6 +4665,16 @@ static bool signed_sub_overflows(s64 a, s64 b) return res > a; } +static bool signed_sub32_overflows(s32 a, s32 b) +{ + /* Do the sub in u64, where overflow is well-defined */ + s32 res = (s32)((u32)a - (u32)b); + + if (b < 0) + return res < a; + return res > a; +} + static bool check_reg_sane_offset(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, enum bpf_reg_type type) @@ -4673,6 +4911,9 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, !check_reg_sane_offset(env, ptr_reg, ptr_reg->type)) return -EINVAL; + /* pointer types do not carry 32-bit bounds at the moment. */ + __mark_reg32_unbounded(dst_reg); + switch (opcode) { case BPF_ADD: ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0); @@ -4836,6 +5077,518 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, return 0; } +static void scalar32_min_max_add(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + s32 smin_val = src_reg->s32_min_value; + s32 smax_val = src_reg->s32_max_value; + u32 umin_val = src_reg->u32_min_value; + u32 umax_val = src_reg->u32_max_value; + + if (signed_add32_overflows(dst_reg->s32_min_value, smin_val) || + signed_add32_overflows(dst_reg->s32_max_value, smax_val)) { + dst_reg->s32_min_value = S32_MIN; + dst_reg->s32_max_value = S32_MAX; + } else { + dst_reg->s32_min_value += smin_val; + dst_reg->s32_max_value += smax_val; + } + if (dst_reg->u32_min_value + umin_val < umin_val || + dst_reg->u32_max_value + umax_val < umax_val) { + dst_reg->u32_min_value = 0; + dst_reg->u32_max_value = U32_MAX; + } else { + dst_reg->u32_min_value += umin_val; + dst_reg->u32_max_value += umax_val; + } +} + +static void scalar_min_max_add(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + s64 smin_val = src_reg->smin_value; + s64 smax_val = src_reg->smax_value; + u64 umin_val = src_reg->umin_value; + u64 umax_val = src_reg->umax_value; + + if (signed_add_overflows(dst_reg->smin_value, smin_val) || + signed_add_overflows(dst_reg->smax_value, smax_val)) { + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; + } else { + dst_reg->smin_value += smin_val; + dst_reg->smax_value += smax_val; + } + if (dst_reg->umin_value + umin_val < umin_val || + dst_reg->umax_value + umax_val < umax_val) { + dst_reg->umin_value = 0; + dst_reg->umax_value = U64_MAX; + } else { + dst_reg->umin_value += umin_val; + dst_reg->umax_value += umax_val; + } +} + +static void scalar32_min_max_sub(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + s32 smin_val = src_reg->s32_min_value; + s32 smax_val = src_reg->s32_max_value; + u32 umin_val = src_reg->u32_min_value; + u32 umax_val = src_reg->u32_max_value; + + if (signed_sub32_overflows(dst_reg->s32_min_value, smax_val) || + signed_sub32_overflows(dst_reg->s32_max_value, smin_val)) { + /* Overflow possible, we know nothing */ + dst_reg->s32_min_value = S32_MIN; + dst_reg->s32_max_value = S32_MAX; + } else { + dst_reg->s32_min_value -= smax_val; + dst_reg->s32_max_value -= smin_val; + } + if (dst_reg->u32_min_value < umax_val) { + /* Overflow possible, we know nothing */ + dst_reg->u32_min_value = 0; + dst_reg->u32_max_value = U32_MAX; + } else { + /* Cannot overflow (as long as bounds are consistent) */ + dst_reg->u32_min_value -= umax_val; + dst_reg->u32_max_value -= umin_val; + } +} + +static void scalar_min_max_sub(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + s64 smin_val = src_reg->smin_value; + s64 smax_val = src_reg->smax_value; + u64 umin_val = src_reg->umin_value; + u64 umax_val = src_reg->umax_value; + + if (signed_sub_overflows(dst_reg->smin_value, smax_val) || + signed_sub_overflows(dst_reg->smax_value, smin_val)) { + /* Overflow possible, we know nothing */ + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; + } else { + dst_reg->smin_value -= smax_val; + dst_reg->smax_value -= smin_val; + } + if (dst_reg->umin_value < umax_val) { + /* Overflow possible, we know nothing */ + dst_reg->umin_value = 0; + dst_reg->umax_value = U64_MAX; + } else { + /* Cannot overflow (as long as bounds are consistent) */ + dst_reg->umin_value -= umax_val; + dst_reg->umax_value -= umin_val; + } +} + +static void scalar32_min_max_mul(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + s32 smin_val = src_reg->s32_min_value; + u32 umin_val = src_reg->u32_min_value; + u32 umax_val = src_reg->u32_max_value; + + if (smin_val < 0 || dst_reg->s32_min_value < 0) { + /* Ain't nobody got time to multiply that sign */ + __mark_reg32_unbounded(dst_reg); + return; + } + /* Both values are positive, so we can work with unsigned and + * copy the result to signed (unless it exceeds S32_MAX). + */ + if (umax_val > U16_MAX || dst_reg->u32_max_value > U16_MAX) { + /* Potential overflow, we know nothing */ + __mark_reg32_unbounded(dst_reg); + return; + } + dst_reg->u32_min_value *= umin_val; + dst_reg->u32_max_value *= umax_val; + if (dst_reg->u32_max_value > S32_MAX) { + /* Overflow possible, we know nothing */ + dst_reg->s32_min_value = S32_MIN; + dst_reg->s32_max_value = S32_MAX; + } else { + dst_reg->s32_min_value = dst_reg->u32_min_value; + dst_reg->s32_max_value = dst_reg->u32_max_value; + } +} + +static void scalar_min_max_mul(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + s64 smin_val = src_reg->smin_value; + u64 umin_val = src_reg->umin_value; + u64 umax_val = src_reg->umax_value; + + if (smin_val < 0 || dst_reg->smin_value < 0) { + /* Ain't nobody got time to multiply that sign */ + __mark_reg64_unbounded(dst_reg); + return; + } + /* Both values are positive, so we can work with unsigned and + * copy the result to signed (unless it exceeds S64_MAX). + */ + if (umax_val > U32_MAX || dst_reg->umax_value > U32_MAX) { + /* Potential overflow, we know nothing */ + __mark_reg64_unbounded(dst_reg); + return; + } + dst_reg->umin_value *= umin_val; + dst_reg->umax_value *= umax_val; + if (dst_reg->umax_value > S64_MAX) { + /* Overflow possible, we know nothing */ + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; + } else { + dst_reg->smin_value = dst_reg->umin_value; + dst_reg->smax_value = dst_reg->umax_value; + } +} + +static void scalar32_min_max_and(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + bool src_known = tnum_subreg_is_const(src_reg->var_off); + bool dst_known = tnum_subreg_is_const(dst_reg->var_off); + struct tnum var32_off = tnum_subreg(dst_reg->var_off); + s32 smin_val = src_reg->s32_min_value; + u32 umax_val = src_reg->u32_max_value; + + /* Assuming scalar64_min_max_and will be called so its safe + * to skip updating register for known 32-bit case. + */ + if (src_known && dst_known) + return; + + /* We get our minimum from the var_off, since that's inherently + * bitwise. Our maximum is the minimum of the operands' maxima. + */ + dst_reg->u32_min_value = var32_off.value; + dst_reg->u32_max_value = min(dst_reg->u32_max_value, umax_val); + if (dst_reg->s32_min_value < 0 || smin_val < 0) { + /* Lose signed bounds when ANDing negative numbers, + * ain't nobody got time for that. + */ + dst_reg->s32_min_value = S32_MIN; + dst_reg->s32_max_value = S32_MAX; + } else { + /* ANDing two positives gives a positive, so safe to + * cast result into s64. + */ + dst_reg->s32_min_value = dst_reg->u32_min_value; + dst_reg->s32_max_value = dst_reg->u32_max_value; + } + +} + +static void scalar_min_max_and(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + bool src_known = tnum_is_const(src_reg->var_off); + bool dst_known = tnum_is_const(dst_reg->var_off); + s64 smin_val = src_reg->smin_value; + u64 umax_val = src_reg->umax_value; + + if (src_known && dst_known) { + __mark_reg_known(dst_reg, dst_reg->var_off.value & + src_reg->var_off.value); + return; + } + + /* We get our minimum from the var_off, since that's inherently + * bitwise. Our maximum is the minimum of the operands' maxima. + */ + dst_reg->umin_value = dst_reg->var_off.value; + dst_reg->umax_value = min(dst_reg->umax_value, umax_val); + if (dst_reg->smin_value < 0 || smin_val < 0) { + /* Lose signed bounds when ANDing negative numbers, + * ain't nobody got time for that. + */ + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; + } else { + /* ANDing two positives gives a positive, so safe to + * cast result into s64. + */ + dst_reg->smin_value = dst_reg->umin_value; + dst_reg->smax_value = dst_reg->umax_value; + } + /* We may learn something more from the var_off */ + __update_reg_bounds(dst_reg); +} + +static void scalar32_min_max_or(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + bool src_known = tnum_subreg_is_const(src_reg->var_off); + bool dst_known = tnum_subreg_is_const(dst_reg->var_off); + struct tnum var32_off = tnum_subreg(dst_reg->var_off); + s32 smin_val = src_reg->smin_value; + u32 umin_val = src_reg->umin_value; + + /* Assuming scalar64_min_max_or will be called so it is safe + * to skip updating register for known case. + */ + if (src_known && dst_known) + return; + + /* We get our maximum from the var_off, and our minimum is the + * maximum of the operands' minima + */ + dst_reg->u32_min_value = max(dst_reg->u32_min_value, umin_val); + dst_reg->u32_max_value = var32_off.value | var32_off.mask; + if (dst_reg->s32_min_value < 0 || smin_val < 0) { + /* Lose signed bounds when ORing negative numbers, + * ain't nobody got time for that. + */ + dst_reg->s32_min_value = S32_MIN; + dst_reg->s32_max_value = S32_MAX; + } else { + /* ORing two positives gives a positive, so safe to + * cast result into s64. + */ + dst_reg->s32_min_value = dst_reg->umin_value; + dst_reg->s32_max_value = dst_reg->umax_value; + } +} + +static void scalar_min_max_or(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + bool src_known = tnum_is_const(src_reg->var_off); + bool dst_known = tnum_is_const(dst_reg->var_off); + s64 smin_val = src_reg->smin_value; + u64 umin_val = src_reg->umin_value; + + if (src_known && dst_known) { + __mark_reg_known(dst_reg, dst_reg->var_off.value | + src_reg->var_off.value); + return; + } + + /* We get our maximum from the var_off, and our minimum is the + * maximum of the operands' minima + */ + dst_reg->umin_value = max(dst_reg->umin_value, umin_val); + dst_reg->umax_value = dst_reg->var_off.value | dst_reg->var_off.mask; + if (dst_reg->smin_value < 0 || smin_val < 0) { + /* Lose signed bounds when ORing negative numbers, + * ain't nobody got time for that. + */ + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; + } else { + /* ORing two positives gives a positive, so safe to + * cast result into s64. + */ + dst_reg->smin_value = dst_reg->umin_value; + dst_reg->smax_value = dst_reg->umax_value; + } + /* We may learn something more from the var_off */ + __update_reg_bounds(dst_reg); +} + +static void __scalar32_min_max_lsh(struct bpf_reg_state *dst_reg, + u64 umin_val, u64 umax_val) +{ + /* We lose all sign bit information (except what we can pick + * up from var_off) + */ + dst_reg->s32_min_value = S32_MIN; + dst_reg->s32_max_value = S32_MAX; + /* If we might shift our top bit out, then we know nothing */ + if (umax_val > 31 || dst_reg->u32_max_value > 1ULL << (31 - umax_val)) { + dst_reg->u32_min_value = 0; + dst_reg->u32_max_value = U32_MAX; + } else { + dst_reg->u32_min_value <<= umin_val; + dst_reg->u32_max_value <<= umax_val; + } +} + +static void scalar32_min_max_lsh(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + u32 umax_val = src_reg->u32_max_value; + u32 umin_val = src_reg->u32_min_value; + /* u32 alu operation will zext upper bits */ + struct tnum subreg = tnum_subreg(dst_reg->var_off); + + __scalar32_min_max_lsh(dst_reg, umin_val, umax_val); + dst_reg->var_off = tnum_subreg(tnum_lshift(subreg, umin_val)); + /* Not required but being careful mark reg64 bounds as unknown so + * that we are forced to pick them up from tnum and zext later and + * if some path skips this step we are still safe. + */ + __mark_reg64_unbounded(dst_reg); + __update_reg32_bounds(dst_reg); +} + +static void __scalar64_min_max_lsh(struct bpf_reg_state *dst_reg, + u64 umin_val, u64 umax_val) +{ + /* Special case <<32 because it is a common compiler pattern to sign + * extend subreg by doing <<32 s>>32. In this case if 32bit bounds are + * positive we know this shift will also be positive so we can track + * bounds correctly. Otherwise we lose all sign bit information except + * what we can pick up from var_off. Perhaps we can generalize this + * later to shifts of any length. + */ + if (umin_val == 32 && umax_val == 32 && dst_reg->s32_max_value >= 0) + dst_reg->smax_value = (s64)dst_reg->s32_max_value << 32; + else + dst_reg->smax_value = S64_MAX; + + if (umin_val == 32 && umax_val == 32 && dst_reg->s32_min_value >= 0) + dst_reg->smin_value = (s64)dst_reg->s32_min_value << 32; + else + dst_reg->smin_value = S64_MIN; + + /* If we might shift our top bit out, then we know nothing */ + if (dst_reg->umax_value > 1ULL << (63 - umax_val)) { + dst_reg->umin_value = 0; + dst_reg->umax_value = U64_MAX; + } else { + dst_reg->umin_value <<= umin_val; + dst_reg->umax_value <<= umax_val; + } +} + +static void scalar_min_max_lsh(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + u64 umax_val = src_reg->umax_value; + u64 umin_val = src_reg->umin_value; + + /* scalar64 calc uses 32bit unshifted bounds so must be called first */ + __scalar64_min_max_lsh(dst_reg, umin_val, umax_val); + __scalar32_min_max_lsh(dst_reg, umin_val, umax_val); + + dst_reg->var_off = tnum_lshift(dst_reg->var_off, umin_val); + /* We may learn something more from the var_off */ + __update_reg_bounds(dst_reg); +} + +static void scalar32_min_max_rsh(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + struct tnum subreg = tnum_subreg(dst_reg->var_off); + u32 umax_val = src_reg->u32_max_value; + u32 umin_val = src_reg->u32_min_value; + + /* BPF_RSH is an unsigned shift. If the value in dst_reg might + * be negative, then either: + * 1) src_reg might be zero, so the sign bit of the result is + * unknown, so we lose our signed bounds + * 2) it's known negative, thus the unsigned bounds capture the + * signed bounds + * 3) the signed bounds cross zero, so they tell us nothing + * about the result + * If the value in dst_reg is known nonnegative, then again the + * unsigned bounts capture the signed bounds. + * Thus, in all cases it suffices to blow away our signed bounds + * and rely on inferring new ones from the unsigned bounds and + * var_off of the result. + */ + dst_reg->s32_min_value = S32_MIN; + dst_reg->s32_max_value = S32_MAX; + + dst_reg->var_off = tnum_rshift(subreg, umin_val); + dst_reg->u32_min_value >>= umax_val; + dst_reg->u32_max_value >>= umin_val; + + __mark_reg64_unbounded(dst_reg); + __update_reg32_bounds(dst_reg); +} + +static void scalar_min_max_rsh(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + u64 umax_val = src_reg->umax_value; + u64 umin_val = src_reg->umin_value; + + /* BPF_RSH is an unsigned shift. If the value in dst_reg might + * be negative, then either: + * 1) src_reg might be zero, so the sign bit of the result is + * unknown, so we lose our signed bounds + * 2) it's known negative, thus the unsigned bounds capture the + * signed bounds + * 3) the signed bounds cross zero, so they tell us nothing + * about the result + * If the value in dst_reg is known nonnegative, then again the + * unsigned bounts capture the signed bounds. + * Thus, in all cases it suffices to blow away our signed bounds + * and rely on inferring new ones from the unsigned bounds and + * var_off of the result. + */ + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; + dst_reg->var_off = tnum_rshift(dst_reg->var_off, umin_val); + dst_reg->umin_value >>= umax_val; + dst_reg->umax_value >>= umin_val; + + /* Its not easy to operate on alu32 bounds here because it depends + * on bits being shifted in. Take easy way out and mark unbounded + * so we can recalculate later from tnum. + */ + __mark_reg32_unbounded(dst_reg); + __update_reg_bounds(dst_reg); +} + +static void scalar32_min_max_arsh(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + u64 umin_val = src_reg->u32_min_value; + + /* Upon reaching here, src_known is true and + * umax_val is equal to umin_val. + */ + dst_reg->s32_min_value = (u32)(((s32)dst_reg->s32_min_value) >> umin_val); + dst_reg->s32_max_value = (u32)(((s32)dst_reg->s32_max_value) >> umin_val); + + dst_reg->var_off = tnum_arshift(tnum_subreg(dst_reg->var_off), umin_val, 32); + + /* blow away the dst_reg umin_value/umax_value and rely on + * dst_reg var_off to refine the result. + */ + dst_reg->u32_min_value = 0; + dst_reg->u32_max_value = U32_MAX; + + __mark_reg64_unbounded(dst_reg); + __update_reg32_bounds(dst_reg); +} + +static void scalar_min_max_arsh(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + u64 umin_val = src_reg->umin_value; + + /* Upon reaching here, src_known is true and umax_val is equal + * to umin_val. + */ + dst_reg->smin_value >>= umin_val; + dst_reg->smax_value >>= umin_val; + + dst_reg->var_off = tnum_arshift(dst_reg->var_off, umin_val, 64); + + /* blow away the dst_reg umin_value/umax_value and rely on + * dst_reg var_off to refine the result. + */ + dst_reg->umin_value = 0; + dst_reg->umax_value = U64_MAX; + + /* Its not easy to operate on alu32 bounds here because it depends + * on bits being shifted in from upper 32-bits. Take easy way out + * and mark unbounded so we can recalculate later from tnum. + */ + __mark_reg32_unbounded(dst_reg); + __update_reg_bounds(dst_reg); +} + /* WARNING: This function does calculations on 64-bit values, but the actual * execution may occur on 32-bit values. Therefore, things like bitshifts * need extra checks in the 32-bit case. @@ -4850,33 +5603,47 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, bool src_known, dst_known; s64 smin_val, smax_val; u64 umin_val, umax_val; + s32 s32_min_val, s32_max_val; + u32 u32_min_val, u32_max_val; u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32; u32 dst = insn->dst_reg; int ret; - - if (insn_bitness == 32) { - /* Relevant for 32-bit RSH: Information can propagate towards - * LSB, so it isn't sufficient to only truncate the output to - * 32 bits. - */ - coerce_reg_to_size(dst_reg, 4); - coerce_reg_to_size(&src_reg, 4); - } + bool alu32 = (BPF_CLASS(insn->code) != BPF_ALU64); smin_val = src_reg.smin_value; smax_val = src_reg.smax_value; umin_val = src_reg.umin_value; umax_val = src_reg.umax_value; - src_known = tnum_is_const(src_reg.var_off); - dst_known = tnum_is_const(dst_reg->var_off); - if ((src_known && (smin_val != smax_val || umin_val != umax_val)) || - smin_val > smax_val || umin_val > umax_val) { - /* Taint dst register if offset had invalid bounds derived from - * e.g. dead branches. - */ - __mark_reg_unknown(env, dst_reg); - return 0; + s32_min_val = src_reg.s32_min_value; + s32_max_val = src_reg.s32_max_value; + u32_min_val = src_reg.u32_min_value; + u32_max_val = src_reg.u32_max_value; + + if (alu32) { + src_known = tnum_subreg_is_const(src_reg.var_off); + dst_known = tnum_subreg_is_const(dst_reg->var_off); + if ((src_known && + (s32_min_val != s32_max_val || u32_min_val != u32_max_val)) || + s32_min_val > s32_max_val || u32_min_val > u32_max_val) { + /* Taint dst register if offset had invalid bounds + * derived from e.g. dead branches. + */ + __mark_reg_unknown(env, dst_reg); + return 0; + } + } else { + src_known = tnum_is_const(src_reg.var_off); + dst_known = tnum_is_const(dst_reg->var_off); + if ((src_known && + (smin_val != smax_val || umin_val != umax_val)) || + smin_val > smax_val || umin_val > umax_val) { + /* Taint dst register if offset had invalid bounds + * derived from e.g. dead branches. + */ + __mark_reg_unknown(env, dst_reg); + return 0; + } } if (!src_known && @@ -4885,6 +5652,20 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, return 0; } + /* Calculate sign/unsigned bounds and tnum for alu32 and alu64 bit ops. + * There are two classes of instructions: The first class we track both + * alu32 and alu64 sign/unsigned bounds independently this provides the + * greatest amount of precision when alu operations are mixed with jmp32 + * operations. These operations are BPF_ADD, BPF_SUB, BPF_MUL, BPF_ADD, + * and BPF_OR. This is possible because these ops have fairly easy to + * understand and calculate behavior in both 32-bit and 64-bit alu ops. + * See alu32 verifier tests for examples. The second class of + * operations, BPF_LSH, BPF_RSH, and BPF_ARSH, however are not so easy + * with regards to tracking sign/unsigned bounds because the bits may + * cross subreg boundaries in the alu64 case. When this happens we mark + * the reg unbounded in the subreg bound space and use the resulting + * tnum to calculate an approximation of the sign/unsigned bounds. + */ switch (opcode) { case BPF_ADD: ret = sanitize_val_alu(env, insn); @@ -4892,22 +5673,8 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, verbose(env, "R%d tried to add from different pointers or scalars\n", dst); return ret; } - if (signed_add_overflows(dst_reg->smin_value, smin_val) || - signed_add_overflows(dst_reg->smax_value, smax_val)) { - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; - } else { - dst_reg->smin_value += smin_val; - dst_reg->smax_value += smax_val; - } - if (dst_reg->umin_value + umin_val < umin_val || - dst_reg->umax_value + umax_val < umax_val) { - dst_reg->umin_value = 0; - dst_reg->umax_value = U64_MAX; - } else { - dst_reg->umin_value += umin_val; - dst_reg->umax_value += umax_val; - } + scalar32_min_max_add(dst_reg, &src_reg); + scalar_min_max_add(dst_reg, &src_reg); dst_reg->var_off = tnum_add(dst_reg->var_off, src_reg.var_off); break; case BPF_SUB: @@ -4916,111 +5683,24 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, verbose(env, "R%d tried to sub from different pointers or scalars\n", dst); return ret; } - if (signed_sub_overflows(dst_reg->smin_value, smax_val) || - signed_sub_overflows(dst_reg->smax_value, smin_val)) { - /* Overflow possible, we know nothing */ - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; - } else { - dst_reg->smin_value -= smax_val; - dst_reg->smax_value -= smin_val; - } - if (dst_reg->umin_value < umax_val) { - /* Overflow possible, we know nothing */ - dst_reg->umin_value = 0; - dst_reg->umax_value = U64_MAX; - } else { - /* Cannot overflow (as long as bounds are consistent) */ - dst_reg->umin_value -= umax_val; - dst_reg->umax_value -= umin_val; - } + scalar32_min_max_sub(dst_reg, &src_reg); + scalar_min_max_sub(dst_reg, &src_reg); dst_reg->var_off = tnum_sub(dst_reg->var_off, src_reg.var_off); break; case BPF_MUL: dst_reg->var_off = tnum_mul(dst_reg->var_off, src_reg.var_off); - if (smin_val < 0 || dst_reg->smin_value < 0) { - /* Ain't nobody got time to multiply that sign */ - __mark_reg_unbounded(dst_reg); - __update_reg_bounds(dst_reg); - break; - } - /* Both values are positive, so we can work with unsigned and - * copy the result to signed (unless it exceeds S64_MAX). - */ - if (umax_val > U32_MAX || dst_reg->umax_value > U32_MAX) { - /* Potential overflow, we know nothing */ - __mark_reg_unbounded(dst_reg); - /* (except what we can learn from the var_off) */ - __update_reg_bounds(dst_reg); - break; - } - dst_reg->umin_value *= umin_val; - dst_reg->umax_value *= umax_val; - if (dst_reg->umax_value > S64_MAX) { - /* Overflow possible, we know nothing */ - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; - } else { - dst_reg->smin_value = dst_reg->umin_value; - dst_reg->smax_value = dst_reg->umax_value; - } + scalar32_min_max_mul(dst_reg, &src_reg); + scalar_min_max_mul(dst_reg, &src_reg); break; case BPF_AND: - if (src_known && dst_known) { - __mark_reg_known(dst_reg, dst_reg->var_off.value & - src_reg.var_off.value); - break; - } - /* We get our minimum from the var_off, since that's inherently - * bitwise. Our maximum is the minimum of the operands' maxima. - */ dst_reg->var_off = tnum_and(dst_reg->var_off, src_reg.var_off); - dst_reg->umin_value = dst_reg->var_off.value; - dst_reg->umax_value = min(dst_reg->umax_value, umax_val); - if (dst_reg->smin_value < 0 || smin_val < 0) { - /* Lose signed bounds when ANDing negative numbers, - * ain't nobody got time for that. - */ - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; - } else { - /* ANDing two positives gives a positive, so safe to - * cast result into s64. - */ - dst_reg->smin_value = dst_reg->umin_value; - dst_reg->smax_value = dst_reg->umax_value; - } - /* We may learn something more from the var_off */ - __update_reg_bounds(dst_reg); + scalar32_min_max_and(dst_reg, &src_reg); + scalar_min_max_and(dst_reg, &src_reg); break; case BPF_OR: - if (src_known && dst_known) { - __mark_reg_known(dst_reg, dst_reg->var_off.value | - src_reg.var_off.value); - break; - } - /* We get our maximum from the var_off, and our minimum is the - * maximum of the operands' minima - */ dst_reg->var_off = tnum_or(dst_reg->var_off, src_reg.var_off); - dst_reg->umin_value = max(dst_reg->umin_value, umin_val); - dst_reg->umax_value = dst_reg->var_off.value | - dst_reg->var_off.mask; - if (dst_reg->smin_value < 0 || smin_val < 0) { - /* Lose signed bounds when ORing negative numbers, - * ain't nobody got time for that. - */ - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; - } else { - /* ORing two positives gives a positive, so safe to - * cast result into s64. - */ - dst_reg->smin_value = dst_reg->umin_value; - dst_reg->smax_value = dst_reg->umax_value; - } - /* We may learn something more from the var_off */ - __update_reg_bounds(dst_reg); + scalar32_min_max_or(dst_reg, &src_reg); + scalar_min_max_or(dst_reg, &src_reg); break; case BPF_LSH: if (umax_val >= insn_bitness) { @@ -5030,22 +5710,10 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, mark_reg_unknown(env, regs, insn->dst_reg); break; } - /* We lose all sign bit information (except what we can pick - * up from var_off) - */ - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; - /* If we might shift our top bit out, then we know nothing */ - if (dst_reg->umax_value > 1ULL << (63 - umax_val)) { - dst_reg->umin_value = 0; - dst_reg->umax_value = U64_MAX; - } else { - dst_reg->umin_value <<= umin_val; - dst_reg->umax_value <<= umax_val; - } - dst_reg->var_off = tnum_lshift(dst_reg->var_off, umin_val); - /* We may learn something more from the var_off */ - __update_reg_bounds(dst_reg); + if (alu32) + scalar32_min_max_lsh(dst_reg, &src_reg); + else + scalar_min_max_lsh(dst_reg, &src_reg); break; case BPF_RSH: if (umax_val >= insn_bitness) { @@ -5055,27 +5723,10 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, mark_reg_unknown(env, regs, insn->dst_reg); break; } - /* BPF_RSH is an unsigned shift. If the value in dst_reg might - * be negative, then either: - * 1) src_reg might be zero, so the sign bit of the result is - * unknown, so we lose our signed bounds - * 2) it's known negative, thus the unsigned bounds capture the - * signed bounds - * 3) the signed bounds cross zero, so they tell us nothing - * about the result - * If the value in dst_reg is known nonnegative, then again the - * unsigned bounts capture the signed bounds. - * Thus, in all cases it suffices to blow away our signed bounds - * and rely on inferring new ones from the unsigned bounds and - * var_off of the result. - */ - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; - dst_reg->var_off = tnum_rshift(dst_reg->var_off, umin_val); - dst_reg->umin_value >>= umax_val; - dst_reg->umax_value >>= umin_val; - /* We may learn something more from the var_off */ - __update_reg_bounds(dst_reg); + if (alu32) + scalar32_min_max_rsh(dst_reg, &src_reg); + else + scalar_min_max_rsh(dst_reg, &src_reg); break; case BPF_ARSH: if (umax_val >= insn_bitness) { @@ -5085,38 +5736,21 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, mark_reg_unknown(env, regs, insn->dst_reg); break; } - - /* Upon reaching here, src_known is true and - * umax_val is equal to umin_val. - */ - if (insn_bitness == 32) { - dst_reg->smin_value = (u32)(((s32)dst_reg->smin_value) >> umin_val); - dst_reg->smax_value = (u32)(((s32)dst_reg->smax_value) >> umin_val); - } else { - dst_reg->smin_value >>= umin_val; - dst_reg->smax_value >>= umin_val; - } - - dst_reg->var_off = tnum_arshift(dst_reg->var_off, umin_val, - insn_bitness); - - /* blow away the dst_reg umin_value/umax_value and rely on - * dst_reg var_off to refine the result. - */ - dst_reg->umin_value = 0; - dst_reg->umax_value = U64_MAX; - __update_reg_bounds(dst_reg); + if (alu32) + scalar32_min_max_arsh(dst_reg, &src_reg); + else + scalar_min_max_arsh(dst_reg, &src_reg); break; default: mark_reg_unknown(env, regs, insn->dst_reg); break; } - if (BPF_CLASS(insn->code) != BPF_ALU64) { - /* 32-bit ALU ops are (32,32)->32 */ - coerce_reg_to_size(dst_reg, 4); - } + /* ALU32 ops are zero extended into 64bit register */ + if (alu32) + zext_32_to_64(dst_reg); + __update_reg_bounds(dst_reg); __reg_deduce_bounds(dst_reg); __reg_bound_offset(dst_reg); return 0; @@ -5290,7 +5924,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) mark_reg_unknown(env, regs, insn->dst_reg); } - coerce_reg_to_size(dst_reg, 4); + zext_32_to_64(dst_reg); } } else { /* case: R = imm @@ -5460,55 +6094,83 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, new_range); } -/* compute branch direction of the expression "if (reg opcode val) goto target;" - * and return: - * 1 - branch will be taken and "goto target" will be executed - * 0 - branch will not be taken and fall-through to next insn - * -1 - unknown. Example: "if (reg < 5)" is unknown when register value range [0,10] - */ -static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode, - bool is_jmp32) +static int is_branch32_taken(struct bpf_reg_state *reg, u32 val, u8 opcode) { - struct bpf_reg_state reg_lo; - s64 sval; + struct tnum subreg = tnum_subreg(reg->var_off); + s32 sval = (s32)val; - if (__is_pointer_value(false, reg)) - return -1; + switch (opcode) { + case BPF_JEQ: + if (tnum_is_const(subreg)) + return !!tnum_equals_const(subreg, val); + break; + case BPF_JNE: + if (tnum_is_const(subreg)) + return !tnum_equals_const(subreg, val); + break; + case BPF_JSET: + if ((~subreg.mask & subreg.value) & val) + return 1; + if (!((subreg.mask | subreg.value) & val)) + return 0; + break; + case BPF_JGT: + if (reg->u32_min_value > val) + return 1; + else if (reg->u32_max_value <= val) + return 0; + break; + case BPF_JSGT: + if (reg->s32_min_value > sval) + return 1; + else if (reg->s32_max_value < sval) + return 0; + break; + case BPF_JLT: + if (reg->u32_max_value < val) + return 1; + else if (reg->u32_min_value >= val) + return 0; + break; + case BPF_JSLT: + if (reg->s32_max_value < sval) + return 1; + else if (reg->s32_min_value >= sval) + return 0; + break; + case BPF_JGE: + if (reg->u32_min_value >= val) + return 1; + else if (reg->u32_max_value < val) + return 0; + break; + case BPF_JSGE: + if (reg->s32_min_value >= sval) + return 1; + else if (reg->s32_max_value < sval) + return 0; + break; + case BPF_JLE: + if (reg->u32_max_value <= val) + return 1; + else if (reg->u32_min_value > val) + return 0; + break; + case BPF_JSLE: + if (reg->s32_max_value <= sval) + return 1; + else if (reg->s32_min_value > sval) + return 0; + break; + } - if (is_jmp32) { - reg_lo = *reg; - reg = ®_lo; - /* For JMP32, only low 32 bits are compared, coerce_reg_to_size - * could truncate high bits and update umin/umax according to - * information of low bits. - */ - coerce_reg_to_size(reg, 4); - /* smin/smax need special handling. For example, after coerce, - * if smin_value is 0x00000000ffffffffLL, the value is -1 when - * used as operand to JMP32. It is a negative number from s32's - * point of view, while it is a positive number when seen as - * s64. The smin/smax are kept as s64, therefore, when used with - * JMP32, they need to be transformed into s32, then sign - * extended back to s64. - * - * Also, smin/smax were copied from umin/umax. If umin/umax has - * different sign bit, then min/max relationship doesn't - * maintain after casting into s32, for this case, set smin/smax - * to safest range. - */ - if ((reg->umax_value ^ reg->umin_value) & - (1ULL << 31)) { - reg->smin_value = S32_MIN; - reg->smax_value = S32_MAX; - } - reg->smin_value = (s64)(s32)reg->smin_value; - reg->smax_value = (s64)(s32)reg->smax_value; + return -1; +} - val = (u32)val; - sval = (s64)(s32)val; - } else { - sval = (s64)val; - } + +static int is_branch64_taken(struct bpf_reg_state *reg, u64 val, u8 opcode) +{ + s64 sval = (s64)val; switch (opcode) { case BPF_JEQ: @@ -5578,27 +6240,22 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode, return -1; } -/* Generate min value of the high 32-bit from TNUM info. */ -static u64 gen_hi_min(struct tnum var) -{ - return var.value & ~0xffffffffULL; -} - -/* Generate max value of the high 32-bit from TNUM info. */ -static u64 gen_hi_max(struct tnum var) -{ - return (var.value | var.mask) & ~0xffffffffULL; -} - -/* Return true if VAL is compared with a s64 sign extended from s32, and they - * are with the same signedness. +/* compute branch direction of the expression "if (reg opcode val) goto target;" + * and return: + * 1 - branch will be taken and "goto target" will be executed + * 0 - branch will not be taken and fall-through to next insn + * -1 - unknown. Example: "if (reg < 5)" is unknown when register value + * range [0,10] */ -static bool cmp_val_with_extended_s64(s64 sval, struct bpf_reg_state *reg) +static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode, + bool is_jmp32) { - return ((s32)sval >= 0 && - reg->smin_value >= 0 && reg->smax_value <= S32_MAX) || - ((s32)sval < 0 && - reg->smax_value <= 0 && reg->smin_value >= S32_MIN); + if (__is_pointer_value(false, reg)) + return -1; + + if (is_jmp32) + return is_branch32_taken(reg, val, opcode); + return is_branch64_taken(reg, val, opcode); } /* Adjusts the register min/max values in the case that the dst_reg is the @@ -5607,10 +6264,16 @@ static bool cmp_val_with_extended_s64(s64 sval, struct bpf_reg_state *reg) * In JEQ/JNE cases we also adjust the var_off values. */ static void reg_set_min_max(struct bpf_reg_state *true_reg, - struct bpf_reg_state *false_reg, u64 val, + struct bpf_reg_state *false_reg, + u64 val, u32 val32, u8 opcode, bool is_jmp32) { - s64 sval; + struct tnum false_32off = tnum_subreg(false_reg->var_off); + struct tnum false_64off = false_reg->var_off; + struct tnum true_32off = tnum_subreg(true_reg->var_off); + struct tnum true_64off = true_reg->var_off; + s64 sval = (s64)val; + s32 sval32 = (s32)val32; /* If the dst_reg is a pointer, we can't learn anything about its * variable offset from the compare (unless src_reg were a pointer into @@ -5621,9 +6284,6 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, if (__is_pointer_value(false, false_reg)) return; - val = is_jmp32 ? (u32)val : val; - sval = is_jmp32 ? (s64)(s32)val : (s64)val; - switch (opcode) { case BPF_JEQ: case BPF_JNE: @@ -5635,211 +6295,150 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, * if it is true we know the value for sure. Likewise for * BPF_JNE. */ - if (is_jmp32) { - u64 old_v = reg->var_off.value; - u64 hi_mask = ~0xffffffffULL; - - reg->var_off.value = (old_v & hi_mask) | val; - reg->var_off.mask &= hi_mask; - } else { + if (is_jmp32) + __mark_reg32_known(reg, val32); + else __mark_reg_known(reg, val); - } break; } case BPF_JSET: - false_reg->var_off = tnum_and(false_reg->var_off, - tnum_const(~val)); - if (is_power_of_2(val)) - true_reg->var_off = tnum_or(true_reg->var_off, - tnum_const(val)); + if (is_jmp32) { + false_32off = tnum_and(false_32off, tnum_const(~val32)); + if (is_power_of_2(val32)) + true_32off = tnum_or(true_32off, + tnum_const(val32)); + } else { + false_64off = tnum_and(false_64off, tnum_const(~val)); + if (is_power_of_2(val)) + true_64off = tnum_or(true_64off, + tnum_const(val)); + } break; case BPF_JGE: case BPF_JGT: { - u64 false_umax = opcode == BPF_JGT ? val : val - 1; - u64 true_umin = opcode == BPF_JGT ? val + 1 : val; - if (is_jmp32) { - false_umax += gen_hi_max(false_reg->var_off); - true_umin += gen_hi_min(true_reg->var_off); + u32 false_umax = opcode == BPF_JGT ? val32 : val32 - 1; + u32 true_umin = opcode == BPF_JGT ? val32 + 1 : val32; + + false_reg->u32_max_value = min(false_reg->u32_max_value, + false_umax); + true_reg->u32_min_value = max(true_reg->u32_min_value, + true_umin); + } else { + u64 false_umax = opcode == BPF_JGT ? val : val - 1; + u64 true_umin = opcode == BPF_JGT ? val + 1 : val; + + false_reg->umax_value = min(false_reg->umax_value, false_umax); + true_reg->umin_value = max(true_reg->umin_value, true_umin); } - false_reg->umax_value = min(false_reg->umax_value, false_umax); - true_reg->umin_value = max(true_reg->umin_value, true_umin); break; } case BPF_JSGE: case BPF_JSGT: { - s64 false_smax = opcode == BPF_JSGT ? sval : sval - 1; - s64 true_smin = opcode == BPF_JSGT ? sval + 1 : sval; + if (is_jmp32) { + s32 false_smax = opcode == BPF_JSGT ? sval32 : sval32 - 1; + s32 true_smin = opcode == BPF_JSGT ? sval32 + 1 : sval32; - /* If the full s64 was not sign-extended from s32 then don't - * deduct further info. - */ - if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg)) - break; - false_reg->smax_value = min(false_reg->smax_value, false_smax); - true_reg->smin_value = max(true_reg->smin_value, true_smin); + false_reg->s32_max_value = min(false_reg->s32_max_value, false_smax); + true_reg->s32_min_value = max(true_reg->s32_min_value, true_smin); + } else { + s64 false_smax = opcode == BPF_JSGT ? sval : sval - 1; + s64 true_smin = opcode == BPF_JSGT ? sval + 1 : sval; + + false_reg->smax_value = min(false_reg->smax_value, false_smax); + true_reg->smin_value = max(true_reg->smin_value, true_smin); + } break; } case BPF_JLE: case BPF_JLT: { - u64 false_umin = opcode == BPF_JLT ? val : val + 1; - u64 true_umax = opcode == BPF_JLT ? val - 1 : val; - if (is_jmp32) { - false_umin += gen_hi_min(false_reg->var_off); - true_umax += gen_hi_max(true_reg->var_off); + u32 false_umin = opcode == BPF_JLT ? val32 : val32 + 1; + u32 true_umax = opcode == BPF_JLT ? val32 - 1 : val32; + + false_reg->u32_min_value = max(false_reg->u32_min_value, + false_umin); + true_reg->u32_max_value = min(true_reg->u32_max_value, + true_umax); + } else { + u64 false_umin = opcode == BPF_JLT ? val : val + 1; + u64 true_umax = opcode == BPF_JLT ? val - 1 : val; + + false_reg->umin_value = max(false_reg->umin_value, false_umin); + true_reg->umax_value = min(true_reg->umax_value, true_umax); } - false_reg->umin_value = max(false_reg->umin_value, false_umin); - true_reg->umax_value = min(true_reg->umax_value, true_umax); break; } case BPF_JSLE: case BPF_JSLT: { - s64 false_smin = opcode == BPF_JSLT ? sval : sval + 1; - s64 true_smax = opcode == BPF_JSLT ? sval - 1 : sval; + if (is_jmp32) { + s32 false_smin = opcode == BPF_JSLT ? sval32 : sval32 + 1; + s32 true_smax = opcode == BPF_JSLT ? sval32 - 1 : sval32; - if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg)) - break; - false_reg->smin_value = max(false_reg->smin_value, false_smin); - true_reg->smax_value = min(true_reg->smax_value, true_smax); + false_reg->s32_min_value = max(false_reg->s32_min_value, false_smin); + true_reg->s32_max_value = min(true_reg->s32_max_value, true_smax); + } else { + s64 false_smin = opcode == BPF_JSLT ? sval : sval + 1; + s64 true_smax = opcode == BPF_JSLT ? sval - 1 : sval; + + false_reg->smin_value = max(false_reg->smin_value, false_smin); + true_reg->smax_value = min(true_reg->smax_value, true_smax); + } break; } default: - break; + return; } - __reg_deduce_bounds(false_reg); - __reg_deduce_bounds(true_reg); - /* We might have learned some bits from the bounds. */ - __reg_bound_offset(false_reg); - __reg_bound_offset(true_reg); if (is_jmp32) { - __reg_bound_offset32(false_reg); - __reg_bound_offset32(true_reg); + false_reg->var_off = tnum_or(tnum_clear_subreg(false_64off), + tnum_subreg(false_32off)); + true_reg->var_off = tnum_or(tnum_clear_subreg(true_64off), + tnum_subreg(true_32off)); + __reg_combine_32_into_64(false_reg); + __reg_combine_32_into_64(true_reg); + } else { + false_reg->var_off = false_64off; + true_reg->var_off = true_64off; + __reg_combine_64_into_32(false_reg); + __reg_combine_64_into_32(true_reg); } - /* Intersecting with the old var_off might have improved our bounds - * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc), - * then new var_off is (0; 0x7f...fc) which improves our umax. - */ - __update_reg_bounds(false_reg); - __update_reg_bounds(true_reg); } /* Same as above, but for the case that dst_reg holds a constant and src_reg is * the variable reg. */ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, - struct bpf_reg_state *false_reg, u64 val, + struct bpf_reg_state *false_reg, + u64 val, u32 val32, u8 opcode, bool is_jmp32) { - s64 sval; - - if (__is_pointer_value(false, false_reg)) - return; - - val = is_jmp32 ? (u32)val : val; - sval = is_jmp32 ? (s64)(s32)val : (s64)val; - - switch (opcode) { - case BPF_JEQ: - case BPF_JNE: - { - struct bpf_reg_state *reg = - opcode == BPF_JEQ ? true_reg : false_reg; - - if (is_jmp32) { - u64 old_v = reg->var_off.value; - u64 hi_mask = ~0xffffffffULL; - - reg->var_off.value = (old_v & hi_mask) | val; - reg->var_off.mask &= hi_mask; - } else { - __mark_reg_known(reg, val); - } - break; - } - case BPF_JSET: - false_reg->var_off = tnum_and(false_reg->var_off, - tnum_const(~val)); - if (is_power_of_2(val)) - true_reg->var_off = tnum_or(true_reg->var_off, - tnum_const(val)); - break; - case BPF_JGE: - case BPF_JGT: - { - u64 false_umin = opcode == BPF_JGT ? val : val + 1; - u64 true_umax = opcode == BPF_JGT ? val - 1 : val; - - if (is_jmp32) { - false_umin += gen_hi_min(false_reg->var_off); - true_umax += gen_hi_max(true_reg->var_off); - } - false_reg->umin_value = max(false_reg->umin_value, false_umin); - true_reg->umax_value = min(true_reg->umax_value, true_umax); - break; - } - case BPF_JSGE: - case BPF_JSGT: - { - s64 false_smin = opcode == BPF_JSGT ? sval : sval + 1; - s64 true_smax = opcode == BPF_JSGT ? sval - 1 : sval; - - if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg)) - break; - false_reg->smin_value = max(false_reg->smin_value, false_smin); - true_reg->smax_value = min(true_reg->smax_value, true_smax); - break; - } - case BPF_JLE: - case BPF_JLT: - { - u64 false_umax = opcode == BPF_JLT ? val : val - 1; - u64 true_umin = opcode == BPF_JLT ? val + 1 : val; - - if (is_jmp32) { - false_umax += gen_hi_max(false_reg->var_off); - true_umin += gen_hi_min(true_reg->var_off); - } - false_reg->umax_value = min(false_reg->umax_value, false_umax); - true_reg->umin_value = max(true_reg->umin_value, true_umin); - break; - } - case BPF_JSLE: - case BPF_JSLT: - { - s64 false_smax = opcode == BPF_JSLT ? sval : sval - 1; - s64 true_smin = opcode == BPF_JSLT ? sval + 1 : sval; - - if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg)) - break; - false_reg->smax_value = min(false_reg->smax_value, false_smax); - true_reg->smin_value = max(true_reg->smin_value, true_smin); - break; - } - default: - break; - } - - __reg_deduce_bounds(false_reg); - __reg_deduce_bounds(true_reg); - /* We might have learned some bits from the bounds. */ - __reg_bound_offset(false_reg); - __reg_bound_offset(true_reg); - if (is_jmp32) { - __reg_bound_offset32(false_reg); - __reg_bound_offset32(true_reg); - } - /* Intersecting with the old var_off might have improved our bounds - * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc), - * then new var_off is (0; 0x7f...fc) which improves our umax. + /* How can we transform "a <op> b" into "b <op> a"? */ + static const u8 opcode_flip[16] = { + /* these stay the same */ + [BPF_JEQ >> 4] = BPF_JEQ, + [BPF_JNE >> 4] = BPF_JNE, + [BPF_JSET >> 4] = BPF_JSET, + /* these swap "lesser" and "greater" (L and G in the opcodes) */ + [BPF_JGE >> 4] = BPF_JLE, + [BPF_JGT >> 4] = BPF_JLT, + [BPF_JLE >> 4] = BPF_JGE, + [BPF_JLT >> 4] = BPF_JGT, + [BPF_JSGE >> 4] = BPF_JSLE, + [BPF_JSGT >> 4] = BPF_JSLT, + [BPF_JSLE >> 4] = BPF_JSGE, + [BPF_JSLT >> 4] = BPF_JSGT + }; + opcode = opcode_flip[opcode >> 4]; + /* This uses zero as "not present in table"; luckily the zero opcode, + * BPF_JA, can't get here. */ - __update_reg_bounds(false_reg); - __update_reg_bounds(true_reg); + if (opcode) + reg_set_min_max(true_reg, false_reg, val, val32, opcode, is_jmp32); } /* Regs are known to be equal, so intersect their min/max/var_off */ @@ -6128,13 +6727,22 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, dst_reg = ®s[insn->dst_reg]; is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32; - if (BPF_SRC(insn->code) == BPF_K) - pred = is_branch_taken(dst_reg, insn->imm, - opcode, is_jmp32); - else if (src_reg->type == SCALAR_VALUE && - tnum_is_const(src_reg->var_off)) - pred = is_branch_taken(dst_reg, src_reg->var_off.value, - opcode, is_jmp32); + if (BPF_SRC(insn->code) == BPF_K) { + pred = is_branch_taken(dst_reg, insn->imm, opcode, is_jmp32); + } else if (src_reg->type == SCALAR_VALUE && + is_jmp32 && tnum_is_const(tnum_subreg(src_reg->var_off))) { + pred = is_branch_taken(dst_reg, + tnum_subreg(src_reg->var_off).value, + opcode, + is_jmp32); + } else if (src_reg->type == SCALAR_VALUE && + !is_jmp32 && tnum_is_const(src_reg->var_off)) { + pred = is_branch_taken(dst_reg, + src_reg->var_off.value, + opcode, + is_jmp32); + } + if (pred >= 0) { err = mark_chain_precision(env, insn->dst_reg); if (BPF_SRC(insn->code) == BPF_X && !err) @@ -6168,32 +6776,24 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, */ if (BPF_SRC(insn->code) == BPF_X) { struct bpf_reg_state *src_reg = ®s[insn->src_reg]; - struct bpf_reg_state lo_reg0 = *dst_reg; - struct bpf_reg_state lo_reg1 = *src_reg; - struct bpf_reg_state *src_lo, *dst_lo; - - dst_lo = &lo_reg0; - src_lo = &lo_reg1; - coerce_reg_to_size(dst_lo, 4); - coerce_reg_to_size(src_lo, 4); if (dst_reg->type == SCALAR_VALUE && src_reg->type == SCALAR_VALUE) { if (tnum_is_const(src_reg->var_off) || - (is_jmp32 && tnum_is_const(src_lo->var_off))) + (is_jmp32 && + tnum_is_const(tnum_subreg(src_reg->var_off)))) reg_set_min_max(&other_branch_regs[insn->dst_reg], dst_reg, - is_jmp32 - ? src_lo->var_off.value - : src_reg->var_off.value, + src_reg->var_off.value, + tnum_subreg(src_reg->var_off).value, opcode, is_jmp32); else if (tnum_is_const(dst_reg->var_off) || - (is_jmp32 && tnum_is_const(dst_lo->var_off))) + (is_jmp32 && + tnum_is_const(tnum_subreg(dst_reg->var_off)))) reg_set_min_max_inv(&other_branch_regs[insn->src_reg], src_reg, - is_jmp32 - ? dst_lo->var_off.value - : dst_reg->var_off.value, + dst_reg->var_off.value, + tnum_subreg(dst_reg->var_off).value, opcode, is_jmp32); else if (!is_jmp32 && (opcode == BPF_JEQ || opcode == BPF_JNE)) @@ -6204,7 +6804,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, } } else if (dst_reg->type == SCALAR_VALUE) { reg_set_min_max(&other_branch_regs[insn->dst_reg], - dst_reg, insn->imm, opcode, is_jmp32); + dst_reg, insn->imm, (u32)insn->imm, + opcode, is_jmp32); } /* detect if R == 0 where R is returned from bpf_map_lookup_elem(). @@ -6405,8 +7006,9 @@ static int check_return_code(struct bpf_verifier_env *env) struct tnum range = tnum_range(0, 1); int err; - /* The struct_ops func-ptr's return type could be "void" */ - if (env->prog->type == BPF_PROG_TYPE_STRUCT_OPS && + /* LSM and struct_ops func-ptr's return type could be "void" */ + if ((env->prog->type == BPF_PROG_TYPE_STRUCT_OPS || + env->prog->type == BPF_PROG_TYPE_LSM) && !prog->aux->attach_func_proto->type) return 0; @@ -8139,26 +8741,48 @@ static bool is_tracing_prog_type(enum bpf_prog_type type) } } +static bool is_preallocated_map(struct bpf_map *map) +{ + if (!check_map_prealloc(map)) + return false; + if (map->inner_map_meta && !check_map_prealloc(map->inner_map_meta)) + return false; + return true; +} + static int check_map_prog_compatibility(struct bpf_verifier_env *env, struct bpf_map *map, struct bpf_prog *prog) { - /* Make sure that BPF_PROG_TYPE_PERF_EVENT programs only use - * preallocated hash maps, since doing memory allocation - * in overflow_handler can crash depending on where nmi got - * triggered. + /* + * Validate that trace type programs use preallocated hash maps. + * + * For programs attached to PERF events this is mandatory as the + * perf NMI can hit any arbitrary code sequence. + * + * All other trace types using preallocated hash maps are unsafe as + * well because tracepoint or kprobes can be inside locked regions + * of the memory allocator or at a place where a recursion into the + * memory allocator would see inconsistent state. + * + * On RT enabled kernels run-time allocation of all trace type + * programs is strictly prohibited due to lock type constraints. On + * !RT kernels it is allowed for backwards compatibility reasons for + * now, but warnings are emitted so developers are made aware of + * the unsafety and can fix their programs before this is enforced. */ - if (prog->type == BPF_PROG_TYPE_PERF_EVENT) { - if (!check_map_prealloc(map)) { + if (is_tracing_prog_type(prog->type) && !is_preallocated_map(map)) { + if (prog->type == BPF_PROG_TYPE_PERF_EVENT) { verbose(env, "perf_event programs can only use preallocated hash map\n"); return -EINVAL; } - if (map->inner_map_meta && - !check_map_prealloc(map->inner_map_meta)) { - verbose(env, "perf_event programs can only use preallocated inner hash map\n"); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) { + verbose(env, "trace type programs can only use preallocated hash map\n"); return -EINVAL; } + WARN_ONCE(1, "trace type BPF program uses run-time allocation\n"); + verbose(env, "trace type programs with run-time allocated hash maps are unsafe. Switch to preallocated hash maps.\n"); } if ((is_tracing_prog_type(prog->type) || @@ -9774,6 +10398,26 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env) return 0; } +#define SECURITY_PREFIX "security_" + +static int check_attach_modify_return(struct bpf_verifier_env *env) +{ + struct bpf_prog *prog = env->prog; + unsigned long addr = (unsigned long) prog->aux->trampoline->func.addr; + + /* This is expected to be cleaned up in the future with the KRSI effort + * introducing the LSM_HOOK macro for cleaning up lsm_hooks.h. + */ + if (within_error_injection_list(addr) || + !strncmp(SECURITY_PREFIX, prog->aux->attach_func_name, + sizeof(SECURITY_PREFIX) - 1)) + return 0; + + verbose(env, "fmod_ret attach_btf_id %u (%s) is not modifiable\n", + prog->aux->attach_btf_id, prog->aux->attach_func_name); + + return -EINVAL; +} static int check_attach_btf_id(struct bpf_verifier_env *env) { @@ -9794,7 +10438,9 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) if (prog->type == BPF_PROG_TYPE_STRUCT_OPS) return check_struct_ops_btf_id(env); - if (prog->type != BPF_PROG_TYPE_TRACING && !prog_extension) + if (prog->type != BPF_PROG_TYPE_TRACING && + prog->type != BPF_PROG_TYPE_LSM && + !prog_extension) return 0; if (!btf_id) { @@ -9924,8 +10570,17 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) if (!prog_extension) return -EINVAL; /* fallthrough */ + case BPF_MODIFY_RETURN: + case BPF_LSM_MAC: case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: + prog->aux->attach_func_name = tname; + if (prog->type == BPF_PROG_TYPE_LSM) { + ret = bpf_lsm_verify_prog(&env->log, prog); + if (ret < 0) + return ret; + } + if (!btf_type_is_func(t)) { verbose(env, "attach_btf_id %u is not a function\n", btf_id); @@ -9940,7 +10595,6 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) tr = bpf_trampoline_lookup(key); if (!tr) return -ENOMEM; - prog->aux->attach_func_name = tname; /* t is either vmlinux type or another program's type */ prog->aux->attach_func_proto = t; mutex_lock(&tr->mutex); @@ -9973,6 +10627,9 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) } tr->func.addr = (void *)addr; prog->aux->trampoline = tr; + + if (prog->expected_attach_type == BPF_MODIFY_RETURN) + ret = check_attach_modify_return(env); out: mutex_unlock(&tr->mutex); if (ret) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 3dead0416b91..915dda3f7f19 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6303,27 +6303,58 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd) #endif /* CONFIG_SOCK_CGROUP_DATA */ #ifdef CONFIG_CGROUP_BPF -int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, - struct bpf_prog *replace_prog, enum bpf_attach_type type, +int cgroup_bpf_attach(struct cgroup *cgrp, + struct bpf_prog *prog, struct bpf_prog *replace_prog, + struct bpf_cgroup_link *link, + enum bpf_attach_type type, u32 flags) { int ret; mutex_lock(&cgroup_mutex); - ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, type, flags); + ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags); mutex_unlock(&cgroup_mutex); return ret; } + +int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *old_prog, + struct bpf_prog *new_prog) +{ + struct bpf_cgroup_link *cg_link; + int ret; + + if (link->ops != &bpf_cgroup_link_lops) + return -EINVAL; + + cg_link = container_of(link, struct bpf_cgroup_link, link); + + mutex_lock(&cgroup_mutex); + /* link might have been auto-released by dying cgroup, so fail */ + if (!cg_link->cgroup) { + ret = -EINVAL; + goto out_unlock; + } + if (old_prog && link->prog != old_prog) { + ret = -EPERM; + goto out_unlock; + } + ret = __cgroup_bpf_replace(cg_link->cgroup, cg_link, new_prog); +out_unlock: + mutex_unlock(&cgroup_mutex); + return ret; +} + int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, - enum bpf_attach_type type, u32 flags) + enum bpf_attach_type type) { int ret; mutex_lock(&cgroup_mutex); - ret = __cgroup_bpf_detach(cgrp, prog, type); + ret = __cgroup_bpf_detach(cgrp, prog, NULL, type); mutex_unlock(&cgroup_mutex); return ret; } + int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, union bpf_attr __user *uattr) { diff --git a/kernel/events/core.c b/kernel/events/core.c index d22e4ba59dfa..318435c5bf0b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8417,23 +8417,22 @@ static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog, enum perf_bpf_event_type type) { bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD; - char sym[KSYM_NAME_LEN]; int i; if (prog->aux->func_cnt == 0) { - bpf_get_prog_name(prog, sym); perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, (u64)(unsigned long)prog->bpf_func, - prog->jited_len, unregister, sym); + prog->jited_len, unregister, + prog->aux->ksym.name); } else { for (i = 0; i < prog->aux->func_cnt; i++) { struct bpf_prog *subprog = prog->aux->func[i]; - bpf_get_prog_name(subprog, sym); perf_event_ksymbol( PERF_RECORD_KSYMBOL_TYPE_BPF, (u64)(unsigned long)subprog->bpf_func, - subprog->jited_len, unregister, sym); + subprog->jited_len, unregister, + prog->aux->ksym.name); } } } @@ -9368,7 +9367,6 @@ static void bpf_overflow_handler(struct perf_event *event, int ret = 0; ctx.regs = perf_arch_bpf_user_pt_regs(regs); - preempt_disable(); if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) goto out; rcu_read_lock(); @@ -9376,7 +9374,6 @@ static void bpf_overflow_handler(struct perf_event *event, rcu_read_unlock(); out: __this_cpu_dec(bpf_prog_active); - preempt_enable(); if (!ret) return; diff --git a/kernel/extable.c b/kernel/extable.c index a0024f27d3a1..7681f87e89dd 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -149,8 +149,6 @@ int kernel_text_address(unsigned long addr) goto out; if (is_bpf_text_address(addr)) goto out; - if (is_bpf_image_address(addr)) - goto out; ret = 0; out: if (no_rcu) diff --git a/kernel/seccomp.c b/kernel/seccomp.c index ec5c606bc3a1..55a6184f5990 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -268,16 +268,14 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd, * All filters in the list are evaluated and the lowest BPF return * value always takes priority (ignoring the DATA). */ - preempt_disable(); for (; f; f = f->prev) { - u32 cur_ret = BPF_PROG_RUN(f->prog, sd); + u32 cur_ret = bpf_prog_run_pin_on_cpu(f->prog, sd); if (ACTION_ONLY(cur_ret) < ACTION_ONLY(ret)) { ret = cur_ret; *match = f; } } - preempt_enable(); return ret; } #endif /* CONFIG_SECCOMP_FILTER */ diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 68250d433bd7..ca1796747a77 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -83,7 +83,7 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) if (in_nmi()) /* not supported yet */ return 1; - preempt_disable(); + cant_sleep(); if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) { /* @@ -115,11 +115,9 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) out: __this_cpu_dec(bpf_prog_active); - preempt_enable(); return ret; } -EXPORT_SYMBOL_GPL(trace_call_bpf); #ifdef CONFIG_BPF_KPROBE_OVERRIDE BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc) @@ -781,8 +779,8 @@ static const struct bpf_func_proto bpf_send_signal_thread_proto = { .arg1_type = ARG_ANYTHING, }; -static const struct bpf_func_proto * -tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +const struct bpf_func_proto * +bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_map_lookup_elem: @@ -843,6 +841,10 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_send_signal_proto; case BPF_FUNC_send_signal_thread: return &bpf_send_signal_thread_proto; + case BPF_FUNC_perf_event_read_value: + return &bpf_perf_event_read_value_proto; + case BPF_FUNC_get_ns_current_pid_tgid: + return &bpf_get_ns_current_pid_tgid_proto; default: return NULL; } @@ -858,14 +860,12 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_stackid_proto; case BPF_FUNC_get_stack: return &bpf_get_stack_proto; - case BPF_FUNC_perf_event_read_value: - return &bpf_perf_event_read_value_proto; #ifdef CONFIG_BPF_KPROBE_OVERRIDE case BPF_FUNC_override_return: return &bpf_override_return_proto; #endif default: - return tracing_func_proto(func_id, prog); + return bpf_tracing_func_proto(func_id, prog); } } @@ -975,7 +975,7 @@ tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_stack: return &bpf_get_stack_proto_tp; default: - return tracing_func_proto(func_id, prog); + return bpf_tracing_func_proto(func_id, prog); } } @@ -1028,6 +1028,45 @@ static const struct bpf_func_proto bpf_perf_prog_read_value_proto = { .arg3_type = ARG_CONST_SIZE, }; +BPF_CALL_4(bpf_read_branch_records, struct bpf_perf_event_data_kern *, ctx, + void *, buf, u32, size, u64, flags) +{ +#ifndef CONFIG_X86 + return -ENOENT; +#else + static const u32 br_entry_size = sizeof(struct perf_branch_entry); + struct perf_branch_stack *br_stack = ctx->data->br_stack; + u32 to_copy; + + if (unlikely(flags & ~BPF_F_GET_BRANCH_RECORDS_SIZE)) + return -EINVAL; + + if (unlikely(!br_stack)) + return -EINVAL; + + if (flags & BPF_F_GET_BRANCH_RECORDS_SIZE) + return br_stack->nr * br_entry_size; + + if (!buf || (size % br_entry_size != 0)) + return -EINVAL; + + to_copy = min_t(u32, br_stack->nr * br_entry_size, size); + memcpy(buf, br_stack->entries, to_copy); + + return to_copy; +#endif +} + +static const struct bpf_func_proto bpf_read_branch_records_proto = { + .func = bpf_read_branch_records, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM_OR_NULL, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -1040,8 +1079,10 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_stack_proto_tp; case BPF_FUNC_perf_prog_read_value: return &bpf_perf_prog_read_value_proto; + case BPF_FUNC_read_branch_records: + return &bpf_read_branch_records_proto; default: - return tracing_func_proto(func_id, prog); + return bpf_tracing_func_proto(func_id, prog); } } @@ -1104,6 +1145,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = { }; extern const struct bpf_func_proto bpf_skb_output_proto; +extern const struct bpf_func_proto bpf_xdp_output_proto; BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args, struct bpf_map *, map, u64, flags) @@ -1168,7 +1210,7 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_stack: return &bpf_get_stack_proto_raw_tp; default: - return tracing_func_proto(func_id, prog); + return bpf_tracing_func_proto(func_id, prog); } } @@ -1179,6 +1221,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #ifdef CONFIG_NET case BPF_FUNC_skb_output: return &bpf_skb_output_proto; + case BPF_FUNC_xdp_output: + return &bpf_xdp_output_proto; #endif default: return raw_tp_prog_func_proto(func_id, prog); @@ -1213,6 +1257,13 @@ static bool tracing_prog_is_valid_access(int off, int size, return btf_ctx_access(off, size, type, prog, info); } +int __weak bpf_prog_test_run_tracing(struct bpf_prog *prog, + const union bpf_attr *kattr, + union bpf_attr __user *uattr) +{ + return -ENOTSUPP; +} + const struct bpf_verifier_ops raw_tracepoint_verifier_ops = { .get_func_proto = raw_tp_prog_func_proto, .is_valid_access = raw_tp_prog_is_valid_access, @@ -1227,6 +1278,7 @@ const struct bpf_verifier_ops tracing_verifier_ops = { }; const struct bpf_prog_ops tracing_prog_ops = { + .test_run = bpf_prog_test_run_tracing, }; static bool raw_tp_writable_prog_is_valid_access(int off, int size, @@ -1475,10 +1527,9 @@ void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp) static __always_inline void __bpf_trace_run(struct bpf_prog *prog, u64 *args) { + cant_sleep(); rcu_read_lock(); - preempt_disable(); (void) BPF_PROG_RUN(prog, args); - preempt_enable(); rcu_read_unlock(); } diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 18d16f3ef980..2a8e8e9c1c75 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1333,8 +1333,15 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, int size, esize; int rctx; - if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs)) - return; + if (bpf_prog_array_valid(call)) { + u32 ret; + + preempt_disable(); + ret = trace_call_bpf(call, regs); + preempt_enable(); + if (!ret) + return; + } esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); |