diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/bpf/Makefile | 3 | ||||
-rw-r--r-- | kernel/bpf/btf.c | 16 | ||||
-rw-r--r-- | kernel/bpf/cgroup.c | 27 | ||||
-rw-r--r-- | kernel/bpf/devmap.c | 339 | ||||
-rw-r--r-- | kernel/bpf/syscall.c | 21 | ||||
-rw-r--r-- | kernel/bpf/sysfs_btf.c | 46 | ||||
-rw-r--r-- | kernel/bpf/verifier.c | 11 | ||||
-rw-r--r-- | kernel/bpf/xskmap.c | 133 | ||||
-rw-r--r-- | kernel/trace/Kconfig | 3 |
9 files changed, 511 insertions, 88 deletions
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 29d781061cd5..e1d9adb212f9 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -22,3 +22,6 @@ obj-$(CONFIG_CGROUP_BPF) += cgroup.o ifeq ($(CONFIG_INET),y) obj-$(CONFIG_BPF_SYSCALL) += reuseport_array.o endif +ifeq ($(CONFIG_SYSFS),y) +obj-$(CONFIG_DEBUG_INFO_BTF) += sysfs_btf.o +endif diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 5fcc7a17eb5a..adb3adcebe3c 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -195,8 +195,8 @@ i < btf_type_vlen(struct_type); \ i++, member++) -static DEFINE_IDR(btf_idr); -static DEFINE_SPINLOCK(btf_idr_lock); +DEFINE_IDR(btf_idr); +DEFINE_SPINLOCK(btf_idr_lock); struct btf { void *data; @@ -3376,6 +3376,15 @@ void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, btf_type_ops(t)->seq_show(btf, t, type_id, obj, 0, m); } +#ifdef CONFIG_PROC_FS +static void bpf_btf_show_fdinfo(struct seq_file *m, struct file *filp) +{ + const struct btf *btf = filp->private_data; + + seq_printf(m, "btf_id:\t%u\n", btf->id); +} +#endif + static int btf_release(struct inode *inode, struct file *filp) { btf_put(filp->private_data); @@ -3383,6 +3392,9 @@ static int btf_release(struct inode *inode, struct file *filp) } const struct file_operations btf_fops = { +#ifdef CONFIG_PROC_FS + .show_fdinfo = bpf_btf_show_fdinfo, +#endif .release = btf_release, }; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 0a00eaca6fae..ddd8addcdb5c 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -964,7 +964,6 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen) return -ENOMEM; ctx->optval_end = ctx->optval + max_optlen; - ctx->optlen = max_optlen; return 0; } @@ -984,7 +983,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, .level = *level, .optname = *optname, }; - int ret; + int ret, max_optlen; /* Opportunistic check to see whether we have any BPF program * attached to the hook so we don't waste time allocating @@ -994,10 +993,18 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT)) return 0; - ret = sockopt_alloc_buf(&ctx, *optlen); + /* Allocate a bit more than the initial user buffer for + * BPF program. The canonical use case is overriding + * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic). + */ + max_optlen = max_t(int, 16, *optlen); + + ret = sockopt_alloc_buf(&ctx, max_optlen); if (ret) return ret; + ctx.optlen = *optlen; + if (copy_from_user(ctx.optval, optval, *optlen) != 0) { ret = -EFAULT; goto out; @@ -1016,7 +1023,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, if (ctx.optlen == -1) { /* optlen set to -1, bypass kernel */ ret = 1; - } else if (ctx.optlen > *optlen || ctx.optlen < -1) { + } else if (ctx.optlen > max_optlen || ctx.optlen < -1) { /* optlen is out of bounds */ ret = -EFAULT; } else { @@ -1063,6 +1070,8 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, if (ret) return ret; + ctx.optlen = max_optlen; + if (!retval) { /* If kernel getsockopt finished successfully, * copy whatever was returned to the user back @@ -1325,6 +1334,7 @@ static u32 sysctl_convert_ctx_access(enum bpf_access_type type, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; + u32 read_size; switch (si->off) { case offsetof(struct bpf_sysctl, write): @@ -1356,7 +1366,9 @@ static u32 sysctl_convert_ctx_access(enum bpf_access_type type, treg, si->dst_reg, offsetof(struct bpf_sysctl_kern, ppos)); *insn++ = BPF_STX_MEM( - BPF_SIZEOF(u32), treg, si->src_reg, 0); + BPF_SIZEOF(u32), treg, si->src_reg, + bpf_ctx_narrow_access_offset( + 0, sizeof(u32), sizeof(loff_t))); *insn++ = BPF_LDX_MEM( BPF_DW, treg, si->dst_reg, offsetof(struct bpf_sysctl_kern, tmp_reg)); @@ -1365,8 +1377,11 @@ static u32 sysctl_convert_ctx_access(enum bpf_access_type type, BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos), si->dst_reg, si->src_reg, offsetof(struct bpf_sysctl_kern, ppos)); + read_size = bpf_size_to_bytes(BPF_SIZE(si->code)); *insn++ = BPF_LDX_MEM( - BPF_SIZE(si->code), si->dst_reg, si->dst_reg, 0); + BPF_SIZE(si->code), si->dst_reg, si->dst_reg, + bpf_ctx_narrow_access_offset( + 0, read_size, sizeof(loff_t))); } *target_size = sizeof(u32); break; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index d83cf8ccc872..d27f3b60ff6d 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -37,6 +37,12 @@ * notifier hook walks the map we know that new dev references can not be * added by the user because core infrastructure ensures dev_get_by_index() * calls will fail at this point. + * + * The devmap_hash type is a map type which interprets keys as ifindexes and + * indexes these using a hashmap. This allows maps that use ifindex as key to be + * densely packed instead of having holes in the lookup array for unused + * ifindexes. The setup and packet enqueue/send code is shared between the two + * types of devmap; only the lookup and insertion is different. */ #include <linux/bpf.h> #include <net/xdp.h> @@ -59,10 +65,11 @@ struct xdp_bulk_queue { struct bpf_dtab_netdev { struct net_device *dev; /* must be first member, due to tracepoint */ + struct hlist_node index_hlist; struct bpf_dtab *dtab; - unsigned int bit; struct xdp_bulk_queue __percpu *bulkq; struct rcu_head rcu; + unsigned int idx; /* keep track of map index for tracepoint */ }; struct bpf_dtab { @@ -70,33 +77,45 @@ struct bpf_dtab { struct bpf_dtab_netdev **netdev_map; struct list_head __percpu *flush_list; struct list_head list; + + /* these are only used for DEVMAP_HASH type maps */ + struct hlist_head *dev_index_head; + spinlock_t index_lock; + unsigned int items; + u32 n_buckets; }; static DEFINE_SPINLOCK(dev_map_lock); static LIST_HEAD(dev_map_list); -static struct bpf_map *dev_map_alloc(union bpf_attr *attr) +static struct hlist_head *dev_map_create_hash(unsigned int entries) +{ + int i; + struct hlist_head *hash; + + hash = kmalloc_array(entries, sizeof(*hash), GFP_KERNEL); + if (hash != NULL) + for (i = 0; i < entries; i++) + INIT_HLIST_HEAD(&hash[i]); + + return hash; +} + +static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) { - struct bpf_dtab *dtab; int err, cpu; u64 cost; - if (!capable(CAP_NET_ADMIN)) - return ERR_PTR(-EPERM); - /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK) - return ERR_PTR(-EINVAL); + return -EINVAL; /* Lookup returns a pointer straight to dev->ifindex, so make sure the * verifier prevents writes from the BPF side */ attr->map_flags |= BPF_F_RDONLY_PROG; - dtab = kzalloc(sizeof(*dtab), GFP_USER); - if (!dtab) - return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&dtab->map, attr); @@ -104,12 +123,18 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); cost += sizeof(struct list_head) * num_possible_cpus(); + if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { + dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries); + + if (!dtab->n_buckets) /* Overflow check */ + return -EINVAL; + cost += sizeof(struct hlist_head) * dtab->n_buckets; + } + /* if map size is larger than memlock limit, reject it */ err = bpf_map_charge_init(&dtab->map.memory, cost); if (err) - goto free_dtab; - - err = -ENOMEM; + return -EINVAL; dtab->flush_list = alloc_percpu(struct list_head); if (!dtab->flush_list) @@ -124,19 +149,48 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) if (!dtab->netdev_map) goto free_percpu; - spin_lock(&dev_map_lock); - list_add_tail_rcu(&dtab->list, &dev_map_list); - spin_unlock(&dev_map_lock); + if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { + dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets); + if (!dtab->dev_index_head) + goto free_map_area; - return &dtab->map; + spin_lock_init(&dtab->index_lock); + } + return 0; + +free_map_area: + bpf_map_area_free(dtab->netdev_map); free_percpu: free_percpu(dtab->flush_list); free_charge: bpf_map_charge_finish(&dtab->map.memory); -free_dtab: - kfree(dtab); - return ERR_PTR(err); + return -ENOMEM; +} + +static struct bpf_map *dev_map_alloc(union bpf_attr *attr) +{ + struct bpf_dtab *dtab; + int err; + + if (!capable(CAP_NET_ADMIN)) + return ERR_PTR(-EPERM); + + dtab = kzalloc(sizeof(*dtab), GFP_USER); + if (!dtab) + return ERR_PTR(-ENOMEM); + + err = dev_map_init_map(dtab, attr); + if (err) { + kfree(dtab); + return ERR_PTR(err); + } + + spin_lock(&dev_map_lock); + list_add_tail_rcu(&dtab->list, &dev_map_list); + spin_unlock(&dev_map_lock); + + return &dtab->map; } static void dev_map_free(struct bpf_map *map) @@ -188,6 +242,7 @@ static void dev_map_free(struct bpf_map *map) free_percpu(dtab->flush_list); bpf_map_area_free(dtab->netdev_map); + kfree(dtab->dev_index_head); kfree(dtab); } @@ -208,6 +263,70 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } +static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab, + int idx) +{ + return &dtab->dev_index_head[idx & (dtab->n_buckets - 1)]; +} + +struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct hlist_head *head = dev_map_index_hash(dtab, key); + struct bpf_dtab_netdev *dev; + + hlist_for_each_entry_rcu(dev, head, index_hlist) + if (dev->idx == key) + return dev; + + return NULL; +} + +static int dev_map_hash_get_next_key(struct bpf_map *map, void *key, + void *next_key) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + u32 idx, *next = next_key; + struct bpf_dtab_netdev *dev, *next_dev; + struct hlist_head *head; + int i = 0; + + if (!key) + goto find_first; + + idx = *(u32 *)key; + + dev = __dev_map_hash_lookup_elem(map, idx); + if (!dev) + goto find_first; + + next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&dev->index_hlist)), + struct bpf_dtab_netdev, index_hlist); + + if (next_dev) { + *next = next_dev->idx; + return 0; + } + + i = idx & (dtab->n_buckets - 1); + i++; + + find_first: + for (; i < dtab->n_buckets; i++) { + head = dev_map_index_hash(dtab, i); + + next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)), + struct bpf_dtab_netdev, + index_hlist); + if (next_dev) { + *next = next_dev->idx; + return 0; + } + } + + return -ENOENT; +} + static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags, bool in_napi_ctx) { @@ -235,7 +354,7 @@ static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags, out: bq->count = 0; - trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit, + trace_xdp_devmap_xmit(&obj->dtab->map, obj->idx, sent, drops, bq->dev_rx, dev, err); bq->dev_rx = NULL; __list_del_clearprev(&bq->flush_node); @@ -363,6 +482,15 @@ static void *dev_map_lookup_elem(struct bpf_map *map, void *key) return dev ? &dev->ifindex : NULL; } +static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_dtab_netdev *obj = __dev_map_hash_lookup_elem(map, + *(u32 *)key); + struct net_device *dev = obj ? obj->dev : NULL; + + return dev ? &dev->ifindex : NULL; +} + static void dev_map_flush_old(struct bpf_dtab_netdev *dev) { if (dev->dev->netdev_ops->ndo_xdp_xmit) { @@ -412,17 +540,74 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key) return 0; } -static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, - u64 map_flags) +static int dev_map_hash_delete_elem(struct bpf_map *map, void *key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - struct net *net = current->nsproxy->net_ns; + struct bpf_dtab_netdev *old_dev; + int k = *(u32 *)key; + unsigned long flags; + int ret = -ENOENT; + + spin_lock_irqsave(&dtab->index_lock, flags); + + old_dev = __dev_map_hash_lookup_elem(map, k); + if (old_dev) { + dtab->items--; + hlist_del_init_rcu(&old_dev->index_hlist); + call_rcu(&old_dev->rcu, __dev_map_entry_free); + ret = 0; + } + spin_unlock_irqrestore(&dtab->index_lock, flags); + + return ret; +} + +static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, + struct bpf_dtab *dtab, + u32 ifindex, + unsigned int idx) +{ gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; + struct bpf_dtab_netdev *dev; + struct xdp_bulk_queue *bq; + int cpu; + + dev = kmalloc_node(sizeof(*dev), gfp, dtab->map.numa_node); + if (!dev) + return ERR_PTR(-ENOMEM); + + dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq), + sizeof(void *), gfp); + if (!dev->bulkq) { + kfree(dev); + return ERR_PTR(-ENOMEM); + } + + for_each_possible_cpu(cpu) { + bq = per_cpu_ptr(dev->bulkq, cpu); + bq->obj = dev; + } + + dev->dev = dev_get_by_index(net, ifindex); + if (!dev->dev) { + free_percpu(dev->bulkq); + kfree(dev); + return ERR_PTR(-EINVAL); + } + + dev->idx = idx; + dev->dtab = dtab; + + return dev; +} + +static int __dev_map_update_elem(struct net *net, struct bpf_map *map, + void *key, void *value, u64 map_flags) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *dev, *old_dev; u32 ifindex = *(u32 *)value; - struct xdp_bulk_queue *bq; u32 i = *(u32 *)key; - int cpu; if (unlikely(map_flags > BPF_EXIST)) return -EINVAL; @@ -434,31 +619,9 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, if (!ifindex) { dev = NULL; } else { - dev = kmalloc_node(sizeof(*dev), gfp, map->numa_node); - if (!dev) - return -ENOMEM; - - dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq), - sizeof(void *), gfp); - if (!dev->bulkq) { - kfree(dev); - return -ENOMEM; - } - - for_each_possible_cpu(cpu) { - bq = per_cpu_ptr(dev->bulkq, cpu); - bq->obj = dev; - } - - dev->dev = dev_get_by_index(net, ifindex); - if (!dev->dev) { - free_percpu(dev->bulkq); - kfree(dev); - return -EINVAL; - } - - dev->bit = i; - dev->dtab = dtab; + dev = __dev_map_alloc_node(net, dtab, ifindex, i); + if (IS_ERR(dev)) + return PTR_ERR(dev); } /* Use call_rcu() here to ensure rcu critical sections have completed @@ -472,6 +635,70 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, return 0; } +static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + return __dev_map_update_elem(current->nsproxy->net_ns, + map, key, value, map_flags); +} + +static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, + void *key, void *value, u64 map_flags) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct bpf_dtab_netdev *dev, *old_dev; + u32 ifindex = *(u32 *)value; + u32 idx = *(u32 *)key; + unsigned long flags; + int err = -EEXIST; + + if (unlikely(map_flags > BPF_EXIST || !ifindex)) + return -EINVAL; + + spin_lock_irqsave(&dtab->index_lock, flags); + + old_dev = __dev_map_hash_lookup_elem(map, idx); + if (old_dev && (map_flags & BPF_NOEXIST)) + goto out_err; + + dev = __dev_map_alloc_node(net, dtab, ifindex, idx); + if (IS_ERR(dev)) { + err = PTR_ERR(dev); + goto out_err; + } + + if (old_dev) { + hlist_del_rcu(&old_dev->index_hlist); + } else { + if (dtab->items >= dtab->map.max_entries) { + spin_unlock_irqrestore(&dtab->index_lock, flags); + call_rcu(&dev->rcu, __dev_map_entry_free); + return -E2BIG; + } + dtab->items++; + } + + hlist_add_head_rcu(&dev->index_hlist, + dev_map_index_hash(dtab, idx)); + spin_unlock_irqrestore(&dtab->index_lock, flags); + + if (old_dev) + call_rcu(&old_dev->rcu, __dev_map_entry_free); + + return 0; + +out_err: + spin_unlock_irqrestore(&dtab->index_lock, flags); + return err; +} + +static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + return __dev_map_hash_update_elem(current->nsproxy->net_ns, + map, key, value, map_flags); +} + const struct bpf_map_ops dev_map_ops = { .map_alloc = dev_map_alloc, .map_free = dev_map_free, @@ -482,6 +709,16 @@ const struct bpf_map_ops dev_map_ops = { .map_check_btf = map_check_no_btf, }; +const struct bpf_map_ops dev_map_hash_ops = { + .map_alloc = dev_map_alloc, + .map_free = dev_map_free, + .map_get_next_key = dev_map_hash_get_next_key, + .map_lookup_elem = dev_map_hash_lookup_elem, + .map_update_elem = dev_map_hash_update_elem, + .map_delete_elem = dev_map_hash_delete_elem, + .map_check_btf = map_check_no_btf, +}; + static int dev_map_notification(struct notifier_block *notifier, ulong event, void *ptr) { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 272071e9112f..82eabd4e38ad 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -683,8 +683,8 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd) } /* map_idr_lock should have been held */ -static struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map, - bool uref) +static struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, + bool uref) { int refold; @@ -704,6 +704,16 @@ static struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map, return map; } +struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map, bool uref) +{ + spin_lock_bh(&map_idr_lock); + map = __bpf_map_inc_not_zero(map, uref); + spin_unlock_bh(&map_idr_lock); + + return map; +} +EXPORT_SYMBOL_GPL(bpf_map_inc_not_zero); + int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) { return -ENOTSUPP; @@ -1619,6 +1629,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT | BPF_F_ANY_ALIGNMENT | + BPF_F_TEST_STATE_FREQ | BPF_F_TEST_RND_HI32)) return -EINVAL; @@ -2183,7 +2194,7 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr) spin_lock_bh(&map_idr_lock); map = idr_find(&map_idr, id); if (map) - map = bpf_map_inc_not_zero(map, true); + map = __bpf_map_inc_not_zero(map, true); else map = ERR_PTR(-ENOENT); spin_unlock_bh(&map_idr_lock); @@ -2880,6 +2891,10 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz err = bpf_obj_get_next_id(&attr, uattr, &map_idr, &map_idr_lock); break; + case BPF_BTF_GET_NEXT_ID: + err = bpf_obj_get_next_id(&attr, uattr, + &btf_idr, &btf_idr_lock); + break; case BPF_PROG_GET_FD_BY_ID: err = bpf_prog_get_fd_by_id(&attr); break; diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c new file mode 100644 index 000000000000..7ae5dddd1fe6 --- /dev/null +++ b/kernel/bpf/sysfs_btf.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Provide kernel BTF information for introspection and use by eBPF tools. + */ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/kobject.h> +#include <linux/init.h> +#include <linux/sysfs.h> + +/* See scripts/link-vmlinux.sh, gen_btf() func for details */ +extern char __weak _binary__btf_vmlinux_bin_start[]; +extern char __weak _binary__btf_vmlinux_bin_end[]; + +static ssize_t +btf_vmlinux_read(struct file *file, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t off, size_t len) +{ + memcpy(buf, _binary__btf_vmlinux_bin_start + off, len); + return len; +} + +static struct bin_attribute bin_attr_btf_vmlinux __ro_after_init = { + .attr = { .name = "vmlinux", .mode = 0444, }, + .read = btf_vmlinux_read, +}; + +static struct kobject *btf_kobj; + +static int __init btf_vmlinux_init(void) +{ + if (!_binary__btf_vmlinux_bin_start) + return 0; + + btf_kobj = kobject_create_and_add("btf", kernel_kobj); + if (!btf_kobj) + return -ENOMEM; + + bin_attr_btf_vmlinux.size = _binary__btf_vmlinux_bin_end - + _binary__btf_vmlinux_bin_start; + + return sysfs_create_bin_file(btf_kobj, &bin_attr_btf_vmlinux); +} + +subsys_initcall(btf_vmlinux_init); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c36a719fee6d..ffc3e53f5300 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3463,6 +3463,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, goto error; break; case BPF_MAP_TYPE_DEVMAP: + case BPF_MAP_TYPE_DEVMAP_HASH: if (func_id != BPF_FUNC_redirect_map && func_id != BPF_FUNC_map_lookup_elem) goto error; @@ -3545,6 +3546,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, break; case BPF_FUNC_redirect_map: if (map->map_type != BPF_MAP_TYPE_DEVMAP && + map->map_type != BPF_MAP_TYPE_DEVMAP_HASH && map->map_type != BPF_MAP_TYPE_CPUMAP && map->map_type != BPF_MAP_TYPE_XSKMAP) goto error; @@ -7226,7 +7228,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) struct bpf_verifier_state_list *sl, **pprev; struct bpf_verifier_state *cur = env->cur_state, *new; int i, j, err, states_cnt = 0; - bool add_new_state = false; + bool add_new_state = env->test_state_freq ? true : false; cur->last_insn_idx = env->prev_insn_idx; if (!env->insn_aux_data[insn_idx].prune_point) @@ -8622,8 +8624,8 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) } if (is_narrower_load && size < target_size) { - u8 shift = bpf_ctx_narrow_load_shift(off, size, - size_default); + u8 shift = bpf_ctx_narrow_access_offset( + off, size, size_default) * 8; if (ctx_field_size <= 4) { if (shift) insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH, @@ -9266,6 +9268,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, env->allow_ptr_leaks = is_priv; + if (is_priv) + env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ; + ret = replace_map_fd_with_map_ptr(env); if (ret < 0) goto skip_full_check; diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index 9bb96ace9fa1..942c662e2eed 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -13,8 +13,71 @@ struct xsk_map { struct bpf_map map; struct xdp_sock **xsk_map; struct list_head __percpu *flush_list; + spinlock_t lock; /* Synchronize map updates */ }; +int xsk_map_inc(struct xsk_map *map) +{ + struct bpf_map *m = &map->map; + + m = bpf_map_inc(m, false); + return PTR_ERR_OR_ZERO(m); +} + +void xsk_map_put(struct xsk_map *map) +{ + bpf_map_put(&map->map); +} + +static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map, + struct xdp_sock **map_entry) +{ + struct xsk_map_node *node; + int err; + + node = kzalloc(sizeof(*node), GFP_ATOMIC | __GFP_NOWARN); + if (!node) + return NULL; + + err = xsk_map_inc(map); + if (err) { + kfree(node); + return ERR_PTR(err); + } + + node->map = map; + node->map_entry = map_entry; + return node; +} + +static void xsk_map_node_free(struct xsk_map_node *node) +{ + xsk_map_put(node->map); + kfree(node); +} + +static void xsk_map_sock_add(struct xdp_sock *xs, struct xsk_map_node *node) +{ + spin_lock_bh(&xs->map_list_lock); + list_add_tail(&node->node, &xs->map_list); + spin_unlock_bh(&xs->map_list_lock); +} + +static void xsk_map_sock_delete(struct xdp_sock *xs, + struct xdp_sock **map_entry) +{ + struct xsk_map_node *n, *tmp; + + spin_lock_bh(&xs->map_list_lock); + list_for_each_entry_safe(n, tmp, &xs->map_list, node) { + if (map_entry == n->map_entry) { + list_del(&n->node); + xsk_map_node_free(n); + } + } + spin_unlock_bh(&xs->map_list_lock); +} + static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) { struct xsk_map *m; @@ -34,6 +97,7 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&m->map, attr); + spin_lock_init(&m->lock); cost = (u64)m->map.max_entries * sizeof(struct xdp_sock *); cost += sizeof(struct list_head) * num_possible_cpus(); @@ -71,21 +135,9 @@ free_m: static void xsk_map_free(struct bpf_map *map) { struct xsk_map *m = container_of(map, struct xsk_map, map); - int i; bpf_clear_redirect_map(map); synchronize_net(); - - for (i = 0; i < map->max_entries; i++) { - struct xdp_sock *xs; - - xs = m->xsk_map[i]; - if (!xs) - continue; - - sock_put((struct sock *)xs); - } - free_percpu(m->flush_list); bpf_map_area_free(m->xsk_map); kfree(m); @@ -164,8 +216,9 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct xsk_map *m = container_of(map, struct xsk_map, map); + struct xdp_sock *xs, *old_xs, **map_entry; u32 i = *(u32 *)key, fd = *(u32 *)value; - struct xdp_sock *xs, *old_xs; + struct xsk_map_node *node; struct socket *sock; int err; @@ -173,8 +226,6 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value, return -EINVAL; if (unlikely(i >= m->map.max_entries)) return -E2BIG; - if (unlikely(map_flags == BPF_NOEXIST)) - return -EEXIST; sock = sockfd_lookup(fd, &err); if (!sock) @@ -192,32 +243,70 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value, return -EOPNOTSUPP; } - sock_hold(sock->sk); + map_entry = &m->xsk_map[i]; + node = xsk_map_node_alloc(m, map_entry); + if (IS_ERR(node)) { + sockfd_put(sock); + return PTR_ERR(node); + } - old_xs = xchg(&m->xsk_map[i], xs); + spin_lock_bh(&m->lock); + old_xs = READ_ONCE(*map_entry); + if (old_xs == xs) { + err = 0; + goto out; + } else if (old_xs && map_flags == BPF_NOEXIST) { + err = -EEXIST; + goto out; + } else if (!old_xs && map_flags == BPF_EXIST) { + err = -ENOENT; + goto out; + } + xsk_map_sock_add(xs, node); + WRITE_ONCE(*map_entry, xs); if (old_xs) - sock_put((struct sock *)old_xs); - + xsk_map_sock_delete(old_xs, map_entry); + spin_unlock_bh(&m->lock); sockfd_put(sock); return 0; + +out: + spin_unlock_bh(&m->lock); + sockfd_put(sock); + xsk_map_node_free(node); + return err; } static int xsk_map_delete_elem(struct bpf_map *map, void *key) { struct xsk_map *m = container_of(map, struct xsk_map, map); - struct xdp_sock *old_xs; + struct xdp_sock *old_xs, **map_entry; int k = *(u32 *)key; if (k >= map->max_entries) return -EINVAL; - old_xs = xchg(&m->xsk_map[k], NULL); + spin_lock_bh(&m->lock); + map_entry = &m->xsk_map[k]; + old_xs = xchg(map_entry, NULL); if (old_xs) - sock_put((struct sock *)old_xs); + xsk_map_sock_delete(old_xs, map_entry); + spin_unlock_bh(&m->lock); return 0; } +void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, + struct xdp_sock **map_entry) +{ + spin_lock_bh(&map->lock); + if (READ_ONCE(*map_entry) == xs) { + WRITE_ONCE(*map_entry, NULL); + xsk_map_sock_delete(xs, map_entry); + } + spin_unlock_bh(&map->lock); +} + const struct bpf_map_ops xsk_map_ops = { .map_alloc = xsk_map_alloc, .map_free = xsk_map_free, diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 6a64d7772870..e08527f50d2a 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -520,7 +520,8 @@ config BPF_EVENTS bool default y help - This allows the user to attach BPF programs to kprobe events. + This allows the user to attach BPF programs to kprobe, uprobe, and + tracepoint events. config DYNAMIC_EVENTS def_bool n |