diff options
Diffstat (limited to 'kernel')
| -rw-r--r-- | kernel/bpf/Makefile | 3 | ||||
| -rw-r--r-- | kernel/bpf/bpf_struct_ops.c | 628 | ||||
| -rw-r--r-- | kernel/bpf/bpf_struct_ops_types.h | 9 | ||||
| -rw-r--r-- | kernel/bpf/btf.c | 163 | ||||
| -rw-r--r-- | kernel/bpf/map_in_map.c | 3 | ||||
| -rw-r--r-- | kernel/bpf/syscall.c | 69 | ||||
| -rw-r--r-- | kernel/bpf/trampoline.c | 8 | ||||
| -rw-r--r-- | kernel/bpf/verifier.c | 139 |
8 files changed, 918 insertions, 104 deletions
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index d4f330351f87..046ce5d98033 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -27,3 +27,6 @@ endif ifeq ($(CONFIG_SYSFS),y) obj-$(CONFIG_DEBUG_INFO_BTF) += sysfs_btf.o endif +ifeq ($(CONFIG_BPF_JIT),y) +obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o +endif diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c new file mode 100644 index 000000000000..ddf48f49914b --- /dev/null +++ b/kernel/bpf/bpf_struct_ops.c @@ -0,0 +1,628 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2019 Facebook */ + +#include <linux/bpf.h> +#include <linux/bpf_verifier.h> +#include <linux/btf.h> +#include <linux/filter.h> +#include <linux/slab.h> +#include <linux/numa.h> +#include <linux/seq_file.h> +#include <linux/refcount.h> +#include <linux/mutex.h> + +enum bpf_struct_ops_state { + BPF_STRUCT_OPS_STATE_INIT, + BPF_STRUCT_OPS_STATE_INUSE, + BPF_STRUCT_OPS_STATE_TOBEFREE, +}; + +#define BPF_STRUCT_OPS_COMMON_VALUE \ + refcount_t refcnt; \ + enum bpf_struct_ops_state state + +struct bpf_struct_ops_value { + BPF_STRUCT_OPS_COMMON_VALUE; + char data[0] ____cacheline_aligned_in_smp; +}; + +struct bpf_struct_ops_map { + struct bpf_map map; + const struct bpf_struct_ops *st_ops; + /* protect map_update */ + struct mutex lock; + /* progs has all the bpf_prog that is populated + * to the func ptr of the kernel's struct + * (in kvalue.data). + */ + struct bpf_prog **progs; + /* image is a page that has all the trampolines + * that stores the func args before calling the bpf_prog. + * A PAGE_SIZE "image" is enough to store all trampoline for + * "progs[]". + */ + void *image; + /* uvalue->data stores the kernel struct + * (e.g. tcp_congestion_ops) that is more useful + * to userspace than the kvalue. For example, + * the bpf_prog's id is stored instead of the kernel + * address of a func ptr. + */ + struct bpf_struct_ops_value *uvalue; + /* kvalue.data stores the actual kernel's struct + * (e.g. tcp_congestion_ops) that will be + * registered to the kernel subsystem. + */ + struct bpf_struct_ops_value kvalue; +}; + +#define VALUE_PREFIX "bpf_struct_ops_" +#define VALUE_PREFIX_LEN (sizeof(VALUE_PREFIX) - 1) + +/* bpf_struct_ops_##_name (e.g. bpf_struct_ops_tcp_congestion_ops) is + * the map's value exposed to the userspace and its btf-type-id is + * stored at the map->btf_vmlinux_value_type_id. + * + */ +#define BPF_STRUCT_OPS_TYPE(_name) \ +extern struct bpf_struct_ops bpf_##_name; \ + \ +struct bpf_struct_ops_##_name { \ + BPF_STRUCT_OPS_COMMON_VALUE; \ + struct _name data ____cacheline_aligned_in_smp; \ +}; +#include "bpf_struct_ops_types.h" +#undef BPF_STRUCT_OPS_TYPE + +enum { +#define BPF_STRUCT_OPS_TYPE(_name) BPF_STRUCT_OPS_TYPE_##_name, +#include "bpf_struct_ops_types.h" +#undef BPF_STRUCT_OPS_TYPE + __NR_BPF_STRUCT_OPS_TYPE, +}; + +static struct bpf_struct_ops * const bpf_struct_ops[] = { +#define BPF_STRUCT_OPS_TYPE(_name) \ + [BPF_STRUCT_OPS_TYPE_##_name] = &bpf_##_name, +#include "bpf_struct_ops_types.h" +#undef BPF_STRUCT_OPS_TYPE +}; + +const struct bpf_verifier_ops bpf_struct_ops_verifier_ops = { +}; + +const struct bpf_prog_ops bpf_struct_ops_prog_ops = { +}; + +static const struct btf_type *module_type; + +void bpf_struct_ops_init(struct btf *btf) +{ + s32 type_id, value_id, module_id; + const struct btf_member *member; + struct bpf_struct_ops *st_ops; + struct bpf_verifier_log log = {}; + const struct btf_type *t; + char value_name[128]; + const char *mname; + u32 i, j; + + /* Ensure BTF type is emitted for "struct bpf_struct_ops_##_name" */ +#define BPF_STRUCT_OPS_TYPE(_name) BTF_TYPE_EMIT(struct bpf_struct_ops_##_name); +#include "bpf_struct_ops_types.h" +#undef BPF_STRUCT_OPS_TYPE + + module_id = btf_find_by_name_kind(btf, "module", BTF_KIND_STRUCT); + if (module_id < 0) { + pr_warn("Cannot find struct module in btf_vmlinux\n"); + return; + } + module_type = btf_type_by_id(btf, module_id); + + for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) { + st_ops = bpf_struct_ops[i]; + + if (strlen(st_ops->name) + VALUE_PREFIX_LEN >= + sizeof(value_name)) { + pr_warn("struct_ops name %s is too long\n", + st_ops->name); + continue; + } + sprintf(value_name, "%s%s", VALUE_PREFIX, st_ops->name); + + value_id = btf_find_by_name_kind(btf, value_name, + BTF_KIND_STRUCT); + if (value_id < 0) { + pr_warn("Cannot find struct %s in btf_vmlinux\n", + value_name); + continue; + } + + type_id = btf_find_by_name_kind(btf, st_ops->name, + BTF_KIND_STRUCT); + if (type_id < 0) { + pr_warn("Cannot find struct %s in btf_vmlinux\n", + st_ops->name); + continue; + } + t = btf_type_by_id(btf, type_id); + if (btf_type_vlen(t) > BPF_STRUCT_OPS_MAX_NR_MEMBERS) { + pr_warn("Cannot support #%u members in struct %s\n", + btf_type_vlen(t), st_ops->name); + continue; + } + + for_each_member(j, t, member) { + const struct btf_type *func_proto; + + mname = btf_name_by_offset(btf, member->name_off); + if (!*mname) { + pr_warn("anon member in struct %s is not supported\n", + st_ops->name); + break; + } + + if (btf_member_bitfield_size(t, member)) { + pr_warn("bit field member %s in struct %s is not supported\n", + mname, st_ops->name); + break; + } + + func_proto = btf_type_resolve_func_ptr(btf, + member->type, + NULL); + if (func_proto && + btf_distill_func_proto(&log, btf, + func_proto, mname, + &st_ops->func_models[j])) { + pr_warn("Error in parsing func ptr %s in struct %s\n", + mname, st_ops->name); + break; + } + } + + if (j == btf_type_vlen(t)) { + if (st_ops->init(btf)) { + pr_warn("Error in init bpf_struct_ops %s\n", + st_ops->name); + } else { + st_ops->type_id = type_id; + st_ops->type = t; + st_ops->value_id = value_id; + st_ops->value_type = btf_type_by_id(btf, + value_id); + } + } + } +} + +extern struct btf *btf_vmlinux; + +static const struct bpf_struct_ops * +bpf_struct_ops_find_value(u32 value_id) +{ + unsigned int i; + + if (!value_id || !btf_vmlinux) + return NULL; + + for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) { + if (bpf_struct_ops[i]->value_id == value_id) + return bpf_struct_ops[i]; + } + + return NULL; +} + +const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id) +{ + unsigned int i; + + if (!type_id || !btf_vmlinux) + return NULL; + + for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) { + if (bpf_struct_ops[i]->type_id == type_id) + return bpf_struct_ops[i]; + } + + return NULL; +} + +static int bpf_struct_ops_map_get_next_key(struct bpf_map *map, void *key, + void *next_key) +{ + if (key && *(u32 *)key == 0) + return -ENOENT; + + *(u32 *)next_key = 0; + return 0; +} + +int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key, + void *value) +{ + struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; + struct bpf_struct_ops_value *uvalue, *kvalue; + enum bpf_struct_ops_state state; + + if (unlikely(*(u32 *)key != 0)) + return -ENOENT; + + kvalue = &st_map->kvalue; + /* Pair with smp_store_release() during map_update */ + state = smp_load_acquire(&kvalue->state); + if (state == BPF_STRUCT_OPS_STATE_INIT) { + memset(value, 0, map->value_size); + return 0; + } + + /* No lock is needed. state and refcnt do not need + * to be updated together under atomic context. + */ + uvalue = (struct bpf_struct_ops_value *)value; + memcpy(uvalue, st_map->uvalue, map->value_size); + uvalue->state = state; + refcount_set(&uvalue->refcnt, refcount_read(&kvalue->refcnt)); + + return 0; +} + +static void *bpf_struct_ops_map_lookup_elem(struct bpf_map *map, void *key) +{ + return ERR_PTR(-EINVAL); +} + +static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map) +{ + const struct btf_type *t = st_map->st_ops->type; + u32 i; + + for (i = 0; i < btf_type_vlen(t); i++) { + if (st_map->progs[i]) { + bpf_prog_put(st_map->progs[i]); + st_map->progs[i] = NULL; + } + } +} + +static int check_zero_holes(const struct btf_type *t, void *data) +{ + const struct btf_member *member; + u32 i, moff, msize, prev_mend = 0; + const struct btf_type *mtype; + + for_each_member(i, t, member) { + moff = btf_member_bit_offset(t, member) / 8; + if (moff > prev_mend && + memchr_inv(data + prev_mend, 0, moff - prev_mend)) + return -EINVAL; + + mtype = btf_type_by_id(btf_vmlinux, member->type); + mtype = btf_resolve_size(btf_vmlinux, mtype, &msize, + NULL, NULL); + if (IS_ERR(mtype)) + return PTR_ERR(mtype); + prev_mend = moff + msize; + } + + if (t->size > prev_mend && + memchr_inv(data + prev_mend, 0, t->size - prev_mend)) + return -EINVAL; + + return 0; +} + +static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 flags) +{ + struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; + const struct bpf_struct_ops *st_ops = st_map->st_ops; + struct bpf_struct_ops_value *uvalue, *kvalue; + const struct btf_member *member; + const struct btf_type *t = st_ops->type; + void *udata, *kdata; + int prog_fd, err = 0; + void *image; + u32 i; + + if (flags) + return -EINVAL; + + if (*(u32 *)key != 0) + return -E2BIG; + + err = check_zero_holes(st_ops->value_type, value); + if (err) + return err; + + uvalue = (struct bpf_struct_ops_value *)value; + err = check_zero_holes(t, uvalue->data); + if (err) + return err; + + if (uvalue->state || refcount_read(&uvalue->refcnt)) + return -EINVAL; + + uvalue = (struct bpf_struct_ops_value *)st_map->uvalue; + kvalue = (struct bpf_struct_ops_value *)&st_map->kvalue; + + mutex_lock(&st_map->lock); + + if (kvalue->state != BPF_STRUCT_OPS_STATE_INIT) { + err = -EBUSY; + goto unlock; + } + + memcpy(uvalue, value, map->value_size); + + udata = &uvalue->data; + kdata = &kvalue->data; + image = st_map->image; + + for_each_member(i, t, member) { + const struct btf_type *mtype, *ptype; + struct bpf_prog *prog; + u32 moff; + + moff = btf_member_bit_offset(t, member) / 8; + ptype = btf_type_resolve_ptr(btf_vmlinux, member->type, NULL); + if (ptype == module_type) { + if (*(void **)(udata + moff)) + goto reset_unlock; + *(void **)(kdata + moff) = BPF_MODULE_OWNER; + continue; + } + + err = st_ops->init_member(t, member, kdata, udata); + if (err < 0) + goto reset_unlock; + + /* The ->init_member() has handled this member */ + if (err > 0) + continue; + + /* If st_ops->init_member does not handle it, + * we will only handle func ptrs and zero-ed members + * here. Reject everything else. + */ + + /* All non func ptr member must be 0 */ + if (!ptype || !btf_type_is_func_proto(ptype)) { + u32 msize; + + mtype = btf_type_by_id(btf_vmlinux, member->type); + mtype = btf_resolve_size(btf_vmlinux, mtype, &msize, + NULL, NULL); + if (IS_ERR(mtype)) { + err = PTR_ERR(mtype); + goto reset_unlock; + } + + if (memchr_inv(udata + moff, 0, msize)) { + err = -EINVAL; + goto reset_unlock; + } + + continue; + } + + prog_fd = (int)(*(unsigned long *)(udata + moff)); + /* Similar check as the attr->attach_prog_fd */ + if (!prog_fd) + continue; + + prog = bpf_prog_get(prog_fd); + if (IS_ERR(prog)) { + err = PTR_ERR(prog); + goto reset_unlock; + } + st_map->progs[i] = prog; + + if (prog->type != BPF_PROG_TYPE_STRUCT_OPS || + prog->aux->attach_btf_id != st_ops->type_id || + prog->expected_attach_type != i) { + err = -EINVAL; + goto reset_unlock; + } + + err = arch_prepare_bpf_trampoline(image, + st_map->image + PAGE_SIZE, + &st_ops->func_models[i], 0, + &prog, 1, NULL, 0, NULL); + if (err < 0) + goto reset_unlock; + + *(void **)(kdata + moff) = image; + image += err; + + /* put prog_id to udata */ + *(unsigned long *)(udata + moff) = prog->aux->id; + } + + refcount_set(&kvalue->refcnt, 1); + bpf_map_inc(map); + + set_memory_ro((long)st_map->image, 1); + set_memory_x((long)st_map->image, 1); + err = st_ops->reg(kdata); + if (likely(!err)) { + /* Pair with smp_load_acquire() during lookup_elem(). + * It ensures the above udata updates (e.g. prog->aux->id) + * can be seen once BPF_STRUCT_OPS_STATE_INUSE is set. + */ + smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_INUSE); + goto unlock; + } + + /* Error during st_ops->reg(). It is very unlikely since + * the above init_member() should have caught it earlier + * before reg(). The only possibility is if there was a race + * in registering the struct_ops (under the same name) to + * a sub-system through different struct_ops's maps. + */ + set_memory_nx((long)st_map->image, 1); + set_memory_rw((long)st_map->image, 1); + bpf_map_put(map); + +reset_unlock: + bpf_struct_ops_map_put_progs(st_map); + memset(uvalue, 0, map->value_size); + memset(kvalue, 0, map->value_size); +unlock: + mutex_unlock(&st_map->lock); + return err; +} + +static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key) +{ + enum bpf_struct_ops_state prev_state; + struct bpf_struct_ops_map *st_map; + + st_map = (struct bpf_struct_ops_map *)map; + prev_state = cmpxchg(&st_map->kvalue.state, + BPF_STRUCT_OPS_STATE_INUSE, + BPF_STRUCT_OPS_STATE_TOBEFREE); + if (prev_state == BPF_STRUCT_OPS_STATE_INUSE) { + st_map->st_ops->unreg(&st_map->kvalue.data); + if (refcount_dec_and_test(&st_map->kvalue.refcnt)) + bpf_map_put(map); + } + + return 0; +} + +static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key, + struct seq_file *m) +{ + void *value; + + value = bpf_struct_ops_map_lookup_elem(map, key); + if (!value) + return; + + btf_type_seq_show(btf_vmlinux, map->btf_vmlinux_value_type_id, + value, m); + seq_puts(m, "\n"); +} + +static void bpf_struct_ops_map_free(struct bpf_map *map) +{ + struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; + + if (st_map->progs) + bpf_struct_ops_map_put_progs(st_map); + bpf_map_area_free(st_map->progs); + bpf_jit_free_exec(st_map->image); + bpf_map_area_free(st_map->uvalue); + bpf_map_area_free(st_map); +} + +static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr) +{ + if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 || + attr->map_flags || !attr->btf_vmlinux_value_type_id) + return -EINVAL; + return 0; +} + +static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) +{ + const struct bpf_struct_ops *st_ops; + size_t map_total_size, st_map_size; + struct bpf_struct_ops_map *st_map; + const struct btf_type *t, *vt; + struct bpf_map_memory mem; + struct bpf_map *map; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + st_ops = bpf_struct_ops_find_value(attr->btf_vmlinux_value_type_id); + if (!st_ops) + return ERR_PTR(-ENOTSUPP); + + vt = st_ops->value_type; + if (attr->value_size != vt->size) + return ERR_PTR(-EINVAL); + + t = st_ops->type; + + st_map_size = sizeof(*st_map) + + /* kvalue stores the + * struct bpf_struct_ops_tcp_congestions_ops + */ + (vt->size - sizeof(struct bpf_struct_ops_value)); + map_total_size = st_map_size + + /* uvalue */ + sizeof(vt->size) + + /* struct bpf_progs **progs */ + btf_type_vlen(t) * sizeof(struct bpf_prog *); + err = bpf_map_charge_init(&mem, map_total_size); + if (err < 0) + return ERR_PTR(err); + + st_map = bpf_map_area_alloc(st_map_size, NUMA_NO_NODE); + if (!st_map) { + bpf_map_charge_finish(&mem); + return ERR_PTR(-ENOMEM); + } + st_map->st_ops = st_ops; + map = &st_map->map; + + st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE); + st_map->progs = + bpf_map_area_alloc(btf_type_vlen(t) * sizeof(struct bpf_prog *), + NUMA_NO_NODE); + st_map->image = bpf_jit_alloc_exec(PAGE_SIZE); + if (!st_map->uvalue || !st_map->progs || !st_map->image) { + bpf_struct_ops_map_free(map); + bpf_map_charge_finish(&mem); + return ERR_PTR(-ENOMEM); + } + + mutex_init(&st_map->lock); + set_vm_flush_reset_perms(st_map->image); + bpf_map_init_from_attr(map, attr); + bpf_map_charge_move(&map->memory, &mem); + + return map; +} + +const struct bpf_map_ops bpf_struct_ops_map_ops = { + .map_alloc_check = bpf_struct_ops_map_alloc_check, + .map_alloc = bpf_struct_ops_map_alloc, + .map_free = bpf_struct_ops_map_free, + .map_get_next_key = bpf_struct_ops_map_get_next_key, + .map_lookup_elem = bpf_struct_ops_map_lookup_elem, + .map_delete_elem = bpf_struct_ops_map_delete_elem, + .map_update_elem = bpf_struct_ops_map_update_elem, + .map_seq_show_elem = bpf_struct_ops_map_seq_show_elem, +}; + +/* "const void *" because some subsystem is + * passing a const (e.g. const struct tcp_congestion_ops *) + */ +bool bpf_struct_ops_get(const void *kdata) +{ + struct bpf_struct_ops_value *kvalue; + + kvalue = container_of(kdata, struct bpf_struct_ops_value, data); + + return refcount_inc_not_zero(&kvalue->refcnt); +} + +void bpf_struct_ops_put(const void *kdata) +{ + struct bpf_struct_ops_value *kvalue; + + kvalue = container_of(kdata, struct bpf_struct_ops_value, data); + if (refcount_dec_and_test(&kvalue->refcnt)) { + struct bpf_struct_ops_map *st_map; + + st_map = container_of(kvalue, struct bpf_struct_ops_map, + kvalue); + bpf_map_put(&st_map->map); + } +} diff --git a/kernel/bpf/bpf_struct_ops_types.h b/kernel/bpf/bpf_struct_ops_types.h new file mode 100644 index 000000000000..066d83ea1c99 --- /dev/null +++ b/kernel/bpf/bpf_struct_ops_types.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* internal file - do not include directly */ + +#ifdef CONFIG_BPF_JIT +#ifdef CONFIG_INET +#include <net/tcp.h> +BPF_STRUCT_OPS_TYPE(tcp_congestion_ops) +#endif +#endif diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index ed2075884724..81d9cf75cacd 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -180,11 +180,6 @@ */ #define BTF_MAX_SIZE (16 * 1024 * 1024) -#define for_each_member(i, struct_type, member) \ - for (i = 0, member = btf_type_member(struct_type); \ - i < btf_type_vlen(struct_type); \ - i++, member++) - #define for_each_member_from(i, from, struct_type, member) \ for (i = from, member = btf_type_member(struct_type) + from; \ i < btf_type_vlen(struct_type); \ @@ -382,6 +377,65 @@ static bool btf_type_is_datasec(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC; } +s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind) +{ + const struct btf_type *t; + const char *tname; + u32 i; + + for (i = 1; i <= btf->nr_types; i++) { + t = btf->types[i]; + if (BTF_INFO_KIND(t->info) != kind) + continue; + + tname = btf_name_by_offset(btf, t->name_off); + if (!strcmp(tname, name)) + return i; + } + + return -ENOENT; +} + +const struct btf_type *btf_type_skip_modifiers(const struct btf *btf, + u32 id, u32 *res_id) +{ + const struct btf_type *t = btf_type_by_id(btf, id); + + while (btf_type_is_modifier(t)) { + id = t->type; + t = btf_type_by_id(btf, t->type); + } + + if (res_id) + *res_id = id; + + return t; +} + +const struct btf_type *btf_type_resolve_ptr(const struct btf *btf, + u32 id, u32 *res_id) +{ + const struct btf_type *t; + + t = btf_type_skip_modifiers(btf, id, NULL); + if (!btf_type_is_ptr(t)) + return NULL; + + return btf_type_skip_modifiers(btf, t->type, res_id); +} + +const struct btf_type *btf_type_resolve_func_ptr(const struct btf *btf, + u32 id, u32 *res_id) +{ + const struct btf_type *ptype; + + ptype = btf_type_resolve_ptr(btf, id, res_id); + if (ptype && btf_type_is_func_proto(ptype)) + return ptype; + + return NULL; +} + /* Types that act only as a source, not sink or intermediate * type when resolving. */ @@ -446,30 +500,6 @@ static const char *btf_int_encoding_str(u8 encoding) return "UNKN"; } -static u16 btf_type_vlen(const struct btf_type *t) -{ - return BTF_INFO_VLEN(t->info); -} - -static bool btf_type_kflag(const struct btf_type *t) -{ - return BTF_INFO_KFLAG(t->info); -} - -static u32 btf_member_bit_offset(const struct btf_type *struct_type, - const struct btf_member *member) -{ - return btf_type_kflag(struct_type) ? BTF_MEMBER_BIT_OFFSET(member->offset) - : member->offset; -} - -static u32 btf_member_bitfield_size(const struct btf_type *struct_type, - const struct btf_member *member) -{ - return btf_type_kflag(struct_type) ? BTF_MEMBER_BITFIELD_SIZE(member->offset) - : 0; -} - static u32 btf_type_int(const struct btf_type *t) { return *(u32 *)(t + 1); @@ -480,11 +510,6 @@ static const struct btf_array *btf_type_array(const struct btf_type *t) return (const struct btf_array *)(t + 1); } -static const struct btf_member *btf_type_member(const struct btf_type *t) -{ - return (const struct btf_member *)(t + 1); -} - static const struct btf_enum *btf_type_enum(const struct btf_type *t) { return (const struct btf_enum *)(t + 1); @@ -1057,7 +1082,7 @@ static const struct resolve_vertex *env_stack_peak(struct btf_verifier_env *env) * *elem_type: same as return type ("struct X") * *total_nelems: 1 */ -static const struct btf_type * +const struct btf_type * btf_resolve_size(const struct btf *btf, const struct btf_type *type, u32 *type_size, const struct btf_type **elem_type, u32 *total_nelems) @@ -1111,8 +1136,10 @@ resolved: return ERR_PTR(-EINVAL); *type_size = nelems * size; - *total_nelems = nelems; - *elem_type = type; + if (total_nelems) + *total_nelems = nelems; + if (elem_type) + *elem_type = type; return array_type ? : type; } @@ -1826,7 +1853,10 @@ static void btf_modifier_seq_show(const struct btf *btf, u32 type_id, void *data, u8 bits_offset, struct seq_file *m) { - t = btf_type_id_resolve(btf, &type_id); + if (btf->resolved_ids) + t = btf_type_id_resolve(btf, &type_id); + else + t = btf_type_skip_modifiers(btf, type_id, NULL); btf_type_ops(t)->seq_show(btf, t, type_id, data, bits_offset, m); } @@ -3605,6 +3635,8 @@ struct btf *btf_parse_vmlinux(void) goto errout; } + bpf_struct_ops_init(btf); + btf_verifier_env_free(env); refcount_set(&btf->refcnt, 1); return btf; @@ -3677,7 +3709,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, /* skip modifiers */ while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); - if (btf_type_is_int(t)) + if (btf_type_is_int(t) || btf_type_is_enum(t)) /* accessing a scalar */ return true; if (!btf_type_is_ptr(t)) { @@ -3697,7 +3729,6 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, /* this is a pointer to another type */ info->reg_type = PTR_TO_BTF_ID; - info->btf_id = t->type; if (tgt_prog) { ret = btf_translate_to_vmlinux(log, btf, t, tgt_prog->type); @@ -3708,10 +3739,14 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, return false; } } + + info->btf_id = t->type; t = btf_type_by_id(btf, t->type); /* skip modifiers */ - while (btf_type_is_modifier(t)) + while (btf_type_is_modifier(t)) { + info->btf_id = t->type; t = btf_type_by_id(btf, t->type); + } if (!btf_type_is_struct(t)) { bpf_log(log, "func '%s' arg%d type %s is not a struct\n", @@ -3737,23 +3772,57 @@ int btf_struct_access(struct bpf_verifier_log *log, again: tname = __btf_name_by_offset(btf_vmlinux, t->name_off); if (!btf_type_is_struct(t)) { - bpf_log(log, "Type '%s' is not a struct", tname); + bpf_log(log, "Type '%s' is not a struct\n", tname); return -EINVAL; } - for_each_member(i, t, member) { - if (btf_member_bitfield_size(t, member)) - /* bitfields are not supported yet */ - continue; + if (off + size > t->size) { + bpf_log(log, "access beyond struct %s at off %u size %u\n", + tname, off, size); + return -EACCES; + } + for_each_member(i, t, member) { /* offset of the field in bytes */ moff = btf_member_bit_offset(t, member) / 8; if (off + size <= moff) /* won't find anything, field is already too far */ break; + + if (btf_member_bitfield_size(t, member)) { + u32 end_bit = btf_member_bit_offset(t, member) + + btf_member_bitfield_size(t, member); + + /* off <= moff instead of off == moff because clang + * does not generate a BTF member for anonymous + * bitfield like the ":16" here: + * struct { + * int :16; + * int x:8; + * }; + */ + if (off <= moff && + BITS_ROUNDUP_BYTES(end_bit) <= off + size) + return SCALAR_VALUE; + + /* off may be accessing a following member + * + * or + * + * Doing partial access at either end of this + * bitfield. Continue on this case also to + * treat it as not accessing this bitfield + * and eventually error out as field not + * found to keep it simple. + * It could be relaxed if there was a legit + * partial access case later. + */ + continue; + } + /* In case of "off" is pointing to holes of a struct */ if (off < moff) - continue; + break; /* type of the field */ mtype = btf_type_by_id(btf_vmlinux, member->type); diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 5e9366b33f0f..b3c48d1533cb 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -22,7 +22,8 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) */ if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY || inner_map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE || - inner_map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { + inner_map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE || + inner_map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { fdput(f); return ERR_PTR(-ENOTSUPP); } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 81ee8595dfee..f9db72a96ec0 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -628,7 +628,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, return ret; } -#define BPF_MAP_CREATE_LAST_FIELD btf_value_type_id +#define BPF_MAP_CREATE_LAST_FIELD btf_vmlinux_value_type_id /* called via syscall */ static int map_create(union bpf_attr *attr) { @@ -642,6 +642,14 @@ static int map_create(union bpf_attr *attr) if (err) return -EINVAL; + if (attr->btf_vmlinux_value_type_id) { + if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS || + attr->btf_key_type_id || attr->btf_value_type_id) + return -EINVAL; + } else if (attr->btf_key_type_id && !attr->btf_value_type_id) { + return -EINVAL; + } + f_flags = bpf_get_file_flag(attr->map_flags); if (f_flags < 0) return f_flags; @@ -664,32 +672,35 @@ static int map_create(union bpf_attr *attr) atomic64_set(&map->usercnt, 1); mutex_init(&map->freeze_mutex); - if (attr->btf_key_type_id || attr->btf_value_type_id) { + map->spin_lock_off = -EINVAL; + if (attr->btf_key_type_id || attr->btf_value_type_id || + /* Even the map's value is a kernel's struct, + * the bpf_prog.o must have BTF to begin with + * to figure out the corresponding kernel's + * counter part. Thus, attr->btf_fd has + * to be valid also. + */ + attr->btf_vmlinux_value_type_id) { struct btf *btf; - if (!attr->btf_value_type_id) { - err = -EINVAL; - goto free_map; - } - btf = btf_get_by_fd(attr->btf_fd); if (IS_ERR(btf)) { err = PTR_ERR(btf); goto free_map; } + map->btf = btf; - err = map_check_btf(map, btf, attr->btf_key_type_id, - attr->btf_value_type_id); - if (err) { - btf_put(btf); - goto free_map; + if (attr->btf_value_type_id) { + err = map_check_btf(map, btf, attr->btf_key_type_id, + attr->btf_value_type_id); + if (err) + goto free_map; } - map->btf = btf; map->btf_key_type_id = attr->btf_key_type_id; map->btf_value_type_id = attr->btf_value_type_id; - } else { - map->spin_lock_off = -EINVAL; + map->btf_vmlinux_value_type_id = + attr->btf_vmlinux_value_type_id; } err = security_bpf_map_alloc(map); @@ -888,6 +899,9 @@ static int map_lookup_elem(union bpf_attr *attr) } else if (map->map_type == BPF_MAP_TYPE_QUEUE || map->map_type == BPF_MAP_TYPE_STACK) { err = map->ops->map_peek_elem(map, value); + } else if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { + /* struct_ops map requires directly updating "value" */ + err = bpf_struct_ops_map_sys_lookup_elem(map, key, value); } else { rcu_read_lock(); if (map->ops->map_lookup_elem_sys_only) @@ -1003,7 +1017,8 @@ static int map_update_elem(union bpf_attr *attr) goto out; } else if (map->map_type == BPF_MAP_TYPE_CPUMAP || map->map_type == BPF_MAP_TYPE_SOCKHASH || - map->map_type == BPF_MAP_TYPE_SOCKMAP) { + map->map_type == BPF_MAP_TYPE_SOCKMAP || + map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { err = map->ops->map_update_elem(map, key, value, attr->flags); goto out; } else if (IS_FD_PROG_ARRAY(map)) { @@ -1092,7 +1107,9 @@ static int map_delete_elem(union bpf_attr *attr) if (bpf_map_is_dev_bound(map)) { err = bpf_map_offload_delete_elem(map, key); goto out; - } else if (IS_FD_PROG_ARRAY(map)) { + } else if (IS_FD_PROG_ARRAY(map) || + map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { + /* These maps require sleepable context */ err = map->ops->map_delete_elem(map, key); goto out; } @@ -1672,17 +1689,22 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, enum bpf_attach_type expected_attach_type, u32 btf_id, u32 prog_fd) { - switch (prog_type) { - case BPF_PROG_TYPE_TRACING: + if (btf_id) { if (btf_id > BTF_MAX_TYPE) return -EINVAL; - break; - default: - if (btf_id || prog_fd) + + switch (prog_type) { + case BPF_PROG_TYPE_TRACING: + case BPF_PROG_TYPE_STRUCT_OPS: + break; + default: return -EINVAL; - break; + } } + if (prog_fd && prog_type != BPF_PROG_TYPE_TRACING) + return -EINVAL; + switch (prog_type) { case BPF_PROG_TYPE_CGROUP_SOCK: switch (expected_attach_type) { @@ -2817,6 +2839,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, info.btf_key_type_id = map->btf_key_type_id; info.btf_value_type_id = map->btf_value_type_id; } + info.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id; if (bpf_map_is_dev_bound(map)) { err = bpf_map_offload_info_fill(&info, map); diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 505f4e4b31d2..79a04417050d 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -160,11 +160,12 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) if (fexit_cnt) flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME; - err = arch_prepare_bpf_trampoline(new_image, &tr->func.model, flags, + err = arch_prepare_bpf_trampoline(new_image, new_image + PAGE_SIZE / 2, + &tr->func.model, flags, fentry, fentry_cnt, fexit, fexit_cnt, tr->func.addr); - if (err) + if (err < 0) goto out; if (tr->selector) @@ -296,7 +297,8 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start) } int __weak -arch_prepare_bpf_trampoline(void *image, struct btf_func_model *m, u32 flags, +arch_prepare_bpf_trampoline(void *image, void *image_end, + const struct btf_func_model *m, u32 flags, struct bpf_prog **fentry_progs, int fentry_cnt, struct bpf_prog **fexit_progs, int fexit_cnt, void *orig_call) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d433d70022fd..f5af759a8a5f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2859,11 +2859,6 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, u32 btf_id; int ret; - if (atype != BPF_READ) { - verbose(env, "only read is supported\n"); - return -EACCES; - } - if (off < 0) { verbose(env, "R%d is ptr_%s invalid negative access: off=%d\n", @@ -2880,17 +2875,32 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, return -EACCES; } - ret = btf_struct_access(&env->log, t, off, size, atype, &btf_id); + if (env->ops->btf_struct_access) { + ret = env->ops->btf_struct_access(&env->log, t, off, size, + atype, &btf_id); + } else { + if (atype != BPF_READ) { + verbose(env, "only read is supported\n"); + return -EACCES; + } + + ret = btf_struct_access(&env->log, t, off, size, atype, + &btf_id); + } + if (ret < 0) return ret; - if (ret == SCALAR_VALUE) { - mark_reg_unknown(env, regs, value_regno); - return 0; + if (atype == BPF_READ) { + if (ret == SCALAR_VALUE) { + mark_reg_unknown(env, regs, value_regno); + return 0; + } + mark_reg_known_zero(env, regs, value_regno); + regs[value_regno].type = PTR_TO_BTF_ID; + regs[value_regno].btf_id = btf_id; } - mark_reg_known_zero(env, regs, value_regno); - regs[value_regno].type = PTR_TO_BTF_ID; - regs[value_regno].btf_id = btf_id; + return 0; } @@ -6349,8 +6359,30 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) static int check_return_code(struct bpf_verifier_env *env) { struct tnum enforce_attach_type_range = tnum_unknown; + const struct bpf_prog *prog = env->prog; struct bpf_reg_state *reg; struct tnum range = tnum_range(0, 1); + int err; + + /* The struct_ops func-ptr's return type could be "void" */ + if (env->prog->type == BPF_PROG_TYPE_STRUCT_OPS && + !prog->aux->attach_func_proto->type) + return 0; + + /* eBPF calling convetion is such that R0 is used + * to return the value from eBPF program. + * Make sure that it's readable at this time + * of bpf_exit, which means that program wrote + * something into it earlier + */ + err = check_reg_arg(env, BPF_REG_0, SRC_OP); + if (err) + return err; + + if (is_pointer_value(env, BPF_REG_0)) { + verbose(env, "R0 leaks addr as return value\n"); + return -EACCES; + } switch (env->prog->type) { case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: @@ -8016,21 +8048,6 @@ static int do_check(struct bpf_verifier_env *env) if (err) return err; - /* eBPF calling convetion is such that R0 is used - * to return the value from eBPF program. - * Make sure that it's readable at this time - * of bpf_exit, which means that program wrote - * something into it earlier - */ - err = check_reg_arg(env, BPF_REG_0, SRC_OP); - if (err) - return err; - - if (is_pointer_value(env, BPF_REG_0)) { - verbose(env, "R0 leaks addr as return value\n"); - return -EACCES; - } - err = check_return_code(env); if (err) return err; @@ -8138,6 +8155,11 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, return -EINVAL; } + if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { + verbose(env, "bpf_struct_ops map cannot be used in prog\n"); + return -EINVAL; + } + return 0; } @@ -8829,12 +8851,14 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) convert_ctx_access = bpf_xdp_sock_convert_ctx_access; break; case PTR_TO_BTF_ID: - if (type == BPF_WRITE) { + if (type == BPF_READ) { + insn->code = BPF_LDX | BPF_PROBE_MEM | + BPF_SIZE((insn)->code); + env->prog->aux->num_exentries++; + } else if (env->prog->type != BPF_PROG_TYPE_STRUCT_OPS) { verbose(env, "Writes through BTF pointers are not allowed\n"); return -EINVAL; } - insn->code = BPF_LDX | BPF_PROBE_MEM | BPF_SIZE((insn)->code); - env->prog->aux->num_exentries++; continue; default: continue; @@ -9502,6 +9526,58 @@ static void print_verification_stats(struct bpf_verifier_env *env) env->peak_states, env->longest_mark_read_walk); } +static int check_struct_ops_btf_id(struct bpf_verifier_env *env) +{ + const struct btf_type *t, *func_proto; + const struct bpf_struct_ops *st_ops; + const struct btf_member *member; + struct bpf_prog *prog = env->prog; + u32 btf_id, member_idx; + const char *mname; + + btf_id = prog->aux->attach_btf_id; + st_ops = bpf_struct_ops_find(btf_id); + if (!st_ops) { + verbose(env, "attach_btf_id %u is not a supported struct\n", + btf_id); + return -ENOTSUPP; + } + + t = st_ops->type; + member_idx = prog->expected_attach_type; + if (member_idx >= btf_type_vlen(t)) { + verbose(env, "attach to invalid member idx %u of struct %s\n", + member_idx, st_ops->name); + return -EINVAL; + } + + member = &btf_type_member(t)[member_idx]; + mname = btf_name_by_offset(btf_vmlinux, member->name_off); + func_proto = btf_type_resolve_func_ptr(btf_vmlinux, member->type, + NULL); + if (!func_proto) { + verbose(env, "attach to invalid member %s(@idx %u) of struct %s\n", + mname, member_idx, st_ops->name); + return -EINVAL; + } + + if (st_ops->check_member) { + int err = st_ops->check_member(t, member); + + if (err) { + verbose(env, "attach to unsupported member %s of struct %s\n", + mname, st_ops->name); + return err; + } + } + + prog->aux->attach_func_proto = func_proto; + prog->aux->attach_func_name = mname; + env->ops = st_ops->verifier_ops; + + return 0; +} + static int check_attach_btf_id(struct bpf_verifier_env *env) { struct bpf_prog *prog = env->prog; @@ -9517,6 +9593,9 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) long addr; u64 key; + if (prog->type == BPF_PROG_TYPE_STRUCT_OPS) + return check_struct_ops_btf_id(env); + if (prog->type != BPF_PROG_TYPE_TRACING) return 0; |
