diff options
| author | Alexei Starovoitov <ast@kernel.org> | 2020-01-09 08:46:19 -0800 |
|---|---|---|
| committer | Alexei Starovoitov <ast@kernel.org> | 2020-01-09 09:15:25 -0800 |
| commit | 417759f7d4cf44a5fb526fbafcc9372e3dbfc0ae (patch) | |
| tree | c07bf1d77af64ba689ace342512ccbbc0c2d6a00 /kernel/bpf/syscall.c | |
| parent | 65726b5b7efae718391752ae8bb85b9843fd83f4 (diff) | |
| parent | 09903869f69f37fd7a465183545b5739c6274654 (diff) | |
| download | lwn-417759f7d4cf44a5fb526fbafcc9372e3dbfc0ae.tar.gz lwn-417759f7d4cf44a5fb526fbafcc9372e3dbfc0ae.zip | |
Merge branch 'tcp-bpf-cc'
Martin Lau says:
====================
This series introduces BPF STRUCT_OPS. It is an infra to allow
implementing some specific kernel's function pointers in BPF.
The first use case included in this series is to implement
TCP congestion control algorithm in BPF (i.e. implement
struct tcp_congestion_ops in BPF).
There has been attempt to move the TCP CC to the user space
(e.g. CCP in TCP). The common arguments are faster turn around,
get away from long-tail kernel versions in production...etc,
which are legit points.
BPF has been the continuous effort to join both kernel and
userspace upsides together (e.g. XDP to gain the performance
advantage without bypassing the kernel). The recent BPF
advancements (in particular BTF-aware verifier, BPF trampoline,
BPF CO-RE...) made implementing kernel struct ops (e.g. tcp cc)
possible in BPF.
The idea is to allow implementing tcp_congestion_ops in bpf.
It allows a faster turnaround for testing algorithm in the
production while leveraging the existing (and continue growing) BPF
feature/framework instead of building one specifically for
userspace TCP CC.
Please see individual patch for details.
The bpftool support will be posted in follow-up patches.
v4:
- Expose tcp_ca_find() to tcp.h in patch 7.
It is used to check the same bpf-tcp-cc
does not exist to guarantee the register()
will succeed.
- set_memory_ro() and then set_memory_x() only after all
trampolines are written to the image in patch 6. (Daniel)
spinlock is replaced by mutex because set_memory_*
requires sleepable context.
v3:
- Fix kbuild error by considering CONFIG_BPF_SYSCALL (kbuild)
- Support anonymous bitfield in patch 4 (Andrii, Yonghong)
- Push boundary safety check to a specific arch's trampoline function
(in patch 6) (Yonghong).
Reuse the WANR_ON_ONCE check in arch_prepare_bpf_trampoline() in x86.
- Check module field is 0 in udata in patch 6 (Yonghong)
- Check zero holes in patch 6 (Andrii)
- s/_btf_vmlinux/btf/ in patch 5 and 7 (Andrii)
- s/check_xxx/is_xxx/ in patch 7 (Andrii)
- Use "struct_ops/" convention in patch 11 (Andrii)
- Use the skel instead of bpf_object in patch 11 (Andrii)
- libbpf: Decide BPF_PROG_TYPE_STRUCT_OPS at open phase by using
find_sec_def()
- libbpf: Avoid a debug message at open phase (Andrii)
- libbpf: Add bpf_program__(is|set)_struct_ops() for consistency (Andrii)
- libbpf: Add "struct_ops" to section_defs (Andrii)
- libbpf: Some code shuffling in init_kern_struct_ops() (Andrii)
- libbpf: A few safety checks (Andrii)
v2:
- Dropped cubic for now. They will be reposted
once there are more clarity in "jiffies" on both
bpf side (about the helper) and
tcp_cubic side (some of jiffies usages are being replaced
by tp->tcp_mstamp)
- Remove unnecssary check on bitfield support from btf_struct_access()
(Yonghong)
- BTF_TYPE_EMIT macro (Yonghong, Andrii)
- value_name's length check to avoid an unlikely
type match during truncation case (Yonghong)
- BUILD_BUG_ON to ensure no trampoline-image overrun
in the future (Yonghong)
- Simplify get_next_key() (Yonghong)
- Added comment to explain how to check mandatory
func ptr in net/ipv4/bpf_tcp_ca.c (Yonghong)
- Rename "__bpf_" to "bpf_struct_ops_" for value prefix (Andrii)
- Add comment to highlight the bpf_dctcp.c is not necessarily
the same as tcp_dctcp.c. (Alexei, Eric)
- libbpf: Renmae "struct_ops" to ".struct_ops" for elf sec (Andrii)
- libbpf: Expose struct_ops as a bpf_map (Andrii)
- libbpf: Support multiple struct_ops in SEC(".struct_ops") (Andrii)
- libbpf: Add bpf_map__attach_struct_ops() (Andrii)
====================
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Diffstat (limited to 'kernel/bpf/syscall.c')
| -rw-r--r-- | kernel/bpf/syscall.c | 69 |
1 files changed, 46 insertions, 23 deletions
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 81ee8595dfee..f9db72a96ec0 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -628,7 +628,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, return ret; } -#define BPF_MAP_CREATE_LAST_FIELD btf_value_type_id +#define BPF_MAP_CREATE_LAST_FIELD btf_vmlinux_value_type_id /* called via syscall */ static int map_create(union bpf_attr *attr) { @@ -642,6 +642,14 @@ static int map_create(union bpf_attr *attr) if (err) return -EINVAL; + if (attr->btf_vmlinux_value_type_id) { + if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS || + attr->btf_key_type_id || attr->btf_value_type_id) + return -EINVAL; + } else if (attr->btf_key_type_id && !attr->btf_value_type_id) { + return -EINVAL; + } + f_flags = bpf_get_file_flag(attr->map_flags); if (f_flags < 0) return f_flags; @@ -664,32 +672,35 @@ static int map_create(union bpf_attr *attr) atomic64_set(&map->usercnt, 1); mutex_init(&map->freeze_mutex); - if (attr->btf_key_type_id || attr->btf_value_type_id) { + map->spin_lock_off = -EINVAL; + if (attr->btf_key_type_id || attr->btf_value_type_id || + /* Even the map's value is a kernel's struct, + * the bpf_prog.o must have BTF to begin with + * to figure out the corresponding kernel's + * counter part. Thus, attr->btf_fd has + * to be valid also. + */ + attr->btf_vmlinux_value_type_id) { struct btf *btf; - if (!attr->btf_value_type_id) { - err = -EINVAL; - goto free_map; - } - btf = btf_get_by_fd(attr->btf_fd); if (IS_ERR(btf)) { err = PTR_ERR(btf); goto free_map; } + map->btf = btf; - err = map_check_btf(map, btf, attr->btf_key_type_id, - attr->btf_value_type_id); - if (err) { - btf_put(btf); - goto free_map; + if (attr->btf_value_type_id) { + err = map_check_btf(map, btf, attr->btf_key_type_id, + attr->btf_value_type_id); + if (err) + goto free_map; } - map->btf = btf; map->btf_key_type_id = attr->btf_key_type_id; map->btf_value_type_id = attr->btf_value_type_id; - } else { - map->spin_lock_off = -EINVAL; + map->btf_vmlinux_value_type_id = + attr->btf_vmlinux_value_type_id; } err = security_bpf_map_alloc(map); @@ -888,6 +899,9 @@ static int map_lookup_elem(union bpf_attr *attr) } else if (map->map_type == BPF_MAP_TYPE_QUEUE || map->map_type == BPF_MAP_TYPE_STACK) { err = map->ops->map_peek_elem(map, value); + } else if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { + /* struct_ops map requires directly updating "value" */ + err = bpf_struct_ops_map_sys_lookup_elem(map, key, value); } else { rcu_read_lock(); if (map->ops->map_lookup_elem_sys_only) @@ -1003,7 +1017,8 @@ static int map_update_elem(union bpf_attr *attr) goto out; } else if (map->map_type == BPF_MAP_TYPE_CPUMAP || map->map_type == BPF_MAP_TYPE_SOCKHASH || - map->map_type == BPF_MAP_TYPE_SOCKMAP) { + map->map_type == BPF_MAP_TYPE_SOCKMAP || + map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { err = map->ops->map_update_elem(map, key, value, attr->flags); goto out; } else if (IS_FD_PROG_ARRAY(map)) { @@ -1092,7 +1107,9 @@ static int map_delete_elem(union bpf_attr *attr) if (bpf_map_is_dev_bound(map)) { err = bpf_map_offload_delete_elem(map, key); goto out; - } else if (IS_FD_PROG_ARRAY(map)) { + } else if (IS_FD_PROG_ARRAY(map) || + map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { + /* These maps require sleepable context */ err = map->ops->map_delete_elem(map, key); goto out; } @@ -1672,17 +1689,22 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, enum bpf_attach_type expected_attach_type, u32 btf_id, u32 prog_fd) { - switch (prog_type) { - case BPF_PROG_TYPE_TRACING: + if (btf_id) { if (btf_id > BTF_MAX_TYPE) return -EINVAL; - break; - default: - if (btf_id || prog_fd) + + switch (prog_type) { + case BPF_PROG_TYPE_TRACING: + case BPF_PROG_TYPE_STRUCT_OPS: + break; + default: return -EINVAL; - break; + } } + if (prog_fd && prog_type != BPF_PROG_TYPE_TRACING) + return -EINVAL; + switch (prog_type) { case BPF_PROG_TYPE_CGROUP_SOCK: switch (expected_attach_type) { @@ -2817,6 +2839,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, info.btf_key_type_id = map->btf_key_type_id; info.btf_value_type_id = map->btf_value_type_id; } + info.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id; if (bpf_map_is_dev_bound(map)) { err = bpf_map_offload_info_fill(&info, map); |
