From 1cedee13d25ab118d325f95588c1a084e9317229 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Fri, 25 May 2018 08:55:23 -0700 Subject: bpf: Hooks for sys_sendmsg In addition to already existing BPF hooks for sys_bind and sys_connect, the patch provides new hooks for sys_sendmsg. It leverages existing BPF program type `BPF_PROG_TYPE_CGROUP_SOCK_ADDR` that provides access to socket itlself (properties like family, type, protocol) and user-passed `struct sockaddr *` so that BPF program can override destination IP and port for system calls such as sendto(2) or sendmsg(2) and/or assign source IP to the socket. The hooks are implemented as two new attach types: `BPF_CGROUP_UDP4_SENDMSG` and `BPF_CGROUP_UDP6_SENDMSG` for UDPv4 and UDPv6 correspondingly. UDPv4 and UDPv6 separate attach types for same reason as sys_bind and sys_connect hooks, i.e. to prevent reading from / writing to e.g. user_ip6 fields when user passes sockaddr_in since it'd be out-of-bound. The difference with already existing hooks is sys_sendmsg are implemented only for unconnected UDP. For TCP it doesn't make sense to change user-provided `struct sockaddr *` at sendto(2)/sendmsg(2) time since socket either was already connected and has source/destination set or wasn't connected and call to sendto(2)/sendmsg(2) would lead to ENOTCONN anyway. Connected UDP is already handled by sys_connect hooks that can override source/destination at connect time and use fast-path later, i.e. these hooks don't affect UDP fast-path. Rewriting source IP is implemented differently than that in sys_connect hooks. When sys_sendmsg is used with unconnected UDP it doesn't work to just bind socket to desired local IP address since source IP can be set on per-packet basis by using ancillary data (cmsg(3)). So no matter if socket is bound or not, source IP has to be rewritten on every call to sys_sendmsg. To do so two new fields are added to UAPI `struct bpf_sock_addr`; * `msg_src_ip4` to set source IPv4 for UDPv4; * `msg_src_ip6` to set source IPv6 for UDPv6. Signed-off-by: Andrey Ignatov Acked-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 9b8c6e310e9a..cc68787f2d97 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -160,6 +160,8 @@ enum bpf_attach_type { BPF_CGROUP_INET6_CONNECT, BPF_CGROUP_INET4_POST_BIND, BPF_CGROUP_INET6_POST_BIND, + BPF_CGROUP_UDP4_SENDMSG, + BPF_CGROUP_UDP6_SENDMSG, __MAX_BPF_ATTACH_TYPE }; @@ -2363,6 +2365,12 @@ struct bpf_sock_addr { __u32 family; /* Allows 4-byte read, but no write */ __u32 type; /* Allows 4-byte read, but no write */ __u32 protocol; /* Allows 4-byte read, but no write */ + __u32 msg_src_ip4; /* Allows 1,2,4-byte read an 4-byte write. + * Stored in network byte order. + */ + __u32 msg_src_ip6[4]; /* Allows 1,2,4-byte read an 4-byte write. + * Stored in network byte order. + */ }; /* User bpf_sock_ops struct to access socket values and specify request ops -- cgit v1.2.3 From 7a279e93330bd4f0074973293f1cd1aacf7c5fa6 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Tue, 29 May 2018 12:27:44 +0100 Subject: bpf: clean up eBPF helpers documentation These are minor edits for the eBPF helpers documentation in include/uapi/linux/bpf.h. The main fix consists in removing "BPF_FIB_LOOKUP_", because it ends with a non-escaped underscore that gets interpreted by rst2man and produces the following message in the resulting manual page: DOCUTILS SYSTEM MESSAGES System Message: ERROR/3 (/tmp/bpf-helpers.rst:, line 1514) Unknown target name: "bpf_fib_lookup". Other edits consist in: - Improving formatting for flag values for "bpf_fib_lookup()" helper. - Emphasising a parameter name in description of the return value for "bpf_get_stack()" helper. - Removing unnecessary blank lines between "Description" and "Return" sections for the few helpers that would use it, for consistency. Signed-off-by: Quentin Monnet Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 21 ++++++++++----------- tools/include/uapi/linux/bpf.h | 21 ++++++++++----------- 2 files changed, 20 insertions(+), 22 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index cc68787f2d97..3f556b35ac8d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1010,7 +1010,6 @@ union bpf_attr { * :: * * # sysctl kernel.perf_event_max_stack= - * * Return * The positive or null stack id on success, or a negative error * in case of failure. @@ -1821,10 +1820,9 @@ union bpf_attr { * :: * * # sysctl kernel.perf_event_max_stack= - * * Return - * a non-negative value equal to or less than size on success, or - * a negative error in case of failure. + * A non-negative value equal to or less than *size* on success, + * or a negative error in case of failure. * * int skb_load_bytes_relative(const struct sk_buff *skb, u32 offset, void *to, u32 len, u32 start_header) * Description @@ -1845,7 +1843,6 @@ union bpf_attr { * in socket filters where *skb*\ **->data** does not always point * to the start of the mac header and where "direct packet access" * is not available. - * * Return * 0 on success, or a negative error in case of failure. * @@ -1861,16 +1858,18 @@ union bpf_attr { * rt_metric is set to metric from route. * * *plen* argument is the size of the passed in struct. - * *flags* argument can be one or more BPF_FIB_LOOKUP_ flags: + * *flags* argument can be a combination of one or more of the + * following values: * - * **BPF_FIB_LOOKUP_DIRECT** means do a direct table lookup vs - * full lookup using FIB rules - * **BPF_FIB_LOOKUP_OUTPUT** means do lookup from an egress - * perspective (default is ingress) + * **BPF_FIB_LOOKUP_DIRECT** + * Do a direct table lookup vs full lookup using FIB + * rules. + * **BPF_FIB_LOOKUP_OUTPUT** + * Perform lookup from an egress perspective (default is + * ingress). * * *ctx* is either **struct xdp_md** for XDP programs or * **struct sk_buff** tc cls_act programs. - * * Return * Egress device index on success, 0 if packet needs to continue * up the stack for further processing or a negative error in case diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index cc68787f2d97..3f556b35ac8d 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1010,7 +1010,6 @@ union bpf_attr { * :: * * # sysctl kernel.perf_event_max_stack= - * * Return * The positive or null stack id on success, or a negative error * in case of failure. @@ -1821,10 +1820,9 @@ union bpf_attr { * :: * * # sysctl kernel.perf_event_max_stack= - * * Return - * a non-negative value equal to or less than size on success, or - * a negative error in case of failure. + * A non-negative value equal to or less than *size* on success, + * or a negative error in case of failure. * * int skb_load_bytes_relative(const struct sk_buff *skb, u32 offset, void *to, u32 len, u32 start_header) * Description @@ -1845,7 +1843,6 @@ union bpf_attr { * in socket filters where *skb*\ **->data** does not always point * to the start of the mac header and where "direct packet access" * is not available. - * * Return * 0 on success, or a negative error in case of failure. * @@ -1861,16 +1858,18 @@ union bpf_attr { * rt_metric is set to metric from route. * * *plen* argument is the size of the passed in struct. - * *flags* argument can be one or more BPF_FIB_LOOKUP_ flags: + * *flags* argument can be a combination of one or more of the + * following values: * - * **BPF_FIB_LOOKUP_DIRECT** means do a direct table lookup vs - * full lookup using FIB rules - * **BPF_FIB_LOOKUP_OUTPUT** means do lookup from an egress - * perspective (default is ingress) + * **BPF_FIB_LOOKUP_DIRECT** + * Do a direct table lookup vs full lookup using FIB + * rules. + * **BPF_FIB_LOOKUP_OUTPUT** + * Perform lookup from an egress perspective (default is + * ingress). * * *ctx* is either **struct xdp_md** for XDP programs or * **struct sk_buff** tc cls_act programs. - * * Return * Egress device index on success, 0 if packet needs to continue * up the stack for further processing or a negative error in case -- cgit v1.2.3 From fa898d769b264ede3c10cc30a537316c6a946956 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 29 May 2018 10:58:07 -0700 Subject: bpf: Drop mpls from bpf_fib_lookup MPLS support will not be submitted this dev cycle, but in working on it I do see a few changes are needed to the API. For now, drop mpls from the API. Since the fields in question are unions, the mpls fields can be added back later without affecting the uapi. Signed-off-by: David Ahern Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3f556b35ac8d..33f37eb0b6bf 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1852,10 +1852,10 @@ union bpf_attr { * If lookup is successful and result shows packet is to be * forwarded, the neighbor tables are searched for the nexthop. * If successful (ie., FIB lookup shows forwarding and nexthop - * is resolved), the nexthop address is returned in ipv4_dst, - * ipv6_dst or mpls_out based on family, smac is set to mac - * address of egress device, dmac is set to nexthop mac address, - * rt_metric is set to metric from route. + * is resolved), the nexthop address is returned in ipv4_dst + * or ipv6_dst based on family, smac is set to mac address of + * egress device, dmac is set to nexthop mac address, rt_metric + * is set to metric from route (IPv4/IPv6 only). * * *plen* argument is the size of the passed in struct. * *flags* argument can be a combination of one or more of the @@ -2537,8 +2537,10 @@ struct bpf_raw_tracepoint_args { #define BPF_FIB_LOOKUP_OUTPUT BIT(1) struct bpf_fib_lookup { - /* input */ - __u8 family; /* network family, AF_INET, AF_INET6, AF_MPLS */ + /* input: network family for lookup (AF_INET, AF_INET6) + * output: network family of egress nexthop + */ + __u8 family; /* set if lookup is to consider L4 data - e.g., FIB rules */ __u8 l4_protocol; @@ -2554,22 +2556,20 @@ struct bpf_fib_lookup { __u8 tos; /* AF_INET */ __be32 flowlabel; /* AF_INET6 */ - /* output: metric of fib result */ - __u32 rt_metric; + /* output: metric of fib result (IPv4/IPv6 only) */ + __u32 rt_metric; }; union { - __be32 mpls_in; __be32 ipv4_src; __u32 ipv6_src[4]; /* in6_addr; network order */ }; - /* input to bpf_fib_lookup, *dst is destination address. - * output: bpf_fib_lookup sets to gateway address + /* input to bpf_fib_lookup, ipv{4,6}_dst is destination address in + * network header. output: bpf_fib_lookup sets to gateway address + * if FIB lookup returns gateway route */ union { - /* return for MPLS lookups */ - __be32 mpls_out[4]; /* support up to 4 labels */ __be32 ipv4_dst; __u32 ipv6_dst[4]; /* in6_addr; network order */ }; -- cgit v1.2.3 From f4364dcfc86df7c1ca47b256eaf6b6d0cdd0d936 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Sun, 27 May 2018 12:24:09 +0100 Subject: media: rc: introduce BPF_PROG_LIRC_MODE2 Add support for BPF_PROG_LIRC_MODE2. This type of BPF program can call rc_keydown() to reported decoded IR scancodes, or rc_repeat() to report that the last key should be repeated. The bpf program can be attached to using the bpf(BPF_PROG_ATTACH) syscall; the target_fd must be the /dev/lircN device. Acked-by: Yonghong Song Signed-off-by: Sean Young Signed-off-by: Daniel Borkmann --- drivers/media/rc/Kconfig | 13 ++ drivers/media/rc/Makefile | 1 + drivers/media/rc/bpf-lirc.c | 313 ++++++++++++++++++++++++++++++++++++++++ drivers/media/rc/lirc_dev.c | 30 ++++ drivers/media/rc/rc-core-priv.h | 21 +++ drivers/media/rc/rc-ir-raw.c | 12 +- include/linux/bpf_lirc.h | 29 ++++ include/linux/bpf_types.h | 3 + include/uapi/linux/bpf.h | 53 ++++++- kernel/bpf/syscall.c | 7 + 10 files changed, 479 insertions(+), 3 deletions(-) create mode 100644 drivers/media/rc/bpf-lirc.c create mode 100644 include/linux/bpf_lirc.h (limited to 'include/uapi') diff --git a/drivers/media/rc/Kconfig b/drivers/media/rc/Kconfig index eb2c3b6eca7f..d5b35a6ba899 100644 --- a/drivers/media/rc/Kconfig +++ b/drivers/media/rc/Kconfig @@ -25,6 +25,19 @@ config LIRC passes raw IR to and from userspace, which is needed for IR transmitting (aka "blasting") and for the lirc daemon. +config BPF_LIRC_MODE2 + bool "Support for eBPF programs attached to lirc devices" + depends on BPF_SYSCALL + depends on RC_CORE=y + depends on LIRC + help + Allow attaching eBPF programs to a lirc device using the bpf(2) + syscall command BPF_PROG_ATTACH. This is supported for raw IR + receivers. + + These eBPF programs can be used to decode IR into scancodes, for + IR protocols not supported by the kernel decoders. + menuconfig RC_DECODERS bool "Remote controller decoders" depends on RC_CORE diff --git a/drivers/media/rc/Makefile b/drivers/media/rc/Makefile index 2e1c87066f6c..e0340d043fe8 100644 --- a/drivers/media/rc/Makefile +++ b/drivers/media/rc/Makefile @@ -5,6 +5,7 @@ obj-y += keymaps/ obj-$(CONFIG_RC_CORE) += rc-core.o rc-core-y := rc-main.o rc-ir-raw.o rc-core-$(CONFIG_LIRC) += lirc_dev.o +rc-core-$(CONFIG_BPF_LIRC_MODE2) += bpf-lirc.o obj-$(CONFIG_IR_NEC_DECODER) += ir-nec-decoder.o obj-$(CONFIG_IR_RC5_DECODER) += ir-rc5-decoder.o obj-$(CONFIG_IR_RC6_DECODER) += ir-rc6-decoder.o diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c new file mode 100644 index 000000000000..40826bba06b6 --- /dev/null +++ b/drivers/media/rc/bpf-lirc.c @@ -0,0 +1,313 @@ +// SPDX-License-Identifier: GPL-2.0 +// bpf-lirc.c - handles bpf +// +// Copyright (C) 2018 Sean Young + +#include +#include +#include +#include "rc-core-priv.h" + +/* + * BPF interface for raw IR + */ +const struct bpf_prog_ops lirc_mode2_prog_ops = { +}; + +BPF_CALL_1(bpf_rc_repeat, u32*, sample) +{ + struct ir_raw_event_ctrl *ctrl; + + ctrl = container_of(sample, struct ir_raw_event_ctrl, bpf_sample); + + rc_repeat(ctrl->dev); + + return 0; +} + +static const struct bpf_func_proto rc_repeat_proto = { + .func = bpf_rc_repeat, + .gpl_only = true, /* rc_repeat is EXPORT_SYMBOL_GPL */ + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +/* + * Currently rc-core does not support 64-bit scancodes, but there are many + * known protocols with more than 32 bits. So, define the interface as u64 + * as a future-proof. + */ +BPF_CALL_4(bpf_rc_keydown, u32*, sample, u32, protocol, u64, scancode, + u32, toggle) +{ + struct ir_raw_event_ctrl *ctrl; + + ctrl = container_of(sample, struct ir_raw_event_ctrl, bpf_sample); + + rc_keydown(ctrl->dev, protocol, scancode, toggle != 0); + + return 0; +} + +static const struct bpf_func_proto rc_keydown_proto = { + .func = bpf_rc_keydown, + .gpl_only = true, /* rc_keydown is EXPORT_SYMBOL_GPL */ + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, +}; + +static const struct bpf_func_proto * +lirc_mode2_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_rc_repeat: + return &rc_repeat_proto; + case BPF_FUNC_rc_keydown: + return &rc_keydown_proto; + case BPF_FUNC_map_lookup_elem: + return &bpf_map_lookup_elem_proto; + case BPF_FUNC_map_update_elem: + return &bpf_map_update_elem_proto; + case BPF_FUNC_map_delete_elem: + return &bpf_map_delete_elem_proto; + case BPF_FUNC_ktime_get_ns: + return &bpf_ktime_get_ns_proto; + case BPF_FUNC_tail_call: + return &bpf_tail_call_proto; + case BPF_FUNC_get_prandom_u32: + return &bpf_get_prandom_u32_proto; + case BPF_FUNC_trace_printk: + if (capable(CAP_SYS_ADMIN)) + return bpf_get_trace_printk_proto(); + /* fall through */ + default: + return NULL; + } +} + +static bool lirc_mode2_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + /* We have one field of u32 */ + return type == BPF_READ && off == 0 && size == sizeof(u32); +} + +const struct bpf_verifier_ops lirc_mode2_verifier_ops = { + .get_func_proto = lirc_mode2_func_proto, + .is_valid_access = lirc_mode2_is_valid_access +}; + +#define BPF_MAX_PROGS 64 + +static int lirc_bpf_attach(struct rc_dev *rcdev, struct bpf_prog *prog) +{ + struct bpf_prog_array __rcu *old_array; + struct bpf_prog_array *new_array; + struct ir_raw_event_ctrl *raw; + int ret; + + if (rcdev->driver_type != RC_DRIVER_IR_RAW) + return -EINVAL; + + ret = mutex_lock_interruptible(&ir_raw_handler_lock); + if (ret) + return ret; + + raw = rcdev->raw; + if (!raw) { + ret = -ENODEV; + goto unlock; + } + + if (raw->progs && bpf_prog_array_length(raw->progs) >= BPF_MAX_PROGS) { + ret = -E2BIG; + goto unlock; + } + + old_array = raw->progs; + ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array); + if (ret < 0) + goto unlock; + + rcu_assign_pointer(raw->progs, new_array); + bpf_prog_array_free(old_array); + +unlock: + mutex_unlock(&ir_raw_handler_lock); + return ret; +} + +static int lirc_bpf_detach(struct rc_dev *rcdev, struct bpf_prog *prog) +{ + struct bpf_prog_array __rcu *old_array; + struct bpf_prog_array *new_array; + struct ir_raw_event_ctrl *raw; + int ret; + + if (rcdev->driver_type != RC_DRIVER_IR_RAW) + return -EINVAL; + + ret = mutex_lock_interruptible(&ir_raw_handler_lock); + if (ret) + return ret; + + raw = rcdev->raw; + if (!raw) { + ret = -ENODEV; + goto unlock; + } + + old_array = raw->progs; + ret = bpf_prog_array_copy(old_array, prog, NULL, &new_array); + /* + * Do not use bpf_prog_array_delete_safe() as we would end up + * with a dummy entry in the array, and the we would free the + * dummy in lirc_bpf_free() + */ + if (ret) + goto unlock; + + rcu_assign_pointer(raw->progs, new_array); + bpf_prog_array_free(old_array); +unlock: + mutex_unlock(&ir_raw_handler_lock); + return ret; +} + +void lirc_bpf_run(struct rc_dev *rcdev, u32 sample) +{ + struct ir_raw_event_ctrl *raw = rcdev->raw; + + raw->bpf_sample = sample; + + if (raw->progs) + BPF_PROG_RUN_ARRAY(raw->progs, &raw->bpf_sample, BPF_PROG_RUN); +} + +/* + * This should be called once the rc thread has been stopped, so there can be + * no concurrent bpf execution. + */ +void lirc_bpf_free(struct rc_dev *rcdev) +{ + struct bpf_prog **progs; + + if (!rcdev->raw->progs) + return; + + progs = rcu_dereference(rcdev->raw->progs)->progs; + while (*progs) + bpf_prog_put(*progs++); + + bpf_prog_array_free(rcdev->raw->progs); +} + +int lirc_prog_attach(const union bpf_attr *attr) +{ + struct bpf_prog *prog; + struct rc_dev *rcdev; + int ret; + + if (attr->attach_flags) + return -EINVAL; + + prog = bpf_prog_get_type(attr->attach_bpf_fd, + BPF_PROG_TYPE_LIRC_MODE2); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + rcdev = rc_dev_get_from_fd(attr->target_fd); + if (IS_ERR(rcdev)) { + bpf_prog_put(prog); + return PTR_ERR(rcdev); + } + + ret = lirc_bpf_attach(rcdev, prog); + if (ret) + bpf_prog_put(prog); + + put_device(&rcdev->dev); + + return ret; +} + +int lirc_prog_detach(const union bpf_attr *attr) +{ + struct bpf_prog *prog; + struct rc_dev *rcdev; + int ret; + + if (attr->attach_flags) + return -EINVAL; + + prog = bpf_prog_get_type(attr->attach_bpf_fd, + BPF_PROG_TYPE_LIRC_MODE2); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + rcdev = rc_dev_get_from_fd(attr->target_fd); + if (IS_ERR(rcdev)) { + bpf_prog_put(prog); + return PTR_ERR(rcdev); + } + + ret = lirc_bpf_detach(rcdev, prog); + + bpf_prog_put(prog); + put_device(&rcdev->dev); + + return ret; +} + +int lirc_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) +{ + __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); + struct bpf_prog_array __rcu *progs; + struct rc_dev *rcdev; + u32 cnt, flags = 0; + int ret; + + if (attr->query.query_flags) + return -EINVAL; + + rcdev = rc_dev_get_from_fd(attr->query.target_fd); + if (IS_ERR(rcdev)) + return PTR_ERR(rcdev); + + if (rcdev->driver_type != RC_DRIVER_IR_RAW) { + ret = -EINVAL; + goto put; + } + + ret = mutex_lock_interruptible(&ir_raw_handler_lock); + if (ret) + goto put; + + progs = rcdev->raw->progs; + cnt = progs ? bpf_prog_array_length(progs) : 0; + + if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt))) { + ret = -EFAULT; + goto unlock; + } + + if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) { + ret = -EFAULT; + goto unlock; + } + + if (attr->query.prog_cnt != 0 && prog_ids && cnt) + ret = bpf_prog_array_copy_to_user(progs, prog_ids, cnt); + +unlock: + mutex_unlock(&ir_raw_handler_lock); +put: + put_device(&rcdev->dev); + + return ret; +} diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c index 24e9fbb80e81..da7013a12a58 100644 --- a/drivers/media/rc/lirc_dev.c +++ b/drivers/media/rc/lirc_dev.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -104,6 +105,12 @@ void ir_lirc_raw_event(struct rc_dev *dev, struct ir_raw_event ev) TO_US(ev.duration), TO_STR(ev.pulse)); } + /* + * bpf does not care about the gap generated above; that exists + * for backwards compatibility + */ + lirc_bpf_run(dev, sample); + spin_lock_irqsave(&dev->lirc_fh_lock, flags); list_for_each_entry(fh, &dev->lirc_fh, list) { if (LIRC_IS_TIMEOUT(sample) && !fh->send_timeout_reports) @@ -816,4 +823,27 @@ void __exit lirc_dev_exit(void) unregister_chrdev_region(lirc_base_dev, RC_DEV_MAX); } +struct rc_dev *rc_dev_get_from_fd(int fd) +{ + struct fd f = fdget(fd); + struct lirc_fh *fh; + struct rc_dev *dev; + + if (!f.file) + return ERR_PTR(-EBADF); + + if (f.file->f_op != &lirc_fops) { + fdput(f); + return ERR_PTR(-EINVAL); + } + + fh = f.file->private_data; + dev = fh->rc; + + get_device(&dev->dev); + fdput(f); + + return dev; +} + MODULE_ALIAS("lirc_dev"); diff --git a/drivers/media/rc/rc-core-priv.h b/drivers/media/rc/rc-core-priv.h index e0e6a17460f6..eb004757038b 100644 --- a/drivers/media/rc/rc-core-priv.h +++ b/drivers/media/rc/rc-core-priv.h @@ -13,6 +13,7 @@ #define MAX_IR_EVENT_SIZE 512 #include +#include #include /** @@ -57,6 +58,11 @@ struct ir_raw_event_ctrl { /* raw decoder state follows */ struct ir_raw_event prev_ev; struct ir_raw_event this_ev; + +#ifdef CONFIG_BPF_LIRC_MODE2 + u32 bpf_sample; + struct bpf_prog_array __rcu *progs; +#endif struct nec_dec { int state; unsigned count; @@ -126,6 +132,9 @@ struct ir_raw_event_ctrl { } imon; }; +/* Mutex for locking raw IR processing and handler change */ +extern struct mutex ir_raw_handler_lock; + /* macros for IR decoders */ static inline bool geq_margin(unsigned d1, unsigned d2, unsigned margin) { @@ -288,6 +297,7 @@ void ir_lirc_raw_event(struct rc_dev *dev, struct ir_raw_event ev); void ir_lirc_scancode_event(struct rc_dev *dev, struct lirc_scancode *lsc); int ir_lirc_register(struct rc_dev *dev); void ir_lirc_unregister(struct rc_dev *dev); +struct rc_dev *rc_dev_get_from_fd(int fd); #else static inline int lirc_dev_init(void) { return 0; } static inline void lirc_dev_exit(void) {} @@ -299,4 +309,15 @@ static inline int ir_lirc_register(struct rc_dev *dev) { return 0; } static inline void ir_lirc_unregister(struct rc_dev *dev) { } #endif +/* + * bpf interface + */ +#ifdef CONFIG_BPF_LIRC_MODE2 +void lirc_bpf_free(struct rc_dev *dev); +void lirc_bpf_run(struct rc_dev *dev, u32 sample); +#else +static inline void lirc_bpf_free(struct rc_dev *dev) { } +static inline void lirc_bpf_run(struct rc_dev *dev, u32 sample) { } +#endif + #endif /* _RC_CORE_PRIV */ diff --git a/drivers/media/rc/rc-ir-raw.c b/drivers/media/rc/rc-ir-raw.c index 374f83105a23..7675b7ee5bc7 100644 --- a/drivers/media/rc/rc-ir-raw.c +++ b/drivers/media/rc/rc-ir-raw.c @@ -14,7 +14,7 @@ static LIST_HEAD(ir_raw_client_list); /* Used to handle IR raw handler extensions */ -static DEFINE_MUTEX(ir_raw_handler_lock); +DEFINE_MUTEX(ir_raw_handler_lock); static LIST_HEAD(ir_raw_handler_list); static atomic64_t available_protocols = ATOMIC64_INIT(0); @@ -621,9 +621,17 @@ void ir_raw_event_unregister(struct rc_dev *dev) list_for_each_entry(handler, &ir_raw_handler_list, list) if (handler->raw_unregister) handler->raw_unregister(dev); - mutex_unlock(&ir_raw_handler_lock); + + lirc_bpf_free(dev); ir_raw_event_free(dev); + + /* + * A user can be calling bpf(BPF_PROG_{QUERY|ATTACH|DETACH}), so + * ensure that the raw member is null on unlock; this is how + * "device gone" is checked. + */ + mutex_unlock(&ir_raw_handler_lock); } /* diff --git a/include/linux/bpf_lirc.h b/include/linux/bpf_lirc.h new file mode 100644 index 000000000000..5f8a4283092d --- /dev/null +++ b/include/linux/bpf_lirc.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BPF_LIRC_H +#define _BPF_LIRC_H + +#include + +#ifdef CONFIG_BPF_LIRC_MODE2 +int lirc_prog_attach(const union bpf_attr *attr); +int lirc_prog_detach(const union bpf_attr *attr); +int lirc_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr); +#else +static inline int lirc_prog_attach(const union bpf_attr *attr) +{ + return -EINVAL; +} + +static inline int lirc_prog_detach(const union bpf_attr *attr) +{ + return -EINVAL; +} + +static inline int lirc_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return -EINVAL; +} +#endif + +#endif /* _BPF_LIRC_H */ diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index b161e506dcfc..c5700c2d5549 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -26,6 +26,9 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT, raw_tracepoint) #ifdef CONFIG_CGROUP_BPF BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev) #endif +#ifdef CONFIG_BPF_LIRC_MODE2 +BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2) +#endif BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 33f37eb0b6bf..64ac0f7a689e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -143,6 +143,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_RAW_TRACEPOINT, BPF_PROG_TYPE_CGROUP_SOCK_ADDR, BPF_PROG_TYPE_LWT_SEG6LOCAL, + BPF_PROG_TYPE_LIRC_MODE2, }; enum bpf_attach_type { @@ -162,6 +163,7 @@ enum bpf_attach_type { BPF_CGROUP_INET6_POST_BIND, BPF_CGROUP_UDP4_SENDMSG, BPF_CGROUP_UDP6_SENDMSG, + BPF_LIRC_MODE2, __MAX_BPF_ATTACH_TYPE }; @@ -2005,6 +2007,53 @@ union bpf_attr { * direct packet access. * Return * 0 on success, or a negative error in case of failure. + * + * int bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle) + * Description + * This helper is used in programs implementing IR decoding, to + * report a successfully decoded key press with *scancode*, + * *toggle* value in the given *protocol*. The scancode will be + * translated to a keycode using the rc keymap, and reported as + * an input key down event. After a period a key up event is + * generated. This period can be extended by calling either + * **bpf_rc_keydown** () again with the same values, or calling + * **bpf_rc_repeat** (). + * + * Some protocols include a toggle bit, in case the button was + * released and pressed again between consecutive scancodes. + * + * The *ctx* should point to the lirc sample as passed into + * the program. + * + * The *protocol* is the decoded protocol number (see + * **enum rc_proto** for some predefined values). + * + * This helper is only available is the kernel was compiled with + * the **CONFIG_BPF_LIRC_MODE2** configuration option set to + * "**y**". + * + * Return + * 0 + * + * int bpf_rc_repeat(void *ctx) + * Description + * This helper is used in programs implementing IR decoding, to + * report a successfully decoded repeat key message. This delays + * the generation of a key up event for previously generated + * key down event. + * + * Some IR protocols like NEC have a special IR message for + * repeating last button, for when a button is held down. + * + * The *ctx* should point to the lirc sample as passed into + * the program. + * + * This helper is only available is the kernel was compiled with + * the **CONFIG_BPF_LIRC_MODE2** configuration option set to + * "**y**". + * + * Return + * 0 */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2083,7 +2132,9 @@ union bpf_attr { FN(lwt_push_encap), \ FN(lwt_seg6_store_bytes), \ FN(lwt_seg6_adjust_srh), \ - FN(lwt_seg6_action), + FN(lwt_seg6_action), \ + FN(rc_repeat), \ + FN(rc_keydown), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e254526d6744..7365d79ae00d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -11,6 +11,7 @@ */ #include #include +#include #include #include #include @@ -1582,6 +1583,8 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, true); + case BPF_LIRC_MODE2: + return lirc_prog_attach(attr); default: return -EINVAL; } @@ -1654,6 +1657,8 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, false); + case BPF_LIRC_MODE2: + return lirc_prog_detach(attr); default: return -EINVAL; } @@ -1703,6 +1708,8 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_SOCK_OPS: case BPF_CGROUP_DEVICE: break; + case BPF_LIRC_MODE2: + return lirc_prog_query(attr, uattr); default: return -EINVAL; } -- cgit v1.2.3 From cb20b08ead401fd17627a36f035c0bf5bfee5567 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 2 Jun 2018 23:06:36 +0200 Subject: bpf: add bpf_skb_cgroup_id helper Add a new bpf_skb_cgroup_id() helper that allows to retrieve the cgroup id from the skb's socket. This is useful in particular to enable bpf_get_cgroup_classid()-like behavior for cgroup v1 in cgroup v2 by allowing ID based matching on egress. This can in particular be used in combination with applying policy e.g. from map lookups, and also complements the older bpf_skb_under_cgroup() interface. In user space the cgroup id for a given path can be retrieved through the f_handle as demonstrated in [0] recently. [0] https://lkml.org/lkml/2018/5/22/1190 Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 19 ++++++++++++++++++- net/core/filter.c | 29 +++++++++++++++++++++++++++-- 2 files changed, 45 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 64ac0f7a689e..661318141f72 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2054,6 +2054,22 @@ union bpf_attr { * * Return * 0 + * + * uint64_t bpf_skb_cgroup_id(struct sk_buff *skb) + * Description + * Return the cgroup v2 id of the socket associated with the *skb*. + * This is roughly similar to the **bpf_get_cgroup_classid**\ () + * helper for cgroup v1 by providing a tag resp. identifier that + * can be matched on or used for map lookups e.g. to implement + * policy. The cgroup v2 id of a given path in the hierarchy is + * exposed in user space through the f_handle API in order to get + * to the same 64-bit id. + * + * This helper can be used on TC egress path, but not on ingress, + * and is available only if the kernel was compiled with the + * **CONFIG_SOCK_CGROUP_DATA** configuration option. + * Return + * The id is returned or 0 in case the id could not be retrieved. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2134,7 +2150,8 @@ union bpf_attr { FN(lwt_seg6_adjust_srh), \ FN(lwt_seg6_action), \ FN(rc_repeat), \ - FN(rc_keydown), + FN(rc_keydown), \ + FN(skb_cgroup_id), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/net/core/filter.c b/net/core/filter.c index 885fb0e2f264..edbfaa613b6d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3661,6 +3661,27 @@ static const struct bpf_func_proto bpf_skb_under_cgroup_proto = { .arg3_type = ARG_ANYTHING, }; +#ifdef CONFIG_SOCK_CGROUP_DATA +BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb) +{ + struct sock *sk = skb_to_full_sk(skb); + struct cgroup *cgrp; + + if (!sk || !sk_fullsock(sk)) + return 0; + + cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + return cgrp->kn->id.id; +} + +static const struct bpf_func_proto bpf_skb_cgroup_id_proto = { + .func = bpf_skb_cgroup_id, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; +#endif + static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff, unsigned long off, unsigned long len) { @@ -4747,12 +4768,16 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_socket_cookie_proto; case BPF_FUNC_get_socket_uid: return &bpf_get_socket_uid_proto; + case BPF_FUNC_fib_lookup: + return &bpf_skb_fib_lookup_proto; #ifdef CONFIG_XFRM case BPF_FUNC_skb_get_xfrm_state: return &bpf_skb_get_xfrm_state_proto; #endif - case BPF_FUNC_fib_lookup: - return &bpf_skb_fib_lookup_proto; +#ifdef CONFIG_SOCK_CGROUP_DATA + case BPF_FUNC_skb_cgroup_id: + return &bpf_skb_cgroup_id_proto; +#endif default: return bpf_base_func_proto(func_id); } -- cgit v1.2.3 From 1fbc2e0cfcf9677683aea2b1f9ea76fbdc6fb6d1 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 2 Jun 2018 23:06:37 +0200 Subject: bpf: make sure to clear unused fields in tunnel/xfrm state fetch Since the remaining bits are not filled in struct bpf_tunnel_key resp. struct bpf_xfrm_state and originate from uninitialized stack space, we should make sure to clear them before handing control back to the program. Also add a padding element to struct bpf_xfrm_state for future use similar as we have in struct bpf_tunnel_key and clear it as well. struct bpf_xfrm_state { __u32 reqid; /* 0 4 */ __u32 spi; /* 4 4 */ __u16 family; /* 8 2 */ /* XXX 2 bytes hole, try to pack */ union { __u32 remote_ipv4; /* 4 */ __u32 remote_ipv6[4]; /* 16 */ }; /* 12 16 */ /* size: 28, cachelines: 1, members: 4 */ /* sum members: 26, holes: 1, sum holes: 2 */ /* last cacheline: 28 bytes */ }; Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 3 ++- net/core/filter.c | 6 ++++++ 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 661318141f72..f0b6608b1f1c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2268,7 +2268,7 @@ struct bpf_tunnel_key { }; __u8 tunnel_tos; __u8 tunnel_ttl; - __u16 tunnel_ext; + __u16 tunnel_ext; /* Padding, future use. */ __u32 tunnel_label; }; @@ -2279,6 +2279,7 @@ struct bpf_xfrm_state { __u32 reqid; __u32 spi; /* Stored in network byte order */ __u16 family; + __u16 ext; /* Padding, future use. */ union { __u32 remote_ipv4; /* Stored in network byte order */ __u32 remote_ipv6[4]; /* Stored in network byte order */ diff --git a/net/core/filter.c b/net/core/filter.c index edbfaa613b6d..28e864777c0f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3445,6 +3445,7 @@ set_compat: to->tunnel_id = be64_to_cpu(info->key.tun_id); to->tunnel_tos = info->key.tos; to->tunnel_ttl = info->key.ttl; + to->tunnel_ext = 0; if (flags & BPF_F_TUNINFO_IPV6) { memcpy(to->remote_ipv6, &info->key.u.ipv6.src, @@ -3452,6 +3453,8 @@ set_compat: to->tunnel_label = be32_to_cpu(info->key.label); } else { to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src); + memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3); + to->tunnel_label = 0; } if (unlikely(size != sizeof(struct bpf_tunnel_key))) @@ -4047,11 +4050,14 @@ BPF_CALL_5(bpf_skb_get_xfrm_state, struct sk_buff *, skb, u32, index, to->reqid = x->props.reqid; to->spi = x->id.spi; to->family = x->props.family; + to->ext = 0; + if (to->family == AF_INET6) { memcpy(to->remote_ipv6, x->props.saddr.a6, sizeof(to->remote_ipv6)); } else { to->remote_ipv4 = x->props.saddr.a4; + memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3); } return 0; -- cgit v1.2.3 From bf6fa2c893c5237b48569a13fa3c673041430b6c Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sun, 3 Jun 2018 15:59:41 -0700 Subject: bpf: implement bpf_get_current_cgroup_id() helper bpf has been used extensively for tracing. For example, bcc contains an almost full set of bpf-based tools to trace kernel and user functions/events. Most tracing tools are currently either filtered based on pid or system-wide. Containers have been used quite extensively in industry and cgroup is often used together to provide resource isolation and protection. Several processes may run inside the same container. It is often desirable to get container-level tracing results as well, e.g. syscall count, function count, I/O activity, etc. This patch implements a new helper, bpf_get_current_cgroup_id(), which will return cgroup id based on the cgroup within which the current task is running. The later patch will provide an example to show that userspace can get the same cgroup id so it could configure a filter or policy in the bpf program based on task cgroup id. The helper is currently implemented for tracing. It can be added to other program types as well when needed. Acked-by: Alexei Starovoitov Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 8 +++++++- kernel/bpf/core.c | 1 + kernel/bpf/helpers.c | 15 +++++++++++++++ kernel/trace/bpf_trace.c | 2 ++ 5 files changed, 26 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index bbe297436e5d..995c3b1e59bf 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -746,6 +746,7 @@ extern const struct bpf_func_proto bpf_get_stackid_proto; extern const struct bpf_func_proto bpf_get_stack_proto; extern const struct bpf_func_proto bpf_sock_map_update_proto; extern const struct bpf_func_proto bpf_sock_hash_update_proto; +extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto; /* Shared helpers among cBPF and eBPF. */ void bpf_user_rnd_init_once(void); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f0b6608b1f1c..18712b0dbfe7 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2070,6 +2070,11 @@ union bpf_attr { * **CONFIG_SOCK_CGROUP_DATA** configuration option. * Return * The id is returned or 0 in case the id could not be retrieved. + * + * u64 bpf_get_current_cgroup_id(void) + * Return + * A 64-bit integer containing the current cgroup id based + * on the cgroup within which the current task is running. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2151,7 +2156,8 @@ union bpf_attr { FN(lwt_seg6_action), \ FN(rc_repeat), \ FN(rc_keydown), \ - FN(skb_cgroup_id), + FN(skb_cgroup_id), \ + FN(get_current_cgroup_id), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 527587de8a67..9f1493705f40 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1765,6 +1765,7 @@ const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; const struct bpf_func_proto bpf_get_current_comm_proto __weak; const struct bpf_func_proto bpf_sock_map_update_proto __weak; const struct bpf_func_proto bpf_sock_hash_update_proto __weak; +const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak; const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) { diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 3d24e238221e..73065e2d23c2 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -179,3 +179,18 @@ const struct bpf_func_proto bpf_get_current_comm_proto = { .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE, }; + +#ifdef CONFIG_CGROUPS +BPF_CALL_0(bpf_get_current_cgroup_id) +{ + struct cgroup *cgrp = task_dfl_cgroup(current); + + return cgrp->kn->id.id; +} + +const struct bpf_func_proto bpf_get_current_cgroup_id_proto = { + .func = bpf_get_current_cgroup_id, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; +#endif diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 752992ce3513..e2ab5b7f29d2 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -564,6 +564,8 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_prandom_u32_proto; case BPF_FUNC_probe_read_str: return &bpf_probe_read_str_proto; + case BPF_FUNC_get_current_cgroup_id: + return &bpf_get_current_cgroup_id_proto; default: return NULL; } -- cgit v1.2.3 From bd3a08aaa9a383ffbbd5b788b797ae6e64eaa7a1 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sun, 3 Jun 2018 08:15:19 -0700 Subject: bpf: flowlabel in bpf_fib_lookup should be flowinfo As Michal noted the flow struct takes both the flow label and priority. Update the bpf_fib_lookup API to note that it is flowinfo and not just the flow label. Cc: Michal Kubecek Signed-off-by: David Ahern Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 2 +- net/core/filter.c | 2 +- samples/bpf/xdp_fwd_kern.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 18712b0dbfe7..eeb6237be5c2 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2629,7 +2629,7 @@ struct bpf_fib_lookup { union { /* inputs to lookup */ __u8 tos; /* AF_INET */ - __be32 flowlabel; /* AF_INET6 */ + __be32 flowinfo; /* AF_INET6, flow_label + priority */ /* output: metric of fib result (IPv4/IPv6 only) */ __u32 rt_metric; diff --git a/net/core/filter.c b/net/core/filter.c index a72ea9f61010..3d9ba7e5965a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4221,7 +4221,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, fl6.flowi6_oif = 0; strict = RT6_LOOKUP_F_HAS_SADDR; } - fl6.flowlabel = params->flowlabel; + fl6.flowlabel = params->flowinfo; fl6.flowi6_scope = 0; fl6.flowi6_flags = 0; fl6.mp_hash = 0; diff --git a/samples/bpf/xdp_fwd_kern.c b/samples/bpf/xdp_fwd_kern.c index 4a6be0f87505..6673cdb9f55c 100644 --- a/samples/bpf/xdp_fwd_kern.c +++ b/samples/bpf/xdp_fwd_kern.c @@ -88,7 +88,7 @@ static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags) return XDP_PASS; fib_params.family = AF_INET6; - fib_params.flowlabel = *(__be32 *)ip6h & IPV6_FLOWINFO_MASK; + fib_params.flowinfo = *(__be32 *)ip6h & IPV6_FLOWINFO_MASK; fib_params.l4_protocol = ip6h->nexthdr; fib_params.sport = 0; fib_params.dport = 0; -- cgit v1.2.3 From bbff2f321a864ee07c9d3d1245af498023146951 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Mon, 4 Jun 2018 13:57:13 +0200 Subject: xsk: new descriptor addressing scheme MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, AF_XDP only supports a fixed frame-size memory scheme where each frame is referenced via an index (idx). A user passes the frame index to the kernel, and the kernel acts upon the data. Some NICs, however, do not have a fixed frame-size model, instead they have a model where a memory window is passed to the hardware and multiple frames are filled into that window (referred to as the "type-writer" model). By changing the descriptor format from the current frame index addressing scheme, AF_XDP can in the future be extended to support these kinds of NICs. In the index-based model, an idx refers to a frame of size frame_size. Addressing a frame in the UMEM is done by offseting the UMEM starting address by a global offset, idx * frame_size + offset. Communicating via the fill- and completion-rings are done by means of idx. In this commit, the idx is removed in favor of an address (addr), which is a relative address ranging over the UMEM. To convert an idx-based address to the new addr is simply: addr = idx * frame_size + offset. We also stop referring to the UMEM "frame" as a frame. Instead it is simply called a chunk. To transfer ownership of a chunk to the kernel, the addr of the chunk is passed in the fill-ring. Note, that the kernel will mask addr to make it chunk aligned, so there is no need for userspace to do that. E.g., for a chunk size of 2k, passing an addr of 2048, 2050 or 3000 to the fill-ring will refer to the same chunk. On the completion-ring, the addr will match that of the Tx descriptor, passed to the kernel. Changing the descriptor format to use chunks/addr will allow for future changes to move to a type-writer based model, where multiple frames can reside in one chunk. In this model passing one single chunk into the fill-ring, would potentially result in multiple Rx descriptors. This commit changes the uapi of AF_XDP sockets, and updates the documentation. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- Documentation/networking/af_xdp.rst | 101 +++++++++++++++++++++--------------- include/uapi/linux/if_xdp.h | 12 ++--- net/xdp/xdp_umem.c | 33 ++++++------ net/xdp/xdp_umem.h | 27 +++------- net/xdp/xdp_umem_props.h | 4 +- net/xdp/xsk.c | 30 ++++++----- net/xdp/xsk_queue.c | 2 +- net/xdp/xsk_queue.h | 43 +++++++-------- 8 files changed, 123 insertions(+), 129 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/networking/af_xdp.rst b/Documentation/networking/af_xdp.rst index 91928d9ee4bf..ff929cfab4f4 100644 --- a/Documentation/networking/af_xdp.rst +++ b/Documentation/networking/af_xdp.rst @@ -12,7 +12,7 @@ packet processing. This document assumes that the reader is familiar with BPF and XDP. If not, the Cilium project has an excellent reference guide at -http://cilium.readthedocs.io/en/doc-1.0/bpf/. +http://cilium.readthedocs.io/en/latest/bpf/. Using the XDP_REDIRECT action from an XDP program, the program can redirect ingress frames to other XDP enabled netdevs, using the @@ -33,22 +33,22 @@ for a while due to a possible retransmit, the descriptor that points to that packet can be changed to point to another and reused right away. This again avoids copying data. -The UMEM consists of a number of equally size frames and each frame -has a unique frame id. A descriptor in one of the rings references a -frame by referencing its frame id. The user space allocates memory for -this UMEM using whatever means it feels is most appropriate (malloc, -mmap, huge pages, etc). This memory area is then registered with the -kernel using the new setsockopt XDP_UMEM_REG. The UMEM also has two -rings: the FILL ring and the COMPLETION ring. The fill ring is used by -the application to send down frame ids for the kernel to fill in with -RX packet data. References to these frames will then appear in the RX -ring once each packet has been received. The completion ring, on the -other hand, contains frame ids that the kernel has transmitted -completely and can now be used again by user space, for either TX or -RX. Thus, the frame ids appearing in the completion ring are ids that -were previously transmitted using the TX ring. In summary, the RX and -FILL rings are used for the RX path and the TX and COMPLETION rings -are used for the TX path. +The UMEM consists of a number of equally sized chunks. A descriptor in +one of the rings references a frame by referencing its addr. The addr +is simply an offset within the entire UMEM region. The user space +allocates memory for this UMEM using whatever means it feels is most +appropriate (malloc, mmap, huge pages, etc). This memory area is then +registered with the kernel using the new setsockopt XDP_UMEM_REG. The +UMEM also has two rings: the FILL ring and the COMPLETION ring. The +fill ring is used by the application to send down addr for the kernel +to fill in with RX packet data. References to these frames will then +appear in the RX ring once each packet has been received. The +completion ring, on the other hand, contains frame addr that the +kernel has transmitted completely and can now be used again by user +space, for either TX or RX. Thus, the frame addrs appearing in the +completion ring are addrs that were previously transmitted using the +TX ring. In summary, the RX and FILL rings are used for the RX path +and the TX and COMPLETION rings are used for the TX path. The socket is then finally bound with a bind() call to a device and a specific queue id on that device, and it is not until bind is @@ -59,13 +59,13 @@ wants to do this, it simply skips the registration of the UMEM and its corresponding two rings, sets the XDP_SHARED_UMEM flag in the bind call and submits the XSK of the process it would like to share UMEM with as well as its own newly created XSK socket. The new process will -then receive frame id references in its own RX ring that point to this -shared UMEM. Note that since the ring structures are single-consumer / -single-producer (for performance reasons), the new process has to -create its own socket with associated RX and TX rings, since it cannot -share this with the other process. This is also the reason that there -is only one set of FILL and COMPLETION rings per UMEM. It is the -responsibility of a single process to handle the UMEM. +then receive frame addr references in its own RX ring that point to +this shared UMEM. Note that since the ring structures are +single-consumer / single-producer (for performance reasons), the new +process has to create its own socket with associated RX and TX rings, +since it cannot share this with the other process. This is also the +reason that there is only one set of FILL and COMPLETION rings per +UMEM. It is the responsibility of a single process to handle the UMEM. How is then packets distributed from an XDP program to the XSKs? There is a BPF map called XSKMAP (or BPF_MAP_TYPE_XSKMAP in full). The @@ -102,10 +102,10 @@ UMEM UMEM is a region of virtual contiguous memory, divided into equal-sized frames. An UMEM is associated to a netdev and a specific -queue id of that netdev. It is created and configured (frame size, -frame headroom, start address and size) by using the XDP_UMEM_REG -setsockopt system call. A UMEM is bound to a netdev and queue id, via -the bind() system call. +queue id of that netdev. It is created and configured (chunk size, +headroom, start address and size) by using the XDP_UMEM_REG setsockopt +system call. A UMEM is bound to a netdev and queue id, via the bind() +system call. An AF_XDP is socket linked to a single UMEM, but one UMEM can have multiple AF_XDP sockets. To share an UMEM created via one socket A, @@ -147,13 +147,17 @@ UMEM Fill Ring ~~~~~~~~~~~~~~ The Fill ring is used to transfer ownership of UMEM frames from -user-space to kernel-space. The UMEM indicies are passed in the -ring. As an example, if the UMEM is 64k and each frame is 4k, then the -UMEM has 16 frames and can pass indicies between 0 and 15. +user-space to kernel-space. The UMEM addrs are passed in the ring. As +an example, if the UMEM is 64k and each chunk is 4k, then the UMEM has +16 chunks and can pass addrs between 0 and 64k. Frames passed to the kernel are used for the ingress path (RX rings). -The user application produces UMEM indicies to this ring. +The user application produces UMEM addrs to this ring. Note that the +kernel will mask the incoming addr. E.g. for a chunk size of 2k, the +log2(2048) LSB of the addr will be masked off, meaning that 2048, 2050 +and 3000 refers to the same chunk. + UMEM Completetion Ring ~~~~~~~~~~~~~~~~~~~~~~ @@ -165,16 +169,15 @@ used. Frames passed from the kernel to user-space are frames that has been sent (TX ring) and can be used by user-space again. -The user application consumes UMEM indicies from this ring. +The user application consumes UMEM addrs from this ring. RX Ring ~~~~~~~ The RX ring is the receiving side of a socket. Each entry in the ring -is a struct xdp_desc descriptor. The descriptor contains UMEM index -(idx), the length of the data (len), the offset into the frame -(offset). +is a struct xdp_desc descriptor. The descriptor contains UMEM offset +(addr) and the length of the data (len). If no frames have been passed to kernel via the Fill ring, no descriptors will (or can) appear on the RX ring. @@ -221,38 +224,50 @@ side is xdpsock_user.c and the XDP side xdpsock_kern.c. Naive ring dequeue and enqueue could look like this:: + // struct xdp_rxtx_ring { + // __u32 *producer; + // __u32 *consumer; + // struct xdp_desc *desc; + // }; + + // struct xdp_umem_ring { + // __u32 *producer; + // __u32 *consumer; + // __u64 *desc; + // }; + // typedef struct xdp_rxtx_ring RING; // typedef struct xdp_umem_ring RING; // typedef struct xdp_desc RING_TYPE; - // typedef __u32 RING_TYPE; + // typedef __u64 RING_TYPE; int dequeue_one(RING *ring, RING_TYPE *item) { - __u32 entries = ring->ptrs.producer - ring->ptrs.consumer; + __u32 entries = *ring->producer - *ring->consumer; if (entries == 0) return -1; // read-barrier! - *item = ring->desc[ring->ptrs.consumer & (RING_SIZE - 1)]; - ring->ptrs.consumer++; + *item = ring->desc[*ring->consumer & (RING_SIZE - 1)]; + (*ring->consumer)++; return 0; } int enqueue_one(RING *ring, const RING_TYPE *item) { - u32 free_entries = RING_SIZE - (ring->ptrs.producer - ring->ptrs.consumer); + u32 free_entries = RING_SIZE - (*ring->producer - *ring->consumer); if (free_entries == 0) return -1; - ring->desc[ring->ptrs.producer & (RING_SIZE - 1)] = *item; + ring->desc[*ring->producer & (RING_SIZE - 1)] = *item; // write-barrier! - ring->ptrs.producer++; + (*ring->producer)++; return 0; } diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index 4737cfe222f5..e411d6f9ac65 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -48,8 +48,8 @@ struct xdp_mmap_offsets { struct xdp_umem_reg { __u64 addr; /* Start of packet data area */ __u64 len; /* Length of packet data area */ - __u32 frame_size; /* Frame size */ - __u32 frame_headroom; /* Frame head room */ + __u32 chunk_size; + __u32 headroom; }; struct xdp_statistics { @@ -66,13 +66,11 @@ struct xdp_statistics { /* Rx/Tx descriptor */ struct xdp_desc { - __u32 idx; + __u64 addr; __u32 len; - __u16 offset; - __u8 flags; - __u8 padding[5]; + __u32 options; }; -/* UMEM descriptor is __u32 */ +/* UMEM descriptor is __u64 */ #endif /* _LINUX_IF_XDP_H */ diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 87998818116f..9ad791ff4739 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -14,7 +14,7 @@ #include "xdp_umem.h" -#define XDP_UMEM_MIN_FRAME_SIZE 2048 +#define XDP_UMEM_MIN_CHUNK_SIZE 2048 static void xdp_umem_unpin_pages(struct xdp_umem *umem) { @@ -151,12 +151,12 @@ static int xdp_umem_account_pages(struct xdp_umem *umem) static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) { - u32 frame_size = mr->frame_size, frame_headroom = mr->frame_headroom; + u32 chunk_size = mr->chunk_size, headroom = mr->headroom; + unsigned int chunks, chunks_per_page; u64 addr = mr->addr, size = mr->len; - unsigned int nframes, nfpp; int size_chk, err; - if (frame_size < XDP_UMEM_MIN_FRAME_SIZE || frame_size > PAGE_SIZE) { + if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > PAGE_SIZE) { /* Strictly speaking we could support this, if: * - huge pages, or* * - using an IOMMU, or @@ -166,7 +166,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) return -EINVAL; } - if (!is_power_of_2(frame_size)) + if (!is_power_of_2(chunk_size)) return -EINVAL; if (!PAGE_ALIGNED(addr)) { @@ -179,33 +179,30 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) if ((addr + size) < addr) return -EINVAL; - nframes = (unsigned int)div_u64(size, frame_size); - if (nframes == 0 || nframes > UINT_MAX) + chunks = (unsigned int)div_u64(size, chunk_size); + if (chunks == 0) return -EINVAL; - nfpp = PAGE_SIZE / frame_size; - if (nframes < nfpp || nframes % nfpp) + chunks_per_page = PAGE_SIZE / chunk_size; + if (chunks < chunks_per_page || chunks % chunks_per_page) return -EINVAL; - frame_headroom = ALIGN(frame_headroom, 64); + headroom = ALIGN(headroom, 64); - size_chk = frame_size - frame_headroom - XDP_PACKET_HEADROOM; + size_chk = chunk_size - headroom - XDP_PACKET_HEADROOM; if (size_chk < 0) return -EINVAL; umem->pid = get_task_pid(current, PIDTYPE_PID); - umem->size = (size_t)size; umem->address = (unsigned long)addr; - umem->props.frame_size = frame_size; - umem->props.nframes = nframes; - umem->frame_headroom = frame_headroom; + umem->props.chunk_mask = ~((u64)chunk_size - 1); + umem->props.size = size; + umem->headroom = headroom; + umem->chunk_size_nohr = chunk_size - headroom; umem->npgs = size / PAGE_SIZE; umem->pgs = NULL; umem->user = NULL; - umem->frame_size_log2 = ilog2(frame_size); - umem->nfpp_mask = nfpp - 1; - umem->nfpplog2 = ilog2(nfpp); refcount_set(&umem->users, 1); err = xdp_umem_account_pages(umem); diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h index 0881cf456230..aeadd1bcb72d 100644 --- a/net/xdp/xdp_umem.h +++ b/net/xdp/xdp_umem.h @@ -18,35 +18,20 @@ struct xdp_umem { struct xsk_queue *cq; struct page **pgs; struct xdp_umem_props props; - u32 npgs; - u32 frame_headroom; - u32 nfpp_mask; - u32 nfpplog2; - u32 frame_size_log2; + u32 headroom; + u32 chunk_size_nohr; struct user_struct *user; struct pid *pid; unsigned long address; - size_t size; refcount_t users; struct work_struct work; + u32 npgs; }; -static inline char *xdp_umem_get_data(struct xdp_umem *umem, u32 idx) -{ - u64 pg, off; - char *data; - - pg = idx >> umem->nfpplog2; - off = (idx & umem->nfpp_mask) << umem->frame_size_log2; - - data = page_address(umem->pgs[pg]); - return data + off; -} - -static inline char *xdp_umem_get_data_with_headroom(struct xdp_umem *umem, - u32 idx) +static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr) { - return xdp_umem_get_data(umem, idx) + umem->frame_headroom; + return page_address(umem->pgs[addr >> PAGE_SHIFT]) + + (addr & (PAGE_SIZE - 1)); } bool xdp_umem_validate_queues(struct xdp_umem *umem); diff --git a/net/xdp/xdp_umem_props.h b/net/xdp/xdp_umem_props.h index 2cf8ec485fd2..40eab10dfc49 100644 --- a/net/xdp/xdp_umem_props.h +++ b/net/xdp/xdp_umem_props.h @@ -7,8 +7,8 @@ #define XDP_UMEM_PROPS_H_ struct xdp_umem_props { - u32 frame_size; - u32 nframes; + u64 chunk_mask; + u64 size; }; #endif /* XDP_UMEM_PROPS_H_ */ diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 966307ce4b8e..4688c750df1d 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -41,24 +41,27 @@ bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { - u32 id, len = xdp->data_end - xdp->data; + u32 len = xdp->data_end - xdp->data; void *buffer; + u64 addr; int err; if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) return -EINVAL; - if (!xskq_peek_id(xs->umem->fq, &id)) { + if (!xskq_peek_addr(xs->umem->fq, &addr) || + len > xs->umem->chunk_size_nohr) { xs->rx_dropped++; return -ENOSPC; } - buffer = xdp_umem_get_data_with_headroom(xs->umem, id); + addr += xs->umem->headroom; + + buffer = xdp_umem_get_data(xs->umem, addr); memcpy(buffer, xdp->data, len); - err = xskq_produce_batch_desc(xs->rx, id, len, - xs->umem->frame_headroom); + err = xskq_produce_batch_desc(xs->rx, addr, len); if (!err) - xskq_discard_id(xs->umem->fq); + xskq_discard_addr(xs->umem->fq); else xs->rx_dropped++; @@ -95,10 +98,10 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) static void xsk_destruct_skb(struct sk_buff *skb) { - u32 id = (u32)(long)skb_shinfo(skb)->destructor_arg; + u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg; struct xdp_sock *xs = xdp_sk(skb->sk); - WARN_ON_ONCE(xskq_produce_id(xs->umem->cq, id)); + WARN_ON_ONCE(xskq_produce_addr(xs->umem->cq, addr)); sock_wfree(skb); } @@ -123,14 +126,15 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, while (xskq_peek_desc(xs->tx, &desc)) { char *buffer; - u32 id, len; + u64 addr; + u32 len; if (max_batch-- == 0) { err = -EAGAIN; goto out; } - if (xskq_reserve_id(xs->umem->cq)) { + if (xskq_reserve_addr(xs->umem->cq)) { err = -EAGAIN; goto out; } @@ -153,8 +157,8 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, } skb_put(skb, len); - id = desc.idx; - buffer = xdp_umem_get_data(xs->umem, id) + desc.offset; + addr = desc.addr; + buffer = xdp_umem_get_data(xs->umem, addr); err = skb_store_bits(skb, 0, buffer, len); if (unlikely(err)) { kfree_skb(skb); @@ -164,7 +168,7 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, skb->dev = xs->dev; skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; - skb_shinfo(skb)->destructor_arg = (void *)(long)id; + skb_shinfo(skb)->destructor_arg = (void *)(long)addr; skb->destructor = xsk_destruct_skb; err = dev_direct_xmit(skb, xs->queue_id); diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c index ebe85e59507e..6c32e92e98fc 100644 --- a/net/xdp/xsk_queue.c +++ b/net/xdp/xsk_queue.c @@ -17,7 +17,7 @@ void xskq_set_umem(struct xsk_queue *q, struct xdp_umem_props *umem_props) static u32 xskq_umem_get_ring_size(struct xsk_queue *q) { - return sizeof(struct xdp_umem_ring) + q->nentries * sizeof(u32); + return sizeof(struct xdp_umem_ring) + q->nentries * sizeof(u64); } static u32 xskq_rxtx_get_ring_size(struct xsk_queue *q) diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index b5924e7aeb2b..337e5ad3b10e 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -27,7 +27,7 @@ struct xdp_rxtx_ring { /* Used for the fill and completion queues for buffers */ struct xdp_umem_ring { struct xdp_ring ptrs; - u32 desc[0] ____cacheline_aligned_in_smp; + u64 desc[0] ____cacheline_aligned_in_smp; }; struct xsk_queue { @@ -76,24 +76,25 @@ static inline u32 xskq_nb_free(struct xsk_queue *q, u32 producer, u32 dcnt) /* UMEM queue */ -static inline bool xskq_is_valid_id(struct xsk_queue *q, u32 idx) +static inline bool xskq_is_valid_addr(struct xsk_queue *q, u64 addr) { - if (unlikely(idx >= q->umem_props.nframes)) { + if (addr >= q->umem_props.size) { q->invalid_descs++; return false; } + return true; } -static inline u32 *xskq_validate_id(struct xsk_queue *q, u32 *id) +static inline u64 *xskq_validate_addr(struct xsk_queue *q, u64 *addr) { while (q->cons_tail != q->cons_head) { struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; unsigned int idx = q->cons_tail & q->ring_mask; - *id = READ_ONCE(ring->desc[idx]); - if (xskq_is_valid_id(q, *id)) - return id; + *addr = READ_ONCE(ring->desc[idx]) & q->umem_props.chunk_mask; + if (xskq_is_valid_addr(q, *addr)) + return addr; q->cons_tail++; } @@ -101,7 +102,7 @@ static inline u32 *xskq_validate_id(struct xsk_queue *q, u32 *id) return NULL; } -static inline u32 *xskq_peek_id(struct xsk_queue *q, u32 *id) +static inline u64 *xskq_peek_addr(struct xsk_queue *q, u64 *addr) { if (q->cons_tail == q->cons_head) { WRITE_ONCE(q->ring->consumer, q->cons_tail); @@ -111,19 +112,19 @@ static inline u32 *xskq_peek_id(struct xsk_queue *q, u32 *id) smp_rmb(); } - return xskq_validate_id(q, id); + return xskq_validate_addr(q, addr); } -static inline void xskq_discard_id(struct xsk_queue *q) +static inline void xskq_discard_addr(struct xsk_queue *q) { q->cons_tail++; } -static inline int xskq_produce_id(struct xsk_queue *q, u32 id) +static inline int xskq_produce_addr(struct xsk_queue *q, u64 addr) { struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; - ring->desc[q->prod_tail++ & q->ring_mask] = id; + ring->desc[q->prod_tail++ & q->ring_mask] = addr; /* Order producer and data */ smp_wmb(); @@ -132,7 +133,7 @@ static inline int xskq_produce_id(struct xsk_queue *q, u32 id) return 0; } -static inline int xskq_reserve_id(struct xsk_queue *q) +static inline int xskq_reserve_addr(struct xsk_queue *q) { if (xskq_nb_free(q, q->prod_head, 1) == 0) return -ENOSPC; @@ -145,16 +146,11 @@ static inline int xskq_reserve_id(struct xsk_queue *q) static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d) { - u32 buff_len; - - if (unlikely(d->idx >= q->umem_props.nframes)) { - q->invalid_descs++; + if (!xskq_is_valid_addr(q, d->addr)) return false; - } - buff_len = q->umem_props.frame_size; - if (unlikely(d->len > buff_len || d->len == 0 || - d->offset > buff_len || d->offset + d->len > buff_len)) { + if (((d->addr + d->len) & q->umem_props.chunk_mask) != + (d->addr & q->umem_props.chunk_mask)) { q->invalid_descs++; return false; } @@ -199,7 +195,7 @@ static inline void xskq_discard_desc(struct xsk_queue *q) } static inline int xskq_produce_batch_desc(struct xsk_queue *q, - u32 id, u32 len, u16 offset) + u64 addr, u32 len) { struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; unsigned int idx; @@ -208,9 +204,8 @@ static inline int xskq_produce_batch_desc(struct xsk_queue *q, return -ENOSPC; idx = (q->prod_head++) & q->ring_mask; - ring->desc[idx].idx = id; + ring->desc[idx].addr = addr; ring->desc[idx].len = len; - ring->desc[idx].offset = offset; return 0; } -- cgit v1.2.3 From 173d3adb6f437037f216270955886ca9878187a5 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Mon, 4 Jun 2018 14:05:55 +0200 Subject: xsk: add zero-copy support for Rx MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend the xsk_rcv to support the new MEM_TYPE_ZERO_COPY memory, and wireup ndo_bpf call in bind. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- include/net/xdp_sock.h | 6 +++ include/uapi/linux/if_xdp.h | 4 +- net/xdp/xdp_umem.c | 77 ++++++++++++++++++++++++++++++++++++ net/xdp/xdp_umem.h | 3 ++ net/xdp/xsk.c | 96 +++++++++++++++++++++++++++++++++++---------- 5 files changed, 165 insertions(+), 21 deletions(-) (limited to 'include/uapi') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index caf343a7e224..d93d3aac3fc9 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -22,6 +22,7 @@ struct xdp_umem_props { struct xdp_umem_page { void *addr; + dma_addr_t dma; }; struct xdp_umem { @@ -38,6 +39,9 @@ struct xdp_umem { struct work_struct work; struct page **pgs; u32 npgs; + struct net_device *dev; + u16 queue_id; + bool zc; }; struct xdp_sock { @@ -60,6 +64,8 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); void xsk_flush(struct xdp_sock *xs); bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs); +u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr); +void xsk_umem_discard_addr(struct xdp_umem *umem); #else static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index e411d6f9ac65..1fa0e977ea8d 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -13,7 +13,9 @@ #include /* Options for the sxdp_flags field */ -#define XDP_SHARED_UMEM 1 +#define XDP_SHARED_UMEM (1 << 0) +#define XDP_COPY (1 << 1) /* Force copy-mode */ +#define XDP_ZEROCOPY (1 << 2) /* Force zero-copy mode */ struct sockaddr_xdp { __u16 sxdp_family; diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index aca826011f6c..f729d79b8d91 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -17,6 +17,81 @@ #define XDP_UMEM_MIN_CHUNK_SIZE 2048 +int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, + u32 queue_id, u16 flags) +{ + bool force_zc, force_copy; + struct netdev_bpf bpf; + int err; + + force_zc = flags & XDP_ZEROCOPY; + force_copy = flags & XDP_COPY; + + if (force_zc && force_copy) + return -EINVAL; + + if (force_copy) + return 0; + + dev_hold(dev); + + if (dev->netdev_ops->ndo_bpf) { + bpf.command = XDP_QUERY_XSK_UMEM; + + rtnl_lock(); + err = dev->netdev_ops->ndo_bpf(dev, &bpf); + rtnl_unlock(); + + if (err) { + dev_put(dev); + return force_zc ? -ENOTSUPP : 0; + } + + bpf.command = XDP_SETUP_XSK_UMEM; + bpf.xsk.umem = umem; + bpf.xsk.queue_id = queue_id; + + rtnl_lock(); + err = dev->netdev_ops->ndo_bpf(dev, &bpf); + rtnl_unlock(); + + if (err) { + dev_put(dev); + return force_zc ? err : 0; /* fail or fallback */ + } + + umem->dev = dev; + umem->queue_id = queue_id; + umem->zc = true; + return 0; + } + + dev_put(dev); + return force_zc ? -ENOTSUPP : 0; /* fail or fallback */ +} + +void xdp_umem_clear_dev(struct xdp_umem *umem) +{ + struct netdev_bpf bpf; + int err; + + if (umem->dev) { + bpf.command = XDP_SETUP_XSK_UMEM; + bpf.xsk.umem = NULL; + bpf.xsk.queue_id = umem->queue_id; + + rtnl_lock(); + err = umem->dev->netdev_ops->ndo_bpf(umem->dev, &bpf); + rtnl_unlock(); + + if (err) + WARN(1, "failed to disable umem!\n"); + + dev_put(umem->dev); + umem->dev = NULL; + } +} + static void xdp_umem_unpin_pages(struct xdp_umem *umem) { unsigned int i; @@ -43,6 +118,8 @@ static void xdp_umem_release(struct xdp_umem *umem) struct task_struct *task; struct mm_struct *mm; + xdp_umem_clear_dev(umem); + if (umem->fq) { xskq_destroy(umem->fq); umem->fq = NULL; diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h index 40e8fa4a92af..674508a32a4d 100644 --- a/net/xdp/xdp_umem.h +++ b/net/xdp/xdp_umem.h @@ -13,6 +13,9 @@ static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr) return umem->pages[addr >> PAGE_SHIFT].addr + (addr & (PAGE_SIZE - 1)); } +int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, + u32 queue_id, u16 flags); +void xdp_umem_clear_dev(struct xdp_umem *umem); bool xdp_umem_validate_queues(struct xdp_umem *umem); void xdp_get_umem(struct xdp_umem *umem); void xdp_put_umem(struct xdp_umem *umem); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 4688c750df1d..ab64bd8260ea 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -36,19 +36,28 @@ static struct xdp_sock *xdp_sk(struct sock *sk) bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) { - return !!xs->rx; + return READ_ONCE(xs->rx) && READ_ONCE(xs->umem) && + READ_ONCE(xs->umem->fq); } -static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr) +{ + return xskq_peek_addr(umem->fq, addr); +} +EXPORT_SYMBOL(xsk_umem_peek_addr); + +void xsk_umem_discard_addr(struct xdp_umem *umem) +{ + xskq_discard_addr(umem->fq); +} +EXPORT_SYMBOL(xsk_umem_discard_addr); + +static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { - u32 len = xdp->data_end - xdp->data; void *buffer; u64 addr; int err; - if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) - return -EINVAL; - if (!xskq_peek_addr(xs->umem->fq, &addr) || len > xs->umem->chunk_size_nohr) { xs->rx_dropped++; @@ -60,25 +69,41 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) buffer = xdp_umem_get_data(xs->umem, addr); memcpy(buffer, xdp->data, len); err = xskq_produce_batch_desc(xs->rx, addr, len); - if (!err) + if (!err) { xskq_discard_addr(xs->umem->fq); - else - xs->rx_dropped++; + xdp_return_buff(xdp); + return 0; + } + xs->rx_dropped++; return err; } -int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { - int err; + int err = xskq_produce_batch_desc(xs->rx, (u64)xdp->handle, len); - err = __xsk_rcv(xs, xdp); - if (likely(!err)) + if (err) { xdp_return_buff(xdp); + xs->rx_dropped++; + } return err; } +int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +{ + u32 len; + + if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) + return -EINVAL; + + len = xdp->data_end - xdp->data; + + return (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) ? + __xsk_rcv_zc(xs, xdp, len) : __xsk_rcv(xs, xdp, len); +} + void xsk_flush(struct xdp_sock *xs) { xskq_produce_flush_desc(xs->rx); @@ -87,12 +112,29 @@ void xsk_flush(struct xdp_sock *xs) int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { + u32 len = xdp->data_end - xdp->data; + void *buffer; + u64 addr; int err; - err = __xsk_rcv(xs, xdp); - if (!err) + if (!xskq_peek_addr(xs->umem->fq, &addr) || + len > xs->umem->chunk_size_nohr) { + xs->rx_dropped++; + return -ENOSPC; + } + + addr += xs->umem->headroom; + + buffer = xdp_umem_get_data(xs->umem, addr); + memcpy(buffer, xdp->data, len); + err = xskq_produce_batch_desc(xs->rx, addr, len); + if (!err) { + xskq_discard_addr(xs->umem->fq); xsk_flush(xs); + return 0; + } + xs->rx_dropped++; return err; } @@ -291,6 +333,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); struct net_device *dev; + u32 flags, qid; int err = 0; if (addr_len < sizeof(struct sockaddr_xdp)) @@ -315,16 +358,26 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) goto out_unlock; } - if ((xs->rx && sxdp->sxdp_queue_id >= dev->real_num_rx_queues) || - (xs->tx && sxdp->sxdp_queue_id >= dev->real_num_tx_queues)) { + qid = sxdp->sxdp_queue_id; + + if ((xs->rx && qid >= dev->real_num_rx_queues) || + (xs->tx && qid >= dev->real_num_tx_queues)) { err = -EINVAL; goto out_unlock; } - if (sxdp->sxdp_flags & XDP_SHARED_UMEM) { + flags = sxdp->sxdp_flags; + + if (flags & XDP_SHARED_UMEM) { struct xdp_sock *umem_xs; struct socket *sock; + if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY)) { + /* Cannot specify flags for shared sockets. */ + err = -EINVAL; + goto out_unlock; + } + if (xs->umem) { /* We have already our own. */ err = -EINVAL; @@ -343,8 +396,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) err = -EBADF; sockfd_put(sock); goto out_unlock; - } else if (umem_xs->dev != dev || - umem_xs->queue_id != sxdp->sxdp_queue_id) { + } else if (umem_xs->dev != dev || umem_xs->queue_id != qid) { err = -EINVAL; sockfd_put(sock); goto out_unlock; @@ -360,6 +412,10 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) /* This xsk has its own umem. */ xskq_set_umem(xs->umem->fq, &xs->umem->props); xskq_set_umem(xs->umem->cq, &xs->umem->props); + + err = xdp_umem_assign_dev(xs->umem, dev, qid, flags); + if (err) + goto out_unlock; } xs->dev = dev; -- cgit v1.2.3