diff options
author | Jakub Kicinski <kuba@kernel.org> | 2020-12-04 07:48:11 -0800 |
---|---|---|
committer | Jakub Kicinski <kuba@kernel.org> | 2020-12-04 07:48:12 -0800 |
commit | a1dd1d86973182458da7798a95f26cfcbea599b4 (patch) | |
tree | 1adda22ea30ccfac7651a7eed7b7c90356f8243a /net/xdp | |
parent | 55fd59b003f6e8fd88cf16590e79823d7ccf3026 (diff) | |
parent | eceae70bdeaeb6b8ceb662983cf663ff352fbc96 (diff) | |
download | lwn-a1dd1d86973182458da7798a95f26cfcbea599b4.tar.gz lwn-a1dd1d86973182458da7798a95f26cfcbea599b4.zip |
Merge https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
Alexei Starovoitov says:
====================
pull-request: bpf-next 2020-12-03
The main changes are:
1) Support BTF in kernel modules, from Andrii.
2) Introduce preferred busy-polling, from Björn.
3) bpf_ima_inode_hash() and bpf_bprm_opts_set() helpers, from KP Singh.
4) Memcg-based memory accounting for bpf objects, from Roman.
5) Allow bpf_{s,g}etsockopt from cgroup bind{4,6} hooks, from Stanislav.
* https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next: (118 commits)
selftests/bpf: Fix invalid use of strncat in test_sockmap
libbpf: Use memcpy instead of strncpy to please GCC
selftests/bpf: Add fentry/fexit/fmod_ret selftest for kernel module
selftests/bpf: Add tp_btf CO-RE reloc test for modules
libbpf: Support attachment of BPF tracing programs to kernel modules
libbpf: Factor out low-level BPF program loading helper
bpf: Allow to specify kernel module BTFs when attaching BPF programs
bpf: Remove hard-coded btf_vmlinux assumption from BPF verifier
selftests/bpf: Add CO-RE relocs selftest relying on kernel module BTF
selftests/bpf: Add support for marking sub-tests as skipped
selftests/bpf: Add bpf_testmod kernel module for testing
libbpf: Add kernel module BTF support for CO-RE relocations
libbpf: Refactor CO-RE relocs to not assume a single BTF object
libbpf: Add internal helper to load BTF data by FD
bpf: Keep module's btf_data_size intact after load
bpf: Fix bpf_put_raw_tracepoint()'s use of __module_address()
selftests/bpf: Add Userspace tests for TCP_WINDOW_CLAMP
bpf: Adds support for setting window clamp
samples/bpf: Fix spelling mistake "recieving" -> "receiving"
bpf: Fix cold build of test_progs-no_alu32
...
====================
Link: https://lore.kernel.org/r/20201204021936.85653-1-alexei.starovoitov@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Diffstat (limited to 'net/xdp')
-rw-r--r-- | net/xdp/xsk.c | 114 | ||||
-rw-r--r-- | net/xdp/xsk.h | 2 | ||||
-rw-r--r-- | net/xdp/xsk_buff_pool.c | 13 | ||||
-rw-r--r-- | net/xdp/xsk_queue.h | 93 | ||||
-rw-r--r-- | net/xdp/xskmap.c | 35 |
5 files changed, 201 insertions, 56 deletions
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index b7b039bd9d03..56c46e5f57bc 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -23,6 +23,7 @@ #include <linux/netdevice.h> #include <linux/rculist.h> #include <net/xdp_sock_drv.h> +#include <net/busy_poll.h> #include <net/xdp.h> #include "xsk_queue.h" @@ -232,6 +233,7 @@ static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) return -EINVAL; + sk_mark_napi_id_once_xdp(&xs->sk, xdp); len = xdp->data_end - xdp->data; return xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL ? @@ -332,6 +334,63 @@ out: } EXPORT_SYMBOL(xsk_tx_peek_desc); +static u32 xsk_tx_peek_release_fallback(struct xsk_buff_pool *pool, struct xdp_desc *descs, + u32 max_entries) +{ + u32 nb_pkts = 0; + + while (nb_pkts < max_entries && xsk_tx_peek_desc(pool, &descs[nb_pkts])) + nb_pkts++; + + xsk_tx_release(pool); + return nb_pkts; +} + +u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, struct xdp_desc *descs, + u32 max_entries) +{ + struct xdp_sock *xs; + u32 nb_pkts; + + rcu_read_lock(); + if (!list_is_singular(&pool->xsk_tx_list)) { + /* Fallback to the non-batched version */ + rcu_read_unlock(); + return xsk_tx_peek_release_fallback(pool, descs, max_entries); + } + + xs = list_first_or_null_rcu(&pool->xsk_tx_list, struct xdp_sock, tx_list); + if (!xs) { + nb_pkts = 0; + goto out; + } + + nb_pkts = xskq_cons_peek_desc_batch(xs->tx, descs, pool, max_entries); + if (!nb_pkts) { + xs->tx->queue_empty_descs++; + goto out; + } + + /* This is the backpressure mechanism for the Tx path. Try to + * reserve space in the completion queue for all packets, but + * if there are fewer slots available, just process that many + * packets. This avoids having to implement any buffering in + * the Tx path. + */ + nb_pkts = xskq_prod_reserve_addr_batch(pool->cq, descs, nb_pkts); + if (!nb_pkts) + goto out; + + xskq_cons_release_n(xs->tx, nb_pkts); + __xskq_cons_release(xs->tx); + xs->sk.sk_write_space(&xs->sk); + +out: + rcu_read_unlock(); + return nb_pkts; +} +EXPORT_SYMBOL(xsk_tx_peek_release_desc_batch); + static int xsk_wakeup(struct xdp_sock *xs, u8 flags) { struct net_device *dev = xs->dev; @@ -454,18 +513,65 @@ static int __xsk_sendmsg(struct sock *sk) return xs->zc ? xsk_zc_xmit(xs) : xsk_generic_xmit(sk); } +static bool xsk_no_wakeup(struct sock *sk) +{ +#ifdef CONFIG_NET_RX_BUSY_POLL + /* Prefer busy-polling, skip the wakeup. */ + return READ_ONCE(sk->sk_prefer_busy_poll) && READ_ONCE(sk->sk_ll_usec) && + READ_ONCE(sk->sk_napi_id) >= MIN_NAPI_ID; +#else + return false; +#endif +} + static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) { bool need_wait = !(m->msg_flags & MSG_DONTWAIT); struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); + struct xsk_buff_pool *pool; if (unlikely(!xsk_is_bound(xs))) return -ENXIO; if (unlikely(need_wait)) return -EOPNOTSUPP; - return __xsk_sendmsg(sk); + if (sk_can_busy_loop(sk)) + sk_busy_loop(sk, 1); /* only support non-blocking sockets */ + + if (xsk_no_wakeup(sk)) + return 0; + + pool = xs->pool; + if (pool->cached_need_wakeup & XDP_WAKEUP_TX) + return __xsk_sendmsg(sk); + return 0; +} + +static int xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags) +{ + bool need_wait = !(flags & MSG_DONTWAIT); + struct sock *sk = sock->sk; + struct xdp_sock *xs = xdp_sk(sk); + + if (unlikely(!(xs->dev->flags & IFF_UP))) + return -ENETDOWN; + if (unlikely(!xs->rx)) + return -ENOBUFS; + if (unlikely(!xsk_is_bound(xs))) + return -ENXIO; + if (unlikely(need_wait)) + return -EOPNOTSUPP; + + if (sk_can_busy_loop(sk)) + sk_busy_loop(sk, 1); /* only support non-blocking sockets */ + + if (xsk_no_wakeup(sk)) + return 0; + + if (xs->pool->cached_need_wakeup & XDP_WAKEUP_RX && xs->zc) + return xsk_wakeup(xs, XDP_WAKEUP_RX); + return 0; } static __poll_t xsk_poll(struct file *file, struct socket *sock, @@ -542,7 +648,7 @@ static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs, node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node, node); if (node) { - WARN_ON(xsk_map_inc(node->map)); + bpf_map_inc(&node->map->map); map = node->map; *map_entry = node->map_entry; } @@ -572,7 +678,7 @@ static void xsk_delete_from_maps(struct xdp_sock *xs) while ((map = xsk_get_map_list_entry(xs, &map_entry))) { xsk_map_try_sock_delete(map, xs, map_entry); - xsk_map_put(map); + bpf_map_put(&map->map); } } @@ -1128,7 +1234,7 @@ static const struct proto_ops xsk_proto_ops = { .setsockopt = xsk_setsockopt, .getsockopt = xsk_getsockopt, .sendmsg = xsk_sendmsg, - .recvmsg = sock_no_recvmsg, + .recvmsg = xsk_recvmsg, .mmap = xsk_mmap, .sendpage = sock_no_sendpage, }; diff --git a/net/xdp/xsk.h b/net/xdp/xsk.h index b9e896cee5bb..edcf249ad1f1 100644 --- a/net/xdp/xsk.h +++ b/net/xdp/xsk.h @@ -41,8 +41,6 @@ static inline struct xdp_sock *xdp_sk(struct sock *sk) void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, struct xdp_sock **map_entry); -int xsk_map_inc(struct xsk_map *map); -void xsk_map_put(struct xsk_map *map); void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id); int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool, u16 queue_id); diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 9287eddec52c..556d82d03687 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -144,14 +144,13 @@ static int __xp_assign_dev(struct xsk_buff_pool *pool, if (err) return err; - if (flags & XDP_USE_NEED_WAKEUP) { + if (flags & XDP_USE_NEED_WAKEUP) pool->uses_need_wakeup = true; - /* Tx needs to be explicitly woken up the first time. - * Also for supporting drivers that do not implement this - * feature. They will always have to call sendto(). - */ - pool->cached_need_wakeup = XDP_WAKEUP_TX; - } + /* Tx needs to be explicitly woken up the first time. Also + * for supporting drivers that do not implement this + * feature. They will always have to call sendto() or poll(). + */ + pool->cached_need_wakeup = XDP_WAKEUP_TX; dev_hold(netdev); diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index cdb9cf3cd136..b936c46b1e16 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -18,9 +18,11 @@ struct xdp_ring { /* Hinder the adjacent cache prefetcher to prefetch the consumer * pointer if the producer pointer is touched and vice versa. */ - u32 pad ____cacheline_aligned_in_smp; + u32 pad1 ____cacheline_aligned_in_smp; u32 consumer ____cacheline_aligned_in_smp; + u32 pad2 ____cacheline_aligned_in_smp; u32 flags; + u32 pad3 ____cacheline_aligned_in_smp; }; /* Used for the RX and TX queues for packets */ @@ -197,6 +199,30 @@ static inline bool xskq_cons_read_desc(struct xsk_queue *q, return false; } +static inline u32 xskq_cons_read_desc_batch(struct xsk_queue *q, + struct xdp_desc *descs, + struct xsk_buff_pool *pool, u32 max) +{ + u32 cached_cons = q->cached_cons, nb_entries = 0; + + while (cached_cons != q->cached_prod && nb_entries < max) { + struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; + u32 idx = cached_cons & q->ring_mask; + + descs[nb_entries] = ring->desc[idx]; + if (unlikely(!xskq_cons_is_valid_desc(q, &descs[nb_entries], pool))) { + /* Skip the entry */ + cached_cons++; + continue; + } + + nb_entries++; + cached_cons++; + } + + return nb_entries; +} + /* Functions for consumers */ static inline void __xskq_cons_release(struct xsk_queue *q) @@ -218,17 +244,22 @@ static inline void xskq_cons_get_entries(struct xsk_queue *q) __xskq_cons_peek(q); } -static inline bool xskq_cons_has_entries(struct xsk_queue *q, u32 cnt) +static inline u32 xskq_cons_nb_entries(struct xsk_queue *q, u32 max) { u32 entries = q->cached_prod - q->cached_cons; - if (entries >= cnt) - return true; + if (entries >= max) + return max; __xskq_cons_peek(q); entries = q->cached_prod - q->cached_cons; - return entries >= cnt; + return entries >= max ? max : entries; +} + +static inline bool xskq_cons_has_entries(struct xsk_queue *q, u32 cnt) +{ + return xskq_cons_nb_entries(q, cnt) >= cnt ? true : false; } static inline bool xskq_cons_peek_addr_unchecked(struct xsk_queue *q, u64 *addr) @@ -247,16 +278,28 @@ static inline bool xskq_cons_peek_desc(struct xsk_queue *q, return xskq_cons_read_desc(q, desc, pool); } +static inline u32 xskq_cons_peek_desc_batch(struct xsk_queue *q, struct xdp_desc *descs, + struct xsk_buff_pool *pool, u32 max) +{ + u32 entries = xskq_cons_nb_entries(q, max); + + return xskq_cons_read_desc_batch(q, descs, pool, entries); +} + +/* To improve performance in the xskq_cons_release functions, only update local state here. + * Reflect this to global state when we get new entries from the ring in + * xskq_cons_get_entries() and whenever Rx or Tx processing are completed in the NAPI loop. + */ static inline void xskq_cons_release(struct xsk_queue *q) { - /* To improve performance, only update local state here. - * Reflect this to global state when we get new entries - * from the ring in xskq_cons_get_entries() and whenever - * Rx or Tx processing are completed in the NAPI loop. - */ q->cached_cons++; } +static inline void xskq_cons_release_n(struct xsk_queue *q, u32 cnt) +{ + q->cached_cons += cnt; +} + static inline bool xskq_cons_is_full(struct xsk_queue *q) { /* No barriers needed since data is not accessed */ @@ -266,18 +309,23 @@ static inline bool xskq_cons_is_full(struct xsk_queue *q) /* Functions for producers */ -static inline bool xskq_prod_is_full(struct xsk_queue *q) +static inline u32 xskq_prod_nb_free(struct xsk_queue *q, u32 max) { u32 free_entries = q->nentries - (q->cached_prod - q->cached_cons); - if (free_entries) - return false; + if (free_entries >= max) + return max; /* Refresh the local tail pointer */ q->cached_cons = READ_ONCE(q->ring->consumer); free_entries = q->nentries - (q->cached_prod - q->cached_cons); - return !free_entries; + return free_entries >= max ? max : free_entries; +} + +static inline bool xskq_prod_is_full(struct xsk_queue *q) +{ + return xskq_prod_nb_free(q, 1) ? false : true; } static inline int xskq_prod_reserve(struct xsk_queue *q) @@ -302,6 +350,23 @@ static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 addr) return 0; } +static inline u32 xskq_prod_reserve_addr_batch(struct xsk_queue *q, struct xdp_desc *descs, + u32 max) +{ + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; + u32 nb_entries, i, cached_prod; + + nb_entries = xskq_prod_nb_free(q, max); + + /* A, matches D */ + cached_prod = q->cached_prod; + for (i = 0; i < nb_entries; i++) + ring->desc[cached_prod++ & q->ring_mask] = descs[i].addr; + q->cached_prod = cached_prod; + + return nb_entries; +} + static inline int xskq_prod_reserve_desc(struct xsk_queue *q, u64 addr, u32 len) { diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c index 49da2b8ace8b..113fd9017203 100644 --- a/net/xdp/xskmap.c +++ b/net/xdp/xskmap.c @@ -11,32 +11,17 @@ #include "xsk.h" -int xsk_map_inc(struct xsk_map *map) -{ - bpf_map_inc(&map->map); - return 0; -} - -void xsk_map_put(struct xsk_map *map) -{ - bpf_map_put(&map->map); -} - static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map, struct xdp_sock **map_entry) { struct xsk_map_node *node; - int err; - node = kzalloc(sizeof(*node), GFP_ATOMIC | __GFP_NOWARN); + node = bpf_map_kzalloc(&map->map, sizeof(*node), + GFP_ATOMIC | __GFP_NOWARN); if (!node) return ERR_PTR(-ENOMEM); - err = xsk_map_inc(map); - if (err) { - kfree(node); - return ERR_PTR(err); - } + bpf_map_inc(&map->map); node->map = map; node->map_entry = map_entry; @@ -45,7 +30,7 @@ static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map, static void xsk_map_node_free(struct xsk_map_node *node) { - xsk_map_put(node->map); + bpf_map_put(&node->map->map); kfree(node); } @@ -73,9 +58,8 @@ static void xsk_map_sock_delete(struct xdp_sock *xs, static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) { - struct bpf_map_memory mem; - int err, numa_node; struct xsk_map *m; + int numa_node; u64 size; if (!capable(CAP_NET_ADMIN)) @@ -89,18 +73,11 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) numa_node = bpf_map_attr_numa_node(attr); size = struct_size(m, xsk_map, attr->max_entries); - err = bpf_map_charge_init(&mem, size); - if (err < 0) - return ERR_PTR(err); - m = bpf_map_area_alloc(size, numa_node); - if (!m) { - bpf_map_charge_finish(&mem); + if (!m) return ERR_PTR(-ENOMEM); - } bpf_map_init_from_attr(&m->map, attr); - bpf_map_charge_move(&m->map.memory, &mem); spin_lock_init(&m->lock); return &m->map; |