diff options
Diffstat (limited to 'net/xdp')
| -rw-r--r-- | net/xdp/xdp_umem.c | 8 | ||||
| -rw-r--r-- | net/xdp/xsk.c | 505 | ||||
| -rw-r--r-- | net/xdp/xsk.h | 7 | ||||
| -rw-r--r-- | net/xdp/xsk_buff_pool.c | 132 | ||||
| -rw-r--r-- | net/xdp/xsk_diag.c | 4 | ||||
| -rw-r--r-- | net/xdp/xsk_queue.c | 2 | ||||
| -rw-r--r-- | net/xdp/xsk_queue.h | 78 | ||||
| -rw-r--r-- | net/xdp/xskmap.c | 4 |
8 files changed, 568 insertions, 172 deletions
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 9f76ca591d54..58da2f4f4397 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -97,7 +97,8 @@ static int xdp_umem_pin_pages(struct xdp_umem *umem, unsigned long address) long npgs; int err; - umem->pgs = kvcalloc(umem->npgs, sizeof(*umem->pgs), GFP_KERNEL | __GFP_NOWARN); + umem->pgs = kvzalloc_objs(*umem->pgs, umem->npgs, + GFP_KERNEL | __GFP_NOWARN); if (!umem->pgs) return -ENOMEM; @@ -202,7 +203,8 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) if (!unaligned_chunks && chunks_rem) return -EINVAL; - if (headroom >= chunk_size - XDP_PACKET_HEADROOM) + if (headroom > chunk_size - XDP_PACKET_HEADROOM - + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) - 128) return -EINVAL; if (mr->flags & XDP_UMEM_TX_METADATA_LEN) { @@ -249,7 +251,7 @@ struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr) struct xdp_umem *umem; int err; - umem = kzalloc(sizeof(*umem), GFP_KERNEL); + umem = kzalloc_obj(*umem); if (!umem) return ERR_PTR(-ENOMEM); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 89d2bef96469..5e5786cd9af5 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -23,17 +23,29 @@ #include <linux/netdevice.h> #include <linux/rculist.h> #include <linux/vmalloc.h> + +#include <net/netdev_queues.h> #include <net/xdp_sock_drv.h> #include <net/busy_poll.h> +#include <net/netdev_lock.h> #include <net/netdev_rx_queue.h> #include <net/xdp.h> +#include "../core/dev.h" + #include "xsk_queue.h" #include "xdp_umem.h" #include "xsk.h" #define TX_BATCH_SIZE 32 -#define MAX_PER_SOCKET_BUDGET (TX_BATCH_SIZE) +#define MAX_PER_SOCKET_BUDGET 32 + +struct xsk_addrs { + u32 num_descs; + u64 addrs[MAX_SKB_FRAGS + 1]; +}; + +static struct kmem_cache *xsk_tx_generic_cache; void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool) { @@ -107,7 +119,7 @@ struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev, } EXPORT_SYMBOL(xsk_get_pool_from_qid); -void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id) +static void __xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id) { if (queue_id < dev->num_rx_queues) dev->_rx[queue_id].pool = NULL; @@ -115,6 +127,36 @@ void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id) dev->_tx[queue_id].pool = NULL; } +void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id) +{ + struct netdev_rx_queue *hw_rxq; + + if (!netif_rxq_is_leased(dev, queue_id)) + return __xsk_clear_pool_at_qid(dev, queue_id); + WARN_ON_ONCE(!netif_is_queue_leasee(dev)); + + hw_rxq = __netif_get_rx_queue(dev, queue_id)->lease; + + netdev_lock(hw_rxq->dev); + queue_id = get_netdev_rx_queue_index(hw_rxq); + __xsk_clear_pool_at_qid(hw_rxq->dev, queue_id); + netdev_unlock(hw_rxq->dev); +} + +static int __xsk_reg_pool_at_qid(struct net_device *dev, + struct xsk_buff_pool *pool, u16 queue_id) +{ + if (xsk_get_pool_from_qid(dev, queue_id)) + return -EBUSY; + + if (queue_id < dev->real_num_rx_queues) + dev->_rx[queue_id].pool = pool; + if (queue_id < dev->real_num_tx_queues) + dev->_tx[queue_id].pool = pool; + + return 0; +} + /* The buffer pool is stored both in the _rx struct and the _tx struct as we do * not know if the device has more tx queues than rx, or the opposite. * This might also change during run time. @@ -122,17 +164,27 @@ void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id) int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool, u16 queue_id) { - if (queue_id >= max_t(unsigned int, - dev->real_num_rx_queues, - dev->real_num_tx_queues)) + struct netdev_rx_queue *hw_rxq; + int ret; + + if (queue_id >= max(dev->real_num_rx_queues, + dev->real_num_tx_queues)) return -EINVAL; - if (queue_id < dev->real_num_rx_queues) - dev->_rx[queue_id].pool = pool; - if (queue_id < dev->real_num_tx_queues) - dev->_tx[queue_id].pool = pool; + if (queue_id >= dev->real_num_rx_queues || + !netif_rxq_is_leased(dev, queue_id)) + return __xsk_reg_pool_at_qid(dev, pool, queue_id); + if (!netif_is_queue_leasee(dev)) + return -EBUSY; - return 0; + hw_rxq = __netif_get_rx_queue(dev, queue_id)->lease; + + netdev_lock(hw_rxq->dev); + queue_id = get_netdev_rx_queue_index(hw_rxq); + ret = __xsk_reg_pool_at_qid(hw_rxq->dev, pool, queue_id); + netdev_unlock(hw_rxq->dev); + + return ret; } static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff_xsk *xskb, u32 len, @@ -152,6 +204,17 @@ static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff_xsk *xskb, u32 len, return 0; } +static void __xsk_rcv_zc_safe(struct xdp_sock *xs, struct xdp_buff_xsk *xskb, + u32 len, u32 flags) +{ + u64 addr; + + addr = xp_get_handle(xskb, xskb->pool); + __xskq_prod_reserve_desc(xs->rx, addr, len, flags); + + xp_release(xskb); +} + static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); @@ -159,26 +222,32 @@ static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) struct xdp_buff_xsk *pos, *tmp; struct list_head *xskb_list; u32 contd = 0; + u32 num_desc; int err; - if (frags) - contd = XDP_PKT_CONTD; + if (likely(!frags)) { + err = __xsk_rcv_zc(xs, xskb, len, contd); + if (err) + goto err; + return 0; + } - err = __xsk_rcv_zc(xs, xskb, len, contd); - if (err) + contd = XDP_PKT_CONTD; + num_desc = xdp_get_shared_info_from_buff(xdp)->nr_frags + 1; + if (xskq_prod_nb_free(xs->rx, num_desc) < num_desc) { + xs->rx_queue_full++; + err = -ENOBUFS; goto err; - if (likely(!frags)) - return 0; + } + __xsk_rcv_zc_safe(xs, xskb, len, contd); xskb_list = &xskb->pool->xskb_list; list_for_each_entry_safe(pos, tmp, xskb_list, list_node) { if (list_is_singular(xskb_list)) contd = 0; len = pos->xdp.data_end - pos->xdp.data; - err = __xsk_rcv_zc(xs, pos, len, contd); - if (err) - goto err; - list_del(&pos->list_node); + __xsk_rcv_zc_safe(xs, pos, len, contd); + list_del_init(&pos->list_node); } return 0; @@ -225,7 +294,7 @@ static u32 xsk_copy_xdp(void *to, void **from, u32 to_len, static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { - u32 frame_size = xsk_pool_get_rx_frame_size(xs->pool); + u32 frame_size = __xsk_pool_get_rx_frame_size(xs->pool); void *copy_from = xsk_copy_xdp_start(xdp), *copy_to; u32 from_len, meta_len, rem, num_desc; struct xdp_buff_xsk *xskb; @@ -284,7 +353,8 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) rem -= copied; xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp); - __xsk_rcv_zc(xs, xskb, copied - meta_len, rem ? XDP_PKT_CONTD : 0); + __xsk_rcv_zc_safe(xs, xskb, copied - meta_len, + rem ? XDP_PKT_CONTD : 0); meta_len = 0; } while (rem); @@ -299,6 +369,13 @@ static bool xsk_tx_writeable(struct xdp_sock *xs) return true; } +static void __xsk_tx_release(struct xdp_sock *xs) +{ + __xskq_cons_release(xs->tx); + if (xsk_tx_writeable(xs)) + xs->sk.sk_write_space(&xs->sk); +} + static bool xsk_is_bound(struct xdp_sock *xs) { if (READ_ONCE(xs->state) == XSK_BOUND) { @@ -309,15 +386,39 @@ static bool xsk_is_bound(struct xdp_sock *xs) return false; } +static bool xsk_dev_queue_valid(const struct xdp_sock *xs, + const struct xdp_rxq_info *info) +{ + struct net_device *dev = xs->dev; + u32 queue_index = xs->queue_id; + struct netdev_rx_queue *rxq; + + if (info->dev == dev && + info->queue_index == queue_index) + return true; + + if (queue_index < dev->real_num_rx_queues) { + rxq = READ_ONCE(__netif_get_rx_queue(dev, queue_index)->lease); + if (!rxq) + return false; + + dev = rxq->dev; + queue_index = get_netdev_rx_queue_index(rxq); + + return info->dev == dev && + info->queue_index == queue_index; + } + return false; +} + static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { if (!xsk_is_bound(xs)) return -ENXIO; - - if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) + if (!xsk_dev_queue_valid(xs, xdp->rxq)) return -EINVAL; - if (len > xsk_pool_get_rx_frame_size(xs->pool) && !xs->sg) { + if (len > __xsk_pool_get_rx_frame_size(xs->pool) && !xs->sg) { xs->rx_dropped++; return -ENOSPC; } @@ -337,13 +438,14 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) u32 len = xdp_get_buff_len(xdp); int err; - spin_lock_bh(&xs->rx_lock); err = xsk_rcv_check(xs, xdp, len); if (!err) { + spin_lock_bh(&xs->pool->rx_lock); err = __xsk_rcv(xs, xdp, len); xsk_flush(xs); + spin_unlock_bh(&xs->pool->rx_lock); } - spin_unlock_bh(&xs->rx_lock); + return err; } @@ -405,11 +507,8 @@ void xsk_tx_release(struct xsk_buff_pool *pool) struct xdp_sock *xs; rcu_read_lock(); - list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) { - __xskq_cons_release(xs->tx); - if (xsk_tx_writeable(xs)) - xs->sk.sk_write_space(&xs->sk); - } + list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) + __xsk_tx_release(xs); rcu_read_unlock(); } EXPORT_SYMBOL(xsk_tx_release); @@ -526,42 +625,126 @@ static int xsk_wakeup(struct xdp_sock *xs, u8 flags) return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags); } -static int xsk_cq_reserve_addr_locked(struct xsk_buff_pool *pool, u64 addr) +static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool) { - unsigned long flags; int ret; - spin_lock_irqsave(&pool->cq_lock, flags); - ret = xskq_prod_reserve_addr(pool->cq, addr); - spin_unlock_irqrestore(&pool->cq_lock, flags); + spin_lock(&pool->cq->cq_cached_prod_lock); + ret = xskq_prod_reserve(pool->cq); + spin_unlock(&pool->cq->cq_cached_prod_lock); return ret; } -static void xsk_cq_submit_locked(struct xsk_buff_pool *pool, u32 n) +static bool xsk_skb_destructor_is_addr(struct sk_buff *skb) { - unsigned long flags; + return (uintptr_t)skb_shinfo(skb)->destructor_arg & 0x1UL; +} - spin_lock_irqsave(&pool->cq_lock, flags); - xskq_prod_submit_n(pool->cq, n); - spin_unlock_irqrestore(&pool->cq_lock, flags); +static u64 xsk_skb_destructor_get_addr(struct sk_buff *skb) +{ + return (u64)((uintptr_t)skb_shinfo(skb)->destructor_arg & ~0x1UL); } -static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n) +static struct xsk_addrs *__xsk_addrs_alloc(struct sk_buff *skb, u64 addr) { - unsigned long flags; + struct xsk_addrs *xsk_addr; - spin_lock_irqsave(&pool->cq_lock, flags); - xskq_prod_cancel_n(pool->cq, n); - spin_unlock_irqrestore(&pool->cq_lock, flags); + xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL); + if (unlikely(!xsk_addr)) + return NULL; + + xsk_addr->addrs[0] = addr; + skb_shinfo(skb)->destructor_arg = (void *)xsk_addr; + return xsk_addr; +} + +static struct xsk_addrs *xsk_addrs_alloc(struct sk_buff *skb) +{ + struct xsk_addrs *xsk_addr; + + if (!xsk_skb_destructor_is_addr(skb)) + return (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; + + xsk_addr = __xsk_addrs_alloc(skb, xsk_skb_destructor_get_addr(skb)); + if (likely(xsk_addr)) + xsk_addr->num_descs = 1; + return xsk_addr; +} + +static int xsk_skb_destructor_set_addr(struct sk_buff *skb, u64 addr) +{ + if (IS_ENABLED(CONFIG_64BIT)) { + skb_shinfo(skb)->destructor_arg = (void *)((uintptr_t)addr | 0x1UL); + return 0; + } + + if (unlikely(!__xsk_addrs_alloc(skb, addr))) + return -ENOMEM; + return 0; +} + +static void xsk_inc_num_desc(struct sk_buff *skb) +{ + struct xsk_addrs *xsk_addr; + + if (!xsk_skb_destructor_is_addr(skb)) { + xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; + xsk_addr->num_descs++; + } } static u32 xsk_get_num_desc(struct sk_buff *skb) { - return skb ? (long)skb_shinfo(skb)->destructor_arg : 0; + struct xsk_addrs *xsk_addr; + + if (xsk_skb_destructor_is_addr(skb)) + return 1; + + xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; + + return xsk_addr->num_descs; } -static void xsk_destruct_skb(struct sk_buff *skb) +static void xsk_cq_submit_addr_locked(struct xsk_buff_pool *pool, + struct sk_buff *skb) +{ + u32 num_descs = xsk_get_num_desc(skb); + struct xsk_addrs *xsk_addr; + u32 descs_processed = 0; + unsigned long flags; + u32 idx, i; + + spin_lock_irqsave(&pool->cq_prod_lock, flags); + idx = xskq_get_prod(pool->cq); + + if (unlikely(!xsk_skb_destructor_is_addr(skb))) { + xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; + + for (i = 0; i < num_descs; i++) { + xskq_prod_write_addr(pool->cq, idx + descs_processed, + xsk_addr->addrs[i]); + descs_processed++; + } + kmem_cache_free(xsk_tx_generic_cache, xsk_addr); + } else { + xskq_prod_write_addr(pool->cq, idx, + xsk_skb_destructor_get_addr(skb)); + descs_processed++; + } + xskq_prod_submit_n(pool->cq, descs_processed); + spin_unlock_irqrestore(&pool->cq_prod_lock, flags); +} + +static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n) +{ + spin_lock(&pool->cq->cq_cached_prod_lock); + xskq_prod_cancel_n(pool->cq, n); + spin_unlock(&pool->cq->cq_cached_prod_lock); +} + +INDIRECT_CALLABLE_SCOPE +void xsk_destruct_skb(struct sk_buff *skb) { struct xsk_tx_metadata_compl *compl = &skb_shinfo(skb)->xsk_meta; @@ -570,23 +753,39 @@ static void xsk_destruct_skb(struct sk_buff *skb) *compl->tx_timestamp = ktime_get_tai_fast_ns(); } - xsk_cq_submit_locked(xdp_sk(skb->sk)->pool, xsk_get_num_desc(skb)); + xsk_cq_submit_addr_locked(xdp_sk(skb->sk)->pool, skb); sock_wfree(skb); } -static void xsk_set_destructor_arg(struct sk_buff *skb) +static int xsk_skb_init_misc(struct sk_buff *skb, struct xdp_sock *xs, + u64 addr) { - long num = xsk_get_num_desc(xdp_sk(skb->sk)->skb) + 1; + int err; - skb_shinfo(skb)->destructor_arg = (void *)num; + err = xsk_skb_destructor_set_addr(skb, addr); + if (unlikely(err)) + return err; + + skb->dev = xs->dev; + skb->priority = READ_ONCE(xs->sk.sk_priority); + skb->mark = READ_ONCE(xs->sk.sk_mark); + skb->destructor = xsk_destruct_skb; + return 0; } static void xsk_consume_skb(struct sk_buff *skb) { struct xdp_sock *xs = xdp_sk(skb->sk); + u32 num_descs = xsk_get_num_desc(skb); + struct xsk_addrs *xsk_addr; + + if (unlikely(!xsk_skb_destructor_is_addr(skb))) { + xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; + kmem_cache_free(xsk_tx_generic_cache, xsk_addr); + } skb->destructor = sock_wfree; - xsk_cq_cancel_locked(xs->pool, xsk_get_num_desc(skb)); + xsk_cq_cancel_locked(xs->pool, num_descs); /* Free skb without triggering the perf drop trace */ consume_skb(skb); xs->skb = NULL; @@ -598,6 +797,45 @@ static void xsk_drop_skb(struct sk_buff *skb) xsk_consume_skb(skb); } +static int xsk_skb_metadata(struct sk_buff *skb, void *buffer, + struct xdp_desc *desc, struct xsk_buff_pool *pool, + u32 hr) +{ + struct xsk_tx_metadata *meta = NULL; + + if (unlikely(pool->tx_metadata_len == 0)) + return -EINVAL; + + meta = buffer - pool->tx_metadata_len; + if (unlikely(!xsk_buff_valid_tx_metadata(meta))) + return -EINVAL; + + if (meta->flags & XDP_TXMD_FLAGS_CHECKSUM) { + if (unlikely(meta->request.csum_start + + meta->request.csum_offset + + sizeof(__sum16) > desc->len)) + return -EINVAL; + + skb->csum_start = hr + meta->request.csum_start; + skb->csum_offset = meta->request.csum_offset; + skb->ip_summed = CHECKSUM_PARTIAL; + + if (unlikely(pool->tx_sw_csum)) { + int err; + + err = skb_checksum_help(skb); + if (err) + return err; + } + } + + if (meta->flags & XDP_TXMD_FLAGS_LAUNCH_TIME) + skb->skb_mstamp_ns = meta->request.launch_time; + xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta); + + return 0; +} + static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, struct xdp_desc *desc) { @@ -609,6 +847,9 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, int err, i; u64 addr; + addr = desc->addr; + buffer = xsk_buff_raw_get_data(pool, addr); + if (!skb) { hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom)); @@ -617,19 +858,39 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, return ERR_PTR(err); skb_reserve(skb, hr); + if (desc->options & XDP_TX_METADATA) { + err = xsk_skb_metadata(skb, buffer, desc, pool, hr); + if (unlikely(err)) { + kfree_skb(skb); + return ERR_PTR(err); + } + } + } else { + struct xsk_addrs *xsk_addr; + + xsk_addr = xsk_addrs_alloc(skb); + if (!xsk_addr) + return ERR_PTR(-ENOMEM); + + /* in case of -EOVERFLOW that could happen below, + * xsk_consume_skb() will release this node as whole skb + * would be dropped, which implies freeing all list elements + */ + xsk_addr->addrs[xsk_addr->num_descs] = desc->addr; } - addr = desc->addr; len = desc->len; ts = pool->unaligned ? len : pool->chunk_size; - buffer = xsk_buff_raw_get_data(pool, addr); offset = offset_in_page(buffer); addr = buffer - pool->addrs; for (copied = 0, i = skb_shinfo(skb)->nr_frags; copied < len; i++) { - if (unlikely(i >= MAX_SKB_FRAGS)) + if (unlikely(i >= MAX_SKB_FRAGS)) { + if (!xs->skb) + kfree_skb(skb); return ERR_PTR(-EOVERFLOW); + } page = pool->umem->pgs[addr >> PAGE_SHIFT]; get_page(page); @@ -654,16 +915,15 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, struct xdp_desc *desc) { - struct xsk_tx_metadata *meta = NULL; struct net_device *dev = xs->dev; struct sk_buff *skb = xs->skb; - bool first_frag = false; int err; if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) { skb = xsk_build_skb_zerocopy(xs, desc); if (IS_ERR(skb)) { err = PTR_ERR(skb); + skb = NULL; goto free_err; } } else { @@ -674,8 +934,6 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, len = desc->len; if (!skb) { - first_frag = true; - hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom)); tr = dev->needed_tailroom; skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err); @@ -688,11 +946,25 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, err = skb_store_bits(skb, 0, buffer, len); if (unlikely(err)) goto free_err; + + if (desc->options & XDP_TX_METADATA) { + err = xsk_skb_metadata(skb, buffer, desc, + xs->pool, hr); + if (unlikely(err)) + goto free_err; + } } else { int nr_frags = skb_shinfo(skb)->nr_frags; + struct xsk_addrs *xsk_addr; struct page *page; u8 *vaddr; + xsk_addr = xsk_addrs_alloc(skb); + if (!xsk_addr) { + err = -ENOMEM; + goto free_err; + } + if (unlikely(nr_frags == (MAX_SKB_FRAGS - 1) && xp_mb_desc(desc))) { err = -EOVERFLOW; goto free_err; @@ -710,58 +982,33 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, skb_add_rx_frag(skb, nr_frags, page, 0, len, PAGE_SIZE); refcount_add(PAGE_SIZE, &xs->sk.sk_wmem_alloc); - } - - if (first_frag && desc->options & XDP_TX_METADATA) { - if (unlikely(xs->pool->tx_metadata_len == 0)) { - err = -EINVAL; - goto free_err; - } - meta = buffer - xs->pool->tx_metadata_len; - if (unlikely(!xsk_buff_valid_tx_metadata(meta))) { - err = -EINVAL; - goto free_err; - } - - if (meta->flags & XDP_TXMD_FLAGS_CHECKSUM) { - if (unlikely(meta->request.csum_start + - meta->request.csum_offset + - sizeof(__sum16) > len)) { - err = -EINVAL; - goto free_err; - } - - skb->csum_start = hr + meta->request.csum_start; - skb->csum_offset = meta->request.csum_offset; - skb->ip_summed = CHECKSUM_PARTIAL; - - if (unlikely(xs->pool->tx_sw_csum)) { - err = skb_checksum_help(skb); - if (err) - goto free_err; - } - } + xsk_addr->addrs[xsk_addr->num_descs] = desc->addr; } } - skb->dev = dev; - skb->priority = READ_ONCE(xs->sk.sk_priority); - skb->mark = READ_ONCE(xs->sk.sk_mark); - skb->destructor = xsk_destruct_skb; - xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta); - xsk_set_destructor_arg(skb); + if (!xs->skb) { + err = xsk_skb_init_misc(skb, xs, desc->addr); + if (unlikely(err)) + goto free_err; + } + xsk_inc_num_desc(skb); return skb; free_err: - if (first_frag && skb) + if (skb && !xs->skb) kfree_skb(skb); if (err == -EOVERFLOW) { - /* Drop the packet */ - xsk_set_destructor_arg(xs->skb); - xsk_drop_skb(xs->skb); + if (xs->skb) { + /* Drop the packet */ + xsk_inc_num_desc(xs->skb); + xsk_drop_skb(xs->skb); + } else { + xsk_cq_cancel_locked(xs->pool, 1); + xs->tx->invalid_descs++; + } xskq_cons_release(xs->tx); } else { /* Let application retry */ @@ -774,10 +1021,10 @@ free_err: static int __xsk_generic_xmit(struct sock *sk) { struct xdp_sock *xs = xdp_sk(sk); - u32 max_batch = TX_BATCH_SIZE; bool sent_frame = false; struct xdp_desc desc; struct sk_buff *skb; + u32 max_batch; int err = 0; mutex_lock(&xs->mutex); @@ -791,6 +1038,7 @@ static int __xsk_generic_xmit(struct sock *sk) if (xs->queue_id >= xs->dev->real_num_tx_queues) goto out; + max_batch = READ_ONCE(xs->max_tx_budget); while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) { if (max_batch-- == 0) { err = -EAGAIN; @@ -802,8 +1050,11 @@ static int __xsk_generic_xmit(struct sock *sk) * if there is space in it. This avoids having to implement * any buffering in the Tx path. */ - if (xsk_cq_reserve_addr_locked(xs->pool, desc.addr)) + err = xsk_cq_reserve_locked(xs->pool); + if (err) { + err = -EAGAIN; goto out; + } skb = xsk_build_skb(xs, &desc); if (IS_ERR(skb)) { @@ -850,8 +1101,7 @@ static int __xsk_generic_xmit(struct sock *sk) out: if (sent_frame) - if (xsk_tx_writeable(xs)) - sk->sk_write_space(sk); + __xsk_tx_release(xs); mutex_unlock(&xs->mutex); return err; @@ -875,7 +1125,7 @@ static bool xsk_no_wakeup(struct sock *sk) #ifdef CONFIG_NET_RX_BUSY_POLL /* Prefer busy-polling, skip the wakeup. */ return READ_ONCE(sk->sk_prefer_busy_poll) && READ_ONCE(sk->sk_ll_usec) && - READ_ONCE(sk->sk_napi_id) >= MIN_NAPI_ID; + napi_id_valid(READ_ONCE(sk->sk_napi_id)); #else return false; #endif @@ -1141,7 +1391,7 @@ static bool xsk_validate_queues(struct xdp_sock *xs) return xs->fq_tmp && xs->cq_tmp; } -static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) +static int xsk_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr_len) { struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr; struct sock *sk = sock->sk; @@ -1178,6 +1428,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) goto out_release; } + netdev_lock_ops(dev); + if (!xs->rx && !xs->tx) { err = -EINVAL; goto out_unlock; @@ -1216,6 +1468,13 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) } if (umem_xs->queue_id != qid || umem_xs->dev != dev) { + /* One fill and completion ring required for each queue id. */ + if (!xsk_validate_queues(xs)) { + err = -EINVAL; + sockfd_put(sock); + goto out_unlock; + } + /* Share the umem with another socket on another qid * and/or device. */ @@ -1294,7 +1553,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) xs->queue_id = qid; xp_add_xsk(xs->pool, xs); - if (xs->zc && qid < dev->real_num_rx_queues) { + if (qid < dev->real_num_rx_queues) { struct netdev_rx_queue *rxq; rxq = __netif_get_rx_queue(dev, qid); @@ -1312,6 +1571,7 @@ out_unlock: smp_wmb(); WRITE_ONCE(xs->state, XSK_BOUND); } + netdev_unlock_ops(dev); out_release: mutex_unlock(&xs->mutex); rtnl_unlock(); @@ -1426,6 +1686,21 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, mutex_unlock(&xs->mutex); return err; } + case XDP_MAX_TX_SKB_BUDGET: + { + unsigned int budget; + + if (optlen != sizeof(budget)) + return -EINVAL; + if (copy_from_sockptr(&budget, optval, sizeof(budget))) + return -EFAULT; + if (!xs->tx || + budget < TX_BATCH_SIZE || budget > xs->tx->nentries) + return -EACCES; + + WRITE_ONCE(xs->max_tx_budget, budget); + return 0; + } default: break; } @@ -1723,8 +1998,8 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol, xs = xdp_sk(sk); xs->state = XSK_READY; + xs->max_tx_budget = TX_BATCH_SIZE; mutex_init(&xs->mutex); - spin_lock_init(&xs->rx_lock); INIT_LIST_HEAD(&xs->map_list); spin_lock_init(&xs->map_list_lock); @@ -1785,8 +2060,18 @@ static int __init xsk_init(void) if (err) goto out_pernet; + xsk_tx_generic_cache = kmem_cache_create("xsk_generic_xmit_cache", + sizeof(struct xsk_addrs), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!xsk_tx_generic_cache) { + err = -ENOMEM; + goto out_unreg_notif; + } + return 0; +out_unreg_notif: + unregister_netdevice_notifier(&xsk_netdev_notifier); out_pernet: unregister_pernet_subsys(&xsk_net_ops); out_sk: diff --git a/net/xdp/xsk.h b/net/xdp/xsk.h index a4bc4749faac..7c811b5cce76 100644 --- a/net/xdp/xsk.h +++ b/net/xdp/xsk.h @@ -4,13 +4,6 @@ #ifndef XSK_H_ #define XSK_H_ -/* Masks for xdp_umem_page flags. - * The low 12-bits of the addr will be 0 since this is the page address, so we - * can use them for flags. - */ -#define XSK_NEXT_PG_CONTIG_SHIFT 0 -#define XSK_NEXT_PG_CONTIG_MASK BIT_ULL(XSK_NEXT_PG_CONTIG_SHIFT) - struct xdp_ring_offset_v1 { __u64 producer; __u64 consumer; diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 1f7975b49657..d981cfdd8535 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/netdevice.h> +#include <net/netdev_lock.h> #include <net/xsk_buff_pool.h> #include <net/xdp_sock.h> #include <net/xdp_sock_drv.h> @@ -8,28 +10,26 @@ #include "xdp_umem.h" #include "xsk.h" +#define ETH_PAD_LEN (ETH_HLEN + 2 * VLAN_HLEN + ETH_FCS_LEN) + void xp_add_xsk(struct xsk_buff_pool *pool, struct xdp_sock *xs) { - unsigned long flags; - if (!xs->tx) return; - spin_lock_irqsave(&pool->xsk_tx_list_lock, flags); + spin_lock(&pool->xsk_tx_list_lock); list_add_rcu(&xs->tx_list, &pool->xsk_tx_list); - spin_unlock_irqrestore(&pool->xsk_tx_list_lock, flags); + spin_unlock(&pool->xsk_tx_list_lock); } void xp_del_xsk(struct xsk_buff_pool *pool, struct xdp_sock *xs) { - unsigned long flags; - if (!xs->tx) return; - spin_lock_irqsave(&pool->xsk_tx_list_lock, flags); + spin_lock(&pool->xsk_tx_list_lock); list_del_rcu(&xs->tx_list); - spin_unlock_irqrestore(&pool->xsk_tx_list_lock, flags); + spin_unlock(&pool->xsk_tx_list_lock); } void xp_destroy(struct xsk_buff_pool *pool) @@ -44,8 +44,7 @@ void xp_destroy(struct xsk_buff_pool *pool) int xp_alloc_tx_descs(struct xsk_buff_pool *pool, struct xdp_sock *xs) { - pool->tx_descs = kvcalloc(xs->tx->nentries, sizeof(*pool->tx_descs), - GFP_KERNEL); + pool->tx_descs = kvzalloc_objs(*pool->tx_descs, xs->tx->nentries); if (!pool->tx_descs) return -ENOMEM; @@ -61,11 +60,11 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, u32 i, entries; entries = unaligned ? umem->chunks : 0; - pool = kvzalloc(struct_size(pool, free_heads, entries), GFP_KERNEL); + pool = kvzalloc_flex(*pool, free_heads, entries); if (!pool) goto out; - pool->heads = kvcalloc(umem->chunks, sizeof(*pool->heads), GFP_KERNEL); + pool->heads = kvzalloc_objs(*pool->heads, umem->chunks); if (!pool->heads) goto out; @@ -87,11 +86,13 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, pool->addrs = umem->addrs; pool->tx_metadata_len = umem->tx_metadata_len; pool->tx_sw_csum = umem->flags & XDP_UMEM_TX_SW_CSUM; + spin_lock_init(&pool->rx_lock); INIT_LIST_HEAD(&pool->free_list); INIT_LIST_HEAD(&pool->xskb_list); INIT_LIST_HEAD(&pool->xsk_tx_list); spin_lock_init(&pool->xsk_tx_list_lock); - spin_lock_init(&pool->cq_lock); + spin_lock_init(&pool->cq_prod_lock); + spin_lock_init(&xs->cq_tmp->cq_cached_prod_lock); refcount_set(&pool->users, 1); pool->fq = xs->fq_tmp; @@ -105,7 +106,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, if (pool->unaligned) pool->free_heads[i] = xskb; else - xp_init_xskb_addr(xskb, pool, i * pool->chunk_size); + xp_init_xskb_addr(xskb, pool, (u64)i * pool->chunk_size); } return pool; @@ -155,15 +156,15 @@ static void xp_disable_drv_zc(struct xsk_buff_pool *pool) } } -#define NETDEV_XDP_ACT_ZC (NETDEV_XDP_ACT_BASIC | \ - NETDEV_XDP_ACT_REDIRECT | \ - NETDEV_XDP_ACT_XSK_ZEROCOPY) - int xp_assign_dev(struct xsk_buff_pool *pool, struct net_device *netdev, u16 queue_id, u16 flags) { + u32 needed = netdev->mtu + ETH_PAD_LEN; + u32 segs = netdev->xdp_zc_max_segs; + bool mbuf = flags & XDP_USE_SG; bool force_zc, force_copy; struct netdev_bpf bpf; + u32 frame_size; int err = 0; ASSERT_RTNL(); @@ -174,6 +175,9 @@ int xp_assign_dev(struct xsk_buff_pool *pool, if (force_zc && force_copy) return -EINVAL; + if (pool->tx_sw_csum && (netdev->priv_flags & IFF_TX_SKB_NO_LINEAR)) + return -EOPNOTSUPP; + if (xsk_get_pool_from_qid(netdev, queue_id)) return -EBUSY; @@ -183,7 +187,7 @@ int xp_assign_dev(struct xsk_buff_pool *pool, if (err) return err; - if (flags & XDP_USE_SG) + if (mbuf) pool->umem->flags |= XDP_UMEM_SG_FLAG; if (flags & XDP_USE_NEED_WAKEUP) @@ -200,13 +204,29 @@ int xp_assign_dev(struct xsk_buff_pool *pool, /* For copy-mode, we are done. */ return 0; - if ((netdev->xdp_features & NETDEV_XDP_ACT_ZC) != NETDEV_XDP_ACT_ZC) { + if ((netdev->xdp_features & NETDEV_XDP_ACT_XSK) != NETDEV_XDP_ACT_XSK) { err = -EOPNOTSUPP; goto err_unreg_pool; } - if (netdev->xdp_zc_max_segs == 1 && (flags & XDP_USE_SG)) { - err = -EOPNOTSUPP; + if (mbuf) { + if (segs == 1) { + err = -EOPNOTSUPP; + goto err_unreg_pool; + } + } else { + segs = 1; + } + + /* open-code xsk_pool_get_rx_frame_size() as pool->dev is not + * set yet at this point; we are before getting down to driver + */ + frame_size = __xsk_pool_get_rx_frame_size(pool) - + xsk_pool_get_tailroom(mbuf); + frame_size = ALIGN_DOWN(frame_size, 128); + + if (needed > frame_size * segs) { + err = -EINVAL; goto err_unreg_pool; } @@ -219,6 +239,7 @@ int xp_assign_dev(struct xsk_buff_pool *pool, bpf.xsk.pool = pool; bpf.xsk.queue_id = queue_id; + netdev_ops_assert_locked(netdev); err = netdev->netdev_ops->ndo_bpf(netdev, &bpf); if (err) goto err_unreg_pool; @@ -250,11 +271,11 @@ int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_sock *umem_xs, u16 flags; struct xdp_umem *umem = umem_xs->umem; - /* One fill and completion ring required for each queue id. */ - if (!pool->fq || !pool->cq) - return -EINVAL; - flags = umem->zc ? XDP_ZEROCOPY : XDP_COPY; + + if (umem->flags & XDP_UMEM_SG_FLAG) + flags |= XDP_USE_SG; + if (umem_xs->pool->uses_need_wakeup) flags |= XDP_USE_NEED_WAKEUP; @@ -263,13 +284,17 @@ int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_sock *umem_xs, void xp_clear_dev(struct xsk_buff_pool *pool) { + struct net_device *netdev = pool->netdev; + if (!pool->netdev) return; + netdev_lock_ops(netdev); xp_disable_drv_zc(pool); xsk_clear_pool_at_qid(pool->netdev, pool->queue_id); - dev_put(pool->netdev); pool->netdev = NULL; + netdev_unlock_ops(netdev); + dev_put(netdev); } static void xp_release_deferred(struct work_struct *work) @@ -331,11 +356,11 @@ static struct xsk_dma_map *xp_create_dma_map(struct device *dev, struct net_devi { struct xsk_dma_map *dma_map; - dma_map = kzalloc(sizeof(*dma_map), GFP_KERNEL); + dma_map = kzalloc_obj(*dma_map); if (!dma_map) return NULL; - dma_map->dma_pages = kvcalloc(nr_pages, sizeof(*dma_map->dma_pages), GFP_KERNEL); + dma_map->dma_pages = kvzalloc_objs(*dma_map->dma_pages, nr_pages); if (!dma_map->dma_pages) { kfree(dma_map); return NULL; @@ -423,7 +448,8 @@ static int xp_init_dma_info(struct xsk_buff_pool *pool, struct xsk_dma_map *dma_ } } - pool->dma_pages = kvcalloc(dma_map->dma_pages_cnt, sizeof(*pool->dma_pages), GFP_KERNEL); + pool->dma_pages = kvzalloc_objs(*pool->dma_pages, + dma_map->dma_pages_cnt); if (!pool->dma_pages) return -ENOMEM; @@ -699,18 +725,56 @@ void xp_free(struct xdp_buff_xsk *xskb) } EXPORT_SYMBOL(xp_free); -void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr) +static u64 __xp_raw_get_addr(const struct xsk_buff_pool *pool, u64 addr) +{ + return pool->unaligned ? xp_unaligned_add_offset_to_addr(addr) : addr; +} + +static void *__xp_raw_get_data(const struct xsk_buff_pool *pool, u64 addr) { - addr = pool->unaligned ? xp_unaligned_add_offset_to_addr(addr) : addr; return pool->addrs + addr; } + +void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr) +{ + return __xp_raw_get_data(pool, __xp_raw_get_addr(pool, addr)); +} EXPORT_SYMBOL(xp_raw_get_data); -dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr) +static dma_addr_t __xp_raw_get_dma(const struct xsk_buff_pool *pool, u64 addr) { - addr = pool->unaligned ? xp_unaligned_add_offset_to_addr(addr) : addr; return (pool->dma_pages[addr >> PAGE_SHIFT] & ~XSK_NEXT_PG_CONTIG_MASK) + (addr & ~PAGE_MASK); } + +dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr) +{ + return __xp_raw_get_dma(pool, __xp_raw_get_addr(pool, addr)); +} EXPORT_SYMBOL(xp_raw_get_dma); + +/** + * xp_raw_get_ctx - get &xdp_desc context + * @pool: XSk buff pool desc address belongs to + * @addr: desc address (from userspace) + * + * Helper for getting desc's DMA address and metadata pointer, if present. + * Saves one call on hotpath, double calculation of the actual address, + * and inline checks for metadata presence and sanity. + * + * Return: new &xdp_desc_ctx struct containing desc's DMA address and metadata + * pointer, if it is present and valid (initialized to %NULL otherwise). + */ +struct xdp_desc_ctx xp_raw_get_ctx(const struct xsk_buff_pool *pool, u64 addr) +{ + struct xdp_desc_ctx ret; + + addr = __xp_raw_get_addr(pool, addr); + + ret.dma = __xp_raw_get_dma(pool, addr); + ret.meta = __xsk_buff_get_metadata(pool, __xp_raw_get_data(pool, addr)); + + return ret; +} +EXPORT_SYMBOL(xp_raw_get_ctx); diff --git a/net/xdp/xsk_diag.c b/net/xdp/xsk_diag.c index 09dcea0cbbed..0170363eb542 100644 --- a/net/xdp/xsk_diag.c +++ b/net/xdp/xsk_diag.c @@ -92,7 +92,7 @@ static int xsk_diag_put_stats(const struct xdp_sock *xs, struct sk_buff *nlskb) static int xsk_diag_fill(struct sock *sk, struct sk_buff *nlskb, struct xdp_diag_req *req, struct user_namespace *user_ns, - u32 portid, u32 seq, u32 flags, int sk_ino) + u32 portid, u32 seq, u32 flags, u64 sk_ino) { struct xdp_sock *xs = xdp_sk(sk); struct xdp_diag_msg *msg; @@ -119,7 +119,7 @@ static int xsk_diag_fill(struct sock *sk, struct sk_buff *nlskb, if ((req->xdiag_show & XDP_SHOW_INFO) && nla_put_u32(nlskb, XDP_DIAG_UID, - from_kuid_munged(user_ns, sock_i_uid(sk)))) + from_kuid_munged(user_ns, sk_uid(sk)))) goto out_nlmsg_trim; if ((req->xdiag_show & XDP_SHOW_RING_CFG) && diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c index d2c264030017..4dd01b7d858e 100644 --- a/net/xdp/xsk_queue.c +++ b/net/xdp/xsk_queue.c @@ -26,7 +26,7 @@ struct xsk_queue *xskq_create(u32 nentries, bool umem_queue) struct xsk_queue *q; size_t size; - q = kzalloc(sizeof(*q), GFP_KERNEL); + q = kzalloc_obj(*q); if (!q) return NULL; diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 46d87e961ad6..3e3fbb73d23e 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -46,6 +46,11 @@ struct xsk_queue { u64 invalid_descs; u64 queue_empty_descs; size_t ring_vmalloc_size; + /* Mutual exclusion of the completion ring in the SKB mode. + * Protect: when sockets share a single cq when the same netdev + * and queue id is shared. + */ + spinlock_t cq_cached_prod_lock; }; struct parsed_desc { @@ -143,14 +148,24 @@ static inline bool xp_unused_options_set(u32 options) static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) { - u64 addr = desc->addr - pool->tx_metadata_len; - u64 len = desc->len + pool->tx_metadata_len; - u64 offset = addr & (pool->chunk_size - 1); + u64 len = desc->len; + u64 addr, offset; + + if (!len) + return false; - if (!desc->len) + /* Can overflow if desc->addr < pool->tx_metadata_len */ + if (check_sub_overflow(desc->addr, pool->tx_metadata_len, &addr)) return false; - if (offset + len > pool->chunk_size) + offset = addr & (pool->chunk_size - 1); + + /* + * Can't overflow: @offset is guaranteed to be < ``U32_MAX`` + * (pool->chunk_size is ``u32``), @len is guaranteed + * to be <= ``U32_MAX``. + */ + if (offset + len + pool->tx_metadata_len > pool->chunk_size) return false; if (addr >= pool->addrs_cnt) @@ -158,27 +173,42 @@ static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool, if (xp_unused_options_set(desc->options)) return false; + return true; } static inline bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) { - u64 addr = xp_unaligned_add_offset_to_addr(desc->addr) - pool->tx_metadata_len; - u64 len = desc->len + pool->tx_metadata_len; + u64 len = desc->len; + u64 addr, end; - if (!desc->len) + if (!len) return false; + /* Can't overflow: @len is guaranteed to be <= ``U32_MAX`` */ + len += pool->tx_metadata_len; if (len > pool->chunk_size) return false; - if (addr >= pool->addrs_cnt || addr + len > pool->addrs_cnt || - xp_desc_crosses_non_contig_pg(pool, addr, len)) + /* Can overflow if desc->addr is close to 0 */ + if (check_sub_overflow(xp_unaligned_add_offset_to_addr(desc->addr), + pool->tx_metadata_len, &addr)) + return false; + + if (addr >= pool->addrs_cnt) + return false; + + /* Can overflow if pool->addrs_cnt is high enough */ + if (check_add_overflow(addr, len, &end) || end > pool->addrs_cnt) + return false; + + if (xp_desc_crosses_non_contig_pg(pool, addr, len)) return false; if (xp_unused_options_set(desc->options)) return false; + return true; } @@ -344,6 +374,11 @@ static inline u32 xskq_cons_present_entries(struct xsk_queue *q) /* Functions for producers */ +static inline u32 xskq_get_prod(struct xsk_queue *q) +{ + return READ_ONCE(q->ring->producer); +} + static inline u32 xskq_prod_nb_free(struct xsk_queue *q, u32 max) { u32 free_entries = q->nentries - (q->cached_prod - q->cached_cons); @@ -390,6 +425,13 @@ static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 addr) return 0; } +static inline void xskq_prod_write_addr(struct xsk_queue *q, u32 idx, u64 addr) +{ + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; + + ring->desc[idx & q->ring_mask] = addr; +} + static inline void xskq_prod_write_addr_batch(struct xsk_queue *q, struct xdp_desc *descs, u32 nb_entries) { @@ -403,20 +445,26 @@ static inline void xskq_prod_write_addr_batch(struct xsk_queue *q, struct xdp_de q->cached_prod = cached_prod; } -static inline int xskq_prod_reserve_desc(struct xsk_queue *q, - u64 addr, u32 len, u32 flags) +static inline void __xskq_prod_reserve_desc(struct xsk_queue *q, + u64 addr, u32 len, u32 flags) { struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; u32 idx; - if (xskq_prod_is_full(q)) - return -ENOBUFS; - /* A, matches D */ idx = q->cached_prod++ & q->ring_mask; ring->desc[idx].addr = addr; ring->desc[idx].len = len; ring->desc[idx].options = flags; +} + +static inline int xskq_prod_reserve_desc(struct xsk_queue *q, + u64 addr, u32 len, u32 flags) +{ + if (xskq_prod_is_full(q)) + return -ENOBUFS; + + __xskq_prod_reserve_desc(q, addr, len, flags); return 0; } diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c index afa457506274..3bff346308d0 100644 --- a/net/xdp/xskmap.c +++ b/net/xdp/xskmap.c @@ -184,6 +184,10 @@ static long xsk_map_update_elem(struct bpf_map *map, void *key, void *value, } xs = (struct xdp_sock *)sock->sk; + if (!READ_ONCE(xs->rx)) { + sockfd_put(sock); + return -ENOBUFS; + } map_entry = &m->xsk_map[i]; node = xsk_map_node_alloc(m, map_entry); |
