summaryrefslogtreecommitdiff
path: root/net/xdp
diff options
context:
space:
mode:
Diffstat (limited to 'net/xdp')
-rw-r--r--net/xdp/xdp_umem.c8
-rw-r--r--net/xdp/xsk.c505
-rw-r--r--net/xdp/xsk.h7
-rw-r--r--net/xdp/xsk_buff_pool.c132
-rw-r--r--net/xdp/xsk_diag.c4
-rw-r--r--net/xdp/xsk_queue.c2
-rw-r--r--net/xdp/xsk_queue.h78
-rw-r--r--net/xdp/xskmap.c4
8 files changed, 568 insertions, 172 deletions
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index 9f76ca591d54..58da2f4f4397 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -97,7 +97,8 @@ static int xdp_umem_pin_pages(struct xdp_umem *umem, unsigned long address)
long npgs;
int err;
- umem->pgs = kvcalloc(umem->npgs, sizeof(*umem->pgs), GFP_KERNEL | __GFP_NOWARN);
+ umem->pgs = kvzalloc_objs(*umem->pgs, umem->npgs,
+ GFP_KERNEL | __GFP_NOWARN);
if (!umem->pgs)
return -ENOMEM;
@@ -202,7 +203,8 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
if (!unaligned_chunks && chunks_rem)
return -EINVAL;
- if (headroom >= chunk_size - XDP_PACKET_HEADROOM)
+ if (headroom > chunk_size - XDP_PACKET_HEADROOM -
+ SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) - 128)
return -EINVAL;
if (mr->flags & XDP_UMEM_TX_METADATA_LEN) {
@@ -249,7 +251,7 @@ struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr)
struct xdp_umem *umem;
int err;
- umem = kzalloc(sizeof(*umem), GFP_KERNEL);
+ umem = kzalloc_obj(*umem);
if (!umem)
return ERR_PTR(-ENOMEM);
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 89d2bef96469..5e5786cd9af5 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -23,17 +23,29 @@
#include <linux/netdevice.h>
#include <linux/rculist.h>
#include <linux/vmalloc.h>
+
+#include <net/netdev_queues.h>
#include <net/xdp_sock_drv.h>
#include <net/busy_poll.h>
+#include <net/netdev_lock.h>
#include <net/netdev_rx_queue.h>
#include <net/xdp.h>
+#include "../core/dev.h"
+
#include "xsk_queue.h"
#include "xdp_umem.h"
#include "xsk.h"
#define TX_BATCH_SIZE 32
-#define MAX_PER_SOCKET_BUDGET (TX_BATCH_SIZE)
+#define MAX_PER_SOCKET_BUDGET 32
+
+struct xsk_addrs {
+ u32 num_descs;
+ u64 addrs[MAX_SKB_FRAGS + 1];
+};
+
+static struct kmem_cache *xsk_tx_generic_cache;
void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
{
@@ -107,7 +119,7 @@ struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev,
}
EXPORT_SYMBOL(xsk_get_pool_from_qid);
-void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id)
+static void __xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id)
{
if (queue_id < dev->num_rx_queues)
dev->_rx[queue_id].pool = NULL;
@@ -115,6 +127,36 @@ void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id)
dev->_tx[queue_id].pool = NULL;
}
+void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id)
+{
+ struct netdev_rx_queue *hw_rxq;
+
+ if (!netif_rxq_is_leased(dev, queue_id))
+ return __xsk_clear_pool_at_qid(dev, queue_id);
+ WARN_ON_ONCE(!netif_is_queue_leasee(dev));
+
+ hw_rxq = __netif_get_rx_queue(dev, queue_id)->lease;
+
+ netdev_lock(hw_rxq->dev);
+ queue_id = get_netdev_rx_queue_index(hw_rxq);
+ __xsk_clear_pool_at_qid(hw_rxq->dev, queue_id);
+ netdev_unlock(hw_rxq->dev);
+}
+
+static int __xsk_reg_pool_at_qid(struct net_device *dev,
+ struct xsk_buff_pool *pool, u16 queue_id)
+{
+ if (xsk_get_pool_from_qid(dev, queue_id))
+ return -EBUSY;
+
+ if (queue_id < dev->real_num_rx_queues)
+ dev->_rx[queue_id].pool = pool;
+ if (queue_id < dev->real_num_tx_queues)
+ dev->_tx[queue_id].pool = pool;
+
+ return 0;
+}
+
/* The buffer pool is stored both in the _rx struct and the _tx struct as we do
* not know if the device has more tx queues than rx, or the opposite.
* This might also change during run time.
@@ -122,17 +164,27 @@ void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id)
int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool,
u16 queue_id)
{
- if (queue_id >= max_t(unsigned int,
- dev->real_num_rx_queues,
- dev->real_num_tx_queues))
+ struct netdev_rx_queue *hw_rxq;
+ int ret;
+
+ if (queue_id >= max(dev->real_num_rx_queues,
+ dev->real_num_tx_queues))
return -EINVAL;
- if (queue_id < dev->real_num_rx_queues)
- dev->_rx[queue_id].pool = pool;
- if (queue_id < dev->real_num_tx_queues)
- dev->_tx[queue_id].pool = pool;
+ if (queue_id >= dev->real_num_rx_queues ||
+ !netif_rxq_is_leased(dev, queue_id))
+ return __xsk_reg_pool_at_qid(dev, pool, queue_id);
+ if (!netif_is_queue_leasee(dev))
+ return -EBUSY;
- return 0;
+ hw_rxq = __netif_get_rx_queue(dev, queue_id)->lease;
+
+ netdev_lock(hw_rxq->dev);
+ queue_id = get_netdev_rx_queue_index(hw_rxq);
+ ret = __xsk_reg_pool_at_qid(hw_rxq->dev, pool, queue_id);
+ netdev_unlock(hw_rxq->dev);
+
+ return ret;
}
static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff_xsk *xskb, u32 len,
@@ -152,6 +204,17 @@ static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff_xsk *xskb, u32 len,
return 0;
}
+static void __xsk_rcv_zc_safe(struct xdp_sock *xs, struct xdp_buff_xsk *xskb,
+ u32 len, u32 flags)
+{
+ u64 addr;
+
+ addr = xp_get_handle(xskb, xskb->pool);
+ __xskq_prod_reserve_desc(xs->rx, addr, len, flags);
+
+ xp_release(xskb);
+}
+
static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
@@ -159,26 +222,32 @@ static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
struct xdp_buff_xsk *pos, *tmp;
struct list_head *xskb_list;
u32 contd = 0;
+ u32 num_desc;
int err;
- if (frags)
- contd = XDP_PKT_CONTD;
+ if (likely(!frags)) {
+ err = __xsk_rcv_zc(xs, xskb, len, contd);
+ if (err)
+ goto err;
+ return 0;
+ }
- err = __xsk_rcv_zc(xs, xskb, len, contd);
- if (err)
+ contd = XDP_PKT_CONTD;
+ num_desc = xdp_get_shared_info_from_buff(xdp)->nr_frags + 1;
+ if (xskq_prod_nb_free(xs->rx, num_desc) < num_desc) {
+ xs->rx_queue_full++;
+ err = -ENOBUFS;
goto err;
- if (likely(!frags))
- return 0;
+ }
+ __xsk_rcv_zc_safe(xs, xskb, len, contd);
xskb_list = &xskb->pool->xskb_list;
list_for_each_entry_safe(pos, tmp, xskb_list, list_node) {
if (list_is_singular(xskb_list))
contd = 0;
len = pos->xdp.data_end - pos->xdp.data;
- err = __xsk_rcv_zc(xs, pos, len, contd);
- if (err)
- goto err;
- list_del(&pos->list_node);
+ __xsk_rcv_zc_safe(xs, pos, len, contd);
+ list_del_init(&pos->list_node);
}
return 0;
@@ -225,7 +294,7 @@ static u32 xsk_copy_xdp(void *to, void **from, u32 to_len,
static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
- u32 frame_size = xsk_pool_get_rx_frame_size(xs->pool);
+ u32 frame_size = __xsk_pool_get_rx_frame_size(xs->pool);
void *copy_from = xsk_copy_xdp_start(xdp), *copy_to;
u32 from_len, meta_len, rem, num_desc;
struct xdp_buff_xsk *xskb;
@@ -284,7 +353,8 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
rem -= copied;
xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp);
- __xsk_rcv_zc(xs, xskb, copied - meta_len, rem ? XDP_PKT_CONTD : 0);
+ __xsk_rcv_zc_safe(xs, xskb, copied - meta_len,
+ rem ? XDP_PKT_CONTD : 0);
meta_len = 0;
} while (rem);
@@ -299,6 +369,13 @@ static bool xsk_tx_writeable(struct xdp_sock *xs)
return true;
}
+static void __xsk_tx_release(struct xdp_sock *xs)
+{
+ __xskq_cons_release(xs->tx);
+ if (xsk_tx_writeable(xs))
+ xs->sk.sk_write_space(&xs->sk);
+}
+
static bool xsk_is_bound(struct xdp_sock *xs)
{
if (READ_ONCE(xs->state) == XSK_BOUND) {
@@ -309,15 +386,39 @@ static bool xsk_is_bound(struct xdp_sock *xs)
return false;
}
+static bool xsk_dev_queue_valid(const struct xdp_sock *xs,
+ const struct xdp_rxq_info *info)
+{
+ struct net_device *dev = xs->dev;
+ u32 queue_index = xs->queue_id;
+ struct netdev_rx_queue *rxq;
+
+ if (info->dev == dev &&
+ info->queue_index == queue_index)
+ return true;
+
+ if (queue_index < dev->real_num_rx_queues) {
+ rxq = READ_ONCE(__netif_get_rx_queue(dev, queue_index)->lease);
+ if (!rxq)
+ return false;
+
+ dev = rxq->dev;
+ queue_index = get_netdev_rx_queue_index(rxq);
+
+ return info->dev == dev &&
+ info->queue_index == queue_index;
+ }
+ return false;
+}
+
static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
if (!xsk_is_bound(xs))
return -ENXIO;
-
- if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
+ if (!xsk_dev_queue_valid(xs, xdp->rxq))
return -EINVAL;
- if (len > xsk_pool_get_rx_frame_size(xs->pool) && !xs->sg) {
+ if (len > __xsk_pool_get_rx_frame_size(xs->pool) && !xs->sg) {
xs->rx_dropped++;
return -ENOSPC;
}
@@ -337,13 +438,14 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
u32 len = xdp_get_buff_len(xdp);
int err;
- spin_lock_bh(&xs->rx_lock);
err = xsk_rcv_check(xs, xdp, len);
if (!err) {
+ spin_lock_bh(&xs->pool->rx_lock);
err = __xsk_rcv(xs, xdp, len);
xsk_flush(xs);
+ spin_unlock_bh(&xs->pool->rx_lock);
}
- spin_unlock_bh(&xs->rx_lock);
+
return err;
}
@@ -405,11 +507,8 @@ void xsk_tx_release(struct xsk_buff_pool *pool)
struct xdp_sock *xs;
rcu_read_lock();
- list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
- __xskq_cons_release(xs->tx);
- if (xsk_tx_writeable(xs))
- xs->sk.sk_write_space(&xs->sk);
- }
+ list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list)
+ __xsk_tx_release(xs);
rcu_read_unlock();
}
EXPORT_SYMBOL(xsk_tx_release);
@@ -526,42 +625,126 @@ static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
}
-static int xsk_cq_reserve_addr_locked(struct xsk_buff_pool *pool, u64 addr)
+static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool)
{
- unsigned long flags;
int ret;
- spin_lock_irqsave(&pool->cq_lock, flags);
- ret = xskq_prod_reserve_addr(pool->cq, addr);
- spin_unlock_irqrestore(&pool->cq_lock, flags);
+ spin_lock(&pool->cq->cq_cached_prod_lock);
+ ret = xskq_prod_reserve(pool->cq);
+ spin_unlock(&pool->cq->cq_cached_prod_lock);
return ret;
}
-static void xsk_cq_submit_locked(struct xsk_buff_pool *pool, u32 n)
+static bool xsk_skb_destructor_is_addr(struct sk_buff *skb)
{
- unsigned long flags;
+ return (uintptr_t)skb_shinfo(skb)->destructor_arg & 0x1UL;
+}
- spin_lock_irqsave(&pool->cq_lock, flags);
- xskq_prod_submit_n(pool->cq, n);
- spin_unlock_irqrestore(&pool->cq_lock, flags);
+static u64 xsk_skb_destructor_get_addr(struct sk_buff *skb)
+{
+ return (u64)((uintptr_t)skb_shinfo(skb)->destructor_arg & ~0x1UL);
}
-static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
+static struct xsk_addrs *__xsk_addrs_alloc(struct sk_buff *skb, u64 addr)
{
- unsigned long flags;
+ struct xsk_addrs *xsk_addr;
- spin_lock_irqsave(&pool->cq_lock, flags);
- xskq_prod_cancel_n(pool->cq, n);
- spin_unlock_irqrestore(&pool->cq_lock, flags);
+ xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
+ if (unlikely(!xsk_addr))
+ return NULL;
+
+ xsk_addr->addrs[0] = addr;
+ skb_shinfo(skb)->destructor_arg = (void *)xsk_addr;
+ return xsk_addr;
+}
+
+static struct xsk_addrs *xsk_addrs_alloc(struct sk_buff *skb)
+{
+ struct xsk_addrs *xsk_addr;
+
+ if (!xsk_skb_destructor_is_addr(skb))
+ return (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+
+ xsk_addr = __xsk_addrs_alloc(skb, xsk_skb_destructor_get_addr(skb));
+ if (likely(xsk_addr))
+ xsk_addr->num_descs = 1;
+ return xsk_addr;
+}
+
+static int xsk_skb_destructor_set_addr(struct sk_buff *skb, u64 addr)
+{
+ if (IS_ENABLED(CONFIG_64BIT)) {
+ skb_shinfo(skb)->destructor_arg = (void *)((uintptr_t)addr | 0x1UL);
+ return 0;
+ }
+
+ if (unlikely(!__xsk_addrs_alloc(skb, addr)))
+ return -ENOMEM;
+ return 0;
+}
+
+static void xsk_inc_num_desc(struct sk_buff *skb)
+{
+ struct xsk_addrs *xsk_addr;
+
+ if (!xsk_skb_destructor_is_addr(skb)) {
+ xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+ xsk_addr->num_descs++;
+ }
}
static u32 xsk_get_num_desc(struct sk_buff *skb)
{
- return skb ? (long)skb_shinfo(skb)->destructor_arg : 0;
+ struct xsk_addrs *xsk_addr;
+
+ if (xsk_skb_destructor_is_addr(skb))
+ return 1;
+
+ xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+
+ return xsk_addr->num_descs;
}
-static void xsk_destruct_skb(struct sk_buff *skb)
+static void xsk_cq_submit_addr_locked(struct xsk_buff_pool *pool,
+ struct sk_buff *skb)
+{
+ u32 num_descs = xsk_get_num_desc(skb);
+ struct xsk_addrs *xsk_addr;
+ u32 descs_processed = 0;
+ unsigned long flags;
+ u32 idx, i;
+
+ spin_lock_irqsave(&pool->cq_prod_lock, flags);
+ idx = xskq_get_prod(pool->cq);
+
+ if (unlikely(!xsk_skb_destructor_is_addr(skb))) {
+ xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+
+ for (i = 0; i < num_descs; i++) {
+ xskq_prod_write_addr(pool->cq, idx + descs_processed,
+ xsk_addr->addrs[i]);
+ descs_processed++;
+ }
+ kmem_cache_free(xsk_tx_generic_cache, xsk_addr);
+ } else {
+ xskq_prod_write_addr(pool->cq, idx,
+ xsk_skb_destructor_get_addr(skb));
+ descs_processed++;
+ }
+ xskq_prod_submit_n(pool->cq, descs_processed);
+ spin_unlock_irqrestore(&pool->cq_prod_lock, flags);
+}
+
+static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
+{
+ spin_lock(&pool->cq->cq_cached_prod_lock);
+ xskq_prod_cancel_n(pool->cq, n);
+ spin_unlock(&pool->cq->cq_cached_prod_lock);
+}
+
+INDIRECT_CALLABLE_SCOPE
+void xsk_destruct_skb(struct sk_buff *skb)
{
struct xsk_tx_metadata_compl *compl = &skb_shinfo(skb)->xsk_meta;
@@ -570,23 +753,39 @@ static void xsk_destruct_skb(struct sk_buff *skb)
*compl->tx_timestamp = ktime_get_tai_fast_ns();
}
- xsk_cq_submit_locked(xdp_sk(skb->sk)->pool, xsk_get_num_desc(skb));
+ xsk_cq_submit_addr_locked(xdp_sk(skb->sk)->pool, skb);
sock_wfree(skb);
}
-static void xsk_set_destructor_arg(struct sk_buff *skb)
+static int xsk_skb_init_misc(struct sk_buff *skb, struct xdp_sock *xs,
+ u64 addr)
{
- long num = xsk_get_num_desc(xdp_sk(skb->sk)->skb) + 1;
+ int err;
- skb_shinfo(skb)->destructor_arg = (void *)num;
+ err = xsk_skb_destructor_set_addr(skb, addr);
+ if (unlikely(err))
+ return err;
+
+ skb->dev = xs->dev;
+ skb->priority = READ_ONCE(xs->sk.sk_priority);
+ skb->mark = READ_ONCE(xs->sk.sk_mark);
+ skb->destructor = xsk_destruct_skb;
+ return 0;
}
static void xsk_consume_skb(struct sk_buff *skb)
{
struct xdp_sock *xs = xdp_sk(skb->sk);
+ u32 num_descs = xsk_get_num_desc(skb);
+ struct xsk_addrs *xsk_addr;
+
+ if (unlikely(!xsk_skb_destructor_is_addr(skb))) {
+ xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+ kmem_cache_free(xsk_tx_generic_cache, xsk_addr);
+ }
skb->destructor = sock_wfree;
- xsk_cq_cancel_locked(xs->pool, xsk_get_num_desc(skb));
+ xsk_cq_cancel_locked(xs->pool, num_descs);
/* Free skb without triggering the perf drop trace */
consume_skb(skb);
xs->skb = NULL;
@@ -598,6 +797,45 @@ static void xsk_drop_skb(struct sk_buff *skb)
xsk_consume_skb(skb);
}
+static int xsk_skb_metadata(struct sk_buff *skb, void *buffer,
+ struct xdp_desc *desc, struct xsk_buff_pool *pool,
+ u32 hr)
+{
+ struct xsk_tx_metadata *meta = NULL;
+
+ if (unlikely(pool->tx_metadata_len == 0))
+ return -EINVAL;
+
+ meta = buffer - pool->tx_metadata_len;
+ if (unlikely(!xsk_buff_valid_tx_metadata(meta)))
+ return -EINVAL;
+
+ if (meta->flags & XDP_TXMD_FLAGS_CHECKSUM) {
+ if (unlikely(meta->request.csum_start +
+ meta->request.csum_offset +
+ sizeof(__sum16) > desc->len))
+ return -EINVAL;
+
+ skb->csum_start = hr + meta->request.csum_start;
+ skb->csum_offset = meta->request.csum_offset;
+ skb->ip_summed = CHECKSUM_PARTIAL;
+
+ if (unlikely(pool->tx_sw_csum)) {
+ int err;
+
+ err = skb_checksum_help(skb);
+ if (err)
+ return err;
+ }
+ }
+
+ if (meta->flags & XDP_TXMD_FLAGS_LAUNCH_TIME)
+ skb->skb_mstamp_ns = meta->request.launch_time;
+ xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta);
+
+ return 0;
+}
+
static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
struct xdp_desc *desc)
{
@@ -609,6 +847,9 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
int err, i;
u64 addr;
+ addr = desc->addr;
+ buffer = xsk_buff_raw_get_data(pool, addr);
+
if (!skb) {
hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom));
@@ -617,19 +858,39 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
return ERR_PTR(err);
skb_reserve(skb, hr);
+ if (desc->options & XDP_TX_METADATA) {
+ err = xsk_skb_metadata(skb, buffer, desc, pool, hr);
+ if (unlikely(err)) {
+ kfree_skb(skb);
+ return ERR_PTR(err);
+ }
+ }
+ } else {
+ struct xsk_addrs *xsk_addr;
+
+ xsk_addr = xsk_addrs_alloc(skb);
+ if (!xsk_addr)
+ return ERR_PTR(-ENOMEM);
+
+ /* in case of -EOVERFLOW that could happen below,
+ * xsk_consume_skb() will release this node as whole skb
+ * would be dropped, which implies freeing all list elements
+ */
+ xsk_addr->addrs[xsk_addr->num_descs] = desc->addr;
}
- addr = desc->addr;
len = desc->len;
ts = pool->unaligned ? len : pool->chunk_size;
- buffer = xsk_buff_raw_get_data(pool, addr);
offset = offset_in_page(buffer);
addr = buffer - pool->addrs;
for (copied = 0, i = skb_shinfo(skb)->nr_frags; copied < len; i++) {
- if (unlikely(i >= MAX_SKB_FRAGS))
+ if (unlikely(i >= MAX_SKB_FRAGS)) {
+ if (!xs->skb)
+ kfree_skb(skb);
return ERR_PTR(-EOVERFLOW);
+ }
page = pool->umem->pgs[addr >> PAGE_SHIFT];
get_page(page);
@@ -654,16 +915,15 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
struct xdp_desc *desc)
{
- struct xsk_tx_metadata *meta = NULL;
struct net_device *dev = xs->dev;
struct sk_buff *skb = xs->skb;
- bool first_frag = false;
int err;
if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) {
skb = xsk_build_skb_zerocopy(xs, desc);
if (IS_ERR(skb)) {
err = PTR_ERR(skb);
+ skb = NULL;
goto free_err;
}
} else {
@@ -674,8 +934,6 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
len = desc->len;
if (!skb) {
- first_frag = true;
-
hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom));
tr = dev->needed_tailroom;
skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err);
@@ -688,11 +946,25 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
err = skb_store_bits(skb, 0, buffer, len);
if (unlikely(err))
goto free_err;
+
+ if (desc->options & XDP_TX_METADATA) {
+ err = xsk_skb_metadata(skb, buffer, desc,
+ xs->pool, hr);
+ if (unlikely(err))
+ goto free_err;
+ }
} else {
int nr_frags = skb_shinfo(skb)->nr_frags;
+ struct xsk_addrs *xsk_addr;
struct page *page;
u8 *vaddr;
+ xsk_addr = xsk_addrs_alloc(skb);
+ if (!xsk_addr) {
+ err = -ENOMEM;
+ goto free_err;
+ }
+
if (unlikely(nr_frags == (MAX_SKB_FRAGS - 1) && xp_mb_desc(desc))) {
err = -EOVERFLOW;
goto free_err;
@@ -710,58 +982,33 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
skb_add_rx_frag(skb, nr_frags, page, 0, len, PAGE_SIZE);
refcount_add(PAGE_SIZE, &xs->sk.sk_wmem_alloc);
- }
-
- if (first_frag && desc->options & XDP_TX_METADATA) {
- if (unlikely(xs->pool->tx_metadata_len == 0)) {
- err = -EINVAL;
- goto free_err;
- }
- meta = buffer - xs->pool->tx_metadata_len;
- if (unlikely(!xsk_buff_valid_tx_metadata(meta))) {
- err = -EINVAL;
- goto free_err;
- }
-
- if (meta->flags & XDP_TXMD_FLAGS_CHECKSUM) {
- if (unlikely(meta->request.csum_start +
- meta->request.csum_offset +
- sizeof(__sum16) > len)) {
- err = -EINVAL;
- goto free_err;
- }
-
- skb->csum_start = hr + meta->request.csum_start;
- skb->csum_offset = meta->request.csum_offset;
- skb->ip_summed = CHECKSUM_PARTIAL;
-
- if (unlikely(xs->pool->tx_sw_csum)) {
- err = skb_checksum_help(skb);
- if (err)
- goto free_err;
- }
- }
+ xsk_addr->addrs[xsk_addr->num_descs] = desc->addr;
}
}
- skb->dev = dev;
- skb->priority = READ_ONCE(xs->sk.sk_priority);
- skb->mark = READ_ONCE(xs->sk.sk_mark);
- skb->destructor = xsk_destruct_skb;
- xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta);
- xsk_set_destructor_arg(skb);
+ if (!xs->skb) {
+ err = xsk_skb_init_misc(skb, xs, desc->addr);
+ if (unlikely(err))
+ goto free_err;
+ }
+ xsk_inc_num_desc(skb);
return skb;
free_err:
- if (first_frag && skb)
+ if (skb && !xs->skb)
kfree_skb(skb);
if (err == -EOVERFLOW) {
- /* Drop the packet */
- xsk_set_destructor_arg(xs->skb);
- xsk_drop_skb(xs->skb);
+ if (xs->skb) {
+ /* Drop the packet */
+ xsk_inc_num_desc(xs->skb);
+ xsk_drop_skb(xs->skb);
+ } else {
+ xsk_cq_cancel_locked(xs->pool, 1);
+ xs->tx->invalid_descs++;
+ }
xskq_cons_release(xs->tx);
} else {
/* Let application retry */
@@ -774,10 +1021,10 @@ free_err:
static int __xsk_generic_xmit(struct sock *sk)
{
struct xdp_sock *xs = xdp_sk(sk);
- u32 max_batch = TX_BATCH_SIZE;
bool sent_frame = false;
struct xdp_desc desc;
struct sk_buff *skb;
+ u32 max_batch;
int err = 0;
mutex_lock(&xs->mutex);
@@ -791,6 +1038,7 @@ static int __xsk_generic_xmit(struct sock *sk)
if (xs->queue_id >= xs->dev->real_num_tx_queues)
goto out;
+ max_batch = READ_ONCE(xs->max_tx_budget);
while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
if (max_batch-- == 0) {
err = -EAGAIN;
@@ -802,8 +1050,11 @@ static int __xsk_generic_xmit(struct sock *sk)
* if there is space in it. This avoids having to implement
* any buffering in the Tx path.
*/
- if (xsk_cq_reserve_addr_locked(xs->pool, desc.addr))
+ err = xsk_cq_reserve_locked(xs->pool);
+ if (err) {
+ err = -EAGAIN;
goto out;
+ }
skb = xsk_build_skb(xs, &desc);
if (IS_ERR(skb)) {
@@ -850,8 +1101,7 @@ static int __xsk_generic_xmit(struct sock *sk)
out:
if (sent_frame)
- if (xsk_tx_writeable(xs))
- sk->sk_write_space(sk);
+ __xsk_tx_release(xs);
mutex_unlock(&xs->mutex);
return err;
@@ -875,7 +1125,7 @@ static bool xsk_no_wakeup(struct sock *sk)
#ifdef CONFIG_NET_RX_BUSY_POLL
/* Prefer busy-polling, skip the wakeup. */
return READ_ONCE(sk->sk_prefer_busy_poll) && READ_ONCE(sk->sk_ll_usec) &&
- READ_ONCE(sk->sk_napi_id) >= MIN_NAPI_ID;
+ napi_id_valid(READ_ONCE(sk->sk_napi_id));
#else
return false;
#endif
@@ -1141,7 +1391,7 @@ static bool xsk_validate_queues(struct xdp_sock *xs)
return xs->fq_tmp && xs->cq_tmp;
}
-static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
+static int xsk_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr_len)
{
struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
struct sock *sk = sock->sk;
@@ -1178,6 +1428,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
goto out_release;
}
+ netdev_lock_ops(dev);
+
if (!xs->rx && !xs->tx) {
err = -EINVAL;
goto out_unlock;
@@ -1216,6 +1468,13 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
}
if (umem_xs->queue_id != qid || umem_xs->dev != dev) {
+ /* One fill and completion ring required for each queue id. */
+ if (!xsk_validate_queues(xs)) {
+ err = -EINVAL;
+ sockfd_put(sock);
+ goto out_unlock;
+ }
+
/* Share the umem with another socket on another qid
* and/or device.
*/
@@ -1294,7 +1553,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
xs->queue_id = qid;
xp_add_xsk(xs->pool, xs);
- if (xs->zc && qid < dev->real_num_rx_queues) {
+ if (qid < dev->real_num_rx_queues) {
struct netdev_rx_queue *rxq;
rxq = __netif_get_rx_queue(dev, qid);
@@ -1312,6 +1571,7 @@ out_unlock:
smp_wmb();
WRITE_ONCE(xs->state, XSK_BOUND);
}
+ netdev_unlock_ops(dev);
out_release:
mutex_unlock(&xs->mutex);
rtnl_unlock();
@@ -1426,6 +1686,21 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
mutex_unlock(&xs->mutex);
return err;
}
+ case XDP_MAX_TX_SKB_BUDGET:
+ {
+ unsigned int budget;
+
+ if (optlen != sizeof(budget))
+ return -EINVAL;
+ if (copy_from_sockptr(&budget, optval, sizeof(budget)))
+ return -EFAULT;
+ if (!xs->tx ||
+ budget < TX_BATCH_SIZE || budget > xs->tx->nentries)
+ return -EACCES;
+
+ WRITE_ONCE(xs->max_tx_budget, budget);
+ return 0;
+ }
default:
break;
}
@@ -1723,8 +1998,8 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol,
xs = xdp_sk(sk);
xs->state = XSK_READY;
+ xs->max_tx_budget = TX_BATCH_SIZE;
mutex_init(&xs->mutex);
- spin_lock_init(&xs->rx_lock);
INIT_LIST_HEAD(&xs->map_list);
spin_lock_init(&xs->map_list_lock);
@@ -1785,8 +2060,18 @@ static int __init xsk_init(void)
if (err)
goto out_pernet;
+ xsk_tx_generic_cache = kmem_cache_create("xsk_generic_xmit_cache",
+ sizeof(struct xsk_addrs),
+ 0, SLAB_HWCACHE_ALIGN, NULL);
+ if (!xsk_tx_generic_cache) {
+ err = -ENOMEM;
+ goto out_unreg_notif;
+ }
+
return 0;
+out_unreg_notif:
+ unregister_netdevice_notifier(&xsk_netdev_notifier);
out_pernet:
unregister_pernet_subsys(&xsk_net_ops);
out_sk:
diff --git a/net/xdp/xsk.h b/net/xdp/xsk.h
index a4bc4749faac..7c811b5cce76 100644
--- a/net/xdp/xsk.h
+++ b/net/xdp/xsk.h
@@ -4,13 +4,6 @@
#ifndef XSK_H_
#define XSK_H_
-/* Masks for xdp_umem_page flags.
- * The low 12-bits of the addr will be 0 since this is the page address, so we
- * can use them for flags.
- */
-#define XSK_NEXT_PG_CONTIG_SHIFT 0
-#define XSK_NEXT_PG_CONTIG_MASK BIT_ULL(XSK_NEXT_PG_CONTIG_SHIFT)
-
struct xdp_ring_offset_v1 {
__u64 producer;
__u64 consumer;
diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
index 1f7975b49657..d981cfdd8535 100644
--- a/net/xdp/xsk_buff_pool.c
+++ b/net/xdp/xsk_buff_pool.c
@@ -1,5 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
+#include <linux/netdevice.h>
+#include <net/netdev_lock.h>
#include <net/xsk_buff_pool.h>
#include <net/xdp_sock.h>
#include <net/xdp_sock_drv.h>
@@ -8,28 +10,26 @@
#include "xdp_umem.h"
#include "xsk.h"
+#define ETH_PAD_LEN (ETH_HLEN + 2 * VLAN_HLEN + ETH_FCS_LEN)
+
void xp_add_xsk(struct xsk_buff_pool *pool, struct xdp_sock *xs)
{
- unsigned long flags;
-
if (!xs->tx)
return;
- spin_lock_irqsave(&pool->xsk_tx_list_lock, flags);
+ spin_lock(&pool->xsk_tx_list_lock);
list_add_rcu(&xs->tx_list, &pool->xsk_tx_list);
- spin_unlock_irqrestore(&pool->xsk_tx_list_lock, flags);
+ spin_unlock(&pool->xsk_tx_list_lock);
}
void xp_del_xsk(struct xsk_buff_pool *pool, struct xdp_sock *xs)
{
- unsigned long flags;
-
if (!xs->tx)
return;
- spin_lock_irqsave(&pool->xsk_tx_list_lock, flags);
+ spin_lock(&pool->xsk_tx_list_lock);
list_del_rcu(&xs->tx_list);
- spin_unlock_irqrestore(&pool->xsk_tx_list_lock, flags);
+ spin_unlock(&pool->xsk_tx_list_lock);
}
void xp_destroy(struct xsk_buff_pool *pool)
@@ -44,8 +44,7 @@ void xp_destroy(struct xsk_buff_pool *pool)
int xp_alloc_tx_descs(struct xsk_buff_pool *pool, struct xdp_sock *xs)
{
- pool->tx_descs = kvcalloc(xs->tx->nentries, sizeof(*pool->tx_descs),
- GFP_KERNEL);
+ pool->tx_descs = kvzalloc_objs(*pool->tx_descs, xs->tx->nentries);
if (!pool->tx_descs)
return -ENOMEM;
@@ -61,11 +60,11 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
u32 i, entries;
entries = unaligned ? umem->chunks : 0;
- pool = kvzalloc(struct_size(pool, free_heads, entries), GFP_KERNEL);
+ pool = kvzalloc_flex(*pool, free_heads, entries);
if (!pool)
goto out;
- pool->heads = kvcalloc(umem->chunks, sizeof(*pool->heads), GFP_KERNEL);
+ pool->heads = kvzalloc_objs(*pool->heads, umem->chunks);
if (!pool->heads)
goto out;
@@ -87,11 +86,13 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
pool->addrs = umem->addrs;
pool->tx_metadata_len = umem->tx_metadata_len;
pool->tx_sw_csum = umem->flags & XDP_UMEM_TX_SW_CSUM;
+ spin_lock_init(&pool->rx_lock);
INIT_LIST_HEAD(&pool->free_list);
INIT_LIST_HEAD(&pool->xskb_list);
INIT_LIST_HEAD(&pool->xsk_tx_list);
spin_lock_init(&pool->xsk_tx_list_lock);
- spin_lock_init(&pool->cq_lock);
+ spin_lock_init(&pool->cq_prod_lock);
+ spin_lock_init(&xs->cq_tmp->cq_cached_prod_lock);
refcount_set(&pool->users, 1);
pool->fq = xs->fq_tmp;
@@ -105,7 +106,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
if (pool->unaligned)
pool->free_heads[i] = xskb;
else
- xp_init_xskb_addr(xskb, pool, i * pool->chunk_size);
+ xp_init_xskb_addr(xskb, pool, (u64)i * pool->chunk_size);
}
return pool;
@@ -155,15 +156,15 @@ static void xp_disable_drv_zc(struct xsk_buff_pool *pool)
}
}
-#define NETDEV_XDP_ACT_ZC (NETDEV_XDP_ACT_BASIC | \
- NETDEV_XDP_ACT_REDIRECT | \
- NETDEV_XDP_ACT_XSK_ZEROCOPY)
-
int xp_assign_dev(struct xsk_buff_pool *pool,
struct net_device *netdev, u16 queue_id, u16 flags)
{
+ u32 needed = netdev->mtu + ETH_PAD_LEN;
+ u32 segs = netdev->xdp_zc_max_segs;
+ bool mbuf = flags & XDP_USE_SG;
bool force_zc, force_copy;
struct netdev_bpf bpf;
+ u32 frame_size;
int err = 0;
ASSERT_RTNL();
@@ -174,6 +175,9 @@ int xp_assign_dev(struct xsk_buff_pool *pool,
if (force_zc && force_copy)
return -EINVAL;
+ if (pool->tx_sw_csum && (netdev->priv_flags & IFF_TX_SKB_NO_LINEAR))
+ return -EOPNOTSUPP;
+
if (xsk_get_pool_from_qid(netdev, queue_id))
return -EBUSY;
@@ -183,7 +187,7 @@ int xp_assign_dev(struct xsk_buff_pool *pool,
if (err)
return err;
- if (flags & XDP_USE_SG)
+ if (mbuf)
pool->umem->flags |= XDP_UMEM_SG_FLAG;
if (flags & XDP_USE_NEED_WAKEUP)
@@ -200,13 +204,29 @@ int xp_assign_dev(struct xsk_buff_pool *pool,
/* For copy-mode, we are done. */
return 0;
- if ((netdev->xdp_features & NETDEV_XDP_ACT_ZC) != NETDEV_XDP_ACT_ZC) {
+ if ((netdev->xdp_features & NETDEV_XDP_ACT_XSK) != NETDEV_XDP_ACT_XSK) {
err = -EOPNOTSUPP;
goto err_unreg_pool;
}
- if (netdev->xdp_zc_max_segs == 1 && (flags & XDP_USE_SG)) {
- err = -EOPNOTSUPP;
+ if (mbuf) {
+ if (segs == 1) {
+ err = -EOPNOTSUPP;
+ goto err_unreg_pool;
+ }
+ } else {
+ segs = 1;
+ }
+
+ /* open-code xsk_pool_get_rx_frame_size() as pool->dev is not
+ * set yet at this point; we are before getting down to driver
+ */
+ frame_size = __xsk_pool_get_rx_frame_size(pool) -
+ xsk_pool_get_tailroom(mbuf);
+ frame_size = ALIGN_DOWN(frame_size, 128);
+
+ if (needed > frame_size * segs) {
+ err = -EINVAL;
goto err_unreg_pool;
}
@@ -219,6 +239,7 @@ int xp_assign_dev(struct xsk_buff_pool *pool,
bpf.xsk.pool = pool;
bpf.xsk.queue_id = queue_id;
+ netdev_ops_assert_locked(netdev);
err = netdev->netdev_ops->ndo_bpf(netdev, &bpf);
if (err)
goto err_unreg_pool;
@@ -250,11 +271,11 @@ int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_sock *umem_xs,
u16 flags;
struct xdp_umem *umem = umem_xs->umem;
- /* One fill and completion ring required for each queue id. */
- if (!pool->fq || !pool->cq)
- return -EINVAL;
-
flags = umem->zc ? XDP_ZEROCOPY : XDP_COPY;
+
+ if (umem->flags & XDP_UMEM_SG_FLAG)
+ flags |= XDP_USE_SG;
+
if (umem_xs->pool->uses_need_wakeup)
flags |= XDP_USE_NEED_WAKEUP;
@@ -263,13 +284,17 @@ int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_sock *umem_xs,
void xp_clear_dev(struct xsk_buff_pool *pool)
{
+ struct net_device *netdev = pool->netdev;
+
if (!pool->netdev)
return;
+ netdev_lock_ops(netdev);
xp_disable_drv_zc(pool);
xsk_clear_pool_at_qid(pool->netdev, pool->queue_id);
- dev_put(pool->netdev);
pool->netdev = NULL;
+ netdev_unlock_ops(netdev);
+ dev_put(netdev);
}
static void xp_release_deferred(struct work_struct *work)
@@ -331,11 +356,11 @@ static struct xsk_dma_map *xp_create_dma_map(struct device *dev, struct net_devi
{
struct xsk_dma_map *dma_map;
- dma_map = kzalloc(sizeof(*dma_map), GFP_KERNEL);
+ dma_map = kzalloc_obj(*dma_map);
if (!dma_map)
return NULL;
- dma_map->dma_pages = kvcalloc(nr_pages, sizeof(*dma_map->dma_pages), GFP_KERNEL);
+ dma_map->dma_pages = kvzalloc_objs(*dma_map->dma_pages, nr_pages);
if (!dma_map->dma_pages) {
kfree(dma_map);
return NULL;
@@ -423,7 +448,8 @@ static int xp_init_dma_info(struct xsk_buff_pool *pool, struct xsk_dma_map *dma_
}
}
- pool->dma_pages = kvcalloc(dma_map->dma_pages_cnt, sizeof(*pool->dma_pages), GFP_KERNEL);
+ pool->dma_pages = kvzalloc_objs(*pool->dma_pages,
+ dma_map->dma_pages_cnt);
if (!pool->dma_pages)
return -ENOMEM;
@@ -699,18 +725,56 @@ void xp_free(struct xdp_buff_xsk *xskb)
}
EXPORT_SYMBOL(xp_free);
-void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr)
+static u64 __xp_raw_get_addr(const struct xsk_buff_pool *pool, u64 addr)
+{
+ return pool->unaligned ? xp_unaligned_add_offset_to_addr(addr) : addr;
+}
+
+static void *__xp_raw_get_data(const struct xsk_buff_pool *pool, u64 addr)
{
- addr = pool->unaligned ? xp_unaligned_add_offset_to_addr(addr) : addr;
return pool->addrs + addr;
}
+
+void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr)
+{
+ return __xp_raw_get_data(pool, __xp_raw_get_addr(pool, addr));
+}
EXPORT_SYMBOL(xp_raw_get_data);
-dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr)
+static dma_addr_t __xp_raw_get_dma(const struct xsk_buff_pool *pool, u64 addr)
{
- addr = pool->unaligned ? xp_unaligned_add_offset_to_addr(addr) : addr;
return (pool->dma_pages[addr >> PAGE_SHIFT] &
~XSK_NEXT_PG_CONTIG_MASK) +
(addr & ~PAGE_MASK);
}
+
+dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr)
+{
+ return __xp_raw_get_dma(pool, __xp_raw_get_addr(pool, addr));
+}
EXPORT_SYMBOL(xp_raw_get_dma);
+
+/**
+ * xp_raw_get_ctx - get &xdp_desc context
+ * @pool: XSk buff pool desc address belongs to
+ * @addr: desc address (from userspace)
+ *
+ * Helper for getting desc's DMA address and metadata pointer, if present.
+ * Saves one call on hotpath, double calculation of the actual address,
+ * and inline checks for metadata presence and sanity.
+ *
+ * Return: new &xdp_desc_ctx struct containing desc's DMA address and metadata
+ * pointer, if it is present and valid (initialized to %NULL otherwise).
+ */
+struct xdp_desc_ctx xp_raw_get_ctx(const struct xsk_buff_pool *pool, u64 addr)
+{
+ struct xdp_desc_ctx ret;
+
+ addr = __xp_raw_get_addr(pool, addr);
+
+ ret.dma = __xp_raw_get_dma(pool, addr);
+ ret.meta = __xsk_buff_get_metadata(pool, __xp_raw_get_data(pool, addr));
+
+ return ret;
+}
+EXPORT_SYMBOL(xp_raw_get_ctx);
diff --git a/net/xdp/xsk_diag.c b/net/xdp/xsk_diag.c
index 09dcea0cbbed..0170363eb542 100644
--- a/net/xdp/xsk_diag.c
+++ b/net/xdp/xsk_diag.c
@@ -92,7 +92,7 @@ static int xsk_diag_put_stats(const struct xdp_sock *xs, struct sk_buff *nlskb)
static int xsk_diag_fill(struct sock *sk, struct sk_buff *nlskb,
struct xdp_diag_req *req,
struct user_namespace *user_ns,
- u32 portid, u32 seq, u32 flags, int sk_ino)
+ u32 portid, u32 seq, u32 flags, u64 sk_ino)
{
struct xdp_sock *xs = xdp_sk(sk);
struct xdp_diag_msg *msg;
@@ -119,7 +119,7 @@ static int xsk_diag_fill(struct sock *sk, struct sk_buff *nlskb,
if ((req->xdiag_show & XDP_SHOW_INFO) &&
nla_put_u32(nlskb, XDP_DIAG_UID,
- from_kuid_munged(user_ns, sock_i_uid(sk))))
+ from_kuid_munged(user_ns, sk_uid(sk))))
goto out_nlmsg_trim;
if ((req->xdiag_show & XDP_SHOW_RING_CFG) &&
diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c
index d2c264030017..4dd01b7d858e 100644
--- a/net/xdp/xsk_queue.c
+++ b/net/xdp/xsk_queue.c
@@ -26,7 +26,7 @@ struct xsk_queue *xskq_create(u32 nentries, bool umem_queue)
struct xsk_queue *q;
size_t size;
- q = kzalloc(sizeof(*q), GFP_KERNEL);
+ q = kzalloc_obj(*q);
if (!q)
return NULL;
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 46d87e961ad6..3e3fbb73d23e 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -46,6 +46,11 @@ struct xsk_queue {
u64 invalid_descs;
u64 queue_empty_descs;
size_t ring_vmalloc_size;
+ /* Mutual exclusion of the completion ring in the SKB mode.
+ * Protect: when sockets share a single cq when the same netdev
+ * and queue id is shared.
+ */
+ spinlock_t cq_cached_prod_lock;
};
struct parsed_desc {
@@ -143,14 +148,24 @@ static inline bool xp_unused_options_set(u32 options)
static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool,
struct xdp_desc *desc)
{
- u64 addr = desc->addr - pool->tx_metadata_len;
- u64 len = desc->len + pool->tx_metadata_len;
- u64 offset = addr & (pool->chunk_size - 1);
+ u64 len = desc->len;
+ u64 addr, offset;
+
+ if (!len)
+ return false;
- if (!desc->len)
+ /* Can overflow if desc->addr < pool->tx_metadata_len */
+ if (check_sub_overflow(desc->addr, pool->tx_metadata_len, &addr))
return false;
- if (offset + len > pool->chunk_size)
+ offset = addr & (pool->chunk_size - 1);
+
+ /*
+ * Can't overflow: @offset is guaranteed to be < ``U32_MAX``
+ * (pool->chunk_size is ``u32``), @len is guaranteed
+ * to be <= ``U32_MAX``.
+ */
+ if (offset + len + pool->tx_metadata_len > pool->chunk_size)
return false;
if (addr >= pool->addrs_cnt)
@@ -158,27 +173,42 @@ static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool,
if (xp_unused_options_set(desc->options))
return false;
+
return true;
}
static inline bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool,
struct xdp_desc *desc)
{
- u64 addr = xp_unaligned_add_offset_to_addr(desc->addr) - pool->tx_metadata_len;
- u64 len = desc->len + pool->tx_metadata_len;
+ u64 len = desc->len;
+ u64 addr, end;
- if (!desc->len)
+ if (!len)
return false;
+ /* Can't overflow: @len is guaranteed to be <= ``U32_MAX`` */
+ len += pool->tx_metadata_len;
if (len > pool->chunk_size)
return false;
- if (addr >= pool->addrs_cnt || addr + len > pool->addrs_cnt ||
- xp_desc_crosses_non_contig_pg(pool, addr, len))
+ /* Can overflow if desc->addr is close to 0 */
+ if (check_sub_overflow(xp_unaligned_add_offset_to_addr(desc->addr),
+ pool->tx_metadata_len, &addr))
+ return false;
+
+ if (addr >= pool->addrs_cnt)
+ return false;
+
+ /* Can overflow if pool->addrs_cnt is high enough */
+ if (check_add_overflow(addr, len, &end) || end > pool->addrs_cnt)
+ return false;
+
+ if (xp_desc_crosses_non_contig_pg(pool, addr, len))
return false;
if (xp_unused_options_set(desc->options))
return false;
+
return true;
}
@@ -344,6 +374,11 @@ static inline u32 xskq_cons_present_entries(struct xsk_queue *q)
/* Functions for producers */
+static inline u32 xskq_get_prod(struct xsk_queue *q)
+{
+ return READ_ONCE(q->ring->producer);
+}
+
static inline u32 xskq_prod_nb_free(struct xsk_queue *q, u32 max)
{
u32 free_entries = q->nentries - (q->cached_prod - q->cached_cons);
@@ -390,6 +425,13 @@ static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 addr)
return 0;
}
+static inline void xskq_prod_write_addr(struct xsk_queue *q, u32 idx, u64 addr)
+{
+ struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
+
+ ring->desc[idx & q->ring_mask] = addr;
+}
+
static inline void xskq_prod_write_addr_batch(struct xsk_queue *q, struct xdp_desc *descs,
u32 nb_entries)
{
@@ -403,20 +445,26 @@ static inline void xskq_prod_write_addr_batch(struct xsk_queue *q, struct xdp_de
q->cached_prod = cached_prod;
}
-static inline int xskq_prod_reserve_desc(struct xsk_queue *q,
- u64 addr, u32 len, u32 flags)
+static inline void __xskq_prod_reserve_desc(struct xsk_queue *q,
+ u64 addr, u32 len, u32 flags)
{
struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring;
u32 idx;
- if (xskq_prod_is_full(q))
- return -ENOBUFS;
-
/* A, matches D */
idx = q->cached_prod++ & q->ring_mask;
ring->desc[idx].addr = addr;
ring->desc[idx].len = len;
ring->desc[idx].options = flags;
+}
+
+static inline int xskq_prod_reserve_desc(struct xsk_queue *q,
+ u64 addr, u32 len, u32 flags)
+{
+ if (xskq_prod_is_full(q))
+ return -ENOBUFS;
+
+ __xskq_prod_reserve_desc(q, addr, len, flags);
return 0;
}
diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c
index afa457506274..3bff346308d0 100644
--- a/net/xdp/xskmap.c
+++ b/net/xdp/xskmap.c
@@ -184,6 +184,10 @@ static long xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
}
xs = (struct xdp_sock *)sock->sk;
+ if (!READ_ONCE(xs->rx)) {
+ sockfd_put(sock);
+ return -ENOBUFS;
+ }
map_entry = &m->xsk_map[i];
node = xsk_map_node_alloc(m, map_entry);