From b7ecc1de51ca7d0a9fa8dbc3f756ab87b99a1838 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Tue, 4 Feb 2025 18:03:11 +0100 Subject: net-sysfs: move queue attribute groups outside the default groups Rx/tx queues embed their own kobject for registering their per-queue sysfs files. The issue is they're using the kobject default groups for this and entirely rely on the kobject refcounting for releasing their sysfs paths. In order to remove rtnl_trylock calls we need sysfs files not to rely on their associated kobject refcounting for their release. Thus we here move queues sysfs files from the kobject default groups to their own groups which can be removed separately. Signed-off-by: Antoine Tenart Link: https://patch.msgid.link/20250204170314.146022-3-atenart@kernel.org Signed-off-by: Jakub Kicinski --- include/net/netdev_rx_queue.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/netdev_rx_queue.h b/include/net/netdev_rx_queue.h index 596836abf7bf..af40842f229d 100644 --- a/include/net/netdev_rx_queue.h +++ b/include/net/netdev_rx_queue.h @@ -16,6 +16,7 @@ struct netdev_rx_queue { struct rps_dev_flow_table __rcu *rps_flow_table; #endif struct kobject kobj; + const struct attribute_group **groups; struct net_device *dev; netdevice_tracker dev_tracker; -- cgit v1.2.3 From 7d60fa9e1ab1e4618b2342d54b2035a0e44d19c6 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 4 Feb 2025 13:56:14 -0800 Subject: net: generalise net_iov chunk owners Currently net_iov stores a pointer to struct dmabuf_genpool_chunk_owner, which serves as a useful abstraction to share data and provide a context. However, it's too devmem specific, and we want to reuse it for other memory providers, and for that we need to decouple net_iov from devmem. Make net_iov to point to a new base structure called net_iov_area, which dmabuf_genpool_chunk_owner extends. Reviewed-by: Mina Almasry Acked-by: Jakub Kicinski Signed-off-by: Pavel Begunkov Signed-off-by: David Wei Link: https://patch.msgid.link/20250204215622.695511-4-dw@davidwei.uk Signed-off-by: Jakub Kicinski --- include/net/netmem.h | 21 ++++++++++++++++++++- net/core/devmem.c | 25 +++++++++++++------------ net/core/devmem.h | 25 +++++++++---------------- 3 files changed, 42 insertions(+), 29 deletions(-) (limited to 'include/net') diff --git a/include/net/netmem.h b/include/net/netmem.h index 1b58faa4f20f..c61d5b21e7b4 100644 --- a/include/net/netmem.h +++ b/include/net/netmem.h @@ -24,11 +24,20 @@ struct net_iov { unsigned long __unused_padding; unsigned long pp_magic; struct page_pool *pp; - struct dmabuf_genpool_chunk_owner *owner; + struct net_iov_area *owner; unsigned long dma_addr; atomic_long_t pp_ref_count; }; +struct net_iov_area { + /* Array of net_iovs for this area. */ + struct net_iov *niovs; + size_t num_niovs; + + /* Offset into the dma-buf where this chunk starts. */ + unsigned long base_virtual; +}; + /* These fields in struct page are used by the page_pool and net stack: * * struct { @@ -54,6 +63,16 @@ NET_IOV_ASSERT_OFFSET(dma_addr, dma_addr); NET_IOV_ASSERT_OFFSET(pp_ref_count, pp_ref_count); #undef NET_IOV_ASSERT_OFFSET +static inline struct net_iov_area *net_iov_owner(const struct net_iov *niov) +{ + return niov->owner; +} + +static inline unsigned int net_iov_idx(const struct net_iov *niov) +{ + return niov - net_iov_owner(niov)->niovs; +} + /* netmem */ /** diff --git a/net/core/devmem.c b/net/core/devmem.c index 66cd1ab9224f..fb0dddcb4e60 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -33,14 +33,15 @@ static void net_devmem_dmabuf_free_chunk_owner(struct gen_pool *genpool, { struct dmabuf_genpool_chunk_owner *owner = chunk->owner; - kvfree(owner->niovs); + kvfree(owner->area.niovs); kfree(owner); } static dma_addr_t net_devmem_get_dma_addr(const struct net_iov *niov) { - struct dmabuf_genpool_chunk_owner *owner = net_iov_owner(niov); + struct dmabuf_genpool_chunk_owner *owner; + owner = net_devmem_iov_to_chunk_owner(niov); return owner->base_dma_addr + ((dma_addr_t)net_iov_idx(niov) << PAGE_SHIFT); } @@ -83,7 +84,7 @@ net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding) offset = dma_addr - owner->base_dma_addr; index = offset / PAGE_SIZE; - niov = &owner->niovs[index]; + niov = &owner->area.niovs[index]; niov->pp_magic = 0; niov->pp = NULL; @@ -261,9 +262,9 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, goto err_free_chunks; } - owner->base_virtual = virtual; + owner->area.base_virtual = virtual; owner->base_dma_addr = dma_addr; - owner->num_niovs = len / PAGE_SIZE; + owner->area.num_niovs = len / PAGE_SIZE; owner->binding = binding; err = gen_pool_add_owner(binding->chunk_pool, dma_addr, @@ -275,17 +276,17 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, goto err_free_chunks; } - owner->niovs = kvmalloc_array(owner->num_niovs, - sizeof(*owner->niovs), - GFP_KERNEL); - if (!owner->niovs) { + owner->area.niovs = kvmalloc_array(owner->area.num_niovs, + sizeof(*owner->area.niovs), + GFP_KERNEL); + if (!owner->area.niovs) { err = -ENOMEM; goto err_free_chunks; } - for (i = 0; i < owner->num_niovs; i++) { - niov = &owner->niovs[i]; - niov->owner = owner; + for (i = 0; i < owner->area.num_niovs; i++) { + niov = &owner->area.niovs[i]; + niov->owner = &owner->area; page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov), net_devmem_get_dma_addr(niov)); } diff --git a/net/core/devmem.h b/net/core/devmem.h index 99782ddeca40..a2b9913e9a17 100644 --- a/net/core/devmem.h +++ b/net/core/devmem.h @@ -10,6 +10,8 @@ #ifndef _NET_DEVMEM_H #define _NET_DEVMEM_H +#include + struct netlink_ext_ack; struct net_devmem_dmabuf_binding { @@ -51,17 +53,11 @@ struct net_devmem_dmabuf_binding { * allocations from this chunk. */ struct dmabuf_genpool_chunk_owner { - /* Offset into the dma-buf where this chunk starts. */ - unsigned long base_virtual; + struct net_iov_area area; + struct net_devmem_dmabuf_binding *binding; /* dma_addr of the start of the chunk. */ dma_addr_t base_dma_addr; - - /* Array of net_iovs for this chunk. */ - struct net_iov *niovs; - size_t num_niovs; - - struct net_devmem_dmabuf_binding *binding; }; void __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding); @@ -75,20 +71,17 @@ int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, void dev_dmabuf_uninstall(struct net_device *dev); static inline struct dmabuf_genpool_chunk_owner * -net_iov_owner(const struct net_iov *niov) +net_devmem_iov_to_chunk_owner(const struct net_iov *niov) { - return niov->owner; -} + struct net_iov_area *owner = net_iov_owner(niov); -static inline unsigned int net_iov_idx(const struct net_iov *niov) -{ - return niov - net_iov_owner(niov)->niovs; + return container_of(owner, struct dmabuf_genpool_chunk_owner, area); } static inline struct net_devmem_dmabuf_binding * net_devmem_iov_binding(const struct net_iov *niov) { - return net_iov_owner(niov)->binding; + return net_devmem_iov_to_chunk_owner(niov)->binding; } static inline u32 net_devmem_iov_binding_id(const struct net_iov *niov) @@ -98,7 +91,7 @@ static inline u32 net_devmem_iov_binding_id(const struct net_iov *niov) static inline unsigned long net_iov_virtual_addr(const struct net_iov *niov) { - struct dmabuf_genpool_chunk_owner *owner = net_iov_owner(niov); + struct net_iov_area *owner = net_iov_owner(niov); return owner->base_virtual + ((unsigned long)net_iov_idx(niov) << PAGE_SHIFT); -- cgit v1.2.3 From 57afb483015768903029c8336ee287f4b03c1235 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 4 Feb 2025 13:56:15 -0800 Subject: net: page_pool: create hooks for custom memory providers A spin off from the original page pool memory providers patch by Jakub, which allows extending page pools with custom allocators. One of such providers is devmem TCP, and the other is io_uring zerocopy added in following patches. Link: https://lore.kernel.org/netdev/20230707183935.997267-7-kuba@kernel.org/ Co-developed-by: Jakub Kicinski # initial mp proposal Signed-off-by: Pavel Begunkov Signed-off-by: David Wei Link: https://patch.msgid.link/20250204215622.695511-5-dw@davidwei.uk Signed-off-by: Jakub Kicinski --- include/net/page_pool/memory_provider.h | 15 +++++++++++++++ include/net/page_pool/types.h | 4 ++++ net/core/devmem.c | 15 ++++++++++++++- net/core/page_pool.c | 23 +++++++++++++++-------- 4 files changed, 48 insertions(+), 9 deletions(-) create mode 100644 include/net/page_pool/memory_provider.h (limited to 'include/net') diff --git a/include/net/page_pool/memory_provider.h b/include/net/page_pool/memory_provider.h new file mode 100644 index 000000000000..e49d0a52629d --- /dev/null +++ b/include/net/page_pool/memory_provider.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _NET_PAGE_POOL_MEMORY_PROVIDER_H +#define _NET_PAGE_POOL_MEMORY_PROVIDER_H + +#include +#include + +struct memory_provider_ops { + netmem_ref (*alloc_netmems)(struct page_pool *pool, gfp_t gfp); + bool (*release_netmem)(struct page_pool *pool, netmem_ref netmem); + int (*init)(struct page_pool *pool); + void (*destroy)(struct page_pool *pool); +}; + +#endif diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h index 7f405672b089..36eb57d73abc 100644 --- a/include/net/page_pool/types.h +++ b/include/net/page_pool/types.h @@ -152,8 +152,11 @@ struct page_pool_stats { */ #define PAGE_POOL_FRAG_GROUP_ALIGN (4 * sizeof(long)) +struct memory_provider_ops; + struct pp_memory_provider_params { void *mp_priv; + const struct memory_provider_ops *mp_ops; }; struct page_pool { @@ -216,6 +219,7 @@ struct page_pool { struct ptr_ring ring; void *mp_priv; + const struct memory_provider_ops *mp_ops; #ifdef CONFIG_PAGE_POOL_STATS /* recycle stats are per-cpu to avoid locking */ diff --git a/net/core/devmem.c b/net/core/devmem.c index fb0dddcb4e60..c81625ca57c6 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include "devmem.h" @@ -27,6 +28,8 @@ /* Protected by rtnl_lock() */ static DEFINE_XARRAY_FLAGS(net_devmem_dmabuf_bindings, XA_FLAGS_ALLOC1); +static const struct memory_provider_ops dmabuf_devmem_ops; + static void net_devmem_dmabuf_free_chunk_owner(struct gen_pool *genpool, struct gen_pool_chunk *chunk, void *not_used) @@ -118,6 +121,7 @@ void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding) WARN_ON(rxq->mp_params.mp_priv != binding); rxq->mp_params.mp_priv = NULL; + rxq->mp_params.mp_ops = NULL; rxq_idx = get_netdev_rx_queue_index(rxq); @@ -153,7 +157,7 @@ int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, } rxq = __netif_get_rx_queue(dev, rxq_idx); - if (rxq->mp_params.mp_priv) { + if (rxq->mp_params.mp_ops) { NL_SET_ERR_MSG(extack, "designated queue already memory provider bound"); return -EEXIST; } @@ -171,6 +175,7 @@ int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, return err; rxq->mp_params.mp_priv = binding; + rxq->mp_params.mp_ops = &dmabuf_devmem_ops; err = netdev_rx_queue_restart(dev, rxq_idx); if (err) @@ -180,6 +185,7 @@ int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, err_xa_erase: rxq->mp_params.mp_priv = NULL; + rxq->mp_params.mp_ops = NULL; xa_erase(&binding->bound_rxqs, xa_idx); return err; @@ -399,3 +405,10 @@ bool mp_dmabuf_devmem_release_page(struct page_pool *pool, netmem_ref netmem) /* We don't want the page pool put_page()ing our net_iovs. */ return false; } + +static const struct memory_provider_ops dmabuf_devmem_ops = { + .init = mp_dmabuf_devmem_init, + .destroy = mp_dmabuf_devmem_destroy, + .alloc_netmems = mp_dmabuf_devmem_alloc_netmems, + .release_netmem = mp_dmabuf_devmem_release_page, +}; diff --git a/net/core/page_pool.c b/net/core/page_pool.c index f5e908c9e7ad..d632cf2c91c3 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -13,6 +13,7 @@ #include #include +#include #include #include @@ -285,13 +286,19 @@ static int page_pool_init(struct page_pool *pool, rxq = __netif_get_rx_queue(pool->slow.netdev, pool->slow.queue_idx); pool->mp_priv = rxq->mp_params.mp_priv; + pool->mp_ops = rxq->mp_params.mp_ops; } - if (pool->mp_priv) { + if (pool->mp_ops) { if (!pool->dma_map || !pool->dma_sync) return -EOPNOTSUPP; - err = mp_dmabuf_devmem_init(pool); + if (WARN_ON(!is_kernel_rodata((unsigned long)pool->mp_ops))) { + err = -EFAULT; + goto free_ptr_ring; + } + + err = pool->mp_ops->init(pool); if (err) { pr_warn("%s() mem-provider init failed %d\n", __func__, err); @@ -587,8 +594,8 @@ netmem_ref page_pool_alloc_netmems(struct page_pool *pool, gfp_t gfp) return netmem; /* Slow-path: cache empty, do real allocation */ - if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_priv) - netmem = mp_dmabuf_devmem_alloc_netmems(pool, gfp); + if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_ops) + netmem = pool->mp_ops->alloc_netmems(pool, gfp); else netmem = __page_pool_alloc_pages_slow(pool, gfp); return netmem; @@ -679,8 +686,8 @@ void page_pool_return_page(struct page_pool *pool, netmem_ref netmem) bool put; put = true; - if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_priv) - put = mp_dmabuf_devmem_release_page(pool, netmem); + if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_ops) + put = pool->mp_ops->release_netmem(pool, netmem); else __page_pool_release_page_dma(pool, netmem); @@ -1048,8 +1055,8 @@ static void __page_pool_destroy(struct page_pool *pool) page_pool_unlist(pool); page_pool_uninit(pool); - if (pool->mp_priv) { - mp_dmabuf_devmem_destroy(pool); + if (pool->mp_ops) { + pool->mp_ops->destroy(pool); static_branch_dec(&page_pool_mem_providers); } -- cgit v1.2.3 From 2508a46f920a3130e35ab2183a70fc93f0aaaee4 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 4 Feb 2025 13:56:17 -0800 Subject: net: page_pool: add callback for mp info printing Add a mandatory callback that prints information about the memory provider to netlink. Reviewed-by: Jakub Kicinski Signed-off-by: Pavel Begunkov Signed-off-by: David Wei Link: https://patch.msgid.link/20250204215622.695511-7-dw@davidwei.uk Signed-off-by: Jakub Kicinski --- include/net/page_pool/memory_provider.h | 5 +++++ net/core/devmem.c | 10 ++++++++++ net/core/netdev-genl.c | 11 ++++++----- net/core/page_pool_user.c | 5 ++--- 4 files changed, 23 insertions(+), 8 deletions(-) (limited to 'include/net') diff --git a/include/net/page_pool/memory_provider.h b/include/net/page_pool/memory_provider.h index e49d0a52629d..6d10a0959d00 100644 --- a/include/net/page_pool/memory_provider.h +++ b/include/net/page_pool/memory_provider.h @@ -5,11 +5,16 @@ #include #include +struct netdev_rx_queue; +struct sk_buff; + struct memory_provider_ops { netmem_ref (*alloc_netmems)(struct page_pool *pool, gfp_t gfp); bool (*release_netmem)(struct page_pool *pool, netmem_ref netmem); int (*init)(struct page_pool *pool); void (*destroy)(struct page_pool *pool); + int (*nl_fill)(void *mp_priv, struct sk_buff *rsp, + struct netdev_rx_queue *rxq); }; #endif diff --git a/net/core/devmem.c b/net/core/devmem.c index c81625ca57c6..63b790326c7d 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -406,9 +406,19 @@ bool mp_dmabuf_devmem_release_page(struct page_pool *pool, netmem_ref netmem) return false; } +static int mp_dmabuf_devmem_nl_fill(void *mp_priv, struct sk_buff *rsp, + struct netdev_rx_queue *rxq) +{ + const struct net_devmem_dmabuf_binding *binding = mp_priv; + int type = rxq ? NETDEV_A_QUEUE_DMABUF : NETDEV_A_PAGE_POOL_DMABUF; + + return nla_put_u32(rsp, type, binding->id); +} + static const struct memory_provider_ops dmabuf_devmem_ops = { .init = mp_dmabuf_devmem_init, .destroy = mp_dmabuf_devmem_destroy, .alloc_netmems = mp_dmabuf_devmem_alloc_netmems, .release_netmem = mp_dmabuf_devmem_release_page, + .nl_fill = mp_dmabuf_devmem_nl_fill, }; diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 715f85c6b62e..5b459b4fef46 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "dev.h" #include "devmem.h" @@ -368,7 +369,7 @@ static int netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, u32 q_idx, u32 q_type, const struct genl_info *info) { - struct net_devmem_dmabuf_binding *binding; + struct pp_memory_provider_params *params; struct netdev_rx_queue *rxq; struct netdev_queue *txq; void *hdr; @@ -385,15 +386,15 @@ netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, switch (q_type) { case NETDEV_QUEUE_TYPE_RX: rxq = __netif_get_rx_queue(netdev, q_idx); + if (rxq->napi && nla_put_u32(rsp, NETDEV_A_QUEUE_NAPI_ID, rxq->napi->napi_id)) goto nla_put_failure; - binding = rxq->mp_params.mp_priv; - if (binding && - nla_put_u32(rsp, NETDEV_A_QUEUE_DMABUF, binding->id)) + params = &rxq->mp_params; + if (params->mp_ops && + params->mp_ops->nl_fill(params->mp_priv, rsp, rxq)) goto nla_put_failure; - break; case NETDEV_QUEUE_TYPE_TX: txq = netdev_get_tx_queue(netdev, q_idx); diff --git a/net/core/page_pool_user.c b/net/core/page_pool_user.c index d5e214c30c31..9d8a3d8597fa 100644 --- a/net/core/page_pool_user.c +++ b/net/core/page_pool_user.c @@ -8,9 +8,9 @@ #include #include #include +#include #include -#include "devmem.h" #include "page_pool_priv.h" #include "netdev-genl-gen.h" @@ -216,7 +216,6 @@ static int page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool, const struct genl_info *info) { - struct net_devmem_dmabuf_binding *binding = pool->mp_priv; size_t inflight, refsz; unsigned int napi_id; void *hdr; @@ -249,7 +248,7 @@ page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool, pool->user.detach_time)) goto err_cancel; - if (binding && nla_put_u32(rsp, NETDEV_A_PAGE_POOL_DMABUF, binding->id)) + if (pool->mp_ops && pool->mp_ops->nl_fill(pool->mp_priv, rsp, NULL)) goto err_cancel; genlmsg_end(rsp, hdr); -- cgit v1.2.3 From f8350a4358fc9c4801f2f4229cf9b6e678055d9a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 4 Feb 2025 13:56:18 -0800 Subject: net: page_pool: add a mp hook to unregister_netdevice* Devmem TCP needs a hook in unregister_netdevice_many_notify() to upkeep the set tracking queues it's bound to, i.e. ->bound_rxqs. Instead of devmem sticking directly out of the genetic path, add a mp function. Reviewed-by: Jakub Kicinski Reviewed-by: Mina Almasry Signed-off-by: Pavel Begunkov Signed-off-by: David Wei Link: https://patch.msgid.link/20250204215622.695511-8-dw@davidwei.uk Signed-off-by: Jakub Kicinski --- include/net/page_pool/memory_provider.h | 1 + net/core/dev.c | 16 ++++++++++++++- net/core/devmem.c | 36 +++++++++++++++------------------ net/core/devmem.h | 5 ----- 4 files changed, 32 insertions(+), 26 deletions(-) (limited to 'include/net') diff --git a/include/net/page_pool/memory_provider.h b/include/net/page_pool/memory_provider.h index 6d10a0959d00..36469a7e649f 100644 --- a/include/net/page_pool/memory_provider.h +++ b/include/net/page_pool/memory_provider.h @@ -15,6 +15,7 @@ struct memory_provider_ops { void (*destroy)(struct page_pool *pool); int (*nl_fill)(void *mp_priv, struct sk_buff *rsp, struct netdev_rx_queue *rxq); + void (*uninstall)(void *mp_priv, struct netdev_rx_queue *rxq); }; #endif diff --git a/net/core/dev.c b/net/core/dev.c index c0021cbd28fc..384b9109932a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -159,6 +159,7 @@ #include #include #include +#include #include #include @@ -11724,6 +11725,19 @@ void unregister_netdevice_queue(struct net_device *dev, struct list_head *head) } EXPORT_SYMBOL(unregister_netdevice_queue); +static void dev_memory_provider_uninstall(struct net_device *dev) +{ + unsigned int i; + + for (i = 0; i < dev->real_num_rx_queues; i++) { + struct netdev_rx_queue *rxq = &dev->_rx[i]; + struct pp_memory_provider_params *p = &rxq->mp_params; + + if (p->mp_ops && p->mp_ops->uninstall) + p->mp_ops->uninstall(rxq->mp_params.mp_priv, rxq); + } +} + void unregister_netdevice_many_notify(struct list_head *head, u32 portid, const struct nlmsghdr *nlh) { @@ -11778,7 +11792,7 @@ void unregister_netdevice_many_notify(struct list_head *head, dev_tcx_uninstall(dev); dev_xdp_uninstall(dev); bpf_dev_bound_netdev_unregister(dev); - dev_dmabuf_uninstall(dev); + dev_memory_provider_uninstall(dev); netdev_offload_xstats_disable_all(dev); diff --git a/net/core/devmem.c b/net/core/devmem.c index 63b790326c7d..cbac6419fcc4 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -320,26 +320,6 @@ err_put_dmabuf: return ERR_PTR(err); } -void dev_dmabuf_uninstall(struct net_device *dev) -{ - struct net_devmem_dmabuf_binding *binding; - struct netdev_rx_queue *rxq; - unsigned long xa_idx; - unsigned int i; - - for (i = 0; i < dev->real_num_rx_queues; i++) { - binding = dev->_rx[i].mp_params.mp_priv; - if (!binding) - continue; - - xa_for_each(&binding->bound_rxqs, xa_idx, rxq) - if (rxq == &dev->_rx[i]) { - xa_erase(&binding->bound_rxqs, xa_idx); - break; - } - } -} - /*** "Dmabuf devmem memory provider" ***/ int mp_dmabuf_devmem_init(struct page_pool *pool) @@ -415,10 +395,26 @@ static int mp_dmabuf_devmem_nl_fill(void *mp_priv, struct sk_buff *rsp, return nla_put_u32(rsp, type, binding->id); } +static void mp_dmabuf_devmem_uninstall(void *mp_priv, + struct netdev_rx_queue *rxq) +{ + struct net_devmem_dmabuf_binding *binding = mp_priv; + struct netdev_rx_queue *bound_rxq; + unsigned long xa_idx; + + xa_for_each(&binding->bound_rxqs, xa_idx, bound_rxq) { + if (bound_rxq == rxq) { + xa_erase(&binding->bound_rxqs, xa_idx); + break; + } + } +} + static const struct memory_provider_ops dmabuf_devmem_ops = { .init = mp_dmabuf_devmem_init, .destroy = mp_dmabuf_devmem_destroy, .alloc_netmems = mp_dmabuf_devmem_alloc_netmems, .release_netmem = mp_dmabuf_devmem_release_page, .nl_fill = mp_dmabuf_devmem_nl_fill, + .uninstall = mp_dmabuf_devmem_uninstall, }; diff --git a/net/core/devmem.h b/net/core/devmem.h index a2b9913e9a17..8e999fe2ae67 100644 --- a/net/core/devmem.h +++ b/net/core/devmem.h @@ -68,7 +68,6 @@ void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding); int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, struct net_devmem_dmabuf_binding *binding, struct netlink_ext_ack *extack); -void dev_dmabuf_uninstall(struct net_device *dev); static inline struct dmabuf_genpool_chunk_owner * net_devmem_iov_to_chunk_owner(const struct net_iov *niov) @@ -145,10 +144,6 @@ net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, return -EOPNOTSUPP; } -static inline void dev_dmabuf_uninstall(struct net_device *dev) -{ -} - static inline struct net_iov * net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding) { -- cgit v1.2.3 From 56102c013fa7b8dbba8c5d5f7e042ad5f18cf4ec Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 4 Feb 2025 13:56:20 -0800 Subject: net: page_pool: add memory provider helpers Add helpers for memory providers to interact with page pools. net_mp_niov_{set,clear}_page_pool() serve to [dis]associate a net_iov with a page pool. If used, the memory provider is responsible to match "set" calls with "clear" once a net_iov is not going to be used by a page pool anymore, changing a page pool, etc. Acked-by: Jakub Kicinski Signed-off-by: Pavel Begunkov Signed-off-by: David Wei Link: https://patch.msgid.link/20250204215622.695511-10-dw@davidwei.uk Signed-off-by: Jakub Kicinski --- include/net/page_pool/memory_provider.h | 19 +++++++++++++++++++ net/core/page_pool.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+) (limited to 'include/net') diff --git a/include/net/page_pool/memory_provider.h b/include/net/page_pool/memory_provider.h index 36469a7e649f..4f0ffb8f6a0a 100644 --- a/include/net/page_pool/memory_provider.h +++ b/include/net/page_pool/memory_provider.h @@ -18,4 +18,23 @@ struct memory_provider_ops { void (*uninstall)(void *mp_priv, struct netdev_rx_queue *rxq); }; +bool net_mp_niov_set_dma_addr(struct net_iov *niov, dma_addr_t addr); +void net_mp_niov_set_page_pool(struct page_pool *pool, struct net_iov *niov); +void net_mp_niov_clear_page_pool(struct net_iov *niov); + +/** + * net_mp_netmem_place_in_cache() - give a netmem to a page pool + * @pool: the page pool to place the netmem into + * @netmem: netmem to give + * + * Push an accounted netmem into the page pool's allocation cache. The caller + * must ensure that there is space in the cache. It should only be called off + * the mp_ops->alloc_netmems() path. + */ +static inline void net_mp_netmem_place_in_cache(struct page_pool *pool, + netmem_ref netmem) +{ + pool->alloc.cache[pool->alloc.count++] = netmem; +} + #endif diff --git a/net/core/page_pool.c b/net/core/page_pool.c index d632cf2c91c3..686bd4a117d9 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -1197,3 +1197,31 @@ void page_pool_update_nid(struct page_pool *pool, int new_nid) } } EXPORT_SYMBOL(page_pool_update_nid); + +bool net_mp_niov_set_dma_addr(struct net_iov *niov, dma_addr_t addr) +{ + return page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov), addr); +} + +/* Associate a niov with a page pool. Should follow with a matching + * net_mp_niov_clear_page_pool() + */ +void net_mp_niov_set_page_pool(struct page_pool *pool, struct net_iov *niov) +{ + netmem_ref netmem = net_iov_to_netmem(niov); + + page_pool_set_pp_info(pool, netmem); + + pool->pages_state_hold_cnt++; + trace_page_pool_state_hold(pool, netmem, pool->pages_state_hold_cnt); +} + +/* Disassociate a niov from a page pool. Should only be used in the + * ->release_netmem() path. + */ +void net_mp_niov_clear_page_pool(struct net_iov *niov) +{ + netmem_ref netmem = net_iov_to_netmem(niov); + + page_pool_clear_pp_info(netmem); +} -- cgit v1.2.3 From 6e18ed929d3ba9b3b92ba5894f9233686b3e3ec1 Mon Sep 17 00:00:00 2001 From: David Wei Date: Tue, 4 Feb 2025 13:56:21 -0800 Subject: net: add helpers for setting a memory provider on an rx queue Add helpers that properly prep or remove a memory provider for an rx queue then restart the queue. Reviewed-by: Jakub Kicinski Signed-off-by: Pavel Begunkov Signed-off-by: David Wei Link: https://patch.msgid.link/20250204215622.695511-11-dw@davidwei.uk Signed-off-by: Jakub Kicinski --- include/net/page_pool/memory_provider.h | 5 +++ net/core/netdev_rx_queue.c | 69 +++++++++++++++++++++++++++++++++ 2 files changed, 74 insertions(+) (limited to 'include/net') diff --git a/include/net/page_pool/memory_provider.h b/include/net/page_pool/memory_provider.h index 4f0ffb8f6a0a..b3e665897767 100644 --- a/include/net/page_pool/memory_provider.h +++ b/include/net/page_pool/memory_provider.h @@ -22,6 +22,11 @@ bool net_mp_niov_set_dma_addr(struct net_iov *niov, dma_addr_t addr); void net_mp_niov_set_page_pool(struct page_pool *pool, struct net_iov *niov); void net_mp_niov_clear_page_pool(struct net_iov *niov); +int net_mp_open_rxq(struct net_device *dev, unsigned ifq_idx, + struct pp_memory_provider_params *p); +void net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx, + struct pp_memory_provider_params *old_p); + /** * net_mp_netmem_place_in_cache() - give a netmem to a page pool * @pool: the page pool to place the netmem into diff --git a/net/core/netdev_rx_queue.c b/net/core/netdev_rx_queue.c index db82786fa0c4..db46880f37cc 100644 --- a/net/core/netdev_rx_queue.c +++ b/net/core/netdev_rx_queue.c @@ -3,6 +3,7 @@ #include #include #include +#include #include "page_pool_priv.h" @@ -80,3 +81,71 @@ err_free_new_mem: return err; } EXPORT_SYMBOL_NS_GPL(netdev_rx_queue_restart, "NETDEV_INTERNAL"); + +static int __net_mp_open_rxq(struct net_device *dev, unsigned ifq_idx, + struct pp_memory_provider_params *p) +{ + struct netdev_rx_queue *rxq; + int ret; + + if (ifq_idx >= dev->real_num_rx_queues) + return -EINVAL; + ifq_idx = array_index_nospec(ifq_idx, dev->real_num_rx_queues); + + rxq = __netif_get_rx_queue(dev, ifq_idx); + if (rxq->mp_params.mp_ops) + return -EEXIST; + + rxq->mp_params = *p; + ret = netdev_rx_queue_restart(dev, ifq_idx); + if (ret) { + rxq->mp_params.mp_ops = NULL; + rxq->mp_params.mp_priv = NULL; + } + return ret; +} + +int net_mp_open_rxq(struct net_device *dev, unsigned ifq_idx, + struct pp_memory_provider_params *p) +{ + int ret; + + rtnl_lock(); + ret = __net_mp_open_rxq(dev, ifq_idx, p); + rtnl_unlock(); + return ret; +} + +static void __net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx, + struct pp_memory_provider_params *old_p) +{ + struct netdev_rx_queue *rxq; + + if (WARN_ON_ONCE(ifq_idx >= dev->real_num_rx_queues)) + return; + + rxq = __netif_get_rx_queue(dev, ifq_idx); + + /* Callers holding a netdev ref may get here after we already + * went thru shutdown via dev_memory_provider_uninstall(). + */ + if (dev->reg_state > NETREG_REGISTERED && + !rxq->mp_params.mp_ops) + return; + + if (WARN_ON_ONCE(rxq->mp_params.mp_ops != old_p->mp_ops || + rxq->mp_params.mp_priv != old_p->mp_priv)) + return; + + rxq->mp_params.mp_ops = NULL; + rxq->mp_params.mp_priv = NULL; + WARN_ON(netdev_rx_queue_restart(dev, ifq_idx)); +} + +void net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx, + struct pp_memory_provider_params *old_p) +{ + rtnl_lock(); + __net_mp_close_rxq(dev, ifq_idx, old_p); + rtnl_unlock(); +} -- cgit v1.2.3 From 6597e8d35851e1e0cb381f76ce1d960518fd8c94 Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Wed, 5 Feb 2025 19:37:47 +0000 Subject: netdev-genl: Elide napi_id when not present There are at least two cases where napi_id may not present and the napi_id should be elided: 1. Queues could be created, but napi_enable may not have been called yet. In this case, there may be a NAPI but it may not have an ID and output of a napi_id should be elided. 2. TX-only NAPIs currently do not have NAPI IDs. If a TX queue happens to be linked with a TX-only NAPI, elide the NAPI ID from the netlink output as a NAPI ID of 0 is not useful for users. Signed-off-by: Joe Damato Reviewed-by: Sridhar Samudrala Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250205193751.297211-1-jdamato@fastly.com Signed-off-by: Jakub Kicinski --- include/net/busy_poll.h | 5 +++++ net/core/netdev-genl.c | 14 +++++++++----- 2 files changed, 14 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index c39a426ebf52..741fa7754700 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -24,6 +24,11 @@ */ #define MIN_NAPI_ID ((unsigned int)(NR_CPUS + 1)) +static inline bool napi_id_valid(unsigned int napi_id) +{ + return napi_id >= MIN_NAPI_ID; +} + #define BUSY_POLL_BUDGET 8 #ifdef CONFIG_NET_RX_BUSY_POLL diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 5b459b4fef46..0dcd4faefd8d 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -365,6 +365,13 @@ int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info) return err; } +static int nla_put_napi_id(struct sk_buff *skb, const struct napi_struct *napi) +{ + if (napi && napi_id_valid(napi->napi_id)) + return nla_put_u32(skb, NETDEV_A_QUEUE_NAPI_ID, napi->napi_id); + return 0; +} + static int netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, u32 q_idx, u32 q_type, const struct genl_info *info) @@ -386,9 +393,7 @@ netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, switch (q_type) { case NETDEV_QUEUE_TYPE_RX: rxq = __netif_get_rx_queue(netdev, q_idx); - - if (rxq->napi && nla_put_u32(rsp, NETDEV_A_QUEUE_NAPI_ID, - rxq->napi->napi_id)) + if (nla_put_napi_id(rsp, rxq->napi)) goto nla_put_failure; params = &rxq->mp_params; @@ -398,8 +403,7 @@ netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, break; case NETDEV_QUEUE_TYPE_TX: txq = netdev_get_tx_queue(netdev, q_idx); - if (txq->napi && nla_put_u32(rsp, NETDEV_A_QUEUE_NAPI_ID, - txq->napi->napi_id)) + if (nla_put_napi_id(rsp, txq->napi)) goto nla_put_failure; } -- cgit v1.2.3 From be258f654a6ef5ab0d777f40fd3bcee540934307 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 6 Feb 2025 09:46:05 +0000 Subject: tcp: rename inet_csk_{delete|reset}_keepalive_timer() inet_csk_delete_keepalive_timer() and inet_csk_reset_keepalive_timer() are only used from core TCP, there is no need to export them. Replace their prefix by tcp. Move them to net/ipv4/tcp_timer.c and make tcp_delete_keepalive_timer() static. Signed-off-by: Eric Dumazet Reviewed-by: Jason Xing Reviewed-by: Joe Damato Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250206094605.2694118-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/inet_connection_sock.h | 3 --- include/net/tcp.h | 1 + net/ipv4/inet_connection_sock.c | 12 ------------ net/ipv4/tcp.c | 4 ++-- net/ipv4/tcp_input.c | 6 +++--- net/ipv4/tcp_minisocks.c | 3 +-- net/ipv4/tcp_timer.c | 21 +++++++++++++++------ 7 files changed, 22 insertions(+), 28 deletions(-) (limited to 'include/net') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index c7f42844c79a..055aa80b05c6 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -189,9 +189,6 @@ static inline void inet_csk_delack_init(struct sock *sk) memset(&inet_csk(sk)->icsk_ack, 0, sizeof(inet_csk(sk)->icsk_ack)); } -void inet_csk_delete_keepalive_timer(struct sock *sk); -void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long timeout); - static inline void inet_csk_clear_xmit_timer(struct sock *sk, const int what) { struct inet_connection_sock *icsk = inet_csk(sk); diff --git a/include/net/tcp.h b/include/net/tcp.h index 5b2b04835688..bb7edf0e72aa 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -415,6 +415,7 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); +void tcp_reset_keepalive_timer(struct sock *sk, unsigned long timeout); void tcp_set_keepalive(struct sock *sk, int val); void tcp_syn_ack_timeout(const struct request_sock *req); int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index e4decfb270fa..2b7775b90a09 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -799,18 +799,6 @@ void inet_csk_clear_xmit_timers_sync(struct sock *sk) sk_stop_timer_sync(sk, &sk->sk_timer); } -void inet_csk_delete_keepalive_timer(struct sock *sk) -{ - sk_stop_timer(sk, &sk->sk_timer); -} -EXPORT_SYMBOL(inet_csk_delete_keepalive_timer); - -void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len) -{ - sk_reset_timer(sk, &sk->sk_timer, jiffies + len); -} -EXPORT_SYMBOL(inet_csk_reset_keepalive_timer); - struct dst_entry *inet_csk_route_req(const struct sock *sk, struct flowi4 *fl4, const struct request_sock *req) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 7f43d31c9400..2021f2709ec3 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3179,7 +3179,7 @@ adjudge_to_death: const int tmo = tcp_fin_time(sk); if (tmo > TCP_TIMEWAIT_LEN) { - inet_csk_reset_keepalive_timer(sk, + tcp_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); } else { tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); @@ -3632,7 +3632,7 @@ int tcp_sock_set_keepidle_locked(struct sock *sk, int val) elapsed = tp->keepalive_time - elapsed; else elapsed = 0; - inet_csk_reset_keepalive_timer(sk, elapsed); + tcp_reset_keepalive_timer(sk, elapsed); } return 0; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 61da8ffc2f86..286f15e4994a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6346,7 +6346,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) tp->lsndtime = tcp_jiffies32; if (sock_flag(sk, SOCK_KEEPOPEN)) - inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp)); + tcp_reset_keepalive_timer(sk, keepalive_time_when(tp)); if (!tp->rx_opt.snd_wscale) __tcp_fast_path_on(tp, tp->snd_wnd); @@ -6921,7 +6921,7 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) tmo = tcp_fin_time(sk); if (tmo > TCP_TIMEWAIT_LEN) { - inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); + tcp_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); } else if (th->fin || sock_owned_by_user(sk)) { /* Bad case. We could lose such FIN otherwise. * It is not a big problem, but it looks confusing @@ -6929,7 +6929,7 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) * if it spins in bh_lock_sock(), but it is really * marginal case. */ - inet_csk_reset_keepalive_timer(sk, tmo); + tcp_reset_keepalive_timer(sk, tmo); } else { tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); goto consume; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index b089b08e9617..0deb2ac85acf 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -566,8 +566,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, WRITE_ONCE(newtp->write_seq, newtp->pushed_seq = treq->snt_isn + 1); if (sock_flag(newsk, SOCK_KEEPOPEN)) - inet_csk_reset_keepalive_timer(newsk, - keepalive_time_when(newtp)); + tcp_reset_keepalive_timer(newsk, keepalive_time_when(newtp)); newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; newtp->rx_opt.sack_ok = ireq->sack_ok; diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index b412ed88ccd9..cfb6f4c4e4c9 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -751,20 +751,29 @@ void tcp_syn_ack_timeout(const struct request_sock *req) } EXPORT_SYMBOL(tcp_syn_ack_timeout); +void tcp_reset_keepalive_timer(struct sock *sk, unsigned long len) +{ + sk_reset_timer(sk, &sk->sk_timer, jiffies + len); +} + +static void tcp_delete_keepalive_timer(struct sock *sk) +{ + sk_stop_timer(sk, &sk->sk_timer); +} + void tcp_set_keepalive(struct sock *sk, int val) { if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) return; if (val && !sock_flag(sk, SOCK_KEEPOPEN)) - inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk))); + tcp_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk))); else if (!val) - inet_csk_delete_keepalive_timer(sk); + tcp_delete_keepalive_timer(sk); } EXPORT_SYMBOL_GPL(tcp_set_keepalive); - -static void tcp_keepalive_timer (struct timer_list *t) +static void tcp_keepalive_timer(struct timer_list *t) { struct sock *sk = from_timer(sk, t, sk_timer); struct inet_connection_sock *icsk = inet_csk(sk); @@ -775,7 +784,7 @@ static void tcp_keepalive_timer (struct timer_list *t) bh_lock_sock(sk); if (sock_owned_by_user(sk)) { /* Try again later. */ - inet_csk_reset_keepalive_timer (sk, HZ/20); + tcp_reset_keepalive_timer(sk, HZ/20); goto out; } @@ -841,7 +850,7 @@ static void tcp_keepalive_timer (struct timer_list *t) } resched: - inet_csk_reset_keepalive_timer (sk, elapsed); + tcp_reset_keepalive_timer(sk, elapsed); goto out; death: -- cgit v1.2.3 From 3e7efc3f4f03bca0ea630c302e7c79cf807476bb Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 6 Feb 2025 14:56:36 -0800 Subject: net: devmem: don't call queue stop / start when the interface is down We seem to be missing a netif_running() check from the devmem installation path. Starting a queue on a stopped device makes no sense. We still want to be able to allocate the memory, just to test that the device is indeed setting up the page pools in a memory provider compatible way. This is not a bug fix, because existing drivers check if the interface is down as part of the ops. But new drivers shouldn't have to do this, as long as they can correctly alloc/free while down. Reviewed-by: Mina Almasry Link: https://patch.msgid.link/20250206225638.1387810-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/net/netdev_queues.h | 4 ++++ net/core/netdev_rx_queue.c | 18 +++++++++++------- 2 files changed, 15 insertions(+), 7 deletions(-) (limited to 'include/net') diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h index b02bb9f109d5..73d3401261a6 100644 --- a/include/net/netdev_queues.h +++ b/include/net/netdev_queues.h @@ -117,6 +117,10 @@ struct netdev_stat_ops { * * @ndo_queue_stop: Stop the RX queue at the specified index. The stopped * queue's memory is written at the specified address. + * + * Note that @ndo_queue_mem_alloc and @ndo_queue_mem_free may be called while + * the interface is closed. @ndo_queue_start and @ndo_queue_stop will only + * be called for an interface which is open. */ struct netdev_queue_mgmt_ops { size_t ndo_queue_mem_size; diff --git a/net/core/netdev_rx_queue.c b/net/core/netdev_rx_queue.c index d421abe06c03..ddd54e1e7289 100644 --- a/net/core/netdev_rx_queue.c +++ b/net/core/netdev_rx_queue.c @@ -38,13 +38,17 @@ int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx) if (err) goto err_free_new_queue_mem; - err = qops->ndo_queue_stop(dev, old_mem, rxq_idx); - if (err) - goto err_free_new_queue_mem; - - err = qops->ndo_queue_start(dev, new_mem, rxq_idx); - if (err) - goto err_start_queue; + if (netif_running(dev)) { + err = qops->ndo_queue_stop(dev, old_mem, rxq_idx); + if (err) + goto err_free_new_queue_mem; + + err = qops->ndo_queue_start(dev, new_mem, rxq_idx); + if (err) + goto err_start_queue; + } else { + swap(new_mem, old_mem); + } qops->ndo_queue_mem_free(dev, old_mem); -- cgit v1.2.3 From 23d9324a27a48858cfdd7f0342f52328e8595c6d Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Thu, 6 Feb 2025 19:26:29 +0100 Subject: xsk: add helper to get &xdp_desc's DMA and meta pointer in one go Currently, when your driver supports XSk Tx metadata and you want to send an XSk frame, you need to do the following: * call external xsk_buff_raw_get_dma(); * call inline xsk_buff_get_metadata(), which calls external xsk_buff_raw_get_data() and then do some inline checks. This effectively means that the following piece: addr = pool->unaligned ? xp_unaligned_add_offset_to_addr(addr) : addr; is done twice per frame, plus you have 2 external calls per frame, plus this: meta = pool->addrs + addr - pool->tx_metadata_len; if (unlikely(!xsk_buff_valid_tx_metadata(meta))) is always inlined, even if there's no meta or it's invalid. Add xsk_buff_raw_get_ctx() (xp_raw_get_ctx() to be precise) to do that in one go. It returns a small structure with 2 fields: DMA address, filled unconditionally, and metadata pointer, non-NULL only if it's present and valid. The address correction is performed only once and you also have only 1 external call per XSk frame, which does all the calculations and checks outside of your hotpath. You only need to check `if (ctx.meta)` for the metadata presence. To not copy any existing code, derive address correction and getting virtual and DMA address into small helpers. bloat-o-meter reports no object code changes for the existing functionality. Signed-off-by: Alexander Lobakin Link: https://patch.msgid.link/20250206182630.3914318-5-aleksander.lobakin@intel.com Signed-off-by: Jakub Kicinski --- include/net/xdp_sock_drv.h | 43 +++++++++++++++++++++++++++++++++++++++--- include/net/xsk_buff_pool.h | 8 ++++++++ net/xdp/xsk_buff_pool.c | 46 +++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 90 insertions(+), 7 deletions(-) (limited to 'include/net') diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 784cd34f5bba..15086dcf51d8 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -196,6 +196,23 @@ static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr) return xp_raw_get_data(pool, addr); } +/** + * xsk_buff_raw_get_ctx - get &xdp_desc context + * @pool: XSk buff pool desc address belongs to + * @addr: desc address (from userspace) + * + * Wrapper for xp_raw_get_ctx() to be used in drivers, see its kdoc for + * details. + * + * Return: new &xdp_desc_ctx struct containing desc's DMA address and metadata + * pointer, if it is present and valid (initialized to %NULL otherwise). + */ +static inline struct xdp_desc_ctx +xsk_buff_raw_get_ctx(const struct xsk_buff_pool *pool, u64 addr) +{ + return xp_raw_get_ctx(pool, addr); +} + #define XDP_TXMD_FLAGS_VALID ( \ XDP_TXMD_FLAGS_TIMESTAMP | \ XDP_TXMD_FLAGS_CHECKSUM | \ @@ -207,20 +224,27 @@ xsk_buff_valid_tx_metadata(const struct xsk_tx_metadata *meta) return !(meta->flags & ~XDP_TXMD_FLAGS_VALID); } -static inline struct xsk_tx_metadata *xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr) +static inline struct xsk_tx_metadata * +__xsk_buff_get_metadata(const struct xsk_buff_pool *pool, void *data) { struct xsk_tx_metadata *meta; if (!pool->tx_metadata_len) return NULL; - meta = xp_raw_get_data(pool, addr) - pool->tx_metadata_len; + meta = data - pool->tx_metadata_len; if (unlikely(!xsk_buff_valid_tx_metadata(meta))) return NULL; /* no way to signal the error to the user */ return meta; } +static inline struct xsk_tx_metadata * +xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr) +{ + return __xsk_buff_get_metadata(pool, xp_raw_get_data(pool, addr)); +} + static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp) { struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); @@ -388,12 +412,25 @@ static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr) return NULL; } +static inline struct xdp_desc_ctx +xsk_buff_raw_get_ctx(const struct xsk_buff_pool *pool, u64 addr) +{ + return (struct xdp_desc_ctx){ }; +} + static inline bool xsk_buff_valid_tx_metadata(struct xsk_tx_metadata *meta) { return false; } -static inline struct xsk_tx_metadata *xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr) +static inline struct xsk_tx_metadata * +__xsk_buff_get_metadata(const struct xsk_buff_pool *pool, void *data) +{ + return NULL; +} + +static inline struct xsk_tx_metadata * +xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr) { return NULL; } diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index 50779406bc2d..1dcd4d71468a 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -141,6 +141,14 @@ u32 xp_alloc_batch(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max); bool xp_can_alloc(struct xsk_buff_pool *pool, u32 count); void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr); dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr); + +struct xdp_desc_ctx { + dma_addr_t dma; + struct xsk_tx_metadata *meta; +}; + +struct xdp_desc_ctx xp_raw_get_ctx(const struct xsk_buff_pool *pool, u64 addr); + static inline dma_addr_t xp_get_dma(struct xdp_buff_xsk *xskb) { return xskb->dma; diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 1f7975b49657..c263fb7a68dc 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -699,18 +699,56 @@ void xp_free(struct xdp_buff_xsk *xskb) } EXPORT_SYMBOL(xp_free); -void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr) +static u64 __xp_raw_get_addr(const struct xsk_buff_pool *pool, u64 addr) +{ + return pool->unaligned ? xp_unaligned_add_offset_to_addr(addr) : addr; +} + +static void *__xp_raw_get_data(const struct xsk_buff_pool *pool, u64 addr) { - addr = pool->unaligned ? xp_unaligned_add_offset_to_addr(addr) : addr; return pool->addrs + addr; } + +void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr) +{ + return __xp_raw_get_data(pool, __xp_raw_get_addr(pool, addr)); +} EXPORT_SYMBOL(xp_raw_get_data); -dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr) +static dma_addr_t __xp_raw_get_dma(const struct xsk_buff_pool *pool, u64 addr) { - addr = pool->unaligned ? xp_unaligned_add_offset_to_addr(addr) : addr; return (pool->dma_pages[addr >> PAGE_SHIFT] & ~XSK_NEXT_PG_CONTIG_MASK) + (addr & ~PAGE_MASK); } + +dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr) +{ + return __xp_raw_get_dma(pool, __xp_raw_get_addr(pool, addr)); +} EXPORT_SYMBOL(xp_raw_get_dma); + +/** + * xp_raw_get_ctx - get &xdp_desc context + * @pool: XSk buff pool desc address belongs to + * @addr: desc address (from userspace) + * + * Helper for getting desc's DMA address and metadata pointer, if present. + * Saves one call on hotpath, double calculation of the actual address, + * and inline checks for metadata presence and sanity. + * + * Return: new &xdp_desc_ctx struct containing desc's DMA address and metadata + * pointer, if it is present and valid (initialized to %NULL otherwise). + */ +struct xdp_desc_ctx xp_raw_get_ctx(const struct xsk_buff_pool *pool, u64 addr) +{ + struct xdp_desc_ctx ret; + + addr = __xp_raw_get_addr(pool, addr); + + ret.dma = __xp_raw_get_dma(pool, addr); + ret.meta = __xsk_buff_get_metadata(pool, __xp_raw_get_data(pool, addr)); + + return ret; +} +EXPORT_SYMBOL(xp_raw_get_ctx); -- cgit v1.2.3 From a0596c2c63fc9d9ad16d701044293e11d8f97953 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 7 Feb 2025 16:24:59 +0900 Subject: net: fib_rules: Factorise fib_newrule() and fib_delrule(). fib_nl_newrule() / fib_nl_delrule() is the doit() handler for RTM_NEWRULE / RTM_DELRULE but also called from vrf_newlink(). Currently, we hold RTNL on both paths but will not on the former. Also, we set dev_net(dev)->rtnl to skb->sk in vrf_fib_rule() because fib_nl_newrule() / fib_nl_delrule() fetch net as sock_net(skb->sk). Let's Factorise the two functions and pass net and rtnl_held flag. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Reviewed-by: Ido Schimmel Tested-by: Ido Schimmel Link: https://patch.msgid.link/20250207072502.87775-6-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- drivers/net/vrf.c | 6 ++---- include/net/fib_rules.h | 8 ++++---- net/core/fib_rules.c | 36 +++++++++++++++++++++++------------- 3 files changed, 29 insertions(+), 21 deletions(-) (limited to 'include/net') diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index ca81b212a246..5f21ce1013c4 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -1537,14 +1537,12 @@ static int vrf_fib_rule(const struct net_device *dev, __u8 family, bool add_it) nlmsg_end(skb, nlh); - /* fib_nl_{new,del}rule handling looks for net from skb->sk */ - skb->sk = dev_net(dev)->rtnl; if (add_it) { - err = fib_nl_newrule(skb, nlh, NULL); + err = fib_newrule(dev_net(dev), skb, nlh, NULL, true); if (err == -EEXIST) err = 0; } else { - err = fib_nl_delrule(skb, nlh, NULL); + err = fib_delrule(dev_net(dev), skb, nlh, NULL, true); if (err == -ENOENT) err = 0; } diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index 04383d90a1e3..710caacad9da 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -178,10 +178,10 @@ int fib_rules_dump(struct net *net, struct notifier_block *nb, int family, struct netlink_ext_ack *extack); unsigned int fib_rules_seq_read(const struct net *net, int family); -int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, - struct netlink_ext_ack *extack); -int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, - struct netlink_ext_ack *extack); +int fib_newrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, bool rtnl_held); +int fib_delrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, bool rtnl_held); INDIRECT_CALLABLE_DECLARE(int fib6_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)); diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 694a8c2884a8..d68332d9cac6 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -783,15 +783,14 @@ static const struct nla_policy fib_rule_policy[FRA_MAX + 1] = { [FRA_FLOWLABEL_MASK] = { .type = NLA_BE32 }, }; -int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, - struct netlink_ext_ack *extack) +int fib_newrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, bool rtnl_held) { - struct net *net = sock_net(skb->sk); + struct fib_rule *rule = NULL, *r, *last = NULL; struct fib_rule_hdr *frh = nlmsg_data(nlh); + int err = -EINVAL, unresolved = 0; struct fib_rules_ops *ops = NULL; - struct fib_rule *rule = NULL, *r, *last = NULL; struct nlattr *tb[FRA_MAX + 1]; - int err = -EINVAL, unresolved = 0; bool user_priority = false; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) { @@ -893,18 +892,23 @@ errout: rules_ops_put(ops); return err; } -EXPORT_SYMBOL_GPL(fib_nl_newrule); +EXPORT_SYMBOL_GPL(fib_newrule); -int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, - struct netlink_ext_ack *extack) +static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { - struct net *net = sock_net(skb->sk); + return fib_newrule(sock_net(skb->sk), skb, nlh, extack, true); +} + +int fib_delrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, bool rtnl_held) +{ + struct fib_rule *rule = NULL, *nlrule = NULL; struct fib_rule_hdr *frh = nlmsg_data(nlh); struct fib_rules_ops *ops = NULL; - struct fib_rule *rule = NULL, *r, *nlrule = NULL; struct nlattr *tb[FRA_MAX+1]; - int err = -EINVAL; bool user_priority = false; + int err = -EINVAL; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) { NL_SET_ERR_MSG(extack, "Invalid msg length"); @@ -969,7 +973,7 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, * current if it is goto rule, have actually been added. */ if (ops->nr_goto_rules > 0) { - struct fib_rule *n; + struct fib_rule *n, *r; n = list_next_entry(rule, list); if (&n->list == &ops->rules_list || n->pref != rule->pref) @@ -998,7 +1002,13 @@ errout: rules_ops_put(ops); return err; } -EXPORT_SYMBOL_GPL(fib_nl_delrule); +EXPORT_SYMBOL_GPL(fib_delrule); + +static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + return fib_delrule(sock_net(skb->sk), skb, nlh, extack, true); +} static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, struct fib_rule *rule) -- cgit v1.2.3 From be22179cfb2fb1164004b70b33a4bdf67e6dd349 Mon Sep 17 00:00:00 2001 From: Alexander Wetzel Date: Tue, 4 Feb 2025 12:13:51 +0100 Subject: wifi: nl80211/cfg80211: Stop supporting cooked monitor Unconditionally start to refuse creating cooked monitor interfaces to phase them out. There is no feature flag for drivers to opt-in for cooked monitor and all known users are using/preferring the modern API since the hostapd release 1.0 in May 2012. Signed-off-by: Alexander Wetzel Link: https://patch.msgid.link/20250204111352.7004-1-Alexander@wetzel-home.de Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 +- include/uapi/linux/nl80211.h | 4 ++-- net/wireless/nl80211.c | 4 ++++ 3 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 363d7dd2255a..a72e7eb7027f 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2265,7 +2265,7 @@ static inline int cfg80211_get_station(struct net_device *dev, * @MONITOR_FLAG_PLCPFAIL: pass frames with bad PLCP * @MONITOR_FLAG_CONTROL: pass control frames * @MONITOR_FLAG_OTHER_BSS: disable BSSID filtering - * @MONITOR_FLAG_COOK_FRAMES: report frames after processing + * @MONITOR_FLAG_COOK_FRAMES: deprecated, will unconditionally be refused * @MONITOR_FLAG_ACTIVE: active monitor, ACKs frames on its MAC address * @MONITOR_FLAG_SKIP_TX: do not pass locally transmitted frames */ diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index f6c1b181c886..9d8ecf20ef0d 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4727,8 +4727,8 @@ enum nl80211_survey_info { * @NL80211_MNTR_FLAG_PLCPFAIL: pass frames with bad PLCP * @NL80211_MNTR_FLAG_CONTROL: pass control frames * @NL80211_MNTR_FLAG_OTHER_BSS: disable BSSID filtering - * @NL80211_MNTR_FLAG_COOK_FRAMES: report frames after processing. - * overrides all other flags. + * @NL80211_MNTR_FLAG_COOK_FRAMES: deprecated + * will unconditionally be refused * @NL80211_MNTR_FLAG_ACTIVE: use the configured MAC address * and ACK incoming unicast packets. * @NL80211_MNTR_FLAG_SKIP_TX: do not pass local tx packets diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index d7d3da0f6833..8bd09110d393 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -4245,6 +4245,10 @@ static int nl80211_parse_mon_options(struct cfg80211_registered_device *rdev, change = true; } + /* MONITOR_FLAG_COOK_FRAMES is deprecated, refuse cooperation */ + if (params->flags & MONITOR_FLAG_COOK_FRAMES) + return -EOPNOTSUPP; + if (params->flags & MONITOR_FLAG_ACTIVE && !(rdev->wiphy.features & NL80211_FEATURE_ACTIVE_MONITOR)) return -EOPNOTSUPP; -- cgit v1.2.3 From 286e696770654d79b34bd15953e7101a1c4784c7 Mon Sep 17 00:00:00 2001 From: Alexander Wetzel Date: Tue, 4 Feb 2025 12:13:52 +0100 Subject: wifi: mac80211: Drop cooked monitor support Hostapd switched from cooked monitor interfaces to nl80211 Dec 2011. Drop support for the outdated cooked monitor interfaces and fix creating the virtual monitor interfaces in the following cases: 1) We have one non-monitor and one monitor interface with %MONITOR_FLAG_ACTIVE enabled and then delete the non-monitor interface. 2) We only have monitor interfaces enabled on resume while at least one has %MONITOR_FLAG_ACTIVE set. Signed-off-by: Alexander Wetzel Link: https://patch.msgid.link/20250204111352.7004-2-Alexander@wetzel-home.de Signed-off-by: Johannes Berg --- include/net/dropreason.h | 6 -- net/mac80211/cfg.c | 9 +-- net/mac80211/drop.h | 21 ++--- net/mac80211/ieee80211_i.h | 11 +-- net/mac80211/iface.c | 50 +++++------- net/mac80211/main.c | 16 +--- net/mac80211/rx.c | 194 ++++++++++++--------------------------------- net/mac80211/status.c | 34 ++------ net/mac80211/tx.c | 2 +- 9 files changed, 94 insertions(+), 249 deletions(-) (limited to 'include/net') diff --git a/include/net/dropreason.h b/include/net/dropreason.h index 56cb7be92244..7d3b1a2a6fec 100644 --- a/include/net/dropreason.h +++ b/include/net/dropreason.h @@ -17,12 +17,6 @@ enum skb_drop_reason_subsys { */ SKB_DROP_REASON_SUBSYS_MAC80211_UNUSABLE, - /** - * @SKB_DROP_REASON_SUBSYS_MAC80211_MONITOR: mac80211 drop reasons - * for frames still going to monitor, see net/mac80211/drop.h - */ - SKB_DROP_REASON_SUBSYS_MAC80211_MONITOR, - /** * @SKB_DROP_REASON_SUBSYS_OPENVSWITCH: openvswitch drop reasons, * see net/openvswitch/drop.h diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 9351c64608a9..88949b90f117 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -89,15 +89,14 @@ static int ieee80211_set_mon_options(struct ieee80211_sub_if_data *sdata, /* check flags first */ if (params->flags && ieee80211_sdata_running(sdata)) { - u32 mask = MONITOR_FLAG_COOK_FRAMES | MONITOR_FLAG_ACTIVE; + u32 mask = MONITOR_FLAG_ACTIVE; /* - * Prohibit MONITOR_FLAG_COOK_FRAMES and - * MONITOR_FLAG_ACTIVE to be changed while the - * interface is up. + * Prohibit MONITOR_FLAG_ACTIVE to be changed + * while the interface is up. * Else we would need to add a lot of cruft * to update everything: - * cooked_mntrs, monitor and all fif_* counters + * monitor and all fif_* counters * reconfigure hardware */ if ((params->flags & mask) != (sdata->u.mntr.flags & mask)) diff --git a/net/mac80211/drop.h b/net/mac80211/drop.h index 59e3ec4dc960..eb9ab310f91c 100644 --- a/net/mac80211/drop.h +++ b/net/mac80211/drop.h @@ -11,12 +11,6 @@ typedef unsigned int __bitwise ieee80211_rx_result; -#define MAC80211_DROP_REASONS_MONITOR(R) \ - R(RX_DROP_M_UNEXPECTED_4ADDR_FRAME) \ - R(RX_DROP_M_BAD_BCN_KEYIDX) \ - R(RX_DROP_M_BAD_MGMT_KEYIDX) \ -/* this line for the trailing \ - add before this */ - #define MAC80211_DROP_REASONS_UNUSABLE(R) \ /* 0x00 == ___RX_DROP_UNUSABLE */ \ R(RX_DROP_U_MIC_FAIL) \ @@ -66,6 +60,10 @@ typedef unsigned int __bitwise ieee80211_rx_result; R(RX_DROP_U_UNEXPECTED_STA_4ADDR) \ R(RX_DROP_U_UNEXPECTED_VLAN_MCAST) \ R(RX_DROP_U_NOT_PORT_CONTROL) \ + R(RX_DROP_U_UNEXPECTED_4ADDR_FRAME) \ + R(RX_DROP_U_BAD_BCN_KEYIDX) \ + /* 0x30 */ \ + R(RX_DROP_U_BAD_MGMT_KEYIDX) \ R(RX_DROP_U_UNKNOWN_ACTION_REJECTED) \ /* this line for the trailing \ - add before this */ @@ -78,10 +76,6 @@ enum ___mac80211_drop_reason { ___RX_QUEUED = SKB_NOT_DROPPED_YET, #define ENUM(x) ___ ## x, - ___RX_DROP_MONITOR = SKB_DROP_REASON_SUBSYS_MAC80211_MONITOR << - SKB_DROP_REASON_SUBSYS_SHIFT, - MAC80211_DROP_REASONS_MONITOR(ENUM) - ___RX_DROP_UNUSABLE = SKB_DROP_REASON_SUBSYS_MAC80211_UNUSABLE << SKB_DROP_REASON_SUBSYS_SHIFT, MAC80211_DROP_REASONS_UNUSABLE(ENUM) @@ -89,11 +83,10 @@ enum ___mac80211_drop_reason { }; enum mac80211_drop_reason { - RX_CONTINUE = (__force ieee80211_rx_result)___RX_CONTINUE, - RX_QUEUED = (__force ieee80211_rx_result)___RX_QUEUED, - RX_DROP_MONITOR = (__force ieee80211_rx_result)___RX_DROP_MONITOR, + RX_CONTINUE = (__force ieee80211_rx_result)___RX_CONTINUE, + RX_QUEUED = (__force ieee80211_rx_result)___RX_QUEUED, + RX_DROP = (__force ieee80211_rx_result)___RX_DROP_UNUSABLE, #define DEF(x) x = (__force ieee80211_rx_result)___ ## x, - MAC80211_DROP_REASONS_MONITOR(DEF) MAC80211_DROP_REASONS_UNUSABLE(DEF) #undef DEF }; diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index e7dc3f0cfc9a..a90a44aa5758 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -200,7 +200,6 @@ enum ieee80211_packet_rx_flags { /** * enum ieee80211_rx_flags - RX data flags * - * @IEEE80211_RX_CMNTR: received on cooked monitor already * @IEEE80211_RX_BEACON_REPORTED: This frame was already reported * to cfg80211_report_obss_beacon(). * @@ -208,8 +207,7 @@ enum ieee80211_packet_rx_flags { * for a single frame. */ enum ieee80211_rx_flags { - IEEE80211_RX_CMNTR = BIT(0), - IEEE80211_RX_BEACON_REPORTED = BIT(1), + IEEE80211_RX_BEACON_REPORTED = BIT(0), }; struct ieee80211_rx_data { @@ -1380,7 +1378,7 @@ struct ieee80211_local { spinlock_t queue_stop_reason_lock; int open_count; - int monitors, cooked_mntrs, tx_mntrs; + int monitors, tx_mntrs; /* number of interfaces with corresponding FIF_ flags */ int fif_fcsfail, fif_plcpfail, fif_control, fif_other_bss, fif_pspoll, fif_probe_req; @@ -1492,7 +1490,7 @@ struct ieee80211_local { /* see iface.c */ struct list_head interfaces; - struct list_head mon_list; /* only that are IFF_UP && !cooked */ + struct list_head mon_list; /* only that are IFF_UP */ struct mutex iflist_mtx; /* Scanning and BSS list */ @@ -2090,8 +2088,7 @@ struct sk_buff * ieee80211_build_data_template(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, u32 info_flags); void ieee80211_tx_monitor(struct ieee80211_local *local, struct sk_buff *skb, - int retry_count, bool send_to_cooked, - struct ieee80211_tx_status *status); + int retry_count, struct ieee80211_tx_status *status); void ieee80211_check_fast_xmit(struct sta_info *sta); void ieee80211_check_fast_xmit_all(struct ieee80211_local *local); diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 0ea7e77860b7..7d3ebfcb8c2b 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -483,8 +483,6 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do ieee80211_ibss_stop(sdata); break; case NL80211_IFTYPE_MONITOR: - if (sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES) - break; list_del_rcu(&sdata->u.mntr.list); break; default: @@ -584,18 +582,17 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do /* no need to tell driver */ break; case NL80211_IFTYPE_MONITOR: - if (sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES) { - local->cooked_mntrs--; - break; - } + if (!(sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) && + !ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) { - local->monitors--; - if (local->monitors == 0) { - local->hw.conf.flags &= ~IEEE80211_CONF_MONITOR; - hw_reconf_flags |= IEEE80211_CONF_CHANGE_MONITOR; - } + local->monitors--; + if (local->monitors == 0) { + local->hw.conf.flags &= ~IEEE80211_CONF_MONITOR; + hw_reconf_flags |= IEEE80211_CONF_CHANGE_MONITOR; + } - ieee80211_adjust_monitor_flags(sdata, -1); + ieee80211_adjust_monitor_flags(sdata, -1); + } break; case NL80211_IFTYPE_NAN: /* clean all the functions */ @@ -1326,27 +1323,24 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) } break; case NL80211_IFTYPE_MONITOR: - if (sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES) { - local->cooked_mntrs++; - break; - } - if ((sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) || ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) { res = drv_add_interface(local, sdata); if (res) goto err_stop; - } else if (local->monitors == 0 && local->open_count == 0) { - res = ieee80211_add_virtual_monitor(local); - if (res) - goto err_stop; - } + } else { + if (local->monitors == 0 && local->open_count == 0) { + res = ieee80211_add_virtual_monitor(local); + if (res) + goto err_stop; + } + local->monitors++; - /* must be before the call to ieee80211_configure_filter */ - local->monitors++; - if (local->monitors == 1) { - local->hw.conf.flags |= IEEE80211_CONF_MONITOR; - hw_reconf_flags |= IEEE80211_CONF_CHANGE_MONITOR; + /* must be before the call to ieee80211_configure_filter */ + if (local->monitors == 1) { + local->hw.conf.flags |= IEEE80211_CONF_MONITOR; + hw_reconf_flags |= IEEE80211_CONF_CHANGE_MONITOR; + } } ieee80211_adjust_monitor_flags(sdata, 1); @@ -1423,8 +1417,6 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) rcu_assign_pointer(local->p2p_sdata, sdata); break; case NL80211_IFTYPE_MONITOR: - if (sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES) - break; list_add_tail_rcu(&sdata->u.mntr.list, &local->mon_list); break; default: diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 53e5aee46885..741e6c7edcb7 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -1744,18 +1744,7 @@ void ieee80211_free_hw(struct ieee80211_hw *hw) wiphy_free(local->hw.wiphy); } EXPORT_SYMBOL(ieee80211_free_hw); - -static const char * const drop_reasons_monitor[] = { -#define V(x) #x, - [0] = "RX_DROP_MONITOR", - MAC80211_DROP_REASONS_MONITOR(V) -}; - -static struct drop_reason_list drop_reason_list_monitor = { - .reasons = drop_reasons_monitor, - .n_reasons = ARRAY_SIZE(drop_reasons_monitor), -}; - +#define V(x) #x, static const char * const drop_reasons_unusable[] = { [0] = "RX_DROP_UNUSABLE", MAC80211_DROP_REASONS_UNUSABLE(V) @@ -1784,8 +1773,6 @@ static int __init ieee80211_init(void) if (ret) goto err_netdev; - drop_reasons_register_subsys(SKB_DROP_REASON_SUBSYS_MAC80211_MONITOR, - &drop_reason_list_monitor); drop_reasons_register_subsys(SKB_DROP_REASON_SUBSYS_MAC80211_UNUSABLE, &drop_reason_list_unusable); @@ -1804,7 +1791,6 @@ static void __exit ieee80211_exit(void) ieee80211_iface_exit(); - drop_reasons_unregister_subsys(SKB_DROP_REASON_SUBSYS_MAC80211_MONITOR); drop_reasons_unregister_subsys(SKB_DROP_REASON_SUBSYS_MAC80211_UNUSABLE); rcu_barrier(); diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 1e28efe4203c..d33970009e00 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -1045,14 +1045,14 @@ static ieee80211_rx_result ieee80211_rx_mesh_check(struct ieee80211_rx_data *rx) if (is_multicast_ether_addr(hdr->addr1)) { if (ieee80211_has_tods(hdr->frame_control) || !ieee80211_has_fromds(hdr->frame_control)) - return RX_DROP_MONITOR; + return RX_DROP; if (ether_addr_equal(hdr->addr3, dev_addr)) - return RX_DROP_MONITOR; + return RX_DROP; } else { if (!ieee80211_has_a4(hdr->frame_control)) - return RX_DROP_MONITOR; + return RX_DROP; if (ether_addr_equal(hdr->addr4, dev_addr)) - return RX_DROP_MONITOR; + return RX_DROP; } } @@ -1064,20 +1064,20 @@ static ieee80211_rx_result ieee80211_rx_mesh_check(struct ieee80211_rx_data *rx) struct ieee80211_mgmt *mgmt; if (!ieee80211_is_mgmt(hdr->frame_control)) - return RX_DROP_MONITOR; + return RX_DROP; if (ieee80211_is_action(hdr->frame_control)) { u8 category; /* make sure category field is present */ if (rx->skb->len < IEEE80211_MIN_ACTION_SIZE) - return RX_DROP_MONITOR; + return RX_DROP; mgmt = (struct ieee80211_mgmt *)hdr; category = mgmt->u.action.category; if (category != WLAN_CATEGORY_MESH_ACTION && category != WLAN_CATEGORY_SELF_PROTECTED) - return RX_DROP_MONITOR; + return RX_DROP; return RX_CONTINUE; } @@ -1087,7 +1087,7 @@ static ieee80211_rx_result ieee80211_rx_mesh_check(struct ieee80211_rx_data *rx) ieee80211_is_auth(hdr->frame_control)) return RX_CONTINUE; - return RX_DROP_MONITOR; + return RX_DROP; } return RX_CONTINUE; @@ -1513,7 +1513,7 @@ ieee80211_rx_h_check(struct ieee80211_rx_data *rx) hdrlen = ieee80211_hdrlen(hdr->frame_control); if (rx->skb->len < hdrlen + 8) - return RX_DROP_MONITOR; + return RX_DROP; skb_copy_bits(rx->skb, hdrlen + 6, ðertype, 2); if (ethertype == rx->sdata->control_port_protocol) @@ -1526,7 +1526,7 @@ ieee80211_rx_h_check(struct ieee80211_rx_data *rx) GFP_ATOMIC)) return RX_DROP_U_SPURIOUS; - return RX_DROP_MONITOR; + return RX_DROP; } return RX_CONTINUE; @@ -1862,7 +1862,7 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx) cfg80211_rx_unexpected_4addr_frame( rx->sdata->dev, sta->sta.addr, GFP_ATOMIC); - return RX_DROP_M_UNEXPECTED_4ADDR_FRAME; + return RX_DROP_U_UNEXPECTED_4ADDR_FRAME; } /* * Update counter and free packet here to avoid @@ -1997,7 +1997,7 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) cfg80211_rx_unprot_mlme_mgmt(rx->sdata->dev, skb->data, skb->len); - return RX_DROP_M_BAD_BCN_KEYIDX; + return RX_DROP_U_BAD_BCN_KEYIDX; } rx->key = ieee80211_rx_get_bigtk(rx, mmie_keyidx); @@ -2011,11 +2011,11 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) if (mmie_keyidx < NUM_DEFAULT_KEYS || mmie_keyidx >= NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS) - return RX_DROP_M_BAD_MGMT_KEYIDX; /* unexpected BIP keyidx */ + return RX_DROP_U_BAD_MGMT_KEYIDX; /* unexpected BIP keyidx */ if (rx->link_sta) { if (ieee80211_is_group_privacy_action(skb) && test_sta_flag(rx->sta, WLAN_STA_MFP)) - return RX_DROP_MONITOR; + return RX_DROP; rx->key = rcu_dereference(rx->link_sta->gtk[mmie_keyidx]); } @@ -2100,11 +2100,11 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) if (rx->key) { if (unlikely(rx->key->flags & KEY_FLAG_TAINTED)) - return RX_DROP_MONITOR; + return RX_DROP; /* TODO: add threshold stuff again */ } else { - return RX_DROP_MONITOR; + return RX_DROP; } switch (rx->key->conf.cipher) { @@ -2278,7 +2278,7 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx) goto out; if (is_multicast_ether_addr(hdr->addr1)) - return RX_DROP_MONITOR; + return RX_DROP; I802_DEBUG_INC(rx->local->rx_handlers_fragments); @@ -2333,7 +2333,7 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx) rx->seqno_idx, hdr); if (!entry) { I802_DEBUG_INC(rx->local->rx_handlers_drop_defrag); - return RX_DROP_MONITOR; + return RX_DROP; } /* "The receiver shall discard MSDUs and MMPDUs whose constituent @@ -2855,25 +2855,25 @@ ieee80211_rx_mesh_data(struct ieee80211_sub_if_data *sdata, struct sta_info *sta return RX_CONTINUE; if (!pskb_may_pull(skb, sizeof(*eth) + 6)) - return RX_DROP_MONITOR; + return RX_DROP; mesh_hdr = (struct ieee80211s_hdr *)(skb->data + sizeof(*eth)); mesh_hdrlen = ieee80211_get_mesh_hdrlen(mesh_hdr); if (!pskb_may_pull(skb, sizeof(*eth) + mesh_hdrlen)) - return RX_DROP_MONITOR; + return RX_DROP; eth = (struct ethhdr *)skb->data; multicast = is_multicast_ether_addr(eth->h_dest); mesh_hdr = (struct ieee80211s_hdr *)(eth + 1); if (!mesh_hdr->ttl) - return RX_DROP_MONITOR; + return RX_DROP; /* frame is in RMC, don't forward */ if (is_multicast_ether_addr(eth->h_dest) && mesh_rmc_check(sdata, eth->h_source, mesh_hdr)) - return RX_DROP_MONITOR; + return RX_DROP; /* forward packet */ if (sdata->crypto_tx_tailroom_needed_cnt) @@ -2890,7 +2890,7 @@ ieee80211_rx_mesh_data(struct ieee80211_sub_if_data *sdata, struct sta_info *sta /* has_a4 already checked in ieee80211_rx_mesh_check */ proxied_addr = mesh_hdr->eaddr2; else - return RX_DROP_MONITOR; + return RX_DROP; rcu_read_lock(); mppath = mpp_path_lookup(sdata, proxied_addr); @@ -2922,14 +2922,14 @@ ieee80211_rx_mesh_data(struct ieee80211_sub_if_data *sdata, struct sta_info *sta goto rx_accept; IEEE80211_IFSTA_MESH_CTR_INC(ifmsh, dropped_frames_ttl); - return RX_DROP_MONITOR; + return RX_DROP; } if (!ifmsh->mshcfg.dot11MeshForwarding) { if (is_multicast_ether_addr(eth->h_dest)) goto rx_accept; - return RX_DROP_MONITOR; + return RX_DROP; } skb_set_queue_mapping(skb, ieee802_1d_to_ac[skb->priority]); @@ -3122,7 +3122,7 @@ ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx) return RX_CONTINUE; if (unlikely(!ieee80211_is_data_present(fc))) - return RX_DROP_MONITOR; + return RX_DROP; if (unlikely(ieee80211_has_a4(hdr->frame_control))) { switch (rx->sdata->vif.type) { @@ -3179,19 +3179,16 @@ ieee80211_rx_h_data(struct ieee80211_rx_data *rx) return RX_CONTINUE; if (unlikely(!ieee80211_is_data_present(hdr->frame_control))) - return RX_DROP_MONITOR; + return RX_DROP; - /* - * Send unexpected-4addr-frame event to hostapd. For older versions, - * also drop the frame to cooked monitor interfaces. - */ + /* Send unexpected-4addr-frame event to hostapd */ if (ieee80211_has_a4(hdr->frame_control) && sdata->vif.type == NL80211_IFTYPE_AP) { if (rx->sta && !test_and_set_sta_flag(rx->sta, WLAN_STA_4ADDR_EVENT)) cfg80211_rx_unexpected_4addr_frame( rx->sdata->dev, rx->sta->sta.addr, GFP_ATOMIC); - return RX_DROP_MONITOR; + return RX_DROP; } res = __ieee80211_data_to_8023(rx, &port_control); @@ -3203,7 +3200,7 @@ ieee80211_rx_h_data(struct ieee80211_rx_data *rx) return res; if (!ieee80211_frame_allowed(rx, fc)) - return RX_DROP_MONITOR; + return RX_DROP; /* directly handle TDLS channel switch requests/responses */ if (unlikely(((struct ethhdr *)rx->skb->data)->h_proto == @@ -3268,11 +3265,11 @@ ieee80211_rx_h_ctrl(struct ieee80211_rx_data *rx, struct sk_buff_head *frames) }; if (!rx->sta) - return RX_DROP_MONITOR; + return RX_DROP; if (skb_copy_bits(skb, offsetof(struct ieee80211_bar, control), &bar_data, sizeof(bar_data))) - return RX_DROP_MONITOR; + return RX_DROP; tid = le16_to_cpu(bar_data.control) >> 12; @@ -3284,7 +3281,7 @@ ieee80211_rx_h_ctrl(struct ieee80211_rx_data *rx, struct sk_buff_head *frames) tid_agg_rx = rcu_dereference(rx->sta->ampdu_mlme.tid_rx[tid]); if (!tid_agg_rx) - return RX_DROP_MONITOR; + return RX_DROP; start_seq_num = le16_to_cpu(bar_data.start_seq_num) >> 4; event.u.ba.tid = tid; @@ -3308,12 +3305,7 @@ ieee80211_rx_h_ctrl(struct ieee80211_rx_data *rx, struct sk_buff_head *frames) return RX_QUEUED; } - /* - * After this point, we only want management frames, - * so we can drop all remaining control frames to - * cooked monitor interfaces. - */ - return RX_DROP_MONITOR; + return RX_DROP; } static void ieee80211_process_sa_query_req(struct ieee80211_sub_if_data *sdata, @@ -3422,10 +3414,10 @@ ieee80211_rx_h_mgmt_check(struct ieee80211_rx_data *rx) * and unknown (reserved) frames are useless. */ if (rx->skb->len < 24) - return RX_DROP_MONITOR; + return RX_DROP; if (!ieee80211_is_mgmt(mgmt->frame_control)) - return RX_DROP_MONITOR; + return RX_DROP; /* drop too small action frames */ if (ieee80211_is_action(mgmt->frame_control) && @@ -3951,17 +3943,16 @@ ieee80211_rx_h_action_return(struct ieee80211_rx_data *rx) * ones. For all other modes we will return them to the sender, * setting the 0x80 bit in the action category, as required by * 802.11-2012 9.24.4. - * Newer versions of hostapd shall also use the management frame - * registration mechanisms, but older ones still use cooked - * monitor interfaces so push all frames there. + * Newer versions of hostapd use the management frame registration + * mechanisms and old cooked monitor interface is no longer supported. */ if (!(status->rx_flags & IEEE80211_RX_MALFORMED_ACTION_FRM) && (sdata->vif.type == NL80211_IFTYPE_AP || sdata->vif.type == NL80211_IFTYPE_AP_VLAN)) - return RX_DROP_MONITOR; + return RX_DROP; if (is_multicast_ether_addr(mgmt->da)) - return RX_DROP_MONITOR; + return RX_DROP; /* do not return rejected action frames */ if (mgmt->u.action.category & 0x80) @@ -4006,7 +3997,7 @@ ieee80211_rx_h_ext(struct ieee80211_rx_data *rx) return RX_CONTINUE; if (sdata->vif.type != NL80211_IFTYPE_STATION) - return RX_DROP_MONITOR; + return RX_DROP; /* for now only beacons are ext, so queue them */ ieee80211_queue_skb_to_iface(sdata, rx->link_id, rx->sta, rx->skb); @@ -4027,7 +4018,7 @@ ieee80211_rx_h_mgmt(struct ieee80211_rx_data *rx) sdata->vif.type != NL80211_IFTYPE_ADHOC && sdata->vif.type != NL80211_IFTYPE_OCB && sdata->vif.type != NL80211_IFTYPE_STATION) - return RX_DROP_MONITOR; + return RX_DROP; switch (stype) { case cpu_to_le16(IEEE80211_STYPE_AUTH): @@ -4038,32 +4029,32 @@ ieee80211_rx_h_mgmt(struct ieee80211_rx_data *rx) case cpu_to_le16(IEEE80211_STYPE_DEAUTH): if (is_multicast_ether_addr(mgmt->da) && !is_broadcast_ether_addr(mgmt->da)) - return RX_DROP_MONITOR; + return RX_DROP; /* process only for station/IBSS */ if (sdata->vif.type != NL80211_IFTYPE_STATION && sdata->vif.type != NL80211_IFTYPE_ADHOC) - return RX_DROP_MONITOR; + return RX_DROP; break; case cpu_to_le16(IEEE80211_STYPE_ASSOC_RESP): case cpu_to_le16(IEEE80211_STYPE_REASSOC_RESP): case cpu_to_le16(IEEE80211_STYPE_DISASSOC): if (is_multicast_ether_addr(mgmt->da) && !is_broadcast_ether_addr(mgmt->da)) - return RX_DROP_MONITOR; + return RX_DROP; /* process only for station */ if (sdata->vif.type != NL80211_IFTYPE_STATION) - return RX_DROP_MONITOR; + return RX_DROP; break; case cpu_to_le16(IEEE80211_STYPE_PROBE_REQ): /* process only for ibss and mesh */ if (sdata->vif.type != NL80211_IFTYPE_ADHOC && sdata->vif.type != NL80211_IFTYPE_MESH_POINT) - return RX_DROP_MONITOR; + return RX_DROP; break; default: - return RX_DROP_MONITOR; + return RX_DROP; } ieee80211_queue_skb_to_iface(sdata, rx->link_id, rx->sta, rx->skb); @@ -4071,82 +4062,9 @@ ieee80211_rx_h_mgmt(struct ieee80211_rx_data *rx) return RX_QUEUED; } -static void ieee80211_rx_cooked_monitor(struct ieee80211_rx_data *rx, - struct ieee80211_rate *rate, - ieee80211_rx_result reason) -{ - struct ieee80211_sub_if_data *sdata; - struct ieee80211_local *local = rx->local; - struct sk_buff *skb = rx->skb, *skb2; - struct net_device *prev_dev = NULL; - struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); - int needed_headroom; - - /* - * If cooked monitor has been processed already, then - * don't do it again. If not, set the flag. - */ - if (rx->flags & IEEE80211_RX_CMNTR) - goto out_free_skb; - rx->flags |= IEEE80211_RX_CMNTR; - - /* If there are no cooked monitor interfaces, just free the SKB */ - if (!local->cooked_mntrs) - goto out_free_skb; - - /* room for the radiotap header based on driver features */ - needed_headroom = ieee80211_rx_radiotap_hdrlen(local, status, skb); - - if (skb_headroom(skb) < needed_headroom && - pskb_expand_head(skb, needed_headroom, 0, GFP_ATOMIC)) - goto out_free_skb; - - /* prepend radiotap information */ - ieee80211_add_rx_radiotap_header(local, skb, rate, needed_headroom, - false); - - skb_reset_mac_header(skb); - skb->ip_summed = CHECKSUM_UNNECESSARY; - skb->pkt_type = PACKET_OTHERHOST; - skb->protocol = htons(ETH_P_802_2); - - list_for_each_entry_rcu(sdata, &local->interfaces, list) { - if (!ieee80211_sdata_running(sdata)) - continue; - - if (sdata->vif.type != NL80211_IFTYPE_MONITOR || - !(sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES)) - continue; - - if (prev_dev) { - skb2 = skb_clone(skb, GFP_ATOMIC); - if (skb2) { - skb2->dev = prev_dev; - netif_receive_skb(skb2); - } - } - - prev_dev = sdata->dev; - dev_sw_netstats_rx_add(sdata->dev, skb->len); - } - - if (prev_dev) { - skb->dev = prev_dev; - netif_receive_skb(skb); - return; - } - - out_free_skb: - kfree_skb_reason(skb, (__force u32)reason); -} - static void ieee80211_rx_handlers_result(struct ieee80211_rx_data *rx, ieee80211_rx_result res) { - struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb); - struct ieee80211_supported_band *sband; - struct ieee80211_rate *rate = NULL; - if (res == RX_QUEUED) { I802_DEBUG_INC(rx->sdata->local->rx_handlers_queued); return; @@ -4158,23 +4076,13 @@ static void ieee80211_rx_handlers_result(struct ieee80211_rx_data *rx, rx->link_sta->rx_stats.dropped++; } - if (u32_get_bits((__force u32)res, SKB_DROP_REASON_SUBSYS_MASK) == - SKB_DROP_REASON_SUBSYS_MAC80211_UNUSABLE) { - kfree_skb_reason(rx->skb, (__force u32)res); - return; - } - - sband = rx->local->hw.wiphy->bands[status->band]; - if (status->encoding == RX_ENC_LEGACY) - rate = &sband->bitrates[status->rate_idx]; - - ieee80211_rx_cooked_monitor(rx, rate, res); + kfree_skb_reason(rx->skb, (__force u32)res); } static void ieee80211_rx_handlers(struct ieee80211_rx_data *rx, struct sk_buff_head *frames) { - ieee80211_rx_result res = RX_DROP_MONITOR; + ieee80211_rx_result res = RX_DROP; struct sk_buff *skb; #define CALL_RXH(rxh) \ @@ -4238,7 +4146,7 @@ static void ieee80211_rx_handlers(struct ieee80211_rx_data *rx, static void ieee80211_invoke_rx_handlers(struct ieee80211_rx_data *rx) { struct sk_buff_head reorder_release; - ieee80211_rx_result res = RX_DROP_MONITOR; + ieee80211_rx_result res = RX_DROP; __skb_queue_head_init(&reorder_release); diff --git a/net/mac80211/status.c b/net/mac80211/status.c index 5f28f3633fa0..b17b3cc7fb90 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -895,8 +895,7 @@ static int ieee80211_tx_get_rates(struct ieee80211_hw *hw, } void ieee80211_tx_monitor(struct ieee80211_local *local, struct sk_buff *skb, - int retry_count, bool send_to_cooked, - struct ieee80211_tx_status *status) + int retry_count, struct ieee80211_tx_status *status) { struct sk_buff *skb2; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); @@ -930,10 +929,6 @@ void ieee80211_tx_monitor(struct ieee80211_local *local, struct sk_buff *skb, if (sdata->u.mntr.flags & MONITOR_FLAG_SKIP_TX) continue; - if ((sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES) && - !send_to_cooked) - continue; - if (prev_dev) { skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) { @@ -964,7 +959,6 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw, struct ieee80211_tx_info *info = status->info; struct sta_info *sta; __le16 fc; - bool send_to_cooked; bool acked; bool noack_success; struct ieee80211_bar *bar; @@ -1091,28 +1085,10 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw, ieee80211_report_used_skb(local, skb, false, status->ack_hwtstamp); - /* this was a transmitted frame, but now we want to reuse it */ - skb_orphan(skb); - - /* Need to make a copy before skb->cb gets cleared */ - send_to_cooked = !!(info->flags & IEEE80211_TX_CTL_INJECTED) || - !(ieee80211_is_data(fc)); - - /* - * This is a bit racy but we can avoid a lot of work - * with this test... - */ - if (!local->tx_mntrs && (!send_to_cooked || !local->cooked_mntrs)) { - if (status->free_list) - list_add_tail(&skb->list, status->free_list); - else - dev_kfree_skb(skb); - return; - } - - /* send to monitor interfaces */ - ieee80211_tx_monitor(local, skb, retry_count, - send_to_cooked, status); + if (status->free_list) + list_add_tail(&skb->list, status->free_list); + else + dev_kfree_skb(skb); } void ieee80211_tx_status_skb(struct ieee80211_hw *hw, struct sk_buff *skb) diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index a24636bda679..1289df373795 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -5617,7 +5617,7 @@ struct sk_buff *ieee80211_beacon_get_tim(struct ieee80211_hw *hw, if (!copy) return bcn; - ieee80211_tx_monitor(hw_to_local(hw), copy, 1, false, NULL); + ieee80211_tx_monitor(hw_to_local(hw), copy, 1, NULL); return bcn; } -- cgit v1.2.3 From de86c5f60839dc0d771711a848b4f55ad3f90844 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Wed, 5 Feb 2025 11:39:13 +0200 Subject: wifi: mac80211: Add support for EPCS configuration Add support for configuring EPCS state: - When EPCS is enabled, send an EPCS enable request action frame to the AP. When the AP replies with EPCS enable response, enable EPCS by applying the QoS parameters provided by the AP. Do so for all the valid MLD links. Once EPCS is enabled, support processing of unsolicited EPCS enable response frames. - When EPCS is disabled, send an EPCS teardown request to the AP and apply the QoS parameters as obtained from the last received beacons. Do so for all the valid links. Signed-off-by: Ilan Peer Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250205110958.7a90afd7e140.I3f602d65f5c1fd849d6c70b12307dda33aa91ccb@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 7 ++ include/net/mac80211.h | 3 +- net/mac80211/cfg.c | 9 ++ net/mac80211/ieee80211_i.h | 11 ++ net/mac80211/iface.c | 8 ++ net/mac80211/mlme.c | 291 ++++++++++++++++++++++++++++++++++++++++++++- net/mac80211/rx.c | 17 +++ 7 files changed, 344 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 16741e542e81..8f35a3a5211c 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1543,6 +1543,10 @@ struct ieee80211_mgmt { u8 count; u8 variable[]; } __packed ml_reconf_resp; + struct { + u8 action_code; + u8 variable[]; + } __packed epcs; } u; } __packed action; DECLARE_FLEX_ARRAY(u8, body); /* Generic frame body */ @@ -5570,6 +5574,9 @@ static inline bool ieee80211_mle_reconf_sta_prof_size_ok(const u8 *data, fixed + prof->sta_info_len - 1 <= len; } +#define IEEE80211_MLE_STA_EPCS_CONTROL_LINK_ID 0x000f +#define IEEE80211_EPCS_ENA_RESP_BODY_LEN 3 + static inline bool ieee80211_tid_to_link_map_size_ok(const u8 *data, size_t len) { const struct ieee80211_ttlm_elem *t2l = (const void *)data; diff --git a/include/net/mac80211.h b/include/net/mac80211.h index c3ed2fcff8b7..a3a0de4a5d63 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -702,6 +702,7 @@ struct ieee80211_parsed_tpe { * @tpe: transmit power envelope information * @pwr_reduction: power constraint of BSS. * @eht_support: does this BSS support EHT + * @epcs_support: does this BSS support EPCS * @csa_active: marks whether a channel switch is going on. * @mu_mimo_owner: indicates interface owns MU-MIMO capability * @chanctx_conf: The channel context this interface is assigned to, or %NULL @@ -823,7 +824,7 @@ struct ieee80211_bss_conf { u8 pwr_reduction; bool eht_support; - + bool epcs_support; bool csa_active; bool mu_mimo_owner; diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 88949b90f117..5785fe30adaa 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -5196,6 +5196,14 @@ ieee80211_assoc_ml_reconf(struct wiphy *wiphy, struct net_device *dev, return ieee80211_mgd_assoc_ml_reconf(sdata, add_links, rem_links); } +static int +ieee80211_set_epcs(struct wiphy *wiphy, struct net_device *dev, bool enable) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + + return ieee80211_mgd_set_epcs(sdata, enable); +} + const struct cfg80211_ops mac80211_config_ops = { .add_virtual_intf = ieee80211_add_iface, .del_virtual_intf = ieee80211_del_iface, @@ -5311,4 +5319,5 @@ const struct cfg80211_ops mac80211_config_ops = { .set_ttlm = ieee80211_set_ttlm, .get_radio_mask = ieee80211_get_radio_mask, .assoc_ml_reconf = ieee80211_assoc_ml_reconf, + .set_epcs = ieee80211_set_epcs, }; diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index a90a44aa5758..b6c769fc9abf 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -613,6 +613,12 @@ struct ieee80211_if_managed { u16 added_links; u8 dialog_token; } reconf; + + /* Support for epcs */ + struct { + bool enabled; + u8 dialog_token; + } epcs; }; struct ieee80211_if_ibss { @@ -2775,6 +2781,11 @@ int ieee80211_req_neg_ttlm(struct ieee80211_sub_if_data *sdata, void ieee80211_check_wbrf_support(struct ieee80211_local *local); void ieee80211_add_wbrf(struct ieee80211_local *local, struct cfg80211_chan_def *chandef); void ieee80211_remove_wbrf(struct ieee80211_local *local, struct cfg80211_chan_def *chandef); +int ieee80211_mgd_set_epcs(struct ieee80211_sub_if_data *sdata, bool enable); +void ieee80211_process_epcs_ena_resp(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt, size_t len); +void ieee80211_process_epcs_teardown(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt, size_t len); int ieee80211_mgd_assoc_ml_reconf(struct ieee80211_sub_if_data *sdata, struct cfg80211_assoc_link *add_links, diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 7d3ebfcb8c2b..56fde5afe79a 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -1556,6 +1556,14 @@ static void ieee80211_iface_process_skb(struct ieee80211_local *local, ieee80211_process_ml_reconf_resp(sdata, mgmt, skb->len); break; + case WLAN_PROTECTED_EHT_ACTION_EPCS_ENABLE_RESP: + ieee80211_process_epcs_ena_resp(sdata, mgmt, + skb->len); + break; + case WLAN_PROTECTED_EHT_ACTION_EPCS_ENABLE_TEARDOWN: + ieee80211_process_epcs_teardown(sdata, mgmt, + skb->len); + break; default: break; } diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 6feb4ce2c90d..aaf84c52cd2d 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -3794,6 +3794,7 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, /* other links will be destroyed */ sdata->deflink.conf->bss = NULL; + sdata->deflink.conf->epcs_support = false; sdata->deflink.smps_mode = IEEE80211_SMPS_OFF; netif_carrier_off(sdata->dev); @@ -3980,6 +3981,9 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, * when the flow started. */ ieee80211_ml_reconf_reset(sdata); + + ifmgd->epcs.enabled = false; + ifmgd->epcs.dialog_token = 0; } static void ieee80211_reset_ap_probe(struct ieee80211_sub_if_data *sdata) @@ -4848,6 +4852,82 @@ static bool ieee80211_twt_bcast_support(struct ieee80211_sub_if_data *sdata, IEEE80211_HE_MAC_CAP2_BCAST_TWT); } +static void ieee80211_epcs_changed(struct ieee80211_sub_if_data *sdata, + bool enabled) +{ + /* in any case this is called, dialog token should be reset */ + sdata->u.mgd.epcs.dialog_token = 0; + + if (sdata->u.mgd.epcs.enabled == enabled) + return; + + sdata->u.mgd.epcs.enabled = enabled; + cfg80211_epcs_changed(sdata->dev, enabled); +} + +static void ieee80211_epcs_teardown(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_local *local = sdata->local; + u8 link_id; + + if (!sdata->u.mgd.epcs.enabled) + return; + + lockdep_assert_wiphy(local->hw.wiphy); + + for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { + struct ieee802_11_elems *elems; + struct ieee80211_link_data *link; + const struct cfg80211_bss_ies *ies; + bool ret; + + rcu_read_lock(); + + link = sdata_dereference(sdata->link[link_id], sdata); + if (!link || !link->conf || !link->conf->bss) { + rcu_read_unlock(); + continue; + } + + if (link->u.mgd.disable_wmm_tracking) { + rcu_read_unlock(); + ieee80211_set_wmm_default(link, false, false); + continue; + } + + ies = rcu_dereference(link->conf->bss->beacon_ies); + if (!ies) { + rcu_read_unlock(); + ieee80211_set_wmm_default(link, false, false); + continue; + } + + elems = ieee802_11_parse_elems(ies->data, ies->len, false, + NULL); + if (!elems) { + rcu_read_unlock(); + ieee80211_set_wmm_default(link, false, false); + continue; + } + + ret = _ieee80211_sta_wmm_params(local, link, + elems->wmm_param, + elems->wmm_param_len, + elems->mu_edca_param_set); + + kfree(elems); + rcu_read_unlock(); + + if (!ret) { + ieee80211_set_wmm_default(link, false, false); + continue; + } + + ieee80211_mgd_set_link_qos_params(link); + ieee80211_link_info_change_notify(sdata, link, BSS_CHANGED_QOS); + } +} + static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, struct link_sta_info *link_sta, struct cfg80211_bss *cbss, @@ -5121,14 +5201,27 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, link_sta); bss_conf->eht_support = link_sta->pub->eht_cap.has_eht; + bss_conf->epcs_support = bss_conf->eht_support && + !!(elems->eht_cap->fixed.mac_cap_info[0] & + IEEE80211_EHT_MAC_CAP0_EPCS_PRIO_ACCESS); + + /* EPCS might be already enabled but a new added link + * does not support EPCS. This should not really happen + * in practice. + */ + if (sdata->u.mgd.epcs.enabled && + !bss_conf->epcs_support) + ieee80211_epcs_teardown(sdata); } else { bss_conf->eht_support = false; + bss_conf->epcs_support = false; } } else { bss_conf->he_support = false; bss_conf->twt_requester = false; bss_conf->twt_protected = false; bss_conf->eht_support = false; + bss_conf->epcs_support = false; } bss_conf->twt_broadcast = @@ -7159,7 +7252,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link, ieee80211_mgd_update_bss_param_ch_cnt(sdata, bss_conf, elems); - if (!link->u.mgd.disable_wmm_tracking && + if (!sdata->u.mgd.epcs.enabled && + !link->u.mgd.disable_wmm_tracking && ieee80211_sta_wmm_params(local, link, elems->wmm_param, elems->wmm_param_len, elems->mu_edca_param_set)) @@ -10334,3 +10428,198 @@ int ieee80211_mgd_assoc_ml_reconf(struct ieee80211_sub_if_data *sdata, kfree(data); return err; } + +static bool ieee80211_mgd_epcs_supp(struct ieee80211_sub_if_data *sdata) +{ + unsigned long valid_links = sdata->vif.valid_links; + u8 link_id; + + lockdep_assert_wiphy(sdata->local->hw.wiphy); + + if (!ieee80211_vif_is_mld(&sdata->vif)) + return false; + + for_each_set_bit(link_id, &valid_links, IEEE80211_MLD_MAX_NUM_LINKS) { + struct ieee80211_bss_conf *bss_conf = + sdata_dereference(sdata->vif.link_conf[link_id], sdata); + + if (WARN_ON(!bss_conf) || !bss_conf->epcs_support) + return false; + } + + return true; +} + +int ieee80211_mgd_set_epcs(struct ieee80211_sub_if_data *sdata, bool enable) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_mgmt *mgmt; + struct sk_buff *skb; + int frame_len = offsetofend(struct ieee80211_mgmt, + u.action.u.epcs) + (enable ? 1 : 0); + + if (!ieee80211_mgd_epcs_supp(sdata)) + return -EINVAL; + + if (sdata->u.mgd.epcs.enabled == enable && + !sdata->u.mgd.epcs.dialog_token) + return 0; + + /* Do not allow enabling EPCS if the AP didn't respond yet. + * However, allow disabling EPCS in such a case. + */ + if (sdata->u.mgd.epcs.dialog_token && enable) + return -EALREADY; + + skb = dev_alloc_skb(local->hw.extra_tx_headroom + frame_len); + if (!skb) + return -ENOBUFS; + + skb_reserve(skb, local->hw.extra_tx_headroom); + mgmt = skb_put_zero(skb, frame_len); + mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION); + memcpy(mgmt->da, sdata->vif.cfg.ap_addr, ETH_ALEN); + memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); + memcpy(mgmt->bssid, sdata->vif.cfg.ap_addr, ETH_ALEN); + + mgmt->u.action.category = WLAN_CATEGORY_PROTECTED_EHT; + if (enable) { + u8 *pos = mgmt->u.action.u.epcs.variable; + + mgmt->u.action.u.epcs.action_code = + WLAN_PROTECTED_EHT_ACTION_EPCS_ENABLE_REQ; + + *pos = ++sdata->u.mgd.dialog_token_alloc; + sdata->u.mgd.epcs.dialog_token = *pos; + } else { + mgmt->u.action.u.epcs.action_code = + WLAN_PROTECTED_EHT_ACTION_EPCS_ENABLE_TEARDOWN; + + ieee80211_epcs_teardown(sdata); + ieee80211_epcs_changed(sdata, false); + } + + ieee80211_tx_skb(sdata, skb); + return 0; +} + +static void ieee80211_ml_epcs(struct ieee80211_sub_if_data *sdata, + struct ieee802_11_elems *elems) +{ + const struct element *sub; + size_t scratch_len = elems->ml_epcs_len; + u8 *scratch __free(kfree) = kzalloc(scratch_len, GFP_KERNEL); + + lockdep_assert_wiphy(sdata->local->hw.wiphy); + + if (!ieee80211_vif_is_mld(&sdata->vif) || !elems->ml_epcs) + return; + + if (WARN_ON(!scratch)) + return; + + /* Directly parse the sub elements as the common information doesn't + * hold any useful information. + */ + for_each_mle_subelement(sub, (const u8 *)elems->ml_epcs, + elems->ml_epcs_len) { + struct ieee80211_link_data *link; + struct ieee802_11_elems *link_elems __free(kfree); + u8 *pos = (void *)sub->data; + u16 control; + ssize_t len; + u8 link_id; + + if (sub->id != IEEE80211_MLE_SUBELEM_PER_STA_PROFILE) + continue; + + if (sub->datalen < sizeof(control)) + break; + + control = get_unaligned_le16(pos); + link_id = control & IEEE80211_MLE_STA_EPCS_CONTROL_LINK_ID; + + link = sdata_dereference(sdata->link[link_id], sdata); + if (!link) + continue; + + len = cfg80211_defragment_element(sub, (u8 *)elems->ml_epcs, + elems->ml_epcs_len, + scratch, scratch_len, + IEEE80211_MLE_SUBELEM_FRAGMENT); + if (len < sizeof(control)) + continue; + + pos = scratch + sizeof(control); + len -= sizeof(control); + + link_elems = ieee802_11_parse_elems(pos, len, false, NULL); + if (!link_elems) + continue; + + if (ieee80211_sta_wmm_params(sdata->local, link, + link_elems->wmm_param, + link_elems->wmm_param_len, + link_elems->mu_edca_param_set)) + ieee80211_link_info_change_notify(sdata, link, + BSS_CHANGED_QOS); + } +} + +void ieee80211_process_epcs_ena_resp(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt, size_t len) +{ + struct ieee802_11_elems *elems __free(kfree) = NULL; + size_t ies_len; + u16 status_code; + u8 *pos, dialog_token; + + if (!ieee80211_mgd_epcs_supp(sdata)) + return; + + /* Handle dialog token and status code */ + pos = mgmt->u.action.u.epcs.variable; + dialog_token = *pos; + status_code = get_unaligned_le16(pos + 1); + + /* An EPCS enable response with dialog token == 0 is an unsolicited + * notification from the AP MLD. In such a case, EPCS should already be + * enabled and status must be success + */ + if (!dialog_token && + (!sdata->u.mgd.epcs.enabled || + status_code != WLAN_STATUS_SUCCESS)) + return; + + if (sdata->u.mgd.epcs.dialog_token != dialog_token) + return; + + sdata->u.mgd.epcs.dialog_token = 0; + + if (status_code != WLAN_STATUS_SUCCESS) + return; + + pos += IEEE80211_EPCS_ENA_RESP_BODY_LEN; + ies_len = len - offsetof(struct ieee80211_mgmt, + u.action.u.epcs.variable) - + IEEE80211_EPCS_ENA_RESP_BODY_LEN; + + elems = ieee802_11_parse_elems(pos, ies_len, true, NULL); + if (!elems) + return; + + ieee80211_ml_epcs(sdata, elems); + ieee80211_epcs_changed(sdata, true); +} + +void ieee80211_process_epcs_teardown(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt, size_t len) +{ + if (!ieee80211_vif_is_mld(&sdata->vif) || + !sdata->u.mgd.epcs.enabled) + return; + + ieee80211_epcs_teardown(sdata); + ieee80211_epcs_changed(sdata, false); +} diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index d33970009e00..1790e7221a14 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -3823,6 +3823,23 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) u.action.u.ml_reconf_resp) + 3) goto invalid; goto queue; + case WLAN_PROTECTED_EHT_ACTION_EPCS_ENABLE_RESP: + if (sdata->vif.type != NL80211_IFTYPE_STATION) + break; + + if (len < offsetofend(typeof(*mgmt), + u.action.u.epcs) + + IEEE80211_EPCS_ENA_RESP_BODY_LEN) + goto invalid; + goto queue; + case WLAN_PROTECTED_EHT_ACTION_EPCS_ENABLE_TEARDOWN: + if (sdata->vif.type != NL80211_IFTYPE_STATION) + break; + + if (len < offsetofend(typeof(*mgmt), + u.action.u.epcs)) + goto invalid; + goto queue; default: break; } -- cgit v1.2.3 From 3ad4fce66e4f9d82abfc366707757e29cc14a9d2 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 5 Feb 2025 11:39:16 +0200 Subject: wifi: mac80211: add strict mode disabling workarounds Add a strict mode where we disable certain workarounds and have additional checks such as, for now, that VHT capabilities from association response match those from beacon/probe response. We can extend the checks in the future. Make it an opt-in setting by the driver so it can be set there in some driver-specific way, for example. Also allow setting this one hw flag through the hwflags debugfs, by writing a new strict=0 or strict=1 value. Signed-off-by: Johannes Berg Reviewed-by: Ilan Peer Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250205110958.5cecb0469479.I4a69617dc60ba0d6308416ffbc3102cfd08ba068@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 6 ++++++ net/mac80211/debugfs.c | 44 ++++++++++++++++++++++++++++++++++++++++++-- net/mac80211/mlme.c | 45 +++++++++++++++++++++++++++++++-------------- 3 files changed, 79 insertions(+), 16 deletions(-) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index a3a0de4a5d63..398d4e30b0db 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -2852,6 +2852,11 @@ struct ieee80211_txq { * implements MLO, so operation can continue on other links when one * link is switching. * + * @IEEE80211_HW_STRICT: strictly enforce certain things mandated by the spec + * but otherwise ignored/worked around for interoperability. This is a + * HW flag so drivers can opt in according to their own control, e.g. in + * testing. + * * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays */ enum ieee80211_hw_flags { @@ -2912,6 +2917,7 @@ enum ieee80211_hw_flags { IEEE80211_HW_DISALLOW_PUNCTURING, IEEE80211_HW_DISALLOW_PUNCTURING_5GHZ, IEEE80211_HW_HANDLES_QUIET_CSA, + IEEE80211_HW_STRICT, /* keep last, obviously */ NUM_IEEE80211_HW_FLAGS diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index bf0a2902d93c..69e03630f64c 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -492,6 +492,7 @@ static const char *hw_flag_names[] = { FLAG(DISALLOW_PUNCTURING), FLAG(DISALLOW_PUNCTURING_5GHZ), FLAG(HANDLES_QUIET_CSA), + FLAG(STRICT), #undef FLAG }; @@ -524,6 +525,46 @@ static ssize_t hwflags_read(struct file *file, char __user *user_buf, return rv; } +static ssize_t hwflags_write(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct ieee80211_local *local = file->private_data; + char buf[100]; + int val; + + if (count >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(buf, user_buf, count)) + return -EFAULT; + + if (count && buf[count - 1] == '\n') + buf[count - 1] = '\0'; + else + buf[count] = '\0'; + + if (sscanf(buf, "strict=%d", &val) == 1) { + switch (val) { + case 0: + ieee80211_hw_set(&local->hw, STRICT); + return count; + case 1: + __clear_bit(IEEE80211_HW_STRICT, local->hw.flags); + return count; + default: + return -EINVAL; + } + } + + return -EINVAL; +} + +static const struct file_operations hwflags_ops = { + .open = simple_open, + .read = hwflags_read, + .write = hwflags_write, +}; + static ssize_t misc_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { @@ -574,7 +615,6 @@ static ssize_t queues_read(struct file *file, char __user *user_buf, return simple_read_from_buffer(user_buf, count, ppos, buf, res); } -DEBUGFS_READONLY_FILE_OPS(hwflags); DEBUGFS_READONLY_FILE_OPS(queues); DEBUGFS_READONLY_FILE_OPS(misc); @@ -651,7 +691,7 @@ void debugfs_hw_add(struct ieee80211_local *local) #ifdef CONFIG_PM DEBUGFS_ADD_MODE(reset, 0200); #endif - DEBUGFS_ADD(hwflags); + DEBUGFS_ADD_MODE(hwflags, 0600); DEBUGFS_ADD(user_power); DEBUGFS_ADD(power); DEBUGFS_ADD(hw_conf); diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 04e1ea43b2df..a963e020a249 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -168,6 +168,9 @@ ieee80211_determine_ap_chan(struct ieee80211_sub_if_data *sdata, bool no_vht = false; u32 ht_cfreq; + if (ieee80211_hw_check(&sdata->local->hw, STRICT)) + ignore_ht_channel_mismatch = false; + *chandef = (struct cfg80211_chan_def) { .chan = channel, .width = NL80211_CHAN_WIDTH_20_NOHT, @@ -388,7 +391,7 @@ ieee80211_verify_peer_he_mcs_support(struct ieee80211_sub_if_data *sdata, * zeroes, which is nonsense, and completely inconsistent with itself * (it doesn't have 8 streams). Accept the settings in this case anyway. */ - if (!ap_min_req_set) + if (!ieee80211_hw_check(&sdata->local->hw, STRICT) && !ap_min_req_set) return true; /* make sure the AP is consistent with itself @@ -448,7 +451,7 @@ ieee80211_verify_sta_he_mcs_support(struct ieee80211_sub_if_data *sdata, * zeroes, which is nonsense, and completely inconsistent with itself * (it doesn't have 8 streams). Accept the settings in this case anyway. */ - if (!ap_min_req_set) + if (!ieee80211_hw_check(&sdata->local->hw, STRICT) && !ap_min_req_set) return true; /* Need to go over for 80MHz, 160MHz and for 80+80 */ @@ -1313,13 +1316,15 @@ static bool ieee80211_add_vht_ie(struct ieee80211_sub_if_data *sdata, * Some APs apparently get confused if our capabilities are better * than theirs, so restrict what we advertise in the assoc request. */ - if (!(ap_vht_cap->vht_cap_info & - cpu_to_le32(IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE))) - cap &= ~(IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE | - IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE); - else if (!(ap_vht_cap->vht_cap_info & - cpu_to_le32(IEEE80211_VHT_CAP_MU_BEAMFORMER_CAPABLE))) - cap &= ~IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE; + if (!ieee80211_hw_check(&local->hw, STRICT)) { + if (!(ap_vht_cap->vht_cap_info & + cpu_to_le32(IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE))) + cap &= ~(IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE | + IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE); + else if (!(ap_vht_cap->vht_cap_info & + cpu_to_le32(IEEE80211_VHT_CAP_MU_BEAMFORMER_CAPABLE))) + cap &= ~IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE; + } /* * If some other vif is using the MU-MIMO capability we cannot associate @@ -1361,14 +1366,16 @@ static bool ieee80211_add_vht_ie(struct ieee80211_sub_if_data *sdata, return mu_mimo_owner; } -static void ieee80211_assoc_add_rates(struct sk_buff *skb, +static void ieee80211_assoc_add_rates(struct ieee80211_local *local, + struct sk_buff *skb, enum nl80211_chan_width width, struct ieee80211_supported_band *sband, struct ieee80211_mgd_assoc_data *assoc_data) { u32 rates; - if (assoc_data->supp_rates_len) { + if (assoc_data->supp_rates_len && + !ieee80211_hw_check(&local->hw, STRICT)) { /* * Get all rates supported by the device and the AP as * some APs don't like getting a superset of their rates @@ -1584,7 +1591,7 @@ ieee80211_add_link_elems(struct ieee80211_sub_if_data *sdata, *capab |= WLAN_CAPABILITY_SPECTRUM_MGMT; if (sband->band != NL80211_BAND_S1GHZ) - ieee80211_assoc_add_rates(skb, width, sband, assoc_data); + ieee80211_assoc_add_rates(local, skb, width, sband, assoc_data); if (*capab & WLAN_CAPABILITY_SPECTRUM_MGMT || *capab & WLAN_CAPABILITY_RADIO_MEASURE) { @@ -2051,7 +2058,8 @@ static int ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) * for some reason check it and want it to be set, set the bit for all * pre-EHT connections as we used to do. */ - if (link->u.mgd.conn.mode < IEEE80211_CONN_MODE_EHT) + if (link->u.mgd.conn.mode < IEEE80211_CONN_MODE_EHT && + !ieee80211_hw_check(&local->hw, STRICT)) capab |= WLAN_CAPABILITY_ESS; /* add the elements for the assoc (main) link */ @@ -5029,7 +5037,7 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, * 2G/3G/4G wifi routers, reported models include the "Onda PN51T", * "Vodafone PocketWiFi 2", "ZTE MF60" and a similar T-Mobile device. */ - if (!is_6ghz && + if (!ieee80211_hw_check(&local->hw, STRICT) && !is_6ghz && ((assoc_data->wmm && !elems->wmm_param) || (link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_HT && (!elems->ht_cap_elem || !elems->ht_operation)) || @@ -5164,6 +5172,15 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, bss_vht_cap = (const void *)elem->data; } + if (ieee80211_hw_check(&local->hw, STRICT) && + (!bss_vht_cap || memcmp(bss_vht_cap, elems->vht_cap_elem, + sizeof(*bss_vht_cap)))) { + rcu_read_unlock(); + ret = false; + link_info(link, "VHT capabilities mismatch\n"); + goto out; + } + ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband, elems->vht_cap_elem, bss_vht_cap, link_sta); -- cgit v1.2.3 From 8c60179b64434894eac1ffab7396bac131bc8b6e Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Wed, 5 Feb 2025 11:39:20 +0200 Subject: wifi: mac80211: set ieee80211_prep_tx_info::link_id upon Auth Rx This will be used by the low level driver. Note that link_id will be 0 in case of a non-MLO authentication. Also fix a call-site of mgd_prepare_tx() where the link_id was not populated. Update the documentation to reflect the current state ieee80211_prep_tx_info::link_id is also available in mgd_complete_tx(). Signed-off-by: Emmanuel Grumbach Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250205110958.6a590f189ce5.I1fc5c0da26b143f5b07191eb592f01f7083d55ae@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 4 ++-- net/mac80211/driver-ops.h | 3 ++- net/mac80211/mlme.c | 4 +++- 3 files changed, 7 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 398d4e30b0db..22d32419e8a0 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -7,7 +7,7 @@ * Copyright 2007-2010 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015 - 2017 Intel Deutschland GmbH - * Copyright (C) 2018 - 2024 Intel Corporation + * Copyright (C) 2018 - 2025 Intel Corporation */ #ifndef MAC80211_H @@ -3830,7 +3830,7 @@ enum ieee80211_reconfig_type { * @was_assoc: set if this call is due to deauth/disassoc * while just having been associated * @link_id: the link id on which the frame will be TX'ed. - * Only used with the mgd_prepare_tx() method. + * 0 for a non-MLO connection. */ struct ieee80211_prep_tx_info { u16 duration; diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index 5acecc7bd4a9..307587c8a003 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -2,7 +2,7 @@ /* * Portions of this file * Copyright(c) 2016 Intel Deutschland GmbH -* Copyright (C) 2018-2019, 2021-2024 Intel Corporation +* Copyright (C) 2018-2019, 2021-2025 Intel Corporation */ #ifndef __MAC80211_DRIVER_OPS @@ -955,6 +955,7 @@ static inline void drv_mgd_complete_tx(struct ieee80211_local *local, return; WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_STATION); + info->link_id = info->link_id < 0 ? 0 : info->link_id; trace_drv_mgd_complete_tx(local, sdata, info->duration, info->subtype, info->success); if (local->ops->mgd_complete_tx) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 686b33654b52..184b328d92cb 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -8,7 +8,7 @@ * Copyright 2007, Michael Wu * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015 - 2017 Intel Deutschland GmbH - * Copyright (C) 2018 - 2024 Intel Corporation + * Copyright (C) 2018 - 2025 Intel Corporation */ #include @@ -4725,6 +4725,8 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, auth_transaction = le16_to_cpu(mgmt->u.auth.auth_transaction); status_code = le16_to_cpu(mgmt->u.auth.status_code); + info.link_id = ifmgd->auth_data->link_id; + if (auth_alg != ifmgd->auth_data->algorithm || (auth_alg != WLAN_AUTH_SAE && auth_transaction != ifmgd->auth_data->expected_transaction) || -- cgit v1.2.3 From 0fed463777b83abe6410a8073de3f997c29afe18 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 7 Feb 2025 15:28:26 +0000 Subject: tcp: remove tcp_reset_xmit_timer() @max_when argument All callers use TCP_RTO_MAX, we can factorize this constant, becoming a variable soon. Signed-off-by: Eric Dumazet Reviewed-by: Jason Xing Reviewed-by: Neal Cardwell Reviewed-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- include/net/tcp.h | 7 +++---- net/ipv4/tcp_input.c | 5 ++--- net/ipv4/tcp_output.c | 7 +++---- 3 files changed, 8 insertions(+), 11 deletions(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index bb7edf0e72aa..8678a2d37ef8 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1423,11 +1423,10 @@ static inline unsigned long tcp_pacing_delay(const struct sock *sk) static inline void tcp_reset_xmit_timer(struct sock *sk, const int what, - unsigned long when, - const unsigned long max_when) + unsigned long when) { inet_csk_reset_xmit_timer(sk, what, when + tcp_pacing_delay(sk), - max_when); + TCP_RTO_MAX); } /* Something is really bad, we could not queue an additional packet, @@ -1456,7 +1455,7 @@ static inline void tcp_check_probe_timer(struct sock *sk) { if (!tcp_sk(sk)->packets_out && !inet_csk(sk)->icsk_pending) tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, - tcp_probe0_base(sk), TCP_RTO_MAX); + tcp_probe0_base(sk)); } static inline void tcp_init_wl(struct tcp_sock *tp, u32 seq) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 286f15e4994a..1a3ce47b8c75 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3282,8 +3282,7 @@ void tcp_rearm_rto(struct sock *sk) */ rto = usecs_to_jiffies(max_t(int, delta_us, 1)); } - tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, - TCP_RTO_MAX); + tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto); } } @@ -3563,7 +3562,7 @@ static void tcp_ack_probe(struct sock *sk) unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX); when = tcp_clamp_probe0_to_user_timeout(sk, when); - tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, when, TCP_RTO_MAX); + tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, when); } } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index ef9f6172680f..093476fb2b2e 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2910,7 +2910,7 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto) if (rto_delta_us > 0) timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us)); - tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, TCP_RTO_MAX); + tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout); return true; } @@ -3544,8 +3544,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) } if (rearm_timer) tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - inet_csk(sk)->icsk_rto, - TCP_RTO_MAX); + inet_csk(sk)->icsk_rto); } /* We allow to exceed memory limits for FIN packets to expedite @@ -4401,7 +4400,7 @@ void tcp_send_probe0(struct sock *sk) } timeout = tcp_clamp_probe0_to_user_timeout(sk, timeout); - tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, timeout, TCP_RTO_MAX); + tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, timeout); } int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) -- cgit v1.2.3 From 7baa030155e8fa2a1bd9ea6425083c3b16787636 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 7 Feb 2025 15:28:27 +0000 Subject: tcp: add a @pace_delay parameter to tcp_reset_xmit_timer() We want to factorize calls to inet_csk_reset_xmit_timer(), to ease TCP_RTO_MAX change. Current users want to add tcp_pacing_delay(sk) to the timeout. Remaining calls to inet_csk_reset_xmit_timer() do not add the pacing delay. Following patch will convert them, passing false for @pace_delay. Signed-off-by: Eric Dumazet Reviewed-by: Jason Xing Reviewed-by: Neal Cardwell Reviewed-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- include/net/tcp.h | 10 ++++++---- net/ipv4/tcp_input.c | 4 ++-- net/ipv4/tcp_output.c | 6 +++--- 3 files changed, 11 insertions(+), 9 deletions(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 8678a2d37ef8..56557b0104e3 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1423,10 +1423,12 @@ static inline unsigned long tcp_pacing_delay(const struct sock *sk) static inline void tcp_reset_xmit_timer(struct sock *sk, const int what, - unsigned long when) + unsigned long when, + bool pace_delay) { - inet_csk_reset_xmit_timer(sk, what, when + tcp_pacing_delay(sk), - TCP_RTO_MAX); + if (pace_delay) + when += tcp_pacing_delay(sk); + inet_csk_reset_xmit_timer(sk, what, when, TCP_RTO_MAX); } /* Something is really bad, we could not queue an additional packet, @@ -1455,7 +1457,7 @@ static inline void tcp_check_probe_timer(struct sock *sk) { if (!tcp_sk(sk)->packets_out && !inet_csk(sk)->icsk_pending) tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, - tcp_probe0_base(sk)); + tcp_probe0_base(sk), true); } static inline void tcp_init_wl(struct tcp_sock *tp, u32 seq) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 1a3ce47b8c75..5c9ed7657c9f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3282,7 +3282,7 @@ void tcp_rearm_rto(struct sock *sk) */ rto = usecs_to_jiffies(max_t(int, delta_us, 1)); } - tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto); + tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, true); } } @@ -3562,7 +3562,7 @@ static void tcp_ack_probe(struct sock *sk) unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX); when = tcp_clamp_probe0_to_user_timeout(sk, when); - tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, when); + tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, when, true); } } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 093476fb2b2e..27438ca2d5e6 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2910,7 +2910,7 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto) if (rto_delta_us > 0) timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us)); - tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout); + tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, true); return true; } @@ -3544,7 +3544,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) } if (rearm_timer) tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - inet_csk(sk)->icsk_rto); + inet_csk(sk)->icsk_rto, true); } /* We allow to exceed memory limits for FIN packets to expedite @@ -4400,7 +4400,7 @@ void tcp_send_probe0(struct sock *sk) } timeout = tcp_clamp_probe0_to_user_timeout(sk, timeout); - tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, timeout); + tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, timeout, true); } int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) -- cgit v1.2.3 From 54a378f43425085d0684679d99735696b69165bc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 7 Feb 2025 15:28:29 +0000 Subject: tcp: add the ability to control max RTO Currently, TCP stack uses a constant (120 seconds) to limit the RTO value exponential growth. Some applications want to set a lower value. Add TCP_RTO_MAX_MS socket option to set a value (in ms) between 1 and 120 seconds. It is discouraged to change the socket rto max on a live socket, as it might lead to unexpected disconnects. Following patch is adding a netns sysctl to control the default value at socket creation time. Signed-off-by: Eric Dumazet Reviewed-by: Jason Xing Reviewed-by: Neal Cardwell Reviewed-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- .../networking/net_cachelines/inet_connection_sock.rst | 1 + include/net/inet_connection_sock.h | 1 + include/net/tcp.h | 16 +++++++++++----- include/uapi/linux/tcp.h | 1 + net/ipv4/tcp.c | 12 ++++++++++++ net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/tcp_output.c | 4 ++-- net/ipv4/tcp_timer.c | 17 +++++++++-------- 9 files changed, 39 insertions(+), 17 deletions(-) (limited to 'include/net') diff --git a/Documentation/networking/net_cachelines/inet_connection_sock.rst b/Documentation/networking/net_cachelines/inet_connection_sock.rst index 4a15627fc93b..b2401aa7c450 100644 --- a/Documentation/networking/net_cachelines/inet_connection_sock.rst +++ b/Documentation/networking/net_cachelines/inet_connection_sock.rst @@ -17,6 +17,7 @@ struct timer_list icsk_retransmit_timer read_mostly struct timer_list icsk_delack_timer read_mostly inet_csk_reset_xmit_timer,tcp_connect u32 icsk_rto read_write tcp_cwnd_validate,tcp_schedule_loss_probe,tcp_connect_init,tcp_connect,tcp_write_xmit,tcp_push_one u32 icsk_rto_min +u32 icsk_rto_max read_mostly tcp_reset_xmit_timer u32 icsk_delack_max u32 icsk_pmtu_cookie read_write tcp_sync_mss,tcp_current_mss,tcp_send_syn_data,tcp_connect_init,tcp_connect struct tcp_congestion_ops icsk_ca_ops read_write tcp_cwnd_validate,tcp_tso_segs,tcp_ca_dst_init,tcp_connect_init,tcp_connect,tcp_write_xmit diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 055aa80b05c6..d9978ffacc97 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -90,6 +90,7 @@ struct inet_connection_sock { struct timer_list icsk_delack_timer; __u32 icsk_rto; __u32 icsk_rto_min; + u32 icsk_rto_max; __u32 icsk_delack_max; __u32 icsk_pmtu_cookie; const struct tcp_congestion_ops *icsk_ca_ops; diff --git a/include/net/tcp.h b/include/net/tcp.h index 56557b0104e3..7fd2d7fa4532 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -143,8 +143,9 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX); #define TCP_DELACK_MIN 4U #define TCP_ATO_MIN 4U #endif -#define TCP_RTO_MAX ((unsigned)(120*HZ)) -#define TCP_RTO_MIN ((unsigned)(HZ/5)) +#define TCP_RTO_MAX_SEC 120 +#define TCP_RTO_MAX ((unsigned)(TCP_RTO_MAX_SEC * HZ)) +#define TCP_RTO_MIN ((unsigned)(HZ / 5)) #define TCP_TIMEOUT_MIN (2U) /* Min timeout for TCP timers in jiffies */ #define TCP_TIMEOUT_MIN_US (2*USEC_PER_MSEC) /* Min TCP timeout in microsecs */ @@ -740,10 +741,14 @@ int tcp_mtu_to_mss(struct sock *sk, int pmtu); int tcp_mss_to_mtu(struct sock *sk, int mss); void tcp_mtup_init(struct sock *sk); +static inline unsigned int tcp_rto_max(const struct sock *sk) +{ + return READ_ONCE(inet_csk(sk)->icsk_rto_max); +} + static inline void tcp_bound_rto(struct sock *sk) { - if (inet_csk(sk)->icsk_rto > TCP_RTO_MAX) - inet_csk(sk)->icsk_rto = TCP_RTO_MAX; + inet_csk(sk)->icsk_rto = min(inet_csk(sk)->icsk_rto, tcp_rto_max(sk)); } static inline u32 __tcp_set_rto(const struct tcp_sock *tp) @@ -1428,7 +1433,8 @@ static inline void tcp_reset_xmit_timer(struct sock *sk, { if (pace_delay) when += tcp_pacing_delay(sk); - inet_csk_reset_xmit_timer(sk, what, when, TCP_RTO_MAX); + inet_csk_reset_xmit_timer(sk, what, when, + tcp_rto_max(sk)); } /* Something is really bad, we could not queue an additional packet, diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index dbf896f3146c..32a27b4a5020 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -136,6 +136,7 @@ enum { #define TCP_AO_REPAIR 42 /* Get/Set SNEs and ISNs */ #define TCP_IS_MPTCP 43 /* Is MPTCP being used? */ +#define TCP_RTO_MAX_MS 44 /* max rto time in ms */ #define TCP_REPAIR_ON 1 #define TCP_REPAIR_OFF 0 diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 2021f2709ec3..3bb8fbbb01d9 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -432,6 +432,10 @@ void tcp_init_sock(struct sock *sk) INIT_LIST_HEAD(&tp->tsorted_sent_queue); icsk->icsk_rto = TCP_TIMEOUT_INIT; + + /* Use a sysctl ? */ + icsk->icsk_rto_max = TCP_RTO_MAX; + rto_min_us = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rto_min_us); icsk->icsk_rto_min = usecs_to_jiffies(rto_min_us); icsk->icsk_delack_max = TCP_DELACK_MAX; @@ -3807,6 +3811,11 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname, secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ)); return 0; + case TCP_RTO_MAX_MS: + if (val < MSEC_PER_SEC || val > TCP_RTO_MAX_SEC * MSEC_PER_SEC) + return -EINVAL; + WRITE_ONCE(inet_csk(sk)->icsk_rto_max, msecs_to_jiffies(val)); + return 0; } sockopt_lock_sock(sk); @@ -4643,6 +4652,9 @@ zerocopy_rcv_out: case TCP_IS_MPTCP: val = 0; break; + case TCP_RTO_MAX_MS: + val = jiffies_to_msecs(tcp_rto_max(sk)); + break; default: return -ENOPROTOOPT; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9153fae2ebd8..4686783b70de 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3558,7 +3558,7 @@ static void tcp_ack_probe(struct sock *sk) * This function is not for random using! */ } else { - unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX); + unsigned long when = tcp_probe0_when(sk, tcp_rto_max(sk)); when = tcp_clamp_probe0_to_user_timeout(sk, when); tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, when, true); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index e065f7097611..06fb0123d2d6 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -458,7 +458,7 @@ void tcp_ld_RTO_revert(struct sock *sk, u32 seq) icsk->icsk_backoff--; icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT; - icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); + icsk->icsk_rto = inet_csk_rto_backoff(icsk, tcp_rto_max(sk)); tcp_mstamp_refresh(tp); delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb)); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 3623d19b7c6e..464232a0d637 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -4251,7 +4251,7 @@ void __tcp_send_ack(struct sock *sk, u32 rcv_nxt) unsigned long delay; delay = TCP_DELACK_MAX << icsk->icsk_ack.retry; - if (delay < TCP_RTO_MAX) + if (delay < tcp_rto_max(sk)) icsk->icsk_ack.retry++; inet_csk_schedule_ack(sk); icsk->icsk_ack.ato = TCP_ATO_MIN; @@ -4391,7 +4391,7 @@ void tcp_send_probe0(struct sock *sk) if (err <= 0) { if (icsk->icsk_backoff < READ_ONCE(net->ipv4.sysctl_tcp_retries2)) icsk->icsk_backoff++; - timeout = tcp_probe0_when(sk, TCP_RTO_MAX); + timeout = tcp_probe0_when(sk, tcp_rto_max(sk)); } else { /* If packet was not sent due to local congestion, * Let senders fight for local resources conservatively. diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 6472f560f653..c0e601e4f39c 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -109,7 +109,7 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset) /* If peer does not open window for long time, or did not transmit * anything for long time, penalize it. */ - if ((s32)(tcp_jiffies32 - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset) + if ((s32)(tcp_jiffies32 - tp->lsndtime) > 2*tcp_rto_max(sk) || !do_reset) shift++; /* If some dubious ICMP arrived, penalize even more. */ @@ -189,12 +189,12 @@ static unsigned int tcp_model_timeout(struct sock *sk, { unsigned int linear_backoff_thresh, timeout; - linear_backoff_thresh = ilog2(TCP_RTO_MAX / rto_base); + linear_backoff_thresh = ilog2(tcp_rto_max(sk) / rto_base); if (boundary <= linear_backoff_thresh) timeout = ((2 << boundary) - 1) * rto_base; else timeout = ((2 << linear_backoff_thresh) - 1) * rto_base + - (boundary - linear_backoff_thresh) * TCP_RTO_MAX; + (boundary - linear_backoff_thresh) * tcp_rto_max(sk); return jiffies_to_msecs(timeout); } /** @@ -268,7 +268,7 @@ static int tcp_write_timeout(struct sock *sk) retry_until = READ_ONCE(net->ipv4.sysctl_tcp_retries2); if (sock_flag(sk, SOCK_DEAD)) { - const bool alive = icsk->icsk_rto < TCP_RTO_MAX; + const bool alive = icsk->icsk_rto < tcp_rto_max(sk); retry_until = tcp_orphan_retries(sk, alive); do_reset = alive || @@ -416,7 +416,8 @@ static void tcp_probe_timer(struct sock *sk) } max_probes = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_retries2); if (sock_flag(sk, SOCK_DEAD)) { - const bool alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX; + unsigned int rto_max = tcp_rto_max(sk); + const bool alive = inet_csk_rto_backoff(icsk, rto_max) < rto_max; max_probes = tcp_orphan_retries(sk, alive); if (!alive && icsk->icsk_backoff >= max_probes) @@ -492,7 +493,7 @@ static bool tcp_rtx_probe0_timed_out(const struct sock *sk, const struct inet_connection_sock *icsk = inet_csk(sk); u32 user_timeout = READ_ONCE(icsk->icsk_user_timeout); const struct tcp_sock *tp = tcp_sk(sk); - int timeout = TCP_RTO_MAX * 2; + int timeout = tcp_rto_max(sk) * 2; s32 rcv_delta; if (user_timeout) { @@ -665,7 +666,7 @@ out_reset_timer: icsk->icsk_backoff = 0; icsk->icsk_rto = clamp(__tcp_set_rto(tp), tcp_rto_min(sk), - TCP_RTO_MAX); + tcp_rto_max(sk)); } else if (sk->sk_state != TCP_SYN_SENT || tp->total_rto > READ_ONCE(net->ipv4.sysctl_tcp_syn_linear_timeouts)) { @@ -673,7 +674,7 @@ out_reset_timer: * activated. */ icsk->icsk_backoff++; - icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); + icsk->icsk_rto = min(icsk->icsk_rto << 1, tcp_rto_max(sk)); } tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, tcp_clamp_rto_to_user_timeout(sk), false); -- cgit v1.2.3 From 1280c26228bd7eb14bdecd67dedbdd871f8fdda5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 7 Feb 2025 15:28:30 +0000 Subject: tcp: add tcp_rto_max_ms sysctl Previous patch added a TCP_RTO_MAX_MS socket option to tune a TCP socket max RTO value. Many setups prefer to change a per netns sysctl. This patch adds /proc/sys/net/ipv4/tcp_rto_max_ms Its initial value is 120000 (120 seconds). Keep in mind that a decrease of tcp_rto_max_ms means shorter overall timeouts, unless tcp_retries2 sysctl is increased. Signed-off-by: Eric Dumazet Reviewed-by: Jason Xing Reviewed-by: Neal Cardwell Reviewed-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- Documentation/networking/ip-sysctl.rst | 13 +++++++++++++ .../networking/net_cachelines/netns_ipv4_sysctl.rst | 1 + include/net/netns/ipv4.h | 1 + net/ipv4/sysctl_net_ipv4.c | 10 ++++++++++ net/ipv4/tcp.c | 6 +++--- net/ipv4/tcp_ipv4.c | 1 + 6 files changed, 29 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 363b4950d542..054561f8dcae 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -705,6 +705,8 @@ tcp_retries2 - INTEGER seconds and is a lower bound for the effective timeout. TCP will effectively time out at the first RTO which exceeds the hypothetical timeout. + If tcp_rto_max_ms is decreased, it is recommended to also + change tcp_retries2. RFC 1122 recommends at least 100 seconds for the timeout, which corresponds to a value of at least 8. @@ -1237,6 +1239,17 @@ tcp_rto_min_us - INTEGER Default: 200000 +tcp_rto_max_ms - INTEGER + Maximal TCP retransmission timeout (in ms). + Note that TCP_RTO_MAX_MS socket option has higher precedence. + + When changing tcp_rto_max_ms, it is important to understand + that tcp_retries2 might need a change. + + Possible Values: 1000 - 120,000 + + Default: 120,000 + UDP variables ============= diff --git a/Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst b/Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst index de0263302f16..6e7b20afd2d4 100644 --- a/Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst +++ b/Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst @@ -86,6 +86,7 @@ u8 sysctl_tcp_sack u8 sysctl_tcp_window_scaling tcp_syn_options,tcp_parse_options u8 sysctl_tcp_timestamps u8 sysctl_tcp_early_retrans read_mostly tcp_schedule_loss_probe(tcp_write_xmit) +u32 sysctl_tcp_rto_max_ms u8 sysctl_tcp_recovery tcp_fastretrans_alert u8 sysctl_tcp_thin_linear_timeouts tcp_retrans_timer(on_thin_streams) u8 sysctl_tcp_slow_start_after_idle unlikely(tcp_cwnd_validate-network-not-starved) diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 46452da35206..45ac125e8aeb 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -181,6 +181,7 @@ struct netns_ipv4 { u8 sysctl_tcp_window_scaling; u8 sysctl_tcp_timestamps; int sysctl_tcp_rto_min_us; + int sysctl_tcp_rto_max_ms; u8 sysctl_tcp_recovery; u8 sysctl_tcp_thin_linear_timeouts; u8 sysctl_tcp_slow_start_after_idle; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 42cb5dc9cb24..3a43010d726f 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -28,6 +28,7 @@ static int tcp_adv_win_scale_max = 31; static int tcp_app_win_max = 31; static int tcp_min_snd_mss_min = TCP_MIN_SND_MSS; static int tcp_min_snd_mss_max = 65535; +static int tcp_rto_max_max = TCP_RTO_MAX_SEC * MSEC_PER_SEC; static int ip_privileged_port_min; static int ip_privileged_port_max = 65535; static int ip_ttl_min = 1; @@ -1583,6 +1584,15 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, }, + { + .procname = "tcp_rto_max_ms", + .data = &init_net.ipv4.sysctl_tcp_rto_max_ms, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE_THOUSAND, + .extra2 = &tcp_rto_max_max, + }, }; static __net_init int ipv4_sysctl_init_net(struct net *net) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 3bb8fbbb01d9..992d5c9b2487 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -423,7 +423,7 @@ void tcp_init_sock(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - int rto_min_us; + int rto_min_us, rto_max_ms; tp->out_of_order_queue = RB_ROOT; sk->tcp_rtx_queue = RB_ROOT; @@ -433,8 +433,8 @@ void tcp_init_sock(struct sock *sk) icsk->icsk_rto = TCP_TIMEOUT_INIT; - /* Use a sysctl ? */ - icsk->icsk_rto_max = TCP_RTO_MAX; + rto_max_ms = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rto_max_ms); + icsk->icsk_rto_max = msecs_to_jiffies(rto_max_ms); rto_min_us = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rto_min_us); icsk->icsk_rto_min = usecs_to_jiffies(rto_min_us); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 06fb0123d2d6..d1fd2128ac6c 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3532,6 +3532,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_pingpong_thresh = 1; net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN); + net->ipv4.sysctl_tcp_rto_max_ms = TCP_RTO_MAX_SEC * MSEC_PER_SEC; return 0; } -- cgit v1.2.3 From 34eea78a1112afc3579e06abb510a9f57521adaf Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 11 Feb 2025 10:13:52 -0800 Subject: net: report csum_complete via qstats Commit 13c7c941e729 ("netdev: add qstat for csum complete") reserved the entry for csum complete in the qstats uAPI. Start reporting this value now that we have a driver which needs it. Reviewed-by: Joe Damato Reviewed-by: Alexander Lobakin Link: https://patch.msgid.link/20250211181356.580800-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/net/netdev_queues.h | 1 + net/core/netdev-genl.c | 1 + 2 files changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h index 73d3401261a6..825141d675e5 100644 --- a/include/net/netdev_queues.h +++ b/include/net/netdev_queues.h @@ -23,6 +23,7 @@ struct netdev_queue_stats_rx { u64 hw_drops; u64 hw_drop_overruns; + u64 csum_complete; u64 csum_unnecessary; u64 csum_none; u64 csum_bad; diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 0dcd4faefd8d..c18bb53d13fd 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -581,6 +581,7 @@ netdev_nl_stats_write_rx(struct sk_buff *rsp, struct netdev_queue_stats_rx *rx) netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_ALLOC_FAIL, rx->alloc_fail) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROPS, rx->hw_drops) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROP_OVERRUNS, rx->hw_drop_overruns) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_COMPLETE, rx->csum_complete) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_UNNECESSARY, rx->csum_unnecessary) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_NONE, rx->csum_none) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_BAD, rx->csum_bad) || -- cgit v1.2.3 From f0e70409b7eb0584d451f74db0c72af67b6170b3 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 11 Feb 2025 18:17:31 +0100 Subject: net: avoid unconditionally touching sk_tsflags on RX After commit 5d4cc87414c5 ("net: reorganize "struct sock" fields"), the sk_tsflags field shares the same cacheline with sk_forward_alloc. The UDP protocol does not acquire the sock lock in the RX path; forward allocations are protected via the receive queue spinlock; additionally udp_recvmsg() calls sock_recv_cmsgs() unconditionally touching sk_tsflags on each packet reception. Due to the above, under high packet rate traffic, when the BH and the user-space process run on different CPUs, UDP packet reception experiences a cache miss while accessing sk_tsflags. The receive path doesn't strictly need to access the problematic field; change sock_set_timestamping() to maintain the relevant information in a newly allocated sk_flags bit, so that sock_recv_cmsgs() can take decisions accessing the latter field only. With this patch applied, on an AMD epic server with i40e NICs, I measured a 10% performance improvement for small packets UDP flood performance tests - possibly a larger delta could be observed with more recent H/W. Signed-off-by: Paolo Abeni Reviewed-by: Eric Dumazet Reviewed-by: Willem de Bruijn Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/dbd18c8a1171549f8249ac5a8b30b1b5ec88a425.1739294057.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 9 +++++---- net/core/sock.c | 1 + 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index 8036b3b79cd8..60ebf3c7b229 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -954,6 +954,7 @@ enum sock_flags { SOCK_TSTAMP_NEW, /* Indicates 64 bit timestamps always */ SOCK_RCVMARK, /* Receive SO_MARK ancillary data with packet */ SOCK_RCVPRIORITY, /* Receive SO_PRIORITY ancillary data with packet */ + SOCK_TIMESTAMPING_ANY, /* Copy of sk_tsflags & TSFLAGS_ANY */ }; #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)) @@ -2664,13 +2665,13 @@ static inline void sock_recv_cmsgs(struct msghdr *msg, struct sock *sk, { #define FLAGS_RECV_CMSGS ((1UL << SOCK_RXQ_OVFL) | \ (1UL << SOCK_RCVTSTAMP) | \ - (1UL << SOCK_RCVMARK) |\ - (1UL << SOCK_RCVPRIORITY)) + (1UL << SOCK_RCVMARK) | \ + (1UL << SOCK_RCVPRIORITY) | \ + (1UL << SOCK_TIMESTAMPING_ANY)) #define TSFLAGS_ANY (SOF_TIMESTAMPING_SOFTWARE | \ SOF_TIMESTAMPING_RAW_HARDWARE) - if (sk->sk_flags & FLAGS_RECV_CMSGS || - READ_ONCE(sk->sk_tsflags) & TSFLAGS_ANY) + if (READ_ONCE(sk->sk_flags) & FLAGS_RECV_CMSGS) __sock_recv_cmsgs(msg, sk, skb); else if (unlikely(sock_flag(sk, SOCK_TIMESTAMP))) sock_write_timestamp(sk, skb->tstamp); diff --git a/net/core/sock.c b/net/core/sock.c index eae2ae70a2e0..a197f0a0b878 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -938,6 +938,7 @@ int sock_set_timestamping(struct sock *sk, int optname, WRITE_ONCE(sk->sk_tsflags, val); sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW); + sock_valbool_flag(sk, SOCK_TIMESTAMPING_ANY, !!(val & TSFLAGS_ANY)); if (val & SOF_TIMESTAMPING_RX_SOFTWARE) sock_enable_timestamp(sk, -- cgit v1.2.3 From ce5cf4af7ceb6c1350c7ae788659c1f0696b2267 Mon Sep 17 00:00:00 2001 From: Mateusz Polchlopek Date: Wed, 6 Nov 2024 12:37:26 -0500 Subject: libeth: move idpf_rx_csum_decoded and idpf_rx_extracted Structs idpf_rx_csum_decoded and idpf_rx_extracted are used both in idpf and iavf Intel drivers. Change the prefix from idpf_* to libeth_* and move mentioned structs to libeth's rx.h header file. Adjust usage in idpf driver. Suggested-by: Alexander Lobakin Tested-by: Rafal Romanowski Signed-off-by: Mateusz Polchlopek Signed-off-by: Tony Nguyen --- .../net/ethernet/intel/idpf/idpf_singleq_txrx.c | 51 ++++++++++++---------- drivers/net/ethernet/intel/idpf/idpf_txrx.c | 16 +++---- drivers/net/ethernet/intel/idpf/idpf_txrx.h | 19 -------- include/net/libeth/rx.h | 47 ++++++++++++++++++++ 4 files changed, 82 insertions(+), 51 deletions(-) (limited to 'include/net') diff --git a/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c index dfd7cf1d9aa0..eae1b6f474e6 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c @@ -595,7 +595,7 @@ static bool idpf_rx_singleq_is_non_eop(const union virtchnl2_rx_desc *rx_desc) */ static void idpf_rx_singleq_csum(struct idpf_rx_queue *rxq, struct sk_buff *skb, - struct idpf_rx_csum_decoded csum_bits, + struct libeth_rx_csum csum_bits, struct libeth_rx_pt decoded) { bool ipv4, ipv6; @@ -661,10 +661,10 @@ checksum_fail: * * Return: parsed checksum status. **/ -static struct idpf_rx_csum_decoded +static struct libeth_rx_csum idpf_rx_singleq_base_csum(const union virtchnl2_rx_desc *rx_desc) { - struct idpf_rx_csum_decoded csum_bits = { }; + struct libeth_rx_csum csum_bits = { }; u32 rx_error, rx_status; u64 qword; @@ -696,10 +696,10 @@ idpf_rx_singleq_base_csum(const union virtchnl2_rx_desc *rx_desc) * * Return: parsed checksum status. **/ -static struct idpf_rx_csum_decoded +static struct libeth_rx_csum idpf_rx_singleq_flex_csum(const union virtchnl2_rx_desc *rx_desc) { - struct idpf_rx_csum_decoded csum_bits = { }; + struct libeth_rx_csum csum_bits = { }; u16 rx_status0, rx_status1; rx_status0 = le16_to_cpu(rx_desc->flex_nic_wb.status_error0); @@ -798,7 +798,7 @@ idpf_rx_singleq_process_skb_fields(struct idpf_rx_queue *rx_q, u16 ptype) { struct libeth_rx_pt decoded = rx_q->rx_ptype_lkup[ptype]; - struct idpf_rx_csum_decoded csum_bits; + struct libeth_rx_csum csum_bits; /* modifies the skb - consumes the enet header */ skb->protocol = eth_type_trans(skb, rx_q->netdev); @@ -891,6 +891,7 @@ bool idpf_rx_singleq_buf_hw_alloc_all(struct idpf_rx_queue *rx_q, * idpf_rx_singleq_extract_base_fields - Extract fields from the Rx descriptor * @rx_desc: the descriptor to process * @fields: storage for extracted values + * @ptype: pointer that will store packet type * * Decode the Rx descriptor and extract relevant information including the * size and Rx packet type. @@ -900,20 +901,21 @@ bool idpf_rx_singleq_buf_hw_alloc_all(struct idpf_rx_queue *rx_q, */ static void idpf_rx_singleq_extract_base_fields(const union virtchnl2_rx_desc *rx_desc, - struct idpf_rx_extracted *fields) + struct libeth_rqe_info *fields, u32 *ptype) { u64 qword; qword = le64_to_cpu(rx_desc->base_wb.qword1.status_error_ptype_len); - fields->size = FIELD_GET(VIRTCHNL2_RX_BASE_DESC_QW1_LEN_PBUF_M, qword); - fields->rx_ptype = FIELD_GET(VIRTCHNL2_RX_BASE_DESC_QW1_PTYPE_M, qword); + fields->len = FIELD_GET(VIRTCHNL2_RX_BASE_DESC_QW1_LEN_PBUF_M, qword); + *ptype = FIELD_GET(VIRTCHNL2_RX_BASE_DESC_QW1_PTYPE_M, qword); } /** * idpf_rx_singleq_extract_flex_fields - Extract fields from the Rx descriptor * @rx_desc: the descriptor to process * @fields: storage for extracted values + * @ptype: pointer that will store packet type * * Decode the Rx descriptor and extract relevant information including the * size and Rx packet type. @@ -923,12 +925,12 @@ idpf_rx_singleq_extract_base_fields(const union virtchnl2_rx_desc *rx_desc, */ static void idpf_rx_singleq_extract_flex_fields(const union virtchnl2_rx_desc *rx_desc, - struct idpf_rx_extracted *fields) + struct libeth_rqe_info *fields, u32 *ptype) { - fields->size = FIELD_GET(VIRTCHNL2_RX_FLEX_DESC_PKT_LEN_M, - le16_to_cpu(rx_desc->flex_nic_wb.pkt_len)); - fields->rx_ptype = FIELD_GET(VIRTCHNL2_RX_FLEX_DESC_PTYPE_M, - le16_to_cpu(rx_desc->flex_nic_wb.ptype_flex_flags0)); + fields->len = FIELD_GET(VIRTCHNL2_RX_FLEX_DESC_PKT_LEN_M, + le16_to_cpu(rx_desc->flex_nic_wb.pkt_len)); + *ptype = FIELD_GET(VIRTCHNL2_RX_FLEX_DESC_PTYPE_M, + le16_to_cpu(rx_desc->flex_nic_wb.ptype_flex_flags0)); } /** @@ -936,17 +938,18 @@ idpf_rx_singleq_extract_flex_fields(const union virtchnl2_rx_desc *rx_desc, * @rx_q: Rx descriptor queue * @rx_desc: the descriptor to process * @fields: storage for extracted values + * @ptype: pointer that will store packet type * */ static void idpf_rx_singleq_extract_fields(const struct idpf_rx_queue *rx_q, const union virtchnl2_rx_desc *rx_desc, - struct idpf_rx_extracted *fields) + struct libeth_rqe_info *fields, u32 *ptype) { if (rx_q->rxdids == VIRTCHNL2_RXDID_1_32B_BASE_M) - idpf_rx_singleq_extract_base_fields(rx_desc, fields); + idpf_rx_singleq_extract_base_fields(rx_desc, fields, ptype); else - idpf_rx_singleq_extract_flex_fields(rx_desc, fields); + idpf_rx_singleq_extract_flex_fields(rx_desc, fields, ptype); } /** @@ -966,9 +969,10 @@ static int idpf_rx_singleq_clean(struct idpf_rx_queue *rx_q, int budget) /* Process Rx packets bounded by budget */ while (likely(total_rx_pkts < (unsigned int)budget)) { - struct idpf_rx_extracted fields = { }; + struct libeth_rqe_info fields = { }; union virtchnl2_rx_desc *rx_desc; struct idpf_rx_buf *rx_buf; + u32 ptype; /* get the Rx desc from Rx queue based on 'next_to_clean' */ rx_desc = &rx_q->rx[ntc]; @@ -989,16 +993,16 @@ static int idpf_rx_singleq_clean(struct idpf_rx_queue *rx_q, int budget) */ dma_rmb(); - idpf_rx_singleq_extract_fields(rx_q, rx_desc, &fields); + idpf_rx_singleq_extract_fields(rx_q, rx_desc, &fields, &ptype); rx_buf = &rx_q->rx_buf[ntc]; - if (!libeth_rx_sync_for_cpu(rx_buf, fields.size)) + if (!libeth_rx_sync_for_cpu(rx_buf, fields.len)) goto skip_data; if (skb) - idpf_rx_add_frag(rx_buf, skb, fields.size); + idpf_rx_add_frag(rx_buf, skb, fields.len); else - skb = idpf_rx_build_skb(rx_buf, fields.size); + skb = idpf_rx_build_skb(rx_buf, fields.len); /* exit if we failed to retrieve a buffer */ if (!skb) @@ -1033,8 +1037,7 @@ skip_data: total_rx_bytes += skb->len; /* protocol */ - idpf_rx_singleq_process_skb_fields(rx_q, skb, - rx_desc, fields.rx_ptype); + idpf_rx_singleq_process_skb_fields(rx_q, skb, rx_desc, ptype); /* send completed skb up the stack */ napi_gro_receive(rx_q->pp->p.napi, skb); diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index 9be6a6b59c4e..2747dc69999a 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -2895,7 +2895,7 @@ idpf_rx_hash(const struct idpf_rx_queue *rxq, struct sk_buff *skb, * skb->protocol must be set before this function is called */ static void idpf_rx_csum(struct idpf_rx_queue *rxq, struct sk_buff *skb, - struct idpf_rx_csum_decoded csum_bits, + struct libeth_rx_csum csum_bits, struct libeth_rx_pt decoded) { bool ipv4, ipv6; @@ -2923,7 +2923,7 @@ static void idpf_rx_csum(struct idpf_rx_queue *rxq, struct sk_buff *skb, if (unlikely(csum_bits.l4e)) goto checksum_fail; - if (csum_bits.raw_csum_inv || + if (!csum_bits.raw_csum_valid || decoded.inner_prot == LIBETH_RX_PT_INNER_SCTP) { skb->ip_summed = CHECKSUM_UNNECESSARY; return; @@ -2946,10 +2946,10 @@ checksum_fail: * * Return: parsed checksum status. **/ -static struct idpf_rx_csum_decoded +static struct libeth_rx_csum idpf_rx_splitq_extract_csum_bits(const struct virtchnl2_rx_flex_desc_adv_nic_3 *rx_desc) { - struct idpf_rx_csum_decoded csum = { }; + struct libeth_rx_csum csum = { }; u8 qword0, qword1; qword0 = rx_desc->status_err0_qw0; @@ -2965,9 +2965,9 @@ idpf_rx_splitq_extract_csum_bits(const struct virtchnl2_rx_flex_desc_adv_nic_3 * qword1); csum.ipv6exadd = FIELD_GET(VIRTCHNL2_RX_FLEX_DESC_ADV_STATUS0_IPV6EXADD_M, qword0); - csum.raw_csum_inv = - le16_get_bits(rx_desc->ptype_err_fflags0, - VIRTCHNL2_RX_FLEX_DESC_ADV_RAW_CSUM_INV_M); + csum.raw_csum_valid = + !le16_get_bits(rx_desc->ptype_err_fflags0, + VIRTCHNL2_RX_FLEX_DESC_ADV_RAW_CSUM_INV_M); csum.raw_csum = le16_to_cpu(rx_desc->misc.raw_cs); return csum; @@ -3058,7 +3058,7 @@ static int idpf_rx_process_skb_fields(struct idpf_rx_queue *rxq, struct sk_buff *skb, const struct virtchnl2_rx_flex_desc_adv_nic_3 *rx_desc) { - struct idpf_rx_csum_decoded csum_bits; + struct libeth_rx_csum csum_bits; struct libeth_rx_pt decoded; u16 rx_ptype; diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.h b/drivers/net/ethernet/intel/idpf/idpf_txrx.h index 0f71a6f5557b..cd9a20c9cc7f 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.h +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.h @@ -213,25 +213,6 @@ enum idpf_tx_ctx_desc_eipt_offload { IDPF_TX_CTX_EXT_IP_IPV4 = 0x3 }; -/* Checksum offload bits decoded from the receive descriptor. */ -struct idpf_rx_csum_decoded { - u32 l3l4p : 1; - u32 ipe : 1; - u32 eipe : 1; - u32 eudpe : 1; - u32 ipv6exadd : 1; - u32 l4e : 1; - u32 pprs : 1; - u32 nat : 1; - u32 raw_csum_inv : 1; - u32 raw_csum : 16; -}; - -struct idpf_rx_extracted { - unsigned int size; - u16 rx_ptype; -}; - #define IDPF_TX_COMPLQ_CLEAN_BUDGET 256 #define IDPF_TX_MIN_PKT_LEN 17 #define IDPF_TX_DESCS_FOR_SKB_DATA_PTR 1 diff --git a/include/net/libeth/rx.h b/include/net/libeth/rx.h index 43574bd6612f..ab05024be518 100644 --- a/include/net/libeth/rx.h +++ b/include/net/libeth/rx.h @@ -198,6 +198,53 @@ struct libeth_rx_pt { enum xdp_rss_hash_type hash_type:16; }; +/** + * struct libeth_rx_csum - checksum offload bits decoded from the Rx descriptor + * @l3l4p: detectable L3 and L4 integrity check is processed by the hardware + * @ipe: IP checksum error + * @eipe: external (outermost) IP header (only for tunels) + * @eudpe: external (outermost) UDP checksum error (only for tunels) + * @ipv6exadd: IPv6 header with extension headers + * @l4e: L4 integrity error + * @pprs: set for packets that skip checksum calculation in the HW pre parser + * @nat: the packet is a UDP tunneled packet + * @raw_csum_valid: set if raw checksum is valid + * @pad: padding to naturally align raw_csum field + * @raw_csum: raw checksum + */ +struct libeth_rx_csum { + u32 l3l4p:1; + u32 ipe:1; + u32 eipe:1; + u32 eudpe:1; + u32 ipv6exadd:1; + u32 l4e:1; + u32 pprs:1; + u32 nat:1; + + u32 raw_csum_valid:1; + u32 pad:7; + u32 raw_csum:16; +}; + +/** + * struct libeth_rqe_info - receive queue element info + * @len: packet length + * @ptype: packet type based on types programmed into the device + * @eop: whether it's the last fragment of the packet + * @rxe: MAC errors: CRC, Alignment, Oversize, Undersizes, Length error + * @vlan: C-VLAN or S-VLAN tag depending on the VLAN offload configuration + */ +struct libeth_rqe_info { + u32 len; + + u32 ptype:14; + u32 eop:1; + u32 rxe:1; + + u32 vlan:16; +}; + void libeth_rx_pt_gen_hash_type(struct libeth_rx_pt *pt); /** -- cgit v1.2.3 From 54568a84c95bdea20227cf48d41f198d083e78dd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 12 Feb 2025 13:24:15 +0000 Subject: net: introduce EXPORT_IPV6_MOD() and EXPORT_IPV6_MOD_GPL() We have many EXPORT_SYMBOL(x) in networking tree because IPv6 can be built as a module. CONFIG_IPV6=y is becoming the norm. Define a EXPORT_IPV6_MOD(x) which only exports x for modular IPv6. Same principle applies to EXPORT_IPV6_MOD_GPL() Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Reviewed-by: Mateusz Polchlopek Link: https://patch.msgid.link/20250212132418.1524422-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/ip.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/net') diff --git a/include/net/ip.h b/include/net/ip.h index ba7b43447775..305eccdf4ff7 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -673,6 +673,14 @@ static inline void ip_ipgre_mc_map(__be32 naddr, const unsigned char *broadcast, memcpy(buf, &naddr, sizeof(naddr)); } +#if IS_MODULE(CONFIG_IPV6) +#define EXPORT_IPV6_MOD(X) EXPORT_SYMBOL(X) +#define EXPORT_IPV6_MOD_GPL(X) EXPORT_SYMBOL_GPL(X) +#else +#define EXPORT_IPV6_MOD(X) +#define EXPORT_IPV6_MOD_GPL(X) +#endif + #if IS_ENABLED(CONFIG_IPV6) #include #endif -- cgit v1.2.3 From b9d752105e5fdcbd9d0126d61da2f6df4a18560f Mon Sep 17 00:00:00 2001 From: Stefano Jordhani Date: Fri, 14 Feb 2025 18:17:51 +0000 Subject: net: use napi_id_valid helper In commit 6597e8d35851 ("netdev-genl: Elide napi_id when not present"), napi_id_valid function was added. Use the helper to refactor open-coded checks in the source. Suggested-by: Paolo Abeni Signed-off-by: Stefano Jordhani Reviewed-by: Joe Damato Reviewed-by: Jens Axboe # for iouring Link: https://patch.msgid.link/20250214181801.931-1-sjordhani@gmail.com Signed-off-by: Jakub Kicinski --- fs/eventpoll.c | 8 ++++---- include/net/busy_poll.h | 4 ++-- io_uring/napi.c | 4 ++-- net/core/dev.c | 6 +++--- net/core/netdev-genl.c | 2 +- net/core/page_pool_user.c | 2 +- net/core/sock.c | 2 +- net/xdp/xsk.c | 2 +- 8 files changed, 15 insertions(+), 15 deletions(-) (limited to 'include/net') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 7c0980db77b3..2fecf66661e9 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -447,7 +447,7 @@ static bool ep_busy_loop(struct eventpoll *ep, int nonblock) if (!budget) budget = BUSY_POLL_BUDGET; - if (napi_id >= MIN_NAPI_ID && ep_busy_loop_on(ep)) { + if (napi_id_valid(napi_id) && ep_busy_loop_on(ep)) { napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep, prefer_busy_poll, budget); if (ep_events_available(ep)) @@ -492,7 +492,7 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi) * or * Nothing to do if we already have this ID */ - if (napi_id < MIN_NAPI_ID || napi_id == ep->napi_id) + if (!napi_id_valid(napi_id) || napi_id == ep->napi_id) return; /* record NAPI ID for use in next busy poll */ @@ -546,7 +546,7 @@ static void ep_suspend_napi_irqs(struct eventpoll *ep) { unsigned int napi_id = READ_ONCE(ep->napi_id); - if (napi_id >= MIN_NAPI_ID && READ_ONCE(ep->prefer_busy_poll)) + if (napi_id_valid(napi_id) && READ_ONCE(ep->prefer_busy_poll)) napi_suspend_irqs(napi_id); } @@ -554,7 +554,7 @@ static void ep_resume_napi_irqs(struct eventpoll *ep) { unsigned int napi_id = READ_ONCE(ep->napi_id); - if (napi_id >= MIN_NAPI_ID && READ_ONCE(ep->prefer_busy_poll)) + if (napi_id_valid(napi_id) && READ_ONCE(ep->prefer_busy_poll)) napi_resume_irqs(napi_id); } diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index 741fa7754700..cab6146a510a 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -119,7 +119,7 @@ static inline void sk_busy_loop(struct sock *sk, int nonblock) #ifdef CONFIG_NET_RX_BUSY_POLL unsigned int napi_id = READ_ONCE(sk->sk_napi_id); - if (napi_id >= MIN_NAPI_ID) + if (napi_id_valid(napi_id)) napi_busy_loop(napi_id, nonblock ? NULL : sk_busy_loop_end, sk, READ_ONCE(sk->sk_prefer_busy_poll), READ_ONCE(sk->sk_busy_poll_budget) ?: BUSY_POLL_BUDGET); @@ -134,7 +134,7 @@ static inline void skb_mark_napi_id(struct sk_buff *skb, /* If the skb was already marked with a valid NAPI ID, avoid overwriting * it. */ - if (skb->napi_id < MIN_NAPI_ID) + if (!napi_id_valid(skb->napi_id)) skb->napi_id = napi->napi_id; #endif } diff --git a/io_uring/napi.c b/io_uring/napi.c index b1ade3fda30f..4a10de03e426 100644 --- a/io_uring/napi.c +++ b/io_uring/napi.c @@ -44,7 +44,7 @@ int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id) struct io_napi_entry *e; /* Non-NAPI IDs can be rejected. */ - if (napi_id < MIN_NAPI_ID) + if (!napi_id_valid(napi_id)) return -EINVAL; hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))]; @@ -87,7 +87,7 @@ static int __io_napi_del_id(struct io_ring_ctx *ctx, unsigned int napi_id) struct io_napi_entry *e; /* Non-NAPI IDs can be rejected. */ - if (napi_id < MIN_NAPI_ID) + if (!napi_id_valid(napi_id)) return -EINVAL; hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))]; diff --git a/net/core/dev.c b/net/core/dev.c index d5ab9a4b318e..bcb266ab2912 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1008,7 +1008,7 @@ struct net_device *dev_get_by_napi_id(unsigned int napi_id) WARN_ON_ONCE(!rcu_read_lock_held()); - if (napi_id < MIN_NAPI_ID) + if (!napi_id_valid(napi_id)) return NULL; napi = napi_by_id(napi_id); @@ -6740,7 +6740,7 @@ static void napi_hash_add(struct napi_struct *napi) /* 0..NR_CPUS range is reserved for sender_cpu use */ do { - if (unlikely(++napi_gen_id < MIN_NAPI_ID)) + if (unlikely(!napi_id_valid(++napi_gen_id))) napi_gen_id = MIN_NAPI_ID; } while (napi_by_id(napi_gen_id)); @@ -6911,7 +6911,7 @@ netif_napi_dev_list_add(struct net_device *dev, struct napi_struct *napi) higher = &dev->napi_list; list_for_each_entry(pos, &dev->napi_list, dev_list) { - if (pos->napi_id >= MIN_NAPI_ID) + if (napi_id_valid(pos->napi_id)) pos_id = pos->napi_id; else if (pos->config) pos_id = pos->config->napi_id; diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index c18bb53d13fd..22ac51356d9f 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -267,7 +267,7 @@ netdev_nl_napi_dump_one(struct net_device *netdev, struct sk_buff *rsp, prev_id = UINT_MAX; list_for_each_entry(napi, &netdev->napi_list, dev_list) { - if (napi->napi_id < MIN_NAPI_ID) + if (!napi_id_valid(napi->napi_id)) continue; /* Dump continuation below depends on the list being sorted */ diff --git a/net/core/page_pool_user.c b/net/core/page_pool_user.c index 9d8a3d8597fa..c82a95beceff 100644 --- a/net/core/page_pool_user.c +++ b/net/core/page_pool_user.c @@ -233,7 +233,7 @@ page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool, goto err_cancel; napi_id = pool->p.napi ? READ_ONCE(pool->p.napi->napi_id) : 0; - if (napi_id >= MIN_NAPI_ID && + if (napi_id_valid(napi_id) && nla_put_uint(rsp, NETDEV_A_PAGE_POOL_NAPI_ID, napi_id)) goto err_cancel; diff --git a/net/core/sock.c b/net/core/sock.c index a197f0a0b878..53c7af0038c4 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2042,7 +2042,7 @@ int sk_getsockopt(struct sock *sk, int level, int optname, v.val = READ_ONCE(sk->sk_napi_id); /* aggregate non-NAPI IDs down to 0 */ - if (v.val < MIN_NAPI_ID) + if (!napi_id_valid(v.val)) v.val = 0; break; diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 89d2bef96469..0edf25973072 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -875,7 +875,7 @@ static bool xsk_no_wakeup(struct sock *sk) #ifdef CONFIG_NET_RX_BUSY_POLL /* Prefer busy-polling, skip the wakeup. */ return READ_ONCE(sk->sk_prefer_busy_poll) && READ_ONCE(sk->sk_ll_usec) && - READ_ONCE(sk->sk_napi_id) >= MIN_NAPI_ID; + napi_id_valid(READ_ONCE(sk->sk_napi_id)); #else return false; #endif -- cgit v1.2.3 From a127c18462ea619a1ace1f00540807e009dbf225 Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Fri, 14 Feb 2025 21:12:29 +0000 Subject: netlink: Add nla_put_empty_nest helper Creating empty nests is helpful when the exact attributes to be exposed in the future are not known. Encapsulate the logic in a helper. Signed-off-by: Joe Damato Suggested-by: Jakub Kicinski Link: https://patch.msgid.link/20250214211255.14194-2-jdamato@fastly.com Signed-off-by: Jakub Kicinski --- include/net/netlink.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/net') diff --git a/include/net/netlink.h b/include/net/netlink.h index e015ffbed819..29e0db940382 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -118,6 +118,7 @@ * nla_nest_start(skb, type) start a nested attribute * nla_nest_end(skb, nla) finalize a nested attribute * nla_nest_cancel(skb, nla) cancel nested attribute construction + * nla_put_empty_nest(skb, type) create an empty nest * * Attribute Length Calculations: * nla_attr_size(payload) length of attribute w/o padding @@ -2240,6 +2241,20 @@ static inline void nla_nest_cancel(struct sk_buff *skb, struct nlattr *start) nlmsg_trim(skb, start); } +/** + * nla_put_empty_nest - Create an empty nest + * @skb: socket buffer the message is stored in + * @attrtype: attribute type of the container + * + * This function is a helper for creating empty nests. + * + * Returns: 0 when successful or -EMSGSIZE on failure. + */ +static inline int nla_put_empty_nest(struct sk_buff *skb, int attrtype) +{ + return nla_nest_start(skb, attrtype) ? 0 : -EMSGSIZE; +} + /** * __nla_validate_nested - Validate a stream of nested attributes * @start: container attribute -- cgit v1.2.3 From 6ad861519a69ecf3cf032c579e18569f62b81263 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 14 Feb 2025 17:26:59 -0500 Subject: net: initialize mark in sockcm_init Avoid open coding initialization of sockcm fields. Avoid reading the sk_priority field twice. This ensures all callers, existing and future, will correctly try a cmsg passed mark before sk_mark. This patch extends support for cmsg mark to: packet_spkt and packet_tpacket and net/can/raw.c. This patch extends support for cmsg priority to: packet_spkt and packet_tpacket. Signed-off-by: Willem de Bruijn Reviewed-by: David Ahern Link: https://patch.msgid.link/20250214222720.3205500-3-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 1 + net/can/raw.c | 2 +- net/packet/af_packet.c | 9 ++++----- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index 60ebf3c7b229..fac65ed30983 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1829,6 +1829,7 @@ static inline void sockcm_init(struct sockcm_cookie *sockc, const struct sock *sk) { *sockc = (struct sockcm_cookie) { + .mark = READ_ONCE(sk->sk_mark), .tsflags = READ_ONCE(sk->sk_tsflags), .priority = READ_ONCE(sk->sk_priority), }; diff --git a/net/can/raw.c b/net/can/raw.c index 46e8ed9d64da..9b1d5f036f57 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -963,7 +963,7 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) skb->dev = dev; skb->priority = sockc.priority; - skb->mark = READ_ONCE(sk->sk_mark); + skb->mark = sockc.mark; skb->tstamp = sockc.transmit_time; skb_setup_tx_timestamp(skb, &sockc); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index c131e5ceea37..3e9ddf72cd03 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2102,8 +2102,8 @@ retry: skb->protocol = proto; skb->dev = dev; - skb->priority = READ_ONCE(sk->sk_priority); - skb->mark = READ_ONCE(sk->sk_mark); + skb->priority = sockc.priority; + skb->mark = sockc.mark; skb_set_delivery_type_by_clockid(skb, sockc.transmit_time, sk->sk_clockid); skb_setup_tx_timestamp(skb, &sockc); @@ -2634,8 +2634,8 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, skb->protocol = proto; skb->dev = dev; - skb->priority = READ_ONCE(po->sk.sk_priority); - skb->mark = READ_ONCE(po->sk.sk_mark); + skb->priority = sockc->priority; + skb->mark = sockc->mark; skb_set_delivery_type_by_clockid(skb, sockc->transmit_time, po->sk.sk_clockid); skb_setup_tx_timestamp(skb, sockc); skb_zcopy_set_nouarg(skb, ph.raw); @@ -3039,7 +3039,6 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) goto out_unlock; sockcm_init(&sockc, sk); - sockc.mark = READ_ONCE(sk->sk_mark); if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); if (unlikely(err)) -- cgit v1.2.3 From 94788792f37902f1f4d417f6f9663831cf7e91fc Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 14 Feb 2025 17:27:00 -0500 Subject: ipv4: initialize inet socket cookies with sockcm_init Avoid open coding the same logic. Signed-off-by: Willem de Bruijn Reviewed-by: David Ahern Link: https://patch.msgid.link/20250214222720.3205500-4-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- include/net/ip.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/ip.h b/include/net/ip.h index 305eccdf4ff7..3c4ef5ddad83 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -94,9 +94,8 @@ static inline void ipcm_init_sk(struct ipcm_cookie *ipcm, { ipcm_init(ipcm); - ipcm->sockc.mark = READ_ONCE(inet->sk.sk_mark); - ipcm->sockc.priority = READ_ONCE(inet->sk.sk_priority); - ipcm->sockc.tsflags = READ_ONCE(inet->sk.sk_tsflags); + sockcm_init(&ipcm->sockc, &inet->sk); + ipcm->oif = READ_ONCE(inet->sk.sk_bound_dev_if); ipcm->addr = inet->inet_saddr; ipcm->protocol = inet->inet_num; -- cgit v1.2.3 From 9329b58395e51bba9c847419cc4ba176df3dd2b7 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 14 Feb 2025 17:27:01 -0500 Subject: ipv4: remove get_rttos Initialize the ip cookie tos field when initializing the cookie, in ipcm_init_sk. The existing code inverts the standard pattern for initializing cookie fields. Default is to initialize the field from the sk, then possibly overwrite that when parsing cmsgs (the unlikely case). This field inverts that, setting the field to an illegal value and after cmsg parsing checking whether the value is still illegal and thus should be overridden. Be careful to always apply mask INET_DSCP_MASK, as before. Signed-off-by: Willem de Bruijn Reviewed-by: David Ahern Link: https://patch.msgid.link/20250214222720.3205500-5-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- include/net/ip.h | 11 +++-------- net/ipv4/ping.c | 6 +++--- net/ipv4/raw.c | 6 +++--- net/ipv4/udp.c | 6 +++--- 4 files changed, 12 insertions(+), 17 deletions(-) (limited to 'include/net') diff --git a/include/net/ip.h b/include/net/ip.h index 3c4ef5ddad83..ce5e59957dd5 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -92,7 +92,9 @@ static inline void ipcm_init(struct ipcm_cookie *ipcm) static inline void ipcm_init_sk(struct ipcm_cookie *ipcm, const struct inet_sock *inet) { - ipcm_init(ipcm); + *ipcm = (struct ipcm_cookie) { + .tos = READ_ONCE(inet->tos), + }; sockcm_init(&ipcm->sockc, &inet->sk); @@ -256,13 +258,6 @@ static inline u8 ip_sendmsg_scope(const struct inet_sock *inet, return RT_SCOPE_UNIVERSE; } -static inline __u8 get_rttos(struct ipcm_cookie* ipc, struct inet_sock *inet) -{ - u8 dsfield = ipc->tos != -1 ? ipc->tos : READ_ONCE(inet->tos); - - return dsfield & INET_DSCP_MASK; -} - /* datagram.c */ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 619ddc087957..85d09f2ecadc 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -705,7 +705,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) struct ip_options_data opt_copy; int free = 0; __be32 saddr, daddr, faddr; - u8 tos, scope; + u8 scope; int err; pr_debug("ping_v4_sendmsg(sk=%p,sk->num=%u)\n", inet, inet->inet_num); @@ -768,7 +768,6 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } faddr = ipc.opt->opt.faddr; } - tos = get_rttos(&ipc, inet); scope = ip_sendmsg_scope(inet, &ipc, msg); if (ipv4_is_multicast(daddr)) { @@ -779,7 +778,8 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } else if (!ipc.oif) ipc.oif = READ_ONCE(inet->uc_index); - flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, tos, scope, + flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, + ipc.tos & INET_DSCP_MASK, scope, sk->sk_protocol, inet_sk_flowi_flags(sk), faddr, saddr, 0, 0, sk->sk_uid); diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 4304a68d1db0..6aace4d55733 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -486,7 +486,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) struct ipcm_cookie ipc; struct rtable *rt = NULL; struct flowi4 fl4; - u8 tos, scope; + u8 scope; int free = 0; __be32 daddr; __be32 saddr; @@ -581,7 +581,6 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) daddr = ipc.opt->opt.faddr; } } - tos = get_rttos(&ipc, inet); scope = ip_sendmsg_scope(inet, &ipc, msg); uc_index = READ_ONCE(inet->uc_index); @@ -606,7 +605,8 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } } - flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, tos, scope, + flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, + ipc.tos & INET_DSCP_MASK, scope, hdrincl ? ipc.protocol : sk->sk_protocol, inet_sk_flowi_flags(sk) | (hdrincl ? FLOWI_FLAG_KNOWN_NH : 0), diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 3485989cd4bd..17c7736d8349 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1280,7 +1280,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) int free = 0; int connected = 0; __be32 daddr, faddr, saddr; - u8 tos, scope; + u8 scope; __be16 dport; int err, is_udplite = IS_UDPLITE(sk); int corkreq = udp_test_bit(CORK, sk) || msg->msg_flags & MSG_MORE; @@ -1404,7 +1404,6 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) faddr = ipc.opt->opt.faddr; connected = 0; } - tos = get_rttos(&ipc, inet); scope = ip_sendmsg_scope(inet, &ipc, msg); if (scope == RT_SCOPE_LINK) connected = 0; @@ -1441,7 +1440,8 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl4 = &fl4_stack; - flowi4_init_output(fl4, ipc.oif, ipc.sockc.mark, tos, scope, + flowi4_init_output(fl4, ipc.oif, ipc.sockc.mark, + ipc.tos & INET_DSCP_MASK, scope, sk->sk_protocol, flow_flags, faddr, saddr, dport, inet->inet_sport, sk->sk_uid); -- cgit v1.2.3 From 096208592b09c2f5fc0c1a174694efa41c04209d Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 14 Feb 2025 17:27:03 -0500 Subject: ipv6: replace ipcm6_init calls with ipcm6_init_sk This initializes tclass and dontfrag before cmsg parsing, removing the need for explicit checks against -1 in each caller. Leave hlimit set to -1, because its full initialization (in ip6_sk_dst_hoplimit) requires more state (dst, flowi6, ..). This also prepares for calling sockcm_init in a follow-on patch. Signed-off-by: Willem de Bruijn Reviewed-by: David Ahern Link: https://patch.msgid.link/20250214222720.3205500-7-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- include/net/ipv6.h | 9 --------- net/ipv6/raw.c | 8 +------- net/ipv6/udp.c | 7 +------ net/l2tp/l2tp_ip6.c | 8 +------- 4 files changed, 3 insertions(+), 29 deletions(-) (limited to 'include/net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index f5c43ad1565e..46a679d9b334 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -363,15 +363,6 @@ struct ipcm6_cookie { struct ipv6_txoptions *opt; }; -static inline void ipcm6_init(struct ipcm6_cookie *ipc6) -{ - *ipc6 = (struct ipcm6_cookie) { - .hlimit = -1, - .tclass = -1, - .dontfrag = -1, - }; -} - static inline void ipcm6_init_sk(struct ipcm6_cookie *ipc6, const struct sock *sk) { diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index a45aba090aa4..ae68d3f7dd32 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -777,7 +777,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.flowi6_mark = READ_ONCE(sk->sk_mark); fl6.flowi6_uid = sk->sk_uid; - ipcm6_init(&ipc6); + ipcm6_init_sk(&ipc6, sk); ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags); ipc6.sockc.mark = fl6.flowi6_mark; ipc6.sockc.priority = READ_ONCE(sk->sk_priority); @@ -891,9 +891,6 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (hdrincl) fl6.flowi6_flags |= FLOWI_FLAG_KNOWN_NH; - if (ipc6.tclass < 0) - ipc6.tclass = np->tclass; - fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); @@ -904,9 +901,6 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (ipc6.hlimit < 0) ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); - if (ipc6.dontfrag < 0) - ipc6.dontfrag = inet6_test_bit(DONTFRAG, sk); - if (msg->msg_flags&MSG_CONFIRM) goto do_confirm; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index c6ea438b5c75..7096b7e84c10 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1494,7 +1494,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) int is_udplite = IS_UDPLITE(sk); int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); - ipcm6_init(&ipc6); + ipcm6_init_sk(&ipc6, sk); ipc6.gso_size = READ_ONCE(up->gso_size); ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags); ipc6.sockc.mark = READ_ONCE(sk->sk_mark); @@ -1704,9 +1704,6 @@ do_udp_sendmsg: security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6)); - if (ipc6.tclass < 0) - ipc6.tclass = np->tclass; - fl6->flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6->flowlabel); dst = ip6_sk_dst_lookup_flow(sk, fl6, final_p, connected); @@ -1752,8 +1749,6 @@ back_from_confirm: WRITE_ONCE(up->pending, AF_INET6); do_append_data: - if (ipc6.dontfrag < 0) - ipc6.dontfrag = inet6_test_bit(DONTFRAG, sk); up->len += ulen; err = ip6_append_data(sk, getfrag, msg, ulen, sizeof(struct udphdr), &ipc6, fl6, dst_rt6_info(dst), diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index f4c1da070826..b98d13584c81 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -547,7 +547,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.flowi6_mark = READ_ONCE(sk->sk_mark); fl6.flowi6_uid = sk->sk_uid; - ipcm6_init(&ipc6); + ipcm6_init_sk(&ipc6, sk); if (lsa) { if (addr_len < SIN6_LEN_RFC2133) @@ -634,9 +634,6 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); - if (ipc6.tclass < 0) - ipc6.tclass = np->tclass; - fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); @@ -648,9 +645,6 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (ipc6.hlimit < 0) ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); - if (ipc6.dontfrag < 0) - ipc6.dontfrag = inet6_test_bit(DONTFRAG, sk); - if (msg->msg_flags & MSG_CONFIRM) goto do_confirm; -- cgit v1.2.3 From 5cd2f78886dd86de1b13d6502808a149f1b77959 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 14 Feb 2025 17:27:04 -0500 Subject: ipv6: initialize inet socket cookies with sockcm_init Avoid open coding the same logic. Signed-off-by: Willem de Bruijn Reviewed-by: David Ahern Link: https://patch.msgid.link/20250214222720.3205500-8-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- include/net/ipv6.h | 2 ++ net/ipv6/ping.c | 3 --- net/ipv6/raw.c | 9 +++------ net/ipv6/udp.c | 3 --- 4 files changed, 5 insertions(+), 12 deletions(-) (limited to 'include/net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 46a679d9b334..9614006f483c 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -371,6 +371,8 @@ static inline void ipcm6_init_sk(struct ipcm6_cookie *ipc6, .tclass = inet6_sk(sk)->tclass, .dontfrag = inet6_test_bit(DONTFRAG, sk), }; + + sockcm_init(&ipc6->sockc, sk); } static inline struct ipv6_txoptions *txopt_get(const struct ipv6_pinfo *np) diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 46b8adf6e7f8..84d90dd8b3f0 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -119,9 +119,6 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) return -EINVAL; ipcm6_init_sk(&ipc6, sk); - ipc6.sockc.priority = READ_ONCE(sk->sk_priority); - ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags); - ipc6.sockc.mark = READ_ONCE(sk->sk_mark); fl6.flowi6_oif = oif; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index ae68d3f7dd32..fda640ebd53f 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -769,19 +769,16 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) hdrincl = inet_test_bit(HDRINCL, sk); + ipcm6_init_sk(&ipc6, sk); + /* * Get and verify the address. */ memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_mark = READ_ONCE(sk->sk_mark); + fl6.flowi6_mark = ipc6.sockc.mark; fl6.flowi6_uid = sk->sk_uid; - ipcm6_init_sk(&ipc6, sk); - ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags); - ipc6.sockc.mark = fl6.flowi6_mark; - ipc6.sockc.priority = READ_ONCE(sk->sk_priority); - if (sin6) { if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 7096b7e84c10..3a0d6c5a8286 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1496,9 +1496,6 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ipcm6_init_sk(&ipc6, sk); ipc6.gso_size = READ_ONCE(up->gso_size); - ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags); - ipc6.sockc.mark = READ_ONCE(sk->sk_mark); - ipc6.sockc.priority = READ_ONCE(sk->sk_priority); /* destination address check */ if (sin6) { -- cgit v1.2.3 From da7665947b668ef7882b40888171e941db11f06a Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 17 Feb 2025 15:41:03 +0200 Subject: net: fib_rules: Add port mask support Add support for configuring and deleting rules that match on source and destination ports using a mask as well as support for dumping such rules to user space. Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Reviewed-by: Guillaume Nault Reviewed-by: David Ahern Link: https://patch.msgid.link/20250217134109.311176-3-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- include/net/fib_rules.h | 8 ++++++ net/core/fib_rules.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 74 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index 710caacad9da..cfeb2fd0f5db 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -43,6 +43,8 @@ struct fib_rule { struct fib_kuid_range uid_range; struct fib_rule_port_range sport_range; struct fib_rule_port_range dport_range; + u16 sport_mask; + u16 dport_mask; struct rcu_head rcu; }; @@ -159,6 +161,12 @@ static inline bool fib_rule_port_range_compare(struct fib_rule_port_range *a, a->end == b->end; } +static inline bool +fib_rule_port_is_range(const struct fib_rule_port_range *range) +{ + return range->start != range->end; +} + static inline bool fib_rule_requires_fldissect(struct fib_rule *rule) { return rule->iifindex != LOOPBACK_IFINDEX && (rule->ip_proto || diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index f5b1900770ec..ba6beaa63f44 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -481,11 +481,17 @@ static struct fib_rule *rule_find(struct fib_rules_ops *ops, &rule->sport_range)) continue; + if (rule->sport_mask && r->sport_mask != rule->sport_mask) + continue; + if (fib_rule_port_range_set(&rule->dport_range) && !fib_rule_port_range_compare(&r->dport_range, &rule->dport_range)) continue; + if (rule->dport_mask && r->dport_mask != rule->dport_mask) + continue; + if (!ops->compare(r, frh, tb)) continue; return r; @@ -515,6 +521,33 @@ static int fib_nl2rule_l3mdev(struct nlattr *nla, struct fib_rule *nlrule, } #endif +static int fib_nl2rule_port_mask(const struct nlattr *mask_attr, + const struct fib_rule_port_range *range, + u16 *port_mask, + struct netlink_ext_ack *extack) +{ + if (!fib_rule_port_range_valid(range)) { + NL_SET_ERR_MSG_ATTR(extack, mask_attr, + "Cannot specify port mask without port value"); + return -EINVAL; + } + + if (fib_rule_port_is_range(range)) { + NL_SET_ERR_MSG_ATTR(extack, mask_attr, + "Cannot specify port mask for port range"); + return -EINVAL; + } + + if (range->start & ~nla_get_u16(mask_attr)) { + NL_SET_ERR_MSG_ATTR(extack, mask_attr, "Invalid port mask"); + return -EINVAL; + } + + *port_mask = nla_get_u16(mask_attr); + + return 0; +} + static int fib_nl2rule(struct net *net, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, struct fib_rules_ops *ops, @@ -644,6 +677,16 @@ static int fib_nl2rule(struct net *net, struct nlmsghdr *nlh, NL_SET_ERR_MSG(extack, "Invalid sport range"); goto errout_free; } + if (!fib_rule_port_is_range(&nlrule->sport_range)) + nlrule->sport_mask = U16_MAX; + } + + if (tb[FRA_SPORT_MASK]) { + err = fib_nl2rule_port_mask(tb[FRA_SPORT_MASK], + &nlrule->sport_range, + &nlrule->sport_mask, extack); + if (err) + goto errout_free; } if (tb[FRA_DPORT_RANGE]) { @@ -653,6 +696,16 @@ static int fib_nl2rule(struct net *net, struct nlmsghdr *nlh, NL_SET_ERR_MSG(extack, "Invalid dport range"); goto errout_free; } + if (!fib_rule_port_is_range(&nlrule->dport_range)) + nlrule->dport_mask = U16_MAX; + } + + if (tb[FRA_DPORT_MASK]) { + err = fib_nl2rule_port_mask(tb[FRA_DPORT_MASK], + &nlrule->dport_range, + &nlrule->dport_mask, extack); + if (err) + goto errout_free; } *rule = nlrule; @@ -751,10 +804,16 @@ static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh, &rule->sport_range)) continue; + if (r->sport_mask != rule->sport_mask) + continue; + if (!fib_rule_port_range_compare(&r->dport_range, &rule->dport_range)) continue; + if (r->dport_mask != rule->dport_mask) + continue; + if (!ops->compare(r, frh, tb)) continue; return 1; @@ -1051,7 +1110,9 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, + nla_total_size(1) /* FRA_PROTOCOL */ + nla_total_size(1) /* FRA_IP_PROTO */ + nla_total_size(sizeof(struct fib_rule_port_range)) /* FRA_SPORT_RANGE */ - + nla_total_size(sizeof(struct fib_rule_port_range)); /* FRA_DPORT_RANGE */ + + nla_total_size(sizeof(struct fib_rule_port_range)) /* FRA_DPORT_RANGE */ + + nla_total_size(2) /* FRA_SPORT_MASK */ + + nla_total_size(2); /* FRA_DPORT_MASK */ if (ops->nlmsg_payload) payload += ops->nlmsg_payload(rule); @@ -1119,8 +1180,12 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, nla_put_uid_range(skb, &rule->uid_range)) || (fib_rule_port_range_set(&rule->sport_range) && nla_put_port_range(skb, FRA_SPORT_RANGE, &rule->sport_range)) || + (rule->sport_mask && nla_put_u16(skb, FRA_SPORT_MASK, + rule->sport_mask)) || (fib_rule_port_range_set(&rule->dport_range) && nla_put_port_range(skb, FRA_DPORT_RANGE, &rule->dport_range)) || + (rule->dport_mask && nla_put_u16(skb, FRA_DPORT_MASK, + rule->dport_mask)) || (rule->ip_proto && nla_put_u8(skb, FRA_IP_PROTO, rule->ip_proto))) goto nla_put_failure; -- cgit v1.2.3 From 79a4e21584b7d36df51d452f4dc43221b463a26f Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 17 Feb 2025 15:41:04 +0200 Subject: ipv4: fib_rules: Add port mask matching Extend IPv4 FIB rules to match on source and destination ports using a mask. Note that the mask is only set when not matching on a range. Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Reviewed-by: Guillaume Nault Reviewed-by: David Ahern Link: https://patch.msgid.link/20250217134109.311176-4-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- include/net/fib_rules.h | 11 +++++++++++ net/ipv4/fib_rules.c | 8 ++++---- 2 files changed, 15 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index cfeb2fd0f5db..5927910ec06e 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -148,6 +148,17 @@ static inline bool fib_rule_port_inrange(const struct fib_rule_port_range *a, ntohs(port) <= a->end; } +static inline bool fib_rule_port_match(const struct fib_rule_port_range *range, + u16 port_mask, __be16 port) +{ + if ((range->start ^ ntohs(port)) & port_mask) + return false; + if (!port_mask && fib_rule_port_range_set(range) && + !fib_rule_port_inrange(range, port)) + return false; + return true; +} + static inline bool fib_rule_port_range_valid(const struct fib_rule_port_range *a) { return a->start != 0 && a->end != 0 && a->end < 0xffff && diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 041c46787d94..6b3d6a957822 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -201,12 +201,12 @@ INDIRECT_CALLABLE_SCOPE int fib4_rule_match(struct fib_rule *rule, if (rule->ip_proto && (rule->ip_proto != fl4->flowi4_proto)) return 0; - if (fib_rule_port_range_set(&rule->sport_range) && - !fib_rule_port_inrange(&rule->sport_range, fl4->fl4_sport)) + if (!fib_rule_port_match(&rule->sport_range, rule->sport_mask, + fl4->fl4_sport)) return 0; - if (fib_rule_port_range_set(&rule->dport_range) && - !fib_rule_port_inrange(&rule->dport_range, fl4->fl4_dport)) + if (!fib_rule_port_match(&rule->dport_range, rule->dport_mask, + fl4->fl4_dport)) return 0; return 1; -- cgit v1.2.3 From c8802ded46589fcd054a0497159ab4704c9dc89f Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 18 Feb 2025 19:36:16 +0100 Subject: net: dismiss sk_forward_alloc_get() After the previous patch we can remove the forward_alloc_get proto callback, basically reverting commit 292e6077b040 ("net: introduce sk_forward_alloc_get()") and commit 66d58f046c9d ("net: use sk_forward_alloc_get() in sk_get_meminfo()"). Signed-off-by: Paolo Abeni Acked-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250218-net-next-mptcp-rx-path-refactor-v1-5-4a47d90d7998@kernel.org Signed-off-by: Jakub Kicinski --- include/net/sock.h | 13 ------------- net/core/sock.c | 2 +- net/ipv4/af_inet.c | 2 +- net/ipv4/inet_diag.c | 2 +- net/sched/em_meta.c | 2 +- 5 files changed, 4 insertions(+), 17 deletions(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index fac65ed30983..edbb870e3f86 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1285,10 +1285,6 @@ struct proto { unsigned int inuse_idx; #endif -#if IS_ENABLED(CONFIG_MPTCP) - int (*forward_alloc_get)(const struct sock *sk); -#endif - bool (*stream_memory_free)(const struct sock *sk, int wake); bool (*sock_is_readable)(struct sock *sk); /* Memory pressure */ @@ -1349,15 +1345,6 @@ int sock_load_diag_module(int family, int protocol); INDIRECT_CALLABLE_DECLARE(bool tcp_stream_memory_free(const struct sock *sk, int wake)); -static inline int sk_forward_alloc_get(const struct sock *sk) -{ -#if IS_ENABLED(CONFIG_MPTCP) - if (sk->sk_prot->forward_alloc_get) - return sk->sk_prot->forward_alloc_get(sk); -#endif - return READ_ONCE(sk->sk_forward_alloc); -} - static inline bool __sk_stream_memory_free(const struct sock *sk, int wake) { if (READ_ONCE(sk->sk_wmem_queued) >= READ_ONCE(sk->sk_sndbuf)) diff --git a/net/core/sock.c b/net/core/sock.c index 53c7af0038c4..0d385bf27b38 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3882,7 +3882,7 @@ void sk_get_meminfo(const struct sock *sk, u32 *mem) mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf); mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk); mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf); - mem[SK_MEMINFO_FWD_ALLOC] = sk_forward_alloc_get(sk); + mem[SK_MEMINFO_FWD_ALLOC] = READ_ONCE(sk->sk_forward_alloc); mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued); mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 21f46ee7b6e9..5df1f1325259 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -153,7 +153,7 @@ void inet_sock_destruct(struct sock *sk) WARN_ON_ONCE(atomic_read(&sk->sk_rmem_alloc)); WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc)); WARN_ON_ONCE(sk->sk_wmem_queued); - WARN_ON_ONCE(sk_forward_alloc_get(sk)); + WARN_ON_ONCE(sk->sk_forward_alloc); kfree(rcu_dereference_protected(inet->inet_opt, 1)); dst_release(rcu_dereference_protected(sk->sk_dst_cache, 1)); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 321acc8abf17..efe2a085cf68 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -282,7 +282,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, struct inet_diag_meminfo minfo = { .idiag_rmem = sk_rmem_alloc_get(sk), .idiag_wmem = READ_ONCE(sk->sk_wmem_queued), - .idiag_fmem = sk_forward_alloc_get(sk), + .idiag_fmem = READ_ONCE(sk->sk_forward_alloc), .idiag_tmem = sk_wmem_alloc_get(sk), }; diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index 8996c73c9779..3f2e707a11d1 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -460,7 +460,7 @@ META_COLLECTOR(int_sk_fwd_alloc) *err = -1; return; } - dst->value = sk_forward_alloc_get(sk); + dst->value = READ_ONCE(sk->sk_forward_alloc); } META_COLLECTOR(int_sk_sndbuf) -- cgit v1.2.3 From ba3fa6e8c1eb4e40719451710263c87fb5f22221 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Wed, 19 Feb 2025 16:32:55 +0200 Subject: ip_tunnel: Use ip_tunnel_info() helper instead of 'info + 1' Tunnel options should not be accessed directly, use the ip_tunnel_info() accessor instead. Signed-off-by: Gal Pressman Reviewed-by: Kees Cook Link: https://patch.msgid.link/20250219143256.370277-2-gal@nvidia.com Signed-off-by: Jakub Kicinski --- include/net/ip_tunnels.h | 2 +- net/sched/act_tunnel_key.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 1aa31bdb2b31..7b54cea5de27 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -650,7 +650,7 @@ static inline void iptunnel_xmit_stats(struct net_device *dev, int pkt_len) static inline void ip_tunnel_info_opts_get(void *to, const struct ip_tunnel_info *info) { - memcpy(to, info + 1, info->options_len); + memcpy(to, ip_tunnel_info_opts(info), info->options_len); } static inline void ip_tunnel_info_opts_set(struct ip_tunnel_info *info, diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index af7c99845948..ae5dea7c48a8 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -571,8 +571,8 @@ static void tunnel_key_release(struct tc_action *a) static int tunnel_key_geneve_opts_dump(struct sk_buff *skb, const struct ip_tunnel_info *info) { + const u8 *src = ip_tunnel_info_opts(info); int len = info->options_len; - u8 *src = (u8 *)(info + 1); struct nlattr *start; start = nla_nest_start_noflag(skb, TCA_TUNNEL_KEY_ENC_OPTS_GENEVE); @@ -580,7 +580,7 @@ static int tunnel_key_geneve_opts_dump(struct sk_buff *skb, return -EMSGSIZE; while (len > 0) { - struct geneve_opt *opt = (struct geneve_opt *)src; + const struct geneve_opt *opt = (const struct geneve_opt *)src; if (nla_put_be16(skb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS, opt->opt_class) || @@ -603,7 +603,7 @@ static int tunnel_key_geneve_opts_dump(struct sk_buff *skb, static int tunnel_key_vxlan_opts_dump(struct sk_buff *skb, const struct ip_tunnel_info *info) { - struct vxlan_metadata *md = (struct vxlan_metadata *)(info + 1); + const struct vxlan_metadata *md = ip_tunnel_info_opts(info); struct nlattr *start; start = nla_nest_start_noflag(skb, TCA_TUNNEL_KEY_ENC_OPTS_VXLAN); @@ -622,7 +622,7 @@ static int tunnel_key_vxlan_opts_dump(struct sk_buff *skb, static int tunnel_key_erspan_opts_dump(struct sk_buff *skb, const struct ip_tunnel_info *info) { - struct erspan_metadata *md = (struct erspan_metadata *)(info + 1); + const struct erspan_metadata *md = ip_tunnel_info_opts(info); struct nlattr *start; start = nla_nest_start_noflag(skb, TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN); -- cgit v1.2.3 From bb5e62f2d547c4de6d1b144cbce2373a76c33f18 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Wed, 19 Feb 2025 16:32:56 +0200 Subject: net: Add options as a flexible array to struct ip_tunnel_info Remove the hidden assumption that options are allocated at the end of the struct, and teach the compiler about them using a flexible array. With this, we can revert the unsafe_memcpy() call we have in tun_dst_unclone() [1], and resolve the false field-spanning write warning caused by the memcpy() in ip_tunnel_info_opts_set(). The layout of struct ip_tunnel_info remains the same with this patch. Before this patch, there was an implicit padding at the end of the struct, options would be written at 'info + 1' which is after the padding. This will remain the same as this patch explicitly aligns 'options'. The alignment is needed as the options are later casted to different structs, and might result in unaligned memory access. Pahole output before this patch: struct ip_tunnel_info { struct ip_tunnel_key key; /* 0 64 */ /* XXX last struct has 1 byte of padding */ /* --- cacheline 1 boundary (64 bytes) --- */ struct ip_tunnel_encap encap; /* 64 8 */ struct dst_cache dst_cache; /* 72 16 */ u8 options_len; /* 88 1 */ u8 mode; /* 89 1 */ /* size: 96, cachelines: 2, members: 5 */ /* padding: 6 */ /* paddings: 1, sum paddings: 1 */ /* last cacheline: 32 bytes */ }; Pahole output after this patch: struct ip_tunnel_info { struct ip_tunnel_key key; /* 0 64 */ /* XXX last struct has 1 byte of padding */ /* --- cacheline 1 boundary (64 bytes) --- */ struct ip_tunnel_encap encap; /* 64 8 */ struct dst_cache dst_cache; /* 72 16 */ u8 options_len; /* 88 1 */ u8 mode; /* 89 1 */ /* XXX 6 bytes hole, try to pack */ u8 options[] __attribute__((__aligned__(16))); /* 96 0 */ /* size: 96, cachelines: 2, members: 6 */ /* sum members: 90, holes: 1, sum holes: 6 */ /* paddings: 1, sum paddings: 1 */ /* forced alignments: 1, forced holes: 1, sum forced holes: 6 */ /* last cacheline: 32 bytes */ } __attribute__((__aligned__(16))); [1] Commit 13cfd6a6d7ac ("net: Silence false field-spanning write warning in metadata_dst memcpy") Link: https://lore.kernel.org/all/53D1D353-B8F6-4ADC-8F29-8C48A7C9C6F1@kernel.org/ Suggested-by: Kees Cook Reviewed-by: Cosmin Ratiu Reviewed-by: Tariq Toukan Signed-off-by: Gal Pressman Reviewed-by: Kees Cook Link: https://patch.msgid.link/20250219143256.370277-3-gal@nvidia.com Signed-off-by: Jakub Kicinski --- include/net/dst_metadata.h | 7 ++----- include/net/ip_tunnels.h | 5 +++-- net/core/dst.c | 6 ++++-- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/net') diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index 84c15402931c..4160731dcb6e 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -163,11 +163,8 @@ static inline struct metadata_dst *tun_dst_unclone(struct sk_buff *skb) if (!new_md) return ERR_PTR(-ENOMEM); - unsafe_memcpy(&new_md->u.tun_info, &md_dst->u.tun_info, - sizeof(struct ip_tunnel_info) + md_size, - /* metadata_dst_alloc() reserves room (md_size bytes) for - * options right after the ip_tunnel_info struct. - */); + memcpy(&new_md->u.tun_info, &md_dst->u.tun_info, + sizeof(struct ip_tunnel_info) + md_size); #ifdef CONFIG_DST_CACHE /* Unclone the dst cache if there is one */ if (new_md->u.tun_info.dst_cache.cache) { diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 7b54cea5de27..e041e4865373 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -95,8 +95,8 @@ struct ip_tunnel_encap { #define ip_tunnel_info_opts(info) \ _Generic(info, \ - const struct ip_tunnel_info * : ((const void *)((info) + 1)),\ - struct ip_tunnel_info * : ((void *)((info) + 1))\ + const struct ip_tunnel_info * : ((const void *)(info)->options),\ + struct ip_tunnel_info * : ((void *)(info)->options)\ ) struct ip_tunnel_info { @@ -107,6 +107,7 @@ struct ip_tunnel_info { #endif u8 options_len; u8 mode; + u8 options[] __aligned_largest __counted_by(options_len); }; /* 6rd prefix/relay information */ diff --git a/net/core/dst.c b/net/core/dst.c index 9552a90d4772..c99b95cf9cbb 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -286,7 +286,8 @@ struct metadata_dst *metadata_dst_alloc(u8 optslen, enum metadata_type type, { struct metadata_dst *md_dst; - md_dst = kmalloc(sizeof(*md_dst) + optslen, flags); + md_dst = kmalloc(struct_size(md_dst, u.tun_info.options, optslen), + flags); if (!md_dst) return NULL; @@ -314,7 +315,8 @@ metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags) int cpu; struct metadata_dst __percpu *md_dst; - md_dst = __alloc_percpu_gfp(sizeof(struct metadata_dst) + optslen, + md_dst = __alloc_percpu_gfp(struct_size(md_dst, u.tun_info.options, + optslen), __alignof__(struct metadata_dst), flags); if (!md_dst) return NULL; -- cgit v1.2.3 From 24e82b7c045ba5afde5bd81f691b7a72283de35e Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Thu, 20 Feb 2025 15:29:29 +0800 Subject: bpf: Add networking timestamping support to bpf_get/setsockopt() The new SK_BPF_CB_FLAGS and new SK_BPF_CB_TX_TIMESTAMPING are added to bpf_get/setsockopt. The later patches will implement the BPF networking timestamping. The BPF program will use bpf_setsockopt(SK_BPF_CB_FLAGS, SK_BPF_CB_TX_TIMESTAMPING) to enable the BPF networking timestamping on a socket. Signed-off-by: Jason Xing Signed-off-by: Martin KaFai Lau Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250220072940.99994-2-kerneljasonxing@gmail.com --- include/net/sock.h | 3 +++ include/uapi/linux/bpf.h | 7 +++++++ net/core/filter.c | 23 +++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 7 +++++++ 4 files changed, 40 insertions(+) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index 60ebf3c7b229..a95eedacae76 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -303,6 +303,7 @@ struct sk_filter; * @sk_stamp: time stamp of last packet received * @sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only * @sk_tsflags: SO_TIMESTAMPING flags + * @sk_bpf_cb_flags: used in bpf_setsockopt() * @sk_use_task_frag: allow sk_page_frag() to use current->task_frag. * Sockets that can be used under memory reclaim should * set this to false. @@ -525,6 +526,8 @@ struct sock { u8 sk_txtime_deadline_mode : 1, sk_txtime_report_errors : 1, sk_txtime_unused : 6; +#define SK_BPF_CB_FLAG_TEST(SK, FLAG) ((SK)->sk_bpf_cb_flags & (FLAG)) + u8 sk_bpf_cb_flags; void *sk_user_data; #ifdef CONFIG_SECURITY diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2acf9b336371..4e0632848440 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -6913,6 +6913,12 @@ enum { BPF_SOCK_OPS_ALL_CB_FLAGS = 0x7F, }; +enum { + SK_BPF_CB_TX_TIMESTAMPING = 1<<0, + SK_BPF_CB_MASK = (SK_BPF_CB_TX_TIMESTAMPING - 1) | + SK_BPF_CB_TX_TIMESTAMPING +}; + /* List of known BPF sock_ops operators. * New entries can only be added at the end */ @@ -7091,6 +7097,7 @@ enum { TCP_BPF_SYN_IP = 1006, /* Copy the IP[46] and TCP header */ TCP_BPF_SYN_MAC = 1007, /* Copy the MAC, IP[46], and TCP header */ TCP_BPF_SOCK_OPS_CB_FLAGS = 1008, /* Get or Set TCP sock ops flags */ + SK_BPF_CB_FLAGS = 1009, /* Get or set sock ops flags in socket */ }; enum { diff --git a/net/core/filter.c b/net/core/filter.c index ffec7b4357f9..5b7e44dfc037 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5222,6 +5222,25 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +static int sk_bpf_set_get_cb_flags(struct sock *sk, char *optval, bool getopt) +{ + u32 sk_bpf_cb_flags; + + if (getopt) { + *(u32 *)optval = sk->sk_bpf_cb_flags; + return 0; + } + + sk_bpf_cb_flags = *(u32 *)optval; + + if (sk_bpf_cb_flags & ~SK_BPF_CB_MASK) + return -EINVAL; + + sk->sk_bpf_cb_flags = sk_bpf_cb_flags; + + return 0; +} + static int sol_socket_sockopt(struct sock *sk, int optname, char *optval, int *optlen, bool getopt) @@ -5238,6 +5257,7 @@ static int sol_socket_sockopt(struct sock *sk, int optname, case SO_MAX_PACING_RATE: case SO_BINDTOIFINDEX: case SO_TXREHASH: + case SK_BPF_CB_FLAGS: if (*optlen != sizeof(int)) return -EINVAL; break; @@ -5247,6 +5267,9 @@ static int sol_socket_sockopt(struct sock *sk, int optname, return -EINVAL; } + if (optname == SK_BPF_CB_FLAGS) + return sk_bpf_set_get_cb_flags(sk, optval, getopt); + if (getopt) { if (optname == SO_BINDTODEVICE) return -EINVAL; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 2acf9b336371..4e0632848440 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -6913,6 +6913,12 @@ enum { BPF_SOCK_OPS_ALL_CB_FLAGS = 0x7F, }; +enum { + SK_BPF_CB_TX_TIMESTAMPING = 1<<0, + SK_BPF_CB_MASK = (SK_BPF_CB_TX_TIMESTAMPING - 1) | + SK_BPF_CB_TX_TIMESTAMPING +}; + /* List of known BPF sock_ops operators. * New entries can only be added at the end */ @@ -7091,6 +7097,7 @@ enum { TCP_BPF_SYN_IP = 1006, /* Copy the IP[46] and TCP header */ TCP_BPF_SYN_MAC = 1007, /* Copy the MAC, IP[46], and TCP header */ TCP_BPF_SOCK_OPS_CB_FLAGS = 1008, /* Get or Set TCP sock ops flags */ + SK_BPF_CB_FLAGS = 1009, /* Get or set sock ops flags in socket */ }; enum { -- cgit v1.2.3 From df600f3b1d7963e2203ebf0987f564946a2647f1 Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Thu, 20 Feb 2025 15:29:30 +0800 Subject: bpf: Prepare the sock_ops ctx and call bpf prog for TX timestamping This patch introduces a new bpf_skops_tx_timestamping() function that prepares the "struct bpf_sock_ops" ctx and then executes the sockops BPF program. The subsequent patch will utilize bpf_skops_tx_timestamping() at the existing TX timestamping kernel callbacks (__sk_tstamp_tx specifically) to call the sockops BPF program. Later, four callback points to report information to user space based on this patch will be introduced. Signed-off-by: Jason Xing Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20250220072940.99994-3-kerneljasonxing@gmail.com --- include/net/sock.h | 7 +++++++ net/core/sock.c | 14 ++++++++++++++ 2 files changed, 21 insertions(+) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index a95eedacae76..2f6b55c59c16 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2924,6 +2924,13 @@ int sock_set_timestamping(struct sock *sk, int optname, struct so_timestamping timestamping); void sock_enable_timestamps(struct sock *sk); +#if defined(CONFIG_CGROUP_BPF) +void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op); +#else +static inline void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op) +{ +} +#endif void sock_no_linger(struct sock *sk); void sock_set_keepalive(struct sock *sk); void sock_set_priority(struct sock *sk, u32 priority); diff --git a/net/core/sock.c b/net/core/sock.c index a197f0a0b878..ba653c6a1229 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -949,6 +949,20 @@ int sock_set_timestamping(struct sock *sk, int optname, return 0; } +#if defined(CONFIG_CGROUP_BPF) +void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op) +{ + struct bpf_sock_ops_kern sock_ops; + + memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); + sock_ops.op = op; + sock_ops.is_fullsock = 1; + sock_ops.sk = sk; + bpf_skops_init_skb(&sock_ops, skb, 0); + __cgroup_bpf_run_filter_sock_ops(sk, &sock_ops, CGROUP_SOCK_OPS); +} +#endif + void sock_set_keepalive(struct sock *sk) { lock_sock(sk); -- cgit v1.2.3 From fd93eaffb3f977b23bc0a48d4c8616e654fcf133 Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Thu, 20 Feb 2025 15:29:31 +0800 Subject: bpf: Prevent unsafe access to the sock fields in the BPF timestamping callback The subsequent patch will implement BPF TX timestamping. It will call the sockops BPF program without holding the sock lock. This breaks the current assumption that all sock ops programs will hold the sock lock. The sock's fields of the uapi's bpf_sock_ops requires this assumption. To address this, a new "u8 is_locked_tcp_sock;" field is added. This patch sets it in the current sock_ops callbacks. The "is_fullsock" test is then replaced by the "is_locked_tcp_sock" test during sock_ops_convert_ctx_access(). The new TX timestamping callbacks added in the subsequent patch will not have this set. This will prevent unsafe access from the new timestamping callbacks. Potentially, we could allow read-only access. However, this would require identifying which callback is read-safe-only and also requires additional BPF instruction rewrites in the covert_ctx. Since the BPF program can always read everything from a socket (e.g., by using bpf_core_cast), this patch keeps it simple and disables all read and write access to any socket fields through the bpf_sock_ops UAPI from the new TX timestamping callback. Moreover, note that some of the fields in bpf_sock_ops are specific to tcp_sock, and sock_ops currently only supports tcp_sock. In the future, UDP timestamping will be added, which will also break this assumption. The same idea used in this patch will be reused. Considering that the current sock_ops only supports tcp_sock, the variable is named is_locked_"tcp"_sock. Signed-off-by: Jason Xing Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20250220072940.99994-4-kerneljasonxing@gmail.com --- include/linux/filter.h | 1 + include/net/tcp.h | 1 + net/core/filter.c | 8 ++++---- net/ipv4/tcp_input.c | 2 ++ net/ipv4/tcp_output.c | 2 ++ 5 files changed, 10 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/linux/filter.h b/include/linux/filter.h index a3ea46281595..d36d5d5180b1 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1508,6 +1508,7 @@ struct bpf_sock_ops_kern { void *skb_data_end; u8 op; u8 is_fullsock; + u8 is_locked_tcp_sock; u8 remaining_opt_len; u64 temp; /* temp and everything after is not * initialized to 0 before calling diff --git a/include/net/tcp.h b/include/net/tcp.h index 7fd2d7fa4532..94ea09faedf4 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2657,6 +2657,7 @@ static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args) memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); if (sk_fullsock(sk)) { sock_ops.is_fullsock = 1; + sock_ops.is_locked_tcp_sock = 1; sock_owned_by_me(sk); } diff --git a/net/core/filter.c b/net/core/filter.c index 5b7e44dfc037..7a20ddae7667 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -10382,10 +10382,10 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, } \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, \ - is_fullsock), \ + is_locked_tcp_sock), \ fullsock_reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, \ - is_fullsock)); \ + is_locked_tcp_sock)); \ *insn++ = BPF_JMP_IMM(BPF_JEQ, fullsock_reg, 0, jmp); \ if (si->dst_reg == si->src_reg) \ *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \ @@ -10470,10 +10470,10 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, temp)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, \ - is_fullsock), \ + is_locked_tcp_sock), \ reg, si->dst_reg, \ offsetof(struct bpf_sock_ops_kern, \ - is_fullsock)); \ + is_locked_tcp_sock)); \ *insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, sk),\ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 4686783b70de..4a9e70e23ef8 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -169,6 +169,7 @@ static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb) memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); sock_ops.op = BPF_SOCK_OPS_PARSE_HDR_OPT_CB; sock_ops.is_fullsock = 1; + sock_ops.is_locked_tcp_sock = 1; sock_ops.sk = sk; bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb)); @@ -185,6 +186,7 @@ static void bpf_skops_established(struct sock *sk, int bpf_op, memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); sock_ops.op = bpf_op; sock_ops.is_fullsock = 1; + sock_ops.is_locked_tcp_sock = 1; sock_ops.sk = sk; /* sk with TCP_REPAIR_ON does not have skb in tcp_finish_connect */ if (skb) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 464232a0d637..a796cf451e55 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -525,6 +525,7 @@ static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb, sock_owned_by_me(sk); sock_ops.is_fullsock = 1; + sock_ops.is_locked_tcp_sock = 1; sock_ops.sk = sk; } @@ -570,6 +571,7 @@ static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb, sock_owned_by_me(sk); sock_ops.is_fullsock = 1; + sock_ops.is_locked_tcp_sock = 1; sock_ops.sk = sk; } -- cgit v1.2.3 From b3b81e6b009dd8f85cd3b9c65eb492249c2649a8 Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Thu, 20 Feb 2025 15:29:37 +0800 Subject: bpf: Add BPF_SOCK_OPS_TSTAMP_ACK_CB callback Support the ACK case for bpf timestamping. Add a new sock_ops callback, BPF_SOCK_OPS_TSTAMP_ACK_CB. This callback will occur at the same timestamping point as the user space's SCM_TSTAMP_ACK. The BPF program can use it to get the same SCM_TSTAMP_ACK timestamp without modifying the user-space application. This patch extends txstamp_ack to two bits: 1 stands for SO_TIMESTAMPING mode, 2 bpf extension. Signed-off-by: Jason Xing Signed-off-by: Martin KaFai Lau Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250220072940.99994-10-kerneljasonxing@gmail.com --- include/net/tcp.h | 6 ++++-- include/uapi/linux/bpf.h | 5 +++++ net/core/skbuff.c | 5 ++++- net/ipv4/tcp.c | 2 +- tools/include/uapi/linux/bpf.h | 5 +++++ 5 files changed, 19 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 94ea09faedf4..ae6c95b01012 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -964,10 +964,12 @@ struct tcp_skb_cb { __u8 sacked; /* State flags for SACK. */ __u8 ip_dsfield; /* IPv4 tos or IPv6 dsfield */ - __u8 txstamp_ack:1, /* Record TX timestamp for ack? */ +#define TSTAMP_ACK_SK 0x1 +#define TSTAMP_ACK_BPF 0x2 + __u8 txstamp_ack:2, /* Record TX timestamp for ack? */ eor:1, /* Is skb MSG_EOR marked? */ has_rxtstamp:1, /* SKB has a RX timestamp */ - unused:5; + unused:4; __u32 ack_seq; /* Sequence number ACK'd */ union { struct { diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 72045c242396..cc10104f34a0 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -7044,6 +7044,11 @@ enum { * SK_BPF_CB_TX_TIMESTAMPING feature * is on. */ + BPF_SOCK_OPS_TSTAMP_ACK_CB, /* Called when all the skbs in the + * same sendmsg call are acked + * when SK_BPF_CB_TX_TIMESTAMPING + * feature is on. + */ }; /* List of TCP states. There is a build check in net/ipv4/tcp.c to detect diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 77b8866f94c5..dd33c12f00ca 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -5550,7 +5550,7 @@ static bool skb_tstamp_tx_report_so_timestamping(struct sk_buff *skb, return skb_shinfo(skb)->tx_flags & (hwtstamps ? SKBTX_HW_TSTAMP_NOBPF : SKBTX_SW_TSTAMP); case SCM_TSTAMP_ACK: - return TCP_SKB_CB(skb)->txstamp_ack; + return TCP_SKB_CB(skb)->txstamp_ack & TSTAMP_ACK_SK; } return false; @@ -5575,6 +5575,9 @@ static void skb_tstamp_tx_report_bpf_timestamping(struct sk_buff *skb, op = BPF_SOCK_OPS_TSTAMP_SND_SW_CB; } break; + case SCM_TSTAMP_ACK: + op = BPF_SOCK_OPS_TSTAMP_ACK_CB; + break; default: return; } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 992d5c9b2487..2171e2f045bb 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -492,7 +492,7 @@ static void tcp_tx_timestamp(struct sock *sk, struct sockcm_cookie *sockc) sock_tx_timestamp(sk, sockc, &shinfo->tx_flags); if (tsflags & SOF_TIMESTAMPING_TX_ACK) - tcb->txstamp_ack = 1; + tcb->txstamp_ack |= TSTAMP_ACK_SK; if (tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK) shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 72045c242396..cc10104f34a0 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -7044,6 +7044,11 @@ enum { * SK_BPF_CB_TX_TIMESTAMPING feature * is on. */ + BPF_SOCK_OPS_TSTAMP_ACK_CB, /* Called when all the skbs in the + * same sendmsg call are acked + * when SK_BPF_CB_TX_TIMESTAMPING + * feature is on. + */ }; /* List of TCP states. There is a build check in net/ipv4/tcp.c to detect -- cgit v1.2.3 From ca4419f15abd19ba8be1e109661b60f9f5b6c9f0 Mon Sep 17 00:00:00 2001 From: Song Yoong Siang Date: Sun, 16 Feb 2025 17:34:26 +0800 Subject: xsk: Add launch time hardware offload support to XDP Tx metadata Extend the XDP Tx metadata framework so that user can requests launch time hardware offload, where the Ethernet device will schedule the packet for transmission at a pre-determined time called launch time. The value of launch time is communicated from user space to Ethernet driver via launch_time field of struct xsk_tx_metadata. Suggested-by: Stanislav Fomichev Signed-off-by: Song Yoong Siang Signed-off-by: Martin KaFai Lau Acked-by: Stanislav Fomichev Acked-by: Jakub Kicinski Link: https://patch.msgid.link/20250216093430.957880-2-yoong.siang.song@intel.com --- Documentation/netlink/specs/netdev.yaml | 4 ++ Documentation/networking/xsk-tx-metadata.rst | 62 ++++++++++++++++++++++++++++ include/net/xdp_sock.h | 10 +++++ include/net/xdp_sock_drv.h | 1 + include/uapi/linux/if_xdp.h | 10 +++++ include/uapi/linux/netdev.h | 3 ++ net/core/netdev-genl.c | 2 + net/xdp/xsk.c | 3 ++ tools/include/uapi/linux/if_xdp.h | 10 +++++ tools/include/uapi/linux/netdev.h | 3 ++ 10 files changed, 108 insertions(+) (limited to 'include/net') diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml index 288923e965ae..5f6f84630b16 100644 --- a/Documentation/netlink/specs/netdev.yaml +++ b/Documentation/netlink/specs/netdev.yaml @@ -70,6 +70,10 @@ definitions: name: tx-checksum doc: L3 checksum HW offload is supported by the driver. + - + name: tx-launch-time-fifo + doc: + Launch time HW offload is supported by the driver. - name: queue-type type: enum diff --git a/Documentation/networking/xsk-tx-metadata.rst b/Documentation/networking/xsk-tx-metadata.rst index e76b0cfc32f7..df53a10ccac3 100644 --- a/Documentation/networking/xsk-tx-metadata.rst +++ b/Documentation/networking/xsk-tx-metadata.rst @@ -50,6 +50,10 @@ The flags field enables the particular offload: checksum. ``csum_start`` specifies byte offset of where the checksumming should start and ``csum_offset`` specifies byte offset where the device should store the computed checksum. +- ``XDP_TXMD_FLAGS_LAUNCH_TIME``: requests the device to schedule the + packet for transmission at a pre-determined time called launch time. The + value of launch time is indicated by ``launch_time`` field of + ``union xsk_tx_metadata``. Besides the flags above, in order to trigger the offloads, the first packet's ``struct xdp_desc`` descriptor should set ``XDP_TX_METADATA`` @@ -65,6 +69,63 @@ In this case, when running in ``XDK_COPY`` mode, the TX checksum is calculated on the CPU. Do not enable this option in production because it will negatively affect performance. +Launch Time +=========== + +The value of the requested launch time should be based on the device's PTP +Hardware Clock (PHC) to ensure accuracy. AF_XDP takes a different data path +compared to the ETF queuing discipline, which organizes packets and delays +their transmission. Instead, AF_XDP immediately hands off the packets to +the device driver without rearranging their order or holding them prior to +transmission. Since the driver maintains FIFO behavior and does not perform +packet reordering, a packet with a launch time request will block other +packets in the same Tx Queue until it is sent. Therefore, it is recommended +to allocate separate queue for scheduling traffic that is intended for +future transmission. + +In scenarios where the launch time offload feature is disabled, the device +driver is expected to disregard the launch time request. For correct +interpretation and meaningful operation, the launch time should never be +set to a value larger than the farthest programmable time in the future +(the horizon). Different devices have different hardware limitations on the +launch time offload feature. + +stmmac driver +------------- + +For stmmac, TSO and launch time (TBS) features are mutually exclusive for +each individual Tx Queue. By default, the driver configures Tx Queue 0 to +support TSO and the rest of the Tx Queues to support TBS. The launch time +hardware offload feature can be enabled or disabled by using the tc-etf +command to call the driver's ndo_setup_tc() callback. + +The value of the launch time that is programmed in the Enhanced Normal +Transmit Descriptors is a 32-bit value, where the most significant 8 bits +represent the time in seconds and the remaining 24 bits represent the time +in 256 ns increments. The programmed launch time is compared against the +PTP time (bits[39:8]) and rolls over after 256 seconds. Therefore, the +horizon of the launch time for dwmac4 and dwxlgmac2 is 128 seconds in the +future. + +igc driver +---------- + +For igc, all four Tx Queues support the launch time feature. The launch +time hardware offload feature can be enabled or disabled by using the +tc-etf command to call the driver's ndo_setup_tc() callback. When entering +TSN mode, the igc driver will reset the device and create a default Qbv +schedule with a 1-second cycle time, with all Tx Queues open at all times. + +The value of the launch time that is programmed in the Advanced Transmit +Context Descriptor is a relative offset to the starting time of the Qbv +transmission window of the queue. The Frst flag of the descriptor can be +set to schedule the packet for the next Qbv cycle. Therefore, the horizon +of the launch time for i225 and i226 is the ending time of the next cycle +of the Qbv transmission window of the queue. For example, when the Qbv +cycle time is set to 1 second, the horizon of the launch time ranges +from 1 second to 2 seconds, depending on where the Qbv cycle is currently +running. + Querying Device Capabilities ============================ @@ -74,6 +135,7 @@ Refer to ``xsk-flags`` features bitmask in - ``tx-timestamp``: device supports ``XDP_TXMD_FLAGS_TIMESTAMP`` - ``tx-checksum``: device supports ``XDP_TXMD_FLAGS_CHECKSUM`` +- ``tx-launch-time-fifo``: device supports ``XDP_TXMD_FLAGS_LAUNCH_TIME`` See ``tools/net/ynl/samples/netdev.c`` on how to query this information. diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index bfe625b55d55..a58ae7589d12 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -110,11 +110,16 @@ struct xdp_sock { * indicates position where checksumming should start. * csum_offset indicates position where checksum should be stored. * + * void (*tmo_request_launch_time)(u64 launch_time, void *priv) + * Called when AF_XDP frame requested launch time HW offload support. + * launch_time indicates the PTP time at which the device can schedule the + * packet for transmission. */ struct xsk_tx_metadata_ops { void (*tmo_request_timestamp)(void *priv); u64 (*tmo_fill_timestamp)(void *priv); void (*tmo_request_checksum)(u16 csum_start, u16 csum_offset, void *priv); + void (*tmo_request_launch_time)(u64 launch_time, void *priv); }; #ifdef CONFIG_XDP_SOCKETS @@ -162,6 +167,11 @@ static inline void xsk_tx_metadata_request(const struct xsk_tx_metadata *meta, if (!meta) return; + if (ops->tmo_request_launch_time) + if (meta->flags & XDP_TXMD_FLAGS_LAUNCH_TIME) + ops->tmo_request_launch_time(meta->request.launch_time, + priv); + if (ops->tmo_request_timestamp) if (meta->flags & XDP_TXMD_FLAGS_TIMESTAMP) ops->tmo_request_timestamp(priv); diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 15086dcf51d8..513c8e9704f6 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -216,6 +216,7 @@ xsk_buff_raw_get_ctx(const struct xsk_buff_pool *pool, u64 addr) #define XDP_TXMD_FLAGS_VALID ( \ XDP_TXMD_FLAGS_TIMESTAMP | \ XDP_TXMD_FLAGS_CHECKSUM | \ + XDP_TXMD_FLAGS_LAUNCH_TIME | \ 0) static inline bool diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index 42ec5ddaab8d..42869770776e 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -127,6 +127,12 @@ struct xdp_options { */ #define XDP_TXMD_FLAGS_CHECKSUM (1 << 1) +/* Request launch time hardware offload. The device will schedule the packet for + * transmission at a pre-determined time called launch time. The value of + * launch time is communicated via launch_time field of struct xsk_tx_metadata. + */ +#define XDP_TXMD_FLAGS_LAUNCH_TIME (1 << 2) + /* AF_XDP offloads request. 'request' union member is consumed by the driver * when the packet is being transmitted. 'completion' union member is * filled by the driver when the transmit completion arrives. @@ -142,6 +148,10 @@ struct xsk_tx_metadata { __u16 csum_start; /* Offset from csum_start where checksum should be stored. */ __u16 csum_offset; + + /* XDP_TXMD_FLAGS_LAUNCH_TIME */ + /* Launch time in nanosecond against the PTP HW Clock */ + __u64 launch_time; } request; struct { diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index 6c6ee183802d..b97ff8bbb0c6 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -59,10 +59,13 @@ enum netdev_xdp_rx_metadata { * by the driver. * @NETDEV_XSK_FLAGS_TX_CHECKSUM: L3 checksum HW offload is supported by the * driver. + * @NETDEV_XSK_FLAGS_TX_LAUNCH_TIME_FIFO: Launch time HW offload is supported + * by the driver. */ enum netdev_xsk_flags { NETDEV_XSK_FLAGS_TX_TIMESTAMP = 1, NETDEV_XSK_FLAGS_TX_CHECKSUM = 2, + NETDEV_XSK_FLAGS_TX_LAUNCH_TIME_FIFO = 4, }; enum netdev_queue_type { diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index c18bb53d13fd..55bb58d623ac 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -53,6 +53,8 @@ XDP_METADATA_KFUNC_xxx xsk_features |= NETDEV_XSK_FLAGS_TX_TIMESTAMP; if (netdev->xsk_tx_metadata_ops->tmo_request_checksum) xsk_features |= NETDEV_XSK_FLAGS_TX_CHECKSUM; + if (netdev->xsk_tx_metadata_ops->tmo_request_launch_time) + xsk_features |= NETDEV_XSK_FLAGS_TX_LAUNCH_TIME_FIFO; } if (nla_put_u32(rsp, NETDEV_A_DEV_IFINDEX, netdev->ifindex) || diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 89d2bef96469..41093ea63700 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -742,6 +742,9 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, goto free_err; } } + + if (meta->flags & XDP_TXMD_FLAGS_LAUNCH_TIME) + skb->skb_mstamp_ns = meta->request.launch_time; } } diff --git a/tools/include/uapi/linux/if_xdp.h b/tools/include/uapi/linux/if_xdp.h index 42ec5ddaab8d..42869770776e 100644 --- a/tools/include/uapi/linux/if_xdp.h +++ b/tools/include/uapi/linux/if_xdp.h @@ -127,6 +127,12 @@ struct xdp_options { */ #define XDP_TXMD_FLAGS_CHECKSUM (1 << 1) +/* Request launch time hardware offload. The device will schedule the packet for + * transmission at a pre-determined time called launch time. The value of + * launch time is communicated via launch_time field of struct xsk_tx_metadata. + */ +#define XDP_TXMD_FLAGS_LAUNCH_TIME (1 << 2) + /* AF_XDP offloads request. 'request' union member is consumed by the driver * when the packet is being transmitted. 'completion' union member is * filled by the driver when the transmit completion arrives. @@ -142,6 +148,10 @@ struct xsk_tx_metadata { __u16 csum_start; /* Offset from csum_start where checksum should be stored. */ __u16 csum_offset; + + /* XDP_TXMD_FLAGS_LAUNCH_TIME */ + /* Launch time in nanosecond against the PTP HW Clock */ + __u64 launch_time; } request; struct { diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index 6c6ee183802d..b97ff8bbb0c6 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -59,10 +59,13 @@ enum netdev_xdp_rx_metadata { * by the driver. * @NETDEV_XSK_FLAGS_TX_CHECKSUM: L3 checksum HW offload is supported by the * driver. + * @NETDEV_XSK_FLAGS_TX_LAUNCH_TIME_FIFO: Launch time HW offload is supported + * by the driver. */ enum netdev_xsk_flags { NETDEV_XSK_FLAGS_TX_TIMESTAMP = 1, NETDEV_XSK_FLAGS_TX_CHECKSUM = 2, + NETDEV_XSK_FLAGS_TX_LAUNCH_TIME_FIFO = 4, }; enum netdev_queue_type { -- cgit v1.2.3 From 585b64f5a62089ef42889b106b063d089feb6599 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 19 Feb 2025 15:50:57 +0200 Subject: xfrm: delay initialization of offload path till its actually requested XFRM offload path is probed even if offload isn't needed at all. Let's make sure that x->type_offload pointer stays NULL for such path to reduce ambiguity. Fixes: 9d389d7f84bb ("xfrm: Add a xfrm type offload.") Signed-off-by: Leon Romanovsky Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 11 ++++++++++- net/xfrm/xfrm_device.c | 13 ++++++++----- net/xfrm/xfrm_state.c | 32 ++++++++++++++------------------ net/xfrm/xfrm_user.c | 2 +- 4 files changed, 33 insertions(+), 25 deletions(-) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index ed4b83696c77..e1eed5d47d07 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -464,6 +464,15 @@ struct xfrm_type_offload { int xfrm_register_type_offload(const struct xfrm_type_offload *type, unsigned short family); void xfrm_unregister_type_offload(const struct xfrm_type_offload *type, unsigned short family); +void xfrm_set_type_offload(struct xfrm_state *x); +static inline void xfrm_unset_type_offload(struct xfrm_state *x) +{ + if (!x->type_offload) + return; + + module_put(x->type_offload->owner); + x->type_offload = NULL; +} /** * struct xfrm_mode_cbs - XFRM mode callbacks @@ -1760,7 +1769,7 @@ void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si); u32 xfrm_replay_seqhi(struct xfrm_state *x, __be32 net_seq); int xfrm_init_replay(struct xfrm_state *x, struct netlink_ext_ack *extack); u32 xfrm_state_mtu(struct xfrm_state *x, int mtu); -int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload, +int __xfrm_init_state(struct xfrm_state *x, bool init_replay, struct netlink_ext_ack *extack); int xfrm_init_state(struct xfrm_state *x); int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type); diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index d1fa94e52cea..97c8030cc417 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -244,11 +244,6 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, xfrm_address_t *daddr; bool is_packet_offload; - if (!x->type_offload) { - NL_SET_ERR_MSG(extack, "Type doesn't support offload"); - return -EINVAL; - } - if (xuo->flags & ~(XFRM_OFFLOAD_IPV6 | XFRM_OFFLOAD_INBOUND | XFRM_OFFLOAD_PACKET)) { NL_SET_ERR_MSG(extack, "Unrecognized flags in offload request"); @@ -310,6 +305,13 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, return -EINVAL; } + xfrm_set_type_offload(x); + if (!x->type_offload) { + NL_SET_ERR_MSG(extack, "Type doesn't support offload"); + dev_put(dev); + return -EINVAL; + } + xso->dev = dev; netdev_tracker_alloc(dev, &xso->dev_tracker, GFP_ATOMIC); xso->real_dev = dev; @@ -332,6 +334,7 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, netdev_put(dev, &xso->dev_tracker); xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED; + xfrm_unset_type_offload(x); /* User explicitly requested packet offload mode and configured * policy in addition to the XFRM state. So be civil to users, * and return an error instead of taking fallback path. diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index ad2202fa82f3..69af5964c886 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -424,18 +424,18 @@ void xfrm_unregister_type_offload(const struct xfrm_type_offload *type, } EXPORT_SYMBOL(xfrm_unregister_type_offload); -static const struct xfrm_type_offload * -xfrm_get_type_offload(u8 proto, unsigned short family, bool try_load) +void xfrm_set_type_offload(struct xfrm_state *x) { const struct xfrm_type_offload *type = NULL; struct xfrm_state_afinfo *afinfo; + bool try_load = true; retry: - afinfo = xfrm_state_get_afinfo(family); + afinfo = xfrm_state_get_afinfo(x->props.family); if (unlikely(afinfo == NULL)) - return NULL; + goto out; - switch (proto) { + switch (x->id.proto) { case IPPROTO_ESP: type = afinfo->type_offload_esp; break; @@ -449,18 +449,16 @@ retry: rcu_read_unlock(); if (!type && try_load) { - request_module("xfrm-offload-%d-%d", family, proto); + request_module("xfrm-offload-%d-%d", x->props.family, + x->id.proto); try_load = false; goto retry; } - return type; -} - -static void xfrm_put_type_offload(const struct xfrm_type_offload *type) -{ - module_put(type->owner); +out: + x->type_offload = type; } +EXPORT_SYMBOL(xfrm_set_type_offload); static const struct xfrm_mode xfrm4_mode_map[XFRM_MODE_MAX] = { [XFRM_MODE_BEET] = { @@ -609,8 +607,6 @@ static void ___xfrm_state_destroy(struct xfrm_state *x) kfree(x->coaddr); kfree(x->replay_esn); kfree(x->preplay_esn); - if (x->type_offload) - xfrm_put_type_offload(x->type_offload); if (x->type) { x->type->destructor(x); xfrm_put_type(x->type); @@ -784,6 +780,8 @@ void xfrm_dev_state_free(struct xfrm_state *x) struct xfrm_dev_offload *xso = &x->xso; struct net_device *dev = READ_ONCE(xso->dev); + xfrm_unset_type_offload(x); + if (dev && dev->xfrmdev_ops) { spin_lock_bh(&xfrm_state_dev_gc_lock); if (!hlist_unhashed(&x->dev_gclist)) @@ -3122,7 +3120,7 @@ u32 xfrm_state_mtu(struct xfrm_state *x, int mtu) } EXPORT_SYMBOL_GPL(xfrm_state_mtu); -int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload, +int __xfrm_init_state(struct xfrm_state *x, bool init_replay, struct netlink_ext_ack *extack) { const struct xfrm_mode *inner_mode; @@ -3178,8 +3176,6 @@ int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload, goto error; } - x->type_offload = xfrm_get_type_offload(x->id.proto, family, offload); - err = x->type->init_state(x, extack); if (err) goto error; @@ -3229,7 +3225,7 @@ int xfrm_init_state(struct xfrm_state *x) { int err; - err = __xfrm_init_state(x, true, false, NULL); + err = __xfrm_init_state(x, true, NULL); if (!err) x->km.state = XFRM_STATE_VALID; diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 5877eabe9d95..b5266e0848e8 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -919,7 +919,7 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, goto error; } - err = __xfrm_init_state(x, false, attrs[XFRMA_OFFLOAD_DEV], extack); + err = __xfrm_init_state(x, false, extack); if (err) goto error; -- cgit v1.2.3 From b6ccf61aa4fda43f7961f2e83433bc6e6b78ab95 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 19 Feb 2025 15:50:58 +0200 Subject: xfrm: simplify SA initialization routine SA replay mode is initialized differently for user-space and kernel-space users, but the call to xfrm_init_replay() existed in common path with boolean protection. That caused to situation where we have two different function orders. So let's rewrite the SA initialization flow to have same order for both in-kernel and user-space callers. Signed-off-by: Leon Romanovsky Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 3 +-- net/xfrm/xfrm_state.c | 22 ++++++++++------------ net/xfrm/xfrm_user.c | 2 +- 3 files changed, 12 insertions(+), 15 deletions(-) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index e1eed5d47d07..15997374a594 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1769,8 +1769,7 @@ void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si); u32 xfrm_replay_seqhi(struct xfrm_state *x, __be32 net_seq); int xfrm_init_replay(struct xfrm_state *x, struct netlink_ext_ack *extack); u32 xfrm_state_mtu(struct xfrm_state *x, int mtu); -int __xfrm_init_state(struct xfrm_state *x, bool init_replay, - struct netlink_ext_ack *extack); +int __xfrm_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack); int xfrm_init_state(struct xfrm_state *x); int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type); int xfrm_input_resume(struct sk_buff *skb, int nexthdr); diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 69af5964c886..7b1028671144 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -3120,8 +3120,7 @@ u32 xfrm_state_mtu(struct xfrm_state *x, int mtu) } EXPORT_SYMBOL_GPL(xfrm_state_mtu); -int __xfrm_init_state(struct xfrm_state *x, bool init_replay, - struct netlink_ext_ack *extack) +int __xfrm_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack) { const struct xfrm_mode *inner_mode; const struct xfrm_mode *outer_mode; @@ -3188,12 +3187,6 @@ int __xfrm_init_state(struct xfrm_state *x, bool init_replay, } x->outer_mode = *outer_mode; - if (init_replay) { - err = xfrm_init_replay(x, extack); - if (err) - goto error; - } - if (x->nat_keepalive_interval) { if (x->dir != XFRM_SA_DIR_OUT) { NL_SET_ERR_MSG(extack, "NAT keepalive is only supported for outbound SAs"); @@ -3225,11 +3218,16 @@ int xfrm_init_state(struct xfrm_state *x) { int err; - err = __xfrm_init_state(x, true, NULL); - if (!err) - x->km.state = XFRM_STATE_VALID; + err = __xfrm_init_state(x, NULL); + if (err) + return err; - return err; + err = xfrm_init_replay(x, NULL); + if (err) + return err; + + x->km.state = XFRM_STATE_VALID; + return 0; } EXPORT_SYMBOL(xfrm_init_state); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index b5266e0848e8..784a2d124749 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -919,7 +919,7 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, goto error; } - err = __xfrm_init_state(x, false, extack); + err = __xfrm_init_state(x, extack); if (err) goto error; -- cgit v1.2.3 From ca70c104e1511c8de734ab8863a40f7c8e21a948 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 19 Feb 2025 15:51:01 +0200 Subject: xfrm: check for PMTU in tunnel mode for packet offload In tunnel mode, for the packet offload, there were no PMTU signaling to the upper level about need to fragment the packet. As a solution, call to already existing xfrm[4|6]_tunnel_check_size() to perform that. Signed-off-by: Leon Romanovsky Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 9 +++++++++ net/xfrm/xfrm_device.c | 10 ++++++++-- net/xfrm/xfrm_output.c | 6 ++++-- 3 files changed, 21 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 15997374a594..39365fd2ea17 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1781,6 +1781,15 @@ int xfrm_trans_queue(struct sk_buff *skb, struct sk_buff *)); int xfrm_output_resume(struct sock *sk, struct sk_buff *skb, int err); int xfrm_output(struct sock *sk, struct sk_buff *skb); +int xfrm4_tunnel_check_size(struct sk_buff *skb); +#if IS_ENABLED(CONFIG_IPV6) +int xfrm6_tunnel_check_size(struct sk_buff *skb); +#else +static inline int xfrm6_tunnel_check_size(struct sk_buff *skb) +{ + return -EMSGSIZE; +} +#endif #if IS_ENABLED(CONFIG_NET_PKTGEN) int pktgen_xfrm_outer_mode_output(struct xfrm_state *x, struct sk_buff *skb); diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index f9d985ef30f2..d62f76161d83 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -418,12 +418,12 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x) struct dst_entry *dst = skb_dst(skb); struct xfrm_dst *xdst = (struct xfrm_dst *)dst; struct net_device *dev = x->xso.dev; + bool check_tunnel_size; if (x->xso.type == XFRM_DEV_OFFLOAD_UNSPECIFIED) return false; - if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET || - ((dev == xfrm_dst_path(dst)->dev) && !xdst->child->xfrm)) { + if ((dev == xfrm_dst_path(dst)->dev) && !xdst->child->xfrm) { mtu = xfrm_state_mtu(x, xdst->child_mtu_cached); if (skb->len <= mtu) goto ok; @@ -435,16 +435,22 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x) return false; ok: + check_tunnel_size = x->xso.type == XFRM_DEV_OFFLOAD_PACKET && + x->props.mode == XFRM_MODE_TUNNEL; switch (x->props.family) { case AF_INET: /* Check for IPv4 options */ if (ip_hdr(skb)->ihl != 5) return false; + if (check_tunnel_size && xfrm4_tunnel_check_size(skb)) + return false; break; case AF_INET6: /* Check for IPv6 extensions */ if (ipv6_ext_hdr(ipv6_hdr(skb)->nexthdr)) return false; + if (check_tunnel_size && xfrm6_tunnel_check_size(skb)) + return false; break; default: break; diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index f7abd42c077d..34c8e266641c 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -786,7 +786,7 @@ out: } EXPORT_SYMBOL_GPL(xfrm_output); -static int xfrm4_tunnel_check_size(struct sk_buff *skb) +int xfrm4_tunnel_check_size(struct sk_buff *skb) { int mtu, ret = 0; @@ -812,6 +812,7 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb) out: return ret; } +EXPORT_SYMBOL_GPL(xfrm4_tunnel_check_size); static int xfrm4_extract_output(struct xfrm_state *x, struct sk_buff *skb) { @@ -834,7 +835,7 @@ static int xfrm4_extract_output(struct xfrm_state *x, struct sk_buff *skb) } #if IS_ENABLED(CONFIG_IPV6) -static int xfrm6_tunnel_check_size(struct sk_buff *skb) +int xfrm6_tunnel_check_size(struct sk_buff *skb) { int mtu, ret = 0; struct dst_entry *dst = skb_dst(skb); @@ -864,6 +865,7 @@ static int xfrm6_tunnel_check_size(struct sk_buff *skb) out: return ret; } +EXPORT_SYMBOL_GPL(xfrm6_tunnel_check_size); #endif static int xfrm6_extract_output(struct xfrm_state *x, struct sk_buff *skb) -- cgit v1.2.3 From 69c7be1b903fca2835e80ec506bd1d75ce84fb4d Mon Sep 17 00:00:00 2001 From: Xiao Liang Date: Wed, 19 Feb 2025 20:50:28 +0800 Subject: rtnetlink: Pack newlink() params into struct There are 4 net namespaces involved when creating links: - source netns - where the netlink socket resides, - target netns - where to put the device being created, - link netns - netns associated with the device (backend), - peer netns - netns of peer device. Currently, two nets are passed to newlink() callback - "src_net" parameter and "dev_net" (implicitly in net_device). They are set as follows, depending on netlink attributes in the request. +------------+-------------------+---------+---------+ | peer netns | IFLA_LINK_NETNSID | src_net | dev_net | +------------+-------------------+---------+---------+ | | absent | source | target | | absent +-------------------+---------+---------+ | | present | link | link | +------------+-------------------+---------+---------+ | | absent | peer | target | | present +-------------------+---------+---------+ | | present | peer | link | +------------+-------------------+---------+---------+ When IFLA_LINK_NETNSID is present, the device is created in link netns first and then moved to target netns. This has some side effects, including extra ifindex allocation, ifname validation and link events. These could be avoided if we create it in target netns from the beginning. On the other hand, the meaning of src_net parameter is ambiguous. It varies depending on how parameters are passed. It is the effective link (or peer netns) by design, but some drivers ignore it and use dev_net instead. To provide more netns context for drivers, this patch packs existing newlink() parameters, along with the source netns, link netns and peer netns, into a struct. The old "src_net" is renamed to "net" to avoid confusion with real source netns, and will be deprecated later. The use of src_net are converted to params->net trivially. Signed-off-by: Xiao Liang Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250219125039.18024-3-shaw.leon@gmail.com Signed-off-by: Jakub Kicinski --- drivers/infiniband/ulp/ipoib/ipoib_netlink.c | 7 ++++-- drivers/net/amt.c | 7 ++++-- drivers/net/bareudp.c | 7 ++++-- drivers/net/bonding/bond_netlink.c | 6 ++++-- drivers/net/can/dev/netlink.c | 4 ++-- drivers/net/can/vxcan.c | 7 ++++-- drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c | 7 ++++-- drivers/net/geneve.c | 7 ++++-- drivers/net/gtp.c | 6 ++++-- drivers/net/ipvlan/ipvlan.h | 3 +-- drivers/net/ipvlan/ipvlan_main.c | 6 ++++-- drivers/net/ipvlan/ipvtap.c | 6 +++--- drivers/net/macsec.c | 7 ++++-- drivers/net/macvlan.c | 19 +++++++++------- drivers/net/macvtap.c | 6 +++--- drivers/net/netkit.c | 14 +++++++----- drivers/net/pfcp.c | 5 +++-- drivers/net/ppp/ppp_generic.c | 7 ++++-- drivers/net/team/team_core.c | 6 ++++-- drivers/net/veth.c | 7 ++++-- drivers/net/vrf.c | 5 +++-- drivers/net/vxlan/vxlan_core.c | 7 ++++-- drivers/net/wireguard/device.c | 5 +++-- drivers/net/wireless/virtual/virt_wifi.c | 6 ++++-- drivers/net/wwan/wwan_core.c | 16 ++++++++++---- include/linux/if_macvlan.h | 6 ++++-- include/net/rtnetlink.h | 25 ++++++++++++++++++---- net/8021q/vlan_netlink.c | 7 ++++-- net/batman-adv/soft-interface.c | 9 ++++---- net/bridge/br_netlink.c | 6 ++++-- net/caif/chnl_net.c | 5 +++-- net/core/rtnetlink.c | 15 +++++++++---- net/hsr/hsr_netlink.c | 6 ++++-- net/ieee802154/6lowpan/core.c | 5 +++-- net/ipv4/ip_gre.c | 16 +++++++++----- net/ipv4/ip_vti.c | 6 ++++-- net/ipv4/ipip.c | 6 ++++-- net/ipv6/ip6_gre.c | 14 ++++++++---- net/ipv6/ip6_tunnel.c | 6 ++++-- net/ipv6/ip6_vti.c | 5 +++-- net/ipv6/sit.c | 6 ++++-- net/xfrm/xfrm_interface_core.c | 7 +++--- 42 files changed, 223 insertions(+), 110 deletions(-) (limited to 'include/net') diff --git a/drivers/infiniband/ulp/ipoib/ipoib_netlink.c b/drivers/infiniband/ulp/ipoib/ipoib_netlink.c index 9ad8d9856275..16cb8ced9f35 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_netlink.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_netlink.c @@ -97,10 +97,13 @@ out_err: return ret; } -static int ipoib_new_child_link(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int ipoib_new_child_link(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct nlattr **data = params->data; + struct net *src_net = params->net; + struct nlattr **tb = params->tb; struct net_device *pdev; struct ipoib_dev_priv *ppriv; u16 child_pkey; diff --git a/drivers/net/amt.c b/drivers/net/amt.c index 98c6205ed19f..96b7ec9a2c13 100644 --- a/drivers/net/amt.c +++ b/drivers/net/amt.c @@ -3161,11 +3161,14 @@ static int amt_validate(struct nlattr *tb[], struct nlattr *data[], return 0; } -static int amt_newlink(struct net *net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int amt_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct amt_dev *amt = netdev_priv(dev); + struct nlattr **data = params->data; + struct nlattr **tb = params->tb; + struct net *net = params->net; int err = -EINVAL; amt->net = net; diff --git a/drivers/net/bareudp.c b/drivers/net/bareudp.c index 70814303aab8..fc21dcfb4848 100644 --- a/drivers/net/bareudp.c +++ b/drivers/net/bareudp.c @@ -698,10 +698,13 @@ static void bareudp_dellink(struct net_device *dev, struct list_head *head) unregister_netdevice_queue(dev, head); } -static int bareudp_newlink(struct net *net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int bareudp_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct nlattr **data = params->data; + struct nlattr **tb = params->tb; + struct net *net = params->net; struct bareudp_conf conf; int err; diff --git a/drivers/net/bonding/bond_netlink.c b/drivers/net/bonding/bond_netlink.c index 2a6a424806aa..ac5e402c34bc 100644 --- a/drivers/net/bonding/bond_netlink.c +++ b/drivers/net/bonding/bond_netlink.c @@ -564,10 +564,12 @@ static int bond_changelink(struct net_device *bond_dev, struct nlattr *tb[], return 0; } -static int bond_newlink(struct net *src_net, struct net_device *bond_dev, - struct nlattr *tb[], struct nlattr *data[], +static int bond_newlink(struct net_device *bond_dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct nlattr **data = params->data; + struct nlattr **tb = params->tb; int err; err = bond_changelink(bond_dev, tb, data, extack); diff --git a/drivers/net/can/dev/netlink.c b/drivers/net/can/dev/netlink.c index 01aacdcda260..f1db9b7ffd4d 100644 --- a/drivers/net/can/dev/netlink.c +++ b/drivers/net/can/dev/netlink.c @@ -624,8 +624,8 @@ nla_put_failure: return -EMSGSIZE; } -static int can_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int can_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { return -EOPNOTSUPP; diff --git a/drivers/net/can/vxcan.c b/drivers/net/can/vxcan.c index ca8811941085..6f8ebb1cfd7b 100644 --- a/drivers/net/can/vxcan.c +++ b/drivers/net/can/vxcan.c @@ -172,10 +172,13 @@ static void vxcan_setup(struct net_device *dev) /* forward declaration for rtnl_create_link() */ static struct rtnl_link_ops vxcan_link_ops; -static int vxcan_newlink(struct net *peer_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int vxcan_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct nlattr **data = params->data; + struct net *peer_net = params->net; + struct nlattr **tb = params->tb; struct vxcan_priv *priv; struct net_device *peer; diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c index f3bea196a8f9..8151e91395e2 100644 --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c @@ -117,11 +117,14 @@ static void rmnet_unregister_bridge(struct rmnet_port *port) rmnet_unregister_real_device(bridge_dev); } -static int rmnet_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int rmnet_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { u32 data_format = RMNET_FLAGS_INGRESS_DEAGGREGATION; + struct nlattr **data = params->data; + struct net *src_net = params->net; + struct nlattr **tb = params->tb; struct net_device *real_dev; int mode = RMNET_EPMODE_VND; struct rmnet_endpoint *ep; diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index dbb3960126ee..d6e8b2521052 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -1614,10 +1614,13 @@ static void geneve_link_config(struct net_device *dev, geneve_change_mtu(dev, ldev_mtu - info->options_len); } -static int geneve_newlink(struct net *net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int geneve_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct nlattr **data = params->data; + struct nlattr **tb = params->tb; + struct net *net = params->net; struct geneve_config cfg = { .df = GENEVE_DF_UNSET, .use_udp6_rx_checksums = false, diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index b7b46c5e6399..762514af8b30 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -1462,10 +1462,12 @@ static int gtp_create_sockets(struct gtp_dev *gtp, const struct nlattr *nla, #define GTP_TH_MAXLEN (sizeof(struct udphdr) + sizeof(struct gtp0_header)) #define GTP_IPV6_MAXLEN (sizeof(struct ipv6hdr) + GTP_TH_MAXLEN) -static int gtp_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int gtp_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct nlattr **data = params->data; + struct net *src_net = params->net; unsigned int role = GTP_ROLE_GGSN; struct gtp_dev *gtp; struct gtp_net *gn; diff --git a/drivers/net/ipvlan/ipvlan.h b/drivers/net/ipvlan/ipvlan.h index 025e0c19ec25..50de3ee204db 100644 --- a/drivers/net/ipvlan/ipvlan.h +++ b/drivers/net/ipvlan/ipvlan.h @@ -166,8 +166,7 @@ struct ipvl_addr *ipvlan_addr_lookup(struct ipvl_port *port, void *lyr3h, void *ipvlan_get_L3_hdr(struct ipvl_port *port, struct sk_buff *skb, int *type); void ipvlan_count_rx(const struct ipvl_dev *ipvlan, unsigned int len, bool success, bool mcast); -int ipvlan_link_new(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +int ipvlan_link_new(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack); void ipvlan_link_delete(struct net_device *dev, struct list_head *head); void ipvlan_link_setup(struct net_device *dev); diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index da3a97a65507..19ce19ca7e32 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -532,11 +532,13 @@ err: return ret; } -int ipvlan_link_new(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +int ipvlan_link_new(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct ipvl_dev *ipvlan = netdev_priv(dev); + struct nlattr **data = params->data; + struct net *src_net = params->net; + struct nlattr **tb = params->tb; struct ipvl_port *port; struct net_device *phy_dev; int err; diff --git a/drivers/net/ipvlan/ipvtap.c b/drivers/net/ipvlan/ipvtap.c index 1afc4c47be73..edd13916831a 100644 --- a/drivers/net/ipvlan/ipvtap.c +++ b/drivers/net/ipvlan/ipvtap.c @@ -73,8 +73,8 @@ static void ipvtap_update_features(struct tap_dev *tap, netdev_update_features(vlan->dev); } -static int ipvtap_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int ipvtap_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct ipvtap_dev *vlantap = netdev_priv(dev); @@ -97,7 +97,7 @@ static int ipvtap_newlink(struct net *src_net, struct net_device *dev, /* Don't put anything that may fail after macvlan_common_newlink * because we can't undo what it does. */ - err = ipvlan_link_new(src_net, dev, tb, data, extack); + err = ipvlan_link_new(dev, params, extack); if (err) { netdev_rx_handler_unregister(dev); return err; diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index 1bc1e5993f56..1869b0513f57 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -4141,11 +4141,14 @@ static int macsec_add_dev(struct net_device *dev, sci_t sci, u8 icv_len) static struct lock_class_key macsec_netdev_addr_lock_key; -static int macsec_newlink(struct net *net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int macsec_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct macsec_dev *macsec = macsec_priv(dev); + struct nlattr **data = params->data; + struct nlattr **tb = params->tb; + struct net *net = params->net; rx_handler_func_t *rx_handler; u8 icv_len = MACSEC_DEFAULT_ICV_LEN; struct net_device *real_dev; diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index fed4fe2a4748..f903b414eaeb 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -1440,16 +1440,19 @@ static int macvlan_changelink_sources(struct macvlan_dev *vlan, u32 mode, return 0; } -int macvlan_common_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +int macvlan_common_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct macvlan_dev *vlan = netdev_priv(dev); - struct macvlan_port *port; + struct nlattr **data = params->data; + struct net *src_net = params->net; + struct nlattr **tb = params->tb; struct net_device *lowerdev; - int err; - int macmode; + struct macvlan_port *port; bool create = false; + int macmode; + int err; if (!tb[IFLA_LINK]) return -EINVAL; @@ -1565,11 +1568,11 @@ destroy_macvlan_port: } EXPORT_SYMBOL_GPL(macvlan_common_newlink); -static int macvlan_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int macvlan_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { - return macvlan_common_newlink(src_net, dev, tb, data, extack); + return macvlan_common_newlink(dev, params, extack); } void macvlan_dellink(struct net_device *dev, struct list_head *head) diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index 29a5929d48e5..b391a0f740a3 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -77,8 +77,8 @@ static void macvtap_update_features(struct tap_dev *tap, netdev_update_features(vlan->dev); } -static int macvtap_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int macvtap_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct macvtap_dev *vlantap = netdev_priv(dev); @@ -105,7 +105,7 @@ static int macvtap_newlink(struct net *src_net, struct net_device *dev, /* Don't put anything that may fail after macvlan_common_newlink * because we can't undo what it does. */ - err = macvlan_common_newlink(src_net, dev, tb, data, extack); + err = macvlan_common_newlink(dev, params, extack); if (err) { netdev_rx_handler_unregister(dev); return err; diff --git a/drivers/net/netkit.c b/drivers/net/netkit.c index 1e1b00756be7..640a2dbbbd28 100644 --- a/drivers/net/netkit.c +++ b/drivers/net/netkit.c @@ -327,17 +327,20 @@ static int netkit_validate(struct nlattr *tb[], struct nlattr *data[], static struct rtnl_link_ops netkit_link_ops; -static int netkit_new_link(struct net *peer_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int netkit_new_link(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { - struct nlattr *peer_tb[IFLA_MAX + 1], **tbp = tb, *attr; - enum netkit_action policy_prim = NETKIT_PASS; - enum netkit_action policy_peer = NETKIT_PASS; enum netkit_scrub scrub_prim = NETKIT_SCRUB_DEFAULT; enum netkit_scrub scrub_peer = NETKIT_SCRUB_DEFAULT; + struct nlattr *peer_tb[IFLA_MAX + 1], **tbp, *attr; + enum netkit_action policy_prim = NETKIT_PASS; + enum netkit_action policy_peer = NETKIT_PASS; + struct nlattr **data = params->data; + struct net *peer_net = params->net; enum netkit_mode mode = NETKIT_L3; unsigned char ifname_assign_type; + struct nlattr **tb = params->tb; u16 headroom = 0, tailroom = 0; struct ifinfomsg *ifmp = NULL; struct net_device *peer; @@ -345,6 +348,7 @@ static int netkit_new_link(struct net *peer_net, struct net_device *dev, struct netkit *nk; int err; + tbp = tb; if (data) { if (data[IFLA_NETKIT_MODE]) mode = nla_get_u32(data[IFLA_NETKIT_MODE]); diff --git a/drivers/net/pfcp.c b/drivers/net/pfcp.c index 68d0d9e92a22..7b0575940e1d 100644 --- a/drivers/net/pfcp.c +++ b/drivers/net/pfcp.c @@ -184,11 +184,12 @@ static int pfcp_add_sock(struct pfcp_dev *pfcp) return PTR_ERR_OR_ZERO(pfcp->sock); } -static int pfcp_newlink(struct net *net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int pfcp_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct pfcp_dev *pfcp = netdev_priv(dev); + struct net *net = params->net; struct pfcp_net *pn; int err; diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index 4583e15ad03a..b3340f8a6149 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -1303,10 +1303,13 @@ static int ppp_nl_validate(struct nlattr *tb[], struct nlattr *data[], return 0; } -static int ppp_nl_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int ppp_nl_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct nlattr **data = params->data; + struct net *src_net = params->net; + struct nlattr **tb = params->tb; struct ppp_config conf = { .unit = -1, .ifname_is_set = true, diff --git a/drivers/net/team/team_core.c b/drivers/net/team/team_core.c index f4019815f473..78edb8186b6d 100644 --- a/drivers/net/team/team_core.c +++ b/drivers/net/team/team_core.c @@ -2218,10 +2218,12 @@ static void team_setup(struct net_device *dev) dev->features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; } -static int team_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int team_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct nlattr **tb = params->tb; + if (tb[IFLA_ADDRESS] == NULL) eth_hw_addr_random(dev); diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 01251868a9c2..7dfda89f072f 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -1765,10 +1765,13 @@ static int veth_init_queues(struct net_device *dev, struct nlattr *tb[]) return 0; } -static int veth_newlink(struct net *peer_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int veth_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct nlattr **data = params->data; + struct net *peer_net = params->net; + struct nlattr **tb = params->tb; int err; struct net_device *peer; struct veth_priv *priv; diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 5f21ce1013c4..849c3ced2690 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -1675,11 +1675,12 @@ static void vrf_dellink(struct net_device *dev, struct list_head *head) unregister_netdevice_queue(dev, head); } -static int vrf_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int vrf_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct net_vrf *vrf = netdev_priv(dev); + struct nlattr **data = params->data; struct netns_vrf *nn_vrf; bool *add_fib_rules; struct net *net; diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index e2354c02def0..1a1d03abb6b9 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -4400,10 +4400,13 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[], return 0; } -static int vxlan_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int vxlan_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct nlattr **data = params->data; + struct net *src_net = params->net; + struct nlattr **tb = params->tb; struct vxlan_config conf; int err; diff --git a/drivers/net/wireguard/device.c b/drivers/net/wireguard/device.c index 6cf173a008e7..404cf05bd72b 100644 --- a/drivers/net/wireguard/device.c +++ b/drivers/net/wireguard/device.c @@ -307,11 +307,12 @@ static void wg_setup(struct net_device *dev) wg->dev = dev; } -static int wg_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int wg_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct wg_device *wg = netdev_priv(dev); + struct net *src_net = params->net; int ret = -ENOMEM; rcu_assign_pointer(wg->creating_net, src_net); diff --git a/drivers/net/wireless/virtual/virt_wifi.c b/drivers/net/wireless/virtual/virt_wifi.c index 4ee374080466..26905b2b3ba3 100644 --- a/drivers/net/wireless/virtual/virt_wifi.c +++ b/drivers/net/wireless/virtual/virt_wifi.c @@ -519,11 +519,13 @@ static rx_handler_result_t virt_wifi_rx_handler(struct sk_buff **pskb) } /* Called with rtnl lock held. */ -static int virt_wifi_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int virt_wifi_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct virt_wifi_netdev_priv *priv = netdev_priv(dev); + struct net *src_net = params->net; + struct nlattr **tb = params->tb; int err; if (!tb[IFLA_LINK]) diff --git a/drivers/net/wwan/wwan_core.c b/drivers/net/wwan/wwan_core.c index a51e2755991a..a05c49b4e7f8 100644 --- a/drivers/net/wwan/wwan_core.c +++ b/drivers/net/wwan/wwan_core.c @@ -967,15 +967,18 @@ out: return dev; } -static int wwan_rtnl_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int wwan_rtnl_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct wwan_device *wwandev = wwan_dev_get_by_parent(dev->dev.parent); - u32 link_id = nla_get_u32(data[IFLA_WWAN_LINK_ID]); struct wwan_netdev_priv *priv = netdev_priv(dev); + struct nlattr **data = params->data; + u32 link_id; int ret; + link_id = nla_get_u32(data[IFLA_WWAN_LINK_ID]); + if (IS_ERR(wwandev)) return PTR_ERR(wwandev); @@ -1061,6 +1064,11 @@ static void wwan_create_default_link(struct wwan_device *wwandev, { struct nlattr *tb[IFLA_MAX + 1], *linkinfo[IFLA_INFO_MAX + 1]; struct nlattr *data[IFLA_WWAN_MAX + 1]; + struct rtnl_newlink_params params = { + .net = &init_net, + .tb = tb, + .data = data, + }; struct net_device *dev; struct nlmsghdr *nlh; struct sk_buff *msg; @@ -1105,7 +1113,7 @@ static void wwan_create_default_link(struct wwan_device *wwandev, if (WARN_ON(IS_ERR(dev))) goto unlock; - if (WARN_ON(wwan_rtnl_newlink(&init_net, dev, tb, data, NULL))) { + if (WARN_ON(wwan_rtnl_newlink(dev, ¶ms, NULL))) { free_netdev(dev); goto unlock; } diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h index 523025106a64..0f7281e3e448 100644 --- a/include/linux/if_macvlan.h +++ b/include/linux/if_macvlan.h @@ -59,8 +59,10 @@ static inline void macvlan_count_rx(const struct macvlan_dev *vlan, extern void macvlan_common_setup(struct net_device *dev); -extern int macvlan_common_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +struct rtnl_newlink_params; + +extern int macvlan_common_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack); extern void macvlan_dellink(struct net_device *dev, struct list_head *head); diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index bc0069a8b6ea..563a6a27436c 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -69,6 +69,25 @@ static inline int rtnl_msg_family(const struct nlmsghdr *nlh) return AF_UNSPEC; } +/** + * struct rtnl_newlink_params - parameters of rtnl_link_ops::newlink() + * + * @net: Netns of interest + * @src_net: Source netns of rtnetlink socket + * @link_net: Link netns by IFLA_LINK_NETNSID, NULL if not specified + * @peer_net: Peer netns + * @tb: IFLA_* attributes + * @data: IFLA_INFO_DATA attributes + */ +struct rtnl_newlink_params { + struct net *net; + struct net *src_net; + struct net *link_net; + struct net *peer_net; + struct nlattr **tb; + struct nlattr **data; +}; + /** * struct rtnl_link_ops - rtnetlink link operations * @@ -125,10 +144,8 @@ struct rtnl_link_ops { struct nlattr *data[], struct netlink_ext_ack *extack); - int (*newlink)(struct net *src_net, - struct net_device *dev, - struct nlattr *tb[], - struct nlattr *data[], + int (*newlink)(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack); int (*changelink)(struct net_device *dev, struct nlattr *tb[], diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c index 134419667d59..91df0f96e32a 100644 --- a/net/8021q/vlan_netlink.c +++ b/net/8021q/vlan_netlink.c @@ -135,11 +135,14 @@ static int vlan_changelink(struct net_device *dev, struct nlattr *tb[], return 0; } -static int vlan_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int vlan_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); + struct nlattr **data = params->data; + struct net *src_net = params->net; + struct nlattr **tb = params->tb; struct net_device *real_dev; unsigned int max_mtu; __be16 proto; diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 822d788a5f86..d893c8013261 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -1077,19 +1077,18 @@ static int batadv_softif_validate(struct nlattr *tb[], struct nlattr *data[], /** * batadv_softif_newlink() - pre-initialize and register new batadv link - * @src_net: the applicable net namespace * @dev: network device to register - * @tb: IFLA_INFO_DATA netlink attributes - * @data: enum batadv_ifla_attrs attributes + * @params: rtnl newlink parameters * @extack: extended ACK report struct * * Return: 0 if successful or error otherwise. */ -static int batadv_softif_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int batadv_softif_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct batadv_priv *bat_priv = netdev_priv(dev); + struct nlattr **data = params->data; const char *algo_name; int err; diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 3e0f47203f2a..6e337937d0d7 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -1553,11 +1553,13 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], return 0; } -static int br_dev_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int br_dev_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct net_bridge *br = netdev_priv(dev); + struct nlattr **data = params->data; + struct nlattr **tb = params->tb; int err; err = register_netdevice(dev); diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c index 94ad09e36df2..fa6a3c2634a8 100644 --- a/net/caif/chnl_net.c +++ b/net/caif/chnl_net.c @@ -438,10 +438,11 @@ static void caif_netlink_parms(struct nlattr *data[], } } -static int ipcaif_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int ipcaif_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct nlattr **data = params->data; int ret; struct chnl_net *caifdev; ASSERT_RTNL(); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 0f3e2c6021de..9ebbde0d131c 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3767,7 +3767,13 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, struct netlink_ext_ack *extack) { unsigned char name_assign_type = NET_NAME_USER; - struct net *net = sock_net(skb->sk); + struct rtnl_newlink_params params = { + .src_net = sock_net(skb->sk), + .link_net = link_net, + .peer_net = peer_net, + .tb = tb, + .data = data, + }; u32 portid = NETLINK_CB(skb).portid; struct net_device *dev; char ifname[IFNAMSIZ]; @@ -3792,13 +3798,14 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, dev->ifindex = ifm->ifi_index; + params.net = params.src_net; if (link_net) - net = link_net; + params.net = link_net; if (peer_net) - net = peer_net; + params.net = peer_net; if (ops->newlink) - err = ops->newlink(net, dev, tb, data, extack); + err = ops->newlink(dev, ¶ms, extack); else err = register_netdevice(dev); if (err < 0) { diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c index b68f2f71d0e1..39add538ba99 100644 --- a/net/hsr/hsr_netlink.c +++ b/net/hsr/hsr_netlink.c @@ -29,10 +29,12 @@ static const struct nla_policy hsr_policy[IFLA_HSR_MAX + 1] = { /* Here, it seems a netdevice has already been allocated for us, and the * hsr_dev_setup routine has been executed. Nice! */ -static int hsr_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int hsr_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct nlattr **data = params->data; + struct net *src_net = params->net; enum hsr_version proto_version; unsigned char multicast_spec; u8 proto = HSR_PROTOCOL_HSR; diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c index 175efd860f7b..704bf9e3f097 100644 --- a/net/ieee802154/6lowpan/core.c +++ b/net/ieee802154/6lowpan/core.c @@ -129,10 +129,11 @@ static int lowpan_validate(struct nlattr *tb[], struct nlattr *data[], return 0; } -static int lowpan_newlink(struct net *src_net, struct net_device *ldev, - struct nlattr *tb[], struct nlattr *data[], +static int lowpan_newlink(struct net_device *ldev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct nlattr **tb = params->tb; struct net_device *wdev; int ret; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index c9f11a046c26..1fe9b13d351c 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1396,10 +1396,12 @@ ipgre_newlink_encap_setup(struct net_device *dev, struct nlattr *data[]) return 0; } -static int ipgre_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int ipgre_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct nlattr **data = params->data; + struct nlattr **tb = params->tb; struct ip_tunnel_parm_kern p; __u32 fwmark = 0; int err; @@ -1414,10 +1416,12 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev, return ip_tunnel_newlink(dev, tb, &p, fwmark); } -static int erspan_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int erspan_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct nlattr **data = params->data; + struct nlattr **tb = params->tb; struct ip_tunnel_parm_kern p; __u32 fwmark = 0; int err; @@ -1697,6 +1701,7 @@ static struct rtnl_link_ops erspan_link_ops __read_mostly = { struct net_device *gretap_fb_dev_create(struct net *net, const char *name, u8 name_assign_type) { + struct rtnl_newlink_params params = { .net = net }; struct nlattr *tb[IFLA_MAX + 1]; struct net_device *dev; LIST_HEAD(list_kill); @@ -1704,6 +1709,7 @@ struct net_device *gretap_fb_dev_create(struct net *net, const char *name, int err; memset(&tb, 0, sizeof(tb)); + params.tb = tb; dev = rtnl_create_link(net, name, name_assign_type, &ipgre_tap_ops, tb, NULL); @@ -1714,7 +1720,7 @@ struct net_device *gretap_fb_dev_create(struct net *net, const char *name, t = netdev_priv(dev); t->collect_md = true; - err = ipgre_newlink(net, dev, tb, NULL, NULL); + err = ipgre_newlink(dev, ¶ms, NULL); if (err < 0) { free_netdev(dev); return ERR_PTR(err); diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index f0b4419cef34..b901bee03e6d 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -575,11 +575,13 @@ static void vti_netlink_parms(struct nlattr *data[], *fwmark = nla_get_u32(data[IFLA_VTI_FWMARK]); } -static int vti_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int vti_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct nlattr **data = params->data; struct ip_tunnel_parm_kern parms; + struct nlattr **tb = params->tb; __u32 fwmark = 0; vti_netlink_parms(data, &parms, &fwmark); diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index dc0db5895e0e..a8b844bcfc64 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -436,11 +436,13 @@ static void ipip_netlink_parms(struct nlattr *data[], *fwmark = nla_get_u32(data[IFLA_IPTUN_FWMARK]); } -static int ipip_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int ipip_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); + struct nlattr **data = params->data; + struct nlattr **tb = params->tb; struct ip_tunnel_encap ipencap; struct ip_tunnel_parm_kern p; __u32 fwmark = 0; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 235808cfec70..863852abe8ea 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -2005,11 +2005,14 @@ out: return err; } -static int ip6gre_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int ip6gre_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct ip6_tnl *nt = netdev_priv(dev); + struct nlattr **data = params->data; + struct net *src_net = params->net; + struct nlattr **tb = params->tb; struct net *net = dev_net(dev); struct ip6gre_net *ign; int err; @@ -2241,11 +2244,14 @@ static void ip6erspan_tap_setup(struct net_device *dev) netif_keep_dst(dev); } -static int ip6erspan_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int ip6erspan_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct ip6_tnl *nt = netdev_priv(dev); + struct nlattr **data = params->data; + struct net *src_net = params->net; + struct nlattr **tb = params->tb; struct net *net = dev_net(dev); struct ip6gre_net *ign; int err; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 48fd53b98972..54b843d20870 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -2002,10 +2002,12 @@ static void ip6_tnl_netlink_parms(struct nlattr *data[], parms->fwmark = nla_get_u32(data[IFLA_IPTUN_FWMARK]); } -static int ip6_tnl_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int ip6_tnl_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct nlattr **data = params->data; + struct nlattr **tb = params->tb; struct net *net = dev_net(dev); struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct ip_tunnel_encap ipencap; diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 590737c27537..993f85aeb882 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -997,10 +997,11 @@ static void vti6_netlink_parms(struct nlattr *data[], parms->fwmark = nla_get_u32(data[IFLA_VTI_FWMARK]); } -static int vti6_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int vti6_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct nlattr **data = params->data; struct net *net = dev_net(dev); struct ip6_tnl *nt; diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 39bd8951bfca..e2bd52cabdee 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1550,10 +1550,12 @@ static bool ipip6_netlink_6rd_parms(struct nlattr *data[], } #endif -static int ipip6_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int ipip6_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct nlattr **data = params->data; + struct nlattr **tb = params->tb; struct net *net = dev_net(dev); struct ip_tunnel *nt; struct ip_tunnel_encap ipencap; diff --git a/net/xfrm/xfrm_interface_core.c b/net/xfrm/xfrm_interface_core.c index c397eb99d867..5659a6cadd51 100644 --- a/net/xfrm/xfrm_interface_core.c +++ b/net/xfrm/xfrm_interface_core.c @@ -814,10 +814,11 @@ static void xfrmi_netlink_parms(struct nlattr *data[], parms->collect_md = true; } -static int xfrmi_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], - struct netlink_ext_ack *extack) +static int xfrmi_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, + struct netlink_ext_ack *extack) { + struct nlattr **data = params->data; struct net *net = dev_net(dev); struct xfrm_if_parms p = {}; struct xfrm_if *xi; -- cgit v1.2.3 From cf517ac16ad96f3953d65ea198c0b310a1ffa14f Mon Sep 17 00:00:00 2001 From: Xiao Liang Date: Wed, 19 Feb 2025 20:50:29 +0800 Subject: net: Use link/peer netns in newlink() of rtnl_link_ops Add two helper functions - rtnl_newlink_link_net() and rtnl_newlink_peer_net() for netns fallback logic. Peer netns falls back to link netns, and link netns falls back to source netns. Convert the use of params->net in netdevice drivers to one of the helper functions for clarity. Signed-off-by: Xiao Liang Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250219125039.18024-4-shaw.leon@gmail.com Signed-off-by: Jakub Kicinski --- drivers/infiniband/ulp/ipoib/ipoib_netlink.c | 4 ++-- drivers/net/amt.c | 6 +++--- drivers/net/bareudp.c | 4 ++-- drivers/net/can/vxcan.c | 2 +- drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c | 4 ++-- drivers/net/geneve.c | 4 ++-- drivers/net/gtp.c | 6 +++--- drivers/net/ipvlan/ipvlan_main.c | 4 ++-- drivers/net/macsec.c | 4 ++-- drivers/net/macvlan.c | 4 ++-- drivers/net/netkit.c | 2 +- drivers/net/pfcp.c | 6 +++--- drivers/net/ppp/ppp_generic.c | 4 ++-- drivers/net/veth.c | 2 +- drivers/net/vxlan/vxlan_core.c | 4 ++-- drivers/net/wireguard/device.c | 4 ++-- drivers/net/wireless/virtual/virt_wifi.c | 4 ++-- drivers/net/wwan/wwan_core.c | 2 +- include/net/rtnetlink.h | 17 +++++++++++++++++ net/8021q/vlan_netlink.c | 4 ++-- net/hsr/hsr_netlink.c | 8 ++++---- 21 files changed, 58 insertions(+), 41 deletions(-) (limited to 'include/net') diff --git a/drivers/infiniband/ulp/ipoib/ipoib_netlink.c b/drivers/infiniband/ulp/ipoib/ipoib_netlink.c index 16cb8ced9f35..53db7c8191e3 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_netlink.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_netlink.c @@ -101,8 +101,8 @@ static int ipoib_new_child_link(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct net *link_net = rtnl_newlink_link_net(params); struct nlattr **data = params->data; - struct net *src_net = params->net; struct nlattr **tb = params->tb; struct net_device *pdev; struct ipoib_dev_priv *ppriv; @@ -112,7 +112,7 @@ static int ipoib_new_child_link(struct net_device *dev, if (!tb[IFLA_LINK]) return -EINVAL; - pdev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK])); + pdev = __dev_get_by_index(link_net, nla_get_u32(tb[IFLA_LINK])); if (!pdev || pdev->type != ARPHRD_INFINIBAND) return -ENODEV; diff --git a/drivers/net/amt.c b/drivers/net/amt.c index 96b7ec9a2c13..53899b70fae1 100644 --- a/drivers/net/amt.c +++ b/drivers/net/amt.c @@ -3165,13 +3165,13 @@ static int amt_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct net *link_net = rtnl_newlink_link_net(params); struct amt_dev *amt = netdev_priv(dev); struct nlattr **data = params->data; struct nlattr **tb = params->tb; - struct net *net = params->net; int err = -EINVAL; - amt->net = net; + amt->net = link_net; amt->mode = nla_get_u32(data[IFLA_AMT_MODE]); if (data[IFLA_AMT_MAX_TUNNELS] && @@ -3186,7 +3186,7 @@ static int amt_newlink(struct net_device *dev, amt->hash_buckets = AMT_HSIZE; amt->nr_tunnels = 0; get_random_bytes(&amt->hash_seed, sizeof(amt->hash_seed)); - amt->stream_dev = dev_get_by_index(net, + amt->stream_dev = dev_get_by_index(link_net, nla_get_u32(data[IFLA_AMT_LINK])); if (!amt->stream_dev) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_AMT_LINK], diff --git a/drivers/net/bareudp.c b/drivers/net/bareudp.c index fc21dcfb4848..d1473c5f8eef 100644 --- a/drivers/net/bareudp.c +++ b/drivers/net/bareudp.c @@ -702,9 +702,9 @@ static int bareudp_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct net *link_net = rtnl_newlink_link_net(params); struct nlattr **data = params->data; struct nlattr **tb = params->tb; - struct net *net = params->net; struct bareudp_conf conf; int err; @@ -712,7 +712,7 @@ static int bareudp_newlink(struct net_device *dev, if (err) return err; - err = bareudp_configure(net, dev, &conf, extack); + err = bareudp_configure(link_net, dev, &conf, extack); if (err) return err; diff --git a/drivers/net/can/vxcan.c b/drivers/net/can/vxcan.c index 6f8ebb1cfd7b..99a78a757167 100644 --- a/drivers/net/can/vxcan.c +++ b/drivers/net/can/vxcan.c @@ -176,8 +176,8 @@ static int vxcan_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct net *peer_net = rtnl_newlink_peer_net(params); struct nlattr **data = params->data; - struct net *peer_net = params->net; struct nlattr **tb = params->tb; struct vxcan_priv *priv; struct net_device *peer; diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c index 8151e91395e2..ba8763cac9d9 100644 --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c @@ -121,9 +121,9 @@ static int rmnet_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct net *link_net = rtnl_newlink_link_net(params); u32 data_format = RMNET_FLAGS_INGRESS_DEAGGREGATION; struct nlattr **data = params->data; - struct net *src_net = params->net; struct nlattr **tb = params->tb; struct net_device *real_dev; int mode = RMNET_EPMODE_VND; @@ -137,7 +137,7 @@ static int rmnet_newlink(struct net_device *dev, return -EINVAL; } - real_dev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK])); + real_dev = __dev_get_by_index(link_net, nla_get_u32(tb[IFLA_LINK])); if (!real_dev) { NL_SET_ERR_MSG_MOD(extack, "link does not exist"); return -ENODEV; diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index d6e8b2521052..fc62b25e0362 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -1618,9 +1618,9 @@ static int geneve_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct net *link_net = rtnl_newlink_link_net(params); struct nlattr **data = params->data; struct nlattr **tb = params->tb; - struct net *net = params->net; struct geneve_config cfg = { .df = GENEVE_DF_UNSET, .use_udp6_rx_checksums = false, @@ -1634,7 +1634,7 @@ static int geneve_newlink(struct net_device *dev, if (err) return err; - err = geneve_configure(net, dev, extack, &cfg); + err = geneve_configure(link_net, dev, extack, &cfg); if (err) return err; diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index 762514af8b30..ef793607890d 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -1466,8 +1466,8 @@ static int gtp_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct net *link_net = rtnl_newlink_link_net(params); struct nlattr **data = params->data; - struct net *src_net = params->net; unsigned int role = GTP_ROLE_GGSN; struct gtp_dev *gtp; struct gtp_net *gn; @@ -1498,7 +1498,7 @@ static int gtp_newlink(struct net_device *dev, gtp->restart_count = nla_get_u8_default(data[IFLA_GTP_RESTART_COUNT], 0); - gtp->net = src_net; + gtp->net = link_net; err = gtp_hashtable_new(gtp, hashsize); if (err < 0) @@ -1528,7 +1528,7 @@ static int gtp_newlink(struct net_device *dev, goto out_encap; } - gn = net_generic(src_net, gtp_net_id); + gn = net_generic(link_net, gtp_net_id); list_add(>p->list, &gn->gtp_dev_list); dev->priv_destructor = gtp_destructor; diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index 19ce19ca7e32..b56144ca2fde 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -535,9 +535,9 @@ err: int ipvlan_link_new(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct net *link_net = rtnl_newlink_link_net(params); struct ipvl_dev *ipvlan = netdev_priv(dev); struct nlattr **data = params->data; - struct net *src_net = params->net; struct nlattr **tb = params->tb; struct ipvl_port *port; struct net_device *phy_dev; @@ -547,7 +547,7 @@ int ipvlan_link_new(struct net_device *dev, struct rtnl_newlink_params *params, if (!tb[IFLA_LINK]) return -EINVAL; - phy_dev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK])); + phy_dev = __dev_get_by_index(link_net, nla_get_u32(tb[IFLA_LINK])); if (!phy_dev) return -ENODEV; diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index 1869b0513f57..4de5d63fd577 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -4145,10 +4145,10 @@ static int macsec_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct net *link_net = rtnl_newlink_link_net(params); struct macsec_dev *macsec = macsec_priv(dev); struct nlattr **data = params->data; struct nlattr **tb = params->tb; - struct net *net = params->net; rx_handler_func_t *rx_handler; u8 icv_len = MACSEC_DEFAULT_ICV_LEN; struct net_device *real_dev; @@ -4157,7 +4157,7 @@ static int macsec_newlink(struct net_device *dev, if (!tb[IFLA_LINK]) return -EINVAL; - real_dev = __dev_get_by_index(net, nla_get_u32(tb[IFLA_LINK])); + real_dev = __dev_get_by_index(link_net, nla_get_u32(tb[IFLA_LINK])); if (!real_dev) return -ENODEV; if (real_dev->type != ARPHRD_ETHER) diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index f903b414eaeb..4e9d54be887c 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -1444,9 +1444,9 @@ int macvlan_common_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct net *link_net = rtnl_newlink_link_net(params); struct macvlan_dev *vlan = netdev_priv(dev); struct nlattr **data = params->data; - struct net *src_net = params->net; struct nlattr **tb = params->tb; struct net_device *lowerdev; struct macvlan_port *port; @@ -1457,7 +1457,7 @@ int macvlan_common_newlink(struct net_device *dev, if (!tb[IFLA_LINK]) return -EINVAL; - lowerdev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK])); + lowerdev = __dev_get_by_index(link_net, nla_get_u32(tb[IFLA_LINK])); if (lowerdev == NULL) return -ENODEV; diff --git a/drivers/net/netkit.c b/drivers/net/netkit.c index 640a2dbbbd28..751347392570 100644 --- a/drivers/net/netkit.c +++ b/drivers/net/netkit.c @@ -331,13 +331,13 @@ static int netkit_new_link(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct net *peer_net = rtnl_newlink_peer_net(params); enum netkit_scrub scrub_prim = NETKIT_SCRUB_DEFAULT; enum netkit_scrub scrub_peer = NETKIT_SCRUB_DEFAULT; struct nlattr *peer_tb[IFLA_MAX + 1], **tbp, *attr; enum netkit_action policy_prim = NETKIT_PASS; enum netkit_action policy_peer = NETKIT_PASS; struct nlattr **data = params->data; - struct net *peer_net = params->net; enum netkit_mode mode = NETKIT_L3; unsigned char ifname_assign_type; struct nlattr **tb = params->tb; diff --git a/drivers/net/pfcp.c b/drivers/net/pfcp.c index 7b0575940e1d..f873a92d2445 100644 --- a/drivers/net/pfcp.c +++ b/drivers/net/pfcp.c @@ -188,12 +188,12 @@ static int pfcp_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct net *link_net = rtnl_newlink_link_net(params); struct pfcp_dev *pfcp = netdev_priv(dev); - struct net *net = params->net; struct pfcp_net *pn; int err; - pfcp->net = net; + pfcp->net = link_net; err = pfcp_add_sock(pfcp); if (err) { @@ -207,7 +207,7 @@ static int pfcp_newlink(struct net_device *dev, goto exit_del_pfcp_sock; } - pn = net_generic(net, pfcp_net_id); + pn = net_generic(link_net, pfcp_net_id); list_add(&pfcp->list, &pn->pfcp_dev_list); netdev_dbg(dev, "registered new PFCP interface\n"); diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index b3340f8a6149..6220866258fc 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -1307,8 +1307,8 @@ static int ppp_nl_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct net *link_net = rtnl_newlink_link_net(params); struct nlattr **data = params->data; - struct net *src_net = params->net; struct nlattr **tb = params->tb; struct ppp_config conf = { .unit = -1, @@ -1346,7 +1346,7 @@ static int ppp_nl_newlink(struct net_device *dev, if (!tb[IFLA_IFNAME] || !nla_len(tb[IFLA_IFNAME]) || !*(char *)nla_data(tb[IFLA_IFNAME])) conf.ifname_is_set = false; - err = ppp_dev_configure(src_net, dev, &conf); + err = ppp_dev_configure(link_net, dev, &conf); out_unlock: mutex_unlock(&ppp_mutex); diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 7dfda89f072f..ba3ae2d8092f 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -1769,8 +1769,8 @@ static int veth_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct net *peer_net = rtnl_newlink_peer_net(params); struct nlattr **data = params->data; - struct net *peer_net = params->net; struct nlattr **tb = params->tb; int err; struct net_device *peer; diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 1a1d03abb6b9..227d7f5a302a 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -4404,8 +4404,8 @@ static int vxlan_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct net *link_net = rtnl_newlink_link_net(params); struct nlattr **data = params->data; - struct net *src_net = params->net; struct nlattr **tb = params->tb; struct vxlan_config conf; int err; @@ -4414,7 +4414,7 @@ static int vxlan_newlink(struct net_device *dev, if (err) return err; - return __vxlan_dev_create(src_net, dev, &conf, extack); + return __vxlan_dev_create(link_net, dev, &conf, extack); } static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[], diff --git a/drivers/net/wireguard/device.c b/drivers/net/wireguard/device.c index 404cf05bd72b..c496d35b266d 100644 --- a/drivers/net/wireguard/device.c +++ b/drivers/net/wireguard/device.c @@ -311,11 +311,11 @@ static int wg_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct net *link_net = rtnl_newlink_link_net(params); struct wg_device *wg = netdev_priv(dev); - struct net *src_net = params->net; int ret = -ENOMEM; - rcu_assign_pointer(wg->creating_net, src_net); + rcu_assign_pointer(wg->creating_net, link_net); init_rwsem(&wg->static_identity.lock); mutex_init(&wg->socket_update_lock); mutex_init(&wg->device_update_lock); diff --git a/drivers/net/wireless/virtual/virt_wifi.c b/drivers/net/wireless/virtual/virt_wifi.c index 26905b2b3ba3..f9d11a023313 100644 --- a/drivers/net/wireless/virtual/virt_wifi.c +++ b/drivers/net/wireless/virtual/virt_wifi.c @@ -524,7 +524,7 @@ static int virt_wifi_newlink(struct net_device *dev, struct netlink_ext_ack *extack) { struct virt_wifi_netdev_priv *priv = netdev_priv(dev); - struct net *src_net = params->net; + struct net *link_net = rtnl_newlink_link_net(params); struct nlattr **tb = params->tb; int err; @@ -534,7 +534,7 @@ static int virt_wifi_newlink(struct net_device *dev, netif_carrier_off(dev); priv->upperdev = dev; - priv->lowerdev = __dev_get_by_index(src_net, + priv->lowerdev = __dev_get_by_index(link_net, nla_get_u32(tb[IFLA_LINK])); if (!priv->lowerdev) diff --git a/drivers/net/wwan/wwan_core.c b/drivers/net/wwan/wwan_core.c index a05c49b4e7f8..63a47d420bc5 100644 --- a/drivers/net/wwan/wwan_core.c +++ b/drivers/net/wwan/wwan_core.c @@ -1065,7 +1065,7 @@ static void wwan_create_default_link(struct wwan_device *wwandev, struct nlattr *tb[IFLA_MAX + 1], *linkinfo[IFLA_INFO_MAX + 1]; struct nlattr *data[IFLA_WWAN_MAX + 1]; struct rtnl_newlink_params params = { - .net = &init_net, + .src_net = &init_net, .tb = tb, .data = data, }; diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index 563a6a27436c..b22a106621fb 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -88,6 +88,23 @@ struct rtnl_newlink_params { struct nlattr **data; }; +/* Get effective link netns from newlink params. Generally, this is link_net + * and falls back to src_net. But for compatibility, a driver may * choose to + * use dev_net(dev) instead. + */ +static inline struct net *rtnl_newlink_link_net(struct rtnl_newlink_params *p) +{ + return p->link_net ? : p->src_net; +} + +/* Get peer netns from newlink params. Fallback to link netns if peer netns is + * not specified explicitly. + */ +static inline struct net *rtnl_newlink_peer_net(struct rtnl_newlink_params *p) +{ + return p->peer_net ? : rtnl_newlink_link_net(p); +} + /** * struct rtnl_link_ops - rtnetlink link operations * diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c index 91df0f96e32a..a000b1ef0520 100644 --- a/net/8021q/vlan_netlink.c +++ b/net/8021q/vlan_netlink.c @@ -139,9 +139,9 @@ static int vlan_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct net *link_net = rtnl_newlink_link_net(params); struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct nlattr **data = params->data; - struct net *src_net = params->net; struct nlattr **tb = params->tb; struct net_device *real_dev; unsigned int max_mtu; @@ -158,7 +158,7 @@ static int vlan_newlink(struct net_device *dev, return -EINVAL; } - real_dev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK])); + real_dev = __dev_get_by_index(link_net, nla_get_u32(tb[IFLA_LINK])); if (!real_dev) { NL_SET_ERR_MSG_MOD(extack, "link does not exist"); return -ENODEV; diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c index 39add538ba99..b120470246cc 100644 --- a/net/hsr/hsr_netlink.c +++ b/net/hsr/hsr_netlink.c @@ -33,8 +33,8 @@ static int hsr_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct net *link_net = rtnl_newlink_link_net(params); struct nlattr **data = params->data; - struct net *src_net = params->net; enum hsr_version proto_version; unsigned char multicast_spec; u8 proto = HSR_PROTOCOL_HSR; @@ -48,7 +48,7 @@ static int hsr_newlink(struct net_device *dev, NL_SET_ERR_MSG_MOD(extack, "Slave1 device not specified"); return -EINVAL; } - link[0] = __dev_get_by_index(src_net, + link[0] = __dev_get_by_index(link_net, nla_get_u32(data[IFLA_HSR_SLAVE1])); if (!link[0]) { NL_SET_ERR_MSG_MOD(extack, "Slave1 does not exist"); @@ -58,7 +58,7 @@ static int hsr_newlink(struct net_device *dev, NL_SET_ERR_MSG_MOD(extack, "Slave2 device not specified"); return -EINVAL; } - link[1] = __dev_get_by_index(src_net, + link[1] = __dev_get_by_index(link_net, nla_get_u32(data[IFLA_HSR_SLAVE2])); if (!link[1]) { NL_SET_ERR_MSG_MOD(extack, "Slave2 does not exist"); @@ -71,7 +71,7 @@ static int hsr_newlink(struct net_device *dev, } if (data[IFLA_HSR_INTERLINK]) - interlink = __dev_get_by_index(src_net, + interlink = __dev_get_by_index(link_net, nla_get_u32(data[IFLA_HSR_INTERLINK])); if (interlink && interlink == link[0]) { -- cgit v1.2.3 From eacb1160536e097ba1eb910a1b212d1032d2e9c6 Mon Sep 17 00:00:00 2001 From: Xiao Liang Date: Wed, 19 Feb 2025 20:50:32 +0800 Subject: net: ip_tunnel: Use link netns in newlink() of rtnl_link_ops When link_net is set, use it as link netns instead of dev_net(). This prepares for rtnetlink core to create device in target netns directly, in which case the two namespaces may be different. Convert common ip_tunnel_newlink() to accept an extra link netns argument. Signed-off-by: Xiao Liang Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250219125039.18024-7-shaw.leon@gmail.com Signed-off-by: Jakub Kicinski --- include/net/ip_tunnels.h | 5 +++-- net/ipv4/ip_gre.c | 8 +++++--- net/ipv4/ip_tunnel.c | 6 +++--- net/ipv4/ip_vti.c | 3 ++- net/ipv4/ipip.c | 3 ++- 5 files changed, 15 insertions(+), 10 deletions(-) (limited to 'include/net') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index e041e4865373..a36a335cef9f 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -407,8 +407,9 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, bool log_ecn_error); int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm_kern *p, __u32 fwmark); -int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], - struct ip_tunnel_parm_kern *p, __u32 fwmark); +int ip_tunnel_newlink(struct net *net, struct net_device *dev, + struct nlattr *tb[], struct ip_tunnel_parm_kern *p, + __u32 fwmark); void ip_tunnel_setup(struct net_device *dev, unsigned int net_id); bool ip_tunnel_netlink_encap_parms(struct nlattr *data[], diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 1fe9b13d351c..26d15f907551 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1413,7 +1413,8 @@ static int ipgre_newlink(struct net_device *dev, err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark); if (err < 0) return err; - return ip_tunnel_newlink(dev, tb, &p, fwmark); + return ip_tunnel_newlink(params->link_net ? : dev_net(dev), dev, tb, &p, + fwmark); } static int erspan_newlink(struct net_device *dev, @@ -1433,7 +1434,8 @@ static int erspan_newlink(struct net_device *dev, err = erspan_netlink_parms(dev, data, tb, &p, &fwmark); if (err) return err; - return ip_tunnel_newlink(dev, tb, &p, fwmark); + return ip_tunnel_newlink(params->link_net ? : dev_net(dev), dev, tb, &p, + fwmark); } static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], @@ -1701,7 +1703,7 @@ static struct rtnl_link_ops erspan_link_ops __read_mostly = { struct net_device *gretap_fb_dev_create(struct net *net, const char *name, u8 name_assign_type) { - struct rtnl_newlink_params params = { .net = net }; + struct rtnl_newlink_params params = { .src_net = net }; struct nlattr *tb[IFLA_MAX + 1]; struct net_device *dev; LIST_HEAD(list_kill); diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index dd4b4e5be998..011f2a5aab3b 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -1213,11 +1213,11 @@ void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id, } EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets); -int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], - struct ip_tunnel_parm_kern *p, __u32 fwmark) +int ip_tunnel_newlink(struct net *net, struct net_device *dev, + struct nlattr *tb[], struct ip_tunnel_parm_kern *p, + __u32 fwmark) { struct ip_tunnel *nt; - struct net *net = dev_net(dev); struct ip_tunnel_net *itn; int mtu; int err; diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index b901bee03e6d..159b4473290e 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -585,7 +585,8 @@ static int vti_newlink(struct net_device *dev, __u32 fwmark = 0; vti_netlink_parms(data, &parms, &fwmark); - return ip_tunnel_newlink(dev, tb, &parms, fwmark); + return ip_tunnel_newlink(params->link_net ? : dev_net(dev), dev, tb, + &parms, fwmark); } static int vti_changelink(struct net_device *dev, struct nlattr *tb[], diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index a8b844bcfc64..bab0bf90c908 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -455,7 +455,8 @@ static int ipip_newlink(struct net_device *dev, } ipip_netlink_parms(data, &p, &t->collect_md, &fwmark); - return ip_tunnel_newlink(dev, tb, &p, fwmark); + return ip_tunnel_newlink(params->link_net ? : dev_net(dev), dev, tb, &p, + fwmark); } static int ipip_changelink(struct net_device *dev, struct nlattr *tb[], -- cgit v1.2.3 From 9c0fc091dc01894eaa686bc051feb4367b0d033a Mon Sep 17 00:00:00 2001 From: Xiao Liang Date: Wed, 19 Feb 2025 20:50:36 +0800 Subject: rtnetlink: Remove "net" from newlink params Now that devices have been converted to use the specific netns instead of ambiguous "net", let's remove it from newlink parameters. Signed-off-by: Xiao Liang Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250219125039.18024-11-shaw.leon@gmail.com Signed-off-by: Jakub Kicinski --- include/net/rtnetlink.h | 2 -- net/core/rtnetlink.c | 6 ------ 2 files changed, 8 deletions(-) (limited to 'include/net') diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index b22a106621fb..ec65a8cebb99 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -72,7 +72,6 @@ static inline int rtnl_msg_family(const struct nlmsghdr *nlh) /** * struct rtnl_newlink_params - parameters of rtnl_link_ops::newlink() * - * @net: Netns of interest * @src_net: Source netns of rtnetlink socket * @link_net: Link netns by IFLA_LINK_NETNSID, NULL if not specified * @peer_net: Peer netns @@ -80,7 +79,6 @@ static inline int rtnl_msg_family(const struct nlmsghdr *nlh) * @data: IFLA_INFO_DATA attributes */ struct rtnl_newlink_params { - struct net *net; struct net *src_net; struct net *link_net; struct net *peer_net; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 9ebbde0d131c..e980481bdd28 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3798,12 +3798,6 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, dev->ifindex = ifm->ifi_index; - params.net = params.src_net; - if (link_net) - params.net = link_net; - if (peer_net) - params.net = peer_net; - if (ops->newlink) err = ops->newlink(dev, ¶ms, extack); else -- cgit v1.2.3 From 9771a96a7a35daa2220cc4f170b840b34af28b2c Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 21 Feb 2025 16:44:01 +0100 Subject: mptcp: sched: split get_subflow interface into two get_retrans() interface of the burst packet scheduler invokes a sleeping function mptcp_pm_subflow_chk_stale(), which calls __lock_sock_fast(). So get_retrans() interface should be set with BPF_F_SLEEPABLE flag in BPF. But get_send() interface of this scheduler can't be set with BPF_F_SLEEPABLE flag since it's invoked in ack_update_msk() under mptcp data lock. So this patch has to split get_subflow() interface of packet scheduer into two interfaces: get_send() and get_retrans(). Then we can set get_retrans() interface alone with BPF_F_SLEEPABLE flag. Signed-off-by: Geliang Tang Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250221-net-next-mptcp-pm-misc-cleanup-3-v1-8-2b70ab1cee79@kernel.org Signed-off-by: Jakub Kicinski --- include/net/mptcp.h | 5 +++-- net/mptcp/sched.c | 35 ++++++++++++++++++++++++----------- 2 files changed, 27 insertions(+), 13 deletions(-) (limited to 'include/net') diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 814b5f2e3ed5..2c85ca92bb1c 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -103,13 +103,14 @@ struct mptcp_out_options { #define MPTCP_SUBFLOWS_MAX 8 struct mptcp_sched_data { - bool reinject; u8 subflows; struct mptcp_subflow_context *contexts[MPTCP_SUBFLOWS_MAX]; }; struct mptcp_sched_ops { - int (*get_subflow)(struct mptcp_sock *msk, + int (*get_send)(struct mptcp_sock *msk, + struct mptcp_sched_data *data); + int (*get_retrans)(struct mptcp_sock *msk, struct mptcp_sched_data *data); char name[MPTCP_SCHED_NAME_MAX]; diff --git a/net/mptcp/sched.c b/net/mptcp/sched.c index df7dbcfa3b71..94dc4b3ad82f 100644 --- a/net/mptcp/sched.c +++ b/net/mptcp/sched.c @@ -16,13 +16,25 @@ static DEFINE_SPINLOCK(mptcp_sched_list_lock); static LIST_HEAD(mptcp_sched_list); -static int mptcp_sched_default_get_subflow(struct mptcp_sock *msk, +static int mptcp_sched_default_get_send(struct mptcp_sock *msk, + struct mptcp_sched_data *data) +{ + struct sock *ssk; + + ssk = mptcp_subflow_get_send(msk); + if (!ssk) + return -EINVAL; + + mptcp_subflow_set_scheduled(mptcp_subflow_ctx(ssk), true); + return 0; +} + +static int mptcp_sched_default_get_retrans(struct mptcp_sock *msk, struct mptcp_sched_data *data) { struct sock *ssk; - ssk = data->reinject ? mptcp_subflow_get_retrans(msk) : - mptcp_subflow_get_send(msk); + ssk = mptcp_subflow_get_retrans(msk); if (!ssk) return -EINVAL; @@ -31,7 +43,8 @@ static int mptcp_sched_default_get_subflow(struct mptcp_sock *msk, } static struct mptcp_sched_ops mptcp_sched_default = { - .get_subflow = mptcp_sched_default_get_subflow, + .get_send = mptcp_sched_default_get_send, + .get_retrans = mptcp_sched_default_get_retrans, .name = "default", .owner = THIS_MODULE, }; @@ -73,7 +86,7 @@ void mptcp_get_available_schedulers(char *buf, size_t maxlen) int mptcp_register_scheduler(struct mptcp_sched_ops *sched) { - if (!sched->get_subflow) + if (!sched->get_send) return -EINVAL; spin_lock(&mptcp_sched_list_lock); @@ -164,10 +177,9 @@ int mptcp_sched_get_send(struct mptcp_sock *msk) return 0; } - data.reinject = false; if (msk->sched == &mptcp_sched_default || !msk->sched) - return mptcp_sched_default_get_subflow(msk, &data); - return msk->sched->get_subflow(msk, &data); + return mptcp_sched_default_get_send(msk, &data); + return msk->sched->get_send(msk, &data); } int mptcp_sched_get_retrans(struct mptcp_sock *msk) @@ -186,8 +198,9 @@ int mptcp_sched_get_retrans(struct mptcp_sock *msk) return 0; } - data.reinject = true; if (msk->sched == &mptcp_sched_default || !msk->sched) - return mptcp_sched_default_get_subflow(msk, &data); - return msk->sched->get_subflow(msk, &data); + return mptcp_sched_default_get_retrans(msk, &data); + if (msk->sched->get_retrans) + return msk->sched->get_retrans(msk, &data); + return msk->sched->get_send(msk, &data); } -- cgit v1.2.3 From ebba23e0779834e68a200b7968975274cbe260ef Mon Sep 17 00:00:00 2001 From: Miri Korenblit Date: Wed, 12 Feb 2025 08:22:57 +0200 Subject: wifi: mac80211: add ieee80211_iter_chan_contexts_mtx Add a chanctx iterator that can be called from a wiphy-locked context. Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250212082137.d85eef3024de.Icda0616416c5fd4b2cbf892bdab2476f26e644ec@changeid [fix kernel-doc] Signed-off-by: Johannes Berg --- include/net/mac80211.h | 25 +++++++++++++++++++++++++ net/mac80211/chan.c | 20 +++++++++++++++++++- 2 files changed, 44 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 22d32419e8a0..c498f685d01f 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -6671,6 +6671,31 @@ void ieee80211_iter_chan_contexts_atomic( void *data), void *iter_data); +/** + * ieee80211_iter_chan_contexts_mtx - iterate channel contexts + * @hw: pointer obtained from ieee80211_alloc_hw(). + * @iter: iterator function + * @iter_data: data passed to iterator function + * + * Iterate all active channel contexts. This function can only be used while + * holding the wiphy mutex. + * + * The iterator will not find a context that's being added (during + * the driver callback to add it) but will find it while it's being + * removed. + * + * Note that during hardware restart, all contexts that existed + * before the restart are considered already present so will be + * found while iterating, whether they've been re-added already + * or not. + */ +void ieee80211_iter_chan_contexts_mtx( + struct ieee80211_hw *hw, + void (*iter)(struct ieee80211_hw *hw, + struct ieee80211_chanctx_conf *chanctx_conf, + void *data), + void *iter_data); + /** * ieee80211_ap_probereq_get - retrieve a Probe Request template * @hw: pointer obtained from ieee80211_alloc_hw(). diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index dc28f2b0957a..c3bfac58151f 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* * mac80211 - channel management - * Copyright 2020 - 2024 Intel Corporation + * Copyright 2020 - 2025 Intel Corporation */ #include @@ -2178,3 +2178,21 @@ void ieee80211_iter_chan_contexts_atomic( rcu_read_unlock(); } EXPORT_SYMBOL_GPL(ieee80211_iter_chan_contexts_atomic); + +void ieee80211_iter_chan_contexts_mtx( + struct ieee80211_hw *hw, + void (*iter)(struct ieee80211_hw *hw, + struct ieee80211_chanctx_conf *chanctx_conf, + void *data), + void *iter_data) +{ + struct ieee80211_local *local = hw_to_local(hw); + struct ieee80211_chanctx *ctx; + + lockdep_assert_wiphy(hw->wiphy); + + list_for_each_entry(ctx, &local->chanctx_list, list) + if (ctx->driver_present) + iter(hw, &ctx->conf, iter_data); +} +EXPORT_SYMBOL_GPL(ieee80211_iter_chan_contexts_mtx); -- cgit v1.2.3 From ceaad3c435964f601602531d0f6f7deaff329e21 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Wed, 12 Feb 2025 08:22:58 +0200 Subject: wifi: cfg80211: expose update timestamp to drivers This information is exposed to userspace but not drivers. Make this field public so that drivers are also able to access it. The information is for example useful for link selection to determine whether the BSS corresponding to an MLO link has been seen in a recent scan. Signed-off-by: Benjamin Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250212082137.b682ee7aebc8.I0f7cca9effa2b1cee79f4f2eb8b549c99b4e0571@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 3 +++ net/wireless/core.h | 1 - net/wireless/nl80211.c | 4 ++-- net/wireless/scan.c | 6 +++--- 4 files changed, 8 insertions(+), 6 deletions(-) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index a72e7eb7027f..3580f3f5f44e 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2947,6 +2947,7 @@ struct cfg80211_bss_ies { * @nontrans_list: list of non-transmitted BSS, if this is a transmitted one * (multi-BSSID support) * @signal: signal strength value (type depends on the wiphy's signal_type) + * @ts_boottime: timestamp of the last BSS update in nanoseconds since boot * @chains: bitmask for filled values in @chain_signal. * @chain_signal: per-chain signal strength of last received BSS in dBm. * @bssid_index: index in the multiple BSS set @@ -2971,6 +2972,8 @@ struct cfg80211_bss { s32 signal; + u64 ts_boottime; + u16 beacon_interval; u16 capability; diff --git a/net/wireless/core.h b/net/wireless/core.h index 826299f3d781..b094b526b05d 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -180,7 +180,6 @@ struct cfg80211_internal_bss { struct list_head list; struct list_head hidden_list; struct rb_node rbn; - u64 ts_boottime; unsigned long ts; unsigned long refcount; atomic_t hold; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 8bd09110d393..79b3f34e72e2 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -10519,9 +10519,9 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb, intbss->parent_bssid))) goto nla_put_failure; - if (intbss->ts_boottime && + if (res->ts_boottime && nla_put_u64_64bit(msg, NL80211_BSS_LAST_SEEN_BOOTTIME, - intbss->ts_boottime, NL80211_BSS_PAD)) + res->ts_boottime, NL80211_BSS_PAD)) goto nla_put_failure; if (!nl80211_put_signal(msg, intbss->pub.chains, diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 1de25e9763cb..9865f305275d 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -5,7 +5,7 @@ * Copyright 2008 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2016 Intel Deutschland GmbH - * Copyright (C) 2018-2024 Intel Corporation + * Copyright (C) 2018-2025 Intel Corporation */ #include #include @@ -1934,7 +1934,7 @@ cfg80211_update_known_bss(struct cfg80211_registered_device *rdev, known->pub.signal = new->pub.signal; known->pub.capability = new->pub.capability; known->ts = new->ts; - known->ts_boottime = new->ts_boottime; + known->pub.ts_boottime = new->pub.ts_boottime; known->parent_tsf = new->parent_tsf; known->pub.chains = new->pub.chains; memcpy(known->pub.chain_signal, new->pub.chain_signal, @@ -2291,7 +2291,7 @@ cfg80211_inform_single_bss_data(struct wiphy *wiphy, tmp.pub.signal = 0; tmp.pub.beacon_interval = data->beacon_interval; tmp.pub.capability = data->capability; - tmp.ts_boottime = drv_data->boottime_ns; + tmp.pub.ts_boottime = drv_data->boottime_ns; tmp.parent_tsf = drv_data->parent_tsf; ether_addr_copy(tmp.parent_bssid, drv_data->parent_bssid); tmp.pub.chains = drv_data->chains; -- cgit v1.2.3 From 291515c7640962f8865e4c54897a5e91526b450c Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Tue, 25 Feb 2025 18:17:43 +0100 Subject: net: gro: decouple GRO from the NAPI layer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In fact, these two are not tied closely to each other. The only requirements to GRO are to use it in the BH context and have some sane limits on the packet batches, e.g. NAPI has a limit of its budget (64/8/etc.). Move purely GRO fields into a new structure, &gro_node. Embed it into &napi_struct and adjust all the references. gro_node::cached_napi_id is effectively the same as napi_struct::napi_id, but to be used on GRO hotpath to mark skbs. napi_struct::napi_id is now a fully control path field. Three Ethernet drivers use napi_gro_flush() not really meant to be exported, so move it to and add that include there. napi_gro_receive() is used in more than 100 drivers, keep it in . This does not make GRO ready to use outside of the NAPI context yet. Tested-by: Daniel Xu Acked-by: Jakub Kicinski Reviewed-by: Toke Høiland-Jørgensen Signed-off-by: Alexander Lobakin Signed-off-by: Paolo Abeni --- drivers/net/ethernet/brocade/bna/bnad.c | 1 + drivers/net/ethernet/cortina/gemini.c | 1 + drivers/net/wwan/t7xx/t7xx_hif_dpmaif_rx.c | 1 + include/linux/netdevice.h | 37 ++++++++++++---- include/net/busy_poll.h | 12 ++++-- include/net/gro.h | 35 ++++++++++----- net/core/dev.c | 66 ++++++++++++++-------------- net/core/gro.c | 69 +++++++++++++++--------------- 8 files changed, 130 insertions(+), 92 deletions(-) (limited to 'include/net') diff --git a/drivers/net/ethernet/brocade/bna/bnad.c b/drivers/net/ethernet/brocade/bna/bnad.c index ece6f3b48327..3b9107003b00 100644 --- a/drivers/net/ethernet/brocade/bna/bnad.c +++ b/drivers/net/ethernet/brocade/bna/bnad.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "bnad.h" #include "bna.h" diff --git a/drivers/net/ethernet/cortina/gemini.c b/drivers/net/ethernet/cortina/gemini.c index 991e3839858b..1f8067bdd61a 100644 --- a/drivers/net/ethernet/cortina/gemini.c +++ b/drivers/net/ethernet/cortina/gemini.c @@ -40,6 +40,7 @@ #include #include #include +#include #include "gemini.h" diff --git a/drivers/net/wwan/t7xx/t7xx_hif_dpmaif_rx.c b/drivers/net/wwan/t7xx/t7xx_hif_dpmaif_rx.c index 7a9c09cd4fdc..6a7a26085fc7 100644 --- a/drivers/net/wwan/t7xx/t7xx_hif_dpmaif_rx.c +++ b/drivers/net/wwan/t7xx/t7xx_hif_dpmaif_rx.c @@ -41,6 +41,7 @@ #include #include #include +#include #include "t7xx_dpmaif.h" #include "t7xx_hif_dpmaif.h" diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2094d3edda73..26a0c4e4d963 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -340,11 +340,27 @@ struct gro_list { }; /* - * size of gro hash buckets, must less than bit number of - * napi_struct::gro_bitmask + * size of gro hash buckets, must be <= the number of bits in + * gro_node::bitmask */ #define GRO_HASH_BUCKETS 8 +/** + * struct gro_node - structure to support Generic Receive Offload + * @bitmask: bitmask to indicate used buckets in @hash + * @hash: hashtable of pending aggregated skbs, separated by flows + * @rx_list: list of pending ``GRO_NORMAL`` skbs + * @rx_count: cached current length of @rx_list + * @cached_napi_id: napi_struct::napi_id cached for hotpath, 0 for standalone + */ +struct gro_node { + unsigned long bitmask; + struct gro_list hash[GRO_HASH_BUCKETS]; + struct list_head rx_list; + u32 rx_count; + u32 cached_napi_id; +}; + /* * Structure for per-NAPI config */ @@ -371,7 +387,6 @@ struct napi_struct { unsigned long state; int weight; u32 defer_hard_irqs_count; - unsigned long gro_bitmask; int (*poll)(struct napi_struct *, int); #ifdef CONFIG_NETPOLL /* CPU actively polling if netpoll is configured */ @@ -380,11 +395,8 @@ struct napi_struct { /* CPU on which NAPI has been scheduled for processing */ int list_owner; struct net_device *dev; - struct gro_list gro_hash[GRO_HASH_BUCKETS]; struct sk_buff *skb; - struct list_head rx_list; /* Pending GRO_NORMAL skbs */ - int rx_count; /* length of rx_list */ - unsigned int napi_id; /* protected by netdev_lock */ + struct gro_node gro; struct hrtimer timer; /* all fields past this point are write-protected by netdev_lock */ struct task_struct *thread; @@ -392,6 +404,7 @@ struct napi_struct { unsigned long irq_suspend_timeout; u32 defer_hard_irqs; /* control-path-only fields follow */ + u32 napi_id; struct list_head dev_list; struct hlist_node napi_hash_node; int irq; @@ -4131,8 +4144,14 @@ int netif_receive_skb(struct sk_buff *skb); int netif_receive_skb_core(struct sk_buff *skb); void netif_receive_skb_list_internal(struct list_head *head); void netif_receive_skb_list(struct list_head *head); -gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb); -void napi_gro_flush(struct napi_struct *napi, bool flush_old); +gro_result_t gro_receive_skb(struct gro_node *gro, struct sk_buff *skb); + +static inline gro_result_t napi_gro_receive(struct napi_struct *napi, + struct sk_buff *skb) +{ + return gro_receive_skb(&napi->gro, skb); +} + struct sk_buff *napi_get_frags(struct napi_struct *napi); gro_result_t napi_gro_frags(struct napi_struct *napi); diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index cab6146a510a..6e172d0f6ef5 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -127,18 +127,24 @@ static inline void sk_busy_loop(struct sock *sk, int nonblock) } /* used in the NIC receive handler to mark the skb */ -static inline void skb_mark_napi_id(struct sk_buff *skb, - struct napi_struct *napi) +static inline void __skb_mark_napi_id(struct sk_buff *skb, + const struct gro_node *gro) { #ifdef CONFIG_NET_RX_BUSY_POLL /* If the skb was already marked with a valid NAPI ID, avoid overwriting * it. */ if (!napi_id_valid(skb->napi_id)) - skb->napi_id = napi->napi_id; + skb->napi_id = gro->cached_napi_id; #endif } +static inline void skb_mark_napi_id(struct sk_buff *skb, + const struct napi_struct *napi) +{ + __skb_mark_napi_id(skb, &napi->gro); +} + /* used in the protocol handler to propagate the napi_id to the socket */ static inline void sk_mark_napi_id(struct sock *sk, const struct sk_buff *skb) { diff --git a/include/net/gro.h b/include/net/gro.h index 7b548f91754b..38d70c69ff80 100644 --- a/include/net/gro.h +++ b/include/net/gro.h @@ -509,26 +509,41 @@ static inline int gro_receive_network_flush(const void *th, const void *th2, int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb); int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb); +void __gro_flush(struct gro_node *gro, bool flush_old); + +static inline void gro_flush(struct gro_node *gro, bool flush_old) +{ + if (!gro->bitmask) + return; + + __gro_flush(gro, flush_old); +} + +static inline void napi_gro_flush(struct napi_struct *napi, bool flush_old) +{ + gro_flush(&napi->gro, flush_old); +} /* Pass the currently batched GRO_NORMAL SKBs up to the stack. */ -static inline void gro_normal_list(struct napi_struct *napi) +static inline void gro_normal_list(struct gro_node *gro) { - if (!napi->rx_count) + if (!gro->rx_count) return; - netif_receive_skb_list_internal(&napi->rx_list); - INIT_LIST_HEAD(&napi->rx_list); - napi->rx_count = 0; + netif_receive_skb_list_internal(&gro->rx_list); + INIT_LIST_HEAD(&gro->rx_list); + gro->rx_count = 0; } /* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded, * pass the whole batch up to the stack. */ -static inline void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb, int segs) +static inline void gro_normal_one(struct gro_node *gro, struct sk_buff *skb, + int segs) { - list_add_tail(&skb->list, &napi->rx_list); - napi->rx_count += segs; - if (napi->rx_count >= READ_ONCE(net_hotdata.gro_normal_batch)) - gro_normal_list(napi); + list_add_tail(&skb->list, &gro->rx_list); + gro->rx_count += segs; + if (gro->rx_count >= READ_ONCE(net_hotdata.gro_normal_batch)) + gro_normal_list(gro); } /* This function is the alternative of 'inet_iif' and 'inet_sdif' diff --git a/net/core/dev.c b/net/core/dev.c index b0ab0169f507..634bfcb3b509 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6484,7 +6484,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done) return false; if (work_done) { - if (n->gro_bitmask) + if (n->gro.bitmask) timeout = napi_get_gro_flush_timeout(n); n->defer_hard_irqs_count = napi_get_defer_hard_irqs(n); } @@ -6494,15 +6494,14 @@ bool napi_complete_done(struct napi_struct *n, int work_done) if (timeout) ret = false; } - if (n->gro_bitmask) { - /* When the NAPI instance uses a timeout and keeps postponing - * it, we need to bound somehow the time packets are kept in - * the GRO layer - */ - napi_gro_flush(n, !!timeout); - } - gro_normal_list(n); + /* + * When the NAPI instance uses a timeout and keeps postponing + * it, we need to bound somehow the time packets are kept in + * the GRO layer. + */ + gro_flush(&n->gro, !!timeout); + gro_normal_list(&n->gro); if (unlikely(!list_empty(&n->poll_list))) { /* If n->poll_list is not empty, we need to mask irqs */ @@ -6566,19 +6565,15 @@ static void skb_defer_free_flush(struct softnet_data *sd) static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) { if (!skip_schedule) { - gro_normal_list(napi); + gro_normal_list(&napi->gro); __napi_schedule(napi); return; } - if (napi->gro_bitmask) { - /* flush too old packets - * If HZ < 1000, flush all packets. - */ - napi_gro_flush(napi, HZ >= 1000); - } + /* Flush too old packets. If HZ < 1000, flush all packets */ + gro_flush(&napi->gro, HZ >= 1000); + gro_normal_list(&napi->gro); - gro_normal_list(napi); clear_bit(NAPI_STATE_SCHED, &napi->state); } @@ -6685,7 +6680,7 @@ restart: } work = napi_poll(napi, budget); trace_napi_poll(napi, work, budget); - gro_normal_list(napi); + gro_normal_list(&napi->gro); count: if (work > 0) __NET_ADD_STATS(dev_net(napi->dev), @@ -6785,6 +6780,8 @@ void napi_resume_irqs(unsigned int napi_id) static void __napi_hash_add_with_id(struct napi_struct *napi, unsigned int napi_id) { + napi->gro.cached_napi_id = napi_id; + WRITE_ONCE(napi->napi_id, napi_id); hlist_add_head_rcu(&napi->napi_hash_node, &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); @@ -6858,10 +6855,12 @@ static void init_gro_hash(struct napi_struct *napi) int i; for (i = 0; i < GRO_HASH_BUCKETS; i++) { - INIT_LIST_HEAD(&napi->gro_hash[i].list); - napi->gro_hash[i].count = 0; + INIT_LIST_HEAD(&napi->gro.hash[i].list); + napi->gro.hash[i].count = 0; } - napi->gro_bitmask = 0; + + napi->gro.bitmask = 0; + napi->gro.cached_napi_id = 0; } int dev_set_threaded(struct net_device *dev, bool threaded) @@ -7193,8 +7192,8 @@ void netif_napi_add_weight_locked(struct net_device *dev, napi->timer.function = napi_watchdog; init_gro_hash(napi); napi->skb = NULL; - INIT_LIST_HEAD(&napi->rx_list); - napi->rx_count = 0; + INIT_LIST_HEAD(&napi->gro.rx_list); + napi->gro.rx_count = 0; napi->poll = poll; if (weight > NAPI_POLL_WEIGHT) netdev_err_once(dev, "%s() called with weight %d\n", __func__, @@ -7315,10 +7314,13 @@ static void flush_gro_hash(struct napi_struct *napi) for (i = 0; i < GRO_HASH_BUCKETS; i++) { struct sk_buff *skb, *n; - list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list) + list_for_each_entry_safe(skb, n, &napi->gro.hash[i].list, list) kfree_skb(skb); - napi->gro_hash[i].count = 0; + napi->gro.hash[i].count = 0; } + + napi->gro.bitmask = 0; + napi->gro.cached_napi_id = 0; } /* Must be called in process context */ @@ -7344,7 +7346,6 @@ void __netif_napi_del_locked(struct napi_struct *napi) napi_free_frags(napi); flush_gro_hash(napi); - napi->gro_bitmask = 0; if (napi->thread) { kthread_stop(napi->thread); @@ -7403,14 +7404,9 @@ static int __napi_poll(struct napi_struct *n, bool *repoll) return work; } - if (n->gro_bitmask) { - /* flush too old packets - * If HZ < 1000, flush all packets. - */ - napi_gro_flush(n, HZ >= 1000); - } - - gro_normal_list(n); + /* Flush too old packets. If HZ < 1000, flush all packets */ + gro_flush(&n->gro, HZ >= 1000); + gro_normal_list(&n->gro); /* Some drivers may have called napi_schedule * prior to exhausting their budget. @@ -12439,7 +12435,7 @@ static struct hlist_head * __net_init netdev_create_hash(void) static int __net_init netdev_init(struct net *net) { BUILD_BUG_ON(GRO_HASH_BUCKETS > - 8 * sizeof_field(struct napi_struct, gro_bitmask)); + BITS_PER_BYTE * sizeof_field(struct gro_node, bitmask)); INIT_LIST_HEAD(&net->dev_base_head); diff --git a/net/core/gro.c b/net/core/gro.c index 78b320b63174..9e1803fdf249 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -250,8 +250,7 @@ int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb) return 0; } - -static void napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb) +static void gro_complete(struct gro_node *gro, struct sk_buff *skb) { struct list_head *head = &net_hotdata.offload_base; struct packet_offload *ptype; @@ -284,43 +283,43 @@ static void napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb) } out: - gro_normal_one(napi, skb, NAPI_GRO_CB(skb)->count); + gro_normal_one(gro, skb, NAPI_GRO_CB(skb)->count); } -static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index, - bool flush_old) +static void __gro_flush_chain(struct gro_node *gro, u32 index, bool flush_old) { - struct list_head *head = &napi->gro_hash[index].list; + struct list_head *head = &gro->hash[index].list; struct sk_buff *skb, *p; list_for_each_entry_safe_reverse(skb, p, head, list) { if (flush_old && NAPI_GRO_CB(skb)->age == jiffies) return; skb_list_del_init(skb); - napi_gro_complete(napi, skb); - napi->gro_hash[index].count--; + gro_complete(gro, skb); + gro->hash[index].count--; } - if (!napi->gro_hash[index].count) - __clear_bit(index, &napi->gro_bitmask); + if (!gro->hash[index].count) + __clear_bit(index, &gro->bitmask); } -/* napi->gro_hash[].list contains packets ordered by age. +/* + * gro->hash[].list contains packets ordered by age. * youngest packets at the head of it. * Complete skbs in reverse order to reduce latencies. */ -void napi_gro_flush(struct napi_struct *napi, bool flush_old) +void __gro_flush(struct gro_node *gro, bool flush_old) { - unsigned long bitmask = napi->gro_bitmask; + unsigned long bitmask = gro->bitmask; unsigned int i, base = ~0U; while ((i = ffs(bitmask)) != 0) { bitmask >>= i; base += i; - __napi_gro_flush_chain(napi, base, flush_old); + __gro_flush_chain(gro, base, flush_old); } } -EXPORT_SYMBOL(napi_gro_flush); +EXPORT_SYMBOL(__gro_flush); static unsigned long gro_list_prepare_tc_ext(const struct sk_buff *skb, const struct sk_buff *p, @@ -439,7 +438,7 @@ static void gro_try_pull_from_frag0(struct sk_buff *skb) gro_pull_from_frag0(skb, grow); } -static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head) +static void gro_flush_oldest(struct gro_node *gro, struct list_head *head) { struct sk_buff *oldest; @@ -455,14 +454,15 @@ static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head) * SKB to the chain. */ skb_list_del_init(oldest); - napi_gro_complete(napi, oldest); + gro_complete(gro, oldest); } -static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +static enum gro_result dev_gro_receive(struct gro_node *gro, + struct sk_buff *skb) { u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); - struct gro_list *gro_list = &napi->gro_hash[bucket]; struct list_head *head = &net_hotdata.offload_base; + struct gro_list *gro_list = &gro->hash[bucket]; struct packet_offload *ptype; __be16 type = skb->protocol; struct sk_buff *pp = NULL; @@ -526,7 +526,7 @@ found_ptype: if (pp) { skb_list_del_init(pp); - napi_gro_complete(napi, pp); + gro_complete(gro, pp); gro_list->count--; } @@ -537,7 +537,7 @@ found_ptype: goto normal; if (unlikely(gro_list->count >= MAX_GRO_SKBS)) - gro_flush_oldest(napi, &gro_list->list); + gro_flush_oldest(gro, &gro_list->list); else gro_list->count++; @@ -551,10 +551,10 @@ found_ptype: ret = GRO_HELD; ok: if (gro_list->count) { - if (!test_bit(bucket, &napi->gro_bitmask)) - __set_bit(bucket, &napi->gro_bitmask); - } else if (test_bit(bucket, &napi->gro_bitmask)) { - __clear_bit(bucket, &napi->gro_bitmask); + if (!test_bit(bucket, &gro->bitmask)) + __set_bit(bucket, &gro->bitmask); + } else if (test_bit(bucket, &gro->bitmask)) { + __clear_bit(bucket, &gro->bitmask); } return ret; @@ -593,13 +593,12 @@ struct packet_offload *gro_find_complete_by_type(__be16 type) } EXPORT_SYMBOL(gro_find_complete_by_type); -static gro_result_t napi_skb_finish(struct napi_struct *napi, - struct sk_buff *skb, - gro_result_t ret) +static gro_result_t gro_skb_finish(struct gro_node *gro, struct sk_buff *skb, + gro_result_t ret) { switch (ret) { case GRO_NORMAL: - gro_normal_one(napi, skb, 1); + gro_normal_one(gro, skb, 1); break; case GRO_MERGED_FREE: @@ -620,21 +619,21 @@ static gro_result_t napi_skb_finish(struct napi_struct *napi, return ret; } -gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +gro_result_t gro_receive_skb(struct gro_node *gro, struct sk_buff *skb) { gro_result_t ret; - skb_mark_napi_id(skb, napi); + __skb_mark_napi_id(skb, gro); trace_napi_gro_receive_entry(skb); skb_gro_reset_offset(skb, 0); - ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb)); + ret = gro_skb_finish(gro, skb, dev_gro_receive(gro, skb)); trace_napi_gro_receive_exit(ret); return ret; } -EXPORT_SYMBOL(napi_gro_receive); +EXPORT_SYMBOL(gro_receive_skb); static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) { @@ -690,7 +689,7 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi, __skb_push(skb, ETH_HLEN); skb->protocol = eth_type_trans(skb, skb->dev); if (ret == GRO_NORMAL) - gro_normal_one(napi, skb, 1); + gro_normal_one(&napi->gro, skb, 1); break; case GRO_MERGED_FREE: @@ -759,7 +758,7 @@ gro_result_t napi_gro_frags(struct napi_struct *napi) trace_napi_gro_frags_entry(skb); - ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb)); + ret = napi_frags_finish(napi, skb, dev_gro_receive(&napi->gro, skb)); trace_napi_gro_frags_exit(ret); return ret; -- cgit v1.2.3 From 388d31417ce0f1d08a1a86cab4c1dd700e9e9481 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Tue, 25 Feb 2025 18:17:44 +0100 Subject: net: gro: expose GRO init/cleanup to use outside of NAPI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make GRO init and cleanup functions global to be able to use GRO without a NAPI instance. Taking into account already global gro_flush(), it's now fully usable standalone. New functions are not exported, since they're not supposed to be used outside of the kernel core code. Tested-by: Daniel Xu Reviewed-by: Jakub Kicinski Reviewed-by: Toke Høiland-Jørgensen Signed-off-by: Alexander Lobakin Signed-off-by: Paolo Abeni --- include/net/gro.h | 3 +++ net/core/dev.c | 37 +++---------------------------------- net/core/gro.c | 34 ++++++++++++++++++++++++++++++++++ 3 files changed, 40 insertions(+), 34 deletions(-) (limited to 'include/net') diff --git a/include/net/gro.h b/include/net/gro.h index 38d70c69ff80..22d3a69e4404 100644 --- a/include/net/gro.h +++ b/include/net/gro.h @@ -546,6 +546,9 @@ static inline void gro_normal_one(struct gro_node *gro, struct sk_buff *skb, gro_normal_list(gro); } +void gro_init(struct gro_node *gro); +void gro_cleanup(struct gro_node *gro); + /* This function is the alternative of 'inet_iif' and 'inet_sdif' * functions in case we can not rely on fields of IPCB. * diff --git a/net/core/dev.c b/net/core/dev.c index 634bfcb3b509..2dd9e6f490e6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6850,19 +6850,6 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) return HRTIMER_NORESTART; } -static void init_gro_hash(struct napi_struct *napi) -{ - int i; - - for (i = 0; i < GRO_HASH_BUCKETS; i++) { - INIT_LIST_HEAD(&napi->gro.hash[i].list); - napi->gro.hash[i].count = 0; - } - - napi->gro.bitmask = 0; - napi->gro.cached_napi_id = 0; -} - int dev_set_threaded(struct net_device *dev, bool threaded) { struct napi_struct *napi; @@ -7190,10 +7177,8 @@ void netif_napi_add_weight_locked(struct net_device *dev, INIT_HLIST_NODE(&napi->napi_hash_node); hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); napi->timer.function = napi_watchdog; - init_gro_hash(napi); + gro_init(&napi->gro); napi->skb = NULL; - INIT_LIST_HEAD(&napi->gro.rx_list); - napi->gro.rx_count = 0; napi->poll = poll; if (weight > NAPI_POLL_WEIGHT) netdev_err_once(dev, "%s() called with weight %d\n", __func__, @@ -7307,22 +7292,6 @@ void napi_enable(struct napi_struct *n) } EXPORT_SYMBOL(napi_enable); -static void flush_gro_hash(struct napi_struct *napi) -{ - int i; - - for (i = 0; i < GRO_HASH_BUCKETS; i++) { - struct sk_buff *skb, *n; - - list_for_each_entry_safe(skb, n, &napi->gro.hash[i].list, list) - kfree_skb(skb); - napi->gro.hash[i].count = 0; - } - - napi->gro.bitmask = 0; - napi->gro.cached_napi_id = 0; -} - /* Must be called in process context */ void __netif_napi_del_locked(struct napi_struct *napi) { @@ -7345,7 +7314,7 @@ void __netif_napi_del_locked(struct napi_struct *napi) list_del_rcu(&napi->dev_list); napi_free_frags(napi); - flush_gro_hash(napi); + gro_cleanup(&napi->gro); if (napi->thread) { kthread_stop(napi->thread); @@ -12800,7 +12769,7 @@ static int __init net_dev_init(void) INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd); spin_lock_init(&sd->defer_lock); - init_gro_hash(&sd->backlog); + gro_init(&sd->backlog.gro); sd->backlog.poll = process_backlog; sd->backlog.weight = weight_p; INIT_LIST_HEAD(&sd->backlog.poll_list); diff --git a/net/core/gro.c b/net/core/gro.c index 9e1803fdf249..19bd4cdaee3a 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -790,3 +790,37 @@ __sum16 __skb_gro_checksum_complete(struct sk_buff *skb) return sum; } EXPORT_SYMBOL(__skb_gro_checksum_complete); + +void gro_init(struct gro_node *gro) +{ + for (u32 i = 0; i < GRO_HASH_BUCKETS; i++) { + INIT_LIST_HEAD(&gro->hash[i].list); + gro->hash[i].count = 0; + } + + gro->bitmask = 0; + gro->cached_napi_id = 0; + + INIT_LIST_HEAD(&gro->rx_list); + gro->rx_count = 0; +} + +void gro_cleanup(struct gro_node *gro) +{ + struct sk_buff *skb, *n; + + for (u32 i = 0; i < GRO_HASH_BUCKETS; i++) { + list_for_each_entry_safe(skb, n, &gro->hash[i].list, list) + kfree_skb(skb); + + gro->hash[i].count = 0; + } + + gro->bitmask = 0; + gro->cached_napi_id = 0; + + list_for_each_entry_safe(skb, n, &gro->rx_list, list) + kfree_skb(skb); + + gro->rx_count = 0; +} -- cgit v1.2.3 From b696d289c07d8480a7d4752e448f4ee2bee9e443 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Tue, 25 Feb 2025 18:17:50 +0100 Subject: xdp: remove xdp_alloc_skb_bulk() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The only user was veth, which now uses napi_skb_cache_get_bulk(). It's now preferred over a direct allocation and is exported as well, so remove this one. Reviewed-by: Toke Høiland-Jørgensen Signed-off-by: Alexander Lobakin Signed-off-by: Paolo Abeni --- include/net/xdp.h | 1 - net/core/xdp.c | 10 ---------- 2 files changed, 11 deletions(-) (limited to 'include/net') diff --git a/include/net/xdp.h b/include/net/xdp.h index 4dafc5e021f1..48efacbaa35d 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -343,7 +343,6 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, struct net_device *dev); struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf, struct net_device *dev); -int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp); struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf); static inline diff --git a/net/core/xdp.c b/net/core/xdp.c index 2c6ab6fb452f..f86eedad586a 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -618,16 +618,6 @@ void xdp_warn(const char *msg, const char *func, const int line) }; EXPORT_SYMBOL_GPL(xdp_warn); -int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp) -{ - n_skb = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, gfp, n_skb, skbs); - if (unlikely(!n_skb)) - return -ENOMEM; - - return 0; -} -EXPORT_SYMBOL_GPL(xdp_alloc_skb_bulk); - /** * xdp_build_skb_from_buff - create an skb from &xdp_buff * @xdp: &xdp_buff to convert to an skb -- cgit v1.2.3 From 23ff5f6f23f1419d87351243fd4258c2eec0fbf7 Mon Sep 17 00:00:00 2001 From: Sarika Sharma Date: Thu, 13 Feb 2025 22:46:21 +0530 Subject: wifi: cfg80211: reorg sinfo structure elements for mesh Currently, as multi-link operation(MLO) is not supported for mesh, reorganize the sinfo structure for mesh-specific fields and embed mesh related NL attributes together in organized view. This will allow for the simplified reorganization of sinfo structure for link level in a subsequent patch to add support for MLO station statistics. No functionality changes added. Pahole summary before the reorg of sinfo structure: - size: 256, cachelines: 4, members: 50 - sum members: 239, holes: 4, sum holes: 17 - paddings: 2, sum paddings: 2 - forced alignments: 1, forced holes: 1, sum forced holes: 1 Pahole summary after the reorg of sinfo structure: - size: 248, cachelines: 4, members: 50 - sum members: 239, holes: 4, sum holes: 9 - paddings: 2, sum paddings: 2 - forced alignments: 1, last cacheline: 56 bytes Signed-off-by: Sarika Sharma Link: https://patch.msgid.link/20250213171632.1646538-2-quic_sarishar@quicinc.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 43 +++++++++++++++++++++---------------------- net/wireless/nl80211.c | 11 ++++++----- 2 files changed, 27 insertions(+), 27 deletions(-) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 3580f3f5f44e..6b170a8d086c 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2045,9 +2045,6 @@ struct cfg80211_tid_stats { * @assoc_at: bootime (ns) of the last association * @rx_bytes: bytes (size of MPDUs) received from this station * @tx_bytes: bytes (size of MPDUs) transmitted to this station - * @llid: mesh local link id - * @plid: mesh peer link id - * @plink_state: mesh peer link state * @signal: The signal strength, type depends on the wiphy's signal_type. * For CFG80211_SIGNAL_TYPE_MBM, value is expressed in _dBm_. * @signal_avg: Average signal strength, type depends on the wiphy's signal_type. @@ -2067,14 +2064,20 @@ struct cfg80211_tid_stats { * This number should increase every time the list of stations * changes, i.e. when a station is added or removed, so that * userspace can tell whether it got a consistent snapshot. + * @beacon_loss_count: Number of times beacon loss event has triggered. * @assoc_req_ies: IEs from (Re)Association Request. * This is used only when in AP mode with drivers that do not use * user space MLME/SME implementation. The information is provided for * the cfg80211_new_sta() calls to notify user space of the IEs. * @assoc_req_ies_len: Length of assoc_req_ies buffer in octets. * @sta_flags: station flags mask & values - * @beacon_loss_count: Number of times beacon loss event has triggered. * @t_offset: Time offset of the station relative to this host. + * @llid: mesh local link id + * @plid: mesh peer link id + * @plink_state: mesh peer link state + * @connected_to_gate: true if mesh STA has a path to mesh gate + * @connected_to_as: true if mesh STA has a path to authentication server + * @airtime_link_metric: mesh airtime link metric. * @local_pm: local mesh STA power save mode * @peer_pm: peer mesh STA power save mode * @nonpeer_pm: non-peer mesh STA power save mode @@ -2083,7 +2086,6 @@ struct cfg80211_tid_stats { * @rx_beacon: number of beacons received from this peer * @rx_beacon_signal_avg: signal strength average (in dBm) for beacons received * from this peer - * @connected_to_gate: true if mesh STA has a path to mesh gate * @rx_duration: aggregate PPDU duration(usecs) for all the frames from a peer * @tx_duration: aggregate PPDU duration(usecs) for all the frames to a peer * @airtime_weight: current airtime scheduling weight @@ -2097,8 +2099,6 @@ struct cfg80211_tid_stats { * @fcs_err_count: number of packets (MPDUs) received from this station with * an FCS error. This counter should be incremented only when TA of the * received packet with an FCS error matches the peer MAC address. - * @airtime_link_metric: mesh airtime link metric. - * @connected_to_as: true if mesh STA has a path to authentication server * @mlo_params_valid: Indicates @assoc_link_id and @mld_addr fields are filled * by driver. Drivers use this only in cfg80211_new_sta() calls when AP * MLD's MLME/SME is offload to driver. Drivers won't fill this @@ -2125,9 +2125,6 @@ struct station_info { u64 assoc_at; u64 rx_bytes; u64 tx_bytes; - u16 llid; - u16 plid; - u8 plink_state; s8 signal; s8 signal_avg; @@ -2147,36 +2144,38 @@ struct station_info { int generation; + u32 beacon_loss_count; + const u8 *assoc_req_ies; size_t assoc_req_ies_len; - u32 beacon_loss_count; s64 t_offset; + u16 llid; + u16 plid; + u8 plink_state; + u8 connected_to_gate; + u8 connected_to_as; + u32 airtime_link_metric; enum nl80211_mesh_power_mode local_pm; enum nl80211_mesh_power_mode peer_pm; enum nl80211_mesh_power_mode nonpeer_pm; u32 expected_throughput; - u64 tx_duration; - u64 rx_duration; - u64 rx_beacon; - u8 rx_beacon_signal_avg; - u8 connected_to_gate; + u16 airtime_weight; - struct cfg80211_tid_stats *pertid; s8 ack_signal; s8 avg_ack_signal; + struct cfg80211_tid_stats *pertid; - u16 airtime_weight; + u64 tx_duration; + u64 rx_duration; + u64 rx_beacon; + u8 rx_beacon_signal_avg; u32 rx_mpdu_count; u32 fcs_err_count; - u32 airtime_link_metric; - - u8 connected_to_as; - bool mlo_params_valid; u8 assoc_link_id; u8 mld_addr[ETH_ALEN] __aligned(2); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 79b3f34e72e2..2c4e06610a79 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -6750,9 +6750,6 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, PUT_SINFO_U64(RX_BYTES64, rx_bytes); PUT_SINFO_U64(TX_BYTES64, tx_bytes); - PUT_SINFO(LLID, llid, u16); - PUT_SINFO(PLID, plid, u16); - PUT_SINFO(PLINK_STATE, plink_state, u8); PUT_SINFO_U64(RX_DURATION, rx_duration); PUT_SINFO_U64(TX_DURATION, tx_duration); @@ -6796,13 +6793,18 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, PUT_SINFO(TX_RETRIES, tx_retries, u32); PUT_SINFO(TX_FAILED, tx_failed, u32); PUT_SINFO(EXPECTED_THROUGHPUT, expected_throughput, u32); - PUT_SINFO(AIRTIME_LINK_METRIC, airtime_link_metric, u32); PUT_SINFO(BEACON_LOSS, beacon_loss_count, u32); + + PUT_SINFO(LLID, llid, u16); + PUT_SINFO(PLID, plid, u16); + PUT_SINFO(PLINK_STATE, plink_state, u8); + PUT_SINFO(AIRTIME_LINK_METRIC, airtime_link_metric, u32); PUT_SINFO(LOCAL_PM, local_pm, u32); PUT_SINFO(PEER_PM, peer_pm, u32); PUT_SINFO(NONPEER_PM, nonpeer_pm, u32); PUT_SINFO(CONNECTED_TO_GATE, connected_to_gate, u8); PUT_SINFO(CONNECTED_TO_AS, connected_to_as, u8); + PUT_SINFO_U64(T_OFFSET, t_offset); if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_BSS_PARAM)) { bss_param = nla_nest_start_noflag(msg, @@ -6830,7 +6832,6 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, &sinfo->sta_flags)) goto nla_put_failure; - PUT_SINFO_U64(T_OFFSET, t_offset); PUT_SINFO_U64(RX_DROP_MISC, rx_dropped_misc); PUT_SINFO_U64(BEACON_RX, rx_beacon); PUT_SINFO(BEACON_SIGNAL_AVG, rx_beacon_signal_avg, u8); -- cgit v1.2.3 From cfc47029fa124e5e04411fb1dbd3726db90fd346 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 27 Feb 2025 20:23:19 -0800 Subject: ipv4: fib: Allocate fib_info_hash[] during netns initialisation. We will allocate fib_info_hash[] and fib_info_laddrhash[] for each netns. Currently, fib_info_hash[] is allocated when the first route is added. Let's move the first allocation to a new __net_init function. Note that we must call fib4_semantics_exit() in fib_net_exit_batch() because ->exit() is called earlier than ->exit_batch(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Reviewed-by: David Ahern Link: https://patch.msgid.link/20250228042328.96624-4-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/ip_fib.h | 2 ++ net/ipv4/fib_frontend.c | 11 +++++++++++ net/ipv4/fib_semantics.c | 45 ++++++++++++++++++++++++++++++++------------- 3 files changed, 45 insertions(+), 13 deletions(-) (limited to 'include/net') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index a113c11ab56b..e3864b74e92a 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -162,6 +162,8 @@ struct fib_info { struct fib_nh fib_nh[] __counted_by(fib_nhs); }; +int __net_init fib4_semantics_init(struct net *net); +void __net_exit fib4_semantics_exit(struct net *net); #ifdef CONFIG_IP_MULTIPLE_TABLES struct fib_rule; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 6730e2034cf8..40c062f820f2 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1615,9 +1615,15 @@ static int __net_init fib_net_init(struct net *net) error = ip_fib_net_init(net); if (error < 0) goto out; + + error = fib4_semantics_init(net); + if (error) + goto out_semantics; + error = nl_fib_lookup_init(net); if (error < 0) goto out_nlfl; + error = fib_proc_init(net); if (error < 0) goto out_proc; @@ -1627,6 +1633,8 @@ out: out_proc: nl_fib_lookup_exit(net); out_nlfl: + fib4_semantics_exit(net); +out_semantics: rtnl_lock(); ip_fib_net_exit(net); rtnl_unlock(); @@ -1648,6 +1656,9 @@ static void __net_exit fib_net_exit_batch(struct list_head *net_list) ip_fib_net_exit(net); rtnl_unlock(); + + list_for_each_entry(net, net_list, exit_list) + fib4_semantics_exit(net); } static struct pernet_operations fib_net_ops = { diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 23aae379ba42..b7e2023bf742 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1420,28 +1420,21 @@ struct fib_info *fib_create_info(struct fib_config *cfg, } #endif - err = -ENOBUFS; - if (fib_info_cnt >= fib_info_hash_size) { + unsigned int new_hash_bits = fib_info_hash_bits + 1; struct hlist_head *new_info_hash; - unsigned int new_hash_bits; - - if (!fib_info_hash_bits) - new_hash_bits = 4; - else - new_hash_bits = fib_info_hash_bits + 1; new_info_hash = fib_info_hash_alloc(new_hash_bits); if (new_info_hash) fib_info_hash_move(new_info_hash, 1 << new_hash_bits); - - if (!fib_info_hash_size) - goto failure; } fi = kzalloc(struct_size(fi, fib_nh, nhs), GFP_KERNEL); - if (!fi) + if (!fi) { + err = -ENOBUFS; goto failure; + } + fi->fib_metrics = ip_fib_metrics_init(cfg->fc_mx, cfg->fc_mx_len, extack); if (IS_ERR(fi->fib_metrics)) { err = PTR_ERR(fi->fib_metrics); @@ -1862,7 +1855,7 @@ int fib_sync_down_addr(struct net_device *dev, __be32 local) struct fib_info *fi; int ret = 0; - if (!fib_info_laddrhash || local == 0) + if (!local) return 0; head = fib_info_laddrhash_bucket(net, local); @@ -2264,3 +2257,29 @@ check_saddr: fl4->saddr = inet_select_addr(l3mdev, 0, RT_SCOPE_LINK); } } + +int __net_init fib4_semantics_init(struct net *net) +{ + unsigned int hash_bits = 4; + + if (!net_eq(net, &init_net)) + return 0; + + fib_info_hash = fib_info_hash_alloc(hash_bits); + if (!fib_info_hash) + return -ENOMEM; + + fib_info_hash_bits = hash_bits; + fib_info_hash_size = 1 << hash_bits; + fib_info_laddrhash = fib_info_hash + fib_info_hash_size; + + return 0; +} + +void __net_exit fib4_semantics_exit(struct net *net) +{ + if (!net_eq(net, &init_net)) + return; + + fib_info_hash_free(fib_info_hash); +} -- cgit v1.2.3 From 9f7f3ebeba9388c30b0cba9dc5df0c43d0e07605 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 27 Feb 2025 20:23:24 -0800 Subject: ipv4: fib: Namespacify fib_info hash tables. We will convert RTM_NEWROUTE and RTM_DELROUTE to per-netns RTNL. Then, we need to have per-netns hash tables for struct fib_info. Let's allocate the hash tables per netns. fib_info_hash, fib_info_hash_bits, and fib_info_cnt are now moved to struct netns_ipv4 and accessed with net->ipv4.fib_XXX. Also, the netns checks are removed from fib_find_info_nh() and fib_find_info(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Reviewed-by: David Ahern Link: https://patch.msgid.link/20250228042328.96624-9-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/netns/ipv4.h | 3 +++ net/ipv4/fib_semantics.c | 61 +++++++++++++++++++++--------------------------- 2 files changed, 29 insertions(+), 35 deletions(-) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 45ac125e8aeb..650b2dc9199f 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -111,6 +111,9 @@ struct netns_ipv4 { #endif struct hlist_head *fib_table_hash; struct sock *fibnl; + struct hlist_head *fib_info_hash; + unsigned int fib_info_hash_bits; + unsigned int fib_info_cnt; struct sock *mc_autojoin_sk; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 0cd40ff18d8b..f68bb9e34c34 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -50,10 +50,6 @@ #include "fib_lookup.h" -static struct hlist_head *fib_info_hash; -static unsigned int fib_info_hash_bits; -static unsigned int fib_info_cnt; - /* for_nexthops and change_nexthops only used when nexthop object * is not set in a fib_info. The logic within can reference fib_nh. */ @@ -256,8 +252,7 @@ void fib_release_info(struct fib_info *fi) ASSERT_RTNL(); if (fi && refcount_dec_and_test(&fi->fib_treeref)) { hlist_del(&fi->fib_hash); - - fib_info_cnt--; + fi->fib_net->ipv4.fib_info_cnt--; if (fi->fib_prefsrc) hlist_del(&fi->fib_lhash); @@ -333,11 +328,12 @@ static unsigned int fib_info_hashfn_1(int init_val, u8 protocol, u8 scope, static unsigned int fib_info_hashfn_result(const struct net *net, unsigned int val) { - return hash_32(val ^ net_hash_mix(net), fib_info_hash_bits); + return hash_32(val ^ net_hash_mix(net), net->ipv4.fib_info_hash_bits); } static struct hlist_head *fib_info_hash_bucket(struct fib_info *fi) { + struct net *net = fi->fib_net; unsigned int val; val = fib_info_hashfn_1(fi->fib_nhs, fi->fib_protocol, @@ -352,16 +348,18 @@ static struct hlist_head *fib_info_hash_bucket(struct fib_info *fi) } endfor_nexthops(fi) } - return &fib_info_hash[fib_info_hashfn_result(fi->fib_net, val)]; + return &net->ipv4.fib_info_hash[fib_info_hashfn_result(net, val)]; } static struct hlist_head *fib_info_laddrhash_bucket(const struct net *net, __be32 val) { - u32 slot = hash_32(net_hash_mix(net) ^ (__force u32)val, - fib_info_hash_bits); + unsigned int hash_bits = net->ipv4.fib_info_hash_bits; + u32 slot; - return &fib_info_hash[(1 << fib_info_hash_bits) + slot]; + slot = hash_32(net_hash_mix(net) ^ (__force u32)val, hash_bits); + + return &net->ipv4.fib_info_hash[(1 << hash_bits) + slot]; } static struct hlist_head *fib_info_hash_alloc(unsigned int hash_bits) @@ -376,22 +374,22 @@ static void fib_info_hash_free(struct hlist_head *head) kvfree(head); } -static void fib_info_hash_grow(void) +static void fib_info_hash_grow(struct net *net) { + unsigned int old_size = 1 << net->ipv4.fib_info_hash_bits; struct hlist_head *new_info_hash, *old_info_hash; - unsigned int old_size = 1 << fib_info_hash_bits; unsigned int i; - if (fib_info_cnt < old_size) + if (net->ipv4.fib_info_cnt < old_size) return; - new_info_hash = fib_info_hash_alloc(fib_info_hash_bits + 1); + new_info_hash = fib_info_hash_alloc(net->ipv4.fib_info_hash_bits + 1); if (!new_info_hash) return; - old_info_hash = fib_info_hash; - fib_info_hash = new_info_hash; - fib_info_hash_bits += 1; + old_info_hash = net->ipv4.fib_info_hash; + net->ipv4.fib_info_hash = new_info_hash; + net->ipv4.fib_info_hash_bits += 1; for (i = 0; i < old_size; i++) { struct hlist_head *head = &old_info_hash[i]; @@ -429,13 +427,12 @@ static struct fib_info *fib_find_info_nh(struct net *net, (__force u32)cfg->fc_prefsrc, cfg->fc_priority); hash = fib_info_hashfn_result(net, hash); - head = &fib_info_hash[hash]; + head = &net->ipv4.fib_info_hash[hash]; hlist_for_each_entry(fi, head, fib_hash) { - if (!net_eq(fi->fib_net, net)) - continue; if (!fi->nh || fi->nh->id != cfg->fc_nh_id) continue; + if (cfg->fc_protocol == fi->fib_protocol && cfg->fc_scope == fi->fib_scope && cfg->fc_prefsrc == fi->fib_prefsrc && @@ -455,10 +452,9 @@ static struct fib_info *fib_find_info(struct fib_info *nfi) struct fib_info *fi; hlist_for_each_entry(fi, head, fib_hash) { - if (!net_eq(fi->fib_net, nfi->fib_net)) - continue; if (fi->fib_nhs != nfi->fib_nhs) continue; + if (nfi->fib_protocol == fi->fib_protocol && nfi->fib_scope == fi->fib_scope && nfi->fib_prefsrc == fi->fib_prefsrc && @@ -1406,7 +1402,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg, } #endif - fib_info_hash_grow(); + fib_info_hash_grow(net); fi = kzalloc(struct_size(fi, fib_nh, nhs), GFP_KERNEL); if (!fi) { @@ -1550,7 +1546,7 @@ link_it: refcount_set(&fi->fib_treeref, 1); refcount_set(&fi->fib_clntref, 1); - fib_info_cnt++; + net->ipv4.fib_info_cnt++; hlist_add_head(&fi->fib_hash, fib_info_hash_bucket(fi)); if (fi->fib_prefsrc) { @@ -2241,22 +2237,17 @@ int __net_init fib4_semantics_init(struct net *net) { unsigned int hash_bits = 4; - if (!net_eq(net, &init_net)) - return 0; - - fib_info_hash = fib_info_hash_alloc(hash_bits); - if (!fib_info_hash) + net->ipv4.fib_info_hash = fib_info_hash_alloc(hash_bits); + if (!net->ipv4.fib_info_hash) return -ENOMEM; - fib_info_hash_bits = hash_bits; + net->ipv4.fib_info_hash_bits = hash_bits; + net->ipv4.fib_info_cnt = 0; return 0; } void __net_exit fib4_semantics_exit(struct net *net) { - if (!net_eq(net, &init_net)) - return; - - fib_info_hash_free(fib_info_hash); + fib_info_hash_free(net->ipv4.fib_info_hash); } -- cgit v1.2.3 From e34100c2ecbb79a7d69474eb2f58545bb9926a5f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 1 Mar 2025 20:14:19 +0000 Subject: tcp: add a drop_reason pointer to tcp_check_req() We want to add new drop reasons for packets dropped in 3WHS in the following patches. tcp_rcv_state_process() has to set reason to TCP_FASTOPEN, because tcp_check_req() will conditionally overwrite the drop_reason. Signed-off-by: Eric Dumazet Reviewed-by: Jason Xing Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250301201424.2046477-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 2 +- net/ipv4/tcp_input.c | 5 ++--- net/ipv4/tcp_ipv4.c | 3 ++- net/ipv4/tcp_minisocks.c | 3 ++- net/ipv6/tcp_ipv6.c | 3 ++- 5 files changed, 9 insertions(+), 7 deletions(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index f9b9377a2897..a9bc959fb102 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -392,7 +392,7 @@ enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw, u32 *tw_isn); struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, struct request_sock *req, bool fastopen, - bool *lost_race); + bool *lost_race, enum skb_drop_reason *drop_reason); enum skb_drop_reason tcp_child_process(struct sock *parent, struct sock *child, struct sk_buff *skb); void tcp_enter_loss(struct sock *sk); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d22ad553b45b..4e2212348088 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6812,10 +6812,9 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV && sk->sk_state != TCP_FIN_WAIT1); - if (!tcp_check_req(sk, skb, req, true, &req_stolen)) { - SKB_DR_SET(reason, TCP_FASTOPEN); + SKB_DR_SET(reason, TCP_FASTOPEN); + if (!tcp_check_req(sk, skb, req, true, &req_stolen, &reason)) goto discard; - } } if (!th->ack && !th->rst && !th->syn) { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 7900855237d9..218f01a8cc5f 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2265,7 +2265,8 @@ lookup: th = (const struct tcphdr *)skb->data; iph = ip_hdr(skb); tcp_v4_fill_cb(skb, iph, th); - nsk = tcp_check_req(sk, skb, req, false, &req_stolen); + nsk = tcp_check_req(sk, skb, req, false, &req_stolen, + &drop_reason); } else { drop_reason = SKB_DROP_REASON_SOCKET_FILTER; } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 29b54ade7572..46c86c4f80e9 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -657,7 +657,8 @@ EXPORT_SYMBOL(tcp_create_openreq_child); struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, struct request_sock *req, - bool fastopen, bool *req_stolen) + bool fastopen, bool *req_stolen, + enum skb_drop_reason *drop_reason) { struct tcp_options_received tmp_opt; struct sock *child; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index a80608260298..d01088ab80d2 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1828,7 +1828,8 @@ lookup: th = (const struct tcphdr *)skb->data; hdr = ipv6_hdr(skb); tcp_v6_fill_cb(skb, hdr, th); - nsk = tcp_check_req(sk, skb, req, false, &req_stolen); + nsk = tcp_check_req(sk, skb, req, false, &req_stolen, + &drop_reason); } else { drop_reason = SKB_DROP_REASON_SOCKET_FILTER; } -- cgit v1.2.3 From a11a791ca81e5cd9bf2d48e1368d836bc53f00ca Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 1 Mar 2025 20:14:20 +0000 Subject: tcp: add four drop reasons to tcp_check_req() Use two existing drop reasons in tcp_check_req(): - TCP_RFC7323_PAWS - TCP_OVERWINDOW Add two new ones: - TCP_RFC7323_TSECR (corresponds to LINUX_MIB_TSECRREJECTED) - TCP_LISTEN_OVERFLOW (when a listener accept queue is full) Signed-off-by: Eric Dumazet Reviewed-by: Jason Xing Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250301201424.2046477-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/dropreason-core.h | 9 +++++++++ net/ipv4/tcp_minisocks.c | 10 ++++++++-- 2 files changed, 17 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index 32a34dfe8cc5..e4fdc6b54cef 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -40,6 +40,8 @@ FN(TCP_OFOMERGE) \ FN(TCP_RFC7323_PAWS) \ FN(TCP_RFC7323_PAWS_ACK) \ + FN(TCP_RFC7323_TSECR) \ + FN(TCP_LISTEN_OVERFLOW) \ FN(TCP_OLD_SEQUENCE) \ FN(TCP_INVALID_SEQUENCE) \ FN(TCP_INVALID_ACK_SEQUENCE) \ @@ -281,6 +283,13 @@ enum skb_drop_reason { * Corresponds to LINUX_MIB_PAWS_OLD_ACK. */ SKB_DROP_REASON_TCP_RFC7323_PAWS_ACK, + /** + * @SKB_DROP_REASON_TCP_RFC7323_TSECR: PAWS check, invalid TSEcr. + * Corresponds to LINUX_MIB_TSECRREJECTED. + */ + SKB_DROP_REASON_TCP_RFC7323_TSECR, + /** @SKB_DROP_REASON_TCP_LISTEN_OVERFLOW: listener queue full. */ + SKB_DROP_REASON_TCP_LISTEN_OVERFLOW, /** @SKB_DROP_REASON_TCP_OLD_SEQUENCE: Old SEQ field (duplicate packet) */ SKB_DROP_REASON_TCP_OLD_SEQUENCE, /** @SKB_DROP_REASON_TCP_INVALID_SEQUENCE: Not acceptable SEQ field */ diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 46c86c4f80e9..ba4a5d7f251d 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -809,10 +809,15 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, LINUX_MIB_TCPACKSKIPPEDSYNRECV, &tcp_rsk(req)->last_oow_ack_time)) req->rsk_ops->send_ack(sk, skb, req); - if (paws_reject) + if (paws_reject) { + SKB_DR_SET(*drop_reason, TCP_RFC7323_PAWS); NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); - else if (tsecr_reject) + } else if (tsecr_reject) { + SKB_DR_SET(*drop_reason, TCP_RFC7323_TSECR); NET_INC_STATS(sock_net(sk), LINUX_MIB_TSECRREJECTED); + } else { + SKB_DR_SET(*drop_reason, TCP_OVERWINDOW); + } return NULL; } @@ -882,6 +887,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, return inet_csk_complete_hashdance(sk, child, req, own_req); listen_overflow: + SKB_DR_SET(*drop_reason, TCP_LISTEN_OVERFLOW); if (sk != req->rsk_listener) __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE); -- cgit v1.2.3 From e7b9ecce562ca6a1de32c56c597fa45e08c44ec0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 1 Mar 2025 20:14:21 +0000 Subject: tcp: convert to dev_net_rcu() TCP uses of dev_net() are under RCU protection, change them to dev_net_rcu() to get LOCKDEP support. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250301201424.2046477-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/inet6_hashtables.h | 2 +- include/net/inet_hashtables.h | 2 +- net/ipv4/tcp_ipv4.c | 12 ++++++------ net/ipv4/tcp_metrics.c | 6 +++--- net/ipv6/tcp_ipv6.c | 22 +++++++++++----------- 5 files changed, 22 insertions(+), 22 deletions(-) (limited to 'include/net') diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h index 74dd90ff5f12..c32878c69179 100644 --- a/include/net/inet6_hashtables.h +++ b/include/net/inet6_hashtables.h @@ -150,7 +150,7 @@ static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo, int iif, int sdif, bool *refcounted) { - struct net *net = dev_net(skb_dst(skb)->dev); + struct net *net = dev_net_rcu(skb_dst(skb)->dev); const struct ipv6hdr *ip6h = ipv6_hdr(skb); struct sock *sk; diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 5eea47f135a4..da818fb0205f 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -492,7 +492,7 @@ static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo, const int sdif, bool *refcounted) { - struct net *net = dev_net(skb_dst(skb)->dev); + struct net *net = dev_net_rcu(skb_dst(skb)->dev); const struct iphdr *iph = ip_hdr(skb); struct sock *sk; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 218f01a8cc5f..ae07613e4f33 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -494,14 +494,14 @@ int tcp_v4_err(struct sk_buff *skb, u32 info) { const struct iphdr *iph = (const struct iphdr *)skb->data; struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2)); - struct tcp_sock *tp; + struct net *net = dev_net_rcu(skb->dev); const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; - struct sock *sk; struct request_sock *fastopen; + struct tcp_sock *tp; u32 seq, snd_una; + struct sock *sk; int err; - struct net *net = dev_net(skb->dev); sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, iph->daddr, th->dest, iph->saddr, @@ -786,7 +786,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb, arg.iov[0].iov_base = (unsigned char *)&rep; arg.iov[0].iov_len = sizeof(rep.th); - net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); + net = sk ? sock_net(sk) : dev_net_rcu(skb_dst(skb)->dev); /* Invalid TCP option size or twice included auth */ if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh)) @@ -1961,7 +1961,7 @@ EXPORT_SYMBOL(tcp_v4_do_rcv); int tcp_v4_early_demux(struct sk_buff *skb) { - struct net *net = dev_net(skb->dev); + struct net *net = dev_net_rcu(skb->dev); const struct iphdr *iph; const struct tcphdr *th; struct sock *sk; @@ -2172,7 +2172,7 @@ static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, int tcp_v4_rcv(struct sk_buff *skb) { - struct net *net = dev_net(skb->dev); + struct net *net = dev_net_rcu(skb->dev); enum skb_drop_reason drop_reason; int sdif = inet_sdif(skb); int dif = inet_iif(skb); diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 95669935494e..4251670e328c 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -170,7 +170,7 @@ static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, bool reclaim = false; spin_lock_bh(&tcp_metrics_lock); - net = dev_net(dst->dev); + net = dev_net_rcu(dst->dev); /* While waiting for the spin-lock the cache might have been populated * with this entry and so we have to check again. @@ -273,7 +273,7 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req, return NULL; } - net = dev_net(dst->dev); + net = dev_net_rcu(dst->dev); hash ^= net_hash_mix(net); hash = hash_32(hash, tcp_metrics_hash_log); @@ -318,7 +318,7 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk, else return NULL; - net = dev_net(dst->dev); + net = dev_net_rcu(dst->dev); hash ^= net_hash_mix(net); hash = hash_32(hash, tcp_metrics_hash_log); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index d01088ab80d2..fe75ad8e606c 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -376,7 +376,7 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, { const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data; const struct tcphdr *th = (struct tcphdr *)(skb->data+offset); - struct net *net = dev_net(skb->dev); + struct net *net = dev_net_rcu(skb->dev); struct request_sock *fastopen; struct ipv6_pinfo *np; struct tcp_sock *tp; @@ -866,16 +866,16 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 int oif, int rst, u8 tclass, __be32 label, u32 priority, u32 txhash, struct tcp_key *key) { - const struct tcphdr *th = tcp_hdr(skb); - struct tcphdr *t1; - struct sk_buff *buff; - struct flowi6 fl6; - struct net *net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); - struct sock *ctl_sk = net->ipv6.tcp_sk; + struct net *net = sk ? sock_net(sk) : dev_net_rcu(skb_dst(skb)->dev); unsigned int tot_len = sizeof(struct tcphdr); + struct sock *ctl_sk = net->ipv6.tcp_sk; + const struct tcphdr *th = tcp_hdr(skb); __be32 mrst = 0, *topt; struct dst_entry *dst; - __u32 mark = 0; + struct sk_buff *buff; + struct tcphdr *t1; + struct flowi6 fl6; + u32 mark = 0; if (tsecr) tot_len += TCPOLEN_TSTAMP_ALIGNED; @@ -1041,7 +1041,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb, if (!sk && !ipv6_unicast_destination(skb)) return; - net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); + net = sk ? sock_net(sk) : dev_net_rcu(skb_dst(skb)->dev); /* Invalid TCP option size or twice included auth */ if (tcp_parse_auth_options(th, &md5_hash_location, &aoh)) return; @@ -1740,6 +1740,7 @@ static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr, INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb) { + struct net *net = dev_net_rcu(skb->dev); enum skb_drop_reason drop_reason; int sdif = inet6_sdif(skb); int dif = inet6_iif(skb); @@ -1749,7 +1750,6 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb) bool refcounted; int ret; u32 isn; - struct net *net = dev_net(skb->dev); drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; if (skb->pkt_type != PACKET_HOST) @@ -2001,7 +2001,7 @@ do_time_wait: void tcp_v6_early_demux(struct sk_buff *skb) { - struct net *net = dev_net(skb->dev); + struct net *net = dev_net_rcu(skb->dev); const struct ipv6hdr *hdr; const struct tcphdr *th; struct sock *sk; -- cgit v1.2.3 From 456cc675b6d4cadd4872a477d8976ff56eef4b6f Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 28 Feb 2025 18:01:31 +0800 Subject: sock: add sock_kmemdup helper This patch adds the sock version of kmemdup() helper, named sock_kmemdup(), to duplicate the input "src" memory block using the socket's option memory buffer. Signed-off-by: Geliang Tang Reviewed-by: Kuniyuki Iwashima Acked-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/f828077394c7d1f3560123497348b438c875b510.1740735165.git.tanggeliang@kylinos.cn Signed-off-by: Jakub Kicinski --- include/net/sock.h | 2 ++ net/core/sock.c | 16 ++++++++++++++++ 2 files changed, 18 insertions(+) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index e771d99f81b0..8daf1b3b12c6 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1797,6 +1797,8 @@ static inline struct sk_buff *sock_alloc_send_skb(struct sock *sk, } void *sock_kmalloc(struct sock *sk, int size, gfp_t priority); +void *sock_kmemdup(struct sock *sk, const void *src, + int size, gfp_t priority); void sock_kfree_s(struct sock *sk, void *mem, int size); void sock_kzfree_s(struct sock *sk, void *mem, int size); void sk_send_sigurg(struct sock *sk); diff --git a/net/core/sock.c b/net/core/sock.c index 23bce41f7f1f..a0598518ce89 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2836,6 +2836,22 @@ void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) } EXPORT_SYMBOL(sock_kmalloc); +/* + * Duplicate the input "src" memory block using the socket's + * option memory buffer. + */ +void *sock_kmemdup(struct sock *sk, const void *src, + int size, gfp_t priority) +{ + void *mem; + + mem = sock_kmalloc(sk, size, priority); + if (mem) + memcpy(mem, src, size); + return mem; +} +EXPORT_SYMBOL(sock_kmemdup); + /* Free an option memory block. Note, we actually want the inline * here as this allows gcc to detect the nullify and fold away the * condition entirely. -- cgit v1.2.3 From d186f405fdf4229d0e9a52ba71662404b06cc002 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 2 Mar 2025 12:42:36 +0000 Subject: tcp: add RCU management to inet_bind_bucket Add RCU protection to inet_bind_bucket structure. - Add rcu_head field to the structure definition. - Use kfree_rcu() at destroy time, and remove inet_bind_bucket_destroy() first argument. - Use hlist_del_rcu() and hlist_add_head_rcu() methods. Signed-off-by: Eric Dumazet Reviewed-by: Jason Xing Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250302124237.3913746-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/inet_hashtables.h | 4 ++-- net/ipv4/inet_connection_sock.c | 2 +- net/ipv4/inet_hashtables.c | 14 +++++++------- net/ipv4/inet_timewait_sock.c | 2 +- 4 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include/net') diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index da818fb0205f..1061c4f536a6 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -89,6 +89,7 @@ struct inet_bind_bucket { bool fast_ipv6_only; struct hlist_node node; struct hlist_head bhash2; + struct rcu_head rcu; }; struct inet_bind2_bucket { @@ -226,8 +227,7 @@ struct inet_bind_bucket * inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net, struct inet_bind_hashbucket *head, const unsigned short snum, int l3mdev); -void inet_bind_bucket_destroy(struct kmem_cache *cachep, - struct inet_bind_bucket *tb); +void inet_bind_bucket_destroy(struct inet_bind_bucket *tb); bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, const struct net *net, unsigned short port, diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index b4e514da22b6..e93c66034077 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -598,7 +598,7 @@ fail_unlock: if (bhash2_created) inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, tb2); if (bhash_created) - inet_bind_bucket_destroy(hinfo->bind_bucket_cachep, tb); + inet_bind_bucket_destroy(tb); } if (head2_lock_acquired) spin_unlock(&head2->lock); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 46d39aa2199e..b737e13f8459 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -76,7 +76,7 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, tb->fastreuse = 0; tb->fastreuseport = 0; INIT_HLIST_HEAD(&tb->bhash2); - hlist_add_head(&tb->node, &head->chain); + hlist_add_head_rcu(&tb->node, &head->chain); } return tb; } @@ -84,11 +84,11 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, /* * Caller must hold hashbucket lock for this tb with local BH disabled */ -void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb) +void inet_bind_bucket_destroy(struct inet_bind_bucket *tb) { if (hlist_empty(&tb->bhash2)) { - __hlist_del(&tb->node); - kmem_cache_free(cachep, tb); + hlist_del_rcu(&tb->node); + kfree_rcu(tb, rcu); } } @@ -201,7 +201,7 @@ static void __inet_put_port(struct sock *sk) } spin_unlock(&head2->lock); - inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); + inet_bind_bucket_destroy(tb); spin_unlock(&head->lock); } @@ -285,7 +285,7 @@ bhash2_find: error: if (created_inet_bind_bucket) - inet_bind_bucket_destroy(table->bind_bucket_cachep, tb); + inet_bind_bucket_destroy(tb); spin_unlock(&head2->lock); spin_unlock(&head->lock); return -ENOMEM; @@ -1162,7 +1162,7 @@ error: spin_unlock(&head2->lock); if (tb_created) - inet_bind_bucket_destroy(hinfo->bind_bucket_cachep, tb); + inet_bind_bucket_destroy(tb); spin_unlock(&head->lock); if (tw) diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 337390ba85b4..aded4bf1bc16 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -39,7 +39,7 @@ void inet_twsk_bind_unhash(struct inet_timewait_sock *tw, tw->tw_tb = NULL; tw->tw_tb2 = NULL; inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2); - inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); + inet_bind_bucket_destroy(tb); __sock_put((struct sock *)tw); } -- cgit v1.2.3 From 86c2bc293b8130aec9fa504e953531a84a6eb9a6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 2 Mar 2025 12:42:37 +0000 Subject: tcp: use RCU lookup in __inet_hash_connect() When __inet_hash_connect() has to try many 4-tuples before finding an available one, we see a high spinlock cost from the many spin_lock_bh(&head->lock) performed in its loop. This patch adds an RCU lookup to avoid the spinlock cost. check_established() gets a new @rcu_lookup argument. First reason is to not make any changes while head->lock is not held. Second reason is to not make this RCU lookup a second time after the spinlock has been acquired. Tested: Server: ulimit -n 40000; neper/tcp_crr -T 200 -F 30000 -6 --nolog Client: ulimit -n 40000; neper/tcp_crr -T 200 -F 30000 -6 --nolog -c -H server Before series: utime_start=0.288582 utime_end=1.548707 stime_start=20.637138 stime_end=2002.489845 num_transactions=484453 latency_min=0.156279245 latency_max=20.922042756 latency_mean=1.546521274 latency_stddev=3.936005194 num_samples=312537 throughput=47426.00 perf top on the client: 49.54% [kernel] [k] _raw_spin_lock 25.87% [kernel] [k] _raw_spin_lock_bh 5.97% [kernel] [k] queued_spin_lock_slowpath 5.67% [kernel] [k] __inet_hash_connect 3.53% [kernel] [k] __inet6_check_established 3.48% [kernel] [k] inet6_ehashfn 0.64% [kernel] [k] rcu_all_qs After this series: utime_start=0.271607 utime_end=3.847111 stime_start=18.407684 stime_end=1997.485557 num_transactions=1350742 latency_min=0.014131929 latency_max=17.895073144 latency_mean=0.505675853 # Nice reduction of latency metrics latency_stddev=2.125164772 num_samples=307884 throughput=139866.80 # 190 % increase perf top on client: 56.86% [kernel] [k] __inet6_check_established 17.96% [kernel] [k] __inet_hash_connect 13.88% [kernel] [k] inet6_ehashfn 2.52% [kernel] [k] rcu_all_qs 2.01% [kernel] [k] __cond_resched 0.41% [kernel] [k] _raw_spin_lock Signed-off-by: Eric Dumazet Reviewed-by: Jason Xing Tested-by: Jason Xing Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250302124237.3913746-5-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/inet_hashtables.h | 3 ++- net/ipv4/inet_hashtables.c | 52 +++++++++++++++++++++++++++++-------------- net/ipv6/inet6_hashtables.c | 24 +++++++++++--------- 3 files changed, 50 insertions(+), 29 deletions(-) (limited to 'include/net') diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 1061c4f536a6..f447d61d9598 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -529,7 +529,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk, u64 port_offset, int (*check_established)(struct inet_timewait_death_row *, struct sock *, __u16, - struct inet_timewait_sock **)); + struct inet_timewait_sock **, + bool rcu_lookup)); int inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index b737e13f8459..d1b5f45ee718 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -537,7 +537,8 @@ EXPORT_SYMBOL_GPL(__inet_lookup_established); /* called with local bh disabled */ static int __inet_check_established(struct inet_timewait_death_row *death_row, struct sock *sk, __u16 lport, - struct inet_timewait_sock **twp) + struct inet_timewait_sock **twp, + bool rcu_lookup) { struct inet_hashinfo *hinfo = death_row->hashinfo; struct inet_sock *inet = inet_sk(sk); @@ -556,17 +557,17 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, struct sock *sk2; spinlock_t *lock; - rcu_read_lock(); - sk_nulls_for_each(sk2, node, &head->chain) { - if (sk2->sk_hash != hash || - !inet_match(net, sk2, acookie, ports, dif, sdif)) - continue; - if (sk2->sk_state == TCP_TIME_WAIT) - break; - rcu_read_unlock(); - return -EADDRNOTAVAIL; + if (rcu_lookup) { + sk_nulls_for_each(sk2, node, &head->chain) { + if (sk2->sk_hash != hash || + !inet_match(net, sk2, acookie, ports, dif, sdif)) + continue; + if (sk2->sk_state == TCP_TIME_WAIT) + break; + return -EADDRNOTAVAIL; + } + return 0; } - rcu_read_unlock(); lock = inet_ehash_lockp(hinfo, hash); spin_lock(lock); @@ -1007,7 +1008,8 @@ static u32 *table_perturb; int __inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk, u64 port_offset, int (*check_established)(struct inet_timewait_death_row *, - struct sock *, __u16, struct inet_timewait_sock **)) + struct sock *, __u16, struct inet_timewait_sock **, + bool rcu_lookup)) { struct inet_hashinfo *hinfo = death_row->hashinfo; struct inet_bind_hashbucket *head, *head2; @@ -1025,7 +1027,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, if (port) { local_bh_disable(); - ret = check_established(death_row, sk, port, NULL); + ret = check_established(death_row, sk, port, NULL, false); local_bh_enable(); return ret; } @@ -1061,6 +1063,21 @@ other_parity_scan: continue; head = &hinfo->bhash[inet_bhashfn(net, port, hinfo->bhash_size)]; + rcu_read_lock(); + hlist_for_each_entry_rcu(tb, &head->chain, node) { + if (!inet_bind_bucket_match(tb, net, port, l3mdev)) + continue; + if (tb->fastreuse >= 0 || tb->fastreuseport >= 0) { + rcu_read_unlock(); + goto next_port; + } + if (!check_established(death_row, sk, port, &tw, true)) + break; + rcu_read_unlock(); + goto next_port; + } + rcu_read_unlock(); + spin_lock_bh(&head->lock); /* Does not bother with rcv_saddr checks, because @@ -1070,12 +1087,12 @@ other_parity_scan: if (inet_bind_bucket_match(tb, net, port, l3mdev)) { if (tb->fastreuse >= 0 || tb->fastreuseport >= 0) - goto next_port; + goto next_port_unlock; WARN_ON(hlist_empty(&tb->bhash2)); if (!check_established(death_row, sk, - port, &tw)) + port, &tw, false)) goto ok; - goto next_port; + goto next_port_unlock; } } @@ -1089,8 +1106,9 @@ other_parity_scan: tb->fastreuse = -1; tb->fastreuseport = -1; goto ok; -next_port: +next_port_unlock: spin_unlock_bh(&head->lock); +next_port: cond_resched(); } diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 3604a5cae5d2..9be315496459 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -263,7 +263,8 @@ EXPORT_SYMBOL_GPL(inet6_lookup); static int __inet6_check_established(struct inet_timewait_death_row *death_row, struct sock *sk, const __u16 lport, - struct inet_timewait_sock **twp) + struct inet_timewait_sock **twp, + bool rcu_lookup) { struct inet_hashinfo *hinfo = death_row->hashinfo; struct inet_sock *inet = inet_sk(sk); @@ -281,17 +282,18 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row, struct sock *sk2; spinlock_t *lock; - rcu_read_lock(); - sk_nulls_for_each(sk2, node, &head->chain) { - if (sk2->sk_hash != hash || - !inet6_match(net, sk2, saddr, daddr, ports, dif, sdif)) - continue; - if (sk2->sk_state == TCP_TIME_WAIT) - break; - rcu_read_unlock(); - return -EADDRNOTAVAIL; + if (rcu_lookup) { + sk_nulls_for_each(sk2, node, &head->chain) { + if (sk2->sk_hash != hash || + !inet6_match(net, sk2, saddr, daddr, + ports, dif, sdif)) + continue; + if (sk2->sk_state == TCP_TIME_WAIT) + break; + return -EADDRNOTAVAIL; + } + return 0; } - rcu_read_unlock(); lock = inet_ehash_lockp(hinfo, hash); spin_lock(lock); -- cgit v1.2.3 From f130a0cc1b4ff1ef28a307428d40436032e2b66e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 4 Mar 2025 12:59:18 +0000 Subject: inet: fix lwtunnel_valid_encap_type() lock imbalance After blamed commit rtm_to_fib_config() now calls lwtunnel_valid_encap_type{_attr}() without RTNL held, triggering an unlock balance in __rtnl_unlock, as reported by syzbot [1] IPv6 and rtm_to_nh_config() are not yet converted. Add a temporary @rtnl_is_held parameter to lwtunnel_valid_encap_type() and lwtunnel_valid_encap_type_attr(). While we are at it replace the two rcu_dereference() in lwtunnel_valid_encap_type() with more appropriate rcu_access_pointer(). [1] syz-executor245/5836 is trying to release lock (rtnl_mutex) at: [] __rtnl_unlock+0x6c/0xf0 net/core/rtnetlink.c:142 but there are no more locks to release! other info that might help us debug this: no locks held by syz-executor245/5836. stack backtrace: CPU: 0 UID: 0 PID: 5836 Comm: syz-executor245 Not tainted 6.14.0-rc4-syzkaller-00873-g3424291dd242 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 02/12/2025 Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:120 print_unlock_imbalance_bug+0x25b/0x2d0 kernel/locking/lockdep.c:5289 __lock_release kernel/locking/lockdep.c:5518 [inline] lock_release+0x47e/0xa30 kernel/locking/lockdep.c:5872 __mutex_unlock_slowpath+0xec/0x800 kernel/locking/mutex.c:891 __rtnl_unlock+0x6c/0xf0 net/core/rtnetlink.c:142 lwtunnel_valid_encap_type+0x38a/0x5f0 net/core/lwtunnel.c:169 lwtunnel_valid_encap_type_attr+0x113/0x270 net/core/lwtunnel.c:209 rtm_to_fib_config+0x949/0x14e0 net/ipv4/fib_frontend.c:808 inet_rtm_newroute+0xf6/0x2a0 net/ipv4/fib_frontend.c:917 rtnetlink_rcv_msg+0x791/0xcf0 net/core/rtnetlink.c:6919 netlink_rcv_skb+0x206/0x480 net/netlink/af_netlink.c:2534 netlink_unicast_kernel net/netlink/af_netlink.c:1313 [inline] netlink_unicast+0x7f6/0x990 net/netlink/af_netlink.c:1339 netlink_sendmsg+0x8de/0xcb0 net/netlink/af_netlink.c:1883 sock_sendmsg_nosec net/socket.c:709 [inline] Fixes: 1dd2af7963e9 ("ipv4: fib: Convert RTM_NEWROUTE and RTM_DELROUTE to per-netns RTNL.") Reported-by: syzbot+3f18ef0f7df107a3f6a0@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/67c6f87a.050a0220.38b91b.0147.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250304125918.2763514-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/lwtunnel.h | 12 ++++++++---- net/core/lwtunnel.c | 23 ++++++++++++----------- net/ipv4/fib_frontend.c | 4 ++-- net/ipv4/nexthop.c | 3 ++- net/ipv6/route.c | 6 ++++-- 5 files changed, 28 insertions(+), 20 deletions(-) (limited to 'include/net') diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h index 53bd2d02a4f0..39cd50300a18 100644 --- a/include/net/lwtunnel.h +++ b/include/net/lwtunnel.h @@ -116,9 +116,11 @@ int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op, int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op, unsigned int num); int lwtunnel_valid_encap_type(u16 encap_type, - struct netlink_ext_ack *extack); + struct netlink_ext_ack *extack, + bool rtnl_is_held); int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int len, - struct netlink_ext_ack *extack); + struct netlink_ext_ack *extack, + bool rtnl_is_held); int lwtunnel_build_state(struct net *net, u16 encap_type, struct nlattr *encap, unsigned int family, const void *cfg, @@ -201,13 +203,15 @@ static inline int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op, } static inline int lwtunnel_valid_encap_type(u16 encap_type, - struct netlink_ext_ack *extack) + struct netlink_ext_ack *extack, + bool rtnl_is_held) { NL_SET_ERR_MSG(extack, "CONFIG_LWTUNNEL is not enabled in this kernel"); return -EOPNOTSUPP; } static inline int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int len, - struct netlink_ext_ack *extack) + struct netlink_ext_ack *extack, + bool rtnl_is_held) { /* return 0 since we are not walking attr looking for * RTA_ENCAP_TYPE attribute on nexthops. diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index 711cd3b4347a..6d3833269c2b 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -147,7 +147,8 @@ int lwtunnel_build_state(struct net *net, u16 encap_type, } EXPORT_SYMBOL_GPL(lwtunnel_build_state); -int lwtunnel_valid_encap_type(u16 encap_type, struct netlink_ext_ack *extack) +int lwtunnel_valid_encap_type(u16 encap_type, struct netlink_ext_ack *extack, + bool rtnl_is_held) { const struct lwtunnel_encap_ops *ops; int ret = -EINVAL; @@ -158,21 +159,19 @@ int lwtunnel_valid_encap_type(u16 encap_type, struct netlink_ext_ack *extack) return ret; } - rcu_read_lock(); - ops = rcu_dereference(lwtun_encaps[encap_type]); - rcu_read_unlock(); + ops = rcu_access_pointer(lwtun_encaps[encap_type]); #ifdef CONFIG_MODULES if (!ops) { const char *encap_type_str = lwtunnel_encap_str(encap_type); if (encap_type_str) { - __rtnl_unlock(); + if (rtnl_is_held) + __rtnl_unlock(); request_module("rtnl-lwt-%s", encap_type_str); - rtnl_lock(); + if (rtnl_is_held) + rtnl_lock(); - rcu_read_lock(); - ops = rcu_dereference(lwtun_encaps[encap_type]); - rcu_read_unlock(); + ops = rcu_access_pointer(lwtun_encaps[encap_type]); } } #endif @@ -185,7 +184,8 @@ int lwtunnel_valid_encap_type(u16 encap_type, struct netlink_ext_ack *extack) EXPORT_SYMBOL_GPL(lwtunnel_valid_encap_type); int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int remaining, - struct netlink_ext_ack *extack) + struct netlink_ext_ack *extack, + bool rtnl_is_held) { struct rtnexthop *rtnh = (struct rtnexthop *)attr; struct nlattr *nla_entype; @@ -207,7 +207,8 @@ int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int remaining, encap_type = nla_get_u16(nla_entype); if (lwtunnel_valid_encap_type(encap_type, - extack) != 0) + extack, + rtnl_is_held) != 0) return -EOPNOTSUPP; } } diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 6de77415b5b3..3f4e629998fa 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -807,7 +807,7 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, case RTA_MULTIPATH: err = lwtunnel_valid_encap_type_attr(nla_data(attr), nla_len(attr), - extack); + extack, false); if (err < 0) goto errout; cfg->fc_mp = nla_data(attr); @@ -825,7 +825,7 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, case RTA_ENCAP_TYPE: cfg->fc_encap_type = nla_get_u16(attr); err = lwtunnel_valid_encap_type(cfg->fc_encap_type, - extack); + extack, false); if (err < 0) goto errout; break; diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 09a3d73b45ba..01df7dd795f0 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -3187,7 +3187,8 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, } cfg->nh_encap_type = nla_get_u16(tb[NHA_ENCAP_TYPE]); - err = lwtunnel_valid_encap_type(cfg->nh_encap_type, extack); + err = lwtunnel_valid_encap_type(cfg->nh_encap_type, + extack, true); if (err < 0) goto out; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index ef2d23a1e3d5..fb2e99a56529 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -5128,7 +5128,8 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]); err = lwtunnel_valid_encap_type_attr(cfg->fc_mp, - cfg->fc_mp_len, extack); + cfg->fc_mp_len, + extack, true); if (err < 0) goto errout; } @@ -5147,7 +5148,8 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, if (tb[RTA_ENCAP_TYPE]) { cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]); - err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack); + err = lwtunnel_valid_encap_type(cfg->fc_encap_type, + extack, true); if (err < 0) goto errout; } -- cgit v1.2.3 From d4438ce68bf145aa1d7ed03ebf3b8ece337e3f64 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 5 Mar 2025 03:45:50 +0000 Subject: inet: call inet6_ehashfn() once from inet6_hash_connect() inet6_ehashfn() being called from __inet6_check_established() has a big impact on performance, as shown in the Tested section. After prior patch, we can compute the hash for port 0 from inet6_hash_connect(), and derive each hash in __inet_hash_connect() from this initial hash: hash(saddr, lport, daddr, dport) == hash(saddr, 0, daddr, dport) + lport Apply the same principle for __inet_check_established(), although inet_ehashfn() has a smaller cost. Tested: Server: ulimit -n 40000; neper/tcp_crr -T 200 -F 30000 -6 --nolog Client: ulimit -n 40000; neper/tcp_crr -T 200 -F 30000 -6 --nolog -c -H server Before this patch: utime_start=0.286131 utime_end=4.378886 stime_start=11.952556 stime_end=1991.655533 num_transactions=1446830 latency_min=0.001061085 latency_max=12.075275028 latency_mean=0.376375302 latency_stddev=1.361969596 num_samples=306383 throughput=151866.56 perf top: 50.01% [kernel] [k] __inet6_check_established 20.65% [kernel] [k] __inet_hash_connect 15.81% [kernel] [k] inet6_ehashfn 2.92% [kernel] [k] rcu_all_qs 2.34% [kernel] [k] __cond_resched 0.50% [kernel] [k] _raw_spin_lock 0.34% [kernel] [k] sched_balance_trigger 0.24% [kernel] [k] queued_spin_lock_slowpath After this patch: utime_start=0.315047 utime_end=9.257617 stime_start=7.041489 stime_end=1923.688387 num_transactions=3057968 latency_min=0.003041375 latency_max=7.056589232 latency_mean=0.141075048 # Better latency metrics latency_stddev=0.526900516 num_samples=312996 throughput=320677.21 # 111 % increase, and 229 % for the series perf top: inet6_ehashfn is no longer seen. 39.67% [kernel] [k] __inet_hash_connect 37.06% [kernel] [k] __inet6_check_established 4.79% [kernel] [k] rcu_all_qs 3.82% [kernel] [k] __cond_resched 1.76% [kernel] [k] sched_balance_domains 0.82% [kernel] [k] _raw_spin_lock 0.81% [kernel] [k] sched_balance_rq 0.81% [kernel] [k] sched_balance_trigger 0.76% [kernel] [k] queued_spin_lock_slowpath Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Tested-by: Jason Xing Reviewed-by: Jason Xing Link: https://patch.msgid.link/20250305034550.879255-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/inet_hashtables.h | 4 +++- include/net/ip.h | 2 +- net/ipv4/inet_hashtables.c | 26 ++++++++++++++++++-------- net/ipv6/inet6_hashtables.c | 15 +++++++++++---- 4 files changed, 33 insertions(+), 14 deletions(-) (limited to 'include/net') diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index f447d61d9598..949641e92539 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -527,10 +527,12 @@ static inline void sk_rcv_saddr_set(struct sock *sk, __be32 addr) int __inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk, u64 port_offset, + u32 hash_port0, int (*check_established)(struct inet_timewait_death_row *, struct sock *, __u16, struct inet_timewait_sock **, - bool rcu_lookup)); + bool rcu_lookup, + u32 hash)); int inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk); diff --git a/include/net/ip.h b/include/net/ip.h index ce5e59957dd5..8a48ade24620 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -357,7 +357,7 @@ static inline void inet_get_local_port_range(const struct net *net, int *low, in bool inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high); #ifdef CONFIG_SYSCTL -static inline bool inet_is_local_reserved_port(struct net *net, unsigned short port) +static inline bool inet_is_local_reserved_port(const struct net *net, unsigned short port) { if (!net->ipv4.sysctl_local_reserved_ports) return false; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index ccf69b534d2e..5bf163f756e9 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -538,7 +538,8 @@ EXPORT_SYMBOL_GPL(__inet_lookup_established); static int __inet_check_established(struct inet_timewait_death_row *death_row, struct sock *sk, __u16 lport, struct inet_timewait_sock **twp, - bool rcu_lookup) + bool rcu_lookup, + u32 hash) { struct inet_hashinfo *hinfo = death_row->hashinfo; struct inet_sock *inet = inet_sk(sk); @@ -549,8 +550,6 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, int sdif = l3mdev_master_ifindex_by_index(net, dif); INET_ADDR_COOKIE(acookie, saddr, daddr); const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport); - unsigned int hash = inet_ehashfn(net, daddr, lport, - saddr, inet->inet_dport); struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); struct inet_timewait_sock *tw = NULL; const struct hlist_nulls_node *node; @@ -1007,9 +1006,10 @@ static u32 *table_perturb; int __inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk, u64 port_offset, + u32 hash_port0, int (*check_established)(struct inet_timewait_death_row *, struct sock *, __u16, struct inet_timewait_sock **, - bool rcu_lookup)) + bool rcu_lookup, u32 hash)) { struct inet_hashinfo *hinfo = death_row->hashinfo; struct inet_bind_hashbucket *head, *head2; @@ -1027,7 +1027,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, if (port) { local_bh_disable(); - ret = check_established(death_row, sk, port, NULL, false); + ret = check_established(death_row, sk, port, NULL, false, + hash_port0 + port); local_bh_enable(); return ret; } @@ -1071,7 +1072,8 @@ other_parity_scan: rcu_read_unlock(); goto next_port; } - if (!check_established(death_row, sk, port, &tw, true)) + if (!check_established(death_row, sk, port, &tw, true, + hash_port0 + port)) break; rcu_read_unlock(); goto next_port; @@ -1090,7 +1092,8 @@ other_parity_scan: goto next_port_unlock; WARN_ON(hlist_empty(&tb->bhash2)); if (!check_established(death_row, sk, - port, &tw, false)) + port, &tw, false, + hash_port0 + port)) goto ok; goto next_port_unlock; } @@ -1197,11 +1200,18 @@ error: int inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk) { + const struct inet_sock *inet = inet_sk(sk); + const struct net *net = sock_net(sk); u64 port_offset = 0; + u32 hash_port0; if (!inet_sk(sk)->inet_num) port_offset = inet_sk_port_offset(sk); - return __inet_hash_connect(death_row, sk, port_offset, + + hash_port0 = inet_ehashfn(net, inet->inet_rcv_saddr, 0, + inet->inet_daddr, inet->inet_dport); + + return __inet_hash_connect(death_row, sk, port_offset, hash_port0, __inet_check_established); } EXPORT_SYMBOL_GPL(inet_hash_connect); diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 3d95f1e75a11..76ee521189eb 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -264,7 +264,8 @@ EXPORT_SYMBOL_GPL(inet6_lookup); static int __inet6_check_established(struct inet_timewait_death_row *death_row, struct sock *sk, const __u16 lport, struct inet_timewait_sock **twp, - bool rcu_lookup) + bool rcu_lookup, + u32 hash) { struct inet_hashinfo *hinfo = death_row->hashinfo; struct inet_sock *inet = inet_sk(sk); @@ -274,8 +275,6 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row, struct net *net = sock_net(sk); const int sdif = l3mdev_master_ifindex_by_index(net, dif); const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport); - const unsigned int hash = inet6_ehashfn(net, daddr, lport, saddr, - inet->inet_dport); struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); struct inet_timewait_sock *tw = NULL; const struct hlist_nulls_node *node; @@ -354,11 +353,19 @@ static u64 inet6_sk_port_offset(const struct sock *sk) int inet6_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk) { + const struct in6_addr *daddr = &sk->sk_v6_rcv_saddr; + const struct in6_addr *saddr = &sk->sk_v6_daddr; + const struct inet_sock *inet = inet_sk(sk); + const struct net *net = sock_net(sk); u64 port_offset = 0; + u32 hash_port0; if (!inet_sk(sk)->inet_num) port_offset = inet6_sk_port_offset(sk); - return __inet_hash_connect(death_row, sk, port_offset, + + hash_port0 = inet6_ehashfn(net, daddr, 0, saddr, inet->inet_dport); + + return __inet_hash_connect(death_row, sk, port_offset, hash_port0, __inet6_check_established); } EXPORT_SYMBOL_GPL(inet6_hash_connect); -- cgit v1.2.3 From 0d7336f8f06d4a1a1e2c62624d086561e8490bb7 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Thu, 6 Mar 2025 12:29:28 +0100 Subject: tcp: ulp: diag: more info without CAP_NET_ADMIN When introduced in commit 61723b393292 ("tcp: ulp: add functions to dump ulp-specific information"), the whole ULP diag info has been exported only if the requester had CAP_NET_ADMIN. It looks like not everything is sensitive, and some info can be exported to all users in order to ease the debugging from the userspace side without requiring additional capabilities. Each layer should then decide what can be exposed to everybody. The 'net_admin' boolean is then passed to the different layers. On kTLS side, it looks like there is nothing sensitive there: version, cipher type, tx/rx user config type, plus some flags. So, only some metadata about the configuration, no cryptographic info like keys, etc. Then, everything can be exported to all users. On MPTCP side, that's different. The MPTCP-related sequence numbers per subflow should certainly not be exposed to everybody. For example, the DSS mapping and ssn_offset would give all users on the system access to narrow ranges of values for the subflow TCP sequence numbers and MPTCP-level DSNs, and then ease packet injection. The TCP diag interface doesn't expose the TCP sequence numbers for TCP sockets, so best to do the same here. The rest -- token, IDs, flags -- can be exported to everybody. Acked-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250306-net-next-tcp-ulp-diag-net-admin-v1-2-06afdd860fc9@kernel.org Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 4 ++-- net/ipv4/tcp_diag.c | 8 ++++---- net/mptcp/diag.c | 42 ++++++++++++++++++++++++++---------------- net/tls/tls_main.c | 4 ++-- 4 files changed, 34 insertions(+), 24 deletions(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index a9bc959fb102..7207c52b1fc9 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2598,8 +2598,8 @@ struct tcp_ulp_ops { /* cleanup ulp */ void (*release)(struct sock *sk); /* diagnostic */ - int (*get_info)(struct sock *sk, struct sk_buff *skb); - size_t (*get_info_size)(const struct sock *sk); + int (*get_info)(struct sock *sk, struct sk_buff *skb, bool net_admin); + size_t (*get_info_size)(const struct sock *sk, bool net_admin); /* clone ulp */ void (*clone)(const struct request_sock *req, struct sock *newsk, const gfp_t priority); diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index d8bba37dbffd..45e174b8cd22 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -96,8 +96,8 @@ static int tcp_diag_put_ulp(struct sk_buff *skb, struct sock *sk, if (err) goto nla_failure; - if (net_admin && ulp_ops->get_info) - err = ulp_ops->get_info(sk, skb); + if (ulp_ops->get_info) + err = ulp_ops->get_info(sk, skb, net_admin); if (err) goto nla_failure; @@ -170,8 +170,8 @@ static size_t tcp_diag_get_aux_size(struct sock *sk, bool net_admin) if (ulp_ops) { size += nla_total_size(0) + nla_total_size(TCP_ULP_NAME_MAX); - if (net_admin && ulp_ops->get_info_size) - size += ulp_ops->get_info_size(sk); + if (ulp_ops->get_info_size) + size += ulp_ops->get_info_size(sk, net_admin); } } return size; diff --git a/net/mptcp/diag.c b/net/mptcp/diag.c index 02205f7994d7..70cf9ebce833 100644 --- a/net/mptcp/diag.c +++ b/net/mptcp/diag.c @@ -12,7 +12,7 @@ #include #include "protocol.h" -static int subflow_get_info(struct sock *sk, struct sk_buff *skb) +static int subflow_get_info(struct sock *sk, struct sk_buff *skb, bool net_admin) { struct mptcp_subflow_context *sf; struct nlattr *start; @@ -56,15 +56,6 @@ static int subflow_get_info(struct sock *sk, struct sk_buff *skb) if (nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_TOKEN_REM, sf->remote_token) || nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_TOKEN_LOC, sf->token) || - nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_RELWRITE_SEQ, - sf->rel_write_seq) || - nla_put_u64_64bit(skb, MPTCP_SUBFLOW_ATTR_MAP_SEQ, sf->map_seq, - MPTCP_SUBFLOW_ATTR_PAD) || - nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_MAP_SFSEQ, - sf->map_subflow_seq) || - nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_SSN_OFFSET, sf->ssn_offset) || - nla_put_u16(skb, MPTCP_SUBFLOW_ATTR_MAP_DATALEN, - sf->map_data_len) || nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_FLAGS, flags) || nla_put_u8(skb, MPTCP_SUBFLOW_ATTR_ID_REM, sf->remote_id) || nla_put_u8(skb, MPTCP_SUBFLOW_ATTR_ID_LOC, subflow_get_local_id(sf))) { @@ -72,6 +63,21 @@ static int subflow_get_info(struct sock *sk, struct sk_buff *skb) goto nla_failure; } + /* Only export seq related counters to user with CAP_NET_ADMIN */ + if (net_admin && + (nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_RELWRITE_SEQ, + sf->rel_write_seq) || + nla_put_u64_64bit(skb, MPTCP_SUBFLOW_ATTR_MAP_SEQ, sf->map_seq, + MPTCP_SUBFLOW_ATTR_PAD) || + nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_MAP_SFSEQ, + sf->map_subflow_seq) || + nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_SSN_OFFSET, sf->ssn_offset) || + nla_put_u16(skb, MPTCP_SUBFLOW_ATTR_MAP_DATALEN, + sf->map_data_len))) { + err = -EMSGSIZE; + goto nla_failure; + } + rcu_read_unlock(); unlock_sock_fast(sk, slow); nla_nest_end(skb, start); @@ -84,22 +90,26 @@ nla_failure: return err; } -static size_t subflow_get_info_size(const struct sock *sk) +static size_t subflow_get_info_size(const struct sock *sk, bool net_admin) { size_t size = 0; size += nla_total_size(0) + /* INET_ULP_INFO_MPTCP */ nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_TOKEN_REM */ nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_TOKEN_LOC */ - nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_RELWRITE_SEQ */ - nla_total_size_64bit(8) + /* MPTCP_SUBFLOW_ATTR_MAP_SEQ */ - nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_MAP_SFSEQ */ - nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_SSN_OFFSET */ - nla_total_size(2) + /* MPTCP_SUBFLOW_ATTR_MAP_DATALEN */ nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_FLAGS */ nla_total_size(1) + /* MPTCP_SUBFLOW_ATTR_ID_REM */ nla_total_size(1) + /* MPTCP_SUBFLOW_ATTR_ID_LOC */ 0; + + if (net_admin) + size += nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_RELWRITE_SEQ */ + nla_total_size_64bit(8) + /* MPTCP_SUBFLOW_ATTR_MAP_SEQ */ + nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_MAP_SFSEQ */ + nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_SSN_OFFSET */ + nla_total_size(2) + /* MPTCP_SUBFLOW_ATTR_MAP_DATALEN */ + 0; + return size; } diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 99ca4465f702..cb86b0bf9a53 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -1057,7 +1057,7 @@ static u16 tls_user_config(struct tls_context *ctx, bool tx) return 0; } -static int tls_get_info(struct sock *sk, struct sk_buff *skb) +static int tls_get_info(struct sock *sk, struct sk_buff *skb, bool net_admin) { u16 version, cipher_type; struct tls_context *ctx; @@ -1115,7 +1115,7 @@ nla_failure: return err; } -static size_t tls_get_info_size(const struct sock *sk) +static size_t tls_get_info_size(const struct sock *sk, bool net_admin) { size_t size = 0; -- cgit v1.2.3 From 8ef890df4031121a94407c84659125cbccd3fdbe Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 7 Mar 2025 10:30:06 -0800 Subject: net: move misc netdev_lock flavors to a separate header Move the more esoteric helpers for netdev instance lock to a dedicated header. This avoids growing netdevice.h to infinity and makes rebuilding the kernel much faster (after touching the header with the helpers). The main netdev_lock() / netdev_unlock() functions are used in static inlines in netdevice.h and will probably be used most commonly, so keep them in netdevice.h. Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250307183006.2312761-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/bonding/bond_main.c | 1 + drivers/net/dummy.c | 1 + drivers/net/ethernet/broadcom/bnxt/bnxt.c | 1 + drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c | 1 + drivers/net/ethernet/intel/iavf/iavf_main.c | 1 + drivers/net/ethernet/microsoft/mana/mana_en.c | 1 + drivers/net/geneve.c | 1 + drivers/net/hamradio/bpqether.c | 1 + drivers/net/hyperv/netvsc_drv.c | 1 + drivers/net/ipvlan/ipvlan_main.c | 1 + drivers/net/loopback.c | 1 + drivers/net/macsec.c | 1 + drivers/net/macvlan.c | 1 + drivers/net/netdevsim/netdev.c | 1 + drivers/net/ppp/ppp_generic.c | 1 + drivers/net/team/team_core.c | 1 + drivers/net/veth.c | 1 + drivers/net/vrf.c | 1 + drivers/net/vxlan/vxlan_core.c | 1 + include/linux/netdevice.h | 81 +----------------------- include/net/netdev_lock.h | 89 +++++++++++++++++++++++++++ kernel/bpf/offload.c | 1 + net/8021q/vlan_dev.c | 1 + net/bluetooth/6lowpan.c | 1 + net/bridge/br_device.c | 2 + net/core/dev.c | 1 + net/core/dev.h | 1 + net/core/dev_api.c | 2 + net/core/dev_ioctl.c | 1 + net/core/net-sysfs.c | 1 + net/core/rtnetlink.c | 1 + net/dsa/conduit.c | 1 + net/ethtool/cabletest.c | 1 + net/ethtool/cmis_fw_update.c | 1 + net/ethtool/features.c | 2 + net/ethtool/ioctl.c | 1 + net/ethtool/module.c | 1 + net/ethtool/netlink.c | 1 + net/ethtool/phy.c | 1 + net/ethtool/rss.c | 2 + net/ethtool/tsinfo.c | 1 + net/ieee802154/6lowpan/core.c | 1 + net/ipv4/ip_tunnel.c | 1 + net/ipv6/ip6_gre.c | 1 + net/ipv6/ip6_tunnel.c | 1 + net/ipv6/ip6_vti.c | 1 + net/ipv6/sit.c | 1 + net/l2tp/l2tp_eth.c | 1 + net/sched/sch_api.c | 1 + net/xdp/xsk.c | 1 + net/xdp/xsk_buff_pool.c | 1 + 51 files changed, 143 insertions(+), 80 deletions(-) create mode 100644 include/net/netdev_lock.h (limited to 'include/net') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index cf0b02720dd8..6c95f478ab80 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -90,6 +90,7 @@ #include #endif #include +#include #include #include "bonding_priv.h" diff --git a/drivers/net/dummy.c b/drivers/net/dummy.c index 005d79975f3b..a4938c6a5ebb 100644 --- a/drivers/net/dummy.c +++ b/drivers/net/dummy.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 1a1e6da77777..b09171110ec4 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -54,6 +54,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c index 0caf6e9bccb8..cf2b3ad75c9b 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include "bnxt_hsi.h" #include "bnxt.h" diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 032e1a58af6f..6d7ba4d67a19 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -2,6 +2,7 @@ /* Copyright(c) 2013 - 2018 Intel Corporation. */ #include +#include #include "iavf.h" #include "iavf_ptp.h" diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index 0411a1897f57..2d826077d38c 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -13,6 +13,7 @@ #include #include +#include #include #include diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 2c65f867fd31..66e38ce9cd1d 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #define GENEVE_NETDEV_VER "0.6" diff --git a/drivers/net/hamradio/bpqether.c b/drivers/net/hamradio/bpqether.c index f6b0bfbbc753..0e0fe32d2da4 100644 --- a/drivers/net/hamradio/bpqether.c +++ b/drivers/net/hamradio/bpqether.c @@ -77,6 +77,7 @@ #include #include +#include #include #include diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 9c6501bf27bd..c51b318b8a72 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -29,6 +29,7 @@ #include #include +#include #include #include #include diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index b56144ca2fde..0ed2fd833a5d 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -3,6 +3,7 @@ */ #include +#include #include "ipvlan.h" diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c index 201fddcd3b1e..1fb6ce6843ad 100644 --- a/drivers/net/loopback.c +++ b/drivers/net/loopback.c @@ -54,6 +54,7 @@ #include #include #include +#include #include /* blackhole_netdev - a device used for dsts that are marked expired! diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index 4de5d63fd577..3d315e30ee47 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index 4e9d54be887c..d0dfa6bca6cc 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c index 54d03b0628d2..d71fd2907cc8 100644 --- a/drivers/net/netdevsim/netdev.c +++ b/drivers/net/netdevsim/netdev.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index ca77661688c0..53463767cc43 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/team/team_core.c b/drivers/net/team/team_core.c index fb917560d0a2..d8fc0c79745d 100644 --- a/drivers/net/team/team_core.c +++ b/drivers/net/team/team_core.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 05f5eeef539f..7bb53961c0ea 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -17,6 +17,7 @@ #include #include +#include #include #include #include diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 36cf6191335e..7168b33adadb 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 227d7f5a302a..8c49e903cb3a 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d206c9592b60..9a297757df7e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2630,40 +2630,6 @@ static inline void netdev_for_each_tx_queue(struct net_device *dev, f(dev, &dev->_tx[i], arg); } -static inline int netdev_lock_cmp_fn(const struct lockdep_map *a, - const struct lockdep_map *b) -{ - /* Only lower devices currently grab the instance lock, so no - * real ordering issues can occur. In the near future, only - * hardware devices will grab instance lock which also does not - * involve any ordering. Suppress lockdep ordering warnings - * until (if) we start grabbing instance lock on pure SW - * devices (bond/team/veth/etc). - */ - if (a == b) - return 0; - return -1; -} - -#define netdev_lockdep_set_classes(dev) \ -{ \ - static struct lock_class_key qdisc_tx_busylock_key; \ - static struct lock_class_key qdisc_xmit_lock_key; \ - static struct lock_class_key dev_addr_list_lock_key; \ - static struct lock_class_key dev_instance_lock_key; \ - unsigned int i; \ - \ - (dev)->qdisc_tx_busylock = &qdisc_tx_busylock_key; \ - lockdep_set_class(&(dev)->addr_list_lock, \ - &dev_addr_list_lock_key); \ - lockdep_set_class(&(dev)->lock, \ - &dev_instance_lock_key); \ - lock_set_cmp_fn(&dev->lock, netdev_lock_cmp_fn, NULL); \ - for (i = 0; i < (dev)->num_tx_queues; i++) \ - lockdep_set_class(&(dev)->_tx[i]._xmit_lock, \ - &qdisc_xmit_lock_key); \ -} - u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev); struct netdev_queue *netdev_core_pick_tx(struct net_device *dev, @@ -2765,56 +2731,11 @@ static inline void netdev_lock(struct net_device *dev) mutex_lock(&dev->lock); } -static inline bool netdev_trylock(struct net_device *dev) -{ - return mutex_trylock(&dev->lock); -} - static inline void netdev_unlock(struct net_device *dev) { mutex_unlock(&dev->lock); } - -static inline void netdev_assert_locked(struct net_device *dev) -{ - lockdep_assert_held(&dev->lock); -} - -static inline void netdev_assert_locked_or_invisible(struct net_device *dev) -{ - if (dev->reg_state == NETREG_REGISTERED || - dev->reg_state == NETREG_UNREGISTERING) - netdev_assert_locked(dev); -} - -static inline bool netdev_need_ops_lock(struct net_device *dev) -{ - bool ret = dev->request_ops_lock || !!dev->queue_mgmt_ops; - -#if IS_ENABLED(CONFIG_NET_SHAPER) - ret |= !!dev->netdev_ops->net_shaper_ops; -#endif - - return ret; -} - -static inline void netdev_lock_ops(struct net_device *dev) -{ - if (netdev_need_ops_lock(dev)) - netdev_lock(dev); -} - -static inline void netdev_unlock_ops(struct net_device *dev) -{ - if (netdev_need_ops_lock(dev)) - netdev_unlock(dev); -} - -static inline void netdev_ops_assert_locked(struct net_device *dev) -{ - if (netdev_need_ops_lock(dev)) - lockdep_assert_held(&dev->lock); -} +/* Additional netdev_lock()-related helpers are in net/netdev_lock.h */ void netif_napi_set_irq_locked(struct napi_struct *napi, int irq); diff --git a/include/net/netdev_lock.h b/include/net/netdev_lock.h new file mode 100644 index 000000000000..99631fbd7f54 --- /dev/null +++ b/include/net/netdev_lock.h @@ -0,0 +1,89 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef _NET_NETDEV_LOCK_H +#define _NET_NETDEV_LOCK_H + +#include +#include + +static inline bool netdev_trylock(struct net_device *dev) +{ + return mutex_trylock(&dev->lock); +} + +static inline void netdev_assert_locked(struct net_device *dev) +{ + lockdep_assert_held(&dev->lock); +} + +static inline void netdev_assert_locked_or_invisible(struct net_device *dev) +{ + if (dev->reg_state == NETREG_REGISTERED || + dev->reg_state == NETREG_UNREGISTERING) + netdev_assert_locked(dev); +} + +static inline bool netdev_need_ops_lock(struct net_device *dev) +{ + bool ret = dev->request_ops_lock || !!dev->queue_mgmt_ops; + +#if IS_ENABLED(CONFIG_NET_SHAPER) + ret |= !!dev->netdev_ops->net_shaper_ops; +#endif + + return ret; +} + +static inline void netdev_lock_ops(struct net_device *dev) +{ + if (netdev_need_ops_lock(dev)) + netdev_lock(dev); +} + +static inline void netdev_unlock_ops(struct net_device *dev) +{ + if (netdev_need_ops_lock(dev)) + netdev_unlock(dev); +} + +static inline void netdev_ops_assert_locked(struct net_device *dev) +{ + if (netdev_need_ops_lock(dev)) + lockdep_assert_held(&dev->lock); +} + +static inline int netdev_lock_cmp_fn(const struct lockdep_map *a, + const struct lockdep_map *b) +{ + /* Only lower devices currently grab the instance lock, so no + * real ordering issues can occur. In the near future, only + * hardware devices will grab instance lock which also does not + * involve any ordering. Suppress lockdep ordering warnings + * until (if) we start grabbing instance lock on pure SW + * devices (bond/team/veth/etc). + */ + if (a == b) + return 0; + return -1; +} + +#define netdev_lockdep_set_classes(dev) \ +{ \ + static struct lock_class_key qdisc_tx_busylock_key; \ + static struct lock_class_key qdisc_xmit_lock_key; \ + static struct lock_class_key dev_addr_list_lock_key; \ + static struct lock_class_key dev_instance_lock_key; \ + unsigned int i; \ + \ + (dev)->qdisc_tx_busylock = &qdisc_tx_busylock_key; \ + lockdep_set_class(&(dev)->addr_list_lock, \ + &dev_addr_list_lock_key); \ + lockdep_set_class(&(dev)->lock, \ + &dev_instance_lock_key); \ + lock_set_cmp_fn(&dev->lock, netdev_lock_cmp_fn, NULL); \ + for (i = 0; i < (dev)->num_tx_queues; i++) \ + lockdep_set_class(&(dev)->_tx[i]._xmit_lock, \ + &qdisc_xmit_lock_key); \ +} + +#endif diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 4f707cfe7f10..42ae8d595c2c 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -25,6 +25,7 @@ #include #include #include +#include #include /* Protects offdevs, members of bpf_offload_netdev and offload members diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index ee3283400716..770a4dcf7f63 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -27,6 +27,7 @@ #include #include #include +#include #include "vlan.h" #include "vlanproc.h" diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 50cfec8ccac4..1298c8685bad 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 9d8c72ed01ab..a818fdc22da9 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -16,6 +16,8 @@ #include #include +#include + #include "br_private.h" #define COMMON_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA | \ diff --git a/net/core/dev.c b/net/core/dev.c index a0f75a1d1f5a..1cb134ff7327 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -156,6 +156,7 @@ #include #include #include +#include #include #include #include diff --git a/net/core/dev.h b/net/core/dev.h index b50ca645c086..0ddd3631acb0 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -6,6 +6,7 @@ #include #include #include +#include struct net; struct netlink_ext_ack; diff --git a/net/core/dev_api.c b/net/core/dev_api.c index 655a95fb7baa..1f0e24849bc6 100644 --- a/net/core/dev_api.c +++ b/net/core/dev_api.c @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later + #include +#include #include "dev.h" diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 296e52d1395d..5471cf4fc984 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include "dev.h" diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 02d1d40b47ae..529a0f721268 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 88a352b02bce..90597bf84e3d 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -53,6 +53,7 @@ #include #include #include +#include #include #if IS_ENABLED(CONFIG_IPV6) #include diff --git a/net/dsa/conduit.c b/net/dsa/conduit.c index f21bb2551bed..4ae255cfb23f 100644 --- a/net/dsa/conduit.c +++ b/net/dsa/conduit.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "conduit.h" #include "dsa.h" diff --git a/net/ethtool/cabletest.c b/net/ethtool/cabletest.c index ddcba073321f..0364b8fb577b 100644 --- a/net/ethtool/cabletest.c +++ b/net/ethtool/cabletest.c @@ -2,6 +2,7 @@ #include #include +#include #include "netlink.h" #include "common.h" diff --git a/net/ethtool/cmis_fw_update.c b/net/ethtool/cmis_fw_update.c index 946830af3e7c..df5f344209c4 100644 --- a/net/ethtool/cmis_fw_update.c +++ b/net/ethtool/cmis_fw_update.c @@ -2,6 +2,7 @@ #include #include +#include #include "common.h" #include "module_fw.h" diff --git a/net/ethtool/features.c b/net/ethtool/features.c index ccffd64d5a87..f2217983be2b 100644 --- a/net/ethtool/features.c +++ b/net/ethtool/features.c @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only +#include + #include "netlink.h" #include "common.h" #include "bitset.h" diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 496a2774100c..221639407c72 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include "common.h" diff --git a/net/ethtool/module.c b/net/ethtool/module.c index d3d2e135e45e..4d4e0a82579a 100644 --- a/net/ethtool/module.c +++ b/net/ethtool/module.c @@ -4,6 +4,7 @@ #include #include #include +#include #include "netlink.h" #include "common.h" diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 70834947f474..a163d40c6431 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only +#include #include #include #include diff --git a/net/ethtool/phy.c b/net/ethtool/phy.c index 1a6b725d1f14..1f590e8d75ed 100644 --- a/net/ethtool/phy.c +++ b/net/ethtool/phy.c @@ -9,6 +9,7 @@ #include #include #include +#include struct phy_req_info { struct ethnl_req_info base; diff --git a/net/ethtool/rss.c b/net/ethtool/rss.c index ec41d1d7eefe..6d9b1769896b 100644 --- a/net/ethtool/rss.c +++ b/net/ethtool/rss.c @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only +#include + #include "netlink.h" #include "common.h" diff --git a/net/ethtool/tsinfo.c b/net/ethtool/tsinfo.c index 73b6a89b8731..32204cca24da 100644 --- a/net/ethtool/tsinfo.c +++ b/net/ethtool/tsinfo.c @@ -4,6 +4,7 @@ #include #include #include +#include #include "netlink.h" #include "common.h" diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c index 9a9da74b0a4f..018929563c6b 100644 --- a/net/ieee802154/6lowpan/core.c +++ b/net/ieee802154/6lowpan/core.c @@ -50,6 +50,7 @@ #include #include +#include #include "6lowpan_i.h" diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 4b06dc7e04f2..1024f961ec9a 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index c6ebb6a6d390..957ca98fa70f 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 170a6ac30889..a04dd1bb4b19 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 83c055996fbb..09ec4b0ad7dc 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #define IP6_VTI_HASH_SIZE_SHIFT 5 diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 6f04703fe638..9a0f32acb750 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -51,6 +51,7 @@ #include #include #include +#include #include /* diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index e83691073496..cf0b66f4fb29 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index f5101c2ffc66..abace7665cfe 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -26,6 +26,7 @@ #include #include +#include #include #include #include diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index f864e5d70b40..e5d104ce7b82 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 0e6ca568fdee..14716ad3d7bc 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include #include #include #include -- cgit v1.2.3 From a096a8602f4fee42a18b7537a7467a28c44728af Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sat, 8 Mar 2025 23:03:27 +0200 Subject: wifi: cfg80211: move link reconfig parameters into a struct Add a new struct cfg80211_ml_reconf_req to collect the link reconfiguration parameters. Signed-off-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250308225541.0cf299c1fdd0.Id1a3b1092dc52d0d3731a8798522fdf2e052bf0b@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 15 ++++++++++++--- net/mac80211/cfg.c | 7 +++---- net/mac80211/ieee80211_i.h | 5 ++--- net/mac80211/mlme.c | 39 ++++++++++++++++++++------------------- net/wireless/core.h | 6 +++--- net/wireless/mlme.c | 13 ++++++------- net/wireless/nl80211.c | 24 +++++++++++------------- net/wireless/rdev-ops.h | 10 ++++------ net/wireless/trace.h | 13 ++++++------- 9 files changed, 67 insertions(+), 65 deletions(-) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 6b170a8d086c..c7cfef104896 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -7,7 +7,7 @@ * Copyright 2006-2010 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2024 Intel Corporation + * Copyright (C) 2018-2025 Intel Corporation */ #include @@ -3082,6 +3082,16 @@ struct cfg80211_assoc_link { int error; }; +/** + * struct cfg80211_ml_reconf_req - MLO link reconfiguration request + * @add_links: data for links to add, see &struct cfg80211_assoc_link + * @rem_links: bitmap of links to remove + */ +struct cfg80211_ml_reconf_req { + struct cfg80211_assoc_link add_links[IEEE80211_MLD_MAX_NUM_LINKS]; + u16 rem_links; +}; + /** * enum cfg80211_assoc_req_flags - Over-ride default behaviour in association. * @@ -4972,8 +4982,7 @@ struct cfg80211_ops { struct cfg80211_ttlm_params *params); u32 (*get_radio_mask)(struct wiphy *wiphy, struct net_device *dev); int (*assoc_ml_reconf)(struct wiphy *wiphy, struct net_device *dev, - struct cfg80211_assoc_link *add_links, - u16 rem_links); + struct cfg80211_ml_reconf_req *req); int (*set_epcs)(struct wiphy *wiphy, struct net_device *dev, bool val); }; diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 09708a060bb7..76453c0f82a5 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -5,7 +5,7 @@ * Copyright 2006-2010 Johannes Berg * Copyright 2013-2015 Intel Mobile Communications GmbH * Copyright (C) 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2024 Intel Corporation + * Copyright (C) 2018-2025 Intel Corporation */ #include @@ -5185,14 +5185,13 @@ ieee80211_set_ttlm(struct wiphy *wiphy, struct net_device *dev, static int ieee80211_assoc_ml_reconf(struct wiphy *wiphy, struct net_device *dev, - struct cfg80211_assoc_link *add_links, - u16 rem_links) + struct cfg80211_ml_reconf_req *req) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); lockdep_assert_wiphy(sdata->local->hw.wiphy); - return ieee80211_mgd_assoc_ml_reconf(sdata, add_links, rem_links); + return ieee80211_mgd_assoc_ml_reconf(sdata, req); } static int diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 845888ac3d2c..5f92619f8584 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -5,7 +5,7 @@ * Copyright 2006-2007 Jiri Benc * Copyright 2007-2010 Johannes Berg * Copyright 2013-2015 Intel Mobile Communications GmbH - * Copyright (C) 2018-2024 Intel Corporation + * Copyright (C) 2018-2025 Intel Corporation */ #ifndef IEEE80211_I_H @@ -2789,8 +2789,7 @@ void ieee80211_process_epcs_teardown(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len); int ieee80211_mgd_assoc_ml_reconf(struct ieee80211_sub_if_data *sdata, - struct cfg80211_assoc_link *add_links, - u16 rem_links); + struct cfg80211_ml_reconf_req *req); void ieee80211_process_ml_reconf_resp(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len); diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 23d85a1abbc5..613959f19713 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -10362,8 +10362,7 @@ ieee80211_build_ml_reconf_req(struct ieee80211_sub_if_data *sdata, } int ieee80211_mgd_assoc_ml_reconf(struct ieee80211_sub_if_data *sdata, - struct cfg80211_assoc_link *add_links, - u16 rem_links) + struct cfg80211_ml_reconf_req *req) { struct ieee80211_local *local = sdata->local; struct ieee80211_mgd_assoc_data *data = NULL; @@ -10383,9 +10382,8 @@ int ieee80211_mgd_assoc_ml_reconf(struct ieee80211_sub_if_data *sdata, return -EBUSY; added_links = 0; - for (link_id = 0; add_links && link_id < IEEE80211_MLD_MAX_NUM_LINKS; - link_id++) { - if (!add_links[link_id].bss) + for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { + if (!req->add_links[link_id].bss) continue; added_links |= BIT(link_id); @@ -10413,7 +10411,8 @@ int ieee80211_mgd_assoc_ml_reconf(struct ieee80211_sub_if_data *sdata, for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { struct ieee80211_supported_band *sband; - struct cfg80211_bss *link_cbss = add_links[link_id].bss; + struct cfg80211_bss *link_cbss = + req->add_links[link_id].bss; struct ieee80211_bss *bss; if (!link_cbss) @@ -10443,11 +10442,11 @@ int ieee80211_mgd_assoc_ml_reconf(struct ieee80211_sub_if_data *sdata, data->link[link_id].bss = link_cbss; data->link[link_id].disabled = - add_links[link_id].disabled; + req->add_links[link_id].disabled; data->link[link_id].elems = - (u8 *)add_links[link_id].elems; + (u8 *)req->add_links[link_id].elems; data->link[link_id].elems_len = - add_links[link_id].elems_len; + req->add_links[link_id].elems_len; if (!bss->uapsd_supported) uapsd_supported = false; @@ -10497,10 +10496,11 @@ int ieee80211_mgd_assoc_ml_reconf(struct ieee80211_sub_if_data *sdata, * Section 35.3.6.4 in Draft P802.11be_D7.0 the AP MLD should accept the * link removal request. */ - if (rem_links) { - u16 new_active_links = sdata->vif.active_links & ~rem_links; + if (req->rem_links) { + u16 new_active_links = + sdata->vif.active_links & ~req->rem_links; - new_valid_links = sdata->vif.valid_links & ~rem_links; + new_valid_links = sdata->vif.valid_links & ~req->rem_links; /* Should not be left with no valid links to perform the * ML reconfiguration @@ -10535,14 +10535,15 @@ int ieee80211_mgd_assoc_ml_reconf(struct ieee80211_sub_if_data *sdata, * is expected to send the ML reconfiguration response frame on the link * on which the request was received. */ - skb = ieee80211_build_ml_reconf_req(sdata, data, rem_links); + skb = ieee80211_build_ml_reconf_req(sdata, data, req->rem_links); if (!skb) { err = -ENOMEM; goto err_free; } - if (rem_links) { - u16 new_dormant_links = sdata->vif.dormant_links & ~rem_links; + if (req->rem_links) { + u16 new_dormant_links = + sdata->vif.dormant_links & ~req->rem_links; err = ieee80211_vif_set_links(sdata, new_valid_links, new_dormant_links); @@ -10555,7 +10556,7 @@ int ieee80211_mgd_assoc_ml_reconf(struct ieee80211_sub_if_data *sdata, for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { - if (!(rem_links & BIT(link_id))) + if (!(req->rem_links & BIT(link_id))) continue; ieee80211_sta_remove_link(sta, link_id); @@ -10564,17 +10565,17 @@ int ieee80211_mgd_assoc_ml_reconf(struct ieee80211_sub_if_data *sdata, /* notify the driver and upper layers */ ieee80211_vif_cfg_change_notify(sdata, BSS_CHANGED_MLD_VALID_LINKS); - cfg80211_links_removed(sdata->dev, rem_links); + cfg80211_links_removed(sdata->dev, req->rem_links); } sdata_info(sdata, "mlo: reconf: adding=0x%x, removed=0x%x\n", - added_links, rem_links); + added_links, req->rem_links); ieee80211_tx_skb(sdata, skb); sdata->u.mgd.reconf.added_links = added_links; sdata->u.mgd.reconf.add_links_data = data; - sdata->u.mgd.reconf.removed_links = rem_links; + sdata->u.mgd.reconf.removed_links = req->rem_links; wiphy_delayed_work_queue(sdata->local->hw.wiphy, &sdata->u.mgd.reconf.wk, IEEE80211_ASSOC_TIMEOUT_SHORT); diff --git a/net/wireless/core.h b/net/wireless/core.h index b094b526b05d..c56a35040caa 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -3,7 +3,7 @@ * Wireless configuration interface internals. * * Copyright 2006-2010 Johannes Berg - * Copyright (C) 2018-2024 Intel Corporation + * Copyright (C) 2018-2025 Intel Corporation */ #ifndef __NET_WIRELESS_CORE_H #define __NET_WIRELESS_CORE_H @@ -568,8 +568,8 @@ void cfg80211_wdev_release_link_bsses(struct wireless_dev *wdev, u16 link_mask); int cfg80211_assoc_ml_reconf(struct cfg80211_registered_device *rdev, struct net_device *dev, - struct cfg80211_assoc_link *links, - u16 rem_links); + struct cfg80211_ml_reconf_req *req); + /** * struct cfg80211_colocated_ap - colocated AP information * diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index e10f2b3b4b7f..956d33b219df 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -4,7 +4,7 @@ * * Copyright (c) 2009, Jouni Malinen * Copyright (c) 2015 Intel Deutschland GmbH - * Copyright (C) 2019-2020, 2022-2024 Intel Corporation + * Copyright (C) 2019-2020, 2022-2025 Intel Corporation */ #include @@ -1297,25 +1297,24 @@ void cfg80211_stop_background_radar_detection(struct wireless_dev *wdev) int cfg80211_assoc_ml_reconf(struct cfg80211_registered_device *rdev, struct net_device *dev, - struct cfg80211_assoc_link *links, - u16 rem_links) + struct cfg80211_ml_reconf_req *req) { struct wireless_dev *wdev = dev->ieee80211_ptr; int err; lockdep_assert_wiphy(wdev->wiphy); - err = rdev_assoc_ml_reconf(rdev, dev, links, rem_links); + err = rdev_assoc_ml_reconf(rdev, dev, req); if (!err) { int link_id; for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { - if (!links[link_id].bss) + if (!req->add_links[link_id].bss) continue; - cfg80211_ref_bss(&rdev->wiphy, links[link_id].bss); - cfg80211_hold_bss(bss_from_pub(links[link_id].bss)); + cfg80211_ref_bss(&rdev->wiphy, req->add_links[link_id].bss); + cfg80211_hold_bss(bss_from_pub(req->add_links[link_id].bss)); } } diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 2c4e06610a79..fe706f70ac7e 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -5,7 +5,7 @@ * Copyright 2006-2010 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2024 Intel Corporation + * Copyright (C) 2018-2025 Intel Corporation */ #include @@ -16488,9 +16488,9 @@ static int nl80211_assoc_ml_reconf(struct sk_buff *skb, struct genl_info *info) struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wireless_dev *wdev = dev->ieee80211_ptr; - struct cfg80211_assoc_link links[IEEE80211_MLD_MAX_NUM_LINKS] = {}; + struct cfg80211_ml_reconf_req req = {}; unsigned int link_id; - u16 add_links, rem_links; + u16 add_links; int err; if (!wdev->valid_links) @@ -16506,39 +16506,37 @@ static int nl80211_assoc_ml_reconf(struct sk_buff *skb, struct genl_info *info) add_links = 0; if (info->attrs[NL80211_ATTR_MLO_LINKS]) { - err = nl80211_process_links(rdev, links, NULL, 0, info); + err = nl80211_process_links(rdev, req.add_links, NULL, 0, info); if (err) return err; for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { - if (!links[link_id].bss) + if (!req.add_links[link_id].bss) continue; add_links |= BIT(link_id); } } if (info->attrs[NL80211_ATTR_MLO_RECONF_REM_LINKS]) - rem_links = + req.rem_links = nla_get_u16(info->attrs[NL80211_ATTR_MLO_RECONF_REM_LINKS]); - else - rem_links = 0; /* Validate that existing links are not added, removed links are valid * and don't allow adding and removing the same links */ - if ((add_links & rem_links) || !(add_links | rem_links) || + if ((add_links & req.rem_links) || !(add_links | req.rem_links) || (wdev->valid_links & add_links) || - ((wdev->valid_links & rem_links) != rem_links)) { + ((wdev->valid_links & req.rem_links) != req.rem_links)) { err = -EINVAL; goto out; } - err = cfg80211_assoc_ml_reconf(rdev, dev, links, rem_links); + err = cfg80211_assoc_ml_reconf(rdev, dev, &req); out: - for (link_id = 0; link_id < ARRAY_SIZE(links); link_id++) - cfg80211_put_bss(&rdev->wiphy, links[link_id].bss); + for (link_id = 0; link_id < ARRAY_SIZE(req.add_links); link_id++) + cfg80211_put_bss(&rdev->wiphy, req.add_links[link_id].bss); return err; } diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index 759da1623342..9f4783c2354c 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -2,7 +2,7 @@ /* * Portions of this file * Copyright(c) 2016-2017 Intel Deutschland GmbH - * Copyright (C) 2018, 2021-2024 Intel Corporation + * Copyright (C) 2018, 2021-2025 Intel Corporation */ #ifndef __CFG80211_RDEV_OPS #define __CFG80211_RDEV_OPS @@ -1551,16 +1551,14 @@ rdev_get_radio_mask(struct cfg80211_registered_device *rdev, static inline int rdev_assoc_ml_reconf(struct cfg80211_registered_device *rdev, struct net_device *dev, - struct cfg80211_assoc_link *add_links, - u16 rem_links) + struct cfg80211_ml_reconf_req *req) { struct wiphy *wiphy = &rdev->wiphy; int ret = -EOPNOTSUPP; - trace_rdev_assoc_ml_reconf(wiphy, dev, add_links, rem_links); + trace_rdev_assoc_ml_reconf(wiphy, dev, req); if (rdev->ops->assoc_ml_reconf) - ret = rdev->ops->assoc_ml_reconf(wiphy, dev, add_links, - rem_links); + ret = rdev->ops->assoc_ml_reconf(wiphy, dev, req); trace_rdev_return_int(wiphy, ret); return ret; diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 9aa8081ca454..b82dc7282c20 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -2,7 +2,7 @@ /* * Portions of this file * Copyright(c) 2016-2017 Intel Deutschland GmbH - * Copyright (C) 2018, 2020-2024 Intel Corporation + * Copyright (C) 2018, 2020-2025 Intel Corporation */ #undef TRACE_SYSTEM #define TRACE_SYSTEM cfg80211 @@ -4142,9 +4142,8 @@ TRACE_EVENT(cfg80211_mlo_reconf_add_done, TRACE_EVENT(rdev_assoc_ml_reconf, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, - struct cfg80211_assoc_link *add_links, - u16 rem_links), - TP_ARGS(wiphy, netdev, add_links, rem_links), + struct cfg80211_ml_reconf_req *req), + TP_ARGS(wiphy, netdev, req), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY @@ -4157,9 +4156,9 @@ TRACE_EVENT(rdev_assoc_ml_reconf, u32 i; __entry->add_links = 0; - __entry->rem_links = rem_links; - for (i = 0; add_links && i < IEEE80211_MLD_MAX_NUM_LINKS; i++) - if (add_links[i].bss) + __entry->rem_links = req->rem_links; + for (i = 0; i < IEEE80211_MLD_MAX_NUM_LINKS; i++) + if (req->add_links[i].bss) __entry->add_links |= BIT(i); ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", add_links=0x%x, rem_links=0x%x", -- cgit v1.2.3 From 969241371f06507f922633e42a4c1a8ce00f220e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sat, 8 Mar 2025 23:03:28 +0200 Subject: wifi: cfg80211: allow setting extended MLD capa/ops Some extended MLD capabilities and operations bits (currently the "BTM MLD Recommendataion For Multiple APs Support" bit) may depend on userspace capabilities. Allow userspace to pass the values for this field that it supports to the association and link reconfiguration operations. Signed-off-by: Johannes Berg Reviewed-by: Ilan Peer Link: https://patch.msgid.link/20250308225541.bd52078b5f65.I4dd8f53b0030db7ea87a2e0920989e7e2c7b5345@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 6 ++++++ include/uapi/linux/nl80211.h | 10 +++++++++- net/wireless/nl80211.c | 12 ++++++++++++ net/wireless/trace.h | 4 ++++ 4 files changed, 31 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index c7cfef104896..29b9cf0fe6c8 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -3086,10 +3086,13 @@ struct cfg80211_assoc_link { * struct cfg80211_ml_reconf_req - MLO link reconfiguration request * @add_links: data for links to add, see &struct cfg80211_assoc_link * @rem_links: bitmap of links to remove + * @ext_mld_capa_ops: extended MLD capabilities and operations set by + * userspace for the ML reconfiguration action frame */ struct cfg80211_ml_reconf_req { struct cfg80211_assoc_link add_links[IEEE80211_MLD_MAX_NUM_LINKS]; u16 rem_links; + u16 ext_mld_capa_ops; }; /** @@ -3164,6 +3167,8 @@ enum cfg80211_assoc_req_flags { * the link on which the association request should be sent * @ap_mld_addr: AP MLD address in case of MLO association request, * valid iff @link_id >= 0 + * @ext_mld_capa_ops: extended MLD capabilities and operations set by + * userspace for the association */ struct cfg80211_assoc_request { struct cfg80211_bss *bss; @@ -3184,6 +3189,7 @@ struct cfg80211_assoc_request { struct cfg80211_assoc_link links[IEEE80211_MLD_MAX_NUM_LINKS]; const u8 *ap_mld_addr; s8 link_id; + u16 ext_mld_capa_ops; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 9d8ecf20ef0d..c59075acdb10 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -11,7 +11,7 @@ * Copyright 2008 Jouni Malinen * Copyright 2008 Colin McCabe * Copyright 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2024 Intel Corporation + * Copyright (C) 2018-2025 Intel Corporation * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -2893,6 +2893,12 @@ enum nl80211_commands { * @NL80211_ATTR_EPCS: Flag attribute indicating that EPCS is enabled for a * station interface. * + * @NL80211_ATTR_ASSOC_MLD_EXT_CAPA_OPS: Extended MLD capabilities and + * operations that userspace implements to use during association/ML + * link reconfig, currently only "BTM MLD Recommendation For Multiple + * APs Support". Drivers may set additional flags that they support + * in the kernel or device. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3448,6 +3454,8 @@ enum nl80211_attrs { NL80211_ATTR_MLO_RECONF_REM_LINKS, NL80211_ATTR_EPCS, + NL80211_ATTR_ASSOC_MLD_EXT_CAPA_OPS, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index fe706f70ac7e..e2e8c368fbbf 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -850,6 +850,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { NL80211_MAX_SUPP_SELECTORS), [NL80211_ATTR_MLO_RECONF_REM_LINKS] = { .type = NLA_U16 }, [NL80211_ATTR_EPCS] = { .type = NLA_FLAG }, + [NL80211_ATTR_ASSOC_MLD_EXT_CAPA_OPS] = { .type = NLA_U16 }, }; /* policy for the key attributes */ @@ -11373,6 +11374,10 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) err = -EINVAL; goto free; } + + if (info->attrs[NL80211_ATTR_ASSOC_MLD_EXT_CAPA_OPS]) + req.ext_mld_capa_ops = + nla_get_u16(info->attrs[NL80211_ATTR_ASSOC_MLD_EXT_CAPA_OPS]); } else { if (req.link_id >= 0) return -EINVAL; @@ -11382,6 +11387,9 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) if (IS_ERR(req.bss)) return PTR_ERR(req.bss); ap_addr = req.bss->bssid; + + if (info->attrs[NL80211_ATTR_ASSOC_MLD_EXT_CAPA_OPS]) + return -EINVAL; } err = nl80211_crypto_settings(rdev, info, &req.crypto, 1); @@ -16532,6 +16540,10 @@ static int nl80211_assoc_ml_reconf(struct sk_buff *skb, struct genl_info *info) goto out; } + if (info->attrs[NL80211_ATTR_ASSOC_MLD_EXT_CAPA_OPS]) + req.ext_mld_capa_ops = + nla_get_u16(info->attrs[NL80211_ATTR_ASSOC_MLD_EXT_CAPA_OPS]); + err = cfg80211_assoc_ml_reconf(rdev, dev, &req); out: diff --git a/net/wireless/trace.h b/net/wireless/trace.h index b82dc7282c20..4ed9fada4ec0 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -1378,6 +1378,7 @@ TRACE_EVENT(rdev_assoc, __dynamic_array(u8, fils_kek, req->fils_kek_len) __dynamic_array(u8, fils_nonces, req->fils_nonces ? 2 * FILS_NONCE_LEN : 0) + __field(u16, ext_mld_capa_ops) ), TP_fast_assign( WIPHY_ASSIGN; @@ -1404,6 +1405,7 @@ TRACE_EVENT(rdev_assoc, if (req->fils_nonces) memcpy(__get_dynamic_array(fils_nonces), req->fils_nonces, 2 * FILS_NONCE_LEN); + __entry->ext_mld_capa_ops = req->ext_mld_capa_ops; ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: %pM" ", previous bssid: %pM, use mfp: %s, flags: 0x%x", @@ -4149,6 +4151,7 @@ TRACE_EVENT(rdev_assoc_ml_reconf, NETDEV_ENTRY __field(u16, add_links) __field(u16, rem_links) + __field(u16, ext_mld_capa_ops) ), TP_fast_assign( WIPHY_ASSIGN; @@ -4160,6 +4163,7 @@ TRACE_EVENT(rdev_assoc_ml_reconf, for (i = 0; i < IEEE80211_MLD_MAX_NUM_LINKS; i++) if (req->add_links[i].bss) __entry->add_links |= BIT(i); + __entry->ext_mld_capa_ops = req->ext_mld_capa_ops; ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", add_links=0x%x, rem_links=0x%x", WIPHY_PR_ARG, NETDEV_PR_ARG, -- cgit v1.2.3 From cf4bd1608882792d4742e27a819493312904a680 Mon Sep 17 00:00:00 2001 From: Anjaneyulu Date: Sat, 8 Mar 2025 23:03:30 +0200 Subject: wifi: cfg80211: allow IR in 20 MHz configurations Some regulatory bodies doesn't allow IR (initiate radioation) on a specific subband, but allows it for channels with a bandwidth of 20 MHz. Add a channel flag that indicates that, and consider it in cfg80211_reg_check_beaconing. While on it, fix the kernel doc of enum nl80211_reg_rule_flags and change it to use BIT(). Signed-off-by: Anjaneyulu Co-developed-by: Somashekhar Puttagangaiah Signed-off-by: Somashekhar Puttagangaiah Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250308225541.d3ab352a73ff.I8a8f79e1c9eb74936929463960ee2a324712fe51@changeid [fix typo] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 3 +++ include/uapi/linux/nl80211.h | 52 ++++++++++++++++++++++++-------------------- net/wireless/chan.c | 8 ++++++- net/wireless/nl80211.c | 4 ++++ net/wireless/reg.c | 4 +++- 5 files changed, 46 insertions(+), 25 deletions(-) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 29b9cf0fe6c8..0d423830a8ed 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -127,6 +127,8 @@ struct wiphy; * even if it is otherwise disabled. * @IEEE80211_CHAN_ALLOW_6GHZ_VLP_AP: Allow using this channel for AP operation * with very low power (VLP), even if otherwise set to NO_IR. + * @IEEE80211_CHAN_ALLOW_20MHZ_ACTIVITY: Allow activity on a 20 MHz channel, + * even if otherwise set to NO_IR. */ enum ieee80211_channel_flags { IEEE80211_CHAN_DISABLED = BIT(0), @@ -155,6 +157,7 @@ enum ieee80211_channel_flags { IEEE80211_CHAN_NO_6GHZ_AFC_CLIENT = BIT(23), IEEE80211_CHAN_CAN_MONITOR = BIT(24), IEEE80211_CHAN_ALLOW_6GHZ_VLP_AP = BIT(25), + IEEE80211_CHAN_ALLOW_20MHZ_ACTIVITY = BIT(26), }; #define IEEE80211_CHAN_NO_HT40 \ diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index c59075acdb10..26cae1cc8727 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4335,6 +4335,8 @@ enum nl80211_wmm_rule { * otherwise completely disabled. * @NL80211_FREQUENCY_ATTR_ALLOW_6GHZ_VLP_AP: This channel can be used for a * very low power (VLP) AP, despite being NO_IR. + * @NL80211_FREQUENCY_ATTR_ALLOW_20MHZ_ACTIVITY: This channel can be active in + * 20 MHz bandwidth, despite being NO_IR. * @NL80211_FREQUENCY_ATTR_MAX: highest frequency attribute number * currently defined * @__NL80211_FREQUENCY_ATTR_AFTER_LAST: internal use @@ -4379,6 +4381,7 @@ enum nl80211_frequency_attr { NL80211_FREQUENCY_ATTR_NO_6GHZ_AFC_CLIENT, NL80211_FREQUENCY_ATTR_CAN_MONITOR, NL80211_FREQUENCY_ATTR_ALLOW_6GHZ_VLP_AP, + NL80211_FREQUENCY_ATTR_ALLOW_20MHZ_ACTIVITY, /* keep last */ __NL80211_FREQUENCY_ATTR_AFTER_LAST, @@ -4590,31 +4593,34 @@ enum nl80211_sched_scan_match_attr { * @NL80211_RRF_NO_6GHZ_AFC_CLIENT: Client connection to AFC AP not allowed * @NL80211_RRF_ALLOW_6GHZ_VLP_AP: Very low power (VLP) AP can be permitted * despite NO_IR configuration. + * @NL80211_RRF_ALLOW_20MHZ_ACTIVITY: Allow activity in 20 MHz bandwidth, + * despite NO_IR configuration. */ enum nl80211_reg_rule_flags { - NL80211_RRF_NO_OFDM = 1<<0, - NL80211_RRF_NO_CCK = 1<<1, - NL80211_RRF_NO_INDOOR = 1<<2, - NL80211_RRF_NO_OUTDOOR = 1<<3, - NL80211_RRF_DFS = 1<<4, - NL80211_RRF_PTP_ONLY = 1<<5, - NL80211_RRF_PTMP_ONLY = 1<<6, - NL80211_RRF_NO_IR = 1<<7, - __NL80211_RRF_NO_IBSS = 1<<8, - NL80211_RRF_AUTO_BW = 1<<11, - NL80211_RRF_IR_CONCURRENT = 1<<12, - NL80211_RRF_NO_HT40MINUS = 1<<13, - NL80211_RRF_NO_HT40PLUS = 1<<14, - NL80211_RRF_NO_80MHZ = 1<<15, - NL80211_RRF_NO_160MHZ = 1<<16, - NL80211_RRF_NO_HE = 1<<17, - NL80211_RRF_NO_320MHZ = 1<<18, - NL80211_RRF_NO_EHT = 1<<19, - NL80211_RRF_PSD = 1<<20, - NL80211_RRF_DFS_CONCURRENT = 1<<21, - NL80211_RRF_NO_6GHZ_VLP_CLIENT = 1<<22, - NL80211_RRF_NO_6GHZ_AFC_CLIENT = 1<<23, - NL80211_RRF_ALLOW_6GHZ_VLP_AP = 1<<24, + NL80211_RRF_NO_OFDM = 1 << 0, + NL80211_RRF_NO_CCK = 1 << 1, + NL80211_RRF_NO_INDOOR = 1 << 2, + NL80211_RRF_NO_OUTDOOR = 1 << 3, + NL80211_RRF_DFS = 1 << 4, + NL80211_RRF_PTP_ONLY = 1 << 5, + NL80211_RRF_PTMP_ONLY = 1 << 6, + NL80211_RRF_NO_IR = 1 << 7, + __NL80211_RRF_NO_IBSS = 1 << 8, + NL80211_RRF_AUTO_BW = 1 << 11, + NL80211_RRF_IR_CONCURRENT = 1 << 12, + NL80211_RRF_NO_HT40MINUS = 1 << 13, + NL80211_RRF_NO_HT40PLUS = 1 << 14, + NL80211_RRF_NO_80MHZ = 1 << 15, + NL80211_RRF_NO_160MHZ = 1 << 16, + NL80211_RRF_NO_HE = 1 << 17, + NL80211_RRF_NO_320MHZ = 1 << 18, + NL80211_RRF_NO_EHT = 1 << 19, + NL80211_RRF_PSD = 1 << 20, + NL80211_RRF_DFS_CONCURRENT = 1 << 21, + NL80211_RRF_NO_6GHZ_VLP_CLIENT = 1 << 22, + NL80211_RRF_NO_6GHZ_AFC_CLIENT = 1 << 23, + NL80211_RRF_ALLOW_6GHZ_VLP_AP = 1 << 24, + NL80211_RRF_ALLOW_20MHZ_ACTIVITY = 1 << 25, }; #define NL80211_RRF_PASSIVE_SCAN NL80211_RRF_NO_IR diff --git a/net/wireless/chan.c b/net/wireless/chan.c index 9f918b77b40e..4cdb74a3f38c 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -6,7 +6,7 @@ * * Copyright 2009 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH - * Copyright 2018-2024 Intel Corporation + * Copyright 2018-2025 Intel Corporation */ #include @@ -1497,6 +1497,12 @@ bool cfg80211_reg_check_beaconing(struct wiphy *wiphy, if (cfg->reg_power == IEEE80211_REG_VLP_AP) permitting_flags |= IEEE80211_CHAN_ALLOW_6GHZ_VLP_AP; + if ((cfg->iftype == NL80211_IFTYPE_P2P_GO || + cfg->iftype == NL80211_IFTYPE_AP) && + (chandef->width == NL80211_CHAN_WIDTH_20_NOHT || + chandef->width == NL80211_CHAN_WIDTH_20)) + permitting_flags |= IEEE80211_CHAN_ALLOW_20MHZ_ACTIVITY; + return _cfg80211_reg_can_beacon(wiphy, chandef, cfg->iftype, check_no_ir ? IEEE80211_CHAN_NO_IR : 0, permitting_flags); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index e2e8c368fbbf..aee49d43cf86 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -1235,6 +1235,10 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, struct wiphy *wiphy, if ((chan->flags & IEEE80211_CHAN_ALLOW_6GHZ_VLP_AP) && nla_put_flag(msg, NL80211_FREQUENCY_ATTR_ALLOW_6GHZ_VLP_AP)) goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_ALLOW_20MHZ_ACTIVITY) && + nla_put_flag(msg, + NL80211_FREQUENCY_ATTR_ALLOW_20MHZ_ACTIVITY)) + goto nla_put_failure; } if (nla_put_u32(msg, NL80211_FREQUENCY_ATTR_MAX_TX_POWER, diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 2dd0533e7660..9314f7fcd54b 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -5,7 +5,7 @@ * Copyright 2008-2011 Luis R. Rodriguez * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2017 Intel Deutschland GmbH - * Copyright (C) 2018 - 2024 Intel Corporation + * Copyright (C) 2018 - 2025 Intel Corporation * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -1602,6 +1602,8 @@ static u32 map_regdom_flags(u32 rd_flags) channel_flags |= IEEE80211_CHAN_PSD; if (rd_flags & NL80211_RRF_ALLOW_6GHZ_VLP_AP) channel_flags |= IEEE80211_CHAN_ALLOW_6GHZ_VLP_AP; + if (rd_flags & NL80211_RRF_ALLOW_20MHZ_ACTIVITY) + channel_flags |= IEEE80211_CHAN_ALLOW_20MHZ_ACTIVITY; return channel_flags; } -- cgit v1.2.3 From cf12d3d71e72330b2c0bba467cc5bdea2232347e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sat, 8 Mar 2025 23:03:32 +0200 Subject: wifi: cfg80211: improve supported_selector documentation Improve the documentation for supported BSS selectors to make it more precise. Signed-off-by: Johannes Berg Reviewed-by: Ilan Peer Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250308225541.ba402ff47314.I502b56111b62ea0be174ae76bd03684ae1d4aefb@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 4 ++-- include/uapi/linux/nl80211.h | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 0d423830a8ed..c61894c1265a 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -3148,10 +3148,10 @@ enum cfg80211_assoc_req_flags { * included in the Current AP address field of the Reassociation Request * frame. * @flags: See &enum cfg80211_assoc_req_flags - * @supported_selectors: supported selectors in IEEE 802.11 format + * @supported_selectors: supported BSS selectors in IEEE 802.11 format * (or %NULL for no change). * If %NULL, then support for SAE_H2E should be assumed. - * @supported_selectors_len: Length of supported_selectors in octets. + * @supported_selectors_len: number of supported BSS selectors * @ht_capa: HT Capabilities over-rides. Values set in ht_capa_mask * will be used in ht_capa. Un-supported values will be ignored. * @ht_capa_mask: The bits of ht_capa which are to be used. diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 26cae1cc8727..ddcc4cda74af 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2881,9 +2881,9 @@ enum nl80211_commands { * @NL80211_ATTR_VIF_RADIO_MASK: Bitmask of allowed radios (u32). * A value of 0 means all radios. * - * @NL80211_ATTR_SUPPORTED_SELECTORS: supported selectors, array of - * supported selectors as defined by IEEE 802.11 7.3.2.2 but without the - * length restriction (at most %NL80211_MAX_SUPP_SELECTORS). + * @NL80211_ATTR_SUPPORTED_SELECTORS: supported BSS Membership Selectors, array + * of supported selectors as defined by IEEE Std 802.11-2020 9.4.2.3 but + * without the length restriction (at most %NL80211_MAX_SUPP_SELECTORS). * This can be used to provide a list of selectors that are implemented * by the supplicant. If not given, support for SAE_H2E is assumed. * -- cgit v1.2.3 From e16caea70610ed4226034dfcdaa5c43b36ff9e0a Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Sat, 8 Mar 2025 23:03:37 +0200 Subject: wifi: cfg80211: Update the link address when a link is added When links are added, update the wireless device link addresses based on the information provided by the driver. Signed-off-by: Ilan Peer Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250308225541.d694a9125aba.I79b010ea9aab47893e4f22c266362fde30b7f9ac@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 1 + net/wireless/mlme.c | 4 ++++ 2 files changed, 5 insertions(+) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index c61894c1265a..73f0e75cc814 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -9770,6 +9770,7 @@ struct cfg80211_mlo_reconf_done_data { u16 added_links; struct { struct cfg80211_bss *bss; + u8 *addr; } links[IEEE80211_MLD_MAX_NUM_LINKS]; }; diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index 956d33b219df..05d44a443518 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -1360,6 +1360,10 @@ void cfg80211_mlo_reconf_add_done(struct net_device *dev, if (data->added_links & BIT(link_id)) { wdev->links[link_id].client.current_bss = bss_from_pub(bss); + + memcpy(wdev->links[link_id].addr, + data->links[link_id].addr, + ETH_ALEN); } else { cfg80211_unhold_bss(bss_from_pub(bss)); cfg80211_put_bss(wiphy, bss); -- cgit v1.2.3 From b5c1622762f0937a66b69b7f15466c28fe85dcf1 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 11 Mar 2025 12:25:34 +0100 Subject: wifi: cfg80211: expose cfg80211_chandef_get_width() This can be just a trivial inline, to simplify some code. Expose it, and also use it in util.c where it wasn't previously available. Reviewed-by: Miriam Rachel Korenblit Link: https://patch.msgid.link/20250311122534.c5c3b4af9a74.Ib25cf60f634dc359961182113214e5cdc3504e9c@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 11 +++++++++++ net/wireless/chan.c | 5 ----- net/wireless/util.c | 4 ++-- 3 files changed, 13 insertions(+), 7 deletions(-) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 73f0e75cc814..efbd79c67be2 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1008,6 +1008,17 @@ cfg80211_chandef_compatible(const struct cfg80211_chan_def *chandef1, */ int nl80211_chan_width_to_mhz(enum nl80211_chan_width chan_width); +/** + * cfg80211_chandef_get_width - return chandef width in MHz + * @c: chandef to return bandwidth for + * Return: channel width in MHz for the given chandef; note that it returns + * 80 for 80+80 configurations + */ +static inline int cfg80211_chandef_get_width(const struct cfg80211_chan_def *c) +{ + return nl80211_chan_width_to_mhz(c->width); +} + /** * cfg80211_chandef_valid - check if a channel definition is valid * @chandef: the channel definition to check diff --git a/net/wireless/chan.c b/net/wireless/chan.c index 4cdb74a3f38c..193734b7f9dc 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -55,11 +55,6 @@ void cfg80211_chandef_create(struct cfg80211_chan_def *chandef, } EXPORT_SYMBOL(cfg80211_chandef_create); -static int cfg80211_chandef_get_width(const struct cfg80211_chan_def *c) -{ - return nl80211_chan_width_to_mhz(c->width); -} - static u32 cfg80211_get_start_freq(const struct cfg80211_chan_def *chandef, u32 cf) { diff --git a/net/wireless/util.c b/net/wireless/util.c index 60157943d351..ed868c0f7ca8 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -5,7 +5,7 @@ * Copyright 2007-2009 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2017 Intel Deutschland GmbH - * Copyright (C) 2018-2023 Intel Corporation + * Copyright (C) 2018-2023, 2025 Intel Corporation */ #include #include @@ -2908,7 +2908,7 @@ bool cfg80211_radio_chandef_valid(const struct wiphy_radio *radio, u32 freq, width; freq = ieee80211_chandef_to_khz(chandef); - width = nl80211_chan_width_to_mhz(chandef->width); + width = cfg80211_chandef_get_width(chandef); if (!ieee80211_radio_freq_range_valid(radio, freq, width)) return false; -- cgit v1.2.3 From b6b67141d6f1f736b17aca87e5ecb43b7c3a8205 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 11 Mar 2025 07:40:24 -0700 Subject: net: create netdev_nl_sock to wrap bindings list No functional changes. Next patches will add more granular locking to netdev_nl_sock. Signed-off-by: Stanislav Fomichev Reviewed-by: Mina Almasry Link: https://patch.msgid.link/20250311144026.4154277-2-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/netdev.yaml | 4 ++-- include/net/netdev_netlink.h | 11 +++++++++++ net/core/netdev-genl-gen.c | 4 ++-- net/core/netdev-genl-gen.h | 6 +++--- net/core/netdev-genl.c | 19 +++++++++---------- 5 files changed, 27 insertions(+), 17 deletions(-) create mode 100644 include/net/netdev_netlink.h (limited to 'include/net') diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml index 36f1152bfac3..f5e0750ab71d 100644 --- a/Documentation/netlink/specs/netdev.yaml +++ b/Documentation/netlink/specs/netdev.yaml @@ -745,8 +745,8 @@ operations: - irq-suspend-timeout kernel-family: - headers: [ "linux/list.h"] - sock-priv: struct list_head + headers: [ "net/netdev_netlink.h"] + sock-priv: struct netdev_nl_sock mcast-groups: list: diff --git a/include/net/netdev_netlink.h b/include/net/netdev_netlink.h new file mode 100644 index 000000000000..1599573d35c9 --- /dev/null +++ b/include/net/netdev_netlink.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __NET_NETDEV_NETLINK_H +#define __NET_NETDEV_NETLINK_H + +#include + +struct netdev_nl_sock { + struct list_head bindings; +}; + +#endif /* __NET_NETDEV_NETLINK_H */ diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c index 996ac6a449eb..739f7b6506a6 100644 --- a/net/core/netdev-genl-gen.c +++ b/net/core/netdev-genl-gen.c @@ -9,7 +9,7 @@ #include "netdev-genl-gen.h" #include -#include +#include /* Integer value ranges */ static const struct netlink_range_validation netdev_a_page_pool_id_range = { @@ -217,7 +217,7 @@ struct genl_family netdev_nl_family __ro_after_init = { .n_split_ops = ARRAY_SIZE(netdev_nl_ops), .mcgrps = netdev_nl_mcgrps, .n_mcgrps = ARRAY_SIZE(netdev_nl_mcgrps), - .sock_priv_size = sizeof(struct list_head), + .sock_priv_size = sizeof(struct netdev_nl_sock), .sock_priv_init = __netdev_nl_sock_priv_init, .sock_priv_destroy = __netdev_nl_sock_priv_destroy, }; diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h index e09dd7539ff2..17d39fd64c94 100644 --- a/net/core/netdev-genl-gen.h +++ b/net/core/netdev-genl-gen.h @@ -10,7 +10,7 @@ #include #include -#include +#include /* Common nested types */ extern const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1]; @@ -42,7 +42,7 @@ enum { extern struct genl_family netdev_nl_family; -void netdev_nl_sock_priv_init(struct list_head *priv); -void netdev_nl_sock_priv_destroy(struct list_head *priv); +void netdev_nl_sock_priv_init(struct netdev_nl_sock *priv); +void netdev_nl_sock_priv_destroy(struct netdev_nl_sock *priv); #endif /* _LINUX_NETDEV_GEN_H */ diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 2b774183d31c..a219be90c739 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -829,8 +829,8 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info) { struct nlattr *tb[ARRAY_SIZE(netdev_queue_id_nl_policy)]; struct net_devmem_dmabuf_binding *binding; - struct list_head *sock_binding_list; u32 ifindex, dmabuf_fd, rxq_idx; + struct netdev_nl_sock *priv; struct net_device *netdev; struct sk_buff *rsp; struct nlattr *attr; @@ -845,10 +845,9 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info) ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]); dmabuf_fd = nla_get_u32(info->attrs[NETDEV_A_DMABUF_FD]); - sock_binding_list = genl_sk_priv_get(&netdev_nl_family, - NETLINK_CB(skb).sk); - if (IS_ERR(sock_binding_list)) - return PTR_ERR(sock_binding_list); + priv = genl_sk_priv_get(&netdev_nl_family, NETLINK_CB(skb).sk); + if (IS_ERR(priv)) + return PTR_ERR(priv); rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!rsp) @@ -909,7 +908,7 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info) goto err_unbind; } - list_add(&binding->list, sock_binding_list); + list_add(&binding->list, &priv->bindings); nla_put_u32(rsp, NETDEV_A_DMABUF_ID, binding->id); genlmsg_end(rsp, hdr); @@ -931,17 +930,17 @@ err_genlmsg_free: return err; } -void netdev_nl_sock_priv_init(struct list_head *priv) +void netdev_nl_sock_priv_init(struct netdev_nl_sock *priv) { - INIT_LIST_HEAD(priv); + INIT_LIST_HEAD(&priv->bindings); } -void netdev_nl_sock_priv_destroy(struct list_head *priv) +void netdev_nl_sock_priv_destroy(struct netdev_nl_sock *priv) { struct net_devmem_dmabuf_binding *binding; struct net_devmem_dmabuf_binding *temp; - list_for_each_entry_safe(binding, temp, priv, list) { + list_for_each_entry_safe(binding, temp, &priv->bindings, list) { rtnl_lock(); net_devmem_unbind_dmabuf(binding); rtnl_unlock(); -- cgit v1.2.3 From 10eef096be25f3811ada2b43b108d1b8d8170001 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 11 Mar 2025 07:40:25 -0700 Subject: net: add granular lock for the netdev netlink socket As we move away from rtnl_lock for queue ops, introduce per-netdev_nl_sock lock. Signed-off-by: Stanislav Fomichev Reviewed-by: Mina Almasry Link: https://patch.msgid.link/20250311144026.4154277-3-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- include/net/netdev_netlink.h | 1 + net/core/netdev-genl.c | 6 ++++++ 2 files changed, 7 insertions(+) (limited to 'include/net') diff --git a/include/net/netdev_netlink.h b/include/net/netdev_netlink.h index 1599573d35c9..075962dbe743 100644 --- a/include/net/netdev_netlink.h +++ b/include/net/netdev_netlink.h @@ -5,6 +5,7 @@ #include struct netdev_nl_sock { + struct mutex lock; struct list_head bindings; }; diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index a219be90c739..63e10717efc5 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -859,6 +859,7 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info) goto err_genlmsg_free; } + mutex_lock(&priv->lock); rtnl_lock(); netdev = __dev_get_by_index(genl_info_net(info), ifindex); @@ -918,6 +919,7 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info) goto err_unbind; rtnl_unlock(); + mutex_unlock(&priv->lock); return 0; @@ -925,6 +927,7 @@ err_unbind: net_devmem_unbind_dmabuf(binding); err_unlock: rtnl_unlock(); + mutex_unlock(&priv->lock); err_genlmsg_free: nlmsg_free(rsp); return err; @@ -933,6 +936,7 @@ err_genlmsg_free: void netdev_nl_sock_priv_init(struct netdev_nl_sock *priv) { INIT_LIST_HEAD(&priv->bindings); + mutex_init(&priv->lock); } void netdev_nl_sock_priv_destroy(struct netdev_nl_sock *priv) @@ -940,11 +944,13 @@ void netdev_nl_sock_priv_destroy(struct netdev_nl_sock *priv) struct net_devmem_dmabuf_binding *binding; struct net_devmem_dmabuf_binding *temp; + mutex_lock(&priv->lock); list_for_each_entry_safe(binding, temp, &priv->bindings, list) { rtnl_lock(); net_devmem_unbind_dmabuf(binding); rtnl_unlock(); } + mutex_unlock(&priv->lock); } static int netdev_genl_netdevice_event(struct notifier_block *nb, -- cgit v1.2.3 From 0114a91da67263c56612ff19a6aa0992466e7756 Mon Sep 17 00:00:00 2001 From: Chia-Yu Chang Date: Wed, 5 Mar 2025 23:38:43 +0100 Subject: tcp: use BIT() macro in include/net/tcp.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use BIT() macro for TCP flags field and TCP congestion control flags that will be used by the congestion control algorithm. No functional changes. Signed-off-by: Chia-Yu Chang Reviewed-by: Ilpo Järvinen Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 7207c52b1fc9..d2315d69e019 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -934,14 +935,14 @@ static inline u32 tcp_rsk_tsval(const struct tcp_request_sock *treq) #define tcp_flag_byte(th) (((u_int8_t *)th)[13]) -#define TCPHDR_FIN 0x01 -#define TCPHDR_SYN 0x02 -#define TCPHDR_RST 0x04 -#define TCPHDR_PSH 0x08 -#define TCPHDR_ACK 0x10 -#define TCPHDR_URG 0x20 -#define TCPHDR_ECE 0x40 -#define TCPHDR_CWR 0x80 +#define TCPHDR_FIN BIT(0) +#define TCPHDR_SYN BIT(1) +#define TCPHDR_RST BIT(2) +#define TCPHDR_PSH BIT(3) +#define TCPHDR_ACK BIT(4) +#define TCPHDR_URG BIT(5) +#define TCPHDR_ECE BIT(6) +#define TCPHDR_CWR BIT(7) #define TCPHDR_SYN_ECN (TCPHDR_SYN | TCPHDR_ECE | TCPHDR_CWR) @@ -1132,9 +1133,9 @@ enum tcp_ca_ack_event_flags { #define TCP_CA_UNSPEC 0 /* Algorithm can be set on socket without CAP_NET_ADMIN privileges */ -#define TCP_CONG_NON_RESTRICTED 0x1 +#define TCP_CONG_NON_RESTRICTED BIT(0) /* Requires ECN/ECT set on all packets */ -#define TCP_CONG_NEEDS_ECN 0x2 +#define TCP_CONG_NEEDS_ECN BIT(1) #define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN) union tcp_cc_info; -- cgit v1.2.3 From 2c2f08d31d2f6cd5621b121d596986bd54da679a Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Wed, 5 Mar 2025 23:38:44 +0100 Subject: tcp: extend TCP flags to allow AE bit/ACE field MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With AccECN, there's one additional TCP flag to be used (AE) and ACE field that overloads the definition of AE, CWR, and ECE flags. As tcp_flags was previously only 1 byte, the byte-order stuff needs to be added to it's handling. Signed-off-by: Ilpo Järvinen Signed-off-by: Chia-Yu Chang Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 11 +++++++++-- include/uapi/linux/tcp.h | 9 ++++++--- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/tcp_output.c | 8 ++++---- net/ipv6/tcp_ipv6.c | 2 +- net/netfilter/nf_log_syslog.c | 8 +++++--- 6 files changed, 26 insertions(+), 14 deletions(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index d2315d69e019..4e11bdc86470 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -943,7 +943,14 @@ static inline u32 tcp_rsk_tsval(const struct tcp_request_sock *treq) #define TCPHDR_URG BIT(5) #define TCPHDR_ECE BIT(6) #define TCPHDR_CWR BIT(7) - +#define TCPHDR_AE BIT(8) +#define TCPHDR_FLAGS_MASK (TCPHDR_FIN | TCPHDR_SYN | TCPHDR_RST | \ + TCPHDR_PSH | TCPHDR_ACK | TCPHDR_URG | \ + TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE) +#define tcp_flags_ntohs(th) (ntohs(*(__be16 *)&tcp_flag_word(th)) & \ + TCPHDR_FLAGS_MASK) + +#define TCPHDR_ACE (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE) #define TCPHDR_SYN_ECN (TCPHDR_SYN | TCPHDR_ECE | TCPHDR_CWR) /* State flags for sacked in struct tcp_skb_cb */ @@ -978,7 +985,7 @@ struct tcp_skb_cb { u16 tcp_gso_size; }; }; - __u8 tcp_flags; /* TCP header flags. (tcp[13]) */ + __u16 tcp_flags; /* TCP header flags (tcp[12-13])*/ __u8 sacked; /* State flags for SACK. */ __u8 ip_dsfield; /* IPv4 tos or IPv6 dsfield */ diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 32a27b4a5020..92a2e79222ea 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -28,7 +28,8 @@ struct tcphdr { __be32 seq; __be32 ack_seq; #if defined(__LITTLE_ENDIAN_BITFIELD) - __u16 res1:4, + __u16 ae:1, + res1:3, doff:4, fin:1, syn:1, @@ -40,7 +41,8 @@ struct tcphdr { cwr:1; #elif defined(__BIG_ENDIAN_BITFIELD) __u16 doff:4, - res1:4, + res1:3, + ae:1, cwr:1, ece:1, urg:1, @@ -70,6 +72,7 @@ union tcp_word_hdr { #define tcp_flag_word(tp) (((union tcp_word_hdr *)(tp))->words[3]) enum { + TCP_FLAG_AE = __constant_cpu_to_be32(0x01000000), TCP_FLAG_CWR = __constant_cpu_to_be32(0x00800000), TCP_FLAG_ECE = __constant_cpu_to_be32(0x00400000), TCP_FLAG_URG = __constant_cpu_to_be32(0x00200000), @@ -78,7 +81,7 @@ enum { TCP_FLAG_RST = __constant_cpu_to_be32(0x00040000), TCP_FLAG_SYN = __constant_cpu_to_be32(0x00020000), TCP_FLAG_FIN = __constant_cpu_to_be32(0x00010000), - TCP_RESERVED_BITS = __constant_cpu_to_be32(0x0F000000), + TCP_RESERVED_BITS = __constant_cpu_to_be32(0x0E000000), TCP_DATA_OFFSET = __constant_cpu_to_be32(0xF0000000) }; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index d9405b012dff..fab684221bf7 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2159,7 +2159,7 @@ static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + skb->len - th->doff * 4); TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); - TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); + TCP_SKB_CB(skb)->tcp_flags = tcp_flags_ntohs(th); TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); TCP_SKB_CB(skb)->sacked = 0; TCP_SKB_CB(skb)->has_rxtstamp = diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 24e56bf96747..efd3cb5e1ded 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -403,7 +403,7 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, /* Constructs common control bits of non-data skb. If SYN/FIN is present, * auto increment end seqno. */ -static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) +static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u16 flags) { skb->ip_summed = CHECKSUM_PARTIAL; @@ -1395,7 +1395,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, th->seq = htonl(tcb->seq); th->ack_seq = htonl(rcv_nxt); *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | - tcb->tcp_flags); + (tcb->tcp_flags & TCPHDR_FLAGS_MASK)); th->check = 0; th->urg_ptr = 0; @@ -1616,8 +1616,8 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, struct sk_buff *buff; int old_factor; long limit; + u16 flags; int nlen; - u8 flags; if (WARN_ON(len > skb->len)) return -EINVAL; @@ -2171,7 +2171,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, { int nlen = skb->len - len; struct sk_buff *buff; - u8 flags; + u16 flags; /* All of a TSO frame must be composed of paged data. */ DEBUG_NET_WARN_ON_ONCE(skb->len != skb->data_len); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 85c4820bfe15..a2fcc317a88e 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1731,7 +1731,7 @@ static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr, TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + skb->len - th->doff*4); TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); - TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); + TCP_SKB_CB(skb)->tcp_flags = tcp_flags_ntohs(th); TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr); TCP_SKB_CB(skb)->sacked = 0; TCP_SKB_CB(skb)->has_rxtstamp = diff --git a/net/netfilter/nf_log_syslog.c b/net/netfilter/nf_log_syslog.c index 58402226045e..86d5fc5d28e3 100644 --- a/net/netfilter/nf_log_syslog.c +++ b/net/netfilter/nf_log_syslog.c @@ -216,7 +216,9 @@ nf_log_dump_tcp_header(struct nf_log_buf *m, /* Max length: 9 "RES=0x3C " */ nf_log_buf_add(m, "RES=0x%02x ", (u_int8_t)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22)); - /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */ + /* Max length: 35 "AE CWR ECE URG ACK PSH RST SYN FIN " */ + if (th->ae) + nf_log_buf_add(m, "AE "); if (th->cwr) nf_log_buf_add(m, "CWR "); if (th->ece) @@ -516,7 +518,7 @@ dump_ipv4_packet(struct net *net, struct nf_log_buf *m, /* Proto Max log string length */ /* IP: 40+46+6+11+127 = 230 */ - /* TCP: 10+max(25,20+30+13+9+32+11+127) = 252 */ + /* TCP: 10+max(25,20+30+13+9+35+11+127) = 255 */ /* UDP: 10+max(25,20) = 35 */ /* UDPLITE: 14+max(25,20) = 39 */ /* ICMP: 11+max(25, 18+25+max(19,14,24+3+n+10,3+n+10)) = 91+n */ @@ -526,7 +528,7 @@ dump_ipv4_packet(struct net *net, struct nf_log_buf *m, /* (ICMP allows recursion one level deep) */ /* maxlen = IP + ICMP + IP + max(TCP,UDP,ICMP,unknown) */ - /* maxlen = 230+ 91 + 230 + 252 = 803 */ + /* maxlen = 230+ 91 + 230 + 255 = 806 */ } static noinline_for_stack void -- cgit v1.2.3 From 041fb11d518f5d25d323766bc0f59b09bba314bc Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Wed, 5 Mar 2025 23:38:47 +0100 Subject: tcp: helpers for ECN mode handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Create helpers for TCP ECN modes. No functional changes. Signed-off-by: Ilpo Järvinen Signed-off-by: Chia-Yu Chang Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 44 ++++++++++++++++++++++++++++++++++++++++---- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_dctcp.c | 2 +- net/ipv4/tcp_input.c | 14 +++++++------- net/ipv4/tcp_minisocks.c | 4 +++- net/ipv4/tcp_output.c | 6 +++--- 6 files changed, 55 insertions(+), 17 deletions(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 4e11bdc86470..5520f2d62dc0 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -374,10 +374,46 @@ static inline void tcp_dec_quickack_mode(struct sock *sk) } } -#define TCP_ECN_OK 1 -#define TCP_ECN_QUEUE_CWR 2 -#define TCP_ECN_DEMAND_CWR 4 -#define TCP_ECN_SEEN 8 +#define TCP_ECN_MODE_RFC3168 BIT(0) +#define TCP_ECN_QUEUE_CWR BIT(1) +#define TCP_ECN_DEMAND_CWR BIT(2) +#define TCP_ECN_SEEN BIT(3) +#define TCP_ECN_MODE_ACCECN BIT(4) + +#define TCP_ECN_DISABLED 0 +#define TCP_ECN_MODE_PENDING (TCP_ECN_MODE_RFC3168 | TCP_ECN_MODE_ACCECN) +#define TCP_ECN_MODE_ANY (TCP_ECN_MODE_RFC3168 | TCP_ECN_MODE_ACCECN) + +static inline bool tcp_ecn_mode_any(const struct tcp_sock *tp) +{ + return tp->ecn_flags & TCP_ECN_MODE_ANY; +} + +static inline bool tcp_ecn_mode_rfc3168(const struct tcp_sock *tp) +{ + return (tp->ecn_flags & TCP_ECN_MODE_ANY) == TCP_ECN_MODE_RFC3168; +} + +static inline bool tcp_ecn_mode_accecn(const struct tcp_sock *tp) +{ + return (tp->ecn_flags & TCP_ECN_MODE_ANY) == TCP_ECN_MODE_ACCECN; +} + +static inline bool tcp_ecn_disabled(const struct tcp_sock *tp) +{ + return !tcp_ecn_mode_any(tp); +} + +static inline bool tcp_ecn_mode_pending(const struct tcp_sock *tp) +{ + return (tp->ecn_flags & TCP_ECN_MODE_PENDING) == TCP_ECN_MODE_PENDING; +} + +static inline void tcp_ecn_mode_set(struct tcp_sock *tp, u8 mode) +{ + tp->ecn_flags &= ~TCP_ECN_MODE_ANY; + tp->ecn_flags |= mode; +} enum tcp_tw_status { TCP_TW_SUCCESS = 0, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 46951e749308..989c3c3d8e75 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -4138,7 +4138,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale; } - if (tp->ecn_flags & TCP_ECN_OK) + if (tcp_ecn_mode_any(tp)) info->tcpi_options |= TCPI_OPT_ECN; if (tp->ecn_flags & TCP_ECN_SEEN) info->tcpi_options |= TCPI_OPT_ECN_SEEN; diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c index 8a45a4aea933..03abe0848420 100644 --- a/net/ipv4/tcp_dctcp.c +++ b/net/ipv4/tcp_dctcp.c @@ -90,7 +90,7 @@ __bpf_kfunc static void dctcp_init(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); - if ((tp->ecn_flags & TCP_ECN_OK) || + if (tcp_ecn_mode_any(tp) || (sk->sk_state == TCP_LISTEN || sk->sk_state == TCP_CLOSE)) { struct dctcp *ca = inet_csk_ca(sk); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 769048b559e5..5c270cf96678 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -342,7 +342,7 @@ static bool tcp_in_quickack_mode(struct sock *sk) static void tcp_ecn_queue_cwr(struct tcp_sock *tp) { - if (tp->ecn_flags & TCP_ECN_OK) + if (tcp_ecn_mode_rfc3168(tp)) tp->ecn_flags |= TCP_ECN_QUEUE_CWR; } @@ -369,7 +369,7 @@ static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); - if (!(tcp_sk(sk)->ecn_flags & TCP_ECN_OK)) + if (tcp_ecn_disabled(tp)) return; switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) { @@ -402,19 +402,19 @@ static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb) static void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th) { - if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr)) - tp->ecn_flags &= ~TCP_ECN_OK; + if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || th->cwr)) + tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); } static void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th) { - if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr)) - tp->ecn_flags &= ~TCP_ECN_OK; + if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || !th->cwr)) + tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); } static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th) { - if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK)) + if (th->ece && !th->syn && tcp_ecn_mode_rfc3168(tp)) return true; return false; } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 3cb8f281186b..0ae24add155b 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -461,7 +461,9 @@ void tcp_openreq_init_rwin(struct request_sock *req, static void tcp_ecn_openreq_child(struct tcp_sock *tp, const struct request_sock *req) { - tp->ecn_flags = inet_rsk(req)->ecn_ok ? TCP_ECN_OK : 0; + tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ? + TCP_ECN_MODE_RFC3168 : + TCP_ECN_DISABLED); } void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index efd3cb5e1ded..fffbd3ee44ed 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -325,7 +325,7 @@ static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb) const struct tcp_sock *tp = tcp_sk(sk); TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR; - if (!(tp->ecn_flags & TCP_ECN_OK)) + if (tcp_ecn_disabled(tp)) TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE; else if (tcp_ca_needs_ecn(sk) || tcp_bpf_ca_needs_ecn(sk)) @@ -351,7 +351,7 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) if (use_ecn) { TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; - tp->ecn_flags = TCP_ECN_OK; + tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn) INET_ECN_xmit(sk); } @@ -381,7 +381,7 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, { struct tcp_sock *tp = tcp_sk(sk); - if (tp->ecn_flags & TCP_ECN_OK) { + if (tcp_ecn_mode_rfc3168(tp)) { /* Not-retransmitted data segment: set ECT and inject CWR. */ if (skb->len != tcp_header_len && !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) { -- cgit v1.2.3 From 4618e195f9251947a5f15f7e5533bcc3a19b93ef Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Wed, 5 Mar 2025 23:38:51 +0100 Subject: tcp: add new TCP_TW_ACK_OOW state and allow ECN bits in TOS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ECN bits in TOS are always cleared when sending in ACKs in TW. Clearing them is problematic for TCP flows that used Accurate ECN because ECN bits decide which service queue the packet is placed into (L4S vs Classic). Effectively, TW ACKs are always downgraded from L4S to Classic queue which might impact, e.g., delay the ACK will experience on the path compared with the other packets of the flow. Change the TW ACK sending code to differentiate: - In tcp_v4_send_reset(), commit ba9e04a7ddf4f ("ip: fix tos reflection in ack and reset packets") cleans ECN bits for TW reset and this is not affected. - In tcp_v4_timewait_ack(), ECN bits for all TW ACKs are cleaned. But now only ECN bits of ACKs for oow data or paws_reject are cleaned, and ECN bits of other ACKs will not be cleaned. - In tcp_v4_reqsk_send_ack(), commit 66b13d99d96a1 ("ipv4: tcp: fix TOS value in ACK messages sent from TIME_WAIT") did not clean ECN bits of ACKs for oow data or paws_reject. But now the ECN bits rae cleaned for these ACKs. Signed-off-by: Ilpo Järvinen Signed-off-by: Chia-Yu Chang Signed-off-by: David S. Miller --- include/net/tcp.h | 3 ++- net/ipv4/ip_output.c | 3 +-- net/ipv4/tcp_ipv4.c | 29 +++++++++++++++++++++++------ net/ipv4/tcp_minisocks.c | 2 +- net/ipv6/tcp_ipv6.c | 24 +++++++++++++++++------- 5 files changed, 44 insertions(+), 17 deletions(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 5520f2d62dc0..c6bdd3d63159 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -419,7 +419,8 @@ enum tcp_tw_status { TCP_TW_SUCCESS = 0, TCP_TW_RST = 1, TCP_TW_ACK = 2, - TCP_TW_SYN = 3 + TCP_TW_SYN = 3, + TCP_TW_ACK_OOW = 4 }; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index ea7a260bec8a..6e18d7ec5062 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -75,7 +75,6 @@ #include #include #include -#include #include #include #include @@ -1640,7 +1639,7 @@ void ip_send_unicast_reply(struct sock *sk, const struct sock *orig_sk, if (IS_ERR(rt)) return; - inet_sk(sk)->tos = arg->tos & ~INET_ECN_MASK; + inet_sk(sk)->tos = arg->tos; sk->sk_protocol = ip_hdr(skb)->protocol; sk->sk_bound_dev_if = arg->bound_dev_if; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 87f270ebc635..4fa4fbb0ad12 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -66,6 +66,7 @@ #include #include #include +#include #include #include #include @@ -887,7 +888,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb, BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) != offsetof(struct inet_timewait_sock, tw_bound_dev_if)); - arg.tos = ip_hdr(skb)->tos; + /* ECN bits of TW reset are cleared */ + arg.tos = ip_hdr(skb)->tos & ~INET_ECN_MASK; arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); local_bh_disable(); local_lock_nested_bh(&ipv4_tcp_sk.bh_lock); @@ -1033,11 +1035,21 @@ static void tcp_v4_send_ack(const struct sock *sk, local_bh_enable(); } -static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) +static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb, + enum tcp_tw_status tw_status) { struct inet_timewait_sock *tw = inet_twsk(sk); struct tcp_timewait_sock *tcptw = tcp_twsk(sk); struct tcp_key key = {}; + u8 tos = tw->tw_tos; + + /* Cleaning only ECN bits of TW ACKs of oow data or is paws_reject, + * while not cleaning ECN bits of other TW ACKs to avoid these ACKs + * being placed in a different service queues (Classic rather than L4S) + */ + if (tw_status == TCP_TW_ACK_OOW) + tos &= ~INET_ECN_MASK; + #ifdef CONFIG_TCP_AO struct tcp_ao_info *ao_info; @@ -1081,7 +1093,7 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) READ_ONCE(tcptw->tw_ts_recent), tw->tw_bound_dev_if, &key, tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0, - tw->tw_tos, + tos, tw->tw_txhash); inet_twsk_put(tw); @@ -1151,6 +1163,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, key.type = TCP_KEY_MD5; } + /* Cleaning ECN bits of TW ACKs of oow data or is paws_reject */ tcp_v4_send_ack(sk, skb, seq, tcp_rsk(req)->rcv_nxt, tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale, @@ -1158,7 +1171,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, req->ts_recent, 0, &key, inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, - ip_hdr(skb)->tos, + ip_hdr(skb)->tos & ~INET_ECN_MASK, READ_ONCE(tcp_rsk(req)->txhash)); if (tcp_key_is_ao(&key)) kfree(key.traffic_key); @@ -2175,6 +2188,7 @@ int tcp_v4_rcv(struct sk_buff *skb) { struct net *net = dev_net_rcu(skb->dev); enum skb_drop_reason drop_reason; + enum tcp_tw_status tw_status; int sdif = inet_sdif(skb); int dif = inet_iif(skb); const struct iphdr *iph; @@ -2402,7 +2416,9 @@ do_time_wait: inet_twsk_put(inet_twsk(sk)); goto csum_error; } - switch (tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn)) { + + tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn); + switch (tw_status) { case TCP_TW_SYN: { struct sock *sk2 = inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo, @@ -2423,7 +2439,8 @@ do_time_wait: /* to ACK */ fallthrough; case TCP_TW_ACK: - tcp_v4_timewait_ack(sk, skb); + case TCP_TW_ACK_OOW: + tcp_v4_timewait_ack(sk, skb, tw_status); break; case TCP_TW_RST: tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 0ae24add155b..fb9349be36b8 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -44,7 +44,7 @@ tcp_timewait_check_oow_rate_limit(struct inet_timewait_sock *tw, /* Send ACK. Note, we do not put the bucket, * it will be released by caller. */ - return TCP_TW_ACK; + return TCP_TW_ACK_OOW; } /* We are rate-limiting, so just release the tw sock and drop skb. */ diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index a2fcc317a88e..e182ee0a2330 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -999,7 +999,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 if (!IS_ERR(dst)) { skb_dst_set(buff, dst); ip6_xmit(ctl_sk, buff, &fl6, fl6.flowi6_mark, NULL, - tclass & ~INET_ECN_MASK, priority); + tclass, priority); TCP_INC_STATS(net, TCP_MIB_OUTSEGS); if (rst) TCP_INC_STATS(net, TCP_MIB_OUTRSTS); @@ -1135,7 +1135,8 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb, trace_tcp_send_reset(sk, skb, reason); tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, 1, - ipv6_get_dsfield(ipv6h), label, priority, txhash, + ipv6_get_dsfield(ipv6h) & ~INET_ECN_MASK, + label, priority, txhash, &key); #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) @@ -1155,11 +1156,16 @@ static void tcp_v6_send_ack(const struct sock *sk, struct sk_buff *skb, u32 seq, tclass, label, priority, txhash, key); } -static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) +static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb, + enum tcp_tw_status tw_status) { struct inet_timewait_sock *tw = inet_twsk(sk); struct tcp_timewait_sock *tcptw = tcp_twsk(sk); + u8 tclass = tw->tw_tclass; struct tcp_key key = {}; + + if (tw_status == TCP_TW_ACK_OOW) + tclass &= ~INET_ECN_MASK; #ifdef CONFIG_TCP_AO struct tcp_ao_info *ao_info; @@ -1203,7 +1209,7 @@ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcp_tw_tsval(tcptw), READ_ONCE(tcptw->tw_ts_recent), tw->tw_bound_dev_if, - &key, tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel), + &key, tclass, cpu_to_be32(tw->tw_flowlabel), tw->tw_priority, tw->tw_txhash); #ifdef CONFIG_TCP_AO @@ -1280,7 +1286,8 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale, tcp_rsk_tsval(tcp_rsk(req)), req->ts_recent, sk->sk_bound_dev_if, - &key, ipv6_get_dsfield(ipv6_hdr(skb)), 0, + &key, ipv6_get_dsfield(ipv6_hdr(skb)) & ~INET_ECN_MASK, + 0, READ_ONCE(sk->sk_priority), READ_ONCE(tcp_rsk(req)->txhash)); if (tcp_key_is_ao(&key)) @@ -1742,6 +1749,7 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb) { struct net *net = dev_net_rcu(skb->dev); enum skb_drop_reason drop_reason; + enum tcp_tw_status tw_status; int sdif = inet6_sdif(skb); int dif = inet6_iif(skb); const struct tcphdr *th; @@ -1962,7 +1970,8 @@ do_time_wait: goto csum_error; } - switch (tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn)) { + tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn); + switch (tw_status) { case TCP_TW_SYN: { struct sock *sk2; @@ -1987,7 +1996,8 @@ do_time_wait: /* to ACK */ fallthrough; case TCP_TW_ACK: - tcp_v6_timewait_ack(sk, skb); + case TCP_TW_ACK_OOW: + tcp_v6_timewait_ack(sk, skb, tw_status); break; case TCP_TW_RST: tcp_v6_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET); -- cgit v1.2.3 From 9866884ce8ef25338c5b33cbb97c2b5d92088528 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Wed, 5 Mar 2025 23:38:52 +0100 Subject: tcp: Pass flags to __tcp_send_ack MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Accurate ECN needs to send custom flags to handle IP-ECN field reflection during handshake. Signed-off-by: Ilpo Järvinen Signed-off-by: Chia-Yu Chang Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 2 +- net/ipv4/bpf_tcp_ca.c | 2 +- net/ipv4/tcp_dctcp.h | 2 +- net/ipv4/tcp_output.c | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index c6bdd3d63159..d08fbf90495d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -707,7 +707,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority, enum sk_rst_reason reason); int tcp_send_synack(struct sock *); void tcp_push_one(struct sock *, unsigned int mss_now); -void __tcp_send_ack(struct sock *sk, u32 rcv_nxt); +void __tcp_send_ack(struct sock *sk, u32 rcv_nxt, u16 flags); void tcp_send_ack(struct sock *sk); void tcp_send_delayed_ack(struct sock *sk); void tcp_send_loss_probe(struct sock *sk); diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 554804774628..e01492234b0b 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -121,7 +121,7 @@ static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log, BPF_CALL_2(bpf_tcp_send_ack, struct tcp_sock *, tp, u32, rcv_nxt) { /* bpf_tcp_ca prog cannot have NULL tp */ - __tcp_send_ack((struct sock *)tp, rcv_nxt); + __tcp_send_ack((struct sock *)tp, rcv_nxt, 0); return 0; } diff --git a/net/ipv4/tcp_dctcp.h b/net/ipv4/tcp_dctcp.h index d69a77cbd0c7..4b0259111d81 100644 --- a/net/ipv4/tcp_dctcp.h +++ b/net/ipv4/tcp_dctcp.h @@ -28,7 +28,7 @@ static inline void dctcp_ece_ack_update(struct sock *sk, enum tcp_ca_event evt, */ if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) { dctcp_ece_ack_cwr(sk, *ce_state); - __tcp_send_ack(sk, *prior_rcv_nxt); + __tcp_send_ack(sk, *prior_rcv_nxt, 0); } inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index fffbd3ee44ed..e0a4e5432399 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -4240,7 +4240,7 @@ void tcp_send_delayed_ack(struct sock *sk) } /* This routine sends an ack and also updates the window. */ -void __tcp_send_ack(struct sock *sk, u32 rcv_nxt) +void __tcp_send_ack(struct sock *sk, u32 rcv_nxt, u16 flags) { struct sk_buff *buff; @@ -4269,7 +4269,7 @@ void __tcp_send_ack(struct sock *sk, u32 rcv_nxt) /* Reserve space for headers and prepare control bits. */ skb_reserve(buff, MAX_TCP_HEADER); - tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK); + tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK | flags); /* We do not want pure acks influencing TCP Small Queues or fq/pacing * too much. @@ -4284,7 +4284,7 @@ EXPORT_SYMBOL_GPL(__tcp_send_ack); void tcp_send_ack(struct sock *sk) { - __tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt); + __tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt, 0); } /* This routine sends a packet with an out of date sequence -- cgit v1.2.3 From 8d4880db378350f8ed8969feea13bdc164564fc1 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 11 Mar 2025 21:42:28 +0100 Subject: udp_tunnel: create a fastpath GRO lookup. Most UDP tunnels bind a socket to a local port, with ANY address, no peer and no interface index specified. Additionally it's quite common to have a single tunnel device per namespace. Track in each namespace the UDP tunnel socket respecting the above. When only a single one is present, store a reference in the netns. When such reference is not NULL, UDP tunnel GRO lookup just need to match the incoming packet destination port vs the socket local port. The tunnel socket never sets the reuse[port] flag[s]. When bound to no address and interface, no other socket can exist in the same netns matching the specified local port. Matching packets with non-local destination addresses will be aggregated, and eventually segmented as needed - no behavior changes intended. Note that the UDP tunnel socket reference is stored into struct netns_ipv4 for both IPv4 and IPv6 tunnels. That is intentional to keep all the fastpath-related netns fields in the same struct and allow cacheline-based optimization. Currently both the IPv4 and IPv6 socket pointer share the same cacheline as the `udp_table` field. Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/4d5c319c4471161829f50cb8436841de81a5edae.1741718157.git.pabeni@redhat.com Signed-off-by: Paolo Abeni --- include/linux/udp.h | 16 ++++++++++++++++ include/net/netns/ipv4.h | 11 +++++++++++ include/net/udp.h | 1 + include/net/udp_tunnel.h | 18 ++++++++++++++++++ net/ipv4/udp.c | 13 ++++++++++++- net/ipv4/udp_offload.c | 37 +++++++++++++++++++++++++++++++++++++ net/ipv4/udp_tunnel_core.c | 12 ++++++++++++ net/ipv6/udp.c | 2 ++ net/ipv6/udp_offload.c | 5 +++++ 9 files changed, 114 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/linux/udp.h b/include/linux/udp.h index 0807e21cfec9..895240177f4f 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -101,6 +101,13 @@ struct udp_sock { /* Cache friendly copy of sk->sk_peek_off >= 0 */ bool peeking_with_offset; + + /* + * Accounting for the tunnel GRO fastpath. + * Unprotected by compilers guard, as it uses space available in + * the last UDP socket cacheline. + */ + struct hlist_node tunnel_list; }; #define udp_test_bit(nr, sk) \ @@ -219,4 +226,13 @@ static inline void udp_allow_gso(struct sock *sk) #define IS_UDPLITE(__sk) (__sk->sk_protocol == IPPROTO_UDPLITE) +static inline struct sock *udp_tunnel_sk(const struct net *net, bool is_ipv6) +{ +#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) + return rcu_dereference(net->ipv4.udp_tunnel_gro[is_ipv6].sk); +#else + return NULL; +#endif +} + #endif /* _LINUX_UDP_H */ diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 650b2dc9199f..6373e3f17da8 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -47,6 +47,11 @@ struct sysctl_fib_multipath_hash_seed { }; #endif +struct udp_tunnel_gro { + struct sock __rcu *sk; + struct hlist_head list; +}; + struct netns_ipv4 { /* Cacheline organization can be found documented in * Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst. @@ -85,6 +90,11 @@ struct netns_ipv4 { struct inet_timewait_death_row tcp_death_row; struct udp_table *udp_table; +#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) + /* Not in a pernet subsys because need to be available at GRO stage */ + struct udp_tunnel_gro udp_tunnel_gro[2]; +#endif + #ifdef CONFIG_SYSCTL struct ctl_table_header *forw_hdr; struct ctl_table_header *frags_hdr; @@ -277,4 +287,5 @@ struct netns_ipv4 { struct hlist_head *inet_addr_lst; struct delayed_work addr_chk_work; }; + #endif diff --git a/include/net/udp.h b/include/net/udp.h index 6e89520e100d..a772510b2aa5 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -290,6 +290,7 @@ static inline void udp_lib_init_sock(struct sock *sk) struct udp_sock *up = udp_sk(sk); skb_queue_head_init(&up->reader_queue); + INIT_HLIST_NODE(&up->tunnel_list); up->forward_threshold = sk->sk_rcvbuf >> 2; set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags); } diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index a93dc51f6323..eda0f3e2f65f 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -203,6 +203,24 @@ static inline void udp_tunnel_encap_enable(struct sock *sk) udp_encap_enable(); } +#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) +void udp_tunnel_update_gro_lookup(struct net *net, struct sock *sk, bool add); +#else +static inline void udp_tunnel_update_gro_lookup(struct net *net, + struct sock *sk, bool add) {} +#endif + +static inline void udp_tunnel_cleanup_gro(struct sock *sk) +{ + struct udp_sock *up = udp_sk(sk); + struct net *net = sock_net(sk); + + if (!up->tunnel_list.pprev) + return; + + udp_tunnel_update_gro_lookup(net, sk, false); +} + #define UDP_TUNNEL_NIC_MAX_TABLES 4 enum udp_tunnel_nic_info_flags { diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index d0bffcfa56d8..db606f7e4163 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2891,8 +2891,10 @@ void udp_destroy_sock(struct sock *sk) if (encap_destroy) encap_destroy(sk); } - if (udp_test_bit(ENCAP_ENABLED, sk)) + if (udp_test_bit(ENCAP_ENABLED, sk)) { static_branch_dec(&udp_encap_needed_key); + udp_tunnel_cleanup_gro(sk); + } } } @@ -3804,6 +3806,15 @@ fallback: static int __net_init udp_pernet_init(struct net *net) { +#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) + int i; + + /* No tunnel is configured */ + for (i = 0; i < ARRAY_SIZE(net->ipv4.udp_tunnel_gro); ++i) { + INIT_HLIST_HEAD(&net->ipv4.udp_tunnel_gro[i].list); + RCU_INIT_POINTER(net->ipv4.udp_tunnel_gro[i].sk, NULL); + } +#endif udp_sysctl_init(net); udp_set_table(net); diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 2c0725583be3..e36d8a234848 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -12,6 +12,38 @@ #include #include #include +#include + +#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) +static DEFINE_SPINLOCK(udp_tunnel_gro_lock); + +void udp_tunnel_update_gro_lookup(struct net *net, struct sock *sk, bool add) +{ + bool is_ipv6 = sk->sk_family == AF_INET6; + struct udp_sock *tup, *up = udp_sk(sk); + struct udp_tunnel_gro *udp_tunnel_gro; + + spin_lock(&udp_tunnel_gro_lock); + udp_tunnel_gro = &net->ipv4.udp_tunnel_gro[is_ipv6]; + if (add) + hlist_add_head(&up->tunnel_list, &udp_tunnel_gro->list); + else + hlist_del_init(&up->tunnel_list); + + if (udp_tunnel_gro->list.first && + !udp_tunnel_gro->list.first->next) { + tup = hlist_entry(udp_tunnel_gro->list.first, struct udp_sock, + tunnel_list); + + rcu_assign_pointer(udp_tunnel_gro->sk, (struct sock *)tup); + } else { + RCU_INIT_POINTER(udp_tunnel_gro->sk, NULL); + } + + spin_unlock(&udp_tunnel_gro_lock); +} +EXPORT_SYMBOL_GPL(udp_tunnel_update_gro_lookup); +#endif static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, netdev_features_t features, @@ -635,8 +667,13 @@ static struct sock *udp4_gro_lookup_skb(struct sk_buff *skb, __be16 sport, { const struct iphdr *iph = skb_gro_network_header(skb); struct net *net = dev_net_rcu(skb->dev); + struct sock *sk; int iif, sdif; + sk = udp_tunnel_sk(net, false); + if (sk && dport == htons(sk->sk_num)) + return sk; + inet_get_iif_sdif(skb, &iif, &sdif); return __udp4_lib_lookup(net, iph->saddr, sport, diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c index 619a53eb672d..b5695826e57a 100644 --- a/net/ipv4/udp_tunnel_core.c +++ b/net/ipv4/udp_tunnel_core.c @@ -58,6 +58,15 @@ error: } EXPORT_SYMBOL(udp_sock_create4); +static bool sk_saddr_any(struct sock *sk) +{ +#if IS_ENABLED(CONFIG_IPV6) + return ipv6_addr_any(&sk->sk_v6_rcv_saddr); +#else + return !sk->sk_rcv_saddr; +#endif +} + void setup_udp_tunnel_sock(struct net *net, struct socket *sock, struct udp_tunnel_sock_cfg *cfg) { @@ -80,6 +89,9 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock, udp_sk(sk)->gro_complete = cfg->gro_complete; udp_tunnel_encap_enable(sk); + + if (!sk->sk_dport && !sk->sk_bound_dev_if && sk_saddr_any(sock->sk)) + udp_tunnel_update_gro_lookup(net, sock->sk, true); } EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 024458ef163c..7317f8e053f1 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include #include @@ -1825,6 +1826,7 @@ void udpv6_destroy_sock(struct sock *sk) if (udp_test_bit(ENCAP_ENABLED, sk)) { static_branch_dec(&udpv6_encap_needed_key); udp_encap_disable(); + udp_tunnel_cleanup_gro(sk); } } } diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 404212dfc99a..d8445ac1b2e4 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -118,8 +118,13 @@ static struct sock *udp6_gro_lookup_skb(struct sk_buff *skb, __be16 sport, { const struct ipv6hdr *iph = skb_gro_network_header(skb); struct net *net = dev_net_rcu(skb->dev); + struct sock *sk; int iif, sdif; + sk = udp_tunnel_sk(net, true); + if (sk && dport == htons(sk->sk_num)) + return sk; + inet6_get_iif_sdif(skb, &iif, &sdif); return __udp6_lib_lookup(net, &iph->saddr, sport, -- cgit v1.2.3 From 311b36574ceaccfa3f91b74054a09cd4bb877702 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 11 Mar 2025 21:42:29 +0100 Subject: udp_tunnel: use static call for GRO hooks when possible It's quite common to have a single UDP tunnel type active in the whole system. In such a case we can replace the indirect call for the UDP tunnel GRO callback with a static call. Add the related accounting in the control path and switch to static call when possible. To keep the code simple use a static array for the registered tunnel types, and size such array based on the kernel config. Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/6fd1f9c7651151493ecab174e7b8386a1534170d.1741718157.git.pabeni@redhat.com Signed-off-by: Paolo Abeni --- include/net/udp_tunnel.h | 4 ++ net/ipv4/udp_offload.c | 130 ++++++++++++++++++++++++++++++++++++++++++++- net/ipv4/udp_tunnel_core.c | 2 + 3 files changed, 135 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index eda0f3e2f65f..a7b230867eb1 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -205,9 +205,11 @@ static inline void udp_tunnel_encap_enable(struct sock *sk) #if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) void udp_tunnel_update_gro_lookup(struct net *net, struct sock *sk, bool add); +void udp_tunnel_update_gro_rcv(struct sock *sk, bool add); #else static inline void udp_tunnel_update_gro_lookup(struct net *net, struct sock *sk, bool add) {} +static inline void udp_tunnel_update_gro_rcv(struct sock *sk, bool add) {} #endif static inline void udp_tunnel_cleanup_gro(struct sock *sk) @@ -215,6 +217,8 @@ static inline void udp_tunnel_cleanup_gro(struct sock *sk) struct udp_sock *up = udp_sk(sk); struct net *net = sock_net(sk); + udp_tunnel_update_gro_rcv(sk, false); + if (!up->tunnel_list.pprev) return; diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index e36d8a234848..088aa8cb8ac0 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -15,6 +15,37 @@ #include #if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) + +/* + * Dummy GRO tunnel callback, exists mainly to avoid dangling/NULL + * values for the udp tunnel static call. + */ +static struct sk_buff *dummy_gro_rcv(struct sock *sk, + struct list_head *head, + struct sk_buff *skb) +{ + NAPI_GRO_CB(skb)->flush = 1; + return NULL; +} + +typedef struct sk_buff *(*udp_tunnel_gro_rcv_t)(struct sock *sk, + struct list_head *head, + struct sk_buff *skb); + +struct udp_tunnel_type_entry { + udp_tunnel_gro_rcv_t gro_receive; + refcount_t count; +}; + +#define UDP_MAX_TUNNEL_TYPES (IS_ENABLED(CONFIG_GENEVE) + \ + IS_ENABLED(CONFIG_VXLAN) * 2 + \ + IS_ENABLED(CONFIG_NET_FOU) * 2) + +DEFINE_STATIC_CALL(udp_tunnel_gro_rcv, dummy_gro_rcv); +static DEFINE_STATIC_KEY_FALSE(udp_tunnel_static_call); +static struct mutex udp_tunnel_gro_type_lock; +static struct udp_tunnel_type_entry udp_tunnel_gro_types[UDP_MAX_TUNNEL_TYPES]; +static unsigned int udp_tunnel_gro_type_nr; static DEFINE_SPINLOCK(udp_tunnel_gro_lock); void udp_tunnel_update_gro_lookup(struct net *net, struct sock *sk, bool add) @@ -43,6 +74,101 @@ void udp_tunnel_update_gro_lookup(struct net *net, struct sock *sk, bool add) spin_unlock(&udp_tunnel_gro_lock); } EXPORT_SYMBOL_GPL(udp_tunnel_update_gro_lookup); + +void udp_tunnel_update_gro_rcv(struct sock *sk, bool add) +{ + struct udp_tunnel_type_entry *cur = NULL; + struct udp_sock *up = udp_sk(sk); + int i, old_gro_type_nr; + + if (!up->gro_receive) + return; + + mutex_lock(&udp_tunnel_gro_type_lock); + for (i = 0; i < udp_tunnel_gro_type_nr; i++) + if (udp_tunnel_gro_types[i].gro_receive == up->gro_receive) + cur = &udp_tunnel_gro_types[i]; + + old_gro_type_nr = udp_tunnel_gro_type_nr; + if (add) { + /* + * Update the matching entry, if found, or add a new one + * if needed + */ + if (cur) { + refcount_inc(&cur->count); + goto out; + } + + if (unlikely(udp_tunnel_gro_type_nr == UDP_MAX_TUNNEL_TYPES)) { + pr_err_once("Too many UDP tunnel types, please increase UDP_MAX_TUNNEL_TYPES\n"); + /* Ensure static call will never be enabled */ + udp_tunnel_gro_type_nr = UDP_MAX_TUNNEL_TYPES + 2; + goto out; + } + + cur = &udp_tunnel_gro_types[udp_tunnel_gro_type_nr++]; + refcount_set(&cur->count, 1); + cur->gro_receive = up->gro_receive; + } else { + /* + * The stack cleanups only successfully added tunnel, the + * lookup on removal should never fail. + */ + if (WARN_ON_ONCE(!cur)) + goto out; + + if (!refcount_dec_and_test(&cur->count)) + goto out; + + /* avoid gaps, so that the enable tunnel has always id 0 */ + *cur = udp_tunnel_gro_types[--udp_tunnel_gro_type_nr]; + } + + if (udp_tunnel_gro_type_nr == 1) { + static_call_update(udp_tunnel_gro_rcv, + udp_tunnel_gro_types[0].gro_receive); + static_branch_enable(&udp_tunnel_static_call); + } else if (old_gro_type_nr == 1) { + static_branch_disable(&udp_tunnel_static_call); + static_call_update(udp_tunnel_gro_rcv, dummy_gro_rcv); + } + +out: + mutex_unlock(&udp_tunnel_gro_type_lock); +} +EXPORT_SYMBOL_GPL(udp_tunnel_update_gro_rcv); + +static void udp_tunnel_gro_init(void) +{ + mutex_init(&udp_tunnel_gro_type_lock); +} + +static struct sk_buff *udp_tunnel_gro_rcv(struct sock *sk, + struct list_head *head, + struct sk_buff *skb) +{ + if (static_branch_likely(&udp_tunnel_static_call)) { + if (unlikely(gro_recursion_inc_test(skb))) { + NAPI_GRO_CB(skb)->flush |= 1; + return NULL; + } + return static_call(udp_tunnel_gro_rcv)(sk, head, skb); + } + return call_gro_receive_sk(udp_sk(sk)->gro_receive, sk, head, skb); +} + +#else + +static void udp_tunnel_gro_init(void) {} + +static struct sk_buff *udp_tunnel_gro_rcv(struct sock *sk, + struct list_head *head, + struct sk_buff *skb) +{ + return call_gro_receive_sk(udp_sk(sk)->gro_receive, sk, head, skb); +} + #endif static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, @@ -654,7 +780,7 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */ skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr)); - pp = call_gro_receive_sk(udp_sk(sk)->gro_receive, sk, head, skb); + pp = udp_tunnel_gro_rcv(sk, head, skb); out: skb_gro_flush_final(skb, pp, flush); @@ -804,5 +930,7 @@ int __init udpv4_offload_init(void) .gro_complete = udp4_gro_complete, }, }; + + udp_tunnel_gro_init(); return inet_add_offload(&net_hotdata.udpv4_offload, IPPROTO_UDP); } diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c index b5695826e57a..c49fceea8313 100644 --- a/net/ipv4/udp_tunnel_core.c +++ b/net/ipv4/udp_tunnel_core.c @@ -90,6 +90,8 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock, udp_tunnel_encap_enable(sk); + udp_tunnel_update_gro_rcv(sock->sk, true); + if (!sk->sk_dport && !sk->sk_bound_dev_if && sk_saddr_any(sock->sk)) udp_tunnel_update_gro_lookup(net, sock->sk, true); } -- cgit v1.2.3 From ae2d90355aa5592b0e99c8bbb4c3fa1d8e205f1b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 12 Mar 2025 08:22:47 +0000 Subject: inet: frags: add inet_frag_putn() helper inet_frag_putn() can release multiple references in one step. Use it in inet_frags_free_cb(). Replace inet_frag_put(X) with inet_frag_putn(X, 1) Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20250312082250.1803501-2-edumazet@google.com Signed-off-by: Paolo Abeni --- include/net/inet_frag.h | 4 ++-- include/net/ipv6_frag.h | 3 ++- net/ieee802154/6lowpan/reassembly.c | 7 ++++--- net/ipv4/inet_fragment.c | 3 +-- net/ipv4/ip_fragment.c | 2 +- net/ipv6/netfilter/nf_conntrack_reasm.c | 3 ++- net/ipv6/reassembly.c | 4 ++-- 7 files changed, 14 insertions(+), 12 deletions(-) (limited to 'include/net') diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index 5af6eb14c5db..26687ad0b141 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -145,9 +145,9 @@ struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key); unsigned int inet_frag_rbtree_purge(struct rb_root *root, enum skb_drop_reason reason); -static inline void inet_frag_put(struct inet_frag_queue *q) +static inline void inet_frag_putn(struct inet_frag_queue *q, int refs) { - if (refcount_dec_and_test(&q->refcnt)) + if (refs && refcount_sub_and_test(refs, &q->refcnt)) inet_frag_destroy(q); } diff --git a/include/net/ipv6_frag.h b/include/net/ipv6_frag.h index 7321ffe3a108..9d968d7d9fa4 100644 --- a/include/net/ipv6_frag.h +++ b/include/net/ipv6_frag.h @@ -66,6 +66,7 @@ ip6frag_expire_frag_queue(struct net *net, struct frag_queue *fq) { struct net_device *dev = NULL; struct sk_buff *head; + int refs = 1; rcu_read_lock(); /* Paired with the WRITE_ONCE() in fqdir_pre_exit(). */ @@ -109,7 +110,7 @@ out: spin_unlock(&fq->q.lock); out_rcu_unlock: rcu_read_unlock(); - inet_frag_put(&fq->q); + inet_frag_putn(&fq->q, refs); } /* Check if the upper layer header is truncated in the first fragment. */ diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index 867d637d86f0..2df1a027e68a 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -45,6 +45,7 @@ static void lowpan_frag_expire(struct timer_list *t) { struct inet_frag_queue *frag = from_timer(frag, t, timer); struct frag_queue *fq; + int refs = 1; fq = container_of(frag, struct frag_queue, q); @@ -56,7 +57,7 @@ static void lowpan_frag_expire(struct timer_list *t) inet_frag_kill(&fq->q); out: spin_unlock(&fq->q.lock); - inet_frag_put(&fq->q); + inet_frag_putn(&fq->q, refs); } static inline struct lowpan_frag_queue * @@ -302,13 +303,13 @@ int lowpan_frag_rcv(struct sk_buff *skb, u8 frag_type) fq = fq_find(net, cb, &hdr.source, &hdr.dest); if (fq != NULL) { - int ret; + int ret, refs = 1; spin_lock(&fq->q.lock); ret = lowpan_frag_queue(fq, skb, frag_type); spin_unlock(&fq->q.lock); - inet_frag_put(&fq->q); + inet_frag_putn(&fq->q, refs); return ret; } diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index d179a2c84222..efc4cbee04c2 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -145,8 +145,7 @@ static void inet_frags_free_cb(void *ptr, void *arg) } spin_unlock_bh(&fq->lock); - if (refcount_sub_and_test(count, &fq->refcnt)) - inet_frag_destroy(fq); + inet_frag_putn(fq, count); } static LLIST_HEAD(fqdir_free_list); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 7a435746a22d..474a26191c89 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -112,7 +112,7 @@ static void ip4_frag_free(struct inet_frag_queue *q) static void ipq_put(struct ipq *ipq) { - inet_frag_put(&ipq->q); + inet_frag_putn(&ipq->q, 1); } /* Kill ipq entry. It is not destroyed immediately, diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 4120e67a8ce6..fcf969f9fe2a 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -447,6 +447,7 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) struct frag_hdr *fhdr; struct frag_queue *fq; struct ipv6hdr *hdr; + int refs = 1; u8 prevhdr; /* Jumbo payload inhibits frag. header */ @@ -489,7 +490,7 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) } spin_unlock_bh(&fq->q.lock); - inet_frag_put(&fq->q); + inet_frag_putn(&fq->q, refs); return ret; } EXPORT_SYMBOL_GPL(nf_ct_frag6_gather); diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index a48be617a8ab..5d56a8e5166b 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -380,7 +380,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb) fq = fq_find(net, fhdr->identification, hdr, iif); if (fq) { u32 prob_offset = 0; - int ret; + int ret, refs = 1; spin_lock(&fq->q.lock); @@ -389,7 +389,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb) &prob_offset); spin_unlock(&fq->q.lock); - inet_frag_put(&fq->q); + inet_frag_putn(&fq->q, refs); if (prob_offset) { __IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INHDRERRORS); -- cgit v1.2.3 From eb0dfc0ef195a04e519b15d73cf25d8c25ee8df7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 12 Mar 2025 08:22:49 +0000 Subject: inet: frags: change inet_frag_kill() to defer refcount updates In the following patch, we no longer assume inet_frag_kill() callers own a reference. Consuming two refcounts from inet_frag_kill() would lead in UAF. Propagate the pointer to the refs that will be consumed later by the final inet_frag_putn() call. Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20250312082250.1803501-4-edumazet@google.com Signed-off-by: Paolo Abeni --- include/net/inet_frag.h | 2 +- include/net/ipv6_frag.h | 2 +- net/ieee802154/6lowpan/reassembly.c | 17 ++++++++++------- net/ipv4/inet_fragment.c | 12 +++++++----- net/ipv4/ip_fragment.c | 30 ++++++++++++------------------ net/ipv6/netfilter/nf_conntrack_reasm.c | 21 ++++++++++++--------- net/ipv6/reassembly.c | 18 ++++++++++-------- 7 files changed, 53 insertions(+), 49 deletions(-) (limited to 'include/net') diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index 26687ad0b141..0eccd9c3a883 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -137,7 +137,7 @@ static inline void fqdir_pre_exit(struct fqdir *fqdir) } void fqdir_exit(struct fqdir *fqdir); -void inet_frag_kill(struct inet_frag_queue *q); +void inet_frag_kill(struct inet_frag_queue *q, int *refs); void inet_frag_destroy(struct inet_frag_queue *q); struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key); diff --git a/include/net/ipv6_frag.h b/include/net/ipv6_frag.h index 9d968d7d9fa4..38ef66826939 100644 --- a/include/net/ipv6_frag.h +++ b/include/net/ipv6_frag.h @@ -78,7 +78,7 @@ ip6frag_expire_frag_queue(struct net *net, struct frag_queue *fq) goto out; fq->q.flags |= INET_FRAG_DROP; - inet_frag_kill(&fq->q); + inet_frag_kill(&fq->q, &refs); dev = dev_get_by_index_rcu(net, fq->iif); if (!dev) diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index 2df1a027e68a..c5f86cff06ee 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -31,7 +31,8 @@ static const char lowpan_frags_cache_name[] = "lowpan-frags"; static struct inet_frags lowpan_frags; static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *skb, - struct sk_buff *prev, struct net_device *ldev); + struct sk_buff *prev, struct net_device *ldev, + int *refs); static void lowpan_frag_init(struct inet_frag_queue *q, const void *a) { @@ -54,7 +55,7 @@ static void lowpan_frag_expire(struct timer_list *t) if (fq->q.flags & INET_FRAG_COMPLETE) goto out; - inet_frag_kill(&fq->q); + inet_frag_kill(&fq->q, &refs); out: spin_unlock(&fq->q.lock); inet_frag_putn(&fq->q, refs); @@ -83,7 +84,8 @@ fq_find(struct net *net, const struct lowpan_802154_cb *cb, } static int lowpan_frag_queue(struct lowpan_frag_queue *fq, - struct sk_buff *skb, u8 frag_type) + struct sk_buff *skb, u8 frag_type, + int *refs) { struct sk_buff *prev_tail; struct net_device *ldev; @@ -144,7 +146,7 @@ static int lowpan_frag_queue(struct lowpan_frag_queue *fq, unsigned long orefdst = skb->_skb_refdst; skb->_skb_refdst = 0UL; - res = lowpan_frag_reasm(fq, skb, prev_tail, ldev); + res = lowpan_frag_reasm(fq, skb, prev_tail, ldev, refs); skb->_skb_refdst = orefdst; return res; } @@ -163,11 +165,12 @@ err: * the last and the first frames arrived and all the bits are here. */ static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *skb, - struct sk_buff *prev_tail, struct net_device *ldev) + struct sk_buff *prev_tail, struct net_device *ldev, + int *refs) { void *reasm_data; - inet_frag_kill(&fq->q); + inet_frag_kill(&fq->q, refs); reasm_data = inet_frag_reasm_prepare(&fq->q, skb, prev_tail); if (!reasm_data) @@ -306,7 +309,7 @@ int lowpan_frag_rcv(struct sk_buff *skb, u8 frag_type) int ret, refs = 1; spin_lock(&fq->q.lock); - ret = lowpan_frag_queue(fq, skb, frag_type); + ret = lowpan_frag_queue(fq, skb, frag_type, &refs); spin_unlock(&fq->q.lock); inet_frag_putn(&fq->q, refs); diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index efc4cbee04c2..5eb186050013 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -225,10 +225,10 @@ void fqdir_exit(struct fqdir *fqdir) } EXPORT_SYMBOL(fqdir_exit); -void inet_frag_kill(struct inet_frag_queue *fq) +void inet_frag_kill(struct inet_frag_queue *fq, int *refs) { if (del_timer(&fq->timer)) - refcount_dec(&fq->refcnt); + (*refs)++; if (!(fq->flags & INET_FRAG_COMPLETE)) { struct fqdir *fqdir = fq->fqdir; @@ -243,7 +243,7 @@ void inet_frag_kill(struct inet_frag_queue *fq) if (!READ_ONCE(fqdir->dead)) { rhashtable_remove_fast(&fqdir->rhashtable, &fq->node, fqdir->f->rhash_params); - refcount_dec(&fq->refcnt); + (*refs)++; } else { fq->flags |= INET_FRAG_HASH_DEAD; } @@ -349,9 +349,11 @@ static struct inet_frag_queue *inet_frag_create(struct fqdir *fqdir, *prev = rhashtable_lookup_get_insert_key(&fqdir->rhashtable, &q->key, &q->node, f->rhash_params); if (*prev) { + int refs = 2; + q->flags |= INET_FRAG_COMPLETE; - inet_frag_kill(q); - inet_frag_destroy(q); + inet_frag_kill(q, &refs); + inet_frag_putn(q, refs); return NULL; } return q; diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index ee953be49b34..c5f3c810706f 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -76,7 +76,8 @@ static u8 ip4_frag_ecn(u8 tos) static struct inet_frags ip4_frags; static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, - struct sk_buff *prev_tail, struct net_device *dev); + struct sk_buff *prev_tail, struct net_device *dev, + int *refs); static void ip4_frag_init(struct inet_frag_queue *q, const void *a) @@ -107,14 +108,6 @@ static void ip4_frag_free(struct inet_frag_queue *q) inet_putpeer(qp->peer); } -/* Kill ipq entry. It is not destroyed immediately, - * because caller (and someone more) holds reference count. - */ -static void ipq_kill(struct ipq *ipq) -{ - inet_frag_kill(&ipq->q); -} - static bool frag_expire_skip_icmp(u32 user) { return user == IP_DEFRAG_AF_PACKET || @@ -152,7 +145,7 @@ static void ip_expire(struct timer_list *t) goto out; qp->q.flags |= INET_FRAG_DROP; - ipq_kill(qp); + inet_frag_kill(&qp->q, &refs); __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); __IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT); @@ -271,7 +264,7 @@ static int ip_frag_reinit(struct ipq *qp) } /* Add new segment to existing queue. */ -static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) +static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb, int *refs) { struct net *net = qp->q.fqdir->net; int ihl, end, flags, offset; @@ -291,7 +284,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) && unlikely(ip_frag_too_far(qp)) && unlikely(err = ip_frag_reinit(qp))) { - ipq_kill(qp); + inet_frag_kill(&qp->q, refs); goto err; } @@ -375,10 +368,10 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) unsigned long orefdst = skb->_skb_refdst; skb->_skb_refdst = 0UL; - err = ip_frag_reasm(qp, skb, prev_tail, dev); + err = ip_frag_reasm(qp, skb, prev_tail, dev, refs); skb->_skb_refdst = orefdst; if (err) - inet_frag_kill(&qp->q); + inet_frag_kill(&qp->q, refs); return err; } @@ -395,7 +388,7 @@ insert_error: err = -EINVAL; __IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS); discard_qp: - inet_frag_kill(&qp->q); + inet_frag_kill(&qp->q, refs); __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); err: kfree_skb_reason(skb, reason); @@ -409,7 +402,8 @@ static bool ip_frag_coalesce_ok(const struct ipq *qp) /* Build a new IP datagram from all its fragments. */ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, - struct sk_buff *prev_tail, struct net_device *dev) + struct sk_buff *prev_tail, struct net_device *dev, + int *refs) { struct net *net = qp->q.fqdir->net; struct iphdr *iph; @@ -417,7 +411,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, int len, err; u8 ecn; - ipq_kill(qp); + inet_frag_kill(&qp->q, refs); ecn = ip_frag_ecn_table[qp->ecn]; if (unlikely(ecn == 0xff)) { @@ -495,7 +489,7 @@ int ip_defrag(struct net *net, struct sk_buff *skb, u32 user) spin_lock(&qp->q.lock); - ret = ip_frag_queue(qp, skb); + ret = ip_frag_queue(qp, skb, &refs); spin_unlock(&qp->q.lock); inet_frag_putn(&qp->q, refs); diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index fcf969f9fe2a..f33acb730dc5 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -123,7 +123,8 @@ static void __net_exit nf_ct_frags6_sysctl_unregister(struct net *net) #endif static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb, - struct sk_buff *prev_tail, struct net_device *dev); + struct sk_buff *prev_tail, struct net_device *dev, + int *refs); static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h) { @@ -167,7 +168,8 @@ static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user, static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, - const struct frag_hdr *fhdr, int nhoff) + const struct frag_hdr *fhdr, int nhoff, + int *refs) { unsigned int payload_len; struct net_device *dev; @@ -221,7 +223,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, * this case. -DaveM */ pr_debug("end of fragment not rounded to 8 bytes.\n"); - inet_frag_kill(&fq->q); + inet_frag_kill(&fq->q, refs); return -EPROTO; } if (end > fq->q.len) { @@ -287,7 +289,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, unsigned long orefdst = skb->_skb_refdst; skb->_skb_refdst = 0UL; - err = nf_ct_frag6_reasm(fq, skb, prev, dev); + err = nf_ct_frag6_reasm(fq, skb, prev, dev, refs); skb->_skb_refdst = orefdst; /* After queue has assumed skb ownership, only 0 or @@ -301,7 +303,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, return -EINPROGRESS; insert_error: - inet_frag_kill(&fq->q); + inet_frag_kill(&fq->q, refs); err: skb_dst_drop(skb); return -EINVAL; @@ -315,13 +317,14 @@ err: * the last and the first frames arrived and all the bits are here. */ static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb, - struct sk_buff *prev_tail, struct net_device *dev) + struct sk_buff *prev_tail, struct net_device *dev, + int *refs) { void *reasm_data; int payload_len; u8 ecn; - inet_frag_kill(&fq->q); + inet_frag_kill(&fq->q, refs); ecn = ip_frag_ecn_table[fq->ecn]; if (unlikely(ecn == 0xff)) @@ -372,7 +375,7 @@ static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb, return 0; err: - inet_frag_kill(&fq->q); + inet_frag_kill(&fq->q, refs); return -EINVAL; } @@ -483,7 +486,7 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) spin_lock_bh(&fq->q.lock); - ret = nf_ct_frag6_queue(fq, skb, fhdr, nhoff); + ret = nf_ct_frag6_queue(fq, skb, fhdr, nhoff, &refs); if (ret == -EPROTO) { skb->transport_header = savethdr; ret = 0; diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 5d56a8e5166b..7560380bd587 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -68,7 +68,8 @@ static u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h) static struct inet_frags ip6_frags; static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb, - struct sk_buff *prev_tail, struct net_device *dev); + struct sk_buff *prev_tail, struct net_device *dev, + int *refs); static void ip6_frag_expire(struct timer_list *t) { @@ -105,7 +106,7 @@ fq_find(struct net *net, __be32 id, const struct ipv6hdr *hdr, int iif) static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, struct frag_hdr *fhdr, int nhoff, - u32 *prob_offset) + u32 *prob_offset, int *refs) { struct net *net = dev_net(skb_dst(skb)->dev); int offset, end, fragsize; @@ -220,7 +221,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, unsigned long orefdst = skb->_skb_refdst; skb->_skb_refdst = 0UL; - err = ip6_frag_reasm(fq, skb, prev_tail, dev); + err = ip6_frag_reasm(fq, skb, prev_tail, dev, refs); skb->_skb_refdst = orefdst; return err; } @@ -238,7 +239,7 @@ insert_error: __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASM_OVERLAPS); discard_fq: - inet_frag_kill(&fq->q); + inet_frag_kill(&fq->q, refs); __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMFAILS); err: @@ -254,7 +255,8 @@ err: * the last and the first frames arrived and all the bits are here. */ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb, - struct sk_buff *prev_tail, struct net_device *dev) + struct sk_buff *prev_tail, struct net_device *dev, + int *refs) { struct net *net = fq->q.fqdir->net; unsigned int nhoff; @@ -262,7 +264,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb, int payload_len; u8 ecn; - inet_frag_kill(&fq->q); + inet_frag_kill(&fq->q, refs); ecn = ip_frag_ecn_table[fq->ecn]; if (unlikely(ecn == 0xff)) @@ -320,7 +322,7 @@ out_fail: rcu_read_lock(); __IP6_INC_STATS(net, __in6_dev_stats_get(dev, skb), IPSTATS_MIB_REASMFAILS); rcu_read_unlock(); - inet_frag_kill(&fq->q); + inet_frag_kill(&fq->q, refs); return -1; } @@ -386,7 +388,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb) fq->iif = iif; ret = ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff, - &prob_offset); + &prob_offset, &refs); spin_unlock(&fq->q.lock); inet_frag_putn(&fq->q, refs); -- cgit v1.2.3 From 15492700ac41459b54a6683490adcee350ab11e3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 12 Mar 2025 08:39:07 +0000 Subject: tcp: cache RTAX_QUICKACK metric in a hot cache line tcp_in_quickack_mode() is called from input path for small packets. It calls __sk_dst_get() which reads sk->sk_dst_cache which has been put in sock_read_tx group (for good reasons). Then dst_metric(dst, RTAX_QUICKACK) also needs extra cache line misses. Cache RTAX_QUICKACK in icsk->icsk_ack.dst_quick_ack to no longer pull these cache lines for the cases a delayed ACK is scheduled. After this patch TCP receive path does not longer access sock_read_tx group. Signed-off-by: Eric Dumazet Reviewed-by: Jason Xing Reviewed-by: Neal Cardwell Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250312083907.1931644-1-edumazet@google.com Signed-off-by: Paolo Abeni --- include/net/inet_connection_sock.h | 3 ++- net/core/sock.c | 6 +++++- net/ipv4/tcp_input.c | 3 +-- 3 files changed, 8 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index d9978ffacc97..f736d3097e43 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -117,7 +117,8 @@ struct inet_connection_sock { #define ATO_BITS 8 __u32 ato:ATO_BITS, /* Predicted tick of soft clock */ lrcv_flowlabel:20, /* last received ipv6 flowlabel */ - unused:4; + dst_quick_ack:1, /* cache dst RTAX_QUICKACK */ + unused:3; unsigned long timeout; /* Currently scheduled timeout */ __u32 lrcvtime; /* timestamp of last received data packet */ __u16 last_seg_size; /* Size of last incoming segment */ diff --git a/net/core/sock.c b/net/core/sock.c index a0598518ce89..323892066def 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2565,8 +2565,12 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) u32 max_segs = 1; sk->sk_route_caps = dst->dev->features; - if (sk_is_tcp(sk)) + if (sk_is_tcp(sk)) { + struct inet_connection_sock *icsk = inet_csk(sk); + sk->sk_route_caps |= NETIF_F_GSO; + icsk->icsk_ack.dst_quick_ack = dst_metric(dst, RTAX_QUICKACK); + } if (sk->sk_route_caps & NETIF_F_GSO) sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; if (unlikely(sk->sk_gso_disabled)) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5c270cf96678..72382ee4456d 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -334,9 +334,8 @@ static void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks) static bool tcp_in_quickack_mode(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); - const struct dst_entry *dst = __sk_dst_get(sk); - return (dst && dst_metric(dst, RTAX_QUICKACK)) || + return icsk->icsk_ack.dst_quick_ack || (icsk->icsk_ack.quick && !inet_csk_in_pingpong_mode(sk)); } -- cgit v1.2.3 From 1305b0c22eca7373910f30ec31912796617b7121 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 13 Mar 2025 11:20:55 +0100 Subject: mptcp: pm: define struct mptcp_pm_ops In order to allow users to develop their own BPF-based path manager, this patch defines a struct ops "mptcp_pm_ops" for an MPTCP path manager, which contains a set of interfaces. Currently only init() and release() interfaces are included, subsequent patches will add others step by step. Add a set of functions to register, unregister, find and validate a given path manager struct ops. "list" is used to add this path manager to mptcp_pm_list list when it is registered. "name" is used to identify this path manager. mptcp_pm_find() uses "name" to find a path manager on the list. mptcp_pm_unregister is not used in this set, but will be invoked in .unreg of struct bpf_struct_ops. mptcp_pm_validate() will be invoked in .validate of struct bpf_struct_ops. That's why they are exported. Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250313-net-next-mptcp-pm-ops-intro-v1-6-f4e4a88efc50@kernel.org Signed-off-by: Paolo Abeni --- include/net/mptcp.h | 12 ++++++++++++ net/mptcp/pm.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ net/mptcp/protocol.h | 5 +++++ 3 files changed, 67 insertions(+) (limited to 'include/net') diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 2c85ca92bb1c..645d15695e3f 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -14,6 +14,7 @@ struct mptcp_info; struct mptcp_sock; +struct mptcp_pm_addr_entry; struct seq_file; /* MPTCP sk_buff extension data */ @@ -121,6 +122,17 @@ struct mptcp_sched_ops { void (*release)(struct mptcp_sock *msk); } ____cacheline_aligned_in_smp; +#define MPTCP_PM_NAME_MAX 16 + +struct mptcp_pm_ops { + char name[MPTCP_PM_NAME_MAX]; + struct module *owner; + struct list_head list; + + void (*init)(struct mptcp_sock *msk); + void (*release)(struct mptcp_sock *msk); +} ____cacheline_aligned_in_smp; + #ifdef CONFIG_MPTCP void mptcp_init(void); diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 85ee999729a1..f4948a2cf9be 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -5,6 +5,8 @@ */ #define pr_fmt(fmt) "MPTCP: " fmt +#include +#include #include "protocol.h" #include "mib.h" @@ -18,6 +20,9 @@ struct mptcp_pm_add_entry { struct mptcp_sock *sock; }; +static DEFINE_SPINLOCK(mptcp_pm_list_lock); +static LIST_HEAD(mptcp_pm_list); + /* path manager helpers */ /* if sk is ipv4 or ipv6_only allows only same-family local and remote addresses, @@ -1015,3 +1020,48 @@ void __init mptcp_pm_init(void) mptcp_pm_kernel_register(); mptcp_pm_nl_init(); } + +/* Must be called with rcu read lock held */ +struct mptcp_pm_ops *mptcp_pm_find(const char *name) +{ + struct mptcp_pm_ops *pm_ops; + + list_for_each_entry_rcu(pm_ops, &mptcp_pm_list, list) { + if (!strcmp(pm_ops->name, name)) + return pm_ops; + } + + return NULL; +} + +int mptcp_pm_validate(struct mptcp_pm_ops *pm_ops) +{ + return 0; +} + +int mptcp_pm_register(struct mptcp_pm_ops *pm_ops) +{ + int ret; + + ret = mptcp_pm_validate(pm_ops); + if (ret) + return ret; + + spin_lock(&mptcp_pm_list_lock); + if (mptcp_pm_find(pm_ops->name)) { + spin_unlock(&mptcp_pm_list_lock); + return -EEXIST; + } + list_add_tail_rcu(&pm_ops->list, &mptcp_pm_list); + spin_unlock(&mptcp_pm_list_lock); + + pr_debug("%s registered\n", pm_ops->name); + return 0; +} + +void mptcp_pm_unregister(struct mptcp_pm_ops *pm_ops) +{ + spin_lock(&mptcp_pm_list_lock); + list_del_rcu(&pm_ops->list); + spin_unlock(&mptcp_pm_list_lock); +} diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 15e2a03025ec..ac8a178426e4 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -1050,6 +1050,11 @@ int mptcp_pm_remove_addr(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_ void mptcp_pm_remove_addr_entry(struct mptcp_sock *msk, struct mptcp_pm_addr_entry *entry); +struct mptcp_pm_ops *mptcp_pm_find(const char *name); +int mptcp_pm_register(struct mptcp_pm_ops *pm_ops); +void mptcp_pm_unregister(struct mptcp_pm_ops *pm_ops); +int mptcp_pm_validate(struct mptcp_pm_ops *pm_ops); + void mptcp_userspace_pm_free_local_addr_list(struct mptcp_sock *msk); void mptcp_event(enum mptcp_event_type type, const struct mptcp_sock *msk, -- cgit v1.2.3 From fa3ee9dd8067e178b459f1df3081fd25fc03c4b5 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 13 Mar 2025 11:21:00 +0100 Subject: mptcp: sysctl: add available_path_managers Similarly to net.mptcp.available_schedulers, this patch adds a new one net.mptcp.available_path_managers to list the available path managers. Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250313-net-next-mptcp-pm-ops-intro-v1-11-f4e4a88efc50@kernel.org Signed-off-by: Paolo Abeni --- Documentation/networking/mptcp-sysctl.rst | 4 ++++ include/net/mptcp.h | 2 ++ net/mptcp/ctrl.c | 25 +++++++++++++++++++++++++ net/mptcp/pm.c | 19 +++++++++++++++++++ net/mptcp/protocol.h | 1 + 5 files changed, 51 insertions(+) (limited to 'include/net') diff --git a/Documentation/networking/mptcp-sysctl.rst b/Documentation/networking/mptcp-sysctl.rst index b78a2254d452..5bfab01eff5a 100644 --- a/Documentation/networking/mptcp-sysctl.rst +++ b/Documentation/networking/mptcp-sysctl.rst @@ -30,6 +30,10 @@ allow_join_initial_addr_port - BOOLEAN Default: 1 +available_path_managers - STRING + Shows the available path managers choices that are registered. More + path managers may be available, but not loaded. + available_schedulers - STRING Shows the available schedulers choices that are registered. More packet schedulers may be available, but not loaded. diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 645d15695e3f..bfbad695951c 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -123,6 +123,8 @@ struct mptcp_sched_ops { } ____cacheline_aligned_in_smp; #define MPTCP_PM_NAME_MAX 16 +#define MPTCP_PM_MAX 128 +#define MPTCP_PM_BUF_MAX (MPTCP_PM_NAME_MAX * MPTCP_PM_MAX) struct mptcp_pm_ops { char name[MPTCP_PM_NAME_MAX]; diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c index 4d8b31f32eb5..d9290c5bb6c7 100644 --- a/net/mptcp/ctrl.c +++ b/net/mptcp/ctrl.c @@ -253,6 +253,24 @@ static int proc_pm_type(const struct ctl_table *ctl, int write, return ret; } +static int proc_available_path_managers(const struct ctl_table *ctl, + int write, void *buffer, + size_t *lenp, loff_t *ppos) +{ + struct ctl_table tbl = { .maxlen = MPTCP_PM_BUF_MAX, }; + int ret; + + tbl.data = kmalloc(tbl.maxlen, GFP_USER); + if (!tbl.data) + return -ENOMEM; + + mptcp_pm_get_available(tbl.data, MPTCP_PM_BUF_MAX); + ret = proc_dostring(&tbl, write, buffer, lenp, ppos); + kfree(tbl.data); + + return ret; +} + static struct ctl_table mptcp_sysctl_table[] = { { .procname = "enabled", @@ -338,6 +356,12 @@ static struct ctl_table mptcp_sysctl_table[] = { .mode = 0644, .proc_handler = proc_path_manager, }, + { + .procname = "available_path_managers", + .maxlen = MPTCP_PM_BUF_MAX, + .mode = 0444, + .proc_handler = proc_available_path_managers, + }, }; static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet) @@ -364,6 +388,7 @@ static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet) table[9].data = &pernet->blackhole_timeout; table[10].data = &pernet->syn_retrans_before_tcp_fallback; table[11].data = &pernet->path_manager; + /* table[12] is for available_path_managers which is read-only info */ hdr = register_net_sysctl_sz(net, MPTCP_SYSCTL_PATH, table, ARRAY_SIZE(mptcp_sysctl_table)); diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 3896f21a46bd..18b19dbccbba 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -1070,3 +1070,22 @@ void mptcp_pm_unregister(struct mptcp_pm_ops *pm_ops) list_del_rcu(&pm_ops->list); spin_unlock(&mptcp_pm_list_lock); } + +/* Build string with list of available path manager values. + * Similar to tcp_get_available_congestion_control() + */ +void mptcp_pm_get_available(char *buf, size_t maxlen) +{ + struct mptcp_pm_ops *pm_ops; + size_t offs = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(pm_ops, &mptcp_pm_list, list) { + offs += snprintf(buf + offs, maxlen - offs, "%s%s", + offs == 0 ? "" : " ", pm_ops->name); + + if (WARN_ON_ONCE(offs >= maxlen)) + break; + } + rcu_read_unlock(); +} diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 818c2c648677..d409586b5977 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -1058,6 +1058,7 @@ struct mptcp_pm_ops *mptcp_pm_find(const char *name); int mptcp_pm_register(struct mptcp_pm_ops *pm_ops); void mptcp_pm_unregister(struct mptcp_pm_ops *pm_ops); int mptcp_pm_validate(struct mptcp_pm_ops *pm_ops); +void mptcp_pm_get_available(char *buf, size_t maxlen); void mptcp_userspace_pm_free_local_addr_list(struct mptcp_sock *msk); -- cgit v1.2.3 From eaaff9b6702e99be5d79135f2afa9fc48a0d59e0 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 20 Feb 2025 14:07:01 +0100 Subject: netfilter: fib: avoid lookup if socket is available In case the fib match is used from the input hook we can avoid the fib lookup if early demux assigned a socket for us: check that the input interface matches sk-cached one. Rework the existing 'lo bypass' logic to first check sk, then for loopback interface type to elide the fib lookup. This speeds up fib matching a little, before: 93.08 GBit/s (no rules at all) 75.1 GBit/s ("fib saddr . iif oif missing drop" in prerouting) 75.62 GBit/s ("fib saddr . iif oif missing drop" in input) After: 92.48 GBit/s (no rules at all) 75.62 GBit/s (fib rule in prerouting) 90.37 GBit/s (fib rule in input). Numbers for the 'no rules' and 'prerouting' are expected to closely match in-between runs, the 3rd/input test case exercises the the 'avoid lookup if cached ifindex in sk matches' case. Test used iperf3 via veth interface, lo can't be used due to existing loopback test. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nft_fib.h | 21 +++++++++++++++++++++ net/ipv4/netfilter/nft_fib_ipv4.c | 11 +++++------ net/ipv6/netfilter/nft_fib_ipv6.c | 19 ++++++++++--------- 3 files changed, 36 insertions(+), 15 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nft_fib.h b/include/net/netfilter/nft_fib.h index 38cae7113de4..6e202ed5e63f 100644 --- a/include/net/netfilter/nft_fib.h +++ b/include/net/netfilter/nft_fib.h @@ -18,6 +18,27 @@ nft_fib_is_loopback(const struct sk_buff *skb, const struct net_device *in) return skb->pkt_type == PACKET_LOOPBACK || in->flags & IFF_LOOPBACK; } +static inline bool nft_fib_can_skip(const struct nft_pktinfo *pkt) +{ + const struct net_device *indev = nft_in(pkt); + const struct sock *sk; + + switch (nft_hook(pkt)) { + case NF_INET_PRE_ROUTING: + case NF_INET_INGRESS: + case NF_INET_LOCAL_IN: + break; + default: + return false; + } + + sk = pkt->skb->sk; + if (sk && sk_fullsock(sk)) + return sk->sk_rx_dst_ifindex == indev->ifindex; + + return nft_fib_is_loopback(pkt->skb, indev); +} + int nft_fib_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset); int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]); diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c index 625adbc42037..9082ca17e845 100644 --- a/net/ipv4/netfilter/nft_fib_ipv4.c +++ b/net/ipv4/netfilter/nft_fib_ipv4.c @@ -71,6 +71,11 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct net_device *oif; const struct net_device *found; + if (nft_fib_can_skip(pkt)) { + nft_fib_store_result(dest, priv, nft_in(pkt)); + return; + } + /* * Do not set flowi4_oif, it restricts results (for example, asking * for oif 3 will get RTN_UNICAST result even if the daddr exits @@ -85,12 +90,6 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, else oif = NULL; - if (nft_hook(pkt) == NF_INET_PRE_ROUTING && - nft_fib_is_loopback(pkt->skb, nft_in(pkt))) { - nft_fib_store_result(dest, priv, nft_in(pkt)); - return; - } - iph = skb_header_pointer(pkt->skb, noff, sizeof(_iph), &_iph); if (!iph) { regs->verdict.code = NFT_BREAK; diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c index c9f1634b3838..7fd9d7b21cd4 100644 --- a/net/ipv6/netfilter/nft_fib_ipv6.c +++ b/net/ipv6/netfilter/nft_fib_ipv6.c @@ -170,6 +170,11 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, struct rt6_info *rt; int lookup_flags; + if (nft_fib_can_skip(pkt)) { + nft_fib_store_result(dest, priv, nft_in(pkt)); + return; + } + if (priv->flags & NFTA_FIB_F_IIF) oif = nft_in(pkt); else if (priv->flags & NFTA_FIB_F_OIF) @@ -181,17 +186,13 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, return; } - lookup_flags = nft_fib6_flowi_init(&fl6, priv, pkt, oif, iph); - - if (nft_hook(pkt) == NF_INET_PRE_ROUTING || - nft_hook(pkt) == NF_INET_INGRESS) { - if (nft_fib_is_loopback(pkt->skb, nft_in(pkt)) || - nft_fib_v6_skip_icmpv6(pkt->skb, pkt->tprot, iph)) { - nft_fib_store_result(dest, priv, nft_in(pkt)); - return; - } + if (nft_fib_v6_skip_icmpv6(pkt->skb, pkt->tprot, iph)) { + nft_fib_store_result(dest, priv, nft_in(pkt)); + return; } + lookup_flags = nft_fib6_flowi_init(&fl6, priv, pkt, oif, iph); + *dest = 0; rt = (void *)ip6_route_lookup(nft_net(pkt), &fl6, pkt->skb, lookup_flags); -- cgit v1.2.3 From a6984aa806c8cebd761e6bb1b7ec8c9cae4788d1 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 14 Mar 2025 17:00:41 +0800 Subject: net: mctp: Remove unnecessary cast in mctp_cb The void * cast in mctp_cb is unnecessary as it's already been done at the start of the function. Signed-off-by: Herbert Xu Acked-by: Jeremy Kerr Link: https://patch.msgid.link/Z9PwOQeBSYlgZlHq@gondor.apana.org.au Signed-off-by: Paolo Abeni --- include/net/mctp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/mctp.h b/include/net/mctp.h index 1ecbff7116f6..07d458990113 100644 --- a/include/net/mctp.h +++ b/include/net/mctp.h @@ -212,7 +212,7 @@ static inline struct mctp_skb_cb *mctp_cb(struct sk_buff *skb) BUILD_BUG_ON(sizeof(struct mctp_skb_cb) > sizeof(skb->cb)); WARN_ON(cb->magic != 0x4d435450); - return (void *)(skb->cb); + return cb; } /* If CONFIG_MCTP_FLOWS, we may add one of these as a SKB extension, -- cgit v1.2.3 From 2f6efbabceb6b2914ee9bafb86d9a51feae9cce8 Mon Sep 17 00:00:00 2001 From: Murad Masimov Date: Mon, 17 Mar 2025 13:53:52 +0300 Subject: ax25: Remove broken autobind Binding AX25 socket by using the autobind feature leads to memory leaks in ax25_connect() and also refcount leaks in ax25_release(). Memory leak was detected with kmemleak: ================================================================ unreferenced object 0xffff8880253cd680 (size 96): backtrace: __kmalloc_node_track_caller_noprof (./include/linux/kmemleak.h:43) kmemdup_noprof (mm/util.c:136) ax25_rt_autobind (net/ax25/ax25_route.c:428) ax25_connect (net/ax25/af_ax25.c:1282) __sys_connect_file (net/socket.c:2045) __sys_connect (net/socket.c:2064) __x64_sys_connect (net/socket.c:2067) do_syscall_64 (arch/x86/entry/common.c:52 arch/x86/entry/common.c:83) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) ================================================================ When socket is bound, refcounts must be incremented the way it is done in ax25_bind() and ax25_setsockopt() (SO_BINDTODEVICE). In case of autobind, the refcounts are not incremented. This bug leads to the following issue reported by Syzkaller: ================================================================ ax25_connect(): syz-executor318 uses autobind, please contact jreuter@yaina.de ------------[ cut here ]------------ refcount_t: decrement hit 0; leaking memory. WARNING: CPU: 0 PID: 5317 at lib/refcount.c:31 refcount_warn_saturate+0xfa/0x1d0 lib/refcount.c:31 Modules linked in: CPU: 0 UID: 0 PID: 5317 Comm: syz-executor318 Not tainted 6.14.0-rc4-syzkaller-00278-gece144f151ac #0 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2~bpo12+1 04/01/2014 RIP: 0010:refcount_warn_saturate+0xfa/0x1d0 lib/refcount.c:31 ... Call Trace: __refcount_dec include/linux/refcount.h:336 [inline] refcount_dec include/linux/refcount.h:351 [inline] ref_tracker_free+0x6af/0x7e0 lib/ref_tracker.c:236 netdev_tracker_free include/linux/netdevice.h:4302 [inline] netdev_put include/linux/netdevice.h:4319 [inline] ax25_release+0x368/0x960 net/ax25/af_ax25.c:1080 __sock_release net/socket.c:647 [inline] sock_close+0xbc/0x240 net/socket.c:1398 __fput+0x3e9/0x9f0 fs/file_table.c:464 __do_sys_close fs/open.c:1580 [inline] __se_sys_close fs/open.c:1565 [inline] __x64_sys_close+0x7f/0x110 fs/open.c:1565 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f ... ================================================================ Considering the issues above and the comments left in the code that say: "check if we can remove this feature. It is broken."; "autobinding in this may or may not work"; - it is better to completely remove this feature than to fix it because it is broken and leads to various kinds of memory bugs. Now calling connect() without first binding socket will result in an error (-EINVAL). Userspace software that relies on the autobind feature might get broken. However, this feature does not seem widely used with this specific driver as it was not reliable at any point of time, and it is already broken anyway. E.g. ax25-tools and ax25-apps packages for popular distributions do not use the autobind feature for AF_AX25. Found by Linux Verification Center (linuxtesting.org) with Syzkaller. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot+33841dc6aa3e1d86b78a@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=33841dc6aa3e1d86b78a Signed-off-by: Murad Masimov Signed-off-by: David S. Miller --- include/net/ax25.h | 1 - net/ax25/af_ax25.c | 30 +++++++-------------- net/ax25/ax25_route.c | 74 --------------------------------------------------- 3 files changed, 10 insertions(+), 95 deletions(-) (limited to 'include/net') diff --git a/include/net/ax25.h b/include/net/ax25.h index 4ee141aae0a2..a7bba42dde15 100644 --- a/include/net/ax25.h +++ b/include/net/ax25.h @@ -418,7 +418,6 @@ void ax25_rt_device_down(struct net_device *); int ax25_rt_ioctl(unsigned int, void __user *); extern const struct seq_operations ax25_rt_seqops; ax25_route *ax25_get_route(ax25_address *addr, struct net_device *dev); -int ax25_rt_autobind(ax25_cb *, ax25_address *); struct sk_buff *ax25_rt_build_path(struct sk_buff *, ax25_address *, ax25_address *, ax25_digi *); void ax25_rt_free(void); diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 9f3b8b682adb..3ee7dba34310 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -1270,28 +1270,18 @@ static int __must_check ax25_connect(struct socket *sock, } } - /* - * Must bind first - autobinding in this may or may not work. If - * the socket is already bound, check to see if the device has - * been filled in, error if it hasn't. - */ + /* Must bind first - autobinding does not work. */ if (sock_flag(sk, SOCK_ZAPPED)) { - /* check if we can remove this feature. It is broken. */ - printk(KERN_WARNING "ax25_connect(): %s uses autobind, please contact jreuter@yaina.de\n", - current->comm); - if ((err = ax25_rt_autobind(ax25, &fsa->fsa_ax25.sax25_call)) < 0) { - kfree(digi); - goto out_release; - } + kfree(digi); + err = -EINVAL; + goto out_release; + } - ax25_fillin_cb(ax25, ax25->ax25_dev); - ax25_cb_add(ax25); - } else { - if (ax25->ax25_dev == NULL) { - kfree(digi); - err = -EHOSTUNREACH; - goto out_release; - } + /* Check to see if the device has been filled in, error if it hasn't. */ + if (ax25->ax25_dev == NULL) { + kfree(digi); + err = -EHOSTUNREACH; + goto out_release; } if (sk->sk_type == SOCK_SEQPACKET && diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c index 69de75db0c9c..10577434f40b 100644 --- a/net/ax25/ax25_route.c +++ b/net/ax25/ax25_route.c @@ -373,80 +373,6 @@ ax25_route *ax25_get_route(ax25_address *addr, struct net_device *dev) return ax25_rt; } -/* - * Adjust path: If you specify a default route and want to connect - * a target on the digipeater path but w/o having a special route - * set before, the path has to be truncated from your target on. - */ -static inline void ax25_adjust_path(ax25_address *addr, ax25_digi *digipeat) -{ - int k; - - for (k = 0; k < digipeat->ndigi; k++) { - if (ax25cmp(addr, &digipeat->calls[k]) == 0) - break; - } - - digipeat->ndigi = k; -} - - -/* - * Find which interface to use. - */ -int ax25_rt_autobind(ax25_cb *ax25, ax25_address *addr) -{ - ax25_uid_assoc *user; - ax25_route *ax25_rt; - int err = 0; - - ax25_route_lock_use(); - ax25_rt = ax25_get_route(addr, NULL); - if (!ax25_rt) { - ax25_route_lock_unuse(); - return -EHOSTUNREACH; - } - rcu_read_lock(); - if ((ax25->ax25_dev = ax25_dev_ax25dev(ax25_rt->dev)) == NULL) { - err = -EHOSTUNREACH; - goto put; - } - - user = ax25_findbyuid(current_euid()); - if (user) { - ax25->source_addr = user->call; - ax25_uid_put(user); - } else { - if (ax25_uid_policy && !capable(CAP_NET_BIND_SERVICE)) { - err = -EPERM; - goto put; - } - ax25->source_addr = *(ax25_address *)ax25->ax25_dev->dev->dev_addr; - } - - if (ax25_rt->digipeat != NULL) { - ax25->digipeat = kmemdup(ax25_rt->digipeat, sizeof(ax25_digi), - GFP_ATOMIC); - if (ax25->digipeat == NULL) { - err = -ENOMEM; - goto put; - } - ax25_adjust_path(addr, ax25->digipeat); - } - - if (ax25->sk != NULL) { - local_bh_disable(); - bh_lock_sock(ax25->sk); - sock_reset_flag(ax25->sk, SOCK_ZAPPED); - bh_unlock_sock(ax25->sk); - local_bh_enable(); - } - -put: - rcu_read_unlock(); - ax25_route_lock_unuse(); - return err; -} struct sk_buff *ax25_rt_build_path(struct sk_buff *skb, ax25_address *src, ax25_address *dest, ax25_digi *digi) -- cgit v1.2.3 From 1937a0be28c01a13e18912602b8eff08d7db77cf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 17 Mar 2025 08:53:13 +0000 Subject: tcp: move icsk_clean_acked to a better location As a followup of my presentation in Zagreb for netdev 0x19: icsk_clean_acked is only used by TCP when/if CONFIG_TLS_DEVICE is enabled from tcp_ack(). Rename it to tcp_clean_acked, move it to tcp_sock structure in the tcp_sock_read_rx for better cache locality in TCP fast path. Define this field only when CONFIG_TLS_DEVICE is enabled saving 8 bytes on configs not using it. Signed-off-by: Eric Dumazet Reviewed-by: Neal Cardwell Reviewed-by: Sabrina Dubroca Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250317085313.2023214-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- Documentation/networking/net_cachelines/tcp_sock.rst | 1 + include/linux/tcp.h | 3 +++ include/net/inet_connection_sock.h | 2 -- include/net/tcp.h | 4 ++-- net/ipv4/tcp.c | 5 +++++ net/ipv4/tcp_input.c | 12 ++++++------ net/tls/tls_device.c | 8 ++++---- 7 files changed, 21 insertions(+), 14 deletions(-) (limited to 'include/net') diff --git a/Documentation/networking/net_cachelines/tcp_sock.rst b/Documentation/networking/net_cachelines/tcp_sock.rst index 1f79765072b1..bc9b2131bf7a 100644 --- a/Documentation/networking/net_cachelines/tcp_sock.rst +++ b/Documentation/networking/net_cachelines/tcp_sock.rst @@ -27,6 +27,7 @@ u32 dsack_dups u32 snd_una read_mostly read_write tcp_wnd_end,tcp_urg_mode,tcp_minshall_check,tcp_cwnd_validate(tx);tcp_ack,tcp_may_update_window,tcp_clean_rtx_queue(write),tcp_ack_tstamp(rx) u32 snd_sml read_write tcp_minshall_check,tcp_minshall_update u32 rcv_tstamp read_mostly tcp_ack +void * tcp_clean_acked read_mostly tcp_ack u32 lsndtime read_write tcp_slow_start_after_idle_check,tcp_event_data_sent u32 last_oow_ack_time u32 compressed_ack_rcv_nxt diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 159b2c59eb62..1669d95bb0f9 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -244,6 +244,9 @@ struct tcp_sock { struct minmax rtt_min; /* OOO segments go in this rbtree. Socket lock must be held. */ struct rb_root out_of_order_queue; +#if defined(CONFIG_TLS_DEVICE) + void (*tcp_clean_acked)(struct sock *sk, u32 acked_seq); +#endif u32 snd_ssthresh; /* Slow start size threshold */ u8 recvmsg_inq : 1;/* Indicate # of bytes in queue upon recvmsg */ __cacheline_group_end(tcp_sock_read_rx); diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index f736d3097e43..e8ed52fc603f 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -66,7 +66,6 @@ struct inet_connection_sock_af_ops { * @icsk_af_ops Operations which are AF_INET{4,6} specific * @icsk_ulp_ops Pluggable ULP control hook * @icsk_ulp_data ULP private data - * @icsk_clean_acked Clean acked data hook * @icsk_ca_state: Congestion control state * @icsk_retransmits: Number of unrecovered [RTO] timeouts * @icsk_pending: Scheduled timer event @@ -97,7 +96,6 @@ struct inet_connection_sock { const struct inet_connection_sock_af_ops *icsk_af_ops; const struct tcp_ulp_ops *icsk_ulp_ops; void __rcu *icsk_ulp_data; - void (*icsk_clean_acked)(struct sock *sk, u32 acked_seq); unsigned int (*icsk_sync_mss)(struct sock *sk, u32 pmtu); __u8 icsk_ca_state:5, icsk_ca_initialized:1, diff --git a/include/net/tcp.h b/include/net/tcp.h index d08fbf90495d..f8efe56bbccb 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2815,9 +2815,9 @@ extern struct static_key_false tcp_have_smc; #endif #if IS_ENABLED(CONFIG_TLS_DEVICE) -void clean_acked_data_enable(struct inet_connection_sock *icsk, +void clean_acked_data_enable(struct tcp_sock *tp, void (*cad)(struct sock *sk, u32 ack_seq)); -void clean_acked_data_disable(struct inet_connection_sock *icsk); +void clean_acked_data_disable(struct tcp_sock *tp); void clean_acked_data_flush(void); #endif diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 989c3c3d8e75..fde56d28f586 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -5026,7 +5026,12 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rtt_min); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, out_of_order_queue); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, snd_ssthresh); +#if IS_ENABLED(CONFIG_TLS_DEVICE) + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, tcp_clean_acked); + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_rx, 77); +#else CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_rx, 69); +#endif /* TX read-write hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, segs_out); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 72382ee4456d..a35018e2d0ba 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -119,18 +119,18 @@ int sysctl_tcp_max_orphans __read_mostly = NR_FILE; #if IS_ENABLED(CONFIG_TLS_DEVICE) static DEFINE_STATIC_KEY_DEFERRED_FALSE(clean_acked_data_enabled, HZ); -void clean_acked_data_enable(struct inet_connection_sock *icsk, +void clean_acked_data_enable(struct tcp_sock *tp, void (*cad)(struct sock *sk, u32 ack_seq)) { - icsk->icsk_clean_acked = cad; + tp->tcp_clean_acked = cad; static_branch_deferred_inc(&clean_acked_data_enabled); } EXPORT_SYMBOL_GPL(clean_acked_data_enable); -void clean_acked_data_disable(struct inet_connection_sock *icsk) +void clean_acked_data_disable(struct tcp_sock *tp) { static_branch_slow_dec_deferred(&clean_acked_data_enabled); - icsk->icsk_clean_acked = NULL; + tp->tcp_clean_acked = NULL; } EXPORT_SYMBOL_GPL(clean_acked_data_disable); @@ -3987,8 +3987,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) #if IS_ENABLED(CONFIG_TLS_DEVICE) if (static_branch_unlikely(&clean_acked_data_enabled.key)) - if (icsk->icsk_clean_acked) - icsk->icsk_clean_acked(sk, ack); + if (tp->tcp_clean_acked) + tp->tcp_clean_acked(sk, ack); #endif } diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index e50b6e71df13..f672a62a9a52 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -157,7 +157,7 @@ static void delete_all_records(struct tls_offload_context_tx *offload_ctx) offload_ctx->retransmit_hint = NULL; } -static void tls_icsk_clean_acked(struct sock *sk, u32 acked_seq) +static void tls_tcp_clean_acked(struct sock *sk, u32 acked_seq) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_record_info *info, *temp; @@ -204,7 +204,7 @@ void tls_device_sk_destruct(struct sock *sk) destroy_record(ctx->open_record); delete_all_records(ctx); crypto_free_aead(ctx->aead_send); - clean_acked_data_disable(inet_csk(sk)); + clean_acked_data_disable(tcp_sk(sk)); } tls_device_queue_ctx_destruction(tls_ctx); @@ -1126,7 +1126,7 @@ int tls_set_device_offload(struct sock *sk) start_marker_record->num_frags = 0; list_add_tail(&start_marker_record->list, &offload_ctx->records_list); - clean_acked_data_enable(inet_csk(sk), &tls_icsk_clean_acked); + clean_acked_data_enable(tcp_sk(sk), &tls_tcp_clean_acked); ctx->push_pending_record = tls_device_push_pending_record; /* TLS offload is greatly simplified if we don't send @@ -1172,7 +1172,7 @@ int tls_set_device_offload(struct sock *sk) release_lock: up_read(&device_offload_lock); - clean_acked_data_disable(inet_csk(sk)); + clean_acked_data_disable(tcp_sk(sk)); crypto_free_aead(offload_ctx->aead_send); free_offload_ctx: kfree(offload_ctx); -- cgit v1.2.3 From 66034f78a5583bc10c195647629a137e8ed02208 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 17 Mar 2025 23:01:07 -0700 Subject: tcp/dccp: Remove inet_connection_sock_af_ops.addr2sockaddr(). inet_connection_sock_af_ops.addr2sockaddr() hasn't been used at all in the git era. $ git grep addr2sockaddr $(git rev-list HEAD | tail -n 1) Let's remove it. Note that there was a 4 bytes hole after sockaddr_len and now it's 6 bytes, so the binary layout is not changed. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250318060112.3729-1-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/inet6_connection_sock.h | 2 -- include/net/inet_connection_sock.h | 4 ---- net/dccp/ipv4.c | 2 -- net/dccp/ipv6.c | 4 ---- net/ipv4/inet_connection_sock.c | 11 ----------- net/ipv4/tcp_ipv4.c | 2 -- net/ipv6/inet6_connection_sock.c | 14 -------------- net/ipv6/tcp_ipv6.c | 4 ---- 8 files changed, 43 deletions(-) (limited to 'include/net') diff --git a/include/net/inet6_connection_sock.h b/include/net/inet6_connection_sock.h index 025bd8d3c769..745891d2e113 100644 --- a/include/net/inet6_connection_sock.h +++ b/include/net/inet6_connection_sock.h @@ -21,8 +21,6 @@ struct sockaddr; struct dst_entry *inet6_csk_route_req(const struct sock *sk, struct flowi6 *fl6, const struct request_sock *req, u8 proto); -void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr); - int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl); struct dst_entry *inet6_csk_update_pmtu(struct sock *sk, u32 mtu); diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index e8ed52fc603f..09a9d333fa42 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -44,12 +44,10 @@ struct inet_connection_sock_af_ops { struct request_sock *req_unhash, bool *own_req); u16 net_header_len; - u16 sockaddr_len; int (*setsockopt)(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); int (*getsockopt)(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); - void (*addr2sockaddr)(struct sock *sk, struct sockaddr *); void (*mtu_reduced)(struct sock *sk); }; @@ -315,8 +313,6 @@ static inline __poll_t inet_csk_listen_poll(const struct sock *sk) int inet_csk_listen_start(struct sock *sk); void inet_csk_listen_stop(struct sock *sk); -void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr); - /* update the fast reuse flag when adding a socket */ void inet_csk_update_fastreuse(struct inet_bind_bucket *tb, struct sock *sk); diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index bfa529a54aca..2045ddac0fe9 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -934,8 +934,6 @@ static const struct inet_connection_sock_af_ops dccp_ipv4_af_ops = { .net_header_len = sizeof(struct iphdr), .setsockopt = ip_setsockopt, .getsockopt = ip_getsockopt, - .addr2sockaddr = inet_csk_addr2sockaddr, - .sockaddr_len = sizeof(struct sockaddr_in), }; static int dccp_v4_init_sock(struct sock *sk) diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 39ae9d89d7d4..e24dbffabfc1 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -988,8 +988,6 @@ static const struct inet_connection_sock_af_ops dccp_ipv6_af_ops = { .net_header_len = sizeof(struct ipv6hdr), .setsockopt = ipv6_setsockopt, .getsockopt = ipv6_getsockopt, - .addr2sockaddr = inet6_csk_addr2sockaddr, - .sockaddr_len = sizeof(struct sockaddr_in6), }; /* @@ -1004,8 +1002,6 @@ static const struct inet_connection_sock_af_ops dccp_ipv6_mapped = { .net_header_len = sizeof(struct iphdr), .setsockopt = ipv6_setsockopt, .getsockopt = ipv6_getsockopt, - .addr2sockaddr = inet6_csk_addr2sockaddr, - .sockaddr_len = sizeof(struct sockaddr_in6), }; static void dccp_v6_sk_destruct(struct sock *sk) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index e93c66034077..dd5cf8914a28 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -1553,17 +1553,6 @@ skip_child_forget: } EXPORT_SYMBOL_GPL(inet_csk_listen_stop); -void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr) -{ - struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; - const struct inet_sock *inet = inet_sk(sk); - - sin->sin_family = AF_INET; - sin->sin_addr.s_addr = inet->inet_daddr; - sin->sin_port = inet->inet_dport; -} -EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr); - static struct dst_entry *inet_csk_rebuild_route(struct sock *sk, struct flowi *fl) { const struct inet_sock *inet = inet_sk(sk); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 4fa4fbb0ad12..1cd0938d47e0 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2477,8 +2477,6 @@ const struct inet_connection_sock_af_ops ipv4_specific = { .net_header_len = sizeof(struct iphdr), .setsockopt = ip_setsockopt, .getsockopt = ip_getsockopt, - .addr2sockaddr = inet_csk_addr2sockaddr, - .sockaddr_len = sizeof(struct sockaddr_in), .mtu_reduced = tcp_v4_mtu_reduced, }; EXPORT_IPV6_MOD(ipv4_specific); diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 80043e46117c..dbcf556a35bb 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -56,20 +56,6 @@ struct dst_entry *inet6_csk_route_req(const struct sock *sk, } EXPORT_SYMBOL(inet6_csk_route_req); -void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr) -{ - struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) uaddr; - - sin6->sin6_family = AF_INET6; - sin6->sin6_addr = sk->sk_v6_daddr; - sin6->sin6_port = inet_sk(sk)->inet_dport; - /* We do not store received flowlabel for TCP */ - sin6->sin6_flowinfo = 0; - sin6->sin6_scope_id = ipv6_iface_scope_id(&sin6->sin6_addr, - sk->sk_bound_dev_if); -} -EXPORT_SYMBOL_GPL(inet6_csk_addr2sockaddr); - static inline struct dst_entry *__inet6_csk_dst_check(struct sock *sk, u32 cookie) { diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index e182ee0a2330..c134cf1a603a 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2068,8 +2068,6 @@ const struct inet_connection_sock_af_ops ipv6_specific = { .net_header_len = sizeof(struct ipv6hdr), .setsockopt = ipv6_setsockopt, .getsockopt = ipv6_getsockopt, - .addr2sockaddr = inet6_csk_addr2sockaddr, - .sockaddr_len = sizeof(struct sockaddr_in6), .mtu_reduced = tcp_v6_mtu_reduced, }; @@ -2102,8 +2100,6 @@ static const struct inet_connection_sock_af_ops ipv6_mapped = { .net_header_len = sizeof(struct iphdr), .setsockopt = ipv6_setsockopt, .getsockopt = ipv6_getsockopt, - .addr2sockaddr = inet6_csk_addr2sockaddr, - .sockaddr_len = sizeof(struct sockaddr_in6), .mtu_reduced = tcp_v4_mtu_reduced, }; -- cgit v1.2.3 From c353e8983e0dea5dbba7789033326e1ad34135b7 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 20 Mar 2025 19:22:38 +0100 Subject: net: introduce per netns packet chains Currently network taps unbound to any interface are linked in the global ptype_all list, affecting the performance in all the network namespaces. Add per netns ptypes chains, so that in the mentioned case only the netns owning the packet socket(s) is affected. While at that drop the global ptype_all list: no in kernel user registers a tap on "any" type without specifying either the target device or the target namespace (and IMHO doing that would not make any sense). Note that this adds a conditional in the fast path (to check for per netns ptype_specific list) and increases the dataset size by a cacheline (owing the per netns lists). Reviewed-by: Sabrina Dubroca Signed-off-by: Paolo Abeni Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/ae405f98875ee87f8150c460ad162de7e466f8a7.1742494826.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 12 +++++++++- include/net/hotdata.h | 1 - include/net/net_namespace.h | 3 +++ net/core/dev.c | 53 +++++++++++++++++++++++++++++++++++---------- net/core/hotdata.c | 1 - net/core/net-procfs.c | 28 ++++++++++++++++++------ net/core/net_namespace.c | 2 ++ 7 files changed, 78 insertions(+), 22 deletions(-) (limited to 'include/net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 0c5b1f7f8f3a..f22cca7c03ad 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4278,7 +4278,17 @@ static __always_inline int ____dev_forward_skb(struct net_device *dev, return 0; } -bool dev_nit_active(struct net_device *dev); +bool dev_nit_active_rcu(const struct net_device *dev); +static inline bool dev_nit_active(const struct net_device *dev) +{ + bool ret; + + rcu_read_lock(); + ret = dev_nit_active_rcu(dev); + rcu_read_unlock(); + return ret; +} + void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev); static inline void __dev_put(struct net_device *dev) diff --git a/include/net/hotdata.h b/include/net/hotdata.h index 30e9570beb2a..fda94b2647ff 100644 --- a/include/net/hotdata.h +++ b/include/net/hotdata.h @@ -23,7 +23,6 @@ struct net_hotdata { struct net_offload udpv6_offload; #endif struct list_head offload_base; - struct list_head ptype_all; struct kmem_cache *skbuff_cache; struct kmem_cache *skbuff_fclone_cache; struct kmem_cache *skb_small_head_cache; diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index f467a66abc6b..bd57d8fb54f1 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -83,6 +83,9 @@ struct net { struct llist_node defer_free_list; struct llist_node cleanup_list; /* namespaces on death row */ + struct list_head ptype_all; + struct list_head ptype_specific; + #ifdef CONFIG_KEYS struct key_tag *key_domain; /* Key domain of operation tag */ #endif diff --git a/net/core/dev.c b/net/core/dev.c index 235560341765..bcf81c3ff6a3 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -572,10 +572,18 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev) static inline struct list_head *ptype_head(const struct packet_type *pt) { - if (pt->type == htons(ETH_P_ALL)) - return pt->dev ? &pt->dev->ptype_all : &net_hotdata.ptype_all; - else - return pt->dev ? &pt->dev->ptype_specific : + if (pt->type == htons(ETH_P_ALL)) { + if (!pt->af_packet_net && !pt->dev) + return NULL; + + return pt->dev ? &pt->dev->ptype_all : + &pt->af_packet_net->ptype_all; + } + + if (pt->dev) + return &pt->dev->ptype_specific; + + return pt->af_packet_net ? &pt->af_packet_net->ptype_specific : &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; } @@ -596,6 +604,9 @@ void dev_add_pack(struct packet_type *pt) { struct list_head *head = ptype_head(pt); + if (WARN_ON_ONCE(!head)) + return; + spin_lock(&ptype_lock); list_add_rcu(&pt->list, head); spin_unlock(&ptype_lock); @@ -620,6 +631,9 @@ void __dev_remove_pack(struct packet_type *pt) struct list_head *head = ptype_head(pt); struct packet_type *pt1; + if (!head) + return; + spin_lock(&ptype_lock); list_for_each_entry(pt1, head, list) { @@ -2441,16 +2455,21 @@ static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb) } /** - * dev_nit_active - return true if any network interface taps are in use + * dev_nit_active_rcu - return true if any network interface taps are in use + * + * The caller must hold the RCU lock * * @dev: network device to check for the presence of taps */ -bool dev_nit_active(struct net_device *dev) +bool dev_nit_active_rcu(const struct net_device *dev) { - return !list_empty(&net_hotdata.ptype_all) || + /* Callers may hold either RCU or RCU BH lock */ + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); + + return !list_empty(&dev_net(dev)->ptype_all) || !list_empty(&dev->ptype_all); } -EXPORT_SYMBOL_GPL(dev_nit_active); +EXPORT_SYMBOL_GPL(dev_nit_active_rcu); /* * Support routine. Sends outgoing frames to any network @@ -2459,11 +2478,12 @@ EXPORT_SYMBOL_GPL(dev_nit_active); void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) { - struct list_head *ptype_list = &net_hotdata.ptype_all; struct packet_type *ptype, *pt_prev = NULL; + struct list_head *ptype_list; struct sk_buff *skb2 = NULL; rcu_read_lock(); + ptype_list = &dev_net_rcu(dev)->ptype_all; again: list_for_each_entry_rcu(ptype, ptype_list, list) { if (READ_ONCE(ptype->ignore_outgoing)) @@ -2507,7 +2527,7 @@ again: pt_prev = ptype; } - if (ptype_list == &net_hotdata.ptype_all) { + if (ptype_list != &dev->ptype_all) { ptype_list = &dev->ptype_all; goto again; } @@ -3752,7 +3772,7 @@ static int xmit_one(struct sk_buff *skb, struct net_device *dev, unsigned int len; int rc; - if (dev_nit_active(dev)) + if (dev_nit_active_rcu(dev)) dev_queue_xmit_nit(skb, dev); len = skb->len; @@ -5696,7 +5716,8 @@ another_round: if (pfmemalloc) goto skip_taps; - list_for_each_entry_rcu(ptype, &net_hotdata.ptype_all, list) { + list_for_each_entry_rcu(ptype, &dev_net_rcu(skb->dev)->ptype_all, + list) { if (pt_prev) ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; @@ -5808,6 +5829,14 @@ check_vlan_id: deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, &ptype_base[ntohs(type) & PTYPE_HASH_MASK]); + + /* orig_dev and skb->dev could belong to different netns; + * Even in such case we need to traverse only the list + * coming from skb->dev, as the ptype owner (packet socket) + * will use dev_net(skb->dev) to do namespace filtering. + */ + deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, + &dev_net_rcu(skb->dev)->ptype_specific); } deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, diff --git a/net/core/hotdata.c b/net/core/hotdata.c index d0aaaaa556f2..0bc893d5f07b 100644 --- a/net/core/hotdata.c +++ b/net/core/hotdata.c @@ -7,7 +7,6 @@ struct net_hotdata net_hotdata __cacheline_aligned = { .offload_base = LIST_HEAD_INIT(net_hotdata.offload_base), - .ptype_all = LIST_HEAD_INIT(net_hotdata.ptype_all), .gro_normal_batch = 8, .netdev_budget = 300, diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index fa6d3969734a..3e92bf0f9060 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -185,7 +185,13 @@ static void *ptype_get_idx(struct seq_file *seq, loff_t pos) } } - list_for_each_entry_rcu(pt, &net_hotdata.ptype_all, list) { + list_for_each_entry_rcu(pt, &seq_file_net(seq)->ptype_all, list) { + if (i == pos) + return pt; + ++i; + } + + list_for_each_entry_rcu(pt, &seq_file_net(seq)->ptype_specific, list) { if (i == pos) return pt; ++i; @@ -210,6 +216,7 @@ static void *ptype_seq_start(struct seq_file *seq, loff_t *pos) static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos) { + struct net *net = seq_file_net(seq); struct net_device *dev; struct packet_type *pt; struct list_head *nxt; @@ -232,15 +239,22 @@ static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos) goto found; } } - - nxt = net_hotdata.ptype_all.next; - goto ptype_all; + nxt = net->ptype_all.next; + goto net_ptype_all; } - if (pt->type == htons(ETH_P_ALL)) { -ptype_all: - if (nxt != &net_hotdata.ptype_all) + if (pt->af_packet_net) { +net_ptype_all: + if (nxt != &net->ptype_all && nxt != &net->ptype_specific) goto found; + + if (nxt == &net->ptype_all) { + /* continue with ->ptype_specific if it's not empty */ + nxt = net->ptype_specific.next; + if (nxt != &net->ptype_specific) + goto found; + } + hash = 0; nxt = ptype_base[0].next; } else diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 4303f2a49262..b0dfdf791ece 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -340,6 +340,8 @@ static __net_init void preinit_net(struct net *net, struct user_namespace *user_ lock_set_cmp_fn(&net->rtnl_mutex, rtnl_net_lock_cmp_fn, NULL); #endif + INIT_LIST_HEAD(&net->ptype_all); + INIT_LIST_HEAD(&net->ptype_specific); preinit_net_sysctl(net); } -- cgit v1.2.3 From f38805c5d26fe4af97837c10d58074a7496638bf Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Mon, 17 Mar 2025 20:03:13 +0800 Subject: tcp: support TCP_RTO_MIN_US for set/getsockopt use Support adjusting/reading RTO MIN for socket level by using set/getsockopt(). This new option has the same effect as TCP_BPF_RTO_MIN, which means it doesn't affect RTAX_RTO_MIN usage (by using ip route...). Considering that bpf option was implemented before this patch, so we need to use a standalone new option for pure tcp set/getsockopt() use. When the socket is created, its icsk_rto_min is set to the default value that is controlled by sysctl_tcp_rto_min_us. Then if application calls setsockopt() with TCP_RTO_MIN_US flag to pass a valid value, then icsk_rto_min will be overridden in jiffies unit. This patch adds WRITE_ONCE/READ_ONCE to avoid data-race around icsk_rto_min. Signed-off-by: Jason Xing Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250317120314.41404-2-kerneljasonxing@gmail.com Signed-off-by: Jakub Kicinski --- Documentation/networking/ip-sysctl.rst | 4 ++-- include/net/tcp.h | 2 +- include/uapi/linux/tcp.h | 1 + net/ipv4/tcp.c | 13 ++++++++++++- 4 files changed, 16 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 054561f8dcae..5c63ab928b97 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -1229,8 +1229,8 @@ tcp_pingpong_thresh - INTEGER tcp_rto_min_us - INTEGER Minimal TCP retransmission timeout (in microseconds). Note that the rto_min route option has the highest precedence for configuring this - setting, followed by the TCP_BPF_RTO_MIN socket option, followed by - this tcp_rto_min_us sysctl. + setting, followed by the TCP_BPF_RTO_MIN and TCP_RTO_MIN_US socket + options, followed by this tcp_rto_min_us sysctl. The recommended practice is to use a value less or equal to 200000 microseconds. diff --git a/include/net/tcp.h b/include/net/tcp.h index f8efe56bbccb..4450c384ef17 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -844,7 +844,7 @@ u32 tcp_delack_max(const struct sock *sk); static inline u32 tcp_rto_min(const struct sock *sk) { const struct dst_entry *dst = __sk_dst_get(sk); - u32 rto_min = inet_csk(sk)->icsk_rto_min; + u32 rto_min = READ_ONCE(inet_csk(sk)->icsk_rto_min); if (dst && dst_metric_locked(dst, RTAX_RTO_MIN)) rto_min = dst_metric_rtt(dst, RTAX_RTO_MIN); diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 92a2e79222ea..a126b3860e63 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -140,6 +140,7 @@ enum { #define TCP_IS_MPTCP 43 /* Is MPTCP being used? */ #define TCP_RTO_MAX_MS 44 /* max rto time in ms */ +#define TCP_RTO_MIN_US 45 /* min rto time in us */ #define TCP_REPAIR_ON 1 #define TCP_REPAIR_OFF 0 diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index fde56d28f586..a8a1370d642e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3352,7 +3352,7 @@ int tcp_disconnect(struct sock *sk, int flags) icsk->icsk_probes_out = 0; icsk->icsk_probes_tstamp = 0; icsk->icsk_rto = TCP_TIMEOUT_INIT; - icsk->icsk_rto_min = TCP_RTO_MIN; + WRITE_ONCE(icsk->icsk_rto_min, TCP_RTO_MIN); icsk->icsk_delack_max = TCP_DELACK_MAX; tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tcp_snd_cwnd_set(tp, TCP_INIT_CWND); @@ -3833,6 +3833,14 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname, return -EINVAL; WRITE_ONCE(inet_csk(sk)->icsk_rto_max, msecs_to_jiffies(val)); return 0; + case TCP_RTO_MIN_US: { + int rto_min = usecs_to_jiffies(val); + + if (rto_min > TCP_RTO_MIN || rto_min < TCP_TIMEOUT_MIN) + return -EINVAL; + WRITE_ONCE(inet_csk(sk)->icsk_rto_min, rto_min); + return 0; + } } sockopt_lock_sock(sk); @@ -4672,6 +4680,9 @@ zerocopy_rcv_out: case TCP_RTO_MAX_MS: val = jiffies_to_msecs(tcp_rto_max(sk)); break; + case TCP_RTO_MIN_US: + val = jiffies_to_usecs(READ_ONCE(inet_csk(sk)->icsk_rto_min)); + break; default: return -ENOPROTOOPT; } -- cgit v1.2.3 From f9af583a2c769962de599f64627829d5b7a8e5f2 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 17 Mar 2025 20:48:48 -0700 Subject: af_unix: Sort headers. This is a prep patch to make the following changes cleaner. No functional change intended. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Joe Damato Link: https://patch.msgid.link/20250318034934.86708-2-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/af_unix.h | 4 +-- net/unix/af_unix.c | 62 +++++++++++++++++++++++----------------------- net/unix/diag.c | 15 +++++------ net/unix/garbage.c | 17 ++++++------- net/unix/sysctl_net_unix.c | 1 - net/unix/unix_bpf.c | 4 +-- 6 files changed, 51 insertions(+), 52 deletions(-) (limited to 'include/net') diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 63129c79b8cb..c8dfdb41916d 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -2,10 +2,10 @@ #ifndef __LINUX_NET_AFUNIX_H #define __LINUX_NET_AFUNIX_H -#include -#include #include #include +#include +#include #include #if IS_ENABLED(CONFIG_UNIX) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 7f8f3859cdb3..1ff0ac99f3f3 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -77,46 +77,46 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include -#include -#include -#include -#include -#include -#include +#include +#include #include -#include -#include -#include +#include #include +#include #include -#include -#include -#include -#include +#include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include #include +#include +#include +#include +#include +#include +#include #include +#include #include -#include -#include +#include #include +#include +#include +#include +#include +#include +#include #include -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include static atomic_long_t unix_nr_socks; static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2]; diff --git a/net/unix/diag.c b/net/unix/diag.c index 9138af8b465e..ba715507556a 100644 --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -1,15 +1,16 @@ // SPDX-License-Identifier: GPL-2.0-only -#include -#include -#include -#include -#include + #include +#include +#include +#include +#include #include -#include +#include #include -#include +#include #include +#include static int sk_diag_dump_name(struct sock *sk, struct sk_buff *nlskb) { diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 9848b7b78701..6a641d4b5542 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -63,22 +63,21 @@ * wrt receive and holding up unrelated socket operations. */ +#include +#include #include -#include -#include -#include +#include #include -#include -#include #include -#include #include -#include +#include +#include +#include +#include #include - -#include #include #include +#include #include struct unix_sock *unix_get_socket(struct file *filp) diff --git a/net/unix/sysctl_net_unix.c b/net/unix/sysctl_net_unix.c index 357b3e5f3847..55118ae897d6 100644 --- a/net/unix/sysctl_net_unix.c +++ b/net/unix/sysctl_net_unix.c @@ -8,7 +8,6 @@ #include #include #include - #include static struct ctl_table unix_table[] = { diff --git a/net/unix/unix_bpf.c b/net/unix/unix_bpf.c index bca2d86ba97d..86598a959eaa 100644 --- a/net/unix/unix_bpf.c +++ b/net/unix/unix_bpf.c @@ -1,10 +1,10 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2021 Cong Wang */ -#include #include -#include +#include #include +#include #define unix_sk_has_data(__sk, __psock) \ ({ !skb_queue_empty(&__sk->sk_receive_queue) || \ -- cgit v1.2.3 From 84960bf2403123a5a15711d050455e195a8d5ba3 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 17 Mar 2025 20:48:49 -0700 Subject: af_unix: Move internal definitions to net/unix/. net/af_unix.h is included by core and some LSMs, but most definitions need not be. Let's move struct unix_{vertex,edge} to net/unix/garbage.c and other definitions to net/unix/af_unix.h. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Joe Damato Link: https://patch.msgid.link/20250318034934.86708-3-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/af_unix.h | 76 +--------------------------------------------- net/unix/af_unix.c | 2 ++ net/unix/af_unix.h | 75 +++++++++++++++++++++++++++++++++++++++++++++ net/unix/diag.c | 2 ++ net/unix/garbage.c | 18 +++++++++++ net/unix/sysctl_net_unix.c | 2 ++ net/unix/unix_bpf.c | 2 ++ 7 files changed, 102 insertions(+), 75 deletions(-) create mode 100644 net/unix/af_unix.h (limited to 'include/net') diff --git a/include/net/af_unix.h b/include/net/af_unix.h index c8dfdb41916d..b5d70baba52b 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -17,61 +17,17 @@ static inline struct unix_sock *unix_get_socket(struct file *filp) } #endif -extern unsigned int unix_tot_inflight; -void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver); -void unix_del_edges(struct scm_fp_list *fpl); -void unix_update_edges(struct unix_sock *receiver); -int unix_prepare_fpl(struct scm_fp_list *fpl); -void unix_destroy_fpl(struct scm_fp_list *fpl); -void unix_gc(void); -void wait_for_unix_gc(struct scm_fp_list *fpl); - -struct unix_vertex { - struct list_head edges; - struct list_head entry; - struct list_head scc_entry; - unsigned long out_degree; - unsigned long index; - unsigned long scc_index; -}; - -struct unix_edge { - struct unix_sock *predecessor; - struct unix_sock *successor; - struct list_head vertex_entry; - struct list_head stack_entry; -}; - -struct sock *unix_peer_get(struct sock *sk); - -#define UNIX_HASH_MOD (256 - 1) -#define UNIX_HASH_SIZE (256 * 2) -#define UNIX_HASH_BITS 8 - struct unix_address { refcount_t refcnt; int len; struct sockaddr_un name[]; }; -struct unix_skb_parms { - struct pid *pid; /* Skb credentials */ - kuid_t uid; - kgid_t gid; - struct scm_fp_list *fp; /* Passed files */ -#ifdef CONFIG_SECURITY_NETWORK - u32 secid; /* Security ID */ -#endif - u32 consumed; -} __randomize_layout; - struct scm_stat { atomic_t nr_fds; unsigned long nr_unix_fds; }; -#define UNIXCB(skb) (*(struct unix_skb_parms *)&((skb)->cb)) - /* The AF_UNIX socket */ struct unix_sock { /* WARNING: sk has to be the first member */ @@ -84,6 +40,7 @@ struct unix_sock { struct unix_vertex *vertex; spinlock_t lock; struct socket_wq peer_wq; +#define peer_wait peer_wq.wait wait_queue_entry_t peer_wake; struct scm_stat scm_stat; #if IS_ENABLED(CONFIG_AF_UNIX_OOB) @@ -94,35 +51,4 @@ struct unix_sock { #define unix_sk(ptr) container_of_const(ptr, struct unix_sock, sk) #define unix_peer(sk) (unix_sk(sk)->peer) -#define unix_state_lock(s) spin_lock(&unix_sk(s)->lock) -#define unix_state_unlock(s) spin_unlock(&unix_sk(s)->lock) - -#define peer_wait peer_wq.wait - -long unix_inq_len(struct sock *sk); -long unix_outq_len(struct sock *sk); - -int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, - int flags); -int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, - int flags); -#ifdef CONFIG_SYSCTL -int unix_sysctl_register(struct net *net); -void unix_sysctl_unregister(struct net *net); -#else -static inline int unix_sysctl_register(struct net *net) { return 0; } -static inline void unix_sysctl_unregister(struct net *net) {} -#endif - -#ifdef CONFIG_BPF_SYSCALL -extern struct proto unix_dgram_proto; -extern struct proto unix_stream_proto; - -int unix_dgram_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore); -int unix_stream_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore); -void __init unix_bpf_build_proto(void); -#else -static inline void __init unix_bpf_build_proto(void) -{} -#endif #endif diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 1ff0ac99f3f3..6390e04fe916 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -118,6 +118,8 @@ #include #include +#include "af_unix.h" + static atomic_long_t unix_nr_socks; static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2]; static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2]; diff --git a/net/unix/af_unix.h b/net/unix/af_unix.h new file mode 100644 index 000000000000..ed4aedc42813 --- /dev/null +++ b/net/unix/af_unix.h @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __AF_UNIX_H +#define __AF_UNIX_H + +#include + +#define UNIX_HASH_MOD (256 - 1) +#define UNIX_HASH_SIZE (256 * 2) +#define UNIX_HASH_BITS 8 + +#define unix_state_lock(s) spin_lock(&unix_sk(s)->lock) +#define unix_state_unlock(s) spin_unlock(&unix_sk(s)->lock) + +struct sock *unix_peer_get(struct sock *sk); + +struct unix_skb_parms { + struct pid *pid; /* skb credentials */ + kuid_t uid; + kgid_t gid; + struct scm_fp_list *fp; /* Passed files */ +#ifdef CONFIG_SECURITY_NETWORK + u32 secid; /* Security ID */ +#endif + u32 consumed; +} __randomize_layout; + +#define UNIXCB(skb) (*(struct unix_skb_parms *)&((skb)->cb)) + +/* GC for SCM_RIGHTS */ +extern unsigned int unix_tot_inflight; +void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver); +void unix_del_edges(struct scm_fp_list *fpl); +void unix_update_edges(struct unix_sock *receiver); +int unix_prepare_fpl(struct scm_fp_list *fpl); +void unix_destroy_fpl(struct scm_fp_list *fpl); +void unix_gc(void); +void wait_for_unix_gc(struct scm_fp_list *fpl); + +/* SOCK_DIAG */ +long unix_inq_len(struct sock *sk); +long unix_outq_len(struct sock *sk); + +/* sysctl */ +#ifdef CONFIG_SYSCTL +int unix_sysctl_register(struct net *net); +void unix_sysctl_unregister(struct net *net); +#else +static inline int unix_sysctl_register(struct net *net) +{ + return 0; +} + +static inline void unix_sysctl_unregister(struct net *net) +{ +} +#endif + +/* BPF SOCKMAP */ +int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, int flags); +int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, int flags); + +#ifdef CONFIG_BPF_SYSCALL +extern struct proto unix_dgram_proto; +extern struct proto unix_stream_proto; + +int unix_dgram_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore); +int unix_stream_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore); +void __init unix_bpf_build_proto(void); +#else +static inline void __init unix_bpf_build_proto(void) +{ +} +#endif + +#endif diff --git a/net/unix/diag.c b/net/unix/diag.c index ba715507556a..c7e8c7d008f6 100644 --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -12,6 +12,8 @@ #include #include +#include "af_unix.h" + static int sk_diag_dump_name(struct sock *sk, struct sk_buff *nlskb) { /* might or might not have a hash table lock */ diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 6a641d4b5542..8c8c7360349d 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -80,6 +80,24 @@ #include #include +#include "af_unix.h" + +struct unix_vertex { + struct list_head edges; + struct list_head entry; + struct list_head scc_entry; + unsigned long out_degree; + unsigned long index; + unsigned long scc_index; +}; + +struct unix_edge { + struct unix_sock *predecessor; + struct unix_sock *successor; + struct list_head vertex_entry; + struct list_head stack_entry; +}; + struct unix_sock *unix_get_socket(struct file *filp) { struct inode *inode = file_inode(filp); diff --git a/net/unix/sysctl_net_unix.c b/net/unix/sysctl_net_unix.c index 55118ae897d6..236b7faa9254 100644 --- a/net/unix/sysctl_net_unix.c +++ b/net/unix/sysctl_net_unix.c @@ -10,6 +10,8 @@ #include #include +#include "af_unix.h" + static struct ctl_table unix_table[] = { { .procname = "max_dgram_qlen", diff --git a/net/unix/unix_bpf.c b/net/unix/unix_bpf.c index 86598a959eaa..979dd4c4261a 100644 --- a/net/unix/unix_bpf.c +++ b/net/unix/unix_bpf.c @@ -6,6 +6,8 @@ #include #include +#include "af_unix.h" + #define unix_sk_has_data(__sk, __psock) \ ({ !skb_queue_empty(&__sk->sk_receive_queue) || \ !skb_queue_empty(&__psock->ingress_skb) || \ -- cgit v1.2.3 From 3056172a261c88fc834147f6427b09dfe4d4290b Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 17 Mar 2025 20:48:50 -0700 Subject: af_unix: Explicitly include headers for non-pointer struct fields. include/net/af_unix.h indirectly includes some definitions for structs. Let's include such headers explicitly. linux/atomic.h : scm_stat.nr_fds linux/net.h : unix_sock.peer_wq linux/path.h : unix_sock.path linux/spinlock.h : unix_sock.lock linux/wait.h : unix_sock.peer_wake uapi/linux/un.h : unix_address.name[] linux/socket.h is removed as the structs there are not used directly, and linux/un.h is clarified with uapi as un.h only exists under include/uapi. While at it, duplicate headers are removed from .c files. Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250318034934.86708-4-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/af_unix.h | 8 ++++++-- net/unix/af_unix.c | 3 --- net/unix/diag.c | 3 --- net/unix/garbage.c | 5 ----- net/unix/unix_bpf.c | 1 - 5 files changed, 6 insertions(+), 14 deletions(-) (limited to 'include/net') diff --git a/include/net/af_unix.h b/include/net/af_unix.h index b5d70baba52b..b588069ece7e 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -2,11 +2,15 @@ #ifndef __LINUX_NET_AFUNIX_H #define __LINUX_NET_AFUNIX_H +#include #include +#include +#include #include -#include -#include +#include +#include #include +#include #if IS_ENABLED(CONFIG_UNIX) struct unix_sock *unix_get_socket(struct file *filp); diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 6390e04fe916..c081440cf576 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -92,7 +92,6 @@ #include #include #include -#include #include #include #include @@ -110,12 +109,10 @@ #include #include #include -#include #include #include #include #include -#include #include #include "af_unix.h" diff --git a/net/unix/diag.c b/net/unix/diag.c index c7e8c7d008f6..8b2247e05596 100644 --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -3,13 +3,10 @@ #include #include #include -#include #include -#include #include #include #include -#include #include #include "af_unix.h" diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 8c8c7360349d..cd75502c47f1 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -66,18 +66,13 @@ #include #include #include -#include -#include #include #include #include #include #include -#include -#include #include #include -#include #include #include "af_unix.h" diff --git a/net/unix/unix_bpf.c b/net/unix/unix_bpf.c index 979dd4c4261a..e0d30d6d22ac 100644 --- a/net/unix/unix_bpf.c +++ b/net/unix/unix_bpf.c @@ -4,7 +4,6 @@ #include #include #include -#include #include "af_unix.h" -- cgit v1.2.3 From b709857ecbf511bb25603790ff9c3f12abe36559 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 19 Mar 2025 21:25:16 +0000 Subject: ipv6: fix _DEVADD() and _DEVUPD() macros ip6_rcv_core() is using: __IP6_ADD_STATS(net, idev, IPSTATS_MIB_NOECTPKTS + (ipv6_get_dsfield(hdr) & INET_ECN_MASK), max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs)); This is currently evaluating both expressions twice. Fix _DEVADD() and _DEVUPD() macros to evaluate their arguments once. Signed-off-by: Eric Dumazet Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250319212516.2385451-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/ipv6.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 9614006f483c..2ccdf85f34f1 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -246,17 +246,20 @@ extern int sysctl_mld_qrv; #define _DEVADD(net, statname, mod, idev, field, val) \ ({ \ struct inet6_dev *_idev = (idev); \ + unsigned long _field = (field); \ + unsigned long _val = (val); \ if (likely(_idev != NULL)) \ - mod##SNMP_ADD_STATS((_idev)->stats.statname, (field), (val)); \ - mod##SNMP_ADD_STATS((net)->mib.statname##_statistics, (field), (val));\ + mod##SNMP_ADD_STATS((_idev)->stats.statname, _field, _val); \ + mod##SNMP_ADD_STATS((net)->mib.statname##_statistics, _field, _val);\ }) #define _DEVUPD(net, statname, mod, idev, field, val) \ ({ \ struct inet6_dev *_idev = (idev); \ + unsigned long _val = (val); \ if (likely(_idev != NULL)) \ - mod##SNMP_UPD_PO_STATS((_idev)->stats.statname, field, (val)); \ - mod##SNMP_UPD_PO_STATS((net)->mib.statname##_statistics, field, (val));\ + mod##SNMP_UPD_PO_STATS((_idev)->stats.statname, field, _val); \ + mod##SNMP_UPD_PO_STATS((net)->mib.statname##_statistics, field, _val);\ }) /* MIBs */ -- cgit v1.2.3 From 0de2a5c4b824da2205658ebebb99a55c43cdf60f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 20 Mar 2025 12:16:04 +0000 Subject: tcp: avoid atomic operations on sk->sk_rmem_alloc TCP uses generic skb_set_owner_r() and sock_rfree() for received packets, with socket lock being owned. Switch to private versions, avoiding two atomic operations per packet. Signed-off-by: Eric Dumazet Reviewed-by: Neal Cardwell Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250320121604.3342831-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 15 +++++++++++++++ net/ipv4/tcp.c | 18 ++++++++++++++++-- net/ipv4/tcp_fastopen.c | 2 +- net/ipv4/tcp_input.c | 6 +++--- 4 files changed, 35 insertions(+), 6 deletions(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 4450c384ef17..df04dc09c519 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -779,6 +779,7 @@ static inline int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize) /* tcp.c */ void tcp_get_info(struct sock *, struct tcp_info *); +void tcp_sock_rfree(struct sk_buff *skb); /* Read 'sendfile()'-style from a TCP socket */ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, @@ -2898,4 +2899,18 @@ enum skb_drop_reason tcp_inbound_hash(struct sock *sk, const void *saddr, const void *daddr, int family, int dif, int sdif); +/* version of skb_set_owner_r() avoiding one atomic_add() */ +static inline void tcp_skb_set_owner_r(struct sk_buff *skb, struct sock *sk) +{ + skb_orphan(skb); + skb->sk = sk; + skb->destructor = tcp_sock_rfree; + + sock_owned_by_me(sk); + atomic_set(&sk->sk_rmem_alloc, + atomic_read(&sk->sk_rmem_alloc) + skb->truesize); + + sk_forward_alloc_add(sk, -skb->truesize); +} + #endif /* _TCP_H */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 6edc441b3702..ea8de00f669d 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1525,11 +1525,25 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied) __tcp_cleanup_rbuf(sk, copied); } +/* private version of sock_rfree() avoiding one atomic_sub() */ +void tcp_sock_rfree(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + unsigned int len = skb->truesize; + + sock_owned_by_me(sk); + atomic_set(&sk->sk_rmem_alloc, + atomic_read(&sk->sk_rmem_alloc) - len); + + sk_forward_alloc_add(sk, len); + sk_mem_reclaim(sk); +} + static void tcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb) { __skb_unlink(skb, &sk->sk_receive_queue); - if (likely(skb->destructor == sock_rfree)) { - sock_rfree(skb); + if (likely(skb->destructor == tcp_sock_rfree)) { + tcp_sock_rfree(skb); skb->destructor = NULL; skb->sk = NULL; return skb_attempt_defer_free(skb); diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 1a6b1bc54245..ca40665145c6 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -189,7 +189,7 @@ void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb) tcp_segs_in(tp, skb); __skb_pull(skb, tcp_hdrlen(skb)); sk_forced_mem_schedule(sk, skb->truesize); - skb_set_owner_r(skb, sk); + tcp_skb_set_owner_r(skb, sk); TCP_SKB_CB(skb)->seq++; TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_SYN; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a35018e2d0ba..e1f952fbac48 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5171,7 +5171,7 @@ end: if (tcp_is_sack(tp)) tcp_grow_window(sk, skb, false); skb_condense(skb); - skb_set_owner_r(skb, sk); + tcp_skb_set_owner_r(skb, sk); } } @@ -5187,7 +5187,7 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq); if (!eaten) { tcp_add_receive_queue(sk, skb); - skb_set_owner_r(skb, sk); + tcp_skb_set_owner_r(skb, sk); } return eaten; } @@ -5504,7 +5504,7 @@ skip_this: __skb_queue_before(list, skb, nskb); else __skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */ - skb_set_owner_r(nskb, sk); + tcp_skb_set_owner_r(nskb, sk); mptcp_skb_ext_move(nskb, skb); /* Copy data, releasing collapsed skbs. */ -- cgit v1.2.3 From 094ee6017ea09c11d6af187935a949df32803ce0 Mon Sep 17 00:00:00 2001 From: Wang Liang Date: Fri, 21 Mar 2025 12:48:52 +0800 Subject: bonding: check xdp prog when set bond mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Following operations can trigger a warning[1]: ip netns add ns1 ip netns exec ns1 ip link add bond0 type bond mode balance-rr ip netns exec ns1 ip link set dev bond0 xdp obj af_xdp_kern.o sec xdp ip netns exec ns1 ip link set bond0 type bond mode broadcast ip netns del ns1 When delete the namespace, dev_xdp_uninstall() is called to remove xdp program on bond dev, and bond_xdp_set() will check the bond mode. If bond mode is changed after attaching xdp program, the warning may occur. Some bond modes (broadcast, etc.) do not support native xdp. Set bond mode with xdp program attached is not good. Add check for xdp program when set bond mode. [1] ------------[ cut here ]------------ WARNING: CPU: 0 PID: 11 at net/core/dev.c:9912 unregister_netdevice_many_notify+0x8d9/0x930 Modules linked in: CPU: 0 UID: 0 PID: 11 Comm: kworker/u4:0 Not tainted 6.14.0-rc4 #107 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.15.0-0-g2dd4b9b3f840-prebuilt.qemu.org 04/01/2014 Workqueue: netns cleanup_net RIP: 0010:unregister_netdevice_many_notify+0x8d9/0x930 Code: 00 00 48 c7 c6 6f e3 a2 82 48 c7 c7 d0 b3 96 82 e8 9c 10 3e ... RSP: 0018:ffffc90000063d80 EFLAGS: 00000282 RAX: 00000000ffffffa1 RBX: ffff888004959000 RCX: 00000000ffffdfff RDX: 0000000000000000 RSI: 00000000ffffffea RDI: ffffc90000063b48 RBP: ffffc90000063e28 R08: ffffffff82d39b28 R09: 0000000000009ffb R10: 0000000000000175 R11: ffffffff82d09b40 R12: ffff8880049598e8 R13: 0000000000000001 R14: dead000000000100 R15: ffffc90000045000 FS: 0000000000000000(0000) GS:ffff888007a00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000000d406b60 CR3: 000000000483e000 CR4: 00000000000006f0 Call Trace: ? __warn+0x83/0x130 ? unregister_netdevice_many_notify+0x8d9/0x930 ? report_bug+0x18e/0x1a0 ? handle_bug+0x54/0x90 ? exc_invalid_op+0x18/0x70 ? asm_exc_invalid_op+0x1a/0x20 ? unregister_netdevice_many_notify+0x8d9/0x930 ? bond_net_exit_batch_rtnl+0x5c/0x90 cleanup_net+0x237/0x3d0 process_one_work+0x163/0x390 worker_thread+0x293/0x3b0 ? __pfx_worker_thread+0x10/0x10 kthread+0xec/0x1e0 ? __pfx_kthread+0x10/0x10 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x2f/0x50 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1a/0x30 ---[ end trace 0000000000000000 ]--- Fixes: 9e2ee5c7e7c3 ("net, bonding: Add XDP support to the bonding driver") Signed-off-by: Wang Liang Acked-by: Jussi Maki Reviewed-by: Nikolay Aleksandrov Reviewed-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20250321044852.1086551-1-wangliang74@huawei.com Signed-off-by: Jakub Kicinski --- drivers/net/bonding/bond_main.c | 8 ++++---- drivers/net/bonding/bond_options.c | 3 +++ include/net/bonding.h | 1 + 3 files changed, 8 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index e45bba240cbc..4da5fcb7def4 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -322,9 +322,9 @@ static bool bond_sk_check(struct bonding *bond) } } -static bool bond_xdp_check(struct bonding *bond) +bool bond_xdp_check(struct bonding *bond, int mode) { - switch (BOND_MODE(bond)) { + switch (mode) { case BOND_MODE_ROUNDROBIN: case BOND_MODE_ACTIVEBACKUP: return true; @@ -1937,7 +1937,7 @@ void bond_xdp_set_features(struct net_device *bond_dev) ASSERT_RTNL(); - if (!bond_xdp_check(bond) || !bond_has_slaves(bond)) { + if (!bond_xdp_check(bond, BOND_MODE(bond)) || !bond_has_slaves(bond)) { xdp_clear_features_flag(bond_dev); return; } @@ -5699,7 +5699,7 @@ static int bond_xdp_set(struct net_device *dev, struct bpf_prog *prog, ASSERT_RTNL(); - if (!bond_xdp_check(bond)) { + if (!bond_xdp_check(bond, BOND_MODE(bond))) { BOND_NL_ERR(dev, extack, "No native XDP support for the current bonding mode"); return -EOPNOTSUPP; diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c index d1b095af253b..91893c29b899 100644 --- a/drivers/net/bonding/bond_options.c +++ b/drivers/net/bonding/bond_options.c @@ -868,6 +868,9 @@ static bool bond_set_xfrm_features(struct bonding *bond) static int bond_option_mode_set(struct bonding *bond, const struct bond_opt_value *newval) { + if (bond->xdp_prog && !bond_xdp_check(bond, newval->value)) + return -EOPNOTSUPP; + if (!bond_mode_uses_arp(newval->value)) { if (bond->params.arp_interval) { netdev_dbg(bond->dev, "%s mode is incompatible with arp monitoring, start mii monitoring\n", diff --git a/include/net/bonding.h b/include/net/bonding.h index 8bb5f016969f..95f67b308c19 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -695,6 +695,7 @@ void bond_debug_register(struct bonding *bond); void bond_debug_unregister(struct bonding *bond); void bond_debug_reregister(struct bonding *bond); const char *bond_mode_name(int mode); +bool bond_xdp_check(struct bonding *bond, int mode); void bond_setup(struct net_device *bond_dev); unsigned int bond_get_num_tx_queues(void); int bond_netlink_init(void); -- cgit v1.2.3 From f3483c8e1da62993fe0f57af23b925de7661adaa Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Mar 2025 17:13:09 +0000 Subject: net: rfs: hash function change RFS is using two kinds of hash tables. First one is controlled by /proc/sys/net/core/rps_sock_flow_entries = 2^N and using the N low order bits of the l4 hash is good enough. Then each RX queue has its own hash table, controlled by /sys/class/net/eth1/queues/rx-$q/rps_flow_cnt = 2^X Current hash function, using the X low order bits is suboptimal, because RSS is usually using Func(hash) = (hash % power_of_two); For example, with 32 RX queues, 6 low order bits have no entropy for a given queue. Switch this hash function to hash_32(hash, log) to increase chances to use all possible slots and reduce collisions. Signed-off-by: Eric Dumazet Cc: Tom Herbert Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250321171309.634100-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/rps.h | 2 +- net/core/dev.c | 13 +++++++++---- net/core/net-sysfs.c | 4 ++-- 3 files changed, 12 insertions(+), 7 deletions(-) (limited to 'include/net') diff --git a/include/net/rps.h b/include/net/rps.h index a93401d23d66..e358e9711f27 100644 --- a/include/net/rps.h +++ b/include/net/rps.h @@ -39,7 +39,7 @@ struct rps_dev_flow { * The rps_dev_flow_table structure contains a table of flow mappings. */ struct rps_dev_flow_table { - unsigned int mask; + u8 log; struct rcu_head rcu; struct rps_dev_flow flows[]; }; diff --git a/net/core/dev.c b/net/core/dev.c index bcf81c3ff6a3..b6b1c7898281 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4751,6 +4751,11 @@ EXPORT_SYMBOL(rps_needed); struct static_key_false rfs_needed __read_mostly; EXPORT_SYMBOL(rfs_needed); +static u32 rfs_slot(u32 hash, const struct rps_dev_flow_table *flow_table) +{ + return hash_32(hash, flow_table->log); +} + static struct rps_dev_flow * set_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow *rflow, u16 next_cpu) @@ -4777,7 +4782,7 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb, flow_table = rcu_dereference(rxqueue->rps_flow_table); if (!flow_table) goto out; - flow_id = skb_get_hash(skb) & flow_table->mask; + flow_id = rfs_slot(skb_get_hash(skb), flow_table); rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, rxq_index, flow_id); if (rc < 0) @@ -4856,7 +4861,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, /* OK, now we know there is a match, * we can look at the local (per receive queue) flow table */ - rflow = &flow_table->flows[hash & flow_table->mask]; + rflow = &flow_table->flows[rfs_slot(hash, flow_table)]; tcpu = rflow->cpu; /* @@ -4923,13 +4928,13 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, rcu_read_lock(); flow_table = rcu_dereference(rxqueue->rps_flow_table); - if (flow_table && flow_id <= flow_table->mask) { + if (flow_table && flow_id < (1UL << flow_table->log)) { rflow = &flow_table->flows[flow_id]; cpu = READ_ONCE(rflow->cpu); if (READ_ONCE(rflow->filter) == filter_id && cpu < nr_cpu_ids && ((int)(READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head) - READ_ONCE(rflow->last_qtail)) < - (int)(10 * flow_table->mask))) + (int)(10 << flow_table->log))) expire = false; } rcu_read_unlock(); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index abaa1c919b98..b6fbe629ccee 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1056,7 +1056,7 @@ static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, rcu_read_lock(); flow_table = rcu_dereference(queue->rps_flow_table); if (flow_table) - val = (unsigned long)flow_table->mask + 1; + val = 1UL << flow_table->log; rcu_read_unlock(); return sysfs_emit(buf, "%lu\n", val); @@ -1109,7 +1109,7 @@ static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, if (!table) return -ENOMEM; - table->mask = mask; + table->log = ilog2(mask) + 1; for (count = 0; count <= mask; count++) table->flows[count].cpu = RPS_NO_CPU; } else { -- cgit v1.2.3 From 1f6154227b49c3d3f306f624858e695bfee50aae Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 25 Mar 2025 09:14:27 -0700 Subject: Revert "udp_tunnel: GRO optimizations" Revert "udp_tunnel: use static call for GRO hooks when possible" This reverts commit 311b36574ceaccfa3f91b74054a09cd4bb877702. Revert "udp_tunnel: create a fastpath GRO lookup." This reverts commit 8d4880db378350f8ed8969feea13bdc164564fc1. There are multiple small issues with the series. In the interest of unblocking the merge window let's opt for a revert. Link: https://lore.kernel.org/cover.1742557254.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/udp.h | 16 ----- include/net/netns/ipv4.h | 11 --- include/net/udp.h | 1 - include/net/udp_tunnel.h | 22 ------ net/ipv4/udp.c | 13 +--- net/ipv4/udp_offload.c | 167 +-------------------------------------------- net/ipv4/udp_tunnel_core.c | 14 ---- net/ipv6/udp.c | 2 - net/ipv6/udp_offload.c | 5 -- 9 files changed, 2 insertions(+), 249 deletions(-) (limited to 'include/net') diff --git a/include/linux/udp.h b/include/linux/udp.h index 895240177f4f..0807e21cfec9 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -101,13 +101,6 @@ struct udp_sock { /* Cache friendly copy of sk->sk_peek_off >= 0 */ bool peeking_with_offset; - - /* - * Accounting for the tunnel GRO fastpath. - * Unprotected by compilers guard, as it uses space available in - * the last UDP socket cacheline. - */ - struct hlist_node tunnel_list; }; #define udp_test_bit(nr, sk) \ @@ -226,13 +219,4 @@ static inline void udp_allow_gso(struct sock *sk) #define IS_UDPLITE(__sk) (__sk->sk_protocol == IPPROTO_UDPLITE) -static inline struct sock *udp_tunnel_sk(const struct net *net, bool is_ipv6) -{ -#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) - return rcu_dereference(net->ipv4.udp_tunnel_gro[is_ipv6].sk); -#else - return NULL; -#endif -} - #endif /* _LINUX_UDP_H */ diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 6373e3f17da8..650b2dc9199f 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -47,11 +47,6 @@ struct sysctl_fib_multipath_hash_seed { }; #endif -struct udp_tunnel_gro { - struct sock __rcu *sk; - struct hlist_head list; -}; - struct netns_ipv4 { /* Cacheline organization can be found documented in * Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst. @@ -90,11 +85,6 @@ struct netns_ipv4 { struct inet_timewait_death_row tcp_death_row; struct udp_table *udp_table; -#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) - /* Not in a pernet subsys because need to be available at GRO stage */ - struct udp_tunnel_gro udp_tunnel_gro[2]; -#endif - #ifdef CONFIG_SYSCTL struct ctl_table_header *forw_hdr; struct ctl_table_header *frags_hdr; @@ -287,5 +277,4 @@ struct netns_ipv4 { struct hlist_head *inet_addr_lst; struct delayed_work addr_chk_work; }; - #endif diff --git a/include/net/udp.h b/include/net/udp.h index a772510b2aa5..6e89520e100d 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -290,7 +290,6 @@ static inline void udp_lib_init_sock(struct sock *sk) struct udp_sock *up = udp_sk(sk); skb_queue_head_init(&up->reader_queue); - INIT_HLIST_NODE(&up->tunnel_list); up->forward_threshold = sk->sk_rcvbuf >> 2; set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags); } diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index a7b230867eb1..a93dc51f6323 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -203,28 +203,6 @@ static inline void udp_tunnel_encap_enable(struct sock *sk) udp_encap_enable(); } -#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) -void udp_tunnel_update_gro_lookup(struct net *net, struct sock *sk, bool add); -void udp_tunnel_update_gro_rcv(struct sock *sk, bool add); -#else -static inline void udp_tunnel_update_gro_lookup(struct net *net, - struct sock *sk, bool add) {} -static inline void udp_tunnel_update_gro_rcv(struct sock *sk, bool add) {} -#endif - -static inline void udp_tunnel_cleanup_gro(struct sock *sk) -{ - struct udp_sock *up = udp_sk(sk); - struct net *net = sock_net(sk); - - udp_tunnel_update_gro_rcv(sk, false); - - if (!up->tunnel_list.pprev) - return; - - udp_tunnel_update_gro_lookup(net, sk, false); -} - #define UDP_TUNNEL_NIC_MAX_TABLES 4 enum udp_tunnel_nic_info_flags { diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index db606f7e4163..d0bffcfa56d8 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2891,10 +2891,8 @@ void udp_destroy_sock(struct sock *sk) if (encap_destroy) encap_destroy(sk); } - if (udp_test_bit(ENCAP_ENABLED, sk)) { + if (udp_test_bit(ENCAP_ENABLED, sk)) static_branch_dec(&udp_encap_needed_key); - udp_tunnel_cleanup_gro(sk); - } } } @@ -3806,15 +3804,6 @@ fallback: static int __net_init udp_pernet_init(struct net *net) { -#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) - int i; - - /* No tunnel is configured */ - for (i = 0; i < ARRAY_SIZE(net->ipv4.udp_tunnel_gro); ++i) { - INIT_HLIST_HEAD(&net->ipv4.udp_tunnel_gro[i].list); - RCU_INIT_POINTER(net->ipv4.udp_tunnel_gro[i].sk, NULL); - } -#endif udp_sysctl_init(net); udp_set_table(net); diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 088aa8cb8ac0..2c0725583be3 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -12,164 +12,6 @@ #include #include #include -#include - -#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) - -/* - * Dummy GRO tunnel callback, exists mainly to avoid dangling/NULL - * values for the udp tunnel static call. - */ -static struct sk_buff *dummy_gro_rcv(struct sock *sk, - struct list_head *head, - struct sk_buff *skb) -{ - NAPI_GRO_CB(skb)->flush = 1; - return NULL; -} - -typedef struct sk_buff *(*udp_tunnel_gro_rcv_t)(struct sock *sk, - struct list_head *head, - struct sk_buff *skb); - -struct udp_tunnel_type_entry { - udp_tunnel_gro_rcv_t gro_receive; - refcount_t count; -}; - -#define UDP_MAX_TUNNEL_TYPES (IS_ENABLED(CONFIG_GENEVE) + \ - IS_ENABLED(CONFIG_VXLAN) * 2 + \ - IS_ENABLED(CONFIG_NET_FOU) * 2) - -DEFINE_STATIC_CALL(udp_tunnel_gro_rcv, dummy_gro_rcv); -static DEFINE_STATIC_KEY_FALSE(udp_tunnel_static_call); -static struct mutex udp_tunnel_gro_type_lock; -static struct udp_tunnel_type_entry udp_tunnel_gro_types[UDP_MAX_TUNNEL_TYPES]; -static unsigned int udp_tunnel_gro_type_nr; -static DEFINE_SPINLOCK(udp_tunnel_gro_lock); - -void udp_tunnel_update_gro_lookup(struct net *net, struct sock *sk, bool add) -{ - bool is_ipv6 = sk->sk_family == AF_INET6; - struct udp_sock *tup, *up = udp_sk(sk); - struct udp_tunnel_gro *udp_tunnel_gro; - - spin_lock(&udp_tunnel_gro_lock); - udp_tunnel_gro = &net->ipv4.udp_tunnel_gro[is_ipv6]; - if (add) - hlist_add_head(&up->tunnel_list, &udp_tunnel_gro->list); - else - hlist_del_init(&up->tunnel_list); - - if (udp_tunnel_gro->list.first && - !udp_tunnel_gro->list.first->next) { - tup = hlist_entry(udp_tunnel_gro->list.first, struct udp_sock, - tunnel_list); - - rcu_assign_pointer(udp_tunnel_gro->sk, (struct sock *)tup); - } else { - RCU_INIT_POINTER(udp_tunnel_gro->sk, NULL); - } - - spin_unlock(&udp_tunnel_gro_lock); -} -EXPORT_SYMBOL_GPL(udp_tunnel_update_gro_lookup); - -void udp_tunnel_update_gro_rcv(struct sock *sk, bool add) -{ - struct udp_tunnel_type_entry *cur = NULL; - struct udp_sock *up = udp_sk(sk); - int i, old_gro_type_nr; - - if (!up->gro_receive) - return; - - mutex_lock(&udp_tunnel_gro_type_lock); - for (i = 0; i < udp_tunnel_gro_type_nr; i++) - if (udp_tunnel_gro_types[i].gro_receive == up->gro_receive) - cur = &udp_tunnel_gro_types[i]; - - old_gro_type_nr = udp_tunnel_gro_type_nr; - if (add) { - /* - * Update the matching entry, if found, or add a new one - * if needed - */ - if (cur) { - refcount_inc(&cur->count); - goto out; - } - - if (unlikely(udp_tunnel_gro_type_nr == UDP_MAX_TUNNEL_TYPES)) { - pr_err_once("Too many UDP tunnel types, please increase UDP_MAX_TUNNEL_TYPES\n"); - /* Ensure static call will never be enabled */ - udp_tunnel_gro_type_nr = UDP_MAX_TUNNEL_TYPES + 2; - goto out; - } - - cur = &udp_tunnel_gro_types[udp_tunnel_gro_type_nr++]; - refcount_set(&cur->count, 1); - cur->gro_receive = up->gro_receive; - } else { - /* - * The stack cleanups only successfully added tunnel, the - * lookup on removal should never fail. - */ - if (WARN_ON_ONCE(!cur)) - goto out; - - if (!refcount_dec_and_test(&cur->count)) - goto out; - - /* avoid gaps, so that the enable tunnel has always id 0 */ - *cur = udp_tunnel_gro_types[--udp_tunnel_gro_type_nr]; - } - - if (udp_tunnel_gro_type_nr == 1) { - static_call_update(udp_tunnel_gro_rcv, - udp_tunnel_gro_types[0].gro_receive); - static_branch_enable(&udp_tunnel_static_call); - } else if (old_gro_type_nr == 1) { - static_branch_disable(&udp_tunnel_static_call); - static_call_update(udp_tunnel_gro_rcv, dummy_gro_rcv); - } - -out: - mutex_unlock(&udp_tunnel_gro_type_lock); -} -EXPORT_SYMBOL_GPL(udp_tunnel_update_gro_rcv); - -static void udp_tunnel_gro_init(void) -{ - mutex_init(&udp_tunnel_gro_type_lock); -} - -static struct sk_buff *udp_tunnel_gro_rcv(struct sock *sk, - struct list_head *head, - struct sk_buff *skb) -{ - if (static_branch_likely(&udp_tunnel_static_call)) { - if (unlikely(gro_recursion_inc_test(skb))) { - NAPI_GRO_CB(skb)->flush |= 1; - return NULL; - } - return static_call(udp_tunnel_gro_rcv)(sk, head, skb); - } - return call_gro_receive_sk(udp_sk(sk)->gro_receive, sk, head, skb); -} - -#else - -static void udp_tunnel_gro_init(void) {} - -static struct sk_buff *udp_tunnel_gro_rcv(struct sock *sk, - struct list_head *head, - struct sk_buff *skb) -{ - return call_gro_receive_sk(udp_sk(sk)->gro_receive, sk, head, skb); -} - -#endif static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, netdev_features_t features, @@ -780,7 +622,7 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */ skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr)); - pp = udp_tunnel_gro_rcv(sk, head, skb); + pp = call_gro_receive_sk(udp_sk(sk)->gro_receive, sk, head, skb); out: skb_gro_flush_final(skb, pp, flush); @@ -793,13 +635,8 @@ static struct sock *udp4_gro_lookup_skb(struct sk_buff *skb, __be16 sport, { const struct iphdr *iph = skb_gro_network_header(skb); struct net *net = dev_net_rcu(skb->dev); - struct sock *sk; int iif, sdif; - sk = udp_tunnel_sk(net, false); - if (sk && dport == htons(sk->sk_num)) - return sk; - inet_get_iif_sdif(skb, &iif, &sdif); return __udp4_lib_lookup(net, iph->saddr, sport, @@ -930,7 +767,5 @@ int __init udpv4_offload_init(void) .gro_complete = udp4_gro_complete, }, }; - - udp_tunnel_gro_init(); return inet_add_offload(&net_hotdata.udpv4_offload, IPPROTO_UDP); } diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c index c49fceea8313..619a53eb672d 100644 --- a/net/ipv4/udp_tunnel_core.c +++ b/net/ipv4/udp_tunnel_core.c @@ -58,15 +58,6 @@ error: } EXPORT_SYMBOL(udp_sock_create4); -static bool sk_saddr_any(struct sock *sk) -{ -#if IS_ENABLED(CONFIG_IPV6) - return ipv6_addr_any(&sk->sk_v6_rcv_saddr); -#else - return !sk->sk_rcv_saddr; -#endif -} - void setup_udp_tunnel_sock(struct net *net, struct socket *sock, struct udp_tunnel_sock_cfg *cfg) { @@ -89,11 +80,6 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock, udp_sk(sk)->gro_complete = cfg->gro_complete; udp_tunnel_encap_enable(sk); - - udp_tunnel_update_gro_rcv(sock->sk, true); - - if (!sk->sk_dport && !sk->sk_bound_dev_if && sk_saddr_any(sock->sk)) - udp_tunnel_update_gro_lookup(net, sock->sk, true); } EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 7317f8e053f1..024458ef163c 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -46,7 +46,6 @@ #include #include #include -#include #include #include #include @@ -1826,7 +1825,6 @@ void udpv6_destroy_sock(struct sock *sk) if (udp_test_bit(ENCAP_ENABLED, sk)) { static_branch_dec(&udpv6_encap_needed_key); udp_encap_disable(); - udp_tunnel_cleanup_gro(sk); } } } diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index d8445ac1b2e4..404212dfc99a 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -118,13 +118,8 @@ static struct sock *udp6_gro_lookup_skb(struct sk_buff *skb, __be16 sport, { const struct ipv6hdr *iph = skb_gro_network_header(skb); struct net *net = dev_net_rcu(skb->dev); - struct sock *sk; int iif, sdif; - sk = udp_tunnel_sk(net, true); - if (sk && dport == htons(sk->sk_num)) - return sk; - inet6_get_iif_sdif(skb, &iif, &sdif); return __udp6_lib_lookup(net, &iph->saddr, sport, -- cgit v1.2.3 From 60bfe8a7dc424728fb1d83f43ae21384952ba353 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Mon, 27 Jan 2025 21:37:16 +0000 Subject: Bluetooth: MGMT: Remove unused mgmt_*_discovery_complete mgmt_start_discovery_complete() and mgmt_stop_discovery_complete() last uses were removed in 2022 by commit ec2904c259c5 ("Bluetooth: Remove dead code from hci_request.c") Remove them. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Simon Horman Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 2 -- net/bluetooth/mgmt.c | 40 ---------------------------------------- 2 files changed, 42 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 6281063cbd8e..131ff1f4ebef 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -2354,8 +2354,6 @@ void mgmt_auth_enable_complete(struct hci_dev *hdev, u8 status); void mgmt_set_class_of_dev_complete(struct hci_dev *hdev, u8 *dev_class, u8 status); void mgmt_set_local_name_complete(struct hci_dev *hdev, u8 *name, u8 status); -void mgmt_start_discovery_complete(struct hci_dev *hdev, u8 status); -void mgmt_stop_discovery_complete(struct hci_dev *hdev, u8 status); void mgmt_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, u8 addr_type, u8 *dev_class, s8 rssi, u32 flags, u8 *eir, u16 eir_len, u8 *scan_rsp, u8 scan_rsp_len, diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 621c555f639b..12835e5054be 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -5743,29 +5743,6 @@ done: return err; } -void mgmt_start_discovery_complete(struct hci_dev *hdev, u8 status) -{ - struct mgmt_pending_cmd *cmd; - - bt_dev_dbg(hdev, "status %u", status); - - hci_dev_lock(hdev); - - cmd = pending_find(MGMT_OP_START_DISCOVERY, hdev); - if (!cmd) - cmd = pending_find(MGMT_OP_START_SERVICE_DISCOVERY, hdev); - - if (!cmd) - cmd = pending_find(MGMT_OP_START_LIMITED_DISCOVERY, hdev); - - if (cmd) { - cmd->cmd_complete(cmd, mgmt_status(status)); - mgmt_pending_remove(cmd); - } - - hci_dev_unlock(hdev); -} - static bool discovery_type_is_valid(struct hci_dev *hdev, uint8_t type, uint8_t *mgmt_status) { @@ -6018,23 +5995,6 @@ failed: return err; } -void mgmt_stop_discovery_complete(struct hci_dev *hdev, u8 status) -{ - struct mgmt_pending_cmd *cmd; - - bt_dev_dbg(hdev, "status %u", status); - - hci_dev_lock(hdev); - - cmd = pending_find(MGMT_OP_STOP_DISCOVERY, hdev); - if (cmd) { - cmd->cmd_complete(cmd, mgmt_status(status)); - mgmt_pending_remove(cmd); - } - - hci_dev_unlock(hdev); -} - static void stop_discovery_complete(struct hci_dev *hdev, void *data, int err) { struct mgmt_pending_cmd *cmd = data; -- cgit v1.2.3 From c9d84da18d1e0d28a7e16ca6df8e6d47570501d4 Mon Sep 17 00:00:00 2001 From: Easwar Hariharan Date: Wed, 19 Feb 2025 22:51:32 +0000 Subject: Bluetooth: L2CAP: convert timeouts to secs_to_jiffies() Commit b35108a51cf7 ("jiffies: Define secs_to_jiffies()") introduced secs_to_jiffies(). As the value here is a multiple of 1000, use secs_to_jiffies() instead of msecs_to_jiffies() for readability. Signed-off-by: Easwar Hariharan Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/l2cap.h | 4 ++-- net/bluetooth/l2cap_core.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h index 9189354c568f..0bf8cb17a6e8 100644 --- a/include/net/bluetooth/l2cap.h +++ b/include/net/bluetooth/l2cap.h @@ -38,8 +38,8 @@ #define L2CAP_DEFAULT_TX_WINDOW 63 #define L2CAP_DEFAULT_EXT_WINDOW 0x3FFF #define L2CAP_DEFAULT_MAX_TX 3 -#define L2CAP_DEFAULT_RETRANS_TO 2000 /* 2 seconds */ -#define L2CAP_DEFAULT_MONITOR_TO 12000 /* 12 seconds */ +#define L2CAP_DEFAULT_RETRANS_TO 2 /* seconds */ +#define L2CAP_DEFAULT_MONITOR_TO 12 /* seconds */ #define L2CAP_DEFAULT_MAX_PDU_SIZE 1492 /* Sized for AMP packet */ #define L2CAP_DEFAULT_ACK_TO 200 #define L2CAP_DEFAULT_MAX_SDU_SIZE 0xFFFF diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index c27ea70f71e1..7b4adab353cf 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -282,7 +282,7 @@ static void __set_retrans_timer(struct l2cap_chan *chan) if (!delayed_work_pending(&chan->monitor_timer) && chan->retrans_timeout) { l2cap_set_timer(chan, &chan->retrans_timer, - msecs_to_jiffies(chan->retrans_timeout)); + secs_to_jiffies(chan->retrans_timeout)); } } @@ -291,7 +291,7 @@ static void __set_monitor_timer(struct l2cap_chan *chan) __clear_retrans_timer(chan); if (chan->monitor_timeout) { l2cap_set_timer(chan, &chan->monitor_timer, - msecs_to_jiffies(chan->monitor_timeout)); + secs_to_jiffies(chan->monitor_timeout)); } } -- cgit v1.2.3 From ff26b2dd6568392f60fa67a4e58279938025c3af Mon Sep 17 00:00:00 2001 From: Pedro Nishiyama Date: Sat, 1 Mar 2025 03:22:58 -0300 Subject: Bluetooth: Add quirk for broken READ_VOICE_SETTING Some fake controllers cannot be initialized because they return a smaller report than expected for READ_VOICE_SETTING. Signed-off-by: Pedro Nishiyama Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci.h | 8 ++++++++ include/net/bluetooth/hci_core.h | 4 ++++ net/bluetooth/hci_sync.c | 3 +++ 3 files changed, 15 insertions(+) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 3ec915738112..295d97e312e1 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -354,6 +354,14 @@ enum { * during the hdev->setup vendor callback. */ HCI_QUIRK_FIXUP_LE_EXT_ADV_REPORT_PHY, + + /* When this quirk is set, the HCI_OP_READ_VOICE_SETTING command is + * skipped. This is required for a subset of the CSR controller clones + * which erroneously claim to support it. + * + * This quirk must be set before hci_register_dev is called. + */ + HCI_QUIRK_BROKEN_READ_VOICE_SETTING, }; /* HCI device flags */ diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 131ff1f4ebef..7966db4038cc 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1925,6 +1925,10 @@ void hci_conn_del_sysfs(struct hci_conn *conn); ((dev)->commands[20] & 0x10 && \ !test_bit(HCI_QUIRK_BROKEN_READ_ENC_KEY_SIZE, &hdev->quirks)) +#define read_voice_setting_capable(dev) \ + ((dev)->commands[9] & 0x04 && \ + !test_bit(HCI_QUIRK_BROKEN_READ_VOICE_SETTING, &(dev)->quirks)) + /* Use enhanced synchronous connection if command is supported and its quirk * has not been set. */ diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index a43749aebf76..fd0785838760 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -3696,6 +3696,9 @@ static int hci_read_local_name_sync(struct hci_dev *hdev) /* Read Voice Setting */ static int hci_read_voice_setting_sync(struct hci_dev *hdev) { + if (!read_voice_setting_capable(hdev)) + return 0; + return __hci_cmd_sync_status(hdev, HCI_OP_READ_VOICE_SETTING, 0, NULL, HCI_CMD_TIMEOUT); } -- cgit v1.2.3 From 127881334eaad639e0a19a399ee8c91d6c9dc982 Mon Sep 17 00:00:00 2001 From: Pedro Nishiyama Date: Sat, 1 Mar 2025 03:22:59 -0300 Subject: Bluetooth: Add quirk for broken READ_PAGE_SCAN_TYPE Some fake controllers cannot be initialized because they return a smaller report than expected for READ_PAGE_SCAN_TYPE. Signed-off-by: Pedro Nishiyama Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci.h | 8 ++++++++ net/bluetooth/hci_sync.c | 3 ++- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 295d97e312e1..aa684d2b079f 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -362,6 +362,14 @@ enum { * This quirk must be set before hci_register_dev is called. */ HCI_QUIRK_BROKEN_READ_VOICE_SETTING, + + /* When this quirk is set, the HCI_OP_READ_PAGE_SCAN_TYPE command is + * skipped. This is required for a subset of the CSR controller clones + * which erroneously claim to support it. + * + * This quirk must be set before hci_register_dev is called. + */ + HCI_QUIRK_BROKEN_READ_PAGE_SCAN_TYPE, }; /* HCI device flags */ diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index fd0785838760..c4c2cf51b219 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -4132,7 +4132,8 @@ static int hci_read_page_scan_type_sync(struct hci_dev *hdev) * support the Read Page Scan Type command. Check support for * this command in the bit mask of supported commands. */ - if (!(hdev->commands[13] & 0x01)) + if (!(hdev->commands[13] & 0x01) || + test_bit(HCI_QUIRK_BROKEN_READ_PAGE_SCAN_TYPE, &hdev->quirks)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_READ_PAGE_SCAN_TYPE, -- cgit v1.2.3 From 13218453521d75916dfed55efb8e809bfc03cb4b Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 12 Mar 2025 11:14:20 -0400 Subject: Bluetooth: hci_core: Enable buffer flow control for SCO/eSCO This enables buffer flow control for SCO/eSCO (see: Bluetooth Core 6.0 spec: 6.22. Synchronous Flow Control Enable), recently this has caused the following problem and is actually a nice addition for the likes of Socket TX complete: < HCI Command: Read Buffer Size (0x04|0x0005) plen 0 > HCI Event: Command Complete (0x0e) plen 11 Read Buffer Size (0x04|0x0005) ncmd 1 Status: Success (0x00) ACL MTU: 1021 ACL max packet: 5 SCO MTU: 240 SCO max packet: 8 ... < SCO Data TX: Handle 257 flags 0x00 dlen 120 < SCO Data TX: Handle 257 flags 0x00 dlen 120 < SCO Data TX: Handle 257 flags 0x00 dlen 120 < SCO Data TX: Handle 257 flags 0x00 dlen 120 < SCO Data TX: Handle 257 flags 0x00 dlen 120 < SCO Data TX: Handle 257 flags 0x00 dlen 120 < SCO Data TX: Handle 257 flags 0x00 dlen 120 < SCO Data TX: Handle 257 flags 0x00 dlen 120 < SCO Data TX: Handle 257 flags 0x00 dlen 120 > HCI Event: Hardware Error (0x10) plen 1 Code: 0x0a To fix the code will now attempt to enable buffer flow control when HCI_QUIRK_SYNC_FLOWCTL_SUPPORTED is set by the driver: < HCI Command: Write Sync Fl.. (0x03|0x002f) plen 1 Flow control: Enabled (0x01) > HCI Event: Command Complete (0x0e) plen 4 Write Sync Flow Control Enable (0x03|0x002f) ncmd 1 Status: Success (0x00) On success then HCI_SCO_FLOWCTL would be set which indicates sco_cnt shall be used for flow contro. Fixes: 7fedd3bb6b77 ("Bluetooth: Prioritize SCO traffic") Signed-off-by: Luiz Augusto von Dentz Tested-by: Pauli Virtanen --- include/net/bluetooth/hci.h | 13 +++++++++ include/net/bluetooth/hci_core.h | 1 + net/bluetooth/hci_core.c | 62 ++++++++++++++++++---------------------- net/bluetooth/hci_event.c | 2 ++ net/bluetooth/hci_sync.c | 24 ++++++++++++++++ 5 files changed, 68 insertions(+), 34 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index aa684d2b079f..6da61c185c94 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -208,6 +208,13 @@ enum { */ HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, + /* When this quirk is set consider Sync Flow Control as supported by + * the driver. + * + * This quirk must be set before hci_register_dev is called. + */ + HCI_QUIRK_SYNC_FLOWCTL_SUPPORTED, + /* When this quirk is set, the LE states reported through the * HCI_LE_READ_SUPPORTED_STATES are invalid/broken. * @@ -448,6 +455,7 @@ enum { HCI_WIDEBAND_SPEECH_ENABLED, HCI_EVENT_FILTER_CONFIGURED, HCI_PA_SYNC, + HCI_SCO_FLOWCTL, HCI_DUT_MODE, HCI_VENDOR_DIAG, @@ -1544,6 +1552,11 @@ struct hci_rp_read_tx_power { __s8 tx_power; } __packed; +#define HCI_OP_WRITE_SYNC_FLOWCTL 0x0c2f +struct hci_cp_write_sync_flowctl { + __u8 enable; +} __packed; + #define HCI_OP_READ_PAGE_SCAN_TYPE 0x0c46 struct hci_rp_read_page_scan_type { __u8 status; diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 7966db4038cc..f78e4298e39a 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1858,6 +1858,7 @@ void hci_conn_del_sysfs(struct hci_conn *conn); #define lmp_hold_capable(dev) ((dev)->features[0][0] & LMP_HOLD) #define lmp_sniff_capable(dev) ((dev)->features[0][0] & LMP_SNIFF) #define lmp_park_capable(dev) ((dev)->features[0][1] & LMP_PARK) +#define lmp_sco_capable(dev) ((dev)->features[0][1] & LMP_SCO) #define lmp_inq_rssi_capable(dev) ((dev)->features[0][3] & LMP_RSSI_INQ) #define lmp_esco_capable(dev) ((dev)->features[0][3] & LMP_ESCO) #define lmp_bredr_capable(dev) (!((dev)->features[0][4] & LMP_NO_BREDR)) diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 012fc107901a..94d9147612da 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -3552,42 +3552,27 @@ static void __check_timeout(struct hci_dev *hdev, unsigned int cnt, u8 type) } /* Schedule SCO */ -static void hci_sched_sco(struct hci_dev *hdev) +static void hci_sched_sco(struct hci_dev *hdev, __u8 type) { struct hci_conn *conn; struct sk_buff *skb; - int quote; + int quote, *cnt; + unsigned int pkts = hdev->sco_pkts; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "type %u", type); - if (!hci_conn_num(hdev, SCO_LINK)) + if (!hci_conn_num(hdev, type) || !pkts) return; - while (hdev->sco_cnt && (conn = hci_low_sent(hdev, SCO_LINK, "e))) { - while (quote-- && (skb = skb_dequeue(&conn->data_q))) { - BT_DBG("skb %p len %d", skb, skb->len); - hci_send_frame(hdev, skb); - - conn->sent++; - if (conn->sent == ~0) - conn->sent = 0; - } - } -} - -static void hci_sched_esco(struct hci_dev *hdev) -{ - struct hci_conn *conn; - struct sk_buff *skb; - int quote; - - BT_DBG("%s", hdev->name); - - if (!hci_conn_num(hdev, ESCO_LINK)) - return; + /* Use sco_pkts if flow control has not been enabled which will limit + * the amount of buffer sent in a row. + */ + if (!hci_dev_test_flag(hdev, HCI_SCO_FLOWCTL)) + cnt = &pkts; + else + cnt = &hdev->sco_cnt; - while (hdev->sco_cnt && (conn = hci_low_sent(hdev, ESCO_LINK, - "e))) { + while (*cnt && (conn = hci_low_sent(hdev, type, "e))) { while (quote-- && (skb = skb_dequeue(&conn->data_q))) { BT_DBG("skb %p len %d", skb, skb->len); hci_send_frame(hdev, skb); @@ -3595,8 +3580,17 @@ static void hci_sched_esco(struct hci_dev *hdev) conn->sent++; if (conn->sent == ~0) conn->sent = 0; + (*cnt)--; } } + + /* Rescheduled if all packets were sent and flow control is not enabled + * as there could be more packets queued that could not be sent and + * since no HCI_EV_NUM_COMP_PKTS event will be generated the reschedule + * needs to be forced. + */ + if (!pkts && !hci_dev_test_flag(hdev, HCI_SCO_FLOWCTL)) + queue_work(hdev->workqueue, &hdev->tx_work); } static void hci_sched_acl_pkt(struct hci_dev *hdev) @@ -3632,8 +3626,8 @@ static void hci_sched_acl_pkt(struct hci_dev *hdev) chan->conn->sent++; /* Send pending SCO packets right away */ - hci_sched_sco(hdev); - hci_sched_esco(hdev); + hci_sched_sco(hdev, SCO_LINK); + hci_sched_sco(hdev, ESCO_LINK); } } @@ -3688,8 +3682,8 @@ static void hci_sched_le(struct hci_dev *hdev) chan->conn->sent++; /* Send pending SCO packets right away */ - hci_sched_sco(hdev); - hci_sched_esco(hdev); + hci_sched_sco(hdev, SCO_LINK); + hci_sched_sco(hdev, ESCO_LINK); } } @@ -3734,8 +3728,8 @@ static void hci_tx_work(struct work_struct *work) if (!hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { /* Schedule queues and send stuff to HCI driver */ - hci_sched_sco(hdev); - hci_sched_esco(hdev); + hci_sched_sco(hdev, SCO_LINK); + hci_sched_sco(hdev, ESCO_LINK); hci_sched_iso(hdev); hci_sched_acl(hdev); hci_sched_le(hdev); diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 19e19c9f5e68..6d0138b778aa 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -4445,9 +4445,11 @@ static void hci_num_comp_pkts_evt(struct hci_dev *hdev, void *data, break; case SCO_LINK: + case ESCO_LINK: hdev->sco_cnt += count; if (hdev->sco_cnt > hdev->sco_pkts) hdev->sco_cnt = hdev->sco_pkts; + break; case ISO_LINK: diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index c4c2cf51b219..609b035e5c90 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -3769,6 +3769,28 @@ static int hci_write_ca_timeout_sync(struct hci_dev *hdev) sizeof(param), ¶m, HCI_CMD_TIMEOUT); } +/* Enable SCO flow control if supported */ +static int hci_write_sync_flowctl_sync(struct hci_dev *hdev) +{ + struct hci_cp_write_sync_flowctl cp; + int err; + + /* Check if the controller supports SCO and HCI_OP_WRITE_SYNC_FLOWCTL */ + if (!lmp_sco_capable(hdev) || !(hdev->commands[10] & BIT(4)) || + !test_bit(HCI_QUIRK_SYNC_FLOWCTL_SUPPORTED, &hdev->quirks)) + return 0; + + memset(&cp, 0, sizeof(cp)); + cp.enable = 0x01; + + err = __hci_cmd_sync_status(hdev, HCI_OP_WRITE_SYNC_FLOWCTL, + sizeof(cp), &cp, HCI_CMD_TIMEOUT); + if (!err) + hci_dev_set_flag(hdev, HCI_SCO_FLOWCTL); + + return err; +} + /* BR Controller init stage 2 command sequence */ static const struct hci_init_stage br_init2[] = { /* HCI_OP_READ_BUFFER_SIZE */ @@ -3787,6 +3809,8 @@ static const struct hci_init_stage br_init2[] = { HCI_INIT(hci_clear_event_filter_sync), /* HCI_OP_WRITE_CA_TIMEOUT */ HCI_INIT(hci_write_ca_timeout_sync), + /* HCI_OP_WRITE_SYNC_FLOWCTL */ + HCI_INIT(hci_write_sync_flowctl_sync), {} }; -- cgit v1.2.3 From e8c00f5433d020a2230226abe7e43f43dc686920 Mon Sep 17 00:00:00 2001 From: Wentao Guan Date: Tue, 18 Mar 2025 02:50:34 +0800 Subject: Bluetooth: HCI: Add definition of hci_rp_remote_name_req_cancel Return Parameters is not only status, also bdaddr: BLUETOOTH CORE SPECIFICATION Version 5.4 | Vol 4, Part E page 1870: BLUETOOTH CORE SPECIFICATION Version 5.0 | Vol 2, Part E page 802: Return parameters: Status: Size: 1 octet BD_ADDR: Size: 6 octets Note that it also fixes the warning: "Bluetooth: hci0: unexpected cc 0x041a length: 7 > 1" Fixes: c8992cffbe741 ("Bluetooth: hci_event: Use of a function table to handle Command Complete") Signed-off-by: Wentao Guan Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci.h | 5 +++++ net/bluetooth/hci_event.c | 6 +++--- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 6da61c185c94..a8586c3058c7 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -879,6 +879,11 @@ struct hci_cp_remote_name_req_cancel { bdaddr_t bdaddr; } __packed; +struct hci_rp_remote_name_req_cancel { + __u8 status; + bdaddr_t bdaddr; +} __packed; + #define HCI_OP_READ_REMOTE_FEATURES 0x041b struct hci_cp_read_remote_features { __le16 handle; diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 6d0138b778aa..0df4a0e082c8 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -151,7 +151,7 @@ static u8 hci_cc_exit_periodic_inq(struct hci_dev *hdev, void *data, static u8 hci_cc_remote_name_req_cancel(struct hci_dev *hdev, void *data, struct sk_buff *skb) { - struct hci_ev_status *rp = data; + struct hci_rp_remote_name_req_cancel *rp = data; bt_dev_dbg(hdev, "status 0x%2.2x", rp->status); @@ -4015,8 +4015,8 @@ static const struct hci_cc { HCI_CC_STATUS(HCI_OP_INQUIRY_CANCEL, hci_cc_inquiry_cancel), HCI_CC_STATUS(HCI_OP_PERIODIC_INQ, hci_cc_periodic_inq), HCI_CC_STATUS(HCI_OP_EXIT_PERIODIC_INQ, hci_cc_exit_periodic_inq), - HCI_CC_STATUS(HCI_OP_REMOTE_NAME_REQ_CANCEL, - hci_cc_remote_name_req_cancel), + HCI_CC(HCI_OP_REMOTE_NAME_REQ_CANCEL, hci_cc_remote_name_req_cancel, + sizeof(struct hci_rp_remote_name_req_cancel)), HCI_CC(HCI_OP_ROLE_DISCOVERY, hci_cc_role_discovery, sizeof(struct hci_rp_role_discovery)), HCI_CC(HCI_OP_READ_LINK_POLICY, hci_cc_read_link_policy, -- cgit v1.2.3 From 134f4b39df7b77225a80ef585c15d46f964f5e6f Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Tue, 18 Mar 2025 21:06:43 +0200 Subject: Bluetooth: add support for skb TX SND/COMPLETION timestamping Support enabling TX timestamping for some skbs, and track them until packet completion. Generate software SCM_TSTAMP_COMPLETION when getting completion report from hardware. Generate software SCM_TSTAMP_SND before sending to driver. Sending from driver requires changes in the driver API, and drivers mostly are going to send the skb immediately. Make the default situation with no COMPLETION TX timestamping more efficient by only counting packets in the queue when there is nothing to track. When there is something to track, we need to make clones, since the driver may modify sent skbs. The tx_q queue length is bounded by the hdev flow control, which will not send new packets before it has got completion reports for old ones. Signed-off-by: Pauli Virtanen Reviewed-by: Willem de Bruijn Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 20 +++++++ net/bluetooth/hci_conn.c | 122 +++++++++++++++++++++++++++++++++++++++ net/bluetooth/hci_core.c | 15 +++-- net/bluetooth/hci_event.c | 4 ++ 4 files changed, 157 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index f78e4298e39a..5115da34f881 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -261,6 +261,12 @@ struct adv_info { struct delayed_work rpa_expired_cb; }; +struct tx_queue { + struct sk_buff_head queue; + unsigned int extra; + unsigned int tracked; +}; + #define HCI_MAX_ADV_INSTANCES 5 #define HCI_DEFAULT_ADV_DURATION 2 @@ -733,6 +739,8 @@ struct hci_conn { struct sk_buff_head data_q; struct list_head chan_list; + struct tx_queue tx_q; + struct delayed_work disc_work; struct delayed_work auto_accept_work; struct delayed_work idle_work; @@ -1572,6 +1580,18 @@ void hci_conn_enter_active_mode(struct hci_conn *conn, __u8 force_active); void hci_conn_failed(struct hci_conn *conn, u8 status); u8 hci_conn_set_handle(struct hci_conn *conn, u16 handle); +void hci_conn_tx_queue(struct hci_conn *conn, struct sk_buff *skb); +void hci_conn_tx_dequeue(struct hci_conn *conn); +void hci_setup_tx_timestamp(struct sk_buff *skb, size_t key_offset, + const struct sockcm_cookie *sockc); + +static inline void hci_sockcm_init(struct sockcm_cookie *sockc, struct sock *sk) +{ + *sockc = (struct sockcm_cookie) { + .tsflags = READ_ONCE(sk->sk_tsflags), + }; +} + /* * hci_conn_get() and hci_conn_put() are used to control the life-time of an * "hci_conn" object. They do not guarantee that the hci_conn object is running, diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index d097e308a755..95972fd4c784 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -27,6 +27,7 @@ #include #include +#include #include #include @@ -1002,6 +1003,7 @@ static struct hci_conn *__hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t } skb_queue_head_init(&conn->data_q); + skb_queue_head_init(&conn->tx_q.queue); INIT_LIST_HEAD(&conn->chan_list); INIT_LIST_HEAD(&conn->link_list); @@ -1155,6 +1157,7 @@ void hci_conn_del(struct hci_conn *conn) } skb_queue_purge(&conn->data_q); + skb_queue_purge(&conn->tx_q.queue); /* Remove the connection from the list and cleanup its remaining * state. This is a separate function since for some cases like @@ -3064,3 +3067,122 @@ int hci_abort_conn(struct hci_conn *conn, u8 reason) */ return hci_cmd_sync_run_once(hdev, abort_conn_sync, conn, NULL); } + +void hci_setup_tx_timestamp(struct sk_buff *skb, size_t key_offset, + const struct sockcm_cookie *sockc) +{ + struct sock *sk = skb ? skb->sk : NULL; + + /* This shall be called on a single skb of those generated by user + * sendmsg(), and only when the sendmsg() does not return error to + * user. This is required for keeping the tskey that increments here in + * sync with possible sendmsg() counting by user. + * + * Stream sockets shall set key_offset to sendmsg() length in bytes + * and call with the last fragment, others to 1 and first fragment. + */ + + if (!skb || !sockc || !sk || !key_offset) + return; + + sock_tx_timestamp(sk, sockc, &skb_shinfo(skb)->tx_flags); + + if (sockc->tsflags & SOF_TIMESTAMPING_OPT_ID && + sockc->tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK) { + if (sockc->tsflags & SOCKCM_FLAG_TS_OPT_ID) { + skb_shinfo(skb)->tskey = sockc->ts_opt_id; + } else { + int key = atomic_add_return(key_offset, &sk->sk_tskey); + + skb_shinfo(skb)->tskey = key - 1; + } + } +} + +void hci_conn_tx_queue(struct hci_conn *conn, struct sk_buff *skb) +{ + struct tx_queue *comp = &conn->tx_q; + bool track = false; + + /* Emit SND now, ie. just before sending to driver */ + if (skb_shinfo(skb)->tx_flags & SKBTX_SW_TSTAMP) + __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SND); + + /* COMPLETION tstamp is emitted for tracked skb later in Number of + * Completed Packets event. Available only for flow controlled cases. + * + * TODO: SCO support without flowctl (needs to be done in drivers) + */ + switch (conn->type) { + case ISO_LINK: + case ACL_LINK: + case LE_LINK: + break; + case SCO_LINK: + case ESCO_LINK: + if (!hci_dev_test_flag(conn->hdev, HCI_SCO_FLOWCTL)) + return; + break; + default: + return; + } + + if (skb->sk && (skb_shinfo(skb)->tx_flags & SKBTX_COMPLETION_TSTAMP)) + track = true; + + /* If nothing is tracked, just count extra skbs at the queue head */ + if (!track && !comp->tracked) { + comp->extra++; + return; + } + + if (track) { + skb = skb_clone_sk(skb); + if (!skb) + goto count_only; + + comp->tracked++; + } else { + skb = skb_clone(skb, GFP_KERNEL); + if (!skb) + goto count_only; + } + + skb_queue_tail(&comp->queue, skb); + return; + +count_only: + /* Stop tracking skbs, and only count. This will not emit timestamps for + * the packets, but if we get here something is more seriously wrong. + */ + comp->tracked = 0; + comp->extra += skb_queue_len(&comp->queue) + 1; + skb_queue_purge(&comp->queue); +} + +void hci_conn_tx_dequeue(struct hci_conn *conn) +{ + struct tx_queue *comp = &conn->tx_q; + struct sk_buff *skb; + + /* If there are tracked skbs, the counted extra go before dequeuing real + * skbs, to keep ordering. When nothing is tracked, the ordering doesn't + * matter so dequeue real skbs first to get rid of them ASAP. + */ + if (comp->extra && (comp->tracked || skb_queue_empty(&comp->queue))) { + comp->extra--; + return; + } + + skb = skb_dequeue(&comp->queue); + if (!skb) + return; + + if (skb->sk) { + comp->tracked--; + __skb_tstamp_tx(skb, NULL, NULL, skb->sk, + SCM_TSTAMP_COMPLETION); + } + + kfree_skb(skb); +} diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 94d9147612da..5eb0600bbd03 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -3029,6 +3029,13 @@ static int hci_send_frame(struct hci_dev *hdev, struct sk_buff *skb) return 0; } +static int hci_send_conn_frame(struct hci_dev *hdev, struct hci_conn *conn, + struct sk_buff *skb) +{ + hci_conn_tx_queue(conn, skb); + return hci_send_frame(hdev, skb); +} + /* Send HCI command */ int hci_send_cmd(struct hci_dev *hdev, __u16 opcode, __u32 plen, const void *param) @@ -3575,7 +3582,7 @@ static void hci_sched_sco(struct hci_dev *hdev, __u8 type) while (*cnt && (conn = hci_low_sent(hdev, type, "e))) { while (quote-- && (skb = skb_dequeue(&conn->data_q))) { BT_DBG("skb %p len %d", skb, skb->len); - hci_send_frame(hdev, skb); + hci_send_conn_frame(hdev, conn, skb); conn->sent++; if (conn->sent == ~0) @@ -3618,7 +3625,7 @@ static void hci_sched_acl_pkt(struct hci_dev *hdev) hci_conn_enter_active_mode(chan->conn, bt_cb(skb)->force_active); - hci_send_frame(hdev, skb); + hci_send_conn_frame(hdev, chan->conn, skb); hdev->acl_last_tx = jiffies; hdev->acl_cnt--; @@ -3674,7 +3681,7 @@ static void hci_sched_le(struct hci_dev *hdev) skb = skb_dequeue(&chan->data_q); - hci_send_frame(hdev, skb); + hci_send_conn_frame(hdev, chan->conn, skb); hdev->le_last_tx = jiffies; (*cnt)--; @@ -3708,7 +3715,7 @@ static void hci_sched_iso(struct hci_dev *hdev) while (*cnt && (conn = hci_low_sent(hdev, ISO_LINK, "e))) { while (quote-- && (skb = skb_dequeue(&conn->data_q))) { BT_DBG("skb %p len %d", skb, skb->len); - hci_send_frame(hdev, skb); + hci_send_conn_frame(hdev, conn, skb); conn->sent++; if (conn->sent == ~0) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 0df4a0e082c8..83990c975c1f 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -4415,6 +4415,7 @@ static void hci_num_comp_pkts_evt(struct hci_dev *hdev, void *data, struct hci_comp_pkts_info *info = &ev->handles[i]; struct hci_conn *conn; __u16 handle, count; + unsigned int i; handle = __le16_to_cpu(info->handle); count = __le16_to_cpu(info->count); @@ -4425,6 +4426,9 @@ static void hci_num_comp_pkts_evt(struct hci_dev *hdev, void *data, conn->sent -= count; + for (i = 0; i < count; ++i) + hci_conn_tx_dequeue(conn); + switch (conn->type) { case ACL_LINK: hdev->acl_cnt += count; -- cgit v1.2.3 From d415ba28825909423b1253a3a0db131d74ea6242 Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Tue, 18 Mar 2025 21:06:44 +0200 Subject: Bluetooth: ISO: add TX timestamping Add BT_SCM_ERROR socket CMSG type. Support TX timestamping in ISO sockets. Support MSG_ERRQUEUE in ISO recvmsg. If a packet from sendmsg() is fragmented, only the first ACL fragment is timestamped. Signed-off-by: Pauli Virtanen Reviewed-by: Willem de Bruijn Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/bluetooth.h | 1 + net/bluetooth/iso.c | 24 ++++++++++++++++++++---- 2 files changed, 21 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index 435250c72d56..bbefde319f95 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -156,6 +156,7 @@ struct bt_voice { #define BT_PKT_STATUS 16 #define BT_SCM_PKT_STATUS 0x03 +#define BT_SCM_ERROR 0x04 #define BT_ISO_QOS 17 diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index 0cb52a3308ba..3501a991f1c6 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -518,7 +518,8 @@ static struct bt_iso_qos *iso_sock_get_qos(struct sock *sk) return &iso_pi(sk)->qos; } -static int iso_send_frame(struct sock *sk, struct sk_buff *skb) +static int iso_send_frame(struct sock *sk, struct sk_buff *skb, + const struct sockcm_cookie *sockc) { struct iso_conn *conn = iso_pi(sk)->conn; struct bt_iso_qos *qos = iso_sock_get_qos(sk); @@ -538,10 +539,12 @@ static int iso_send_frame(struct sock *sk, struct sk_buff *skb) hdr->slen = cpu_to_le16(hci_iso_data_len_pack(len, HCI_ISO_STATUS_VALID)); - if (sk->sk_state == BT_CONNECTED) + if (sk->sk_state == BT_CONNECTED) { + hci_setup_tx_timestamp(skb, 1, sockc); hci_send_iso(conn->hcon, skb); - else + } else { len = -ENOTCONN; + } return len; } @@ -1348,6 +1351,7 @@ static int iso_sock_sendmsg(struct socket *sock, struct msghdr *msg, { struct sock *sk = sock->sk; struct sk_buff *skb, **frag; + struct sockcm_cookie sockc; size_t mtu; int err; @@ -1360,6 +1364,14 @@ static int iso_sock_sendmsg(struct socket *sock, struct msghdr *msg, if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; + hci_sockcm_init(&sockc, sk); + + if (msg->msg_controllen) { + err = sock_cmsg_send(sk, msg, &sockc); + if (err) + return err; + } + lock_sock(sk); if (sk->sk_state != BT_CONNECTED) { @@ -1405,7 +1417,7 @@ static int iso_sock_sendmsg(struct socket *sock, struct msghdr *msg, lock_sock(sk); if (sk->sk_state == BT_CONNECTED) - err = iso_send_frame(sk, skb); + err = iso_send_frame(sk, skb, &sockc); else err = -ENOTCONN; @@ -1474,6 +1486,10 @@ static int iso_sock_recvmsg(struct socket *sock, struct msghdr *msg, BT_DBG("sk %p", sk); + if (unlikely(flags & MSG_ERRQUEUE)) + return sock_recv_errqueue(sk, msg, len, SOL_BLUETOOTH, + BT_SCM_ERROR); + if (test_and_clear_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) { sock_hold(sk); lock_sock(sk); -- cgit v1.2.3 From 11770f41b8a7e1207576e944ec1c98b74fac89a5 Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Tue, 18 Mar 2025 21:06:45 +0200 Subject: Bluetooth: L2CAP: add TX timestamping Support TX timestamping in L2CAP sockets. Support MSG_ERRQUEUE recvmsg. For other than SOCK_STREAM L2CAP sockets, if a packet from sendmsg() is fragmented, only the first ACL fragment is timestamped. For SOCK_STREAM L2CAP sockets, use the bytestream convention and timestamp the last fragment and count bytes in tskey. Timestamps are not generated in the Enhanced Retransmission mode, as meaning of COMPLETION stamp is unclear if L2CAP layer retransmits. Signed-off-by: Pauli Virtanen Reviewed-by: Willem de Bruijn Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/l2cap.h | 3 ++- net/bluetooth/6lowpan.c | 2 +- net/bluetooth/l2cap_core.c | 41 ++++++++++++++++++++++++++++++++++++++--- net/bluetooth/l2cap_sock.c | 15 ++++++++++++++- net/bluetooth/smp.c | 2 +- 5 files changed, 56 insertions(+), 7 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h index 0bf8cb17a6e8..4bb0eaedda18 100644 --- a/include/net/bluetooth/l2cap.h +++ b/include/net/bluetooth/l2cap.h @@ -955,7 +955,8 @@ void l2cap_chan_close(struct l2cap_chan *chan, int reason); int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, bdaddr_t *dst, u8 dst_type, u16 timeout); int l2cap_chan_reconfigure(struct l2cap_chan *chan, __u16 mtu); -int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len); +int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len, + const struct sockcm_cookie *sockc); void l2cap_chan_busy(struct l2cap_chan *chan, int busy); void l2cap_chan_rx_avail(struct l2cap_chan *chan, ssize_t rx_avail); int l2cap_chan_check_security(struct l2cap_chan *chan, bool initiator); diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 73530b8e1eae..f0c862091bff 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -444,7 +444,7 @@ static int send_pkt(struct l2cap_chan *chan, struct sk_buff *skb, memset(&msg, 0, sizeof(msg)); iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &iv, 1, skb->len); - err = l2cap_chan_send(chan, &msg, skb->len); + err = l2cap_chan_send(chan, &msg, skb->len, NULL); if (err > 0) { netdev->stats.tx_bytes += err; netdev->stats.tx_packets++; diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 7b4adab353cf..c7b66b2ea9f2 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -2515,7 +2515,33 @@ static void l2cap_le_flowctl_send(struct l2cap_chan *chan) skb_queue_len(&chan->tx_q)); } -int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len) +static void l2cap_tx_timestamp(struct sk_buff *skb, + const struct sockcm_cookie *sockc, + size_t len) +{ + struct sock *sk = skb ? skb->sk : NULL; + + if (sk && sk->sk_type == SOCK_STREAM) + hci_setup_tx_timestamp(skb, len, sockc); + else + hci_setup_tx_timestamp(skb, 1, sockc); +} + +static void l2cap_tx_timestamp_seg(struct sk_buff_head *queue, + const struct sockcm_cookie *sockc, + size_t len) +{ + struct sk_buff *skb = skb_peek(queue); + struct sock *sk = skb ? skb->sk : NULL; + + if (sk && sk->sk_type == SOCK_STREAM) + l2cap_tx_timestamp(skb_peek_tail(queue), sockc, len); + else + l2cap_tx_timestamp(skb, sockc, len); +} + +int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len, + const struct sockcm_cookie *sockc) { struct sk_buff *skb; int err; @@ -2530,6 +2556,8 @@ int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len) if (IS_ERR(skb)) return PTR_ERR(skb); + l2cap_tx_timestamp(skb, sockc, len); + l2cap_do_send(chan, skb); return len; } @@ -2553,6 +2581,8 @@ int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len) if (err) return err; + l2cap_tx_timestamp_seg(&seg_queue, sockc, len); + skb_queue_splice_tail_init(&seg_queue, &chan->tx_q); l2cap_le_flowctl_send(chan); @@ -2574,6 +2604,8 @@ int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len) if (IS_ERR(skb)) return PTR_ERR(skb); + l2cap_tx_timestamp(skb, sockc, len); + l2cap_do_send(chan, skb); err = len; break; @@ -2597,10 +2629,13 @@ int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len) if (err) break; - if (chan->mode == L2CAP_MODE_ERTM) + if (chan->mode == L2CAP_MODE_ERTM) { + /* TODO: ERTM mode timestamping */ l2cap_tx(chan, NULL, &seg_queue, L2CAP_EV_DATA_REQUEST); - else + } else { + l2cap_tx_timestamp_seg(&seg_queue, sockc, len); l2cap_streaming_send(chan, &seg_queue); + } err = len; diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index acd11b268b98..5aa55fa69594 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -1106,6 +1106,7 @@ static int l2cap_sock_sendmsg(struct socket *sock, struct msghdr *msg, { struct sock *sk = sock->sk; struct l2cap_chan *chan = l2cap_pi(sk)->chan; + struct sockcm_cookie sockc; int err; BT_DBG("sock %p, sk %p", sock, sk); @@ -1120,6 +1121,14 @@ static int l2cap_sock_sendmsg(struct socket *sock, struct msghdr *msg, if (sk->sk_state != BT_CONNECTED) return -ENOTCONN; + hci_sockcm_init(&sockc, sk); + + if (msg->msg_controllen) { + err = sock_cmsg_send(sk, msg, &sockc); + if (err) + return err; + } + lock_sock(sk); err = bt_sock_wait_ready(sk, msg->msg_flags); release_sock(sk); @@ -1127,7 +1136,7 @@ static int l2cap_sock_sendmsg(struct socket *sock, struct msghdr *msg, return err; l2cap_chan_lock(chan); - err = l2cap_chan_send(chan, msg, len); + err = l2cap_chan_send(chan, msg, len, &sockc); l2cap_chan_unlock(chan); return err; @@ -1168,6 +1177,10 @@ static int l2cap_sock_recvmsg(struct socket *sock, struct msghdr *msg, struct l2cap_pinfo *pi = l2cap_pi(sk); int err; + if (unlikely(flags & MSG_ERRQUEUE)) + return sock_recv_errqueue(sk, msg, len, SOL_BLUETOOTH, + BT_SCM_ERROR); + lock_sock(sk); if (sk->sk_state == BT_CONNECT2 && test_bit(BT_SK_DEFER_SETUP, diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index a31c6acf1df2..47f359f24d1f 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -608,7 +608,7 @@ static void smp_send_cmd(struct l2cap_conn *conn, u8 code, u16 len, void *data) iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, iv, 2, 1 + len); - l2cap_chan_send(chan, &msg, 1 + len); + l2cap_chan_send(chan, &msg, 1 + len, NULL); if (!chan->data) return; -- cgit v1.2.3 From e2f81e8f4d0c3109e1a18620c931fe16bfb235ef Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 24 Mar 2025 15:45:29 -0700 Subject: net: constify dev pointer in misc instance lock helpers lockdep asserts and predicates can operate on const pointers. In the future this will let us add asserts in functions which operate on const pointers like dev_get_min_mp_channel_count(). Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250324224537.248800-4-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/net/netdev_lock.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/netdev_lock.h b/include/net/netdev_lock.h index 99631fbd7f54..689ffdfae50d 100644 --- a/include/net/netdev_lock.h +++ b/include/net/netdev_lock.h @@ -11,19 +11,20 @@ static inline bool netdev_trylock(struct net_device *dev) return mutex_trylock(&dev->lock); } -static inline void netdev_assert_locked(struct net_device *dev) +static inline void netdev_assert_locked(const struct net_device *dev) { lockdep_assert_held(&dev->lock); } -static inline void netdev_assert_locked_or_invisible(struct net_device *dev) +static inline void +netdev_assert_locked_or_invisible(const struct net_device *dev) { if (dev->reg_state == NETREG_REGISTERED || dev->reg_state == NETREG_UNREGISTERING) netdev_assert_locked(dev); } -static inline bool netdev_need_ops_lock(struct net_device *dev) +static inline bool netdev_need_ops_lock(const struct net_device *dev) { bool ret = dev->request_ops_lock || !!dev->queue_mgmt_ops; @@ -46,7 +47,7 @@ static inline void netdev_unlock_ops(struct net_device *dev) netdev_unlock(dev); } -static inline void netdev_ops_assert_locked(struct net_device *dev) +static inline void netdev_ops_assert_locked(const struct net_device *dev) { if (netdev_need_ops_lock(dev)) lockdep_assert_held(&dev->lock); -- cgit v1.2.3 From 4b702f8b72c7b05daa1b763fdc0840aa78178c3a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 24 Mar 2025 15:45:30 -0700 Subject: net: explain "protection types" for the instance lock Try to define some terminology for which fields are protected by which lock and how. Some fields are protected by both rtnl_lock and instance lock which is hard to talk about without having a "key phrase" to refer to a particular protection scheme. "ops protected" fields are defined later in the series, one by one. Add ASSERT_RTNL() to netdev_ops_assert_locked() for drivers not other instance protection of ops. Hopefully it's not too confusion that netdev_lock_ops() does not match the lock which netdev_ops_assert_locked() will assert, exactly. The noun "ops" is in a different place in the name, so I think it's acceptable... Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250324224537.248800-5-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 28 ++++++++++++++++++++++------ include/net/netdev_lock.h | 3 +++ 2 files changed, 25 insertions(+), 6 deletions(-) (limited to 'include/net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 55859c565f84..2b91fb96a411 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2496,19 +2496,35 @@ struct net_device { * Should always be taken using netdev_lock() / netdev_unlock() helpers. * Drivers are free to use it for other protection. * - * Protects: + * For the drivers that implement shaper or queue API, the scope + * of this lock is expanded to cover most ndo/queue/ethtool/sysfs + * operations. Drivers may opt-in to this behavior by setting + * @request_ops_lock. + * + * @lock protection mixes with rtnl_lock in multiple ways, fields are + * either: + * + * - simply protected by the instance @lock; + * + * - double protected - writers hold both locks, readers hold either; + * + * - ops protected - protected by the lock held around the NDOs + * and other callbacks, that is the instance lock on devices for + * which netdev_need_ops_lock() returns true, otherwise by rtnl_lock; + * + * - double ops protected - always protected by rtnl_lock but for + * devices for which netdev_need_ops_lock() returns true - also + * the instance lock. + * + * Simply protects: * @gro_flush_timeout, @napi_defer_hard_irqs, @napi_list, * @net_shaper_hierarchy, @reg_state, @threaded * - * Partially protects (writers must hold both @lock and rtnl_lock): + * Double protects: * @up * * Also protects some fields in struct napi_struct. * - * For the drivers that implement shaper or queue API, the scope - * of this lock is expanded to cover most ndo/queue/ethtool/sysfs - * operations. - * * Ordering: take after rtnl_lock. */ struct mutex lock; diff --git a/include/net/netdev_lock.h b/include/net/netdev_lock.h index 689ffdfae50d..efd302375ef2 100644 --- a/include/net/netdev_lock.h +++ b/include/net/netdev_lock.h @@ -5,6 +5,7 @@ #include #include +#include static inline bool netdev_trylock(struct net_device *dev) { @@ -51,6 +52,8 @@ static inline void netdev_ops_assert_locked(const struct net_device *dev) { if (netdev_need_ops_lock(dev)) lockdep_assert_held(&dev->lock); + else + ASSERT_RTNL(); } static inline int netdev_lock_cmp_fn(const struct lockdep_map *a, -- cgit v1.2.3 From 310ae9eb2617c62deedef8f121d7ca1ae774fa76 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 24 Mar 2025 15:45:32 -0700 Subject: net: designate queue -> napi linking as "ops protected" netdev netlink is the only reader of netdev_{,rx_}queue->napi, and it already holds netdev->lock. Switch protection of the writes to netdev->lock to "ops protected". The expectation will be now that accessing queue->napi will require netdev->lock for "ops locked" drivers, and rtnl_lock for all other drivers. Current "ops locked" drivers don't require any changes. gve and netdevsim use _locked() helpers right next to netif_queue_set_napi() so they must be holding the instance lock. iavf doesn't call it. bnxt is a bit messy but all paths seem locked. Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250324224537.248800-7-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 5 +++-- include/net/netdev_lock.h | 8 ++++++++ include/net/netdev_rx_queue.h | 2 +- net/core/dev.c | 3 +-- 4 files changed, 13 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 60ef367d8575..fa79145518d1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -710,7 +710,7 @@ struct netdev_queue { * slow- / control-path part */ /* NAPI instance for the queue - * Readers and writers must hold RTNL + * "ops protected", see comment about net_device::lock */ struct napi_struct *napi; @@ -2526,7 +2526,8 @@ struct net_device { * Double ops protects: * @real_num_rx_queues, @real_num_tx_queues * - * Also protects some fields in struct napi_struct. + * Also protects some fields in: + * struct napi_struct, struct netdev_queue, struct netdev_rx_queue * * Ordering: take after rtnl_lock. */ diff --git a/include/net/netdev_lock.h b/include/net/netdev_lock.h index efd302375ef2..1c0c9a94cc22 100644 --- a/include/net/netdev_lock.h +++ b/include/net/netdev_lock.h @@ -56,6 +56,14 @@ static inline void netdev_ops_assert_locked(const struct net_device *dev) ASSERT_RTNL(); } +static inline void +netdev_ops_assert_locked_or_invisible(const struct net_device *dev) +{ + if (dev->reg_state == NETREG_REGISTERED || + dev->reg_state == NETREG_UNREGISTERING) + netdev_ops_assert_locked(dev); +} + static inline int netdev_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b) { diff --git a/include/net/netdev_rx_queue.h b/include/net/netdev_rx_queue.h index af40842f229d..b2238b551dce 100644 --- a/include/net/netdev_rx_queue.h +++ b/include/net/netdev_rx_queue.h @@ -24,7 +24,7 @@ struct netdev_rx_queue { struct xsk_buff_pool *pool; #endif /* NAPI instance for the queue - * Readers and writers must hold RTNL + * "ops protected", see comment about net_device::lock */ struct napi_struct *napi; struct pp_memory_provider_params mp_params; diff --git a/net/core/dev.c b/net/core/dev.c index 2d9be3ecd5e6..ab74e1f005d2 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6901,8 +6901,7 @@ void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index, if (WARN_ON_ONCE(napi && !napi->dev)) return; - if (dev->reg_state >= NETREG_REGISTERED) - ASSERT_RTNL(); + netdev_ops_assert_locked_or_invisible(dev); switch (type) { case NETDEV_QUEUE_TYPE_RX: -- cgit v1.2.3 From a7c428ee8f59f171a3b57474f2bd5cee0ef1e036 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 24 Mar 2025 20:36:06 +0000 Subject: tcp/dccp: remove icsk->icsk_timeout icsk->icsk_timeout can be replaced by icsk->icsk_retransmit_timer.expires This saves 8 bytes in TCP/DCCP sockets and helps for better cache locality. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250324203607.703850-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- .../networking/net_cachelines/inet_connection_sock.rst | 3 +-- include/net/inet_connection_sock.h | 12 ++++++++---- net/dccp/timer.c | 4 ++-- net/ipv4/inet_diag.c | 4 ++-- net/ipv4/tcp_ipv4.c | 4 ++-- net/ipv4/tcp_timer.c | 11 ++++++----- net/ipv6/tcp_ipv6.c | 4 ++-- net/mptcp/protocol.c | 2 +- tools/testing/selftests/bpf/progs/bpf_iter_tcp4.c | 4 ++-- tools/testing/selftests/bpf/progs/bpf_iter_tcp6.c | 4 ++-- 10 files changed, 28 insertions(+), 24 deletions(-) (limited to 'include/net') diff --git a/Documentation/networking/net_cachelines/inet_connection_sock.rst b/Documentation/networking/net_cachelines/inet_connection_sock.rst index b2401aa7c450..5fb0dd70c9af 100644 --- a/Documentation/networking/net_cachelines/inet_connection_sock.rst +++ b/Documentation/networking/net_cachelines/inet_connection_sock.rst @@ -12,8 +12,7 @@ struct inet_sock icsk_inet read_mostly r struct request_sock_queue icsk_accept_queue struct inet_bind_bucket icsk_bind_hash read_mostly tcp_set_state struct inet_bind2_bucket icsk_bind2_hash read_mostly tcp_set_state,inet_put_port -unsigned_long icsk_timeout read_mostly inet_csk_reset_xmit_timer,tcp_connect -struct timer_list icsk_retransmit_timer read_mostly inet_csk_reset_xmit_timer,tcp_connect +struct timer_list icsk_retransmit_timer read_write inet_csk_reset_xmit_timer,tcp_connect struct timer_list icsk_delack_timer read_mostly inet_csk_reset_xmit_timer,tcp_connect u32 icsk_rto read_write tcp_cwnd_validate,tcp_schedule_loss_probe,tcp_connect_init,tcp_connect,tcp_write_xmit,tcp_push_one u32 icsk_rto_min diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 09a9d333fa42..6dca0ac6fbc6 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -56,7 +56,6 @@ struct inet_connection_sock_af_ops { * @icsk_accept_queue: FIFO of established children * @icsk_bind_hash: Bind node * @icsk_bind2_hash: Bind node in the bhash2 table - * @icsk_timeout: Timeout * @icsk_retransmit_timer: Resend (no ack) * @icsk_rto: Retransmit timeout * @icsk_pmtu_cookie Last pmtu seen by socket @@ -82,7 +81,6 @@ struct inet_connection_sock { struct request_sock_queue icsk_accept_queue; struct inet_bind_bucket *icsk_bind_hash; struct inet_bind2_bucket *icsk_bind2_hash; - unsigned long icsk_timeout; struct timer_list icsk_retransmit_timer; struct timer_list icsk_delack_timer; __u32 icsk_rto; @@ -187,6 +185,12 @@ static inline void inet_csk_delack_init(struct sock *sk) memset(&inet_csk(sk)->icsk_ack, 0, sizeof(inet_csk(sk)->icsk_ack)); } +static inline unsigned long +icsk_timeout(const struct inet_connection_sock *icsk) +{ + return READ_ONCE(icsk->icsk_retransmit_timer.expires); +} + static inline void inet_csk_clear_xmit_timer(struct sock *sk, const int what) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -225,8 +229,8 @@ static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what, if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0 || what == ICSK_TIME_LOSS_PROBE || what == ICSK_TIME_REO_TIMEOUT) { smp_store_release(&icsk->icsk_pending, what); - icsk->icsk_timeout = jiffies + when; - sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout); + when += jiffies; + sk_reset_timer(sk, &icsk->icsk_retransmit_timer, when); } else if (what == ICSK_TIME_DACK) { smp_store_release(&icsk->icsk_ack.pending, icsk->icsk_ack.pending | ICSK_ACK_TIMER); diff --git a/net/dccp/timer.c b/net/dccp/timer.c index a4cfb47b60e5..9fd14a336189 100644 --- a/net/dccp/timer.c +++ b/net/dccp/timer.c @@ -139,9 +139,9 @@ static void dccp_write_timer(struct timer_list *t) if (sk->sk_state == DCCP_CLOSED || !icsk->icsk_pending) goto out; - if (time_after(icsk->icsk_timeout, jiffies)) { + if (time_after(icsk_timeout(icsk), jiffies)) { sk_reset_timer(sk, &icsk->icsk_retransmit_timer, - icsk->icsk_timeout); + icsk_timeout(icsk)); goto out; } diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index efe2a085cf68..c2bb91d9e9ff 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -315,12 +315,12 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, r->idiag_timer = 1; r->idiag_retrans = icsk->icsk_retransmits; r->idiag_expires = - jiffies_delta_to_msecs(icsk->icsk_timeout - jiffies); + jiffies_delta_to_msecs(icsk_timeout(icsk) - jiffies); } else if (icsk_pending == ICSK_TIME_PROBE0) { r->idiag_timer = 4; r->idiag_retrans = icsk->icsk_probes_out; r->idiag_expires = - jiffies_delta_to_msecs(icsk->icsk_timeout - jiffies); + jiffies_delta_to_msecs(icsk_timeout(icsk) - jiffies); } else if (timer_pending(&sk->sk_timer)) { r->idiag_timer = 2; r->idiag_retrans = icsk->icsk_probes_out; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 1cd0938d47e0..8cce0d5489da 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2923,10 +2923,10 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk_pending == ICSK_TIME_LOSS_PROBE) { timer_active = 1; - timer_expires = icsk->icsk_timeout; + timer_expires = icsk_timeout(icsk); } else if (icsk_pending == ICSK_TIME_PROBE0) { timer_active = 4; - timer_expires = icsk->icsk_timeout; + timer_expires = icsk_timeout(icsk); } else if (timer_pending(&sk->sk_timer)) { timer_active = 2; timer_expires = sk->sk_timer.expires; diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 728bce01ccd3..d828b74c3e73 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -509,7 +509,7 @@ static bool tcp_rtx_probe0_timed_out(const struct sock *sk, * and tp->rcv_tstamp might very well have been written recently. * rcv_delta can thus be negative. */ - rcv_delta = icsk->icsk_timeout - tp->rcv_tstamp; + rcv_delta = icsk_timeout(icsk) - tp->rcv_tstamp; if (rcv_delta <= timeout) return false; @@ -685,7 +685,8 @@ out:; } /* Called with bottom-half processing disabled. - Called by tcp_write_timer() */ + * Called by tcp_write_timer() and tcp_release_cb(). + */ void tcp_write_timer_handler(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -695,11 +696,11 @@ void tcp_write_timer_handler(struct sock *sk) !icsk->icsk_pending) return; - if (time_after(icsk->icsk_timeout, jiffies)) { - sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout); + if (time_after(icsk_timeout(icsk), jiffies)) { + sk_reset_timer(sk, &icsk->icsk_retransmit_timer, + icsk_timeout(icsk)); return; } - tcp_mstamp_refresh(tcp_sk(sk)); event = icsk->icsk_pending; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index c134cf1a603a..b03c223eda4f 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2195,10 +2195,10 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk_pending == ICSK_TIME_LOSS_PROBE) { timer_active = 1; - timer_expires = icsk->icsk_timeout; + timer_expires = icsk_timeout(icsk); } else if (icsk_pending == ICSK_TIME_PROBE0) { timer_active = 4; - timer_expires = icsk->icsk_timeout; + timer_expires = icsk_timeout(icsk); } else if (timer_pending(&sp->sk_timer)) { timer_active = 2; timer_expires = sp->sk_timer.expires; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index ad780ae1d30d..1ac378ba1d67 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -422,7 +422,7 @@ static long mptcp_timeout_from_subflow(const struct mptcp_subflow_context *subfl const struct sock *ssk = mptcp_subflow_tcp_sock(subflow); return inet_csk(ssk)->icsk_pending && !subflow->stale_count ? - inet_csk(ssk)->icsk_timeout - jiffies : 0; + icsk_timeout(inet_csk(ssk)) - jiffies : 0; } static void mptcp_set_timeout(struct sock *sk) diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_tcp4.c b/tools/testing/selftests/bpf/progs/bpf_iter_tcp4.c index d22449c69363..164640db3a29 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_tcp4.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_tcp4.c @@ -99,10 +99,10 @@ static int dump_tcp_sock(struct seq_file *seq, struct tcp_sock *tp, icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { timer_active = 1; - timer_expires = icsk->icsk_timeout; + timer_expires = icsk->icsk_retransmit_timer.expires; } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { timer_active = 4; - timer_expires = icsk->icsk_timeout; + timer_expires = icsk->icsk_retransmit_timer.expires; } else if (timer_pending(&sp->sk_timer)) { timer_active = 2; timer_expires = sp->sk_timer.expires; diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_tcp6.c b/tools/testing/selftests/bpf/progs/bpf_iter_tcp6.c index 8b072666f9d9..591c703f5032 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_tcp6.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_tcp6.c @@ -99,10 +99,10 @@ static int dump_tcp6_sock(struct seq_file *seq, struct tcp6_sock *tp, icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { timer_active = 1; - timer_expires = icsk->icsk_timeout; + timer_expires = icsk->icsk_retransmit_timer.expires; } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { timer_active = 4; - timer_expires = icsk->icsk_timeout; + timer_expires = icsk->icsk_retransmit_timer.expires; } else if (timer_pending(&sp->sk_timer)) { timer_active = 2; timer_expires = sp->sk_timer.expires; -- cgit v1.2.3 From f1e30061e8a5af48c91ef2f25762f792114a6a90 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 24 Mar 2025 20:36:07 +0000 Subject: tcp/dccp: remove icsk->icsk_ack.timeout icsk->icsk_ack.timeout can be replaced by icsk->csk_delack_timer.expires This saves 8 bytes in TCP/DCCP sockets and helps for better cache locality. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250324203607.703850-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- .../networking/net_cachelines/inet_connection_sock.rst | 1 - include/net/inet_connection_sock.h | 12 ++++++++---- net/dccp/output.c | 5 ++--- net/dccp/timer.c | 4 ++-- net/ipv4/tcp_output.c | 7 +++---- net/ipv4/tcp_timer.c | 5 +++-- net/mptcp/options.c | 1 - net/mptcp/protocol.c | 1 - 8 files changed, 18 insertions(+), 18 deletions(-) (limited to 'include/net') diff --git a/Documentation/networking/net_cachelines/inet_connection_sock.rst b/Documentation/networking/net_cachelines/inet_connection_sock.rst index 5fb0dd70c9af..8fae85ebb773 100644 --- a/Documentation/networking/net_cachelines/inet_connection_sock.rst +++ b/Documentation/networking/net_cachelines/inet_connection_sock.rst @@ -38,7 +38,6 @@ struct icsk_ack_u8 quick read_write w struct icsk_ack_u8 pingpong struct icsk_ack_u8 retry write_mostly read_write inet_csk_clear_xmit_timer,tcp_rearm_rto,tcp_event_new_data_sent,tcp_write_xmit,__tcp_send_ack,tcp_send_ack, struct icsk_ack_u8 ato read_mostly write_mostly tcp_dec_quickack_mode,tcp_event_ack_sent,__tcp_transmit_skb,__tcp_send_ack,tcp_send_ack -struct icsk_ack_unsigned_long timeout read_write read_write inet_csk_reset_xmit_timer,tcp_connect struct icsk_ack_u32 lrcvtime read_write tcp_finish_connect,tcp_connect,tcp_event_data_sent,__tcp_transmit_skb struct icsk_ack_u16 rcv_mss write_mostly read_mostly __tcp_select_window,__tcp_cleanup_rbuf,tcp_initialize_rcv_mss,tcp_connect_init struct icsk_mtup_int search_high read_write tcp_mtup_init,tcp_sync_mss,tcp_connect_init,tcp_mtu_check_reprobe,tcp_write_xmit diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 6dca0ac6fbc6..1735db332aab 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -113,7 +113,6 @@ struct inet_connection_sock { lrcv_flowlabel:20, /* last received ipv6 flowlabel */ dst_quick_ack:1, /* cache dst RTAX_QUICKACK */ unused:3; - unsigned long timeout; /* Currently scheduled timeout */ __u32 lrcvtime; /* timestamp of last received data packet */ __u16 last_seg_size; /* Size of last incoming segment */ __u16 rcv_mss; /* MSS used for delayed ACK decisions */ @@ -191,6 +190,12 @@ icsk_timeout(const struct inet_connection_sock *icsk) return READ_ONCE(icsk->icsk_retransmit_timer.expires); } +static inline unsigned long +icsk_delack_timeout(const struct inet_connection_sock *icsk) +{ + return READ_ONCE(icsk->icsk_delack_timer.expires); +} + static inline void inet_csk_clear_xmit_timer(struct sock *sk, const int what) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -226,16 +231,15 @@ static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what, when = max_when; } + when += jiffies; if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0 || what == ICSK_TIME_LOSS_PROBE || what == ICSK_TIME_REO_TIMEOUT) { smp_store_release(&icsk->icsk_pending, what); - when += jiffies; sk_reset_timer(sk, &icsk->icsk_retransmit_timer, when); } else if (what == ICSK_TIME_DACK) { smp_store_release(&icsk->icsk_ack.pending, icsk->icsk_ack.pending | ICSK_ACK_TIMER); - icsk->icsk_ack.timeout = jiffies + when; - sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout); + sk_reset_timer(sk, &icsk->icsk_delack_timer, when); } else { pr_debug("inet_csk BUG: unknown timer value\n"); } diff --git a/net/dccp/output.c b/net/dccp/output.c index 5c2e24f3c39b..39cf3430177a 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -627,11 +627,10 @@ void dccp_send_delayed_ack(struct sock *sk) return; } - if (!time_before(timeout, icsk->icsk_ack.timeout)) - timeout = icsk->icsk_ack.timeout; + if (!time_before(timeout, icsk_delack_timeout(icsk))) + timeout = icsk_delack_timeout(icsk); } icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER; - icsk->icsk_ack.timeout = timeout; sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout); } #endif diff --git a/net/dccp/timer.c b/net/dccp/timer.c index 9fd14a336189..232ac4ae0a73 100644 --- a/net/dccp/timer.c +++ b/net/dccp/timer.c @@ -185,9 +185,9 @@ static void dccp_delack_timer(struct timer_list *t) if (sk->sk_state == DCCP_CLOSED || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER)) goto out; - if (time_after(icsk->icsk_ack.timeout, jiffies)) { + if (time_after(icsk_delack_timeout(icsk), jiffies)) { sk_reset_timer(sk, &icsk->icsk_delack_timer, - icsk->icsk_ack.timeout); + icsk_delack_timeout(icsk)); goto out; } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 9a6061017114..13295a59d22e 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -4225,17 +4225,16 @@ void tcp_send_delayed_ack(struct sock *sk) /* Use new timeout only if there wasn't a older one earlier. */ if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) { /* If delack timer is about to expire, send ACK now. */ - if (time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) { + if (time_before_eq(icsk_delack_timeout(icsk), jiffies + (ato >> 2))) { tcp_send_ack(sk); return; } - if (!time_before(timeout, icsk->icsk_ack.timeout)) - timeout = icsk->icsk_ack.timeout; + if (!time_before(timeout, icsk_delack_timeout(icsk))) + timeout = icsk_delack_timeout(icsk); } smp_store_release(&icsk->icsk_ack.pending, icsk->icsk_ack.pending | ICSK_ACK_SCHED | ICSK_ACK_TIMER); - icsk->icsk_ack.timeout = timeout; sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout); } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index d828b74c3e73..d64383b06a29 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -322,8 +322,9 @@ void tcp_delack_timer_handler(struct sock *sk) if (!(icsk->icsk_ack.pending & ICSK_ACK_TIMER)) return; - if (time_after(icsk->icsk_ack.timeout, jiffies)) { - sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout); + if (time_after(icsk_delack_timeout(icsk), jiffies)) { + sk_reset_timer(sk, &icsk->icsk_delack_timer, + icsk_delack_timeout(icsk)); return; } icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER; diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 23949ae2a3a8..421ced031289 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -432,7 +432,6 @@ static void clear_3rdack_retransmission(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); sk_stop_timer(sk, &icsk->icsk_delack_timer); - icsk->icsk_ack.timeout = 0; icsk->icsk_ack.ato = 0; icsk->icsk_ack.pending &= ~(ICSK_ACK_SCHED | ICSK_ACK_TIMER); } diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 1ac378ba1d67..44f7ab463d75 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3420,7 +3420,6 @@ static void schedule_3rdack_retransmission(struct sock *ssk) WARN_ON_ONCE(icsk->icsk_ack.pending & ICSK_ACK_TIMER); smp_store_release(&icsk->icsk_ack.pending, icsk->icsk_ack.pending | ICSK_ACK_SCHED | ICSK_ACK_TIMER); - icsk->icsk_ack.timeout = timeout; sk_reset_timer(ssk, &icsk->icsk_delack_timer, timeout); } -- cgit v1.2.3 From eed14eb510c040a3826b633048244bb7a816c67d Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 21 Mar 2025 15:42:16 -0400 Subject: Bluetooth: MGMT: Add LL Privacy Setting This adds LL Privacy (bit 22) to Read Controller Information so the likes of bluetoothd(1) can detect when the controller supports it or not. Fixes: e209e5ccc5ac ("Bluetooth: MGMT: Mark LL Privacy as stable") Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/mgmt.h | 1 + net/bluetooth/mgmt.c | 6 ++++++ 2 files changed, 7 insertions(+) (limited to 'include/net') diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index affac861efdc..3575cd16049a 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -113,6 +113,7 @@ struct mgmt_rp_read_index_list { #define MGMT_SETTING_CIS_PERIPHERAL BIT(19) #define MGMT_SETTING_ISO_BROADCASTER BIT(20) #define MGMT_SETTING_ISO_SYNC_RECEIVER BIT(21) +#define MGMT_SETTING_LL_PRIVACY BIT(22) #define MGMT_OP_READ_INFO 0x0004 #define MGMT_READ_INFO_SIZE 0 diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 4fd30ba243be..c1e1e529e26c 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -851,6 +851,9 @@ static u32 get_supported_settings(struct hci_dev *hdev) if (cis_peripheral_capable(hdev)) settings |= MGMT_SETTING_CIS_PERIPHERAL; + if (ll_privacy_capable(hdev)) + settings |= MGMT_SETTING_LL_PRIVACY; + settings |= MGMT_SETTING_PHY_CONFIGURATION; return settings; @@ -933,6 +936,9 @@ static u32 get_current_settings(struct hci_dev *hdev) if (sync_recv_capable(hdev)) settings |= MGMT_SETTING_ISO_SYNC_RECEIVER; + if (ll_privacy_capable(hdev)) + settings |= MGMT_SETTING_LL_PRIVACY; + return settings; } -- cgit v1.2.3 From 705094f6556d88540e1076e432cbca6b596421dd Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Wed, 26 Mar 2025 15:01:48 +1100 Subject: unix: fix up for "apparmor: add fine grained af_unix mediation" After merging the apparmor tree, today's linux-next build (x86_64 allmodconfig) failed like this: security/apparmor/af_unix.c: In function 'unix_state_double_lock': security/apparmor/af_unix.c:627:17: error: implicit declaration of function 'unix_state_lock'; did you mean 'unix_state_double_lock'? [-Wimplicit-function-declaration] 627 | unix_state_lock(sk1); | ^~~~~~~~~~~~~~~ | unix_state_double_lock security/apparmor/af_unix.c: In function 'unix_state_double_unlock': security/apparmor/af_unix.c:642:17: error: implicit declaration of function 'unix_state_unlock'; did you mean 'unix_state_double_lock'? [-Wimplicit-function-declaration] 642 | unix_state_unlock(sk1); | ^~~~~~~~~~~~~~~~~ | unix_state_double_lock Caused by commit c05e705812d1 ("apparmor: add fine grained af_unix mediation") interacting with commit 84960bf24031 ("af_unix: Move internal definitions to net/unix/.") from the net-next tree. Reviewed-by: Kuniyuki Iwashima Signed-off-by: Stephen Rothwell Link: https://patch.msgid.link/20250326150148.72d9138d@canb.auug.org.au Signed-off-by: Jakub Kicinski --- include/net/af_unix.h | 3 +++ net/unix/af_unix.h | 3 --- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/af_unix.h b/include/net/af_unix.h index b588069ece7e..1af1841b7601 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -55,4 +55,7 @@ struct unix_sock { #define unix_sk(ptr) container_of_const(ptr, struct unix_sock, sk) #define unix_peer(sk) (unix_sk(sk)->peer) +#define unix_state_lock(s) spin_lock(&unix_sk(s)->lock) +#define unix_state_unlock(s) spin_unlock(&unix_sk(s)->lock) + #endif diff --git a/net/unix/af_unix.h b/net/unix/af_unix.h index ed4aedc42813..59db179df9bb 100644 --- a/net/unix/af_unix.h +++ b/net/unix/af_unix.h @@ -8,9 +8,6 @@ #define UNIX_HASH_SIZE (256 * 2) #define UNIX_HASH_BITS 8 -#define unix_state_lock(s) spin_lock(&unix_sk(s)->lock) -#define unix_state_unlock(s) spin_unlock(&unix_sk(s)->lock) - struct sock *unix_peer_get(struct sock *sk); struct unix_skb_parms { -- cgit v1.2.3