From 71a98248c63c535eaa4d4c22f099b68d902006d0 Mon Sep 17 00:00:00 2001 From: Yasuaki Torimaru Date: Thu, 26 Mar 2026 14:58:00 +0900 Subject: xfrm: clear trailing padding in build_polexpire() build_expire() clears the trailing padding bytes of struct xfrm_user_expire after setting the hard field via memset_after(), but the analogous function build_polexpire() does not do this for struct xfrm_user_polexpire. The padding bytes after the __u8 hard field are left uninitialized from the heap allocation, and are then sent to userspace via netlink multicast to XFRMNLGRP_EXPIRE listeners, leaking kernel heap memory contents. Add the missing memset_after() call, matching build_expire(). Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Cc: stable@vger.kernel.org Signed-off-by: Yasuaki Torimaru Reviewed-by: Simon Horman Reviewed-by: Breno Leitao Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 1656b487f833..5d59c11fc01e 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -3960,6 +3960,8 @@ static int build_polexpire(struct sk_buff *skb, struct xfrm_policy *xp, return err; } upe->hard = !!hard; + /* clear the padding bytes */ + memset_after(upe, 0, hard); nlmsg_end(skb, nlh); return 0; -- cgit v1.2.3 From 7081d46d32312f1a31f0e0e99c6835a394037599 Mon Sep 17 00:00:00 2001 From: Keenan Dong Date: Thu, 26 Mar 2026 20:36:39 +0800 Subject: xfrm: account XFRMA_IF_ID in aevent size calculation xfrm_get_ae() allocates the reply skb with xfrm_aevent_msgsize(), then build_aevent() appends attributes including XFRMA_IF_ID when x->if_id is set. xfrm_aevent_msgsize() does not include space for XFRMA_IF_ID. For states with if_id, build_aevent() can fail with -EMSGSIZE and hit BUG_ON(err < 0) in xfrm_get_ae(), turning a malformed netlink interaction into a kernel panic. Account XFRMA_IF_ID in the size calculation unconditionally and replace the BUG_ON with normal error unwinding. Fixes: 7e6526404ade ("xfrm: Add a new lookup key to match xfrm interfaces.") Reported-by: Keenan Dong Signed-off-by: Keenan Dong Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 5d59c11fc01e..a779590c985a 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -2677,7 +2677,8 @@ static inline unsigned int xfrm_aevent_msgsize(struct xfrm_state *x) + nla_total_size(4) /* XFRM_AE_RTHR */ + nla_total_size(4) /* XFRM_AE_ETHR */ + nla_total_size(sizeof(x->dir)) /* XFRMA_SA_DIR */ - + nla_total_size(4); /* XFRMA_SA_PCPU */ + + nla_total_size(4) /* XFRMA_SA_PCPU */ + + nla_total_size(sizeof(x->if_id)); /* XFRMA_IF_ID */ } static int build_aevent(struct sk_buff *skb, struct xfrm_state *x, const struct km_event *c) @@ -2789,7 +2790,12 @@ static int xfrm_get_ae(struct sk_buff *skb, struct nlmsghdr *nlh, c.portid = nlh->nlmsg_pid; err = build_aevent(r_skb, x, &c); - BUG_ON(err < 0); + if (err < 0) { + spin_unlock_bh(&x->lock); + xfrm_state_put(x); + kfree_skb(r_skb); + return err; + } err = nlmsg_unicast(xfrm_net_nlsk(net, skb), r_skb, NETLINK_CB(skb).portid); spin_unlock_bh(&x->lock); -- cgit v1.2.3 From ad8391d37f334ee73ba91926f8b4e4cf6d31ea04 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 1 Apr 2026 00:54:15 +0000 Subject: bpf: sockmap: Fix use-after-free of sk->sk_socket in sk_psock_verdict_data_ready(). syzbot reported use-after-free of AF_UNIX socket's sk->sk_socket in sk_psock_verdict_data_ready(). [0] In unix_stream_sendmsg(), the peer socket's ->sk_data_ready() is called after dropping its unix_state_lock(). Although the sender socket holds the peer's refcount, it does not prevent the peer's sock_orphan(), and the peer's sk_socket might be freed after one RCU grace period. Let's fetch the peer's sk->sk_socket and sk->sk_socket->ops under RCU in sk_psock_verdict_data_ready(). [0]: BUG: KASAN: slab-use-after-free in sk_psock_verdict_data_ready+0xec/0x590 net/core/skmsg.c:1278 Read of size 8 at addr ffff8880594da860 by task syz.4.1842/11013 CPU: 1 UID: 0 PID: 11013 Comm: syz.4.1842 Not tainted syzkaller #0 PREEMPT(full) Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 02/12/2026 Call Trace: dump_stack_lvl+0xe8/0x150 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:378 [inline] print_report+0xba/0x230 mm/kasan/report.c:482 kasan_report+0x117/0x150 mm/kasan/report.c:595 sk_psock_verdict_data_ready+0xec/0x590 net/core/skmsg.c:1278 unix_stream_sendmsg+0x8a3/0xe80 net/unix/af_unix.c:2482 sock_sendmsg_nosec net/socket.c:721 [inline] __sock_sendmsg net/socket.c:736 [inline] ____sys_sendmsg+0x972/0x9f0 net/socket.c:2585 ___sys_sendmsg+0x2a5/0x360 net/socket.c:2639 __sys_sendmsg net/socket.c:2671 [inline] __do_sys_sendmsg net/socket.c:2676 [inline] __se_sys_sendmsg net/socket.c:2674 [inline] __x64_sys_sendmsg+0x1bd/0x2a0 net/socket.c:2674 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0x14d/0xf80 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7facf899c819 Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 e8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007facf9827028 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00007facf8c15fa0 RCX: 00007facf899c819 RDX: 0000000000000000 RSI: 0000200000000500 RDI: 0000000000000004 RBP: 00007facf8a32c91 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 00007facf8c16038 R14: 00007facf8c15fa0 R15: 00007ffd41b01c78 Allocated by task 11013: kasan_save_stack mm/kasan/common.c:57 [inline] kasan_save_track+0x3e/0x80 mm/kasan/common.c:78 unpoison_slab_object mm/kasan/common.c:340 [inline] __kasan_slab_alloc+0x6c/0x80 mm/kasan/common.c:366 kasan_slab_alloc include/linux/kasan.h:253 [inline] slab_post_alloc_hook mm/slub.c:4538 [inline] slab_alloc_node mm/slub.c:4866 [inline] kmem_cache_alloc_lru_noprof+0x2b8/0x640 mm/slub.c:4885 sock_alloc_inode+0x28/0xc0 net/socket.c:316 alloc_inode+0x6a/0x1b0 fs/inode.c:347 new_inode_pseudo include/linux/fs.h:3003 [inline] sock_alloc net/socket.c:631 [inline] __sock_create+0x12d/0x9d0 net/socket.c:1562 sock_create net/socket.c:1656 [inline] __sys_socketpair+0x1c4/0x560 net/socket.c:1803 __do_sys_socketpair net/socket.c:1856 [inline] __se_sys_socketpair net/socket.c:1853 [inline] __x64_sys_socketpair+0x9b/0xb0 net/socket.c:1853 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0x14d/0xf80 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f Freed by task 15: kasan_save_stack mm/kasan/common.c:57 [inline] kasan_save_track+0x3e/0x80 mm/kasan/common.c:78 kasan_save_free_info+0x46/0x50 mm/kasan/generic.c:584 poison_slab_object mm/kasan/common.c:253 [inline] __kasan_slab_free+0x5c/0x80 mm/kasan/common.c:285 kasan_slab_free include/linux/kasan.h:235 [inline] slab_free_hook mm/slub.c:2685 [inline] slab_free mm/slub.c:6165 [inline] kmem_cache_free+0x187/0x630 mm/slub.c:6295 rcu_do_batch kernel/rcu/tree.c:2617 [inline] rcu_core+0x7cd/0x1070 kernel/rcu/tree.c:2869 handle_softirqs+0x22a/0x870 kernel/softirq.c:622 run_ksoftirqd+0x36/0x60 kernel/softirq.c:1063 smpboot_thread_fn+0x541/0xa50 kernel/smpboot.c:160 kthread+0x388/0x470 kernel/kthread.c:436 ret_from_fork+0x51e/0xb90 arch/x86/kernel/process.c:158 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245 Fixes: c63829182c37 ("af_unix: Implement ->psock_update_sk_prot()") Closes: https://lore.kernel.org/bpf/69cc6b9f.a70a0220.128fd0.004b.GAE@google.com/ Reported-by: syzbot+2184232f07e3677fbaef@syzkaller.appspotmail.com Signed-off-by: Kuniyuki Iwashima Signed-off-by: Martin KaFai Lau Reviewed-by: Jiayuan Chen Link: https://patch.msgid.link/20260401005418.2452999-1-kuniyu@google.com --- net/core/skmsg.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 3261793abe83..6187a83bd741 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -1267,17 +1267,20 @@ out: static void sk_psock_verdict_data_ready(struct sock *sk) { - struct socket *sock = sk->sk_socket; - const struct proto_ops *ops; + const struct proto_ops *ops = NULL; + struct socket *sock; int copied; trace_sk_data_ready(sk); - if (unlikely(!sock)) - return; - ops = READ_ONCE(sock->ops); + rcu_read_lock(); + sock = READ_ONCE(sk->sk_socket); + if (likely(sock)) + ops = READ_ONCE(sock->ops); + rcu_read_unlock(); if (!ops || !ops->read_skb) return; + copied = ops->read_skb(sk, sk_psock_verdict_recv); if (copied >= 0) { struct sk_psock *psock; -- cgit v1.2.3 From 3a359bf5c61d52e7f09754108309d637532164a6 Mon Sep 17 00:00:00 2001 From: Ruide Cao Date: Thu, 2 Apr 2026 23:12:31 +0800 Subject: batman-adv: reject oversized global TT response buffers batadv_tt_prepare_tvlv_global_data() builds the allocation length for a global TT response in 16-bit temporaries. When a remote originator advertises a large enough global TT, the TT payload length plus the VLAN header offset can exceed 65535 and wrap before kmalloc(). The full-table response path still uses the original TT payload length when it fills tt_change, so the wrapped allocation is too small and batadv_tt_prepare_tvlv_global_data() writes past the end of the heap object before the later packet-size check runs. Fix this by rejecting TT responses whose TVLV value length cannot fit in the 16-bit TVLV payload length field. Fixes: 7ea7b4a14275 ("batman-adv: make the TT CRC logic VLAN specific") Cc: stable@vger.kernel.org Reported-by: Yifan Wu Reported-by: Juefei Pu Co-developed-by: Yuan Tan Signed-off-by: Yuan Tan Suggested-by: Xin Liu Tested-by: Ren Wei Signed-off-by: Ruide Cao Signed-off-by: Ren Wei Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/translation-table.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 6e95e883c2bf..05cddcf994f6 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -798,8 +798,8 @@ batadv_tt_prepare_tvlv_global_data(struct batadv_orig_node *orig_node, { u16 num_vlan = 0; u16 num_entries = 0; - u16 change_offset; - u16 tvlv_len; + u16 tvlv_len = 0; + unsigned int change_offset; struct batadv_tvlv_tt_vlan_data *tt_vlan; struct batadv_orig_node_vlan *vlan; u8 *tt_change_ptr; @@ -816,6 +816,11 @@ batadv_tt_prepare_tvlv_global_data(struct batadv_orig_node *orig_node, if (*tt_len < 0) *tt_len = batadv_tt_len(num_entries); + if (change_offset > U16_MAX || *tt_len > U16_MAX - change_offset) { + *tt_len = 0; + goto out; + } + tvlv_len = *tt_len; tvlv_len += change_offset; -- cgit v1.2.3 From c842743d073bdd683606cb414eb0ca84465dd834 Mon Sep 17 00:00:00 2001 From: Ruide Cao Date: Thu, 2 Apr 2026 22:46:20 +0800 Subject: net: sched: act_csum: validate nested VLAN headers tcf_csum_act() walks nested VLAN headers directly from skb->data when an skb still carries in-payload VLAN tags. The current code reads vlan->h_vlan_encapsulated_proto and then pulls VLAN_HLEN bytes without first ensuring that the full VLAN header is present in the linear area. If only part of an inner VLAN header is linearized, accessing h_vlan_encapsulated_proto reads past the linear area, and the following skb_pull(VLAN_HLEN) may violate skb invariants. Fix this by requiring pskb_may_pull(skb, VLAN_HLEN) before accessing and pulling each nested VLAN header. If the header still is not fully available, drop the packet through the existing error path. Fixes: 2ecba2d1e45b ("net: sched: act_csum: Fix csum calc for tagged packets") Reported-by: Yifan Wu Reported-by: Juefei Pu Co-developed-by: Yuan Tan Signed-off-by: Yuan Tan Suggested-by: Xin Liu Tested-by: Ren Wei Signed-off-by: Ruide Cao Signed-off-by: Ren Wei Reviewed-by: Simon Horman Link: https://patch.msgid.link/22df2fcb49f410203eafa5d97963dd36089f4ecf.1774892775.git.caoruide123@gmail.com Signed-off-by: Jakub Kicinski --- net/sched/act_csum.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 213e1ce9d2da..a9e4635d899e 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -604,8 +604,12 @@ again: protocol = skb->protocol; orig_vlan_tag_present = true; } else { - struct vlan_hdr *vlan = (struct vlan_hdr *)skb->data; + struct vlan_hdr *vlan; + if (!pskb_may_pull(skb, VLAN_HLEN)) + goto drop; + + vlan = (struct vlan_hdr *)skb->data; protocol = vlan->h_vlan_encapsulated_proto; skb_pull(skb, VLAN_HLEN); skb_reset_network_header(skb); -- cgit v1.2.3 From 4e65a8b8daa18d63255ec58964dd192c7fdd9f8b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 2 Apr 2026 10:17:32 +0000 Subject: ipv6: ioam: fix potential NULL dereferences in __ioam6_fill_trace_data() We need to check __in6_dev_get() for possible NULL value, as suggested by Yiming Qian. Also add skb_dst_dev_rcu() instead of skb_dst_dev(), and two missing READ_ONCE(). Note that @dev can't be NULL. Fixes: 9ee11f0fff20 ("ipv6: ioam: Data plane support for Pre-allocated Trace") Reported-by: Yiming Qian Signed-off-by: Eric Dumazet Reviewed-by: Justin Iurman Link: https://patch.msgid.link/20260402101732.1188059-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/ioam6.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/ipv6/ioam6.c b/net/ipv6/ioam6.c index 3978773bec42..05a0b7d7e2aa 100644 --- a/net/ipv6/ioam6.c +++ b/net/ipv6/ioam6.c @@ -710,7 +710,9 @@ static void __ioam6_fill_trace_data(struct sk_buff *skb, struct ioam6_schema *sc, unsigned int sclen, bool is_input) { - struct net_device *dev = skb_dst_dev(skb); + /* Note: skb_dst_dev_rcu() can't be NULL at this point. */ + struct net_device *dev = skb_dst_dev_rcu(skb); + struct inet6_dev *i_skb_dev, *idev; struct timespec64 ts; ktime_t tstamp; u64 raw64; @@ -721,13 +723,16 @@ static void __ioam6_fill_trace_data(struct sk_buff *skb, data = trace->data + trace->remlen * 4 - trace->nodelen * 4 - sclen * 4; + i_skb_dev = skb->dev ? __in6_dev_get(skb->dev) : NULL; + idev = __in6_dev_get(dev); + /* hop_lim and node_id */ if (trace->type.bit0) { byte = ipv6_hdr(skb)->hop_limit; if (is_input) byte--; - raw32 = dev_net(dev)->ipv6.sysctl.ioam6_id; + raw32 = READ_ONCE(dev_net(dev)->ipv6.sysctl.ioam6_id); *(__be32 *)data = cpu_to_be32((byte << 24) | raw32); data += sizeof(__be32); @@ -735,18 +740,18 @@ static void __ioam6_fill_trace_data(struct sk_buff *skb, /* ingress_if_id and egress_if_id */ if (trace->type.bit1) { - if (!skb->dev) + if (!i_skb_dev) raw16 = IOAM6_U16_UNAVAILABLE; else - raw16 = (__force u16)READ_ONCE(__in6_dev_get(skb->dev)->cnf.ioam6_id); + raw16 = (__force u16)READ_ONCE(i_skb_dev->cnf.ioam6_id); *(__be16 *)data = cpu_to_be16(raw16); data += sizeof(__be16); - if (dev->flags & IFF_LOOPBACK) + if ((dev->flags & IFF_LOOPBACK) || !idev) raw16 = IOAM6_U16_UNAVAILABLE; else - raw16 = (__force u16)READ_ONCE(__in6_dev_get(dev)->cnf.ioam6_id); + raw16 = (__force u16)READ_ONCE(idev->cnf.ioam6_id); *(__be16 *)data = cpu_to_be16(raw16); data += sizeof(__be16); @@ -822,7 +827,7 @@ static void __ioam6_fill_trace_data(struct sk_buff *skb, if (is_input) byte--; - raw64 = dev_net(dev)->ipv6.sysctl.ioam6_id_wide; + raw64 = READ_ONCE(dev_net(dev)->ipv6.sysctl.ioam6_id_wide); *(__be64 *)data = cpu_to_be64(((u64)byte << 56) | raw64); data += sizeof(__be64); @@ -830,18 +835,18 @@ static void __ioam6_fill_trace_data(struct sk_buff *skb, /* ingress_if_id and egress_if_id (wide) */ if (trace->type.bit9) { - if (!skb->dev) + if (!i_skb_dev) raw32 = IOAM6_U32_UNAVAILABLE; else - raw32 = READ_ONCE(__in6_dev_get(skb->dev)->cnf.ioam6_id_wide); + raw32 = READ_ONCE(i_skb_dev->cnf.ioam6_id_wide); *(__be32 *)data = cpu_to_be32(raw32); data += sizeof(__be32); - if (dev->flags & IFF_LOOPBACK) + if ((dev->flags & IFF_LOOPBACK) || !idev) raw32 = IOAM6_U32_UNAVAILABLE; else - raw32 = READ_ONCE(__in6_dev_get(dev)->cnf.ioam6_id_wide); + raw32 = READ_ONCE(idev->cnf.ioam6_id_wide); *(__be32 *)data = cpu_to_be32(raw32); data += sizeof(__be32); -- cgit v1.2.3 From 1979645e1842cb7017525a61a0e0e0beb924d02a Mon Sep 17 00:00:00 2001 From: Zijing Yin Date: Thu, 2 Apr 2026 07:01:53 -0700 Subject: bridge: guard local VLAN-0 FDB helpers against NULL vlan group When CONFIG_BRIDGE_VLAN_FILTERING is not set, br_vlan_group() and nbp_vlan_group() return NULL (br_private.h stub definitions). The BR_BOOLOPT_FDB_LOCAL_VLAN_0 toggle code is compiled unconditionally and reaches br_fdb_delete_locals_per_vlan_port() and br_fdb_insert_locals_per_vlan_port(), where the NULL vlan group pointer is dereferenced via list_for_each_entry(v, &vg->vlan_list, vlist). The observed crash is in the delete path, triggered when creating a bridge with IFLA_BR_MULTI_BOOLOPT containing BR_BOOLOPT_FDB_LOCAL_VLAN_0 via RTM_NEWLINK. The insert helper has the same bug pattern. Oops: general protection fault, probably for non-canonical address 0xdffffc0000000056: 0000 [#1] KASAN NOPTI KASAN: null-ptr-deref in range [0x00000000000002b0-0x00000000000002b7] RIP: 0010:br_fdb_delete_locals_per_vlan+0x2b9/0x310 Call Trace: br_fdb_toggle_local_vlan_0+0x452/0x4c0 br_toggle_fdb_local_vlan_0+0x31/0x80 net/bridge/br.c:276 br_boolopt_toggle net/bridge/br.c:313 br_boolopt_multi_toggle net/bridge/br.c:364 br_changelink net/bridge/br_netlink.c:1542 br_dev_newlink net/bridge/br_netlink.c:1575 Add NULL checks for the vlan group pointer in both helpers, returning early when there are no VLANs to iterate. This matches the existing pattern used by other bridge FDB functions such as br_fdb_add() and br_fdb_delete(). Fixes: 21446c06b441 ("net: bridge: Introduce UAPI for BR_BOOLOPT_FDB_LOCAL_VLAN_0") Signed-off-by: Zijing Yin Reviewed-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20260402140153.3925663-1-yzjaurora@gmail.com Signed-off-by: Jakub Kicinski --- net/bridge/br_fdb.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 0501ffcb8a3d..e2c17f620f00 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -597,6 +597,9 @@ static void br_fdb_delete_locals_per_vlan_port(struct net_bridge *br, dev = br->dev; } + if (!vg) + return; + list_for_each_entry(v, &vg->vlan_list, vlist) br_fdb_find_delete_local(br, p, dev->dev_addr, v->vid); } @@ -630,6 +633,9 @@ static int br_fdb_insert_locals_per_vlan_port(struct net_bridge *br, dev = br->dev; } + if (!vg) + return 0; + list_for_each_entry(v, &vg->vlan_list, vlist) { if (!br_vlan_should_use(v)) continue; -- cgit v1.2.3 From 7b735ef81286007794a227ce2539419479c02a5f Mon Sep 17 00:00:00 2001 From: Nikolaos Gkarlis Date: Thu, 2 Apr 2026 20:14:32 +0200 Subject: rtnetlink: add missing netlink_ns_capable() check for peer netns rtnl_newlink() lacks a CAP_NET_ADMIN capability check on the peer network namespace when creating paired devices (veth, vxcan, netkit). This allows an unprivileged user with a user namespace to create interfaces in arbitrary network namespaces, including init_net. Add a netlink_ns_capable() check for CAP_NET_ADMIN in the peer namespace before allowing device creation to proceed. Fixes: 81adee47dfb6 ("net: Support specifying the network namespace upon device creation.") Signed-off-by: Nikolaos Gkarlis Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260402181432.4126920-1-nickgarlis@gmail.com Signed-off-by: Jakub Kicinski --- net/core/rtnetlink.c | 40 +++++++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index fae8034efbff..69daba3ddaf0 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3894,28 +3894,42 @@ out_unregister: goto out; } -static struct net *rtnl_get_peer_net(const struct rtnl_link_ops *ops, +static struct net *rtnl_get_peer_net(struct sk_buff *skb, + const struct rtnl_link_ops *ops, struct nlattr *tbp[], struct nlattr *data[], struct netlink_ext_ack *extack) { - struct nlattr *tb[IFLA_MAX + 1]; + struct nlattr *tb[IFLA_MAX + 1], **attrs; + struct net *net; int err; - if (!data || !data[ops->peer_type]) - return rtnl_link_get_net_ifla(tbp); - - err = rtnl_nla_parse_ifinfomsg(tb, data[ops->peer_type], extack); - if (err < 0) - return ERR_PTR(err); - - if (ops->validate) { - err = ops->validate(tb, NULL, extack); + if (!data || !data[ops->peer_type]) { + attrs = tbp; + } else { + err = rtnl_nla_parse_ifinfomsg(tb, data[ops->peer_type], extack); if (err < 0) return ERR_PTR(err); + + if (ops->validate) { + err = ops->validate(tb, NULL, extack); + if (err < 0) + return ERR_PTR(err); + } + + attrs = tb; } - return rtnl_link_get_net_ifla(tb); + net = rtnl_link_get_net_ifla(attrs); + if (IS_ERR_OR_NULL(net)) + return net; + + if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) { + put_net(net); + return ERR_PTR(-EPERM); + } + + return net; } static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -4054,7 +4068,7 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, } if (ops->peer_type) { - peer_net = rtnl_get_peer_net(ops, tb, data, extack); + peer_net = rtnl_get_peer_net(skb, ops, tb, data, extack); if (IS_ERR(peer_net)) { ret = PTR_ERR(peer_net); goto put_ops; -- cgit v1.2.3 From 48a5fe38772b6f039522469ee6131a67838221a8 Mon Sep 17 00:00:00 2001 From: Oleh Konko Date: Thu, 2 Apr 2026 09:48:57 +0000 Subject: tipc: fix bc_ackers underflow on duplicate GRP_ACK_MSG The GRP_ACK_MSG handler in tipc_group_proto_rcv() currently decrements bc_ackers on every inbound group ACK, even when the same member has already acknowledged the current broadcast round. Because bc_ackers is a u16, a duplicate ACK received after the last legitimate ACK wraps the counter to 65535. Once wrapped, tipc_group_bc_cong() keeps reporting congestion and later group broadcasts on the affected socket stay blocked until the group is recreated. Fix this by ignoring duplicate or stale ACKs before touching bc_acked or bc_ackers. This makes repeated GRP_ACK_MSG handling idempotent and prevents the underflow path. Fixes: 2f487712b893 ("tipc: guarantee that group broadcast doesn't bypass group unicast") Cc: stable@vger.kernel.org Signed-off-by: Oleh Konko Reviewed-by: Tung Nguyen Reviewed-by: Simon Horman Link: https://patch.msgid.link/41a4833f368641218e444fdcff822039.security@1seal.org Signed-off-by: Jakub Kicinski --- net/tipc/group.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/group.c b/net/tipc/group.c index e0e6227b433b..14e6732624e2 100644 --- a/net/tipc/group.c +++ b/net/tipc/group.c @@ -746,6 +746,7 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup, u32 port = msg_origport(hdr); struct tipc_member *m, *pm; u16 remitted, in_flight; + u16 acked; if (!grp) return; @@ -798,7 +799,10 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup, case GRP_ACK_MSG: if (!m) return; - m->bc_acked = msg_grp_bc_acked(hdr); + acked = msg_grp_bc_acked(hdr); + if (less_eq(acked, m->bc_acked)) + return; + m->bc_acked = acked; if (--grp->bc_ackers) return; list_del_init(&m->small_win); -- cgit v1.2.3 From 06aaf04ca815f7a1f17762fd847b7bc14b8833fb Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Thu, 2 Apr 2026 09:26:12 +0200 Subject: ipv4: nexthop: avoid duplicate NHA_HW_STATS_ENABLE on nexthop group dump Currently NHA_HW_STATS_ENABLE is included twice everytime a dump of nexthop group is performed with NHA_OP_FLAG_DUMP_STATS. As all the stats querying were moved to nla_put_nh_group_stats(), leave only that instance of the attribute querying. Fixes: 5072ae00aea4 ("net: nexthop: Expose nexthop group HW stats to user space") Signed-off-by: Fernando Fernandez Mancera Reviewed-by: Eric Dumazet Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20260402072613.25262-1-fmancera@suse.de Signed-off-by: Jakub Kicinski --- net/ipv4/nexthop.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index c942f1282236..a0c694583299 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -902,8 +902,7 @@ static int nla_put_nh_group(struct sk_buff *skb, struct nexthop *nh, goto nla_put_failure; if (op_flags & NHA_OP_FLAG_DUMP_STATS && - (nla_put_u32(skb, NHA_HW_STATS_ENABLE, nhg->hw_stats) || - nla_put_nh_group_stats(skb, nh, op_flags))) + nla_put_nh_group_stats(skb, nh, op_flags)) goto nla_put_failure; return 0; -- cgit v1.2.3 From 14cf0cd35361f4e94824bf8a42f72713d7702a73 Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Thu, 2 Apr 2026 09:26:13 +0200 Subject: ipv4: nexthop: allocate skb dynamically in rtm_get_nexthop() When querying a nexthop object via RTM_GETNEXTHOP, the kernel currently allocates a fixed-size skb using NLMSG_GOODSIZE. While sufficient for single nexthops and small Equal-Cost Multi-Path groups, this fixed allocation fails for large nexthop groups like 512 nexthops. This results in the following warning splat: WARNING: net/ipv4/nexthop.c:3395 at rtm_get_nexthop+0x176/0x1c0, CPU#20: rep/4608 [...] RIP: 0010:rtm_get_nexthop (net/ipv4/nexthop.c:3395) [...] Call Trace: rtnetlink_rcv_msg (net/core/rtnetlink.c:6989) netlink_rcv_skb (net/netlink/af_netlink.c:2550) netlink_unicast (net/netlink/af_netlink.c:1319 net/netlink/af_netlink.c:1344) netlink_sendmsg (net/netlink/af_netlink.c:1894) ____sys_sendmsg (net/socket.c:721 net/socket.c:736 net/socket.c:2585) ___sys_sendmsg (net/socket.c:2641) __sys_sendmsg (net/socket.c:2671) do_syscall_64 (arch/x86/entry/syscall_64.c:63 arch/x86/entry/syscall_64.c:94) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) Fix this by allocating the size dynamically using nh_nlmsg_size() and using nlmsg_new(), this is consistent with nexthop_notify() behavior. In addition, adjust nh_nlmsg_size_grp() so it calculates the size needed based on flags passed. While at it, also add the size of NHA_FDB for nexthop group size calculation as it was missing too. This cannot be reproduced via iproute2 as the group size is currently limited and the command fails as follows: addattr_l ERROR: message exceeded bound of 1048 Fixes: 430a049190de ("nexthop: Add support for nexthop groups") Reported-by: Yiming Qian Closes: https://lore.kernel.org/netdev/CAL_bE8Li2h4KO+AQFXW4S6Yb_u5X4oSKnkywW+LPFjuErhqELA@mail.gmail.com/ Signed-off-by: Fernando Fernandez Mancera Reviewed-by: Eric Dumazet Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20260402072613.25262-2-fmancera@suse.de Signed-off-by: Jakub Kicinski --- net/ipv4/nexthop.c | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index a0c694583299..2c9036c719b6 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -1003,16 +1003,32 @@ static size_t nh_nlmsg_size_grp_res(struct nh_group *nhg) nla_total_size_64bit(8);/* NHA_RES_GROUP_UNBALANCED_TIME */ } -static size_t nh_nlmsg_size_grp(struct nexthop *nh) +static size_t nh_nlmsg_size_grp(struct nexthop *nh, u32 op_flags) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); size_t sz = sizeof(struct nexthop_grp) * nhg->num_nh; size_t tot = nla_total_size(sz) + - nla_total_size(2); /* NHA_GROUP_TYPE */ + nla_total_size(2) + /* NHA_GROUP_TYPE */ + nla_total_size(0); /* NHA_FDB */ if (nhg->resilient) tot += nh_nlmsg_size_grp_res(nhg); + if (op_flags & NHA_OP_FLAG_DUMP_STATS) { + tot += nla_total_size(0) + /* NHA_GROUP_STATS */ + nla_total_size(4); /* NHA_HW_STATS_ENABLE */ + tot += nhg->num_nh * + (nla_total_size(0) + /* NHA_GROUP_STATS_ENTRY */ + nla_total_size(4) + /* NHA_GROUP_STATS_ENTRY_ID */ + nla_total_size_64bit(8)); /* NHA_GROUP_STATS_ENTRY_PACKETS */ + + if (op_flags & NHA_OP_FLAG_DUMP_HW_STATS) { + tot += nhg->num_nh * + nla_total_size_64bit(8); /* NHA_GROUP_STATS_ENTRY_PACKETS_HW */ + tot += nla_total_size(4); /* NHA_HW_STATS_USED */ + } + } + return tot; } @@ -1047,14 +1063,14 @@ static size_t nh_nlmsg_size_single(struct nexthop *nh) return sz; } -static size_t nh_nlmsg_size(struct nexthop *nh) +static size_t nh_nlmsg_size(struct nexthop *nh, u32 op_flags) { size_t sz = NLMSG_ALIGN(sizeof(struct nhmsg)); sz += nla_total_size(4); /* NHA_ID */ if (nh->is_group) - sz += nh_nlmsg_size_grp(nh) + + sz += nh_nlmsg_size_grp(nh, op_flags) + nla_total_size(4) + /* NHA_OP_FLAGS */ 0; else @@ -1070,7 +1086,7 @@ static void nexthop_notify(int event, struct nexthop *nh, struct nl_info *info) struct sk_buff *skb; int err = -ENOBUFS; - skb = nlmsg_new(nh_nlmsg_size(nh), gfp_any()); + skb = nlmsg_new(nh_nlmsg_size(nh, 0), gfp_any()); if (!skb) goto errout; @@ -3376,15 +3392,15 @@ static int rtm_get_nexthop(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (err) return err; - err = -ENOBUFS; - skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); - if (!skb) - goto out; - err = -ENOENT; nh = nexthop_find_by_id(net, id); if (!nh) - goto errout_free; + goto out; + + err = -ENOBUFS; + skb = nlmsg_new(nh_nlmsg_size(nh, op_flags), GFP_KERNEL); + if (!skb) + goto out; err = nh_fill_node(skb, nh, RTM_NEWNEXTHOP, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0, op_flags); -- cgit v1.2.3 From fde29fd9349327acc50d19a0b5f3d5a6c964dfd8 Mon Sep 17 00:00:00 2001 From: Yiqi Sun Date: Thu, 2 Apr 2026 15:04:19 +0800 Subject: ipv4: icmp: fix null-ptr-deref in icmp_build_probe() ipv6_stub->ipv6_dev_find() may return ERR_PTR(-EAFNOSUPPORT) when the IPv6 stack is not active (CONFIG_IPV6=m and not loaded), and passing this error pointer to dev_hold() will cause a kernel crash with null-ptr-deref. Instead, silently discard the request. RFC 8335 does not appear to define a specific response for the case where an IPv6 interface identifier is syntactically valid but the implementation cannot perform the lookup at runtime, and silently dropping the request may safer than misreporting "No Such Interface". Fixes: d329ea5bd884 ("icmp: add response to RFC 8335 PROBE messages") Signed-off-by: Yiqi Sun Link: https://patch.msgid.link/20260402070419.2291578-1-sunyiqixm@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv4/icmp.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 568bd1e95d44..4e2a6c70dcd8 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -1346,6 +1346,13 @@ bool icmp_build_probe(struct sk_buff *skb, struct icmphdr *icmphdr) if (iio->ident.addr.ctype3_hdr.addrlen != sizeof(struct in6_addr)) goto send_mal_query; dev = ipv6_stub->ipv6_dev_find(net, &iio->ident.addr.ip_addr.ipv6_addr, dev); + /* + * If IPv6 identifier lookup is unavailable, silently + * discard the request instead of misreporting NO_IF. + */ + if (IS_ERR(dev)) + return false; + dev_hold(dev); break; #endif -- cgit v1.2.3 From 82d8701b2c930d0e96b0dbc9115a218d791cb0d2 Mon Sep 17 00:00:00 2001 From: Haoze Xie Date: Mon, 6 Apr 2026 21:17:28 +0800 Subject: batman-adv: hold claim backbone gateways by reference batadv_bla_add_claim() can replace claim->backbone_gw and drop the old gateway's last reference while readers still follow the pointer. The netlink claim dump path dereferences claim->backbone_gw->orig and takes claim->backbone_gw->crc_lock without pinning the underlying backbone gateway. batadv_bla_check_claim() still has the same naked pointer access pattern. Reuse batadv_bla_claim_get_backbone_gw() in both readers so they operate on a stable gateway reference until the read-side work is complete. This keeps the dump and claim-check paths aligned with the lifetime rules introduced for the other BLA claim readers. Fixes: 23721387c409 ("batman-adv: add basic bridge loop avoidance code") Fixes: 04f3f5bf1883 ("batman-adv: add B.A.T.M.A.N. Dump BLA claims via netlink") Cc: stable@vger.kernel.org Reported-by: Yifan Wu Reported-by: Juefei Pu Co-developed-by: Yuan Tan Signed-off-by: Yuan Tan Suggested-by: Xin Liu Signed-off-by: Haoze Xie Signed-off-by: Ao Zhou Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/bridge_loop_avoidance.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index 3dc791c15bf7..648fa97ea913 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -2130,6 +2130,7 @@ batadv_bla_claim_dump_entry(struct sk_buff *msg, u32 portid, struct batadv_bla_claim *claim) { const u8 *primary_addr = primary_if->net_dev->dev_addr; + struct batadv_bla_backbone_gw *backbone_gw; u16 backbone_crc; bool is_own; void *hdr; @@ -2145,32 +2146,35 @@ batadv_bla_claim_dump_entry(struct sk_buff *msg, u32 portid, genl_dump_check_consistent(cb, hdr); - is_own = batadv_compare_eth(claim->backbone_gw->orig, - primary_addr); + backbone_gw = batadv_bla_claim_get_backbone_gw(claim); + + is_own = batadv_compare_eth(backbone_gw->orig, primary_addr); - spin_lock_bh(&claim->backbone_gw->crc_lock); - backbone_crc = claim->backbone_gw->crc; - spin_unlock_bh(&claim->backbone_gw->crc_lock); + spin_lock_bh(&backbone_gw->crc_lock); + backbone_crc = backbone_gw->crc; + spin_unlock_bh(&backbone_gw->crc_lock); if (is_own) if (nla_put_flag(msg, BATADV_ATTR_BLA_OWN)) { genlmsg_cancel(msg, hdr); - goto out; + goto put_backbone_gw; } if (nla_put(msg, BATADV_ATTR_BLA_ADDRESS, ETH_ALEN, claim->addr) || nla_put_u16(msg, BATADV_ATTR_BLA_VID, claim->vid) || nla_put(msg, BATADV_ATTR_BLA_BACKBONE, ETH_ALEN, - claim->backbone_gw->orig) || + backbone_gw->orig) || nla_put_u16(msg, BATADV_ATTR_BLA_CRC, backbone_crc)) { genlmsg_cancel(msg, hdr); - goto out; + goto put_backbone_gw; } genlmsg_end(msg, hdr); ret = 0; +put_backbone_gw: + batadv_backbone_gw_put(backbone_gw); out: return ret; } @@ -2448,6 +2452,7 @@ out: bool batadv_bla_check_claim(struct batadv_priv *bat_priv, u8 *addr, unsigned short vid) { + struct batadv_bla_backbone_gw *backbone_gw; struct batadv_bla_claim search_claim; struct batadv_bla_claim *claim = NULL; struct batadv_hard_iface *primary_if = NULL; @@ -2470,9 +2475,13 @@ bool batadv_bla_check_claim(struct batadv_priv *bat_priv, * return false. */ if (claim) { - if (!batadv_compare_eth(claim->backbone_gw->orig, + backbone_gw = batadv_bla_claim_get_backbone_gw(claim); + + if (!batadv_compare_eth(backbone_gw->orig, primary_if->net_dev->dev_addr)) ret = false; + + batadv_backbone_gw_put(backbone_gw); batadv_claim_put(claim); } -- cgit v1.2.3 From a315e022a72d95ef5f1d4e58e903cb492b0ad931 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Thu, 2 Apr 2026 17:49:51 +0200 Subject: xsk: tighten UMEM headroom validation to account for tailroom and min frame MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The current headroom validation in xdp_umem_reg() could leave us with insufficient space dedicated to even receive minimum-sized ethernet frame. Furthermore if multi-buffer would come to play then skb_shared_info stored at the end of XSK frame would be corrupted. HW typically works with 128-aligned sizes so let us provide this value as bare minimum. Multi-buffer setting is known later in the configuration process so besides accounting for 128 bytes, let us also take care of tailroom space upfront. Reviewed-by: Björn Töpel Acked-by: Stanislav Fomichev Fixes: 99e3a236dd43 ("xsk: Add missing check on user supplied headroom size") Signed-off-by: Maciej Fijalkowski Link: https://patch.msgid.link/20260402154958.562179-2-maciej.fijalkowski@intel.com Signed-off-by: Jakub Kicinski --- net/xdp/xdp_umem.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 066ce07c506d..58da2f4f4397 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -203,7 +203,8 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) if (!unaligned_chunks && chunks_rem) return -EINVAL; - if (headroom >= chunk_size - XDP_PACKET_HEADROOM) + if (headroom > chunk_size - XDP_PACKET_HEADROOM - + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) - 128) return -EINVAL; if (mr->flags & XDP_UMEM_TX_METADATA_LEN) { -- cgit v1.2.3 From 1ee1605138fc94cc8f8f273321dd2471c64977f9 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Thu, 2 Apr 2026 17:49:52 +0200 Subject: xsk: respect tailroom for ZC setups MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Multi-buffer XDP stores information about frags in skb_shared_info that sits at the tailroom of a packet. The storage space is reserved via xdp_data_hard_end(): ((xdp)->data_hard_start + (xdp)->frame_sz - \ SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) and then we refer to it via macro below: static inline struct skb_shared_info * xdp_get_shared_info_from_buff(const struct xdp_buff *xdp) { return (struct skb_shared_info *)xdp_data_hard_end(xdp); } Currently we do not respect this tailroom space in multi-buffer AF_XDP ZC scenario. To address this, introduce xsk_pool_get_tailroom() and use it within xsk_pool_get_rx_frame_size() which is used in ZC drivers to configure length of HW Rx buffer. Typically drivers on Rx Hw buffers side work on 128 byte alignment so let us align the value returned by xsk_pool_get_rx_frame_size() in order to avoid addressing this on driver's side. This addresses the fact that idpf uses mentioned function *before* pool->dev being set so we were at risk that after subtracting tailroom we would not provide 128-byte aligned value to HW. Since xsk_pool_get_rx_frame_size() is actively used in xsk_rcv_check() and __xsk_rcv(), add a variant of this routine that will not include 128 byte alignment and therefore old behavior is preserved. Reviewed-by: Björn Töpel Acked-by: Stanislav Fomichev Fixes: 24ea50127ecf ("xsk: support mbuf on ZC RX") Signed-off-by: Maciej Fijalkowski Link: https://patch.msgid.link/20260402154958.562179-3-maciej.fijalkowski@intel.com Signed-off-by: Jakub Kicinski --- include/net/xdp_sock_drv.h | 23 ++++++++++++++++++++++- net/xdp/xsk.c | 4 ++-- 2 files changed, 24 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 6b9ebae2dc95..46797645a0c2 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -41,16 +41,37 @@ static inline u32 xsk_pool_get_headroom(struct xsk_buff_pool *pool) return XDP_PACKET_HEADROOM + pool->headroom; } +static inline u32 xsk_pool_get_tailroom(bool mbuf) +{ + return mbuf ? SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) : 0; +} + static inline u32 xsk_pool_get_chunk_size(struct xsk_buff_pool *pool) { return pool->chunk_size; } -static inline u32 xsk_pool_get_rx_frame_size(struct xsk_buff_pool *pool) +static inline u32 __xsk_pool_get_rx_frame_size(struct xsk_buff_pool *pool) { return xsk_pool_get_chunk_size(pool) - xsk_pool_get_headroom(pool); } +static inline u32 xsk_pool_get_rx_frame_size(struct xsk_buff_pool *pool) +{ + u32 frame_size = __xsk_pool_get_rx_frame_size(pool); + struct xdp_umem *umem = pool->umem; + bool mbuf; + + /* Reserve tailroom only for zero-copy pools that opted into + * multi-buffer. The reserved area is used for skb_shared_info, + * matching the XDP core's xdp_data_hard_end() layout. + */ + mbuf = pool->dev && (umem->flags & XDP_UMEM_SG_FLAG); + frame_size -= xsk_pool_get_tailroom(mbuf); + + return ALIGN_DOWN(frame_size, 128); +} + static inline u32 xsk_pool_get_rx_frag_step(struct xsk_buff_pool *pool) { return pool->unaligned ? 0 : xsk_pool_get_chunk_size(pool); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 6149f6a79897..c8ef9e427c9c 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -239,7 +239,7 @@ static u32 xsk_copy_xdp(void *to, void **from, u32 to_len, static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { - u32 frame_size = xsk_pool_get_rx_frame_size(xs->pool); + u32 frame_size = __xsk_pool_get_rx_frame_size(xs->pool); void *copy_from = xsk_copy_xdp_start(xdp), *copy_to; u32 from_len, meta_len, rem, num_desc; struct xdp_buff_xsk *xskb; @@ -338,7 +338,7 @@ static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) return -EINVAL; - if (len > xsk_pool_get_rx_frame_size(xs->pool) && !xs->sg) { + if (len > __xsk_pool_get_rx_frame_size(xs->pool) && !xs->sg) { xs->rx_dropped++; return -ENOSPC; } -- cgit v1.2.3 From 93e84fe45b752d17a5a46b306ed78f0133bbc719 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Thu, 2 Apr 2026 17:49:53 +0200 Subject: xsk: fix XDP_UMEM_SG_FLAG issues MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently xp_assign_dev_shared() is missing XDP_USE_SG being propagated to flags so set it in order to preserve mtu check that is supposed to be done only when no multi-buffer setup is in picture. Also, this flag has the same value as XDP_UMEM_TX_SW_CSUM so we could get unexpected SG setups for software Tx checksums. Since csum flag is UAPI, modify value of XDP_UMEM_SG_FLAG. Fixes: d609f3d228a8 ("xsk: add multi-buffer support for sockets sharing umem") Reviewed-by: Björn Töpel Signed-off-by: Maciej Fijalkowski Link: https://patch.msgid.link/20260402154958.562179-4-maciej.fijalkowski@intel.com Signed-off-by: Jakub Kicinski --- include/net/xdp_sock.h | 2 +- net/xdp/xsk_buff_pool.c | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 23e8861e8b25..ebac60a3d8a1 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -14,7 +14,7 @@ #include #include -#define XDP_UMEM_SG_FLAG (1 << 1) +#define XDP_UMEM_SG_FLAG BIT(3) struct net_device; struct xsk_queue; diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 37b7a68b89b3..729602a3cec0 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -247,6 +247,10 @@ int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_sock *umem_xs, struct xdp_umem *umem = umem_xs->umem; flags = umem->zc ? XDP_ZEROCOPY : XDP_COPY; + + if (umem->flags & XDP_UMEM_SG_FLAG) + flags |= XDP_USE_SG; + if (umem_xs->pool->uses_need_wakeup) flags |= XDP_USE_NEED_WAKEUP; -- cgit v1.2.3 From 36ee60b569ba0dfb6f961333b90d19ab5b323fa9 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Thu, 2 Apr 2026 17:49:54 +0200 Subject: xsk: validate MTU against usable frame size on bind MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit AF_XDP bind currently accepts zero-copy pool configurations without verifying that the device MTU fits into the usable frame space provided by the UMEM chunk. This becomes a problem since we started to respect tailroom which is subtracted from chunk_size (among with headroom). 2k chunk size might not provide enough space for standard 1500 MTU, so let us catch such settings at bind time. Furthermore, validate whether underlying HW will be able to satisfy configured MTU wrt XSK's frame size multiplied by supported Rx buffer chain length (that is exposed via net_device::xdp_zc_max_segs). Fixes: 24ea50127ecf ("xsk: support mbuf on ZC RX") Reviewed-by: Björn Töpel Signed-off-by: Maciej Fijalkowski Link: https://patch.msgid.link/20260402154958.562179-5-maciej.fijalkowski@intel.com Signed-off-by: Jakub Kicinski --- net/xdp/xsk_buff_pool.c | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 729602a3cec0..cd7bc50872f6 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -10,6 +10,8 @@ #include "xdp_umem.h" #include "xsk.h" +#define ETH_PAD_LEN (ETH_HLEN + 2 * VLAN_HLEN + ETH_FCS_LEN) + void xp_add_xsk(struct xsk_buff_pool *pool, struct xdp_sock *xs) { if (!xs->tx) @@ -157,8 +159,12 @@ static void xp_disable_drv_zc(struct xsk_buff_pool *pool) int xp_assign_dev(struct xsk_buff_pool *pool, struct net_device *netdev, u16 queue_id, u16 flags) { + u32 needed = netdev->mtu + ETH_PAD_LEN; + u32 segs = netdev->xdp_zc_max_segs; + bool mbuf = flags & XDP_USE_SG; bool force_zc, force_copy; struct netdev_bpf bpf; + u32 frame_size; int err = 0; ASSERT_RTNL(); @@ -178,7 +184,7 @@ int xp_assign_dev(struct xsk_buff_pool *pool, if (err) return err; - if (flags & XDP_USE_SG) + if (mbuf) pool->umem->flags |= XDP_UMEM_SG_FLAG; if (flags & XDP_USE_NEED_WAKEUP) @@ -200,8 +206,24 @@ int xp_assign_dev(struct xsk_buff_pool *pool, goto err_unreg_pool; } - if (netdev->xdp_zc_max_segs == 1 && (flags & XDP_USE_SG)) { - err = -EOPNOTSUPP; + if (mbuf) { + if (segs == 1) { + err = -EOPNOTSUPP; + goto err_unreg_pool; + } + } else { + segs = 1; + } + + /* open-code xsk_pool_get_rx_frame_size() as pool->dev is not + * set yet at this point; we are before getting down to driver + */ + frame_size = __xsk_pool_get_rx_frame_size(pool) - + xsk_pool_get_tailroom(mbuf); + frame_size = ALIGN_DOWN(frame_size, 128); + + if (needed > frame_size * segs) { + err = -EINVAL; goto err_unreg_pool; } -- cgit v1.2.3 From 0f42e3f4fe2a58394e37241d02d9ca6ab7b7d516 Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Fri, 3 Apr 2026 09:45:12 +0800 Subject: net: skb: fix cross-cache free of KFENCE-allocated skb head SKB_SMALL_HEAD_CACHE_SIZE is intentionally set to a non-power-of-2 value (e.g. 704 on x86_64) to avoid collisions with generic kmalloc bucket sizes. This ensures that skb_kfree_head() can reliably use skb_end_offset to distinguish skb heads allocated from skb_small_head_cache vs. generic kmalloc caches. However, when KFENCE is enabled, kfence_ksize() returns the exact requested allocation size instead of the slab bucket size. If a caller (e.g. bpf_test_init) allocates skb head data via kzalloc() and the requested size happens to equal SKB_SMALL_HEAD_CACHE_SIZE, then slab_build_skb() -> ksize() returns that exact value. After subtracting skb_shared_info overhead, skb_end_offset ends up matching SKB_SMALL_HEAD_HEADROOM, causing skb_kfree_head() to incorrectly free the object to skb_small_head_cache instead of back to the original kmalloc cache, resulting in a slab cross-cache free: kmem_cache_free(skbuff_small_head): Wrong slab cache. Expected skbuff_small_head but got kmalloc-1k Fix this by always calling kfree(head) in skb_kfree_head(). This keeps the free path generic and avoids allocator-specific misclassification for KFENCE objects. Fixes: bf9f1baa279f ("net: add dedicated kmem_cache for typical/small skb->head") Reported-by: Antonius Closes: https://lore.kernel.org/netdev/CAK8a0jxC5L5N7hq-DT2_NhUyjBxrPocoiDazzsBk4TGgT1r4-A@mail.gmail.com/ Signed-off-by: Jiayuan Chen Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20260403014517.142550-1-jiayuan.chen@linux.dev Signed-off-by: Jakub Kicinski --- net/core/skbuff.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 0e217041958a..43ee86dcf2ea 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1083,10 +1083,7 @@ static int skb_pp_frag_ref(struct sk_buff *skb) static void skb_kfree_head(void *head, unsigned int end_offset) { - if (end_offset == SKB_SMALL_HEAD_HEADROOM) - kmem_cache_free(net_hotdata.skb_small_head_cache, head); - else - kfree(head); + kfree(head); } static void skb_free_head(struct sk_buff *skb) -- cgit v1.2.3 From 069daad4f2ae9c5c108131995529d5f02392c446 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Thu, 2 Apr 2026 13:31:04 +0200 Subject: xfrm: Wait for RCU readers during policy netns exit xfrm_policy_fini() frees the policy_bydst hash tables after flushing the policy work items and deleting all policies, but it does not wait for concurrent RCU readers to leave their read-side critical sections first. The policy_bydst tables are published via rcu_assign_pointer() and are looked up through rcu_dereference_check(), so netns teardown must also wait for an RCU grace period before freeing the table memory. Fix this by adding synchronize_rcu() before freeing the policy hash tables. Fixes: e1e551bc5630 ("xfrm: policy: prepare policy_bydst hash for rcu lookups") Signed-off-by: Steffen Klassert Reviewed-by: Florian Westphal --- net/xfrm/xfrm_policy.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 362939aa56cf..8f0188e763c7 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -4290,6 +4290,8 @@ static void xfrm_policy_fini(struct net *net) #endif xfrm_policy_flush(net, XFRM_POLICY_TYPE_MAIN, false); + synchronize_rcu(); + WARN_ON(!list_empty(&net->xfrm.policy_all)); for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { -- cgit v1.2.3 From 1c428b03840094410c5fb6a5db30640486bbbfcb Mon Sep 17 00:00:00 2001 From: Qi Tang Date: Thu, 2 Apr 2026 19:44:01 +0800 Subject: xfrm: hold dev ref until after transport_finish NF_HOOK After async crypto completes, xfrm_input_resume() calls dev_put() immediately on re-entry before the skb reaches transport_finish. The skb->dev pointer is then used inside NF_HOOK and its okfn, which can race with device teardown. Remove the dev_put from the async resumption entry and instead drop the reference after the NF_HOOK call in transport_finish, using a saved device pointer since NF_HOOK may consume the skb. This covers NF_DROP, NF_QUEUE and NF_STOLEN paths that skip the okfn. For non-transport exits (decaps, gro, drop) and secondary async return points, release the reference inline when async is set. Suggested-by: Florian Westphal Fixes: acf568ee859f ("xfrm: Reinject transport-mode packets through tasklet") Cc: stable@vger.kernel.org Signed-off-by: Qi Tang Signed-off-by: Steffen Klassert --- net/ipv4/xfrm4_input.c | 5 ++++- net/ipv6/xfrm6_input.c | 5 ++++- net/xfrm/xfrm_input.c | 18 ++++++++++++++---- 3 files changed, 22 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index f28cfd88eaf5..c2eac844bcdb 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -50,6 +50,7 @@ int xfrm4_transport_finish(struct sk_buff *skb, int async) { struct xfrm_offload *xo = xfrm_offload(skb); struct iphdr *iph = ip_hdr(skb); + struct net_device *dev = skb->dev; iph->protocol = XFRM_MODE_SKB_CB(skb)->protocol; @@ -73,8 +74,10 @@ int xfrm4_transport_finish(struct sk_buff *skb, int async) } NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, - dev_net(skb->dev), NULL, skb, skb->dev, NULL, + dev_net(dev), NULL, skb, dev, NULL, xfrm4_rcv_encap_finish); + if (async) + dev_put(dev); return 0; } diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index 9005fc156a20..699a001ac166 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -43,6 +43,7 @@ static int xfrm6_transport_finish2(struct net *net, struct sock *sk, int xfrm6_transport_finish(struct sk_buff *skb, int async) { struct xfrm_offload *xo = xfrm_offload(skb); + struct net_device *dev = skb->dev; int nhlen = -skb_network_offset(skb); skb_network_header(skb)[IP6CB(skb)->nhoff] = @@ -68,8 +69,10 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async) } NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, - dev_net(skb->dev), NULL, skb, skb->dev, NULL, + dev_net(dev), NULL, skb, dev, NULL, xfrm6_transport_finish2); + if (async) + dev_put(dev); return 0; } diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index dc1312ed5a09..f65291eba1f6 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -506,7 +506,6 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) /* An encap_type of -1 indicates async resumption. */ if (encap_type == -1) { async = 1; - dev_put(skb->dev); seq = XFRM_SKB_CB(skb)->seq.input.low; spin_lock(&x->lock); goto resume; @@ -659,8 +658,11 @@ process: dev_hold(skb->dev); nexthdr = x->type->input(x, skb); - if (nexthdr == -EINPROGRESS) + if (nexthdr == -EINPROGRESS) { + if (async) + dev_put(skb->dev); return 0; + } dev_put(skb->dev); spin_lock(&x->lock); @@ -695,9 +697,11 @@ resume: XFRM_MODE_SKB_CB(skb)->protocol = nexthdr; err = xfrm_inner_mode_input(x, skb); - if (err == -EINPROGRESS) + if (err == -EINPROGRESS) { + if (async) + dev_put(skb->dev); return 0; - else if (err) { + } else if (err) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMODEERROR); goto drop; } @@ -734,6 +738,8 @@ resume_decapped: sp->olen = 0; if (skb_valid_dst(skb)) skb_dst_drop(skb); + if (async) + dev_put(skb->dev); gro_cells_receive(&gro_cells, skb); return 0; } else { @@ -753,6 +759,8 @@ resume_decapped: sp->olen = 0; if (skb_valid_dst(skb)) skb_dst_drop(skb); + if (async) + dev_put(skb->dev); gro_cells_receive(&gro_cells, skb); return err; } @@ -763,6 +771,8 @@ resume_decapped: drop_unlock: spin_unlock(&x->lock); drop: + if (async) + dev_put(skb->dev); xfrm_rcv_cb(skb, family, x && x->type ? x->type->proto : nexthdr, -1); kfree_skb(skb); return 0; -- cgit v1.2.3 From 83317cce60a032c49480dcdabe146435bd689d03 Mon Sep 17 00:00:00 2001 From: Kotlyarov Mihail Date: Sat, 4 Apr 2026 12:05:20 +0300 Subject: xfrm: fix refcount leak in xfrm_migrate_policy_find syzkaller reported a memory leak in xfrm_policy_alloc: BUG: memory leak unreferenced object 0xffff888114d79000 (size 1024): comm "syz.1.17", pid 931 ... xfrm_policy_alloc+0xb3/0x4b0 net/xfrm/xfrm_policy.c:432 The root cause is a double call to xfrm_pol_hold_rcu() in xfrm_migrate_policy_find(). The lookup function already returns a policy with held reference, making the second call redundant. Remove the redundant xfrm_pol_hold_rcu() call to fix the refcount imbalance and prevent the memory leak. Found by Linux Verification Center (linuxtesting.org) with Syzkaller. Fixes: 563d5ca93e88 ("xfrm: switch migrate to xfrm_policy_lookup_bytype") Signed-off-by: Kotlyarov Mihail Reviewed-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 8f0188e763c7..a872af5610dc 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -4528,9 +4528,6 @@ static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector * pol = xfrm_policy_lookup_bytype(net, type, &fl, sel->family, dir, if_id); if (IS_ERR_OR_NULL(pol)) goto out_unlock; - - if (!xfrm_pol_hold_rcu(pol)) - pol = NULL; out_unlock: rcu_read_unlock(); return pol; -- cgit v1.2.3 From 1beb76b2053b68c491b78370794b8ff63c8f8c02 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 6 Apr 2026 17:33:03 +0200 Subject: xfrm_user: fix info leak in build_mapping() struct xfrm_usersa_id has a one-byte padding hole after the proto field, which ends up never getting set to zero before copying out to userspace. Fix that up by zeroing out the whole structure before setting individual variables. Fixes: 3a2dfbe8acb1 ("xfrm: Notify changes in UDP encapsulation via netlink") Cc: Steffen Klassert Cc: Herbert Xu Cc: "David S. Miller" Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: Simon Horman Assisted-by: gregkh_clanker_t1000 Signed-off-by: Greg Kroah-Hartman Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index a779590c985a..baa43c325da2 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -4172,6 +4172,7 @@ static int build_mapping(struct sk_buff *skb, struct xfrm_state *x, um = nlmsg_data(nlh); + memset(&um->id, 0, sizeof(um->id)); memcpy(&um->id.daddr, &x->id.daddr, sizeof(um->id.daddr)); um->id.spi = x->id.spi; um->id.family = x->props.family; -- cgit v1.2.3 From d10119968d0e1f2b669604baf2a8b5fdb72fa6b4 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 6 Apr 2026 17:34:22 +0200 Subject: xfrm_user: fix info leak in build_report() struct xfrm_user_report is a __u8 proto field followed by a struct xfrm_selector which means there is three "empty" bytes of padding, but the padding is never zeroed before copying to userspace. Fix that up by zeroing the structure before setting individual member variables. Cc: stable Cc: Steffen Klassert Cc: Herbert Xu Cc: "David S. Miller" Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: Simon Horman Assisted-by: gregkh_clanker_t1000 Signed-off-by: Greg Kroah-Hartman Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index baa43c325da2..d56450f61669 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -4125,6 +4125,7 @@ static int build_report(struct sk_buff *skb, u8 proto, return -EMSGSIZE; ur = nlmsg_data(nlh); + memset(ur, 0, sizeof(*ur)); ur->proto = proto; memcpy(&ur->sel, sel, sizeof(ur->sel)); -- cgit v1.2.3 From 426c355742f02cf743b347d9d7dbdc1bfbfa31ef Mon Sep 17 00:00:00 2001 From: Zhengchuan Liang Date: Sun, 22 Mar 2026 11:46:08 -0700 Subject: net: af_key: zero aligned sockaddr tail in PF_KEY exports PF_KEY export paths use `pfkey_sockaddr_size()` when reserving sockaddr payload space, so IPv6 addresses occupy 32 bytes on the wire. However, `pfkey_sockaddr_fill()` initializes only the first 28 bytes of `struct sockaddr_in6`, leaving the final 4 aligned bytes uninitialized. Not every PF_KEY message is affected. The state and policy dump builders already zero the whole message buffer before filling the sockaddr payloads. Keep the fix to the export paths that still append aligned sockaddr payloads with plain `skb_put()`: - `SADB_ACQUIRE` - `SADB_X_NAT_T_NEW_MAPPING` - `SADB_X_MIGRATE` Fix those paths by clearing only the aligned sockaddr tail after `pfkey_sockaddr_fill()`. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Fixes: 08de61beab8a ("[PFKEYV2]: Extension for dynamic update of endpoint address(es)") Reported-by: Yifan Wu Reported-by: Juefei Pu Co-developed-by: Yuan Tan Signed-off-by: Yuan Tan Suggested-by: Xin Liu Tested-by: Xiao Liu Signed-off-by: Zhengchuan Liang Signed-off-by: Steffen Klassert --- net/key/af_key.c | 52 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 34 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/key/af_key.c b/net/key/af_key.c index 72ac2ace419d..5d480ae39405 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -757,6 +757,22 @@ static unsigned int pfkey_sockaddr_fill(const xfrm_address_t *xaddr, __be16 port return 0; } +static unsigned int pfkey_sockaddr_fill_zero_tail(const xfrm_address_t *xaddr, + __be16 port, + struct sockaddr *sa, + unsigned short family) +{ + unsigned int prefixlen; + int sockaddr_len = pfkey_sockaddr_len(family); + int sockaddr_size = pfkey_sockaddr_size(family); + + prefixlen = pfkey_sockaddr_fill(xaddr, port, sa, family); + if (sockaddr_size > sockaddr_len) + memset((u8 *)sa + sockaddr_len, 0, sockaddr_size - sockaddr_len); + + return prefixlen; +} + static struct sk_buff *__pfkey_xfrm_state2msg(const struct xfrm_state *x, int add_keys, int hsc) { @@ -3206,9 +3222,9 @@ static int pfkey_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *t, struct addr->sadb_address_proto = 0; addr->sadb_address_reserved = 0; addr->sadb_address_prefixlen = - pfkey_sockaddr_fill(&x->props.saddr, 0, - (struct sockaddr *) (addr + 1), - x->props.family); + pfkey_sockaddr_fill_zero_tail(&x->props.saddr, 0, + (struct sockaddr *)(addr + 1), + x->props.family); if (!addr->sadb_address_prefixlen) BUG(); @@ -3221,9 +3237,9 @@ static int pfkey_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *t, struct addr->sadb_address_proto = 0; addr->sadb_address_reserved = 0; addr->sadb_address_prefixlen = - pfkey_sockaddr_fill(&x->id.daddr, 0, - (struct sockaddr *) (addr + 1), - x->props.family); + pfkey_sockaddr_fill_zero_tail(&x->id.daddr, 0, + (struct sockaddr *)(addr + 1), + x->props.family); if (!addr->sadb_address_prefixlen) BUG(); @@ -3421,9 +3437,9 @@ static int pfkey_send_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, addr->sadb_address_proto = 0; addr->sadb_address_reserved = 0; addr->sadb_address_prefixlen = - pfkey_sockaddr_fill(&x->props.saddr, 0, - (struct sockaddr *) (addr + 1), - x->props.family); + pfkey_sockaddr_fill_zero_tail(&x->props.saddr, 0, + (struct sockaddr *)(addr + 1), + x->props.family); if (!addr->sadb_address_prefixlen) BUG(); @@ -3443,9 +3459,9 @@ static int pfkey_send_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, addr->sadb_address_proto = 0; addr->sadb_address_reserved = 0; addr->sadb_address_prefixlen = - pfkey_sockaddr_fill(ipaddr, 0, - (struct sockaddr *) (addr + 1), - x->props.family); + pfkey_sockaddr_fill_zero_tail(ipaddr, 0, + (struct sockaddr *)(addr + 1), + x->props.family); if (!addr->sadb_address_prefixlen) BUG(); @@ -3474,15 +3490,15 @@ static int set_sadb_address(struct sk_buff *skb, int sasize, int type, switch (type) { case SADB_EXT_ADDRESS_SRC: addr->sadb_address_prefixlen = sel->prefixlen_s; - pfkey_sockaddr_fill(&sel->saddr, 0, - (struct sockaddr *)(addr + 1), - sel->family); + pfkey_sockaddr_fill_zero_tail(&sel->saddr, 0, + (struct sockaddr *)(addr + 1), + sel->family); break; case SADB_EXT_ADDRESS_DST: addr->sadb_address_prefixlen = sel->prefixlen_d; - pfkey_sockaddr_fill(&sel->daddr, 0, - (struct sockaddr *)(addr + 1), - sel->family); + pfkey_sockaddr_fill_zero_tail(&sel->daddr, 0, + (struct sockaddr *)(addr + 1), + sel->family); break; default: return -EINVAL; -- cgit v1.2.3 From ea245d78dec594372e27d8c79616baf49e98a4a1 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 30 Mar 2026 11:14:13 +0200 Subject: net: rfkill: prevent unlimited numbers of rfkill events from being created Userspace can create an unlimited number of rfkill events if the system is so configured, while not consuming them from the rfkill file descriptor, causing a potential out of memory situation. Prevent this from bounding the number of pending rfkill events at a "large" number (i.e. 1000) to prevent abuses like this. Cc: Johannes Berg Reported-by: Yuan Tan Reported-by: Yifan Wu Reported-by: Juefei Pu Reported-by: Xin Liu Cc: stable Signed-off-by: Greg Kroah-Hartman Link: https://patch.msgid.link/2026033013-disfigure-scroll-e25e@gregkh Signed-off-by: Johannes Berg --- net/rfkill/core.c | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/rfkill/core.c b/net/rfkill/core.c index 2444237bc36a..4827e1fb8804 100644 --- a/net/rfkill/core.c +++ b/net/rfkill/core.c @@ -73,11 +73,14 @@ struct rfkill_int_event { struct rfkill_event_ext ev; }; +/* Max rfkill events that can be "in-flight" for one data source */ +#define MAX_RFKILL_EVENT 1000 struct rfkill_data { struct list_head list; struct list_head events; struct mutex mtx; wait_queue_head_t read_wait; + u32 event_count; bool input_handler; u8 max_size; }; @@ -255,10 +258,12 @@ static void rfkill_global_led_trigger_unregister(void) } #endif /* CONFIG_RFKILL_LEDS */ -static void rfkill_fill_event(struct rfkill_event_ext *ev, - struct rfkill *rfkill, - enum rfkill_operation op) +static int rfkill_fill_event(struct rfkill_int_event *int_ev, + struct rfkill *rfkill, + struct rfkill_data *data, + enum rfkill_operation op) { + struct rfkill_event_ext *ev = &int_ev->ev; unsigned long flags; ev->idx = rfkill->idx; @@ -271,6 +276,15 @@ static void rfkill_fill_event(struct rfkill_event_ext *ev, RFKILL_BLOCK_SW_PREV)); ev->hard_block_reasons = rfkill->hard_block_reasons; spin_unlock_irqrestore(&rfkill->lock, flags); + + scoped_guard(mutex, &data->mtx) { + if (data->event_count++ > MAX_RFKILL_EVENT) { + data->event_count--; + return -ENOSPC; + } + list_add_tail(&int_ev->list, &data->events); + } + return 0; } static void rfkill_send_events(struct rfkill *rfkill, enum rfkill_operation op) @@ -282,10 +296,10 @@ static void rfkill_send_events(struct rfkill *rfkill, enum rfkill_operation op) ev = kzalloc_obj(*ev); if (!ev) continue; - rfkill_fill_event(&ev->ev, rfkill, op); - mutex_lock(&data->mtx); - list_add_tail(&ev->list, &data->events); - mutex_unlock(&data->mtx); + if (rfkill_fill_event(ev, rfkill, data, op)) { + kfree(ev); + continue; + } wake_up_interruptible(&data->read_wait); } } @@ -1186,10 +1200,8 @@ static int rfkill_fop_open(struct inode *inode, struct file *file) if (!ev) goto free; rfkill_sync(rfkill); - rfkill_fill_event(&ev->ev, rfkill, RFKILL_OP_ADD); - mutex_lock(&data->mtx); - list_add_tail(&ev->list, &data->events); - mutex_unlock(&data->mtx); + if (rfkill_fill_event(ev, rfkill, data, RFKILL_OP_ADD)) + kfree(ev); } list_add(&data->list, &rfkill_fds); mutex_unlock(&rfkill_global_mutex); @@ -1259,6 +1271,7 @@ static ssize_t rfkill_fop_read(struct file *file, char __user *buf, ret = -EFAULT; list_del(&ev->list); + data->event_count--; kfree(ev); out: mutex_unlock(&data->mtx); -- cgit v1.2.3 From a9b8b18364fffce4c451e6f6fd218fa4ab646705 Mon Sep 17 00:00:00 2001 From: Muhammad Alifa Ramdhan Date: Fri, 3 Apr 2026 09:36:17 +0800 Subject: net/tls: fix use-after-free in -EBUSY error path of tls_do_encryption The -EBUSY handling in tls_do_encryption(), introduced by commit 859054147318 ("net: tls: handle backlogging of crypto requests"), has a use-after-free due to double cleanup of encrypt_pending and the scatterlist entry. When crypto_aead_encrypt() returns -EBUSY, the request is enqueued to the cryptd backlog and the async callback tls_encrypt_done() will be invoked upon completion. That callback unconditionally restores the scatterlist entry (sge->offset, sge->length) and decrements ctx->encrypt_pending. However, if tls_encrypt_async_wait() returns an error, the synchronous error path in tls_do_encryption() performs the same cleanup again, double-decrementing encrypt_pending and double-restoring the scatterlist. The double-decrement corrupts the encrypt_pending sentinel (initialized to 1), making tls_encrypt_async_wait() permanently skip the wait for pending async callbacks. A subsequent sendmsg can then free the tls_rec via bpf_exec_tx_verdict() while a cryptd callback is still pending, resulting in a use-after-free when the callback fires on the freed record. Fix this by skipping the synchronous cleanup when the -EBUSY async wait returns an error, since the callback has already handled encrypt_pending and sge restoration. Fixes: 859054147318 ("net: tls: handle backlogging of crypto requests") Cc: stable@vger.kernel.org Signed-off-by: Muhammad Alifa Ramdhan Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20260403013617.2838875-1-ramdhan@starlabs.sg Signed-off-by: Paolo Abeni --- net/tls/tls_sw.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'net') diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index dd9dda759bbb..83e78a3d1e65 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -584,6 +584,16 @@ static int tls_do_encryption(struct sock *sk, if (rc == -EBUSY) { rc = tls_encrypt_async_wait(ctx); rc = rc ?: -EINPROGRESS; + /* + * The async callback tls_encrypt_done() has already + * decremented encrypt_pending and restored the sge on + * both success and error. Skip the synchronous cleanup + * below on error, just remove the record and return. + */ + if (rc != -EINPROGRESS) { + list_del(&rec->list); + return rc; + } } if (!rc || rc != -EINPROGRESS) { atomic_dec(&ctx->encrypt_pending); -- cgit v1.2.3 From 944b3b734cfbbe9502274c092bc3b8220764cc92 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 3 Apr 2026 17:19:38 -0700 Subject: net: avoid nul-deref trying to bind mp to incapable device Sashiko points out that we use qops in __net_mp_open_rxq() but never validate they are null. This was introduced when check was moved from netdev_rx_queue_restart(). Look at ops directly instead of the locking config. qops imply netdev_need_ops_lock(). We used netdev_need_ops_lock() initially to signify that the real_num_rx_queues check below is safe without rtnl_lock, but I'm not sure if this is actually clear to most people, anyway. Fixes: da7772a2b4ad ("net: move mp->rx_page_size validation to __net_mp_open_rxq()") Acked-by: Daniel Borkmann Reviewed-by: Mina Almasry Link: https://patch.msgid.link/20260404001938.2425670-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/core/netdev_rx_queue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/netdev_rx_queue.c b/net/core/netdev_rx_queue.c index 668a90658f25..05fd2875d725 100644 --- a/net/core/netdev_rx_queue.c +++ b/net/core/netdev_rx_queue.c @@ -117,7 +117,7 @@ int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, struct netdev_rx_queue *rxq; int ret; - if (!netdev_need_ops_lock(dev)) + if (!qops) return -EOPNOTSUPP; if (rxq_idx >= dev->real_num_rx_queues) { -- cgit v1.2.3 From c3812651b522fe8437ebb7063b75ddb95b571643 Mon Sep 17 00:00:00 2001 From: Andrea Mayer Date: Sat, 4 Apr 2026 02:44:04 +0200 Subject: seg6: separate dst_cache for input and output paths in seg6 lwtunnel The seg6 lwtunnel uses a single dst_cache per encap route, shared between seg6_input_core() and seg6_output_core(). These two paths can perform the post-encap SID lookup in different routing contexts (e.g., ip rules matching on the ingress interface, or VRF table separation). Whichever path runs first populates the cache, and the other reuses it blindly, bypassing its own lookup. Fix this by splitting the cache into cache_input and cache_output, so each path maintains its own cached dst independently. Fixes: 6c8702c60b88 ("ipv6: sr: add support for SRH encapsulation and injection with lwtunnels") Cc: stable@vger.kernel.org Signed-off-by: Andrea Mayer Reviewed-by: Nicolas Dichtel Reviewed-by: Justin Iurman Link: https://patch.msgid.link/20260404004405.4057-2-andrea.mayer@uniroma2.it Signed-off-by: Jakub Kicinski --- net/ipv6/seg6_iptunnel.c | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index 3e1b9991131a..d6a0f7df9080 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -48,7 +48,8 @@ static size_t seg6_lwt_headroom(struct seg6_iptunnel_encap *tuninfo) } struct seg6_lwt { - struct dst_cache cache; + struct dst_cache cache_input; + struct dst_cache cache_output; struct seg6_iptunnel_encap tuninfo[]; }; @@ -488,7 +489,7 @@ static int seg6_input_core(struct net *net, struct sock *sk, slwt = seg6_lwt_lwtunnel(lwtst); local_bh_disable(); - dst = dst_cache_get(&slwt->cache); + dst = dst_cache_get(&slwt->cache_input); local_bh_enable(); err = seg6_do_srh(skb, dst); @@ -504,7 +505,7 @@ static int seg6_input_core(struct net *net, struct sock *sk, /* cache only if we don't create a dst reference loop */ if (!dst->error && lwtst != dst->lwtstate) { local_bh_disable(); - dst_cache_set_ip6(&slwt->cache, dst, + dst_cache_set_ip6(&slwt->cache_input, dst, &ipv6_hdr(skb)->saddr); local_bh_enable(); } @@ -564,7 +565,7 @@ static int seg6_output_core(struct net *net, struct sock *sk, slwt = seg6_lwt_lwtunnel(orig_dst->lwtstate); local_bh_disable(); - dst = dst_cache_get(&slwt->cache); + dst = dst_cache_get(&slwt->cache_output); local_bh_enable(); err = seg6_do_srh(skb, dst); @@ -591,7 +592,7 @@ static int seg6_output_core(struct net *net, struct sock *sk, /* cache only if we don't create a dst reference loop */ if (orig_dst->lwtstate != dst->lwtstate) { local_bh_disable(); - dst_cache_set_ip6(&slwt->cache, dst, &fl6.saddr); + dst_cache_set_ip6(&slwt->cache_output, dst, &fl6.saddr); local_bh_enable(); } @@ -701,11 +702,13 @@ static int seg6_build_state(struct net *net, struct nlattr *nla, slwt = seg6_lwt_lwtunnel(newts); - err = dst_cache_init(&slwt->cache, GFP_ATOMIC); - if (err) { - kfree(newts); - return err; - } + err = dst_cache_init(&slwt->cache_input, GFP_ATOMIC); + if (err) + goto err_free_newts; + + err = dst_cache_init(&slwt->cache_output, GFP_ATOMIC); + if (err) + goto err_destroy_input; memcpy(&slwt->tuninfo, tuninfo, tuninfo_len); @@ -720,11 +723,20 @@ static int seg6_build_state(struct net *net, struct nlattr *nla, *ts = newts; return 0; + +err_destroy_input: + dst_cache_destroy(&slwt->cache_input); +err_free_newts: + kfree(newts); + return err; } static void seg6_destroy_state(struct lwtunnel_state *lwt) { - dst_cache_destroy(&seg6_lwt_lwtunnel(lwt)->cache); + struct seg6_lwt *slwt = seg6_lwt_lwtunnel(lwt); + + dst_cache_destroy(&slwt->cache_input); + dst_cache_destroy(&slwt->cache_output); } static int seg6_fill_encap_info(struct sk_buff *skb, -- cgit v1.2.3 From 9a91797e61d286805ae10a92cc48959c30800556 Mon Sep 17 00:00:00 2001 From: Weiming Shi Date: Wed, 1 Apr 2026 15:58:01 +0800 Subject: ipvs: fix NULL deref in ip_vs_add_service error path When ip_vs_bind_scheduler() succeeds in ip_vs_add_service(), the local variable sched is set to NULL. If ip_vs_start_estimator() subsequently fails, the out_err cleanup calls ip_vs_unbind_scheduler(svc, sched) with sched == NULL. ip_vs_unbind_scheduler() passes the cur_sched NULL check (because svc->scheduler was set by the successful bind) but then dereferences the NULL sched parameter at sched->done_service, causing a kernel panic at offset 0x30 from NULL. Oops: general protection fault, [..] [#1] PREEMPT SMP KASAN NOPTI KASAN: null-ptr-deref in range [0x0000000000000030-0x0000000000000037] RIP: 0010:ip_vs_unbind_scheduler (net/netfilter/ipvs/ip_vs_sched.c:69) Call Trace: ip_vs_add_service.isra.0 (net/netfilter/ipvs/ip_vs_ctl.c:1500) do_ip_vs_set_ctl (net/netfilter/ipvs/ip_vs_ctl.c:2809) nf_setsockopt (net/netfilter/nf_sockopt.c:102) [..] Fix by simply not clearing the local sched variable after a successful bind. ip_vs_unbind_scheduler() already detects whether a scheduler is installed via svc->scheduler, and keeping sched non-NULL ensures the error path passes the correct pointer to both ip_vs_unbind_scheduler() and ip_vs_scheduler_put(). While the bug is older, the problem popups in more recent kernels (6.2), when the new error path is taken after the ip_vs_start_estimator() call. Fixes: 705dd3444081 ("ipvs: use kthreads for stats estimation") Reported-by: Xiang Mei Signed-off-by: Weiming Shi Acked-by: Simon Horman Acked-by: Julian Anastasov Signed-off-by: Florian Westphal --- net/netfilter/ipvs/ip_vs_ctl.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 35642de2a0fe..2aaf50f52c8e 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -1452,7 +1452,6 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, ret = ip_vs_bind_scheduler(svc, sched); if (ret) goto out_err; - sched = NULL; } ret = ip_vs_start_estimator(ipvs, &svc->stats); -- cgit v1.2.3 From 1f3083aec8836213da441270cdb1ab612dd82cf4 Mon Sep 17 00:00:00 2001 From: Xiang Mei Date: Wed, 1 Apr 2026 14:20:57 -0700 Subject: netfilter: nfnetlink_log: initialize nfgenmsg in NLMSG_DONE terminator When batching multiple NFLOG messages (inst->qlen > 1), __nfulnl_send() appends an NLMSG_DONE terminator with sizeof(struct nfgenmsg) payload via nlmsg_put(), but never initializes the nfgenmsg bytes. The nlmsg_put() helper only zeroes alignment padding after the payload, not the payload itself, so four bytes of stale kernel heap data are leaked to userspace in the NLMSG_DONE message body. Use nfnl_msg_put() to build the NLMSG_DONE terminator, which initializes the nfgenmsg payload via nfnl_fill_hdr(), consistent with how __build_packet_message() already constructs NFULNL_MSG_PACKET headers. Fixes: 29c5d4afba51 ("[NETFILTER]: nfnetlink_log: fix sending of multipart messages") Reported-by: Weiming Shi Signed-off-by: Xiang Mei Signed-off-by: Florian Westphal --- net/netfilter/nfnetlink_log.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index f80978c06fa0..0db908518b2f 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -361,10 +361,10 @@ static void __nfulnl_send(struct nfulnl_instance *inst) { if (inst->qlen > 1) { - struct nlmsghdr *nlh = nlmsg_put(inst->skb, 0, 0, - NLMSG_DONE, - sizeof(struct nfgenmsg), - 0); + struct nlmsghdr *nlh = nfnl_msg_put(inst->skb, 0, 0, + NLMSG_DONE, 0, + AF_UNSPEC, NFNETLINK_V0, + htons(inst->group_num)); if (WARN_ONCE(!nlh, "bad nlskb size: %u, tailroom %d\n", inst->skb->len, skb_tailroom(inst->skb))) { kfree_skb(inst->skb); -- cgit v1.2.3 From ff64c5bfef12461df8450e0f50bb693b5269c720 Mon Sep 17 00:00:00 2001 From: Ren Wei Date: Fri, 3 Apr 2026 23:52:52 +0800 Subject: netfilter: xt_multiport: validate range encoding in checkentry ports_match_v1() treats any non-zero pflags entry as the start of a port range and unconditionally consumes the next ports[] element as the range end. The checkentry path currently validates protocol, flags and count, but it does not validate the range encoding itself. As a result, malformed rules can mark the last slot as a range start or place two range starts back to back, leaving ports_match_v1() to step past the last valid ports[] element while interpreting the rule. Reject malformed multiport v1 rules in checkentry by validating that each range start has a following element and that the following element is not itself marked as another range start. Fixes: a89ecb6a2ef7 ("[NETFILTER]: x_tables: unify IPv4/IPv6 multiport match") Reported-by: Yifan Wu Reported-by: Juefei Pu Co-developed-by: Yuan Tan Signed-off-by: Yuan Tan Suggested-by: Xin Liu Tested-by: Yuhang Zheng Signed-off-by: Ren Wei Signed-off-by: Florian Westphal --- net/netfilter/xt_multiport.c | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netfilter/xt_multiport.c b/net/netfilter/xt_multiport.c index 44a00f5acde8..a1691ff405d3 100644 --- a/net/netfilter/xt_multiport.c +++ b/net/netfilter/xt_multiport.c @@ -105,6 +105,28 @@ multiport_mt(const struct sk_buff *skb, struct xt_action_param *par) return ports_match_v1(multiinfo, ntohs(pptr[0]), ntohs(pptr[1])); } +static bool +multiport_valid_ranges(const struct xt_multiport_v1 *multiinfo) +{ + unsigned int i; + + for (i = 0; i < multiinfo->count; i++) { + if (!multiinfo->pflags[i]) + continue; + + if (++i >= multiinfo->count) + return false; + + if (multiinfo->pflags[i]) + return false; + + if (multiinfo->ports[i - 1] > multiinfo->ports[i]) + return false; + } + + return true; +} + static inline bool check(u_int16_t proto, u_int8_t ip_invflags, @@ -127,8 +149,10 @@ static int multiport_mt_check(const struct xt_mtchk_param *par) const struct ipt_ip *ip = par->entryinfo; const struct xt_multiport_v1 *multiinfo = par->matchinfo; - return check(ip->proto, ip->invflags, multiinfo->flags, - multiinfo->count) ? 0 : -EINVAL; + if (!check(ip->proto, ip->invflags, multiinfo->flags, multiinfo->count)) + return -EINVAL; + + return multiport_valid_ranges(multiinfo) ? 0 : -EINVAL; } static int multiport_mt6_check(const struct xt_mtchk_param *par) @@ -136,8 +160,10 @@ static int multiport_mt6_check(const struct xt_mtchk_param *par) const struct ip6t_ip6 *ip = par->entryinfo; const struct xt_multiport_v1 *multiinfo = par->matchinfo; - return check(ip->proto, ip->invflags, multiinfo->flags, - multiinfo->count) ? 0 : -EINVAL; + if (!check(ip->proto, ip->invflags, multiinfo->flags, multiinfo->count)) + return -EINVAL; + + return multiport_valid_ranges(multiinfo) ? 0 : -EINVAL; } static struct xt_match multiport_mt_reg[] __read_mostly = { -- cgit v1.2.3 From fdce0b3590f724540795b874b4c8850c90e6b0a8 Mon Sep 17 00:00:00 2001 From: Zhengchuan Liang Date: Sat, 4 Apr 2026 17:39:47 +0800 Subject: netfilter: ip6t_eui64: reject invalid MAC header for all packets `eui64_mt6()` derives a modified EUI-64 from the Ethernet source address and compares it with the low 64 bits of the IPv6 source address. The existing guard only rejects an invalid MAC header when `par->fragoff != 0`. For packets with `par->fragoff == 0`, `eui64_mt6()` can still reach `eth_hdr(skb)` even when the MAC header is not valid. Fix this by removing the `par->fragoff != 0` condition so that packets with an invalid MAC header are rejected before accessing `eth_hdr(skb)`. Fixes: 1da177e4c3f41 ("Linux-2.6.12-rc2") Reported-by: Yifan Wu Reported-by: Juefei Pu Co-developed-by: Yuan Tan Signed-off-by: Yuan Tan Suggested-by: Xin Liu Tested-by: Ren Wei Signed-off-by: Zhengchuan Liang Signed-off-by: Ren Wei Signed-off-by: Florian Westphal --- net/ipv6/netfilter/ip6t_eui64.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/netfilter/ip6t_eui64.c b/net/ipv6/netfilter/ip6t_eui64.c index d704f7ed300c..da69a27e8332 100644 --- a/net/ipv6/netfilter/ip6t_eui64.c +++ b/net/ipv6/netfilter/ip6t_eui64.c @@ -22,8 +22,7 @@ eui64_mt6(const struct sk_buff *skb, struct xt_action_param *par) unsigned char eui64[8]; if (!(skb_mac_header(skb) >= skb->head && - skb_mac_header(skb) + ETH_HLEN <= skb->data) && - par->fragoff != 0) { + skb_mac_header(skb) + ETH_HLEN <= skb->data)) { par->hotdrop = true; return false; } -- cgit v1.2.3 From f8dca15a1b190787bbd03285304b569631160eda Mon Sep 17 00:00:00 2001 From: Tuan Do Date: Fri, 3 Apr 2026 00:33:17 -0700 Subject: netfilter: nft_ct: fix use-after-free in timeout object destroy nft_ct_timeout_obj_destroy() frees the timeout object with kfree() immediately after nf_ct_untimeout(), without waiting for an RCU grace period. Concurrent packet processing on other CPUs may still hold RCU-protected references to the timeout object obtained via rcu_dereference() in nf_ct_timeout_data(). Add an rcu_head to struct nf_ct_timeout and use kfree_rcu() to defer freeing until after an RCU grace period, matching the approach already used in nfnetlink_cttimeout.c. KASAN report: BUG: KASAN: slab-use-after-free in nf_conntrack_tcp_packet+0x1381/0x29d0 Read of size 4 at addr ffff8881035fe19c by task exploit/80 Call Trace: nf_conntrack_tcp_packet+0x1381/0x29d0 nf_conntrack_in+0x612/0x8b0 nf_hook_slow+0x70/0x100 __ip_local_out+0x1b2/0x210 tcp_sendmsg_locked+0x722/0x1580 __sys_sendto+0x2d8/0x320 Allocated by task 75: nft_ct_timeout_obj_init+0xf6/0x290 nft_obj_init+0x107/0x1b0 nf_tables_newobj+0x680/0x9c0 nfnetlink_rcv_batch+0xc29/0xe00 Freed by task 26: nft_obj_destroy+0x3f/0xa0 nf_tables_trans_destroy_work+0x51c/0x5c0 process_one_work+0x2c4/0x5a0 Fixes: 7e0b2b57f01d ("netfilter: nft_ct: add ct timeout support") Cc: stable@vger.kernel.org Signed-off-by: Tuan Do Signed-off-by: Florian Westphal --- include/net/netfilter/nf_conntrack_timeout.h | 1 + net/netfilter/nft_ct.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/netfilter/nf_conntrack_timeout.h b/include/net/netfilter/nf_conntrack_timeout.h index 9fdaba911de6..3a66d4abb6d6 100644 --- a/include/net/netfilter/nf_conntrack_timeout.h +++ b/include/net/netfilter/nf_conntrack_timeout.h @@ -14,6 +14,7 @@ struct nf_ct_timeout { __u16 l3num; const struct nf_conntrack_l4proto *l4proto; + struct rcu_head rcu; char data[]; }; diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 128ff8155b5d..04c74ccf9b84 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -1020,7 +1020,7 @@ static void nft_ct_timeout_obj_destroy(const struct nft_ctx *ctx, nf_queue_nf_hook_drop(ctx->net); nf_ct_untimeout(ctx->net, timeout); nf_ct_netns_put(ctx->net, ctx->family); - kfree(priv->timeout); + kfree_rcu(priv->timeout, rcu); } static int nft_ct_timeout_obj_dump(struct sk_buff *skb, -- cgit v1.2.3 From 936206e3f6ff411581e615e930263d6f8b78df9d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 7 Apr 2026 17:00:01 +0200 Subject: netfilter: nfnetlink_queue: make hash table per queue Sharing a global hash table among all queues is tempting, but it can cause crash: BUG: KASAN: slab-use-after-free in nfqnl_recv_verdict+0x11ac/0x15e0 [nfnetlink_queue] [..] nfqnl_recv_verdict+0x11ac/0x15e0 [nfnetlink_queue] nfnetlink_rcv_msg+0x46a/0x930 kmem_cache_alloc_node_noprof+0x11e/0x450 struct nf_queue_entry is freed via kfree, but parallel cpu can still encounter such an nf_queue_entry when walking the list. Alternative fix is to free the nf_queue_entry via kfree_rcu() instead, but as we have to alloc/free for each skb this will cause more mem pressure. Cc: Scott Mitchell Fixes: e19079adcd26 ("netfilter: nfnetlink_queue: optimize verdict lookup with hash table") Signed-off-by: Florian Westphal --- include/net/netfilter/nf_queue.h | 1 - net/netfilter/nfnetlink_queue.c | 139 ++++++++++++++------------------------- 2 files changed, 49 insertions(+), 91 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h index 45eb26b2e95b..d17035d14d96 100644 --- a/include/net/netfilter/nf_queue.h +++ b/include/net/netfilter/nf_queue.h @@ -23,7 +23,6 @@ struct nf_queue_entry { struct nf_hook_state state; bool nf_ct_is_unconfirmed; u16 size; /* sizeof(entry) + saved route keys */ - u16 queue_num; /* extra space to store route keys */ }; diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 47f7f62906e2..8e02f84784da 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -49,8 +49,8 @@ #endif #define NFQNL_QMAX_DEFAULT 1024 -#define NFQNL_HASH_MIN 1024 -#define NFQNL_HASH_MAX 1048576 +#define NFQNL_HASH_MIN 8 +#define NFQNL_HASH_MAX 32768 /* We're using struct nlattr which has 16bit nla_len. Note that nla_len * includes the header length. Thus, the maximum packet length that we @@ -60,29 +60,10 @@ */ #define NFQNL_MAX_COPY_RANGE (0xffff - NLA_HDRLEN) -/* Composite key for packet lookup: (net, queue_num, packet_id) */ -struct nfqnl_packet_key { - possible_net_t net; - u32 packet_id; - u16 queue_num; -} __aligned(sizeof(u32)); /* jhash2 requires 32-bit alignment */ - -/* Global rhashtable - one for entire system, all netns */ -static struct rhashtable nfqnl_packet_map __read_mostly; - -/* Helper to initialize composite key */ -static inline void nfqnl_init_key(struct nfqnl_packet_key *key, - struct net *net, u32 packet_id, u16 queue_num) -{ - memset(key, 0, sizeof(*key)); - write_pnet(&key->net, net); - key->packet_id = packet_id; - key->queue_num = queue_num; -} - struct nfqnl_instance { struct hlist_node hlist; /* global list of queues */ - struct rcu_head rcu; + struct rhashtable nfqnl_packet_map; + struct rcu_work rwork; u32 peer_portid; unsigned int queue_maxlen; @@ -106,6 +87,7 @@ struct nfqnl_instance { typedef int (*nfqnl_cmpfn)(struct nf_queue_entry *, unsigned long); +static struct workqueue_struct *nfq_cleanup_wq __read_mostly; static unsigned int nfnl_queue_net_id __read_mostly; #define INSTANCE_BUCKETS 16 @@ -124,34 +106,10 @@ static inline u_int8_t instance_hashfn(u_int16_t queue_num) return ((queue_num >> 8) ^ queue_num) % INSTANCE_BUCKETS; } -/* Extract composite key from nf_queue_entry for hashing */ -static u32 nfqnl_packet_obj_hashfn(const void *data, u32 len, u32 seed) -{ - const struct nf_queue_entry *entry = data; - struct nfqnl_packet_key key; - - nfqnl_init_key(&key, entry->state.net, entry->id, entry->queue_num); - - return jhash2((u32 *)&key, sizeof(key) / sizeof(u32), seed); -} - -/* Compare stack-allocated key against entry */ -static int nfqnl_packet_obj_cmpfn(struct rhashtable_compare_arg *arg, - const void *obj) -{ - const struct nfqnl_packet_key *key = arg->key; - const struct nf_queue_entry *entry = obj; - - return !net_eq(entry->state.net, read_pnet(&key->net)) || - entry->queue_num != key->queue_num || - entry->id != key->packet_id; -} - static const struct rhashtable_params nfqnl_rhashtable_params = { .head_offset = offsetof(struct nf_queue_entry, hash_node), - .key_len = sizeof(struct nfqnl_packet_key), - .obj_hashfn = nfqnl_packet_obj_hashfn, - .obj_cmpfn = nfqnl_packet_obj_cmpfn, + .key_offset = offsetof(struct nf_queue_entry, id), + .key_len = sizeof(u32), .automatic_shrinking = true, .min_size = NFQNL_HASH_MIN, .max_size = NFQNL_HASH_MAX, @@ -190,6 +148,10 @@ instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid) spin_lock_init(&inst->lock); INIT_LIST_HEAD(&inst->queue_list); + err = rhashtable_init(&inst->nfqnl_packet_map, &nfqnl_rhashtable_params); + if (err < 0) + goto out_free; + spin_lock(&q->instances_lock); if (instance_lookup(q, queue_num)) { err = -EEXIST; @@ -210,6 +172,8 @@ instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid) out_unlock: spin_unlock(&q->instances_lock); + rhashtable_destroy(&inst->nfqnl_packet_map); +out_free: kfree(inst); return ERR_PTR(err); } @@ -217,15 +181,18 @@ out_unlock: static void nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data); -static void -instance_destroy_rcu(struct rcu_head *head) +static void instance_destroy_work(struct work_struct *work) { - struct nfqnl_instance *inst = container_of(head, struct nfqnl_instance, - rcu); + struct nfqnl_instance *inst; + inst = container_of(to_rcu_work(work), struct nfqnl_instance, + rwork); rcu_read_lock(); nfqnl_flush(inst, NULL, 0); rcu_read_unlock(); + + rhashtable_destroy(&inst->nfqnl_packet_map); + kfree(inst); module_put(THIS_MODULE); } @@ -234,7 +201,9 @@ static void __instance_destroy(struct nfqnl_instance *inst) { hlist_del_rcu(&inst->hlist); - call_rcu(&inst->rcu, instance_destroy_rcu); + + INIT_RCU_WORK(&inst->rwork, instance_destroy_work); + queue_rcu_work(nfq_cleanup_wq, &inst->rwork); } static void @@ -250,9 +219,7 @@ __enqueue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry) { int err; - entry->queue_num = queue->queue_num; - - err = rhashtable_insert_fast(&nfqnl_packet_map, &entry->hash_node, + err = rhashtable_insert_fast(&queue->nfqnl_packet_map, &entry->hash_node, nfqnl_rhashtable_params); if (unlikely(err)) return err; @@ -266,23 +233,19 @@ __enqueue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry) static void __dequeue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry) { - rhashtable_remove_fast(&nfqnl_packet_map, &entry->hash_node, + rhashtable_remove_fast(&queue->nfqnl_packet_map, &entry->hash_node, nfqnl_rhashtable_params); list_del(&entry->list); queue->queue_total--; } static struct nf_queue_entry * -find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id, - struct net *net) +find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id) { - struct nfqnl_packet_key key; struct nf_queue_entry *entry; - nfqnl_init_key(&key, net, id, queue->queue_num); - spin_lock_bh(&queue->lock); - entry = rhashtable_lookup_fast(&nfqnl_packet_map, &key, + entry = rhashtable_lookup_fast(&queue->nfqnl_packet_map, &id, nfqnl_rhashtable_params); if (entry) @@ -1531,7 +1494,7 @@ static int nfqnl_recv_verdict(struct sk_buff *skb, const struct nfnl_info *info, verdict = ntohl(vhdr->verdict); - entry = find_dequeue_entry(queue, ntohl(vhdr->id), info->net); + entry = find_dequeue_entry(queue, ntohl(vhdr->id)); if (entry == NULL) return -ENOENT; @@ -1880,40 +1843,38 @@ static int __init nfnetlink_queue_init(void) { int status; - status = rhashtable_init(&nfqnl_packet_map, &nfqnl_rhashtable_params); - if (status < 0) - return status; + nfq_cleanup_wq = alloc_ordered_workqueue("nfq_workqueue", 0); + if (!nfq_cleanup_wq) + return -ENOMEM; status = register_pernet_subsys(&nfnl_queue_net_ops); - if (status < 0) { - pr_err("failed to register pernet ops\n"); - goto cleanup_rhashtable; - } + if (status < 0) + goto cleanup_pernet_subsys; - netlink_register_notifier(&nfqnl_rtnl_notifier); - status = nfnetlink_subsys_register(&nfqnl_subsys); - if (status < 0) { - pr_err("failed to create netlink socket\n"); - goto cleanup_netlink_notifier; - } + status = netlink_register_notifier(&nfqnl_rtnl_notifier); + if (status < 0) + goto cleanup_rtnl_notifier; status = register_netdevice_notifier(&nfqnl_dev_notifier); - if (status < 0) { - pr_err("failed to register netdevice notifier\n"); - goto cleanup_netlink_subsys; - } + if (status < 0) + goto cleanup_dev_notifier; + + status = nfnetlink_subsys_register(&nfqnl_subsys); + if (status < 0) + goto cleanup_nfqnl_subsys; nf_register_queue_handler(&nfqh); return status; -cleanup_netlink_subsys: - nfnetlink_subsys_unregister(&nfqnl_subsys); -cleanup_netlink_notifier: +cleanup_nfqnl_subsys: + unregister_netdevice_notifier(&nfqnl_dev_notifier); +cleanup_dev_notifier: netlink_unregister_notifier(&nfqnl_rtnl_notifier); +cleanup_rtnl_notifier: unregister_pernet_subsys(&nfnl_queue_net_ops); -cleanup_rhashtable: - rhashtable_destroy(&nfqnl_packet_map); +cleanup_pernet_subsys: + destroy_workqueue(nfq_cleanup_wq); return status; } @@ -1924,9 +1885,7 @@ static void __exit nfnetlink_queue_fini(void) nfnetlink_subsys_unregister(&nfqnl_subsys); netlink_unregister_notifier(&nfqnl_rtnl_notifier); unregister_pernet_subsys(&nfnl_queue_net_ops); - - rhashtable_destroy(&nfqnl_packet_map); - + destroy_workqueue(nfq_cleanup_wq); rcu_barrier(); /* Wait for completion of call_rcu()'s */ } -- cgit v1.2.3 From bdbfead6d38979475df0c2f4bad2b19394fe9bdc Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 8 Apr 2026 13:12:29 +0100 Subject: rxrpc: Fix key quota calculation for multitoken keys In the rxrpc key preparsing, every token extracted sets the proposed quota value, but for multitoken keys, this will overwrite the previous proposed quota, losing it. Fix this by adding to the proposed quota instead. Fixes: 8a7a3eb4ddbe ("KEYS: RxRPC: Use key preparsing") Closes: https://sashiko.dev/#/patchset/20260319150150.4189381-1-dhowells%40redhat.com Signed-off-by: David Howells cc: Marc Dionne cc: Jeffrey Altman cc: Simon Horman cc: linux-afs@lists.infradead.org cc: stable@kernel.org Link: https://patch.msgid.link/20260408121252.2249051-2-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- net/rxrpc/key.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/rxrpc/key.c b/net/rxrpc/key.c index 85078114b2dd..af403f0ccab5 100644 --- a/net/rxrpc/key.c +++ b/net/rxrpc/key.c @@ -72,7 +72,7 @@ static int rxrpc_preparse_xdr_rxkad(struct key_preparsed_payload *prep, return -EKEYREJECTED; plen = sizeof(*token) + sizeof(*token->kad) + tktlen; - prep->quotalen = datalen + plen; + prep->quotalen += datalen + plen; plen -= sizeof(*token); token = kzalloc_obj(*token); @@ -199,7 +199,7 @@ static int rxrpc_preparse_xdr_yfs_rxgk(struct key_preparsed_payload *prep, } plen = sizeof(*token) + sizeof(*token->rxgk) + tktlen + keylen; - prep->quotalen = datalen + plen; + prep->quotalen += datalen + plen; plen -= sizeof(*token); token = kzalloc_obj(*token); @@ -460,6 +460,7 @@ static int rxrpc_preparse(struct key_preparsed_payload *prep) memcpy(&kver, prep->data, sizeof(kver)); prep->data += sizeof(kver); prep->datalen -= sizeof(kver); + prep->quotalen = 0; _debug("KEY I/F VERSION: %u", kver); @@ -497,7 +498,7 @@ static int rxrpc_preparse(struct key_preparsed_payload *prep) goto error; plen = sizeof(*token->kad) + v1->ticket_length; - prep->quotalen = plen + sizeof(*token); + prep->quotalen += plen + sizeof(*token); ret = -ENOMEM; token = kzalloc_obj(*token); -- cgit v1.2.3 From b555912b9b21075e8298015f888ffe3ff60b1a97 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 8 Apr 2026 13:12:30 +0100 Subject: rxrpc: Fix key parsing memleak In rxrpc_preparse_xdr_yfs_rxgk(), the memory attached to token->rxgk can be leaked in a few error paths after it's allocated. Fix this by freeing it in the "reject_token:" case. Fixes: 0ca100ff4df6 ("rxrpc: Add YFS RxGK (GSSAPI) security class") Closes: https://sashiko.dev/#/patchset/20260319150150.4189381-1-dhowells%40redhat.com Signed-off-by: David Howells cc: Marc Dionne cc: Jeffrey Altman cc: Simon Horman cc: linux-afs@lists.infradead.org cc: stable@kernel.org Link: https://patch.msgid.link/20260408121252.2249051-3-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- net/rxrpc/key.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/rxrpc/key.c b/net/rxrpc/key.c index af403f0ccab5..26d4336a4a02 100644 --- a/net/rxrpc/key.c +++ b/net/rxrpc/key.c @@ -274,6 +274,7 @@ nomem_token: nomem: return -ENOMEM; reject_token: + kfree(token->rxgk); kfree(token); reject: return -EKEYREJECTED; -- cgit v1.2.3 From 6a59d84b4fc2f27f7b40e348506cc686712e260b Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 8 Apr 2026 13:12:31 +0100 Subject: rxrpc: Fix anonymous key handling In rxrpc_new_client_call_for_sendmsg(), a key with no payload is meant to be substituted for a NULL key pointer, but the variable this is done with is subsequently not used. Fix this by using "key" rather than "rx->key" when filling in the connection parameters. Note that this only affects direct use of AF_RXRPC; the kAFS filesystem doesn't use sendmsg() directly and so bypasses the issue. Further, AF_RXRPC passes a NULL key in if no key is set, so using an anonymous key in that manner works. Since this hasn't been noticed to this point, it might be better just to remove the "key" variable and the code that sets it - and, arguably, rxrpc_init_client_call_security() would be a better place to handle it. Fixes: 19ffa01c9c45 ("rxrpc: Use structs to hold connection params and protocol info") Closes: https://sashiko.dev/#/patchset/20260319150150.4189381-1-dhowells%40redhat.com Signed-off-by: David Howells cc: Marc Dionne cc: Jeffrey Altman cc: Simon Horman cc: linux-afs@lists.infradead.org cc: stable@kernel.org Link: https://patch.msgid.link/20260408121252.2249051-4-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- net/rxrpc/sendmsg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 04f9c5f2dc24..c35de4fd75e3 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -637,7 +637,7 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, memset(&cp, 0, sizeof(cp)); cp.local = rx->local; cp.peer = peer; - cp.key = rx->key; + cp.key = key; cp.security_level = rx->min_sec_level; cp.exclusive = rx->exclusive | p->exclusive; cp.upgrade = p->upgrade; -- cgit v1.2.3 From 146d4ab94cf129ee06cd467cb5c71368a6b5bad6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 8 Apr 2026 13:12:32 +0100 Subject: rxrpc: Fix call removal to use RCU safe deletion Fix rxrpc call removal from the rxnet->calls list to use list_del_rcu() rather than list_del_init() to prevent stuffing up reading /proc/net/rxrpc/calls from potentially getting into an infinite loop. This, however, means that list_empty() no longer works on an entry that's been deleted from the list, making it harder to detect prior deletion. Fix this by: Firstly, make rxrpc_destroy_all_calls() only dump the first ten calls that are unexpectedly still on the list. Limiting the number of steps means there's no need to call cond_resched() or to remove calls from the list here, thereby eliminating the need for rxrpc_put_call() to check for that. rxrpc_put_call() can then be fixed to unconditionally delete the call from the list as it is the only place that the deletion occurs. Fixes: 2baec2c3f854 ("rxrpc: Support network namespacing") Closes: https://sashiko.dev/#/patchset/20260319150150.4189381-1-dhowells%40redhat.com Signed-off-by: David Howells cc: Marc Dionne cc: Jeffrey Altman cc: Linus Torvalds cc: Simon Horman cc: linux-afs@lists.infradead.org cc: stable@kernel.org Link: https://patch.msgid.link/20260408121252.2249051-5-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 2 +- net/rxrpc/call_object.c | 24 +++++++++--------------- 2 files changed, 10 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 869f97c9bf73..a826cd80007b 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -347,7 +347,7 @@ EM(rxrpc_call_see_release, "SEE release ") \ EM(rxrpc_call_see_userid_exists, "SEE u-exists") \ EM(rxrpc_call_see_waiting_call, "SEE q-conn ") \ - E_(rxrpc_call_see_zap, "SEE zap ") + E_(rxrpc_call_see_still_live, "SEE !still-l") #define rxrpc_txqueue_traces \ EM(rxrpc_txqueue_await_reply, "AWR") \ diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 918f41d97a2f..59329cfe1532 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -654,11 +654,9 @@ void rxrpc_put_call(struct rxrpc_call *call, enum rxrpc_call_trace why) if (dead) { ASSERTCMP(__rxrpc_call_state(call), ==, RXRPC_CALL_COMPLETE); - if (!list_empty(&call->link)) { - spin_lock(&rxnet->call_lock); - list_del_init(&call->link); - spin_unlock(&rxnet->call_lock); - } + spin_lock(&rxnet->call_lock); + list_del_rcu(&call->link); + spin_unlock(&rxnet->call_lock); rxrpc_cleanup_call(call); } @@ -730,24 +728,20 @@ void rxrpc_destroy_all_calls(struct rxrpc_net *rxnet) _enter(""); if (!list_empty(&rxnet->calls)) { - spin_lock(&rxnet->call_lock); + int shown = 0; - while (!list_empty(&rxnet->calls)) { - call = list_entry(rxnet->calls.next, - struct rxrpc_call, link); - _debug("Zapping call %p", call); + spin_lock(&rxnet->call_lock); - rxrpc_see_call(call, rxrpc_call_see_zap); - list_del_init(&call->link); + list_for_each_entry(call, &rxnet->calls, link) { + rxrpc_see_call(call, rxrpc_call_see_still_live); pr_err("Call %p still in use (%d,%s,%lx,%lx)!\n", call, refcount_read(&call->ref), rxrpc_call_states[__rxrpc_call_state(call)], call->flags, call->events); - spin_unlock(&rxnet->call_lock); - cond_resched(); - spin_lock(&rxnet->call_lock); + if (++shown >= 10) + break; } spin_unlock(&rxnet->call_lock); -- cgit v1.2.3 From d179a868dd755b0cfcf7582e00943d702b9943b8 Mon Sep 17 00:00:00 2001 From: Oleh Konko Date: Wed, 8 Apr 2026 13:12:33 +0100 Subject: rxrpc: Fix RxGK token loading to check bounds rxrpc_preparse_xdr_yfs_rxgk() reads the raw key length and ticket length from the XDR token as u32 values and passes each through round_up(x, 4) before using the rounded value for validation and allocation. When the raw length is >= 0xfffffffd, round_up() wraps to 0, so the bounds check and kzalloc both use 0 while the subsequent memcpy still copies the original ~4 GiB value, producing a heap buffer overflow reachable from an unprivileged add_key() call. Fix this by: (1) Rejecting raw key lengths above AFSTOKEN_GK_KEY_MAX and raw ticket lengths above AFSTOKEN_GK_TOKEN_MAX before rounding, consistent with the caps that the RxKAD path already enforces via AFSTOKEN_RK_TIX_MAX. (2) Sizing the flexible-array allocation from the validated raw key length via struct_size_t() instead of the rounded value. (3) Caching the raw lengths so that the later field assignments and memcpy calls do not re-read from the token, eliminating a class of TOCTOU re-parse. The control path (valid token with lengths within bounds) is unaffected. Fixes: 0ca100ff4df6 ("rxrpc: Add YFS RxGK (GSSAPI) security class") Signed-off-by: Oleh Konko Signed-off-by: David Howells Reviewed-by: Jeffrey Altman cc: Marc Dionne cc: Simon Horman cc: linux-afs@lists.infradead.org cc: stable@kernel.org Link: https://patch.msgid.link/20260408121252.2249051-6-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- net/rxrpc/key.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/rxrpc/key.c b/net/rxrpc/key.c index 26d4336a4a02..77237a82be3b 100644 --- a/net/rxrpc/key.c +++ b/net/rxrpc/key.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -171,7 +172,7 @@ static int rxrpc_preparse_xdr_yfs_rxgk(struct key_preparsed_payload *prep, size_t plen; const __be32 *ticket, *key; s64 tmp; - u32 tktlen, keylen; + size_t raw_keylen, raw_tktlen, keylen, tktlen; _enter(",{%x,%x,%x,%x},%x", ntohl(xdr[0]), ntohl(xdr[1]), ntohl(xdr[2]), ntohl(xdr[3]), @@ -181,18 +182,22 @@ static int rxrpc_preparse_xdr_yfs_rxgk(struct key_preparsed_payload *prep, goto reject; key = xdr + (6 * 2 + 1); - keylen = ntohl(key[-1]); - _debug("keylen: %x", keylen); - keylen = round_up(keylen, 4); + raw_keylen = ntohl(key[-1]); + _debug("keylen: %zx", raw_keylen); + if (raw_keylen > AFSTOKEN_GK_KEY_MAX) + goto reject; + keylen = round_up(raw_keylen, 4); if ((6 * 2 + 2) * 4 + keylen > toklen) goto reject; ticket = xdr + (6 * 2 + 1 + (keylen / 4) + 1); - tktlen = ntohl(ticket[-1]); - _debug("tktlen: %x", tktlen); - tktlen = round_up(tktlen, 4); + raw_tktlen = ntohl(ticket[-1]); + _debug("tktlen: %zx", raw_tktlen); + if (raw_tktlen > AFSTOKEN_GK_TOKEN_MAX) + goto reject; + tktlen = round_up(raw_tktlen, 4); if ((6 * 2 + 2) * 4 + keylen + tktlen != toklen) { - kleave(" = -EKEYREJECTED [%x!=%x, %x,%x]", + kleave(" = -EKEYREJECTED [%zx!=%x, %zx,%zx]", (6 * 2 + 2) * 4 + keylen + tktlen, toklen, keylen, tktlen); goto reject; @@ -206,7 +211,7 @@ static int rxrpc_preparse_xdr_yfs_rxgk(struct key_preparsed_payload *prep, if (!token) goto nomem; - token->rxgk = kzalloc(sizeof(*token->rxgk) + keylen, GFP_KERNEL); + token->rxgk = kzalloc(struct_size_t(struct rxgk_key, _key, raw_keylen), GFP_KERNEL); if (!token->rxgk) goto nomem_token; @@ -221,9 +226,9 @@ static int rxrpc_preparse_xdr_yfs_rxgk(struct key_preparsed_payload *prep, token->rxgk->enctype = tmp = xdr_dec64(xdr + 5 * 2); if (tmp < 0 || tmp > UINT_MAX) goto reject_token; - token->rxgk->key.len = ntohl(key[-1]); + token->rxgk->key.len = raw_keylen; token->rxgk->key.data = token->rxgk->_key; - token->rxgk->ticket.len = ntohl(ticket[-1]); + token->rxgk->ticket.len = raw_tktlen; if (token->rxgk->endtime != 0) { expiry = rxrpc_s64_to_time64(token->rxgk->endtime); @@ -236,8 +241,7 @@ static int rxrpc_preparse_xdr_yfs_rxgk(struct key_preparsed_payload *prep, memcpy(token->rxgk->key.data, key, token->rxgk->key.len); /* Pad the ticket so that we can use it directly in XDR */ - token->rxgk->ticket.data = kzalloc(round_up(token->rxgk->ticket.len, 4), - GFP_KERNEL); + token->rxgk->ticket.data = kzalloc(tktlen, GFP_KERNEL); if (!token->rxgk->ticket.data) goto nomem_yrxgk; memcpy(token->rxgk->ticket.data, ticket, token->rxgk->ticket.len); -- cgit v1.2.3 From b33f5741bb187db8ff32e8f5b96def77cc94dfca Mon Sep 17 00:00:00 2001 From: Alok Tiwari Date: Wed, 8 Apr 2026 13:12:34 +0100 Subject: rxrpc: Fix use of wrong skb when comparing queued RESP challenge serial In rxrpc_post_response(), the code should be comparing the challenge serial number from the cached response before deciding to switch to a newer response, but looks at the newer packet private data instead, rendering the comparison always false. Fix this by switching to look at the older packet. Fix further[1] to substitute the new packet in place of the old one if newer and also to release whichever we don't use. Fixes: 5800b1cf3fd8 ("rxrpc: Allow CHALLENGEs to the passed to the app for a RESPONSE") Signed-off-by: Alok Tiwari Signed-off-by: David Howells Reviewed-by: Jeffrey Altman cc: Marc Dionne cc: Simon Horman cc: linux-afs@lists.infradead.org cc: stable@kernel.org Link: https://sashiko.dev/#/patchset/20260319150150.4189381-1-dhowells%40redhat.com [1] Link: https://patch.msgid.link/20260408121252.2249051-7-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 1 + net/rxrpc/conn_event.c | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index a826cd80007b..f7f559204b87 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -185,6 +185,7 @@ EM(rxrpc_skb_put_input, "PUT input ") \ EM(rxrpc_skb_put_jumbo_subpacket, "PUT jumbo-sub") \ EM(rxrpc_skb_put_oob, "PUT oob ") \ + EM(rxrpc_skb_put_old_response, "PUT old-resp ") \ EM(rxrpc_skb_put_purge, "PUT purge ") \ EM(rxrpc_skb_put_purge_oob, "PUT purge-oob") \ EM(rxrpc_skb_put_response, "PUT response ") \ diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 98ad9b51ca2c..c50cbfc5a313 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -557,11 +557,11 @@ void rxrpc_post_response(struct rxrpc_connection *conn, struct sk_buff *skb) spin_lock_irq(&local->lock); old = conn->tx_response; if (old) { - struct rxrpc_skb_priv *osp = rxrpc_skb(skb); + struct rxrpc_skb_priv *osp = rxrpc_skb(old); /* Always go with the response to the most recent challenge. */ if (after(sp->resp.challenge_serial, osp->resp.challenge_serial)) - conn->tx_response = old; + conn->tx_response = skb; else old = skb; } else { @@ -569,4 +569,5 @@ void rxrpc_post_response(struct rxrpc_connection *conn, struct sk_buff *skb) } spin_unlock_irq(&local->lock); rxrpc_poke_conn(conn, rxrpc_conn_get_poke_response); + rxrpc_free_skb(old, rxrpc_skb_put_old_response); } -- cgit v1.2.3 From 65b3ffe0972ed023acc3981a0f7e1ae5d0208bd3 Mon Sep 17 00:00:00 2001 From: Alok Tiwari Date: Wed, 8 Apr 2026 13:12:35 +0100 Subject: rxrpc: Fix rack timer warning to report unexpected mode rxrpc_rack_timer_expired() clears call->rack_timer_mode to OFF before the switch. The default case warning therefore always prints OFF and doesn't identify the unexpected timer mode. Log the saved mode value instead so the warning reports the actual unexpected rack timer mode. Fixes: 7c482665931b ("rxrpc: Implement RACK/TLP to deal with transmission stalls [RFC8985]") Signed-off-by: Alok Tiwari Signed-off-by: David Howells Reviewed-by: Simon Horman Reviewed-by: Jeffrey Altman cc: Marc Dionne cc: linux-afs@lists.infradead.org cc: stable@kernel.org Link: https://patch.msgid.link/20260408121252.2249051-8-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- net/rxrpc/input_rack.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/input_rack.c b/net/rxrpc/input_rack.c index 13c371261e0a..9eb109ffba56 100644 --- a/net/rxrpc/input_rack.c +++ b/net/rxrpc/input_rack.c @@ -413,6 +413,6 @@ void rxrpc_rack_timer_expired(struct rxrpc_call *call, ktime_t overran_by) break; //case RXRPC_CALL_RACKTIMER_ZEROWIN: default: - pr_warn("Unexpected rack timer %u", call->rack_timer_mode); + pr_warn("Unexpected rack timer %u", mode); } } -- cgit v1.2.3 From d666540d217e8d420544ebdfbadeedd623562733 Mon Sep 17 00:00:00 2001 From: Anderson Nascimento Date: Wed, 8 Apr 2026 13:12:36 +0100 Subject: rxrpc: Fix key reference count leak from call->key When creating a client call in rxrpc_alloc_client_call(), the code obtains a reference to the key. This is never cleaned up and gets leaked when the call is destroyed. Fix this by freeing call->key in rxrpc_destroy_call(). Before the patch, it shows the key reference counter elevated: $ cat /proc/keys | grep afs@54321 1bffe9cd I--Q--i 8053480 4169w 3b010000 1000 1000 rxrpc afs@54321: ka $ After the patch, the invalidated key is removed when the code exits: $ cat /proc/keys | grep afs@54321 $ Fixes: f3441d4125fc ("rxrpc: Copy client call parameters into rxrpc_call earlier") Signed-off-by: Anderson Nascimento Co-developed-by: David Howells Signed-off-by: David Howells Reviewed-by: Jeffrey Altman cc: Marc Dionne cc: Simon Horman cc: linux-afs@lists.infradead.org cc: stable@kernel.org Link: https://patch.msgid.link/20260408121252.2249051-9-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- net/rxrpc/call_object.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 59329cfe1532..f035f486c139 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -692,6 +692,7 @@ static void rxrpc_destroy_call(struct work_struct *work) rxrpc_put_bundle(call->bundle, rxrpc_bundle_put_call); rxrpc_put_peer(call->peer, rxrpc_peer_put_call); rxrpc_put_local(call->local, rxrpc_local_put_call); + key_put(call->key); call_rcu(&call->rcu, rxrpc_rcu_free_call); } -- cgit v1.2.3 From 0cd3e3f3f2ec1a45aa559e2c0f3d57fac5eb3c25 Mon Sep 17 00:00:00 2001 From: Marc Dionne Date: Wed, 8 Apr 2026 13:12:37 +0100 Subject: rxrpc: Fix to request an ack if window is limited Peers may only send immediate acks for every 2 UDP packets received. When sending a jumbogram, it is important to check that there is sufficient window space to send another same sized jumbogram following the current one, and request an ack if there isn't. Failure to do so may cause the call to stall waiting for an ack until the resend timer fires. Where jumbograms are in use this causes a very significant drop in performance. Fixes: fe24a5494390 ("rxrpc: Send jumbo DATA packets") Signed-off-by: Marc Dionne Signed-off-by: David Howells cc: Jeffrey Altman cc: Simon Horman cc: linux-afs@lists.infradead.org cc: stable@kernel.org Link: https://patch.msgid.link/20260408121252.2249051-10-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 1 + net/rxrpc/ar-internal.h | 2 +- net/rxrpc/output.c | 2 ++ net/rxrpc/proc.c | 5 +++-- 4 files changed, 7 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index f7f559204b87..578b8038b211 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -521,6 +521,7 @@ #define rxrpc_req_ack_traces \ EM(rxrpc_reqack_ack_lost, "ACK-LOST ") \ EM(rxrpc_reqack_app_stall, "APP-STALL ") \ + EM(rxrpc_reqack_jumbo_win, "JUMBO-WIN ") \ EM(rxrpc_reqack_more_rtt, "MORE-RTT ") \ EM(rxrpc_reqack_no_srv_last, "NO-SRVLAST") \ EM(rxrpc_reqack_old_rtt, "OLD-RTT ") \ diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 36d6ca0d1089..96ecb83c9071 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -117,7 +117,7 @@ struct rxrpc_net { atomic_t stat_tx_jumbo[10]; atomic_t stat_rx_jumbo[10]; - atomic_t stat_why_req_ack[8]; + atomic_t stat_why_req_ack[9]; atomic_t stat_io_loop; }; diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index d70db367e358..870e59bf06af 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -479,6 +479,8 @@ static size_t rxrpc_prepare_data_subpacket(struct rxrpc_call *call, why = rxrpc_reqack_old_rtt; else if (!last && !after(READ_ONCE(call->send_top), txb->seq)) why = rxrpc_reqack_app_stall; + else if (call->tx_winsize <= (2 * req->n) || call->cong_cwnd <= (2 * req->n)) + why = rxrpc_reqack_jumbo_win; else goto dont_set_request_ack; diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index 59292f7f9205..7755fca5beb8 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -518,11 +518,12 @@ int rxrpc_stats_show(struct seq_file *seq, void *v) atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_IDLE]), atomic_read(&rxnet->stat_rx_acks[0])); seq_printf(seq, - "Why-Req-A: acklost=%u mrtt=%u ortt=%u stall=%u\n", + "Why-Req-A: acklost=%u mrtt=%u ortt=%u stall=%u jwin=%u\n", atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_ack_lost]), atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_more_rtt]), atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_old_rtt]), - atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_app_stall])); + atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_app_stall]), + atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_jumbo_win])); seq_printf(seq, "Why-Req-A: nolast=%u retx=%u slows=%u smtxw=%u\n", atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_no_srv_last]), -- cgit v1.2.3 From 6331f1b24a3e85465f6454e003a3e6c22005a5c5 Mon Sep 17 00:00:00 2001 From: Douya Le Date: Wed, 8 Apr 2026 13:12:38 +0100 Subject: rxrpc: Only put the call ref if one was acquired rxrpc_input_packet_on_conn() can process a to-client packet after the current client call on the channel has already been torn down. In that case chan->call is NULL, rxrpc_try_get_call() returns NULL and there is no reference to drop. The client-side implicit-end error path does not account for that and unconditionally calls rxrpc_put_call(). This turns a protocol error path into a kernel crash instead of rejecting the packet. Only drop the call reference if one was actually acquired. Keep the existing protocol error handling unchanged. Fixes: 5e6ef4f1017c ("rxrpc: Make the I/O thread take over the call and local processor work") Reported-by: Yifan Wu Reported-by: Juefei Pu Signed-off-by: Douya Le Co-developed-by: Yuan Tan Signed-off-by: Yuan Tan Suggested-by: Xin Liu Signed-off-by: Ao Zhou Signed-off-by: David Howells cc: Marc Dionne cc: Simon Horman cc: linux-afs@lists.infradead.org cc: stable@kernel.org Link: https://patch.msgid.link/20260408121252.2249051-11-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- net/rxrpc/io_thread.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/io_thread.c b/net/rxrpc/io_thread.c index e939ecf417c4..697956931925 100644 --- a/net/rxrpc/io_thread.c +++ b/net/rxrpc/io_thread.c @@ -419,7 +419,8 @@ static int rxrpc_input_packet_on_conn(struct rxrpc_connection *conn, if (sp->hdr.callNumber > chan->call_id) { if (rxrpc_to_client(sp)) { - rxrpc_put_call(call, rxrpc_call_put_input); + if (call) + rxrpc_put_call(call, rxrpc_call_put_input); return rxrpc_protocol_error(skb, rxrpc_eproto_unexpected_implicit_end); } -- cgit v1.2.3 From fe4447cd95623b1cfacc15f280aab73a6d7340b2 Mon Sep 17 00:00:00 2001 From: Yuqi Xu Date: Wed, 8 Apr 2026 13:12:39 +0100 Subject: rxrpc: reject undecryptable rxkad response tickets rxkad_decrypt_ticket() decrypts the RXKAD response ticket and then parses the buffer as plaintext without checking whether crypto_skcipher_decrypt() succeeded. A malformed RESPONSE can therefore use a non-block-aligned ticket length, make the decrypt operation fail, and still drive the ticket parser with attacker-controlled bytes. Check the decrypt result and abort the connection with RXKADBADTICKET when ticket decryption fails. Fixes: 17926a79320a ("[AF_RXRPC]: Provide secure RxRPC sockets for use by userspace and kernel both") Reported-by: Yifan Wu Reported-by: Juefei Pu Co-developed-by: Yuan Tan Signed-off-by: Yuan Tan Suggested-by: Xin Liu Tested-by: Ren Wei Signed-off-by: Yuqi Xu Signed-off-by: Ren Wei Signed-off-by: David Howells cc: Marc Dionne cc: Simon Horman cc: linux-afs@lists.infradead.org cc: stable@kernel.org Link: https://patch.msgid.link/20260408121252.2249051-12-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- net/rxrpc/rxkad.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index e923d6829008..0f79d694cb08 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -958,6 +958,7 @@ static int rxkad_decrypt_ticket(struct rxrpc_connection *conn, struct in_addr addr; unsigned int life; time64_t issue, now; + int ret; bool little_endian; u8 *p, *q, *name, *end; @@ -977,8 +978,11 @@ static int rxkad_decrypt_ticket(struct rxrpc_connection *conn, sg_init_one(&sg[0], ticket, ticket_len); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, sg, sg, ticket_len, iv.x); - crypto_skcipher_decrypt(req); + ret = crypto_skcipher_decrypt(req); skcipher_request_free(req); + if (ret < 0) + return rxrpc_abort_conn(conn, skb, RXKADBADTICKET, -EPROTO, + rxkad_abort_resp_tkt_short); p = ticket; end = p + ticket_len; -- cgit v1.2.3 From 3e3138007887504ee9206d0bfb5acb062c600025 Mon Sep 17 00:00:00 2001 From: Keenan Dong Date: Wed, 8 Apr 2026 13:12:40 +0100 Subject: rxrpc: fix RESPONSE authenticator parser OOB read rxgk_verify_authenticator() copies auth_len bytes into a temporary buffer and then passes p + auth_len as the parser limit to rxgk_do_verify_authenticator(). Since p is a __be32 *, that inflates the parser end pointer by a factor of four and lets malformed RESPONSE authenticators read past the kmalloc() buffer. Decoded from the original latest-net reproduction logs with scripts/decode_stacktrace.sh: BUG: KASAN: slab-out-of-bounds in rxgk_verify_response() Call Trace: dump_stack_lvl() [lib/dump_stack.c:123] print_report() [mm/kasan/report.c:379 mm/kasan/report.c:482] kasan_report() [mm/kasan/report.c:597] rxgk_verify_response() [net/rxrpc/rxgk.c:1103 net/rxrpc/rxgk.c:1167 net/rxrpc/rxgk.c:1274] rxrpc_process_connection() [net/rxrpc/conn_event.c:266 net/rxrpc/conn_event.c:364 net/rxrpc/conn_event.c:386] process_one_work() [kernel/workqueue.c:3281] worker_thread() [kernel/workqueue.c:3353 kernel/workqueue.c:3440] kthread() [kernel/kthread.c:436] ret_from_fork() [arch/x86/kernel/process.c:164] Allocated by task 54: rxgk_verify_response() [include/linux/slab.h:954 net/rxrpc/rxgk.c:1155 net/rxrpc/rxgk.c:1274] rxrpc_process_connection() [net/rxrpc/conn_event.c:266 net/rxrpc/conn_event.c:364 net/rxrpc/conn_event.c:386] Convert the byte count to __be32 units before constructing the parser limit. Fixes: 9d1d2b59341f ("rxrpc: rxgk: Implement the yfs-rxgk security class (GSSAPI)") Signed-off-by: Keenan Dong Signed-off-by: David Howells cc: Marc Dionne cc: Simon Horman cc: Willy Tarreau cc: linux-afs@lists.infradead.org cc: stable@kernel.org Link: https://patch.msgid.link/20260408121252.2249051-13-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- net/rxrpc/rxgk.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/rxgk.c b/net/rxrpc/rxgk.c index f9f5a2dc62ed..01dbdf0b5cf2 100644 --- a/net/rxrpc/rxgk.c +++ b/net/rxrpc/rxgk.c @@ -1164,7 +1164,8 @@ static int rxgk_verify_authenticator(struct rxrpc_connection *conn, } p = auth; - ret = rxgk_do_verify_authenticator(conn, krb5, skb, p, p + auth_len); + ret = rxgk_do_verify_authenticator(conn, krb5, skb, p, + p + auth_len / sizeof(*p)); error: kfree(auth); return ret; -- cgit v1.2.3 From a2567217ade970ecc458144b6be469bc015b23e5 Mon Sep 17 00:00:00 2001 From: Keenan Dong Date: Wed, 8 Apr 2026 13:12:41 +0100 Subject: rxrpc: fix oversized RESPONSE authenticator length check rxgk_verify_response() decodes auth_len from the packet and is supposed to verify that it fits in the remaining bytes. The existing check is inverted, so oversized RESPONSE authenticators are accepted and passed to rxgk_decrypt_skb(), which can later reach skb_to_sgvec() with an impossible length and hit BUG_ON(len). Decoded from the original latest-net reproduction logs with scripts/decode_stacktrace.sh: RIP: __skb_to_sgvec() [net/core/skbuff.c:5285 (discriminator 1)] Call Trace: skb_to_sgvec() [net/core/skbuff.c:5305] rxgk_decrypt_skb() [net/rxrpc/rxgk_common.h:81] rxgk_verify_response() [net/rxrpc/rxgk.c:1268] rxrpc_process_connection() [net/rxrpc/conn_event.c:266 net/rxrpc/conn_event.c:364 net/rxrpc/conn_event.c:386] process_one_work() [kernel/workqueue.c:3281] worker_thread() [kernel/workqueue.c:3353 kernel/workqueue.c:3440] kthread() [kernel/kthread.c:436] ret_from_fork() [arch/x86/kernel/process.c:164] Reject authenticator lengths that exceed the remaining packet payload. Fixes: 9d1d2b59341f ("rxrpc: rxgk: Implement the yfs-rxgk security class (GSSAPI)") Signed-off-by: Keenan Dong Signed-off-by: David Howells cc: Marc Dionne cc: Simon Horman cc: Willy Tarreau cc: linux-afs@lists.infradead.org cc: stable@kernel.org Link: https://patch.msgid.link/20260408121252.2249051-14-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- net/rxrpc/rxgk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/rxgk.c b/net/rxrpc/rxgk.c index 01dbdf0b5cf2..9e4a4ff28913 100644 --- a/net/rxrpc/rxgk.c +++ b/net/rxrpc/rxgk.c @@ -1224,7 +1224,7 @@ static int rxgk_verify_response(struct rxrpc_connection *conn, auth_offset = offset; auth_len = ntohl(xauth_len); - if (auth_len < len) + if (auth_len > len) goto short_packet; if (auth_len & 3) goto inconsistent; -- cgit v1.2.3 From f125846ee79fcae537a964ce66494e96fa54a6de Mon Sep 17 00:00:00 2001 From: Luxiao Xu Date: Wed, 8 Apr 2026 13:12:42 +0100 Subject: rxrpc: fix reference count leak in rxrpc_server_keyring() This patch fixes a reference count leak in rxrpc_server_keyring() by checking if rx->securities is already set. Fixes: 17926a79320a ("[AF_RXRPC]: Provide secure RxRPC sockets for use by userspace and kernel both") Reported-by: Yifan Wu Reported-by: Juefei Pu Co-developed-by: Yuan Tan Signed-off-by: Yuan Tan Suggested-by: Xin Liu Tested-by: Ren Wei Signed-off-by: Luxiao Xu Signed-off-by: Ren Wei Signed-off-by: David Howells cc: Marc Dionne cc: Simon Horman cc: linux-afs@lists.infradead.org cc: stable@kernel.org Link: https://patch.msgid.link/20260408121252.2249051-15-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- net/rxrpc/server_key.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/rxrpc/server_key.c b/net/rxrpc/server_key.c index 36b05fd842a7..27491f1e1273 100644 --- a/net/rxrpc/server_key.c +++ b/net/rxrpc/server_key.c @@ -125,6 +125,9 @@ int rxrpc_server_keyring(struct rxrpc_sock *rx, sockptr_t optval, int optlen) _enter(""); + if (rx->securities) + return -EINVAL; + if (optlen <= 0 || optlen > PAGE_SIZE - 1) return -EINVAL; -- cgit v1.2.3 From 2afd86ccbb2082a3c4258aea8c07e5bb6267bc2f Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 8 Apr 2026 13:12:43 +0100 Subject: rxrpc: Fix key/keyring checks in setsockopt(RXRPC_SECURITY_KEY/KEYRING) An AF_RXRPC socket can be both client and server at the same time. When sending new calls (ie. it's acting as a client), it uses rx->key to set the security, and when accepting incoming calls (ie. it's acting as a server), it uses rx->securities. setsockopt(RXRPC_SECURITY_KEY) sets rx->key to point to an rxrpc-type key and setsockopt(RXRPC_SECURITY_KEYRING) sets rx->securities to point to a keyring of rxrpc_s-type keys. Now, it should be possible to use both rx->key and rx->securities on the same socket - but for userspace AF_RXRPC sockets rxrpc_setsockopt() prevents that. Fix this by: (1) Remove the incorrect check rxrpc_setsockopt(RXRPC_SECURITY_KEYRING) makes on rx->key. (2) Move the check that rxrpc_setsockopt(RXRPC_SECURITY_KEY) makes on rx->key down into rxrpc_request_key(). (3) Remove rxrpc_request_key()'s check on rx->securities. This (in combination with a previous patch) pushes the checks down into the functions that set those pointers and removes the cross-checks that prevent both key and keyring being set. Fixes: 17926a79320a ("[AF_RXRPC]: Provide secure RxRPC sockets for use by userspace and kernel both") Closes: https://sashiko.dev/#/patchset/20260401105614.1696001-10-dhowells@redhat.com Signed-off-by: David Howells cc: Marc Dionne cc: Anderson Nascimento cc: Luxiao Xu cc: Yuan Tan cc: Simon Horman cc: linux-afs@lists.infradead.org cc: stable@kernel.org Link: https://patch.msgid.link/20260408121252.2249051-16-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- net/rxrpc/af_rxrpc.c | 6 ------ net/rxrpc/key.c | 2 +- 2 files changed, 1 insertion(+), 7 deletions(-) (limited to 'net') diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 0f90272ac254..32ec91fa938f 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -654,9 +654,6 @@ static int rxrpc_setsockopt(struct socket *sock, int level, int optname, goto success; case RXRPC_SECURITY_KEY: - ret = -EINVAL; - if (rx->key) - goto error; ret = -EISCONN; if (rx->sk.sk_state != RXRPC_UNBOUND) goto error; @@ -664,9 +661,6 @@ static int rxrpc_setsockopt(struct socket *sock, int level, int optname, goto error; case RXRPC_SECURITY_KEYRING: - ret = -EINVAL; - if (rx->key) - goto error; ret = -EISCONN; if (rx->sk.sk_state != RXRPC_UNBOUND) goto error; diff --git a/net/rxrpc/key.c b/net/rxrpc/key.c index 77237a82be3b..6301d79ee35a 100644 --- a/net/rxrpc/key.c +++ b/net/rxrpc/key.c @@ -622,7 +622,7 @@ int rxrpc_request_key(struct rxrpc_sock *rx, sockptr_t optval, int optlen) _enter(""); - if (optlen <= 0 || optlen > PAGE_SIZE - 1 || rx->securities) + if (optlen <= 0 || optlen > PAGE_SIZE - 1 || rx->key) return -EINVAL; description = memdup_sockptr_nul(optval, optlen); -- cgit v1.2.3 From f93af41b9f5f798823d0d0fb8765c2a936d76270 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 8 Apr 2026 13:12:44 +0100 Subject: rxrpc: Fix missing error checks for rxkad encryption/decryption failure Add error checking for failure of crypto_skcipher_en/decrypt() to various rxkad function as the crypto functions can fail with ENOMEM at least. Fixes: 17926a79320a ("[AF_RXRPC]: Provide secure RxRPC sockets for use by userspace and kernel both") Closes: https://sashiko.dev/#/patchset/20260401105614.1696001-10-dhowells@redhat.com Signed-off-by: David Howells cc: Marc Dionne cc: Jeffrey Altman cc: Simon Horman cc: linux-afs@lists.infradead.org cc: stable@kernel.org Link: https://patch.msgid.link/20260408121252.2249051-17-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- net/rxrpc/rxkad.c | 57 ++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 38 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 0f79d694cb08..eb7f2769d2b1 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -197,6 +197,7 @@ static int rxkad_prime_packet_security(struct rxrpc_connection *conn, struct rxrpc_crypt iv; __be32 *tmpbuf; size_t tmpsize = 4 * sizeof(__be32); + int ret; _enter(""); @@ -225,13 +226,13 @@ static int rxkad_prime_packet_security(struct rxrpc_connection *conn, skcipher_request_set_sync_tfm(req, ci); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, &sg, &sg, tmpsize, iv.x); - crypto_skcipher_encrypt(req); + ret = crypto_skcipher_encrypt(req); skcipher_request_free(req); memcpy(&conn->rxkad.csum_iv, tmpbuf + 2, sizeof(conn->rxkad.csum_iv)); kfree(tmpbuf); - _leave(" = 0"); - return 0; + _leave(" = %d", ret); + return ret; } /* @@ -264,6 +265,7 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call, struct scatterlist sg; size_t pad; u16 check; + int ret; _enter(""); @@ -286,11 +288,11 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call, skcipher_request_set_sync_tfm(req, call->conn->rxkad.cipher); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, &sg, &sg, 8, iv.x); - crypto_skcipher_encrypt(req); + ret = crypto_skcipher_encrypt(req); skcipher_request_zero(req); - _leave(" = 0"); - return 0; + _leave(" = %d", ret); + return ret; } /* @@ -345,7 +347,7 @@ static int rxkad_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) union { __be32 buf[2]; } crypto __aligned(8); - u32 x, y; + u32 x, y = 0; int ret; _enter("{%d{%x}},{#%u},%u,", @@ -376,8 +378,10 @@ static int rxkad_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) skcipher_request_set_sync_tfm(req, call->conn->rxkad.cipher); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, &sg, &sg, 8, iv.x); - crypto_skcipher_encrypt(req); + ret = crypto_skcipher_encrypt(req); skcipher_request_zero(req); + if (ret < 0) + goto out; y = ntohl(crypto.buf[1]); y = (y >> 16) & 0xffff; @@ -413,6 +417,7 @@ static int rxkad_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) memset(p + txb->pkt_len, 0, gap); } +out: skcipher_request_free(req); _leave(" = %d [set %x]", ret, y); return ret; @@ -453,8 +458,10 @@ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb, skcipher_request_set_sync_tfm(req, call->conn->rxkad.cipher); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, sg, sg, 8, iv.x); - crypto_skcipher_decrypt(req); + ret = crypto_skcipher_decrypt(req); skcipher_request_zero(req); + if (ret < 0) + return ret; /* Extract the decrypted packet length */ if (skb_copy_bits(skb, sp->offset, &sechdr, sizeof(sechdr)) < 0) @@ -531,10 +538,14 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, skcipher_request_set_sync_tfm(req, call->conn->rxkad.cipher); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, sg, sg, sp->len, iv.x); - crypto_skcipher_decrypt(req); + ret = crypto_skcipher_decrypt(req); skcipher_request_zero(req); if (sg != _sg) kfree(sg); + if (ret < 0) { + WARN_ON_ONCE(ret != -ENOMEM); + return ret; + } /* Extract the decrypted packet length */ if (skb_copy_bits(skb, sp->offset, &sechdr, sizeof(sechdr)) < 0) @@ -602,8 +613,10 @@ static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb) skcipher_request_set_sync_tfm(req, call->conn->rxkad.cipher); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, &sg, &sg, 8, iv.x); - crypto_skcipher_encrypt(req); + ret = crypto_skcipher_encrypt(req); skcipher_request_zero(req); + if (ret < 0) + goto out; y = ntohl(crypto.buf[1]); cksum = (y >> 16) & 0xffff; @@ -1077,21 +1090,23 @@ static int rxkad_decrypt_ticket(struct rxrpc_connection *conn, /* * decrypt the response packet */ -static void rxkad_decrypt_response(struct rxrpc_connection *conn, - struct rxkad_response *resp, - const struct rxrpc_crypt *session_key) +static int rxkad_decrypt_response(struct rxrpc_connection *conn, + struct rxkad_response *resp, + const struct rxrpc_crypt *session_key) { struct skcipher_request *req = rxkad_ci_req; struct scatterlist sg[1]; struct rxrpc_crypt iv; + int ret; _enter(",,%08x%08x", ntohl(session_key->n[0]), ntohl(session_key->n[1])); mutex_lock(&rxkad_ci_mutex); - if (crypto_sync_skcipher_setkey(rxkad_ci, session_key->x, - sizeof(*session_key)) < 0) - BUG(); + ret = crypto_sync_skcipher_setkey(rxkad_ci, session_key->x, + sizeof(*session_key)); + if (ret < 0) + goto unlock; memcpy(&iv, session_key, sizeof(iv)); @@ -1100,12 +1115,14 @@ static void rxkad_decrypt_response(struct rxrpc_connection *conn, skcipher_request_set_sync_tfm(req, rxkad_ci); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, sg, sg, sizeof(resp->encrypted), iv.x); - crypto_skcipher_decrypt(req); + ret = crypto_skcipher_decrypt(req); skcipher_request_zero(req); +unlock: mutex_unlock(&rxkad_ci_mutex); _leave(""); + return ret; } /* @@ -1198,7 +1215,9 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, /* use the session key from inside the ticket to decrypt the * response */ - rxkad_decrypt_response(conn, response, &session_key); + ret = rxkad_decrypt_response(conn, response, &session_key); + if (ret < 0) + goto temporary_error_free_ticket; if (ntohl(response->encrypted.epoch) != conn->proto.epoch || ntohl(response->encrypted.cid) != conn->proto.cid || -- cgit v1.2.3 From 699e52180f4231c257821c037ed5c99d5eb0edb8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 8 Apr 2026 13:12:45 +0100 Subject: rxrpc: Fix integer overflow in rxgk_verify_response() In rxgk_verify_response(), there's a potential integer overflow due to rounding up token_len before checking it, thereby allowing the length check to be bypassed. Fix this by checking the unrounded value against len too (len is limited as the response must fit in a single UDP packet). Fixes: 9d1d2b59341f ("rxrpc: rxgk: Implement the yfs-rxgk security class (GSSAPI)") Closes: https://sashiko.dev/#/patchset/20260401105614.1696001-10-dhowells@redhat.com Signed-off-by: David Howells cc: Marc Dionne cc: Jeffrey Altman cc: Simon Horman cc: linux-afs@lists.infradead.org cc: stable@kernel.org Link: https://patch.msgid.link/20260408121252.2249051-18-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- net/rxrpc/rxgk.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/rxgk.c b/net/rxrpc/rxgk.c index 9e4a4ff28913..064c1531fc99 100644 --- a/net/rxrpc/rxgk.c +++ b/net/rxrpc/rxgk.c @@ -1209,7 +1209,8 @@ static int rxgk_verify_response(struct rxrpc_connection *conn, token_offset = offset; token_len = ntohl(rhdr.token_len); - if (xdr_round_up(token_len) + sizeof(__be32) > len) + if (token_len > len || + xdr_round_up(token_len) + sizeof(__be32) > len) goto short_packet; trace_rxrpc_rx_response(conn, sp->hdr.serial, 0, sp->hdr.cksum, token_len); -- cgit v1.2.3 From 7e1876caa8363056f58a21d3b31b82c2daf7e608 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 8 Apr 2026 13:12:46 +0100 Subject: rxrpc: Fix leak of rxgk context in rxgk_verify_response() Fix rxgk_verify_response() to clean up the rxgk context it creates. Fixes: 9d1d2b59341f ("rxrpc: rxgk: Implement the yfs-rxgk security class (GSSAPI)") Closes: https://sashiko.dev/#/patchset/20260401105614.1696001-10-dhowells@redhat.com Signed-off-by: David Howells cc: Marc Dionne cc: Jeffrey Altman cc: Simon Horman cc: linux-afs@lists.infradead.org cc: stable@kernel.org Link: https://patch.msgid.link/20260408121252.2249051-19-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- net/rxrpc/rxgk.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rxrpc/rxgk.c b/net/rxrpc/rxgk.c index 064c1531fc99..c67e3c2ca871 100644 --- a/net/rxrpc/rxgk.c +++ b/net/rxrpc/rxgk.c @@ -1270,16 +1270,18 @@ static int rxgk_verify_response(struct rxrpc_connection *conn, if (ret < 0) { rxrpc_abort_conn(conn, skb, RXGK_SEALEDINCON, ret, rxgk_abort_resp_auth_dec); - goto out; + goto out_gk; } ret = rxgk_verify_authenticator(conn, krb5, skb, auth_offset, auth_len); if (ret < 0) - goto out; + goto out_gk; conn->key = key; key = NULL; ret = 0; +out_gk: + rxgk_put(gk); out: key_put(key); _leave(" = %d", ret); -- cgit v1.2.3 From f564af387c8c28238f8ebc13314c589d7ba8475d Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 8 Apr 2026 13:12:47 +0100 Subject: rxrpc: Fix buffer overread in rxgk_do_verify_authenticator() Fix rxgk_do_verify_authenticator() to check the buffer size before checking the nonce. Fixes: 9d1d2b59341f ("rxrpc: rxgk: Implement the yfs-rxgk security class (GSSAPI)") Closes: https://sashiko.dev/#/patchset/20260401105614.1696001-10-dhowells@redhat.com Signed-off-by: David Howells cc: Marc Dionne cc: Jeffrey Altman cc: Simon Horman cc: linux-afs@lists.infradead.org cc: stable@kernel.org Link: https://patch.msgid.link/20260408121252.2249051-20-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- net/rxrpc/rxgk.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/rxgk.c b/net/rxrpc/rxgk.c index c67e3c2ca871..0d5e654da918 100644 --- a/net/rxrpc/rxgk.c +++ b/net/rxrpc/rxgk.c @@ -1085,6 +1085,9 @@ static int rxgk_do_verify_authenticator(struct rxrpc_connection *conn, _enter(""); + if ((end - p) * sizeof(__be32) < 24) + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, + rxgk_abort_resp_short_auth); if (memcmp(p, conn->rxgk.nonce, 20) != 0) return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, rxgk_abort_resp_bad_nonce); @@ -1098,7 +1101,7 @@ static int rxgk_do_verify_authenticator(struct rxrpc_connection *conn, p += xdr_round_up(app_len) / sizeof(__be32); if (end - p < 4) return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, - rxgk_abort_resp_short_applen); + rxgk_abort_resp_short_auth); level = ntohl(*p++); epoch = ntohl(*p++); -- cgit v1.2.3 From c43ffdcfdbb5567b1f143556df8a04b4eeea041c Mon Sep 17 00:00:00 2001 From: Wang Jie Date: Wed, 8 Apr 2026 13:12:48 +0100 Subject: rxrpc: only handle RESPONSE during service challenge Only process RESPONSE packets while the service connection is still in RXRPC_CONN_SERVICE_CHALLENGING. Check that state under state_lock before running response verification and security initialization, then use a local secured flag to decide whether to queue the secured-connection work after the state transition. This keeps duplicate or late RESPONSE packets from re-running the setup path and removes the unlocked post-transition state test. Fixes: 17926a79320a ("[AF_RXRPC]: Provide secure RxRPC sockets for use by userspace and kernel both") Reported-by: Yifan Wu Reported-by: Juefei Pu Co-developed-by: Yuan Tan Signed-off-by: Yuan Tan Suggested-by: Xin Liu Signed-off-by: Jie Wang Signed-off-by: Yang Yang Signed-off-by: David Howells cc: Marc Dionne cc: Jeffrey Altman cc: Simon Horman cc: linux-afs@lists.infradead.org cc: stable@kernel.org Link: https://patch.msgid.link/20260408121252.2249051-21-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- net/rxrpc/conn_event.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index c50cbfc5a313..9a41ec708aeb 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -247,6 +247,7 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + bool secured = false; int ret; if (conn->state == RXRPC_CONN_ABORTED) @@ -262,6 +263,13 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, return ret; case RXRPC_PACKET_TYPE_RESPONSE: + spin_lock_irq(&conn->state_lock); + if (conn->state != RXRPC_CONN_SERVICE_CHALLENGING) { + spin_unlock_irq(&conn->state_lock); + return 0; + } + spin_unlock_irq(&conn->state_lock); + ret = conn->security->verify_response(conn, skb); if (ret < 0) return ret; @@ -272,11 +280,13 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, return ret; spin_lock_irq(&conn->state_lock); - if (conn->state == RXRPC_CONN_SERVICE_CHALLENGING) + if (conn->state == RXRPC_CONN_SERVICE_CHALLENGING) { conn->state = RXRPC_CONN_SERVICE; + secured = true; + } spin_unlock_irq(&conn->state_lock); - if (conn->state == RXRPC_CONN_SERVICE) { + if (secured) { /* Offload call state flipping to the I/O thread. As * we've already received the packet, put it on the * front of the queue. -- cgit v1.2.3 From a44ce6aa2efb61fe44f2cfab72bb01544bbca272 Mon Sep 17 00:00:00 2001 From: Pengpeng Hou Date: Wed, 8 Apr 2026 13:12:49 +0100 Subject: rxrpc: proc: size address buffers for %pISpc output The AF_RXRPC procfs helpers format local and remote socket addresses into fixed 50-byte stack buffers with "%pISpc". That is too small for the longest current-tree IPv6-with-port form the formatter can produce. In lib/vsprintf.c, the compressed IPv6 path uses a dotted-quad tail not only for v4mapped addresses, but also for ISATAP addresses via ipv6_addr_is_isatap(). As a result, a case such as [ffff:ffff:ffff:ffff:0:5efe:255.255.255.255]:65535 is possible with the current formatter. That is 50 visible characters, so 51 bytes including the trailing NUL, which does not fit in the existing char[50] buffers used by net/rxrpc/proc.c. Size the buffers from the formatter's maximum textual form and switch the call sites to scnprintf(). Changes since v1: - correct the changelog to cite the actual maximum current-tree case explicitly - frame the proof around the ISATAP formatting path instead of the earlier mapped-v4 example Fixes: 75b54cb57ca3 ("rxrpc: Add IPv6 support") Signed-off-by: Pengpeng Hou Signed-off-by: David Howells cc: Marc Dionne cc: Anderson Nascimento cc: Simon Horman cc: linux-afs@lists.infradead.org cc: stable@kernel.org Link: https://patch.msgid.link/20260408121252.2249051-22-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- net/rxrpc/proc.c | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index 7755fca5beb8..e9a27fa7b25d 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -10,6 +10,10 @@ #include #include "ar-internal.h" +#define RXRPC_PROC_ADDRBUF_SIZE \ + (sizeof("[xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:255.255.255.255]") + \ + sizeof(":12345")) + static const char *const rxrpc_conn_states[RXRPC_CONN__NR_STATES] = { [RXRPC_CONN_UNUSED] = "Unused ", [RXRPC_CONN_CLIENT_UNSECURED] = "ClUnsec ", @@ -53,7 +57,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq)); enum rxrpc_call_state state; rxrpc_seq_t tx_bottom; - char lbuff[50], rbuff[50]; + char lbuff[RXRPC_PROC_ADDRBUF_SIZE], rbuff[RXRPC_PROC_ADDRBUF_SIZE]; long timeout = 0; if (v == &rxnet->calls) { @@ -69,11 +73,11 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) local = call->local; if (local) - sprintf(lbuff, "%pISpc", &local->srx.transport); + scnprintf(lbuff, sizeof(lbuff), "%pISpc", &local->srx.transport); else strcpy(lbuff, "no_local"); - sprintf(rbuff, "%pISpc", &call->dest_srx.transport); + scnprintf(rbuff, sizeof(rbuff), "%pISpc", &call->dest_srx.transport); state = rxrpc_call_state(call); if (state != RXRPC_CALL_SERVER_PREALLOC) @@ -142,7 +146,7 @@ static int rxrpc_connection_seq_show(struct seq_file *seq, void *v) struct rxrpc_connection *conn; struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq)); const char *state; - char lbuff[50], rbuff[50]; + char lbuff[RXRPC_PROC_ADDRBUF_SIZE], rbuff[RXRPC_PROC_ADDRBUF_SIZE]; if (v == &rxnet->conn_proc_list) { seq_puts(seq, @@ -161,8 +165,8 @@ static int rxrpc_connection_seq_show(struct seq_file *seq, void *v) goto print; } - sprintf(lbuff, "%pISpc", &conn->local->srx.transport); - sprintf(rbuff, "%pISpc", &conn->peer->srx.transport); + scnprintf(lbuff, sizeof(lbuff), "%pISpc", &conn->local->srx.transport); + scnprintf(rbuff, sizeof(rbuff), "%pISpc", &conn->peer->srx.transport); print: state = rxrpc_is_conn_aborted(conn) ? rxrpc_call_completions[conn->completion] : @@ -228,7 +232,7 @@ static int rxrpc_bundle_seq_show(struct seq_file *seq, void *v) { struct rxrpc_bundle *bundle; struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq)); - char lbuff[50], rbuff[50]; + char lbuff[RXRPC_PROC_ADDRBUF_SIZE], rbuff[RXRPC_PROC_ADDRBUF_SIZE]; if (v == &rxnet->bundle_proc_list) { seq_puts(seq, @@ -242,8 +246,8 @@ static int rxrpc_bundle_seq_show(struct seq_file *seq, void *v) bundle = list_entry(v, struct rxrpc_bundle, proc_link); - sprintf(lbuff, "%pISpc", &bundle->local->srx.transport); - sprintf(rbuff, "%pISpc", &bundle->peer->srx.transport); + scnprintf(lbuff, sizeof(lbuff), "%pISpc", &bundle->local->srx.transport); + scnprintf(rbuff, sizeof(rbuff), "%pISpc", &bundle->peer->srx.transport); seq_printf(seq, "UDP %-47.47s %-47.47s %4x %3u %3d" " %c%c%c %08x | %08x %08x %08x %08x %08x\n", @@ -279,7 +283,7 @@ static int rxrpc_peer_seq_show(struct seq_file *seq, void *v) { struct rxrpc_peer *peer; time64_t now; - char lbuff[50], rbuff[50]; + char lbuff[RXRPC_PROC_ADDRBUF_SIZE], rbuff[RXRPC_PROC_ADDRBUF_SIZE]; if (v == SEQ_START_TOKEN) { seq_puts(seq, @@ -290,9 +294,9 @@ static int rxrpc_peer_seq_show(struct seq_file *seq, void *v) peer = list_entry(v, struct rxrpc_peer, hash_link); - sprintf(lbuff, "%pISpc", &peer->local->srx.transport); + scnprintf(lbuff, sizeof(lbuff), "%pISpc", &peer->local->srx.transport); - sprintf(rbuff, "%pISpc", &peer->srx.transport); + scnprintf(rbuff, sizeof(rbuff), "%pISpc", &peer->srx.transport); now = ktime_get_seconds(); seq_printf(seq, @@ -401,7 +405,7 @@ const struct seq_operations rxrpc_peer_seq_ops = { static int rxrpc_local_seq_show(struct seq_file *seq, void *v) { struct rxrpc_local *local; - char lbuff[50]; + char lbuff[RXRPC_PROC_ADDRBUF_SIZE]; if (v == SEQ_START_TOKEN) { seq_puts(seq, @@ -412,7 +416,7 @@ static int rxrpc_local_seq_show(struct seq_file *seq, void *v) local = hlist_entry(v, struct rxrpc_local, link); - sprintf(lbuff, "%pISpc", &local->srx.transport); + scnprintf(lbuff, sizeof(lbuff), "%pISpc", &local->srx.transport); seq_printf(seq, "UDP %-47.47s %3u %3u %3u\n", -- cgit v1.2.3 From b30b1675aa2bcf0491fd3830b051df4e08a7c8ca Mon Sep 17 00:00:00 2001 From: Justin Iurman Date: Sat, 4 Apr 2026 15:41:37 +0200 Subject: net: ioam6: fix OOB and missing lock When trace->type.bit6 is set: if (trace->type.bit6) { ... queue = skb_get_tx_queue(dev, skb); qdisc = rcu_dereference(queue->qdisc); This code can lead to an out-of-bounds access of the dev->_tx[] array when is_input is true. In such a case, the packet is on the RX path and skb->queue_mapping contains the RX queue index of the ingress device. If the ingress device has more RX queues than the egress device (dev) has TX queues, skb_get_queue_mapping(skb) will exceed dev->num_tx_queues. Add a check to avoid this situation since skb_get_tx_queue() does not clamp the index. This issue has also revealed that per queue visibility cannot be accurate and will be replaced later as a new feature. While at it, add missing lock around qdisc_qstats_qlen_backlog(). The function __ioam6_fill_trace_data() is called from both softirq and process contexts, hence the use of spin_lock_bh() here. Fixes: b63c5478e9cb ("ipv6: ioam: Support for Queue depth data field") Reported-by: Jakub Kicinski Closes: https://lore.kernel.org/netdev/20260403214418.2233266-2-kuba@kernel.org/ Signed-off-by: Justin Iurman Link: https://patch.msgid.link/20260404134137.24553-1-justin.iurman@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv6/ioam6.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/ioam6.c b/net/ipv6/ioam6.c index 05a0b7d7e2aa..e963a71858a7 100644 --- a/net/ipv6/ioam6.c +++ b/net/ipv6/ioam6.c @@ -803,12 +803,16 @@ static void __ioam6_fill_trace_data(struct sk_buff *skb, struct Qdisc *qdisc; __u32 qlen, backlog; - if (dev->flags & IFF_LOOPBACK) { + if (dev->flags & IFF_LOOPBACK || + skb_get_queue_mapping(skb) >= dev->num_tx_queues) { *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); } else { queue = skb_get_tx_queue(dev, skb); qdisc = rcu_dereference(queue->qdisc); + + spin_lock_bh(qdisc_lock(qdisc)); qdisc_qstats_qlen_backlog(qdisc, &qlen, &backlog); + spin_unlock_bh(qdisc_lock(qdisc)); *(__be32 *)data = cpu_to_be32(backlog); } -- cgit v1.2.3 From 9b55b253907e7431210483519c5ad711a37dafa1 Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Mon, 6 Apr 2026 11:15:10 +0800 Subject: mptcp: fix slab-use-after-free in __inet_lookup_established The ehash table lookups are lockless and rely on SLAB_TYPESAFE_BY_RCU to guarantee socket memory stability during RCU read-side critical sections. Both tcp_prot and tcpv6_prot have their slab caches created with this flag via proto_register(). However, MPTCP's mptcp_subflow_init() copies tcpv6_prot into tcpv6_prot_override during inet_init() (fs_initcall, level 5), before inet6_init() (module_init/device_initcall, level 6) has called proto_register(&tcpv6_prot). At that point, tcpv6_prot.slab is still NULL, so tcpv6_prot_override.slab remains NULL permanently. This causes MPTCP v6 subflow child sockets to be allocated via kmalloc (falling into kmalloc-4k) instead of the TCPv6 slab cache. The kmalloc-4k cache lacks SLAB_TYPESAFE_BY_RCU, so when these sockets are freed without SOCK_RCU_FREE (which is cleared for child sockets by design), the memory can be immediately reused. Concurrent ehash lookups under rcu_read_lock can then access freed memory, triggering a slab-use-after-free in __inet_lookup_established. Fix this by splitting the IPv6-specific initialization out of mptcp_subflow_init() into a new mptcp_subflow_v6_init(), called from mptcp_proto_v6_init() before protocol registration. This ensures tcpv6_prot_override.slab correctly inherits the SLAB_TYPESAFE_BY_RCU slab cache. Fixes: b19bc2945b40 ("mptcp: implement delegated actions") Cc: stable@vger.kernel.org Signed-off-by: Jiayuan Chen Reviewed-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20260406031512.189159-1-jiayuan.chen@linux.dev Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 2 ++ net/mptcp/protocol.h | 1 + net/mptcp/subflow.c | 15 +++++++++------ 3 files changed, 12 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 65c3bb8016f4..614c3f583ca0 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -4660,6 +4660,8 @@ int __init mptcp_proto_v6_init(void) { int err; + mptcp_subflow_v6_init(); + mptcp_v6_prot = mptcp_prot; strscpy(mptcp_v6_prot.name, "MPTCPv6", sizeof(mptcp_v6_prot.name)); mptcp_v6_prot.slab = NULL; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 0bd1ee860316..ec15e503da8b 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -875,6 +875,7 @@ static inline void mptcp_subflow_tcp_fallback(struct sock *sk, void __init mptcp_proto_init(void); #if IS_ENABLED(CONFIG_MPTCP_IPV6) int __init mptcp_proto_v6_init(void); +void __init mptcp_subflow_v6_init(void); #endif struct sock *mptcp_sk_clone_init(const struct sock *sk, diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 6716970693e9..4ff5863aa9fd 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -2165,7 +2165,15 @@ void __init mptcp_subflow_init(void) tcp_prot_override.psock_update_sk_prot = NULL; #endif + mptcp_diag_subflow_init(&subflow_ulp_ops); + + if (tcp_register_ulp(&subflow_ulp_ops) != 0) + panic("MPTCP: failed to register subflows to ULP\n"); +} + #if IS_ENABLED(CONFIG_MPTCP_IPV6) +void __init mptcp_subflow_v6_init(void) +{ /* In struct mptcp_subflow_request_sock, we assume the TCP request sock * structures for v4 and v6 have the same size. It should not changed in * the future but better to make sure to be warned if it is no longer @@ -2204,10 +2212,5 @@ void __init mptcp_subflow_init(void) /* Disable sockmap processing for subflows */ tcpv6_prot_override.psock_update_sk_prot = NULL; #endif -#endif - - mptcp_diag_subflow_init(&subflow_ulp_ops); - - if (tcp_register_ulp(&subflow_ulp_ops) != 0) - panic("MPTCP: failed to register subflows to ULP\n"); } +#endif -- cgit v1.2.3 From 8e2760eaab778494fc1fa257031e0e1799647f46 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Tue, 7 Apr 2026 10:41:41 +0200 Subject: Revert "mptcp: add needs_id for netlink appending addr" This commit was originally adding the ability to add MPTCP endpoints with ID 0 by accident. The in-kernel PM, handling MPTCP endpoints at the net namespace level, is not supposed to handle endpoints with such ID, because this ID 0 is reserved to the initial subflow, as mentioned in the MPTCPv1 protocol [1], a per-connection setting. Note that 'ip mptcp endpoint add id 0' stops early with an error, but other tools might still request the in-kernel PM to create MPTCP endpoints with this restricted ID 0. In other words, it was wrong to call the mptcp_pm_has_addr_attr_id helper to check whether the address ID attribute is set: if it was set to 0, a new MPTCP endpoint would be created with ID 0, which is not expected, and might cause various issues later. Fixes: 584f38942626 ("mptcp: add needs_id for netlink appending addr") Cc: stable@vger.kernel.org Link: https://datatracker.ietf.org/doc/html/rfc8684#section-3.2-9 [1] Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20260407-net-mptcp-revert-pm-needs-id-v2-1-7a25cbc324f8@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_kernel.c | 24 +++++------------------- 1 file changed, 5 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c index 82e59f9c6dd9..0ebf43be9939 100644 --- a/net/mptcp/pm_kernel.c +++ b/net/mptcp/pm_kernel.c @@ -720,7 +720,7 @@ static void __mptcp_pm_release_addr_entry(struct mptcp_pm_addr_entry *entry) static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet, struct mptcp_pm_addr_entry *entry, - bool needs_id, bool replace) + bool replace) { struct mptcp_pm_addr_entry *cur, *del_entry = NULL; int ret = -EINVAL; @@ -779,7 +779,7 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet, } } - if (!entry->addr.id && needs_id) { + if (!entry->addr.id) { find_next: entry->addr.id = find_next_zero_bit(pernet->id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1, @@ -790,7 +790,7 @@ find_next: } } - if (!entry->addr.id && needs_id) + if (!entry->addr.id) goto out; __set_bit(entry->addr.id, pernet->id_bitmap); @@ -923,7 +923,7 @@ int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, return -ENOMEM; entry->addr.port = 0; - ret = mptcp_pm_nl_append_new_local_addr(pernet, entry, true, false); + ret = mptcp_pm_nl_append_new_local_addr(pernet, entry, false); if (ret < 0) kfree(entry); @@ -977,18 +977,6 @@ next: return 0; } -static bool mptcp_pm_has_addr_attr_id(const struct nlattr *attr, - struct genl_info *info) -{ - struct nlattr *tb[MPTCP_PM_ADDR_ATTR_MAX + 1]; - - if (!nla_parse_nested_deprecated(tb, MPTCP_PM_ADDR_ATTR_MAX, attr, - mptcp_pm_address_nl_policy, info->extack) && - tb[MPTCP_PM_ADDR_ATTR_ID]) - return true; - return false; -} - /* Add an MPTCP endpoint */ int mptcp_pm_nl_add_addr_doit(struct sk_buff *skb, struct genl_info *info) { @@ -1037,9 +1025,7 @@ int mptcp_pm_nl_add_addr_doit(struct sk_buff *skb, struct genl_info *info) goto out_free; } } - ret = mptcp_pm_nl_append_new_local_addr(pernet, entry, - !mptcp_pm_has_addr_attr_id(attr, info), - true); + ret = mptcp_pm_nl_append_new_local_addr(pernet, entry, true); if (ret < 0) { GENL_SET_ERR_MSG_FMT(info, "too many addresses or duplicate one: %d", ret); goto out_free; -- cgit v1.2.3 From 39897df386376912d561d4946499379effa1e7ef Mon Sep 17 00:00:00 2001 From: Jiexun Wang Date: Tue, 7 Apr 2026 16:00:14 +0800 Subject: af_unix: read UNIX_DIAG_VFS data under unix_state_lock Exact UNIX diag lookups hold a reference to the socket, but not to u->path. Meanwhile, unix_release_sock() clears u->path under unix_state_lock() and drops the path reference after unlocking. Read the inode and device numbers for UNIX_DIAG_VFS while holding unix_state_lock(), then emit the netlink attribute after dropping the lock. This keeps the VFS data stable while the reply is being built. Fixes: 5f7b0569460b ("unix_diag: Unix inode info NLA") Reported-by: Yifan Wu Reported-by: Juefei Pu Co-developed-by: Yuan Tan Signed-off-by: Yuan Tan Suggested-by: Xin Liu Tested-by: Ren Wei Signed-off-by: Jiexun Wang Signed-off-by: Ren Wei Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260407080015.1744197-1-n05ec@lzu.edu.cn Signed-off-by: Jakub Kicinski --- net/unix/diag.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/unix/diag.c b/net/unix/diag.c index ca3473026151..c9c1e51c4419 100644 --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -28,18 +28,23 @@ static int sk_diag_dump_name(struct sock *sk, struct sk_buff *nlskb) static int sk_diag_dump_vfs(struct sock *sk, struct sk_buff *nlskb) { - struct dentry *dentry = unix_sk(sk)->path.dentry; + struct unix_diag_vfs uv; + struct dentry *dentry; + bool have_vfs = false; + unix_state_lock(sk); + dentry = unix_sk(sk)->path.dentry; if (dentry) { - struct unix_diag_vfs uv = { - .udiag_vfs_ino = d_backing_inode(dentry)->i_ino, - .udiag_vfs_dev = dentry->d_sb->s_dev, - }; - - return nla_put(nlskb, UNIX_DIAG_VFS, sizeof(uv), &uv); + uv.udiag_vfs_ino = d_backing_inode(dentry)->i_ino; + uv.udiag_vfs_dev = dentry->d_sb->s_dev; + have_vfs = true; } + unix_state_unlock(sk); - return 0; + if (!have_vfs) + return 0; + + return nla_put(nlskb, UNIX_DIAG_VFS, sizeof(uv), &uv); } static int sk_diag_dump_peer(struct sock *sk, struct sk_buff *nlskb) -- cgit v1.2.3 From 0006c6f1091bbeea88b8a88a6548b9fb2f803c74 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Mon, 6 Apr 2026 22:27:30 -0400 Subject: devlink: Fix incorrect skb socket family dumping The devlink_fmsg_dump_skb function was incorrectly using the socket type (sk->sk_type) instead of the socket family (sk->sk_family) when filling the "family" field in the fast message dump. This patch fixes this to properly display the socket family. Fixes: 3dbfde7f6bc7b8 ("devlink: add devlink_fmsg_dump_skb() function") Signed-off-by: Li RongQing Link: https://patch.msgid.link/20260407022730.2393-1-lirongqing@baidu.com Signed-off-by: Jakub Kicinski --- net/devlink/health.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/devlink/health.c b/net/devlink/health.c index 449c7611c640..ea7a334e939b 100644 --- a/net/devlink/health.c +++ b/net/devlink/health.c @@ -1327,7 +1327,7 @@ void devlink_fmsg_dump_skb(struct devlink_fmsg *fmsg, const struct sk_buff *skb) if (sk) { devlink_fmsg_pair_nest_start(fmsg, "sk"); devlink_fmsg_obj_nest_start(fmsg); - devlink_fmsg_put(fmsg, "family", sk->sk_type); + devlink_fmsg_put(fmsg, "family", sk->sk_family); devlink_fmsg_put(fmsg, "type", sk->sk_type); devlink_fmsg_put(fmsg, "proto", sk->sk_protocol); devlink_fmsg_obj_nest_end(fmsg); -- cgit v1.2.3 From ebe560ea5f54134279356703e73b7f867c89db13 Mon Sep 17 00:00:00 2001 From: Alice Mikityanska Date: Fri, 3 Apr 2026 20:49:49 +0300 Subject: l2tp: Drop large packets with UDP encap syzbot reported a WARN on my patch series [1]. The actual issue is an overflow of 16-bit UDP length field, and it exists in the upstream code. My series added a debug WARN with an overflow check that exposed the issue, that's why syzbot tripped on my patches, rather than on upstream code. syzbot's repro: r0 = socket$pppl2tp(0x18, 0x1, 0x1) r1 = socket$inet6_udp(0xa, 0x2, 0x0) connect$inet6(r1, &(0x7f00000000c0)={0xa, 0x0, 0x0, @loopback, 0xfffffffc}, 0x1c) connect$pppl2tp(r0, &(0x7f0000000240)=@pppol2tpin6={0x18, 0x1, {0x0, r1, 0x4, 0x0, 0x0, 0x0, {0xa, 0x4e22, 0xffff, @ipv4={'\x00', '\xff\xff', @empty}}}}, 0x32) writev(r0, &(0x7f0000000080)=[{&(0x7f0000000000)="ee", 0x34000}], 0x1) It basically sends an oversized (0x34000 bytes) PPPoL2TP packet with UDP encapsulation, and l2tp_xmit_core doesn't check for overflows when it assigns the UDP length field. The value gets trimmed to 16 bites. Add an overflow check that drops oversized packets and avoids sending packets with trimmed UDP length to the wire. syzbot's stack trace (with my patch applied): len >= 65536u WARNING: ./include/linux/udp.h:38 at udp_set_len_short include/linux/udp.h:38 [inline], CPU#1: syz.0.17/5957 WARNING: ./include/linux/udp.h:38 at l2tp_xmit_core net/l2tp/l2tp_core.c:1293 [inline], CPU#1: syz.0.17/5957 WARNING: ./include/linux/udp.h:38 at l2tp_xmit_skb+0x1204/0x18d0 net/l2tp/l2tp_core.c:1327, CPU#1: syz.0.17/5957 Modules linked in: CPU: 1 UID: 0 PID: 5957 Comm: syz.0.17 Not tainted syzkaller #0 PREEMPT(full) Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014 RIP: 0010:udp_set_len_short include/linux/udp.h:38 [inline] RIP: 0010:l2tp_xmit_core net/l2tp/l2tp_core.c:1293 [inline] RIP: 0010:l2tp_xmit_skb+0x1204/0x18d0 net/l2tp/l2tp_core.c:1327 Code: 0f 0b 90 e9 21 f9 ff ff e8 e9 05 ec f6 90 0f 0b 90 e9 8d f9 ff ff e8 db 05 ec f6 90 0f 0b 90 e9 cc f9 ff ff e8 cd 05 ec f6 90 <0f> 0b 90 e9 de fa ff ff 44 89 f1 80 e1 07 80 c1 03 38 c1 0f 8c 4f RSP: 0018:ffffc90003d67878 EFLAGS: 00010293 RAX: ffffffff8ad985e3 RBX: ffff8881a6400090 RCX: ffff8881697f0000 RDX: 0000000000000000 RSI: 0000000000034010 RDI: 000000000000ffff RBP: dffffc0000000000 R08: 0000000000000003 R09: 0000000000000004 R10: dffffc0000000000 R11: fffff520007acf00 R12: ffff8881baf20900 R13: 0000000000034010 R14: ffff8881a640008e R15: ffff8881760f7000 FS: 000055557e81f500(0000) GS:ffff8882a9467000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000200000033000 CR3: 00000001612f4000 CR4: 00000000000006f0 Call Trace: pppol2tp_sendmsg+0x40a/0x5f0 net/l2tp/l2tp_ppp.c:302 sock_sendmsg_nosec net/socket.c:727 [inline] __sock_sendmsg net/socket.c:742 [inline] sock_write_iter+0x503/0x550 net/socket.c:1195 do_iter_readv_writev+0x619/0x8c0 fs/read_write.c:-1 vfs_writev+0x33c/0x990 fs/read_write.c:1059 do_writev+0x154/0x2e0 fs/read_write.c:1105 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0x14d/0xf80 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f636479c629 Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 e8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007ffffd4241c8 EFLAGS: 00000246 ORIG_RAX: 0000000000000014 RAX: ffffffffffffffda RBX: 00007f6364a15fa0 RCX: 00007f636479c629 RDX: 0000000000000001 RSI: 0000200000000080 RDI: 0000000000000003 RBP: 00007f6364832b39 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 00007f6364a15fac R14: 00007f6364a15fa0 R15: 00007f6364a15fa0 [1]: https://lore.kernel.org/all/20260226201600.222044-1-alice.kernel@fastmail.im/ Fixes: 3557baabf280 ("[L2TP]: PPP over L2TP driver core") Reported-by: syzbot+ci3edea60a44225dec@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/69a1dfba.050a0220.3a55be.0026.GAE@google.com/ Signed-off-by: Alice Mikityanska Link: https://patch.msgid.link/20260403174949.843941-1-alice.kernel@fastmail.im Signed-off-by: Paolo Abeni --- net/l2tp/l2tp_core.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index c89ae52764b8..157fc23ce4e1 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1290,6 +1290,11 @@ static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, uns uh->source = inet->inet_sport; uh->dest = inet->inet_dport; udp_len = uhlen + session->hdr_len + data_len; + if (udp_len > U16_MAX) { + kfree_skb(skb); + ret = NET_XMIT_DROP; + goto out_unlock; + } uh->len = htons(udp_len); /* Calculate UDP checksum if configured to do so */ -- cgit v1.2.3