From db5914695a84a7b128ec2e4e9272e6e8091753e1 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 22 Jan 2024 11:25:57 +0000
Subject: inet_diag: add module pointer to "struct inet_diag_handler"

Following patch is going to use RCU instead of
inet_diag_table_mutex acquisition.

This patch is a preparation, no change of behavior yet.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Guillaume Nault <gnault@redhat.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/inet_diag.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h
index 84abb30a3fbb..a9033696b0aa 100644
--- a/include/linux/inet_diag.h
+++ b/include/linux/inet_diag.h
@@ -8,6 +8,7 @@
 struct inet_hashinfo;
 
 struct inet_diag_handler {
+	struct module	*owner;
 	void		(*dump)(struct sk_buff *skb,
 				struct netlink_callback *cb,
 				const struct inet_diag_req_v2 *r);
-- 
cgit v1.2.3


From 114b4bb1cc19239b272d52ebbe156053483fe2f8 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 22 Jan 2024 11:25:59 +0000
Subject: sock_diag: add module pointer to "struct sock_diag_handler"

Following patch is going to use RCU instead of
sock_diag_table_mutex acquisition.

This patch is a preparation, no change of behavior yet.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Guillaume Nault <gnault@redhat.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/sock_diag.h | 1 +
 net/ipv4/inet_diag.c      | 2 ++
 net/netlink/diag.c        | 1 +
 net/packet/diag.c         | 1 +
 net/smc/smc_diag.c        | 1 +
 net/tipc/diag.c           | 1 +
 net/unix/diag.c           | 1 +
 net/vmw_vsock/diag.c      | 1 +
 net/xdp/xsk_diag.c        | 1 +
 9 files changed, 10 insertions(+)

(limited to 'include')

diff --git a/include/linux/sock_diag.h b/include/linux/sock_diag.h
index 0b9ecd8cf979..7c07754d711b 100644
--- a/include/linux/sock_diag.h
+++ b/include/linux/sock_diag.h
@@ -13,6 +13,7 @@ struct nlmsghdr;
 struct sock;
 
 struct sock_diag_handler {
+	struct module *owner;
 	__u8 family;
 	int (*dump)(struct sk_buff *skb, struct nlmsghdr *nlh);
 	int (*get_info)(struct sk_buff *skb, struct sock *sk);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index abf7dc982796..52ce20691e4e 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -1488,6 +1488,7 @@ int inet_diag_handler_get_info(struct sk_buff *skb, struct sock *sk)
 }
 
 static const struct sock_diag_handler inet_diag_handler = {
+	.owner = THIS_MODULE,
 	.family = AF_INET,
 	.dump = inet_diag_handler_cmd,
 	.get_info = inet_diag_handler_get_info,
@@ -1495,6 +1496,7 @@ static const struct sock_diag_handler inet_diag_handler = {
 };
 
 static const struct sock_diag_handler inet6_diag_handler = {
+	.owner = THIS_MODULE,
 	.family = AF_INET6,
 	.dump = inet_diag_handler_cmd,
 	.get_info = inet_diag_handler_get_info,
diff --git a/net/netlink/diag.c b/net/netlink/diag.c
index 1eeff9422856..e12c90d5f6ad 100644
--- a/net/netlink/diag.c
+++ b/net/netlink/diag.c
@@ -241,6 +241,7 @@ static int netlink_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
 }
 
 static const struct sock_diag_handler netlink_diag_handler = {
+	.owner = THIS_MODULE,
 	.family = AF_NETLINK,
 	.dump = netlink_diag_handler_dump,
 };
diff --git a/net/packet/diag.c b/net/packet/diag.c
index 9a7980e3309d..b3bd2f6c2bf7 100644
--- a/net/packet/diag.c
+++ b/net/packet/diag.c
@@ -245,6 +245,7 @@ static int packet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
 }
 
 static const struct sock_diag_handler packet_diag_handler = {
+	.owner = THIS_MODULE,
 	.family = AF_PACKET,
 	.dump = packet_diag_handler_dump,
 };
diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c
index 52f7c4f1e767..32bad267fa3e 100644
--- a/net/smc/smc_diag.c
+++ b/net/smc/smc_diag.c
@@ -255,6 +255,7 @@ static int smc_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
 }
 
 static const struct sock_diag_handler smc_diag_handler = {
+	.owner = THIS_MODULE,
 	.family = AF_SMC,
 	.dump = smc_diag_handler_dump,
 };
diff --git a/net/tipc/diag.c b/net/tipc/diag.c
index 18733451c9e0..54dde8c4e4d4 100644
--- a/net/tipc/diag.c
+++ b/net/tipc/diag.c
@@ -95,6 +95,7 @@ static int tipc_sock_diag_handler_dump(struct sk_buff *skb,
 }
 
 static const struct sock_diag_handler tipc_sock_diag_handler = {
+	.owner = THIS_MODULE,
 	.family = AF_TIPC,
 	.dump = tipc_sock_diag_handler_dump,
 };
diff --git a/net/unix/diag.c b/net/unix/diag.c
index bec09a3a1d44..c3648b706509 100644
--- a/net/unix/diag.c
+++ b/net/unix/diag.c
@@ -322,6 +322,7 @@ static int unix_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
 }
 
 static const struct sock_diag_handler unix_diag_handler = {
+	.owner = THIS_MODULE,
 	.family = AF_UNIX,
 	.dump = unix_diag_handler_dump,
 };
diff --git a/net/vmw_vsock/diag.c b/net/vmw_vsock/diag.c
index 2e29994f92ff..ab87ef66c1e8 100644
--- a/net/vmw_vsock/diag.c
+++ b/net/vmw_vsock/diag.c
@@ -157,6 +157,7 @@ static int vsock_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
 }
 
 static const struct sock_diag_handler vsock_diag_handler = {
+	.owner = THIS_MODULE,
 	.family = AF_VSOCK,
 	.dump = vsock_diag_handler_dump,
 };
diff --git a/net/xdp/xsk_diag.c b/net/xdp/xsk_diag.c
index 9f8955367275..09dcea0cbbed 100644
--- a/net/xdp/xsk_diag.c
+++ b/net/xdp/xsk_diag.c
@@ -194,6 +194,7 @@ static int xsk_diag_handler_dump(struct sk_buff *nlskb, struct nlmsghdr *hdr)
 }
 
 static const struct sock_diag_handler xsk_diag_handler = {
+	.owner = THIS_MODULE,
 	.family = AF_XDP,
 	.dump = xsk_diag_handler_dump,
 };
-- 
cgit v1.2.3


From 86e8921df05c6e9423ab74ab8d41022775d8b83a Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 22 Jan 2024 11:26:01 +0000
Subject: sock_diag: allow concurrent operation in sock_diag_rcv_msg()

TCPDIAG_GETSOCK and DCCPDIAG_GETSOCK diag are serialized
on sock_diag_table_mutex.

This is to make sure inet_diag module is not unloaded
while diag was ongoing.

It is time to get rid of this mutex and use RCU protection,
allowing full parallelism.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Guillaume Nault <gnault@redhat.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/sock_diag.h |  9 +++++++--
 net/core/sock_diag.c      | 43 ++++++++++++++++++++++++++-----------------
 net/ipv4/inet_diag.c      |  9 +++++++--
 3 files changed, 40 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/include/linux/sock_diag.h b/include/linux/sock_diag.h
index 7c07754d711b..110978dc9af1 100644
--- a/include/linux/sock_diag.h
+++ b/include/linux/sock_diag.h
@@ -23,8 +23,13 @@ struct sock_diag_handler {
 int sock_diag_register(const struct sock_diag_handler *h);
 void sock_diag_unregister(const struct sock_diag_handler *h);
 
-void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh));
-void sock_diag_unregister_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh));
+struct sock_diag_inet_compat {
+	struct module *owner;
+	int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh);
+};
+
+void sock_diag_register_inet_compat(const struct sock_diag_inet_compat *ptr);
+void sock_diag_unregister_inet_compat(const struct sock_diag_inet_compat *ptr);
 
 u64 __sock_gen_cookie(struct sock *sk);
 
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index 72009e1f4380..5c3666431df4 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -17,8 +17,9 @@
 #include <linux/sock_diag.h>
 
 static const struct sock_diag_handler __rcu *sock_diag_handlers[AF_MAX];
-static int (*inet_rcv_compat)(struct sk_buff *skb, struct nlmsghdr *nlh);
-static DEFINE_MUTEX(sock_diag_table_mutex);
+
+static struct sock_diag_inet_compat __rcu *inet_rcv_compat;
+
 static struct workqueue_struct *broadcast_wq;
 
 DEFINE_COOKIE(sock_cookie);
@@ -184,19 +185,20 @@ void sock_diag_broadcast_destroy(struct sock *sk)
 	queue_work(broadcast_wq, &bsk->work);
 }
 
-void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh))
+void sock_diag_register_inet_compat(const struct sock_diag_inet_compat *ptr)
 {
-	mutex_lock(&sock_diag_table_mutex);
-	inet_rcv_compat = fn;
-	mutex_unlock(&sock_diag_table_mutex);
+	xchg((__force const struct sock_diag_inet_compat **)&inet_rcv_compat,
+	     ptr);
 }
 EXPORT_SYMBOL_GPL(sock_diag_register_inet_compat);
 
-void sock_diag_unregister_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh))
+void sock_diag_unregister_inet_compat(const struct sock_diag_inet_compat *ptr)
 {
-	mutex_lock(&sock_diag_table_mutex);
-	inet_rcv_compat = NULL;
-	mutex_unlock(&sock_diag_table_mutex);
+	const struct sock_diag_inet_compat *old;
+
+	old = xchg((__force const struct sock_diag_inet_compat **)&inet_rcv_compat,
+		   NULL);
+	WARN_ON_ONCE(old != ptr);
 }
 EXPORT_SYMBOL_GPL(sock_diag_unregister_inet_compat);
 
@@ -259,20 +261,27 @@ static int __sock_diag_cmd(struct sk_buff *skb, struct nlmsghdr *nlh)
 static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
 			     struct netlink_ext_ack *extack)
 {
+	const struct sock_diag_inet_compat *ptr;
 	int ret;
 
 	switch (nlh->nlmsg_type) {
 	case TCPDIAG_GETSOCK:
 	case DCCPDIAG_GETSOCK:
-		if (inet_rcv_compat == NULL)
+
+		if (!rcu_access_pointer(inet_rcv_compat))
 			sock_load_diag_module(AF_INET, 0);
 
-		mutex_lock(&sock_diag_table_mutex);
-		if (inet_rcv_compat != NULL)
-			ret = inet_rcv_compat(skb, nlh);
-		else
-			ret = -EOPNOTSUPP;
-		mutex_unlock(&sock_diag_table_mutex);
+		rcu_read_lock();
+		ptr = rcu_dereference(inet_rcv_compat);
+		if (ptr && !try_module_get(ptr->owner))
+			ptr = NULL;
+		rcu_read_unlock();
+
+		ret = -EOPNOTSUPP;
+		if (ptr) {
+			ret = ptr->fn(skb, nlh);
+			module_put(ptr->owner);
+		}
 
 		return ret;
 	case SOCK_DIAG_BY_FAMILY:
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 52ce20691e4e..2c2d8b9dd8e9 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -1527,6 +1527,11 @@ void inet_diag_unregister(const struct inet_diag_handler *h)
 }
 EXPORT_SYMBOL_GPL(inet_diag_unregister);
 
+static const struct sock_diag_inet_compat inet_diag_compat = {
+	.owner	= THIS_MODULE,
+	.fn	= inet_diag_rcv_msg_compat,
+};
+
 static int __init inet_diag_init(void)
 {
 	const int inet_diag_table_size = (IPPROTO_MAX *
@@ -1545,7 +1550,7 @@ static int __init inet_diag_init(void)
 	if (err)
 		goto out_free_inet;
 
-	sock_diag_register_inet_compat(inet_diag_rcv_msg_compat);
+	sock_diag_register_inet_compat(&inet_diag_compat);
 out:
 	return err;
 
@@ -1560,7 +1565,7 @@ static void __exit inet_diag_exit(void)
 {
 	sock_diag_unregister(&inet6_diag_handler);
 	sock_diag_unregister(&inet_diag_handler);
-	sock_diag_unregister_inet_compat(inet_diag_rcv_msg_compat);
+	sock_diag_unregister_inet_compat(&inet_diag_compat);
 	kfree(inet_diag_table);
 }
 
-- 
cgit v1.2.3


From 7c05e7f3e74e7e550534d524e04d7e6f78d6fa24 Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Fri, 5 Jan 2024 18:48:17 +0800
Subject: bpf: Support inlining bpf_kptr_xchg() helper

The motivation of inlining bpf_kptr_xchg() comes from the performance
profiling of bpf memory allocator benchmark. The benchmark uses
bpf_kptr_xchg() to stash the allocated objects and to pop the stashed
objects for free. After inling bpf_kptr_xchg(), the performance for
object free on 8-CPUs VM increases about 2%~10%. The inline also has
downside: both the kasan and kcsan checks on the pointer will be
unavailable.

bpf_kptr_xchg() can be inlined by converting the calling of
bpf_kptr_xchg() into an atomic_xchg() instruction. But the conversion
depends on two conditions:
1) JIT backend supports atomic_xchg() on pointer-sized word
2) For the specific arch, the implementation of xchg is the same as
   atomic_xchg() on pointer-sized words.

It seems most 64-bit JIT backends satisfies these two conditions. But
as a precaution, defining a weak function bpf_jit_supports_ptr_xchg()
to state whether such conversion is safe and only supporting inline for
64-bit host.

For x86-64, it supports BPF_XCHG atomic operation and both xchg() and
atomic_xchg() use arch_xchg() to implement the exchange, so enabling the
inline of bpf_kptr_xchg() on x86-64 first.

Reviewed-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20240105104819.3916743-2-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/x86/net/bpf_jit_comp.c |  5 +++++
 include/linux/filter.h      |  1 +
 kernel/bpf/core.c           | 10 ++++++++++
 kernel/bpf/helpers.c        |  1 +
 kernel/bpf/verifier.c       | 17 +++++++++++++++++
 5 files changed, 34 insertions(+)

(limited to 'include')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 919f647c740f..e1390d1e331b 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -3242,3 +3242,8 @@ void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke,
 		BUG_ON(ret < 0);
 	}
 }
+
+bool bpf_jit_supports_ptr_xchg(void)
+{
+	return true;
+}
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 68fb6c8142fe..35f067fd3840 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -955,6 +955,7 @@ bool bpf_jit_supports_subprog_tailcalls(void);
 bool bpf_jit_supports_kfunc_call(void);
 bool bpf_jit_supports_far_kfunc_call(void);
 bool bpf_jit_supports_exceptions(void);
+bool bpf_jit_supports_ptr_xchg(void);
 void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie);
 bool bpf_helper_changes_pkt_data(void *func);
 
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index ea6843be2616..fbb1d95a9b44 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2925,6 +2925,16 @@ bool __weak bpf_jit_supports_far_kfunc_call(void)
 	return false;
 }
 
+/* Return TRUE if the JIT backend satisfies the following two conditions:
+ * 1) JIT backend supports atomic_xchg() on pointer-sized words.
+ * 2) Under the specific arch, the implementation of xchg() is the same
+ *    as atomic_xchg() on pointer-sized words.
+ */
+bool __weak bpf_jit_supports_ptr_xchg(void)
+{
+	return false;
+}
+
 /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
  * skb_copy_bits(), so provide a weak definition of it for NET-less config.
  */
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index be72824f32b2..e04ca1af8927 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1414,6 +1414,7 @@ BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr)
 {
 	unsigned long *kptr = map_value;
 
+	/* This helper may be inlined by verifier. */
 	return xchg(kptr, (unsigned long)ptr);
 }
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 65f598694d55..5b33d65eef7b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -19809,6 +19809,23 @@ patch_map_ops_generic:
 			continue;
 		}
 
+		/* Implement bpf_kptr_xchg inline */
+		if (prog->jit_requested && BITS_PER_LONG == 64 &&
+		    insn->imm == BPF_FUNC_kptr_xchg &&
+		    bpf_jit_supports_ptr_xchg()) {
+			insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_2);
+			insn_buf[1] = BPF_ATOMIC_OP(BPF_DW, BPF_XCHG, BPF_REG_1, BPF_REG_0, 0);
+			cnt = 2;
+
+			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+			if (!new_prog)
+				return -ENOMEM;
+
+			delta    += cnt - 1;
+			env->prog = prog = new_prog;
+			insn      = new_prog->insnsi + i + delta;
+			continue;
+		}
 patch_call_imm:
 		fn = env->ops->get_func_proto(insn->imm, env->prog);
 		/* all functions that have prototype and verifier allowed
-- 
cgit v1.2.3


From 522bb2c1f82b12eb7befaae815d1d959b8e6bba2 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 4 Jan 2024 16:09:05 -0800
Subject: bpf: support multiple tags per argument

Add ability to iterate multiple decl_tag types pointed to the same
function argument. Use this to support multiple __arg_xxx tags per
global subprog argument.

We leave btf_find_decl_tag_value() intact, but change its implementation
to use a new btf_find_next_decl_tag() which can be straightforwardly
used to find next BTF type ID of a matching btf_decl_tag type.
btf_prepare_func_args() is switched from btf_find_decl_tag_value() to
btf_find_next_decl_tag() to gain multiple tags per argument support.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20240105000909.2818934-5-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h |  2 ++
 kernel/bpf/btf.c    | 64 ++++++++++++++++++++++++++++++++++-------------------
 2 files changed, 43 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e30100597d0a..377857b232c6 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2472,6 +2472,8 @@ int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *pr
 			 struct btf *btf, const struct btf_type *t);
 const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt,
 				    int comp_idx, const char *tag_key);
+int btf_find_next_decl_tag(const struct btf *btf, const struct btf_type *pt,
+			   int comp_idx, const char *tag_key, int last_id);
 
 struct bpf_prog *bpf_prog_by_id(u32 id);
 struct bpf_link *bpf_link_by_id(u32 id);
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 1636b574a63d..4f532b303a27 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3310,30 +3310,48 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t,
 	return BTF_FIELD_FOUND;
 }
 
-const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt,
-				    int comp_idx, const char *tag_key)
+int btf_find_next_decl_tag(const struct btf *btf, const struct btf_type *pt,
+			   int comp_idx, const char *tag_key, int last_id)
 {
-	const char *value = NULL;
-	int i;
+	int len = strlen(tag_key);
+	int i, n;
 
-	for (i = 1; i < btf_nr_types(btf); i++) {
+	for (i = last_id + 1, n = btf_nr_types(btf); i < n; i++) {
 		const struct btf_type *t = btf_type_by_id(btf, i);
-		int len = strlen(tag_key);
 
 		if (!btf_type_is_decl_tag(t))
 			continue;
-		if (pt != btf_type_by_id(btf, t->type) ||
-		    btf_type_decl_tag(t)->component_idx != comp_idx)
+		if (pt != btf_type_by_id(btf, t->type))
+			continue;
+		if (btf_type_decl_tag(t)->component_idx != comp_idx)
 			continue;
 		if (strncmp(__btf_name_by_offset(btf, t->name_off), tag_key, len))
 			continue;
-		/* Prevent duplicate entries for same type */
-		if (value)
-			return ERR_PTR(-EEXIST);
-		value = __btf_name_by_offset(btf, t->name_off) + len;
+		return i;
 	}
-	if (!value)
-		return ERR_PTR(-ENOENT);
+	return -ENOENT;
+}
+
+const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt,
+				    int comp_idx, const char *tag_key)
+{
+	const char *value = NULL;
+	const struct btf_type *t;
+	int len, id;
+
+	id = btf_find_next_decl_tag(btf, pt, comp_idx, tag_key, 0);
+	if (id < 0)
+		return ERR_PTR(id);
+
+	t = btf_type_by_id(btf, id);
+	len = strlen(tag_key);
+	value = __btf_name_by_offset(btf, t->name_off) + len;
+
+	/* Prevent duplicate entries for same type */
+	id = btf_find_next_decl_tag(btf, pt, comp_idx, tag_key, id);
+	if (id >= 0)
+		return ERR_PTR(-EEXIST);
+
 	return value;
 }
 
@@ -7032,20 +7050,16 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
 	 * Only PTR_TO_CTX and SCALAR are supported atm.
 	 */
 	for (i = 0; i < nargs; i++) {
-		const char *tag;
 		u32 tags = 0;
+		int id = 0;
 
-		tag = btf_find_decl_tag_value(btf, fn_t, i, "arg:");
-		if (IS_ERR(tag) && PTR_ERR(tag) == -ENOENT) {
-			tag = NULL;
-		} else if (IS_ERR(tag)) {
-			bpf_log(log, "arg#%d type's tag fetching failure: %ld\n", i, PTR_ERR(tag));
-			return PTR_ERR(tag);
-		}
 		/* 'arg:<tag>' decl_tag takes precedence over derivation of
 		 * register type from BTF type itself
 		 */
-		if (tag) {
+		while ((id = btf_find_next_decl_tag(btf, fn_t, i, "arg:", id)) > 0) {
+			const struct btf_type *tag_t = btf_type_by_id(btf, id);
+			const char *tag = __btf_name_by_offset(btf, tag_t->name_off) + 4;
+
 			/* disallow arg tags in static subprogs */
 			if (!is_global) {
 				bpf_log(log, "arg#%d type tag is not supported in static functions\n", i);
@@ -7061,6 +7075,10 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
 				return -EOPNOTSUPP;
 			}
 		}
+		if (id != -ENOENT) {
+			bpf_log(log, "arg#%d type tag fetching failure: %d\n", i, id);
+			return id;
+		}
 
 		t = btf_type_by_id(btf, args[i].type);
 		while (btf_type_is_modifier(t))
-- 
cgit v1.2.3


From 32f55dd4add4df1a5bc8febc1fafd3086290dbf6 Mon Sep 17 00:00:00 2001
From: Maxim Mikityanskiy <maxim@isovalent.com>
Date: Mon, 8 Jan 2024 22:51:58 +0200
Subject: bpf: Make bpf_for_each_spilled_reg consider narrow spills

Adjust the check in bpf_get_spilled_reg to take into account spilled
registers narrower than 64 bits. That allows find_equal_scalars to
properly adjust the range of all spilled registers that have the same
ID. Before this change, it was possible for a register and a spilled
register to have the same IDs but different ranges if the spill was
narrower than 64 bits and a range check was performed on the register.

Signed-off-by: Maxim Mikityanskiy <maxim@isovalent.com>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20240108205209.838365-5-maxtram95@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index d07d857ca67f..e11baecbde68 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -453,7 +453,7 @@ struct bpf_verifier_state {
 
 #define bpf_get_spilled_reg(slot, frame, mask)				\
 	(((slot < frame->allocated_stack / BPF_REG_SIZE) &&		\
-	  ((1 << frame->stack[slot].slot_type[0]) & (mask))) \
+	  ((1 << frame->stack[slot].slot_type[BPF_REG_SIZE - 1]) & (mask))) \
 	 ? &frame->stack[slot].spilled_ptr : NULL)
 
 /* Iterate over 'frame', setting 'reg' to either NULL or a spilled register. */
-- 
cgit v1.2.3


From b18afb6f42292a9154102fed7265ae747bbfbc57 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Mon, 15 Jan 2024 12:55:09 -0800
Subject: tcp: Move tcp_ns_to_ts() to tcp.h

We will support arbitrary SYN Cookie with BPF.

When BPF prog validates ACK and kfunc allocates a reqsk, we need
to call tcp_ns_to_ts() to calculate an offset of TSval for later
use:

  time
  t0 : Send SYN+ACK
       -> tsval = Initial TSval (Random Number)

  t1 : Recv ACK of 3WHS
       -> tsoff = TSecr - tcp_ns_to_ts(usec_ts_ok, tcp_clock_ns())
                = Initial TSval - t1

  t2 : Send ACK
       -> tsval = t2 + tsoff
                = Initial TSval + (t2 - t1)
                = Initial TSval + Time Delta (x)

  (x) Note that the time delta does not include the initial RTT
      from t0 to t1.

Let's move tcp_ns_to_ts() to tcp.h.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20240115205514.68364-2-kuniyu@amazon.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/net/tcp.h     | 9 +++++++++
 net/ipv4/syncookies.c | 9 ---------
 2 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index dd78a1181031..114000e71a46 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -577,6 +577,15 @@ static inline u32 tcp_cookie_time(void)
 	return val;
 }
 
+/* Convert one nsec 64bit timestamp to ts (ms or usec resolution) */
+static inline u64 tcp_ns_to_ts(bool usec_ts, u64 val)
+{
+	if (usec_ts)
+		return div_u64(val, NSEC_PER_USEC);
+
+	return div_u64(val, NSEC_PER_MSEC);
+}
+
 u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
 			      u16 *mssp);
 __u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mss);
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 61f1c96cfe63..981944c22820 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -51,15 +51,6 @@ static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
 			    count, &syncookie_secret[c]);
 }
 
-/* Convert one nsec 64bit timestamp to ts (ms or usec resolution) */
-static u64 tcp_ns_to_ts(bool usec_ts, u64 val)
-{
-	if (usec_ts)
-		return div_u64(val, NSEC_PER_USEC);
-
-	return div_u64(val, NSEC_PER_MSEC);
-}
-
 /*
  * when syncookies are in effect and tcp timestamps are enabled we encode
  * tcp options in the lower bits of the timestamp value that will be
-- 
cgit v1.2.3


From 95e752b5299fa8c90099f7bc2aa1ee3e2e2c95ab Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Mon, 15 Jan 2024 12:55:10 -0800
Subject: tcp: Move skb_steal_sock() to request_sock.h

We will support arbitrary SYN Cookie with BPF.

If BPF prog validates ACK and kfunc allocates a reqsk, it will
be carried to TCP stack as skb->sk with req->syncookie 1.

In skb_steal_sock(), we need to check inet_reqsk(sk)->syncookie
to see if the reqsk is created by kfunc.  However, inet_reqsk()
is not available in sock.h.

Let's move skb_steal_sock() to request_sock.h.

While at it, we refactor skb_steal_sock() so it returns early if
skb->sk is NULL to minimise the following patch.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20240115205514.68364-3-kuniyu@amazon.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/net/request_sock.h | 28 ++++++++++++++++++++++++++++
 include/net/sock.h         | 25 -------------------------
 2 files changed, 28 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index 144c39db9898..26c630c40abb 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -83,6 +83,34 @@ static inline struct sock *req_to_sk(struct request_sock *req)
 	return (struct sock *)req;
 }
 
+/**
+ * skb_steal_sock - steal a socket from an sk_buff
+ * @skb: sk_buff to steal the socket from
+ * @refcounted: is set to true if the socket is reference-counted
+ * @prefetched: is set to true if the socket was assigned from bpf
+ */
+static inline struct sock *skb_steal_sock(struct sk_buff *skb,
+					  bool *refcounted, bool *prefetched)
+{
+	struct sock *sk = skb->sk;
+
+	if (!sk) {
+		*prefetched = false;
+		*refcounted = false;
+		return NULL;
+	}
+
+	*prefetched = skb_sk_is_prefetched(skb);
+	if (*prefetched)
+		*refcounted = sk_is_refcounted(sk);
+	else
+		*refcounted = true;
+
+	skb->destructor = NULL;
+	skb->sk = NULL;
+	return sk;
+}
+
 static inline struct request_sock *
 reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener,
 	    bool attach_listener)
diff --git a/include/net/sock.h b/include/net/sock.h
index a7f815c7cfdf..32a399fdcbb5 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2814,31 +2814,6 @@ sk_is_refcounted(struct sock *sk)
 	return !sk_fullsock(sk) || !sock_flag(sk, SOCK_RCU_FREE);
 }
 
-/**
- * skb_steal_sock - steal a socket from an sk_buff
- * @skb: sk_buff to steal the socket from
- * @refcounted: is set to true if the socket is reference-counted
- * @prefetched: is set to true if the socket was assigned from bpf
- */
-static inline struct sock *
-skb_steal_sock(struct sk_buff *skb, bool *refcounted, bool *prefetched)
-{
-	if (skb->sk) {
-		struct sock *sk = skb->sk;
-
-		*refcounted = true;
-		*prefetched = skb_sk_is_prefetched(skb);
-		if (*prefetched)
-			*refcounted = sk_is_refcounted(sk);
-		skb->destructor = NULL;
-		skb->sk = NULL;
-		return sk;
-	}
-	*prefetched = false;
-	*refcounted = false;
-	return NULL;
-}
-
 /* Checks if this SKB belongs to an HW offloaded socket
  * and whether any SW fallbacks are required based on dev.
  * Check decrypted mark in case skb_orphan() cleared socket.
-- 
cgit v1.2.3


From 8b5ac68fb5ee416537c1214cbacf0ddc4293cce9 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Mon, 15 Jan 2024 12:55:11 -0800
Subject: bpf: tcp: Handle BPF SYN Cookie in skb_steal_sock().

We will support arbitrary SYN Cookie with BPF.

If BPF prog validates ACK and kfunc allocates a reqsk, it will
be carried to TCP stack as skb->sk with req->syncookie 1.  Also,
the reqsk has its listener as req->rsk_listener with no refcnt
taken.

When the TCP stack looks up a socket from the skb, we steal
inet_reqsk(skb->sk)->rsk_listener in skb_steal_sock() so that
the skb will be processed in cookie_v[46]_check() with the
listener.

Note that we do not clear skb->sk and skb->destructor so that we
can carry the reqsk to cookie_v[46]_check().

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Link: https://lore.kernel.org/r/20240115205514.68364-4-kuniyu@amazon.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/net/request_sock.h | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index 26c630c40abb..8839133d6f6b 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -101,10 +101,21 @@ static inline struct sock *skb_steal_sock(struct sk_buff *skb,
 	}
 
 	*prefetched = skb_sk_is_prefetched(skb);
-	if (*prefetched)
+	if (*prefetched) {
+#if IS_ENABLED(CONFIG_SYN_COOKIES)
+		if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) {
+			struct request_sock *req = inet_reqsk(sk);
+
+			*refcounted = false;
+			sk = req->rsk_listener;
+			req->rsk_listener = NULL;
+			return sk;
+		}
+#endif
 		*refcounted = sk_is_refcounted(sk);
-	else
+	} else {
 		*refcounted = true;
+	}
 
 	skb->destructor = NULL;
 	skb->sk = NULL;
-- 
cgit v1.2.3


From 695751e31a63efd2bbe6779873adf1e4deb00cd5 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Mon, 15 Jan 2024 12:55:12 -0800
Subject: bpf: tcp: Handle BPF SYN Cookie in cookie_v[46]_check().

We will support arbitrary SYN Cookie with BPF in the following
patch.

If BPF prog validates ACK and kfunc allocates a reqsk, it will
be carried to cookie_[46]_check() as skb->sk.  If skb->sk is not
NULL, we call cookie_bpf_check().

Then, we clear skb->sk and skb->destructor, which are needed not
to hold refcnt for reqsk and the listener.  See the following patch
for details.

After that, we finish initialisation for the remaining fields with
cookie_tcp_reqsk_init().

Note that the server side WScale is set only for non-BPF SYN Cookie.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Link: https://lore.kernel.org/r/20240115205514.68364-5-kuniyu@amazon.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/net/tcp.h     | 20 ++++++++++++++++++++
 net/ipv4/syncookies.c | 31 +++++++++++++++++++++++++++----
 net/ipv6/syncookies.c | 13 +++++++++----
 3 files changed, 56 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 114000e71a46..dfe99a084a71 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -599,6 +599,26 @@ static inline bool cookie_ecn_ok(const struct net *net, const struct dst_entry *
 		dst_feature(dst, RTAX_FEATURE_ECN);
 }
 
+#if IS_ENABLED(CONFIG_BPF)
+static inline bool cookie_bpf_ok(struct sk_buff *skb)
+{
+	return skb->sk;
+}
+
+struct request_sock *cookie_bpf_check(struct sock *sk, struct sk_buff *skb);
+#else
+static inline bool cookie_bpf_ok(struct sk_buff *skb)
+{
+	return false;
+}
+
+static inline struct request_sock *cookie_bpf_check(struct net *net, struct sock *sk,
+						    struct sk_buff *skb)
+{
+	return NULL;
+}
+#endif
+
 /* From net/ipv6/syncookies.c */
 int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th);
 struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb);
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 981944c22820..be88bf586ff9 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -295,6 +295,24 @@ static int cookie_tcp_reqsk_init(struct sock *sk, struct sk_buff *skb,
 	return 0;
 }
 
+#if IS_ENABLED(CONFIG_BPF)
+struct request_sock *cookie_bpf_check(struct sock *sk, struct sk_buff *skb)
+{
+	struct request_sock *req = inet_reqsk(skb->sk);
+
+	skb->sk = NULL;
+	skb->destructor = NULL;
+
+	if (cookie_tcp_reqsk_init(sk, skb, req)) {
+		reqsk_free(req);
+		req = NULL;
+	}
+
+	return req;
+}
+EXPORT_SYMBOL_GPL(cookie_bpf_check);
+#endif
+
 struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops,
 					    struct sock *sk, struct sk_buff *skb,
 					    struct tcp_options_received *tcp_opt,
@@ -395,9 +413,13 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
 	    !th->ack || th->rst)
 		goto out;
 
-	req = cookie_tcp_check(net, sk, skb);
-	if (IS_ERR(req))
-		goto out;
+	if (cookie_bpf_ok(skb)) {
+		req = cookie_bpf_check(sk, skb);
+	} else {
+		req = cookie_tcp_check(net, sk, skb);
+		if (IS_ERR(req))
+			goto out;
+	}
 	if (!req)
 		goto out_drop;
 
@@ -445,7 +467,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
 				  ireq->wscale_ok, &rcv_wscale,
 				  dst_metric(&rt->dst, RTAX_INITRWND));
 
-	ireq->rcv_wscale  = rcv_wscale;
+	if (!req->syncookie)
+		ireq->rcv_wscale = rcv_wscale;
 	ireq->ecn_ok &= cookie_ecn_ok(net, &rt->dst);
 
 	ret = tcp_get_cookie_sock(sk, skb, req, &rt->dst);
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index c8d2ca27220c..6b9c69278819 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -182,9 +182,13 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 	    !th->ack || th->rst)
 		goto out;
 
-	req = cookie_tcp_check(net, sk, skb);
-	if (IS_ERR(req))
-		goto out;
+	if (cookie_bpf_ok(skb)) {
+		req = cookie_bpf_check(sk, skb);
+	} else {
+		req = cookie_tcp_check(net, sk, skb);
+		if (IS_ERR(req))
+			goto out;
+	}
 	if (!req)
 		goto out_drop;
 
@@ -247,7 +251,8 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 				  ireq->wscale_ok, &rcv_wscale,
 				  dst_metric(dst, RTAX_INITRWND));
 
-	ireq->rcv_wscale = rcv_wscale;
+	if (!req->syncookie)
+		ireq->rcv_wscale = rcv_wscale;
 	ireq->ecn_ok &= cookie_ecn_ok(net, dst);
 
 	ret = tcp_get_cookie_sock(sk, skb, req, dst);
-- 
cgit v1.2.3


From e472f88891abbc535a5e16a68a104073985f6061 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Mon, 15 Jan 2024 12:55:13 -0800
Subject: bpf: tcp: Support arbitrary SYN Cookie.

This patch adds a new kfunc available at TC hook to support arbitrary
SYN Cookie.

The basic usage is as follows:

    struct bpf_tcp_req_attrs attrs = {
        .mss = mss,
        .wscale_ok = wscale_ok,
        .rcv_wscale = rcv_wscale, /* Server's WScale < 15 */
        .snd_wscale = snd_wscale, /* Client's WScale < 15 */
        .tstamp_ok = tstamp_ok,
        .rcv_tsval = tsval,
        .rcv_tsecr = tsecr, /* Server's Initial TSval */
        .usec_ts_ok = usec_ts_ok,
        .sack_ok = sack_ok,
        .ecn_ok = ecn_ok,
    }

    skc = bpf_skc_lookup_tcp(...);
    sk = (struct sock *)bpf_skc_to_tcp_sock(skc);
    bpf_sk_assign_tcp_reqsk(skb, sk, attrs, sizeof(attrs));
    bpf_sk_release(skc);

bpf_sk_assign_tcp_reqsk() takes skb, a listener sk, and struct
bpf_tcp_req_attrs and allocates reqsk and configures it.  Then,
bpf_sk_assign_tcp_reqsk() links reqsk with skb and the listener.

The notable thing here is that we do not hold refcnt for both reqsk
and listener.  To differentiate that, we mark reqsk->syncookie, which
is only used in TX for now.  So, if reqsk->syncookie is 1 in RX, it
means that the reqsk is allocated by kfunc.

When skb is freed, sock_pfree() checks if reqsk->syncookie is 1,
and in that case, we set NULL to reqsk->rsk_listener before calling
reqsk_free() as reqsk does not hold a refcnt of the listener.

When the TCP stack looks up a socket from the skb, we steal the
listener from the reqsk in skb_steal_sock() and create a full sk
in cookie_v[46]_check().

The refcnt of reqsk will finally be set to 1 in tcp_get_cookie_sock()
after creating a full sk.

Note that we can extend struct bpf_tcp_req_attrs in the future when
we add a new attribute that is determined in 3WHS.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Link: https://lore.kernel.org/r/20240115205514.68364-6-kuniyu@amazon.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/net/tcp.h |  14 +++++++
 net/core/filter.c | 111 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 net/core/sock.c   |  14 ++++++-
 3 files changed, 135 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index dfe99a084a71..451dc1373970 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -600,6 +600,20 @@ static inline bool cookie_ecn_ok(const struct net *net, const struct dst_entry *
 }
 
 #if IS_ENABLED(CONFIG_BPF)
+struct bpf_tcp_req_attrs {
+	u32 rcv_tsval;
+	u32 rcv_tsecr;
+	u16 mss;
+	u8 rcv_wscale;
+	u8 snd_wscale;
+	u8 ecn_ok;
+	u8 wscale_ok;
+	u8 sack_ok;
+	u8 tstamp_ok;
+	u8 usec_ts_ok;
+	u8 reserved[3];
+};
+
 static inline bool cookie_bpf_ok(struct sk_buff *skb)
 {
 	return skb->sk;
diff --git a/net/core/filter.c b/net/core/filter.c
index 8c9f67c81e22..6a7abbaa50b8 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -11837,6 +11837,103 @@ __bpf_kfunc int bpf_sock_addr_set_sun_path(struct bpf_sock_addr_kern *sa_kern,
 
 	return 0;
 }
+
+__bpf_kfunc int bpf_sk_assign_tcp_reqsk(struct sk_buff *skb, struct sock *sk,
+					struct bpf_tcp_req_attrs *attrs, int attrs__sz)
+{
+#if IS_ENABLED(CONFIG_SYN_COOKIES)
+	const struct request_sock_ops *ops;
+	struct inet_request_sock *ireq;
+	struct tcp_request_sock *treq;
+	struct request_sock *req;
+	struct net *net;
+	__u16 min_mss;
+	u32 tsoff = 0;
+
+	if (attrs__sz != sizeof(*attrs) ||
+	    attrs->reserved[0] || attrs->reserved[1] || attrs->reserved[2])
+		return -EINVAL;
+
+	if (!skb_at_tc_ingress(skb))
+		return -EINVAL;
+
+	net = dev_net(skb->dev);
+	if (net != sock_net(sk))
+		return -ENETUNREACH;
+
+	switch (skb->protocol) {
+	case htons(ETH_P_IP):
+		ops = &tcp_request_sock_ops;
+		min_mss = 536;
+		break;
+#if IS_BUILTIN(CONFIG_IPV6)
+	case htons(ETH_P_IPV6):
+		ops = &tcp6_request_sock_ops;
+		min_mss = IPV6_MIN_MTU - 60;
+		break;
+#endif
+	default:
+		return -EINVAL;
+	}
+
+	if (sk->sk_type != SOCK_STREAM || sk->sk_state != TCP_LISTEN ||
+	    sk_is_mptcp(sk))
+		return -EINVAL;
+
+	if (attrs->mss < min_mss)
+		return -EINVAL;
+
+	if (attrs->wscale_ok) {
+		if (!READ_ONCE(net->ipv4.sysctl_tcp_window_scaling))
+			return -EINVAL;
+
+		if (attrs->snd_wscale > TCP_MAX_WSCALE ||
+		    attrs->rcv_wscale > TCP_MAX_WSCALE)
+			return -EINVAL;
+	}
+
+	if (attrs->sack_ok && !READ_ONCE(net->ipv4.sysctl_tcp_sack))
+		return -EINVAL;
+
+	if (attrs->tstamp_ok) {
+		if (!READ_ONCE(net->ipv4.sysctl_tcp_timestamps))
+			return -EINVAL;
+
+		tsoff = attrs->rcv_tsecr - tcp_ns_to_ts(attrs->usec_ts_ok, tcp_clock_ns());
+	}
+
+	req = inet_reqsk_alloc(ops, sk, false);
+	if (!req)
+		return -ENOMEM;
+
+	ireq = inet_rsk(req);
+	treq = tcp_rsk(req);
+
+	req->rsk_listener = sk;
+	req->syncookie = 1;
+	req->mss = attrs->mss;
+	req->ts_recent = attrs->rcv_tsval;
+
+	ireq->snd_wscale = attrs->snd_wscale;
+	ireq->rcv_wscale = attrs->rcv_wscale;
+	ireq->tstamp_ok	= !!attrs->tstamp_ok;
+	ireq->sack_ok = !!attrs->sack_ok;
+	ireq->wscale_ok = !!attrs->wscale_ok;
+	ireq->ecn_ok = !!attrs->ecn_ok;
+
+	treq->req_usec_ts = !!attrs->usec_ts_ok;
+	treq->ts_off = tsoff;
+
+	skb_orphan(skb);
+	skb->sk = req_to_sk(req);
+	skb->destructor = sock_pfree;
+
+	return 0;
+#else
+	return -EOPNOTSUPP;
+#endif
+}
+
 __bpf_kfunc_end_defs();
 
 int bpf_dynptr_from_skb_rdonly(struct sk_buff *skb, u64 flags,
@@ -11865,6 +11962,10 @@ BTF_SET8_START(bpf_kfunc_check_set_sock_addr)
 BTF_ID_FLAGS(func, bpf_sock_addr_set_sun_path)
 BTF_SET8_END(bpf_kfunc_check_set_sock_addr)
 
+BTF_SET8_START(bpf_kfunc_check_set_tcp_reqsk)
+BTF_ID_FLAGS(func, bpf_sk_assign_tcp_reqsk, KF_TRUSTED_ARGS)
+BTF_SET8_END(bpf_kfunc_check_set_tcp_reqsk)
+
 static const struct btf_kfunc_id_set bpf_kfunc_set_skb = {
 	.owner = THIS_MODULE,
 	.set = &bpf_kfunc_check_set_skb,
@@ -11880,6 +11981,11 @@ static const struct btf_kfunc_id_set bpf_kfunc_set_sock_addr = {
 	.set = &bpf_kfunc_check_set_sock_addr,
 };
 
+static const struct btf_kfunc_id_set bpf_kfunc_set_tcp_reqsk = {
+	.owner = THIS_MODULE,
+	.set = &bpf_kfunc_check_set_tcp_reqsk,
+};
+
 static int __init bpf_kfunc_init(void)
 {
 	int ret;
@@ -11895,8 +12001,9 @@ static int __init bpf_kfunc_init(void)
 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_SEG6LOCAL, &bpf_kfunc_set_skb);
 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_NETFILTER, &bpf_kfunc_set_skb);
 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp);
-	return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
-						&bpf_kfunc_set_sock_addr);
+	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
+					       &bpf_kfunc_set_sock_addr);
+	return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_tcp_reqsk);
 }
 late_initcall(bpf_kfunc_init);
 
diff --git a/net/core/sock.c b/net/core/sock.c
index 158dbdebce6a..147fb2656e6b 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2582,8 +2582,18 @@ EXPORT_SYMBOL(sock_efree);
 #ifdef CONFIG_INET
 void sock_pfree(struct sk_buff *skb)
 {
-	if (sk_is_refcounted(skb->sk))
-		sock_gen_put(skb->sk);
+	struct sock *sk = skb->sk;
+
+	if (!sk_is_refcounted(sk))
+		return;
+
+	if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) {
+		inet_reqsk(sk)->rsk_listener = NULL;
+		reqsk_free(inet_reqsk(sk));
+		return;
+	}
+
+	sock_gen_put(sk);
 }
 EXPORT_SYMBOL(sock_pfree);
 #endif /* CONFIG_INET */
-- 
cgit v1.2.3


From f98df79bf7f772597313adca2720cb38770490dd Mon Sep 17 00:00:00 2001
From: Victor Stewart <v@nametag.social>
Date: Tue, 16 Jan 2024 20:29:52 +0000
Subject: bpf, docs: Fix bpf_redirect_peer header doc

Amend the bpf_redirect_peer() header documentation to also mention
support for the netkit device type.

Signed-off-by: Victor Stewart <v@nametag.social>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20240116202952.241009-1-v@nametag.social
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 754e68ca8744..a00f8a5623e1 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -4839,9 +4839,9 @@ union bpf_attr {
  * 		going through the CPU's backlog queue.
  *
  * 		The *flags* argument is reserved and must be 0. The helper is
- * 		currently only supported for tc BPF program types at the ingress
- * 		hook and for veth device types. The peer device must reside in a
- * 		different network namespace.
+ * 		currently only supported for tc BPF program types at the
+ * 		ingress hook and for veth and netkit target device types. The
+ * 		peer device must reside in a different network namespace.
  * 	Return
  * 		The helper returns **TC_ACT_REDIRECT** on success or
  * 		**TC_ACT_SHOT** on error.
-- 
cgit v1.2.3


From b3f086a7a136d721d112f35fe4cd7272e93cf06b Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Thu, 18 Jan 2024 13:17:51 -0800
Subject: bpf: Define struct bpf_tcp_req_attrs when CONFIG_SYN_COOKIES=n.

kernel test robot reported the warning below:

  >> net/core/filter.c:11842:13: warning: declaration of 'struct bpf_tcp_req_attrs' will not be visible outside of this function [-Wvisibility]
      11842 |                                         struct bpf_tcp_req_attrs *attrs, int attrs__sz)
            |                                                ^
     1 warning generated.

struct bpf_tcp_req_attrs is defined under CONFIG_SYN_COOKIES
but used in kfunc without the config.

Let's move struct bpf_tcp_req_attrs definition outside of
CONFIG_SYN_COOKIES guard.

Fixes: e472f88891ab ("bpf: tcp: Support arbitrary SYN Cookie.")
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202401180418.CUVc0hxF-lkp@intel.com/
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Link: https://lore.kernel.org/r/20240118211751.25790-1-kuniyu@amazon.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/net/tcp.h | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 451dc1373970..58e65af74ad1 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -498,6 +498,22 @@ struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops,
 					    struct tcp_options_received *tcp_opt,
 					    int mss, u32 tsoff);
 
+#if IS_ENABLED(CONFIG_BPF)
+struct bpf_tcp_req_attrs {
+	u32 rcv_tsval;
+	u32 rcv_tsecr;
+	u16 mss;
+	u8 rcv_wscale;
+	u8 snd_wscale;
+	u8 ecn_ok;
+	u8 wscale_ok;
+	u8 sack_ok;
+	u8 tstamp_ok;
+	u8 usec_ts_ok;
+	u8 reserved[3];
+};
+#endif
+
 #ifdef CONFIG_SYN_COOKIES
 
 /* Syncookies use a monotonic timer which increments every 60 seconds.
@@ -600,20 +616,6 @@ static inline bool cookie_ecn_ok(const struct net *net, const struct dst_entry *
 }
 
 #if IS_ENABLED(CONFIG_BPF)
-struct bpf_tcp_req_attrs {
-	u32 rcv_tsval;
-	u32 rcv_tsecr;
-	u16 mss;
-	u8 rcv_wscale;
-	u8 snd_wscale;
-	u8 ecn_ok;
-	u8 wscale_ok;
-	u8 sack_ok;
-	u8 tstamp_ok;
-	u8 usec_ts_ok;
-	u8 reserved[3];
-};
-
 static inline bool cookie_bpf_ok(struct sk_buff *skb)
 {
 	return skb->sk;
-- 
cgit v1.2.3


From d5c16492c66fbfca85f36e42363d32212df5927b Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Fri, 19 Jan 2024 12:04:58 +0100
Subject: bpf: Add cookie to perf_event bpf_link_info records

At the moment we don't store cookie for perf_event probes,
while we do that for the rest of the probes.

Adding cookie fields to struct bpf_link_info perf event
probe records:

  perf_event.uprobe
  perf_event.kprobe
  perf_event.tracepoint
  perf_event.perf_event

And the code to store that in bpf_link_info struct.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Acked-by: Song Liu <song@kernel.org>
Acked-by: Yafang Shao <laoar.shao@gmail.com>
Link: https://lore.kernel.org/r/20240119110505.400573-2-jolsa@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h       | 6 ++++++
 kernel/bpf/syscall.c           | 4 ++++
 tools/include/uapi/linux/bpf.h | 6 ++++++
 3 files changed, 16 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index a00f8a5623e1..181e74433272 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -6582,6 +6582,7 @@ struct bpf_link_info {
 					__aligned_u64 file_name; /* in/out */
 					__u32 name_len;
 					__u32 offset; /* offset from file_name */
+					__u64 cookie;
 				} uprobe; /* BPF_PERF_EVENT_UPROBE, BPF_PERF_EVENT_URETPROBE */
 				struct {
 					__aligned_u64 func_name; /* in/out */
@@ -6589,14 +6590,19 @@ struct bpf_link_info {
 					__u32 offset; /* offset from func_name */
 					__u64 addr;
 					__u64 missed;
+					__u64 cookie;
 				} kprobe; /* BPF_PERF_EVENT_KPROBE, BPF_PERF_EVENT_KRETPROBE */
 				struct {
 					__aligned_u64 tp_name;   /* in/out */
 					__u32 name_len;
+					__u32 :32;
+					__u64 cookie;
 				} tracepoint; /* BPF_PERF_EVENT_TRACEPOINT */
 				struct {
 					__u64 config;
 					__u32 type;
+					__u32 :32;
+					__u64 cookie;
 				} event; /* BPF_PERF_EVENT_EVENT */
 			};
 		} perf_event;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index a1f18681721c..13193aaafb64 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3501,6 +3501,7 @@ static int bpf_perf_link_fill_kprobe(const struct perf_event *event,
 	if (!kallsyms_show_value(current_cred()))
 		addr = 0;
 	info->perf_event.kprobe.addr = addr;
+	info->perf_event.kprobe.cookie = event->bpf_cookie;
 	return 0;
 }
 #endif
@@ -3526,6 +3527,7 @@ static int bpf_perf_link_fill_uprobe(const struct perf_event *event,
 	else
 		info->perf_event.type = BPF_PERF_EVENT_UPROBE;
 	info->perf_event.uprobe.offset = offset;
+	info->perf_event.uprobe.cookie = event->bpf_cookie;
 	return 0;
 }
 #endif
@@ -3553,6 +3555,7 @@ static int bpf_perf_link_fill_tracepoint(const struct perf_event *event,
 	uname = u64_to_user_ptr(info->perf_event.tracepoint.tp_name);
 	ulen = info->perf_event.tracepoint.name_len;
 	info->perf_event.type = BPF_PERF_EVENT_TRACEPOINT;
+	info->perf_event.tracepoint.cookie = event->bpf_cookie;
 	return bpf_perf_link_fill_common(event, uname, ulen, NULL, NULL, NULL, NULL);
 }
 
@@ -3561,6 +3564,7 @@ static int bpf_perf_link_fill_perf_event(const struct perf_event *event,
 {
 	info->perf_event.event.type = event->attr.type;
 	info->perf_event.event.config = event->attr.config;
+	info->perf_event.event.cookie = event->bpf_cookie;
 	info->perf_event.type = BPF_PERF_EVENT_EVENT;
 	return 0;
 }
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index a00f8a5623e1..181e74433272 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -6582,6 +6582,7 @@ struct bpf_link_info {
 					__aligned_u64 file_name; /* in/out */
 					__u32 name_len;
 					__u32 offset; /* offset from file_name */
+					__u64 cookie;
 				} uprobe; /* BPF_PERF_EVENT_UPROBE, BPF_PERF_EVENT_URETPROBE */
 				struct {
 					__aligned_u64 func_name; /* in/out */
@@ -6589,14 +6590,19 @@ struct bpf_link_info {
 					__u32 offset; /* offset from func_name */
 					__u64 addr;
 					__u64 missed;
+					__u64 cookie;
 				} kprobe; /* BPF_PERF_EVENT_KPROBE, BPF_PERF_EVENT_KRETPROBE */
 				struct {
 					__aligned_u64 tp_name;   /* in/out */
 					__u32 name_len;
+					__u32 :32;
+					__u64 cookie;
 				} tracepoint; /* BPF_PERF_EVENT_TRACEPOINT */
 				struct {
 					__u64 config;
 					__u32 type;
+					__u32 :32;
+					__u64 cookie;
 				} event; /* BPF_PERF_EVENT_EVENT */
 			};
 		} perf_event;
-- 
cgit v1.2.3


From 9fd112b1f82b587ffb12fb67dd032f551fdb571a Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Fri, 19 Jan 2024 12:04:59 +0100
Subject: bpf: Store cookies in kprobe_multi bpf_link_info data

Storing cookies in kprobe_multi bpf_link_info data. The cookies
field is optional and if provided it needs to be an array of
__u64 with kprobe_multi.count length.

Acked-by: Yafang Shao <laoar.shao@gmail.com>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Acked-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20240119110505.400573-3-jolsa@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h       |  1 +
 kernel/trace/bpf_trace.c       | 15 +++++++++++++++
 tools/include/uapi/linux/bpf.h |  1 +
 3 files changed, 17 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 181e74433272..287d05732668 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -6563,6 +6563,7 @@ struct bpf_link_info {
 			__u32 count; /* in/out: kprobe_multi function count */
 			__u32 flags;
 			__u64 missed;
+			__aligned_u64 cookies;
 		} kprobe_multi;
 		struct {
 			__aligned_u64 path;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 7ac6c52b25eb..c98c20abaf99 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -2679,6 +2679,7 @@ static void bpf_kprobe_multi_link_dealloc(struct bpf_link *link)
 static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link,
 						struct bpf_link_info *info)
 {
+	u64 __user *ucookies = u64_to_user_ptr(info->kprobe_multi.cookies);
 	u64 __user *uaddrs = u64_to_user_ptr(info->kprobe_multi.addrs);
 	struct bpf_kprobe_multi_link *kmulti_link;
 	u32 ucount = info->kprobe_multi.count;
@@ -2686,6 +2687,8 @@ static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link,
 
 	if (!uaddrs ^ !ucount)
 		return -EINVAL;
+	if (ucookies && !ucount)
+		return -EINVAL;
 
 	kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link);
 	info->kprobe_multi.count = kmulti_link->cnt;
@@ -2699,6 +2702,18 @@ static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link,
 	else
 		ucount = kmulti_link->cnt;
 
+	if (ucookies) {
+		if (kmulti_link->cookies) {
+			if (copy_to_user(ucookies, kmulti_link->cookies, ucount * sizeof(u64)))
+				return -EFAULT;
+		} else {
+			for (i = 0; i < ucount; i++) {
+				if (put_user(0, ucookies + i))
+					return -EFAULT;
+			}
+		}
+	}
+
 	if (kallsyms_show_value(current_cred())) {
 		if (copy_to_user(uaddrs, kmulti_link->addrs, ucount * sizeof(u64)))
 			return -EFAULT;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 181e74433272..287d05732668 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -6563,6 +6563,7 @@ struct bpf_link_info {
 			__u32 count; /* in/out: kprobe_multi function count */
 			__u32 flags;
 			__u64 missed;
+			__aligned_u64 cookies;
 		} kprobe_multi;
 		struct {
 			__aligned_u64 path;
-- 
cgit v1.2.3


From 3b1f89e747cd4b24244f2798a35d28815b744303 Mon Sep 17 00:00:00 2001
From: Kui-Feng Lee <thinker.li@gmail.com>
Date: Fri, 19 Jan 2024 14:49:52 -0800
Subject: bpf: refactory struct_ops type initialization to a function.

Move the majority of the code to bpf_struct_ops_init_one(), which can then
be utilized for the initialization of newly registered dynamically
allocated struct_ops types in the following patches.

Signed-off-by: Kui-Feng Lee <thinker.li@gmail.com>
Link: https://lore.kernel.org/r/20240119225005.668602-2-thinker.li@gmail.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/linux/btf.h         |   1 +
 kernel/bpf/bpf_struct_ops.c | 157 +++++++++++++++++++++++---------------------
 kernel/bpf/btf.c            |   5 ++
 3 files changed, 89 insertions(+), 74 deletions(-)

(limited to 'include')

diff --git a/include/linux/btf.h b/include/linux/btf.h
index cf5c6ff48981..932af1680bb5 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -137,6 +137,7 @@ struct btf_struct_metas {
 
 extern const struct file_operations btf_fops;
 
+const char *btf_get_name(const struct btf *btf);
 void btf_get(struct btf *btf);
 void btf_put(struct btf *btf);
 int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_sz);
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 02068bd0e4d9..96cba76f4ac3 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -110,102 +110,111 @@ const struct bpf_prog_ops bpf_struct_ops_prog_ops = {
 
 static const struct btf_type *module_type;
 
-void bpf_struct_ops_init(struct btf *btf, struct bpf_verifier_log *log)
+static void bpf_struct_ops_init_one(struct bpf_struct_ops *st_ops,
+				    struct btf *btf,
+				    struct bpf_verifier_log *log)
 {
-	s32 type_id, value_id, module_id;
 	const struct btf_member *member;
-	struct bpf_struct_ops *st_ops;
 	const struct btf_type *t;
+	s32 type_id, value_id;
 	char value_name[128];
 	const char *mname;
-	u32 i, j;
+	int i;
 
-	/* Ensure BTF type is emitted for "struct bpf_struct_ops_##_name" */
-#define BPF_STRUCT_OPS_TYPE(_name) BTF_TYPE_EMIT(struct bpf_struct_ops_##_name);
-#include "bpf_struct_ops_types.h"
-#undef BPF_STRUCT_OPS_TYPE
+	if (strlen(st_ops->name) + VALUE_PREFIX_LEN >=
+	    sizeof(value_name)) {
+		pr_warn("struct_ops name %s is too long\n",
+			st_ops->name);
+		return;
+	}
+	sprintf(value_name, "%s%s", VALUE_PREFIX, st_ops->name);
 
-	module_id = btf_find_by_name_kind(btf, "module", BTF_KIND_STRUCT);
-	if (module_id < 0) {
-		pr_warn("Cannot find struct module in btf_vmlinux\n");
+	value_id = btf_find_by_name_kind(btf, value_name,
+					 BTF_KIND_STRUCT);
+	if (value_id < 0) {
+		pr_warn("Cannot find struct %s in %s\n",
+			value_name, btf_get_name(btf));
 		return;
 	}
-	module_type = btf_type_by_id(btf, module_id);
 
-	for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) {
-		st_ops = bpf_struct_ops[i];
+	type_id = btf_find_by_name_kind(btf, st_ops->name,
+					BTF_KIND_STRUCT);
+	if (type_id < 0) {
+		pr_warn("Cannot find struct %s in %s\n",
+			st_ops->name, btf_get_name(btf));
+		return;
+	}
+	t = btf_type_by_id(btf, type_id);
+	if (btf_type_vlen(t) > BPF_STRUCT_OPS_MAX_NR_MEMBERS) {
+		pr_warn("Cannot support #%u members in struct %s\n",
+			btf_type_vlen(t), st_ops->name);
+		return;
+	}
 
-		if (strlen(st_ops->name) + VALUE_PREFIX_LEN >=
-		    sizeof(value_name)) {
-			pr_warn("struct_ops name %s is too long\n",
+	for_each_member(i, t, member) {
+		const struct btf_type *func_proto;
+
+		mname = btf_name_by_offset(btf, member->name_off);
+		if (!*mname) {
+			pr_warn("anon member in struct %s is not supported\n",
 				st_ops->name);
-			continue;
+			break;
 		}
-		sprintf(value_name, "%s%s", VALUE_PREFIX, st_ops->name);
 
-		value_id = btf_find_by_name_kind(btf, value_name,
-						 BTF_KIND_STRUCT);
-		if (value_id < 0) {
-			pr_warn("Cannot find struct %s in btf_vmlinux\n",
-				value_name);
-			continue;
+		if (__btf_member_bitfield_size(t, member)) {
+			pr_warn("bit field member %s in struct %s is not supported\n",
+				mname, st_ops->name);
+			break;
 		}
 
-		type_id = btf_find_by_name_kind(btf, st_ops->name,
-						BTF_KIND_STRUCT);
-		if (type_id < 0) {
-			pr_warn("Cannot find struct %s in btf_vmlinux\n",
-				st_ops->name);
-			continue;
-		}
-		t = btf_type_by_id(btf, type_id);
-		if (btf_type_vlen(t) > BPF_STRUCT_OPS_MAX_NR_MEMBERS) {
-			pr_warn("Cannot support #%u members in struct %s\n",
-				btf_type_vlen(t), st_ops->name);
-			continue;
+		func_proto = btf_type_resolve_func_ptr(btf,
+						       member->type,
+						       NULL);
+		if (func_proto &&
+		    btf_distill_func_proto(log, btf,
+					   func_proto, mname,
+					   &st_ops->func_models[i])) {
+			pr_warn("Error in parsing func ptr %s in struct %s\n",
+				mname, st_ops->name);
+			break;
 		}
+	}
 
-		for_each_member(j, t, member) {
-			const struct btf_type *func_proto;
+	if (i == btf_type_vlen(t)) {
+		if (st_ops->init(btf)) {
+			pr_warn("Error in init bpf_struct_ops %s\n",
+				st_ops->name);
+		} else {
+			st_ops->type_id = type_id;
+			st_ops->type = t;
+			st_ops->value_id = value_id;
+			st_ops->value_type = btf_type_by_id(btf,
+							    value_id);
+		}
+	}
+}
 
-			mname = btf_name_by_offset(btf, member->name_off);
-			if (!*mname) {
-				pr_warn("anon member in struct %s is not supported\n",
-					st_ops->name);
-				break;
-			}
+void bpf_struct_ops_init(struct btf *btf, struct bpf_verifier_log *log)
+{
+	struct bpf_struct_ops *st_ops;
+	s32 module_id;
+	u32 i;
 
-			if (__btf_member_bitfield_size(t, member)) {
-				pr_warn("bit field member %s in struct %s is not supported\n",
-					mname, st_ops->name);
-				break;
-			}
+	/* Ensure BTF type is emitted for "struct bpf_struct_ops_##_name" */
+#define BPF_STRUCT_OPS_TYPE(_name) BTF_TYPE_EMIT(struct bpf_struct_ops_##_name);
+#include "bpf_struct_ops_types.h"
+#undef BPF_STRUCT_OPS_TYPE
 
-			func_proto = btf_type_resolve_func_ptr(btf,
-							       member->type,
-							       NULL);
-			if (func_proto &&
-			    btf_distill_func_proto(log, btf,
-						   func_proto, mname,
-						   &st_ops->func_models[j])) {
-				pr_warn("Error in parsing func ptr %s in struct %s\n",
-					mname, st_ops->name);
-				break;
-			}
-		}
+	module_id = btf_find_by_name_kind(btf, "module", BTF_KIND_STRUCT);
+	if (module_id < 0) {
+		pr_warn("Cannot find struct module in %s\n", btf_get_name(btf));
+		return;
+	}
+	module_type = btf_type_by_id(btf, module_id);
 
-		if (j == btf_type_vlen(t)) {
-			if (st_ops->init(btf)) {
-				pr_warn("Error in init bpf_struct_ops %s\n",
-					st_ops->name);
-			} else {
-				st_ops->type_id = type_id;
-				st_ops->type = t;
-				st_ops->value_id = value_id;
-				st_ops->value_type = btf_type_by_id(btf,
-								    value_id);
-			}
-		}
+	for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) {
+		st_ops = bpf_struct_ops[i];
+		bpf_struct_ops_init_one(st_ops, btf, log);
 	}
 }
 
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 4f532b303a27..f8735a2ec5d4 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -1707,6 +1707,11 @@ static void btf_free_rcu(struct rcu_head *rcu)
 	btf_free(btf);
 }
 
+const char *btf_get_name(const struct btf *btf)
+{
+	return btf->name;
+}
+
 void btf_get(struct btf *btf)
 {
 	refcount_inc(&btf->refcnt);
-- 
cgit v1.2.3


From 4c5763ed996a61b51d721d0968d0df957826ea49 Mon Sep 17 00:00:00 2001
From: Kui-Feng Lee <thinker.li@gmail.com>
Date: Fri, 19 Jan 2024 14:49:54 -0800
Subject: bpf, net: introduce bpf_struct_ops_desc.

Move some of members of bpf_struct_ops to bpf_struct_ops_desc.  type_id is
unavailabe in bpf_struct_ops anymore. Modules should get it from the btf
received by kmod's init function.

Cc: netdev@vger.kernel.org
Signed-off-by: Kui-Feng Lee <thinker.li@gmail.com>
Link: https://lore.kernel.org/r/20240119225005.668602-4-thinker.li@gmail.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/linux/bpf.h            | 15 +++++---
 kernel/bpf/bpf_struct_ops.c    | 80 ++++++++++++++++++++++--------------------
 kernel/bpf/verifier.c          |  8 +++--
 net/bpf/bpf_dummy_struct_ops.c | 11 +++++-
 net/ipv4/bpf_tcp_ca.c          |  8 ++++-
 5 files changed, 73 insertions(+), 49 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 377857b232c6..7fc95e7babab 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1673,18 +1673,23 @@ struct bpf_struct_ops {
 	void (*unreg)(void *kdata);
 	int (*update)(void *kdata, void *old_kdata);
 	int (*validate)(void *kdata);
-	const struct btf_type *type;
-	const struct btf_type *value_type;
+	void *cfi_stubs;
 	const char *name;
 	struct btf_func_model func_models[BPF_STRUCT_OPS_MAX_NR_MEMBERS];
+};
+
+struct bpf_struct_ops_desc {
+	struct bpf_struct_ops *st_ops;
+
+	const struct btf_type *type;
+	const struct btf_type *value_type;
 	u32 type_id;
 	u32 value_id;
-	void *cfi_stubs;
 };
 
 #if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL)
 #define BPF_MODULE_OWNER ((void *)((0xeB9FUL << 2) + POISON_POINTER_DELTA))
-const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id);
+const struct bpf_struct_ops_desc *bpf_struct_ops_find(u32 type_id);
 void bpf_struct_ops_init(struct btf *btf, struct bpf_verifier_log *log);
 bool bpf_struct_ops_get(const void *kdata);
 void bpf_struct_ops_put(const void *kdata);
@@ -1728,7 +1733,7 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
 			    union bpf_attr __user *uattr);
 #endif
 #else
-static inline const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id)
+static inline const struct bpf_struct_ops_desc *bpf_struct_ops_find(u32 type_id)
 {
 	return NULL;
 }
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 5b3ebcb435d0..9774f7824e8b 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -32,7 +32,7 @@ struct bpf_struct_ops_value {
 struct bpf_struct_ops_map {
 	struct bpf_map map;
 	struct rcu_head rcu;
-	const struct bpf_struct_ops *st_ops;
+	const struct bpf_struct_ops_desc *st_ops_desc;
 	/* protect map_update */
 	struct mutex lock;
 	/* link has all the bpf_links that is populated
@@ -92,9 +92,9 @@ enum {
 	__NR_BPF_STRUCT_OPS_TYPE,
 };
 
-static struct bpf_struct_ops * const bpf_struct_ops[] = {
+static struct bpf_struct_ops_desc bpf_struct_ops[] = {
 #define BPF_STRUCT_OPS_TYPE(_name)				\
-	[BPF_STRUCT_OPS_TYPE_##_name] = &bpf_##_name,
+	[BPF_STRUCT_OPS_TYPE_##_name] = { .st_ops = &bpf_##_name },
 #include "bpf_struct_ops_types.h"
 #undef BPF_STRUCT_OPS_TYPE
 };
@@ -115,10 +115,11 @@ enum {
 	IDX_MODULE_ID,
 };
 
-static void bpf_struct_ops_init_one(struct bpf_struct_ops *st_ops,
-				    struct btf *btf,
-				    struct bpf_verifier_log *log)
+static void bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc,
+				     struct btf *btf,
+				     struct bpf_verifier_log *log)
 {
+	struct bpf_struct_ops *st_ops = st_ops_desc->st_ops;
 	const struct btf_member *member;
 	const struct btf_type *t;
 	s32 type_id, value_id;
@@ -190,18 +191,18 @@ static void bpf_struct_ops_init_one(struct bpf_struct_ops *st_ops,
 			pr_warn("Error in init bpf_struct_ops %s\n",
 				st_ops->name);
 		} else {
-			st_ops->type_id = type_id;
-			st_ops->type = t;
-			st_ops->value_id = value_id;
-			st_ops->value_type = btf_type_by_id(btf,
-							    value_id);
+			st_ops_desc->type_id = type_id;
+			st_ops_desc->type = t;
+			st_ops_desc->value_id = value_id;
+			st_ops_desc->value_type = btf_type_by_id(btf,
+								 value_id);
 		}
 	}
 }
 
 void bpf_struct_ops_init(struct btf *btf, struct bpf_verifier_log *log)
 {
-	struct bpf_struct_ops *st_ops;
+	struct bpf_struct_ops_desc *st_ops_desc;
 	u32 i;
 
 	/* Ensure BTF type is emitted for "struct bpf_struct_ops_##_name" */
@@ -210,14 +211,14 @@ void bpf_struct_ops_init(struct btf *btf, struct bpf_verifier_log *log)
 #undef BPF_STRUCT_OPS_TYPE
 
 	for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) {
-		st_ops = bpf_struct_ops[i];
-		bpf_struct_ops_init_one(st_ops, btf, log);
+		st_ops_desc = &bpf_struct_ops[i];
+		bpf_struct_ops_desc_init(st_ops_desc, btf, log);
 	}
 }
 
 extern struct btf *btf_vmlinux;
 
-static const struct bpf_struct_ops *
+static const struct bpf_struct_ops_desc *
 bpf_struct_ops_find_value(u32 value_id)
 {
 	unsigned int i;
@@ -226,14 +227,14 @@ bpf_struct_ops_find_value(u32 value_id)
 		return NULL;
 
 	for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) {
-		if (bpf_struct_ops[i]->value_id == value_id)
-			return bpf_struct_ops[i];
+		if (bpf_struct_ops[i].value_id == value_id)
+			return &bpf_struct_ops[i];
 	}
 
 	return NULL;
 }
 
-const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id)
+const struct bpf_struct_ops_desc *bpf_struct_ops_find(u32 type_id)
 {
 	unsigned int i;
 
@@ -241,8 +242,8 @@ const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id)
 		return NULL;
 
 	for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) {
-		if (bpf_struct_ops[i]->type_id == type_id)
-			return bpf_struct_ops[i];
+		if (bpf_struct_ops[i].type_id == type_id)
+			return &bpf_struct_ops[i];
 	}
 
 	return NULL;
@@ -302,7 +303,7 @@ static void *bpf_struct_ops_map_lookup_elem(struct bpf_map *map, void *key)
 
 static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map)
 {
-	const struct btf_type *t = st_map->st_ops->type;
+	const struct btf_type *t = st_map->st_ops_desc->type;
 	u32 i;
 
 	for (i = 0; i < btf_type_vlen(t); i++) {
@@ -382,11 +383,12 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 					   void *value, u64 flags)
 {
 	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
-	const struct bpf_struct_ops *st_ops = st_map->st_ops;
+	const struct bpf_struct_ops_desc *st_ops_desc = st_map->st_ops_desc;
+	const struct bpf_struct_ops *st_ops = st_ops_desc->st_ops;
 	struct bpf_struct_ops_value *uvalue, *kvalue;
 	const struct btf_type *module_type;
 	const struct btf_member *member;
-	const struct btf_type *t = st_ops->type;
+	const struct btf_type *t = st_ops_desc->type;
 	struct bpf_tramp_links *tlinks;
 	void *udata, *kdata;
 	int prog_fd, err;
@@ -399,7 +401,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 	if (*(u32 *)key != 0)
 		return -E2BIG;
 
-	err = check_zero_holes(st_ops->value_type, value);
+	err = check_zero_holes(st_ops_desc->value_type, value);
 	if (err)
 		return err;
 
@@ -492,7 +494,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 		}
 
 		if (prog->type != BPF_PROG_TYPE_STRUCT_OPS ||
-		    prog->aux->attach_btf_id != st_ops->type_id ||
+		    prog->aux->attach_btf_id != st_ops_desc->type_id ||
 		    prog->expected_attach_type != i) {
 			bpf_prog_put(prog);
 			err = -EINVAL;
@@ -588,7 +590,7 @@ static long bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
 			     BPF_STRUCT_OPS_STATE_TOBEFREE);
 	switch (prev_state) {
 	case BPF_STRUCT_OPS_STATE_INUSE:
-		st_map->st_ops->unreg(&st_map->kvalue.data);
+		st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data);
 		bpf_map_put(map);
 		return 0;
 	case BPF_STRUCT_OPS_STATE_TOBEFREE:
@@ -669,22 +671,22 @@ static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
 
 static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
 {
-	const struct bpf_struct_ops *st_ops;
+	const struct bpf_struct_ops_desc *st_ops_desc;
 	size_t st_map_size;
 	struct bpf_struct_ops_map *st_map;
 	const struct btf_type *t, *vt;
 	struct bpf_map *map;
 	int ret;
 
-	st_ops = bpf_struct_ops_find_value(attr->btf_vmlinux_value_type_id);
-	if (!st_ops)
+	st_ops_desc = bpf_struct_ops_find_value(attr->btf_vmlinux_value_type_id);
+	if (!st_ops_desc)
 		return ERR_PTR(-ENOTSUPP);
 
-	vt = st_ops->value_type;
+	vt = st_ops_desc->value_type;
 	if (attr->value_size != vt->size)
 		return ERR_PTR(-EINVAL);
 
-	t = st_ops->type;
+	t = st_ops_desc->type;
 
 	st_map_size = sizeof(*st_map) +
 		/* kvalue stores the
@@ -696,7 +698,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
 	if (!st_map)
 		return ERR_PTR(-ENOMEM);
 
-	st_map->st_ops = st_ops;
+	st_map->st_ops_desc = st_ops_desc;
 	map = &st_map->map;
 
 	ret = bpf_jit_charge_modmem(PAGE_SIZE);
@@ -733,8 +735,8 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
 static u64 bpf_struct_ops_map_mem_usage(const struct bpf_map *map)
 {
 	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
-	const struct bpf_struct_ops *st_ops = st_map->st_ops;
-	const struct btf_type *vt = st_ops->value_type;
+	const struct bpf_struct_ops_desc *st_ops_desc = st_map->st_ops_desc;
+	const struct btf_type *vt = st_ops_desc->value_type;
 	u64 usage;
 
 	usage = sizeof(*st_map) +
@@ -808,7 +810,7 @@ static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link)
 		/* st_link->map can be NULL if
 		 * bpf_struct_ops_link_create() fails to register.
 		 */
-		st_map->st_ops->unreg(&st_map->kvalue.data);
+		st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data);
 		bpf_map_put(&st_map->map);
 	}
 	kfree(st_link);
@@ -855,7 +857,7 @@ static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map
 	if (!bpf_struct_ops_valid_to_reg(new_map))
 		return -EINVAL;
 
-	if (!st_map->st_ops->update)
+	if (!st_map->st_ops_desc->st_ops->update)
 		return -EOPNOTSUPP;
 
 	mutex_lock(&update_mutex);
@@ -868,12 +870,12 @@ static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map
 
 	old_st_map = container_of(old_map, struct bpf_struct_ops_map, map);
 	/* The new and old struct_ops must be the same type. */
-	if (st_map->st_ops != old_st_map->st_ops) {
+	if (st_map->st_ops_desc != old_st_map->st_ops_desc) {
 		err = -EINVAL;
 		goto err_out;
 	}
 
-	err = st_map->st_ops->update(st_map->kvalue.data, old_st_map->kvalue.data);
+	err = st_map->st_ops_desc->st_ops->update(st_map->kvalue.data, old_st_map->kvalue.data);
 	if (err)
 		goto err_out;
 
@@ -924,7 +926,7 @@ int bpf_struct_ops_link_create(union bpf_attr *attr)
 	if (err)
 		goto err_out;
 
-	err = st_map->st_ops->reg(st_map->kvalue.data);
+	err = st_map->st_ops_desc->st_ops->reg(st_map->kvalue.data);
 	if (err) {
 		bpf_link_cleanup(&link_primer);
 		link = NULL;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9e9cc132dd02..e279491118b7 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -20285,6 +20285,7 @@ static void print_verification_stats(struct bpf_verifier_env *env)
 static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
 {
 	const struct btf_type *t, *func_proto;
+	const struct bpf_struct_ops_desc *st_ops_desc;
 	const struct bpf_struct_ops *st_ops;
 	const struct btf_member *member;
 	struct bpf_prog *prog = env->prog;
@@ -20297,14 +20298,15 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
 	}
 
 	btf_id = prog->aux->attach_btf_id;
-	st_ops = bpf_struct_ops_find(btf_id);
-	if (!st_ops) {
+	st_ops_desc = bpf_struct_ops_find(btf_id);
+	if (!st_ops_desc) {
 		verbose(env, "attach_btf_id %u is not a supported struct\n",
 			btf_id);
 		return -ENOTSUPP;
 	}
+	st_ops = st_ops_desc->st_ops;
 
-	t = st_ops->type;
+	t = st_ops_desc->type;
 	member_idx = prog->expected_attach_type;
 	if (member_idx >= btf_type_vlen(t)) {
 		verbose(env, "attach to invalid member idx %u of struct %s\n",
diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c
index 8906f7bdf4a9..ba2c58dba2da 100644
--- a/net/bpf/bpf_dummy_struct_ops.c
+++ b/net/bpf/bpf_dummy_struct_ops.c
@@ -22,6 +22,8 @@ struct bpf_dummy_ops_test_args {
 	struct bpf_dummy_ops_state state;
 };
 
+static struct btf *bpf_dummy_ops_btf;
+
 static struct bpf_dummy_ops_test_args *
 dummy_ops_init_args(const union bpf_attr *kattr, unsigned int nr)
 {
@@ -90,9 +92,15 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
 	void *image = NULL;
 	unsigned int op_idx;
 	int prog_ret;
+	s32 type_id;
 	int err;
 
-	if (prog->aux->attach_btf_id != st_ops->type_id)
+	type_id = btf_find_by_name_kind(bpf_dummy_ops_btf,
+					bpf_bpf_dummy_ops.name,
+					BTF_KIND_STRUCT);
+	if (type_id < 0)
+		return -EINVAL;
+	if (prog->aux->attach_btf_id != type_id)
 		return -EOPNOTSUPP;
 
 	func_proto = prog->aux->attach_func_proto;
@@ -148,6 +156,7 @@ out:
 
 static int bpf_dummy_init(struct btf *btf)
 {
+	bpf_dummy_ops_btf = btf;
 	return 0;
 }
 
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index ae8b15e6896f..dffd8828079b 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -20,6 +20,7 @@ static u32 unsupported_ops[] = {
 
 static const struct btf_type *tcp_sock_type;
 static u32 tcp_sock_id, sock_id;
+static const struct btf_type *tcp_congestion_ops_type;
 
 static int bpf_tcp_ca_init(struct btf *btf)
 {
@@ -36,6 +37,11 @@ static int bpf_tcp_ca_init(struct btf *btf)
 	tcp_sock_id = type_id;
 	tcp_sock_type = btf_type_by_id(btf, tcp_sock_id);
 
+	type_id = btf_find_by_name_kind(btf, "tcp_congestion_ops", BTF_KIND_STRUCT);
+	if (type_id < 0)
+		return -EINVAL;
+	tcp_congestion_ops_type = btf_type_by_id(btf, type_id);
+
 	return 0;
 }
 
@@ -149,7 +155,7 @@ static u32 prog_ops_moff(const struct bpf_prog *prog)
 	u32 midx;
 
 	midx = prog->expected_attach_type;
-	t = bpf_tcp_congestion_ops.type;
+	t = tcp_congestion_ops_type;
 	m = &btf_type_member(t)[midx];
 
 	return __btf_member_bit_offset(t, m) / 8;
-- 
cgit v1.2.3


From 1338b93346587a2a6ac79bbcf55ef5b357745573 Mon Sep 17 00:00:00 2001
From: Kui-Feng Lee <thinker.li@gmail.com>
Date: Fri, 19 Jan 2024 14:49:57 -0800
Subject: bpf: pass btf object id in bpf_map_info.

Include btf object id (btf_obj_id) in bpf_map_info so that tools (ex:
bpftools struct_ops dump) know the correct btf from the kernel to look up
type information of struct_ops types.

Since struct_ops types can be defined and registered in a module. The
type information of a struct_ops type are defined in the btf of the
module defining it.  The userspace tools need to know which btf is for
the module defining a struct_ops type.

Signed-off-by: Kui-Feng Lee <thinker.li@gmail.com>
Link: https://lore.kernel.org/r/20240119225005.668602-7-thinker.li@gmail.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/linux/bpf.h            | 4 ++++
 include/uapi/linux/bpf.h       | 2 +-
 kernel/bpf/bpf_struct_ops.c    | 7 +++++++
 kernel/bpf/syscall.c           | 2 ++
 tools/include/uapi/linux/bpf.h | 2 +-
 5 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 7fc95e7babab..29fcae9fa8ed 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1732,6 +1732,7 @@ struct bpf_dummy_ops {
 int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
 			    union bpf_attr __user *uattr);
 #endif
+void bpf_map_struct_ops_info_fill(struct bpf_map_info *info, struct bpf_map *map);
 #else
 static inline const struct bpf_struct_ops_desc *bpf_struct_ops_find(u32 type_id)
 {
@@ -1759,6 +1760,9 @@ static inline int bpf_struct_ops_link_create(union bpf_attr *attr)
 {
 	return -EOPNOTSUPP;
 }
+static inline void bpf_map_struct_ops_info_fill(struct bpf_map_info *info, struct bpf_map *map)
+{
+}
 
 #endif
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 287d05732668..a380047c86af 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -6487,7 +6487,7 @@ struct bpf_map_info {
 	__u32 btf_id;
 	__u32 btf_key_type_id;
 	__u32 btf_value_type_id;
-	__u32 :32;	/* alignment pad */
+	__u32 btf_vmlinux_id;
 	__u64 map_extra;
 } __attribute__((aligned(8)));
 
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 5ddcca4c4fba..5e98af4fc2e2 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -947,3 +947,10 @@ err_out:
 	kfree(link);
 	return err;
 }
+
+void bpf_map_struct_ops_info_fill(struct bpf_map_info *info, struct bpf_map *map)
+{
+	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
+
+	info->btf_vmlinux_id = btf_obj_id(st_map->btf);
+}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 13193aaafb64..55b458429705 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -4691,6 +4691,8 @@ static int bpf_map_get_info_by_fd(struct file *file,
 		info.btf_value_type_id = map->btf_value_type_id;
 	}
 	info.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id;
+	if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS)
+		bpf_map_struct_ops_info_fill(&info, map);
 
 	if (bpf_map_is_offloaded(map)) {
 		err = bpf_map_offload_info_fill(&info, map);
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 287d05732668..a380047c86af 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -6487,7 +6487,7 @@ struct bpf_map_info {
 	__u32 btf_id;
 	__u32 btf_key_type_id;
 	__u32 btf_value_type_id;
-	__u32 :32;	/* alignment pad */
+	__u32 btf_vmlinux_id;
 	__u64 map_extra;
 } __attribute__((aligned(8)));
 
-- 
cgit v1.2.3


From 689423db3bda2244c24db8a64de4cdb37be1de41 Mon Sep 17 00:00:00 2001
From: Kui-Feng Lee <thinker.li@gmail.com>
Date: Fri, 19 Jan 2024 14:49:58 -0800
Subject: bpf: lookup struct_ops types from a given module BTF.

This is a preparation for searching for struct_ops types from a specified
module. BTF is always btf_vmlinux now. This patch passes a pointer of BTF
to bpf_struct_ops_find_value() and bpf_struct_ops_find(). Once the new
registration API of struct_ops types is used, other BTFs besides
btf_vmlinux can also be passed to them.

Signed-off-by: Kui-Feng Lee <thinker.li@gmail.com>
Link: https://lore.kernel.org/r/20240119225005.668602-8-thinker.li@gmail.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/linux/bpf.h         |  4 ++--
 kernel/bpf/bpf_struct_ops.c | 11 ++++++-----
 kernel/bpf/verifier.c       |  2 +-
 3 files changed, 9 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 29fcae9fa8ed..86ff8911d7ee 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1689,7 +1689,7 @@ struct bpf_struct_ops_desc {
 
 #if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL)
 #define BPF_MODULE_OWNER ((void *)((0xeB9FUL << 2) + POISON_POINTER_DELTA))
-const struct bpf_struct_ops_desc *bpf_struct_ops_find(u32 type_id);
+const struct bpf_struct_ops_desc *bpf_struct_ops_find(struct btf *btf, u32 type_id);
 void bpf_struct_ops_init(struct btf *btf, struct bpf_verifier_log *log);
 bool bpf_struct_ops_get(const void *kdata);
 void bpf_struct_ops_put(const void *kdata);
@@ -1734,7 +1734,7 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
 #endif
 void bpf_map_struct_ops_info_fill(struct bpf_map_info *info, struct bpf_map *map);
 #else
-static inline const struct bpf_struct_ops_desc *bpf_struct_ops_find(u32 type_id)
+static inline const struct bpf_struct_ops_desc *bpf_struct_ops_find(struct btf *btf, u32 type_id)
 {
 	return NULL;
 }
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 5e98af4fc2e2..7505f515aac3 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -221,11 +221,11 @@ void bpf_struct_ops_init(struct btf *btf, struct bpf_verifier_log *log)
 extern struct btf *btf_vmlinux;
 
 static const struct bpf_struct_ops_desc *
-bpf_struct_ops_find_value(u32 value_id)
+bpf_struct_ops_find_value(struct btf *btf, u32 value_id)
 {
 	unsigned int i;
 
-	if (!value_id || !btf_vmlinux)
+	if (!value_id || !btf)
 		return NULL;
 
 	for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) {
@@ -236,11 +236,12 @@ bpf_struct_ops_find_value(u32 value_id)
 	return NULL;
 }
 
-const struct bpf_struct_ops_desc *bpf_struct_ops_find(u32 type_id)
+const struct bpf_struct_ops_desc *
+bpf_struct_ops_find(struct btf *btf, u32 type_id)
 {
 	unsigned int i;
 
-	if (!type_id || !btf_vmlinux)
+	if (!type_id || !btf)
 		return NULL;
 
 	for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) {
@@ -682,7 +683,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
 	struct bpf_map *map;
 	int ret;
 
-	st_ops_desc = bpf_struct_ops_find_value(attr->btf_vmlinux_value_type_id);
+	st_ops_desc = bpf_struct_ops_find_value(btf_vmlinux, attr->btf_vmlinux_value_type_id);
 	if (!st_ops_desc)
 		return ERR_PTR(-ENOTSUPP);
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e279491118b7..2a0fd2ccdb11 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -20298,7 +20298,7 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
 	}
 
 	btf_id = prog->aux->attach_btf_id;
-	st_ops_desc = bpf_struct_ops_find(btf_id);
+	st_ops_desc = bpf_struct_ops_find(btf_vmlinux, btf_id);
 	if (!st_ops_desc) {
 		verbose(env, "attach_btf_id %u is not a supported struct\n",
 			btf_id);
-- 
cgit v1.2.3


From fcc2c1fb0651477c8ed78a3a293c175ccd70697a Mon Sep 17 00:00:00 2001
From: Kui-Feng Lee <thinker.li@gmail.com>
Date: Fri, 19 Jan 2024 14:49:59 -0800
Subject: bpf: pass attached BTF to the bpf_struct_ops subsystem

Pass the fd of a btf from the userspace to the bpf() syscall, and then
convert the fd into a btf. The btf is generated from the module that
defines the target BPF struct_ops type.

In order to inform the kernel about the module that defines the target
struct_ops type, the userspace program needs to provide a btf fd for the
respective module's btf. This btf contains essential information on the
types defined within the module, including the target struct_ops type.

A btf fd must be provided to the kernel for struct_ops maps and for the bpf
programs attached to those maps.

In the case of the bpf programs, the attach_btf_obj_fd parameter is passed
as part of the bpf_attr and is converted into a btf. This btf is then
stored in the prog->aux->attach_btf field. Here, it just let the verifier
access attach_btf directly.

In the case of struct_ops maps, a btf fd is passed as value_type_btf_obj_fd
of bpf_attr. The bpf_struct_ops_map_alloc() function converts the fd to a
btf and stores it as st_map->btf. A flag BPF_F_VTYPE_BTF_OBJ_FD is added
for map_flags to indicate that the value of value_type_btf_obj_fd is set.

Signed-off-by: Kui-Feng Lee <thinker.li@gmail.com>
Link: https://lore.kernel.org/r/20240119225005.668602-9-thinker.li@gmail.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/uapi/linux/bpf.h       |  8 ++++++
 kernel/bpf/bpf_struct_ops.c    | 65 ++++++++++++++++++++++++++++++------------
 kernel/bpf/syscall.c           |  2 +-
 kernel/bpf/verifier.c          |  9 ++++--
 tools/include/uapi/linux/bpf.h |  8 ++++++
 5 files changed, 70 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index a380047c86af..1fef6d5a1330 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1330,6 +1330,9 @@ enum {
 
 /* Get path from provided FD in BPF_OBJ_PIN/BPF_OBJ_GET commands */
 	BPF_F_PATH_FD		= (1U << 14),
+
+/* Flag for value_type_btf_obj_fd, the fd is available */
+	BPF_F_VTYPE_BTF_OBJ_FD	= (1U << 15),
 };
 
 /* Flags for BPF_PROG_QUERY. */
@@ -1403,6 +1406,11 @@ union bpf_attr {
 		 * to using 5 hash functions).
 		 */
 		__u64	map_extra;
+
+		__s32   value_type_btf_obj_fd;	/* fd pointing to a BTF
+						 * type data for
+						 * btf_vmlinux_value_type_id.
+						 */
 	};
 
 	struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 7505f515aac3..3b8d689ece5d 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -641,6 +641,7 @@ static void __bpf_struct_ops_map_free(struct bpf_map *map)
 		bpf_jit_uncharge_modmem(PAGE_SIZE);
 	}
 	bpf_map_area_free(st_map->uvalue);
+	btf_put(st_map->btf);
 	bpf_map_area_free(st_map);
 }
 
@@ -669,7 +670,8 @@ static void bpf_struct_ops_map_free(struct bpf_map *map)
 static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
 {
 	if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 ||
-	    (attr->map_flags & ~BPF_F_LINK) || !attr->btf_vmlinux_value_type_id)
+	    (attr->map_flags & ~(BPF_F_LINK | BPF_F_VTYPE_BTF_OBJ_FD)) ||
+	    !attr->btf_vmlinux_value_type_id)
 		return -EINVAL;
 	return 0;
 }
@@ -681,15 +683,36 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
 	struct bpf_struct_ops_map *st_map;
 	const struct btf_type *t, *vt;
 	struct bpf_map *map;
+	struct btf *btf;
 	int ret;
 
-	st_ops_desc = bpf_struct_ops_find_value(btf_vmlinux, attr->btf_vmlinux_value_type_id);
-	if (!st_ops_desc)
-		return ERR_PTR(-ENOTSUPP);
+	if (attr->map_flags & BPF_F_VTYPE_BTF_OBJ_FD) {
+		/* The map holds btf for its whole life time. */
+		btf = btf_get_by_fd(attr->value_type_btf_obj_fd);
+		if (IS_ERR(btf))
+			return ERR_CAST(btf);
+		if (!btf_is_module(btf)) {
+			btf_put(btf);
+			return ERR_PTR(-EINVAL);
+		}
+	} else {
+		btf = bpf_get_btf_vmlinux();
+		if (IS_ERR(btf))
+			return ERR_CAST(btf);
+		btf_get(btf);
+	}
+
+	st_ops_desc = bpf_struct_ops_find_value(btf, attr->btf_vmlinux_value_type_id);
+	if (!st_ops_desc) {
+		ret = -ENOTSUPP;
+		goto errout;
+	}
 
 	vt = st_ops_desc->value_type;
-	if (attr->value_size != vt->size)
-		return ERR_PTR(-EINVAL);
+	if (attr->value_size != vt->size) {
+		ret = -EINVAL;
+		goto errout;
+	}
 
 	t = st_ops_desc->type;
 
@@ -700,17 +723,17 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
 		(vt->size - sizeof(struct bpf_struct_ops_value));
 
 	st_map = bpf_map_area_alloc(st_map_size, NUMA_NO_NODE);
-	if (!st_map)
-		return ERR_PTR(-ENOMEM);
+	if (!st_map) {
+		ret = -ENOMEM;
+		goto errout;
+	}
 
 	st_map->st_ops_desc = st_ops_desc;
 	map = &st_map->map;
 
 	ret = bpf_jit_charge_modmem(PAGE_SIZE);
-	if (ret) {
-		__bpf_struct_ops_map_free(map);
-		return ERR_PTR(ret);
-	}
+	if (ret)
+		goto errout_free;
 
 	st_map->image = arch_alloc_bpf_trampoline(PAGE_SIZE);
 	if (!st_map->image) {
@@ -719,24 +742,30 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
 		 * here.
 		 */
 		bpf_jit_uncharge_modmem(PAGE_SIZE);
-		__bpf_struct_ops_map_free(map);
-		return ERR_PTR(-ENOMEM);
+		ret = -ENOMEM;
+		goto errout_free;
 	}
 	st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE);
 	st_map->links =
 		bpf_map_area_alloc(btf_type_vlen(t) * sizeof(struct bpf_links *),
 				   NUMA_NO_NODE);
 	if (!st_map->uvalue || !st_map->links) {
-		__bpf_struct_ops_map_free(map);
-		return ERR_PTR(-ENOMEM);
+		ret = -ENOMEM;
+		goto errout_free;
 	}
-
-	st_map->btf = btf_vmlinux;
+	st_map->btf = btf;
 
 	mutex_init(&st_map->lock);
 	bpf_map_init_from_attr(map, attr);
 
 	return map;
+
+errout_free:
+	__bpf_struct_ops_map_free(map);
+errout:
+	btf_put(btf);
+
+	return ERR_PTR(ret);
 }
 
 static u64 bpf_struct_ops_map_mem_usage(const struct bpf_map *map)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 55b458429705..f8124b3229e2 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1123,7 +1123,7 @@ free_map_tab:
 	return ret;
 }
 
-#define BPF_MAP_CREATE_LAST_FIELD map_extra
+#define BPF_MAP_CREATE_LAST_FIELD value_type_btf_obj_fd
 /* called via syscall */
 static int map_create(union bpf_attr *attr)
 {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2a0fd2ccdb11..6081512deb79 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -20290,6 +20290,7 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
 	const struct btf_member *member;
 	struct bpf_prog *prog = env->prog;
 	u32 btf_id, member_idx;
+	struct btf *btf;
 	const char *mname;
 
 	if (!prog->gpl_compatible) {
@@ -20297,8 +20298,10 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
 		return -EINVAL;
 	}
 
+	btf = prog->aux->attach_btf ?: bpf_get_btf_vmlinux();
+
 	btf_id = prog->aux->attach_btf_id;
-	st_ops_desc = bpf_struct_ops_find(btf_vmlinux, btf_id);
+	st_ops_desc = bpf_struct_ops_find(btf, btf_id);
 	if (!st_ops_desc) {
 		verbose(env, "attach_btf_id %u is not a supported struct\n",
 			btf_id);
@@ -20315,8 +20318,8 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
 	}
 
 	member = &btf_type_member(t)[member_idx];
-	mname = btf_name_by_offset(btf_vmlinux, member->name_off);
-	func_proto = btf_type_resolve_func_ptr(btf_vmlinux, member->type,
+	mname = btf_name_by_offset(btf, member->name_off);
+	func_proto = btf_type_resolve_func_ptr(btf, member->type,
 					       NULL);
 	if (!func_proto) {
 		verbose(env, "attach to invalid member %s(@idx %u) of struct %s\n",
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index a380047c86af..1fef6d5a1330 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1330,6 +1330,9 @@ enum {
 
 /* Get path from provided FD in BPF_OBJ_PIN/BPF_OBJ_GET commands */
 	BPF_F_PATH_FD		= (1U << 14),
+
+/* Flag for value_type_btf_obj_fd, the fd is available */
+	BPF_F_VTYPE_BTF_OBJ_FD	= (1U << 15),
 };
 
 /* Flags for BPF_PROG_QUERY. */
@@ -1403,6 +1406,11 @@ union bpf_attr {
 		 * to using 5 hash functions).
 		 */
 		__u64	map_extra;
+
+		__s32   value_type_btf_obj_fd;	/* fd pointing to a BTF
+						 * type data for
+						 * btf_vmlinux_value_type_id.
+						 */
 	};
 
 	struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
-- 
cgit v1.2.3


From e3f87fdfed7b770dd7066b02262b12747881e76d Mon Sep 17 00:00:00 2001
From: Kui-Feng Lee <thinker.li@gmail.com>
Date: Fri, 19 Jan 2024 14:50:00 -0800
Subject: bpf: hold module refcnt in bpf_struct_ops map creation and prog
 verification.

To ensure that a module remains accessible whenever a struct_ops object of
a struct_ops type provided by the module is still in use.

struct bpf_struct_ops_map doesn't hold a refcnt to btf anymore since a
module will hold a refcnt to it's btf already. But, struct_ops programs are
different. They hold their associated btf, not the module since they need
only btf to assure their types (signatures).

However, verifier holds the refcnt of the associated module of a struct_ops
type temporarily when verify a struct_ops prog. Verifier needs the help
from the verifier operators (struct bpf_verifier_ops) provided by the owner
module to verify data access of a prog, provide information, and generate
code.

This patch also add a count of links (links_cnt) to bpf_struct_ops_map. It
avoids bpf_struct_ops_map_put_progs() from accessing btf after calling
module_put() in bpf_struct_ops_map_free().

Signed-off-by: Kui-Feng Lee <thinker.li@gmail.com>
Link: https://lore.kernel.org/r/20240119225005.668602-10-thinker.li@gmail.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/linux/bpf.h          |  1 +
 include/linux/bpf_verifier.h |  1 +
 kernel/bpf/bpf_struct_ops.c  | 29 +++++++++++++++++++++++------
 kernel/bpf/verifier.c        | 11 +++++++++++
 4 files changed, 36 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 86ff8911d7ee..a5b425893d38 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1674,6 +1674,7 @@ struct bpf_struct_ops {
 	int (*update)(void *kdata, void *old_kdata);
 	int (*validate)(void *kdata);
 	void *cfi_stubs;
+	struct module *owner;
 	const char *name;
 	struct btf_func_model func_models[BPF_STRUCT_OPS_MAX_NR_MEMBERS];
 };
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index e11baecbde68..7f5816482a10 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -662,6 +662,7 @@ struct bpf_verifier_env {
 	u32 prev_insn_idx;
 	struct bpf_prog *prog;		/* eBPF program being verified */
 	const struct bpf_verifier_ops *ops;
+	struct module *attach_btf_mod;	/* The owner module of prog->aux->attach_btf */
 	struct bpf_verifier_stack_elem *head; /* stack of verifier states to be processed */
 	int stack_size;			/* number of states to be processed */
 	bool strict_alignment;		/* perform strict pointer alignment checks */
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 3b8d689ece5d..02216a8d9265 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -40,6 +40,7 @@ struct bpf_struct_ops_map {
 	 * (in kvalue.data).
 	 */
 	struct bpf_link **links;
+	u32 links_cnt;
 	/* image is a page that has all the trampolines
 	 * that stores the func args before calling the bpf_prog.
 	 * A PAGE_SIZE "image" is enough to store all trampoline for
@@ -306,10 +307,9 @@ static void *bpf_struct_ops_map_lookup_elem(struct bpf_map *map, void *key)
 
 static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map)
 {
-	const struct btf_type *t = st_map->st_ops_desc->type;
 	u32 i;
 
-	for (i = 0; i < btf_type_vlen(t); i++) {
+	for (i = 0; i < st_map->links_cnt; i++) {
 		if (st_map->links[i]) {
 			bpf_link_put(st_map->links[i]);
 			st_map->links[i] = NULL;
@@ -641,12 +641,20 @@ static void __bpf_struct_ops_map_free(struct bpf_map *map)
 		bpf_jit_uncharge_modmem(PAGE_SIZE);
 	}
 	bpf_map_area_free(st_map->uvalue);
-	btf_put(st_map->btf);
 	bpf_map_area_free(st_map);
 }
 
 static void bpf_struct_ops_map_free(struct bpf_map *map)
 {
+	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
+
+	/* st_ops->owner was acquired during map_alloc to implicitly holds
+	 * the btf's refcnt. The acquire was only done when btf_is_module()
+	 * st_map->btf cannot be NULL here.
+	 */
+	if (btf_is_module(st_map->btf))
+		module_put(st_map->st_ops_desc->st_ops->owner);
+
 	/* The struct_ops's function may switch to another struct_ops.
 	 *
 	 * For example, bpf_tcp_cc_x->init() may switch to
@@ -682,6 +690,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
 	size_t st_map_size;
 	struct bpf_struct_ops_map *st_map;
 	const struct btf_type *t, *vt;
+	struct module *mod = NULL;
 	struct bpf_map *map;
 	struct btf *btf;
 	int ret;
@@ -695,11 +704,18 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
 			btf_put(btf);
 			return ERR_PTR(-EINVAL);
 		}
+
+		mod = btf_try_get_module(btf);
+		/* mod holds a refcnt to btf. We don't need an extra refcnt
+		 * here.
+		 */
+		btf_put(btf);
+		if (!mod)
+			return ERR_PTR(-EINVAL);
 	} else {
 		btf = bpf_get_btf_vmlinux();
 		if (IS_ERR(btf))
 			return ERR_CAST(btf);
-		btf_get(btf);
 	}
 
 	st_ops_desc = bpf_struct_ops_find_value(btf, attr->btf_vmlinux_value_type_id);
@@ -746,8 +762,9 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
 		goto errout_free;
 	}
 	st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE);
+	st_map->links_cnt = btf_type_vlen(t);
 	st_map->links =
-		bpf_map_area_alloc(btf_type_vlen(t) * sizeof(struct bpf_links *),
+		bpf_map_area_alloc(st_map->links_cnt * sizeof(struct bpf_links *),
 				   NUMA_NO_NODE);
 	if (!st_map->uvalue || !st_map->links) {
 		ret = -ENOMEM;
@@ -763,7 +780,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
 errout_free:
 	__bpf_struct_ops_map_free(map);
 errout:
-	btf_put(btf);
+	module_put(mod);
 
 	return ERR_PTR(ret);
 }
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6081512deb79..f31868ba0c2d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -20299,6 +20299,15 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
 	}
 
 	btf = prog->aux->attach_btf ?: bpf_get_btf_vmlinux();
+	if (btf_is_module(btf)) {
+		/* Make sure st_ops is valid through the lifetime of env */
+		env->attach_btf_mod = btf_try_get_module(btf);
+		if (!env->attach_btf_mod) {
+			verbose(env, "struct_ops module %s is not found\n",
+				btf_get_name(btf));
+			return -ENOTSUPP;
+		}
+	}
 
 	btf_id = prog->aux->attach_btf_id;
 	st_ops_desc = bpf_struct_ops_find(btf, btf_id);
@@ -21024,6 +21033,8 @@ err_release_maps:
 		env->prog->expected_attach_type = 0;
 
 	*prog = env->prog;
+
+	module_put(env->attach_btf_mod);
 err_unlock:
 	if (!is_priv)
 		mutex_unlock(&bpf_verifier_lock);
-- 
cgit v1.2.3


From 612d087d4ba54cef47946e22e5dabad762dd7ed5 Mon Sep 17 00:00:00 2001
From: Kui-Feng Lee <thinker.li@gmail.com>
Date: Fri, 19 Jan 2024 14:50:01 -0800
Subject: bpf: validate value_type

A value_type should consist of three components: refcnt, state, and data.
refcnt and state has been move to struct bpf_struct_ops_common_value to
make it easier to check the value type.

Signed-off-by: Kui-Feng Lee <thinker.li@gmail.com>
Link: https://lore.kernel.org/r/20240119225005.668602-11-thinker.li@gmail.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/linux/bpf.h         | 12 ++++++
 kernel/bpf/bpf_struct_ops.c | 93 +++++++++++++++++++++++++++++----------------
 2 files changed, 72 insertions(+), 33 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index a5b425893d38..7c178170f93f 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1688,6 +1688,18 @@ struct bpf_struct_ops_desc {
 	u32 value_id;
 };
 
+enum bpf_struct_ops_state {
+	BPF_STRUCT_OPS_STATE_INIT,
+	BPF_STRUCT_OPS_STATE_INUSE,
+	BPF_STRUCT_OPS_STATE_TOBEFREE,
+	BPF_STRUCT_OPS_STATE_READY,
+};
+
+struct bpf_struct_ops_common_value {
+	refcount_t refcnt;
+	enum bpf_struct_ops_state state;
+};
+
 #if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL)
 #define BPF_MODULE_OWNER ((void *)((0xeB9FUL << 2) + POISON_POINTER_DELTA))
 const struct bpf_struct_ops_desc *bpf_struct_ops_find(struct btf *btf, u32 type_id);
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 02216a8d9265..30ab34fab0f8 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -13,19 +13,8 @@
 #include <linux/btf_ids.h>
 #include <linux/rcupdate_wait.h>
 
-enum bpf_struct_ops_state {
-	BPF_STRUCT_OPS_STATE_INIT,
-	BPF_STRUCT_OPS_STATE_INUSE,
-	BPF_STRUCT_OPS_STATE_TOBEFREE,
-	BPF_STRUCT_OPS_STATE_READY,
-};
-
-#define BPF_STRUCT_OPS_COMMON_VALUE			\
-	refcount_t refcnt;				\
-	enum bpf_struct_ops_state state
-
 struct bpf_struct_ops_value {
-	BPF_STRUCT_OPS_COMMON_VALUE;
+	struct bpf_struct_ops_common_value common;
 	char data[] ____cacheline_aligned_in_smp;
 };
 
@@ -81,8 +70,8 @@ static DEFINE_MUTEX(update_mutex);
 #define BPF_STRUCT_OPS_TYPE(_name)				\
 extern struct bpf_struct_ops bpf_##_name;			\
 								\
-struct bpf_struct_ops_##_name {						\
-	BPF_STRUCT_OPS_COMMON_VALUE;				\
+struct bpf_struct_ops_##_name {					\
+	struct bpf_struct_ops_common_value common;		\
 	struct _name data ____cacheline_aligned_in_smp;		\
 };
 #include "bpf_struct_ops_types.h"
@@ -113,11 +102,49 @@ const struct bpf_prog_ops bpf_struct_ops_prog_ops = {
 
 BTF_ID_LIST(st_ops_ids)
 BTF_ID(struct, module)
+BTF_ID(struct, bpf_struct_ops_common_value)
 
 enum {
 	IDX_MODULE_ID,
+	IDX_ST_OPS_COMMON_VALUE_ID,
 };
 
+extern struct btf *btf_vmlinux;
+
+static bool is_valid_value_type(struct btf *btf, s32 value_id,
+				const struct btf_type *type,
+				const char *value_name)
+{
+	const struct btf_type *common_value_type;
+	const struct btf_member *member;
+	const struct btf_type *vt, *mt;
+
+	vt = btf_type_by_id(btf, value_id);
+	if (btf_vlen(vt) != 2) {
+		pr_warn("The number of %s's members should be 2, but we get %d\n",
+			value_name, btf_vlen(vt));
+		return false;
+	}
+	member = btf_type_member(vt);
+	mt = btf_type_by_id(btf, member->type);
+	common_value_type = btf_type_by_id(btf_vmlinux,
+					   st_ops_ids[IDX_ST_OPS_COMMON_VALUE_ID]);
+	if (mt != common_value_type) {
+		pr_warn("The first member of %s should be bpf_struct_ops_common_value\n",
+			value_name);
+		return false;
+	}
+	member++;
+	mt = btf_type_by_id(btf, member->type);
+	if (mt != type) {
+		pr_warn("The second member of %s should be %s\n",
+			value_name, btf_name_by_offset(btf, type->name_off));
+		return false;
+	}
+
+	return true;
+}
+
 static void bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc,
 				     struct btf *btf,
 				     struct bpf_verifier_log *log)
@@ -138,14 +165,6 @@ static void bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc,
 	}
 	sprintf(value_name, "%s%s", VALUE_PREFIX, st_ops->name);
 
-	value_id = btf_find_by_name_kind(btf, value_name,
-					 BTF_KIND_STRUCT);
-	if (value_id < 0) {
-		pr_warn("Cannot find struct %s in %s\n",
-			value_name, btf_get_name(btf));
-		return;
-	}
-
 	type_id = btf_find_by_name_kind(btf, st_ops->name,
 					BTF_KIND_STRUCT);
 	if (type_id < 0) {
@@ -160,6 +179,16 @@ static void bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc,
 		return;
 	}
 
+	value_id = btf_find_by_name_kind(btf, value_name,
+					 BTF_KIND_STRUCT);
+	if (value_id < 0) {
+		pr_warn("Cannot find struct %s in %s\n",
+			value_name, btf_get_name(btf));
+		return;
+	}
+	if (!is_valid_value_type(btf, value_id, t, value_name))
+		return;
+
 	for_each_member(i, t, member) {
 		const struct btf_type *func_proto;
 
@@ -219,8 +248,6 @@ void bpf_struct_ops_init(struct btf *btf, struct bpf_verifier_log *log)
 	}
 }
 
-extern struct btf *btf_vmlinux;
-
 static const struct bpf_struct_ops_desc *
 bpf_struct_ops_find_value(struct btf *btf, u32 value_id)
 {
@@ -276,7 +303,7 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
 
 	kvalue = &st_map->kvalue;
 	/* Pair with smp_store_release() during map_update */
-	state = smp_load_acquire(&kvalue->state);
+	state = smp_load_acquire(&kvalue->common.state);
 	if (state == BPF_STRUCT_OPS_STATE_INIT) {
 		memset(value, 0, map->value_size);
 		return 0;
@@ -287,7 +314,7 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
 	 */
 	uvalue = value;
 	memcpy(uvalue, st_map->uvalue, map->value_size);
-	uvalue->state = state;
+	uvalue->common.state = state;
 
 	/* This value offers the user space a general estimate of how
 	 * many sockets are still utilizing this struct_ops for TCP
@@ -295,7 +322,7 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
 	 * should sufficiently meet our present goals.
 	 */
 	refcnt = atomic64_read(&map->refcnt) - atomic64_read(&map->usercnt);
-	refcount_set(&uvalue->refcnt, max_t(s64, refcnt, 0));
+	refcount_set(&uvalue->common.refcnt, max_t(s64, refcnt, 0));
 
 	return 0;
 }
@@ -413,7 +440,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 	if (err)
 		return err;
 
-	if (uvalue->state || refcount_read(&uvalue->refcnt))
+	if (uvalue->common.state || refcount_read(&uvalue->common.refcnt))
 		return -EINVAL;
 
 	tlinks = kcalloc(BPF_TRAMP_MAX, sizeof(*tlinks), GFP_KERNEL);
@@ -425,7 +452,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 
 	mutex_lock(&st_map->lock);
 
-	if (kvalue->state != BPF_STRUCT_OPS_STATE_INIT) {
+	if (kvalue->common.state != BPF_STRUCT_OPS_STATE_INIT) {
 		err = -EBUSY;
 		goto unlock;
 	}
@@ -540,7 +567,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 		 *
 		 * Pair with smp_load_acquire() during lookup_elem().
 		 */
-		smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_READY);
+		smp_store_release(&kvalue->common.state, BPF_STRUCT_OPS_STATE_READY);
 		goto unlock;
 	}
 
@@ -558,7 +585,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 		 * It ensures the above udata updates (e.g. prog->aux->id)
 		 * can be seen once BPF_STRUCT_OPS_STATE_INUSE is set.
 		 */
-		smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_INUSE);
+		smp_store_release(&kvalue->common.state, BPF_STRUCT_OPS_STATE_INUSE);
 		goto unlock;
 	}
 
@@ -588,7 +615,7 @@ static long bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
 	if (st_map->map.map_flags & BPF_F_LINK)
 		return -EOPNOTSUPP;
 
-	prev_state = cmpxchg(&st_map->kvalue.state,
+	prev_state = cmpxchg(&st_map->kvalue.common.state,
 			     BPF_STRUCT_OPS_STATE_INUSE,
 			     BPF_STRUCT_OPS_STATE_TOBEFREE);
 	switch (prev_state) {
@@ -848,7 +875,7 @@ static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map)
 	return map->map_type == BPF_MAP_TYPE_STRUCT_OPS &&
 		map->map_flags & BPF_F_LINK &&
 		/* Pair with smp_store_release() during map_update */
-		smp_load_acquire(&st_map->kvalue.state) == BPF_STRUCT_OPS_STATE_READY;
+		smp_load_acquire(&st_map->kvalue.common.state) == BPF_STRUCT_OPS_STATE_READY;
 }
 
 static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link)
-- 
cgit v1.2.3


From f6be98d19985411ca1f3d53413d94d5b7f41c200 Mon Sep 17 00:00:00 2001
From: Kui-Feng Lee <thinker.li@gmail.com>
Date: Fri, 19 Jan 2024 14:50:02 -0800
Subject: bpf, net: switch to dynamic registration

Replace the static list of struct_ops types with per-btf struct_ops_tab to
enable dynamic registration.

Both bpf_dummy_ops and bpf_tcp_ca now utilize the registration function
instead of being listed in bpf_struct_ops_types.h.

Cc: netdev@vger.kernel.org
Signed-off-by: Kui-Feng Lee <thinker.li@gmail.com>
Link: https://lore.kernel.org/r/20240119225005.668602-12-thinker.li@gmail.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/linux/bpf.h               |  27 ++++++----
 include/linux/btf.h               |  12 +++++
 kernel/bpf/bpf_struct_ops.c       | 100 +++++---------------------------------
 kernel/bpf/bpf_struct_ops_types.h |  12 -----
 kernel/bpf/btf.c                  |  86 ++++++++++++++++++++++++++++++--
 net/bpf/bpf_dummy_struct_ops.c    |  11 ++++-
 net/ipv4/bpf_tcp_ca.c             |  12 +++--
 7 files changed, 142 insertions(+), 118 deletions(-)
 delete mode 100644 kernel/bpf/bpf_struct_ops_types.h

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 7c178170f93f..75b7f9b19c6a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1701,9 +1701,20 @@ struct bpf_struct_ops_common_value {
 };
 
 #if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL)
+/* This macro helps developer to register a struct_ops type and generate
+ * type information correctly. Developers should use this macro to register
+ * a struct_ops type instead of calling __register_bpf_struct_ops() directly.
+ */
+#define register_bpf_struct_ops(st_ops, type)				\
+	({								\
+		struct bpf_struct_ops_##type {				\
+			struct bpf_struct_ops_common_value common;	\
+			struct type data ____cacheline_aligned_in_smp;	\
+		};							\
+		BTF_TYPE_EMIT(struct bpf_struct_ops_##type);		\
+		__register_bpf_struct_ops(st_ops);			\
+	})
 #define BPF_MODULE_OWNER ((void *)((0xeB9FUL << 2) + POISON_POINTER_DELTA))
-const struct bpf_struct_ops_desc *bpf_struct_ops_find(struct btf *btf, u32 type_id);
-void bpf_struct_ops_init(struct btf *btf, struct bpf_verifier_log *log);
 bool bpf_struct_ops_get(const void *kdata);
 void bpf_struct_ops_put(const void *kdata);
 int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
@@ -1745,16 +1756,12 @@ struct bpf_dummy_ops {
 int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
 			    union bpf_attr __user *uattr);
 #endif
+int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc,
+			     struct btf *btf,
+			     struct bpf_verifier_log *log);
 void bpf_map_struct_ops_info_fill(struct bpf_map_info *info, struct bpf_map *map);
 #else
-static inline const struct bpf_struct_ops_desc *bpf_struct_ops_find(struct btf *btf, u32 type_id)
-{
-	return NULL;
-}
-static inline void bpf_struct_ops_init(struct btf *btf,
-				       struct bpf_verifier_log *log)
-{
-}
+#define register_bpf_struct_ops(st_ops, type) ({ (void *)(st_ops); 0; })
 static inline bool bpf_try_module_get(const void *data, struct module *owner)
 {
 	return try_module_get(owner);
diff --git a/include/linux/btf.h b/include/linux/btf.h
index 932af1680bb5..1ee8977b8c95 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -497,6 +497,18 @@ static inline void *btf_id_set8_contains(const struct btf_id_set8 *set, u32 id)
 
 struct bpf_verifier_log;
 
+#if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL)
+struct bpf_struct_ops;
+int __register_bpf_struct_ops(struct bpf_struct_ops *st_ops);
+const struct bpf_struct_ops_desc *bpf_struct_ops_find_value(struct btf *btf, u32 value_id);
+const struct bpf_struct_ops_desc *bpf_struct_ops_find(struct btf *btf, u32 type_id);
+#else
+static inline const struct bpf_struct_ops_desc *bpf_struct_ops_find(struct btf *btf, u32 type_id)
+{
+	return NULL;
+}
+#endif
+
 #ifdef CONFIG_BPF_SYSCALL
 const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id);
 const char *btf_name_by_offset(const struct btf *btf, u32 offset);
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 30ab34fab0f8..defc052e4622 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -62,35 +62,6 @@ static DEFINE_MUTEX(update_mutex);
 #define VALUE_PREFIX "bpf_struct_ops_"
 #define VALUE_PREFIX_LEN (sizeof(VALUE_PREFIX) - 1)
 
-/* bpf_struct_ops_##_name (e.g. bpf_struct_ops_tcp_congestion_ops) is
- * the map's value exposed to the userspace and its btf-type-id is
- * stored at the map->btf_vmlinux_value_type_id.
- *
- */
-#define BPF_STRUCT_OPS_TYPE(_name)				\
-extern struct bpf_struct_ops bpf_##_name;			\
-								\
-struct bpf_struct_ops_##_name {					\
-	struct bpf_struct_ops_common_value common;		\
-	struct _name data ____cacheline_aligned_in_smp;		\
-};
-#include "bpf_struct_ops_types.h"
-#undef BPF_STRUCT_OPS_TYPE
-
-enum {
-#define BPF_STRUCT_OPS_TYPE(_name) BPF_STRUCT_OPS_TYPE_##_name,
-#include "bpf_struct_ops_types.h"
-#undef BPF_STRUCT_OPS_TYPE
-	__NR_BPF_STRUCT_OPS_TYPE,
-};
-
-static struct bpf_struct_ops_desc bpf_struct_ops[] = {
-#define BPF_STRUCT_OPS_TYPE(_name)				\
-	[BPF_STRUCT_OPS_TYPE_##_name] = { .st_ops = &bpf_##_name },
-#include "bpf_struct_ops_types.h"
-#undef BPF_STRUCT_OPS_TYPE
-};
-
 const struct bpf_verifier_ops bpf_struct_ops_verifier_ops = {
 };
 
@@ -145,9 +116,9 @@ static bool is_valid_value_type(struct btf *btf, s32 value_id,
 	return true;
 }
 
-static void bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc,
-				     struct btf *btf,
-				     struct bpf_verifier_log *log)
+int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc,
+			     struct btf *btf,
+			     struct bpf_verifier_log *log)
 {
 	struct bpf_struct_ops *st_ops = st_ops_desc->st_ops;
 	const struct btf_member *member;
@@ -161,7 +132,7 @@ static void bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc,
 	    sizeof(value_name)) {
 		pr_warn("struct_ops name %s is too long\n",
 			st_ops->name);
-		return;
+		return -EINVAL;
 	}
 	sprintf(value_name, "%s%s", VALUE_PREFIX, st_ops->name);
 
@@ -170,13 +141,13 @@ static void bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc,
 	if (type_id < 0) {
 		pr_warn("Cannot find struct %s in %s\n",
 			st_ops->name, btf_get_name(btf));
-		return;
+		return -EINVAL;
 	}
 	t = btf_type_by_id(btf, type_id);
 	if (btf_type_vlen(t) > BPF_STRUCT_OPS_MAX_NR_MEMBERS) {
 		pr_warn("Cannot support #%u members in struct %s\n",
 			btf_type_vlen(t), st_ops->name);
-		return;
+		return -EINVAL;
 	}
 
 	value_id = btf_find_by_name_kind(btf, value_name,
@@ -184,10 +155,10 @@ static void bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc,
 	if (value_id < 0) {
 		pr_warn("Cannot find struct %s in %s\n",
 			value_name, btf_get_name(btf));
-		return;
+		return -EINVAL;
 	}
 	if (!is_valid_value_type(btf, value_id, t, value_name))
-		return;
+		return -EINVAL;
 
 	for_each_member(i, t, member) {
 		const struct btf_type *func_proto;
@@ -196,13 +167,13 @@ static void bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc,
 		if (!*mname) {
 			pr_warn("anon member in struct %s is not supported\n",
 				st_ops->name);
-			break;
+			return -EOPNOTSUPP;
 		}
 
 		if (__btf_member_bitfield_size(t, member)) {
 			pr_warn("bit field member %s in struct %s is not supported\n",
 				mname, st_ops->name);
-			break;
+			return -EOPNOTSUPP;
 		}
 
 		func_proto = btf_type_resolve_func_ptr(btf,
@@ -214,7 +185,7 @@ static void bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc,
 					   &st_ops->func_models[i])) {
 			pr_warn("Error in parsing func ptr %s in struct %s\n",
 				mname, st_ops->name);
-			break;
+			return -EINVAL;
 		}
 	}
 
@@ -222,6 +193,7 @@ static void bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc,
 		if (st_ops->init(btf)) {
 			pr_warn("Error in init bpf_struct_ops %s\n",
 				st_ops->name);
+			return -EINVAL;
 		} else {
 			st_ops_desc->type_id = type_id;
 			st_ops_desc->type = t;
@@ -230,54 +202,8 @@ static void bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc,
 								 value_id);
 		}
 	}
-}
 
-void bpf_struct_ops_init(struct btf *btf, struct bpf_verifier_log *log)
-{
-	struct bpf_struct_ops_desc *st_ops_desc;
-	u32 i;
-
-	/* Ensure BTF type is emitted for "struct bpf_struct_ops_##_name" */
-#define BPF_STRUCT_OPS_TYPE(_name) BTF_TYPE_EMIT(struct bpf_struct_ops_##_name);
-#include "bpf_struct_ops_types.h"
-#undef BPF_STRUCT_OPS_TYPE
-
-	for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) {
-		st_ops_desc = &bpf_struct_ops[i];
-		bpf_struct_ops_desc_init(st_ops_desc, btf, log);
-	}
-}
-
-static const struct bpf_struct_ops_desc *
-bpf_struct_ops_find_value(struct btf *btf, u32 value_id)
-{
-	unsigned int i;
-
-	if (!value_id || !btf)
-		return NULL;
-
-	for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) {
-		if (bpf_struct_ops[i].value_id == value_id)
-			return &bpf_struct_ops[i];
-	}
-
-	return NULL;
-}
-
-const struct bpf_struct_ops_desc *
-bpf_struct_ops_find(struct btf *btf, u32 type_id)
-{
-	unsigned int i;
-
-	if (!type_id || !btf)
-		return NULL;
-
-	for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) {
-		if (bpf_struct_ops[i].type_id == type_id)
-			return &bpf_struct_ops[i];
-	}
-
-	return NULL;
+	return 0;
 }
 
 static int bpf_struct_ops_map_get_next_key(struct bpf_map *map, void *key,
diff --git a/kernel/bpf/bpf_struct_ops_types.h b/kernel/bpf/bpf_struct_ops_types.h
deleted file mode 100644
index 5678a9ddf817..000000000000
--- a/kernel/bpf/bpf_struct_ops_types.h
+++ /dev/null
@@ -1,12 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/* internal file - do not include directly */
-
-#ifdef CONFIG_BPF_JIT
-#ifdef CONFIG_NET
-BPF_STRUCT_OPS_TYPE(bpf_dummy_ops)
-#endif
-#ifdef CONFIG_INET
-#include <net/tcp.h>
-BPF_STRUCT_OPS_TYPE(tcp_congestion_ops)
-#endif
-#endif
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 2e5e6ec5d965..5c3e526a2dec 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -19,6 +19,7 @@
 #include <linux/bpf_verifier.h>
 #include <linux/btf.h>
 #include <linux/btf_ids.h>
+#include <linux/bpf.h>
 #include <linux/bpf_lsm.h>
 #include <linux/skmsg.h>
 #include <linux/perf_event.h>
@@ -5972,8 +5973,6 @@ struct btf *btf_parse_vmlinux(void)
 	/* btf_parse_vmlinux() runs under bpf_verifier_lock */
 	bpf_ctx_convert.t = btf_type_by_id(btf, bpf_ctx_convert_btf_id[0]);
 
-	bpf_struct_ops_init(btf, log);
-
 	refcount_set(&btf->refcnt, 1);
 
 	err = btf_alloc_id(btf);
@@ -8706,11 +8705,13 @@ bool btf_type_ids_nocast_alias(struct bpf_verifier_log *log,
 	return !strncmp(reg_name, arg_name, cmp_len);
 }
 
+#ifdef CONFIG_BPF_JIT
 static int
-btf_add_struct_ops(struct btf *btf, struct bpf_struct_ops *st_ops)
+btf_add_struct_ops(struct btf *btf, struct bpf_struct_ops *st_ops,
+		   struct bpf_verifier_log *log)
 {
 	struct btf_struct_ops_tab *tab, *new_tab;
-	int i;
+	int i, err;
 
 	tab = btf->struct_ops_tab;
 	if (!tab) {
@@ -8740,7 +8741,84 @@ btf_add_struct_ops(struct btf *btf, struct bpf_struct_ops *st_ops)
 
 	tab->ops[btf->struct_ops_tab->cnt].st_ops = st_ops;
 
+	err = bpf_struct_ops_desc_init(&tab->ops[btf->struct_ops_tab->cnt], btf, log);
+	if (err)
+		return err;
+
 	btf->struct_ops_tab->cnt++;
 
 	return 0;
 }
+
+const struct bpf_struct_ops_desc *
+bpf_struct_ops_find_value(struct btf *btf, u32 value_id)
+{
+	const struct bpf_struct_ops_desc *st_ops_list;
+	unsigned int i;
+	u32 cnt;
+
+	if (!value_id)
+		return NULL;
+	if (!btf->struct_ops_tab)
+		return NULL;
+
+	cnt = btf->struct_ops_tab->cnt;
+	st_ops_list = btf->struct_ops_tab->ops;
+	for (i = 0; i < cnt; i++) {
+		if (st_ops_list[i].value_id == value_id)
+			return &st_ops_list[i];
+	}
+
+	return NULL;
+}
+
+const struct bpf_struct_ops_desc *
+bpf_struct_ops_find(struct btf *btf, u32 type_id)
+{
+	const struct bpf_struct_ops_desc *st_ops_list;
+	unsigned int i;
+	u32 cnt;
+
+	if (!type_id)
+		return NULL;
+	if (!btf->struct_ops_tab)
+		return NULL;
+
+	cnt = btf->struct_ops_tab->cnt;
+	st_ops_list = btf->struct_ops_tab->ops;
+	for (i = 0; i < cnt; i++) {
+		if (st_ops_list[i].type_id == type_id)
+			return &st_ops_list[i];
+	}
+
+	return NULL;
+}
+
+int __register_bpf_struct_ops(struct bpf_struct_ops *st_ops)
+{
+	struct bpf_verifier_log *log;
+	struct btf *btf;
+	int err = 0;
+
+	btf = btf_get_module_btf(st_ops->owner);
+	if (!btf)
+		return -EINVAL;
+
+	log = kzalloc(sizeof(*log), GFP_KERNEL | __GFP_NOWARN);
+	if (!log) {
+		err = -ENOMEM;
+		goto errout;
+	}
+
+	log->level = BPF_LOG_KERNEL;
+
+	err = btf_add_struct_ops(btf, st_ops, log);
+
+errout:
+	kfree(log);
+	btf_put(btf);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(__register_bpf_struct_ops);
+#endif
diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c
index ba2c58dba2da..02de71719aed 100644
--- a/net/bpf/bpf_dummy_struct_ops.c
+++ b/net/bpf/bpf_dummy_struct_ops.c
@@ -7,7 +7,7 @@
 #include <linux/bpf.h>
 #include <linux/btf.h>
 
-extern struct bpf_struct_ops bpf_bpf_dummy_ops;
+static struct bpf_struct_ops bpf_bpf_dummy_ops;
 
 /* A common type for test_N with return value in bpf_dummy_ops */
 typedef int (*dummy_ops_test_ret_fn)(struct bpf_dummy_ops_state *state, ...);
@@ -256,7 +256,7 @@ static struct bpf_dummy_ops __bpf_bpf_dummy_ops = {
 	.test_sleepable = bpf_dummy_test_sleepable,
 };
 
-struct bpf_struct_ops bpf_bpf_dummy_ops = {
+static struct bpf_struct_ops bpf_bpf_dummy_ops = {
 	.verifier_ops = &bpf_dummy_verifier_ops,
 	.init = bpf_dummy_init,
 	.check_member = bpf_dummy_ops_check_member,
@@ -265,4 +265,11 @@ struct bpf_struct_ops bpf_bpf_dummy_ops = {
 	.unreg = bpf_dummy_unreg,
 	.name = "bpf_dummy_ops",
 	.cfi_stubs = &__bpf_bpf_dummy_ops,
+	.owner = THIS_MODULE,
 };
+
+static int __init bpf_dummy_struct_ops_init(void)
+{
+	return register_bpf_struct_ops(&bpf_bpf_dummy_ops, bpf_dummy_ops);
+}
+late_initcall(bpf_dummy_struct_ops_init);
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index dffd8828079b..8e7716256d3c 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -12,7 +12,7 @@
 #include <net/bpf_sk_storage.h>
 
 /* "extern" is to avoid sparse warning.  It is only used in bpf_struct_ops.c. */
-extern struct bpf_struct_ops bpf_tcp_congestion_ops;
+static struct bpf_struct_ops bpf_tcp_congestion_ops;
 
 static u32 unsupported_ops[] = {
 	offsetof(struct tcp_congestion_ops, get_info),
@@ -345,7 +345,7 @@ static struct tcp_congestion_ops __bpf_ops_tcp_congestion_ops = {
 	.release = __bpf_tcp_ca_release,
 };
 
-struct bpf_struct_ops bpf_tcp_congestion_ops = {
+static struct bpf_struct_ops bpf_tcp_congestion_ops = {
 	.verifier_ops = &bpf_tcp_ca_verifier_ops,
 	.reg = bpf_tcp_ca_reg,
 	.unreg = bpf_tcp_ca_unreg,
@@ -356,10 +356,16 @@ struct bpf_struct_ops bpf_tcp_congestion_ops = {
 	.validate = bpf_tcp_ca_validate,
 	.name = "tcp_congestion_ops",
 	.cfi_stubs = &__bpf_ops_tcp_congestion_ops,
+	.owner = THIS_MODULE,
 };
 
 static int __init bpf_tcp_ca_kfunc_init(void)
 {
-	return register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &bpf_tcp_ca_kfunc_set);
+	int ret;
+
+	ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &bpf_tcp_ca_kfunc_set);
+	ret = ret ?: register_bpf_struct_ops(&bpf_tcp_congestion_ops, tcp_congestion_ops);
+
+	return ret;
 }
 late_initcall(bpf_tcp_ca_kfunc_init);
-- 
cgit v1.2.3


From 20df28fb5bd8081a05ec34542bd45e4f3feeced5 Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao@debian.org>
Date: Mon, 22 Jan 2024 10:19:55 -0800
Subject: net/ipv6: resolve warning in ip6_fib.c

In some configurations, the 'iter' variable in function
fib6_repair_tree() is unused, resulting the following warning when
compiled with W=1.

    net/ipv6/ip6_fib.c:1781:6: warning: variable 'iter' set but not used [-Wunused-but-set-variable]
     1781 |         int iter = 0;
	  |             ^

It is unclear what is the advantage of this RT6_TRACE() macro[1], since
users can control pr_debug() in runtime, which is better than at
compilation time. pr_debug() has no overhead when disabled.

Remove the RT6_TRACE() in favor of simple pr_debug() helpers.

[1] Link: https://lore.kernel.org/all/ZZwSEJv2HgI0cD4J@gmail.com/
Signed-off-by: Breno Leitao <leitao@debian.org>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20240122181955.2391676-2-leitao@debian.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/ip6_fib.h |  6 ------
 net/ipv6/ip6_fib.c    | 15 +++++++++------
 net/ipv6/route.c      |  8 ++++----
 3 files changed, 13 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 9ba6413fd2e3..360b12e61850 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -30,12 +30,6 @@
 
 #define RT6_DEBUG 2
 
-#if RT6_DEBUG >= 3
-#define RT6_TRACE(x...) pr_debug(x)
-#else
-#define RT6_TRACE(x...) do { ; } while (0)
-#endif
-
 struct rt6_info;
 struct fib6_info;
 
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index fb41bec6b4b5..38a0348b1d17 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -1801,7 +1801,7 @@ static struct fib6_node *fib6_repair_tree(struct net *net,
 					    lockdep_is_held(&table->tb6_lock));
 		struct fib6_info *new_fn_leaf;
 
-		RT6_TRACE("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter);
+		pr_debug("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter);
 		iter++;
 
 		WARN_ON(fn->fn_flags & RTN_RTINFO);
@@ -1864,7 +1864,8 @@ static struct fib6_node *fib6_repair_tree(struct net *net,
 		FOR_WALKERS(net, w) {
 			if (!child) {
 				if (w->node == fn) {
-					RT6_TRACE("W %p adjusted by delnode 1, s=%d/%d\n", w, w->state, nstate);
+					pr_debug("W %p adjusted by delnode 1, s=%d/%d\n",
+						 w, w->state, nstate);
 					w->node = pn;
 					w->state = nstate;
 				}
@@ -1872,10 +1873,12 @@ static struct fib6_node *fib6_repair_tree(struct net *net,
 				if (w->node == fn) {
 					w->node = child;
 					if (children&2) {
-						RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state);
+						pr_debug("W %p adjusted by delnode 2, s=%d\n",
+							 w, w->state);
 						w->state = w->state >= FWS_R ? FWS_U : FWS_INIT;
 					} else {
-						RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state);
+						pr_debug("W %p adjusted by delnode 2, s=%d\n",
+							 w, w->state);
 						w->state = w->state >= FWS_C ? FWS_U : FWS_INIT;
 					}
 				}
@@ -1951,7 +1954,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
 	read_lock(&net->ipv6.fib6_walker_lock);
 	FOR_WALKERS(net, w) {
 		if (w->state == FWS_C && w->leaf == rt) {
-			RT6_TRACE("walker %p adjusted by delroute\n", w);
+			pr_debug("walker %p adjusted by delroute\n", w);
 			w->leaf = rcu_dereference_protected(rt->fib6_next,
 					    lockdep_is_held(&table->tb6_lock));
 			if (!w->leaf)
@@ -2289,7 +2292,7 @@ static int fib6_age(struct fib6_info *rt, void *arg)
 
 	if (rt->fib6_flags & RTF_EXPIRES && rt->expires) {
 		if (time_after(now, rt->expires)) {
-			RT6_TRACE("expiring %p\n", rt);
+			pr_debug("expiring %p\n", rt);
 			return -1;
 		}
 		gc_args->more++;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index ea1dec8448fc..63b4c6056582 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2085,12 +2085,12 @@ static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
 	 */
 	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
 		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
-			RT6_TRACE("aging clone %p\n", rt);
+			pr_debug("aging clone %p\n", rt);
 			rt6_remove_exception(bucket, rt6_ex);
 			return;
 		}
 	} else if (time_after(jiffies, rt->dst.expires)) {
-		RT6_TRACE("purging expired route %p\n", rt);
+		pr_debug("purging expired route %p\n", rt);
 		rt6_remove_exception(bucket, rt6_ex);
 		return;
 	}
@@ -2101,8 +2101,8 @@ static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
 		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
 
 		if (!(neigh && (neigh->flags & NTF_ROUTER))) {
-			RT6_TRACE("purging route %p via non-router but gateway\n",
-				  rt);
+			pr_debug("purging route %p via non-router but gateway\n",
+				 rt);
 			rt6_remove_exception(bucket, rt6_ex);
 			return;
 		}
-- 
cgit v1.2.3


From 6fe01d3cbb924a72493eb3f4722dfcfd1c194234 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Tue, 23 Jan 2024 18:20:59 -0800
Subject: bpf: Add BPF token delegation mount options to BPF FS

Add few new mount options to BPF FS that allow to specify that a given
BPF FS instance allows creation of BPF token (added in the next patch),
and what sort of operations are allowed under BPF token. As such, we get
4 new mount options, each is a bit mask
  - `delegate_cmds` allow to specify which bpf() syscall commands are
    allowed with BPF token derived from this BPF FS instance;
  - if BPF_MAP_CREATE command is allowed, `delegate_maps` specifies
    a set of allowable BPF map types that could be created with BPF token;
  - if BPF_PROG_LOAD command is allowed, `delegate_progs` specifies
    a set of allowable BPF program types that could be loaded with BPF token;
  - if BPF_PROG_LOAD command is allowed, `delegate_attachs` specifies
    a set of allowable BPF program attach types that could be loaded with
    BPF token; delegate_progs and delegate_attachs are meant to be used
    together, as full BPF program type is, in general, determined
    through both program type and program attach type.

Currently, these mount options accept the following forms of values:
  - a special value "any", that enables all possible values of a given
  bit set;
  - numeric value (decimal or hexadecimal, determined by kernel
  automatically) that specifies a bit mask value directly;
  - all the values for a given mount option are combined, if specified
  multiple times. E.g., `mount -t bpf nodev /path/to/mount -o
  delegate_maps=0x1 -o delegate_maps=0x2` will result in a combined 0x3
  mask.

Ideally, more convenient (for humans) symbolic form derived from
corresponding UAPI enums would be accepted (e.g., `-o
delegate_progs=kprobe|tracepoint`) and I intend to implement this, but
it requires a bunch of UAPI header churn, so I postponed it until this
feature lands upstream or at least there is a definite consensus that
this feature is acceptable and is going to make it, just to minimize
amount of wasted effort and not increase amount of non-essential code to
be reviewed.

Attentive reader will notice that BPF FS is now marked as
FS_USERNS_MOUNT, which theoretically makes it mountable inside non-init
user namespace as long as the process has sufficient *namespaced*
capabilities within that user namespace. But in reality we still
restrict BPF FS to be mountable only by processes with CAP_SYS_ADMIN *in
init userns* (extra check in bpf_fill_super()). FS_USERNS_MOUNT is added
to allow creating BPF FS context object (i.e., fsopen("bpf")) from
inside unprivileged process inside non-init userns, to capture that
userns as the owning userns. It will still be required to pass this
context object back to privileged process to instantiate and mount it.

This manipulation is important, because capturing non-init userns as the
owning userns of BPF FS instance (super block) allows to use that userns
to constraint BPF token to that userns later on (see next patch). So
creating BPF FS with delegation inside unprivileged userns will restrict
derived BPF token objects to only "work" inside that intended userns,
making it scoped to a intended "container". Also, setting these
delegation options requires capable(CAP_SYS_ADMIN), so unprivileged
process cannot set this up without involvement of a privileged process.

There is a set of selftests at the end of the patch set that simulates
this sequence of steps and validates that everything works as intended.
But careful review is requested to make sure there are no missed gaps in
the implementation and testing.

This somewhat subtle set of aspects is the result of previous
discussions ([0]) about various user namespace implications and
interactions with BPF token functionality and is necessary to contain
BPF token inside intended user namespace.

  [0] https://lore.kernel.org/bpf/20230704-hochverdient-lehne-eeb9eeef785e@brauner/

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Christian Brauner <brauner@kernel.org>
Link: https://lore.kernel.org/bpf/20240124022127.2379740-3-andrii@kernel.org
---
 include/linux/bpf.h | 12 +++++++
 kernel/bpf/inode.c  | 90 ++++++++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 90 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 75b7f9b19c6a..28374cec49df 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1609,6 +1609,18 @@ struct bpf_link_primer {
 	u32 id;
 };
 
+struct bpf_mount_opts {
+	kuid_t uid;
+	kgid_t gid;
+	umode_t mode;
+
+	/* BPF token-related delegation options */
+	u64 delegate_cmds;
+	u64 delegate_maps;
+	u64 delegate_progs;
+	u64 delegate_attachs;
+};
+
 struct bpf_struct_ops_value;
 struct btf_member;
 
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 41e0a55c35f5..70b748f6228c 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -20,6 +20,7 @@
 #include <linux/filter.h>
 #include <linux/bpf.h>
 #include <linux/bpf_trace.h>
+#include <linux/kstrtox.h>
 #include "preload/bpf_preload.h"
 
 enum bpf_type {
@@ -601,6 +602,7 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root)
 {
 	struct inode *inode = d_inode(root);
 	umode_t mode = inode->i_mode & S_IALLUGO & ~S_ISVTX;
+	struct bpf_mount_opts *opts = root->d_sb->s_fs_info;
 
 	if (!uid_eq(inode->i_uid, GLOBAL_ROOT_UID))
 		seq_printf(m, ",uid=%u",
@@ -610,6 +612,26 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root)
 			   from_kgid_munged(&init_user_ns, inode->i_gid));
 	if (mode != S_IRWXUGO)
 		seq_printf(m, ",mode=%o", mode);
+
+	if (opts->delegate_cmds == ~0ULL)
+		seq_printf(m, ",delegate_cmds=any");
+	else if (opts->delegate_cmds)
+		seq_printf(m, ",delegate_cmds=0x%llx", opts->delegate_cmds);
+
+	if (opts->delegate_maps == ~0ULL)
+		seq_printf(m, ",delegate_maps=any");
+	else if (opts->delegate_maps)
+		seq_printf(m, ",delegate_maps=0x%llx", opts->delegate_maps);
+
+	if (opts->delegate_progs == ~0ULL)
+		seq_printf(m, ",delegate_progs=any");
+	else if (opts->delegate_progs)
+		seq_printf(m, ",delegate_progs=0x%llx", opts->delegate_progs);
+
+	if (opts->delegate_attachs == ~0ULL)
+		seq_printf(m, ",delegate_attachs=any");
+	else if (opts->delegate_attachs)
+		seq_printf(m, ",delegate_attachs=0x%llx", opts->delegate_attachs);
 	return 0;
 }
 
@@ -635,28 +657,31 @@ enum {
 	OPT_UID,
 	OPT_GID,
 	OPT_MODE,
+	OPT_DELEGATE_CMDS,
+	OPT_DELEGATE_MAPS,
+	OPT_DELEGATE_PROGS,
+	OPT_DELEGATE_ATTACHS,
 };
 
 static const struct fs_parameter_spec bpf_fs_parameters[] = {
 	fsparam_u32	("uid",				OPT_UID),
 	fsparam_u32	("gid",				OPT_GID),
 	fsparam_u32oct	("mode",			OPT_MODE),
+	fsparam_string	("delegate_cmds",		OPT_DELEGATE_CMDS),
+	fsparam_string	("delegate_maps",		OPT_DELEGATE_MAPS),
+	fsparam_string	("delegate_progs",		OPT_DELEGATE_PROGS),
+	fsparam_string	("delegate_attachs",		OPT_DELEGATE_ATTACHS),
 	{}
 };
 
-struct bpf_mount_opts {
-	kuid_t uid;
-	kgid_t gid;
-	umode_t mode;
-};
-
 static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param)
 {
-	struct bpf_mount_opts *opts = fc->fs_private;
+	struct bpf_mount_opts *opts = fc->s_fs_info;
 	struct fs_parse_result result;
 	kuid_t uid;
 	kgid_t gid;
-	int opt;
+	int opt, err;
+	u64 msk;
 
 	opt = fs_parse(fc, bpf_fs_parameters, param, &result);
 	if (opt < 0) {
@@ -708,6 +733,28 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param)
 	case OPT_MODE:
 		opts->mode = result.uint_32 & S_IALLUGO;
 		break;
+	case OPT_DELEGATE_CMDS:
+	case OPT_DELEGATE_MAPS:
+	case OPT_DELEGATE_PROGS:
+	case OPT_DELEGATE_ATTACHS:
+		if (strcmp(param->string, "any") == 0) {
+			msk = ~0ULL;
+		} else {
+			err = kstrtou64(param->string, 0, &msk);
+			if (err)
+				return err;
+		}
+		/* Setting delegation mount options requires privileges */
+		if (msk && !capable(CAP_SYS_ADMIN))
+			return -EPERM;
+		switch (opt) {
+		case OPT_DELEGATE_CMDS: opts->delegate_cmds |= msk; break;
+		case OPT_DELEGATE_MAPS: opts->delegate_maps |= msk; break;
+		case OPT_DELEGATE_PROGS: opts->delegate_progs |= msk; break;
+		case OPT_DELEGATE_ATTACHS: opts->delegate_attachs |= msk; break;
+		default: return -EINVAL;
+		}
+		break;
 	}
 
 	return 0;
@@ -784,10 +831,14 @@ out:
 static int bpf_fill_super(struct super_block *sb, struct fs_context *fc)
 {
 	static const struct tree_descr bpf_rfiles[] = { { "" } };
-	struct bpf_mount_opts *opts = fc->fs_private;
+	struct bpf_mount_opts *opts = sb->s_fs_info;
 	struct inode *inode;
 	int ret;
 
+	/* Mounting an instance of BPF FS requires privileges */
+	if (fc->user_ns != &init_user_ns && !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
 	ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles);
 	if (ret)
 		return ret;
@@ -811,7 +862,7 @@ static int bpf_get_tree(struct fs_context *fc)
 
 static void bpf_free_fc(struct fs_context *fc)
 {
-	kfree(fc->fs_private);
+	kfree(fc->s_fs_info);
 }
 
 static const struct fs_context_operations bpf_context_ops = {
@@ -835,17 +886,32 @@ static int bpf_init_fs_context(struct fs_context *fc)
 	opts->uid = current_fsuid();
 	opts->gid = current_fsgid();
 
-	fc->fs_private = opts;
+	/* start out with no BPF token delegation enabled */
+	opts->delegate_cmds = 0;
+	opts->delegate_maps = 0;
+	opts->delegate_progs = 0;
+	opts->delegate_attachs = 0;
+
+	fc->s_fs_info = opts;
 	fc->ops = &bpf_context_ops;
 	return 0;
 }
 
+static void bpf_kill_super(struct super_block *sb)
+{
+	struct bpf_mount_opts *opts = sb->s_fs_info;
+
+	kill_litter_super(sb);
+	kfree(opts);
+}
+
 static struct file_system_type bpf_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "bpf",
 	.init_fs_context = bpf_init_fs_context,
 	.parameters	= bpf_fs_parameters,
-	.kill_sb	= kill_litter_super,
+	.kill_sb	= bpf_kill_super,
+	.fs_flags	= FS_USERNS_MOUNT,
 };
 
 static int __init bpf_init(void)
-- 
cgit v1.2.3


From 35f96de04127d332a5c5e8a155d31f452f88c76d Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Tue, 23 Jan 2024 18:21:00 -0800
Subject: bpf: Introduce BPF token object

Add new kind of BPF kernel object, BPF token. BPF token is meant to
allow delegating privileged BPF functionality, like loading a BPF
program or creating a BPF map, from privileged process to a *trusted*
unprivileged process, all while having a good amount of control over which
privileged operations could be performed using provided BPF token.

This is achieved through mounting BPF FS instance with extra delegation
mount options, which determine what operations are delegatable, and also
constraining it to the owning user namespace (as mentioned in the
previous patch).

BPF token itself is just a derivative from BPF FS and can be created
through a new bpf() syscall command, BPF_TOKEN_CREATE, which accepts BPF
FS FD, which can be attained through open() API by opening BPF FS mount
point. Currently, BPF token "inherits" delegated command, map types,
prog type, and attach type bit sets from BPF FS as is. In the future,
having an BPF token as a separate object with its own FD, we can allow
to further restrict BPF token's allowable set of things either at the
creation time or after the fact, allowing the process to guard itself
further from unintentionally trying to load undesired kind of BPF
programs. But for now we keep things simple and just copy bit sets as is.

When BPF token is created from BPF FS mount, we take reference to the
BPF super block's owning user namespace, and then use that namespace for
checking all the {CAP_BPF, CAP_PERFMON, CAP_NET_ADMIN, CAP_SYS_ADMIN}
capabilities that are normally only checked against init userns (using
capable()), but now we check them using ns_capable() instead (if BPF
token is provided). See bpf_token_capable() for details.

Such setup means that BPF token in itself is not sufficient to grant BPF
functionality. User namespaced process has to *also* have necessary
combination of capabilities inside that user namespace. So while
previously CAP_BPF was useless when granted within user namespace, now
it gains a meaning and allows container managers and sys admins to have
a flexible control over which processes can and need to use BPF
functionality within the user namespace (i.e., container in practice).
And BPF FS delegation mount options and derived BPF tokens serve as
a per-container "flag" to grant overall ability to use bpf() (plus further
restrict on which parts of bpf() syscalls are treated as namespaced).

Note also, BPF_TOKEN_CREATE command itself requires ns_capable(CAP_BPF)
within the BPF FS owning user namespace, rounding up the ns_capable()
story of BPF token. Also creating BPF token in init user namespace is
currently not supported, given BPF token doesn't have any effect in init
user namespace anyways.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Christian Brauner <brauner@kernel.org>
Link: https://lore.kernel.org/bpf/20240124022127.2379740-4-andrii@kernel.org
---
 include/linux/bpf.h            |  41 ++++++++
 include/uapi/linux/bpf.h       |  37 +++++++
 kernel/bpf/Makefile            |   2 +-
 kernel/bpf/inode.c             |  12 ++-
 kernel/bpf/syscall.c           |  17 ++++
 kernel/bpf/token.c             | 217 +++++++++++++++++++++++++++++++++++++++++
 tools/include/uapi/linux/bpf.h |  37 +++++++
 7 files changed, 357 insertions(+), 6 deletions(-)
 create mode 100644 kernel/bpf/token.c

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 28374cec49df..d9ff7ce547b4 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -52,6 +52,10 @@ struct module;
 struct bpf_func_state;
 struct ftrace_ops;
 struct cgroup;
+struct bpf_token;
+struct user_namespace;
+struct super_block;
+struct inode;
 
 extern struct idr btf_idr;
 extern spinlock_t btf_idr_lock;
@@ -1621,6 +1625,13 @@ struct bpf_mount_opts {
 	u64 delegate_attachs;
 };
 
+struct bpf_token {
+	struct work_struct work;
+	atomic64_t refcnt;
+	struct user_namespace *userns;
+	u64 allowed_cmds;
+};
+
 struct bpf_struct_ops_value;
 struct btf_member;
 
@@ -2109,6 +2120,7 @@ static inline void bpf_enable_instrumentation(void)
 	migrate_enable();
 }
 
+extern const struct super_operations bpf_super_ops;
 extern const struct file_operations bpf_map_fops;
 extern const struct file_operations bpf_prog_fops;
 extern const struct file_operations bpf_iter_fops;
@@ -2243,6 +2255,8 @@ static inline void bpf_map_dec_elem_count(struct bpf_map *map)
 
 extern int sysctl_unprivileged_bpf_disabled;
 
+bool bpf_token_capable(const struct bpf_token *token, int cap);
+
 static inline bool bpf_allow_ptr_leaks(void)
 {
 	return perfmon_capable();
@@ -2277,8 +2291,17 @@ int bpf_link_new_fd(struct bpf_link *link);
 struct bpf_link *bpf_link_get_from_fd(u32 ufd);
 struct bpf_link *bpf_link_get_curr_or_next(u32 *id);
 
+void bpf_token_inc(struct bpf_token *token);
+void bpf_token_put(struct bpf_token *token);
+int bpf_token_create(union bpf_attr *attr);
+struct bpf_token *bpf_token_get_from_fd(u32 ufd);
+
+bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd);
+
 int bpf_obj_pin_user(u32 ufd, int path_fd, const char __user *pathname);
 int bpf_obj_get_user(int path_fd, const char __user *pathname, int flags);
+struct inode *bpf_get_inode(struct super_block *sb, const struct inode *dir,
+			    umode_t mode);
 
 #define BPF_ITER_FUNC_PREFIX "bpf_iter_"
 #define DEFINE_BPF_ITER_FUNC(target, args...)			\
@@ -2638,6 +2661,24 @@ static inline int bpf_obj_get_user(const char __user *pathname, int flags)
 	return -EOPNOTSUPP;
 }
 
+static inline bool bpf_token_capable(const struct bpf_token *token, int cap)
+{
+	return capable(cap) || (cap != CAP_SYS_ADMIN && capable(CAP_SYS_ADMIN));
+}
+
+static inline void bpf_token_inc(struct bpf_token *token)
+{
+}
+
+static inline void bpf_token_put(struct bpf_token *token)
+{
+}
+
+static inline struct bpf_token *bpf_token_get_from_fd(u32 ufd)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
 static inline void __dev_flush(void)
 {
 }
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 1fef6d5a1330..b9dc0cca172c 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -847,6 +847,36 @@ union bpf_iter_link_info {
  *		Returns zero on success. On error, -1 is returned and *errno*
  *		is set appropriately.
  *
+ * BPF_TOKEN_CREATE
+ *	Description
+ *		Create BPF token with embedded information about what
+ *		BPF-related functionality it allows:
+ *		- a set of allowed bpf() syscall commands;
+ *		- a set of allowed BPF map types to be created with
+ *		BPF_MAP_CREATE command, if BPF_MAP_CREATE itself is allowed;
+ *		- a set of allowed BPF program types and BPF program attach
+ *		types to be loaded with BPF_PROG_LOAD command, if
+ *		BPF_PROG_LOAD itself is allowed.
+ *
+ *		BPF token is created (derived) from an instance of BPF FS,
+ *		assuming it has necessary delegation mount options specified.
+ *		This BPF token can be passed as an extra parameter to various
+ *		bpf() syscall commands to grant BPF subsystem functionality to
+ *		unprivileged processes.
+ *
+ *		When created, BPF token is "associated" with the owning
+ *		user namespace of BPF FS instance (super block) that it was
+ *		derived from, and subsequent BPF operations performed with
+ *		BPF token would be performing capabilities checks (i.e.,
+ *		CAP_BPF, CAP_PERFMON, CAP_NET_ADMIN, CAP_SYS_ADMIN) within
+ *		that user namespace. Without BPF token, such capabilities
+ *		have to be granted in init user namespace, making bpf()
+ *		syscall incompatible with user namespace, for the most part.
+ *
+ *	Return
+ *		A new file descriptor (a nonnegative integer), or -1 if an
+ *		error occurred (in which case, *errno* is set appropriately).
+ *
  * NOTES
  *	eBPF objects (maps and programs) can be shared between processes.
  *
@@ -901,6 +931,8 @@ enum bpf_cmd {
 	BPF_ITER_CREATE,
 	BPF_LINK_DETACH,
 	BPF_PROG_BIND_MAP,
+	BPF_TOKEN_CREATE,
+	__MAX_BPF_CMD,
 };
 
 enum bpf_map_type {
@@ -1722,6 +1754,11 @@ union bpf_attr {
 		__u32		flags;		/* extra flags */
 	} prog_bind_map;
 
+	struct { /* struct used by BPF_TOKEN_CREATE command */
+		__u32		flags;
+		__u32		bpffs_fd;
+	} token_create;
+
 } __attribute__((aligned(8)));
 
 /* The description below is an attempt at providing documentation to eBPF
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index f526b7573e97..4ce95acfcaa7 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -6,7 +6,7 @@ cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse
 endif
 CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy)
 
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o
 obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o
 obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 70b748f6228c..565be1f3f1ea 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -99,9 +99,9 @@ static const struct inode_operations bpf_prog_iops = { };
 static const struct inode_operations bpf_map_iops  = { };
 static const struct inode_operations bpf_link_iops  = { };
 
-static struct inode *bpf_get_inode(struct super_block *sb,
-				   const struct inode *dir,
-				   umode_t mode)
+struct inode *bpf_get_inode(struct super_block *sb,
+			    const struct inode *dir,
+			    umode_t mode)
 {
 	struct inode *inode;
 
@@ -603,6 +603,7 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root)
 	struct inode *inode = d_inode(root);
 	umode_t mode = inode->i_mode & S_IALLUGO & ~S_ISVTX;
 	struct bpf_mount_opts *opts = root->d_sb->s_fs_info;
+	u64 mask;
 
 	if (!uid_eq(inode->i_uid, GLOBAL_ROOT_UID))
 		seq_printf(m, ",uid=%u",
@@ -613,7 +614,8 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root)
 	if (mode != S_IRWXUGO)
 		seq_printf(m, ",mode=%o", mode);
 
-	if (opts->delegate_cmds == ~0ULL)
+	mask = (1ULL << __MAX_BPF_CMD) - 1;
+	if ((opts->delegate_cmds & mask) == mask)
 		seq_printf(m, ",delegate_cmds=any");
 	else if (opts->delegate_cmds)
 		seq_printf(m, ",delegate_cmds=0x%llx", opts->delegate_cmds);
@@ -646,7 +648,7 @@ static void bpf_free_inode(struct inode *inode)
 	free_inode_nonrcu(inode);
 }
 
-static const struct super_operations bpf_super_ops = {
+const struct super_operations bpf_super_ops = {
 	.statfs		= simple_statfs,
 	.drop_inode	= generic_delete_inode,
 	.show_options	= bpf_show_options,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index d5f1edee2d50..3ecba592b8ed 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -5426,6 +5426,20 @@ out_prog_put:
 	return ret;
 }
 
+#define BPF_TOKEN_CREATE_LAST_FIELD token_create.bpffs_fd
+
+static int token_create(union bpf_attr *attr)
+{
+	if (CHECK_ATTR(BPF_TOKEN_CREATE))
+		return -EINVAL;
+
+	/* no flags are supported yet */
+	if (attr->token_create.flags)
+		return -EINVAL;
+
+	return bpf_token_create(attr);
+}
+
 static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
 {
 	union bpf_attr attr;
@@ -5559,6 +5573,9 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
 	case BPF_PROG_BIND_MAP:
 		err = bpf_prog_bind_map(&attr);
 		break;
+	case BPF_TOKEN_CREATE:
+		err = token_create(&attr);
+		break;
 	default:
 		err = -EINVAL;
 		break;
diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c
new file mode 100644
index 000000000000..bdb6fe697568
--- /dev/null
+++ b/kernel/bpf/token.c
@@ -0,0 +1,217 @@
+#include <linux/bpf.h>
+#include <linux/vmalloc.h>
+#include <linux/fdtable.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/idr.h>
+#include <linux/namei.h>
+#include <linux/user_namespace.h>
+
+static bool bpf_ns_capable(struct user_namespace *ns, int cap)
+{
+	return ns_capable(ns, cap) || (cap != CAP_SYS_ADMIN && ns_capable(ns, CAP_SYS_ADMIN));
+}
+
+bool bpf_token_capable(const struct bpf_token *token, int cap)
+{
+	struct user_namespace *userns;
+
+	/* BPF token allows ns_capable() level of capabilities */
+	userns = token ? token->userns : &init_user_ns;
+	if (!bpf_ns_capable(userns, cap))
+		return false;
+	return true;
+}
+
+void bpf_token_inc(struct bpf_token *token)
+{
+	atomic64_inc(&token->refcnt);
+}
+
+static void bpf_token_free(struct bpf_token *token)
+{
+	put_user_ns(token->userns);
+	kfree(token);
+}
+
+static void bpf_token_put_deferred(struct work_struct *work)
+{
+	struct bpf_token *token = container_of(work, struct bpf_token, work);
+
+	bpf_token_free(token);
+}
+
+void bpf_token_put(struct bpf_token *token)
+{
+	if (!token)
+		return;
+
+	if (!atomic64_dec_and_test(&token->refcnt))
+		return;
+
+	INIT_WORK(&token->work, bpf_token_put_deferred);
+	schedule_work(&token->work);
+}
+
+static int bpf_token_release(struct inode *inode, struct file *filp)
+{
+	struct bpf_token *token = filp->private_data;
+
+	bpf_token_put(token);
+	return 0;
+}
+
+static void bpf_token_show_fdinfo(struct seq_file *m, struct file *filp)
+{
+	struct bpf_token *token = filp->private_data;
+	u64 mask;
+
+	BUILD_BUG_ON(__MAX_BPF_CMD >= 64);
+	mask = (1ULL << __MAX_BPF_CMD) - 1;
+	if ((token->allowed_cmds & mask) == mask)
+		seq_printf(m, "allowed_cmds:\tany\n");
+	else
+		seq_printf(m, "allowed_cmds:\t0x%llx\n", token->allowed_cmds);
+}
+
+#define BPF_TOKEN_INODE_NAME "bpf-token"
+
+static const struct inode_operations bpf_token_iops = { };
+
+static const struct file_operations bpf_token_fops = {
+	.release	= bpf_token_release,
+	.show_fdinfo	= bpf_token_show_fdinfo,
+};
+
+int bpf_token_create(union bpf_attr *attr)
+{
+	struct bpf_mount_opts *mnt_opts;
+	struct bpf_token *token = NULL;
+	struct user_namespace *userns;
+	struct inode *inode;
+	struct file *file;
+	struct path path;
+	struct fd f;
+	umode_t mode;
+	int err, fd;
+
+	f = fdget(attr->token_create.bpffs_fd);
+	if (!f.file)
+		return -EBADF;
+
+	path = f.file->f_path;
+	path_get(&path);
+	fdput(f);
+
+	if (path.dentry != path.mnt->mnt_sb->s_root) {
+		err = -EINVAL;
+		goto out_path;
+	}
+	if (path.mnt->mnt_sb->s_op != &bpf_super_ops) {
+		err = -EINVAL;
+		goto out_path;
+	}
+	err = path_permission(&path, MAY_ACCESS);
+	if (err)
+		goto out_path;
+
+	userns = path.dentry->d_sb->s_user_ns;
+	/*
+	 * Enforce that creators of BPF tokens are in the same user
+	 * namespace as the BPF FS instance. This makes reasoning about
+	 * permissions a lot easier and we can always relax this later.
+	 */
+	if (current_user_ns() != userns) {
+		err = -EPERM;
+		goto out_path;
+	}
+	if (!ns_capable(userns, CAP_BPF)) {
+		err = -EPERM;
+		goto out_path;
+	}
+
+	/* Creating BPF token in init_user_ns doesn't make much sense. */
+	if (current_user_ns() == &init_user_ns) {
+		err = -EOPNOTSUPP;
+		goto out_path;
+	}
+
+	mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask());
+	inode = bpf_get_inode(path.mnt->mnt_sb, NULL, mode);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto out_path;
+	}
+
+	inode->i_op = &bpf_token_iops;
+	inode->i_fop = &bpf_token_fops;
+	clear_nlink(inode); /* make sure it is unlinked */
+
+	file = alloc_file_pseudo(inode, path.mnt, BPF_TOKEN_INODE_NAME, O_RDWR, &bpf_token_fops);
+	if (IS_ERR(file)) {
+		iput(inode);
+		err = PTR_ERR(file);
+		goto out_path;
+	}
+
+	token = kzalloc(sizeof(*token), GFP_USER);
+	if (!token) {
+		err = -ENOMEM;
+		goto out_file;
+	}
+
+	atomic64_set(&token->refcnt, 1);
+
+	/* remember bpffs owning userns for future ns_capable() checks */
+	token->userns = get_user_ns(userns);
+
+	mnt_opts = path.dentry->d_sb->s_fs_info;
+	token->allowed_cmds = mnt_opts->delegate_cmds;
+
+	fd = get_unused_fd_flags(O_CLOEXEC);
+	if (fd < 0) {
+		err = fd;
+		goto out_token;
+	}
+
+	file->private_data = token;
+	fd_install(fd, file);
+
+	path_put(&path);
+	return fd;
+
+out_token:
+	bpf_token_free(token);
+out_file:
+	fput(file);
+out_path:
+	path_put(&path);
+	return err;
+}
+
+struct bpf_token *bpf_token_get_from_fd(u32 ufd)
+{
+	struct fd f = fdget(ufd);
+	struct bpf_token *token;
+
+	if (!f.file)
+		return ERR_PTR(-EBADF);
+	if (f.file->f_op != &bpf_token_fops) {
+		fdput(f);
+		return ERR_PTR(-EINVAL);
+	}
+
+	token = f.file->private_data;
+	bpf_token_inc(token);
+	fdput(f);
+
+	return token;
+}
+
+bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd)
+{
+	if (!token)
+		return false;
+	return token->allowed_cmds & (1ULL << cmd);
+}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 1fef6d5a1330..b9dc0cca172c 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -847,6 +847,36 @@ union bpf_iter_link_info {
  *		Returns zero on success. On error, -1 is returned and *errno*
  *		is set appropriately.
  *
+ * BPF_TOKEN_CREATE
+ *	Description
+ *		Create BPF token with embedded information about what
+ *		BPF-related functionality it allows:
+ *		- a set of allowed bpf() syscall commands;
+ *		- a set of allowed BPF map types to be created with
+ *		BPF_MAP_CREATE command, if BPF_MAP_CREATE itself is allowed;
+ *		- a set of allowed BPF program types and BPF program attach
+ *		types to be loaded with BPF_PROG_LOAD command, if
+ *		BPF_PROG_LOAD itself is allowed.
+ *
+ *		BPF token is created (derived) from an instance of BPF FS,
+ *		assuming it has necessary delegation mount options specified.
+ *		This BPF token can be passed as an extra parameter to various
+ *		bpf() syscall commands to grant BPF subsystem functionality to
+ *		unprivileged processes.
+ *
+ *		When created, BPF token is "associated" with the owning
+ *		user namespace of BPF FS instance (super block) that it was
+ *		derived from, and subsequent BPF operations performed with
+ *		BPF token would be performing capabilities checks (i.e.,
+ *		CAP_BPF, CAP_PERFMON, CAP_NET_ADMIN, CAP_SYS_ADMIN) within
+ *		that user namespace. Without BPF token, such capabilities
+ *		have to be granted in init user namespace, making bpf()
+ *		syscall incompatible with user namespace, for the most part.
+ *
+ *	Return
+ *		A new file descriptor (a nonnegative integer), or -1 if an
+ *		error occurred (in which case, *errno* is set appropriately).
+ *
  * NOTES
  *	eBPF objects (maps and programs) can be shared between processes.
  *
@@ -901,6 +931,8 @@ enum bpf_cmd {
 	BPF_ITER_CREATE,
 	BPF_LINK_DETACH,
 	BPF_PROG_BIND_MAP,
+	BPF_TOKEN_CREATE,
+	__MAX_BPF_CMD,
 };
 
 enum bpf_map_type {
@@ -1722,6 +1754,11 @@ union bpf_attr {
 		__u32		flags;		/* extra flags */
 	} prog_bind_map;
 
+	struct { /* struct used by BPF_TOKEN_CREATE command */
+		__u32		flags;
+		__u32		bpffs_fd;
+	} token_create;
+
 } __attribute__((aligned(8)));
 
 /* The description below is an attempt at providing documentation to eBPF
-- 
cgit v1.2.3


From a177fc2bf6fd83704854feaf7aae926b1df4f0b9 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Tue, 23 Jan 2024 18:21:01 -0800
Subject: bpf: Add BPF token support to BPF_MAP_CREATE command

Allow providing token_fd for BPF_MAP_CREATE command to allow controlled
BPF map creation from unprivileged process through delegated BPF token.
New BPF_F_TOKEN_FD flag is added to specify together with BPF token FD
for BPF_MAP_CREATE command.

Wire through a set of allowed BPF map types to BPF token, derived from
BPF FS at BPF token creation time. This, in combination with allowed_cmds
allows to create a narrowly-focused BPF token (controlled by privileged
agent) with a restrictive set of BPF maps that application can attempt
to create.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20240124022127.2379740-5-andrii@kernel.org
---
 include/linux/bpf.h                                |  2 +
 include/uapi/linux/bpf.h                           |  8 +++
 kernel/bpf/inode.c                                 |  3 +-
 kernel/bpf/syscall.c                               | 59 +++++++++++++++++-----
 kernel/bpf/token.c                                 | 16 ++++++
 tools/include/uapi/linux/bpf.h                     |  8 +++
 .../selftests/bpf/prog_tests/libbpf_probes.c       |  2 +
 .../testing/selftests/bpf/prog_tests/libbpf_str.c  |  3 ++
 8 files changed, 86 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index d9ff7ce547b4..8252452d0c4d 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1630,6 +1630,7 @@ struct bpf_token {
 	atomic64_t refcnt;
 	struct user_namespace *userns;
 	u64 allowed_cmds;
+	u64 allowed_maps;
 };
 
 struct bpf_struct_ops_value;
@@ -2297,6 +2298,7 @@ int bpf_token_create(union bpf_attr *attr);
 struct bpf_token *bpf_token_get_from_fd(u32 ufd);
 
 bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd);
+bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type type);
 
 int bpf_obj_pin_user(u32 ufd, int path_fd, const char __user *pathname);
 int bpf_obj_get_user(int path_fd, const char __user *pathname, int flags);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index b9dc0cca172c..c78cab8b462d 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -983,6 +983,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_BLOOM_FILTER,
 	BPF_MAP_TYPE_USER_RINGBUF,
 	BPF_MAP_TYPE_CGRP_STORAGE,
+	__MAX_BPF_MAP_TYPE
 };
 
 /* Note that tracing related programs such as
@@ -1365,6 +1366,9 @@ enum {
 
 /* Flag for value_type_btf_obj_fd, the fd is available */
 	BPF_F_VTYPE_BTF_OBJ_FD	= (1U << 15),
+
+/* BPF token FD is passed in a corresponding command's token_fd field */
+	BPF_F_TOKEN_FD          = (1U << 16),
 };
 
 /* Flags for BPF_PROG_QUERY. */
@@ -1443,6 +1447,10 @@ union bpf_attr {
 						 * type data for
 						 * btf_vmlinux_value_type_id.
 						 */
+		/* BPF token FD to use with BPF_MAP_CREATE operation.
+		 * If provided, map_flags should have BPF_F_TOKEN_FD flag set.
+		 */
+		__s32	map_token_fd;
 	};
 
 	struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 565be1f3f1ea..034b7e4d8f19 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -620,7 +620,8 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root)
 	else if (opts->delegate_cmds)
 		seq_printf(m, ",delegate_cmds=0x%llx", opts->delegate_cmds);
 
-	if (opts->delegate_maps == ~0ULL)
+	mask = (1ULL << __MAX_BPF_MAP_TYPE) - 1;
+	if ((opts->delegate_maps & mask) == mask)
 		seq_printf(m, ",delegate_maps=any");
 	else if (opts->delegate_maps)
 		seq_printf(m, ",delegate_maps=0x%llx", opts->delegate_maps);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 3ecba592b8ed..b13a4bdcd3a0 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1011,8 +1011,8 @@ int map_check_no_btf(const struct bpf_map *map,
 	return -ENOTSUPP;
 }
 
-static int map_check_btf(struct bpf_map *map, const struct btf *btf,
-			 u32 btf_key_id, u32 btf_value_id)
+static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
+			 const struct btf *btf, u32 btf_key_id, u32 btf_value_id)
 {
 	const struct btf_type *key_type, *value_type;
 	u32 key_size, value_size;
@@ -1040,7 +1040,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
 	if (!IS_ERR_OR_NULL(map->record)) {
 		int i;
 
-		if (!bpf_capable()) {
+		if (!bpf_token_capable(token, CAP_BPF)) {
 			ret = -EPERM;
 			goto free_map_tab;
 		}
@@ -1128,14 +1128,16 @@ static bool bpf_net_capable(void)
 	return capable(CAP_NET_ADMIN) || capable(CAP_SYS_ADMIN);
 }
 
-#define BPF_MAP_CREATE_LAST_FIELD value_type_btf_obj_fd
+#define BPF_MAP_CREATE_LAST_FIELD map_token_fd
 /* called via syscall */
 static int map_create(union bpf_attr *attr)
 {
 	const struct bpf_map_ops *ops;
+	struct bpf_token *token = NULL;
 	int numa_node = bpf_map_attr_numa_node(attr);
 	u32 map_type = attr->map_type;
 	struct bpf_map *map;
+	bool token_flag;
 	int f_flags;
 	int err;
 
@@ -1143,6 +1145,12 @@ static int map_create(union bpf_attr *attr)
 	if (err)
 		return -EINVAL;
 
+	/* check BPF_F_TOKEN_FD flag, remember if it's set, and then clear it
+	 * to avoid per-map type checks tripping on unknown flag
+	 */
+	token_flag = attr->map_flags & BPF_F_TOKEN_FD;
+	attr->map_flags &= ~BPF_F_TOKEN_FD;
+
 	if (attr->btf_vmlinux_value_type_id) {
 		if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS ||
 		    attr->btf_key_type_id || attr->btf_value_type_id)
@@ -1183,14 +1191,32 @@ static int map_create(union bpf_attr *attr)
 	if (!ops->map_mem_usage)
 		return -EINVAL;
 
+	if (token_flag) {
+		token = bpf_token_get_from_fd(attr->map_token_fd);
+		if (IS_ERR(token))
+			return PTR_ERR(token);
+
+		/* if current token doesn't grant map creation permissions,
+		 * then we can't use this token, so ignore it and rely on
+		 * system-wide capabilities checks
+		 */
+		if (!bpf_token_allow_cmd(token, BPF_MAP_CREATE) ||
+		    !bpf_token_allow_map_type(token, attr->map_type)) {
+			bpf_token_put(token);
+			token = NULL;
+		}
+	}
+
+	err = -EPERM;
+
 	/* Intent here is for unprivileged_bpf_disabled to block BPF map
 	 * creation for unprivileged users; other actions depend
 	 * on fd availability and access to bpffs, so are dependent on
 	 * object creation success. Even with unprivileged BPF disabled,
 	 * capability checks are still carried out.
 	 */
-	if (sysctl_unprivileged_bpf_disabled && !bpf_capable())
-		return -EPERM;
+	if (sysctl_unprivileged_bpf_disabled && !bpf_token_capable(token, CAP_BPF))
+		goto put_token;
 
 	/* check privileged map type permissions */
 	switch (map_type) {
@@ -1223,25 +1249,27 @@ static int map_create(union bpf_attr *attr)
 	case BPF_MAP_TYPE_LRU_PERCPU_HASH:
 	case BPF_MAP_TYPE_STRUCT_OPS:
 	case BPF_MAP_TYPE_CPUMAP:
-		if (!bpf_capable())
-			return -EPERM;
+		if (!bpf_token_capable(token, CAP_BPF))
+			goto put_token;
 		break;
 	case BPF_MAP_TYPE_SOCKMAP:
 	case BPF_MAP_TYPE_SOCKHASH:
 	case BPF_MAP_TYPE_DEVMAP:
 	case BPF_MAP_TYPE_DEVMAP_HASH:
 	case BPF_MAP_TYPE_XSKMAP:
-		if (!bpf_net_capable())
-			return -EPERM;
+		if (!bpf_token_capable(token, CAP_NET_ADMIN))
+			goto put_token;
 		break;
 	default:
 		WARN(1, "unsupported map type %d", map_type);
-		return -EPERM;
+		goto put_token;
 	}
 
 	map = ops->map_alloc(attr);
-	if (IS_ERR(map))
-		return PTR_ERR(map);
+	if (IS_ERR(map)) {
+		err = PTR_ERR(map);
+		goto put_token;
+	}
 	map->ops = ops;
 	map->map_type = map_type;
 
@@ -1278,7 +1306,7 @@ static int map_create(union bpf_attr *attr)
 		map->btf = btf;
 
 		if (attr->btf_value_type_id) {
-			err = map_check_btf(map, btf, attr->btf_key_type_id,
+			err = map_check_btf(map, token, btf, attr->btf_key_type_id,
 					    attr->btf_value_type_id);
 			if (err)
 				goto free_map;
@@ -1299,6 +1327,7 @@ static int map_create(union bpf_attr *attr)
 		goto free_map_sec;
 
 	bpf_map_save_memcg(map);
+	bpf_token_put(token);
 
 	err = bpf_map_new_fd(map, f_flags);
 	if (err < 0) {
@@ -1319,6 +1348,8 @@ free_map_sec:
 free_map:
 	btf_put(map->btf);
 	map->ops->map_free(map);
+put_token:
+	bpf_token_put(token);
 	return err;
 }
 
diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c
index bdb6fe697568..bc86be4ca567 100644
--- a/kernel/bpf/token.c
+++ b/kernel/bpf/token.c
@@ -73,6 +73,13 @@ static void bpf_token_show_fdinfo(struct seq_file *m, struct file *filp)
 		seq_printf(m, "allowed_cmds:\tany\n");
 	else
 		seq_printf(m, "allowed_cmds:\t0x%llx\n", token->allowed_cmds);
+
+	BUILD_BUG_ON(__MAX_BPF_MAP_TYPE >= 64);
+	mask = (1ULL << __MAX_BPF_MAP_TYPE) - 1;
+	if ((token->allowed_maps & mask) == mask)
+		seq_printf(m, "allowed_maps:\tany\n");
+	else
+		seq_printf(m, "allowed_maps:\t0x%llx\n", token->allowed_maps);
 }
 
 #define BPF_TOKEN_INODE_NAME "bpf-token"
@@ -168,6 +175,7 @@ int bpf_token_create(union bpf_attr *attr)
 
 	mnt_opts = path.dentry->d_sb->s_fs_info;
 	token->allowed_cmds = mnt_opts->delegate_cmds;
+	token->allowed_maps = mnt_opts->delegate_maps;
 
 	fd = get_unused_fd_flags(O_CLOEXEC);
 	if (fd < 0) {
@@ -215,3 +223,11 @@ bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd)
 		return false;
 	return token->allowed_cmds & (1ULL << cmd);
 }
+
+bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type type)
+{
+	if (!token || type >= __MAX_BPF_MAP_TYPE)
+		return false;
+
+	return token->allowed_maps & (1ULL << type);
+}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index b9dc0cca172c..c78cab8b462d 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -983,6 +983,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_BLOOM_FILTER,
 	BPF_MAP_TYPE_USER_RINGBUF,
 	BPF_MAP_TYPE_CGRP_STORAGE,
+	__MAX_BPF_MAP_TYPE
 };
 
 /* Note that tracing related programs such as
@@ -1365,6 +1366,9 @@ enum {
 
 /* Flag for value_type_btf_obj_fd, the fd is available */
 	BPF_F_VTYPE_BTF_OBJ_FD	= (1U << 15),
+
+/* BPF token FD is passed in a corresponding command's token_fd field */
+	BPF_F_TOKEN_FD          = (1U << 16),
 };
 
 /* Flags for BPF_PROG_QUERY. */
@@ -1443,6 +1447,10 @@ union bpf_attr {
 						 * type data for
 						 * btf_vmlinux_value_type_id.
 						 */
+		/* BPF token FD to use with BPF_MAP_CREATE operation.
+		 * If provided, map_flags should have BPF_F_TOKEN_FD flag set.
+		 */
+		__s32	map_token_fd;
 	};
 
 	struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
diff --git a/tools/testing/selftests/bpf/prog_tests/libbpf_probes.c b/tools/testing/selftests/bpf/prog_tests/libbpf_probes.c
index 9f766ddd946a..573249a2814d 100644
--- a/tools/testing/selftests/bpf/prog_tests/libbpf_probes.c
+++ b/tools/testing/selftests/bpf/prog_tests/libbpf_probes.c
@@ -68,6 +68,8 @@ void test_libbpf_probe_map_types(void)
 
 		if (map_type == BPF_MAP_TYPE_UNSPEC)
 			continue;
+		if (strcmp(map_type_name, "__MAX_BPF_MAP_TYPE") == 0)
+			continue;
 
 		if (!test__start_subtest(map_type_name))
 			continue;
diff --git a/tools/testing/selftests/bpf/prog_tests/libbpf_str.c b/tools/testing/selftests/bpf/prog_tests/libbpf_str.c
index eb34d612d6f8..1f328c0d8aff 100644
--- a/tools/testing/selftests/bpf/prog_tests/libbpf_str.c
+++ b/tools/testing/selftests/bpf/prog_tests/libbpf_str.c
@@ -132,6 +132,9 @@ static void test_libbpf_bpf_map_type_str(void)
 		const char *map_type_str;
 		char buf[256];
 
+		if (map_type == __MAX_BPF_MAP_TYPE)
+			continue;
+
 		map_type_name = btf__str_by_offset(btf, e->name_off);
 		map_type_str = libbpf_bpf_map_type_str(map_type);
 		ASSERT_OK_PTR(map_type_str, map_type_name);
-- 
cgit v1.2.3


From 9ea7c4bf17e39d463eb4782f948f401d9764b1b3 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Tue, 23 Jan 2024 18:21:02 -0800
Subject: bpf: Add BPF token support to BPF_BTF_LOAD command

Accept BPF token FD in BPF_BTF_LOAD command to allow BTF data loading
through delegated BPF token. BPF_F_TOKEN_FD flag has to be specified
when passing BPF token FD. Given BPF_BTF_LOAD command didn't have flags
field before, we also add btf_flags field.

BTF loading is a pretty straightforward operation, so as long as BPF
token is created with allow_cmds granting BPF_BTF_LOAD command, kernel
proceeds to parsing BTF data and creating BTF object.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20240124022127.2379740-6-andrii@kernel.org
---
 include/uapi/linux/bpf.h       |  5 +++++
 kernel/bpf/syscall.c           | 23 +++++++++++++++++++++--
 tools/include/uapi/linux/bpf.h |  5 +++++
 3 files changed, 31 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c78cab8b462d..cb2c888e3bb4 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1632,6 +1632,11 @@ union bpf_attr {
 		 * truncated), or smaller (if log buffer wasn't filled completely).
 		 */
 		__u32		btf_log_true_size;
+		__u32		btf_flags;
+		/* BPF token FD to use with BPF_BTF_LOAD operation.
+		 * If provided, btf_flags should have BPF_F_TOKEN_FD flag set.
+		 */
+		__s32		btf_token_fd;
 	};
 
 	struct {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index b13a4bdcd3a0..45b3a55896eb 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -4831,15 +4831,34 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
 	return err;
 }
 
-#define BPF_BTF_LOAD_LAST_FIELD btf_log_true_size
+#define BPF_BTF_LOAD_LAST_FIELD btf_token_fd
 
 static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size)
 {
+	struct bpf_token *token = NULL;
+
 	if (CHECK_ATTR(BPF_BTF_LOAD))
 		return -EINVAL;
 
-	if (!bpf_capable())
+	if (attr->btf_flags & ~BPF_F_TOKEN_FD)
+		return -EINVAL;
+
+	if (attr->btf_flags & BPF_F_TOKEN_FD) {
+		token = bpf_token_get_from_fd(attr->btf_token_fd);
+		if (IS_ERR(token))
+			return PTR_ERR(token);
+		if (!bpf_token_allow_cmd(token, BPF_BTF_LOAD)) {
+			bpf_token_put(token);
+			token = NULL;
+		}
+	}
+
+	if (!bpf_token_capable(token, CAP_BPF)) {
+		bpf_token_put(token);
 		return -EPERM;
+	}
+
+	bpf_token_put(token);
 
 	return btf_new_fd(attr, uattr, uattr_size);
 }
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index c78cab8b462d..cb2c888e3bb4 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1632,6 +1632,11 @@ union bpf_attr {
 		 * truncated), or smaller (if log buffer wasn't filled completely).
 		 */
 		__u32		btf_log_true_size;
+		__u32		btf_flags;
+		/* BPF token FD to use with BPF_BTF_LOAD operation.
+		 * If provided, btf_flags should have BPF_F_TOKEN_FD flag set.
+		 */
+		__s32		btf_token_fd;
 	};
 
 	struct {
-- 
cgit v1.2.3


From caf8f28e036c4ba1e823355da6c0c01c39e70ab9 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Tue, 23 Jan 2024 18:21:03 -0800
Subject: bpf: Add BPF token support to BPF_PROG_LOAD command

Add basic support of BPF token to BPF_PROG_LOAD. BPF_F_TOKEN_FD flag
should be set in prog_flags field when providing prog_token_fd.

Wire through a set of allowed BPF program types and attach types,
derived from BPF FS at BPF token creation time. Then make sure we
perform bpf_token_capable() checks everywhere where it's relevant.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20240124022127.2379740-7-andrii@kernel.org
---
 include/linux/bpf.h                                |  6 ++
 include/uapi/linux/bpf.h                           |  5 ++
 kernel/bpf/core.c                                  |  1 +
 kernel/bpf/inode.c                                 |  6 +-
 kernel/bpf/syscall.c                               | 90 ++++++++++++++++------
 kernel/bpf/token.c                                 | 27 +++++++
 tools/include/uapi/linux/bpf.h                     |  5 ++
 .../selftests/bpf/prog_tests/libbpf_probes.c       |  2 +
 .../testing/selftests/bpf/prog_tests/libbpf_str.c  |  3 +
 9 files changed, 118 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 8252452d0c4d..d0bf37e3f166 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1489,6 +1489,7 @@ struct bpf_prog_aux {
 #ifdef CONFIG_SECURITY
 	void *security;
 #endif
+	struct bpf_token *token;
 	struct bpf_prog_offload *offload;
 	struct btf *btf;
 	struct bpf_func_info *func_info;
@@ -1631,6 +1632,8 @@ struct bpf_token {
 	struct user_namespace *userns;
 	u64 allowed_cmds;
 	u64 allowed_maps;
+	u64 allowed_progs;
+	u64 allowed_attachs;
 };
 
 struct bpf_struct_ops_value;
@@ -2299,6 +2302,9 @@ struct bpf_token *bpf_token_get_from_fd(u32 ufd);
 
 bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd);
 bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type type);
+bool bpf_token_allow_prog_type(const struct bpf_token *token,
+			       enum bpf_prog_type prog_type,
+			       enum bpf_attach_type attach_type);
 
 int bpf_obj_pin_user(u32 ufd, int path_fd, const char __user *pathname);
 int bpf_obj_get_user(int path_fd, const char __user *pathname, int flags);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index cb2c888e3bb4..d96708380e52 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1028,6 +1028,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_SK_LOOKUP,
 	BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
 	BPF_PROG_TYPE_NETFILTER,
+	__MAX_BPF_PROG_TYPE
 };
 
 enum bpf_attach_type {
@@ -1520,6 +1521,10 @@ union bpf_attr {
 		 * truncated), or smaller (if log buffer wasn't filled completely).
 		 */
 		__u32		log_true_size;
+		/* BPF token FD to use with BPF_PROG_LOAD operation.
+		 * If provided, prog_flags should have BPF_F_TOKEN_FD flag set.
+		 */
+		__s32		prog_token_fd;
 	};
 
 	struct { /* anonymous struct used by BPF_OBJ_* commands */
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index fbb1d95a9b44..00dccba29769 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2779,6 +2779,7 @@ void bpf_prog_free(struct bpf_prog *fp)
 
 	if (aux->dst_prog)
 		bpf_prog_put(aux->dst_prog);
+	bpf_token_put(aux->token);
 	INIT_WORK(&aux->work, bpf_prog_free_deferred);
 	schedule_work(&aux->work);
 }
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 034b7e4d8f19..5fb10da5717f 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -626,12 +626,14 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root)
 	else if (opts->delegate_maps)
 		seq_printf(m, ",delegate_maps=0x%llx", opts->delegate_maps);
 
-	if (opts->delegate_progs == ~0ULL)
+	mask = (1ULL << __MAX_BPF_PROG_TYPE) - 1;
+	if ((opts->delegate_progs & mask) == mask)
 		seq_printf(m, ",delegate_progs=any");
 	else if (opts->delegate_progs)
 		seq_printf(m, ",delegate_progs=0x%llx", opts->delegate_progs);
 
-	if (opts->delegate_attachs == ~0ULL)
+	mask = (1ULL << __MAX_BPF_ATTACH_TYPE) - 1;
+	if ((opts->delegate_attachs & mask) == mask)
 		seq_printf(m, ",delegate_attachs=any");
 	else if (opts->delegate_attachs)
 		seq_printf(m, ",delegate_attachs=0x%llx", opts->delegate_attachs);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 45b3a55896eb..61b4bf4cc287 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2626,13 +2626,15 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
 }
 
 /* last field in 'union bpf_attr' used by this command */
-#define	BPF_PROG_LOAD_LAST_FIELD log_true_size
+#define BPF_PROG_LOAD_LAST_FIELD prog_token_fd
 
 static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 {
 	enum bpf_prog_type type = attr->prog_type;
 	struct bpf_prog *prog, *dst_prog = NULL;
 	struct btf *attach_btf = NULL;
+	struct bpf_token *token = NULL;
+	bool bpf_cap;
 	int err;
 	char license[128];
 
@@ -2646,13 +2648,35 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 				 BPF_F_TEST_RND_HI32 |
 				 BPF_F_XDP_HAS_FRAGS |
 				 BPF_F_XDP_DEV_BOUND_ONLY |
-				 BPF_F_TEST_REG_INVARIANTS))
+				 BPF_F_TEST_REG_INVARIANTS |
+				 BPF_F_TOKEN_FD))
 		return -EINVAL;
 
+	bpf_prog_load_fixup_attach_type(attr);
+
+	if (attr->prog_flags & BPF_F_TOKEN_FD) {
+		token = bpf_token_get_from_fd(attr->prog_token_fd);
+		if (IS_ERR(token))
+			return PTR_ERR(token);
+		/* if current token doesn't grant prog loading permissions,
+		 * then we can't use this token, so ignore it and rely on
+		 * system-wide capabilities checks
+		 */
+		if (!bpf_token_allow_cmd(token, BPF_PROG_LOAD) ||
+		    !bpf_token_allow_prog_type(token, attr->prog_type,
+					       attr->expected_attach_type)) {
+			bpf_token_put(token);
+			token = NULL;
+		}
+	}
+
+	bpf_cap = bpf_token_capable(token, CAP_BPF);
+	err = -EPERM;
+
 	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
 	    (attr->prog_flags & BPF_F_ANY_ALIGNMENT) &&
-	    !bpf_capable())
-		return -EPERM;
+	    !bpf_cap)
+		goto put_token;
 
 	/* Intent here is for unprivileged_bpf_disabled to block BPF program
 	 * creation for unprivileged users; other actions depend
@@ -2661,21 +2685,23 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 	 * capability checks are still carried out for these
 	 * and other operations.
 	 */
-	if (sysctl_unprivileged_bpf_disabled && !bpf_capable())
-		return -EPERM;
+	if (sysctl_unprivileged_bpf_disabled && !bpf_cap)
+		goto put_token;
 
 	if (attr->insn_cnt == 0 ||
-	    attr->insn_cnt > (bpf_capable() ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS))
-		return -E2BIG;
+	    attr->insn_cnt > (bpf_cap ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) {
+		err = -E2BIG;
+		goto put_token;
+	}
 	if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
 	    type != BPF_PROG_TYPE_CGROUP_SKB &&
-	    !bpf_capable())
-		return -EPERM;
+	    !bpf_cap)
+		goto put_token;
 
-	if (is_net_admin_prog_type(type) && !bpf_net_capable())
-		return -EPERM;
-	if (is_perfmon_prog_type(type) && !perfmon_capable())
-		return -EPERM;
+	if (is_net_admin_prog_type(type) && !bpf_token_capable(token, CAP_NET_ADMIN))
+		goto put_token;
+	if (is_perfmon_prog_type(type) && !bpf_token_capable(token, CAP_PERFMON))
+		goto put_token;
 
 	/* attach_prog_fd/attach_btf_obj_fd can specify fd of either bpf_prog
 	 * or btf, we need to check which one it is
@@ -2685,27 +2711,33 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 		if (IS_ERR(dst_prog)) {
 			dst_prog = NULL;
 			attach_btf = btf_get_by_fd(attr->attach_btf_obj_fd);
-			if (IS_ERR(attach_btf))
-				return -EINVAL;
+			if (IS_ERR(attach_btf)) {
+				err = -EINVAL;
+				goto put_token;
+			}
 			if (!btf_is_kernel(attach_btf)) {
 				/* attaching through specifying bpf_prog's BTF
 				 * objects directly might be supported eventually
 				 */
 				btf_put(attach_btf);
-				return -ENOTSUPP;
+				err = -ENOTSUPP;
+				goto put_token;
 			}
 		}
 	} else if (attr->attach_btf_id) {
 		/* fall back to vmlinux BTF, if BTF type ID is specified */
 		attach_btf = bpf_get_btf_vmlinux();
-		if (IS_ERR(attach_btf))
-			return PTR_ERR(attach_btf);
-		if (!attach_btf)
-			return -EINVAL;
+		if (IS_ERR(attach_btf)) {
+			err = PTR_ERR(attach_btf);
+			goto put_token;
+		}
+		if (!attach_btf) {
+			err = -EINVAL;
+			goto put_token;
+		}
 		btf_get(attach_btf);
 	}
 
-	bpf_prog_load_fixup_attach_type(attr);
 	if (bpf_prog_load_check_attach(type, attr->expected_attach_type,
 				       attach_btf, attr->attach_btf_id,
 				       dst_prog)) {
@@ -2713,7 +2745,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 			bpf_prog_put(dst_prog);
 		if (attach_btf)
 			btf_put(attach_btf);
-		return -EINVAL;
+		err = -EINVAL;
+		goto put_token;
 	}
 
 	/* plain bpf_prog allocation */
@@ -2723,7 +2756,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 			bpf_prog_put(dst_prog);
 		if (attach_btf)
 			btf_put(attach_btf);
-		return -ENOMEM;
+		err = -EINVAL;
+		goto put_token;
 	}
 
 	prog->expected_attach_type = attr->expected_attach_type;
@@ -2734,6 +2768,10 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 	prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE;
 	prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS;
 
+	/* move token into prog->aux, reuse taken refcnt */
+	prog->aux->token = token;
+	token = NULL;
+
 	err = security_bpf_prog_alloc(prog->aux);
 	if (err)
 		goto free_prog;
@@ -2851,6 +2889,8 @@ free_prog:
 	if (prog->aux->attach_btf)
 		btf_put(prog->aux->attach_btf);
 	bpf_prog_free(prog);
+put_token:
+	bpf_token_put(token);
 	return err;
 }
 
@@ -3858,7 +3898,7 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
 	case BPF_PROG_TYPE_SK_LOOKUP:
 		return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
 	case BPF_PROG_TYPE_CGROUP_SKB:
-		if (!bpf_net_capable())
+		if (!bpf_token_capable(prog->aux->token, CAP_NET_ADMIN))
 			/* cg-skb progs can be loaded by unpriv user.
 			 * check permissions at attach time.
 			 */
diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c
index bc86be4ca567..c13c73788d8c 100644
--- a/kernel/bpf/token.c
+++ b/kernel/bpf/token.c
@@ -80,6 +80,20 @@ static void bpf_token_show_fdinfo(struct seq_file *m, struct file *filp)
 		seq_printf(m, "allowed_maps:\tany\n");
 	else
 		seq_printf(m, "allowed_maps:\t0x%llx\n", token->allowed_maps);
+
+	BUILD_BUG_ON(__MAX_BPF_PROG_TYPE >= 64);
+	mask = (1ULL << __MAX_BPF_PROG_TYPE) - 1;
+	if ((token->allowed_progs & mask) == mask)
+		seq_printf(m, "allowed_progs:\tany\n");
+	else
+		seq_printf(m, "allowed_progs:\t0x%llx\n", token->allowed_progs);
+
+	BUILD_BUG_ON(__MAX_BPF_ATTACH_TYPE >= 64);
+	mask = (1ULL << __MAX_BPF_ATTACH_TYPE) - 1;
+	if ((token->allowed_attachs & mask) == mask)
+		seq_printf(m, "allowed_attachs:\tany\n");
+	else
+		seq_printf(m, "allowed_attachs:\t0x%llx\n", token->allowed_attachs);
 }
 
 #define BPF_TOKEN_INODE_NAME "bpf-token"
@@ -176,6 +190,8 @@ int bpf_token_create(union bpf_attr *attr)
 	mnt_opts = path.dentry->d_sb->s_fs_info;
 	token->allowed_cmds = mnt_opts->delegate_cmds;
 	token->allowed_maps = mnt_opts->delegate_maps;
+	token->allowed_progs = mnt_opts->delegate_progs;
+	token->allowed_attachs = mnt_opts->delegate_attachs;
 
 	fd = get_unused_fd_flags(O_CLOEXEC);
 	if (fd < 0) {
@@ -231,3 +247,14 @@ bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type t
 
 	return token->allowed_maps & (1ULL << type);
 }
+
+bool bpf_token_allow_prog_type(const struct bpf_token *token,
+			       enum bpf_prog_type prog_type,
+			       enum bpf_attach_type attach_type)
+{
+	if (!token || prog_type >= __MAX_BPF_PROG_TYPE || attach_type >= __MAX_BPF_ATTACH_TYPE)
+		return false;
+
+	return (token->allowed_progs & (1ULL << prog_type)) &&
+	       (token->allowed_attachs & (1ULL << attach_type));
+}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index cb2c888e3bb4..d96708380e52 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1028,6 +1028,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_SK_LOOKUP,
 	BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
 	BPF_PROG_TYPE_NETFILTER,
+	__MAX_BPF_PROG_TYPE
 };
 
 enum bpf_attach_type {
@@ -1520,6 +1521,10 @@ union bpf_attr {
 		 * truncated), or smaller (if log buffer wasn't filled completely).
 		 */
 		__u32		log_true_size;
+		/* BPF token FD to use with BPF_PROG_LOAD operation.
+		 * If provided, prog_flags should have BPF_F_TOKEN_FD flag set.
+		 */
+		__s32		prog_token_fd;
 	};
 
 	struct { /* anonymous struct used by BPF_OBJ_* commands */
diff --git a/tools/testing/selftests/bpf/prog_tests/libbpf_probes.c b/tools/testing/selftests/bpf/prog_tests/libbpf_probes.c
index 573249a2814d..4ed46ed58a7b 100644
--- a/tools/testing/selftests/bpf/prog_tests/libbpf_probes.c
+++ b/tools/testing/selftests/bpf/prog_tests/libbpf_probes.c
@@ -30,6 +30,8 @@ void test_libbpf_probe_prog_types(void)
 
 		if (prog_type == BPF_PROG_TYPE_UNSPEC)
 			continue;
+		if (strcmp(prog_type_name, "__MAX_BPF_PROG_TYPE") == 0)
+			continue;
 
 		if (!test__start_subtest(prog_type_name))
 			continue;
diff --git a/tools/testing/selftests/bpf/prog_tests/libbpf_str.c b/tools/testing/selftests/bpf/prog_tests/libbpf_str.c
index 1f328c0d8aff..62ea855ec4d0 100644
--- a/tools/testing/selftests/bpf/prog_tests/libbpf_str.c
+++ b/tools/testing/selftests/bpf/prog_tests/libbpf_str.c
@@ -189,6 +189,9 @@ static void test_libbpf_bpf_prog_type_str(void)
 		const char *prog_type_str;
 		char buf[256];
 
+		if (prog_type == __MAX_BPF_PROG_TYPE)
+			continue;
+
 		prog_type_name = btf__str_by_offset(btf, e->name_off);
 		prog_type_str = libbpf_bpf_prog_type_str(prog_type);
 		ASSERT_OK_PTR(prog_type_str, prog_type_name);
-- 
cgit v1.2.3


From bbc1d24724e110b86a1a7c3c1724ce0d62cc1e2e Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Tue, 23 Jan 2024 18:21:04 -0800
Subject: bpf: Take into account BPF token when fetching helper protos

Instead of performing unconditional system-wide bpf_capable() and
perfmon_capable() calls inside bpf_base_func_proto() function (and other
similar ones) to determine eligibility of a given BPF helper for a given
program, use previously recorded BPF token during BPF_PROG_LOAD command
handling to inform the decision.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20240124022127.2379740-8-andrii@kernel.org
---
 drivers/media/rc/bpf-lirc.c |  2 +-
 include/linux/bpf.h         |  5 +++--
 kernel/bpf/cgroup.c         |  6 +++---
 kernel/bpf/helpers.c        |  6 +++---
 kernel/bpf/syscall.c        |  5 +++--
 kernel/trace/bpf_trace.c    |  2 +-
 net/core/filter.c           | 32 ++++++++++++++++----------------
 net/ipv4/bpf_tcp_ca.c       |  2 +-
 net/netfilter/nf_bpf_link.c |  2 +-
 9 files changed, 32 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c
index fe17c7f98e81..6d07693c6b9f 100644
--- a/drivers/media/rc/bpf-lirc.c
+++ b/drivers/media/rc/bpf-lirc.c
@@ -110,7 +110,7 @@ lirc_mode2_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_get_prandom_u32:
 		return &bpf_get_prandom_u32_proto;
 	case BPF_FUNC_trace_printk:
-		if (perfmon_capable())
+		if (bpf_token_capable(prog->aux->token, CAP_PERFMON))
 			return bpf_get_trace_printk_proto();
 		fallthrough;
 	default:
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index d0bf37e3f166..1325225bf602 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2550,7 +2550,8 @@ int btf_find_next_decl_tag(const struct btf *btf, const struct btf_type *pt,
 struct bpf_prog *bpf_prog_by_id(u32 id);
 struct bpf_link *bpf_link_by_id(u32 id);
 
-const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id);
+const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id,
+						 const struct bpf_prog *prog);
 void bpf_task_storage_free(struct task_struct *task);
 void bpf_cgrp_storage_free(struct cgroup *cgroup);
 bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog);
@@ -2810,7 +2811,7 @@ static inline int btf_struct_access(struct bpf_verifier_log *log,
 }
 
 static inline const struct bpf_func_proto *
-bpf_base_func_proto(enum bpf_func_id func_id)
+bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
 	return NULL;
 }
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 491d20038cbe..98e0e3835b28 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1630,7 +1630,7 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_perf_event_output:
 		return &bpf_event_output_data_proto;
 	default:
-		return bpf_base_func_proto(func_id);
+		return bpf_base_func_proto(func_id, prog);
 	}
 }
 
@@ -2191,7 +2191,7 @@ sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_perf_event_output:
 		return &bpf_event_output_data_proto;
 	default:
-		return bpf_base_func_proto(func_id);
+		return bpf_base_func_proto(func_id, prog);
 	}
 }
 
@@ -2348,7 +2348,7 @@ cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_perf_event_output:
 		return &bpf_event_output_data_proto;
 	default:
-		return bpf_base_func_proto(func_id);
+		return bpf_base_func_proto(func_id, prog);
 	}
 }
 
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index e04ca1af8927..bcb951a2ecf4 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1680,7 +1680,7 @@ const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak;
 const struct bpf_func_proto bpf_task_pt_regs_proto __weak;
 
 const struct bpf_func_proto *
-bpf_base_func_proto(enum bpf_func_id func_id)
+bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
 	switch (func_id) {
 	case BPF_FUNC_map_lookup_elem:
@@ -1731,7 +1731,7 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 		break;
 	}
 
-	if (!bpf_capable())
+	if (!bpf_token_capable(prog->aux->token, CAP_BPF))
 		return NULL;
 
 	switch (func_id) {
@@ -1789,7 +1789,7 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 		break;
 	}
 
-	if (!perfmon_capable())
+	if (!bpf_token_capable(prog->aux->token, CAP_PERFMON))
 		return NULL;
 
 	switch (func_id) {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 61b4bf4cc287..f76408c957ce 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -5772,7 +5772,7 @@ static const struct bpf_func_proto bpf_sys_bpf_proto = {
 const struct bpf_func_proto * __weak
 tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
-	return bpf_base_func_proto(func_id);
+	return bpf_base_func_proto(func_id, prog);
 }
 
 BPF_CALL_1(bpf_sys_close, u32, fd)
@@ -5822,7 +5822,8 @@ syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
 	switch (func_id) {
 	case BPF_FUNC_sys_bpf:
-		return !perfmon_capable() ? NULL : &bpf_sys_bpf_proto;
+		return !bpf_token_capable(prog->aux->token, CAP_PERFMON)
+		       ? NULL : &bpf_sys_bpf_proto;
 	case BPF_FUNC_btf_find_by_name_kind:
 		return &bpf_btf_find_by_name_kind_proto;
 	case BPF_FUNC_sys_close:
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index c98c20abaf99..64fdaf79d113 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1629,7 +1629,7 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_trace_vprintk:
 		return bpf_get_trace_vprintk_proto();
 	default:
-		return bpf_base_func_proto(func_id);
+		return bpf_base_func_proto(func_id, prog);
 	}
 }
 
diff --git a/net/core/filter.c b/net/core/filter.c
index 6a7abbaa50b8..521bcd0f5e4d 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -87,7 +87,7 @@
 #include "dev.h"
 
 static const struct bpf_func_proto *
-bpf_sk_base_func_proto(enum bpf_func_id func_id);
+bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog);
 
 int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len)
 {
@@ -7862,7 +7862,7 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_ktime_get_coarse_ns:
 		return &bpf_ktime_get_coarse_ns_proto;
 	default:
-		return bpf_base_func_proto(func_id);
+		return bpf_base_func_proto(func_id, prog);
 	}
 }
 
@@ -7955,7 +7955,7 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 			return NULL;
 		}
 	default:
-		return bpf_sk_base_func_proto(func_id);
+		return bpf_sk_base_func_proto(func_id, prog);
 	}
 }
 
@@ -7974,7 +7974,7 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_perf_event_output:
 		return &bpf_skb_event_output_proto;
 	default:
-		return bpf_sk_base_func_proto(func_id);
+		return bpf_sk_base_func_proto(func_id, prog);
 	}
 }
 
@@ -8161,7 +8161,7 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 #endif
 #endif
 	default:
-		return bpf_sk_base_func_proto(func_id);
+		return bpf_sk_base_func_proto(func_id, prog);
 	}
 }
 
@@ -8220,7 +8220,7 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 #endif
 #endif
 	default:
-		return bpf_sk_base_func_proto(func_id);
+		return bpf_sk_base_func_proto(func_id, prog);
 	}
 
 #if IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)
@@ -8281,7 +8281,7 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_tcp_sock_proto;
 #endif /* CONFIG_INET */
 	default:
-		return bpf_sk_base_func_proto(func_id);
+		return bpf_sk_base_func_proto(func_id, prog);
 	}
 }
 
@@ -8323,7 +8323,7 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_cgroup_classid_curr_proto;
 #endif
 	default:
-		return bpf_sk_base_func_proto(func_id);
+		return bpf_sk_base_func_proto(func_id, prog);
 	}
 }
 
@@ -8367,7 +8367,7 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_skc_lookup_tcp_proto;
 #endif
 	default:
-		return bpf_sk_base_func_proto(func_id);
+		return bpf_sk_base_func_proto(func_id, prog);
 	}
 }
 
@@ -8378,7 +8378,7 @@ flow_dissector_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_skb_load_bytes:
 		return &bpf_flow_dissector_load_bytes_proto;
 	default:
-		return bpf_sk_base_func_proto(func_id);
+		return bpf_sk_base_func_proto(func_id, prog);
 	}
 }
 
@@ -8405,7 +8405,7 @@ lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_skb_under_cgroup:
 		return &bpf_skb_under_cgroup_proto;
 	default:
-		return bpf_sk_base_func_proto(func_id);
+		return bpf_sk_base_func_proto(func_id, prog);
 	}
 }
 
@@ -11236,7 +11236,7 @@ sk_reuseport_func_proto(enum bpf_func_id func_id,
 	case BPF_FUNC_ktime_get_coarse_ns:
 		return &bpf_ktime_get_coarse_ns_proto;
 	default:
-		return bpf_base_func_proto(func_id);
+		return bpf_base_func_proto(func_id, prog);
 	}
 }
 
@@ -11418,7 +11418,7 @@ sk_lookup_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_sk_release:
 		return &bpf_sk_release_proto;
 	default:
-		return bpf_sk_base_func_proto(func_id);
+		return bpf_sk_base_func_proto(func_id, prog);
 	}
 }
 
@@ -11752,7 +11752,7 @@ const struct bpf_func_proto bpf_sock_from_file_proto = {
 };
 
 static const struct bpf_func_proto *
-bpf_sk_base_func_proto(enum bpf_func_id func_id)
+bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
 	const struct bpf_func_proto *func;
 
@@ -11781,10 +11781,10 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id)
 	case BPF_FUNC_ktime_get_coarse_ns:
 		return &bpf_ktime_get_coarse_ns_proto;
 	default:
-		return bpf_base_func_proto(func_id);
+		return bpf_base_func_proto(func_id, prog);
 	}
 
-	if (!perfmon_capable())
+	if (!bpf_token_capable(prog->aux->token, CAP_PERFMON))
 		return NULL;
 
 	return func;
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index 8e7716256d3c..834edc18463a 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -197,7 +197,7 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id,
 	case BPF_FUNC_ktime_get_coarse_ns:
 		return &bpf_ktime_get_coarse_ns_proto;
 	default:
-		return bpf_base_func_proto(func_id);
+		return bpf_base_func_proto(func_id, prog);
 	}
 }
 
diff --git a/net/netfilter/nf_bpf_link.c b/net/netfilter/nf_bpf_link.c
index 0e4beae421f8..5257d5e7eb09 100644
--- a/net/netfilter/nf_bpf_link.c
+++ b/net/netfilter/nf_bpf_link.c
@@ -314,7 +314,7 @@ static bool nf_is_valid_access(int off, int size, enum bpf_access_type type,
 static const struct bpf_func_proto *
 bpf_nf_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
-	return bpf_base_func_proto(func_id);
+	return bpf_base_func_proto(func_id, prog);
 }
 
 const struct bpf_verifier_ops netfilter_verifier_ops = {
-- 
cgit v1.2.3


From d79a3549754725bb90e58104417449edddf3da3d Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Tue, 23 Jan 2024 18:21:05 -0800
Subject: bpf: Consistently use BPF token throughout BPF verifier logic

Remove remaining direct queries to perfmon_capable() and bpf_capable()
in BPF verifier logic and instead use BPF token (if available) to make
decisions about privileges.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20240124022127.2379740-9-andrii@kernel.org
---
 include/linux/bpf.h    | 16 ++++++++--------
 include/linux/filter.h |  2 +-
 kernel/bpf/arraymap.c  |  2 +-
 kernel/bpf/core.c      |  2 +-
 kernel/bpf/verifier.c  | 13 ++++++-------
 net/core/filter.c      |  4 ++--
 6 files changed, 19 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 1325225bf602..4e146e9708be 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2261,24 +2261,24 @@ extern int sysctl_unprivileged_bpf_disabled;
 
 bool bpf_token_capable(const struct bpf_token *token, int cap);
 
-static inline bool bpf_allow_ptr_leaks(void)
+static inline bool bpf_allow_ptr_leaks(const struct bpf_token *token)
 {
-	return perfmon_capable();
+	return bpf_token_capable(token, CAP_PERFMON);
 }
 
-static inline bool bpf_allow_uninit_stack(void)
+static inline bool bpf_allow_uninit_stack(const struct bpf_token *token)
 {
-	return perfmon_capable();
+	return bpf_token_capable(token, CAP_PERFMON);
 }
 
-static inline bool bpf_bypass_spec_v1(void)
+static inline bool bpf_bypass_spec_v1(const struct bpf_token *token)
 {
-	return cpu_mitigations_off() || perfmon_capable();
+	return cpu_mitigations_off() || bpf_token_capable(token, CAP_PERFMON);
 }
 
-static inline bool bpf_bypass_spec_v4(void)
+static inline bool bpf_bypass_spec_v4(const struct bpf_token *token)
 {
-	return cpu_mitigations_off() || perfmon_capable();
+	return cpu_mitigations_off() || bpf_token_capable(token, CAP_PERFMON);
 }
 
 int bpf_map_new_fd(struct bpf_map *map, int flags);
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 35f067fd3840..fee070b9826e 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1140,7 +1140,7 @@ static inline bool bpf_jit_blinding_enabled(struct bpf_prog *prog)
 		return false;
 	if (!bpf_jit_harden)
 		return false;
-	if (bpf_jit_harden == 1 && bpf_capable())
+	if (bpf_jit_harden == 1 && bpf_token_capable(prog->aux->token, CAP_BPF))
 		return false;
 
 	return true;
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 0bdbbbeab155..13358675ff2e 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -82,7 +82,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
 	int numa_node = bpf_map_attr_numa_node(attr);
 	u32 elem_size, index_mask, max_entries;
-	bool bypass_spec_v1 = bpf_bypass_spec_v1();
+	bool bypass_spec_v1 = bpf_bypass_spec_v1(NULL);
 	u64 array_size, mask64;
 	struct bpf_array *array;
 
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 00dccba29769..71c459a51d9e 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -682,7 +682,7 @@ static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp)
 void bpf_prog_kallsyms_add(struct bpf_prog *fp)
 {
 	if (!bpf_prog_kallsyms_candidate(fp) ||
-	    !bpf_capable())
+	    !bpf_token_capable(fp->aux->token, CAP_BPF))
 		return;
 
 	bpf_prog_ksym_set_addr(fp);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f31868ba0c2d..fe833e831cb6 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -20830,7 +20830,12 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
 	env->prog = *prog;
 	env->ops = bpf_verifier_ops[env->prog->type];
 	env->fd_array = make_bpfptr(attr->fd_array, uattr.is_kernel);
-	is_priv = bpf_capable();
+
+	env->allow_ptr_leaks = bpf_allow_ptr_leaks(env->prog->aux->token);
+	env->allow_uninit_stack = bpf_allow_uninit_stack(env->prog->aux->token);
+	env->bypass_spec_v1 = bpf_bypass_spec_v1(env->prog->aux->token);
+	env->bypass_spec_v4 = bpf_bypass_spec_v4(env->prog->aux->token);
+	env->bpf_capable = is_priv = bpf_token_capable(env->prog->aux->token, CAP_BPF);
 
 	bpf_get_btf_vmlinux();
 
@@ -20862,12 +20867,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
 	if (attr->prog_flags & BPF_F_ANY_ALIGNMENT)
 		env->strict_alignment = false;
 
-	env->allow_ptr_leaks = bpf_allow_ptr_leaks();
-	env->allow_uninit_stack = bpf_allow_uninit_stack();
-	env->bypass_spec_v1 = bpf_bypass_spec_v1();
-	env->bypass_spec_v4 = bpf_bypass_spec_v4();
-	env->bpf_capable = bpf_capable();
-
 	if (is_priv)
 		env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ;
 	env->test_reg_invariants = attr->prog_flags & BPF_F_TEST_REG_INVARIANTS;
diff --git a/net/core/filter.c b/net/core/filter.c
index 521bcd0f5e4d..40121475e8d1 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -8580,7 +8580,7 @@ static bool cg_skb_is_valid_access(int off, int size,
 		return false;
 	case bpf_ctx_range(struct __sk_buff, data):
 	case bpf_ctx_range(struct __sk_buff, data_end):
-		if (!bpf_capable())
+		if (!bpf_token_capable(prog->aux->token, CAP_BPF))
 			return false;
 		break;
 	}
@@ -8592,7 +8592,7 @@ static bool cg_skb_is_valid_access(int off, int size,
 		case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
 			break;
 		case bpf_ctx_range(struct __sk_buff, tstamp):
-			if (!bpf_capable())
+			if (!bpf_token_capable(prog->aux->token, CAP_BPF))
 				return false;
 			break;
 		default:
-- 
cgit v1.2.3


From 1b67772e4e3f16cd647b229cae95fc06d120be08 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Tue, 23 Jan 2024 18:21:06 -0800
Subject: bpf,lsm: Refactor bpf_prog_alloc/bpf_prog_free LSM hooks

Based on upstream discussion ([0]), rework existing
bpf_prog_alloc_security LSM hook. Rename it to bpf_prog_load and instead
of passing bpf_prog_aux, pass proper bpf_prog pointer for a full BPF
program struct. Also, we pass bpf_attr union with all the user-provided
arguments for BPF_PROG_LOAD command.  This will give LSMs as much
information as we can basically provide.

The hook is also BPF token-aware now, and optional bpf_token struct is
passed as a third argument. bpf_prog_load LSM hook is called after
a bunch of sanity checks were performed, bpf_prog and bpf_prog_aux were
allocated and filled out, but right before performing full-fledged BPF
verification step.

bpf_prog_free LSM hook is now accepting struct bpf_prog argument, for
consistency. SELinux code is adjusted to all new names, types, and
signatures.

Note, given that bpf_prog_load (previously bpf_prog_alloc) hook can be
used by some LSMs to allocate extra security blob, but also by other
LSMs to reject BPF program loading, we need to make sure that
bpf_prog_free LSM hook is called after bpf_prog_load/bpf_prog_alloc one
*even* if the hook itself returned error. If we don't do that, we run
the risk of leaking memory. This seems to be possible today when
combining SELinux and BPF LSM, as one example, depending on their
relative ordering.

Also, for BPF LSM setup, add bpf_prog_load and bpf_prog_free to
sleepable LSM hooks list, as they are both executed in sleepable
context. Also drop bpf_prog_load hook from untrusted, as there is no
issue with refcount or anything else anymore, that originally forced us
to add it to untrusted list in c0c852dd1876 ("bpf: Do not mark certain LSM
hook arguments as trusted"). We now trigger this hook much later and it
should not be an issue anymore.

  [0] https://lore.kernel.org/bpf/9fe88aef7deabbe87d3fc38c4aea3c69.paul@paul-moore.com/

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Paul Moore <paul@paul-moore.com>
Link: https://lore.kernel.org/bpf/20240124022127.2379740-10-andrii@kernel.org
---
 include/linux/lsm_hook_defs.h |  5 +++--
 include/linux/security.h      | 12 +++++++-----
 kernel/bpf/bpf_lsm.c          |  5 +++--
 kernel/bpf/syscall.c          | 25 +++++++++++++------------
 security/security.c           | 25 +++++++++++++++----------
 security/selinux/hooks.c      | 15 ++++++++-------
 6 files changed, 49 insertions(+), 38 deletions(-)

(limited to 'include')

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 185924c56378..370181aa685b 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -406,8 +406,9 @@ LSM_HOOK(int, 0, bpf_map, struct bpf_map *map, fmode_t fmode)
 LSM_HOOK(int, 0, bpf_prog, struct bpf_prog *prog)
 LSM_HOOK(int, 0, bpf_map_alloc_security, struct bpf_map *map)
 LSM_HOOK(void, LSM_RET_VOID, bpf_map_free_security, struct bpf_map *map)
-LSM_HOOK(int, 0, bpf_prog_alloc_security, struct bpf_prog_aux *aux)
-LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free_security, struct bpf_prog_aux *aux)
+LSM_HOOK(int, 0, bpf_prog_load, struct bpf_prog *prog, union bpf_attr *attr,
+	 struct bpf_token *token)
+LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free, struct bpf_prog *prog)
 #endif /* CONFIG_BPF_SYSCALL */
 
 LSM_HOOK(int, 0, locked_down, enum lockdown_reason what)
diff --git a/include/linux/security.h b/include/linux/security.h
index d0eb20f90b26..cb2932fce448 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -2064,15 +2064,16 @@ static inline void securityfs_remove(struct dentry *dentry)
 union bpf_attr;
 struct bpf_map;
 struct bpf_prog;
-struct bpf_prog_aux;
+struct bpf_token;
 #ifdef CONFIG_SECURITY
 extern int security_bpf(int cmd, union bpf_attr *attr, unsigned int size);
 extern int security_bpf_map(struct bpf_map *map, fmode_t fmode);
 extern int security_bpf_prog(struct bpf_prog *prog);
 extern int security_bpf_map_alloc(struct bpf_map *map);
 extern void security_bpf_map_free(struct bpf_map *map);
-extern int security_bpf_prog_alloc(struct bpf_prog_aux *aux);
-extern void security_bpf_prog_free(struct bpf_prog_aux *aux);
+extern int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr,
+				  struct bpf_token *token);
+extern void security_bpf_prog_free(struct bpf_prog *prog);
 #else
 static inline int security_bpf(int cmd, union bpf_attr *attr,
 					     unsigned int size)
@@ -2098,12 +2099,13 @@ static inline int security_bpf_map_alloc(struct bpf_map *map)
 static inline void security_bpf_map_free(struct bpf_map *map)
 { }
 
-static inline int security_bpf_prog_alloc(struct bpf_prog_aux *aux)
+static inline int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr,
+					 struct bpf_token *token)
 {
 	return 0;
 }
 
-static inline void security_bpf_prog_free(struct bpf_prog_aux *aux)
+static inline void security_bpf_prog_free(struct bpf_prog *prog)
 { }
 #endif /* CONFIG_SECURITY */
 #endif /* CONFIG_BPF_SYSCALL */
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index e8e910395bf6..7ee0dd011de4 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -263,6 +263,8 @@ BTF_ID(func, bpf_lsm_bpf_map)
 BTF_ID(func, bpf_lsm_bpf_map_alloc_security)
 BTF_ID(func, bpf_lsm_bpf_map_free_security)
 BTF_ID(func, bpf_lsm_bpf_prog)
+BTF_ID(func, bpf_lsm_bpf_prog_load)
+BTF_ID(func, bpf_lsm_bpf_prog_free)
 BTF_ID(func, bpf_lsm_bprm_check_security)
 BTF_ID(func, bpf_lsm_bprm_committed_creds)
 BTF_ID(func, bpf_lsm_bprm_committing_creds)
@@ -358,8 +360,7 @@ BTF_SET_END(sleepable_lsm_hooks)
 
 BTF_SET_START(untrusted_lsm_hooks)
 BTF_ID(func, bpf_lsm_bpf_map_free_security)
-BTF_ID(func, bpf_lsm_bpf_prog_alloc_security)
-BTF_ID(func, bpf_lsm_bpf_prog_free_security)
+BTF_ID(func, bpf_lsm_bpf_prog_free)
 BTF_ID(func, bpf_lsm_file_alloc_security)
 BTF_ID(func, bpf_lsm_file_free_security)
 #ifdef CONFIG_SECURITY_NETWORK
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index f76408c957ce..3857a2a9ae7f 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2180,7 +2180,7 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu)
 	kvfree(aux->func_info);
 	kfree(aux->func_info_aux);
 	free_uid(aux->user);
-	security_bpf_prog_free(aux);
+	security_bpf_prog_free(aux->prog);
 	bpf_prog_free(aux->prog);
 }
 
@@ -2772,10 +2772,6 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 	prog->aux->token = token;
 	token = NULL;
 
-	err = security_bpf_prog_alloc(prog->aux);
-	if (err)
-		goto free_prog;
-
 	prog->aux->user = get_current_user();
 	prog->len = attr->insn_cnt;
 
@@ -2783,12 +2779,12 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 	if (copy_from_bpfptr(prog->insns,
 			     make_bpfptr(attr->insns, uattr.is_kernel),
 			     bpf_prog_insn_size(prog)) != 0)
-		goto free_prog_sec;
+		goto free_prog;
 	/* copy eBPF program license from user space */
 	if (strncpy_from_bpfptr(license,
 				make_bpfptr(attr->license, uattr.is_kernel),
 				sizeof(license) - 1) < 0)
-		goto free_prog_sec;
+		goto free_prog;
 	license[sizeof(license) - 1] = 0;
 
 	/* eBPF programs must be GPL compatible to use GPL-ed functions */
@@ -2802,14 +2798,14 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 	if (bpf_prog_is_dev_bound(prog->aux)) {
 		err = bpf_prog_dev_bound_init(prog, attr);
 		if (err)
-			goto free_prog_sec;
+			goto free_prog;
 	}
 
 	if (type == BPF_PROG_TYPE_EXT && dst_prog &&
 	    bpf_prog_is_dev_bound(dst_prog->aux)) {
 		err = bpf_prog_dev_bound_inherit(prog, dst_prog);
 		if (err)
-			goto free_prog_sec;
+			goto free_prog;
 	}
 
 	/*
@@ -2831,12 +2827,16 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 	/* find program type: socket_filter vs tracing_filter */
 	err = find_prog_type(type, prog);
 	if (err < 0)
-		goto free_prog_sec;
+		goto free_prog;
 
 	prog->aux->load_time = ktime_get_boottime_ns();
 	err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name,
 			       sizeof(attr->prog_name));
 	if (err < 0)
+		goto free_prog;
+
+	err = security_bpf_prog_load(prog, attr, token);
+	if (err)
 		goto free_prog_sec;
 
 	/* run eBPF verifier */
@@ -2882,10 +2882,11 @@ free_used_maps:
 	 */
 	__bpf_prog_put_noref(prog, prog->aux->real_func_cnt);
 	return err;
+
 free_prog_sec:
-	free_uid(prog->aux->user);
-	security_bpf_prog_free(prog->aux);
+	security_bpf_prog_free(prog);
 free_prog:
+	free_uid(prog->aux->user);
 	if (prog->aux->attach_btf)
 		btf_put(prog->aux->attach_btf);
 	bpf_prog_free(prog);
diff --git a/security/security.c b/security/security.c
index 0144a98d3712..eb159da4b146 100644
--- a/security/security.c
+++ b/security/security.c
@@ -5423,16 +5423,21 @@ int security_bpf_map_alloc(struct bpf_map *map)
 }
 
 /**
- * security_bpf_prog_alloc() - Allocate a bpf program LSM blob
- * @aux: bpf program aux info struct
+ * security_bpf_prog_load() - Check if loading of BPF program is allowed
+ * @prog: BPF program object
+ * @attr: BPF syscall attributes used to create BPF program
+ * @token: BPF token used to grant user access to BPF subsystem
  *
- * Initialize the security field inside bpf program.
+ * Perform an access control check when the kernel loads a BPF program and
+ * allocates associated BPF program object. This hook is also responsible for
+ * allocating any required LSM state for the BPF program.
  *
  * Return: Returns 0 on success, error on failure.
  */
-int security_bpf_prog_alloc(struct bpf_prog_aux *aux)
+int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr,
+			   struct bpf_token *token)
 {
-	return call_int_hook(bpf_prog_alloc_security, 0, aux);
+	return call_int_hook(bpf_prog_load, 0, prog, attr, token);
 }
 
 /**
@@ -5447,14 +5452,14 @@ void security_bpf_map_free(struct bpf_map *map)
 }
 
 /**
- * security_bpf_prog_free() - Free a bpf program's LSM blob
- * @aux: bpf program aux info struct
+ * security_bpf_prog_free() - Free a BPF program's LSM blob
+ * @prog: BPF program struct
  *
- * Clean up the security information stored inside bpf prog.
+ * Clean up the security information stored inside BPF program.
  */
-void security_bpf_prog_free(struct bpf_prog_aux *aux)
+void security_bpf_prog_free(struct bpf_prog *prog)
 {
-	call_void_hook(bpf_prog_free_security, aux);
+	call_void_hook(bpf_prog_free, prog);
 }
 #endif /* CONFIG_BPF_SYSCALL */
 
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index a6bf90ace84c..6d64fb189b1b 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -6942,7 +6942,8 @@ static void selinux_bpf_map_free(struct bpf_map *map)
 	kfree(bpfsec);
 }
 
-static int selinux_bpf_prog_alloc(struct bpf_prog_aux *aux)
+static int selinux_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr,
+				 struct bpf_token *token)
 {
 	struct bpf_security_struct *bpfsec;
 
@@ -6951,16 +6952,16 @@ static int selinux_bpf_prog_alloc(struct bpf_prog_aux *aux)
 		return -ENOMEM;
 
 	bpfsec->sid = current_sid();
-	aux->security = bpfsec;
+	prog->aux->security = bpfsec;
 
 	return 0;
 }
 
-static void selinux_bpf_prog_free(struct bpf_prog_aux *aux)
+static void selinux_bpf_prog_free(struct bpf_prog *prog)
 {
-	struct bpf_security_struct *bpfsec = aux->security;
+	struct bpf_security_struct *bpfsec = prog->aux->security;
 
-	aux->security = NULL;
+	prog->aux->security = NULL;
 	kfree(bpfsec);
 }
 #endif
@@ -7325,7 +7326,7 @@ static struct security_hook_list selinux_hooks[] __ro_after_init = {
 	LSM_HOOK_INIT(bpf_map, selinux_bpf_map),
 	LSM_HOOK_INIT(bpf_prog, selinux_bpf_prog),
 	LSM_HOOK_INIT(bpf_map_free_security, selinux_bpf_map_free),
-	LSM_HOOK_INIT(bpf_prog_free_security, selinux_bpf_prog_free),
+	LSM_HOOK_INIT(bpf_prog_free, selinux_bpf_prog_free),
 #endif
 
 #ifdef CONFIG_PERF_EVENTS
@@ -7383,7 +7384,7 @@ static struct security_hook_list selinux_hooks[] __ro_after_init = {
 #endif
 #ifdef CONFIG_BPF_SYSCALL
 	LSM_HOOK_INIT(bpf_map_alloc_security, selinux_bpf_map_alloc),
-	LSM_HOOK_INIT(bpf_prog_alloc_security, selinux_bpf_prog_alloc),
+	LSM_HOOK_INIT(bpf_prog_load, selinux_bpf_prog_load),
 #endif
 #ifdef CONFIG_PERF_EVENTS
 	LSM_HOOK_INIT(perf_event_alloc, selinux_perf_event_alloc),
-- 
cgit v1.2.3


From a2431c7eabcf9bd5a1e7a1f7ecded40fdda4a8c5 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Tue, 23 Jan 2024 18:21:07 -0800
Subject: bpf,lsm: Refactor bpf_map_alloc/bpf_map_free LSM hooks

Similarly to bpf_prog_alloc LSM hook, rename and extend bpf_map_alloc
hook into bpf_map_create, taking not just struct bpf_map, but also
bpf_attr and bpf_token, to give a fuller context to LSMs.

Unlike bpf_prog_alloc, there is no need to move the hook around, as it
currently is firing right before allocating BPF map ID and FD, which
seems to be a sweet spot.

But like bpf_prog_alloc/bpf_prog_free combo, make sure that bpf_map_free
LSM hook is called even if bpf_map_create hook returned error, as if few
LSMs are combined together it could be that one LSM successfully
allocated security blob for its needs, while subsequent LSM rejected BPF
map creation. The former LSM would still need to free up LSM blob, so we
need to ensure security_bpf_map_free() is called regardless of the
outcome.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Paul Moore <paul@paul-moore.com>
Link: https://lore.kernel.org/bpf/20240124022127.2379740-11-andrii@kernel.org
---
 include/linux/lsm_hook_defs.h |  5 +++--
 include/linux/security.h      |  6 ++++--
 kernel/bpf/bpf_lsm.c          |  6 +++---
 kernel/bpf/syscall.c          |  4 ++--
 security/security.c           | 16 ++++++++++------
 security/selinux/hooks.c      |  7 ++++---
 6 files changed, 26 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 370181aa685b..1be4d3ca6efb 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -404,8 +404,9 @@ LSM_HOOK(void, LSM_RET_VOID, audit_rule_free, void *lsmrule)
 LSM_HOOK(int, 0, bpf, int cmd, union bpf_attr *attr, unsigned int size)
 LSM_HOOK(int, 0, bpf_map, struct bpf_map *map, fmode_t fmode)
 LSM_HOOK(int, 0, bpf_prog, struct bpf_prog *prog)
-LSM_HOOK(int, 0, bpf_map_alloc_security, struct bpf_map *map)
-LSM_HOOK(void, LSM_RET_VOID, bpf_map_free_security, struct bpf_map *map)
+LSM_HOOK(int, 0, bpf_map_create, struct bpf_map *map, union bpf_attr *attr,
+	 struct bpf_token *token)
+LSM_HOOK(void, LSM_RET_VOID, bpf_map_free, struct bpf_map *map)
 LSM_HOOK(int, 0, bpf_prog_load, struct bpf_prog *prog, union bpf_attr *attr,
 	 struct bpf_token *token)
 LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free, struct bpf_prog *prog)
diff --git a/include/linux/security.h b/include/linux/security.h
index cb2932fce448..83fcdc974116 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -2069,7 +2069,8 @@ struct bpf_token;
 extern int security_bpf(int cmd, union bpf_attr *attr, unsigned int size);
 extern int security_bpf_map(struct bpf_map *map, fmode_t fmode);
 extern int security_bpf_prog(struct bpf_prog *prog);
-extern int security_bpf_map_alloc(struct bpf_map *map);
+extern int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr,
+				   struct bpf_token *token);
 extern void security_bpf_map_free(struct bpf_map *map);
 extern int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr,
 				  struct bpf_token *token);
@@ -2091,7 +2092,8 @@ static inline int security_bpf_prog(struct bpf_prog *prog)
 	return 0;
 }
 
-static inline int security_bpf_map_alloc(struct bpf_map *map)
+static inline int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr,
+					  struct bpf_token *token)
 {
 	return 0;
 }
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index 7ee0dd011de4..76976908b302 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -260,8 +260,8 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 BTF_SET_START(sleepable_lsm_hooks)
 BTF_ID(func, bpf_lsm_bpf)
 BTF_ID(func, bpf_lsm_bpf_map)
-BTF_ID(func, bpf_lsm_bpf_map_alloc_security)
-BTF_ID(func, bpf_lsm_bpf_map_free_security)
+BTF_ID(func, bpf_lsm_bpf_map_create)
+BTF_ID(func, bpf_lsm_bpf_map_free)
 BTF_ID(func, bpf_lsm_bpf_prog)
 BTF_ID(func, bpf_lsm_bpf_prog_load)
 BTF_ID(func, bpf_lsm_bpf_prog_free)
@@ -359,7 +359,7 @@ BTF_ID(func, bpf_lsm_userns_create)
 BTF_SET_END(sleepable_lsm_hooks)
 
 BTF_SET_START(untrusted_lsm_hooks)
-BTF_ID(func, bpf_lsm_bpf_map_free_security)
+BTF_ID(func, bpf_lsm_bpf_map_free)
 BTF_ID(func, bpf_lsm_bpf_prog_free)
 BTF_ID(func, bpf_lsm_file_alloc_security)
 BTF_ID(func, bpf_lsm_file_free_security)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 3857a2a9ae7f..b2750b79ac80 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1318,9 +1318,9 @@ static int map_create(union bpf_attr *attr)
 			attr->btf_vmlinux_value_type_id;
 	}
 
-	err = security_bpf_map_alloc(map);
+	err = security_bpf_map_create(map, attr, token);
 	if (err)
-		goto free_map;
+		goto free_map_sec;
 
 	err = bpf_map_alloc_id(map);
 	if (err)
diff --git a/security/security.c b/security/security.c
index eb159da4b146..26fcab35b6cd 100644
--- a/security/security.c
+++ b/security/security.c
@@ -5410,16 +5410,20 @@ int security_bpf_prog(struct bpf_prog *prog)
 }
 
 /**
- * security_bpf_map_alloc() - Allocate a bpf map LSM blob
- * @map: bpf map
+ * security_bpf_map_create() - Check if BPF map creation is allowed
+ * @map: BPF map object
+ * @attr: BPF syscall attributes used to create BPF map
+ * @token: BPF token used to grant user access
  *
- * Initialize the security field inside bpf map.
+ * Do a check when the kernel creates a new BPF map. This is also the
+ * point where LSM blob is allocated for LSMs that need them.
  *
  * Return: Returns 0 on success, error on failure.
  */
-int security_bpf_map_alloc(struct bpf_map *map)
+int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr,
+			    struct bpf_token *token)
 {
-	return call_int_hook(bpf_map_alloc_security, 0, map);
+	return call_int_hook(bpf_map_create, 0, map, attr, token);
 }
 
 /**
@@ -5448,7 +5452,7 @@ int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr,
  */
 void security_bpf_map_free(struct bpf_map *map)
 {
-	call_void_hook(bpf_map_free_security, map);
+	call_void_hook(bpf_map_free, map);
 }
 
 /**
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 6d64fb189b1b..3d336a7952f7 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -6920,7 +6920,8 @@ static int selinux_bpf_prog(struct bpf_prog *prog)
 			    BPF__PROG_RUN, NULL);
 }
 
-static int selinux_bpf_map_alloc(struct bpf_map *map)
+static int selinux_bpf_map_create(struct bpf_map *map, union bpf_attr *attr,
+				  struct bpf_token *token)
 {
 	struct bpf_security_struct *bpfsec;
 
@@ -7325,7 +7326,7 @@ static struct security_hook_list selinux_hooks[] __ro_after_init = {
 	LSM_HOOK_INIT(bpf, selinux_bpf),
 	LSM_HOOK_INIT(bpf_map, selinux_bpf_map),
 	LSM_HOOK_INIT(bpf_prog, selinux_bpf_prog),
-	LSM_HOOK_INIT(bpf_map_free_security, selinux_bpf_map_free),
+	LSM_HOOK_INIT(bpf_map_free, selinux_bpf_map_free),
 	LSM_HOOK_INIT(bpf_prog_free, selinux_bpf_prog_free),
 #endif
 
@@ -7383,7 +7384,7 @@ static struct security_hook_list selinux_hooks[] __ro_after_init = {
 	LSM_HOOK_INIT(audit_rule_init, selinux_audit_rule_init),
 #endif
 #ifdef CONFIG_BPF_SYSCALL
-	LSM_HOOK_INIT(bpf_map_alloc_security, selinux_bpf_map_alloc),
+	LSM_HOOK_INIT(bpf_map_create, selinux_bpf_map_create),
 	LSM_HOOK_INIT(bpf_prog_load, selinux_bpf_prog_load),
 #endif
 #ifdef CONFIG_PERF_EVENTS
-- 
cgit v1.2.3


From f568a3d49af9aed813a184353592efe29b0e3d16 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Tue, 23 Jan 2024 18:21:08 -0800
Subject: bpf,lsm: Add BPF token LSM hooks

Wire up bpf_token_create and bpf_token_free LSM hooks, which allow to
allocate LSM security blob (we add `void *security` field to struct
bpf_token for that), but also control who can instantiate BPF token.
This follows existing pattern for BPF map and BPF prog.

Also add security_bpf_token_allow_cmd() and security_bpf_token_capable()
LSM hooks that allow LSM implementation to control and negate (if
necessary) BPF token's delegation of a specific bpf_cmd and capability,
respectively.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Paul Moore <paul@paul-moore.com>
Link: https://lore.kernel.org/bpf/20240124022127.2379740-12-andrii@kernel.org
---
 include/linux/bpf.h           |  3 +++
 include/linux/lsm_hook_defs.h |  5 ++++
 include/linux/security.h      | 25 ++++++++++++++++++
 kernel/bpf/bpf_lsm.c          |  4 +++
 kernel/bpf/token.c            | 12 ++++++++-
 security/security.c           | 60 +++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 108 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 4e146e9708be..b86bd15a051d 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1634,6 +1634,9 @@ struct bpf_token {
 	u64 allowed_maps;
 	u64 allowed_progs;
 	u64 allowed_attachs;
+#ifdef CONFIG_SECURITY
+	void *security;
+#endif
 };
 
 struct bpf_struct_ops_value;
diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 1be4d3ca6efb..cd6fbc7af3f8 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -410,6 +410,11 @@ LSM_HOOK(void, LSM_RET_VOID, bpf_map_free, struct bpf_map *map)
 LSM_HOOK(int, 0, bpf_prog_load, struct bpf_prog *prog, union bpf_attr *attr,
 	 struct bpf_token *token)
 LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free, struct bpf_prog *prog)
+LSM_HOOK(int, 0, bpf_token_create, struct bpf_token *token, union bpf_attr *attr,
+	 struct path *path)
+LSM_HOOK(void, LSM_RET_VOID, bpf_token_free, struct bpf_token *token)
+LSM_HOOK(int, 0, bpf_token_cmd, const struct bpf_token *token, enum bpf_cmd cmd)
+LSM_HOOK(int, 0, bpf_token_capable, const struct bpf_token *token, int cap)
 #endif /* CONFIG_BPF_SYSCALL */
 
 LSM_HOOK(int, 0, locked_down, enum lockdown_reason what)
diff --git a/include/linux/security.h b/include/linux/security.h
index 83fcdc974116..15804af54f37 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -32,6 +32,7 @@
 #include <linux/string.h>
 #include <linux/mm.h>
 #include <linux/sockptr.h>
+#include <linux/bpf.h>
 #include <uapi/linux/lsm.h>
 
 struct linux_binprm;
@@ -2075,6 +2076,11 @@ extern void security_bpf_map_free(struct bpf_map *map);
 extern int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr,
 				  struct bpf_token *token);
 extern void security_bpf_prog_free(struct bpf_prog *prog);
+extern int security_bpf_token_create(struct bpf_token *token, union bpf_attr *attr,
+				     struct path *path);
+extern void security_bpf_token_free(struct bpf_token *token);
+extern int security_bpf_token_cmd(const struct bpf_token *token, enum bpf_cmd cmd);
+extern int security_bpf_token_capable(const struct bpf_token *token, int cap);
 #else
 static inline int security_bpf(int cmd, union bpf_attr *attr,
 					     unsigned int size)
@@ -2109,6 +2115,25 @@ static inline int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *
 
 static inline void security_bpf_prog_free(struct bpf_prog *prog)
 { }
+
+static inline int security_bpf_token_create(struct bpf_token *token, union bpf_attr *attr,
+				     struct path *path)
+{
+	return 0;
+}
+
+static inline void security_bpf_token_free(struct bpf_token *token)
+{ }
+
+static inline int security_bpf_token_cmd(const struct bpf_token *token, enum bpf_cmd cmd)
+{
+	return 0;
+}
+
+static inline int security_bpf_token_capable(const struct bpf_token *token, int cap)
+{
+	return 0;
+}
 #endif /* CONFIG_SECURITY */
 #endif /* CONFIG_BPF_SYSCALL */
 
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index 76976908b302..63b4dc495125 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -265,6 +265,10 @@ BTF_ID(func, bpf_lsm_bpf_map_free)
 BTF_ID(func, bpf_lsm_bpf_prog)
 BTF_ID(func, bpf_lsm_bpf_prog_load)
 BTF_ID(func, bpf_lsm_bpf_prog_free)
+BTF_ID(func, bpf_lsm_bpf_token_create)
+BTF_ID(func, bpf_lsm_bpf_token_free)
+BTF_ID(func, bpf_lsm_bpf_token_cmd)
+BTF_ID(func, bpf_lsm_bpf_token_capable)
 BTF_ID(func, bpf_lsm_bprm_check_security)
 BTF_ID(func, bpf_lsm_bprm_committed_creds)
 BTF_ID(func, bpf_lsm_bprm_committing_creds)
diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c
index c13c73788d8c..64c568f47f69 100644
--- a/kernel/bpf/token.c
+++ b/kernel/bpf/token.c
@@ -7,6 +7,7 @@
 #include <linux/idr.h>
 #include <linux/namei.h>
 #include <linux/user_namespace.h>
+#include <linux/security.h>
 
 static bool bpf_ns_capable(struct user_namespace *ns, int cap)
 {
@@ -21,6 +22,8 @@ bool bpf_token_capable(const struct bpf_token *token, int cap)
 	userns = token ? token->userns : &init_user_ns;
 	if (!bpf_ns_capable(userns, cap))
 		return false;
+	if (token && security_bpf_token_capable(token, cap) < 0)
+		return false;
 	return true;
 }
 
@@ -31,6 +34,7 @@ void bpf_token_inc(struct bpf_token *token)
 
 static void bpf_token_free(struct bpf_token *token)
 {
+	security_bpf_token_free(token);
 	put_user_ns(token->userns);
 	kfree(token);
 }
@@ -193,6 +197,10 @@ int bpf_token_create(union bpf_attr *attr)
 	token->allowed_progs = mnt_opts->delegate_progs;
 	token->allowed_attachs = mnt_opts->delegate_attachs;
 
+	err = security_bpf_token_create(token, attr, &path);
+	if (err)
+		goto out_token;
+
 	fd = get_unused_fd_flags(O_CLOEXEC);
 	if (fd < 0) {
 		err = fd;
@@ -237,7 +245,9 @@ bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd)
 {
 	if (!token)
 		return false;
-	return token->allowed_cmds & (1ULL << cmd);
+	if (!(token->allowed_cmds & (1ULL << cmd)))
+		return false;
+	return security_bpf_token_cmd(token, cmd) == 0;
 }
 
 bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type type)
diff --git a/security/security.c b/security/security.c
index 26fcab35b6cd..73e009e3d937 100644
--- a/security/security.c
+++ b/security/security.c
@@ -5444,6 +5444,55 @@ int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr,
 	return call_int_hook(bpf_prog_load, 0, prog, attr, token);
 }
 
+/**
+ * security_bpf_token_create() - Check if creating of BPF token is allowed
+ * @token: BPF token object
+ * @attr: BPF syscall attributes used to create BPF token
+ * @path: path pointing to BPF FS mount point from which BPF token is created
+ *
+ * Do a check when the kernel instantiates a new BPF token object from BPF FS
+ * instance. This is also the point where LSM blob can be allocated for LSMs.
+ *
+ * Return: Returns 0 on success, error on failure.
+ */
+int security_bpf_token_create(struct bpf_token *token, union bpf_attr *attr,
+			      struct path *path)
+{
+	return call_int_hook(bpf_token_create, 0, token, attr, path);
+}
+
+/**
+ * security_bpf_token_cmd() - Check if BPF token is allowed to delegate
+ * requested BPF syscall command
+ * @token: BPF token object
+ * @cmd: BPF syscall command requested to be delegated by BPF token
+ *
+ * Do a check when the kernel decides whether provided BPF token should allow
+ * delegation of requested BPF syscall command.
+ *
+ * Return: Returns 0 on success, error on failure.
+ */
+int security_bpf_token_cmd(const struct bpf_token *token, enum bpf_cmd cmd)
+{
+	return call_int_hook(bpf_token_cmd, 0, token, cmd);
+}
+
+/**
+ * security_bpf_token_capable() - Check if BPF token is allowed to delegate
+ * requested BPF-related capability
+ * @token: BPF token object
+ * @cap: capabilities requested to be delegated by BPF token
+ *
+ * Do a check when the kernel decides whether provided BPF token should allow
+ * delegation of requested BPF-related capabilities.
+ *
+ * Return: Returns 0 on success, error on failure.
+ */
+int security_bpf_token_capable(const struct bpf_token *token, int cap)
+{
+	return call_int_hook(bpf_token_capable, 0, token, cap);
+}
+
 /**
  * security_bpf_map_free() - Free a bpf map's LSM blob
  * @map: bpf map
@@ -5465,6 +5514,17 @@ void security_bpf_prog_free(struct bpf_prog *prog)
 {
 	call_void_hook(bpf_prog_free, prog);
 }
+
+/**
+ * security_bpf_token_free() - Free a BPF token's LSM blob
+ * @token: BPF token struct
+ *
+ * Clean up the security information stored inside BPF token.
+ */
+void security_bpf_token_free(struct bpf_token *token)
+{
+	call_void_hook(bpf_token_free, token);
+}
 #endif /* CONFIG_BPF_SYSCALL */
 
 /**
-- 
cgit v1.2.3


From 83e897a961b801536dd1d736e9ede5b1ddb1c188 Mon Sep 17 00:00:00 2001
From: Ayala Beker <ayala.beker@intel.com>
Date: Tue, 2 Jan 2024 21:35:33 +0200
Subject: wifi: ieee80211: add definitions for negotiated TID to Link map

Add the relevant definitions and structures for TID to Link mapping
negotiation request/response/teardown according to P802.11be_D4.0.

Signed-off-by: Ayala Beker <ayala.beker@intel.com>
Reviewed-by: Gregory Greenman <gregory.greenman@intel.com>
Reviewed-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://msgid.link/20240102213313.9ef2b866c8c7.Ieaf7dadea9961e0edc55d19c99f0f9fbae591de6@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

(limited to 'include')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 83c4d060a559..eafa70e5ba94 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -1454,6 +1454,20 @@ struct ieee80211_mgmt {
 					u8 max_tod_error;
 					u8 max_toa_error;
 				} __packed wnm_timing_msr;
+				struct {
+					u8 action_code;
+					u8 dialog_token;
+					u8 variable[];
+				} __packed ttlm_req;
+				struct {
+					u8 action_code;
+					u8 dialog_token;
+					u8 status_code;
+					u8 variable[];
+				} __packed ttlm_res;
+				struct {
+					u8 action_code;
+				} __packed ttlm_tear_down;
 			} u;
 		} __packed action;
 		DECLARE_FLEX_ARRAY(u8, body); /* Generic frame body */
@@ -3357,6 +3371,8 @@ enum ieee80211_statuscode {
 	WLAN_STATUS_UNKNOWN_AUTHENTICATION_SERVER = 109,
 	WLAN_STATUS_SAE_HASH_TO_ELEMENT = 126,
 	WLAN_STATUS_SAE_PK = 127,
+	WLAN_STATUS_DENIED_TID_TO_LINK_MAPPING = 133,
+	WLAN_STATUS_PREF_TID_TO_LINK_MAPPING_SUGGESTED = 134,
 };
 
 
@@ -3682,6 +3698,7 @@ enum ieee80211_category {
 	WLAN_CATEGORY_UNPROT_DMG = 20,
 	WLAN_CATEGORY_VHT = 21,
 	WLAN_CATEGORY_S1G = 22,
+	WLAN_CATEGORY_PROTECTED_EHT = 37,
 	WLAN_CATEGORY_VENDOR_SPECIFIC_PROTECTED = 126,
 	WLAN_CATEGORY_VENDOR_SPECIFIC = 127,
 };
@@ -3745,6 +3762,13 @@ enum ieee80211_unprotected_wnm_actioncode {
 	WLAN_UNPROTECTED_WNM_ACTION_TIMING_MEASUREMENT_RESPONSE = 1,
 };
 
+/* Protected EHT action codes */
+enum ieee80211_protected_eht_actioncode {
+	WLAN_PROTECTED_EHT_ACTION_TTLM_REQ = 0,
+	WLAN_PROTECTED_EHT_ACTION_TTLM_RES = 1,
+	WLAN_PROTECTED_EHT_ACTION_TTLM_TEARDOWN = 2,
+};
+
 /* Security key length */
 enum ieee80211_key_len {
 	WLAN_KEY_LEN_WEP40 = 5,
@@ -4845,6 +4869,10 @@ struct ieee80211_multi_link_elem {
 #define IEEE80211_MLD_CAP_OP_MAX_SIMUL_LINKS		0x000f
 #define IEEE80211_MLD_CAP_OP_SRS_SUPPORT		0x0010
 #define IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_SUPP	0x0060
+#define IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_NO_SUPP	0
+#define IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_SUPP_SAME	1
+#define IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_RESERVED	2
+#define IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_SUPP_DIFF	3
 #define IEEE80211_MLD_CAP_OP_FREQ_SEP_TYPE_IND		0x0f80
 #define IEEE80211_MLD_CAP_OP_AAR_SUPPORT		0x1000
 
-- 
cgit v1.2.3


From 8f500fbc6c655976c8062b1f1e55bd0b3095d6c2 Mon Sep 17 00:00:00 2001
From: Ayala Beker <ayala.beker@intel.com>
Date: Tue, 2 Jan 2024 21:35:34 +0200
Subject: wifi: mac80211: process and save negotiated TID to Link mapping
 request

An MLD may send TID-to-Link mapping request frame to negotiate
TID to link mapping with a peer MLD.
Support handling negotiated TID-to-Link mapping request frame
by parsing the frame, asking the driver whether it supports the
received mapping or not, and sending a TID-to-Link mapping response
to the AP MLD.
Theoretically, links that became inactive due to the received TID-to-Link
mapping request, can be selected to be activated but this would require
tearing down the negotiated TID-to-Link mapping, which is still not
supported.

Signed-off-by: Ayala Beker <ayala.beker@intel.com>
Reviewed-by: Johannes Berg <johannes.berg@intel.com>
Reviewed-by: Gregory Greenman <gregory.greenman@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://msgid.link/20240102213313.0bc1a24fcc9d.Ie72e47dc6f8c77d4a2f0947b775ef6367fe0edac@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h     |  46 ++++++-
 net/mac80211/driver-ops.h  |  19 +++
 net/mac80211/ieee80211_i.h |   2 +
 net/mac80211/iface.c       |  12 ++
 net/mac80211/main.c        |   3 +-
 net/mac80211/mlme.c        | 335 ++++++++++++++++++++++++++++++++++++++++++---
 net/mac80211/rx.c          |  14 ++
 net/mac80211/trace.h       |  52 +++++++
 8 files changed, 465 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index d400fe2e8668..6490b92d5cc1 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -342,6 +342,7 @@ struct ieee80211_vif_chanctx_switch {
  *	status changed.
  * @BSS_CHANGED_EHT_PUNCTURING: The channel puncturing bitmap changed.
  * @BSS_CHANGED_MLD_VALID_LINKS: MLD valid links status changed.
+ * @BSS_CHANGED_MLD_TTLM: TID to link mapping was changed
  */
 enum ieee80211_bss_change {
 	BSS_CHANGED_ASSOC		= 1<<0,
@@ -378,6 +379,7 @@ enum ieee80211_bss_change {
 	BSS_CHANGED_UNSOL_BCAST_PROBE_RESP = 1<<31,
 	BSS_CHANGED_EHT_PUNCTURING	= BIT_ULL(32),
 	BSS_CHANGED_MLD_VALID_LINKS	= BIT_ULL(33),
+	BSS_CHANGED_MLD_TTLM		= BIT_ULL(34),
 
 	/* when adding here, make sure to change ieee80211_reconfig */
 };
@@ -1845,6 +1847,35 @@ struct ieee80211_vif_cfg {
 	u8 ap_addr[ETH_ALEN] __aligned(2);
 };
 
+#define IEEE80211_TTLM_NUM_TIDS 8
+
+/**
+ * struct ieee80211_neg_ttlm - negotiated TID to link map info
+ *
+ * @downlink: bitmap of active links per TID for downlink, or 0 if mapping for
+ *	this TID is not included.
+ * @uplink: bitmap of active links per TID for uplink, or 0 if mapping for this
+ *	TID is not included.
+ * @valid: info is valid or not.
+ */
+struct ieee80211_neg_ttlm {
+	u16 downlink[IEEE80211_TTLM_NUM_TIDS];
+	u16 uplink[IEEE80211_TTLM_NUM_TIDS];
+	bool valid;
+};
+
+/**
+ * enum ieee80211_neg_ttlm_res - return value for negotiated TTLM handling
+ * @NEG_TTLM_RES_ACCEPT: accept the request
+ * @NEG_TTLM_RES_REJECT: reject the request
+ * @NEG_TTLM_RES_SUGGEST_PREFERRED: reject and suggest a new mapping
+ */
+enum ieee80211_neg_ttlm_res {
+	NEG_TTLM_RES_ACCEPT,
+	NEG_TTLM_RES_REJECT,
+	NEG_TTLM_RES_SUGGEST_PREFERRED
+};
+
 /**
  * struct ieee80211_vif - per-interface data
  *
@@ -1863,6 +1894,11 @@ struct ieee80211_vif_cfg {
  *	API calls meant for that purpose.
  * @dormant_links: bitmap of valid but disabled links, or 0 for non-MLO.
  *	Must be a subset of valid_links.
+ * @suspended_links: subset of dormant_links representing links that are
+ *	suspended.
+ *	0 for non-MLO.
+ * @neg_ttlm: negotiated TID to link mapping info.
+ *	see &struct ieee80211_neg_ttlm.
  * @addr: address of this interface
  * @p2p: indicates whether this AP or STA interface is a p2p
  *	interface, i.e. a GO or p2p-sta respectively
@@ -1900,7 +1936,8 @@ struct ieee80211_vif {
 	struct ieee80211_vif_cfg cfg;
 	struct ieee80211_bss_conf bss_conf;
 	struct ieee80211_bss_conf __rcu *link_conf[IEEE80211_MLD_MAX_NUM_LINKS];
-	u16 valid_links, active_links, dormant_links;
+	u16 valid_links, active_links, dormant_links, suspended_links;
+	struct ieee80211_neg_ttlm neg_ttlm;
 	u8 addr[ETH_ALEN] __aligned(2);
 	bool p2p;
 
@@ -4293,6 +4330,10 @@ struct ieee80211_prep_tx_info {
  *	flow offloading for flows originating from the vif.
  *	Note that the driver must not assume that the vif driver_data is valid
  *	at this point, since the callback can be called during netdev teardown.
+ * @can_neg_ttlm: for managed interface, requests the driver to determine
+ *	if the requested TID-To-Link mapping can be accepted or not.
+ *	If it's not accepted the driver may suggest a preferred mapping and
+ *	modify @ttlm parameter with the suggested TID-to-Link mapping.
  */
 struct ieee80211_ops {
 	void (*tx)(struct ieee80211_hw *hw,
@@ -4673,6 +4714,9 @@ struct ieee80211_ops {
 			    struct net_device *dev,
 			    enum tc_setup_type type,
 			    void *type_data);
+	enum ieee80211_neg_ttlm_res
+	(*can_neg_ttlm)(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
+			struct ieee80211_neg_ttlm *ttlm);
 };
 
 /**
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index eb482fb8c3af..e20c64edb880 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -1695,4 +1695,23 @@ int drv_change_sta_links(struct ieee80211_local *local,
 			 struct ieee80211_sta *sta,
 			 u16 old_links, u16 new_links);
 
+static inline enum ieee80211_neg_ttlm_res
+drv_can_neg_ttlm(struct ieee80211_local *local,
+		 struct ieee80211_sub_if_data *sdata,
+		 struct ieee80211_neg_ttlm *neg_ttlm)
+{
+	enum ieee80211_neg_ttlm_res res = NEG_TTLM_RES_REJECT;
+
+	might_sleep();
+	if (!check_sdata_in_driver(sdata))
+		return -EIO;
+
+	trace_drv_can_neg_ttlm(local, sdata, neg_ttlm);
+	if (local->ops->can_neg_ttlm)
+		res = local->ops->can_neg_ttlm(&local->hw, &sdata->vif,
+					       neg_ttlm);
+	trace_drv_neg_ttlm_res(local, sdata, res, neg_ttlm);
+
+	return res;
+}
 #endif /* __MAC80211_DRIVER_OPS */
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 0b2b53550bd9..38755f6f6fa0 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -2603,6 +2603,8 @@ ieee80211_eht_cap_ie_to_sta_eht_cap(struct ieee80211_sub_if_data *sdata,
 				    const struct ieee80211_eht_cap_elem *eht_cap_ie_elem,
 				    u8 eht_cap_len,
 				    struct link_sta_info *link_sta);
+void ieee80211_process_neg_ttlm_req(struct ieee80211_sub_if_data *sdata,
+				    struct ieee80211_mgmt *mgmt, size_t len);
 
 void ieee80211_check_wbrf_support(struct ieee80211_local *local);
 void ieee80211_add_wbrf(struct ieee80211_local *local, struct cfg80211_chan_def *chandef);
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index e4e7c0b38cb6..4a87d2d336ae 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -1546,6 +1546,18 @@ static void ieee80211_iface_process_skb(struct ieee80211_local *local,
 		default:
 			break;
 		}
+	} else if (ieee80211_is_action(mgmt->frame_control) &&
+		   mgmt->u.action.category == WLAN_CATEGORY_PROTECTED_EHT) {
+		if (sdata->vif.type == NL80211_IFTYPE_STATION) {
+			switch (mgmt->u.action.u.ttlm_req.action_code) {
+			case WLAN_PROTECTED_EHT_ACTION_TTLM_REQ:
+				ieee80211_process_neg_ttlm_req(sdata, mgmt,
+							       skb->len);
+				break;
+			default:
+				break;
+			}
+		}
 	} else if (ieee80211_is_ext(mgmt->frame_control)) {
 		if (sdata->vif.type == NL80211_IFTYPE_STATION)
 			ieee80211_sta_rx_queued_ext(sdata, skb);
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index f2ece7793573..13c417eda281 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -208,7 +208,8 @@ int ieee80211_hw_config(struct ieee80211_local *local, u32 changed)
 				   BSS_CHANGED_IBSS |\
 				   BSS_CHANGED_ARP_FILTER |\
 				   BSS_CHANGED_SSID |\
-				   BSS_CHANGED_MLD_VALID_LINKS)
+				   BSS_CHANGED_MLD_VALID_LINKS |\
+				   BSS_CHANGED_MLD_TTLM)
 
 void ieee80211_bss_info_change_notify(struct ieee80211_sub_if_data *sdata,
 				      u64 changed)
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 073105deb424..b56f7de3b53f 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -1318,8 +1318,6 @@ static void ieee80211_assoc_add_ml_elem(struct ieee80211_sub_if_data *sdata,
 			cpu_to_le16(IEEE80211_MLC_BASIC_PRES_EML_CAPA);
 		skb_put_data(skb, &eml_capa, sizeof(eml_capa));
 	}
-	/* need indication from userspace to support this */
-	mld_capa_ops &= ~cpu_to_le16(IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_SUPP);
 	skb_put_data(skb, &mld_capa_ops, sizeof(mld_capa_ops));
 
 	for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) {
@@ -5890,6 +5888,56 @@ static void ieee80211_ml_reconfiguration(struct ieee80211_sub_if_data *sdata,
 				 TU_TO_JIFFIES(delay));
 }
 
+static int ieee80211_ttlm_set_links(struct ieee80211_sub_if_data *sdata,
+				    u16 active_links, u16 dormant_links,
+				    u16 suspended_links)
+{
+	u64 changed = 0;
+	int ret;
+
+	if (!active_links) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* If there is an active negotiated TTLM, it should be discarded by
+	 * the new negotiated/advertised TTLM.
+	 */
+	if (sdata->vif.neg_ttlm.valid) {
+		memset(&sdata->vif.neg_ttlm, 0, sizeof(sdata->vif.neg_ttlm));
+		sdata->vif.suspended_links = 0;
+		changed = BSS_CHANGED_MLD_TTLM;
+	}
+
+	if (sdata->vif.active_links != active_links) {
+		ret = ieee80211_set_active_links(&sdata->vif, active_links);
+		if (ret) {
+			sdata_info(sdata, "Failed to set TTLM active links\n");
+			goto out;
+		}
+	}
+
+	ret = ieee80211_vif_set_links(sdata, sdata->vif.valid_links,
+				      dormant_links);
+	if (ret) {
+		sdata_info(sdata, "Failed to set TTLM dormant links\n");
+		goto out;
+	}
+
+	changed |= BSS_CHANGED_MLD_VALID_LINKS;
+	sdata->vif.suspended_links = suspended_links;
+	if (sdata->vif.suspended_links)
+		changed |= BSS_CHANGED_MLD_TTLM;
+
+	ieee80211_vif_cfg_change_notify(sdata, changed);
+
+out:
+	if (ret)
+		ieee80211_disconnect(&sdata->vif, false);
+
+	return ret;
+}
+
 static void ieee80211_tid_to_link_map_work(struct wiphy *wiphy,
 					   struct wiphy_work *work)
 {
@@ -5897,30 +5945,19 @@ static void ieee80211_tid_to_link_map_work(struct wiphy *wiphy,
 	struct ieee80211_sub_if_data *sdata =
 		container_of(work, struct ieee80211_sub_if_data,
 			     u.mgd.ttlm_work.work);
-	int ret;
 
 	new_active_links = sdata->u.mgd.ttlm_info.map &
 			   sdata->vif.valid_links;
 	new_dormant_links = ~sdata->u.mgd.ttlm_info.map &
 			    sdata->vif.valid_links;
-	if (!new_active_links) {
-		ieee80211_disconnect(&sdata->vif, false);
-		return;
-	}
 
 	ieee80211_vif_set_links(sdata, sdata->vif.valid_links, 0);
-	new_active_links = BIT(ffs(new_active_links) - 1);
-	ieee80211_set_active_links(&sdata->vif, new_active_links);
-
-	ret = ieee80211_vif_set_links(sdata, sdata->vif.valid_links,
-				      new_dormant_links);
+	if (ieee80211_ttlm_set_links(sdata, new_active_links, new_dormant_links,
+				     0))
+		return;
 
 	sdata->u.mgd.ttlm_info.active = true;
 	sdata->u.mgd.ttlm_info.switch_time = 0;
-
-	if (!ret)
-		ieee80211_vif_cfg_change_notify(sdata,
-						BSS_CHANGED_MLD_VALID_LINKS);
 }
 
 static u16 ieee80211_get_ttlm(u8 bm_size, u8 *data)
@@ -6437,6 +6474,272 @@ free:
 	kfree(elems);
 }
 
+static void ieee80211_apply_neg_ttlm(struct ieee80211_sub_if_data *sdata,
+				     struct ieee80211_neg_ttlm neg_ttlm)
+{
+	u16 new_active_links, new_dormant_links, new_suspended_links, map = 0;
+	u8 i;
+
+	for (i = 0; i < IEEE80211_TTLM_NUM_TIDS; i++)
+		map |= neg_ttlm.downlink[i] | neg_ttlm.uplink[i];
+
+	/* If there is an active TTLM, unset previously suspended links */
+	if (sdata->vif.neg_ttlm.valid)
+		sdata->vif.dormant_links &= ~sdata->vif.suspended_links;
+
+	/* exclude links that are already disabled by advertised TTLM */
+	new_active_links =
+		map & sdata->vif.valid_links & ~sdata->vif.dormant_links;
+	new_suspended_links =
+		(~map & sdata->vif.valid_links) & ~sdata->vif.dormant_links;
+	new_dormant_links = sdata->vif.dormant_links | new_suspended_links;
+	if (ieee80211_ttlm_set_links(sdata, new_active_links,
+				     new_dormant_links, new_suspended_links))
+		return;
+
+	sdata->vif.neg_ttlm = neg_ttlm;
+	sdata->vif.neg_ttlm.valid = true;
+}
+
+static void
+ieee80211_neg_ttlm_add_suggested_map(struct sk_buff *skb,
+				     struct ieee80211_neg_ttlm *neg_ttlm)
+{
+	u8 i, direction[IEEE80211_TTLM_MAX_CNT];
+
+	if (memcmp(neg_ttlm->downlink, neg_ttlm->uplink,
+		   sizeof(neg_ttlm->downlink))) {
+		direction[0] = IEEE80211_TTLM_DIRECTION_DOWN;
+		direction[1] = IEEE80211_TTLM_DIRECTION_UP;
+	} else {
+		direction[0] = IEEE80211_TTLM_DIRECTION_BOTH;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(direction); i++) {
+		u8 tid, len, map_ind = 0, *len_pos, *map_ind_pos, *pos;
+		__le16 map;
+
+		len = sizeof(struct ieee80211_ttlm_elem) + 1 + 1;
+
+		pos = skb_put(skb, len + 2);
+		*pos++ = WLAN_EID_EXTENSION;
+		len_pos = pos++;
+		*pos++ = WLAN_EID_EXT_TID_TO_LINK_MAPPING;
+		*pos++ = direction[i];
+		map_ind_pos = pos++;
+		for (tid = 0; tid < IEEE80211_TTLM_NUM_TIDS; tid++) {
+			map = direction[i] == IEEE80211_TTLM_DIRECTION_UP ?
+				cpu_to_le16(neg_ttlm->uplink[tid]) :
+				cpu_to_le16(neg_ttlm->downlink[tid]);
+			if (!map)
+				continue;
+
+			len += 2;
+			map_ind |= BIT(tid);
+			skb_put_data(skb, &map, sizeof(map));
+		}
+
+		*map_ind_pos = map_ind;
+		*len_pos = len;
+
+		if (direction[i] == IEEE80211_TTLM_DIRECTION_BOTH)
+			break;
+	}
+}
+
+static void
+ieee80211_send_neg_ttlm_res(struct ieee80211_sub_if_data *sdata,
+			    enum ieee80211_neg_ttlm_res ttlm_res,
+			    u8 dialog_token,
+			    struct ieee80211_neg_ttlm *neg_ttlm)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_mgmt *mgmt;
+	struct sk_buff *skb;
+	int hdr_len = offsetofend(struct ieee80211_mgmt, u.action.u.ttlm_res);
+	int ttlm_max_len = 2 + 1 + sizeof(struct ieee80211_ttlm_elem) + 1 +
+		2 * 2 * IEEE80211_TTLM_NUM_TIDS;
+
+	skb = dev_alloc_skb(local->tx_headroom + hdr_len + ttlm_max_len);
+	if (!skb)
+		return;
+
+	skb_reserve(skb, local->tx_headroom);
+	mgmt = skb_put_zero(skb, hdr_len);
+	mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+					  IEEE80211_STYPE_ACTION);
+	memcpy(mgmt->da, sdata->vif.cfg.ap_addr, ETH_ALEN);
+	memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN);
+	memcpy(mgmt->bssid, sdata->vif.cfg.ap_addr, ETH_ALEN);
+
+	mgmt->u.action.category = WLAN_CATEGORY_PROTECTED_EHT;
+	mgmt->u.action.u.ttlm_res.action_code =
+		WLAN_PROTECTED_EHT_ACTION_TTLM_RES;
+	mgmt->u.action.u.ttlm_res.dialog_token = dialog_token;
+	switch (ttlm_res) {
+	default:
+		WARN_ON(1);
+		fallthrough;
+	case NEG_TTLM_RES_REJECT:
+		mgmt->u.action.u.ttlm_res.status_code =
+			WLAN_STATUS_DENIED_TID_TO_LINK_MAPPING;
+		break;
+	case NEG_TTLM_RES_ACCEPT:
+		mgmt->u.action.u.ttlm_res.status_code = WLAN_STATUS_SUCCESS;
+		break;
+	case NEG_TTLM_RES_SUGGEST_PREFERRED:
+		mgmt->u.action.u.ttlm_res.status_code =
+			WLAN_STATUS_PREF_TID_TO_LINK_MAPPING_SUGGESTED;
+		ieee80211_neg_ttlm_add_suggested_map(skb, neg_ttlm);
+		break;
+	}
+
+	ieee80211_tx_skb(sdata, skb);
+}
+
+static int
+ieee80211_parse_neg_ttlm(struct ieee80211_sub_if_data *sdata,
+			 const struct ieee80211_ttlm_elem *ttlm,
+			 struct ieee80211_neg_ttlm *neg_ttlm,
+			 u8 *direction)
+{
+	u8 control, link_map_presence, map_size, tid;
+	u8 *pos;
+
+	/* The element size was already validated in
+	 * ieee80211_tid_to_link_map_size_ok()
+	 */
+	pos = (void *)ttlm->optional;
+
+	control = ttlm->control;
+
+	/* mapping switch time and expected duration fields are not expected
+	 * in case of negotiated TTLM
+	 */
+	if (control & (IEEE80211_TTLM_CONTROL_SWITCH_TIME_PRESENT |
+		       IEEE80211_TTLM_CONTROL_EXPECTED_DUR_PRESENT)) {
+		mlme_dbg(sdata,
+			 "Invalid TTLM element in negotiated TTLM request\n");
+		return -EINVAL;
+	}
+
+	if (control & IEEE80211_TTLM_CONTROL_DEF_LINK_MAP) {
+		for (tid = 0; tid < IEEE80211_TTLM_NUM_TIDS; tid++) {
+			neg_ttlm->downlink[tid] = sdata->vif.valid_links;
+			neg_ttlm->uplink[tid] = sdata->vif.valid_links;
+		}
+		*direction = IEEE80211_TTLM_DIRECTION_BOTH;
+		return 0;
+	}
+
+	*direction = u8_get_bits(control, IEEE80211_TTLM_CONTROL_DIRECTION);
+	if (*direction != IEEE80211_TTLM_DIRECTION_DOWN &&
+	    *direction != IEEE80211_TTLM_DIRECTION_UP &&
+	    *direction != IEEE80211_TTLM_DIRECTION_BOTH)
+		return -EINVAL;
+
+	link_map_presence = *pos;
+	pos++;
+
+	if (control & IEEE80211_TTLM_CONTROL_LINK_MAP_SIZE)
+		map_size = 1;
+	else
+		map_size = 2;
+
+	for (tid = 0; tid < IEEE80211_TTLM_NUM_TIDS; tid++) {
+		u16 map;
+
+		if (link_map_presence & BIT(tid)) {
+			map = ieee80211_get_ttlm(map_size, pos);
+			if (!map) {
+				mlme_dbg(sdata,
+					 "No active links for TID %d", tid);
+				return -EINVAL;
+			}
+		} else {
+			map = 0;
+		}
+
+		switch (*direction) {
+		case IEEE80211_TTLM_DIRECTION_BOTH:
+			neg_ttlm->downlink[tid] = map;
+			neg_ttlm->uplink[tid] = map;
+			break;
+		case IEEE80211_TTLM_DIRECTION_DOWN:
+			neg_ttlm->downlink[tid] = map;
+			break;
+		case IEEE80211_TTLM_DIRECTION_UP:
+			neg_ttlm->uplink[tid] = map;
+			break;
+		default:
+			return -EINVAL;
+		}
+		pos += map_size;
+	}
+	return 0;
+}
+
+void ieee80211_process_neg_ttlm_req(struct ieee80211_sub_if_data *sdata,
+				    struct ieee80211_mgmt *mgmt, size_t len)
+{
+	u8 dialog_token, direction[IEEE80211_TTLM_MAX_CNT] = {}, i;
+	size_t ies_len;
+	enum ieee80211_neg_ttlm_res ttlm_res = NEG_TTLM_RES_ACCEPT;
+	struct ieee802_11_elems *elems = NULL;
+	struct ieee80211_neg_ttlm neg_ttlm = {};
+
+	BUILD_BUG_ON(ARRAY_SIZE(direction) != ARRAY_SIZE(elems->ttlm));
+
+	if (!ieee80211_vif_is_mld(&sdata->vif))
+		return;
+
+	dialog_token = mgmt->u.action.u.ttlm_req.dialog_token;
+	ies_len  = len - offsetof(struct ieee80211_mgmt,
+				  u.action.u.ttlm_req.variable);
+	elems = ieee802_11_parse_elems(mgmt->u.action.u.ttlm_req.variable,
+				       ies_len, true, NULL);
+	if (!elems) {
+		ttlm_res = NEG_TTLM_RES_REJECT;
+		goto out;
+	}
+
+	for (i = 0; i < elems->ttlm_num; i++) {
+		if (ieee80211_parse_neg_ttlm(sdata, elems->ttlm[i],
+					     &neg_ttlm, &direction[i]) ||
+		    (direction[i] == IEEE80211_TTLM_DIRECTION_BOTH &&
+		     elems->ttlm_num != 1)) {
+			ttlm_res = NEG_TTLM_RES_REJECT;
+			goto out;
+		}
+	}
+
+	if (!elems->ttlm_num ||
+	    (elems->ttlm_num == 2 && direction[0] == direction[1])) {
+		ttlm_res = NEG_TTLM_RES_REJECT;
+		goto out;
+	}
+
+	for (i = 0; i < IEEE80211_TTLM_NUM_TIDS; i++) {
+		if ((neg_ttlm.downlink[i] &&
+		     (neg_ttlm.downlink[i] & ~sdata->vif.valid_links)) ||
+		    (neg_ttlm.uplink[i] &&
+		     (neg_ttlm.uplink[i] & ~sdata->vif.valid_links))) {
+			ttlm_res = NEG_TTLM_RES_REJECT;
+			goto out;
+		}
+	}
+
+	ttlm_res = drv_can_neg_ttlm(sdata->local, sdata, &neg_ttlm);
+
+	if (ttlm_res != NEG_TTLM_RES_ACCEPT)
+		goto out;
+
+	ieee80211_apply_neg_ttlm(sdata, neg_ttlm);
+out:
+	kfree(elems);
+	ieee80211_send_neg_ttlm_res(sdata, ttlm_res, dialog_token, &neg_ttlm);
+}
+
 void ieee80211_sta_rx_queued_ext(struct ieee80211_sub_if_data *sdata,
 				 struct sk_buff *skb)
 {
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 0bf72928ccfc..75eb3e55eaec 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -3763,6 +3763,20 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
 			break;
 		}
 		break;
+	case WLAN_CATEGORY_PROTECTED_EHT:
+		switch (mgmt->u.action.u.ttlm_req.action_code) {
+		case WLAN_PROTECTED_EHT_ACTION_TTLM_REQ:
+			if (sdata->vif.type != NL80211_IFTYPE_STATION)
+				break;
+
+			if (len < offsetofend(typeof(*mgmt),
+					      u.action.u.ttlm_req))
+				goto invalid;
+			goto queue;
+		default:
+			break;
+		}
+		break;
 	}
 
 	return RX_CONTINUE;
diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h
index 06835ed4c44f..1c0c46b11c6d 100644
--- a/net/mac80211/trace.h
+++ b/net/mac80211/trace.h
@@ -3088,6 +3088,58 @@ TRACE_EVENT(stop_queue,
 	)
 );
 
+TRACE_EVENT(drv_can_neg_ttlm,
+	    TP_PROTO(struct ieee80211_local *local,
+		     struct ieee80211_sub_if_data *sdata,
+		     struct ieee80211_neg_ttlm *neg_ttlm),
+
+	TP_ARGS(local, sdata, neg_ttlm),
+
+	TP_STRUCT__entry(LOCAL_ENTRY
+			 VIF_ENTRY
+			 __array(u16, downlink, sizeof(u16) * 8)
+			 __array(u16, uplink, sizeof(u16) * 8)
+	),
+
+	TP_fast_assign(LOCAL_ASSIGN;
+		       VIF_ASSIGN;
+		       memcpy(__entry->downlink, neg_ttlm->downlink,
+			      sizeof(neg_ttlm->downlink));
+		       memcpy(__entry->uplink, neg_ttlm->uplink,
+			      sizeof(neg_ttlm->uplink));
+	),
+
+	TP_printk(LOCAL_PR_FMT ", " VIF_PR_FMT, LOCAL_PR_ARG, VIF_PR_ARG)
+);
+
+TRACE_EVENT(drv_neg_ttlm_res,
+	    TP_PROTO(struct ieee80211_local *local,
+		     struct ieee80211_sub_if_data *sdata,
+		     enum ieee80211_neg_ttlm_res res,
+		     struct ieee80211_neg_ttlm *neg_ttlm),
+
+	TP_ARGS(local, sdata, res, neg_ttlm),
+
+	TP_STRUCT__entry(LOCAL_ENTRY
+			 VIF_ENTRY
+			 __field(u32, res)
+			 __array(u16, downlink, sizeof(u16) * 8)
+			 __array(u16, uplink, sizeof(u16) * 8)
+	),
+
+	TP_fast_assign(LOCAL_ASSIGN;
+		       VIF_ASSIGN;
+		       __entry->res = res;
+		       memcpy(__entry->downlink, neg_ttlm->downlink,
+			      sizeof(neg_ttlm->downlink));
+		       memcpy(__entry->uplink, neg_ttlm->uplink,
+			      sizeof(neg_ttlm->uplink));
+	),
+
+	TP_printk(LOCAL_PR_FMT  VIF_PR_FMT " response: %d\n ",
+		  LOCAL_PR_ARG, VIF_PR_ARG, __entry->res
+	)
+);
 #endif /* !__MAC80211_DRIVER_TRACE || TRACE_HEADER_MULTI_READ */
 
 #undef TRACE_INCLUDE_PATH
-- 
cgit v1.2.3


From f7660b3f584aadd25dde18aa1902488577a15863 Mon Sep 17 00:00:00 2001
From: Ayala Beker <ayala.beker@intel.com>
Date: Tue, 2 Jan 2024 21:35:37 +0200
Subject: wifi: mac80211: add support for negotiated TTLM request

Update neg_ttlm and active_links according to the new mapping,
and send a negotiated TID-to-link map request with the new mapping.

Signed-off-by: Ayala Beker <ayala.beker@intel.com>
Reviewed-by: Gregory Greenman <gregory.greenman@intel.com>
Reviewed-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://msgid.link/20240102213313.eeb385d771df.I2a5441c14421de884dbd93d1624ce7bb2c944833@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h  |  37 +++++++++++++++
 include/net/mac80211.h     |   7 ++-
 net/mac80211/cfg.c         |  12 +++++
 net/mac80211/ieee80211_i.h |   8 ++++
 net/mac80211/iface.c       |   4 ++
 net/mac80211/mlme.c        | 114 +++++++++++++++++++++++++++++++++++++++++++++
 net/mac80211/rx.c          |   8 ++++
 7 files changed, 188 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index eafa70e5ba94..f0c068446c79 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -5024,6 +5024,43 @@ static inline u16 ieee80211_mle_get_eml_cap(const u8 *data)
 	return get_unaligned_le16(common);
 }
 
+/**
+ * ieee80211_mle_get_mld_capa_op - returns the MLD capabilities and operations.
+ * @data: pointer to the multi link EHT IE
+ *
+ * The element is assumed to be of the correct type (BASIC) and big enough,
+ * this must be checked using ieee80211_mle_type_ok().
+ *
+ * If the MLD capabilities and operations field is not present, 0 will be
+ * returned.
+ */
+static inline u16 ieee80211_mle_get_mld_capa_op(const u8 *data)
+{
+	const struct ieee80211_multi_link_elem *mle = (const void *)data;
+	u16 control = le16_to_cpu(mle->control);
+	const u8 *common = mle->variable;
+
+	/*
+	 * common points now at the beginning of
+	 * ieee80211_mle_basic_common_info
+	 */
+	common += sizeof(struct ieee80211_mle_basic_common_info);
+
+	if (!(control & IEEE80211_MLC_BASIC_PRES_MLD_CAPA_OP))
+		return 0;
+
+	if (control & IEEE80211_MLC_BASIC_PRES_LINK_ID)
+		common += 1;
+	if (control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT)
+		common += 1;
+	if (control & IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY)
+		common += 2;
+	if (control & IEEE80211_MLC_BASIC_PRES_EML_CAPA)
+		common += 2;
+
+	return get_unaligned_le16(common);
+}
+
 /**
  * ieee80211_mle_size_ok - validate multi-link element size
  * @data: pointer to the element data
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 6490b92d5cc1..84cc66dd93c1 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1810,9 +1810,11 @@ enum ieee80211_offload_flags {
  * @ps: power-save mode (STA only). This flag is NOT affected by
  *	offchannel/dynamic_ps operations.
  * @aid: association ID number, valid only when @assoc is true
- * @eml_cap: EML capabilities as described in P802.11be_D2.2 Figure 9-1002k.
+ * @eml_cap: EML capabilities as described in P802.11be_D4.1 Figure 9-1001j.
  * @eml_med_sync_delay: Medium Synchronization delay as described in
- *	P802.11be_D2.2 Figure 9-1002j.
+ *	P802.11be_D4.1 Figure 9-1001i.
+ * @mld_capa_op: MLD Capabilities and Operations per P802.11be_D4.1
+ *	Figure 9-1001k
  * @arp_addr_list: List of IPv4 addresses for hardware ARP filtering. The
  *	may filter ARP queries targeted for other addresses than listed here.
  *	The driver must allow ARP queries targeted for all address listed here
@@ -1837,6 +1839,7 @@ struct ieee80211_vif_cfg {
 	u16 aid;
 	u16 eml_cap;
 	u16 eml_med_sync_delay;
+	u16 mld_capa_op;
 
 	__be32 arp_addr_list[IEEE80211_BSS_ARP_ADDR_LIST_LEN];
 	int arp_addr_cnt;
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 489dd97f5172..1e8da6372da2 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -4966,6 +4966,17 @@ static int ieee80211_set_hw_timestamp(struct wiphy *wiphy,
 	return local->ops->set_hw_timestamp(&local->hw, &sdata->vif, hwts);
 }
 
+static int
+ieee80211_set_ttlm(struct wiphy *wiphy, struct net_device *dev,
+		   struct cfg80211_ttlm_params *params)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+
+	lockdep_assert_wiphy(sdata->local->hw.wiphy);
+
+	return ieee80211_req_neg_ttlm(sdata, params);
+}
+
 const struct cfg80211_ops mac80211_config_ops = {
 	.add_virtual_intf = ieee80211_add_iface,
 	.del_virtual_intf = ieee80211_del_iface,
@@ -5078,4 +5089,5 @@ const struct cfg80211_ops mac80211_config_ops = {
 	.mod_link_station = ieee80211_mod_link_station,
 	.del_link_station = ieee80211_del_link_station,
 	.set_hw_timestamp = ieee80211_set_hw_timestamp,
+	.set_ttlm = ieee80211_set_ttlm,
 };
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 38755f6f6fa0..79a5067ce82b 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -579,6 +579,10 @@ struct ieee80211_if_managed {
 	/* TID-to-link mapping support */
 	struct wiphy_delayed_work ttlm_work;
 	struct ieee80211_adv_ttlm_info ttlm_info;
+
+	/* dialog token enumerator for neg TTLM request */
+	u8 dialog_token_alloc;
+	struct wiphy_delayed_work neg_ttlm_timeout_work;
 };
 
 struct ieee80211_if_ibss {
@@ -2605,6 +2609,10 @@ ieee80211_eht_cap_ie_to_sta_eht_cap(struct ieee80211_sub_if_data *sdata,
 				    struct link_sta_info *link_sta);
 void ieee80211_process_neg_ttlm_req(struct ieee80211_sub_if_data *sdata,
 				    struct ieee80211_mgmt *mgmt, size_t len);
+void ieee80211_process_neg_ttlm_res(struct ieee80211_sub_if_data *sdata,
+				    struct ieee80211_mgmt *mgmt, size_t len);
+int ieee80211_req_neg_ttlm(struct ieee80211_sub_if_data *sdata,
+			   struct cfg80211_ttlm_params *params);
 
 void ieee80211_check_wbrf_support(struct ieee80211_local *local);
 void ieee80211_add_wbrf(struct ieee80211_local *local, struct cfg80211_chan_def *chandef);
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 4a87d2d336ae..df314222c2c9 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -1554,6 +1554,10 @@ static void ieee80211_iface_process_skb(struct ieee80211_local *local,
 				ieee80211_process_neg_ttlm_req(sdata, mgmt,
 							       skb->len);
 				break;
+			case WLAN_PROTECTED_EHT_ACTION_TTLM_RES:
+				ieee80211_process_neg_ttlm_res(sdata, mgmt,
+							       skb->len);
+				break;
 			default:
 				break;
 			}
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index b56f7de3b53f..5279af09fd53 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -46,6 +46,8 @@
 #define IEEE80211_ADV_TTLM_SAFETY_BUFFER_MS msecs_to_jiffies(100)
 #define IEEE80211_ADV_TTLM_ST_UNDERFLOW 0xff00
 
+#define IEEE80211_NEG_TTLM_REQ_TIMEOUT (HZ / 5)
+
 static int max_nullfunc_tries = 2;
 module_param(max_nullfunc_tries, int, 0644);
 MODULE_PARM_DESC(max_nullfunc_tries,
@@ -3087,6 +3089,8 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
 	memset(&sdata->u.mgd.ttlm_info, 0,
 	       sizeof(sdata->u.mgd.ttlm_info));
 	wiphy_delayed_work_cancel(sdata->local->hw.wiphy, &ifmgd->ttlm_work);
+	wiphy_delayed_work_cancel(sdata->local->hw.wiphy,
+				  &ifmgd->neg_ttlm_timeout_work);
 	ieee80211_vif_set_links(sdata, 0, 0);
 }
 
@@ -4984,6 +4988,8 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata,
 				ieee80211_mle_get_eml_cap(eht_ml_elem->data + 1);
 			sdata->vif.cfg.eml_med_sync_delay =
 				ieee80211_mle_get_eml_med_sync_delay(eht_ml_elem->data + 1);
+			sdata->vif.cfg.mld_capa_op =
+				ieee80211_mle_get_mld_capa_op(eht_ml_elem->data + 1);
 		}
 	}
 
@@ -6501,6 +6507,19 @@ static void ieee80211_apply_neg_ttlm(struct ieee80211_sub_if_data *sdata,
 	sdata->vif.neg_ttlm.valid = true;
 }
 
+static void ieee80211_neg_ttlm_timeout_work(struct wiphy *wiphy,
+					    struct wiphy_work *work)
+{
+	struct ieee80211_sub_if_data *sdata =
+		container_of(work, struct ieee80211_sub_if_data,
+			     u.mgd.neg_ttlm_timeout_work.work);
+
+	sdata_info(sdata,
+		   "No negotiated TTLM response from AP, disconnecting.\n");
+
+	__ieee80211_disconnect(sdata);
+}
+
 static void
 ieee80211_neg_ttlm_add_suggested_map(struct sk_buff *skb,
 				     struct ieee80211_neg_ttlm *neg_ttlm)
@@ -6547,6 +6566,74 @@ ieee80211_neg_ttlm_add_suggested_map(struct sk_buff *skb,
 	}
 }
 
+static void
+ieee80211_send_neg_ttlm_req(struct ieee80211_sub_if_data *sdata,
+			    struct ieee80211_neg_ttlm *neg_ttlm,
+			    u8 dialog_token)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_mgmt *mgmt;
+	struct sk_buff *skb;
+	int hdr_len = offsetofend(struct ieee80211_mgmt, u.action.u.ttlm_req);
+	int ttlm_max_len = 2 + 1 + sizeof(struct ieee80211_ttlm_elem) + 1 +
+		2 * 2 * IEEE80211_TTLM_NUM_TIDS;
+
+	skb = dev_alloc_skb(local->tx_headroom + hdr_len + ttlm_max_len);
+	if (!skb)
+		return;
+
+	skb_reserve(skb, local->tx_headroom);
+	mgmt = skb_put_zero(skb, hdr_len);
+	mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+					  IEEE80211_STYPE_ACTION);
+	memcpy(mgmt->da, sdata->vif.cfg.ap_addr, ETH_ALEN);
+	memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN);
+	memcpy(mgmt->bssid, sdata->vif.cfg.ap_addr, ETH_ALEN);
+
+	mgmt->u.action.category = WLAN_CATEGORY_PROTECTED_EHT;
+	mgmt->u.action.u.ttlm_req.action_code =
+		WLAN_PROTECTED_EHT_ACTION_TTLM_REQ;
+	mgmt->u.action.u.ttlm_req.dialog_token = dialog_token;
+	ieee80211_neg_ttlm_add_suggested_map(skb, neg_ttlm);
+	ieee80211_tx_skb(sdata, skb);
+}
+
+int ieee80211_req_neg_ttlm(struct ieee80211_sub_if_data *sdata,
+			   struct cfg80211_ttlm_params *params)
+{
+	struct ieee80211_neg_ttlm neg_ttlm = {};
+	u8 i;
+
+	if (!ieee80211_vif_is_mld(&sdata->vif) ||
+	    !(sdata->vif.cfg.mld_capa_op &
+	      IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_SUPP))
+		return -EINVAL;
+
+	for (i = 0; i < IEEE80211_TTLM_NUM_TIDS; i++) {
+		if ((params->dlink[i] & ~sdata->vif.valid_links) ||
+		    (params->ulink[i] & ~sdata->vif.valid_links))
+			return -EINVAL;
+
+		neg_ttlm.downlink[i] = params->dlink[i];
+		neg_ttlm.uplink[i] = params->ulink[i];
+	}
+
+	if (drv_can_neg_ttlm(sdata->local, sdata, &neg_ttlm) !=
+	    NEG_TTLM_RES_ACCEPT)
+		return -EINVAL;
+
+	ieee80211_apply_neg_ttlm(sdata, neg_ttlm);
+	sdata->u.mgd.dialog_token_alloc++;
+	ieee80211_send_neg_ttlm_req(sdata, &sdata->vif.neg_ttlm,
+				    sdata->u.mgd.dialog_token_alloc);
+	wiphy_delayed_work_cancel(sdata->local->hw.wiphy,
+				  &sdata->u.mgd.neg_ttlm_timeout_work);
+	wiphy_delayed_work_queue(sdata->local->hw.wiphy,
+				 &sdata->u.mgd.neg_ttlm_timeout_work,
+				 IEEE80211_NEG_TTLM_REQ_TIMEOUT);
+	return 0;
+}
+
 static void
 ieee80211_send_neg_ttlm_res(struct ieee80211_sub_if_data *sdata,
 			    enum ieee80211_neg_ttlm_res ttlm_res,
@@ -6740,6 +6827,29 @@ out:
 	ieee80211_send_neg_ttlm_res(sdata, ttlm_res, dialog_token, &neg_ttlm);
 }
 
+void ieee80211_process_neg_ttlm_res(struct ieee80211_sub_if_data *sdata,
+				    struct ieee80211_mgmt *mgmt, size_t len)
+{
+	if (!ieee80211_vif_is_mld(&sdata->vif) ||
+	    mgmt->u.action.u.ttlm_req.dialog_token !=
+	    sdata->u.mgd.dialog_token_alloc)
+		return;
+
+	wiphy_delayed_work_cancel(sdata->local->hw.wiphy,
+				  &sdata->u.mgd.neg_ttlm_timeout_work);
+
+	/* MLD station sends a TID to link mapping request, mainly to handle
+	 * BTM (BSS transition management) request, in which case it needs to
+	 * restrict the active links set.
+	 * In this case it's not expected that the MLD AP will reject the
+	 * negotiated TTLM request.
+	 * This can be better implemented in the future, to handle request
+	 * rejections.
+	 */
+	if (mgmt->u.action.u.ttlm_res.status_code != WLAN_STATUS_SUCCESS)
+		__ieee80211_disconnect(sdata);
+}
+
 void ieee80211_sta_rx_queued_ext(struct ieee80211_sub_if_data *sdata,
 				 struct sk_buff *skb)
 {
@@ -7369,6 +7479,8 @@ void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata)
 				ieee80211_sta_handle_tspec_ac_params_wk);
 	wiphy_delayed_work_init(&ifmgd->ttlm_work,
 				ieee80211_tid_to_link_map_work);
+	wiphy_delayed_work_init(&ifmgd->neg_ttlm_timeout_work,
+				ieee80211_neg_ttlm_timeout_work);
 
 	ifmgd->flags = 0;
 	ifmgd->powersave = sdata->wdev.ps;
@@ -8459,6 +8571,8 @@ void ieee80211_mgd_stop(struct ieee80211_sub_if_data *sdata)
 	wiphy_delayed_work_cancel(sdata->local->hw.wiphy,
 				  &ifmgd->ml_reconf_work);
 	wiphy_delayed_work_cancel(sdata->local->hw.wiphy, &ifmgd->ttlm_work);
+	wiphy_delayed_work_cancel(sdata->local->hw.wiphy,
+				  &ifmgd->neg_ttlm_timeout_work);
 
 	if (ifmgd->assoc_data)
 		ieee80211_destroy_assoc_data(sdata, ASSOC_TIMEOUT);
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 75eb3e55eaec..615795c4b052 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -3773,6 +3773,14 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
 					      u.action.u.ttlm_req))
 				goto invalid;
 			goto queue;
+		case WLAN_PROTECTED_EHT_ACTION_TTLM_RES:
+			if (sdata->vif.type != NL80211_IFTYPE_STATION)
+				break;
+
+			if (len < offsetofend(typeof(*mgmt),
+					      u.action.u.ttlm_res))
+				goto invalid;
+			goto queue;
 		default:
 			break;
 		}
-- 
cgit v1.2.3


From 2518e89d5b1913c360f8e4cd9fc6eda6146b8800 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 2 Jan 2024 21:35:39 +0200
Subject: wifi: cfg80211: add support for SPP A-MSDUs

Add SPP (signaling and payload protected) AMSDU support.

Since userspace has to build the RSNX element, add an extended
feature flag to indicate that this is supported.

In order to avoid downgrade/mismatch attacks, add a flag to the assoc
command on the station side, so that we can be sure that the value of
the flag comes from the same RSNX element that will be validated by
the supplicant against the 4-way-handshake. If we just pulled the
data out of a beacon/probe response, we could theoretically look an
RSNX element from a different frame, with a different value for this
flag, than the supplicant is using to validate in the
4-way-handshake.

Note that this patch is only geared towards software crypto
implementations or hardware ones that can perfectly implement SPP
A-MSDUs, i.e. are able to switch the AAD construction on the fly for
each TX/RX frame.

For more limited hardware implementations, more capability
advertisement  would be required, e.g. if the hardware has no way
to switch this on the fly but has only a global configuration that
must apply to all stations.

The driver could of course *reject* mismatches, but the supplicant
must know so it can do things like not negotiating SPP A-MSDUs on
a T-DLS link when connected to an AP that doesn't support it, or
similar.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Daniel Gabay <daniel.gabay@intel.com>
Reviewed-by: Gregory Greenman <gregory.greenman@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://msgid.link/20240102213313.fadac8df7030.I9240aebcba1be49636a73c647ed0af862713fc6f@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       |  2 ++
 include/uapi/linux/nl80211.h | 14 ++++++++++++++
 net/wireless/nl80211.c       | 24 +++++++++++++++++++++---
 3 files changed, 37 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index cf79656ce09c..56bce924bec6 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -3059,6 +3059,7 @@ struct cfg80211_assoc_link {
  * @CONNECT_REQ_MLO_SUPPORT: Userspace indicates support for handling MLD links.
  *	Drivers shall disable MLO features for the current association if this
  *	flag is not set.
+ * @ASSOC_REQ_SPP_AMSDU: SPP A-MSDUs will be used on this connection (if any)
  */
 enum cfg80211_assoc_req_flags {
 	ASSOC_REQ_DISABLE_HT			= BIT(0),
@@ -3068,6 +3069,7 @@ enum cfg80211_assoc_req_flags {
 	ASSOC_REQ_DISABLE_HE			= BIT(4),
 	ASSOC_REQ_DISABLE_EHT			= BIT(5),
 	CONNECT_REQ_MLO_SUPPORT			= BIT(6),
+	ASSOC_REQ_SPP_AMSDU			= BIT(7),
 };
 
 /**
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 1ccdcae24372..3e239df3528f 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -2851,6 +2851,10 @@ enum nl80211_commands {
  *      mapping is as defined in section 9.4.2.314 (TID-To-Link Mapping element)
  *      in Draft P802.11be_D4.0.
  *
+ * @NL80211_ATTR_ASSOC_SPP_AMSDU: flag attribute used with
+ *	%NL80211_CMD_ASSOCIATE indicating the SPP A-MSDUs
+ *	are used on this connection
+ *
  * @NUM_NL80211_ATTR: total number of nl80211_attrs available
  * @NL80211_ATTR_MAX: highest attribute number currently defined
  * @__NL80211_ATTR_AFTER_LAST: internal use
@@ -3394,6 +3398,8 @@ enum nl80211_attrs {
 	NL80211_ATTR_MLO_TTLM_DLINK,
 	NL80211_ATTR_MLO_TTLM_ULINK,
 
+	NL80211_ATTR_ASSOC_SPP_AMSDU,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
@@ -3534,6 +3540,7 @@ enum nl80211_iftype {
  * @NL80211_STA_FLAG_ASSOCIATED: station is associated; used with drivers
  *	that support %NL80211_FEATURE_FULL_AP_CLIENT_STATE to transition a
  *	previously added station into associated state
+ * @NL80211_STA_FLAG_SPP_AMSDU: station supports SPP A-MSDUs
  * @NL80211_STA_FLAG_MAX: highest station flag number currently defined
  * @__NL80211_STA_FLAG_AFTER_LAST: internal use
  */
@@ -3546,6 +3553,7 @@ enum nl80211_sta_flags {
 	NL80211_STA_FLAG_AUTHENTICATED,
 	NL80211_STA_FLAG_TDLS_PEER,
 	NL80211_STA_FLAG_ASSOCIATED,
+	NL80211_STA_FLAG_SPP_AMSDU,
 
 	/* keep last */
 	__NL80211_STA_FLAG_AFTER_LAST,
@@ -6520,6 +6528,11 @@ enum nl80211_feature_flags {
  *	DFS master on the same channel as described in FCC-594280 D01
  *	(Section B.3). This, for example, allows P2P GO and P2P clients to
  *	operate on DFS channels as long as there's a concurrent BSS connection.
+ *
+ * @NL80211_EXT_FEATURE_SPP_AMSDU_SUPPORT: The driver has support for SPP
+ *	(signaling and payload protected) A-MSDUs and this shall be advertised
+ *	in the RSNXE.
+ *
  * @NUM_NL80211_EXT_FEATURES: number of extended features.
  * @MAX_NL80211_EXT_FEATURES: highest extended feature index.
  */
@@ -6594,6 +6607,7 @@ enum nl80211_ext_feature_index {
 	NL80211_EXT_FEATURE_OWE_OFFLOAD,
 	NL80211_EXT_FEATURE_OWE_OFFLOAD_AP,
 	NL80211_EXT_FEATURE_DFS_CONCURRENT,
+	NL80211_EXT_FEATURE_SPP_AMSDU_SUPPORT,
 
 	/* add new features before the definition below */
 	NUM_NL80211_EXT_FEATURES,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index b09700400d09..b267317aa33c 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -821,6 +821,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
 	[NL80211_ATTR_BSS_DUMP_INCLUDE_USE_DATA] = { .type = NLA_FLAG },
 	[NL80211_ATTR_MLO_TTLM_DLINK] = NLA_POLICY_EXACT_LEN(sizeof(u16) * 8),
 	[NL80211_ATTR_MLO_TTLM_ULINK] = NLA_POLICY_EXACT_LEN(sizeof(u16) * 8),
+	[NL80211_ATTR_ASSOC_SPP_AMSDU] = { .type = NLA_FLAG },
 };
 
 /* policy for the key attributes */
@@ -6874,7 +6875,7 @@ int cfg80211_check_station_change(struct wiphy *wiphy,
 		return -EINVAL;
 
 	/* When you run into this, adjust the code below for the new flag */
-	BUILD_BUG_ON(NL80211_STA_FLAG_MAX != 7);
+	BUILD_BUG_ON(NL80211_STA_FLAG_MAX != 8);
 
 	switch (statype) {
 	case CFG80211_STA_MESH_PEER_KERNEL:
@@ -6934,6 +6935,8 @@ int cfg80211_check_station_change(struct wiphy *wiphy,
 		    params->link_sta_params.he_capa ||
 		    params->link_sta_params.eht_capa)
 			return -EINVAL;
+		if (params->sta_flags_mask & BIT(NL80211_STA_FLAG_SPP_AMSDU))
+			return -EINVAL;
 	}
 
 	if (statype != CFG80211_STA_AP_CLIENT &&
@@ -6957,7 +6960,8 @@ int cfg80211_check_station_change(struct wiphy *wiphy,
 				  BIT(NL80211_STA_FLAG_ASSOCIATED) |
 				  BIT(NL80211_STA_FLAG_SHORT_PREAMBLE) |
 				  BIT(NL80211_STA_FLAG_WME) |
-				  BIT(NL80211_STA_FLAG_MFP)))
+				  BIT(NL80211_STA_FLAG_MFP) |
+				  BIT(NL80211_STA_FLAG_SPP_AMSDU)))
 			return -EINVAL;
 
 		/* but authenticated/associated only if driver handles it */
@@ -7516,7 +7520,7 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
 		return -EINVAL;
 
 	/* When you run into this, adjust the code below for the new flag */
-	BUILD_BUG_ON(NL80211_STA_FLAG_MAX != 7);
+	BUILD_BUG_ON(NL80211_STA_FLAG_MAX != 8);
 
 	switch (dev->ieee80211_ptr->iftype) {
 	case NL80211_IFTYPE_AP:
@@ -7540,6 +7544,11 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
 		    params.sta_flags_mask & auth_assoc)
 			return -EINVAL;
 
+		if (!wiphy_ext_feature_isset(&rdev->wiphy,
+					     NL80211_EXT_FEATURE_SPP_AMSDU_SUPPORT) &&
+		    params.sta_flags_mask & BIT(NL80211_STA_FLAG_SPP_AMSDU))
+			return -EINVAL;
+
 		/* Older userspace, or userspace wanting to be compatible with
 		 * !NL80211_FEATURE_FULL_AP_CLIENT_STATE, will not set the auth
 		 * and assoc flags in the mask, but assumes the station will be
@@ -11102,6 +11111,15 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info)
 		       sizeof(req.s1g_capa));
 	}
 
+	if (nla_get_flag(info->attrs[NL80211_ATTR_ASSOC_SPP_AMSDU])) {
+		if (!wiphy_ext_feature_isset(&rdev->wiphy,
+					     NL80211_EXT_FEATURE_SPP_AMSDU_SUPPORT)) {
+			GENL_SET_ERR_MSG(info, "SPP A-MSDUs not supported");
+			return -EINVAL;
+		}
+		req.flags |= ASSOC_REQ_SPP_AMSDU;
+	}
+
 	req.link_id = nl80211_link_id_or_invalid(info->attrs);
 
 	if (info->attrs[NL80211_ATTR_MLO_LINKS]) {
-- 
cgit v1.2.3


From 3b220ed8b2172fd8edb22a62a5c3a9a9c9f2b6bd Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 2 Jan 2024 21:35:40 +0200
Subject: wifi: mac80211: add support for SPP A-MSDUs

If software crypto is used, simply add support for SPP A-MSDUs
(and use it whenever enabled as required by the cfg80211 API).

If hardware crypto is used, leave it up to the driver to set
the NL80211_EXT_FEATURE_SPP_AMSDU_SUPPORT flag and then check
sta->spp_amsdu or the IEEE80211_KEY_FLAG_SPP_AMSDU key flag.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Daniel Gabay <daniel.gabay@intel.com>
Reviewed-by: Gregory Greenman <gregory.greenman@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://msgid.link/20240102213313.b8ada4514e2b.I1ac25d5f158165b5a88062a5a5e4c4fbeecf9a5d@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h     |  5 +++++
 net/mac80211/cfg.c         |  3 +++
 net/mac80211/ieee80211_i.h |  1 +
 net/mac80211/key.c         |  4 ++++
 net/mac80211/main.c        |  5 ++++-
 net/mac80211/mlme.c        |  4 ++++
 net/mac80211/wpa.c         | 33 +++++++++++++++++++++++----------
 7 files changed, 44 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 84cc66dd93c1..8d6ae22c09bf 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -2081,6 +2081,8 @@ static inline bool lockdep_vif_wiphy_mutex_held(struct ieee80211_vif *vif)
  * @IEEE80211_KEY_FLAG_GENERATE_MMIE: This flag should be set by the driver
  *	for a AES_CMAC key to indicate that it requires sequence number
  *	generation only
+ * @IEEE80211_KEY_FLAG_SPP_AMSDU: SPP A-MSDUs can be used with this key
+ *	(set by mac80211 from the sta->spp_amsdu flag)
  */
 enum ieee80211_key_flags {
 	IEEE80211_KEY_FLAG_GENERATE_IV_MGMT	= BIT(0),
@@ -2094,6 +2096,7 @@ enum ieee80211_key_flags {
 	IEEE80211_KEY_FLAG_PUT_MIC_SPACE	= BIT(8),
 	IEEE80211_KEY_FLAG_NO_AUTO_TX		= BIT(9),
 	IEEE80211_KEY_FLAG_GENERATE_MMIE	= BIT(10),
+	IEEE80211_KEY_FLAG_SPP_AMSDU		= BIT(11),
 };
 
 /**
@@ -2392,6 +2395,7 @@ struct ieee80211_link_sta {
  *	would be assigned to link[link_id] where link_id is the id assigned
  *	by the AP.
  * @valid_links: bitmap of valid links, or 0 for non-MLO
+ * @spp_amsdu: indicates whether the STA uses SPP A-MSDU or not.
  */
 struct ieee80211_sta {
 	u8 addr[ETH_ALEN];
@@ -2405,6 +2409,7 @@ struct ieee80211_sta {
 	bool tdls_initiator;
 	bool mfp;
 	bool mlo;
+	bool spp_amsdu;
 	u8 max_amsdu_subframes;
 
 	struct ieee80211_sta_aggregates *cur;
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 1e8da6372da2..dd237081dbe3 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -1942,6 +1942,9 @@ static int sta_apply_parameters(struct ieee80211_local *local,
 			clear_sta_flag(sta, WLAN_STA_TDLS_PEER);
 	}
 
+	if (mask & BIT(NL80211_STA_FLAG_SPP_AMSDU))
+		sta->sta.spp_amsdu = set & BIT(NL80211_STA_FLAG_SPP_AMSDU);
+
 	/* mark TDLS channel switch support, if the AP allows it */
 	if (test_sta_flag(sta, WLAN_STA_TDLS_PEER) &&
 	    !sdata->deflink.u.mgd.tdls_chan_switch_prohibited &&
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 79a5067ce82b..eb32174984c3 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -441,6 +441,7 @@ struct ieee80211_mgd_assoc_data {
 	bool timeout_started;
 	bool comeback; /* whether the AP has requested association comeback */
 	bool s1g;
+	bool spp_amsdu;
 
 	unsigned int assoc_link_id;
 
diff --git a/net/mac80211/key.c b/net/mac80211/key.c
index af74d7f9d94d..a2cce62c97b7 100644
--- a/net/mac80211/key.c
+++ b/net/mac80211/key.c
@@ -925,6 +925,10 @@ int ieee80211_key_link(struct ieee80211_key *key,
 	 */
 	key->color = atomic_inc_return(&key_color);
 
+	/* keep this flag for easier access later */
+	if (sta && sta->sta.spp_amsdu)
+		key->conf.flags |= IEEE80211_KEY_FLAG_SPP_AMSDU;
+
 	increment_tailroom_need_count(sdata);
 
 	ret = ieee80211_key_replace(sdata, link, sta, pairwise, old_key, key);
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 13c417eda281..d48fa1147c14 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -735,8 +735,11 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len,
 				      NL80211_EXT_FEATURE_SCAN_MIN_PREQ_CONTENT);
 	}
 
-	if (!ops->set_key)
+	if (!ops->set_key) {
 		wiphy->flags |= WIPHY_FLAG_IBSS_RSN;
+		wiphy_ext_feature_set(wiphy,
+				      NL80211_EXT_FEATURE_SPP_AMSDU_SUPPORT);
+	}
 
 	wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_TXQS);
 	wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_RRM);
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 5279af09fd53..fba596d81280 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -5132,6 +5132,8 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
 	if (WARN_ON(!sta))
 		goto out_err;
 
+	sta->sta.spp_amsdu = assoc_data->spp_amsdu;
+
 	if (ieee80211_vif_is_mld(&sdata->vif)) {
 		for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) {
 			if (!assoc_data->link[link_id].bss)
@@ -8216,6 +8218,8 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata,
 	assoc_data->wmm = bss->wmm_used &&
 			  (local->hw.queues >= IEEE80211_NUM_ACS);
 
+	assoc_data->spp_amsdu = req->flags & ASSOC_REQ_SPP_AMSDU;
+
 	/*
 	 * IEEE802.11n does not allow TKIP/WEP as pairwise ciphers in HT mode.
 	 * We still associate in non-HT mode (11a/b/g) if any one of these
diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c
index 94dae7cb6dbd..e40529b8c5c9 100644
--- a/net/mac80211/wpa.c
+++ b/net/mac80211/wpa.c
@@ -315,7 +315,7 @@ ieee80211_crypto_tkip_decrypt(struct ieee80211_rx_data *rx)
  * Calculate AAD for CCMP/GCMP, returning qos_tid since we
  * need that in CCMP also for b_0.
  */
-static u8 ccmp_gcmp_aad(struct sk_buff *skb, u8 *aad)
+static u8 ccmp_gcmp_aad(struct sk_buff *skb, u8 *aad, bool spp_amsdu)
 {
 	struct ieee80211_hdr *hdr = (void *)skb->data;
 	__le16 mask_fc;
@@ -340,7 +340,14 @@ static u8 ccmp_gcmp_aad(struct sk_buff *skb, u8 *aad)
 		len_a += 6;
 
 	if (ieee80211_is_data_qos(hdr->frame_control)) {
-		qos_tid = ieee80211_get_tid(hdr);
+		qos_tid = *ieee80211_get_qos_ctl(hdr);
+
+		if (spp_amsdu)
+			qos_tid &= IEEE80211_QOS_CTL_TID_MASK |
+				   IEEE80211_QOS_CTL_A_MSDU_PRESENT;
+		else
+			qos_tid &= IEEE80211_QOS_CTL_TID_MASK;
+
 		mask_fc &= ~cpu_to_le16(IEEE80211_FCTL_ORDER);
 		len_a += 2;
 	} else {
@@ -369,10 +376,11 @@ static u8 ccmp_gcmp_aad(struct sk_buff *skb, u8 *aad)
 	return qos_tid;
 }
 
-static void ccmp_special_blocks(struct sk_buff *skb, u8 *pn, u8 *b_0, u8 *aad)
+static void ccmp_special_blocks(struct sk_buff *skb, u8 *pn, u8 *b_0, u8 *aad,
+				bool spp_amsdu)
 {
 	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
-	u8 qos_tid = ccmp_gcmp_aad(skb, aad);
+	u8 qos_tid = ccmp_gcmp_aad(skb, aad, spp_amsdu);
 
 	/* In CCM, the initial vectors (IV) used for CTR mode encryption and CBC
 	 * mode authentication are not allowed to collide, yet both are derived
@@ -479,7 +487,8 @@ static int ccmp_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb,
 		return 0;
 
 	pos += IEEE80211_CCMP_HDR_LEN;
-	ccmp_special_blocks(skb, pn, b_0, aad);
+	ccmp_special_blocks(skb, pn, b_0, aad,
+			    key->conf.flags & IEEE80211_KEY_FLAG_SPP_AMSDU);
 	return ieee80211_aes_ccm_encrypt(key->u.ccmp.tfm, b_0, aad, pos, len,
 					 skb_put(skb, mic_len));
 }
@@ -557,7 +566,8 @@ ieee80211_crypto_ccmp_decrypt(struct ieee80211_rx_data *rx,
 			u8 aad[2 * AES_BLOCK_SIZE];
 			u8 b_0[AES_BLOCK_SIZE];
 			/* hardware didn't decrypt/verify MIC */
-			ccmp_special_blocks(skb, pn, b_0, aad);
+			ccmp_special_blocks(skb, pn, b_0, aad,
+					    key->conf.flags & IEEE80211_KEY_FLAG_SPP_AMSDU);
 
 			if (ieee80211_aes_ccm_decrypt(
 				    key->u.ccmp.tfm, b_0, aad,
@@ -581,7 +591,8 @@ ieee80211_crypto_ccmp_decrypt(struct ieee80211_rx_data *rx,
 	return RX_CONTINUE;
 }
 
-static void gcmp_special_blocks(struct sk_buff *skb, u8 *pn, u8 *j_0, u8 *aad)
+static void gcmp_special_blocks(struct sk_buff *skb, u8 *pn, u8 *j_0, u8 *aad,
+				bool spp_amsdu)
 {
 	struct ieee80211_hdr *hdr = (void *)skb->data;
 
@@ -591,7 +602,7 @@ static void gcmp_special_blocks(struct sk_buff *skb, u8 *pn, u8 *j_0, u8 *aad)
 	j_0[14] = 0;
 	j_0[AES_BLOCK_SIZE - 1] = 0x01;
 
-	ccmp_gcmp_aad(skb, aad);
+	ccmp_gcmp_aad(skb, aad, spp_amsdu);
 }
 
 static inline void gcmp_pn2hdr(u8 *hdr, const u8 *pn, int key_id)
@@ -680,7 +691,8 @@ static int gcmp_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb)
 		return 0;
 
 	pos += IEEE80211_GCMP_HDR_LEN;
-	gcmp_special_blocks(skb, pn, j_0, aad);
+	gcmp_special_blocks(skb, pn, j_0, aad,
+			    key->conf.flags & IEEE80211_KEY_FLAG_SPP_AMSDU);
 	return ieee80211_aes_gcm_encrypt(key->u.gcmp.tfm, j_0, aad, pos, len,
 					 skb_put(skb, IEEE80211_GCMP_MIC_LEN));
 }
@@ -753,7 +765,8 @@ ieee80211_crypto_gcmp_decrypt(struct ieee80211_rx_data *rx)
 			u8 aad[2 * AES_BLOCK_SIZE];
 			u8 j_0[AES_BLOCK_SIZE];
 			/* hardware didn't decrypt/verify MIC */
-			gcmp_special_blocks(skb, pn, j_0, aad);
+			gcmp_special_blocks(skb, pn, j_0, aad,
+					    key->conf.flags & IEEE80211_KEY_FLAG_SPP_AMSDU);
 
 			if (ieee80211_aes_gcm_decrypt(
 				    key->u.gcmp.tfm, j_0, aad,
-- 
cgit v1.2.3


From ccb964b4ab1663ce92f389b72c052fc47a0ffdb9 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 2 Jan 2024 21:35:44 +0200
Subject: wifi: cfg80211: validate MLO connections better

When going into an MLO connection, validate that the link IDs
match what userspace indicated, and that the AP MLD addresses
and capabilities are all matching between the links.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Reviewed-by: Gregory Greenman <gregory.greenman@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://msgid.link/20240102213313.ff83c034cb9a.I9962db0bfa8c73b37b8d5b59a3fad7f02f2129ae@changeid
[roll in extra fix from Miri to actually check the return value]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h |  24 ++++++++
 net/wireless/core.h       |   3 +-
 net/wireless/mlme.c       | 136 +++++++++++++++++++++++++++++++++++++++++-----
 net/wireless/nl80211.c    |   3 +-
 net/wireless/sme.c        |   3 +-
 5 files changed, 152 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index f0c068446c79..a70388ae3a7b 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -4935,6 +4935,30 @@ static inline u8 ieee80211_mle_common_size(const u8 *data)
 	return sizeof(*mle) + common + mle->variable[0];
 }
 
+/**
+ * ieee80211_mle_get_link_id - returns the link ID
+ * @data: the basic multi link element
+ *
+ * The element is assumed to be of the correct type (BASIC) and big enough,
+ * this must be checked using ieee80211_mle_type_ok().
+ *
+ * If the BSS link ID can't be found, -1 will be returned
+ */
+static inline int ieee80211_mle_get_link_id(const u8 *data)
+{
+	const struct ieee80211_multi_link_elem *mle = (const void *)data;
+	u16 control = le16_to_cpu(mle->control);
+	const u8 *common = mle->variable;
+
+	/* common points now at the beginning of ieee80211_mle_basic_common_info */
+	common += sizeof(struct ieee80211_mle_basic_common_info);
+
+	if (!(control & IEEE80211_MLC_BASIC_PRES_LINK_ID))
+		return -1;
+
+	return *common;
+}
+
 /**
  * ieee80211_mle_get_bss_param_ch_cnt - returns the BSS parameter change count
  * @mle: the basic multi link element
diff --git a/net/wireless/core.h b/net/wireless/core.h
index 13657a85cf61..30434551b377 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -362,7 +362,8 @@ int cfg80211_mlme_auth(struct cfg80211_registered_device *rdev,
 		       struct cfg80211_auth_request *req);
 int cfg80211_mlme_assoc(struct cfg80211_registered_device *rdev,
 			struct net_device *dev,
-			struct cfg80211_assoc_request *req);
+			struct cfg80211_assoc_request *req,
+			struct netlink_ext_ack *extack);
 int cfg80211_mlme_deauth(struct cfg80211_registered_device *rdev,
 			 struct net_device *dev, const u8 *bssid,
 			 const u8 *ie, int ie_len, u16 reason,
diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c
index 43ba7891e2a3..4052041a19ea 100644
--- a/net/wireless/mlme.c
+++ b/net/wireless/mlme.c
@@ -4,7 +4,7 @@
  *
  * Copyright (c) 2009, Jouni Malinen <j@w1.fi>
  * Copyright (c) 2015		Intel Deutschland GmbH
- * Copyright (C) 2019-2020, 2022-2023 Intel Corporation
+ * Copyright (C) 2019-2020, 2022-2024 Intel Corporation
  */
 
 #include <linux/kernel.h>
@@ -325,28 +325,136 @@ void cfg80211_oper_and_vht_capa(struct ieee80211_vht_cap *vht_capa,
 		p1[i] &= p2[i];
 }
 
-/* Note: caller must cfg80211_put_bss() regardless of result */
-int cfg80211_mlme_assoc(struct cfg80211_registered_device *rdev,
-			struct net_device *dev,
-			struct cfg80211_assoc_request *req)
+static int
+cfg80211_mlme_check_mlo_compat(const struct ieee80211_multi_link_elem *mle_a,
+			       const struct ieee80211_multi_link_elem *mle_b,
+			       struct netlink_ext_ack *extack)
 {
-	struct wireless_dev *wdev = dev->ieee80211_ptr;
-	int err, i, j;
+	const struct ieee80211_mle_basic_common_info *common_a, *common_b;
 
-	lockdep_assert_wiphy(wdev->wiphy);
+	common_a = (const void *)mle_a->variable;
+	common_b = (const void *)mle_b->variable;
+
+	if (memcmp(common_a->mld_mac_addr, common_b->mld_mac_addr, ETH_ALEN)) {
+		NL_SET_ERR_MSG(extack, "AP MLD address mismatch");
+		return -EINVAL;
+	}
+
+	if (ieee80211_mle_get_eml_med_sync_delay((const u8 *)mle_a) !=
+	    ieee80211_mle_get_eml_med_sync_delay((const u8 *)mle_b)) {
+		NL_SET_ERR_MSG(extack, "link EML medium sync delay mismatch");
+		return -EINVAL;
+	}
+
+	if (ieee80211_mle_get_eml_cap((const u8 *)mle_a) !=
+	    ieee80211_mle_get_eml_cap((const u8 *)mle_b)) {
+		NL_SET_ERR_MSG(extack, "link EML capabilities mismatch");
+		return -EINVAL;
+	}
+
+	if (ieee80211_mle_get_mld_capa_op((const u8 *)mle_a) !=
+	    ieee80211_mle_get_mld_capa_op((const u8 *)mle_b)) {
+		NL_SET_ERR_MSG(extack, "link MLD capabilities/ops mismatch");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int cfg80211_mlme_check_mlo(struct net_device *dev,
+				   struct cfg80211_assoc_request *req,
+				   struct netlink_ext_ack *extack)
+{
+	const struct ieee80211_multi_link_elem *mles[ARRAY_SIZE(req->links)] = {};
+	int i;
+
+	if (req->link_id < 0)
+		return 0;
+
+	if (!req->links[req->link_id].bss) {
+		NL_SET_ERR_MSG(extack, "no BSS for assoc link");
+		return -EINVAL;
+	}
+
+	rcu_read_lock();
+	for (i = 0; i < ARRAY_SIZE(req->links); i++) {
+		const struct cfg80211_bss_ies *ies;
+		const struct element *ml;
 
-	for (i = 1; i < ARRAY_SIZE(req->links); i++) {
 		if (!req->links[i].bss)
 			continue;
-		for (j = 0; j < i; j++) {
-			if (req->links[i].bss == req->links[j].bss)
-				return -EINVAL;
+
+		if (ether_addr_equal(req->links[i].bss->bssid, dev->dev_addr)) {
+			NL_SET_ERR_MSG(extack, "BSSID must not be our address");
+			req->links[i].error = -EINVAL;
+			goto error;
 		}
 
-		if (ether_addr_equal(req->links[i].bss->bssid, dev->dev_addr))
-			return -EINVAL;
+		ies = rcu_dereference(req->links[i].bss->ies);
+		ml = cfg80211_find_ext_elem(WLAN_EID_EXT_EHT_MULTI_LINK,
+					    ies->data, ies->len);
+		if (!ml) {
+			NL_SET_ERR_MSG(extack, "MLO BSS w/o ML element");
+			req->links[i].error = -EINVAL;
+			goto error;
+		}
+
+		if (!ieee80211_mle_type_ok(ml->data + 1,
+					   IEEE80211_ML_CONTROL_TYPE_BASIC,
+					   ml->datalen - 1)) {
+			NL_SET_ERR_MSG(extack, "BSS with invalid ML element");
+			req->links[i].error = -EINVAL;
+			goto error;
+		}
+
+		mles[i] = (const void *)(ml->data + 1);
+
+		if (ieee80211_mle_get_link_id((const u8 *)mles[i]) != i) {
+			NL_SET_ERR_MSG(extack, "link ID mismatch");
+			req->links[i].error = -EINVAL;
+			goto error;
+		}
+	}
+
+	if (WARN_ON(!mles[req->link_id]))
+		goto error;
+
+	for (i = 0; i < ARRAY_SIZE(req->links); i++) {
+		if (i == req->link_id || !req->links[i].bss)
+			continue;
+
+		if (WARN_ON(!mles[i]))
+			goto error;
+
+		if (cfg80211_mlme_check_mlo_compat(mles[req->link_id], mles[i],
+						   extack)) {
+			req->links[i].error = -EINVAL;
+			goto error;
+		}
 	}
 
+	rcu_read_unlock();
+	return 0;
+error:
+	rcu_read_unlock();
+	return -EINVAL;
+}
+
+/* Note: caller must cfg80211_put_bss() regardless of result */
+int cfg80211_mlme_assoc(struct cfg80211_registered_device *rdev,
+			struct net_device *dev,
+			struct cfg80211_assoc_request *req,
+			struct netlink_ext_ack *extack)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	int err;
+
+	lockdep_assert_wiphy(wdev->wiphy);
+
+	err = cfg80211_mlme_check_mlo(dev, req, extack);
+	if (err)
+		return err;
+
 	if (wdev->connected &&
 	    (!req->prev_bssid ||
 	     !ether_addr_equal(wdev->u.client.connected_addr, req->prev_bssid)))
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index b267317aa33c..0809f721f045 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -11245,7 +11245,8 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info)
 		struct nlattr *link;
 		int rem = 0;
 
-		err = cfg80211_mlme_assoc(rdev, dev, &req);
+		err = cfg80211_mlme_assoc(rdev, dev, &req,
+					  info->extack);
 
 		if (!err && info->attrs[NL80211_ATTR_SOCKET_OWNER]) {
 			dev->ieee80211_ptr->conn_owner_nlportid =
diff --git a/net/wireless/sme.c b/net/wireless/sme.c
index 195c8532734b..82e3ce42206c 100644
--- a/net/wireless/sme.c
+++ b/net/wireless/sme.c
@@ -209,7 +209,8 @@ static int cfg80211_conn_do_work(struct wireless_dev *wdev,
 		if (!req.bss) {
 			err = -ENOENT;
 		} else {
-			err = cfg80211_mlme_assoc(rdev, wdev->netdev, &req);
+			err = cfg80211_mlme_assoc(rdev, wdev->netdev,
+						  &req, NULL);
 			cfg80211_put_bss(&rdev->wiphy, req.bss);
 		}
 
-- 
cgit v1.2.3


From a923ff876f4b6133a093482a6d465cde3bc2e65c Mon Sep 17 00:00:00 2001
From: Jeff Johnson <quic_jjohnson@quicinc.com>
Date: Thu, 25 Jan 2024 14:55:47 -0800
Subject: Revert "nl80211/cfg80211: Specify band specific min RSSI thresholds
 with sched scan"

This *mostly* reverts commit 1e1b11b6a111 ("nl80211/cfg80211: Specify
band specific min RSSI thresholds with sched scan").

During the review of a new patch [1] it was observed that the
functionality being modified was not actually being used by any
in-tree driver. Further research determined that the functionality was
originally introduced to support a new Android interface, but that
interface was subsequently abandoned. Since the functionality has
apparently never been used, remove it. However, to mantain the
sanctity of the UABI, keep the nl80211.h assignments, but clearly mark
them as obsolete.

Cc: Lin Ma <linma@zju.edu.cn>
Cc: Vamsi Krishna <quic_vamsin@quicinc.com>
Link: https://lore.kernel.org/linux-wireless/20240119151201.8670-1-linma@zju.edu.cn/ [1]
Signed-off-by: Jeff Johnson <quic_jjohnson@quicinc.com>
Link: https://msgid.link/20240125-for-next-v1-1-fd79e01c6c09@quicinc.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       |  8 -------
 include/uapi/linux/nl80211.h | 16 ++++---------
 net/wireless/nl80211.c       | 55 --------------------------------------------
 3 files changed, 4 insertions(+), 75 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 56bce924bec6..51b9e6fa12f8 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -2695,19 +2695,11 @@ static inline void get_random_mask_addr(u8 *buf, const u8 *addr, const u8 *mask)
  * @bssid: BSSID to be matched; may be all-zero BSSID in case of SSID match
  *	or no match (RSSI only)
  * @rssi_thold: don't report scan results below this threshold (in s32 dBm)
- * @per_band_rssi_thold: Minimum rssi threshold for each band to be applied
- *	for filtering out scan results received. Drivers advertise this support
- *	of band specific rssi based filtering through the feature capability
- *	%NL80211_EXT_FEATURE_SCHED_SCAN_BAND_SPECIFIC_RSSI_THOLD. These band
- *	specific rssi thresholds take precedence over rssi_thold, if specified.
- *	If not specified for any band, it will be assigned with rssi_thold of
- *	corresponding matchset.
  */
 struct cfg80211_match_set {
 	struct cfg80211_ssid ssid;
 	u8 bssid[ETH_ALEN];
 	s32 rssi_thold;
-	s32 per_band_rssi_thold[NUM_NL80211_BANDS];
 };
 
 /**
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 3e239df3528f..853ac538a686 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -4463,14 +4463,7 @@ enum nl80211_reg_rule_attr {
  *	value as specified by &struct nl80211_bss_select_rssi_adjust.
  * @NL80211_SCHED_SCAN_MATCH_ATTR_BSSID: BSSID to be used for matching
  *	(this cannot be used together with SSID).
- * @NL80211_SCHED_SCAN_MATCH_PER_BAND_RSSI: Nested attribute that carries the
- *	band specific minimum rssi thresholds for the bands defined in
- *	enum nl80211_band. The minimum rssi threshold value(s32) specific to a
- *	band shall be encapsulated in attribute with type value equals to one
- *	of the NL80211_BAND_* defined in enum nl80211_band. For example, the
- *	minimum rssi threshold value for 2.4GHZ band shall be encapsulated
- *	within an attribute of type NL80211_BAND_2GHZ. And one or more of such
- *	attributes will be nested within this attribute.
+ * @NL80211_SCHED_SCAN_MATCH_PER_BAND_RSSI: Obsolete
  * @NL80211_SCHED_SCAN_MATCH_ATTR_MAX: highest scheduled scan filter
  *	attribute number currently defined
  * @__NL80211_SCHED_SCAN_MATCH_ATTR_AFTER_LAST: internal use
@@ -4483,7 +4476,7 @@ enum nl80211_sched_scan_match_attr {
 	NL80211_SCHED_SCAN_MATCH_ATTR_RELATIVE_RSSI,
 	NL80211_SCHED_SCAN_MATCH_ATTR_RSSI_ADJUST,
 	NL80211_SCHED_SCAN_MATCH_ATTR_BSSID,
-	NL80211_SCHED_SCAN_MATCH_PER_BAND_RSSI,
+	NL80211_SCHED_SCAN_MATCH_PER_BAND_RSSI, /* obsolete */
 
 	/* keep last */
 	__NL80211_SCHED_SCAN_MATCH_ATTR_AFTER_LAST,
@@ -6418,8 +6411,7 @@ enum nl80211_feature_flags {
  * @NL80211_EXT_FEATURE_AP_PMKSA_CACHING: Driver/device supports PMKSA caching
  *	(set/del PMKSA operations) in AP mode.
  *
- * @NL80211_EXT_FEATURE_SCHED_SCAN_BAND_SPECIFIC_RSSI_THOLD: Driver supports
- *	filtering of sched scan results using band specific RSSI thresholds.
+ * @NL80211_EXT_FEATURE_SCHED_SCAN_BAND_SPECIFIC_RSSI_THOLD: Obsolete
  *
  * @NL80211_EXT_FEATURE_STA_TX_PWR: This driver supports controlling tx power
  *	to a station.
@@ -6574,7 +6566,7 @@ enum nl80211_ext_feature_index {
 	NL80211_EXT_FEATURE_ENABLE_FTM_RESPONDER,
 	NL80211_EXT_FEATURE_AIRTIME_FAIRNESS,
 	NL80211_EXT_FEATURE_AP_PMKSA_CACHING,
-	NL80211_EXT_FEATURE_SCHED_SCAN_BAND_SPECIFIC_RSSI_THOLD,
+	NL80211_EXT_FEATURE_SCHED_SCAN_BAND_SPECIFIC_RSSI_THOLD, /* obsolete */
 	NL80211_EXT_FEATURE_EXT_KEY_ID,
 	NL80211_EXT_FEATURE_STA_TX_PWR,
 	NL80211_EXT_FEATURE_SAE_OFFLOAD,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 0809f721f045..e4f41f86e295 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -906,23 +906,12 @@ nl80211_rekey_policy[NUM_NL80211_REKEY_DATA] = {
 	[NL80211_REKEY_DATA_AKM] = { .type = NLA_U32 },
 };
 
-static const struct nla_policy
-nl80211_match_band_rssi_policy[NUM_NL80211_BANDS] = {
-	[NL80211_BAND_2GHZ] = { .type = NLA_S32 },
-	[NL80211_BAND_5GHZ] = { .type = NLA_S32 },
-	[NL80211_BAND_6GHZ] = { .type = NLA_S32 },
-	[NL80211_BAND_60GHZ] = { .type = NLA_S32 },
-	[NL80211_BAND_LC]    = { .type = NLA_S32 },
-};
-
 static const struct nla_policy
 nl80211_match_policy[NL80211_SCHED_SCAN_MATCH_ATTR_MAX + 1] = {
 	[NL80211_SCHED_SCAN_MATCH_ATTR_SSID] = { .type = NLA_BINARY,
 						 .len = IEEE80211_MAX_SSID_LEN },
 	[NL80211_SCHED_SCAN_MATCH_ATTR_BSSID] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN),
 	[NL80211_SCHED_SCAN_MATCH_ATTR_RSSI] = { .type = NLA_U32 },
-	[NL80211_SCHED_SCAN_MATCH_PER_BAND_RSSI] =
-		NLA_POLICY_NESTED(nl80211_match_band_rssi_policy),
 };
 
 static const struct nla_policy
@@ -9490,41 +9479,6 @@ nl80211_parse_sched_scan_plans(struct wiphy *wiphy, int n_plans,
 	return 0;
 }
 
-static int
-nl80211_parse_sched_scan_per_band_rssi(struct wiphy *wiphy,
-				       struct cfg80211_match_set *match_sets,
-				       struct nlattr *tb_band_rssi,
-				       s32 rssi_thold)
-{
-	struct nlattr *attr;
-	int i, tmp, ret = 0;
-
-	if (!wiphy_ext_feature_isset(wiphy,
-		    NL80211_EXT_FEATURE_SCHED_SCAN_BAND_SPECIFIC_RSSI_THOLD)) {
-		if (tb_band_rssi)
-			ret = -EOPNOTSUPP;
-		else
-			for (i = 0; i < NUM_NL80211_BANDS; i++)
-				match_sets->per_band_rssi_thold[i] =
-					NL80211_SCAN_RSSI_THOLD_OFF;
-		return ret;
-	}
-
-	for (i = 0; i < NUM_NL80211_BANDS; i++)
-		match_sets->per_band_rssi_thold[i] = rssi_thold;
-
-	nla_for_each_nested(attr, tb_band_rssi, tmp) {
-		enum nl80211_band band = nla_type(attr);
-
-		if (band < 0 || band >= NUM_NL80211_BANDS)
-			return -EINVAL;
-
-		match_sets->per_band_rssi_thold[band] =	nla_get_s32(attr);
-	}
-
-	return 0;
-}
-
 static struct cfg80211_sched_scan_request *
 nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev,
 			 struct nlattr **attrs, int max_match_sets)
@@ -9799,15 +9753,6 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev,
 			if (rssi)
 				request->match_sets[i].rssi_thold =
 					nla_get_s32(rssi);
-
-			/* Parse per band RSSI attribute */
-			err = nl80211_parse_sched_scan_per_band_rssi(wiphy,
-				&request->match_sets[i],
-				tb[NL80211_SCHED_SCAN_MATCH_PER_BAND_RSSI],
-				request->match_sets[i].rssi_thold);
-			if (err)
-				goto out_free;
-
 			i++;
 		}
 
-- 
cgit v1.2.3


From 97af84a6bba2ab2b9c704c08e67de3b5ea551bb2 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Tue, 23 Jan 2024 09:08:53 -0800
Subject: af_unix: Do not use atomic ops for unix_sk(sk)->inflight.

When touching unix_sk(sk)->inflight, we are always under
spin_lock(&unix_gc_lock).

Let's convert unix_sk(sk)->inflight to the normal unsigned long.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://lore.kernel.org/r/20240123170856.41348-3-kuniyu@amazon.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/af_unix.h |  2 +-
 net/unix/af_unix.c    |  4 ++--
 net/unix/garbage.c    | 17 ++++++++---------
 net/unix/scm.c        |  8 +++++---
 4 files changed, 16 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 49c4640027d8..ac38b63db554 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -61,7 +61,7 @@ struct unix_sock {
 	struct mutex		iolock, bindlock;
 	struct sock		*peer;
 	struct list_head	link;
-	atomic_long_t		inflight;
+	unsigned long		inflight;
 	spinlock_t		lock;
 	unsigned long		gc_flags;
 #define UNIX_GC_CANDIDATE	0
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index ac1f2bc18fc9..1e9378036dcc 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -993,11 +993,11 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern,
 	sk->sk_write_space	= unix_write_space;
 	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
 	sk->sk_destruct		= unix_sock_destructor;
-	u	  = unix_sk(sk);
+	u = unix_sk(sk);
+	u->inflight = 0;
 	u->path.dentry = NULL;
 	u->path.mnt = NULL;
 	spin_lock_init(&u->lock);
-	atomic_long_set(&u->inflight, 0);
 	INIT_LIST_HEAD(&u->link);
 	mutex_init(&u->iolock); /* single task reading lock */
 	mutex_init(&u->bindlock); /* single task binding lock */
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index af3d2221cf6a..051f96a3ab02 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -166,17 +166,18 @@ static void scan_children(struct sock *x, void (*func)(struct unix_sock *),
 
 static void dec_inflight(struct unix_sock *usk)
 {
-	atomic_long_dec(&usk->inflight);
+	usk->inflight--;
 }
 
 static void inc_inflight(struct unix_sock *usk)
 {
-	atomic_long_inc(&usk->inflight);
+	usk->inflight++;
 }
 
 static void inc_inflight_move_tail(struct unix_sock *u)
 {
-	atomic_long_inc(&u->inflight);
+	u->inflight++;
+
 	/* If this still might be part of a cycle, move it to the end
 	 * of the list, so that it's checked even if it was already
 	 * passed over
@@ -237,14 +238,12 @@ void unix_gc(void)
 	 */
 	list_for_each_entry_safe(u, next, &gc_inflight_list, link) {
 		long total_refs;
-		long inflight_refs;
 
 		total_refs = file_count(u->sk.sk_socket->file);
-		inflight_refs = atomic_long_read(&u->inflight);
 
-		BUG_ON(inflight_refs < 1);
-		BUG_ON(total_refs < inflight_refs);
-		if (total_refs == inflight_refs) {
+		BUG_ON(!u->inflight);
+		BUG_ON(total_refs < u->inflight);
+		if (total_refs == u->inflight) {
 			list_move_tail(&u->link, &gc_candidates);
 			__set_bit(UNIX_GC_CANDIDATE, &u->gc_flags);
 			__set_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags);
@@ -271,7 +270,7 @@ void unix_gc(void)
 		/* Move cursor to after the current position. */
 		list_move(&cursor, &u->link);
 
-		if (atomic_long_read(&u->inflight) > 0) {
+		if (u->inflight) {
 			list_move_tail(&u->link, &not_cycle_list);
 			__clear_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags);
 			scan_children(&u->sk, inc_inflight_move_tail, NULL);
diff --git a/net/unix/scm.c b/net/unix/scm.c
index 822ce0d0d791..e92f2fad6410 100644
--- a/net/unix/scm.c
+++ b/net/unix/scm.c
@@ -53,12 +53,13 @@ void unix_inflight(struct user_struct *user, struct file *fp)
 	if (s) {
 		struct unix_sock *u = unix_sk(s);
 
-		if (atomic_long_inc_return(&u->inflight) == 1) {
+		if (!u->inflight) {
 			BUG_ON(!list_empty(&u->link));
 			list_add_tail(&u->link, &gc_inflight_list);
 		} else {
 			BUG_ON(list_empty(&u->link));
 		}
+		u->inflight++;
 		/* Paired with READ_ONCE() in wait_for_unix_gc() */
 		WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + 1);
 	}
@@ -75,10 +76,11 @@ void unix_notinflight(struct user_struct *user, struct file *fp)
 	if (s) {
 		struct unix_sock *u = unix_sk(s);
 
-		BUG_ON(!atomic_long_read(&u->inflight));
+		BUG_ON(!u->inflight);
 		BUG_ON(list_empty(&u->link));
 
-		if (atomic_long_dec_and_test(&u->inflight))
+		u->inflight--;
+		if (!u->inflight)
 			list_del_init(&u->link);
 		/* Paired with READ_ONCE() in wait_for_unix_gc() */
 		WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - 1);
-- 
cgit v1.2.3


From 5b17307bd0789edea0675d524a2b277b93bbde62 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Tue, 23 Jan 2024 09:08:54 -0800
Subject: af_unix: Return struct unix_sock from unix_get_socket().

Currently, unix_get_socket() returns struct sock, but after calling
it, we always cast it to unix_sk().

Let's return struct unix_sock from unix_get_socket().

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Acked-by: Pavel Begunkov <asml.silence@gmail.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://lore.kernel.org/r/20240123170856.41348-4-kuniyu@amazon.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/af_unix.h |  2 +-
 net/unix/garbage.c    | 19 +++++++------------
 net/unix/scm.c        | 19 +++++++------------
 3 files changed, 15 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index ac38b63db554..2c98ef95017b 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -14,7 +14,7 @@ void unix_destruct_scm(struct sk_buff *skb);
 void io_uring_destruct_scm(struct sk_buff *skb);
 void unix_gc(void);
 void wait_for_unix_gc(void);
-struct sock *unix_get_socket(struct file *filp);
+struct unix_sock *unix_get_socket(struct file *filp);
 struct sock *unix_peer_get(struct sock *sk);
 
 #define UNIX_HASH_MOD	(256 - 1)
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index 051f96a3ab02..d5ef46a75f1f 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -105,20 +105,15 @@ static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *),
 
 			while (nfd--) {
 				/* Get the socket the fd matches if it indeed does so */
-				struct sock *sk = unix_get_socket(*fp++);
+				struct unix_sock *u = unix_get_socket(*fp++);
 
-				if (sk) {
-					struct unix_sock *u = unix_sk(sk);
+				/* Ignore non-candidates, they could have been added
+				 * to the queues after starting the garbage collection
+				 */
+				if (u && test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) {
+					hit = true;
 
-					/* Ignore non-candidates, they could
-					 * have been added to the queues after
-					 * starting the garbage collection
-					 */
-					if (test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) {
-						hit = true;
-
-						func(u);
-					}
+					func(u);
 				}
 			}
 			if (hit && hitlist != NULL) {
diff --git a/net/unix/scm.c b/net/unix/scm.c
index e92f2fad6410..b5ae5ab16777 100644
--- a/net/unix/scm.c
+++ b/net/unix/scm.c
@@ -21,9 +21,8 @@ EXPORT_SYMBOL(gc_inflight_list);
 DEFINE_SPINLOCK(unix_gc_lock);
 EXPORT_SYMBOL(unix_gc_lock);
 
-struct sock *unix_get_socket(struct file *filp)
+struct unix_sock *unix_get_socket(struct file *filp)
 {
-	struct sock *u_sock = NULL;
 	struct inode *inode = file_inode(filp);
 
 	/* Socket ? */
@@ -34,10 +33,10 @@ struct sock *unix_get_socket(struct file *filp)
 
 		/* PF_UNIX ? */
 		if (s && ops && ops->family == PF_UNIX)
-			u_sock = s;
+			return unix_sk(s);
 	}
 
-	return u_sock;
+	return NULL;
 }
 EXPORT_SYMBOL(unix_get_socket);
 
@@ -46,13 +45,11 @@ EXPORT_SYMBOL(unix_get_socket);
  */
 void unix_inflight(struct user_struct *user, struct file *fp)
 {
-	struct sock *s = unix_get_socket(fp);
+	struct unix_sock *u = unix_get_socket(fp);
 
 	spin_lock(&unix_gc_lock);
 
-	if (s) {
-		struct unix_sock *u = unix_sk(s);
-
+	if (u) {
 		if (!u->inflight) {
 			BUG_ON(!list_empty(&u->link));
 			list_add_tail(&u->link, &gc_inflight_list);
@@ -69,13 +66,11 @@ void unix_inflight(struct user_struct *user, struct file *fp)
 
 void unix_notinflight(struct user_struct *user, struct file *fp)
 {
-	struct sock *s = unix_get_socket(fp);
+	struct unix_sock *u = unix_get_socket(fp);
 
 	spin_lock(&unix_gc_lock);
 
-	if (s) {
-		struct unix_sock *u = unix_sk(s);
-
+	if (u) {
 		BUG_ON(!u->inflight);
 		BUG_ON(list_empty(&u->link));
 
-- 
cgit v1.2.3


From d9f21b3613337b55cc9d4a6ead484dca68475143 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Tue, 23 Jan 2024 09:08:56 -0800
Subject: af_unix: Try to run GC async.

If more than 16000 inflight AF_UNIX sockets exist and the garbage
collector is not running, unix_(dgram|stream)_sendmsg() call unix_gc().
Also, they wait for unix_gc() to complete.

In unix_gc(), all inflight AF_UNIX sockets are traversed at least once,
and more if they are the GC candidate.  Thus, sendmsg() significantly
slows down with too many inflight AF_UNIX sockets.

However, if a process sends data with no AF_UNIX FD, the sendmsg() call
does not need to wait for GC.  After this change, only the process that
meets the condition below will be blocked under such a situation.

  1) cmsg contains AF_UNIX socket
  2) more than 32 AF_UNIX sent by the same user are still inflight

Note that even a sendmsg() call that does not meet the condition but has
AF_UNIX FD will be blocked later in unix_scm_to_skb() by the spinlock,
but we allow that as a bonus for sane users.

The results below are the time spent in unix_dgram_sendmsg() sending 1
byte of data with no FD 4096 times on a host where 32K inflight AF_UNIX
sockets exist.

Without series: the sane sendmsg() needs to wait gc unreasonably.

  $ sudo /usr/share/bcc/tools/funclatency -p 11165 unix_dgram_sendmsg
  Tracing 1 functions for "unix_dgram_sendmsg"... Hit Ctrl-C to end.
  ^C
       nsecs               : count     distribution
  [...]
      524288 -> 1048575    : 0        |                                        |
     1048576 -> 2097151    : 3881     |****************************************|
     2097152 -> 4194303    : 214      |**                                      |
     4194304 -> 8388607    : 1        |                                        |

  avg = 1825567 nsecs, total: 7477526027 nsecs, count: 4096

With series: the sane sendmsg() can finish much faster.

  $ sudo /usr/share/bcc/tools/funclatency -p 8702  unix_dgram_sendmsg
  Tracing 1 functions for "unix_dgram_sendmsg"... Hit Ctrl-C to end.
  ^C
       nsecs               : count     distribution
  [...]
         128 -> 255        : 0        |                                        |
         256 -> 511        : 4092     |****************************************|
         512 -> 1023       : 2        |                                        |
        1024 -> 2047       : 0        |                                        |
        2048 -> 4095       : 0        |                                        |
        4096 -> 8191       : 1        |                                        |
        8192 -> 16383      : 1        |                                        |

  avg = 410 nsecs, total: 1680510 nsecs, count: 4096

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Link: https://lore.kernel.org/r/20240123170856.41348-6-kuniyu@amazon.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/af_unix.h | 12 ++++++++++--
 include/net/scm.h     |  1 +
 net/core/scm.c        |  5 +++++
 net/unix/af_unix.c    |  6 ++++--
 net/unix/garbage.c    | 10 +++++++++-
 5 files changed, 29 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 2c98ef95017b..f045bbd9017d 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -8,13 +8,21 @@
 #include <linux/refcount.h>
 #include <net/sock.h>
 
+#if IS_ENABLED(CONFIG_UNIX)
+struct unix_sock *unix_get_socket(struct file *filp);
+#else
+static inline struct unix_sock *unix_get_socket(struct file *filp)
+{
+	return NULL;
+}
+#endif
+
 void unix_inflight(struct user_struct *user, struct file *fp);
 void unix_notinflight(struct user_struct *user, struct file *fp);
 void unix_destruct_scm(struct sk_buff *skb);
 void io_uring_destruct_scm(struct sk_buff *skb);
 void unix_gc(void);
-void wait_for_unix_gc(void);
-struct unix_sock *unix_get_socket(struct file *filp);
+void wait_for_unix_gc(struct scm_fp_list *fpl);
 struct sock *unix_peer_get(struct sock *sk);
 
 #define UNIX_HASH_MOD	(256 - 1)
diff --git a/include/net/scm.h b/include/net/scm.h
index cf68acec4d70..92276a2c5543 100644
--- a/include/net/scm.h
+++ b/include/net/scm.h
@@ -25,6 +25,7 @@ struct scm_creds {
 
 struct scm_fp_list {
 	short			count;
+	short			count_unix;
 	short			max;
 	struct user_struct	*user;
 	struct file		*fp[SCM_MAX_FD];
diff --git a/net/core/scm.c b/net/core/scm.c
index d0e0852a24d5..9cd4b0a01cd6 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -36,6 +36,7 @@
 #include <net/compat.h>
 #include <net/scm.h>
 #include <net/cls_cgroup.h>
+#include <net/af_unix.h>
 
 
 /*
@@ -85,6 +86,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
 			return -ENOMEM;
 		*fplp = fpl;
 		fpl->count = 0;
+		fpl->count_unix = 0;
 		fpl->max = SCM_MAX_FD;
 		fpl->user = NULL;
 	}
@@ -109,6 +111,9 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
 			fput(file);
 			return -EINVAL;
 		}
+		if (unix_get_socket(file))
+			fpl->count_unix++;
+
 		*fpp++ = file;
 		fpl->count++;
 	}
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 1e9378036dcc..1720419d93d6 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1923,11 +1923,12 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
 	long timeo;
 	int err;
 
-	wait_for_unix_gc();
 	err = scm_send(sock, msg, &scm, false);
 	if (err < 0)
 		return err;
 
+	wait_for_unix_gc(scm.fp);
+
 	err = -EOPNOTSUPP;
 	if (msg->msg_flags&MSG_OOB)
 		goto out;
@@ -2199,11 +2200,12 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
 	bool fds_sent = false;
 	int data_len;
 
-	wait_for_unix_gc();
 	err = scm_send(sock, msg, &scm, false);
 	if (err < 0)
 		return err;
 
+	wait_for_unix_gc(scm.fp);
+
 	err = -EOPNOTSUPP;
 	if (msg->msg_flags & MSG_OOB) {
 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index 3f599db59f9d..4046c606f0e6 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -311,8 +311,9 @@ void unix_gc(void)
 }
 
 #define UNIX_INFLIGHT_TRIGGER_GC 16000
+#define UNIX_INFLIGHT_SANE_USER (SCM_MAX_FD * 8)
 
-void wait_for_unix_gc(void)
+void wait_for_unix_gc(struct scm_fp_list *fpl)
 {
 	/* If number of inflight sockets is insane,
 	 * force a garbage collect right now.
@@ -324,6 +325,13 @@ void wait_for_unix_gc(void)
 	    !READ_ONCE(gc_in_progress))
 		unix_gc();
 
+	/* Penalise users who want to send AF_UNIX sockets
+	 * but whose sockets have not been received yet.
+	 */
+	if (!fpl || !fpl->count_unix ||
+	    READ_ONCE(fpl->user->unix_inflight) < UNIX_INFLIGHT_SANE_USER)
+		return;
+
 	if (READ_ONCE(gc_in_progress))
 		flush_work(&unix_gc_work);
 }
-- 
cgit v1.2.3


From 7ae215ee7bb855f13c80565470fc7f67db4ba82f Mon Sep 17 00:00:00 2001
From: Christian Marangi <ansuelsmth@gmail.com>
Date: Thu, 25 Jan 2024 21:36:59 +0100
Subject: net: phy: add support for PHY LEDs polarity modes

Add support for PHY LEDs polarity modes. Some PHY require LED to be set
to active low to be turned ON. Adds support for this by declaring
active-low property in DT.

PHY driver needs to declare .led_polarity_set() to configure LED
polarity modes. Function will pass the index with the LED index and a
bitmap with all the required modes to set.

Current supported modes are:
- active-low with the flag PHY_LED_ACTIVE_LOW. LED is set to active-low
  to turn it ON.
- inactive-high-impedance with the flag PHY_LED_INACTIVE_HIGH_IMPEDANCE.
  LED is set to high impedance to turn it OFF.

Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://lore.kernel.org/r/20240125203702.4552-4-ansuelsmth@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/phy_device.c | 16 ++++++++++++++++
 include/linux/phy.h          | 22 ++++++++++++++++++++++
 2 files changed, 38 insertions(+)

(limited to 'include')

diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 3611ea64875e..dd778c7fde1d 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -3097,6 +3097,7 @@ static int of_phy_led(struct phy_device *phydev,
 	struct device *dev = &phydev->mdio.dev;
 	struct led_init_data init_data = {};
 	struct led_classdev *cdev;
+	unsigned long modes = 0;
 	struct phy_led *phyled;
 	u32 index;
 	int err;
@@ -3114,6 +3115,21 @@ static int of_phy_led(struct phy_device *phydev,
 	if (index > U8_MAX)
 		return -EINVAL;
 
+	if (of_property_read_bool(led, "active-low"))
+		set_bit(PHY_LED_ACTIVE_LOW, &modes);
+	if (of_property_read_bool(led, "inactive-high-impedance"))
+		set_bit(PHY_LED_INACTIVE_HIGH_IMPEDANCE, &modes);
+
+	if (modes) {
+		/* Return error if asked to set polarity modes but not supported */
+		if (!phydev->drv->led_polarity_set)
+			return -EINVAL;
+
+		err = phydev->drv->led_polarity_set(phydev, index, modes);
+		if (err)
+			return err;
+	}
+
 	phyled->index = index;
 	if (phydev->drv->led_brightness_set)
 		cdev->brightness_set_blocking = phy_led_set_brightness;
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 684efaeca07c..c9994a59ca2e 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -852,6 +852,15 @@ struct phy_plca_status {
 	bool pst;
 };
 
+/* Modes for PHY LED configuration */
+enum phy_led_modes {
+	PHY_LED_ACTIVE_LOW = 0,
+	PHY_LED_INACTIVE_HIGH_IMPEDANCE = 1,
+
+	/* keep it last */
+	__PHY_LED_MODES_NUM,
+};
+
 /**
  * struct phy_led: An LED driven by the PHY
  *
@@ -1145,6 +1154,19 @@ struct phy_driver {
 	int (*led_hw_control_get)(struct phy_device *dev, u8 index,
 				  unsigned long *rules);
 
+	/**
+	 * @led_polarity_set: Set the LED polarity modes
+	 * @dev: PHY device which has the LED
+	 * @index: Which LED of the PHY device
+	 * @modes: bitmap of LED polarity modes
+	 *
+	 * Configure LED with all the required polarity modes in @modes
+	 * to make it correctly turn ON or OFF.
+	 *
+	 * Returns 0, or an error code.
+	 */
+	int (*led_polarity_set)(struct phy_device *dev, int index,
+				unsigned long modes);
 };
 #define to_phy_driver(d) container_of(to_mdio_common_driver(d),		\
 				      struct phy_driver, mdiodrv)
-- 
cgit v1.2.3


From ea1cc3ee34a5f3144f6c2cdc07c19c914ccb9526 Mon Sep 17 00:00:00 2001
From: Min Li <min.li.xe@renesas.com>
Date: Wed, 24 Jan 2024 13:49:46 -0500
Subject: ptp: introduce PTP_CLOCK_EXTOFF event for the measured external
 offset

This change is for the PHC devices that can measure the phase offset
between PHC signal and the external signal, such as the 1PPS signal of
GNSS. Reporting PTP_CLOCK_EXTOFF to user space will be piggy-backed to
the existing ptp_extts_event so that application such as ts2phc can
poll the external offset the same way as extts. Hence, ts2phc can use
the offset to achieve the alignment between PHC and the external signal
by the help of either SW or HW filters.

Signed-off-by: Min Li <min.li.xe@renesas.com>
Acked-by: Richard Cochran <richardcochran@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/ptp/ptp_clock.c          | 16 +++++++++++++++-
 include/linux/ptp_clock_kernel.h |  3 +++
 include/uapi/linux/ptp_clock.h   | 13 ++++++++++---
 3 files changed, 28 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/ptp/ptp_clock.c b/drivers/ptp/ptp_clock.c
index 15b804ba4868..3aaf1a3430c5 100644
--- a/drivers/ptp/ptp_clock.c
+++ b/drivers/ptp/ptp_clock.c
@@ -44,18 +44,31 @@ static void enqueue_external_timestamp(struct timestamp_event_queue *queue,
 				       struct ptp_clock_event *src)
 {
 	struct ptp_extts_event *dst;
+	struct timespec64 offset_ts;
 	unsigned long flags;
 	s64 seconds;
 	u32 remainder;
 
-	seconds = div_u64_rem(src->timestamp, 1000000000, &remainder);
+	if (src->type == PTP_CLOCK_EXTTS) {
+		seconds = div_u64_rem(src->timestamp, 1000000000, &remainder);
+	} else if (src->type == PTP_CLOCK_EXTOFF) {
+		offset_ts = ns_to_timespec64(src->offset);
+		seconds = offset_ts.tv_sec;
+		remainder = offset_ts.tv_nsec;
+	} else {
+		WARN(1, "%s: unknown type %d\n", __func__, src->type);
+		return;
+	}
 
 	spin_lock_irqsave(&queue->lock, flags);
 
 	dst = &queue->buf[queue->tail];
 	dst->index = src->index;
+	dst->flags = PTP_EXTTS_EVENT_VALID;
 	dst->t.sec = seconds;
 	dst->t.nsec = remainder;
+	if (src->type == PTP_CLOCK_EXTOFF)
+		dst->flags |= PTP_EXT_OFFSET;
 
 	/* Both WRITE_ONCE() are paired with READ_ONCE() in queue_cnt() */
 	if (!queue_free(queue))
@@ -417,6 +430,7 @@ void ptp_clock_event(struct ptp_clock *ptp, struct ptp_clock_event *event)
 		break;
 
 	case PTP_CLOCK_EXTTS:
+	case PTP_CLOCK_EXTOFF:
 		/* Enqueue timestamp on selected queues */
 		spin_lock_irqsave(&ptp->tsevqs_lock, flags);
 		list_for_each_entry(tsevq, &ptp->tsevqs, qlist) {
diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h
index 1ef4e0f9bd2a..6e4b8206c7d0 100644
--- a/include/linux/ptp_clock_kernel.h
+++ b/include/linux/ptp_clock_kernel.h
@@ -200,6 +200,7 @@ struct ptp_clock;
 enum ptp_clock_events {
 	PTP_CLOCK_ALARM,
 	PTP_CLOCK_EXTTS,
+	PTP_CLOCK_EXTOFF,
 	PTP_CLOCK_PPS,
 	PTP_CLOCK_PPSUSR,
 };
@@ -210,6 +211,7 @@ enum ptp_clock_events {
  * @type:  One of the ptp_clock_events enumeration values.
  * @index: Identifies the source of the event.
  * @timestamp: When the event occurred (%PTP_CLOCK_EXTTS only).
+ * @offset:    When the event occurred (%PTP_CLOCK_EXTOFF only).
  * @pps_times: When the event occurred (%PTP_CLOCK_PPSUSR only).
  */
 
@@ -218,6 +220,7 @@ struct ptp_clock_event {
 	int index;
 	union {
 		u64 timestamp;
+		s64 offset;
 		struct pps_event_time pps_times;
 	};
 };
diff --git a/include/uapi/linux/ptp_clock.h b/include/uapi/linux/ptp_clock.h
index da700999cad4..053b40d642de 100644
--- a/include/uapi/linux/ptp_clock.h
+++ b/include/uapi/linux/ptp_clock.h
@@ -32,6 +32,7 @@
 #define PTP_RISING_EDGE    (1<<1)
 #define PTP_FALLING_EDGE   (1<<2)
 #define PTP_STRICT_FLAGS   (1<<3)
+#define PTP_EXT_OFFSET     (1<<4)
 #define PTP_EXTTS_EDGES    (PTP_RISING_EDGE | PTP_FALLING_EDGE)
 
 /*
@@ -40,7 +41,8 @@
 #define PTP_EXTTS_VALID_FLAGS	(PTP_ENABLE_FEATURE |	\
 				 PTP_RISING_EDGE |	\
 				 PTP_FALLING_EDGE |	\
-				 PTP_STRICT_FLAGS)
+				 PTP_STRICT_FLAGS |	\
+				 PTP_EXT_OFFSET)
 
 /*
  * flag fields valid for the original PTP_EXTTS_REQUEST ioctl.
@@ -50,6 +52,11 @@
 					 PTP_RISING_EDGE |	\
 					 PTP_FALLING_EDGE)
 
+/*
+ * flag fields valid for the ptp_extts_event report.
+ */
+#define PTP_EXTTS_EVENT_VALID	(PTP_ENABLE_FEATURE)
+
 /*
  * Bits of the ptp_perout_request.flags field:
  */
@@ -228,9 +235,9 @@ struct ptp_pin_desc {
 #define PTP_MASK_EN_SINGLE  _IOW(PTP_CLK_MAGIC, 20, unsigned int)
 
 struct ptp_extts_event {
-	struct ptp_clock_time t; /* Time event occured. */
+	struct ptp_clock_time t; /* Time event occurred. */
 	unsigned int index;      /* Which channel produced the event. */
-	unsigned int flags;      /* Reserved for future use. */
+	unsigned int flags;      /* Event type. */
 	unsigned int rsv[2];     /* Reserved for future use. */
 };
 
-- 
cgit v1.2.3


From 1ddfecafabf71e0e5345dff877d2680083c7e078 Mon Sep 17 00:00:00 2001
From: Min Li <min.li.xe@renesas.com>
Date: Wed, 24 Jan 2024 13:49:47 -0500
Subject: ptp: add FemtoClock3 Wireless as ptp hardware clock

The RENESAS FemtoClock3 Wireless is a high-performance jitter attenuator,
frequency translator, and clock synthesizer. The device is comprised of 3
digital PLLs (DPLL) to track CLKIN inputs and three independent low phase
noise fractional output dividers (FOD) that output low phase noise clocks.

FemtoClock3 supports one Time Synchronization (Time Sync) channel to enable
an external processor to control the phase and frequency of the Time Sync
channel and to take phase measurements using the TDC. Intended applications
are synchronization using the precision time protocol (PTP) and
synchronization with 0.5 Hz and 1 Hz signals from GNSS.

Signed-off-by: Min Li <min.li.xe@renesas.com>
Acked-by: Lee Jones <lee@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/ptp/Kconfig                |   12 +
 drivers/ptp/Makefile               |    1 +
 drivers/ptp/ptp_fc3.c              | 1016 ++++++++++++++++++++++++++++++++++++
 drivers/ptp/ptp_fc3.h              |   45 ++
 include/linux/mfd/idtRC38xxx_reg.h |  273 ++++++++++
 5 files changed, 1347 insertions(+)
 create mode 100644 drivers/ptp/ptp_fc3.c
 create mode 100644 drivers/ptp/ptp_fc3.h
 create mode 100644 include/linux/mfd/idtRC38xxx_reg.h

(limited to 'include')

diff --git a/drivers/ptp/Kconfig b/drivers/ptp/Kconfig
index 5dd5f188e14f..604541dcb320 100644
--- a/drivers/ptp/Kconfig
+++ b/drivers/ptp/Kconfig
@@ -155,6 +155,18 @@ config PTP_1588_CLOCK_IDTCM
 	  To compile this driver as a module, choose M here: the module
 	  will be called ptp_clockmatrix.
 
+config PTP_1588_CLOCK_FC3W
+	tristate "RENESAS FemtoClock3 Wireless as PTP clock"
+	depends on PTP_1588_CLOCK && I2C
+	default n
+	help
+	  This driver adds support for using Renesas FemtoClock3 Wireless
+	  as a PTP clock. This clock is only useful if your time stamping
+	  MAC is connected to the RENESAS chip.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called ptp_fc3.
+
 config PTP_1588_CLOCK_MOCK
 	tristate "Mock-up PTP clock"
 	depends on PTP_1588_CLOCK
diff --git a/drivers/ptp/Makefile b/drivers/ptp/Makefile
index dea0cebd2303..68bf02078053 100644
--- a/drivers/ptp/Makefile
+++ b/drivers/ptp/Makefile
@@ -15,6 +15,7 @@ obj-$(CONFIG_PTP_1588_CLOCK_QORIQ)	+= ptp-qoriq.o
 ptp-qoriq-y				+= ptp_qoriq.o
 ptp-qoriq-$(CONFIG_DEBUG_FS)		+= ptp_qoriq_debugfs.o
 obj-$(CONFIG_PTP_1588_CLOCK_IDTCM)	+= ptp_clockmatrix.o
+obj-$(CONFIG_PTP_1588_CLOCK_FC3W)	+= ptp_fc3.o
 obj-$(CONFIG_PTP_1588_CLOCK_IDT82P33)	+= ptp_idt82p33.o
 obj-$(CONFIG_PTP_1588_CLOCK_MOCK)	+= ptp_mock.o
 obj-$(CONFIG_PTP_1588_CLOCK_VMW)	+= ptp_vmw.o
diff --git a/drivers/ptp/ptp_fc3.c b/drivers/ptp/ptp_fc3.c
new file mode 100644
index 000000000000..0e2286ba088a
--- /dev/null
+++ b/drivers/ptp/ptp_fc3.c
@@ -0,0 +1,1016 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * PTP hardware clock driver for the FemtoClock3 family of timing and
+ * synchronization devices.
+ *
+ * Copyright (C) 2023 Integrated Device Technology, Inc., a Renesas Company.
+ */
+#include <linux/firmware.h>
+#include <linux/platform_device.h>
+#include <linux/module.h>
+#include <linux/ptp_clock_kernel.h>
+#include <linux/delay.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/timekeeping.h>
+#include <linux/string.h>
+#include <linux/of.h>
+#include <linux/bitfield.h>
+#include <linux/mfd/rsmu.h>
+#include <linux/mfd/idtRC38xxx_reg.h>
+#include <asm/unaligned.h>
+
+#include "ptp_private.h"
+#include "ptp_fc3.h"
+
+MODULE_DESCRIPTION("Driver for IDT FemtoClock3(TM) family");
+MODULE_AUTHOR("IDT support-1588 <IDT-support-1588@lm.renesas.com>");
+MODULE_VERSION("1.0");
+MODULE_LICENSE("GPL");
+
+/*
+ * The name of the firmware file to be loaded
+ * over-rides any automatic selection
+ */
+static char *firmware;
+module_param(firmware, charp, 0);
+
+static s64 ns2counters(struct idtfc3 *idtfc3, s64 nsec, u32 *sub_ns)
+{
+	s64 sync;
+	s32 rem;
+
+	if (likely(nsec >= 0)) {
+		sync = div_u64_rem(nsec, idtfc3->ns_per_sync, &rem);
+		*sub_ns = rem;
+	} else {
+		sync = -div_u64_rem(-nsec - 1, idtfc3->ns_per_sync, &rem) - 1;
+		*sub_ns = idtfc3->ns_per_sync - rem - 1;
+	}
+
+	return sync * idtfc3->ns_per_sync;
+}
+
+static s64 tdc_meas2offset(struct idtfc3 *idtfc3, u64 meas_read)
+{
+	s64 coarse, fine;
+
+	fine = sign_extend64(FIELD_GET(FINE_MEAS_MASK, meas_read), 12);
+	coarse = sign_extend64(FIELD_GET(COARSE_MEAS_MASK, meas_read), (39 - 13));
+
+	fine = div64_s64(fine * NSEC_PER_SEC, idtfc3->tdc_apll_freq * 62LL);
+	coarse = div64_s64(coarse * NSEC_PER_SEC, idtfc3->time_ref_freq);
+
+	return coarse + fine;
+}
+
+static s64 tdc_offset2phase(struct idtfc3 *idtfc3, s64 offset_ns)
+{
+	if (offset_ns > idtfc3->ns_per_sync / 2)
+		offset_ns -= idtfc3->ns_per_sync;
+
+	return offset_ns * idtfc3->tdc_offset_sign;
+}
+
+static int idtfc3_set_lpf_mode(struct idtfc3 *idtfc3, u8 mode)
+{
+	int err;
+
+	if (mode >= LPF_INVALID)
+		return -EINVAL;
+
+	if (idtfc3->lpf_mode == mode)
+		return 0;
+
+	err = regmap_bulk_write(idtfc3->regmap, LPF_MODE_CNFG, &mode, sizeof(mode));
+	if (err)
+		return err;
+
+	idtfc3->lpf_mode = mode;
+
+	return 0;
+}
+
+static int idtfc3_enable_lpf(struct idtfc3 *idtfc3, bool enable)
+{
+	u8 val;
+	int err;
+
+	err = regmap_bulk_read(idtfc3->regmap, LPF_CTRL, &val, sizeof(val));
+	if (err)
+		return err;
+
+	if (enable == true)
+		val |= LPF_EN;
+	else
+		val &= ~LPF_EN;
+
+	return regmap_bulk_write(idtfc3->regmap, LPF_CTRL, &val, sizeof(val));
+}
+
+static int idtfc3_get_time_ref_freq(struct idtfc3 *idtfc3)
+{
+	int err;
+	u8 buf[4];
+	u8 time_ref_div;
+	u8 time_clk_div;
+
+	err = regmap_bulk_read(idtfc3->regmap, TIME_CLOCK_MEAS_DIV_CNFG, buf, sizeof(buf));
+	if (err)
+		return err;
+	time_ref_div = FIELD_GET(TIME_REF_DIV_MASK, get_unaligned_le32(buf)) + 1;
+
+	err = regmap_bulk_read(idtfc3->regmap, TIME_CLOCK_COUNT, buf, 1);
+	if (err)
+		return err;
+	time_clk_div = (buf[0] & TIME_CLOCK_COUNT_MASK) + 1;
+	idtfc3->time_ref_freq = idtfc3->hw_param.time_clk_freq *
+				time_clk_div / time_ref_div;
+
+	return 0;
+}
+
+static int idtfc3_get_tdc_offset_sign(struct idtfc3 *idtfc3)
+{
+	int err;
+	u8 buf[4];
+	u32 val;
+	u8 sig1, sig2;
+
+	err = regmap_bulk_read(idtfc3->regmap, TIME_CLOCK_TDC_FANOUT_CNFG, buf, sizeof(buf));
+	if (err)
+		return err;
+
+	val = get_unaligned_le32(buf);
+	if ((val & TIME_SYNC_TO_TDC_EN) != TIME_SYNC_TO_TDC_EN) {
+		dev_err(idtfc3->dev, "TIME_SYNC_TO_TDC_EN is off !!!");
+		return -EINVAL;
+	}
+
+	sig1 = FIELD_GET(SIG1_MUX_SEL_MASK, val);
+	sig2 = FIELD_GET(SIG2_MUX_SEL_MASK, val);
+
+	if ((sig1 == sig2) || ((sig1 != TIME_SYNC) && (sig2 != TIME_SYNC))) {
+		dev_err(idtfc3->dev, "Invalid tdc_mux_sel sig1=%d sig2=%d", sig1, sig2);
+		return -EINVAL;
+	} else if (sig1 == TIME_SYNC) {
+		idtfc3->tdc_offset_sign = 1;
+	} else if (sig2 == TIME_SYNC) {
+		idtfc3->tdc_offset_sign = -1;
+	}
+
+	return 0;
+}
+
+static int idtfc3_lpf_bw(struct idtfc3 *idtfc3, u8 shift, u8 mult)
+{
+	u8 val = FIELD_PREP(LPF_BW_SHIFT, shift) | FIELD_PREP(LPF_BW_MULT, mult);
+
+	return regmap_bulk_write(idtfc3->regmap, LPF_BW_CNFG, &val, sizeof(val));
+}
+
+static int idtfc3_enable_tdc(struct idtfc3 *idtfc3, bool enable, u8 meas_mode)
+{
+	int err;
+	u8 val = 0;
+
+	/* Disable TDC first */
+	err = regmap_bulk_write(idtfc3->regmap, TIME_CLOCK_MEAS_CTRL, &val, sizeof(val));
+	if (err)
+		return err;
+
+	if (enable == false)
+		return idtfc3_lpf_bw(idtfc3, LPF_BW_SHIFT_DEFAULT, LPF_BW_MULT_DEFAULT);
+
+	if (meas_mode >= MEAS_MODE_INVALID)
+		return -EINVAL;
+
+	/* Change TDC meas mode */
+	err = regmap_bulk_write(idtfc3->regmap, TIME_CLOCK_MEAS_CNFG,
+				&meas_mode, sizeof(meas_mode));
+	if (err)
+		return err;
+
+	/* Enable TDC */
+	val = TDC_MEAS_EN;
+	if (meas_mode == CONTINUOUS)
+		val |= TDC_MEAS_START;
+	err = regmap_bulk_write(idtfc3->regmap, TIME_CLOCK_MEAS_CTRL, &val, sizeof(val));
+	if (err)
+		return err;
+
+	return idtfc3_lpf_bw(idtfc3, LPF_BW_SHIFT_1PPS, LPF_BW_MULT_DEFAULT);
+}
+
+static bool get_tdc_meas(struct idtfc3 *idtfc3, s64 *offset_ns)
+{
+	bool valid = false;
+	u8 buf[9];
+	u8 val;
+	int err;
+
+	while (true) {
+		err = regmap_bulk_read(idtfc3->regmap, TDC_FIFO_STS,
+				       &val, sizeof(val));
+		if (err)
+			return false;
+
+		if (val & FIFO_EMPTY)
+			break;
+
+		err = regmap_bulk_read(idtfc3->regmap, TDC_FIFO_READ_REQ,
+				       &buf, sizeof(buf));
+		if (err)
+			return false;
+
+		valid = true;
+	}
+
+	if (valid)
+		*offset_ns = tdc_meas2offset(idtfc3, get_unaligned_le64(&buf[1]));
+
+	return valid;
+}
+
+static int check_tdc_fifo_overrun(struct idtfc3 *idtfc3)
+{
+	u8 val;
+	int err;
+
+	/* Check if FIFO is overrun */
+	err = regmap_bulk_read(idtfc3->regmap, TDC_FIFO_STS, &val, sizeof(val));
+	if (err)
+		return err;
+
+	if (!(val & FIFO_FULL))
+		return 0;
+
+	dev_warn(idtfc3->dev, "TDC FIFO overrun !!!");
+
+	err = idtfc3_enable_tdc(idtfc3, true, CONTINUOUS);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+static int get_tdc_meas_continuous(struct idtfc3 *idtfc3)
+{
+	int err;
+	s64 offset_ns;
+	struct ptp_clock_event event;
+
+	err = check_tdc_fifo_overrun(idtfc3);
+	if (err)
+		return err;
+
+	if (get_tdc_meas(idtfc3, &offset_ns) && offset_ns >= 0) {
+		event.index = 0;
+		event.offset = tdc_offset2phase(idtfc3, offset_ns);
+		event.type = PTP_CLOCK_EXTOFF;
+		ptp_clock_event(idtfc3->ptp_clock, &event);
+	}
+
+	return 0;
+}
+
+static int idtfc3_read_subcounter(struct idtfc3 *idtfc3)
+{
+	u8 buf[5] = {0};
+	int err;
+
+	err = regmap_bulk_read(idtfc3->regmap, TOD_COUNTER_READ_REQ,
+			       &buf, sizeof(buf));
+	if (err)
+		return err;
+
+	/* sync_counter_value is [31:82] and sub_sync_counter_value is [0:30] */
+	return get_unaligned_le32(&buf[1]) & SUB_SYNC_COUNTER_MASK;
+}
+
+static int idtfc3_tod_update_is_done(struct idtfc3 *idtfc3)
+{
+	int err;
+	u8 req;
+
+	err = read_poll_timeout_atomic(regmap_bulk_read, err, !req, USEC_PER_MSEC,
+				       idtfc3->tc_write_timeout, true, idtfc3->regmap,
+				       TOD_SYNC_LOAD_REQ_CTRL, &req, 1);
+	if (err)
+		dev_err(idtfc3->dev, "TOD counter write timeout !!!");
+
+	return err;
+}
+
+static int idtfc3_write_subcounter(struct idtfc3 *idtfc3, u32 counter)
+{
+	u8 buf[18] = {0};
+	int err;
+
+	/* sync_counter_value is [31:82] and sub_sync_counter_value is [0:30] */
+	put_unaligned_le32(counter & SUB_SYNC_COUNTER_MASK, &buf[0]);
+
+	buf[16] = SUB_SYNC_LOAD_ENABLE | SYNC_LOAD_ENABLE;
+	buf[17] = SYNC_LOAD_REQ;
+
+	err = regmap_bulk_write(idtfc3->regmap, TOD_SYNC_LOAD_VAL_CTRL,
+				&buf, sizeof(buf));
+	if (err)
+		return err;
+
+	return idtfc3_tod_update_is_done(idtfc3);
+}
+
+static int idtfc3_timecounter_update(struct idtfc3 *idtfc3, u32 counter, s64 ns)
+{
+	int err;
+
+	err = idtfc3_write_subcounter(idtfc3, counter);
+	if (err)
+		return err;
+
+	/* Update time counter */
+	idtfc3->ns = ns;
+	idtfc3->last_counter = counter;
+
+	return 0;
+}
+
+static int idtfc3_timecounter_read(struct idtfc3 *idtfc3)
+{
+	int now, delta;
+
+	now = idtfc3_read_subcounter(idtfc3);
+	if (now < 0)
+		return now;
+
+	/* calculate the delta since the last idtfc3_timecounter_read(): */
+	if (now >= idtfc3->last_counter)
+		delta = now - idtfc3->last_counter;
+	else
+		delta = idtfc3->sub_sync_count - idtfc3->last_counter + now;
+
+	/* Update time counter */
+	idtfc3->ns += delta * idtfc3->ns_per_counter;
+	idtfc3->last_counter = now;
+
+	return 0;
+}
+
+static int _idtfc3_gettime(struct idtfc3 *idtfc3, struct timespec64 *ts)
+{
+	int err;
+
+	err = idtfc3_timecounter_read(idtfc3);
+	if (err)
+		return err;
+
+	*ts = ns_to_timespec64(idtfc3->ns);
+
+	return 0;
+}
+
+static int idtfc3_gettime(struct ptp_clock_info *ptp, struct timespec64 *ts)
+{
+	struct idtfc3 *idtfc3 = container_of(ptp, struct idtfc3, caps);
+	int err;
+
+	mutex_lock(idtfc3->lock);
+	err = _idtfc3_gettime(idtfc3, ts);
+	mutex_unlock(idtfc3->lock);
+
+	return err;
+}
+
+static int _idtfc3_settime(struct idtfc3 *idtfc3, const struct timespec64 *ts)
+{
+	s64 offset_ns, now_ns;
+	u32 counter, sub_ns;
+	int now;
+
+	if (timespec64_valid(ts) == false) {
+		dev_err(idtfc3->dev, "%s: invalid timespec", __func__);
+		return -EINVAL;
+	}
+
+	now = idtfc3_read_subcounter(idtfc3);
+	if (now < 0)
+		return now;
+
+	offset_ns = (idtfc3->sub_sync_count - now) * idtfc3->ns_per_counter;
+	now_ns = timespec64_to_ns(ts);
+	(void)ns2counters(idtfc3, offset_ns + now_ns, &sub_ns);
+
+	counter = sub_ns / idtfc3->ns_per_counter;
+	return idtfc3_timecounter_update(idtfc3, counter, now_ns);
+}
+
+static int idtfc3_settime(struct ptp_clock_info *ptp, const struct timespec64 *ts)
+{
+	struct idtfc3 *idtfc3 = container_of(ptp, struct idtfc3, caps);
+	int err;
+
+	mutex_lock(idtfc3->lock);
+	err = _idtfc3_settime(idtfc3, ts);
+	mutex_unlock(idtfc3->lock);
+
+	return err;
+}
+
+static int _idtfc3_adjtime(struct idtfc3 *idtfc3, s64 delta)
+{
+	/*
+	 * The TOD counter can be synchronously loaded with any value,
+	 * to be loaded on the next Time Sync pulse
+	 */
+	s64 sync_ns;
+	u32 sub_ns;
+	u32 counter;
+
+	if (idtfc3->ns + delta < 0) {
+		dev_err(idtfc3->dev, "%lld ns adj is too large", delta);
+		return -EINVAL;
+	}
+
+	sync_ns = ns2counters(idtfc3, delta + idtfc3->ns_per_sync, &sub_ns);
+
+	counter = sub_ns / idtfc3->ns_per_counter;
+	return idtfc3_timecounter_update(idtfc3, counter, idtfc3->ns + sync_ns +
+									counter * idtfc3->ns_per_counter);
+}
+
+static int idtfc3_adjtime(struct ptp_clock_info *ptp, s64 delta)
+{
+	struct idtfc3 *idtfc3 = container_of(ptp, struct idtfc3, caps);
+	int err;
+
+	mutex_lock(idtfc3->lock);
+	err = _idtfc3_adjtime(idtfc3, delta);
+	mutex_unlock(idtfc3->lock);
+
+	return err;
+}
+
+static int _idtfc3_adjphase(struct idtfc3 *idtfc3, s32 delta)
+{
+	u8 buf[8] = {0};
+	int err;
+	s64 pcw;
+
+	err = idtfc3_set_lpf_mode(idtfc3, LPF_WP);
+	if (err)
+		return err;
+
+	/*
+	 * Phase Control Word unit is: 10^9 / (TDC_APLL_FREQ * 124)
+	 *
+	 *       delta * TDC_APLL_FREQ * 124
+	 * PCW = ---------------------------
+	 *                  10^9
+	 *
+	 */
+	pcw = div_s64((s64)delta * idtfc3->tdc_apll_freq * 124, NSEC_PER_SEC);
+
+	put_unaligned_le64(pcw, buf);
+
+	return regmap_bulk_write(idtfc3->regmap, LPF_WR_PHASE_CTRL, buf, sizeof(buf));
+}
+
+static int idtfc3_adjphase(struct ptp_clock_info *ptp, s32 delta)
+{
+	struct idtfc3 *idtfc3 = container_of(ptp, struct idtfc3, caps);
+	int err;
+
+	mutex_lock(idtfc3->lock);
+	err = _idtfc3_adjphase(idtfc3, delta);
+	mutex_unlock(idtfc3->lock);
+
+	return err;
+}
+
+static int _idtfc3_adjfine(struct idtfc3 *idtfc3, long scaled_ppm)
+{
+	u8 buf[8] = {0};
+	int err;
+	s64 fcw;
+
+	err = idtfc3_set_lpf_mode(idtfc3, LPF_WF);
+	if (err)
+		return err;
+
+	/*
+	 * Frequency Control Word unit is: 2^-44 * 10^6 ppm
+	 *
+	 * adjfreq:
+	 *       ppb * 2^44
+	 * FCW = ----------
+	 *          10^9
+	 *
+	 * adjfine:
+	 *       ppm_16 * 2^28
+	 * FCW = -------------
+	 *           10^6
+	 */
+	fcw = scaled_ppm * BIT(28);
+	fcw = div_s64(fcw, 1000000);
+
+	put_unaligned_le64(fcw, buf);
+
+	return regmap_bulk_write(idtfc3->regmap, LPF_WR_FREQ_CTRL, buf, sizeof(buf));
+}
+
+static int idtfc3_adjfine(struct ptp_clock_info *ptp, long scaled_ppm)
+{
+	struct idtfc3 *idtfc3 = container_of(ptp, struct idtfc3, caps);
+	int err;
+
+	mutex_lock(idtfc3->lock);
+	err = _idtfc3_adjfine(idtfc3, scaled_ppm);
+	mutex_unlock(idtfc3->lock);
+
+	return err;
+}
+
+static int idtfc3_enable(struct ptp_clock_info *ptp,
+			 struct ptp_clock_request *rq, int on)
+{
+	struct idtfc3 *idtfc3 = container_of(ptp, struct idtfc3, caps);
+	int err = -EOPNOTSUPP;
+
+	mutex_lock(idtfc3->lock);
+	switch (rq->type) {
+	case PTP_CLK_REQ_PEROUT:
+		if (!on)
+			err = 0;
+		/* Only accept a 1-PPS aligned to the second. */
+		else if (rq->perout.start.nsec || rq->perout.period.sec != 1 ||
+			 rq->perout.period.nsec)
+			err = -ERANGE;
+		else
+			err = 0;
+		break;
+	case PTP_CLK_REQ_EXTTS:
+		if (on) {
+			/* Only accept requests for external phase offset */
+			if ((rq->extts.flags & PTP_EXT_OFFSET) != (PTP_EXT_OFFSET))
+				err = -EOPNOTSUPP;
+			else
+				err = idtfc3_enable_tdc(idtfc3, true, CONTINUOUS);
+		} else {
+			err = idtfc3_enable_tdc(idtfc3, false, MEAS_MODE_INVALID);
+		}
+		break;
+	default:
+		break;
+	}
+	mutex_unlock(idtfc3->lock);
+
+	if (err)
+		dev_err(idtfc3->dev, "Failed in %s with err %d!", __func__, err);
+
+	return err;
+}
+
+static long idtfc3_aux_work(struct ptp_clock_info *ptp)
+{
+	struct idtfc3 *idtfc3 = container_of(ptp, struct idtfc3, caps);
+	static int tdc_get;
+
+	mutex_lock(idtfc3->lock);
+	tdc_get %= TDC_GET_PERIOD;
+	if ((tdc_get == 0) || (tdc_get == TDC_GET_PERIOD / 2))
+		idtfc3_timecounter_read(idtfc3);
+	get_tdc_meas_continuous(idtfc3);
+	tdc_get++;
+	mutex_unlock(idtfc3->lock);
+
+	return idtfc3->tc_update_period;
+}
+
+static const struct ptp_clock_info idtfc3_caps = {
+	.owner		= THIS_MODULE,
+	.max_adj	= MAX_FFO_PPB,
+	.n_per_out	= 1,
+	.n_ext_ts	= 1,
+	.adjphase	= &idtfc3_adjphase,
+	.adjfine	= &idtfc3_adjfine,
+	.adjtime	= &idtfc3_adjtime,
+	.gettime64	= &idtfc3_gettime,
+	.settime64	= &idtfc3_settime,
+	.enable		= &idtfc3_enable,
+	.do_aux_work	= &idtfc3_aux_work,
+};
+
+static int idtfc3_hw_calibrate(struct idtfc3 *idtfc3)
+{
+	int err = 0;
+	u8 val;
+
+	mdelay(10);
+	/*
+	 * Toggle TDC_DAC_RECAL_REQ:
+	 * (1) set tdc_en to 1
+	 * (2) set tdc_dac_recal_req to 0
+	 * (3) set tdc_dac_recal_req to 1
+	 */
+	val = TDC_EN;
+	err = regmap_bulk_write(idtfc3->regmap, TDC_CTRL,
+				&val, sizeof(val));
+	if (err)
+		return err;
+	val = TDC_EN | TDC_DAC_RECAL_REQ;
+	err = regmap_bulk_write(idtfc3->regmap, TDC_CTRL,
+				&val, sizeof(val));
+	if (err)
+		return err;
+	mdelay(10);
+
+	/*
+	 * Toggle APLL_REINIT:
+	 * (1) set apll_reinit to 0
+	 * (2) set apll_reinit to 1
+	 */
+	val = 0;
+	err = regmap_bulk_write(idtfc3->regmap, SOFT_RESET_CTRL,
+				&val, sizeof(val));
+	if (err)
+		return err;
+	val = APLL_REINIT;
+	err = regmap_bulk_write(idtfc3->regmap, SOFT_RESET_CTRL,
+				&val, sizeof(val));
+	if (err)
+		return err;
+	mdelay(10);
+
+	return err;
+}
+
+static int idtfc3_init_timecounter(struct idtfc3 *idtfc3)
+{
+	int err;
+	u32 period_ms;
+
+	period_ms = idtfc3->sub_sync_count * MSEC_PER_SEC /
+			idtfc3->hw_param.time_clk_freq;
+
+	idtfc3->tc_update_period = msecs_to_jiffies(period_ms / TDC_GET_PERIOD);
+	idtfc3->tc_write_timeout = period_ms * USEC_PER_MSEC;
+
+	err = idtfc3_timecounter_update(idtfc3, 0, 0);
+	if (err)
+		return err;
+
+	err = idtfc3_timecounter_read(idtfc3);
+	if (err)
+		return err;
+
+	ptp_schedule_worker(idtfc3->ptp_clock, idtfc3->tc_update_period);
+
+	return 0;
+}
+
+static int idtfc3_get_tdc_apll_freq(struct idtfc3 *idtfc3)
+{
+	int err;
+	u8 tdc_fb_div_int;
+	u8 tdc_ref_div;
+	struct idtfc3_hw_param *param = &idtfc3->hw_param;
+
+	err = regmap_bulk_read(idtfc3->regmap, TDC_REF_DIV_CNFG,
+				&tdc_ref_div, sizeof(tdc_ref_div));
+	if (err)
+		return err;
+
+	err = regmap_bulk_read(idtfc3->regmap, TDC_FB_DIV_INT_CNFG,
+				&tdc_fb_div_int, sizeof(tdc_fb_div_int));
+	if (err)
+		return err;
+
+	tdc_fb_div_int &= TDC_FB_DIV_INT_MASK;
+	tdc_ref_div &= TDC_REF_DIV_CONFIG_MASK;
+
+	idtfc3->tdc_apll_freq = div_u64(param->xtal_freq * (u64)tdc_fb_div_int,
+					1 << tdc_ref_div);
+
+	return 0;
+}
+
+static int idtfc3_get_fod(struct idtfc3 *idtfc3)
+{
+	int err;
+	u8 fod;
+
+	err = regmap_bulk_read(idtfc3->regmap, TIME_CLOCK_SRC, &fod, sizeof(fod));
+	if (err)
+		return err;
+
+	switch (fod) {
+	case 0:
+		idtfc3->fod_n = FOD_0;
+		break;
+	case 1:
+		idtfc3->fod_n = FOD_1;
+		break;
+	case 2:
+		idtfc3->fod_n = FOD_2;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int idtfc3_get_sync_count(struct idtfc3 *idtfc3)
+{
+	int err;
+	u8 buf[4];
+
+	err = regmap_bulk_read(idtfc3->regmap, SUB_SYNC_GEN_CNFG, buf, sizeof(buf));
+	if (err)
+		return err;
+
+	idtfc3->sub_sync_count = (get_unaligned_le32(buf) & SUB_SYNC_COUNTER_MASK) + 1;
+	idtfc3->ns_per_counter = NSEC_PER_SEC / idtfc3->hw_param.time_clk_freq;
+	idtfc3->ns_per_sync = idtfc3->sub_sync_count * idtfc3->ns_per_counter;
+
+	return 0;
+}
+
+static int idtfc3_setup_hw_param(struct idtfc3 *idtfc3)
+{
+	int err;
+
+	err = idtfc3_get_fod(idtfc3);
+	if (err)
+		return err;
+
+	err = idtfc3_get_sync_count(idtfc3);
+	if (err)
+		return err;
+
+	err = idtfc3_get_time_ref_freq(idtfc3);
+	if (err)
+		return err;
+
+	return idtfc3_get_tdc_apll_freq(idtfc3);
+}
+
+static int idtfc3_configure_hw(struct idtfc3 *idtfc3)
+{
+	int err = 0;
+
+	err = idtfc3_hw_calibrate(idtfc3);
+	if (err)
+		return err;
+
+	err = idtfc3_enable_lpf(idtfc3, true);
+	if (err)
+		return err;
+
+	err = idtfc3_enable_tdc(idtfc3, false, MEAS_MODE_INVALID);
+	if (err)
+		return err;
+
+	err = idtfc3_get_tdc_offset_sign(idtfc3);
+	if (err)
+		return err;
+
+	return idtfc3_setup_hw_param(idtfc3);
+}
+
+static int idtfc3_set_overhead(struct idtfc3 *idtfc3)
+{
+	s64 current_ns = 0;
+	s64 lowest_ns = 0;
+	int err;
+	u8 i;
+	ktime_t start;
+	ktime_t stop;
+	ktime_t diff;
+
+	char buf[18] = {0};
+
+	for (i = 0; i < 5; i++) {
+		start = ktime_get_raw();
+
+		err = regmap_bulk_write(idtfc3->regmap, TOD_SYNC_LOAD_VAL_CTRL,
+					&buf, sizeof(buf));
+		if (err)
+			return err;
+
+		stop = ktime_get_raw();
+
+		diff = ktime_sub(stop, start);
+
+		current_ns = ktime_to_ns(diff);
+
+		if (i == 0) {
+			lowest_ns = current_ns;
+		} else {
+			if (current_ns < lowest_ns)
+				lowest_ns = current_ns;
+		}
+	}
+
+	idtfc3->tod_write_overhead = lowest_ns;
+
+	return err;
+}
+
+static int idtfc3_enable_ptp(struct idtfc3 *idtfc3)
+{
+	int err;
+
+	idtfc3->caps = idtfc3_caps;
+	snprintf(idtfc3->caps.name, sizeof(idtfc3->caps.name), "IDT FC3W");
+	idtfc3->ptp_clock = ptp_clock_register(&idtfc3->caps, NULL);
+
+	if (IS_ERR(idtfc3->ptp_clock)) {
+		err = PTR_ERR(idtfc3->ptp_clock);
+		idtfc3->ptp_clock = NULL;
+		return err;
+	}
+
+	err = idtfc3_set_overhead(idtfc3);
+	if (err)
+		return err;
+
+	err = idtfc3_init_timecounter(idtfc3);
+	if (err)
+		return err;
+
+	dev_info(idtfc3->dev, "TIME_SYNC_CHANNEL registered as ptp%d",
+		 idtfc3->ptp_clock->index);
+
+	return 0;
+}
+
+static int idtfc3_load_firmware(struct idtfc3 *idtfc3)
+{
+	char fname[128] = FW_FILENAME;
+	const struct firmware *fw;
+	struct idtfc3_fwrc *rec;
+	u16 addr;
+	u8 val;
+	int err;
+	s32 len;
+
+	idtfc3_default_hw_param(&idtfc3->hw_param);
+
+	if (firmware) /* module parameter */
+		snprintf(fname, sizeof(fname), "%s", firmware);
+
+	dev_info(idtfc3->dev, "requesting firmware '%s'\n", fname);
+
+	err = request_firmware(&fw, fname, idtfc3->dev);
+
+	if (err) {
+		dev_err(idtfc3->dev,
+			"requesting firmware failed with err %d!\n", err);
+		return err;
+	}
+
+	dev_dbg(idtfc3->dev, "firmware size %zu bytes\n", fw->size);
+
+	rec = (struct idtfc3_fwrc *)fw->data;
+
+	for (len = fw->size; len > 0; len -= sizeof(*rec)) {
+		if (rec->reserved) {
+			dev_err(idtfc3->dev,
+				"bad firmware, reserved field non-zero\n");
+			err = -EINVAL;
+		} else {
+			val = rec->value;
+			addr = rec->hiaddr << 8 | rec->loaddr;
+
+			rec++;
+
+			err = idtfc3_set_hw_param(&idtfc3->hw_param, addr, val);
+		}
+
+		if (err != -EINVAL) {
+			err = 0;
+
+			/* Max register */
+			if (addr >= 0xE88)
+				continue;
+
+			err = regmap_bulk_write(idtfc3->regmap, addr,
+						&val, sizeof(val));
+		}
+
+		if (err)
+			goto out;
+	}
+
+	err = idtfc3_configure_hw(idtfc3);
+out:
+	release_firmware(fw);
+	return err;
+}
+
+static int idtfc3_read_device_id(struct idtfc3 *idtfc3, u16 *device_id)
+{
+	int err;
+	u8 buf[2] = {0};
+
+	err = regmap_bulk_read(idtfc3->regmap, DEVICE_ID,
+			       &buf, sizeof(buf));
+	if (err) {
+		dev_err(idtfc3->dev, "%s failed with %d", __func__, err);
+		return err;
+	}
+
+	*device_id = get_unaligned_le16(buf);
+
+	return 0;
+}
+
+static int idtfc3_check_device_compatibility(struct idtfc3 *idtfc3)
+{
+	int err;
+	u16 device_id;
+
+	err = idtfc3_read_device_id(idtfc3, &device_id);
+	if (err)
+		return err;
+
+	if ((device_id & DEVICE_ID_MASK) == 0) {
+		dev_err(idtfc3->dev, "invalid device");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int idtfc3_probe(struct platform_device *pdev)
+{
+	struct rsmu_ddata *ddata = dev_get_drvdata(pdev->dev.parent);
+	struct idtfc3 *idtfc3;
+	int err;
+
+	idtfc3 = devm_kzalloc(&pdev->dev, sizeof(struct idtfc3), GFP_KERNEL);
+
+	if (!idtfc3)
+		return -ENOMEM;
+
+	idtfc3->dev = &pdev->dev;
+	idtfc3->mfd = pdev->dev.parent;
+	idtfc3->lock = &ddata->lock;
+	idtfc3->regmap = ddata->regmap;
+
+	mutex_lock(idtfc3->lock);
+
+	err = idtfc3_check_device_compatibility(idtfc3);
+	if (err) {
+		mutex_unlock(idtfc3->lock);
+		return err;
+	}
+
+	err = idtfc3_load_firmware(idtfc3);
+	if (err) {
+		if (err == -ENOENT) {
+			mutex_unlock(idtfc3->lock);
+			return -EPROBE_DEFER;
+		}
+		dev_warn(idtfc3->dev, "loading firmware failed with %d", err);
+	}
+
+	err = idtfc3_enable_ptp(idtfc3);
+	if (err) {
+		dev_err(idtfc3->dev, "idtfc3_enable_ptp failed with %d", err);
+		mutex_unlock(idtfc3->lock);
+		return err;
+	}
+
+	mutex_unlock(idtfc3->lock);
+
+	if (err) {
+		ptp_clock_unregister(idtfc3->ptp_clock);
+		return err;
+	}
+
+	platform_set_drvdata(pdev, idtfc3);
+
+	return 0;
+}
+
+static int idtfc3_remove(struct platform_device *pdev)
+{
+	struct idtfc3 *idtfc3 = platform_get_drvdata(pdev);
+
+	ptp_clock_unregister(idtfc3->ptp_clock);
+
+	return 0;
+}
+
+static struct platform_driver idtfc3_driver = {
+	.driver = {
+		.name = "rc38xxx-phc",
+	},
+	.probe = idtfc3_probe,
+	.remove	= idtfc3_remove,
+};
+
+module_platform_driver(idtfc3_driver);
diff --git a/drivers/ptp/ptp_fc3.h b/drivers/ptp/ptp_fc3.h
new file mode 100644
index 000000000000..897101579207
--- /dev/null
+++ b/drivers/ptp/ptp_fc3.h
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * PTP hardware clock driver for the FemtoClock3 family of timing and
+ * synchronization devices.
+ *
+ * Copyright (C) 2023 Integrated Device Technology, Inc., a Renesas Company.
+ */
+#ifndef PTP_IDTFC3_H
+#define PTP_IDTFC3_H
+
+#include <linux/ktime.h>
+#include <linux/ptp_clock.h>
+#include <linux/regmap.h>
+
+#define FW_FILENAME	"idtfc3.bin"
+
+#define MAX_FFO_PPB	(244000)
+#define TDC_GET_PERIOD	(10)
+
+struct idtfc3 {
+	struct ptp_clock_info	caps;
+	struct ptp_clock	*ptp_clock;
+	struct device		*dev;
+	/* Mutex to protect operations from being interrupted */
+	struct mutex		*lock;
+	struct device		*mfd;
+	struct regmap		*regmap;
+	struct idtfc3_hw_param	hw_param;
+	u32			sub_sync_count;
+	u32			ns_per_sync;
+	int			tdc_offset_sign;
+	u64			tdc_apll_freq;
+	u32			time_ref_freq;
+	u16			fod_n;
+	u8			lpf_mode;
+	/* Time counter */
+	u32			last_counter;
+	s64			ns;
+	u32			ns_per_counter;
+	u32			tc_update_period;
+	u32			tc_write_timeout;
+	s64			tod_write_overhead;
+};
+
+#endif /* PTP_IDTFC3_H */
diff --git a/include/linux/mfd/idtRC38xxx_reg.h b/include/linux/mfd/idtRC38xxx_reg.h
new file mode 100644
index 000000000000..ec11872f51ad
--- /dev/null
+++ b/include/linux/mfd/idtRC38xxx_reg.h
@@ -0,0 +1,273 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Register Map - Based on PolarBear_CSRs.RevA.xlsx (2023-04-21)
+ *
+ * Copyright (C) 2023 Integrated Device Technology, Inc., a Renesas Company.
+ */
+#ifndef MFD_IDTRC38XXX_REG
+#define MFD_IDTRC38XXX_REG
+
+/* GLOBAL */
+#define SOFT_RESET_CTRL		(0x15) /* Specific to FC3W */
+#define MISC_CTRL		(0x14) /* Specific to FC3A */
+#define APLL_REINIT		BIT(1)
+#define APLL_REINIT_VFC3A	BIT(2)
+
+#define DEVICE_ID		(0x2)
+#define DEVICE_ID_MASK		(0x1000) /* Bit 12 is 1 if FC3W and 0 if FC3A */
+#define DEVICE_ID_SHIFT		(12)
+
+/* FOD */
+#define FOD_0		(0x300)
+#define FOD_0_VFC3A	(0x400)
+#define FOD_1		(0x340)
+#define FOD_1_VFC3A	(0x440)
+#define FOD_2		(0x380)
+#define FOD_2_VFC3A	(0x480)
+
+/* TDCAPLL */
+#define TDC_CTRL		(0x44a) /* Specific to FC3W */
+#define TDC_ENABLE_CTRL		(0x169) /* Specific to FC3A */
+#define TDC_DAC_CAL_CTRL	(0x16a) /* Specific to FC3A */
+#define TDC_EN			BIT(0)
+#define TDC_DAC_RECAL_REQ	BIT(1)
+#define TDC_DAC_RECAL_REQ_VFC3A	BIT(0)
+
+#define TDC_FB_DIV_INT_CNFG		(0x442)
+#define TDC_FB_DIV_INT_CNFG_VFC3A	(0x162)
+#define TDC_FB_DIV_INT_MASK		GENMASK(7, 0)
+#define TDC_REF_DIV_CNFG		(0x443)
+#define TDC_REF_DIV_CNFG_VFC3A		(0x163)
+#define TDC_REF_DIV_CONFIG_MASK		GENMASK(2, 0)
+
+/* TIME SYNC CHANNEL */
+#define TIME_CLOCK_SRC		(0xa01) /* Specific to FC3W */
+#define TIME_CLOCK_COUNT	(0xa00) /* Specific to FC3W */
+#define TIME_CLOCK_COUNT_MASK	GENMASK(5, 0)
+
+#define SUB_SYNC_GEN_CNFG	(0xa04)
+
+#define TOD_COUNTER_READ_REQ		(0xa5f)
+#define TOD_COUNTER_READ_REQ_VFC3A	(0x6df)
+#define TOD_SYNC_LOAD_VAL_CTRL		(0xa10)
+#define TOD_SYNC_LOAD_VAL_CTRL_VFC3A	(0x690)
+#define SYNC_COUNTER_MASK		GENMASK_ULL(51, 0)
+#define SUB_SYNC_COUNTER_MASK		GENMASK(30, 0)
+#define TOD_SYNC_LOAD_REQ_CTRL		(0xa21)
+#define TOD_SYNC_LOAD_REQ_CTRL_VFC3A	(0x6a1)
+#define SYNC_LOAD_ENABLE		BIT(1)
+#define SUB_SYNC_LOAD_ENABLE		BIT(0)
+#define SYNC_LOAD_REQ			BIT(0)
+
+#define LPF_MODE_CNFG		(0xa80)
+#define LPF_MODE_CNFG_VFC3A	(0x700)
+enum lpf_mode {
+	LPF_DISABLED = 0,
+	LPF_WP       = 1,
+	LPF_HOLDOVER = 2,
+	LPF_WF       = 3,
+	LPF_INVALID  = 4
+};
+#define LPF_CTRL	(0xa98)
+#define LPF_CTRL_VFC3A	(0x718)
+#define LPF_EN		BIT(0)
+
+#define LPF_BW_CNFG	(0xa81)
+#define LPF_BW_SHIFT	GENMASK(7, 3)
+#define LPF_BW_MULT		GENMASK(2, 0)
+#define LPF_BW_SHIFT_DEFAULT	(0xb)
+#define LPF_BW_MULT_DEFAULT		(0x0)
+#define LPF_BW_SHIFT_1PPS		(0x5)
+
+#define LPF_WR_PHASE_CTRL	(0xaa8)
+#define LPF_WR_PHASE_CTRL_VFC3A	(0x728)
+#define LPF_WR_FREQ_CTRL	(0xab0)
+#define LPF_WR_FREQ_CTRL_VFC3A	(0x730)
+
+#define TIME_CLOCK_TDC_FANOUT_CNFG	(0xB00)
+#define TIME_SYNC_TO_TDC_EN	BIT(0)
+#define SIG1_MUX_SEL_MASK	GENMASK(7, 4)
+#define SIG2_MUX_SEL_MASK	GENMASK(11, 8)
+enum tdc_mux_sel {
+	REF0 = 0,
+	REF1 = 1,
+	REF2 = 2,
+	REF3 = 3,
+	REF_CLK5 = 4,
+	REF_CLK6 = 5,
+	DPLL_FB_TO_TDC = 6,
+	DPLL_FB_DIVIDED_TO_TDC = 7,
+	TIME_CLK_DIVIDED = 8,
+	TIME_SYNC = 9,
+};
+
+#define TIME_CLOCK_MEAS_CNFG	(0xB04)
+#define TDC_MEAS_MODE	BIT(0)
+enum tdc_meas_mode {
+	CONTINUOUS = 0,
+	ONE_SHOT = 1,
+	MEAS_MODE_INVALID = 2,
+};
+
+#define TIME_CLOCK_MEAS_DIV_CNFG	(0xB08)
+#define TIME_REF_DIV_MASK	GENMASK(29, 24)
+
+#define TIME_CLOCK_MEAS_CTRL	(0xB10)
+#define TDC_MEAS_EN	BIT(0)
+#define TDC_MEAS_START	BIT(1)
+
+#define TDC_FIFO_READ_REQ	(0xB2F)
+#define TDC_FIFO_READ		(0xB30)
+#define COARSE_MEAS_MASK	GENMASK_ULL(39, 13)
+#define FINE_MEAS_MASK		GENMASK(12, 0)
+
+#define TDC_FIFO_CTRL		(0xB12)
+#define FIFO_CLEAR		BIT(0)
+#define TDC_FIFO_STS		(0xB38)
+#define FIFO_FULL		BIT(1)
+#define FIFO_EMPTY		BIT(0)
+#define TDC_FIFO_EVENT		(0xB39)
+#define FIFO_OVERRUN		BIT(1)
+
+/* DPLL */
+#define MAX_REFERENCE_INDEX	(3)
+#define MAX_NUM_REF_PRIORITY	(4)
+
+#define MAX_DPLL_INDEX	(2)
+
+#define DPLL_STS		(0x580)
+#define DPLL_STS_VFC3A		(0x571)
+#define DPLL_STATE_STS_MASK	(0x70)
+#define DPLL_STATE_STS_SHIFT	(4)
+#define DPLL_REF_SEL_STS_MASK	(0x6)
+#define DPLL_REF_SEL_STS_SHIFT	(1)
+
+#define DPLL_REF_PRIORITY_CNFG			(0x502)
+#define DPLL_REFX_PRIORITY_DISABLE_MASK		(0xf)
+#define DPLL_REF0_PRIORITY_ENABLE_AND_SET_MASK	(0x31)
+#define DPLL_REF1_PRIORITY_ENABLE_AND_SET_MASK	(0xc2)
+#define DPLL_REF2_PRIORITY_ENABLE_AND_SET_MASK	(0x304)
+#define DPLL_REF3_PRIORITY_ENABLE_AND_SET_MASK	(0xc08)
+#define DPLL_REF0_PRIORITY_SHIFT		(4)
+#define DPLL_REF1_PRIORITY_SHIFT		(6)
+#define DPLL_REF2_PRIORITY_SHIFT		(8)
+#define DPLL_REF3_PRIORITY_SHIFT		(10)
+
+enum dpll_state {
+	DPLL_STATE_MIN             = 0,
+	DPLL_STATE_FREERUN         = DPLL_STATE_MIN,
+	DPLL_STATE_LOCKED          = 1,
+	DPLL_STATE_HOLDOVER        = 2,
+	DPLL_STATE_WRITE_FREQUENCY = 3,
+	DPLL_STATE_ACQUIRE         = 4,
+	DPLL_STATE_HITLESS_SWITCH  = 5,
+	DPLL_STATE_MAX             = DPLL_STATE_HITLESS_SWITCH
+};
+
+/* REFMON */
+#define LOSMON_STS_0		(0x81e)
+#define LOSMON_STS_0_VFC3A	(0x18e)
+#define LOSMON_STS_1		(0x82e)
+#define LOSMON_STS_1_VFC3A	(0x19e)
+#define LOSMON_STS_2		(0x83e)
+#define LOSMON_STS_2_VFC3A	(0x1ae)
+#define LOSMON_STS_3		(0x84e)
+#define LOSMON_STS_3_VFC3A	(0x1be)
+#define LOS_STS_MASK		(0x1)
+
+#define FREQMON_STS_0		(0x874)
+#define FREQMON_STS_0_VFC3A	(0x1d4)
+#define FREQMON_STS_1		(0x894)
+#define FREQMON_STS_1_VFC3A	(0x1f4)
+#define FREQMON_STS_2		(0x8b4)
+#define FREQMON_STS_2_VFC3A	(0x214)
+#define FREQMON_STS_3		(0x8d4)
+#define FREQMON_STS_3_VFC3A	(0x234)
+#define FREQ_FAIL_STS_SHIFT	(31)
+
+/* Firmware interface */
+#define TIME_CLK_FREQ_ADDR	(0xffa0)
+#define XTAL_FREQ_ADDR		(0xffa1)
+
+/*
+ * Return register address and field mask based on passed in firmware version
+ */
+#define IDTFC3_FW_REG(FW, VER, REG)	(((FW) < (VER)) ? (REG) : (REG##_##VER))
+#define IDTFC3_FW_FIELD(FW, VER, FIELD)	(((FW) < (VER)) ? (FIELD) : (FIELD##_##VER))
+enum fw_version {
+	V_DEFAULT = 0,
+	VFC3W     = 1,
+	VFC3A     = 2
+};
+
+/* XTAL_FREQ_ADDR/TIME_CLK_FREQ_ADDR */
+enum {
+	FREQ_MIN     = 0,
+	FREQ_25M     = 1,
+	FREQ_49_152M = 2,
+	FREQ_50M     = 3,
+	FREQ_100M    = 4,
+	FREQ_125M    = 5,
+	FREQ_250M    = 6,
+	FREQ_MAX
+};
+
+struct idtfc3_hw_param {
+	u32 xtal_freq;
+	u32 time_clk_freq;
+};
+
+struct idtfc3_fwrc {
+	u8 hiaddr;
+	u8 loaddr;
+	u8 value;
+	u8 reserved;
+} __packed;
+
+static inline void idtfc3_default_hw_param(struct idtfc3_hw_param *hw_param)
+{
+	hw_param->xtal_freq = 49152000;
+	hw_param->time_clk_freq = 25000000;
+}
+
+static inline int idtfc3_set_hw_param(struct idtfc3_hw_param *hw_param,
+				      u16 addr, u8 val)
+{
+	if (addr == XTAL_FREQ_ADDR)
+		switch (val) {
+		case FREQ_49_152M:
+			hw_param->xtal_freq = 49152000;
+			break;
+		case FREQ_50M:
+			hw_param->xtal_freq = 50000000;
+			break;
+		default:
+			return -EINVAL;
+		}
+	else if (addr == TIME_CLK_FREQ_ADDR)
+		switch (val) {
+		case FREQ_25M:
+			hw_param->time_clk_freq = 25000000;
+			break;
+		case FREQ_50M:
+			hw_param->time_clk_freq = 50000000;
+			break;
+		case FREQ_100M:
+			hw_param->time_clk_freq = 100000000;
+			break;
+		case FREQ_125M:
+			hw_param->time_clk_freq = 125000000;
+			break;
+		case FREQ_250M:
+			hw_param->time_clk_freq = 250000000;
+			break;
+		default:
+			return -EINVAL;
+		}
+	else
+		return -EFAULT;
+
+	return 0;
+}
+
+#endif
-- 
cgit v1.2.3


From 941988af572434e4aa93fb0f2f509f92adfd691a Mon Sep 17 00:00:00 2001
From: Phil Sutter <phil@nwl.cc>
Date: Thu, 21 Dec 2023 14:31:57 +0100
Subject: netfilter: uapi: Document NFT_TABLE_F_OWNER flag

Add at least this one-liner describing the obvious.

Fixes: 6001a930ce03 ("netfilter: nftables: introduce table ownership")
Signed-off-by: Phil Sutter <phil@nwl.cc>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 include/uapi/linux/netfilter/nf_tables.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index ca30232b7bc8..fbce238abdc1 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -179,6 +179,7 @@ enum nft_hook_attributes {
  * enum nft_table_flags - nf_tables table flags
  *
  * @NFT_TABLE_F_DORMANT: this table is not active
+ * @NFT_TABLE_F_OWNER:   this table is owned by a process
  */
 enum nft_table_flags {
 	NFT_TABLE_F_DORMANT	= 0x1,
-- 
cgit v1.2.3


From da5141bbe0c2693d85f14a89ee991921904f4d0c Mon Sep 17 00:00:00 2001
From: Phil Sutter <phil@nwl.cc>
Date: Thu, 21 Dec 2023 14:31:58 +0100
Subject: netfilter: nf_tables: Introduce NFT_TABLE_F_PERSIST

This companion flag to NFT_TABLE_F_OWNER requests the kernel to keep the
table around after the process has exited. It marks such table as
orphaned (by dropping OWNER flag but keeping PERSIST flag in place),
which opens it for other processes to manipulate. For the sake of
simplicity, PERSIST flag may not be altered though.

Signed-off-by: Phil Sutter <phil@nwl.cc>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 include/uapi/linux/netfilter/nf_tables.h | 5 ++++-
 net/netfilter/nf_tables_api.c            | 7 +++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index fbce238abdc1..3fee994721cd 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -180,13 +180,16 @@ enum nft_hook_attributes {
  *
  * @NFT_TABLE_F_DORMANT: this table is not active
  * @NFT_TABLE_F_OWNER:   this table is owned by a process
+ * @NFT_TABLE_F_PERSIST: this table shall outlive its owner
  */
 enum nft_table_flags {
 	NFT_TABLE_F_DORMANT	= 0x1,
 	NFT_TABLE_F_OWNER	= 0x2,
+	NFT_TABLE_F_PERSIST	= 0x4,
 };
 #define NFT_TABLE_F_MASK	(NFT_TABLE_F_DORMANT | \
-				 NFT_TABLE_F_OWNER)
+				 NFT_TABLE_F_OWNER | \
+				 NFT_TABLE_F_PERSIST)
 
 /**
  * enum nft_table_attributes - nf_tables table netlink attributes
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index c537104411e7..6a96f0003faa 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -1219,6 +1219,9 @@ static int nf_tables_updtable(struct nft_ctx *ctx)
 	     flags & NFT_TABLE_F_OWNER))
 		return -EOPNOTSUPP;
 
+	if ((flags ^ ctx->table->flags) & NFT_TABLE_F_PERSIST)
+		return -EOPNOTSUPP;
+
 	/* No dormant off/on/off/on games in single transaction */
 	if (ctx->table->flags & __NFT_TABLE_F_UPDATE)
 		return -EINVAL;
@@ -11345,6 +11348,10 @@ again:
 	list_for_each_entry(table, &nft_net->tables, list) {
 		if (nft_table_has_owner(table) &&
 		    n->portid == table->nlpid) {
+			if (table->flags & NFT_TABLE_F_PERSIST) {
+				table->flags &= ~NFT_TABLE_F_OWNER;
+				continue;
+			}
 			__nft_release_hook(net, table);
 			list_del_rcu(&table->list);
 			to_delete[deleted++] = table;
-- 
cgit v1.2.3


From 31bf508be656a429a17e3adb31e9acae5c1a6299 Mon Sep 17 00:00:00 2001
From: Phil Sutter <phil@nwl.cc>
Date: Thu, 21 Dec 2023 14:31:59 +0100
Subject: netfilter: nf_tables: Implement table adoption support

Allow a new process to take ownership of a previously owned table,
useful mostly for firewall management services restarting or suspending
when idle.

By extending __NFT_TABLE_F_UPDATE, the on/off/on check in
nf_tables_updtable() also covers table adoption, although it is actually
not needed: Table adoption is irreversible because nf_tables_updtable()
rejects attempts to drop NFT_TABLE_F_OWNER so table->nlpid setting can
happen just once within the transaction.

If the transaction commences, table's nlpid and flags fields are already
set and no further action is required. If it aborts, the table returns
to orphaned state.

Signed-off-by: Phil Sutter <phil@nwl.cc>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 include/net/netfilter/nf_tables.h |  6 ++++++
 net/netfilter/nf_tables_api.c     | 19 ++++++++++++++++---
 2 files changed, 22 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 4e1ea18eb5f0..ac7c94d3648e 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -1271,6 +1271,12 @@ static inline bool nft_table_has_owner(const struct nft_table *table)
 	return table->flags & NFT_TABLE_F_OWNER;
 }
 
+static inline bool nft_table_is_orphan(const struct nft_table *table)
+{
+	return (table->flags & (NFT_TABLE_F_OWNER | NFT_TABLE_F_PERSIST)) ==
+			NFT_TABLE_F_PERSIST;
+}
+
 static inline bool nft_base_chain_netdev(int family, u32 hooknum)
 {
 	return family == NFPROTO_NETDEV ||
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 6a96f0003faa..b0e0d039897e 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -1194,8 +1194,10 @@ static void nf_tables_table_disable(struct net *net, struct nft_table *table)
 #define __NFT_TABLE_F_INTERNAL		(NFT_TABLE_F_MASK + 1)
 #define __NFT_TABLE_F_WAS_DORMANT	(__NFT_TABLE_F_INTERNAL << 0)
 #define __NFT_TABLE_F_WAS_AWAKEN	(__NFT_TABLE_F_INTERNAL << 1)
+#define __NFT_TABLE_F_WAS_ORPHAN	(__NFT_TABLE_F_INTERNAL << 2)
 #define __NFT_TABLE_F_UPDATE		(__NFT_TABLE_F_WAS_DORMANT | \
-					 __NFT_TABLE_F_WAS_AWAKEN)
+					 __NFT_TABLE_F_WAS_AWAKEN | \
+					 __NFT_TABLE_F_WAS_ORPHAN)
 
 static int nf_tables_updtable(struct nft_ctx *ctx)
 {
@@ -1215,8 +1217,8 @@ static int nf_tables_updtable(struct nft_ctx *ctx)
 
 	if ((nft_table_has_owner(ctx->table) &&
 	     !(flags & NFT_TABLE_F_OWNER)) ||
-	    (!nft_table_has_owner(ctx->table) &&
-	     flags & NFT_TABLE_F_OWNER))
+	    (flags & NFT_TABLE_F_OWNER &&
+	     !nft_table_is_orphan(ctx->table)))
 		return -EOPNOTSUPP;
 
 	if ((flags ^ ctx->table->flags) & NFT_TABLE_F_PERSIST)
@@ -1248,6 +1250,13 @@ static int nf_tables_updtable(struct nft_ctx *ctx)
 		}
 	}
 
+	if ((flags & NFT_TABLE_F_OWNER) &&
+	    !nft_table_has_owner(ctx->table)) {
+		ctx->table->nlpid = ctx->portid;
+		ctx->table->flags |= NFT_TABLE_F_OWNER |
+				     __NFT_TABLE_F_WAS_ORPHAN;
+	}
+
 	nft_trans_table_update(trans) = true;
 	nft_trans_commit_list_add_tail(ctx->net, trans);
 
@@ -10423,6 +10432,10 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action)
 				} else if (trans->ctx.table->flags & __NFT_TABLE_F_WAS_AWAKEN) {
 					trans->ctx.table->flags &= ~NFT_TABLE_F_DORMANT;
 				}
+				if (trans->ctx.table->flags & __NFT_TABLE_F_WAS_ORPHAN) {
+					trans->ctx.table->flags &= ~NFT_TABLE_F_OWNER;
+					trans->ctx.table->nlpid = 0;
+				}
 				trans->ctx.table->flags &= ~__NFT_TABLE_F_UPDATE;
 				nft_trans_destroy(trans);
 			} else {
-- 
cgit v1.2.3


From efaa47db92451608499ab7edf108bf30141c33db Mon Sep 17 00:00:00 2001
From: Menglong Dong <dongmenglong.8@bytedance.com>
Date: Sun, 28 Jan 2024 13:54:43 +0800
Subject: bpf: Remove unused field "mod" in struct bpf_trampoline

It seems that the field "mod" in struct bpf_trampoline is not used
anywhere after the commit 31bf1dbccfb0 ("bpf: Fix attaching
fentry/fexit/fmod_ret/lsm to modules"). So we can just remove it now.

Fixes: 31bf1dbccfb0 ("bpf: Fix attaching fentry/fexit/fmod_ret/lsm to modules")
Signed-off-by: Menglong Dong <dongmenglong.8@bytedance.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/bpf/20240128055443.413291-1-dongmenglong.8@bytedance.com
---
 include/linux/bpf.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index b86bd15a051d..1ebbee1d648e 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1189,7 +1189,6 @@ struct bpf_trampoline {
 	int progs_cnt[BPF_TRAMP_MAX];
 	/* Executable image of trampoline */
 	struct bpf_tramp_image *cur_image;
-	struct module *mod;
 };
 
 struct bpf_attach_target_info {
-- 
cgit v1.2.3


From e2b3c4ff5d183da6d1863c2321413406a2752e7a Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Mon, 29 Jan 2024 16:06:45 -0800
Subject: bpf: add __arg_trusted global func arg tag

Add support for passing PTR_TO_BTF_ID registers to global subprogs.
Currently only PTR_TRUSTED flavor of PTR_TO_BTF_ID is supported.
Non-NULL semantics is assumed, so caller will be forced to prove
PTR_TO_BTF_ID can't be NULL.

Note, we disallow global subprogs to destroy passed in PTR_TO_BTF_ID
arguments, even the trusted one. We achieve that by not setting
ref_obj_id when validating subprog code. This basically enforces (in
Rust terms) borrowing semantics vs move semantics. Borrowing semantics
seems to be a better fit for isolated global subprog validation
approach.

Implementation-wise, we utilize existing logic for matching
user-provided BTF type to kernel-side BTF type, used by BPF CO-RE logic
and following same matching rules. We enforce a unique match for types.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20240130000648.2144827-2-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h |  1 +
 kernel/bpf/btf.c             | 99 ++++++++++++++++++++++++++++++++++++++------
 kernel/bpf/verifier.c        | 24 +++++++++++
 3 files changed, 111 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 7f5816482a10..0dcde339dc7e 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -610,6 +610,7 @@ struct bpf_subprog_arg_info {
 	enum bpf_arg_type arg_type;
 	union {
 		u32 mem_size;
+		u32 btf_id;
 	};
 };
 
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 9ec08cfb2967..ed7a05815984 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6985,9 +6985,77 @@ static bool btf_is_dynptr_ptr(const struct btf *btf, const struct btf_type *t)
 	return false;
 }
 
+struct bpf_cand_cache {
+	const char *name;
+	u32 name_len;
+	u16 kind;
+	u16 cnt;
+	struct {
+		const struct btf *btf;
+		u32 id;
+	} cands[];
+};
+
+static DEFINE_MUTEX(cand_cache_mutex);
+
+static struct bpf_cand_cache *
+bpf_core_find_cands(struct bpf_core_ctx *ctx, u32 local_type_id);
+
+static int btf_get_ptr_to_btf_id(struct bpf_verifier_log *log, int arg_idx,
+				 const struct btf *btf, const struct btf_type *t)
+{
+	struct bpf_cand_cache *cc;
+	struct bpf_core_ctx ctx = {
+		.btf = btf,
+		.log = log,
+	};
+	u32 kern_type_id, type_id;
+	int err = 0;
+
+	/* skip PTR and modifiers */
+	type_id = t->type;
+	t = btf_type_by_id(btf, t->type);
+	while (btf_type_is_modifier(t)) {
+		type_id = t->type;
+		t = btf_type_by_id(btf, t->type);
+	}
+
+	mutex_lock(&cand_cache_mutex);
+	cc = bpf_core_find_cands(&ctx, type_id);
+	if (IS_ERR(cc)) {
+		err = PTR_ERR(cc);
+		bpf_log(log, "arg#%d reference type('%s %s') candidate matching error: %d\n",
+			arg_idx, btf_type_str(t), __btf_name_by_offset(btf, t->name_off),
+			err);
+		goto cand_cache_unlock;
+	}
+	if (cc->cnt != 1) {
+		bpf_log(log, "arg#%d reference type('%s %s') %s\n",
+			arg_idx, btf_type_str(t), __btf_name_by_offset(btf, t->name_off),
+			cc->cnt == 0 ? "has no matches" : "is ambiguous");
+		err = cc->cnt == 0 ? -ENOENT : -ESRCH;
+		goto cand_cache_unlock;
+	}
+	if (btf_is_module(cc->cands[0].btf)) {
+		bpf_log(log, "arg#%d reference type('%s %s') points to kernel module type (unsupported)\n",
+			arg_idx, btf_type_str(t), __btf_name_by_offset(btf, t->name_off));
+		err = -EOPNOTSUPP;
+		goto cand_cache_unlock;
+	}
+	kern_type_id = cc->cands[0].id;
+
+cand_cache_unlock:
+	mutex_unlock(&cand_cache_mutex);
+	if (err)
+		return err;
+
+	return kern_type_id;
+}
+
 enum btf_arg_tag {
 	ARG_TAG_CTX = 0x1,
 	ARG_TAG_NONNULL = 0x2,
+	ARG_TAG_TRUSTED = 0x4,
 };
 
 /* Process BTF of a function to produce high-level expectation of function
@@ -7089,6 +7157,8 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
 
 			if (strcmp(tag, "ctx") == 0) {
 				tags |= ARG_TAG_CTX;
+			} else if (strcmp(tag, "trusted") == 0) {
+				tags |= ARG_TAG_TRUSTED;
 			} else if (strcmp(tag, "nonnull") == 0) {
 				tags |= ARG_TAG_NONNULL;
 			} else {
@@ -7127,6 +7197,22 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
 			sub->args[i].arg_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY;
 			continue;
 		}
+		if (tags & ARG_TAG_TRUSTED) {
+			int kern_type_id;
+
+			if (tags & ARG_TAG_NONNULL) {
+				bpf_log(log, "arg#%d has invalid combination of tags\n", i);
+				return -EINVAL;
+			}
+
+			kern_type_id = btf_get_ptr_to_btf_id(log, i, btf, t);
+			if (kern_type_id < 0)
+				return kern_type_id;
+
+			sub->args[i].arg_type = ARG_PTR_TO_BTF_ID | PTR_TRUSTED;
+			sub->args[i].btf_id = kern_type_id;
+			continue;
+		}
 		if (is_global) { /* generic user data pointer */
 			u32 mem_size;
 
@@ -8229,17 +8315,6 @@ size_t bpf_core_essential_name_len(const char *name)
 	return n;
 }
 
-struct bpf_cand_cache {
-	const char *name;
-	u32 name_len;
-	u16 kind;
-	u16 cnt;
-	struct {
-		const struct btf *btf;
-		u32 id;
-	} cands[];
-};
-
 static void bpf_free_cands(struct bpf_cand_cache *cands)
 {
 	if (!cands->cnt)
@@ -8260,8 +8335,6 @@ static struct bpf_cand_cache *vmlinux_cand_cache[VMLINUX_CAND_CACHE_SIZE];
 #define MODULE_CAND_CACHE_SIZE 31
 static struct bpf_cand_cache *module_cand_cache[MODULE_CAND_CACHE_SIZE];
 
-static DEFINE_MUTEX(cand_cache_mutex);
-
 static void __print_cand_cache(struct bpf_verifier_log *log,
 			       struct bpf_cand_cache **cache,
 			       int cache_size)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index c5d68a9d8acc..cd4d780e5400 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -9336,6 +9336,18 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
 			ret = process_dynptr_func(env, regno, -1, arg->arg_type, 0);
 			if (ret)
 				return ret;
+		} else if (base_type(arg->arg_type) == ARG_PTR_TO_BTF_ID) {
+			struct bpf_call_arg_meta meta;
+			int err;
+
+			if (register_is_null(reg) && type_may_be_null(arg->arg_type))
+				continue;
+
+			memset(&meta, 0, sizeof(meta)); /* leave func_id as zero */
+			err = check_reg_type(env, regno, arg->arg_type, &arg->btf_id, &meta);
+			err = err ?: check_func_arg_reg_off(env, reg, regno, arg->arg_type);
+			if (err)
+				return err;
 		} else {
 			bpf_log(log, "verifier bug: unrecognized arg#%d type %d\n",
 				i, arg->arg_type);
@@ -20137,6 +20149,18 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
 				mark_reg_known_zero(env, regs, i);
 				reg->mem_size = arg->mem_size;
 				reg->id = ++env->id_gen;
+			} else if (base_type(arg->arg_type) == ARG_PTR_TO_BTF_ID) {
+				reg->type = PTR_TO_BTF_ID;
+				if (arg->arg_type & PTR_MAYBE_NULL)
+					reg->type |= PTR_MAYBE_NULL;
+				if (arg->arg_type & PTR_UNTRUSTED)
+					reg->type |= PTR_UNTRUSTED;
+				if (arg->arg_type & PTR_TRUSTED)
+					reg->type |= PTR_TRUSTED;
+				mark_reg_known_zero(env, regs, i);
+				reg->btf = bpf_get_btf_vmlinux(); /* can't fail at this point */
+				reg->btf_id = arg->btf_id;
+				reg->id = ++env->id_gen;
 			} else {
 				WARN_ONCE(1, "BUG: unhandled arg#%d type %d\n",
 					  i - BPF_REG_1, arg->arg_type);
-- 
cgit v1.2.3


From c5c3e1bfc9e0ee72af528df8d773980f4855938a Mon Sep 17 00:00:00 2001
From: Rohan G Thomas <rohan.g.thomas@intel.com>
Date: Sat, 27 Jan 2024 12:04:41 +0800
Subject: net: stmmac: Offload queueMaxSDU from tc-taprio

Add support for configuring queueMaxSDU. As DWMAC IPs doesn't support
queueMaxSDU table handle this in the SW. The maximum 802.3 frame size
that is allowed to be transmitted by any queue is queueMaxSDU +
16 bytes (i.e. 6 bytes SA + 6 bytes DA + 4 bytes FCS).

Inspired from intel i225 driver.

Signed-off-by: Rohan G Thomas <rohan.g.thomas@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/common.h      |  1 +
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 22 ++++++++++++++++++++
 drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c   | 25 +++++++++++++++++++++++
 include/linux/stmmac.h                            |  1 +
 4 files changed, 49 insertions(+)

(limited to 'include')

diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h
index 721c1f8e892f..d8d2a90fd228 100644
--- a/drivers/net/ethernet/stmicro/stmmac/common.h
+++ b/drivers/net/ethernet/stmicro/stmmac/common.h
@@ -202,6 +202,7 @@ struct stmmac_extra_stats {
 	unsigned long mtl_est_hlbf;
 	unsigned long mtl_est_btre;
 	unsigned long mtl_est_btrlm;
+	unsigned long max_sdu_txq_drop[MTL_MAX_TX_QUEUES];
 	/* per queue statistics */
 	struct stmmac_txq_stats txq_stats[MTL_MAX_TX_QUEUES];
 	struct stmmac_rxq_stats rxq_stats[MTL_MAX_RX_QUEUES];
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index b334eb16da23..33509237fe60 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -2507,6 +2507,13 @@ static bool stmmac_xdp_xmit_zc(struct stmmac_priv *priv, u32 queue, u32 budget)
 		if (!xsk_tx_peek_desc(pool, &xdp_desc))
 			break;
 
+		if (priv->plat->est && priv->plat->est->enable &&
+		    priv->plat->est->max_sdu[queue] &&
+		    xdp_desc.len > priv->plat->est->max_sdu[queue]) {
+			priv->xstats.max_sdu_txq_drop[queue]++;
+			continue;
+		}
+
 		if (likely(priv->extend_desc))
 			tx_desc = (struct dma_desc *)(tx_q->dma_etx + entry);
 		else if (tx_q->tbs & STMMAC_TBS_AVAIL)
@@ -4498,6 +4505,13 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev)
 			return stmmac_tso_xmit(skb, dev);
 	}
 
+	if (priv->plat->est && priv->plat->est->enable &&
+	    priv->plat->est->max_sdu[queue] &&
+	    skb->len > priv->plat->est->max_sdu[queue]){
+		priv->xstats.max_sdu_txq_drop[queue]++;
+		goto max_sdu_err;
+	}
+
 	if (unlikely(stmmac_tx_avail(priv, queue) < nfrags + 1)) {
 		if (!netif_tx_queue_stopped(netdev_get_tx_queue(dev, queue))) {
 			netif_tx_stop_queue(netdev_get_tx_queue(priv->dev,
@@ -4715,6 +4729,7 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev)
 
 dma_map_err:
 	netdev_err(priv->dev, "Tx DMA map failed\n");
+max_sdu_err:
 	dev_kfree_skb(skb);
 	priv->xstats.tx_dropped++;
 	return NETDEV_TX_OK;
@@ -4871,6 +4886,13 @@ static int stmmac_xdp_xmit_xdpf(struct stmmac_priv *priv, int queue,
 	if (stmmac_tx_avail(priv, queue) < STMMAC_TX_THRESH(priv))
 		return STMMAC_XDP_CONSUMED;
 
+	if (priv->plat->est && priv->plat->est->enable &&
+	    priv->plat->est->max_sdu[queue] &&
+	    xdpf->len > priv->plat->est->max_sdu[queue]) {
+		priv->xstats.max_sdu_txq_drop[queue]++;
+		return STMMAC_XDP_CONSUMED;
+	}
+
 	if (likely(priv->extend_desc))
 		tx_desc = (struct dma_desc *)(tx_q->dma_etx + entry);
 	else if (tx_q->tbs & STMMAC_TBS_AVAIL)
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c
index 26fa33e5ec34..07aa3a3089dc 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c
@@ -915,6 +915,28 @@ struct timespec64 stmmac_calc_tas_basetime(ktime_t old_base_time,
 	return time;
 }
 
+static void tc_taprio_map_maxsdu_txq(struct stmmac_priv *priv,
+				     struct tc_taprio_qopt_offload *qopt)
+{
+	struct plat_stmmacenet_data *plat = priv->plat;
+	u32 num_tc = qopt->mqprio.qopt.num_tc;
+	u32 offset, count, i, j;
+
+	/* QueueMaxSDU received from the driver corresponds to the Linux traffic
+	 * class. Map queueMaxSDU per Linux traffic class to DWMAC Tx queues.
+	 */
+	for (i = 0; i < num_tc; i++) {
+		if (!qopt->max_sdu[i])
+			continue;
+
+		offset = qopt->mqprio.qopt.offset[i];
+		count = qopt->mqprio.qopt.count[i];
+
+		for (j = offset; j < offset + count; j++)
+			plat->est->max_sdu[j] = qopt->max_sdu[i] + ETH_HLEN - ETH_TLEN;
+	}
+}
+
 static int tc_setup_taprio(struct stmmac_priv *priv,
 			   struct tc_taprio_qopt_offload *qopt)
 {
@@ -1045,6 +1067,8 @@ static int tc_setup_taprio(struct stmmac_priv *priv,
 
 	priv->plat->est->ter = qopt->cycle_time_extension;
 
+	tc_taprio_map_maxsdu_txq(priv, qopt);
+
 	if (fpe && !priv->dma_cap.fpesel) {
 		mutex_unlock(&priv->plat->est->lock);
 		return -EOPNOTSUPP;
@@ -1126,6 +1150,7 @@ static int tc_query_caps(struct stmmac_priv *priv,
 			return -EOPNOTSUPP;
 
 		caps->gate_mask_per_txq = true;
+		caps->supports_queue_max_sdu = true;
 
 		return 0;
 	}
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index dee5ad6e48c5..dfa1828cd756 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -127,6 +127,7 @@ struct stmmac_est {
 	u32 gcl_unaligned[EST_GCL];
 	u32 gcl[EST_GCL];
 	u32 gcl_size;
+	u32 max_sdu[MTL_MAX_TX_QUEUES];
 };
 
 struct stmmac_rxq_cfg {
-- 
cgit v1.2.3


From d80a52335374e484a4ff2afdc9af843e73273945 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Sat, 27 Jan 2024 14:25:09 +0100
Subject: ethtool: replace struct ethtool_eee with a new struct ethtool_keee on
 kernel side

In order to pass EEE link modes beyond bit 32 to userspace we have to
complement the 32 bit bitmaps in struct ethtool_eee with linkmode
bitmaps. Therefore, similar to ethtool_link_settings and
ethtool_link_ksettings, add a struct ethtool_keee. In a first step
it's an identical copy of ethtool_eee. This patch simply does a
s/ethtool_eee/ethtool_keee/g for all users.
No functional change intended.

Suggested-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/b53/b53_common.c                     | 10 +++++-----
 drivers/net/dsa/b53/b53_priv.h                       |  6 +++---
 drivers/net/dsa/bcm_sf2.c                            |  2 +-
 drivers/net/dsa/microchip/ksz_common.c               |  4 ++--
 drivers/net/dsa/mt7530.c                             |  4 ++--
 drivers/net/dsa/mv88e6xxx/chip.c                     |  4 ++--
 drivers/net/dsa/qca/qca8k-common.c                   |  4 ++--
 drivers/net/dsa/qca/qca8k.h                          |  4 ++--
 drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c  |  4 ++--
 drivers/net/ethernet/broadcom/asp2/bcmasp.h          |  2 +-
 drivers/net/ethernet/broadcom/asp2/bcmasp_ethtool.c  |  8 ++++----
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c  |  4 ++--
 drivers/net/ethernet/broadcom/bnxt/bnxt.c            |  8 ++++----
 drivers/net/ethernet/broadcom/bnxt/bnxt.h            |  2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c    |  6 +++---
 drivers/net/ethernet/broadcom/genet/bcmgenet.c       |  8 ++++----
 drivers/net/ethernet/broadcom/genet/bcmgenet.h       |  2 +-
 drivers/net/ethernet/broadcom/tg3.c                  | 10 +++++-----
 drivers/net/ethernet/broadcom/tg3.h                  |  2 +-
 drivers/net/ethernet/engleder/tsnep_main.c           |  6 +++---
 drivers/net/ethernet/freescale/enetc/enetc.c         |  4 ++--
 drivers/net/ethernet/freescale/fec.h                 |  2 +-
 drivers/net/ethernet/freescale/fec_main.c            | 10 +++++-----
 drivers/net/ethernet/freescale/gianfar.c             |  4 ++--
 drivers/net/ethernet/intel/e1000e/ethtool.c          |  6 +++---
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c       |  6 +++---
 drivers/net/ethernet/intel/igb/igb_ethtool.c         |  8 ++++----
 drivers/net/ethernet/intel/igc/igc.h                 |  2 +-
 drivers/net/ethernet/intel/igc/igc_ethtool.c         |  8 ++++----
 drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c     | 10 +++++-----
 drivers/net/ethernet/marvell/mvneta.c                |  4 ++--
 drivers/net/ethernet/microchip/lan743x_ethtool.c     |  4 ++--
 drivers/net/ethernet/qlogic/qede/qede_ethtool.c      |  4 ++--
 drivers/net/ethernet/realtek/r8169_main.c            |  4 ++--
 drivers/net/ethernet/samsung/sxgbe/sxgbe_ethtool.c   |  4 ++--
 drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c |  4 ++--
 drivers/net/ethernet/ti/am65-cpsw-ethtool.c          |  4 ++--
 drivers/net/ethernet/ti/cpsw_ethtool.c               |  4 ++--
 drivers/net/ethernet/ti/cpsw_priv.h                  |  4 ++--
 drivers/net/ethernet/ti/icssg/icssg_ethtool.c        |  4 ++--
 drivers/net/phy/marvell.c                            |  2 +-
 drivers/net/phy/phy-c45.c                            |  8 ++++----
 drivers/net/phy/phy.c                                |  8 ++++----
 drivers/net/phy/phylink.c                            |  8 ++++----
 drivers/net/usb/ax88179_178a.c                       | 10 +++++-----
 drivers/net/usb/lan78xx.c                            |  4 ++--
 drivers/net/usb/r8152.c                              | 14 +++++++-------
 include/linux/ethtool.h                              | 16 ++++++++++++++--
 include/linux/phy.h                                  |  8 ++++----
 include/linux/phylink.h                              |  4 ++--
 include/net/dsa.h                                    |  4 ++--
 net/dsa/user.c                                       |  4 ++--
 net/ethtool/eee.c                                    | 10 +++++-----
 net/ethtool/ioctl.c                                  |  6 +++---
 54 files changed, 159 insertions(+), 147 deletions(-)

(limited to 'include')

diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index 0d628b35fd5c..adc93abf4551 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -1257,7 +1257,7 @@ static void b53_adjust_link(struct dsa_switch *ds, int port,
 			    struct phy_device *phydev)
 {
 	struct b53_device *dev = ds->priv;
-	struct ethtool_eee *p = &dev->ports[port].eee;
+	struct ethtool_keee *p = &dev->ports[port].eee;
 	u8 rgmii_ctrl = 0, reg = 0, off;
 	bool tx_pause = false;
 	bool rx_pause = false;
@@ -2224,10 +2224,10 @@ int b53_eee_init(struct dsa_switch *ds, int port, struct phy_device *phy)
 }
 EXPORT_SYMBOL(b53_eee_init);
 
-int b53_get_mac_eee(struct dsa_switch *ds, int port, struct ethtool_eee *e)
+int b53_get_mac_eee(struct dsa_switch *ds, int port, struct ethtool_keee *e)
 {
 	struct b53_device *dev = ds->priv;
-	struct ethtool_eee *p = &dev->ports[port].eee;
+	struct ethtool_keee *p = &dev->ports[port].eee;
 	u16 reg;
 
 	if (is5325(dev) || is5365(dev))
@@ -2241,10 +2241,10 @@ int b53_get_mac_eee(struct dsa_switch *ds, int port, struct ethtool_eee *e)
 }
 EXPORT_SYMBOL(b53_get_mac_eee);
 
-int b53_set_mac_eee(struct dsa_switch *ds, int port, struct ethtool_eee *e)
+int b53_set_mac_eee(struct dsa_switch *ds, int port, struct ethtool_keee *e)
 {
 	struct b53_device *dev = ds->priv;
-	struct ethtool_eee *p = &dev->ports[port].eee;
+	struct ethtool_keee *p = &dev->ports[port].eee;
 
 	if (is5325(dev) || is5365(dev))
 		return -EOPNOTSUPP;
diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h
index fdcfd5081c28..c26a03755e83 100644
--- a/drivers/net/dsa/b53/b53_priv.h
+++ b/drivers/net/dsa/b53/b53_priv.h
@@ -95,7 +95,7 @@ struct b53_pcs {
 
 struct b53_port {
 	u16		vlan_ctl_mask;
-	struct ethtool_eee eee;
+	struct ethtool_keee eee;
 };
 
 struct b53_vlan {
@@ -397,7 +397,7 @@ void b53_disable_port(struct dsa_switch *ds, int port);
 void b53_brcm_hdr_setup(struct dsa_switch *ds, int port);
 void b53_eee_enable_set(struct dsa_switch *ds, int port, bool enable);
 int b53_eee_init(struct dsa_switch *ds, int port, struct phy_device *phy);
-int b53_get_mac_eee(struct dsa_switch *ds, int port, struct ethtool_eee *e);
-int b53_set_mac_eee(struct dsa_switch *ds, int port, struct ethtool_eee *e);
+int b53_get_mac_eee(struct dsa_switch *ds, int port, struct ethtool_keee *e);
+int b53_set_mac_eee(struct dsa_switch *ds, int port, struct ethtool_keee *e);
 
 #endif
diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
index 4a52ccbe393f..bc77ee9e6d0a 100644
--- a/drivers/net/dsa/bcm_sf2.c
+++ b/drivers/net/dsa/bcm_sf2.c
@@ -835,7 +835,7 @@ static void bcm_sf2_sw_mac_link_up(struct dsa_switch *ds, int port,
 				   bool tx_pause, bool rx_pause)
 {
 	struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
-	struct ethtool_eee *p = &priv->dev->ports[port].eee;
+	struct ethtool_keee *p = &priv->dev->ports[port].eee;
 	u32 reg_rgmii_ctrl = 0;
 	u32 reg, offset;
 
diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c
index 245dfb7a7a31..a7b5ddb8656e 100644
--- a/drivers/net/dsa/microchip/ksz_common.c
+++ b/drivers/net/dsa/microchip/ksz_common.c
@@ -2852,7 +2852,7 @@ static int ksz_validate_eee(struct dsa_switch *ds, int port)
 }
 
 static int ksz_get_mac_eee(struct dsa_switch *ds, int port,
-			   struct ethtool_eee *e)
+			   struct ethtool_keee *e)
 {
 	int ret;
 
@@ -2872,7 +2872,7 @@ static int ksz_get_mac_eee(struct dsa_switch *ds, int port,
 }
 
 static int ksz_set_mac_eee(struct dsa_switch *ds, int port,
-			   struct ethtool_eee *e)
+			   struct ethtool_keee *e)
 {
 	struct ksz_device *dev = ds->priv;
 	int ret;
diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c
index 68be38ae66e0..98a73a62f2ee 100644
--- a/drivers/net/dsa/mt7530.c
+++ b/drivers/net/dsa/mt7530.c
@@ -3048,7 +3048,7 @@ mt753x_setup(struct dsa_switch *ds)
 }
 
 static int mt753x_get_mac_eee(struct dsa_switch *ds, int port,
-			      struct ethtool_eee *e)
+			      struct ethtool_keee *e)
 {
 	struct mt7530_priv *priv = ds->priv;
 	u32 eeecr = mt7530_read(priv, MT7530_PMEEECR_P(port));
@@ -3060,7 +3060,7 @@ static int mt753x_get_mac_eee(struct dsa_switch *ds, int port,
 }
 
 static int mt753x_set_mac_eee(struct dsa_switch *ds, int port,
-			      struct ethtool_eee *e)
+			      struct ethtool_keee *e)
 {
 	struct mt7530_priv *priv = ds->priv;
 	u32 set, mask = LPI_THRESH_MASK | LPI_MODE_EN;
diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 383b3c4d6f59..8b0079b8e0e0 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -1451,14 +1451,14 @@ static void mv88e6xxx_get_regs(struct dsa_switch *ds, int port,
 }
 
 static int mv88e6xxx_get_mac_eee(struct dsa_switch *ds, int port,
-				 struct ethtool_eee *e)
+				 struct ethtool_keee *e)
 {
 	/* Nothing to do on the port's MAC */
 	return 0;
 }
 
 static int mv88e6xxx_set_mac_eee(struct dsa_switch *ds, int port,
-				 struct ethtool_eee *e)
+				 struct ethtool_keee *e)
 {
 	/* Nothing to do on the port's MAC */
 	return 0;
diff --git a/drivers/net/dsa/qca/qca8k-common.c b/drivers/net/dsa/qca/qca8k-common.c
index 2358cd399c7e..7f80035c5441 100644
--- a/drivers/net/dsa/qca/qca8k-common.c
+++ b/drivers/net/dsa/qca/qca8k-common.c
@@ -534,7 +534,7 @@ int qca8k_get_sset_count(struct dsa_switch *ds, int port, int sset)
 }
 
 int qca8k_set_mac_eee(struct dsa_switch *ds, int port,
-		      struct ethtool_eee *eee)
+		      struct ethtool_keee *eee)
 {
 	u32 lpi_en = QCA8K_REG_EEE_CTRL_LPI_EN(port);
 	struct qca8k_priv *priv = ds->priv;
@@ -558,7 +558,7 @@ exit:
 }
 
 int qca8k_get_mac_eee(struct dsa_switch *ds, int port,
-		      struct ethtool_eee *e)
+		      struct ethtool_keee *e)
 {
 	/* Nothing to do on the port's MAC */
 	return 0;
diff --git a/drivers/net/dsa/qca/qca8k.h b/drivers/net/dsa/qca/qca8k.h
index c8785c36c54e..2184d8d2d5a9 100644
--- a/drivers/net/dsa/qca/qca8k.h
+++ b/drivers/net/dsa/qca/qca8k.h
@@ -518,8 +518,8 @@ void qca8k_get_ethtool_stats(struct dsa_switch *ds, int port,
 int qca8k_get_sset_count(struct dsa_switch *ds, int port, int sset);
 
 /* Common eee function */
-int qca8k_set_mac_eee(struct dsa_switch *ds, int port, struct ethtool_eee *eee);
-int qca8k_get_mac_eee(struct dsa_switch *ds, int port, struct ethtool_eee *e);
+int qca8k_set_mac_eee(struct dsa_switch *ds, int port, struct ethtool_keee *eee);
+int qca8k_get_mac_eee(struct dsa_switch *ds, int port, struct ethtool_keee *e);
 
 /* Common bridge function */
 void qca8k_port_stp_state_set(struct dsa_switch *ds, int port, u8 state);
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c b/drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c
index 18a6c8d99fa0..be865776de55 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c
@@ -697,7 +697,7 @@ static u32 eee_mask_to_ethtool_mask(u32 speed)
 	return rate;
 }
 
-static int aq_ethtool_get_eee(struct net_device *ndev, struct ethtool_eee *eee)
+static int aq_ethtool_get_eee(struct net_device *ndev, struct ethtool_keee *eee)
 {
 	struct aq_nic_s *aq_nic = netdev_priv(ndev);
 	u32 rate, supported_rates;
@@ -729,7 +729,7 @@ static int aq_ethtool_get_eee(struct net_device *ndev, struct ethtool_eee *eee)
 	return 0;
 }
 
-static int aq_ethtool_set_eee(struct net_device *ndev, struct ethtool_eee *eee)
+static int aq_ethtool_set_eee(struct net_device *ndev, struct ethtool_keee *eee)
 {
 	struct aq_nic_s *aq_nic = netdev_priv(ndev);
 	u32 rate, supported_rates;
diff --git a/drivers/net/ethernet/broadcom/asp2/bcmasp.h b/drivers/net/ethernet/broadcom/asp2/bcmasp.h
index ec90add6b03e..312bf9b6576e 100644
--- a/drivers/net/ethernet/broadcom/asp2/bcmasp.h
+++ b/drivers/net/ethernet/broadcom/asp2/bcmasp.h
@@ -337,7 +337,7 @@ struct bcmasp_intf {
 	int				wol_irq;
 	unsigned int			wol_irq_enabled:1;
 
-	struct ethtool_eee		eee;
+	struct ethtool_keee		eee;
 };
 
 #define NUM_NET_FILTERS				32
diff --git a/drivers/net/ethernet/broadcom/asp2/bcmasp_ethtool.c b/drivers/net/ethernet/broadcom/asp2/bcmasp_ethtool.c
index ce6a3d56fb23..2851bed153e6 100644
--- a/drivers/net/ethernet/broadcom/asp2/bcmasp_ethtool.c
+++ b/drivers/net/ethernet/broadcom/asp2/bcmasp_ethtool.c
@@ -363,10 +363,10 @@ void bcmasp_eee_enable_set(struct bcmasp_intf *intf, bool enable)
 	intf->eee.eee_active = enable;
 }
 
-static int bcmasp_get_eee(struct net_device *dev, struct ethtool_eee *e)
+static int bcmasp_get_eee(struct net_device *dev, struct ethtool_keee *e)
 {
 	struct bcmasp_intf *intf = netdev_priv(dev);
-	struct ethtool_eee *p = &intf->eee;
+	struct ethtool_keee *p = &intf->eee;
 
 	if (!dev->phydev)
 		return -ENODEV;
@@ -379,10 +379,10 @@ static int bcmasp_get_eee(struct net_device *dev, struct ethtool_eee *e)
 	return phy_ethtool_get_eee(dev->phydev, e);
 }
 
-static int bcmasp_set_eee(struct net_device *dev, struct ethtool_eee *e)
+static int bcmasp_set_eee(struct net_device *dev, struct ethtool_keee *e)
 {
 	struct bcmasp_intf *intf = netdev_priv(dev);
-	struct ethtool_eee *p = &intf->eee;
+	struct ethtool_keee *p = &intf->eee;
 	int ret;
 
 	if (!dev->phydev)
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c
index 0bc7690cdee1..12d2785eeb8a 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c
@@ -2108,7 +2108,7 @@ static u32 bnx2x_adv_to_eee(u32 modes, u32 shift)
 	return eee_adv << shift;
 }
 
-static int bnx2x_get_eee(struct net_device *dev, struct ethtool_eee *edata)
+static int bnx2x_get_eee(struct net_device *dev, struct ethtool_keee *edata)
 {
 	struct bnx2x *bp = netdev_priv(dev);
 	u32 eee_cfg;
@@ -2141,7 +2141,7 @@ static int bnx2x_get_eee(struct net_device *dev, struct ethtool_eee *edata)
 	return 0;
 }
 
-static int bnx2x_set_eee(struct net_device *dev, struct ethtool_eee *edata)
+static int bnx2x_set_eee(struct net_device *dev, struct ethtool_keee *edata)
 {
 	struct bnx2x *bp = netdev_priv(dev);
 	u32 eee_cfg;
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 39845d556baf..d7626c26f9a9 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -10621,7 +10621,7 @@ static int bnxt_hwrm_phy_qcaps(struct bnxt *bp)
 
 	bp->phy_flags = resp->flags | (le16_to_cpu(resp->flags2) << 8);
 	if (resp->flags & PORT_PHY_QCAPS_RESP_FLAGS_EEE_SUPPORTED) {
-		struct ethtool_eee *eee = &bp->eee;
+		struct ethtool_keee *eee = &bp->eee;
 		u16 fw_speeds = le16_to_cpu(resp->supported_speeds_eee_mode);
 
 		eee->supported = _bnxt_fw_to_ethtool_adv_spds(fw_speeds, 0);
@@ -10766,7 +10766,7 @@ int bnxt_update_link(struct bnxt *bp, bool chng_link_state)
 	link_info->module_status = resp->module_status;
 
 	if (bp->phy_flags & BNXT_PHY_FL_EEE_CAP) {
-		struct ethtool_eee *eee = &bp->eee;
+		struct ethtool_keee *eee = &bp->eee;
 		u16 fw_speeds;
 
 		eee->eee_active = 0;
@@ -10957,7 +10957,7 @@ int bnxt_hwrm_set_pause(struct bnxt *bp)
 static void bnxt_hwrm_set_eee(struct bnxt *bp,
 			      struct hwrm_port_phy_cfg_input *req)
 {
-	struct ethtool_eee *eee = &bp->eee;
+	struct ethtool_keee *eee = &bp->eee;
 
 	if (eee->eee_enabled) {
 		u16 eee_speeds;
@@ -11322,7 +11322,7 @@ static void bnxt_get_wol_settings(struct bnxt *bp)
 
 static bool bnxt_eee_config_ok(struct bnxt *bp)
 {
-	struct ethtool_eee *eee = &bp->eee;
+	struct ethtool_keee *eee = &bp->eee;
 	struct bnxt_link_info *link_info = &bp->link_info;
 
 	if (!(bp->phy_flags & BNXT_PHY_FL_EEE_CAP))
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 47338b48ca20..b2cb3e77559d 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -2442,7 +2442,7 @@ struct bnxt {
 	 */
 	struct mutex		link_lock;
 	struct bnxt_link_info	link_info;
-	struct ethtool_eee	eee;
+	struct ethtool_keee	eee;
 	u32			lpi_tmr_lo;
 	u32			lpi_tmr_hi;
 
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
index dc4ca706b0e2..d6a8577d68af 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
@@ -3884,10 +3884,10 @@ static int bnxt_set_eeprom(struct net_device *dev,
 				eeprom->len);
 }
 
-static int bnxt_set_eee(struct net_device *dev, struct ethtool_eee *edata)
+static int bnxt_set_eee(struct net_device *dev, struct ethtool_keee *edata)
 {
 	struct bnxt *bp = netdev_priv(dev);
-	struct ethtool_eee *eee = &bp->eee;
+	struct ethtool_keee *eee = &bp->eee;
 	struct bnxt_link_info *link_info = &bp->link_info;
 	u32 advertising;
 	int rc = 0;
@@ -3942,7 +3942,7 @@ eee_exit:
 	return rc;
 }
 
-static int bnxt_get_eee(struct net_device *dev, struct ethtool_eee *edata)
+static int bnxt_get_eee(struct net_device *dev, struct ethtool_keee *edata)
 {
 	struct bnxt *bp = netdev_priv(dev);
 
diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
index 2d7ae71287b1..051c31fb17c2 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
@@ -1317,10 +1317,10 @@ void bcmgenet_eee_enable_set(struct net_device *dev, bool enable,
 	priv->eee.tx_lpi_enabled = tx_lpi_enabled;
 }
 
-static int bcmgenet_get_eee(struct net_device *dev, struct ethtool_eee *e)
+static int bcmgenet_get_eee(struct net_device *dev, struct ethtool_keee *e)
 {
 	struct bcmgenet_priv *priv = netdev_priv(dev);
-	struct ethtool_eee *p = &priv->eee;
+	struct ethtool_keee *p = &priv->eee;
 
 	if (GENET_IS_V1(priv))
 		return -EOPNOTSUPP;
@@ -1336,10 +1336,10 @@ static int bcmgenet_get_eee(struct net_device *dev, struct ethtool_eee *e)
 	return phy_ethtool_get_eee(dev->phydev, e);
 }
 
-static int bcmgenet_set_eee(struct net_device *dev, struct ethtool_eee *e)
+static int bcmgenet_set_eee(struct net_device *dev, struct ethtool_keee *e)
 {
 	struct bcmgenet_priv *priv = netdev_priv(dev);
-	struct ethtool_eee *p = &priv->eee;
+	struct ethtool_keee *p = &priv->eee;
 
 	if (GENET_IS_V1(priv))
 		return -EOPNOTSUPP;
diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.h b/drivers/net/ethernet/broadcom/genet/bcmgenet.h
index 1985c0ec4da2..7523b60b3c1c 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.h
+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.h
@@ -645,7 +645,7 @@ struct bcmgenet_priv {
 
 	struct bcmgenet_mib_counters mib;
 
-	struct ethtool_eee eee;
+	struct ethtool_keee eee;
 };
 
 #define GENET_IO_MACRO(name, offset)					\
diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index 04964bbe08cf..11054177c2ec 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -2338,10 +2338,10 @@ static void tg3_phy_apply_otp(struct tg3 *tp)
 	tg3_phy_toggle_auxctl_smdsp(tp, false);
 }
 
-static void tg3_eee_pull_config(struct tg3 *tp, struct ethtool_eee *eee)
+static void tg3_eee_pull_config(struct tg3 *tp, struct ethtool_keee *eee)
 {
 	u32 val;
-	struct ethtool_eee *dest = &tp->eee;
+	struct ethtool_keee *dest = &tp->eee;
 
 	if (!(tp->phy_flags & TG3_PHYFLG_EEE_CAP))
 		return;
@@ -4618,7 +4618,7 @@ static int tg3_init_5401phy_dsp(struct tg3 *tp)
 
 static bool tg3_phy_eee_config_ok(struct tg3 *tp)
 {
-	struct ethtool_eee eee;
+	struct ethtool_keee eee;
 
 	if (!(tp->phy_flags & TG3_PHYFLG_EEE_CAP))
 		return true;
@@ -14180,7 +14180,7 @@ static int tg3_set_coalesce(struct net_device *dev,
 	return 0;
 }
 
-static int tg3_set_eee(struct net_device *dev, struct ethtool_eee *edata)
+static int tg3_set_eee(struct net_device *dev, struct ethtool_keee *edata)
 {
 	struct tg3 *tp = netdev_priv(dev);
 
@@ -14217,7 +14217,7 @@ static int tg3_set_eee(struct net_device *dev, struct ethtool_eee *edata)
 	return 0;
 }
 
-static int tg3_get_eee(struct net_device *dev, struct ethtool_eee *edata)
+static int tg3_get_eee(struct net_device *dev, struct ethtool_keee *edata)
 {
 	struct tg3 *tp = netdev_priv(dev);
 
diff --git a/drivers/net/ethernet/broadcom/tg3.h b/drivers/net/ethernet/broadcom/tg3.h
index 5016475e5005..cf1b2b123c7e 100644
--- a/drivers/net/ethernet/broadcom/tg3.h
+++ b/drivers/net/ethernet/broadcom/tg3.h
@@ -3419,7 +3419,7 @@ struct tg3 {
 	unsigned int			irq_cnt;
 
 	struct ethtool_coalesce		coal;
-	struct ethtool_eee		eee;
+	struct ethtool_keee		eee;
 
 	/* firmware info */
 	const char			*fw_needed;
diff --git a/drivers/net/ethernet/engleder/tsnep_main.c b/drivers/net/ethernet/engleder/tsnep_main.c
index ae0b8b37b9bf..eb64118f5b18 100644
--- a/drivers/net/ethernet/engleder/tsnep_main.c
+++ b/drivers/net/ethernet/engleder/tsnep_main.c
@@ -240,7 +240,7 @@ static int tsnep_phy_loopback(struct tsnep_adapter *adapter, bool enable)
 static int tsnep_phy_open(struct tsnep_adapter *adapter)
 {
 	struct phy_device *phydev;
-	struct ethtool_eee ethtool_eee;
+	struct ethtool_keee ethtool_keee;
 	int retval;
 
 	retval = phy_connect_direct(adapter->netdev, adapter->phydev,
@@ -259,8 +259,8 @@ static int tsnep_phy_open(struct tsnep_adapter *adapter)
 	phy_remove_link_mode(phydev, ETHTOOL_LINK_MODE_1000baseT_Half_BIT);
 
 	/* disable EEE autoneg, EEE not supported by TSNEP */
-	memset(&ethtool_eee, 0, sizeof(ethtool_eee));
-	phy_ethtool_set_eee(adapter->phydev, &ethtool_eee);
+	memset(&ethtool_keee, 0, sizeof(ethtool_keee));
+	phy_ethtool_set_eee(adapter->phydev, &ethtool_keee);
 
 	adapter->phydev->irq = PHY_MAC_INTERRUPT;
 	phy_start(adapter->phydev);
diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c
index bfdbdab443ae..9f07f4947b63 100644
--- a/drivers/net/ethernet/freescale/enetc/enetc.c
+++ b/drivers/net/ethernet/freescale/enetc/enetc.c
@@ -2402,7 +2402,7 @@ static void enetc_clear_interrupts(struct enetc_ndev_priv *priv)
 static int enetc_phylink_connect(struct net_device *ndev)
 {
 	struct enetc_ndev_priv *priv = netdev_priv(ndev);
-	struct ethtool_eee edata;
+	struct ethtool_keee edata;
 	int err;
 
 	if (!priv->phylink) {
@@ -2418,7 +2418,7 @@ static int enetc_phylink_connect(struct net_device *ndev)
 	}
 
 	/* disable EEE autoneg, until ENETC driver supports it */
-	memset(&edata, 0, sizeof(struct ethtool_eee));
+	memset(&edata, 0, sizeof(struct ethtool_keee));
 	phylink_ethtool_set_eee(priv->phylink, &edata);
 
 	phylink_start(priv->phylink);
diff --git a/drivers/net/ethernet/freescale/fec.h b/drivers/net/ethernet/freescale/fec.h
index a8fbcada6b01..a19cb2a786fd 100644
--- a/drivers/net/ethernet/freescale/fec.h
+++ b/drivers/net/ethernet/freescale/fec.h
@@ -672,7 +672,7 @@ struct fec_enet_private {
 	unsigned int itr_clk_rate;
 
 	/* tx lpi eee mode */
-	struct ethtool_eee eee;
+	struct ethtool_keee eee;
 	unsigned int clk_ref_rate;
 
 	/* ptp clock period in ns*/
diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
index 432523b2c789..63707e065141 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -3122,7 +3122,7 @@ static int fec_enet_us_to_tx_cycle(struct net_device *ndev, int us)
 static int fec_enet_eee_mode_set(struct net_device *ndev, bool enable)
 {
 	struct fec_enet_private *fep = netdev_priv(ndev);
-	struct ethtool_eee *p = &fep->eee;
+	struct ethtool_keee *p = &fep->eee;
 	unsigned int sleep_cycle, wake_cycle;
 	int ret = 0;
 
@@ -3149,10 +3149,10 @@ static int fec_enet_eee_mode_set(struct net_device *ndev, bool enable)
 }
 
 static int
-fec_enet_get_eee(struct net_device *ndev, struct ethtool_eee *edata)
+fec_enet_get_eee(struct net_device *ndev, struct ethtool_keee *edata)
 {
 	struct fec_enet_private *fep = netdev_priv(ndev);
-	struct ethtool_eee *p = &fep->eee;
+	struct ethtool_keee *p = &fep->eee;
 
 	if (!(fep->quirks & FEC_QUIRK_HAS_EEE))
 		return -EOPNOTSUPP;
@@ -3169,10 +3169,10 @@ fec_enet_get_eee(struct net_device *ndev, struct ethtool_eee *edata)
 }
 
 static int
-fec_enet_set_eee(struct net_device *ndev, struct ethtool_eee *edata)
+fec_enet_set_eee(struct net_device *ndev, struct ethtool_keee *edata)
 {
 	struct fec_enet_private *fep = netdev_priv(ndev);
-	struct ethtool_eee *p = &fep->eee;
+	struct ethtool_keee *p = &fep->eee;
 	int ret = 0;
 
 	if (!(fep->quirks & FEC_QUIRK_HAS_EEE))
diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c
index e3dfbd7a4236..a811238c018d 100644
--- a/drivers/net/ethernet/freescale/gianfar.c
+++ b/drivers/net/ethernet/freescale/gianfar.c
@@ -1649,7 +1649,7 @@ static int init_phy(struct net_device *dev)
 	struct gfar_private *priv = netdev_priv(dev);
 	phy_interface_t interface = priv->interface;
 	struct phy_device *phydev;
-	struct ethtool_eee edata;
+	struct ethtool_keee edata;
 
 	linkmode_set_bit_array(phy_10_100_features_array,
 			       ARRAY_SIZE(phy_10_100_features_array),
@@ -1681,7 +1681,7 @@ static int init_phy(struct net_device *dev)
 	phy_support_asym_pause(phydev);
 
 	/* disable EEE autoneg, EEE not supported by eTSEC */
-	memset(&edata, 0, sizeof(struct ethtool_eee));
+	memset(&edata, 0, sizeof(struct ethtool_keee));
 	phy_ethtool_set_eee(phydev, &edata);
 
 	return 0;
diff --git a/drivers/net/ethernet/intel/e1000e/ethtool.c b/drivers/net/ethernet/intel/e1000e/ethtool.c
index fc0f98ea6133..343f54b2b6ae 100644
--- a/drivers/net/ethernet/intel/e1000e/ethtool.c
+++ b/drivers/net/ethernet/intel/e1000e/ethtool.c
@@ -2186,7 +2186,7 @@ static int e1000_get_rxnfc(struct net_device *netdev,
 	}
 }
 
-static int e1000e_get_eee(struct net_device *netdev, struct ethtool_eee *edata)
+static int e1000e_get_eee(struct net_device *netdev, struct ethtool_keee *edata)
 {
 	struct e1000_adapter *adapter = netdev_priv(netdev);
 	struct e1000_hw *hw = &adapter->hw;
@@ -2262,11 +2262,11 @@ release:
 	return ret_val;
 }
 
-static int e1000e_set_eee(struct net_device *netdev, struct ethtool_eee *edata)
+static int e1000e_set_eee(struct net_device *netdev, struct ethtool_keee *edata)
 {
 	struct e1000_adapter *adapter = netdev_priv(netdev);
 	struct e1000_hw *hw = &adapter->hw;
-	struct ethtool_eee eee_curr;
+	struct ethtool_keee eee_curr;
 	s32 ret_val;
 
 	ret_val = e1000e_get_eee(netdev, &eee_curr);
diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index c841779713f6..9dfda3c48af3 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -5644,7 +5644,7 @@ static int i40e_get_module_eeprom(struct net_device *netdev,
 	return 0;
 }
 
-static int i40e_get_eee(struct net_device *netdev, struct ethtool_eee *edata)
+static int i40e_get_eee(struct net_device *netdev, struct ethtool_keee *edata)
 {
 	struct i40e_netdev_priv *np = netdev_priv(netdev);
 	struct i40e_aq_get_phy_abilities_resp phy_cfg;
@@ -5682,7 +5682,7 @@ static int i40e_get_eee(struct net_device *netdev, struct ethtool_eee *edata)
 }
 
 static int i40e_is_eee_param_supported(struct net_device *netdev,
-				       struct ethtool_eee *edata)
+				       struct ethtool_keee *edata)
 {
 	struct i40e_netdev_priv *np = netdev_priv(netdev);
 	struct i40e_vsi *vsi = np->vsi;
@@ -5709,7 +5709,7 @@ static int i40e_is_eee_param_supported(struct net_device *netdev,
 	return 0;
 }
 
-static int i40e_set_eee(struct net_device *netdev, struct ethtool_eee *edata)
+static int i40e_set_eee(struct net_device *netdev, struct ethtool_keee *edata)
 {
 	struct i40e_netdev_priv *np = netdev_priv(netdev);
 	struct i40e_aq_get_phy_abilities_resp abilities;
diff --git a/drivers/net/ethernet/intel/igb/igb_ethtool.c b/drivers/net/ethernet/intel/igb/igb_ethtool.c
index b66199c9bb3a..778d1e6cfc45 100644
--- a/drivers/net/ethernet/intel/igb/igb_ethtool.c
+++ b/drivers/net/ethernet/intel/igb/igb_ethtool.c
@@ -3027,7 +3027,7 @@ static int igb_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd)
 	return ret;
 }
 
-static int igb_get_eee(struct net_device *netdev, struct ethtool_eee *edata)
+static int igb_get_eee(struct net_device *netdev, struct ethtool_keee *edata)
 {
 	struct igb_adapter *adapter = netdev_priv(netdev);
 	struct e1000_hw *hw = &adapter->hw;
@@ -3106,11 +3106,11 @@ static int igb_get_eee(struct net_device *netdev, struct ethtool_eee *edata)
 }
 
 static int igb_set_eee(struct net_device *netdev,
-		       struct ethtool_eee *edata)
+		       struct ethtool_keee *edata)
 {
 	struct igb_adapter *adapter = netdev_priv(netdev);
 	struct e1000_hw *hw = &adapter->hw;
-	struct ethtool_eee eee_curr;
+	struct ethtool_keee eee_curr;
 	bool adv1g_eee = true, adv100m_eee = true;
 	s32 ret_val;
 
@@ -3118,7 +3118,7 @@ static int igb_set_eee(struct net_device *netdev,
 	    (hw->phy.media_type != e1000_media_type_copper))
 		return -EOPNOTSUPP;
 
-	memset(&eee_curr, 0, sizeof(struct ethtool_eee));
+	memset(&eee_curr, 0, sizeof(struct ethtool_keee));
 
 	ret_val = igb_get_eee(netdev, &eee_curr);
 	if (ret_val)
diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h
index 45430e246e9c..75f7c5ba65e0 100644
--- a/drivers/net/ethernet/intel/igc/igc.h
+++ b/drivers/net/ethernet/intel/igc/igc.h
@@ -168,7 +168,7 @@ struct igc_ring {
 struct igc_adapter {
 	struct net_device *netdev;
 
-	struct ethtool_eee eee;
+	struct ethtool_keee eee;
 	u16 eee_advert;
 
 	unsigned long state;
diff --git a/drivers/net/ethernet/intel/igc/igc_ethtool.c b/drivers/net/ethernet/intel/igc/igc_ethtool.c
index b95d2c86e803..f2dcfe920f4f 100644
--- a/drivers/net/ethernet/intel/igc/igc_ethtool.c
+++ b/drivers/net/ethernet/intel/igc/igc_ethtool.c
@@ -1623,7 +1623,7 @@ static int igc_ethtool_set_priv_flags(struct net_device *netdev, u32 priv_flags)
 }
 
 static int igc_ethtool_get_eee(struct net_device *netdev,
-			       struct ethtool_eee *edata)
+			       struct ethtool_keee *edata)
 {
 	struct igc_adapter *adapter = netdev_priv(netdev);
 	struct igc_hw *hw = &adapter->hw;
@@ -1664,14 +1664,14 @@ static int igc_ethtool_get_eee(struct net_device *netdev,
 }
 
 static int igc_ethtool_set_eee(struct net_device *netdev,
-			       struct ethtool_eee *edata)
+			       struct ethtool_keee *edata)
 {
 	struct igc_adapter *adapter = netdev_priv(netdev);
 	struct igc_hw *hw = &adapter->hw;
-	struct ethtool_eee eee_curr;
+	struct ethtool_keee eee_curr;
 	s32 ret_val;
 
-	memset(&eee_curr, 0, sizeof(struct ethtool_eee));
+	memset(&eee_curr, 0, sizeof(struct ethtool_keee));
 
 	ret_val = igc_ethtool_get_eee(netdev, &eee_curr);
 	if (ret_val) {
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
index 9a63457712c7..0aa73519a29d 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
@@ -3425,7 +3425,7 @@ static const struct {
 };
 
 static int
-ixgbe_get_eee_fw(struct ixgbe_adapter *adapter, struct ethtool_eee *edata)
+ixgbe_get_eee_fw(struct ixgbe_adapter *adapter, struct ethtool_keee *edata)
 {
 	u32 info[FW_PHY_ACT_DATA_COUNT] = { 0 };
 	struct ixgbe_hw *hw = &adapter->hw;
@@ -3462,7 +3462,7 @@ ixgbe_get_eee_fw(struct ixgbe_adapter *adapter, struct ethtool_eee *edata)
 	return 0;
 }
 
-static int ixgbe_get_eee(struct net_device *netdev, struct ethtool_eee *edata)
+static int ixgbe_get_eee(struct net_device *netdev, struct ethtool_keee *edata)
 {
 	struct ixgbe_adapter *adapter = netdev_priv(netdev);
 	struct ixgbe_hw *hw = &adapter->hw;
@@ -3476,17 +3476,17 @@ static int ixgbe_get_eee(struct net_device *netdev, struct ethtool_eee *edata)
 	return -EOPNOTSUPP;
 }
 
-static int ixgbe_set_eee(struct net_device *netdev, struct ethtool_eee *edata)
+static int ixgbe_set_eee(struct net_device *netdev, struct ethtool_keee *edata)
 {
 	struct ixgbe_adapter *adapter = netdev_priv(netdev);
 	struct ixgbe_hw *hw = &adapter->hw;
-	struct ethtool_eee eee_data;
+	struct ethtool_keee eee_data;
 	s32 ret_val;
 
 	if (!(adapter->flags2 & IXGBE_FLAG2_EEE_CAPABLE))
 		return -EOPNOTSUPP;
 
-	memset(&eee_data, 0, sizeof(struct ethtool_eee));
+	memset(&eee_data, 0, sizeof(struct ethtool_keee));
 
 	ret_val = ixgbe_get_eee(netdev, &eee_data);
 	if (ret_val)
diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c
index a641b3534ca3..40a5f1431e4e 100644
--- a/drivers/net/ethernet/marvell/mvneta.c
+++ b/drivers/net/ethernet/marvell/mvneta.c
@@ -5097,7 +5097,7 @@ static int mvneta_ethtool_set_wol(struct net_device *dev,
 }
 
 static int mvneta_ethtool_get_eee(struct net_device *dev,
-				  struct ethtool_eee *eee)
+				  struct ethtool_keee *eee)
 {
 	struct mvneta_port *pp = netdev_priv(dev);
 	u32 lpi_ctl0;
@@ -5113,7 +5113,7 @@ static int mvneta_ethtool_get_eee(struct net_device *dev,
 }
 
 static int mvneta_ethtool_set_eee(struct net_device *dev,
-				  struct ethtool_eee *eee)
+				  struct ethtool_keee *eee)
 {
 	struct mvneta_port *pp = netdev_priv(dev);
 	u32 lpi_ctl0;
diff --git a/drivers/net/ethernet/microchip/lan743x_ethtool.c b/drivers/net/ethernet/microchip/lan743x_ethtool.c
index a2b3f4433ca8..8a6ae171e375 100644
--- a/drivers/net/ethernet/microchip/lan743x_ethtool.c
+++ b/drivers/net/ethernet/microchip/lan743x_ethtool.c
@@ -1055,7 +1055,7 @@ static int lan743x_ethtool_get_ts_info(struct net_device *netdev,
 }
 
 static int lan743x_ethtool_get_eee(struct net_device *netdev,
-				   struct ethtool_eee *eee)
+				   struct ethtool_keee *eee)
 {
 	struct lan743x_adapter *adapter = netdev_priv(netdev);
 	struct phy_device *phydev = netdev->phydev;
@@ -1092,7 +1092,7 @@ static int lan743x_ethtool_get_eee(struct net_device *netdev,
 }
 
 static int lan743x_ethtool_set_eee(struct net_device *netdev,
-				   struct ethtool_eee *eee)
+				   struct ethtool_keee *eee)
 {
 	struct lan743x_adapter *adapter;
 	struct phy_device *phydev;
diff --git a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
index 0e240b5ab8d4..77491fb64039 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
@@ -1776,7 +1776,7 @@ static int qede_get_tunable(struct net_device *dev,
 	return 0;
 }
 
-static int qede_get_eee(struct net_device *dev, struct ethtool_eee *edata)
+static int qede_get_eee(struct net_device *dev, struct ethtool_keee *edata)
 {
 	struct qede_dev *edev = netdev_priv(dev);
 	struct qed_link_output current_link;
@@ -1810,7 +1810,7 @@ static int qede_get_eee(struct net_device *dev, struct ethtool_eee *edata)
 	return 0;
 }
 
-static int qede_set_eee(struct net_device *dev, struct ethtool_eee *edata)
+static int qede_set_eee(struct net_device *dev, struct ethtool_keee *edata)
 {
 	struct qede_dev *edev = netdev_priv(dev);
 	struct qed_link_output current_link;
diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c
index dd73df6b17b0..3d30d4499791 100644
--- a/drivers/net/ethernet/realtek/r8169_main.c
+++ b/drivers/net/ethernet/realtek/r8169_main.c
@@ -1974,7 +1974,7 @@ static int rtl_set_coalesce(struct net_device *dev,
 	return 0;
 }
 
-static int rtl8169_get_eee(struct net_device *dev, struct ethtool_eee *data)
+static int rtl8169_get_eee(struct net_device *dev, struct ethtool_keee *data)
 {
 	struct rtl8169_private *tp = netdev_priv(dev);
 
@@ -1984,7 +1984,7 @@ static int rtl8169_get_eee(struct net_device *dev, struct ethtool_eee *data)
 	return phy_ethtool_get_eee(tp->phydev, data);
 }
 
-static int rtl8169_set_eee(struct net_device *dev, struct ethtool_eee *data)
+static int rtl8169_set_eee(struct net_device *dev, struct ethtool_keee *data)
 {
 	struct rtl8169_private *tp = netdev_priv(dev);
 	int ret;
diff --git a/drivers/net/ethernet/samsung/sxgbe/sxgbe_ethtool.c b/drivers/net/ethernet/samsung/sxgbe/sxgbe_ethtool.c
index 8ba017ec9849..d93b628b7046 100644
--- a/drivers/net/ethernet/samsung/sxgbe/sxgbe_ethtool.c
+++ b/drivers/net/ethernet/samsung/sxgbe/sxgbe_ethtool.c
@@ -133,7 +133,7 @@ static const struct sxgbe_stats sxgbe_gstrings_stats[] = {
 #define SXGBE_STATS_LEN ARRAY_SIZE(sxgbe_gstrings_stats)
 
 static int sxgbe_get_eee(struct net_device *dev,
-			 struct ethtool_eee *edata)
+			 struct ethtool_keee *edata)
 {
 	struct sxgbe_priv_data *priv = netdev_priv(dev);
 
@@ -148,7 +148,7 @@ static int sxgbe_get_eee(struct net_device *dev,
 }
 
 static int sxgbe_set_eee(struct net_device *dev,
-			 struct ethtool_eee *edata)
+			 struct ethtool_keee *edata)
 {
 	struct sxgbe_priv_data *priv = netdev_priv(dev);
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
index 42d27b97dd1d..bbecb3b89535 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
@@ -852,7 +852,7 @@ static int stmmac_set_wol(struct net_device *dev, struct ethtool_wolinfo *wol)
 }
 
 static int stmmac_ethtool_op_get_eee(struct net_device *dev,
-				     struct ethtool_eee *edata)
+				     struct ethtool_keee *edata)
 {
 	struct stmmac_priv *priv = netdev_priv(dev);
 
@@ -868,7 +868,7 @@ static int stmmac_ethtool_op_get_eee(struct net_device *dev,
 }
 
 static int stmmac_ethtool_op_set_eee(struct net_device *dev,
-				     struct ethtool_eee *edata)
+				     struct ethtool_keee *edata)
 {
 	struct stmmac_priv *priv = netdev_priv(dev);
 	int ret;
diff --git a/drivers/net/ethernet/ti/am65-cpsw-ethtool.c b/drivers/net/ethernet/ti/am65-cpsw-ethtool.c
index 35fceba01ea4..d6ce2c9f0a8d 100644
--- a/drivers/net/ethernet/ti/am65-cpsw-ethtool.c
+++ b/drivers/net/ethernet/ti/am65-cpsw-ethtool.c
@@ -514,14 +514,14 @@ am65_cpsw_set_link_ksettings(struct net_device *ndev,
 	return phylink_ethtool_ksettings_set(salve->phylink, ecmd);
 }
 
-static int am65_cpsw_get_eee(struct net_device *ndev, struct ethtool_eee *edata)
+static int am65_cpsw_get_eee(struct net_device *ndev, struct ethtool_keee *edata)
 {
 	struct am65_cpsw_slave_data *salve = am65_ndev_to_slave(ndev);
 
 	return phylink_ethtool_get_eee(salve->phylink, edata);
 }
 
-static int am65_cpsw_set_eee(struct net_device *ndev, struct ethtool_eee *edata)
+static int am65_cpsw_set_eee(struct net_device *ndev, struct ethtool_keee *edata)
 {
 	struct am65_cpsw_slave_data *salve = am65_ndev_to_slave(ndev);
 
diff --git a/drivers/net/ethernet/ti/cpsw_ethtool.c b/drivers/net/ethernet/ti/cpsw_ethtool.c
index a557a477d039..f7b283353ba2 100644
--- a/drivers/net/ethernet/ti/cpsw_ethtool.c
+++ b/drivers/net/ethernet/ti/cpsw_ethtool.c
@@ -422,7 +422,7 @@ int cpsw_set_link_ksettings(struct net_device *ndev,
 	return phy_ethtool_ksettings_set(cpsw->slaves[slave_no].phy, ecmd);
 }
 
-int cpsw_get_eee(struct net_device *ndev, struct ethtool_eee *edata)
+int cpsw_get_eee(struct net_device *ndev, struct ethtool_keee *edata)
 {
 	struct cpsw_priv *priv = netdev_priv(ndev);
 	struct cpsw_common *cpsw = priv->cpsw;
@@ -434,7 +434,7 @@ int cpsw_get_eee(struct net_device *ndev, struct ethtool_eee *edata)
 		return -EOPNOTSUPP;
 }
 
-int cpsw_set_eee(struct net_device *ndev, struct ethtool_eee *edata)
+int cpsw_set_eee(struct net_device *ndev, struct ethtool_keee *edata)
 {
 	struct cpsw_priv *priv = netdev_priv(ndev);
 	struct cpsw_common *cpsw = priv->cpsw;
diff --git a/drivers/net/ethernet/ti/cpsw_priv.h b/drivers/net/ethernet/ti/cpsw_priv.h
index 0e27c433098d..7efa72502c86 100644
--- a/drivers/net/ethernet/ti/cpsw_priv.h
+++ b/drivers/net/ethernet/ti/cpsw_priv.h
@@ -496,8 +496,8 @@ int cpsw_get_link_ksettings(struct net_device *ndev,
 			    struct ethtool_link_ksettings *ecmd);
 int cpsw_set_link_ksettings(struct net_device *ndev,
 			    const struct ethtool_link_ksettings *ecmd);
-int cpsw_get_eee(struct net_device *ndev, struct ethtool_eee *edata);
-int cpsw_set_eee(struct net_device *ndev, struct ethtool_eee *edata);
+int cpsw_get_eee(struct net_device *ndev, struct ethtool_keee *edata);
+int cpsw_set_eee(struct net_device *ndev, struct ethtool_keee *edata);
 int cpsw_nway_reset(struct net_device *ndev);
 void cpsw_get_ringparam(struct net_device *ndev,
 			struct ethtool_ringparam *ering,
diff --git a/drivers/net/ethernet/ti/icssg/icssg_ethtool.c b/drivers/net/ethernet/ti/icssg/icssg_ethtool.c
index a27ec1dcc8d5..9a7dd7efcf69 100644
--- a/drivers/net/ethernet/ti/icssg/icssg_ethtool.c
+++ b/drivers/net/ethernet/ti/icssg/icssg_ethtool.c
@@ -45,7 +45,7 @@ static int emac_set_link_ksettings(struct net_device *ndev,
 	return phy_ethtool_set_link_ksettings(ndev, ecmd);
 }
 
-static int emac_get_eee(struct net_device *ndev, struct ethtool_eee *edata)
+static int emac_get_eee(struct net_device *ndev, struct ethtool_keee *edata)
 {
 	if (!ndev->phydev)
 		return -EOPNOTSUPP;
@@ -53,7 +53,7 @@ static int emac_get_eee(struct net_device *ndev, struct ethtool_eee *edata)
 	return phy_ethtool_get_eee(ndev->phydev, edata);
 }
 
-static int emac_set_eee(struct net_device *ndev, struct ethtool_eee *edata)
+static int emac_set_eee(struct net_device *ndev, struct ethtool_keee *edata)
 {
 	if (!ndev->phydev)
 		return -EOPNOTSUPP;
diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c
index eba652a4c1d8..1faa22f58366 100644
--- a/drivers/net/phy/marvell.c
+++ b/drivers/net/phy/marvell.c
@@ -1461,7 +1461,7 @@ static int m88e1540_get_fld(struct phy_device *phydev, u8 *msecs)
 
 static int m88e1540_set_fld(struct phy_device *phydev, const u8 *msecs)
 {
-	struct ethtool_eee eee;
+	struct ethtool_keee eee;
 	int val, ret;
 
 	if (*msecs == ETHTOOL_PHY_FAST_LINK_DOWN_OFF)
diff --git a/drivers/net/phy/phy-c45.c b/drivers/net/phy/phy-c45.c
index 747d14bf152c..adee5e712871 100644
--- a/drivers/net/phy/phy-c45.c
+++ b/drivers/net/phy/phy-c45.c
@@ -1443,13 +1443,13 @@ EXPORT_SYMBOL(genphy_c45_eee_is_active);
 /**
  * genphy_c45_ethtool_get_eee - get EEE supported and status
  * @phydev: target phy_device struct
- * @data: ethtool_eee data
+ * @data: ethtool_keee data
  *
  * Description: it reports the Supported/Advertisement/LP Advertisement
  * capabilities.
  */
 int genphy_c45_ethtool_get_eee(struct phy_device *phydev,
-			       struct ethtool_eee *data)
+			       struct ethtool_keee *data)
 {
 	__ETHTOOL_DECLARE_LINK_MODE_MASK(adv) = {};
 	__ETHTOOL_DECLARE_LINK_MODE_MASK(lp) = {};
@@ -1481,7 +1481,7 @@ EXPORT_SYMBOL(genphy_c45_ethtool_get_eee);
 /**
  * genphy_c45_ethtool_set_eee - set EEE supported and status
  * @phydev: target phy_device struct
- * @data: ethtool_eee data
+ * @data: ethtool_keee data
  *
  * Description: sets the Supported/Advertisement/LP Advertisement
  * capabilities. If eee_enabled is false, no links modes are
@@ -1490,7 +1490,7 @@ EXPORT_SYMBOL(genphy_c45_ethtool_get_eee);
  * non-destructive way.
  */
 int genphy_c45_ethtool_set_eee(struct phy_device *phydev,
-			       struct ethtool_eee *data)
+			       struct ethtool_keee *data)
 {
 	int ret;
 
diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 3376e58e2b88..3b9531143be1 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -1632,12 +1632,12 @@ EXPORT_SYMBOL(phy_get_eee_err);
 /**
  * phy_ethtool_get_eee - get EEE supported and status
  * @phydev: target phy_device struct
- * @data: ethtool_eee data
+ * @data: ethtool_keee data
  *
  * Description: it reportes the Supported/Advertisement/LP Advertisement
  * capabilities.
  */
-int phy_ethtool_get_eee(struct phy_device *phydev, struct ethtool_eee *data)
+int phy_ethtool_get_eee(struct phy_device *phydev, struct ethtool_keee *data)
 {
 	int ret;
 
@@ -1655,11 +1655,11 @@ EXPORT_SYMBOL(phy_ethtool_get_eee);
 /**
  * phy_ethtool_set_eee - set EEE supported and status
  * @phydev: target phy_device struct
- * @data: ethtool_eee data
+ * @data: ethtool_keee data
  *
  * Description: it is to program the Advertisement EEE register.
  */
-int phy_ethtool_set_eee(struct phy_device *phydev, struct ethtool_eee *data)
+int phy_ethtool_set_eee(struct phy_device *phydev, struct ethtool_keee *data)
 {
 	int ret;
 
diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index ed0b4ccaa6a6..503fd7c40523 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -2764,9 +2764,9 @@ EXPORT_SYMBOL_GPL(phylink_init_eee);
 /**
  * phylink_ethtool_get_eee() - read the energy efficient ethernet parameters
  * @pl: a pointer to a &struct phylink returned from phylink_create()
- * @eee: a pointer to a &struct ethtool_eee for the read parameters
+ * @eee: a pointer to a &struct ethtool_keee for the read parameters
  */
-int phylink_ethtool_get_eee(struct phylink *pl, struct ethtool_eee *eee)
+int phylink_ethtool_get_eee(struct phylink *pl, struct ethtool_keee *eee)
 {
 	int ret = -EOPNOTSUPP;
 
@@ -2782,9 +2782,9 @@ EXPORT_SYMBOL_GPL(phylink_ethtool_get_eee);
 /**
  * phylink_ethtool_set_eee() - set the energy efficient ethernet parameters
  * @pl: a pointer to a &struct phylink returned from phylink_create()
- * @eee: a pointer to a &struct ethtool_eee for the desired parameters
+ * @eee: a pointer to a &struct ethtool_keee for the desired parameters
  */
-int phylink_ethtool_set_eee(struct phylink *pl, struct ethtool_eee *eee)
+int phylink_ethtool_set_eee(struct phylink *pl, struct ethtool_keee *eee)
 {
 	int ret = -EOPNOTSUPP;
 
diff --git a/drivers/net/usb/ax88179_178a.c b/drivers/net/usb/ax88179_178a.c
index d837c1887416..3922a9afdd1f 100644
--- a/drivers/net/usb/ax88179_178a.c
+++ b/drivers/net/usb/ax88179_178a.c
@@ -667,7 +667,7 @@ static int ax88179_set_link_ksettings(struct net_device *net,
 }
 
 static int
-ax88179_ethtool_get_eee(struct usbnet *dev, struct ethtool_eee *data)
+ax88179_ethtool_get_eee(struct usbnet *dev, struct ethtool_keee *data)
 {
 	int val;
 
@@ -696,7 +696,7 @@ ax88179_ethtool_get_eee(struct usbnet *dev, struct ethtool_eee *data)
 }
 
 static int
-ax88179_ethtool_set_eee(struct usbnet *dev, struct ethtool_eee *data)
+ax88179_ethtool_set_eee(struct usbnet *dev, struct ethtool_keee *data)
 {
 	u16 tmp16 = ethtool_adv_to_mmd_eee_adv_t(data->advertised);
 
@@ -807,7 +807,7 @@ static void ax88179_enable_eee(struct usbnet *dev)
 			  GMII_PHY_PAGE_SELECT, 2, &tmp16);
 }
 
-static int ax88179_get_eee(struct net_device *net, struct ethtool_eee *edata)
+static int ax88179_get_eee(struct net_device *net, struct ethtool_keee *edata)
 {
 	struct usbnet *dev = netdev_priv(net);
 	struct ax88179_data *priv = dev->driver_priv;
@@ -818,7 +818,7 @@ static int ax88179_get_eee(struct net_device *net, struct ethtool_eee *edata)
 	return ax88179_ethtool_get_eee(dev, edata);
 }
 
-static int ax88179_set_eee(struct net_device *net, struct ethtool_eee *edata)
+static int ax88179_set_eee(struct net_device *net, struct ethtool_keee *edata)
 {
 	struct usbnet *dev = netdev_priv(net);
 	struct ax88179_data *priv = dev->driver_priv;
@@ -1587,7 +1587,7 @@ static int ax88179_reset(struct usbnet *dev)
 	u16 *tmp16;
 	u8 *tmp;
 	struct ax88179_data *ax179_data = dev->driver_priv;
-	struct ethtool_eee eee_data;
+	struct ethtool_keee eee_data;
 
 	tmp16 = (u16 *)buf;
 	tmp = (u8 *)buf;
diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c
index a6d653ff552a..106282612bc2 100644
--- a/drivers/net/usb/lan78xx.c
+++ b/drivers/net/usb/lan78xx.c
@@ -1673,7 +1673,7 @@ static int lan78xx_set_wol(struct net_device *netdev,
 	return ret;
 }
 
-static int lan78xx_get_eee(struct net_device *net, struct ethtool_eee *edata)
+static int lan78xx_get_eee(struct net_device *net, struct ethtool_keee *edata)
 {
 	struct lan78xx_net *dev = netdev_priv(net);
 	struct phy_device *phydev = net->phydev;
@@ -1709,7 +1709,7 @@ exit:
 	return ret;
 }
 
-static int lan78xx_set_eee(struct net_device *net, struct ethtool_eee *edata)
+static int lan78xx_set_eee(struct net_device *net, struct ethtool_keee *edata)
 {
 	struct lan78xx_net *dev = netdev_priv(net);
 	int ret;
diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c
index 0d0672d2a654..dc163b766a89 100644
--- a/drivers/net/usb/r8152.c
+++ b/drivers/net/usb/r8152.c
@@ -891,8 +891,8 @@ struct r8152 {
 		void (*up)(struct r8152 *tp);
 		void (*down)(struct r8152 *tp);
 		void (*unload)(struct r8152 *tp);
-		int (*eee_get)(struct r8152 *tp, struct ethtool_eee *eee);
-		int (*eee_set)(struct r8152 *tp, struct ethtool_eee *eee);
+		int (*eee_get)(struct r8152 *tp, struct ethtool_keee *eee);
+		int (*eee_set)(struct r8152 *tp, struct ethtool_keee *eee);
 		bool (*in_nway)(struct r8152 *tp);
 		void (*hw_phy_cfg)(struct r8152 *tp);
 		void (*autosuspend_en)(struct r8152 *tp, bool enable);
@@ -8922,7 +8922,7 @@ static void rtl8152_get_strings(struct net_device *dev, u32 stringset, u8 *data)
 	}
 }
 
-static int r8152_get_eee(struct r8152 *tp, struct ethtool_eee *eee)
+static int r8152_get_eee(struct r8152 *tp, struct ethtool_keee *eee)
 {
 	u32 lp, adv, supported = 0;
 	u16 val;
@@ -8945,7 +8945,7 @@ static int r8152_get_eee(struct r8152 *tp, struct ethtool_eee *eee)
 	return 0;
 }
 
-static int r8152_set_eee(struct r8152 *tp, struct ethtool_eee *eee)
+static int r8152_set_eee(struct r8152 *tp, struct ethtool_keee *eee)
 {
 	u16 val = ethtool_adv_to_mmd_eee_adv_t(eee->advertised);
 
@@ -8957,7 +8957,7 @@ static int r8152_set_eee(struct r8152 *tp, struct ethtool_eee *eee)
 	return 0;
 }
 
-static int r8153_get_eee(struct r8152 *tp, struct ethtool_eee *eee)
+static int r8153_get_eee(struct r8152 *tp, struct ethtool_keee *eee)
 {
 	u32 lp, adv, supported = 0;
 	u16 val;
@@ -8981,7 +8981,7 @@ static int r8153_get_eee(struct r8152 *tp, struct ethtool_eee *eee)
 }
 
 static int
-rtl_ethtool_get_eee(struct net_device *net, struct ethtool_eee *edata)
+rtl_ethtool_get_eee(struct net_device *net, struct ethtool_keee *edata)
 {
 	struct r8152 *tp = netdev_priv(net);
 	int ret;
@@ -9008,7 +9008,7 @@ out:
 }
 
 static int
-rtl_ethtool_set_eee(struct net_device *net, struct ethtool_eee *edata)
+rtl_ethtool_set_eee(struct net_device *net, struct ethtool_keee *edata)
 {
 	struct r8152 *tp = netdev_priv(net);
 	int ret;
diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 325e0778e937..a850bab8489f 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -222,6 +222,18 @@ extern int
 __ethtool_get_link_ksettings(struct net_device *dev,
 			     struct ethtool_link_ksettings *link_ksettings);
 
+struct ethtool_keee {
+	u32	cmd;
+	u32	supported;
+	u32	advertised;
+	u32	lp_advertised;
+	u32	eee_active;
+	u32	eee_enabled;
+	u32	tx_lpi_enabled;
+	u32	tx_lpi_timer;
+	u32	reserved[2];
+};
+
 struct kernel_ethtool_coalesce {
 	u8 use_cqe_mode_tx;
 	u8 use_cqe_mode_rx;
@@ -892,8 +904,8 @@ struct ethtool_ops {
 				   struct ethtool_modinfo *);
 	int     (*get_module_eeprom)(struct net_device *,
 				     struct ethtool_eeprom *, u8 *);
-	int	(*get_eee)(struct net_device *, struct ethtool_eee *);
-	int	(*set_eee)(struct net_device *, struct ethtool_eee *);
+	int	(*get_eee)(struct net_device *dev, struct ethtool_keee *eee);
+	int	(*set_eee)(struct net_device *dev, struct ethtool_keee *eee);
 	int	(*get_tunable)(struct net_device *,
 			       const struct ethtool_tunable *, void *);
 	int	(*set_tunable)(struct net_device *,
diff --git a/include/linux/phy.h b/include/linux/phy.h
index c9994a59ca2e..a66f07d3f5f4 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1908,9 +1908,9 @@ int genphy_c45_plca_get_status(struct phy_device *phydev,
 int genphy_c45_eee_is_active(struct phy_device *phydev, unsigned long *adv,
 			     unsigned long *lp, bool *is_enabled);
 int genphy_c45_ethtool_get_eee(struct phy_device *phydev,
-			       struct ethtool_eee *data);
+			       struct ethtool_keee *data);
 int genphy_c45_ethtool_set_eee(struct phy_device *phydev,
-			       struct ethtool_eee *data);
+			       struct ethtool_keee *data);
 int genphy_c45_write_eee_adv(struct phy_device *phydev, unsigned long *adv);
 int genphy_c45_an_config_eee_aneg(struct phy_device *phydev);
 int genphy_c45_read_eee_adv(struct phy_device *phydev, unsigned long *adv);
@@ -1988,8 +1988,8 @@ int phy_unregister_fixup_for_uid(u32 phy_uid, u32 phy_uid_mask);
 
 int phy_init_eee(struct phy_device *phydev, bool clk_stop_enable);
 int phy_get_eee_err(struct phy_device *phydev);
-int phy_ethtool_set_eee(struct phy_device *phydev, struct ethtool_eee *data);
-int phy_ethtool_get_eee(struct phy_device *phydev, struct ethtool_eee *data);
+int phy_ethtool_set_eee(struct phy_device *phydev, struct ethtool_keee *data);
+int phy_ethtool_get_eee(struct phy_device *phydev, struct ethtool_keee *data);
 int phy_ethtool_set_wol(struct phy_device *phydev, struct ethtool_wolinfo *wol);
 void phy_ethtool_get_wol(struct phy_device *phydev,
 			 struct ethtool_wolinfo *wol);
diff --git a/include/linux/phylink.h b/include/linux/phylink.h
index d589f89c612c..6ba411732a0d 100644
--- a/include/linux/phylink.h
+++ b/include/linux/phylink.h
@@ -584,8 +584,8 @@ int phylink_ethtool_set_pauseparam(struct phylink *,
 				   struct ethtool_pauseparam *);
 int phylink_get_eee_err(struct phylink *);
 int phylink_init_eee(struct phylink *, bool);
-int phylink_ethtool_get_eee(struct phylink *, struct ethtool_eee *);
-int phylink_ethtool_set_eee(struct phylink *, struct ethtool_eee *);
+int phylink_ethtool_get_eee(struct phylink *link, struct ethtool_keee *eee);
+int phylink_ethtool_set_eee(struct phylink *link, struct ethtool_keee *eee);
 int phylink_mii_ioctl(struct phylink *, struct ifreq *, int);
 int phylink_speed_down(struct phylink *pl, bool sync);
 int phylink_speed_up(struct phylink *pl);
diff --git a/include/net/dsa.h b/include/net/dsa.h
index 82135fbdb1e6..7c0da9effe4e 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -991,9 +991,9 @@ struct dsa_switch_ops {
 	 * Port's MAC EEE settings
 	 */
 	int	(*set_mac_eee)(struct dsa_switch *ds, int port,
-			       struct ethtool_eee *e);
+			       struct ethtool_keee *e);
 	int	(*get_mac_eee)(struct dsa_switch *ds, int port,
-			       struct ethtool_eee *e);
+			       struct ethtool_keee *e);
 
 	/* EEPROM access */
 	int	(*get_eeprom_len)(struct dsa_switch *ds);
diff --git a/net/dsa/user.c b/net/dsa/user.c
index b15e71cc342c..e03da3a4ac38 100644
--- a/net/dsa/user.c
+++ b/net/dsa/user.c
@@ -1222,7 +1222,7 @@ static int dsa_user_set_wol(struct net_device *dev, struct ethtool_wolinfo *w)
 	return ret;
 }
 
-static int dsa_user_set_eee(struct net_device *dev, struct ethtool_eee *e)
+static int dsa_user_set_eee(struct net_device *dev, struct ethtool_keee *e)
 {
 	struct dsa_port *dp = dsa_user_to_port(dev);
 	struct dsa_switch *ds = dp->ds;
@@ -1242,7 +1242,7 @@ static int dsa_user_set_eee(struct net_device *dev, struct ethtool_eee *e)
 	return phylink_ethtool_set_eee(dp->pl, e);
 }
 
-static int dsa_user_get_eee(struct net_device *dev, struct ethtool_eee *e)
+static int dsa_user_get_eee(struct net_device *dev, struct ethtool_keee *e)
 {
 	struct dsa_port *dp = dsa_user_to_port(dev);
 	struct dsa_switch *ds = dp->ds;
diff --git a/net/ethtool/eee.c b/net/ethtool/eee.c
index 2853394d06a8..21b0e845a531 100644
--- a/net/ethtool/eee.c
+++ b/net/ethtool/eee.c
@@ -5,7 +5,7 @@
 #include "bitset.h"
 
 #define EEE_MODES_COUNT \
-	(sizeof_field(struct ethtool_eee, supported) * BITS_PER_BYTE)
+	(sizeof_field(struct ethtool_keee, supported) * BITS_PER_BYTE)
 
 struct eee_req_info {
 	struct ethnl_req_info		base;
@@ -13,7 +13,7 @@ struct eee_req_info {
 
 struct eee_reply_data {
 	struct ethnl_reply_data		base;
-	struct ethtool_eee		eee;
+	struct ethtool_keee		eee;
 };
 
 #define EEE_REPDATA(__reply_base) \
@@ -48,7 +48,7 @@ static int eee_reply_size(const struct ethnl_req_info *req_base,
 {
 	bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
 	const struct eee_reply_data *data = EEE_REPDATA(reply_base);
-	const struct ethtool_eee *eee = &data->eee;
+	const struct ethtool_keee *eee = &data->eee;
 	int len = 0;
 	int ret;
 
@@ -84,7 +84,7 @@ static int eee_fill_reply(struct sk_buff *skb,
 {
 	bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
 	const struct eee_reply_data *data = EEE_REPDATA(reply_base);
-	const struct ethtool_eee *eee = &data->eee;
+	const struct ethtool_keee *eee = &data->eee;
 	int ret;
 
 	ret = ethnl_put_bitset32(skb, ETHTOOL_A_EEE_MODES_OURS,
@@ -132,7 +132,7 @@ ethnl_set_eee(struct ethnl_req_info *req_info, struct genl_info *info)
 {
 	struct net_device *dev = req_info->dev;
 	struct nlattr **tb = info->attrs;
-	struct ethtool_eee eee = {};
+	struct ethtool_keee eee = {};
 	bool mod = false;
 	int ret;
 
diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c
index 7519b0818b91..b02ca72f438e 100644
--- a/net/ethtool/ioctl.c
+++ b/net/ethtool/ioctl.c
@@ -1510,13 +1510,13 @@ static int ethtool_set_wol(struct net_device *dev, char __user *useraddr)
 
 static int ethtool_get_eee(struct net_device *dev, char __user *useraddr)
 {
-	struct ethtool_eee edata;
+	struct ethtool_keee edata;
 	int rc;
 
 	if (!dev->ethtool_ops->get_eee)
 		return -EOPNOTSUPP;
 
-	memset(&edata, 0, sizeof(struct ethtool_eee));
+	memset(&edata, 0, sizeof(struct ethtool_keee));
 	edata.cmd = ETHTOOL_GEEE;
 	rc = dev->ethtool_ops->get_eee(dev, &edata);
 
@@ -1531,7 +1531,7 @@ static int ethtool_get_eee(struct net_device *dev, char __user *useraddr)
 
 static int ethtool_set_eee(struct net_device *dev, char __user *useraddr)
 {
-	struct ethtool_eee edata;
+	struct ethtool_keee edata;
 	int ret;
 
 	if (!dev->ethtool_ops->set_eee)
-- 
cgit v1.2.3


From 285cc15cc555b4f05ebf2556bc6e85a6d36b790a Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Sat, 27 Jan 2024 14:26:50 +0100
Subject: ethtool: adjust struct ethtool_keee to kernel needs

This patch changes the following in struct ethtool_keee
- remove member cmd, it's not needed on kernel side
- remove reserved fields
- switch the semantically boolean members to type bool

We don't have to change any user of the boolean members due to the
implicit casting from/to bool. A small change is needed where a
pointer to bool members is used, in addition remove few now unneeded
double negations.

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h |  8 +++-----
 net/ethtool/eee.c       | 12 ++++++------
 2 files changed, 9 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index a850bab8489f..14549cb9e2b2 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -223,15 +223,13 @@ __ethtool_get_link_ksettings(struct net_device *dev,
 			     struct ethtool_link_ksettings *link_ksettings);
 
 struct ethtool_keee {
-	u32	cmd;
 	u32	supported;
 	u32	advertised;
 	u32	lp_advertised;
-	u32	eee_active;
-	u32	eee_enabled;
-	u32	tx_lpi_enabled;
 	u32	tx_lpi_timer;
-	u32	reserved[2];
+	bool	tx_lpi_enabled;
+	bool	eee_active;
+	bool	eee_enabled;
 };
 
 struct kernel_ethtool_coalesce {
diff --git a/net/ethtool/eee.c b/net/ethtool/eee.c
index 21b0e845a531..ac9f694ffe20 100644
--- a/net/ethtool/eee.c
+++ b/net/ethtool/eee.c
@@ -98,10 +98,10 @@ static int eee_fill_reply(struct sk_buff *skb,
 	if (ret < 0)
 		return ret;
 
-	if (nla_put_u8(skb, ETHTOOL_A_EEE_ACTIVE, !!eee->eee_active) ||
-	    nla_put_u8(skb, ETHTOOL_A_EEE_ENABLED, !!eee->eee_enabled) ||
+	if (nla_put_u8(skb, ETHTOOL_A_EEE_ACTIVE, eee->eee_active) ||
+	    nla_put_u8(skb, ETHTOOL_A_EEE_ENABLED, eee->eee_enabled) ||
 	    nla_put_u8(skb, ETHTOOL_A_EEE_TX_LPI_ENABLED,
-		       !!eee->tx_lpi_enabled) ||
+		       eee->tx_lpi_enabled) ||
 	    nla_put_u32(skb, ETHTOOL_A_EEE_TX_LPI_TIMER, eee->tx_lpi_timer))
 		return -EMSGSIZE;
 
@@ -145,9 +145,9 @@ ethnl_set_eee(struct ethnl_req_info *req_info, struct genl_info *info)
 				    link_mode_names, info->extack, &mod);
 	if (ret < 0)
 		return ret;
-	ethnl_update_bool32(&eee.eee_enabled, tb[ETHTOOL_A_EEE_ENABLED], &mod);
-	ethnl_update_bool32(&eee.tx_lpi_enabled,
-			    tb[ETHTOOL_A_EEE_TX_LPI_ENABLED], &mod);
+	ethnl_update_bool(&eee.eee_enabled, tb[ETHTOOL_A_EEE_ENABLED], &mod);
+	ethnl_update_bool(&eee.tx_lpi_enabled, tb[ETHTOOL_A_EEE_TX_LPI_ENABLED],
+			  &mod);
 	ethnl_update_u32(&eee.tx_lpi_timer, tb[ETHTOOL_A_EEE_TX_LPI_TIMER],
 			 &mod);
 	if (!mod)
-- 
cgit v1.2.3


From 1d756ff13da6a2222ac4387511f2a0e2e83ce670 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Sat, 27 Jan 2024 14:28:47 +0100
Subject: ethtool: add suffix _u32 to legacy bitmap members of struct
 ethtool_keee

This is in preparation of using the existing names for linkmode
bitmaps.

Suggested-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/aquantia/atlantic/aq_ethtool.c    |  8 +++----
 .../net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c    |  8 +++----
 drivers/net/ethernet/broadcom/bnxt/bnxt.c          | 12 +++++-----
 drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c  | 14 +++++------
 drivers/net/ethernet/broadcom/tg3.c                | 22 ++++++++---------
 drivers/net/ethernet/intel/e1000e/ethtool.c        | 10 ++++----
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c     | 10 ++++----
 drivers/net/ethernet/intel/igb/igb_ethtool.c       | 20 ++++++++--------
 drivers/net/ethernet/intel/igc/igc_ethtool.c       | 12 +++++-----
 drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c   | 18 +++++++-------
 drivers/net/ethernet/qlogic/qede/qede_ethtool.c    | 28 +++++++++++-----------
 drivers/net/phy/phy-c45.c                          | 12 +++++-----
 drivers/net/usb/ax88179_178a.c                     | 10 ++++----
 drivers/net/usb/r8152.c                            | 14 +++++------
 include/linux/ethtool.h                            |  6 ++---
 net/ethtool/eee.c                                  | 16 ++++++-------
 net/ethtool/ioctl.c                                | 12 +++++-----
 17 files changed, 116 insertions(+), 116 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c b/drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c
index be865776de55..0bd1a0a1ae6a 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c
@@ -713,14 +713,14 @@ static int aq_ethtool_get_eee(struct net_device *ndev, struct ethtool_keee *eee)
 	if (err < 0)
 		return err;
 
-	eee->supported = eee_mask_to_ethtool_mask(supported_rates);
+	eee->supported_u32 = eee_mask_to_ethtool_mask(supported_rates);
 
 	if (aq_nic->aq_nic_cfg.eee_speeds)
-		eee->advertised = eee->supported;
+		eee->advertised_u32 = eee->supported_u32;
 
-	eee->lp_advertised = eee_mask_to_ethtool_mask(rate);
+	eee->lp_advertised_u32 = eee_mask_to_ethtool_mask(rate);
 
-	eee->eee_enabled = !!eee->advertised;
+	eee->eee_enabled = !!eee->advertised_u32;
 
 	eee->tx_lpi_enabled = eee->eee_enabled;
 	if ((supported_rates & rate) & AQ_NIC_RATE_EEE_MSK)
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c
index 12d2785eeb8a..5f0e1759d078 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c
@@ -2120,14 +2120,14 @@ static int bnx2x_get_eee(struct net_device *dev, struct ethtool_keee *edata)
 
 	eee_cfg = bp->link_vars.eee_status;
 
-	edata->supported =
+	edata->supported_u32 =
 		bnx2x_eee_to_adv((eee_cfg & SHMEM_EEE_SUPPORTED_MASK) >>
 				 SHMEM_EEE_SUPPORTED_SHIFT);
 
-	edata->advertised =
+	edata->advertised_u32 =
 		bnx2x_eee_to_adv((eee_cfg & SHMEM_EEE_ADV_STATUS_MASK) >>
 				 SHMEM_EEE_ADV_STATUS_SHIFT);
-	edata->lp_advertised =
+	edata->lp_advertised_u32 =
 		bnx2x_eee_to_adv((eee_cfg & SHMEM_EEE_LP_ADV_STATUS_MASK) >>
 				 SHMEM_EEE_LP_ADV_STATUS_SHIFT);
 
@@ -2162,7 +2162,7 @@ static int bnx2x_set_eee(struct net_device *dev, struct ethtool_keee *edata)
 		return -EOPNOTSUPP;
 	}
 
-	advertised = bnx2x_adv_to_eee(edata->advertised,
+	advertised = bnx2x_adv_to_eee(edata->advertised_u32,
 				      SHMEM_EEE_ADV_STATUS_SHIFT);
 	if ((advertised != (eee_cfg & SHMEM_EEE_ADV_STATUS_MASK))) {
 		DP(BNX2X_MSG_ETHTOOL,
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index d7626c26f9a9..fde32b32fa81 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -10624,7 +10624,7 @@ static int bnxt_hwrm_phy_qcaps(struct bnxt *bp)
 		struct ethtool_keee *eee = &bp->eee;
 		u16 fw_speeds = le16_to_cpu(resp->supported_speeds_eee_mode);
 
-		eee->supported = _bnxt_fw_to_ethtool_adv_spds(fw_speeds, 0);
+		eee->supported_u32 = _bnxt_fw_to_ethtool_adv_spds(fw_speeds, 0);
 		bp->lpi_tmr_lo = le32_to_cpu(resp->tx_lpi_timer_low) &
 				 PORT_PHY_QCAPS_RESP_TX_LPI_TIMER_LOW_MASK;
 		bp->lpi_tmr_hi = le32_to_cpu(resp->valid_tx_lpi_timer_high) &
@@ -10775,7 +10775,7 @@ int bnxt_update_link(struct bnxt *bp, bool chng_link_state)
 			eee->eee_active = 1;
 			fw_speeds = le16_to_cpu(
 				resp->link_partner_adv_eee_link_speed_mask);
-			eee->lp_advertised =
+			eee->lp_advertised_u32 =
 				_bnxt_fw_to_ethtool_adv_spds(fw_speeds, 0);
 		}
 
@@ -10786,7 +10786,7 @@ int bnxt_update_link(struct bnxt *bp, bool chng_link_state)
 				eee->eee_enabled = 1;
 
 			fw_speeds = le16_to_cpu(resp->adv_eee_link_speed_mask);
-			eee->advertised =
+			eee->advertised_u32 =
 				_bnxt_fw_to_ethtool_adv_spds(fw_speeds, 0);
 
 			if (resp->eee_config_phy_addr &
@@ -10969,7 +10969,7 @@ static void bnxt_hwrm_set_eee(struct bnxt *bp,
 			flags |= PORT_PHY_CFG_REQ_FLAGS_EEE_TX_LPI_DISABLE;
 
 		req->flags |= cpu_to_le32(flags);
-		eee_speeds = bnxt_get_fw_auto_link_speeds(eee->advertised);
+		eee_speeds = bnxt_get_fw_auto_link_speeds(eee->advertised_u32);
 		req->eee_link_speed_mask = cpu_to_le16(eee_speeds);
 		req->tx_lpi_timer = cpu_to_le32(eee->tx_lpi_timer);
 	} else {
@@ -11336,8 +11336,8 @@ static bool bnxt_eee_config_ok(struct bnxt *bp)
 			eee->eee_enabled = 0;
 			return false;
 		}
-		if (eee->advertised & ~advertising) {
-			eee->advertised = advertising & eee->supported;
+		if (eee->advertised_u32 & ~advertising) {
+			eee->advertised_u32 = advertising & eee->supported_u32;
 			return false;
 		}
 	}
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
index d6a8577d68af..481b835a7703 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
@@ -3919,16 +3919,16 @@ static int bnxt_set_eee(struct net_device *dev, struct ethtool_keee *edata)
 			edata->tx_lpi_timer = eee->tx_lpi_timer;
 		}
 	}
-	if (!edata->advertised) {
-		edata->advertised = advertising & eee->supported;
-	} else if (edata->advertised & ~advertising) {
+	if (!edata->advertised_u32) {
+		edata->advertised_u32 = advertising & eee->supported_u32;
+	} else if (edata->advertised_u32 & ~advertising) {
 		netdev_warn(dev, "EEE advertised %x must be a subset of autoneg advertised speeds %x\n",
-			    edata->advertised, advertising);
+			    edata->advertised_u32, advertising);
 		rc = -EINVAL;
 		goto eee_exit;
 	}
 
-	eee->advertised = edata->advertised;
+	eee->advertised_u32 = edata->advertised_u32;
 	eee->tx_lpi_enabled = edata->tx_lpi_enabled;
 	eee->tx_lpi_timer = edata->tx_lpi_timer;
 eee_ok:
@@ -3954,12 +3954,12 @@ static int bnxt_get_eee(struct net_device *dev, struct ethtool_keee *edata)
 		/* Preserve tx_lpi_timer so that the last value will be used
 		 * by default when it is re-enabled.
 		 */
-		edata->advertised = 0;
+		edata->advertised_u32 = 0;
 		edata->tx_lpi_enabled = 0;
 	}
 
 	if (!bp->eee.eee_active)
-		edata->lp_advertised = 0;
+		edata->lp_advertised_u32 = 0;
 
 	return 0;
 }
diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index 11054177c2ec..f644a91317c9 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -2362,13 +2362,13 @@ static void tg3_eee_pull_config(struct tg3 *tp, struct ethtool_keee *eee)
 	/* Pull lp advertised settings */
 	if (tg3_phy_cl45_read(tp, MDIO_MMD_AN, MDIO_AN_EEE_LPABLE, &val))
 		return;
-	dest->lp_advertised = mmd_eee_adv_to_ethtool_adv_t(val);
+	dest->lp_advertised_u32 = mmd_eee_adv_to_ethtool_adv_t(val);
 
 	/* Pull advertised and eee_enabled settings */
 	if (tg3_phy_cl45_read(tp, MDIO_MMD_AN, MDIO_AN_EEE_ADV, &val))
 		return;
 	dest->eee_enabled = !!val;
-	dest->advertised = mmd_eee_adv_to_ethtool_adv_t(val);
+	dest->advertised_u32 = mmd_eee_adv_to_ethtool_adv_t(val);
 
 	/* Pull tx_lpi_enabled */
 	val = tr32(TG3_CPMU_EEE_MODE);
@@ -4364,9 +4364,9 @@ static int tg3_phy_autoneg_cfg(struct tg3 *tp, u32 advertise, u32 flowctrl)
 
 		if (!tp->eee.eee_enabled) {
 			val = 0;
-			tp->eee.advertised = 0;
+			tp->eee.advertised_u32 = 0;
 		} else {
-			tp->eee.advertised = advertise &
+			tp->eee.advertised_u32 = advertise &
 					     (ADVERTISED_100baseT_Full |
 					      ADVERTISED_1000baseT_Full);
 		}
@@ -4626,13 +4626,13 @@ static bool tg3_phy_eee_config_ok(struct tg3 *tp)
 	tg3_eee_pull_config(tp, &eee);
 
 	if (tp->eee.eee_enabled) {
-		if (tp->eee.advertised != eee.advertised ||
+		if (tp->eee.advertised_u32 != eee.advertised_u32 ||
 		    tp->eee.tx_lpi_timer != eee.tx_lpi_timer ||
 		    tp->eee.tx_lpi_enabled != eee.tx_lpi_enabled)
 			return false;
 	} else {
 		/* EEE is disabled but we're advertising */
-		if (eee.advertised)
+		if (eee.advertised_u32)
 			return false;
 	}
 
@@ -14189,7 +14189,7 @@ static int tg3_set_eee(struct net_device *dev, struct ethtool_keee *edata)
 		return -EOPNOTSUPP;
 	}
 
-	if (edata->advertised != tp->eee.advertised) {
+	if (edata->advertised_u32 != tp->eee.advertised_u32) {
 		netdev_warn(tp->dev,
 			    "Direct manipulation of EEE advertisement is not supported\n");
 		return -EINVAL;
@@ -15655,10 +15655,10 @@ static int tg3_phy_probe(struct tg3 *tp)
 	      tg3_chip_rev_id(tp) != CHIPREV_ID_57765_A0))) {
 		tp->phy_flags |= TG3_PHYFLG_EEE_CAP;
 
-		tp->eee.supported = SUPPORTED_100baseT_Full |
-				    SUPPORTED_1000baseT_Full;
-		tp->eee.advertised = ADVERTISED_100baseT_Full |
-				     ADVERTISED_1000baseT_Full;
+		tp->eee.supported_u32 = SUPPORTED_100baseT_Full |
+					SUPPORTED_1000baseT_Full;
+		tp->eee.advertised_u32 = ADVERTISED_100baseT_Full |
+					 ADVERTISED_1000baseT_Full;
 		tp->eee.eee_enabled = 1;
 		tp->eee.tx_lpi_enabled = 1;
 		tp->eee.tx_lpi_timer = TG3_CPMU_DBTMR1_LNKIDLE_2047US;
diff --git a/drivers/net/ethernet/intel/e1000e/ethtool.c b/drivers/net/ethernet/intel/e1000e/ethtool.c
index 343f54b2b6ae..ff243ae71b78 100644
--- a/drivers/net/ethernet/intel/e1000e/ethtool.c
+++ b/drivers/net/ethernet/intel/e1000e/ethtool.c
@@ -2223,16 +2223,16 @@ static int e1000e_get_eee(struct net_device *netdev, struct ethtool_keee *edata)
 	ret_val = e1000_read_emi_reg_locked(hw, cap_addr, &phy_data);
 	if (ret_val)
 		goto release;
-	edata->supported = mmd_eee_cap_to_ethtool_sup_t(phy_data);
+	edata->supported_u32 = mmd_eee_cap_to_ethtool_sup_t(phy_data);
 
 	/* EEE Advertised */
-	edata->advertised = mmd_eee_adv_to_ethtool_adv_t(adapter->eee_advert);
+	edata->advertised_u32 = mmd_eee_adv_to_ethtool_adv_t(adapter->eee_advert);
 
 	/* EEE Link Partner Advertised */
 	ret_val = e1000_read_emi_reg_locked(hw, lpa_addr, &phy_data);
 	if (ret_val)
 		goto release;
-	edata->lp_advertised = mmd_eee_adv_to_ethtool_adv_t(phy_data);
+	edata->lp_advertised_u32 = mmd_eee_adv_to_ethtool_adv_t(phy_data);
 
 	/* EEE PCS Status */
 	ret_val = e1000_read_emi_reg_locked(hw, pcs_stat_addr, &phy_data);
@@ -2283,12 +2283,12 @@ static int e1000e_set_eee(struct net_device *netdev, struct ethtool_keee *edata)
 		return -EINVAL;
 	}
 
-	if (edata->advertised & ~(ADVERTISE_100_FULL | ADVERTISE_1000_FULL)) {
+	if (edata->advertised_u32 & ~(ADVERTISE_100_FULL | ADVERTISE_1000_FULL)) {
 		e_err("EEE advertisement supports only 100TX and/or 1000T full-duplex\n");
 		return -EINVAL;
 	}
 
-	adapter->eee_advert = ethtool_adv_to_mmd_eee_adv_t(edata->advertised);
+	adapter->eee_advert = ethtool_adv_to_mmd_eee_adv_t(edata->advertised_u32);
 
 	hw->dev_spec.ich8lan.eee_disable = !edata->eee_enabled;
 
diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index 9dfda3c48af3..1b5473358e1a 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -5664,16 +5664,16 @@ static int i40e_get_eee(struct net_device *netdev, struct ethtool_keee *edata)
 	if (phy_cfg.eee_capability == 0)
 		return -EOPNOTSUPP;
 
-	edata->supported = SUPPORTED_Autoneg;
-	edata->lp_advertised = edata->supported;
+	edata->supported_u32 = SUPPORTED_Autoneg;
+	edata->lp_advertised_u32 = edata->supported_u32;
 
 	/* Get current configuration */
 	status = i40e_aq_get_phy_capabilities(hw, false, false, &phy_cfg, NULL);
 	if (status)
 		return -EAGAIN;
 
-	edata->advertised = phy_cfg.eee_capability ? SUPPORTED_Autoneg : 0U;
-	edata->eee_enabled = !!edata->advertised;
+	edata->advertised_u32 = phy_cfg.eee_capability ? SUPPORTED_Autoneg : 0U;
+	edata->eee_enabled = !!edata->advertised_u32;
 	edata->tx_lpi_enabled = pf->stats.tx_lpi_status;
 
 	edata->eee_active = pf->stats.tx_lpi_status && pf->stats.rx_lpi_status;
@@ -5691,7 +5691,7 @@ static int i40e_is_eee_param_supported(struct net_device *netdev,
 		u32 value;
 		const char *name;
 	} param[] = {
-		{edata->advertised & ~SUPPORTED_Autoneg, "advertise"},
+		{edata->advertised_u32 & ~SUPPORTED_Autoneg, "advertise"},
 		{edata->tx_lpi_timer, "tx-timer"},
 		{edata->tx_lpi_enabled != pf->stats.tx_lpi_status, "tx-lpi"}
 	};
diff --git a/drivers/net/ethernet/intel/igb/igb_ethtool.c b/drivers/net/ethernet/intel/igb/igb_ethtool.c
index 778d1e6cfc45..b87b23d2151c 100644
--- a/drivers/net/ethernet/intel/igb/igb_ethtool.c
+++ b/drivers/net/ethernet/intel/igb/igb_ethtool.c
@@ -3038,10 +3038,10 @@ static int igb_get_eee(struct net_device *netdev, struct ethtool_keee *edata)
 	    (hw->phy.media_type != e1000_media_type_copper))
 		return -EOPNOTSUPP;
 
-	edata->supported = (SUPPORTED_1000baseT_Full |
-			    SUPPORTED_100baseT_Full);
+	edata->supported_u32 = (SUPPORTED_1000baseT_Full |
+				SUPPORTED_100baseT_Full);
 	if (!hw->dev_spec._82575.eee_disable)
-		edata->advertised =
+		edata->advertised_u32 =
 			mmd_eee_adv_to_ethtool_adv_t(adapter->eee_advert);
 
 	/* The IPCNFG and EEER registers are not supported on I354. */
@@ -3068,7 +3068,7 @@ static int igb_get_eee(struct net_device *netdev, struct ethtool_keee *edata)
 		if (ret_val)
 			return -ENODATA;
 
-		edata->lp_advertised = mmd_eee_adv_to_ethtool_adv_t(phy_data);
+		edata->lp_advertised_u32 = mmd_eee_adv_to_ethtool_adv_t(phy_data);
 		break;
 	case e1000_i354:
 	case e1000_i210:
@@ -3079,7 +3079,7 @@ static int igb_get_eee(struct net_device *netdev, struct ethtool_keee *edata)
 		if (ret_val)
 			return -ENODATA;
 
-		edata->lp_advertised = mmd_eee_adv_to_ethtool_adv_t(phy_data);
+		edata->lp_advertised_u32 = mmd_eee_adv_to_ethtool_adv_t(phy_data);
 
 		break;
 	default:
@@ -3099,7 +3099,7 @@ static int igb_get_eee(struct net_device *netdev, struct ethtool_keee *edata)
 		edata->eee_enabled = false;
 		edata->eee_active = false;
 		edata->tx_lpi_enabled = false;
-		edata->advertised &= ~edata->advertised;
+		edata->advertised_u32 &= ~edata->advertised_u32;
 	}
 
 	return 0;
@@ -3138,14 +3138,14 @@ static int igb_set_eee(struct net_device *netdev,
 			return -EINVAL;
 		}
 
-		if (!edata->advertised || (edata->advertised &
+		if (!edata->advertised_u32 || (edata->advertised_u32 &
 		    ~(ADVERTISE_100_FULL | ADVERTISE_1000_FULL))) {
 			dev_err(&adapter->pdev->dev,
 				"EEE Advertisement supports only 100Tx and/or 100T full duplex\n");
 			return -EINVAL;
 		}
-		adv100m_eee = !!(edata->advertised & ADVERTISE_100_FULL);
-		adv1g_eee = !!(edata->advertised & ADVERTISE_1000_FULL);
+		adv100m_eee = !!(edata->advertised_u32 & ADVERTISE_100_FULL);
+		adv1g_eee = !!(edata->advertised_u32 & ADVERTISE_1000_FULL);
 
 	} else if (!edata->eee_enabled) {
 		dev_err(&adapter->pdev->dev,
@@ -3153,7 +3153,7 @@ static int igb_set_eee(struct net_device *netdev,
 		return -EINVAL;
 	}
 
-	adapter->eee_advert = ethtool_adv_to_mmd_eee_adv_t(edata->advertised);
+	adapter->eee_advert = ethtool_adv_to_mmd_eee_adv_t(edata->advertised_u32);
 	if (hw->dev_spec._82575.eee_disable != !edata->eee_enabled) {
 		hw->dev_spec._82575.eee_disable = !edata->eee_enabled;
 		adapter->flags |= IGB_FLAG_EEE;
diff --git a/drivers/net/ethernet/intel/igc/igc_ethtool.c b/drivers/net/ethernet/intel/igc/igc_ethtool.c
index f2dcfe920f4f..7f844e967421 100644
--- a/drivers/net/ethernet/intel/igc/igc_ethtool.c
+++ b/drivers/net/ethernet/intel/igc/igc_ethtool.c
@@ -1630,11 +1630,11 @@ static int igc_ethtool_get_eee(struct net_device *netdev,
 	u32 eeer;
 
 	if (hw->dev_spec._base.eee_enable)
-		edata->advertised =
+		edata->advertised_u32 =
 			mmd_eee_adv_to_ethtool_adv_t(adapter->eee_advert);
 
 	*edata = adapter->eee;
-	edata->supported = SUPPORTED_Autoneg;
+	edata->supported_u32 = SUPPORTED_Autoneg;
 
 	eeer = rd32(IGC_EEER);
 
@@ -1647,8 +1647,8 @@ static int igc_ethtool_get_eee(struct net_device *netdev,
 
 	edata->eee_enabled = hw->dev_spec._base.eee_enable;
 
-	edata->advertised = SUPPORTED_Autoneg;
-	edata->lp_advertised = SUPPORTED_Autoneg;
+	edata->advertised_u32 = SUPPORTED_Autoneg;
+	edata->lp_advertised_u32 = SUPPORTED_Autoneg;
 
 	/* Report correct negotiated EEE status for devices that
 	 * wrongly report EEE at half-duplex
@@ -1657,7 +1657,7 @@ static int igc_ethtool_get_eee(struct net_device *netdev,
 		edata->eee_enabled = false;
 		edata->eee_active = false;
 		edata->tx_lpi_enabled = false;
-		edata->advertised &= ~edata->advertised;
+		edata->advertised_u32 &= ~edata->advertised_u32;
 	}
 
 	return 0;
@@ -1699,7 +1699,7 @@ static int igc_ethtool_set_eee(struct net_device *netdev,
 		return -EINVAL;
 	}
 
-	adapter->eee_advert = ethtool_adv_to_mmd_eee_adv_t(edata->advertised);
+	adapter->eee_advert = ethtool_adv_to_mmd_eee_adv_t(edata->advertised_u32);
 	if (hw->dev_spec._base.eee_enable != edata->eee_enabled) {
 		hw->dev_spec._base.eee_enable = edata->eee_enabled;
 		adapter->flags |= IGC_FLAG_EEE;
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
index 0aa73519a29d..ca69a8221793 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
@@ -3436,27 +3436,27 @@ ixgbe_get_eee_fw(struct ixgbe_adapter *adapter, struct ethtool_keee *edata)
 	if (rc)
 		return rc;
 
-	edata->lp_advertised = 0;
+	edata->lp_advertised_u32 = 0;
 	for (i = 0; i < ARRAY_SIZE(ixgbe_lp_map); ++i) {
 		if (info[0] & ixgbe_lp_map[i].lp_advertised)
-			edata->lp_advertised |= ixgbe_lp_map[i].mac_speed;
+			edata->lp_advertised_u32 |= ixgbe_lp_map[i].mac_speed;
 	}
 
-	edata->supported = 0;
+	edata->supported_u32 = 0;
 	for (i = 0; i < ARRAY_SIZE(ixgbe_ls_map); ++i) {
 		if (hw->phy.eee_speeds_supported & ixgbe_ls_map[i].mac_speed)
-			edata->supported |= ixgbe_ls_map[i].supported;
+			edata->supported_u32 |= ixgbe_ls_map[i].supported;
 	}
 
-	edata->advertised = 0;
+	edata->advertised_u32 = 0;
 	for (i = 0; i < ARRAY_SIZE(ixgbe_ls_map); ++i) {
 		if (hw->phy.eee_speeds_advertised & ixgbe_ls_map[i].mac_speed)
-			edata->advertised |= ixgbe_ls_map[i].supported;
+			edata->advertised_u32 |= ixgbe_ls_map[i].supported;
 	}
 
-	edata->eee_enabled = !!edata->advertised;
+	edata->eee_enabled = !!edata->advertised_u32;
 	edata->tx_lpi_enabled = edata->eee_enabled;
-	if (edata->advertised & edata->lp_advertised)
+	if (edata->advertised_u32 & edata->lp_advertised_u32)
 		edata->eee_active = true;
 
 	return 0;
@@ -3504,7 +3504,7 @@ static int ixgbe_set_eee(struct net_device *netdev, struct ethtool_keee *edata)
 			return -EINVAL;
 		}
 
-		if (eee_data.advertised != edata->advertised) {
+		if (eee_data.advertised_u32 != edata->advertised_u32) {
 			e_err(drv,
 			      "Setting EEE advertised speeds is not supported\n");
 			return -EINVAL;
diff --git a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
index 77491fb64039..dfa15619fd78 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
@@ -1790,17 +1790,17 @@ static int qede_get_eee(struct net_device *dev, struct ethtool_keee *edata)
 	}
 
 	if (current_link.eee.adv_caps & QED_EEE_1G_ADV)
-		edata->advertised = ADVERTISED_1000baseT_Full;
+		edata->advertised_u32 = ADVERTISED_1000baseT_Full;
 	if (current_link.eee.adv_caps & QED_EEE_10G_ADV)
-		edata->advertised |= ADVERTISED_10000baseT_Full;
+		edata->advertised_u32 |= ADVERTISED_10000baseT_Full;
 	if (current_link.sup_caps & QED_EEE_1G_ADV)
-		edata->supported = ADVERTISED_1000baseT_Full;
+		edata->supported_u32 = ADVERTISED_1000baseT_Full;
 	if (current_link.sup_caps & QED_EEE_10G_ADV)
-		edata->supported |= ADVERTISED_10000baseT_Full;
+		edata->supported_u32 |= ADVERTISED_10000baseT_Full;
 	if (current_link.eee.lp_adv_caps & QED_EEE_1G_ADV)
-		edata->lp_advertised = ADVERTISED_1000baseT_Full;
+		edata->lp_advertised_u32 = ADVERTISED_1000baseT_Full;
 	if (current_link.eee.lp_adv_caps & QED_EEE_10G_ADV)
-		edata->lp_advertised |= ADVERTISED_10000baseT_Full;
+		edata->lp_advertised_u32 |= ADVERTISED_10000baseT_Full;
 
 	edata->tx_lpi_timer = current_link.eee.tx_lpi_timer;
 	edata->eee_enabled = current_link.eee.enable;
@@ -1832,20 +1832,20 @@ static int qede_set_eee(struct net_device *dev, struct ethtool_keee *edata)
 	memset(&params, 0, sizeof(params));
 	params.override_flags |= QED_LINK_OVERRIDE_EEE_CONFIG;
 
-	if (!(edata->advertised & (ADVERTISED_1000baseT_Full |
-				   ADVERTISED_10000baseT_Full)) ||
-	    ((edata->advertised & (ADVERTISED_1000baseT_Full |
-				   ADVERTISED_10000baseT_Full)) !=
-	     edata->advertised)) {
+	if (!(edata->advertised_u32 & (ADVERTISED_1000baseT_Full |
+				       ADVERTISED_10000baseT_Full)) ||
+	    ((edata->advertised_u32 & (ADVERTISED_1000baseT_Full |
+				       ADVERTISED_10000baseT_Full)) !=
+	     edata->advertised_u32)) {
 		DP_VERBOSE(edev, QED_MSG_DEBUG,
 			   "Invalid advertised capabilities %d\n",
-			   edata->advertised);
+			   edata->advertised_u32);
 		return -EINVAL;
 	}
 
-	if (edata->advertised & ADVERTISED_1000baseT_Full)
+	if (edata->advertised_u32 & ADVERTISED_1000baseT_Full)
 		params.eee.adv_caps = QED_EEE_1G_ADV;
-	if (edata->advertised & ADVERTISED_10000baseT_Full)
+	if (edata->advertised_u32 & ADVERTISED_10000baseT_Full)
 		params.eee.adv_caps |= QED_EEE_10G_ADV;
 	params.eee.enable = edata->eee_enabled;
 	params.eee.tx_lpi_enable = edata->tx_lpi_enabled;
diff --git a/drivers/net/phy/phy-c45.c b/drivers/net/phy/phy-c45.c
index adee5e712871..99c84af25746 100644
--- a/drivers/net/phy/phy-c45.c
+++ b/drivers/net/phy/phy-c45.c
@@ -1463,12 +1463,12 @@ int genphy_c45_ethtool_get_eee(struct phy_device *phydev,
 	data->eee_enabled = is_enabled;
 	data->eee_active = ret;
 
-	if (!ethtool_convert_link_mode_to_legacy_u32(&data->supported,
+	if (!ethtool_convert_link_mode_to_legacy_u32(&data->supported_u32,
 						     phydev->supported_eee))
 		overflow = true;
-	if (!ethtool_convert_link_mode_to_legacy_u32(&data->advertised, adv))
+	if (!ethtool_convert_link_mode_to_legacy_u32(&data->advertised_u32, adv))
 		overflow = true;
-	if (!ethtool_convert_link_mode_to_legacy_u32(&data->lp_advertised, lp))
+	if (!ethtool_convert_link_mode_to_legacy_u32(&data->lp_advertised_u32, lp))
 		overflow = true;
 
 	if (overflow)
@@ -1495,11 +1495,11 @@ int genphy_c45_ethtool_set_eee(struct phy_device *phydev,
 	int ret;
 
 	if (data->eee_enabled) {
-		if (data->advertised) {
+		if (data->advertised_u32) {
 			__ETHTOOL_DECLARE_LINK_MODE_MASK(adv);
 
 			ethtool_convert_legacy_u32_to_link_mode(adv,
-								data->advertised);
+								data->advertised_u32);
 			linkmode_andnot(adv, adv, phydev->supported_eee);
 			if (!linkmode_empty(adv)) {
 				phydev_warn(phydev, "At least some EEE link modes are not supported.\n");
@@ -1507,7 +1507,7 @@ int genphy_c45_ethtool_set_eee(struct phy_device *phydev,
 			}
 
 			ethtool_convert_legacy_u32_to_link_mode(phydev->advertising_eee,
-								data->advertised);
+								data->advertised_u32);
 		} else {
 			linkmode_copy(phydev->advertising_eee,
 				      phydev->supported_eee);
diff --git a/drivers/net/usb/ax88179_178a.c b/drivers/net/usb/ax88179_178a.c
index 3922a9afdd1f..d6168eaa286f 100644
--- a/drivers/net/usb/ax88179_178a.c
+++ b/drivers/net/usb/ax88179_178a.c
@@ -676,21 +676,21 @@ ax88179_ethtool_get_eee(struct usbnet *dev, struct ethtool_keee *data)
 					    MDIO_MMD_PCS);
 	if (val < 0)
 		return val;
-	data->supported = mmd_eee_cap_to_ethtool_sup_t(val);
+	data->supported_u32 = mmd_eee_cap_to_ethtool_sup_t(val);
 
 	/* Get advertisement EEE */
 	val = ax88179_phy_read_mmd_indirect(dev, MDIO_AN_EEE_ADV,
 					    MDIO_MMD_AN);
 	if (val < 0)
 		return val;
-	data->advertised = mmd_eee_adv_to_ethtool_adv_t(val);
+	data->advertised_u32 = mmd_eee_adv_to_ethtool_adv_t(val);
 
 	/* Get LP advertisement EEE */
 	val = ax88179_phy_read_mmd_indirect(dev, MDIO_AN_EEE_LPABLE,
 					    MDIO_MMD_AN);
 	if (val < 0)
 		return val;
-	data->lp_advertised = mmd_eee_adv_to_ethtool_adv_t(val);
+	data->lp_advertised_u32 = mmd_eee_adv_to_ethtool_adv_t(val);
 
 	return 0;
 }
@@ -698,7 +698,7 @@ ax88179_ethtool_get_eee(struct usbnet *dev, struct ethtool_keee *data)
 static int
 ax88179_ethtool_set_eee(struct usbnet *dev, struct ethtool_keee *data)
 {
-	u16 tmp16 = ethtool_adv_to_mmd_eee_adv_t(data->advertised);
+	u16 tmp16 = ethtool_adv_to_mmd_eee_adv_t(data->advertised_u32);
 
 	return ax88179_phy_write_mmd_indirect(dev, MDIO_AN_EEE_ADV,
 					      MDIO_MMD_AN, tmp16);
@@ -1663,7 +1663,7 @@ static int ax88179_reset(struct usbnet *dev)
 	ax88179_disable_eee(dev);
 
 	ax88179_ethtool_get_eee(dev, &eee_data);
-	eee_data.advertised = 0;
+	eee_data.advertised_u32 = 0;
 	ax88179_ethtool_set_eee(dev, &eee_data);
 
 	/* Restart autoneg */
diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c
index dc163b766a89..3d806b3ff425 100644
--- a/drivers/net/usb/r8152.c
+++ b/drivers/net/usb/r8152.c
@@ -8938,16 +8938,16 @@ static int r8152_get_eee(struct r8152 *tp, struct ethtool_keee *eee)
 
 	eee->eee_enabled = tp->eee_en;
 	eee->eee_active = !!(supported & adv & lp);
-	eee->supported = supported;
-	eee->advertised = tp->eee_adv;
-	eee->lp_advertised = lp;
+	eee->supported_u32 = supported;
+	eee->advertised_u32 = tp->eee_adv;
+	eee->lp_advertised_u32 = lp;
 
 	return 0;
 }
 
 static int r8152_set_eee(struct r8152 *tp, struct ethtool_keee *eee)
 {
-	u16 val = ethtool_adv_to_mmd_eee_adv_t(eee->advertised);
+	u16 val = ethtool_adv_to_mmd_eee_adv_t(eee->advertised_u32);
 
 	tp->eee_en = eee->eee_enabled;
 	tp->eee_adv = val;
@@ -8973,9 +8973,9 @@ static int r8153_get_eee(struct r8152 *tp, struct ethtool_keee *eee)
 
 	eee->eee_enabled = tp->eee_en;
 	eee->eee_active = !!(supported & adv & lp);
-	eee->supported = supported;
-	eee->advertised = tp->eee_adv;
-	eee->lp_advertised = lp;
+	eee->supported_u32 = supported;
+	eee->advertised_u32 = tp->eee_adv;
+	eee->lp_advertised_u32 = lp;
 
 	return 0;
 }
diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 14549cb9e2b2..89807c30f5a7 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -223,9 +223,9 @@ __ethtool_get_link_ksettings(struct net_device *dev,
 			     struct ethtool_link_ksettings *link_ksettings);
 
 struct ethtool_keee {
-	u32	supported;
-	u32	advertised;
-	u32	lp_advertised;
+	u32	supported_u32;
+	u32	advertised_u32;
+	u32	lp_advertised_u32;
 	u32	tx_lpi_timer;
 	bool	tx_lpi_enabled;
 	bool	eee_active;
diff --git a/net/ethtool/eee.c b/net/ethtool/eee.c
index ac9f694ffe20..ca56f28173d5 100644
--- a/net/ethtool/eee.c
+++ b/net/ethtool/eee.c
@@ -5,7 +5,7 @@
 #include "bitset.h"
 
 #define EEE_MODES_COUNT \
-	(sizeof_field(struct ethtool_keee, supported) * BITS_PER_BYTE)
+	(sizeof_field(struct ethtool_keee, supported_u32) * BITS_PER_BYTE)
 
 struct eee_req_info {
 	struct ethnl_req_info		base;
@@ -52,19 +52,19 @@ static int eee_reply_size(const struct ethnl_req_info *req_base,
 	int len = 0;
 	int ret;
 
-	BUILD_BUG_ON(sizeof(eee->advertised) * BITS_PER_BYTE !=
+	BUILD_BUG_ON(sizeof(eee->advertised_u32) * BITS_PER_BYTE !=
 		     EEE_MODES_COUNT);
-	BUILD_BUG_ON(sizeof(eee->lp_advertised) * BITS_PER_BYTE !=
+	BUILD_BUG_ON(sizeof(eee->lp_advertised_u32) * BITS_PER_BYTE !=
 		     EEE_MODES_COUNT);
 
 	/* MODES_OURS */
-	ret = ethnl_bitset32_size(&eee->advertised, &eee->supported,
+	ret = ethnl_bitset32_size(&eee->advertised_u32, &eee->supported_u32,
 				  EEE_MODES_COUNT, link_mode_names, compact);
 	if (ret < 0)
 		return ret;
 	len += ret;
 	/* MODES_PEERS */
-	ret = ethnl_bitset32_size(&eee->lp_advertised, NULL,
+	ret = ethnl_bitset32_size(&eee->lp_advertised_u32, NULL,
 				  EEE_MODES_COUNT, link_mode_names, compact);
 	if (ret < 0)
 		return ret;
@@ -88,12 +88,12 @@ static int eee_fill_reply(struct sk_buff *skb,
 	int ret;
 
 	ret = ethnl_put_bitset32(skb, ETHTOOL_A_EEE_MODES_OURS,
-				 &eee->advertised, &eee->supported,
+				 &eee->advertised_u32, &eee->supported_u32,
 				 EEE_MODES_COUNT, link_mode_names, compact);
 	if (ret < 0)
 		return ret;
 	ret = ethnl_put_bitset32(skb, ETHTOOL_A_EEE_MODES_PEER,
-				 &eee->lp_advertised, NULL, EEE_MODES_COUNT,
+				 &eee->lp_advertised_u32, NULL, EEE_MODES_COUNT,
 				 link_mode_names, compact);
 	if (ret < 0)
 		return ret;
@@ -140,7 +140,7 @@ ethnl_set_eee(struct ethnl_req_info *req_info, struct genl_info *info)
 	if (ret < 0)
 		return ret;
 
-	ret = ethnl_update_bitset32(&eee.advertised, EEE_MODES_COUNT,
+	ret = ethnl_update_bitset32(&eee.advertised_u32, EEE_MODES_COUNT,
 				    tb[ETHTOOL_A_EEE_MODES_OURS],
 				    link_mode_names, info->extack, &mod);
 	if (ret < 0)
diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c
index 46c29b369aeb..5b2ca72e3203 100644
--- a/net/ethtool/ioctl.c
+++ b/net/ethtool/ioctl.c
@@ -1513,9 +1513,9 @@ static void eee_to_keee(struct ethtool_keee *keee,
 {
 	memset(keee, 0, sizeof(*keee));
 
-	keee->supported = eee->supported;
-	keee->advertised = eee->advertised;
-	keee->lp_advertised = eee->lp_advertised;
+	keee->supported_u32 = eee->supported;
+	keee->advertised_u32 = eee->advertised;
+	keee->lp_advertised_u32 = eee->lp_advertised;
 	keee->eee_active = eee->eee_active;
 	keee->eee_enabled = eee->eee_enabled;
 	keee->tx_lpi_enabled = eee->tx_lpi_enabled;
@@ -1527,9 +1527,9 @@ static void keee_to_eee(struct ethtool_eee *eee,
 {
 	memset(eee, 0, sizeof(*eee));
 
-	eee->supported = keee->supported;
-	eee->advertised = keee->advertised;
-	eee->lp_advertised = keee->lp_advertised;
+	eee->supported = keee->supported_u32;
+	eee->advertised = keee->advertised_u32;
+	eee->lp_advertised = keee->lp_advertised_u32;
 	eee->eee_active = keee->eee_active;
 	eee->eee_enabled = keee->eee_enabled;
 	eee->tx_lpi_enabled = keee->tx_lpi_enabled;
-- 
cgit v1.2.3


From 1f069de63602e8d39d7d9fd6195f65235316f79a Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Sat, 27 Jan 2024 14:29:33 +0100
Subject: ethtool: add linkmode bitmap support to struct ethtool_keee

Add linkmode bitmap members to struct ethtool_keee, but keep the legacy
u32 bitmaps for compatibility with existing drivers.
Use linkmode "supported" not being empty as indicator that a user wants
to use the linkmode bitmap members instead of the legacy bitmaps.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h |  3 +++
 net/ethtool/common.c    |  5 +++++
 net/ethtool/common.h    |  1 +
 net/ethtool/eee.c       | 49 +++++++++++++++++++++++++++++++++++--------------
 net/ethtool/ioctl.c     | 27 ++++++++++++++++++++++++---
 5 files changed, 68 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 89807c30f5a7..b90c33607594 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -223,6 +223,9 @@ __ethtool_get_link_ksettings(struct net_device *dev,
 			     struct ethtool_link_ksettings *link_ksettings);
 
 struct ethtool_keee {
+	__ETHTOOL_DECLARE_LINK_MODE_MASK(supported);
+	__ETHTOOL_DECLARE_LINK_MODE_MASK(advertised);
+	__ETHTOOL_DECLARE_LINK_MODE_MASK(lp_advertised);
 	u32	supported_u32;
 	u32	advertised_u32;
 	u32	lp_advertised_u32;
diff --git a/net/ethtool/common.c b/net/ethtool/common.c
index 6b2a360dcdf0..ce486cec346c 100644
--- a/net/ethtool/common.c
+++ b/net/ethtool/common.c
@@ -712,3 +712,8 @@ ethtool_forced_speed_maps_init(struct ethtool_forced_speed_map *maps, u32 size)
 	}
 }
 EXPORT_SYMBOL_GPL(ethtool_forced_speed_maps_init);
+
+bool ethtool_eee_use_linkmodes(const struct ethtool_keee *eee)
+{
+	return !linkmode_empty(eee->supported);
+}
diff --git a/net/ethtool/common.h b/net/ethtool/common.h
index 28b8aaaf9bcb..0f2b5f7eacee 100644
--- a/net/ethtool/common.h
+++ b/net/ethtool/common.h
@@ -55,5 +55,6 @@ int ethtool_get_module_eeprom_call(struct net_device *dev,
 				   struct ethtool_eeprom *ee, u8 *data);
 
 bool __ethtool_dev_mm_supported(struct net_device *dev);
+bool ethtool_eee_use_linkmodes(const struct ethtool_keee *eee);
 
 #endif /* _ETHTOOL_COMMON_H */
diff --git a/net/ethtool/eee.c b/net/ethtool/eee.c
index ca56f28173d5..db6faa18fe41 100644
--- a/net/ethtool/eee.c
+++ b/net/ethtool/eee.c
@@ -30,6 +30,7 @@ static int eee_prepare_data(const struct ethnl_req_info *req_base,
 {
 	struct eee_reply_data *data = EEE_REPDATA(reply_base);
 	struct net_device *dev = reply_base->dev;
+	struct ethtool_keee *eee = &data->eee;
 	int ret;
 
 	if (!dev->ethtool_ops->get_eee)
@@ -37,9 +38,18 @@ static int eee_prepare_data(const struct ethnl_req_info *req_base,
 	ret = ethnl_ops_begin(dev);
 	if (ret < 0)
 		return ret;
-	ret = dev->ethtool_ops->get_eee(dev, &data->eee);
+	ret = dev->ethtool_ops->get_eee(dev, eee);
 	ethnl_ops_complete(dev);
 
+	if (!ret && !ethtool_eee_use_linkmodes(eee)) {
+		ethtool_convert_legacy_u32_to_link_mode(eee->supported,
+							eee->supported_u32);
+		ethtool_convert_legacy_u32_to_link_mode(eee->advertised,
+							eee->advertised_u32);
+		ethtool_convert_legacy_u32_to_link_mode(eee->lp_advertised,
+							eee->lp_advertised_u32);
+	}
+
 	return ret;
 }
 
@@ -58,14 +68,16 @@ static int eee_reply_size(const struct ethnl_req_info *req_base,
 		     EEE_MODES_COUNT);
 
 	/* MODES_OURS */
-	ret = ethnl_bitset32_size(&eee->advertised_u32, &eee->supported_u32,
-				  EEE_MODES_COUNT, link_mode_names, compact);
+	ret = ethnl_bitset_size(eee->advertised, eee->supported,
+				__ETHTOOL_LINK_MODE_MASK_NBITS,
+				link_mode_names, compact);
 	if (ret < 0)
 		return ret;
 	len += ret;
 	/* MODES_PEERS */
-	ret = ethnl_bitset32_size(&eee->lp_advertised_u32, NULL,
-				  EEE_MODES_COUNT, link_mode_names, compact);
+	ret = ethnl_bitset_size(eee->lp_advertised, NULL,
+				__ETHTOOL_LINK_MODE_MASK_NBITS,
+				link_mode_names, compact);
 	if (ret < 0)
 		return ret;
 	len += ret;
@@ -87,14 +99,16 @@ static int eee_fill_reply(struct sk_buff *skb,
 	const struct ethtool_keee *eee = &data->eee;
 	int ret;
 
-	ret = ethnl_put_bitset32(skb, ETHTOOL_A_EEE_MODES_OURS,
-				 &eee->advertised_u32, &eee->supported_u32,
-				 EEE_MODES_COUNT, link_mode_names, compact);
+	ret = ethnl_put_bitset(skb, ETHTOOL_A_EEE_MODES_OURS,
+			       eee->advertised, eee->supported,
+			       __ETHTOOL_LINK_MODE_MASK_NBITS,
+			       link_mode_names, compact);
 	if (ret < 0)
 		return ret;
-	ret = ethnl_put_bitset32(skb, ETHTOOL_A_EEE_MODES_PEER,
-				 &eee->lp_advertised_u32, NULL, EEE_MODES_COUNT,
-				 link_mode_names, compact);
+	ret = ethnl_put_bitset(skb, ETHTOOL_A_EEE_MODES_PEER,
+			       eee->lp_advertised, NULL,
+			       __ETHTOOL_LINK_MODE_MASK_NBITS,
+			       link_mode_names, compact);
 	if (ret < 0)
 		return ret;
 
@@ -140,9 +154,16 @@ ethnl_set_eee(struct ethnl_req_info *req_info, struct genl_info *info)
 	if (ret < 0)
 		return ret;
 
-	ret = ethnl_update_bitset32(&eee.advertised_u32, EEE_MODES_COUNT,
-				    tb[ETHTOOL_A_EEE_MODES_OURS],
-				    link_mode_names, info->extack, &mod);
+	if (ethtool_eee_use_linkmodes(&eee)) {
+		ret = ethnl_update_bitset(eee.advertised,
+					  __ETHTOOL_LINK_MODE_MASK_NBITS,
+					  tb[ETHTOOL_A_EEE_MODES_OURS],
+					  link_mode_names, info->extack, &mod);
+	} else {
+		ret = ethnl_update_bitset32(&eee.advertised_u32, EEE_MODES_COUNT,
+					    tb[ETHTOOL_A_EEE_MODES_OURS],
+					    link_mode_names, info->extack, &mod);
+	}
 	if (ret < 0)
 		return ret;
 	ethnl_update_bool(&eee.eee_enabled, tb[ETHTOOL_A_EEE_ENABLED], &mod);
diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c
index 5b2ca72e3203..1763e8b697e1 100644
--- a/net/ethtool/ioctl.c
+++ b/net/ethtool/ioctl.c
@@ -1520,6 +1520,13 @@ static void eee_to_keee(struct ethtool_keee *keee,
 	keee->eee_enabled = eee->eee_enabled;
 	keee->tx_lpi_enabled = eee->tx_lpi_enabled;
 	keee->tx_lpi_timer = eee->tx_lpi_timer;
+
+	ethtool_convert_legacy_u32_to_link_mode(keee->supported,
+						eee->supported);
+	ethtool_convert_legacy_u32_to_link_mode(keee->advertised,
+						eee->advertised);
+	ethtool_convert_legacy_u32_to_link_mode(keee->lp_advertised,
+						eee->lp_advertised);
 }
 
 static void keee_to_eee(struct ethtool_eee *eee,
@@ -1527,13 +1534,27 @@ static void keee_to_eee(struct ethtool_eee *eee,
 {
 	memset(eee, 0, sizeof(*eee));
 
-	eee->supported = keee->supported_u32;
-	eee->advertised = keee->advertised_u32;
-	eee->lp_advertised = keee->lp_advertised_u32;
 	eee->eee_active = keee->eee_active;
 	eee->eee_enabled = keee->eee_enabled;
 	eee->tx_lpi_enabled = keee->tx_lpi_enabled;
 	eee->tx_lpi_timer = keee->tx_lpi_timer;
+
+	if (ethtool_eee_use_linkmodes(keee)) {
+		bool overflow;
+
+		overflow = !ethtool_convert_link_mode_to_legacy_u32(&eee->supported,
+								    keee->supported);
+		ethtool_convert_link_mode_to_legacy_u32(&eee->advertised,
+							keee->advertised);
+		ethtool_convert_link_mode_to_legacy_u32(&eee->lp_advertised,
+							keee->lp_advertised);
+		if (overflow)
+			pr_warn("Ethtool ioctl interface doesn't support passing EEE linkmodes beyond bit 32\n");
+	} else {
+		eee->supported = keee->supported_u32;
+		eee->advertised = keee->advertised_u32;
+		eee->lp_advertised = keee->lp_advertised_u32;
+	}
 }
 
 static int ethtool_get_eee(struct net_device *dev, char __user *useraddr)
-- 
cgit v1.2.3


From 2a71528427c635f0a8bff704b2e62ce81c641d6f Mon Sep 17 00:00:00 2001
From: Arend van Spriel <arend.vanspriel@broadcom.com>
Date: Sun, 28 Jan 2024 10:30:57 +0100
Subject: wifi: brcmfmac: fix copyright year mentioned in platform_data header

The driver found its inception a little after the year 201. According
git blame output it was added in 2016 so lets go with that.

Fixes: 4d7928959832 ("brcmfmac: switch to new platform data")
Reported-by: Dmitry Antipov <dmantipov@yandex.ru>
Signed-off-by: Arend van Spriel <arend.vanspriel@broadcom.com>
Signed-off-by: Kalle Valo <kvalo@kernel.org>
Link: https://msgid.link/20240128093057.164791-3-arend.vanspriel@broadcom.com
---
 include/linux/platform_data/brcmfmac.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/platform_data/brcmfmac.h b/include/linux/platform_data/brcmfmac.h
index f922a192fe58..ec99b7b73d1d 100644
--- a/include/linux/platform_data/brcmfmac.h
+++ b/include/linux/platform_data/brcmfmac.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 201 Broadcom Corporation
+ * Copyright (c) 2016 Broadcom Corporation
  *
  * Permission to use, copy, modify, and/or distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
-- 
cgit v1.2.3


From 79b47344bbc5a693a92ed6b2b09dac59254bfac8 Mon Sep 17 00:00:00 2001
From: Daniel Xu <dxu@dxuuu.xyz>
Date: Sun, 28 Jan 2024 18:24:06 -0700
Subject: bpf: btf: Support flags for BTF_SET8 sets

This commit adds support for flags on BTF_SET8s. struct btf_id_set8
already supported 32 bits worth of flags, but was only used for
alignment purposes before.

We now use these bits to encode flags. The first use case is tagging
kfunc sets with a flag so that pahole can recognize which
BTF_ID_FLAGS(func, ..) are actual kfuncs.

Signed-off-by: Daniel Xu <dxu@dxuuu.xyz>
Link: https://lore.kernel.org/r/7bb152ec76d6c2c930daec88e995bf18484a5ebb.1706491398.git.dxu@dxuuu.xyz
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/btf_ids.h | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h
index a9cb10b0e2e9..dca09b7f21dc 100644
--- a/include/linux/btf_ids.h
+++ b/include/linux/btf_ids.h
@@ -21,6 +21,7 @@ struct btf_id_set8 {
 
 #include <linux/compiler.h> /* for __PASTE */
 #include <linux/compiler_attributes.h> /* for __maybe_unused */
+#include <linux/stringify.h>
 
 /*
  * Following macros help to define lists of BTF IDs placed
@@ -183,17 +184,18 @@ extern struct btf_id_set name;
  * .word (1 << 3) | (1 << 1) | (1 << 2)
  *
  */
-#define __BTF_SET8_START(name, scope)			\
+#define __BTF_SET8_START(name, scope, flags)		\
+__BTF_ID_LIST(name, local)				\
 asm(							\
 ".pushsection " BTF_IDS_SECTION ",\"a\";       \n"	\
 "." #scope " __BTF_ID__set8__" #name ";        \n"	\
 "__BTF_ID__set8__" #name ":;                   \n"	\
-".zero 8                                       \n"	\
+".zero 4                                       \n"	\
+".long " __stringify(flags)                   "\n"	\
 ".popsection;                                  \n");
 
 #define BTF_SET8_START(name)				\
-__BTF_ID_LIST(name, local)				\
-__BTF_SET8_START(name, local)
+__BTF_SET8_START(name, local, 0)
 
 #define BTF_SET8_END(name)				\
 asm(							\
-- 
cgit v1.2.3


From 11498715f266a3fb4caabba9dd575636cbcaa8f1 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Mon, 29 Jan 2024 11:04:34 -0800
Subject: af_unix: Remove io_uring code for GC.

Since commit 705318a99a13 ("io_uring/af_unix: disable sending
io_uring over sockets"), io_uring's unix socket cannot be passed
via SCM_RIGHTS, so it does not contribute to cyclic reference and
no longer be candidate for garbage collection.

Also, commit 6e5e6d274956 ("io_uring: drop any code related to
SCM_RIGHTS") cleaned up SCM_RIGHTS code in io_uring.

Let's do it in AF_UNIX as well by reverting commit 0091bfc81741
("io_uring/af_unix: defer registered files gc to io_uring release")
and commit 10369080454d ("net: reclaim skb->scm_io_uring bit").

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Acked-by: Jens Axboe <axboe@kernel.dk>
Link: https://lore.kernel.org/r/20240129190435.57228-3-kuniyu@amazon.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/af_unix.h |  1 -
 net/unix/garbage.c    | 25 ++-----------------------
 net/unix/scm.c        |  6 ------
 3 files changed, 2 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index f045bbd9017d..9e39b2ec4524 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -20,7 +20,6 @@ static inline struct unix_sock *unix_get_socket(struct file *filp)
 void unix_inflight(struct user_struct *user, struct file *fp);
 void unix_notinflight(struct user_struct *user, struct file *fp);
 void unix_destruct_scm(struct sk_buff *skb);
-void io_uring_destruct_scm(struct sk_buff *skb);
 void unix_gc(void);
 void wait_for_unix_gc(struct scm_fp_list *fpl);
 struct sock *unix_peer_get(struct sock *sk);
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index af676bb8fb67..ce5b5f87b16e 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -184,12 +184,10 @@ static bool gc_in_progress;
 
 static void __unix_gc(struct work_struct *work)
 {
-	struct sk_buff *next_skb, *skb;
-	struct unix_sock *u;
-	struct unix_sock *next;
 	struct sk_buff_head hitlist;
-	struct list_head cursor;
+	struct unix_sock *u, *next;
 	LIST_HEAD(not_cycle_list);
+	struct list_head cursor;
 
 	spin_lock(&unix_gc_lock);
 
@@ -269,30 +267,11 @@ static void __unix_gc(struct work_struct *work)
 
 	spin_unlock(&unix_gc_lock);
 
-	/* We need io_uring to clean its registered files, ignore all io_uring
-	 * originated skbs. It's fine as io_uring doesn't keep references to
-	 * other io_uring instances and so killing all other files in the cycle
-	 * will put all io_uring references forcing it to go through normal
-	 * release.path eventually putting registered files.
-	 */
-	skb_queue_walk_safe(&hitlist, skb, next_skb) {
-		if (skb->destructor == io_uring_destruct_scm) {
-			__skb_unlink(skb, &hitlist);
-			skb_queue_tail(&skb->sk->sk_receive_queue, skb);
-		}
-	}
-
 	/* Here we are. Hitlist is filled. Die. */
 	__skb_queue_purge(&hitlist);
 
 	spin_lock(&unix_gc_lock);
 
-	/* There could be io_uring registered files, just push them back to
-	 * the inflight list
-	 */
-	list_for_each_entry_safe(u, next, &gc_candidates, link)
-		list_move_tail(&u->link, &gc_inflight_list);
-
 	/* All candidates should have been detached by now. */
 	WARN_ON_ONCE(!list_empty(&gc_candidates));
 
diff --git a/net/unix/scm.c b/net/unix/scm.c
index 505e56cf02a2..db65b0ab5947 100644
--- a/net/unix/scm.c
+++ b/net/unix/scm.c
@@ -148,9 +148,3 @@ void unix_destruct_scm(struct sk_buff *skb)
 	sock_wfree(skb);
 }
 EXPORT_SYMBOL(unix_destruct_scm);
-
-void io_uring_destruct_scm(struct sk_buff *skb)
-{
-	unix_destruct_scm(skb);
-}
-EXPORT_SYMBOL(io_uring_destruct_scm);
-- 
cgit v1.2.3


From 99a7a5b9943ea2d05fb0dee38e4ae2290477ed83 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Mon, 29 Jan 2024 11:04:35 -0800
Subject: af_unix: Remove CONFIG_UNIX_SCM.

Originally, the code related to garbage collection was all in garbage.c.

Commit f4e65870e5ce ("net: split out functions related to registering
inflight socket files") moved some functions to scm.c for io_uring and
added CONFIG_UNIX_SCM just in case AF_UNIX was built as module.

However, since commit 97154bcf4d1b ("af_unix: Kconfig: make CONFIG_UNIX
bool"), AF_UNIX is no longer built separately.  Also, io_uring does not
support SCM_RIGHTS now.

Let's move the functions back to garbage.c

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Acked-by: Jens Axboe <axboe@kernel.dk>
Link: https://lore.kernel.org/r/20240129190435.57228-4-kuniyu@amazon.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/af_unix.h |   7 ++-
 net/Makefile          |   2 +-
 net/unix/Kconfig      |   5 --
 net/unix/Makefile     |   2 -
 net/unix/af_unix.c    |  63 ++++++++++++++++++++-
 net/unix/garbage.c    |  73 +++++++++++++++++++++++-
 net/unix/scm.c        | 150 --------------------------------------------------
 net/unix/scm.h        |  10 ----
 8 files changed, 137 insertions(+), 175 deletions(-)
 delete mode 100644 net/unix/scm.c
 delete mode 100644 net/unix/scm.h

(limited to 'include')

diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 9e39b2ec4524..54e346152eb1 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -17,19 +17,20 @@ static inline struct unix_sock *unix_get_socket(struct file *filp)
 }
 #endif
 
+extern spinlock_t unix_gc_lock;
+extern unsigned int unix_tot_inflight;
+
 void unix_inflight(struct user_struct *user, struct file *fp);
 void unix_notinflight(struct user_struct *user, struct file *fp);
-void unix_destruct_scm(struct sk_buff *skb);
 void unix_gc(void);
 void wait_for_unix_gc(struct scm_fp_list *fpl);
+
 struct sock *unix_peer_get(struct sock *sk);
 
 #define UNIX_HASH_MOD	(256 - 1)
 #define UNIX_HASH_SIZE	(256 * 2)
 #define UNIX_HASH_BITS	8
 
-extern unsigned int unix_tot_inflight;
-
 struct unix_address {
 	refcount_t	refcnt;
 	int		len;
diff --git a/net/Makefile b/net/Makefile
index b06b5539e7a6..65bb8c72a35e 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -17,7 +17,7 @@ obj-$(CONFIG_NETFILTER)		+= netfilter/
 obj-$(CONFIG_INET)		+= ipv4/
 obj-$(CONFIG_TLS)		+= tls/
 obj-$(CONFIG_XFRM)		+= xfrm/
-obj-$(CONFIG_UNIX_SCM)		+= unix/
+obj-$(CONFIG_UNIX)		+= unix/
 obj-y				+= ipv6/
 obj-$(CONFIG_PACKET)		+= packet/
 obj-$(CONFIG_NET_KEY)		+= key/
diff --git a/net/unix/Kconfig b/net/unix/Kconfig
index 28b232f281ab..8b5d04210d7c 100644
--- a/net/unix/Kconfig
+++ b/net/unix/Kconfig
@@ -16,11 +16,6 @@ config UNIX
 
 	  Say Y unless you know what you are doing.
 
-config UNIX_SCM
-	bool
-	depends on UNIX
-	default y
-
 config	AF_UNIX_OOB
 	bool
 	depends on UNIX
diff --git a/net/unix/Makefile b/net/unix/Makefile
index 20491825b4d0..4ddd125c4642 100644
--- a/net/unix/Makefile
+++ b/net/unix/Makefile
@@ -11,5 +11,3 @@ unix-$(CONFIG_BPF_SYSCALL) += unix_bpf.o
 
 obj-$(CONFIG_UNIX_DIAG)	+= unix_diag.o
 unix_diag-y		:= diag.o
-
-obj-$(CONFIG_UNIX_SCM)	+= scm.o
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 1720419d93d6..1cfbc586adb4 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -118,8 +118,6 @@
 #include <linux/btf_ids.h>
 #include <linux/bpf-cgroup.h>
 
-#include "scm.h"
-
 static atomic_long_t unix_nr_socks;
 static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
 static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
@@ -1790,6 +1788,52 @@ out:
 	return err;
 }
 
+/* The "user->unix_inflight" variable is protected by the garbage
+ * collection lock, and we just read it locklessly here. If you go
+ * over the limit, there might be a tiny race in actually noticing
+ * it across threads. Tough.
+ */
+static inline bool too_many_unix_fds(struct task_struct *p)
+{
+	struct user_struct *user = current_user();
+
+	if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE)))
+		return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
+	return false;
+}
+
+static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
+{
+	int i;
+
+	if (too_many_unix_fds(current))
+		return -ETOOMANYREFS;
+
+	/* Need to duplicate file references for the sake of garbage
+	 * collection.  Otherwise a socket in the fps might become a
+	 * candidate for GC while the skb is not yet queued.
+	 */
+	UNIXCB(skb).fp = scm_fp_dup(scm->fp);
+	if (!UNIXCB(skb).fp)
+		return -ENOMEM;
+
+	for (i = scm->fp->count - 1; i >= 0; i--)
+		unix_inflight(scm->fp->user, scm->fp->fp[i]);
+
+	return 0;
+}
+
+static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
+{
+	int i;
+
+	scm->fp = UNIXCB(skb).fp;
+	UNIXCB(skb).fp = NULL;
+
+	for (i = scm->fp->count - 1; i >= 0; i--)
+		unix_notinflight(scm->fp->user, scm->fp->fp[i]);
+}
+
 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
 {
 	scm->fp = scm_fp_dup(UNIXCB(skb).fp);
@@ -1837,6 +1881,21 @@ static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
 	spin_unlock(&unix_gc_lock);
 }
 
+static void unix_destruct_scm(struct sk_buff *skb)
+{
+	struct scm_cookie scm;
+
+	memset(&scm, 0, sizeof(scm));
+	scm.pid  = UNIXCB(skb).pid;
+	if (UNIXCB(skb).fp)
+		unix_detach_fds(&scm, skb);
+
+	/* Alas, it calls VFS */
+	/* So fscking what? fput() had been SMP-safe since the last Summer */
+	scm_destroy(&scm);
+	sock_wfree(skb);
+}
+
 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
 {
 	int err = 0;
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index ce5b5f87b16e..9b8473dd79a4 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -81,11 +81,80 @@
 #include <net/scm.h>
 #include <net/tcp_states.h>
 
-#include "scm.h"
+struct unix_sock *unix_get_socket(struct file *filp)
+{
+	struct inode *inode = file_inode(filp);
+
+	/* Socket ? */
+	if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) {
+		struct socket *sock = SOCKET_I(inode);
+		const struct proto_ops *ops;
+		struct sock *sk = sock->sk;
 
-/* Internal data structures and random procedures: */
+		ops = READ_ONCE(sock->ops);
 
+		/* PF_UNIX ? */
+		if (sk && ops && ops->family == PF_UNIX)
+			return unix_sk(sk);
+	}
+
+	return NULL;
+}
+
+DEFINE_SPINLOCK(unix_gc_lock);
+unsigned int unix_tot_inflight;
 static LIST_HEAD(gc_candidates);
+static LIST_HEAD(gc_inflight_list);
+
+/* Keep the number of times in flight count for the file
+ * descriptor if it is for an AF_UNIX socket.
+ */
+void unix_inflight(struct user_struct *user, struct file *filp)
+{
+	struct unix_sock *u = unix_get_socket(filp);
+
+	spin_lock(&unix_gc_lock);
+
+	if (u) {
+		if (!u->inflight) {
+			WARN_ON_ONCE(!list_empty(&u->link));
+			list_add_tail(&u->link, &gc_inflight_list);
+		} else {
+			WARN_ON_ONCE(list_empty(&u->link));
+		}
+		u->inflight++;
+
+		/* Paired with READ_ONCE() in wait_for_unix_gc() */
+		WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + 1);
+	}
+
+	WRITE_ONCE(user->unix_inflight, user->unix_inflight + 1);
+
+	spin_unlock(&unix_gc_lock);
+}
+
+void unix_notinflight(struct user_struct *user, struct file *filp)
+{
+	struct unix_sock *u = unix_get_socket(filp);
+
+	spin_lock(&unix_gc_lock);
+
+	if (u) {
+		WARN_ON_ONCE(!u->inflight);
+		WARN_ON_ONCE(list_empty(&u->link));
+
+		u->inflight--;
+		if (!u->inflight)
+			list_del_init(&u->link);
+
+		/* Paired with READ_ONCE() in wait_for_unix_gc() */
+		WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - 1);
+	}
+
+	WRITE_ONCE(user->unix_inflight, user->unix_inflight - 1);
+
+	spin_unlock(&unix_gc_lock);
+}
 
 static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *),
 			  struct sk_buff_head *hitlist)
diff --git a/net/unix/scm.c b/net/unix/scm.c
deleted file mode 100644
index db65b0ab5947..000000000000
--- a/net/unix/scm.c
+++ /dev/null
@@ -1,150 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/socket.h>
-#include <linux/net.h>
-#include <linux/fs.h>
-#include <net/af_unix.h>
-#include <net/scm.h>
-#include <linux/init.h>
-#include <linux/io_uring.h>
-
-#include "scm.h"
-
-unsigned int unix_tot_inflight;
-EXPORT_SYMBOL(unix_tot_inflight);
-
-LIST_HEAD(gc_inflight_list);
-EXPORT_SYMBOL(gc_inflight_list);
-
-DEFINE_SPINLOCK(unix_gc_lock);
-EXPORT_SYMBOL(unix_gc_lock);
-
-struct unix_sock *unix_get_socket(struct file *filp)
-{
-	struct inode *inode = file_inode(filp);
-
-	/* Socket ? */
-	if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) {
-		struct socket *sock = SOCKET_I(inode);
-		const struct proto_ops *ops = READ_ONCE(sock->ops);
-		struct sock *s = sock->sk;
-
-		/* PF_UNIX ? */
-		if (s && ops && ops->family == PF_UNIX)
-			return unix_sk(s);
-	}
-
-	return NULL;
-}
-EXPORT_SYMBOL(unix_get_socket);
-
-/* Keep the number of times in flight count for the file
- * descriptor if it is for an AF_UNIX socket.
- */
-void unix_inflight(struct user_struct *user, struct file *fp)
-{
-	struct unix_sock *u = unix_get_socket(fp);
-
-	spin_lock(&unix_gc_lock);
-
-	if (u) {
-		if (!u->inflight) {
-			WARN_ON_ONCE(!list_empty(&u->link));
-			list_add_tail(&u->link, &gc_inflight_list);
-		} else {
-			WARN_ON_ONCE(list_empty(&u->link));
-		}
-		u->inflight++;
-		/* Paired with READ_ONCE() in wait_for_unix_gc() */
-		WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + 1);
-	}
-	WRITE_ONCE(user->unix_inflight, user->unix_inflight + 1);
-	spin_unlock(&unix_gc_lock);
-}
-
-void unix_notinflight(struct user_struct *user, struct file *fp)
-{
-	struct unix_sock *u = unix_get_socket(fp);
-
-	spin_lock(&unix_gc_lock);
-
-	if (u) {
-		WARN_ON_ONCE(!u->inflight);
-		WARN_ON_ONCE(list_empty(&u->link));
-
-		u->inflight--;
-		if (!u->inflight)
-			list_del_init(&u->link);
-		/* Paired with READ_ONCE() in wait_for_unix_gc() */
-		WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - 1);
-	}
-	WRITE_ONCE(user->unix_inflight, user->unix_inflight - 1);
-	spin_unlock(&unix_gc_lock);
-}
-
-/*
- * The "user->unix_inflight" variable is protected by the garbage
- * collection lock, and we just read it locklessly here. If you go
- * over the limit, there might be a tiny race in actually noticing
- * it across threads. Tough.
- */
-static inline bool too_many_unix_fds(struct task_struct *p)
-{
-	struct user_struct *user = current_user();
-
-	if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE)))
-		return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
-	return false;
-}
-
-int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
-{
-	int i;
-
-	if (too_many_unix_fds(current))
-		return -ETOOMANYREFS;
-
-	/*
-	 * Need to duplicate file references for the sake of garbage
-	 * collection.  Otherwise a socket in the fps might become a
-	 * candidate for GC while the skb is not yet queued.
-	 */
-	UNIXCB(skb).fp = scm_fp_dup(scm->fp);
-	if (!UNIXCB(skb).fp)
-		return -ENOMEM;
-
-	for (i = scm->fp->count - 1; i >= 0; i--)
-		unix_inflight(scm->fp->user, scm->fp->fp[i]);
-	return 0;
-}
-EXPORT_SYMBOL(unix_attach_fds);
-
-void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
-{
-	int i;
-
-	scm->fp = UNIXCB(skb).fp;
-	UNIXCB(skb).fp = NULL;
-
-	for (i = scm->fp->count-1; i >= 0; i--)
-		unix_notinflight(scm->fp->user, scm->fp->fp[i]);
-}
-EXPORT_SYMBOL(unix_detach_fds);
-
-void unix_destruct_scm(struct sk_buff *skb)
-{
-	struct scm_cookie scm;
-
-	memset(&scm, 0, sizeof(scm));
-	scm.pid  = UNIXCB(skb).pid;
-	if (UNIXCB(skb).fp)
-		unix_detach_fds(&scm, skb);
-
-	/* Alas, it calls VFS */
-	/* So fscking what? fput() had been SMP-safe since the last Summer */
-	scm_destroy(&scm);
-	sock_wfree(skb);
-}
-EXPORT_SYMBOL(unix_destruct_scm);
diff --git a/net/unix/scm.h b/net/unix/scm.h
deleted file mode 100644
index 5a255a477f16..000000000000
--- a/net/unix/scm.h
+++ /dev/null
@@ -1,10 +0,0 @@
-#ifndef NET_UNIX_SCM_H
-#define NET_UNIX_SCM_H
-
-extern struct list_head gc_inflight_list;
-extern spinlock_t unix_gc_lock;
-
-int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb);
-void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb);
-
-#endif
-- 
cgit v1.2.3


From a05e90427ef6706f59188b379ad6366b9d298bc5 Mon Sep 17 00:00:00 2001
From: Daniel Xu <dxu@dxuuu.xyz>
Date: Sun, 28 Jan 2024 18:24:07 -0700
Subject: bpf: btf: Add BTF_KFUNCS_START/END macro pair

This macro pair is functionally equivalent to BTF_SET8_START/END, except
with BTF_SET8_KFUNCS flag set in the btf_id_set8 flags field. The next
commit will codemod all kfunc set8s to this new variant such that all
kfuncs are tagged as such in .BTF_ids section.

Signed-off-by: Daniel Xu <dxu@dxuuu.xyz>
Link: https://lore.kernel.org/r/d536c57c7c2af428686853cc7396b7a44faa53b7.1706491398.git.dxu@dxuuu.xyz
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/btf_ids.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include')

diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h
index dca09b7f21dc..e24aabfe8ecc 100644
--- a/include/linux/btf_ids.h
+++ b/include/linux/btf_ids.h
@@ -8,6 +8,9 @@ struct btf_id_set {
 	u32 ids[];
 };
 
+/* This flag implies BTF_SET8 holds kfunc(s) */
+#define BTF_SET8_KFUNCS		(1 << 0)
+
 struct btf_id_set8 {
 	u32 cnt;
 	u32 flags;
@@ -204,6 +207,12 @@ asm(							\
 ".popsection;                                 \n");	\
 extern struct btf_id_set8 name;
 
+#define BTF_KFUNCS_START(name)				\
+__BTF_SET8_START(name, local, BTF_SET8_KFUNCS)
+
+#define BTF_KFUNCS_END(name)				\
+BTF_SET8_END(name)
+
 #else
 
 #define BTF_ID_LIST(name) static u32 __maybe_unused name[64];
@@ -218,6 +227,8 @@ extern struct btf_id_set8 name;
 #define BTF_SET_END(name)
 #define BTF_SET8_START(name) static struct btf_id_set8 __maybe_unused name = { 0 };
 #define BTF_SET8_END(name)
+#define BTF_KFUNCS_START(name) static struct btf_id_set8 __maybe_unused name = { .flags = BTF_SET8_KFUNCS };
+#define BTF_KFUNCS_END(name)
 
 #endif /* CONFIG_DEBUG_INFO_BTF */
 
-- 
cgit v1.2.3


From 3723b56d6f73f7c8c3b521a80556f129830f6fb9 Mon Sep 17 00:00:00 2001
From: Philippe Schenker <philippe.schenker@impulsing.ch>
Date: Tue, 30 Jan 2024 09:34:19 +0100
Subject: net: dsa: Add KSZ8567 switch support

This commit introduces support for the KSZ8567, a robust 7-port
Ethernet switch. The KSZ8567 features two RGMII/MII/RMII interfaces,
each capable of gigabit speeds, complemented by five 10/100 Mbps
MAC/PHYs.

Signed-off-by: Philippe Schenker <philippe.schenker@impulsing.ch>
Acked-by: Arun Ramadoss <arun.ramadoss@microchip.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Link: https://lore.kernel.org/r/20240130083419.135763-2-dev@pschenker.ch
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/dsa/microchip/ksz9477_i2c.c     |  4 +++
 drivers/net/dsa/microchip/ksz_common.c      | 43 ++++++++++++++++++++++++++++-
 drivers/net/dsa/microchip/ksz_common.h      |  1 +
 drivers/net/dsa/microchip/ksz_spi.c         |  5 ++++
 include/linux/platform_data/microchip-ksz.h |  1 +
 5 files changed, 53 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/net/dsa/microchip/ksz9477_i2c.c b/drivers/net/dsa/microchip/ksz9477_i2c.c
index cac4a607e54a..82bebee4615c 100644
--- a/drivers/net/dsa/microchip/ksz9477_i2c.c
+++ b/drivers/net/dsa/microchip/ksz9477_i2c.c
@@ -103,6 +103,10 @@ static const struct of_device_id ksz9477_dt_ids[] = {
 		.compatible = "microchip,ksz8563",
 		.data = &ksz_switch_chips[KSZ8563]
 	},
+	{
+		.compatible = "microchip,ksz8567",
+		.data = &ksz_switch_chips[KSZ8567]
+	},
 	{
 		.compatible = "microchip,ksz9567",
 		.data = &ksz_switch_chips[KSZ9567]
diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c
index a7b5ddb8656e..9ff5132f3ac6 100644
--- a/drivers/net/dsa/microchip/ksz_common.c
+++ b/drivers/net/dsa/microchip/ksz_common.c
@@ -1476,6 +1476,39 @@ const struct ksz_chip_data ksz_switch_chips[] = {
 		.gbit_capable = {true, true, true},
 	},
 
+	[KSZ8567] = {
+		.chip_id = KSZ8567_CHIP_ID,
+		.dev_name = "KSZ8567",
+		.num_vlans = 4096,
+		.num_alus = 4096,
+		.num_statics = 16,
+		.cpu_ports = 0x7F,	/* can be configured as cpu port */
+		.port_cnt = 7,		/* total port count */
+		.port_nirqs = 3,
+		.num_tx_queues = 4,
+		.tc_cbs_supported = true,
+		.tc_ets_supported = true,
+		.ops = &ksz9477_dev_ops,
+		.mib_names = ksz9477_mib_names,
+		.mib_cnt = ARRAY_SIZE(ksz9477_mib_names),
+		.reg_mib_cnt = MIB_COUNTER_NUM,
+		.regs = ksz9477_regs,
+		.masks = ksz9477_masks,
+		.shifts = ksz9477_shifts,
+		.xmii_ctrl0 = ksz9477_xmii_ctrl0,
+		.xmii_ctrl1 = ksz9477_xmii_ctrl1,
+		.supports_mii	= {false, false, false, false,
+				   false, true, true},
+		.supports_rmii	= {false, false, false, false,
+				   false, true, true},
+		.supports_rgmii = {false, false, false, false,
+				   false, true, true},
+		.internal_phy	= {true, true, true, true,
+				   true, false, false},
+		.gbit_capable	= {false, false, false, false, false,
+				   true, true},
+	},
+
 	[KSZ9567] = {
 		.chip_id = KSZ9567_CHIP_ID,
 		.dev_name = "KSZ9567",
@@ -2649,6 +2682,7 @@ static void ksz_port_teardown(struct dsa_switch *ds, int port)
 
 	switch (dev->chip_id) {
 	case KSZ8563_CHIP_ID:
+	case KSZ8567_CHIP_ID:
 	case KSZ9477_CHIP_ID:
 	case KSZ9563_CHIP_ID:
 	case KSZ9567_CHIP_ID:
@@ -2705,7 +2739,8 @@ static enum dsa_tag_protocol ksz_get_tag_protocol(struct dsa_switch *ds,
 	    dev->chip_id == KSZ9563_CHIP_ID)
 		proto = DSA_TAG_PROTO_KSZ9893;
 
-	if (dev->chip_id == KSZ9477_CHIP_ID ||
+	if (dev->chip_id == KSZ8567_CHIP_ID ||
+	    dev->chip_id == KSZ9477_CHIP_ID ||
 	    dev->chip_id == KSZ9896_CHIP_ID ||
 	    dev->chip_id == KSZ9897_CHIP_ID ||
 	    dev->chip_id == KSZ9567_CHIP_ID)
@@ -2813,6 +2848,7 @@ static int ksz_max_mtu(struct dsa_switch *ds, int port)
 	case KSZ8830_CHIP_ID:
 		return KSZ8863_HUGE_PACKET_SIZE - VLAN_ETH_HLEN - ETH_FCS_LEN;
 	case KSZ8563_CHIP_ID:
+	case KSZ8567_CHIP_ID:
 	case KSZ9477_CHIP_ID:
 	case KSZ9563_CHIP_ID:
 	case KSZ9567_CHIP_ID:
@@ -2839,6 +2875,7 @@ static int ksz_validate_eee(struct dsa_switch *ds, int port)
 
 	switch (dev->chip_id) {
 	case KSZ8563_CHIP_ID:
+	case KSZ8567_CHIP_ID:
 	case KSZ9477_CHIP_ID:
 	case KSZ9563_CHIP_ID:
 	case KSZ9567_CHIP_ID:
@@ -3183,6 +3220,7 @@ static int ksz_switch_detect(struct ksz_device *dev)
 		case KSZ9896_CHIP_ID:
 		case KSZ9897_CHIP_ID:
 		case KSZ9567_CHIP_ID:
+		case KSZ8567_CHIP_ID:
 		case LAN9370_CHIP_ID:
 		case LAN9371_CHIP_ID:
 		case LAN9372_CHIP_ID:
@@ -3220,6 +3258,7 @@ static int ksz_cls_flower_add(struct dsa_switch *ds, int port,
 
 	switch (dev->chip_id) {
 	case KSZ8563_CHIP_ID:
+	case KSZ8567_CHIP_ID:
 	case KSZ9477_CHIP_ID:
 	case KSZ9563_CHIP_ID:
 	case KSZ9567_CHIP_ID:
@@ -3239,6 +3278,7 @@ static int ksz_cls_flower_del(struct dsa_switch *ds, int port,
 
 	switch (dev->chip_id) {
 	case KSZ8563_CHIP_ID:
+	case KSZ8567_CHIP_ID:
 	case KSZ9477_CHIP_ID:
 	case KSZ9563_CHIP_ID:
 	case KSZ9567_CHIP_ID:
@@ -4142,6 +4182,7 @@ static int ksz_parse_drive_strength(struct ksz_device *dev)
 	case KSZ8794_CHIP_ID:
 	case KSZ8765_CHIP_ID:
 	case KSZ8563_CHIP_ID:
+	case KSZ8567_CHIP_ID:
 	case KSZ9477_CHIP_ID:
 	case KSZ9563_CHIP_ID:
 	case KSZ9567_CHIP_ID:
diff --git a/drivers/net/dsa/microchip/ksz_common.h b/drivers/net/dsa/microchip/ksz_common.h
index 15612101a155..060c5de9aa05 100644
--- a/drivers/net/dsa/microchip/ksz_common.h
+++ b/drivers/net/dsa/microchip/ksz_common.h
@@ -187,6 +187,7 @@ struct ksz_device {
 /* List of supported models */
 enum ksz_model {
 	KSZ8563,
+	KSZ8567,
 	KSZ8795,
 	KSZ8794,
 	KSZ8765,
diff --git a/drivers/net/dsa/microchip/ksz_spi.c b/drivers/net/dsa/microchip/ksz_spi.c
index 6f6d878e742c..c8166fb440ab 100644
--- a/drivers/net/dsa/microchip/ksz_spi.c
+++ b/drivers/net/dsa/microchip/ksz_spi.c
@@ -164,6 +164,10 @@ static const struct of_device_id ksz_dt_ids[] = {
 		.compatible = "microchip,ksz8563",
 		.data = &ksz_switch_chips[KSZ8563]
 	},
+	{
+		.compatible = "microchip,ksz8567",
+		.data = &ksz_switch_chips[KSZ8567]
+	},
 	{
 		.compatible = "microchip,ksz9567",
 		.data = &ksz_switch_chips[KSZ9567]
@@ -204,6 +208,7 @@ static const struct spi_device_id ksz_spi_ids[] = {
 	{ "ksz9893" },
 	{ "ksz9563" },
 	{ "ksz8563" },
+	{ "ksz8567" },
 	{ "ksz9567" },
 	{ "lan9370" },
 	{ "lan9371" },
diff --git a/include/linux/platform_data/microchip-ksz.h b/include/linux/platform_data/microchip-ksz.h
index f177416635a2..8c659db4da6b 100644
--- a/include/linux/platform_data/microchip-ksz.h
+++ b/include/linux/platform_data/microchip-ksz.h
@@ -33,6 +33,7 @@ enum ksz_chip_id {
 	KSZ9897_CHIP_ID = 0x00989700,
 	KSZ9893_CHIP_ID = 0x00989300,
 	KSZ9563_CHIP_ID = 0x00956300,
+	KSZ8567_CHIP_ID = 0x00856700,
 	KSZ9567_CHIP_ID = 0x00956700,
 	LAN9370_CHIP_ID = 0x00937000,
 	LAN9371_CHIP_ID = 0x00937100,
-- 
cgit v1.2.3


From c1f5204efcbcced83f67f12fa8f1a7f5f244fb87 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Sun, 28 Jan 2024 22:21:04 -0800
Subject: cpumask: add cpumask_weight_andnot()

Similarly to cpumask_weight_and(), cpumask_weight_andnot() is a handy
helper that may help to avoid creating an intermediate mask just to
calculate number of bits that set in a 1st given mask, and clear in 2nd
one.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/bitmap.h  | 12 ++++++++++++
 include/linux/cpumask.h | 13 +++++++++++++
 lib/bitmap.c            |  7 +++++++
 3 files changed, 32 insertions(+)

(limited to 'include')

diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index 99451431e4d6..5814e9ee40ba 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -54,6 +54,7 @@ struct device;
  *  bitmap_full(src, nbits)                     Are all bits set in *src?
  *  bitmap_weight(src, nbits)                   Hamming Weight: number set bits
  *  bitmap_weight_and(src1, src2, nbits)        Hamming Weight of and'ed bitmap
+ *  bitmap_weight_andnot(src1, src2, nbits)     Hamming Weight of andnot'ed bitmap
  *  bitmap_set(dst, pos, nbits)                 Set specified bit area
  *  bitmap_clear(dst, pos, nbits)               Clear specified bit area
  *  bitmap_find_next_zero_area(buf, len, pos, n, mask)  Find bit free area
@@ -169,6 +170,8 @@ bool __bitmap_subset(const unsigned long *bitmap1,
 unsigned int __bitmap_weight(const unsigned long *bitmap, unsigned int nbits);
 unsigned int __bitmap_weight_and(const unsigned long *bitmap1,
 				 const unsigned long *bitmap2, unsigned int nbits);
+unsigned int __bitmap_weight_andnot(const unsigned long *bitmap1,
+				    const unsigned long *bitmap2, unsigned int nbits);
 void __bitmap_set(unsigned long *map, unsigned int start, int len);
 void __bitmap_clear(unsigned long *map, unsigned int start, int len);
 
@@ -425,6 +428,15 @@ unsigned long bitmap_weight_and(const unsigned long *src1,
 	return __bitmap_weight_and(src1, src2, nbits);
 }
 
+static __always_inline
+unsigned long bitmap_weight_andnot(const unsigned long *src1,
+				   const unsigned long *src2, unsigned int nbits)
+{
+	if (small_const_nbits(nbits))
+		return hweight_long(*src1 & ~(*src2) & BITMAP_LAST_WORD_MASK(nbits));
+	return __bitmap_weight_andnot(src1, src2, nbits);
+}
+
 static __always_inline void bitmap_set(unsigned long *map, unsigned int start,
 		unsigned int nbits)
 {
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index cfb545841a2c..228c23eb36d2 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -719,6 +719,19 @@ static inline unsigned int cpumask_weight_and(const struct cpumask *srcp1,
 	return bitmap_weight_and(cpumask_bits(srcp1), cpumask_bits(srcp2), small_cpumask_bits);
 }
 
+/**
+ * cpumask_weight_andnot - Count of bits in (*srcp1 & ~*srcp2)
+ * @srcp1: the cpumask to count bits (< nr_cpu_ids) in.
+ * @srcp2: the cpumask to count bits (< nr_cpu_ids) in.
+ *
+ * Return: count of bits set in both *srcp1 and *srcp2
+ */
+static inline unsigned int cpumask_weight_andnot(const struct cpumask *srcp1,
+						const struct cpumask *srcp2)
+{
+	return bitmap_weight_andnot(cpumask_bits(srcp1), cpumask_bits(srcp2), small_cpumask_bits);
+}
+
 /**
  * cpumask_shift_right - *dstp = *srcp >> n
  * @dstp: the cpumask result
diff --git a/lib/bitmap.c b/lib/bitmap.c
index 09522af227f1..b97692854966 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -348,6 +348,13 @@ unsigned int __bitmap_weight_and(const unsigned long *bitmap1,
 }
 EXPORT_SYMBOL(__bitmap_weight_and);
 
+unsigned int __bitmap_weight_andnot(const unsigned long *bitmap1,
+				const unsigned long *bitmap2, unsigned int bits)
+{
+	return BITMAP_WEIGHT(bitmap1[idx] & ~bitmap2[idx], bits);
+}
+EXPORT_SYMBOL(__bitmap_weight_andnot);
+
 void __bitmap_set(unsigned long *map, unsigned int start, int len)
 {
 	unsigned long *p = map + BIT_WORD(start);
-- 
cgit v1.2.3


From dcee228078c34b63089c4b589d4bddf08019d0f6 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Sun, 28 Jan 2024 22:21:05 -0800
Subject: cpumask: define cleanup function for cpumasks

Now we can simplify code that allocates cpumasks for local needs.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/cpumask.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 228c23eb36d2..1c29947db848 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -7,6 +7,7 @@
  * set of CPUs in a system, one bit position per CPU number.  In general,
  * only nr_cpu_ids (<= NR_CPUS) bits are valid.
  */
+#include <linux/cleanup.h>
 #include <linux/kernel.h>
 #include <linux/threads.h>
 #include <linux/bitmap.h>
@@ -990,6 +991,8 @@ static inline bool cpumask_available(cpumask_var_t mask)
 }
 #endif /* CONFIG_CPUMASK_OFFSTACK */
 
+DEFINE_FREE(free_cpumask_var, struct cpumask *, if (_T) free_cpumask_var(_T));
+
 /* It's common to want to use cpu_all_mask in struct member initializers,
  * so it has to refer to an address rather than a pointer. */
 extern const DECLARE_BITMAP(cpu_all_bits, NR_CPUS);
-- 
cgit v1.2.3


From cf4f0f1e1c465da7c1f6bc89c3ff50bf42f0ab02 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@nvidia.com>
Date: Tue, 30 Jan 2024 13:08:29 +0100
Subject: dpll: extend uapi by lock status error attribute

If the dpll devices goes to state "unlocked" or "holdover", it may be
caused by an error. In that case, allow user to see what the error was.
Introduce a new attribute and values it can carry.

Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Acked-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 Documentation/netlink/specs/dpll.yaml | 39 +++++++++++++++++++++++++++++++++++
 include/uapi/linux/dpll.h             | 30 +++++++++++++++++++++++++++
 2 files changed, 69 insertions(+)

(limited to 'include')

diff --git a/Documentation/netlink/specs/dpll.yaml b/Documentation/netlink/specs/dpll.yaml
index b14aed18065f..1755066d8308 100644
--- a/Documentation/netlink/specs/dpll.yaml
+++ b/Documentation/netlink/specs/dpll.yaml
@@ -51,6 +51,40 @@ definitions:
           if dpll lock-state was not DPLL_LOCK_STATUS_LOCKED_HO_ACQ, the
           dpll's lock-state shall remain DPLL_LOCK_STATUS_UNLOCKED)
     render-max: true
+  -
+    type: enum
+    name: lock-status-error
+    doc: |
+      if previous status change was done due to a failure, this provides
+      information of dpll device lock status error.
+      Valid values for DPLL_A_LOCK_STATUS_ERROR attribute
+    entries:
+      -
+        name: none
+        doc: |
+          dpll device lock status was changed without any error
+        value: 1
+      -
+        name: undefined
+        doc: |
+          dpll device lock status was changed due to undefined error.
+          Driver fills this value up in case it is not able
+          to obtain suitable exact error type.
+      -
+        name: media-down
+        doc: |
+          dpll device lock status was changed because of associated
+          media got down.
+          This may happen for example if dpll device was previously
+          locked on an input pin of type PIN_TYPE_SYNCE_ETH_PORT.
+      -
+        name: fractional-frequency-offset-too-high
+        doc: |
+          the FFO (Fractional Frequency Offset) between the RX and TX
+          symbol rate on the media got too high.
+          This may happen for example if dpll device was previously
+          locked on an input pin of type PIN_TYPE_SYNCE_ETH_PORT.
+    render-max: true
   -
     type: const
     name: temp-divider
@@ -214,6 +248,10 @@ attribute-sets:
         name: type
         type: u32
         enum: type
+      -
+        name: lock-status-error
+        type: u32
+        enum: lock-status-error
   -
     name: pin
     enum-name: dpll_a_pin
@@ -379,6 +417,7 @@ operations:
             - mode
             - mode-supported
             - lock-status
+            - lock-status-error
             - temp
             - clock-id
             - type
diff --git a/include/uapi/linux/dpll.h b/include/uapi/linux/dpll.h
index b4e947f9bfbc..0c13d7f1a1bc 100644
--- a/include/uapi/linux/dpll.h
+++ b/include/uapi/linux/dpll.h
@@ -50,6 +50,35 @@ enum dpll_lock_status {
 	DPLL_LOCK_STATUS_MAX = (__DPLL_LOCK_STATUS_MAX - 1)
 };
 
+/**
+ * enum dpll_lock_status_error - if previous status change was done due to a
+ *   failure, this provides information of dpll device lock status error. Valid
+ *   values for DPLL_A_LOCK_STATUS_ERROR attribute
+ * @DPLL_LOCK_STATUS_ERROR_NONE: dpll device lock status was changed without
+ *   any error
+ * @DPLL_LOCK_STATUS_ERROR_UNDEFINED: dpll device lock status was changed due
+ *   to undefined error. Driver fills this value up in case it is not able to
+ *   obtain suitable exact error type.
+ * @DPLL_LOCK_STATUS_ERROR_MEDIA_DOWN: dpll device lock status was changed
+ *   because of associated media got down. This may happen for example if dpll
+ *   device was previously locked on an input pin of type
+ *   PIN_TYPE_SYNCE_ETH_PORT.
+ * @DPLL_LOCK_STATUS_ERROR_FRACTIONAL_FREQUENCY_OFFSET_TOO_HIGH: the FFO
+ *   (Fractional Frequency Offset) between the RX and TX symbol rate on the
+ *   media got too high. This may happen for example if dpll device was
+ *   previously locked on an input pin of type PIN_TYPE_SYNCE_ETH_PORT.
+ */
+enum dpll_lock_status_error {
+	DPLL_LOCK_STATUS_ERROR_NONE = 1,
+	DPLL_LOCK_STATUS_ERROR_UNDEFINED,
+	DPLL_LOCK_STATUS_ERROR_MEDIA_DOWN,
+	DPLL_LOCK_STATUS_ERROR_FRACTIONAL_FREQUENCY_OFFSET_TOO_HIGH,
+
+	/* private: */
+	__DPLL_LOCK_STATUS_ERROR_MAX,
+	DPLL_LOCK_STATUS_ERROR_MAX = (__DPLL_LOCK_STATUS_ERROR_MAX - 1)
+};
+
 #define DPLL_TEMP_DIVIDER	1000
 
 /**
@@ -150,6 +179,7 @@ enum dpll_a {
 	DPLL_A_LOCK_STATUS,
 	DPLL_A_TEMP,
 	DPLL_A_TYPE,
+	DPLL_A_LOCK_STATUS_ERROR,
 
 	__DPLL_A_MAX,
 	DPLL_A_MAX = (__DPLL_A_MAX - 1)
-- 
cgit v1.2.3


From e2ca9e75849e63eab6544549b6888595997e8153 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@nvidia.com>
Date: Tue, 30 Jan 2024 13:08:30 +0100
Subject: dpll: extend lock_status_get() op by status error and expose to user

Pass additional argunent status_error over lock_status_get()
so drivers can fill it up. In case they do, expose the value over
previously introduced attribute to user. Do it only in case the
current lock_status is either "unlocked" or "holdover".

Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Acked-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/dpll/dpll_netlink.c                    | 9 ++++++++-
 drivers/net/ethernet/intel/ice/ice_dpll.c      | 2 ++
 drivers/net/ethernet/mellanox/mlx5/core/dpll.c | 9 +++++----
 drivers/ptp/ptp_ocp.c                          | 9 +++++----
 include/linux/dpll.h                           | 1 +
 5 files changed, 21 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/drivers/dpll/dpll_netlink.c b/drivers/dpll/dpll_netlink.c
index 314bb3775465..cf3313517ae1 100644
--- a/drivers/dpll/dpll_netlink.c
+++ b/drivers/dpll/dpll_netlink.c
@@ -121,14 +121,21 @@ dpll_msg_add_lock_status(struct sk_buff *msg, struct dpll_device *dpll,
 			 struct netlink_ext_ack *extack)
 {
 	const struct dpll_device_ops *ops = dpll_device_ops(dpll);
+	enum dpll_lock_status_error status_error = 0;
 	enum dpll_lock_status status;
 	int ret;
 
-	ret = ops->lock_status_get(dpll, dpll_priv(dpll), &status, extack);
+	ret = ops->lock_status_get(dpll, dpll_priv(dpll), &status,
+				   &status_error, extack);
 	if (ret)
 		return ret;
 	if (nla_put_u32(msg, DPLL_A_LOCK_STATUS, status))
 		return -EMSGSIZE;
+	if (status_error &&
+	    (status == DPLL_LOCK_STATUS_UNLOCKED ||
+	     status == DPLL_LOCK_STATUS_HOLDOVER) &&
+	    nla_put_u32(msg, DPLL_A_LOCK_STATUS_ERROR, status_error))
+		return -EMSGSIZE;
 
 	return 0;
 }
diff --git a/drivers/net/ethernet/intel/ice/ice_dpll.c b/drivers/net/ethernet/intel/ice/ice_dpll.c
index b9c5eced6326..c0256564e998 100644
--- a/drivers/net/ethernet/intel/ice/ice_dpll.c
+++ b/drivers/net/ethernet/intel/ice/ice_dpll.c
@@ -488,6 +488,7 @@ ice_dpll_hw_input_prio_set(struct ice_pf *pf, struct ice_dpll *dpll,
  * @dpll: registered dpll pointer
  * @dpll_priv: private data pointer passed on dpll registration
  * @status: on success holds dpll's lock status
+ * @status_error: status error value
  * @extack: error reporting
  *
  * Dpll subsystem callback, provides dpll's lock status.
@@ -500,6 +501,7 @@ ice_dpll_hw_input_prio_set(struct ice_pf *pf, struct ice_dpll *dpll,
 static int
 ice_dpll_lock_status_get(const struct dpll_device *dpll, void *dpll_priv,
 			 enum dpll_lock_status *status,
+			 enum dpll_lock_status_error *status_error,
 			 struct netlink_ext_ack *extack)
 {
 	struct ice_dpll *d = dpll_priv;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/dpll.c b/drivers/net/ethernet/mellanox/mlx5/core/dpll.c
index 18fed2b34fb1..07f43d5c90c6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/dpll.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/dpll.c
@@ -118,10 +118,11 @@ mlx5_dpll_pin_ffo_get(struct mlx5_dpll_synce_status *synce_status,
 	return 0;
 }
 
-static int mlx5_dpll_device_lock_status_get(const struct dpll_device *dpll,
-					    void *priv,
-					    enum dpll_lock_status *status,
-					    struct netlink_ext_ack *extack)
+static int
+mlx5_dpll_device_lock_status_get(const struct dpll_device *dpll, void *priv,
+				 enum dpll_lock_status *status,
+				 enum dpll_lock_status_error *status_error,
+				 struct netlink_ext_ack *extack)
 {
 	struct mlx5_dpll_synce_status synce_status;
 	struct mlx5_dpll *mdpll = priv;
diff --git a/drivers/ptp/ptp_ocp.c b/drivers/ptp/ptp_ocp.c
index 5f858e426bbd..9507681e0d12 100644
--- a/drivers/ptp/ptp_ocp.c
+++ b/drivers/ptp/ptp_ocp.c
@@ -4209,10 +4209,11 @@ ptp_ocp_detach(struct ptp_ocp *bp)
 	device_unregister(&bp->dev);
 }
 
-static int ptp_ocp_dpll_lock_status_get(const struct dpll_device *dpll,
-					void *priv,
-					enum dpll_lock_status *status,
-					struct netlink_ext_ack *extack)
+static int
+ptp_ocp_dpll_lock_status_get(const struct dpll_device *dpll, void *priv,
+			     enum dpll_lock_status *status,
+			     enum dpll_lock_status_error *status_error,
+			     struct netlink_ext_ack *extack)
 {
 	struct ptp_ocp *bp = priv;
 
diff --git a/include/linux/dpll.h b/include/linux/dpll.h
index 9cf896ea1d41..9cb02ad73d51 100644
--- a/include/linux/dpll.h
+++ b/include/linux/dpll.h
@@ -19,6 +19,7 @@ struct dpll_device_ops {
 			enum dpll_mode *mode, struct netlink_ext_ack *extack);
 	int (*lock_status_get)(const struct dpll_device *dpll, void *dpll_priv,
 			       enum dpll_lock_status *status,
+			       enum dpll_lock_status_error *status_error,
 			       struct netlink_ext_ack *extack);
 	int (*temp_get)(const struct dpll_device *dpll, void *dpll_priv,
 			s32 *temp, struct netlink_ext_ack *extack);
-- 
cgit v1.2.3


From 2c54a4d71246379f4ffb9beb6a780f9a49fdfc24 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@nvidia.com>
Date: Tue, 30 Jan 2024 13:08:31 +0100
Subject: net/mlx5: DPLL, Implement lock status error value

Fill-up the lock status error value properly.

Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Acked-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/dpll.c | 23 +++++++++++++++++++++++
 include/linux/mlx5/mlx5_ifc.h                  |  8 ++++++++
 2 files changed, 31 insertions(+)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/dpll.c b/drivers/net/ethernet/mellanox/mlx5/core/dpll.c
index 07f43d5c90c6..4ad3d2d3d4c8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/dpll.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/dpll.c
@@ -41,6 +41,7 @@ struct mlx5_dpll_synce_status {
 	enum mlx5_msees_oper_status oper_status;
 	bool ho_acq;
 	bool oper_freq_measure;
+	enum mlx5_msees_failure_reason failure_reason;
 	s32 frequency_diff;
 };
 
@@ -60,6 +61,7 @@ mlx5_dpll_synce_status_get(struct mlx5_core_dev *mdev,
 	synce_status->oper_status = MLX5_GET(msees_reg, out, oper_status);
 	synce_status->ho_acq = MLX5_GET(msees_reg, out, ho_acq);
 	synce_status->oper_freq_measure = MLX5_GET(msees_reg, out, oper_freq_measure);
+	synce_status->failure_reason = MLX5_GET(msees_reg, out, failure_reason);
 	synce_status->frequency_diff = MLX5_GET(msees_reg, out, frequency_diff);
 	return 0;
 }
@@ -99,6 +101,26 @@ mlx5_dpll_lock_status_get(struct mlx5_dpll_synce_status *synce_status)
 	}
 }
 
+static enum dpll_lock_status_error
+mlx5_dpll_lock_status_error_get(struct mlx5_dpll_synce_status *synce_status)
+{
+	switch (synce_status->oper_status) {
+	case MLX5_MSEES_OPER_STATUS_FAIL_HOLDOVER:
+		fallthrough;
+	case MLX5_MSEES_OPER_STATUS_FAIL_FREE_RUNNING:
+		switch (synce_status->failure_reason) {
+		case MLX5_MSEES_FAILURE_REASON_PORT_DOWN:
+			return DPLL_LOCK_STATUS_ERROR_MEDIA_DOWN;
+		case MLX5_MSEES_FAILURE_REASON_TOO_HIGH_FREQUENCY_DIFF:
+			return DPLL_LOCK_STATUS_ERROR_FRACTIONAL_FREQUENCY_OFFSET_TOO_HIGH;
+		default:
+			return DPLL_LOCK_STATUS_ERROR_UNDEFINED;
+		}
+	default:
+		return DPLL_LOCK_STATUS_ERROR_NONE;
+	}
+}
+
 static enum dpll_pin_state
 mlx5_dpll_pin_state_get(struct mlx5_dpll_synce_status *synce_status)
 {
@@ -132,6 +154,7 @@ mlx5_dpll_device_lock_status_get(const struct dpll_device *dpll, void *priv,
 	if (err)
 		return err;
 	*status = mlx5_dpll_lock_status_get(&synce_status);
+	*status_error = mlx5_dpll_lock_status_error_get(&synce_status);
 	return 0;
 }
 
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index c726f90ab752..6c44f107b8ba 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -12705,6 +12705,14 @@ enum mlx5_msees_oper_status {
 	MLX5_MSEES_OPER_STATUS_FAIL_FREE_RUNNING	= 0x5,
 };
 
+enum mlx5_msees_failure_reason {
+	MLX5_MSEES_FAILURE_REASON_UNDEFINED_ERROR		= 0x0,
+	MLX5_MSEES_FAILURE_REASON_PORT_DOWN			= 0x1,
+	MLX5_MSEES_FAILURE_REASON_TOO_HIGH_FREQUENCY_DIFF	= 0x2,
+	MLX5_MSEES_FAILURE_REASON_NET_SYNCHRONIZER_DEVICE_ERROR	= 0x3,
+	MLX5_MSEES_FAILURE_REASON_LACK_OF_RESOURCES		= 0x4,
+};
+
 struct mlx5_ifc_msees_reg_bits {
 	u8         reserved_at_0[0x8];
 	u8         local_port[0x8];
-- 
cgit v1.2.3


From b26577001af49a20f09770fd6e6cfd10d5daac93 Mon Sep 17 00:00:00 2001
From: Michal Koutný <mkoutny@suse.com>
Date: Thu, 1 Feb 2024 14:09:40 +0100
Subject: net/sched: Add helper macros with module names
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The macros are preparation for adding module aliases en mass in a
separate commit.
Although it would be tempting to create aliases like cls-foo for name
cls_foo, this could not be used because modprobe utilities treat '-' and
'_' interchangeably.
In the end, the naming follows pattern of proto modules in linux/net.h.

Signed-off-by: Michal Koutný <mkoutny@suse.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Link: https://lore.kernel.org/r/20240201130943.19536-2-mkoutny@suse.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/act_api.h   | 2 ++
 include/net/pkt_cls.h   | 2 ++
 include/net/pkt_sched.h | 2 ++
 3 files changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/net/act_api.h b/include/net/act_api.h
index e1e5e72b901e..77ee0c657e2c 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -201,6 +201,8 @@ int tcf_idr_release(struct tc_action *a, bool bind);
 int tcf_register_action(struct tc_action_ops *a, struct pernet_operations *ops);
 int tcf_unregister_action(struct tc_action_ops *a,
 			  struct pernet_operations *ops);
+#define NET_ACT_ALIAS_PREFIX "net-act-"
+#define MODULE_ALIAS_NET_ACT(kind)	MODULE_ALIAS(NET_ACT_ALIAS_PREFIX kind)
 int tcf_action_destroy(struct tc_action *actions[], int bind);
 int tcf_action_exec(struct sk_buff *skb, struct tc_action **actions,
 		    int nr_actions, struct tcf_result *res);
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index f308e8268651..a4ee43f493bb 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -24,6 +24,8 @@ struct tcf_walker {
 
 int register_tcf_proto_ops(struct tcf_proto_ops *ops);
 void unregister_tcf_proto_ops(struct tcf_proto_ops *ops);
+#define NET_CLS_ALIAS_PREFIX "net-cls-"
+#define MODULE_ALIAS_NET_CLS(kind)	MODULE_ALIAS(NET_CLS_ALIAS_PREFIX kind)
 
 struct tcf_block_ext_info {
 	enum flow_block_binder_type binder_type;
diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
index 1e200d9a066d..d7b7b6cd4aa1 100644
--- a/include/net/pkt_sched.h
+++ b/include/net/pkt_sched.h
@@ -100,6 +100,8 @@ struct Qdisc *fifo_create_dflt(struct Qdisc *sch, struct Qdisc_ops *ops,
 
 int register_qdisc(struct Qdisc_ops *qops);
 void unregister_qdisc(struct Qdisc_ops *qops);
+#define NET_SCH_ALIAS_PREFIX "net-sch-"
+#define MODULE_ALIAS_NET_SCH(id)	MODULE_ALIAS(NET_SCH_ALIAS_PREFIX id)
 void qdisc_get_default(char *id, size_t len);
 int qdisc_set_default(const char *id);
 
-- 
cgit v1.2.3


From c1e6148cb4f83cec841db1f066e8db4a86c1f118 Mon Sep 17 00:00:00 2001
From: Maxim Mikityanskiy <maxim@isovalent.com>
Date: Sat, 27 Jan 2024 19:52:34 +0200
Subject: bpf: Preserve boundaries and track scalars on narrowing fill

When the width of a fill is smaller than the width of the preceding
spill, the information about scalar boundaries can still be preserved,
as long as it's coerced to the right width (done by coerce_reg_to_size).
Even further, if the actual value fits into the fill width, the ID can
be preserved as well for further tracking of equal scalars.

Implement the above improvements, which makes narrowing fills behave the
same as narrowing spills and MOVs between registers.

Two tests are adjusted to accommodate for endianness differences and to
take into account that it's now allowed to do a narrowing fill from the
least significant bits.

reg_bounds_sync is added to coerce_reg_to_size to correctly adjust
umin/umax boundaries after the var_off truncation, for example, a 64-bit
value 0xXXXXXXXX00000000, when read as a 32-bit, gets umin = 0, umax =
0xFFFFFFFF, var_off = (0x0; 0xffffffff00000000), which needs to be
synced down to umax = 0, otherwise reg_bounds_sanity_check doesn't pass.

Signed-off-by: Maxim Mikityanskiy <maxim@isovalent.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20240127175237.526726-4-maxtram95@gmail.com
---
 include/linux/bpf_verifier.h                       |  9 ++++++++
 kernel/bpf/verifier.c                              | 15 +++++++++----
 .../selftests/bpf/progs/verifier_spill_fill.c      | 26 ++++++++++++++++------
 3 files changed, 39 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 0dcde339dc7e..84365e6dd85d 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -919,6 +919,15 @@ static inline void mark_verifier_state_scratched(struct bpf_verifier_env *env)
 	env->scratched_stack_slots = ~0ULL;
 }
 
+static inline bool bpf_stack_narrow_access_ok(int off, int fill_size, int spill_size)
+{
+#ifdef __BIG_ENDIAN
+	off -= spill_size - fill_size;
+#endif
+
+	return !(off % BPF_REG_SIZE);
+}
+
 const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type);
 const char *dynptr_type_str(enum bpf_dynptr_type type);
 const char *iter_type_str(const struct btf *btf, u32 btf_id);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 28f62f24da7e..82af971926ac 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4778,7 +4778,8 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
 			if (dst_regno < 0)
 				return 0;
 
-			if (!(off % BPF_REG_SIZE) && size == spill_size) {
+			if (size <= spill_size &&
+			    bpf_stack_narrow_access_ok(off, size, spill_size)) {
 				/* The earlier check_reg_arg() has decided the
 				 * subreg_def for this insn.  Save it first.
 				 */
@@ -4786,6 +4787,12 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
 
 				copy_register_state(&state->regs[dst_regno], reg);
 				state->regs[dst_regno].subreg_def = subreg_def;
+
+				/* Break the relation on a narrowing fill.
+				 * coerce_reg_to_size will adjust the boundaries.
+				 */
+				if (get_reg_width(reg) > size * BITS_PER_BYTE)
+					state->regs[dst_regno].id = 0;
 			} else {
 				int spill_cnt = 0, zero_cnt = 0;
 
@@ -6061,10 +6068,10 @@ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size)
 	 * values are also truncated so we push 64-bit bounds into
 	 * 32-bit bounds. Above were truncated < 32-bits already.
 	 */
-	if (size < 4) {
+	if (size < 4)
 		__mark_reg32_unbounded(reg);
-		reg_bounds_sync(reg);
-	}
+
+	reg_bounds_sync(reg);
 }
 
 static void set_sext64_default_val(struct bpf_reg_state *reg, int size)
diff --git a/tools/testing/selftests/bpf/progs/verifier_spill_fill.c b/tools/testing/selftests/bpf/progs/verifier_spill_fill.c
index f9803005e1c0..3e5d063ea7e8 100644
--- a/tools/testing/selftests/bpf/progs/verifier_spill_fill.c
+++ b/tools/testing/selftests/bpf/progs/verifier_spill_fill.c
@@ -217,7 +217,7 @@ __naked void uninit_u32_from_the_stack(void)
 
 SEC("tc")
 __description("Spill a u32 const scalar.  Refill as u16.  Offset to skb->data")
-__failure __msg("invalid access to packet")
+__success __retval(0)
 __naked void u16_offset_to_skb_data(void)
 {
 	asm volatile ("					\
@@ -225,13 +225,19 @@ __naked void u16_offset_to_skb_data(void)
 	r3 = *(u32*)(r1 + %[__sk_buff_data_end]);	\
 	w4 = 20;					\
 	*(u32*)(r10 - 8) = r4;				\
-	r4 = *(u16*)(r10 - 8);				\
+	"
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+	"r4 = *(u16*)(r10 - 8);"
+#else
+	"r4 = *(u16*)(r10 - 6);"
+#endif
+	"						\
 	r0 = r2;					\
-	/* r0 += r4 R0=pkt R2=pkt R3=pkt_end R4=umax=65535 */\
+	/* r0 += r4 R0=pkt R2=pkt R3=pkt_end R4=20 */\
 	r0 += r4;					\
-	/* if (r0 > r3) R0=pkt,umax=65535 R2=pkt R3=pkt_end R4=umax=65535 */\
+	/* if (r0 > r3) R0=pkt,off=20 R2=pkt R3=pkt_end R4=20 */\
 	if r0 > r3 goto l0_%=;				\
-	/* r0 = *(u32 *)r2 R0=pkt,umax=65535 R2=pkt R3=pkt_end R4=20 */\
+	/* r0 = *(u32 *)r2 R0=pkt,off=20 R2=pkt R3=pkt_end R4=20 */\
 	r0 = *(u32*)(r2 + 0);				\
 l0_%=:	r0 = 0;						\
 	exit;						\
@@ -268,7 +274,7 @@ l0_%=:	r0 = 0;						\
 }
 
 SEC("tc")
-__description("Spill a u32 const scalar.  Refill as u16 from fp-6.  Offset to skb->data")
+__description("Spill a u32 const scalar.  Refill as u16 from MSB.  Offset to skb->data")
 __failure __msg("invalid access to packet")
 __naked void _6_offset_to_skb_data(void)
 {
@@ -277,7 +283,13 @@ __naked void _6_offset_to_skb_data(void)
 	r3 = *(u32*)(r1 + %[__sk_buff_data_end]);	\
 	w4 = 20;					\
 	*(u32*)(r10 - 8) = r4;				\
-	r4 = *(u16*)(r10 - 6);				\
+	"
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+	"r4 = *(u16*)(r10 - 6);"
+#else
+	"r4 = *(u16*)(r10 - 8);"
+#endif
+	"						\
 	r0 = r2;					\
 	/* r0 += r4 R0=pkt R2=pkt R3=pkt_end R4=umax=65535 */\
 	r0 += r4;					\
-- 
cgit v1.2.3


From bd8a8d5ec5048ef74002d9f3db5cae971e68712c Mon Sep 17 00:00:00 2001
From: Yunjian Wang <wangyunjian@huawei.com>
Date: Fri, 2 Feb 2024 15:25:55 +0800
Subject: tun: Fix code style issues in <linux/if_tun.h>

This fixes the following code style problem:
- WARNING: please, no spaces at the start of a line
- CHECK: Please use a blank line after
         function/struct/union/enum declarations

Signed-off-by: Yunjian Wang <wangyunjian@huawei.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_tun.h | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h
index 2a7660843444..043d442994b0 100644
--- a/include/linux/if_tun.h
+++ b/include/linux/if_tun.h
@@ -27,44 +27,54 @@ struct tun_xdp_hdr {
 #if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE)
 struct socket *tun_get_socket(struct file *);
 struct ptr_ring *tun_get_tx_ring(struct file *file);
+
 static inline bool tun_is_xdp_frame(void *ptr)
 {
-       return (unsigned long)ptr & TUN_XDP_FLAG;
+	return (unsigned long)ptr & TUN_XDP_FLAG;
 }
+
 static inline void *tun_xdp_to_ptr(struct xdp_frame *xdp)
 {
-       return (void *)((unsigned long)xdp | TUN_XDP_FLAG);
+	return (void *)((unsigned long)xdp | TUN_XDP_FLAG);
 }
+
 static inline struct xdp_frame *tun_ptr_to_xdp(void *ptr)
 {
-       return (void *)((unsigned long)ptr & ~TUN_XDP_FLAG);
+	return (void *)((unsigned long)ptr & ~TUN_XDP_FLAG);
 }
+
 void tun_ptr_free(void *ptr);
 #else
 #include <linux/err.h>
 #include <linux/errno.h>
 struct file;
 struct socket;
+
 static inline struct socket *tun_get_socket(struct file *f)
 {
 	return ERR_PTR(-EINVAL);
 }
+
 static inline struct ptr_ring *tun_get_tx_ring(struct file *f)
 {
 	return ERR_PTR(-EINVAL);
 }
+
 static inline bool tun_is_xdp_frame(void *ptr)
 {
 	return false;
 }
+
 static inline void *tun_xdp_to_ptr(struct xdp_frame *xdp)
 {
 	return NULL;
 }
+
 static inline struct xdp_frame *tun_ptr_to_xdp(void *ptr)
 {
 	return NULL;
 }
+
 static inline void tun_ptr_free(void *ptr)
 {
 }
-- 
cgit v1.2.3


From ffabe98cb576097b77d404d39e8b3df03caa986a Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 2 Feb 2024 10:11:06 +0000
Subject: net: make dev_unreg_count global

We can use a global dev_unreg_count counter instead
of a per netns one.

As a bonus we can factorize the changes done on it
for bulk device removals.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rtnetlink.h   |  1 +
 include/net/net_namespace.h |  2 --
 net/core/dev.c              | 12 +++++++++---
 net/core/rtnetlink.c        | 11 +----------
 4 files changed, 11 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 410529fca18b..21780608cf47 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -47,6 +47,7 @@ extern int rtnl_lock_killable(void);
 extern bool refcount_dec_and_rtnl_lock(refcount_t *r);
 
 extern wait_queue_head_t netdev_unregistering_wq;
+extern atomic_t dev_unreg_count;
 extern struct rw_semaphore pernet_ops_rwsem;
 extern struct rw_semaphore net_rwsem;
 
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 13b3a4e29fdb..cd0c2eedbb5e 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -67,8 +67,6 @@ struct net {
 						 */
 	spinlock_t		rules_mod_lock;
 
-	atomic_t		dev_unreg_count;
-
 	unsigned int		dev_base_seq;	/* protected by rtnl_mutex */
 	u32			ifindex;
 
diff --git a/net/core/dev.c b/net/core/dev.c
index b53b9c94de40..27ba057d06c4 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -9698,11 +9698,11 @@ static void dev_index_release(struct net *net, int ifindex)
 /* Delayed registration/unregisteration */
 LIST_HEAD(net_todo_list);
 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
+atomic_t dev_unreg_count = ATOMIC_INIT(0);
 
 static void net_set_todo(struct net_device *dev)
 {
 	list_add_tail(&dev->todo_list, &net_todo_list);
-	atomic_inc(&dev_net(dev)->dev_unreg_count);
 }
 
 static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
@@ -10529,6 +10529,7 @@ void netdev_run_todo(void)
 {
 	struct net_device *dev, *tmp;
 	struct list_head list;
+	int cnt;
 #ifdef CONFIG_LOCKDEP
 	struct list_head unlink_list;
 
@@ -10565,6 +10566,7 @@ void netdev_run_todo(void)
 		linkwatch_sync_dev(dev);
 	}
 
+	cnt = 0;
 	while (!list_empty(&list)) {
 		dev = netdev_wait_allrefs_any(&list);
 		list_del(&dev->todo_list);
@@ -10582,12 +10584,13 @@ void netdev_run_todo(void)
 		if (dev->needs_free_netdev)
 			free_netdev(dev);
 
-		if (atomic_dec_and_test(&dev_net(dev)->dev_unreg_count))
-			wake_up(&netdev_unregistering_wq);
+		cnt++;
 
 		/* Free network device */
 		kobject_put(&dev->dev.kobj);
 	}
+	if (cnt && atomic_sub_and_test(cnt, &dev_unreg_count))
+		wake_up(&netdev_unregistering_wq);
 }
 
 /* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
@@ -11034,6 +11037,7 @@ void unregister_netdevice_many_notify(struct list_head *head,
 {
 	struct net_device *dev, *tmp;
 	LIST_HEAD(close_head);
+	int cnt = 0;
 
 	BUG_ON(dev_boot_phase);
 	ASSERT_RTNL();
@@ -11130,7 +11134,9 @@ void unregister_netdevice_many_notify(struct list_head *head,
 	list_for_each_entry(dev, head, unreg_list) {
 		netdev_put(dev, &dev->dev_registered_tracker);
 		net_set_todo(dev);
+		cnt++;
 	}
+	atomic_add(cnt, &dev_unreg_count);
 
 	list_del(head);
 }
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index f6f29eb03ec2..31f433950c8d 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -483,24 +483,15 @@ EXPORT_SYMBOL_GPL(__rtnl_link_unregister);
  */
 static void rtnl_lock_unregistering_all(void)
 {
-	struct net *net;
-	bool unregistering;
 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
 
 	add_wait_queue(&netdev_unregistering_wq, &wait);
 	for (;;) {
-		unregistering = false;
 		rtnl_lock();
 		/* We held write locked pernet_ops_rwsem, and parallel
 		 * setup_net() and cleanup_net() are not possible.
 		 */
-		for_each_net(net) {
-			if (atomic_read(&net->dev_unreg_count) > 0) {
-				unregistering = true;
-				break;
-			}
-		}
-		if (!unregistering)
+		if (!atomic_read(&dev_unreg_count))
 			break;
 		__rtnl_unlock();
 
-- 
cgit v1.2.3


From 89304f91bf8efe832557b00f034493420f16dbdb Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 2 Feb 2024 10:14:03 +0000
Subject: sctp: preserve const qualifier in sctp_sk()

We can change sctp_sk() to propagate its argument const qualifier,
thanks to container_of_const().

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Cc: Xin Long <lucien.xin@gmail.com>
Acked-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/structs.h | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 5a24d6d8522a..f24a1bbcb3ef 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -242,10 +242,7 @@ struct sctp_sock {
 	int do_auto_asconf;
 };
 
-static inline struct sctp_sock *sctp_sk(const struct sock *sk)
-{
-       return (struct sctp_sock *)sk;
-}
+#define sctp_sk(ptr) container_of_const(ptr, struct sctp_sock, inet.sk)
 
 static inline struct sock *sctp_opt2sk(const struct sctp_sock *sp)
 {
-- 
cgit v1.2.3


From 2719a9e7156c4b3983b43db467c1ff96801bda99 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Wed, 31 Jan 2024 23:37:25 +0100
Subject: wifi: cw1200: Convert to GPIO descriptors

The CW1200 uses two GPIOs to control the powerup and reset
pins, get these from GPIO descriptors instead of being passed
as platform data from boardfiles.

The RESET line will need to be marked as active low as we will
let gpiolib handle the polarity inversion.

The SDIO case is a bit special since the "card" need to be
powered up before it gets detected on the SDIO bus and
properly probed. Fix this by using board-specific GPIOs
assigned to device "NULL".

There are currently no in-tree users.

Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Kalle Valo <kvalo@kernel.org>
Link: https://msgid.link/20240131-descriptors-wireless-v1-6-e1c7c5d68746@linaro.org
---
 drivers/net/wireless/st/cw1200/cw1200_sdio.c | 42 +++++++++-------
 drivers/net/wireless/st/cw1200/cw1200_spi.c  | 71 ++++++++++++++++------------
 include/linux/platform_data/net-cw1200.h     |  4 --
 3 files changed, 65 insertions(+), 52 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/st/cw1200/cw1200_sdio.c b/drivers/net/wireless/st/cw1200/cw1200_sdio.c
index 4c30b5772ce0..00c4731d8f8e 100644
--- a/drivers/net/wireless/st/cw1200/cw1200_sdio.c
+++ b/drivers/net/wireless/st/cw1200/cw1200_sdio.c
@@ -8,7 +8,7 @@
 
 #include <linux/module.h>
 #include <linux/interrupt.h>
-#include <linux/gpio.h>
+#include <linux/gpio/consumer.h>
 #include <linux/delay.h>
 #include <linux/mmc/host.h>
 #include <linux/mmc/sdio_func.h>
@@ -178,12 +178,15 @@ static int cw1200_sdio_irq_unsubscribe(struct hwbus_priv *self)
 	return ret;
 }
 
+/* Like the rest of the driver, this only supports one device per system */
+static struct gpio_desc *cw1200_reset;
+static struct gpio_desc *cw1200_powerup;
+
 static int cw1200_sdio_off(const struct cw1200_platform_data_sdio *pdata)
 {
-	if (pdata->reset) {
-		gpio_set_value(pdata->reset, 0);
+	if (cw1200_reset) {
+		gpiod_set_value(cw1200_reset, 0);
 		msleep(30); /* Min is 2 * CLK32K cycles */
-		gpio_free(pdata->reset);
 	}
 
 	if (pdata->power_ctrl)
@@ -196,16 +199,21 @@ static int cw1200_sdio_off(const struct cw1200_platform_data_sdio *pdata)
 
 static int cw1200_sdio_on(const struct cw1200_platform_data_sdio *pdata)
 {
-	/* Ensure I/Os are pulled low */
-	if (pdata->reset) {
-		gpio_request(pdata->reset, "cw1200_wlan_reset");
-		gpio_direction_output(pdata->reset, 0);
+	/* Ensure I/Os are pulled low (reset is active low) */
+	cw1200_reset = devm_gpiod_get_optional(NULL, "reset", GPIOD_OUT_HIGH);
+	if (IS_ERR(cw1200_reset)) {
+		pr_err("could not get CW1200 SDIO reset GPIO\n");
+		return PTR_ERR(cw1200_reset);
 	}
-	if (pdata->powerup) {
-		gpio_request(pdata->powerup, "cw1200_wlan_powerup");
-		gpio_direction_output(pdata->powerup, 0);
+	gpiod_set_consumer_name(cw1200_reset, "cw1200_wlan_reset");
+	cw1200_powerup = devm_gpiod_get_optional(NULL, "powerup", GPIOD_OUT_LOW);
+	if (IS_ERR(cw1200_powerup)) {
+		pr_err("could not get CW1200 SDIO powerup GPIO\n");
+		return PTR_ERR(cw1200_powerup);
 	}
-	if (pdata->reset || pdata->powerup)
+	gpiod_set_consumer_name(cw1200_powerup, "cw1200_wlan_powerup");
+
+	if (cw1200_reset || cw1200_powerup)
 		msleep(10); /* Settle time? */
 
 	/* Enable 3v3 and 1v8 to hardware */
@@ -226,13 +234,13 @@ static int cw1200_sdio_on(const struct cw1200_platform_data_sdio *pdata)
 	}
 
 	/* Enable POWERUP signal */
-	if (pdata->powerup) {
-		gpio_set_value(pdata->powerup, 1);
+	if (cw1200_powerup) {
+		gpiod_set_value(cw1200_powerup, 1);
 		msleep(250); /* or more..? */
 	}
-	/* Enable RSTn signal */
-	if (pdata->reset) {
-		gpio_set_value(pdata->reset, 1);
+	/* Deassert RSTn signal, note active low */
+	if (cw1200_reset) {
+		gpiod_set_value(cw1200_reset, 0);
 		msleep(50); /* Or more..? */
 	}
 	return 0;
diff --git a/drivers/net/wireless/st/cw1200/cw1200_spi.c b/drivers/net/wireless/st/cw1200/cw1200_spi.c
index b27b57fc25bc..fb3aafcafe18 100644
--- a/drivers/net/wireless/st/cw1200/cw1200_spi.c
+++ b/drivers/net/wireless/st/cw1200/cw1200_spi.c
@@ -11,7 +11,7 @@
  */
 
 #include <linux/module.h>
-#include <linux/gpio.h>
+#include <linux/gpio/consumer.h>
 #include <linux/delay.h>
 #include <linux/spinlock.h>
 #include <linux/interrupt.h>
@@ -38,6 +38,8 @@ struct hwbus_priv {
 	const struct cw1200_platform_data_spi *pdata;
 	spinlock_t		lock; /* Serialize all bus operations */
 	wait_queue_head_t       wq;
+	struct gpio_desc	*reset;
+	struct gpio_desc	*powerup;
 	int claimed;
 };
 
@@ -269,12 +271,12 @@ static void cw1200_spi_irq_unsubscribe(struct hwbus_priv *self)
 	free_irq(self->func->irq, self);
 }
 
-static int cw1200_spi_off(const struct cw1200_platform_data_spi *pdata)
+static int cw1200_spi_off(struct hwbus_priv *self, const struct cw1200_platform_data_spi *pdata)
 {
-	if (pdata->reset) {
-		gpio_set_value(pdata->reset, 0);
+	if (self->reset) {
+		/* Assert RESET, note active low */
+		gpiod_set_value(self->reset, 1);
 		msleep(30); /* Min is 2 * CLK32K cycles */
-		gpio_free(pdata->reset);
 	}
 
 	if (pdata->power_ctrl)
@@ -285,18 +287,12 @@ static int cw1200_spi_off(const struct cw1200_platform_data_spi *pdata)
 	return 0;
 }
 
-static int cw1200_spi_on(const struct cw1200_platform_data_spi *pdata)
+static int cw1200_spi_on(struct hwbus_priv *self, const struct cw1200_platform_data_spi *pdata)
 {
 	/* Ensure I/Os are pulled low */
-	if (pdata->reset) {
-		gpio_request(pdata->reset, "cw1200_wlan_reset");
-		gpio_direction_output(pdata->reset, 0);
-	}
-	if (pdata->powerup) {
-		gpio_request(pdata->powerup, "cw1200_wlan_powerup");
-		gpio_direction_output(pdata->powerup, 0);
-	}
-	if (pdata->reset || pdata->powerup)
+	gpiod_direction_output(self->reset, 1); /* Active low */
+	gpiod_direction_output(self->powerup, 0);
+	if (self->reset || self->powerup)
 		msleep(10); /* Settle time? */
 
 	/* Enable 3v3 and 1v8 to hardware */
@@ -317,13 +313,13 @@ static int cw1200_spi_on(const struct cw1200_platform_data_spi *pdata)
 	}
 
 	/* Enable POWERUP signal */
-	if (pdata->powerup) {
-		gpio_set_value(pdata->powerup, 1);
+	if (self->powerup) {
+		gpiod_set_value(self->powerup, 1);
 		msleep(250); /* or more..? */
 	}
-	/* Enable RSTn signal */
-	if (pdata->reset) {
-		gpio_set_value(pdata->reset, 1);
+	/* Assert RSTn signal, note active low */
+	if (self->reset) {
+		gpiod_set_value(self->reset, 0);
 		msleep(50); /* Or more..? */
 	}
 	return 0;
@@ -375,20 +371,33 @@ static int cw1200_spi_probe(struct spi_device *func)
 		spi_get_chipselect(func, 0), func->mode, func->bits_per_word,
 		func->max_speed_hz);
 
-	if (cw1200_spi_on(plat_data)) {
+	self = devm_kzalloc(&func->dev, sizeof(*self), GFP_KERNEL);
+	if (!self) {
+		pr_err("Can't allocate SPI hwbus_priv.");
+		return -ENOMEM;
+	}
+
+	/* Request reset asserted */
+	self->reset = devm_gpiod_get_optional(&func->dev, "reset", GPIOD_OUT_HIGH);
+	if (IS_ERR(self->reset))
+		return dev_err_probe(&func->dev, PTR_ERR(self->reset),
+				     "could not get reset GPIO\n");
+	gpiod_set_consumer_name(self->reset, "cw1200_wlan_reset");
+
+	self->powerup = devm_gpiod_get_optional(&func->dev, "powerup", GPIOD_OUT_LOW);
+	if (IS_ERR(self->powerup))
+		return dev_err_probe(&func->dev, PTR_ERR(self->powerup),
+				     "could not get powerup GPIO\n");
+	gpiod_set_consumer_name(self->reset, "cw1200_wlan_powerup");
+
+	if (cw1200_spi_on(self, plat_data)) {
 		pr_err("spi_on() failed!\n");
-		return -1;
+		return -ENODEV;
 	}
 
 	if (spi_setup(func)) {
 		pr_err("spi_setup() failed!\n");
-		return -1;
-	}
-
-	self = devm_kzalloc(&func->dev, sizeof(*self), GFP_KERNEL);
-	if (!self) {
-		pr_err("Can't allocate SPI hwbus_priv.");
-		return -ENOMEM;
+		return -ENODEV;
 	}
 
 	self->pdata = plat_data;
@@ -410,7 +419,7 @@ static int cw1200_spi_probe(struct spi_device *func)
 
 	if (status) {
 		cw1200_spi_irq_unsubscribe(self);
-		cw1200_spi_off(plat_data);
+		cw1200_spi_off(self, plat_data);
 	}
 
 	return status;
@@ -428,7 +437,7 @@ static void cw1200_spi_disconnect(struct spi_device *func)
 			self->core = NULL;
 		}
 	}
-	cw1200_spi_off(dev_get_platdata(&func->dev));
+	cw1200_spi_off(self, dev_get_platdata(&func->dev));
 }
 
 static int __maybe_unused cw1200_spi_suspend(struct device *dev)
diff --git a/include/linux/platform_data/net-cw1200.h b/include/linux/platform_data/net-cw1200.h
index c510734405bb..89d0ec6f7d46 100644
--- a/include/linux/platform_data/net-cw1200.h
+++ b/include/linux/platform_data/net-cw1200.h
@@ -14,8 +14,6 @@ struct cw1200_platform_data_spi {
 
 	/* All others are optional */
 	bool have_5ghz;
-	int reset;                     /* GPIO to RSTn signal (0 disables) */
-	int powerup;                   /* GPIO to POWERUP signal (0 disables) */
 	int (*power_ctrl)(const struct cw1200_platform_data_spi *pdata,
 			  bool enable); /* Control 3v3 / 1v8 supply */
 	int (*clk_ctrl)(const struct cw1200_platform_data_spi *pdata,
@@ -30,8 +28,6 @@ struct cw1200_platform_data_sdio {
 	/* All others are optional */
 	bool have_5ghz;
 	bool no_nptb;       /* SDIO hardware does not support non-power-of-2-blocksizes */
-	int reset;          /* GPIO to RSTn signal (0 disables) */
-	int powerup;        /* GPIO to POWERUP signal (0 disables) */
 	int irq;            /* IRQ line or 0 to use SDIO IRQ */
 	int (*power_ctrl)(const struct cw1200_platform_data_sdio *pdata,
 			  bool enable); /* Control 3v3 / 1v8 supply */
-- 
cgit v1.2.3


From fd2bc4195d5107f88c1b90e1ec935888ccbfc5c0 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@nvidia.com>
Date: Tue, 3 Oct 2023 20:57:20 +0300
Subject: xfrm: generalize xdo_dev_state_update_curlft to allow statistics
 update

In order to allow drivers to fill all statistics, change the name
of xdo_dev_state_update_curlft to be xdo_dev_state_update_stats.

Acked-by: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 Documentation/networking/xfrm_device.rst                 |  4 ++--
 drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c |  7 ++++---
 include/linux/netdevice.h                                |  2 +-
 include/net/xfrm.h                                       | 11 ++++-------
 net/xfrm/xfrm_state.c                                    |  4 ++--
 net/xfrm/xfrm_user.c                                     |  2 +-
 6 files changed, 14 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/xfrm_device.rst b/Documentation/networking/xfrm_device.rst
index 535077cbeb07..bfea9d8579ed 100644
--- a/Documentation/networking/xfrm_device.rst
+++ b/Documentation/networking/xfrm_device.rst
@@ -71,9 +71,9 @@ Callbacks to implement
 	bool	(*xdo_dev_offload_ok) (struct sk_buff *skb,
 				       struct xfrm_state *x);
 	void    (*xdo_dev_state_advance_esn) (struct xfrm_state *x);
+	void    (*xdo_dev_state_update_stats) (struct xfrm_state *x);
 
         /* Solely packet offload callbacks */
-	void    (*xdo_dev_state_update_curlft) (struct xfrm_state *x);
 	int	(*xdo_dev_policy_add) (struct xfrm_policy *x, struct netlink_ext_ack *extack);
 	void	(*xdo_dev_policy_delete) (struct xfrm_policy *x);
 	void	(*xdo_dev_policy_free) (struct xfrm_policy *x);
@@ -191,6 +191,6 @@ xdo_dev_policy_free() on any remaining offloaded states.
 
 Outcome of HW handling packets, the XFRM core can't count hard, soft limits.
 The HW/driver are responsible to perform it and provide accurate data when
-xdo_dev_state_update_curlft() is called. In case of one of these limits
+xdo_dev_state_update_stats() is called. In case of one of these limits
 occuried, the driver needs to call to xfrm_state_check_expire() to make sure
 that XFRM performs rekeying sequence.
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c
index 05612d9c6080..f160522fbe75 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c
@@ -984,7 +984,7 @@ static void mlx5e_xfrm_advance_esn_state(struct xfrm_state *x)
 	queue_work(sa_entry->ipsec->wq, &work->work);
 }
 
-static void mlx5e_xfrm_update_curlft(struct xfrm_state *x)
+static void mlx5e_xfrm_update_stats(struct xfrm_state *x)
 {
 	struct mlx5e_ipsec_sa_entry *sa_entry = to_ipsec_sa_entry(x);
 	struct mlx5e_ipsec_rule *ipsec_rule = &sa_entry->ipsec_rule;
@@ -993,7 +993,8 @@ static void mlx5e_xfrm_update_curlft(struct xfrm_state *x)
 	lockdep_assert(lockdep_is_held(&x->lock) ||
 		       lockdep_is_held(&dev_net(x->xso.real_dev)->xfrm.xfrm_cfg_mutex));
 
-	if (x->xso.flags & XFRM_DEV_OFFLOAD_FLAG_ACQ)
+	if (x->xso.flags & XFRM_DEV_OFFLOAD_FLAG_ACQ ||
+	    x->xso.type != XFRM_DEV_OFFLOAD_PACKET)
 		return;
 
 	mlx5_fc_query_cached(ipsec_rule->fc, &bytes, &packets, &lastuse);
@@ -1156,7 +1157,7 @@ static const struct xfrmdev_ops mlx5e_ipsec_xfrmdev_ops = {
 	.xdo_dev_offload_ok	= mlx5e_ipsec_offload_ok,
 	.xdo_dev_state_advance_esn = mlx5e_xfrm_advance_esn_state,
 
-	.xdo_dev_state_update_curlft = mlx5e_xfrm_update_curlft,
+	.xdo_dev_state_update_stats = mlx5e_xfrm_update_stats,
 	.xdo_dev_policy_add = mlx5e_xfrm_add_policy,
 	.xdo_dev_policy_delete = mlx5e_xfrm_del_policy,
 	.xdo_dev_policy_free = mlx5e_xfrm_free_policy,
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 118c40258d07..9538576dbebc 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1062,7 +1062,7 @@ struct xfrmdev_ops {
 	bool	(*xdo_dev_offload_ok) (struct sk_buff *skb,
 				       struct xfrm_state *x);
 	void	(*xdo_dev_state_advance_esn) (struct xfrm_state *x);
-	void	(*xdo_dev_state_update_curlft) (struct xfrm_state *x);
+	void	(*xdo_dev_state_update_stats) (struct xfrm_state *x);
 	int	(*xdo_dev_policy_add) (struct xfrm_policy *x, struct netlink_ext_ack *extack);
 	void	(*xdo_dev_policy_delete) (struct xfrm_policy *x);
 	void	(*xdo_dev_policy_free) (struct xfrm_policy *x);
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 1d107241b901..4ca2f3205190 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -1578,21 +1578,18 @@ struct xfrm_state *xfrm_state_lookup_byspi(struct net *net, __be32 spi,
 					      unsigned short family);
 int xfrm_state_check_expire(struct xfrm_state *x);
 #ifdef CONFIG_XFRM_OFFLOAD
-static inline void xfrm_dev_state_update_curlft(struct xfrm_state *x)
+static inline void xfrm_dev_state_update_stats(struct xfrm_state *x)
 {
 	struct xfrm_dev_offload *xdo = &x->xso;
 	struct net_device *dev = xdo->dev;
 
-	if (x->xso.type != XFRM_DEV_OFFLOAD_PACKET)
-		return;
-
 	if (dev && dev->xfrmdev_ops &&
-	    dev->xfrmdev_ops->xdo_dev_state_update_curlft)
-		dev->xfrmdev_ops->xdo_dev_state_update_curlft(x);
+	    dev->xfrmdev_ops->xdo_dev_state_update_stats)
+		dev->xfrmdev_ops->xdo_dev_state_update_stats(x);
 
 }
 #else
-static inline void xfrm_dev_state_update_curlft(struct xfrm_state *x) {}
+static inline void xfrm_dev_state_update_stats(struct xfrm_state *x) {}
 #endif
 void xfrm_state_insert(struct xfrm_state *x);
 int xfrm_state_add(struct xfrm_state *x);
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index bda5327bf34d..d8701b2d0d57 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -570,7 +570,7 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me)
 	int err = 0;
 
 	spin_lock(&x->lock);
-	xfrm_dev_state_update_curlft(x);
+	xfrm_dev_state_update_stats(x);
 
 	if (x->km.state == XFRM_STATE_DEAD)
 		goto out;
@@ -1935,7 +1935,7 @@ EXPORT_SYMBOL(xfrm_state_update);
 
 int xfrm_state_check_expire(struct xfrm_state *x)
 {
-	xfrm_dev_state_update_curlft(x);
+	xfrm_dev_state_update_stats(x);
 
 	if (!READ_ONCE(x->curlft.use_time))
 		WRITE_ONCE(x->curlft.use_time, ktime_get_real_seconds());
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index ad01997c3aa9..dc4f9b8d7cb0 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -902,7 +902,7 @@ static void copy_to_user_state(struct xfrm_state *x, struct xfrm_usersa_info *p)
 	memcpy(&p->sel, &x->sel, sizeof(p->sel));
 	memcpy(&p->lft, &x->lft, sizeof(p->lft));
 	if (x->xso.dev)
-		xfrm_dev_state_update_curlft(x);
+		xfrm_dev_state_update_stats(x);
 	memcpy(&p->curlft, &x->curlft, sizeof(p->curlft));
 	put_unaligned(x->stats.replay_window, &p->stats.replay_window);
 	put_unaligned(x->stats.replay, &p->stats.replay);
-- 
cgit v1.2.3


From f9f221c98fd83df518fbb2f5ad33980cfedfe1bf Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@nvidia.com>
Date: Wed, 4 Oct 2023 14:11:48 +0300
Subject: xfrm: get global statistics from the offloaded device

Iterate over all SAs in order to fill global IPsec statistics.

Acked-by: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c |  3 ++-
 include/net/xfrm.h                                       |  3 +++
 net/xfrm/xfrm_proc.c                                     |  1 +
 net/xfrm/xfrm_state.c                                    | 13 +++++++++++++
 4 files changed, 19 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c
index f160522fbe75..46cce4e38d84 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c
@@ -991,7 +991,8 @@ static void mlx5e_xfrm_update_stats(struct xfrm_state *x)
 	u64 packets, bytes, lastuse;
 
 	lockdep_assert(lockdep_is_held(&x->lock) ||
-		       lockdep_is_held(&dev_net(x->xso.real_dev)->xfrm.xfrm_cfg_mutex));
+		       lockdep_is_held(&dev_net(x->xso.real_dev)->xfrm.xfrm_cfg_mutex) ||
+		       lockdep_is_held(&dev_net(x->xso.real_dev)->xfrm.xfrm_state_lock));
 
 	if (x->xso.flags & XFRM_DEV_OFFLOAD_FLAG_ACQ ||
 	    x->xso.type != XFRM_DEV_OFFLOAD_PACKET)
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 4ca2f3205190..57c743b7e4fe 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -51,8 +51,10 @@
 
 #ifdef CONFIG_XFRM_STATISTICS
 #define XFRM_INC_STATS(net, field)	SNMP_INC_STATS((net)->mib.xfrm_statistics, field)
+#define XFRM_ADD_STATS(net, field, val) SNMP_ADD_STATS((net)->mib.xfrm_statistics, field, val)
 #else
 #define XFRM_INC_STATS(net, field)	((void)(net))
+#define XFRM_ADD_STATS(net, field, val) ((void)(net))
 #endif
 
 
@@ -1577,6 +1579,7 @@ struct xfrm_state *xfrm_stateonly_find(struct net *net, u32 mark, u32 if_id,
 struct xfrm_state *xfrm_state_lookup_byspi(struct net *net, __be32 spi,
 					      unsigned short family);
 int xfrm_state_check_expire(struct xfrm_state *x);
+void xfrm_state_update_stats(struct net *net);
 #ifdef CONFIG_XFRM_OFFLOAD
 static inline void xfrm_dev_state_update_stats(struct xfrm_state *x)
 {
diff --git a/net/xfrm/xfrm_proc.c b/net/xfrm/xfrm_proc.c
index fee9b5cf37a7..5f9bf8e5c933 100644
--- a/net/xfrm/xfrm_proc.c
+++ b/net/xfrm/xfrm_proc.c
@@ -52,6 +52,7 @@ static int xfrm_statistics_seq_show(struct seq_file *seq, void *v)
 
 	memset(buff, 0, sizeof(unsigned long) * LINUX_MIB_XFRMMAX);
 
+	xfrm_state_update_stats(net);
 	snmp_get_cpu_field_batch(buff, xfrm_mib_list,
 				 net->mib.xfrm_statistics);
 	for (i = 0; xfrm_mib_list[i].name; i++)
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index d8701b2d0d57..0c306473a79d 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -1957,6 +1957,19 @@ int xfrm_state_check_expire(struct xfrm_state *x)
 }
 EXPORT_SYMBOL(xfrm_state_check_expire);
 
+void xfrm_state_update_stats(struct net *net)
+{
+	struct xfrm_state *x;
+	int i;
+
+	spin_lock_bh(&net->xfrm.xfrm_state_lock);
+	for (i = 0; i <= net->xfrm.state_hmask; i++) {
+		hlist_for_each_entry(x, net->xfrm.state_bydst + i, bydst)
+			xfrm_dev_state_update_stats(x);
+	}
+	spin_unlock_bh(&net->xfrm.xfrm_state_lock);
+}
+
 struct xfrm_state *
 xfrm_state_lookup(struct net *net, u32 mark, const xfrm_address_t *daddr, __be32 spi,
 		  u8 proto, unsigned short family)
-- 
cgit v1.2.3


From 91a72ada66053b4dba95cf1a60a5a23fdbd6faf7 Mon Sep 17 00:00:00 2001
From: Gal Pressman <gal@nvidia.com>
Date: Tue, 26 Dec 2023 10:22:08 +0200
Subject: net/mlx5: Remove initial segmentation duplicate definitions

Device definitions belong in mlx5_ifc, remove the duplicates in
mlx5_core.h.

Signed-off-by: Gal Pressman <gal@nvidia.com>
Reviewed-by: Jianbo Liu <jianbol@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/fw.c        |  6 +++---
 drivers/net/ethernet/mellanox/mlx5/core/health.c    | 20 ++++++++++----------
 drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h |  7 -------
 include/linux/mlx5/mlx5_ifc.h                       |  1 +
 4 files changed, 14 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c
index 58f4c0d0fafa..e7faf7e73ca4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c
@@ -366,18 +366,18 @@ int mlx5_cmd_fast_teardown_hca(struct mlx5_core_dev *dev)
 		return -EIO;
 	}
 
-	mlx5_set_nic_state(dev, MLX5_NIC_IFC_DISABLED);
+	mlx5_set_nic_state(dev, MLX5_INITIAL_SEG_NIC_INTERFACE_DISABLED);
 
 	/* Loop until device state turns to disable */
 	end = jiffies + msecs_to_jiffies(delay_ms);
 	do {
-		if (mlx5_get_nic_state(dev) == MLX5_NIC_IFC_DISABLED)
+		if (mlx5_get_nic_state(dev) == MLX5_INITIAL_SEG_NIC_INTERFACE_DISABLED)
 			break;
 
 		cond_resched();
 	} while (!time_after(jiffies, end));
 
-	if (mlx5_get_nic_state(dev) != MLX5_NIC_IFC_DISABLED) {
+	if (mlx5_get_nic_state(dev) != MLX5_INITIAL_SEG_NIC_INTERFACE_DISABLED) {
 		dev_err(&dev->pdev->dev, "NIC IFC still %d after %lums.\n",
 			mlx5_get_nic_state(dev), delay_ms);
 		return -EIO;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
index 5c2ac2d9dbd9..9463ede84d8d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -116,9 +116,9 @@ u32 mlx5_health_check_fatal_sensors(struct mlx5_core_dev *dev)
 		return MLX5_SENSOR_PCI_COMM_ERR;
 	if (pci_channel_offline(dev->pdev))
 		return MLX5_SENSOR_PCI_ERR;
-	if (mlx5_get_nic_state(dev) == MLX5_NIC_IFC_DISABLED)
+	if (mlx5_get_nic_state(dev) == MLX5_INITIAL_SEG_NIC_INTERFACE_DISABLED)
 		return MLX5_SENSOR_NIC_DISABLED;
-	if (mlx5_get_nic_state(dev) == MLX5_NIC_IFC_SW_RESET)
+	if (mlx5_get_nic_state(dev) == MLX5_INITIAL_SEG_NIC_INTERFACE_SW_RESET)
 		return MLX5_SENSOR_NIC_SW_RESET;
 	if (sensor_fw_synd_rfr(dev))
 		return MLX5_SENSOR_FW_SYND_RFR;
@@ -185,7 +185,7 @@ static bool reset_fw_if_needed(struct mlx5_core_dev *dev)
 	/* Write the NIC interface field to initiate the reset, the command
 	 * interface address also resides here, don't overwrite it.
 	 */
-	mlx5_set_nic_state(dev, MLX5_NIC_IFC_SW_RESET);
+	mlx5_set_nic_state(dev, MLX5_INITIAL_SEG_NIC_INTERFACE_SW_RESET);
 
 	return true;
 }
@@ -246,13 +246,13 @@ recover_from_sw_reset:
 	/* Recover from SW reset */
 	end = jiffies + msecs_to_jiffies(delay_ms);
 	do {
-		if (mlx5_get_nic_state(dev) == MLX5_NIC_IFC_DISABLED)
+		if (mlx5_get_nic_state(dev) == MLX5_INITIAL_SEG_NIC_INTERFACE_DISABLED)
 			break;
 
 		msleep(20);
 	} while (!time_after(jiffies, end));
 
-	if (mlx5_get_nic_state(dev) != MLX5_NIC_IFC_DISABLED) {
+	if (mlx5_get_nic_state(dev) != MLX5_INITIAL_SEG_NIC_INTERFACE_DISABLED) {
 		dev_err(&dev->pdev->dev, "NIC IFC still %d after %lums.\n",
 			mlx5_get_nic_state(dev), delay_ms);
 	}
@@ -272,26 +272,26 @@ static void mlx5_handle_bad_state(struct mlx5_core_dev *dev)
 	u8 nic_interface = mlx5_get_nic_state(dev);
 
 	switch (nic_interface) {
-	case MLX5_NIC_IFC_FULL:
+	case MLX5_INITIAL_SEG_NIC_INTERFACE_FULL_DRIVER:
 		mlx5_core_warn(dev, "Expected to see disabled NIC but it is full driver\n");
 		break;
 
-	case MLX5_NIC_IFC_DISABLED:
+	case MLX5_INITIAL_SEG_NIC_INTERFACE_DISABLED:
 		mlx5_core_warn(dev, "starting teardown\n");
 		break;
 
-	case MLX5_NIC_IFC_NO_DRAM_NIC:
+	case MLX5_INITIAL_SEG_NIC_INTERFACE_NO_DRAM_NIC:
 		mlx5_core_warn(dev, "Expected to see disabled NIC but it is no dram nic\n");
 		break;
 
-	case MLX5_NIC_IFC_SW_RESET:
+	case MLX5_INITIAL_SEG_NIC_INTERFACE_SW_RESET:
 		/* The IFC mode field is 3 bits, so it will read 0x7 in 2 cases:
 		 * 1. PCI has been disabled (ie. PCI-AER, PF driver unloaded
 		 *    and this is a VF), this is not recoverable by SW reset.
 		 *    Logging of this is handled elsewhere.
 		 * 2. FW reset has been issued by another function, driver can
 		 *    be reloaded to recover after the mode switches to
-		 *    MLX5_NIC_IFC_DISABLED.
+		 *    MLX5_INITIAL_SEG_NIC_INTERFACE_DISABLED.
 		 */
 		if (dev->priv.health.fatal_error != MLX5_SENSOR_PCI_COMM_ERR)
 			mlx5_core_warn(dev, "NIC SW reset in progress\n");
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
index a79b7959361b..58732f44940f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -312,13 +312,6 @@ static inline int mlx5_rescan_drivers(struct mlx5_core_dev *dev)
 	return ret;
 }
 
-enum {
-	MLX5_NIC_IFC_FULL		= 0,
-	MLX5_NIC_IFC_DISABLED		= 1,
-	MLX5_NIC_IFC_NO_DRAM_NIC	= 2,
-	MLX5_NIC_IFC_SW_RESET		= 7
-};
-
 u8 mlx5_get_nic_state(struct mlx5_core_dev *dev);
 void mlx5_set_nic_state(struct mlx5_core_dev *dev, u8 state);
 
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 6c44f107b8ba..7f5e846eb46d 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -10661,6 +10661,7 @@ enum {
 	MLX5_INITIAL_SEG_NIC_INTERFACE_FULL_DRIVER  = 0x0,
 	MLX5_INITIAL_SEG_NIC_INTERFACE_DISABLED     = 0x1,
 	MLX5_INITIAL_SEG_NIC_INTERFACE_NO_DRAM_NIC  = 0x2,
+	MLX5_INITIAL_SEG_NIC_INTERFACE_SW_RESET     = 0x7,
 };
 
 enum {
-- 
cgit v1.2.3


From 03ba6dc035c60991033529e630bd1552b2bca4d7 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 2 Feb 2024 17:37:46 +0100
Subject: net: dst: Make dst_destroy() static and return void.

Since commit 52df157f17e56 ("xfrm: take refcnt of dst when creating
struct xfrm_dst bundle") dst_destroy() returns only NULL and no caller
cares about the return value.
There are no in in-tree users of dst_destroy() outside of the file.

Make dst_destroy() static and return void.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20240202163746.2489150-1-bigeasy@linutronix.de
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/dst.h | 1 -
 net/core/dst.c    | 6 ++----
 2 files changed, 2 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/dst.h b/include/net/dst.h
index f5dfc8fb7b37..0aa331bd2fdb 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -390,7 +390,6 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
 void dst_init(struct dst_entry *dst, struct dst_ops *ops,
 	      struct net_device *dev, int initial_obsolete,
 	      unsigned short flags);
-struct dst_entry *dst_destroy(struct dst_entry *dst);
 void dst_dev_put(struct dst_entry *dst);
 
 static inline void dst_confirm(struct dst_entry *dst)
diff --git a/net/core/dst.c b/net/core/dst.c
index 6838d3212c37..95f533844f17 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -96,7 +96,7 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
 }
 EXPORT_SYMBOL(dst_alloc);
 
-struct dst_entry *dst_destroy(struct dst_entry * dst)
+static void dst_destroy(struct dst_entry *dst)
 {
 	struct dst_entry *child = NULL;
 
@@ -126,15 +126,13 @@ struct dst_entry *dst_destroy(struct dst_entry * dst)
 	dst = child;
 	if (dst)
 		dst_release_immediate(dst);
-	return NULL;
 }
-EXPORT_SYMBOL(dst_destroy);
 
 static void dst_destroy_rcu(struct rcu_head *head)
 {
 	struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head);
 
-	dst = dst_destroy(dst);
+	dst_destroy(dst);
 }
 
 /* Operations to mark dst as DEAD and clean up the net device referenced
-- 
cgit v1.2.3


From 0bd199fd9c19aa545f677fd0a99f2be101cb6309 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Fri, 2 Feb 2024 17:41:45 +0000
Subject: net: phy: constify phydev->drv

Device driver structures are shared between all devices that they
match, and thus nothing should never write to the device driver
structure through the phydev->drv pointer. Let's make this pointer
const to catch code that attempts to do so.

Suggested-by: Christian Marangi <ansuelsmth@gmail.com>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://lore.kernel.org/r/E1rVxXt-002YqY-9G@rmk-PC.armlinux.org.uk
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/phy/phy.c               | 3 +--
 drivers/net/phy/phy_device.c        | 6 +++---
 drivers/net/phy/xilinx_gmii2rgmii.c | 2 +-
 include/linux/phy.h                 | 2 +-
 4 files changed, 6 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 3b9531143be1..14224e06d69f 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -1290,7 +1290,6 @@ int phy_disable_interrupts(struct phy_device *phydev)
 static irqreturn_t phy_interrupt(int irq, void *phy_dat)
 {
 	struct phy_device *phydev = phy_dat;
-	struct phy_driver *drv = phydev->drv;
 	irqreturn_t ret;
 
 	/* Wakeup interrupts may occur during a system sleep transition.
@@ -1316,7 +1315,7 @@ static irqreturn_t phy_interrupt(int irq, void *phy_dat)
 	}
 
 	mutex_lock(&phydev->lock);
-	ret = drv->handle_interrupt(phydev);
+	ret = phydev->drv->handle_interrupt(phydev);
 	mutex_unlock(&phydev->lock);
 
 	return ret;
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 52828d1c64f7..2eed8f03621d 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -1413,7 +1413,7 @@ int phy_sfp_probe(struct phy_device *phydev,
 }
 EXPORT_SYMBOL(phy_sfp_probe);
 
-static bool phy_drv_supports_irq(struct phy_driver *phydrv)
+static bool phy_drv_supports_irq(const struct phy_driver *phydrv)
 {
 	return phydrv->config_intr && phydrv->handle_interrupt;
 }
@@ -1867,7 +1867,7 @@ int phy_suspend(struct phy_device *phydev)
 {
 	struct ethtool_wolinfo wol = { .cmd = ETHTOOL_GWOL };
 	struct net_device *netdev = phydev->attached_dev;
-	struct phy_driver *phydrv = phydev->drv;
+	const struct phy_driver *phydrv = phydev->drv;
 	int ret;
 
 	if (phydev->suspended)
@@ -1892,7 +1892,7 @@ EXPORT_SYMBOL(phy_suspend);
 
 int __phy_resume(struct phy_device *phydev)
 {
-	struct phy_driver *phydrv = phydev->drv;
+	const struct phy_driver *phydrv = phydev->drv;
 	int ret;
 
 	lockdep_assert_held(&phydev->lock);
diff --git a/drivers/net/phy/xilinx_gmii2rgmii.c b/drivers/net/phy/xilinx_gmii2rgmii.c
index 7fd9fe6a602b..7b1bc5fcef9b 100644
--- a/drivers/net/phy/xilinx_gmii2rgmii.c
+++ b/drivers/net/phy/xilinx_gmii2rgmii.c
@@ -22,7 +22,7 @@
 
 struct gmii2rgmii {
 	struct phy_device *phy_dev;
-	struct phy_driver *phy_drv;
+	const struct phy_driver *phy_drv;
 	struct phy_driver conv_phy_drv;
 	struct mdio_device *mdio;
 };
diff --git a/include/linux/phy.h b/include/linux/phy.h
index a66f07d3f5f4..ad93f8b1b128 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -638,7 +638,7 @@ struct phy_device {
 
 	/* Information about the PHY type */
 	/* And management functions */
-	struct phy_driver *drv;
+	const struct phy_driver *drv;
 
 	struct device_link *devlink;
 
-- 
cgit v1.2.3


From 240fd405528bbf7fafa0559202ca7aa524c9cd96 Mon Sep 17 00:00:00 2001
From: Aahil Awatramani <aahila@google.com>
Date: Fri, 2 Feb 2024 17:58:58 +0000
Subject: bonding: Add independent control state machine

Add support for the independent control state machine per IEEE
802.1AX-2008 5.4.15 in addition to the existing implementation of the
coupled control state machine.

Introduces two new states, AD_MUX_COLLECTING and AD_MUX_DISTRIBUTING in
the LACP MUX state machine for separated handling of an initial
Collecting state before the Collecting and Distributing state. This
enables a port to be in a state where it can receive incoming packets
while not still distributing. This is useful for reducing packet loss when
a port begins distributing before its partner is able to collect.

Added new functions such as bond_set_slave_tx_disabled_flags and
bond_set_slave_rx_enabled_flags to precisely manage the port's collecting
and distributing states. Previously, there was no dedicated method to
disable TX while keeping RX enabled, which this patch addresses.

Note that the regular flow process in the kernel's bonding driver remains
unaffected by this patch. The extension requires explicit opt-in by the
user (in order to ensure no disruptions for existing setups) via netlink
support using the new bonding parameter coupled_control. The default value
for coupled_control is set to 1 so as to preserve existing behaviour.

Signed-off-by: Aahil Awatramani <aahila@google.com>
Reviewed-by: Hangbin Liu <liuhangbin@gmail.com>
Link: https://lore.kernel.org/r/20240202175858.1573852-1-aahila@google.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 Documentation/networking/bonding.rst |  12 +++
 drivers/net/bonding/bond_3ad.c       | 157 +++++++++++++++++++++++++++++++++--
 drivers/net/bonding/bond_main.c      |   1 +
 drivers/net/bonding/bond_netlink.c   |  16 ++++
 drivers/net/bonding/bond_options.c   |  28 ++++++-
 include/net/bond_3ad.h               |   2 +
 include/net/bond_options.h           |   1 +
 include/net/bonding.h                |  23 +++++
 include/uapi/linux/if_link.h         |   1 +
 tools/include/uapi/linux/if_link.h   |   1 +
 10 files changed, 234 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/bonding.rst b/Documentation/networking/bonding.rst
index f7a73421eb76..e774b48de9f5 100644
--- a/Documentation/networking/bonding.rst
+++ b/Documentation/networking/bonding.rst
@@ -444,6 +444,18 @@ arp_missed_max
 
 	The default value is 2, and the allowable range is 1 - 255.
 
+coupled_control
+
+    Specifies whether the LACP state machine's MUX in the 802.3ad mode
+    should have separate Collecting and Distributing states.
+
+    This is by implementing the independent control state machine per
+    IEEE 802.1AX-2008 5.4.15 in addition to the existing coupled control
+    state machine.
+
+    The default value is 1. This setting does not separate the Collecting
+    and Distributing states, maintaining the bond in coupled control.
+
 downdelay
 
 	Specifies the time, in milliseconds, to wait before disabling
diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c
index c99ffe6c683a..f2942e8c6c91 100644
--- a/drivers/net/bonding/bond_3ad.c
+++ b/drivers/net/bonding/bond_3ad.c
@@ -106,6 +106,9 @@ static void ad_agg_selection_logic(struct aggregator *aggregator,
 static void ad_clear_agg(struct aggregator *aggregator);
 static void ad_initialize_agg(struct aggregator *aggregator);
 static void ad_initialize_port(struct port *port, int lacp_fast);
+static void ad_enable_collecting(struct port *port);
+static void ad_disable_distributing(struct port *port,
+				    bool *update_slave_arr);
 static void ad_enable_collecting_distributing(struct port *port,
 					      bool *update_slave_arr);
 static void ad_disable_collecting_distributing(struct port *port,
@@ -171,9 +174,38 @@ static inline int __agg_has_partner(struct aggregator *agg)
 	return !is_zero_ether_addr(agg->partner_system.mac_addr_value);
 }
 
+/**
+ * __disable_distributing_port - disable the port's slave for distributing.
+ * Port will still be able to collect.
+ * @port: the port we're looking at
+ *
+ * This will disable only distributing on the port's slave.
+ */
+static void __disable_distributing_port(struct port *port)
+{
+	bond_set_slave_tx_disabled_flags(port->slave, BOND_SLAVE_NOTIFY_LATER);
+}
+
+/**
+ * __enable_collecting_port - enable the port's slave for collecting,
+ * if it's up
+ * @port: the port we're looking at
+ *
+ * This will enable only collecting on the port's slave.
+ */
+static void __enable_collecting_port(struct port *port)
+{
+	struct slave *slave = port->slave;
+
+	if (slave->link == BOND_LINK_UP && bond_slave_is_up(slave))
+		bond_set_slave_rx_enabled_flags(slave, BOND_SLAVE_NOTIFY_LATER);
+}
+
 /**
  * __disable_port - disable the port's slave
  * @port: the port we're looking at
+ *
+ * This will disable both collecting and distributing on the port's slave.
  */
 static inline void __disable_port(struct port *port)
 {
@@ -183,6 +215,8 @@ static inline void __disable_port(struct port *port)
 /**
  * __enable_port - enable the port's slave, if it's up
  * @port: the port we're looking at
+ *
+ * This will enable both collecting and distributing on the port's slave.
  */
 static inline void __enable_port(struct port *port)
 {
@@ -193,10 +227,27 @@ static inline void __enable_port(struct port *port)
 }
 
 /**
- * __port_is_enabled - check if the port's slave is in active state
+ * __port_move_to_attached_state - check if port should transition back to attached
+ * state.
+ * @port: the port we're looking at
+ */
+static bool __port_move_to_attached_state(struct port *port)
+{
+	if (!(port->sm_vars & AD_PORT_SELECTED) ||
+	    (port->sm_vars & AD_PORT_STANDBY) ||
+	    !(port->partner_oper.port_state & LACP_STATE_SYNCHRONIZATION) ||
+	    !(port->actor_oper_port_state & LACP_STATE_SYNCHRONIZATION))
+		port->sm_mux_state = AD_MUX_ATTACHED;
+
+	return port->sm_mux_state == AD_MUX_ATTACHED;
+}
+
+/**
+ * __port_is_collecting_distributing - check if the port's slave is in the
+ * combined collecting/distributing state
  * @port: the port we're looking at
  */
-static inline int __port_is_enabled(struct port *port)
+static int __port_is_collecting_distributing(struct port *port)
 {
 	return bond_is_active_slave(port->slave);
 }
@@ -942,6 +993,7 @@ static int ad_marker_send(struct port *port, struct bond_marker *marker)
  */
 static void ad_mux_machine(struct port *port, bool *update_slave_arr)
 {
+	struct bonding *bond = __get_bond_by_port(port);
 	mux_states_t last_state;
 
 	/* keep current State Machine state to compare later if it was
@@ -999,9 +1051,13 @@ static void ad_mux_machine(struct port *port, bool *update_slave_arr)
 			if ((port->sm_vars & AD_PORT_SELECTED) &&
 			    (port->partner_oper.port_state & LACP_STATE_SYNCHRONIZATION) &&
 			    !__check_agg_selection_timer(port)) {
-				if (port->aggregator->is_active)
-					port->sm_mux_state =
-					    AD_MUX_COLLECTING_DISTRIBUTING;
+				if (port->aggregator->is_active) {
+					int state = AD_MUX_COLLECTING_DISTRIBUTING;
+
+					if (!bond->params.coupled_control)
+						state = AD_MUX_COLLECTING;
+					port->sm_mux_state = state;
+				}
 			} else if (!(port->sm_vars & AD_PORT_SELECTED) ||
 				   (port->sm_vars & AD_PORT_STANDBY)) {
 				/* if UNSELECTED or STANDBY */
@@ -1019,11 +1075,45 @@ static void ad_mux_machine(struct port *port, bool *update_slave_arr)
 			}
 			break;
 		case AD_MUX_COLLECTING_DISTRIBUTING:
+			if (!__port_move_to_attached_state(port)) {
+				/* if port state hasn't changed make
+				 * sure that a collecting distributing
+				 * port in an active aggregator is enabled
+				 */
+				if (port->aggregator->is_active &&
+				    !__port_is_collecting_distributing(port)) {
+					__enable_port(port);
+					*update_slave_arr = true;
+				}
+			}
+			break;
+		case AD_MUX_COLLECTING:
+			if (!__port_move_to_attached_state(port)) {
+				if ((port->sm_vars & AD_PORT_SELECTED) &&
+				    (port->partner_oper.port_state & LACP_STATE_SYNCHRONIZATION) &&
+				    (port->partner_oper.port_state & LACP_STATE_COLLECTING)) {
+					port->sm_mux_state = AD_MUX_DISTRIBUTING;
+				} else {
+					/* If port state hasn't changed, make sure that a collecting
+					 * port is enabled for an active aggregator.
+					 */
+					struct slave *slave = port->slave;
+
+					if (port->aggregator->is_active &&
+					    bond_is_slave_rx_disabled(slave)) {
+						ad_enable_collecting(port);
+						*update_slave_arr = true;
+					}
+				}
+			}
+			break;
+		case AD_MUX_DISTRIBUTING:
 			if (!(port->sm_vars & AD_PORT_SELECTED) ||
 			    (port->sm_vars & AD_PORT_STANDBY) ||
+			    !(port->partner_oper.port_state & LACP_STATE_COLLECTING) ||
 			    !(port->partner_oper.port_state & LACP_STATE_SYNCHRONIZATION) ||
 			    !(port->actor_oper_port_state & LACP_STATE_SYNCHRONIZATION)) {
-				port->sm_mux_state = AD_MUX_ATTACHED;
+				port->sm_mux_state = AD_MUX_COLLECTING;
 			} else {
 				/* if port state hasn't changed make
 				 * sure that a collecting distributing
@@ -1031,7 +1121,7 @@ static void ad_mux_machine(struct port *port, bool *update_slave_arr)
 				 */
 				if (port->aggregator &&
 				    port->aggregator->is_active &&
-				    !__port_is_enabled(port)) {
+				    !__port_is_collecting_distributing(port)) {
 					__enable_port(port);
 					*update_slave_arr = true;
 				}
@@ -1082,6 +1172,20 @@ static void ad_mux_machine(struct port *port, bool *update_slave_arr)
 							  update_slave_arr);
 			port->ntt = true;
 			break;
+		case AD_MUX_COLLECTING:
+			port->actor_oper_port_state |= LACP_STATE_COLLECTING;
+			port->actor_oper_port_state &= ~LACP_STATE_DISTRIBUTING;
+			port->actor_oper_port_state |= LACP_STATE_SYNCHRONIZATION;
+			ad_enable_collecting(port);
+			ad_disable_distributing(port, update_slave_arr);
+			port->ntt = true;
+			break;
+		case AD_MUX_DISTRIBUTING:
+			port->actor_oper_port_state |= LACP_STATE_DISTRIBUTING;
+			port->actor_oper_port_state |= LACP_STATE_SYNCHRONIZATION;
+			ad_enable_collecting_distributing(port,
+							  update_slave_arr);
+			break;
 		default:
 			break;
 		}
@@ -1906,6 +2010,45 @@ static void ad_initialize_port(struct port *port, int lacp_fast)
 	}
 }
 
+/**
+ * ad_enable_collecting - enable a port's receive
+ * @port: the port we're looking at
+ *
+ * Enable @port if it's in an active aggregator
+ */
+static void ad_enable_collecting(struct port *port)
+{
+	if (port->aggregator->is_active) {
+		struct slave *slave = port->slave;
+
+		slave_dbg(slave->bond->dev, slave->dev,
+			  "Enabling collecting on port %d (LAG %d)\n",
+			  port->actor_port_number,
+			  port->aggregator->aggregator_identifier);
+		__enable_collecting_port(port);
+	}
+}
+
+/**
+ * ad_disable_distributing - disable a port's transmit
+ * @port: the port we're looking at
+ * @update_slave_arr: Does slave array need update?
+ */
+static void ad_disable_distributing(struct port *port, bool *update_slave_arr)
+{
+	if (port->aggregator &&
+	    !MAC_ADDRESS_EQUAL(&port->aggregator->partner_system,
+			       &(null_mac_addr))) {
+		slave_dbg(port->slave->bond->dev, port->slave->dev,
+			  "Disabling distributing on port %d (LAG %d)\n",
+			  port->actor_port_number,
+			  port->aggregator->aggregator_identifier);
+		__disable_distributing_port(port);
+		/* Slave array needs an update */
+		*update_slave_arr = true;
+	}
+}
+
 /**
  * ad_enable_collecting_distributing - enable a port's transmit/receive
  * @port: the port we're looking at
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 4e0600c7b050..ae9d32c0faf4 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -6306,6 +6306,7 @@ static int __init bond_check_params(struct bond_params *params)
 	params->ad_actor_sys_prio = ad_actor_sys_prio;
 	eth_zero_addr(params->ad_actor_system);
 	params->ad_user_port_key = ad_user_port_key;
+	params->coupled_control = 1;
 	if (packets_per_slave > 0) {
 		params->reciprocal_packets_per_slave =
 			reciprocal_value(packets_per_slave);
diff --git a/drivers/net/bonding/bond_netlink.c b/drivers/net/bonding/bond_netlink.c
index cfa74cf8bb1a..29b4c3d1b9b6 100644
--- a/drivers/net/bonding/bond_netlink.c
+++ b/drivers/net/bonding/bond_netlink.c
@@ -122,6 +122,7 @@ static const struct nla_policy bond_policy[IFLA_BOND_MAX + 1] = {
 	[IFLA_BOND_PEER_NOTIF_DELAY]    = NLA_POLICY_FULL_RANGE(NLA_U32, &delay_range),
 	[IFLA_BOND_MISSED_MAX]		= { .type = NLA_U8 },
 	[IFLA_BOND_NS_IP6_TARGET]	= { .type = NLA_NESTED },
+	[IFLA_BOND_COUPLED_CONTROL]	= { .type = NLA_U8 },
 };
 
 static const struct nla_policy bond_slave_policy[IFLA_BOND_SLAVE_MAX + 1] = {
@@ -549,6 +550,16 @@ static int bond_changelink(struct net_device *bond_dev, struct nlattr *tb[],
 			return err;
 	}
 
+	if (data[IFLA_BOND_COUPLED_CONTROL]) {
+		int coupled_control = nla_get_u8(data[IFLA_BOND_COUPLED_CONTROL]);
+
+		bond_opt_initval(&newval, coupled_control);
+		err = __bond_opt_set(bond, BOND_OPT_COUPLED_CONTROL, &newval,
+				     data[IFLA_BOND_COUPLED_CONTROL], extack);
+		if (err)
+			return err;
+	}
+
 	return 0;
 }
 
@@ -615,6 +626,7 @@ static size_t bond_get_size(const struct net_device *bond_dev)
 						/* IFLA_BOND_NS_IP6_TARGET */
 		nla_total_size(sizeof(struct nlattr)) +
 		nla_total_size(sizeof(struct in6_addr)) * BOND_MAX_NS_TARGETS +
+		nla_total_size(sizeof(u8)) +	/* IFLA_BOND_COUPLED_CONTROL */
 		0;
 }
 
@@ -774,6 +786,10 @@ static int bond_fill_info(struct sk_buff *skb,
 		       bond->params.missed_max))
 		goto nla_put_failure;
 
+	if (nla_put_u8(skb, IFLA_BOND_COUPLED_CONTROL,
+		       bond->params.coupled_control))
+		goto nla_put_failure;
+
 	if (BOND_MODE(bond) == BOND_MODE_8023AD) {
 		struct ad_info info;
 
diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c
index f3f27f0bd2a6..4cdbc7e084f4 100644
--- a/drivers/net/bonding/bond_options.c
+++ b/drivers/net/bonding/bond_options.c
@@ -84,7 +84,8 @@ static int bond_option_ad_user_port_key_set(struct bonding *bond,
 					    const struct bond_opt_value *newval);
 static int bond_option_missed_max_set(struct bonding *bond,
 				      const struct bond_opt_value *newval);
-
+static int bond_option_coupled_control_set(struct bonding *bond,
+					   const struct bond_opt_value *newval);
 
 static const struct bond_opt_value bond_mode_tbl[] = {
 	{ "balance-rr",    BOND_MODE_ROUNDROBIN,   BOND_VALFLAG_DEFAULT},
@@ -232,6 +233,12 @@ static const struct bond_opt_value bond_missed_max_tbl[] = {
 	{ NULL,		-1,	0},
 };
 
+static const struct bond_opt_value bond_coupled_control_tbl[] = {
+	{ "on",  1,  BOND_VALFLAG_DEFAULT},
+	{ "off", 0,  0},
+	{ NULL,  -1, 0},
+};
+
 static const struct bond_option bond_opts[BOND_OPT_LAST] = {
 	[BOND_OPT_MODE] = {
 		.id = BOND_OPT_MODE,
@@ -496,6 +503,15 @@ static const struct bond_option bond_opts[BOND_OPT_LAST] = {
 		.desc = "Delay between each peer notification on failover event, in milliseconds",
 		.values = bond_peer_notif_delay_tbl,
 		.set = bond_option_peer_notif_delay_set
+	},
+	[BOND_OPT_COUPLED_CONTROL] = {
+		.id = BOND_OPT_COUPLED_CONTROL,
+		.name = "coupled_control",
+		.desc = "Opt into using coupled control MUX for LACP states",
+		.unsuppmodes = BOND_MODE_ALL_EX(BIT(BOND_MODE_8023AD)),
+		.flags = BOND_OPTFLAG_IFDOWN,
+		.values = bond_coupled_control_tbl,
+		.set = bond_option_coupled_control_set,
 	}
 };
 
@@ -1692,3 +1708,13 @@ static int bond_option_ad_user_port_key_set(struct bonding *bond,
 	bond->params.ad_user_port_key = newval->value;
 	return 0;
 }
+
+static int bond_option_coupled_control_set(struct bonding *bond,
+					   const struct bond_opt_value *newval)
+{
+	netdev_info(bond->dev, "Setting coupled_control to %s (%llu)\n",
+		    newval->string, newval->value);
+
+	bond->params.coupled_control = newval->value;
+	return 0;
+}
diff --git a/include/net/bond_3ad.h b/include/net/bond_3ad.h
index c5e57c6bd873..9ce5ac2bfbad 100644
--- a/include/net/bond_3ad.h
+++ b/include/net/bond_3ad.h
@@ -54,6 +54,8 @@ typedef enum {
 	AD_MUX_DETACHED,	/* mux machine */
 	AD_MUX_WAITING,		/* mux machine */
 	AD_MUX_ATTACHED,	/* mux machine */
+	AD_MUX_COLLECTING,	/* mux machine */
+	AD_MUX_DISTRIBUTING,	/* mux machine */
 	AD_MUX_COLLECTING_DISTRIBUTING	/* mux machine */
 } mux_states_t;
 
diff --git a/include/net/bond_options.h b/include/net/bond_options.h
index 69292ecc0325..473a0147769e 100644
--- a/include/net/bond_options.h
+++ b/include/net/bond_options.h
@@ -76,6 +76,7 @@ enum {
 	BOND_OPT_MISSED_MAX,
 	BOND_OPT_NS_TARGETS,
 	BOND_OPT_PRIO,
+	BOND_OPT_COUPLED_CONTROL,
 	BOND_OPT_LAST
 };
 
diff --git a/include/net/bonding.h b/include/net/bonding.h
index 5b8b1b644a2d..b61fb1aa3a56 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -148,6 +148,7 @@ struct bond_params {
 #if IS_ENABLED(CONFIG_IPV6)
 	struct in6_addr ns_targets[BOND_MAX_NS_TARGETS];
 #endif
+	int coupled_control;
 
 	/* 2 bytes of padding : see ether_addr_equal_64bits() */
 	u8 ad_actor_system[ETH_ALEN + 2];
@@ -167,6 +168,7 @@ struct slave {
 	u8     backup:1,   /* indicates backup slave. Value corresponds with
 			      BOND_STATE_ACTIVE and BOND_STATE_BACKUP */
 	       inactive:1, /* indicates inactive slave */
+	       rx_disabled:1, /* indicates whether slave's Rx is disabled */
 	       should_notify:1, /* indicates whether the state changed */
 	       should_notify_link:1; /* indicates whether the link changed */
 	u8     duplex;
@@ -568,6 +570,14 @@ static inline void bond_set_slave_inactive_flags(struct slave *slave,
 		bond_set_slave_state(slave, BOND_STATE_BACKUP, notify);
 	if (!slave->bond->params.all_slaves_active)
 		slave->inactive = 1;
+	if (BOND_MODE(slave->bond) == BOND_MODE_8023AD)
+		slave->rx_disabled = 1;
+}
+
+static inline void bond_set_slave_tx_disabled_flags(struct slave *slave,
+						 bool notify)
+{
+	bond_set_slave_state(slave, BOND_STATE_BACKUP, notify);
 }
 
 static inline void bond_set_slave_active_flags(struct slave *slave,
@@ -575,6 +585,14 @@ static inline void bond_set_slave_active_flags(struct slave *slave,
 {
 	bond_set_slave_state(slave, BOND_STATE_ACTIVE, notify);
 	slave->inactive = 0;
+	if (BOND_MODE(slave->bond) == BOND_MODE_8023AD)
+		slave->rx_disabled = 0;
+}
+
+static inline void bond_set_slave_rx_enabled_flags(struct slave *slave,
+					       bool notify)
+{
+	slave->rx_disabled = 0;
 }
 
 static inline bool bond_is_slave_inactive(struct slave *slave)
@@ -582,6 +600,11 @@ static inline bool bond_is_slave_inactive(struct slave *slave)
 	return slave->inactive;
 }
 
+static inline bool bond_is_slave_rx_disabled(struct slave *slave)
+{
+	return slave->rx_disabled;
+}
+
 static inline void bond_propose_link_state(struct slave *slave, int state)
 {
 	slave->link_new_state = state;
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index ab9bcff96e4d..ffa637b38c93 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -1505,6 +1505,7 @@ enum {
 	IFLA_BOND_AD_LACP_ACTIVE,
 	IFLA_BOND_MISSED_MAX,
 	IFLA_BOND_NS_IP6_TARGET,
+	IFLA_BOND_COUPLED_CONTROL,
 	__IFLA_BOND_MAX,
 };
 
diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h
index a0aa05a28cf2..f0d71b2a3f1e 100644
--- a/tools/include/uapi/linux/if_link.h
+++ b/tools/include/uapi/linux/if_link.h
@@ -974,6 +974,7 @@ enum {
 	IFLA_BOND_AD_LACP_ACTIVE,
 	IFLA_BOND_MISSED_MAX,
 	IFLA_BOND_NS_IP6_TARGET,
+	IFLA_BOND_COUPLED_CONTROL,
 	__IFLA_BOND_MAX,
 };
 
-- 
cgit v1.2.3


From b64691274f5d33fc9d93af73483162967f7ec5bb Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Sat, 3 Feb 2024 20:53:15 +0100
Subject: net: phy: add helper phy_advertise_eee_all

Per default phylib preserves the EEE advertising at the time of
phy probing. The EEE advertising can be changed from user space,
in addition this helper allows to set the EEE advertising to all
supported modes from drivers in kernel space.

Suggested-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://lore.kernel.org/r/20bfc471-aeeb-4ae4-ba09-7d6d4be6b86a@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/phy_device.c | 16 ++++++++++++++++
 include/linux/phy.h          |  1 +
 2 files changed, 17 insertions(+)

(limited to 'include')

diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 2eed8f03621d..839e42f9e16a 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -2777,6 +2777,22 @@ void phy_advertise_supported(struct phy_device *phydev)
 }
 EXPORT_SYMBOL(phy_advertise_supported);
 
+/**
+ * phy_advertise_eee_all - Advertise all supported EEE modes
+ * @phydev: target phy_device struct
+ *
+ * Description: Per default phylib preserves the EEE advertising at the time of
+ * phy probing, which might be a subset of the supported EEE modes. Use this
+ * function when all supported EEE modes should be advertised. This does not
+ * trigger auto-negotiation, so must be called before phy_start()/
+ * phylink_start() which will start auto-negotiation.
+ */
+void phy_advertise_eee_all(struct phy_device *phydev)
+{
+	linkmode_copy(phydev->advertising_eee, phydev->supported_eee);
+}
+EXPORT_SYMBOL_GPL(phy_advertise_eee_all);
+
 /**
  * phy_support_sym_pause - Enable support of symmetrical pause
  * @phydev: target phy_device struct
diff --git a/include/linux/phy.h b/include/linux/phy.h
index ad93f8b1b128..fd8dbea9b4d9 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1960,6 +1960,7 @@ int phy_get_rate_matching(struct phy_device *phydev,
 void phy_set_max_speed(struct phy_device *phydev, u32 max_speed);
 void phy_remove_link_mode(struct phy_device *phydev, u32 link_mode);
 void phy_advertise_supported(struct phy_device *phydev);
+void phy_advertise_eee_all(struct phy_device *phydev);
 void phy_support_sym_pause(struct phy_device *phydev);
 void phy_support_asym_pause(struct phy_device *phydev);
 void phy_set_sym_pause(struct phy_device *phydev, bool rx, bool tx,
-- 
cgit v1.2.3


From 23c5ae6d467520987dbc8682c3ae6ea0e80a5f27 Mon Sep 17 00:00:00 2001
From: George Guo <guodongtai@kylinos.cn>
Date: Sun, 4 Feb 2024 10:35:31 +0800
Subject: netlabel: cleanup struct netlbl_lsm_catmap

Simplify the code from macro NETLBL_CATMAP_MAPTYPE to u64, and fix
warning "Macros with complex values should be enclosed in parentheses"
on "#define NETLBL_CATMAP_BIT (NETLBL_CATMAP_MAPTYPE)0x01", which is
modified to "#define NETLBL_CATMAP_BIT ((u64)0x01)".

Signed-off-by: George Guo <guodongtai@kylinos.cn>
Acked-by: Paul Moore <paul@paul-moore.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netlabel.h       | 7 +++----
 net/netlabel/netlabel_kapi.c | 8 ++++----
 2 files changed, 7 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/net/netlabel.h b/include/net/netlabel.h
index 43ae50337685..f3ab0b8a4b18 100644
--- a/include/net/netlabel.h
+++ b/include/net/netlabel.h
@@ -145,15 +145,14 @@ struct netlbl_lsm_cache {
  * processing.
  *
  */
-#define NETLBL_CATMAP_MAPTYPE           u64
 #define NETLBL_CATMAP_MAPCNT            4
-#define NETLBL_CATMAP_MAPSIZE           (sizeof(NETLBL_CATMAP_MAPTYPE) * 8)
+#define NETLBL_CATMAP_MAPSIZE           (sizeof(u64) * 8)
 #define NETLBL_CATMAP_SIZE              (NETLBL_CATMAP_MAPSIZE * \
 					 NETLBL_CATMAP_MAPCNT)
-#define NETLBL_CATMAP_BIT               (NETLBL_CATMAP_MAPTYPE)0x01
+#define NETLBL_CATMAP_BIT               ((u64)0x01)
 struct netlbl_lsm_catmap {
 	u32 startbit;
-	NETLBL_CATMAP_MAPTYPE bitmap[NETLBL_CATMAP_MAPCNT];
+	u64 bitmap[NETLBL_CATMAP_MAPCNT];
 	struct netlbl_lsm_catmap *next;
 };
 
diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c
index 27511c90a26f..7b844581ebee 100644
--- a/net/netlabel/netlabel_kapi.c
+++ b/net/netlabel/netlabel_kapi.c
@@ -610,7 +610,7 @@ int netlbl_catmap_walk(struct netlbl_lsm_catmap *catmap, u32 offset)
 	struct netlbl_lsm_catmap *iter;
 	u32 idx;
 	u32 bit;
-	NETLBL_CATMAP_MAPTYPE bitmap;
+	u64 bitmap;
 
 	iter = _netlbl_catmap_getnode(&catmap, offset, _CM_F_WALK, 0);
 	if (iter == NULL)
@@ -666,8 +666,8 @@ int netlbl_catmap_walkrng(struct netlbl_lsm_catmap *catmap, u32 offset)
 	struct netlbl_lsm_catmap *prev = NULL;
 	u32 idx;
 	u32 bit;
-	NETLBL_CATMAP_MAPTYPE bitmask;
-	NETLBL_CATMAP_MAPTYPE bitmap;
+	u64 bitmask;
+	u64 bitmap;
 
 	iter = _netlbl_catmap_getnode(&catmap, offset, _CM_F_WALK, 0);
 	if (iter == NULL)
@@ -857,7 +857,7 @@ int netlbl_catmap_setlong(struct netlbl_lsm_catmap **catmap,
 
 	offset -= iter->startbit;
 	idx = offset / NETLBL_CATMAP_MAPSIZE;
-	iter->bitmap[idx] |= (NETLBL_CATMAP_MAPTYPE)bitmap
+	iter->bitmap[idx] |= (u64)bitmap
 			     << (offset % NETLBL_CATMAP_MAPSIZE);
 
 	return 0;
-- 
cgit v1.2.3


From d160c66cda0ac8614adc53a5b5b0e6d6f1a05a5b Mon Sep 17 00:00:00 2001
From: Amit Cohen <amcohen@nvidia.com>
Date: Mon, 5 Feb 2024 12:30:22 +0200
Subject: net: Do not return value from init_dummy_netdev()

init_dummy_netdev() always returns zero and all the callers do not check
the returned value. Set the function to not return value, as it is not
really used today.

Signed-off-by: Amit Cohen <amcohen@nvidia.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://lore.kernel.org/r/20240205103022.440946-1-amcohen@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h | 2 +-
 net/core/dev.c            | 4 +---
 2 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 118c40258d07..1845dd5043b4 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3198,7 +3198,7 @@ static inline void unregister_netdevice(struct net_device *dev)
 int netdev_refcnt_read(const struct net_device *dev);
 void free_netdev(struct net_device *dev);
 void netdev_freemem(struct net_device *dev);
-int init_dummy_netdev(struct net_device *dev);
+void init_dummy_netdev(struct net_device *dev);
 
 struct net_device *netdev_get_xmit_slave(struct net_device *dev,
 					 struct sk_buff *skb,
diff --git a/net/core/dev.c b/net/core/dev.c
index 27ba057d06c4..e52e2888cccd 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -10345,7 +10345,7 @@ EXPORT_SYMBOL(register_netdevice);
  *	that need to tie several hardware interfaces to a single NAPI
  *	poll scheduler due to HW limitations.
  */
-int init_dummy_netdev(struct net_device *dev)
+void init_dummy_netdev(struct net_device *dev)
 {
 	/* Clear everything. Note we don't initialize spinlocks
 	 * are they aren't supposed to be taken by any of the
@@ -10373,8 +10373,6 @@ int init_dummy_netdev(struct net_device *dev)
 	 * because users of this 'device' dont need to change
 	 * its refcount.
 	 */
-
-	return 0;
 }
 EXPORT_SYMBOL_GPL(init_dummy_netdev);
 
-- 
cgit v1.2.3


From 6c06c88fa838fcc1b7e5380facd086f57fd9d1c4 Mon Sep 17 00:00:00 2001
From: Marek Behún <kabel@kernel.org>
Date: Sun, 4 Feb 2024 15:16:46 +0100
Subject: net: mdio: add 2.5g and 5g related PMA speed constants
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add constants indicating 2.5g and 5g ability in the MMD PMA speed
register.

Signed-off-by: Marek Behún <kabel@kernel.org>
Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Link: https://lore.kernel.org/r/98e15038-d96c-442f-93e4-410100d27866@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/mdio.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/mdio.h b/include/uapi/linux/mdio.h
index d03863da180e..3c9097502403 100644
--- a/include/uapi/linux/mdio.h
+++ b/include/uapi/linux/mdio.h
@@ -138,6 +138,8 @@
 #define MDIO_PMA_SPEED_1000		0x0010	/* 1000M capable */
 #define MDIO_PMA_SPEED_100		0x0020	/* 100M capable */
 #define MDIO_PMA_SPEED_10		0x0040	/* 10M capable */
+#define MDIO_PMA_SPEED_2_5G		0x2000	/* 2.5G capable */
+#define MDIO_PMA_SPEED_5G		0x4000	/* 5G capable */
 #define MDIO_PCS_SPEED_10P2B		0x0002	/* 10PASS-TS/2BASE-TL capable */
 #define MDIO_PCS_SPEED_2_5G		0x0040	/* 2.5G capable */
 #define MDIO_PCS_SPEED_5G		0x0080	/* 5G capable */
-- 
cgit v1.2.3


From fd4f101edbd9f99567ab2adb1f2169579ede7c13 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 6 Feb 2024 14:42:57 +0000
Subject: net: add exit_batch_rtnl() method

Many (struct pernet_operations)->exit_batch() methods have
to acquire rtnl.

In presence of rtnl mutex pressure, this makes cleanup_net()
very slow.

This patch adds a new exit_batch_rtnl() method to reduce
number of rtnl acquisitions from cleanup_net().

exit_batch_rtnl() handlers are called while rtnl is locked,
and devices to be killed can be queued in a list provided
as their second argument.

A single unregister_netdevice_many() is called right
before rtnl is released.

exit_batch_rtnl() handlers are called before ->exit() and
->exit_batch() handlers.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Antoine Tenart <atenart@kernel.org>
Link: https://lore.kernel.org/r/20240206144313.2050392-2-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/net_namespace.h |  3 +++
 net/core/net_namespace.c    | 31 ++++++++++++++++++++++++++++++-
 2 files changed, 33 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index cd0c2eedbb5e..20c34bd7a077 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -448,6 +448,9 @@ struct pernet_operations {
 	void (*pre_exit)(struct net *net);
 	void (*exit)(struct net *net);
 	void (*exit_batch)(struct list_head *net_exit_list);
+	/* Following method is called with RTNL held. */
+	void (*exit_batch_rtnl)(struct list_head *net_exit_list,
+				struct list_head *dev_kill_list);
 	unsigned int *id;
 	size_t size;
 };
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 72799533426b..233ec0cdd011 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -318,8 +318,9 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
 {
 	/* Must be called with pernet_ops_rwsem held */
 	const struct pernet_operations *ops, *saved_ops;
-	int error = 0;
 	LIST_HEAD(net_exit_list);
+	LIST_HEAD(dev_kill_list);
+	int error = 0;
 
 	refcount_set(&net->ns.count, 1);
 	ref_tracker_dir_init(&net->refcnt_tracker, 128, "net refcnt");
@@ -357,6 +358,15 @@ out_undo:
 
 	synchronize_rcu();
 
+	ops = saved_ops;
+	rtnl_lock();
+	list_for_each_entry_continue_reverse(ops, &pernet_list, list) {
+		if (ops->exit_batch_rtnl)
+			ops->exit_batch_rtnl(&net_exit_list, &dev_kill_list);
+	}
+	unregister_netdevice_many(&dev_kill_list);
+	rtnl_unlock();
+
 	ops = saved_ops;
 	list_for_each_entry_continue_reverse(ops, &pernet_list, list)
 		ops_exit_list(ops, &net_exit_list);
@@ -573,6 +583,7 @@ static void cleanup_net(struct work_struct *work)
 	struct net *net, *tmp, *last;
 	struct llist_node *net_kill_list;
 	LIST_HEAD(net_exit_list);
+	LIST_HEAD(dev_kill_list);
 
 	/* Atomically snapshot the list of namespaces to cleanup */
 	net_kill_list = llist_del_all(&cleanup_list);
@@ -613,6 +624,14 @@ static void cleanup_net(struct work_struct *work)
 	 */
 	synchronize_rcu();
 
+	rtnl_lock();
+	list_for_each_entry_reverse(ops, &pernet_list, list) {
+		if (ops->exit_batch_rtnl)
+			ops->exit_batch_rtnl(&net_exit_list, &dev_kill_list);
+	}
+	unregister_netdevice_many(&dev_kill_list);
+	rtnl_unlock();
+
 	/* Run all of the network namespace exit methods */
 	list_for_each_entry_reverse(ops, &pernet_list, list)
 		ops_exit_list(ops, &net_exit_list);
@@ -1193,7 +1212,17 @@ static void free_exit_list(struct pernet_operations *ops, struct list_head *net_
 {
 	ops_pre_exit_list(ops, net_exit_list);
 	synchronize_rcu();
+
+	if (ops->exit_batch_rtnl) {
+		LIST_HEAD(dev_kill_list);
+
+		rtnl_lock();
+		ops->exit_batch_rtnl(net_exit_list, &dev_kill_list);
+		unregister_netdevice_many(&dev_kill_list);
+		rtnl_unlock();
+	}
 	ops_exit_list(ops, net_exit_list);
+
 	ops_free_list(ops, net_exit_list);
 }
 
-- 
cgit v1.2.3


From 70f16ea2e4f673fc769fd13c00c20a32b4fe238a Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 6 Feb 2024 14:43:04 +0000
Subject: ipv4: add __unregister_nexthop_notifier()

unregister_nexthop_notifier() assumes the caller does not hold rtnl.

We need in the following patch to use it from a context
already holding rtnl.

Add __unregister_nexthop_notifier().

unregister_nexthop_notifier() becomes a wrapper.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Reviewed-by: Antoine Tenart <atenart@kernel.org>
Link: https://lore.kernel.org/r/20240206144313.2050392-9-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/nexthop.h |  1 +
 net/ipv4/nexthop.c    | 19 +++++++++++++------
 2 files changed, 14 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/net/nexthop.h b/include/net/nexthop.h
index d92046a4a078..6647ad509faa 100644
--- a/include/net/nexthop.h
+++ b/include/net/nexthop.h
@@ -218,6 +218,7 @@ struct nh_notifier_info {
 
 int register_nexthop_notifier(struct net *net, struct notifier_block *nb,
 			      struct netlink_ext_ack *extack);
+int __unregister_nexthop_notifier(struct net *net, struct notifier_block *nb);
 int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb);
 void nexthop_set_hw_flags(struct net *net, u32 id, bool offload, bool trap);
 void nexthop_bucket_set_hw_flags(struct net *net, u32 id, u16 bucket_index,
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index 7270a8631406..70509da4f080 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -3631,17 +3631,24 @@ unlock:
 }
 EXPORT_SYMBOL(register_nexthop_notifier);
 
-int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb)
+int __unregister_nexthop_notifier(struct net *net, struct notifier_block *nb)
 {
 	int err;
 
-	rtnl_lock();
 	err = blocking_notifier_chain_unregister(&net->nexthop.notifier_chain,
 						 nb);
-	if (err)
-		goto unlock;
-	nexthops_dump(net, nb, NEXTHOP_EVENT_DEL, NULL);
-unlock:
+	if (!err)
+		nexthops_dump(net, nb, NEXTHOP_EVENT_DEL, NULL);
+	return err;
+}
+EXPORT_SYMBOL(__unregister_nexthop_notifier);
+
+int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb)
+{
+	int err;
+
+	rtnl_lock();
+	err = __unregister_nexthop_notifier(net, nb);
 	rtnl_unlock();
 	return err;
 }
-- 
cgit v1.2.3


From 9b5b36374ed6953f3efcc82e7cb4c353b9869faf Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 6 Feb 2024 14:43:10 +0000
Subject: ip_tunnel: use exit_batch_rtnl() method

exit_batch_rtnl() is called while RTNL is held,
and devices to be unregistered can be queued in the dev_kill_list.

This saves one rtnl_lock()/rtnl_unlock() pair
and one unregister_netdevice_many() call.

This patch takes care of ipip, ip_vti, and ip_gre tunnels.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Antoine Tenart <atenart@kernel.org>
Link: https://lore.kernel.org/r/20240206144313.2050392-15-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/ip_tunnels.h |  3 ++-
 net/ipv4/ip_gre.c        | 24 +++++++++++++++---------
 net/ipv4/ip_tunnel.c     | 10 ++++------
 net/ipv4/ip_vti.c        |  8 +++++---
 net/ipv4/ipip.c          |  8 +++++---
 5 files changed, 31 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 2d746f4c9a0a..5cd64bb2104d 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -284,7 +284,8 @@ int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
 		       struct rtnl_link_ops *ops, char *devname);
 
 void ip_tunnel_delete_nets(struct list_head *list_net, unsigned int id,
-			   struct rtnl_link_ops *ops);
+			   struct rtnl_link_ops *ops,
+			   struct list_head *dev_to_kill);
 
 void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 		    const struct iphdr *tnl_params, const u8 protocol);
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 5169c3c72cff..aad5125b7a65 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -1025,14 +1025,16 @@ static int __net_init ipgre_init_net(struct net *net)
 	return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL);
 }
 
-static void __net_exit ipgre_exit_batch_net(struct list_head *list_net)
+static void __net_exit ipgre_exit_batch_rtnl(struct list_head *list_net,
+					     struct list_head *dev_to_kill)
 {
-	ip_tunnel_delete_nets(list_net, ipgre_net_id, &ipgre_link_ops);
+	ip_tunnel_delete_nets(list_net, ipgre_net_id, &ipgre_link_ops,
+			      dev_to_kill);
 }
 
 static struct pernet_operations ipgre_net_ops = {
 	.init = ipgre_init_net,
-	.exit_batch = ipgre_exit_batch_net,
+	.exit_batch_rtnl = ipgre_exit_batch_rtnl,
 	.id   = &ipgre_net_id,
 	.size = sizeof(struct ip_tunnel_net),
 };
@@ -1697,14 +1699,16 @@ static int __net_init ipgre_tap_init_net(struct net *net)
 	return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0");
 }
 
-static void __net_exit ipgre_tap_exit_batch_net(struct list_head *list_net)
+static void __net_exit ipgre_tap_exit_batch_rtnl(struct list_head *list_net,
+						 struct list_head *dev_to_kill)
 {
-	ip_tunnel_delete_nets(list_net, gre_tap_net_id, &ipgre_tap_ops);
+	ip_tunnel_delete_nets(list_net, gre_tap_net_id, &ipgre_tap_ops,
+			      dev_to_kill);
 }
 
 static struct pernet_operations ipgre_tap_net_ops = {
 	.init = ipgre_tap_init_net,
-	.exit_batch = ipgre_tap_exit_batch_net,
+	.exit_batch_rtnl = ipgre_tap_exit_batch_rtnl,
 	.id   = &gre_tap_net_id,
 	.size = sizeof(struct ip_tunnel_net),
 };
@@ -1715,14 +1719,16 @@ static int __net_init erspan_init_net(struct net *net)
 				  &erspan_link_ops, "erspan0");
 }
 
-static void __net_exit erspan_exit_batch_net(struct list_head *net_list)
+static void __net_exit erspan_exit_batch_rtnl(struct list_head *net_list,
+					      struct list_head *dev_to_kill)
 {
-	ip_tunnel_delete_nets(net_list, erspan_net_id, &erspan_link_ops);
+	ip_tunnel_delete_nets(net_list, erspan_net_id, &erspan_link_ops,
+			      dev_to_kill);
 }
 
 static struct pernet_operations erspan_net_ops = {
 	.init = erspan_init_net,
-	.exit_batch = erspan_exit_batch_net,
+	.exit_batch_rtnl = erspan_exit_batch_rtnl,
 	.id   = &erspan_net_id,
 	.size = sizeof(struct ip_tunnel_net),
 };
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index beeae624c412..00da0b80320f 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -1130,19 +1130,17 @@ static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
 }
 
 void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
-			   struct rtnl_link_ops *ops)
+			   struct rtnl_link_ops *ops,
+			   struct list_head *dev_to_kill)
 {
 	struct ip_tunnel_net *itn;
 	struct net *net;
-	LIST_HEAD(list);
 
-	rtnl_lock();
+	ASSERT_RTNL();
 	list_for_each_entry(net, net_list, exit_list) {
 		itn = net_generic(net, id);
-		ip_tunnel_destroy(net, itn, &list, ops);
+		ip_tunnel_destroy(net, itn, dev_to_kill, ops);
 	}
-	unregister_netdevice_many(&list);
-	rtnl_unlock();
 }
 EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);
 
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 9ab9b3ebe0cd..fb1f52d21311 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -510,14 +510,16 @@ static int __net_init vti_init_net(struct net *net)
 	return 0;
 }
 
-static void __net_exit vti_exit_batch_net(struct list_head *list_net)
+static void __net_exit vti_exit_batch_rtnl(struct list_head *list_net,
+					   struct list_head *dev_to_kill)
 {
-	ip_tunnel_delete_nets(list_net, vti_net_id, &vti_link_ops);
+	ip_tunnel_delete_nets(list_net, vti_net_id, &vti_link_ops,
+			      dev_to_kill);
 }
 
 static struct pernet_operations vti_net_ops = {
 	.init = vti_init_net,
-	.exit_batch = vti_exit_batch_net,
+	.exit_batch_rtnl = vti_exit_batch_rtnl,
 	.id   = &vti_net_id,
 	.size = sizeof(struct ip_tunnel_net),
 };
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 27b8f83c6ea2..0151eea06cc5 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -592,14 +592,16 @@ static int __net_init ipip_init_net(struct net *net)
 	return ip_tunnel_init_net(net, ipip_net_id, &ipip_link_ops, "tunl0");
 }
 
-static void __net_exit ipip_exit_batch_net(struct list_head *list_net)
+static void __net_exit ipip_exit_batch_rtnl(struct list_head *list_net,
+					    struct list_head *dev_to_kill)
 {
-	ip_tunnel_delete_nets(list_net, ipip_net_id, &ipip_link_ops);
+	ip_tunnel_delete_nets(list_net, ipip_net_id, &ipip_link_ops,
+			      dev_to_kill);
 }
 
 static struct pernet_operations ipip_net_ops = {
 	.init = ipip_init_net,
-	.exit_batch = ipip_exit_batch_net,
+	.exit_batch_rtnl = ipip_exit_batch_rtnl,
 	.id   = &ipip_net_id,
 	.size = sizeof(struct ip_tunnel_net),
 };
-- 
cgit v1.2.3


From 0a44dfc070749514b804ccac0b1fd38718f7daa1 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 29 Jan 2024 19:34:38 +0100
Subject: wifi: mac80211: simplify non-chanctx drivers

There are still surprisingly many non-chanctx drivers, but in
mac80211 that code is a bit awkward. Simplify this by having
those drivers assign 'emulated' ops, so that the mac80211 code
can be more unified between non-chanctx/chanctx drivers. This
cuts the number of places caring about it by about 15, which
are scattered across - now they're fewer and no longer in the
channel context handling.

Link: https://msgid.link/20240129194108.6d0ead50f5cf.I60d093b2fc81ca1853925a4d0ac3a2337d5baa5b@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/admtek/adm8211.c              |   4 +
 drivers/net/wireless/ath/ar5523/ar5523.c           |   4 +
 drivers/net/wireless/ath/ath5k/mac80211-ops.c      |   4 +
 drivers/net/wireless/ath/ath9k/htc_drv_main.c      |   4 +
 drivers/net/wireless/ath/ath9k/main.c              |   4 +
 drivers/net/wireless/ath/carl9170/main.c           |   4 +
 drivers/net/wireless/ath/wcn36xx/main.c            |   4 +
 drivers/net/wireless/atmel/at76c50x-usb.c          |   4 +
 drivers/net/wireless/broadcom/b43/main.c           |   4 +
 drivers/net/wireless/broadcom/b43legacy/main.c     |   4 +
 .../broadcom/brcm80211/brcmsmac/mac80211_if.c      |   4 +
 drivers/net/wireless/intel/iwlegacy/3945-mac.c     |   4 +
 drivers/net/wireless/intel/iwlegacy/4965-mac.c     |   4 +
 drivers/net/wireless/intel/iwlwifi/dvm/mac80211.c  |   4 +
 drivers/net/wireless/intersil/p54/main.c           |   4 +
 drivers/net/wireless/marvell/libertas_tf/main.c    |   4 +
 drivers/net/wireless/marvell/mwl8k.c               |   4 +
 drivers/net/wireless/mediatek/mt76/mt7603/main.c   |   4 +
 drivers/net/wireless/mediatek/mt76/mt76x0/pci.c    |   4 +
 drivers/net/wireless/mediatek/mt76/mt76x0/usb.c    |   4 +
 .../net/wireless/mediatek/mt76/mt76x2/pci_main.c   |   4 +
 .../net/wireless/mediatek/mt76/mt76x2/usb_main.c   |   4 +
 drivers/net/wireless/mediatek/mt76/mt792x_core.c   |   7 +-
 drivers/net/wireless/mediatek/mt76/mt7996/main.c   |   4 +
 drivers/net/wireless/mediatek/mt7601u/main.c       |   4 +
 drivers/net/wireless/purelifi/plfxlc/mac.c         |   4 +
 drivers/net/wireless/ralink/rt2x00/rt2400pci.c     |   4 +
 drivers/net/wireless/ralink/rt2x00/rt2500pci.c     |   4 +
 drivers/net/wireless/ralink/rt2x00/rt2500usb.c     |   4 +
 drivers/net/wireless/ralink/rt2x00/rt2800pci.c     |   4 +
 drivers/net/wireless/ralink/rt2x00/rt2800soc.c     |   4 +
 drivers/net/wireless/ralink/rt2x00/rt2800usb.c     |   4 +
 drivers/net/wireless/ralink/rt2x00/rt61pci.c       |   4 +
 drivers/net/wireless/ralink/rt2x00/rt73usb.c       |   4 +
 drivers/net/wireless/realtek/rtl818x/rtl8180/dev.c |   4 +
 drivers/net/wireless/realtek/rtl818x/rtl8187/dev.c |   4 +
 .../net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c  |   4 +
 drivers/net/wireless/realtek/rtlwifi/core.c        |   4 +
 drivers/net/wireless/realtek/rtw88/mac80211.c      |   4 +
 drivers/net/wireless/realtek/rtw89/core.c          |   7 +-
 drivers/net/wireless/rsi/rsi_91x_mac80211.c        |   4 +
 drivers/net/wireless/st/cw1200/main.c              |   4 +
 drivers/net/wireless/ti/wl1251/main.c              |   4 +
 drivers/net/wireless/virtual/mac80211_hwsim.c      |   4 +
 drivers/net/wireless/zydas/zd1211rw/zd_mac.c       |   4 +
 drivers/staging/vt6655/device_main.c               |   4 +
 drivers/staging/vt6656/main_usb.c                  |   4 +
 include/net/mac80211.h                             |  13 ++
 net/mac80211/cfg.c                                 |  42 ++---
 net/mac80211/chan.c                                | 111 ++---------
 net/mac80211/ieee80211_i.h                         |   9 +-
 net/mac80211/iface.c                               |   6 +-
 net/mac80211/main.c                                | 205 +++++++++++++++++----
 net/mac80211/mlme.c                                |   3 +-
 net/mac80211/offchannel.c                          |  21 ++-
 net/mac80211/scan.c                                |  18 +-
 net/mac80211/tx.c                                  |   2 -
 net/mac80211/util.c                                |  27 ++-
 58 files changed, 444 insertions(+), 207 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/admtek/adm8211.c b/drivers/net/wireless/admtek/adm8211.c
index 2fceea9f6550..e3fd48dd3909 100644
--- a/drivers/net/wireless/admtek/adm8211.c
+++ b/drivers/net/wireless/admtek/adm8211.c
@@ -1759,6 +1759,10 @@ static int adm8211_alloc_rings(struct ieee80211_hw *dev)
 }
 
 static const struct ieee80211_ops adm8211_ops = {
+	.add_chanctx = ieee80211_emulate_add_chanctx,
+	.remove_chanctx = ieee80211_emulate_remove_chanctx,
+	.change_chanctx = ieee80211_emulate_change_chanctx,
+	.switch_vif_chanctx = ieee80211_emulate_switch_vif_chanctx,
 	.tx			= adm8211_tx,
 	.wake_tx_queue		= ieee80211_handle_wake_tx_queue,
 	.start			= adm8211_start,
diff --git a/drivers/net/wireless/ath/ar5523/ar5523.c b/drivers/net/wireless/ath/ar5523/ar5523.c
index a742cec44e3d..815f8f599f5d 100644
--- a/drivers/net/wireless/ath/ar5523/ar5523.c
+++ b/drivers/net/wireless/ath/ar5523/ar5523.c
@@ -1358,6 +1358,10 @@ static void ar5523_configure_filter(struct ieee80211_hw *hw,
 }
 
 static const struct ieee80211_ops ar5523_ops = {
+	.add_chanctx = ieee80211_emulate_add_chanctx,
+	.remove_chanctx = ieee80211_emulate_remove_chanctx,
+	.change_chanctx = ieee80211_emulate_change_chanctx,
+	.switch_vif_chanctx = ieee80211_emulate_switch_vif_chanctx,
 	.start			= ar5523_start,
 	.stop			= ar5523_stop,
 	.tx			= ar5523_tx,
diff --git a/drivers/net/wireless/ath/ath5k/mac80211-ops.c b/drivers/net/wireless/ath/ath5k/mac80211-ops.c
index c630343ca4f9..eea4bda77608 100644
--- a/drivers/net/wireless/ath/ath5k/mac80211-ops.c
+++ b/drivers/net/wireless/ath/ath5k/mac80211-ops.c
@@ -779,6 +779,10 @@ static int ath5k_set_ringparam(struct ieee80211_hw *hw, u32 tx, u32 rx)
 
 
 const struct ieee80211_ops ath5k_hw_ops = {
+	.add_chanctx = ieee80211_emulate_add_chanctx,
+	.remove_chanctx = ieee80211_emulate_remove_chanctx,
+	.change_chanctx = ieee80211_emulate_change_chanctx,
+	.switch_vif_chanctx = ieee80211_emulate_switch_vif_chanctx,
 	.tx			= ath5k_tx,
 	.wake_tx_queue		= ieee80211_handle_wake_tx_queue,
 	.start			= ath5k_start,
diff --git a/drivers/net/wireless/ath/ath9k/htc_drv_main.c b/drivers/net/wireless/ath/ath9k/htc_drv_main.c
index 9a9b5212051a..b389e19381c4 100644
--- a/drivers/net/wireless/ath/ath9k/htc_drv_main.c
+++ b/drivers/net/wireless/ath/ath9k/htc_drv_main.c
@@ -1868,6 +1868,10 @@ static void ath9k_htc_channel_switch_beacon(struct ieee80211_hw *hw,
 }
 
 struct ieee80211_ops ath9k_htc_ops = {
+	.add_chanctx = ieee80211_emulate_add_chanctx,
+	.remove_chanctx = ieee80211_emulate_remove_chanctx,
+	.change_chanctx = ieee80211_emulate_change_chanctx,
+	.switch_vif_chanctx = ieee80211_emulate_switch_vif_chanctx,
 	.tx                 = ath9k_htc_tx,
 	.wake_tx_queue      = ieee80211_handle_wake_tx_queue,
 	.start              = ath9k_htc_start,
diff --git a/drivers/net/wireless/ath/ath9k/main.c b/drivers/net/wireless/ath/ath9k/main.c
index c48ff0ffbfef..a2943aaecb20 100644
--- a/drivers/net/wireless/ath/ath9k/main.c
+++ b/drivers/net/wireless/ath/ath9k/main.c
@@ -2786,6 +2786,10 @@ static int ath9k_get_txpower(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 }
 
 struct ieee80211_ops ath9k_ops = {
+	.add_chanctx = ieee80211_emulate_add_chanctx,
+	.remove_chanctx = ieee80211_emulate_remove_chanctx,
+	.change_chanctx = ieee80211_emulate_change_chanctx,
+	.switch_vif_chanctx = ieee80211_emulate_switch_vif_chanctx,
 	.tx 		    = ath9k_tx,
 	.start 		    = ath9k_start,
 	.stop 		    = ath9k_stop,
diff --git a/drivers/net/wireless/ath/carl9170/main.c b/drivers/net/wireless/ath/carl9170/main.c
index 524327d24964..7e7797bf44b7 100644
--- a/drivers/net/wireless/ath/carl9170/main.c
+++ b/drivers/net/wireless/ath/carl9170/main.c
@@ -1712,6 +1712,10 @@ static bool carl9170_tx_frames_pending(struct ieee80211_hw *hw)
 }
 
 static const struct ieee80211_ops carl9170_ops = {
+	.add_chanctx = ieee80211_emulate_add_chanctx,
+	.remove_chanctx = ieee80211_emulate_remove_chanctx,
+	.change_chanctx = ieee80211_emulate_change_chanctx,
+	.switch_vif_chanctx = ieee80211_emulate_switch_vif_chanctx,
 	.start			= carl9170_op_start,
 	.stop			= carl9170_op_stop,
 	.tx			= carl9170_op_tx,
diff --git a/drivers/net/wireless/ath/wcn36xx/main.c b/drivers/net/wireless/ath/wcn36xx/main.c
index 4e6b4df8562f..bfbd3c7a70b3 100644
--- a/drivers/net/wireless/ath/wcn36xx/main.c
+++ b/drivers/net/wireless/ath/wcn36xx/main.c
@@ -1347,6 +1347,10 @@ static void wcn36xx_sta_statistics(struct ieee80211_hw *hw, struct ieee80211_vif
 }
 
 static const struct ieee80211_ops wcn36xx_ops = {
+	.add_chanctx = ieee80211_emulate_add_chanctx,
+	.remove_chanctx = ieee80211_emulate_remove_chanctx,
+	.change_chanctx = ieee80211_emulate_change_chanctx,
+	.switch_vif_chanctx = ieee80211_emulate_switch_vif_chanctx,
 	.start			= wcn36xx_start,
 	.stop			= wcn36xx_stop,
 	.add_interface		= wcn36xx_add_interface,
diff --git a/drivers/net/wireless/atmel/at76c50x-usb.c b/drivers/net/wireless/atmel/at76c50x-usb.c
index 447b51cff8f9..0b55a272bfd6 100644
--- a/drivers/net/wireless/atmel/at76c50x-usb.c
+++ b/drivers/net/wireless/atmel/at76c50x-usb.c
@@ -2178,6 +2178,10 @@ static int at76_set_key(struct ieee80211_hw *hw, enum set_key_cmd cmd,
 }
 
 static const struct ieee80211_ops at76_ops = {
+	.add_chanctx = ieee80211_emulate_add_chanctx,
+	.remove_chanctx = ieee80211_emulate_remove_chanctx,
+	.change_chanctx = ieee80211_emulate_change_chanctx,
+	.switch_vif_chanctx = ieee80211_emulate_switch_vif_chanctx,
 	.tx = at76_mac80211_tx,
 	.wake_tx_queue = ieee80211_handle_wake_tx_queue,
 	.add_interface = at76_add_interface,
diff --git a/drivers/net/wireless/broadcom/b43/main.c b/drivers/net/wireless/broadcom/b43/main.c
index effb6c23f825..badb2f494035 100644
--- a/drivers/net/wireless/broadcom/b43/main.c
+++ b/drivers/net/wireless/broadcom/b43/main.c
@@ -5172,6 +5172,10 @@ static int b43_op_get_survey(struct ieee80211_hw *hw, int idx,
 }
 
 static const struct ieee80211_ops b43_hw_ops = {
+	.add_chanctx = ieee80211_emulate_add_chanctx,
+	.remove_chanctx = ieee80211_emulate_remove_chanctx,
+	.change_chanctx = ieee80211_emulate_change_chanctx,
+	.switch_vif_chanctx = ieee80211_emulate_switch_vif_chanctx,
 	.tx			= b43_op_tx,
 	.wake_tx_queue		= ieee80211_handle_wake_tx_queue,
 	.conf_tx		= b43_op_conf_tx,
diff --git a/drivers/net/wireless/broadcom/b43legacy/main.c b/drivers/net/wireless/broadcom/b43legacy/main.c
index 760136638a95..18eb610f600a 100644
--- a/drivers/net/wireless/broadcom/b43legacy/main.c
+++ b/drivers/net/wireless/broadcom/b43legacy/main.c
@@ -3531,6 +3531,10 @@ static int b43legacy_op_get_survey(struct ieee80211_hw *hw, int idx,
 }
 
 static const struct ieee80211_ops b43legacy_hw_ops = {
+	.add_chanctx = ieee80211_emulate_add_chanctx,
+	.remove_chanctx = ieee80211_emulate_remove_chanctx,
+	.change_chanctx = ieee80211_emulate_change_chanctx,
+	.switch_vif_chanctx = ieee80211_emulate_switch_vif_chanctx,
 	.tx			= b43legacy_op_tx,
 	.wake_tx_queue		= ieee80211_handle_wake_tx_queue,
 	.conf_tx		= b43legacy_op_conf_tx,
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c
index 543e93ec49d2..92860dc0a92e 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c
@@ -959,6 +959,10 @@ static int brcms_ops_beacon_set_tim(struct ieee80211_hw *hw,
 }
 
 static const struct ieee80211_ops brcms_ops = {
+	.add_chanctx = ieee80211_emulate_add_chanctx,
+	.remove_chanctx = ieee80211_emulate_remove_chanctx,
+	.change_chanctx = ieee80211_emulate_change_chanctx,
+	.switch_vif_chanctx = ieee80211_emulate_switch_vif_chanctx,
 	.tx = brcms_ops_tx,
 	.wake_tx_queue = ieee80211_handle_wake_tx_queue,
 	.start = brcms_ops_start,
diff --git a/drivers/net/wireless/intel/iwlegacy/3945-mac.c b/drivers/net/wireless/intel/iwlegacy/3945-mac.c
index 9eaf5ec133f9..075b705a8d7b 100644
--- a/drivers/net/wireless/intel/iwlegacy/3945-mac.c
+++ b/drivers/net/wireless/intel/iwlegacy/3945-mac.c
@@ -3432,6 +3432,10 @@ static const struct attribute_group il3945_attribute_group = {
 };
 
 static struct ieee80211_ops il3945_mac_ops __ro_after_init = {
+	.add_chanctx = ieee80211_emulate_add_chanctx,
+	.remove_chanctx = ieee80211_emulate_remove_chanctx,
+	.change_chanctx = ieee80211_emulate_change_chanctx,
+	.switch_vif_chanctx = ieee80211_emulate_switch_vif_chanctx,
 	.tx = il3945_mac_tx,
 	.wake_tx_queue = ieee80211_handle_wake_tx_queue,
 	.start = il3945_mac_start,
diff --git a/drivers/net/wireless/intel/iwlegacy/4965-mac.c b/drivers/net/wireless/intel/iwlegacy/4965-mac.c
index 70e420df1643..4beb7be6d51d 100644
--- a/drivers/net/wireless/intel/iwlegacy/4965-mac.c
+++ b/drivers/net/wireless/intel/iwlegacy/4965-mac.c
@@ -6301,6 +6301,10 @@ il4965_tx_queue_set_status(struct il_priv *il, struct il_tx_queue *txq,
 }
 
 static const struct ieee80211_ops il4965_mac_ops = {
+	.add_chanctx = ieee80211_emulate_add_chanctx,
+	.remove_chanctx = ieee80211_emulate_remove_chanctx,
+	.change_chanctx = ieee80211_emulate_change_chanctx,
+	.switch_vif_chanctx = ieee80211_emulate_switch_vif_chanctx,
 	.tx = il4965_mac_tx,
 	.wake_tx_queue = ieee80211_handle_wake_tx_queue,
 	.start = il4965_mac_start,
diff --git a/drivers/net/wireless/intel/iwlwifi/dvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/dvm/mac80211.c
index 5f3d5b15f727..52b008ce53bd 100644
--- a/drivers/net/wireless/intel/iwlwifi/dvm/mac80211.c
+++ b/drivers/net/wireless/intel/iwlwifi/dvm/mac80211.c
@@ -1570,6 +1570,10 @@ static void iwlagn_mac_sta_notify(struct ieee80211_hw *hw,
 }
 
 const struct ieee80211_ops iwlagn_hw_ops = {
+	.add_chanctx = ieee80211_emulate_add_chanctx,
+	.remove_chanctx = ieee80211_emulate_remove_chanctx,
+	.change_chanctx = ieee80211_emulate_change_chanctx,
+	.switch_vif_chanctx = ieee80211_emulate_switch_vif_chanctx,
 	.tx = iwlagn_mac_tx,
 	.wake_tx_queue = ieee80211_handle_wake_tx_queue,
 	.start = iwlagn_mac_start,
diff --git a/drivers/net/wireless/intersil/p54/main.c b/drivers/net/wireless/intersil/p54/main.c
index c6084683aedd..687841b2fa2a 100644
--- a/drivers/net/wireless/intersil/p54/main.c
+++ b/drivers/net/wireless/intersil/p54/main.c
@@ -704,6 +704,10 @@ static void p54_set_coverage_class(struct ieee80211_hw *dev,
 }
 
 static const struct ieee80211_ops p54_ops = {
+	.add_chanctx = ieee80211_emulate_add_chanctx,
+	.remove_chanctx = ieee80211_emulate_remove_chanctx,
+	.change_chanctx = ieee80211_emulate_change_chanctx,
+	.switch_vif_chanctx = ieee80211_emulate_switch_vif_chanctx,
 	.tx			= p54_tx_80211,
 	.wake_tx_queue		= ieee80211_handle_wake_tx_queue,
 	.start			= p54_start,
diff --git a/drivers/net/wireless/marvell/libertas_tf/main.c b/drivers/net/wireless/marvell/libertas_tf/main.c
index 199d33ed3bb9..9cca69fe04d7 100644
--- a/drivers/net/wireless/marvell/libertas_tf/main.c
+++ b/drivers/net/wireless/marvell/libertas_tf/main.c
@@ -473,6 +473,10 @@ static int lbtf_op_get_survey(struct ieee80211_hw *hw, int idx,
 }
 
 static const struct ieee80211_ops lbtf_ops = {
+	.add_chanctx = ieee80211_emulate_add_chanctx,
+	.remove_chanctx = ieee80211_emulate_remove_chanctx,
+	.change_chanctx = ieee80211_emulate_change_chanctx,
+	.switch_vif_chanctx = ieee80211_emulate_switch_vif_chanctx,
 	.tx			= lbtf_op_tx,
 	.wake_tx_queue		= ieee80211_handle_wake_tx_queue,
 	.start			= lbtf_op_start,
diff --git a/drivers/net/wireless/marvell/mwl8k.c b/drivers/net/wireless/marvell/mwl8k.c
index 13bcb123d122..ce8fea76dbb2 100644
--- a/drivers/net/wireless/marvell/mwl8k.c
+++ b/drivers/net/wireless/marvell/mwl8k.c
@@ -5610,6 +5610,10 @@ static void mwl8k_sw_scan_complete(struct ieee80211_hw *hw,
 }
 
 static const struct ieee80211_ops mwl8k_ops = {
+	.add_chanctx = ieee80211_emulate_add_chanctx,
+	.remove_chanctx = ieee80211_emulate_remove_chanctx,
+	.change_chanctx = ieee80211_emulate_change_chanctx,
+	.switch_vif_chanctx = ieee80211_emulate_switch_vif_chanctx,
 	.tx			= mwl8k_tx,
 	.wake_tx_queue		= ieee80211_handle_wake_tx_queue,
 	.start			= mwl8k_start,
diff --git a/drivers/net/wireless/mediatek/mt76/mt7603/main.c b/drivers/net/wireless/mediatek/mt76/mt7603/main.c
index e2146d30e553..9b49267b1eab 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7603/main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7603/main.c
@@ -701,6 +701,10 @@ static void mt7603_tx(struct ieee80211_hw *hw,
 }
 
 const struct ieee80211_ops mt7603_ops = {
+	.add_chanctx = ieee80211_emulate_add_chanctx,
+	.remove_chanctx = ieee80211_emulate_remove_chanctx,
+	.change_chanctx = ieee80211_emulate_change_chanctx,
+	.switch_vif_chanctx = ieee80211_emulate_switch_vif_chanctx,
 	.tx = mt7603_tx,
 	.start = mt7603_start,
 	.stop = mt7603_stop,
diff --git a/drivers/net/wireless/mediatek/mt76/mt76x0/pci.c b/drivers/net/wireless/mediatek/mt76/mt76x0/pci.c
index 293e66fa83d5..79b7996ad1a8 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76x0/pci.c
+++ b/drivers/net/wireless/mediatek/mt76/mt76x0/pci.c
@@ -59,6 +59,10 @@ mt76x0e_flush(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 }
 
 static const struct ieee80211_ops mt76x0e_ops = {
+	.add_chanctx = ieee80211_emulate_add_chanctx,
+	.remove_chanctx = ieee80211_emulate_remove_chanctx,
+	.change_chanctx = ieee80211_emulate_change_chanctx,
+	.switch_vif_chanctx = ieee80211_emulate_switch_vif_chanctx,
 	.tx = mt76x02_tx,
 	.start = mt76x0e_start,
 	.stop = mt76x0e_stop,
diff --git a/drivers/net/wireless/mediatek/mt76/mt76x0/usb.c b/drivers/net/wireless/mediatek/mt76/mt76x0/usb.c
index dd042949cf82..bba44f289b4e 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76x0/usb.c
+++ b/drivers/net/wireless/mediatek/mt76/mt76x0/usb.c
@@ -118,6 +118,10 @@ static int mt76x0u_start(struct ieee80211_hw *hw)
 }
 
 static const struct ieee80211_ops mt76x0u_ops = {
+	.add_chanctx = ieee80211_emulate_add_chanctx,
+	.remove_chanctx = ieee80211_emulate_remove_chanctx,
+	.change_chanctx = ieee80211_emulate_change_chanctx,
+	.switch_vif_chanctx = ieee80211_emulate_switch_vif_chanctx,
 	.tx = mt76x02_tx,
 	.start = mt76x0u_start,
 	.stop = mt76x0u_stop,
diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2/pci_main.c b/drivers/net/wireless/mediatek/mt76/mt76x2/pci_main.c
index b38bb7a2362b..bfc8c69f43fa 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76x2/pci_main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt76x2/pci_main.c
@@ -132,6 +132,10 @@ static int mt76x2_set_antenna(struct ieee80211_hw *hw, u32 tx_ant,
 }
 
 const struct ieee80211_ops mt76x2_ops = {
+	.add_chanctx = ieee80211_emulate_add_chanctx,
+	.remove_chanctx = ieee80211_emulate_remove_chanctx,
+	.change_chanctx = ieee80211_emulate_change_chanctx,
+	.switch_vif_chanctx = ieee80211_emulate_switch_vif_chanctx,
 	.tx = mt76x02_tx,
 	.start = mt76x2_start,
 	.stop = mt76x2_stop,
diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2/usb_main.c b/drivers/net/wireless/mediatek/mt76/mt76x2/usb_main.c
index ac07ed1f63a3..9fe390fdd730 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76x2/usb_main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt76x2/usb_main.c
@@ -103,6 +103,10 @@ mt76x2u_config(struct ieee80211_hw *hw, u32 changed)
 }
 
 const struct ieee80211_ops mt76x2u_ops = {
+	.add_chanctx = ieee80211_emulate_add_chanctx,
+	.remove_chanctx = ieee80211_emulate_remove_chanctx,
+	.change_chanctx = ieee80211_emulate_change_chanctx,
+	.switch_vif_chanctx = ieee80211_emulate_switch_vif_chanctx,
 	.tx = mt76x02_tx,
 	.start = mt76x2u_start,
 	.stop = mt76x2u_stop,
diff --git a/drivers/net/wireless/mediatek/mt76/mt792x_core.c b/drivers/net/wireless/mediatek/mt76/mt792x_core.c
index c42101aa9e45..7872fbae7252 100644
--- a/drivers/net/wireless/mediatek/mt76/mt792x_core.c
+++ b/drivers/net/wireless/mediatek/mt76/mt792x_core.c
@@ -684,9 +684,10 @@ mt792x_get_mac80211_ops(struct device *dev,
 	if (!(*fw_features & MT792x_FW_CAP_CNM)) {
 		ops->remain_on_channel = NULL;
 		ops->cancel_remain_on_channel = NULL;
-		ops->add_chanctx = NULL;
-		ops->remove_chanctx = NULL;
-		ops->change_chanctx = NULL;
+		ops->add_chanctx = ieee80211_emulate_add_chanctx;
+		ops->remove_chanctx = ieee80211_emulate_remove_chanctx;
+		ops->change_chanctx = ieee80211_emulate_change_chanctx;
+		ops->switch_vif_chanctx = ieee80211_emulate_switch_vif_chanctx;
 		ops->assign_vif_chanctx = NULL;
 		ops->unassign_vif_chanctx = NULL;
 		ops->mgd_prepare_tx = NULL;
diff --git a/drivers/net/wireless/mediatek/mt76/mt7996/main.c b/drivers/net/wireless/mediatek/mt76/mt7996/main.c
index 51deea84b642..234e6495871b 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7996/main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7996/main.c
@@ -1450,6 +1450,10 @@ mt7996_net_fill_forward_path(struct ieee80211_hw *hw,
 #endif
 
 const struct ieee80211_ops mt7996_ops = {
+	.add_chanctx = ieee80211_emulate_add_chanctx,
+	.remove_chanctx = ieee80211_emulate_remove_chanctx,
+	.change_chanctx = ieee80211_emulate_change_chanctx,
+	.switch_vif_chanctx = ieee80211_emulate_switch_vif_chanctx,
 	.tx = mt7996_tx,
 	.start = mt7996_start,
 	.stop = mt7996_stop,
diff --git a/drivers/net/wireless/mediatek/mt7601u/main.c b/drivers/net/wireless/mediatek/mt7601u/main.c
index c8d332456a6b..a7330576486b 100644
--- a/drivers/net/wireless/mediatek/mt7601u/main.c
+++ b/drivers/net/wireless/mediatek/mt7601u/main.c
@@ -405,6 +405,10 @@ out:
 }
 
 const struct ieee80211_ops mt7601u_ops = {
+	.add_chanctx = ieee80211_emulate_add_chanctx,
+	.remove_chanctx = ieee80211_emulate_remove_chanctx,
+	.change_chanctx = ieee80211_emulate_change_chanctx,
+	.switch_vif_chanctx = ieee80211_emulate_switch_vif_chanctx,
 	.tx = mt7601u_tx,
 	.wake_tx_queue = ieee80211_handle_wake_tx_queue,
 	.start = mt7601u_start,
diff --git a/drivers/net/wireless/purelifi/plfxlc/mac.c b/drivers/net/wireless/purelifi/plfxlc/mac.c
index 6f5857d09af0..641f847d47ab 100644
--- a/drivers/net/wireless/purelifi/plfxlc/mac.c
+++ b/drivers/net/wireless/purelifi/plfxlc/mac.c
@@ -684,6 +684,10 @@ static int plfxlc_set_rts_threshold(struct ieee80211_hw *hw, u32 value)
 }
 
 static const struct ieee80211_ops plfxlc_ops = {
+	.add_chanctx = ieee80211_emulate_add_chanctx,
+	.remove_chanctx = ieee80211_emulate_remove_chanctx,
+	.change_chanctx = ieee80211_emulate_change_chanctx,
+	.switch_vif_chanctx = ieee80211_emulate_switch_vif_chanctx,
 	.tx = plfxlc_op_tx,
 	.wake_tx_queue = ieee80211_handle_wake_tx_queue,
 	.start = plfxlc_op_start,
diff --git a/drivers/net/wireless/ralink/rt2x00/rt2400pci.c b/drivers/net/wireless/ralink/rt2x00/rt2400pci.c
index 13dd672b825e..42e21e9f303b 100644
--- a/drivers/net/wireless/ralink/rt2x00/rt2400pci.c
+++ b/drivers/net/wireless/ralink/rt2x00/rt2400pci.c
@@ -1705,6 +1705,10 @@ static int rt2400pci_tx_last_beacon(struct ieee80211_hw *hw)
 }
 
 static const struct ieee80211_ops rt2400pci_mac80211_ops = {
+	.add_chanctx = ieee80211_emulate_add_chanctx,
+	.remove_chanctx = ieee80211_emulate_remove_chanctx,
+	.change_chanctx = ieee80211_emulate_change_chanctx,
+	.switch_vif_chanctx = ieee80211_emulate_switch_vif_chanctx,
 	.tx			= rt2x00mac_tx,
 	.wake_tx_queue		= ieee80211_handle_wake_tx_queue,
 	.start			= rt2x00mac_start,
diff --git a/drivers/net/wireless/ralink/rt2x00/rt2500pci.c b/drivers/net/wireless/ralink/rt2x00/rt2500pci.c
index ecddda4c471e..36ddc5a69fa4 100644
--- a/drivers/net/wireless/ralink/rt2x00/rt2500pci.c
+++ b/drivers/net/wireless/ralink/rt2x00/rt2500pci.c
@@ -2003,6 +2003,10 @@ static int rt2500pci_tx_last_beacon(struct ieee80211_hw *hw)
 }
 
 static const struct ieee80211_ops rt2500pci_mac80211_ops = {
+	.add_chanctx = ieee80211_emulate_add_chanctx,
+	.remove_chanctx = ieee80211_emulate_remove_chanctx,
+	.change_chanctx = ieee80211_emulate_change_chanctx,
+	.switch_vif_chanctx = ieee80211_emulate_switch_vif_chanctx,
 	.tx			= rt2x00mac_tx,
 	.wake_tx_queue		= ieee80211_handle_wake_tx_queue,
 	.start			= rt2x00mac_start,
diff --git a/drivers/net/wireless/ralink/rt2x00/rt2500usb.c b/drivers/net/wireless/ralink/rt2x00/rt2500usb.c
index 13fdcff0ad66..09923765e2db 100644
--- a/drivers/net/wireless/ralink/rt2x00/rt2500usb.c
+++ b/drivers/net/wireless/ralink/rt2x00/rt2500usb.c
@@ -1794,6 +1794,10 @@ static int rt2500usb_probe_hw(struct rt2x00_dev *rt2x00dev)
 }
 
 static const struct ieee80211_ops rt2500usb_mac80211_ops = {
+	.add_chanctx = ieee80211_emulate_add_chanctx,
+	.remove_chanctx = ieee80211_emulate_remove_chanctx,
+	.change_chanctx = ieee80211_emulate_change_chanctx,
+	.switch_vif_chanctx = ieee80211_emulate_switch_vif_chanctx,
 	.tx			= rt2x00mac_tx,
 	.wake_tx_queue		= ieee80211_handle_wake_tx_queue,
 	.start			= rt2x00mac_start,
diff --git a/drivers/net/wireless/ralink/rt2x00/rt2800pci.c b/drivers/net/wireless/ralink/rt2x00/rt2800pci.c
index dcb56f708a5f..14c45aba836f 100644
--- a/drivers/net/wireless/ralink/rt2x00/rt2800pci.c
+++ b/drivers/net/wireless/ralink/rt2x00/rt2800pci.c
@@ -287,6 +287,10 @@ static int rt2800pci_read_eeprom(struct rt2x00_dev *rt2x00dev)
 }
 
 static const struct ieee80211_ops rt2800pci_mac80211_ops = {
+	.add_chanctx = ieee80211_emulate_add_chanctx,
+	.remove_chanctx = ieee80211_emulate_remove_chanctx,
+	.change_chanctx = ieee80211_emulate_change_chanctx,
+	.switch_vif_chanctx = ieee80211_emulate_switch_vif_chanctx,
 	.tx			= rt2x00mac_tx,
 	.wake_tx_queue		= ieee80211_handle_wake_tx_queue,
 	.start			= rt2x00mac_start,
diff --git a/drivers/net/wireless/ralink/rt2x00/rt2800soc.c b/drivers/net/wireless/ralink/rt2x00/rt2800soc.c
index 7118d4f9038d..701ba54bf3e5 100644
--- a/drivers/net/wireless/ralink/rt2x00/rt2800soc.c
+++ b/drivers/net/wireless/ralink/rt2x00/rt2800soc.c
@@ -132,6 +132,10 @@ static int rt2800soc_write_firmware(struct rt2x00_dev *rt2x00dev,
 }
 
 static const struct ieee80211_ops rt2800soc_mac80211_ops = {
+	.add_chanctx = ieee80211_emulate_add_chanctx,
+	.remove_chanctx = ieee80211_emulate_remove_chanctx,
+	.change_chanctx = ieee80211_emulate_change_chanctx,
+	.switch_vif_chanctx = ieee80211_emulate_switch_vif_chanctx,
 	.tx			= rt2x00mac_tx,
 	.wake_tx_queue		= ieee80211_handle_wake_tx_queue,
 	.start			= rt2x00mac_start,
diff --git a/drivers/net/wireless/ralink/rt2x00/rt2800usb.c b/drivers/net/wireless/ralink/rt2x00/rt2800usb.c
index b2a8e75a901b..160bef79acdb 100644
--- a/drivers/net/wireless/ralink/rt2x00/rt2800usb.c
+++ b/drivers/net/wireless/ralink/rt2x00/rt2800usb.c
@@ -629,6 +629,10 @@ static int rt2800usb_probe_hw(struct rt2x00_dev *rt2x00dev)
 }
 
 static const struct ieee80211_ops rt2800usb_mac80211_ops = {
+	.add_chanctx = ieee80211_emulate_add_chanctx,
+	.remove_chanctx = ieee80211_emulate_remove_chanctx,
+	.change_chanctx = ieee80211_emulate_change_chanctx,
+	.switch_vif_chanctx = ieee80211_emulate_switch_vif_chanctx,
 	.tx			= rt2x00mac_tx,
 	.wake_tx_queue		= ieee80211_handle_wake_tx_queue,
 	.start			= rt2x00mac_start,
diff --git a/drivers/net/wireless/ralink/rt2x00/rt61pci.c b/drivers/net/wireless/ralink/rt2x00/rt61pci.c
index 483723bf514b..d1cd5694e3c7 100644
--- a/drivers/net/wireless/ralink/rt2x00/rt61pci.c
+++ b/drivers/net/wireless/ralink/rt2x00/rt61pci.c
@@ -2872,6 +2872,10 @@ static u64 rt61pci_get_tsf(struct ieee80211_hw *hw, struct ieee80211_vif *vif)
 }
 
 static const struct ieee80211_ops rt61pci_mac80211_ops = {
+	.add_chanctx = ieee80211_emulate_add_chanctx,
+	.remove_chanctx = ieee80211_emulate_remove_chanctx,
+	.change_chanctx = ieee80211_emulate_change_chanctx,
+	.switch_vif_chanctx = ieee80211_emulate_switch_vif_chanctx,
 	.tx			= rt2x00mac_tx,
 	.wake_tx_queue		= ieee80211_handle_wake_tx_queue,
 	.start			= rt2x00mac_start,
diff --git a/drivers/net/wireless/ralink/rt2x00/rt73usb.c b/drivers/net/wireless/ralink/rt2x00/rt73usb.c
index dfa9d5213898..b79dda952a33 100644
--- a/drivers/net/wireless/ralink/rt2x00/rt73usb.c
+++ b/drivers/net/wireless/ralink/rt2x00/rt73usb.c
@@ -2291,6 +2291,10 @@ static u64 rt73usb_get_tsf(struct ieee80211_hw *hw, struct ieee80211_vif *vif)
 }
 
 static const struct ieee80211_ops rt73usb_mac80211_ops = {
+	.add_chanctx = ieee80211_emulate_add_chanctx,
+	.remove_chanctx = ieee80211_emulate_remove_chanctx,
+	.change_chanctx = ieee80211_emulate_change_chanctx,
+	.switch_vif_chanctx = ieee80211_emulate_switch_vif_chanctx,
 	.tx			= rt2x00mac_tx,
 	.wake_tx_queue		= ieee80211_handle_wake_tx_queue,
 	.start			= rt2x00mac_start,
diff --git a/drivers/net/wireless/realtek/rtl818x/rtl8180/dev.c b/drivers/net/wireless/realtek/rtl818x/rtl8180/dev.c
index f6c25a52b69a..77b6cb7e1f6b 100644
--- a/drivers/net/wireless/realtek/rtl818x/rtl8180/dev.c
+++ b/drivers/net/wireless/realtek/rtl818x/rtl8180/dev.c
@@ -1607,6 +1607,10 @@ static void rtl8180_configure_filter(struct ieee80211_hw *dev,
 }
 
 static const struct ieee80211_ops rtl8180_ops = {
+	.add_chanctx = ieee80211_emulate_add_chanctx,
+	.remove_chanctx = ieee80211_emulate_remove_chanctx,
+	.change_chanctx = ieee80211_emulate_change_chanctx,
+	.switch_vif_chanctx = ieee80211_emulate_switch_vif_chanctx,
 	.tx			= rtl8180_tx,
 	.wake_tx_queue		= ieee80211_handle_wake_tx_queue,
 	.start			= rtl8180_start,
diff --git a/drivers/net/wireless/realtek/rtl818x/rtl8187/dev.c b/drivers/net/wireless/realtek/rtl818x/rtl8187/dev.c
index 04945f905d6d..78d99afa373d 100644
--- a/drivers/net/wireless/realtek/rtl818x/rtl8187/dev.c
+++ b/drivers/net/wireless/realtek/rtl818x/rtl8187/dev.c
@@ -1377,6 +1377,10 @@ static int rtl8187_conf_tx(struct ieee80211_hw *dev,
 
 
 static const struct ieee80211_ops rtl8187_ops = {
+	.add_chanctx = ieee80211_emulate_add_chanctx,
+	.remove_chanctx = ieee80211_emulate_remove_chanctx,
+	.change_chanctx = ieee80211_emulate_change_chanctx,
+	.switch_vif_chanctx = ieee80211_emulate_switch_vif_chanctx,
 	.tx			= rtl8187_tx,
 	.wake_tx_queue		= ieee80211_handle_wake_tx_queue,
 	.start			= rtl8187_start,
diff --git a/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c b/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c
index 055f66b623ff..b0c1db726d7a 100644
--- a/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c
+++ b/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c
@@ -7654,6 +7654,10 @@ static int rtl8xxxu_sta_remove(struct ieee80211_hw *hw,
 }
 
 static const struct ieee80211_ops rtl8xxxu_ops = {
+	.add_chanctx = ieee80211_emulate_add_chanctx,
+	.remove_chanctx = ieee80211_emulate_remove_chanctx,
+	.change_chanctx = ieee80211_emulate_change_chanctx,
+	.switch_vif_chanctx = ieee80211_emulate_switch_vif_chanctx,
 	.tx = rtl8xxxu_tx,
 	.wake_tx_queue = ieee80211_handle_wake_tx_queue,
 	.add_interface = rtl8xxxu_add_interface,
diff --git a/drivers/net/wireless/realtek/rtlwifi/core.c b/drivers/net/wireless/realtek/rtlwifi/core.c
index 69e97647e3d6..2e60a6991ca1 100644
--- a/drivers/net/wireless/realtek/rtlwifi/core.c
+++ b/drivers/net/wireless/realtek/rtlwifi/core.c
@@ -1903,6 +1903,10 @@ void rtl_init_sw_leds(struct ieee80211_hw *hw)
 EXPORT_SYMBOL(rtl_init_sw_leds);
 
 const struct ieee80211_ops rtl_ops = {
+	.add_chanctx = ieee80211_emulate_add_chanctx,
+	.remove_chanctx = ieee80211_emulate_remove_chanctx,
+	.change_chanctx = ieee80211_emulate_change_chanctx,
+	.switch_vif_chanctx = ieee80211_emulate_switch_vif_chanctx,
 	.start = rtl_op_start,
 	.stop = rtl_op_stop,
 	.tx = rtl_op_tx,
diff --git a/drivers/net/wireless/realtek/rtw88/mac80211.c b/drivers/net/wireless/realtek/rtw88/mac80211.c
index d8d68f16014e..7af5bf7fe5b6 100644
--- a/drivers/net/wireless/realtek/rtw88/mac80211.c
+++ b/drivers/net/wireless/realtek/rtw88/mac80211.c
@@ -927,6 +927,10 @@ static void rtw_ops_sta_rc_update(struct ieee80211_hw *hw,
 }
 
 const struct ieee80211_ops rtw_ops = {
+	.add_chanctx = ieee80211_emulate_add_chanctx,
+	.remove_chanctx = ieee80211_emulate_remove_chanctx,
+	.change_chanctx = ieee80211_emulate_change_chanctx,
+	.switch_vif_chanctx = ieee80211_emulate_switch_vif_chanctx,
 	.tx			= rtw_ops_tx,
 	.wake_tx_queue		= rtw_ops_wake_tx_queue,
 	.start			= rtw_ops_start,
diff --git a/drivers/net/wireless/realtek/rtw89/core.c b/drivers/net/wireless/realtek/rtw89/core.c
index 61a216464b6d..650c507c8ed3 100644
--- a/drivers/net/wireless/realtek/rtw89/core.c
+++ b/drivers/net/wireless/realtek/rtw89/core.c
@@ -4579,9 +4579,10 @@ struct rtw89_dev *rtw89_alloc_ieee80211_hw(struct device *device,
 		     !RTW89_CHK_FW_FEATURE(BEACON_FILTER, &early_fw);
 
 	if (no_chanctx) {
-		ops->add_chanctx = NULL;
-		ops->remove_chanctx = NULL;
-		ops->change_chanctx = NULL;
+		ops->add_chanctx = ieee80211_emulate_add_chanctx;
+		ops->remove_chanctx = ieee80211_emulate_remove_chanctx;
+		ops->change_chanctx = ieee80211_emulate_change_chanctx;
+		ops->switch_vif_chanctx = ieee80211_emulate_switch_vif_chanctx;
 		ops->assign_vif_chanctx = NULL;
 		ops->unassign_vif_chanctx = NULL;
 		ops->remain_on_channel = NULL;
diff --git a/drivers/net/wireless/rsi/rsi_91x_mac80211.c b/drivers/net/wireless/rsi/rsi_91x_mac80211.c
index 05890536e353..e8aeb4d76c13 100644
--- a/drivers/net/wireless/rsi/rsi_91x_mac80211.c
+++ b/drivers/net/wireless/rsi/rsi_91x_mac80211.c
@@ -1957,6 +1957,10 @@ static int rsi_mac80211_resume(struct ieee80211_hw *hw)
 #endif
 
 static const struct ieee80211_ops mac80211_ops = {
+	.add_chanctx = ieee80211_emulate_add_chanctx,
+	.remove_chanctx = ieee80211_emulate_remove_chanctx,
+	.change_chanctx = ieee80211_emulate_change_chanctx,
+	.switch_vif_chanctx = ieee80211_emulate_switch_vif_chanctx,
 	.tx = rsi_mac80211_tx,
 	.wake_tx_queue = ieee80211_handle_wake_tx_queue,
 	.start = rsi_mac80211_start,
diff --git a/drivers/net/wireless/st/cw1200/main.c b/drivers/net/wireless/st/cw1200/main.c
index 381013e0db63..a54a7b86864f 100644
--- a/drivers/net/wireless/st/cw1200/main.c
+++ b/drivers/net/wireless/st/cw1200/main.c
@@ -203,6 +203,10 @@ static const unsigned long cw1200_ttl[] = {
 };
 
 static const struct ieee80211_ops cw1200_ops = {
+	.add_chanctx = ieee80211_emulate_add_chanctx,
+	.remove_chanctx = ieee80211_emulate_remove_chanctx,
+	.change_chanctx = ieee80211_emulate_change_chanctx,
+	.switch_vif_chanctx = ieee80211_emulate_switch_vif_chanctx,
 	.start			= cw1200_start,
 	.stop			= cw1200_stop,
 	.add_interface		= cw1200_add_interface,
diff --git a/drivers/net/wireless/ti/wl1251/main.c b/drivers/net/wireless/ti/wl1251/main.c
index cd9a41f59f32..0da2d29dd7bd 100644
--- a/drivers/net/wireless/ti/wl1251/main.c
+++ b/drivers/net/wireless/ti/wl1251/main.c
@@ -1351,6 +1351,10 @@ static struct ieee80211_supported_band wl1251_band_2ghz = {
 };
 
 static const struct ieee80211_ops wl1251_ops = {
+	.add_chanctx = ieee80211_emulate_add_chanctx,
+	.remove_chanctx = ieee80211_emulate_remove_chanctx,
+	.change_chanctx = ieee80211_emulate_change_chanctx,
+	.switch_vif_chanctx = ieee80211_emulate_switch_vif_chanctx,
 	.start = wl1251_op_start,
 	.stop = wl1251_op_stop,
 	.add_interface = wl1251_op_add_interface,
diff --git a/drivers/net/wireless/virtual/mac80211_hwsim.c b/drivers/net/wireless/virtual/mac80211_hwsim.c
index a06a462d38f0..907c0842fee7 100644
--- a/drivers/net/wireless/virtual/mac80211_hwsim.c
+++ b/drivers/net/wireless/virtual/mac80211_hwsim.c
@@ -3922,6 +3922,10 @@ static const struct ieee80211_ops mac80211_hwsim_ops = {
 	HWSIM_NON_MLO_OPS
 	.sw_scan_start = mac80211_hwsim_sw_scan,
 	.sw_scan_complete = mac80211_hwsim_sw_scan_complete,
+	.add_chanctx = ieee80211_emulate_add_chanctx,
+	.remove_chanctx = ieee80211_emulate_remove_chanctx,
+	.change_chanctx = ieee80211_emulate_change_chanctx,
+	.switch_vif_chanctx = ieee80211_emulate_switch_vif_chanctx,
 };
 
 #define HWSIM_CHANCTX_OPS					\
diff --git a/drivers/net/wireless/zydas/zd1211rw/zd_mac.c b/drivers/net/wireless/zydas/zd1211rw/zd_mac.c
index 5d534e15a844..900c063bd724 100644
--- a/drivers/net/wireless/zydas/zd1211rw/zd_mac.c
+++ b/drivers/net/wireless/zydas/zd1211rw/zd_mac.c
@@ -1343,6 +1343,10 @@ static u64 zd_op_get_tsf(struct ieee80211_hw *hw, struct ieee80211_vif *vif)
 }
 
 static const struct ieee80211_ops zd_ops = {
+	.add_chanctx = ieee80211_emulate_add_chanctx,
+	.remove_chanctx = ieee80211_emulate_remove_chanctx,
+	.change_chanctx = ieee80211_emulate_change_chanctx,
+	.switch_vif_chanctx = ieee80211_emulate_switch_vif_chanctx,
 	.tx			= zd_op_tx,
 	.wake_tx_queue		= ieee80211_handle_wake_tx_queue,
 	.start			= zd_op_start,
diff --git a/drivers/staging/vt6655/device_main.c b/drivers/staging/vt6655/device_main.c
index b0b262de6480..e23a5da2b67e 100644
--- a/drivers/staging/vt6655/device_main.c
+++ b/drivers/staging/vt6655/device_main.c
@@ -1684,6 +1684,10 @@ static void vnt_reset_tsf(struct ieee80211_hw *hw, struct ieee80211_vif *vif)
 }
 
 static const struct ieee80211_ops vnt_mac_ops = {
+	.add_chanctx = ieee80211_emulate_add_chanctx,
+	.remove_chanctx = ieee80211_emulate_remove_chanctx,
+	.change_chanctx = ieee80211_emulate_change_chanctx,
+	.switch_vif_chanctx = ieee80211_emulate_switch_vif_chanctx,
 	.tx			= vnt_tx_80211,
 	.wake_tx_queue		= ieee80211_handle_wake_tx_queue,
 	.start			= vnt_start,
diff --git a/drivers/staging/vt6656/main_usb.c b/drivers/staging/vt6656/main_usb.c
index 2abae90f3f52..6c70493d1b01 100644
--- a/drivers/staging/vt6656/main_usb.c
+++ b/drivers/staging/vt6656/main_usb.c
@@ -956,6 +956,10 @@ static void vnt_reset_tsf(struct ieee80211_hw *hw, struct ieee80211_vif *vif)
 }
 
 static const struct ieee80211_ops vnt_mac_ops = {
+	.add_chanctx = ieee80211_emulate_add_chanctx,
+	.remove_chanctx = ieee80211_emulate_remove_chanctx,
+	.change_chanctx = ieee80211_emulate_change_chanctx,
+	.switch_vif_chanctx = ieee80211_emulate_switch_vif_chanctx,
 	.tx			= vnt_tx_80211,
 	.wake_tx_queue		= ieee80211_handle_wake_tx_queue,
 	.start			= vnt_start,
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 8d6ae22c09bf..62c4b4d10bb4 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -7532,4 +7532,17 @@ int ieee80211_set_active_links(struct ieee80211_vif *vif, u16 active_links);
 void ieee80211_set_active_links_async(struct ieee80211_vif *vif,
 				      u16 active_links);
 
+/* for older drivers - let's not document these ... */
+int ieee80211_emulate_add_chanctx(struct ieee80211_hw *hw,
+				  struct ieee80211_chanctx_conf *ctx);
+void ieee80211_emulate_remove_chanctx(struct ieee80211_hw *hw,
+				      struct ieee80211_chanctx_conf *ctx);
+void ieee80211_emulate_change_chanctx(struct ieee80211_hw *hw,
+				      struct ieee80211_chanctx_conf *ctx,
+				      u32 changed);
+int ieee80211_emulate_switch_vif_chanctx(struct ieee80211_hw *hw,
+					 struct ieee80211_vif_chanctx_switch *vifs,
+					 int n_vifs,
+					 enum ieee80211_chanctx_switch_mode mode);
+
 #endif /* MAC80211_H */
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 6e2acad7fd71..d3bf029709d5 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -886,33 +886,30 @@ static int ieee80211_set_monitor_channel(struct wiphy *wiphy,
 {
 	struct ieee80211_local *local = wiphy_priv(wiphy);
 	struct ieee80211_sub_if_data *sdata;
-	int ret = 0;
+	int ret;
 
 	lockdep_assert_wiphy(local->hw.wiphy);
 
 	if (cfg80211_chandef_identical(&local->monitor_chandef, chandef))
 		return 0;
 
-	if (local->use_chanctx) {
-		sdata = wiphy_dereference(local->hw.wiphy,
-					  local->monitor_sdata);
-		if (sdata) {
-			ieee80211_link_release_channel(&sdata->deflink);
-			ret = ieee80211_link_use_channel(&sdata->deflink,
-							 chandef,
-							 IEEE80211_CHANCTX_EXCLUSIVE);
-		}
-	} else {
-		if (local->open_count == local->monitors) {
-			local->_oper_chandef = *chandef;
-			ieee80211_hw_config(local, 0);
-		}
-	}
+	sdata = wiphy_dereference(local->hw.wiphy,
+				  local->monitor_sdata);
+	if (!sdata)
+		goto done;
 
-	if (ret == 0)
-		local->monitor_chandef = *chandef;
+	if (cfg80211_chandef_identical(&sdata->vif.bss_conf.chandef, chandef))
+		return 0;
 
-	return ret;
+	ieee80211_link_release_channel(&sdata->deflink);
+	ret = ieee80211_link_use_channel(&sdata->deflink,
+					 chandef,
+					 IEEE80211_CHANCTX_EXCLUSIVE);
+	if (ret)
+		return ret;
+done:
+	local->monitor_chandef = *chandef;
+	return 0;
 }
 
 static int
@@ -3086,7 +3083,7 @@ static int ieee80211_get_tx_power(struct wiphy *wiphy,
 	if (local->ops->get_txpower)
 		return drv_get_txpower(local, sdata, dbm);
 
-	if (!local->use_chanctx)
+	if (local->emulate_chanctx)
 		*dbm = local->hw.conf.power_level;
 	else
 		*dbm = sdata->vif.bss_conf.txpower;
@@ -4214,10 +4211,7 @@ static int ieee80211_cfg_get_channel(struct wiphy *wiphy,
 	} else if (local->open_count > 0 &&
 		   local->open_count == local->monitors &&
 		   sdata->vif.type == NL80211_IFTYPE_MONITOR) {
-		if (local->use_chanctx)
-			*chandef = local->monitor_chandef;
-		else
-			*chandef = local->_oper_chandef;
+		*chandef = local->monitor_chandef;
 		ret = 0;
 	}
 out:
diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c
index 70ba5dc4b283..cf6297ffaef3 100644
--- a/net/mac80211/chan.c
+++ b/net/mac80211/chan.c
@@ -519,11 +519,6 @@ static void _ieee80211_change_chanctx(struct ieee80211_local *local,
 
 	drv_change_chanctx(local, ctx, changed);
 
-	if (!local->use_chanctx) {
-		local->_oper_chandef = *chandef;
-		ieee80211_hw_config(local, 0);
-	}
-
 	/* check is BW wider */
 	ieee80211_chan_bw_change(local, old_ctx, false);
 }
@@ -674,23 +669,15 @@ static int ieee80211_add_chanctx(struct ieee80211_local *local,
 
 	ieee80211_add_wbrf(local, &ctx->conf.def);
 
-	if (!local->use_chanctx)
-		local->hw.conf.radar_enabled = ctx->conf.radar_enabled;
-
 	/* turn idle off *before* setting channel -- some drivers need that */
 	changed = ieee80211_idle_off(local);
 	if (changed)
 		ieee80211_hw_config(local, changed);
 
-	if (!local->use_chanctx) {
-		local->_oper_chandef = ctx->conf.def;
-		ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL);
-	} else {
-		err = drv_add_chanctx(local, ctx);
-		if (err) {
-			ieee80211_recalc_idle(local);
-			return err;
-		}
+	err = drv_add_chanctx(local, ctx);
+	if (err) {
+		ieee80211_recalc_idle(local);
+		return err;
 	}
 
 	return 0;
@@ -725,32 +712,7 @@ static void ieee80211_del_chanctx(struct ieee80211_local *local,
 {
 	lockdep_assert_wiphy(local->hw.wiphy);
 
-	if (!local->use_chanctx) {
-		struct cfg80211_chan_def *chandef = &local->_oper_chandef;
-		/* S1G doesn't have 20MHz, so get the correct width for the
-		 * current channel.
-		 */
-		if (chandef->chan->band == NL80211_BAND_S1GHZ)
-			chandef->width =
-				ieee80211_s1g_channel_width(chandef->chan);
-		else
-			chandef->width = NL80211_CHAN_WIDTH_20_NOHT;
-		chandef->center_freq1 = chandef->chan->center_freq;
-		chandef->freq1_offset = chandef->chan->freq_offset;
-		chandef->center_freq2 = 0;
-
-		/* NOTE: Disabling radar is only valid here for
-		 * single channel context. To be sure, check it ...
-		 */
-		WARN_ON(local->hw.conf.radar_enabled &&
-			!list_empty(&local->chanctx_list));
-
-		local->hw.conf.radar_enabled = false;
-
-		ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL);
-	} else {
-		drv_remove_chanctx(local, ctx);
-	}
+	drv_remove_chanctx(local, ctx);
 
 	ieee80211_recalc_idle(local);
 
@@ -849,11 +811,6 @@ static void ieee80211_recalc_radar_chanctx(struct ieee80211_local *local,
 
 	chanctx->conf.radar_enabled = radar_enabled;
 
-	if (!local->use_chanctx) {
-		local->hw.conf.radar_enabled = chanctx->conf.radar_enabled;
-		ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL);
-	}
-
 	drv_change_chanctx(local, chanctx, IEEE80211_CHANCTX_CHANGE_RADAR);
 }
 
@@ -995,16 +952,6 @@ void ieee80211_recalc_smps_chanctx(struct ieee80211_local *local,
 
 	rcu_read_unlock();
 
-	if (!local->use_chanctx) {
-		if (rx_chains_static > 1)
-			local->smps_mode = IEEE80211_SMPS_OFF;
-		else if (rx_chains_dynamic > 1)
-			local->smps_mode = IEEE80211_SMPS_DYNAMIC;
-		else
-			local->smps_mode = IEEE80211_SMPS_STATIC;
-		ieee80211_hw_config(local, 0);
-	}
-
 	if (rx_chains_static == chanctx->conf.rx_chains_static &&
 	    rx_chains_dynamic == chanctx->conf.rx_chains_dynamic)
 		return;
@@ -1114,7 +1061,7 @@ int ieee80211_link_reserve_chanctx(struct ieee80211_link_data *link,
 	lockdep_assert_wiphy(local->hw.wiphy);
 
 	curr_ctx = ieee80211_link_get_chanctx(link);
-	if (curr_ctx && local->use_chanctx && !local->ops->switch_vif_chanctx)
+	if (curr_ctx && !local->ops->switch_vif_chanctx)
 		return -EOPNOTSUPP;
 
 	new_ctx = ieee80211_find_reservation_chanctx(local, chandef, mode);
@@ -1412,24 +1359,6 @@ ieee80211_link_has_in_place_reservation(struct ieee80211_link_data *link)
 	return true;
 }
 
-static int ieee80211_chsw_switch_hwconf(struct ieee80211_local *local,
-					struct ieee80211_chanctx *new_ctx)
-{
-	const struct cfg80211_chan_def *chandef;
-
-	lockdep_assert_wiphy(local->hw.wiphy);
-
-	chandef = ieee80211_chanctx_reserved_chandef(local, new_ctx, NULL);
-	if (WARN_ON(!chandef))
-		return -EINVAL;
-
-	local->hw.conf.radar_enabled = new_ctx->conf.radar_enabled;
-	local->_oper_chandef = *chandef;
-	ieee80211_hw_config(local, 0);
-
-	return 0;
-}
-
 static int ieee80211_chsw_switch_vifs(struct ieee80211_local *local,
 				      int n_vifs)
 {
@@ -1518,7 +1447,6 @@ err:
 static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local)
 {
 	struct ieee80211_chanctx *ctx, *ctx_tmp, *old_ctx;
-	struct ieee80211_chanctx *new_ctx = NULL;
 	int err, n_assigned, n_reserved, n_ready;
 	int n_ctx = 0, n_vifs_switch = 0, n_vifs_assign = 0, n_vifs_ctxless = 0;
 
@@ -1551,9 +1479,6 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local)
 			goto err;
 		}
 
-		if (!local->use_chanctx)
-			new_ctx = ctx;
-
 		n_ctx++;
 
 		n_assigned = 0;
@@ -1607,9 +1532,7 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local)
 	if (WARN_ON(n_ctx == 0) ||
 	    WARN_ON(n_vifs_switch == 0 &&
 		    n_vifs_assign == 0 &&
-		    n_vifs_ctxless == 0) ||
-	    WARN_ON(n_ctx > 1 && !local->use_chanctx) ||
-	    WARN_ON(!new_ctx && !local->use_chanctx)) {
+		    n_vifs_ctxless == 0)) {
 		err = -EINVAL;
 		goto err;
 	}
@@ -1619,20 +1542,14 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local)
 	 * reservations and driver capabilities.
 	 */
 
-	if (local->use_chanctx) {
-		if (n_vifs_switch > 0) {
-			err = ieee80211_chsw_switch_vifs(local, n_vifs_switch);
-			if (err)
-				goto err;
-		}
+	if (n_vifs_switch > 0) {
+		err = ieee80211_chsw_switch_vifs(local, n_vifs_switch);
+		if (err)
+			goto err;
+	}
 
-		if (n_vifs_assign > 0 || n_vifs_ctxless > 0) {
-			err = ieee80211_chsw_switch_ctxs(local);
-			if (err)
-				goto err;
-		}
-	} else {
-		err = ieee80211_chsw_switch_hwconf(local, new_ctx);
+	if (n_vifs_assign > 0 || n_vifs_ctxless > 0) {
+		err = ieee80211_chsw_switch_ctxs(local);
 		if (err)
 			goto err;
 	}
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 29294cf88d39..cb4684a9451e 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -1348,7 +1348,8 @@ struct ieee80211_local {
 
 	bool wiphy_ciphers_allocated;
 
-	bool use_chanctx;
+	struct cfg80211_chan_def dflt_chandef;
+	bool emulate_chanctx;
 
 	/* protects the aggregated multicast list and filter calls */
 	spinlock_t filter_lock;
@@ -1474,8 +1475,6 @@ struct ieee80211_local {
 	enum mac80211_scan_state next_scan_state;
 	struct wiphy_delayed_work scan_work;
 	struct ieee80211_sub_if_data __rcu *scan_sdata;
-	/* For backward compatibility only -- do not use */
-	struct cfg80211_chan_def _oper_chandef;
 
 	/* Temporary remain-on-channel for off-channel operations */
 	struct ieee80211_channel *tmp_channel;
@@ -1549,8 +1548,6 @@ struct ieee80211_local {
 
 	int user_power_level; /* in dBm, for all interfaces */
 
-	enum ieee80211_smps_mode smps_mode;
-
 	struct work_struct restart_work;
 
 #ifdef CONFIG_MAC80211_DEBUGFS
@@ -1819,6 +1816,8 @@ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local,
 				     unsigned int mpdu_len,
 				     unsigned int mpdu_offset);
 int ieee80211_hw_config(struct ieee80211_local *local, u32 changed);
+int ieee80211_hw_conf_chan(struct ieee80211_local *local);
+void ieee80211_hw_conf_init(struct ieee80211_local *local);
 void ieee80211_tx_set_protected(struct ieee80211_tx_data *tx);
 void ieee80211_bss_info_change_notify(struct ieee80211_sub_if_data *sdata,
 				      u64 changed);
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 1a6e9dd39a37..d81162bf7d48 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -1288,8 +1288,6 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up)
 		res = drv_start(local);
 		if (res)
 			goto err_del_bss;
-		/* we're brought up, everything changes */
-		hw_reconf_flags = ~0;
 		ieee80211_led_radio(local, true);
 		ieee80211_mod_tpt_led_trig(local,
 					   IEEE80211_TPT_LEDTRIG_FL_RADIO, 0);
@@ -1436,7 +1434,9 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up)
 	if (coming_up)
 		local->open_count++;
 
-	if (hw_reconf_flags)
+	if (local->open_count == 1)
+		ieee80211_hw_conf_init(local);
+	else if (hw_reconf_flags)
 		ieee80211_hw_config(local, hw_reconf_flags);
 
 	ieee80211_recalc_ps(local);
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index e05bcc35bc1e..ce0cba8d7afc 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -93,16 +93,32 @@ static void ieee80211_reconfig_filter(struct wiphy *wiphy,
 	ieee80211_configure_filter(local);
 }
 
-static u32 ieee80211_hw_conf_chan(struct ieee80211_local *local)
+static u32 ieee80211_calc_hw_conf_chan(struct ieee80211_local *local,
+				       struct ieee80211_chanctx_conf *ctx)
 {
 	struct ieee80211_sub_if_data *sdata;
 	struct cfg80211_chan_def chandef = {};
+	struct cfg80211_chan_def *oper = NULL;
+	enum ieee80211_smps_mode smps_mode = IEEE80211_SMPS_STATIC;
 	u32 changed = 0;
 	int power;
 	u32 offchannel_flag;
 
+	if (!local->emulate_chanctx)
+		return 0;
+
 	offchannel_flag = local->hw.conf.flags & IEEE80211_CONF_OFFCHANNEL;
 
+	if (ctx && !WARN_ON(!ctx->def.chan)) {
+		oper = &ctx->def;
+		if (ctx->rx_chains_static > 1)
+			smps_mode = IEEE80211_SMPS_OFF;
+		else if (ctx->rx_chains_dynamic > 1)
+			smps_mode = IEEE80211_SMPS_DYNAMIC;
+		else
+			smps_mode = IEEE80211_SMPS_STATIC;
+	}
+
 	if (local->scan_chandef.chan) {
 		chandef = local->scan_chandef;
 	} else if (local->tmp_channel) {
@@ -110,25 +126,30 @@ static u32 ieee80211_hw_conf_chan(struct ieee80211_local *local)
 		chandef.width = NL80211_CHAN_WIDTH_20_NOHT;
 		chandef.center_freq1 = chandef.chan->center_freq;
 		chandef.freq1_offset = chandef.chan->freq_offset;
-	} else
-		chandef = local->_oper_chandef;
+	} else if (oper) {
+		chandef = *oper;
+	} else {
+		chandef = local->dflt_chandef;
+	}
 
-	WARN(!cfg80211_chandef_valid(&chandef),
-	     "control:%d.%03d MHz width:%d center: %d.%03d/%d MHz",
-	     chandef.chan->center_freq, chandef.chan->freq_offset,
-	     chandef.width, chandef.center_freq1, chandef.freq1_offset,
-	     chandef.center_freq2);
+	if (WARN(!cfg80211_chandef_valid(&chandef),
+		 "control:%d.%03d MHz width:%d center: %d.%03d/%d MHz",
+		 chandef.chan ? chandef.chan->center_freq : -1,
+		 chandef.chan ? chandef.chan->freq_offset : 0,
+		 chandef.width, chandef.center_freq1, chandef.freq1_offset,
+		 chandef.center_freq2))
+		return 0;
 
-	if (!cfg80211_chandef_identical(&chandef, &local->_oper_chandef))
+	if (!oper || !cfg80211_chandef_identical(&chandef, oper))
 		local->hw.conf.flags |= IEEE80211_CONF_OFFCHANNEL;
 	else
 		local->hw.conf.flags &= ~IEEE80211_CONF_OFFCHANNEL;
 
 	offchannel_flag ^= local->hw.conf.flags & IEEE80211_CONF_OFFCHANNEL;
 
-	if (offchannel_flag ||
-	    !cfg80211_chandef_identical(&local->hw.conf.chandef,
-					&local->_oper_chandef)) {
+	/* force it also for scanning, since drivers might config differently */
+	if (offchannel_flag || local->scanning ||
+	    !cfg80211_chandef_identical(&local->hw.conf.chandef, &chandef)) {
 		local->hw.conf.chandef = chandef;
 		changed |= IEEE80211_CONF_CHANGE_CHANNEL;
 	}
@@ -140,8 +161,8 @@ static u32 ieee80211_hw_conf_chan(struct ieee80211_local *local)
 		 * that otherwise STATIC is used.
 		 */
 		local->hw.conf.smps_mode = IEEE80211_SMPS_STATIC;
-	} else if (local->hw.conf.smps_mode != local->smps_mode) {
-		local->hw.conf.smps_mode = local->smps_mode;
+	} else if (local->hw.conf.smps_mode != smps_mode) {
+		local->hw.conf.smps_mode = smps_mode;
 		changed |= IEEE80211_CONF_CHANGE_SMPS;
 	}
 
@@ -173,12 +194,9 @@ int ieee80211_hw_config(struct ieee80211_local *local, u32 changed)
 
 	might_sleep();
 
-	if (!local->use_chanctx)
-		changed |= ieee80211_hw_conf_chan(local);
-	else
-		changed &= ~(IEEE80211_CONF_CHANGE_CHANNEL |
-			     IEEE80211_CONF_CHANGE_POWER |
-			     IEEE80211_CONF_CHANGE_SMPS);
+	WARN_ON(changed & (IEEE80211_CONF_CHANGE_CHANNEL |
+			   IEEE80211_CONF_CHANGE_POWER |
+			   IEEE80211_CONF_CHANGE_SMPS));
 
 	if (changed && local->open_count) {
 		ret = drv_config(local, changed);
@@ -202,6 +220,107 @@ int ieee80211_hw_config(struct ieee80211_local *local, u32 changed)
 	return ret;
 }
 
+/* for scanning, offchannel and chanctx emulation only */
+static int _ieee80211_hw_conf_chan(struct ieee80211_local *local,
+				   struct ieee80211_chanctx_conf *ctx)
+{
+	u32 changed;
+
+	if (!local->open_count)
+		return 0;
+
+	changed = ieee80211_calc_hw_conf_chan(local, ctx);
+	if (!changed)
+		return 0;
+
+	return drv_config(local, changed);
+}
+
+int ieee80211_hw_conf_chan(struct ieee80211_local *local)
+{
+	struct ieee80211_chanctx *ctx;
+
+	ctx = list_first_entry_or_null(&local->chanctx_list,
+				       struct ieee80211_chanctx,
+				       list);
+
+	return _ieee80211_hw_conf_chan(local, ctx ? &ctx->conf : NULL);
+}
+
+void ieee80211_hw_conf_init(struct ieee80211_local *local)
+{
+	u32 changed = ~(IEEE80211_CONF_CHANGE_CHANNEL |
+			IEEE80211_CONF_CHANGE_POWER |
+			IEEE80211_CONF_CHANGE_SMPS);
+
+	if (WARN_ON(!local->open_count))
+		return;
+
+	if (local->emulate_chanctx) {
+		struct ieee80211_chanctx *ctx;
+
+		ctx = list_first_entry_or_null(&local->chanctx_list,
+					       struct ieee80211_chanctx,
+					       list);
+
+		changed |= ieee80211_calc_hw_conf_chan(local,
+						       ctx ? &ctx->conf : NULL);
+	}
+
+	WARN_ON(drv_config(local, changed));
+}
+
+int ieee80211_emulate_add_chanctx(struct ieee80211_hw *hw,
+				  struct ieee80211_chanctx_conf *ctx)
+{
+	struct ieee80211_local *local = hw_to_local(hw);
+
+	local->hw.conf.radar_enabled = ctx->radar_enabled;
+
+	return _ieee80211_hw_conf_chan(local, ctx);
+}
+EXPORT_SYMBOL(ieee80211_emulate_add_chanctx);
+
+void ieee80211_emulate_remove_chanctx(struct ieee80211_hw *hw,
+				      struct ieee80211_chanctx_conf *ctx)
+{
+	struct ieee80211_local *local = hw_to_local(hw);
+
+	local->hw.conf.radar_enabled = false;
+
+	_ieee80211_hw_conf_chan(local, NULL);
+}
+EXPORT_SYMBOL(ieee80211_emulate_remove_chanctx);
+
+void ieee80211_emulate_change_chanctx(struct ieee80211_hw *hw,
+				      struct ieee80211_chanctx_conf *ctx,
+				      u32 changed)
+{
+	struct ieee80211_local *local = hw_to_local(hw);
+
+	local->hw.conf.radar_enabled = ctx->radar_enabled;
+
+	_ieee80211_hw_conf_chan(local, ctx);
+}
+EXPORT_SYMBOL(ieee80211_emulate_change_chanctx);
+
+int ieee80211_emulate_switch_vif_chanctx(struct ieee80211_hw *hw,
+					 struct ieee80211_vif_chanctx_switch *vifs,
+					 int n_vifs,
+					 enum ieee80211_chanctx_switch_mode mode)
+{
+	struct ieee80211_local *local = hw_to_local(hw);
+
+	if (n_vifs <= 0)
+		return -EINVAL;
+
+	local->hw.conf.radar_enabled = vifs[0].new_ctx->radar_enabled;
+	_ieee80211_hw_conf_chan(local, vifs[0].new_ctx);
+
+	return 0;
+}
+EXPORT_SYMBOL(ieee80211_emulate_switch_vif_chanctx);
+
 #define BSS_CHANGED_VIF_CFG_FLAGS (BSS_CHANGED_ASSOC |\
 				   BSS_CHANGED_IDLE |\
 				   BSS_CHANGED_PS |\
@@ -645,7 +764,7 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len,
 	struct ieee80211_local *local;
 	int priv_size, i;
 	struct wiphy *wiphy;
-	bool use_chanctx;
+	bool emulate_chanctx;
 
 	if (WARN_ON(!ops->tx || !ops->start || !ops->stop || !ops->config ||
 		    !ops->add_interface || !ops->remove_interface ||
@@ -660,12 +779,26 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len,
 		return NULL;
 
 	/* check all or no channel context operations exist */
-	i = !!ops->add_chanctx + !!ops->remove_chanctx +
-	    !!ops->change_chanctx + !!ops->assign_vif_chanctx +
-	    !!ops->unassign_vif_chanctx;
-	if (WARN_ON(i != 0 && i != 5))
-		return NULL;
-	use_chanctx = i == 5;
+	if (ops->add_chanctx == ieee80211_emulate_add_chanctx &&
+	    ops->remove_chanctx == ieee80211_emulate_remove_chanctx &&
+	    ops->change_chanctx == ieee80211_emulate_change_chanctx) {
+		if (WARN_ON(ops->assign_vif_chanctx ||
+			    ops->unassign_vif_chanctx))
+			return NULL;
+		emulate_chanctx = true;
+	} else {
+		if (WARN_ON(ops->add_chanctx == ieee80211_emulate_add_chanctx ||
+			    ops->remove_chanctx == ieee80211_emulate_remove_chanctx ||
+			    ops->change_chanctx == ieee80211_emulate_change_chanctx))
+			return NULL;
+		if (WARN_ON(!ops->add_chanctx ||
+			    !ops->remove_chanctx ||
+			    !ops->change_chanctx ||
+			    !ops->assign_vif_chanctx ||
+			    !ops->unassign_vif_chanctx))
+			return NULL;
+		emulate_chanctx = false;
+	}
 
 	/* Ensure 32-byte alignment of our private data and hw private data.
 	 * We use the wiphy priv data for both our ieee80211_local and for
@@ -699,7 +832,7 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len,
 			WIPHY_FLAG_REPORTS_OBSS |
 			WIPHY_FLAG_OFFCHAN_TX;
 
-	if (!use_chanctx || ops->remain_on_channel)
+	if (emulate_chanctx || ops->remain_on_channel)
 		wiphy->flags |= WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL;
 
 	wiphy->features |= NL80211_FEATURE_SK_TX_STATUS |
@@ -756,7 +889,10 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len,
 	local->hw.priv = (char *)local + ALIGN(sizeof(*local), NETDEV_ALIGN);
 
 	local->ops = ops;
-	local->use_chanctx = use_chanctx;
+	local->emulate_chanctx = emulate_chanctx;
+
+	if (emulate_chanctx)
+		ieee80211_hw_set(&local->hw, CHANCTX_STA_CSA);
 
 	/*
 	 * We need a bit of data queued to build aggregates properly, so
@@ -833,7 +969,6 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len,
 			ieee80211_dfs_radar_detected_work);
 
 	wiphy_work_init(&local->reconfig_filter, ieee80211_reconfig_filter);
-	local->smps_mode = IEEE80211_SMPS_OFF;
 
 	wiphy_work_init(&local->dynamic_ps_enable_work,
 			ieee80211_dynamic_ps_enable_work);
@@ -984,7 +1119,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
 		 * as much, e.g. monitoring beacons would be hard if we
 		 * might not even know which link is active at which time.
 		 */
-		if (WARN_ON(!local->use_chanctx))
+		if (WARN_ON(local->emulate_chanctx))
 			return -EINVAL;
 
 		if (WARN_ON(!local->ops->link_info_changed))
@@ -1028,7 +1163,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
 		return -EINVAL;
 #endif
 
-	if (!local->use_chanctx) {
+	if (local->emulate_chanctx) {
 		for (i = 0; i < local->hw.wiphy->n_iface_combinations; i++) {
 			const struct ieee80211_iface_combination *comb;
 
@@ -1094,11 +1229,11 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
 						&sband->channels[i],
 						NL80211_CHAN_NO_HT);
 			/* init channel we're on */
-			if (!local->use_chanctx && !local->_oper_chandef.chan) {
+			local->monitor_chandef = dflt_chandef;
+			if (local->emulate_chanctx) {
+				local->dflt_chandef = dflt_chandef;
 				local->hw.conf.chandef = dflt_chandef;
-				local->_oper_chandef = dflt_chandef;
 			}
-			local->monitor_chandef = dflt_chandef;
 		}
 
 		channels += sband->n_channels;
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 7c0339d4f059..9968bc0ddf6e 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -2287,8 +2287,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_link_data *link,
 
 	chanctx = container_of(conf, struct ieee80211_chanctx, conf);
 
-	if (local->use_chanctx &&
-	    !ieee80211_hw_check(&local->hw, CHANCTX_STA_CSA)) {
+	if (!ieee80211_hw_check(&local->hw, CHANCTX_STA_CSA)) {
 		sdata_info(sdata,
 			   "driver doesn't support chan-switch with channel contexts\n");
 		goto drop_connection;
diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c
index 6c4080202573..221695d841fd 100644
--- a/net/mac80211/offchannel.c
+++ b/net/mac80211/offchannel.c
@@ -86,7 +86,7 @@ void ieee80211_offchannel_stop_vifs(struct ieee80211_local *local)
 
 	lockdep_assert_wiphy(local->hw.wiphy);
 
-	if (WARN_ON(local->use_chanctx))
+	if (WARN_ON(!local->emulate_chanctx))
 		return;
 
 	/*
@@ -136,7 +136,7 @@ void ieee80211_offchannel_return(struct ieee80211_local *local)
 
 	lockdep_assert_wiphy(local->hw.wiphy);
 
-	if (WARN_ON(local->use_chanctx))
+	if (WARN_ON(!local->emulate_chanctx))
 		return;
 
 	list_for_each_entry(sdata, &local->interfaces, list) {
@@ -351,10 +351,13 @@ static void _ieee80211_start_next_roc(struct ieee80211_local *local)
 		 * 20 MHz channel width) don't stop all the operations but still
 		 * treat it as though the ROC operation started properly, so
 		 * other ROC operations won't interfere with this one.
+		 *
+		 * Note: scan can't run, tmp_channel is what we use, so this
+		 * must be the currently active channel.
 		 */
-		roc->on_channel = roc->chan == local->_oper_chandef.chan &&
-				  local->_oper_chandef.width != NL80211_CHAN_WIDTH_5 &&
-				  local->_oper_chandef.width != NL80211_CHAN_WIDTH_10;
+		roc->on_channel = roc->chan == local->hw.conf.chandef.chan &&
+				  local->hw.conf.chandef.width != NL80211_CHAN_WIDTH_5 &&
+				  local->hw.conf.chandef.width != NL80211_CHAN_WIDTH_10;
 
 		/* start this ROC */
 		ieee80211_recalc_idle(local);
@@ -363,7 +366,7 @@ static void _ieee80211_start_next_roc(struct ieee80211_local *local)
 			ieee80211_offchannel_stop_vifs(local);
 
 			local->tmp_channel = roc->chan;
-			ieee80211_hw_config(local, 0);
+			ieee80211_hw_conf_chan(local);
 		}
 
 		wiphy_delayed_work_queue(local->hw.wiphy, &local->roc_work,
@@ -426,7 +429,7 @@ static void __ieee80211_roc_work(struct ieee80211_local *local)
 		return;
 
 	if (!roc->started) {
-		WARN_ON(local->use_chanctx);
+		WARN_ON(!local->emulate_chanctx);
 		_ieee80211_start_next_roc(local);
 	} else {
 		on_channel = roc->on_channel;
@@ -439,7 +442,7 @@ static void __ieee80211_roc_work(struct ieee80211_local *local)
 			ieee80211_flush_queues(local, NULL, false);
 
 			local->tmp_channel = NULL;
-			ieee80211_hw_config(local, 0);
+			ieee80211_hw_conf_chan(local);
 
 			ieee80211_offchannel_return(local);
 		}
@@ -539,7 +542,7 @@ static int ieee80211_start_roc_work(struct ieee80211_local *local,
 		/* this may work, but is untested */
 		return -EOPNOTSUPP;
 
-	if (local->use_chanctx && !local->ops->remain_on_channel)
+	if (!local->emulate_chanctx && !local->ops->remain_on_channel)
 		return -EOPNOTSUPP;
 
 	roc = kzalloc(sizeof(*roc), GFP_KERNEL);
diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
index 81644d15fe2f..6dedbbba153a 100644
--- a/net/mac80211/scan.c
+++ b/net/mac80211/scan.c
@@ -476,7 +476,7 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted)
 	}
 
 	/* Set power back to normal operating levels. */
-	ieee80211_hw_config(local, 0);
+	ieee80211_hw_conf_chan(local);
 
 	if (!hw_scan && was_scanning) {
 		ieee80211_configure_filter(local);
@@ -523,7 +523,7 @@ static int ieee80211_start_sw_scan(struct ieee80211_local *local,
 				   struct ieee80211_sub_if_data *sdata)
 {
 	/* Software scan is not supported in multi-channel cases */
-	if (local->use_chanctx)
+	if (!local->emulate_chanctx)
 		return -EOPNOTSUPP;
 
 	/*
@@ -553,7 +553,7 @@ static int ieee80211_start_sw_scan(struct ieee80211_local *local,
 	ieee80211_configure_filter(local);
 
 	/* We need to set power level at maximum rate for scanning. */
-	ieee80211_hw_config(local, 0);
+	ieee80211_hw_conf_chan(local);
 
 	wiphy_delayed_work_queue(local->hw.wiphy, &local->scan_work, 0);
 
@@ -790,7 +790,7 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata,
 	if (hw_scan) {
 		__set_bit(SCAN_HW_SCANNING, &local->scanning);
 	} else if ((req->n_channels == 1) &&
-		   (req->channels[0] == local->_oper_chandef.chan)) {
+		   (req->channels[0] == local->hw.conf.chandef.chan)) {
 		/*
 		 * If we are scanning only on the operating channel
 		 * then we do not need to stop normal activities
@@ -808,7 +808,7 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata,
 		ieee80211_configure_filter(local); /* accept probe-responses */
 
 		/* We need to ensure power level is at max for scanning. */
-		ieee80211_hw_config(local, 0);
+		ieee80211_hw_conf_chan(local);
 
 		if ((req->channels[0]->flags & (IEEE80211_CHAN_NO_IR |
 						IEEE80211_CHAN_RADAR)) ||
@@ -973,13 +973,13 @@ static void ieee80211_scan_state_set_channel(struct ieee80211_local *local,
 	/* If scanning on oper channel, use whatever channel-type
 	 * is currently in use.
 	 */
-	if (chan == local->_oper_chandef.chan)
-		local->scan_chandef = local->_oper_chandef;
+	if (chan == local->hw.conf.chandef.chan)
+		local->scan_chandef = local->hw.conf.chandef;
 	else
 		local->scan_chandef.width = NL80211_CHAN_WIDTH_20_NOHT;
 
 set_channel:
-	if (ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL))
+	if (ieee80211_hw_conf_chan(local))
 		skip = 1;
 
 	/* advance state machine to next channel/band */
@@ -1023,7 +1023,7 @@ static void ieee80211_scan_state_suspend(struct ieee80211_local *local,
 {
 	/* switch back to the operating channel */
 	local->scan_chandef.chan = NULL;
-	ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL);
+	ieee80211_hw_conf_chan(local);
 
 	/* disable PS */
 	ieee80211_offchannel_return(local);
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 7b33942ea51f..f57f7963ca37 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -2390,8 +2390,6 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb,
 
 	if (chanctx_conf)
 		chandef = &chanctx_conf->def;
-	else if (!local->use_chanctx)
-		chandef = &local->_oper_chandef;
 	else
 		goto fail_rcu;
 
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 1a2522e699d4..51c1a99f57b8 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -2472,9 +2472,6 @@ static void ieee80211_assign_chanctx(struct ieee80211_local *local,
 
 	lockdep_assert_wiphy(local->hw.wiphy);
 
-	if (!local->use_chanctx)
-		return;
-
 	conf = rcu_dereference_protected(link->conf->chanctx_conf,
 					 lockdep_is_held(&local->hw.wiphy->mtx));
 	if (conf) {
@@ -2704,20 +2701,20 @@ int ieee80211_reconfig(struct ieee80211_local *local)
 	}
 
 	/* add channel contexts */
-	if (local->use_chanctx) {
-		list_for_each_entry(ctx, &local->chanctx_list, list)
-			if (ctx->replace_state !=
-			    IEEE80211_CHANCTX_REPLACES_OTHER)
-				WARN_ON(drv_add_chanctx(local, ctx));
-
-		sdata = wiphy_dereference(local->hw.wiphy,
-					  local->monitor_sdata);
-		if (sdata && ieee80211_sdata_running(sdata))
-			ieee80211_assign_chanctx(local, sdata, &sdata->deflink);
-	}
+	list_for_each_entry(ctx, &local->chanctx_list, list)
+		if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER)
+			WARN_ON(drv_add_chanctx(local, ctx));
+
+	sdata = wiphy_dereference(local->hw.wiphy, local->monitor_sdata);
+	if (sdata && ieee80211_sdata_running(sdata))
+		ieee80211_assign_chanctx(local, sdata, &sdata->deflink);
 
 	/* reconfigure hardware */
-	ieee80211_hw_config(local, ~0);
+	ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_LISTEN_INTERVAL |
+				   IEEE80211_CONF_CHANGE_MONITOR |
+				   IEEE80211_CONF_CHANGE_PS |
+				   IEEE80211_CONF_CHANGE_RETRY_LIMITS |
+				   IEEE80211_CONF_CHANGE_IDLE);
 
 	ieee80211_configure_filter(local);
 
-- 
cgit v1.2.3


From 6092077ad09ce880c61735c314060f0bd79ae4aa Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 29 Jan 2024 19:34:40 +0100
Subject: wifi: mac80211: introduce 'channel request'

For channel contexts, mac80211 currently uses the cfg80211
chandef struct (control channel, center freq(s), width) to
define towards drivers and internally how these behave. In
fact, there are _two_ such structs used, where the min_def
can reduce bandwidth according to the stations connected.

Unfortunately,  with EHT this is longer be sufficient,  at
least not for all hardware.  EHT requires that non-AP STAs
that are connected to an AP with a lower bandwidth than it
(the AP) advertises (e.g. 160 MHz STA connected to 320 MHz
AP) still be able to receive downlink OFDMA and respond to
trigger frames for uplink OFDMA  that specify the position
and bandwidth  for the non-AP STA  relative to the channel
the AP is using.  Therefore, they need to be aware of this,
and at least for some hardware (e.g. Intel) this awareness
is in the hardware. As a result, use of the "same" channel
may need to be split over  two channel contexts where they
differ by the AP being used.

As a first step,  introduce a concept of a channel request
('chanreq') for each interface,  to control the context it
requests.   This step does nothing but reorganise the code,
so that later the AP's chandef can be added to the request
in order to handle the EHT case described above.

Link: https://msgid.link/20240129194108.2e88e48bd2e9.I4256183debe975c5ed71621611206fdbb69ba330@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c  |   8 +-
 drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c  |  12 +-
 .../net/wireless/intel/iwlwifi/mvm/mld-mac80211.c  |  10 +-
 drivers/net/wireless/intel/iwlwifi/mvm/ops.c       |   8 +-
 drivers/net/wireless/intel/iwlwifi/mvm/rs-fw.c     |   6 +-
 drivers/net/wireless/intel/iwlwifi/mvm/sta.c       |   4 +-
 drivers/net/wireless/intel/iwlwifi/mvm/tx.c        |   4 +-
 drivers/net/wireless/mediatek/mt76/mt7915/mcu.c    |   4 +-
 drivers/net/wireless/realtek/rtw89/mac.c           |   4 +-
 drivers/net/wireless/rsi/rsi_91x_mac80211.c        |   4 +-
 drivers/net/wireless/silabs/wfx/sta.c              |   4 +-
 drivers/net/wireless/ti/wlcore/main.c              |   6 +-
 drivers/staging/vt6655/device_main.c               |   2 +-
 drivers/staging/vt6656/main_usb.c                  |   2 +-
 include/net/mac80211.h                             |  12 +-
 net/mac80211/agg-tx.c                              |   2 +-
 net/mac80211/cfg.c                                 |  66 +++---
 net/mac80211/chan.c                                | 233 +++++++++++----------
 net/mac80211/ht.c                                  |   2 +-
 net/mac80211/ibss.c                                |  36 ++--
 net/mac80211/ieee80211_i.h                         |  21 +-
 net/mac80211/iface.c                               |   6 +-
 net/mac80211/link.c                                |   3 +-
 net/mac80211/main.c                                |   2 +-
 net/mac80211/mesh.c                                |  81 +++----
 net/mac80211/mesh_plink.c                          |   4 +-
 net/mac80211/mlme.c                                |  87 ++++----
 net/mac80211/ocb.c                                 |   3 +-
 net/mac80211/rate.c                                |  12 +-
 net/mac80211/spectmgmt.c                           |  22 +-
 net/mac80211/tdls.c                                |   8 +-
 net/mac80211/trace.h                               |   6 +-
 net/mac80211/util.c                                |  18 +-
 net/mac80211/vht.c                                 |   6 +-
 34 files changed, 379 insertions(+), 329 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c
index bcf78ccba8c1..61269c7b1934 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c
@@ -455,7 +455,7 @@ void iwl_mvm_set_fw_protection_flags(struct iwl_mvm *mvm,
 		break;
 	case IEEE80211_HT_OP_MODE_PROTECTION_20MHZ:
 		/* Protect when channel wider than 20MHz */
-		if (link_conf->chandef.width > NL80211_CHAN_WIDTH_20)
+		if (link_conf->chanreq.oper.width > NL80211_CHAN_WIDTH_20)
 			*protection_flags |= cpu_to_le32(ht_flag);
 		break;
 	default:
@@ -494,7 +494,7 @@ void iwl_mvm_set_fw_qos_params(struct iwl_mvm *mvm, struct ieee80211_vif *vif,
 	if (link_conf->qos)
 		*qos_flags |= cpu_to_le32(MAC_QOS_FLG_UPDATE_EDCA);
 
-	if (link_conf->chandef.width != NL80211_CHAN_WIDTH_20_NOHT)
+	if (link_conf->chanreq.oper.width != NL80211_CHAN_WIDTH_20_NOHT)
 		*qos_flags |= cpu_to_le32(MAC_QOS_FLG_TGN);
 }
 
@@ -910,8 +910,8 @@ u8 iwl_mvm_mac_ctxt_get_lowest_rate(struct iwl_mvm *mvm,
 		link_conf = rcu_dereference(vif->link_conf[link_id]);
 		if (link_conf) {
 			basic = link_conf->basic_rates;
-			if (link_conf->chandef.chan)
-				band = link_conf->chandef.chan->band;
+			if (link_conf->chanreq.oper.chan)
+				band = link_conf->chanreq.oper.chan->band;
 		}
 		rcu_read_unlock();
 	}
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
index 93baec9bb3fc..65293b45a98f 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
@@ -1641,7 +1641,7 @@ static int iwl_mvm_mac_add_interface(struct ieee80211_hw *hw,
 	if (vif->type == NL80211_IFTYPE_MONITOR) {
 		mvm->monitor_on = true;
 		mvm->monitor_p80 =
-			iwl_mvm_chandef_get_primary_80(&vif->bss_conf.chandef);
+			iwl_mvm_chandef_get_primary_80(&vif->bss_conf.chanreq.oper);
 	}
 
 	if (!test_bit(IWL_MVM_STATUS_IN_HW_RESTART, &mvm->status))
@@ -3421,16 +3421,16 @@ iwl_mvm_check_he_obss_narrow_bw_ru(struct ieee80211_hw *hw,
 		.tolerated = true,
 	};
 
-	if (WARN_ON_ONCE(!link_conf->chandef.chan ||
+	if (WARN_ON_ONCE(!link_conf->chanreq.oper.chan ||
 			 !mvmvif->link[link_id]))
 		return;
 
-	if (!(link_conf->chandef.chan->flags & IEEE80211_CHAN_RADAR)) {
+	if (!(link_conf->chanreq.oper.chan->flags & IEEE80211_CHAN_RADAR)) {
 		mvmvif->link[link_id]->he_ru_2mhz_block = false;
 		return;
 	}
 
-	cfg80211_bss_iter(hw->wiphy, &link_conf->chandef,
+	cfg80211_bss_iter(hw->wiphy, &link_conf->chanreq.oper,
 			  iwl_mvm_check_he_obss_narrow_bw_ru_iter,
 			  &iter_data);
 
@@ -3490,10 +3490,10 @@ static void iwl_mvm_mei_host_associated(struct iwl_mvm *mvm,
 		return;
 
 	/* FIXME: MEI needs to be updated for MLO */
-	if (!vif->bss_conf.chandef.chan)
+	if (!vif->bss_conf.chanreq.oper.chan)
 		return;
 
-	conn_info.channel = vif->bss_conf.chandef.chan->hw_value;
+	conn_info.channel = vif->bss_conf.chanreq.oper.chan->hw_value;
 
 	switch (mvm_sta->pairwise_cipher) {
 	case WLAN_CIPHER_SUITE_TKIP:
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mld-mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mld-mac80211.c
index ff7d9a7d607e..b633a2a09c32 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/mld-mac80211.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/mld-mac80211.c
@@ -656,8 +656,8 @@ void iwl_mvm_mld_select_links(struct iwl_mvm *mvm, struct ieee80211_vif *vif,
 			continue;
 
 		data[n_data].link_id = link_id;
-		data[n_data].band = link_conf->chandef.chan->band;
-		data[n_data].width = link_conf->chandef.width;
+		data[n_data].band = link_conf->chanreq.oper.chan->band;
+		data[n_data].width = link_conf->chanreq.oper.width;
 		data[n_data].active = vif->active_links & BIT(link_id);
 		n_data++;
 	}
@@ -1241,8 +1241,8 @@ int iwl_mvm_mld_get_primary_link(struct iwl_mvm *mvm,
 			continue;
 
 		data[n_data].link_id = link_id;
-		data[n_data].band = link_conf->chandef.chan->band;
-		data[n_data].width = link_conf->chandef.width;
+		data[n_data].band = link_conf->chanreq.oper.chan->band;
+		data[n_data].width = link_conf->chanreq.oper.width;
 		data[n_data].active = true;
 		n_data++;
 	}
@@ -1292,7 +1292,7 @@ static bool iwl_mvm_can_enter_esr(struct iwl_mvm *mvm,
 			continue;
 
 		/* BT Coex effects eSR mode only if one of the link is on LB */
-		if (link_conf->chandef.chan->band != NL80211_BAND_2GHZ)
+		if (link_conf->chanreq.oper.chan->band != NL80211_BAND_2GHZ)
 			continue;
 
 		ret = iwl_mvm_bt_coex_calculate_esr_mode(mvm, vif, link_id,
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/ops.c b/drivers/net/wireless/intel/iwlwifi/mvm/ops.c
index 747fc91ef8d0..ad4146d3345a 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/ops.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/ops.c
@@ -161,9 +161,9 @@ static void iwl_mvm_rx_monitor_notif(struct iwl_mvm *mvm,
 	if (!vif || vif->type != NL80211_IFTYPE_STATION)
 		return;
 
-	if (!vif->bss_conf.chandef.chan ||
-	    vif->bss_conf.chandef.chan->band != NL80211_BAND_2GHZ ||
-	    vif->bss_conf.chandef.width < NL80211_CHAN_WIDTH_40)
+	if (!vif->bss_conf.chanreq.oper.chan ||
+	    vif->bss_conf.chanreq.oper.chan->band != NL80211_BAND_2GHZ ||
+	    vif->bss_conf.chanreq.oper.width < NL80211_CHAN_WIDTH_40)
 		return;
 
 	if (!vif->cfg.assoc)
@@ -219,7 +219,7 @@ void iwl_mvm_update_link_smps(struct ieee80211_vif *vif,
 		return;
 
 	if (mvm->fw_static_smps_request &&
-	    link_conf->chandef.width == NL80211_CHAN_WIDTH_160 &&
+	    link_conf->chanreq.oper.width == NL80211_CHAN_WIDTH_160 &&
 	    link_conf->he_support)
 		mode = IEEE80211_SMPS_STATIC;
 
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rs-fw.c b/drivers/net/wireless/intel/iwlwifi/mvm/rs-fw.c
index 6cba8a353b53..71d92635d6d7 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/rs-fw.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/rs-fw.c
@@ -525,10 +525,10 @@ u16 rs_fw_get_max_amsdu_len(struct ieee80211_sta *sta,
 	const struct ieee80211_sta_ht_cap *ht_cap = &link_sta->ht_cap;
 	const struct ieee80211_sta_eht_cap *eht_cap = &link_sta->eht_cap;
 
-	if (WARN_ON_ONCE(!link_conf->chandef.chan))
+	if (WARN_ON_ONCE(!link_conf->chanreq.oper.chan))
 		return IEEE80211_MAX_MPDU_LEN_VHT_3895;
 
-	if (link_conf->chandef.chan->band == NL80211_BAND_6GHZ) {
+	if (link_conf->chanreq.oper.chan->band == NL80211_BAND_6GHZ) {
 		switch (le16_get_bits(link_sta->he_6ghz_capa.capa,
 				      IEEE80211_HE_6GHZ_CAP_MAX_MPDU_LEN)) {
 		case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454:
@@ -538,7 +538,7 @@ u16 rs_fw_get_max_amsdu_len(struct ieee80211_sta *sta,
 		default:
 			return IEEE80211_MAX_MPDU_LEN_VHT_3895;
 		}
-	} else if (link_conf->chandef.chan->band == NL80211_BAND_2GHZ &&
+	} else if (link_conf->chanreq.oper.chan->band == NL80211_BAND_2GHZ &&
 		   eht_cap->has_eht) {
 		switch (u8_get_bits(eht_cap->eht_cap_elem.mac_cap_info[0],
 				    IEEE80211_EHT_MAC_CAP0_MAX_MPDU_LEN_MASK)) {
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/sta.c b/drivers/net/wireless/intel/iwlwifi/mvm/sta.c
index 8ffbb8efda73..7e9f3a670212 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/sta.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/sta.c
@@ -71,7 +71,7 @@ u32 iwl_mvm_get_sta_ampdu_dens(struct ieee80211_link_sta *link_sta,
 		mpdu_dens = link_sta->ht_cap.ampdu_density;
 	}
 
-	if (link_conf->chandef.chan->band == NL80211_BAND_6GHZ) {
+	if (link_conf->chanreq.oper.chan->band == NL80211_BAND_6GHZ) {
 		/* overwrite HT values on 6 GHz */
 		mpdu_dens = le16_get_bits(link_sta->he_6ghz_capa.capa,
 					  IEEE80211_HE_6GHZ_CAP_MIN_MPDU_START);
@@ -208,7 +208,7 @@ int iwl_mvm_sta_send_to_fw(struct iwl_mvm *mvm, struct ieee80211_sta *sta,
 	}
 
 	if (sta->deflink.ht_cap.ht_supported ||
-	    mvm_sta->vif->bss_conf.chandef.chan->band == NL80211_BAND_6GHZ)
+	    mvm_sta->vif->bss_conf.chanreq.oper.chan->band == NL80211_BAND_6GHZ)
 		add_sta_cmd.station_flags_msk |=
 			cpu_to_le32(STA_FLG_MAX_AGG_SIZE_MSK |
 				    STA_FLG_AGG_MPDU_DENS_MSK);
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c
index 79eb6394e5a7..58d1f283d628 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c
@@ -894,10 +894,10 @@ unsigned int iwl_mvm_max_amsdu_size(struct iwl_mvm *mvm,
 			if (WARN_ON(!link_conf))
 				band = NL80211_BAND_2GHZ;
 			else
-				band = link_conf->chandef.chan->band;
+				band = link_conf->chanreq.oper.chan->band;
 			rcu_read_unlock();
 		} else {
-			band = mvmsta->vif->bss_conf.chandef.chan->band;
+			band = mvmsta->vif->bss_conf.chanreq.oper.chan->band;
 		}
 
 		lmac = iwl_mvm_get_lmac_id(mvm, band);
diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c b/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c
index c67c4f6ca2aa..df1ad6d4e12d 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c
@@ -463,10 +463,10 @@ static bool mt7915_check_he_obss_narrow_bw_ru(struct ieee80211_hw *hw,
 		.tolerated = true,
 	};
 
-	if (!(vif->bss_conf.chandef.chan->flags & IEEE80211_CHAN_RADAR))
+	if (!(vif->bss_conf.chanreq.oper.chan->flags & IEEE80211_CHAN_RADAR))
 		return false;
 
-	cfg80211_bss_iter(hw->wiphy, &vif->bss_conf.chandef,
+	cfg80211_bss_iter(hw->wiphy, &vif->bss_conf.chanreq.oper,
 			  mt7915_check_he_obss_narrow_bw_ru_iter,
 			  &iter_data);
 
diff --git a/drivers/net/wireless/realtek/rtw89/mac.c b/drivers/net/wireless/realtek/rtw89/mac.c
index dbf2d6fe4ea7..ef4c492003d8 100644
--- a/drivers/net/wireless/realtek/rtw89/mac.c
+++ b/drivers/net/wireless/realtek/rtw89/mac.c
@@ -4615,10 +4615,10 @@ void rtw89_mac_set_he_obss_narrow_bw_ru(struct rtw89_dev *rtwdev,
 	if (!vif->bss_conf.he_support || vif->type != NL80211_IFTYPE_STATION)
 		return;
 
-	if (!(vif->bss_conf.chandef.chan->flags & IEEE80211_CHAN_RADAR))
+	if (!(vif->bss_conf.chanreq.oper.chan->flags & IEEE80211_CHAN_RADAR))
 		return;
 
-	cfg80211_bss_iter(hw->wiphy, &vif->bss_conf.chandef,
+	cfg80211_bss_iter(hw->wiphy, &vif->bss_conf.chanreq.oper,
 			  rtw89_mac_check_he_obss_narrow_bw_ru_iter,
 			  &tolerated);
 
diff --git a/drivers/net/wireless/rsi/rsi_91x_mac80211.c b/drivers/net/wireless/rsi/rsi_91x_mac80211.c
index e8aeb4d76c13..211fa25b9a78 100644
--- a/drivers/net/wireless/rsi/rsi_91x_mac80211.c
+++ b/drivers/net/wireless/rsi/rsi_91x_mac80211.c
@@ -740,7 +740,7 @@ u16 rsi_get_connected_channel(struct ieee80211_vif *vif)
 		return 0;
 
 	bss = &vif->bss_conf;
-	channel = bss->chandef.chan;
+	channel = bss->chanreq.oper.chan;
 
 	if (!channel)
 		return 0;
@@ -759,7 +759,7 @@ static void rsi_switch_channel(struct rsi_hw *adapter,
 	if (!vif)
 		return;
 
-	channel = vif->bss_conf.chandef.chan;
+	channel = vif->bss_conf.chanreq.oper.chan;
 
 	if (!channel)
 		return;
diff --git a/drivers/net/wireless/silabs/wfx/sta.c b/drivers/net/wireless/silabs/wfx/sta.c
index bb4446b88c12..a904602f02ce 100644
--- a/drivers/net/wireless/silabs/wfx/sta.c
+++ b/drivers/net/wireless/silabs/wfx/sta.c
@@ -144,13 +144,13 @@ static int wfx_get_ps_timeout(struct wfx_vif *wvif, bool *enable_ps)
 		struct wfx_vif *wvif_ch0 = wdev_to_wvif(wvif->wdev, 0);
 		struct ieee80211_vif *vif_ch0 = wvif_to_vif(wvif_ch0);
 
-		chan0 = vif_ch0->bss_conf.chandef.chan;
+		chan0 = vif_ch0->bss_conf.chanreq.oper.chan;
 	}
 	if (wdev_to_wvif(wvif->wdev, 1)) {
 		struct wfx_vif *wvif_ch1 = wdev_to_wvif(wvif->wdev, 1);
 		struct ieee80211_vif *vif_ch1 = wvif_to_vif(wvif_ch1);
 
-		chan1 = vif_ch1->bss_conf.chandef.chan;
+		chan1 = vif_ch1->bss_conf.chanreq.oper.chan;
 	}
 	if (chan0 && chan1 && vif->type != NL80211_IFTYPE_AP) {
 		if (chan0->hw_value == chan1->hw_value) {
diff --git a/drivers/net/wireless/ti/wlcore/main.c b/drivers/net/wireless/ti/wlcore/main.c
index 5736acb4d206..ef12169f8044 100644
--- a/drivers/net/wireless/ti/wlcore/main.c
+++ b/drivers/net/wireless/ti/wlcore/main.c
@@ -2910,7 +2910,7 @@ static int wlcore_set_assoc(struct wl1271 *wl, struct wl12xx_vif *wlvif,
 	int ret;
 
 	wlvif->aid = vif->cfg.aid;
-	wlvif->channel_type = cfg80211_get_chandef_type(&bss_conf->chandef);
+	wlvif->channel_type = cfg80211_get_chandef_type(&bss_conf->chanreq.oper);
 	wlvif->beacon_int = bss_conf->beacon_int;
 	wlvif->wmm_enabled = bss_conf->qos;
 
@@ -4242,7 +4242,7 @@ static void wl1271_bss_info_changed_ap(struct wl1271 *wl,
 
 	/* Handle HT information change */
 	if ((changed & BSS_CHANGED_HT) &&
-	    (bss_conf->chandef.width != NL80211_CHAN_WIDTH_20_NOHT)) {
+	    (bss_conf->chanreq.oper.width != NL80211_CHAN_WIDTH_20_NOHT)) {
 		ret = wl1271_acx_set_ht_information(wl, wlvif,
 					bss_conf->ht_operation_mode);
 		if (ret < 0) {
@@ -4515,7 +4515,7 @@ static void wl1271_bss_info_changed_sta(struct wl1271 *wl,
 	/* Handle new association with HT. Do this after join. */
 	if (sta_exists) {
 		bool enabled =
-			bss_conf->chandef.width != NL80211_CHAN_WIDTH_20_NOHT;
+			bss_conf->chanreq.oper.width != NL80211_CHAN_WIDTH_20_NOHT;
 
 		ret = wlcore_hw_set_peer_cap(wl,
 					     &sta_ht_cap,
diff --git a/drivers/staging/vt6655/device_main.c b/drivers/staging/vt6655/device_main.c
index e23a5da2b67e..283804b49e91 100644
--- a/drivers/staging/vt6655/device_main.c
+++ b/drivers/staging/vt6655/device_main.c
@@ -1515,7 +1515,7 @@ static void vnt_bss_info_changed(struct ieee80211_hw *hw,
 
 	if (changed & BSS_CHANGED_TXPOWER)
 		RFbSetPower(priv, priv->wCurrentRate,
-			    conf->chandef.chan->hw_value);
+			    conf->chanreq.oper.chan->hw_value);
 
 	if (changed & BSS_CHANGED_BEACON_ENABLED) {
 		dev_dbg(&priv->pcid->dev,
diff --git a/drivers/staging/vt6656/main_usb.c b/drivers/staging/vt6656/main_usb.c
index 6c70493d1b01..7bbed462f062 100644
--- a/drivers/staging/vt6656/main_usb.c
+++ b/drivers/staging/vt6656/main_usb.c
@@ -794,7 +794,7 @@ static void vnt_bss_info_changed(struct ieee80211_hw *hw,
 		vnt_set_bss_mode(priv);
 
 	if (changed & (BSS_CHANGED_TXPOWER | BSS_CHANGED_BANDWIDTH))
-		vnt_rf_setpower(priv, conf->chandef.chan);
+		vnt_rf_setpower(priv, conf->chanreq.oper.chan);
 
 	if (changed & BSS_CHANGED_BEACON_ENABLED) {
 		dev_dbg(&priv->usb->dev,
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 62c4b4d10bb4..dd8a66e9afd9 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -223,6 +223,14 @@ enum ieee80211_chanctx_change {
 	IEEE80211_CHANCTX_CHANGE_MIN_WIDTH	= BIT(4),
 };
 
+/**
+ * struct ieee80211_chan_req - A channel "request"
+ * @oper: channel definition to use for operation
+ */
+struct ieee80211_chan_req {
+	struct cfg80211_chan_def oper;
+};
+
 /**
  * struct ieee80211_chanctx_conf - channel context that vifs may be tuned to
  *
@@ -583,7 +591,7 @@ struct ieee80211_fils_discovery {
  * @mcast_rate: per-band multicast rate index + 1 (0: disabled)
  * @bssid: The BSSID for this BSS
  * @enable_beacon: whether beaconing should be enabled or not
- * @chandef: Channel definition for this BSS -- the hardware might be
+ * @chanreq: Channel request for this BSS -- the hardware might be
  *	configured a higher bandwidth than this BSS uses, for example.
  * @mu_group: VHT MU-MIMO group membership data
  * @ht_operation_mode: HT operation mode like in &struct ieee80211_ht_operation.
@@ -716,7 +724,7 @@ struct ieee80211_bss_conf {
 	u32 cqm_rssi_hyst;
 	s32 cqm_rssi_low;
 	s32 cqm_rssi_high;
-	struct cfg80211_chan_def chandef;
+	struct ieee80211_chan_req chanreq;
 	struct ieee80211_mu_group_data mu_group;
 	bool qos;
 	bool hidden_ssid;
diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c
index b8a278355e18..21d55dc539f6 100644
--- a/net/mac80211/agg-tx.c
+++ b/net/mac80211/agg-tx.c
@@ -616,7 +616,7 @@ int ieee80211_start_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid,
 		return -EINVAL;
 
 	if (!pubsta->deflink.ht_cap.ht_supported &&
-	    sta->sdata->vif.bss_conf.chandef.chan->band != NL80211_BAND_6GHZ)
+	    sta->sdata->vif.bss_conf.chanreq.oper.chan->band != NL80211_BAND_6GHZ)
 		return -EINVAL;
 
 	if (WARN_ON_ONCE(!local->ops->ampdu_action))
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index d3bf029709d5..5aa02b0872d9 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -886,11 +886,13 @@ static int ieee80211_set_monitor_channel(struct wiphy *wiphy,
 {
 	struct ieee80211_local *local = wiphy_priv(wiphy);
 	struct ieee80211_sub_if_data *sdata;
+	struct ieee80211_chan_req chanreq = { .oper = *chandef };
 	int ret;
 
 	lockdep_assert_wiphy(local->hw.wiphy);
 
-	if (cfg80211_chandef_identical(&local->monitor_chandef, chandef))
+	if (cfg80211_chandef_identical(&local->monitor_chanreq.oper,
+				       &chanreq.oper))
 		return 0;
 
 	sdata = wiphy_dereference(local->hw.wiphy,
@@ -898,17 +900,17 @@ static int ieee80211_set_monitor_channel(struct wiphy *wiphy,
 	if (!sdata)
 		goto done;
 
-	if (cfg80211_chandef_identical(&sdata->vif.bss_conf.chandef, chandef))
+	if (cfg80211_chandef_identical(&sdata->vif.bss_conf.chanreq.oper,
+				       &chanreq.oper))
 		return 0;
 
 	ieee80211_link_release_channel(&sdata->deflink);
-	ret = ieee80211_link_use_channel(&sdata->deflink,
-					 chandef,
+	ret = ieee80211_link_use_channel(&sdata->deflink, &chanreq,
 					 IEEE80211_CHANCTX_EXCLUSIVE);
 	if (ret)
 		return ret;
 done:
-	local->monitor_chandef = *chandef;
+	local->monitor_chanreq = chanreq;
 	return 0;
 }
 
@@ -1257,6 +1259,7 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
 	unsigned int link_id = params->beacon.link_id;
 	struct ieee80211_link_data *link;
 	struct ieee80211_bss_conf *link_conf;
+	struct ieee80211_chan_req chanreq = { .oper = params->chandef };
 
 	lockdep_assert_wiphy(local->hw.wiphy);
 
@@ -1369,7 +1372,7 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
 			return err;
 	}
 
-	err = ieee80211_link_use_channel(link, &params->chandef,
+	err = ieee80211_link_use_channel(link, &chanreq,
 					 IEEE80211_CHANCTX_SHARED);
 	if (!err)
 		ieee80211_link_copy_chanctx_to_vlans(link, false);
@@ -1626,7 +1629,7 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev,
 					  BSS_CHANGED_BEACON_ENABLED);
 
 	if (sdata->wdev.cac_started) {
-		chandef = link_conf->chandef;
+		chandef = link_conf->chanreq.oper;
 		wiphy_delayed_work_cancel(wiphy, &link->dfs_cac_timer_work);
 		cfg80211_cac_event(sdata->dev, &chandef,
 				   NL80211_RADAR_CAC_ABORTED,
@@ -1826,7 +1829,7 @@ static int sta_link_apply_parameters(struct ieee80211_local *local,
 
 	if (params->supported_rates &&
 	    params->supported_rates_len) {
-		ieee80211_parse_bitrates(link->conf->chandef.width,
+		ieee80211_parse_bitrates(link->conf->chanreq.oper.width,
 					 sband, params->supported_rates,
 					 params->supported_rates_len,
 					 &link_sta->pub->supp_rates[sband->band]);
@@ -2602,6 +2605,7 @@ static int ieee80211_join_mesh(struct wiphy *wiphy, struct net_device *dev,
 			       const struct mesh_setup *setup)
 {
 	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct ieee80211_chan_req chanreq = { .oper = setup->chandef };
 	struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
 	int err;
 
@@ -2618,7 +2622,7 @@ static int ieee80211_join_mesh(struct wiphy *wiphy, struct net_device *dev,
 	sdata->deflink.smps_mode = IEEE80211_SMPS_OFF;
 	sdata->deflink.needed_rx_chains = sdata->local->rx_chains;
 
-	err = ieee80211_link_use_channel(&sdata->deflink, &setup->chandef,
+	err = ieee80211_link_use_channel(&sdata->deflink, &chanreq,
 					 IEEE80211_CHANCTX_SHARED);
 	if (err)
 		return err;
@@ -2661,7 +2665,7 @@ static int ieee80211_change_bss(struct wiphy *wiphy,
 		return -EINVAL;
 
 	if (params->basic_rates) {
-		if (!ieee80211_parse_bitrates(link->conf->chandef.width,
+		if (!ieee80211_parse_bitrates(link->conf->chanreq.oper.width,
 					      wiphy->bands[sband->band],
 					      params->basic_rates,
 					      params->basic_rates_len,
@@ -3176,7 +3180,7 @@ int __ieee80211_request_smps_mgd(struct ieee80211_sub_if_data *sdata,
 	 * the new value until we associate.
 	 */
 	if (!sdata->u.mgd.associated ||
-	    link->conf->chandef.width == NL80211_CHAN_WIDTH_20_NOHT)
+	    link->conf->chanreq.oper.width == NL80211_CHAN_WIDTH_20_NOHT)
 		return 0;
 
 	ap = sdata->vif.cfg.ap_addr;
@@ -3331,9 +3335,11 @@ static int ieee80211_set_bitrate_mask(struct wiphy *wiphy,
 	 * so at a basic rate so that all clients can receive it.
 	 */
 	if (rcu_access_pointer(sdata->vif.bss_conf.chanctx_conf) &&
-	    sdata->vif.bss_conf.chandef.chan) {
+	    sdata->vif.bss_conf.chanreq.oper.chan) {
 		u32 basic_rates = sdata->vif.bss_conf.basic_rates;
-		enum nl80211_band band = sdata->vif.bss_conf.chandef.chan->band;
+		enum nl80211_band band;
+
+		band = sdata->vif.bss_conf.chanreq.oper.chan->band;
 
 		if (!(mask->control[band].legacy & basic_rates))
 			return -EINVAL;
@@ -3385,6 +3391,7 @@ static int ieee80211_start_radar_detection(struct wiphy *wiphy,
 					   u32 cac_time_ms)
 {
 	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct ieee80211_chan_req chanreq = { .oper = *chandef };
 	struct ieee80211_local *local = sdata->local;
 	int err;
 
@@ -3399,7 +3406,7 @@ static int ieee80211_start_radar_detection(struct wiphy *wiphy,
 	sdata->deflink.smps_mode = IEEE80211_SMPS_OFF;
 	sdata->deflink.needed_rx_chains = local->rx_chains;
 
-	err = ieee80211_link_use_channel(&sdata->deflink, chandef,
+	err = ieee80211_link_use_channel(&sdata->deflink, &chanreq,
 					 IEEE80211_CHANCTX_SHARED);
 	if (err)
 		goto out_unlock;
@@ -3651,8 +3658,8 @@ static int __ieee80211_csa_finalize(struct ieee80211_link_data *link_data)
 		return ieee80211_link_use_reserved_context(&sdata->deflink);
 	}
 
-	if (!cfg80211_chandef_identical(&link_data->conf->chandef,
-					&link_data->csa_chandef))
+	if (!cfg80211_chandef_identical(&link_data->conf->chanreq.oper,
+					&link_data->csa_chanreq.oper))
 		return -EINVAL;
 
 	sdata->vif.bss_conf.csa_active = false;
@@ -3679,7 +3686,7 @@ static int __ieee80211_csa_finalize(struct ieee80211_link_data *link_data)
 	if (err)
 		return err;
 
-	cfg80211_ch_switch_notify(sdata->dev, &link_data->csa_chandef,
+	cfg80211_ch_switch_notify(sdata->dev, &link_data->csa_chanreq.oper,
 				  link_data->link_id,
 				  link_data->conf->eht_puncturing);
 
@@ -3814,7 +3821,7 @@ static int ieee80211_set_csa_beacon(struct ieee80211_sub_if_data *sdata,
 		struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
 
 		/* changes into another band are not supported */
-		if (sdata->vif.bss_conf.chandef.chan->band !=
+		if (sdata->vif.bss_conf.chanreq.oper.chan->band !=
 		    params->chandef.chan->band)
 			return -EINVAL;
 
@@ -3862,6 +3869,7 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev,
 			   struct cfg80211_csa_settings *params)
 {
 	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct ieee80211_chan_req chanreq = { .oper = params->chandef };
 	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_channel_switch ch_switch;
 	struct ieee80211_chanctx_conf *conf;
@@ -3877,8 +3885,8 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev,
 	if (sdata->wdev.cac_started)
 		return -EBUSY;
 
-	if (cfg80211_chandef_identical(&params->chandef,
-				       &sdata->vif.bss_conf.chandef))
+	if (cfg80211_chandef_identical(&chanreq.oper,
+				       &sdata->vif.bss_conf.chanreq.oper))
 		return -EINVAL;
 
 	/* don't allow another channel switch if one is already active. */
@@ -3903,14 +3911,14 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev,
 	ch_switch.timestamp = 0;
 	ch_switch.device_timestamp = 0;
 	ch_switch.block_tx = params->block_tx;
-	ch_switch.chandef = params->chandef;
+	ch_switch.chandef = chanreq.oper;
 	ch_switch.count = params->count;
 
 	err = drv_pre_channel_switch(sdata, &ch_switch);
 	if (err)
 		goto out;
 
-	err = ieee80211_link_reserve_chanctx(&sdata->deflink, &params->chandef,
+	err = ieee80211_link_reserve_chanctx(&sdata->deflink, &chanreq,
 					     chanctx->mode,
 					     params->radar_required);
 	if (err)
@@ -3936,7 +3944,7 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev,
 	if (params->punct_bitmap && !sdata->vif.bss_conf.eht_support)
 		goto out;
 
-	sdata->deflink.csa_chandef = params->chandef;
+	sdata->deflink.csa_chanreq = chanreq; 
 	sdata->deflink.csa_block_tx = params->block_tx;
 	sdata->vif.bss_conf.csa_active = true;
 	sdata->vif.bss_conf.csa_punct_bitmap = params->punct_bitmap;
@@ -3946,14 +3954,15 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev,
 					  IEEE80211_QUEUE_STOP_REASON_CSA);
 
 	cfg80211_ch_switch_started_notify(sdata->dev,
-					  &sdata->deflink.csa_chandef, 0,
+					  &sdata->deflink.csa_chanreq.oper, 0,
 					  params->count, params->block_tx,
 					  sdata->vif.bss_conf.csa_punct_bitmap);
 
 	if (changed) {
 		ieee80211_link_info_change_notify(sdata, &sdata->deflink,
 						  changed);
-		drv_channel_switch_beacon(sdata, &params->chandef);
+		drv_channel_switch_beacon(sdata,
+					  &sdata->deflink.csa_chanreq.oper);
 	} else {
 		/* if the beacon didn't change, we can finalize immediately */
 		ieee80211_csa_finalize(&sdata->deflink);
@@ -4206,12 +4215,12 @@ static int ieee80211_cfg_get_channel(struct wiphy *wiphy,
 
 	chanctx_conf = rcu_dereference(link->conf->chanctx_conf);
 	if (chanctx_conf) {
-		*chandef = link->conf->chandef;
+		*chandef = link->conf->chanreq.oper;
 		ret = 0;
 	} else if (local->open_count > 0 &&
 		   local->open_count == local->monitors &&
 		   sdata->vif.type == NL80211_IFTYPE_MONITOR) {
-		*chandef = local->monitor_chandef;
+		*chandef = local->monitor_chanreq.oper;
 		ret = 0;
 	}
 out:
@@ -4259,12 +4268,13 @@ static int ieee80211_set_ap_chanwidth(struct wiphy *wiphy,
 {
 	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	struct ieee80211_link_data *link;
+	struct ieee80211_chan_req chanreq = { .oper = *chandef };
 	int ret;
 	u64 changed = 0;
 
 	link = sdata_dereference(sdata->link[link_id], sdata);
 
-	ret = ieee80211_link_change_bandwidth(link, chandef, &changed);
+	ret = ieee80211_link_change_chanreq(link, &chanreq, &changed);
 	if (ret == 0)
 		ieee80211_link_info_change_notify(sdata, link, changed);
 
diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c
index 6b82c79cf7a6..f1cef332e4db 100644
--- a/net/mac80211/chan.c
+++ b/net/mac80211/chan.c
@@ -81,87 +81,97 @@ ieee80211_link_get_chanctx(struct ieee80211_link_data *link)
 	return container_of(conf, struct ieee80211_chanctx, conf);
 }
 
-static const struct cfg80211_chan_def *
-ieee80211_chanctx_reserved_chandef(struct ieee80211_local *local,
+static const struct ieee80211_chan_req *
+ieee80211_chanreq_compatible(const struct ieee80211_chan_req *a,
+			     const struct ieee80211_chan_req *b)
+{
+	const struct cfg80211_chan_def *compat;
+
+	compat = cfg80211_chandef_compatible(&a->oper, &b->oper);
+
+	if (compat == &a->oper)
+		return a;
+
+	if (compat == &b->oper)
+		return b;
+
+	WARN_ON(compat);
+	return NULL;
+}
+
+static const struct ieee80211_chan_req *
+ieee80211_chanctx_compatible(struct ieee80211_chanctx *ctx,
+			     const struct ieee80211_chan_req *req,
+			     struct ieee80211_chan_req *tmp)
+{
+	*tmp = (struct ieee80211_chan_req){
+		.oper = ctx->conf.def,
+	};
+
+	return ieee80211_chanreq_compatible(tmp, req);
+}
+
+static const struct ieee80211_chan_req *
+ieee80211_chanctx_reserved_chanreq(struct ieee80211_local *local,
 				   struct ieee80211_chanctx *ctx,
-				   const struct cfg80211_chan_def *compat)
+				   const struct ieee80211_chan_req *req)
 {
 	struct ieee80211_link_data *link;
 
 	lockdep_assert_wiphy(local->hw.wiphy);
 
-	if (WARN_ON(!compat))
+	if (WARN_ON(!req))
 		return NULL;
 
-	list_for_each_entry(link, &ctx->reserved_links,
-			    reserved_chanctx_list) {
-		compat = cfg80211_chandef_compatible(&link->reserved_chandef,
-						     compat);
-		if (!compat)
+	list_for_each_entry(link, &ctx->reserved_links, reserved_chanctx_list) {
+		req = ieee80211_chanreq_compatible(&link->reserved, req);
+		if (!req)
 			break;
 	}
 
-	return compat;
+	return req;
 }
 
-static const struct cfg80211_chan_def *
+static const struct ieee80211_chan_req *
 ieee80211_chanctx_non_reserved_chandef(struct ieee80211_local *local,
 				       struct ieee80211_chanctx *ctx,
-				       const struct cfg80211_chan_def *compat)
+				       const struct ieee80211_chan_req *compat)
 {
 	struct ieee80211_link_data *link;
+	const struct ieee80211_chan_req *comp_def = compat;
 
 	lockdep_assert_wiphy(local->hw.wiphy);
 
-	list_for_each_entry(link, &ctx->assigned_links,
-			    assigned_chanctx_list) {
+	list_for_each_entry(link, &ctx->assigned_links, assigned_chanctx_list) {
 		struct ieee80211_bss_conf *link_conf = link->conf;
 
 		if (link->reserved_chanctx)
 			continue;
 
-		if (!compat)
-			compat = &link_conf->chandef;
-
-		compat = cfg80211_chandef_compatible(
-				&link_conf->chandef, compat);
-		if (!compat)
+		comp_def = ieee80211_chanreq_compatible(&link_conf->chanreq,
+							comp_def);
+		if (!comp_def)
 			break;
 	}
 
-	return compat;
-}
-
-static const struct cfg80211_chan_def *
-ieee80211_chanctx_combined_chandef(struct ieee80211_local *local,
-				   struct ieee80211_chanctx *ctx,
-				   const struct cfg80211_chan_def *compat)
-{
-	lockdep_assert_wiphy(local->hw.wiphy);
-
-	compat = ieee80211_chanctx_reserved_chandef(local, ctx, compat);
-	if (!compat)
-		return NULL;
-
-	compat = ieee80211_chanctx_non_reserved_chandef(local, ctx, compat);
-	if (!compat)
-		return NULL;
-
-	return compat;
+	return comp_def;
 }
 
 static bool
-ieee80211_chanctx_can_reserve_chandef(struct ieee80211_local *local,
-				      struct ieee80211_chanctx *ctx,
-				      const struct cfg80211_chan_def *def)
+ieee80211_chanctx_can_reserve(struct ieee80211_local *local,
+			      struct ieee80211_chanctx *ctx,
+			      const struct ieee80211_chan_req *req)
 {
 	lockdep_assert_wiphy(local->hw.wiphy);
 
-	if (ieee80211_chanctx_combined_chandef(local, ctx, def))
-		return true;
+	if (!ieee80211_chanctx_reserved_chanreq(local, ctx, req))
+		return false;
+
+	if (!ieee80211_chanctx_non_reserved_chandef(local, ctx, req))
+		return false;
 
 	if (!list_empty(&ctx->reserved_links) &&
-	    ieee80211_chanctx_reserved_chandef(local, ctx, def))
+	    ieee80211_chanctx_reserved_chanreq(local, ctx, req))
 		return true;
 
 	return false;
@@ -169,7 +179,7 @@ ieee80211_chanctx_can_reserve_chandef(struct ieee80211_local *local,
 
 static struct ieee80211_chanctx *
 ieee80211_find_reservation_chanctx(struct ieee80211_local *local,
-				   const struct cfg80211_chan_def *chandef,
+				   const struct ieee80211_chan_req *chanreq,
 				   enum ieee80211_chanctx_mode mode)
 {
 	struct ieee80211_chanctx *ctx;
@@ -186,8 +196,7 @@ ieee80211_find_reservation_chanctx(struct ieee80211_local *local,
 		if (ctx->mode == IEEE80211_CHANCTX_EXCLUSIVE)
 			continue;
 
-		if (!ieee80211_chanctx_can_reserve_chandef(local, ctx,
-							   chandef))
+		if (!ieee80211_chanctx_can_reserve(local, ctx, chanreq))
 			continue;
 
 		return ctx;
@@ -290,7 +299,7 @@ ieee80211_get_chanctx_vif_max_required_bw(struct ieee80211_sub_if_data *sdata,
 			 * point, so take the width from the chandef, but
 			 * account also for TDLS peers
 			 */
-			width = max(link->conf->chandef.width,
+			width = max(link->conf->chanreq.oper.width,
 				    ieee80211_get_max_required_bw(sdata, link_id));
 			break;
 		case NL80211_IFTYPE_P2P_DEVICE:
@@ -299,7 +308,7 @@ ieee80211_get_chanctx_vif_max_required_bw(struct ieee80211_sub_if_data *sdata,
 		case NL80211_IFTYPE_ADHOC:
 		case NL80211_IFTYPE_MESH_POINT:
 		case NL80211_IFTYPE_OCB:
-			width = link->conf->chandef.width;
+			width = link->conf->chanreq.oper.width;
 			break;
 		case NL80211_IFTYPE_WDS:
 		case NL80211_IFTYPE_UNSPECIFIED:
@@ -395,7 +404,7 @@ _ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local,
 }
 
 /* calling this function is assuming that station vif is updated to
- * lates changes by calling ieee80211_link_update_chandef
+ * lates changes by calling ieee80211_link_update_chanreq
  */
 static void ieee80211_chan_bw_change(struct ieee80211_local *local,
 				     struct ieee80211_chanctx *ctx,
@@ -475,9 +484,10 @@ void ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local,
 static void _ieee80211_change_chanctx(struct ieee80211_local *local,
 				      struct ieee80211_chanctx *ctx,
 				      struct ieee80211_chanctx *old_ctx,
-				      const struct cfg80211_chan_def *chandef,
+				      const struct ieee80211_chan_req *chanreq,
 				      struct ieee80211_link_data *rsvd_for)
 {
+	const struct cfg80211_chan_def *chandef = &chanreq->oper;
 	u32 changed;
 
 	/* expected to handle only 20/40/80/160/320 channel widths */
@@ -526,16 +536,17 @@ static void _ieee80211_change_chanctx(struct ieee80211_local *local,
 static void ieee80211_change_chanctx(struct ieee80211_local *local,
 				     struct ieee80211_chanctx *ctx,
 				     struct ieee80211_chanctx *old_ctx,
-				     const struct cfg80211_chan_def *chandef)
+				     const struct ieee80211_chan_req *chanreq)
 {
-	_ieee80211_change_chanctx(local, ctx, old_ctx, chandef, NULL);
+	_ieee80211_change_chanctx(local, ctx, old_ctx, chanreq, NULL);
 }
 
 static struct ieee80211_chanctx *
 ieee80211_find_chanctx(struct ieee80211_local *local,
-		       const struct cfg80211_chan_def *chandef,
+		       const struct ieee80211_chan_req *chanreq,
 		       enum ieee80211_chanctx_mode mode)
 {
+	struct ieee80211_chan_req tmp;
 	struct ieee80211_chanctx *ctx;
 
 	lockdep_assert_wiphy(local->hw.wiphy);
@@ -544,7 +555,7 @@ ieee80211_find_chanctx(struct ieee80211_local *local,
 		return NULL;
 
 	list_for_each_entry(ctx, &local->chanctx_list, list) {
-		const struct cfg80211_chan_def *compat;
+		const struct ieee80211_chan_req *compat;
 
 		if (ctx->replace_state != IEEE80211_CHANCTX_REPLACE_NONE)
 			continue;
@@ -552,11 +563,11 @@ ieee80211_find_chanctx(struct ieee80211_local *local,
 		if (ctx->mode == IEEE80211_CHANCTX_EXCLUSIVE)
 			continue;
 
-		compat = cfg80211_chandef_compatible(&ctx->conf.def, chandef);
+		compat = ieee80211_chanctx_compatible(ctx, chanreq, &tmp);
 		if (!compat)
 			continue;
 
-		compat = ieee80211_chanctx_reserved_chandef(local, ctx,
+		compat = ieee80211_chanctx_reserved_chanreq(local, ctx,
 							    compat);
 		if (!compat)
 			continue;
@@ -636,7 +647,7 @@ ieee80211_chanctx_radar_required(struct ieee80211_local *local,
 
 static struct ieee80211_chanctx *
 ieee80211_alloc_chanctx(struct ieee80211_local *local,
-			const struct cfg80211_chan_def *chandef,
+			const struct ieee80211_chan_req *chanreq,
 			enum ieee80211_chanctx_mode mode)
 {
 	struct ieee80211_chanctx *ctx;
@@ -649,7 +660,7 @@ ieee80211_alloc_chanctx(struct ieee80211_local *local,
 
 	INIT_LIST_HEAD(&ctx->assigned_links);
 	INIT_LIST_HEAD(&ctx->reserved_links);
-	ctx->conf.def = *chandef;
+	ctx->conf.def = chanreq->oper;
 	ctx->conf.rx_chains_static = 1;
 	ctx->conf.rx_chains_dynamic = 1;
 	ctx->mode = mode;
@@ -685,7 +696,7 @@ static int ieee80211_add_chanctx(struct ieee80211_local *local,
 
 static struct ieee80211_chanctx *
 ieee80211_new_chanctx(struct ieee80211_local *local,
-		      const struct cfg80211_chan_def *chandef,
+		      const struct ieee80211_chan_req *chanreq,
 		      enum ieee80211_chanctx_mode mode)
 {
 	struct ieee80211_chanctx *ctx;
@@ -693,7 +704,7 @@ ieee80211_new_chanctx(struct ieee80211_local *local,
 
 	lockdep_assert_wiphy(local->hw.wiphy);
 
-	ctx = ieee80211_alloc_chanctx(local, chandef, mode);
+	ctx = ieee80211_alloc_chanctx(local, chanreq, mode);
 	if (!ctx)
 		return ERR_PTR(-ENOMEM);
 
@@ -737,6 +748,7 @@ void ieee80211_recalc_chanctx_chantype(struct ieee80211_local *local,
 	struct ieee80211_chanctx_conf *conf = &ctx->conf;
 	struct ieee80211_sub_if_data *sdata;
 	const struct cfg80211_chan_def *compat = NULL;
+	struct ieee80211_chan_req chanreq = {};
 	struct sta_info *sta;
 
 	lockdep_assert_wiphy(local->hw.wiphy);
@@ -762,9 +774,9 @@ void ieee80211_recalc_chanctx_chantype(struct ieee80211_local *local,
 				continue;
 
 			if (!compat)
-				compat = &link_conf->chandef;
+				compat = &link_conf->chanreq.oper;
 
-			compat = cfg80211_chandef_compatible(&link_conf->chandef,
+			compat = cfg80211_chandef_compatible(&link_conf->chanreq.oper,
 							     compat);
 			if (WARN_ON_ONCE(!compat))
 				break;
@@ -794,7 +806,9 @@ void ieee80211_recalc_chanctx_chantype(struct ieee80211_local *local,
 	if (!compat)
 		return;
 
-	ieee80211_change_chanctx(local, ctx, ctx, compat);
+	chanreq.oper = *compat;
+
+	ieee80211_change_chanctx(local, ctx, ctx, &chanreq);
 }
 
 static void ieee80211_recalc_radar_chanctx(struct ieee80211_local *local,
@@ -1050,7 +1064,7 @@ int ieee80211_link_unreserve_chanctx(struct ieee80211_link_data *link)
 }
 
 int ieee80211_link_reserve_chanctx(struct ieee80211_link_data *link,
-				   const struct cfg80211_chan_def *chandef,
+				   const struct ieee80211_chan_req *chanreq,
 				   enum ieee80211_chanctx_mode mode,
 				   bool radar_required)
 {
@@ -1064,10 +1078,10 @@ int ieee80211_link_reserve_chanctx(struct ieee80211_link_data *link,
 	if (curr_ctx && !local->ops->switch_vif_chanctx)
 		return -EOPNOTSUPP;
 
-	new_ctx = ieee80211_find_reservation_chanctx(local, chandef, mode);
+	new_ctx = ieee80211_find_reservation_chanctx(local, chanreq, mode);
 	if (!new_ctx) {
 		if (ieee80211_can_create_new_chanctx(local)) {
-			new_ctx = ieee80211_new_chanctx(local, chandef, mode);
+			new_ctx = ieee80211_new_chanctx(local, chanreq, mode);
 			if (IS_ERR(new_ctx))
 				return PTR_ERR(new_ctx);
 		} else {
@@ -1121,7 +1135,7 @@ int ieee80211_link_reserve_chanctx(struct ieee80211_link_data *link,
 			    !list_empty(&curr_ctx->reserved_links))
 				return -EBUSY;
 
-			new_ctx = ieee80211_alloc_chanctx(local, chandef, mode);
+			new_ctx = ieee80211_alloc_chanctx(local, chanreq, mode);
 			if (!new_ctx)
 				return -ENOMEM;
 
@@ -1139,7 +1153,7 @@ int ieee80211_link_reserve_chanctx(struct ieee80211_link_data *link,
 
 	list_add(&link->reserved_chanctx_list, &new_ctx->reserved_links);
 	link->reserved_chanctx = new_ctx;
-	link->reserved_chandef = *chandef;
+	link->reserved = *chanreq;
 	link->reserved_radar_required = radar_required;
 	link->reserved_ready = false;
 
@@ -1178,14 +1192,14 @@ ieee80211_link_chanctx_reservation_complete(struct ieee80211_link_data *link)
 }
 
 static void
-ieee80211_link_update_chandef(struct ieee80211_link_data *link,
-			      const struct cfg80211_chan_def *chandef)
+ieee80211_link_update_chanreq(struct ieee80211_link_data *link,
+			      const struct ieee80211_chan_req *chanreq)
 {
 	struct ieee80211_sub_if_data *sdata = link->sdata;
 	unsigned int link_id = link->link_id;
 	struct ieee80211_sub_if_data *vlan;
 
-	link->conf->chandef = *chandef;
+	link->conf->chanreq = *chanreq;
 
 	if (sdata->vif.type != NL80211_IFTYPE_AP)
 		return;
@@ -1198,7 +1212,7 @@ ieee80211_link_update_chandef(struct ieee80211_link_data *link,
 		if (WARN_ON(!vlan_conf))
 			continue;
 
-		vlan_conf->chandef = *chandef;
+		vlan_conf->chanreq = *chanreq;
 	}
 	rcu_read_unlock();
 }
@@ -1211,7 +1225,7 @@ ieee80211_link_use_reserved_reassign(struct ieee80211_link_data *link)
 	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_vif_chanctx_switch vif_chsw[1] = {};
 	struct ieee80211_chanctx *old_ctx, *new_ctx;
-	const struct cfg80211_chan_def *chandef;
+	const struct ieee80211_chan_req *chanreq;
 	u64 changed = 0;
 	int err;
 
@@ -1233,17 +1247,17 @@ ieee80211_link_use_reserved_reassign(struct ieee80211_link_data *link)
 		    IEEE80211_CHANCTX_REPLACES_OTHER))
 		return -EINVAL;
 
-	chandef = ieee80211_chanctx_non_reserved_chandef(local, new_ctx,
-				&link->reserved_chandef);
-	if (WARN_ON(!chandef))
+	chanreq = ieee80211_chanctx_non_reserved_chandef(local, new_ctx,
+							 &link->reserved);
+	if (WARN_ON(!chanreq))
 		return -EINVAL;
 
-	if (link_conf->chandef.width != link->reserved_chandef.width)
+	if (link_conf->chanreq.oper.width != link->reserved.oper.width)
 		changed = BSS_CHANGED_BANDWIDTH;
 
-	ieee80211_link_update_chandef(link, &link->reserved_chandef);
+	ieee80211_link_update_chanreq(link, &link->reserved);
 
-	_ieee80211_change_chanctx(local, new_ctx, old_ctx, chandef, link);
+	_ieee80211_change_chanctx(local, new_ctx, old_ctx, chanreq, link);
 
 	vif_chsw[0].vif = &sdata->vif;
 	vif_chsw[0].old_ctx = &old_ctx->conf;
@@ -1291,7 +1305,7 @@ ieee80211_link_use_reserved_assign(struct ieee80211_link_data *link)
 	struct ieee80211_sub_if_data *sdata = link->sdata;
 	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_chanctx *old_ctx, *new_ctx;
-	const struct cfg80211_chan_def *chandef;
+	const struct ieee80211_chan_req *chanreq;
 	int err;
 
 	old_ctx = ieee80211_link_get_chanctx(link);
@@ -1310,12 +1324,12 @@ ieee80211_link_use_reserved_assign(struct ieee80211_link_data *link)
 		    IEEE80211_CHANCTX_REPLACES_OTHER))
 		return -EINVAL;
 
-	chandef = ieee80211_chanctx_non_reserved_chandef(local, new_ctx,
-				&link->reserved_chandef);
-	if (WARN_ON(!chandef))
+	chanreq = ieee80211_chanctx_non_reserved_chandef(local, new_ctx,
+							 &link->reserved);
+	if (WARN_ON(!chanreq))
 		return -EINVAL;
 
-	ieee80211_change_chanctx(local, new_ctx, new_ctx, chandef);
+	ieee80211_change_chanctx(local, new_ctx, new_ctx, chanreq);
 
 	list_del(&link->reserved_chanctx_list);
 	link->reserved_chanctx = NULL;
@@ -1589,10 +1603,10 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local)
 
 			link->radar_required = link->reserved_radar_required;
 
-			if (link_conf->chandef.width != link->reserved_chandef.width)
+			if (link_conf->chanreq.oper.width != link->reserved.oper.width)
 				changed = BSS_CHANGED_BANDWIDTH;
 
-			ieee80211_link_update_chandef(link, &link->reserved_chandef);
+			ieee80211_link_update_chanreq(link, &link->reserved);
 			if (changed)
 				ieee80211_link_info_change_notify(sdata,
 								  link,
@@ -1727,7 +1741,7 @@ static void __ieee80211_link_release_channel(struct ieee80211_link_data *link)
 }
 
 int ieee80211_link_use_channel(struct ieee80211_link_data *link,
-			       const struct cfg80211_chan_def *chandef,
+			       const struct ieee80211_chan_req *chanreq,
 			       enum ieee80211_chanctx_mode mode)
 {
 	struct ieee80211_sub_if_data *sdata = link->sdata;
@@ -1740,36 +1754,36 @@ int ieee80211_link_use_channel(struct ieee80211_link_data *link,
 
 	if (sdata->vif.active_links &&
 	    !(sdata->vif.active_links & BIT(link->link_id))) {
-		ieee80211_link_update_chandef(link, chandef);
+		ieee80211_link_update_chanreq(link, chanreq);
 		return 0;
 	}
 
 	ret = cfg80211_chandef_dfs_required(local->hw.wiphy,
-					    chandef,
+					    &chanreq->oper,
 					    sdata->wdev.iftype);
 	if (ret < 0)
 		goto out;
 	if (ret > 0)
-		radar_detect_width = BIT(chandef->width);
+		radar_detect_width = BIT(chanreq->oper.width);
 
 	link->radar_required = ret;
 
-	ret = ieee80211_check_combinations(sdata, chandef, mode,
+	ret = ieee80211_check_combinations(sdata, &chanreq->oper, mode,
 					   radar_detect_width);
 	if (ret < 0)
 		goto out;
 
 	__ieee80211_link_release_channel(link);
 
-	ctx = ieee80211_find_chanctx(local, chandef, mode);
+	ctx = ieee80211_find_chanctx(local, chanreq, mode);
 	if (!ctx)
-		ctx = ieee80211_new_chanctx(local, chandef, mode);
+		ctx = ieee80211_new_chanctx(local, chanreq, mode);
 	if (IS_ERR(ctx)) {
 		ret = PTR_ERR(ctx);
 		goto out;
 	}
 
-	ieee80211_link_update_chandef(link, chandef);
+	ieee80211_link_update_chanreq(link, chanreq);
 
 	ret = ieee80211_assign_link_chanctx(link, ctx);
 	if (ret) {
@@ -1849,28 +1863,33 @@ int ieee80211_link_use_reserved_context(struct ieee80211_link_data *link)
 	return 0;
 }
 
-int ieee80211_link_change_bandwidth(struct ieee80211_link_data *link,
-				    const struct cfg80211_chan_def *chandef,
-				    u64 *changed)
+int ieee80211_link_change_chanreq(struct ieee80211_link_data *link,
+				  const struct ieee80211_chan_req *chanreq,
+				  u64 *changed)
 {
 	struct ieee80211_sub_if_data *sdata = link->sdata;
 	struct ieee80211_bss_conf *link_conf = link->conf;
 	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_chanctx_conf *conf;
 	struct ieee80211_chanctx *ctx;
-	const struct cfg80211_chan_def *compat;
+	const struct ieee80211_chan_req *compat;
+	struct ieee80211_chan_req tmp;
 
 	lockdep_assert_wiphy(local->hw.wiphy);
 
-	if (!cfg80211_chandef_usable(sdata->local->hw.wiphy, chandef,
+	if (!cfg80211_chandef_usable(sdata->local->hw.wiphy,
+				     &chanreq->oper,
 				     IEEE80211_CHAN_DISABLED))
 		return -EINVAL;
 
-	if (cfg80211_chandef_identical(chandef, &link_conf->chandef))
+	/* for non-HT 20 MHz the rest doesn't matter */
+	if (chanreq->oper.width == NL80211_CHAN_WIDTH_20_NOHT &&
+	    cfg80211_chandef_identical(&chanreq->oper, &link_conf->chanreq.oper))
 		return 0;
 
-	if (chandef->width == NL80211_CHAN_WIDTH_20_NOHT ||
-	    link_conf->chandef.width == NL80211_CHAN_WIDTH_20_NOHT)
+	/* but you cannot switch to/from it */
+	if (chanreq->oper.width == NL80211_CHAN_WIDTH_20_NOHT ||
+	    link_conf->chanreq.oper.width == NL80211_CHAN_WIDTH_20_NOHT)
 		return -EINVAL;
 
 	conf = rcu_dereference_protected(link_conf->chanctx_conf,
@@ -1880,13 +1899,13 @@ int ieee80211_link_change_bandwidth(struct ieee80211_link_data *link,
 
 	ctx = container_of(conf, struct ieee80211_chanctx, conf);
 
-	compat = cfg80211_chandef_compatible(&conf->def, chandef);
+	compat = ieee80211_chanctx_compatible(ctx, chanreq, &tmp);
 	if (!compat)
 		return -EINVAL;
 
 	switch (ctx->replace_state) {
 	case IEEE80211_CHANCTX_REPLACE_NONE:
-		if (!ieee80211_chanctx_reserved_chandef(local, ctx, compat))
+		if (!ieee80211_chanctx_reserved_chanreq(local, ctx, compat))
 			return -EBUSY;
 		break;
 	case IEEE80211_CHANCTX_WILL_BE_REPLACED:
@@ -1901,7 +1920,7 @@ int ieee80211_link_change_bandwidth(struct ieee80211_link_data *link,
 		break;
 	}
 
-	ieee80211_link_update_chandef(link, chandef);
+	ieee80211_link_update_chanreq(link, chanreq);
 
 	ieee80211_recalc_chanctx_chantype(local, ctx);
 
diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c
index bccc99a00383..c3330aea4da3 100644
--- a/net/mac80211/ht.c
+++ b/net/mac80211/ht.c
@@ -257,7 +257,7 @@ bool ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_sub_if_data *sdata,
 	if (WARN_ON(!link_conf))
 		width = NL80211_CHAN_WIDTH_20_NOHT;
 	else
-		width = link_conf->chandef.width;
+		width = link_conf->chanreq.oper.width;
 
 	switch (width) {
 	default:
diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c
index c23f46218af7..27cc9ddd0432 100644
--- a/net/mac80211/ibss.c
+++ b/net/mac80211/ibss.c
@@ -223,7 +223,7 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
 	struct ieee80211_mgmt *mgmt;
 	struct cfg80211_bss *bss;
 	u64 bss_change;
-	struct cfg80211_chan_def chandef;
+	struct ieee80211_chan_req chanreq = {};
 	struct ieee80211_channel *chan;
 	struct beacon_data *presp;
 	struct cfg80211_inform_bss bss_meta = {};
@@ -257,22 +257,22 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
 		kfree_rcu(presp, rcu_head);
 
 	/* make a copy of the chandef, it could be modified below. */
-	chandef = *req_chandef;
-	chan = chandef.chan;
-	if (!cfg80211_reg_can_beacon(local->hw.wiphy, &chandef,
+	chanreq.oper = *req_chandef;
+	chan = chanreq.oper.chan;
+	if (!cfg80211_reg_can_beacon(local->hw.wiphy, &chanreq.oper,
 				     NL80211_IFTYPE_ADHOC)) {
-		if (chandef.width == NL80211_CHAN_WIDTH_5 ||
-		    chandef.width == NL80211_CHAN_WIDTH_10 ||
-		    chandef.width == NL80211_CHAN_WIDTH_20_NOHT ||
-		    chandef.width == NL80211_CHAN_WIDTH_20) {
+		if (chanreq.oper.width == NL80211_CHAN_WIDTH_5 ||
+		    chanreq.oper.width == NL80211_CHAN_WIDTH_10 ||
+		    chanreq.oper.width == NL80211_CHAN_WIDTH_20_NOHT ||
+		    chanreq.oper.width == NL80211_CHAN_WIDTH_20) {
 			sdata_info(sdata,
 				   "Failed to join IBSS, beacons forbidden\n");
 			return;
 		}
-		chandef.width = NL80211_CHAN_WIDTH_20;
-		chandef.center_freq1 = chan->center_freq;
+		chanreq.oper.width = NL80211_CHAN_WIDTH_20;
+		chanreq.oper.center_freq1 = chan->center_freq;
 		/* check again for downgraded chandef */
-		if (!cfg80211_reg_can_beacon(local->hw.wiphy, &chandef,
+		if (!cfg80211_reg_can_beacon(local->hw.wiphy, &chanreq.oper,
 					     NL80211_IFTYPE_ADHOC)) {
 			sdata_info(sdata,
 				   "Failed to join IBSS, beacons forbidden\n");
@@ -281,7 +281,7 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
 	}
 
 	err = cfg80211_chandef_dfs_required(sdata->local->hw.wiphy,
-					    &chandef, NL80211_IFTYPE_ADHOC);
+					    &chanreq.oper, NL80211_IFTYPE_ADHOC);
 	if (err < 0) {
 		sdata_info(sdata,
 			   "Failed to join IBSS, invalid chandef\n");
@@ -295,7 +295,7 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
 
 	radar_required = err;
 
-	if (ieee80211_link_use_channel(&sdata->deflink, &chandef,
+	if (ieee80211_link_use_channel(&sdata->deflink, &chanreq,
 				       ifibss->fixed_channel ?
 					IEEE80211_CHANCTX_SHARED :
 					IEEE80211_CHANCTX_EXCLUSIVE)) {
@@ -307,7 +307,7 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
 	memcpy(ifibss->bssid, bssid, ETH_ALEN);
 
 	presp = ieee80211_ibss_build_presp(sdata, beacon_int, basic_rates,
-					   capability, tsf, &chandef,
+					   capability, tsf, &chanreq.oper,
 					   &have_higher_than_11mbit, NULL);
 	if (!presp)
 		return;
@@ -533,12 +533,12 @@ int ieee80211_ibss_finish_csa(struct ieee80211_sub_if_data *sdata, u64 *changed)
 					IEEE80211_PRIVACY(ifibss->privacy));
 		/* XXX: should not really modify cfg80211 data */
 		if (cbss) {
-			cbss->channel = sdata->deflink.csa_chandef.chan;
+			cbss->channel = sdata->deflink.csa_chanreq.oper.chan;
 			cfg80211_put_bss(sdata->local->hw.wiphy, cbss);
 		}
 	}
 
-	ifibss->chandef = sdata->deflink.csa_chandef;
+	ifibss->chandef = sdata->deflink.csa_chanreq.oper;
 
 	/* generate the beacon */
 	return ieee80211_ibss_csa_beacon(sdata, NULL, changed);
@@ -799,7 +799,7 @@ ieee80211_ibss_process_chanswitch(struct ieee80211_sub_if_data *sdata,
 		goto disconnect;
 
 	params.count = csa_ie.count;
-	params.chandef = csa_ie.chandef;
+	params.chandef = csa_ie.chanreq.oper;
 
 	switch (ifibss->chandef.width) {
 	case NL80211_CHAN_WIDTH_20_NOHT:
@@ -858,7 +858,7 @@ ieee80211_ibss_process_chanswitch(struct ieee80211_sub_if_data *sdata,
 	params.radar_required = err;
 
 	if (cfg80211_chandef_identical(&params.chandef,
-				       &sdata->vif.bss_conf.chandef)) {
+				       &sdata->vif.bss_conf.chanreq.oper)) {
 		ibss_dbg(sdata,
 			 "received csa with an identical chandef, ignoring\n");
 		return true;
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index cb4684a9451e..70c48cad180a 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -884,6 +884,9 @@ struct ieee80211_chanctx {
 	enum ieee80211_chanctx_mode mode;
 	bool driver_present;
 
+	/* temporary data for search algorithm etc. */
+	struct ieee80211_chan_req req;
+
 	struct ieee80211_chanctx_conf conf;
 };
 
@@ -1035,7 +1038,7 @@ struct ieee80211_link_data {
 
 	bool operating_11g_mode;
 
-	struct cfg80211_chan_def csa_chandef;
+	struct ieee80211_chan_req csa_chanreq;
 
 	struct wiphy_work color_change_finalize_work;
 	struct delayed_work color_collision_detect_work;
@@ -1043,7 +1046,7 @@ struct ieee80211_link_data {
 
 	/* context reservation -- protected with wiphy mutex */
 	struct ieee80211_chanctx *reserved_chanctx;
-	struct cfg80211_chan_def reserved_chandef;
+	struct ieee80211_chan_req reserved;
 	bool reserved_radar_required;
 	bool reserved_ready;
 
@@ -1574,7 +1577,7 @@ struct ieee80211_local {
 
 	/* virtual monitor interface */
 	struct ieee80211_sub_if_data __rcu *monitor_sdata;
-	struct cfg80211_chan_def monitor_chandef;
+	struct ieee80211_chan_req monitor_chanreq;
 
 	/* extended capabilities provided by mac80211 */
 	u8 ext_capa[8];
@@ -1639,7 +1642,7 @@ ieee80211_get_link_sband(struct ieee80211_link_data *link)
 
 /* this struct holds the value parsing from channel switch IE  */
 struct ieee80211_csa_ie {
-	struct cfg80211_chan_def chandef;
+	struct ieee80211_chan_req chanreq;
 	u8 mode;
 	u8 count;
 	u8 ttl;
@@ -2523,11 +2526,11 @@ void ieee80211_chandef_downgrade(struct cfg80211_chan_def *chandef,
 
 int __must_check
 ieee80211_link_use_channel(struct ieee80211_link_data *link,
-			   const struct cfg80211_chan_def *chandef,
+			   const struct ieee80211_chan_req *req,
 			   enum ieee80211_chanctx_mode mode);
 int __must_check
 ieee80211_link_reserve_chanctx(struct ieee80211_link_data *link,
-			       const struct cfg80211_chan_def *chandef,
+			       const struct ieee80211_chan_req *req,
 			       enum ieee80211_chanctx_mode mode,
 			       bool radar_required);
 int __must_check
@@ -2535,9 +2538,9 @@ ieee80211_link_use_reserved_context(struct ieee80211_link_data *link);
 int ieee80211_link_unreserve_chanctx(struct ieee80211_link_data *link);
 
 int __must_check
-ieee80211_link_change_bandwidth(struct ieee80211_link_data *link,
-				const struct cfg80211_chan_def *chandef,
-				u64 *changed);
+ieee80211_link_change_chanreq(struct ieee80211_link_data *link,
+			      const struct ieee80211_chan_req *req,
+			      u64 *changed);
 void ieee80211_link_release_channel(struct ieee80211_link_data *link);
 void ieee80211_link_vlan_copy_chanctx(struct ieee80211_link_data *link);
 void ieee80211_link_copy_chanctx_to_vlans(struct ieee80211_link_data *link,
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index d81162bf7d48..227c8dc3fbe5 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -557,7 +557,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do
 				  &sdata->deflink.dfs_cac_timer_work);
 
 	if (sdata->wdev.cac_started) {
-		chandef = sdata->vif.bss_conf.chandef;
+		chandef = sdata->vif.bss_conf.chanreq.oper;
 		WARN_ON(local->suspended);
 		ieee80211_link_release_channel(&sdata->deflink);
 		cfg80211_cac_event(sdata->dev, &chandef,
@@ -1164,7 +1164,7 @@ int ieee80211_add_virtual_monitor(struct ieee80211_local *local)
 	rcu_assign_pointer(local->monitor_sdata, sdata);
 	mutex_unlock(&local->iflist_mtx);
 
-	ret = ieee80211_link_use_channel(&sdata->deflink, &local->monitor_chandef,
+	ret = ieee80211_link_use_channel(&sdata->deflink, &local->monitor_chanreq,
 					 IEEE80211_CHANCTX_EXCLUSIVE);
 	if (ret) {
 		mutex_lock(&local->iflist_mtx);
@@ -1252,7 +1252,7 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up)
 		sdata->vif.cab_queue = master->vif.cab_queue;
 		memcpy(sdata->vif.hw_queue, master->vif.hw_queue,
 		       sizeof(sdata->vif.hw_queue));
-		sdata->vif.bss_conf.chandef = master->vif.bss_conf.chandef;
+		sdata->vif.bss_conf.chanreq = master->vif.bss_conf.chanreq;
 
 		sdata->crypto_tx_tailroom_needed_cnt +=
 			master->crypto_tx_tailroom_needed_cnt;
diff --git a/net/mac80211/link.c b/net/mac80211/link.c
index c0d05efcf6f8..2231eb38a211 100644
--- a/net/mac80211/link.c
+++ b/net/mac80211/link.c
@@ -402,7 +402,8 @@ static int _ieee80211_set_active_links(struct ieee80211_sub_if_data *sdata,
 
 		link = sdata_dereference(sdata->link[link_id], sdata);
 
-		ret = ieee80211_link_use_channel(link, &link->conf->chandef,
+		ret = ieee80211_link_use_channel(link,
+						 &link->conf->chanreq,
 						 IEEE80211_CHANCTX_SHARED);
 		WARN_ON_ONCE(ret);
 
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index ce0cba8d7afc..879abe216a3e 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -1229,7 +1229,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
 						&sband->channels[i],
 						NL80211_CHAN_NO_HT);
 			/* init channel we're on */
-			local->monitor_chandef = dflt_chandef;
+			local->monitor_chanreq.oper = dflt_chandef;
 			if (local->emulate_chanctx) {
 				local->dflt_chandef = dflt_chandef;
 				local->hw.conf.chandef = dflt_chandef;
diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
index 60dc453acb5a..e06d9ed2d31b 100644
--- a/net/mac80211/mesh.c
+++ b/net/mac80211/mesh.c
@@ -97,7 +97,7 @@ bool mesh_matches_local(struct ieee80211_sub_if_data *sdata,
 	if (sdata->vif.bss_conf.basic_rates != basic_rates)
 		return false;
 
-	cfg80211_chandef_create(&sta_chan_def, sdata->vif.bss_conf.chandef.chan,
+	cfg80211_chandef_create(&sta_chan_def, sdata->vif.bss_conf.chanreq.oper.chan,
 				NL80211_CHAN_NO_HT);
 	ieee80211_chandef_ht_oper(ie->ht_operation, &sta_chan_def);
 
@@ -111,7 +111,7 @@ bool mesh_matches_local(struct ieee80211_sub_if_data *sdata,
 				       ie->eht_operation,
 				       &sta_chan_def);
 
-	if (!cfg80211_chandef_compatible(&sdata->vif.bss_conf.chandef,
+	if (!cfg80211_chandef_compatible(&sdata->vif.bss_conf.chanreq.oper,
 					 &sta_chan_def))
 		return false;
 
@@ -436,9 +436,9 @@ int mesh_add_ht_cap_ie(struct ieee80211_sub_if_data *sdata,
 		return 0;
 
 	if (!sband->ht_cap.ht_supported ||
-	    sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT ||
-	    sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_5 ||
-	    sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_10)
+	    sdata->vif.bss_conf.chanreq.oper.width == NL80211_CHAN_WIDTH_20_NOHT ||
+	    sdata->vif.bss_conf.chanreq.oper.width == NL80211_CHAN_WIDTH_5 ||
+	    sdata->vif.bss_conf.chanreq.oper.width == NL80211_CHAN_WIDTH_10)
 		return 0;
 
 	if (skb_tailroom(skb) < 2 + sizeof(struct ieee80211_ht_cap))
@@ -477,16 +477,16 @@ int mesh_add_ht_oper_ie(struct ieee80211_sub_if_data *sdata,
 		return 0;
 
 	if (!ht_cap->ht_supported ||
-	    sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT ||
-	    sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_5 ||
-	    sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_10)
+	    sdata->vif.bss_conf.chanreq.oper.width == NL80211_CHAN_WIDTH_20_NOHT ||
+	    sdata->vif.bss_conf.chanreq.oper.width == NL80211_CHAN_WIDTH_5 ||
+	    sdata->vif.bss_conf.chanreq.oper.width == NL80211_CHAN_WIDTH_10)
 		return 0;
 
 	if (skb_tailroom(skb) < 2 + sizeof(struct ieee80211_ht_operation))
 		return -ENOMEM;
 
 	pos = skb_put(skb, 2 + sizeof(struct ieee80211_ht_operation));
-	ieee80211_ie_build_ht_oper(pos, ht_cap, &sdata->vif.bss_conf.chandef,
+	ieee80211_ie_build_ht_oper(pos, ht_cap, &sdata->vif.bss_conf.chanreq.oper,
 				   sdata->vif.bss_conf.ht_operation_mode,
 				   false);
 
@@ -508,9 +508,9 @@ int mesh_add_vht_cap_ie(struct ieee80211_sub_if_data *sdata,
 		return 0;
 
 	if (!sband->vht_cap.vht_supported ||
-	    sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT ||
-	    sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_5 ||
-	    sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_10)
+	    sdata->vif.bss_conf.chanreq.oper.width == NL80211_CHAN_WIDTH_20_NOHT ||
+	    sdata->vif.bss_conf.chanreq.oper.width == NL80211_CHAN_WIDTH_5 ||
+	    sdata->vif.bss_conf.chanreq.oper.width == NL80211_CHAN_WIDTH_10)
 		return 0;
 
 	if (skb_tailroom(skb) < 2 + sizeof(struct ieee80211_vht_cap))
@@ -549,9 +549,9 @@ int mesh_add_vht_oper_ie(struct ieee80211_sub_if_data *sdata,
 		return 0;
 
 	if (!vht_cap->vht_supported ||
-	    sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT ||
-	    sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_5 ||
-	    sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_10)
+	    sdata->vif.bss_conf.chanreq.oper.width == NL80211_CHAN_WIDTH_20_NOHT ||
+	    sdata->vif.bss_conf.chanreq.oper.width == NL80211_CHAN_WIDTH_5 ||
+	    sdata->vif.bss_conf.chanreq.oper.width == NL80211_CHAN_WIDTH_10)
 		return 0;
 
 	if (skb_tailroom(skb) < 2 + sizeof(struct ieee80211_vht_operation))
@@ -559,7 +559,7 @@ int mesh_add_vht_oper_ie(struct ieee80211_sub_if_data *sdata,
 
 	pos = skb_put(skb, 2 + sizeof(struct ieee80211_vht_operation));
 	ieee80211_ie_build_vht_oper(pos, vht_cap,
-				    &sdata->vif.bss_conf.chandef);
+				    &sdata->vif.bss_conf.chanreq.oper);
 
 	return 0;
 }
@@ -578,9 +578,9 @@ int mesh_add_he_cap_ie(struct ieee80211_sub_if_data *sdata,
 	he_cap = ieee80211_get_he_iftype_cap(sband, NL80211_IFTYPE_MESH_POINT);
 
 	if (!he_cap ||
-	    sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT ||
-	    sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_5 ||
-	    sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_10)
+	    sdata->vif.bss_conf.chanreq.oper.width == NL80211_CHAN_WIDTH_20_NOHT ||
+	    sdata->vif.bss_conf.chanreq.oper.width == NL80211_CHAN_WIDTH_5 ||
+	    sdata->vif.bss_conf.chanreq.oper.width == NL80211_CHAN_WIDTH_10)
 		return 0;
 
 	if (skb_tailroom(skb) < ie_len)
@@ -606,20 +606,20 @@ int mesh_add_he_oper_ie(struct ieee80211_sub_if_data *sdata,
 
 	he_cap = ieee80211_get_he_iftype_cap(sband, NL80211_IFTYPE_MESH_POINT);
 	if (!he_cap ||
-	    sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT ||
-	    sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_5 ||
-	    sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_10)
+	    sdata->vif.bss_conf.chanreq.oper.width == NL80211_CHAN_WIDTH_20_NOHT ||
+	    sdata->vif.bss_conf.chanreq.oper.width == NL80211_CHAN_WIDTH_5 ||
+	    sdata->vif.bss_conf.chanreq.oper.width == NL80211_CHAN_WIDTH_10)
 		return 0;
 
 	len = 2 + 1 + sizeof(struct ieee80211_he_operation);
-	if (sdata->vif.bss_conf.chandef.chan->band == NL80211_BAND_6GHZ)
+	if (sdata->vif.bss_conf.chanreq.oper.chan->band == NL80211_BAND_6GHZ)
 		len += sizeof(struct ieee80211_he_6ghz_oper);
 
 	if (skb_tailroom(skb) < len)
 		return -ENOMEM;
 
 	pos = skb_put(skb, len);
-	ieee80211_ie_build_he_oper(pos, &sdata->vif.bss_conf.chandef);
+	ieee80211_ie_build_he_oper(pos, &sdata->vif.bss_conf.chanreq.oper);
 
 	return 0;
 }
@@ -659,9 +659,9 @@ int mesh_add_eht_cap_ie(struct ieee80211_sub_if_data *sdata,
 	he_cap = ieee80211_get_he_iftype_cap(sband, NL80211_IFTYPE_MESH_POINT);
 	eht_cap = ieee80211_get_eht_iftype_cap(sband, NL80211_IFTYPE_MESH_POINT);
 	if (!he_cap || !eht_cap ||
-	    sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT ||
-	    sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_5 ||
-	    sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_10)
+	    sdata->vif.bss_conf.chanreq.oper.width == NL80211_CHAN_WIDTH_20_NOHT ||
+	    sdata->vif.bss_conf.chanreq.oper.width == NL80211_CHAN_WIDTH_5 ||
+	    sdata->vif.bss_conf.chanreq.oper.width == NL80211_CHAN_WIDTH_10)
 		return 0;
 
 	if (skb_tailroom(skb) < ie_len)
@@ -686,9 +686,9 @@ int mesh_add_eht_oper_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *sk
 
 	eht_cap = ieee80211_get_eht_iftype_cap(sband, NL80211_IFTYPE_MESH_POINT);
 	if (!eht_cap ||
-	    sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT ||
-	    sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_5 ||
-	    sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_10)
+	    sdata->vif.bss_conf.chanreq.oper.width == NL80211_CHAN_WIDTH_20_NOHT ||
+	    sdata->vif.bss_conf.chanreq.oper.width == NL80211_CHAN_WIDTH_5 ||
+	    sdata->vif.bss_conf.chanreq.oper.width == NL80211_CHAN_WIDTH_10)
 		return 0;
 
 	len = 2 + 1 + offsetof(struct ieee80211_eht_operation, optional) +
@@ -698,7 +698,7 @@ int mesh_add_eht_oper_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *sk
 		return -ENOMEM;
 
 	pos = skb_put(skb, len);
-	ieee80211_ie_build_eht_oper(pos, &sdata->vif.bss_conf.chandef, eht_cap);
+	ieee80211_ie_build_eht_oper(pos, &sdata->vif.bss_conf.chanreq.oper, eht_cap);
 
 	return 0;
 }
@@ -746,9 +746,9 @@ ieee80211_mesh_update_bss_params(struct ieee80211_sub_if_data *sdata,
 		return;
 
 	if (!ieee80211_get_he_iftype_cap(sband, NL80211_IFTYPE_MESH_POINT) ||
-	    sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT ||
-	    sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_5 ||
-	    sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_10)
+	    sdata->vif.bss_conf.chanreq.oper.width == NL80211_CHAN_WIDTH_20_NOHT ||
+	    sdata->vif.bss_conf.chanreq.oper.width == NL80211_CHAN_WIDTH_5 ||
+	    sdata->vif.bss_conf.chanreq.oper.width == NL80211_CHAN_WIDTH_10)
 		return;
 
 	sdata->vif.bss_conf.he_support = true;
@@ -1277,11 +1277,12 @@ static void ieee80211_mesh_csa_mark_radar(struct ieee80211_sub_if_data *sdata)
 	 * unavailable.
 	 */
 	err = cfg80211_chandef_dfs_required(sdata->local->hw.wiphy,
-					    &sdata->vif.bss_conf.chandef,
+					    &sdata->vif.bss_conf.chanreq.oper,
 					    NL80211_IFTYPE_MESH_POINT);
 	if (err > 0)
 		cfg80211_radar_event(sdata->local->hw.wiphy,
-				     &sdata->vif.bss_conf.chandef, GFP_ATOMIC);
+				     &sdata->vif.bss_conf.chanreq.oper,
+				     GFP_ATOMIC);
 }
 
 static bool
@@ -1302,7 +1303,7 @@ ieee80211_mesh_process_chnswitch(struct ieee80211_sub_if_data *sdata,
 	if (!sband)
 		return false;
 
-	switch (sdata->vif.bss_conf.chandef.width) {
+	switch (sdata->vif.bss_conf.chanreq.oper.width) {
 	case NL80211_CHAN_WIDTH_20_NOHT:
 		conn.mode = IEEE80211_CONN_MODE_LEGACY;
 		conn.bw_limit = IEEE80211_CONN_BW_LIMIT_20;
@@ -1339,7 +1340,7 @@ ieee80211_mesh_process_chnswitch(struct ieee80211_sub_if_data *sdata,
 	if (csa_ie.reason_code == WLAN_REASON_MESH_CHAN_REGULATORY)
 		ieee80211_mesh_csa_mark_radar(sdata);
 
-	params.chandef = csa_ie.chandef;
+	params.chandef = csa_ie.chanreq.oper;
 	params.count = csa_ie.count;
 
 	if (!cfg80211_chandef_usable(sdata->local->hw.wiphy, &params.chandef,
@@ -1375,7 +1376,7 @@ ieee80211_mesh_process_chnswitch(struct ieee80211_sub_if_data *sdata,
 	params.radar_required = err;
 
 	if (cfg80211_chandef_identical(&params.chandef,
-				       &sdata->vif.bss_conf.chandef)) {
+				       &sdata->vif.bss_conf.chanreq.oper)) {
 		mcsa_dbg(sdata,
 			 "received csa with an identical chandef, ignoring\n");
 		return true;
@@ -1555,7 +1556,7 @@ int ieee80211_mesh_finish_csa(struct ieee80211_sub_if_data *sdata, u64 *changed)
 	*changed |= BSS_CHANGED_BEACON;
 
 	mcsa_dbg(sdata, "complete switching to center freq %d MHz",
-		 sdata->vif.bss_conf.chandef.chan->center_freq);
+		 sdata->vif.bss_conf.chanreq.oper.chan->center_freq);
 	return 0;
 }
 
diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c
index 28bf794f67f8..e3aad60f68ab 100644
--- a/net/mac80211/mesh_plink.c
+++ b/net/mac80211/mesh_plink.c
@@ -163,7 +163,7 @@ static u64 mesh_set_ht_prot_mode(struct ieee80211_sub_if_data *sdata)
 	u16 ht_opmode;
 	bool non_ht_sta = false, ht20_sta = false;
 
-	switch (sdata->vif.bss_conf.chandef.width) {
+	switch (sdata->vif.bss_conf.chanreq.oper.width) {
 	case NL80211_CHAN_WIDTH_20_NOHT:
 	case NL80211_CHAN_WIDTH_5:
 	case NL80211_CHAN_WIDTH_10:
@@ -196,7 +196,7 @@ static u64 mesh_set_ht_prot_mode(struct ieee80211_sub_if_data *sdata)
 	if (non_ht_sta)
 		ht_opmode = IEEE80211_HT_OP_MODE_PROTECTION_NONHT_MIXED;
 	else if (ht20_sta &&
-		 sdata->vif.bss_conf.chandef.width > NL80211_CHAN_WIDTH_20)
+		 sdata->vif.bss_conf.chanreq.oper.width > NL80211_CHAN_WIDTH_20)
 		ht_opmode = IEEE80211_HT_OP_MODE_PROTECTION_20MHZ;
 	else
 		ht_opmode = IEEE80211_HT_OP_MODE_PROTECTION_NONE;
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 9968bc0ddf6e..e9d720f25ddf 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -139,7 +139,7 @@ ieee80211_handle_puncturing_bitmap(struct ieee80211_link_data *link,
 				   const struct ieee80211_eht_operation *eht_oper,
 				   u16 bitmap, u64 *changed)
 {
-	struct cfg80211_chan_def *chandef = &link->conf->chandef;
+	struct cfg80211_chan_def *chandef = &link->conf->chanreq.oper;
 	struct ieee80211_local *local = link->sdata->local;
 	u16 extracted;
 	u64 _changed = 0;
@@ -862,8 +862,9 @@ static int ieee80211_config_bw(struct ieee80211_link_data *link,
 			       struct ieee802_11_elems *elems,
 			       u64 *changed)
 {
-	struct ieee80211_channel *channel = link->conf->chandef.chan;
+	struct ieee80211_channel *channel = link->conf->chanreq.oper.chan;
 	struct ieee80211_sub_if_data *sdata = link->sdata;
+	struct ieee80211_chan_req chanreq = {};
 	struct cfg80211_chan_def ap_chandef;
 	enum ieee80211_conn_mode ap_mode;
 	u32 vht_cap_info = 0;
@@ -913,7 +914,7 @@ static int ieee80211_config_bw(struct ieee80211_link_data *link,
 			ieee80211_min_bw_limit_from_chandef(&ap_chandef))
 		ieee80211_chandef_downgrade(&ap_chandef, NULL);
 
-	if (cfg80211_chandef_identical(&ap_chandef, &link->conf->chandef))
+	if (cfg80211_chandef_identical(&ap_chandef, &link->conf->chanreq.oper))
 		return 0;
 
 	link_info(link,
@@ -946,8 +947,9 @@ static int ieee80211_config_bw(struct ieee80211_link_data *link,
 	 * bandwidth changes where a this could happen, but those cases are
 	 * less common and wouldn't completely prevent using the AP.
 	 */
+	chanreq.oper = ap_chandef;
 
-	ret = ieee80211_link_change_bandwidth(link, &ap_chandef, changed);
+	ret = ieee80211_link_change_chanreq(link, &chanreq, changed);
 	if (ret) {
 		sdata_info(sdata,
 			   "AP %pM changed bandwidth to incompatible one - disconnect\n",
@@ -2069,8 +2071,8 @@ static void ieee80211_chswitch_work(struct wiphy *wiphy,
 		return;
 	}
 
-	if (!cfg80211_chandef_identical(&link->conf->chandef,
-					&link->csa_chandef)) {
+	if (!cfg80211_chandef_identical(&link->conf->chanreq.oper,
+					&link->csa_chanreq.oper)) {
 		sdata_info(sdata,
 			   "failed to finalize channel switch, disconnecting\n");
 		wiphy_work_queue(sdata->local->hw.wiphy,
@@ -2118,7 +2120,7 @@ static void ieee80211_chswitch_post_beacon(struct ieee80211_link_data *link)
 		return;
 	}
 
-	cfg80211_ch_switch_notify(sdata->dev, &link->reserved_chandef,
+	cfg80211_ch_switch_notify(sdata->dev, &link->reserved.oper,
 				  link->link_id, 0);
 }
 
@@ -2211,7 +2213,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_link_data *link,
 		ch_switch.timestamp = timestamp;
 		ch_switch.device_timestamp = device_timestamp;
 		ch_switch.block_tx = csa_ie.mode;
-		ch_switch.chandef = csa_ie.chandef;
+		ch_switch.chandef = csa_ie.chanreq.oper;
 		ch_switch.count = csa_ie.count;
 		ch_switch.delay = csa_ie.max_switch_time;
 	}
@@ -2231,34 +2233,36 @@ ieee80211_sta_process_chanswitch(struct ieee80211_link_data *link,
 		return;
 	}
 
-	if (link->conf->chandef.chan->band !=
-	    csa_ie.chandef.chan->band) {
+	if (link->conf->chanreq.oper.chan->band !=
+	    csa_ie.chanreq.oper.chan->band) {
 		sdata_info(sdata,
 			   "AP %pM switches to different band (%d MHz, width:%d, CF1/2: %d/%d MHz), disconnecting\n",
 			   link->u.mgd.bssid,
-			   csa_ie.chandef.chan->center_freq,
-			   csa_ie.chandef.width, csa_ie.chandef.center_freq1,
-			   csa_ie.chandef.center_freq2);
+			   csa_ie.chanreq.oper.chan->center_freq,
+			   csa_ie.chanreq.oper.width,
+			   csa_ie.chanreq.oper.center_freq1,
+			   csa_ie.chanreq.oper.center_freq2);
 		goto drop_connection;
 	}
 
-	if (!cfg80211_chandef_usable(local->hw.wiphy, &csa_ie.chandef,
+	if (!cfg80211_chandef_usable(local->hw.wiphy, &csa_ie.chanreq.oper,
 				     IEEE80211_CHAN_DISABLED)) {
 		sdata_info(sdata,
 			   "AP %pM switches to unsupported channel "
 			   "(%d.%03d MHz, width:%d, CF1/2: %d.%03d/%d MHz), "
 			   "disconnecting\n",
 			   link->u.mgd.bssid,
-			   csa_ie.chandef.chan->center_freq,
-			   csa_ie.chandef.chan->freq_offset,
-			   csa_ie.chandef.width, csa_ie.chandef.center_freq1,
-			   csa_ie.chandef.freq1_offset,
-			   csa_ie.chandef.center_freq2);
+			   csa_ie.chanreq.oper.chan->center_freq,
+			   csa_ie.chanreq.oper.chan->freq_offset,
+			   csa_ie.chanreq.oper.width,
+			   csa_ie.chanreq.oper.center_freq1,
+			   csa_ie.chanreq.oper.freq1_offset,
+			   csa_ie.chanreq.oper.center_freq2);
 		goto drop_connection;
 	}
 
-	if (cfg80211_chandef_identical(&csa_ie.chandef,
-				       &link->conf->chandef) &&
+	if (cfg80211_chandef_identical(&csa_ie.chanreq.oper,
+				       &link->conf->chanreq.oper) &&
 	    (!csa_ie.mode || !beacon)) {
 		if (link->u.mgd.csa_ignored_same_chan)
 			return;
@@ -2299,7 +2303,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_link_data *link,
 		goto drop_connection;
 	}
 
-	res = ieee80211_link_reserve_chanctx(link, &csa_ie.chandef,
+	res = ieee80211_link_reserve_chanctx(link, &csa_ie.chanreq,
 					     chanctx->mode, false);
 	if (res) {
 		sdata_info(sdata,
@@ -2309,7 +2313,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_link_data *link,
 	}
 
 	link->conf->csa_active = true;
-	link->csa_chandef = csa_ie.chandef;
+	link->csa_chanreq = csa_ie.chanreq;
 	link->csa_block_tx = csa_ie.mode;
 	link->u.mgd.csa_ignored_same_chan = false;
 	link->u.mgd.beacon_crc_valid = false;
@@ -2318,7 +2322,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_link_data *link,
 		ieee80211_stop_vif_queues(local, sdata,
 					  IEEE80211_QUEUE_STOP_REASON_CSA);
 
-	cfg80211_ch_switch_started_notify(sdata->dev, &csa_ie.chandef,
+	cfg80211_ch_switch_started_notify(sdata->dev, &csa_ie.chanreq.oper,
 					  link->link_id, csa_ie.count,
 					  csa_ie.mode, 0);
 
@@ -2741,7 +2745,7 @@ void ieee80211_dfs_cac_timer_work(struct wiphy *wiphy, struct wiphy_work *work)
 	struct ieee80211_link_data *link =
 		container_of(work, struct ieee80211_link_data,
 			     dfs_cac_timer_work.work);
-	struct cfg80211_chan_def chandef = link->conf->chandef;
+	struct cfg80211_chan_def chandef = link->conf->chanreq.oper;
 	struct ieee80211_sub_if_data *sdata = link->sdata;
 
 	lockdep_assert_wiphy(sdata->local->hw.wiphy);
@@ -4508,11 +4512,11 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link,
 		goto out;
 	}
 
-	if (WARN_ON(!link->conf->chandef.chan)) {
+	if (WARN_ON(!link->conf->chanreq.oper.chan)) {
 		ret = false;
 		goto out;
 	}
-	sband = local->hw.wiphy->bands[link->conf->chandef.chan->band];
+	sband = local->hw.wiphy->bands[link->conf->chanreq.oper.chan->band];
 
 	/* Set up internal HT/VHT capabilities */
 	if (elems->ht_cap_elem && link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_HT)
@@ -4580,7 +4584,7 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link,
 		} else if (is_6ghz) {
 			link_info(link,
 				  "HE 6 GHz operation missing (on %d MHz), expect issues\n",
-				  bss_conf->chandef.chan->center_freq);
+				  bss_conf->chanreq.oper.chan->center_freq);
 		}
 
 		bss_conf->he_support = link_sta->pub->he_cap.has_he;
@@ -5132,8 +5136,8 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata,
 				  struct ieee80211_conn_settings *conn)
 {
 	struct ieee80211_local *local = sdata->local;
-	struct cfg80211_chan_def chandef;
 	bool is_6ghz = cbss->channel->band == NL80211_BAND_6GHZ;
+	struct ieee80211_chan_req chanreq = {};
 	struct ieee802_11_elems *elems;
 	int ret;
 	u32 i;
@@ -5142,7 +5146,7 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata,
 
 	rcu_read_lock();
 	elems = ieee80211_determine_chan_mode(sdata, conn, cbss, link_id,
-					      &chandef);
+					      &chanreq.oper);
 
 	if (IS_ERR(elems)) {
 		rcu_read_unlock();
@@ -5196,17 +5200,18 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata,
 	 * on incompatible channels, e.g. 80+80 and 160 sharing the
 	 * same control channel) try to use a smaller bandwidth.
 	 */
-	ret = ieee80211_link_use_channel(link, &chandef,
+	ret = ieee80211_link_use_channel(link, &chanreq,
 					 IEEE80211_CHANCTX_SHARED);
 
 	/* don't downgrade for 5 and 10 MHz channels, though. */
-	if (chandef.width == NL80211_CHAN_WIDTH_5 ||
-	    chandef.width == NL80211_CHAN_WIDTH_10)
+	if (chanreq.oper.width == NL80211_CHAN_WIDTH_5 ||
+	    chanreq.oper.width == NL80211_CHAN_WIDTH_10)
 		return ret;
 
-	while (ret && chandef.width != NL80211_CHAN_WIDTH_20_NOHT) {
-		ieee80211_chandef_downgrade(&chandef, conn);
-		ret = ieee80211_link_use_channel(link, &chandef,
+	while (ret && chanreq.oper.width != NL80211_CHAN_WIDTH_20_NOHT) {
+		ieee80211_chandef_downgrade(&chanreq.oper, conn);
+
+		ret = ieee80211_link_use_channel(link, &chanreq,
 						 IEEE80211_CHANCTX_SHARED);
 	}
 
@@ -5862,7 +5867,7 @@ static bool ieee80211_config_puncturing(struct ieee80211_link_data *link,
 	}
 
 	extracted = ieee80211_extract_dis_subch_bmap(eht_oper,
-						     &link->conf->chandef,
+						     &link->conf->chanreq.oper,
 						     bitmap);
 
 	/* accept if there are no changes */
@@ -5871,12 +5876,12 @@ static bool ieee80211_config_puncturing(struct ieee80211_link_data *link,
 		return true;
 
 	if (!cfg80211_valid_disable_subchannel_bitmap(&bitmap,
-						      &link->conf->chandef)) {
+						      &link->conf->chanreq.oper)) {
 		link_info(link,
 			  "Got an invalid disable subchannel bitmap from AP %pM: bitmap = 0x%x, bw = 0x%x. disconnect\n",
 			  link->u.mgd.bssid,
 			  bitmap,
-			  link->conf->chandef.width);
+			  link->conf->chanreq.oper.width);
 		return false;
 	}
 
@@ -6573,10 +6578,10 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link,
 		goto free;
 	}
 
-	if (WARN_ON(!link->conf->chandef.chan))
+	if (WARN_ON(!link->conf->chanreq.oper.chan))
 		goto free;
 
-	sband = local->hw.wiphy->bands[link->conf->chandef.chan->band];
+	sband = local->hw.wiphy->bands[link->conf->chanreq.oper.chan->band];
 
 	changed |= ieee80211_recalc_twt_req(sdata, sband, link, link_sta, elems);
 
diff --git a/net/mac80211/ocb.c b/net/mac80211/ocb.c
index 449af4e1cca4..2dd4a2196af4 100644
--- a/net/mac80211/ocb.c
+++ b/net/mac80211/ocb.c
@@ -168,6 +168,7 @@ void ieee80211_ocb_setup_sdata(struct ieee80211_sub_if_data *sdata)
 int ieee80211_ocb_join(struct ieee80211_sub_if_data *sdata,
 		       struct ocb_setup *setup)
 {
+	struct ieee80211_chan_req chanreq = { .oper = setup->chandef };
 	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_if_ocb *ifocb = &sdata->u.ocb;
 	u64 changed = BSS_CHANGED_OCB | BSS_CHANGED_BSSID;
@@ -182,7 +183,7 @@ int ieee80211_ocb_join(struct ieee80211_sub_if_data *sdata,
 	sdata->deflink.smps_mode = IEEE80211_SMPS_OFF;
 	sdata->deflink.needed_rx_chains = sdata->local->rx_chains;
 
-	err = ieee80211_link_use_channel(&sdata->deflink, &setup->chandef,
+	err = ieee80211_link_use_channel(&sdata->deflink, &chanreq,
 					 IEEE80211_CHANCTX_SHARED);
 	if (err)
 		return err;
diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c
index d5ea5f5bcf3a..34e03b9522c8 100644
--- a/net/mac80211/rate.c
+++ b/net/mac80211/rate.c
@@ -4,7 +4,7 @@
  * Copyright 2005-2006, Devicescape Software, Inc.
  * Copyright (c) 2006 Jiri Benc <jbenc@suse.cz>
  * Copyright 2017	Intel Deutschland GmbH
- * Copyright (C) 2022 Intel Corporation
+ * Copyright (C) 2019, 2022-2024 Intel Corporation
  */
 
 #include <linux/kernel.h>
@@ -278,10 +278,10 @@ void ieee80211_check_rate_mask(struct ieee80211_link_data *link)
 	u32 user_mask, basic_rates = link->conf->basic_rates;
 	enum nl80211_band band;
 
-	if (WARN_ON(!link->conf->chandef.chan))
+	if (WARN_ON(!link->conf->chanreq.oper.chan))
 		return;
 
-	band = link->conf->chandef.chan->band;
+	band = link->conf->chanreq.oper.chan->band;
 	if (band == NL80211_BAND_S1GHZ) {
 		/* TODO */
 		return;
@@ -761,7 +761,7 @@ static bool rate_control_cap_mask(struct ieee80211_sub_if_data *sdata,
 	u32 i, flags;
 
 	*mask = sdata->rc_rateidx_mask[sband->band];
-	flags = ieee80211_chandef_rate_flags(&sdata->vif.bss_conf.chandef);
+	flags = ieee80211_chandef_rate_flags(&sdata->vif.bss_conf.chanreq.oper);
 	for (i = 0; i < sband->n_bitrates; i++) {
 		if ((flags & sband->bitrates[i].flags) != flags)
 			*mask &= ~BIT(i);
@@ -817,7 +817,7 @@ rate_control_apply_mask_ratetbl(struct sta_info *sta,
 				   mcs_mask, vht_mask))
 		return;
 
-	chan_width = sta->sdata->vif.bss_conf.chandef.width;
+	chan_width = sta->sdata->vif.bss_conf.chanreq.oper.width;
 	for (i = 0; i < IEEE80211_TX_RATE_TABLE_SIZE; i++) {
 		if (rates->rate[i].idx < 0)
 			break;
@@ -854,7 +854,7 @@ static void rate_control_apply_mask(struct ieee80211_sub_if_data *sdata,
 	 * included in the configured mask and change the rate indexes
 	 * if needed.
 	 */
-	chan_width = sdata->vif.bss_conf.chandef.width;
+	chan_width = sdata->vif.bss_conf.chanreq.oper.width;
 	for (i = 0; i < max_rates; i++) {
 		/* Skip invalid rates */
 		if (rates[i].idx < 0)
diff --git a/net/mac80211/spectmgmt.c b/net/mac80211/spectmgmt.c
index 51efc9bc8168..2b0bf2a1a877 100644
--- a/net/mac80211/spectmgmt.c
+++ b/net/mac80211/spectmgmt.c
@@ -108,26 +108,26 @@ int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata,
 	default:
 		/* secondary_channel_offset was present but is invalid */
 	case IEEE80211_HT_PARAM_CHA_SEC_NONE:
-		cfg80211_chandef_create(&csa_ie->chandef, new_chan,
+		cfg80211_chandef_create(&csa_ie->chanreq.oper, new_chan,
 					NL80211_CHAN_HT20);
 		break;
 	case IEEE80211_HT_PARAM_CHA_SEC_ABOVE:
-		cfg80211_chandef_create(&csa_ie->chandef, new_chan,
+		cfg80211_chandef_create(&csa_ie->chanreq.oper, new_chan,
 					NL80211_CHAN_HT40PLUS);
 		break;
 	case IEEE80211_HT_PARAM_CHA_SEC_BELOW:
-		cfg80211_chandef_create(&csa_ie->chandef, new_chan,
+		cfg80211_chandef_create(&csa_ie->chanreq.oper, new_chan,
 					NL80211_CHAN_HT40MINUS);
 		break;
 	case -1:
-		cfg80211_chandef_create(&csa_ie->chandef, new_chan,
+		cfg80211_chandef_create(&csa_ie->chanreq.oper, new_chan,
 					NL80211_CHAN_NO_HT);
 		/* keep width for 5/10 MHz channels */
-		switch (sdata->vif.bss_conf.chandef.width) {
+		switch (sdata->vif.bss_conf.chanreq.oper.width) {
 		case NL80211_CHAN_WIDTH_5:
 		case NL80211_CHAN_WIDTH_10:
-			csa_ie->chandef.width =
-				sdata->vif.bss_conf.chandef.width;
+			csa_ie->chanreq.oper.width =
+				sdata->vif.bss_conf.chanreq.oper.width;
 			break;
 		default:
 			break;
@@ -137,7 +137,7 @@ int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata,
 
 	if (bwi) {
 		/* start with the CSA one */
-		new_vht_chandef = csa_ie->chandef;
+		new_vht_chandef = csa_ie->chanreq.oper;
 		/* and update the width accordingly */
 		ieee80211_chandef_eht_oper(&bwi->info, &new_vht_chandef);
 	} else if (wide_bw_chansw_ie) {
@@ -159,7 +159,7 @@ int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata,
 		/* default, for the case of IEEE80211_VHT_CHANWIDTH_USE_HT,
 		 * to the previously parsed chandef
 		 */
-		new_vht_chandef = csa_ie->chandef;
+		new_vht_chandef = csa_ie->chanreq.oper;
 
 		/* ignore if parsing fails */
 		if (!ieee80211_chandef_vht_oper(&sdata->local->hw,
@@ -177,13 +177,13 @@ int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata,
 	/* if VHT data is there validate & use it */
 	if (new_vht_chandef.chan) {
 		if (!cfg80211_chandef_compatible(&new_vht_chandef,
-						 &csa_ie->chandef)) {
+						 &csa_ie->chanreq.oper)) {
 			sdata_info(sdata,
 				   "BSS %pM: CSA has inconsistent channel data, disconnecting\n",
 				   bssid);
 			return -EINVAL;
 		}
-		csa_ie->chandef = new_vht_chandef;
+		csa_ie->chanreq.oper = new_vht_chandef;
 	}
 
 	if (elems->max_channel_switch_time)
diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c
index 396fd54d8bf7..0f4aa42e070f 100644
--- a/net/mac80211/tdls.c
+++ b/net/mac80211/tdls.c
@@ -159,7 +159,7 @@ static void ieee80211_tdls_add_oper_classes(struct ieee80211_link_data *link,
 	u8 *pos;
 	u8 op_class;
 
-	if (!ieee80211_chandef_to_operating_class(&link->conf->chandef,
+	if (!ieee80211_chandef_to_operating_class(&link->conf->chanreq.oper,
 						  &op_class))
 		return;
 
@@ -438,7 +438,7 @@ ieee80211_tdls_add_setup_start_ies(struct ieee80211_link_data *link,
 		if (WARN_ON_ONCE(!sta))
 			return;
 
-		sta->tdls_chandef = link->conf->chandef;
+		sta->tdls_chandef = link->conf->chanreq.oper;
 	}
 
 	ieee80211_tdls_add_oper_classes(link, skb);
@@ -638,7 +638,7 @@ ieee80211_tdls_add_setup_cfm_ies(struct ieee80211_link_data *link,
 	if (WARN_ON_ONCE(!sta || !ap_sta))
 		return;
 
-	sta->tdls_chandef = link->conf->chandef;
+	sta->tdls_chandef = link->conf->chanreq.oper;
 
 	/* add any custom IEs that go before the QoS IE */
 	if (extra_ies_len) {
@@ -684,7 +684,7 @@ ieee80211_tdls_add_setup_cfm_ies(struct ieee80211_link_data *link,
 
 		pos = skb_put(skb, 2 + sizeof(struct ieee80211_ht_operation));
 		ieee80211_ie_build_ht_oper(pos, &sta->sta.deflink.ht_cap,
-					   &link->conf->chandef, prot,
+					   &link->conf->chanreq.oper, prot,
 					   true);
 	}
 
diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h
index e2dde3e77c30..efff20d7fe0e 100644
--- a/net/mac80211/trace.h
+++ b/net/mac80211/trace.h
@@ -503,9 +503,9 @@ TRACE_EVENT(drv_link_info_changed,
 		__entry->ht_operation_mode = link_conf->ht_operation_mode;
 		__entry->cqm_rssi_thold = link_conf->cqm_rssi_thold;
 		__entry->cqm_rssi_hyst = link_conf->cqm_rssi_hyst;
-		__entry->channel_width = link_conf->chandef.width;
-		__entry->channel_cfreq1 = link_conf->chandef.center_freq1;
-		__entry->channel_cfreq1_offset = link_conf->chandef.freq1_offset;
+		__entry->channel_width = link_conf->chanreq.oper.width;
+		__entry->channel_cfreq1 = link_conf->chanreq.oper.center_freq1;
+		__entry->channel_cfreq1_offset = link_conf->chanreq.oper.freq1_offset;
 		__entry->qos = link_conf->qos;
 		__entry->hidden_ssid = link_conf->hidden_ssid;
 		__entry->txpower = link_conf->txpower;
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 51c1a99f57b8..49eef33b5e70 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -2309,7 +2309,7 @@ struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata,
 	 * in order to maximize the chance that we get a response.  Some
 	 * badly-behaved APs don't respond when this parameter is included.
 	 */
-	chandef.width = sdata->vif.bss_conf.chandef.width;
+	chandef.width = sdata->vif.bss_conf.chanreq.oper.width;
 	if (flags & IEEE80211_PROBE_FLAG_DIRECTED)
 		chandef.chan = NULL;
 	else
@@ -2351,7 +2351,8 @@ u32 ieee80211_sta_get_rates(struct ieee80211_sub_if_data *sdata,
 	if (WARN_ON(!sband))
 		return 1;
 
-	rate_flags = ieee80211_chandef_rate_flags(&sdata->vif.bss_conf.chandef);
+	rate_flags =
+		ieee80211_chandef_rate_flags(&sdata->vif.bss_conf.chanreq.oper);
 
 	num_rates = sband->n_bitrates;
 	supp_rates = 0;
@@ -4026,7 +4027,8 @@ int ieee80211_add_srates_ie(struct ieee80211_sub_if_data *sdata,
 	u32 basic_rates = sdata->vif.bss_conf.basic_rates;
 	u32 rate_flags;
 
-	rate_flags = ieee80211_chandef_rate_flags(&sdata->vif.bss_conf.chandef);
+	rate_flags =
+		ieee80211_chandef_rate_flags(&sdata->vif.bss_conf.chanreq.oper);
 	sband = local->hw.wiphy->bands[band];
 	rates = 0;
 	for (i = 0; i < sband->n_bitrates; i++) {
@@ -4068,8 +4070,8 @@ int ieee80211_add_ext_srates_ie(struct ieee80211_sub_if_data *sdata,
 	u32 basic_rates = sdata->vif.bss_conf.basic_rates;
 	u32 rate_flags;
 
-	rate_flags = ieee80211_chandef_rate_flags(&sdata->vif.bss_conf.chandef);
-
+	rate_flags =
+		ieee80211_chandef_rate_flags(&sdata->vif.bss_conf.chanreq.oper);
 	sband = local->hw.wiphy->bands[band];
 	exrates = 0;
 	for (i = 0; i < sband->n_bitrates; i++) {
@@ -4312,7 +4314,7 @@ void ieee80211_dfs_cac_cancel(struct ieee80211_local *local)
 					  &sdata->deflink.dfs_cac_timer_work);
 
 		if (sdata->wdev.cac_started) {
-			chandef = sdata->vif.bss_conf.chandef;
+			chandef = sdata->vif.bss_conf.chanreq.oper;
 			ieee80211_link_release_channel(&sdata->deflink);
 			cfg80211_cac_event(sdata->dev,
 					   &chandef,
@@ -4756,7 +4758,7 @@ static u8 ieee80211_chanctx_radar_detect(struct ieee80211_local *local,
 
 	list_for_each_entry(link, &ctx->reserved_links, reserved_chanctx_list)
 		if (link->reserved_radar_required)
-			radar_detect |= BIT(link->reserved_chandef.width);
+			radar_detect |= BIT(link->reserved.oper.width);
 
 	/*
 	 * An in-place reservation context should not have any assigned vifs
@@ -4770,7 +4772,7 @@ static u8 ieee80211_chanctx_radar_detect(struct ieee80211_local *local,
 			continue;
 
 		radar_detect |=
-			BIT(link->conf->chandef.width);
+			BIT(link->conf->chanreq.oper.width);
 	}
 
 	return radar_detect;
diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c
index b3a5c3e96a72..2c475c439ba9 100644
--- a/net/mac80211/vht.c
+++ b/net/mac80211/vht.c
@@ -369,7 +369,7 @@ ieee80211_sta_cap_rx_bw(struct link_sta_info *link_sta)
 		link_conf = rcu_dereference(sdata->vif.link_conf[link_id]);
 
 		if (eht_cap->has_eht &&
-		    link_conf->chandef.chan->band == NL80211_BAND_6GHZ) {
+		    link_conf->chanreq.oper.chan->band == NL80211_BAND_6GHZ) {
 			info = eht_cap->eht_cap_elem.phy_cap_info[0];
 
 			if (info & IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ) {
@@ -380,7 +380,7 @@ ieee80211_sta_cap_rx_bw(struct link_sta_info *link_sta)
 
 		info = he_cap->he_cap_elem.phy_cap_info[0];
 
-		if (link_conf->chandef.chan->band == NL80211_BAND_2GHZ) {
+		if (link_conf->chanreq.oper.chan->band == NL80211_BAND_2GHZ) {
 			if (info & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G)
 				ret = IEEE80211_STA_RX_BW_40;
 			else
@@ -515,7 +515,7 @@ ieee80211_sta_cur_vht_bw(struct link_sta_info *link_sta)
 	if (WARN_ON(!link_conf))
 		bss_width = NL80211_CHAN_WIDTH_20_NOHT;
 	else
-		bss_width = link_conf->chandef.width;
+		bss_width = link_conf->chanreq.oper.width;
 	rcu_read_unlock();
 
 	bw = ieee80211_sta_cap_rx_bw(link_sta);
-- 
cgit v1.2.3


From 761748f001800d925c2ee8b04407e7aee12c3ffb Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 29 Jan 2024 19:34:42 +0100
Subject: wifi: mac80211: support wider bandwidth OFDMA config

EHT requires that stations are able to participate in
wider bandwidth OFDMA, i.e. parse downlink OFDMA and
uplink OFDMA triggers when they're not capable of (or
not connected at) the (wider) bandwidth that the AP
is using. This requires hardware configuration, since
the entity responsible for parsing (possibly hardware)
needs to know the AP bandwidth.

To support this, change the channel request to have
the AP's bandwidth for clients, and track that in the
channel context in mac80211. This means that the same
chandef might need to be split up into two different
contexts, if the APs are different. Interfaces other
than client are not participating in OFDMA the same
way, so they don't request any AP setting.

Note that this doesn't introduce any API to split a
channel context, so that there are cases where this
might lead to a disconnect, e.g. if there are two
client interfaces using the same channel context, e.g.
both 160 MHz connected to different 320 MHz APs, and
one of the APs switches to 160 MHz.

Note also there are possible cases where this can be
optimised, e.g. when using the upper or lower 160 Mhz,
but I haven't been able to really fully understand the
spec and/or hardware limitations.

If, for some reason, there are no hardware limits on
this because the OFDMA (downlink/trigger) parsing is
done in firmware and can take the transmitter into
account, then drivers can set the new flag
IEEE80211_VIF_IGNORE_OFDMA_WIDER_BW on interfaces to
not have them request any AP bandwidth in the channel
context and ignore this issue entirely. The bss_conf
still contains the AP configuration (if any, i.e. EHT)
in the chanreq.

Link: https://msgid.link/20240129194108.d3d5b35dd783.I939d04674f4ff06f39934b1591c8d36a30ce74c2@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h     |  14 ++++
 net/mac80211/chan.c        | 167 ++++++++++++++++++++++++++++++++++-----------
 net/mac80211/ieee80211_i.h |  13 ++++
 net/mac80211/mlme.c        |  60 +++++++++-------
 net/mac80211/trace.h       |  31 +++++++--
 5 files changed, 215 insertions(+), 70 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index dd8a66e9afd9..ab6bc89d3394 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -214,6 +214,8 @@ struct ieee80211_low_level_stats {
  * @IEEE80211_CHANCTX_CHANGE_CHANNEL: switched to another operating channel,
  *	this is used only with channel switching with CSA
  * @IEEE80211_CHANCTX_CHANGE_MIN_WIDTH: The min required channel width changed
+ * @IEEE80211_CHANCTX_CHANGE_AP: The AP channel definition changed, so (wider
+ *	bandwidth) OFDMA settings need to be changed
  */
 enum ieee80211_chanctx_change {
 	IEEE80211_CHANCTX_CHANGE_WIDTH		= BIT(0),
@@ -221,14 +223,18 @@ enum ieee80211_chanctx_change {
 	IEEE80211_CHANCTX_CHANGE_RADAR		= BIT(2),
 	IEEE80211_CHANCTX_CHANGE_CHANNEL	= BIT(3),
 	IEEE80211_CHANCTX_CHANGE_MIN_WIDTH	= BIT(4),
+	IEEE80211_CHANCTX_CHANGE_AP		= BIT(5),
 };
 
 /**
  * struct ieee80211_chan_req - A channel "request"
  * @oper: channel definition to use for operation
+ * @ap: the channel definition of the AP, if any
+ *	(otherwise the chan member is %NULL)
  */
 struct ieee80211_chan_req {
 	struct cfg80211_chan_def oper;
+	struct cfg80211_chan_def ap;
 };
 
 /**
@@ -239,6 +245,8 @@ struct ieee80211_chan_req {
  *
  * @def: the channel definition
  * @min_def: the minimum channel definition currently required.
+ * @ap: the channel definition the AP actually is operating as,
+ *	for use with (wider bandwidth) OFDMA
  * @rx_chains_static: The number of RX chains that must always be
  *	active on the channel to receive MIMO transmissions
  * @rx_chains_dynamic: The number of RX chains that must be enabled
@@ -251,6 +259,7 @@ struct ieee80211_chan_req {
 struct ieee80211_chanctx_conf {
 	struct cfg80211_chan_def def;
 	struct cfg80211_chan_def min_def;
+	struct cfg80211_chan_def ap;
 
 	u8 rx_chains_static, rx_chains_dynamic;
 
@@ -1782,6 +1791,10 @@ struct ieee80211_channel_switch {
  *	this is not pure P2P vif.
  * @IEEE80211_VIF_EML_ACTIVE: The driver indicates that EML operation is
  *      enabled for the interface.
+ * @IEEE80211_VIF_IGNORE_OFDMA_WIDER_BW: Ignore wider bandwidth OFDMA
+ *	operation on this interface and request a channel context without
+ *	the AP definition. Use this e.g. because the device is able to
+ *	handle OFDMA (downlink and trigger for uplink) on a per-AP basis.
  */
 enum ieee80211_vif_flags {
 	IEEE80211_VIF_BEACON_FILTER		= BIT(0),
@@ -1789,6 +1802,7 @@ enum ieee80211_vif_flags {
 	IEEE80211_VIF_SUPPORTS_UAPSD		= BIT(2),
 	IEEE80211_VIF_GET_NOA_UPDATE		= BIT(3),
 	IEEE80211_VIF_EML_ACTIVE	        = BIT(4),
+	IEEE80211_VIF_IGNORE_OFDMA_WIDER_BW	= BIT(5),
 };
 
 
diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c
index c84449bdc928..fe1489ba58c6 100644
--- a/net/mac80211/chan.c
+++ b/net/mac80211/chan.c
@@ -81,22 +81,35 @@ ieee80211_link_get_chanctx(struct ieee80211_link_data *link)
 	return container_of(conf, struct ieee80211_chanctx, conf);
 }
 
+bool ieee80211_chanreq_identical(const struct ieee80211_chan_req *a,
+				 const struct ieee80211_chan_req *b)
+{
+	if (!cfg80211_chandef_identical(&a->oper, &b->oper))
+		return false;
+	if (!a->ap.chan && !b->ap.chan)
+		return true;
+	return cfg80211_chandef_identical(&a->ap, &b->ap);
+}
+
 static const struct ieee80211_chan_req *
 ieee80211_chanreq_compatible(const struct ieee80211_chan_req *a,
-			     const struct ieee80211_chan_req *b)
+			     const struct ieee80211_chan_req *b,
+			     struct ieee80211_chan_req *tmp)
 {
 	const struct cfg80211_chan_def *compat;
 
-	compat = cfg80211_chandef_compatible(&a->oper, &b->oper);
-
-	if (compat == &a->oper)
-		return a;
+	if (a->ap.chan && b->ap.chan &&
+	    !cfg80211_chandef_identical(&a->ap, &b->ap))
+		return NULL;
 
-	if (compat == &b->oper)
-		return b;
+	compat = cfg80211_chandef_compatible(&a->oper, &b->oper);
+	if (!compat)
+		return NULL;
 
-	WARN_ON(compat);
-	return NULL;
+	/* Note: later code assumes this always fills & returns tmp if compat */
+	tmp->oper = *compat;
+	tmp->ap = a->ap.chan ? a->ap : b->ap;
+	return tmp;
 }
 
 static const struct ieee80211_chan_req *
@@ -104,17 +117,26 @@ ieee80211_chanctx_compatible(struct ieee80211_chanctx *ctx,
 			     const struct ieee80211_chan_req *req,
 			     struct ieee80211_chan_req *tmp)
 {
+	const struct ieee80211_chan_req *ret;
+	struct ieee80211_chan_req tmp2;
+
 	*tmp = (struct ieee80211_chan_req){
 		.oper = ctx->conf.def,
+		.ap = ctx->conf.ap,
 	};
 
-	return ieee80211_chanreq_compatible(tmp, req);
+	ret = ieee80211_chanreq_compatible(tmp, req, &tmp2);
+	if (!ret)
+		return NULL;
+	*tmp = *ret;
+	return tmp;
 }
 
 static const struct ieee80211_chan_req *
 ieee80211_chanctx_reserved_chanreq(struct ieee80211_local *local,
 				   struct ieee80211_chanctx *ctx,
-				   const struct ieee80211_chan_req *req)
+				   const struct ieee80211_chan_req *req,
+				   struct ieee80211_chan_req *tmp)
 {
 	struct ieee80211_link_data *link;
 
@@ -124,7 +146,7 @@ ieee80211_chanctx_reserved_chanreq(struct ieee80211_local *local,
 		return NULL;
 
 	list_for_each_entry(link, &ctx->reserved_links, reserved_chanctx_list) {
-		req = ieee80211_chanreq_compatible(&link->reserved, req);
+		req = ieee80211_chanreq_compatible(&link->reserved, req, tmp);
 		if (!req)
 			break;
 	}
@@ -135,7 +157,8 @@ ieee80211_chanctx_reserved_chanreq(struct ieee80211_local *local,
 static const struct ieee80211_chan_req *
 ieee80211_chanctx_non_reserved_chandef(struct ieee80211_local *local,
 				       struct ieee80211_chanctx *ctx,
-				       const struct ieee80211_chan_req *compat)
+				       const struct ieee80211_chan_req *compat,
+				       struct ieee80211_chan_req *tmp)
 {
 	struct ieee80211_link_data *link;
 	const struct ieee80211_chan_req *comp_def = compat;
@@ -149,7 +172,7 @@ ieee80211_chanctx_non_reserved_chandef(struct ieee80211_local *local,
 			continue;
 
 		comp_def = ieee80211_chanreq_compatible(&link_conf->chanreq,
-							comp_def);
+							comp_def, tmp);
 		if (!comp_def)
 			break;
 	}
@@ -162,16 +185,18 @@ ieee80211_chanctx_can_reserve(struct ieee80211_local *local,
 			      struct ieee80211_chanctx *ctx,
 			      const struct ieee80211_chan_req *req)
 {
+	struct ieee80211_chan_req tmp;
+
 	lockdep_assert_wiphy(local->hw.wiphy);
 
-	if (!ieee80211_chanctx_reserved_chanreq(local, ctx, req))
+	if (!ieee80211_chanctx_reserved_chanreq(local, ctx, req, &tmp))
 		return false;
 
-	if (!ieee80211_chanctx_non_reserved_chandef(local, ctx, req))
+	if (!ieee80211_chanctx_non_reserved_chandef(local, ctx, req, &tmp))
 		return false;
 
 	if (!list_empty(&ctx->reserved_links) &&
-	    ieee80211_chanctx_reserved_chanreq(local, ctx, req))
+	    ieee80211_chanctx_reserved_chanreq(local, ctx, req, &tmp))
 		return true;
 
 	return false;
@@ -456,7 +481,11 @@ static void _ieee80211_change_chanctx(struct ieee80211_local *local,
 				      struct ieee80211_link_data *rsvd_for)
 {
 	const struct cfg80211_chan_def *chandef = &chanreq->oper;
-	u32 changed;
+	struct ieee80211_chan_req ctx_req = {
+		.oper = ctx->conf.def,
+		.ap = ctx->conf.ap,
+	};
+	u32 changed = 0;
 
 	/* expected to handle only 20/40/80/160/320 channel widths */
 	switch (chandef->width) {
@@ -478,26 +507,31 @@ static void _ieee80211_change_chanctx(struct ieee80211_local *local,
 	 */
 	ieee80211_chan_bw_change(local, old_ctx, true);
 
-	if (cfg80211_chandef_identical(&ctx->conf.def, chandef)) {
+	if (ieee80211_chanreq_identical(&ctx_req, chanreq)) {
 		ieee80211_recalc_chanctx_min_def(local, ctx, rsvd_for);
 		return;
 	}
 
-	WARN_ON(!cfg80211_chandef_compatible(&ctx->conf.def, chandef));
+	WARN_ON(ieee80211_chanctx_refcount(local, ctx) > 1 &&
+		!cfg80211_chandef_compatible(&ctx->conf.def, &chanreq->oper));
 
 	ieee80211_remove_wbrf(local, &ctx->conf.def);
 
+	if (!cfg80211_chandef_identical(&ctx->conf.def, &chanreq->oper))
+		changed |= IEEE80211_CHANCTX_CHANGE_WIDTH;
+	if (!cfg80211_chandef_identical(&ctx->conf.ap, &chanreq->ap))
+		changed |= IEEE80211_CHANCTX_CHANGE_AP;
 	ctx->conf.def = *chandef;
+	ctx->conf.ap = chanreq->ap;
 
 	/* check if min chanctx also changed */
-	changed = IEEE80211_CHANCTX_CHANGE_WIDTH |
-		  _ieee80211_recalc_chanctx_min_def(local, ctx, rsvd_for);
+	changed |= _ieee80211_recalc_chanctx_min_def(local, ctx, rsvd_for);
 
 	ieee80211_add_wbrf(local, &ctx->conf.def);
 
 	drv_change_chanctx(local, ctx, changed);
 
-	/* check is BW wider */
+	/* check if BW is wider */
 	ieee80211_chan_bw_change(local, old_ctx, false);
 }
 
@@ -536,7 +570,7 @@ ieee80211_find_chanctx(struct ieee80211_local *local,
 			continue;
 
 		compat = ieee80211_chanctx_reserved_chanreq(local, ctx,
-							    compat);
+							    compat, &tmp);
 		if (!compat)
 			continue;
 
@@ -598,6 +632,7 @@ ieee80211_alloc_chanctx(struct ieee80211_local *local,
 	INIT_LIST_HEAD(&ctx->assigned_links);
 	INIT_LIST_HEAD(&ctx->reserved_links);
 	ctx->conf.def = chanreq->oper;
+	ctx->conf.ap = chanreq->ap;
 	ctx->conf.rx_chains_static = 1;
 	ctx->conf.rx_chains_dynamic = 1;
 	ctx->mode = mode;
@@ -683,9 +718,9 @@ void ieee80211_recalc_chanctx_chantype(struct ieee80211_local *local,
 				       struct ieee80211_chanctx *ctx)
 {
 	struct ieee80211_chanctx_conf *conf = &ctx->conf;
-	const struct cfg80211_chan_def *compat = NULL;
+	const struct ieee80211_chan_req *compat = NULL;
 	struct ieee80211_link_data *link;
-	struct ieee80211_chan_req chanreq = {};
+	struct ieee80211_chan_req tmp;
 	struct sta_info *sta;
 
 	lockdep_assert_wiphy(local->hw.wiphy);
@@ -702,10 +737,10 @@ void ieee80211_recalc_chanctx_chantype(struct ieee80211_local *local,
 			continue;
 
 		if (!compat)
-			compat = &link_conf->chanreq.oper;
+			compat = &link_conf->chanreq;
 
-		compat = cfg80211_chandef_compatible(&link_conf->chanreq.oper,
-						     compat);
+		compat = ieee80211_chanreq_compatible(&link_conf->chanreq,
+						      compat, &tmp);
 		if (WARN_ON_ONCE(!compat))
 			return;
 	}
@@ -715,24 +750,23 @@ void ieee80211_recalc_chanctx_chantype(struct ieee80211_local *local,
 
 	/* TDLS peers can sometimes affect the chandef width */
 	list_for_each_entry(sta, &local->sta_list, list) {
+		struct ieee80211_chan_req tdls_chanreq = {};
 		if (!sta->uploaded ||
 		    !test_sta_flag(sta, WLAN_STA_TDLS_WIDER_BW) ||
 		    !test_sta_flag(sta, WLAN_STA_AUTHORIZED) ||
 		    !sta->tdls_chandef.chan)
 			continue;
 
-		compat = cfg80211_chandef_compatible(&sta->tdls_chandef,
-						     compat);
+		tdls_chanreq.oper = sta->tdls_chandef;
+
+		/* note this always fills and returns &tmp if compat */
+		compat = ieee80211_chanreq_compatible(&tdls_chanreq,
+						      compat, &tmp);
 		if (WARN_ON_ONCE(!compat))
 			return;
 	}
 
-	if (!compat)
-		return;
-
-	chanreq.oper = *compat;
-
-	ieee80211_change_chanctx(local, ctx, ctx, &chanreq);
+	ieee80211_change_chanctx(local, ctx, ctx, compat);
 }
 
 static void ieee80211_recalc_radar_chanctx(struct ieee80211_local *local,
@@ -1133,6 +1167,7 @@ ieee80211_link_use_reserved_reassign(struct ieee80211_link_data *link)
 	struct ieee80211_vif_chanctx_switch vif_chsw[1] = {};
 	struct ieee80211_chanctx *old_ctx, *new_ctx;
 	const struct ieee80211_chan_req *chanreq;
+	struct ieee80211_chan_req tmp;
 	u64 changed = 0;
 	int err;
 
@@ -1155,7 +1190,8 @@ ieee80211_link_use_reserved_reassign(struct ieee80211_link_data *link)
 		return -EINVAL;
 
 	chanreq = ieee80211_chanctx_non_reserved_chandef(local, new_ctx,
-							 &link->reserved);
+							 &link->reserved,
+							 &tmp);
 	if (WARN_ON(!chanreq))
 		return -EINVAL;
 
@@ -1213,6 +1249,7 @@ ieee80211_link_use_reserved_assign(struct ieee80211_link_data *link)
 	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_chanctx *old_ctx, *new_ctx;
 	const struct ieee80211_chan_req *chanreq;
+	struct ieee80211_chan_req tmp;
 	int err;
 
 	old_ctx = ieee80211_link_get_chanctx(link);
@@ -1232,7 +1269,8 @@ ieee80211_link_use_reserved_assign(struct ieee80211_link_data *link)
 		return -EINVAL;
 
 	chanreq = ieee80211_chanctx_non_reserved_chandef(local, new_ctx,
-							 &link->reserved);
+							 &link->reserved,
+							 &tmp);
 	if (WARN_ON(!chanreq))
 		return -EINVAL;
 
@@ -1770,6 +1808,52 @@ int ieee80211_link_use_reserved_context(struct ieee80211_link_data *link)
 	return 0;
 }
 
+/*
+ * This is similar to ieee80211_chanctx_compatible(), but rechecks
+ * against all the links actually using it (except the one that's
+ * passed, since that one is changing).
+ * This is done in order to allow changes to the AP's bandwidth for
+ * wider bandwidth OFDMA purposes, which wouldn't be treated as
+ * compatible by ieee80211_chanctx_recheck() but is OK if the link
+ * requesting the update is the only one using it.
+ */
+static const struct ieee80211_chan_req *
+ieee80211_chanctx_recheck(struct ieee80211_local *local,
+			  struct ieee80211_link_data *skip_link,
+			  struct ieee80211_chanctx *ctx,
+			  const struct ieee80211_chan_req *req,
+			  struct ieee80211_chan_req *tmp)
+{
+	const struct ieee80211_chan_req *ret = req;
+	struct ieee80211_link_data *link;
+
+	lockdep_assert_wiphy(local->hw.wiphy);
+
+	for_each_sdata_link(local, link) {
+		if (link == skip_link)
+			continue;
+
+		if (rcu_access_pointer(link->conf->chanctx_conf) == &ctx->conf) {
+			ret = ieee80211_chanreq_compatible(ret,
+							   &link->conf->chanreq,
+							   tmp);
+			if (!ret)
+				return NULL;
+		}
+
+		if (link->reserved_chanctx == ctx) {
+			ret = ieee80211_chanreq_compatible(ret,
+							   &link->reserved,
+							   tmp);
+			if (!ret)
+				return NULL;
+		}
+	}
+
+	*tmp = *ret;
+	return tmp;
+}
+
 int ieee80211_link_change_chanreq(struct ieee80211_link_data *link,
 				  const struct ieee80211_chan_req *chanreq,
 				  u64 *changed)
@@ -1806,13 +1890,14 @@ int ieee80211_link_change_chanreq(struct ieee80211_link_data *link,
 
 	ctx = container_of(conf, struct ieee80211_chanctx, conf);
 
-	compat = ieee80211_chanctx_compatible(ctx, chanreq, &tmp);
+	compat = ieee80211_chanctx_recheck(local, link, ctx, chanreq, &tmp);
 	if (!compat)
 		return -EINVAL;
 
 	switch (ctx->replace_state) {
 	case IEEE80211_CHANCTX_REPLACE_NONE:
-		if (!ieee80211_chanctx_reserved_chanreq(local, ctx, compat))
+		if (!ieee80211_chanctx_reserved_chanreq(local, ctx, compat,
+							&tmp))
 			return -EBUSY;
 		break;
 	case IEEE80211_CHANCTX_WILL_BE_REPLACED:
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 601b889b6237..534cac3fc8df 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -2536,6 +2536,19 @@ bool ieee80211_chandef_s1g_oper(const struct ieee80211_s1g_oper_ie *oper,
 				struct cfg80211_chan_def *chandef);
 void ieee80211_chandef_downgrade(struct cfg80211_chan_def *chandef,
 				 struct ieee80211_conn_settings *conn);
+static inline void
+ieee80211_chanreq_downgrade(struct ieee80211_chan_req *chanreq,
+			    struct ieee80211_conn_settings *conn)
+{
+	ieee80211_chandef_downgrade(&chanreq->oper, conn);
+	if (WARN_ON(!conn))
+		return;
+	if (conn->mode < IEEE80211_CONN_MODE_EHT)
+		chanreq->ap.chan = NULL;
+}
+
+bool ieee80211_chanreq_identical(const struct ieee80211_chan_req *a,
+				 const struct ieee80211_chan_req *b);
 
 int __must_check
 ieee80211_link_use_channel(struct ieee80211_link_data *link,
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index e9d720f25ddf..31d6a5603582 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -665,7 +665,7 @@ static struct ieee802_11_elems *
 ieee80211_determine_chan_mode(struct ieee80211_sub_if_data *sdata,
 			      struct ieee80211_conn_settings *conn,
 			      struct cfg80211_bss *cbss, int link_id,
-			      struct cfg80211_chan_def *chandef)
+			      struct ieee80211_chan_req *chanreq)
 {
 	struct ieee80211_local *local = sdata->local;
 	const struct cfg80211_bss_ies *ies = rcu_dereference(cbss->ies);
@@ -752,20 +752,27 @@ again:
 	}
 
 	conn->mode = ap_mode;
-	*chandef = ap_chandef;
+	chanreq->oper = ap_chandef;
 
-	while (!cfg80211_chandef_usable(sdata->local->hw.wiphy, chandef,
+	/* wider-bandwidth OFDMA is only done in EHT */
+	if (conn->mode >= IEEE80211_CONN_MODE_EHT &&
+	    !(sdata->vif.driver_flags & IEEE80211_VIF_IGNORE_OFDMA_WIDER_BW))
+		chanreq->ap = ap_chandef;
+	else
+		chanreq->ap.chan = NULL;
+
+	while (!cfg80211_chandef_usable(sdata->local->hw.wiphy, &chanreq->oper,
 					IEEE80211_CHAN_DISABLED)) {
-		if (WARN_ON(chandef->width == NL80211_CHAN_WIDTH_20_NOHT)) {
+		if (WARN_ON(chanreq->oper.width == NL80211_CHAN_WIDTH_20_NOHT)) {
 			ret = -EINVAL;
 			goto free;
 		}
 
-		ieee80211_chandef_downgrade(chandef, conn);
+		ieee80211_chanreq_downgrade(chanreq, conn);
 	}
 
 	if (conn->mode >= IEEE80211_CONN_MODE_HE &&
-	    !cfg80211_chandef_usable(sdata->wdev.wiphy, chandef,
+	    !cfg80211_chandef_usable(sdata->wdev.wiphy, &chanreq->oper,
 				     IEEE80211_CHAN_NO_HE)) {
 		conn->mode = IEEE80211_CONN_MODE_VHT;
 		conn->bw_limit = min_t(enum ieee80211_conn_bw_limit,
@@ -774,7 +781,7 @@ again:
 	}
 
 	if (conn->mode >= IEEE80211_CONN_MODE_EHT &&
-	    !cfg80211_chandef_usable(sdata->wdev.wiphy, chandef,
+	    !cfg80211_chandef_usable(sdata->wdev.wiphy, &chanreq->oper,
 				     IEEE80211_CHAN_NO_EHT)) {
 		conn->mode = IEEE80211_CONN_MODE_HE;
 		conn->bw_limit = min_t(enum ieee80211_conn_bw_limit,
@@ -782,7 +789,7 @@ again:
 				       IEEE80211_CONN_BW_LIMIT_160);
 	}
 
-	if (chandef->width != ap_chandef.width || ap_mode != conn->mode)
+	if (chanreq->oper.width != ap_chandef.width || ap_mode != conn->mode)
 		sdata_info(sdata,
 			   "regulatory prevented using AP config, downgraded\n");
 
@@ -847,7 +854,7 @@ again:
 			 ieee80211_conn_mode_str(conn->mode),
 			 20 * (1 << conn->bw_limit));
 
-	if (WARN_ON_ONCE(!cfg80211_chandef_valid(chandef))) {
+	if (WARN_ON_ONCE(!cfg80211_chandef_valid(&chanreq->oper))) {
 		ret = -EINVAL;
 		goto free;
 	}
@@ -865,7 +872,6 @@ static int ieee80211_config_bw(struct ieee80211_link_data *link,
 	struct ieee80211_channel *channel = link->conf->chanreq.oper.chan;
 	struct ieee80211_sub_if_data *sdata = link->sdata;
 	struct ieee80211_chan_req chanreq = {};
-	struct cfg80211_chan_def ap_chandef;
 	enum ieee80211_conn_mode ap_mode;
 	u32 vht_cap_info = 0;
 	u16 ht_opmode;
@@ -881,7 +887,7 @@ static int ieee80211_config_bw(struct ieee80211_link_data *link,
 
 	ap_mode = ieee80211_determine_ap_chan(sdata, channel, vht_cap_info,
 					      elems, true, &link->u.mgd.conn,
-					      &ap_chandef);
+					      &chanreq.ap);
 
 	if (ap_mode != link->u.mgd.conn.mode) {
 		link_info(link,
@@ -891,6 +897,11 @@ static int ieee80211_config_bw(struct ieee80211_link_data *link,
 		return -EINVAL;
 	}
 
+	chanreq.oper = chanreq.ap;
+	if (link->u.mgd.conn.mode < IEEE80211_CONN_MODE_EHT ||
+	    sdata->vif.driver_flags & IEEE80211_VIF_IGNORE_OFDMA_WIDER_BW)
+		chanreq.ap.chan = NULL;
+
 	/*
 	 * if HT operation mode changed store the new one -
 	 * this may be applicable even if channel is identical
@@ -911,20 +922,20 @@ static int ieee80211_config_bw(struct ieee80211_link_data *link,
 	 * won't do us any good -- we couldn't use it with the AP.
 	 */
 	while (link->u.mgd.conn.bw_limit <
-			ieee80211_min_bw_limit_from_chandef(&ap_chandef))
-		ieee80211_chandef_downgrade(&ap_chandef, NULL);
+			ieee80211_min_bw_limit_from_chandef(&chanreq.oper))
+		ieee80211_chandef_downgrade(&chanreq.oper, NULL);
 
-	if (cfg80211_chandef_identical(&ap_chandef, &link->conf->chanreq.oper))
+	if (ieee80211_chanreq_identical(&chanreq, &link->conf->chanreq))
 		return 0;
 
 	link_info(link,
-		  "AP %pM changed bandwidth, new config is %d.%03d MHz, width %d (%d.%03d/%d MHz)\n",
-		  link->u.mgd.bssid, ap_chandef.chan->center_freq,
-		  ap_chandef.chan->freq_offset, ap_chandef.width,
-		  ap_chandef.center_freq1, ap_chandef.freq1_offset,
-		  ap_chandef.center_freq2);
+		  "AP %pM changed bandwidth, new used config is %d.%03d MHz, width %d (%d.%03d/%d MHz)\n",
+		  link->u.mgd.bssid, chanreq.oper.chan->center_freq,
+		  chanreq.oper.chan->freq_offset, chanreq.oper.width,
+		  chanreq.oper.center_freq1, chanreq.oper.freq1_offset,
+		  chanreq.oper.center_freq2);
 
-	if (!cfg80211_chandef_valid(&ap_chandef)) {
+	if (!cfg80211_chandef_valid(&chanreq.oper)) {
 		sdata_info(sdata,
 			   "AP %pM changed caps/bw in a way we can't support - disconnect\n",
 			   link->u.mgd.bssid);
@@ -947,7 +958,6 @@ static int ieee80211_config_bw(struct ieee80211_link_data *link,
 	 * bandwidth changes where a this could happen, but those cases are
 	 * less common and wouldn't completely prevent using the AP.
 	 */
-	chanreq.oper = ap_chandef;
 
 	ret = ieee80211_link_change_chanreq(link, &chanreq, changed);
 	if (ret) {
@@ -2071,8 +2081,8 @@ static void ieee80211_chswitch_work(struct wiphy *wiphy,
 		return;
 	}
 
-	if (!cfg80211_chandef_identical(&link->conf->chanreq.oper,
-					&link->csa_chanreq.oper)) {
+	if (!ieee80211_chanreq_identical(&link->conf->chanreq,
+					 &link->csa_chanreq)) {
 		sdata_info(sdata,
 			   "failed to finalize channel switch, disconnecting\n");
 		wiphy_work_queue(sdata->local->hw.wiphy,
@@ -5146,7 +5156,7 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata,
 
 	rcu_read_lock();
 	elems = ieee80211_determine_chan_mode(sdata, conn, cbss, link_id,
-					      &chanreq.oper);
+					      &chanreq);
 
 	if (IS_ERR(elems)) {
 		rcu_read_unlock();
@@ -5209,7 +5219,7 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata,
 		return ret;
 
 	while (ret && chanreq.oper.width != NL80211_CHAN_WIDTH_20_NOHT) {
-		ieee80211_chandef_downgrade(&chanreq.oper, conn);
+		ieee80211_chanreq_downgrade(&chanreq, conn);
 
 		ret = ieee80211_link_use_channel(link, &chanreq,
 						 IEEE80211_CHANCTX_SHARED);
diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h
index efff20d7fe0e..478b32d2520a 100644
--- a/net/mac80211/trace.h
+++ b/net/mac80211/trace.h
@@ -50,7 +50,7 @@
 			__entry->center_freq1 = (c) ? (c)->center_freq1 : 0;		\
 			__entry->freq1_offset = (c) ? (c)->freq1_offset : 0;		\
 			__entry->center_freq2 = (c) ? (c)->center_freq2 : 0;
-#define CHANDEF_PR_FMT	" control:%d.%03d MHz width:%d center: %d.%03d/%d MHz"
+#define CHANDEF_PR_FMT	" chandef(%d.%03d MHz,width:%d,center: %d.%03d/%d MHz)"
 #define CHANDEF_PR_ARG	__entry->control_freq, __entry->freq_offset, __entry->chan_width, \
 			__entry->center_freq1, __entry->freq1_offset, __entry->center_freq2
 
@@ -69,22 +69,45 @@
 			__entry->min_center_freq1 = (c)->center_freq1;			\
 			__entry->min_freq1_offset = (c)->freq1_offset;			\
 			__entry->min_center_freq2 = (c)->center_freq2;
-#define MIN_CHANDEF_PR_FMT	" min_control:%d.%03d MHz min_width:%d min_center: %d.%03d/%d MHz"
+#define MIN_CHANDEF_PR_FMT	" mindef(%d.%03d MHz,width:%d,center: %d.%03d/%d MHz)"
 #define MIN_CHANDEF_PR_ARG	__entry->min_control_freq, __entry->min_freq_offset,	\
 			__entry->min_chan_width,					\
 			__entry->min_center_freq1, __entry->min_freq1_offset,		\
 			__entry->min_center_freq2
 
+#define AP_CHANDEF_ENTRY								\
+			__field(u32, ap_control_freq)					\
+			__field(u32, ap_freq_offset)					\
+			__field(u32, ap_chan_width)					\
+			__field(u32, ap_center_freq1)					\
+			__field(u32, ap_freq1_offset)					\
+			__field(u32, ap_center_freq2)
+
+#define AP_CHANDEF_ASSIGN(c)								\
+			__entry->ap_control_freq = (c)->chan ? (c)->chan->center_freq : 0;\
+			__entry->ap_freq_offset = (c)->chan ? (c)->chan->freq_offset : 0;\
+			__entry->ap_chan_width = (c)->chan ? (c)->width : 0;		\
+			__entry->ap_center_freq1 = (c)->chan ? (c)->center_freq1 : 0;	\
+			__entry->ap_freq1_offset = (c)->chan ? (c)->freq1_offset : 0;	\
+			__entry->ap_center_freq2 = (c)->chan ? (c)->center_freq2 : 0;
+#define AP_CHANDEF_PR_FMT	" ap(%d.%03d MHz,width:%d,center: %d.%03d/%d MHz)"
+#define AP_CHANDEF_PR_ARG	__entry->ap_control_freq, __entry->ap_freq_offset,	\
+			__entry->ap_chan_width,						\
+			__entry->ap_center_freq1, __entry->ap_freq1_offset,		\
+			__entry->ap_center_freq2
+
 #define CHANCTX_ENTRY	CHANDEF_ENTRY							\
 			MIN_CHANDEF_ENTRY						\
+			AP_CHANDEF_ENTRY						\
 			__field(u8, rx_chains_static)					\
 			__field(u8, rx_chains_dynamic)
 #define CHANCTX_ASSIGN	CHANDEF_ASSIGN(&ctx->conf.def)					\
 			MIN_CHANDEF_ASSIGN(&ctx->conf.min_def)				\
+			AP_CHANDEF_ASSIGN(&ctx->conf.ap)				\
 			__entry->rx_chains_static = ctx->conf.rx_chains_static;		\
 			__entry->rx_chains_dynamic = ctx->conf.rx_chains_dynamic
-#define CHANCTX_PR_FMT	CHANDEF_PR_FMT MIN_CHANDEF_PR_FMT " chains:%d/%d"
-#define CHANCTX_PR_ARG	CHANDEF_PR_ARG,	MIN_CHANDEF_PR_ARG,				\
+#define CHANCTX_PR_FMT	CHANDEF_PR_FMT MIN_CHANDEF_PR_FMT AP_CHANDEF_PR_FMT " chains:%d/%d"
+#define CHANCTX_PR_ARG	CHANDEF_PR_ARG,	MIN_CHANDEF_PR_ARG, AP_CHANDEF_PR_ARG,		\
 			__entry->rx_chains_static, __entry->rx_chains_dynamic
 
 #define KEY_ENTRY	__field(u32, cipher)						\
-- 
cgit v1.2.3


From c478db84c8544156b80c5e5d3a8c7840d557707a Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 29 Jan 2024 19:34:45 +0100
Subject: wifi: mac80211: refactor puncturing bitmap extraction

Add a new inline helper function to ieee80211.h to
extract the disabled subchannels bitmap from an EHT
operation element, and use that in mac80211 where
we do that.

Link: https://msgid.link/20240129194108.d9f50dcec8d0.I8b08cbc2490a734fafcce0fa0fc328211ba6f10b@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 16 +++++++++++++++
 net/mac80211/mlme.c       | 52 ++++++++++++++++-------------------------------
 2 files changed, 33 insertions(+), 35 deletions(-)

(limited to 'include')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index a70388ae3a7b..d9d2c1253157 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -3189,6 +3189,22 @@ ieee80211_eht_oper_size_ok(const u8 *data, u8 len)
 	return len >= needed;
 }
 
+/* must validate ieee80211_eht_oper_size_ok() first */
+static inline u16
+ieee80211_eht_oper_dis_subchan_bitmap(const struct ieee80211_eht_operation *eht_oper)
+{
+	const struct ieee80211_eht_operation_info *info =
+		(const void *)eht_oper->optional;
+
+	if (!(eht_oper->params & IEEE80211_EHT_OPER_INFO_PRESENT))
+		return 0;
+
+	if (!(eht_oper->params & IEEE80211_EHT_OPER_DISABLED_SUBCHANNEL_BITMAP_PRESENT))
+		return 0;
+
+	return get_unaligned_le16(info->optional);
+}
+
 #define IEEE80211_BW_IND_DIS_SUBCH_PRESENT	BIT(1)
 
 struct ieee80211_bandwidth_indication {
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index da202103faf0..74a15f18e7ee 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -813,36 +813,27 @@ again:
 	}
 
 	if (conn->mode >= IEEE80211_CONN_MODE_EHT) {
-		const struct ieee80211_eht_operation *eht_oper;
+		u16 bitmap;
 
-		eht_oper = elems->eht_operation;
-
-		if (WARN_ON_ONCE(!eht_oper)) {
+		if (WARN_ON_ONCE(!elems->eht_operation)) {
 			ret = -EINVAL;
 			goto free;
 		}
 
-		if (eht_oper->params & IEEE80211_EHT_OPER_INFO_PRESENT &&
-		    eht_oper->params & IEEE80211_EHT_OPER_DISABLED_SUBCHANNEL_BITMAP_PRESENT) {
-			const struct ieee80211_eht_operation_info *info =
-				(void *)eht_oper->optional;
-			const u8 *disable_subchannel_bitmap = info->optional;
-			u16 bitmap;
-
-			bitmap = get_unaligned_le16(disable_subchannel_bitmap);
-			if (!cfg80211_valid_disable_subchannel_bitmap(&bitmap,
-								      &ap_chandef) ||
-			    (bitmap &&
-			     ieee80211_hw_check(&local->hw, DISALLOW_PUNCTURING))) {
-				conn->mode = IEEE80211_CONN_MODE_HE;
-				conn->bw_limit = min_t(enum ieee80211_conn_bw_limit,
-						       conn->bw_limit,
-						       IEEE80211_CONN_BW_LIMIT_160);
-				sdata_info(sdata,
-					   "AP has invalid/unsupported puncturing, disabling EHT\n");
-			}
-			/* FIXME: store puncturing bitmap */
+		bitmap = ieee80211_eht_oper_dis_subchan_bitmap(elems->eht_operation);
+
+		if (!cfg80211_valid_disable_subchannel_bitmap(&bitmap,
+							      &ap_chandef) ||
+		    (bitmap &&
+		     ieee80211_hw_check(&local->hw, DISALLOW_PUNCTURING))) {
+			conn->mode = IEEE80211_CONN_MODE_HE;
+			conn->bw_limit = min_t(enum ieee80211_conn_bw_limit,
+					       conn->bw_limit,
+					       IEEE80211_CONN_BW_LIMIT_160);
+			sdata_info(sdata,
+				   "AP has invalid/unsupported puncturing, disabling EHT\n");
 		}
+		/* FIXME: store puncturing bitmap */
 	}
 
 	/* the mode can only decrease, so this must terminate */
@@ -5879,18 +5870,9 @@ static bool ieee80211_config_puncturing(struct ieee80211_link_data *link,
 					u64 *changed)
 {
 	struct ieee80211_local *local = link->sdata->local;
-	u16 bitmap = 0, extracted;
-
-	if ((eht_oper->params & IEEE80211_EHT_OPER_INFO_PRESENT) &&
-	    (eht_oper->params &
-	     IEEE80211_EHT_OPER_DISABLED_SUBCHANNEL_BITMAP_PRESENT)) {
-		const struct ieee80211_eht_operation_info *info =
-			(void *)eht_oper->optional;
-		const u8 *disable_subchannel_bitmap = info->optional;
-
-		bitmap = get_unaligned_le16(disable_subchannel_bitmap);
-	}
+	u16 bitmap, extracted;
 
+	bitmap = ieee80211_eht_oper_dis_subchan_bitmap(eht_oper);
 	extracted = ieee80211_extract_dis_subch_bmap(eht_oper,
 						     &link->conf->chanreq.oper,
 						     bitmap);
-- 
cgit v1.2.3


From 8f251a0a1566e3e1da0f1d9322c8ffae808a7509 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 29 Jan 2024 19:34:47 +0100
Subject: wifi: cfg80211: simplify cfg80211_chandef_compatible()

Simplify cfg80211_chandef_compatible() a bit by switching
c1 and c2 around so that c1 is always the narrower one
(once they're not identical or narrow/S1G). Then we can
just check the various primary channels and exit with the
wider one (c2), or NULL.

Also refactor the primary 40/80/160 function to not have
all the calculations hard-coded, and use a wrapper around
it to check primary 40/80/160 compatibility.

While at it, add some kunit tests for this functionality.

Also expose the new cfg80211_chandef_primary_freq() to
drivers, mac80211 will use it.

Link: https://msgid.link/20240129194108.be3e6eccaba3.I8399c2ff1435d7378e5837794cb5aa6dd2ee1416@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h      |  11 +++
 net/wireless/chan.c         | 170 ++++++++++++++++++++++-------------------
 net/wireless/tests/Makefile |   2 +-
 net/wireless/tests/chan.c   | 182 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 284 insertions(+), 81 deletions(-)
 create mode 100644 net/wireless/tests/chan.c

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 5b42bfc1b660..fc2ad80118e8 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1047,6 +1047,17 @@ unsigned int
 cfg80211_chandef_dfs_cac_time(struct wiphy *wiphy,
 			      const struct cfg80211_chan_def *chandef);
 
+/**
+ * cfg80211_chandef_primary_freq - calculate primary 40/80/160 MHz freq
+ * @chandef: chandef to calculate for
+ * @primary_chan_width: primary channel width to calculate center for
+ *
+ * Returns: the primary 40/80/160 MHz channel center frequency, or -1
+ *	for errors
+ */
+int cfg80211_chandef_primary_freq(const struct cfg80211_chan_def *chandef,
+				  enum nl80211_chan_width primary_chan_width);
+
 /**
  * nl80211_send_chandef - sends the channel definition.
  * @msg: the msg to send channel definition
diff --git a/net/wireless/chan.c b/net/wireless/chan.c
index 159b8aac451e..bfa2ea935fc2 100644
--- a/net/wireless/chan.c
+++ b/net/wireless/chan.c
@@ -390,68 +390,60 @@ bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef)
 }
 EXPORT_SYMBOL(cfg80211_chandef_valid);
 
-static void chandef_primary_freqs(const struct cfg80211_chan_def *c,
-				  u32 *pri40, u32 *pri80, u32 *pri160)
+int cfg80211_chandef_primary_freq(const struct cfg80211_chan_def *c,
+				  enum nl80211_chan_width primary_chan_width)
 {
-	int tmp;
+	int pri_width = nl80211_chan_width_to_mhz(primary_chan_width);
+	int width = cfg80211_chandef_get_width(c);
+	u32 control = c->chan->center_freq;
+	u32 center = c->center_freq1;
 
-	switch (c->width) {
-	case NL80211_CHAN_WIDTH_40:
-		*pri40 = c->center_freq1;
-		*pri80 = 0;
-		*pri160 = 0;
-		break;
-	case NL80211_CHAN_WIDTH_80:
-	case NL80211_CHAN_WIDTH_80P80:
-		*pri160 = 0;
-		*pri80 = c->center_freq1;
-		/* n_P20 */
-		tmp = (30 + c->chan->center_freq - c->center_freq1)/20;
-		/* n_P40 */
-		tmp /= 2;
-		/* freq_P40 */
-		*pri40 = c->center_freq1 - 20 + 40 * tmp;
-		break;
-	case NL80211_CHAN_WIDTH_160:
-		*pri160 = c->center_freq1;
-		/* n_P20 */
-		tmp = (70 + c->chan->center_freq - c->center_freq1)/20;
-		/* n_P40 */
-		tmp /= 2;
-		/* freq_P40 */
-		*pri40 = c->center_freq1 - 60 + 40 * tmp;
-		/* n_P80 */
-		tmp /= 2;
-		*pri80 = c->center_freq1 - 40 + 80 * tmp;
-		break;
-	case NL80211_CHAN_WIDTH_320:
-		/* n_P20 */
-		tmp = (150 + c->chan->center_freq - c->center_freq1) / 20;
-		/* n_P40 */
-		tmp /= 2;
-		/* freq_P40 */
-		*pri40 = c->center_freq1 - 140 + 40 * tmp;
-		/* n_P80 */
-		tmp /= 2;
-		*pri80 = c->center_freq1 - 120 + 80 * tmp;
-		/* n_P160 */
-		tmp /= 2;
-		*pri160 = c->center_freq1 - 80 + 160 * tmp;
-		break;
-	default:
-		WARN_ON_ONCE(1);
+	if (WARN_ON_ONCE(pri_width < 0 || width < 0))
+		return -1;
+
+	/* not intended to be called this way, can't determine */
+	if (WARN_ON_ONCE(pri_width > width))
+		return -1;
+
+	while (width > pri_width) {
+		if (control > center)
+			center += width / 4;
+		else
+			center -= width / 4;
+		width /= 2;
 	}
+
+	return center;
 }
+EXPORT_SYMBOL(cfg80211_chandef_primary_freq);
 
-const struct cfg80211_chan_def *
-cfg80211_chandef_compatible(const struct cfg80211_chan_def *c1,
-			    const struct cfg80211_chan_def *c2)
+static const struct cfg80211_chan_def *
+check_chandef_primary_compat(const struct cfg80211_chan_def *c1,
+			     const struct cfg80211_chan_def *c2,
+			     enum nl80211_chan_width primary_chan_width)
 {
-	u32 c1_pri40, c1_pri80, c2_pri40, c2_pri80, c1_pri160, c2_pri160;
+	/* check primary is compatible -> error if not */
+	if (cfg80211_chandef_primary_freq(c1, primary_chan_width) !=
+	    cfg80211_chandef_primary_freq(c2, primary_chan_width))
+		return ERR_PTR(-EINVAL);
+
+	/* assumes c1 is smaller width, if that was just checked -> done */
+	if (c1->width == primary_chan_width)
+		return c2;
+
+	/* otherwise continue checking the next width */
+	return NULL;
+}
+
+static const struct cfg80211_chan_def *
+_cfg80211_chandef_compatible(const struct cfg80211_chan_def *c1,
+			     const struct cfg80211_chan_def *c2)
+{
+	const struct cfg80211_chan_def *ret;
 
 	/* If they are identical, return */
 	if (cfg80211_chandef_identical(c1, c2))
-		return c1;
+		return c2;
 
 	/* otherwise, must have same control channel */
 	if (c1->chan != c2->chan)
@@ -479,44 +471,62 @@ cfg80211_chandef_compatible(const struct cfg80211_chan_def *c1,
 	if (NARROW_OR_S1G(c1->width) || NARROW_OR_S1G(c2->width))
 		return NULL;
 
-	if (c1->width == NL80211_CHAN_WIDTH_20_NOHT ||
-	    c1->width == NL80211_CHAN_WIDTH_20)
+	/*
+	 * Make sure that c1 is always the narrower one, so that later
+	 * we either return NULL or c2 and don't have to check both
+	 * directions.
+	 */
+	if (c1->width > c2->width)
+		swap(c1, c2);
+
+	/*
+	 * No further checks needed if the "narrower" one is only 20 MHz.
+	 * Here "narrower" includes being a 20 MHz non-HT channel vs. a
+	 * 20 MHz HT (or later) one.
+	 */
+	if (c1->width <= NL80211_CHAN_WIDTH_20)
 		return c2;
 
-	if (c2->width == NL80211_CHAN_WIDTH_20_NOHT ||
-	    c2->width == NL80211_CHAN_WIDTH_20)
-		return c1;
+	ret = check_chandef_primary_compat(c1, c2, NL80211_CHAN_WIDTH_40);
+	if (ret)
+		return ret;
 
-	chandef_primary_freqs(c1, &c1_pri40, &c1_pri80, &c1_pri160);
-	chandef_primary_freqs(c2, &c2_pri40, &c2_pri80, &c2_pri160);
+	ret = check_chandef_primary_compat(c1, c2, NL80211_CHAN_WIDTH_80);
+	if (ret)
+		return ret;
 
-	if (c1_pri40 != c2_pri40)
+	/*
+	 * If c1 is 80+80, then c2 is 160 or higher, but that cannot
+	 * match. If c2 was also 80+80 it was already either accepted
+	 * or rejected above (identical or not, respectively.)
+	 */
+	if (c1->width == NL80211_CHAN_WIDTH_80P80)
 		return NULL;
 
-	if (c1->width == NL80211_CHAN_WIDTH_40)
-		return c2;
-
-	if (c2->width == NL80211_CHAN_WIDTH_40)
-		return c1;
+	ret = check_chandef_primary_compat(c1, c2, NL80211_CHAN_WIDTH_160);
+	if (ret)
+		return ret;
 
-	if (c1_pri80 != c2_pri80)
-		return NULL;
+	/*
+	 * Getting here would mean they're both wider than 160, have the
+	 * same primary 160, but are not identical - this cannot happen
+	 * since they must be 320 (no wider chandefs exist, at least yet.)
+	 */
+	WARN_ON_ONCE(1);
 
-	if (c1->width == NL80211_CHAN_WIDTH_80 &&
-	    c2->width > NL80211_CHAN_WIDTH_80)
-		return c2;
+	return NULL;
+}
 
-	if (c2->width == NL80211_CHAN_WIDTH_80 &&
-	    c1->width > NL80211_CHAN_WIDTH_80)
-		return c1;
+const struct cfg80211_chan_def *
+cfg80211_chandef_compatible(const struct cfg80211_chan_def *c1,
+			    const struct cfg80211_chan_def *c2)
+{
+	const struct cfg80211_chan_def *ret;
 
-	WARN_ON(!c1_pri160 && !c2_pri160);
-	if (c1_pri160 && c2_pri160 && c1_pri160 != c2_pri160)
+	ret = _cfg80211_chandef_compatible(c1, c2);
+	if (IS_ERR(ret))
 		return NULL;
-
-	if (c1->width > c2->width)
-		return c1;
-	return c2;
+	return ret;
 }
 EXPORT_SYMBOL(cfg80211_chandef_compatible);
 
diff --git a/net/wireless/tests/Makefile b/net/wireless/tests/Makefile
index 1f6622fcb758..c364e63b508e 100644
--- a/net/wireless/tests/Makefile
+++ b/net/wireless/tests/Makefile
@@ -1,3 +1,3 @@
-cfg80211-tests-y += module.o fragmentation.o scan.o util.o
+cfg80211-tests-y += module.o fragmentation.o scan.o util.o chan.o
 
 obj-$(CONFIG_CFG80211_KUNIT_TEST) += cfg80211-tests.o
diff --git a/net/wireless/tests/chan.c b/net/wireless/tests/chan.c
new file mode 100644
index 000000000000..b9e7a27e43cb
--- /dev/null
+++ b/net/wireless/tests/chan.c
@@ -0,0 +1,182 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * KUnit tests for channel helper functions
+ *
+ * Copyright (C) 2023 Intel Corporation
+ */
+#include <net/cfg80211.h>
+#include <kunit/test.h>
+
+MODULE_IMPORT_NS(EXPORTED_FOR_KUNIT_TESTING);
+
+static struct ieee80211_channel chan_6ghz_1 = {
+	.band = NL80211_BAND_6GHZ,
+	.center_freq = 5955,
+};
+
+static struct ieee80211_channel chan_6ghz_5 = {
+	.band = NL80211_BAND_6GHZ,
+	.center_freq = 5975,
+};
+
+static struct ieee80211_channel chan_6ghz_105 = {
+	.band = NL80211_BAND_6GHZ,
+	.center_freq = 6475,
+};
+
+static const struct chandef_compat_case {
+	const char *desc;
+	/* leave c1 empty for tests for identical */
+	struct cfg80211_chan_def c1, c2;
+	/* we test both ways around, so c2 should always be the compat one */
+	bool compat;
+} chandef_compat_cases[] = {
+	{
+		.desc = "identical non-HT",
+		.c2 = {
+			.width = NL80211_CHAN_WIDTH_20_NOHT,
+			.chan = &chan_6ghz_1,
+			.center_freq1 = 5955,
+		},
+		.compat = true,
+	},
+	{
+		.desc = "identical 20 MHz",
+		.c2 = {
+			.width = NL80211_CHAN_WIDTH_20,
+			.chan = &chan_6ghz_1,
+			.center_freq1 = 5955,
+		},
+		.compat = true,
+	},
+	{
+		.desc = "identical 40 MHz",
+		.c2 = {
+			.width = NL80211_CHAN_WIDTH_40,
+			.chan = &chan_6ghz_1,
+			.center_freq1 = 5955 + 10,
+		},
+		.compat = true,
+	},
+	{
+		.desc = "identical 80 MHz",
+		.c2 = {
+			.width = NL80211_CHAN_WIDTH_80,
+			.chan = &chan_6ghz_1,
+			.center_freq1 = 5955 + 10 + 20,
+		},
+		.compat = true,
+	},
+	{
+		.desc = "identical 160 MHz",
+		.c2 = {
+			.width = NL80211_CHAN_WIDTH_160,
+			.chan = &chan_6ghz_1,
+			.center_freq1 = 5955 + 10 + 20 + 40,
+		},
+		.compat = true,
+	},
+	{
+		.desc = "identical 320 MHz",
+		.c2 = {
+			.width = NL80211_CHAN_WIDTH_320,
+			.chan = &chan_6ghz_1,
+			.center_freq1 = 5955 + 10 + 20 + 40 + 80,
+		},
+		.compat = true,
+	},
+	{
+		.desc = "20 MHz in 320 MHz\n",
+		.c1 = {
+			.width = NL80211_CHAN_WIDTH_20,
+			.chan = &chan_6ghz_1,
+			.center_freq1 = 5955,
+		},
+		.c2 = {
+			.width = NL80211_CHAN_WIDTH_320,
+			.chan = &chan_6ghz_1,
+			.center_freq1 = 5955 + 10 + 20 + 40 + 80,
+		},
+		.compat = true,
+	},
+	{
+		.desc = "different 20 MHz",
+		.c1 = {
+			.width = NL80211_CHAN_WIDTH_20,
+			.chan = &chan_6ghz_1,
+			.center_freq1 = 5955,
+		},
+		.c2 = {
+			.width = NL80211_CHAN_WIDTH_20,
+			.chan = &chan_6ghz_5,
+			.center_freq1 = 5975,
+		},
+	},
+	{
+		.desc = "different primary 160 MHz",
+		.c1 = {
+			.width = NL80211_CHAN_WIDTH_320,
+			.chan = &chan_6ghz_105,
+			.center_freq1 = 6475 + 150,
+		},
+		.c2 = {
+			.width = NL80211_CHAN_WIDTH_320,
+			.chan = &chan_6ghz_105,
+			.center_freq1 = 6475 - 10,
+		},
+	},
+	{
+		/* similar to previous test but one has lower BW */
+		.desc = "matching primary 160 MHz",
+		.c1 = {
+			.width = NL80211_CHAN_WIDTH_160,
+			.chan = &chan_6ghz_105,
+			.center_freq1 = 6475 + 70,
+		},
+		.c2 = {
+			.width = NL80211_CHAN_WIDTH_320,
+			.chan = &chan_6ghz_105,
+			.center_freq1 = 6475 - 10,
+		},
+		.compat = true,
+	},
+};
+
+KUNIT_ARRAY_PARAM_DESC(chandef_compat, chandef_compat_cases, desc)
+
+static void test_chandef_compat(struct kunit *test)
+{
+	const struct chandef_compat_case *params = test->param_value;
+	const struct cfg80211_chan_def *ret, *expect;
+	struct cfg80211_chan_def c1 = params->c1;
+
+	/* tests with identical ones */
+	if (!params->c1.chan)
+		c1 = params->c2;
+
+	KUNIT_EXPECT_EQ(test, cfg80211_chandef_valid(&c1), true);
+	KUNIT_EXPECT_EQ(test, cfg80211_chandef_valid(&params->c2), true);
+
+	expect = params->compat ? &params->c2 : NULL;
+
+	ret = cfg80211_chandef_compatible(&c1, &params->c2);
+	KUNIT_EXPECT_PTR_EQ(test, ret, expect);
+
+	if (!params->c1.chan)
+		expect = &c1;
+
+	ret = cfg80211_chandef_compatible(&params->c2, &c1);
+	KUNIT_EXPECT_PTR_EQ(test, ret, expect);
+}
+
+static struct kunit_case chandef_compat_test_cases[] = {
+	KUNIT_CASE_PARAM(test_chandef_compat, chandef_compat_gen_params),
+	{}
+};
+
+static struct kunit_suite chandef_compat = {
+	.name = "cfg80211-chandef-compat",
+	.test_cases = chandef_compat_test_cases,
+};
+
+kunit_test_suite(chandef_compat);
-- 
cgit v1.2.3


From b82730bf57b54803ab94abbfd8c4422a7081886d Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 29 Jan 2024 19:34:49 +0100
Subject: wifi: cfg80211/mac80211: move puncturing into chandef

Aloka originally suggested that puncturing should be part of
the chandef, so that it's treated correctly. At the time, I
disagreed and it ended up not part of the chandef, but I've
now realized that this was wrong. Even for clients, the RX,
and perhaps more importantly, CCA configuration needs to take
puncturing into account.

Move puncturing into the chandef, and adjust all the code
accordingly. Also add a few tests for puncturing in chandef
compatibility checking.

Link: https://lore.kernel.org/linux-wireless/20220214223051.3610-1-quic_alokad@quicinc.com/
Suggested-by: Aloka Dixit <quic_alokad@quicinc.com>
Link: https://msgid.link/20240129194108.307183a5d2e5.I4d7fe2f126b2366c1312010e2900dfb2abffa0f6@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/ath/ath12k/mac.c              |  10 +-
 drivers/net/wireless/ath/ath6kl/cfg80211.c         |   2 +-
 drivers/net/wireless/intel/iwlwifi/mvm/link.c      |  16 +-
 .../net/wireless/intel/iwlwifi/mvm/mld-mac80211.c  |   6 +-
 drivers/net/wireless/marvell/mwifiex/11h.c         |   2 +-
 drivers/net/wireless/quantenna/qtnfmac/event.c     |   2 +-
 drivers/net/wireless/realtek/rtw89/fw.c            |   5 +-
 include/net/cfg80211.h                             |  44 ++---
 include/net/mac80211.h                             |  11 +-
 net/mac80211/cfg.c                                 |  21 +--
 net/mac80211/chan.c                                |  10 +-
 net/mac80211/link.c                                |   2 +-
 net/mac80211/mlme.c                                | 180 +++------------------
 net/mac80211/util.c                                |  26 ++-
 net/wireless/chan.c                                |  62 ++++---
 net/wireless/nl80211.c                             |  67 +++-----
 net/wireless/tests/chan.c                          |  48 +++++-
 net/wireless/trace.h                               |  38 ++---
 18 files changed, 227 insertions(+), 325 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/ath/ath12k/mac.c b/drivers/net/wireless/ath/ath12k/mac.c
index a27480a69b27..18fb42c045cc 100644
--- a/drivers/net/wireless/ath/ath12k/mac.c
+++ b/drivers/net/wireless/ath/ath12k/mac.c
@@ -2784,9 +2784,6 @@ static void ath12k_mac_bss_info_changed(struct ath12k *ar,
 	}
 
 	ath12k_mac_fils_discovery(arvif, info);
-
-	if (changed & BSS_CHANGED_EHT_PUNCTURING)
-		arvif->punct_bitmap = info->eht_puncturing;
 }
 
 static void ath12k_mac_op_bss_info_changed(struct ieee80211_hw *hw,
@@ -6373,6 +6370,8 @@ ath12k_mac_update_vif_chan(struct ath12k *ar,
 		if (WARN_ON(!arvif->is_started))
 			continue;
 
+		arvif->punct_bitmap = vifs[i].new_ctx->def.punctured;
+
 		/* Firmware expect vdev_restart only if vdev is up.
 		 * If vdev is down then it expect vdev_stop->vdev_start.
 		 */
@@ -6473,7 +6472,8 @@ static void ath12k_mac_op_change_chanctx(struct ieee80211_hw *hw,
 		goto unlock;
 
 	if (changed & IEEE80211_CHANCTX_CHANGE_WIDTH ||
-	    changed & IEEE80211_CHANCTX_CHANGE_RADAR)
+	    changed & IEEE80211_CHANCTX_CHANGE_RADAR ||
+	    changed & IEEE80211_CHANCTX_CHANGE_PUNCTURING)
 		ath12k_mac_update_active_vif_chan(ar, ctx);
 
 	/* TODO: Recalc radar detection */
@@ -6536,7 +6536,7 @@ ath12k_mac_op_assign_vif_chanctx(struct ieee80211_hw *hw,
 		   "mac chanctx assign ptr %pK vdev_id %i\n",
 		   ctx, arvif->vdev_id);
 
-	arvif->punct_bitmap = link_conf->eht_puncturing;
+	arvif->punct_bitmap = ctx->def.punctured;
 
 	/* for some targets bss peer must be created before vdev_start */
 	if (ab->hw_params->vdev_start_delay &&
diff --git a/drivers/net/wireless/ath/ath6kl/cfg80211.c b/drivers/net/wireless/ath/ath6kl/cfg80211.c
index e37db4af33de..61b2e3f15f0e 100644
--- a/drivers/net/wireless/ath/ath6kl/cfg80211.c
+++ b/drivers/net/wireless/ath/ath6kl/cfg80211.c
@@ -1119,7 +1119,7 @@ void ath6kl_cfg80211_ch_switch_notify(struct ath6kl_vif *vif, int freq,
 					NL80211_CHAN_HT20 : NL80211_CHAN_NO_HT);
 
 	wiphy_lock(vif->ar->wiphy);
-	cfg80211_ch_switch_notify(vif->ndev, &chandef, 0, 0);
+	cfg80211_ch_switch_notify(vif->ndev, &chandef, 0);
 	wiphy_unlock(vif->ar->wiphy);
 }
 
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/link.c b/drivers/net/wireless/intel/iwlwifi/mvm/link.c
index f3fcef9034ef..129eefcc45d6 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/link.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/link.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
 /*
- * Copyright (C) 2022 - 2023 Intel Corporation
+ * Copyright (C) 2022 - 2024 Intel Corporation
  */
 #include "mvm.h"
 #include "time-event.h"
@@ -195,12 +195,20 @@ int iwl_mvm_link_changed(struct iwl_mvm *mvm, struct ieee80211_vif *vif,
 	}
 
 	if (changes & LINK_CONTEXT_MODIFY_EHT_PARAMS) {
+		struct ieee80211_chanctx_conf *ctx;
+		struct cfg80211_chan_def *def = NULL;
+
+		rcu_read_lock();
+		ctx = rcu_dereference(link_conf->chanctx_conf);
+		if (ctx)
+			def = iwl_mvm_chanctx_def(mvm, ctx);
+
 		if (iwlwifi_mod_params.disable_11be ||
-		    !link_conf->eht_support)
+		    !link_conf->eht_support || !def)
 			changes &= ~LINK_CONTEXT_MODIFY_EHT_PARAMS;
 		else
-			cmd.puncture_mask =
-				cpu_to_le16(link_conf->eht_puncturing);
+			cmd.puncture_mask = cpu_to_le16(def->punctured);
+		rcu_read_unlock();
 	}
 
 	cmd.bss_color = link_conf->he_bss_color.color;
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mld-mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mld-mac80211.c
index b633a2a09c32..7d89f50fbb10 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/mld-mac80211.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/mld-mac80211.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
 /*
- * Copyright (C) 2022-2023 Intel Corporation
+ * Copyright (C) 2022-2024 Intel Corporation
  */
 #include "mvm.h"
 
@@ -752,8 +752,8 @@ iwl_mvm_mld_link_info_changed_station(struct iwl_mvm *mvm,
 		link_changes |= LINK_CONTEXT_MODIFY_HE_PARAMS;
 	}
 
-	/* Update EHT Puncturing info */
-	if (changes & BSS_CHANGED_EHT_PUNCTURING && vif->cfg.assoc)
+	/* if associated, maybe puncturing changed - we'll check later */
+	if (vif->cfg.assoc)
 		link_changes |= LINK_CONTEXT_MODIFY_EHT_PARAMS;
 
 	if (link_changes) {
diff --git a/drivers/net/wireless/marvell/mwifiex/11h.c b/drivers/net/wireless/marvell/mwifiex/11h.c
index da211372a481..b90f922f1cdc 100644
--- a/drivers/net/wireless/marvell/mwifiex/11h.c
+++ b/drivers/net/wireless/marvell/mwifiex/11h.c
@@ -288,6 +288,6 @@ void mwifiex_dfs_chan_sw_work_queue(struct work_struct *work)
 	mwifiex_dbg(priv->adapter, MSG,
 		    "indicating channel switch completion to kernel\n");
 	wiphy_lock(priv->wdev.wiphy);
-	cfg80211_ch_switch_notify(priv->netdev, &priv->dfs_chandef, 0, 0);
+	cfg80211_ch_switch_notify(priv->netdev, &priv->dfs_chandef, 0);
 	wiphy_unlock(priv->wdev.wiphy);
 }
diff --git a/drivers/net/wireless/quantenna/qtnfmac/event.c b/drivers/net/wireless/quantenna/qtnfmac/event.c
index 3b283e93a13e..76b07db284f8 100644
--- a/drivers/net/wireless/quantenna/qtnfmac/event.c
+++ b/drivers/net/wireless/quantenna/qtnfmac/event.c
@@ -478,7 +478,7 @@ qtnf_event_handle_freq_change(struct qtnf_wmac *mac,
 			continue;
 
 		wiphy_lock(priv_to_wiphy(vif->mac));
-		cfg80211_ch_switch_notify(vif->netdev, &chandef, 0, 0);
+		cfg80211_ch_switch_notify(vif->netdev, &chandef, 0);
 		wiphy_unlock(priv_to_wiphy(vif->mac));
 	}
 
diff --git a/drivers/net/wireless/realtek/rtw89/fw.c b/drivers/net/wireless/realtek/rtw89/fw.c
index 51072a2dcf10..540ea16f048e 100644
--- a/drivers/net/wireless/realtek/rtw89/fw.c
+++ b/drivers/net/wireless/realtek/rtw89/fw.c
@@ -2804,8 +2804,11 @@ int rtw89_fw_h2c_assoc_cmac_tbl_g7(struct rtw89_dev *rtwdev,
 	}
 
 	if (vif->bss_conf.eht_support) {
-		h2c->w4 |= le32_encode_bits(~vif->bss_conf.eht_puncturing,
+		u16 punct = vif->bss_conf.chanreq.oper.punctured;
+
+		h2c->w4 |= le32_encode_bits(~punct,
 					    CCTLINFO_G7_W4_ACT_SUBCH_CBW);
+		rcu_read_unlock();
 		h2c->m4 |= cpu_to_le32(CCTLINFO_G7_W4_ACT_SUBCH_CBW);
 	}
 
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index fc2ad80118e8..cb5e34d640cd 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -7,7 +7,7 @@
  * Copyright 2006-2010	Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2014 Intel Mobile Communications GmbH
  * Copyright 2015-2017	Intel Deutschland GmbH
- * Copyright (C) 2018-2021, 2023 Intel Corporation
+ * Copyright (C) 2018-2024 Intel Corporation
  */
 
 #include <linux/ethtool.h>
@@ -808,6 +808,9 @@ struct key_params {
  *	chan will define the primary channel and all other
  *	parameters are ignored.
  * @freq1_offset: offset from @center_freq1, in KHz
+ * @punctured: mask of the punctured 20 MHz subchannels, with
+ *	bits turned on being disabled (punctured); numbered
+ *	from lower to higher frequency (like in the spec)
  */
 struct cfg80211_chan_def {
 	struct ieee80211_channel *chan;
@@ -816,6 +819,7 @@ struct cfg80211_chan_def {
 	u32 center_freq2;
 	struct ieee80211_edmg edmg;
 	u16 freq1_offset;
+	u16 punctured;
 };
 
 /*
@@ -956,7 +960,8 @@ cfg80211_chandef_identical(const struct cfg80211_chan_def *chandef1,
 		chandef1->width == chandef2->width &&
 		chandef1->center_freq1 == chandef2->center_freq1 &&
 		chandef1->freq1_offset == chandef2->freq1_offset &&
-		chandef1->center_freq2 == chandef2->center_freq2);
+		chandef1->center_freq2 == chandef2->center_freq2 &&
+		chandef1->punctured == chandef2->punctured);
 }
 
 /**
@@ -1051,12 +1056,15 @@ cfg80211_chandef_dfs_cac_time(struct wiphy *wiphy,
  * cfg80211_chandef_primary_freq - calculate primary 40/80/160 MHz freq
  * @chandef: chandef to calculate for
  * @primary_chan_width: primary channel width to calculate center for
+ * @punctured: punctured sub-channel bitmap, will be recalculated
+ *	according to the new bandwidth, can be %NULL
  *
  * Returns: the primary 40/80/160 MHz channel center frequency, or -1
- *	for errors
+ *	for errors, updating the punctured bitmap
  */
-int cfg80211_chandef_primary_freq(const struct cfg80211_chan_def *chandef,
-				  enum nl80211_chan_width primary_chan_width);
+int cfg80211_chandef_primary(const struct cfg80211_chan_def *chandef,
+			     enum nl80211_chan_width primary_chan_width,
+			     u16 *punctured);
 
 /**
  * nl80211_send_chandef - sends the channel definition.
@@ -1468,9 +1476,6 @@ struct cfg80211_unsol_bcast_probe_resp {
  * @fils_discovery: FILS discovery transmission parameters
  * @unsol_bcast_probe_resp: Unsolicited broadcast probe response parameters
  * @mbssid_config: AP settings for multiple bssid
- * @punct_bitmap: Preamble puncturing bitmap. Each bit represents
- *	a 20 MHz channel, lowest bit corresponding to the lowest channel.
- *	Bit set to 1 indicates that the channel is punctured.
  */
 struct cfg80211_ap_settings {
 	struct cfg80211_chan_def chandef;
@@ -1505,7 +1510,6 @@ struct cfg80211_ap_settings {
 	struct cfg80211_fils_discovery fils_discovery;
 	struct cfg80211_unsol_bcast_probe_resp unsol_bcast_probe_resp;
 	struct cfg80211_mbssid_config mbssid_config;
-	u16 punct_bitmap;
 };
 
 
@@ -1539,9 +1543,6 @@ struct cfg80211_ap_update {
  * @radar_required: whether radar detection is required on the new channel
  * @block_tx: whether transmissions should be blocked while changing
  * @count: number of beacons until switch
- * @punct_bitmap: Preamble puncturing bitmap. Each bit represents
- *	a 20 MHz channel, lowest bit corresponding to the lowest channel.
- *	Bit set to 1 indicates that the channel is punctured.
  */
 struct cfg80211_csa_settings {
 	struct cfg80211_chan_def chandef;
@@ -1554,7 +1555,6 @@ struct cfg80211_csa_settings {
 	bool radar_required;
 	bool block_tx;
 	u8 count;
-	u16 punct_bitmap;
 };
 
 /**
@@ -8738,14 +8738,13 @@ bool cfg80211_reg_can_beacon_relax(struct wiphy *wiphy,
  * @dev: the device which switched channels
  * @chandef: the new channel definition
  * @link_id: the link ID for MLO, must be 0 for non-MLO
- * @punct_bitmap: the new puncturing bitmap
  *
  * Caller must hold wiphy mutex, therefore must only be called from sleepable
  * driver context!
  */
 void cfg80211_ch_switch_notify(struct net_device *dev,
 			       struct cfg80211_chan_def *chandef,
-			       unsigned int link_id, u16 punct_bitmap);
+			       unsigned int link_id);
 
 /*
  * cfg80211_ch_switch_started_notify - notify channel switch start
@@ -8754,7 +8753,6 @@ void cfg80211_ch_switch_notify(struct net_device *dev,
  * @link_id: the link ID for MLO, must be 0 for non-MLO
  * @count: the number of TBTTs until the channel switch happens
  * @quiet: whether or not immediate quiet was requested by the AP
- * @punct_bitmap: the future puncturing bitmap
  *
  * Inform the userspace about the channel switch that has just
  * started, so that it can take appropriate actions (eg. starting
@@ -8763,7 +8761,7 @@ void cfg80211_ch_switch_notify(struct net_device *dev,
 void cfg80211_ch_switch_started_notify(struct net_device *dev,
 				       struct cfg80211_chan_def *chandef,
 				       unsigned int link_id, u8 count,
-				       bool quiet, u16 punct_bitmap);
+				       bool quiet);
 
 /**
  * ieee80211_operating_class_to_band - convert operating class to band
@@ -9381,18 +9379,6 @@ static inline int cfg80211_color_change_notify(struct net_device *dev)
 					 0, 0);
 }
 
-/**
- * cfg80211_valid_disable_subchannel_bitmap - validate puncturing bitmap
- * @bitmap: bitmap to be validated
- * @chandef: channel definition
- *
- * Validate the puncturing bitmap.
- *
- * Return: %true if the bitmap is valid. %false otherwise.
- */
-bool cfg80211_valid_disable_subchannel_bitmap(u16 *bitmap,
-					      const struct cfg80211_chan_def *chandef);
-
 /**
  * cfg80211_links_removed - Notify about removed STA MLD setup links.
  * @dev: network device.
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index ab6bc89d3394..54aa4a06c878 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -7,7 +7,7 @@
  * Copyright 2007-2010	Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
  * Copyright (C) 2015 - 2017 Intel Deutschland GmbH
- * Copyright (C) 2018 - 2023 Intel Corporation
+ * Copyright (C) 2018 - 2024 Intel Corporation
  */
 
 #ifndef MAC80211_H
@@ -216,6 +216,8 @@ struct ieee80211_low_level_stats {
  * @IEEE80211_CHANCTX_CHANGE_MIN_WIDTH: The min required channel width changed
  * @IEEE80211_CHANCTX_CHANGE_AP: The AP channel definition changed, so (wider
  *	bandwidth) OFDMA settings need to be changed
+ * @IEEE80211_CHANCTX_CHANGE_PUNCTURING: The punctured channel(s) bitmap
+ *	was changed.
  */
 enum ieee80211_chanctx_change {
 	IEEE80211_CHANCTX_CHANGE_WIDTH		= BIT(0),
@@ -224,6 +226,7 @@ enum ieee80211_chanctx_change {
 	IEEE80211_CHANCTX_CHANGE_CHANNEL	= BIT(3),
 	IEEE80211_CHANCTX_CHANGE_MIN_WIDTH	= BIT(4),
 	IEEE80211_CHANCTX_CHANGE_AP		= BIT(5),
+	IEEE80211_CHANCTX_CHANGE_PUNCTURING	= BIT(6),
 };
 
 /**
@@ -357,7 +360,6 @@ struct ieee80211_vif_chanctx_switch {
  * @BSS_CHANGED_FILS_DISCOVERY: FILS discovery status changed.
  * @BSS_CHANGED_UNSOL_BCAST_PROBE_RESP: Unsolicited broadcast probe response
  *	status changed.
- * @BSS_CHANGED_EHT_PUNCTURING: The channel puncturing bitmap changed.
  * @BSS_CHANGED_MLD_VALID_LINKS: MLD valid links status changed.
  * @BSS_CHANGED_MLD_TTLM: TID to link mapping was changed
  */
@@ -394,7 +396,6 @@ enum ieee80211_bss_change {
 	BSS_CHANGED_HE_BSS_COLOR	= 1<<29,
 	BSS_CHANGED_FILS_DISCOVERY      = 1<<30,
 	BSS_CHANGED_UNSOL_BCAST_PROBE_RESP = 1<<31,
-	BSS_CHANGED_EHT_PUNCTURING	= BIT_ULL(32),
 	BSS_CHANGED_MLD_VALID_LINKS	= BIT_ULL(33),
 	BSS_CHANGED_MLD_TTLM		= BIT_ULL(34),
 
@@ -661,9 +662,7 @@ struct ieee80211_fils_discovery {
  * @tx_pwr_env_num: number of @tx_pwr_env.
  * @pwr_reduction: power constraint of BSS.
  * @eht_support: does this BSS support EHT
- * @eht_puncturing: bitmap to indicate which channels are punctured in this BSS
  * @csa_active: marks whether a channel switch is going on.
- * @csa_punct_bitmap: new puncturing bitmap for channel switch
  * @mu_mimo_owner: indicates interface owns MU-MIMO capability
  * @chanctx_conf: The channel context this interface is assigned to, or %NULL
  *	when it is not assigned. This pointer is RCU-protected due to the TX
@@ -766,10 +765,8 @@ struct ieee80211_bss_conf {
 	u8 tx_pwr_env_num;
 	u8 pwr_reduction;
 	bool eht_support;
-	u16 eht_puncturing;
 
 	bool csa_active;
-	u16 csa_punct_bitmap;
 
 	bool mu_mimo_owner;
 	struct ieee80211_chanctx_conf __rcu *chanctx_conf;
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 5aa02b0872d9..2ddaf0037952 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -1343,8 +1343,6 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
 			return -EOPNOTSUPP;
 
 		link_conf->eht_support = true;
-		link_conf->eht_puncturing = params->punct_bitmap;
-		changed |= BSS_CHANGED_EHT_PUNCTURING;
 
 		link_conf->eht_su_beamformer =
 			params->eht_cap->fixed.phy_cap_info[0] &
@@ -3668,12 +3666,6 @@ static int __ieee80211_csa_finalize(struct ieee80211_link_data *link_data)
 	if (err)
 		return err;
 
-	if (sdata->vif.bss_conf.eht_puncturing != sdata->vif.bss_conf.csa_punct_bitmap) {
-		sdata->vif.bss_conf.eht_puncturing =
-					sdata->vif.bss_conf.csa_punct_bitmap;
-		changed |= BSS_CHANGED_EHT_PUNCTURING;
-	}
-
 	ieee80211_link_info_change_notify(sdata, link_data, changed);
 
 	if (link_data->csa_block_tx) {
@@ -3687,8 +3679,7 @@ static int __ieee80211_csa_finalize(struct ieee80211_link_data *link_data)
 		return err;
 
 	cfg80211_ch_switch_notify(sdata->dev, &link_data->csa_chanreq.oper,
-				  link_data->link_id,
-				  link_data->conf->eht_puncturing);
+				  link_data->link_id);
 
 	return 0;
 }
@@ -3889,6 +3880,9 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev,
 				       &sdata->vif.bss_conf.chanreq.oper))
 		return -EINVAL;
 
+	if (chanreq.oper.punctured && !sdata->vif.bss_conf.eht_support)
+		return -EINVAL;
+
 	/* don't allow another channel switch if one is already active. */
 	if (sdata->vif.bss_conf.csa_active)
 		return -EBUSY;
@@ -3941,13 +3935,9 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev,
 		goto out;
 	}
 
-	if (params->punct_bitmap && !sdata->vif.bss_conf.eht_support)
-		goto out;
-
 	sdata->deflink.csa_chanreq = chanreq; 
 	sdata->deflink.csa_block_tx = params->block_tx;
 	sdata->vif.bss_conf.csa_active = true;
-	sdata->vif.bss_conf.csa_punct_bitmap = params->punct_bitmap;
 
 	if (sdata->deflink.csa_block_tx)
 		ieee80211_stop_vif_queues(local, sdata,
@@ -3955,8 +3945,7 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev,
 
 	cfg80211_ch_switch_started_notify(sdata->dev,
 					  &sdata->deflink.csa_chanreq.oper, 0,
-					  params->count, params->block_tx,
-					  sdata->vif.bss_conf.csa_punct_bitmap);
+					  params->count, params->block_tx);
 
 	if (changed) {
 		ieee80211_link_info_change_notify(sdata, &sdata->deflink,
diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c
index fe1489ba58c6..38acdc458c7c 100644
--- a/net/mac80211/chan.c
+++ b/net/mac80211/chan.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /*
  * mac80211 - channel management
- * Copyright 2020 - 2022 Intel Corporation
+ * Copyright 2020 - 2024 Intel Corporation
  */
 
 #include <linux/nl80211.h>
@@ -517,8 +517,12 @@ static void _ieee80211_change_chanctx(struct ieee80211_local *local,
 
 	ieee80211_remove_wbrf(local, &ctx->conf.def);
 
-	if (!cfg80211_chandef_identical(&ctx->conf.def, &chanreq->oper))
-		changed |= IEEE80211_CHANCTX_CHANGE_WIDTH;
+	if (!cfg80211_chandef_identical(&ctx->conf.def, &chanreq->oper)) {
+		if (ctx->conf.def.width != chanreq->oper.width)
+			changed |= IEEE80211_CHANCTX_CHANGE_WIDTH;
+		if (ctx->conf.def.punctured != chanreq->oper.punctured)
+			changed |= IEEE80211_CHANCTX_CHANGE_PUNCTURING;
+	}
 	if (!cfg80211_chandef_identical(&ctx->conf.ap, &chanreq->ap))
 		changed |= IEEE80211_CHANCTX_CHANGE_AP;
 	ctx->conf.def = *chandef;
diff --git a/net/mac80211/link.c b/net/mac80211/link.c
index 2231eb38a211..4f19d6479bef 100644
--- a/net/mac80211/link.c
+++ b/net/mac80211/link.c
@@ -2,7 +2,7 @@
 /*
  * MLO link handling
  *
- * Copyright (C) 2022-2023 Intel Corporation
+ * Copyright (C) 2022-2024 Intel Corporation
  */
 #include <linux/slab.h>
 #include <linux/kernel.h>
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 74a15f18e7ee..4eaf5c10efdb 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -93,83 +93,6 @@ MODULE_PARM_DESC(probe_wait_ms,
  */
 #define IEEE80211_SIGNAL_AVE_MIN_COUNT	4
 
-/*
- * Extract from the given disabled subchannel bitmap (raw format
- * from the EHT Operation Element) the bits for the subchannel
- * we're using right now.
- */
-static u16
-ieee80211_extract_dis_subch_bmap(const struct ieee80211_eht_operation *eht_oper,
-				 struct cfg80211_chan_def *chandef, u16 bitmap)
-{
-	struct ieee80211_eht_operation_info *info = (void *)eht_oper->optional;
-	struct cfg80211_chan_def ap_chandef = *chandef;
-	u32 ap_center_freq, local_center_freq;
-	u32 ap_bw, local_bw;
-	int ap_start_freq, local_start_freq;
-	u16 shift, mask;
-
-	if (!(eht_oper->params & IEEE80211_EHT_OPER_INFO_PRESENT) ||
-	    !(eht_oper->params &
-	      IEEE80211_EHT_OPER_DISABLED_SUBCHANNEL_BITMAP_PRESENT))
-		return 0;
-
-	/* set 160/320 supported to get the full AP definition */
-	ieee80211_chandef_eht_oper((const void *)eht_oper->optional,
-				   &ap_chandef);
-	ap_center_freq = ap_chandef.center_freq1;
-	ap_bw = 20 * BIT(u8_get_bits(info->control,
-				     IEEE80211_EHT_OPER_CHAN_WIDTH));
-	ap_start_freq = ap_center_freq - ap_bw / 2;
-	local_center_freq = chandef->center_freq1;
-	local_bw = 20 * BIT(ieee80211_chan_width_to_rx_bw(chandef->width));
-	local_start_freq = local_center_freq - local_bw / 2;
-	shift = (local_start_freq - ap_start_freq) / 20;
-	mask = BIT(local_bw / 20) - 1;
-
-	return (bitmap >> shift) & mask;
-}
-
-/*
- * Handle the puncturing bitmap, possibly downgrading bandwidth to get a
- * valid bitmap.
- */
-static void
-ieee80211_handle_puncturing_bitmap(struct ieee80211_link_data *link,
-				   const struct ieee80211_eht_operation *eht_oper,
-				   u16 bitmap, u64 *changed)
-{
-	struct cfg80211_chan_def *chandef = &link->conf->chanreq.oper;
-	struct ieee80211_local *local = link->sdata->local;
-	u16 extracted;
-	u64 _changed = 0;
-
-	if (!changed)
-		changed = &_changed;
-
-	while (chandef->width > NL80211_CHAN_WIDTH_40) {
-		extracted =
-			ieee80211_extract_dis_subch_bmap(eht_oper, chandef,
-							 bitmap);
-
-		if (cfg80211_valid_disable_subchannel_bitmap(&bitmap,
-							     chandef) &&
-		    !(bitmap && ieee80211_hw_check(&local->hw,
-						   DISALLOW_PUNCTURING)))
-			break;
-		ieee80211_chandef_downgrade(chandef, &link->u.mgd.conn);
-		*changed |= BSS_CHANGED_BANDWIDTH;
-	}
-
-	if (chandef->width <= NL80211_CHAN_WIDTH_40)
-		extracted = 0;
-
-	if (link->conf->eht_puncturing != extracted) {
-		link->conf->eht_puncturing = extracted;
-		*changed |= BSS_CHANGED_EHT_PUNCTURING;
-	}
-}
-
 /*
  * We can have multiple work items (and connection probing)
  * scheduling this timer, but we need to take care to only
@@ -396,6 +319,9 @@ ieee80211_determine_ap_chan(struct ieee80211_sub_if_data *sdata,
 		ieee80211_chandef_eht_oper((const void *)eht_oper->optional,
 					   &eht_chandef);
 
+		eht_chandef.punctured =
+			ieee80211_eht_oper_dis_subchan_bitmap(eht_oper);
+
 		if (!cfg80211_chandef_valid(&eht_chandef)) {
 			sdata_info(sdata,
 				   "AP EHT information is invalid, disabling EHT\n");
@@ -661,13 +587,27 @@ ieee80211_verify_sta_eht_mcs_support(struct ieee80211_sub_if_data *sdata,
 	return true;
 }
 
+static bool ieee80211_chandef_usable(struct ieee80211_sub_if_data *sdata,
+				     const struct cfg80211_chan_def *chandef,
+				     u32 prohibited_flags)
+{
+	if (!cfg80211_chandef_usable(sdata->local->hw.wiphy,
+				     chandef, prohibited_flags))
+		return false;
+
+	if (chandef->punctured &&
+	    ieee80211_hw_check(&sdata->local->hw, DISALLOW_PUNCTURING))
+		return false;
+
+	return true;
+}
+
 static struct ieee802_11_elems *
 ieee80211_determine_chan_mode(struct ieee80211_sub_if_data *sdata,
 			      struct ieee80211_conn_settings *conn,
 			      struct cfg80211_bss *cbss, int link_id,
 			      struct ieee80211_chan_req *chanreq)
 {
-	struct ieee80211_local *local = sdata->local;
 	const struct cfg80211_bss_ies *ies = rcu_dereference(cbss->ies);
 	struct ieee80211_bss *bss = (void *)cbss->priv;
 	struct ieee80211_channel *channel = cbss->channel;
@@ -761,8 +701,8 @@ again:
 	else
 		chanreq->ap.chan = NULL;
 
-	while (!cfg80211_chandef_usable(sdata->local->hw.wiphy, &chanreq->oper,
-					IEEE80211_CHAN_DISABLED)) {
+	while (!ieee80211_chandef_usable(sdata, &chanreq->oper,
+					 IEEE80211_CHAN_DISABLED)) {
 		if (WARN_ON(chanreq->oper.width == NL80211_CHAN_WIDTH_20_NOHT)) {
 			ret = -EINVAL;
 			goto free;
@@ -812,30 +752,6 @@ again:
 		sdata_info(sdata, "required MCSes not supported, disabling EHT\n");
 	}
 
-	if (conn->mode >= IEEE80211_CONN_MODE_EHT) {
-		u16 bitmap;
-
-		if (WARN_ON_ONCE(!elems->eht_operation)) {
-			ret = -EINVAL;
-			goto free;
-		}
-
-		bitmap = ieee80211_eht_oper_dis_subchan_bitmap(elems->eht_operation);
-
-		if (!cfg80211_valid_disable_subchannel_bitmap(&bitmap,
-							      &ap_chandef) ||
-		    (bitmap &&
-		     ieee80211_hw_check(&local->hw, DISALLOW_PUNCTURING))) {
-			conn->mode = IEEE80211_CONN_MODE_HE;
-			conn->bw_limit = min_t(enum ieee80211_conn_bw_limit,
-					       conn->bw_limit,
-					       IEEE80211_CONN_BW_LIMIT_160);
-			sdata_info(sdata,
-				   "AP has invalid/unsupported puncturing, disabling EHT\n");
-		}
-		/* FIXME: store puncturing bitmap */
-	}
-
 	/* the mode can only decrease, so this must terminate */
 	if (ap_mode != conn->mode)
 		goto again;
@@ -2127,7 +2043,7 @@ static void ieee80211_chswitch_post_beacon(struct ieee80211_link_data *link)
 	}
 
 	cfg80211_ch_switch_notify(sdata->dev, &link->reserved.oper,
-				  link->link_id, 0);
+				  link->link_id);
 }
 
 void ieee80211_chswitch_done(struct ieee80211_vif *vif, bool success,
@@ -2330,7 +2246,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_link_data *link,
 
 	cfg80211_ch_switch_started_notify(sdata->dev, &csa_ie.chanreq.oper,
 					  link->link_id, csa_ie.count,
-					  csa_ie.mode, 0);
+					  csa_ie.mode);
 
 	if (local->ops->channel_switch) {
 		/* use driver's channel switch callback */
@@ -3394,8 +3310,6 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
 	sdata->deflink.u.mgd.tracking_signal_avg = false;
 	sdata->deflink.u.mgd.disable_wmm_tracking = false;
 
-	sdata->vif.bss_conf.eht_puncturing = 0;
-
 	ifmgd->flags = 0;
 
 	for (link_id = 0; link_id < ARRAY_SIZE(sdata->link); link_id++) {
@@ -4625,7 +4539,6 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link,
 							    link_sta);
 
 			bss_conf->eht_support = link_sta->pub->eht_cap.has_eht;
-			*changed |= BSS_CHANGED_EHT_PUNCTURING;
 		} else {
 			bss_conf->eht_support = false;
 		}
@@ -5865,40 +5778,6 @@ static bool ieee80211_rx_our_beacon(const u8 *tx_bssid,
 	return ether_addr_equal(tx_bssid, bss->transmitted_bss->bssid);
 }
 
-static bool ieee80211_config_puncturing(struct ieee80211_link_data *link,
-					const struct ieee80211_eht_operation *eht_oper,
-					u64 *changed)
-{
-	struct ieee80211_local *local = link->sdata->local;
-	u16 bitmap, extracted;
-
-	bitmap = ieee80211_eht_oper_dis_subchan_bitmap(eht_oper);
-	extracted = ieee80211_extract_dis_subch_bmap(eht_oper,
-						     &link->conf->chanreq.oper,
-						     bitmap);
-
-	/* accept if there are no changes */
-	if (!(*changed & BSS_CHANGED_BANDWIDTH) &&
-	    extracted == link->conf->eht_puncturing)
-		return true;
-
-	if (!cfg80211_valid_disable_subchannel_bitmap(&bitmap,
-						      &link->conf->chanreq.oper)) {
-		link_info(link,
-			  "Got an invalid disable subchannel bitmap from AP %pM: bitmap = 0x%x, bw = 0x%x. disconnect\n",
-			  link->u.mgd.bssid,
-			  bitmap,
-			  link->conf->chanreq.oper.width);
-		return false;
-	}
-
-	if (bitmap && ieee80211_hw_check(&local->hw, DISALLOW_PUNCTURING))
-		return false;
-
-	ieee80211_handle_puncturing_bitmap(link, eht_oper, bitmap, changed);
-	return true;
-}
-
 static void ieee80211_ml_reconf_work(struct wiphy *wiphy,
 				     struct wiphy_work *work)
 {
@@ -6614,21 +6493,6 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link,
 					       elems->pwr_constr_elem,
 					       elems->cisco_dtpc_elem);
 
-	if (elems->eht_operation &&
-	    link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_EHT) {
-		if (!ieee80211_config_puncturing(link, elems->eht_operation,
-						 &changed)) {
-			ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH,
-					       WLAN_REASON_DEAUTH_LEAVING,
-					       true, deauth_buf);
-			ieee80211_report_disconnect(sdata, deauth_buf,
-						    sizeof(deauth_buf), true,
-						    WLAN_REASON_DEAUTH_LEAVING,
-						    false);
-			goto free;
-		}
-	}
-
 	ieee80211_ml_reconfiguration(sdata, elems);
 	ieee80211_process_adv_ttlm(sdata, elems,
 				      le64_to_cpu(mgmt->u.beacon.timestamp));
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 63a88169d53d..5108dbaa9360 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -6,7 +6,7 @@
  * Copyright 2007	Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
  * Copyright (C) 2015-2017	Intel Deutschland GmbH
- * Copyright (C) 2018-2023 Intel Corporation
+ * Copyright (C) 2018-2024 Intel Corporation
  *
  * utilities for mac80211
  */
@@ -2810,9 +2810,6 @@ int ieee80211_reconfig(struct ieee80211_local *local)
 				    sdata->vif.bss_conf.protected_keep_alive)
 					changed |= BSS_CHANGED_KEEP_ALIVE;
 
-				if (sdata->vif.bss_conf.eht_puncturing)
-					changed |= BSS_CHANGED_EHT_PUNCTURING;
-
 				ieee80211_bss_info_change_notify(sdata,
 								 changed);
 			} else if (!WARN_ON(!link)) {
@@ -4365,14 +4362,17 @@ EXPORT_SYMBOL(ieee80211_radar_detected);
 void ieee80211_chandef_downgrade(struct cfg80211_chan_def *c,
 				 struct ieee80211_conn_settings *conn)
 {
-	/* no-HT indicates nothing to do */
-	enum nl80211_chan_width new_primary_width = NL80211_CHAN_WIDTH_20_NOHT;
+	enum nl80211_chan_width new_primary_width;
 	struct ieee80211_conn_settings _ignored = {};
 
 	/* allow passing NULL if caller doesn't care */
 	if (!conn)
 		conn = &_ignored;
 
+again:
+	/* no-HT indicates nothing to do */
+	new_primary_width = NL80211_CHAN_WIDTH_20_NOHT;
+
 	switch (c->width) {
 	default:
 	case NL80211_CHAN_WIDTH_20_NOHT:
@@ -4382,6 +4382,7 @@ void ieee80211_chandef_downgrade(struct cfg80211_chan_def *c,
 		c->width = NL80211_CHAN_WIDTH_20_NOHT;
 		conn->mode = IEEE80211_CONN_MODE_LEGACY;
 		conn->bw_limit = IEEE80211_CONN_BW_LIMIT_20;
+		c->punctured = 0;
 		break;
 	case NL80211_CHAN_WIDTH_40:
 		c->width = NL80211_CHAN_WIDTH_20;
@@ -4389,6 +4390,7 @@ void ieee80211_chandef_downgrade(struct cfg80211_chan_def *c,
 		if (conn->mode == IEEE80211_CONN_MODE_VHT)
 			conn->mode = IEEE80211_CONN_MODE_HT;
 		conn->bw_limit = IEEE80211_CONN_BW_LIMIT_20;
+		c->punctured = 0;
 		break;
 	case NL80211_CHAN_WIDTH_80:
 		new_primary_width = NL80211_CHAN_WIDTH_40;
@@ -4429,11 +4431,19 @@ void ieee80211_chandef_downgrade(struct cfg80211_chan_def *c,
 	}
 
 	if (new_primary_width != NL80211_CHAN_WIDTH_20_NOHT) {
-		c->center_freq1 =
-			cfg80211_chandef_primary_freq(c, new_primary_width);
+		c->center_freq1 = cfg80211_chandef_primary(c, new_primary_width,
+							   &c->punctured);
 		c->width = new_primary_width;
 	}
 
+	/*
+	 * With an 80 MHz channel, we might have the puncturing in the primary
+	 * 40 Mhz channel, but that's not valid when downgraded to 40 MHz width.
+	 * In that case, downgrade again.
+	 */
+	if (!cfg80211_chandef_valid(c) && c->punctured)
+		goto again;
+
 	WARN_ON_ONCE(!cfg80211_chandef_valid(c));
 }
 
diff --git a/net/wireless/chan.c b/net/wireless/chan.c
index bfa2ea935fc2..e2ce89afa9ff 100644
--- a/net/wireless/chan.c
+++ b/net/wireless/chan.c
@@ -6,7 +6,7 @@
  *
  * Copyright 2009	Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
- * Copyright 2018-2023	Intel Corporation
+ * Copyright 2018-2024	Intel Corporation
  */
 
 #include <linux/export.h>
@@ -27,11 +27,10 @@ void cfg80211_chandef_create(struct cfg80211_chan_def *chandef,
 	if (WARN_ON(!chan))
 		return;
 
-	chandef->chan = chan;
-	chandef->freq1_offset = chan->freq_offset;
-	chandef->center_freq2 = 0;
-	chandef->edmg.bw_config = 0;
-	chandef->edmg.channels = 0;
+	*chandef = (struct cfg80211_chan_def) {
+		.chan = chan,
+		.freq1_offset = chan->freq_offset,
+	};
 
 	switch (chan_type) {
 	case NL80211_CHAN_NO_HT:
@@ -87,10 +86,9 @@ static const struct cfg80211_per_bw_puncturing_values per_bw_puncturing[] = {
 	CFG80211_PER_BW_VALID_PUNCTURING_VALUES(320)
 };
 
-bool cfg80211_valid_disable_subchannel_bitmap(u16 *bitmap,
-					      const struct cfg80211_chan_def *chandef)
+static bool valid_puncturing_bitmap(const struct cfg80211_chan_def *chandef)
 {
-	u32 idx, i, start_freq;
+	u32 idx, i, start_freq, primary_center = chandef->chan->center_freq;
 
 	switch (chandef->width) {
 	case NL80211_CHAN_WIDTH_80:
@@ -106,24 +104,23 @@ bool cfg80211_valid_disable_subchannel_bitmap(u16 *bitmap,
 		start_freq = chandef->center_freq1 - 160;
 		break;
 	default:
-		*bitmap = 0;
-		break;
+		return chandef->punctured == 0;
 	}
 
-	if (!*bitmap)
+	if (!chandef->punctured)
 		return true;
 
 	/* check if primary channel is punctured */
-	if (*bitmap & (u16)BIT((chandef->chan->center_freq - start_freq) / 20))
+	if (chandef->punctured & (u16)BIT((primary_center - start_freq) / 20))
 		return false;
 
-	for (i = 0; i < per_bw_puncturing[idx].len; i++)
-		if (per_bw_puncturing[idx].valid_values[i] == *bitmap)
+	for (i = 0; i < per_bw_puncturing[idx].len; i++) {
+		if (per_bw_puncturing[idx].valid_values[i] == chandef->punctured)
 			return true;
+	}
 
 	return false;
 }
-EXPORT_SYMBOL(cfg80211_valid_disable_subchannel_bitmap);
 
 static bool cfg80211_edmg_chandef_valid(const struct cfg80211_chan_def *chandef)
 {
@@ -386,17 +383,19 @@ bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef)
 	    !cfg80211_edmg_chandef_valid(chandef))
 		return false;
 
-	return true;
+	return valid_puncturing_bitmap(chandef);
 }
 EXPORT_SYMBOL(cfg80211_chandef_valid);
 
-int cfg80211_chandef_primary_freq(const struct cfg80211_chan_def *c,
-				  enum nl80211_chan_width primary_chan_width)
+int cfg80211_chandef_primary(const struct cfg80211_chan_def *c,
+			     enum nl80211_chan_width primary_chan_width,
+			     u16 *punctured)
 {
 	int pri_width = nl80211_chan_width_to_mhz(primary_chan_width);
 	int width = cfg80211_chandef_get_width(c);
 	u32 control = c->chan->center_freq;
 	u32 center = c->center_freq1;
+	u16 _punct = 0;
 
 	if (WARN_ON_ONCE(pri_width < 0 || width < 0))
 		return -1;
@@ -405,26 +404,41 @@ int cfg80211_chandef_primary_freq(const struct cfg80211_chan_def *c,
 	if (WARN_ON_ONCE(pri_width > width))
 		return -1;
 
+	if (!punctured)
+		punctured = &_punct;
+
+	*punctured = c->punctured;
+
 	while (width > pri_width) {
-		if (control > center)
+		unsigned int bits_to_drop = width / 20 / 2;
+
+		if (control > center) {
 			center += width / 4;
-		else
+			*punctured >>= bits_to_drop;
+		} else {
 			center -= width / 4;
+			*punctured &= (1 << bits_to_drop) - 1;
+		}
 		width /= 2;
 	}
 
 	return center;
 }
-EXPORT_SYMBOL(cfg80211_chandef_primary_freq);
+EXPORT_SYMBOL(cfg80211_chandef_primary);
 
 static const struct cfg80211_chan_def *
 check_chandef_primary_compat(const struct cfg80211_chan_def *c1,
 			     const struct cfg80211_chan_def *c2,
 			     enum nl80211_chan_width primary_chan_width)
 {
+	u16 punct_c1 = 0, punct_c2 = 0;
+
 	/* check primary is compatible -> error if not */
-	if (cfg80211_chandef_primary_freq(c1, primary_chan_width) !=
-	    cfg80211_chandef_primary_freq(c2, primary_chan_width))
+	if (cfg80211_chandef_primary(c1, primary_chan_width, &punct_c1) !=
+	    cfg80211_chandef_primary(c2, primary_chan_width, &punct_c2))
+		return ERR_PTR(-EINVAL);
+
+	if (punct_c1 != punct_c2)
 		return ERR_PTR(-EINVAL);
 
 	/* assumes c1 is smaller width, if that was just checked -> done */
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 68c20409eca6..b533412ad1e0 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -3218,21 +3218,6 @@ static bool nl80211_can_set_dev_channel(struct wireless_dev *wdev)
 		wdev->iftype == NL80211_IFTYPE_P2P_GO;
 }
 
-static int nl80211_parse_punct_bitmap(struct cfg80211_registered_device *rdev,
-				      struct genl_info *info,
-				      const struct cfg80211_chan_def *chandef,
-				      u16 *punct_bitmap)
-{
-	if (!wiphy_ext_feature_isset(&rdev->wiphy, NL80211_EXT_FEATURE_PUNCT))
-		return -EINVAL;
-
-	*punct_bitmap = nla_get_u32(info->attrs[NL80211_ATTR_PUNCT_BITMAP]);
-	if (!cfg80211_valid_disable_subchannel_bitmap(punct_bitmap, chandef))
-		return -EINVAL;
-
-	return 0;
-}
-
 int nl80211_parse_chandef(struct cfg80211_registered_device *rdev,
 			  struct genl_info *info,
 			  struct cfg80211_chan_def *chandef)
@@ -3340,6 +3325,19 @@ int nl80211_parse_chandef(struct cfg80211_registered_device *rdev,
 		chandef->edmg.channels = 0;
 	}
 
+	if (info->attrs[NL80211_ATTR_PUNCT_BITMAP]) {
+		chandef->punctured =
+			nla_get_u32(info->attrs[NL80211_ATTR_PUNCT_BITMAP]);
+
+		if (chandef->punctured &&
+		    !wiphy_ext_feature_isset(&rdev->wiphy,
+					     NL80211_EXT_FEATURE_PUNCT)) {
+			NL_SET_ERR_MSG(extack,
+				       "driver doesn't support puncturing");
+			return -EINVAL;
+		}
+	}
+
 	if (!cfg80211_chandef_valid(chandef)) {
 		NL_SET_ERR_MSG(extack, "invalid channel definition");
 		return -EINVAL;
@@ -3816,6 +3814,10 @@ int nl80211_send_chandef(struct sk_buff *msg, const struct cfg80211_chan_def *ch
 	if (chandef->center_freq2 &&
 	    nla_put_u32(msg, NL80211_ATTR_CENTER_FREQ2, chandef->center_freq2))
 		return -ENOBUFS;
+	if (chandef->punctured &&
+	    nla_put_u32(msg, NL80211_ATTR_PUNCT_BITMAP, chandef->punctured))
+		return -ENOBUFS;
+
 	return 0;
 }
 EXPORT_SYMBOL(nl80211_send_chandef);
@@ -6061,14 +6063,6 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
 		goto out;
 	}
 
-	if (info->attrs[NL80211_ATTR_PUNCT_BITMAP]) {
-		err = nl80211_parse_punct_bitmap(rdev, info,
-						 &params->chandef,
-						 &params->punct_bitmap);
-		if (err)
-			goto out;
-	}
-
 	if (!cfg80211_reg_can_beacon_relax(&rdev->wiphy, &params->chandef,
 					   wdev->iftype)) {
 		err = -EINVAL;
@@ -10228,14 +10222,6 @@ skip_beacons:
 	if (info->attrs[NL80211_ATTR_CH_SWITCH_BLOCK_TX])
 		params.block_tx = true;
 
-	if (info->attrs[NL80211_ATTR_PUNCT_BITMAP]) {
-		err = nl80211_parse_punct_bitmap(rdev, info,
-						 &params.chandef,
-						 &params.punct_bitmap);
-		if (err)
-			goto free;
-	}
-
 	err = rdev_channel_switch(rdev, dev, &params);
 
 free:
@@ -19356,7 +19342,7 @@ static void nl80211_ch_switch_notify(struct cfg80211_registered_device *rdev,
 				     struct cfg80211_chan_def *chandef,
 				     gfp_t gfp,
 				     enum nl80211_commands notif,
-				     u8 count, bool quiet, u16 punct_bitmap)
+				     u8 count, bool quiet)
 {
 	struct wireless_dev *wdev = netdev->ieee80211_ptr;
 	struct sk_buff *msg;
@@ -19390,9 +19376,6 @@ static void nl80211_ch_switch_notify(struct cfg80211_registered_device *rdev,
 			goto nla_put_failure;
 	}
 
-	if (nla_put_u32(msg, NL80211_ATTR_PUNCT_BITMAP, punct_bitmap))
-		goto nla_put_failure;
-
 	genlmsg_end(msg, hdr);
 
 	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
@@ -19405,7 +19388,7 @@ static void nl80211_ch_switch_notify(struct cfg80211_registered_device *rdev,
 
 void cfg80211_ch_switch_notify(struct net_device *dev,
 			       struct cfg80211_chan_def *chandef,
-			       unsigned int link_id, u16 punct_bitmap)
+			       unsigned int link_id)
 {
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	struct wiphy *wiphy = wdev->wiphy;
@@ -19414,7 +19397,7 @@ void cfg80211_ch_switch_notify(struct net_device *dev,
 	lockdep_assert_wiphy(wdev->wiphy);
 	WARN_INVALID_LINK_ID(wdev, link_id);
 
-	trace_cfg80211_ch_switch_notify(dev, chandef, link_id, punct_bitmap);
+	trace_cfg80211_ch_switch_notify(dev, chandef, link_id);
 
 	switch (wdev->iftype) {
 	case NL80211_IFTYPE_STATION:
@@ -19443,15 +19426,14 @@ void cfg80211_ch_switch_notify(struct net_device *dev,
 	cfg80211_sched_dfs_chan_update(rdev);
 
 	nl80211_ch_switch_notify(rdev, dev, link_id, chandef, GFP_KERNEL,
-				 NL80211_CMD_CH_SWITCH_NOTIFY, 0, false,
-				 punct_bitmap);
+				 NL80211_CMD_CH_SWITCH_NOTIFY, 0, false);
 }
 EXPORT_SYMBOL(cfg80211_ch_switch_notify);
 
 void cfg80211_ch_switch_started_notify(struct net_device *dev,
 				       struct cfg80211_chan_def *chandef,
 				       unsigned int link_id, u8 count,
-				       bool quiet, u16 punct_bitmap)
+				       bool quiet)
 {
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	struct wiphy *wiphy = wdev->wiphy;
@@ -19460,13 +19442,12 @@ void cfg80211_ch_switch_started_notify(struct net_device *dev,
 	lockdep_assert_wiphy(wdev->wiphy);
 	WARN_INVALID_LINK_ID(wdev, link_id);
 
-	trace_cfg80211_ch_switch_started_notify(dev, chandef, link_id,
-						punct_bitmap);
+	trace_cfg80211_ch_switch_started_notify(dev, chandef, link_id);
 
 
 	nl80211_ch_switch_notify(rdev, dev, link_id, chandef, GFP_KERNEL,
 				 NL80211_CMD_CH_SWITCH_STARTED_NOTIFY,
-				 count, quiet, punct_bitmap);
+				 count, quiet);
 }
 EXPORT_SYMBOL(cfg80211_ch_switch_started_notify);
 
diff --git a/net/wireless/tests/chan.c b/net/wireless/tests/chan.c
index b9e7a27e43cb..d02258ac2dab 100644
--- a/net/wireless/tests/chan.c
+++ b/net/wireless/tests/chan.c
@@ -2,7 +2,7 @@
 /*
  * KUnit tests for channel helper functions
  *
- * Copyright (C) 2023 Intel Corporation
+ * Copyright (C) 2023-2024 Intel Corporation
  */
 #include <net/cfg80211.h>
 #include <kunit/test.h>
@@ -140,6 +140,52 @@ static const struct chandef_compat_case {
 		},
 		.compat = true,
 	},
+	{
+		.desc = "matching primary 160 MHz & punctured secondary 160 Mhz",
+		.c1 = {
+			.width = NL80211_CHAN_WIDTH_160,
+			.chan = &chan_6ghz_105,
+			.center_freq1 = 6475 + 70,
+		},
+		.c2 = {
+			.width = NL80211_CHAN_WIDTH_320,
+			.chan = &chan_6ghz_105,
+			.center_freq1 = 6475 - 10,
+			.punctured = 0xf,
+		},
+		.compat = true,
+	},
+	{
+		.desc = "matching primary 160 MHz & punctured matching",
+		.c1 = {
+			.width = NL80211_CHAN_WIDTH_160,
+			.chan = &chan_6ghz_105,
+			.center_freq1 = 6475 + 70,
+			.punctured = 0xc0,
+		},
+		.c2 = {
+			.width = NL80211_CHAN_WIDTH_320,
+			.chan = &chan_6ghz_105,
+			.center_freq1 = 6475 - 10,
+			.punctured = 0xc000,
+		},
+		.compat = true,
+	},
+	{
+		.desc = "matching primary 160 MHz & punctured not matching",
+		.c1 = {
+			.width = NL80211_CHAN_WIDTH_160,
+			.chan = &chan_6ghz_105,
+			.center_freq1 = 6475 + 70,
+			.punctured = 0x80,
+		},
+		.c2 = {
+			.width = NL80211_CHAN_WIDTH_320,
+			.chan = &chan_6ghz_105,
+			.center_freq1 = 6475 - 10,
+			.punctured = 0xc000,
+		},
+	},
 };
 
 KUNIT_ARRAY_PARAM_DESC(chandef_compat, chandef_compat_cases, desc)
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index 1f374c8a17a5..ae5e585b6863 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -1,4 +1,9 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Portions of this file
+ * Copyright(c) 2016-2017 Intel Deutschland GmbH
+ * Copyright (C) 2018, 2020-2024 Intel Corporation
+ */
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM cfg80211
 
@@ -135,7 +140,8 @@
 		       __field(u32, width)				\
 		       __field(u32, center_freq1)			\
 		       __field(u32, freq1_offset)			\
-		       __field(u32, center_freq2)
+		       __field(u32, center_freq2)			\
+		       __field(u16, punctured)
 #define CHAN_DEF_ASSIGN(chandef)					\
 	do {								\
 		if ((chandef) && (chandef)->chan) {			\
@@ -148,6 +154,7 @@
 			__entry->center_freq1 = (chandef)->center_freq1;\
 			__entry->freq1_offset = (chandef)->freq1_offset;\
 			__entry->center_freq2 = (chandef)->center_freq2;\
+			__entry->punctured = (chandef)->punctured;	\
 		} else {						\
 			__entry->band = 0;				\
 			__entry->control_freq = 0;			\
@@ -156,14 +163,15 @@
 			__entry->center_freq1 = 0;			\
 			__entry->freq1_offset = 0;			\
 			__entry->center_freq2 = 0;			\
+			__entry->punctured = 0;				\
 		}							\
 	} while (0)
 #define CHAN_DEF_PR_FMT							\
-	"band: %d, control freq: %u.%03u, width: %d, cf1: %u.%03u, cf2: %u"
+	"band: %d, control freq: %u.%03u, width: %d, cf1: %u.%03u, cf2: %u, punct: 0x%x"
 #define CHAN_DEF_PR_ARG __entry->band, __entry->control_freq,		\
 			__entry->freq_offset, __entry->width,		\
 			__entry->center_freq1, __entry->freq1_offset,	\
-			__entry->center_freq2
+			__entry->center_freq2, __entry->punctured
 
 #define FILS_AAD_ASSIGN(fa)						\
 	do {								\
@@ -3267,47 +3275,39 @@ TRACE_EVENT(cfg80211_chandef_dfs_required,
 TRACE_EVENT(cfg80211_ch_switch_notify,
 	TP_PROTO(struct net_device *netdev,
 		 struct cfg80211_chan_def *chandef,
-		 unsigned int link_id,
-		 u16 punct_bitmap),
-	TP_ARGS(netdev, chandef, link_id, punct_bitmap),
+		 unsigned int link_id),
+	TP_ARGS(netdev, chandef, link_id),
 	TP_STRUCT__entry(
 		NETDEV_ENTRY
 		CHAN_DEF_ENTRY
 		__field(unsigned int, link_id)
-		__field(u16, punct_bitmap)
 	),
 	TP_fast_assign(
 		NETDEV_ASSIGN;
 		CHAN_DEF_ASSIGN(chandef);
 		__entry->link_id = link_id;
-		__entry->punct_bitmap = punct_bitmap;
 	),
-	TP_printk(NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT ", link:%d, punct_bitmap:%u",
-		  NETDEV_PR_ARG, CHAN_DEF_PR_ARG, __entry->link_id,
-		  __entry->punct_bitmap)
+	TP_printk(NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT ", link:%d",
+		  NETDEV_PR_ARG, CHAN_DEF_PR_ARG, __entry->link_id)
 );
 
 TRACE_EVENT(cfg80211_ch_switch_started_notify,
 	TP_PROTO(struct net_device *netdev,
 		 struct cfg80211_chan_def *chandef,
-		 unsigned int link_id,
-		 u16 punct_bitmap),
-	TP_ARGS(netdev, chandef, link_id, punct_bitmap),
+		 unsigned int link_id),
+	TP_ARGS(netdev, chandef, link_id),
 	TP_STRUCT__entry(
 		NETDEV_ENTRY
 		CHAN_DEF_ENTRY
 		__field(unsigned int, link_id)
-		__field(u16, punct_bitmap)
 	),
 	TP_fast_assign(
 		NETDEV_ASSIGN;
 		CHAN_DEF_ASSIGN(chandef);
 		__entry->link_id = link_id;
-		__entry->punct_bitmap = punct_bitmap;
 	),
-	TP_printk(NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT ", link:%d, punct_bitmap:%u",
-		  NETDEV_PR_ARG, CHAN_DEF_PR_ARG, __entry->link_id,
-		  __entry->punct_bitmap)
+	TP_printk(NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT ", link:%d",
+		  NETDEV_PR_ARG, CHAN_DEF_PR_ARG, __entry->link_id)
 );
 
 TRACE_EVENT(cfg80211_radar_event,
-- 
cgit v1.2.3


From b1344b1399daec9aca62bd0b2ea94874f5b8e126 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 29 Jan 2024 20:04:56 +0100
Subject: wifi: mac80211: add/use ieee80211_get_sn()

This will also be useful for MLO duplicate multicast
detection, but add it already here and use it in one
place that trivially converts.

Link: https://msgid.link/20240129200456.f0ff49c80006.I850d2785ab1640e56e262d3ad7343b87f6962552@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 7 ++++++-
 net/mac80211/rx.c         | 5 ++---
 2 files changed, 8 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index d9d2c1253157..b9367d5f04c4 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -9,7 +9,7 @@
  * Copyright (c) 2006, Michael Wu <flamingice@sourmilk.net>
  * Copyright (c) 2013 - 2014 Intel Mobile Communications GmbH
  * Copyright (c) 2016 - 2017 Intel Deutschland GmbH
- * Copyright (c) 2018 - 2023 Intel Corporation
+ * Copyright (c) 2018 - 2024 Intel Corporation
  */
 
 #ifndef LINUX_IEEE80211_H
@@ -808,6 +808,11 @@ static inline bool ieee80211_is_frag(struct ieee80211_hdr *hdr)
 	       hdr->seq_ctrl & cpu_to_le16(IEEE80211_SCTL_FRAG);
 }
 
+static inline u16 ieee80211_get_sn(struct ieee80211_hdr *hdr)
+{
+	return le16_get_bits(hdr->seq_ctrl, IEEE80211_SCTL_SEQ);
+}
+
 struct ieee80211s_hdr {
 	u8 flags;
 	u8 ttl;
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 76798e8057f7..53c4764dc1ed 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -6,7 +6,7 @@
  * Copyright 2007-2010	Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
  * Copyright(c) 2015 - 2017 Intel Deutschland GmbH
- * Copyright (C) 2018-2023 Intel Corporation
+ * Copyright (C) 2018-2024 Intel Corporation
  */
 
 #include <linux/jiffies.h>
@@ -1251,8 +1251,7 @@ static bool ieee80211_sta_manage_reorder_buf(struct ieee80211_sub_if_data *sdata
 {
 	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
 	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
-	u16 sc = le16_to_cpu(hdr->seq_ctrl);
-	u16 mpdu_seq_num = (sc & IEEE80211_SCTL_SEQ) >> 4;
+	u16 mpdu_seq_num = ieee80211_get_sn(hdr);
 	u16 head_seq_num, buf_size;
 	int index;
 	bool ret = true;
-- 
cgit v1.2.3


From 676259100cf3a81dd2d47918b36edb237986b9df Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 29 Jan 2024 20:04:57 +0100
Subject: wifi: mac80211: implement MLO multicast deduplication

If the vif is an MLD then it may receive multicast from
different links, and should drop those frames according
to the SN. Implement that.

Link: https://msgid.link/20240129200456.693b77d14b44.I491846f2bea0058c14eab6422962c10bfae9b675@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h  |  5 +++++
 net/mac80211/ieee80211_i.h |  4 +++-
 net/mac80211/mlme.c        |  3 +++
 net/mac80211/rx.c          | 22 ++++++++++++++++++++--
 4 files changed, 31 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index b9367d5f04c4..e9078143b822 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -191,6 +191,11 @@ static inline bool ieee80211_sn_less(u16 sn1, u16 sn2)
 	return ((sn1 - sn2) & IEEE80211_SN_MASK) > (IEEE80211_SN_MODULO >> 1);
 }
 
+static inline bool ieee80211_sn_less_eq(u16 sn1, u16 sn2)
+{
+	return ((sn2 - sn1) & IEEE80211_SN_MASK) <= (IEEE80211_SN_MODULO >> 1);
+}
+
 static inline u16 ieee80211_sn_add(u16 sn1, u16 sn2)
 {
 	return (sn1 + sn2) & IEEE80211_SN_MASK;
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 534cac3fc8df..46b517cf47ea 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -5,7 +5,7 @@
  * Copyright 2006-2007	Jiri Benc <jbenc@suse.cz>
  * Copyright 2007-2010	Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2015  Intel Mobile Communications GmbH
- * Copyright (C) 2018-2022 Intel Corporation
+ * Copyright (C) 2018-2024 Intel Corporation
  */
 
 #ifndef IEEE80211_I_H
@@ -523,6 +523,8 @@ struct ieee80211_if_managed {
 
 	unsigned int flags;
 
+	u16 mcast_seq_last;
+
 	bool status_acked;
 	bool status_received;
 	__le16 status_fc;
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 4eaf5c10efdb..35dda5982854 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -3351,6 +3351,8 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
 	wiphy_delayed_work_cancel(sdata->local->hw.wiphy,
 				  &ifmgd->neg_ttlm_timeout_work);
 	ieee80211_vif_set_links(sdata, 0, 0);
+
+	ifmgd->mcast_seq_last = IEEE80211_SN_MODULO;
 }
 
 static void ieee80211_reset_ap_probe(struct ieee80211_sub_if_data *sdata)
@@ -7512,6 +7514,7 @@ void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata)
 	spin_lock_init(&ifmgd->teardown_lock);
 	ifmgd->teardown_skb = NULL;
 	ifmgd->orig_teardown_skb = NULL;
+	ifmgd->mcast_seq_last = IEEE80211_SN_MODULO;
 }
 
 static void ieee80211_recalc_smps_work(struct wiphy *wiphy,
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 53c4764dc1ed..9902ea69af0a 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -1434,13 +1434,31 @@ ieee80211_rx_h_check_dup(struct ieee80211_rx_data *rx)
 		return RX_CONTINUE;
 
 	if (ieee80211_is_ctl(hdr->frame_control) ||
-	    ieee80211_is_any_nullfunc(hdr->frame_control) ||
-	    is_multicast_ether_addr(hdr->addr1))
+	    ieee80211_is_any_nullfunc(hdr->frame_control))
 		return RX_CONTINUE;
 
 	if (!rx->sta)
 		return RX_CONTINUE;
 
+	if (unlikely(is_multicast_ether_addr(hdr->addr1))) {
+		struct ieee80211_sub_if_data *sdata = rx->sdata;
+		u16 sn = ieee80211_get_sn(hdr);
+
+		if (!ieee80211_is_data_present(hdr->frame_control))
+			return RX_CONTINUE;
+
+		if (!ieee80211_vif_is_mld(&sdata->vif) ||
+		    sdata->vif.type != NL80211_IFTYPE_STATION)
+			return RX_CONTINUE;
+
+		if (sdata->u.mgd.mcast_seq_last != IEEE80211_SN_MODULO &&
+		    ieee80211_sn_less_eq(sn, sdata->u.mgd.mcast_seq_last))
+			return RX_DROP_U_DUP;
+
+		sdata->u.mgd.mcast_seq_last = sn;
+		return RX_CONTINUE;
+	}
+
 	if (unlikely(ieee80211_has_retry(hdr->frame_control) &&
 		     rx->sta->last_seq_ctrl[rx->seqno_idx] == hdr->seq_ctrl)) {
 		I802_DEBUG_INC(rx->local->dot11FrameDuplicateCount);
-- 
cgit v1.2.3


From 6239da18d2f947523a80fb1f85f8d8a13d1726c1 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 29 Jan 2024 20:19:28 +0100
Subject: wifi: mac80211: adjust EHT capa when lowering bandwidth

If intending to associate with a lower bandwidth, remove capabilities
related to 320 MHz from the EHT capabilities element. Also change the
EHT MCS-NSS set accordingly: if just reducing 320->160 or similar the
format doesn't change, just cut off the last bytes. If changing from
higher bandwidth to 20 MHz only EHT STA, adjust the format.

Note that this also requires adjusting the caller in mlme.c since the
data written can now be shorter than it determined. We need to clean
all that up. Since the other callers pass NULL for the conn limit, we
don't need to change things there.

Link: https://msgid.link/20240129202041.b5f6df108c77.I0d8ea04079c61cb3744cc88625eeaf0d4776dc2b@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h  |   3 ++
 net/mac80211/ieee80211_i.h |   3 +-
 net/mac80211/mesh.c        |   3 +-
 net/mac80211/mlme.c        |  15 ++++--
 net/mac80211/tdls.c        |   5 +-
 net/mac80211/util.c        | 114 ++++++++++++++++++++++++++++++++++-----------
 6 files changed, 106 insertions(+), 37 deletions(-)

(limited to 'include')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index e9078143b822..e4322238f273 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -3060,6 +3060,9 @@ ieee80211_he_spr_size(const u8 *he_spr_ie)
 #define IEEE80211_EHT_PHY_CAP5_SUPP_EXTRA_EHT_LTF		0x40
 #define IEEE80211_EHT_PHY_CAP6_MAX_NUM_SUPP_EHT_LTF_MASK	0x07
 
+#define IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_80MHZ			0x08
+#define IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_160MHZ		0x30
+#define IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_320MHZ		0x40
 #define IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_MASK			0x78
 #define IEEE80211_EHT_PHY_CAP6_EHT_DUP_6GHZ_SUPP		0x80
 
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 44400ce9a0b1..43c55ea6349c 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -2652,7 +2652,8 @@ void ieee80211_init_frag_cache(struct ieee80211_fragment_cache *cache);
 void ieee80211_destroy_frag_cache(struct ieee80211_fragment_cache *cache);
 
 u8 ieee80211_ie_len_eht_cap(struct ieee80211_sub_if_data *sdata);
-u8 *ieee80211_ie_build_eht_cap(u8 *pos,
+u8 *ieee80211_ie_build_eht_cap(const struct ieee80211_conn_settings *conn,
+			       u8 *pos,
 			       const struct ieee80211_sta_he_cap *he_cap,
 			       const struct ieee80211_sta_eht_cap *eht_cap,
 			       u8 *end,
diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
index 9fd209e4ca19..000fa9484b4e 100644
--- a/net/mac80211/mesh.c
+++ b/net/mac80211/mesh.c
@@ -668,7 +668,8 @@ int mesh_add_eht_cap_ie(struct ieee80211_sub_if_data *sdata,
 		return -ENOMEM;
 
 	pos = skb_put(skb, ie_len);
-	ieee80211_ie_build_eht_cap(pos, he_cap, eht_cap, pos + ie_len, false);
+	ieee80211_ie_build_eht_cap(NULL, pos, he_cap, eht_cap, pos + ie_len,
+				   false);
 
 	return 0;
 }
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 9a0331d914d3..bbc7894ccad0 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -1072,9 +1072,10 @@ static void ieee80211_add_he_ie(struct ieee80211_sub_if_data *sdata,
 
 static void ieee80211_add_eht_ie(struct ieee80211_sub_if_data *sdata,
 				 struct sk_buff *skb,
-				 struct ieee80211_supported_band *sband)
+				 struct ieee80211_supported_band *sband,
+				 const struct ieee80211_conn_settings *conn)
 {
-	u8 *pos;
+	u8 *pos, *pre_eht_pos;
 	const struct ieee80211_sta_he_cap *he_cap;
 	const struct ieee80211_sta_eht_cap *eht_cap;
 	u8 eht_cap_size;
@@ -1097,8 +1098,11 @@ static void ieee80211_add_eht_ie(struct ieee80211_sub_if_data *sdata,
 		ieee80211_eht_ppe_size(eht_cap->eht_ppe_thres[0],
 				       eht_cap->eht_cap_elem.phy_cap_info);
 	pos = skb_put(skb, eht_cap_size);
-	ieee80211_ie_build_eht_cap(pos, he_cap, eht_cap, pos + eht_cap_size,
-				   false);
+	pre_eht_pos = pos;
+	pos = ieee80211_ie_build_eht_cap(conn, pos, he_cap, eht_cap,
+					 pos + eht_cap_size, false);
+	/* trim excess if any */
+	skb_trim(skb, skb->len - (pre_eht_pos + eht_cap_size - pos));
 }
 
 static void ieee80211_assoc_add_rates(struct sk_buff *skb,
@@ -1453,7 +1457,8 @@ static size_t ieee80211_assoc_link_elems(struct ieee80211_sub_if_data *sdata,
 	present_elems = NULL;
 
 	if (assoc_data->link[link_id].conn.mode >= IEEE80211_CONN_MODE_EHT)
-		ieee80211_add_eht_ie(sdata, skb, sband);
+		ieee80211_add_eht_ie(sdata, skb, sband,
+				     &assoc_data->link[link_id].conn);
 
 	if (sband->band == NL80211_BAND_S1GHZ) {
 		ieee80211_add_aid_request_ie(sdata, skb);
diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c
index 0f4aa42e070f..57673f27daf4 100644
--- a/net/mac80211/tdls.c
+++ b/net/mac80211/tdls.c
@@ -6,7 +6,7 @@
  * Copyright 2014, Intel Corporation
  * Copyright 2014  Intel Mobile Communications GmbH
  * Copyright 2015 - 2016 Intel Deutschland GmbH
- * Copyright (C) 2019, 2021-2023 Intel Corporation
+ * Copyright (C) 2019, 2021-2024 Intel Corporation
  */
 
 #include <linux/ieee80211.h>
@@ -604,7 +604,8 @@ ieee80211_tdls_add_setup_start_ies(struct ieee80211_link_data *link,
 			ieee80211_eht_ppe_size(eht_cap->eht_ppe_thres[0],
 					       eht_cap->eht_cap_elem.phy_cap_info);
 		pos = skb_put(skb, cap_size);
-		ieee80211_ie_build_eht_cap(pos, he_cap, eht_cap, pos + cap_size, false);
+		ieee80211_ie_build_eht_cap(NULL, pos, he_cap, eht_cap,
+					   pos + cap_size, false);
 	}
 
 	/* add any remaining IEs */
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index e9a7978d47d4..5224c22b1afc 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -2250,7 +2250,8 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_sub_if_data *sdata,
 	    cfg80211_any_usable_channels(local->hw.wiphy, BIT(sband->band),
 					 IEEE80211_CHAN_NO_HE |
 					 IEEE80211_CHAN_NO_EHT)) {
-		pos = ieee80211_ie_build_eht_cap(pos, he_cap, eht_cap, end,
+		pos = ieee80211_ie_build_eht_cap(NULL, pos, he_cap, eht_cap,
+						 end,
 						 sdata->vif.type == NL80211_IFTYPE_AP);
 		if (!pos)
 			goto out_err;
@@ -3294,6 +3295,24 @@ u8 ieee80211_ie_len_he_cap(struct ieee80211_sub_if_data *sdata)
 				     he_cap->he_cap_elem.phy_cap_info);
 }
 
+static void
+ieee80211_get_adjusted_he_cap(const struct ieee80211_conn_settings *conn,
+			      const struct ieee80211_sta_he_cap *he_cap,
+			      struct ieee80211_he_cap_elem *elem)
+{
+	*elem = he_cap->he_cap_elem;
+
+	if (conn->bw_limit < IEEE80211_CONN_BW_LIMIT_40)
+		elem->phy_cap_info[0] &=
+			~(IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G |
+			  IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G);
+
+	if (conn->bw_limit < IEEE80211_CONN_BW_LIMIT_160)
+		elem->phy_cap_info[0] &=
+			~(IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G |
+			  IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G);
+}
+
 u8 *ieee80211_ie_build_he_cap(const struct ieee80211_conn_settings *conn,
 			      const struct ieee80211_sta_he_cap *he_cap,
 			      u8 *pos, u8 *end)
@@ -3307,25 +3326,11 @@ u8 *ieee80211_ie_build_he_cap(const struct ieee80211_conn_settings *conn,
 		conn = &ieee80211_conn_settings_unlimited;
 
 	/* Make sure we have place for the IE */
-	/*
-	 * TODO: the 1 added is because this temporarily is under the EXTENSION
-	 * IE. Get rid of it when it moves.
-	 */
 	if (!he_cap)
 		return orig_pos;
 
 	/* modify on stack first to calculate 'n' and 'ie_len' correctly */
-	elem = he_cap->he_cap_elem;
-
-	if (conn->bw_limit < IEEE80211_CONN_BW_LIMIT_40)
-		elem.phy_cap_info[0] &=
-			~(IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G |
-			  IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G);
-
-	if (conn->bw_limit < IEEE80211_CONN_BW_LIMIT_160)
-		elem.phy_cap_info[0] &=
-			~(IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G |
-			  IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G);
+	ieee80211_get_adjusted_he_cap(conn, he_cap, &elem);
 
 	n = ieee80211_he_mcs_nss_size(&elem);
 	ie_len = 2 + 1 +
@@ -5096,25 +5101,65 @@ u8 ieee80211_ie_len_eht_cap(struct ieee80211_sub_if_data *sdata)
 	return 0;
 }
 
-u8 *ieee80211_ie_build_eht_cap(u8 *pos,
+u8 *ieee80211_ie_build_eht_cap(const struct ieee80211_conn_settings *conn,
+			       u8 *pos,
 			       const struct ieee80211_sta_he_cap *he_cap,
 			       const struct ieee80211_sta_eht_cap *eht_cap,
-			       u8 *end,
-			       bool for_ap)
+			       u8 *end, bool for_ap)
 {
+	struct ieee80211_eht_cap_elem_fixed fixed, *out;
+	struct ieee80211_he_cap_elem he;
 	u8 mcs_nss_len, ppet_len;
+	u8 orig_mcs_nss_len;
 	u8 ie_len;
 	u8 *orig_pos = pos;
 
+	if (!conn)
+		conn = &ieee80211_conn_settings_unlimited;
+
 	/* Make sure we have place for the IE */
 	if (!he_cap || !eht_cap)
 		return orig_pos;
 
-	mcs_nss_len = ieee80211_eht_mcs_nss_size(&he_cap->he_cap_elem,
-						 &eht_cap->eht_cap_elem,
-						 for_ap);
+	orig_mcs_nss_len = ieee80211_eht_mcs_nss_size(&he_cap->he_cap_elem,
+						      &eht_cap->eht_cap_elem,
+						      for_ap);
+
+	ieee80211_get_adjusted_he_cap(conn, he_cap, &he);
+
+	fixed = eht_cap->eht_cap_elem;
+
+	if (conn->bw_limit < IEEE80211_CONN_BW_LIMIT_80)
+		fixed.phy_cap_info[6] &=
+			~IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_80MHZ;
+
+	if (conn->bw_limit < IEEE80211_CONN_BW_LIMIT_160) {
+		fixed.phy_cap_info[1] &=
+			~IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_160MHZ_MASK;
+		fixed.phy_cap_info[2] &=
+			~IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_160MHZ_MASK;
+		fixed.phy_cap_info[6] &=
+			~IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_160MHZ;
+	}
+
+	if (conn->bw_limit < IEEE80211_CONN_BW_LIMIT_320) {
+		fixed.phy_cap_info[0] &=
+			~IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ;
+		fixed.phy_cap_info[1] &=
+			~IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_320MHZ_MASK;
+		fixed.phy_cap_info[2] &=
+			~IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_320MHZ_MASK;
+		fixed.phy_cap_info[6] &=
+			~IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_320MHZ;
+	}
+
+	if (conn->bw_limit == IEEE80211_CONN_BW_LIMIT_20)
+		fixed.phy_cap_info[0] &=
+			~IEEE80211_EHT_PHY_CAP0_242_TONE_RU_GT20MHZ;
+
+	mcs_nss_len = ieee80211_eht_mcs_nss_size(&he, &fixed, for_ap);
 	ppet_len = ieee80211_eht_ppe_size(eht_cap->eht_ppe_thres[0],
-					  eht_cap->eht_cap_elem.phy_cap_info);
+					  fixed.phy_cap_info);
 
 	ie_len = 2 + 1 + sizeof(eht_cap->eht_cap_elem) + mcs_nss_len + ppet_len;
 	if ((end - pos) < ie_len)
@@ -5124,12 +5169,25 @@ u8 *ieee80211_ie_build_eht_cap(u8 *pos,
 	*pos++ = ie_len - 2;
 	*pos++ = WLAN_EID_EXT_EHT_CAPABILITY;
 
-	/* Fixed data */
-	memcpy(pos, &eht_cap->eht_cap_elem, sizeof(eht_cap->eht_cap_elem));
-	pos += sizeof(eht_cap->eht_cap_elem);
+	out = (void *)pos;
+	*out = fixed;
+	pos += sizeof(*out);
 
-	memcpy(pos, &eht_cap->eht_mcs_nss_supp, mcs_nss_len);
-	pos += mcs_nss_len;
+	if (mcs_nss_len == 4 && orig_mcs_nss_len != 4) {
+		/*
+		 * If the (non-AP) STA became 20 MHz only, then convert from
+		 * <=80 to 20-MHz-only format, where MCSes are indicated in
+		 * the groups 0-7, 8-9, 10-11, 12-13 rather than just 0-9,
+		 * 10-11, 12-13. Thus, use 0-9 for 0-7 and 8-9.
+		 */
+		*pos++ = eht_cap->eht_mcs_nss_supp.bw._80.rx_tx_mcs9_max_nss;
+		*pos++ = eht_cap->eht_mcs_nss_supp.bw._80.rx_tx_mcs9_max_nss;
+		*pos++ = eht_cap->eht_mcs_nss_supp.bw._80.rx_tx_mcs11_max_nss;
+		*pos++ = eht_cap->eht_mcs_nss_supp.bw._80.rx_tx_mcs13_max_nss;
+	} else {
+		memcpy(pos, &eht_cap->eht_mcs_nss_supp, mcs_nss_len);
+		pos += mcs_nss_len;
+	}
 
 	if (ppet_len) {
 		memcpy(pos, &eht_cap->eht_ppe_thres, ppet_len);
-- 
cgit v1.2.3


From 68de13028b94572fc570b7eb1e0e2de1d751fe7e Mon Sep 17 00:00:00 2001
From: Michael-CY Lee <michael-cy.lee@mediatek.com>
Date: Fri, 22 Dec 2023 09:09:13 +0800
Subject: wifi: cfg80211: Add utility for converting op_class into chandef

This utility is used in STA CSA handling. The op_class in the ECSA
Element can be converted into chandef.

Co-developed-by: Money Wang <money.wang@mediatek.com>
Signed-off-by: Michael-CY Lee <michael-cy.lee@mediatek.com>
Link: https://msgid.link/20231222010914.6521-2-michael-cy.lee@mediatek.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 13 +++++++++
 net/wireless/util.c    | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 89 insertions(+)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index cb5e34d640cd..e27ed2307cdb 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -8774,6 +8774,19 @@ void cfg80211_ch_switch_started_notify(struct net_device *dev,
 bool ieee80211_operating_class_to_band(u8 operating_class,
 				       enum nl80211_band *band);
 
+/**
+ * ieee80211_operating_class_to_chandef - convert operating class to chandef
+ *
+ * @operating_class: the operating class to convert
+ * @chan: the ieee80211_channel to convert
+ * @chandef: a pointer to the resulting chandef
+ *
+ * Returns %true if the conversion was successful, %false otherwise.
+ */
+bool ieee80211_operating_class_to_chandef(u8 operating_class,
+					  struct ieee80211_channel *chan,
+					  struct cfg80211_chan_def *chandef);
+
 /**
  * ieee80211_chandef_to_operating_class - convert chandef to operation class
  *
diff --git a/net/wireless/util.c b/net/wireless/util.c
index d1ce3bee2797..379f742fd741 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -2073,6 +2073,82 @@ bool ieee80211_operating_class_to_band(u8 operating_class,
 }
 EXPORT_SYMBOL(ieee80211_operating_class_to_band);
 
+bool ieee80211_operating_class_to_chandef(u8 operating_class,
+					  struct ieee80211_channel *chan,
+					  struct cfg80211_chan_def *chandef)
+{
+	u32 control_freq, offset = 0;
+	enum nl80211_band band;
+
+	if (!ieee80211_operating_class_to_band(operating_class, &band) ||
+	    !chan || band != chan->band)
+		return false;
+
+	control_freq = chan->center_freq;
+	chandef->chan = chan;
+
+	if (control_freq >= 5955)
+		offset = control_freq - 5955;
+	else if (control_freq >= 5745)
+		offset = control_freq - 5745;
+	else if (control_freq >= 5180)
+		offset = control_freq - 5180;
+	offset /= 20;
+
+	switch (operating_class) {
+	case 81:  /* 2 GHz band; 20 MHz; channels 1..13 */
+	case 82:  /* 2 GHz band; 20 MHz; channel 14 */
+	case 115: /* 5 GHz band; 20 MHz; channels 36,40,44,48 */
+	case 118: /* 5 GHz band; 20 MHz; channels 52,56,60,64 */
+	case 121: /* 5 GHz band; 20 MHz; channels 100..144 */
+	case 124: /* 5 GHz band; 20 MHz; channels 149,153,157,161 */
+	case 125: /* 5 GHz band; 20 MHz; channels 149..177 */
+	case 131: /* 6 GHz band; 20 MHz; channels 1..233*/
+	case 136: /* 6 GHz band; 20 MHz; channel 2 */
+		chandef->center_freq1 = control_freq;
+		chandef->width = NL80211_CHAN_WIDTH_20;
+		return true;
+	case 83:  /* 2 GHz band; 40 MHz; channels 1..9 */
+	case 116: /* 5 GHz band; 40 MHz; channels 36,44 */
+	case 119: /* 5 GHz band; 40 MHz; channels 52,60 */
+	case 122: /* 5 GHz band; 40 MHz; channels 100,108,116,124,132,140 */
+	case 126: /* 5 GHz band; 40 MHz; channels 149,157,165,173 */
+		chandef->center_freq1 = control_freq + 10;
+		chandef->width = NL80211_CHAN_WIDTH_40;
+		return true;
+	case 84:  /* 2 GHz band; 40 MHz; channels 5..13 */
+	case 117: /* 5 GHz band; 40 MHz; channels 40,48 */
+	case 120: /* 5 GHz band; 40 MHz; channels 56,64 */
+	case 123: /* 5 GHz band; 40 MHz; channels 104,112,120,128,136,144 */
+	case 127: /* 5 GHz band; 40 MHz; channels 153,161,169,177 */
+		chandef->center_freq1 = control_freq - 10;
+		chandef->width = NL80211_CHAN_WIDTH_40;
+		return true;
+	case 132: /* 6 GHz band; 40 MHz; channels 1,5,..,229*/
+		chandef->center_freq1 = control_freq + 10 - (offset & 1) * 20;
+		chandef->width = NL80211_CHAN_WIDTH_40;
+		return true;
+	case 128: /* 5 GHz band; 80 MHz; channels 36..64,100..144,149..177 */
+	case 133: /* 6 GHz band; 80 MHz; channels 1,5,..,229 */
+		chandef->center_freq1 = control_freq + 30 - (offset & 3) * 20;
+		chandef->width = NL80211_CHAN_WIDTH_80;
+		return true;
+	case 129: /* 5 GHz band; 160 MHz; channels 36..64,100..144,149..177 */
+	case 134: /* 6 GHz band; 160 MHz; channels 1,5,..,229 */
+		chandef->center_freq1 = control_freq + 70 - (offset & 7) * 20;
+		chandef->width = NL80211_CHAN_WIDTH_160;
+		return true;
+	case 130: /* 5 GHz band; 80+80 MHz; channels 36..64,100..144,149..177 */
+	case 135: /* 6 GHz band; 80+80 MHz; channels 1,5,..,229 */
+		  /* The center_freq2 of 80+80 MHz is unknown */
+	case 137: /* 6 GHz band; 320 MHz; channels 1,5,..,229 */
+		  /* 320-1 or 320-2 channelization is unknown */
+	default:
+		return false;
+	}
+}
+EXPORT_SYMBOL(ieee80211_operating_class_to_chandef);
+
 bool ieee80211_chandef_to_operating_class(struct cfg80211_chan_def *chandef,
 					  u8 *op_class)
 {
-- 
cgit v1.2.3


From 4ace04c0bdbde3b028ec0a5a3be2471cdb1efb67 Mon Sep 17 00:00:00 2001
From: Aditya Kumar Singh <quic_adisi@quicinc.com>
Date: Tue, 30 Jan 2024 19:39:14 +0530
Subject: wifi: cfg80211: send link id in channel_switch ops

Currently, during channel switch, no link id information is passed down.
In order to support channel switch during Multi Link Operation, it is
required to pass link id as well.

Add changes to pass link id in the channel_switch cfg80211_ops.

Signed-off-by: Aditya Kumar Singh <quic_adisi@quicinc.com>
Link: https://msgid.link/20240130140918.1172387-2-quic_adisi@quicinc.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 3 +++
 net/wireless/nl80211.c | 1 +
 net/wireless/trace.h   | 7 +++++--
 3 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index e27ed2307cdb..d4c83ea3213d 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1543,6 +1543,8 @@ struct cfg80211_ap_update {
  * @radar_required: whether radar detection is required on the new channel
  * @block_tx: whether transmissions should be blocked while changing
  * @count: number of beacons until switch
+ * @link_id: defines the link on which channel switch is expected during
+ *	MLO. 0 in case of non-MLO.
  */
 struct cfg80211_csa_settings {
 	struct cfg80211_chan_def chandef;
@@ -1555,6 +1557,7 @@ struct cfg80211_csa_settings {
 	bool radar_required;
 	bool block_tx;
 	u8 count;
+	u8 link_id;
 };
 
 /**
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index b533412ad1e0..e1106ae35e21 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -10222,6 +10222,7 @@ skip_beacons:
 	if (info->attrs[NL80211_ATTR_CH_SWITCH_BLOCK_TX])
 		params.block_tx = true;
 
+	params.link_id = link_id;
 	err = rdev_channel_switch(rdev, dev, &params);
 
 free:
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index ae5e585b6863..194ea2471717 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -2332,6 +2332,7 @@ TRACE_EVENT(rdev_channel_switch,
 		__field(u8, count)
 		__dynamic_array(u16, bcn_ofs, params->n_counter_offsets_beacon)
 		__dynamic_array(u16, pres_ofs, params->n_counter_offsets_presp)
+		__field(u8, link_id)
 	),
 	TP_fast_assign(
 		WIPHY_ASSIGN;
@@ -2349,11 +2350,13 @@ TRACE_EVENT(rdev_channel_switch,
 			memcpy(__get_dynamic_array(pres_ofs),
 			       params->counter_offsets_presp,
 			       params->n_counter_offsets_presp * sizeof(u16));
+		__entry->link_id = params->link_id;
 	),
 	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT
-		  ", block_tx: %d, count: %u, radar_required: %d",
+		  ", block_tx: %d, count: %u, radar_required: %d, link_id: %d",
 		  WIPHY_PR_ARG, NETDEV_PR_ARG, CHAN_DEF_PR_ARG,
-		  __entry->block_tx, __entry->count, __entry->radar_required)
+		  __entry->block_tx, __entry->count, __entry->radar_required,
+		  __entry->link_id)
 );
 
 TRACE_EVENT(rdev_set_qos_map,
-- 
cgit v1.2.3


From 480e7048aa0bbf0a79a976cdfa0195fd157da902 Mon Sep 17 00:00:00 2001
From: Aditya Kumar Singh <quic_adisi@quicinc.com>
Date: Tue, 30 Jan 2024 19:39:15 +0530
Subject: wifi: mac80211: update beacon counters per link basis

Currently, function to update beacon counter uses deflink to fetch
the beacon and then update the counter. However, with MLO, there is
a need to update the counter for the beacon in a particular link.

Add support to use link_id in order to fetch the beacon from a particular
link data during beacon update counter.

Signed-off-by: Aditya Kumar Singh <quic_adisi@quicinc.com>
Link: https://msgid.link/20240130140918.1172387-3-quic_adisi@quicinc.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/ath/ath10k/mac.c             |  2 +-
 drivers/net/wireless/ath/ath11k/mac.c             |  2 +-
 drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c |  2 +-
 include/net/mac80211.h                            |  4 +++-
 net/mac80211/tx.c                                 | 14 +++++++++++---
 5 files changed, 17 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c
index fc503db2fd8e..b80220406104 100644
--- a/drivers/net/wireless/ath/ath10k/mac.c
+++ b/drivers/net/wireless/ath/ath10k/mac.c
@@ -2035,7 +2035,7 @@ static void ath10k_mac_vif_ap_csa_count_down(struct ath10k_vif *arvif)
 		return;
 
 	if (!ieee80211_beacon_cntdwn_is_complete(vif)) {
-		ieee80211_beacon_update_cntdwn(vif);
+		ieee80211_beacon_update_cntdwn(vif, 0);
 
 		ret = ath10k_mac_setup_bcn_tmpl(arvif);
 		if (ret)
diff --git a/drivers/net/wireless/ath/ath11k/mac.c b/drivers/net/wireless/ath/ath11k/mac.c
index bbf4d1f4d310..f7cab50bdfd1 100644
--- a/drivers/net/wireless/ath/ath11k/mac.c
+++ b/drivers/net/wireless/ath/ath11k/mac.c
@@ -1586,7 +1586,7 @@ void ath11k_mac_bcn_tx_event(struct ath11k_vif *arvif)
 	arvif->bcca_zero_sent = false;
 
 	if (vif->bss_conf.color_change_active)
-		ieee80211_beacon_update_cntdwn(vif);
+		ieee80211_beacon_update_cntdwn(vif, 0);
 	ath11k_mac_setup_bcn_tmpl(arvif);
 }
 
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c
index 61269c7b1934..3dea06b28472 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c
@@ -1467,7 +1467,7 @@ static void iwl_mvm_csa_count_down(struct iwl_mvm *mvm,
 	mvmvif->csa_countdown = true;
 
 	if (!ieee80211_beacon_cntdwn_is_complete(csa_vif)) {
-		int c = ieee80211_beacon_update_cntdwn(csa_vif);
+		int c = ieee80211_beacon_update_cntdwn(csa_vif, 0);
 
 		iwl_mvm_mac_ctxt_beacon_changed(mvm, csa_vif,
 						&csa_vif->bss_conf);
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 54aa4a06c878..8acee7ce3aa9 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -5526,6 +5526,7 @@ static inline struct sk_buff *ieee80211_beacon_get(struct ieee80211_hw *hw,
 /**
  * ieee80211_beacon_update_cntdwn - request mac80211 to decrement the beacon countdown
  * @vif: &struct ieee80211_vif pointer from the add_interface callback.
+ * @link_id: valid link_id during MLO or 0 for non-MLO
  *
  * The beacon counter should be updated after each beacon transmission.
  * This function is called implicitly when
@@ -5535,7 +5536,8 @@ static inline struct sk_buff *ieee80211_beacon_get(struct ieee80211_hw *hw,
  *
  * Return: new countdown value
  */
-u8 ieee80211_beacon_update_cntdwn(struct ieee80211_vif *vif);
+u8 ieee80211_beacon_update_cntdwn(struct ieee80211_vif *vif,
+				  unsigned int link_id);
 
 /**
  * ieee80211_beacon_set_cntdwn - request mac80211 to set beacon countdown
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 098b32947c2b..876de3ba98ba 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -5027,16 +5027,24 @@ static u8 __ieee80211_beacon_update_cntdwn(struct beacon_data *beacon)
 	return beacon->cntdwn_current_counter;
 }
 
-u8 ieee80211_beacon_update_cntdwn(struct ieee80211_vif *vif)
+u8 ieee80211_beacon_update_cntdwn(struct ieee80211_vif *vif, unsigned int link_id)
 {
 	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+	struct ieee80211_link_data *link;
 	struct beacon_data *beacon = NULL;
 	u8 count = 0;
 
+	if (WARN_ON(link_id >= IEEE80211_MLD_MAX_NUM_LINKS))
+		return 0;
+
 	rcu_read_lock();
 
+	link = rcu_dereference(sdata->link[link_id]);
+	if (!link)
+		goto unlock;
+
 	if (sdata->vif.type == NL80211_IFTYPE_AP)
-		beacon = rcu_dereference(sdata->deflink.u.ap.beacon);
+		beacon = rcu_dereference(link->u.ap.beacon);
 	else if (sdata->vif.type == NL80211_IFTYPE_ADHOC)
 		beacon = rcu_dereference(sdata->u.ibss.presp);
 	else if (ieee80211_vif_is_mesh(&sdata->vif))
@@ -5277,7 +5285,7 @@ ieee80211_beacon_get_ap(struct ieee80211_hw *hw,
 
 	if (beacon->cntdwn_counter_offsets[0]) {
 		if (!is_template)
-			ieee80211_beacon_update_cntdwn(vif);
+			ieee80211_beacon_update_cntdwn(vif, link->link_id);
 
 		ieee80211_set_beacon_cntdwn(sdata, beacon, link);
 	}
-- 
cgit v1.2.3


From 04ada8599c35ecb2cf16c94eb118d227630d06ee Mon Sep 17 00:00:00 2001
From: Aditya Kumar Singh <quic_adisi@quicinc.com>
Date: Tue, 30 Jan 2024 19:39:18 +0530
Subject: wifi: mac80211: add support to call csa_finish on a link

Currently ieee80211_csa_finish() function finalizes CSA by scheduling a
finalizing worker using the deflink. With MLO, there is a need to do it
on a given link basis.

Pass link ID of the link on which CSA needs to be finalized.

Signed-off-by: Aditya Kumar Singh <quic_adisi@quicinc.com>
Link: https://msgid.link/20240130140918.1172387-6-quic_adisi@quicinc.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/ath/ath10k/mac.c                 |  2 +-
 drivers/net/wireless/ath/ath10k/wmi.c                 |  2 +-
 drivers/net/wireless/ath/ath11k/wmi.c                 |  2 +-
 drivers/net/wireless/ath/ath12k/wmi.c                 |  2 +-
 drivers/net/wireless/ath/ath9k/beacon.c               |  2 +-
 drivers/net/wireless/ath/ath9k/htc_drv_beacon.c       |  2 +-
 drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c     |  4 ++--
 drivers/net/wireless/intel/iwlwifi/mvm/time-event.c   |  2 +-
 drivers/net/wireless/mediatek/mt76/mac80211.c         |  2 +-
 drivers/net/wireless/mediatek/mt76/mt7615/mcu.c       |  2 +-
 drivers/net/wireless/mediatek/mt76/mt7915/mcu.c       |  2 +-
 drivers/net/wireless/mediatek/mt76/mt7996/mcu.c       |  2 +-
 drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c |  2 +-
 drivers/net/wireless/ti/wlcore/event.c                |  2 +-
 drivers/net/wireless/virtual/mac80211_hwsim.c         |  2 +-
 include/net/mac80211.h                                |  3 ++-
 net/mac80211/cfg.c                                    | 15 +++++++++++++--
 17 files changed, 31 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c
index b80220406104..41053219ee95 100644
--- a/drivers/net/wireless/ath/ath10k/mac.c
+++ b/drivers/net/wireless/ath/ath10k/mac.c
@@ -2047,7 +2047,7 @@ static void ath10k_mac_vif_ap_csa_count_down(struct ath10k_vif *arvif)
 			ath10k_warn(ar, "failed to update prb tmpl during csa: %d\n",
 				    ret);
 	} else {
-		ieee80211_csa_finish(vif);
+		ieee80211_csa_finish(vif, 0);
 	}
 }
 
diff --git a/drivers/net/wireless/ath/ath10k/wmi.c b/drivers/net/wireless/ath/ath10k/wmi.c
index 9e2f0a50aaea..ddf15717d504 100644
--- a/drivers/net/wireless/ath/ath10k/wmi.c
+++ b/drivers/net/wireless/ath/ath10k/wmi.c
@@ -3885,7 +3885,7 @@ void ath10k_wmi_event_host_swba(struct ath10k *ar, struct sk_buff *skb)
 		 */
 		if (arvif->vif->bss_conf.csa_active &&
 		    ieee80211_beacon_cntdwn_is_complete(arvif->vif)) {
-			ieee80211_csa_finish(arvif->vif);
+			ieee80211_csa_finish(arvif->vif, 0);
 			continue;
 		}
 
diff --git a/drivers/net/wireless/ath/ath11k/wmi.c b/drivers/net/wireless/ath/ath11k/wmi.c
index 688ee20528a0..3c9f3b0bcfaa 100644
--- a/drivers/net/wireless/ath/ath11k/wmi.c
+++ b/drivers/net/wireless/ath/ath11k/wmi.c
@@ -8226,7 +8226,7 @@ ath11k_wmi_process_csa_switch_count_event(struct ath11k_base *ab,
 		}
 
 		if (arvif->is_up && arvif->vif->bss_conf.csa_active)
-			ieee80211_csa_finish(arvif->vif);
+			ieee80211_csa_finish(arvif->vif, 0);
 	}
 	rcu_read_unlock();
 }
diff --git a/drivers/net/wireless/ath/ath12k/wmi.c b/drivers/net/wireless/ath/ath12k/wmi.c
index 2fa724e5851a..75c3a0d9b9ec 100644
--- a/drivers/net/wireless/ath/ath12k/wmi.c
+++ b/drivers/net/wireless/ath/ath12k/wmi.c
@@ -6448,7 +6448,7 @@ ath12k_wmi_process_csa_switch_count_event(struct ath12k_base *ab,
 		}
 
 		if (arvif->is_up && arvif->vif->bss_conf.csa_active)
-			ieee80211_csa_finish(arvif->vif);
+			ieee80211_csa_finish(arvif->vif, 0);
 	}
 	rcu_read_unlock();
 }
diff --git a/drivers/net/wireless/ath/ath9k/beacon.c b/drivers/net/wireless/ath/ath9k/beacon.c
index ee72faac2f1d..4e48407138b2 100644
--- a/drivers/net/wireless/ath/ath9k/beacon.c
+++ b/drivers/net/wireless/ath/ath9k/beacon.c
@@ -368,7 +368,7 @@ bool ath9k_csa_is_finished(struct ath_softc *sc, struct ieee80211_vif *vif)
 	if (!ieee80211_beacon_cntdwn_is_complete(vif))
 		return false;
 
-	ieee80211_csa_finish(vif);
+	ieee80211_csa_finish(vif, 0);
 	return true;
 }
 
diff --git a/drivers/net/wireless/ath/ath9k/htc_drv_beacon.c b/drivers/net/wireless/ath/ath9k/htc_drv_beacon.c
index 533471e69400..8179d35dc310 100644
--- a/drivers/net/wireless/ath/ath9k/htc_drv_beacon.c
+++ b/drivers/net/wireless/ath/ath9k/htc_drv_beacon.c
@@ -517,7 +517,7 @@ bool ath9k_htc_csa_is_finished(struct ath9k_htc_priv *priv)
 	if (!ieee80211_beacon_cntdwn_is_complete(vif))
 		return false;
 
-	ieee80211_csa_finish(vif);
+	ieee80211_csa_finish(vif, 0);
 
 	priv->csa_vif = NULL;
 	return true;
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c
index 3dea06b28472..7cb6bc97b26c 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c
@@ -1486,7 +1486,7 @@ static void iwl_mvm_csa_count_down(struct iwl_mvm *mvm,
 		}
 	} else if (!iwl_mvm_te_scheduled(&mvmvif->time_event_data)) {
 		/* we don't have CSA NoA scheduled yet, switch now */
-		ieee80211_csa_finish(csa_vif);
+		ieee80211_csa_finish(csa_vif, 0);
 		RCU_INIT_POINTER(mvm->csa_vif, NULL);
 	}
 }
@@ -1836,7 +1836,7 @@ void iwl_mvm_channel_switch_start_notif(struct iwl_mvm *mvm,
 				      msecs_to_jiffies(IWL_MVM_CS_UNBLOCK_TX_TIMEOUT *
 						       csa_vif->bss_conf.beacon_int));
 
-		ieee80211_csa_finish(csa_vif);
+		ieee80211_csa_finish(csa_vif, 0);
 
 		rcu_read_unlock();
 
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c b/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c
index 60ec5ca6927c..b7461bc32527 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c
@@ -168,7 +168,7 @@ static void iwl_mvm_csa_noa_start(struct iwl_mvm *mvm)
 		goto out_unlock;
 	}
 
-	ieee80211_csa_finish(csa_vif);
+	ieee80211_csa_finish(csa_vif, 0);
 
 	rcu_read_unlock();
 
diff --git a/drivers/net/wireless/mediatek/mt76/mac80211.c b/drivers/net/wireless/mediatek/mt76/mac80211.c
index 8a3a90d1bfac..8bf82755ca4c 100644
--- a/drivers/net/wireless/mediatek/mt76/mac80211.c
+++ b/drivers/net/wireless/mediatek/mt76/mac80211.c
@@ -1614,7 +1614,7 @@ static void
 __mt76_csa_finish(void *priv, u8 *mac, struct ieee80211_vif *vif)
 {
 	if (vif->bss_conf.csa_active && ieee80211_beacon_cntdwn_is_complete(vif))
-		ieee80211_csa_finish(vif);
+		ieee80211_csa_finish(vif, 0);
 }
 
 void mt76_csa_finish(struct mt76_dev *dev)
diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/mcu.c b/drivers/net/wireless/mediatek/mt76/mt7615/mcu.c
index ae34d019e588..c807bd8d928d 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7615/mcu.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7615/mcu.c
@@ -353,7 +353,7 @@ static void
 mt7615_mcu_csa_finish(void *priv, u8 *mac, struct ieee80211_vif *vif)
 {
 	if (vif->bss_conf.csa_active)
-		ieee80211_csa_finish(vif);
+		ieee80211_csa_finish(vif, 0);
 }
 
 static void
diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c b/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c
index df1ad6d4e12d..d90f98c50039 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c
@@ -228,7 +228,7 @@ mt7915_mcu_csa_finish(void *priv, u8 *mac, struct ieee80211_vif *vif)
 	if (!vif->bss_conf.csa_active || vif->type == NL80211_IFTYPE_STATION)
 		return;
 
-	ieee80211_csa_finish(vif);
+	ieee80211_csa_finish(vif, 0);
 }
 
 static void
diff --git a/drivers/net/wireless/mediatek/mt76/mt7996/mcu.c b/drivers/net/wireless/mediatek/mt76/mt7996/mcu.c
index 699be57309c2..3ec813077dc2 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7996/mcu.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7996/mcu.c
@@ -341,7 +341,7 @@ mt7996_mcu_csa_finish(void *priv, u8 *mac, struct ieee80211_vif *vif)
 	if (!vif->bss_conf.csa_active || vif->type == NL80211_IFTYPE_STATION)
 		return;
 
-	ieee80211_csa_finish(vif);
+	ieee80211_csa_finish(vif, 0);
 }
 
 static void
diff --git a/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c b/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c
index b0c1db726d7a..66bf92c164c3 100644
--- a/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c
+++ b/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c
@@ -5740,7 +5740,7 @@ static void rtl8xxxu_update_beacon_work_callback(struct work_struct *work)
 
 	if (vif->bss_conf.csa_active) {
 		if (ieee80211_beacon_cntdwn_is_complete(vif)) {
-			ieee80211_csa_finish(vif);
+			ieee80211_csa_finish(vif, 0);
 			return;
 		}
 		schedule_delayed_work(&priv->update_beacon_work,
diff --git a/drivers/net/wireless/ti/wlcore/event.c b/drivers/net/wireless/ti/wlcore/event.c
index 1e082d039b82..2499dc908305 100644
--- a/drivers/net/wireless/ti/wlcore/event.c
+++ b/drivers/net/wireless/ti/wlcore/event.c
@@ -233,7 +233,7 @@ void wlcore_event_channel_switch(struct wl1271 *wl,
 			cancel_delayed_work(&wlvif->channel_switch_work);
 		} else {
 			set_bit(WLVIF_FLAG_BEACON_DISABLED, &wlvif->flags);
-			ieee80211_csa_finish(vif);
+			ieee80211_csa_finish(vif, 0);
 		}
 	}
 }
diff --git a/drivers/net/wireless/virtual/mac80211_hwsim.c b/drivers/net/wireless/virtual/mac80211_hwsim.c
index 907c0842fee7..c337afd8bee2 100644
--- a/drivers/net/wireless/virtual/mac80211_hwsim.c
+++ b/drivers/net/wireless/virtual/mac80211_hwsim.c
@@ -2303,7 +2303,7 @@ static void mac80211_hwsim_beacon_tx(void *arg, u8 *mac,
 	}
 
 	if (link_conf->csa_active && ieee80211_beacon_cntdwn_is_complete(vif))
-		ieee80211_csa_finish(vif);
+		ieee80211_csa_finish(vif, link_id);
 }
 
 static enum hrtimer_restart
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 8acee7ce3aa9..45d905b17a65 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -5555,12 +5555,13 @@ void ieee80211_beacon_set_cntdwn(struct ieee80211_vif *vif, u8 counter);
 /**
  * ieee80211_csa_finish - notify mac80211 about channel switch
  * @vif: &struct ieee80211_vif pointer from the add_interface callback.
+ * @link_id: valid link_id during MLO or 0 for non-MLO
  *
  * After a channel switch announcement was scheduled and the counter in this
  * announcement hits 1, this function must be called by the driver to
  * notify mac80211 that the channel can be changed.
  */
-void ieee80211_csa_finish(struct ieee80211_vif *vif);
+void ieee80211_csa_finish(struct ieee80211_vif *vif, unsigned int link_id);
 
 /**
  * ieee80211_beacon_cntdwn_is_complete - find out if countdown reached 1
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index d30a64cf95cd..156a4215dcda 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -3547,13 +3547,24 @@ cfg80211_beacon_dup(struct cfg80211_beacon_data *beacon)
 	return new_beacon;
 }
 
-void ieee80211_csa_finish(struct ieee80211_vif *vif)
+void ieee80211_csa_finish(struct ieee80211_vif *vif, unsigned int link_id)
 {
 	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
 	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_link_data *link_data;
+
+	if (WARN_ON(link_id >= IEEE80211_MLD_MAX_NUM_LINKS))
+		return;
 
 	rcu_read_lock();
 
+	link_data = rcu_dereference(sdata->link[link_id]);
+	if (WARN_ON(!link_data)) {
+		rcu_read_unlock();
+		return;
+	}
+
+	/* TODO: MBSSID with MLO changes */
 	if (vif->mbssid_tx_vif == vif) {
 		/* Trigger ieee80211_csa_finish() on the non-transmitting
 		 * interfaces when channel switch is received on
@@ -3572,7 +3583,7 @@ void ieee80211_csa_finish(struct ieee80211_vif *vif)
 					 &iter->deflink.csa_finalize_work);
 		}
 	}
-	wiphy_work_queue(local->hw.wiphy, &sdata->deflink.csa_finalize_work);
+	wiphy_work_queue(local->hw.wiphy, &link_data->csa_finalize_work);
 
 	rcu_read_unlock();
 }
-- 
cgit v1.2.3


From d4655db0a1e11eeacc55c44c81121c83b087982e Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 9 Feb 2024 08:04:23 +0100
Subject: wifi: cfg80211: fix kernel-doc for cfg80211_chandef_primary

This was still referring to cfg80211_chandef_primary_freq(),
fix it.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Fixes: b82730bf57b5 ("wifi: cfg80211/mac80211: move puncturing into chandef")
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index d4c83ea3213d..f52f989a54ad 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1053,7 +1053,7 @@ cfg80211_chandef_dfs_cac_time(struct wiphy *wiphy,
 			      const struct cfg80211_chan_def *chandef);
 
 /**
- * cfg80211_chandef_primary_freq - calculate primary 40/80/160 MHz freq
+ * cfg80211_chandef_primary - calculate primary 40/80/160 MHz freq
  * @chandef: chandef to calculate for
  * @primary_chan_width: primary channel width to calculate center for
  * @punctured: punctured sub-channel bitmap, will be recalculated
-- 
cgit v1.2.3


From e3caf184107a4e2e196528b98b218ddc41e4cb8c Mon Sep 17 00:00:00 2001
From: Jinjian Song <jinjian.song@fibocom.com>
Date: Mon, 5 Feb 2024 18:22:27 +0800
Subject: wwan: core: Add WWAN fastboot port type

Add a new WWAN port that connects to the device fastboot protocol
interface.

Signed-off-by: Jinjian Song <jinjian.song@fibocom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/wwan/wwan_core.c | 4 ++++
 include/linux/wwan.h         | 2 ++
 2 files changed, 6 insertions(+)

(limited to 'include')

diff --git a/drivers/net/wwan/wwan_core.c b/drivers/net/wwan/wwan_core.c
index 72e01e550a16..2ed20b20e7fc 100644
--- a/drivers/net/wwan/wwan_core.c
+++ b/drivers/net/wwan/wwan_core.c
@@ -328,6 +328,10 @@ static const struct {
 		.name = "XMMRPC",
 		.devsuf = "xmmrpc",
 	},
+	[WWAN_PORT_FASTBOOT] = {
+		.name = "FASTBOOT",
+		.devsuf = "fastboot",
+	},
 };
 
 static ssize_t type_show(struct device *dev, struct device_attribute *attr,
diff --git a/include/linux/wwan.h b/include/linux/wwan.h
index 01fa15506286..170fdee6339c 100644
--- a/include/linux/wwan.h
+++ b/include/linux/wwan.h
@@ -16,6 +16,7 @@
  * @WWAN_PORT_QCDM: Qcom Modem diagnostic interface
  * @WWAN_PORT_FIREHOSE: XML based command protocol
  * @WWAN_PORT_XMMRPC: Control protocol for Intel XMM modems
+ * @WWAN_PORT_FASTBOOT: Fastboot protocol control
  *
  * @WWAN_PORT_MAX: Highest supported port types
  * @WWAN_PORT_UNKNOWN: Special value to indicate an unknown port type
@@ -28,6 +29,7 @@ enum wwan_port_type {
 	WWAN_PORT_QCDM,
 	WWAN_PORT_FIREHOSE,
 	WWAN_PORT_XMMRPC,
+	WWAN_PORT_FASTBOOT,
 
 	/* Add new port types above this line */
 
-- 
cgit v1.2.3


From 471e8fd3afcef5a9f9089f0bd21965ad9ba35c91 Mon Sep 17 00:00:00 2001
From: Christian Marangi <ansuelsmth@gmail.com>
Date: Tue, 6 Feb 2024 18:31:06 +0100
Subject: net: phy: add devm/of_phy_package_join helper

Add devm/of_phy_package_join helper to join PHYs in a PHY package. These
are variant of the manual phy_package_join with the difference that
these will use DT nodes to derive the base_addr instead of manually
passing an hardcoded value.

An additional value is added in phy_package_shared, "np" to reference
the PHY package node pointer in specific PHY driver probe_once and
config_init_once functions to make use of additional specific properties
defined in the PHY package node in DT.

The np value is filled only with of_phy_package_join if a valid PHY
package node is found. A valid PHY package node must have the node name
set to "ethernet-phy-package".

Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy_device.c | 96 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/phy.h          |  6 +++
 2 files changed, 102 insertions(+)

(limited to 'include')

diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 2e7d5bfb338e..71d48152e8d5 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -1712,6 +1712,7 @@ int phy_package_join(struct phy_device *phydev, int base_addr, size_t priv_size)
 			shared->priv_size = priv_size;
 		}
 		shared->base_addr = base_addr;
+		shared->np = NULL;
 		refcount_set(&shared->refcnt, 1);
 		bus->shared[base_addr] = shared;
 	} else {
@@ -1734,6 +1735,63 @@ err_unlock:
 }
 EXPORT_SYMBOL_GPL(phy_package_join);
 
+/**
+ * of_phy_package_join - join a common PHY group in PHY package
+ * @phydev: target phy_device struct
+ * @priv_size: if non-zero allocate this amount of bytes for private data
+ *
+ * This is a variant of phy_package_join for PHY package defined in DT.
+ *
+ * The parent node of the @phydev is checked as a valid PHY package node
+ * structure (by matching the node name "ethernet-phy-package") and the
+ * base_addr for the PHY package is passed to phy_package_join.
+ *
+ * With this configuration the shared struct will also have the np value
+ * filled to use additional DT defined properties in PHY specific
+ * probe_once and config_init_once PHY package OPs.
+ *
+ * Returns < 0 on error, 0 on success. Esp. calling phy_package_join()
+ * with the same cookie but a different priv_size is an error. Or a parent
+ * node is not detected or is not valid or doesn't match the expected node
+ * name for PHY package.
+ */
+int of_phy_package_join(struct phy_device *phydev, size_t priv_size)
+{
+	struct device_node *node = phydev->mdio.dev.of_node;
+	struct device_node *package_node;
+	u32 base_addr;
+	int ret;
+
+	if (!node)
+		return -EINVAL;
+
+	package_node = of_get_parent(node);
+	if (!package_node)
+		return -EINVAL;
+
+	if (!of_node_name_eq(package_node, "ethernet-phy-package")) {
+		ret = -EINVAL;
+		goto exit;
+	}
+
+	if (of_property_read_u32(package_node, "reg", &base_addr)) {
+		ret = -EINVAL;
+		goto exit;
+	}
+
+	ret = phy_package_join(phydev, base_addr, priv_size);
+	if (ret)
+		goto exit;
+
+	phydev->shared->np = package_node;
+
+	return 0;
+exit:
+	of_node_put(package_node);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(of_phy_package_join);
+
 /**
  * phy_package_leave - leave a common PHY group
  * @phydev: target phy_device struct
@@ -1750,6 +1808,10 @@ void phy_package_leave(struct phy_device *phydev)
 	if (!shared)
 		return;
 
+	/* Decrease the node refcount on leave if present */
+	if (shared->np)
+		of_node_put(shared->np);
+
 	if (refcount_dec_and_mutex_lock(&shared->refcnt, &bus->shared_lock)) {
 		bus->shared[shared->base_addr] = NULL;
 		mutex_unlock(&bus->shared_lock);
@@ -1802,6 +1864,40 @@ int devm_phy_package_join(struct device *dev, struct phy_device *phydev,
 }
 EXPORT_SYMBOL_GPL(devm_phy_package_join);
 
+/**
+ * devm_of_phy_package_join - resource managed of_phy_package_join()
+ * @dev: device that is registering this PHY package
+ * @phydev: target phy_device struct
+ * @priv_size: if non-zero allocate this amount of bytes for private data
+ *
+ * Managed of_phy_package_join(). Shared storage fetched by this function,
+ * phy_package_leave() is automatically called on driver detach. See
+ * of_phy_package_join() for more information.
+ */
+int devm_of_phy_package_join(struct device *dev, struct phy_device *phydev,
+			     size_t priv_size)
+{
+	struct phy_device **ptr;
+	int ret;
+
+	ptr = devres_alloc(devm_phy_package_leave, sizeof(*ptr),
+			   GFP_KERNEL);
+	if (!ptr)
+		return -ENOMEM;
+
+	ret = of_phy_package_join(phydev, priv_size);
+
+	if (!ret) {
+		*ptr = phydev;
+		devres_add(dev, ptr);
+	} else {
+		devres_free(ptr);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(devm_of_phy_package_join);
+
 /**
  * phy_detach - detach a PHY device from its network device
  * @phydev: target phy_device struct
diff --git a/include/linux/phy.h b/include/linux/phy.h
index fd8dbea9b4d9..cbd49418b819 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -329,6 +329,7 @@ struct mdio_bus_stats {
  * struct phy_package_shared - Shared information in PHY packages
  * @base_addr: Base PHY address of PHY package used to combine PHYs
  *   in one package and for offset calculation of phy_package_read/write
+ * @np: Pointer to the Device Node if PHY package defined in DT
  * @refcnt: Number of PHYs connected to this shared data
  * @flags: Initialization of PHY package
  * @priv_size: Size of the shared private data @priv
@@ -340,6 +341,8 @@ struct mdio_bus_stats {
  */
 struct phy_package_shared {
 	u8 base_addr;
+	/* With PHY package defined in DT this points to the PHY package node */
+	struct device_node *np;
 	refcount_t refcnt;
 	unsigned long flags;
 	size_t priv_size;
@@ -2000,9 +2003,12 @@ int phy_ethtool_set_link_ksettings(struct net_device *ndev,
 				   const struct ethtool_link_ksettings *cmd);
 int phy_ethtool_nway_reset(struct net_device *ndev);
 int phy_package_join(struct phy_device *phydev, int base_addr, size_t priv_size);
+int of_phy_package_join(struct phy_device *phydev, size_t priv_size);
 void phy_package_leave(struct phy_device *phydev);
 int devm_phy_package_join(struct device *dev, struct phy_device *phydev,
 			  int base_addr, size_t priv_size);
+int devm_of_phy_package_join(struct device *dev, struct phy_device *phydev,
+			     size_t priv_size);
 
 int __init mdio_bus_init(void);
 void mdio_bus_exit(void);
-- 
cgit v1.2.3


From 9b1d5e055508393561e26bd1720f4c2639b03b1a Mon Sep 17 00:00:00 2001
From: Christian Marangi <ansuelsmth@gmail.com>
Date: Tue, 6 Feb 2024 18:31:09 +0100
Subject: net: phy: provide whether link has changed in c37_read_status

Some PHY driver might require additional regs call after
genphy_c37_read_status() is called.

Expand genphy_c37_read_status to provide a bool wheather the link has
changed or not to permit PHY driver to skip additional regs call if
nothing has changed.

Every user of genphy_c37_read_status() is updated with the new
additional bool.

Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/broadcom.c    |  3 ++-
 drivers/net/phy/phy_device.c  | 11 +++++++++--
 drivers/net/phy/qcom/at803x.c |  3 ++-
 include/linux/phy.h           |  2 +-
 4 files changed, 14 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c
index 312a8bb35d78..370e4ed45098 100644
--- a/drivers/net/phy/broadcom.c
+++ b/drivers/net/phy/broadcom.c
@@ -665,10 +665,11 @@ static int bcm54616s_config_aneg(struct phy_device *phydev)
 static int bcm54616s_read_status(struct phy_device *phydev)
 {
 	struct bcm54616s_phy_priv *priv = phydev->priv;
+	bool changed;
 	int err;
 
 	if (priv->mode_1000bx_en)
-		err = genphy_c37_read_status(phydev);
+		err = genphy_c37_read_status(phydev, &changed);
 	else
 		err = genphy_read_status(phydev);
 
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 71d48152e8d5..9f37c0bfbf8d 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -2621,12 +2621,15 @@ EXPORT_SYMBOL(genphy_read_status);
 /**
  * genphy_c37_read_status - check the link status and update current link state
  * @phydev: target phy_device struct
+ * @changed: pointer where to store if link changed
  *
  * Description: Check the link, then figure out the current state
  *   by comparing what we advertise with what the link partner
  *   advertises. This function is for Clause 37 1000Base-X mode.
+ *
+ *   If link has changed, @changed is set to true, false otherwise.
  */
-int genphy_c37_read_status(struct phy_device *phydev)
+int genphy_c37_read_status(struct phy_device *phydev, bool *changed)
 {
 	int lpa, err, old_link = phydev->link;
 
@@ -2636,9 +2639,13 @@ int genphy_c37_read_status(struct phy_device *phydev)
 		return err;
 
 	/* why bother the PHY if nothing can have changed */
-	if (phydev->autoneg == AUTONEG_ENABLE && old_link && phydev->link)
+	if (phydev->autoneg == AUTONEG_ENABLE && old_link && phydev->link) {
+		*changed = false;
 		return 0;
+	}
 
+	/* Signal link has changed */
+	*changed = true;
 	phydev->duplex = DUPLEX_UNKNOWN;
 	phydev->pause = 0;
 	phydev->asym_pause = 0;
diff --git a/drivers/net/phy/qcom/at803x.c b/drivers/net/phy/qcom/at803x.c
index 3e3ee4c1d4bc..4717c59d51d0 100644
--- a/drivers/net/phy/qcom/at803x.c
+++ b/drivers/net/phy/qcom/at803x.c
@@ -912,9 +912,10 @@ static int at8031_config_intr(struct phy_device *phydev)
 static int at8031_read_status(struct phy_device *phydev)
 {
 	struct at803x_priv *priv = phydev->priv;
+	bool changed;
 
 	if (priv->is_1000basex)
-		return genphy_c37_read_status(phydev);
+		return genphy_c37_read_status(phydev, &changed);
 
 	return at803x_read_status(phydev);
 }
diff --git a/include/linux/phy.h b/include/linux/phy.h
index cbd49418b819..2249cdb5957a 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1876,7 +1876,7 @@ int genphy_write_mmd_unsupported(struct phy_device *phdev, int devnum,
 
 /* Clause 37 */
 int genphy_c37_config_aneg(struct phy_device *phydev);
-int genphy_c37_read_status(struct phy_device *phydev);
+int genphy_c37_read_status(struct phy_device *phydev, bool *changed);
 
 /* Clause 45 PHY */
 int genphy_c45_restart_aneg(struct phy_device *phydev);
-- 
cgit v1.2.3


From 68bc61c26cacf152baf905786b5949769700f40d Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Wed, 7 Feb 2024 13:26:17 +0100
Subject: bpf: Allow compiler to inline most of bpf_local_storage_lookup()

In various performance profiles of kernels with BPF programs attached,
bpf_local_storage_lookup() appears as a significant portion of CPU
cycles spent. To enable the compiler generate more optimal code, turn
bpf_local_storage_lookup() into a static inline function, where only the
cache insertion code path is outlined

Notably, outlining cache insertion helps avoid bloating callers by
duplicating setting up calls to raw_spin_{lock,unlock}_irqsave() (on
architectures which do not inline spin_lock/unlock, such as x86), which
would cause the compiler produce worse code by deciding to outline
otherwise inlinable functions. The call overhead is neutral, because we
make 2 calls either way: either calling raw_spin_lock_irqsave() and
raw_spin_unlock_irqsave(); or call __bpf_local_storage_insert_cache(),
which calls raw_spin_lock_irqsave(), followed by a tail-call to
raw_spin_unlock_irqsave() where the compiler can perform TCO and (in
optimized uninstrumented builds) turns it into a plain jump. The call to
__bpf_local_storage_insert_cache() can be elided entirely if
cacheit_lockit is a false constant expression.

Based on results from './benchs/run_bench_local_storage.sh' (21 trials,
reboot between each trial; x86 defconfig + BPF, clang 16) this produces
improvements in throughput and latency in the majority of cases, with an
average (geomean) improvement of 8%:

+---- Hashmap Control --------------------
|
| + num keys: 10
| :                                         <before>             | <after>
| +-+ hashmap (control) sequential get    +----------------------+----------------------
|   +- hits throughput                    | 14.789 M ops/s       | 14.745 M ops/s (  ~  )
|   +- hits latency                       | 67.679 ns/op         | 67.879 ns/op   (  ~  )
|   +- important_hits throughput          | 14.789 M ops/s       | 14.745 M ops/s (  ~  )
|
| + num keys: 1000
| :                                         <before>             | <after>
| +-+ hashmap (control) sequential get    +----------------------+----------------------
|   +- hits throughput                    | 12.233 M ops/s       | 12.170 M ops/s (  ~  )
|   +- hits latency                       | 81.754 ns/op         | 82.185 ns/op   (  ~  )
|   +- important_hits throughput          | 12.233 M ops/s       | 12.170 M ops/s (  ~  )
|
| + num keys: 10000
| :                                         <before>             | <after>
| +-+ hashmap (control) sequential get    +----------------------+----------------------
|   +- hits throughput                    | 7.220 M ops/s        | 7.204 M ops/s  (  ~  )
|   +- hits latency                       | 138.522 ns/op        | 138.842 ns/op  (  ~  )
|   +- important_hits throughput          | 7.220 M ops/s        | 7.204 M ops/s  (  ~  )
|
| + num keys: 100000
| :                                         <before>             | <after>
| +-+ hashmap (control) sequential get    +----------------------+----------------------
|   +- hits throughput                    | 5.061 M ops/s        | 5.165 M ops/s  (+2.1%)
|   +- hits latency                       | 198.483 ns/op        | 194.270 ns/op  (-2.1%)
|   +- important_hits throughput          | 5.061 M ops/s        | 5.165 M ops/s  (+2.1%)
|
| + num keys: 4194304
| :                                         <before>             | <after>
| +-+ hashmap (control) sequential get    +----------------------+----------------------
|   +- hits throughput                    | 2.864 M ops/s        | 2.882 M ops/s  (  ~  )
|   +- hits latency                       | 365.220 ns/op        | 361.418 ns/op  (-1.0%)
|   +- important_hits throughput          | 2.864 M ops/s        | 2.882 M ops/s  (  ~  )
|
+---- Local Storage ----------------------
|
| + num_maps: 1
| :                                         <before>             | <after>
| +-+ local_storage cache sequential get  +----------------------+----------------------
|   +- hits throughput                    | 33.005 M ops/s       | 39.068 M ops/s (+18.4%)
|   +- hits latency                       | 30.300 ns/op         | 25.598 ns/op   (-15.5%)
|   +- important_hits throughput          | 33.005 M ops/s       | 39.068 M ops/s (+18.4%)
| :
| :                                         <before>             | <after>
| +-+ local_storage cache interleaved get +----------------------+----------------------
|   +- hits throughput                    | 37.151 M ops/s       | 44.926 M ops/s (+20.9%)
|   +- hits latency                       | 26.919 ns/op         | 22.259 ns/op   (-17.3%)
|   +- important_hits throughput          | 37.151 M ops/s       | 44.926 M ops/s (+20.9%)
|
| + num_maps: 10
| :                                         <before>             | <after>
| +-+ local_storage cache sequential get  +----------------------+----------------------
|   +- hits throughput                    | 32.288 M ops/s       | 38.099 M ops/s (+18.0%)
|   +- hits latency                       | 30.972 ns/op         | 26.248 ns/op   (-15.3%)
|   +- important_hits throughput          | 3.229 M ops/s        | 3.810 M ops/s  (+18.0%)
| :
| :                                         <before>             | <after>
| +-+ local_storage cache interleaved get +----------------------+----------------------
|   +- hits throughput                    | 34.473 M ops/s       | 41.145 M ops/s (+19.4%)
|   +- hits latency                       | 29.010 ns/op         | 24.307 ns/op   (-16.2%)
|   +- important_hits throughput          | 12.312 M ops/s       | 14.695 M ops/s (+19.4%)
|
| + num_maps: 16
| :                                         <before>             | <after>
| +-+ local_storage cache sequential get  +----------------------+----------------------
|   +- hits throughput                    | 32.524 M ops/s       | 38.341 M ops/s (+17.9%)
|   +- hits latency                       | 30.748 ns/op         | 26.083 ns/op   (-15.2%)
|   +- important_hits throughput          | 2.033 M ops/s        | 2.396 M ops/s  (+17.9%)
| :
| :                                         <before>             | <after>
| +-+ local_storage cache interleaved get +----------------------+----------------------
|   +- hits throughput                    | 34.575 M ops/s       | 41.338 M ops/s (+19.6%)
|   +- hits latency                       | 28.925 ns/op         | 24.193 ns/op   (-16.4%)
|   +- important_hits throughput          | 11.001 M ops/s       | 13.153 M ops/s (+19.6%)
|
| + num_maps: 17
| :                                         <before>             | <after>
| +-+ local_storage cache sequential get  +----------------------+----------------------
|   +- hits throughput                    | 28.861 M ops/s       | 32.756 M ops/s (+13.5%)
|   +- hits latency                       | 34.649 ns/op         | 30.530 ns/op   (-11.9%)
|   +- important_hits throughput          | 1.700 M ops/s        | 1.929 M ops/s  (+13.5%)
| :
| :                                         <before>             | <after>
| +-+ local_storage cache interleaved get +----------------------+----------------------
|   +- hits throughput                    | 31.529 M ops/s       | 36.110 M ops/s (+14.5%)
|   +- hits latency                       | 31.719 ns/op         | 27.697 ns/op   (-12.7%)
|   +- important_hits throughput          | 9.598 M ops/s        | 10.993 M ops/s (+14.5%)
|
| + num_maps: 24
| :                                         <before>             | <after>
| +-+ local_storage cache sequential get  +----------------------+----------------------
|   +- hits throughput                    | 18.602 M ops/s       | 19.937 M ops/s (+7.2%)
|   +- hits latency                       | 53.767 ns/op         | 50.166 ns/op   (-6.7%)
|   +- important_hits throughput          | 0.776 M ops/s        | 0.831 M ops/s  (+7.2%)
| :
| :                                         <before>             | <after>
| +-+ local_storage cache interleaved get +----------------------+----------------------
|   +- hits throughput                    | 21.718 M ops/s       | 23.332 M ops/s (+7.4%)
|   +- hits latency                       | 46.047 ns/op         | 42.865 ns/op   (-6.9%)
|   +- important_hits throughput          | 6.110 M ops/s        | 6.564 M ops/s  (+7.4%)
|
| + num_maps: 32
| :                                         <before>             | <after>
| +-+ local_storage cache sequential get  +----------------------+----------------------
|   +- hits throughput                    | 14.118 M ops/s       | 14.626 M ops/s (+3.6%)
|   +- hits latency                       | 70.856 ns/op         | 68.381 ns/op   (-3.5%)
|   +- important_hits throughput          | 0.442 M ops/s        | 0.458 M ops/s  (+3.6%)
| :
| :                                         <before>             | <after>
| +-+ local_storage cache interleaved get +----------------------+----------------------
|   +- hits throughput                    | 17.111 M ops/s       | 17.906 M ops/s (+4.6%)
|   +- hits latency                       | 58.451 ns/op         | 55.865 ns/op   (-4.4%)
|   +- important_hits throughput          | 4.776 M ops/s        | 4.998 M ops/s  (+4.6%)
|
| + num_maps: 100
| :                                         <before>             | <after>
| +-+ local_storage cache sequential get  +----------------------+----------------------
|   +- hits throughput                    | 5.281 M ops/s        | 5.528 M ops/s  (+4.7%)
|   +- hits latency                       | 192.398 ns/op        | 183.059 ns/op  (-4.9%)
|   +- important_hits throughput          | 0.053 M ops/s        | 0.055 M ops/s  (+4.9%)
| :
| :                                         <before>             | <after>
| +-+ local_storage cache interleaved get +----------------------+----------------------
|   +- hits throughput                    | 6.265 M ops/s        | 6.498 M ops/s  (+3.7%)
|   +- hits latency                       | 161.436 ns/op        | 152.877 ns/op  (-5.3%)
|   +- important_hits throughput          | 1.636 M ops/s        | 1.697 M ops/s  (+3.7%)
|
| + num_maps: 1000
| :                                         <before>             | <after>
| +-+ local_storage cache sequential get  +----------------------+----------------------
|   +- hits throughput                    | 0.355 M ops/s        | 0.354 M ops/s  (  ~  )
|   +- hits latency                       | 2826.538 ns/op       | 2827.139 ns/op (  ~  )
|   +- important_hits throughput          | 0.000 M ops/s        | 0.000 M ops/s  (  ~  )
| :
| :                                         <before>             | <after>
| +-+ local_storage cache interleaved get +----------------------+----------------------
|   +- hits throughput                    | 0.404 M ops/s        | 0.403 M ops/s  (  ~  )
|   +- hits latency                       | 2481.190 ns/op       | 2487.555 ns/op (  ~  )
|   +- important_hits throughput          | 0.102 M ops/s        | 0.101 M ops/s  (  ~  )

The on_lookup test in {cgrp,task}_ls_recursion.c is removed
because the bpf_local_storage_lookup is no longer traceable
and adding tracepoint will make the compiler generate worse
code: https://lore.kernel.org/bpf/ZcJmok64Xqv6l4ZS@elver.google.com/

Signed-off-by: Marco Elver <elver@google.com>
Cc: Martin KaFai Lau <martin.lau@linux.dev>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20240207122626.3508658-1-elver@google.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/linux/bpf_local_storage.h                  | 30 ++++++++++++-
 kernel/bpf/bpf_local_storage.c                     | 52 ++++++----------------
 .../selftests/bpf/prog_tests/task_local_storage.c  |  6 ---
 .../selftests/bpf/progs/cgrp_ls_recursion.c        | 26 -----------
 .../selftests/bpf/progs/task_ls_recursion.c        | 17 -------
 5 files changed, 41 insertions(+), 90 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index 173ec7f43ed1..dcddb0aef7d8 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -129,10 +129,36 @@ bpf_local_storage_map_alloc(union bpf_attr *attr,
 			    struct bpf_local_storage_cache *cache,
 			    bool bpf_ma);
 
-struct bpf_local_storage_data *
+void __bpf_local_storage_insert_cache(struct bpf_local_storage *local_storage,
+				      struct bpf_local_storage_map *smap,
+				      struct bpf_local_storage_elem *selem);
+/* If cacheit_lockit is false, this lookup function is lockless */
+static inline struct bpf_local_storage_data *
 bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
 			 struct bpf_local_storage_map *smap,
-			 bool cacheit_lockit);
+			 bool cacheit_lockit)
+{
+	struct bpf_local_storage_data *sdata;
+	struct bpf_local_storage_elem *selem;
+
+	/* Fast path (cache hit) */
+	sdata = rcu_dereference_check(local_storage->cache[smap->cache_idx],
+				      bpf_rcu_lock_held());
+	if (sdata && rcu_access_pointer(sdata->smap) == smap)
+		return sdata;
+
+	/* Slow path (cache miss) */
+	hlist_for_each_entry_rcu(selem, &local_storage->list, snode,
+				  rcu_read_lock_trace_held())
+		if (rcu_access_pointer(SDATA(selem)->smap) == smap)
+			break;
+
+	if (!selem)
+		return NULL;
+	if (cacheit_lockit)
+		__bpf_local_storage_insert_cache(local_storage, smap, selem);
+	return SDATA(selem);
+}
 
 void bpf_local_storage_destroy(struct bpf_local_storage *local_storage);
 
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 146824cc9689..bdea1a459153 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -414,47 +414,21 @@ void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now)
 	bpf_selem_unlink_storage(selem, reuse_now);
 }
 
-/* If cacheit_lockit is false, this lookup function is lockless */
-struct bpf_local_storage_data *
-bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
-			 struct bpf_local_storage_map *smap,
-			 bool cacheit_lockit)
+void __bpf_local_storage_insert_cache(struct bpf_local_storage *local_storage,
+				      struct bpf_local_storage_map *smap,
+				      struct bpf_local_storage_elem *selem)
 {
-	struct bpf_local_storage_data *sdata;
-	struct bpf_local_storage_elem *selem;
-
-	/* Fast path (cache hit) */
-	sdata = rcu_dereference_check(local_storage->cache[smap->cache_idx],
-				      bpf_rcu_lock_held());
-	if (sdata && rcu_access_pointer(sdata->smap) == smap)
-		return sdata;
-
-	/* Slow path (cache miss) */
-	hlist_for_each_entry_rcu(selem, &local_storage->list, snode,
-				  rcu_read_lock_trace_held())
-		if (rcu_access_pointer(SDATA(selem)->smap) == smap)
-			break;
-
-	if (!selem)
-		return NULL;
-
-	sdata = SDATA(selem);
-	if (cacheit_lockit) {
-		unsigned long flags;
-
-		/* spinlock is needed to avoid racing with the
-		 * parallel delete.  Otherwise, publishing an already
-		 * deleted sdata to the cache will become a use-after-free
-		 * problem in the next bpf_local_storage_lookup().
-		 */
-		raw_spin_lock_irqsave(&local_storage->lock, flags);
-		if (selem_linked_to_storage(selem))
-			rcu_assign_pointer(local_storage->cache[smap->cache_idx],
-					   sdata);
-		raw_spin_unlock_irqrestore(&local_storage->lock, flags);
-	}
+	unsigned long flags;
 
-	return sdata;
+	/* spinlock is needed to avoid racing with the
+	 * parallel delete.  Otherwise, publishing an already
+	 * deleted sdata to the cache will become a use-after-free
+	 * problem in the next bpf_local_storage_lookup().
+	 */
+	raw_spin_lock_irqsave(&local_storage->lock, flags);
+	if (selem_linked_to_storage(selem))
+		rcu_assign_pointer(local_storage->cache[smap->cache_idx], SDATA(selem));
+	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
 }
 
 static int check_flags(const struct bpf_local_storage_data *old_sdata,
diff --git a/tools/testing/selftests/bpf/prog_tests/task_local_storage.c b/tools/testing/selftests/bpf/prog_tests/task_local_storage.c
index ea8537c54413..c33c05161a9e 100644
--- a/tools/testing/selftests/bpf/prog_tests/task_local_storage.c
+++ b/tools/testing/selftests/bpf/prog_tests/task_local_storage.c
@@ -117,12 +117,6 @@ static void test_recursion(void)
 	ASSERT_OK(err, "lookup map_b");
 	ASSERT_EQ(value, 100, "map_b value");
 
-	prog_fd = bpf_program__fd(skel->progs.on_lookup);
-	memset(&info, 0, sizeof(info));
-	err = bpf_prog_get_info_by_fd(prog_fd, &info, &info_len);
-	ASSERT_OK(err, "get prog info");
-	ASSERT_GT(info.recursion_misses, 0, "on_lookup prog recursion");
-
 	prog_fd = bpf_program__fd(skel->progs.on_update);
 	memset(&info, 0, sizeof(info));
 	err = bpf_prog_get_info_by_fd(prog_fd, &info, &info_len);
diff --git a/tools/testing/selftests/bpf/progs/cgrp_ls_recursion.c b/tools/testing/selftests/bpf/progs/cgrp_ls_recursion.c
index 610c2427fd93..3500e4b69ebe 100644
--- a/tools/testing/selftests/bpf/progs/cgrp_ls_recursion.c
+++ b/tools/testing/selftests/bpf/progs/cgrp_ls_recursion.c
@@ -27,32 +27,6 @@ bool is_cgroup1 = 0;
 struct cgroup *bpf_task_get_cgroup1(struct task_struct *task, int hierarchy_id) __ksym;
 void bpf_cgroup_release(struct cgroup *cgrp) __ksym;
 
-static void __on_lookup(struct cgroup *cgrp)
-{
-	bpf_cgrp_storage_delete(&map_a, cgrp);
-	bpf_cgrp_storage_delete(&map_b, cgrp);
-}
-
-SEC("fentry/bpf_local_storage_lookup")
-int BPF_PROG(on_lookup)
-{
-	struct task_struct *task = bpf_get_current_task_btf();
-	struct cgroup *cgrp;
-
-	if (is_cgroup1) {
-		cgrp = bpf_task_get_cgroup1(task, target_hid);
-		if (!cgrp)
-			return 0;
-
-		__on_lookup(cgrp);
-		bpf_cgroup_release(cgrp);
-		return 0;
-	}
-
-	__on_lookup(task->cgroups->dfl_cgrp);
-	return 0;
-}
-
 static void __on_update(struct cgroup *cgrp)
 {
 	long *ptr;
diff --git a/tools/testing/selftests/bpf/progs/task_ls_recursion.c b/tools/testing/selftests/bpf/progs/task_ls_recursion.c
index 4542dc683b44..f1853c38aada 100644
--- a/tools/testing/selftests/bpf/progs/task_ls_recursion.c
+++ b/tools/testing/selftests/bpf/progs/task_ls_recursion.c
@@ -27,23 +27,6 @@ struct {
 	__type(value, long);
 } map_b SEC(".maps");
 
-SEC("fentry/bpf_local_storage_lookup")
-int BPF_PROG(on_lookup)
-{
-	struct task_struct *task = bpf_get_current_task_btf();
-
-	if (!test_pid || task->pid != test_pid)
-		return 0;
-
-	/* The bpf_task_storage_delete will call
-	 * bpf_local_storage_lookup.  The prog->active will
-	 * stop the recursion.
-	 */
-	bpf_task_storage_delete(&map_a, task);
-	bpf_task_storage_delete(&map_b, task);
-	return 0;
-}
-
 SEC("fentry/bpf_local_storage_update")
 int BPF_PROG(on_update)
 {
-- 
cgit v1.2.3


From 129e406e1811538c1afc9c8e97d61bb18eed3363 Mon Sep 17 00:00:00 2001
From: Kui-Feng Lee <thinker.li@gmail.com>
Date: Thu, 8 Feb 2024 14:06:49 -0800
Subject: net/ipv6: set expires in rt6_add_dflt_router().

Pass the duration of a lifetime (in seconds) to the function
rt6_add_dflt_router() so that it can properly set the expiration time.

The function ndisc_router_discovery() is the only one that calls
rt6_add_dflt_router(), and it will later set the expiration time for the
route created by rt6_add_dflt_router(). However, there is a gap of time
between calling rt6_add_dflt_router() and setting the expiration time in
ndisc_router_discovery(). During this period, there is a possibility that a
new route may be removed from the routing table. By setting the correct
expiration time in rt6_add_dflt_router(), we can prevent this from
happening. The reason for setting RTF_EXPIRES in rt6_add_dflt_router() is
to start the Garbage Collection (GC) timer, as it only activates when a
route with RTF_EXPIRES is added to a table.

Suggested-by: David Ahern <dsahern@kernel.org>
Reviewed-by: Hangbin Liu <liuhangbin@gmail.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: Kui-Feng Lee <thinker.li@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_route.h | 3 ++-
 net/ipv6/ndisc.c        | 3 ++-
 net/ipv6/route.c        | 4 +++-
 3 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 28b065790261..52a51c69aa9d 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -170,7 +170,8 @@ struct fib6_info *rt6_get_dflt_router(struct net *net,
 struct fib6_info *rt6_add_dflt_router(struct net *net,
 				     const struct in6_addr *gwaddr,
 				     struct net_device *dev, unsigned int pref,
-				     u32 defrtr_usr_metric);
+				     u32 defrtr_usr_metric,
+				     int lifetime);
 
 void rt6_purge_dflt_routers(struct net *net);
 
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index a19999b30bc0..a68462668158 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1382,7 +1382,8 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb)
 			neigh_release(neigh);
 
 		rt = rt6_add_dflt_router(net, &ipv6_hdr(skb)->saddr,
-					 skb->dev, pref, defrtr_usr_metric);
+					 skb->dev, pref, defrtr_usr_metric,
+					 lifetime);
 		if (!rt) {
 			ND_PRINTK(0, err,
 				  "RA: %s failed to add default route\n",
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 63b4c6056582..98abba8f15cd 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -4355,7 +4355,8 @@ struct fib6_info *rt6_add_dflt_router(struct net *net,
 				     const struct in6_addr *gwaddr,
 				     struct net_device *dev,
 				     unsigned int pref,
-				     u32 defrtr_usr_metric)
+				     u32 defrtr_usr_metric,
+				     int lifetime)
 {
 	struct fib6_config cfg = {
 		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
@@ -4368,6 +4369,7 @@ struct fib6_info *rt6_add_dflt_router(struct net *net,
 		.fc_nlinfo.portid = 0,
 		.fc_nlinfo.nlh = NULL,
 		.fc_nlinfo.nl_net = net,
+		.fc_expires = jiffies_to_clock_t(lifetime * HZ),
 	};
 
 	cfg.fc_gateway = *gwaddr;
-- 
cgit v1.2.3


From 5eb902b8e7193cdcb33242af0a56502e6b5206e9 Mon Sep 17 00:00:00 2001
From: Kui-Feng Lee <thinker.li@gmail.com>
Date: Thu, 8 Feb 2024 14:06:51 -0800
Subject: net/ipv6: Remove expired routes with a separated list of routes.

FIB6 GC walks trees of fib6_tables to remove expired routes. Walking a tree
can be expensive if the number of routes in a table is big, even if most of
them are permanent. Checking routes in a separated list of routes having
expiration will avoid this potential issue.

Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: Kui-Feng Lee <thinker.li@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h | 46 ++++++++++++++++++++++++++++++++++++++-
 net/ipv6/addrconf.c   | 41 +++++++++++++++++++++++++++++------
 net/ipv6/ip6_fib.c    | 60 ++++++++++++++++++++++++++++++++++++++++++++++-----
 net/ipv6/ndisc.c      | 10 ++++++++-
 net/ipv6/route.c      | 13 +++++++++--
 5 files changed, 154 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 360b12e61850..323c94f1845b 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -173,6 +173,9 @@ struct fib6_info {
 
 	refcount_t			fib6_ref;
 	unsigned long			expires;
+
+	struct hlist_node		gc_link;
+
 	struct dst_metrics		*fib6_metrics;
 #define fib6_pmtu		fib6_metrics->metrics[RTAX_MTU-1]
 
@@ -241,12 +244,18 @@ static inline bool fib6_requires_src(const struct fib6_info *rt)
 	return rt->fib6_src.plen > 0;
 }
 
+/* The callers should hold f6i->fib6_table->tb6_lock if a route has ever
+ * been added to a table before.
+ */
 static inline void fib6_clean_expires(struct fib6_info *f6i)
 {
 	f6i->fib6_flags &= ~RTF_EXPIRES;
 	f6i->expires = 0;
 }
 
+/* The callers should hold f6i->fib6_table->tb6_lock if a route has ever
+ * been added to a table before.
+ */
 static inline void fib6_set_expires(struct fib6_info *f6i,
 				    unsigned long expires)
 {
@@ -327,8 +336,10 @@ static inline bool fib6_info_hold_safe(struct fib6_info *f6i)
 
 static inline void fib6_info_release(struct fib6_info *f6i)
 {
-	if (f6i && refcount_dec_and_test(&f6i->fib6_ref))
+	if (f6i && refcount_dec_and_test(&f6i->fib6_ref)) {
+		DEBUG_NET_WARN_ON_ONCE(!hlist_unhashed(&f6i->gc_link));
 		call_rcu(&f6i->rcu, fib6_info_destroy_rcu);
+	}
 }
 
 enum fib6_walk_state {
@@ -382,6 +393,7 @@ struct fib6_table {
 	struct inet_peer_base	tb6_peers;
 	unsigned int		flags;
 	unsigned int		fib_seq;
+	struct hlist_head       tb6_gc_hlist;	/* GC candidates */
 #define RT6_TABLE_HAS_DFLT_ROUTER	BIT(0)
 };
 
@@ -498,6 +510,38 @@ void fib6_gc_cleanup(void);
 
 int fib6_init(void);
 
+/* Add the route to the gc list if it is not already there
+ *
+ * The callers should hold f6i->fib6_table->tb6_lock.
+ */
+static inline void fib6_add_gc_list(struct fib6_info *f6i)
+{
+	/* If fib6_node is null, the f6i is not in (or removed from) the
+	 * table.
+	 *
+	 * There is a gap between finding the f6i from the table and
+	 * calling this function without the protection of the tb6_lock.
+	 * This check makes sure the f6i is not added to the gc list when
+	 * it is not on the table.
+	 */
+	if (!rcu_dereference_protected(f6i->fib6_node,
+				       lockdep_is_held(&f6i->fib6_table->tb6_lock)))
+		return;
+
+	if (hlist_unhashed(&f6i->gc_link))
+		hlist_add_head(&f6i->gc_link, &f6i->fib6_table->tb6_gc_hlist);
+}
+
+/* Remove the route from the gc list if it is on the list.
+ *
+ * The callers should hold f6i->fib6_table->tb6_lock.
+ */
+static inline void fib6_remove_gc_list(struct fib6_info *f6i)
+{
+	if (!hlist_unhashed(&f6i->gc_link))
+		hlist_del_init(&f6i->gc_link);
+}
+
 struct ipv6_route_iter {
 	struct seq_net_private p;
 	struct fib6_walker w;
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index d63f5d063f07..0ea44563454f 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1255,6 +1255,7 @@ static void
 cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires,
 		     bool del_rt, bool del_peer)
 {
+	struct fib6_table *table;
 	struct fib6_info *f6i;
 
 	f6i = addrconf_get_prefix_route(del_peer ? &ifp->peer_addr : &ifp->addr,
@@ -1264,8 +1265,15 @@ cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires,
 		if (del_rt)
 			ip6_del_rt(dev_net(ifp->idev->dev), f6i, false);
 		else {
-			if (!(f6i->fib6_flags & RTF_EXPIRES))
+			if (!(f6i->fib6_flags & RTF_EXPIRES)) {
+				table = f6i->fib6_table;
+				spin_lock_bh(&table->tb6_lock);
+
 				fib6_set_expires(f6i, expires);
+				fib6_add_gc_list(f6i);
+
+				spin_unlock_bh(&table->tb6_lock);
+			}
 			fib6_info_release(f6i);
 		}
 	}
@@ -2706,6 +2714,7 @@ EXPORT_SYMBOL_GPL(addrconf_prefix_rcv_add_addr);
 void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao)
 {
 	struct prefix_info *pinfo;
+	struct fib6_table *table;
 	__u32 valid_lft;
 	__u32 prefered_lft;
 	int addr_type, err;
@@ -2782,11 +2791,20 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao)
 			if (valid_lft == 0) {
 				ip6_del_rt(net, rt, false);
 				rt = NULL;
-			} else if (addrconf_finite_timeout(rt_expires)) {
-				/* not infinity */
-				fib6_set_expires(rt, jiffies + rt_expires);
 			} else {
-				fib6_clean_expires(rt);
+				table = rt->fib6_table;
+				spin_lock_bh(&table->tb6_lock);
+
+				if (addrconf_finite_timeout(rt_expires)) {
+					/* not infinity */
+					fib6_set_expires(rt, jiffies + rt_expires);
+					fib6_add_gc_list(rt);
+				} else {
+					fib6_clean_expires(rt);
+					fib6_remove_gc_list(rt);
+				}
+
+				spin_unlock_bh(&table->tb6_lock);
 			}
 		} else if (valid_lft) {
 			clock_t expires = 0;
@@ -4741,6 +4759,7 @@ static int modify_prefix_route(struct inet6_ifaddr *ifp,
 			       unsigned long expires, u32 flags,
 			       bool modify_peer)
 {
+	struct fib6_table *table;
 	struct fib6_info *f6i;
 	u32 prio;
 
@@ -4761,10 +4780,18 @@ static int modify_prefix_route(struct inet6_ifaddr *ifp,
 				      ifp->rt_priority, ifp->idev->dev,
 				      expires, flags, GFP_KERNEL);
 	} else {
-		if (!expires)
+		table = f6i->fib6_table;
+		spin_lock_bh(&table->tb6_lock);
+
+		if (!expires) {
 			fib6_clean_expires(f6i);
-		else
+			fib6_remove_gc_list(f6i);
+		} else {
 			fib6_set_expires(f6i, expires);
+			fib6_add_gc_list(f6i);
+		}
+
+		spin_unlock_bh(&table->tb6_lock);
 
 		fib6_info_release(f6i);
 	}
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 38a0348b1d17..805bbf26b3ef 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -160,6 +160,8 @@ struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh)
 	INIT_LIST_HEAD(&f6i->fib6_siblings);
 	refcount_set(&f6i->fib6_ref, 1);
 
+	INIT_HLIST_NODE(&f6i->gc_link);
+
 	return f6i;
 }
 
@@ -246,6 +248,7 @@ static struct fib6_table *fib6_alloc_table(struct net *net, u32 id)
 				   net->ipv6.fib6_null_entry);
 		table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
 		inet_peer_base_init(&table->tb6_peers);
+		INIT_HLIST_HEAD(&table->tb6_gc_hlist);
 	}
 
 	return table;
@@ -1055,6 +1058,9 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn,
 				    lockdep_is_held(&table->tb6_lock));
 		}
 	}
+
+	fib6_clean_expires(rt);
+	fib6_remove_gc_list(rt);
 }
 
 /*
@@ -1115,10 +1121,13 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
 					rt->fib6_nsiblings = 0;
 				if (!(iter->fib6_flags & RTF_EXPIRES))
 					return -EEXIST;
-				if (!(rt->fib6_flags & RTF_EXPIRES))
+				if (!(rt->fib6_flags & RTF_EXPIRES)) {
 					fib6_clean_expires(iter);
-				else
+					fib6_remove_gc_list(iter);
+				} else {
 					fib6_set_expires(iter, rt->expires);
+					fib6_add_gc_list(iter);
+				}
 
 				if (rt->fib6_pmtu)
 					fib6_metric_set(iter, RTAX_MTU,
@@ -1477,6 +1486,10 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt,
 		if (rt->nh)
 			list_add(&rt->nh_list, &rt->nh->f6i_list);
 		__fib6_update_sernum_upto_root(rt, fib6_new_sernum(info->nl_net));
+
+		if (rt->fib6_flags & RTF_EXPIRES)
+			fib6_add_gc_list(rt);
+
 		fib6_start_gc(info->nl_net, rt);
 	}
 
@@ -2280,9 +2293,8 @@ static void fib6_flush_trees(struct net *net)
  *	Garbage collection
  */
 
-static int fib6_age(struct fib6_info *rt, void *arg)
+static int fib6_age(struct fib6_info *rt, struct fib6_gc_args *gc_args)
 {
-	struct fib6_gc_args *gc_args = arg;
 	unsigned long now = jiffies;
 
 	/*
@@ -2307,6 +2319,42 @@ static int fib6_age(struct fib6_info *rt, void *arg)
 	return 0;
 }
 
+static void fib6_gc_table(struct net *net,
+			  struct fib6_table *tb6,
+			  struct fib6_gc_args *gc_args)
+{
+	struct fib6_info *rt;
+	struct hlist_node *n;
+	struct nl_info info = {
+		.nl_net = net,
+		.skip_notify = false,
+	};
+
+	hlist_for_each_entry_safe(rt, n, &tb6->tb6_gc_hlist, gc_link)
+		if (fib6_age(rt, gc_args) == -1)
+			fib6_del(rt, &info);
+}
+
+static void fib6_gc_all(struct net *net, struct fib6_gc_args *gc_args)
+{
+	struct fib6_table *table;
+	struct hlist_head *head;
+	unsigned int h;
+
+	rcu_read_lock();
+	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
+		head = &net->ipv6.fib_table_hash[h];
+		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
+			spin_lock_bh(&table->tb6_lock);
+
+			fib6_gc_table(net, table, gc_args);
+
+			spin_unlock_bh(&table->tb6_lock);
+		}
+	}
+	rcu_read_unlock();
+}
+
 void fib6_run_gc(unsigned long expires, struct net *net, bool force)
 {
 	struct fib6_gc_args gc_args;
@@ -2322,7 +2370,7 @@ void fib6_run_gc(unsigned long expires, struct net *net, bool force)
 			  net->ipv6.sysctl.ip6_rt_gc_interval;
 	gc_args.more = 0;
 
-	fib6_clean_all(net, fib6_age, &gc_args);
+	fib6_gc_all(net, &gc_args);
 	now = jiffies;
 	net->ipv6.ip6_rt_last_gc = now;
 
@@ -2382,6 +2430,7 @@ static int __net_init fib6_net_init(struct net *net)
 	net->ipv6.fib6_main_tbl->tb6_root.fn_flags =
 		RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
 	inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers);
+	INIT_HLIST_HEAD(&net->ipv6.fib6_main_tbl->tb6_gc_hlist);
 
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
 	net->ipv6.fib6_local_tbl = kzalloc(sizeof(*net->ipv6.fib6_local_tbl),
@@ -2394,6 +2443,7 @@ static int __net_init fib6_net_init(struct net *net)
 	net->ipv6.fib6_local_tbl->tb6_root.fn_flags =
 		RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
 	inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers);
+	INIT_HLIST_HEAD(&net->ipv6.fib6_local_tbl->tb6_gc_hlist);
 #endif
 	fib6_tables_init(net);
 
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index a68462668158..73cb31afe935 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1237,6 +1237,7 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb)
 	struct ndisc_options ndopts;
 	struct fib6_info *rt = NULL;
 	struct inet6_dev *in6_dev;
+	struct fib6_table *table;
 	u32 defrtr_usr_metric;
 	unsigned int pref = 0;
 	__u32 old_if_flags;
@@ -1410,8 +1411,15 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb)
 		inet6_rt_notify(RTM_NEWROUTE, rt, &nlinfo, NLM_F_REPLACE);
 	}
 
-	if (rt)
+	if (rt) {
+		table = rt->fib6_table;
+		spin_lock_bh(&table->tb6_lock);
+
 		fib6_set_expires(rt, jiffies + (HZ * lifetime));
+		fib6_add_gc_list(rt);
+
+		spin_unlock_bh(&table->tb6_lock);
+	}
 	if (in6_dev->cnf.accept_ra_min_hop_limit < 256 &&
 	    ra_msg->icmph.icmp6_hop_limit) {
 		if (in6_dev->cnf.accept_ra_min_hop_limit <= ra_msg->icmph.icmp6_hop_limit) {
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index dd6ff5b20918..707d65bc9c0e 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -931,6 +931,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
 	struct net *net = dev_net(dev);
 	struct route_info *rinfo = (struct route_info *) opt;
 	struct in6_addr prefix_buf, *prefix;
+	struct fib6_table *table;
 	unsigned int pref;
 	unsigned long lifetime;
 	struct fib6_info *rt;
@@ -989,10 +990,18 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
 				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
 
 	if (rt) {
-		if (!addrconf_finite_timeout(lifetime))
+		table = rt->fib6_table;
+		spin_lock_bh(&table->tb6_lock);
+
+		if (!addrconf_finite_timeout(lifetime)) {
 			fib6_clean_expires(rt);
-		else
+			fib6_remove_gc_list(rt);
+		} else {
 			fib6_set_expires(rt, jiffies + HZ * lifetime);
+			fib6_add_gc_list(rt);
+		}
+
+		spin_unlock_bh(&table->tb6_lock);
 
 		fib6_info_release(rt);
 	}
-- 
cgit v1.2.3


From e1aa35e16399d600215470411dfb56e1d6f8e017 Mon Sep 17 00:00:00 2001
From: Oliver Hartkopp <socketcan@hartkopp.net>
Date: Fri, 8 Dec 2023 17:57:29 +0100
Subject: can: isotp: support dynamic flow control parameters

The ISO15765-2 standard supports to take the PDUs communication parameters
blocksize (BS) and Separation Time minimum (STmin) either from the first
received flow control (FC) "static" or from every received FC "dynamic".

Add a new CAN_ISOTP_DYN_FC_PARMS flag to support dynamic FC parameters.

Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
Link: https://lore.kernel.org/all/20231208165729.3011-1-socketcan@hartkopp.net
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 include/uapi/linux/can/isotp.h | 1 +
 net/can/isotp.c                | 5 +++--
 2 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/can/isotp.h b/include/uapi/linux/can/isotp.h
index 439c982f7e81..6cde62371b6f 100644
--- a/include/uapi/linux/can/isotp.h
+++ b/include/uapi/linux/can/isotp.h
@@ -137,6 +137,7 @@ struct can_isotp_ll_options {
 #define CAN_ISOTP_WAIT_TX_DONE	0x0400	/* wait for tx completion */
 #define CAN_ISOTP_SF_BROADCAST	0x0800	/* 1-to-N functional addressing */
 #define CAN_ISOTP_CF_BROADCAST	0x1000	/* 1-to-N transmission w/o FC */
+#define CAN_ISOTP_DYN_FC_PARMS	0x2000	/* dynamic FC parameters BS/STmin */
 
 /* protocol machine default values */
 
diff --git a/net/can/isotp.c b/net/can/isotp.c
index d1c6f206f429..25bac0fafc83 100644
--- a/net/can/isotp.c
+++ b/net/can/isotp.c
@@ -381,8 +381,9 @@ static int isotp_rcv_fc(struct isotp_sock *so, struct canfd_frame *cf, int ae)
 		return 1;
 	}
 
-	/* get communication parameters only from the first FC frame */
-	if (so->tx.state == ISOTP_WAIT_FIRST_FC) {
+	/* get static/dynamic communication params from first/every FC frame */
+	if (so->tx.state == ISOTP_WAIT_FIRST_FC ||
+	    so->opt.flags & CAN_ISOTP_DYN_FC_PARMS) {
 		so->txfc.bs = cf->data[ae + 1];
 		so->txfc.stmin = cf->data[ae + 2];
 
-- 
cgit v1.2.3


From 6f656131f6988709e3bf828c3ad992032b717c2e Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Thu, 8 Feb 2024 12:01:43 +0100
Subject: wifi: mac80211: remove gfp parameter from
 ieee80211_obss_color_collision_notify

Get rid of gfp parameter from ieee80211_obss_color_collision_notify
since it is no longer used.

Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Reviewed-by: Jeff Johnson <quic_jjohnson@quicinc.com>
Acked-by: Jeff Johnson <quic_jjohnson@quicinc.com>
Link: https://msgid.link/f91e1c78896408ac556586ba8c99e4e389aeba02.1707389901.git.lorenzo@kernel.org
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/ath/ath11k/wmi.c | 3 +--
 include/net/mac80211.h                | 3 +--
 net/mac80211/cfg.c                    | 2 +-
 net/mac80211/rx.c                     | 3 +--
 4 files changed, 4 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/ath/ath11k/wmi.c b/drivers/net/wireless/ath/ath11k/wmi.c
index 3c9f3b0bcfaa..1943e636faef 100644
--- a/drivers/net/wireless/ath/ath11k/wmi.c
+++ b/drivers/net/wireless/ath/ath11k/wmi.c
@@ -4020,8 +4020,7 @@ ath11k_wmi_obss_color_collision_event(struct ath11k_base *ab, struct sk_buff *sk
 
 	switch (ev->evt_type) {
 	case WMI_BSS_COLOR_COLLISION_DETECTION:
-		ieee80211_obss_color_collision_notify(arvif->vif, ev->obss_color_bitmap,
-						      GFP_KERNEL);
+		ieee80211_obss_color_collision_notify(arvif->vif, ev->obss_color_bitmap);
 		ath11k_dbg(ab, ATH11K_DBG_WMI,
 			   "OBSS color collision detected vdev:%d, event:%d, bitmap:%08llx\n",
 			   ev->vdev_id, ev->evt_type, ev->obss_color_bitmap);
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 45d905b17a65..fc223761e3af 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -7490,11 +7490,10 @@ ieee80211_get_unsol_bcast_probe_resp_tmpl(struct ieee80211_hw *hw,
  * @vif: &struct ieee80211_vif pointer from the add_interface callback.
  * @color_bitmap: a 64 bit bitmap representing the colors that the local BSS is
  *	aware of.
- * @gfp: allocation flags
  */
 void
 ieee80211_obss_color_collision_notify(struct ieee80211_vif *vif,
-				      u64 color_bitmap, gfp_t gfp);
+				      u64 color_bitmap);
 
 /**
  * ieee80211_is_tx_data - check if frame is a data frame
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 156a4215dcda..6f7c96c358f4 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -4765,7 +4765,7 @@ EXPORT_SYMBOL_GPL(ieee80211_color_change_finish);
 
 void
 ieee80211_obss_color_collision_notify(struct ieee80211_vif *vif,
-				       u64 color_bitmap, gfp_t gfp)
+				      u64 color_bitmap)
 {
 	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
 	struct ieee80211_link_data *link = &sdata->deflink;
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 9902ea69af0a..c1f850138405 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -3386,8 +3386,7 @@ ieee80211_rx_check_bss_color_collision(struct ieee80211_rx_data *rx)
 				      IEEE80211_HE_OPERATION_BSS_COLOR_MASK);
 		if (color == bss_conf->he_bss_color.color)
 			ieee80211_obss_color_collision_notify(&rx->sdata->vif,
-							      BIT_ULL(color),
-							      GFP_ATOMIC);
+							      BIT_ULL(color));
 	}
 }
 
-- 
cgit v1.2.3


From f6ca96aa51a4ae1b3a416fbe85acdf1197c405a6 Mon Sep 17 00:00:00 2001
From: Aditya Kumar Singh <quic_adisi@quicinc.com>
Date: Mon, 5 Feb 2024 21:59:50 +0530
Subject: wifi: cfg80211: add support for link id attribute in
 NL80211_CMD_DEL_STATION

Currently whenever NL80211_CMD_DEL_STATION command is called without any
MAC address, all stations present on that interface are flushed.
However with MLO there is a need to flush such stations only which are
using at least a particular link from the AP MLD interface.

For example - 2 GHz and 5 GHz are part of an AP MLD.
To this interface, following stations are connected -
   1. One non-EHT STA on 2 GHz link.
   2. One non-EHT STA on 5 GHz link.
   3. One Multi-Link STA having 2 GHz and 5 GHz as active links.

Now if currently, NL80211_CMD_DEL_STATION is issued by the 2 GHz link
without any MAC address, it would flush all station entries. However,
flushing of station entry #2 at least is not desireable since it
is connected to 5 GHz link alone.

Hence, add an option to pass link ID as well in the command so that if link
ID is passed, stations using that passed link ID alone would be flushed
and others will not.

So after this, station entries #1 and #3 alone would be flushed and #2 will
remain as it is.

Signed-off-by: Aditya Kumar Singh <quic_adisi@quicinc.com>
Link: https://msgid.link/20240205162952.1697646-2-quic_adisi@quicinc.com
[clarify documentation]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       |  4 ++++
 include/uapi/linux/nl80211.h |  3 ++-
 net/wireless/nl80211.c       | 19 ++++++++++++++++++-
 net/wireless/trace.h         |  7 +++++--
 4 files changed, 29 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index f52f989a54ad..62894b024e88 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1780,11 +1780,15 @@ struct station_parameters {
  * @subtype: Management frame subtype to use for indicating removal
  *	(10 = Disassociation, 12 = Deauthentication)
  * @reason_code: Reason code for the Disassociation/Deauthentication frame
+ * @link_id: Link ID indicating a link that stations to be flushed must be
+ *	using; valid only for MLO, but can also be -1 for MLO to really
+ *	remove all stations.
  */
 struct station_del_parameters {
 	const u8 *mac;
 	u8 subtype;
 	u16 reason_code;
+	int link_id;
 };
 
 /**
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 853ac538a686..805bfe712971 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -438,7 +438,8 @@
  *	%NL80211_ATTR_REASON_CODE can optionally be used to specify which type
  *	of disconnection indication should be sent to the station
  *	(Deauthentication or Disassociation frame and reason code for that
- *	frame).
+ *	frame). %NL80211_ATTR_MLO_LINK_ID can be used optionally to remove
+ *	stations connected and using at least that link as one of its links.
  *
  * @NL80211_CMD_GET_MPATH: Get mesh path attributes for mesh path to
  * 	destination %NL80211_ATTR_MAC on the interface identified by
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index e1106ae35e21..5ca545753810 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -7624,14 +7624,16 @@ static int nl80211_del_station(struct sk_buff *skb, struct genl_info *info)
 {
 	struct cfg80211_registered_device *rdev = info->user_ptr[0];
 	struct net_device *dev = info->user_ptr[1];
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	struct station_del_parameters params;
+	int link_id = nl80211_link_id_or_invalid(info->attrs);
 
 	memset(&params, 0, sizeof(params));
 
 	if (info->attrs[NL80211_ATTR_MAC])
 		params.mac = nla_data(info->attrs[NL80211_ATTR_MAC]);
 
-	switch (dev->ieee80211_ptr->iftype) {
+	switch (wdev->iftype) {
 	case NL80211_IFTYPE_AP:
 	case NL80211_IFTYPE_AP_VLAN:
 	case NL80211_IFTYPE_MESH_POINT:
@@ -7672,6 +7674,17 @@ static int nl80211_del_station(struct sk_buff *skb, struct genl_info *info)
 		params.reason_code = WLAN_REASON_PREV_AUTH_NOT_VALID;
 	}
 
+	/* Link ID not expected in case of non-ML operation */
+	if (!wdev->valid_links && link_id != -1)
+		return -EINVAL;
+
+	/* If given, a valid link ID should be passed during MLO */
+	if (wdev->valid_links && link_id >= 0 &&
+	    !(wdev->valid_links & BIT(link_id)))
+		return -EINVAL;
+
+	params.link_id = link_id;
+
 	return rdev_del_station(rdev, dev, &params);
 }
 
@@ -16773,6 +16786,10 @@ static const struct genl_small_ops nl80211_small_ops[] = {
 		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
 		.doit = nl80211_del_station,
 		.flags = GENL_UNS_ADMIN_PERM,
+		/* cannot use NL80211_FLAG_MLO_VALID_LINK_ID, depends on
+		 * whether MAC address is passed or not. If MAC address is
+		 * passed, then even during MLO, link ID is not required.
+		 */
 		.internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
 	},
 	{
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index 194ea2471717..361331c29116 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -867,6 +867,7 @@ DECLARE_EVENT_CLASS(station_del,
 		MAC_ENTRY(sta_mac)
 		__field(u8, subtype)
 		__field(u16, reason_code)
+		__field(int, link_id)
 	),
 	TP_fast_assign(
 		WIPHY_ASSIGN;
@@ -874,11 +875,13 @@ DECLARE_EVENT_CLASS(station_del,
 		MAC_ASSIGN(sta_mac, params->mac);
 		__entry->subtype = params->subtype;
 		__entry->reason_code = params->reason_code;
+		__entry->link_id = params->link_id;
 	),
 	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", station mac: %pM"
-		  ", subtype: %u, reason_code: %u",
+		  ", subtype: %u, reason_code: %u, link_id: %d",
 		  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->sta_mac,
-		  __entry->subtype, __entry->reason_code)
+		  __entry->subtype, __entry->reason_code,
+		  __entry->link_id)
 );
 
 DEFINE_EVENT(station_del, rdev_del_station,
-- 
cgit v1.2.3


From 7b5e25b8baebc02db728bfbdc3080be863144c7b Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 6 Feb 2024 16:54:07 +0200
Subject: wifi: cfg80211: rename UHB to 6 GHz

UHB stands for "Ultra High Band", but this term doesn't really
exist in the spec. Rename all occurrences to "6 GHz", but keep
a few defines for userspace API compatibility.

Link: https://msgid.link/20240206164849.c9cfb9400839.I153db3b951934a1d84409c17fbe1f1d1782543fa@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       | 10 +++++-----
 include/uapi/linux/nl80211.h | 31 ++++++++++++++++++++-----------
 net/wireless/nl80211.c       |  8 ++++----
 net/wireless/reg.c           | 10 +++++-----
 net/wireless/scan.c          |  8 ++++----
 5 files changed, 38 insertions(+), 29 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 62894b024e88..7bb8484e859e 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -118,9 +118,9 @@ struct wiphy;
  *	restrictions.
  * @IEEE80211_CHAN_NO_EHT: EHT operation is not permitted on this channel.
  * @IEEE80211_CHAN_DFS_CONCURRENT: See %NL80211_RRF_DFS_CONCURRENT
- * @IEEE80211_CHAN_NO_UHB_VLP_CLIENT: Client connection with VLP AP
+ * @IEEE80211_CHAN_NO_6GHZ_VLP_CLIENT: Client connection with VLP AP
  *	not permitted using this channel
- * @IEEE80211_CHAN_NO_UHB_AFC_CLIENT: Client connection with AFC AP
+ * @IEEE80211_CHAN_NO_6GHZ_AFC_CLIENT: Client connection with AFC AP
  *	not permitted using this channel
  */
 enum ieee80211_channel_flags {
@@ -146,8 +146,8 @@ enum ieee80211_channel_flags {
 	IEEE80211_CHAN_NO_320MHZ	= 1<<19,
 	IEEE80211_CHAN_NO_EHT		= 1<<20,
 	IEEE80211_CHAN_DFS_CONCURRENT	= 1<<21,
-	IEEE80211_CHAN_NO_UHB_VLP_CLIENT= 1<<22,
-	IEEE80211_CHAN_NO_UHB_AFC_CLIENT= 1<<23,
+	IEEE80211_CHAN_NO_6GHZ_VLP_CLIENT = 1<<22,
+	IEEE80211_CHAN_NO_6GHZ_AFC_CLIENT = 1<<23,
 };
 
 #define IEEE80211_CHAN_NO_HT40 \
@@ -4935,7 +4935,7 @@ struct cfg80211_ops {
  * enum wiphy_flags - wiphy capability flags
  *
  * @WIPHY_FLAG_SPLIT_SCAN_6GHZ: if set to true, the scan request will be split
- *	 into two, first for legacy bands and second for UHB.
+ *	 into two, first for legacy bands and second for 6 GHz.
  * @WIPHY_FLAG_NETNS_OK: if not set, do not allow changing the netns of this
  *	wiphy at all
  * @WIPHY_FLAG_PS_ON_BY_DEFAULT: if set to true, powersave will be enabled
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 805bfe712971..13fa10804909 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -11,7 +11,7 @@
  * Copyright 2008 Jouni Malinen <jouni.malinen@atheros.com>
  * Copyright 2008 Colin McCabe <colin@cozybit.com>
  * Copyright 2015-2017	Intel Deutschland GmbH
- * Copyright (C) 2018-2023 Intel Corporation
+ * Copyright (C) 2018-2024 Intel Corporation
  *
  * Permission to use, copy, modify, and/or distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -4269,9 +4269,9 @@ enum nl80211_wmm_rule {
  *	allowed for peer-to-peer or adhoc communication under the control
  *	of a DFS master which operates on the same channel (FCC-594280 D01
  *	Section B.3). Should be used together with %NL80211_RRF_DFS only.
- * @NL80211_FREQUENCY_ATTR_NO_UHB_VLP_CLIENT: Client connection to VLP AP
+ * @NL80211_FREQUENCY_ATTR_NO_6GHZ_VLP_CLIENT: Client connection to VLP AP
  *	not allowed using this channel
- * @NL80211_FREQUENCY_ATTR_NO_UHB_AFC_CLIENT: Client connection to AFC AP
+ * @NL80211_FREQUENCY_ATTR_NO_6GHZ_AFC_CLIENT: Client connection to AFC AP
  *	not allowed using this channel
  * @NL80211_FREQUENCY_ATTR_MAX: highest frequency attribute number
  *	currently defined
@@ -4313,8 +4313,8 @@ enum nl80211_frequency_attr {
 	NL80211_FREQUENCY_ATTR_NO_EHT,
 	NL80211_FREQUENCY_ATTR_PSD,
 	NL80211_FREQUENCY_ATTR_DFS_CONCURRENT,
-	NL80211_FREQUENCY_ATTR_NO_UHB_VLP_CLIENT,
-	NL80211_FREQUENCY_ATTR_NO_UHB_AFC_CLIENT,
+	NL80211_FREQUENCY_ATTR_NO_6GHZ_VLP_CLIENT,
+	NL80211_FREQUENCY_ATTR_NO_6GHZ_AFC_CLIENT,
 
 	/* keep last */
 	__NL80211_FREQUENCY_ATTR_AFTER_LAST,
@@ -4327,6 +4327,10 @@ enum nl80211_frequency_attr {
 #define NL80211_FREQUENCY_ATTR_NO_IR		NL80211_FREQUENCY_ATTR_NO_IR
 #define NL80211_FREQUENCY_ATTR_GO_CONCURRENT \
 					NL80211_FREQUENCY_ATTR_IR_CONCURRENT
+#define NL80211_FREQUENCY_ATTR_NO_UHB_VLP_CLIENT \
+	NL80211_FREQUENCY_ATTR_NO_6GHZ_VLP_CLIENT
+#define NL80211_FREQUENCY_ATTR_NO_UHB_AFC_CLIENT \
+	NL80211_FREQUENCY_ATTR_NO_6GHZ_AFC_CLIENT
 
 /**
  * enum nl80211_bitrate_attr - bitrate attributes
@@ -4517,8 +4521,8 @@ enum nl80211_sched_scan_match_attr {
 	peer-to-peer or adhoc communication under the control of a DFS master
 	which operates on the same channel (FCC-594280 D01 Section B.3).
 	Should be used together with %NL80211_RRF_DFS only.
- * @NL80211_RRF_NO_UHB_VLP_CLIENT: Client connection to VLP AP not allowed
- * @NL80211_RRF_NO_UHB_AFC_CLIENT: Client connection to AFC AP not allowed
+ * @NL80211_RRF_NO_6GHZ_VLP_CLIENT: Client connection to VLP AP not allowed
+ * @NL80211_RRF_NO_6GHZ_AFC_CLIENT: Client connection to AFC AP not allowed
  */
 enum nl80211_reg_rule_flags {
 	NL80211_RRF_NO_OFDM		= 1<<0,
@@ -4541,8 +4545,8 @@ enum nl80211_reg_rule_flags {
 	NL80211_RRF_NO_EHT		= 1<<19,
 	NL80211_RRF_PSD			= 1<<20,
 	NL80211_RRF_DFS_CONCURRENT	= 1<<21,
-	NL80211_RRF_NO_UHB_VLP_CLIENT	= 1<<22,
-	NL80211_RRF_NO_UHB_AFC_CLIENT	= 1<<23,
+	NL80211_RRF_NO_6GHZ_VLP_CLIENT	= 1<<22,
+	NL80211_RRF_NO_6GHZ_AFC_CLIENT	= 1<<23,
 };
 
 #define NL80211_RRF_PASSIVE_SCAN	NL80211_RRF_NO_IR
@@ -4551,6 +4555,8 @@ enum nl80211_reg_rule_flags {
 #define NL80211_RRF_NO_HT40		(NL80211_RRF_NO_HT40MINUS |\
 					 NL80211_RRF_NO_HT40PLUS)
 #define NL80211_RRF_GO_CONCURRENT	NL80211_RRF_IR_CONCURRENT
+#define NL80211_RRF_NO_UHB_VLP_CLIENT	NL80211_RRF_NO_6GHZ_VLP_CLIENT
+#define NL80211_RRF_NO_UHB_AFC_CLIENT	NL80211_RRF_NO_6GHZ_AFC_CLIENT
 
 /* For backport compatibility with older userspace */
 #define NL80211_RRF_NO_IR_ALL		(NL80211_RRF_NO_IR | __NL80211_RRF_NO_IBSS)
@@ -5098,14 +5104,17 @@ enum nl80211_bss_use_for {
  *	BSS isn't possible
  * @NL80211_BSS_CANNOT_USE_NSTR_NONPRIMARY: NSTR nonprimary links aren't
  *	supported by the device, and this BSS entry represents one.
- * @NL80211_BSS_CANNOT_USE_UHB_PWR_MISMATCH: STA is not supporting
+ * @NL80211_BSS_CANNOT_USE_6GHZ_PWR_MISMATCH: STA is not supporting
  *	the AP power type (SP, VLP, AP) that the AP uses.
  */
 enum nl80211_bss_cannot_use_reasons {
 	NL80211_BSS_CANNOT_USE_NSTR_NONPRIMARY	= 1 << 0,
-	NL80211_BSS_CANNOT_USE_UHB_PWR_MISMATCH	= 1 << 1,
+	NL80211_BSS_CANNOT_USE_6GHZ_PWR_MISMATCH	= 1 << 1,
 };
 
+#define NL80211_BSS_CANNOT_USE_UHB_PWR_MISMATCH \
+	NL80211_BSS_CANNOT_USE_6GHZ_PWR_MISMATCH
+
 /**
  * enum nl80211_bss - netlink attributes for a BSS
  *
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 5ca545753810..3b3b511f9f69 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -1198,11 +1198,11 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, struct wiphy *wiphy,
 		if ((chan->flags & IEEE80211_CHAN_DFS_CONCURRENT) &&
 		    nla_put_flag(msg, NL80211_FREQUENCY_ATTR_DFS_CONCURRENT))
 			goto nla_put_failure;
-		if ((chan->flags & IEEE80211_CHAN_NO_UHB_VLP_CLIENT) &&
-		    nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_UHB_VLP_CLIENT))
+		if ((chan->flags & IEEE80211_CHAN_NO_6GHZ_VLP_CLIENT) &&
+		    nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_6GHZ_VLP_CLIENT))
 			goto nla_put_failure;
-		if ((chan->flags & IEEE80211_CHAN_NO_UHB_AFC_CLIENT) &&
-		    nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_UHB_AFC_CLIENT))
+		if ((chan->flags & IEEE80211_CHAN_NO_6GHZ_AFC_CLIENT) &&
+		    nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_6GHZ_AFC_CLIENT))
 			goto nla_put_failure;
 	}
 
diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index 2741b626919a..50cadbad485f 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -5,7 +5,7 @@
  * Copyright 2008-2011	Luis R. Rodriguez <mcgrof@qca.qualcomm.com>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
  * Copyright      2017  Intel Deutschland GmbH
- * Copyright (C) 2018 - 2023 Intel Corporation
+ * Copyright (C) 2018 - 2024 Intel Corporation
  *
  * Permission to use, copy, modify, and/or distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -1595,10 +1595,10 @@ static u32 map_regdom_flags(u32 rd_flags)
 		channel_flags |= IEEE80211_CHAN_NO_EHT;
 	if (rd_flags & NL80211_RRF_DFS_CONCURRENT)
 		channel_flags |= IEEE80211_CHAN_DFS_CONCURRENT;
-	if (rd_flags & NL80211_RRF_NO_UHB_VLP_CLIENT)
-		channel_flags |= IEEE80211_CHAN_NO_UHB_VLP_CLIENT;
-	if (rd_flags & NL80211_RRF_NO_UHB_AFC_CLIENT)
-		channel_flags |= IEEE80211_CHAN_NO_UHB_AFC_CLIENT;
+	if (rd_flags & NL80211_RRF_NO_6GHZ_VLP_CLIENT)
+		channel_flags |= IEEE80211_CHAN_NO_6GHZ_VLP_CLIENT;
+	if (rd_flags & NL80211_RRF_NO_6GHZ_AFC_CLIENT)
+		channel_flags |= IEEE80211_CHAN_NO_6GHZ_AFC_CLIENT;
 	if (rd_flags & NL80211_RRF_PSD)
 		channel_flags |= IEEE80211_CHAN_PSD;
 	return channel_flags;
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index 04039e9dbd05..88e8b25c073a 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -5,7 +5,7 @@
  * Copyright 2008 Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
  * Copyright 2016	Intel Deutschland GmbH
- * Copyright (C) 2018-2023 Intel Corporation
+ * Copyright (C) 2018-2024 Intel Corporation
  */
 #include <linux/kernel.h>
 #include <linux/slab.h>
@@ -3041,9 +3041,9 @@ static bool cfg80211_uhb_power_type_valid(const u8 *ie,
 		case IEEE80211_6GHZ_CTRL_REG_LPI_AP:
 			return true;
 		case IEEE80211_6GHZ_CTRL_REG_SP_AP:
-			return !(flags & IEEE80211_CHAN_NO_UHB_AFC_CLIENT);
+			return !(flags & IEEE80211_CHAN_NO_6GHZ_AFC_CLIENT);
 		case IEEE80211_6GHZ_CTRL_REG_VLP_AP:
-			return !(flags & IEEE80211_CHAN_NO_UHB_VLP_CLIENT);
+			return !(flags & IEEE80211_CHAN_NO_6GHZ_VLP_CLIENT);
 		}
 	}
 	return false;
@@ -3112,7 +3112,7 @@ cfg80211_inform_single_bss_frame_data(struct wiphy *wiphy,
 		data->restrict_use = 1;
 		data->use_for = 0;
 		data->cannot_use_reasons =
-			NL80211_BSS_CANNOT_USE_UHB_PWR_MISMATCH;
+			NL80211_BSS_CANNOT_USE_6GHZ_PWR_MISMATCH;
 	}
 
 	if (ext) {
-- 
cgit v1.2.3


From a110a3b79177ddd7e7295671df97fb5386406835 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 6 Feb 2024 16:54:08 +0200
Subject: wifi: cfg80211: optionally support monitor on disabled channels

If the hardware supports a disabled channel, it may in
some cases be possible to use monitor mode (without any
transmit) on it when it's otherwise disabled. Add a new
channel flag IEEE80211_CHAN_CAN_MONITOR that makes it
possible for a driver to indicate such a thing.

Make it per channel so drivers could have a choice with
it, perhaps it's only possible on some channels, perhaps
some channels are not supported at all, but still there
and marked disabled.

In _nl80211_parse_chandef() simplify the code and check
only for an unknown channel, _cfg80211_chandef_usable()
will later check for IEEE80211_CHAN_DISABLED anyway.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://msgid.link/20240206164849.87fad3a21a09.I9116b2fdc2e2c9fd59a9273a64db7fcb41fc0328@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       |  4 ++++
 include/uapi/linux/nl80211.h |  4 ++++
 net/wireless/chan.c          | 26 +++++++++++++++++++-------
 net/wireless/core.h          |  5 ++++-
 net/wireless/nl80211.c       | 27 ++++++++++++++++++---------
 5 files changed, 49 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 7bb8484e859e..0a3151587556 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -122,6 +122,9 @@ struct wiphy;
  *	not permitted using this channel
  * @IEEE80211_CHAN_NO_6GHZ_AFC_CLIENT: Client connection with AFC AP
  *	not permitted using this channel
+ * @IEEE80211_CHAN_CAN_MONITOR: This channel can be used for monitor
+ *	mode even in the presence of other (regulatory) restrictions,
+ *	even if it is otherwise disabled.
  */
 enum ieee80211_channel_flags {
 	IEEE80211_CHAN_DISABLED		= 1<<0,
@@ -148,6 +151,7 @@ enum ieee80211_channel_flags {
 	IEEE80211_CHAN_DFS_CONCURRENT	= 1<<21,
 	IEEE80211_CHAN_NO_6GHZ_VLP_CLIENT = 1<<22,
 	IEEE80211_CHAN_NO_6GHZ_AFC_CLIENT = 1<<23,
+	IEEE80211_CHAN_CAN_MONITOR	= 1<<24,
 };
 
 #define IEEE80211_CHAN_NO_HT40 \
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 13fa10804909..546cc176c2a9 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -4273,6 +4273,9 @@ enum nl80211_wmm_rule {
  *	not allowed using this channel
  * @NL80211_FREQUENCY_ATTR_NO_6GHZ_AFC_CLIENT: Client connection to AFC AP
  *	not allowed using this channel
+ * @NL80211_FREQUENCY_ATTR_CAN_MONITOR: This channel can be used in monitor
+ *	mode despite other (regulatory) restrictions, even if the channel is
+ *	otherwise completely disabled.
  * @NL80211_FREQUENCY_ATTR_MAX: highest frequency attribute number
  *	currently defined
  * @__NL80211_FREQUENCY_ATTR_AFTER_LAST: internal use
@@ -4315,6 +4318,7 @@ enum nl80211_frequency_attr {
 	NL80211_FREQUENCY_ATTR_DFS_CONCURRENT,
 	NL80211_FREQUENCY_ATTR_NO_6GHZ_VLP_CLIENT,
 	NL80211_FREQUENCY_ATTR_NO_6GHZ_AFC_CLIENT,
+	NL80211_FREQUENCY_ATTR_CAN_MONITOR,
 
 	/* keep last */
 	__NL80211_FREQUENCY_ATTR_AFTER_LAST,
diff --git a/net/wireless/chan.c b/net/wireless/chan.c
index e2ce89afa9ff..3414b2c3abcc 100644
--- a/net/wireless/chan.c
+++ b/net/wireless/chan.c
@@ -1145,7 +1145,7 @@ EXPORT_SYMBOL(cfg80211_chandef_dfs_cac_time);
 
 static bool cfg80211_secondary_chans_ok(struct wiphy *wiphy,
 					u32 center_freq, u32 bandwidth,
-					u32 prohibited_flags)
+					u32 prohibited_flags, bool monitor)
 {
 	struct ieee80211_channel *c;
 	u32 freq, start_freq, end_freq;
@@ -1155,7 +1155,11 @@ static bool cfg80211_secondary_chans_ok(struct wiphy *wiphy,
 
 	for (freq = start_freq; freq <= end_freq; freq += MHZ_TO_KHZ(20)) {
 		c = ieee80211_get_channel_khz(wiphy, freq);
-		if (!c || c->flags & prohibited_flags)
+		if (!c)
+			return false;
+		if (monitor && c->flags & IEEE80211_CHAN_CAN_MONITOR)
+			continue;
+		if (c->flags & prohibited_flags)
 			return false;
 	}
 
@@ -1215,9 +1219,9 @@ static bool cfg80211_edmg_usable(struct wiphy *wiphy, u8 edmg_channels,
 	return true;
 }
 
-bool cfg80211_chandef_usable(struct wiphy *wiphy,
-			     const struct cfg80211_chan_def *chandef,
-			     u32 prohibited_flags)
+bool _cfg80211_chandef_usable(struct wiphy *wiphy,
+			      const struct cfg80211_chan_def *chandef,
+			      u32 prohibited_flags, bool monitor)
 {
 	struct ieee80211_sta_ht_cap *ht_cap;
 	struct ieee80211_sta_vht_cap *vht_cap;
@@ -1379,14 +1383,22 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy,
 
 	if (!cfg80211_secondary_chans_ok(wiphy,
 					 ieee80211_chandef_to_khz(chandef),
-					 width, prohibited_flags))
+					 width, prohibited_flags, monitor))
 		return false;
 
 	if (!chandef->center_freq2)
 		return true;
 	return cfg80211_secondary_chans_ok(wiphy,
 					   MHZ_TO_KHZ(chandef->center_freq2),
-					   width, prohibited_flags);
+					   width, prohibited_flags, monitor);
+}
+
+bool cfg80211_chandef_usable(struct wiphy *wiphy,
+			     const struct cfg80211_chan_def *chandef,
+			     u32 prohibited_flags)
+{
+	return _cfg80211_chandef_usable(wiphy, chandef, prohibited_flags,
+					false);
 }
 EXPORT_SYMBOL(cfg80211_chandef_usable);
 
diff --git a/net/wireless/core.h b/net/wireless/core.h
index debf63e6c61f..118f2f619828 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -3,7 +3,7 @@
  * Wireless configuration interface internals.
  *
  * Copyright 2006-2010	Johannes Berg <johannes@sipsolutions.net>
- * Copyright (C) 2018-2023 Intel Corporation
+ * Copyright (C) 2018-2024 Intel Corporation
  */
 #ifndef __NET_WIRELESS_CORE_H
 #define __NET_WIRELESS_CORE_H
@@ -492,6 +492,9 @@ bool cfg80211_is_sub_chan(struct cfg80211_chan_def *chandef,
 bool cfg80211_wdev_on_sub_chan(struct wireless_dev *wdev,
 			       struct ieee80211_channel *chan,
 			       bool primary_only);
+bool _cfg80211_chandef_usable(struct wiphy *wiphy,
+			      const struct cfg80211_chan_def *chandef,
+			      u32 prohibited_flags, bool monitor);
 
 static inline unsigned int elapsed_jiffies_msecs(unsigned long start)
 {
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 3b3b511f9f69..612ca99fbf39 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -3218,9 +3218,9 @@ static bool nl80211_can_set_dev_channel(struct wireless_dev *wdev)
 		wdev->iftype == NL80211_IFTYPE_P2P_GO;
 }
 
-int nl80211_parse_chandef(struct cfg80211_registered_device *rdev,
-			  struct genl_info *info,
-			  struct cfg80211_chan_def *chandef)
+static int _nl80211_parse_chandef(struct cfg80211_registered_device *rdev,
+				  struct genl_info *info, bool monitor,
+				  struct cfg80211_chan_def *chandef)
 {
 	struct netlink_ext_ack *extack = info->extack;
 	struct nlattr **attrs = info->attrs;
@@ -3245,10 +3245,9 @@ int nl80211_parse_chandef(struct cfg80211_registered_device *rdev,
 	chandef->freq1_offset = control_freq % 1000;
 	chandef->center_freq2 = 0;
 
-	/* Primary channel not allowed */
-	if (!chandef->chan || chandef->chan->flags & IEEE80211_CHAN_DISABLED) {
+	if (!chandef->chan) {
 		NL_SET_ERR_MSG_ATTR(extack, attrs[NL80211_ATTR_WIPHY_FREQ],
-				    "Channel is disabled");
+				    "Unknown channel");
 		return -EINVAL;
 	}
 
@@ -3343,8 +3342,9 @@ int nl80211_parse_chandef(struct cfg80211_registered_device *rdev,
 		return -EINVAL;
 	}
 
-	if (!cfg80211_chandef_usable(&rdev->wiphy, chandef,
-				     IEEE80211_CHAN_DISABLED)) {
+	if (!_cfg80211_chandef_usable(&rdev->wiphy, chandef,
+				      IEEE80211_CHAN_DISABLED,
+				      monitor)) {
 		NL_SET_ERR_MSG(extack, "(extension) channel is disabled");
 		return -EINVAL;
 	}
@@ -3359,6 +3359,13 @@ int nl80211_parse_chandef(struct cfg80211_registered_device *rdev,
 	return 0;
 }
 
+int nl80211_parse_chandef(struct cfg80211_registered_device *rdev,
+			  struct genl_info *info,
+			  struct cfg80211_chan_def *chandef)
+{
+	return _nl80211_parse_chandef(rdev, info, false, chandef);
+}
+
 static int __nl80211_set_channel(struct cfg80211_registered_device *rdev,
 				 struct net_device *dev,
 				 struct genl_info *info,
@@ -3383,7 +3390,9 @@ static int __nl80211_set_channel(struct cfg80211_registered_device *rdev,
 		link_id = 0;
 	}
 
-	result = nl80211_parse_chandef(rdev, info, &chandef);
+	result = _nl80211_parse_chandef(rdev, info,
+					iftype == NL80211_IFTYPE_MONITOR,
+					&chandef);
 	if (result)
 		return result;
 
-- 
cgit v1.2.3


From 93d9f26db5b34ec4d2d5056aeb8819cbef35519c Mon Sep 17 00:00:00 2001
From: Shaul Triebitz <shaul.triebitz@intel.com>
Date: Tue, 6 Feb 2024 16:54:10 +0200
Subject: wifi: nl80211: allow reporting wakeup for unprot deauth/disassoc

Add a report reason for waking up due to an unprotected
deauth/disassoc when MFP is used.

If setting wowlan to wake on disconnection, and an unprotected
deatuh/disassoc arrived (in MFP), some drivers might want to
report wakeup due to unprotected deauth/disassoc, rather than
dissassociation. Add support for that.

Signed-off-by: Shaul Triebitz <shaul.triebitz@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://msgid.link/20240206164849.8dc9ad531a17.I7f8e926adf927f762e11aaa3458f6354665c7fc5@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 546cc176c2a9..f23ecbdd84a2 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -5757,6 +5757,8 @@ struct nl80211_pattern_support {
  *	%NL80211_ATTR_SCAN_FREQUENCIES contains more than one
  *	frequency, it means that the match occurred in more than one
  *	channel.
+ * @NL80211_WOWLAN_TRIG_UNPROTECTED_DEAUTH_DISASSOC: For wakeup reporting only.
+ *	Wake up happened due to unprotected deauth or disassoc frame in MFP.
  * @NUM_NL80211_WOWLAN_TRIG: number of wake on wireless triggers
  * @MAX_NL80211_WOWLAN_TRIG: highest wowlan trigger attribute number
  *
@@ -5784,6 +5786,7 @@ enum nl80211_wowlan_triggers {
 	NL80211_WOWLAN_TRIG_WAKEUP_TCP_NOMORETOKENS,
 	NL80211_WOWLAN_TRIG_NET_DETECT,
 	NL80211_WOWLAN_TRIG_NET_DETECT_RESULTS,
+	NL80211_WOWLAN_TRIG_UNPROTECTED_DEAUTH_DISASSOC,
 
 	/* keep last */
 	NUM_NL80211_WOWLAN_TRIG,
-- 
cgit v1.2.3


From a64be8296e31f432d4a9df4db684cc8a250eb81c Mon Sep 17 00:00:00 2001
From: Shaul Triebitz <shaul.triebitz@intel.com>
Date: Tue, 6 Feb 2024 16:54:11 +0200
Subject: wifi: cfg80211: report unprotected deauth/disassoc in wowlan

Add to cfg80211_wowlan_wakeup another wakeup reason -
unprot_deauth_disassoc.
To be set to true if the woke up was due to an
unprotected deauth or disassoc frame in MFP.
In that case report WOWLAN_TRIG_UNPROTECTED_DEAUTH_DISASSOC.

Signed-off-by: Shaul Triebitz <shaul.triebitz@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://msgid.link/20240206164849.a3d739850d03.I8f52a21c4f36d1af1f8068bed79e2f9cbf8289ef@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 5 ++++-
 net/wireless/nl80211.c | 5 +++++
 2 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 0a3151587556..93e9abb7fc3d 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -3612,12 +3612,15 @@ struct cfg80211_wowlan_nd_info {
  * @tcp_connlost: TCP connection lost or failed to establish
  * @tcp_nomoretokens: TCP data ran out of tokens
  * @net_detect: if not %NULL, woke up because of net detect
+ * @unprot_deauth_disassoc: woke up due to unprotected deauth or
+ *	disassoc frame (in MFP).
  */
 struct cfg80211_wowlan_wakeup {
 	bool disconnect, magic_pkt, gtk_rekey_failure,
 	     eap_identity_req, four_way_handshake,
 	     rfkill_release, packet_80211,
-	     tcp_match, tcp_connlost, tcp_nomoretokens;
+	     tcp_match, tcp_connlost, tcp_nomoretokens,
+	     unprot_deauth_disassoc;
 	s32 pattern_idx;
 	u32 packet_present_len, packet_len;
 	const void *packet;
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 612ca99fbf39..5f18cbf7cc3d 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -19851,6 +19851,11 @@ void cfg80211_report_wowlan_wakeup(struct wireless_dev *wdev,
 				 NL80211_WOWLAN_TRIG_WAKEUP_TCP_NOMORETOKENS))
 			goto free_msg;
 
+		if (wakeup->unprot_deauth_disassoc &&
+		    nla_put_flag(msg,
+				 NL80211_WOWLAN_TRIG_UNPROTECTED_DEAUTH_DISASSOC))
+			goto free_msg;
+
 		if (wakeup->packet) {
 			u32 pkt_attr = NL80211_WOWLAN_TRIG_WAKEUP_PKT_80211;
 			u32 len_attr = NL80211_WOWLAN_TRIG_WAKEUP_PKT_80211_LEN;
-- 
cgit v1.2.3


From a3522a2edb3faf8cb98d38c2a99f5967beef24e2 Mon Sep 17 00:00:00 2001
From: Guillaume Nault <gnault@redhat.com>
Date: Fri, 9 Feb 2024 17:43:37 +0100
Subject: ipv4: Set the routing scope properly in ip_route_output_ports().

Set scope automatically in ip_route_output_ports() (using the socket
SOCK_LOCALROUTE flag). This way, callers don't have to overload the
tos with the RTO_ONLINK flag, like RT_CONN_FLAGS() does.

For callers that don't pass a struct sock, this doesn't change anything
as the scope is still set to RT_SCOPE_UNIVERSE when sk is NULL.

Callers that passed a struct sock and used RT_CONN_FLAGS(sk) or
RT_CONN_FLAGS_TOS(sk, tos) for the tos are modified to use
ip_sock_tos(sk) and RT_TOS(tos) respectively, as overloading tos with
the RTO_ONLINK flag now becomes unnecessary.

In drivers/net/amt.c, all ip_route_output_ports() calls use a 0 tos
parameter, ignoring the SOCK_LOCALROUTE flag of the socket. But the sk
parameter is a kernel socket, which doesn't have any configuration path
for setting SOCK_LOCALROUTE anyway. Therefore, ip_route_output_ports()
will continue to initialise scope with RT_SCOPE_UNIVERSE and amt.c
doesn't need to be modified.

Also, remove RT_CONN_FLAGS() and RT_CONN_FLAGS_TOS() from route.h as
these macros are now unused.

The objective is to eventually remove RTO_ONLINK entirely to allow
converting ->flowi4_tos to dscp_t. This will ensure proper isolation
between the DSCP and ECN bits, thus minimising the risk of introducing
bugs where TOS values interfere with ECN.

Signed-off-by: Guillaume Nault <gnault@redhat.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/dacfd2ab40685e20959ab7b53c427595ba229e7d.1707496938.git.gnault@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/route.h             | 7 ++-----
 net/ipv4/af_inet.c              | 2 +-
 net/ipv4/datagram.c             | 2 +-
 net/ipv4/inet_connection_sock.c | 2 +-
 net/ipv4/ip_output.c            | 2 +-
 net/l2tp/l2tp_ip.c              | 2 +-
 6 files changed, 7 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/net/route.h b/include/net/route.h
index 980ab474eabd..d4a0147942f1 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -37,9 +37,6 @@
 
 #define RTO_ONLINK	0x01
 
-#define RT_CONN_FLAGS(sk)   (RT_TOS(READ_ONCE(inet_sk(sk)->tos)) | sock_flag(sk, SOCK_LOCALROUTE))
-#define RT_CONN_FLAGS_TOS(sk,tos)   (RT_TOS(tos) | sock_flag(sk, SOCK_LOCALROUTE))
-
 static inline __u8 ip_sock_rt_scope(const struct sock *sk)
 {
 	if (sock_flag(sk, SOCK_LOCALROUTE))
@@ -163,8 +160,8 @@ static inline struct rtable *ip_route_output_ports(struct net *net, struct flowi
 						   __u8 proto, __u8 tos, int oif)
 {
 	flowi4_init_output(fl4, oif, sk ? READ_ONCE(sk->sk_mark) : 0, tos,
-			   RT_SCOPE_UNIVERSE, proto,
-			   sk ? inet_sk_flowi_flags(sk) : 0,
+			   sk ? ip_sock_rt_scope(sk) : RT_SCOPE_UNIVERSE,
+			   proto, sk ? inet_sk_flowi_flags(sk) : 0,
 			   daddr, saddr, dport, sport, sock_net_uid(net, sk));
 	if (sk)
 		security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4));
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index a5a820ee2026..ad278009e469 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1326,7 +1326,7 @@ int inet_sk_rebuild_header(struct sock *sk)
 	fl4 = &inet->cork.fl.u.ip4;
 	rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, inet->inet_saddr,
 				   inet->inet_dport, inet->inet_sport,
-				   sk->sk_protocol, RT_CONN_FLAGS(sk),
+				   sk->sk_protocol, ip_sock_rt_tos(sk),
 				   sk->sk_bound_dev_if);
 	if (!IS_ERR(rt)) {
 		err = 0;
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index 2cc50cbfc2a3..cc6d0bd7b0a9 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -119,7 +119,7 @@ void ip4_datagram_release_cb(struct sock *sk)
 	rt = ip_route_output_ports(sock_net(sk), &fl4, sk, daddr,
 				   inet->inet_saddr, inet->inet_dport,
 				   inet->inet_sport, sk->sk_protocol,
-				   RT_CONN_FLAGS(sk), sk->sk_bound_dev_if);
+				   ip_sock_rt_tos(sk), sk->sk_bound_dev_if);
 
 	dst = !IS_ERR(rt) ? &rt->dst : NULL;
 	sk_dst_set(sk, dst);
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 459af1f89739..747ed7344cbe 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -1467,7 +1467,7 @@ static struct dst_entry *inet_csk_rebuild_route(struct sock *sk, struct flowi *f
 	rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr,
 				   inet->inet_saddr, inet->inet_dport,
 				   inet->inet_sport, sk->sk_protocol,
-				   RT_CONN_FLAGS(sk), sk->sk_bound_dev_if);
+				   ip_sock_rt_tos(sk), sk->sk_bound_dev_if);
 	if (IS_ERR(rt))
 		rt = NULL;
 	if (rt)
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 41537d18eecf..5b5a0adb927f 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -493,7 +493,7 @@ int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
 					   inet->inet_dport,
 					   inet->inet_sport,
 					   sk->sk_protocol,
-					   RT_CONN_FLAGS_TOS(sk, tos),
+					   RT_TOS(tos),
 					   sk->sk_bound_dev_if);
 		if (IS_ERR(rt))
 			goto no_route;
diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
index 9a2a9ed3ba47..970af3983d11 100644
--- a/net/l2tp/l2tp_ip.c
+++ b/net/l2tp/l2tp_ip.c
@@ -478,7 +478,7 @@ static int l2tp_ip_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 		rt = ip_route_output_ports(sock_net(sk), fl4, sk,
 					   daddr, inet->inet_saddr,
 					   inet->inet_dport, inet->inet_sport,
-					   sk->sk_protocol, RT_CONN_FLAGS(sk),
+					   sk->sk_protocol, ip_sock_rt_tos(sk),
 					   sk->sk_bound_dev_if);
 		if (IS_ERR(rt))
 			goto no_route;
-- 
cgit v1.2.3


From c83c22ec1493c0b7cc77327bedbd387e295872b6 Mon Sep 17 00:00:00 2001
From: Oliver Hartkopp <socketcan@hartkopp.net>
Date: Mon, 12 Feb 2024 22:35:50 +0100
Subject: can: canxl: add virtual CAN network identifier support

CAN XL data frames contain an 8-bit virtual CAN network identifier (VCID).
A VCID value of zero represents an 'untagged' CAN XL frame.

To receive and send these optional VCIDs via CAN_RAW sockets a new socket
option CAN_RAW_XL_VCID_OPTS is introduced to define/access VCID content:

- tx: set the outgoing VCID value by the kernel (one fixed 8-bit value)
- tx: pass through VCID values from the user space (e.g. for traffic replay)
- rx: apply VCID receive filter (value/mask) to be passed to the user space

With the 'tx pass through' option CAN_RAW_XL_VCID_TX_PASS all valid VCID
values can be sent, e.g. to replay full qualified CAN XL traffic.

The VCID value provided for the CAN_RAW_XL_VCID_TX_SET option will
override the VCID value in the struct canxl_frame.prio defined for
CAN_RAW_XL_VCID_TX_PASS when both flags are set.

With a rx_vcid_mask of zero all possible VCID values (0x00 - 0xFF) are
passed to the user space when the CAN_RAW_XL_VCID_RX_FILTER flag is set.
Without this flag only untagged CAN XL frames (VCID = 0x00) are delivered
to the user space (default).

The 8-bit VCID is stored inside the CAN XL prio element (only in CAN XL
frames!) to not interfere with other CAN content or the CAN filters
provided by the CAN_RAW sockets and kernel infrastruture.

Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
Link: https://lore.kernel.org/all/20240212213550.18516-1-socketcan@hartkopp.net
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 include/uapi/linux/can.h     |  9 ++++-
 include/uapi/linux/can/raw.h | 16 ++++++++
 net/can/af_can.c             |  2 +
 net/can/raw.c                | 93 ++++++++++++++++++++++++++++++++++++++++----
 4 files changed, 110 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/can.h b/include/uapi/linux/can.h
index 939db2388208..e78cbd85ce7c 100644
--- a/include/uapi/linux/can.h
+++ b/include/uapi/linux/can.h
@@ -193,9 +193,14 @@ struct canfd_frame {
 #define CANXL_XLF 0x80 /* mandatory CAN XL frame flag (must always be set!) */
 #define CANXL_SEC 0x01 /* Simple Extended Content (security/segmentation) */
 
+/* the 8-bit VCID is optionally placed in the canxl_frame.prio element */
+#define CANXL_VCID_OFFSET 16 /* bit offset of VCID in prio element */
+#define CANXL_VCID_VAL_MASK 0xFFUL /* VCID is an 8-bit value */
+#define CANXL_VCID_MASK (CANXL_VCID_VAL_MASK << CANXL_VCID_OFFSET)
+
 /**
  * struct canxl_frame - CAN with e'X'tended frame 'L'ength frame structure
- * @prio:  11 bit arbitration priority with zero'ed CAN_*_FLAG flags
+ * @prio:  11 bit arbitration priority with zero'ed CAN_*_FLAG flags / VCID
  * @flags: additional flags for CAN XL
  * @sdt:   SDU (service data unit) type
  * @len:   frame payload length in byte (CANXL_MIN_DLEN .. CANXL_MAX_DLEN)
@@ -205,7 +210,7 @@ struct canfd_frame {
  * @prio shares the same position as @can_id from struct can[fd]_frame.
  */
 struct canxl_frame {
-	canid_t prio;  /* 11 bit priority for arbitration (canid_t) */
+	canid_t prio;  /* 11 bit priority for arbitration / 8 bit VCID */
 	__u8    flags; /* additional flags for CAN XL */
 	__u8    sdt;   /* SDU (service data unit) type */
 	__u16   len;   /* frame payload length in byte */
diff --git a/include/uapi/linux/can/raw.h b/include/uapi/linux/can/raw.h
index 31622c9b7988..e024d896e278 100644
--- a/include/uapi/linux/can/raw.h
+++ b/include/uapi/linux/can/raw.h
@@ -65,6 +65,22 @@ enum {
 	CAN_RAW_FD_FRAMES,	/* allow CAN FD frames (default:off) */
 	CAN_RAW_JOIN_FILTERS,	/* all filters must match to trigger */
 	CAN_RAW_XL_FRAMES,	/* allow CAN XL frames (default:off) */
+	CAN_RAW_XL_VCID_OPTS,	/* CAN XL VCID configuration options */
 };
 
+/* configuration for CAN XL virtual CAN identifier (VCID) handling */
+struct can_raw_vcid_options {
+
+	__u8 flags;		/* flags for vcid (filter) behaviour */
+	__u8 tx_vcid;		/* VCID value set into canxl_frame.prio */
+	__u8 rx_vcid;		/* VCID value for VCID filter */
+	__u8 rx_vcid_mask;	/* VCID mask for VCID filter */
+
+};
+
+/* can_raw_vcid_options.flags for CAN XL virtual CAN identifier handling */
+#define CAN_RAW_XL_VCID_TX_SET		0x01
+#define CAN_RAW_XL_VCID_TX_PASS		0x02
+#define CAN_RAW_XL_VCID_RX_FILTER	0x04
+
 #endif /* !_UAPI_CAN_RAW_H */
diff --git a/net/can/af_can.c b/net/can/af_can.c
index 7343fd487dbe..707576eeeb58 100644
--- a/net/can/af_can.c
+++ b/net/can/af_can.c
@@ -865,6 +865,8 @@ static __init int can_init(void)
 	/* check for correct padding to be able to use the structs similarly */
 	BUILD_BUG_ON(offsetof(struct can_frame, len) !=
 		     offsetof(struct canfd_frame, len) ||
+		     offsetof(struct can_frame, len) !=
+		     offsetof(struct canxl_frame, flags) ||
 		     offsetof(struct can_frame, data) !=
 		     offsetof(struct canfd_frame, data));
 
diff --git a/net/can/raw.c b/net/can/raw.c
index e6b822624ba2..cb8e6f788af8 100644
--- a/net/can/raw.c
+++ b/net/can/raw.c
@@ -91,6 +91,10 @@ struct raw_sock {
 	int recv_own_msgs;
 	int fd_frames;
 	int xl_frames;
+	struct can_raw_vcid_options raw_vcid_opts;
+	canid_t tx_vcid_shifted;
+	canid_t rx_vcid_shifted;
+	canid_t rx_vcid_mask_shifted;
 	int join_filters;
 	int count;                 /* number of active filters */
 	struct can_filter dfilter; /* default/single filter */
@@ -134,10 +138,29 @@ static void raw_rcv(struct sk_buff *oskb, void *data)
 		return;
 
 	/* make sure to not pass oversized frames to the socket */
-	if ((!ro->fd_frames && can_is_canfd_skb(oskb)) ||
-	    (!ro->xl_frames && can_is_canxl_skb(oskb)))
+	if (!ro->fd_frames && can_is_canfd_skb(oskb))
 		return;
 
+	if (can_is_canxl_skb(oskb)) {
+		struct canxl_frame *cxl = (struct canxl_frame *)oskb->data;
+
+		/* make sure to not pass oversized frames to the socket */
+		if (!ro->xl_frames)
+			return;
+
+		/* filter CAN XL VCID content */
+		if (ro->raw_vcid_opts.flags & CAN_RAW_XL_VCID_RX_FILTER) {
+			/* apply VCID filter if user enabled the filter */
+			if ((cxl->prio & ro->rx_vcid_mask_shifted) !=
+			    (ro->rx_vcid_shifted & ro->rx_vcid_mask_shifted))
+				return;
+		} else {
+			/* no filter => do not forward VCID tagged frames */
+			if (cxl->prio & CANXL_VCID_MASK)
+				return;
+		}
+	}
+
 	/* eliminate multiple filter matches for the same skb */
 	if (this_cpu_ptr(ro->uniq)->skb == oskb &&
 	    this_cpu_ptr(ro->uniq)->skbcnt == can_skb_prv(oskb)->skbcnt) {
@@ -698,6 +721,19 @@ static int raw_setsockopt(struct socket *sock, int level, int optname,
 			ro->fd_frames = ro->xl_frames;
 		break;
 
+	case CAN_RAW_XL_VCID_OPTS:
+		if (optlen != sizeof(ro->raw_vcid_opts))
+			return -EINVAL;
+
+		if (copy_from_sockptr(&ro->raw_vcid_opts, optval, optlen))
+			return -EFAULT;
+
+		/* prepare 32 bit values for handling in hot path */
+		ro->tx_vcid_shifted = ro->raw_vcid_opts.tx_vcid << CANXL_VCID_OFFSET;
+		ro->rx_vcid_shifted = ro->raw_vcid_opts.rx_vcid << CANXL_VCID_OFFSET;
+		ro->rx_vcid_mask_shifted = ro->raw_vcid_opts.rx_vcid_mask << CANXL_VCID_OFFSET;
+		break;
+
 	case CAN_RAW_JOIN_FILTERS:
 		if (optlen != sizeof(ro->join_filters))
 			return -EINVAL;
@@ -786,6 +822,21 @@ static int raw_getsockopt(struct socket *sock, int level, int optname,
 		val = &ro->xl_frames;
 		break;
 
+	case CAN_RAW_XL_VCID_OPTS:
+		/* user space buffer to small for VCID opts? */
+		if (len < sizeof(ro->raw_vcid_opts)) {
+			/* return -ERANGE and needed space in optlen */
+			err = -ERANGE;
+			if (put_user(sizeof(ro->raw_vcid_opts), optlen))
+				err = -EFAULT;
+		} else {
+			if (len > sizeof(ro->raw_vcid_opts))
+				len = sizeof(ro->raw_vcid_opts);
+			if (copy_to_user(optval, &ro->raw_vcid_opts, len))
+				err = -EFAULT;
+		}
+		break;
+
 	case CAN_RAW_JOIN_FILTERS:
 		if (len > sizeof(int))
 			len = sizeof(int);
@@ -803,23 +854,41 @@ static int raw_getsockopt(struct socket *sock, int level, int optname,
 	return 0;
 }
 
-static bool raw_bad_txframe(struct raw_sock *ro, struct sk_buff *skb, int mtu)
+static void raw_put_canxl_vcid(struct raw_sock *ro, struct sk_buff *skb)
+{
+	struct canxl_frame *cxl = (struct canxl_frame *)skb->data;
+
+	/* sanitize non CAN XL bits */
+	cxl->prio &= (CANXL_PRIO_MASK | CANXL_VCID_MASK);
+
+	/* clear VCID in CAN XL frame if pass through is disabled */
+	if (!(ro->raw_vcid_opts.flags & CAN_RAW_XL_VCID_TX_PASS))
+		cxl->prio &= CANXL_PRIO_MASK;
+
+	/* set VCID in CAN XL frame if enabled */
+	if (ro->raw_vcid_opts.flags & CAN_RAW_XL_VCID_TX_SET) {
+		cxl->prio &= CANXL_PRIO_MASK;
+		cxl->prio |= ro->tx_vcid_shifted;
+	}
+}
+
+static unsigned int raw_check_txframe(struct raw_sock *ro, struct sk_buff *skb, int mtu)
 {
 	/* Classical CAN -> no checks for flags and device capabilities */
 	if (can_is_can_skb(skb))
-		return false;
+		return CAN_MTU;
 
 	/* CAN FD -> needs to be enabled and a CAN FD or CAN XL device */
 	if (ro->fd_frames && can_is_canfd_skb(skb) &&
 	    (mtu == CANFD_MTU || can_is_canxl_dev_mtu(mtu)))
-		return false;
+		return CANFD_MTU;
 
 	/* CAN XL -> needs to be enabled and a CAN XL device */
 	if (ro->xl_frames && can_is_canxl_skb(skb) &&
 	    can_is_canxl_dev_mtu(mtu))
-		return false;
+		return CANXL_MTU;
 
-	return true;
+	return 0;
 }
 
 static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
@@ -829,6 +898,7 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
 	struct sockcm_cookie sockc;
 	struct sk_buff *skb;
 	struct net_device *dev;
+	unsigned int txmtu;
 	int ifindex;
 	int err = -EINVAL;
 
@@ -869,9 +939,16 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
 		goto free_skb;
 
 	err = -EINVAL;
-	if (raw_bad_txframe(ro, skb, dev->mtu))
+
+	/* check for valid CAN (CC/FD/XL) frame content */
+	txmtu = raw_check_txframe(ro, skb, dev->mtu);
+	if (!txmtu)
 		goto free_skb;
 
+	/* only CANXL: clear/forward/set VCID value */
+	if (txmtu == CANXL_MTU)
+		raw_put_canxl_vcid(ro, skb);
+
 	sockcm_init(&sockc, sk);
 	if (msg->msg_controllen) {
 		err = sock_cmsg_send(sk, msg, &sockc);
-- 
cgit v1.2.3


From 178c54666f9c4d2f49f2ea661d0c11b52f0ed190 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song@linux.dev>
Date: Tue, 6 Feb 2024 23:01:02 -0800
Subject: bpf: Mark bpf_spin_{lock,unlock}() helpers with notrace correctly

Currently tracing is supposed not to allow for bpf_spin_{lock,unlock}()
helper calls. This is to prevent deadlock for the following cases:
  - there is a prog (prog-A) calling bpf_spin_{lock,unlock}().
  - there is a tracing program (prog-B), e.g., fentry, attached
    to bpf_spin_lock() and/or bpf_spin_unlock().
  - prog-B calls bpf_spin_{lock,unlock}().
For such a case, when prog-A calls bpf_spin_{lock,unlock}(),
a deadlock will happen.

The related source codes are below in kernel/bpf/helpers.c:
  notrace BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock)
  notrace BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock)
notrace is supposed to prevent fentry prog from attaching to
bpf_spin_{lock,unlock}().

But actually this is not the case and fentry prog can successfully
attached to bpf_spin_lock(). Siddharth Chintamaneni reported
the issue in [1]. The following is the macro definition for
above BPF_CALL_1:
  #define BPF_CALL_x(x, name, ...)                                               \
        static __always_inline                                                 \
        u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__));   \
        typedef u64 (*btf_##name)(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)); \
        u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__));         \
        u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__))          \
        {                                                                      \
                return ((btf_##name)____##name)(__BPF_MAP(x,__BPF_CAST,__BPF_N,__VA_ARGS__));\
        }                                                                      \
        static __always_inline                                                 \
        u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__))

  #define BPF_CALL_1(name, ...)   BPF_CALL_x(1, name, __VA_ARGS__)

The notrace attribute is actually applied to the static always_inline function
____bpf_spin_{lock,unlock}(). The actual callback function
bpf_spin_{lock,unlock}() is not marked with notrace, hence
allowing fentry prog to attach to two helpers, and this
may cause the above mentioned deadlock. Siddharth Chintamaneni
actually has a reproducer in [2].

To fix the issue, a new macro NOTRACE_BPF_CALL_1 is introduced which
will add notrace attribute to the original function instead of
the hidden always_inline function and this fixed the problem.

  [1] https://lore.kernel.org/bpf/CAE5sdEigPnoGrzN8WU7Tx-h-iFuMZgW06qp0KHWtpvoXxf1OAQ@mail.gmail.com/
  [2] https://lore.kernel.org/bpf/CAE5sdEg6yUc_Jz50AnUXEEUh6O73yQ1Z6NV2srJnef0ZrQkZew@mail.gmail.com/

Fixes: d83525ca62cf ("bpf: introduce bpf_spin_lock")
Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/bpf/20240207070102.335167-1-yonghong.song@linux.dev
---
 include/linux/filter.h | 21 ++++++++++++---------
 kernel/bpf/helpers.c   |  4 ++--
 2 files changed, 14 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index fee070b9826e..36cc29a2934c 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -547,24 +547,27 @@ static inline bool insn_is_zext(const struct bpf_insn *insn)
 	__BPF_MAP(n, __BPF_DECL_ARGS, __BPF_N, u64, __ur_1, u64, __ur_2,       \
 		  u64, __ur_3, u64, __ur_4, u64, __ur_5)
 
-#define BPF_CALL_x(x, name, ...)					       \
+#define BPF_CALL_x(x, attr, name, ...)					       \
 	static __always_inline						       \
 	u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__));   \
 	typedef u64 (*btf_##name)(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)); \
-	u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__));	       \
-	u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__))	       \
+	attr u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__));    \
+	attr u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__))     \
 	{								       \
 		return ((btf_##name)____##name)(__BPF_MAP(x,__BPF_CAST,__BPF_N,__VA_ARGS__));\
 	}								       \
 	static __always_inline						       \
 	u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__))
 
-#define BPF_CALL_0(name, ...)	BPF_CALL_x(0, name, __VA_ARGS__)
-#define BPF_CALL_1(name, ...)	BPF_CALL_x(1, name, __VA_ARGS__)
-#define BPF_CALL_2(name, ...)	BPF_CALL_x(2, name, __VA_ARGS__)
-#define BPF_CALL_3(name, ...)	BPF_CALL_x(3, name, __VA_ARGS__)
-#define BPF_CALL_4(name, ...)	BPF_CALL_x(4, name, __VA_ARGS__)
-#define BPF_CALL_5(name, ...)	BPF_CALL_x(5, name, __VA_ARGS__)
+#define __NOATTR
+#define BPF_CALL_0(name, ...)	BPF_CALL_x(0, __NOATTR, name, __VA_ARGS__)
+#define BPF_CALL_1(name, ...)	BPF_CALL_x(1, __NOATTR, name, __VA_ARGS__)
+#define BPF_CALL_2(name, ...)	BPF_CALL_x(2, __NOATTR, name, __VA_ARGS__)
+#define BPF_CALL_3(name, ...)	BPF_CALL_x(3, __NOATTR, name, __VA_ARGS__)
+#define BPF_CALL_4(name, ...)	BPF_CALL_x(4, __NOATTR, name, __VA_ARGS__)
+#define BPF_CALL_5(name, ...)	BPF_CALL_x(5, __NOATTR, name, __VA_ARGS__)
+
+#define NOTRACE_BPF_CALL_1(name, ...)	BPF_CALL_x(1, notrace, name, __VA_ARGS__)
 
 #define bpf_ctx_range(TYPE, MEMBER)						\
 	offsetof(TYPE, MEMBER) ... offsetofend(TYPE, MEMBER) - 1
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 3503949b4c1b..93edf730d288 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -334,7 +334,7 @@ static inline void __bpf_spin_lock_irqsave(struct bpf_spin_lock *lock)
 	__this_cpu_write(irqsave_flags, flags);
 }
 
-notrace BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock)
+NOTRACE_BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock)
 {
 	__bpf_spin_lock_irqsave(lock);
 	return 0;
@@ -357,7 +357,7 @@ static inline void __bpf_spin_unlock_irqrestore(struct bpf_spin_lock *lock)
 	local_irq_restore(flags);
 }
 
-notrace BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock)
+NOTRACE_BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock)
 {
 	__bpf_spin_unlock_irqrestore(lock);
 	return 0;
-- 
cgit v1.2.3


From 77c0208e199ccb0986fb3612f2409c8cdcb036ad Mon Sep 17 00:00:00 2001
From: Kui-Feng Lee <thinker.li@gmail.com>
Date: Thu, 8 Feb 2024 18:37:47 -0800
Subject: bpf: add btf pointer to struct bpf_ctx_arg_aux.

Enable the providers to use types defined in a module instead of in the
kernel (btf_vmlinux).

Signed-off-by: Kui-Feng Lee <thinker.li@gmail.com>
Link: https://lore.kernel.org/r/20240209023750.1153905-2-thinker.li@gmail.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/linux/bpf.h | 1 +
 kernel/bpf/btf.c    | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 1ebbee1d648e..3b7836f0a83e 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1415,6 +1415,7 @@ struct bpf_jit_poke_descriptor {
 struct bpf_ctx_arg_aux {
 	u32 offset;
 	enum bpf_reg_type reg_type;
+	struct btf *btf;
 	u32 btf_id;
 };
 
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 8e06d29961f1..cf100b5573ca 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6266,7 +6266,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
 			}
 
 			info->reg_type = ctx_arg_info->reg_type;
-			info->btf = btf_vmlinux;
+			info->btf = ctx_arg_info->btf ? : btf_vmlinux;
 			info->btf_id = ctx_arg_info->btf_id;
 			return true;
 		}
-- 
cgit v1.2.3


From 6115a0aeef01aef152ad7738393aad11422bfb82 Mon Sep 17 00:00:00 2001
From: Kui-Feng Lee <thinker.li@gmail.com>
Date: Thu, 8 Feb 2024 18:37:48 -0800
Subject: bpf: Move __kfunc_param_match_suffix() to btf.c.

Move __kfunc_param_match_suffix() to btf.c and rename it as
btf_param_match_suffix(). It can be reused by bpf_struct_ops later.

Signed-off-by: Kui-Feng Lee <thinker.li@gmail.com>
Link: https://lore.kernel.org/r/20240209023750.1153905-3-thinker.li@gmail.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/linux/btf.h   |  4 ++++
 kernel/bpf/btf.c      | 18 ++++++++++++++++++
 kernel/bpf/verifier.c | 38 ++++++++++----------------------------
 3 files changed, 32 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/include/linux/btf.h b/include/linux/btf.h
index 1ee8977b8c95..df76a14c64f6 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -495,6 +495,10 @@ static inline void *btf_id_set8_contains(const struct btf_id_set8 *set, u32 id)
 	return bsearch(&id, set->pairs, set->cnt, sizeof(set->pairs[0]), btf_id_cmp_func);
 }
 
+bool btf_param_match_suffix(const struct btf *btf,
+			    const struct btf_param *arg,
+			    const char *suffix);
+
 struct bpf_verifier_log;
 
 #if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL)
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index cf100b5573ca..447da964f217 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -8904,3 +8904,21 @@ errout:
 }
 EXPORT_SYMBOL_GPL(__register_bpf_struct_ops);
 #endif
+
+bool btf_param_match_suffix(const struct btf *btf,
+			    const struct btf_param *arg,
+			    const char *suffix)
+{
+	int suffix_len = strlen(suffix), len;
+	const char *param_name;
+
+	/* In the future, this can be ported to use BTF tagging */
+	param_name = btf_name_by_offset(btf, arg->name_off);
+	if (str_is_empty(param_name))
+		return false;
+	len = strlen(param_name);
+	if (len <= suffix_len)
+		return false;
+	param_name += len - suffix_len;
+	return !strncmp(param_name, suffix, suffix_len);
+}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index ddaf09db1175..c92d6af7d975 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -10682,24 +10682,6 @@ static bool is_kfunc_rcu_protected(struct bpf_kfunc_call_arg_meta *meta)
 	return meta->kfunc_flags & KF_RCU_PROTECTED;
 }
 
-static bool __kfunc_param_match_suffix(const struct btf *btf,
-				       const struct btf_param *arg,
-				       const char *suffix)
-{
-	int suffix_len = strlen(suffix), len;
-	const char *param_name;
-
-	/* In the future, this can be ported to use BTF tagging */
-	param_name = btf_name_by_offset(btf, arg->name_off);
-	if (str_is_empty(param_name))
-		return false;
-	len = strlen(param_name);
-	if (len < suffix_len)
-		return false;
-	param_name += len - suffix_len;
-	return !strncmp(param_name, suffix, suffix_len);
-}
-
 static bool is_kfunc_arg_mem_size(const struct btf *btf,
 				  const struct btf_param *arg,
 				  const struct bpf_reg_state *reg)
@@ -10710,7 +10692,7 @@ static bool is_kfunc_arg_mem_size(const struct btf *btf,
 	if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE)
 		return false;
 
-	return __kfunc_param_match_suffix(btf, arg, "__sz");
+	return btf_param_match_suffix(btf, arg, "__sz");
 }
 
 static bool is_kfunc_arg_const_mem_size(const struct btf *btf,
@@ -10723,47 +10705,47 @@ static bool is_kfunc_arg_const_mem_size(const struct btf *btf,
 	if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE)
 		return false;
 
-	return __kfunc_param_match_suffix(btf, arg, "__szk");
+	return btf_param_match_suffix(btf, arg, "__szk");
 }
 
 static bool is_kfunc_arg_optional(const struct btf *btf, const struct btf_param *arg)
 {
-	return __kfunc_param_match_suffix(btf, arg, "__opt");
+	return btf_param_match_suffix(btf, arg, "__opt");
 }
 
 static bool is_kfunc_arg_constant(const struct btf *btf, const struct btf_param *arg)
 {
-	return __kfunc_param_match_suffix(btf, arg, "__k");
+	return btf_param_match_suffix(btf, arg, "__k");
 }
 
 static bool is_kfunc_arg_ignore(const struct btf *btf, const struct btf_param *arg)
 {
-	return __kfunc_param_match_suffix(btf, arg, "__ign");
+	return btf_param_match_suffix(btf, arg, "__ign");
 }
 
 static bool is_kfunc_arg_alloc_obj(const struct btf *btf, const struct btf_param *arg)
 {
-	return __kfunc_param_match_suffix(btf, arg, "__alloc");
+	return btf_param_match_suffix(btf, arg, "__alloc");
 }
 
 static bool is_kfunc_arg_uninit(const struct btf *btf, const struct btf_param *arg)
 {
-	return __kfunc_param_match_suffix(btf, arg, "__uninit");
+	return btf_param_match_suffix(btf, arg, "__uninit");
 }
 
 static bool is_kfunc_arg_refcounted_kptr(const struct btf *btf, const struct btf_param *arg)
 {
-	return __kfunc_param_match_suffix(btf, arg, "__refcounted_kptr");
+	return btf_param_match_suffix(btf, arg, "__refcounted_kptr");
 }
 
 static bool is_kfunc_arg_nullable(const struct btf *btf, const struct btf_param *arg)
 {
-	return __kfunc_param_match_suffix(btf, arg, "__nullable");
+	return btf_param_match_suffix(btf, arg, "__nullable");
 }
 
 static bool is_kfunc_arg_const_str(const struct btf *btf, const struct btf_param *arg)
 {
-	return __kfunc_param_match_suffix(btf, arg, "__str");
+	return btf_param_match_suffix(btf, arg, "__str");
 }
 
 static bool is_kfunc_arg_scalar_with_name(const struct btf *btf,
-- 
cgit v1.2.3


From 1611603537a4b88cec7993f32b70c03113801a46 Mon Sep 17 00:00:00 2001
From: Kui-Feng Lee <thinker.li@gmail.com>
Date: Thu, 8 Feb 2024 18:37:49 -0800
Subject: bpf: Create argument information for nullable arguments.

Collect argument information from the type information of stub functions to
mark arguments of BPF struct_ops programs with PTR_MAYBE_NULL if they are
nullable.  A nullable argument is annotated by suffixing "__nullable" at
the argument name of stub function.

For nullable arguments, this patch sets a struct bpf_ctx_arg_aux to label
their reg_type with PTR_TO_BTF_ID | PTR_TRUSTED | PTR_MAYBE_NULL. This
makes the verifier to check programs and ensure that they properly check
the pointer. The programs should check if the pointer is null before
accessing the pointed memory.

The implementer of a struct_ops type should annotate the arguments that can
be null. The implementer should define a stub function (empty) as a
placeholder for each defined operator. The name of a stub function should
be in the pattern "<st_op_type>__<operator name>". For example, for
test_maybe_null of struct bpf_testmod_ops, it's stub function name should
be "bpf_testmod_ops__test_maybe_null". You mark an argument nullable by
suffixing the argument name with "__nullable" at the stub function.

Since we already has stub functions for kCFI, we just reuse these stub
functions with the naming convention mentioned earlier. These stub
functions with the naming convention is only required if there are nullable
arguments to annotate. For functions having not nullable arguments, stub
functions are not necessary for the purpose of this patch.

This patch will prepare a list of struct bpf_ctx_arg_aux, aka arg_info, for
each member field of a struct_ops type.  "arg_info" will be assigned to
"prog->aux->ctx_arg_info" of BPF struct_ops programs in
check_struct_ops_btf_id() so that it can be used by btf_ctx_access() later
to set reg_type properly for the verifier.

Signed-off-by: Kui-Feng Lee <thinker.li@gmail.com>
Link: https://lore.kernel.org/r/20240209023750.1153905-4-thinker.li@gmail.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/linux/bpf.h         |  21 +++++
 include/linux/btf.h         |   2 +
 kernel/bpf/bpf_struct_ops.c | 213 +++++++++++++++++++++++++++++++++++++++++---
 kernel/bpf/btf.c            |  27 ++++++
 kernel/bpf/verifier.c       |   6 ++
 5 files changed, 257 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 3b7836f0a83e..c7aa99b44dbd 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1709,6 +1709,19 @@ struct bpf_struct_ops {
 	struct btf_func_model func_models[BPF_STRUCT_OPS_MAX_NR_MEMBERS];
 };
 
+/* Every member of a struct_ops type has an instance even a member is not
+ * an operator (function pointer). The "info" field will be assigned to
+ * prog->aux->ctx_arg_info of BPF struct_ops programs to provide the
+ * argument information required by the verifier to verify the program.
+ *
+ * btf_ctx_access() will lookup prog->aux->ctx_arg_info to find the
+ * corresponding entry for an given argument.
+ */
+struct bpf_struct_ops_arg_info {
+	struct bpf_ctx_arg_aux *info;
+	u32 cnt;
+};
+
 struct bpf_struct_ops_desc {
 	struct bpf_struct_ops *st_ops;
 
@@ -1716,6 +1729,9 @@ struct bpf_struct_ops_desc {
 	const struct btf_type *value_type;
 	u32 type_id;
 	u32 value_id;
+
+	/* Collection of argument information for each member */
+	struct bpf_struct_ops_arg_info *arg_info;
 };
 
 enum bpf_struct_ops_state {
@@ -1790,6 +1806,7 @@ int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc,
 			     struct btf *btf,
 			     struct bpf_verifier_log *log);
 void bpf_map_struct_ops_info_fill(struct bpf_map_info *info, struct bpf_map *map);
+void bpf_struct_ops_desc_release(struct bpf_struct_ops_desc *st_ops_desc);
 #else
 #define register_bpf_struct_ops(st_ops, type) ({ (void *)(st_ops); 0; })
 static inline bool bpf_try_module_get(const void *data, struct module *owner)
@@ -1814,6 +1831,10 @@ static inline void bpf_map_struct_ops_info_fill(struct bpf_map_info *info, struc
 {
 }
 
+static inline void bpf_struct_ops_desc_release(struct bpf_struct_ops_desc *st_ops_desc)
+{
+}
+
 #endif
 
 #if defined(CONFIG_CGROUP_BPF) && defined(CONFIG_BPF_LSM)
diff --git a/include/linux/btf.h b/include/linux/btf.h
index df76a14c64f6..cb96f6263638 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -498,6 +498,8 @@ static inline void *btf_id_set8_contains(const struct btf_id_set8 *set, u32 id)
 bool btf_param_match_suffix(const struct btf *btf,
 			    const struct btf_param *arg,
 			    const char *suffix);
+int btf_ctx_arg_offset(const struct btf *btf, const struct btf_type *func_proto,
+		       u32 arg_no);
 
 struct bpf_verifier_log;
 
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index f98f580de77a..0d7be97a2411 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -116,17 +116,183 @@ static bool is_valid_value_type(struct btf *btf, s32 value_id,
 	return true;
 }
 
+#define MAYBE_NULL_SUFFIX "__nullable"
+#define MAX_STUB_NAME 128
+
+/* Return the type info of a stub function, if it exists.
+ *
+ * The name of a stub function is made up of the name of the struct_ops and
+ * the name of the function pointer member, separated by "__". For example,
+ * if the struct_ops type is named "foo_ops" and the function pointer
+ * member is named "bar", the stub function name would be "foo_ops__bar".
+ */
+static const struct btf_type *
+find_stub_func_proto(const struct btf *btf, const char *st_op_name,
+		     const char *member_name)
+{
+	char stub_func_name[MAX_STUB_NAME];
+	const struct btf_type *func_type;
+	s32 btf_id;
+	int cp;
+
+	cp = snprintf(stub_func_name, MAX_STUB_NAME, "%s__%s",
+		      st_op_name, member_name);
+	if (cp >= MAX_STUB_NAME) {
+		pr_warn("Stub function name too long\n");
+		return NULL;
+	}
+	btf_id = btf_find_by_name_kind(btf, stub_func_name, BTF_KIND_FUNC);
+	if (btf_id < 0)
+		return NULL;
+	func_type = btf_type_by_id(btf, btf_id);
+	if (!func_type)
+		return NULL;
+
+	return btf_type_by_id(btf, func_type->type); /* FUNC_PROTO */
+}
+
+/* Prepare argument info for every nullable argument of a member of a
+ * struct_ops type.
+ *
+ * Initialize a struct bpf_struct_ops_arg_info according to type info of
+ * the arguments of a stub function. (Check kCFI for more information about
+ * stub functions.)
+ *
+ * Each member in the struct_ops type has a struct bpf_struct_ops_arg_info
+ * to provide an array of struct bpf_ctx_arg_aux, which in turn provides
+ * the information that used by the verifier to check the arguments of the
+ * BPF struct_ops program assigned to the member. Here, we only care about
+ * the arguments that are marked as __nullable.
+ *
+ * The array of struct bpf_ctx_arg_aux is eventually assigned to
+ * prog->aux->ctx_arg_info of BPF struct_ops programs and passed to the
+ * verifier. (See check_struct_ops_btf_id())
+ *
+ * arg_info->info will be the list of struct bpf_ctx_arg_aux if success. If
+ * fails, it will be kept untouched.
+ */
+static int prepare_arg_info(struct btf *btf,
+			    const char *st_ops_name,
+			    const char *member_name,
+			    const struct btf_type *func_proto,
+			    struct bpf_struct_ops_arg_info *arg_info)
+{
+	const struct btf_type *stub_func_proto, *pointed_type;
+	const struct btf_param *stub_args, *args;
+	struct bpf_ctx_arg_aux *info, *info_buf;
+	u32 nargs, arg_no, info_cnt = 0;
+	u32 arg_btf_id;
+	int offset;
+
+	stub_func_proto = find_stub_func_proto(btf, st_ops_name, member_name);
+	if (!stub_func_proto)
+		return 0;
+
+	/* Check if the number of arguments of the stub function is the same
+	 * as the number of arguments of the function pointer.
+	 */
+	nargs = btf_type_vlen(func_proto);
+	if (nargs != btf_type_vlen(stub_func_proto)) {
+		pr_warn("the number of arguments of the stub function %s__%s does not match the number of arguments of the member %s of struct %s\n",
+			st_ops_name, member_name, member_name, st_ops_name);
+		return -EINVAL;
+	}
+
+	if (!nargs)
+		return 0;
+
+	args = btf_params(func_proto);
+	stub_args = btf_params(stub_func_proto);
+
+	info_buf = kcalloc(nargs, sizeof(*info_buf), GFP_KERNEL);
+	if (!info_buf)
+		return -ENOMEM;
+
+	/* Prepare info for every nullable argument */
+	info = info_buf;
+	for (arg_no = 0; arg_no < nargs; arg_no++) {
+		/* Skip arguments that is not suffixed with
+		 * "__nullable".
+		 */
+		if (!btf_param_match_suffix(btf, &stub_args[arg_no],
+					    MAYBE_NULL_SUFFIX))
+			continue;
+
+		/* Should be a pointer to struct */
+		pointed_type = btf_type_resolve_ptr(btf,
+						    args[arg_no].type,
+						    &arg_btf_id);
+		if (!pointed_type ||
+		    !btf_type_is_struct(pointed_type)) {
+			pr_warn("stub function %s__%s has %s tagging to an unsupported type\n",
+				st_ops_name, member_name, MAYBE_NULL_SUFFIX);
+			goto err_out;
+		}
+
+		offset = btf_ctx_arg_offset(btf, func_proto, arg_no);
+		if (offset < 0) {
+			pr_warn("stub function %s__%s has an invalid trampoline ctx offset for arg#%u\n",
+				st_ops_name, member_name, arg_no);
+			goto err_out;
+		}
+
+		if (args[arg_no].type != stub_args[arg_no].type) {
+			pr_warn("arg#%u type in stub function %s__%s does not match with its original func_proto\n",
+				arg_no, st_ops_name, member_name);
+			goto err_out;
+		}
+
+		/* Fill the information of the new argument */
+		info->reg_type =
+			PTR_TRUSTED | PTR_TO_BTF_ID | PTR_MAYBE_NULL;
+		info->btf_id = arg_btf_id;
+		info->btf = btf;
+		info->offset = offset;
+
+		info++;
+		info_cnt++;
+	}
+
+	if (info_cnt) {
+		arg_info->info = info_buf;
+		arg_info->cnt = info_cnt;
+	} else {
+		kfree(info_buf);
+	}
+
+	return 0;
+
+err_out:
+	kfree(info_buf);
+
+	return -EINVAL;
+}
+
+/* Clean up the arg_info in a struct bpf_struct_ops_desc. */
+void bpf_struct_ops_desc_release(struct bpf_struct_ops_desc *st_ops_desc)
+{
+	struct bpf_struct_ops_arg_info *arg_info;
+	int i;
+
+	arg_info = st_ops_desc->arg_info;
+	for (i = 0; i < btf_type_vlen(st_ops_desc->type); i++)
+		kfree(arg_info[i].info);
+
+	kfree(arg_info);
+}
+
 int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc,
 			     struct btf *btf,
 			     struct bpf_verifier_log *log)
 {
 	struct bpf_struct_ops *st_ops = st_ops_desc->st_ops;
+	struct bpf_struct_ops_arg_info *arg_info;
 	const struct btf_member *member;
 	const struct btf_type *t;
 	s32 type_id, value_id;
 	char value_name[128];
 	const char *mname;
-	int i;
+	int i, err;
 
 	if (strlen(st_ops->name) + VALUE_PREFIX_LEN >=
 	    sizeof(value_name)) {
@@ -160,6 +326,17 @@ int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc,
 	if (!is_valid_value_type(btf, value_id, t, value_name))
 		return -EINVAL;
 
+	arg_info = kcalloc(btf_type_vlen(t), sizeof(*arg_info),
+			   GFP_KERNEL);
+	if (!arg_info)
+		return -ENOMEM;
+
+	st_ops_desc->arg_info = arg_info;
+	st_ops_desc->type = t;
+	st_ops_desc->type_id = type_id;
+	st_ops_desc->value_id = value_id;
+	st_ops_desc->value_type = btf_type_by_id(btf, value_id);
+
 	for_each_member(i, t, member) {
 		const struct btf_type *func_proto;
 
@@ -167,40 +344,52 @@ int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc,
 		if (!*mname) {
 			pr_warn("anon member in struct %s is not supported\n",
 				st_ops->name);
-			return -EOPNOTSUPP;
+			err = -EOPNOTSUPP;
+			goto errout;
 		}
 
 		if (__btf_member_bitfield_size(t, member)) {
 			pr_warn("bit field member %s in struct %s is not supported\n",
 				mname, st_ops->name);
-			return -EOPNOTSUPP;
+			err = -EOPNOTSUPP;
+			goto errout;
 		}
 
 		func_proto = btf_type_resolve_func_ptr(btf,
 						       member->type,
 						       NULL);
-		if (func_proto &&
-		    btf_distill_func_proto(log, btf,
+		if (!func_proto)
+			continue;
+
+		if (btf_distill_func_proto(log, btf,
 					   func_proto, mname,
 					   &st_ops->func_models[i])) {
 			pr_warn("Error in parsing func ptr %s in struct %s\n",
 				mname, st_ops->name);
-			return -EINVAL;
+			err = -EINVAL;
+			goto errout;
 		}
+
+		err = prepare_arg_info(btf, st_ops->name, mname,
+				       func_proto,
+				       arg_info + i);
+		if (err)
+			goto errout;
 	}
 
 	if (st_ops->init(btf)) {
 		pr_warn("Error in init bpf_struct_ops %s\n",
 			st_ops->name);
-		return -EINVAL;
+		err = -EINVAL;
+		goto errout;
 	}
 
-	st_ops_desc->type_id = type_id;
-	st_ops_desc->type = t;
-	st_ops_desc->value_id = value_id;
-	st_ops_desc->value_type = btf_type_by_id(btf, value_id);
-
 	return 0;
+
+errout:
+	bpf_struct_ops_desc_release(st_ops_desc);
+
+	return err;
 }
 
 static int bpf_struct_ops_map_get_next_key(struct bpf_map *map, void *key,
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 447da964f217..efd9bc274be0 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -1699,6 +1699,13 @@ static void btf_free_struct_meta_tab(struct btf *btf)
 static void btf_free_struct_ops_tab(struct btf *btf)
 {
 	struct btf_struct_ops_tab *tab = btf->struct_ops_tab;
+	u32 i;
+
+	if (!tab)
+		return;
+
+	for (i = 0; i < tab->cnt; i++)
+		bpf_struct_ops_desc_release(&tab->ops[i]);
 
 	kfree(tab);
 	btf->struct_ops_tab = NULL;
@@ -6130,6 +6137,26 @@ static bool prog_args_trusted(const struct bpf_prog *prog)
 	}
 }
 
+int btf_ctx_arg_offset(const struct btf *btf, const struct btf_type *func_proto,
+		       u32 arg_no)
+{
+	const struct btf_param *args;
+	const struct btf_type *t;
+	int off = 0, i;
+	u32 sz;
+
+	args = btf_params(func_proto);
+	for (i = 0; i < arg_no; i++) {
+		t = btf_type_by_id(btf, args[i].type);
+		t = btf_resolve_size(btf, t, &sz);
+		if (IS_ERR(t))
+			return PTR_ERR(t);
+		off += roundup(sz, 8);
+	}
+
+	return off;
+}
+
 bool btf_ctx_access(int off, int size, enum bpf_access_type type,
 		    const struct bpf_prog *prog,
 		    struct bpf_insn_access_aux *info)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index c92d6af7d975..72ca27f49616 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -20419,6 +20419,12 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
 		}
 	}
 
+	/* btf_ctx_access() used this to provide argument type info */
+	prog->aux->ctx_arg_info =
+		st_ops_desc->arg_info[member_idx].info;
+	prog->aux->ctx_arg_info_size =
+		st_ops_desc->arg_info[member_idx].cnt;
+
 	prog->aux->attach_func_proto = func_proto;
 	prog->aux->attach_func_name = mname;
 	env->ops = st_ops->verifier_ops;
-- 
cgit v1.2.3


From 32e18e7688c6847b0c9db073aafb00639ecf576c Mon Sep 17 00:00:00 2001
From: Oliver Crumrine <ozlinuxc@gmail.com>
Date: Fri, 9 Feb 2024 14:41:22 -0500
Subject: bpf: remove check in __cgroup_bpf_run_filter_skb

Originally, this patch removed a redundant check in
BPF_CGROUP_RUN_PROG_INET_EGRESS, as the check was already being done in
the function it called, __cgroup_bpf_run_filter_skb. For v2, it was
reccomended that I remove the check from __cgroup_bpf_run_filter_skb,
and add the checks to the other macro that calls that function,
BPF_CGROUP_RUN_PROG_INET_INGRESS.

To sum it up, checking that the socket exists and that it is a full
socket is now part of both macros BPF_CGROUP_RUN_PROG_INET_EGRESS and
BPF_CGROUP_RUN_PROG_INET_INGRESS, and it is no longer part of the
function they call, __cgroup_bpf_run_filter_skb.

v3->v4: Fixed weird merge conflict.
v2->v3: Sent to bpf-next instead of generic patch
v1->v2: Addressed feedback about where check should be removed.

Signed-off-by: Oliver Crumrine <ozlinuxc@gmail.com>
Acked-by: Stanislav Fomichev <sdf@google.com>
Link: https://lore.kernel.org/r/7lv62yiyvmj5a7eozv2iznglpkydkdfancgmbhiptrgvgan5sy@3fl3onchgdz3
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/linux/bpf-cgroup.h | 3 ++-
 kernel/bpf/cgroup.c        | 3 ---
 2 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index a789266feac3..fb3c3e7181e6 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -196,7 +196,8 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk,
 ({									      \
 	int __ret = 0;							      \
 	if (cgroup_bpf_enabled(CGROUP_INET_INGRESS) &&			      \
-	    cgroup_bpf_sock_enabled(sk, CGROUP_INET_INGRESS))		      \
+	    cgroup_bpf_sock_enabled(sk, CGROUP_INET_INGRESS) && sk &&	      \
+	    sk_fullsock(sk))						      \
 		__ret = __cgroup_bpf_run_filter_skb(sk, skb,		      \
 						    CGROUP_INET_INGRESS); \
 									      \
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 98e0e3835b28..5a568bbbeaeb 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1364,9 +1364,6 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
 	struct cgroup *cgrp;
 	int ret;
 
-	if (!sk || !sk_fullsock(sk))
-		return 0;
-
 	if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
 		return 0;
 
-- 
cgit v1.2.3


From fb5b86cfd4ef21ea18966718f6bf6c8f1b9df12e Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Mon, 12 Feb 2024 15:32:18 -0800
Subject: bpf: simplify btf_get_prog_ctx_type() into btf_is_prog_ctx_type()

Return result of btf_get_prog_ctx_type() is never used and callers only
check NULL vs non-NULL case to determine if given type matches expected
PTR_TO_CTX type. So rename function to `btf_is_prog_ctx_type()` and
return a simple true/false. We'll use this simpler interface to handle
kprobe program type's special typedef case in the next patch.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20240212233221.2575350-2-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/btf.h   | 17 ++++++++---------
 kernel/bpf/btf.c      | 27 +++++++++++++--------------
 kernel/bpf/verifier.c |  2 +-
 3 files changed, 22 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/include/linux/btf.h b/include/linux/btf.h
index cb96f6263638..f9e56fd12a9f 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -531,10 +531,9 @@ s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id);
 int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_cnt,
 				struct module *owner);
 struct btf_struct_meta *btf_find_struct_meta(const struct btf *btf, u32 btf_id);
-const struct btf_type *
-btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
-		      const struct btf_type *t, enum bpf_prog_type prog_type,
-		      int arg);
+bool btf_is_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
+			   const struct btf_type *t, enum bpf_prog_type prog_type,
+			   int arg);
 int get_kern_ctx_btf_id(struct bpf_verifier_log *log, enum bpf_prog_type prog_type);
 bool btf_types_are_same(const struct btf *btf1, u32 id1,
 			const struct btf *btf2, u32 id2);
@@ -574,12 +573,12 @@ static inline struct btf_struct_meta *btf_find_struct_meta(const struct btf *btf
 {
 	return NULL;
 }
-static inline const struct btf_member *
-btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
-		      const struct btf_type *t, enum bpf_prog_type prog_type,
-		      int arg)
+static inline bool
+btf_is_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
+		     const struct btf_type *t, enum bpf_prog_type prog_type,
+		     int arg)
 {
-	return NULL;
+	return false;
 }
 static inline int get_kern_ctx_btf_id(struct bpf_verifier_log *log,
 				      enum bpf_prog_type prog_type) {
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index efd9bc274be0..405f95722905 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -5694,10 +5694,9 @@ static int find_kern_ctx_type_id(enum bpf_prog_type prog_type)
 	return ctx_type->type;
 }
 
-const struct btf_type *
-btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
-		      const struct btf_type *t, enum bpf_prog_type prog_type,
-		      int arg)
+bool btf_is_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
+			  const struct btf_type *t, enum bpf_prog_type prog_type,
+			  int arg)
 {
 	const struct btf_type *ctx_type;
 	const char *tname, *ctx_tname;
@@ -5711,26 +5710,26 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
 		 * is not supported yet.
 		 * BPF_PROG_TYPE_RAW_TRACEPOINT is fine.
 		 */
-		return NULL;
+		return false;
 	}
 	tname = btf_name_by_offset(btf, t->name_off);
 	if (!tname) {
 		bpf_log(log, "arg#%d struct doesn't have a name\n", arg);
-		return NULL;
+		return false;
 	}
 
 	ctx_type = find_canonical_prog_ctx_type(prog_type);
 	if (!ctx_type) {
 		bpf_log(log, "btf_vmlinux is malformed\n");
 		/* should not happen */
-		return NULL;
+		return false;
 	}
 again:
 	ctx_tname = btf_name_by_offset(btf_vmlinux, ctx_type->name_off);
 	if (!ctx_tname) {
 		/* should not happen */
 		bpf_log(log, "Please fix kernel include/linux/bpf_types.h\n");
-		return NULL;
+		return false;
 	}
 	/* only compare that prog's ctx type name is the same as
 	 * kernel expects. No need to compare field by field.
@@ -5740,20 +5739,20 @@ again:
 	 * { // no fields of skb are ever used }
 	 */
 	if (strcmp(ctx_tname, "__sk_buff") == 0 && strcmp(tname, "sk_buff") == 0)
-		return ctx_type;
+		return true;
 	if (strcmp(ctx_tname, "xdp_md") == 0 && strcmp(tname, "xdp_buff") == 0)
-		return ctx_type;
+		return true;
 	if (strcmp(ctx_tname, tname)) {
 		/* bpf_user_pt_regs_t is a typedef, so resolve it to
 		 * underlying struct and check name again
 		 */
 		if (!btf_type_is_modifier(ctx_type))
-			return NULL;
+			return false;
 		while (btf_type_is_modifier(ctx_type))
 			ctx_type = btf_type_by_id(btf_vmlinux, ctx_type->type);
 		goto again;
 	}
-	return ctx_type;
+	return true;
 }
 
 /* forward declarations for arch-specific underlying types of
@@ -5905,7 +5904,7 @@ static int btf_translate_to_vmlinux(struct bpf_verifier_log *log,
 				     enum bpf_prog_type prog_type,
 				     int arg)
 {
-	if (!btf_get_prog_ctx_type(log, btf, t, prog_type, arg))
+	if (!btf_is_prog_ctx_type(log, btf, t, prog_type, arg))
 		return -ENOENT;
 	return find_kern_ctx_type_id(prog_type);
 }
@@ -7211,7 +7210,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
 		if (!btf_type_is_ptr(t))
 			goto skip_pointer;
 
-		if ((tags & ARG_TAG_CTX) || btf_get_prog_ctx_type(log, btf, t, prog_type, i)) {
+		if ((tags & ARG_TAG_CTX) || btf_is_prog_ctx_type(log, btf, t, prog_type, i)) {
 			if (tags & ~ARG_TAG_CTX) {
 				bpf_log(log, "arg#%d has invalid combination of tags\n", i);
 				return -EINVAL;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 72ca27f49616..aa192dc735a9 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -11015,7 +11015,7 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
 	 * type to our caller. When a set of conditions hold in the BTF type of
 	 * arguments, we resolve it to a known kfunc_ptr_arg_type.
 	 */
-	if (btf_get_prog_ctx_type(&env->log, meta->btf, t, resolve_prog_type(env->prog), argno))
+	if (btf_is_prog_ctx_type(&env->log, meta->btf, t, resolve_prog_type(env->prog), argno))
 		return KF_ARG_PTR_TO_CTX;
 
 	if (is_kfunc_arg_alloc_obj(meta->btf, &args[argno]))
-- 
cgit v1.2.3


From 2b0cfa6e49566c8fa6759734cf821aa6e8271a9e Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Mon, 12 Feb 2024 10:50:54 +0100
Subject: net: add generic percpu page_pool allocator

Introduce generic percpu page_pools allocator.
Moreover add page_pool_create_percpu() and cpuid filed in page_pool struct
in order to recycle the page in the page_pool "hot" cache if
napi_pp_put_page() is running on the same cpu.
This is a preliminary patch to add xdp multi-buff support for xdp running
in generic mode.

Acked-by: Jesper Dangaard Brouer <hawk@kernel.org>
Reviewed-by: Toke Hoiland-Jorgensen <toke@redhat.com>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://lore.kernel.org/r/80bc4285228b6f4220cd03de1999d86e46e3fcbd.1707729884.git.lorenzo@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/page_pool/types.h |  3 +++
 net/core/dev.c                | 45 +++++++++++++++++++++++++++++++++++++++++++
 net/core/page_pool.c          | 23 ++++++++++++++++++----
 net/core/skbuff.c             |  5 +++--
 4 files changed, 70 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h
index 76481c465375..3828396ae60c 100644
--- a/include/net/page_pool/types.h
+++ b/include/net/page_pool/types.h
@@ -128,6 +128,7 @@ struct page_pool_stats {
 struct page_pool {
 	struct page_pool_params_fast p;
 
+	int cpuid;
 	bool has_init_callback;
 
 	long frag_users;
@@ -203,6 +204,8 @@ struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp);
 struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int *offset,
 				  unsigned int size, gfp_t gfp);
 struct page_pool *page_pool_create(const struct page_pool_params *params);
+struct page_pool *page_pool_create_percpu(const struct page_pool_params *params,
+					  int cpuid);
 
 struct xdp_mem_info;
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 7cf15d2bf78d..e19bdf1421e0 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -153,6 +153,8 @@
 #include <linux/prandom.h>
 #include <linux/once_lite.h>
 #include <net/netdev_rx_queue.h>
+#include <net/page_pool/types.h>
+#include <net/page_pool/helpers.h>
 
 #include "dev.h"
 #include "net-sysfs.h"
@@ -450,6 +452,12 @@ static RAW_NOTIFIER_HEAD(netdev_chain);
 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 EXPORT_PER_CPU_SYMBOL(softnet_data);
 
+/* Page_pool has a lockless array/stack to alloc/recycle pages.
+ * PP consumers must pay attention to run APIs in the appropriate context
+ * (e.g. NAPI context).
+ */
+static DEFINE_PER_CPU_ALIGNED(struct page_pool *, system_page_pool);
+
 #ifdef CONFIG_LOCKDEP
 /*
  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
@@ -11724,6 +11732,27 @@ static void __init net_dev_struct_check(void)
  *
  */
 
+/* We allocate 256 pages for each CPU if PAGE_SHIFT is 12 */
+#define SYSTEM_PERCPU_PAGE_POOL_SIZE	((1 << 20) / PAGE_SIZE)
+
+static int net_page_pool_create(int cpuid)
+{
+#if IS_ENABLED(CONFIG_PAGE_POOL)
+	struct page_pool_params page_pool_params = {
+		.pool_size = SYSTEM_PERCPU_PAGE_POOL_SIZE,
+		.nid = NUMA_NO_NODE,
+	};
+	struct page_pool *pp_ptr;
+
+	pp_ptr = page_pool_create_percpu(&page_pool_params, cpuid);
+	if (IS_ERR(pp_ptr))
+		return -ENOMEM;
+
+	per_cpu(system_page_pool, cpuid) = pp_ptr;
+#endif
+	return 0;
+}
+
 /*
  *       This is called single threaded during boot, so no need
  *       to take the rtnl semaphore.
@@ -11776,6 +11805,9 @@ static int __init net_dev_init(void)
 		init_gro_hash(&sd->backlog);
 		sd->backlog.poll = process_backlog;
 		sd->backlog.weight = weight_p;
+
+		if (net_page_pool_create(i))
+			goto out;
 	}
 
 	dev_boot_phase = 0;
@@ -11803,6 +11835,19 @@ static int __init net_dev_init(void)
 	WARN_ON(rc < 0);
 	rc = 0;
 out:
+	if (rc < 0) {
+		for_each_possible_cpu(i) {
+			struct page_pool *pp_ptr;
+
+			pp_ptr = per_cpu(system_page_pool, i);
+			if (!pp_ptr)
+				continue;
+
+			page_pool_destroy(pp_ptr);
+			per_cpu(system_page_pool, i) = NULL;
+		}
+	}
+
 	return rc;
 }
 
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 4933762e5a6b..89c835fcf094 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -171,13 +171,16 @@ static void page_pool_producer_unlock(struct page_pool *pool,
 }
 
 static int page_pool_init(struct page_pool *pool,
-			  const struct page_pool_params *params)
+			  const struct page_pool_params *params,
+			  int cpuid)
 {
 	unsigned int ring_qsize = 1024; /* Default */
 
 	memcpy(&pool->p, &params->fast, sizeof(pool->p));
 	memcpy(&pool->slow, &params->slow, sizeof(pool->slow));
 
+	pool->cpuid = cpuid;
+
 	/* Validate only known flags were used */
 	if (pool->p.flags & ~(PP_FLAG_ALL))
 		return -EINVAL;
@@ -253,10 +256,12 @@ static void page_pool_uninit(struct page_pool *pool)
 }
 
 /**
- * page_pool_create() - create a page pool.
+ * page_pool_create_percpu() - create a page pool for a given cpu.
  * @params: parameters, see struct page_pool_params
+ * @cpuid: cpu identifier
  */
-struct page_pool *page_pool_create(const struct page_pool_params *params)
+struct page_pool *
+page_pool_create_percpu(const struct page_pool_params *params, int cpuid)
 {
 	struct page_pool *pool;
 	int err;
@@ -265,7 +270,7 @@ struct page_pool *page_pool_create(const struct page_pool_params *params)
 	if (!pool)
 		return ERR_PTR(-ENOMEM);
 
-	err = page_pool_init(pool, params);
+	err = page_pool_init(pool, params, cpuid);
 	if (err < 0)
 		goto err_free;
 
@@ -282,6 +287,16 @@ err_free:
 	kfree(pool);
 	return ERR_PTR(err);
 }
+EXPORT_SYMBOL(page_pool_create_percpu);
+
+/**
+ * page_pool_create() - create a page pool
+ * @params: parameters, see struct page_pool_params
+ */
+struct page_pool *page_pool_create(const struct page_pool_params *params)
+{
+	return page_pool_create_percpu(params, -1);
+}
 EXPORT_SYMBOL(page_pool_create);
 
 static void page_pool_return_page(struct page_pool *pool, struct page *page);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index edbbef563d4d..9e5eb47b4025 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -923,9 +923,10 @@ bool napi_pp_put_page(struct page *page, bool napi_safe)
 	 */
 	if (napi_safe || in_softirq()) {
 		const struct napi_struct *napi = READ_ONCE(pp->p.napi);
+		unsigned int cpuid = smp_processor_id();
 
-		allow_direct = napi &&
-			READ_ONCE(napi->list_owner) == smp_processor_id();
+		allow_direct = napi && READ_ONCE(napi->list_owner) == cpuid;
+		allow_direct |= (pp->cpuid == cpuid);
 	}
 
 	/* Driver set this to memory recycling info. Reset it on recycle.
-- 
cgit v1.2.3


From 4d2bb0bfe8741a8778e0053f31a4e0f0cba80e8b Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Mon, 12 Feb 2024 10:50:55 +0100
Subject: xdp: rely on skb pointer reference in do_xdp_generic and
 netif_receive_generic_xdp

Rely on skb pointer reference instead of the skb pointer in do_xdp_generic
and netif_receive_generic_xdp routine signatures.
This is a preliminary patch to add multi-buff support for xdp running in
generic mode where we will need to reallocate the skb to avoid
linearization and we will need to make it visible to do_xdp_generic()
caller.

Acked-by: Jesper Dangaard Brouer <hawk@kernel.org>
Reviewed-by: Toke Hoiland-Jorgensen <toke@redhat.com>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://lore.kernel.org/r/c09415b1f48c8620ef4d76deed35050a7bddf7c2.1707729884.git.lorenzo@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/tun.c         |  4 ++--
 include/linux/netdevice.h |  2 +-
 net/core/dev.c            | 16 +++++++++-------
 3 files changed, 12 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index b472f2c972d8..bc80fc1d576e 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1926,7 +1926,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 		rcu_read_lock();
 		xdp_prog = rcu_dereference(tun->xdp_prog);
 		if (xdp_prog) {
-			ret = do_xdp_generic(xdp_prog, skb);
+			ret = do_xdp_generic(xdp_prog, &skb);
 			if (ret != XDP_PASS) {
 				rcu_read_unlock();
 				local_bh_enable();
@@ -2516,7 +2516,7 @@ build:
 	skb_record_rx_queue(skb, tfile->queue_index);
 
 	if (skb_xdp) {
-		ret = do_xdp_generic(xdp_prog, skb);
+		ret = do_xdp_generic(xdp_prog, &skb);
 		if (ret != XDP_PASS) {
 			ret = 0;
 			goto out;
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 07cefa32eafa..a3f9c95da51e 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3958,7 +3958,7 @@ static inline void dev_consume_skb_any(struct sk_buff *skb)
 u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
 			     struct bpf_prog *xdp_prog);
 void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog);
-int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb);
+int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff **pskb);
 int netif_rx(struct sk_buff *skb);
 int __netif_rx(struct sk_buff *skb);
 
diff --git a/net/core/dev.c b/net/core/dev.c
index e19bdf1421e0..ffeb0e0279fe 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4936,10 +4936,11 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
 	return act;
 }
 
-static u32 netif_receive_generic_xdp(struct sk_buff *skb,
+static u32 netif_receive_generic_xdp(struct sk_buff **pskb,
 				     struct xdp_buff *xdp,
 				     struct bpf_prog *xdp_prog)
 {
+	struct sk_buff *skb = *pskb;
 	u32 act = XDP_DROP;
 
 	/* Reinjected packets coming from act_mirred or similar should
@@ -5020,24 +5021,24 @@ void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
 
 static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
 
-int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
+int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff **pskb)
 {
 	if (xdp_prog) {
 		struct xdp_buff xdp;
 		u32 act;
 		int err;
 
-		act = netif_receive_generic_xdp(skb, &xdp, xdp_prog);
+		act = netif_receive_generic_xdp(pskb, &xdp, xdp_prog);
 		if (act != XDP_PASS) {
 			switch (act) {
 			case XDP_REDIRECT:
-				err = xdp_do_generic_redirect(skb->dev, skb,
+				err = xdp_do_generic_redirect((*pskb)->dev, *pskb,
 							      &xdp, xdp_prog);
 				if (err)
 					goto out_redir;
 				break;
 			case XDP_TX:
-				generic_xdp_tx(skb, xdp_prog);
+				generic_xdp_tx(*pskb, xdp_prog);
 				break;
 			}
 			return XDP_DROP;
@@ -5045,7 +5046,7 @@ int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
 	}
 	return XDP_PASS;
 out_redir:
-	kfree_skb_reason(skb, SKB_DROP_REASON_XDP);
+	kfree_skb_reason(*pskb, SKB_DROP_REASON_XDP);
 	return XDP_DROP;
 }
 EXPORT_SYMBOL_GPL(do_xdp_generic);
@@ -5368,7 +5369,8 @@ another_round:
 		int ret2;
 
 		migrate_disable();
-		ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
+		ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog),
+				      &skb);
 		migrate_enable();
 
 		if (ret2 != XDP_PASS) {
-- 
cgit v1.2.3


From e6d5dbdd20aa6a86974af51deb9414cd2e7794cb Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Mon, 12 Feb 2024 10:50:56 +0100
Subject: xdp: add multi-buff support for xdp running in generic mode

Similar to native xdp, do not always linearize the skb in
netif_receive_generic_xdp routine but create a non-linear xdp_buff to be
processed by the eBPF program. This allow to add multi-buffer support
for xdp running in generic mode.

Acked-by: Jesper Dangaard Brouer <hawk@kernel.org>
Reviewed-by: Toke Hoiland-Jorgensen <toke@redhat.com>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://lore.kernel.org/r/1044d6412b1c3e95b40d34993fd5f37cd2f319fd.1707729884.git.lorenzo@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h |  2 ++
 net/core/dev.c         | 70 +++++++++++++++++++++++++++-----------
 net/core/skbuff.c      | 91 ++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 144 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 2dde34c29203..def3d8689c3d 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3446,6 +3446,8 @@ static inline void skb_frag_ref(struct sk_buff *skb, int f)
 	__skb_frag_ref(&skb_shinfo(skb)->frags[f]);
 }
 
+int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb,
+			 struct bpf_prog *prog);
 bool napi_pp_put_page(struct page *page, bool napi_safe);
 
 static inline void
diff --git a/net/core/dev.c b/net/core/dev.c
index ffeb0e0279fe..2d02ca8a3da5 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4874,6 +4874,12 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
 	xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq);
 	xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len,
 			 skb_headlen(skb) + mac_len, true);
+	if (skb_is_nonlinear(skb)) {
+		skb_shinfo(skb)->xdp_frags_size = skb->data_len;
+		xdp_buff_set_frags_flag(xdp);
+	} else {
+		xdp_buff_clear_frags_flag(xdp);
+	}
 
 	orig_data_end = xdp->data_end;
 	orig_data = xdp->data;
@@ -4903,6 +4909,14 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
 		skb->len += off; /* positive on grow, negative on shrink */
 	}
 
+	/* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers
+	 * (e.g. bpf_xdp_adjust_tail), we need to update data_len here.
+	 */
+	if (xdp_buff_has_frags(xdp))
+		skb->data_len = skb_shinfo(skb)->xdp_frags_size;
+	else
+		skb->data_len = 0;
+
 	/* check if XDP changed eth hdr such SKB needs update */
 	eth = (struct ethhdr *)xdp->data;
 	if ((orig_eth_type != eth->h_proto) ||
@@ -4936,12 +4950,35 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
 	return act;
 }
 
+static int
+netif_skb_check_for_xdp(struct sk_buff **pskb, struct bpf_prog *prog)
+{
+	struct sk_buff *skb = *pskb;
+	int err, hroom, troom;
+
+	if (!skb_cow_data_for_xdp(this_cpu_read(system_page_pool), pskb, prog))
+		return 0;
+
+	/* In case we have to go down the path and also linearize,
+	 * then lets do the pskb_expand_head() work just once here.
+	 */
+	hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
+	troom = skb->tail + skb->data_len - skb->end;
+	err = pskb_expand_head(skb,
+			       hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
+			       troom > 0 ? troom + 128 : 0, GFP_ATOMIC);
+	if (err)
+		return err;
+
+	return skb_linearize(skb);
+}
+
 static u32 netif_receive_generic_xdp(struct sk_buff **pskb,
 				     struct xdp_buff *xdp,
 				     struct bpf_prog *xdp_prog)
 {
 	struct sk_buff *skb = *pskb;
-	u32 act = XDP_DROP;
+	u32 mac_len, act = XDP_DROP;
 
 	/* Reinjected packets coming from act_mirred or similar should
 	 * not get XDP generic processing.
@@ -4949,41 +4986,36 @@ static u32 netif_receive_generic_xdp(struct sk_buff **pskb,
 	if (skb_is_redirected(skb))
 		return XDP_PASS;
 
-	/* XDP packets must be linear and must have sufficient headroom
-	 * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
-	 * native XDP provides, thus we need to do it here as well.
+	/* XDP packets must have sufficient headroom of XDP_PACKET_HEADROOM
+	 * bytes. This is the guarantee that also native XDP provides,
+	 * thus we need to do it here as well.
 	 */
+	mac_len = skb->data - skb_mac_header(skb);
+	__skb_push(skb, mac_len);
+
 	if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
 	    skb_headroom(skb) < XDP_PACKET_HEADROOM) {
-		int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
-		int troom = skb->tail + skb->data_len - skb->end;
-
-		/* In case we have to go down the path and also linearize,
-		 * then lets do the pskb_expand_head() work just once here.
-		 */
-		if (pskb_expand_head(skb,
-				     hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
-				     troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
-			goto do_drop;
-		if (skb_linearize(skb))
+		if (netif_skb_check_for_xdp(pskb, xdp_prog))
 			goto do_drop;
 	}
 
-	act = bpf_prog_run_generic_xdp(skb, xdp, xdp_prog);
+	__skb_pull(*pskb, mac_len);
+
+	act = bpf_prog_run_generic_xdp(*pskb, xdp, xdp_prog);
 	switch (act) {
 	case XDP_REDIRECT:
 	case XDP_TX:
 	case XDP_PASS:
 		break;
 	default:
-		bpf_warn_invalid_xdp_action(skb->dev, xdp_prog, act);
+		bpf_warn_invalid_xdp_action((*pskb)->dev, xdp_prog, act);
 		fallthrough;
 	case XDP_ABORTED:
-		trace_xdp_exception(skb->dev, xdp_prog, act);
+		trace_xdp_exception((*pskb)->dev, xdp_prog, act);
 		fallthrough;
 	case XDP_DROP:
 	do_drop:
-		kfree_skb(skb);
+		kfree_skb(*pskb);
 		break;
 	}
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 9e5eb47b4025..bdb94749f05d 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -895,6 +895,97 @@ static bool is_pp_page(struct page *page)
 	return (page->pp_magic & ~0x3UL) == PP_SIGNATURE;
 }
 
+static int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb,
+			   unsigned int headroom)
+{
+#if IS_ENABLED(CONFIG_PAGE_POOL)
+	u32 size, truesize, len, max_head_size, off;
+	struct sk_buff *skb = *pskb, *nskb;
+	int err, i, head_off;
+	void *data;
+
+	/* XDP does not support fraglist so we need to linearize
+	 * the skb.
+	 */
+	if (skb_has_frag_list(skb))
+		return -EOPNOTSUPP;
+
+	max_head_size = SKB_WITH_OVERHEAD(PAGE_SIZE - headroom);
+	if (skb->len > max_head_size + MAX_SKB_FRAGS * PAGE_SIZE)
+		return -ENOMEM;
+
+	size = min_t(u32, skb->len, max_head_size);
+	truesize = SKB_HEAD_ALIGN(size) + headroom;
+	data = page_pool_dev_alloc_va(pool, &truesize);
+	if (!data)
+		return -ENOMEM;
+
+	nskb = napi_build_skb(data, truesize);
+	if (!nskb) {
+		page_pool_free_va(pool, data, true);
+		return -ENOMEM;
+	}
+
+	skb_reserve(nskb, headroom);
+	skb_copy_header(nskb, skb);
+	skb_mark_for_recycle(nskb);
+
+	err = skb_copy_bits(skb, 0, nskb->data, size);
+	if (err) {
+		consume_skb(nskb);
+		return err;
+	}
+	skb_put(nskb, size);
+
+	head_off = skb_headroom(nskb) - skb_headroom(skb);
+	skb_headers_offset_update(nskb, head_off);
+
+	off = size;
+	len = skb->len - off;
+	for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) {
+		struct page *page;
+		u32 page_off;
+
+		size = min_t(u32, len, PAGE_SIZE);
+		truesize = size;
+
+		page = page_pool_dev_alloc(pool, &page_off, &truesize);
+		if (!data) {
+			consume_skb(nskb);
+			return -ENOMEM;
+		}
+
+		skb_add_rx_frag(nskb, i, page, page_off, size, truesize);
+		err = skb_copy_bits(skb, off, page_address(page) + page_off,
+				    size);
+		if (err) {
+			consume_skb(nskb);
+			return err;
+		}
+
+		len -= size;
+		off += size;
+	}
+
+	consume_skb(skb);
+	*pskb = nskb;
+
+	return 0;
+#else
+	return -EOPNOTSUPP;
+#endif
+}
+
+int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb,
+			 struct bpf_prog *prog)
+{
+	if (!prog->aux->xdp_has_frags)
+		return -EINVAL;
+
+	return skb_pp_cow_data(pool, pskb, XDP_PACKET_HEADROOM);
+}
+EXPORT_SYMBOL(skb_cow_data_for_xdp);
+
 #if IS_ENABLED(CONFIG_PAGE_POOL)
 bool napi_pp_put_page(struct page *page, bool napi_safe)
 {
-- 
cgit v1.2.3


From 27accb3cc08a0ec4e348356774042d5fa5f30cce Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Mon, 12 Feb 2024 10:50:57 +0100
Subject: veth: rely on skb_pp_cow_data utility routine

Rely on skb_pp_cow_data utility routine and remove duplicated code.

Acked-by: Jesper Dangaard Brouer <hawk@kernel.org>
Reviewed-by: Toke Hoiland-Jorgensen <toke@redhat.com>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://lore.kernel.org/r/029cc14cce41cb242ee7efdcf32acc81f1ce4e9f.1707729884.git.lorenzo@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/veth.c     | 74 ++------------------------------------------------
 include/linux/skbuff.h |  2 ++
 net/core/skbuff.c      |  5 ++--
 3 files changed, 7 insertions(+), 74 deletions(-)

(limited to 'include')

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index de1f13837782..500b9dfccd08 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -729,80 +729,10 @@ static int veth_convert_skb_to_xdp_buff(struct veth_rq *rq,
 	if (skb_shared(skb) || skb_head_is_locked(skb) ||
 	    skb_shinfo(skb)->nr_frags ||
 	    skb_headroom(skb) < XDP_PACKET_HEADROOM) {
-		u32 size, len, max_head_size, off, truesize, page_offset;
-		struct sk_buff *nskb;
-		struct page *page;
-		int i, head_off;
-		void *va;
-
-		/* We need a private copy of the skb and data buffers since
-		 * the ebpf program can modify it. We segment the original skb
-		 * into order-0 pages without linearize it.
-		 *
-		 * Make sure we have enough space for linear and paged area
-		 */
-		max_head_size = SKB_WITH_OVERHEAD(PAGE_SIZE -
-						  VETH_XDP_HEADROOM);
-		if (skb->len > PAGE_SIZE * MAX_SKB_FRAGS + max_head_size)
-			goto drop;
-
-		size = min_t(u32, skb->len, max_head_size);
-		truesize = SKB_HEAD_ALIGN(size) + VETH_XDP_HEADROOM;
-
-		/* Allocate skb head */
-		va = page_pool_dev_alloc_va(rq->page_pool, &truesize);
-		if (!va)
-			goto drop;
-
-		nskb = napi_build_skb(va, truesize);
-		if (!nskb) {
-			page_pool_free_va(rq->page_pool, va, true);
+		if (skb_pp_cow_data(rq->page_pool, pskb, XDP_PACKET_HEADROOM))
 			goto drop;
-		}
-
-		skb_reserve(nskb, VETH_XDP_HEADROOM);
-		skb_copy_header(nskb, skb);
-		skb_mark_for_recycle(nskb);
-
-		if (skb_copy_bits(skb, 0, nskb->data, size)) {
-			consume_skb(nskb);
-			goto drop;
-		}
-		skb_put(nskb, size);
 
-		head_off = skb_headroom(nskb) - skb_headroom(skb);
-		skb_headers_offset_update(nskb, head_off);
-
-		/* Allocate paged area of new skb */
-		off = size;
-		len = skb->len - off;
-
-		for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) {
-			size = min_t(u32, len, PAGE_SIZE);
-			truesize = size;
-
-			page = page_pool_dev_alloc(rq->page_pool, &page_offset,
-						   &truesize);
-			if (!page) {
-				consume_skb(nskb);
-				goto drop;
-			}
-
-			skb_add_rx_frag(nskb, i, page, page_offset, size,
-					truesize);
-			if (skb_copy_bits(skb, off,
-					  page_address(page) + page_offset,
-					  size)) {
-				consume_skb(nskb);
-				goto drop;
-			}
-
-			len -= size;
-			off += size;
-		}
-
-		consume_skb(skb);
-		skb = nskb;
+		skb = *pskb;
 	}
 
 	/* SKB "head" area always have tailroom for skb_shared_info */
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index def3d8689c3d..696e7680656f 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3446,6 +3446,8 @@ static inline void skb_frag_ref(struct sk_buff *skb, int f)
 	__skb_frag_ref(&skb_shinfo(skb)->frags[f]);
 }
 
+int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb,
+		    unsigned int headroom);
 int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb,
 			 struct bpf_prog *prog);
 bool napi_pp_put_page(struct page *page, bool napi_safe);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index bdb94749f05d..0d9a489e6ae1 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -895,8 +895,8 @@ static bool is_pp_page(struct page *page)
 	return (page->pp_magic & ~0x3UL) == PP_SIGNATURE;
 }
 
-static int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb,
-			   unsigned int headroom)
+int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb,
+		    unsigned int headroom)
 {
 #if IS_ENABLED(CONFIG_PAGE_POOL)
 	u32 size, truesize, len, max_head_size, off;
@@ -975,6 +975,7 @@ static int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb,
 	return -EOPNOTSUPP;
 #endif
 }
+EXPORT_SYMBOL(skb_pp_cow_data);
 
 int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb,
 			 struct bpf_prog *prog)
-- 
cgit v1.2.3


From 18e2bf0edf4dd88d9656ec92395aa47392e85b61 Mon Sep 17 00:00:00 2001
From: Joe Damato <jdamato@fastly.com>
Date: Tue, 13 Feb 2024 06:16:45 +0000
Subject: eventpoll: Add epoll ioctl for epoll_params

Add an ioctl for getting and setting epoll_params. User programs can use
this ioctl to get and set the busy poll usec time, packet budget, and
prefer busy poll params for a specific epoll context.

Parameters are limited:
  - busy_poll_usecs is limited to <= s32_max
  - busy_poll_budget is limited to <= NAPI_POLL_WEIGHT by unprivileged
    users (!capable(CAP_NET_ADMIN))
  - prefer_busy_poll must be 0 or 1
  - __pad must be 0

Signed-off-by: Joe Damato <jdamato@fastly.com>
Acked-by: Stanislav Fomichev <sdf@google.com>
Reviewed-by: Jiri Slaby <jirislaby@kernel.org>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/userspace-api/ioctl/ioctl-number.rst |  1 +
 fs/eventpoll.c                                     | 73 ++++++++++++++++++++++
 include/uapi/linux/eventpoll.h                     | 13 ++++
 3 files changed, 87 insertions(+)

(limited to 'include')

diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
index 457e16f06e04..b33918232f78 100644
--- a/Documentation/userspace-api/ioctl/ioctl-number.rst
+++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
@@ -309,6 +309,7 @@ Code  Seq#    Include File                                           Comments
 0x89  0B-DF  linux/sockios.h
 0x89  E0-EF  linux/sockios.h                                         SIOCPROTOPRIVATE range
 0x89  F0-FF  linux/sockios.h                                         SIOCDEVPRIVATE range
+0x8A  00-1F  linux/eventpoll.h
 0x8B  all    linux/wireless.h
 0x8C  00-3F                                                          WiNRADiO driver
                                                                      <http://www.winradio.com.au/>
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 1b8d01af0c2c..df2ed3af486e 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -37,6 +37,7 @@
 #include <linux/seq_file.h>
 #include <linux/compat.h>
 #include <linux/rculist.h>
+#include <linux/capability.h>
 #include <net/busy_poll.h>
 
 /*
@@ -494,6 +495,49 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
 	ep->napi_id = napi_id;
 }
 
+static long ep_eventpoll_bp_ioctl(struct file *file, unsigned int cmd,
+				  unsigned long arg)
+{
+	struct eventpoll *ep = file->private_data;
+	void __user *uarg = (void __user *)arg;
+	struct epoll_params epoll_params;
+
+	switch (cmd) {
+	case EPIOCSPARAMS:
+		if (copy_from_user(&epoll_params, uarg, sizeof(epoll_params)))
+			return -EFAULT;
+
+		/* pad byte must be zero */
+		if (epoll_params.__pad)
+			return -EINVAL;
+
+		if (epoll_params.busy_poll_usecs > S32_MAX)
+			return -EINVAL;
+
+		if (epoll_params.prefer_busy_poll > 1)
+			return -EINVAL;
+
+		if (epoll_params.busy_poll_budget > NAPI_POLL_WEIGHT &&
+		    !capable(CAP_NET_ADMIN))
+			return -EPERM;
+
+		WRITE_ONCE(ep->busy_poll_usecs, epoll_params.busy_poll_usecs);
+		WRITE_ONCE(ep->busy_poll_budget, epoll_params.busy_poll_budget);
+		WRITE_ONCE(ep->prefer_busy_poll, epoll_params.prefer_busy_poll);
+		return 0;
+	case EPIOCGPARAMS:
+		memset(&epoll_params, 0, sizeof(epoll_params));
+		epoll_params.busy_poll_usecs = READ_ONCE(ep->busy_poll_usecs);
+		epoll_params.busy_poll_budget = READ_ONCE(ep->busy_poll_budget);
+		epoll_params.prefer_busy_poll = READ_ONCE(ep->prefer_busy_poll);
+		if (copy_to_user(uarg, &epoll_params, sizeof(epoll_params)))
+			return -EFAULT;
+		return 0;
+	default:
+		return -ENOIOCTLCMD;
+	}
+}
+
 #else
 
 static inline bool ep_busy_loop(struct eventpoll *ep, int nonblock)
@@ -505,6 +549,12 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
 {
 }
 
+static long ep_eventpoll_bp_ioctl(struct file *file, unsigned int cmd,
+				  unsigned long arg)
+{
+	return -EOPNOTSUPP;
+}
+
 #endif /* CONFIG_NET_RX_BUSY_POLL */
 
 /*
@@ -864,6 +914,27 @@ static void ep_clear_and_put(struct eventpoll *ep)
 		ep_free(ep);
 }
 
+static long ep_eventpoll_ioctl(struct file *file, unsigned int cmd,
+			       unsigned long arg)
+{
+	int ret;
+
+	if (!is_file_epoll(file))
+		return -EINVAL;
+
+	switch (cmd) {
+	case EPIOCSPARAMS:
+	case EPIOCGPARAMS:
+		ret = ep_eventpoll_bp_ioctl(file, cmd, arg);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	return ret;
+}
+
 static int ep_eventpoll_release(struct inode *inode, struct file *file)
 {
 	struct eventpoll *ep = file->private_data;
@@ -970,6 +1041,8 @@ static const struct file_operations eventpoll_fops = {
 	.release	= ep_eventpoll_release,
 	.poll		= ep_eventpoll_poll,
 	.llseek		= noop_llseek,
+	.unlocked_ioctl	= ep_eventpoll_ioctl,
+	.compat_ioctl   = compat_ptr_ioctl,
 };
 
 /*
diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h
index cfbcc4cc49ac..4f4b948ef381 100644
--- a/include/uapi/linux/eventpoll.h
+++ b/include/uapi/linux/eventpoll.h
@@ -85,4 +85,17 @@ struct epoll_event {
 	__u64 data;
 } EPOLL_PACKED;
 
+struct epoll_params {
+	__u32 busy_poll_usecs;
+	__u16 busy_poll_budget;
+	__u8 prefer_busy_poll;
+
+	/* pad the struct to a multiple of 64bits */
+	__u8 __pad;
+};
+
+#define EPOLL_IOC_TYPE 0x8A
+#define EPIOCSPARAMS _IOW(EPOLL_IOC_TYPE, 0x01, struct epoll_params)
+#define EPIOCGPARAMS _IOR(EPOLL_IOC_TYPE, 0x02, struct epoll_params)
+
 #endif /* _UAPI_LINUX_EVENTPOLL_H */
-- 
cgit v1.2.3


From 4d42b37def70327b2bb19f823d42289aed2cd7c7 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 13 Feb 2024 06:32:36 +0000
Subject: net: convert dev->reg_state to u8

Prepares things so that dev->reg_state reads can be lockless,
by adding WRITE_ONCE() on write side.

READ_ONCE()/WRITE_ONCE() do not support bitfields.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 23 ++++++++++++++---------
 net/core/dev.c            |  8 ++++----
 2 files changed, 18 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index a3f9c95da51e..631124655107 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1815,6 +1815,15 @@ enum netdev_stat_type {
 	NETDEV_PCPU_STAT_DSTATS, /* struct pcpu_dstats */
 };
 
+enum netdev_reg_state {
+	NETREG_UNINITIALIZED = 0,
+	NETREG_REGISTERED,	/* completed register_netdevice */
+	NETREG_UNREGISTERING,	/* called unregister_netdevice */
+	NETREG_UNREGISTERED,	/* completed unregister todo */
+	NETREG_RELEASED,	/* called free_netdev */
+	NETREG_DUMMY,		/* dummy device for NAPI poll */
+};
+
 /**
  *	struct net_device - The DEVICE structure.
  *
@@ -2372,13 +2381,7 @@ struct net_device {
 
 	struct list_head	link_watch_list;
 
-	enum { NETREG_UNINITIALIZED=0,
-	       NETREG_REGISTERED,	/* completed register_netdevice */
-	       NETREG_UNREGISTERING,	/* called unregister_netdevice */
-	       NETREG_UNREGISTERED,	/* completed unregister todo */
-	       NETREG_RELEASED,		/* called free_netdev */
-	       NETREG_DUMMY,		/* dummy device for NAPI poll */
-	} reg_state:8;
+	u8 reg_state;
 
 	bool dismantle;
 
@@ -5254,7 +5257,9 @@ static inline const char *netdev_name(const struct net_device *dev)
 
 static inline const char *netdev_reg_state(const struct net_device *dev)
 {
-	switch (dev->reg_state) {
+	u8 reg_state = READ_ONCE(dev->reg_state);
+
+	switch (reg_state) {
 	case NETREG_UNINITIALIZED: return " (uninitialized)";
 	case NETREG_REGISTERED: return "";
 	case NETREG_UNREGISTERING: return " (unregistering)";
@@ -5263,7 +5268,7 @@ static inline const char *netdev_reg_state(const struct net_device *dev)
 	case NETREG_DUMMY: return " (dummy)";
 	}
 
-	WARN_ONCE(1, "%s: unknown reg_state %d\n", dev->name, dev->reg_state);
+	WARN_ONCE(1, "%s: unknown reg_state %d\n", dev->name, reg_state);
 	return " (unknown)";
 }
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 720bd6838212..9c95cae9d6ab 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -10339,7 +10339,7 @@ int register_netdevice(struct net_device *dev)
 
 	ret = netdev_register_kobject(dev);
 	write_lock(&dev_base_lock);
-	dev->reg_state = ret ? NETREG_UNREGISTERED : NETREG_REGISTERED;
+	WRITE_ONCE(dev->reg_state, ret ? NETREG_UNREGISTERED : NETREG_REGISTERED);
 	write_unlock(&dev_base_lock);
 	if (ret)
 		goto err_uninit_notify;
@@ -10630,7 +10630,7 @@ void netdev_run_todo(void)
 		}
 
 		write_lock(&dev_base_lock);
-		dev->reg_state = NETREG_UNREGISTERED;
+		WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERED);
 		write_unlock(&dev_base_lock);
 		linkwatch_sync_dev(dev);
 	}
@@ -11050,7 +11050,7 @@ void free_netdev(struct net_device *dev)
 	}
 
 	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
-	dev->reg_state = NETREG_RELEASED;
+	WRITE_ONCE(dev->reg_state, NETREG_RELEASED);
 
 	/* will free via device release */
 	put_device(&dev->dev);
@@ -11140,7 +11140,7 @@ void unregister_netdevice_many_notify(struct list_head *head,
 		/* And unlink it from device chain. */
 		write_lock(&dev_base_lock);
 		unlist_netdevice(dev, false);
-		dev->reg_state = NETREG_UNREGISTERING;
+		WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERING);
 		write_unlock(&dev_base_lock);
 	}
 	flush_all_backlogs();
-- 
cgit v1.2.3


From 6a2968ee1ee2cc6fce30f6f5724442b34b1483b3 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 13 Feb 2024 06:32:42 +0000
Subject: net: add netdev_set_operstate() helper

dev_base_lock is going away, add netdev_set_operstate() helper
so that hsr does not have to know core internals.

Remove dev_base_lock acquisition from rfc2863_policy()

v3: use an "unsigned int" for dev->operstate,
    so that try_cmpxchg() can work on all arches.
        ( https://lore.kernel.org/oe-kbuild-all/202402081918.OLyGaea3-lkp@intel.com/ )

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  2 +-
 include/linux/rtnetlink.h |  2 ++
 net/core/link_watch.c     |  9 ++-------
 net/core/rtnetlink.c      | 22 +++++++++++++++-------
 net/hsr/hsr_device.c      | 22 ++++++----------------
 5 files changed, 26 insertions(+), 31 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 631124655107..697370706a82 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2258,7 +2258,7 @@ struct net_device {
 	const struct tlsdev_ops *tlsdev_ops;
 #endif
 
-	unsigned char		operstate;
+	unsigned int		operstate;
 	unsigned char		link_mode;
 
 	unsigned char		if_port;
diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 21780608cf47..cdfc897f1e3c 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -172,4 +172,6 @@ rtnl_notify_needed(const struct net *net, u16 nlflags, u32 group)
 	return (nlflags & NLM_F_ECHO) || rtnl_has_listeners(net, group);
 }
 
+void netdev_set_operstate(struct net_device *dev, int newstate);
+
 #endif	/* __LINUX_RTNETLINK_H */
diff --git a/net/core/link_watch.c b/net/core/link_watch.c
index 1b93e054c9a3..8ec35194bfcb 100644
--- a/net/core/link_watch.c
+++ b/net/core/link_watch.c
@@ -33,7 +33,7 @@ static DECLARE_DELAYED_WORK(linkwatch_work, linkwatch_event);
 static LIST_HEAD(lweventlist);
 static DEFINE_SPINLOCK(lweventlist_lock);
 
-static unsigned char default_operstate(const struct net_device *dev)
+static unsigned int default_operstate(const struct net_device *dev)
 {
 	if (netif_testing(dev))
 		return IF_OPER_TESTING;
@@ -62,16 +62,13 @@ static unsigned char default_operstate(const struct net_device *dev)
 	return IF_OPER_UP;
 }
 
-
 static void rfc2863_policy(struct net_device *dev)
 {
-	unsigned char operstate = default_operstate(dev);
+	unsigned int operstate = default_operstate(dev);
 
 	if (operstate == READ_ONCE(dev->operstate))
 		return;
 
-	write_lock(&dev_base_lock);
-
 	switch(dev->link_mode) {
 	case IF_LINK_MODE_TESTING:
 		if (operstate == IF_OPER_UP)
@@ -88,8 +85,6 @@ static void rfc2863_policy(struct net_device *dev)
 	}
 
 	WRITE_ONCE(dev->operstate, operstate);
-
-	write_unlock(&dev_base_lock);
 }
 
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 43d92de8601c..e484ba44f23b 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -842,9 +842,22 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id,
 }
 EXPORT_SYMBOL_GPL(rtnl_put_cacheinfo);
 
+void netdev_set_operstate(struct net_device *dev, int newstate)
+{
+	unsigned int old = READ_ONCE(dev->operstate);
+
+	do {
+		if (old == newstate)
+			return;
+	} while (!try_cmpxchg(&dev->operstate, &old, newstate));
+
+	netdev_state_change(dev);
+}
+EXPORT_SYMBOL(netdev_set_operstate);
+
 static void set_operstate(struct net_device *dev, unsigned char transition)
 {
-	unsigned char operstate = dev->operstate;
+	unsigned char operstate = READ_ONCE(dev->operstate);
 
 	switch (transition) {
 	case IF_OPER_UP:
@@ -866,12 +879,7 @@ static void set_operstate(struct net_device *dev, unsigned char transition)
 		break;
 	}
 
-	if (READ_ONCE(dev->operstate) != operstate) {
-		write_lock(&dev_base_lock);
-		WRITE_ONCE(dev->operstate, operstate);
-		write_unlock(&dev_base_lock);
-		netdev_state_change(dev);
-	}
+	netdev_set_operstate(dev, operstate);
 }
 
 static unsigned int rtnl_dev_get_flags(const struct net_device *dev)
diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c
index be0e43f46556..5ef6d437db72 100644
--- a/net/hsr/hsr_device.c
+++ b/net/hsr/hsr_device.c
@@ -28,29 +28,19 @@ static bool is_slave_up(struct net_device *dev)
 	return dev && is_admin_up(dev) && netif_oper_up(dev);
 }
 
-static void __hsr_set_operstate(struct net_device *dev, int transition)
-{
-	write_lock(&dev_base_lock);
-	if (READ_ONCE(dev->operstate) != transition) {
-		WRITE_ONCE(dev->operstate, transition);
-		write_unlock(&dev_base_lock);
-		netdev_state_change(dev);
-	} else {
-		write_unlock(&dev_base_lock);
-	}
-}
-
 static void hsr_set_operstate(struct hsr_port *master, bool has_carrier)
 {
-	if (!is_admin_up(master->dev)) {
-		__hsr_set_operstate(master->dev, IF_OPER_DOWN);
+	struct net_device *dev = master->dev;
+
+	if (!is_admin_up(dev)) {
+		netdev_set_operstate(dev, IF_OPER_DOWN);
 		return;
 	}
 
 	if (has_carrier)
-		__hsr_set_operstate(master->dev, IF_OPER_UP);
+		netdev_set_operstate(dev, IF_OPER_UP);
 	else
-		__hsr_set_operstate(master->dev, IF_OPER_LOWERLAYERDOWN);
+		netdev_set_operstate(dev, IF_OPER_LOWERLAYERDOWN);
 }
 
 static bool hsr_check_carrier(struct hsr_port *master)
-- 
cgit v1.2.3


From 1b3ef46cb7f2618cc0b507393220a69810f6da12 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 13 Feb 2024 06:32:45 +0000
Subject: net: remove dev_base_lock

dev_base_lock is not needed anymore, all remaining users also hold RTNL.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  2 --
 net/core/dev.c            | 39 ++++-----------------------------------
 2 files changed, 4 insertions(+), 37 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 697370706a82..c541550b0e6e 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3077,8 +3077,6 @@ int call_netdevice_notifiers(unsigned long val, struct net_device *dev);
 int call_netdevice_notifiers_info(unsigned long val,
 				  struct netdev_notifier_info *info);
 
-extern rwlock_t				dev_base_lock;		/* Device list lock */
-
 #define for_each_netdev(net, d)		\
 		list_for_each_entry(d, &(net)->dev_base_head, dev_list)
 #define for_each_netdev_reverse(net, d)	\
diff --git a/net/core/dev.c b/net/core/dev.c
index 02cf9fd68da6..d8dd293a7a27 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -168,28 +168,6 @@ static int call_netdevice_notifiers_extack(unsigned long val,
 					   struct net_device *dev,
 					   struct netlink_ext_ack *extack);
 
-/*
- * The @dev_base_head list is protected by @dev_base_lock and the rtnl
- * semaphore.
- *
- * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
- *
- * Writers must hold the rtnl semaphore while they loop through the
- * dev_base_head list, and hold dev_base_lock for writing when they do the
- * actual updates.  This allows pure readers to access the list even
- * while a writer is preparing to update it.
- *
- * To put it another way, dev_base_lock is held for writing only to
- * protect against pure readers; the rtnl semaphore provides the
- * protection against other writers.
- *
- * See, for example usages, register_netdevice() and
- * unregister_netdevice(), which must be called with the rtnl
- * semaphore held.
- */
-DEFINE_RWLOCK(dev_base_lock);
-EXPORT_SYMBOL(dev_base_lock);
-
 static DEFINE_MUTEX(ifalias_mutex);
 
 /* protects napi_hash addition/deletion and napi_gen_id */
@@ -395,12 +373,10 @@ static void list_netdevice(struct net_device *dev)
 
 	ASSERT_RTNL();
 
-	write_lock(&dev_base_lock);
 	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 	netdev_name_node_add(net, dev->name_node);
 	hlist_add_head_rcu(&dev->index_hlist,
 			   dev_index_hash(net, dev->ifindex));
-	write_unlock(&dev_base_lock);
 
 	netdev_for_each_altname(dev, name_node)
 		netdev_name_node_add(net, name_node);
@@ -427,11 +403,9 @@ static void unlist_netdevice(struct net_device *dev)
 		netdev_name_node_del(name_node);
 
 	/* Unlink dev from the device chain */
-	write_lock(&dev_base_lock);
 	list_del_rcu(&dev->dev_list);
 	netdev_name_node_del(dev->name_node);
 	hlist_del_rcu(&dev->index_hlist);
-	write_unlock(&dev_base_lock);
 
 	dev_base_seq_inc(dev_net(dev));
 }
@@ -752,9 +726,9 @@ EXPORT_SYMBOL_GPL(dev_fill_forward_path);
  *	@net: the applicable net namespace
  *	@name: name to find
  *
- *	Find an interface by name. Must be called under RTNL semaphore
- *	or @dev_base_lock. If the name is found a pointer to the device
- *	is returned. If the name is not found then %NULL is returned. The
+ *	Find an interface by name. Must be called under RTNL semaphore.
+ *	If the name is found a pointer to the device is returned.
+ *	If the name is not found then %NULL is returned. The
  *	reference counters are not incremented so the caller must be
  *	careful with locks.
  */
@@ -835,8 +809,7 @@ EXPORT_SYMBOL(netdev_get_by_name);
  *	Search for an interface by index. Returns %NULL if the device
  *	is not found or a pointer to the device. The device has not
  *	had its reference counter increased so the caller must be careful
- *	about locking. The caller must hold either the RTNL semaphore
- *	or @dev_base_lock.
+ *	about locking. The caller must hold the RTNL semaphore.
  */
 
 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
@@ -1241,15 +1214,11 @@ rollback:
 
 	netdev_adjacent_rename_links(dev, oldname);
 
-	write_lock(&dev_base_lock);
 	netdev_name_node_del(dev->name_node);
-	write_unlock(&dev_base_lock);
 
 	synchronize_net();
 
-	write_lock(&dev_base_lock);
 	netdev_name_node_add(net, dev->name_node);
-	write_unlock(&dev_base_lock);
 
 	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
 	ret = notifier_to_errno(ret);
-- 
cgit v1.2.3


From 414532d8aa8915d9aebd01c6b5aa54bdfd98da71 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 14 Feb 2024 20:08:43 +0100
Subject: wifi: cfg80211: use IEEE80211_MAX_MESH_ID_LEN appropriately

Even if that's the same as IEEE80211_MAX_SSID_LEN, we really
should just use IEEE80211_MAX_MESH_ID_LEN for mesh, rather
than having the BUILD_BUG_ON()s.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 2 +-
 net/wireless/nl80211.c | 4 ----
 2 files changed, 1 insertion(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 93e9abb7fc3d..57c2298af35b 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -6223,7 +6223,7 @@ struct wireless_dev {
 			int beacon_interval;
 			struct cfg80211_chan_def preset_chandef;
 			struct cfg80211_chan_def chandef;
-			u8 id[IEEE80211_MAX_SSID_LEN];
+			u8 id[IEEE80211_MAX_MESH_ID_LEN];
 			u8 id_len, id_up_len;
 		} mesh;
 		struct {
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 5f18cbf7cc3d..dd9a092b6bab 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -4205,8 +4205,6 @@ static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info)
 		if (netif_running(dev))
 			return -EBUSY;
 
-		BUILD_BUG_ON(IEEE80211_MAX_SSID_LEN !=
-			     IEEE80211_MAX_MESH_ID_LEN);
 		wdev->u.mesh.id_up_len =
 			nla_len(info->attrs[NL80211_ATTR_MESH_ID]);
 		memcpy(wdev->u.mesh.id,
@@ -4312,8 +4310,6 @@ static int _nl80211_new_interface(struct sk_buff *skb, struct genl_info *info)
 	case NL80211_IFTYPE_MESH_POINT:
 		if (!info->attrs[NL80211_ATTR_MESH_ID])
 			break;
-		BUILD_BUG_ON(IEEE80211_MAX_SSID_LEN !=
-			     IEEE80211_MAX_MESH_ID_LEN);
 		wdev->u.mesh.id_up_len =
 			nla_len(info->attrs[NL80211_ATTR_MESH_ID]);
 		memcpy(wdev->u.mesh.id,
-- 
cgit v1.2.3


From 81800aef0eba33df2b30f2e29a0137078b9ba256 Mon Sep 17 00:00:00 2001
From: "Ricardo B. Marliere" <ricardo@marliere.net>
Date: Tue, 13 Feb 2024 11:48:00 -0300
Subject: net: mdio_bus: make mdio_bus_type const

Since commit d492cc2573a0 ("driver core: device.h: make struct
bus_type a const *"), the driver core can properly handle constant
struct bus_type, move the mdio_bus_type variable to be a constant
structure as well, placing it into read-only memory which can not be
modified at runtime.

Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Suggested-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Ricardo B. Marliere <ricardo@marliere.net>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://lore.kernel.org/r/20240213-bus_cleanup-mdio-v1-1-f9e799da7fda@marliere.net
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/phy/mdio_bus.c   | 2 +-
 drivers/net/phy/phy_device.c | 3 +--
 include/linux/phy.h          | 2 +-
 3 files changed, 3 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c
index 08624f073014..b765466e767f 100644
--- a/drivers/net/phy/mdio_bus.c
+++ b/drivers/net/phy/mdio_bus.c
@@ -1426,7 +1426,7 @@ static const struct attribute_group *mdio_bus_dev_groups[] = {
 	NULL,
 };
 
-struct bus_type mdio_bus_type = {
+const struct bus_type mdio_bus_type = {
 	.name		= "mdio_bus",
 	.dev_groups	= mdio_bus_dev_groups,
 	.match		= mdio_bus_match,
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 9f37c0bfbf8d..d63dca535746 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -1604,7 +1604,6 @@ EXPORT_SYMBOL(phy_attach_direct);
 struct phy_device *phy_attach(struct net_device *dev, const char *bus_id,
 			      phy_interface_t interface)
 {
-	struct bus_type *bus = &mdio_bus_type;
 	struct phy_device *phydev;
 	struct device *d;
 	int rc;
@@ -1615,7 +1614,7 @@ struct phy_device *phy_attach(struct net_device *dev, const char *bus_id,
 	/* Search the list of PHY devices on the mdio bus for the
 	 * PHY with the requested name
 	 */
-	d = bus_find_device_by_name(bus, NULL, bus_id);
+	d = bus_find_device_by_name(&mdio_bus_type, NULL, bus_id);
 	if (!d) {
 		pr_err("PHY %s not found\n", bus_id);
 		return ERR_PTR(-ENODEV);
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 2249cdb5957a..c2dda21b39e1 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -2129,7 +2129,7 @@ static inline bool phy_package_probe_once(struct phy_device *phydev)
 	return __phy_package_set_once(phydev, PHY_SHARED_F_PROBE_DONE);
 }
 
-extern struct bus_type mdio_bus_type;
+extern const struct bus_type mdio_bus_type;
 
 struct mdio_board_info {
 	const char	*bus_id;
-- 
cgit v1.2.3


From a5fcea2d2f790aa90b6e996d411ae2cf8db55186 Mon Sep 17 00:00:00 2001
From: Alex Henrie <alexhenrie24@gmail.com>
Date: Tue, 13 Feb 2024 23:26:31 -0700
Subject: net: ipv6/addrconf: introduce a regen_min_advance sysctl

In RFC 8981, REGEN_ADVANCE cannot be less than 2 seconds, and the RFC
does not permit the creation of temporary addresses with lifetimes
shorter than that:

> When processing a Router Advertisement with a
> Prefix Information option carrying a prefix for the purposes of
> address autoconfiguration (i.e., the A bit is set), the host MUST
> perform the following steps:

> 5.  A temporary address is created only if this calculated preferred
>     lifetime is greater than REGEN_ADVANCE time units.

However, some users want to change their IPv6 address as frequently as
possible regardless of the RFC's arbitrary minimum lifetime. For the
benefit of those users, add a regen_min_advance sysctl parameter that
can be set to below or above 2 seconds.

Link: https://datatracker.ietf.org/doc/html/rfc8981
Signed-off-by: Alex Henrie <alexhenrie24@gmail.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 Documentation/networking/ip-sysctl.rst | 10 ++++++++++
 include/linux/ipv6.h                   |  1 +
 include/net/addrconf.h                 |  5 +++--
 net/ipv6/addrconf.c                    | 11 ++++++++++-
 4 files changed, 24 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
index 458305931345..407d917d1a36 100644
--- a/Documentation/networking/ip-sysctl.rst
+++ b/Documentation/networking/ip-sysctl.rst
@@ -2535,6 +2535,16 @@ max_desync_factor - INTEGER
 
 	Default: 600
 
+regen_min_advance - INTEGER
+	How far in advance (in seconds), at minimum, to create a new temporary
+	address before the current one is deprecated. This value is added to
+	the amount of time that may be required for duplicate address detection
+	to determine when to create a new address. Linux permits setting this
+	value to less than the default of 2 seconds, but a value less than 2
+	does not conform to RFC 8981.
+
+	Default: 2
+
 regen_max_retry - INTEGER
 	Number of attempts before give up attempting to generate
 	valid temporary addresses.
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 5e605e384aac..ef3aa060a289 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -27,6 +27,7 @@ struct ipv6_devconf {
 	__s32		use_tempaddr;
 	__s32		temp_valid_lft;
 	__s32		temp_prefered_lft;
+	__s32		regen_min_advance;
 	__s32		regen_max_retry;
 	__s32		max_desync_factor;
 	__s32		max_addresses;
diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 61ebe723ee4d..30d6f1e84e46 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -8,8 +8,9 @@
 
 #define MIN_VALID_LIFETIME		(2*3600)	/* 2 hours */
 
-#define TEMP_VALID_LIFETIME		(7*86400)
-#define TEMP_PREFERRED_LIFETIME		(86400)
+#define TEMP_VALID_LIFETIME		(7*86400)       /* 1 week */
+#define TEMP_PREFERRED_LIFETIME		(86400)         /* 24 hours */
+#define REGEN_MIN_ADVANCE		(2)             /* 2 seconds */
 #define REGEN_MAX_RETRY			(3)
 #define MAX_DESYNC_FACTOR		(600)
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 65e886d7d80c..283823fba96a 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -195,6 +195,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
 	.use_tempaddr		= 0,
 	.temp_valid_lft		= TEMP_VALID_LIFETIME,
 	.temp_prefered_lft	= TEMP_PREFERRED_LIFETIME,
+	.regen_min_advance	= REGEN_MIN_ADVANCE,
 	.regen_max_retry	= REGEN_MAX_RETRY,
 	.max_desync_factor	= MAX_DESYNC_FACTOR,
 	.max_addresses		= IPV6_MAX_ADDRESSES,
@@ -257,6 +258,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
 	.use_tempaddr		= 0,
 	.temp_valid_lft		= TEMP_VALID_LIFETIME,
 	.temp_prefered_lft	= TEMP_PREFERRED_LIFETIME,
+	.regen_min_advance	= REGEN_MIN_ADVANCE,
 	.regen_max_retry	= REGEN_MAX_RETRY,
 	.max_desync_factor	= MAX_DESYNC_FACTOR,
 	.max_addresses		= IPV6_MAX_ADDRESSES,
@@ -1341,7 +1343,7 @@ out:
 
 static unsigned long ipv6_get_regen_advance(struct inet6_dev *idev)
 {
-	return 2 + idev->cnf.regen_max_retry *
+	return idev->cnf.regen_min_advance + idev->cnf.regen_max_retry *
 			idev->cnf.dad_transmits *
 			max(NEIGH_VAR(idev->nd_parms, RETRANS_TIME), HZ/100) / HZ;
 }
@@ -6819,6 +6821,13 @@ static const struct ctl_table addrconf_sysctl[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+	{
+		.procname       = "regen_min_advance",
+		.data           = &ipv6_devconf.regen_min_advance,
+		.maxlen         = sizeof(int),
+		.mode           = 0644,
+		.proc_handler   = proc_dointvec,
+	},
 	{
 		.procname	= "regen_max_retry",
 		.data		= &ipv6_devconf.regen_max_retry,
-- 
cgit v1.2.3


From 3de21a8990d3c2cc507e9cc4ed00f36358d5b93e Mon Sep 17 00:00:00 2001
From: Stanislaw Gruszka <stanislaw.gruszka@linux.intel.com>
Date: Mon, 12 Feb 2024 17:16:13 +0100
Subject: genetlink: Add per family bind/unbind callbacks

Add genetlink family bind()/unbind() callbacks when adding/removing
multicast group to/from netlink client socket via setsockopt() or
bind() syscall.

They can be used to track if consumers of netlink multicast messages
emerge or disappear. Thus, a client implementing callbacks, can now
send events only when there are active consumers, preventing unnecessary
work when none exist.

Suggested-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Stanislaw Gruszka <stanislaw.gruszka@linux.intel.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Link: https://lore.kernel.org/r/20240212161615.161935-2-stanislaw.gruszka@linux.intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/genetlink.h |  4 ++++
 net/netlink/genetlink.c | 30 ++++++++++++++++++++++++++++++
 2 files changed, 34 insertions(+)

(limited to 'include')

diff --git a/include/net/genetlink.h b/include/net/genetlink.h
index e61469129402..ecadba836ae5 100644
--- a/include/net/genetlink.h
+++ b/include/net/genetlink.h
@@ -41,6 +41,8 @@ struct genl_info;
  *	do additional, common, filtering and return an error
  * @post_doit: called after an operation's doit callback, it may
  *	undo operations done by pre_doit, for example release locks
+ * @bind: called when family multicast group is added to a netlink socket
+ * @unbind: called when family multicast group is removed from a netlink socket
  * @module: pointer to the owning module (set to THIS_MODULE)
  * @mcgrps: multicast groups used by this family
  * @n_mcgrps: number of multicast groups
@@ -84,6 +86,8 @@ struct genl_family {
 	void			(*post_doit)(const struct genl_split_ops *ops,
 					     struct sk_buff *skb,
 					     struct genl_info *info);
+	int			(*bind)(int mcgrp);
+	void			(*unbind)(int mcgrp);
 	const struct genl_ops *	ops;
 	const struct genl_small_ops *small_ops;
 	const struct genl_split_ops *split_ops;
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index 8c7af02f8454..50ec599a5cff 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -1836,6 +1836,9 @@ static int genl_bind(struct net *net, int group)
 		    !ns_capable(net->user_ns, CAP_SYS_ADMIN))
 			ret = -EPERM;
 
+		if (family->bind)
+			family->bind(i);
+
 		break;
 	}
 
@@ -1843,12 +1846,39 @@ static int genl_bind(struct net *net, int group)
 	return ret;
 }
 
+static void genl_unbind(struct net *net, int group)
+{
+	const struct genl_family *family;
+	unsigned int id;
+
+	down_read(&cb_lock);
+
+	idr_for_each_entry(&genl_fam_idr, family, id) {
+		int i;
+
+		if (family->n_mcgrps == 0)
+			continue;
+
+		i = group - family->mcgrp_offset;
+		if (i < 0 || i >= family->n_mcgrps)
+			continue;
+
+		if (family->unbind)
+			family->unbind(i);
+
+		break;
+	}
+
+	up_read(&cb_lock);
+}
+
 static int __net_init genl_pernet_init(struct net *net)
 {
 	struct netlink_kernel_cfg cfg = {
 		.input		= genl_rcv,
 		.flags		= NL_CFG_F_NONROOT_RECV,
 		.bind		= genl_bind,
+		.unbind		= genl_unbind,
 		.release	= genl_release,
 	};
 
-- 
cgit v1.2.3


From 80e4021c25d8c1ddae0dd655ed5f6b1e938dd79b Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Wed, 14 Feb 2024 21:16:19 +0100
Subject: net: mdio: add helpers for accessing the EEE CAP2 registers

This adds helpers for accessing the EEE CAP2 registers.
For now only 2500baseT and 5000baseT modes are supported.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mdio.h | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

(limited to 'include')

diff --git a/include/linux/mdio.h b/include/linux/mdio.h
index 79ceee3c8673..fd8ff310f9eb 100644
--- a/include/linux/mdio.h
+++ b/include/linux/mdio.h
@@ -439,6 +439,42 @@ static inline void mii_eee_cap1_mod_linkmode_t(unsigned long *adv, u32 val)
 			 adv, val & MDIO_EEE_10GKR);
 }
 
+/**
+ * mii_eee_cap2_mod_linkmode_sup_t()
+ * @adv: target the linkmode settings
+ * @val: register value
+ *
+ * A function that translates value of following registers to the linkmode:
+ * IEEE 802.3-2022 45.2.3.11 "EEE control and capability 2" register (3.21)
+ */
+static inline void mii_eee_cap2_mod_linkmode_sup_t(unsigned long *adv, u32 val)
+{
+	linkmode_mod_bit(ETHTOOL_LINK_MODE_2500baseT_Full_BIT,
+			 adv, val & MDIO_EEE_2_5GT);
+	linkmode_mod_bit(ETHTOOL_LINK_MODE_5000baseT_Full_BIT,
+			 adv, val & MDIO_EEE_5GT);
+}
+
+/**
+ * mii_eee_cap2_mod_linkmode_adv_t()
+ * @adv: target the linkmode advertisement settings
+ * @val: register value
+ *
+ * A function that translates value of following registers to the linkmode:
+ * IEEE 802.3-2022 45.2.7.16 "EEE advertisement 2" register (7.62)
+ * IEEE 802.3-2022 45.2.7.17 "EEE link partner ability 2" register (7.63)
+ * Note: Currently this function is the same as mii_eee_cap2_mod_linkmode_sup_t.
+ *       For certain, not yet supported, modes however the bits differ.
+ *       Therefore create separate functions already.
+ */
+static inline void mii_eee_cap2_mod_linkmode_adv_t(unsigned long *adv, u32 val)
+{
+	linkmode_mod_bit(ETHTOOL_LINK_MODE_2500baseT_Full_BIT,
+			 adv, val & MDIO_EEE_2_5GT);
+	linkmode_mod_bit(ETHTOOL_LINK_MODE_5000baseT_Full_BIT,
+			 adv, val & MDIO_EEE_5GT);
+}
+
 /**
  * linkmode_to_mii_eee_cap1_t()
  * @adv: the linkmode advertisement settings
@@ -466,6 +502,25 @@ static inline u32 linkmode_to_mii_eee_cap1_t(unsigned long *adv)
 	return result;
 }
 
+/**
+ * linkmode_to_mii_eee_cap2_t()
+ * @adv: the linkmode advertisement settings
+ *
+ * A function that translates linkmode to value for IEEE 802.3-2022 45.2.7.16
+ * "EEE advertisement 2" register (7.62)
+ */
+static inline u32 linkmode_to_mii_eee_cap2_t(unsigned long *adv)
+{
+	u32 result = 0;
+
+	if (linkmode_test_bit(ETHTOOL_LINK_MODE_2500baseT_Full_BIT, adv))
+		result |= MDIO_EEE_2_5GT;
+	if (linkmode_test_bit(ETHTOOL_LINK_MODE_5000baseT_Full_BIT, adv))
+		result |= MDIO_EEE_5GT;
+
+	return result;
+}
+
 /**
  * mii_10base_t1_adv_mod_linkmode_t()
  * @adv: linkmode advertisement settings
-- 
cgit v1.2.3


From ef6ee3a31bdc699391f2db4eff407fdb06895809 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Wed, 14 Feb 2024 21:17:11 +0100
Subject: net: phy: add PHY_EEE_CAP2_FEATURES

As a prerequisite for adding EEE CAP2 register support, complement
PHY_EEE_CAP1_FEATURES with PHY_EEE_CAP2_FEATURES.
For now only 2500baseT and 5000baseT modes are supported.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy_device.c | 11 +++++++++++
 include/linux/phy.h          |  2 ++
 2 files changed, 13 insertions(+)

(limited to 'include')

diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index d63dca535746..2eefee970851 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -148,6 +148,14 @@ static const int phy_eee_cap1_features_array[] = {
 __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_eee_cap1_features) __ro_after_init;
 EXPORT_SYMBOL_GPL(phy_eee_cap1_features);
 
+static const int phy_eee_cap2_features_array[] = {
+	ETHTOOL_LINK_MODE_2500baseT_Full_BIT,
+	ETHTOOL_LINK_MODE_5000baseT_Full_BIT,
+};
+
+__ETHTOOL_DECLARE_LINK_MODE_MASK(phy_eee_cap2_features) __ro_after_init;
+EXPORT_SYMBOL_GPL(phy_eee_cap2_features);
+
 static void features_init(void)
 {
 	/* 10/100 half/full*/
@@ -232,6 +240,9 @@ static void features_init(void)
 	linkmode_set_bit_array(phy_eee_cap1_features_array,
 			       ARRAY_SIZE(phy_eee_cap1_features_array),
 			       phy_eee_cap1_features);
+	linkmode_set_bit_array(phy_eee_cap2_features_array,
+			       ARRAY_SIZE(phy_eee_cap2_features_array),
+			       phy_eee_cap2_features);
 
 }
 
diff --git a/include/linux/phy.h b/include/linux/phy.h
index c2dda21b39e1..e3ab2c347a59 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -54,6 +54,7 @@ extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_10gbit_features) __ro_after_init;
 extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_10gbit_fec_features) __ro_after_init;
 extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_10gbit_full_features) __ro_after_init;
 extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_eee_cap1_features) __ro_after_init;
+extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_eee_cap2_features) __ro_after_init;
 
 #define PHY_BASIC_FEATURES ((unsigned long *)&phy_basic_features)
 #define PHY_BASIC_T1_FEATURES ((unsigned long *)&phy_basic_t1_features)
@@ -65,6 +66,7 @@ extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_eee_cap1_features) __ro_after_init;
 #define PHY_10GBIT_FEC_FEATURES ((unsigned long *)&phy_10gbit_fec_features)
 #define PHY_10GBIT_FULL_FEATURES ((unsigned long *)&phy_10gbit_full_features)
 #define PHY_EEE_CAP1_FEATURES ((unsigned long *)&phy_eee_cap1_features)
+#define PHY_EEE_CAP2_FEATURES ((unsigned long *)&phy_eee_cap2_features)
 
 extern const int phy_basic_ports_array[3];
 extern const int phy_fibre_port_array[1];
-- 
cgit v1.2.3


From 1e63e5a813fa6203d7430af51d6bffb728525015 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 16 Feb 2024 15:27:44 -0800
Subject: net: sched: Annotate struct tc_pedit with __counted_by

Prepare for the coming implementation by GCC and Clang of the __counted_by
attribute. Flexible array members annotated with __counted_by can have
their accesses bounds-checked at run-time checking via CONFIG_UBSAN_BOUNDS
(for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family
functions).

As found with Coccinelle[1], add __counted_by for struct tc_pedit.
Additionally, since the element count member must be set before accessing
the annotated flexible array member, move its initialization earlier.

Link: https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci [1]
Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tc_act/tc_pedit.h | 2 +-
 net/sched/act_pedit.c                | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/tc_act/tc_pedit.h b/include/uapi/linux/tc_act/tc_pedit.h
index f3e61b04fa01..f5cab7fc96ab 100644
--- a/include/uapi/linux/tc_act/tc_pedit.h
+++ b/include/uapi/linux/tc_act/tc_pedit.h
@@ -62,7 +62,7 @@ struct tc_pedit_sel {
 	tc_gen;
 	unsigned char           nkeys;
 	unsigned char           flags;
-	struct tc_pedit_key     keys[0];
+	struct tc_pedit_key     keys[] __counted_by(nkeys);
 };
 
 #define tc_pedit tc_pedit_sel
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index df5a02d5f919..fc0a35a7b62a 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -515,11 +515,11 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a,
 		spin_unlock_bh(&p->tcf_lock);
 		return -ENOBUFS;
 	}
+	opt->nkeys = parms->tcfp_nkeys;
 
 	memcpy(opt->keys, parms->tcfp_keys,
 	       flex_array_size(opt, keys, parms->tcfp_nkeys));
 	opt->index = p->tcf_index;
-	opt->nkeys = parms->tcfp_nkeys;
 	opt->flags = parms->tcfp_flags;
 	opt->action = p->tcf_action;
 	opt->refcnt = refcount_read(&p->tcf_refcnt) - ref;
-- 
cgit v1.2.3


From 56ef27e3abe6d6453b1f4f6127041f3a65d7cbc9 Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Thu, 15 Feb 2024 12:39:05 +0100
Subject: page_pool: disable direct recycling based on pool->cpuid on destroy
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Now that direct recycling is performed basing on pool->cpuid when set,
memory leaks are possible:

1. A pool is destroyed.
2. Alloc cache is emptied (it's done only once).
3. pool->cpuid is still set.
4. napi_pp_put_page() does direct recycling basing on pool->cpuid.
5. Now alloc cache is not empty, but it won't ever be freed.

In order to avoid that, rewrite pool->cpuid to -1 when unlinking NAPI to
make sure no direct recycling will be possible after emptying the cache.
This involves a bit of overhead as pool->cpuid now must be accessed
via READ_ONCE() to avoid partial reads.
Rename page_pool_unlink_napi() -> page_pool_disable_direct_recycling()
to reflect what it actually does and unexport it.

Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Link: https://lore.kernel.org/r/20240215113905.96817-1-aleksander.lobakin@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/page_pool/types.h |  5 -----
 net/core/page_pool.c          | 10 +++++++---
 net/core/skbuff.c             |  2 +-
 3 files changed, 8 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h
index 3828396ae60c..3590fbe6e3f1 100644
--- a/include/net/page_pool/types.h
+++ b/include/net/page_pool/types.h
@@ -210,17 +210,12 @@ struct page_pool *page_pool_create_percpu(const struct page_pool_params *params,
 struct xdp_mem_info;
 
 #ifdef CONFIG_PAGE_POOL
-void page_pool_unlink_napi(struct page_pool *pool);
 void page_pool_destroy(struct page_pool *pool);
 void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
 			   struct xdp_mem_info *mem);
 void page_pool_put_page_bulk(struct page_pool *pool, void **data,
 			     int count);
 #else
-static inline void page_pool_unlink_napi(struct page_pool *pool)
-{
-}
-
 static inline void page_pool_destroy(struct page_pool *pool)
 {
 }
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 89c835fcf094..e8b9399d8e32 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -949,8 +949,13 @@ void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
 	pool->xdp_mem_id = mem->id;
 }
 
-void page_pool_unlink_napi(struct page_pool *pool)
+static void page_pool_disable_direct_recycling(struct page_pool *pool)
 {
+	/* Disable direct recycling based on pool->cpuid.
+	 * Paired with READ_ONCE() in napi_pp_put_page().
+	 */
+	WRITE_ONCE(pool->cpuid, -1);
+
 	if (!pool->p.napi)
 		return;
 
@@ -962,7 +967,6 @@ void page_pool_unlink_napi(struct page_pool *pool)
 
 	WRITE_ONCE(pool->p.napi, NULL);
 }
-EXPORT_SYMBOL(page_pool_unlink_napi);
 
 void page_pool_destroy(struct page_pool *pool)
 {
@@ -972,7 +976,7 @@ void page_pool_destroy(struct page_pool *pool)
 	if (!page_pool_put(pool))
 		return;
 
-	page_pool_unlink_napi(pool);
+	page_pool_disable_direct_recycling(pool);
 	page_pool_free_frag(pool);
 
 	if (!page_pool_release(pool))
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 0d9a489e6ae1..b41856585c24 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1018,7 +1018,7 @@ bool napi_pp_put_page(struct page *page, bool napi_safe)
 		unsigned int cpuid = smp_processor_id();
 
 		allow_direct = napi && READ_ONCE(napi->list_owner) == cpuid;
-		allow_direct |= (pp->cpuid == cpuid);
+		allow_direct |= READ_ONCE(pp->cpuid) == cpuid;
 	}
 
 	/* Driver set this to memory recycling info. Reset it on recycle.
-- 
cgit v1.2.3


From f853fa5c54e7a0364a52125074dedeaf2c7ddace Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Fri, 16 Feb 2024 10:25:43 +0100
Subject: net: page_pool: fix recycle stats for system page_pool allocator

Use global percpu page_pool_recycle_stats counter for system page_pool
allocator instead of allocating a separate percpu variable for each
(also percpu) page pool instance.

Reviewed-by: Toke Hoiland-Jorgensen <toke@redhat.com>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Reviewed-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Link: https://lore.kernel.org/r/87f572425e98faea3da45f76c3c68815c01a20ee.1708075412.git.lorenzo@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/page_pool/types.h |  5 +++--
 net/core/dev.c                |  1 +
 net/core/page_pool.c          | 22 +++++++++++++++++-----
 3 files changed, 21 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h
index 3590fbe6e3f1..5e43a08d3231 100644
--- a/include/net/page_pool/types.h
+++ b/include/net/page_pool/types.h
@@ -18,8 +18,9 @@
 					* Please note DMA-sync-for-CPU is still
 					* device driver responsibility
 					*/
-#define PP_FLAG_ALL		(PP_FLAG_DMA_MAP |\
-				 PP_FLAG_DMA_SYNC_DEV)
+#define PP_FLAG_SYSTEM_POOL	BIT(2) /* Global system page_pool */
+#define PP_FLAG_ALL		(PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV | \
+				 PP_FLAG_SYSTEM_POOL)
 
 /*
  * Fast allocation side cache array/stack
diff --git a/net/core/dev.c b/net/core/dev.c
index cc9c2eda65ac..c588808be77f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -11738,6 +11738,7 @@ static int net_page_pool_create(int cpuid)
 #if IS_ENABLED(CONFIG_PAGE_POOL)
 	struct page_pool_params page_pool_params = {
 		.pool_size = SYSTEM_PERCPU_PAGE_POOL_SIZE,
+		.flags = PP_FLAG_SYSTEM_POOL,
 		.nid = NUMA_NO_NODE,
 	};
 	struct page_pool *pp_ptr;
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index e8b9399d8e32..d706fe5548df 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -31,6 +31,8 @@
 #define BIAS_MAX	(LONG_MAX >> 1)
 
 #ifdef CONFIG_PAGE_POOL_STATS
+static DEFINE_PER_CPU(struct page_pool_recycle_stats, pp_system_recycle_stats);
+
 /* alloc_stat_inc is intended to be used in softirq context */
 #define alloc_stat_inc(pool, __stat)	(pool->alloc_stats.__stat++)
 /* recycle_stat_inc is safe to use when preemption is possible. */
@@ -220,14 +222,23 @@ static int page_pool_init(struct page_pool *pool,
 	pool->has_init_callback = !!pool->slow.init_callback;
 
 #ifdef CONFIG_PAGE_POOL_STATS
-	pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats);
-	if (!pool->recycle_stats)
-		return -ENOMEM;
+	if (!(pool->p.flags & PP_FLAG_SYSTEM_POOL)) {
+		pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats);
+		if (!pool->recycle_stats)
+			return -ENOMEM;
+	} else {
+		/* For system page pool instance we use a singular stats object
+		 * instead of allocating a separate percpu variable for each
+		 * (also percpu) page pool instance.
+		 */
+		pool->recycle_stats = &pp_system_recycle_stats;
+	}
 #endif
 
 	if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) {
 #ifdef CONFIG_PAGE_POOL_STATS
-		free_percpu(pool->recycle_stats);
+		if (!(pool->p.flags & PP_FLAG_SYSTEM_POOL))
+			free_percpu(pool->recycle_stats);
 #endif
 		return -ENOMEM;
 	}
@@ -251,7 +262,8 @@ static void page_pool_uninit(struct page_pool *pool)
 		put_device(pool->p.dev);
 
 #ifdef CONFIG_PAGE_POOL_STATS
-	free_percpu(pool->recycle_stats);
+	if (!(pool->p.flags & PP_FLAG_SYSTEM_POOL))
+		free_percpu(pool->recycle_stats);
 #endif
 }
 
-- 
cgit v1.2.3


From 18ddbf5cf0e7553fd05c3e1a02d740514ee3f0a6 Mon Sep 17 00:00:00 2001
From: Mina Almasry <almasrymina@google.com>
Date: Wed, 14 Feb 2024 14:34:02 -0800
Subject: net: introduce abstraction for network memory

Add the netmem_ref type, an abstraction for network memory.

To add support for new memory types to the net stack, we must first
abstract the current memory type. Currently parts of the net stack
use struct page directly:

- page_pool
- drivers
- skb_frag_t

Originally the plan was to reuse struct page* for the new memory types,
and to set the LSB on the page* to indicate it's not really a page.
However, for compiler type checking we need to introduce a new type.

netmem_ref is introduced to abstract the underlying memory type.
Currently it's a no-op abstraction that is always a struct page
underneath. In parallel there is an undergoing effort to add support
for devmem to the net stack:

https://lore.kernel.org/netdev/20231208005250.2910004-1-almasrymina@google.com/

netmem_ref can be pointers to different underlying memory types, and the
low bits are set to indicate the memory type. Helpers are provided
to convert netmem pointers to the underlying memory type (currently only
struct page). In the devmem series helpers are provided so that calling
code can use netmem without worrying about the underlying memory type
unless absolutely necessary.

Reviewed-by: Shakeel Butt <shakeelb@google.com>
Signed-off-by: Mina Almasry <almasrymina@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/netmem.h | 41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)
 create mode 100644 include/net/netmem.h

(limited to 'include')

diff --git a/include/net/netmem.h b/include/net/netmem.h
new file mode 100644
index 000000000000..d8b810245c1d
--- /dev/null
+++ b/include/net/netmem.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ *	Network memory
+ *
+ *	Author:	Mina Almasry <almasrymina@google.com>
+ */
+
+#ifndef _NET_NETMEM_H
+#define _NET_NETMEM_H
+
+/**
+ * typedef netmem_ref - a nonexistent type marking a reference to generic
+ * network memory.
+ *
+ * A netmem_ref currently is always a reference to a struct page. This
+ * abstraction is introduced so support for new memory types can be added.
+ *
+ * Use the supplied helpers to obtain the underlying memory pointer and fields.
+ */
+typedef unsigned long __bitwise netmem_ref;
+
+/* This conversion fails (returns NULL) if the netmem_ref is not struct page
+ * backed.
+ *
+ * Currently struct page is the only possible netmem, and this helper never
+ * fails.
+ */
+static inline struct page *netmem_to_page(netmem_ref netmem)
+{
+	return (__force struct page *)netmem;
+}
+
+/* Converting from page to netmem is always safe, because a page can always be
+ * a netmem.
+ */
+static inline netmem_ref page_to_netmem(struct page *page)
+{
+	return (__force netmem_ref)page;
+}
+
+#endif /* _NET_NETMEM_H */
-- 
cgit v1.2.3


From 21d2e6737c9789aa9b23c8a4131cbca8260139fd Mon Sep 17 00:00:00 2001
From: Mina Almasry <almasrymina@google.com>
Date: Wed, 14 Feb 2024 14:34:03 -0800
Subject: net: add netmem to skb_frag_t

Use struct netmem* instead of page in skb_frag_t. Currently struct
netmem* is always a struct page underneath, but the abstraction
allows efforts to add support for skb frags not backed by pages.

There is unfortunately 1 instance where the skb_frag_t is assumed to be
a exactly a bio_vec in kcm. For this case, WARN_ON_ONCE and return error
before doing a cast.

Add skb[_frag]_fill_netmem_*() and skb_add_rx_frag_netmem() helpers so
that the API can be used to create netmem skbs.

Signed-off-by: Mina Almasry <almasrymina@google.com>
Acked-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/skbuff.h | 100 +++++++++++++++++++++++++++++++++++--------------
 net/core/skbuff.c      |  34 +++++++++++++----
 net/kcm/kcmsock.c      |   7 ++--
 3 files changed, 102 insertions(+), 39 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 696e7680656f..e3a2ed5d09ad 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -37,6 +37,7 @@
 #endif
 #include <net/net_debug.h>
 #include <net/dropreason-core.h>
+#include <net/netmem.h>
 
 /**
  * DOC: skb checksums
@@ -359,7 +360,11 @@ extern int sysctl_max_skb_frags;
  */
 #define GSO_BY_FRAGS	0xFFFF
 
-typedef struct bio_vec skb_frag_t;
+typedef struct skb_frag {
+	netmem_ref netmem;
+	unsigned int len;
+	unsigned int offset;
+} skb_frag_t;
 
 /**
  * skb_frag_size() - Returns the size of a skb fragment
@@ -367,7 +372,7 @@ typedef struct bio_vec skb_frag_t;
  */
 static inline unsigned int skb_frag_size(const skb_frag_t *frag)
 {
-	return frag->bv_len;
+	return frag->len;
 }
 
 /**
@@ -377,7 +382,7 @@ static inline unsigned int skb_frag_size(const skb_frag_t *frag)
  */
 static inline void skb_frag_size_set(skb_frag_t *frag, unsigned int size)
 {
-	frag->bv_len = size;
+	frag->len = size;
 }
 
 /**
@@ -387,7 +392,7 @@ static inline void skb_frag_size_set(skb_frag_t *frag, unsigned int size)
  */
 static inline void skb_frag_size_add(skb_frag_t *frag, int delta)
 {
-	frag->bv_len += delta;
+	frag->len += delta;
 }
 
 /**
@@ -397,7 +402,7 @@ static inline void skb_frag_size_add(skb_frag_t *frag, int delta)
  */
 static inline void skb_frag_size_sub(skb_frag_t *frag, int delta)
 {
-	frag->bv_len -= delta;
+	frag->len -= delta;
 }
 
 /**
@@ -417,7 +422,7 @@ static inline bool skb_frag_must_loop(struct page *p)
  *	skb_frag_foreach_page - loop over pages in a fragment
  *
  *	@f:		skb frag to operate on
- *	@f_off:		offset from start of f->bv_page
+ *	@f_off:		offset from start of f->netmem
  *	@f_len:		length from f_off to loop over
  *	@p:		(temp var) current page
  *	@p_off:		(temp var) offset from start of current page,
@@ -2429,22 +2434,37 @@ static inline unsigned int skb_pagelen(const struct sk_buff *skb)
 	return skb_headlen(skb) + __skb_pagelen(skb);
 }
 
+static inline void skb_frag_fill_netmem_desc(skb_frag_t *frag,
+					     netmem_ref netmem, int off,
+					     int size)
+{
+	frag->netmem = netmem;
+	frag->offset = off;
+	skb_frag_size_set(frag, size);
+}
+
 static inline void skb_frag_fill_page_desc(skb_frag_t *frag,
 					   struct page *page,
 					   int off, int size)
 {
-	frag->bv_page = page;
-	frag->bv_offset = off;
-	skb_frag_size_set(frag, size);
+	skb_frag_fill_netmem_desc(frag, page_to_netmem(page), off, size);
+}
+
+static inline void __skb_fill_netmem_desc_noacc(struct skb_shared_info *shinfo,
+						int i, netmem_ref netmem,
+						int off, int size)
+{
+	skb_frag_t *frag = &shinfo->frags[i];
+
+	skb_frag_fill_netmem_desc(frag, netmem, off, size);
 }
 
 static inline void __skb_fill_page_desc_noacc(struct skb_shared_info *shinfo,
 					      int i, struct page *page,
 					      int off, int size)
 {
-	skb_frag_t *frag = &shinfo->frags[i];
-
-	skb_frag_fill_page_desc(frag, page, off, size);
+	__skb_fill_netmem_desc_noacc(shinfo, i, page_to_netmem(page), off,
+				     size);
 }
 
 /**
@@ -2460,10 +2480,10 @@ static inline void skb_len_add(struct sk_buff *skb, int delta)
 }
 
 /**
- * __skb_fill_page_desc - initialise a paged fragment in an skb
+ * __skb_fill_netmem_desc - initialise a fragment in an skb
  * @skb: buffer containing fragment to be initialised
- * @i: paged fragment index to initialise
- * @page: the page to use for this fragment
+ * @i: fragment index to initialise
+ * @netmem: the netmem to use for this fragment
  * @off: the offset to the data with @page
  * @size: the length of the data
  *
@@ -2472,10 +2492,12 @@ static inline void skb_len_add(struct sk_buff *skb, int delta)
  *
  * Does not take any additional reference on the fragment.
  */
-static inline void __skb_fill_page_desc(struct sk_buff *skb, int i,
-					struct page *page, int off, int size)
+static inline void __skb_fill_netmem_desc(struct sk_buff *skb, int i,
+					  netmem_ref netmem, int off, int size)
 {
-	__skb_fill_page_desc_noacc(skb_shinfo(skb), i, page, off, size);
+	struct page *page = netmem_to_page(netmem);
+
+	__skb_fill_netmem_desc_noacc(skb_shinfo(skb), i, netmem, off, size);
 
 	/* Propagate page pfmemalloc to the skb if we can. The problem is
 	 * that not all callers have unique ownership of the page but rely
@@ -2483,7 +2505,20 @@ static inline void __skb_fill_page_desc(struct sk_buff *skb, int i,
 	 */
 	page = compound_head(page);
 	if (page_is_pfmemalloc(page))
-		skb->pfmemalloc	= true;
+		skb->pfmemalloc = true;
+}
+
+static inline void __skb_fill_page_desc(struct sk_buff *skb, int i,
+					struct page *page, int off, int size)
+{
+	__skb_fill_netmem_desc(skb, i, page_to_netmem(page), off, size);
+}
+
+static inline void skb_fill_netmem_desc(struct sk_buff *skb, int i,
+					netmem_ref netmem, int off, int size)
+{
+	__skb_fill_netmem_desc(skb, i, netmem, off, size);
+	skb_shinfo(skb)->nr_frags = i + 1;
 }
 
 /**
@@ -2503,8 +2538,7 @@ static inline void __skb_fill_page_desc(struct sk_buff *skb, int i,
 static inline void skb_fill_page_desc(struct sk_buff *skb, int i,
 				      struct page *page, int off, int size)
 {
-	__skb_fill_page_desc(skb, i, page, off, size);
-	skb_shinfo(skb)->nr_frags = i + 1;
+	skb_fill_netmem_desc(skb, i, page_to_netmem(page), off, size);
 }
 
 /**
@@ -2528,8 +2562,16 @@ static inline void skb_fill_page_desc_noacc(struct sk_buff *skb, int i,
 	shinfo->nr_frags = i + 1;
 }
 
-void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
-		     int size, unsigned int truesize);
+void skb_add_rx_frag_netmem(struct sk_buff *skb, int i, netmem_ref netmem,
+			    int off, int size, unsigned int truesize);
+
+static inline void skb_add_rx_frag(struct sk_buff *skb, int i,
+				   struct page *page, int off, int size,
+				   unsigned int truesize)
+{
+	skb_add_rx_frag_netmem(skb, i, page_to_netmem(page), off, size,
+			       truesize);
+}
 
 void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size,
 			  unsigned int truesize);
@@ -3378,7 +3420,7 @@ static inline void skb_propagate_pfmemalloc(const struct page *page,
  */
 static inline unsigned int skb_frag_off(const skb_frag_t *frag)
 {
-	return frag->bv_offset;
+	return frag->offset;
 }
 
 /**
@@ -3388,7 +3430,7 @@ static inline unsigned int skb_frag_off(const skb_frag_t *frag)
  */
 static inline void skb_frag_off_add(skb_frag_t *frag, int delta)
 {
-	frag->bv_offset += delta;
+	frag->offset += delta;
 }
 
 /**
@@ -3398,7 +3440,7 @@ static inline void skb_frag_off_add(skb_frag_t *frag, int delta)
  */
 static inline void skb_frag_off_set(skb_frag_t *frag, unsigned int offset)
 {
-	frag->bv_offset = offset;
+	frag->offset = offset;
 }
 
 /**
@@ -3409,7 +3451,7 @@ static inline void skb_frag_off_set(skb_frag_t *frag, unsigned int offset)
 static inline void skb_frag_off_copy(skb_frag_t *fragto,
 				     const skb_frag_t *fragfrom)
 {
-	fragto->bv_offset = fragfrom->bv_offset;
+	fragto->offset = fragfrom->offset;
 }
 
 /**
@@ -3420,7 +3462,7 @@ static inline void skb_frag_off_copy(skb_frag_t *fragto,
  */
 static inline struct page *skb_frag_page(const skb_frag_t *frag)
 {
-	return frag->bv_page;
+	return netmem_to_page(frag->netmem);
 }
 
 /**
@@ -3528,7 +3570,7 @@ static inline void *skb_frag_address_safe(const skb_frag_t *frag)
 static inline void skb_frag_page_copy(skb_frag_t *fragto,
 				      const skb_frag_t *fragfrom)
 {
-	fragto->bv_page = fragfrom->bv_page;
+	fragto->netmem = fragfrom->netmem;
 }
 
 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index b41856585c24..1434c422f76e 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -115,6 +115,24 @@ static struct kmem_cache *skb_small_head_cache __ro_after_init;
 int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
 EXPORT_SYMBOL(sysctl_max_skb_frags);
 
+/* kcm_write_msgs() relies on casting paged frags to bio_vec to use
+ * iov_iter_bvec(). These static asserts ensure the cast is valid is long as the
+ * netmem is a page.
+ */
+static_assert(offsetof(struct bio_vec, bv_page) ==
+	      offsetof(skb_frag_t, netmem));
+static_assert(sizeof_field(struct bio_vec, bv_page) ==
+	      sizeof_field(skb_frag_t, netmem));
+
+static_assert(offsetof(struct bio_vec, bv_len) == offsetof(skb_frag_t, len));
+static_assert(sizeof_field(struct bio_vec, bv_len) ==
+	      sizeof_field(skb_frag_t, len));
+
+static_assert(offsetof(struct bio_vec, bv_offset) ==
+	      offsetof(skb_frag_t, offset));
+static_assert(sizeof_field(struct bio_vec, bv_offset) ==
+	      sizeof_field(skb_frag_t, offset));
+
 #undef FN
 #define FN(reason) [SKB_DROP_REASON_##reason] = #reason,
 static const char * const drop_reasons[] = {
@@ -845,17 +863,17 @@ skb_fail:
 }
 EXPORT_SYMBOL(__napi_alloc_skb);
 
-void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
-		     int size, unsigned int truesize)
+void skb_add_rx_frag_netmem(struct sk_buff *skb, int i, netmem_ref netmem,
+			    int off, int size, unsigned int truesize)
 {
 	DEBUG_NET_WARN_ON_ONCE(size > truesize);
 
-	skb_fill_page_desc(skb, i, page, off, size);
+	skb_fill_netmem_desc(skb, i, netmem, off, size);
 	skb->len += size;
 	skb->data_len += size;
 	skb->truesize += truesize;
 }
-EXPORT_SYMBOL(skb_add_rx_frag);
+EXPORT_SYMBOL(skb_add_rx_frag_netmem);
 
 void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size,
 			  unsigned int truesize)
@@ -1999,10 +2017,11 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
 
 	/* skb frags point to kernel buffers */
 	for (i = 0; i < new_frags - 1; i++) {
-		__skb_fill_page_desc(skb, i, head, 0, psize);
+		__skb_fill_netmem_desc(skb, i, page_to_netmem(head), 0, psize);
 		head = (struct page *)page_private(head);
 	}
-	__skb_fill_page_desc(skb, new_frags - 1, head, 0, d_off);
+	__skb_fill_netmem_desc(skb, new_frags - 1, page_to_netmem(head), 0,
+			       d_off);
 	skb_shinfo(skb)->nr_frags = new_frags;
 
 release:
@@ -3740,7 +3759,8 @@ skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen)
 		if (plen) {
 			page = virt_to_head_page(from->head);
 			offset = from->data - (unsigned char *)page_address(page);
-			__skb_fill_page_desc(to, 0, page, offset, plen);
+			__skb_fill_netmem_desc(to, 0, page_to_netmem(page),
+					       offset, plen);
 			get_page(page);
 			j = 1;
 			len -= plen;
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index 1184d40167b8..73c200c5c8e4 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -627,7 +627,8 @@ retry:
 			skb = txm->frag_skb;
 		}
 
-		if (WARN_ON(!skb_shinfo(skb)->nr_frags)) {
+		if (WARN_ON(!skb_shinfo(skb)->nr_frags) ||
+		    WARN_ON_ONCE(!skb_frag_page(&skb_shinfo(skb)->frags[0]))) {
 			ret = -EINVAL;
 			goto out;
 		}
@@ -637,8 +638,8 @@ retry:
 			msize += skb_frag_size(&skb_shinfo(skb)->frags[i]);
 
 		iov_iter_bvec(&msg.msg_iter, ITER_SOURCE,
-			      skb_shinfo(skb)->frags, skb_shinfo(skb)->nr_frags,
-			      msize);
+			      (const struct bio_vec *)skb_shinfo(skb)->frags,
+			      skb_shinfo(skb)->nr_frags, msize);
 		iov_iter_advance(&msg.msg_iter, txm->frag_offset);
 
 		do {
-- 
cgit v1.2.3


From 5d4cc87414c5d11345c4b11d61377d351b5c28a2 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 16 Feb 2024 16:20:06 +0000
Subject: net: reorganize "struct sock" fields

Last major reorg happened in commit 9115e8cd2a0c ("net: reorganize
struct sock for better data locality")

Since then, many changes have been done.

Before SO_PEEK_OFF support is added to TCP, we need
to move sk_peek_off to a better location.

It is time to make another pass, and add six groups,
without explicit alignment.

- sock_write_rx (following sk_refcnt) read-write fields in rx path.
- sock_read_rx read-mostly fields in rx path.
- sock_read_rxtx read-mostly fields in both rx and tx paths.
- sock_write_rxtx read-write fields in both rx and tx paths.
- sock_write_tx read-write fields in tx paths.
- sock_read_tx read-mostly fields in tx paths.

Results on TCP_RR benchmarks seem to show a gain (4 to 5 %).

It is possible UDP needs a change, because sk_peek_off
shares a cache line with sk_receive_queue.
If this the case, we can exchange roles of sk->sk_receive
and up->reader_queue queues.

After this change, we have the following layout:

struct sock {
	struct sock_common         __sk_common;          /*     0  0x88 */
	/* --- cacheline 2 boundary (128 bytes) was 8 bytes ago --- */
	__u8                       __cacheline_group_begin__sock_write_rx[0]; /*  0x88     0 */
	atomic_t                   sk_drops;             /*  0x88   0x4 */
	__s32                      sk_peek_off;          /*  0x8c   0x4 */
	struct sk_buff_head        sk_error_queue;       /*  0x90  0x18 */
	struct sk_buff_head        sk_receive_queue;     /*  0xa8  0x18 */
	/* --- cacheline 3 boundary (192 bytes) --- */
	struct {
		atomic_t           rmem_alloc;           /*  0xc0   0x4 */
		int                len;                  /*  0xc4   0x4 */
		struct sk_buff *   head;                 /*  0xc8   0x8 */
		struct sk_buff *   tail;                 /*  0xd0   0x8 */
	} sk_backlog;                                    /*  0xc0  0x18 */
	struct {
		atomic_t                   rmem_alloc;           /*     0   0x4 */
		int                        len;                  /*   0x4   0x4 */
		struct sk_buff *           head;                 /*   0x8   0x8 */
		struct sk_buff *           tail;                 /*  0x10   0x8 */

		/* size: 24, cachelines: 1, members: 4 */
		/* last cacheline: 24 bytes */
	};

	__u8                       __cacheline_group_end__sock_write_rx[0]; /*  0xd8     0 */
	__u8                       __cacheline_group_begin__sock_read_rx[0]; /*  0xd8     0 */
	rcu *                      sk_rx_dst;            /*  0xd8   0x8 */
	int                        sk_rx_dst_ifindex;    /*  0xe0   0x4 */
	u32                        sk_rx_dst_cookie;     /*  0xe4   0x4 */
	unsigned int               sk_ll_usec;           /*  0xe8   0x4 */
	unsigned int               sk_napi_id;           /*  0xec   0x4 */
	u16                        sk_busy_poll_budget;  /*  0xf0   0x2 */
	u8                         sk_prefer_busy_poll;  /*  0xf2   0x1 */
	u8                         sk_userlocks;         /*  0xf3   0x1 */
	int                        sk_rcvbuf;            /*  0xf4   0x4 */
	rcu *                      sk_filter;            /*  0xf8   0x8 */
	/* --- cacheline 4 boundary (256 bytes) --- */
	union {
		rcu *              sk_wq;                /* 0x100   0x8 */
		struct socket_wq * sk_wq_raw;            /* 0x100   0x8 */
	};                                               /* 0x100   0x8 */
	union {
		rcu *                      sk_wq;                /*     0   0x8 */
		struct socket_wq *         sk_wq_raw;            /*     0   0x8 */
	};

	void                       (*sk_data_ready)(struct sock *); /* 0x108   0x8 */
	long                       sk_rcvtimeo;          /* 0x110   0x8 */
	int                        sk_rcvlowat;          /* 0x118   0x4 */
	__u8                       __cacheline_group_end__sock_read_rx[0]; /* 0x11c     0 */
	__u8                       __cacheline_group_begin__sock_read_rxtx[0]; /* 0x11c     0 */
	int                        sk_err;               /* 0x11c   0x4 */
	struct socket *            sk_socket;            /* 0x120   0x8 */
	struct mem_cgroup *        sk_memcg;             /* 0x128   0x8 */
	rcu *                      sk_policy[2];         /* 0x130  0x10 */
	/* --- cacheline 5 boundary (320 bytes) --- */
	__u8                       __cacheline_group_end__sock_read_rxtx[0]; /* 0x140     0 */
	__u8                       __cacheline_group_begin__sock_write_rxtx[0]; /* 0x140     0 */
	socket_lock_t              sk_lock;              /* 0x140  0x20 */
	u32                        sk_reserved_mem;      /* 0x160   0x4 */
	int                        sk_forward_alloc;     /* 0x164   0x4 */
	u32                        sk_tsflags;           /* 0x168   0x4 */
	__u8                       __cacheline_group_end__sock_write_rxtx[0]; /* 0x16c     0 */
	__u8                       __cacheline_group_begin__sock_write_tx[0]; /* 0x16c     0 */
	int                        sk_write_pending;     /* 0x16c   0x4 */
	atomic_t                   sk_omem_alloc;        /* 0x170   0x4 */
	int                        sk_sndbuf;            /* 0x174   0x4 */
	int                        sk_wmem_queued;       /* 0x178   0x4 */
	refcount_t                 sk_wmem_alloc;        /* 0x17c   0x4 */
	/* --- cacheline 6 boundary (384 bytes) --- */
	unsigned long              sk_tsq_flags;         /* 0x180   0x8 */
	union {
		struct sk_buff *   sk_send_head;         /* 0x188   0x8 */
		struct rb_root     tcp_rtx_queue;        /* 0x188   0x8 */
	};                                               /* 0x188   0x8 */
	union {
		struct sk_buff *           sk_send_head;         /*     0   0x8 */
		struct rb_root             tcp_rtx_queue;        /*     0   0x8 */
	};

	struct sk_buff_head        sk_write_queue;       /* 0x190  0x18 */
	u32                        sk_dst_pending_confirm; /* 0x1a8   0x4 */
	u32                        sk_pacing_status;     /* 0x1ac   0x4 */
	struct page_frag           sk_frag;              /* 0x1b0  0x10 */
	/* --- cacheline 7 boundary (448 bytes) --- */
	struct timer_list          sk_timer;             /* 0x1c0  0x28 */

	/* XXX last struct has 4 bytes of padding */

	unsigned long              sk_pacing_rate;       /* 0x1e8   0x8 */
	atomic_t                   sk_zckey;             /* 0x1f0   0x4 */
	atomic_t                   sk_tskey;             /* 0x1f4   0x4 */
	__u8                       __cacheline_group_end__sock_write_tx[0]; /* 0x1f8     0 */
	__u8                       __cacheline_group_begin__sock_read_tx[0]; /* 0x1f8     0 */
	unsigned long              sk_max_pacing_rate;   /* 0x1f8   0x8 */
	/* --- cacheline 8 boundary (512 bytes) --- */
	long                       sk_sndtimeo;          /* 0x200   0x8 */
	u32                        sk_priority;          /* 0x208   0x4 */
	u32                        sk_mark;              /* 0x20c   0x4 */
	rcu *                      sk_dst_cache;         /* 0x210   0x8 */
	netdev_features_t          sk_route_caps;        /* 0x218   0x8 */
	u16                        sk_gso_type;          /* 0x220   0x2 */
	u16                        sk_gso_max_segs;      /* 0x222   0x2 */
	unsigned int               sk_gso_max_size;      /* 0x224   0x4 */
	gfp_t                      sk_allocation;        /* 0x228   0x4 */
	u32                        sk_txhash;            /* 0x22c   0x4 */
	u8                         sk_pacing_shift;      /* 0x230   0x1 */
	bool                       sk_use_task_frag;     /* 0x231   0x1 */
	__u8                       __cacheline_group_end__sock_read_tx[0]; /* 0x232     0 */
	u8                         sk_gso_disabled:1;    /* 0x232: 0 0x1 */
	u8                         sk_kern_sock:1;       /* 0x232:0x1 0x1 */
	u8                         sk_no_check_tx:1;     /* 0x232:0x2 0x1 */
	u8                         sk_no_check_rx:1;     /* 0x232:0x3 0x1 */

	/* XXX 4 bits hole, try to pack */

	u8                         sk_shutdown;          /* 0x233   0x1 */
	u16                        sk_type;              /* 0x234   0x2 */
	u16                        sk_protocol;          /* 0x236   0x2 */
	unsigned long              sk_lingertime;        /* 0x238   0x8 */
	/* --- cacheline 9 boundary (576 bytes) --- */
	struct proto *             sk_prot_creator;      /* 0x240   0x8 */
	rwlock_t                   sk_callback_lock;     /* 0x248   0x8 */
	int                        sk_err_soft;          /* 0x250   0x4 */
	u32                        sk_ack_backlog;       /* 0x254   0x4 */
	u32                        sk_max_ack_backlog;   /* 0x258   0x4 */
	kuid_t                     sk_uid;               /* 0x25c   0x4 */
	spinlock_t                 sk_peer_lock;         /* 0x260   0x4 */
	int                        sk_bind_phc;          /* 0x264   0x4 */
	struct pid *               sk_peer_pid;          /* 0x268   0x8 */
	const struct cred  *       sk_peer_cred;         /* 0x270   0x8 */
	ktime_t                    sk_stamp;             /* 0x278   0x8 */
	/* --- cacheline 10 boundary (640 bytes) --- */
	int                        sk_disconnects;       /* 0x280   0x4 */
	u8                         sk_txrehash;          /* 0x284   0x1 */
	u8                         sk_clockid;           /* 0x285   0x1 */
	u8                         sk_txtime_deadline_mode:1; /* 0x286: 0 0x1 */
	u8                         sk_txtime_report_errors:1; /* 0x286:0x1 0x1 */
	u8                         sk_txtime_unused:6;   /* 0x286:0x2 0x1 */

	/* XXX 1 byte hole, try to pack */

	void *                     sk_user_data;         /* 0x288   0x8 */
	void *                     sk_security;          /* 0x290   0x8 */
	struct sock_cgroup_data    sk_cgrp_data;         /* 0x298   0x8 */
	void                       (*sk_state_change)(struct sock *); /* 0x2a0   0x8 */
	void                       (*sk_write_space)(struct sock *); /* 0x2a8   0x8 */
	void                       (*sk_error_report)(struct sock *); /* 0x2b0   0x8 */
	int                        (*sk_backlog_rcv)(struct sock *, struct sk_buff *); /* 0x2b8   0x8 */
	/* --- cacheline 11 boundary (704 bytes) --- */
	void                       (*sk_destruct)(struct sock *); /* 0x2c0   0x8 */
	rcu *                      sk_reuseport_cb;      /* 0x2c8   0x8 */
	rcu *                      sk_bpf_storage;       /* 0x2d0   0x8 */
	struct callback_head       sk_rcu __attribute__((__aligned__(8))); /* 0x2d8  0x10 */
	netns_tracker              ns_tracker;           /* 0x2e8   0x8 */

	/* size: 752, cachelines: 12, members: 105 */
	/* sum members: 749, holes: 1, sum holes: 1 */
	/* sum bitfield members: 12 bits, bit holes: 1, sum bit holes: 4 bits */
	/* paddings: 1, sum paddings: 4 */
	/* forced alignments: 1 */
	/* last cacheline: 48 bytes */
};

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Paolo Abeni <pabeni@redhat.com>
Link: https://lore.kernel.org/r/20240216162006.2342759-1-edumazet@google.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/sock.h | 108 ++++++++++++++++++++++++++++++-----------------------
 net/core/sock.c    |  62 ++++++++++++++++++++++++++++++
 2 files changed, 123 insertions(+), 47 deletions(-)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index a9d99a9c583f..796a902cf4c1 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -378,14 +378,10 @@ struct sock {
 #define sk_flags		__sk_common.skc_flags
 #define sk_rxhash		__sk_common.skc_rxhash
 
-	/* early demux fields */
-	struct dst_entry __rcu	*sk_rx_dst;
-	int			sk_rx_dst_ifindex;
-	u32			sk_rx_dst_cookie;
+	__cacheline_group_begin(sock_write_rx);
 
-	socket_lock_t		sk_lock;
 	atomic_t		sk_drops;
-	int			sk_rcvlowat;
+	__s32			sk_peek_off;
 	struct sk_buff_head	sk_error_queue;
 	struct sk_buff_head	sk_receive_queue;
 	/*
@@ -402,18 +398,24 @@ struct sock {
 		struct sk_buff	*head;
 		struct sk_buff	*tail;
 	} sk_backlog;
-
 #define sk_rmem_alloc sk_backlog.rmem_alloc
 
-	int			sk_forward_alloc;
-	u32			sk_reserved_mem;
+	__cacheline_group_end(sock_write_rx);
+
+	__cacheline_group_begin(sock_read_rx);
+	/* early demux fields */
+	struct dst_entry __rcu	*sk_rx_dst;
+	int			sk_rx_dst_ifindex;
+	u32			sk_rx_dst_cookie;
+
 #ifdef CONFIG_NET_RX_BUSY_POLL
 	unsigned int		sk_ll_usec;
-	/* ===== mostly read cache line ===== */
 	unsigned int		sk_napi_id;
+	u16			sk_busy_poll_budget;
+	u8			sk_prefer_busy_poll;
 #endif
+	u8			sk_userlocks;
 	int			sk_rcvbuf;
-	int			sk_disconnects;
 
 	struct sk_filter __rcu	*sk_filter;
 	union {
@@ -422,15 +424,33 @@ struct sock {
 		struct socket_wq	*sk_wq_raw;
 		/* public: */
 	};
+
+	void			(*sk_data_ready)(struct sock *sk);
+	long			sk_rcvtimeo;
+	int			sk_rcvlowat;
+	__cacheline_group_end(sock_read_rx);
+
+	__cacheline_group_begin(sock_read_rxtx);
+	int			sk_err;
+	struct socket		*sk_socket;
+	struct mem_cgroup	*sk_memcg;
 #ifdef CONFIG_XFRM
 	struct xfrm_policy __rcu *sk_policy[2];
 #endif
+	__cacheline_group_end(sock_read_rxtx);
 
-	struct dst_entry __rcu	*sk_dst_cache;
+	__cacheline_group_begin(sock_write_rxtx);
+	socket_lock_t		sk_lock;
+	u32			sk_reserved_mem;
+	int			sk_forward_alloc;
+	u32			sk_tsflags;
+	__cacheline_group_end(sock_write_rxtx);
+
+	__cacheline_group_begin(sock_write_tx);
+	int			sk_write_pending;
 	atomic_t		sk_omem_alloc;
 	int			sk_sndbuf;
 
-	/* ===== cache line for TX ===== */
 	int			sk_wmem_queued;
 	refcount_t		sk_wmem_alloc;
 	unsigned long		sk_tsq_flags;
@@ -439,22 +459,36 @@ struct sock {
 		struct rb_root	tcp_rtx_queue;
 	};
 	struct sk_buff_head	sk_write_queue;
-	__s32			sk_peek_off;
-	int			sk_write_pending;
-	__u32			sk_dst_pending_confirm;
+	u32			sk_dst_pending_confirm;
 	u32			sk_pacing_status; /* see enum sk_pacing */
-	long			sk_sndtimeo;
+	struct page_frag	sk_frag;
 	struct timer_list	sk_timer;
-	__u32			sk_priority;
-	__u32			sk_mark;
+
 	unsigned long		sk_pacing_rate; /* bytes per second */
+	atomic_t		sk_zckey;
+	atomic_t		sk_tskey;
+	__cacheline_group_end(sock_write_tx);
+
+	__cacheline_group_begin(sock_read_tx);
 	unsigned long		sk_max_pacing_rate;
-	struct page_frag	sk_frag;
+	long			sk_sndtimeo;
+	u32			sk_priority;
+	u32			sk_mark;
+	struct dst_entry __rcu	*sk_dst_cache;
 	netdev_features_t	sk_route_caps;
-	int			sk_gso_type;
+#ifdef CONFIG_SOCK_VALIDATE_XMIT
+	struct sk_buff*		(*sk_validate_xmit_skb)(struct sock *sk,
+							struct net_device *dev,
+							struct sk_buff *skb);
+#endif
+	u16			sk_gso_type;
+	u16			sk_gso_max_segs;
 	unsigned int		sk_gso_max_size;
 	gfp_t			sk_allocation;
-	__u32			sk_txhash;
+	u32			sk_txhash;
+	u8			sk_pacing_shift;
+	bool			sk_use_task_frag;
+	__cacheline_group_end(sock_read_tx);
 
 	/*
 	 * Because of non atomicity rules, all
@@ -463,64 +497,44 @@ struct sock {
 	u8			sk_gso_disabled : 1,
 				sk_kern_sock : 1,
 				sk_no_check_tx : 1,
-				sk_no_check_rx : 1,
-				sk_userlocks : 4;
-	u8			sk_pacing_shift;
+				sk_no_check_rx : 1;
+	u8			sk_shutdown;
 	u16			sk_type;
 	u16			sk_protocol;
-	u16			sk_gso_max_segs;
 	unsigned long	        sk_lingertime;
 	struct proto		*sk_prot_creator;
 	rwlock_t		sk_callback_lock;
-	int			sk_err,
-				sk_err_soft;
+	int			sk_err_soft;
 	u32			sk_ack_backlog;
 	u32			sk_max_ack_backlog;
 	kuid_t			sk_uid;
-	u8			sk_txrehash;
-#ifdef CONFIG_NET_RX_BUSY_POLL
-	u8			sk_prefer_busy_poll;
-	u16			sk_busy_poll_budget;
-#endif
 	spinlock_t		sk_peer_lock;
 	int			sk_bind_phc;
 	struct pid		*sk_peer_pid;
 	const struct cred	*sk_peer_cred;
 
-	long			sk_rcvtimeo;
 	ktime_t			sk_stamp;
 #if BITS_PER_LONG==32
 	seqlock_t		sk_stamp_seq;
 #endif
-	atomic_t		sk_tskey;
-	atomic_t		sk_zckey;
-	u32			sk_tsflags;
-	u8			sk_shutdown;
+	int			sk_disconnects;
 
+	u8			sk_txrehash;
 	u8			sk_clockid;
 	u8			sk_txtime_deadline_mode : 1,
 				sk_txtime_report_errors : 1,
 				sk_txtime_unused : 6;
-	bool			sk_use_task_frag;
 
-	struct socket		*sk_socket;
 	void			*sk_user_data;
 #ifdef CONFIG_SECURITY
 	void			*sk_security;
 #endif
 	struct sock_cgroup_data	sk_cgrp_data;
-	struct mem_cgroup	*sk_memcg;
 	void			(*sk_state_change)(struct sock *sk);
-	void			(*sk_data_ready)(struct sock *sk);
 	void			(*sk_write_space)(struct sock *sk);
 	void			(*sk_error_report)(struct sock *sk);
 	int			(*sk_backlog_rcv)(struct sock *sk,
 						  struct sk_buff *skb);
-#ifdef CONFIG_SOCK_VALIDATE_XMIT
-	struct sk_buff*		(*sk_validate_xmit_skb)(struct sock *sk,
-							struct net_device *dev,
-							struct sk_buff *skb);
-#endif
 	void                    (*sk_destruct)(struct sock *sk);
 	struct sock_reuseport __rcu	*sk_reuseport_cb;
 #ifdef CONFIG_BPF_SYSCALL
diff --git a/net/core/sock.c b/net/core/sock.c
index 88bf810394a5..3fef3407383e 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -4234,3 +4234,65 @@ int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
 	return sock_ioctl_out(sk, cmd, arg);
 }
 EXPORT_SYMBOL(sk_ioctl);
+
+static int __init sock_struct_check(void)
+{
+	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_drops);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_peek_off);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_error_queue);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_receive_queue);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_backlog);
+
+	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_ifindex);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_cookie);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvbuf);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_filter);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_wq);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_data_ready);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvtimeo);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvlowat);
+
+	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg);
+
+	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_forward_alloc);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_tsflags);
+
+	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_sndbuf);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_send_head);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_queue);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_pending);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_dst_pending_confirm);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_status);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_frag);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_timer);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_rate);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_zckey);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tskey);
+
+	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_max_pacing_rate);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag);
+	return 0;
+}
+
+core_initcall(sock_struct_check);
-- 
cgit v1.2.3


From 219eee9c0d16f1b754a8b85275854ab17df0850a Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 16 Feb 2024 12:36:57 +0100
Subject: net: skbuff: add overflow debug check to pull/push helpers

syzbot managed to trigger following splat:
BUG: KASAN: use-after-free in __skb_flow_dissect+0x4a3b/0x5e50
Read of size 1 at addr ffff888208a4000e by task a.out/2313
[..]
  __skb_flow_dissect+0x4a3b/0x5e50
  __skb_get_hash+0xb4/0x400
  ip_tunnel_xmit+0x77e/0x26f0
  ipip_tunnel_xmit+0x298/0x410
  ..

Analysis shows that the skb has a valid ->head, but bogus ->data
pointer.

skb->data gets its bogus value via the neigh layer, which does:

1556    __skb_pull(skb, skb_network_offset(skb));

... and the skb was already dodgy at this point:

skb_network_offset(skb) returns a negative value due to an
earlier overflow of skb->network_header (u16).  __skb_pull thus
"adjusts" skb->data by a huge offset, pointing outside skb->head
area.

Allow debug builds to splat when we try to pull/push more than
INT_MAX bytes.

After this, the syzkaller reproducer yields a more precise splat
before the flow dissector attempts to read off skb->data memory:

WARNING: CPU: 5 PID: 2313 at include/linux/skbuff.h:2653 neigh_connected_output+0x28e/0x400
  ip_finish_output2+0xb25/0xed0
  iptunnel_xmit+0x4ff/0x870
  ipgre_xmit+0x78e/0xbb0

Signed-off-by: Florian Westphal <fw@strlen.de>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://lore.kernel.org/r/20240216113700.23013-1-fw@strlen.de
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/skbuff.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index e3a2ed5d09ad..28c7cb7ce251 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2684,6 +2684,8 @@ static inline void skb_put_u8(struct sk_buff *skb, u8 val)
 void *skb_push(struct sk_buff *skb, unsigned int len);
 static inline void *__skb_push(struct sk_buff *skb, unsigned int len)
 {
+	DEBUG_NET_WARN_ON_ONCE(len > INT_MAX);
+
 	skb->data -= len;
 	skb->len  += len;
 	return skb->data;
@@ -2692,6 +2694,8 @@ static inline void *__skb_push(struct sk_buff *skb, unsigned int len)
 void *skb_pull(struct sk_buff *skb, unsigned int len);
 static inline void *__skb_pull(struct sk_buff *skb, unsigned int len)
 {
+	DEBUG_NET_WARN_ON_ONCE(len > INT_MAX);
+
 	skb->len -= len;
 	if (unlikely(skb->len < skb->data_len)) {
 #if defined(CONFIG_DEBUG_NET)
@@ -2716,6 +2720,8 @@ void *__pskb_pull_tail(struct sk_buff *skb, int delta);
 static inline enum skb_drop_reason
 pskb_may_pull_reason(struct sk_buff *skb, unsigned int len)
 {
+	DEBUG_NET_WARN_ON_ONCE(len > INT_MAX);
+
 	if (likely(len <= skb_headlen(skb)))
 		return SKB_NOT_DROPPED_YET;
 
-- 
cgit v1.2.3


From 2f3bfa8e30b5b4864a200be0dc2fb55d8e4b35e4 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Date: Sat, 17 Feb 2024 11:03:06 +0100
Subject: net: wan: framer: constify of_phandle_args in xlate

The xlate callbacks are supposed to translate of_phandle_args to proper
provider without modifying the of_phandle_args.  Make the argument
pointer to const for code safety and readability.

Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://lore.kernel.org/r/20240217100306.86740-1-krzysztof.kozlowski@linaro.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/wan/framer/framer-core.c   |  9 +++++----
 include/linux/framer/framer-provider.h | 14 +++++++-------
 2 files changed, 12 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wan/framer/framer-core.c b/drivers/net/wan/framer/framer-core.c
index c04dc88bda6c..33b358b99f70 100644
--- a/drivers/net/wan/framer/framer-core.c
+++ b/drivers/net/wan/framer/framer-core.c
@@ -384,7 +384,7 @@ static struct framer_provider *framer_provider_of_lookup(const struct device_nod
 	return ERR_PTR(-EPROBE_DEFER);
 }
 
-static struct framer *framer_of_get_from_provider(struct of_phandle_args *args)
+static struct framer *framer_of_get_from_provider(const struct of_phandle_args *args)
 {
 	struct framer_provider *framer_provider;
 	struct framer *framer;
@@ -735,7 +735,8 @@ EXPORT_SYMBOL_GPL(devm_framer_create);
  * should provide a custom of_xlate function that reads the *args* and returns
  * the appropriate framer.
  */
-struct framer *framer_provider_simple_of_xlate(struct device *dev, struct of_phandle_args *args)
+struct framer *framer_provider_simple_of_xlate(struct device *dev,
+					       const struct of_phandle_args *args)
 {
 	struct class_dev_iter iter;
 	struct framer *framer;
@@ -768,7 +769,7 @@ EXPORT_SYMBOL_GPL(framer_provider_simple_of_xlate);
 struct framer_provider *
 __framer_provider_of_register(struct device *dev, struct module *owner,
 			      struct framer *(*of_xlate)(struct device *dev,
-							 struct of_phandle_args *args))
+							 const struct of_phandle_args *args))
 {
 	struct framer_provider *framer_provider;
 
@@ -830,7 +831,7 @@ static void devm_framer_provider_of_unregister(struct device *dev, void *res)
 struct framer_provider *
 __devm_framer_provider_of_register(struct device *dev, struct module *owner,
 				   struct framer *(*of_xlate)(struct device *dev,
-							      struct of_phandle_args *args))
+							      const struct of_phandle_args *args))
 {
 	struct framer_provider **ptr, *framer_provider;
 
diff --git a/include/linux/framer/framer-provider.h b/include/linux/framer/framer-provider.h
index 782cd5fc83d5..f6fd2dd92591 100644
--- a/include/linux/framer/framer-provider.h
+++ b/include/linux/framer/framer-provider.h
@@ -93,7 +93,7 @@ struct framer_provider {
 	struct module		*owner;
 	struct list_head	list;
 	struct framer * (*of_xlate)(struct device *dev,
-				    struct of_phandle_args *args);
+				    const struct of_phandle_args *args);
 };
 
 static inline void framer_set_drvdata(struct framer *framer, void *data)
@@ -118,19 +118,19 @@ struct framer *devm_framer_create(struct device *dev, struct device_node *node,
 				  const struct framer_ops *ops);
 
 struct framer *framer_provider_simple_of_xlate(struct device *dev,
-					       struct of_phandle_args *args);
+					       const struct of_phandle_args *args);
 
 struct framer_provider *
 __framer_provider_of_register(struct device *dev, struct module *owner,
 			      struct framer *(*of_xlate)(struct device *dev,
-							 struct of_phandle_args *args));
+							 const struct of_phandle_args *args));
 
 void framer_provider_of_unregister(struct framer_provider *framer_provider);
 
 struct framer_provider *
 __devm_framer_provider_of_register(struct device *dev, struct module *owner,
 				   struct framer *(*of_xlate)(struct device *dev,
-							      struct of_phandle_args *args));
+							      const struct of_phandle_args *args));
 
 void framer_notify_status_change(struct framer *framer);
 
@@ -154,7 +154,7 @@ static inline struct framer *devm_framer_create(struct device *dev, struct devic
 }
 
 static inline struct framer *framer_provider_simple_of_xlate(struct device *dev,
-							     struct of_phandle_args *args)
+							     const struct of_phandle_args *args)
 {
 	return ERR_PTR(-ENOSYS);
 }
@@ -162,7 +162,7 @@ static inline struct framer *framer_provider_simple_of_xlate(struct device *dev,
 static inline struct framer_provider *
 __framer_provider_of_register(struct device *dev, struct module *owner,
 			      struct framer *(*of_xlate)(struct device *dev,
-							 struct of_phandle_args *args))
+							 const struct of_phandle_args *args))
 {
 	return ERR_PTR(-ENOSYS);
 }
@@ -174,7 +174,7 @@ void framer_provider_of_unregister(struct framer_provider *framer_provider)
 static inline struct framer_provider *
 __devm_framer_provider_of_register(struct device *dev, struct module *owner,
 				   struct framer *(*of_xlate)(struct device *dev,
-							      struct of_phandle_args *args))
+							      const struct of_phandle_args *args))
 {
 	return ERR_PTR(-ENOSYS);
 }
-- 
cgit v1.2.3


From 3f801968889459ecae1eab524b039676e6eaa319 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 14 Feb 2024 14:41:02 +0100
Subject: netfilter: move nf_reinject into nfnetlink_queue modules

No need to keep this in the core, move it to the nfnetlink_queue module.
nf_reroute is moved too, there were no other callers.

Signed-off-by: Florian Westphal <fw@strlen.de>
---
 include/linux/netfilter.h        |   1 -
 include/net/netfilter/nf_queue.h |   1 -
 net/netfilter/nf_queue.c         | 106 -----------------------------
 net/netfilter/nfnetlink_queue.c  | 142 +++++++++++++++++++++++++++++++++++++++
 net/netfilter/utils.c            |  37 ----------
 5 files changed, 142 insertions(+), 145 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 80900d910992..ffb5e0297eb5 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -370,7 +370,6 @@ __sum16 nf_checksum_partial(struct sk_buff *skb, unsigned int hook,
 			    u_int8_t protocol, unsigned short family);
 int nf_route(struct net *net, struct dst_entry **dst, struct flowi *fl,
 	     bool strict, unsigned short family);
-int nf_reroute(struct sk_buff *skb, struct nf_queue_entry *entry);
 
 #include <net/flow.h>
 
diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h
index c81021ab07aa..4aeffddb7586 100644
--- a/include/net/netfilter/nf_queue.h
+++ b/include/net/netfilter/nf_queue.h
@@ -35,7 +35,6 @@ struct nf_queue_handler {
 
 void nf_register_queue_handler(const struct nf_queue_handler *qh);
 void nf_unregister_queue_handler(void);
-void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict);
 
 bool nf_queue_entry_get_refs(struct nf_queue_entry *entry);
 void nf_queue_entry_free(struct nf_queue_entry *entry);
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index e2f334f70281..7f12e56e6e52 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -248,109 +248,3 @@ int nf_queue(struct sk_buff *skb, struct nf_hook_state *state,
 	return 0;
 }
 EXPORT_SYMBOL_GPL(nf_queue);
-
-static unsigned int nf_iterate(struct sk_buff *skb,
-			       struct nf_hook_state *state,
-			       const struct nf_hook_entries *hooks,
-			       unsigned int *index)
-{
-	const struct nf_hook_entry *hook;
-	unsigned int verdict, i = *index;
-
-	while (i < hooks->num_hook_entries) {
-		hook = &hooks->hooks[i];
-repeat:
-		verdict = nf_hook_entry_hookfn(hook, skb, state);
-		if (verdict != NF_ACCEPT) {
-			*index = i;
-			if (verdict != NF_REPEAT)
-				return verdict;
-			goto repeat;
-		}
-		i++;
-	}
-
-	*index = i;
-	return NF_ACCEPT;
-}
-
-static struct nf_hook_entries *nf_hook_entries_head(const struct net *net, u8 pf, u8 hooknum)
-{
-	switch (pf) {
-#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE
-	case NFPROTO_BRIDGE:
-		return rcu_dereference(net->nf.hooks_bridge[hooknum]);
-#endif
-	case NFPROTO_IPV4:
-		return rcu_dereference(net->nf.hooks_ipv4[hooknum]);
-	case NFPROTO_IPV6:
-		return rcu_dereference(net->nf.hooks_ipv6[hooknum]);
-	default:
-		WARN_ON_ONCE(1);
-		return NULL;
-	}
-
-	return NULL;
-}
-
-/* Caller must hold rcu read-side lock */
-void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
-{
-	const struct nf_hook_entry *hook_entry;
-	const struct nf_hook_entries *hooks;
-	struct sk_buff *skb = entry->skb;
-	const struct net *net;
-	unsigned int i;
-	int err;
-	u8 pf;
-
-	net = entry->state.net;
-	pf = entry->state.pf;
-
-	hooks = nf_hook_entries_head(net, pf, entry->state.hook);
-
-	i = entry->hook_index;
-	if (WARN_ON_ONCE(!hooks || i >= hooks->num_hook_entries)) {
-		kfree_skb(skb);
-		nf_queue_entry_free(entry);
-		return;
-	}
-
-	hook_entry = &hooks->hooks[i];
-
-	/* Continue traversal iff userspace said ok... */
-	if (verdict == NF_REPEAT)
-		verdict = nf_hook_entry_hookfn(hook_entry, skb, &entry->state);
-
-	if (verdict == NF_ACCEPT) {
-		if (nf_reroute(skb, entry) < 0)
-			verdict = NF_DROP;
-	}
-
-	if (verdict == NF_ACCEPT) {
-next_hook:
-		++i;
-		verdict = nf_iterate(skb, &entry->state, hooks, &i);
-	}
-
-	switch (verdict & NF_VERDICT_MASK) {
-	case NF_ACCEPT:
-	case NF_STOP:
-		local_bh_disable();
-		entry->state.okfn(entry->state.net, entry->state.sk, skb);
-		local_bh_enable();
-		break;
-	case NF_QUEUE:
-		err = nf_queue(skb, &entry->state, i, verdict);
-		if (err == 1)
-			goto next_hook;
-		break;
-	case NF_STOLEN:
-		break;
-	default:
-		kfree_skb(skb);
-	}
-
-	nf_queue_entry_free(entry);
-}
-EXPORT_SYMBOL(nf_reinject);
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 5cf38fc0a366..00f4bd21c59b 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -225,6 +225,148 @@ find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id)
 	return entry;
 }
 
+static unsigned int nf_iterate(struct sk_buff *skb,
+			       struct nf_hook_state *state,
+			       const struct nf_hook_entries *hooks,
+			       unsigned int *index)
+{
+	const struct nf_hook_entry *hook;
+	unsigned int verdict, i = *index;
+
+	while (i < hooks->num_hook_entries) {
+		hook = &hooks->hooks[i];
+repeat:
+		verdict = nf_hook_entry_hookfn(hook, skb, state);
+		if (verdict != NF_ACCEPT) {
+			*index = i;
+			if (verdict != NF_REPEAT)
+				return verdict;
+			goto repeat;
+		}
+		i++;
+	}
+
+	*index = i;
+	return NF_ACCEPT;
+}
+
+static struct nf_hook_entries *nf_hook_entries_head(const struct net *net, u8 pf, u8 hooknum)
+{
+	switch (pf) {
+#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE
+	case NFPROTO_BRIDGE:
+		return rcu_dereference(net->nf.hooks_bridge[hooknum]);
+#endif
+	case NFPROTO_IPV4:
+		return rcu_dereference(net->nf.hooks_ipv4[hooknum]);
+	case NFPROTO_IPV6:
+		return rcu_dereference(net->nf.hooks_ipv6[hooknum]);
+	default:
+		WARN_ON_ONCE(1);
+		return NULL;
+	}
+
+	return NULL;
+}
+
+static int nf_ip_reroute(struct sk_buff *skb, const struct nf_queue_entry *entry)
+{
+#ifdef CONFIG_INET
+	const struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry);
+
+	if (entry->state.hook == NF_INET_LOCAL_OUT) {
+		const struct iphdr *iph = ip_hdr(skb);
+
+		if (!(iph->tos == rt_info->tos &&
+		      skb->mark == rt_info->mark &&
+		      iph->daddr == rt_info->daddr &&
+		      iph->saddr == rt_info->saddr))
+			return ip_route_me_harder(entry->state.net, entry->state.sk,
+						  skb, RTN_UNSPEC);
+	}
+#endif
+	return 0;
+}
+
+static int nf_reroute(struct sk_buff *skb, struct nf_queue_entry *entry)
+{
+	const struct nf_ipv6_ops *v6ops;
+	int ret = 0;
+
+	switch (entry->state.pf) {
+	case AF_INET:
+		ret = nf_ip_reroute(skb, entry);
+		break;
+	case AF_INET6:
+		v6ops = rcu_dereference(nf_ipv6_ops);
+		if (v6ops)
+			ret = v6ops->reroute(skb, entry);
+		break;
+	}
+	return ret;
+}
+
+/* caller must hold rcu read-side lock */
+static void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
+{
+	const struct nf_hook_entry *hook_entry;
+	const struct nf_hook_entries *hooks;
+	struct sk_buff *skb = entry->skb;
+	const struct net *net;
+	unsigned int i;
+	int err;
+	u8 pf;
+
+	net = entry->state.net;
+	pf = entry->state.pf;
+
+	hooks = nf_hook_entries_head(net, pf, entry->state.hook);
+
+	i = entry->hook_index;
+	if (WARN_ON_ONCE(!hooks || i >= hooks->num_hook_entries)) {
+		kfree_skb_reason(skb, SKB_DROP_REASON_NETFILTER_DROP);
+		nf_queue_entry_free(entry);
+		return;
+	}
+
+	hook_entry = &hooks->hooks[i];
+
+	/* Continue traversal iff userspace said ok... */
+	if (verdict == NF_REPEAT)
+		verdict = nf_hook_entry_hookfn(hook_entry, skb, &entry->state);
+
+	if (verdict == NF_ACCEPT) {
+		if (nf_reroute(skb, entry) < 0)
+			verdict = NF_DROP;
+	}
+
+	if (verdict == NF_ACCEPT) {
+next_hook:
+		++i;
+		verdict = nf_iterate(skb, &entry->state, hooks, &i);
+	}
+
+	switch (verdict & NF_VERDICT_MASK) {
+	case NF_ACCEPT:
+	case NF_STOP:
+		local_bh_disable();
+		entry->state.okfn(entry->state.net, entry->state.sk, skb);
+		local_bh_enable();
+		break;
+	case NF_QUEUE:
+		err = nf_queue(skb, &entry->state, i, verdict);
+		if (err == 1)
+			goto next_hook;
+		break;
+	case NF_STOLEN:
+		break;
+	default:
+		kfree_skb(skb);
+	}
+
+	nf_queue_entry_free(entry);
+}
+
 static void nfqnl_reinject(struct nf_queue_entry *entry, unsigned int verdict)
 {
 	const struct nf_ct_hook *ct_hook;
diff --git a/net/netfilter/utils.c b/net/netfilter/utils.c
index acef4155f0da..008419db815a 100644
--- a/net/netfilter/utils.c
+++ b/net/netfilter/utils.c
@@ -179,43 +179,6 @@ int nf_route(struct net *net, struct dst_entry **dst, struct flowi *fl,
 }
 EXPORT_SYMBOL_GPL(nf_route);
 
-static int nf_ip_reroute(struct sk_buff *skb, const struct nf_queue_entry *entry)
-{
-#ifdef CONFIG_INET
-	const struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry);
-
-	if (entry->state.hook == NF_INET_LOCAL_OUT) {
-		const struct iphdr *iph = ip_hdr(skb);
-
-		if (!(iph->tos == rt_info->tos &&
-		      skb->mark == rt_info->mark &&
-		      iph->daddr == rt_info->daddr &&
-		      iph->saddr == rt_info->saddr))
-			return ip_route_me_harder(entry->state.net, entry->state.sk,
-						  skb, RTN_UNSPEC);
-	}
-#endif
-	return 0;
-}
-
-int nf_reroute(struct sk_buff *skb, struct nf_queue_entry *entry)
-{
-	const struct nf_ipv6_ops *v6ops;
-	int ret = 0;
-
-	switch (entry->state.pf) {
-	case AF_INET:
-		ret = nf_ip_reroute(skb, entry);
-		break;
-	case AF_INET6:
-		v6ops = rcu_dereference(nf_ipv6_ops);
-		if (v6ops)
-			ret = v6ops->reroute(skb, entry);
-		break;
-	}
-	return ret;
-}
-
 /* Only get and check the lengths, not do any hop-by-hop stuff. */
 int nf_ip6_check_hbh_len(struct sk_buff *skb, u32 *plen)
 {
-- 
cgit v1.2.3


From 78b88ef392c1ccc189c74cf73c179cf59d23a258 Mon Sep 17 00:00:00 2001
From: Simon Horman <horms@kernel.org>
Date: Mon, 19 Feb 2024 17:45:48 +0000
Subject: net: wan: framer: remove children from struct framer_ops kdoc

Remove documentation of non-existent children field
from the Kernel doc for struct framer_ops.

Introduced by 82c944d05b1a ("net: wan: Add framer framework support")

Signed-off-by: Simon Horman <horms@kernel.org>
Acked-by: Herve Codina <herve.codina@bootlin.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/framer/framer-provider.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/framer/framer-provider.h b/include/linux/framer/framer-provider.h
index f6fd2dd92591..9724d4b44b9c 100644
--- a/include/linux/framer/framer-provider.h
+++ b/include/linux/framer/framer-provider.h
@@ -83,7 +83,6 @@ struct framer_ops {
 /**
  * struct framer_provider - represents the framer provider
  * @dev: framer provider device
- * @children: can be used to override the default (dev->of_node) child node
  * @owner: the module owner having of_xlate
  * @list: to maintain a linked list of framer providers
  * @of_xlate: function pointer to obtain framer instance from framer pointer
-- 
cgit v1.2.3


From ee975351cf0c2a11cdf97eae58265c126cb32850 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <florian.fainelli@broadcom.com>
Date: Mon, 19 Feb 2024 12:40:51 -0800
Subject: net: mdio: mdio-bcm-unimac: Manage clock around I/O accesses

Up until now we have managed not to have the mdio-bcm-unimac manage its
clock except during probe and suspend/resume. This works most of the
time, except where it does not.

With a fully modular build, we can get into a situation whereby the
GENET driver is fully registered, and so is the mdio-bcm-unimac driver,
however the Ethernet PHY driver is not yet, because it depends on a
resource that is not yet available (e.g.: GPIO provider). In that state,
the network device is not usable yet, and so to conserve power, the
GENET driver will have turned off its "main" clock which feeds its MDIO
controller.

When the PHY driver finally probes however, we make an access to the PHY
registers to e.g.: disable interrupts, and this causes a bus error
within the MDIO controller space because the MDIO controller clock(s)
are turned off.

To remedy that, we manage the clock around all of the I/O accesses to
the hardware which are done exclusively during read, write and clock
divider configuration.

This ensures that the register space is accessible, and this also
ensures that there are not unnecessarily elevated reference counts
keeping the clocks active when the network device is administratively
turned off. It would be the case with the previous way of managing the
clock.

Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: Florian Fainelli <florian.fainelli@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/mdio/mdio-bcm-unimac.c            | 93 ++++++++++++++-------------
 include/linux/platform_data/mdio-bcm-unimac.h |  3 +
 2 files changed, 53 insertions(+), 43 deletions(-)

(limited to 'include')

diff --git a/drivers/net/mdio/mdio-bcm-unimac.c b/drivers/net/mdio/mdio-bcm-unimac.c
index 68f8ee0ec8ba..6fe08427fdd4 100644
--- a/drivers/net/mdio/mdio-bcm-unimac.c
+++ b/drivers/net/mdio/mdio-bcm-unimac.c
@@ -94,6 +94,10 @@ static int unimac_mdio_read(struct mii_bus *bus, int phy_id, int reg)
 	int ret;
 	u32 cmd;
 
+	ret = clk_prepare_enable(priv->clk);
+	if (ret)
+		return ret;
+
 	/* Prepare the read operation */
 	cmd = MDIO_RD | (phy_id << MDIO_PMD_SHIFT) | (reg << MDIO_REG_SHIFT);
 	unimac_mdio_writel(priv, cmd, MDIO_CMD);
@@ -103,7 +107,7 @@ static int unimac_mdio_read(struct mii_bus *bus, int phy_id, int reg)
 
 	ret = priv->wait_func(priv->wait_func_data);
 	if (ret)
-		return ret;
+		goto out;
 
 	cmd = unimac_mdio_readl(priv, MDIO_CMD);
 
@@ -112,10 +116,15 @@ static int unimac_mdio_read(struct mii_bus *bus, int phy_id, int reg)
 	 * that condition here and ignore the MDIO controller read failure
 	 * indication.
 	 */
-	if (!(bus->phy_ignore_ta_mask & 1 << phy_id) && (cmd & MDIO_READ_FAIL))
-		return -EIO;
+	if (!(bus->phy_ignore_ta_mask & 1 << phy_id) && (cmd & MDIO_READ_FAIL)) {
+		ret = -EIO;
+		goto out;
+	}
 
-	return cmd & 0xffff;
+	ret = cmd & 0xffff;
+out:
+	clk_disable_unprepare(priv->clk);
+	return ret;
 }
 
 static int unimac_mdio_write(struct mii_bus *bus, int phy_id,
@@ -123,6 +132,11 @@ static int unimac_mdio_write(struct mii_bus *bus, int phy_id,
 {
 	struct unimac_mdio_priv *priv = bus->priv;
 	u32 cmd;
+	int ret;
+
+	ret = clk_prepare_enable(priv->clk);
+	if (ret)
+		return ret;
 
 	/* Prepare the write operation */
 	cmd = MDIO_WR | (phy_id << MDIO_PMD_SHIFT) |
@@ -131,7 +145,10 @@ static int unimac_mdio_write(struct mii_bus *bus, int phy_id,
 
 	unimac_mdio_start(priv);
 
-	return priv->wait_func(priv->wait_func_data);
+	ret = priv->wait_func(priv->wait_func_data);
+	clk_disable_unprepare(priv->clk);
+
+	return ret;
 }
 
 /* Workaround for integrated BCM7xxx Gigabit PHYs which have a problem with
@@ -178,14 +195,19 @@ static int unimac_mdio_reset(struct mii_bus *bus)
 	return 0;
 }
 
-static void unimac_mdio_clk_set(struct unimac_mdio_priv *priv)
+static int unimac_mdio_clk_set(struct unimac_mdio_priv *priv)
 {
 	unsigned long rate;
 	u32 reg, div;
+	int ret;
 
 	/* Keep the hardware default values */
 	if (!priv->clk_freq)
-		return;
+		return 0;
+
+	ret = clk_prepare_enable(priv->clk);
+	if (ret)
+		return ret;
 
 	if (!priv->clk)
 		rate = 250000000;
@@ -195,7 +217,8 @@ static void unimac_mdio_clk_set(struct unimac_mdio_priv *priv)
 	div = (rate / (2 * priv->clk_freq)) - 1;
 	if (div & ~MDIO_CLK_DIV_MASK) {
 		pr_warn("Incorrect MDIO clock frequency, ignoring\n");
-		return;
+		ret = 0;
+		goto out;
 	}
 
 	/* The MDIO clock is the reference clock (typically 250Mhz) divided by
@@ -205,6 +228,9 @@ static void unimac_mdio_clk_set(struct unimac_mdio_priv *priv)
 	reg &= ~(MDIO_CLK_DIV_MASK << MDIO_CLK_DIV_SHIFT);
 	reg |= div << MDIO_CLK_DIV_SHIFT;
 	unimac_mdio_writel(priv, reg, MDIO_CFG);
+out:
+	clk_disable_unprepare(priv->clk);
+	return ret;
 }
 
 static int unimac_mdio_probe(struct platform_device *pdev)
@@ -235,24 +261,12 @@ static int unimac_mdio_probe(struct platform_device *pdev)
 		return -ENOMEM;
 	}
 
-	priv->clk = devm_clk_get_optional(&pdev->dev, NULL);
-	if (IS_ERR(priv->clk))
-		return PTR_ERR(priv->clk);
-
-	ret = clk_prepare_enable(priv->clk);
-	if (ret)
-		return ret;
-
 	if (of_property_read_u32(np, "clock-frequency", &priv->clk_freq))
 		priv->clk_freq = 0;
 
-	unimac_mdio_clk_set(priv);
-
 	priv->mii_bus = mdiobus_alloc();
-	if (!priv->mii_bus) {
-		ret = -ENOMEM;
-		goto out_clk_disable;
-	}
+	if (!priv->mii_bus)
+		return -ENOMEM;
 
 	bus = priv->mii_bus;
 	bus->priv = priv;
@@ -261,17 +275,29 @@ static int unimac_mdio_probe(struct platform_device *pdev)
 		priv->wait_func = pdata->wait_func;
 		priv->wait_func_data = pdata->wait_func_data;
 		bus->phy_mask = ~pdata->phy_mask;
+		priv->clk = pdata->clk;
 	} else {
 		bus->name = "unimac MII bus";
 		priv->wait_func_data = priv;
 		priv->wait_func = unimac_mdio_poll;
+		priv->clk = devm_clk_get_optional(&pdev->dev, NULL);
+	}
+
+	if (IS_ERR(priv->clk)) {
+		ret = PTR_ERR(priv->clk);
+		goto out_mdio_free;
 	}
+
 	bus->parent = &pdev->dev;
 	bus->read = unimac_mdio_read;
 	bus->write = unimac_mdio_write;
 	bus->reset = unimac_mdio_reset;
 	snprintf(bus->id, MII_BUS_ID_SIZE, "%s-%d", pdev->name, pdev->id);
 
+	ret = unimac_mdio_clk_set(priv);
+	if (ret)
+		goto out_mdio_free;
+
 	ret = of_mdiobus_register(bus, np);
 	if (ret) {
 		dev_err(&pdev->dev, "MDIO bus registration failed\n");
@@ -286,8 +312,6 @@ static int unimac_mdio_probe(struct platform_device *pdev)
 
 out_mdio_free:
 	mdiobus_free(bus);
-out_clk_disable:
-	clk_disable_unprepare(priv->clk);
 	return ret;
 }
 
@@ -297,34 +321,17 @@ static void unimac_mdio_remove(struct platform_device *pdev)
 
 	mdiobus_unregister(priv->mii_bus);
 	mdiobus_free(priv->mii_bus);
-	clk_disable_unprepare(priv->clk);
-}
-
-static int __maybe_unused unimac_mdio_suspend(struct device *d)
-{
-	struct unimac_mdio_priv *priv = dev_get_drvdata(d);
-
-	clk_disable_unprepare(priv->clk);
-
-	return 0;
 }
 
 static int __maybe_unused unimac_mdio_resume(struct device *d)
 {
 	struct unimac_mdio_priv *priv = dev_get_drvdata(d);
-	int ret;
 
-	ret = clk_prepare_enable(priv->clk);
-	if (ret)
-		return ret;
-
-	unimac_mdio_clk_set(priv);
-
-	return 0;
+	return unimac_mdio_clk_set(priv);
 }
 
 static SIMPLE_DEV_PM_OPS(unimac_mdio_pm_ops,
-			 unimac_mdio_suspend, unimac_mdio_resume);
+			 NULL, unimac_mdio_resume);
 
 static const struct of_device_id unimac_mdio_ids[] = {
 	{ .compatible = "brcm,asp-v2.1-mdio", },
diff --git a/include/linux/platform_data/mdio-bcm-unimac.h b/include/linux/platform_data/mdio-bcm-unimac.h
index 8a5f9f0b2c52..724e1f57b81f 100644
--- a/include/linux/platform_data/mdio-bcm-unimac.h
+++ b/include/linux/platform_data/mdio-bcm-unimac.h
@@ -1,11 +1,14 @@
 #ifndef __MDIO_BCM_UNIMAC_PDATA_H
 #define __MDIO_BCM_UNIMAC_PDATA_H
 
+struct clk;
+
 struct unimac_mdio_pdata {
 	u32 phy_mask;
 	int (*wait_func)(void *data);
 	void *wait_func_data;
 	const char *bus_name;
+	struct clk *clk;
 };
 
 #define UNIMAC_MDIO_DRV_NAME	"unimac-mdio"
-- 
cgit v1.2.3


From 6030b3a469f8936d7f0f928e788f12a4fe14a4ca Mon Sep 17 00:00:00 2001
From: Aditya Kumar Singh <quic_adisi@quicinc.com>
Date: Fri, 16 Feb 2024 20:16:20 +0530
Subject: wifi: mac80211: check beacon countdown is complete on per link basis

Currently, function to check if beacon countdown is complete uses deflink
to fetch the beacon and check the counter. However, with MLO, there is
a need to check the counter for the beacon in a particular link.

Add support to use link_id in order to fetch the beacon from a particular
link data.

Signed-off-by: Aditya Kumar Singh <quic_adisi@quicinc.com>
Link: https://msgid.link/20240216144621.514385-2-quic_adisi@quicinc.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/ath/ath10k/mac.c                 |  2 +-
 drivers/net/wireless/ath/ath10k/wmi.c                 |  2 +-
 drivers/net/wireless/ath/ath11k/mac.c                 |  2 +-
 drivers/net/wireless/ath/ath9k/beacon.c               |  2 +-
 drivers/net/wireless/ath/ath9k/htc_drv_beacon.c       |  2 +-
 drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c     |  2 +-
 drivers/net/wireless/intel/iwlwifi/mvm/time-event.c   |  2 +-
 drivers/net/wireless/mediatek/mt76/mac80211.c         |  4 ++--
 drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c |  2 +-
 drivers/net/wireless/virtual/mac80211_hwsim.c         |  2 +-
 include/net/mac80211.h                                |  4 +++-
 net/mac80211/tx.c                                     | 14 ++++++++++++--
 12 files changed, 26 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c
index 41053219ee95..e322b528baaf 100644
--- a/drivers/net/wireless/ath/ath10k/mac.c
+++ b/drivers/net/wireless/ath/ath10k/mac.c
@@ -2034,7 +2034,7 @@ static void ath10k_mac_vif_ap_csa_count_down(struct ath10k_vif *arvif)
 	if (!arvif->is_up)
 		return;
 
-	if (!ieee80211_beacon_cntdwn_is_complete(vif)) {
+	if (!ieee80211_beacon_cntdwn_is_complete(vif, 0)) {
 		ieee80211_beacon_update_cntdwn(vif, 0);
 
 		ret = ath10k_mac_setup_bcn_tmpl(arvif);
diff --git a/drivers/net/wireless/ath/ath10k/wmi.c b/drivers/net/wireless/ath/ath10k/wmi.c
index ddf15717d504..2e9661f4bea8 100644
--- a/drivers/net/wireless/ath/ath10k/wmi.c
+++ b/drivers/net/wireless/ath/ath10k/wmi.c
@@ -3884,7 +3884,7 @@ void ath10k_wmi_event_host_swba(struct ath10k *ar, struct sk_buff *skb)
 		 * actual channel switch is done
 		 */
 		if (arvif->vif->bss_conf.csa_active &&
-		    ieee80211_beacon_cntdwn_is_complete(arvif->vif)) {
+		    ieee80211_beacon_cntdwn_is_complete(arvif->vif, 0)) {
 			ieee80211_csa_finish(arvif->vif, 0);
 			continue;
 		}
diff --git a/drivers/net/wireless/ath/ath11k/mac.c b/drivers/net/wireless/ath/ath11k/mac.c
index f7cab50bdfd1..a1e2729582e8 100644
--- a/drivers/net/wireless/ath/ath11k/mac.c
+++ b/drivers/net/wireless/ath/ath11k/mac.c
@@ -1577,7 +1577,7 @@ void ath11k_mac_bcn_tx_event(struct ath11k_vif *arvif)
 		return;
 
 	if (vif->bss_conf.color_change_active &&
-	    ieee80211_beacon_cntdwn_is_complete(vif)) {
+	    ieee80211_beacon_cntdwn_is_complete(vif, 0)) {
 		arvif->bcca_zero_sent = true;
 		ieee80211_color_change_finish(vif);
 		return;
diff --git a/drivers/net/wireless/ath/ath9k/beacon.c b/drivers/net/wireless/ath/ath9k/beacon.c
index 4e48407138b2..b399a7926ef5 100644
--- a/drivers/net/wireless/ath/ath9k/beacon.c
+++ b/drivers/net/wireless/ath/ath9k/beacon.c
@@ -365,7 +365,7 @@ bool ath9k_csa_is_finished(struct ath_softc *sc, struct ieee80211_vif *vif)
 	if (!vif || !vif->bss_conf.csa_active)
 		return false;
 
-	if (!ieee80211_beacon_cntdwn_is_complete(vif))
+	if (!ieee80211_beacon_cntdwn_is_complete(vif, 0))
 		return false;
 
 	ieee80211_csa_finish(vif, 0);
diff --git a/drivers/net/wireless/ath/ath9k/htc_drv_beacon.c b/drivers/net/wireless/ath/ath9k/htc_drv_beacon.c
index 8179d35dc310..547634f82183 100644
--- a/drivers/net/wireless/ath/ath9k/htc_drv_beacon.c
+++ b/drivers/net/wireless/ath/ath9k/htc_drv_beacon.c
@@ -514,7 +514,7 @@ bool ath9k_htc_csa_is_finished(struct ath9k_htc_priv *priv)
 	if (!vif || !vif->bss_conf.csa_active)
 		return false;
 
-	if (!ieee80211_beacon_cntdwn_is_complete(vif))
+	if (!ieee80211_beacon_cntdwn_is_complete(vif, 0))
 		return false;
 
 	ieee80211_csa_finish(vif, 0);
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c
index cc592e288188..123fe9bba982 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c
@@ -1466,7 +1466,7 @@ static void iwl_mvm_csa_count_down(struct iwl_mvm *mvm,
 
 	mvmvif->csa_countdown = true;
 
-	if (!ieee80211_beacon_cntdwn_is_complete(csa_vif)) {
+	if (!ieee80211_beacon_cntdwn_is_complete(csa_vif, 0)) {
 		int c = ieee80211_beacon_update_cntdwn(csa_vif, 0);
 
 		iwl_mvm_mac_ctxt_beacon_changed(mvm, csa_vif,
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c b/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c
index 89b1c7a87660..a59d264a11c5 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c
@@ -156,7 +156,7 @@ static void iwl_mvm_csa_noa_start(struct iwl_mvm *mvm)
 	 * So we just do nothing here and the switch
 	 * will be performed on the last TBTT.
 	 */
-	if (!ieee80211_beacon_cntdwn_is_complete(csa_vif)) {
+	if (!ieee80211_beacon_cntdwn_is_complete(csa_vif, 0)) {
 		IWL_WARN(mvm, "CSA NOA started too early\n");
 		goto out_unlock;
 	}
diff --git a/drivers/net/wireless/mediatek/mt76/mac80211.c b/drivers/net/wireless/mediatek/mt76/mac80211.c
index 8bf82755ca4c..758e380fdf1d 100644
--- a/drivers/net/wireless/mediatek/mt76/mac80211.c
+++ b/drivers/net/wireless/mediatek/mt76/mac80211.c
@@ -1613,7 +1613,7 @@ EXPORT_SYMBOL_GPL(mt76_get_sar_power);
 static void
 __mt76_csa_finish(void *priv, u8 *mac, struct ieee80211_vif *vif)
 {
-	if (vif->bss_conf.csa_active && ieee80211_beacon_cntdwn_is_complete(vif))
+	if (vif->bss_conf.csa_active && ieee80211_beacon_cntdwn_is_complete(vif, 0))
 		ieee80211_csa_finish(vif, 0);
 }
 
@@ -1638,7 +1638,7 @@ __mt76_csa_check(void *priv, u8 *mac, struct ieee80211_vif *vif)
 	if (!vif->bss_conf.csa_active)
 		return;
 
-	dev->csa_complete |= ieee80211_beacon_cntdwn_is_complete(vif);
+	dev->csa_complete |= ieee80211_beacon_cntdwn_is_complete(vif, 0);
 }
 
 void mt76_csa_check(struct mt76_dev *dev)
diff --git a/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c b/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c
index 66bf92c164c3..db5041d23938 100644
--- a/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c
+++ b/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c
@@ -5739,7 +5739,7 @@ static void rtl8xxxu_update_beacon_work_callback(struct work_struct *work)
 	}
 
 	if (vif->bss_conf.csa_active) {
-		if (ieee80211_beacon_cntdwn_is_complete(vif)) {
+		if (ieee80211_beacon_cntdwn_is_complete(vif, 0)) {
 			ieee80211_csa_finish(vif, 0);
 			return;
 		}
diff --git a/drivers/net/wireless/virtual/mac80211_hwsim.c b/drivers/net/wireless/virtual/mac80211_hwsim.c
index 0554946ed30e..2ea11a86d920 100644
--- a/drivers/net/wireless/virtual/mac80211_hwsim.c
+++ b/drivers/net/wireless/virtual/mac80211_hwsim.c
@@ -2305,7 +2305,7 @@ static void mac80211_hwsim_beacon_tx(void *arg, u8 *mac,
 			rcu_dereference(link_conf->chanctx_conf)->def.chan);
 	}
 
-	if (link_conf->csa_active && ieee80211_beacon_cntdwn_is_complete(vif))
+	if (link_conf->csa_active && ieee80211_beacon_cntdwn_is_complete(vif, link_id))
 		ieee80211_csa_finish(vif, link_id);
 }
 
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index fc223761e3af..25c892ea9eb3 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -5566,10 +5566,12 @@ void ieee80211_csa_finish(struct ieee80211_vif *vif, unsigned int link_id);
 /**
  * ieee80211_beacon_cntdwn_is_complete - find out if countdown reached 1
  * @vif: &struct ieee80211_vif pointer from the add_interface callback.
+ * @link_id: valid link_id during MLO or 0 for non-MLO
  *
  * This function returns whether the countdown reached zero.
  */
-bool ieee80211_beacon_cntdwn_is_complete(struct ieee80211_vif *vif);
+bool ieee80211_beacon_cntdwn_is_complete(struct ieee80211_vif *vif,
+					 unsigned int link_id);
 
 /**
  * ieee80211_color_change_finish - notify mac80211 about color change
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index f4be4af568be..6bf223e6cd1a 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -5095,9 +5095,11 @@ unlock:
 }
 EXPORT_SYMBOL(ieee80211_beacon_set_cntdwn);
 
-bool ieee80211_beacon_cntdwn_is_complete(struct ieee80211_vif *vif)
+bool ieee80211_beacon_cntdwn_is_complete(struct ieee80211_vif *vif,
+					 unsigned int link_id)
 {
 	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+	struct ieee80211_link_data *link;
 	struct beacon_data *beacon = NULL;
 	u8 *beacon_data;
 	size_t beacon_data_len;
@@ -5106,9 +5108,17 @@ bool ieee80211_beacon_cntdwn_is_complete(struct ieee80211_vif *vif)
 	if (!ieee80211_sdata_running(sdata))
 		return false;
 
+	if (WARN_ON(link_id >= IEEE80211_MLD_MAX_NUM_LINKS))
+		return 0;
+
 	rcu_read_lock();
+
+	link = rcu_dereference(sdata->link[link_id]);
+	if (!link)
+		goto out;
+
 	if (vif->type == NL80211_IFTYPE_AP) {
-		beacon = rcu_dereference(sdata->deflink.u.ap.beacon);
+		beacon = rcu_dereference(link->u.ap.beacon);
 		if (WARN_ON(!beacon || !beacon->tail))
 			goto out;
 		beacon_data = beacon->tail;
-- 
cgit v1.2.3


From d73fbaf24c5a1e0698a7a5e17d66a5100efef72a Mon Sep 17 00:00:00 2001
From: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Date: Tue, 6 Feb 2024 16:54:06 +0200
Subject: wifi: mac80211: make associated BSS pointer visible to the driver

Some drivers need the data in it, so move it to the link conf,
which is exposed to the driver.

Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://msgid.link/20240206164849.6fe9782b87b4.Ifbffef638f07ca7f5c2b27f40d2cf2942d21de0b@changeid
[remove bss pointer from internal struct, update docs]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h     |  5 +++++
 net/mac80211/ieee80211_i.h |  2 --
 net/mac80211/mlme.c        | 18 +++++++++---------
 3 files changed, 14 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 25c892ea9eb3..56c6ecb2c10a 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -557,6 +557,10 @@ struct ieee80211_fils_discovery {
  * to that BSS) that can change during the lifetime of the BSS.
  *
  * @vif: reference to owning VIF
+ * @bss: the cfg80211 bss descriptor. Valid only for a station, and only
+ *	when associated. Note: This contains information which is not
+ *	necessarily authenticated. For example, information coming from probe
+ *	responses.
  * @addr: (link) address used locally
  * @link_id: link ID, or 0 for non-MLO
  * @htc_trig_based_pkt_ext: default PE in 4us units, if BSS supports HE
@@ -700,6 +704,7 @@ struct ieee80211_fils_discovery {
  */
 struct ieee80211_bss_conf {
 	struct ieee80211_vif *vif;
+	struct cfg80211_bss *bss;
 
 	const u8 *bssid;
 	unsigned int link_id;
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index f3edb1a148a7..4bec625a84d1 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -1006,8 +1006,6 @@ struct ieee80211_link_data_managed {
 	int mu_edca_last_param_set;
 
 	u8 bss_param_ch_cnt;
-
-	struct cfg80211_bss *bss;
 };
 
 struct ieee80211_link_data_ap {
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index e1554666d706..87ffc19770b8 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -2015,7 +2015,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_link_data *link,
 	struct ieee80211_sub_if_data *sdata = link->sdata;
 	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
-	struct cfg80211_bss *cbss = link->u.mgd.bss;
+	struct cfg80211_bss *cbss = link->conf->bss;
 	struct ieee80211_chanctx_conf *conf;
 	struct ieee80211_chanctx *chanctx;
 	enum nl80211_band current_band;
@@ -2928,7 +2928,7 @@ static u64 ieee80211_link_set_associated(struct ieee80211_link_data *link,
 
 	ieee80211_check_rate_mask(link);
 
-	link->u.mgd.bss = cbss;
+	link->conf->bss = cbss;
 	memcpy(link->u.mgd.bssid, cbss->bssid, ETH_ALEN);
 
 	if (sdata->vif.p2p ||
@@ -3076,7 +3076,7 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
 	ifmgd->associated = false;
 
 	/* other links will be destroyed */
-	sdata->deflink.u.mgd.bss = NULL;
+	sdata->deflink.conf->bss = NULL;
 	sdata->deflink.smps_mode = IEEE80211_SMPS_OFF;
 
 	netif_carrier_off(sdata->dev);
@@ -3406,7 +3406,7 @@ static void ieee80211_mgd_probe_ap_send(struct ieee80211_sub_if_data *sdata)
 		ieee80211_mlme_send_probe_req(sdata, sdata->vif.addr, dst,
 					      sdata->vif.cfg.ssid,
 					      sdata->vif.cfg.ssid_len,
-					      sdata->deflink.u.mgd.bss->channel);
+					      sdata->deflink.conf->bss->channel);
 	}
 
 	ifmgd->probe_timeout = jiffies + msecs_to_jiffies(probe_wait_ms);
@@ -3489,7 +3489,7 @@ struct sk_buff *ieee80211_ap_probereq_get(struct ieee80211_hw *hw,
 		return NULL;
 
 	if (ifmgd->associated)
-		cbss = sdata->deflink.u.mgd.bss;
+		cbss = sdata->deflink.conf->bss;
 	else if (ifmgd->auth_data)
 		cbss = ifmgd->auth_data->bss;
 	else if (ifmgd->assoc_data && ifmgd->assoc_data->link[0].bss)
@@ -3568,8 +3568,8 @@ static void __ieee80211_disconnect(struct ieee80211_sub_if_data *sdata)
 			link = sdata_dereference(sdata->link[link_id], sdata);
 			if (!link)
 				continue;
-			cfg80211_unlink_bss(local->hw.wiphy, link->u.mgd.bss);
-			link->u.mgd.bss = NULL;
+			cfg80211_unlink_bss(local->hw.wiphy, link->conf->bss);
+			link->conf->bss = NULL;
 		}
 	}
 
@@ -6208,7 +6208,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link,
 	}
 
 	if (!ifmgd->associated ||
-	    !ieee80211_rx_our_beacon(bssid, link->u.mgd.bss))
+	    !ieee80211_rx_our_beacon(bssid, link->conf->bss))
 		return;
 	bssid = link->u.mgd.bssid;
 
@@ -6235,7 +6235,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link,
 	 */
 	if (!ieee80211_is_s1g_beacon(hdr->frame_control))
 		ncrc = crc32_be(0, (void *)&mgmt->u.beacon.beacon_int, 4);
-	parse_params.bss = link->u.mgd.bss;
+	parse_params.bss = link->conf->bss;
 	parse_params.filter = care_about_ies;
 	parse_params.crc = ncrc;
 	elems = ieee802_11_parse_elems_full(&parse_params);
-- 
cgit v1.2.3


From f79ab5d2bced9bd7c0ce86d2aa5b70d053001bb4 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Thu, 15 Feb 2024 17:41:36 +0200
Subject: wifi: cfg80211: Add KHZ_PER_GHZ to units.h and reuse

The KHZ_PER_GHZ might be used by others (with the name aligned
with similar constants). Define it in units.h and convert
wireless to use it.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://msgid.link/20240215154136.630029-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/units.h | 5 ++++-
 net/wireless/reg.c    | 7 +++----
 2 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/units.h b/include/linux/units.h
index 45110daaf8d3..00e15de33eca 100644
--- a/include/linux/units.h
+++ b/include/linux/units.h
@@ -24,10 +24,13 @@
 #define NANOHZ_PER_HZ		1000000000UL
 #define MICROHZ_PER_HZ		1000000UL
 #define MILLIHZ_PER_HZ		1000UL
+
 #define HZ_PER_KHZ		1000UL
-#define KHZ_PER_MHZ		1000UL
 #define HZ_PER_MHZ		1000000UL
 
+#define KHZ_PER_MHZ		1000UL
+#define KHZ_PER_GHZ		1000000UL
+
 #define MILLIWATT_PER_WATT	1000UL
 #define MICROWATT_PER_MILLIWATT	1000UL
 #define MICROWATT_PER_WATT	1000000UL
diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index 50cadbad485f..753f8e9aa4b1 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -57,6 +57,8 @@
 #include <linux/verification.h>
 #include <linux/moduleparam.h>
 #include <linux/firmware.h>
+#include <linux/units.h>
+
 #include <net/cfg80211.h>
 #include "core.h"
 #include "reg.h"
@@ -1289,20 +1291,17 @@ static bool is_valid_rd(const struct ieee80211_regdomain *rd)
 static bool freq_in_rule_band(const struct ieee80211_freq_range *freq_range,
 			      u32 freq_khz)
 {
-#define ONE_GHZ_IN_KHZ	1000000
 	/*
 	 * From 802.11ad: directional multi-gigabit (DMG):
 	 * Pertaining to operation in a frequency band containing a channel
 	 * with the Channel starting frequency above 45 GHz.
 	 */
-	u32 limit = freq_khz > 45 * ONE_GHZ_IN_KHZ ?
-			20 * ONE_GHZ_IN_KHZ : 2 * ONE_GHZ_IN_KHZ;
+	u32 limit = freq_khz > 45 * KHZ_PER_GHZ ? 20 * KHZ_PER_GHZ : 2 * KHZ_PER_GHZ;
 	if (abs(freq_khz - freq_range->start_freq_khz) <= limit)
 		return true;
 	if (abs(freq_khz - freq_range->end_freq_khz) <= limit)
 		return true;
 	return false;
-#undef ONE_GHZ_IN_KHZ
 }
 
 /*
-- 
cgit v1.2.3


From 7e899c1d6f0da2a98ebf6629274ef912d4c83359 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 16 Feb 2024 13:54:29 +0200
Subject: wifi: cfg80211: clean up cfg80211_inform_bss_frame_data()

Make cfg80211_inform_bss_frame_data() call the existing
cfg80211_inform_bss_data() after parsing the frame in the
appropriate way, so we have less code duplication. This
required introducing a new CFG80211_BSS_FTYPE_S1G_BEACON,
but that can be used by other drivers as well.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://msgid.link/20240216135047.874aed1eff5f.Ib7d88d126eec50c64763251a78cb432bb5df14df@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h |  2 ++
 net/wireless/scan.c    | 71 +++++++++++++++++++++++---------------------------
 2 files changed, 34 insertions(+), 39 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 57c2298af35b..f9eada2a26ec 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -7175,11 +7175,13 @@ size_t cfg80211_merge_profile(const u8 *ie, size_t ielen,
  *	from a beacon or probe response
  * @CFG80211_BSS_FTYPE_BEACON: data comes from a beacon
  * @CFG80211_BSS_FTYPE_PRESP: data comes from a probe response
+ * @CFG80211_BSS_FTYPE_S1G_BEACON: data comes from an S1G beacon
  */
 enum cfg80211_bss_frame_type {
 	CFG80211_BSS_FTYPE_UNKNOWN,
 	CFG80211_BSS_FTYPE_BEACON,
 	CFG80211_BSS_FTYPE_PRESP,
+	CFG80211_BSS_FTYPE_S1G_BEACON,
 };
 
 /**
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index 754138168646..c2d85fa4b75d 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -2213,6 +2213,7 @@ cfg80211_inform_single_bss_data(struct wiphy *wiphy,
 
 	switch (data->ftype) {
 	case CFG80211_BSS_FTYPE_BEACON:
+	case CFG80211_BSS_FTYPE_S1G_BEACON:
 		ies->from_beacon = true;
 		fallthrough;
 	case CFG80211_BSS_FTYPE_UNKNOWN:
@@ -3057,6 +3058,10 @@ cfg80211_inform_bss_data(struct wiphy *wiphy,
 	if (!res)
 		return NULL;
 
+	/* don't do any further MBSSID/ML handling for S1G */
+	if (ftype == CFG80211_BSS_FTYPE_S1G_BEACON)
+		return res;
+
 	cfg80211_parse_mbssid_data(wiphy, &inform_data, res, gfp);
 
 	cfg80211_parse_ml_sta_data(wiphy, &inform_data, res, gfp);
@@ -3071,17 +3076,16 @@ cfg80211_inform_bss_frame_data(struct wiphy *wiphy,
 			       struct ieee80211_mgmt *mgmt, size_t len,
 			       gfp_t gfp)
 {
-	struct cfg80211_inform_single_bss_data inform_data = {
-		.drv_data = data,
-		.use_for = data->restrict_use ?
-				data->use_for :
-				NL80211_BSS_USE_FOR_ALL,
-		.cannot_use_reasons = data->cannot_use_reasons,
-	};
 	size_t min_hdr_len = offsetof(struct ieee80211_mgmt,
 				      u.probe_resp.variable);
 	struct ieee80211_ext *ext = NULL;
-	struct cfg80211_bss *res;
+	enum cfg80211_bss_frame_type ftype;
+	u16 beacon_interval;
+	const u8 *bssid;
+	u16 capability;
+	const u8 *ie;
+	size_t ielen;
+	u64 tsf;
 
 	if (WARN_ON(!mgmt))
 		return NULL;
@@ -3105,56 +3109,45 @@ cfg80211_inform_bss_frame_data(struct wiphy *wiphy,
 	if (WARN_ON(len < min_hdr_len))
 		return NULL;
 
-	inform_data.ielen = len - min_hdr_len;
-	inform_data.ie = mgmt->u.probe_resp.variable;
+	ielen = len - min_hdr_len;
+	ie = mgmt->u.probe_resp.variable;
 	if (ext) {
 		const struct ieee80211_s1g_bcn_compat_ie *compat;
 		const struct element *elem;
 
 		if (ieee80211_is_s1g_short_beacon(mgmt->frame_control))
-			inform_data.ie = ext->u.s1g_short_beacon.variable;
+			ie = ext->u.s1g_short_beacon.variable;
 		else
-			inform_data.ie = ext->u.s1g_beacon.variable;
+			ie = ext->u.s1g_beacon.variable;
 
-		elem = cfg80211_find_elem(WLAN_EID_S1G_BCN_COMPAT,
-					  inform_data.ie, inform_data.ielen);
+		elem = cfg80211_find_elem(WLAN_EID_S1G_BCN_COMPAT, ie, ielen);
 		if (!elem)
 			return NULL;
 		if (elem->datalen < sizeof(*compat))
 			return NULL;
 		compat = (void *)elem->data;
-		memcpy(inform_data.bssid, ext->u.s1g_beacon.sa, ETH_ALEN);
-		inform_data.capability = le16_to_cpu(compat->compat_info);
-		inform_data.beacon_interval = le16_to_cpu(compat->beacon_int);
+		bssid = ext->u.s1g_beacon.sa;
+		capability = le16_to_cpu(compat->compat_info);
+		beacon_interval = le16_to_cpu(compat->beacon_int);
 	} else {
-		memcpy(inform_data.bssid, mgmt->bssid, ETH_ALEN);
-		inform_data.beacon_interval =
-			le16_to_cpu(mgmt->u.probe_resp.beacon_int);
-		inform_data.capability =
-			le16_to_cpu(mgmt->u.probe_resp.capab_info);
+		bssid = mgmt->bssid;
+		beacon_interval = le16_to_cpu(mgmt->u.probe_resp.beacon_int);
+		capability = le16_to_cpu(mgmt->u.probe_resp.capab_info);
 	}
 
-	inform_data.tsf = le64_to_cpu(mgmt->u.probe_resp.timestamp);
+	tsf = le64_to_cpu(mgmt->u.probe_resp.timestamp);
 
 	if (ieee80211_is_probe_resp(mgmt->frame_control))
-		inform_data.ftype = CFG80211_BSS_FTYPE_PRESP;
+		ftype = CFG80211_BSS_FTYPE_PRESP;
+	else if (ext)
+		ftype = CFG80211_BSS_FTYPE_S1G_BEACON;
 	else
-		inform_data.ftype = CFG80211_BSS_FTYPE_BEACON;
-
-	res = cfg80211_inform_single_bss_data(wiphy, &inform_data, gfp);
-	if (!res)
-		return NULL;
-
-	/* don't do any further MBSSID/ML handling for S1G */
-	if (ext)
-		return res;
-
-	/* process each non-transmitting bss */
-	cfg80211_parse_mbssid_data(wiphy, &inform_data, res, gfp);
+		ftype = CFG80211_BSS_FTYPE_BEACON;
 
-	cfg80211_parse_ml_sta_data(wiphy, &inform_data, res, gfp);
-
-	return res;
+	return cfg80211_inform_bss_data(wiphy, data, ftype,
+					bssid, tsf, capability,
+					beacon_interval, ie, ielen,
+					gfp);
 }
 EXPORT_SYMBOL(cfg80211_inform_bss_frame_data);
 
-- 
cgit v1.2.3


From 6bd14aee0bd25525ab229acd9bfe536dd8642364 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 16 Feb 2024 13:54:31 +0200
Subject: wifi: mac80211: align ieee80211_mle_get_bss_param_ch_cnt()

Align the prototype of ieee80211_mle_get_bss_param_ch_cnt()
to also take a u8 * like the other functions, and make it
return -1 when the field isn't found, so that mac80211 can
check that instead of explicitly open-coding the check.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://msgid.link/20240216135047.583309181bc3.Ia61cb0b4fc034d5ac8fcfaf6f6fb2e115fadafe7@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 11 ++++++-----
 net/mac80211/mlme.c       |  9 +++++----
 2 files changed, 11 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index e4322238f273..303c75459897 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -4990,17 +4990,18 @@ static inline int ieee80211_mle_get_link_id(const u8 *data)
 
 /**
  * ieee80211_mle_get_bss_param_ch_cnt - returns the BSS parameter change count
- * @mle: the basic multi link element
+ * @data: pointer to the basic multi link element
  *
  * The element is assumed to be of the correct type (BASIC) and big enough,
  * this must be checked using ieee80211_mle_type_ok().
  *
  * If the BSS parameter change count value can't be found (the presence bit
- * for it is clear), 0 will be returned.
+ * for it is clear), -1 will be returned.
  */
-static inline u8
-ieee80211_mle_get_bss_param_ch_cnt(const struct ieee80211_multi_link_elem *mle)
+static inline int
+ieee80211_mle_get_bss_param_ch_cnt(const u8 *data)
 {
+	const struct ieee80211_multi_link_elem *mle = (const void *)data;
 	u16 control = le16_to_cpu(mle->control);
 	const u8 *common = mle->variable;
 
@@ -5008,7 +5009,7 @@ ieee80211_mle_get_bss_param_ch_cnt(const struct ieee80211_multi_link_elem *mle)
 	common += sizeof(struct ieee80211_mle_basic_common_info);
 
 	if (!(control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT))
-		return 0;
+		return -1;
 
 	if (control & IEEE80211_MLC_BASIC_PRES_LINK_ID)
 		common += 1;
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 87ffc19770b8..5f2e9f5e1779 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -4202,13 +4202,14 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link,
 		 */
 		assoc_data->link[link_id].status = WLAN_STATUS_SUCCESS;
 		if (elems->ml_basic) {
-			if (!(elems->ml_basic->control &
-					cpu_to_le16(IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT))) {
+			int bss_param_ch_cnt =
+				ieee80211_mle_get_bss_param_ch_cnt((const void *)elems->ml_basic);
+
+			if (bss_param_ch_cnt < 0) {
 				ret = false;
 				goto out;
 			}
-			link->u.mgd.bss_param_ch_cnt =
-				ieee80211_mle_get_bss_param_ch_cnt(elems->ml_basic);
+			link->u.mgd.bss_param_ch_cnt = bss_param_ch_cnt;
 		}
 	} else if (elems->parse_error & IEEE80211_PARSE_ERR_DUP_NEST_ML_BASIC ||
 		   !elems->prof ||
-- 
cgit v1.2.3


From 894dd84e49ec114a2dde7b312ae4cada40d15bdb Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 16 Feb 2024 13:54:32 +0200
Subject: wifi: cfg80211: use ML element parsing helpers

Use the existing ML element parsing helpers and add a new
one for this (ieee80211_mle_get_mld_id).

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://msgid.link/20240216135047.4da47b1f035b.I437a5570ac456449facb0b147851ef24a1e473c2@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 38 ++++++++++++++++++++++++++++++++++++++
 net/wireless/scan.c       | 41 ++++++++---------------------------------
 2 files changed, 46 insertions(+), 33 deletions(-)

(limited to 'include')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 303c75459897..3385a2cc5b09 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -5115,6 +5115,44 @@ static inline u16 ieee80211_mle_get_mld_capa_op(const u8 *data)
 	return get_unaligned_le16(common);
 }
 
+/**
+ * ieee80211_mle_get_mld_id - returns the MLD ID
+ * @data: pointer to the multi link element
+ *
+ * The element is assumed to be of the correct type (BASIC) and big enough,
+ * this must be checked using ieee80211_mle_type_ok().
+ *
+ * If the MLD ID is not present, 0 will be returned.
+ */
+static inline u8 ieee80211_mle_get_mld_id(const u8 *data)
+{
+	const struct ieee80211_multi_link_elem *mle = (const void *)data;
+	u16 control = le16_to_cpu(mle->control);
+	const u8 *common = mle->variable;
+
+	/*
+	 * common points now at the beginning of
+	 * ieee80211_mle_basic_common_info
+	 */
+	common += sizeof(struct ieee80211_mle_basic_common_info);
+
+	if (!(control & IEEE80211_MLC_BASIC_PRES_MLD_ID))
+		return 0;
+
+	if (control & IEEE80211_MLC_BASIC_PRES_LINK_ID)
+		common += 1;
+	if (control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT)
+		common += 1;
+	if (control & IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY)
+		common += 2;
+	if (control & IEEE80211_MLC_BASIC_PRES_EML_CAPA)
+		common += 2;
+	if (control & IEEE80211_MLC_BASIC_PRES_MLD_CAPA_OP)
+		common += 2;
+
+	return *common;
+}
+
 /**
  * ieee80211_mle_size_ok - validate multi-link element size
  * @data: pointer to the element data
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index e46dfc71c497..7cf36b8d3ae7 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -2833,17 +2833,16 @@ cfg80211_parse_ml_elem_sta_data(struct wiphy *wiphy,
 	struct cfg80211_bss *bss;
 	u8 mld_id, reporter_link_id, bss_change_count;
 	u16 seen_links = 0;
-	const u8 *pos;
 	u8 i;
 
-	if (!ieee80211_mle_size_ok(elem->data + 1, elem->datalen - 1))
+	if (!ieee80211_mle_type_ok(elem->data + 1,
+				   IEEE80211_ML_CONTROL_TYPE_BASIC,
+				   elem->datalen - 1))
 		return;
 
-	ml_elem = (void *)elem->data + 1;
+	ml_elem = (void *)(elem->data + 1);
 	control = le16_to_cpu(ml_elem->control);
-	if (u16_get_bits(control, IEEE80211_ML_CONTROL_TYPE) !=
-	    IEEE80211_ML_CONTROL_TYPE_BASIC)
-		return;
+	ml_common_len = ml_elem->variable[0];
 
 	/* Must be present when transmitted by an AP (in a probe response) */
 	if (!(control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT) ||
@@ -2851,24 +2850,8 @@ cfg80211_parse_ml_elem_sta_data(struct wiphy *wiphy,
 	    !(control & IEEE80211_MLC_BASIC_PRES_MLD_CAPA_OP))
 		return;
 
-	ml_common_len = ml_elem->variable[0];
-
-	/* length + MLD MAC address */
-	pos = ml_elem->variable + 1 + 6;
-
-	reporter_link_id = pos[0];
-	pos += 1;
-
-	bss_change_count = pos[0];
-	pos += 1;
-
-	if (u16_get_bits(control, IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY))
-		pos += 2;
-	if (u16_get_bits(control, IEEE80211_MLC_BASIC_PRES_EML_CAPA))
-		pos += 2;
-
-	/* MLD capabilities and operations */
-	pos += 2;
+	reporter_link_id = ieee80211_mle_get_link_id(elem->data + 1);
+	bss_change_count = ieee80211_mle_get_bss_param_ch_cnt(elem->data + 1);
 
 	/*
 	 * The MLD ID of the reporting AP is always zero. It is set if the AP
@@ -2876,15 +2859,7 @@ cfg80211_parse_ml_elem_sta_data(struct wiphy *wiphy,
 	 * relating to a nontransmitted BSS (matching the Multi-BSSID Index,
 	 * Draft P802.11be_D3.2, 35.3.4.2)
 	 */
-	if (u16_get_bits(control, IEEE80211_MLC_BASIC_PRES_MLD_ID)) {
-		mld_id = *pos;
-		pos += 1;
-	} else {
-		mld_id = 0;
-	}
-
-	/* Extended MLD capabilities and operations */
-	pos += 2;
+	mld_id = ieee80211_mle_get_mld_id(elem->data + 1);
 
 	/* Fully defrag the ML element for sta information/profile iteration */
 	mle = cfg80211_defrag_mle(elem, tx_data->ie, tx_data->ielen, gfp);
-- 
cgit v1.2.3


From a3c70a3cf11eb4b6409afc2cce1a3747e1dfe96f Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Tue, 20 Feb 2024 15:50:01 -0800
Subject: bpf: Shrink size of struct bpf_map/bpf_array.

Back in 2018 the commit be95a845cc44 ("bpf: avoid false sharing of map refcount
with max_entries") added ____cacheline_aligned to "struct bpf_map" to make sure
that fields like refcnt don't share a cache line with max_entries that is used
to bounds check map access. That was done to make spectre style attacks harder.
The main mitigation is done via code similar to array_index_nospec(), of course.
This was an additional precaution.

It increased the size of "struct bpf_map" a little, but it's affect on all
other maps (like array) is significant, since "struct bpf_map" is typically
the first member in other map types.

Undo this ____cacheline_aligned tag. Instead move freeze_mutex field around, so
that refcnt and max_entries are still in different cache lines.

The main effect is seen in sizeof(struct bpf_array) that reduces from 320
to 248 bytes.

BEFORE:

struct bpf_map {
	const struct bpf_map_ops  * ops;                 /*     0     8 */
	...
	char                       name[16];             /*    96    16 */

	/* XXX 16 bytes hole, try to pack */

	/* --- cacheline 2 boundary (128 bytes) --- */
	atomic64_t refcnt __attribute__((__aligned__(64))); /*   128     8 */
	...
	/* size: 256, cachelines: 4, members: 30 */
	/* sum members: 232, holes: 1, sum holes: 16 */
	/* padding: 8 */
	/* paddings: 1, sum paddings: 2 */
} __attribute__((__aligned__(64)));

struct bpf_array {
	struct bpf_map             map;                  /*     0   256 */
	...
	/* size: 320, cachelines: 5, members: 5 */
	/* padding: 48 */
	/* paddings: 1, sum paddings: 8 */
} __attribute__((__aligned__(64)));

AFTER:

struct bpf_map {
	/* size: 232, cachelines: 4, members: 30 */
	/* paddings: 1, sum paddings: 2 */
	/* last cacheline: 40 bytes */
};
struct bpf_array {
	/* size: 248, cachelines: 4, members: 5 */
	/* last cacheline: 56 bytes */
};

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/bpf/20240220235001.57411-1-alexei.starovoitov@gmail.com
---
 include/linux/bpf.h | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c7aa99b44dbd..814dc913a968 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -251,10 +251,7 @@ struct bpf_list_node_kern {
 } __attribute__((aligned(8)));
 
 struct bpf_map {
-	/* The first two cachelines with read-mostly members of which some
-	 * are also accessed in fast-path (e.g. ops, max_entries).
-	 */
-	const struct bpf_map_ops *ops ____cacheline_aligned;
+	const struct bpf_map_ops *ops;
 	struct bpf_map *inner_map_meta;
 #ifdef CONFIG_SECURITY
 	void *security;
@@ -276,17 +273,14 @@ struct bpf_map {
 	struct obj_cgroup *objcg;
 #endif
 	char name[BPF_OBJ_NAME_LEN];
-	/* The 3rd and 4th cacheline with misc members to avoid false sharing
-	 * particularly with refcounting.
-	 */
-	atomic64_t refcnt ____cacheline_aligned;
+	struct mutex freeze_mutex;
+	atomic64_t refcnt;
 	atomic64_t usercnt;
 	/* rcu is used before freeing and work is only used during freeing */
 	union {
 		struct work_struct work;
 		struct rcu_head rcu;
 	};
-	struct mutex freeze_mutex;
 	atomic64_t writecnt;
 	/* 'Ownership' of program-containing map is claimed by the first program
 	 * that is going to use this map or by the first program which FD is
-- 
cgit v1.2.3


From 953cc643329b38434bb7d6206951d1a48016e38b Mon Sep 17 00:00:00 2001
From: Dimitri Fedrau <dima.fedrau@gmail.com>
Date: Sun, 18 Feb 2024 08:57:38 +0100
Subject: net: phy: Add BaseT1 auto-negotiation constants

Added constants for advertising 100BT1 and 1000BT1 in register BASE-T1
auto-negotiation advertisement register [31:16] (Register 7.515)

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Dimitri Fedrau <dima.fedrau@gmail.com>
Link: https://lore.kernel.org/r/20240218075753.18067-2-dima.fedrau@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/mdio.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/mdio.h b/include/uapi/linux/mdio.h
index 3c9097502403..c0c8ec995b06 100644
--- a/include/uapi/linux/mdio.h
+++ b/include/uapi/linux/mdio.h
@@ -350,6 +350,8 @@
 
 /* BASE-T1 auto-negotiation advertisement register [31:16] */
 #define MDIO_AN_T1_ADV_M_B10L		0x4000	/* device is compatible with 10BASE-T1L */
+#define MDIO_AN_T1_ADV_M_1000BT1	0x0080	/* advertise 1000BASE-T1 */
+#define MDIO_AN_T1_ADV_M_100BT1		0x0020	/* advertise 100BASE-T1 */
 #define MDIO_AN_T1_ADV_M_MST		0x0010	/* advertise master preference */
 
 /* BASE-T1 auto-negotiation advertisement register [47:32] */
-- 
cgit v1.2.3


From ac0c530619cefa68fba816dabbcf6f4ffbf60c3d Mon Sep 17 00:00:00 2001
From: Dimitri Fedrau <dima.fedrau@gmail.com>
Date: Sun, 18 Feb 2024 08:57:39 +0100
Subject: net: phy: Support 100/1000BT1 linkmode advertisements

Extend helper functions mii_t1_adv_m_mod_linkmode_t and
linkmode_adv_to_mii_t1_adv_m_t to support 100BT1 and 1000BT1 linkmode
advertisements.

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Dimitri Fedrau <dima.fedrau@gmail.com>
Link: https://lore.kernel.org/r/20240218075753.18067-3-dima.fedrau@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/mdio.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include')

diff --git a/include/linux/mdio.h b/include/linux/mdio.h
index fd8ff310f9eb..68f8d2e970d4 100644
--- a/include/linux/mdio.h
+++ b/include/linux/mdio.h
@@ -373,6 +373,10 @@ static inline void mii_t1_adv_m_mod_linkmode_t(unsigned long *advertising, u32 l
 {
 	linkmode_mod_bit(ETHTOOL_LINK_MODE_10baseT1L_Full_BIT,
 			 advertising, lpa & MDIO_AN_T1_ADV_M_B10L);
+	linkmode_mod_bit(ETHTOOL_LINK_MODE_100baseT1_Full_BIT,
+			 advertising, lpa & MDIO_AN_T1_ADV_M_100BT1);
+	linkmode_mod_bit(ETHTOOL_LINK_MODE_1000baseT1_Full_BIT,
+			 advertising, lpa & MDIO_AN_T1_ADV_M_1000BT1);
 }
 
 /**
@@ -409,6 +413,10 @@ static inline u32 linkmode_adv_to_mii_t1_adv_m_t(unsigned long *advertising)
 
 	if (linkmode_test_bit(ETHTOOL_LINK_MODE_10baseT1L_Full_BIT, advertising))
 		result |= MDIO_AN_T1_ADV_M_B10L;
+	if (linkmode_test_bit(ETHTOOL_LINK_MODE_100baseT1_Full_BIT, advertising))
+		result |= MDIO_AN_T1_ADV_M_100BT1;
+	if (linkmode_test_bit(ETHTOOL_LINK_MODE_1000baseT1_Full_BIT, advertising))
+		result |= MDIO_AN_T1_ADV_M_1000BT1;
 
 	return result;
 }
-- 
cgit v1.2.3


From e57e4c7f6cc943be3346f938361334bb3634db3d Mon Sep 17 00:00:00 2001
From: Dimitri Fedrau <dima.fedrau@gmail.com>
Date: Sun, 18 Feb 2024 08:57:42 +0100
Subject: net: phy: marvell-88q2xxx: add driver for the Marvell 88Q2220 PHY

Add a driver for the Marvell 88Q2220. This driver allows to detect the
link, switch between 100BASE-T1 and 1000BASE-T1 and switch between
master and slave mode. Autonegotiation is supported.

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Tested-by: Gregor Herburger <gregor.herburger@ew.tq-group.com>
Signed-off-by: Dimitri Fedrau <dima.fedrau@gmail.com>
Link: https://lore.kernel.org/r/20240218075753.18067-6-dima.fedrau@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/marvell-88q2xxx.c | 210 ++++++++++++++++++++++++++++++++++++--
 include/linux/marvell_phy.h       |   1 +
 2 files changed, 205 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/net/phy/marvell-88q2xxx.c b/drivers/net/phy/marvell-88q2xxx.c
index dcebb4643aff..9829facde253 100644
--- a/drivers/net/phy/marvell-88q2xxx.c
+++ b/drivers/net/phy/marvell-88q2xxx.c
@@ -1,11 +1,17 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
  * Marvell 88Q2XXX automotive 100BASE-T1/1000BASE-T1 PHY driver
+ *
+ * Derived from Marvell Q222x API
+ *
+ * Copyright (C) 2024 Liebherr-Electronics and Drives GmbH
  */
 #include <linux/ethtool_netlink.h>
 #include <linux/marvell_phy.h>
 #include <linux/phy.h>
 
+#define PHY_ID_88Q2220_REVB0	(MARVELL_PHY_ID_88Q2220 | 0x1)
+
 #define MDIO_MMD_AN_MV_STAT			32769
 #define MDIO_MMD_AN_MV_STAT_ANEG		0x0100
 #define MDIO_MMD_AN_MV_STAT_LOCAL_RX		0x1000
@@ -13,6 +19,11 @@
 #define MDIO_MMD_AN_MV_STAT_LOCAL_MASTER	0x4000
 #define MDIO_MMD_AN_MV_STAT_MS_CONF_FAULT	0x8000
 
+#define MDIO_MMD_AN_MV_STAT2			32794
+#define MDIO_MMD_AN_MV_STAT2_AN_RESOLVED	0x0800
+#define MDIO_MMD_AN_MV_STAT2_100BT1		0x2000
+#define MDIO_MMD_AN_MV_STAT2_1000BT1		0x4000
+
 #define MDIO_MMD_PCS_MV_100BT1_STAT1			33032
 #define MDIO_MMD_PCS_MV_100BT1_STAT1_IDLE_ERROR		0x00ff
 #define MDIO_MMD_PCS_MV_100BT1_STAT1_JABBER		0x0100
@@ -29,6 +40,42 @@
 
 #define MDIO_MMD_PCS_MV_RX_STAT			33328
 
+struct mmd_val {
+	int devad;
+	u32 regnum;
+	u16 val;
+};
+
+static const struct mmd_val mv88q222x_revb0_init_seq0[] = {
+	{ MDIO_MMD_PCS, 0x8033, 0x6801 },
+	{ MDIO_MMD_AN, MDIO_AN_T1_CTRL, 0x0 },
+	{ MDIO_MMD_PMAPMD, MDIO_CTRL1,
+	  MDIO_CTRL1_LPOWER | MDIO_PMA_CTRL1_SPEED1000 },
+	{ MDIO_MMD_PCS, 0xfe1b, 0x48 },
+	{ MDIO_MMD_PCS, 0xffe4, 0x6b6 },
+	{ MDIO_MMD_PMAPMD, MDIO_CTRL1, 0x0 },
+	{ MDIO_MMD_PCS, MDIO_CTRL1, 0x0 },
+};
+
+static const struct mmd_val mv88q222x_revb0_init_seq1[] = {
+	{ MDIO_MMD_PCS, 0xfe79, 0x0 },
+	{ MDIO_MMD_PCS, 0xfe07, 0x125a },
+	{ MDIO_MMD_PCS, 0xfe09, 0x1288 },
+	{ MDIO_MMD_PCS, 0xfe08, 0x2588 },
+	{ MDIO_MMD_PCS, 0xfe11, 0x1105 },
+	{ MDIO_MMD_PCS, 0xfe72, 0x042c },
+	{ MDIO_MMD_PCS, 0xfbba, 0xcb2 },
+	{ MDIO_MMD_PCS, 0xfbbb, 0xc4a },
+	{ MDIO_MMD_AN, 0x8032, 0x2020 },
+	{ MDIO_MMD_AN, 0x8031, 0xa28 },
+	{ MDIO_MMD_AN, 0x8031, 0xc28 },
+	{ MDIO_MMD_PCS, 0xffdb, 0xfc10 },
+	{ MDIO_MMD_PCS, 0xfe1b, 0x58 },
+	{ MDIO_MMD_PCS, 0xfe79, 0x4 },
+	{ MDIO_MMD_PCS, 0xfe5f, 0xe8 },
+	{ MDIO_MMD_PCS, 0xfe05, 0x755c },
+};
+
 static int mv88q2xxx_soft_reset(struct phy_device *phydev)
 {
 	int ret;
@@ -125,24 +172,90 @@ out:
 
 static int mv88q2xxx_read_link(struct phy_device *phydev)
 {
-	int ret;
-
 	/* The 88Q2XXX PHYs do not have the PMA/PMD status register available,
 	 * therefore we need to read the link status from the vendor specific
 	 * registers depending on the speed.
 	 */
+
 	if (phydev->speed == SPEED_1000)
-		ret = mv88q2xxx_read_link_gbit(phydev);
+		return mv88q2xxx_read_link_gbit(phydev);
+	else if (phydev->speed == SPEED_100)
+		return mv88q2xxx_read_link_100m(phydev);
+
+	phydev->link = false;
+	return 0;
+}
+
+static int mv88q2xxx_read_master_slave_state(struct phy_device *phydev)
+{
+	int ret;
+
+	phydev->master_slave_state = MASTER_SLAVE_STATE_UNKNOWN;
+	ret = phy_read_mmd(phydev, MDIO_MMD_AN, MDIO_MMD_AN_MV_STAT);
+	if (ret < 0)
+		return ret;
+
+	if (ret & MDIO_MMD_AN_MV_STAT_LOCAL_MASTER)
+		phydev->master_slave_state = MASTER_SLAVE_STATE_MASTER;
 	else
-		ret = mv88q2xxx_read_link_100m(phydev);
+		phydev->master_slave_state = MASTER_SLAVE_STATE_SLAVE;
 
-	return ret;
+	return 0;
+}
+
+static int mv88q2xxx_read_aneg_speed(struct phy_device *phydev)
+{
+	int ret;
+
+	phydev->speed = SPEED_UNKNOWN;
+	ret = phy_read_mmd(phydev, MDIO_MMD_AN, MDIO_MMD_AN_MV_STAT2);
+	if (ret < 0)
+		return ret;
+
+	if (!(ret & MDIO_MMD_AN_MV_STAT2_AN_RESOLVED))
+		return 0;
+
+	if (ret & MDIO_MMD_AN_MV_STAT2_100BT1)
+		phydev->speed = SPEED_100;
+	else if (ret & MDIO_MMD_AN_MV_STAT2_1000BT1)
+		phydev->speed = SPEED_1000;
+
+	return 0;
 }
 
 static int mv88q2xxx_read_status(struct phy_device *phydev)
 {
 	int ret;
 
+	if (phydev->autoneg == AUTONEG_ENABLE) {
+		/* We have to get the negotiated speed first, otherwise we are
+		 * not able to read the link.
+		 */
+		ret = mv88q2xxx_read_aneg_speed(phydev);
+		if (ret < 0)
+			return ret;
+
+		ret = mv88q2xxx_read_link(phydev);
+		if (ret < 0)
+			return ret;
+
+		ret = genphy_c45_read_lpa(phydev);
+		if (ret < 0)
+			return ret;
+
+		ret = genphy_c45_baset1_read_status(phydev);
+		if (ret < 0)
+			return ret;
+
+		ret = mv88q2xxx_read_master_slave_state(phydev);
+		if (ret < 0)
+			return ret;
+
+		phy_resolve_aneg_linkmode(phydev);
+
+		return 0;
+	}
+
 	ret = mv88q2xxx_read_link(phydev);
 	if (ret < 0)
 		return ret;
@@ -171,7 +284,9 @@ static int mv88q2xxx_get_features(struct phy_device *phydev)
 	 * sequence provided by Marvell. Disable it for now until a proper
 	 * workaround is found or a new PHY revision is released.
 	 */
-	linkmode_clear_bit(ETHTOOL_LINK_MODE_Autoneg_BIT, phydev->supported);
+	if (phydev->drv->phy_id == MARVELL_PHY_ID_88Q2110)
+		linkmode_clear_bit(ETHTOOL_LINK_MODE_Autoneg_BIT,
+				   phydev->supported);
 
 	return 0;
 }
@@ -241,6 +356,75 @@ static int mv88q2xxx_get_sqi_max(struct phy_device *phydev)
 	return 15;
 }
 
+static int mv88q222x_soft_reset(struct phy_device *phydev)
+{
+	int ret;
+
+	/* Enable RESET of DCL */
+	if (phydev->autoneg == AUTONEG_ENABLE || phydev->speed == SPEED_1000) {
+		ret = phy_write_mmd(phydev, MDIO_MMD_PCS, 0xfe1b, 0x48);
+		if (ret < 0)
+			return ret;
+	}
+
+	ret = phy_write_mmd(phydev, MDIO_MMD_PCS, MDIO_PCS_1000BT1_CTRL,
+			    MDIO_PCS_1000BT1_CTRL_RESET);
+	if (ret < 0)
+		return ret;
+
+	ret = phy_write_mmd(phydev, MDIO_MMD_PCS, 0xffe4, 0xc);
+	if (ret < 0)
+		return ret;
+
+	/* Disable RESET of DCL */
+	if (phydev->autoneg == AUTONEG_ENABLE || phydev->speed == SPEED_1000)
+		return phy_write_mmd(phydev, MDIO_MMD_PCS, 0xfe1b, 0x58);
+
+	return 0;
+}
+
+static int mv88q222x_config_aneg(struct phy_device *phydev)
+{
+	int ret;
+
+	ret = genphy_c45_config_aneg(phydev);
+	if (ret)
+		return ret;
+
+	return mv88q222x_soft_reset(phydev);
+}
+
+static int mv88q222x_revb0_config_init(struct phy_device *phydev)
+{
+	int ret, i;
+
+	for (i = 0; i < ARRAY_SIZE(mv88q222x_revb0_init_seq0); i++) {
+		ret = phy_write_mmd(phydev, mv88q222x_revb0_init_seq0[i].devad,
+				    mv88q222x_revb0_init_seq0[i].regnum,
+				    mv88q222x_revb0_init_seq0[i].val);
+		if (ret < 0)
+			return ret;
+	}
+
+	usleep_range(5000, 10000);
+
+	for (i = 0; i < ARRAY_SIZE(mv88q222x_revb0_init_seq1); i++) {
+		ret = phy_write_mmd(phydev, mv88q222x_revb0_init_seq1[i].devad,
+				    mv88q222x_revb0_init_seq1[i].regnum,
+				    mv88q222x_revb0_init_seq1[i].val);
+		if (ret < 0)
+			return ret;
+	}
+
+	/* The 88Q2XXX PHYs do have the extended ability register available, but
+	 * register MDIO_PMA_EXTABLE where they should signalize it does not
+	 * work according to specification. Therefore, we force it here.
+	 */
+	phydev->pma_extable = MDIO_PMA_EXTABLE_BT1;
+
+	return 0;
+}
+
 static struct phy_driver mv88q2xxx_driver[] = {
 	{
 		.phy_id			= MARVELL_PHY_ID_88Q2110,
@@ -255,12 +439,26 @@ static struct phy_driver mv88q2xxx_driver[] = {
 		.get_sqi		= mv88q2xxx_get_sqi,
 		.get_sqi_max		= mv88q2xxx_get_sqi_max,
 	},
+	{
+		PHY_ID_MATCH_EXACT(PHY_ID_88Q2220_REVB0),
+		.name			= "mv88q2220",
+		.get_features		= mv88q2xxx_get_features,
+		.config_aneg		= mv88q222x_config_aneg,
+		.aneg_done		= genphy_c45_aneg_done,
+		.config_init		= mv88q222x_revb0_config_init,
+		.read_status		= mv88q2xxx_read_status,
+		.soft_reset		= mv88q222x_soft_reset,
+		.set_loopback		= genphy_c45_loopback,
+		.get_sqi		= mv88q2xxx_get_sqi,
+		.get_sqi_max		= mv88q2xxx_get_sqi_max,
+	},
 };
 
 module_phy_driver(mv88q2xxx_driver);
 
 static struct mdio_device_id __maybe_unused mv88q2xxx_tbl[] = {
 	{ MARVELL_PHY_ID_88Q2110, MARVELL_PHY_ID_MASK },
+	{ PHY_ID_MATCH_EXACT(PHY_ID_88Q2220_REVB0), },
 	{ /*sentinel*/ }
 };
 MODULE_DEVICE_TABLE(mdio, mv88q2xxx_tbl);
diff --git a/include/linux/marvell_phy.h b/include/linux/marvell_phy.h
index 9b54c4f0677f..693eba9869e4 100644
--- a/include/linux/marvell_phy.h
+++ b/include/linux/marvell_phy.h
@@ -26,6 +26,7 @@
 #define MARVELL_PHY_ID_88E2110		0x002b09b0
 #define MARVELL_PHY_ID_88X2222		0x01410f10
 #define MARVELL_PHY_ID_88Q2110		0x002b0980
+#define MARVELL_PHY_ID_88Q2220		0x002b0b20
 
 /* Marvel 88E1111 in Finisar SFP module with modified PHY ID */
 #define MARVELL_PHY_ID_88E1111_FINISAR	0x01ff0cc0
-- 
cgit v1.2.3


From f796feabb9f5b1e5c48780a7a0023ab4b82336dd Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Tue, 20 Feb 2024 12:00:01 +0100
Subject: udp: add local "peek offset enabled" flag

We want to re-organize the struct sock layout. The sk_peek_off
field location is problematic, as most protocols want it in the
RX read area, while UDP wants it on a cacheline different from
sk_receive_queue.

Create a local (inside udp_sock) copy of the 'peek offset is enabled'
flag and place it inside the same cacheline of reader_queue.

Check such flag before reading sk_peek_off. This will save potential
false sharing and cache misses in the fast-path.

Tested under UDP flood with small packets. The struct sock layout
update causes a 4% performance drop, and this patch restores completely
the original tput.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://lore.kernel.org/r/67ab679c15fbf49fa05b3ffe05d91c47ab84f147.1708426665.git.pabeni@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/udp.h | 10 ++++++++++
 net/ipv4/af_inet.c  |  2 +-
 net/ipv4/udp.c      |  2 +-
 net/ipv6/af_inet6.c |  2 +-
 4 files changed, 13 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/udp.h b/include/linux/udp.h
index d04188714dca..3748e82b627b 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -92,6 +92,9 @@ struct udp_sock {
 
 	/* This fields follows rcvbuf value, and is touched by udp_recvmsg */
 	int		forward_threshold;
+
+	/* Cache friendly copy of sk->sk_peek_off >= 0 */
+	bool		peeking_with_offset;
 };
 
 #define udp_test_bit(nr, sk)			\
@@ -109,6 +112,13 @@ struct udp_sock {
 
 #define udp_sk(ptr) container_of_const(ptr, struct udp_sock, inet.sk)
 
+static inline int udp_set_peek_off(struct sock *sk, int val)
+{
+	sk_set_peek_off(sk, val);
+	WRITE_ONCE(udp_sk(sk)->peeking_with_offset, val >= 0);
+	return 0;
+}
+
 static inline void udp_set_no_check6_tx(struct sock *sk, bool val)
 {
 	udp_assign_bit(NO_CHECK6_TX, sk, val);
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index ad278009e469..5daebdcbca32 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1103,7 +1103,7 @@ const struct proto_ops inet_dgram_ops = {
 	.recvmsg	   = inet_recvmsg,
 	.mmap		   = sock_no_mmap,
 	.splice_eof	   = inet_splice_eof,
-	.set_peek_off	   = sk_set_peek_off,
+	.set_peek_off	   = udp_set_peek_off,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	   = inet_compat_ioctl,
 #endif
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index f631b0a21af4..38cce7cc51f6 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1589,7 +1589,7 @@ int udp_init_sock(struct sock *sk)
 
 void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len)
 {
-	if (unlikely(READ_ONCE(sk->sk_peek_off) >= 0)) {
+	if (unlikely(READ_ONCE(udp_sk(sk)->peeking_with_offset))) {
 		bool slow = lock_sock_fast(sk);
 
 		sk_peek_offset_bwd(sk, len);
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 959bfd9f6344..b90d46533cdc 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -736,7 +736,7 @@ const struct proto_ops inet6_dgram_ops = {
 	.recvmsg	   = inet6_recvmsg,		/* retpoline's sake */
 	.read_skb	   = udp_read_skb,
 	.mmap		   = sock_no_mmap,
-	.set_peek_off	   = sk_set_peek_off,
+	.set_peek_off	   = udp_set_peek_off,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	   = inet6_compat_ioctl,
 #endif
-- 
cgit v1.2.3


From ee076b73e576b0a052d5686d873346b285ae50ea Mon Sep 17 00:00:00 2001
From: Jeremy Kerr <jk@codeconstruct.com.au>
Date: Mon, 19 Feb 2024 17:51:46 +0800
Subject: net: mctp: avoid confusion over local/peer dest/source addresses

We have a double-swap of local and peer addresses in
mctp_alloc_local_tag; the arguments in both call sites are swapped, but
there is also a swap in the implementation of alloc_local_tag. This is
opaque because we're using source/dest address references, which don't
match the local/peer semantics.

Avoid this confusion by naming the arguments as 'local' and 'peer', and
remove the double swap. The calling order now matches mctp_key_alloc.

Signed-off-by: Jeremy Kerr <jk@codeconstruct.com.au>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/mctp.h |  4 ++--
 net/mctp/af_mctp.c |  2 +-
 net/mctp/route.c   | 16 ++++++++--------
 3 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/net/mctp.h b/include/net/mctp.h
index da86e106c91d..f937a325ea6f 100644
--- a/include/net/mctp.h
+++ b/include/net/mctp.h
@@ -87,7 +87,7 @@ struct mctp_sock {
 };
 
 /* Key for matching incoming packets to sockets or reassembly contexts.
- * Packets are matched on (src,dest,tag).
+ * Packets are matched on (peer EID, local EID, tag).
  *
  * Lifetime / locking requirements:
  *
@@ -254,7 +254,7 @@ int mctp_local_output(struct sock *sk, struct mctp_route *rt,
 
 void mctp_key_unref(struct mctp_sk_key *key);
 struct mctp_sk_key *mctp_alloc_local_tag(struct mctp_sock *msk,
-					 mctp_eid_t daddr, mctp_eid_t saddr,
+					 mctp_eid_t local, mctp_eid_t peer,
 					 bool manual, u8 *tagp);
 
 /* routing <--> device interface */
diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c
index f6be58b68c6f..d8197e9e233b 100644
--- a/net/mctp/af_mctp.c
+++ b/net/mctp/af_mctp.c
@@ -367,7 +367,7 @@ static int mctp_ioctl_alloctag(struct mctp_sock *msk, unsigned long arg)
 	if (ctl.flags)
 		return -EINVAL;
 
-	key = mctp_alloc_local_tag(msk, ctl.peer_addr, MCTP_ADDR_ANY,
+	key = mctp_alloc_local_tag(msk, MCTP_ADDR_ANY, ctl.peer_addr,
 				   true, &tag);
 	if (IS_ERR(key))
 		return PTR_ERR(key);
diff --git a/net/mctp/route.c b/net/mctp/route.c
index 7a47a58aa54b..04d29e910af9 100644
--- a/net/mctp/route.c
+++ b/net/mctp/route.c
@@ -596,11 +596,11 @@ static void mctp_reserve_tag(struct net *net, struct mctp_sk_key *key,
 	refcount_inc(&key->refs);
 }
 
-/* Allocate a locally-owned tag value for (saddr, daddr), and reserve
+/* Allocate a locally-owned tag value for (local, peer), and reserve
  * it for the socket msk
  */
 struct mctp_sk_key *mctp_alloc_local_tag(struct mctp_sock *msk,
-					 mctp_eid_t daddr, mctp_eid_t saddr,
+					 mctp_eid_t local, mctp_eid_t peer,
 					 bool manual, u8 *tagp)
 {
 	struct net *net = sock_net(&msk->sk);
@@ -610,11 +610,11 @@ struct mctp_sk_key *mctp_alloc_local_tag(struct mctp_sock *msk,
 	u8 tagbits;
 
 	/* for NULL destination EIDs, we may get a response from any peer */
-	if (daddr == MCTP_ADDR_NULL)
-		daddr = MCTP_ADDR_ANY;
+	if (peer == MCTP_ADDR_NULL)
+		peer = MCTP_ADDR_ANY;
 
 	/* be optimistic, alloc now */
-	key = mctp_key_alloc(msk, saddr, daddr, 0, GFP_KERNEL);
+	key = mctp_key_alloc(msk, local, peer, 0, GFP_KERNEL);
 	if (!key)
 		return ERR_PTR(-ENOMEM);
 
@@ -635,8 +635,8 @@ struct mctp_sk_key *mctp_alloc_local_tag(struct mctp_sock *msk,
 		if (tmp->tag & MCTP_HDR_FLAG_TO)
 			continue;
 
-		if (!(mctp_address_matches(tmp->peer_addr, daddr) &&
-		      mctp_address_matches(tmp->local_addr, saddr)))
+		if (!(mctp_address_matches(tmp->peer_addr, peer) &&
+		      mctp_address_matches(tmp->local_addr, local)))
 			continue;
 
 		spin_lock(&tmp->lock);
@@ -924,7 +924,7 @@ int mctp_local_output(struct sock *sk, struct mctp_route *rt,
 			key = mctp_lookup_prealloc_tag(msk, daddr,
 						       req_tag, &tag);
 		else
-			key = mctp_alloc_local_tag(msk, daddr, saddr,
+			key = mctp_alloc_local_tag(msk, saddr, daddr,
 						   false, &tag);
 
 		if (IS_ERR(key)) {
-- 
cgit v1.2.3


From 43e6795574f5d75284a3cb21f5b76a5ffb98e8e6 Mon Sep 17 00:00:00 2001
From: Jeremy Kerr <jk@codeconstruct.com.au>
Date: Mon, 19 Feb 2024 17:51:50 +0800
Subject: net: mctp: separate key correlation across nets

Currently, we lookup sk_keys from the entire struct net_namespace, which
may contain multiple MCTP net IDs. In those cases we want to distinguish
between endpoints with the same EID but different net ID.

Add the net ID data to the struct mctp_sk_key, populate on add and
filter on this during route lookup.

For the ioctl interface, we use a default net of
MCTP_INITIAL_DEFAULT_NET (ie., what will be in use for single-net
configurations), but we'll extend the ioctl interface to provide
net-specific tag allocation in an upcoming change.

Signed-off-by: Jeremy Kerr <jk@codeconstruct.com.au>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/mctp.h         |  2 ++
 net/mctp/af_mctp.c         |  4 ++--
 net/mctp/route.c           | 43 +++++++++++++++++++++++++++++++------------
 net/mctp/test/route-test.c |  7 +++++--
 4 files changed, 40 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/net/mctp.h b/include/net/mctp.h
index f937a325ea6f..0dfae6f51a32 100644
--- a/include/net/mctp.h
+++ b/include/net/mctp.h
@@ -133,6 +133,7 @@ struct mctp_sock {
  *    - through an expiry timeout, on a per-socket timer
  */
 struct mctp_sk_key {
+	unsigned int	net;
 	mctp_eid_t	peer_addr;
 	mctp_eid_t	local_addr; /* MCTP_ADDR_ANY for local owned tags */
 	__u8		tag; /* incoming tag match; invert TO for local */
@@ -254,6 +255,7 @@ int mctp_local_output(struct sock *sk, struct mctp_route *rt,
 
 void mctp_key_unref(struct mctp_sk_key *key);
 struct mctp_sk_key *mctp_alloc_local_tag(struct mctp_sock *msk,
+					 unsigned int netid,
 					 mctp_eid_t local, mctp_eid_t peer,
 					 bool manual, u8 *tagp);
 
diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c
index d8197e9e233b..05315a422ffb 100644
--- a/net/mctp/af_mctp.c
+++ b/net/mctp/af_mctp.c
@@ -367,8 +367,8 @@ static int mctp_ioctl_alloctag(struct mctp_sock *msk, unsigned long arg)
 	if (ctl.flags)
 		return -EINVAL;
 
-	key = mctp_alloc_local_tag(msk, MCTP_ADDR_ANY, ctl.peer_addr,
-				   true, &tag);
+	key = mctp_alloc_local_tag(msk, MCTP_INITIAL_DEFAULT_NET,
+				   MCTP_ADDR_ANY, ctl.peer_addr, true, &tag);
 	if (IS_ERR(key))
 		return PTR_ERR(key);
 
diff --git a/net/mctp/route.c b/net/mctp/route.c
index 0c2ed75a9e28..28648a7ec866 100644
--- a/net/mctp/route.c
+++ b/net/mctp/route.c
@@ -107,9 +107,12 @@ static struct mctp_sock *mctp_lookup_bind(struct net *net, struct sk_buff *skb)
  * and peer addresses, or either being ANY.
  */
 
-static bool mctp_key_match(struct mctp_sk_key *key, mctp_eid_t local,
-			   mctp_eid_t peer, u8 tag)
+static bool mctp_key_match(struct mctp_sk_key *key, unsigned int net,
+			   mctp_eid_t local, mctp_eid_t peer, u8 tag)
 {
+	if (key->net != net)
+		return false;
+
 	if (!mctp_address_matches(key->local_addr, local))
 		return false;
 
@@ -126,7 +129,7 @@ static bool mctp_key_match(struct mctp_sk_key *key, mctp_eid_t local,
  * key exists.
  */
 static struct mctp_sk_key *mctp_lookup_key(struct net *net, struct sk_buff *skb,
-					   mctp_eid_t peer,
+					   unsigned int netid, mctp_eid_t peer,
 					   unsigned long *irqflags)
 	__acquires(&key->lock)
 {
@@ -142,7 +145,7 @@ static struct mctp_sk_key *mctp_lookup_key(struct net *net, struct sk_buff *skb,
 	spin_lock_irqsave(&net->mctp.keys_lock, flags);
 
 	hlist_for_each_entry(key, &net->mctp.keys, hlist) {
-		if (!mctp_key_match(key, mh->dest, peer, tag))
+		if (!mctp_key_match(key, netid, mh->dest, peer, tag))
 			continue;
 
 		spin_lock(&key->lock);
@@ -165,6 +168,7 @@ static struct mctp_sk_key *mctp_lookup_key(struct net *net, struct sk_buff *skb,
 }
 
 static struct mctp_sk_key *mctp_key_alloc(struct mctp_sock *msk,
+					  unsigned int net,
 					  mctp_eid_t local, mctp_eid_t peer,
 					  u8 tag, gfp_t gfp)
 {
@@ -174,6 +178,7 @@ static struct mctp_sk_key *mctp_key_alloc(struct mctp_sock *msk,
 	if (!key)
 		return NULL;
 
+	key->net = net;
 	key->peer_addr = peer;
 	key->local_addr = local;
 	key->tag = tag;
@@ -219,8 +224,8 @@ static int mctp_key_add(struct mctp_sk_key *key, struct mctp_sock *msk)
 	}
 
 	hlist_for_each_entry(tmp, &net->mctp.keys, hlist) {
-		if (mctp_key_match(tmp, key->local_addr, key->peer_addr,
-				   key->tag)) {
+		if (mctp_key_match(tmp, key->net, key->local_addr,
+				   key->peer_addr, key->tag)) {
 			spin_lock(&tmp->lock);
 			if (tmp->valid)
 				rc = -EEXIST;
@@ -361,6 +366,7 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb)
 	struct net *net = dev_net(skb->dev);
 	struct mctp_sock *msk;
 	struct mctp_hdr *mh;
+	unsigned int netid;
 	unsigned long f;
 	u8 tag, flags;
 	int rc;
@@ -379,6 +385,7 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb)
 
 	/* grab header, advance data ptr */
 	mh = mctp_hdr(skb);
+	netid = mctp_cb(skb)->net;
 	skb_pull(skb, sizeof(struct mctp_hdr));
 
 	if (mh->ver != 1)
@@ -392,7 +399,7 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb)
 	/* lookup socket / reasm context, exactly matching (src,dest,tag).
 	 * we hold a ref on the key, and key->lock held.
 	 */
-	key = mctp_lookup_key(net, skb, mh->src, &f);
+	key = mctp_lookup_key(net, skb, netid, mh->src, &f);
 
 	if (flags & MCTP_HDR_FLAG_SOM) {
 		if (key) {
@@ -406,7 +413,8 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb)
 			 * this lookup requires key->peer to be MCTP_ADDR_ANY,
 			 * it doesn't match just any key->peer.
 			 */
-			any_key = mctp_lookup_key(net, skb, MCTP_ADDR_ANY, &f);
+			any_key = mctp_lookup_key(net, skb, netid,
+						  MCTP_ADDR_ANY, &f);
 			if (any_key) {
 				msk = container_of(any_key->sk,
 						   struct mctp_sock, sk);
@@ -443,7 +451,7 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb)
 		 * packets for this message
 		 */
 		if (!key) {
-			key = mctp_key_alloc(msk, mh->dest, mh->src,
+			key = mctp_key_alloc(msk, netid, mh->dest, mh->src,
 					     tag, GFP_ATOMIC);
 			if (!key) {
 				rc = -ENOMEM;
@@ -637,6 +645,7 @@ static void mctp_reserve_tag(struct net *net, struct mctp_sk_key *key,
  * it for the socket msk
  */
 struct mctp_sk_key *mctp_alloc_local_tag(struct mctp_sock *msk,
+					 unsigned int netid,
 					 mctp_eid_t local, mctp_eid_t peer,
 					 bool manual, u8 *tagp)
 {
@@ -651,7 +660,7 @@ struct mctp_sk_key *mctp_alloc_local_tag(struct mctp_sock *msk,
 		peer = MCTP_ADDR_ANY;
 
 	/* be optimistic, alloc now */
-	key = mctp_key_alloc(msk, local, peer, 0, GFP_KERNEL);
+	key = mctp_key_alloc(msk, netid, local, peer, 0, GFP_KERNEL);
 	if (!key)
 		return ERR_PTR(-ENOMEM);
 
@@ -668,6 +677,10 @@ struct mctp_sk_key *mctp_alloc_local_tag(struct mctp_sock *msk,
 		 * lock held, they don't change over the lifetime of the key.
 		 */
 
+		/* tags are net-specific */
+		if (tmp->net != netid)
+			continue;
+
 		/* if we don't own the tag, it can't conflict */
 		if (tmp->tag & MCTP_HDR_FLAG_TO)
 			continue;
@@ -716,6 +729,7 @@ struct mctp_sk_key *mctp_alloc_local_tag(struct mctp_sock *msk,
 }
 
 static struct mctp_sk_key *mctp_lookup_prealloc_tag(struct mctp_sock *msk,
+						    unsigned int netid,
 						    mctp_eid_t daddr,
 						    u8 req_tag, u8 *tagp)
 {
@@ -730,6 +744,9 @@ static struct mctp_sk_key *mctp_lookup_prealloc_tag(struct mctp_sock *msk,
 	spin_lock_irqsave(&mns->keys_lock, flags);
 
 	hlist_for_each_entry(tmp, &mns->keys, hlist) {
+		if (tmp->net != netid)
+			continue;
+
 		if (tmp->tag != req_tag)
 			continue;
 
@@ -910,6 +927,7 @@ int mctp_local_output(struct sock *sk, struct mctp_route *rt,
 	struct mctp_sk_key *key;
 	struct mctp_hdr *hdr;
 	unsigned long flags;
+	unsigned int netid;
 	unsigned int mtu;
 	mctp_eid_t saddr;
 	bool ext_rt;
@@ -960,16 +978,17 @@ int mctp_local_output(struct sock *sk, struct mctp_route *rt,
 		rc = 0;
 	}
 	spin_unlock_irqrestore(&rt->dev->addrs_lock, flags);
+	netid = READ_ONCE(rt->dev->net);
 
 	if (rc)
 		goto out_release;
 
 	if (req_tag & MCTP_TAG_OWNER) {
 		if (req_tag & MCTP_TAG_PREALLOC)
-			key = mctp_lookup_prealloc_tag(msk, daddr,
+			key = mctp_lookup_prealloc_tag(msk, netid, daddr,
 						       req_tag, &tag);
 		else
-			key = mctp_alloc_local_tag(msk, saddr, daddr,
+			key = mctp_alloc_local_tag(msk, netid, saddr, daddr,
 						   false, &tag);
 
 		if (IS_ERR(key)) {
diff --git a/net/mctp/test/route-test.c b/net/mctp/test/route-test.c
index 714e5ae47629..b3dbd3600d91 100644
--- a/net/mctp/test/route-test.c
+++ b/net/mctp/test/route-test.c
@@ -552,6 +552,7 @@ static void mctp_test_route_input_sk_keys(struct kunit *test)
 	struct mctp_sock *msk;
 	struct socket *sock;
 	unsigned long flags;
+	unsigned int net;
 	int rc;
 	u8 c;
 
@@ -559,6 +560,7 @@ static void mctp_test_route_input_sk_keys(struct kunit *test)
 
 	dev = mctp_test_create_dev();
 	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, dev);
+	net = READ_ONCE(dev->mdev->net);
 
 	rt = mctp_test_create_route(&init_net, dev->mdev, 8, 68);
 	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, rt);
@@ -570,8 +572,9 @@ static void mctp_test_route_input_sk_keys(struct kunit *test)
 	mns = &sock_net(sock->sk)->mctp;
 
 	/* set the incoming tag according to test params */
-	key = mctp_key_alloc(msk, params->key_local_addr, params->key_peer_addr,
-			     params->key_tag, GFP_KERNEL);
+	key = mctp_key_alloc(msk, net, params->key_local_addr,
+			     params->key_peer_addr, params->key_tag,
+			     GFP_KERNEL);
 
 	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, key);
 
-- 
cgit v1.2.3


From c16d2380e8fdd9fc1fd0e9c60e068f264b2d0ced Mon Sep 17 00:00:00 2001
From: Jeremy Kerr <jk@codeconstruct.com.au>
Date: Mon, 19 Feb 2024 17:51:51 +0800
Subject: net: mctp: provide a more specific tag allocation ioctl

Now that we have net-specific tags, extend the tag allocation ioctls
(SIOCMCTPALLOCTAG / SIOCMCTPDROPTAG) to allow a network parameter to be
passed to the tag allocation.

We also add a local_addr member to the ioc struct, to allow for a future
finer-grained tag allocation using local EIDs too. We don't add any
specific support for that now though, so require MCTP_ADDR_ANY or
MCTP_ADDR_NULL for those at present.

The old ioctls will still work, but allocate for the default MCTP net.
These are now marked as deprecated in the header.

Signed-off-by: Jeremy Kerr <jk@codeconstruct.com.au>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/uapi/linux/mctp.h |  32 +++++++++++++
 net/mctp/af_mctp.c        | 117 ++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 129 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/mctp.h b/include/uapi/linux/mctp.h
index 154ab56651f1..e1db65df9359 100644
--- a/include/uapi/linux/mctp.h
+++ b/include/uapi/linux/mctp.h
@@ -50,7 +50,14 @@ struct sockaddr_mctp_ext {
 
 #define SIOCMCTPALLOCTAG	(SIOCPROTOPRIVATE + 0)
 #define SIOCMCTPDROPTAG		(SIOCPROTOPRIVATE + 1)
+#define SIOCMCTPALLOCTAG2	(SIOCPROTOPRIVATE + 2)
+#define SIOCMCTPDROPTAG2	(SIOCPROTOPRIVATE + 3)
 
+/* Deprecated: use mctp_ioc_tag_ctl2 / TAG2 ioctls instead, which defines the
+ * MCTP network ID as part of the allocated tag. Using this assumes the default
+ * net ID for allocated tags, which may not give correct behaviour on system
+ * with multiple networks configured.
+ */
 struct mctp_ioc_tag_ctl {
 	mctp_eid_t	peer_addr;
 
@@ -65,4 +72,29 @@ struct mctp_ioc_tag_ctl {
 	__u16		flags;
 };
 
+struct mctp_ioc_tag_ctl2 {
+	/* Peer details: network ID, peer EID, local EID. All set by the
+	 * caller.
+	 *
+	 * Local EID must be MCTP_ADDR_NULL or MCTP_ADDR_ANY in current
+	 * kernels.
+	 */
+	unsigned int	net;
+	mctp_eid_t	peer_addr;
+	mctp_eid_t	local_addr;
+
+	/* Set by caller, but no flags defined currently. Must be 0 */
+	__u16		flags;
+
+	/* For SIOCMCTPALLOCTAG2: must be passed as zero, kernel will
+	 * populate with the allocated tag value. Returned tag value will
+	 * always have TO and PREALLOC set.
+	 *
+	 * For SIOCMCTPDROPTAG2: userspace provides tag value to drop, from
+	 * a prior SIOCMCTPALLOCTAG2 call (and so must have TO and PREALLOC set).
+	 */
+	__u8		tag;
+
+};
+
 #endif /* __UAPI_MCTP_H */
diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c
index 05315a422ffb..de52a9191da0 100644
--- a/net/mctp/af_mctp.c
+++ b/net/mctp/af_mctp.c
@@ -350,30 +350,102 @@ static int mctp_getsockopt(struct socket *sock, int level, int optname,
 	return -EINVAL;
 }
 
-static int mctp_ioctl_alloctag(struct mctp_sock *msk, unsigned long arg)
+/* helpers for reading/writing the tag ioc, handling compatibility across the
+ * two versions, and some basic API error checking
+ */
+static int mctp_ioctl_tag_copy_from_user(unsigned long arg,
+					 struct mctp_ioc_tag_ctl2 *ctl,
+					 bool tagv2)
+{
+	struct mctp_ioc_tag_ctl ctl_compat;
+	unsigned long size;
+	void *ptr;
+	int rc;
+
+	if (tagv2) {
+		size = sizeof(*ctl);
+		ptr = ctl;
+	} else {
+		size = sizeof(ctl_compat);
+		ptr = &ctl_compat;
+	}
+
+	rc = copy_from_user(ptr, (void __user *)arg, size);
+	if (rc)
+		return -EFAULT;
+
+	if (!tagv2) {
+		/* compat, using defaults for new fields */
+		ctl->net = MCTP_INITIAL_DEFAULT_NET;
+		ctl->peer_addr = ctl_compat.peer_addr;
+		ctl->local_addr = MCTP_ADDR_ANY;
+		ctl->flags = ctl_compat.flags;
+		ctl->tag = ctl_compat.tag;
+	}
+
+	if (ctl->flags)
+		return -EINVAL;
+
+	if (ctl->local_addr != MCTP_ADDR_ANY &&
+	    ctl->local_addr != MCTP_ADDR_NULL)
+		return -EINVAL;
+
+	return 0;
+}
+
+static int mctp_ioctl_tag_copy_to_user(unsigned long arg,
+				       struct mctp_ioc_tag_ctl2 *ctl,
+				       bool tagv2)
+{
+	struct mctp_ioc_tag_ctl ctl_compat;
+	unsigned long size;
+	void *ptr;
+	int rc;
+
+	if (tagv2) {
+		ptr = ctl;
+		size = sizeof(*ctl);
+	} else {
+		ctl_compat.peer_addr = ctl->peer_addr;
+		ctl_compat.tag = ctl->tag;
+		ctl_compat.flags = ctl->flags;
+
+		ptr = &ctl_compat;
+		size = sizeof(ctl_compat);
+	}
+
+	rc = copy_to_user((void __user *)arg, ptr, size);
+	if (rc)
+		return -EFAULT;
+
+	return 0;
+}
+
+static int mctp_ioctl_alloctag(struct mctp_sock *msk, bool tagv2,
+			       unsigned long arg)
 {
 	struct net *net = sock_net(&msk->sk);
 	struct mctp_sk_key *key = NULL;
-	struct mctp_ioc_tag_ctl ctl;
+	struct mctp_ioc_tag_ctl2 ctl;
 	unsigned long flags;
 	u8 tag;
+	int rc;
 
-	if (copy_from_user(&ctl, (void __user *)arg, sizeof(ctl)))
-		return -EFAULT;
+	rc = mctp_ioctl_tag_copy_from_user(arg, &ctl, tagv2);
+	if (rc)
+		return rc;
 
 	if (ctl.tag)
 		return -EINVAL;
 
-	if (ctl.flags)
-		return -EINVAL;
-
-	key = mctp_alloc_local_tag(msk, MCTP_INITIAL_DEFAULT_NET,
-				   MCTP_ADDR_ANY, ctl.peer_addr, true, &tag);
+	key = mctp_alloc_local_tag(msk, ctl.net, MCTP_ADDR_ANY,
+				   ctl.peer_addr, true, &tag);
 	if (IS_ERR(key))
 		return PTR_ERR(key);
 
 	ctl.tag = tag | MCTP_TAG_OWNER | MCTP_TAG_PREALLOC;
-	if (copy_to_user((void __user *)arg, &ctl, sizeof(ctl))) {
+	rc = mctp_ioctl_tag_copy_to_user(arg, &ctl, tagv2);
+	if (rc) {
 		unsigned long fl2;
 		/* Unwind our key allocation: the keys list lock needs to be
 		 * taken before the individual key locks, and we need a valid
@@ -385,28 +457,27 @@ static int mctp_ioctl_alloctag(struct mctp_sock *msk, unsigned long arg)
 		__mctp_key_remove(key, net, fl2, MCTP_TRACE_KEY_DROPPED);
 		mctp_key_unref(key);
 		spin_unlock_irqrestore(&net->mctp.keys_lock, flags);
-		return -EFAULT;
+		return rc;
 	}
 
 	mctp_key_unref(key);
 	return 0;
 }
 
-static int mctp_ioctl_droptag(struct mctp_sock *msk, unsigned long arg)
+static int mctp_ioctl_droptag(struct mctp_sock *msk, bool tagv2,
+			      unsigned long arg)
 {
 	struct net *net = sock_net(&msk->sk);
-	struct mctp_ioc_tag_ctl ctl;
+	struct mctp_ioc_tag_ctl2 ctl;
 	unsigned long flags, fl2;
 	struct mctp_sk_key *key;
 	struct hlist_node *tmp;
 	int rc;
 	u8 tag;
 
-	if (copy_from_user(&ctl, (void __user *)arg, sizeof(ctl)))
-		return -EFAULT;
-
-	if (ctl.flags)
-		return -EINVAL;
+	rc = mctp_ioctl_tag_copy_from_user(arg, &ctl, tagv2);
+	if (rc)
+		return rc;
 
 	/* Must be a local tag, TO set, preallocated */
 	if ((ctl.tag & ~MCTP_TAG_MASK) != (MCTP_TAG_OWNER | MCTP_TAG_PREALLOC))
@@ -422,6 +493,7 @@ static int mctp_ioctl_droptag(struct mctp_sock *msk, unsigned long arg)
 		 */
 		spin_lock_irqsave(&key->lock, fl2);
 		if (key->manual_alloc &&
+		    ctl.net == key->net &&
 		    ctl.peer_addr == key->peer_addr &&
 		    tag == key->tag) {
 			__mctp_key_remove(key, net, fl2,
@@ -439,12 +511,17 @@ static int mctp_ioctl_droptag(struct mctp_sock *msk, unsigned long arg)
 static int mctp_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 {
 	struct mctp_sock *msk = container_of(sock->sk, struct mctp_sock, sk);
+	bool tagv2 = false;
 
 	switch (cmd) {
+	case SIOCMCTPALLOCTAG2:
 	case SIOCMCTPALLOCTAG:
-		return mctp_ioctl_alloctag(msk, arg);
+		tagv2 = cmd == SIOCMCTPALLOCTAG2;
+		return mctp_ioctl_alloctag(msk, tagv2, arg);
 	case SIOCMCTPDROPTAG:
-		return mctp_ioctl_droptag(msk, arg);
+	case SIOCMCTPDROPTAG2:
+		tagv2 = cmd == SIOCMCTPDROPTAG2;
+		return mctp_ioctl_droptag(msk, tagv2, arg);
 	}
 
 	return -EINVAL;
-- 
cgit v1.2.3


From 58fd62e0aa50fdd20bc41a01e787001f3af8a925 Mon Sep 17 00:00:00 2001
From: Martin Kelly <martin.kelly@crowdstrike.com>
Date: Wed, 21 Feb 2024 13:18:38 -0800
Subject: bpf: Clarify batch lookup/lookup_and_delete semantics

The batch lookup and lookup_and_delete APIs have two parameters,
in_batch and out_batch, to facilitate iterative
lookup/lookup_and_deletion operations for supported maps. Except NULL
for in_batch at the start of these two batch operations, both parameters
need to point to memory equal or larger than the respective map key
size, except for various hashmaps (hash, percpu_hash, lru_hash,
lru_percpu_hash) where the in_batch/out_batch memory size should be
at least 4 bytes.

Document these semantics to clarify the API.

Signed-off-by: Martin Kelly <martin.kelly@crowdstrike.com>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20240221211838.1241578-1-martin.kelly@crowdstrike.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/uapi/linux/bpf.h       |  6 +++++-
 tools/include/uapi/linux/bpf.h |  6 +++++-
 tools/lib/bpf/bpf.h            | 17 ++++++++++++-----
 3 files changed, 22 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index d96708380e52..d2e6c5fcec01 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -617,7 +617,11 @@ union bpf_iter_link_info {
  *		to NULL to begin the batched operation. After each subsequent
  *		**BPF_MAP_LOOKUP_BATCH**, the caller should pass the resultant
  *		*out_batch* as the *in_batch* for the next operation to
- *		continue iteration from the current point.
+ *		continue iteration from the current point. Both *in_batch* and
+ *		*out_batch* must point to memory large enough to hold a key,
+ *		except for maps of type **BPF_MAP_TYPE_{HASH, PERCPU_HASH,
+ *		LRU_HASH, LRU_PERCPU_HASH}**, for which batch parameters
+ *		must be at least 4 bytes wide regardless of key size.
  *
  *		The *keys* and *values* are output parameters which must point
  *		to memory large enough to hold *count* items based on the key
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index d96708380e52..d2e6c5fcec01 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -617,7 +617,11 @@ union bpf_iter_link_info {
  *		to NULL to begin the batched operation. After each subsequent
  *		**BPF_MAP_LOOKUP_BATCH**, the caller should pass the resultant
  *		*out_batch* as the *in_batch* for the next operation to
- *		continue iteration from the current point.
+ *		continue iteration from the current point. Both *in_batch* and
+ *		*out_batch* must point to memory large enough to hold a key,
+ *		except for maps of type **BPF_MAP_TYPE_{HASH, PERCPU_HASH,
+ *		LRU_HASH, LRU_PERCPU_HASH}**, for which batch parameters
+ *		must be at least 4 bytes wide regardless of key size.
  *
  *		The *keys* and *values* are output parameters which must point
  *		to memory large enough to hold *count* items based on the key
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index ab2570d28aec..df0db2f0cdb7 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -190,10 +190,14 @@ LIBBPF_API int bpf_map_delete_batch(int fd, const void *keys,
 /**
  * @brief **bpf_map_lookup_batch()** allows for batch lookup of BPF map elements.
  *
- * The parameter *in_batch* is the address of the first element in the batch to read.
- * *out_batch* is an output parameter that should be passed as *in_batch* to subsequent
- * calls to **bpf_map_lookup_batch()**. NULL can be passed for *in_batch* to indicate
- * that the batched lookup starts from the beginning of the map.
+ * The parameter *in_batch* is the address of the first element in the batch to
+ * read. *out_batch* is an output parameter that should be passed as *in_batch*
+ * to subsequent calls to **bpf_map_lookup_batch()**. NULL can be passed for
+ * *in_batch* to indicate that the batched lookup starts from the beginning of
+ * the map. Both *in_batch* and *out_batch* must point to memory large enough to
+ * hold a single key, except for maps of type **BPF_MAP_TYPE_{HASH, PERCPU_HASH,
+ * LRU_HASH, LRU_PERCPU_HASH}**, for which the memory size must be at
+ * least 4 bytes wide regardless of key size.
  *
  * The *keys* and *values* are output parameters which must point to memory large enough to
  * hold *count* items based on the key and value size of the map *map_fd*. The *keys*
@@ -226,7 +230,10 @@ LIBBPF_API int bpf_map_lookup_batch(int fd, void *in_batch, void *out_batch,
  *
  * @param fd BPF map file descriptor
  * @param in_batch address of the first element in batch to read, can pass NULL to
- * get address of the first element in *out_batch*
+ * get address of the first element in *out_batch*. If not NULL, must be large
+ * enough to hold a key. For **BPF_MAP_TYPE_{HASH, PERCPU_HASH, LRU_HASH,
+ * LRU_PERCPU_HASH}**, the memory size must be at least 4 bytes wide regardless
+ * of key size.
  * @param out_batch output parameter that should be passed to next call as *in_batch*
  * @param keys pointer to an array of *count* keys
  * @param values pointer to an array large enough for *count* values
-- 
cgit v1.2.3


From 025f8ad20f2e3264d11683aa9cbbf0083eefbdcd Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 22 Feb 2024 15:03:10 +0100
Subject: net: mpls: error out if inner headers are not set

mpls_gso_segment() assumes skb_inner_network_header() returns
a valid result:

  mpls_hlen = skb_inner_network_header(skb) - skb_network_header(skb);
  if (unlikely(!mpls_hlen || mpls_hlen % MPLS_HLEN))
        goto out;
  if (unlikely(!pskb_may_pull(skb, mpls_hlen)))

With syzbot reproducer, skb_inner_network_header() yields 0,
skb_network_header() returns 108, so this will
"pskb_may_pull(skb, -108)))" which triggers a newly added
DEBUG_NET_WARN_ON_ONCE() check:

------------[ cut here ]------------
WARNING: CPU: 0 PID: 5068 at include/linux/skbuff.h:2723 pskb_may_pull_reason include/linux/skbuff.h:2723 [inline]
WARNING: CPU: 0 PID: 5068 at include/linux/skbuff.h:2723 pskb_may_pull include/linux/skbuff.h:2739 [inline]
WARNING: CPU: 0 PID: 5068 at include/linux/skbuff.h:2723 mpls_gso_segment+0x773/0xaa0 net/mpls/mpls_gso.c:34
[..]
 skb_mac_gso_segment+0x383/0x740 net/core/gso.c:53
 nsh_gso_segment+0x40a/0xad0 net/nsh/nsh.c:108
 skb_mac_gso_segment+0x383/0x740 net/core/gso.c:53
 __skb_gso_segment+0x324/0x4c0 net/core/gso.c:124
 skb_gso_segment include/net/gso.h:83 [inline]
 [..]
 sch_direct_xmit+0x11a/0x5f0 net/sched/sch_generic.c:327
 [..]
 packet_sendmsg+0x46a9/0x6130 net/packet/af_packet.c:3113
 [..]

First iteration of this patch made mpls_hlen signed and changed
test to error out to "mpls_hlen <= 0 || ..".

Eric Dumazet said:
 > I was thinking about adding a debug check in skb_inner_network_header()
 > if inner_network_header is zero (that would mean it is not 'set' yet),
 > but this would trigger even after your patch.

So add new skb_inner_network_header_was_set() helper and use that.

The syzbot reproducer injects data via packet socket. The skb that gets
allocated and passed down the stack has ->protocol set to NSH (0x894f)
and gso_type set to SKB_GSO_UDP | SKB_GSO_DODGY.

This gets passed to skb_mac_gso_segment(), which sees NSH as ptype to
find a callback for.  nsh_gso_segment() retrieves next type:

        proto = tun_p_to_eth_p(nsh_hdr(skb)->np);

... which is MPLS (TUN_P_MPLS_UC). It updates skb->protocol and then
calls mpls_gso_segment().  Inner offsets are all 0, so mpls_gso_segment()
ends up with a negative header size.

In case more callers rely on silent handling of such large may_pull values
we could also 'legalize' this behaviour, either replacing the debug check
with (len > INT_MAX) test or removing it and instead adding a comment
before existing

 if (unlikely(len > skb->len))
    return SKB_DROP_REASON_PKT_TOO_SMALL;

test in pskb_may_pull_reason(), saying that this check also implicitly
takes care of callers that miscompute header sizes.

Cc: Simon Horman <horms@kernel.org>
Fixes: 219eee9c0d16 ("net: skbuff: add overflow debug check to pull/push helpers")
Reported-by: syzbot+99d15fcdb0132a1e1a82@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/netdev/00000000000043b1310611e388aa@google.com/raw
Signed-off-by: Florian Westphal <fw@strlen.de>
Link: https://lore.kernel.org/r/20240222140321.14080-1-fw@strlen.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h | 5 +++++
 net/mpls/mpls_gso.c    | 3 +++
 2 files changed, 8 insertions(+)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 28c7cb7ce251..1470b74fb6d2 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2894,6 +2894,11 @@ static inline void skb_set_inner_network_header(struct sk_buff *skb,
 	skb->inner_network_header += offset;
 }
 
+static inline bool skb_inner_network_header_was_set(const struct sk_buff *skb)
+{
+	return skb->inner_network_header > 0;
+}
+
 static inline unsigned char *skb_inner_mac_header(const struct sk_buff *skb)
 {
 	return skb->head + skb->inner_mac_header;
diff --git a/net/mpls/mpls_gso.c b/net/mpls/mpls_gso.c
index 533d082f0701..45d1e6a157fc 100644
--- a/net/mpls/mpls_gso.c
+++ b/net/mpls/mpls_gso.c
@@ -27,6 +27,9 @@ static struct sk_buff *mpls_gso_segment(struct sk_buff *skb,
 	__be16 mpls_protocol;
 	unsigned int mpls_hlen;
 
+	if (!skb_inner_network_header_was_set(skb))
+		goto out;
+
 	skb_reset_network_header(skb);
 	mpls_hlen = skb_inner_network_header(skb) - skb_network_header(skb);
 	if (unlikely(!mpls_hlen || mpls_hlen % MPLS_HLEN))
-- 
cgit v1.2.3


From 5fd5403964ecf047b03a9608a339bdc26a109f33 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Thu, 22 Feb 2024 14:28:19 -0800
Subject: genetlink: make info in GENL_REQ_ATTR_CHECK() const

Make the local variable in GENL_REQ_ATTR_CHECK() const.
genl_info_dump() returns a const pointer, so the macro
is currently hard to use in genl dumps.

Link: https://lore.kernel.org/r/20240222222819.156320-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/genetlink.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/genetlink.h b/include/net/genetlink.h
index ecadba836ae5..9ece6e5a3ea8 100644
--- a/include/net/genetlink.h
+++ b/include/net/genetlink.h
@@ -153,7 +153,7 @@ static inline void *genl_info_userhdr(const struct genl_info *info)
 
 /* Report that a root attribute is missing */
 #define GENL_REQ_ATTR_CHECK(info, attr) ({				\
-	struct genl_info *__info = (info);				\
+	const struct genl_info *__info = (info);			\
 									\
 	NL_REQ_ATTR_CHECK(__info->extack, NULL, __info->attrs, (attr)); \
 })
-- 
cgit v1.2.3


From 8afc7a78d55de726b2747d7775c54def79509ec5 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 22 Feb 2024 10:50:10 +0000
Subject: ipv6: prepare inet6_fill_ifinfo() for RCU protection

We want to use RCU protection instead of RTNL
for inet6_fill_ifinfo().

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  6 ++++--
 net/core/dev.c            |  4 ++--
 net/ipv6/addrconf.c       | 11 +++++++----
 3 files changed, 13 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f07c8374f29c..09023e44db4e 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4354,8 +4354,10 @@ static inline bool netif_testing(const struct net_device *dev)
  */
 static inline bool netif_oper_up(const struct net_device *dev)
 {
-	return (dev->operstate == IF_OPER_UP ||
-		dev->operstate == IF_OPER_UNKNOWN /* backward compat */);
+	unsigned int operstate = READ_ONCE(dev->operstate);
+
+	return	operstate == IF_OPER_UP ||
+		operstate == IF_OPER_UNKNOWN /* backward compat */;
 }
 
 /**
diff --git a/net/core/dev.c b/net/core/dev.c
index 0628d8ff1ed9..275fd5259a4a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -8632,12 +8632,12 @@ unsigned int dev_get_flags(const struct net_device *dev)
 {
 	unsigned int flags;
 
-	flags = (dev->flags & ~(IFF_PROMISC |
+	flags = (READ_ONCE(dev->flags) & ~(IFF_PROMISC |
 				IFF_ALLMULTI |
 				IFF_RUNNING |
 				IFF_LOWER_UP |
 				IFF_DORMANT)) |
-		(dev->gflags & (IFF_PROMISC |
+		(READ_ONCE(dev->gflags) & (IFF_PROMISC |
 				IFF_ALLMULTI));
 
 	if (netif_running(dev)) {
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index a56dad307fe3..daa81556d118 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -6062,6 +6062,7 @@ static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev,
 	struct net_device *dev = idev->dev;
 	struct ifinfomsg *hdr;
 	struct nlmsghdr *nlh;
+	int ifindex, iflink;
 	void *protoinfo;
 
 	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*hdr), flags);
@@ -6072,16 +6073,18 @@ static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev,
 	hdr->ifi_family = AF_INET6;
 	hdr->__ifi_pad = 0;
 	hdr->ifi_type = dev->type;
-	hdr->ifi_index = dev->ifindex;
+	ifindex = READ_ONCE(dev->ifindex);
+	hdr->ifi_index = ifindex;
 	hdr->ifi_flags = dev_get_flags(dev);
 	hdr->ifi_change = 0;
 
+	iflink = dev_get_iflink(dev);
 	if (nla_put_string(skb, IFLA_IFNAME, dev->name) ||
 	    (dev->addr_len &&
 	     nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr)) ||
-	    nla_put_u32(skb, IFLA_MTU, dev->mtu) ||
-	    (dev->ifindex != dev_get_iflink(dev) &&
-	     nla_put_u32(skb, IFLA_LINK, dev_get_iflink(dev))) ||
+	    nla_put_u32(skb, IFLA_MTU, READ_ONCE(dev->mtu)) ||
+	    (ifindex != iflink &&
+	     nla_put_u32(skb, IFLA_LINK, iflink)) ||
 	    nla_put_u8(skb, IFLA_OPERSTATE,
 		       netif_running(dev) ? READ_ONCE(dev->operstate) : IF_OPER_DOWN))
 		goto nla_put_failure;
-- 
cgit v1.2.3


From 386520e0ecc01004d3a29c70c5a77d4bbf8a8420 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 22 Feb 2024 10:50:15 +0000
Subject: rtnetlink: add RTNL_FLAG_DUMP_UNLOCKED flag

Similarly to RTNL_FLAG_DOIT_UNLOCKED, this new flag
allows dump operations registered via rtnl_register()
or rtnl_register_module() to opt-out from RTNL protection.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Donald Hunter <donald.hunter@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netlink.h  | 2 ++
 include/net/rtnetlink.h  | 1 +
 net/core/rtnetlink.c     | 2 ++
 net/netlink/af_netlink.c | 3 +++
 4 files changed, 8 insertions(+)

(limited to 'include')

diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index 1a4445bf2ab9..5df7340d4dab 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -291,6 +291,7 @@ struct netlink_callback {
 	u16			answer_flags;
 	u32			min_dump_alloc;
 	unsigned int		prev_seq, seq;
+	int			flags;
 	bool			strict_check;
 	union {
 		u8		ctx[48];
@@ -323,6 +324,7 @@ struct netlink_dump_control {
 	void *data;
 	struct module *module;
 	u32 min_dump_alloc;
+	int flags;
 };
 
 int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h
index 6506221c5fe3..3bfb80bad173 100644
--- a/include/net/rtnetlink.h
+++ b/include/net/rtnetlink.h
@@ -12,6 +12,7 @@ typedef int (*rtnl_dumpit_func)(struct sk_buff *, struct netlink_callback *);
 enum rtnl_link_flags {
 	RTNL_FLAG_DOIT_UNLOCKED		= BIT(0),
 	RTNL_FLAG_BULK_DEL_SUPPORTED	= BIT(1),
+	RTNL_FLAG_DUMP_UNLOCKED		= BIT(2),
 };
 
 enum rtnl_kinds {
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 060543fe7919..1b26dfa5668d 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -6532,6 +6532,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
 		}
 		owner = link->owner;
 		dumpit = link->dumpit;
+		flags = link->flags;
 
 		if (type == RTM_GETLINK - RTM_BASE)
 			min_dump_alloc = rtnl_calcit(skb, nlh);
@@ -6549,6 +6550,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
 				.dump		= dumpit,
 				.min_dump_alloc	= min_dump_alloc,
 				.module		= owner,
+				.flags		= flags,
 			};
 			err = netlink_dump_start(rtnl, skb, nlh, &c);
 			/* netlink_dump_start() will keep a reference on
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 84cad7be6d43..be5792b638aa 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -2261,6 +2261,8 @@ static int netlink_dump(struct sock *sk, bool lock_taken)
 
 		cb->extack = &extack;
 
+		if (cb->flags & RTNL_FLAG_DUMP_UNLOCKED)
+			extra_mutex = NULL;
 		if (extra_mutex)
 			mutex_lock(extra_mutex);
 		nlk->dump_done_errno = cb->dump(skb, cb);
@@ -2355,6 +2357,7 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
 	cb->data = control->data;
 	cb->module = control->module;
 	cb->min_dump_alloc = control->min_dump_alloc;
+	cb->flags = control->flags;
 	cb->skb = skb;
 
 	cb->strict_check = nlk_test_bit(STRICT_CHK, NETLINK_CB(skb).sk);
-- 
cgit v1.2.3


From 22e36ea9f5d7707ae3d64c497d172f4ef735c353 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 22 Feb 2024 10:50:17 +0000
Subject: inet: allow ip_valid_fib_dump_req() to be called with RTNL or RCU

Add a new field into struct fib_dump_filter, to let callers
tell if they use RTNL locking or RCU.

This is used in the following patch, when inet_dump_fib()
no longer holds RTNL.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Donald Hunter <donald.hunter@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h    |  1 +
 net/ipv4/fib_frontend.c | 15 +++++++++++----
 net/ipv4/ipmr.c         |  4 +++-
 net/ipv6/ip6_fib.c      |  7 +++++--
 net/ipv6/ip6mr.c        |  4 +++-
 net/mpls/af_mpls.c      |  4 +++-
 6 files changed, 26 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index d4667b7797e3..9b2f69ba5e49 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -264,6 +264,7 @@ struct fib_dump_filter {
 	bool			filter_set;
 	bool			dump_routes;
 	bool			dump_exceptions;
+	bool			rtnl_held;
 	unsigned char		protocol;
 	unsigned char		rt_type;
 	unsigned int		flags;
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 390f4be7f7be..39f67990e01c 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -916,7 +916,8 @@ int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
 	struct rtmsg *rtm;
 	int err, i;
 
-	ASSERT_RTNL();
+	if (filter->rtnl_held)
+		ASSERT_RTNL();
 
 	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
 		NL_SET_ERR_MSG(extack, "Invalid header for FIB dump request");
@@ -961,7 +962,10 @@ int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
 			break;
 		case RTA_OIF:
 			ifindex = nla_get_u32(tb[i]);
-			filter->dev = __dev_get_by_index(net, ifindex);
+			if (filter->rtnl_held)
+				filter->dev = __dev_get_by_index(net, ifindex);
+			else
+				filter->dev = dev_get_by_index_rcu(net, ifindex);
 			if (!filter->dev)
 				return -ENODEV;
 			break;
@@ -983,8 +987,11 @@ EXPORT_SYMBOL_GPL(ip_valid_fib_dump_req);
 
 static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 {
-	struct fib_dump_filter filter = { .dump_routes = true,
-					  .dump_exceptions = true };
+	struct fib_dump_filter filter = {
+		.dump_routes = true,
+		.dump_exceptions = true,
+		.rtnl_held = true,
+	};
 	const struct nlmsghdr *nlh = cb->nlh;
 	struct net *net = sock_net(skb->sk);
 	unsigned int h, s_h;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 5561bce3a37e..0708ac6f6c58 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -2587,7 +2587,9 @@ errout_free:
 
 static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
 {
-	struct fib_dump_filter filter = {};
+	struct fib_dump_filter filter = {
+		.rtnl_held = true,
+	};
 	int err;
 
 	if (cb->strict_check) {
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 6540d877d369..5c558dc1c683 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -620,8 +620,11 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb,
 
 static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 {
-	struct rt6_rtnl_dump_arg arg = { .filter.dump_exceptions = true,
-					 .filter.dump_routes = true };
+	struct rt6_rtnl_dump_arg arg = {
+		.filter.dump_exceptions = true,
+		.filter.dump_routes = true,
+		.filter.rtnl_held = true,
+	};
 	const struct nlmsghdr *nlh = cb->nlh;
 	struct net *net = sock_net(skb->sk);
 	unsigned int h, s_h;
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 1f19743f2540..cb0ee81a068a 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -2592,7 +2592,9 @@ static int ip6mr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	const struct nlmsghdr *nlh = cb->nlh;
-	struct fib_dump_filter filter = {};
+	struct fib_dump_filter filter = {
+		.rtnl_held = true,
+	};
 	int err;
 
 	if (cb->strict_check) {
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 1af29af65388..6dab883a08dd 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -2179,7 +2179,9 @@ static int mpls_dump_routes(struct sk_buff *skb, struct netlink_callback *cb)
 	const struct nlmsghdr *nlh = cb->nlh;
 	struct net *net = sock_net(skb->sk);
 	struct mpls_route __rcu **platform_label;
-	struct fib_dump_filter filter = {};
+	struct fib_dump_filter filter = {
+		.rtnl_held = true,
+	};
 	unsigned int flags = NLM_F_MULTI;
 	size_t platform_labels;
 	unsigned int index;
-- 
cgit v1.2.3


From 0ac3fa0c3b365f97c3969f391edf7b44d3bb210d Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 22 Feb 2024 10:50:18 +0000
Subject: nexthop: allow nexthop_mpath_fill_node() to be called without RTNL

nexthop_mpath_fill_node() will be potentially called
from contexts holding rcu_lock instead of RTNL.

Suggested-by: Ido Schimmel <idosch@nvidia.com>
Link: https://lore.kernel.org/all/ZdZDWVdjMaQkXBgW@shredder/
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Donald Hunter <donald.hunter@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/nexthop.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/nexthop.h b/include/net/nexthop.h
index 6647ad509faa..77e99cba60ad 100644
--- a/include/net/nexthop.h
+++ b/include/net/nexthop.h
@@ -317,7 +317,7 @@ static inline
 int nexthop_mpath_fill_node(struct sk_buff *skb, struct nexthop *nh,
 			    u8 rt_family)
 {
-	struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
+	struct nh_group *nhg = rcu_dereference_rtnl(nh->nh_grp);
 	int i;
 
 	for (i = 0; i < nhg->num_nh; i++) {
-- 
cgit v1.2.3


From c3718936ec47ae811a7ce9a618b6cb1cda835bab Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 23 Feb 2024 20:10:54 +0000
Subject: ipv6: anycast: complete RCU handling of struct ifacaddr6

struct ifacaddr6 are already freed after RCU grace period.

Add __rcu qualifier to aca_next pointer, and idev->ac_list

Add relevant rcu_assign_pointer() and dereference accessors.

ipv6_chk_acast_dev() no longer needs to acquire idev->lock.

/proc/net/anycast6 is now purely RCU protected, it no
longer acquires idev->lock.

Similarly in6_dump_addrs() can use RCU protection to iterate
through anycast addresses. It was relying on a mixture of RCU
and RTNL but next patches will get rid of RTNL there.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Link: https://lore.kernel.org/r/20240223201054.220534-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/if_inet6.h |  4 ++--
 net/ipv6/addrconf.c    |  4 ++--
 net/ipv6/anycast.c     | 61 +++++++++++++++++++-------------------------------
 3 files changed, 27 insertions(+), 42 deletions(-)

(limited to 'include')

diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h
index f07642264c1e..238ad3349456 100644
--- a/include/net/if_inet6.h
+++ b/include/net/if_inet6.h
@@ -144,7 +144,7 @@ struct ipv6_ac_socklist {
 struct ifacaddr6 {
 	struct in6_addr		aca_addr;
 	struct fib6_info	*aca_rt;
-	struct ifacaddr6	*aca_next;
+	struct ifacaddr6 __rcu	*aca_next;
 	struct hlist_node	aca_addr_lst;
 	int			aca_users;
 	refcount_t		aca_refcnt;
@@ -196,7 +196,7 @@ struct inet6_dev {
 	spinlock_t		mc_report_lock;	/* mld query report lock */
 	struct mutex		mc_lock;	/* mld global lock */
 
-	struct ifacaddr6	*ac_list;
+	struct ifacaddr6 __rcu	*ac_list;
 	rwlock_t		lock;
 	refcount_t		refcnt;
 	__u32			if_flags;
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index a280614b3765..e27069ad938c 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -5314,8 +5314,8 @@ next:
 	case ANYCAST_ADDR:
 		fillargs->event = RTM_GETANYCAST;
 		/* anycast address */
-		for (ifaca = idev->ac_list; ifaca;
-		     ifaca = ifaca->aca_next, ip_idx++) {
+		for (ifaca = rcu_dereference(idev->ac_list); ifaca;
+		     ifaca = rcu_dereference(ifaca->aca_next), ip_idx++) {
 			if (ip_idx < s_ip_idx)
 				continue;
 			err = inet6_fill_ifacaddr(skb, ifaca, fillargs);
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index bb17f484ee2c..0f2506e35359 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -296,7 +296,8 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr)
 		goto out;
 	}
 
-	for (aca = idev->ac_list; aca; aca = aca->aca_next) {
+	for (aca = rtnl_dereference(idev->ac_list); aca;
+	     aca = rtnl_dereference(aca->aca_next)) {
 		if (ipv6_addr_equal(&aca->aca_addr, addr)) {
 			aca->aca_users++;
 			err = 0;
@@ -317,13 +318,13 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr)
 		goto out;
 	}
 
-	aca->aca_next = idev->ac_list;
-	idev->ac_list = aca;
-
 	/* Hold this for addrconf_join_solict() below before we unlock,
 	 * it is already exposed via idev->ac_list.
 	 */
 	aca_get(aca);
+	aca->aca_next = idev->ac_list;
+	rcu_assign_pointer(idev->ac_list, aca);
+
 	write_unlock_bh(&idev->lock);
 
 	ipv6_add_acaddr_hash(net, aca);
@@ -350,7 +351,8 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr)
 
 	write_lock_bh(&idev->lock);
 	prev_aca = NULL;
-	for (aca = idev->ac_list; aca; aca = aca->aca_next) {
+	for (aca = rtnl_dereference(idev->ac_list); aca;
+	     aca = rtnl_dereference(aca->aca_next)) {
 		if (ipv6_addr_equal(&aca->aca_addr, addr))
 			break;
 		prev_aca = aca;
@@ -364,9 +366,9 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr)
 		return 0;
 	}
 	if (prev_aca)
-		prev_aca->aca_next = aca->aca_next;
+		rcu_assign_pointer(prev_aca->aca_next, aca->aca_next);
 	else
-		idev->ac_list = aca->aca_next;
+		rcu_assign_pointer(idev->ac_list, aca->aca_next);
 	write_unlock_bh(&idev->lock);
 	ipv6_del_acaddr_hash(aca);
 	addrconf_leave_solict(idev, &aca->aca_addr);
@@ -392,8 +394,8 @@ void ipv6_ac_destroy_dev(struct inet6_dev *idev)
 	struct ifacaddr6 *aca;
 
 	write_lock_bh(&idev->lock);
-	while ((aca = idev->ac_list) != NULL) {
-		idev->ac_list = aca->aca_next;
+	while ((aca = rtnl_dereference(idev->ac_list)) != NULL) {
+		rcu_assign_pointer(idev->ac_list, aca->aca_next);
 		write_unlock_bh(&idev->lock);
 
 		ipv6_del_acaddr_hash(aca);
@@ -420,11 +422,10 @@ static bool ipv6_chk_acast_dev(struct net_device *dev, const struct in6_addr *ad
 
 	idev = __in6_dev_get(dev);
 	if (idev) {
-		read_lock_bh(&idev->lock);
-		for (aca = idev->ac_list; aca; aca = aca->aca_next)
+		for (aca = rcu_dereference(idev->ac_list); aca;
+		     aca = rcu_dereference(aca->aca_next))
 			if (ipv6_addr_equal(&aca->aca_addr, addr))
 				break;
-		read_unlock_bh(&idev->lock);
 		return aca != NULL;
 	}
 	return false;
@@ -477,30 +478,25 @@ bool ipv6_chk_acast_addr_src(struct net *net, struct net_device *dev,
 struct ac6_iter_state {
 	struct seq_net_private p;
 	struct net_device *dev;
-	struct inet6_dev *idev;
 };
 
 #define ac6_seq_private(seq)	((struct ac6_iter_state *)(seq)->private)
 
 static inline struct ifacaddr6 *ac6_get_first(struct seq_file *seq)
 {
-	struct ifacaddr6 *im = NULL;
 	struct ac6_iter_state *state = ac6_seq_private(seq);
 	struct net *net = seq_file_net(seq);
+	struct ifacaddr6 *im = NULL;
 
-	state->idev = NULL;
 	for_each_netdev_rcu(net, state->dev) {
 		struct inet6_dev *idev;
+
 		idev = __in6_dev_get(state->dev);
 		if (!idev)
 			continue;
-		read_lock_bh(&idev->lock);
-		im = idev->ac_list;
-		if (im) {
-			state->idev = idev;
+		im = rcu_dereference(idev->ac_list);
+		if (im)
 			break;
-		}
-		read_unlock_bh(&idev->lock);
 	}
 	return im;
 }
@@ -508,22 +504,17 @@ static inline struct ifacaddr6 *ac6_get_first(struct seq_file *seq)
 static struct ifacaddr6 *ac6_get_next(struct seq_file *seq, struct ifacaddr6 *im)
 {
 	struct ac6_iter_state *state = ac6_seq_private(seq);
+	struct inet6_dev *idev;
 
-	im = im->aca_next;
+	im = rcu_dereference(im->aca_next);
 	while (!im) {
-		if (likely(state->idev != NULL))
-			read_unlock_bh(&state->idev->lock);
-
 		state->dev = next_net_device_rcu(state->dev);
-		if (!state->dev) {
-			state->idev = NULL;
+		if (!state->dev)
 			break;
-		}
-		state->idev = __in6_dev_get(state->dev);
-		if (!state->idev)
+		idev = __in6_dev_get(state->dev);
+		if (!idev)
 			continue;
-		read_lock_bh(&state->idev->lock);
-		im = state->idev->ac_list;
+		im = rcu_dereference(idev->ac_list);
 	}
 	return im;
 }
@@ -555,12 +546,6 @@ static void *ac6_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 static void ac6_seq_stop(struct seq_file *seq, void *v)
 	__releases(RCU)
 {
-	struct ac6_iter_state *state = ac6_seq_private(seq);
-
-	if (likely(state->idev != NULL)) {
-		read_unlock_bh(&state->idev->lock);
-		state->idev = NULL;
-	}
 	rcu_read_unlock();
 }
 
-- 
cgit v1.2.3


From 12a686c2e761f1f1f6e6e2117a9ab9c6de2ac8a7 Mon Sep 17 00:00:00 2001
From: Adam Li <adamli@os.amperecomputing.com>
Date: Mon, 26 Feb 2024 02:24:52 +0000
Subject: net: make SK_MEMORY_PCPU_RESERV tunable

This patch adds /proc/sys/net/core/mem_pcpu_rsv sysctl file,
to make SK_MEMORY_PCPU_RESERV tunable.

Commit 3cd3399dd7a8 ("net: implement per-cpu reserves for
memory_allocated") introduced per-cpu forward alloc cache:

"Implement a per-cpu cache of +1/-1 MB, to reduce number
of changes to sk->sk_prot->memory_allocated, which
would otherwise be cause of false sharing."

sk_prot->memory_allocated points to global atomic variable:
atomic_long_t tcp_memory_allocated ____cacheline_aligned_in_smp;

If increasing the per-cpu cache size from 1MB to e.g. 16MB,
changes to sk->sk_prot->memory_allocated can be further reduced.
Performance may be improved on system with many cores.

Signed-off-by: Adam Li <adamli@os.amperecomputing.com>
Reviewed-by: Christoph Lameter (Ampere) <cl@linux.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/admin-guide/sysctl/net.rst | 5 +++++
 include/net/sock.h                       | 5 +++--
 net/core/sock.c                          | 1 +
 net/core/sysctl_net_core.c               | 9 +++++++++
 4 files changed, 18 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/Documentation/admin-guide/sysctl/net.rst b/Documentation/admin-guide/sysctl/net.rst
index 396091651955..7250c0542828 100644
--- a/Documentation/admin-guide/sysctl/net.rst
+++ b/Documentation/admin-guide/sysctl/net.rst
@@ -206,6 +206,11 @@ Will increase power usage.
 
 Default: 0 (off)
 
+mem_pcpu_rsv
+------------
+
+Per-cpu reserved forward alloc cache size in page units. Default 1MB per CPU.
+
 rmem_default
 ------------
 
diff --git a/include/net/sock.h b/include/net/sock.h
index 796a902cf4c1..09a0cde8bf52 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1443,6 +1443,7 @@ sk_memory_allocated(const struct sock *sk)
 
 /* 1 MB per cpu, in page units */
 #define SK_MEMORY_PCPU_RESERVE (1 << (20 - PAGE_SHIFT))
+extern int sysctl_mem_pcpu_rsv;
 
 static inline void
 sk_memory_allocated_add(struct sock *sk, int amt)
@@ -1451,7 +1452,7 @@ sk_memory_allocated_add(struct sock *sk, int amt)
 
 	preempt_disable();
 	local_reserve = __this_cpu_add_return(*sk->sk_prot->per_cpu_fw_alloc, amt);
-	if (local_reserve >= SK_MEMORY_PCPU_RESERVE) {
+	if (local_reserve >= READ_ONCE(sysctl_mem_pcpu_rsv)) {
 		__this_cpu_sub(*sk->sk_prot->per_cpu_fw_alloc, local_reserve);
 		atomic_long_add(local_reserve, sk->sk_prot->memory_allocated);
 	}
@@ -1465,7 +1466,7 @@ sk_memory_allocated_sub(struct sock *sk, int amt)
 
 	preempt_disable();
 	local_reserve = __this_cpu_sub_return(*sk->sk_prot->per_cpu_fw_alloc, amt);
-	if (local_reserve <= -SK_MEMORY_PCPU_RESERVE) {
+	if (local_reserve <= -READ_ONCE(sysctl_mem_pcpu_rsv)) {
 		__this_cpu_sub(*sk->sk_prot->per_cpu_fw_alloc, local_reserve);
 		atomic_long_add(local_reserve, sk->sk_prot->memory_allocated);
 	}
diff --git a/net/core/sock.c b/net/core/sock.c
index 8d86886e39fa..df2ac54a8f74 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -283,6 +283,7 @@ __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 EXPORT_SYMBOL(sysctl_rmem_max);
 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
+int sysctl_mem_pcpu_rsv __read_mostly = SK_MEMORY_PCPU_RESERVE;
 
 int sysctl_tstamp_allow_data __read_mostly = 1;
 
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 0f0cb1465e08..986f15e5d6c4 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -30,6 +30,7 @@ static int int_3600 = 3600;
 static int min_sndbuf = SOCK_MIN_SNDBUF;
 static int min_rcvbuf = SOCK_MIN_RCVBUF;
 static int max_skb_frags = MAX_SKB_FRAGS;
+static int min_mem_pcpu_rsv = SK_MEMORY_PCPU_RESERVE;
 
 static int net_msg_warn;	/* Unused, but still a sysctl */
 
@@ -407,6 +408,14 @@ static struct ctl_table net_core_table[] = {
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &min_rcvbuf,
 	},
+	{
+		.procname	= "mem_pcpu_rsv",
+		.data		= &sysctl_mem_pcpu_rsv,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_mem_pcpu_rsv,
+	},
 	{
 		.procname	= "dev_weight",
 		.data		= &weight_p,
-- 
cgit v1.2.3


From 48e4704aedb9f17213206dc649fdac90f94cb749 Mon Sep 17 00:00:00 2001
From: Jason Xing <kernelxing@tencent.com>
Date: Mon, 26 Feb 2024 11:22:18 +0800
Subject: tcp: add a dropreason definitions and prepare for cookie check

Adding one drop reason to detect the condition of skb dropped
because of hook points in cookie check and extending NO_SOCKET
to consider another two cases can be used later.

Signed-off-by: Jason Xing <kernelxing@tencent.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dropreason-core.h | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h
index 6d3a20163260..a871f061558d 100644
--- a/include/net/dropreason-core.h
+++ b/include/net/dropreason-core.h
@@ -54,6 +54,7 @@
 	FN(NEIGH_QUEUEFULL)		\
 	FN(NEIGH_DEAD)			\
 	FN(TC_EGRESS)			\
+	FN(SECURITY_HOOK)		\
 	FN(QDISC_DROP)			\
 	FN(CPU_BACKLOG)			\
 	FN(XDP)				\
@@ -105,7 +106,13 @@ enum skb_drop_reason {
 	SKB_CONSUMED,
 	/** @SKB_DROP_REASON_NOT_SPECIFIED: drop reason is not specified */
 	SKB_DROP_REASON_NOT_SPECIFIED,
-	/** @SKB_DROP_REASON_NO_SOCKET: socket not found */
+	/**
+	 * @SKB_DROP_REASON_NO_SOCKET: no valid socket that can be used.
+	 * Reason could be one of three cases:
+	 * 1) no established/listening socket found during lookup process
+	 * 2) no valid request socket during 3WHS process
+	 * 3) no valid child socket during 3WHS process
+	 */
 	SKB_DROP_REASON_NO_SOCKET,
 	/** @SKB_DROP_REASON_PKT_TOO_SMALL: packet size is too small */
 	SKB_DROP_REASON_PKT_TOO_SMALL,
@@ -271,6 +278,8 @@ enum skb_drop_reason {
 	SKB_DROP_REASON_NEIGH_DEAD,
 	/** @SKB_DROP_REASON_TC_EGRESS: dropped in TC egress HOOK */
 	SKB_DROP_REASON_TC_EGRESS,
+	/** @SKB_DROP_REASON_SECURITY_HOOK: dropped due to security HOOK */
+	SKB_DROP_REASON_SECURITY_HOOK,
 	/**
 	 * @SKB_DROP_REASON_QDISC_DROP: dropped by qdisc when packet outputting (
 	 * failed to enqueue to current qdisc)
-- 
cgit v1.2.3


From 3d359faba191c95e97ac6699b0163e797812bfca Mon Sep 17 00:00:00 2001
From: Jason Xing <kernelxing@tencent.com>
Date: Mon, 26 Feb 2024 11:22:23 +0800
Subject: tcp: introduce dropreasons in receive path

Soon later patches can use these relatively more accurate
reasons to recognise and find out the cause.

Signed-off-by: Jason Xing <kernelxing@tencent.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dropreason-core.h | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h
index a871f061558d..9707ab54fdd5 100644
--- a/include/net/dropreason-core.h
+++ b/include/net/dropreason-core.h
@@ -30,6 +30,7 @@
 	FN(TCP_AOFAILURE)		\
 	FN(SOCKET_BACKLOG)		\
 	FN(TCP_FLAGS)			\
+	FN(TCP_ABORT_ON_DATA)		\
 	FN(TCP_ZEROWINDOW)		\
 	FN(TCP_OLD_DATA)		\
 	FN(TCP_OVERWINDOW)		\
@@ -37,6 +38,7 @@
 	FN(TCP_RFC7323_PAWS)		\
 	FN(TCP_OLD_SEQUENCE)		\
 	FN(TCP_INVALID_SEQUENCE)	\
+	FN(TCP_INVALID_ACK_SEQUENCE)	\
 	FN(TCP_RESET)			\
 	FN(TCP_INVALID_SYN)		\
 	FN(TCP_CLOSE)			\
@@ -204,6 +206,11 @@ enum skb_drop_reason {
 	SKB_DROP_REASON_SOCKET_BACKLOG,
 	/** @SKB_DROP_REASON_TCP_FLAGS: TCP flags invalid */
 	SKB_DROP_REASON_TCP_FLAGS,
+	/**
+	 * @SKB_DROP_REASON_TCP_ABORT_ON_DATA: abort on data, corresponding to
+	 * LINUX_MIB_TCPABORTONDATA
+	 */
+	SKB_DROP_REASON_TCP_ABORT_ON_DATA,
 	/**
 	 * @SKB_DROP_REASON_TCP_ZEROWINDOW: TCP receive window size is zero,
 	 * see LINUX_MIB_TCPZEROWINDOWDROP
@@ -228,13 +235,19 @@ enum skb_drop_reason {
 	SKB_DROP_REASON_TCP_OFOMERGE,
 	/**
 	 * @SKB_DROP_REASON_TCP_RFC7323_PAWS: PAWS check, corresponding to
-	 * LINUX_MIB_PAWSESTABREJECTED
+	 * LINUX_MIB_PAWSESTABREJECTED, LINUX_MIB_PAWSACTIVEREJECTED
 	 */
 	SKB_DROP_REASON_TCP_RFC7323_PAWS,
 	/** @SKB_DROP_REASON_TCP_OLD_SEQUENCE: Old SEQ field (duplicate packet) */
 	SKB_DROP_REASON_TCP_OLD_SEQUENCE,
 	/** @SKB_DROP_REASON_TCP_INVALID_SEQUENCE: Not acceptable SEQ field */
 	SKB_DROP_REASON_TCP_INVALID_SEQUENCE,
+	/**
+	 * @SKB_DROP_REASON_TCP_INVALID_ACK_SEQUENCE: Not acceptable ACK SEQ
+	 * field because ack sequence is not in the window between snd_una
+	 * and snd_nxt
+	 */
+	SKB_DROP_REASON_TCP_INVALID_ACK_SEQUENCE,
 	/** @SKB_DROP_REASON_TCP_RESET: Invalid RST packet */
 	SKB_DROP_REASON_TCP_RESET,
 	/**
-- 
cgit v1.2.3


From 7d6ed9afde8547723f6f96f81f984cc6c48eef52 Mon Sep 17 00:00:00 2001
From: Jason Xing <kernelxing@tencent.com>
Date: Mon, 26 Feb 2024 11:22:25 +0800
Subject: tcp: add dropreasons in tcp_rcv_state_process()

In this patch, I equipped this function with more dropreasons, but
it still doesn't work yet, which I will do later.

Signed-off-by: Jason Xing <kernelxing@tencent.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h    |  2 +-
 net/ipv4/tcp_input.c | 19 ++++++++++++-------
 2 files changed, 13 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 33bf92dff0af..af2a4dcd4518 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -348,7 +348,7 @@ void tcp_wfree(struct sk_buff *skb);
 void tcp_write_timer_handler(struct sock *sk);
 void tcp_delack_timer_handler(struct sock *sk);
 int tcp_ioctl(struct sock *sk, int cmd, int *karg);
-int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb);
+enum skb_drop_reason tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb);
 void tcp_rcv_established(struct sock *sk, struct sk_buff *skb);
 void tcp_rcv_space_adjust(struct sock *sk);
 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 83308cca1610..5d874817a78d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6619,7 +6619,8 @@ static void tcp_rcv_synrecv_state_fastopen(struct sock *sk)
  *	address independent.
  */
 
-int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
+enum skb_drop_reason
+tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
@@ -6635,7 +6636,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 
 	case TCP_LISTEN:
 		if (th->ack)
-			return 1;
+			return SKB_DROP_REASON_TCP_FLAGS;
 
 		if (th->rst) {
 			SKB_DR_SET(reason, TCP_RESET);
@@ -6704,8 +6705,12 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 				  FLAG_NO_CHALLENGE_ACK);
 
 	if ((int)reason <= 0) {
-		if (sk->sk_state == TCP_SYN_RECV)
-			return 1;	/* send one RST */
+		if (sk->sk_state == TCP_SYN_RECV) {
+			/* send one RST */
+			if (!reason)
+				return SKB_DROP_REASON_TCP_OLD_ACK;
+			return -reason;
+		}
 		/* accept old ack during closing */
 		if ((int)reason < 0) {
 			tcp_send_challenge_ack(sk);
@@ -6781,7 +6786,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 		if (READ_ONCE(tp->linger2) < 0) {
 			tcp_done(sk);
 			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
-			return 1;
+			return SKB_DROP_REASON_TCP_ABORT_ON_DATA;
 		}
 		if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
 		    after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
@@ -6790,7 +6795,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 				tcp_fastopen_active_disable(sk);
 			tcp_done(sk);
 			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
-			return 1;
+			return SKB_DROP_REASON_TCP_ABORT_ON_DATA;
 		}
 
 		tmo = tcp_fin_time(sk);
@@ -6855,7 +6860,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 			    after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
 				NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
 				tcp_reset(sk, skb);
-				return 1;
+				return SKB_DROP_REASON_TCP_ABORT_ON_DATA;
 			}
 		}
 		fallthrough;
-- 
cgit v1.2.3


From b9825695930546af725b1e686b8eaf4c71201728 Mon Sep 17 00:00:00 2001
From: Jason Xing <kernelxing@tencent.com>
Date: Mon, 26 Feb 2024 11:22:26 +0800
Subject: tcp: make the dropreason really work when calling
 tcp_rcv_state_process()

Update three callers including both ipv4 and ipv6 and let the dropreason
mechanism work in reality.

Signed-off-by: Jason Xing <kernelxing@tencent.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h        |  4 ++--
 net/ipv4/tcp_ipv4.c      |  3 ++-
 net/ipv4/tcp_minisocks.c | 10 +++++-----
 net/ipv6/tcp_ipv6.c      |  3 ++-
 4 files changed, 11 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index af2a4dcd4518..6ae35199d3b3 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -396,8 +396,8 @@ enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw,
 struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 			   struct request_sock *req, bool fastopen,
 			   bool *lost_race);
-int tcp_child_process(struct sock *parent, struct sock *child,
-		      struct sk_buff *skb);
+enum skb_drop_reason tcp_child_process(struct sock *parent, struct sock *child,
+				       struct sk_buff *skb);
 void tcp_enter_loss(struct sock *sk);
 void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost, int flag);
 void tcp_clear_retrans(struct tcp_sock *tp);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 0a944e109088..c79e25549972 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1926,7 +1926,8 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 	} else
 		sock_rps_save_rxhash(sk, skb);
 
-	if (tcp_rcv_state_process(sk, skb)) {
+	reason = tcp_rcv_state_process(sk, skb);
+	if (reason) {
 		rsk = sk;
 		goto reset;
 	}
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 9e85f2a0bddd..52040b0e2616 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -911,11 +911,11 @@ EXPORT_SYMBOL(tcp_check_req);
  * be created.
  */
 
-int tcp_child_process(struct sock *parent, struct sock *child,
-		      struct sk_buff *skb)
+enum skb_drop_reason tcp_child_process(struct sock *parent, struct sock *child,
+				       struct sk_buff *skb)
 	__releases(&((child)->sk_lock.slock))
 {
-	int ret = 0;
+	enum skb_drop_reason reason = SKB_NOT_DROPPED_YET;
 	int state = child->sk_state;
 
 	/* record sk_napi_id and sk_rx_queue_mapping of child. */
@@ -923,7 +923,7 @@ int tcp_child_process(struct sock *parent, struct sock *child,
 
 	tcp_segs_in(tcp_sk(child), skb);
 	if (!sock_owned_by_user(child)) {
-		ret = tcp_rcv_state_process(child, skb);
+		reason = tcp_rcv_state_process(child, skb);
 		/* Wakeup parent, send SIGIO */
 		if (state == TCP_SYN_RECV && child->sk_state != state)
 			parent->sk_data_ready(parent);
@@ -937,6 +937,6 @@ int tcp_child_process(struct sock *parent, struct sock *child,
 
 	bh_unlock_sock(child);
 	sock_put(child);
-	return ret;
+	return reason;
 }
 EXPORT_SYMBOL(tcp_child_process);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 0c180bb8187f..4f8464e04b7f 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1663,7 +1663,8 @@ int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
 	} else
 		sock_rps_save_rxhash(sk, skb);
 
-	if (tcp_rcv_state_process(sk, skb))
+	reason = tcp_rcv_state_process(sk, skb);
+	if (reason)
 		goto reset;
 	if (opt_skb)
 		goto ipv6_pktoptions;
-- 
cgit v1.2.3


From 5fa918a33563e050d45fb8f260f9240b2039f432 Mon Sep 17 00:00:00 2001
From: Justin Iurman <justin.iurman@uliege.be>
Date: Mon, 26 Feb 2024 14:14:10 +0100
Subject: uapi: ioam6: API for netlink multicast events

Add new api to support ioam6 events for generic netlink multicast. A
first "trace" event is added to the list of ioam6 events, which will
represent an IOAM Pre-allocated Trace Option-Type. It provides another
solution to share IOAM data with user space.

Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: Justin Iurman <justin.iurman@uliege.be>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ioam6_genl.h | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/ioam6_genl.h b/include/uapi/linux/ioam6_genl.h
index ca4b22833754..1733fbc51fb5 100644
--- a/include/uapi/linux/ioam6_genl.h
+++ b/include/uapi/linux/ioam6_genl.h
@@ -49,4 +49,24 @@ enum {
 
 #define IOAM6_CMD_MAX (__IOAM6_CMD_MAX - 1)
 
+#define IOAM6_GENL_EV_GRP_NAME "ioam6_events"
+
+enum ioam6_event_type {
+	IOAM6_EVENT_UNSPEC,
+	IOAM6_EVENT_TRACE,
+};
+
+enum ioam6_event_attr {
+	IOAM6_EVENT_ATTR_UNSPEC,
+
+	IOAM6_EVENT_ATTR_TRACE_NAMESPACE,	/* u16 */
+	IOAM6_EVENT_ATTR_TRACE_NODELEN,		/* u8 */
+	IOAM6_EVENT_ATTR_TRACE_TYPE,		/* u32 */
+	IOAM6_EVENT_ATTR_TRACE_DATA,		/* Binary */
+
+	__IOAM6_EVENT_ATTR_MAX
+};
+
+#define IOAM6_EVENT_ATTR_MAX (__IOAM6_EVENT_ATTR_MAX - 1)
+
 #endif /* _UAPI_LINUX_IOAM6_GENL_H */
-- 
cgit v1.2.3


From 67c8e4bb4f54ad22ca0ce4700c8728ffd73acb66 Mon Sep 17 00:00:00 2001
From: Justin Iurman <justin.iurman@uliege.be>
Date: Mon, 26 Feb 2024 14:14:11 +0100
Subject: net: ioam6: multicast event

Add a multicast group to the ioam6 generic netlink family and provide
ioam6_event() to send an ioam6 event to the multicast group.

Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: Justin Iurman <justin.iurman@uliege.be>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ioam6.h |  4 ++++
 net/ipv6/ioam6.c    | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 68 insertions(+)

(limited to 'include')

diff --git a/include/net/ioam6.h b/include/net/ioam6.h
index 781d2d8b2f29..2cbbee6e806a 100644
--- a/include/net/ioam6.h
+++ b/include/net/ioam6.h
@@ -12,6 +12,7 @@
 #include <linux/net.h>
 #include <linux/ipv6.h>
 #include <linux/ioam6.h>
+#include <linux/ioam6_genl.h>
 #include <linux/rhashtable-types.h>
 
 struct ioam6_namespace {
@@ -65,4 +66,7 @@ void ioam6_exit(void);
 int ioam6_iptunnel_init(void);
 void ioam6_iptunnel_exit(void);
 
+void ioam6_event(enum ioam6_event_type type, struct net *net, gfp_t gfp,
+		 void *opt, unsigned int opt_len);
+
 #endif /* _NET_IOAM6_H */
diff --git a/net/ipv6/ioam6.c b/net/ipv6/ioam6.c
index 571f0e4d9cf3..5fa923f06632 100644
--- a/net/ipv6/ioam6.c
+++ b/net/ipv6/ioam6.c
@@ -612,6 +612,68 @@ static const struct genl_ops ioam6_genl_ops[] = {
 	},
 };
 
+#define IOAM6_GENL_EV_GRP_OFFSET 0
+
+static const struct genl_multicast_group ioam6_mcgrps[] = {
+	[IOAM6_GENL_EV_GRP_OFFSET] = { .name = IOAM6_GENL_EV_GRP_NAME,
+				       .flags = GENL_MCAST_CAP_NET_ADMIN },
+};
+
+static int ioam6_event_put_trace(struct sk_buff *skb,
+				 struct ioam6_trace_hdr *trace,
+				 unsigned int len)
+{
+	if (nla_put_u16(skb, IOAM6_EVENT_ATTR_TRACE_NAMESPACE,
+			be16_to_cpu(trace->namespace_id)) ||
+	    nla_put_u8(skb, IOAM6_EVENT_ATTR_TRACE_NODELEN, trace->nodelen) ||
+	    nla_put_u32(skb, IOAM6_EVENT_ATTR_TRACE_TYPE,
+			be32_to_cpu(trace->type_be32)) ||
+	    nla_put(skb, IOAM6_EVENT_ATTR_TRACE_DATA,
+		    len - sizeof(struct ioam6_trace_hdr) - trace->remlen * 4,
+		    trace->data + trace->remlen * 4))
+		return 1;
+
+	return 0;
+}
+
+void ioam6_event(enum ioam6_event_type type, struct net *net, gfp_t gfp,
+		 void *opt, unsigned int opt_len)
+{
+	struct nlmsghdr *nlh;
+	struct sk_buff *skb;
+
+	if (!genl_has_listeners(&ioam6_genl_family, net,
+				IOAM6_GENL_EV_GRP_OFFSET))
+		return;
+
+	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+	if (!skb)
+		return;
+
+	nlh = genlmsg_put(skb, 0, 0, &ioam6_genl_family, 0, type);
+	if (!nlh)
+		goto nla_put_failure;
+
+	switch (type) {
+	case IOAM6_EVENT_UNSPEC:
+		WARN_ON_ONCE(1);
+		break;
+	case IOAM6_EVENT_TRACE:
+		if (ioam6_event_put_trace(skb, (struct ioam6_trace_hdr *)opt,
+					  opt_len))
+			goto nla_put_failure;
+		break;
+	}
+
+	genlmsg_end(skb, nlh);
+	genlmsg_multicast_netns(&ioam6_genl_family, net, skb, 0,
+				IOAM6_GENL_EV_GRP_OFFSET, gfp);
+	return;
+
+nla_put_failure:
+	nlmsg_free(skb);
+}
+
 static struct genl_family ioam6_genl_family __ro_after_init = {
 	.name		= IOAM6_GENL_NAME,
 	.version	= IOAM6_GENL_VERSION,
@@ -620,6 +682,8 @@ static struct genl_family ioam6_genl_family __ro_after_init = {
 	.ops		= ioam6_genl_ops,
 	.n_ops		= ARRAY_SIZE(ioam6_genl_ops),
 	.resv_start_op	= IOAM6_CMD_NS_SET_SCHEMA + 1,
+	.mcgrps		= ioam6_mcgrps,
+	.n_mcgrps	= ARRAY_SIZE(ioam6_mcgrps),
 	.module		= THIS_MODULE,
 };
 
-- 
cgit v1.2.3


From 416eb60317c64676d158dffea150762930ec008f Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 23 Feb 2024 10:01:46 +0100
Subject: bitfield: suppress "dubious: x & !y" sparse warning

There's a somewhat common pattern of using FIELD_PREP()
even for single bits, e.g.

 cmd->info1 |= FIELD_PREP(HTT_SRNG_SETUP_CMD_INFO1_RING_FLAGS_MSI_SWAP,
                          !!(params.flags & HAL_SRNG_FLAGS_MSI_SWAP));

which might as well be written as

 if (params.flags & HAL_SRNG_FLAGS_MSI_SWAP)
   cmd->info1 |= HTT_SRNG_SETUP_CMD_INFO1_RING_FLAGS_MSI_SWAP;

(since info1 is fully initialized to start with), but in
a long chain of FIELD_PREP() this really seems fine.

However, it triggers a sparse warning, in the check in
the macro for whether a constant value fits into the mask,
as this contains a "& (_val)". In this case, this really
is always intentional, so just suppress the warning by
adding "0+" to the expression, indicating explicitly that
this is correct.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Kalle Valo <kvalo@kernel.org>
Link: https://msgid.link/20240223100146.d243b6b1a9a1.I033828b1187c6bccf086e31400f7e933bb8373e7@changeid
---
 include/linux/bitfield.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/bitfield.h b/include/linux/bitfield.h
index ebfa12f69501..63928f173223 100644
--- a/include/linux/bitfield.h
+++ b/include/linux/bitfield.h
@@ -66,7 +66,8 @@
 				 _pfx "mask is not constant");		\
 		BUILD_BUG_ON_MSG((_mask) == 0, _pfx "mask is zero");	\
 		BUILD_BUG_ON_MSG(__builtin_constant_p(_val) ?		\
-				 ~((_mask) >> __bf_shf(_mask)) & (_val) : 0, \
+				 ~((_mask) >> __bf_shf(_mask)) &	\
+					(0 + (_val)) : 0,		\
 				 _pfx "value too large for the field"); \
 		BUILD_BUG_ON_MSG(__bf_cast_unsigned(_mask, _mask) >	\
 				 __bf_cast_unsigned(_reg, ~0ull),	\
-- 
cgit v1.2.3


From 292fac464b012200c4e99d08974fed3bc087b848 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Mon, 26 Feb 2024 19:29:15 -0600
Subject: net: ethtool: eee: Remove legacy _u32 from keee

All MAC drivers have been converted to use the link mode members of
keee. So remove the _u32 values, and the code in the ethtool core to
convert the legacy _u32 values to link modes.

Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h |  3 ---
 net/ethtool/eee.c       | 31 ++++---------------------------
 net/ethtool/ioctl.c     | 29 ++++++++++-------------------
 3 files changed, 14 insertions(+), 49 deletions(-)

(limited to 'include')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index b90c33607594..9901e563f706 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -226,9 +226,6 @@ struct ethtool_keee {
 	__ETHTOOL_DECLARE_LINK_MODE_MASK(supported);
 	__ETHTOOL_DECLARE_LINK_MODE_MASK(advertised);
 	__ETHTOOL_DECLARE_LINK_MODE_MASK(lp_advertised);
-	u32	supported_u32;
-	u32	advertised_u32;
-	u32	lp_advertised_u32;
 	u32	tx_lpi_timer;
 	bool	tx_lpi_enabled;
 	bool	eee_active;
diff --git a/net/ethtool/eee.c b/net/ethtool/eee.c
index db6faa18fe41..bf398973eb8a 100644
--- a/net/ethtool/eee.c
+++ b/net/ethtool/eee.c
@@ -4,9 +4,6 @@
 #include "common.h"
 #include "bitset.h"
 
-#define EEE_MODES_COUNT \
-	(sizeof_field(struct ethtool_keee, supported_u32) * BITS_PER_BYTE)
-
 struct eee_req_info {
 	struct ethnl_req_info		base;
 };
@@ -41,15 +38,6 @@ static int eee_prepare_data(const struct ethnl_req_info *req_base,
 	ret = dev->ethtool_ops->get_eee(dev, eee);
 	ethnl_ops_complete(dev);
 
-	if (!ret && !ethtool_eee_use_linkmodes(eee)) {
-		ethtool_convert_legacy_u32_to_link_mode(eee->supported,
-							eee->supported_u32);
-		ethtool_convert_legacy_u32_to_link_mode(eee->advertised,
-							eee->advertised_u32);
-		ethtool_convert_legacy_u32_to_link_mode(eee->lp_advertised,
-							eee->lp_advertised_u32);
-	}
-
 	return ret;
 }
 
@@ -62,11 +50,6 @@ static int eee_reply_size(const struct ethnl_req_info *req_base,
 	int len = 0;
 	int ret;
 
-	BUILD_BUG_ON(sizeof(eee->advertised_u32) * BITS_PER_BYTE !=
-		     EEE_MODES_COUNT);
-	BUILD_BUG_ON(sizeof(eee->lp_advertised_u32) * BITS_PER_BYTE !=
-		     EEE_MODES_COUNT);
-
 	/* MODES_OURS */
 	ret = ethnl_bitset_size(eee->advertised, eee->supported,
 				__ETHTOOL_LINK_MODE_MASK_NBITS,
@@ -154,16 +137,10 @@ ethnl_set_eee(struct ethnl_req_info *req_info, struct genl_info *info)
 	if (ret < 0)
 		return ret;
 
-	if (ethtool_eee_use_linkmodes(&eee)) {
-		ret = ethnl_update_bitset(eee.advertised,
-					  __ETHTOOL_LINK_MODE_MASK_NBITS,
-					  tb[ETHTOOL_A_EEE_MODES_OURS],
-					  link_mode_names, info->extack, &mod);
-	} else {
-		ret = ethnl_update_bitset32(&eee.advertised_u32, EEE_MODES_COUNT,
-					    tb[ETHTOOL_A_EEE_MODES_OURS],
-					    link_mode_names, info->extack, &mod);
-	}
+	ret = ethnl_update_bitset(eee.advertised,
+				  __ETHTOOL_LINK_MODE_MASK_NBITS,
+				  tb[ETHTOOL_A_EEE_MODES_OURS],
+				  link_mode_names, info->extack, &mod);
 	if (ret < 0)
 		return ret;
 	ethnl_update_bool(&eee.eee_enabled, tb[ETHTOOL_A_EEE_ENABLED], &mod);
diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c
index b419969c0dcb..317308bdbda9 100644
--- a/net/ethtool/ioctl.c
+++ b/net/ethtool/ioctl.c
@@ -1514,9 +1514,6 @@ static void eee_to_keee(struct ethtool_keee *keee,
 {
 	memset(keee, 0, sizeof(*keee));
 
-	keee->supported_u32 = eee->supported;
-	keee->advertised_u32 = eee->advertised;
-	keee->lp_advertised_u32 = eee->lp_advertised;
 	keee->eee_active = eee->eee_active;
 	keee->eee_enabled = eee->eee_enabled;
 	keee->tx_lpi_enabled = eee->tx_lpi_enabled;
@@ -1533,6 +1530,8 @@ static void eee_to_keee(struct ethtool_keee *keee,
 static void keee_to_eee(struct ethtool_eee *eee,
 			const struct ethtool_keee *keee)
 {
+	bool overflow;
+
 	memset(eee, 0, sizeof(*eee));
 
 	eee->eee_active = keee->eee_active;
@@ -1540,22 +1539,14 @@ static void keee_to_eee(struct ethtool_eee *eee,
 	eee->tx_lpi_enabled = keee->tx_lpi_enabled;
 	eee->tx_lpi_timer = keee->tx_lpi_timer;
 
-	if (ethtool_eee_use_linkmodes(keee)) {
-		bool overflow;
-
-		overflow = !ethtool_convert_link_mode_to_legacy_u32(&eee->supported,
-								    keee->supported);
-		ethtool_convert_link_mode_to_legacy_u32(&eee->advertised,
-							keee->advertised);
-		ethtool_convert_link_mode_to_legacy_u32(&eee->lp_advertised,
-							keee->lp_advertised);
-		if (overflow)
-			pr_warn("Ethtool ioctl interface doesn't support passing EEE linkmodes beyond bit 32\n");
-	} else {
-		eee->supported = keee->supported_u32;
-		eee->advertised = keee->advertised_u32;
-		eee->lp_advertised = keee->lp_advertised_u32;
-	}
+	overflow = !ethtool_convert_link_mode_to_legacy_u32(&eee->supported,
+							    keee->supported);
+	ethtool_convert_link_mode_to_legacy_u32(&eee->advertised,
+						keee->advertised);
+	ethtool_convert_link_mode_to_legacy_u32(&eee->lp_advertised,
+						keee->lp_advertised);
+	if (overflow)
+		pr_warn("Ethtool ioctl interface doesn't support passing EEE linkmodes beyond bit 32\n");
 }
 
 static int ethtool_get_eee(struct net_device *dev, char __user *useraddr)
-- 
cgit v1.2.3


From 0598f8f3bb77893a13105d47bb7dfe42f1dc1f4e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 27 Feb 2024 09:24:09 +0000
Subject: inet: annotate devconf data-races

Add READ_ONCE() in ipv4_devconf_get() and corresponding
WRITE_ONCE() in ipv4_devconf_set()

Add IPV4_DEVCONF_RO() and IPV4_DEVCONF_ALL_RO() macros,
and use them when reading devconf fields.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Link: https://lore.kernel.org/r/20240227092411.2315725-2-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/inetdevice.h | 14 ++++++++------
 net/ipv4/devinet.c         | 21 +++++++++++----------
 net/ipv4/igmp.c            |  4 ++--
 net/ipv4/proc.c            |  2 +-
 net/ipv4/route.c           |  4 ++--
 5 files changed, 24 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
index ddb27fc0ee8c..cb5280e6cc21 100644
--- a/include/linux/inetdevice.h
+++ b/include/linux/inetdevice.h
@@ -53,13 +53,15 @@ struct in_device {
 };
 
 #define IPV4_DEVCONF(cnf, attr) ((cnf).data[IPV4_DEVCONF_ ## attr - 1])
+#define IPV4_DEVCONF_RO(cnf, attr) READ_ONCE(IPV4_DEVCONF(cnf, attr))
 #define IPV4_DEVCONF_ALL(net, attr) \
 	IPV4_DEVCONF((*(net)->ipv4.devconf_all), attr)
+#define IPV4_DEVCONF_ALL_RO(net, attr) READ_ONCE(IPV4_DEVCONF_ALL(net, attr))
 
-static inline int ipv4_devconf_get(struct in_device *in_dev, int index)
+static inline int ipv4_devconf_get(const struct in_device *in_dev, int index)
 {
 	index--;
-	return in_dev->cnf.data[index];
+	return READ_ONCE(in_dev->cnf.data[index]);
 }
 
 static inline void ipv4_devconf_set(struct in_device *in_dev, int index,
@@ -67,7 +69,7 @@ static inline void ipv4_devconf_set(struct in_device *in_dev, int index,
 {
 	index--;
 	set_bit(index, in_dev->cnf.state);
-	in_dev->cnf.data[index] = val;
+	WRITE_ONCE(in_dev->cnf.data[index], val);
 }
 
 static inline void ipv4_devconf_setall(struct in_device *in_dev)
@@ -81,18 +83,18 @@ static inline void ipv4_devconf_setall(struct in_device *in_dev)
 	ipv4_devconf_set((in_dev), IPV4_DEVCONF_ ## attr, (val))
 
 #define IN_DEV_ANDCONF(in_dev, attr) \
-	(IPV4_DEVCONF_ALL(dev_net(in_dev->dev), attr) && \
+	(IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), attr) && \
 	 IN_DEV_CONF_GET((in_dev), attr))
 
 #define IN_DEV_NET_ORCONF(in_dev, net, attr) \
-	(IPV4_DEVCONF_ALL(net, attr) || \
+	(IPV4_DEVCONF_ALL_RO(net, attr) || \
 	 IN_DEV_CONF_GET((in_dev), attr))
 
 #define IN_DEV_ORCONF(in_dev, attr) \
 	IN_DEV_NET_ORCONF(in_dev, dev_net(in_dev->dev), attr)
 
 #define IN_DEV_MAXCONF(in_dev, attr) \
-	(max(IPV4_DEVCONF_ALL(dev_net(in_dev->dev), attr), \
+	(max(IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), attr), \
 	     IN_DEV_CONF_GET((in_dev), attr)))
 
 #define IN_DEV_FORWARD(in_dev)		IN_DEV_CONF_GET((in_dev), FORWARDING)
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index bc74f131fe4d..ca75d0fff1d1 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1982,7 +1982,7 @@ static int inet_fill_link_af(struct sk_buff *skb, const struct net_device *dev,
 		return -EMSGSIZE;
 
 	for (i = 0; i < IPV4_DEVCONF_MAX; i++)
-		((u32 *) nla_data(nla))[i] = in_dev->cnf.data[i];
+		((u32 *) nla_data(nla))[i] = READ_ONCE(in_dev->cnf.data[i]);
 
 	return 0;
 }
@@ -2068,9 +2068,9 @@ static int inet_netconf_msgsize_devconf(int type)
 }
 
 static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
-				     struct ipv4_devconf *devconf, u32 portid,
-				     u32 seq, int event, unsigned int flags,
-				     int type)
+				     const struct ipv4_devconf *devconf,
+				     u32 portid, u32 seq, int event,
+				     unsigned int flags, int type)
 {
 	struct nlmsghdr  *nlh;
 	struct netconfmsg *ncm;
@@ -2095,27 +2095,28 @@ static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
 
 	if ((all || type == NETCONFA_FORWARDING) &&
 	    nla_put_s32(skb, NETCONFA_FORWARDING,
-			IPV4_DEVCONF(*devconf, FORWARDING)) < 0)
+			IPV4_DEVCONF_RO(*devconf, FORWARDING)) < 0)
 		goto nla_put_failure;
 	if ((all || type == NETCONFA_RP_FILTER) &&
 	    nla_put_s32(skb, NETCONFA_RP_FILTER,
-			IPV4_DEVCONF(*devconf, RP_FILTER)) < 0)
+			IPV4_DEVCONF_RO(*devconf, RP_FILTER)) < 0)
 		goto nla_put_failure;
 	if ((all || type == NETCONFA_MC_FORWARDING) &&
 	    nla_put_s32(skb, NETCONFA_MC_FORWARDING,
-			IPV4_DEVCONF(*devconf, MC_FORWARDING)) < 0)
+			IPV4_DEVCONF_RO(*devconf, MC_FORWARDING)) < 0)
 		goto nla_put_failure;
 	if ((all || type == NETCONFA_BC_FORWARDING) &&
 	    nla_put_s32(skb, NETCONFA_BC_FORWARDING,
-			IPV4_DEVCONF(*devconf, BC_FORWARDING)) < 0)
+			IPV4_DEVCONF_RO(*devconf, BC_FORWARDING)) < 0)
 		goto nla_put_failure;
 	if ((all || type == NETCONFA_PROXY_NEIGH) &&
 	    nla_put_s32(skb, NETCONFA_PROXY_NEIGH,
-			IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0)
+			IPV4_DEVCONF_RO(*devconf, PROXY_ARP)) < 0)
 		goto nla_put_failure;
 	if ((all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) &&
 	    nla_put_s32(skb, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
-			IPV4_DEVCONF(*devconf, IGNORE_ROUTES_WITH_LINKDOWN)) < 0)
+			IPV4_DEVCONF_RO(*devconf,
+					IGNORE_ROUTES_WITH_LINKDOWN)) < 0)
 		goto nla_put_failure;
 
 out:
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index efeeca2b1328..717e97a389a8 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -120,12 +120,12 @@
  */
 
 #define IGMP_V1_SEEN(in_dev) \
-	(IPV4_DEVCONF_ALL(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 1 || \
+	(IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 1 || \
 	 IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 1 || \
 	 ((in_dev)->mr_v1_seen && \
 	  time_before(jiffies, (in_dev)->mr_v1_seen)))
 #define IGMP_V2_SEEN(in_dev) \
-	(IPV4_DEVCONF_ALL(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 2 || \
+	(IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 2 || \
 	 IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 2 || \
 	 ((in_dev)->mr_v2_seen && \
 	  time_before(jiffies, (in_dev)->mr_v2_seen)))
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 5f4654ebff48..914bc9c35cc7 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -395,7 +395,7 @@ static int snmp_seq_show_ipstats(struct seq_file *seq, void *v)
 		seq_printf(seq, " %s", snmp4_ipstats_list[i].name);
 
 	seq_printf(seq, "\nIp: %d %d",
-		   IPV4_DEVCONF_ALL(net, FORWARDING) ? 1 : 2,
+		   IPV4_DEVCONF_ALL_RO(net, FORWARDING) ? 1 : 2,
 		   READ_ONCE(net->ipv4.sysctl_ip_default_ttl));
 
 	BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index b512288d6fcc..c8f76f56dc16 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2313,7 +2313,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 		if (IN_DEV_BFORWARD(in_dev))
 			goto make_route;
 		/* not do cache if bc_forwarding is enabled */
-		if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
+		if (IPV4_DEVCONF_ALL_RO(net, BC_FORWARDING))
 			do_cache = false;
 		goto brd_input;
 	}
@@ -2993,7 +2993,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
 #ifdef CONFIG_IP_MROUTE
 			if (ipv4_is_multicast(dst) &&
 			    !ipv4_is_local_multicast(dst) &&
-			    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
+			    IPV4_DEVCONF_ALL_RO(net, MC_FORWARDING)) {
 				int err = ipmr_get_route(net, skb,
 							 fl4->saddr, fl4->daddr,
 							 r, portid);
-- 
cgit v1.2.3


From 99123622050f10ca9148a0fffba2de0afd6cdfff Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 27 Feb 2024 19:27:21 +0000
Subject: tcp: remove some holes in struct tcp_sock

By moving some fields around, this patch shrinks
holes size from 56 to 32, saving 24 bytes on 64bit arches.

After the patch pahole gives the following for 'struct tcp_sock':

	/* size: 2304, cachelines: 36, members: 162 */
	/* sum members: 2234, holes: 6, sum holes: 32 */
	/* sum bitfield members: 34 bits, bit holes: 5, sum bit holes: 14 bits */
	/* padding: 32 */
	/* paddings: 3, sum paddings: 10 */
	/* forced alignments: 1, forced holes: 1, sum forced holes: 12 */

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Link: https://lore.kernel.org/r/20240227192721.3558982-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/tcp.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index a1c47a6d69b0..988a30ef6bfe 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -264,10 +264,10 @@ struct tcp_sock {
 	u32	pushed_seq;	/* Last pushed seq, required to talk to windows */
 	u32	lsndtime;
 	u32	mdev_us;	/* medium deviation			*/
+	u32	rtt_seq;	/* sequence number to update rttvar	*/
 	u64	tcp_wstamp_ns;	/* departure time for next sent data packet */
 	u64	tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */
 	u64	tcp_mstamp;	/* most recent packet received/sent */
-	u32	rtt_seq;	/* sequence number to update rttvar	*/
 	struct list_head tsorted_sent_queue; /* time-sorted sent but un-SACKed skbs */
 	struct sk_buff *highest_sack;   /* skb just after the highest
 					 * skb with SACKed bit set
@@ -350,7 +350,6 @@ struct tcp_sock {
 	u32	dsack_dups;	/* RFC4898 tcpEStatsStackDSACKDups
 				 * total number of DSACK blocks received
 				 */
-	u32	last_oow_ack_time;  /* timestamp of last out-of-window ACK */
 	u32	compressed_ack_rcv_nxt;
 	struct list_head tsq_node; /* anchor in tsq_tasklet.head list */
 
@@ -384,12 +383,12 @@ struct tcp_sock {
 		syn_fastopen_ch:1, /* Active TFO re-enabling probe */
 		syn_data_acked:1;/* data in SYN is acked by SYN-ACK */
 
+	u8	keepalive_probes; /* num of allowed keep alive probes	*/
 	u32	tcp_tx_delay;	/* delay (in usec) added to TX packets */
 
 /* RTT measurement */
 	u32	mdev_max_us;	/* maximal mdev for the last rtt period	*/
 
-	u8	keepalive_probes; /* num of allowed keep alive probes	*/
 	u32	reord_seen;	/* number of data packet reordering events */
 
 /*
@@ -402,6 +401,7 @@ struct tcp_sock {
 	u32	prior_cwnd;	/* cwnd right before starting loss recovery */
 	u32	prr_delivered;	/* Number of newly delivered packets to
 				 * receiver in Recovery. */
+	u32	last_oow_ack_time;  /* timestamp of last out-of-window ACK */
 
 	struct hrtimer	pacing_timer;
 	struct hrtimer	compressed_ack_timer;
@@ -477,8 +477,8 @@ struct tcp_sock {
 	bool	is_mptcp;
 #endif
 #if IS_ENABLED(CONFIG_SMC)
-	bool	(*smc_hs_congested)(const struct sock *sk);
 	bool	syn_smc;	/* SYN includes SMC */
+	bool	(*smc_hs_congested)(const struct sock *sk);
 #endif
 
 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
-- 
cgit v1.2.3


From ba132d841d5637655273c28d30751c8d18ac189b Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 29 Jan 2024 13:51:30 +0000
Subject: rxrpc: Record the Tx serial in the rxrpc_txbuf and retransmit trace

Each Rx protocol packet contains a per-connection monotonically increasing
serial number used to correlate outgoing messages with their replies -
something that can be used for RTT calculation.

Note this value in the rxrpc_txbuf struct in addition to the wire header
and then log it in the rxrpc_retransmit trace for reference.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Marc Dionne <marc.dionne@auristor.com>
cc: "David S. Miller" <davem@davemloft.net>
cc: Eric Dumazet <edumazet@google.com>
cc: Jakub Kicinski <kuba@kernel.org>
cc: Paolo Abeni <pabeni@redhat.com>
cc: linux-afs@lists.infradead.org
cc: netdev@vger.kernel.org
---
 include/trace/events/rxrpc.h | 10 +++++++---
 net/rxrpc/ar-internal.h      |  1 +
 net/rxrpc/call_event.c       |  6 +++---
 net/rxrpc/output.c           | 36 +++++++++++++++++-------------------
 net/rxrpc/txbuf.c            |  1 +
 5 files changed, 29 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h
index 87b8de9b6c1c..9add56980485 100644
--- a/include/trace/events/rxrpc.h
+++ b/include/trace/events/rxrpc.h
@@ -1506,25 +1506,29 @@ TRACE_EVENT(rxrpc_drop_ack,
 	    );
 
 TRACE_EVENT(rxrpc_retransmit,
-	    TP_PROTO(struct rxrpc_call *call, rxrpc_seq_t seq, s64 expiry),
+	    TP_PROTO(struct rxrpc_call *call, rxrpc_seq_t seq,
+		     rxrpc_serial_t serial, s64 expiry),
 
-	    TP_ARGS(call, seq, expiry),
+	    TP_ARGS(call, seq, serial, expiry),
 
 	    TP_STRUCT__entry(
 		    __field(unsigned int,	call)
 		    __field(rxrpc_seq_t,	seq)
+		    __field(rxrpc_serial_t,	serial)
 		    __field(s64,		expiry)
 			     ),
 
 	    TP_fast_assign(
 		    __entry->call = call->debug_id;
 		    __entry->seq = seq;
+		    __entry->serial = serial;
 		    __entry->expiry = expiry;
 			   ),
 
-	    TP_printk("c=%08x q=%x xp=%lld",
+	    TP_printk("c=%08x q=%x r=%x xp=%lld",
 		      __entry->call,
 		      __entry->seq,
+		      __entry->serial,
 		      __entry->expiry)
 	    );
 
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 7818aae1be8e..f76125755810 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -794,6 +794,7 @@ struct rxrpc_txbuf {
 	ktime_t			last_sent;	/* Time at which last transmitted */
 	refcount_t		ref;
 	rxrpc_seq_t		seq;		/* Sequence number of this packet */
+	rxrpc_serial_t		serial;		/* Last serial number transmitted with */
 	unsigned int		call_debug_id;
 	unsigned int		debug_id;
 	unsigned int		len;		/* Amount of data in buffer */
diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c
index 0f78544d043b..a4c309976719 100644
--- a/net/rxrpc/call_event.c
+++ b/net/rxrpc/call_event.c
@@ -160,7 +160,7 @@ void rxrpc_resend(struct rxrpc_call *call, struct sk_buff *ack_skb)
 			goto no_further_resend;
 
 		found_txb:
-			if (after(ntohl(txb->wire.serial), call->acks_highest_serial))
+			if (after(txb->serial, call->acks_highest_serial))
 				continue; /* Ack point not yet reached */
 
 			rxrpc_see_txbuf(txb, rxrpc_txbuf_see_unacked);
@@ -170,7 +170,7 @@ void rxrpc_resend(struct rxrpc_call *call, struct sk_buff *ack_skb)
 				set_bit(RXRPC_TXBUF_RESENT, &txb->flags);
 			}
 
-			trace_rxrpc_retransmit(call, txb->seq,
+			trace_rxrpc_retransmit(call, txb->seq, txb->serial,
 					       ktime_to_ns(ktime_sub(txb->last_sent,
 								     max_age)));
 
@@ -197,7 +197,7 @@ void rxrpc_resend(struct rxrpc_call *call, struct sk_buff *ack_skb)
 			break; /* Not transmitted yet */
 
 		if (ack && ack->reason == RXRPC_ACK_PING_RESPONSE &&
-		    before(ntohl(txb->wire.serial), ntohl(ack->serial)))
+		    before(txb->serial, ntohl(ack->serial)))
 			goto do_resend; /* Wasn't accounted for by a more recent ping. */
 
 		if (ktime_after(txb->last_sent, max_age)) {
diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c
index 4a292f860ae3..bad96a983e12 100644
--- a/net/rxrpc/output.c
+++ b/net/rxrpc/output.c
@@ -189,7 +189,6 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb)
 	struct rxrpc_connection *conn;
 	struct msghdr msg;
 	struct kvec iov[1];
-	rxrpc_serial_t serial;
 	size_t len, n;
 	int ret, rtt_slot = -1;
 	u16 rwind;
@@ -216,15 +215,15 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb)
 	iov[0].iov_len	= sizeof(txb->wire) + sizeof(txb->ack) + n;
 	len = iov[0].iov_len;
 
-	serial = rxrpc_get_next_serial(conn);
-	txb->wire.serial = htonl(serial);
-	trace_rxrpc_tx_ack(call->debug_id, serial,
+	txb->serial = rxrpc_get_next_serial(conn);
+	txb->wire.serial = htonl(txb->serial);
+	trace_rxrpc_tx_ack(call->debug_id, txb->serial,
 			   ntohl(txb->ack.firstPacket),
 			   ntohl(txb->ack.serial), txb->ack.reason, txb->ack.nAcks,
 			   rwind);
 
 	if (txb->ack.reason == RXRPC_ACK_PING)
-		rtt_slot = rxrpc_begin_rtt_probe(call, serial, rxrpc_rtt_tx_ping);
+		rtt_slot = rxrpc_begin_rtt_probe(call, txb->serial, rxrpc_rtt_tx_ping);
 
 	rxrpc_inc_stat(call->rxnet, stat_tx_ack_send);
 
@@ -235,7 +234,7 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb)
 	ret = do_udp_sendmsg(conn->local->socket, &msg, len);
 	call->peer->last_tx_at = ktime_get_seconds();
 	if (ret < 0) {
-		trace_rxrpc_tx_fail(call->debug_id, serial, ret,
+		trace_rxrpc_tx_fail(call->debug_id, txb->serial, ret,
 				    rxrpc_tx_point_call_ack);
 	} else {
 		trace_rxrpc_tx_packet(call->debug_id, &txb->wire,
@@ -247,7 +246,7 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb)
 
 	if (!__rxrpc_call_is_complete(call)) {
 		if (ret < 0)
-			rxrpc_cancel_rtt_probe(call, serial, rtt_slot);
+			rxrpc_cancel_rtt_probe(call, txb->serial, rtt_slot);
 		rxrpc_set_keepalive(call);
 	}
 
@@ -327,15 +326,14 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb)
 	struct rxrpc_connection *conn = call->conn;
 	struct msghdr msg;
 	struct kvec iov[1];
-	rxrpc_serial_t serial;
 	size_t len;
 	int ret, rtt_slot = -1;
 
 	_enter("%x,{%d}", txb->seq, txb->len);
 
-	/* Each transmission of a Tx packet needs a new serial number */
-	serial = rxrpc_get_next_serial(conn);
-	txb->wire.serial = htonl(serial);
+	/* Each transmission of a Tx packet+ needs a new serial number */
+	txb->serial = rxrpc_get_next_serial(conn);
+	txb->wire.serial = htonl(txb->serial);
 
 	if (test_bit(RXRPC_CONN_PROBING_FOR_UPGRADE, &conn->flags) &&
 	    txb->seq == 1)
@@ -388,7 +386,7 @@ dont_set_request_ack:
 		static int lose;
 		if ((lose++ & 7) == 7) {
 			ret = 0;
-			trace_rxrpc_tx_data(call, txb->seq, serial,
+			trace_rxrpc_tx_data(call, txb->seq, txb->serial,
 					    txb->wire.flags,
 					    test_bit(RXRPC_TXBUF_RESENT, &txb->flags),
 					    true);
@@ -396,7 +394,7 @@ dont_set_request_ack:
 		}
 	}
 
-	trace_rxrpc_tx_data(call, txb->seq, serial, txb->wire.flags,
+	trace_rxrpc_tx_data(call, txb->seq, txb->serial, txb->wire.flags,
 			    test_bit(RXRPC_TXBUF_RESENT, &txb->flags), false);
 
 	/* Track what we've attempted to transmit at least once so that the
@@ -415,7 +413,7 @@ dont_set_request_ack:
 
 	txb->last_sent = ktime_get_real();
 	if (txb->wire.flags & RXRPC_REQUEST_ACK)
-		rtt_slot = rxrpc_begin_rtt_probe(call, serial, rxrpc_rtt_tx_data);
+		rtt_slot = rxrpc_begin_rtt_probe(call, txb->serial, rxrpc_rtt_tx_data);
 
 	/* send the packet by UDP
 	 * - returns -EMSGSIZE if UDP would have to fragment the packet
@@ -429,8 +427,8 @@ dont_set_request_ack:
 
 	if (ret < 0) {
 		rxrpc_inc_stat(call->rxnet, stat_tx_data_send_fail);
-		rxrpc_cancel_rtt_probe(call, serial, rtt_slot);
-		trace_rxrpc_tx_fail(call->debug_id, serial, ret,
+		rxrpc_cancel_rtt_probe(call, txb->serial, rtt_slot);
+		trace_rxrpc_tx_fail(call->debug_id, txb->serial, ret,
 				    rxrpc_tx_point_call_data_nofrag);
 	} else {
 		trace_rxrpc_tx_packet(call->debug_id, &txb->wire,
@@ -489,7 +487,7 @@ send_fragmentable:
 
 	txb->last_sent = ktime_get_real();
 	if (txb->wire.flags & RXRPC_REQUEST_ACK)
-		rtt_slot = rxrpc_begin_rtt_probe(call, serial, rxrpc_rtt_tx_data);
+		rtt_slot = rxrpc_begin_rtt_probe(call, txb->serial, rxrpc_rtt_tx_data);
 
 	switch (conn->local->srx.transport.family) {
 	case AF_INET6:
@@ -508,8 +506,8 @@ send_fragmentable:
 
 	if (ret < 0) {
 		rxrpc_inc_stat(call->rxnet, stat_tx_data_send_fail);
-		rxrpc_cancel_rtt_probe(call, serial, rtt_slot);
-		trace_rxrpc_tx_fail(call->debug_id, serial, ret,
+		rxrpc_cancel_rtt_probe(call, txb->serial, rtt_slot);
+		trace_rxrpc_tx_fail(call->debug_id, txb->serial, ret,
 				    rxrpc_tx_point_call_data_frag);
 	} else {
 		trace_rxrpc_tx_packet(call->debug_id, &txb->wire,
diff --git a/net/rxrpc/txbuf.c b/net/rxrpc/txbuf.c
index d43be8512386..f2903c81cf5b 100644
--- a/net/rxrpc/txbuf.c
+++ b/net/rxrpc/txbuf.c
@@ -34,6 +34,7 @@ struct rxrpc_txbuf *rxrpc_alloc_txbuf(struct rxrpc_call *call, u8 packet_type,
 		txb->flags		= 0;
 		txb->ack_why		= 0;
 		txb->seq		= call->tx_prepared + 1;
+		txb->serial		= 0;
 		txb->wire.epoch		= htonl(call->conn->proto.epoch);
 		txb->wire.cid		= htonl(call->cid);
 		txb->wire.callNumber	= htonl(call->call_id);
-- 
cgit v1.2.3


From 12bdff73a147aebcea8efae2b395ef0c27448909 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 29 Jan 2024 15:01:10 +0000
Subject: rxrpc: Convert rxrpc_txbuf::flags into a mask and don't use atomics

Convert the transmission buffer flags into a mask and use | and & rather
than bitops functions (atomic ops are not required as only the I/O thread
can manipulate them once submitted for transmission).

The bottom byte can then correspond directly to the Rx protocol header
flags.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Marc Dionne <marc.dionne@auristor.com>
cc: "David S. Miller" <davem@davemloft.net>
cc: Eric Dumazet <edumazet@google.com>
cc: Jakub Kicinski <kuba@kernel.org>
cc: Paolo Abeni <pabeni@redhat.com>
cc: linux-afs@lists.infradead.org
cc: netdev@vger.kernel.org
---
 include/trace/events/rxrpc.h | 12 +++++-------
 net/rxrpc/ar-internal.h      |  8 ++++----
 net/rxrpc/call_event.c       |  8 ++++----
 net/rxrpc/input.c            |  2 +-
 net/rxrpc/output.c           | 27 +++++++++++++--------------
 net/rxrpc/sendmsg.c          | 10 ++++------
 net/rxrpc/txbuf.c            |  4 ++--
 7 files changed, 33 insertions(+), 38 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h
index 9add56980485..33888f688325 100644
--- a/include/trace/events/rxrpc.h
+++ b/include/trace/events/rxrpc.h
@@ -1084,9 +1084,9 @@ TRACE_EVENT(rxrpc_tx_packet,
 
 TRACE_EVENT(rxrpc_tx_data,
 	    TP_PROTO(struct rxrpc_call *call, rxrpc_seq_t seq,
-		     rxrpc_serial_t serial, u8 flags, bool retrans, bool lose),
+		     rxrpc_serial_t serial, unsigned int flags, bool lose),
 
-	    TP_ARGS(call, seq, serial, flags, retrans, lose),
+	    TP_ARGS(call, seq, serial, flags, lose),
 
 	    TP_STRUCT__entry(
 		    __field(unsigned int,	call)
@@ -1094,8 +1094,7 @@ TRACE_EVENT(rxrpc_tx_data,
 		    __field(rxrpc_serial_t,	serial)
 		    __field(u32,		cid)
 		    __field(u32,		call_id)
-		    __field(u8,			flags)
-		    __field(bool,		retrans)
+		    __field(u16,		flags)
 		    __field(bool,		lose)
 			     ),
 
@@ -1106,7 +1105,6 @@ TRACE_EVENT(rxrpc_tx_data,
 		    __entry->seq = seq;
 		    __entry->serial = serial;
 		    __entry->flags = flags;
-		    __entry->retrans = retrans;
 		    __entry->lose = lose;
 			   ),
 
@@ -1116,8 +1114,8 @@ TRACE_EVENT(rxrpc_tx_data,
 		      __entry->call_id,
 		      __entry->serial,
 		      __entry->seq,
-		      __entry->flags,
-		      __entry->retrans ? " *RETRANS*" : "",
+		      __entry->flags & RXRPC_TXBUF_WIRE_FLAGS,
+		      __entry->flags & RXRPC_TXBUF_RESENT ? " *RETRANS*" : "",
 		      __entry->lose ? " *LOSE*" : "")
 	    );
 
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index f76125755810..54d1dc97cb0f 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -800,9 +800,9 @@ struct rxrpc_txbuf {
 	unsigned int		len;		/* Amount of data in buffer */
 	unsigned int		space;		/* Remaining data space */
 	unsigned int		offset;		/* Offset of fill point */
-	unsigned long		flags;
-#define RXRPC_TXBUF_LAST	0		/* Set if last packet in Tx phase */
-#define RXRPC_TXBUF_RESENT	1		/* Set if has been resent */
+	unsigned int		flags;
+#define RXRPC_TXBUF_WIRE_FLAGS	0xff		/* The wire protocol flags */
+#define RXRPC_TXBUF_RESENT	0x100		/* Set if has been resent */
 	u8 /*enum rxrpc_propose_ack_trace*/ ack_why;	/* If ack, why */
 	struct {
 		/* The packet for encrypting and DMA'ing.  We align it such
@@ -822,7 +822,7 @@ struct rxrpc_txbuf {
 
 static inline bool rxrpc_sending_to_server(const struct rxrpc_txbuf *txb)
 {
-	return txb->wire.flags & RXRPC_CLIENT_INITIATED;
+	return txb->flags & RXRPC_CLIENT_INITIATED;
 }
 
 static inline bool rxrpc_sending_to_client(const struct rxrpc_txbuf *txb)
diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c
index a4c309976719..77eacbfc5d45 100644
--- a/net/rxrpc/call_event.c
+++ b/net/rxrpc/call_event.c
@@ -84,7 +84,7 @@ void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason,
 	txb->ack_why		= why;
 	txb->wire.seq		= 0;
 	txb->wire.type		= RXRPC_PACKET_TYPE_ACK;
-	txb->wire.flags		|= RXRPC_SLOW_START_OK;
+	txb->flags		|= RXRPC_SLOW_START_OK;
 	txb->ack.bufferSpace	= 0;
 	txb->ack.maxSkew	= 0;
 	txb->ack.firstPacket	= 0;
@@ -167,7 +167,7 @@ void rxrpc_resend(struct rxrpc_call *call, struct sk_buff *ack_skb)
 
 			if (list_empty(&txb->tx_link)) {
 				list_add_tail(&txb->tx_link, &retrans_queue);
-				set_bit(RXRPC_TXBUF_RESENT, &txb->flags);
+				txb->flags |= RXRPC_TXBUF_RESENT;
 			}
 
 			trace_rxrpc_retransmit(call, txb->seq, txb->serial,
@@ -210,7 +210,7 @@ void rxrpc_resend(struct rxrpc_call *call, struct sk_buff *ack_skb)
 		unacked = true;
 		if (list_empty(&txb->tx_link)) {
 			list_add_tail(&txb->tx_link, &retrans_queue);
-			set_bit(RXRPC_TXBUF_RESENT, &txb->flags);
+			txb->flags |= RXRPC_TXBUF_RESENT;
 			rxrpc_inc_stat(call->rxnet, stat_tx_data_retrans);
 		}
 	}
@@ -320,7 +320,7 @@ static void rxrpc_decant_prepared_tx(struct rxrpc_call *call)
 		call->tx_top = txb->seq;
 		list_add_tail(&txb->call_link, &call->tx_buffer);
 
-		if (txb->wire.flags & RXRPC_LAST_PACKET)
+		if (txb->flags & RXRPC_LAST_PACKET)
 			rxrpc_close_tx_phase(call);
 
 		rxrpc_transmit_one(call, txb);
diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index 9691de00ade7..c435b50c33f4 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -212,7 +212,7 @@ static bool rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to,
 	list_for_each_entry_rcu(txb, &call->tx_buffer, call_link, false) {
 		if (before_eq(txb->seq, call->acks_hard_ack))
 			continue;
-		if (test_bit(RXRPC_TXBUF_LAST, &txb->flags)) {
+		if (txb->flags & RXRPC_LAST_PACKET) {
 			set_bit(RXRPC_CALL_TX_LAST, &call->flags);
 			rot_last = true;
 		}
diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c
index bad96a983e12..8344ece5358a 100644
--- a/net/rxrpc/output.c
+++ b/net/rxrpc/output.c
@@ -205,7 +205,8 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb)
 	msg.msg_flags	= 0;
 
 	if (txb->ack.reason == RXRPC_ACK_PING)
-		txb->wire.flags |= RXRPC_REQUEST_ACK;
+		txb->flags |= RXRPC_REQUEST_ACK;
+	txb->wire.flags = txb->flags & RXRPC_TXBUF_WIRE_FLAGS;
 
 	n = rxrpc_fill_out_ack(conn, call, txb, &rwind);
 	if (n == 0)
@@ -239,7 +240,7 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb)
 	} else {
 		trace_rxrpc_tx_packet(call->debug_id, &txb->wire,
 				      rxrpc_tx_point_call_ack);
-		if (txb->wire.flags & RXRPC_REQUEST_ACK)
+		if (txb->flags & RXRPC_REQUEST_ACK)
 			call->peer->rtt_last_req = ktime_get_real();
 	}
 	rxrpc_tx_backoff(call, ret);
@@ -357,13 +358,13 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb)
 	 * service call, lest OpenAFS incorrectly send us an ACK with some
 	 * soft-ACKs in it and then never follow up with a proper hard ACK.
 	 */
-	if (txb->wire.flags & RXRPC_REQUEST_ACK)
+	if (txb->flags & RXRPC_REQUEST_ACK)
 		why = rxrpc_reqack_already_on;
-	else if (test_bit(RXRPC_TXBUF_LAST, &txb->flags) && rxrpc_sending_to_client(txb))
+	else if ((txb->flags & RXRPC_LAST_PACKET) && rxrpc_sending_to_client(txb))
 		why = rxrpc_reqack_no_srv_last;
 	else if (test_and_clear_bit(RXRPC_CALL_EV_ACK_LOST, &call->events))
 		why = rxrpc_reqack_ack_lost;
-	else if (test_bit(RXRPC_TXBUF_RESENT, &txb->flags))
+	else if (txb->flags & RXRPC_TXBUF_RESENT)
 		why = rxrpc_reqack_retrans;
 	else if (call->cong_mode == RXRPC_CALL_SLOW_START && call->cong_cwnd <= 2)
 		why = rxrpc_reqack_slow_start;
@@ -379,7 +380,7 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb)
 	rxrpc_inc_stat(call->rxnet, stat_why_req_ack[why]);
 	trace_rxrpc_req_ack(call->debug_id, txb->seq, why);
 	if (why != rxrpc_reqack_no_srv_last)
-		txb->wire.flags |= RXRPC_REQUEST_ACK;
+		txb->flags |= RXRPC_REQUEST_ACK;
 dont_set_request_ack:
 
 	if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) {
@@ -387,15 +388,12 @@ dont_set_request_ack:
 		if ((lose++ & 7) == 7) {
 			ret = 0;
 			trace_rxrpc_tx_data(call, txb->seq, txb->serial,
-					    txb->wire.flags,
-					    test_bit(RXRPC_TXBUF_RESENT, &txb->flags),
-					    true);
+					    txb->flags, true);
 			goto done;
 		}
 	}
 
-	trace_rxrpc_tx_data(call, txb->seq, txb->serial, txb->wire.flags,
-			    test_bit(RXRPC_TXBUF_RESENT, &txb->flags), false);
+	trace_rxrpc_tx_data(call, txb->seq, txb->serial, txb->flags, false);
 
 	/* Track what we've attempted to transmit at least once so that the
 	 * retransmission algorithm doesn't try to resend what we haven't sent
@@ -411,8 +409,9 @@ dont_set_request_ack:
 	if (txb->len >= call->peer->maxdata)
 		goto send_fragmentable;
 
+	txb->wire.flags = txb->flags & RXRPC_TXBUF_WIRE_FLAGS;
 	txb->last_sent = ktime_get_real();
-	if (txb->wire.flags & RXRPC_REQUEST_ACK)
+	if (txb->flags & RXRPC_REQUEST_ACK)
 		rtt_slot = rxrpc_begin_rtt_probe(call, txb->serial, rxrpc_rtt_tx_data);
 
 	/* send the packet by UDP
@@ -442,7 +441,7 @@ dont_set_request_ack:
 done:
 	if (ret >= 0) {
 		call->tx_last_sent = txb->last_sent;
-		if (txb->wire.flags & RXRPC_REQUEST_ACK) {
+		if (txb->flags & RXRPC_REQUEST_ACK) {
 			call->peer->rtt_last_req = txb->last_sent;
 			if (call->peer->rtt_count > 1) {
 				unsigned long nowj = jiffies, ack_lost_at;
@@ -486,7 +485,7 @@ send_fragmentable:
 	_debug("send fragment");
 
 	txb->last_sent = ktime_get_real();
-	if (txb->wire.flags & RXRPC_REQUEST_ACK)
+	if (txb->flags & RXRPC_REQUEST_ACK)
 		rtt_slot = rxrpc_begin_rtt_probe(call, txb->serial, rxrpc_rtt_tx_data);
 
 	switch (conn->local->srx.transport.family) {
diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c
index 5677d5690a02..25c7c1d4f4c6 100644
--- a/net/rxrpc/sendmsg.c
+++ b/net/rxrpc/sendmsg.c
@@ -240,7 +240,7 @@ static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call,
 			       rxrpc_notify_end_tx_t notify_end_tx)
 {
 	rxrpc_seq_t seq = txb->seq;
-	bool last = test_bit(RXRPC_TXBUF_LAST, &txb->flags), poke;
+	bool poke, last = txb->flags & RXRPC_LAST_PACKET;
 
 	rxrpc_inc_stat(call->rxnet, stat_tx_data);
 
@@ -394,13 +394,11 @@ reload:
 		/* add the packet to the send queue if it's now full */
 		if (!txb->space ||
 		    (msg_data_left(msg) == 0 && !more)) {
-			if (msg_data_left(msg) == 0 && !more) {
-				txb->wire.flags |= RXRPC_LAST_PACKET;
-				__set_bit(RXRPC_TXBUF_LAST, &txb->flags);
-			}
+			if (msg_data_left(msg) == 0 && !more)
+				txb->flags |= RXRPC_LAST_PACKET;
 			else if (call->tx_top - call->acks_hard_ack <
 				 call->tx_winsize)
-				txb->wire.flags |= RXRPC_MORE_PACKETS;
+				txb->flags |= RXRPC_MORE_PACKETS;
 
 			ret = call->security->secure_packet(call, txb);
 			if (ret < 0)
diff --git a/net/rxrpc/txbuf.c b/net/rxrpc/txbuf.c
index f2903c81cf5b..48d5a8f644e5 100644
--- a/net/rxrpc/txbuf.c
+++ b/net/rxrpc/txbuf.c
@@ -31,7 +31,7 @@ struct rxrpc_txbuf *rxrpc_alloc_txbuf(struct rxrpc_call *call, u8 packet_type,
 		txb->space		= sizeof(txb->data);
 		txb->len		= 0;
 		txb->offset		= 0;
-		txb->flags		= 0;
+		txb->flags		= call->conn->out_clientflag;
 		txb->ack_why		= 0;
 		txb->seq		= call->tx_prepared + 1;
 		txb->serial		= 0;
@@ -40,7 +40,7 @@ struct rxrpc_txbuf *rxrpc_alloc_txbuf(struct rxrpc_call *call, u8 packet_type,
 		txb->wire.callNumber	= htonl(call->call_id);
 		txb->wire.seq		= htonl(txb->seq);
 		txb->wire.type		= packet_type;
-		txb->wire.flags		= call->conn->out_clientflag;
+		txb->wire.flags		= 0;
 		txb->wire.userStatus	= 0;
 		txb->wire.securityIndex	= call->security_ix;
 		txb->wire._rsvd		= 0;
-- 
cgit v1.2.3


From 17469ae0582aaacad36e8e858f58b86c369f21ef Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 26 Jan 2024 16:17:03 +0000
Subject: rxrpc: Fix the names of the fields in the ACK trailer struct

From AFS-3.3 a trailer containing extra info was added to the ACK packet
format - but AF_RXRPC has the names of some of the fields mixed up compared
to other AFS implementations.

Rename the struct and the fields to make them match.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Marc Dionne <marc.dionne@auristor.com>
cc: "David S. Miller" <davem@davemloft.net>
cc: Eric Dumazet <edumazet@google.com>
cc: Jakub Kicinski <kuba@kernel.org>
cc: Paolo Abeni <pabeni@redhat.com>
cc: linux-afs@lists.infradead.org
cc: netdev@vger.kernel.org
---
 include/trace/events/rxrpc.h |  2 +-
 net/rxrpc/conn_event.c       | 16 ++++++++--------
 net/rxrpc/input.c            | 22 +++++++++++-----------
 net/rxrpc/output.c           | 14 +++++++-------
 net/rxrpc/protocol.h         |  6 +++---
 5 files changed, 30 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h
index 33888f688325..c730cd732348 100644
--- a/include/trace/events/rxrpc.h
+++ b/include/trace/events/rxrpc.h
@@ -83,7 +83,7 @@
 	EM(rxrpc_badmsg_bad_abort,		"bad-abort")		\
 	EM(rxrpc_badmsg_bad_jumbo,		"bad-jumbo")		\
 	EM(rxrpc_badmsg_short_ack,		"short-ack")		\
-	EM(rxrpc_badmsg_short_ack_info,		"short-ack-info")	\
+	EM(rxrpc_badmsg_short_ack_trailer,	"short-ack-trailer")	\
 	EM(rxrpc_badmsg_short_hdr,		"short-hdr")		\
 	EM(rxrpc_badmsg_unsupported_packet,	"unsup-pkt")		\
 	EM(rxrpc_badmsg_zero_call,		"zero-call")		\
diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c
index 1f251d758cb9..598b4ee389fc 100644
--- a/net/rxrpc/conn_event.c
+++ b/net/rxrpc/conn_event.c
@@ -88,7 +88,7 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
 			struct rxrpc_ackpacket ack;
 		};
 	} __attribute__((packed)) pkt;
-	struct rxrpc_ackinfo ack_info;
+	struct rxrpc_acktrailer trailer;
 	size_t len;
 	int ret, ioc;
 	u32 serial, mtu, call_id, padding;
@@ -122,8 +122,8 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
 	iov[0].iov_len	= sizeof(pkt.whdr);
 	iov[1].iov_base	= &padding;
 	iov[1].iov_len	= 3;
-	iov[2].iov_base	= &ack_info;
-	iov[2].iov_len	= sizeof(ack_info);
+	iov[2].iov_base	= &trailer;
+	iov[2].iov_len	= sizeof(trailer);
 
 	serial = rxrpc_get_next_serial(conn);
 
@@ -158,14 +158,14 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
 		pkt.ack.serial		= htonl(skb ? sp->hdr.serial : 0);
 		pkt.ack.reason		= skb ? RXRPC_ACK_DUPLICATE : RXRPC_ACK_IDLE;
 		pkt.ack.nAcks		= 0;
-		ack_info.rxMTU		= htonl(rxrpc_rx_mtu);
-		ack_info.maxMTU		= htonl(mtu);
-		ack_info.rwind		= htonl(rxrpc_rx_window_size);
-		ack_info.jumbo_max	= htonl(rxrpc_rx_jumbo_max);
+		trailer.maxMTU		= htonl(rxrpc_rx_mtu);
+		trailer.ifMTU		= htonl(mtu);
+		trailer.rwind		= htonl(rxrpc_rx_window_size);
+		trailer.jumbo_max	= htonl(rxrpc_rx_jumbo_max);
 		pkt.whdr.flags		|= RXRPC_SLOW_START_OK;
 		padding			= 0;
 		iov[0].iov_len += sizeof(pkt.ack);
-		len += sizeof(pkt.ack) + 3 + sizeof(ack_info);
+		len += sizeof(pkt.ack) + 3 + sizeof(trailer);
 		ioc = 3;
 
 		trace_rxrpc_tx_ack(chan->call_debug_id, serial,
diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index c435b50c33f4..ea2df62e05e7 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -670,14 +670,14 @@ static void rxrpc_complete_rtt_probe(struct rxrpc_call *call,
 /*
  * Process the extra information that may be appended to an ACK packet
  */
-static void rxrpc_input_ackinfo(struct rxrpc_call *call, struct sk_buff *skb,
-				struct rxrpc_ackinfo *ackinfo)
+static void rxrpc_input_ack_trailer(struct rxrpc_call *call, struct sk_buff *skb,
+				    struct rxrpc_acktrailer *trailer)
 {
 	struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
 	struct rxrpc_peer *peer;
 	unsigned int mtu;
 	bool wake = false;
-	u32 rwind = ntohl(ackinfo->rwind);
+	u32 rwind = ntohl(trailer->rwind);
 
 	if (rwind > RXRPC_TX_MAX_WINDOW)
 		rwind = RXRPC_TX_MAX_WINDOW;
@@ -691,7 +691,7 @@ static void rxrpc_input_ackinfo(struct rxrpc_call *call, struct sk_buff *skb,
 	if (call->cong_ssthresh > rwind)
 		call->cong_ssthresh = rwind;
 
-	mtu = min(ntohl(ackinfo->rxMTU), ntohl(ackinfo->maxMTU));
+	mtu = min(ntohl(trailer->maxMTU), ntohl(trailer->ifMTU));
 
 	peer = call->peer;
 	if (mtu < peer->maxdata) {
@@ -837,7 +837,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
 	struct rxrpc_ack_summary summary = { 0 };
 	struct rxrpc_ackpacket ack;
 	struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
-	struct rxrpc_ackinfo info;
+	struct rxrpc_acktrailer trailer;
 	rxrpc_serial_t ack_serial, acked_serial;
 	rxrpc_seq_t first_soft_ack, hard_ack, prev_pkt, since;
 	int nr_acks, offset, ioffset;
@@ -917,11 +917,11 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
 		goto send_response;
 	}
 
-	info.rxMTU = 0;
+	trailer.maxMTU = 0;
 	ioffset = offset + nr_acks + 3;
-	if (skb->len >= ioffset + sizeof(info) &&
-	    skb_copy_bits(skb, ioffset, &info, sizeof(info)) < 0)
-		return rxrpc_proto_abort(call, 0, rxrpc_badmsg_short_ack_info);
+	if (skb->len >= ioffset + sizeof(trailer) &&
+	    skb_copy_bits(skb, ioffset, &trailer, sizeof(trailer)) < 0)
+		return rxrpc_proto_abort(call, 0, rxrpc_badmsg_short_ack_trailer);
 
 	if (nr_acks > 0)
 		skb_condense(skb);
@@ -950,8 +950,8 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
 	}
 
 	/* Parse rwind and mtu sizes if provided. */
-	if (info.rxMTU)
-		rxrpc_input_ackinfo(call, skb, &info);
+	if (trailer.maxMTU)
+		rxrpc_input_ack_trailer(call, skb, &trailer);
 
 	if (first_soft_ack == 0)
 		return rxrpc_proto_abort(call, 0, rxrpc_eproto_ackr_zero);
diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c
index 828b145edc56..3803bf900a46 100644
--- a/net/rxrpc/output.c
+++ b/net/rxrpc/output.c
@@ -83,7 +83,7 @@ static size_t rxrpc_fill_out_ack(struct rxrpc_connection *conn,
 				 struct rxrpc_txbuf *txb,
 				 u16 *_rwind)
 {
-	struct rxrpc_ackinfo ackinfo;
+	struct rxrpc_acktrailer trailer;
 	unsigned int qsize, sack, wrap, to;
 	rxrpc_seq_t window, wtop;
 	int rsize;
@@ -126,16 +126,16 @@ static size_t rxrpc_fill_out_ack(struct rxrpc_connection *conn,
 	qsize = (window - 1) - call->rx_consumed;
 	rsize = max_t(int, call->rx_winsize - qsize, 0);
 	*_rwind = rsize;
-	ackinfo.rxMTU		= htonl(rxrpc_rx_mtu);
-	ackinfo.maxMTU		= htonl(mtu);
-	ackinfo.rwind		= htonl(rsize);
-	ackinfo.jumbo_max	= htonl(jmax);
+	trailer.maxMTU		= htonl(rxrpc_rx_mtu);
+	trailer.ifMTU		= htonl(mtu);
+	trailer.rwind		= htonl(rsize);
+	trailer.jumbo_max	= htonl(jmax);
 
 	*ackp++ = 0;
 	*ackp++ = 0;
 	*ackp++ = 0;
-	memcpy(ackp, &ackinfo, sizeof(ackinfo));
-	return txb->ack.nAcks + 3 + sizeof(ackinfo);
+	memcpy(ackp, &trailer, sizeof(trailer));
+	return txb->ack.nAcks + 3 + sizeof(trailer);
 }
 
 /*
diff --git a/net/rxrpc/protocol.h b/net/rxrpc/protocol.h
index e8ee4af43ca8..4fe6b4d20ada 100644
--- a/net/rxrpc/protocol.h
+++ b/net/rxrpc/protocol.h
@@ -135,9 +135,9 @@ struct rxrpc_ackpacket {
 /*
  * ACK packets can have a further piece of information tagged on the end
  */
-struct rxrpc_ackinfo {
-	__be32		rxMTU;		/* maximum Rx MTU size (bytes) [AFS 3.3] */
-	__be32		maxMTU;		/* maximum interface MTU size (bytes) [AFS 3.3] */
+struct rxrpc_acktrailer {
+	__be32		maxMTU;		/* maximum Rx MTU size (bytes) [AFS 3.3] */
+	__be32		ifMTU;		/* maximum interface MTU size (bytes) [AFS 3.3] */
 	__be32		rwind;		/* Rx window size (packets) [AFS 3.4] */
 	__be32		jumbo_max;	/* max packets to stick into a jumbo packet [AFS 3.5] */
 };
-- 
cgit v1.2.3


From 896880ff30866f386ebed14ab81ce1ad3710cfc4 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Thu, 22 Feb 2024 07:56:15 -0800
Subject: bpf: Replace bpf_lpm_trie_key 0-length array with flexible array

Replace deprecated 0-length array in struct bpf_lpm_trie_key with
flexible array. Found with GCC 13:

../kernel/bpf/lpm_trie.c:207:51: warning: array subscript i is outside array bounds of 'const __u8[0]' {aka 'const unsigned char[]'} [-Warray-bounds=]
  207 |                                        *(__be16 *)&key->data[i]);
      |                                                   ^~~~~~~~~~~~~
../include/uapi/linux/swab.h:102:54: note: in definition of macro '__swab16'
  102 | #define __swab16(x) (__u16)__builtin_bswap16((__u16)(x))
      |                                                      ^
../include/linux/byteorder/generic.h:97:21: note: in expansion of macro '__be16_to_cpu'
   97 | #define be16_to_cpu __be16_to_cpu
      |                     ^~~~~~~~~~~~~
../kernel/bpf/lpm_trie.c:206:28: note: in expansion of macro 'be16_to_cpu'
  206 |                 u16 diff = be16_to_cpu(*(__be16 *)&node->data[i]
^
      |                            ^~~~~~~~~~~
In file included from ../include/linux/bpf.h:7:
../include/uapi/linux/bpf.h:82:17: note: while referencing 'data'
   82 |         __u8    data[0];        /* Arbitrary size */
      |                 ^~~~

And found at run-time under CONFIG_FORTIFY_SOURCE:

  UBSAN: array-index-out-of-bounds in kernel/bpf/lpm_trie.c:218:49
  index 0 is out of range for type '__u8 [*]'

Changing struct bpf_lpm_trie_key is difficult since has been used by
userspace. For example, in Cilium:

	struct egress_gw_policy_key {
	        struct bpf_lpm_trie_key lpm_key;
	        __u32 saddr;
	        __u32 daddr;
	};

While direct references to the "data" member haven't been found, there
are static initializers what include the final member. For example,
the "{}" here:

        struct egress_gw_policy_key in_key = {
                .lpm_key = { 32 + 24, {} },
                .saddr   = CLIENT_IP,
                .daddr   = EXTERNAL_SVC_IP & 0Xffffff,
        };

To avoid the build time and run time warnings seen with a 0-sized
trailing array for struct bpf_lpm_trie_key, introduce a new struct
that correctly uses a flexible array for the trailing bytes,
struct bpf_lpm_trie_key_u8. As part of this, include the "header"
portion (which is just the "prefixlen" member), so it can be used
by anything building a bpf_lpr_trie_key that has trailing members that
aren't a u8 flexible array (like the self-test[1]), which is named
struct bpf_lpm_trie_key_hdr.

Unfortunately, C++ refuses to parse the __struct_group() helper, so
it is not possible to define struct bpf_lpm_trie_key_hdr directly in
struct bpf_lpm_trie_key_u8, so we must open-code the union directly.

Adjust the kernel code to use struct bpf_lpm_trie_key_u8 through-out,
and for the selftest to use struct bpf_lpm_trie_key_hdr. Add a comment
to the UAPI header directing folks to the two new options.

Reported-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Closes: https://paste.debian.net/hidden/ca500597/
Link: https://lore.kernel.org/all/202206281009.4332AA33@keescook/ [1]
Link: https://lore.kernel.org/bpf/20240222155612.it.533-kees@kernel.org
---
 Documentation/bpf/map_lpm_trie.rst               |  2 +-
 include/uapi/linux/bpf.h                         | 19 ++++++++++++++++++-
 kernel/bpf/lpm_trie.c                            | 20 ++++++++++----------
 samples/bpf/map_perf_test_user.c                 |  2 +-
 samples/bpf/xdp_router_ipv4_user.c               |  2 +-
 tools/include/uapi/linux/bpf.h                   | 19 ++++++++++++++++++-
 tools/testing/selftests/bpf/progs/map_ptr_kern.c |  2 +-
 tools/testing/selftests/bpf/test_lpm_map.c       | 18 +++++++++---------
 8 files changed, 59 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/Documentation/bpf/map_lpm_trie.rst b/Documentation/bpf/map_lpm_trie.rst
index 74d64a30f500..f9cd579496c9 100644
--- a/Documentation/bpf/map_lpm_trie.rst
+++ b/Documentation/bpf/map_lpm_trie.rst
@@ -17,7 +17,7 @@ significant byte.
 
 LPM tries may be created with a maximum prefix length that is a multiple
 of 8, in the range from 8 to 2048. The key used for lookup and update
-operations is a ``struct bpf_lpm_trie_key``, extended by
+operations is a ``struct bpf_lpm_trie_key_u8``, extended by
 ``max_prefixlen/8`` bytes.
 
 - For IPv4 addresses the data length is 4 bytes
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index d2e6c5fcec01..a241f407c234 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -77,12 +77,29 @@ struct bpf_insn {
 	__s32	imm;		/* signed immediate constant */
 };
 
-/* Key of an a BPF_MAP_TYPE_LPM_TRIE entry */
+/* Deprecated: use struct bpf_lpm_trie_key_u8 (when the "data" member is needed for
+ * byte access) or struct bpf_lpm_trie_key_hdr (when using an alternative type for
+ * the trailing flexible array member) instead.
+ */
 struct bpf_lpm_trie_key {
 	__u32	prefixlen;	/* up to 32 for AF_INET, 128 for AF_INET6 */
 	__u8	data[0];	/* Arbitrary size */
 };
 
+/* Header for bpf_lpm_trie_key structs */
+struct bpf_lpm_trie_key_hdr {
+	__u32	prefixlen;
+};
+
+/* Key of an a BPF_MAP_TYPE_LPM_TRIE entry, with trailing byte array. */
+struct bpf_lpm_trie_key_u8 {
+	union {
+		struct bpf_lpm_trie_key_hdr	hdr;
+		__u32				prefixlen;
+	};
+	__u8	data[];		/* Arbitrary size */
+};
+
 struct bpf_cgroup_storage_key {
 	__u64	cgroup_inode_id;	/* cgroup inode id */
 	__u32	attach_type;		/* program attach type (enum bpf_attach_type) */
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index b32be680da6c..050fe1ebf0f7 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -164,13 +164,13 @@ static inline int extract_bit(const u8 *data, size_t index)
  */
 static size_t longest_prefix_match(const struct lpm_trie *trie,
 				   const struct lpm_trie_node *node,
-				   const struct bpf_lpm_trie_key *key)
+				   const struct bpf_lpm_trie_key_u8 *key)
 {
 	u32 limit = min(node->prefixlen, key->prefixlen);
 	u32 prefixlen = 0, i = 0;
 
 	BUILD_BUG_ON(offsetof(struct lpm_trie_node, data) % sizeof(u32));
-	BUILD_BUG_ON(offsetof(struct bpf_lpm_trie_key, data) % sizeof(u32));
+	BUILD_BUG_ON(offsetof(struct bpf_lpm_trie_key_u8, data) % sizeof(u32));
 
 #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && defined(CONFIG_64BIT)
 
@@ -229,7 +229,7 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key)
 {
 	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
 	struct lpm_trie_node *node, *found = NULL;
-	struct bpf_lpm_trie_key *key = _key;
+	struct bpf_lpm_trie_key_u8 *key = _key;
 
 	if (key->prefixlen > trie->max_prefixlen)
 		return NULL;
@@ -309,7 +309,7 @@ static long trie_update_elem(struct bpf_map *map,
 	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
 	struct lpm_trie_node *node, *im_node = NULL, *new_node = NULL;
 	struct lpm_trie_node __rcu **slot;
-	struct bpf_lpm_trie_key *key = _key;
+	struct bpf_lpm_trie_key_u8 *key = _key;
 	unsigned long irq_flags;
 	unsigned int next_bit;
 	size_t matchlen = 0;
@@ -437,7 +437,7 @@ out:
 static long trie_delete_elem(struct bpf_map *map, void *_key)
 {
 	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
-	struct bpf_lpm_trie_key *key = _key;
+	struct bpf_lpm_trie_key_u8 *key = _key;
 	struct lpm_trie_node __rcu **trim, **trim2;
 	struct lpm_trie_node *node, *parent;
 	unsigned long irq_flags;
@@ -536,7 +536,7 @@ out:
 				 sizeof(struct lpm_trie_node))
 #define LPM_VAL_SIZE_MIN	1
 
-#define LPM_KEY_SIZE(X)		(sizeof(struct bpf_lpm_trie_key) + (X))
+#define LPM_KEY_SIZE(X)		(sizeof(struct bpf_lpm_trie_key_u8) + (X))
 #define LPM_KEY_SIZE_MAX	LPM_KEY_SIZE(LPM_DATA_SIZE_MAX)
 #define LPM_KEY_SIZE_MIN	LPM_KEY_SIZE(LPM_DATA_SIZE_MIN)
 
@@ -565,7 +565,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
 	/* copy mandatory map attributes */
 	bpf_map_init_from_attr(&trie->map, attr);
 	trie->data_size = attr->key_size -
-			  offsetof(struct bpf_lpm_trie_key, data);
+			  offsetof(struct bpf_lpm_trie_key_u8, data);
 	trie->max_prefixlen = trie->data_size * 8;
 
 	spin_lock_init(&trie->lock);
@@ -616,7 +616,7 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key)
 {
 	struct lpm_trie_node *node, *next_node = NULL, *parent, *search_root;
 	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
-	struct bpf_lpm_trie_key *key = _key, *next_key = _next_key;
+	struct bpf_lpm_trie_key_u8 *key = _key, *next_key = _next_key;
 	struct lpm_trie_node **node_stack = NULL;
 	int err = 0, stack_ptr = -1;
 	unsigned int next_bit;
@@ -703,7 +703,7 @@ find_leftmost:
 	}
 do_copy:
 	next_key->prefixlen = next_node->prefixlen;
-	memcpy((void *)next_key + offsetof(struct bpf_lpm_trie_key, data),
+	memcpy((void *)next_key + offsetof(struct bpf_lpm_trie_key_u8, data),
 	       next_node->data, trie->data_size);
 free_stack:
 	kfree(node_stack);
@@ -715,7 +715,7 @@ static int trie_check_btf(const struct bpf_map *map,
 			  const struct btf_type *key_type,
 			  const struct btf_type *value_type)
 {
-	/* Keys must have struct bpf_lpm_trie_key embedded. */
+	/* Keys must have struct bpf_lpm_trie_key_u8 embedded. */
 	return BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT ?
 	       -EINVAL : 0;
 }
diff --git a/samples/bpf/map_perf_test_user.c b/samples/bpf/map_perf_test_user.c
index d2fbcf963cdf..07ff471ed6ae 100644
--- a/samples/bpf/map_perf_test_user.c
+++ b/samples/bpf/map_perf_test_user.c
@@ -370,7 +370,7 @@ static void run_perf_test(int tasks)
 
 static void fill_lpm_trie(void)
 {
-	struct bpf_lpm_trie_key *key;
+	struct bpf_lpm_trie_key_u8 *key;
 	unsigned long value = 0;
 	unsigned int i;
 	int r;
diff --git a/samples/bpf/xdp_router_ipv4_user.c b/samples/bpf/xdp_router_ipv4_user.c
index 9d41db09c480..266fdd0b025d 100644
--- a/samples/bpf/xdp_router_ipv4_user.c
+++ b/samples/bpf/xdp_router_ipv4_user.c
@@ -91,7 +91,7 @@ static int recv_msg(struct sockaddr_nl sock_addr, int sock)
 static void read_route(struct nlmsghdr *nh, int nll)
 {
 	char dsts[24], gws[24], ifs[16], dsts_len[24], metrics[24];
-	struct bpf_lpm_trie_key *prefix_key;
+	struct bpf_lpm_trie_key_u8 *prefix_key;
 	struct rtattr *rt_attr;
 	struct rtmsg *rt_msg;
 	int rtm_family;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index d2e6c5fcec01..a241f407c234 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -77,12 +77,29 @@ struct bpf_insn {
 	__s32	imm;		/* signed immediate constant */
 };
 
-/* Key of an a BPF_MAP_TYPE_LPM_TRIE entry */
+/* Deprecated: use struct bpf_lpm_trie_key_u8 (when the "data" member is needed for
+ * byte access) or struct bpf_lpm_trie_key_hdr (when using an alternative type for
+ * the trailing flexible array member) instead.
+ */
 struct bpf_lpm_trie_key {
 	__u32	prefixlen;	/* up to 32 for AF_INET, 128 for AF_INET6 */
 	__u8	data[0];	/* Arbitrary size */
 };
 
+/* Header for bpf_lpm_trie_key structs */
+struct bpf_lpm_trie_key_hdr {
+	__u32	prefixlen;
+};
+
+/* Key of an a BPF_MAP_TYPE_LPM_TRIE entry, with trailing byte array. */
+struct bpf_lpm_trie_key_u8 {
+	union {
+		struct bpf_lpm_trie_key_hdr	hdr;
+		__u32				prefixlen;
+	};
+	__u8	data[];		/* Arbitrary size */
+};
+
 struct bpf_cgroup_storage_key {
 	__u64	cgroup_inode_id;	/* cgroup inode id */
 	__u32	attach_type;		/* program attach type (enum bpf_attach_type) */
diff --git a/tools/testing/selftests/bpf/progs/map_ptr_kern.c b/tools/testing/selftests/bpf/progs/map_ptr_kern.c
index 3325da17ec81..efaf622c28dd 100644
--- a/tools/testing/selftests/bpf/progs/map_ptr_kern.c
+++ b/tools/testing/selftests/bpf/progs/map_ptr_kern.c
@@ -316,7 +316,7 @@ struct lpm_trie {
 } __attribute__((preserve_access_index));
 
 struct lpm_key {
-	struct bpf_lpm_trie_key trie_key;
+	struct bpf_lpm_trie_key_hdr trie_key;
 	__u32 data;
 };
 
diff --git a/tools/testing/selftests/bpf/test_lpm_map.c b/tools/testing/selftests/bpf/test_lpm_map.c
index c028d621c744..d98c72dc563e 100644
--- a/tools/testing/selftests/bpf/test_lpm_map.c
+++ b/tools/testing/selftests/bpf/test_lpm_map.c
@@ -211,7 +211,7 @@ static void test_lpm_map(int keysize)
 	volatile size_t n_matches, n_matches_after_delete;
 	size_t i, j, n_nodes, n_lookups;
 	struct tlpm_node *t, *list = NULL;
-	struct bpf_lpm_trie_key *key;
+	struct bpf_lpm_trie_key_u8 *key;
 	uint8_t *data, *value;
 	int r, map;
 
@@ -331,8 +331,8 @@ static void test_lpm_map(int keysize)
 static void test_lpm_ipaddr(void)
 {
 	LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = BPF_F_NO_PREALLOC);
-	struct bpf_lpm_trie_key *key_ipv4;
-	struct bpf_lpm_trie_key *key_ipv6;
+	struct bpf_lpm_trie_key_u8 *key_ipv4;
+	struct bpf_lpm_trie_key_u8 *key_ipv6;
 	size_t key_size_ipv4;
 	size_t key_size_ipv6;
 	int map_fd_ipv4;
@@ -423,7 +423,7 @@ static void test_lpm_ipaddr(void)
 static void test_lpm_delete(void)
 {
 	LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = BPF_F_NO_PREALLOC);
-	struct bpf_lpm_trie_key *key;
+	struct bpf_lpm_trie_key_u8 *key;
 	size_t key_size;
 	int map_fd;
 	__u64 value;
@@ -532,7 +532,7 @@ static void test_lpm_delete(void)
 static void test_lpm_get_next_key(void)
 {
 	LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = BPF_F_NO_PREALLOC);
-	struct bpf_lpm_trie_key *key_p, *next_key_p;
+	struct bpf_lpm_trie_key_u8 *key_p, *next_key_p;
 	size_t key_size;
 	__u32 value = 0;
 	int map_fd;
@@ -693,9 +693,9 @@ static void *lpm_test_command(void *arg)
 {
 	int i, j, ret, iter, key_size;
 	struct lpm_mt_test_info *info = arg;
-	struct bpf_lpm_trie_key *key_p;
+	struct bpf_lpm_trie_key_u8 *key_p;
 
-	key_size = sizeof(struct bpf_lpm_trie_key) + sizeof(__u32);
+	key_size = sizeof(*key_p) + sizeof(__u32);
 	key_p = alloca(key_size);
 	for (iter = 0; iter < info->iter; iter++)
 		for (i = 0; i < MAX_TEST_KEYS; i++) {
@@ -717,7 +717,7 @@ static void *lpm_test_command(void *arg)
 				ret = bpf_map_lookup_elem(info->map_fd, key_p, &value);
 				assert(ret == 0 || errno == ENOENT);
 			} else {
-				struct bpf_lpm_trie_key *next_key_p = alloca(key_size);
+				struct bpf_lpm_trie_key_u8 *next_key_p = alloca(key_size);
 				ret = bpf_map_get_next_key(info->map_fd, key_p, next_key_p);
 				assert(ret == 0 || errno == ENOENT || errno == ENOMEM);
 			}
@@ -752,7 +752,7 @@ static void test_lpm_multi_thread(void)
 
 	/* create a trie */
 	value_size = sizeof(__u32);
-	key_size = sizeof(struct bpf_lpm_trie_key) + value_size;
+	key_size = sizeof(struct bpf_lpm_trie_key_hdr) + value_size;
 	map_fd = bpf_map_create(BPF_MAP_TYPE_LPM_TRIE, NULL, key_size, value_size, 100, &opts);
 
 	/* create 4 threads to test update, delete, lookup and get_next_key */
-- 
cgit v1.2.3


From 096361b15577a583afcc28179a08c75cf95e9dae Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 28 Feb 2024 13:54:25 +0000
Subject: ipv6: add ipv6_devconf_read_txrx cacheline_group

IPv6 TX and RX fast path use the following fields:

- disable_ipv6
- hop_limit
- mtu6
- forwarding
- disable_policy
- proxy_ndp

Place them in a group to increase data locality.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index ef3aa060a289..383a0ea2ab91 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -3,6 +3,7 @@
 #define _IPV6_H
 
 #include <uapi/linux/ipv6.h>
+#include <linux/cache.h>
 
 #define ipv6_optlen(p)  (((p)->hdrlen+1) << 3)
 #define ipv6_authlen(p) (((p)->hdrlen+2) << 2)
@@ -10,9 +11,16 @@
  * This structure contains configuration options per IPv6 link.
  */
 struct ipv6_devconf {
-	__s32		forwarding;
+	/* RX & TX fastpath fields. */
+	__cacheline_group_begin(ipv6_devconf_read_txrx);
+	__s32		disable_ipv6;
 	__s32		hop_limit;
 	__s32		mtu6;
+	__s32		forwarding;
+	__s32		disable_policy;
+	__s32		proxy_ndp;
+	__cacheline_group_end(ipv6_devconf_read_txrx);
+
 	__s32		accept_ra;
 	__s32		accept_redirects;
 	__s32		autoconf;
@@ -45,7 +53,6 @@ struct ipv6_devconf {
 	__s32		accept_ra_rt_info_max_plen;
 #endif
 #endif
-	__s32		proxy_ndp;
 	__s32		accept_source_route;
 	__s32		accept_ra_from_local;
 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
@@ -55,7 +62,6 @@ struct ipv6_devconf {
 #ifdef CONFIG_IPV6_MROUTE
 	atomic_t	mc_forwarding;
 #endif
-	__s32		disable_ipv6;
 	__s32		drop_unicast_in_l2_multicast;
 	__s32		accept_dad;
 	__s32		force_tllao;
@@ -76,7 +82,6 @@ struct ipv6_devconf {
 #endif
 	__u32		enhanced_dad;
 	__u32		addr_gen_mode;
-	__s32		disable_policy;
 	__s32           ndisc_tclass;
 	__s32		rpl_seg_enabled;
 	__u32		ioam6_id;
-- 
cgit v1.2.3


From e7135f484994494a38071e3653a83d21d305f50c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 28 Feb 2024 13:54:28 +0000
Subject: ipv6: annotate data-races around cnf.mtu6

idev->cnf.mtu6 might be read locklessly, add appropriate READ_ONCE()
and WRITE_ONCE() annotations.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_route.h | 2 +-
 net/ipv6/addrconf.c     | 4 ++--
 net/ipv6/ndisc.c        | 4 ++--
 net/ipv6/route.c        | 6 +++---
 4 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 52a51c69aa9d..a30c6aa9e5cf 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -332,7 +332,7 @@ static inline unsigned int ip6_dst_mtu_maybe_forward(const struct dst_entry *dst
 	rcu_read_lock();
 	idev = __in6_dev_get(dst->dev);
 	if (idev)
-		mtu = idev->cnf.mtu6;
+		mtu = READ_ONCE(idev->cnf.mtu6);
 	rcu_read_unlock();
 
 out:
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 6975685f55ea..1b534edc7ca2 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -3671,7 +3671,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
 
 		if (idev) {
 			rt6_mtu_change(dev, dev->mtu);
-			idev->cnf.mtu6 = dev->mtu;
+			WRITE_ONCE(idev->cnf.mtu6, dev->mtu);
 			break;
 		}
 
@@ -3763,7 +3763,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
 			if (idev->cnf.mtu6 != dev->mtu &&
 			    dev->mtu >= IPV6_MIN_MTU) {
 				rt6_mtu_change(dev, dev->mtu);
-				idev->cnf.mtu6 = dev->mtu;
+				WRITE_ONCE(idev->cnf.mtu6, dev->mtu);
 			}
 			WRITE_ONCE(idev->tstamp, jiffies);
 			inet6_ifinfo_notify(RTM_NEWLINK, idev);
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 8523f0595b01..e96d79cd34d2 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1578,8 +1578,8 @@ skip_routeinfo:
 
 		if (mtu < IPV6_MIN_MTU || mtu > skb->dev->mtu) {
 			ND_PRINTK(2, warn, "RA: invalid mtu: %d\n", mtu);
-		} else if (in6_dev->cnf.mtu6 != mtu) {
-			in6_dev->cnf.mtu6 = mtu;
+		} else if (READ_ONCE(in6_dev->cnf.mtu6) != mtu) {
+			WRITE_ONCE(in6_dev->cnf.mtu6, mtu);
 			fib6_metric_set(rt, RTAX_MTU, mtu);
 			rt6_mtu_change(skb->dev, mtu);
 		}
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 707d65bc9c0e..66c685b0b619 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1596,7 +1596,7 @@ static unsigned int fib6_mtu(const struct fib6_result *res)
 
 		rcu_read_lock();
 		idev = __in6_dev_get(dev);
-		mtu = idev->cnf.mtu6;
+		mtu = READ_ONCE(idev->cnf.mtu6);
 		rcu_read_unlock();
 	}
 
@@ -3249,8 +3249,8 @@ u32 ip6_mtu_from_fib6(const struct fib6_result *res,
 
 		mtu = IPV6_MIN_MTU;
 		idev = __in6_dev_get(dev);
-		if (idev && idev->cnf.mtu6 > mtu)
-			mtu = idev->cnf.mtu6;
+		if (idev)
+			mtu = max_t(u32, mtu, READ_ONCE(idev->cnf.mtu6));
 	}
 
 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
-- 
cgit v1.2.3


From 32f754176e889cdfe989ef08ece19859427755df Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 28 Feb 2024 13:54:30 +0000
Subject: ipv6: annotate data-races around cnf.forwarding

idev->cnf.forwarding and net->ipv6.devconf_all->forwarding
might be read locklessly, add appropriate READ_ONCE()
and WRITE_ONCE() annotations.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/usb/cdc_mbim.c |  2 +-
 include/net/ipv6.h         |  8 +++++---
 net/core/filter.c          |  2 +-
 net/ipv6/addrconf.c        | 10 ++++++----
 net/ipv6/ip6_output.c      |  2 +-
 net/ipv6/ndisc.c           | 11 ++++++-----
 net/ipv6/route.c           |  4 ++--
 7 files changed, 22 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/drivers/net/usb/cdc_mbim.c b/drivers/net/usb/cdc_mbim.c
index cd4083e0b3b9..e13e4920ee9b 100644
--- a/drivers/net/usb/cdc_mbim.c
+++ b/drivers/net/usb/cdc_mbim.c
@@ -339,7 +339,7 @@ static void do_neigh_solicit(struct usbnet *dev, u8 *buf, u16 tci)
 	in6_dev = in6_dev_get(netdev);
 	if (!in6_dev)
 		goto out;
-	is_router = !!in6_dev->cnf.forwarding;
+	is_router = !!READ_ONCE(in6_dev->cnf.forwarding);
 	in6_dev_put(in6_dev);
 
 	/* ipv6_stub != NULL if in6_dev_get returned an inet6_dev */
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index cf25ea21d770..88a8e554f7a1 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -534,13 +534,15 @@ static inline int ipv6_hopopt_jumbo_remove(struct sk_buff *skb)
 	return 0;
 }
 
-static inline bool ipv6_accept_ra(struct inet6_dev *idev)
+static inline bool ipv6_accept_ra(const struct inet6_dev *idev)
 {
+	s32 accept_ra = READ_ONCE(idev->cnf.accept_ra);
+
 	/* If forwarding is enabled, RA are not accepted unless the special
 	 * hybrid mode (accept_ra=2) is enabled.
 	 */
-	return idev->cnf.forwarding ? idev->cnf.accept_ra == 2 :
-	    idev->cnf.accept_ra;
+	return READ_ONCE(idev->cnf.forwarding) ? accept_ra == 2 :
+		accept_ra;
 }
 
 #define IPV6_FRAG_HIGH_THRESH	(4 * 1024*1024)	/* 4194304 */
diff --git a/net/core/filter.c b/net/core/filter.c
index 358870408a51..58e8e1a70aa7 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5988,7 +5988,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 		return -ENODEV;
 
 	idev = __in6_dev_get_safely(dev);
-	if (unlikely(!idev || !idev->cnf.forwarding))
+	if (unlikely(!idev || !READ_ONCE(idev->cnf.forwarding)))
 		return BPF_FIB_LKUP_RET_FWD_DISABLED;
 
 	if (flags & BPF_FIB_LOOKUP_OUTPUT) {
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 1b534edc7ca2..ec99e393f6a9 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -551,7 +551,8 @@ static int inet6_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
 		goto out;
 
 	if ((all || type == NETCONFA_FORWARDING) &&
-	    nla_put_s32(skb, NETCONFA_FORWARDING, devconf->forwarding) < 0)
+	    nla_put_s32(skb, NETCONFA_FORWARDING,
+			READ_ONCE(devconf->forwarding)) < 0)
 		goto nla_put_failure;
 #ifdef CONFIG_IPV6_MROUTE
 	if ((all || type == NETCONFA_MC_FORWARDING) &&
@@ -869,7 +870,8 @@ static void addrconf_forward_change(struct net *net, __s32 newf)
 		idev = __in6_dev_get(dev);
 		if (idev) {
 			int changed = (!idev->cnf.forwarding) ^ (!newf);
-			idev->cnf.forwarding = newf;
+
+			WRITE_ONCE(idev->cnf.forwarding, newf);
 			if (changed)
 				dev_forward_change(idev);
 		}
@@ -886,7 +888,7 @@ static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int newf)
 
 	net = (struct net *)table->extra2;
 	old = *p;
-	*p = newf;
+	WRITE_ONCE(*p, newf);
 
 	if (p == &net->ipv6.devconf_dflt->forwarding) {
 		if ((!newf) ^ (!old))
@@ -901,7 +903,7 @@ static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int newf)
 	if (p == &net->ipv6.devconf_all->forwarding) {
 		int old_dflt = net->ipv6.devconf_dflt->forwarding;
 
-		net->ipv6.devconf_dflt->forwarding = newf;
+		WRITE_ONCE(net->ipv6.devconf_dflt->forwarding, newf);
 		if ((!newf) ^ (!old_dflt))
 			inet6_netconf_notify_devconf(net, RTM_NEWNETCONF,
 						     NETCONFA_FORWARDING,
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 0559bd000585..444be8c84cc5 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -501,7 +501,7 @@ int ip6_forward(struct sk_buff *skb)
 	u32 mtu;
 
 	idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
-	if (net->ipv6.devconf_all->forwarding == 0)
+	if (READ_ONCE(net->ipv6.devconf_all->forwarding) == 0)
 		goto error;
 
 	if (skb->pkt_type != PACKET_HOST)
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 9c9c31268432..1fb5e37bc78b 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -903,7 +903,7 @@ have_ifp:
 		}
 
 		if (ipv6_chk_acast_addr(net, dev, &msg->target) ||
-		    (idev->cnf.forwarding &&
+		    (READ_ONCE(idev->cnf.forwarding) &&
 		     (net->ipv6.devconf_all->proxy_ndp || idev->cnf.proxy_ndp) &&
 		     (is_router = pndisc_is_router(&msg->target, dev)) >= 0)) {
 			if (!(NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED) &&
@@ -929,7 +929,7 @@ have_ifp:
 	}
 
 	if (is_router < 0)
-		is_router = idev->cnf.forwarding;
+		is_router = READ_ONCE(idev->cnf.forwarding);
 
 	if (dad) {
 		ndisc_send_na(dev, &in6addr_linklocal_allnodes, &msg->target,
@@ -1080,7 +1080,7 @@ static enum skb_drop_reason ndisc_recv_na(struct sk_buff *skb)
 	 * Note that we don't do a (daddr == all-routers-mcast) check.
 	 */
 	new_state = msg->icmph.icmp6_solicited ? NUD_REACHABLE : NUD_STALE;
-	if (!neigh && lladdr && idev && idev->cnf.forwarding) {
+	if (!neigh && lladdr && idev && READ_ONCE(idev->cnf.forwarding)) {
 		if (accept_untracked_na(dev, saddr)) {
 			neigh = neigh_create(&nd_tbl, &msg->target, dev);
 			new_state = NUD_STALE;
@@ -1100,7 +1100,8 @@ static enum skb_drop_reason ndisc_recv_na(struct sk_buff *skb)
 		 * has already sent a NA to us.
 		 */
 		if (lladdr && !memcmp(lladdr, dev->dev_addr, dev->addr_len) &&
-		    net->ipv6.devconf_all->forwarding && net->ipv6.devconf_all->proxy_ndp &&
+		    READ_ONCE(net->ipv6.devconf_all->forwarding) &&
+		    net->ipv6.devconf_all->proxy_ndp &&
 		    pneigh_lookup(&nd_tbl, net, &msg->target, dev, 0)) {
 			/* XXX: idev->cnf.proxy_ndp */
 			goto out;
@@ -1148,7 +1149,7 @@ static enum skb_drop_reason ndisc_recv_rs(struct sk_buff *skb)
 	}
 
 	/* Don't accept RS if we're not in router mode */
-	if (!idev->cnf.forwarding)
+	if (!READ_ONCE(idev->cnf.forwarding))
 		goto out;
 
 	/*
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 66c685b0b619..6a2b53de4818 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2220,7 +2220,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 
 	strict |= flags & RT6_LOOKUP_F_IFACE;
 	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
-	if (net->ipv6.devconf_all->forwarding == 0)
+	if (READ_ONCE(net->ipv6.devconf_all->forwarding) == 0)
 		strict |= RT6_LOOKUP_F_REACHABLE;
 
 	rcu_read_lock();
@@ -4149,7 +4149,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
 	in6_dev = __in6_dev_get(skb->dev);
 	if (!in6_dev)
 		return;
-	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
+	if (READ_ONCE(in6_dev->cnf.forwarding) || !in6_dev->cnf.accept_redirects)
 		return;
 
 	/* RFC2461 8.1:
-- 
cgit v1.2.3


From fca34cc075996767fbbdb6252be9ddd21c34c920 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 28 Feb 2024 13:54:32 +0000
Subject: ipv6: annotate data-races around
 idev->cnf.ignore_routes_with_linkdown

idev->cnf.ignore_routes_with_linkdown can be used without any locks,
add appropriate annotations.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/addrconf.h | 2 +-
 net/ipv6/addrconf.c    | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 30d6f1e84e46..9d06eb945509 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -417,7 +417,7 @@ static inline bool ip6_ignore_linkdown(const struct net_device *dev)
 	if (unlikely(!idev))
 		return true;
 
-	return !!idev->cnf.ignore_routes_with_linkdown;
+	return !!READ_ONCE(idev->cnf.ignore_routes_with_linkdown);
 }
 
 void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp);
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index ec99e393f6a9..82b8d90afb6a 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -566,7 +566,7 @@ static int inet6_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
 
 	if ((all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) &&
 	    nla_put_s32(skb, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
-			devconf->ignore_routes_with_linkdown) < 0)
+			READ_ONCE(devconf->ignore_routes_with_linkdown)) < 0)
 		goto nla_put_failure;
 
 out:
@@ -935,7 +935,7 @@ static void addrconf_linkdown_change(struct net *net, __s32 newf)
 		if (idev) {
 			int changed = (!idev->cnf.ignore_routes_with_linkdown) ^ (!newf);
 
-			idev->cnf.ignore_routes_with_linkdown = newf;
+			WRITE_ONCE(idev->cnf.ignore_routes_with_linkdown, newf);
 			if (changed)
 				inet6_netconf_notify_devconf(dev_net(dev),
 							     RTM_NEWNETCONF,
@@ -956,7 +956,7 @@ static int addrconf_fixup_linkdown(struct ctl_table *table, int *p, int newf)
 
 	net = (struct net *)table->extra2;
 	old = *p;
-	*p = newf;
+	WRITE_ONCE(*p, newf);
 
 	if (p == &net->ipv6.devconf_dflt->ignore_routes_with_linkdown) {
 		if ((!newf) ^ (!old))
@@ -970,7 +970,7 @@ static int addrconf_fixup_linkdown(struct ctl_table *table, int *p, int newf)
 	}
 
 	if (p == &net->ipv6.devconf_all->ignore_routes_with_linkdown) {
-		net->ipv6.devconf_dflt->ignore_routes_with_linkdown = newf;
+		WRITE_ONCE(net->ipv6.devconf_dflt->ignore_routes_with_linkdown, newf);
 		addrconf_linkdown_change(net, newf);
 		if ((!newf) ^ (!old))
 			inet6_netconf_notify_devconf(net,
-- 
cgit v1.2.3


From f29f9199c2d2b3c258f577f438885288016847ed Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Wed, 28 Feb 2024 15:05:29 +0100
Subject: Simplify net_dbg_ratelimited() dummy

There is no need to wrap calls to the no_printk() helper inside an
always-false check, as no_printk() already does that internally.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/net.h | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/net.h b/include/linux/net.h
index c9b4a63791a4..15df6d5f27a7 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -299,10 +299,7 @@ do {									\
 	net_ratelimited_function(pr_debug, fmt, ##__VA_ARGS__)
 #else
 #define net_dbg_ratelimited(fmt, ...)				\
-	do {							\
-		if (0)						\
-			no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \
-	} while (0)
+	no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
 #endif
 
 #define net_get_random_once(buf, nbytes)			\
-- 
cgit v1.2.3


From cc15bd10e716fcb472d611f24d76c795acb0f8c7 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 29 Feb 2024 09:39:08 +0000
Subject: net: adopt skb_network_header_len() more broadly

(skb_transport_header(skb) - skb_network_header(skb))
can be replaced by skb_network_header_len(skb)

Add a DEBUG_NET_WARN_ON_ONCE() in skb_network_header_len()
to catch cases were the transport_header was not set.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 3 +--
 drivers/net/ethernet/intel/i40e/i40e_main.c     | 2 +-
 drivers/net/ethernet/intel/iavf/iavf_main.c     | 2 +-
 drivers/net/wireless/intel/iwlwifi/mvm/tx.c     | 3 +--
 drivers/net/wireless/intel/iwlwifi/pcie/tx.c    | 2 +-
 drivers/net/wireless/intel/iwlwifi/queue/tx.c   | 2 +-
 include/linux/skbuff.h                          | 1 +
 7 files changed, 7 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index adcee8d9d6d9..c9b6acd8c892 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -3621,8 +3621,7 @@ static u8 bnx2x_set_pbd_csum(struct bnx2x *bp, struct sk_buff *skb,
 			    ((skb->protocol == cpu_to_be16(ETH_P_8021Q)) <<
 			     ETH_TX_PARSE_BD_E1X_LLC_SNAP_EN_SHIFT));
 
-	pbd->ip_hlen_w = (skb_transport_header(skb) -
-			skb_network_header(skb)) >> 1;
+	pbd->ip_hlen_w = skb_network_header_len(skb) >> 1;
 
 	hlen += pbd->ip_hlen_w;
 
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index a67b13869016..3fada49b8ae2 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -13213,7 +13213,7 @@ static netdev_features_t i40e_features_check(struct sk_buff *skb,
 		goto out_err;
 
 	/* IPLEN and EIPLEN can support at most 127 dwords */
-	len = skb_transport_header(skb) - skb_network_header(skb);
+	len = skb_network_header_len(skb);
 	if (len & ~(127 * 4))
 		goto out_err;
 
diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c
index 245c458e38ae..aefec6bd3b67 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_main.c
+++ b/drivers/net/ethernet/intel/iavf/iavf_main.c
@@ -4428,7 +4428,7 @@ static netdev_features_t iavf_features_check(struct sk_buff *skb,
 		goto out_err;
 
 	/* IPLEN and EIPLEN can support at most 127 dwords */
-	len = skb_transport_header(skb) - skb_network_header(skb);
+	len = skb_network_header_len(skb);
 	if (len & ~(127 * 4))
 		goto out_err;
 
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c
index e502f4ee9e1f..782ddc8c296b 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c
@@ -1015,8 +1015,7 @@ static int iwl_mvm_tx_tso(struct iwl_mvm *mvm, struct sk_buff *skb,
 	netdev_features_t netdev_flags = NETIF_F_CSUM_MASK | NETIF_F_SG;
 	u8 tid;
 
-	snap_ip_tcp = 8 + skb_transport_header(skb) - skb_network_header(skb) +
-		tcp_hdrlen(skb);
+	snap_ip_tcp = 8 + skb_network_header_len(skb) + tcp_hdrlen(skb);
 
 	if (!mvmsta->max_amsdu_len ||
 	    !ieee80211_is_data_qos(hdr->frame_control) ||
diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/tx.c b/drivers/net/wireless/intel/iwlwifi/pcie/tx.c
index 6c2b37e56c78..fa8eba47dc4c 100644
--- a/drivers/net/wireless/intel/iwlwifi/pcie/tx.c
+++ b/drivers/net/wireless/intel/iwlwifi/pcie/tx.c
@@ -1331,7 +1331,7 @@ static int iwl_fill_data_tbs_amsdu(struct iwl_trans *trans, struct sk_buff *skb,
 			     trans->txqs.tfd.size,
 			     &dev_cmd->hdr, IWL_FIRST_TB_SIZE + tb1_len, 0);
 
-	ip_hdrlen = skb_transport_header(skb) - skb_network_header(skb);
+	ip_hdrlen = skb_network_header_len(skb);
 	snap_ip_tcp_hdrlen = 8 + ip_hdrlen + tcp_hdrlen(skb);
 	total_len = skb->len - snap_ip_tcp_hdrlen - hdr_len - iv_len;
 	amsdu_pad = 0;
diff --git a/drivers/net/wireless/intel/iwlwifi/queue/tx.c b/drivers/net/wireless/intel/iwlwifi/queue/tx.c
index d3bde2d010b7..33973a60d0bf 100644
--- a/drivers/net/wireless/intel/iwlwifi/queue/tx.c
+++ b/drivers/net/wireless/intel/iwlwifi/queue/tx.c
@@ -353,7 +353,7 @@ static int iwl_txq_gen2_build_amsdu(struct iwl_trans *trans,
 	trace_iwlwifi_dev_tx(trans->dev, skb, tfd, sizeof(*tfd),
 			     &dev_cmd->hdr, start_len, 0);
 
-	ip_hdrlen = skb_transport_header(skb) - skb_network_header(skb);
+	ip_hdrlen = skb_network_header_len(skb);
 	snap_ip_tcp_hdrlen = 8 + ip_hdrlen + tcp_hdrlen(skb);
 	total_len = skb->len - snap_ip_tcp_hdrlen - hdr_len;
 	amsdu_pad = 0;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 1470b74fb6d2..d577e0bee18d 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3036,6 +3036,7 @@ static inline int skb_transport_offset(const struct sk_buff *skb)
 
 static inline u32 skb_network_header_len(const struct sk_buff *skb)
 {
+	DEBUG_NET_WARN_ON_ONCE(!skb_transport_header_was_set(skb));
 	return skb->transport_header - skb->network_header;
 }
 
-- 
cgit v1.2.3


From 04577bfa99ac8cfb7a43dbbba09584e0b1086cee Mon Sep 17 00:00:00 2001
From: Shaul Triebitz <shaul.triebitz@intel.com>
Date: Wed, 28 Feb 2024 09:44:56 +0100
Subject: wifi: mac80211: add link id to ieee80211_gtk_rekey_add()

In MLO, we need the link id in the GTK key to be given by
the driver after rekeying in wowlan, so add that.

Signed-off-by: Shaul Triebitz <shaul.triebitz@intel.com>
Reviewed-by: Miriam Rachel Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://msgid.link/20240228094500.ce1bfc83a680.I43a6f8ab2804ee07116a37d5b9ec601b843464b1@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/mvm/d3.c | 12 +++++++-----
 include/net/mac80211.h                      |  4 +++-
 net/mac80211/key.c                          | 16 ++++++++++++----
 3 files changed, 22 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/d3.c b/drivers/net/wireless/intel/iwlwifi/mvm/d3.c
index 6d5ed79b9fff..ca2c6d0b605e 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/d3.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/d3.c
@@ -1976,6 +1976,7 @@ static bool iwl_mvm_gtk_rekey(struct iwl_wowlan_status_data *status,
 	} conf = {
 		.conf.cipher = gtk_cipher,
 	};
+	int link_id = vif->active_links ? __ffs(vif->active_links) : -1;
 
 	BUILD_BUG_ON(WLAN_KEY_LEN_CCMP != WLAN_KEY_LEN_GCMP);
 	BUILD_BUG_ON(sizeof(conf.key) < WLAN_KEY_LEN_CCMP);
@@ -2009,7 +2010,7 @@ static bool iwl_mvm_gtk_rekey(struct iwl_wowlan_status_data *status,
 		memcpy(conf.conf.key, status->gtk[i].key,
 		       sizeof(status->gtk[i].key));
 
-		key = ieee80211_gtk_rekey_add(vif, &conf.conf);
+		key = ieee80211_gtk_rekey_add(vif, &conf.conf, link_id);
 		if (IS_ERR(key))
 			return false;
 
@@ -2040,6 +2041,7 @@ iwl_mvm_d3_igtk_bigtk_rekey_add(struct iwl_wowlan_status_data *status,
 		.conf.keyidx = key_data->id,
 	};
 	struct ieee80211_key_seq seq;
+	int link_id = vif->active_links ? __ffs(vif->active_links) : -1;
 
 	if (!key_data->len)
 		return true;
@@ -2065,17 +2067,17 @@ iwl_mvm_d3_igtk_bigtk_rekey_add(struct iwl_wowlan_status_data *status,
 	BUILD_BUG_ON(sizeof(conf.key) < sizeof(key_data->key));
 	memcpy(conf.conf.key, key_data->key, conf.conf.keylen);
 
-	key_config = ieee80211_gtk_rekey_add(vif, &conf.conf);
+	key_config = ieee80211_gtk_rekey_add(vif, &conf.conf, link_id);
 	if (IS_ERR(key_config))
 		return false;
 	ieee80211_set_key_rx_seq(key_config, 0, &seq);
 
 	if (key_config->keyidx == 4 || key_config->keyidx == 5) {
 		struct iwl_mvm_vif *mvmvif = iwl_mvm_vif_from_mac80211(vif);
-		int link_id = vif->active_links ? __ffs(vif->active_links) : 0;
-		struct iwl_mvm_vif_link_info *mvm_link =
-			mvmvif->link[link_id];
+		struct iwl_mvm_vif_link_info *mvm_link;
 
+		link_id = link_id < 0 ? 0 : link_id;
+		mvm_link = mvmvif->link[link_id];
 		mvm_link->igtk = key_config;
 	}
 
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 56c6ecb2c10a..34d66d0a24b1 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -5912,6 +5912,7 @@ void ieee80211_remove_key(struct ieee80211_key_conf *keyconf);
  * ieee80211_gtk_rekey_add - add a GTK key from rekeying during WoWLAN
  * @vif: the virtual interface to add the key on
  * @keyconf: new key data
+ * @link_id: the link id of the key or -1 for non-MLO
  *
  * When GTK rekeying was done while the system was suspended, (a) new
  * key(s) will be available. These will be needed by mac80211 for proper
@@ -5939,7 +5940,8 @@ void ieee80211_remove_key(struct ieee80211_key_conf *keyconf);
  */
 struct ieee80211_key_conf *
 ieee80211_gtk_rekey_add(struct ieee80211_vif *vif,
-			struct ieee80211_key_conf *keyconf);
+			struct ieee80211_key_conf *keyconf,
+			int link_id);
 
 /**
  * ieee80211_gtk_rekey_notify - notify userspace supplicant of rekeying
diff --git a/net/mac80211/key.c b/net/mac80211/key.c
index a2cce62c97b7..eecdd2265eaa 100644
--- a/net/mac80211/key.c
+++ b/net/mac80211/key.c
@@ -6,7 +6,7 @@
  * Copyright 2007-2008	Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
  * Copyright 2015-2017	Intel Deutschland GmbH
- * Copyright 2018-2020, 2022-2023  Intel Corporation
+ * Copyright 2018-2020, 2022-2024  Intel Corporation
  */
 
 #include <crypto/utils.h>
@@ -1372,12 +1372,19 @@ EXPORT_SYMBOL_GPL(ieee80211_remove_key);
 
 struct ieee80211_key_conf *
 ieee80211_gtk_rekey_add(struct ieee80211_vif *vif,
-			struct ieee80211_key_conf *keyconf)
+			struct ieee80211_key_conf *keyconf,
+			int link_id)
 {
 	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
 	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_key *key;
 	int err;
+	struct ieee80211_link_data *link_data =
+		link_id < 0 ? &sdata->deflink :
+		sdata_dereference(sdata->link[link_id], sdata);
+
+	if (WARN_ON(!link_data))
+		return ERR_PTR(-EINVAL);
 
 	if (WARN_ON(!local->wowlan))
 		return ERR_PTR(-EINVAL);
@@ -1394,8 +1401,9 @@ ieee80211_gtk_rekey_add(struct ieee80211_vif *vif,
 	if (sdata->u.mgd.mfp != IEEE80211_MFP_DISABLED)
 		key->conf.flags |= IEEE80211_KEY_FLAG_RX_MGMT;
 
-	/* FIXME: this function needs to get a link ID */
-	err = ieee80211_key_link(key, &sdata->deflink, NULL);
+	key->conf.link_id = link_id;
+
+	err = ieee80211_key_link(key, link_data, NULL);
 	if (err)
 		return ERR_PTR(err);
 
-- 
cgit v1.2.3


From 68f6c6afbcebdc3acdc6084abfe453f4cba6b9dc Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 28 Feb 2024 09:48:11 +0100
Subject: wifi: mac80211: add ieee80211_vif_link_active() helper

We sometimes need to check if a link is active, and this
is complicated by the fact that active_links has no bits
set when the vif isn't (acting as) an MLD. Add a small
new helper ieee80211_vif_link_active() to make that a bit
easier, and use it in a few places.

Reviewed-by: Ilan Peer <ilan.peer@intel.com>
Reviewed-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Reviewed-by: Miriam Rachel Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://msgid.link/20240228094901.688760aff5f7.I06892a503f5ecb9563fbd678d35d08daf7a044b0@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h    | 15 +++++++++++++++
 net/mac80211/cfg.c        |  3 +--
 net/mac80211/chan.c       |  3 +--
 net/mac80211/driver-ops.c | 14 +++++---------
 net/mac80211/util.c       |  3 +--
 5 files changed, 23 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 34d66d0a24b1..6c6d8210d637 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -2011,6 +2011,21 @@ static inline bool ieee80211_vif_is_mld(const struct ieee80211_vif *vif)
 	return vif->valid_links != 0;
 }
 
+/**
+ * ieee80211_vif_link_active - check if a given link is active
+ * @vif: the vif
+ * @link_id: the link ID to check
+ * Return: %true if the vif is an MLD and the link is active, or if
+ *	the vif is not an MLD and the link ID is 0; %false otherwise.
+ */
+static inline bool ieee80211_vif_link_active(const struct ieee80211_vif *vif,
+					     unsigned int link_id)
+{
+	if (!ieee80211_vif_is_mld(vif))
+		return link_id == 0;
+	return vif->active_links & BIT(link_id);
+}
+
 #define for_each_vif_active_link(vif, link, link_id)				\
 	for (link_id = 0; link_id < ARRAY_SIZE((vif)->link_conf); link_id++)	\
 		if ((!(vif)->active_links ||					\
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index b9c1ec2f9cfc..358593eeb500 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -3189,8 +3189,7 @@ int __ieee80211_request_smps_mgd(struct ieee80211_sub_if_data *sdata,
 	if (WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_STATION))
 		return -EINVAL;
 
-	if (ieee80211_vif_is_mld(&sdata->vif) &&
-	    !(sdata->vif.active_links & BIT(link->link_id)))
+	if (!ieee80211_vif_link_active(&sdata->vif, link->link_id))
 		return 0;
 
 	old_req = link->u.mgd.req_smps;
diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c
index 38acdc458c7c..80e4b9784131 100644
--- a/net/mac80211/chan.c
+++ b/net/mac80211/chan.c
@@ -1701,8 +1701,7 @@ int ieee80211_link_use_channel(struct ieee80211_link_data *link,
 
 	lockdep_assert_wiphy(local->hw.wiphy);
 
-	if (sdata->vif.active_links &&
-	    !(sdata->vif.active_links & BIT(link->link_id))) {
+	if (!ieee80211_vif_link_active(&sdata->vif, link->link_id)) {
 		ieee80211_link_update_chanreq(link, chanreq);
 		return 0;
 	}
diff --git a/net/mac80211/driver-ops.c b/net/mac80211/driver-ops.c
index 3b7f70073fc3..dce37ba8ebe3 100644
--- a/net/mac80211/driver-ops.c
+++ b/net/mac80211/driver-ops.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright 2015 Intel Deutschland GmbH
- * Copyright (C) 2022-2023 Intel Corporation
+ * Copyright (C) 2022-2024 Intel Corporation
  */
 #include <net/mac80211.h>
 #include "ieee80211_i.h"
@@ -214,8 +214,7 @@ int drv_conf_tx(struct ieee80211_local *local,
 	if (!check_sdata_in_driver(sdata))
 		return -EIO;
 
-	if (sdata->vif.active_links &&
-	    !(sdata->vif.active_links & BIT(link->link_id)))
+	if (!ieee80211_vif_link_active(&sdata->vif, link->link_id))
 		return 0;
 
 	if (params->cw_min == 0 || params->cw_min > params->cw_max) {
@@ -315,8 +314,7 @@ int drv_assign_vif_chanctx(struct ieee80211_local *local,
 	if (!check_sdata_in_driver(sdata))
 		return -EIO;
 
-	if (sdata->vif.active_links &&
-	    !(sdata->vif.active_links & BIT(link_conf->link_id)))
+	if (!ieee80211_vif_link_active(&sdata->vif, link_conf->link_id))
 		return 0;
 
 	trace_drv_assign_vif_chanctx(local, sdata, link_conf, ctx);
@@ -343,8 +341,7 @@ void drv_unassign_vif_chanctx(struct ieee80211_local *local,
 	if (!check_sdata_in_driver(sdata))
 		return;
 
-	if (sdata->vif.active_links &&
-	    !(sdata->vif.active_links & BIT(link_conf->link_id)))
+	if (!ieee80211_vif_link_active(&sdata->vif, link_conf->link_id))
 		return;
 
 	trace_drv_unassign_vif_chanctx(local, sdata, link_conf, ctx);
@@ -461,8 +458,7 @@ void drv_link_info_changed(struct ieee80211_local *local,
 	if (!check_sdata_in_driver(sdata))
 		return;
 
-	if (sdata->vif.active_links &&
-	    !(sdata->vif.active_links & BIT(link_id)))
+	if (!ieee80211_vif_link_active(&sdata->vif, link_id))
 		return;
 
 	trace_drv_link_info_changed(local, sdata, info, changed);
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index f810b83a390a..a237cbcf7b49 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -1935,8 +1935,7 @@ int ieee80211_reconfig(struct ieee80211_local *local)
 		for (link_id = 0;
 		     link_id < ARRAY_SIZE(sdata->vif.link_conf);
 		     link_id++) {
-			if (ieee80211_vif_is_mld(&sdata->vif) &&
-			    !(sdata->vif.active_links & BIT(link_id)))
+			if (!ieee80211_vif_link_active(&sdata->vif, link_id))
 				continue;
 
 			link = sdata_dereference(sdata->link[link_id], sdata);
-- 
cgit v1.2.3


From 22667035e5ddb7b68c7d473693b321fb9e20a397 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 28 Feb 2024 09:55:41 +0100
Subject: wifi: cfg80211: expose cfg80211_iter_rnr() to drivers

In mac80211 we'll need to look at reduced neighbor report
entries for channel switch purposes, so export the iteration
function to make that simpler.

Reviewed-by: Miriam Rachel Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://msgid.link/20240228095718.0954809964ef.I53e95c017aa71f14e8d1057afbbc75982ddb43df@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 32 ++++++++++++++++++++++++++++++++
 net/wireless/scan.c    | 20 +++++++-------------
 2 files changed, 39 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index f9eada2a26ec..53653d234d39 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -6870,6 +6870,38 @@ cfg80211_find_vendor_ie(unsigned int oui, int oui_type,
 	return (const void *)cfg80211_find_vendor_elem(oui, oui_type, ies, len);
 }
 
+/**
+ * enum cfg80211_rnr_iter_ret - reduced neighbor report iteration state
+ * @RNR_ITER_CONTINUE: continue iterating with the next entry
+ * @RNR_ITER_BREAK: break iteration and return success
+ * @RNR_ITER_ERROR: break iteration and return error
+ */
+enum cfg80211_rnr_iter_ret {
+	RNR_ITER_CONTINUE,
+	RNR_ITER_BREAK,
+	RNR_ITER_ERROR,
+};
+
+/**
+ * cfg80211_iter_rnr - iterate reduced neighbor report entries
+ * @elems: the frame elements to iterate RNR elements and then
+ *	their entries in
+ * @elems_len: length of the elements
+ * @iter: iteration function, see also &enum cfg80211_rnr_iter_ret
+ *	for the return value
+ * @iter_data: additional data passed to the iteration function
+ * Return: %true on success (after successfully iterating all entries
+ *	or if the iteration function returned %RNR_ITER_BREAK),
+ *	%false on error (iteration function returned %RNR_ITER_ERROR
+ *	or elements were malformed.)
+ */
+bool cfg80211_iter_rnr(const u8 *elems, size_t elems_len,
+		       enum cfg80211_rnr_iter_ret
+		       (*iter)(void *data, u8 type,
+			       const struct ieee80211_neighbor_ap_info *info,
+			       const u8 *tbtt_info, u8 tbtt_info_len),
+		       void *iter_data);
+
 /**
  * cfg80211_defragment_element - Defrag the given element data into a buffer
  *
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index 7cf36b8d3ae7..9377a43aa5f7 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -611,19 +611,12 @@ static int cfg80211_parse_ap_info(struct cfg80211_colocated_ap *entry,
 	return 0;
 }
 
-enum cfg80211_rnr_iter_ret {
-	RNR_ITER_CONTINUE,
-	RNR_ITER_BREAK,
-	RNR_ITER_ERROR,
-};
-
-static bool
-cfg80211_iter_rnr(const u8 *elems, size_t elems_len,
-		  enum cfg80211_rnr_iter_ret
-		  (*iter)(void *data, u8 type,
-			  const struct ieee80211_neighbor_ap_info *info,
-			  const u8 *tbtt_info, u8 tbtt_info_len),
-		  void *iter_data)
+bool cfg80211_iter_rnr(const u8 *elems, size_t elems_len,
+		       enum cfg80211_rnr_iter_ret
+		       (*iter)(void *data, u8 type,
+			       const struct ieee80211_neighbor_ap_info *info,
+			       const u8 *tbtt_info, u8 tbtt_info_len),
+		       void *iter_data)
 {
 	const struct element *rnr;
 	const u8 *pos, *end;
@@ -675,6 +668,7 @@ cfg80211_iter_rnr(const u8 *elems, size_t elems_len,
 
 	return true;
 }
+EXPORT_SYMBOL_GPL(cfg80211_iter_rnr);
 
 struct colocated_ap_data {
 	const struct element *ssid_elem;
-- 
cgit v1.2.3


From 8ade3356b25ab2522892a21832a709e7ad5f8168 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 28 Feb 2024 09:55:42 +0100
Subject: wifi: cfg80211: allow cfg80211_defragment_element() without output

If we just want to determine the length of the fragmented
data, we basically need the same logic, and really we want
it to be _literally_ the same logic, so it cannot be out
of sync in any way.

Allow calling cfg80211_defragment_element() without an output
buffer, where it then just returns the required output size.

Also add this to the tests, just to exercise it, using the
pre-calculated length to really do the defragmentation, which
checks that this is sufficient.

Reviewed-by: Miriam Rachel Korenblit <miriam.rachel.korenblit@intel.com>
Reviewed-by: Benjamin Berg <benjamin.berg@intel.com>
Link: https://msgid.link/20240228095718.6d6565b9e3f2.Ib441903f4b8644ba04b1c766f90580ee6f54fc66@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h             |  4 ++--
 net/wireless/scan.c                | 27 ++++++++++++++++++---------
 net/wireless/tests/fragmentation.c | 30 +++++++++++++++++++++++++-----
 3 files changed, 45 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 53653d234d39..2e2be4fd2bb6 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -6908,8 +6908,8 @@ bool cfg80211_iter_rnr(const u8 *elems, size_t elems_len,
  * @elem: the element to defragment
  * @ies: elements where @elem is contained
  * @ieslen: length of @ies
- * @data: buffer to store element data
- * @data_len: length of @data
+ * @data: buffer to store element data, or %NULL to just determine size
+ * @data_len: length of @data, or 0
  * @frag_id: the element ID of fragments
  *
  * Return: length of @data, or -EINVAL on error
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index 9377a43aa5f7..5a5dd3ce497f 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -2504,16 +2504,22 @@ ssize_t cfg80211_defragment_element(const struct element *elem, const u8 *ies,
 
 	if (elem->id == WLAN_EID_EXTENSION) {
 		copied = elem->datalen - 1;
-		if (copied > data_len)
-			return -ENOSPC;
 
-		memmove(data, elem->data + 1, copied);
+		if (data) {
+			if (copied > data_len)
+				return -ENOSPC;
+
+			memmove(data, elem->data + 1, copied);
+		}
 	} else {
 		copied = elem->datalen;
-		if (copied > data_len)
-			return -ENOSPC;
 
-		memmove(data, elem->data, copied);
+		if (data) {
+			if (copied > data_len)
+				return -ENOSPC;
+
+			memmove(data, elem->data, copied);
+		}
 	}
 
 	/* Fragmented elements must have 255 bytes */
@@ -2532,10 +2538,13 @@ ssize_t cfg80211_defragment_element(const struct element *elem, const u8 *ies,
 
 		elem_datalen = elem->datalen;
 
-		if (copied + elem_datalen > data_len)
-			return -ENOSPC;
+		if (data) {
+			if (copied + elem_datalen > data_len)
+				return -ENOSPC;
+
+			memmove(data + copied, elem->data, elem_datalen);
+		}
 
-		memmove(data + copied, elem->data, elem_datalen);
 		copied += elem_datalen;
 
 		/* Only the last fragment may be short */
diff --git a/net/wireless/tests/fragmentation.c b/net/wireless/tests/fragmentation.c
index 49a339ca8880..411fae18cd88 100644
--- a/net/wireless/tests/fragmentation.c
+++ b/net/wireless/tests/fragmentation.c
@@ -2,7 +2,7 @@
 /*
  * KUnit tests for element fragmentation
  *
- * Copyright (C) 2023 Intel Corporation
+ * Copyright (C) 2023-2024 Intel Corporation
  */
 #include <linux/ieee80211.h>
 #include <net/cfg80211.h>
@@ -27,7 +27,12 @@ static void defragment_0(struct kunit *test)
 
 	ret = cfg80211_defragment_element((void *)input,
 					  input, sizeof(input),
-					  data, sizeof(input),
+					  NULL, 0,
+					  WLAN_EID_FRAGMENT);
+	KUNIT_EXPECT_EQ(test, ret, 253);
+	ret = cfg80211_defragment_element((void *)input,
+					  input, sizeof(input),
+					  data, ret,
 					  WLAN_EID_FRAGMENT);
 	KUNIT_EXPECT_EQ(test, ret, 253);
 	KUNIT_EXPECT_MEMEQ(test, data, input + 3, 253);
@@ -63,7 +68,12 @@ static void defragment_1(struct kunit *test)
 
 	ret = cfg80211_defragment_element((void *)input,
 					  input, sizeof(input),
-					  data, sizeof(input),
+					  NULL, 0,
+					  WLAN_EID_FRAGMENT);
+	KUNIT_EXPECT_EQ(test, ret, 254 + 7);
+	ret = cfg80211_defragment_element((void *)input,
+					  input, sizeof(input),
+					  data, ret,
 					  WLAN_EID_FRAGMENT);
 	/* this means the last fragment was not used */
 	KUNIT_EXPECT_EQ(test, ret, 254 + 7);
@@ -106,10 +116,15 @@ static void defragment_2(struct kunit *test)
 
 	ret = cfg80211_defragment_element((void *)input,
 					  input, sizeof(input),
-					  data, sizeof(input),
+					  NULL, 0,
 					  WLAN_EID_FRAGMENT);
 	/* this means the last fragment was not used */
 	KUNIT_EXPECT_EQ(test, ret, 254 + 255 + 1);
+	ret = cfg80211_defragment_element((void *)input,
+					  input, sizeof(input),
+					  data, ret,
+					  WLAN_EID_FRAGMENT);
+	KUNIT_EXPECT_EQ(test, ret, 254 + 255 + 1);
 	KUNIT_EXPECT_MEMEQ(test, data, input + 3, 254);
 	KUNIT_EXPECT_MEMEQ(test, data + 254, input + 257 + 2, 255);
 	KUNIT_EXPECT_MEMEQ(test, data + 254 + 255, input + 2 * 257 + 2, 1);
@@ -134,7 +149,12 @@ static void defragment_at_end(struct kunit *test)
 
 	ret = cfg80211_defragment_element((void *)input,
 					  input, sizeof(input),
-					  data, sizeof(input),
+					  NULL, 0,
+					  WLAN_EID_FRAGMENT);
+	KUNIT_EXPECT_EQ(test, ret, 254 + 7);
+	ret = cfg80211_defragment_element((void *)input,
+					  input, sizeof(input),
+					  data, ret,
 					  WLAN_EID_FRAGMENT);
 	KUNIT_EXPECT_EQ(test, ret, 254 + 7);
 	KUNIT_EXPECT_MEMEQ(test, data, input + 3, 254);
-- 
cgit v1.2.3


From e6ee3a3713fe38e4ae94318e9cb18b39ec30da4a Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 28 Feb 2024 09:55:43 +0100
Subject: wifi: mac80211: pass link_id to channel switch ops

For CSA to work correctly in multi-link scenarios, pass
the link_id to the relevant callbacks.

While at it, unify/deduplicate the tracing for them.

Reviewed-by: Miriam Rachel Korenblit <miriam.rachel.korenblit@intel.com>
Reviewed-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Link: https://msgid.link/20240228095718.b7726635c054.I0be5d00af4acb48cfbd23a9dbf067f9aeb66469d@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h |  6 ++--
 net/mac80211/cfg.c     |  4 ++-
 net/mac80211/mlme.c    |  4 ++-
 net/mac80211/trace.h   | 82 +++++++++++---------------------------------------
 4 files changed, 28 insertions(+), 68 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 6c6d8210d637..8ea9fa81e68c 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1763,8 +1763,9 @@ struct ieee80211_conf {
  * @chandef: the new channel to switch to
  * @count: the number of TBTT's until the channel switch event
  * @delay: maximum delay between the time the AP transmitted the last beacon in
-  *	current channel and the expected time of the first beacon in the new
-  *	channel, expressed in TU.
+ *	current channel and the expected time of the first beacon in the new
+ *	channel, expressed in TU.
+ * @link_id: the link ID of the link doing the channel switch, 0 for non-MLO
  */
 struct ieee80211_channel_switch {
 	u64 timestamp;
@@ -1772,6 +1773,7 @@ struct ieee80211_channel_switch {
 	bool block_tx;
 	struct cfg80211_chan_def chandef;
 	u8 count;
+	u8 link_id;
 	u32 delay;
 };
 
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 358593eeb500..4bc0cc2dff83 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -3931,7 +3931,9 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev,
 	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	struct ieee80211_chan_req chanreq = { .oper = params->chandef };
 	struct ieee80211_local *local = sdata->local;
-	struct ieee80211_channel_switch ch_switch;
+	struct ieee80211_channel_switch ch_switch = {
+		.link_id = params->link_id,
+	};
 	struct ieee80211_chanctx_conf *conf;
 	struct ieee80211_chanctx *chanctx;
 	struct ieee80211_bss_conf *link_conf;
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 90e505778db6..d1d4480c0352 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -2021,7 +2021,9 @@ ieee80211_sta_process_chanswitch(struct ieee80211_link_data *link,
 	struct ieee80211_chanctx *chanctx;
 	enum nl80211_band current_band;
 	struct ieee80211_csa_ie csa_ie;
-	struct ieee80211_channel_switch ch_switch;
+	struct ieee80211_channel_switch ch_switch = {
+		.link_id = link->link_id,
+	};
 	struct ieee80211_bss *bss;
 	unsigned long timeout;
 	int res;
diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h
index 478b32d2520a..8e758b5074bd 100644
--- a/net/mac80211/trace.h
+++ b/net/mac80211/trace.h
@@ -1209,7 +1209,7 @@ DEFINE_EVENT(sta_event, drv_flush_sta,
 	TP_ARGS(local, sdata, sta)
 );
 
-TRACE_EVENT(drv_channel_switch,
+DECLARE_EVENT_CLASS(chanswitch_evt,
 	TP_PROTO(struct ieee80211_local *local,
 		 struct ieee80211_sub_if_data *sdata,
 		 struct ieee80211_channel_switch *ch_switch),
@@ -1224,6 +1224,7 @@ TRACE_EVENT(drv_channel_switch,
 		__field(u32, device_timestamp)
 		__field(bool, block_tx)
 		__field(u8, count)
+		__field(u8, link_id)
 	),
 
 	TP_fast_assign(
@@ -1234,14 +1235,24 @@ TRACE_EVENT(drv_channel_switch,
 		__entry->device_timestamp = ch_switch->device_timestamp;
 		__entry->block_tx = ch_switch->block_tx;
 		__entry->count = ch_switch->count;
+		__entry->link_id = ch_switch->link_id;
 	),
 
 	TP_printk(
-		LOCAL_PR_FMT VIF_PR_FMT " new " CHANDEF_PR_FMT " count:%d",
-		LOCAL_PR_ARG, VIF_PR_ARG, CHANDEF_PR_ARG, __entry->count
+		LOCAL_PR_FMT VIF_PR_FMT CHANDEF_PR_FMT  " count:%d block_tx:%d timestamp:%llu device_ts:%u link_id:%d",
+		LOCAL_PR_ARG, VIF_PR_ARG, CHANDEF_PR_ARG, __entry->count,
+		__entry->block_tx, __entry->timestamp,
+		__entry->device_timestamp, __entry->link_id
 	)
 );
 
+DEFINE_EVENT(chanswitch_evt, drv_channel_switch,
+	TP_PROTO(struct ieee80211_local *local,
+		 struct ieee80211_sub_if_data *sdata,
+		 struct ieee80211_channel_switch *ch_switch),
+	TP_ARGS(local, sdata, ch_switch)
+);
+
 TRACE_EVENT(drv_set_antenna,
 	TP_PROTO(struct ieee80211_local *local, u32 tx_ant, u32 rx_ant, int ret),
 
@@ -2121,39 +2132,11 @@ TRACE_EVENT(drv_channel_switch_beacon,
 	)
 );
 
-TRACE_EVENT(drv_pre_channel_switch,
+DEFINE_EVENT(chanswitch_evt, drv_pre_channel_switch,
 	TP_PROTO(struct ieee80211_local *local,
 		 struct ieee80211_sub_if_data *sdata,
 		 struct ieee80211_channel_switch *ch_switch),
-
-	TP_ARGS(local, sdata, ch_switch),
-
-	TP_STRUCT__entry(
-		LOCAL_ENTRY
-		VIF_ENTRY
-		CHANDEF_ENTRY
-		__field(u64, timestamp)
-		__field(u32, device_timestamp)
-		__field(bool, block_tx)
-		__field(u8, count)
-	),
-
-	TP_fast_assign(
-		LOCAL_ASSIGN;
-		VIF_ASSIGN;
-		CHANDEF_ASSIGN(&ch_switch->chandef)
-		__entry->timestamp = ch_switch->timestamp;
-		__entry->device_timestamp = ch_switch->device_timestamp;
-		__entry->block_tx = ch_switch->block_tx;
-		__entry->count = ch_switch->count;
-	),
-
-	TP_printk(
-		LOCAL_PR_FMT VIF_PR_FMT " prepare channel switch to "
-		CHANDEF_PR_FMT  " count:%d block_tx:%d timestamp:%llu",
-		LOCAL_PR_ARG, VIF_PR_ARG, CHANDEF_PR_ARG, __entry->count,
-		__entry->block_tx, __entry->timestamp
-	)
+	TP_ARGS(local, sdata, ch_switch)
 );
 
 DEFINE_EVENT(local_sdata_evt, drv_post_channel_switch,
@@ -2168,40 +2151,11 @@ DEFINE_EVENT(local_sdata_evt, drv_abort_channel_switch,
 	     TP_ARGS(local, sdata)
 );
 
-TRACE_EVENT(drv_channel_switch_rx_beacon,
+DEFINE_EVENT(chanswitch_evt, drv_channel_switch_rx_beacon,
 	TP_PROTO(struct ieee80211_local *local,
 		 struct ieee80211_sub_if_data *sdata,
 		 struct ieee80211_channel_switch *ch_switch),
-
-	TP_ARGS(local, sdata, ch_switch),
-
-	TP_STRUCT__entry(
-		LOCAL_ENTRY
-		VIF_ENTRY
-		CHANDEF_ENTRY
-		__field(u64, timestamp)
-		__field(u32, device_timestamp)
-		__field(bool, block_tx)
-		__field(u8, count)
-	),
-
-	TP_fast_assign(
-		LOCAL_ASSIGN;
-		VIF_ASSIGN;
-		CHANDEF_ASSIGN(&ch_switch->chandef)
-		__entry->timestamp = ch_switch->timestamp;
-		__entry->device_timestamp = ch_switch->device_timestamp;
-		__entry->block_tx = ch_switch->block_tx;
-		__entry->count = ch_switch->count;
-	),
-
-	TP_printk(
-		LOCAL_PR_FMT VIF_PR_FMT
-		" received a channel switch beacon to "
-		CHANDEF_PR_FMT  " count:%d block_tx:%d timestamp:%llu",
-		LOCAL_PR_ARG, VIF_PR_ARG, CHANDEF_PR_ARG, __entry->count,
-		__entry->block_tx, __entry->timestamp
-	)
+	TP_ARGS(local, sdata, ch_switch)
 );
 
 TRACE_EVENT(drv_get_txpower,
-- 
cgit v1.2.3


From 5ecd5d82b17ee2818548aeb5eb52f3a5b5cae18c Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 28 Feb 2024 09:55:44 +0100
Subject: wifi: mac80211: pass link conf to abort_channel_switch

Pass the link conf to the abort_channel_switch driver
method so the driver can handle things correctly.

Reviewed-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Reviewed-by: Miriam Rachel Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://msgid.link/20240228095718.27f621106ddd.Iadd3d69b722ffe5934779a32a0e4e596a4e33ed4@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c | 13 +++++++++++--
 drivers/net/wireless/intel/iwlwifi/mvm/mvm.h      |  3 ++-
 include/net/mac80211.h                            |  5 +++--
 net/mac80211/driver-ops.h                         |  8 +++++---
 net/mac80211/mlme.c                               |  2 +-
 5 files changed, 22 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
index 69f6a96b0cfb..1935630d3def 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
@@ -1470,7 +1470,8 @@ out_unlock:
 }
 
 void iwl_mvm_abort_channel_switch(struct ieee80211_hw *hw,
-				  struct ieee80211_vif *vif)
+				  struct ieee80211_vif *vif,
+				  struct ieee80211_bss_conf *link_conf)
 {
 	struct iwl_mvm *mvm = IWL_MAC80211_GET_MVM(hw);
 	struct iwl_mvm_vif *mvmvif = iwl_mvm_vif_from_mac80211(vif);
@@ -5551,8 +5552,16 @@ void iwl_mvm_channel_switch_rx_beacon(struct ieee80211_hw *hw,
 
 	if (chsw->count >= mvmvif->csa_count && chsw->block_tx) {
 		if (mvmvif->csa_misbehave) {
+			struct ieee80211_bss_conf *link_conf;
+
 			/* Second time, give up on this AP*/
-			iwl_mvm_abort_channel_switch(hw, vif);
+
+			link_conf = wiphy_dereference(hw->wiphy,
+						      vif->link_conf[chsw->link_id]);
+			if (WARN_ON(!link_conf))
+				return;
+
+			iwl_mvm_abort_channel_switch(hw, vif, link_conf);
 			ieee80211_chswitch_done(vif, false, 0);
 			mvmvif->csa_misbehave = false;
 			return;
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h b/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h
index c85d9e460ad2..63c802712596 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h
@@ -2709,7 +2709,8 @@ int iwl_mvm_pre_channel_switch(struct ieee80211_hw *hw,
 			       struct ieee80211_vif *vif,
 			       struct ieee80211_channel_switch *chsw);
 void iwl_mvm_abort_channel_switch(struct ieee80211_hw *hw,
-				  struct ieee80211_vif *vif);
+				  struct ieee80211_vif *vif,
+				  struct ieee80211_bss_conf *link_conf);
 void iwl_mvm_channel_switch_rx_beacon(struct ieee80211_hw *hw,
 				      struct ieee80211_vif *vif,
 				      struct ieee80211_channel_switch *chsw);
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 8ea9fa81e68c..c622450ac012 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -4268,7 +4268,7 @@ struct ieee80211_prep_tx_info {
  *	after a channel switch procedure is completed, allowing the
  *	driver to go back to a normal configuration.
  * @abort_channel_switch: This is an optional callback that is called
- *	when channel switch procedure was completed, allowing the
+ *	when channel switch procedure was aborted, allowing the
  *	driver to go back to a normal configuration.
  * @channel_switch_rx_beacon: This is an optional callback that is called
  *	when channel switch procedure is in progress and additional beacon with
@@ -4664,7 +4664,8 @@ struct ieee80211_ops {
 				   struct ieee80211_vif *vif,
 				   struct ieee80211_bss_conf *link_conf);
 	void (*abort_channel_switch)(struct ieee80211_hw *hw,
-				     struct ieee80211_vif *vif);
+				     struct ieee80211_vif *vif,
+				     struct ieee80211_bss_conf *link_conf);
 	void (*channel_switch_rx_beacon)(struct ieee80211_hw *hw,
 					 struct ieee80211_vif *vif,
 					 struct ieee80211_channel_switch *ch_switch);
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index e20c64edb880..5d078c0a2323 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -2,7 +2,7 @@
 /*
 * Portions of this file
 * Copyright(c) 2016 Intel Deutschland GmbH
-* Copyright (C) 2018 - 2019, 2021 - 2023 Intel Corporation
+* Copyright (C) 2018-2019, 2021-2024 Intel Corporation
 */
 
 #ifndef __MAC80211_DRIVER_OPS
@@ -1180,8 +1180,9 @@ drv_post_channel_switch(struct ieee80211_link_data *link)
 }
 
 static inline void
-drv_abort_channel_switch(struct ieee80211_sub_if_data *sdata)
+drv_abort_channel_switch(struct ieee80211_link_data *link)
 {
+	struct ieee80211_sub_if_data *sdata = link->sdata;
 	struct ieee80211_local *local = sdata->local;
 
 	might_sleep();
@@ -1193,7 +1194,8 @@ drv_abort_channel_switch(struct ieee80211_sub_if_data *sdata)
 	trace_drv_abort_channel_switch(local, sdata);
 
 	if (local->ops->abort_channel_switch)
-		local->ops->abort_channel_switch(&local->hw, &sdata->vif);
+		local->ops->abort_channel_switch(&local->hw, &sdata->vif,
+						 link->conf);
 }
 
 static inline void
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index d1d4480c0352..c5c757514179 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -2004,7 +2004,7 @@ ieee80211_sta_abort_chanswitch(struct ieee80211_link_data *link)
 	link->csa_block_tx = false;
 	link->conf->csa_active = false;
 
-	drv_abort_channel_switch(sdata);
+	drv_abort_channel_switch(link);
 }
 
 static void
-- 
cgit v1.2.3


From 6f0107d195a812074c6f977d492ffc99ba3ff2bb Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 28 Feb 2024 09:55:45 +0100
Subject: wifi: mac80211: introduce a feature flag for quiet in CSA

When doing CSA in multi-link, there really isn't a need to
stop transmissions entirely. Add a feature flag for drivers
to indicate they can handle quiet in CSA (be it by parsing
themselves, or by implementing drv_pre_channel_switch()),
to make that possible.

Also clean up the csa_block_tx handling: it clearly cannot
handle multi-link due to the way queues are stopped, move
it to the sdata. Drivers should be doing it themselves for
working properly during CSA in MLO anyway. Also rename it
to indicate that it reflects TX was blocked at mac80211.

Reviewed-by: Miriam Rachel Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://msgid.link/20240228095719.258439191541.I2469d206e2bf5cb244cfde2b4bbc2ae6d1cd3dd9@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h     |  6 ++++++
 net/mac80211/cfg.c         | 16 +++++++++-------
 net/mac80211/debugfs.c     |  3 ++-
 net/mac80211/ieee80211_i.h |  3 ++-
 net/mac80211/iface.c       |  6 +++---
 net/mac80211/mlme.c        | 35 +++++++++++++++++++++--------------
 6 files changed, 43 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index c622450ac012..353488ab94a2 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -2778,6 +2778,11 @@ struct ieee80211_txq {
  * @IEEE80211_HW_DISALLOW_PUNCTURING: HW requires disabling puncturing in EHT
  *	and connecting with a lower bandwidth instead
  *
+ * @IEEE80211_HW_HANDLES_QUIET_CSA: HW/driver handles quieting for CSA, so
+ *	no need to stop queues. This really should be set by a driver that
+ *	implements MLO, so operation can continue on other links when one
+ *	link is switching.
+ *
  * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays
  */
 enum ieee80211_hw_flags {
@@ -2836,6 +2841,7 @@ enum ieee80211_hw_flags {
 	IEEE80211_HW_DETECTS_COLOR_COLLISION,
 	IEEE80211_HW_MLO_MCAST_MULTI_LINK_TX,
 	IEEE80211_HW_DISALLOW_PUNCTURING,
+	IEEE80211_HW_HANDLES_QUIET_CSA,
 
 	/* keep last, obviously */
 	NUM_IEEE80211_HW_FLAGS
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 4bc0cc2dff83..3a9d2ceec752 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -1607,10 +1607,10 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev,
 	/* abort any running channel switch or color change */
 	link_conf->csa_active = false;
 	link_conf->color_change_active = false;
-	if (link->csa_block_tx) {
+	if (sdata->csa_blocked_tx) {
 		ieee80211_wake_vif_queues(local, sdata,
 					  IEEE80211_QUEUE_STOP_REASON_CSA);
-		link->csa_block_tx = false;
+		sdata->csa_blocked_tx = false;
 	}
 
 	ieee80211_free_next_beacon(link);
@@ -3649,7 +3649,7 @@ void ieee80211_channel_switch_disconnect(struct ieee80211_vif *vif, bool block_t
 	struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
 	struct ieee80211_local *local = sdata->local;
 
-	sdata->deflink.csa_block_tx = block_tx;
+	sdata->csa_blocked_tx = block_tx;
 	sdata_info(sdata, "channel switch failed, disconnecting\n");
 	wiphy_work_queue(local->hw.wiphy, &ifmgd->csa_connection_drop_work);
 }
@@ -3735,10 +3735,10 @@ static int __ieee80211_csa_finalize(struct ieee80211_link_data *link_data)
 
 	ieee80211_link_info_change_notify(sdata, link_data, changed);
 
-	if (link_data->csa_block_tx) {
+	if (sdata->csa_blocked_tx) {
 		ieee80211_wake_vif_queues(local, sdata,
 					  IEEE80211_QUEUE_STOP_REASON_CSA);
-		link_data->csa_block_tx = false;
+		sdata->csa_blocked_tx = false;
 	}
 
 	err = drv_post_channel_switch(link_data);
@@ -4014,12 +4014,14 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev,
 	}
 
 	link_data->csa_chanreq = chanreq; 
-	link_data->csa_block_tx = params->block_tx;
 	link_conf->csa_active = true;
 
-	if (link_data->csa_block_tx)
+	if (params->block_tx &&
+	    !ieee80211_hw_check(&local->hw, HANDLES_QUIET_CSA)) {
 		ieee80211_stop_vif_queues(local, sdata,
 					  IEEE80211_QUEUE_STOP_REASON_CSA);
+		sdata->csa_blocked_tx = true;
+	}
 
 	cfg80211_ch_switch_started_notify(sdata->dev,
 					  &link_data->csa_chanreq.oper, 0,
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index 74be49191e70..2f68e92a7404 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -4,7 +4,7 @@
  *
  * Copyright 2007	Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
- * Copyright (C) 2018 - 2019, 2021-2023 Intel Corporation
+ * Copyright (C) 2018 - 2019, 2021-2024 Intel Corporation
  */
 
 #include <linux/debugfs.h>
@@ -498,6 +498,7 @@ static const char *hw_flag_names[] = {
 	FLAG(DETECTS_COLOR_COLLISION),
 	FLAG(MLO_MCAST_MULTI_LINK_TX),
 	FLAG(DISALLOW_PUNCTURING),
+	FLAG(HANDLES_QUIET_CSA),
 #undef FLAG
 };
 
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index da320799fbff..cfd0c03e4bd6 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -1034,7 +1034,6 @@ struct ieee80211_link_data {
 	struct ieee80211_key __rcu *default_beacon_key;
 
 	struct wiphy_work csa_finalize_work;
-	bool csa_block_tx;
 
 	bool operating_11g_mode;
 
@@ -1093,6 +1092,8 @@ struct ieee80211_sub_if_data {
 
 	unsigned long state;
 
+	bool csa_blocked_tx;
+
 	char name[IFNAMSIZ];
 
 	struct ieee80211_fragment_cache frags;
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index b75b83a5142b..395de62d9cb2 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -8,7 +8,7 @@
  * Copyright 2008, Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
  * Copyright (c) 2016        Intel Deutschland GmbH
- * Copyright (C) 2018-2023 Intel Corporation
+ * Copyright (C) 2018-2024 Intel Corporation
  */
 #include <linux/slab.h>
 #include <linux/kernel.h>
@@ -544,10 +544,10 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do
 	sdata->vif.bss_conf.csa_active = false;
 	if (sdata->vif.type == NL80211_IFTYPE_STATION)
 		sdata->deflink.u.mgd.csa_waiting_bcn = false;
-	if (sdata->deflink.csa_block_tx) {
+	if (sdata->csa_blocked_tx) {
 		ieee80211_wake_vif_queues(local, sdata,
 					  IEEE80211_QUEUE_STOP_REASON_CSA);
-		sdata->deflink.csa_block_tx = false;
+		sdata->csa_blocked_tx = false;
 	}
 
 	wiphy_work_cancel(local->hw.wiphy, &sdata->deflink.csa_finalize_work);
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index c5c757514179..3d5cc3dd9238 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -1926,10 +1926,10 @@ static void ieee80211_chswitch_post_beacon(struct ieee80211_link_data *link)
 
 	WARN_ON(!link->conf->csa_active);
 
-	if (link->csa_block_tx) {
+	if (sdata->csa_blocked_tx) {
 		ieee80211_wake_vif_queues(local, sdata,
 					  IEEE80211_QUEUE_STOP_REASON_CSA);
-		link->csa_block_tx = false;
+		sdata->csa_blocked_tx = false;
 	}
 
 	link->conf->csa_active = false;
@@ -1997,11 +1997,12 @@ ieee80211_sta_abort_chanswitch(struct ieee80211_link_data *link)
 
 	ieee80211_link_unreserve_chanctx(link);
 
-	if (link->csa_block_tx)
+	if (sdata->csa_blocked_tx) {
 		ieee80211_wake_vif_queues(local, sdata,
 					  IEEE80211_QUEUE_STOP_REASON_CSA);
+		sdata->csa_blocked_tx = false;
+	}
 
-	link->csa_block_tx = false;
 	link->conf->csa_active = false;
 
 	drv_abort_channel_switch(link);
@@ -2145,13 +2146,15 @@ ieee80211_sta_process_chanswitch(struct ieee80211_link_data *link,
 
 	link->conf->csa_active = true;
 	link->csa_chanreq = csa_ie.chanreq;
-	link->csa_block_tx = csa_ie.mode;
 	link->u.mgd.csa_ignored_same_chan = false;
 	link->u.mgd.beacon_crc_valid = false;
 
-	if (link->csa_block_tx)
+	if (csa_ie.mode &&
+	    !ieee80211_hw_check(&local->hw, HANDLES_QUIET_CSA)) {
 		ieee80211_stop_vif_queues(local, sdata,
 					  IEEE80211_QUEUE_STOP_REASON_CSA);
+		sdata->csa_blocked_tx = true;
+	}
 
 	cfg80211_ch_switch_started_notify(sdata->dev, &csa_ie.chanreq.oper,
 					  link->link_id, csa_ie.count,
@@ -2179,7 +2182,8 @@ ieee80211_sta_process_chanswitch(struct ieee80211_link_data *link,
 	 * reset when the disconnection worker runs.
 	 */
 	link->conf->csa_active = true;
-	link->csa_block_tx = csa_ie.mode;
+	sdata->csa_blocked_tx =
+		csa_ie.mode && !ieee80211_hw_check(&local->hw, HANDLES_QUIET_CSA);
 
 	wiphy_work_queue(sdata->local->hw.wiphy,
 			 &ifmgd->csa_connection_drop_work);
@@ -3233,10 +3237,10 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
 	sdata->vif.bss_conf.csa_active = false;
 	sdata->deflink.u.mgd.csa_waiting_bcn = false;
 	sdata->deflink.u.mgd.csa_ignored_same_chan = false;
-	if (sdata->deflink.csa_block_tx) {
+	if (sdata->csa_blocked_tx) {
 		ieee80211_wake_vif_queues(local, sdata,
 					  IEEE80211_QUEUE_STOP_REASON_CSA);
-		sdata->deflink.csa_block_tx = false;
+		sdata->csa_blocked_tx = false;
 	}
 
 	/* existing TX TSPEC sessions no longer exist */
@@ -3549,9 +3553,12 @@ static void __ieee80211_disconnect(struct ieee80211_sub_if_data *sdata)
 	if (!ifmgd->associated)
 		return;
 
-	/* in MLO assume we have a link where we can TX the frame */
-	tx = ieee80211_vif_is_mld(&sdata->vif) ||
-		!sdata->deflink.csa_block_tx;
+	/*
+	 * MLO drivers should have HANDLES_QUIET_CSA, so that csa_blocked_tx
+	 * is always false; if they don't then this may try to transmit the
+	 * frame but queues will be stopped.
+	 */
+	tx = !sdata->csa_blocked_tx;
 
 	if (!ifmgd->driver_disconnect) {
 		unsigned int link_id;
@@ -3584,10 +3591,10 @@ static void __ieee80211_disconnect(struct ieee80211_sub_if_data *sdata)
 	/* the other links will be destroyed */
 	sdata->vif.bss_conf.csa_active = false;
 	sdata->deflink.u.mgd.csa_waiting_bcn = false;
-	if (sdata->deflink.csa_block_tx) {
+	if (sdata->csa_blocked_tx) {
 		ieee80211_wake_vif_queues(local, sdata,
 					  IEEE80211_QUEUE_STOP_REASON_CSA);
-		sdata->deflink.csa_block_tx = false;
+		sdata->csa_blocked_tx = false;
 	}
 
 	ieee80211_report_disconnect(sdata, frame_buf, sizeof(frame_buf), tx,
-- 
cgit v1.2.3


From 187e2af05abe6bf80581490239c449456627d17a Mon Sep 17 00:00:00 2001
From: Kui-Feng Lee <thinker.li@gmail.com>
Date: Sat, 24 Feb 2024 14:34:17 -0800
Subject: bpf: struct_ops supports more than one page for trampolines.

The BPF struct_ops previously only allowed one page of trampolines.
Each function pointer of a struct_ops is implemented by a struct_ops
bpf program. Each struct_ops bpf program requires a trampoline.
The following selftest patch shows each page can hold a little more
than 20 trampolines.

While one page is more than enough for the tcp-cc usecase,
the sched_ext use case shows that one page is not always enough and hits
the one page limit. This patch overcomes the one page limit by allocating
another page when needed and it is limited to a total of
MAX_IMAGE_PAGES (8) pages which is more than enough for
reasonable usages.

The variable st_map->image has been changed to st_map->image_pages, and
its type has been changed to an array of pointers to pages.

Signed-off-by: Kui-Feng Lee <thinker.li@gmail.com>
Link: https://lore.kernel.org/r/20240224223418.526631-3-thinker.li@gmail.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/linux/bpf.h            |   4 +-
 kernel/bpf/bpf_struct_ops.c    | 130 ++++++++++++++++++++++++++++-------------
 net/bpf/bpf_dummy_struct_ops.c |  12 ++--
 3 files changed, 96 insertions(+), 50 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 814dc913a968..785660810e6a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1763,7 +1763,9 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
 				      struct bpf_tramp_link *link,
 				      const struct btf_func_model *model,
 				      void *stub_func,
-				      void *image, void *image_end);
+				      void **image, u32 *image_off,
+				      bool allow_alloc);
+void bpf_struct_ops_image_free(void *image);
 static inline bool bpf_try_module_get(const void *data, struct module *owner)
 {
 	if (owner == BPF_MODULE_OWNER)
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 07e554c191d1..43356faaa057 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -18,6 +18,8 @@ struct bpf_struct_ops_value {
 	char data[] ____cacheline_aligned_in_smp;
 };
 
+#define MAX_TRAMP_IMAGE_PAGES 8
+
 struct bpf_struct_ops_map {
 	struct bpf_map map;
 	struct rcu_head rcu;
@@ -30,12 +32,11 @@ struct bpf_struct_ops_map {
 	 */
 	struct bpf_link **links;
 	u32 links_cnt;
-	/* image is a page that has all the trampolines
+	u32 image_pages_cnt;
+	/* image_pages is an array of pages that has all the trampolines
 	 * that stores the func args before calling the bpf_prog.
-	 * A PAGE_SIZE "image" is enough to store all trampoline for
-	 * "links[]".
 	 */
-	void *image;
+	void *image_pages[MAX_TRAMP_IMAGE_PAGES];
 	/* The owner moduler's btf. */
 	struct btf *btf;
 	/* uvalue->data stores the kernel struct
@@ -116,6 +117,31 @@ static bool is_valid_value_type(struct btf *btf, s32 value_id,
 	return true;
 }
 
+static void *bpf_struct_ops_image_alloc(void)
+{
+	void *image;
+	int err;
+
+	err = bpf_jit_charge_modmem(PAGE_SIZE);
+	if (err)
+		return ERR_PTR(err);
+	image = arch_alloc_bpf_trampoline(PAGE_SIZE);
+	if (!image) {
+		bpf_jit_uncharge_modmem(PAGE_SIZE);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	return image;
+}
+
+void bpf_struct_ops_image_free(void *image)
+{
+	if (image) {
+		arch_free_bpf_trampoline(image, PAGE_SIZE);
+		bpf_jit_uncharge_modmem(PAGE_SIZE);
+	}
+}
+
 #define MAYBE_NULL_SUFFIX "__nullable"
 #define MAX_STUB_NAME 128
 
@@ -461,6 +487,15 @@ static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map)
 	}
 }
 
+static void bpf_struct_ops_map_free_image(struct bpf_struct_ops_map *st_map)
+{
+	int i;
+
+	for (i = 0; i < st_map->image_pages_cnt; i++)
+		bpf_struct_ops_image_free(st_map->image_pages[i]);
+	st_map->image_pages_cnt = 0;
+}
+
 static int check_zero_holes(const struct btf *btf, const struct btf_type *t, void *data)
 {
 	const struct btf_member *member;
@@ -506,9 +541,12 @@ const struct bpf_link_ops bpf_struct_ops_link_lops = {
 int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
 				      struct bpf_tramp_link *link,
 				      const struct btf_func_model *model,
-				      void *stub_func, void *image, void *image_end)
+				      void *stub_func,
+				      void **_image, u32 *_image_off,
+				      bool allow_alloc)
 {
-	u32 flags = BPF_TRAMP_F_INDIRECT;
+	u32 image_off = *_image_off, flags = BPF_TRAMP_F_INDIRECT;
+	void *image = *_image;
 	int size;
 
 	tlinks[BPF_TRAMP_FENTRY].links[0] = link;
@@ -518,12 +556,32 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
 		flags |= BPF_TRAMP_F_RET_FENTRY_RET;
 
 	size = arch_bpf_trampoline_size(model, flags, tlinks, NULL);
-	if (size < 0)
-		return size;
-	if (size > (unsigned long)image_end - (unsigned long)image)
-		return -E2BIG;
-	return arch_prepare_bpf_trampoline(NULL, image, image_end,
+	if (size <= 0)
+		return size ? : -EFAULT;
+
+	/* Allocate image buffer if necessary */
+	if (!image || size > PAGE_SIZE - image_off) {
+		if (!allow_alloc)
+			return -E2BIG;
+
+		image = bpf_struct_ops_image_alloc();
+		if (IS_ERR(image))
+			return PTR_ERR(image);
+		image_off = 0;
+	}
+
+	size = arch_prepare_bpf_trampoline(NULL, image + image_off,
+					   image + PAGE_SIZE,
 					   model, flags, tlinks, stub_func);
+	if (size <= 0) {
+		if (image != *_image)
+			bpf_struct_ops_image_free(image);
+		return size ? : -EFAULT;
+	}
+
+	*_image = image;
+	*_image_off = image_off + size;
+	return 0;
 }
 
 static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
@@ -539,8 +597,8 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 	struct bpf_tramp_links *tlinks;
 	void *udata, *kdata;
 	int prog_fd, err;
-	void *image, *image_end;
-	u32 i;
+	u32 i, trampoline_start, image_off = 0;
+	void *cur_image = NULL, *image = NULL;
 
 	if (flags)
 		return -EINVAL;
@@ -578,8 +636,6 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 
 	udata = &uvalue->data;
 	kdata = &kvalue->data;
-	image = st_map->image;
-	image_end = st_map->image + PAGE_SIZE;
 
 	module_type = btf_type_by_id(btf_vmlinux, st_ops_ids[IDX_MODULE_ID]);
 	for_each_member(i, t, member) {
@@ -658,15 +714,24 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 			      &bpf_struct_ops_link_lops, prog);
 		st_map->links[i] = &link->link;
 
+		trampoline_start = image_off;
 		err = bpf_struct_ops_prepare_trampoline(tlinks, link,
-							&st_ops->func_models[i],
-							*(void **)(st_ops->cfi_stubs + moff),
-							image, image_end);
+						&st_ops->func_models[i],
+						*(void **)(st_ops->cfi_stubs + moff),
+						&image, &image_off,
+						st_map->image_pages_cnt < MAX_TRAMP_IMAGE_PAGES);
+		if (err)
+			goto reset_unlock;
+
+		if (cur_image != image) {
+			st_map->image_pages[st_map->image_pages_cnt++] = image;
+			cur_image = image;
+			trampoline_start = 0;
+		}
 		if (err < 0)
 			goto reset_unlock;
 
-		*(void **)(kdata + moff) = image + cfi_get_offset();
-		image += err;
+		*(void **)(kdata + moff) = image + trampoline_start + cfi_get_offset();
 
 		/* put prog_id to udata */
 		*(unsigned long *)(udata + moff) = prog->aux->id;
@@ -677,10 +742,11 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 		if (err)
 			goto reset_unlock;
 	}
+	for (i = 0; i < st_map->image_pages_cnt; i++)
+		arch_protect_bpf_trampoline(st_map->image_pages[i], PAGE_SIZE);
 
 	if (st_map->map.map_flags & BPF_F_LINK) {
 		err = 0;
-		arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE);
 		/* Let bpf_link handle registration & unregistration.
 		 *
 		 * Pair with smp_load_acquire() during lookup_elem().
@@ -689,7 +755,6 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 		goto unlock;
 	}
 
-	arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE);
 	err = st_ops->reg(kdata);
 	if (likely(!err)) {
 		/* This refcnt increment on the map here after
@@ -712,9 +777,9 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 	 * there was a race in registering the struct_ops (under the same name) to
 	 * a sub-system through different struct_ops's maps.
 	 */
-	arch_unprotect_bpf_trampoline(st_map->image, PAGE_SIZE);
 
 reset_unlock:
+	bpf_struct_ops_map_free_image(st_map);
 	bpf_struct_ops_map_put_progs(st_map);
 	memset(uvalue, 0, map->value_size);
 	memset(kvalue, 0, map->value_size);
@@ -781,10 +846,7 @@ static void __bpf_struct_ops_map_free(struct bpf_map *map)
 	if (st_map->links)
 		bpf_struct_ops_map_put_progs(st_map);
 	bpf_map_area_free(st_map->links);
-	if (st_map->image) {
-		arch_free_bpf_trampoline(st_map->image, PAGE_SIZE);
-		bpf_jit_uncharge_modmem(PAGE_SIZE);
-	}
+	bpf_struct_ops_map_free_image(st_map);
 	bpf_map_area_free(st_map->uvalue);
 	bpf_map_area_free(st_map);
 }
@@ -894,20 +956,6 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
 	st_map->st_ops_desc = st_ops_desc;
 	map = &st_map->map;
 
-	ret = bpf_jit_charge_modmem(PAGE_SIZE);
-	if (ret)
-		goto errout_free;
-
-	st_map->image = arch_alloc_bpf_trampoline(PAGE_SIZE);
-	if (!st_map->image) {
-		/* __bpf_struct_ops_map_free() uses st_map->image as flag
-		 * for "charged or not". In this case, we need to unchange
-		 * here.
-		 */
-		bpf_jit_uncharge_modmem(PAGE_SIZE);
-		ret = -ENOMEM;
-		goto errout_free;
-	}
 	st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE);
 	st_map->links_cnt = btf_type_vlen(t);
 	st_map->links =
diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c
index 02de71719aed..1b5f812e6972 100644
--- a/net/bpf/bpf_dummy_struct_ops.c
+++ b/net/bpf/bpf_dummy_struct_ops.c
@@ -91,6 +91,7 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
 	struct bpf_tramp_link *link = NULL;
 	void *image = NULL;
 	unsigned int op_idx;
+	u32 image_off = 0;
 	int prog_ret;
 	s32 type_id;
 	int err;
@@ -114,12 +115,6 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
 		goto out;
 	}
 
-	image = arch_alloc_bpf_trampoline(PAGE_SIZE);
-	if (!image) {
-		err = -ENOMEM;
-		goto out;
-	}
-
 	link = kzalloc(sizeof(*link), GFP_USER);
 	if (!link) {
 		err = -ENOMEM;
@@ -133,7 +128,8 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
 	err = bpf_struct_ops_prepare_trampoline(tlinks, link,
 						&st_ops->func_models[op_idx],
 						&dummy_ops_test_ret_function,
-						image, image + PAGE_SIZE);
+						&image, &image_off,
+						true);
 	if (err < 0)
 		goto out;
 
@@ -147,7 +143,7 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
 		err = -EFAULT;
 out:
 	kfree(args);
-	arch_free_bpf_trampoline(image, PAGE_SIZE);
+	bpf_struct_ops_image_free(image);
 	if (link)
 		bpf_link_put(&link->link);
 	kfree(tlinks);
-- 
cgit v1.2.3


From 345a6e2631c1267221b684e110bba03e4c26ece0 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 1 Mar 2024 17:19:45 +0000
Subject: tcp: align tcp_sock_write_rx group

Stephen Rothwell and kernel test robot reported that some arches
(parisc, hexagon) and/or compilers would not like blamed commit.

Lets make sure tcp_sock_write_rx group does not start with a hole.

While we are at it, correct tcp_sock_write_tx CACHELINE_ASSERT_GROUP_SIZE()
since after the blamed commit, we went to 105 bytes.

Fixes: 99123622050f ("tcp: remove some holes in struct tcp_sock")
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Reported-by: kernel test robot <lkp@intel.com>
Link: https://lore.kernel.org/netdev/20240301121108.5d39e4f9@canb.auug.org.au/
Closes: https://lore.kernel.org/oe-kbuild-all/202403011451.csPYOS3C-lkp@intel.com/
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Tested-by: Simon Horman <horms@kernel.org> # build-tested
Link: https://lore.kernel.org/r/20240301171945.2958176-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/tcp.h | 2 +-
 net/ipv4/tcp.c      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 988a30ef6bfe..55399ee2a57e 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -304,7 +304,7 @@ struct tcp_sock {
 	__cacheline_group_end(tcp_sock_write_txrx);
 
 	/* RX read-write hotpath cache lines */
-	__cacheline_group_begin(tcp_sock_write_rx);
+	__cacheline_group_begin(tcp_sock_write_rx) __aligned(8);
 	u64	bytes_received;
 				/* RFC4898 tcpEStatsAppHCThruOctetsReceived
 				 * sum(delta(rcv_nxt)), or how many bytes
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c82dc42f57c6..7e1b848398d0 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -4651,7 +4651,7 @@ static void __init tcp_struct_check(void)
 	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tsorted_sent_queue);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, highest_sack);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, ecn_flags);
-	CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_tx, 113);
+	CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_tx, 105);
 
 	/* TXRX read-write hotpath cache lines */
 	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, pred_flags);
-- 
cgit v1.2.3


From 411c5f36805c02c7c412f1ad6bfa4459a1148011 Mon Sep 17 00:00:00 2001
From: Yunsheng Lin <linyunsheng@huawei.com>
Date: Wed, 28 Feb 2024 17:30:08 +0800
Subject: mm/page_alloc: modify page_frag_alloc_align() to accept align as an
 argument

napi_alloc_frag_align() and netdev_alloc_frag_align() accept
align as an argument, and they are thin wrappers around the
__napi_alloc_frag_align() and __netdev_alloc_frag_align() APIs
doing the alignment checking and align mask conversion, in order
to call page_frag_alloc_align() directly. The intention here is
to keep the alignment checking and the alignmask conversion in
in-line wrapper to avoid those kind of operations during execution
time since it can usually be handled during compile time.

We are going to use page_frag_alloc_align() in vhost_net.c, it
need the same kind of alignment checking and alignmask conversion,
so split up page_frag_alloc_align into an inline wrapper doing the
above operation, and add __page_frag_alloc_align() which is passed
with the align mask the original function expected as suggested by
Alexander.

Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
CC: Alexander Duyck <alexander.duyck@gmail.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/gfp.h | 15 +++++++++++----
 mm/page_alloc.c     |  8 ++++----
 net/core/skbuff.c   |  9 ++++++---
 3 files changed, 21 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index de292a007138..28aea17fa59b 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -312,14 +312,21 @@ extern void free_pages(unsigned long addr, unsigned int order);
 
 struct page_frag_cache;
 extern void __page_frag_cache_drain(struct page *page, unsigned int count);
-extern void *page_frag_alloc_align(struct page_frag_cache *nc,
-				   unsigned int fragsz, gfp_t gfp_mask,
-				   unsigned int align_mask);
+void *__page_frag_alloc_align(struct page_frag_cache *nc, unsigned int fragsz,
+			      gfp_t gfp_mask, unsigned int align_mask);
+
+static inline void *page_frag_alloc_align(struct page_frag_cache *nc,
+					  unsigned int fragsz, gfp_t gfp_mask,
+					  unsigned int align)
+{
+	WARN_ON_ONCE(!is_power_of_2(align));
+	return __page_frag_alloc_align(nc, fragsz, gfp_mask, -align);
+}
 
 static inline void *page_frag_alloc(struct page_frag_cache *nc,
 			     unsigned int fragsz, gfp_t gfp_mask)
 {
-	return page_frag_alloc_align(nc, fragsz, gfp_mask, ~0u);
+	return __page_frag_alloc_align(nc, fragsz, gfp_mask, ~0u);
 }
 
 extern void page_frag_free(void *addr);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 150d4f23b010..c0f7e67c4250 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4708,9 +4708,9 @@ void __page_frag_cache_drain(struct page *page, unsigned int count)
 }
 EXPORT_SYMBOL(__page_frag_cache_drain);
 
-void *page_frag_alloc_align(struct page_frag_cache *nc,
-		      unsigned int fragsz, gfp_t gfp_mask,
-		      unsigned int align_mask)
+void *__page_frag_alloc_align(struct page_frag_cache *nc,
+			      unsigned int fragsz, gfp_t gfp_mask,
+			      unsigned int align_mask)
 {
 	unsigned int size = PAGE_SIZE;
 	struct page *page;
@@ -4779,7 +4779,7 @@ refill:
 
 	return nc->va + offset;
 }
-EXPORT_SYMBOL(page_frag_alloc_align);
+EXPORT_SYMBOL(__page_frag_alloc_align);
 
 /*
  * Frees a page fragment allocated out of either a compound or order 0 page.
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 1f918e602bc4..43d7fc150acc 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -315,7 +315,8 @@ void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
 
 	fragsz = SKB_DATA_ALIGN(fragsz);
 
-	return page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align_mask);
+	return __page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC,
+				       align_mask);
 }
 EXPORT_SYMBOL(__napi_alloc_frag_align);
 
@@ -327,13 +328,15 @@ void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
 	if (in_hardirq() || irqs_disabled()) {
 		struct page_frag_cache *nc = this_cpu_ptr(&netdev_alloc_cache);
 
-		data = page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, align_mask);
+		data = __page_frag_alloc_align(nc, fragsz, GFP_ATOMIC,
+					       align_mask);
 	} else {
 		struct napi_alloc_cache *nc;
 
 		local_bh_disable();
 		nc = this_cpu_ptr(&napi_alloc_cache);
-		data = page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align_mask);
+		data = __page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC,
+					       align_mask);
 		local_bh_enable();
 	}
 	return data;
-- 
cgit v1.2.3


From a0727489ac22d6fbd2e390d38a51193bba61da83 Mon Sep 17 00:00:00 2001
From: Yunsheng Lin <linyunsheng@huawei.com>
Date: Wed, 28 Feb 2024 17:30:10 +0800
Subject: net: introduce page_frag_cache_drain()

When draining a page_frag_cache, most user are doing
the similar steps, so introduce an API to avoid code
duplication.

Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Reviewed-by: Alexander Duyck <alexanderduyck@fb.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/ethernet/google/gve/gve_main.c | 11 ++---------
 drivers/net/ethernet/mediatek/mtk_wed_wo.c | 17 ++---------------
 drivers/nvme/host/tcp.c                    |  7 +------
 drivers/nvme/target/tcp.c                  |  4 +---
 include/linux/gfp.h                        |  1 +
 mm/page_alloc.c                            | 10 ++++++++++
 6 files changed, 17 insertions(+), 33 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c
index 7b89b66adb53..166bd827a6d7 100644
--- a/drivers/net/ethernet/google/gve/gve_main.c
+++ b/drivers/net/ethernet/google/gve/gve_main.c
@@ -1276,17 +1276,10 @@ static void gve_unreg_xdp_info(struct gve_priv *priv)
 
 static void gve_drain_page_cache(struct gve_priv *priv)
 {
-	struct page_frag_cache *nc;
 	int i;
 
-	for (i = 0; i < priv->rx_cfg.num_queues; i++) {
-		nc = &priv->rx[i].page_cache;
-		if (nc->va) {
-			__page_frag_cache_drain(virt_to_page(nc->va),
-						nc->pagecnt_bias);
-			nc->va = NULL;
-		}
-	}
+	for (i = 0; i < priv->rx_cfg.num_queues; i++)
+		page_frag_cache_drain(&priv->rx[i].page_cache);
 }
 
 static void gve_qpls_get_curr_alloc_cfg(struct gve_priv *priv,
diff --git a/drivers/net/ethernet/mediatek/mtk_wed_wo.c b/drivers/net/ethernet/mediatek/mtk_wed_wo.c
index d58b07e7e123..7063c78bd35f 100644
--- a/drivers/net/ethernet/mediatek/mtk_wed_wo.c
+++ b/drivers/net/ethernet/mediatek/mtk_wed_wo.c
@@ -286,7 +286,6 @@ mtk_wed_wo_queue_free(struct mtk_wed_wo *wo, struct mtk_wed_wo_queue *q)
 static void
 mtk_wed_wo_queue_tx_clean(struct mtk_wed_wo *wo, struct mtk_wed_wo_queue *q)
 {
-	struct page *page;
 	int i;
 
 	for (i = 0; i < q->n_desc; i++) {
@@ -301,19 +300,12 @@ mtk_wed_wo_queue_tx_clean(struct mtk_wed_wo *wo, struct mtk_wed_wo_queue *q)
 		entry->buf = NULL;
 	}
 
-	if (!q->cache.va)
-		return;
-
-	page = virt_to_page(q->cache.va);
-	__page_frag_cache_drain(page, q->cache.pagecnt_bias);
-	memset(&q->cache, 0, sizeof(q->cache));
+	page_frag_cache_drain(&q->cache);
 }
 
 static void
 mtk_wed_wo_queue_rx_clean(struct mtk_wed_wo *wo, struct mtk_wed_wo_queue *q)
 {
-	struct page *page;
-
 	for (;;) {
 		void *buf = mtk_wed_wo_dequeue(wo, q, NULL, true);
 
@@ -323,12 +315,7 @@ mtk_wed_wo_queue_rx_clean(struct mtk_wed_wo *wo, struct mtk_wed_wo_queue *q)
 		skb_free_frag(buf);
 	}
 
-	if (!q->cache.va)
-		return;
-
-	page = virt_to_page(q->cache.va);
-	__page_frag_cache_drain(page, q->cache.pagecnt_bias);
-	memset(&q->cache, 0, sizeof(q->cache));
+	page_frag_cache_drain(&q->cache);
 }
 
 static void
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index a6d596e05602..3692b56cb58d 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -1344,7 +1344,6 @@ static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl)
 
 static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
 {
-	struct page *page;
 	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
 	struct nvme_tcp_queue *queue = &ctrl->queues[qid];
 	unsigned int noreclaim_flag;
@@ -1355,11 +1354,7 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
 	if (queue->hdr_digest || queue->data_digest)
 		nvme_tcp_free_crypto(queue);
 
-	if (queue->pf_cache.va) {
-		page = virt_to_head_page(queue->pf_cache.va);
-		__page_frag_cache_drain(page, queue->pf_cache.pagecnt_bias);
-		queue->pf_cache.va = NULL;
-	}
+	page_frag_cache_drain(&queue->pf_cache);
 
 	noreclaim_flag = memalloc_noreclaim_save();
 	/* ->sock will be released by fput() */
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index c8655fc5aa5b..2aa5762e9f50 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -1591,7 +1591,6 @@ static void nvmet_tcp_free_cmd_data_in_buffers(struct nvmet_tcp_queue *queue)
 
 static void nvmet_tcp_release_queue_work(struct work_struct *w)
 {
-	struct page *page;
 	struct nvmet_tcp_queue *queue =
 		container_of(w, struct nvmet_tcp_queue, release_work);
 
@@ -1615,8 +1614,7 @@ static void nvmet_tcp_release_queue_work(struct work_struct *w)
 	if (queue->hdr_digest || queue->data_digest)
 		nvmet_tcp_free_crypto(queue);
 	ida_free(&nvmet_tcp_queue_ida, queue->idx);
-	page = virt_to_head_page(queue->pf_cache.va);
-	__page_frag_cache_drain(page, queue->pf_cache.pagecnt_bias);
+	page_frag_cache_drain(&queue->pf_cache);
 	kfree(queue);
 }
 
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 28aea17fa59b..6cef1c241180 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -311,6 +311,7 @@ extern void __free_pages(struct page *page, unsigned int order);
 extern void free_pages(unsigned long addr, unsigned int order);
 
 struct page_frag_cache;
+void page_frag_cache_drain(struct page_frag_cache *nc);
 extern void __page_frag_cache_drain(struct page *page, unsigned int count);
 void *__page_frag_alloc_align(struct page_frag_cache *nc, unsigned int fragsz,
 			      gfp_t gfp_mask, unsigned int align_mask);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 636145c29f70..06aa1ebbd21c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4699,6 +4699,16 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
 	return page;
 }
 
+void page_frag_cache_drain(struct page_frag_cache *nc)
+{
+	if (!nc->va)
+		return;
+
+	__page_frag_cache_drain(virt_to_head_page(nc->va), nc->pagecnt_bias);
+	nc->va = NULL;
+}
+EXPORT_SYMBOL(page_frag_cache_drain);
+
 void __page_frag_cache_drain(struct page *page, unsigned int count)
 {
 	VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
-- 
cgit v1.2.3


From 93e16ea025d234d0ed01d9dc9c819257a2159bb6 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 1 Mar 2024 19:37:37 +0000
Subject: net: gro: rename skb_gro_header_hard()

skb_gro_header_hard() is renamed to skb_gro_may_pull() to match
the convention used by common helpers like pskb_may_pull().

This means the condition is inverted:

	if (skb_gro_header_hard(skb, hlen))
		slow_path();

becomes:

	if (!skb_gro_may_pull(skb, hlen))
		slow_path();

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/geneve.c   | 2 +-
 include/net/gro.h      | 7 ++++---
 net/core/gro.c         | 2 +-
 net/ipv4/fou_core.c    | 2 +-
 net/ipv4/gre_offload.c | 2 +-
 net/ipv4/tcp_offload.c | 2 +-
 6 files changed, 9 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 6f3f9b446b1d..e25e0a31126c 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -508,7 +508,7 @@ static struct sk_buff *geneve_gro_receive(struct sock *sk,
 	gh_len = geneve_hlen(gh);
 
 	hlen = off_gnv + gh_len;
-	if (skb_gro_header_hard(skb, hlen)) {
+	if (!skb_gro_may_pull(skb, hlen)) {
 		gh = skb_gro_header_slow(skb, hlen, off_gnv);
 		if (unlikely(!gh))
 			goto out;
diff --git a/include/net/gro.h b/include/net/gro.h
index b435f0ddbf64..ffc2c96d263b 100644
--- a/include/net/gro.h
+++ b/include/net/gro.h
@@ -145,9 +145,10 @@ static inline void *skb_gro_header_fast(struct sk_buff *skb,
 	return NAPI_GRO_CB(skb)->frag0 + offset;
 }
 
-static inline int skb_gro_header_hard(struct sk_buff *skb, unsigned int hlen)
+static inline bool skb_gro_may_pull(const struct sk_buff *skb,
+				    unsigned int hlen)
 {
-	return NAPI_GRO_CB(skb)->frag0_len < hlen;
+	return hlen <= NAPI_GRO_CB(skb)->frag0_len;
 }
 
 static inline void skb_gro_frag0_invalidate(struct sk_buff *skb)
@@ -172,7 +173,7 @@ static inline void *skb_gro_header(struct sk_buff *skb,
 	void *ptr;
 
 	ptr = skb_gro_header_fast(skb, offset);
-	if (skb_gro_header_hard(skb, hlen))
+	if (!skb_gro_may_pull(skb, hlen))
 		ptr = skb_gro_header_slow(skb, hlen, offset);
 	return ptr;
 }
diff --git a/net/core/gro.c b/net/core/gro.c
index 0759277dc14e..927ccf681490 100644
--- a/net/core/gro.c
+++ b/net/core/gro.c
@@ -700,7 +700,7 @@ static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
 	skb_reset_mac_header(skb);
 	skb_gro_reset_offset(skb, hlen);
 
-	if (unlikely(skb_gro_header_hard(skb, hlen))) {
+	if (unlikely(!skb_gro_may_pull(skb, hlen))) {
 		eth = skb_gro_header_slow(skb, hlen, 0);
 		if (unlikely(!eth)) {
 			net_warn_ratelimited("%s: dropping impossible skb from %s\n",
diff --git a/net/ipv4/fou_core.c b/net/ipv4/fou_core.c
index 0c41076e31ed..a8494f796dca 100644
--- a/net/ipv4/fou_core.c
+++ b/net/ipv4/fou_core.c
@@ -351,7 +351,7 @@ static struct sk_buff *gue_gro_receive(struct sock *sk,
 	optlen = guehdr->hlen << 2;
 	len += optlen;
 
-	if (skb_gro_header_hard(skb, len)) {
+	if (!skb_gro_may_pull(skb, len)) {
 		guehdr = skb_gro_header_slow(skb, len, off);
 		if (unlikely(!guehdr))
 			goto out;
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index 311e70bfce40..5028c72d494a 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -174,7 +174,7 @@ static struct sk_buff *gre_gro_receive(struct list_head *head,
 		grehlen += GRE_HEADER_SECTION;
 
 	hlen = off + grehlen;
-	if (skb_gro_header_hard(skb, hlen)) {
+	if (!skb_gro_may_pull(skb, hlen)) {
 		greh = skb_gro_header_slow(skb, hlen, off);
 		if (unlikely(!greh))
 			goto out;
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 8311c38267b5..875456efc92d 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -204,7 +204,7 @@ struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb)
 		goto out;
 
 	hlen = off + thlen;
-	if (skb_gro_header_hard(skb, hlen)) {
+	if (!skb_gro_may_pull(skb, hlen)) {
 		th = skb_gro_header_slow(skb, hlen, off);
 		if (unlikely(!th))
 			goto out;
-- 
cgit v1.2.3


From bd56a29c7a4ebcd3ca69505a2e676449e60965f3 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 1 Mar 2024 19:37:38 +0000
Subject: net: gro: change skb_gro_network_header()

Change skb_gro_network_header() to accept a const sk_buff
and to no longer check if frag0 is NULL or not.

This allows to remove skb_gro_frag0_invalidate()
which is seen in profiles when header-split is enabled.

sk_buff parameter is constified for skb_gro_header_fast(),
inet_gro_compute_pseudo() and ip6_gro_compute_pseudo().

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/gro.h | 27 ++++++++++++---------------
 1 file changed, 12 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/net/gro.h b/include/net/gro.h
index ffc2c96d263b..3c3666e46b30 100644
--- a/include/net/gro.h
+++ b/include/net/gro.h
@@ -139,7 +139,7 @@ static inline void skb_gro_pull(struct sk_buff *skb, unsigned int len)
 	NAPI_GRO_CB(skb)->data_offset += len;
 }
 
-static inline void *skb_gro_header_fast(struct sk_buff *skb,
+static inline void *skb_gro_header_fast(const struct sk_buff *skb,
 					unsigned int offset)
 {
 	return NAPI_GRO_CB(skb)->frag0 + offset;
@@ -151,24 +151,17 @@ static inline bool skb_gro_may_pull(const struct sk_buff *skb,
 	return hlen <= NAPI_GRO_CB(skb)->frag0_len;
 }
 
-static inline void skb_gro_frag0_invalidate(struct sk_buff *skb)
-{
-	NAPI_GRO_CB(skb)->frag0 = NULL;
-	NAPI_GRO_CB(skb)->frag0_len = 0;
-}
-
 static inline void *skb_gro_header_slow(struct sk_buff *skb, unsigned int hlen,
 					unsigned int offset)
 {
 	if (!pskb_may_pull(skb, hlen))
 		return NULL;
 
-	skb_gro_frag0_invalidate(skb);
 	return skb->data + offset;
 }
 
-static inline void *skb_gro_header(struct sk_buff *skb,
-					unsigned int hlen, unsigned int offset)
+static inline void *skb_gro_header(struct sk_buff *skb, unsigned int hlen,
+				   unsigned int offset)
 {
 	void *ptr;
 
@@ -178,13 +171,16 @@ static inline void *skb_gro_header(struct sk_buff *skb,
 	return ptr;
 }
 
-static inline void *skb_gro_network_header(struct sk_buff *skb)
+static inline void *skb_gro_network_header(const struct sk_buff *skb)
 {
-	return (NAPI_GRO_CB(skb)->frag0 ?: skb->data) +
-	       skb_network_offset(skb);
+	if (skb_gro_may_pull(skb, skb_gro_offset(skb)))
+		return skb_gro_header_fast(skb, skb_network_offset(skb));
+
+	return skb_network_header(skb);
 }
 
-static inline __wsum inet_gro_compute_pseudo(struct sk_buff *skb, int proto)
+static inline __wsum inet_gro_compute_pseudo(const struct sk_buff *skb,
+					     int proto)
 {
 	const struct iphdr *iph = skb_gro_network_header(skb);
 
@@ -422,7 +418,8 @@ static inline struct udphdr *udp_gro_udphdr(struct sk_buff *skb)
 	return uh;
 }
 
-static inline __wsum ip6_gro_compute_pseudo(struct sk_buff *skb, int proto)
+static inline __wsum ip6_gro_compute_pseudo(const struct sk_buff *skb,
+					    int proto)
 {
 	const struct ipv6hdr *iph = skb_gro_network_header(skb);
 
-- 
cgit v1.2.3


From c7583e9f768eeb82f2531c8372584ba89cfade8b Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 1 Mar 2024 19:37:39 +0000
Subject: net: gro: enable fast path for more cases

Currently the so-called GRO fast path is only enabled for
napi_frags_skb() callers.

After the prior patch, we no longer have to clear frag0 whenever
we pulled bytes to skb->head.

We therefore can initialize frag0 to skb->data so that GRO
fast path can be used in the following additional cases:

- Drivers using header split (populating skb->data with headers,
  and having payload in one or more page fragments).

- Drivers not using any page frag (entire packet is in skb->data)

Add a likely() in skb_gro_may_pull() to help the compiler
to generate better code if possible.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/gro.h |  2 +-
 net/core/gro.c    | 23 ++++++++++++++++-------
 2 files changed, 17 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/net/gro.h b/include/net/gro.h
index 3c3666e46b30..2b58671a6549 100644
--- a/include/net/gro.h
+++ b/include/net/gro.h
@@ -148,7 +148,7 @@ static inline void *skb_gro_header_fast(const struct sk_buff *skb,
 static inline bool skb_gro_may_pull(const struct sk_buff *skb,
 				    unsigned int hlen)
 {
-	return hlen <= NAPI_GRO_CB(skb)->frag0_len;
+	return likely(hlen <= NAPI_GRO_CB(skb)->frag0_len);
 }
 
 static inline void *skb_gro_header_slow(struct sk_buff *skb, unsigned int hlen,
diff --git a/net/core/gro.c b/net/core/gro.c
index 927ccf681490..6a0edbd826a1 100644
--- a/net/core/gro.c
+++ b/net/core/gro.c
@@ -369,15 +369,21 @@ static void gro_list_prepare(const struct list_head *head,
 
 static inline void skb_gro_reset_offset(struct sk_buff *skb, u32 nhoff)
 {
-	const struct skb_shared_info *pinfo = skb_shinfo(skb);
-	const skb_frag_t *frag0 = &pinfo->frags[0];
+	const struct skb_shared_info *pinfo;
+	const skb_frag_t *frag0;
+	unsigned int headlen;
 
 	NAPI_GRO_CB(skb)->data_offset = 0;
-	NAPI_GRO_CB(skb)->frag0 = NULL;
-	NAPI_GRO_CB(skb)->frag0_len = 0;
+	headlen = skb_headlen(skb);
+	NAPI_GRO_CB(skb)->frag0 = skb->data;
+	NAPI_GRO_CB(skb)->frag0_len = headlen;
+	if (headlen)
+		return;
+
+	pinfo = skb_shinfo(skb);
+	frag0 = &pinfo->frags[0];
 
-	if (!skb_headlen(skb) && pinfo->nr_frags &&
-	    !PageHighMem(skb_frag_page(frag0)) &&
+	if (pinfo->nr_frags && !PageHighMem(skb_frag_page(frag0)) &&
 	    (!NET_IP_ALIGN || !((skb_frag_off(frag0) + nhoff) & 3))) {
 		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
 		NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
@@ -710,7 +716,10 @@ static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
 		}
 	} else {
 		eth = (const struct ethhdr *)skb->data;
-		gro_pull_from_frag0(skb, hlen);
+
+		if (NAPI_GRO_CB(skb)->frag0 != skb->data)
+			gro_pull_from_frag0(skb, hlen);
+
 		NAPI_GRO_CB(skb)->frag0 += hlen;
 		NAPI_GRO_CB(skb)->frag0_len -= hlen;
 	}
-- 
cgit v1.2.3


From 885c36e59f46375c138de18ff1692f18eff67b7f Mon Sep 17 00:00:00 2001
From: Abhishek Chauhan <quic_abchauha@quicinc.com>
Date: Fri, 1 Mar 2024 12:13:48 -0800
Subject: net: Re-use and set mono_delivery_time bit for userspace tstamp
 packets

Bridge driver today has no support to forward the userspace timestamp
packets and ends up resetting the timestamp. ETF qdisc checks the
packet coming from userspace and encounters to be 0 thereby dropping
time sensitive packets. These changes will allow userspace timestamps
packets to be forwarded from the bridge to NIC drivers.

Setting the same bit (mono_delivery_time) to avoid dropping of
userspace tstamp packets in the forwarding path.

Existing functionality of mono_delivery_time remains unaltered here,
instead just extended with userspace tstamp support for bridge
forwarding path.

Signed-off-by: Abhishek Chauhan <quic_abchauha@quicinc.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://lore.kernel.org/r/20240301201348.2815102-1-quic_abchauha@quicinc.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/skbuff.h | 6 +++---
 net/ipv4/ip_output.c   | 1 +
 net/ipv4/raw.c         | 1 +
 net/ipv6/ip6_output.c  | 2 +-
 net/ipv6/raw.c         | 2 +-
 net/packet/af_packet.c | 4 +++-
 6 files changed, 10 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index d577e0bee18d..3013355b63f5 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -822,9 +822,9 @@ typedef unsigned char *sk_buff_data_t;
  *	@decrypted: Decrypted SKB
  *	@slow_gro: state present at GRO time, slower prepare step required
  *	@mono_delivery_time: When set, skb->tstamp has the
- *		delivery_time in mono clock base (i.e. EDT).  Otherwise, the
- *		skb->tstamp has the (rcv) timestamp at ingress and
- *		delivery_time at egress.
+ *		delivery_time in mono clock base (i.e., EDT) or a clock base chosen
+ *		by SO_TXTIME. If zero, skb->tstamp has the (rcv) timestamp at
+ *		ingress.
  *	@napi_id: id of the NAPI struct this skb came from
  *	@sender_cpu: (aka @napi_id) source CPU in XPS
  *	@alloc_cpu: CPU which did the skb allocation.
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 1fe794967211..33f93dc730a3 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1458,6 +1458,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
 	skb->priority = (cork->tos != -1) ? cork->priority: READ_ONCE(sk->sk_priority);
 	skb->mark = cork->mark;
 	skb->tstamp = cork->transmit_time;
+	skb->mono_delivery_time = !!skb->tstamp;
 	/*
 	 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
 	 * on dst refcount
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index a82fd102df05..494a6284bd7e 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -353,6 +353,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
 	skb->priority = READ_ONCE(sk->sk_priority);
 	skb->mark = sockc->mark;
 	skb->tstamp = sockc->transmit_time;
+	skb->mono_delivery_time = !!skb->tstamp;
 	skb_dst_set(skb, &rt->dst);
 	*rtp = NULL;
 
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index b9dd3a66e423..02eeca5492cd 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1925,7 +1925,7 @@ struct sk_buff *__ip6_make_skb(struct sock *sk,
 	skb->priority = READ_ONCE(sk->sk_priority);
 	skb->mark = cork->base.mark;
 	skb->tstamp = cork->base.transmit_time;
-
+	skb->mono_delivery_time = !!skb->tstamp;
 	ip6_cork_steal_dst(skb, cork);
 	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
 	if (proto == IPPROTO_ICMPV6) {
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 76e6eb3b643d..779274055abf 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -615,7 +615,7 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length,
 	skb->priority = READ_ONCE(sk->sk_priority);
 	skb->mark = sockc->mark;
 	skb->tstamp = sockc->transmit_time;
-
+	skb->mono_delivery_time = !!skb->tstamp;
 	skb_put(skb, length);
 	skb_reset_network_header(skb);
 	iph = ipv6_hdr(skb);
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index c9bbc2686690..0db31ca4982d 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2057,7 +2057,7 @@ retry:
 	skb->priority = READ_ONCE(sk->sk_priority);
 	skb->mark = READ_ONCE(sk->sk_mark);
 	skb->tstamp = sockc.transmit_time;
-
+	skb->mono_delivery_time = !!skb->tstamp;
 	skb_setup_tx_timestamp(skb, sockc.tsflags);
 
 	if (unlikely(extra_len == 4))
@@ -2586,6 +2586,7 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
 	skb->priority = READ_ONCE(po->sk.sk_priority);
 	skb->mark = READ_ONCE(po->sk.sk_mark);
 	skb->tstamp = sockc->transmit_time;
+	skb->mono_delivery_time = !!skb->tstamp;
 	skb_setup_tx_timestamp(skb, sockc->tsflags);
 	skb_zcopy_set_nouarg(skb, ph.raw);
 
@@ -3064,6 +3065,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
 	skb->priority = READ_ONCE(sk->sk_priority);
 	skb->mark = sockc.mark;
 	skb->tstamp = sockc.transmit_time;
+	skb->mono_delivery_time = !!skb->tstamp;
 
 	if (unlikely(extra_len == 4))
 		skb->no_fcs = 1;
-- 
cgit v1.2.3


From e5560011692981bc8bfeae7fc0673c65a02badba Mon Sep 17 00:00:00 2001
From: "Ricardo B. Marliere" <ricardo@marliere.net>
Date: Sat, 2 Mar 2024 14:06:02 -0300
Subject: nfc: core: make nfc_class constant

Since commit 43a7206b0963 ("driver core: class: make class_register() take
a const *"), the driver core allows for struct class to be in read-only
memory, so move the nfc_class structure to be declared at build time
placing it into read-only memory, instead of having to be dynamically
allocated at boot time.

Suggested-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Ricardo B. Marliere <ricardo@marliere.net>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://lore.kernel.org/r/20240302-class_cleanup-net-next-v1-6-8fa378595b93@marliere.net
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/nfc/nfc.h | 2 +-
 net/nfc/core.c        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/nfc/nfc.h b/include/net/nfc/nfc.h
index 5dee575fbe86..3d07abacf08b 100644
--- a/include/net/nfc/nfc.h
+++ b/include/net/nfc/nfc.h
@@ -196,7 +196,7 @@ struct nfc_dev {
 };
 #define to_nfc_dev(_dev) container_of(_dev, struct nfc_dev, dev)
 
-extern struct class nfc_class;
+extern const struct class nfc_class;
 
 struct nfc_dev *nfc_allocate_device(const struct nfc_ops *ops,
 				    u32 supported_protocols,
diff --git a/net/nfc/core.c b/net/nfc/core.c
index eb2c0959e5b6..e58dc6405054 100644
--- a/net/nfc/core.c
+++ b/net/nfc/core.c
@@ -1015,7 +1015,7 @@ static void nfc_check_pres_timeout(struct timer_list *t)
 	schedule_work(&dev->check_pres_work);
 }
 
-struct class nfc_class = {
+const struct class nfc_class = {
 	.name = "nfc",
 	.dev_release = nfc_release,
 };
-- 
cgit v1.2.3


From 12a66e77c4999b97a2c2b8f5e4e7533c656cee12 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 30 Jan 2024 20:30:03 +0000
Subject: rxrpc: Differentiate PING ACK transmission traces.

There are three points that transmit PING ACKs and all of them use the same
trace string.  Change two of them to use different strings.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Marc Dionne <marc.dionne@auristor.com>
cc: "David S. Miller" <davem@davemloft.net>
cc: Eric Dumazet <edumazet@google.com>
cc: Jakub Kicinski <kuba@kernel.org>
cc: Paolo Abeni <pabeni@redhat.com>
cc: linux-afs@lists.infradead.org
cc: netdev@vger.kernel.org
---
 include/trace/events/rxrpc.h | 2 ++
 net/rxrpc/call_event.c       | 4 ++--
 2 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h
index c730cd732348..3b726a6c8c42 100644
--- a/include/trace/events/rxrpc.h
+++ b/include/trace/events/rxrpc.h
@@ -364,11 +364,13 @@
 
 #define rxrpc_propose_ack_traces \
 	EM(rxrpc_propose_ack_client_tx_end,	"ClTxEnd") \
+	EM(rxrpc_propose_ack_delayed_ack,	"DlydAck") \
 	EM(rxrpc_propose_ack_input_data,	"DataIn ") \
 	EM(rxrpc_propose_ack_input_data_hole,	"DataInH") \
 	EM(rxrpc_propose_ack_ping_for_keepalive, "KeepAlv") \
 	EM(rxrpc_propose_ack_ping_for_lost_ack,	"LostAck") \
 	EM(rxrpc_propose_ack_ping_for_lost_reply, "LostRpl") \
+	EM(rxrpc_propose_ack_ping_for_0_retrans, "0-Retrn") \
 	EM(rxrpc_propose_ack_ping_for_old_rtt,	"OldRtt ") \
 	EM(rxrpc_propose_ack_ping_for_params,	"Params ") \
 	EM(rxrpc_propose_ack_ping_for_rtt,	"Rtt    ") \
diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c
index ef28ebf37c7d..bc1a5abb7d6f 100644
--- a/net/rxrpc/call_event.c
+++ b/net/rxrpc/call_event.c
@@ -197,7 +197,7 @@ no_resend:
 		if (ktime_to_us(ack_ts) < (call->peer->srtt_us >> 3))
 			goto out;
 		rxrpc_send_ACK(call, RXRPC_ACK_PING, 0,
-			       rxrpc_propose_ack_ping_for_lost_ack);
+			       rxrpc_propose_ack_ping_for_0_retrans);
 		goto out;
 	}
 
@@ -387,7 +387,7 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb)
 		trace_rxrpc_timer(call, rxrpc_timer_exp_ack, now);
 		call->delay_ack_at = now + MAX_JIFFY_OFFSET;
 		rxrpc_send_ACK(call, RXRPC_ACK_DELAY, 0,
-			       rxrpc_propose_ack_ping_for_lost_ack);
+			       rxrpc_propose_ack_delayed_ack);
 	}
 
 	t = call->ack_lost_at;
-- 
cgit v1.2.3


From 153f90a066dd4a91ef7edc0df3964dd097a5e2a5 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 30 Jan 2024 21:37:16 +0000
Subject: rxrpc: Use ktimes for call timeout tracking and set the timer lazily

Track the call timeouts as ktimes rather than jiffies as the latter's
granularity is too high and only set the timer at the end of the event
handling function.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Marc Dionne <marc.dionne@auristor.com>
cc: "David S. Miller" <davem@davemloft.net>
cc: Eric Dumazet <edumazet@google.com>
cc: Jakub Kicinski <kuba@kernel.org>
cc: Paolo Abeni <pabeni@redhat.com>
cc: linux-afs@lists.infradead.org
cc: netdev@vger.kernel.org
---
 include/trace/events/rxrpc.h | 174 ++++++++++++++++++++-------------------
 net/rxrpc/af_rxrpc.c         |  12 +--
 net/rxrpc/ar-internal.h      |  35 ++++----
 net/rxrpc/call_event.c       | 189 +++++++++++++++++++++----------------------
 net/rxrpc/call_object.c      |  56 ++++++-------
 net/rxrpc/input.c            |  28 +++----
 net/rxrpc/misc.c             |   8 +-
 net/rxrpc/output.c           |  40 ++++-----
 net/rxrpc/proc.c             |  10 +--
 net/rxrpc/rtt.c              |  36 ++++-----
 net/rxrpc/sendmsg.c          |  25 +++---
 net/rxrpc/sysctl.c           |  16 ++--
 12 files changed, 307 insertions(+), 322 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h
index 3b726a6c8c42..a1b126a6b0d7 100644
--- a/include/trace/events/rxrpc.h
+++ b/include/trace/events/rxrpc.h
@@ -119,6 +119,7 @@
 	EM(rxrpc_call_poke_complete,		"Compl")	\
 	EM(rxrpc_call_poke_error,		"Error")	\
 	EM(rxrpc_call_poke_idle,		"Idle")		\
+	EM(rxrpc_call_poke_set_timeout,		"Set-timo")	\
 	EM(rxrpc_call_poke_start,		"Start")	\
 	EM(rxrpc_call_poke_timer,		"Timer")	\
 	E_(rxrpc_call_poke_timer_now,		"Timer-now")
@@ -340,27 +341,16 @@
 	E_(rxrpc_rtt_rx_requested_ack,		"RACK")
 
 #define rxrpc_timer_traces \
-	EM(rxrpc_timer_begin,			"Begin ") \
-	EM(rxrpc_timer_exp_ack,			"ExpAck") \
-	EM(rxrpc_timer_exp_hard,		"ExpHrd") \
-	EM(rxrpc_timer_exp_idle,		"ExpIdl") \
-	EM(rxrpc_timer_exp_keepalive,		"ExpKA ") \
-	EM(rxrpc_timer_exp_lost_ack,		"ExpLoA") \
-	EM(rxrpc_timer_exp_normal,		"ExpNml") \
-	EM(rxrpc_timer_exp_ping,		"ExpPng") \
-	EM(rxrpc_timer_exp_resend,		"ExpRsn") \
-	EM(rxrpc_timer_init_for_reply,		"IniRpl") \
-	EM(rxrpc_timer_init_for_send_reply,	"SndRpl") \
-	EM(rxrpc_timer_restart,			"Restrt") \
-	EM(rxrpc_timer_set_for_ack,		"SetAck") \
-	EM(rxrpc_timer_set_for_hard,		"SetHrd") \
-	EM(rxrpc_timer_set_for_idle,		"SetIdl") \
-	EM(rxrpc_timer_set_for_keepalive,	"KeepAl") \
-	EM(rxrpc_timer_set_for_lost_ack,	"SetLoA") \
-	EM(rxrpc_timer_set_for_normal,		"SetNml") \
-	EM(rxrpc_timer_set_for_ping,		"SetPng") \
-	EM(rxrpc_timer_set_for_resend,		"SetRTx") \
-	E_(rxrpc_timer_set_for_send,		"SetSnd")
+	EM(rxrpc_timer_trace_delayed_ack,	"DelayAck ") \
+	EM(rxrpc_timer_trace_expect_rx,		"ExpectRx ") \
+	EM(rxrpc_timer_trace_hard,		"HardLimit") \
+	EM(rxrpc_timer_trace_idle,		"IdleLimit") \
+	EM(rxrpc_timer_trace_keepalive,		"KeepAlive") \
+	EM(rxrpc_timer_trace_lost_ack,		"LostAck  ") \
+	EM(rxrpc_timer_trace_ping,		"DelayPing") \
+	EM(rxrpc_timer_trace_resend,		"Resend   ") \
+	EM(rxrpc_timer_trace_resend_reset,	"ResendRst") \
+	E_(rxrpc_timer_trace_resend_tx,		"ResendTx ")
 
 #define rxrpc_propose_ack_traces \
 	EM(rxrpc_propose_ack_client_tx_end,	"ClTxEnd") \
@@ -1314,90 +1304,112 @@ TRACE_EVENT(rxrpc_rtt_rx,
 		      __entry->rto)
 	    );
 
-TRACE_EVENT(rxrpc_timer,
-	    TP_PROTO(struct rxrpc_call *call, enum rxrpc_timer_trace why,
-		     unsigned long now),
+TRACE_EVENT(rxrpc_timer_set,
+	    TP_PROTO(struct rxrpc_call *call, ktime_t delay,
+		     enum rxrpc_timer_trace why),
 
-	    TP_ARGS(call, why, now),
+	    TP_ARGS(call, delay, why),
 
 	    TP_STRUCT__entry(
 		    __field(unsigned int,		call)
 		    __field(enum rxrpc_timer_trace,	why)
-		    __field(long,			now)
-		    __field(long,			ack_at)
-		    __field(long,			ack_lost_at)
-		    __field(long,			resend_at)
-		    __field(long,			ping_at)
-		    __field(long,			expect_rx_by)
-		    __field(long,			expect_req_by)
-		    __field(long,			expect_term_by)
-		    __field(long,			timer)
+		    __field(ktime_t,			delay)
 			     ),
 
 	    TP_fast_assign(
 		    __entry->call		= call->debug_id;
 		    __entry->why		= why;
-		    __entry->now		= now;
-		    __entry->ack_at		= call->delay_ack_at;
-		    __entry->ack_lost_at	= call->ack_lost_at;
-		    __entry->resend_at		= call->resend_at;
-		    __entry->expect_rx_by	= call->expect_rx_by;
-		    __entry->expect_req_by	= call->expect_req_by;
-		    __entry->expect_term_by	= call->expect_term_by;
-		    __entry->timer		= call->timer.expires;
+		    __entry->delay		= delay;
 			   ),
 
-	    TP_printk("c=%08x %s a=%ld la=%ld r=%ld xr=%ld xq=%ld xt=%ld t=%ld",
+	    TP_printk("c=%08x %s to=%lld",
 		      __entry->call,
 		      __print_symbolic(__entry->why, rxrpc_timer_traces),
-		      __entry->ack_at - __entry->now,
-		      __entry->ack_lost_at - __entry->now,
-		      __entry->resend_at - __entry->now,
-		      __entry->expect_rx_by - __entry->now,
-		      __entry->expect_req_by - __entry->now,
-		      __entry->expect_term_by - __entry->now,
-		      __entry->timer - __entry->now)
+		      ktime_to_us(__entry->delay))
+	    );
+
+TRACE_EVENT(rxrpc_timer_exp,
+	    TP_PROTO(struct rxrpc_call *call, ktime_t delay,
+		     enum rxrpc_timer_trace why),
+
+	    TP_ARGS(call, delay, why),
+
+	    TP_STRUCT__entry(
+		    __field(unsigned int,		call)
+		    __field(enum rxrpc_timer_trace,	why)
+		    __field(ktime_t,			delay)
+			     ),
+
+	    TP_fast_assign(
+		    __entry->call		= call->debug_id;
+		    __entry->why		= why;
+		    __entry->delay		= delay;
+			   ),
+
+	    TP_printk("c=%08x %s to=%lld",
+		      __entry->call,
+		      __print_symbolic(__entry->why, rxrpc_timer_traces),
+		      ktime_to_us(__entry->delay))
+	    );
+
+TRACE_EVENT(rxrpc_timer_can,
+	    TP_PROTO(struct rxrpc_call *call, enum rxrpc_timer_trace why),
+
+	    TP_ARGS(call, why),
+
+	    TP_STRUCT__entry(
+		    __field(unsigned int,		call)
+		    __field(enum rxrpc_timer_trace,	why)
+			     ),
+
+	    TP_fast_assign(
+		    __entry->call		= call->debug_id;
+		    __entry->why		= why;
+			   ),
+
+	    TP_printk("c=%08x %s",
+		      __entry->call,
+		      __print_symbolic(__entry->why, rxrpc_timer_traces))
+	    );
+
+TRACE_EVENT(rxrpc_timer_restart,
+	    TP_PROTO(struct rxrpc_call *call, ktime_t delay, unsigned long delayj),
+
+	    TP_ARGS(call, delay, delayj),
+
+	    TP_STRUCT__entry(
+		    __field(unsigned int,		call)
+		    __field(unsigned long,		delayj)
+		    __field(ktime_t,			delay)
+			     ),
+
+	    TP_fast_assign(
+		    __entry->call		= call->debug_id;
+		    __entry->delayj		= delayj;
+		    __entry->delay		= delay;
+			   ),
+
+	    TP_printk("c=%08x to=%lld j=%ld",
+		      __entry->call,
+		      ktime_to_us(__entry->delay),
+		      __entry->delayj)
 	    );
 
 TRACE_EVENT(rxrpc_timer_expired,
-	    TP_PROTO(struct rxrpc_call *call, unsigned long now),
+	    TP_PROTO(struct rxrpc_call *call),
 
-	    TP_ARGS(call, now),
+	    TP_ARGS(call),
 
 	    TP_STRUCT__entry(
 		    __field(unsigned int,	call)
-		    __field(long,		now)
-		    __field(long,		ack_at)
-		    __field(long,		ack_lost_at)
-		    __field(long,		resend_at)
-		    __field(long,		ping_at)
-		    __field(long,		expect_rx_by)
-		    __field(long,		expect_req_by)
-		    __field(long,		expect_term_by)
-		    __field(long,		timer)
 			     ),
 
 	    TP_fast_assign(
 		    __entry->call		= call->debug_id;
-		    __entry->now		= now;
-		    __entry->ack_at		= call->delay_ack_at;
-		    __entry->ack_lost_at	= call->ack_lost_at;
-		    __entry->resend_at		= call->resend_at;
-		    __entry->expect_rx_by	= call->expect_rx_by;
-		    __entry->expect_req_by	= call->expect_req_by;
-		    __entry->expect_term_by	= call->expect_term_by;
-		    __entry->timer		= call->timer.expires;
 			   ),
 
-	    TP_printk("c=%08x EXPIRED a=%ld la=%ld r=%ld xr=%ld xq=%ld xt=%ld t=%ld",
-		      __entry->call,
-		      __entry->ack_at - __entry->now,
-		      __entry->ack_lost_at - __entry->now,
-		      __entry->resend_at - __entry->now,
-		      __entry->expect_rx_by - __entry->now,
-		      __entry->expect_req_by - __entry->now,
-		      __entry->expect_term_by - __entry->now,
-		      __entry->timer - __entry->now)
+	    TP_printk("c=%08x EXPIRED",
+		      __entry->call)
 	    );
 
 TRACE_EVENT(rxrpc_rx_lose,
@@ -1507,7 +1519,7 @@ TRACE_EVENT(rxrpc_drop_ack,
 
 TRACE_EVENT(rxrpc_retransmit,
 	    TP_PROTO(struct rxrpc_call *call, rxrpc_seq_t seq,
-		     rxrpc_serial_t serial, s64 expiry),
+		     rxrpc_serial_t serial, ktime_t expiry),
 
 	    TP_ARGS(call, seq, serial, expiry),
 
@@ -1515,7 +1527,7 @@ TRACE_EVENT(rxrpc_retransmit,
 		    __field(unsigned int,	call)
 		    __field(rxrpc_seq_t,	seq)
 		    __field(rxrpc_serial_t,	serial)
-		    __field(s64,		expiry)
+		    __field(ktime_t,		expiry)
 			     ),
 
 	    TP_fast_assign(
@@ -1529,7 +1541,7 @@ TRACE_EVENT(rxrpc_retransmit,
 		      __entry->call,
 		      __entry->seq,
 		      __entry->serial,
-		      __entry->expiry)
+		      ktime_to_us(__entry->expiry))
 	    );
 
 TRACE_EVENT(rxrpc_congest,
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index 465bfe5eb061..5222bc97d192 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -487,7 +487,7 @@ EXPORT_SYMBOL(rxrpc_kernel_new_call_notification);
  * rxrpc_kernel_set_max_life - Set maximum lifespan on a call
  * @sock: The socket the call is on
  * @call: The call to configure
- * @hard_timeout: The maximum lifespan of the call in jiffies
+ * @hard_timeout: The maximum lifespan of the call in ms
  *
  * Set the maximum lifespan of a call.  The call will end with ETIME or
  * ETIMEDOUT if it takes longer than this.
@@ -495,14 +495,14 @@ EXPORT_SYMBOL(rxrpc_kernel_new_call_notification);
 void rxrpc_kernel_set_max_life(struct socket *sock, struct rxrpc_call *call,
 			       unsigned long hard_timeout)
 {
-	unsigned long now;
+	ktime_t delay = ms_to_ktime(hard_timeout), expect_term_by;
 
 	mutex_lock(&call->user_mutex);
 
-	now = jiffies;
-	hard_timeout += now;
-	WRITE_ONCE(call->expect_term_by, hard_timeout);
-	rxrpc_reduce_call_timer(call, hard_timeout, now, rxrpc_timer_set_for_hard);
+	expect_term_by = ktime_add(ktime_get_real(), delay);
+	WRITE_ONCE(call->expect_term_by, expect_term_by);
+	trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_hard);
+	rxrpc_poke_call(call, rxrpc_call_poke_set_timeout);
 
 	mutex_unlock(&call->user_mutex);
 }
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 47f4689379ca..21ecac22b51d 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -352,8 +352,8 @@ struct rxrpc_peer {
 	u32			mdev_us;	/* medium deviation			*/
 	u32			mdev_max_us;	/* maximal mdev for the last rtt period	*/
 	u32			rttvar_us;	/* smoothed mdev_max			*/
-	u32			rto_j;		/* Retransmission timeout in jiffies */
-	u8			backoff;	/* Backoff timeout */
+	u32			rto_us;		/* Retransmission timeout in usec */
+	u8			backoff;	/* Backoff timeout (as shift) */
 
 	u8			cong_ssthresh;	/* Congestion slow-start threshold */
 };
@@ -620,17 +620,17 @@ struct rxrpc_call {
 	const struct rxrpc_security *security;	/* applied security module */
 	struct mutex		user_mutex;	/* User access mutex */
 	struct sockaddr_rxrpc	dest_srx;	/* Destination address */
-	unsigned long		delay_ack_at;	/* When DELAY ACK needs to happen */
-	unsigned long		ack_lost_at;	/* When ACK is figured as lost */
-	unsigned long		resend_at;	/* When next resend needs to happen */
-	unsigned long		ping_at;	/* When next to send a ping */
-	unsigned long		keepalive_at;	/* When next to send a keepalive ping */
-	unsigned long		expect_rx_by;	/* When we expect to get a packet by */
-	unsigned long		expect_req_by;	/* When we expect to get a request DATA packet by */
-	unsigned long		expect_term_by;	/* When we expect call termination by */
-	u32			next_rx_timo;	/* Timeout for next Rx packet (jif) */
-	u32			next_req_timo;	/* Timeout for next Rx request packet (jif) */
-	u32			hard_timo;	/* Maximum lifetime or 0 (jif) */
+	ktime_t			delay_ack_at;	/* When DELAY ACK needs to happen */
+	ktime_t			ack_lost_at;	/* When ACK is figured as lost */
+	ktime_t			resend_at;	/* When next resend needs to happen */
+	ktime_t			ping_at;	/* When next to send a ping */
+	ktime_t			keepalive_at;	/* When next to send a keepalive ping */
+	ktime_t			expect_rx_by;	/* When we expect to get a packet by */
+	ktime_t			expect_req_by;	/* When we expect to get a request DATA packet by */
+	ktime_t			expect_term_by;	/* When we expect call termination by */
+	u32			next_rx_timo;	/* Timeout for next Rx packet (ms) */
+	u32			next_req_timo;	/* Timeout for next Rx request packet (ms) */
+	u32			hard_timo;	/* Maximum lifetime or 0 (s) */
 	struct timer_list	timer;		/* Combined event timer */
 	struct work_struct	destroyer;	/* In-process-context destroyer */
 	rxrpc_notify_rx_t	notify_rx;	/* kernel service Rx notification function */
@@ -675,7 +675,7 @@ struct rxrpc_call {
 	rxrpc_seq_t		tx_transmitted;	/* Highest packet transmitted */
 	rxrpc_seq_t		tx_prepared;	/* Highest Tx slot prepared. */
 	rxrpc_seq_t		tx_top;		/* Highest Tx slot allocated. */
-	u16			tx_backoff;	/* Delay to insert due to Tx failure */
+	u16			tx_backoff;	/* Delay to insert due to Tx failure (ms) */
 	u8			tx_winsize;	/* Maximum size of Tx window */
 #define RXRPC_TX_MAX_WINDOW	128
 	ktime_t			tx_last_sent;	/* Last time a transmission occurred */
@@ -866,11 +866,6 @@ void rxrpc_propose_delay_ACK(struct rxrpc_call *, rxrpc_serial_t,
 void rxrpc_shrink_call_tx_buffer(struct rxrpc_call *);
 void rxrpc_resend(struct rxrpc_call *call, struct sk_buff *ack_skb);
 
-void rxrpc_reduce_call_timer(struct rxrpc_call *call,
-			     unsigned long expire_at,
-			     unsigned long now,
-			     enum rxrpc_timer_trace why);
-
 bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb);
 
 /*
@@ -1214,7 +1209,7 @@ static inline int rxrpc_abort_eproto(struct rxrpc_call *call,
  */
 void rxrpc_peer_add_rtt(struct rxrpc_call *, enum rxrpc_rtt_rx_trace, int,
 			rxrpc_serial_t, rxrpc_serial_t, ktime_t, ktime_t);
-unsigned long rxrpc_get_rto_backoff(struct rxrpc_peer *, bool);
+ktime_t rxrpc_get_rto_backoff(struct rxrpc_peer *peer, bool retrans);
 void rxrpc_peer_init_rtt(struct rxrpc_peer *);
 
 /*
diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c
index bc1a5abb7d6f..2a9f74eb7c46 100644
--- a/net/rxrpc/call_event.c
+++ b/net/rxrpc/call_event.c
@@ -23,14 +23,14 @@
 void rxrpc_propose_ping(struct rxrpc_call *call, u32 serial,
 			enum rxrpc_propose_ack_trace why)
 {
-	unsigned long now = jiffies;
-	unsigned long ping_at = now + rxrpc_idle_ack_delay;
+	ktime_t delay = ms_to_ktime(READ_ONCE(rxrpc_idle_ack_delay));
+	ktime_t now = ktime_get_real();
+	ktime_t ping_at = ktime_add(now, delay);
 
-	if (time_before(ping_at, call->ping_at)) {
+	trace_rxrpc_propose_ack(call, why, RXRPC_ACK_PING, serial);
+	if (ktime_before(ping_at, call->ping_at)) {
 		call->ping_at = ping_at;
-		rxrpc_reduce_call_timer(call, ping_at, now,
-					rxrpc_timer_set_for_ping);
-		trace_rxrpc_propose_ack(call, why, RXRPC_ACK_PING, serial);
+		trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_ping);
 	}
 }
 
@@ -40,25 +40,18 @@ void rxrpc_propose_ping(struct rxrpc_call *call, u32 serial,
 void rxrpc_propose_delay_ACK(struct rxrpc_call *call, rxrpc_serial_t serial,
 			     enum rxrpc_propose_ack_trace why)
 {
-	unsigned long expiry = rxrpc_soft_ack_delay;
-	unsigned long now = jiffies, ack_at;
+	ktime_t now = ktime_get_real(), delay;
 
-	if (rxrpc_soft_ack_delay < expiry)
-		expiry = rxrpc_soft_ack_delay;
-	if (call->peer->srtt_us != 0)
-		ack_at = usecs_to_jiffies(call->peer->srtt_us >> 3);
+	trace_rxrpc_propose_ack(call, why, RXRPC_ACK_DELAY, serial);
+
+	if (call->peer->srtt_us)
+		delay = (call->peer->srtt_us >> 3) * NSEC_PER_USEC;
 	else
-		ack_at = expiry;
-
-	ack_at += READ_ONCE(call->tx_backoff);
-	ack_at += now;
-	if (time_before(ack_at, call->delay_ack_at)) {
-		call->delay_ack_at = ack_at;
-		rxrpc_reduce_call_timer(call, ack_at, now,
-					rxrpc_timer_set_for_ack);
-	}
+		delay = ms_to_ktime(READ_ONCE(rxrpc_soft_ack_delay));
+	ktime_add_ms(delay, call->tx_backoff);
 
-	trace_rxrpc_propose_ack(call, why, RXRPC_ACK_DELAY, serial);
+	call->delay_ack_at = ktime_add(now, delay);
+	trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_delayed_ack);
 }
 
 /*
@@ -77,9 +70,8 @@ void rxrpc_resend(struct rxrpc_call *call, struct sk_buff *ack_skb)
 	struct rxrpc_ackpacket *ack = NULL;
 	struct rxrpc_skb_priv *sp;
 	struct rxrpc_txbuf *txb;
-	unsigned long resend_at;
 	rxrpc_seq_t transmitted = call->tx_transmitted;
-	ktime_t now, max_age, oldest, ack_ts;
+	ktime_t now, max_age, oldest, ack_ts, delay;
 	bool unacked = false;
 	unsigned int i;
 	LIST_HEAD(retrans_queue);
@@ -87,7 +79,7 @@ void rxrpc_resend(struct rxrpc_call *call, struct sk_buff *ack_skb)
 	_enter("{%d,%d}", call->acks_hard_ack, call->tx_top);
 
 	now = ktime_get_real();
-	max_age = ktime_sub_us(now, jiffies_to_usecs(call->peer->rto_j));
+	max_age = ktime_sub_us(now, call->peer->rto_us);
 	oldest = now;
 
 	if (list_empty(&call->tx_buffer))
@@ -178,10 +170,8 @@ void rxrpc_resend(struct rxrpc_call *call, struct sk_buff *ack_skb)
 
 no_further_resend:
 no_resend:
-	resend_at = nsecs_to_jiffies(ktime_to_ns(ktime_sub(now, oldest)));
-	resend_at += jiffies + rxrpc_get_rto_backoff(call->peer,
-						     !list_empty(&retrans_queue));
-	call->resend_at = resend_at;
+	delay = rxrpc_get_rto_backoff(call->peer, !list_empty(&retrans_queue));
+	call->resend_at = ktime_add(oldest, delay);
 
 	if (unacked)
 		rxrpc_congestion_timeout(call);
@@ -191,8 +181,7 @@ no_resend:
 	 * retransmitting data.
 	 */
 	if (list_empty(&retrans_queue)) {
-		rxrpc_reduce_call_timer(call, resend_at, jiffies,
-					rxrpc_timer_set_for_resend);
+		trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_resend_reset);
 		ack_ts = ktime_sub(now, call->acks_latest_ts);
 		if (ktime_to_us(ack_ts) < (call->peer->srtt_us >> 3))
 			goto out;
@@ -218,13 +207,11 @@ out:
  */
 static void rxrpc_begin_service_reply(struct rxrpc_call *call)
 {
-	unsigned long now = jiffies;
-
 	rxrpc_set_call_state(call, RXRPC_CALL_SERVER_SEND_REPLY);
-	call->delay_ack_at = now + MAX_JIFFY_OFFSET;
 	if (call->ackr_reason == RXRPC_ACK_DELAY)
 		call->ackr_reason = 0;
-	trace_rxrpc_timer(call, rxrpc_timer_init_for_send_reply, now);
+	call->delay_ack_at = KTIME_MAX;
+	trace_rxrpc_timer_can(call, rxrpc_timer_trace_delayed_ack);
 }
 
 /*
@@ -333,8 +320,8 @@ static void rxrpc_send_initial_ping(struct rxrpc_call *call)
  */
 bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb)
 {
-	unsigned long now, next, t;
-	bool resend = false, expired = false;
+	ktime_t now, t;
+	bool resend = false;
 	s32 abort_code;
 
 	rxrpc_see_call(call, rxrpc_call_see_input);
@@ -362,66 +349,69 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb)
 		rxrpc_input_call_packet(call, skb);
 
 	/* If we see our async-event poke, check for timeout trippage. */
-	now = jiffies;
-	t = call->expect_rx_by;
-	if (time_after_eq(now, t)) {
-		trace_rxrpc_timer(call, rxrpc_timer_exp_normal, now);
-		expired = true;
+	now = ktime_get_real();
+	t = ktime_sub(call->expect_rx_by, now);
+	if (t <= 0) {
+		trace_rxrpc_timer_exp(call, t, rxrpc_timer_trace_expect_rx);
+		goto expired;
 	}
 
-	t = call->expect_req_by;
-	if (__rxrpc_call_state(call) == RXRPC_CALL_SERVER_RECV_REQUEST &&
-	    time_after_eq(now, t)) {
-		trace_rxrpc_timer(call, rxrpc_timer_exp_idle, now);
-		expired = true;
+	t = ktime_sub(call->expect_req_by, now);
+	if (t <= 0) {
+		call->expect_req_by = KTIME_MAX;
+		if (__rxrpc_call_state(call) == RXRPC_CALL_SERVER_RECV_REQUEST) {
+			trace_rxrpc_timer_exp(call, t, rxrpc_timer_trace_idle);
+			goto expired;
+		}
 	}
 
-	t = READ_ONCE(call->expect_term_by);
-	if (time_after_eq(now, t)) {
-		trace_rxrpc_timer(call, rxrpc_timer_exp_hard, now);
-		expired = true;
+	t = ktime_sub(READ_ONCE(call->expect_term_by), now);
+	if (t <= 0) {
+		trace_rxrpc_timer_exp(call, t, rxrpc_timer_trace_hard);
+		goto expired;
 	}
 
-	t = call->delay_ack_at;
-	if (time_after_eq(now, t)) {
-		trace_rxrpc_timer(call, rxrpc_timer_exp_ack, now);
-		call->delay_ack_at = now + MAX_JIFFY_OFFSET;
+	t = ktime_sub(call->delay_ack_at, now);
+	if (t <= 0) {
+		trace_rxrpc_timer_exp(call, t, rxrpc_timer_trace_delayed_ack);
+		call->delay_ack_at = KTIME_MAX;
 		rxrpc_send_ACK(call, RXRPC_ACK_DELAY, 0,
 			       rxrpc_propose_ack_delayed_ack);
 	}
 
-	t = call->ack_lost_at;
-	if (time_after_eq(now, t)) {
-		trace_rxrpc_timer(call, rxrpc_timer_exp_lost_ack, now);
-		call->ack_lost_at = now + MAX_JIFFY_OFFSET;
+	t = ktime_sub(call->ack_lost_at, now);
+	if (t <= 0) {
+		trace_rxrpc_timer_exp(call, t, rxrpc_timer_trace_lost_ack);
+		call->ack_lost_at = KTIME_MAX;
 		set_bit(RXRPC_CALL_EV_ACK_LOST, &call->events);
 	}
 
-	t = call->keepalive_at;
-	if (time_after_eq(now, t)) {
-		trace_rxrpc_timer(call, rxrpc_timer_exp_keepalive, now);
-		call->keepalive_at = now + MAX_JIFFY_OFFSET;
+	t = ktime_sub(call->ping_at, now);
+	if (t <= 0) {
+		trace_rxrpc_timer_exp(call, t, rxrpc_timer_trace_ping);
+		call->ping_at = KTIME_MAX;
 		rxrpc_send_ACK(call, RXRPC_ACK_PING, 0,
 			       rxrpc_propose_ack_ping_for_keepalive);
 	}
 
-	t = call->ping_at;
-	if (time_after_eq(now, t)) {
-		trace_rxrpc_timer(call, rxrpc_timer_exp_ping, now);
-		call->ping_at = now + MAX_JIFFY_OFFSET;
-		rxrpc_send_ACK(call, RXRPC_ACK_PING, 0,
-			       rxrpc_propose_ack_ping_for_keepalive);
-	}
-
-	t = call->resend_at;
-	if (time_after_eq(now, t)) {
-		trace_rxrpc_timer(call, rxrpc_timer_exp_resend, now);
-		call->resend_at = now + MAX_JIFFY_OFFSET;
+	t = ktime_sub(call->resend_at, now);
+	if (t <= 0) {
+		trace_rxrpc_timer_exp(call, t, rxrpc_timer_trace_resend);
+		call->resend_at = KTIME_MAX;
 		resend = true;
 	}
 
 	rxrpc_transmit_some_data(call);
 
+	now = ktime_get_real();
+	t = ktime_sub(call->keepalive_at, now);
+	if (t <= 0) {
+		trace_rxrpc_timer_exp(call, t, rxrpc_timer_trace_keepalive);
+		call->keepalive_at = KTIME_MAX;
+		rxrpc_send_ACK(call, RXRPC_ACK_PING, 0,
+			       rxrpc_propose_ack_ping_for_keepalive);
+	}
+
 	if (skb) {
 		struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
 
@@ -433,19 +423,6 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb)
 		rxrpc_send_initial_ping(call);
 
 	/* Process events */
-	if (expired) {
-		if (test_bit(RXRPC_CALL_RX_HEARD, &call->flags) &&
-		    (int)call->conn->hi_serial - (int)call->rx_serial > 0) {
-			trace_rxrpc_call_reset(call);
-			rxrpc_abort_call(call, 0, RX_CALL_DEAD, -ECONNRESET,
-					 rxrpc_abort_call_reset);
-		} else {
-			rxrpc_abort_call(call, 0, RX_CALL_TIMEOUT, -ETIME,
-					 rxrpc_abort_call_timeout);
-		}
-		goto out;
-	}
-
 	if (test_and_clear_bit(RXRPC_CALL_EV_ACK_LOST, &call->events))
 		rxrpc_send_ACK(call, RXRPC_ACK_PING, 0,
 			       rxrpc_propose_ack_ping_for_lost_ack);
@@ -474,23 +451,33 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb)
 
 	/* Make sure the timer is restarted */
 	if (!__rxrpc_call_is_complete(call)) {
-		next = call->expect_rx_by;
+		ktime_t next = READ_ONCE(call->expect_term_by), delay;
 
-#define set(T) { t = READ_ONCE(T); if (time_before(t, next)) next = t; }
+#define set(T) { ktime_t _t = (T); if (ktime_before(_t, next)) next = _t; }
 
 		set(call->expect_req_by);
-		set(call->expect_term_by);
+		set(call->expect_rx_by);
 		set(call->delay_ack_at);
 		set(call->ack_lost_at);
 		set(call->resend_at);
 		set(call->keepalive_at);
 		set(call->ping_at);
 
-		now = jiffies;
-		if (time_after_eq(now, next))
+		now = ktime_get_real();
+		delay = ktime_sub(next, now);
+		if (delay <= 0) {
 			rxrpc_poke_call(call, rxrpc_call_poke_timer_now);
-
-		rxrpc_reduce_call_timer(call, next, now, rxrpc_timer_restart);
+		} else {
+			unsigned long nowj = jiffies, delayj, nextj;
+
+			delayj = max(nsecs_to_jiffies(delay), 1);
+			nextj = nowj + delayj;
+			if (time_before(nextj, call->timer.expires) ||
+			    !timer_pending(&call->timer)) {
+				trace_rxrpc_timer_restart(call, delay, delayj);
+				timer_reduce(&call->timer, nextj);
+			}
+		}
 	}
 
 out:
@@ -505,4 +492,16 @@ out:
 		rxrpc_shrink_call_tx_buffer(call);
 	_leave("");
 	return true;
+
+expired:
+	if (test_bit(RXRPC_CALL_RX_HEARD, &call->flags) &&
+	    (int)call->conn->hi_serial - (int)call->rx_serial > 0) {
+		trace_rxrpc_call_reset(call);
+		rxrpc_abort_call(call, 0, RX_CALL_DEAD, -ECONNRESET,
+				 rxrpc_abort_call_reset);
+	} else {
+		rxrpc_abort_call(call, 0, RX_CALL_TIMEOUT, -ETIME,
+				 rxrpc_abort_call_timeout);
+	}
+	goto out;
 }
diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
index 9fc9a6c3f685..01fa71e8b1f7 100644
--- a/net/rxrpc/call_object.c
+++ b/net/rxrpc/call_object.c
@@ -70,20 +70,11 @@ static void rxrpc_call_timer_expired(struct timer_list *t)
 	_enter("%d", call->debug_id);
 
 	if (!__rxrpc_call_is_complete(call)) {
-		trace_rxrpc_timer_expired(call, jiffies);
+		trace_rxrpc_timer_expired(call);
 		rxrpc_poke_call(call, rxrpc_call_poke_timer);
 	}
 }
 
-void rxrpc_reduce_call_timer(struct rxrpc_call *call,
-			     unsigned long expire_at,
-			     unsigned long now,
-			     enum rxrpc_timer_trace why)
-{
-	trace_rxrpc_timer(call, why, now);
-	timer_reduce(&call->timer, expire_at);
-}
-
 static struct lock_class_key rxrpc_call_user_mutex_lock_class_key;
 
 static void rxrpc_destroy_call(struct work_struct *);
@@ -163,12 +154,20 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp,
 	spin_lock_init(&call->notify_lock);
 	spin_lock_init(&call->tx_lock);
 	refcount_set(&call->ref, 1);
-	call->debug_id = debug_id;
-	call->tx_total_len = -1;
-	call->next_rx_timo = 20 * HZ;
-	call->next_req_timo = 1 * HZ;
-	call->ackr_window = 1;
-	call->ackr_wtop = 1;
+	call->debug_id		= debug_id;
+	call->tx_total_len	= -1;
+	call->next_rx_timo	= 20 * HZ;
+	call->next_req_timo	= 1 * HZ;
+	call->ackr_window	= 1;
+	call->ackr_wtop		= 1;
+	call->delay_ack_at	= KTIME_MAX;
+	call->ack_lost_at	= KTIME_MAX;
+	call->resend_at		= KTIME_MAX;
+	call->ping_at		= KTIME_MAX;
+	call->keepalive_at	= KTIME_MAX;
+	call->expect_rx_by	= KTIME_MAX;
+	call->expect_req_by	= KTIME_MAX;
+	call->expect_term_by	= KTIME_MAX;
 
 	memset(&call->sock_node, 0xed, sizeof(call->sock_node));
 
@@ -226,11 +225,11 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx,
 		__set_bit(RXRPC_CALL_EXCLUSIVE, &call->flags);
 
 	if (p->timeouts.normal)
-		call->next_rx_timo = min(msecs_to_jiffies(p->timeouts.normal), 1UL);
+		call->next_rx_timo = min(p->timeouts.normal, 1);
 	if (p->timeouts.idle)
-		call->next_req_timo = min(msecs_to_jiffies(p->timeouts.idle), 1UL);
+		call->next_req_timo = min(p->timeouts.idle, 1);
 	if (p->timeouts.hard)
-		call->hard_timo = p->timeouts.hard * HZ;
+		call->hard_timo = p->timeouts.hard;
 
 	ret = rxrpc_init_client_call_security(call);
 	if (ret < 0) {
@@ -253,18 +252,13 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx,
  */
 void rxrpc_start_call_timer(struct rxrpc_call *call)
 {
-	unsigned long now = jiffies;
-	unsigned long j = now + MAX_JIFFY_OFFSET;
-
-	call->delay_ack_at = j;
-	call->ack_lost_at = j;
-	call->resend_at = j;
-	call->ping_at = j;
-	call->keepalive_at = j;
-	call->expect_rx_by = j;
-	call->expect_req_by = j;
-	call->expect_term_by = j + call->hard_timo;
-	call->timer.expires = now;
+	if (call->hard_timo) {
+		ktime_t delay = ms_to_ktime(call->hard_timo * 1000);
+
+		call->expect_term_by = ktime_add(ktime_get_real(), delay);
+		trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_hard);
+	}
+	call->timer.expires = jiffies;
 }
 
 /*
diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index e53a49accc16..09cce1d5d605 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -252,6 +252,9 @@ static void rxrpc_end_tx_phase(struct rxrpc_call *call, bool reply_begun,
 {
 	ASSERT(test_bit(RXRPC_CALL_TX_LAST, &call->flags));
 
+	call->resend_at = KTIME_MAX;
+	trace_rxrpc_timer_can(call, rxrpc_timer_trace_resend);
+
 	if (unlikely(call->cong_last_nack)) {
 		rxrpc_free_skb(call->cong_last_nack, rxrpc_skb_put_last_nack);
 		call->cong_last_nack = NULL;
@@ -288,13 +291,11 @@ static void rxrpc_end_tx_phase(struct rxrpc_call *call, bool reply_begun,
 static bool rxrpc_receiving_reply(struct rxrpc_call *call)
 {
 	struct rxrpc_ack_summary summary = { 0 };
-	unsigned long now;
 	rxrpc_seq_t top = READ_ONCE(call->tx_top);
 
 	if (call->ackr_reason) {
-		now = jiffies;
-		call->delay_ack_at = now + MAX_JIFFY_OFFSET;
-		trace_rxrpc_timer(call, rxrpc_timer_init_for_reply, now);
+		call->delay_ack_at = KTIME_MAX;
+		trace_rxrpc_timer_can(call, rxrpc_timer_trace_delayed_ack);
 	}
 
 	if (!test_bit(RXRPC_CALL_TX_LAST, &call->flags)) {
@@ -327,7 +328,7 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call, rxrpc_serial_t serial)
 
 	case RXRPC_CALL_SERVER_RECV_REQUEST:
 		rxrpc_set_call_state(call, RXRPC_CALL_SERVER_ACK_REQUEST);
-		call->expect_req_by = jiffies + MAX_JIFFY_OFFSET;
+		call->expect_req_by = KTIME_MAX;
 		rxrpc_propose_delay_ACK(call, serial, rxrpc_propose_ack_processing_op);
 		break;
 
@@ -587,14 +588,12 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb)
 
 	case RXRPC_CALL_SERVER_RECV_REQUEST: {
 		unsigned long timo = READ_ONCE(call->next_req_timo);
-		unsigned long now, expect_req_by;
 
 		if (timo) {
-			now = jiffies;
-			expect_req_by = now + timo;
-			call->expect_req_by = now + timo;
-			rxrpc_reduce_call_timer(call, expect_req_by, now,
-						rxrpc_timer_set_for_idle);
+			ktime_t delay = ms_to_ktime(timo);
+
+			call->expect_req_by = ktime_add(ktime_get_real(), delay);
+			trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_idle);
 		}
 		break;
 	}
@@ -1046,11 +1045,10 @@ void rxrpc_input_call_packet(struct rxrpc_call *call, struct sk_buff *skb)
 
 	timo = READ_ONCE(call->next_rx_timo);
 	if (timo) {
-		unsigned long now = jiffies;
+		ktime_t delay = ms_to_ktime(timo);
 
-		call->expect_rx_by = now + timo;
-		rxrpc_reduce_call_timer(call, call->expect_rx_by, now,
-					rxrpc_timer_set_for_normal);
+		call->expect_rx_by = ktime_add(ktime_get_real(), delay);
+		trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_expect_rx);
 	}
 
 	switch (sp->hdr.type) {
diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c
index 825b81183046..657cf35089a6 100644
--- a/net/rxrpc/misc.c
+++ b/net/rxrpc/misc.c
@@ -17,22 +17,22 @@
 unsigned int rxrpc_max_backlog __read_mostly = 10;
 
 /*
- * How long to wait before scheduling an ACK with subtype DELAY (in jiffies).
+ * How long to wait before scheduling an ACK with subtype DELAY (in ms).
  *
  * We use this when we've received new data packets.  If those packets aren't
  * all consumed within this time we will send a DELAY ACK if an ACK was not
  * requested to let the sender know it doesn't need to resend.
  */
-unsigned long rxrpc_soft_ack_delay = HZ;
+unsigned long rxrpc_soft_ack_delay = 1000;
 
 /*
- * How long to wait before scheduling an ACK with subtype IDLE (in jiffies).
+ * How long to wait before scheduling an ACK with subtype IDLE (in ms).
  *
  * We use this when we've consumed some previously soft-ACK'd packets when
  * further packets aren't immediately received to decide when to send an IDLE
  * ACK let the other end know that it can free up its Tx buffer space.
  */
-unsigned long rxrpc_idle_ack_delay = HZ / 2;
+unsigned long rxrpc_idle_ack_delay = 500;
 
 /*
  * Receive window size in packets.  This indicates the maximum number of
diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c
index 0a317498b8e0..ec82193e5681 100644
--- a/net/rxrpc/output.c
+++ b/net/rxrpc/output.c
@@ -48,12 +48,10 @@ static const char rxrpc_keepalive_string[] = "";
 static void rxrpc_tx_backoff(struct rxrpc_call *call, int ret)
 {
 	if (ret < 0) {
-		u16 tx_backoff = READ_ONCE(call->tx_backoff);
-
-		if (tx_backoff < HZ)
-			WRITE_ONCE(call->tx_backoff, tx_backoff + 1);
+		if (call->tx_backoff < 1000)
+			call->tx_backoff += 100;
 	} else {
-		WRITE_ONCE(call->tx_backoff, 0);
+		call->tx_backoff = 0;
 	}
 }
 
@@ -67,11 +65,10 @@ static void rxrpc_tx_backoff(struct rxrpc_call *call, int ret)
  */
 static void rxrpc_set_keepalive(struct rxrpc_call *call)
 {
-	unsigned long now = jiffies;
+	ktime_t delay = ms_to_ktime(READ_ONCE(call->next_rx_timo) / 6);
 
-	call->keepalive_at = now + call->next_rx_timo / 6;
-	rxrpc_reduce_call_timer(call, call->keepalive_at, now,
-				rxrpc_timer_set_for_keepalive);
+	call->keepalive_at = ktime_add(ktime_get_real(), delay);
+	trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_keepalive);
 }
 
 /*
@@ -515,24 +512,21 @@ done:
 		if (txb->flags & RXRPC_REQUEST_ACK) {
 			call->peer->rtt_last_req = txb->last_sent;
 			if (call->peer->rtt_count > 1) {
-				unsigned long nowj = jiffies, ack_lost_at;
+				ktime_t delay = rxrpc_get_rto_backoff(call->peer, false);
+				ktime_t now = ktime_get_real();
 
-				ack_lost_at = rxrpc_get_rto_backoff(call->peer, false);
-				ack_lost_at += nowj;
-				call->ack_lost_at = ack_lost_at;
-				rxrpc_reduce_call_timer(call, ack_lost_at, nowj,
-							rxrpc_timer_set_for_lost_ack);
+				call->ack_lost_at = ktime_add(now, delay);
+				trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_lost_ack);
 			}
 		}
 
 		if (txb->seq == 1 &&
 		    !test_and_set_bit(RXRPC_CALL_BEGAN_RX_TIMER,
 				      &call->flags)) {
-			unsigned long nowj = jiffies;
+			ktime_t delay = ms_to_ktime(READ_ONCE(call->next_rx_timo));
 
-			call->expect_rx_by = nowj + call->next_rx_timo;
-			rxrpc_reduce_call_timer(call, call->expect_rx_by, nowj,
-						rxrpc_timer_set_for_normal);
+			call->expect_rx_by = ktime_add(ktime_get_real(), delay);
+			trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_expect_rx);
 		}
 
 		rxrpc_set_keepalive(call);
@@ -755,11 +749,9 @@ void rxrpc_transmit_one(struct rxrpc_call *call, struct rxrpc_txbuf *txb)
 			rxrpc_instant_resend(call, txb);
 		}
 	} else {
-		unsigned long now = jiffies;
-		unsigned long resend_at = now + call->peer->rto_j;
+		ktime_t delay = ns_to_ktime(call->peer->rto_us * NSEC_PER_USEC);
 
-		call->resend_at = resend_at;
-		rxrpc_reduce_call_timer(call, resend_at, now,
-					rxrpc_timer_set_for_send);
+		call->resend_at = ktime_add(ktime_get_real(), delay);
+		trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_resend_tx);
 	}
 }
diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c
index 26dc2f26d92d..263a2251e3d2 100644
--- a/net/rxrpc/proc.c
+++ b/net/rxrpc/proc.c
@@ -52,9 +52,9 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
 	struct rxrpc_call *call;
 	struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq));
 	enum rxrpc_call_state state;
-	unsigned long timeout = 0;
 	rxrpc_seq_t acks_hard_ack;
 	char lbuff[50], rbuff[50];
+	long timeout = 0;
 
 	if (v == &rxnet->calls) {
 		seq_puts(seq,
@@ -76,10 +76,8 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
 	sprintf(rbuff, "%pISpc", &call->dest_srx.transport);
 
 	state = rxrpc_call_state(call);
-	if (state != RXRPC_CALL_SERVER_PREALLOC) {
-		timeout = READ_ONCE(call->expect_rx_by);
-		timeout -= jiffies;
-	}
+	if (state != RXRPC_CALL_SERVER_PREALLOC)
+		timeout = ktime_ms_delta(READ_ONCE(call->expect_rx_by), ktime_get_real());
 
 	acks_hard_ack = READ_ONCE(call->acks_hard_ack);
 	seq_printf(seq,
@@ -309,7 +307,7 @@ static int rxrpc_peer_seq_show(struct seq_file *seq, void *v)
 		   peer->mtu,
 		   now - peer->last_tx_at,
 		   peer->srtt_us >> 3,
-		   jiffies_to_usecs(peer->rto_j));
+		   peer->rto_us);
 
 	return 0;
 }
diff --git a/net/rxrpc/rtt.c b/net/rxrpc/rtt.c
index be61d6f5be8d..cdab7b7d08a0 100644
--- a/net/rxrpc/rtt.c
+++ b/net/rxrpc/rtt.c
@@ -11,8 +11,8 @@
 #include <linux/net.h>
 #include "ar-internal.h"
 
-#define RXRPC_RTO_MAX	((unsigned)(120 * HZ))
-#define RXRPC_TIMEOUT_INIT ((unsigned)(1*HZ))	/* RFC6298 2.1 initial RTO value	*/
+#define RXRPC_RTO_MAX	(120 * USEC_PER_SEC)
+#define RXRPC_TIMEOUT_INIT ((unsigned int)(1 * MSEC_PER_SEC)) /* RFC6298 2.1 initial RTO value */
 #define rxrpc_jiffies32 ((u32)jiffies)		/* As rxrpc_jiffies32 */
 
 static u32 rxrpc_rto_min_us(struct rxrpc_peer *peer)
@@ -22,7 +22,7 @@ static u32 rxrpc_rto_min_us(struct rxrpc_peer *peer)
 
 static u32 __rxrpc_set_rto(const struct rxrpc_peer *peer)
 {
-	return usecs_to_jiffies((peer->srtt_us >> 3) + peer->rttvar_us);
+	return (peer->srtt_us >> 3) + peer->rttvar_us;
 }
 
 static u32 rxrpc_bound_rto(u32 rto)
@@ -124,7 +124,7 @@ static void rxrpc_set_rto(struct rxrpc_peer *peer)
 	/* NOTE: clamping at RXRPC_RTO_MIN is not required, current algo
 	 * guarantees that rto is higher.
 	 */
-	peer->rto_j = rxrpc_bound_rto(rto);
+	peer->rto_us = rxrpc_bound_rto(rto);
 }
 
 static void rxrpc_ack_update_rtt(struct rxrpc_peer *peer, long rtt_us)
@@ -163,33 +163,33 @@ void rxrpc_peer_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why,
 	spin_unlock(&peer->rtt_input_lock);
 
 	trace_rxrpc_rtt_rx(call, why, rtt_slot, send_serial, resp_serial,
-			   peer->srtt_us >> 3, peer->rto_j);
+			   peer->srtt_us >> 3, peer->rto_us);
 }
 
 /*
- * Get the retransmission timeout to set in jiffies, backing it off each time
- * we retransmit.
+ * Get the retransmission timeout to set in nanoseconds, backing it off each
+ * time we retransmit.
  */
-unsigned long rxrpc_get_rto_backoff(struct rxrpc_peer *peer, bool retrans)
+ktime_t rxrpc_get_rto_backoff(struct rxrpc_peer *peer, bool retrans)
 {
-	u64 timo_j;
-	u8 backoff = READ_ONCE(peer->backoff);
+	u64 timo_us;
+	u32 backoff = READ_ONCE(peer->backoff);
 
-	timo_j = peer->rto_j;
-	timo_j <<= backoff;
-	if (retrans && timo_j * 2 <= RXRPC_RTO_MAX)
+	timo_us = peer->rto_us;
+	timo_us <<= backoff;
+	if (retrans && timo_us * 2 <= RXRPC_RTO_MAX)
 		WRITE_ONCE(peer->backoff, backoff + 1);
 
-	if (timo_j < 1)
-		timo_j = 1;
+	if (timo_us < 1)
+		timo_us = 1;
 
-	return timo_j;
+	return ns_to_ktime(timo_us * NSEC_PER_USEC);
 }
 
 void rxrpc_peer_init_rtt(struct rxrpc_peer *peer)
 {
-	peer->rto_j	= RXRPC_TIMEOUT_INIT;
-	peer->mdev_us	= jiffies_to_usecs(RXRPC_TIMEOUT_INIT);
+	peer->rto_us	= RXRPC_TIMEOUT_INIT;
+	peer->mdev_us	= RXRPC_TIMEOUT_INIT;
 	peer->backoff	= 0;
 	//minmax_reset(&peer->rtt_min, rxrpc_jiffies32, ~0U);
 }
diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c
index 4d152f06b039..6f765768c49c 100644
--- a/net/rxrpc/sendmsg.c
+++ b/net/rxrpc/sendmsg.c
@@ -609,7 +609,6 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len)
 	__releases(&rx->sk.sk_lock.slock)
 {
 	struct rxrpc_call *call;
-	unsigned long now, j;
 	bool dropped_lock = false;
 	int ret;
 
@@ -687,25 +686,21 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len)
 
 	switch (p.call.nr_timeouts) {
 	case 3:
-		j = msecs_to_jiffies(p.call.timeouts.normal);
-		if (p.call.timeouts.normal > 0 && j == 0)
-			j = 1;
-		WRITE_ONCE(call->next_rx_timo, j);
+		WRITE_ONCE(call->next_rx_timo, p.call.timeouts.normal);
 		fallthrough;
 	case 2:
-		j = msecs_to_jiffies(p.call.timeouts.idle);
-		if (p.call.timeouts.idle > 0 && j == 0)
-			j = 1;
-		WRITE_ONCE(call->next_req_timo, j);
+		WRITE_ONCE(call->next_req_timo, p.call.timeouts.idle);
 		fallthrough;
 	case 1:
 		if (p.call.timeouts.hard > 0) {
-			j = p.call.timeouts.hard * HZ;
-			now = jiffies;
-			j += now;
-			WRITE_ONCE(call->expect_term_by, j);
-			rxrpc_reduce_call_timer(call, j, now,
-						rxrpc_timer_set_for_hard);
+			ktime_t delay = ms_to_ktime(p.call.timeouts.hard * MSEC_PER_SEC);
+
+			WRITE_ONCE(call->expect_term_by,
+				   ktime_add(p.call.timeouts.hard,
+					     ktime_get_real()));
+			trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_hard);
+			rxrpc_poke_call(call, rxrpc_call_poke_set_timeout);
+
 		}
 		break;
 	}
diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c
index ecaeb4ecfb58..c9bedd0e2d86 100644
--- a/net/rxrpc/sysctl.c
+++ b/net/rxrpc/sysctl.c
@@ -15,6 +15,8 @@ static const unsigned int four = 4;
 static const unsigned int max_backlog = RXRPC_BACKLOG_MAX - 1;
 static const unsigned int n_65535 = 65535;
 static const unsigned int n_max_acks = 255;
+static const unsigned long one_ms = 1;
+static const unsigned long max_ms = 1000;
 static const unsigned long one_jiffy = 1;
 static const unsigned long max_jiffies = MAX_JIFFY_OFFSET;
 #ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY
@@ -28,24 +30,24 @@ static const unsigned long max_500 = 500;
  * information on the individual parameters.
  */
 static struct ctl_table rxrpc_sysctl_table[] = {
-	/* Values measured in milliseconds but used in jiffies */
+	/* Values measured in milliseconds */
 	{
 		.procname	= "soft_ack_delay",
 		.data		= &rxrpc_soft_ack_delay,
 		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
-		.proc_handler	= proc_doulongvec_ms_jiffies_minmax,
-		.extra1		= (void *)&one_jiffy,
-		.extra2		= (void *)&max_jiffies,
+		.proc_handler	= proc_doulongvec_minmax,
+		.extra1		= (void *)&one_ms,
+		.extra2		= (void *)&max_ms,
 	},
 	{
 		.procname	= "idle_ack_delay",
 		.data		= &rxrpc_idle_ack_delay,
 		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
-		.proc_handler	= proc_doulongvec_ms_jiffies_minmax,
-		.extra1		= (void *)&one_jiffy,
-		.extra2		= (void *)&max_jiffies,
+		.proc_handler	= proc_doulongvec_minmax,
+		.extra1		= (void *)&one_ms,
+		.extra2		= (void *)&max_ms,
 	},
 	{
 		.procname	= "idle_conn_expiry",
-- 
cgit v1.2.3


From 6f2fc8584a46bb35787bfc1dad1fb7dd5898e21f Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Sat, 2 Mar 2024 20:53:00 +0100
Subject: net: add helpers for EEE configuration

Add helpers that phylib and phylink can use to manage EEE configuration
and determine whether the MAC should be permitted to use LPI based on
that configuration.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Signed-off-by: Oleksij Rempel <o.rempel@pengutronix.de>
Link: https://lore.kernel.org/r/20240302195306.3207716-2-o.rempel@pengutronix.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/eee.h | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)
 create mode 100644 include/net/eee.h

(limited to 'include')

diff --git a/include/net/eee.h b/include/net/eee.h
new file mode 100644
index 000000000000..84837aba3cd9
--- /dev/null
+++ b/include/net/eee.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _EEE_H
+#define _EEE_H
+
+#include <linux/types.h>
+
+struct eee_config {
+	u32 tx_lpi_timer;
+	bool tx_lpi_enabled;
+	bool eee_enabled;
+};
+
+static inline bool eeecfg_mac_can_tx_lpi(const struct eee_config *eeecfg)
+{
+	/* eee_enabled is the master on/off */
+	if (!eeecfg->eee_enabled || !eeecfg->tx_lpi_enabled)
+		return false;
+
+	return true;
+}
+
+static inline void eeecfg_to_eee(struct ethtool_keee *eee,
+				 const struct eee_config *eeecfg)
+{
+	eee->tx_lpi_timer = eeecfg->tx_lpi_timer;
+	eee->tx_lpi_enabled = eeecfg->tx_lpi_enabled;
+	eee->eee_enabled = eeecfg->eee_enabled;
+}
+
+static inline void eee_to_eeecfg(struct eee_config *eeecfg,
+				 const struct ethtool_keee *eee)
+{
+	eeecfg->tx_lpi_timer = eee->tx_lpi_timer;
+	eeecfg->tx_lpi_enabled = eee->tx_lpi_enabled;
+	eeecfg->eee_enabled = eee->eee_enabled;
+}
+
+#endif
-- 
cgit v1.2.3


From e3b6876ab85061e7de198f023a0c2bfc7478b420 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Sat, 2 Mar 2024 20:53:01 +0100
Subject: net: phy: Add phydev->enable_tx_lpi to simplify adjust link callbacks

MAC drivers which support EEE need to know the results of the EEE
auto-neg in order to program the hardware to perform EEE or not.  The
oddly named phy_init_eee() can be used to determine this, it returns 0
if EEE should be used, or a negative error code,
e.g. -EOPPROTONOTSUPPORT if the PHY does not support EEE or negotiate
resulted in it not being used.

However, many MAC drivers get this wrong. Add phydev->enable_tx_lpi
which indicates the result of the autoneg for EEE, including if EEE is
administratively disabled with ethtool. The MAC driver can then access
this in the same way as link speed and duplex in the adjust link
callback. If enable_tx_lpi is true, the MAC should send low power
indications and does not need to consider anything else with respect
to EEE.

Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Oleksij Rempel <o.rempel@pengutronix.de>
Link: https://lore.kernel.org/r/20240302195306.3207716-3-o.rempel@pengutronix.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/phy.c | 7 +++++++
 include/linux/phy.h   | 2 ++
 2 files changed, 9 insertions(+)

(limited to 'include')

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 14224e06d69f..2bc0a7d51c63 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -983,9 +983,16 @@ static int phy_check_link_status(struct phy_device *phydev)
 	if (phydev->link && phydev->state != PHY_RUNNING) {
 		phy_check_downshift(phydev);
 		phydev->state = PHY_RUNNING;
+		err = genphy_c45_eee_is_active(phydev,
+					       NULL, NULL, NULL);
+		if (err < 0)
+			phydev->enable_tx_lpi = false;
+		else
+			phydev->enable_tx_lpi = !!err;
 		phy_link_up(phydev);
 	} else if (!phydev->link && phydev->state != PHY_NOLINK) {
 		phydev->state = PHY_NOLINK;
+		phydev->enable_tx_lpi = false;
 		phy_link_down(phydev);
 	}
 
diff --git a/include/linux/phy.h b/include/linux/phy.h
index e3ab2c347a59..a880f6d7170e 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -594,6 +594,7 @@ struct macsec_ops;
  * @supported_eee: supported PHY EEE linkmodes
  * @advertising_eee: Currently advertised EEE linkmodes
  * @eee_enabled: Flag indicating whether the EEE feature is enabled
+ * @enable_tx_lpi: When True, MAC should transmit LPI to PHY
  * @lp_advertising: Current link partner advertised linkmodes
  * @host_interfaces: PHY interface modes supported by host
  * @eee_broken_modes: Energy efficient ethernet modes which should be prohibited
@@ -713,6 +714,7 @@ struct phy_device {
 
 	/* Energy efficient ethernet modes which should be prohibited */
 	u32 eee_broken_modes;
+	bool enable_tx_lpi;
 
 #ifdef CONFIG_LED_TRIGGER_PHY
 	struct phy_led_trigger *phy_led_triggers;
-- 
cgit v1.2.3


From fe0d4fd9285e5013b4bafbd3338847235b805a1c Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Sat, 2 Mar 2024 20:53:02 +0100
Subject: net: phy: Keep track of EEE configuration

Have phylib keep track of the EEE configuration. This simplifies the
MAC drivers, in that they don't need to store it.

Future patches to phylib will also make use of this information to
further simplify the MAC drivers.

Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Signed-off-by: Oleksij Rempel <o.rempel@pengutronix.de>
Link: https://lore.kernel.org/r/20240302195306.3207716-4-o.rempel@pengutronix.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/phy.c | 7 +++++--
 include/linux/phy.h   | 3 +++
 2 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 2bc0a7d51c63..95c4ef5d4e97 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -1640,8 +1640,8 @@ EXPORT_SYMBOL(phy_get_eee_err);
  * @phydev: target phy_device struct
  * @data: ethtool_keee data
  *
- * Description: it reportes the Supported/Advertisement/LP Advertisement
- * capabilities.
+ * Description: reports the Supported/Advertisement/LP Advertisement
+ * capabilities, etc.
  */
 int phy_ethtool_get_eee(struct phy_device *phydev, struct ethtool_keee *data)
 {
@@ -1652,6 +1652,7 @@ int phy_ethtool_get_eee(struct phy_device *phydev, struct ethtool_keee *data)
 
 	mutex_lock(&phydev->lock);
 	ret = genphy_c45_ethtool_get_eee(phydev, data);
+	eeecfg_to_eee(data, &phydev->eee_cfg);
 	mutex_unlock(&phydev->lock);
 
 	return ret;
@@ -1674,6 +1675,8 @@ int phy_ethtool_set_eee(struct phy_device *phydev, struct ethtool_keee *data)
 
 	mutex_lock(&phydev->lock);
 	ret = genphy_c45_ethtool_set_eee(phydev, data);
+	if (!ret)
+		eee_to_eeecfg(&phydev->eee_cfg, data);
 	mutex_unlock(&phydev->lock);
 
 	return ret;
diff --git a/include/linux/phy.h b/include/linux/phy.h
index a880f6d7170e..695e366bd75c 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -30,6 +30,7 @@
 #include <linux/refcount.h>
 
 #include <linux/atomic.h>
+#include <net/eee.h>
 
 #define PHY_DEFAULT_FEATURES	(SUPPORTED_Autoneg | \
 				 SUPPORTED_TP | \
@@ -595,6 +596,7 @@ struct macsec_ops;
  * @advertising_eee: Currently advertised EEE linkmodes
  * @eee_enabled: Flag indicating whether the EEE feature is enabled
  * @enable_tx_lpi: When True, MAC should transmit LPI to PHY
+ * @eee_cfg: User configuration of EEE
  * @lp_advertising: Current link partner advertised linkmodes
  * @host_interfaces: PHY interface modes supported by host
  * @eee_broken_modes: Energy efficient ethernet modes which should be prohibited
@@ -715,6 +717,7 @@ struct phy_device {
 	/* Energy efficient ethernet modes which should be prohibited */
 	u32 eee_broken_modes;
 	bool enable_tx_lpi;
+	struct eee_config eee_cfg;
 
 #ifdef CONFIG_LED_TRIGGER_PHY
 	struct phy_led_trigger *phy_led_triggers;
-- 
cgit v1.2.3


From 49168d1980e220cf3d1b761e1eafac62041cb94d Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Sat, 2 Mar 2024 20:53:04 +0100
Subject: net: phy: Add phy_support_eee() indicating MAC support EEE

In order for EEE to operate, both the MAC and the PHY need to support
it, similar to how pause works. With some exception - a number of PHYs
have SmartEEE or AutoGrEEEn support in order to provide some EEE-like
power savings with non-EEE capable MACs.

Copy the pause concept and add the call phy_support_eee() which the MAC
makes after connecting the PHY to indicate it supports EEE. phylib will
then advertise EEE when auto-neg is performed.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Oleksij Rempel <o.rempel@pengutronix.de>
Link: https://lore.kernel.org/r/20240302195306.3207716-6-o.rempel@pengutronix.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/phy_device.c | 28 ++++++++++++++++++++++++++++
 include/linux/phy.h          |  3 ++-
 2 files changed, 30 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 2eefee970851..72452e6a478c 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -2910,6 +2910,34 @@ void phy_advertise_eee_all(struct phy_device *phydev)
 }
 EXPORT_SYMBOL_GPL(phy_advertise_eee_all);
 
+/**
+ * phy_support_eee - Set initial EEE policy configuration
+ * @phydev: Target phy_device struct
+ *
+ * This function configures the initial policy for Energy Efficient Ethernet
+ * (EEE) on the specified PHY device, influencing that EEE capabilities are
+ * advertised before the link is established. It should be called during PHY
+ * registration by the MAC driver and/or the PHY driver (for SmartEEE PHYs)
+ * if MAC supports LPI or PHY is capable to compensate missing LPI functionality
+ * of the MAC.
+ *
+ * The function sets default EEE policy parameters, including preparing the PHY
+ * to advertise EEE capabilities based on hardware support.
+ *
+ * It also sets the expected configuration for Low Power Idle (LPI) in the MAC
+ * driver. If the PHY framework determines that both local and remote
+ * advertisements support EEE, and the negotiated link mode is compatible with
+ * EEE, it will set enable_tx_lpi = true. The MAC driver is expected to act on
+ * this setting by enabling the LPI timer if enable_tx_lpi is set.
+ */
+void phy_support_eee(struct phy_device *phydev)
+{
+	linkmode_copy(phydev->advertising_eee, phydev->supported_eee);
+	phydev->eee_cfg.tx_lpi_enabled = true;
+	phydev->eee_cfg.eee_enabled = true;
+}
+EXPORT_SYMBOL(phy_support_eee);
+
 /**
  * phy_support_sym_pause - Enable support of symmetrical pause
  * @phydev: target phy_device struct
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 695e366bd75c..3f68b8239bb1 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -706,7 +706,7 @@ struct phy_device {
 	__ETHTOOL_DECLARE_LINK_MODE_MASK(lp_advertising);
 	/* used with phy_speed_down */
 	__ETHTOOL_DECLARE_LINK_MODE_MASK(adv_old);
-	/* used for eee validation */
+	/* used for eee validation and configuration*/
 	__ETHTOOL_DECLARE_LINK_MODE_MASK(supported_eee);
 	__ETHTOOL_DECLARE_LINK_MODE_MASK(advertising_eee);
 	bool eee_enabled;
@@ -1973,6 +1973,7 @@ void phy_advertise_supported(struct phy_device *phydev);
 void phy_advertise_eee_all(struct phy_device *phydev);
 void phy_support_sym_pause(struct phy_device *phydev);
 void phy_support_asym_pause(struct phy_device *phydev);
+void phy_support_eee(struct phy_device *phydev);
 void phy_set_sym_pause(struct phy_device *phydev, bool rx, bool tx,
 		       bool autoneg);
 void phy_set_asym_pause(struct phy_device *phydev, bool rx, bool tx);
-- 
cgit v1.2.3


From eeb78df4063c0b162324a9408ef573b24791871f Mon Sep 17 00:00:00 2001
From: Juntong Deng <juntong.deng@outlook.com>
Date: Mon, 4 Mar 2024 11:32:08 +0000
Subject: inet: Add getsockopt support for IP_ROUTER_ALERT and
 IPV6_ROUTER_ALERT

Currently getsockopt does not support IP_ROUTER_ALERT and
IPV6_ROUTER_ALERT, and we are unable to get the values of these two
socket options through getsockopt.

This patch adds getsockopt support for IP_ROUTER_ALERT and
IPV6_ROUTER_ALERT.

Signed-off-by: Juntong Deng <juntong.deng@outlook.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_sock.h  |  1 +
 net/ipv4/ip_sockglue.c   | 13 ++++++++++---
 net/ipv6/ipv6_sockglue.c |  6 ++++++
 3 files changed, 17 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index d94c242eb3ed..f9ddd47dc4f8 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -274,6 +274,7 @@ enum {
 	INET_FLAGS_REPFLOW	= 27,
 	INET_FLAGS_RTALERT_ISOLATE = 28,
 	INET_FLAGS_SNDFLOW	= 29,
+	INET_FLAGS_RTALERT	= 30,
 };
 
 /* cmsg flags for inet */
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 21d2ffa919e9..cf377377b52d 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -894,7 +894,7 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname,
 {
 	struct inet_sock *inet = inet_sk(sk);
 	struct net *net = sock_net(sk);
-	int val = 0, err;
+	int val = 0, err, retv;
 	bool needs_rtnl = setsockopt_needs_rtnl(optname);
 
 	switch (optname) {
@@ -938,8 +938,12 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname,
 
 	/* If optlen==0, it is equivalent to val == 0 */
 
-	if (optname == IP_ROUTER_ALERT)
-		return ip_ra_control(sk, val ? 1 : 0, NULL);
+	if (optname == IP_ROUTER_ALERT) {
+		retv = ip_ra_control(sk, val ? 1 : 0, NULL);
+		if (retv == 0)
+			inet_assign_bit(RTALERT, sk, val);
+		return retv;
+	}
 	if (ip_mroute_opt(optname))
 		return ip_mroute_setsockopt(sk, optname, optval, optlen);
 
@@ -1575,6 +1579,9 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
 	case IP_BIND_ADDRESS_NO_PORT:
 		val = inet_test_bit(BIND_ADDRESS_NO_PORT, sk);
 		goto copyval;
+	case IP_ROUTER_ALERT:
+		val = inet_test_bit(RTALERT, sk);
+		goto copyval;
 	case IP_TTL:
 		val = READ_ONCE(inet->uc_ttl);
 		if (val < 0)
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index f61d977ac052..d4c28ec1bc51 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -948,6 +948,8 @@ done:
 		if (optlen < sizeof(int))
 			goto e_inval;
 		retv = ip6_ra_control(sk, val);
+		if (retv == 0)
+			inet6_assign_bit(RTALERT, sk, valbool);
 		break;
 	case IPV6_FLOWLABEL_MGR:
 		retv = ipv6_flowlabel_opt(sk, optval, optlen);
@@ -1445,6 +1447,10 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
 		val = np->rxopt.bits.recvfragsize;
 		break;
 
+	case IPV6_ROUTER_ALERT:
+		val = inet6_test_bit(RTALERT, sk);
+		break;
+
 	case IPV6_ROUTER_ALERT_ISOLATE:
 		val = inet6_test_bit(RTALERT_ISOLATE, sk);
 		break;
-- 
cgit v1.2.3


From 0ed3bba16d37618c7776a44ef02ad3039966d687 Mon Sep 17 00:00:00 2001
From: Takeru Hayasaka <hayatake396@gmail.com>
Date: Mon, 12 Feb 2024 02:04:03 +0000
Subject: ethtool: Add GTP RSS hash options to ethtool.h

This is a patch that enables RSS functionality for GTP packets using ethtool.

A user can include TEID and make RSS work for GTP-U over IPv4 by doing the
following:`ethtool -N ens3 rx-flow-hash gtpu4 sde`

In addition to gtpu(4|6), we now support gtpc(4|6),gtpc(4|6)t,gtpu(4|6)e,
gtpu(4|6)u, and gtpu(4|6)d.

gtpc(4|6): Used for GTP-C in IPv4 and IPv6, where the GTP header format does
not include a TEID.
gtpc(4|6)t: Used for GTP-C in IPv4 and IPv6, with a GTP header format that
includes a TEID.
gtpu(4|6): Used for GTP-U in both IPv4 and IPv6 scenarios.
gtpu(4|6)e: Used for GTP-U with extended headers in both IPv4 and IPv6.
gtpu(4|6)u: Used when the PSC (PDU session container) in the GTP-U extended
header includes Uplink, applicable to both IPv4 and IPv6.
gtpu(4|6)d: Used when the PSC in the GTP-U extended header includes Downlink,
for both IPv4 and IPv6.

GTP generates a flow that includes an ID called TEID to identify the tunnel.
This tunnel is created for each UE (User Equipment).By performing RSS based on
this flow, it is possible to apply RSS for each communication unit from the UE.
Without this, RSS would only be effective within the range of IP addresses. For
instance, the PGW can only perform RSS within the IP range of the SGW.
Problematic from a load distribution perspective, especially if there's a bias
in the terminals connected to a particular base station.This case can be
solved by using this patch.

Signed-off-by: Takeru Hayasaka <hayatake396@gmail.com>
Reviewed-by: Marcin Szycik <marcin.szycik@linux.intel.com>
Tested-by: Pucha Himasekhar Reddy <himasekharx.reddy.pucha@intel.com> (A Contingent worker at Intel)
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 include/uapi/linux/ethtool.h | 48 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 06ef6b78b7de..11fc18988bc2 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -2023,6 +2023,53 @@ static inline int ethtool_validate_duplex(__u8 duplex)
 #define	IPV4_FLOW	0x10	/* hash only */
 #define	IPV6_FLOW	0x11	/* hash only */
 #define	ETHER_FLOW	0x12	/* spec only (ether_spec) */
+
+/* Used for GTP-U IPv4 and IPv6.
+ * The format of GTP packets only includes
+ * elements such as TEID and GTP version.
+ * It is primarily intended for data communication of the UE.
+ */
+#define GTPU_V4_FLOW 0x13	/* hash only */
+#define GTPU_V6_FLOW 0x14	/* hash only */
+
+/* Use for GTP-C IPv4 and v6.
+ * The format of these GTP packets does not include TEID.
+ * Primarily expected to be used for communication
+ * to create sessions for UE data communication,
+ * commonly referred to as CSR (Create Session Request).
+ */
+#define GTPC_V4_FLOW 0x15	/* hash only */
+#define GTPC_V6_FLOW 0x16	/* hash only */
+
+/* Use for GTP-C IPv4 and v6.
+ * Unlike GTPC_V4_FLOW, the format of these GTP packets includes TEID.
+ * After session creation, it becomes this packet.
+ * This is mainly used for requests to realize UE handover.
+ */
+#define GTPC_TEID_V4_FLOW 0x17	/* hash only */
+#define GTPC_TEID_V6_FLOW 0x18	/* hash only */
+
+/* Use for GTP-U and extended headers for the PSC (PDU Session Container).
+ * The format of these GTP packets includes TEID and QFI.
+ * In 5G communication using UPF (User Plane Function),
+ * data communication with this extended header is performed.
+ */
+#define GTPU_EH_V4_FLOW 0x19	/* hash only */
+#define GTPU_EH_V6_FLOW 0x1a	/* hash only */
+
+/* Use for GTP-U IPv4 and v6 PSC (PDU Session Container) extended headers.
+ * This differs from GTPU_EH_V(4|6)_FLOW in that it is distinguished by
+ * UL/DL included in the PSC.
+ * There are differences in the data included based on Downlink/Uplink,
+ * and can be used to distinguish packets.
+ * The functions described so far are useful when you want to
+ * handle communication from the mobile network in UPF, PGW, etc.
+ */
+#define GTPU_UL_V4_FLOW 0x1b	/* hash only */
+#define GTPU_UL_V6_FLOW 0x1c	/* hash only */
+#define GTPU_DL_V4_FLOW 0x1d	/* hash only */
+#define GTPU_DL_V6_FLOW 0x1e	/* hash only */
+
 /* Flag to enable additional fields in struct ethtool_rx_flow_spec */
 #define	FLOW_EXT	0x80000000
 #define	FLOW_MAC_EXT	0x40000000
@@ -2037,6 +2084,7 @@ static inline int ethtool_validate_duplex(__u8 duplex)
 #define	RXH_IP_DST	(1 << 5)
 #define	RXH_L4_B_0_1	(1 << 6) /* src port in case of TCP/UDP/SCTP */
 #define	RXH_L4_B_2_3	(1 << 7) /* dst port in case of TCP/UDP/SCTP */
+#define	RXH_GTP_TEID	(1 << 8) /* teid in case of GTP */
 #define	RXH_DISCARD	(1 << 31)
 
 #define	RX_CLS_FLOW_DISC	0xffffffffffffffffULL
-- 
cgit v1.2.3


From e8a1e58345cf40b7b272e08ac7b32328b2543e40 Mon Sep 17 00:00:00 2001
From: Fedor Pchelkin <pchelkin@ispras.ru>
Date: Wed, 28 Feb 2024 19:38:39 +0300
Subject: mac802154: fix llsec key resources release in mac802154_llsec_key_del

mac802154_llsec_key_del() can free resources of a key directly without
following the RCU rules for waiting before the end of a grace period. This
may lead to use-after-free in case llsec_lookup_key() is traversing the
list of keys in parallel with a key deletion:

refcount_t: addition on 0; use-after-free.
WARNING: CPU: 4 PID: 16000 at lib/refcount.c:25 refcount_warn_saturate+0x162/0x2a0
Modules linked in:
CPU: 4 PID: 16000 Comm: wpan-ping Not tainted 6.7.0 #19
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
RIP: 0010:refcount_warn_saturate+0x162/0x2a0
Call Trace:
 <TASK>
 llsec_lookup_key.isra.0+0x890/0x9e0
 mac802154_llsec_encrypt+0x30c/0x9c0
 ieee802154_subif_start_xmit+0x24/0x1e0
 dev_hard_start_xmit+0x13e/0x690
 sch_direct_xmit+0x2ae/0xbc0
 __dev_queue_xmit+0x11dd/0x3c20
 dgram_sendmsg+0x90b/0xd60
 __sys_sendto+0x466/0x4c0
 __x64_sys_sendto+0xe0/0x1c0
 do_syscall_64+0x45/0xf0
 entry_SYSCALL_64_after_hwframe+0x6e/0x76

Also, ieee802154_llsec_key_entry structures are not freed by
mac802154_llsec_key_del():

unreferenced object 0xffff8880613b6980 (size 64):
  comm "iwpan", pid 2176, jiffies 4294761134 (age 60.475s)
  hex dump (first 32 bytes):
    78 0d 8f 18 80 88 ff ff 22 01 00 00 00 00 ad de  x.......".......
    00 00 00 00 00 00 00 00 03 00 cd ab 00 00 00 00  ................
  backtrace:
    [<ffffffff81dcfa62>] __kmem_cache_alloc_node+0x1e2/0x2d0
    [<ffffffff81c43865>] kmalloc_trace+0x25/0xc0
    [<ffffffff88968b09>] mac802154_llsec_key_add+0xac9/0xcf0
    [<ffffffff8896e41a>] ieee802154_add_llsec_key+0x5a/0x80
    [<ffffffff8892adc6>] nl802154_add_llsec_key+0x426/0x5b0
    [<ffffffff86ff293e>] genl_family_rcv_msg_doit+0x1fe/0x2f0
    [<ffffffff86ff46d1>] genl_rcv_msg+0x531/0x7d0
    [<ffffffff86fee7a9>] netlink_rcv_skb+0x169/0x440
    [<ffffffff86ff1d88>] genl_rcv+0x28/0x40
    [<ffffffff86fec15c>] netlink_unicast+0x53c/0x820
    [<ffffffff86fecd8b>] netlink_sendmsg+0x93b/0xe60
    [<ffffffff86b91b35>] ____sys_sendmsg+0xac5/0xca0
    [<ffffffff86b9c3dd>] ___sys_sendmsg+0x11d/0x1c0
    [<ffffffff86b9c65a>] __sys_sendmsg+0xfa/0x1d0
    [<ffffffff88eadbf5>] do_syscall_64+0x45/0xf0
    [<ffffffff890000ea>] entry_SYSCALL_64_after_hwframe+0x6e/0x76

Handle the proper resource release in the RCU callback function
mac802154_llsec_key_del_rcu().

Note that if llsec_lookup_key() finds a key, it gets a refcount via
llsec_key_get() and locally copies key id from key_entry (which is a
list element). So it's safe to call llsec_key_put() and free the list
entry after the RCU grace period elapses.

Found by Linux Verification Center (linuxtesting.org).

Fixes: 5d637d5aabd8 ("mac802154: add llsec structures and mutators")
Cc: stable@vger.kernel.org
Signed-off-by: Fedor Pchelkin <pchelkin@ispras.ru>
Acked-by: Alexander Aring <aahringo@redhat.com>
Message-ID: <20240228163840.6667-1-pchelkin@ispras.ru>
Signed-off-by: Stefan Schmidt <stefan@datenfreihafen.org>
---
 include/net/cfg802154.h |  1 +
 net/mac802154/llsec.c   | 18 +++++++++++++-----
 2 files changed, 14 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h
index cd95711b12b8..76d2cd2e2b30 100644
--- a/include/net/cfg802154.h
+++ b/include/net/cfg802154.h
@@ -401,6 +401,7 @@ struct ieee802154_llsec_key {
 
 struct ieee802154_llsec_key_entry {
 	struct list_head list;
+	struct rcu_head rcu;
 
 	struct ieee802154_llsec_key_id id;
 	struct ieee802154_llsec_key *key;
diff --git a/net/mac802154/llsec.c b/net/mac802154/llsec.c
index 8d2eabc71bbe..f13b07ebfb98 100644
--- a/net/mac802154/llsec.c
+++ b/net/mac802154/llsec.c
@@ -265,19 +265,27 @@ fail:
 	return -ENOMEM;
 }
 
+static void mac802154_llsec_key_del_rcu(struct rcu_head *rcu)
+{
+	struct ieee802154_llsec_key_entry *pos;
+	struct mac802154_llsec_key *mkey;
+
+	pos = container_of(rcu, struct ieee802154_llsec_key_entry, rcu);
+	mkey = container_of(pos->key, struct mac802154_llsec_key, key);
+
+	llsec_key_put(mkey);
+	kfree_sensitive(pos);
+}
+
 int mac802154_llsec_key_del(struct mac802154_llsec *sec,
 			    const struct ieee802154_llsec_key_id *key)
 {
 	struct ieee802154_llsec_key_entry *pos;
 
 	list_for_each_entry(pos, &sec->table.keys, list) {
-		struct mac802154_llsec_key *mkey;
-
-		mkey = container_of(pos->key, struct mac802154_llsec_key, key);
-
 		if (llsec_key_id_equal(&pos->id, key)) {
 			list_del_rcu(&pos->list);
-			llsec_key_put(mkey);
+			call_rcu(&pos->rcu, mac802154_llsec_key_del_rcu);
 			return 0;
 		}
 	}
-- 
cgit v1.2.3


From 968667f2e0345a67a6eea5a502f4659085666564 Mon Sep 17 00:00:00 2001
From: Jonas Dreßler <verdre@v0yd.nl>
Date: Sun, 7 Jan 2024 19:02:47 +0100
Subject: Bluetooth: Remove HCI_POWER_OFF_TIMEOUT
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With commit cf75ad8b41d2 ("Bluetooth: hci_sync: Convert MGMT_SET_POWERED"),
the power off sequence got refactored so that this timeout was no longer
necessary, let's remove the leftover define from the header too.

Fixes: cf75ad8b41d2 ("Bluetooth: hci_sync: Convert MGMT_SET_POWERED")
Signed-off-by: Jonas Dreßler <verdre@v0yd.nl>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index bdee5d649cc6..f7918c755183 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -437,7 +437,6 @@ enum {
 #define HCI_NCMD_TIMEOUT	msecs_to_jiffies(4000)	/* 4 seconds */
 #define HCI_ACL_TX_TIMEOUT	msecs_to_jiffies(45000)	/* 45 seconds */
 #define HCI_AUTO_OFF_TIMEOUT	msecs_to_jiffies(2000)	/* 2 seconds */
-#define HCI_POWER_OFF_TIMEOUT	msecs_to_jiffies(5000)	/* 5 seconds */
 #define HCI_LE_CONN_TIMEOUT	msecs_to_jiffies(20000)	/* 20 seconds */
 #define HCI_LE_AUTOCONN_TIMEOUT	msecs_to_jiffies(4000)	/* 4 seconds */
 
-- 
cgit v1.2.3


From b14202aff5acba3b672704d792e821f02f8f562a Mon Sep 17 00:00:00 2001
From: Jonas Dreßler <verdre@v0yd.nl>
Date: Sun, 7 Jan 2024 19:02:49 +0100
Subject: Bluetooth: Add new state HCI_POWERING_DOWN
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a new state HCI_POWERING_DOWN that indicates that the device is
currently powering down, this will be useful for the next commit.

Signed-off-by: Jonas Dreßler <verdre@v0yd.nl>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci.h |  1 +
 net/bluetooth/hci_sync.c    | 16 +++++++++++-----
 net/bluetooth/mgmt.c        | 14 ++++++++++++++
 3 files changed, 26 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index f7918c755183..a94a8491ec7a 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -372,6 +372,7 @@ enum {
 	HCI_SETUP,
 	HCI_CONFIG,
 	HCI_DEBUGFS_CREATED,
+	HCI_POWERING_DOWN,
 	HCI_AUTO_OFF,
 	HCI_RFKILLED,
 	HCI_MGMT,
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index 5716345a26df..b146562a65fc 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -5403,27 +5403,33 @@ static int hci_power_off_sync(struct hci_dev *hdev)
 	if (!test_bit(HCI_UP, &hdev->flags))
 		return 0;
 
+	hci_dev_set_flag(hdev, HCI_POWERING_DOWN);
+
 	if (test_bit(HCI_ISCAN, &hdev->flags) ||
 	    test_bit(HCI_PSCAN, &hdev->flags)) {
 		err = hci_write_scan_enable_sync(hdev, 0x00);
 		if (err)
-			return err;
+			goto out;
 	}
 
 	err = hci_clear_adv_sync(hdev, NULL, false);
 	if (err)
-		return err;
+		goto out;
 
 	err = hci_stop_discovery_sync(hdev);
 	if (err)
-		return err;
+		goto out;
 
 	/* Terminated due to Power Off */
 	err = hci_disconnect_all_sync(hdev, HCI_ERROR_REMOTE_POWER_OFF);
 	if (err)
-		return err;
+		goto out;
+
+	err = hci_dev_close_sync(hdev);
 
-	return hci_dev_close_sync(hdev);
+out:
+	hci_dev_clear_flag(hdev, HCI_POWERING_DOWN);
+	return err;
 }
 
 int hci_set_powered_sync(struct hci_dev *hdev, u8 val)
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index 688890f581cb..3fe6db8a0ec0 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -1390,6 +1390,14 @@ static int set_powered(struct sock *sk, struct hci_dev *hdev, void *data,
 
 	hci_dev_lock(hdev);
 
+	if (!cp->val) {
+		if (hci_dev_test_flag(hdev, HCI_POWERING_DOWN)) {
+			err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_POWERED,
+					      MGMT_STATUS_BUSY);
+			goto failed;
+		}
+	}
+
 	if (pending_find(MGMT_OP_SET_POWERED, hdev)) {
 		err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_POWERED,
 				      MGMT_STATUS_BUSY);
@@ -9748,6 +9756,9 @@ bool mgmt_powering_down(struct hci_dev *hdev)
 	struct mgmt_pending_cmd *cmd;
 	struct mgmt_mode *cp;
 
+	if (hci_dev_test_flag(hdev, HCI_POWERING_DOWN))
+		return true;
+
 	cmd = pending_find(MGMT_OP_SET_POWERED, hdev);
 	if (!cmd)
 		return false;
@@ -10055,6 +10066,9 @@ void mgmt_set_local_name_complete(struct hci_dev *hdev, u8 *name, u8 status)
 		/* If this is a HCI command related to powering on the
 		 * HCI dev don't send any mgmt signals.
 		 */
+		if (hci_dev_test_flag(hdev, HCI_POWERING_DOWN))
+			return;
+
 		if (pending_find(MGMT_OP_SET_POWERED, hdev))
 			return;
 	}
-- 
cgit v1.2.3


From 79c0868ad65a8fc7cdfaa5f2b77a4b70d0b0ea16 Mon Sep 17 00:00:00 2001
From: Jonas Dreßler <verdre@v0yd.nl>
Date: Mon, 8 Jan 2024 23:46:07 +0100
Subject: Bluetooth: hci_event: Use HCI error defines instead of magic values
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We have error defines already, so let's use them.

Signed-off-by: Jonas Dreßler <verdre@v0yd.nl>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci.h | 2 ++
 net/bluetooth/hci_event.c   | 8 ++++----
 2 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index a94a8491ec7a..1cd212bb3789 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -653,6 +653,7 @@ enum {
 #define HCI_ERROR_PIN_OR_KEY_MISSING	0x06
 #define HCI_ERROR_MEMORY_EXCEEDED	0x07
 #define HCI_ERROR_CONNECTION_TIMEOUT	0x08
+#define HCI_ERROR_COMMAND_DISALLOWED	0x0c
 #define HCI_ERROR_REJ_LIMITED_RESOURCES	0x0d
 #define HCI_ERROR_REJ_BAD_ADDR		0x0f
 #define HCI_ERROR_INVALID_PARAMETERS	0x12
@@ -661,6 +662,7 @@ enum {
 #define HCI_ERROR_REMOTE_POWER_OFF	0x15
 #define HCI_ERROR_LOCAL_HOST_TERM	0x16
 #define HCI_ERROR_PAIRING_NOT_ALLOWED	0x18
+#define HCI_ERROR_UNSUPPORTED_REMOTE_FEATURE	0x1e
 #define HCI_ERROR_INVALID_LL_PARAMS	0x1e
 #define HCI_ERROR_UNSPECIFIED		0x1f
 #define HCI_ERROR_ADVERTISING_TIMEOUT	0x3c
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 1f63f77661dc..31df5f5b7994 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -95,11 +95,11 @@ static u8 hci_cc_inquiry_cancel(struct hci_dev *hdev, void *data,
 	/* It is possible that we receive Inquiry Complete event right
 	 * before we receive Inquiry Cancel Command Complete event, in
 	 * which case the latter event should have status of Command
-	 * Disallowed (0x0c). This should not be treated as error, since
+	 * Disallowed. This should not be treated as error, since
 	 * we actually achieve what Inquiry Cancel wants to achieve,
 	 * which is to end the last Inquiry session.
 	 */
-	if (rp->status == 0x0c && !test_bit(HCI_INQUIRY, &hdev->flags)) {
+	if (rp->status == HCI_ERROR_COMMAND_DISALLOWED && !test_bit(HCI_INQUIRY, &hdev->flags)) {
 		bt_dev_warn(hdev, "Ignoring error of Inquiry Cancel command");
 		rp->status = 0x00;
 	}
@@ -2342,7 +2342,7 @@ static void hci_cs_create_conn(struct hci_dev *hdev, __u8 status)
 
 	if (status) {
 		if (conn && conn->state == BT_CONNECT) {
-			if (status != 0x0c || conn->attempt > 2) {
+			if (status != HCI_ERROR_COMMAND_DISALLOWED || conn->attempt > 2) {
 				conn->state = BT_CLOSED;
 				hci_connect_cfm(conn, status);
 				hci_conn_del(conn);
@@ -6682,7 +6682,7 @@ static void hci_le_remote_feat_complete_evt(struct hci_dev *hdev, void *data,
 			 * transition into connected state and mark it as
 			 * successful.
 			 */
-			if (!conn->out && ev->status == 0x1a &&
+			if (!conn->out && ev->status == HCI_ERROR_UNSUPPORTED_REMOTE_FEATURE &&
 			    (hdev->le_features[0] & HCI_LE_PERIPHERAL_FEATURES))
 				status = 0x00;
 			else
-- 
cgit v1.2.3


From 63298d6e752fc0ec7f5093860af8bc9f047b30c8 Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Tue, 9 Jan 2024 13:45:40 -0500
Subject: Bluetooth: hci_core: Cancel request on command timeout

If command has timed out call __hci_cmd_sync_cancel to notify the
hci_req since it will inevitably cause a timeout.

This also rework the code around __hci_cmd_sync_cancel since it was
wrongly assuming it needs to cancel timer as well, but sometimes the
timers have not been started or in fact they already had timed out in
which case they don't need to be cancel yet again.

Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_sync.h |  2 +-
 net/bluetooth/hci_core.c         | 84 +++++++++++++++++++++++++++-------------
 net/bluetooth/hci_request.c      |  2 +-
 net/bluetooth/hci_sync.c         | 20 +++++-----
 net/bluetooth/mgmt.c             |  2 +-
 5 files changed, 71 insertions(+), 39 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci_sync.h b/include/net/bluetooth/hci_sync.h
index 6efbc2152146..e2582c242544 100644
--- a/include/net/bluetooth/hci_sync.h
+++ b/include/net/bluetooth/hci_sync.h
@@ -42,7 +42,7 @@ int __hci_cmd_sync_status_sk(struct hci_dev *hdev, u16 opcode, u32 plen,
 void hci_cmd_sync_init(struct hci_dev *hdev);
 void hci_cmd_sync_clear(struct hci_dev *hdev);
 void hci_cmd_sync_cancel(struct hci_dev *hdev, int err);
-void __hci_cmd_sync_cancel(struct hci_dev *hdev, int err);
+void hci_cmd_sync_cancel_sync(struct hci_dev *hdev, int err);
 
 int hci_cmd_sync_submit(struct hci_dev *hdev, hci_cmd_sync_work_func_t func,
 			void *data, hci_cmd_sync_work_destroy_t destroy);
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index e5cb618fa6d3..de730d210ccb 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -1523,10 +1523,11 @@ static void hci_cmd_timeout(struct work_struct *work)
 					    cmd_timer.work);
 
 	if (hdev->sent_cmd) {
-		struct hci_command_hdr *sent = (void *) hdev->sent_cmd->data;
-		u16 opcode = __le16_to_cpu(sent->opcode);
+		u16 opcode = hci_skb_opcode(hdev->sent_cmd);
 
 		bt_dev_err(hdev, "command 0x%4.4x tx timeout", opcode);
+
+		hci_cmd_sync_cancel_sync(hdev, ETIMEDOUT);
 	} else {
 		bt_dev_err(hdev, "command tx timeout");
 	}
@@ -2857,6 +2858,23 @@ int hci_unregister_suspend_notifier(struct hci_dev *hdev)
 	return ret;
 }
 
+/* Cancel ongoing command synchronously:
+ *
+ * - Cancel command timer
+ * - Reset command counter
+ * - Cancel command request
+ */
+static void hci_cancel_cmd_sync(struct hci_dev *hdev, int err)
+{
+	bt_dev_dbg(hdev, "err 0x%2.2x", err);
+
+	cancel_delayed_work_sync(&hdev->cmd_timer);
+	cancel_delayed_work_sync(&hdev->ncmd_timer);
+	atomic_set(&hdev->cmd_cnt, 1);
+
+	hci_cmd_sync_cancel_sync(hdev, -err);
+}
+
 /* Suspend HCI device */
 int hci_suspend_dev(struct hci_dev *hdev)
 {
@@ -2874,7 +2892,7 @@ int hci_suspend_dev(struct hci_dev *hdev)
 		return 0;
 
 	/* Cancel potentially blocking sync operation before suspend */
-	__hci_cmd_sync_cancel(hdev, -EHOSTDOWN);
+	hci_cancel_cmd_sync(hdev, -EHOSTDOWN);
 
 	hci_req_sync_lock(hdev);
 	ret = hci_suspend_sync(hdev);
@@ -4159,6 +4177,33 @@ static void hci_rx_work(struct work_struct *work)
 	}
 }
 
+static void hci_send_cmd_sync(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	int err;
+
+	bt_dev_dbg(hdev, "skb %p", skb);
+
+	kfree_skb(hdev->sent_cmd);
+
+	hdev->sent_cmd = skb_clone(skb, GFP_KERNEL);
+	if (!hdev->sent_cmd) {
+		skb_queue_head(&hdev->cmd_q, skb);
+		queue_work(hdev->workqueue, &hdev->cmd_work);
+		return;
+	}
+
+	err = hci_send_frame(hdev, skb);
+	if (err < 0) {
+		hci_cmd_sync_cancel_sync(hdev, err);
+		return;
+	}
+
+	if (hci_req_status_pend(hdev))
+		hci_dev_set_flag(hdev, HCI_CMD_PENDING);
+
+	atomic_dec(&hdev->cmd_cnt);
+}
+
 static void hci_cmd_work(struct work_struct *work)
 {
 	struct hci_dev *hdev = container_of(work, struct hci_dev, cmd_work);
@@ -4173,30 +4218,15 @@ static void hci_cmd_work(struct work_struct *work)
 		if (!skb)
 			return;
 
-		kfree_skb(hdev->sent_cmd);
-
-		hdev->sent_cmd = skb_clone(skb, GFP_KERNEL);
-		if (hdev->sent_cmd) {
-			int res;
-			if (hci_req_status_pend(hdev))
-				hci_dev_set_flag(hdev, HCI_CMD_PENDING);
-			atomic_dec(&hdev->cmd_cnt);
+		hci_send_cmd_sync(hdev, skb);
 
-			res = hci_send_frame(hdev, skb);
-			if (res < 0)
-				__hci_cmd_sync_cancel(hdev, -res);
-
-			rcu_read_lock();
-			if (test_bit(HCI_RESET, &hdev->flags) ||
-			    hci_dev_test_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE))
-				cancel_delayed_work(&hdev->cmd_timer);
-			else
-				queue_delayed_work(hdev->workqueue, &hdev->cmd_timer,
-						   HCI_CMD_TIMEOUT);
-			rcu_read_unlock();
-		} else {
-			skb_queue_head(&hdev->cmd_q, skb);
-			queue_work(hdev->workqueue, &hdev->cmd_work);
-		}
+		rcu_read_lock();
+		if (test_bit(HCI_RESET, &hdev->flags) ||
+		    hci_dev_test_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE))
+			cancel_delayed_work(&hdev->cmd_timer);
+		else
+			queue_delayed_work(hdev->workqueue, &hdev->cmd_timer,
+					   HCI_CMD_TIMEOUT);
+		rcu_read_unlock();
 	}
 }
diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c
index 6e023b0104b0..00e02138003e 100644
--- a/net/bluetooth/hci_request.c
+++ b/net/bluetooth/hci_request.c
@@ -895,7 +895,7 @@ void hci_request_setup(struct hci_dev *hdev)
 
 void hci_request_cancel_all(struct hci_dev *hdev)
 {
-	__hci_cmd_sync_cancel(hdev, ENODEV);
+	hci_cmd_sync_cancel_sync(hdev, ENODEV);
 
 	cancel_interleave_scan(hdev);
 }
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index b146562a65fc..1122296ce3fa 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -584,7 +584,7 @@ void hci_cmd_sync_clear(struct hci_dev *hdev)
 	mutex_unlock(&hdev->cmd_sync_work_lock);
 }
 
-void __hci_cmd_sync_cancel(struct hci_dev *hdev, int err)
+void hci_cmd_sync_cancel(struct hci_dev *hdev, int err)
 {
 	bt_dev_dbg(hdev, "err 0x%2.2x", err);
 
@@ -592,15 +592,17 @@ void __hci_cmd_sync_cancel(struct hci_dev *hdev, int err)
 		hdev->req_result = err;
 		hdev->req_status = HCI_REQ_CANCELED;
 
-		cancel_delayed_work_sync(&hdev->cmd_timer);
-		cancel_delayed_work_sync(&hdev->ncmd_timer);
-		atomic_set(&hdev->cmd_cnt, 1);
-
-		wake_up_interruptible(&hdev->req_wait_q);
+		queue_work(hdev->workqueue, &hdev->cmd_sync_cancel_work);
 	}
 }
+EXPORT_SYMBOL(hci_cmd_sync_cancel);
 
-void hci_cmd_sync_cancel(struct hci_dev *hdev, int err)
+/* Cancel ongoing command request synchronously:
+ *
+ * - Set result and mark status to HCI_REQ_CANCELED
+ * - Wakeup command sync thread
+ */
+void hci_cmd_sync_cancel_sync(struct hci_dev *hdev, int err)
 {
 	bt_dev_dbg(hdev, "err 0x%2.2x", err);
 
@@ -608,10 +610,10 @@ void hci_cmd_sync_cancel(struct hci_dev *hdev, int err)
 		hdev->req_result = err;
 		hdev->req_status = HCI_REQ_CANCELED;
 
-		queue_work(hdev->workqueue, &hdev->cmd_sync_cancel_work);
+		wake_up_interruptible(&hdev->req_wait_q);
 	}
 }
-EXPORT_SYMBOL(hci_cmd_sync_cancel);
+EXPORT_SYMBOL(hci_cmd_sync_cancel_sync);
 
 /* Submit HCI command to be run in as cmd_sync_work:
  *
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index 3fe6db8a0ec0..8c4493255f92 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -1417,7 +1417,7 @@ static int set_powered(struct sock *sk, struct hci_dev *hdev, void *data,
 
 	/* Cancel potentially blocking sync operation before power off */
 	if (cp->val == 0x00) {
-		__hci_cmd_sync_cancel(hdev, -EHOSTDOWN);
+		hci_cmd_sync_cancel_sync(hdev, -EHOSTDOWN);
 		err = hci_cmd_sync_queue(hdev, set_powered_sync, cmd,
 					 mgmt_set_powered_complete);
 	} else {
-- 
cgit v1.2.3


From e7b02296fb400ee64822fbdd81a0718449066333 Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Thu, 1 Feb 2024 11:18:58 -0500
Subject: Bluetooth: Remove BT_HS

High Speed, Alternate MAC and PHY (AMP) extension, has been removed from
Bluetooth Core specification on 5.3:

https://www.bluetooth.com/blog/new-core-specification-v5-3-feature-enhancements/

Fixes: 244bc377591c ("Bluetooth: Add BT_HS config option")
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci.h   |    1 -
 include/net/bluetooth/l2cap.h |   42 --
 net/bluetooth/Kconfig         |    8 -
 net/bluetooth/Makefile        |    1 -
 net/bluetooth/a2mp.c          | 1054 ----------------------------------------
 net/bluetooth/a2mp.h          |  154 ------
 net/bluetooth/amp.c           |  590 -----------------------
 net/bluetooth/amp.h           |   60 ---
 net/bluetooth/hci_conn.c      |    4 -
 net/bluetooth/hci_event.c     |    2 -
 net/bluetooth/l2cap_core.c    | 1069 +----------------------------------------
 net/bluetooth/l2cap_sock.c    |   18 +-
 net/bluetooth/mgmt.c          |   73 +--
 13 files changed, 20 insertions(+), 3056 deletions(-)
 delete mode 100644 net/bluetooth/a2mp.c
 delete mode 100644 net/bluetooth/a2mp.h
 delete mode 100644 net/bluetooth/amp.c
 delete mode 100644 net/bluetooth/amp.h

(limited to 'include')

diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index 1cd212bb3789..aa6c69053d7c 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -394,7 +394,6 @@ enum {
 	HCI_LIMITED_PRIVACY,
 	HCI_RPA_EXPIRED,
 	HCI_RPA_RESOLVING,
-	HCI_HS_ENABLED,
 	HCI_LE_ENABLED,
 	HCI_ADVERTISING,
 	HCI_ADVERTISING_CONNECTABLE,
diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h
index cf393e72d6ed..92d7197f9a56 100644
--- a/include/net/bluetooth/l2cap.h
+++ b/include/net/bluetooth/l2cap.h
@@ -59,8 +59,6 @@
 #define L2CAP_WAIT_ACK_POLL_PERIOD	msecs_to_jiffies(200)
 #define L2CAP_WAIT_ACK_TIMEOUT		msecs_to_jiffies(10000)
 
-#define L2CAP_A2MP_DEFAULT_MTU		670
-
 /* L2CAP socket address */
 struct sockaddr_l2 {
 	sa_family_t	l2_family;
@@ -109,12 +107,6 @@ struct l2cap_conninfo {
 #define L2CAP_ECHO_RSP		0x09
 #define L2CAP_INFO_REQ		0x0a
 #define L2CAP_INFO_RSP		0x0b
-#define L2CAP_CREATE_CHAN_REQ	0x0c
-#define L2CAP_CREATE_CHAN_RSP	0x0d
-#define L2CAP_MOVE_CHAN_REQ	0x0e
-#define L2CAP_MOVE_CHAN_RSP	0x0f
-#define L2CAP_MOVE_CHAN_CFM	0x10
-#define L2CAP_MOVE_CHAN_CFM_RSP	0x11
 #define L2CAP_CONN_PARAM_UPDATE_REQ	0x12
 #define L2CAP_CONN_PARAM_UPDATE_RSP	0x13
 #define L2CAP_LE_CONN_REQ	0x14
@@ -144,7 +136,6 @@ struct l2cap_conninfo {
 /* L2CAP fixed channels */
 #define L2CAP_FC_SIG_BREDR	0x02
 #define L2CAP_FC_CONNLESS	0x04
-#define L2CAP_FC_A2MP		0x08
 #define L2CAP_FC_ATT		0x10
 #define L2CAP_FC_SIG_LE		0x20
 #define L2CAP_FC_SMP_LE		0x40
@@ -267,7 +258,6 @@ struct l2cap_conn_rsp {
 /* channel identifier */
 #define L2CAP_CID_SIGNALING	0x0001
 #define L2CAP_CID_CONN_LESS	0x0002
-#define L2CAP_CID_A2MP		0x0003
 #define L2CAP_CID_ATT		0x0004
 #define L2CAP_CID_LE_SIGNALING	0x0005
 #define L2CAP_CID_SMP		0x0006
@@ -282,7 +272,6 @@ struct l2cap_conn_rsp {
 #define L2CAP_CR_BAD_PSM	0x0002
 #define L2CAP_CR_SEC_BLOCK	0x0003
 #define L2CAP_CR_NO_MEM		0x0004
-#define L2CAP_CR_BAD_AMP	0x0005
 #define L2CAP_CR_INVALID_SCID	0x0006
 #define L2CAP_CR_SCID_IN_USE	0x0007
 
@@ -404,29 +393,6 @@ struct l2cap_info_rsp {
 	__u8        data[];
 } __packed;
 
-struct l2cap_create_chan_req {
-	__le16      psm;
-	__le16      scid;
-	__u8        amp_id;
-} __packed;
-
-struct l2cap_create_chan_rsp {
-	__le16      dcid;
-	__le16      scid;
-	__le16      result;
-	__le16      status;
-} __packed;
-
-struct l2cap_move_chan_req {
-	__le16      icid;
-	__u8        dest_amp_id;
-} __packed;
-
-struct l2cap_move_chan_rsp {
-	__le16      icid;
-	__le16      result;
-} __packed;
-
 #define L2CAP_MR_SUCCESS	0x0000
 #define L2CAP_MR_PEND		0x0001
 #define L2CAP_MR_BAD_ID		0x0002
@@ -539,8 +505,6 @@ struct l2cap_seq_list {
 
 struct l2cap_chan {
 	struct l2cap_conn	*conn;
-	struct hci_conn		*hs_hcon;
-	struct hci_chan		*hs_hchan;
 	struct kref	kref;
 	atomic_t	nesting;
 
@@ -591,12 +555,6 @@ struct l2cap_chan {
 	unsigned long	conn_state;
 	unsigned long	flags;
 
-	__u8		remote_amp_id;
-	__u8		local_amp_id;
-	__u8		move_id;
-	__u8		move_state;
-	__u8		move_role;
-
 	__u16		next_tx_seq;
 	__u16		expected_ack_seq;
 	__u16		expected_tx_seq;
diff --git a/net/bluetooth/Kconfig b/net/bluetooth/Kconfig
index da7cac0a1b71..6b2b65a66700 100644
--- a/net/bluetooth/Kconfig
+++ b/net/bluetooth/Kconfig
@@ -62,14 +62,6 @@ source "net/bluetooth/cmtp/Kconfig"
 
 source "net/bluetooth/hidp/Kconfig"
 
-config BT_HS
-	bool "Bluetooth High Speed (HS) features"
-	depends on BT_BREDR
-	help
-	  Bluetooth High Speed includes support for off-loading
-	  Bluetooth connections via 802.11 (wifi) physical layer
-	  available with Bluetooth version 3.0 or later.
-
 config BT_LE
 	bool "Bluetooth Low Energy (LE) features"
 	depends on BT
diff --git a/net/bluetooth/Makefile b/net/bluetooth/Makefile
index 141ac1fda0bf..628d448d78be 100644
--- a/net/bluetooth/Makefile
+++ b/net/bluetooth/Makefile
@@ -21,7 +21,6 @@ bluetooth-$(CONFIG_DEV_COREDUMP) += coredump.o
 
 bluetooth-$(CONFIG_BT_BREDR) += sco.o
 bluetooth-$(CONFIG_BT_LE) += iso.o
-bluetooth-$(CONFIG_BT_HS) += a2mp.o amp.o
 bluetooth-$(CONFIG_BT_LEDS) += leds.o
 bluetooth-$(CONFIG_BT_MSFTEXT) += msft.o
 bluetooth-$(CONFIG_BT_AOSPEXT) += aosp.o
diff --git a/net/bluetooth/a2mp.c b/net/bluetooth/a2mp.c
deleted file mode 100644
index e7adb8a98cf9..000000000000
--- a/net/bluetooth/a2mp.c
+++ /dev/null
@@ -1,1054 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
-   Copyright (c) 2010,2011 Code Aurora Forum.  All rights reserved.
-   Copyright (c) 2011,2012 Intel Corp.
-
-*/
-
-#include <net/bluetooth/bluetooth.h>
-#include <net/bluetooth/hci_core.h>
-#include <net/bluetooth/l2cap.h>
-
-#include "hci_request.h"
-#include "a2mp.h"
-#include "amp.h"
-
-#define A2MP_FEAT_EXT	0x8000
-
-/* Global AMP Manager list */
-static LIST_HEAD(amp_mgr_list);
-static DEFINE_MUTEX(amp_mgr_list_lock);
-
-/* A2MP build & send command helper functions */
-static struct a2mp_cmd *__a2mp_build(u8 code, u8 ident, u16 len, void *data)
-{
-	struct a2mp_cmd *cmd;
-	int plen;
-
-	plen = sizeof(*cmd) + len;
-	cmd = kzalloc(plen, GFP_KERNEL);
-	if (!cmd)
-		return NULL;
-
-	cmd->code = code;
-	cmd->ident = ident;
-	cmd->len = cpu_to_le16(len);
-
-	memcpy(cmd->data, data, len);
-
-	return cmd;
-}
-
-static void a2mp_send(struct amp_mgr *mgr, u8 code, u8 ident, u16 len, void *data)
-{
-	struct l2cap_chan *chan = mgr->a2mp_chan;
-	struct a2mp_cmd *cmd;
-	u16 total_len = len + sizeof(*cmd);
-	struct kvec iv;
-	struct msghdr msg;
-
-	cmd = __a2mp_build(code, ident, len, data);
-	if (!cmd)
-		return;
-
-	iv.iov_base = cmd;
-	iv.iov_len = total_len;
-
-	memset(&msg, 0, sizeof(msg));
-
-	iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &iv, 1, total_len);
-
-	l2cap_chan_send(chan, &msg, total_len);
-
-	kfree(cmd);
-}
-
-static u8 __next_ident(struct amp_mgr *mgr)
-{
-	if (++mgr->ident == 0)
-		mgr->ident = 1;
-
-	return mgr->ident;
-}
-
-static struct amp_mgr *amp_mgr_lookup_by_state(u8 state)
-{
-	struct amp_mgr *mgr;
-
-	mutex_lock(&amp_mgr_list_lock);
-	list_for_each_entry(mgr, &amp_mgr_list, list) {
-		if (test_and_clear_bit(state, &mgr->state)) {
-			amp_mgr_get(mgr);
-			mutex_unlock(&amp_mgr_list_lock);
-			return mgr;
-		}
-	}
-	mutex_unlock(&amp_mgr_list_lock);
-
-	return NULL;
-}
-
-/* hci_dev_list shall be locked */
-static void __a2mp_add_cl(struct amp_mgr *mgr, struct a2mp_cl *cl)
-{
-	struct hci_dev *hdev;
-	int i = 1;
-
-	cl[0].id = AMP_ID_BREDR;
-	cl[0].type = AMP_TYPE_BREDR;
-	cl[0].status = AMP_STATUS_BLUETOOTH_ONLY;
-
-	list_for_each_entry(hdev, &hci_dev_list, list) {
-		if (hdev->dev_type == HCI_AMP) {
-			cl[i].id = hdev->id;
-			cl[i].type = hdev->amp_type;
-			if (test_bit(HCI_UP, &hdev->flags))
-				cl[i].status = hdev->amp_status;
-			else
-				cl[i].status = AMP_STATUS_POWERED_DOWN;
-			i++;
-		}
-	}
-}
-
-/* Processing A2MP messages */
-static int a2mp_command_rej(struct amp_mgr *mgr, struct sk_buff *skb,
-			    struct a2mp_cmd *hdr)
-{
-	struct a2mp_cmd_rej *rej = (void *) skb->data;
-
-	if (le16_to_cpu(hdr->len) < sizeof(*rej))
-		return -EINVAL;
-
-	BT_DBG("ident %u reason %d", hdr->ident, le16_to_cpu(rej->reason));
-
-	skb_pull(skb, sizeof(*rej));
-
-	return 0;
-}
-
-static int a2mp_discover_req(struct amp_mgr *mgr, struct sk_buff *skb,
-			     struct a2mp_cmd *hdr)
-{
-	struct a2mp_discov_req *req = (void *) skb->data;
-	u16 len = le16_to_cpu(hdr->len);
-	struct a2mp_discov_rsp *rsp;
-	u16 ext_feat;
-	u8 num_ctrl;
-	struct hci_dev *hdev;
-
-	if (len < sizeof(*req))
-		return -EINVAL;
-
-	skb_pull(skb, sizeof(*req));
-
-	ext_feat = le16_to_cpu(req->ext_feat);
-
-	BT_DBG("mtu %d efm 0x%4.4x", le16_to_cpu(req->mtu), ext_feat);
-
-	/* check that packet is not broken for now */
-	while (ext_feat & A2MP_FEAT_EXT) {
-		if (len < sizeof(ext_feat))
-			return -EINVAL;
-
-		ext_feat = get_unaligned_le16(skb->data);
-		BT_DBG("efm 0x%4.4x", ext_feat);
-		len -= sizeof(ext_feat);
-		skb_pull(skb, sizeof(ext_feat));
-	}
-
-	read_lock(&hci_dev_list_lock);
-
-	/* at minimum the BR/EDR needs to be listed */
-	num_ctrl = 1;
-
-	list_for_each_entry(hdev, &hci_dev_list, list) {
-		if (hdev->dev_type == HCI_AMP)
-			num_ctrl++;
-	}
-
-	len = struct_size(rsp, cl, num_ctrl);
-	rsp = kmalloc(len, GFP_ATOMIC);
-	if (!rsp) {
-		read_unlock(&hci_dev_list_lock);
-		return -ENOMEM;
-	}
-
-	rsp->mtu = cpu_to_le16(L2CAP_A2MP_DEFAULT_MTU);
-	rsp->ext_feat = 0;
-
-	__a2mp_add_cl(mgr, rsp->cl);
-
-	read_unlock(&hci_dev_list_lock);
-
-	a2mp_send(mgr, A2MP_DISCOVER_RSP, hdr->ident, len, rsp);
-
-	kfree(rsp);
-	return 0;
-}
-
-static int a2mp_discover_rsp(struct amp_mgr *mgr, struct sk_buff *skb,
-			     struct a2mp_cmd *hdr)
-{
-	struct a2mp_discov_rsp *rsp = (void *) skb->data;
-	u16 len = le16_to_cpu(hdr->len);
-	struct a2mp_cl *cl;
-	u16 ext_feat;
-	bool found = false;
-
-	if (len < sizeof(*rsp))
-		return -EINVAL;
-
-	len -= sizeof(*rsp);
-	skb_pull(skb, sizeof(*rsp));
-
-	ext_feat = le16_to_cpu(rsp->ext_feat);
-
-	BT_DBG("mtu %d efm 0x%4.4x", le16_to_cpu(rsp->mtu), ext_feat);
-
-	/* check that packet is not broken for now */
-	while (ext_feat & A2MP_FEAT_EXT) {
-		if (len < sizeof(ext_feat))
-			return -EINVAL;
-
-		ext_feat = get_unaligned_le16(skb->data);
-		BT_DBG("efm 0x%4.4x", ext_feat);
-		len -= sizeof(ext_feat);
-		skb_pull(skb, sizeof(ext_feat));
-	}
-
-	cl = (void *) skb->data;
-	while (len >= sizeof(*cl)) {
-		BT_DBG("Remote AMP id %u type %u status %u", cl->id, cl->type,
-		       cl->status);
-
-		if (cl->id != AMP_ID_BREDR && cl->type != AMP_TYPE_BREDR) {
-			struct a2mp_info_req req;
-
-			found = true;
-
-			memset(&req, 0, sizeof(req));
-
-			req.id = cl->id;
-			a2mp_send(mgr, A2MP_GETINFO_REQ, __next_ident(mgr),
-				  sizeof(req), &req);
-		}
-
-		len -= sizeof(*cl);
-		cl = skb_pull(skb, sizeof(*cl));
-	}
-
-	/* Fall back to L2CAP init sequence */
-	if (!found) {
-		struct l2cap_conn *conn = mgr->l2cap_conn;
-		struct l2cap_chan *chan;
-
-		mutex_lock(&conn->chan_lock);
-
-		list_for_each_entry(chan, &conn->chan_l, list) {
-
-			BT_DBG("chan %p state %s", chan,
-			       state_to_string(chan->state));
-
-			if (chan->scid == L2CAP_CID_A2MP)
-				continue;
-
-			l2cap_chan_lock(chan);
-
-			if (chan->state == BT_CONNECT)
-				l2cap_send_conn_req(chan);
-
-			l2cap_chan_unlock(chan);
-		}
-
-		mutex_unlock(&conn->chan_lock);
-	}
-
-	return 0;
-}
-
-static int a2mp_change_notify(struct amp_mgr *mgr, struct sk_buff *skb,
-			      struct a2mp_cmd *hdr)
-{
-	struct a2mp_cl *cl = (void *) skb->data;
-
-	while (skb->len >= sizeof(*cl)) {
-		BT_DBG("Controller id %u type %u status %u", cl->id, cl->type,
-		       cl->status);
-		cl = skb_pull(skb, sizeof(*cl));
-	}
-
-	/* TODO send A2MP_CHANGE_RSP */
-
-	return 0;
-}
-
-static void read_local_amp_info_complete(struct hci_dev *hdev, u8 status,
-					 u16 opcode)
-{
-	BT_DBG("%s status 0x%2.2x", hdev->name, status);
-
-	a2mp_send_getinfo_rsp(hdev);
-}
-
-static int a2mp_getinfo_req(struct amp_mgr *mgr, struct sk_buff *skb,
-			    struct a2mp_cmd *hdr)
-{
-	struct a2mp_info_req *req  = (void *) skb->data;
-	struct hci_dev *hdev;
-	struct hci_request hreq;
-	int err = 0;
-
-	if (le16_to_cpu(hdr->len) < sizeof(*req))
-		return -EINVAL;
-
-	BT_DBG("id %u", req->id);
-
-	hdev = hci_dev_get(req->id);
-	if (!hdev || hdev->dev_type != HCI_AMP) {
-		struct a2mp_info_rsp rsp;
-
-		memset(&rsp, 0, sizeof(rsp));
-
-		rsp.id = req->id;
-		rsp.status = A2MP_STATUS_INVALID_CTRL_ID;
-
-		a2mp_send(mgr, A2MP_GETINFO_RSP, hdr->ident, sizeof(rsp),
-			  &rsp);
-
-		goto done;
-	}
-
-	set_bit(READ_LOC_AMP_INFO, &mgr->state);
-	hci_req_init(&hreq, hdev);
-	hci_req_add(&hreq, HCI_OP_READ_LOCAL_AMP_INFO, 0, NULL);
-	err = hci_req_run(&hreq, read_local_amp_info_complete);
-	if (err < 0)
-		a2mp_send_getinfo_rsp(hdev);
-
-done:
-	if (hdev)
-		hci_dev_put(hdev);
-
-	skb_pull(skb, sizeof(*req));
-	return 0;
-}
-
-static int a2mp_getinfo_rsp(struct amp_mgr *mgr, struct sk_buff *skb,
-			    struct a2mp_cmd *hdr)
-{
-	struct a2mp_info_rsp *rsp = (struct a2mp_info_rsp *) skb->data;
-	struct a2mp_amp_assoc_req req;
-	struct amp_ctrl *ctrl;
-
-	if (le16_to_cpu(hdr->len) < sizeof(*rsp))
-		return -EINVAL;
-
-	BT_DBG("id %u status 0x%2.2x", rsp->id, rsp->status);
-
-	if (rsp->status)
-		return -EINVAL;
-
-	ctrl = amp_ctrl_add(mgr, rsp->id);
-	if (!ctrl)
-		return -ENOMEM;
-
-	memset(&req, 0, sizeof(req));
-
-	req.id = rsp->id;
-	a2mp_send(mgr, A2MP_GETAMPASSOC_REQ, __next_ident(mgr), sizeof(req),
-		  &req);
-
-	skb_pull(skb, sizeof(*rsp));
-	return 0;
-}
-
-static int a2mp_getampassoc_req(struct amp_mgr *mgr, struct sk_buff *skb,
-				struct a2mp_cmd *hdr)
-{
-	struct a2mp_amp_assoc_req *req = (void *) skb->data;
-	struct hci_dev *hdev;
-	struct amp_mgr *tmp;
-
-	if (le16_to_cpu(hdr->len) < sizeof(*req))
-		return -EINVAL;
-
-	BT_DBG("id %u", req->id);
-
-	/* Make sure that other request is not processed */
-	tmp = amp_mgr_lookup_by_state(READ_LOC_AMP_ASSOC);
-
-	hdev = hci_dev_get(req->id);
-	if (!hdev || hdev->amp_type == AMP_TYPE_BREDR || tmp) {
-		struct a2mp_amp_assoc_rsp rsp;
-
-		memset(&rsp, 0, sizeof(rsp));
-		rsp.id = req->id;
-
-		if (tmp) {
-			rsp.status = A2MP_STATUS_COLLISION_OCCURED;
-			amp_mgr_put(tmp);
-		} else {
-			rsp.status = A2MP_STATUS_INVALID_CTRL_ID;
-		}
-
-		a2mp_send(mgr, A2MP_GETAMPASSOC_RSP, hdr->ident, sizeof(rsp),
-			  &rsp);
-
-		goto done;
-	}
-
-	amp_read_loc_assoc(hdev, mgr);
-
-done:
-	if (hdev)
-		hci_dev_put(hdev);
-
-	skb_pull(skb, sizeof(*req));
-	return 0;
-}
-
-static int a2mp_getampassoc_rsp(struct amp_mgr *mgr, struct sk_buff *skb,
-				struct a2mp_cmd *hdr)
-{
-	struct a2mp_amp_assoc_rsp *rsp = (void *) skb->data;
-	u16 len = le16_to_cpu(hdr->len);
-	struct hci_dev *hdev;
-	struct amp_ctrl *ctrl;
-	struct hci_conn *hcon;
-	size_t assoc_len;
-
-	if (len < sizeof(*rsp))
-		return -EINVAL;
-
-	assoc_len = len - sizeof(*rsp);
-
-	BT_DBG("id %u status 0x%2.2x assoc len %zu", rsp->id, rsp->status,
-	       assoc_len);
-
-	if (rsp->status)
-		return -EINVAL;
-
-	/* Save remote ASSOC data */
-	ctrl = amp_ctrl_lookup(mgr, rsp->id);
-	if (ctrl) {
-		u8 *assoc;
-
-		assoc = kmemdup(rsp->amp_assoc, assoc_len, GFP_KERNEL);
-		if (!assoc) {
-			amp_ctrl_put(ctrl);
-			return -ENOMEM;
-		}
-
-		ctrl->assoc = assoc;
-		ctrl->assoc_len = assoc_len;
-		ctrl->assoc_rem_len = assoc_len;
-		ctrl->assoc_len_so_far = 0;
-
-		amp_ctrl_put(ctrl);
-	}
-
-	/* Create Phys Link */
-	hdev = hci_dev_get(rsp->id);
-	if (!hdev)
-		return -EINVAL;
-
-	hcon = phylink_add(hdev, mgr, rsp->id, true);
-	if (!hcon)
-		goto done;
-
-	BT_DBG("Created hcon %p: loc:%u -> rem:%u", hcon, hdev->id, rsp->id);
-
-	mgr->bredr_chan->remote_amp_id = rsp->id;
-
-	amp_create_phylink(hdev, mgr, hcon);
-
-done:
-	hci_dev_put(hdev);
-	skb_pull(skb, len);
-	return 0;
-}
-
-static int a2mp_createphyslink_req(struct amp_mgr *mgr, struct sk_buff *skb,
-				   struct a2mp_cmd *hdr)
-{
-	struct a2mp_physlink_req *req = (void *) skb->data;
-	struct a2mp_physlink_rsp rsp;
-	struct hci_dev *hdev;
-	struct hci_conn *hcon;
-	struct amp_ctrl *ctrl;
-
-	if (le16_to_cpu(hdr->len) < sizeof(*req))
-		return -EINVAL;
-
-	BT_DBG("local_id %u, remote_id %u", req->local_id, req->remote_id);
-
-	memset(&rsp, 0, sizeof(rsp));
-
-	rsp.local_id = req->remote_id;
-	rsp.remote_id = req->local_id;
-
-	hdev = hci_dev_get(req->remote_id);
-	if (!hdev || hdev->amp_type == AMP_TYPE_BREDR) {
-		rsp.status = A2MP_STATUS_INVALID_CTRL_ID;
-		goto send_rsp;
-	}
-
-	ctrl = amp_ctrl_lookup(mgr, rsp.remote_id);
-	if (!ctrl) {
-		ctrl = amp_ctrl_add(mgr, rsp.remote_id);
-		if (ctrl) {
-			amp_ctrl_get(ctrl);
-		} else {
-			rsp.status = A2MP_STATUS_UNABLE_START_LINK_CREATION;
-			goto send_rsp;
-		}
-	}
-
-	if (ctrl) {
-		size_t assoc_len = le16_to_cpu(hdr->len) - sizeof(*req);
-		u8 *assoc;
-
-		assoc = kmemdup(req->amp_assoc, assoc_len, GFP_KERNEL);
-		if (!assoc) {
-			amp_ctrl_put(ctrl);
-			hci_dev_put(hdev);
-			return -ENOMEM;
-		}
-
-		ctrl->assoc = assoc;
-		ctrl->assoc_len = assoc_len;
-		ctrl->assoc_rem_len = assoc_len;
-		ctrl->assoc_len_so_far = 0;
-
-		amp_ctrl_put(ctrl);
-	}
-
-	hcon = phylink_add(hdev, mgr, req->local_id, false);
-	if (hcon) {
-		amp_accept_phylink(hdev, mgr, hcon);
-		rsp.status = A2MP_STATUS_SUCCESS;
-	} else {
-		rsp.status = A2MP_STATUS_UNABLE_START_LINK_CREATION;
-	}
-
-send_rsp:
-	if (hdev)
-		hci_dev_put(hdev);
-
-	/* Reply error now and success after HCI Write Remote AMP Assoc
-	   command complete with success status
-	 */
-	if (rsp.status != A2MP_STATUS_SUCCESS) {
-		a2mp_send(mgr, A2MP_CREATEPHYSLINK_RSP, hdr->ident,
-			  sizeof(rsp), &rsp);
-	} else {
-		set_bit(WRITE_REMOTE_AMP_ASSOC, &mgr->state);
-		mgr->ident = hdr->ident;
-	}
-
-	skb_pull(skb, le16_to_cpu(hdr->len));
-	return 0;
-}
-
-static int a2mp_discphyslink_req(struct amp_mgr *mgr, struct sk_buff *skb,
-				 struct a2mp_cmd *hdr)
-{
-	struct a2mp_physlink_req *req = (void *) skb->data;
-	struct a2mp_physlink_rsp rsp;
-	struct hci_dev *hdev;
-	struct hci_conn *hcon;
-
-	if (le16_to_cpu(hdr->len) < sizeof(*req))
-		return -EINVAL;
-
-	BT_DBG("local_id %u remote_id %u", req->local_id, req->remote_id);
-
-	memset(&rsp, 0, sizeof(rsp));
-
-	rsp.local_id = req->remote_id;
-	rsp.remote_id = req->local_id;
-	rsp.status = A2MP_STATUS_SUCCESS;
-
-	hdev = hci_dev_get(req->remote_id);
-	if (!hdev) {
-		rsp.status = A2MP_STATUS_INVALID_CTRL_ID;
-		goto send_rsp;
-	}
-
-	hcon = hci_conn_hash_lookup_ba(hdev, AMP_LINK,
-				       &mgr->l2cap_conn->hcon->dst);
-	if (!hcon) {
-		bt_dev_err(hdev, "no phys link exist");
-		rsp.status = A2MP_STATUS_NO_PHYSICAL_LINK_EXISTS;
-		goto clean;
-	}
-
-	/* TODO Disconnect Phys Link here */
-
-clean:
-	hci_dev_put(hdev);
-
-send_rsp:
-	a2mp_send(mgr, A2MP_DISCONNPHYSLINK_RSP, hdr->ident, sizeof(rsp), &rsp);
-
-	skb_pull(skb, sizeof(*req));
-	return 0;
-}
-
-static inline int a2mp_cmd_rsp(struct amp_mgr *mgr, struct sk_buff *skb,
-			       struct a2mp_cmd *hdr)
-{
-	BT_DBG("ident %u code 0x%2.2x", hdr->ident, hdr->code);
-
-	skb_pull(skb, le16_to_cpu(hdr->len));
-	return 0;
-}
-
-/* Handle A2MP signalling */
-static int a2mp_chan_recv_cb(struct l2cap_chan *chan, struct sk_buff *skb)
-{
-	struct a2mp_cmd *hdr;
-	struct amp_mgr *mgr = chan->data;
-	int err = 0;
-
-	amp_mgr_get(mgr);
-
-	while (skb->len >= sizeof(*hdr)) {
-		u16 len;
-
-		hdr = (void *) skb->data;
-		len = le16_to_cpu(hdr->len);
-
-		BT_DBG("code 0x%2.2x id %u len %u", hdr->code, hdr->ident, len);
-
-		skb_pull(skb, sizeof(*hdr));
-
-		if (len > skb->len || !hdr->ident) {
-			err = -EINVAL;
-			break;
-		}
-
-		mgr->ident = hdr->ident;
-
-		switch (hdr->code) {
-		case A2MP_COMMAND_REJ:
-			a2mp_command_rej(mgr, skb, hdr);
-			break;
-
-		case A2MP_DISCOVER_REQ:
-			err = a2mp_discover_req(mgr, skb, hdr);
-			break;
-
-		case A2MP_CHANGE_NOTIFY:
-			err = a2mp_change_notify(mgr, skb, hdr);
-			break;
-
-		case A2MP_GETINFO_REQ:
-			err = a2mp_getinfo_req(mgr, skb, hdr);
-			break;
-
-		case A2MP_GETAMPASSOC_REQ:
-			err = a2mp_getampassoc_req(mgr, skb, hdr);
-			break;
-
-		case A2MP_CREATEPHYSLINK_REQ:
-			err = a2mp_createphyslink_req(mgr, skb, hdr);
-			break;
-
-		case A2MP_DISCONNPHYSLINK_REQ:
-			err = a2mp_discphyslink_req(mgr, skb, hdr);
-			break;
-
-		case A2MP_DISCOVER_RSP:
-			err = a2mp_discover_rsp(mgr, skb, hdr);
-			break;
-
-		case A2MP_GETINFO_RSP:
-			err = a2mp_getinfo_rsp(mgr, skb, hdr);
-			break;
-
-		case A2MP_GETAMPASSOC_RSP:
-			err = a2mp_getampassoc_rsp(mgr, skb, hdr);
-			break;
-
-		case A2MP_CHANGE_RSP:
-		case A2MP_CREATEPHYSLINK_RSP:
-		case A2MP_DISCONNPHYSLINK_RSP:
-			err = a2mp_cmd_rsp(mgr, skb, hdr);
-			break;
-
-		default:
-			BT_ERR("Unknown A2MP sig cmd 0x%2.2x", hdr->code);
-			err = -EINVAL;
-			break;
-		}
-	}
-
-	if (err) {
-		struct a2mp_cmd_rej rej;
-
-		memset(&rej, 0, sizeof(rej));
-
-		rej.reason = cpu_to_le16(0);
-		hdr = (void *) skb->data;
-
-		BT_DBG("Send A2MP Rej: cmd 0x%2.2x err %d", hdr->code, err);
-
-		a2mp_send(mgr, A2MP_COMMAND_REJ, hdr->ident, sizeof(rej),
-			  &rej);
-	}
-
-	/* Always free skb and return success error code to prevent
-	   from sending L2CAP Disconnect over A2MP channel */
-	kfree_skb(skb);
-
-	amp_mgr_put(mgr);
-
-	return 0;
-}
-
-static void a2mp_chan_close_cb(struct l2cap_chan *chan)
-{
-	l2cap_chan_put(chan);
-}
-
-static void a2mp_chan_state_change_cb(struct l2cap_chan *chan, int state,
-				      int err)
-{
-	struct amp_mgr *mgr = chan->data;
-
-	if (!mgr)
-		return;
-
-	BT_DBG("chan %p state %s", chan, state_to_string(state));
-
-	chan->state = state;
-
-	switch (state) {
-	case BT_CLOSED:
-		if (mgr)
-			amp_mgr_put(mgr);
-		break;
-	}
-}
-
-static struct sk_buff *a2mp_chan_alloc_skb_cb(struct l2cap_chan *chan,
-					      unsigned long hdr_len,
-					      unsigned long len, int nb)
-{
-	struct sk_buff *skb;
-
-	skb = bt_skb_alloc(hdr_len + len, GFP_KERNEL);
-	if (!skb)
-		return ERR_PTR(-ENOMEM);
-
-	return skb;
-}
-
-static const struct l2cap_ops a2mp_chan_ops = {
-	.name = "L2CAP A2MP channel",
-	.recv = a2mp_chan_recv_cb,
-	.close = a2mp_chan_close_cb,
-	.state_change = a2mp_chan_state_change_cb,
-	.alloc_skb = a2mp_chan_alloc_skb_cb,
-
-	/* Not implemented for A2MP */
-	.new_connection = l2cap_chan_no_new_connection,
-	.teardown = l2cap_chan_no_teardown,
-	.ready = l2cap_chan_no_ready,
-	.defer = l2cap_chan_no_defer,
-	.resume = l2cap_chan_no_resume,
-	.set_shutdown = l2cap_chan_no_set_shutdown,
-	.get_sndtimeo = l2cap_chan_no_get_sndtimeo,
-};
-
-static struct l2cap_chan *a2mp_chan_open(struct l2cap_conn *conn, bool locked)
-{
-	struct l2cap_chan *chan;
-	int err;
-
-	chan = l2cap_chan_create();
-	if (!chan)
-		return NULL;
-
-	BT_DBG("chan %p", chan);
-
-	chan->chan_type = L2CAP_CHAN_FIXED;
-	chan->scid = L2CAP_CID_A2MP;
-	chan->dcid = L2CAP_CID_A2MP;
-	chan->omtu = L2CAP_A2MP_DEFAULT_MTU;
-	chan->imtu = L2CAP_A2MP_DEFAULT_MTU;
-	chan->flush_to = L2CAP_DEFAULT_FLUSH_TO;
-
-	chan->ops = &a2mp_chan_ops;
-
-	l2cap_chan_set_defaults(chan);
-	chan->remote_max_tx = chan->max_tx;
-	chan->remote_tx_win = chan->tx_win;
-
-	chan->retrans_timeout = L2CAP_DEFAULT_RETRANS_TO;
-	chan->monitor_timeout = L2CAP_DEFAULT_MONITOR_TO;
-
-	skb_queue_head_init(&chan->tx_q);
-
-	chan->mode = L2CAP_MODE_ERTM;
-
-	err = l2cap_ertm_init(chan);
-	if (err < 0) {
-		l2cap_chan_del(chan, 0);
-		return NULL;
-	}
-
-	chan->conf_state = 0;
-
-	if (locked)
-		__l2cap_chan_add(conn, chan);
-	else
-		l2cap_chan_add(conn, chan);
-
-	chan->remote_mps = chan->omtu;
-	chan->mps = chan->omtu;
-
-	chan->state = BT_CONNECTED;
-
-	return chan;
-}
-
-/* AMP Manager functions */
-struct amp_mgr *amp_mgr_get(struct amp_mgr *mgr)
-{
-	BT_DBG("mgr %p orig refcnt %d", mgr, kref_read(&mgr->kref));
-
-	kref_get(&mgr->kref);
-
-	return mgr;
-}
-
-static void amp_mgr_destroy(struct kref *kref)
-{
-	struct amp_mgr *mgr = container_of(kref, struct amp_mgr, kref);
-
-	BT_DBG("mgr %p", mgr);
-
-	mutex_lock(&amp_mgr_list_lock);
-	list_del(&mgr->list);
-	mutex_unlock(&amp_mgr_list_lock);
-
-	amp_ctrl_list_flush(mgr);
-	kfree(mgr);
-}
-
-int amp_mgr_put(struct amp_mgr *mgr)
-{
-	BT_DBG("mgr %p orig refcnt %d", mgr, kref_read(&mgr->kref));
-
-	return kref_put(&mgr->kref, &amp_mgr_destroy);
-}
-
-static struct amp_mgr *amp_mgr_create(struct l2cap_conn *conn, bool locked)
-{
-	struct amp_mgr *mgr;
-	struct l2cap_chan *chan;
-
-	mgr = kzalloc(sizeof(*mgr), GFP_KERNEL);
-	if (!mgr)
-		return NULL;
-
-	BT_DBG("conn %p mgr %p", conn, mgr);
-
-	mgr->l2cap_conn = conn;
-
-	chan = a2mp_chan_open(conn, locked);
-	if (!chan) {
-		kfree(mgr);
-		return NULL;
-	}
-
-	mgr->a2mp_chan = chan;
-	chan->data = mgr;
-
-	conn->hcon->amp_mgr = mgr;
-
-	kref_init(&mgr->kref);
-
-	/* Remote AMP ctrl list initialization */
-	INIT_LIST_HEAD(&mgr->amp_ctrls);
-	mutex_init(&mgr->amp_ctrls_lock);
-
-	mutex_lock(&amp_mgr_list_lock);
-	list_add(&mgr->list, &amp_mgr_list);
-	mutex_unlock(&amp_mgr_list_lock);
-
-	return mgr;
-}
-
-struct l2cap_chan *a2mp_channel_create(struct l2cap_conn *conn,
-				       struct sk_buff *skb)
-{
-	struct amp_mgr *mgr;
-
-	if (conn->hcon->type != ACL_LINK)
-		return NULL;
-
-	mgr = amp_mgr_create(conn, false);
-	if (!mgr) {
-		BT_ERR("Could not create AMP manager");
-		return NULL;
-	}
-
-	BT_DBG("mgr: %p chan %p", mgr, mgr->a2mp_chan);
-
-	return mgr->a2mp_chan;
-}
-
-void a2mp_send_getinfo_rsp(struct hci_dev *hdev)
-{
-	struct amp_mgr *mgr;
-	struct a2mp_info_rsp rsp;
-
-	mgr = amp_mgr_lookup_by_state(READ_LOC_AMP_INFO);
-	if (!mgr)
-		return;
-
-	BT_DBG("%s mgr %p", hdev->name, mgr);
-
-	memset(&rsp, 0, sizeof(rsp));
-
-	rsp.id = hdev->id;
-	rsp.status = A2MP_STATUS_INVALID_CTRL_ID;
-
-	if (hdev->amp_type != AMP_TYPE_BREDR) {
-		rsp.status = 0;
-		rsp.total_bw = cpu_to_le32(hdev->amp_total_bw);
-		rsp.max_bw = cpu_to_le32(hdev->amp_max_bw);
-		rsp.min_latency = cpu_to_le32(hdev->amp_min_latency);
-		rsp.pal_cap = cpu_to_le16(hdev->amp_pal_cap);
-		rsp.assoc_size = cpu_to_le16(hdev->amp_assoc_size);
-	}
-
-	a2mp_send(mgr, A2MP_GETINFO_RSP, mgr->ident, sizeof(rsp), &rsp);
-	amp_mgr_put(mgr);
-}
-
-void a2mp_send_getampassoc_rsp(struct hci_dev *hdev, u8 status)
-{
-	struct amp_mgr *mgr;
-	struct amp_assoc *loc_assoc = &hdev->loc_assoc;
-	struct a2mp_amp_assoc_rsp *rsp;
-	size_t len;
-
-	mgr = amp_mgr_lookup_by_state(READ_LOC_AMP_ASSOC);
-	if (!mgr)
-		return;
-
-	BT_DBG("%s mgr %p", hdev->name, mgr);
-
-	len = sizeof(struct a2mp_amp_assoc_rsp) + loc_assoc->len;
-	rsp = kzalloc(len, GFP_KERNEL);
-	if (!rsp) {
-		amp_mgr_put(mgr);
-		return;
-	}
-
-	rsp->id = hdev->id;
-
-	if (status) {
-		rsp->status = A2MP_STATUS_INVALID_CTRL_ID;
-	} else {
-		rsp->status = A2MP_STATUS_SUCCESS;
-		memcpy(rsp->amp_assoc, loc_assoc->data, loc_assoc->len);
-	}
-
-	a2mp_send(mgr, A2MP_GETAMPASSOC_RSP, mgr->ident, len, rsp);
-	amp_mgr_put(mgr);
-	kfree(rsp);
-}
-
-void a2mp_send_create_phy_link_req(struct hci_dev *hdev, u8 status)
-{
-	struct amp_mgr *mgr;
-	struct amp_assoc *loc_assoc = &hdev->loc_assoc;
-	struct a2mp_physlink_req *req;
-	struct l2cap_chan *bredr_chan;
-	size_t len;
-
-	mgr = amp_mgr_lookup_by_state(READ_LOC_AMP_ASSOC_FINAL);
-	if (!mgr)
-		return;
-
-	len = sizeof(*req) + loc_assoc->len;
-
-	BT_DBG("%s mgr %p assoc_len %zu", hdev->name, mgr, len);
-
-	req = kzalloc(len, GFP_KERNEL);
-	if (!req) {
-		amp_mgr_put(mgr);
-		return;
-	}
-
-	bredr_chan = mgr->bredr_chan;
-	if (!bredr_chan)
-		goto clean;
-
-	req->local_id = hdev->id;
-	req->remote_id = bredr_chan->remote_amp_id;
-	memcpy(req->amp_assoc, loc_assoc->data, loc_assoc->len);
-
-	a2mp_send(mgr, A2MP_CREATEPHYSLINK_REQ, __next_ident(mgr), len, req);
-
-clean:
-	amp_mgr_put(mgr);
-	kfree(req);
-}
-
-void a2mp_send_create_phy_link_rsp(struct hci_dev *hdev, u8 status)
-{
-	struct amp_mgr *mgr;
-	struct a2mp_physlink_rsp rsp;
-	struct hci_conn *hs_hcon;
-
-	mgr = amp_mgr_lookup_by_state(WRITE_REMOTE_AMP_ASSOC);
-	if (!mgr)
-		return;
-
-	memset(&rsp, 0, sizeof(rsp));
-
-	hs_hcon = hci_conn_hash_lookup_state(hdev, AMP_LINK, BT_CONNECT);
-	if (!hs_hcon) {
-		rsp.status = A2MP_STATUS_UNABLE_START_LINK_CREATION;
-	} else {
-		rsp.remote_id = hs_hcon->remote_id;
-		rsp.status = A2MP_STATUS_SUCCESS;
-	}
-
-	BT_DBG("%s mgr %p hs_hcon %p status %u", hdev->name, mgr, hs_hcon,
-	       status);
-
-	rsp.local_id = hdev->id;
-	a2mp_send(mgr, A2MP_CREATEPHYSLINK_RSP, mgr->ident, sizeof(rsp), &rsp);
-	amp_mgr_put(mgr);
-}
-
-void a2mp_discover_amp(struct l2cap_chan *chan)
-{
-	struct l2cap_conn *conn = chan->conn;
-	struct amp_mgr *mgr = conn->hcon->amp_mgr;
-	struct a2mp_discov_req req;
-
-	BT_DBG("chan %p conn %p mgr %p", chan, conn, mgr);
-
-	if (!mgr) {
-		mgr = amp_mgr_create(conn, true);
-		if (!mgr)
-			return;
-	}
-
-	mgr->bredr_chan = chan;
-
-	memset(&req, 0, sizeof(req));
-
-	req.mtu = cpu_to_le16(L2CAP_A2MP_DEFAULT_MTU);
-	req.ext_feat = 0;
-	a2mp_send(mgr, A2MP_DISCOVER_REQ, 1, sizeof(req), &req);
-}
diff --git a/net/bluetooth/a2mp.h b/net/bluetooth/a2mp.h
deleted file mode 100644
index 2fd253a61a2a..000000000000
--- a/net/bluetooth/a2mp.h
+++ /dev/null
@@ -1,154 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
-   Copyright (c) 2010,2011 Code Aurora Forum.  All rights reserved.
-   Copyright (c) 2011,2012 Intel Corp.
-
-*/
-
-#ifndef __A2MP_H
-#define __A2MP_H
-
-#include <net/bluetooth/l2cap.h>
-
-enum amp_mgr_state {
-	READ_LOC_AMP_INFO,
-	READ_LOC_AMP_ASSOC,
-	READ_LOC_AMP_ASSOC_FINAL,
-	WRITE_REMOTE_AMP_ASSOC,
-};
-
-struct amp_mgr {
-	struct list_head	list;
-	struct l2cap_conn	*l2cap_conn;
-	struct l2cap_chan	*a2mp_chan;
-	struct l2cap_chan	*bredr_chan;
-	struct kref		kref;
-	__u8			ident;
-	__u8			handle;
-	unsigned long		state;
-	unsigned long		flags;
-
-	struct list_head	amp_ctrls;
-	struct mutex		amp_ctrls_lock;
-};
-
-struct a2mp_cmd {
-	__u8	code;
-	__u8	ident;
-	__le16	len;
-	__u8	data[];
-} __packed;
-
-/* A2MP command codes */
-#define A2MP_COMMAND_REJ         0x01
-struct a2mp_cmd_rej {
-	__le16	reason;
-	__u8	data[];
-} __packed;
-
-#define A2MP_DISCOVER_REQ        0x02
-struct a2mp_discov_req {
-	__le16	mtu;
-	__le16	ext_feat;
-} __packed;
-
-struct a2mp_cl {
-	__u8	id;
-	__u8	type;
-	__u8	status;
-} __packed;
-
-#define A2MP_DISCOVER_RSP        0x03
-struct a2mp_discov_rsp {
-	__le16     mtu;
-	__le16     ext_feat;
-	struct a2mp_cl cl[];
-} __packed;
-
-#define A2MP_CHANGE_NOTIFY       0x04
-#define A2MP_CHANGE_RSP          0x05
-
-#define A2MP_GETINFO_REQ         0x06
-struct a2mp_info_req {
-	__u8       id;
-} __packed;
-
-#define A2MP_GETINFO_RSP         0x07
-struct a2mp_info_rsp {
-	__u8	id;
-	__u8	status;
-	__le32	total_bw;
-	__le32	max_bw;
-	__le32	min_latency;
-	__le16	pal_cap;
-	__le16	assoc_size;
-} __packed;
-
-#define A2MP_GETAMPASSOC_REQ     0x08
-struct a2mp_amp_assoc_req {
-	__u8	id;
-} __packed;
-
-#define A2MP_GETAMPASSOC_RSP     0x09
-struct a2mp_amp_assoc_rsp {
-	__u8	id;
-	__u8	status;
-	__u8	amp_assoc[];
-} __packed;
-
-#define A2MP_CREATEPHYSLINK_REQ  0x0A
-#define A2MP_DISCONNPHYSLINK_REQ 0x0C
-struct a2mp_physlink_req {
-	__u8	local_id;
-	__u8	remote_id;
-	__u8	amp_assoc[];
-} __packed;
-
-#define A2MP_CREATEPHYSLINK_RSP  0x0B
-#define A2MP_DISCONNPHYSLINK_RSP 0x0D
-struct a2mp_physlink_rsp {
-	__u8	local_id;
-	__u8	remote_id;
-	__u8	status;
-} __packed;
-
-/* A2MP response status */
-#define A2MP_STATUS_SUCCESS			0x00
-#define A2MP_STATUS_INVALID_CTRL_ID		0x01
-#define A2MP_STATUS_UNABLE_START_LINK_CREATION	0x02
-#define A2MP_STATUS_NO_PHYSICAL_LINK_EXISTS	0x02
-#define A2MP_STATUS_COLLISION_OCCURED		0x03
-#define A2MP_STATUS_DISCONN_REQ_RECVD		0x04
-#define A2MP_STATUS_PHYS_LINK_EXISTS		0x05
-#define A2MP_STATUS_SECURITY_VIOLATION		0x06
-
-struct amp_mgr *amp_mgr_get(struct amp_mgr *mgr);
-
-#if IS_ENABLED(CONFIG_BT_HS)
-int amp_mgr_put(struct amp_mgr *mgr);
-struct l2cap_chan *a2mp_channel_create(struct l2cap_conn *conn,
-				       struct sk_buff *skb);
-void a2mp_discover_amp(struct l2cap_chan *chan);
-#else
-static inline int amp_mgr_put(struct amp_mgr *mgr)
-{
-	return 0;
-}
-
-static inline struct l2cap_chan *a2mp_channel_create(struct l2cap_conn *conn,
-						     struct sk_buff *skb)
-{
-	return NULL;
-}
-
-static inline void a2mp_discover_amp(struct l2cap_chan *chan)
-{
-}
-#endif
-
-void a2mp_send_getinfo_rsp(struct hci_dev *hdev);
-void a2mp_send_getampassoc_rsp(struct hci_dev *hdev, u8 status);
-void a2mp_send_create_phy_link_req(struct hci_dev *hdev, u8 status);
-void a2mp_send_create_phy_link_rsp(struct hci_dev *hdev, u8 status);
-
-#endif /* __A2MP_H */
diff --git a/net/bluetooth/amp.c b/net/bluetooth/amp.c
deleted file mode 100644
index 5d698f19868c..000000000000
--- a/net/bluetooth/amp.c
+++ /dev/null
@@ -1,590 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
-   Copyright (c) 2011,2012 Intel Corp.
-
-*/
-
-#include <net/bluetooth/bluetooth.h>
-#include <net/bluetooth/hci.h>
-#include <net/bluetooth/hci_core.h>
-#include <crypto/hash.h>
-
-#include "hci_request.h"
-#include "a2mp.h"
-#include "amp.h"
-
-/* Remote AMP Controllers interface */
-void amp_ctrl_get(struct amp_ctrl *ctrl)
-{
-	BT_DBG("ctrl %p orig refcnt %d", ctrl,
-	       kref_read(&ctrl->kref));
-
-	kref_get(&ctrl->kref);
-}
-
-static void amp_ctrl_destroy(struct kref *kref)
-{
-	struct amp_ctrl *ctrl = container_of(kref, struct amp_ctrl, kref);
-
-	BT_DBG("ctrl %p", ctrl);
-
-	kfree(ctrl->assoc);
-	kfree(ctrl);
-}
-
-int amp_ctrl_put(struct amp_ctrl *ctrl)
-{
-	BT_DBG("ctrl %p orig refcnt %d", ctrl,
-	       kref_read(&ctrl->kref));
-
-	return kref_put(&ctrl->kref, &amp_ctrl_destroy);
-}
-
-struct amp_ctrl *amp_ctrl_add(struct amp_mgr *mgr, u8 id)
-{
-	struct amp_ctrl *ctrl;
-
-	ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
-	if (!ctrl)
-		return NULL;
-
-	kref_init(&ctrl->kref);
-	ctrl->id = id;
-
-	mutex_lock(&mgr->amp_ctrls_lock);
-	list_add(&ctrl->list, &mgr->amp_ctrls);
-	mutex_unlock(&mgr->amp_ctrls_lock);
-
-	BT_DBG("mgr %p ctrl %p", mgr, ctrl);
-
-	return ctrl;
-}
-
-void amp_ctrl_list_flush(struct amp_mgr *mgr)
-{
-	struct amp_ctrl *ctrl, *n;
-
-	BT_DBG("mgr %p", mgr);
-
-	mutex_lock(&mgr->amp_ctrls_lock);
-	list_for_each_entry_safe(ctrl, n, &mgr->amp_ctrls, list) {
-		list_del(&ctrl->list);
-		amp_ctrl_put(ctrl);
-	}
-	mutex_unlock(&mgr->amp_ctrls_lock);
-}
-
-struct amp_ctrl *amp_ctrl_lookup(struct amp_mgr *mgr, u8 id)
-{
-	struct amp_ctrl *ctrl;
-
-	BT_DBG("mgr %p id %u", mgr, id);
-
-	mutex_lock(&mgr->amp_ctrls_lock);
-	list_for_each_entry(ctrl, &mgr->amp_ctrls, list) {
-		if (ctrl->id == id) {
-			amp_ctrl_get(ctrl);
-			mutex_unlock(&mgr->amp_ctrls_lock);
-			return ctrl;
-		}
-	}
-	mutex_unlock(&mgr->amp_ctrls_lock);
-
-	return NULL;
-}
-
-/* Physical Link interface */
-static u8 __next_handle(struct amp_mgr *mgr)
-{
-	if (++mgr->handle == 0)
-		mgr->handle = 1;
-
-	return mgr->handle;
-}
-
-struct hci_conn *phylink_add(struct hci_dev *hdev, struct amp_mgr *mgr,
-			     u8 remote_id, bool out)
-{
-	bdaddr_t *dst = &mgr->l2cap_conn->hcon->dst;
-	struct hci_conn *hcon;
-	u8 role = out ? HCI_ROLE_MASTER : HCI_ROLE_SLAVE;
-
-	hcon = hci_conn_add(hdev, AMP_LINK, dst, role, __next_handle(mgr));
-	if (!hcon)
-		return NULL;
-
-	BT_DBG("hcon %p dst %pMR", hcon, dst);
-
-	hcon->state = BT_CONNECT;
-	hcon->attempt++;
-	hcon->remote_id = remote_id;
-	hcon->amp_mgr = amp_mgr_get(mgr);
-
-	return hcon;
-}
-
-/* AMP crypto key generation interface */
-static int hmac_sha256(u8 *key, u8 ksize, char *plaintext, u8 psize, u8 *output)
-{
-	struct crypto_shash *tfm;
-	struct shash_desc *shash;
-	int ret;
-
-	if (!ksize)
-		return -EINVAL;
-
-	tfm = crypto_alloc_shash("hmac(sha256)", 0, 0);
-	if (IS_ERR(tfm)) {
-		BT_DBG("crypto_alloc_ahash failed: err %ld", PTR_ERR(tfm));
-		return PTR_ERR(tfm);
-	}
-
-	ret = crypto_shash_setkey(tfm, key, ksize);
-	if (ret) {
-		BT_DBG("crypto_ahash_setkey failed: err %d", ret);
-		goto failed;
-	}
-
-	shash = kzalloc(sizeof(*shash) + crypto_shash_descsize(tfm),
-			GFP_KERNEL);
-	if (!shash) {
-		ret = -ENOMEM;
-		goto failed;
-	}
-
-	shash->tfm = tfm;
-
-	ret = crypto_shash_digest(shash, plaintext, psize, output);
-
-	kfree(shash);
-
-failed:
-	crypto_free_shash(tfm);
-	return ret;
-}
-
-int phylink_gen_key(struct hci_conn *conn, u8 *data, u8 *len, u8 *type)
-{
-	struct hci_dev *hdev = conn->hdev;
-	struct link_key *key;
-	u8 keybuf[HCI_AMP_LINK_KEY_SIZE];
-	u8 gamp_key[HCI_AMP_LINK_KEY_SIZE];
-	int err;
-
-	if (!hci_conn_check_link_mode(conn))
-		return -EACCES;
-
-	BT_DBG("conn %p key_type %d", conn, conn->key_type);
-
-	/* Legacy key */
-	if (conn->key_type < 3) {
-		bt_dev_err(hdev, "legacy key type %u", conn->key_type);
-		return -EACCES;
-	}
-
-	*type = conn->key_type;
-	*len = HCI_AMP_LINK_KEY_SIZE;
-
-	key = hci_find_link_key(hdev, &conn->dst);
-	if (!key) {
-		BT_DBG("No Link key for conn %p dst %pMR", conn, &conn->dst);
-		return -EACCES;
-	}
-
-	/* BR/EDR Link Key concatenated together with itself */
-	memcpy(&keybuf[0], key->val, HCI_LINK_KEY_SIZE);
-	memcpy(&keybuf[HCI_LINK_KEY_SIZE], key->val, HCI_LINK_KEY_SIZE);
-
-	/* Derive Generic AMP Link Key (gamp) */
-	err = hmac_sha256(keybuf, HCI_AMP_LINK_KEY_SIZE, "gamp", 4, gamp_key);
-	if (err) {
-		bt_dev_err(hdev, "could not derive Generic AMP Key: err %d", err);
-		return err;
-	}
-
-	if (conn->key_type == HCI_LK_DEBUG_COMBINATION) {
-		BT_DBG("Use Generic AMP Key (gamp)");
-		memcpy(data, gamp_key, HCI_AMP_LINK_KEY_SIZE);
-		return err;
-	}
-
-	/* Derive Dedicated AMP Link Key: "802b" is 802.11 PAL keyID */
-	return hmac_sha256(gamp_key, HCI_AMP_LINK_KEY_SIZE, "802b", 4, data);
-}
-
-static void read_local_amp_assoc_complete(struct hci_dev *hdev, u8 status,
-					  u16 opcode, struct sk_buff *skb)
-{
-	struct hci_rp_read_local_amp_assoc *rp = (void *)skb->data;
-	struct amp_assoc *assoc = &hdev->loc_assoc;
-	size_t rem_len, frag_len;
-
-	BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
-
-	if (rp->status)
-		goto send_rsp;
-
-	frag_len = skb->len - sizeof(*rp);
-	rem_len = __le16_to_cpu(rp->rem_len);
-
-	if (rem_len > frag_len) {
-		BT_DBG("frag_len %zu rem_len %zu", frag_len, rem_len);
-
-		memcpy(assoc->data + assoc->offset, rp->frag, frag_len);
-		assoc->offset += frag_len;
-
-		/* Read other fragments */
-		amp_read_loc_assoc_frag(hdev, rp->phy_handle);
-
-		return;
-	}
-
-	memcpy(assoc->data + assoc->offset, rp->frag, rem_len);
-	assoc->len = assoc->offset + rem_len;
-	assoc->offset = 0;
-
-send_rsp:
-	/* Send A2MP Rsp when all fragments are received */
-	a2mp_send_getampassoc_rsp(hdev, rp->status);
-	a2mp_send_create_phy_link_req(hdev, rp->status);
-}
-
-void amp_read_loc_assoc_frag(struct hci_dev *hdev, u8 phy_handle)
-{
-	struct hci_cp_read_local_amp_assoc cp;
-	struct amp_assoc *loc_assoc = &hdev->loc_assoc;
-	struct hci_request req;
-	int err;
-
-	BT_DBG("%s handle %u", hdev->name, phy_handle);
-
-	cp.phy_handle = phy_handle;
-	cp.max_len = cpu_to_le16(hdev->amp_assoc_size);
-	cp.len_so_far = cpu_to_le16(loc_assoc->offset);
-
-	hci_req_init(&req, hdev);
-	hci_req_add(&req, HCI_OP_READ_LOCAL_AMP_ASSOC, sizeof(cp), &cp);
-	err = hci_req_run_skb(&req, read_local_amp_assoc_complete);
-	if (err < 0)
-		a2mp_send_getampassoc_rsp(hdev, A2MP_STATUS_INVALID_CTRL_ID);
-}
-
-void amp_read_loc_assoc(struct hci_dev *hdev, struct amp_mgr *mgr)
-{
-	struct hci_cp_read_local_amp_assoc cp;
-	struct hci_request req;
-	int err;
-
-	memset(&hdev->loc_assoc, 0, sizeof(struct amp_assoc));
-	memset(&cp, 0, sizeof(cp));
-
-	cp.max_len = cpu_to_le16(hdev->amp_assoc_size);
-
-	set_bit(READ_LOC_AMP_ASSOC, &mgr->state);
-	hci_req_init(&req, hdev);
-	hci_req_add(&req, HCI_OP_READ_LOCAL_AMP_ASSOC, sizeof(cp), &cp);
-	err = hci_req_run_skb(&req, read_local_amp_assoc_complete);
-	if (err < 0)
-		a2mp_send_getampassoc_rsp(hdev, A2MP_STATUS_INVALID_CTRL_ID);
-}
-
-void amp_read_loc_assoc_final_data(struct hci_dev *hdev,
-				   struct hci_conn *hcon)
-{
-	struct hci_cp_read_local_amp_assoc cp;
-	struct amp_mgr *mgr = hcon->amp_mgr;
-	struct hci_request req;
-	int err;
-
-	if (!mgr)
-		return;
-
-	cp.phy_handle = hcon->handle;
-	cp.len_so_far = cpu_to_le16(0);
-	cp.max_len = cpu_to_le16(hdev->amp_assoc_size);
-
-	set_bit(READ_LOC_AMP_ASSOC_FINAL, &mgr->state);
-
-	/* Read Local AMP Assoc final link information data */
-	hci_req_init(&req, hdev);
-	hci_req_add(&req, HCI_OP_READ_LOCAL_AMP_ASSOC, sizeof(cp), &cp);
-	err = hci_req_run_skb(&req, read_local_amp_assoc_complete);
-	if (err < 0)
-		a2mp_send_getampassoc_rsp(hdev, A2MP_STATUS_INVALID_CTRL_ID);
-}
-
-static void write_remote_amp_assoc_complete(struct hci_dev *hdev, u8 status,
-					    u16 opcode, struct sk_buff *skb)
-{
-	struct hci_rp_write_remote_amp_assoc *rp = (void *)skb->data;
-
-	BT_DBG("%s status 0x%2.2x phy_handle 0x%2.2x",
-	       hdev->name, rp->status, rp->phy_handle);
-
-	if (rp->status)
-		return;
-
-	amp_write_rem_assoc_continue(hdev, rp->phy_handle);
-}
-
-/* Write AMP Assoc data fragments, returns true with last fragment written*/
-static bool amp_write_rem_assoc_frag(struct hci_dev *hdev,
-				     struct hci_conn *hcon)
-{
-	struct hci_cp_write_remote_amp_assoc *cp;
-	struct amp_mgr *mgr = hcon->amp_mgr;
-	struct amp_ctrl *ctrl;
-	struct hci_request req;
-	u16 frag_len, len;
-
-	ctrl = amp_ctrl_lookup(mgr, hcon->remote_id);
-	if (!ctrl)
-		return false;
-
-	if (!ctrl->assoc_rem_len) {
-		BT_DBG("all fragments are written");
-		ctrl->assoc_rem_len = ctrl->assoc_len;
-		ctrl->assoc_len_so_far = 0;
-
-		amp_ctrl_put(ctrl);
-		return true;
-	}
-
-	frag_len = min_t(u16, 248, ctrl->assoc_rem_len);
-	len = frag_len + sizeof(*cp);
-
-	cp = kzalloc(len, GFP_KERNEL);
-	if (!cp) {
-		amp_ctrl_put(ctrl);
-		return false;
-	}
-
-	BT_DBG("hcon %p ctrl %p frag_len %u assoc_len %u rem_len %u",
-	       hcon, ctrl, frag_len, ctrl->assoc_len, ctrl->assoc_rem_len);
-
-	cp->phy_handle = hcon->handle;
-	cp->len_so_far = cpu_to_le16(ctrl->assoc_len_so_far);
-	cp->rem_len = cpu_to_le16(ctrl->assoc_rem_len);
-	memcpy(cp->frag, ctrl->assoc, frag_len);
-
-	ctrl->assoc_len_so_far += frag_len;
-	ctrl->assoc_rem_len -= frag_len;
-
-	amp_ctrl_put(ctrl);
-
-	hci_req_init(&req, hdev);
-	hci_req_add(&req, HCI_OP_WRITE_REMOTE_AMP_ASSOC, len, cp);
-	hci_req_run_skb(&req, write_remote_amp_assoc_complete);
-
-	kfree(cp);
-
-	return false;
-}
-
-void amp_write_rem_assoc_continue(struct hci_dev *hdev, u8 handle)
-{
-	struct hci_conn *hcon;
-
-	BT_DBG("%s phy handle 0x%2.2x", hdev->name, handle);
-
-	hcon = hci_conn_hash_lookup_handle(hdev, handle);
-	if (!hcon)
-		return;
-
-	/* Send A2MP create phylink rsp when all fragments are written */
-	if (amp_write_rem_assoc_frag(hdev, hcon))
-		a2mp_send_create_phy_link_rsp(hdev, 0);
-}
-
-void amp_write_remote_assoc(struct hci_dev *hdev, u8 handle)
-{
-	struct hci_conn *hcon;
-
-	BT_DBG("%s phy handle 0x%2.2x", hdev->name, handle);
-
-	hcon = hci_conn_hash_lookup_handle(hdev, handle);
-	if (!hcon)
-		return;
-
-	BT_DBG("%s phy handle 0x%2.2x hcon %p", hdev->name, handle, hcon);
-
-	amp_write_rem_assoc_frag(hdev, hcon);
-}
-
-static void create_phylink_complete(struct hci_dev *hdev, u8 status,
-				    u16 opcode)
-{
-	struct hci_cp_create_phy_link *cp;
-
-	BT_DBG("%s status 0x%2.2x", hdev->name, status);
-
-	cp = hci_sent_cmd_data(hdev, HCI_OP_CREATE_PHY_LINK);
-	if (!cp)
-		return;
-
-	hci_dev_lock(hdev);
-
-	if (status) {
-		struct hci_conn *hcon;
-
-		hcon = hci_conn_hash_lookup_handle(hdev, cp->phy_handle);
-		if (hcon)
-			hci_conn_del(hcon);
-	} else {
-		amp_write_remote_assoc(hdev, cp->phy_handle);
-	}
-
-	hci_dev_unlock(hdev);
-}
-
-void amp_create_phylink(struct hci_dev *hdev, struct amp_mgr *mgr,
-			struct hci_conn *hcon)
-{
-	struct hci_cp_create_phy_link cp;
-	struct hci_request req;
-
-	cp.phy_handle = hcon->handle;
-
-	BT_DBG("%s hcon %p phy handle 0x%2.2x", hdev->name, hcon,
-	       hcon->handle);
-
-	if (phylink_gen_key(mgr->l2cap_conn->hcon, cp.key, &cp.key_len,
-			    &cp.key_type)) {
-		BT_DBG("Cannot create link key");
-		return;
-	}
-
-	hci_req_init(&req, hdev);
-	hci_req_add(&req, HCI_OP_CREATE_PHY_LINK, sizeof(cp), &cp);
-	hci_req_run(&req, create_phylink_complete);
-}
-
-static void accept_phylink_complete(struct hci_dev *hdev, u8 status,
-				    u16 opcode)
-{
-	struct hci_cp_accept_phy_link *cp;
-
-	BT_DBG("%s status 0x%2.2x", hdev->name, status);
-
-	if (status)
-		return;
-
-	cp = hci_sent_cmd_data(hdev, HCI_OP_ACCEPT_PHY_LINK);
-	if (!cp)
-		return;
-
-	amp_write_remote_assoc(hdev, cp->phy_handle);
-}
-
-void amp_accept_phylink(struct hci_dev *hdev, struct amp_mgr *mgr,
-			struct hci_conn *hcon)
-{
-	struct hci_cp_accept_phy_link cp;
-	struct hci_request req;
-
-	cp.phy_handle = hcon->handle;
-
-	BT_DBG("%s hcon %p phy handle 0x%2.2x", hdev->name, hcon,
-	       hcon->handle);
-
-	if (phylink_gen_key(mgr->l2cap_conn->hcon, cp.key, &cp.key_len,
-			    &cp.key_type)) {
-		BT_DBG("Cannot create link key");
-		return;
-	}
-
-	hci_req_init(&req, hdev);
-	hci_req_add(&req, HCI_OP_ACCEPT_PHY_LINK, sizeof(cp), &cp);
-	hci_req_run(&req, accept_phylink_complete);
-}
-
-void amp_physical_cfm(struct hci_conn *bredr_hcon, struct hci_conn *hs_hcon)
-{
-	struct hci_dev *bredr_hdev = hci_dev_hold(bredr_hcon->hdev);
-	struct amp_mgr *mgr = hs_hcon->amp_mgr;
-	struct l2cap_chan *bredr_chan;
-
-	BT_DBG("bredr_hcon %p hs_hcon %p mgr %p", bredr_hcon, hs_hcon, mgr);
-
-	if (!bredr_hdev || !mgr || !mgr->bredr_chan)
-		return;
-
-	bredr_chan = mgr->bredr_chan;
-
-	l2cap_chan_lock(bredr_chan);
-
-	set_bit(FLAG_EFS_ENABLE, &bredr_chan->flags);
-	bredr_chan->remote_amp_id = hs_hcon->remote_id;
-	bredr_chan->local_amp_id = hs_hcon->hdev->id;
-	bredr_chan->hs_hcon = hs_hcon;
-	bredr_chan->conn->mtu = hs_hcon->hdev->block_mtu;
-
-	__l2cap_physical_cfm(bredr_chan, 0);
-
-	l2cap_chan_unlock(bredr_chan);
-
-	hci_dev_put(bredr_hdev);
-}
-
-void amp_create_logical_link(struct l2cap_chan *chan)
-{
-	struct hci_conn *hs_hcon = chan->hs_hcon;
-	struct hci_cp_create_accept_logical_link cp;
-	struct hci_dev *hdev;
-
-	BT_DBG("chan %p hs_hcon %p dst %pMR", chan, hs_hcon,
-	       &chan->conn->hcon->dst);
-
-	if (!hs_hcon)
-		return;
-
-	hdev = hci_dev_hold(chan->hs_hcon->hdev);
-	if (!hdev)
-		return;
-
-	cp.phy_handle = hs_hcon->handle;
-
-	cp.tx_flow_spec.id = chan->local_id;
-	cp.tx_flow_spec.stype = chan->local_stype;
-	cp.tx_flow_spec.msdu = cpu_to_le16(chan->local_msdu);
-	cp.tx_flow_spec.sdu_itime = cpu_to_le32(chan->local_sdu_itime);
-	cp.tx_flow_spec.acc_lat = cpu_to_le32(chan->local_acc_lat);
-	cp.tx_flow_spec.flush_to = cpu_to_le32(chan->local_flush_to);
-
-	cp.rx_flow_spec.id = chan->remote_id;
-	cp.rx_flow_spec.stype = chan->remote_stype;
-	cp.rx_flow_spec.msdu = cpu_to_le16(chan->remote_msdu);
-	cp.rx_flow_spec.sdu_itime = cpu_to_le32(chan->remote_sdu_itime);
-	cp.rx_flow_spec.acc_lat = cpu_to_le32(chan->remote_acc_lat);
-	cp.rx_flow_spec.flush_to = cpu_to_le32(chan->remote_flush_to);
-
-	if (hs_hcon->out)
-		hci_send_cmd(hdev, HCI_OP_CREATE_LOGICAL_LINK, sizeof(cp),
-			     &cp);
-	else
-		hci_send_cmd(hdev, HCI_OP_ACCEPT_LOGICAL_LINK, sizeof(cp),
-			     &cp);
-
-	hci_dev_put(hdev);
-}
-
-void amp_disconnect_logical_link(struct hci_chan *hchan)
-{
-	struct hci_conn *hcon = hchan->conn;
-	struct hci_cp_disconn_logical_link cp;
-
-	if (hcon->state != BT_CONNECTED) {
-		BT_DBG("hchan %p not connected", hchan);
-		return;
-	}
-
-	cp.log_handle = cpu_to_le16(hchan->handle);
-	hci_send_cmd(hcon->hdev, HCI_OP_DISCONN_LOGICAL_LINK, sizeof(cp), &cp);
-}
-
-void amp_destroy_logical_link(struct hci_chan *hchan, u8 reason)
-{
-	BT_DBG("hchan %p", hchan);
-
-	hci_chan_del(hchan);
-}
diff --git a/net/bluetooth/amp.h b/net/bluetooth/amp.h
deleted file mode 100644
index 97c87abd129f..000000000000
--- a/net/bluetooth/amp.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
-   Copyright (c) 2011,2012 Intel Corp.
-
-*/
-
-#ifndef __AMP_H
-#define __AMP_H
-
-struct amp_ctrl {
-	struct list_head	list;
-	struct kref		kref;
-	__u8			id;
-	__u16			assoc_len_so_far;
-	__u16			assoc_rem_len;
-	__u16			assoc_len;
-	__u8			*assoc;
-};
-
-int amp_ctrl_put(struct amp_ctrl *ctrl);
-void amp_ctrl_get(struct amp_ctrl *ctrl);
-struct amp_ctrl *amp_ctrl_add(struct amp_mgr *mgr, u8 id);
-struct amp_ctrl *amp_ctrl_lookup(struct amp_mgr *mgr, u8 id);
-void amp_ctrl_list_flush(struct amp_mgr *mgr);
-
-struct hci_conn *phylink_add(struct hci_dev *hdev, struct amp_mgr *mgr,
-			     u8 remote_id, bool out);
-
-int phylink_gen_key(struct hci_conn *hcon, u8 *data, u8 *len, u8 *type);
-
-void amp_read_loc_assoc_frag(struct hci_dev *hdev, u8 phy_handle);
-void amp_read_loc_assoc(struct hci_dev *hdev, struct amp_mgr *mgr);
-void amp_read_loc_assoc_final_data(struct hci_dev *hdev,
-				   struct hci_conn *hcon);
-void amp_create_phylink(struct hci_dev *hdev, struct amp_mgr *mgr,
-			struct hci_conn *hcon);
-void amp_accept_phylink(struct hci_dev *hdev, struct amp_mgr *mgr,
-			struct hci_conn *hcon);
-
-#if IS_ENABLED(CONFIG_BT_HS)
-void amp_create_logical_link(struct l2cap_chan *chan);
-void amp_disconnect_logical_link(struct hci_chan *hchan);
-#else
-static inline void amp_create_logical_link(struct l2cap_chan *chan)
-{
-}
-
-static inline void amp_disconnect_logical_link(struct hci_chan *hchan)
-{
-}
-#endif
-
-void amp_write_remote_assoc(struct hci_dev *hdev, u8 handle);
-void amp_write_rem_assoc_continue(struct hci_dev *hdev, u8 handle);
-void amp_physical_cfm(struct hci_conn *bredr_hcon, struct hci_conn *hs_hcon);
-void amp_create_logical_link(struct l2cap_chan *chan);
-void amp_disconnect_logical_link(struct hci_chan *hchan);
-void amp_destroy_logical_link(struct hci_chan *hchan, u8 reason);
-
-#endif /* __AMP_H */
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index a41d2693f4d8..fc4d72f83ac2 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -36,7 +36,6 @@
 
 #include "hci_request.h"
 #include "smp.h"
-#include "a2mp.h"
 #include "eir.h"
 
 struct sco_param {
@@ -1175,9 +1174,6 @@ void hci_conn_del(struct hci_conn *conn)
 		}
 	}
 
-	if (conn->amp_mgr)
-		amp_mgr_put(conn->amp_mgr);
-
 	skb_queue_purge(&conn->data_q);
 
 	/* Remove the connection from the list and cleanup its remaining
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 31df5f5b7994..11b55d1f9772 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -36,8 +36,6 @@
 #include "hci_request.h"
 #include "hci_debugfs.h"
 #include "hci_codec.h"
-#include "a2mp.h"
-#include "amp.h"
 #include "smp.h"
 #include "msft.h"
 #include "eir.h"
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index 656f49b299d2..ab5a9d42fae7 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -39,8 +39,6 @@
 #include <net/bluetooth/l2cap.h>
 
 #include "smp.h"
-#include "a2mp.h"
-#include "amp.h"
 
 #define LE_FLOWCTL_MAX_CREDITS 65535
 
@@ -167,24 +165,6 @@ static struct l2cap_chan *__l2cap_get_chan_by_ident(struct l2cap_conn *conn,
 	return NULL;
 }
 
-static struct l2cap_chan *l2cap_get_chan_by_ident(struct l2cap_conn *conn,
-						  u8 ident)
-{
-	struct l2cap_chan *c;
-
-	mutex_lock(&conn->chan_lock);
-	c = __l2cap_get_chan_by_ident(conn, ident);
-	if (c) {
-		/* Only lock if chan reference is not 0 */
-		c = l2cap_chan_hold_unless_zero(c);
-		if (c)
-			l2cap_chan_lock(c);
-	}
-	mutex_unlock(&conn->chan_lock);
-
-	return c;
-}
-
 static struct l2cap_chan *__l2cap_global_chan_by_addr(__le16 psm, bdaddr_t *src,
 						      u8 src_type)
 {
@@ -651,7 +631,6 @@ void l2cap_chan_del(struct l2cap_chan *chan, int err)
 	chan->ops->teardown(chan, err);
 
 	if (conn) {
-		struct amp_mgr *mgr = conn->hcon->amp_mgr;
 		/* Delete from channel list */
 		list_del(&chan->list);
 
@@ -666,16 +645,6 @@ void l2cap_chan_del(struct l2cap_chan *chan, int err)
 		if (chan->chan_type != L2CAP_CHAN_FIXED ||
 		    test_bit(FLAG_HOLD_HCI_CONN, &chan->flags))
 			hci_conn_drop(conn->hcon);
-
-		if (mgr && mgr->bredr_chan == chan)
-			mgr->bredr_chan = NULL;
-	}
-
-	if (chan->hs_hchan) {
-		struct hci_chan *hs_hchan = chan->hs_hchan;
-
-		BT_DBG("chan %p disconnect hs_hchan %p", chan, hs_hchan);
-		amp_disconnect_logical_link(hs_hchan);
 	}
 
 	if (test_bit(CONF_NOT_COMPLETE, &chan->conf_state))
@@ -977,12 +946,6 @@ static void l2cap_send_cmd(struct l2cap_conn *conn, u8 ident, u8 code, u16 len,
 	hci_send_acl(conn->hchan, skb, flags);
 }
 
-static bool __chan_is_moving(struct l2cap_chan *chan)
-{
-	return chan->move_state != L2CAP_MOVE_STABLE &&
-	       chan->move_state != L2CAP_MOVE_WAIT_PREPARE;
-}
-
 static void l2cap_do_send(struct l2cap_chan *chan, struct sk_buff *skb)
 {
 	struct hci_conn *hcon = chan->conn->hcon;
@@ -991,15 +954,6 @@ static void l2cap_do_send(struct l2cap_chan *chan, struct sk_buff *skb)
 	BT_DBG("chan %p, skb %p len %d priority %u", chan, skb, skb->len,
 	       skb->priority);
 
-	if (chan->hs_hcon && !__chan_is_moving(chan)) {
-		if (chan->hs_hchan)
-			hci_send_acl(chan->hs_hchan, skb, ACL_COMPLETE);
-		else
-			kfree_skb(skb);
-
-		return;
-	}
-
 	/* Use NO_FLUSH for LE links (where this is the only option) or
 	 * if the BR/EDR link supports it and flushing has not been
 	 * explicitly requested (through FLAG_FLUSHABLE).
@@ -1180,9 +1134,6 @@ static void l2cap_send_sframe(struct l2cap_chan *chan,
 	if (!control->sframe)
 		return;
 
-	if (__chan_is_moving(chan))
-		return;
-
 	if (test_and_clear_bit(CONN_SEND_FBIT, &chan->conn_state) &&
 	    !control->poll)
 		control->final = 1;
@@ -1237,40 +1188,6 @@ static inline int __l2cap_no_conn_pending(struct l2cap_chan *chan)
 	return !test_bit(CONF_CONNECT_PEND, &chan->conf_state);
 }
 
-static bool __amp_capable(struct l2cap_chan *chan)
-{
-	struct l2cap_conn *conn = chan->conn;
-	struct hci_dev *hdev;
-	bool amp_available = false;
-
-	if (!(conn->local_fixed_chan & L2CAP_FC_A2MP))
-		return false;
-
-	if (!(conn->remote_fixed_chan & L2CAP_FC_A2MP))
-		return false;
-
-	read_lock(&hci_dev_list_lock);
-	list_for_each_entry(hdev, &hci_dev_list, list) {
-		if (hdev->amp_type != AMP_TYPE_BREDR &&
-		    test_bit(HCI_UP, &hdev->flags)) {
-			amp_available = true;
-			break;
-		}
-	}
-	read_unlock(&hci_dev_list_lock);
-
-	if (chan->chan_policy == BT_CHANNEL_POLICY_AMP_PREFERRED)
-		return amp_available;
-
-	return false;
-}
-
-static bool l2cap_check_efs(struct l2cap_chan *chan)
-{
-	/* Check EFS parameters */
-	return true;
-}
-
 void l2cap_send_conn_req(struct l2cap_chan *chan)
 {
 	struct l2cap_conn *conn = chan->conn;
@@ -1286,76 +1203,6 @@ void l2cap_send_conn_req(struct l2cap_chan *chan)
 	l2cap_send_cmd(conn, chan->ident, L2CAP_CONN_REQ, sizeof(req), &req);
 }
 
-static void l2cap_send_create_chan_req(struct l2cap_chan *chan, u8 amp_id)
-{
-	struct l2cap_create_chan_req req;
-	req.scid = cpu_to_le16(chan->scid);
-	req.psm  = chan->psm;
-	req.amp_id = amp_id;
-
-	chan->ident = l2cap_get_ident(chan->conn);
-
-	l2cap_send_cmd(chan->conn, chan->ident, L2CAP_CREATE_CHAN_REQ,
-		       sizeof(req), &req);
-}
-
-static void l2cap_move_setup(struct l2cap_chan *chan)
-{
-	struct sk_buff *skb;
-
-	BT_DBG("chan %p", chan);
-
-	if (chan->mode != L2CAP_MODE_ERTM)
-		return;
-
-	__clear_retrans_timer(chan);
-	__clear_monitor_timer(chan);
-	__clear_ack_timer(chan);
-
-	chan->retry_count = 0;
-	skb_queue_walk(&chan->tx_q, skb) {
-		if (bt_cb(skb)->l2cap.retries)
-			bt_cb(skb)->l2cap.retries = 1;
-		else
-			break;
-	}
-
-	chan->expected_tx_seq = chan->buffer_seq;
-
-	clear_bit(CONN_REJ_ACT, &chan->conn_state);
-	clear_bit(CONN_SREJ_ACT, &chan->conn_state);
-	l2cap_seq_list_clear(&chan->retrans_list);
-	l2cap_seq_list_clear(&chan->srej_list);
-	skb_queue_purge(&chan->srej_q);
-
-	chan->tx_state = L2CAP_TX_STATE_XMIT;
-	chan->rx_state = L2CAP_RX_STATE_MOVE;
-
-	set_bit(CONN_REMOTE_BUSY, &chan->conn_state);
-}
-
-static void l2cap_move_done(struct l2cap_chan *chan)
-{
-	u8 move_role = chan->move_role;
-	BT_DBG("chan %p", chan);
-
-	chan->move_state = L2CAP_MOVE_STABLE;
-	chan->move_role = L2CAP_MOVE_ROLE_NONE;
-
-	if (chan->mode != L2CAP_MODE_ERTM)
-		return;
-
-	switch (move_role) {
-	case L2CAP_MOVE_ROLE_INITIATOR:
-		l2cap_tx(chan, NULL, NULL, L2CAP_EV_EXPLICIT_POLL);
-		chan->rx_state = L2CAP_RX_STATE_WAIT_F;
-		break;
-	case L2CAP_MOVE_ROLE_RESPONDER:
-		chan->rx_state = L2CAP_RX_STATE_WAIT_P;
-		break;
-	}
-}
-
 static void l2cap_chan_ready(struct l2cap_chan *chan)
 {
 	/* The channel may have already been flagged as connected in
@@ -1505,10 +1352,7 @@ static void l2cap_le_start(struct l2cap_chan *chan)
 
 static void l2cap_start_connection(struct l2cap_chan *chan)
 {
-	if (__amp_capable(chan)) {
-		BT_DBG("chan %p AMP capable: discover AMPs", chan);
-		a2mp_discover_amp(chan);
-	} else if (chan->conn->hcon->type == LE_LINK) {
+	if (chan->conn->hcon->type == LE_LINK) {
 		l2cap_le_start(chan);
 	} else {
 		l2cap_send_conn_req(chan);
@@ -1611,11 +1455,6 @@ static void l2cap_send_disconn_req(struct l2cap_chan *chan, int err)
 		__clear_ack_timer(chan);
 	}
 
-	if (chan->scid == L2CAP_CID_A2MP) {
-		l2cap_state_change(chan, BT_DISCONN);
-		return;
-	}
-
 	req.dcid = cpu_to_le16(chan->dcid);
 	req.scid = cpu_to_le16(chan->scid);
 	l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_DISCONN_REQ,
@@ -1754,11 +1593,6 @@ static void l2cap_conn_ready(struct l2cap_conn *conn)
 
 		l2cap_chan_lock(chan);
 
-		if (chan->scid == L2CAP_CID_A2MP) {
-			l2cap_chan_unlock(chan);
-			continue;
-		}
-
 		if (hcon->type == LE_LINK) {
 			l2cap_le_start(chan);
 		} else if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED) {
@@ -2067,9 +1901,6 @@ static void l2cap_streaming_send(struct l2cap_chan *chan,
 
 	BT_DBG("chan %p, skbs %p", chan, skbs);
 
-	if (__chan_is_moving(chan))
-		return;
-
 	skb_queue_splice_tail_init(skbs, &chan->tx_q);
 
 	while (!skb_queue_empty(&chan->tx_q)) {
@@ -2112,9 +1943,6 @@ static int l2cap_ertm_send(struct l2cap_chan *chan)
 	if (test_bit(CONN_REMOTE_BUSY, &chan->conn_state))
 		return 0;
 
-	if (__chan_is_moving(chan))
-		return 0;
-
 	while (chan->tx_send_head &&
 	       chan->unacked_frames < chan->remote_tx_win &&
 	       chan->tx_state == L2CAP_TX_STATE_XMIT) {
@@ -2180,9 +2008,6 @@ static void l2cap_ertm_resend(struct l2cap_chan *chan)
 	if (test_bit(CONN_REMOTE_BUSY, &chan->conn_state))
 		return;
 
-	if (__chan_is_moving(chan))
-		return;
-
 	while (chan->retrans_list.head != L2CAP_SEQ_LIST_CLEAR) {
 		seq = l2cap_seq_list_pop(&chan->retrans_list);
 
@@ -2522,8 +2347,7 @@ static int l2cap_segment_sdu(struct l2cap_chan *chan,
 	pdu_len = chan->conn->mtu;
 
 	/* Constrain PDU size for BR/EDR connections */
-	if (!chan->hs_hcon)
-		pdu_len = min_t(size_t, pdu_len, L2CAP_BREDR_MAX_PAYLOAD);
+	pdu_len = min_t(size_t, pdu_len, L2CAP_BREDR_MAX_PAYLOAD);
 
 	/* Adjust for largest possible L2CAP overhead. */
 	if (chan->fcs)
@@ -3287,11 +3111,6 @@ int l2cap_ertm_init(struct l2cap_chan *chan)
 
 	skb_queue_head_init(&chan->tx_q);
 
-	chan->local_amp_id = AMP_ID_BREDR;
-	chan->move_id = AMP_ID_BREDR;
-	chan->move_state = L2CAP_MOVE_STABLE;
-	chan->move_role = L2CAP_MOVE_ROLE_NONE;
-
 	if (chan->mode != L2CAP_MODE_ERTM)
 		return 0;
 
@@ -3326,52 +3145,19 @@ static inline __u8 l2cap_select_mode(__u8 mode, __u16 remote_feat_mask)
 
 static inline bool __l2cap_ews_supported(struct l2cap_conn *conn)
 {
-	return ((conn->local_fixed_chan & L2CAP_FC_A2MP) &&
-		(conn->feat_mask & L2CAP_FEAT_EXT_WINDOW));
+	return (conn->feat_mask & L2CAP_FEAT_EXT_WINDOW);
 }
 
 static inline bool __l2cap_efs_supported(struct l2cap_conn *conn)
 {
-	return ((conn->local_fixed_chan & L2CAP_FC_A2MP) &&
-		(conn->feat_mask & L2CAP_FEAT_EXT_FLOW));
+	return (conn->feat_mask & L2CAP_FEAT_EXT_FLOW);
 }
 
 static void __l2cap_set_ertm_timeouts(struct l2cap_chan *chan,
 				      struct l2cap_conf_rfc *rfc)
 {
-	if (chan->local_amp_id != AMP_ID_BREDR && chan->hs_hcon) {
-		u64 ertm_to = chan->hs_hcon->hdev->amp_be_flush_to;
-
-		/* Class 1 devices have must have ERTM timeouts
-		 * exceeding the Link Supervision Timeout.  The
-		 * default Link Supervision Timeout for AMP
-		 * controllers is 10 seconds.
-		 *
-		 * Class 1 devices use 0xffffffff for their
-		 * best-effort flush timeout, so the clamping logic
-		 * will result in a timeout that meets the above
-		 * requirement.  ERTM timeouts are 16-bit values, so
-		 * the maximum timeout is 65.535 seconds.
-		 */
-
-		/* Convert timeout to milliseconds and round */
-		ertm_to = DIV_ROUND_UP_ULL(ertm_to, 1000);
-
-		/* This is the recommended formula for class 2 devices
-		 * that start ERTM timers when packets are sent to the
-		 * controller.
-		 */
-		ertm_to = 3 * ertm_to + 500;
-
-		if (ertm_to > 0xffff)
-			ertm_to = 0xffff;
-
-		rfc->retrans_timeout = cpu_to_le16((u16) ertm_to);
-		rfc->monitor_timeout = rfc->retrans_timeout;
-	} else {
-		rfc->retrans_timeout = cpu_to_le16(L2CAP_DEFAULT_RETRANS_TO);
-		rfc->monitor_timeout = cpu_to_le16(L2CAP_DEFAULT_MONITOR_TO);
-	}
+	rfc->retrans_timeout = cpu_to_le16(L2CAP_DEFAULT_RETRANS_TO);
+	rfc->monitor_timeout = cpu_to_le16(L2CAP_DEFAULT_MONITOR_TO);
 }
 
 static inline void l2cap_txwin_setup(struct l2cap_chan *chan)
@@ -3623,13 +3409,7 @@ static int l2cap_parse_conf_req(struct l2cap_chan *chan, void *data, size_t data
 		case L2CAP_CONF_EWS:
 			if (olen != 2)
 				break;
-			if (!(chan->conn->local_fixed_chan & L2CAP_FC_A2MP))
-				return -ECONNREFUSED;
-			set_bit(FLAG_EXT_CTRL, &chan->flags);
-			set_bit(CONF_EWS_RECV, &chan->conf_state);
-			chan->tx_win_max = L2CAP_DEFAULT_EXT_WINDOW;
-			chan->remote_tx_win = val;
-			break;
+			return -ECONNREFUSED;
 
 		default:
 			if (hint)
@@ -4027,11 +3807,7 @@ void __l2cap_connect_rsp_defer(struct l2cap_chan *chan)
 	rsp.dcid   = cpu_to_le16(chan->scid);
 	rsp.result = cpu_to_le16(L2CAP_CR_SUCCESS);
 	rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO);
-
-	if (chan->hs_hcon)
-		rsp_code = L2CAP_CREATE_CHAN_RSP;
-	else
-		rsp_code = L2CAP_CONN_RSP;
+	rsp_code = L2CAP_CONN_RSP;
 
 	BT_DBG("chan %p rsp_code %u", chan, rsp_code);
 
@@ -4190,7 +3966,6 @@ static struct l2cap_chan *l2cap_connect(struct l2cap_conn *conn,
 	chan->dst_type = bdaddr_dst_type(conn->hcon);
 	chan->psm  = psm;
 	chan->dcid = scid;
-	chan->local_amp_id = amp_id;
 
 	__l2cap_chan_add(conn, chan);
 
@@ -4516,10 +4291,7 @@ static inline int l2cap_config_req(struct l2cap_conn *conn,
 		/* check compatibility */
 
 		/* Send rsp for BR/EDR channel */
-		if (!chan->hs_hcon)
-			l2cap_send_efs_conf_rsp(chan, rsp, cmd->ident, flags);
-		else
-			chan->ident = cmd->ident;
+		l2cap_send_efs_conf_rsp(chan, rsp, cmd->ident, flags);
 	}
 
 unlock:
@@ -4571,15 +4343,7 @@ static inline int l2cap_config_rsp(struct l2cap_conn *conn,
 				goto done;
 			}
 
-			if (!chan->hs_hcon) {
-				l2cap_send_efs_conf_rsp(chan, buf, cmd->ident,
-							0);
-			} else {
-				if (l2cap_check_efs(chan)) {
-					amp_create_logical_link(chan);
-					chan->ident = cmd->ident;
-				}
-			}
+			l2cap_send_efs_conf_rsp(chan, buf, cmd->ident, 0);
 		}
 		goto done;
 
@@ -4750,9 +4514,6 @@ static inline int l2cap_information_req(struct l2cap_conn *conn,
 		if (!disable_ertm)
 			feat_mask |= L2CAP_FEAT_ERTM | L2CAP_FEAT_STREAMING
 				| L2CAP_FEAT_FCS;
-		if (conn->local_fixed_chan & L2CAP_FC_A2MP)
-			feat_mask |= L2CAP_FEAT_EXT_FLOW
-				| L2CAP_FEAT_EXT_WINDOW;
 
 		put_unaligned_le32(feat_mask, rsp->data);
 		l2cap_send_cmd(conn, cmd->ident, L2CAP_INFO_RSP, sizeof(buf),
@@ -4841,751 +4602,6 @@ static inline int l2cap_information_rsp(struct l2cap_conn *conn,
 	return 0;
 }
 
-static int l2cap_create_channel_req(struct l2cap_conn *conn,
-				    struct l2cap_cmd_hdr *cmd,
-				    u16 cmd_len, void *data)
-{
-	struct l2cap_create_chan_req *req = data;
-	struct l2cap_create_chan_rsp rsp;
-	struct l2cap_chan *chan;
-	struct hci_dev *hdev;
-	u16 psm, scid;
-
-	if (cmd_len != sizeof(*req))
-		return -EPROTO;
-
-	if (!(conn->local_fixed_chan & L2CAP_FC_A2MP))
-		return -EINVAL;
-
-	psm = le16_to_cpu(req->psm);
-	scid = le16_to_cpu(req->scid);
-
-	BT_DBG("psm 0x%2.2x, scid 0x%4.4x, amp_id %d", psm, scid, req->amp_id);
-
-	/* For controller id 0 make BR/EDR connection */
-	if (req->amp_id == AMP_ID_BREDR) {
-		l2cap_connect(conn, cmd, data, L2CAP_CREATE_CHAN_RSP,
-			      req->amp_id);
-		return 0;
-	}
-
-	/* Validate AMP controller id */
-	hdev = hci_dev_get(req->amp_id);
-	if (!hdev)
-		goto error;
-
-	if (hdev->dev_type != HCI_AMP || !test_bit(HCI_UP, &hdev->flags)) {
-		hci_dev_put(hdev);
-		goto error;
-	}
-
-	chan = l2cap_connect(conn, cmd, data, L2CAP_CREATE_CHAN_RSP,
-			     req->amp_id);
-	if (chan) {
-		struct amp_mgr *mgr = conn->hcon->amp_mgr;
-		struct hci_conn *hs_hcon;
-
-		hs_hcon = hci_conn_hash_lookup_ba(hdev, AMP_LINK,
-						  &conn->hcon->dst);
-		if (!hs_hcon) {
-			hci_dev_put(hdev);
-			cmd_reject_invalid_cid(conn, cmd->ident, chan->scid,
-					       chan->dcid);
-			return 0;
-		}
-
-		BT_DBG("mgr %p bredr_chan %p hs_hcon %p", mgr, chan, hs_hcon);
-
-		mgr->bredr_chan = chan;
-		chan->hs_hcon = hs_hcon;
-		chan->fcs = L2CAP_FCS_NONE;
-		conn->mtu = hdev->block_mtu;
-	}
-
-	hci_dev_put(hdev);
-
-	return 0;
-
-error:
-	rsp.dcid = 0;
-	rsp.scid = cpu_to_le16(scid);
-	rsp.result = cpu_to_le16(L2CAP_CR_BAD_AMP);
-	rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO);
-
-	l2cap_send_cmd(conn, cmd->ident, L2CAP_CREATE_CHAN_RSP,
-		       sizeof(rsp), &rsp);
-
-	return 0;
-}
-
-static void l2cap_send_move_chan_req(struct l2cap_chan *chan, u8 dest_amp_id)
-{
-	struct l2cap_move_chan_req req;
-	u8 ident;
-
-	BT_DBG("chan %p, dest_amp_id %d", chan, dest_amp_id);
-
-	ident = l2cap_get_ident(chan->conn);
-	chan->ident = ident;
-
-	req.icid = cpu_to_le16(chan->scid);
-	req.dest_amp_id = dest_amp_id;
-
-	l2cap_send_cmd(chan->conn, ident, L2CAP_MOVE_CHAN_REQ, sizeof(req),
-		       &req);
-
-	__set_chan_timer(chan, L2CAP_MOVE_TIMEOUT);
-}
-
-static void l2cap_send_move_chan_rsp(struct l2cap_chan *chan, u16 result)
-{
-	struct l2cap_move_chan_rsp rsp;
-
-	BT_DBG("chan %p, result 0x%4.4x", chan, result);
-
-	rsp.icid = cpu_to_le16(chan->dcid);
-	rsp.result = cpu_to_le16(result);
-
-	l2cap_send_cmd(chan->conn, chan->ident, L2CAP_MOVE_CHAN_RSP,
-		       sizeof(rsp), &rsp);
-}
-
-static void l2cap_send_move_chan_cfm(struct l2cap_chan *chan, u16 result)
-{
-	struct l2cap_move_chan_cfm cfm;
-
-	BT_DBG("chan %p, result 0x%4.4x", chan, result);
-
-	chan->ident = l2cap_get_ident(chan->conn);
-
-	cfm.icid = cpu_to_le16(chan->scid);
-	cfm.result = cpu_to_le16(result);
-
-	l2cap_send_cmd(chan->conn, chan->ident, L2CAP_MOVE_CHAN_CFM,
-		       sizeof(cfm), &cfm);
-
-	__set_chan_timer(chan, L2CAP_MOVE_TIMEOUT);
-}
-
-static void l2cap_send_move_chan_cfm_icid(struct l2cap_conn *conn, u16 icid)
-{
-	struct l2cap_move_chan_cfm cfm;
-
-	BT_DBG("conn %p, icid 0x%4.4x", conn, icid);
-
-	cfm.icid = cpu_to_le16(icid);
-	cfm.result = cpu_to_le16(L2CAP_MC_UNCONFIRMED);
-
-	l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_MOVE_CHAN_CFM,
-		       sizeof(cfm), &cfm);
-}
-
-static void l2cap_send_move_chan_cfm_rsp(struct l2cap_conn *conn, u8 ident,
-					 u16 icid)
-{
-	struct l2cap_move_chan_cfm_rsp rsp;
-
-	BT_DBG("icid 0x%4.4x", icid);
-
-	rsp.icid = cpu_to_le16(icid);
-	l2cap_send_cmd(conn, ident, L2CAP_MOVE_CHAN_CFM_RSP, sizeof(rsp), &rsp);
-}
-
-static void __release_logical_link(struct l2cap_chan *chan)
-{
-	chan->hs_hchan = NULL;
-	chan->hs_hcon = NULL;
-
-	/* Placeholder - release the logical link */
-}
-
-static void l2cap_logical_fail(struct l2cap_chan *chan)
-{
-	/* Logical link setup failed */
-	if (chan->state != BT_CONNECTED) {
-		/* Create channel failure, disconnect */
-		l2cap_send_disconn_req(chan, ECONNRESET);
-		return;
-	}
-
-	switch (chan->move_role) {
-	case L2CAP_MOVE_ROLE_RESPONDER:
-		l2cap_move_done(chan);
-		l2cap_send_move_chan_rsp(chan, L2CAP_MR_NOT_SUPP);
-		break;
-	case L2CAP_MOVE_ROLE_INITIATOR:
-		if (chan->move_state == L2CAP_MOVE_WAIT_LOGICAL_COMP ||
-		    chan->move_state == L2CAP_MOVE_WAIT_LOGICAL_CFM) {
-			/* Remote has only sent pending or
-			 * success responses, clean up
-			 */
-			l2cap_move_done(chan);
-		}
-
-		/* Other amp move states imply that the move
-		 * has already aborted
-		 */
-		l2cap_send_move_chan_cfm(chan, L2CAP_MC_UNCONFIRMED);
-		break;
-	}
-}
-
-static void l2cap_logical_finish_create(struct l2cap_chan *chan,
-					struct hci_chan *hchan)
-{
-	struct l2cap_conf_rsp rsp;
-
-	chan->hs_hchan = hchan;
-	chan->hs_hcon->l2cap_data = chan->conn;
-
-	l2cap_send_efs_conf_rsp(chan, &rsp, chan->ident, 0);
-
-	if (test_bit(CONF_INPUT_DONE, &chan->conf_state)) {
-		int err;
-
-		set_default_fcs(chan);
-
-		err = l2cap_ertm_init(chan);
-		if (err < 0)
-			l2cap_send_disconn_req(chan, -err);
-		else
-			l2cap_chan_ready(chan);
-	}
-}
-
-static void l2cap_logical_finish_move(struct l2cap_chan *chan,
-				      struct hci_chan *hchan)
-{
-	chan->hs_hcon = hchan->conn;
-	chan->hs_hcon->l2cap_data = chan->conn;
-
-	BT_DBG("move_state %d", chan->move_state);
-
-	switch (chan->move_state) {
-	case L2CAP_MOVE_WAIT_LOGICAL_COMP:
-		/* Move confirm will be sent after a success
-		 * response is received
-		 */
-		chan->move_state = L2CAP_MOVE_WAIT_RSP_SUCCESS;
-		break;
-	case L2CAP_MOVE_WAIT_LOGICAL_CFM:
-		if (test_bit(CONN_LOCAL_BUSY, &chan->conn_state)) {
-			chan->move_state = L2CAP_MOVE_WAIT_LOCAL_BUSY;
-		} else if (chan->move_role == L2CAP_MOVE_ROLE_INITIATOR) {
-			chan->move_state = L2CAP_MOVE_WAIT_CONFIRM_RSP;
-			l2cap_send_move_chan_cfm(chan, L2CAP_MC_CONFIRMED);
-		} else if (chan->move_role == L2CAP_MOVE_ROLE_RESPONDER) {
-			chan->move_state = L2CAP_MOVE_WAIT_CONFIRM;
-			l2cap_send_move_chan_rsp(chan, L2CAP_MR_SUCCESS);
-		}
-		break;
-	default:
-		/* Move was not in expected state, free the channel */
-		__release_logical_link(chan);
-
-		chan->move_state = L2CAP_MOVE_STABLE;
-	}
-}
-
-/* Call with chan locked */
-void l2cap_logical_cfm(struct l2cap_chan *chan, struct hci_chan *hchan,
-		       u8 status)
-{
-	BT_DBG("chan %p, hchan %p, status %d", chan, hchan, status);
-
-	if (status) {
-		l2cap_logical_fail(chan);
-		__release_logical_link(chan);
-		return;
-	}
-
-	if (chan->state != BT_CONNECTED) {
-		/* Ignore logical link if channel is on BR/EDR */
-		if (chan->local_amp_id != AMP_ID_BREDR)
-			l2cap_logical_finish_create(chan, hchan);
-	} else {
-		l2cap_logical_finish_move(chan, hchan);
-	}
-}
-
-void l2cap_move_start(struct l2cap_chan *chan)
-{
-	BT_DBG("chan %p", chan);
-
-	if (chan->local_amp_id == AMP_ID_BREDR) {
-		if (chan->chan_policy != BT_CHANNEL_POLICY_AMP_PREFERRED)
-			return;
-		chan->move_role = L2CAP_MOVE_ROLE_INITIATOR;
-		chan->move_state = L2CAP_MOVE_WAIT_PREPARE;
-		/* Placeholder - start physical link setup */
-	} else {
-		chan->move_role = L2CAP_MOVE_ROLE_INITIATOR;
-		chan->move_state = L2CAP_MOVE_WAIT_RSP_SUCCESS;
-		chan->move_id = 0;
-		l2cap_move_setup(chan);
-		l2cap_send_move_chan_req(chan, 0);
-	}
-}
-
-static void l2cap_do_create(struct l2cap_chan *chan, int result,
-			    u8 local_amp_id, u8 remote_amp_id)
-{
-	BT_DBG("chan %p state %s %u -> %u", chan, state_to_string(chan->state),
-	       local_amp_id, remote_amp_id);
-
-	chan->fcs = L2CAP_FCS_NONE;
-
-	/* Outgoing channel on AMP */
-	if (chan->state == BT_CONNECT) {
-		if (result == L2CAP_CR_SUCCESS) {
-			chan->local_amp_id = local_amp_id;
-			l2cap_send_create_chan_req(chan, remote_amp_id);
-		} else {
-			/* Revert to BR/EDR connect */
-			l2cap_send_conn_req(chan);
-		}
-
-		return;
-	}
-
-	/* Incoming channel on AMP */
-	if (__l2cap_no_conn_pending(chan)) {
-		struct l2cap_conn_rsp rsp;
-		char buf[128];
-		rsp.scid = cpu_to_le16(chan->dcid);
-		rsp.dcid = cpu_to_le16(chan->scid);
-
-		if (result == L2CAP_CR_SUCCESS) {
-			/* Send successful response */
-			rsp.result = cpu_to_le16(L2CAP_CR_SUCCESS);
-			rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO);
-		} else {
-			/* Send negative response */
-			rsp.result = cpu_to_le16(L2CAP_CR_NO_MEM);
-			rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO);
-		}
-
-		l2cap_send_cmd(chan->conn, chan->ident, L2CAP_CREATE_CHAN_RSP,
-			       sizeof(rsp), &rsp);
-
-		if (result == L2CAP_CR_SUCCESS) {
-			l2cap_state_change(chan, BT_CONFIG);
-			set_bit(CONF_REQ_SENT, &chan->conf_state);
-			l2cap_send_cmd(chan->conn, l2cap_get_ident(chan->conn),
-				       L2CAP_CONF_REQ,
-				       l2cap_build_conf_req(chan, buf, sizeof(buf)), buf);
-			chan->num_conf_req++;
-		}
-	}
-}
-
-static void l2cap_do_move_initiate(struct l2cap_chan *chan, u8 local_amp_id,
-				   u8 remote_amp_id)
-{
-	l2cap_move_setup(chan);
-	chan->move_id = local_amp_id;
-	chan->move_state = L2CAP_MOVE_WAIT_RSP;
-
-	l2cap_send_move_chan_req(chan, remote_amp_id);
-}
-
-static void l2cap_do_move_respond(struct l2cap_chan *chan, int result)
-{
-	struct hci_chan *hchan = NULL;
-
-	/* Placeholder - get hci_chan for logical link */
-
-	if (hchan) {
-		if (hchan->state == BT_CONNECTED) {
-			/* Logical link is ready to go */
-			chan->hs_hcon = hchan->conn;
-			chan->hs_hcon->l2cap_data = chan->conn;
-			chan->move_state = L2CAP_MOVE_WAIT_CONFIRM;
-			l2cap_send_move_chan_rsp(chan, L2CAP_MR_SUCCESS);
-
-			l2cap_logical_cfm(chan, hchan, L2CAP_MR_SUCCESS);
-		} else {
-			/* Wait for logical link to be ready */
-			chan->move_state = L2CAP_MOVE_WAIT_LOGICAL_CFM;
-		}
-	} else {
-		/* Logical link not available */
-		l2cap_send_move_chan_rsp(chan, L2CAP_MR_NOT_ALLOWED);
-	}
-}
-
-static void l2cap_do_move_cancel(struct l2cap_chan *chan, int result)
-{
-	if (chan->move_role == L2CAP_MOVE_ROLE_RESPONDER) {
-		u8 rsp_result;
-		if (result == -EINVAL)
-			rsp_result = L2CAP_MR_BAD_ID;
-		else
-			rsp_result = L2CAP_MR_NOT_ALLOWED;
-
-		l2cap_send_move_chan_rsp(chan, rsp_result);
-	}
-
-	chan->move_role = L2CAP_MOVE_ROLE_NONE;
-	chan->move_state = L2CAP_MOVE_STABLE;
-
-	/* Restart data transmission */
-	l2cap_ertm_send(chan);
-}
-
-/* Invoke with locked chan */
-void __l2cap_physical_cfm(struct l2cap_chan *chan, int result)
-{
-	u8 local_amp_id = chan->local_amp_id;
-	u8 remote_amp_id = chan->remote_amp_id;
-
-	BT_DBG("chan %p, result %d, local_amp_id %d, remote_amp_id %d",
-	       chan, result, local_amp_id, remote_amp_id);
-
-	if (chan->state == BT_DISCONN || chan->state == BT_CLOSED)
-		return;
-
-	if (chan->state != BT_CONNECTED) {
-		l2cap_do_create(chan, result, local_amp_id, remote_amp_id);
-	} else if (result != L2CAP_MR_SUCCESS) {
-		l2cap_do_move_cancel(chan, result);
-	} else {
-		switch (chan->move_role) {
-		case L2CAP_MOVE_ROLE_INITIATOR:
-			l2cap_do_move_initiate(chan, local_amp_id,
-					       remote_amp_id);
-			break;
-		case L2CAP_MOVE_ROLE_RESPONDER:
-			l2cap_do_move_respond(chan, result);
-			break;
-		default:
-			l2cap_do_move_cancel(chan, result);
-			break;
-		}
-	}
-}
-
-static inline int l2cap_move_channel_req(struct l2cap_conn *conn,
-					 struct l2cap_cmd_hdr *cmd,
-					 u16 cmd_len, void *data)
-{
-	struct l2cap_move_chan_req *req = data;
-	struct l2cap_move_chan_rsp rsp;
-	struct l2cap_chan *chan;
-	u16 icid = 0;
-	u16 result = L2CAP_MR_NOT_ALLOWED;
-
-	if (cmd_len != sizeof(*req))
-		return -EPROTO;
-
-	icid = le16_to_cpu(req->icid);
-
-	BT_DBG("icid 0x%4.4x, dest_amp_id %d", icid, req->dest_amp_id);
-
-	if (!(conn->local_fixed_chan & L2CAP_FC_A2MP))
-		return -EINVAL;
-
-	chan = l2cap_get_chan_by_dcid(conn, icid);
-	if (!chan) {
-		rsp.icid = cpu_to_le16(icid);
-		rsp.result = cpu_to_le16(L2CAP_MR_NOT_ALLOWED);
-		l2cap_send_cmd(conn, cmd->ident, L2CAP_MOVE_CHAN_RSP,
-			       sizeof(rsp), &rsp);
-		return 0;
-	}
-
-	chan->ident = cmd->ident;
-
-	if (chan->scid < L2CAP_CID_DYN_START ||
-	    chan->chan_policy == BT_CHANNEL_POLICY_BREDR_ONLY ||
-	    (chan->mode != L2CAP_MODE_ERTM &&
-	     chan->mode != L2CAP_MODE_STREAMING)) {
-		result = L2CAP_MR_NOT_ALLOWED;
-		goto send_move_response;
-	}
-
-	if (chan->local_amp_id == req->dest_amp_id) {
-		result = L2CAP_MR_SAME_ID;
-		goto send_move_response;
-	}
-
-	if (req->dest_amp_id != AMP_ID_BREDR) {
-		struct hci_dev *hdev;
-		hdev = hci_dev_get(req->dest_amp_id);
-		if (!hdev || hdev->dev_type != HCI_AMP ||
-		    !test_bit(HCI_UP, &hdev->flags)) {
-			if (hdev)
-				hci_dev_put(hdev);
-
-			result = L2CAP_MR_BAD_ID;
-			goto send_move_response;
-		}
-		hci_dev_put(hdev);
-	}
-
-	/* Detect a move collision.  Only send a collision response
-	 * if this side has "lost", otherwise proceed with the move.
-	 * The winner has the larger bd_addr.
-	 */
-	if ((__chan_is_moving(chan) ||
-	     chan->move_role != L2CAP_MOVE_ROLE_NONE) &&
-	    bacmp(&conn->hcon->src, &conn->hcon->dst) > 0) {
-		result = L2CAP_MR_COLLISION;
-		goto send_move_response;
-	}
-
-	chan->move_role = L2CAP_MOVE_ROLE_RESPONDER;
-	l2cap_move_setup(chan);
-	chan->move_id = req->dest_amp_id;
-
-	if (req->dest_amp_id == AMP_ID_BREDR) {
-		/* Moving to BR/EDR */
-		if (test_bit(CONN_LOCAL_BUSY, &chan->conn_state)) {
-			chan->move_state = L2CAP_MOVE_WAIT_LOCAL_BUSY;
-			result = L2CAP_MR_PEND;
-		} else {
-			chan->move_state = L2CAP_MOVE_WAIT_CONFIRM;
-			result = L2CAP_MR_SUCCESS;
-		}
-	} else {
-		chan->move_state = L2CAP_MOVE_WAIT_PREPARE;
-		/* Placeholder - uncomment when amp functions are available */
-		/*amp_accept_physical(chan, req->dest_amp_id);*/
-		result = L2CAP_MR_PEND;
-	}
-
-send_move_response:
-	l2cap_send_move_chan_rsp(chan, result);
-
-	l2cap_chan_unlock(chan);
-	l2cap_chan_put(chan);
-
-	return 0;
-}
-
-static void l2cap_move_continue(struct l2cap_conn *conn, u16 icid, u16 result)
-{
-	struct l2cap_chan *chan;
-	struct hci_chan *hchan = NULL;
-
-	chan = l2cap_get_chan_by_scid(conn, icid);
-	if (!chan) {
-		l2cap_send_move_chan_cfm_icid(conn, icid);
-		return;
-	}
-
-	__clear_chan_timer(chan);
-	if (result == L2CAP_MR_PEND)
-		__set_chan_timer(chan, L2CAP_MOVE_ERTX_TIMEOUT);
-
-	switch (chan->move_state) {
-	case L2CAP_MOVE_WAIT_LOGICAL_COMP:
-		/* Move confirm will be sent when logical link
-		 * is complete.
-		 */
-		chan->move_state = L2CAP_MOVE_WAIT_LOGICAL_CFM;
-		break;
-	case L2CAP_MOVE_WAIT_RSP_SUCCESS:
-		if (result == L2CAP_MR_PEND) {
-			break;
-		} else if (test_bit(CONN_LOCAL_BUSY,
-				    &chan->conn_state)) {
-			chan->move_state = L2CAP_MOVE_WAIT_LOCAL_BUSY;
-		} else {
-			/* Logical link is up or moving to BR/EDR,
-			 * proceed with move
-			 */
-			chan->move_state = L2CAP_MOVE_WAIT_CONFIRM_RSP;
-			l2cap_send_move_chan_cfm(chan, L2CAP_MC_CONFIRMED);
-		}
-		break;
-	case L2CAP_MOVE_WAIT_RSP:
-		/* Moving to AMP */
-		if (result == L2CAP_MR_SUCCESS) {
-			/* Remote is ready, send confirm immediately
-			 * after logical link is ready
-			 */
-			chan->move_state = L2CAP_MOVE_WAIT_LOGICAL_CFM;
-		} else {
-			/* Both logical link and move success
-			 * are required to confirm
-			 */
-			chan->move_state = L2CAP_MOVE_WAIT_LOGICAL_COMP;
-		}
-
-		/* Placeholder - get hci_chan for logical link */
-		if (!hchan) {
-			/* Logical link not available */
-			l2cap_send_move_chan_cfm(chan, L2CAP_MC_UNCONFIRMED);
-			break;
-		}
-
-		/* If the logical link is not yet connected, do not
-		 * send confirmation.
-		 */
-		if (hchan->state != BT_CONNECTED)
-			break;
-
-		/* Logical link is already ready to go */
-
-		chan->hs_hcon = hchan->conn;
-		chan->hs_hcon->l2cap_data = chan->conn;
-
-		if (result == L2CAP_MR_SUCCESS) {
-			/* Can confirm now */
-			l2cap_send_move_chan_cfm(chan, L2CAP_MC_CONFIRMED);
-		} else {
-			/* Now only need move success
-			 * to confirm
-			 */
-			chan->move_state = L2CAP_MOVE_WAIT_RSP_SUCCESS;
-		}
-
-		l2cap_logical_cfm(chan, hchan, L2CAP_MR_SUCCESS);
-		break;
-	default:
-		/* Any other amp move state means the move failed. */
-		chan->move_id = chan->local_amp_id;
-		l2cap_move_done(chan);
-		l2cap_send_move_chan_cfm(chan, L2CAP_MC_UNCONFIRMED);
-	}
-
-	l2cap_chan_unlock(chan);
-	l2cap_chan_put(chan);
-}
-
-static void l2cap_move_fail(struct l2cap_conn *conn, u8 ident, u16 icid,
-			    u16 result)
-{
-	struct l2cap_chan *chan;
-
-	chan = l2cap_get_chan_by_ident(conn, ident);
-	if (!chan) {
-		/* Could not locate channel, icid is best guess */
-		l2cap_send_move_chan_cfm_icid(conn, icid);
-		return;
-	}
-
-	__clear_chan_timer(chan);
-
-	if (chan->move_role == L2CAP_MOVE_ROLE_INITIATOR) {
-		if (result == L2CAP_MR_COLLISION) {
-			chan->move_role = L2CAP_MOVE_ROLE_RESPONDER;
-		} else {
-			/* Cleanup - cancel move */
-			chan->move_id = chan->local_amp_id;
-			l2cap_move_done(chan);
-		}
-	}
-
-	l2cap_send_move_chan_cfm(chan, L2CAP_MC_UNCONFIRMED);
-
-	l2cap_chan_unlock(chan);
-	l2cap_chan_put(chan);
-}
-
-static int l2cap_move_channel_rsp(struct l2cap_conn *conn,
-				  struct l2cap_cmd_hdr *cmd,
-				  u16 cmd_len, void *data)
-{
-	struct l2cap_move_chan_rsp *rsp = data;
-	u16 icid, result;
-
-	if (cmd_len != sizeof(*rsp))
-		return -EPROTO;
-
-	icid = le16_to_cpu(rsp->icid);
-	result = le16_to_cpu(rsp->result);
-
-	BT_DBG("icid 0x%4.4x, result 0x%4.4x", icid, result);
-
-	if (result == L2CAP_MR_SUCCESS || result == L2CAP_MR_PEND)
-		l2cap_move_continue(conn, icid, result);
-	else
-		l2cap_move_fail(conn, cmd->ident, icid, result);
-
-	return 0;
-}
-
-static int l2cap_move_channel_confirm(struct l2cap_conn *conn,
-				      struct l2cap_cmd_hdr *cmd,
-				      u16 cmd_len, void *data)
-{
-	struct l2cap_move_chan_cfm *cfm = data;
-	struct l2cap_chan *chan;
-	u16 icid, result;
-
-	if (cmd_len != sizeof(*cfm))
-		return -EPROTO;
-
-	icid = le16_to_cpu(cfm->icid);
-	result = le16_to_cpu(cfm->result);
-
-	BT_DBG("icid 0x%4.4x, result 0x%4.4x", icid, result);
-
-	chan = l2cap_get_chan_by_dcid(conn, icid);
-	if (!chan) {
-		/* Spec requires a response even if the icid was not found */
-		l2cap_send_move_chan_cfm_rsp(conn, cmd->ident, icid);
-		return 0;
-	}
-
-	if (chan->move_state == L2CAP_MOVE_WAIT_CONFIRM) {
-		if (result == L2CAP_MC_CONFIRMED) {
-			chan->local_amp_id = chan->move_id;
-			if (chan->local_amp_id == AMP_ID_BREDR)
-				__release_logical_link(chan);
-		} else {
-			chan->move_id = chan->local_amp_id;
-		}
-
-		l2cap_move_done(chan);
-	}
-
-	l2cap_send_move_chan_cfm_rsp(conn, cmd->ident, icid);
-
-	l2cap_chan_unlock(chan);
-	l2cap_chan_put(chan);
-
-	return 0;
-}
-
-static inline int l2cap_move_channel_confirm_rsp(struct l2cap_conn *conn,
-						 struct l2cap_cmd_hdr *cmd,
-						 u16 cmd_len, void *data)
-{
-	struct l2cap_move_chan_cfm_rsp *rsp = data;
-	struct l2cap_chan *chan;
-	u16 icid;
-
-	if (cmd_len != sizeof(*rsp))
-		return -EPROTO;
-
-	icid = le16_to_cpu(rsp->icid);
-
-	BT_DBG("icid 0x%4.4x", icid);
-
-	chan = l2cap_get_chan_by_scid(conn, icid);
-	if (!chan)
-		return 0;
-
-	__clear_chan_timer(chan);
-
-	if (chan->move_state == L2CAP_MOVE_WAIT_CONFIRM_RSP) {
-		chan->local_amp_id = chan->move_id;
-
-		if (chan->local_amp_id == AMP_ID_BREDR && chan->hs_hchan)
-			__release_logical_link(chan);
-
-		l2cap_move_done(chan);
-	}
-
-	l2cap_chan_unlock(chan);
-	l2cap_chan_put(chan);
-
-	return 0;
-}
-
 static inline int l2cap_conn_param_update_req(struct l2cap_conn *conn,
 					      struct l2cap_cmd_hdr *cmd,
 					      u16 cmd_len, u8 *data)
@@ -5745,7 +4761,6 @@ static inline int l2cap_bredr_sig_cmd(struct l2cap_conn *conn,
 		break;
 
 	case L2CAP_CONN_RSP:
-	case L2CAP_CREATE_CHAN_RSP:
 		l2cap_connect_create_rsp(conn, cmd, cmd_len, data);
 		break;
 
@@ -5780,26 +4795,6 @@ static inline int l2cap_bredr_sig_cmd(struct l2cap_conn *conn,
 		l2cap_information_rsp(conn, cmd, cmd_len, data);
 		break;
 
-	case L2CAP_CREATE_CHAN_REQ:
-		err = l2cap_create_channel_req(conn, cmd, cmd_len, data);
-		break;
-
-	case L2CAP_MOVE_CHAN_REQ:
-		err = l2cap_move_channel_req(conn, cmd, cmd_len, data);
-		break;
-
-	case L2CAP_MOVE_CHAN_RSP:
-		l2cap_move_channel_rsp(conn, cmd, cmd_len, data);
-		break;
-
-	case L2CAP_MOVE_CHAN_CFM:
-		err = l2cap_move_channel_confirm(conn, cmd, cmd_len, data);
-		break;
-
-	case L2CAP_MOVE_CHAN_CFM_RSP:
-		l2cap_move_channel_confirm_rsp(conn, cmd, cmd_len, data);
-		break;
-
 	default:
 		BT_ERR("Unknown BR/EDR signaling command 0x%2.2x", cmd->code);
 		err = -EINVAL;
@@ -7051,8 +6046,8 @@ static int l2cap_rx_state_recv(struct l2cap_chan *chan,
 		if (control->final) {
 			clear_bit(CONN_REMOTE_BUSY, &chan->conn_state);
 
-			if (!test_and_clear_bit(CONN_REJ_ACT, &chan->conn_state) &&
-			    !__chan_is_moving(chan)) {
+			if (!test_and_clear_bit(CONN_REJ_ACT,
+						&chan->conn_state)) {
 				control->final = 0;
 				l2cap_retransmit_all(chan, control);
 			}
@@ -7245,11 +6240,7 @@ static int l2cap_finish_move(struct l2cap_chan *chan)
 	BT_DBG("chan %p", chan);
 
 	chan->rx_state = L2CAP_RX_STATE_RECV;
-
-	if (chan->hs_hcon)
-		chan->conn->mtu = chan->hs_hcon->hdev->block_mtu;
-	else
-		chan->conn->mtu = chan->conn->hcon->hdev->acl_mtu;
+	chan->conn->mtu = chan->conn->hcon->hdev->acl_mtu;
 
 	return l2cap_resegment(chan);
 }
@@ -7316,11 +6307,7 @@ static int l2cap_rx_state_wait_f(struct l2cap_chan *chan,
 	 */
 	chan->next_tx_seq = control->reqseq;
 	chan->unacked_frames = 0;
-
-	if (chan->hs_hcon)
-		chan->conn->mtu = chan->hs_hcon->hdev->block_mtu;
-	else
-		chan->conn->mtu = chan->conn->hcon->hdev->acl_mtu;
+	chan->conn->mtu = chan->conn->hcon->hdev->acl_mtu;
 
 	err = l2cap_resegment(chan);
 
@@ -7672,21 +6659,10 @@ static void l2cap_data_channel(struct l2cap_conn *conn, u16 cid,
 
 	chan = l2cap_get_chan_by_scid(conn, cid);
 	if (!chan) {
-		if (cid == L2CAP_CID_A2MP) {
-			chan = a2mp_channel_create(conn, skb);
-			if (!chan) {
-				kfree_skb(skb);
-				return;
-			}
-
-			l2cap_chan_hold(chan);
-			l2cap_chan_lock(chan);
-		} else {
-			BT_DBG("unknown cid 0x%4.4x", cid);
-			/* Drop packet and return */
-			kfree_skb(skb);
-			return;
-		}
+		BT_DBG("unknown cid 0x%4.4x", cid);
+		/* Drop packet and return */
+		kfree_skb(skb);
+		return;
 	}
 
 	BT_DBG("chan %p, len %d", chan, skb->len);
@@ -7887,10 +6863,6 @@ static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon)
 
 	conn->local_fixed_chan = L2CAP_FC_SIG_BREDR | L2CAP_FC_CONNLESS;
 
-	if (hcon->type == ACL_LINK &&
-	    hci_dev_test_flag(hcon->hdev, HCI_HS_ENABLED))
-		conn->local_fixed_chan |= L2CAP_FC_A2MP;
-
 	if (hci_dev_test_flag(hcon->hdev, HCI_LE_ENABLED) &&
 	    (bredr_sc_enabled(hcon->hdev) ||
 	     hci_dev_test_flag(hcon->hdev, HCI_FORCE_BREDR_SMP)))
@@ -8355,11 +7327,6 @@ static void l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt)
 		BT_DBG("chan %p scid 0x%4.4x state %s", chan, chan->scid,
 		       state_to_string(chan->state));
 
-		if (chan->scid == L2CAP_CID_A2MP) {
-			l2cap_chan_unlock(chan);
-			continue;
-		}
-
 		if (!status && encrypt)
 			chan->sec_level = hcon->sec_level;
 
diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c
index e50d3d102078..ee7a41d6994f 100644
--- a/net/bluetooth/l2cap_sock.c
+++ b/net/bluetooth/l2cap_sock.c
@@ -1027,23 +1027,7 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname,
 			break;
 		}
 
-		if (opt > BT_CHANNEL_POLICY_AMP_PREFERRED) {
-			err = -EINVAL;
-			break;
-		}
-
-		if (chan->mode != L2CAP_MODE_ERTM &&
-		    chan->mode != L2CAP_MODE_STREAMING) {
-			err = -EOPNOTSUPP;
-			break;
-		}
-
-		chan->chan_policy = (u8) opt;
-
-		if (sk->sk_state == BT_CONNECTED &&
-		    chan->move_role == L2CAP_MOVE_ROLE_NONE)
-			l2cap_move_start(chan);
-
+		err = -EOPNOTSUPP;
 		break;
 
 	case BT_SNDMTU:
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index 8c4493255f92..7713e2cdf9e1 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -835,8 +835,6 @@ static u32 get_supported_settings(struct hci_dev *hdev)
 
 		if (lmp_ssp_capable(hdev)) {
 			settings |= MGMT_SETTING_SSP;
-			if (IS_ENABLED(CONFIG_BT_HS))
-				settings |= MGMT_SETTING_HS;
 		}
 
 		if (lmp_sc_capable(hdev))
@@ -901,9 +899,6 @@ static u32 get_current_settings(struct hci_dev *hdev)
 	if (hci_dev_test_flag(hdev, HCI_SSP_ENABLED))
 		settings |= MGMT_SETTING_SSP;
 
-	if (hci_dev_test_flag(hdev, HCI_HS_ENABLED))
-		settings |= MGMT_SETTING_HS;
-
 	if (hci_dev_test_flag(hdev, HCI_ADVERTISING))
 		settings |= MGMT_SETTING_ADVERTISING;
 
@@ -1938,7 +1933,6 @@ static void set_ssp_complete(struct hci_dev *hdev, void *data, int err)
 
 		if (enable && hci_dev_test_and_clear_flag(hdev,
 							  HCI_SSP_ENABLED)) {
-			hci_dev_clear_flag(hdev, HCI_HS_ENABLED);
 			new_settings(hdev, NULL);
 		}
 
@@ -1951,12 +1945,6 @@ static void set_ssp_complete(struct hci_dev *hdev, void *data, int err)
 		changed = !hci_dev_test_and_set_flag(hdev, HCI_SSP_ENABLED);
 	} else {
 		changed = hci_dev_test_and_clear_flag(hdev, HCI_SSP_ENABLED);
-
-		if (!changed)
-			changed = hci_dev_test_and_clear_flag(hdev,
-							      HCI_HS_ENABLED);
-		else
-			hci_dev_clear_flag(hdev, HCI_HS_ENABLED);
 	}
 
 	mgmt_pending_foreach(MGMT_OP_SET_SSP, hdev, settings_rsp, &match);
@@ -2020,11 +2008,6 @@ static int set_ssp(struct sock *sk, struct hci_dev *hdev, void *data, u16 len)
 		} else {
 			changed = hci_dev_test_and_clear_flag(hdev,
 							      HCI_SSP_ENABLED);
-			if (!changed)
-				changed = hci_dev_test_and_clear_flag(hdev,
-								      HCI_HS_ENABLED);
-			else
-				hci_dev_clear_flag(hdev, HCI_HS_ENABLED);
 		}
 
 		err = send_settings_rsp(sk, MGMT_OP_SET_SSP, hdev);
@@ -2070,63 +2053,10 @@ failed:
 
 static int set_hs(struct sock *sk, struct hci_dev *hdev, void *data, u16 len)
 {
-	struct mgmt_mode *cp = data;
-	bool changed;
-	u8 status;
-	int err;
-
 	bt_dev_dbg(hdev, "sock %p", sk);
 
-	if (!IS_ENABLED(CONFIG_BT_HS))
-		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_HS,
+	return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_HS,
 				       MGMT_STATUS_NOT_SUPPORTED);
-
-	status = mgmt_bredr_support(hdev);
-	if (status)
-		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_HS, status);
-
-	if (!lmp_ssp_capable(hdev))
-		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_HS,
-				       MGMT_STATUS_NOT_SUPPORTED);
-
-	if (!hci_dev_test_flag(hdev, HCI_SSP_ENABLED))
-		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_HS,
-				       MGMT_STATUS_REJECTED);
-
-	if (cp->val != 0x00 && cp->val != 0x01)
-		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_HS,
-				       MGMT_STATUS_INVALID_PARAMS);
-
-	hci_dev_lock(hdev);
-
-	if (pending_find(MGMT_OP_SET_SSP, hdev)) {
-		err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_HS,
-				      MGMT_STATUS_BUSY);
-		goto unlock;
-	}
-
-	if (cp->val) {
-		changed = !hci_dev_test_and_set_flag(hdev, HCI_HS_ENABLED);
-	} else {
-		if (hdev_is_powered(hdev)) {
-			err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_HS,
-					      MGMT_STATUS_REJECTED);
-			goto unlock;
-		}
-
-		changed = hci_dev_test_and_clear_flag(hdev, HCI_HS_ENABLED);
-	}
-
-	err = send_settings_rsp(sk, MGMT_OP_SET_HS, hdev);
-	if (err < 0)
-		goto unlock;
-
-	if (changed)
-		err = new_settings(hdev, sk);
-
-unlock:
-	hci_dev_unlock(hdev);
-	return err;
 }
 
 static void set_le_complete(struct hci_dev *hdev, void *data, int err)
@@ -6774,7 +6704,6 @@ static int set_bredr(struct sock *sk, struct hci_dev *hdev, void *data, u16 len)
 			hci_dev_clear_flag(hdev, HCI_SSP_ENABLED);
 			hci_dev_clear_flag(hdev, HCI_LINK_SECURITY);
 			hci_dev_clear_flag(hdev, HCI_FAST_CONNECTABLE);
-			hci_dev_clear_flag(hdev, HCI_HS_ENABLED);
 		}
 
 		hci_dev_change_flag(hdev, HCI_BREDR_ENABLED);
-- 
cgit v1.2.3


From 45340097ce6ea7e875674a5a7d24c95ecbc93ef9 Mon Sep 17 00:00:00 2001
From: Jonas Dreßler <verdre@v0yd.nl>
Date: Tue, 6 Feb 2024 12:08:13 +0100
Subject: Bluetooth: hci_conn: Only do ACL connections sequentially
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Pretty much all bluetooth chipsets only support paging a single device at
a time, and if they don't reject a secondary "Create Connection" request
while another is still ongoing, they'll most likely serialize those
requests in the firware.

With commit 4c67bc74f016 ("[Bluetooth] Support concurrent connect
requests") we started adding some serialization of our own in case the
adapter returns "Command Disallowed" HCI error.

This commit was using the BT_CONNECT2 state for the serialization, this
state is also used for a few more things (most notably to indicate we're
waiting for an inquiry to cancel) and therefore a bit unreliable. Also
not all BT firwares would respond with "Command Disallowed" on too many
connection requests, some will also respond with "Hardware Failure"
(BCM4378), and others will error out later and send a "Connect Complete"
event with error "Rejected Limited Resources" (Marvell 88W8897).

We can clean things up a bit and also make the serialization more reliable
by using our hci_sync machinery to always do "Create Connection" requests
in a sequential manner.

This is very similar to what we're already doing for establishing LE
connections, and it works well there.

Note that this causes a test failure in mgmt-tester (test "Pair Device
- Power off 1") because the hci_abort_conn_sync() changes the error we
return on timeout of the "Create Connection". We'll fix this on the
mgmt-tester side by adjusting the expected error for the test.

Signed-off-by: Jonas Dreßler <verdre@v0yd.nl>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci.h      |  1 +
 include/net/bluetooth/hci_sync.h |  3 ++
 net/bluetooth/hci_conn.c         | 69 ++++++---------------------------------
 net/bluetooth/hci_sync.c         | 70 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 83 insertions(+), 60 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index aa6c69053d7c..08cb5cb249a4 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -437,6 +437,7 @@ enum {
 #define HCI_NCMD_TIMEOUT	msecs_to_jiffies(4000)	/* 4 seconds */
 #define HCI_ACL_TX_TIMEOUT	msecs_to_jiffies(45000)	/* 45 seconds */
 #define HCI_AUTO_OFF_TIMEOUT	msecs_to_jiffies(2000)	/* 2 seconds */
+#define HCI_ACL_CONN_TIMEOUT	msecs_to_jiffies(20000)	/* 20 seconds */
 #define HCI_LE_CONN_TIMEOUT	msecs_to_jiffies(20000)	/* 20 seconds */
 #define HCI_LE_AUTOCONN_TIMEOUT	msecs_to_jiffies(4000)	/* 4 seconds */
 
diff --git a/include/net/bluetooth/hci_sync.h b/include/net/bluetooth/hci_sync.h
index e2582c242544..824660f8f30d 100644
--- a/include/net/bluetooth/hci_sync.h
+++ b/include/net/bluetooth/hci_sync.h
@@ -138,3 +138,6 @@ int hci_le_terminate_big_sync(struct hci_dev *hdev, u8 handle, u8 reason);
 int hci_le_big_terminate_sync(struct hci_dev *hdev, u8 handle);
 
 int hci_le_pa_terminate_sync(struct hci_dev *hdev, u16 handle);
+
+int hci_acl_create_connection_sync(struct hci_dev *hdev,
+				   struct hci_conn *conn);
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index fc4d72f83ac2..816be7667a8c 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -178,64 +178,6 @@ static void hci_conn_cleanup(struct hci_conn *conn)
 	hci_dev_put(hdev);
 }
 
-static void hci_acl_create_connection(struct hci_conn *conn)
-{
-	struct hci_dev *hdev = conn->hdev;
-	struct inquiry_entry *ie;
-	struct hci_cp_create_conn cp;
-
-	BT_DBG("hcon %p", conn);
-
-	/* Many controllers disallow HCI Create Connection while it is doing
-	 * HCI Inquiry. So we cancel the Inquiry first before issuing HCI Create
-	 * Connection. This may cause the MGMT discovering state to become false
-	 * without user space's request but it is okay since the MGMT Discovery
-	 * APIs do not promise that discovery should be done forever. Instead,
-	 * the user space monitors the status of MGMT discovering and it may
-	 * request for discovery again when this flag becomes false.
-	 */
-	if (test_bit(HCI_INQUIRY, &hdev->flags)) {
-		/* Put this connection to "pending" state so that it will be
-		 * executed after the inquiry cancel command complete event.
-		 */
-		conn->state = BT_CONNECT2;
-		hci_send_cmd(hdev, HCI_OP_INQUIRY_CANCEL, 0, NULL);
-		return;
-	}
-
-	conn->state = BT_CONNECT;
-	conn->out = true;
-	conn->role = HCI_ROLE_MASTER;
-
-	conn->attempt++;
-
-	conn->link_policy = hdev->link_policy;
-
-	memset(&cp, 0, sizeof(cp));
-	bacpy(&cp.bdaddr, &conn->dst);
-	cp.pscan_rep_mode = 0x02;
-
-	ie = hci_inquiry_cache_lookup(hdev, &conn->dst);
-	if (ie) {
-		if (inquiry_entry_age(ie) <= INQUIRY_ENTRY_AGE_MAX) {
-			cp.pscan_rep_mode = ie->data.pscan_rep_mode;
-			cp.pscan_mode     = ie->data.pscan_mode;
-			cp.clock_offset   = ie->data.clock_offset |
-					    cpu_to_le16(0x8000);
-		}
-
-		memcpy(conn->dev_class, ie->data.dev_class, 3);
-	}
-
-	cp.pkt_type = cpu_to_le16(conn->pkt_type);
-	if (lmp_rswitch_capable(hdev) && !(hdev->link_mode & HCI_LM_MASTER))
-		cp.role_switch = 0x01;
-	else
-		cp.role_switch = 0x00;
-
-	hci_send_cmd(hdev, HCI_OP_CREATE_CONN, sizeof(cp), &cp);
-}
-
 int hci_disconnect(struct hci_conn *conn, __u8 reason)
 {
 	BT_DBG("hcon %p", conn);
@@ -1696,10 +1638,17 @@ struct hci_conn *hci_connect_acl(struct hci_dev *hdev, bdaddr_t *dst,
 
 	acl->conn_reason = conn_reason;
 	if (acl->state == BT_OPEN || acl->state == BT_CLOSED) {
+		int err;
+
 		acl->sec_level = BT_SECURITY_LOW;
 		acl->pending_sec_level = sec_level;
 		acl->auth_type = auth_type;
-		hci_acl_create_connection(acl);
+
+		err = hci_acl_create_connection_sync(hdev, acl);
+		if (err) {
+			hci_conn_del(acl);
+			return ERR_PTR(err);
+		}
 	}
 
 	return acl;
@@ -2654,7 +2603,7 @@ void hci_conn_check_pending(struct hci_dev *hdev)
 
 	conn = hci_conn_hash_lookup_state(hdev, ACL_LINK, BT_CONNECT2);
 	if (conn)
-		hci_acl_create_connection(conn);
+		hci_acl_create_connection_sync(hdev, conn);
 
 	hci_dev_unlock(hdev);
 }
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index 1122296ce3fa..617407b81ffe 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -6492,3 +6492,73 @@ int hci_update_adv_data(struct hci_dev *hdev, u8 instance)
 	return hci_cmd_sync_queue(hdev, _update_adv_data_sync,
 				  UINT_PTR(instance), NULL);
 }
+
+static int __hci_acl_create_connection_sync(struct hci_dev *hdev, void *data)
+{
+	struct hci_conn *conn = data;
+	struct inquiry_entry *ie;
+	struct hci_cp_create_conn cp;
+	int err;
+
+	/* Many controllers disallow HCI Create Connection while it is doing
+	 * HCI Inquiry. So we cancel the Inquiry first before issuing HCI Create
+	 * Connection. This may cause the MGMT discovering state to become false
+	 * without user space's request but it is okay since the MGMT Discovery
+	 * APIs do not promise that discovery should be done forever. Instead,
+	 * the user space monitors the status of MGMT discovering and it may
+	 * request for discovery again when this flag becomes false.
+	 */
+	if (test_bit(HCI_INQUIRY, &hdev->flags)) {
+		err = __hci_cmd_sync_status(hdev, HCI_OP_INQUIRY_CANCEL, 0,
+					    NULL, HCI_CMD_TIMEOUT);
+		if (err)
+			bt_dev_warn(hdev, "Failed to cancel inquiry %d", err);
+	}
+
+	conn->state = BT_CONNECT;
+	conn->out = true;
+	conn->role = HCI_ROLE_MASTER;
+
+	conn->attempt++;
+
+	conn->link_policy = hdev->link_policy;
+
+	memset(&cp, 0, sizeof(cp));
+	bacpy(&cp.bdaddr, &conn->dst);
+	cp.pscan_rep_mode = 0x02;
+
+	ie = hci_inquiry_cache_lookup(hdev, &conn->dst);
+	if (ie) {
+		if (inquiry_entry_age(ie) <= INQUIRY_ENTRY_AGE_MAX) {
+			cp.pscan_rep_mode = ie->data.pscan_rep_mode;
+			cp.pscan_mode     = ie->data.pscan_mode;
+			cp.clock_offset   = ie->data.clock_offset |
+					    cpu_to_le16(0x8000);
+		}
+
+		memcpy(conn->dev_class, ie->data.dev_class, 3);
+	}
+
+	cp.pkt_type = cpu_to_le16(conn->pkt_type);
+	if (lmp_rswitch_capable(hdev) && !(hdev->link_mode & HCI_LM_MASTER))
+		cp.role_switch = 0x01;
+	else
+		cp.role_switch = 0x00;
+
+	err = __hci_cmd_sync_status_sk(hdev, HCI_OP_CREATE_CONN,
+				       sizeof(cp), &cp,
+				       HCI_EV_CONN_COMPLETE,
+				       HCI_ACL_CONN_TIMEOUT, NULL);
+
+	if (err == -ETIMEDOUT)
+		hci_abort_conn_sync(hdev, conn, HCI_ERROR_LOCAL_HOST_TERM);
+
+	return err;
+}
+
+int hci_acl_create_connection_sync(struct hci_dev *hdev,
+				   struct hci_conn *conn)
+{
+	return hci_cmd_sync_queue(hdev, __hci_acl_create_connection_sync,
+				  conn, NULL);
+}
-- 
cgit v1.2.3


From 4aa42119d971603dc9e4d8cf4f53d5fcf082ea7d Mon Sep 17 00:00:00 2001
From: Jonas Dreßler <verdre@v0yd.nl>
Date: Tue, 6 Feb 2024 12:08:14 +0100
Subject: Bluetooth: Remove pending ACL connection attempts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With the last commit we moved to using the hci_sync queue for "Create
Connection" requests, removing the need for retrying the paging after
finished/failed "Create Connection" requests and after the end of
inquiries.

hci_conn_check_pending() was used to trigger this retry, we can remove it
now.

Note that we can also remove the special handling for COMMAND_DISALLOWED
errors in the completion handler of "Create Connection", because "Create
Connection" requests are now always serialized.

This is somewhat reverting commit 4c67bc74f016 ("[Bluetooth] Support
concurrent connect requests").

With this, the BT_CONNECT2 state of ACL hci_conn objects should now be
back to meaning only one thing: That we received a "Connection Request"
from another device (see hci_conn_request_evt), but the response to that
is going to be deferred.

Signed-off-by: Jonas Dreßler <verdre@v0yd.nl>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_core.h |  1 -
 net/bluetooth/hci_conn.c         | 16 ----------------
 net/bluetooth/hci_event.c        | 21 ++++-----------------
 3 files changed, 4 insertions(+), 34 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 8f8dd9173714..34aa9d0290fe 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -1480,7 +1480,6 @@ struct hci_conn *hci_conn_add_unset(struct hci_dev *hdev, int type,
 				    bdaddr_t *dst, u8 role);
 void hci_conn_del(struct hci_conn *conn);
 void hci_conn_hash_flush(struct hci_dev *hdev);
-void hci_conn_check_pending(struct hci_dev *hdev);
 
 struct hci_chan *hci_chan_create(struct hci_conn *conn);
 void hci_chan_del(struct hci_chan *chan);
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index 816be7667a8c..a4beed8587eb 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -2592,22 +2592,6 @@ void hci_conn_hash_flush(struct hci_dev *hdev)
 	}
 }
 
-/* Check pending connect attempts */
-void hci_conn_check_pending(struct hci_dev *hdev)
-{
-	struct hci_conn *conn;
-
-	BT_DBG("hdev %s", hdev->name);
-
-	hci_dev_lock(hdev);
-
-	conn = hci_conn_hash_lookup_state(hdev, ACL_LINK, BT_CONNECT2);
-	if (conn)
-		hci_acl_create_connection_sync(hdev, conn);
-
-	hci_dev_unlock(hdev);
-}
-
 static u32 get_link_mode(struct hci_conn *conn)
 {
 	u32 link_mode = 0;
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 271c00792801..e7887bae334b 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -118,8 +118,6 @@ static u8 hci_cc_inquiry_cancel(struct hci_dev *hdev, void *data,
 		hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
 	hci_dev_unlock(hdev);
 
-	hci_conn_check_pending(hdev);
-
 	return rp->status;
 }
 
@@ -150,8 +148,6 @@ static u8 hci_cc_exit_periodic_inq(struct hci_dev *hdev, void *data,
 
 	hci_dev_clear_flag(hdev, HCI_PERIODIC_INQ);
 
-	hci_conn_check_pending(hdev);
-
 	return rp->status;
 }
 
@@ -2312,10 +2308,8 @@ static void hci_cs_inquiry(struct hci_dev *hdev, __u8 status)
 {
 	bt_dev_dbg(hdev, "status 0x%2.2x", status);
 
-	if (status) {
-		hci_conn_check_pending(hdev);
+	if (status)
 		return;
-	}
 
 	if (hci_sent_cmd_data(hdev, HCI_OP_INQUIRY))
 		set_bit(HCI_INQUIRY, &hdev->flags);
@@ -2340,12 +2334,9 @@ static void hci_cs_create_conn(struct hci_dev *hdev, __u8 status)
 
 	if (status) {
 		if (conn && conn->state == BT_CONNECT) {
-			if (status != HCI_ERROR_COMMAND_DISALLOWED || conn->attempt > 2) {
-				conn->state = BT_CLOSED;
-				hci_connect_cfm(conn, status);
-				hci_conn_del(conn);
-			} else
-				conn->state = BT_CONNECT2;
+			conn->state = BT_CLOSED;
+			hci_connect_cfm(conn, status);
+			hci_conn_del(conn);
 		}
 	} else {
 		if (!conn) {
@@ -3035,8 +3026,6 @@ static void hci_inquiry_complete_evt(struct hci_dev *hdev, void *data,
 
 	bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
 
-	hci_conn_check_pending(hdev);
-
 	if (!test_and_clear_bit(HCI_INQUIRY, &hdev->flags))
 		return;
 
@@ -3258,8 +3247,6 @@ done:
 
 unlock:
 	hci_dev_unlock(hdev);
-
-	hci_conn_check_pending(hdev);
 }
 
 static void hci_reject_conn(struct hci_dev *hdev, bdaddr_t *bdaddr)
-- 
cgit v1.2.3


From bf98feea5b65ced367a871cf35fc044dedbcfb85 Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Wed, 7 Feb 2024 15:26:20 -0500
Subject: Bluetooth: hci_conn: Always use sk_timeo as conn_timeout

This aligns the use socket sk_timeo as conn_timeout when initiating a
connection and then use it when scheduling the resulting HCI command,
that way the command is actually aborted synchronously thus not
blocking commands generated by hci_abort_conn_sync to inform the
controller the connection is to be aborted.

Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_core.h |  5 +++--
 include/net/bluetooth/l2cap.h    |  2 +-
 net/bluetooth/6lowpan.c          |  2 +-
 net/bluetooth/hci_conn.c         |  8 +++++---
 net/bluetooth/hci_sync.c         |  2 +-
 net/bluetooth/l2cap_core.c       | 10 ++++------
 net/bluetooth/l2cap_sock.c       |  3 ++-
 net/bluetooth/mgmt.c             |  3 ++-
 net/bluetooth/sco.c              |  3 ++-
 9 files changed, 21 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 34aa9d0290fe..2bdea85b7c44 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -1495,9 +1495,10 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst,
 				u16 conn_timeout, u8 role);
 struct hci_conn *hci_connect_acl(struct hci_dev *hdev, bdaddr_t *dst,
 				 u8 sec_level, u8 auth_type,
-				 enum conn_reasons conn_reason);
+				 enum conn_reasons conn_reason, u16 timeout);
 struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst,
-				 __u16 setting, struct bt_codec *codec);
+				 __u16 setting, struct bt_codec *codec,
+				 u16 timeout);
 struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst,
 			      __u8 dst_type, struct bt_iso_qos *qos);
 struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst,
diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h
index 92d7197f9a56..a4278aa618ab 100644
--- a/include/net/bluetooth/l2cap.h
+++ b/include/net/bluetooth/l2cap.h
@@ -939,7 +939,7 @@ int l2cap_add_scid(struct l2cap_chan *chan,  __u16 scid);
 struct l2cap_chan *l2cap_chan_create(void);
 void l2cap_chan_close(struct l2cap_chan *chan, int reason);
 int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid,
-		       bdaddr_t *dst, u8 dst_type);
+		       bdaddr_t *dst, u8 dst_type, u16 timeout);
 int l2cap_chan_reconfigure(struct l2cap_chan *chan, __u16 mtu);
 int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len);
 void l2cap_chan_busy(struct l2cap_chan *chan, int busy);
diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c
index 4eb1b3ced0d2..715cbafbf663 100644
--- a/net/bluetooth/6lowpan.c
+++ b/net/bluetooth/6lowpan.c
@@ -892,7 +892,7 @@ static int bt_6lowpan_connect(bdaddr_t *addr, u8 dst_type)
 	chan->ops = &bt_6lowpan_chan_ops;
 
 	err = l2cap_chan_connect(chan, cpu_to_le16(L2CAP_PSM_IPSP), 0,
-				 addr, dst_type);
+				 addr, dst_type, L2CAP_CONN_TIMEOUT);
 
 	BT_DBG("chan %p err %d", chan, err);
 	if (err < 0)
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index a4beed8587eb..8164502234c5 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -1607,7 +1607,7 @@ done:
 
 struct hci_conn *hci_connect_acl(struct hci_dev *hdev, bdaddr_t *dst,
 				 u8 sec_level, u8 auth_type,
-				 enum conn_reasons conn_reason)
+				 enum conn_reasons conn_reason, u16 timeout)
 {
 	struct hci_conn *acl;
 
@@ -1643,6 +1643,7 @@ struct hci_conn *hci_connect_acl(struct hci_dev *hdev, bdaddr_t *dst,
 		acl->sec_level = BT_SECURITY_LOW;
 		acl->pending_sec_level = sec_level;
 		acl->auth_type = auth_type;
+		acl->conn_timeout = timeout;
 
 		err = hci_acl_create_connection_sync(hdev, acl);
 		if (err) {
@@ -1683,14 +1684,15 @@ static struct hci_link *hci_conn_link(struct hci_conn *parent,
 }
 
 struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst,
-				 __u16 setting, struct bt_codec *codec)
+				 __u16 setting, struct bt_codec *codec,
+				 u16 timeout)
 {
 	struct hci_conn *acl;
 	struct hci_conn *sco;
 	struct hci_link *link;
 
 	acl = hci_connect_acl(hdev, dst, BT_SECURITY_LOW, HCI_AT_NO_BONDING,
-			      CONN_REASON_SCO_CONNECT);
+			      CONN_REASON_SCO_CONNECT, timeout);
 	if (IS_ERR(acl))
 		return acl;
 
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index 617407b81ffe..788a889210d8 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -6548,7 +6548,7 @@ static int __hci_acl_create_connection_sync(struct hci_dev *hdev, void *data)
 	err = __hci_cmd_sync_status_sk(hdev, HCI_OP_CREATE_CONN,
 				       sizeof(cp), &cp,
 				       HCI_EV_CONN_COMPLETE,
-				       HCI_ACL_CONN_TIMEOUT, NULL);
+				       conn->conn_timeout, NULL);
 
 	if (err == -ETIMEDOUT)
 		hci_abort_conn_sync(hdev, conn, HCI_ERROR_LOCAL_HOST_TERM);
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index ab5a9d42fae7..467b242d8be0 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -6925,7 +6925,7 @@ static void l2cap_chan_by_pid(struct l2cap_chan *chan, void *data)
 }
 
 int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid,
-		       bdaddr_t *dst, u8 dst_type)
+		       bdaddr_t *dst, u8 dst_type, u16 timeout)
 {
 	struct l2cap_conn *conn;
 	struct hci_conn *hcon;
@@ -7018,19 +7018,17 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid,
 
 		if (hci_dev_test_flag(hdev, HCI_ADVERTISING))
 			hcon = hci_connect_le(hdev, dst, dst_type, false,
-					      chan->sec_level,
-					      HCI_LE_CONN_TIMEOUT,
+					      chan->sec_level, timeout,
 					      HCI_ROLE_SLAVE);
 		else
 			hcon = hci_connect_le_scan(hdev, dst, dst_type,
-						   chan->sec_level,
-						   HCI_LE_CONN_TIMEOUT,
+						   chan->sec_level, timeout,
 						   CONN_REASON_L2CAP_CHAN);
 
 	} else {
 		u8 auth_type = l2cap_get_auth_type(chan);
 		hcon = hci_connect_acl(hdev, dst, chan->sec_level, auth_type,
-				       CONN_REASON_L2CAP_CHAN);
+				       CONN_REASON_L2CAP_CHAN, timeout);
 	}
 
 	if (IS_ERR(hcon)) {
diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c
index ee7a41d6994f..4287aa6cc988 100644
--- a/net/bluetooth/l2cap_sock.c
+++ b/net/bluetooth/l2cap_sock.c
@@ -254,7 +254,8 @@ static int l2cap_sock_connect(struct socket *sock, struct sockaddr *addr,
 		chan->mode = L2CAP_MODE_LE_FLOWCTL;
 
 	err = l2cap_chan_connect(chan, la.l2_psm, __le16_to_cpu(la.l2_cid),
-				 &la.l2_bdaddr, la.l2_bdaddr_type);
+				 &la.l2_bdaddr, la.l2_bdaddr_type,
+				 sk->sk_sndtimeo);
 	if (err)
 		return err;
 
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index 064a67157d43..78ab562807d0 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -3444,7 +3444,8 @@ static int pair_device(struct sock *sk, struct hci_dev *hdev, void *data,
 
 	if (cp->addr.type == BDADDR_BREDR) {
 		conn = hci_connect_acl(hdev, &cp->addr.bdaddr, sec_level,
-				       auth_type, CONN_REASON_PAIR_DEVICE);
+				       auth_type, CONN_REASON_PAIR_DEVICE,
+				       HCI_ACL_CONN_TIMEOUT);
 	} else {
 		u8 addr_type = le_addr_type(cp->addr.type);
 		struct hci_conn_params *p;
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index c736186aba26..43daf965a01e 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -264,7 +264,8 @@ static int sco_connect(struct sock *sk)
 	}
 
 	hcon = hci_connect_sco(hdev, type, &sco_pi(sk)->dst,
-			       sco_pi(sk)->setting, &sco_pi(sk)->codec);
+			       sco_pi(sk)->setting, &sco_pi(sk)->codec,
+			       sk->sk_sndtimeo);
 	if (IS_ERR(hcon)) {
 		err = PTR_ERR(hcon);
 		goto unlock;
-- 
cgit v1.2.3


From 5f641f03abccddd1a37233ff1b8e774b9ff1f4e8 Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Fri, 9 Feb 2024 09:08:06 -0500
Subject: Bluetooth: hci_conn: Fix UAF Write in
 __hci_acl_create_connection_sync

This fixes the UAF on __hci_acl_create_connection_sync caused by
connection abortion, it uses the same logic as to LE_LINK which uses
hci_cmd_sync_cancel to prevent the callback to run if the connection is
abort prematurely.

Reported-by: syzbot+3f0a39be7a2035700868@syzkaller.appspotmail.com
Fixes: 45340097ce6e ("Bluetooth: hci_conn: Only do ACL connections sequentially")
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_sync.h |  3 +--
 net/bluetooth/hci_conn.c         |  3 ++-
 net/bluetooth/hci_sync.c         | 16 ++++++++++------
 3 files changed, 13 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci_sync.h b/include/net/bluetooth/hci_sync.h
index 824660f8f30d..ed334c253ebc 100644
--- a/include/net/bluetooth/hci_sync.h
+++ b/include/net/bluetooth/hci_sync.h
@@ -139,5 +139,4 @@ int hci_le_big_terminate_sync(struct hci_dev *hdev, u8 handle);
 
 int hci_le_pa_terminate_sync(struct hci_dev *hdev, u16 handle);
 
-int hci_acl_create_connection_sync(struct hci_dev *hdev,
-				   struct hci_conn *conn);
+int hci_connect_acl_sync(struct hci_dev *hdev, struct hci_conn *conn);
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index 8164502234c5..587eb27f374c 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -1645,7 +1645,7 @@ struct hci_conn *hci_connect_acl(struct hci_dev *hdev, bdaddr_t *dst,
 		acl->auth_type = auth_type;
 		acl->conn_timeout = timeout;
 
-		err = hci_acl_create_connection_sync(hdev, acl);
+		err = hci_connect_acl_sync(hdev, acl);
 		if (err) {
 			hci_conn_del(acl);
 			return ERR_PTR(err);
@@ -2942,6 +2942,7 @@ int hci_abort_conn(struct hci_conn *conn, u8 reason)
 	 */
 	if (conn->state == BT_CONNECT && hdev->req_status == HCI_REQ_PEND) {
 		switch (hci_skb_event(hdev->sent_cmd)) {
+		case HCI_EV_CONN_COMPLETE:
 		case HCI_EV_LE_CONN_COMPLETE:
 		case HCI_EV_LE_ENHANCED_CONN_COMPLETE:
 		case HCI_EVT_LE_CIS_ESTABLISHED:
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index 788a889210d8..e1fdcb3c2706 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -6493,13 +6493,18 @@ int hci_update_adv_data(struct hci_dev *hdev, u8 instance)
 				  UINT_PTR(instance), NULL);
 }
 
-static int __hci_acl_create_connection_sync(struct hci_dev *hdev, void *data)
+static int hci_acl_create_conn_sync(struct hci_dev *hdev, void *data)
 {
-	struct hci_conn *conn = data;
+	struct hci_conn *conn;
+	u16 handle = PTR_UINT(data);
 	struct inquiry_entry *ie;
 	struct hci_cp_create_conn cp;
 	int err;
 
+	conn = hci_conn_hash_lookup_handle(hdev, handle);
+	if (!conn)
+		return 0;
+
 	/* Many controllers disallow HCI Create Connection while it is doing
 	 * HCI Inquiry. So we cancel the Inquiry first before issuing HCI Create
 	 * Connection. This may cause the MGMT discovering state to become false
@@ -6556,9 +6561,8 @@ static int __hci_acl_create_connection_sync(struct hci_dev *hdev, void *data)
 	return err;
 }
 
-int hci_acl_create_connection_sync(struct hci_dev *hdev,
-				   struct hci_conn *conn)
+int hci_connect_acl_sync(struct hci_dev *hdev, struct hci_conn *conn)
 {
-	return hci_cmd_sync_queue(hdev, __hci_acl_create_connection_sync,
-				  conn, NULL);
+	return hci_cmd_sync_queue(hdev, hci_acl_create_conn_sync,
+				  UINT_PTR(conn->handle), NULL);
 }
-- 
cgit v1.2.3


From 505ea2b295929e7be2b4e1bc86ee31cb7862fb01 Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Wed, 9 Aug 2023 13:43:53 -0700
Subject: Bluetooth: hci_sync: Add helper functions to manipulate cmd_sync
 queue

This adds functions to queue, dequeue and lookup into the cmd_sync
list.

Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_sync.h |  12 ++++
 net/bluetooth/hci_sync.c         | 132 ++++++++++++++++++++++++++++++++++++---
 2 files changed, 136 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci_sync.h b/include/net/bluetooth/hci_sync.h
index ed334c253ebc..4ff4aa68ee19 100644
--- a/include/net/bluetooth/hci_sync.h
+++ b/include/net/bluetooth/hci_sync.h
@@ -48,6 +48,18 @@ int hci_cmd_sync_submit(struct hci_dev *hdev, hci_cmd_sync_work_func_t func,
 			void *data, hci_cmd_sync_work_destroy_t destroy);
 int hci_cmd_sync_queue(struct hci_dev *hdev, hci_cmd_sync_work_func_t func,
 		       void *data, hci_cmd_sync_work_destroy_t destroy);
+struct hci_cmd_sync_work_entry *
+hci_cmd_sync_lookup_entry(struct hci_dev *hdev, hci_cmd_sync_work_func_t func,
+			  void *data, hci_cmd_sync_work_destroy_t destroy);
+int hci_cmd_sync_queue_once(struct hci_dev *hdev, hci_cmd_sync_work_func_t func,
+			    void *data, hci_cmd_sync_work_destroy_t destroy);
+void hci_cmd_sync_cancel_entry(struct hci_dev *hdev,
+			       struct hci_cmd_sync_work_entry *entry);
+bool hci_cmd_sync_dequeue(struct hci_dev *hdev, hci_cmd_sync_work_func_t func,
+			  void *data, hci_cmd_sync_work_destroy_t destroy);
+bool hci_cmd_sync_dequeue_once(struct hci_dev *hdev,
+			      hci_cmd_sync_work_func_t func, void *data,
+			      hci_cmd_sync_work_destroy_t destroy);
 
 int hci_update_eir_sync(struct hci_dev *hdev);
 int hci_update_class_sync(struct hci_dev *hdev);
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index e1fdcb3c2706..5b314bf844f8 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -566,6 +566,17 @@ void hci_cmd_sync_init(struct hci_dev *hdev)
 	INIT_DELAYED_WORK(&hdev->adv_instance_expire, adv_timeout_expire);
 }
 
+static void _hci_cmd_sync_cancel_entry(struct hci_dev *hdev,
+				       struct hci_cmd_sync_work_entry *entry,
+				       int err)
+{
+	if (entry->destroy)
+		entry->destroy(hdev, entry->data, err);
+
+	list_del(&entry->list);
+	kfree(entry);
+}
+
 void hci_cmd_sync_clear(struct hci_dev *hdev)
 {
 	struct hci_cmd_sync_work_entry *entry, *tmp;
@@ -574,13 +585,8 @@ void hci_cmd_sync_clear(struct hci_dev *hdev)
 	cancel_work_sync(&hdev->reenable_adv_work);
 
 	mutex_lock(&hdev->cmd_sync_work_lock);
-	list_for_each_entry_safe(entry, tmp, &hdev->cmd_sync_work_list, list) {
-		if (entry->destroy)
-			entry->destroy(hdev, entry->data, -ECANCELED);
-
-		list_del(&entry->list);
-		kfree(entry);
-	}
+	list_for_each_entry_safe(entry, tmp, &hdev->cmd_sync_work_list, list)
+		_hci_cmd_sync_cancel_entry(hdev, entry, -ECANCELED);
 	mutex_unlock(&hdev->cmd_sync_work_lock);
 }
 
@@ -669,6 +675,115 @@ int hci_cmd_sync_queue(struct hci_dev *hdev, hci_cmd_sync_work_func_t func,
 }
 EXPORT_SYMBOL(hci_cmd_sync_queue);
 
+static struct hci_cmd_sync_work_entry *
+_hci_cmd_sync_lookup_entry(struct hci_dev *hdev, hci_cmd_sync_work_func_t func,
+			   void *data, hci_cmd_sync_work_destroy_t destroy)
+{
+	struct hci_cmd_sync_work_entry *entry, *tmp;
+
+	list_for_each_entry_safe(entry, tmp, &hdev->cmd_sync_work_list, list) {
+		if (func && entry->func != func)
+			continue;
+
+		if (data && entry->data != data)
+			continue;
+
+		if (destroy && entry->destroy != destroy)
+			continue;
+
+		return entry;
+	}
+
+	return NULL;
+}
+
+/* Queue HCI command entry once:
+ *
+ * - Lookup if an entry already exist and only if it doesn't creates a new entry
+ *   and queue it.
+ */
+int hci_cmd_sync_queue_once(struct hci_dev *hdev, hci_cmd_sync_work_func_t func,
+			    void *data, hci_cmd_sync_work_destroy_t destroy)
+{
+	if (hci_cmd_sync_lookup_entry(hdev, func, data, destroy))
+		return 0;
+
+	return hci_cmd_sync_queue(hdev, func, data, destroy);
+}
+EXPORT_SYMBOL(hci_cmd_sync_queue_once);
+
+/* Lookup HCI command entry:
+ *
+ * - Return first entry that matches by function callback or data or
+ *   destroy callback.
+ */
+struct hci_cmd_sync_work_entry *
+hci_cmd_sync_lookup_entry(struct hci_dev *hdev, hci_cmd_sync_work_func_t func,
+			  void *data, hci_cmd_sync_work_destroy_t destroy)
+{
+	struct hci_cmd_sync_work_entry *entry;
+
+	mutex_lock(&hdev->cmd_sync_work_lock);
+	entry = _hci_cmd_sync_lookup_entry(hdev, func, data, destroy);
+	mutex_unlock(&hdev->cmd_sync_work_lock);
+
+	return entry;
+}
+EXPORT_SYMBOL(hci_cmd_sync_lookup_entry);
+
+/* Cancel HCI command entry */
+void hci_cmd_sync_cancel_entry(struct hci_dev *hdev,
+			       struct hci_cmd_sync_work_entry *entry)
+{
+	mutex_lock(&hdev->cmd_sync_work_lock);
+	_hci_cmd_sync_cancel_entry(hdev, entry, -ECANCELED);
+	mutex_unlock(&hdev->cmd_sync_work_lock);
+}
+EXPORT_SYMBOL(hci_cmd_sync_cancel_entry);
+
+/* Dequeue one HCI command entry:
+ *
+ * - Lookup and cancel first entry that matches.
+ */
+bool hci_cmd_sync_dequeue_once(struct hci_dev *hdev,
+			       hci_cmd_sync_work_func_t func,
+			       void *data, hci_cmd_sync_work_destroy_t destroy)
+{
+	struct hci_cmd_sync_work_entry *entry;
+
+	entry = hci_cmd_sync_lookup_entry(hdev, func, data, destroy);
+	if (!entry)
+		return false;
+
+	hci_cmd_sync_cancel_entry(hdev, entry);
+
+	return true;
+}
+EXPORT_SYMBOL(hci_cmd_sync_dequeue_once);
+
+/* Dequeue HCI command entry:
+ *
+ * - Lookup and cancel any entry that matches by function callback or data or
+ *   destroy callback.
+ */
+bool hci_cmd_sync_dequeue(struct hci_dev *hdev, hci_cmd_sync_work_func_t func,
+			  void *data, hci_cmd_sync_work_destroy_t destroy)
+{
+	struct hci_cmd_sync_work_entry *entry;
+	bool ret = false;
+
+	mutex_lock(&hdev->cmd_sync_work_lock);
+	while ((entry = _hci_cmd_sync_lookup_entry(hdev, func, data,
+						   destroy))) {
+		_hci_cmd_sync_cancel_entry(hdev, entry, -ECANCELED);
+		ret = true;
+	}
+	mutex_unlock(&hdev->cmd_sync_work_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL(hci_cmd_sync_dequeue);
+
 int hci_update_eir_sync(struct hci_dev *hdev)
 {
 	struct hci_cp_write_eir cp;
@@ -2881,7 +2996,8 @@ int hci_update_passive_scan(struct hci_dev *hdev)
 	    hci_dev_test_flag(hdev, HCI_UNREGISTER))
 		return 0;
 
-	return hci_cmd_sync_queue(hdev, update_passive_scan_sync, NULL, NULL);
+	return hci_cmd_sync_queue_once(hdev, update_passive_scan_sync, NULL,
+				       NULL);
 }
 
 int hci_write_sc_support_sync(struct hci_dev *hdev, u8 val)
-- 
cgit v1.2.3


From 881559af5f5c545f6828e7c74d79813eb886d523 Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Tue, 13 Feb 2024 09:59:32 -0500
Subject: Bluetooth: hci_sync: Attempt to dequeue connection attempt

If connection is still queued/pending in the cmd_sync queue it means no
command has been generated and it should be safe to just dequeue the
callback when it is being aborted.

Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_core.h | 19 +++++++++++
 include/net/bluetooth/hci_sync.h | 10 +++---
 net/bluetooth/hci_conn.c         | 70 +++++++------------------------------
 net/bluetooth/hci_sync.c         | 74 +++++++++++++++++++++++++++++++++++-----
 4 files changed, 102 insertions(+), 71 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 2bdea85b7c44..317d495cfcf5 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -1083,6 +1083,24 @@ static inline unsigned int hci_conn_count(struct hci_dev *hdev)
 	return c->acl_num + c->amp_num + c->sco_num + c->le_num + c->iso_num;
 }
 
+static inline bool hci_conn_valid(struct hci_dev *hdev, struct hci_conn *conn)
+{
+	struct hci_conn_hash *h = &hdev->conn_hash;
+	struct hci_conn  *c;
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(c, &h->list, list) {
+		if (c == conn) {
+			rcu_read_unlock();
+			return true;
+		}
+	}
+	rcu_read_unlock();
+
+	return false;
+}
+
 static inline __u8 hci_conn_lookup_type(struct hci_dev *hdev, __u16 handle)
 {
 	struct hci_conn_hash *h = &hdev->conn_hash;
@@ -1493,6 +1511,7 @@ struct hci_conn *hci_connect_le_scan(struct hci_dev *hdev, bdaddr_t *dst,
 struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst,
 				u8 dst_type, bool dst_resolved, u8 sec_level,
 				u16 conn_timeout, u8 role);
+void hci_connect_le_scan_cleanup(struct hci_conn *conn, u8 status);
 struct hci_conn *hci_connect_acl(struct hci_dev *hdev, bdaddr_t *dst,
 				 u8 sec_level, u8 auth_type,
 				 enum conn_reasons conn_reason, u16 timeout);
diff --git a/include/net/bluetooth/hci_sync.h b/include/net/bluetooth/hci_sync.h
index 4ff4aa68ee19..6a9d063e9f47 100644
--- a/include/net/bluetooth/hci_sync.h
+++ b/include/net/bluetooth/hci_sync.h
@@ -48,11 +48,11 @@ int hci_cmd_sync_submit(struct hci_dev *hdev, hci_cmd_sync_work_func_t func,
 			void *data, hci_cmd_sync_work_destroy_t destroy);
 int hci_cmd_sync_queue(struct hci_dev *hdev, hci_cmd_sync_work_func_t func,
 		       void *data, hci_cmd_sync_work_destroy_t destroy);
+int hci_cmd_sync_queue_once(struct hci_dev *hdev, hci_cmd_sync_work_func_t func,
+			    void *data, hci_cmd_sync_work_destroy_t destroy);
 struct hci_cmd_sync_work_entry *
 hci_cmd_sync_lookup_entry(struct hci_dev *hdev, hci_cmd_sync_work_func_t func,
 			  void *data, hci_cmd_sync_work_destroy_t destroy);
-int hci_cmd_sync_queue_once(struct hci_dev *hdev, hci_cmd_sync_work_func_t func,
-			    void *data, hci_cmd_sync_work_destroy_t destroy);
 void hci_cmd_sync_cancel_entry(struct hci_dev *hdev,
 			       struct hci_cmd_sync_work_entry *entry);
 bool hci_cmd_sync_dequeue(struct hci_dev *hdev, hci_cmd_sync_work_func_t func,
@@ -139,8 +139,6 @@ struct hci_conn;
 
 int hci_abort_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason);
 
-int hci_le_create_conn_sync(struct hci_dev *hdev, struct hci_conn *conn);
-
 int hci_le_create_cis_sync(struct hci_dev *hdev);
 
 int hci_le_remove_cig_sync(struct hci_dev *hdev, u8 handle);
@@ -152,3 +150,7 @@ int hci_le_big_terminate_sync(struct hci_dev *hdev, u8 handle);
 int hci_le_pa_terminate_sync(struct hci_dev *hdev, u16 handle);
 
 int hci_connect_acl_sync(struct hci_dev *hdev, struct hci_conn *conn);
+
+int hci_connect_le_sync(struct hci_dev *hdev, struct hci_conn *conn);
+
+int hci_cancel_connect_sync(struct hci_dev *hdev, struct hci_conn *conn);
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index 587eb27f374c..21e0b4064d05 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -68,7 +68,7 @@ static const struct sco_param esco_param_msbc[] = {
 };
 
 /* This function requires the caller holds hdev->lock */
-static void hci_connect_le_scan_cleanup(struct hci_conn *conn, u8 status)
+void hci_connect_le_scan_cleanup(struct hci_conn *conn, u8 status)
 {
 	struct hci_conn_params *params;
 	struct hci_dev *hdev = conn->hdev;
@@ -1124,6 +1124,9 @@ void hci_conn_del(struct hci_conn *conn)
 	 * rest of hci_conn_del.
 	 */
 	hci_conn_cleanup(conn);
+
+	/* Dequeue callbacks using connection pointer as data */
+	hci_cmd_sync_dequeue(hdev, NULL, conn, NULL);
 }
 
 struct hci_dev *hci_get_route(bdaddr_t *dst, bdaddr_t *src, uint8_t src_type)
@@ -1258,53 +1261,6 @@ u8 hci_conn_set_handle(struct hci_conn *conn, u16 handle)
 	return 0;
 }
 
-static void create_le_conn_complete(struct hci_dev *hdev, void *data, int err)
-{
-	struct hci_conn *conn;
-	u16 handle = PTR_UINT(data);
-
-	conn = hci_conn_hash_lookup_handle(hdev, handle);
-	if (!conn)
-		return;
-
-	bt_dev_dbg(hdev, "err %d", err);
-
-	hci_dev_lock(hdev);
-
-	if (!err) {
-		hci_connect_le_scan_cleanup(conn, 0x00);
-		goto done;
-	}
-
-	/* Check if connection is still pending */
-	if (conn != hci_lookup_le_connect(hdev))
-		goto done;
-
-	/* Flush to make sure we send create conn cancel command if needed */
-	flush_delayed_work(&conn->le_conn_timeout);
-	hci_conn_failed(conn, bt_status(err));
-
-done:
-	hci_dev_unlock(hdev);
-}
-
-static int hci_connect_le_sync(struct hci_dev *hdev, void *data)
-{
-	struct hci_conn *conn;
-	u16 handle = PTR_UINT(data);
-
-	conn = hci_conn_hash_lookup_handle(hdev, handle);
-	if (!conn)
-		return 0;
-
-	bt_dev_dbg(hdev, "conn %p", conn);
-
-	clear_bit(HCI_CONN_SCANNING, &conn->flags);
-	conn->state = BT_CONNECT;
-
-	return hci_le_create_conn_sync(hdev, conn);
-}
-
 struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst,
 				u8 dst_type, bool dst_resolved, u8 sec_level,
 				u16 conn_timeout, u8 role)
@@ -1371,9 +1327,7 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst,
 	conn->sec_level = BT_SECURITY_LOW;
 	conn->conn_timeout = conn_timeout;
 
-	err = hci_cmd_sync_queue(hdev, hci_connect_le_sync,
-				 UINT_PTR(conn->handle),
-				 create_le_conn_complete);
+	err = hci_connect_le_sync(hdev, conn);
 	if (err) {
 		hci_conn_del(conn);
 		return ERR_PTR(err);
@@ -2909,12 +2863,10 @@ u32 hci_conn_get_phy(struct hci_conn *conn)
 
 static int abort_conn_sync(struct hci_dev *hdev, void *data)
 {
-	struct hci_conn *conn;
-	u16 handle = PTR_UINT(data);
+	struct hci_conn *conn = data;
 
-	conn = hci_conn_hash_lookup_handle(hdev, handle);
-	if (!conn)
-		return 0;
+	if (!hci_conn_valid(hdev, conn))
+		return -ECANCELED;
 
 	return hci_abort_conn_sync(hdev, conn, conn->abort_reason);
 }
@@ -2949,8 +2901,10 @@ int hci_abort_conn(struct hci_conn *conn, u8 reason)
 			hci_cmd_sync_cancel(hdev, -ECANCELED);
 			break;
 		}
+	/* Cancel connect attempt if still queued/pending */
+	} else if (!hci_cancel_connect_sync(hdev, conn)) {
+		return 0;
 	}
 
-	return hci_cmd_sync_queue(hdev, abort_conn_sync, UINT_PTR(conn->handle),
-				  NULL);
+	return hci_cmd_sync_queue_once(hdev, abort_conn_sync, conn, NULL);
 }
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index 5b314bf844f8..b7d8e99e2a30 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -6285,12 +6285,21 @@ static int hci_le_ext_create_conn_sync(struct hci_dev *hdev,
 					conn->conn_timeout, NULL);
 }
 
-int hci_le_create_conn_sync(struct hci_dev *hdev, struct hci_conn *conn)
+static int hci_le_create_conn_sync(struct hci_dev *hdev, void *data)
 {
 	struct hci_cp_le_create_conn cp;
 	struct hci_conn_params *params;
 	u8 own_addr_type;
 	int err;
+	struct hci_conn *conn = data;
+
+	if (!hci_conn_valid(hdev, conn))
+		return -ECANCELED;
+
+	bt_dev_dbg(hdev, "conn %p", conn);
+
+	clear_bit(HCI_CONN_SCANNING, &conn->flags);
+	conn->state = BT_CONNECT;
 
 	/* If requested to connect as peripheral use directed advertising */
 	if (conn->role == HCI_ROLE_SLAVE) {
@@ -6611,16 +6620,11 @@ int hci_update_adv_data(struct hci_dev *hdev, u8 instance)
 
 static int hci_acl_create_conn_sync(struct hci_dev *hdev, void *data)
 {
-	struct hci_conn *conn;
-	u16 handle = PTR_UINT(data);
+	struct hci_conn *conn = data;
 	struct inquiry_entry *ie;
 	struct hci_cp_create_conn cp;
 	int err;
 
-	conn = hci_conn_hash_lookup_handle(hdev, handle);
-	if (!conn)
-		return 0;
-
 	/* Many controllers disallow HCI Create Connection while it is doing
 	 * HCI Inquiry. So we cancel the Inquiry first before issuing HCI Create
 	 * Connection. This may cause the MGMT discovering state to become false
@@ -6679,6 +6683,58 @@ static int hci_acl_create_conn_sync(struct hci_dev *hdev, void *data)
 
 int hci_connect_acl_sync(struct hci_dev *hdev, struct hci_conn *conn)
 {
-	return hci_cmd_sync_queue(hdev, hci_acl_create_conn_sync,
-				  UINT_PTR(conn->handle), NULL);
+	return hci_cmd_sync_queue_once(hdev, hci_acl_create_conn_sync, conn,
+				       NULL);
+}
+
+static void create_le_conn_complete(struct hci_dev *hdev, void *data, int err)
+{
+	struct hci_conn *conn = data;
+
+	bt_dev_dbg(hdev, "err %d", err);
+
+	if (err == -ECANCELED)
+		return;
+
+	hci_dev_lock(hdev);
+
+	if (!err) {
+		hci_connect_le_scan_cleanup(conn, 0x00);
+		goto done;
+	}
+
+	/* Check if connection is still pending */
+	if (conn != hci_lookup_le_connect(hdev))
+		goto done;
+
+	/* Flush to make sure we send create conn cancel command if needed */
+	flush_delayed_work(&conn->le_conn_timeout);
+	hci_conn_failed(conn, bt_status(err));
+
+done:
+	hci_dev_unlock(hdev);
+}
+
+int hci_connect_le_sync(struct hci_dev *hdev, struct hci_conn *conn)
+{
+	return hci_cmd_sync_queue_once(hdev, hci_le_create_conn_sync, conn,
+				       create_le_conn_complete);
+}
+
+int hci_cancel_connect_sync(struct hci_dev *hdev, struct hci_conn *conn)
+{
+	if (conn->state != BT_OPEN)
+		return -EINVAL;
+
+	switch (conn->type) {
+	case ACL_LINK:
+		return !hci_cmd_sync_dequeue_once(hdev,
+						  hci_acl_create_conn_sync,
+						  conn, NULL);
+	case LE_LINK:
+		return !hci_cmd_sync_dequeue_once(hdev, hci_le_create_conn_sync,
+						  conn, create_le_conn_complete);
+	}
+
+	return -ENOENT;
 }
-- 
cgit v1.2.3


From 02171da6e86a73e1b343b36722f5d9d5c04b3539 Mon Sep 17 00:00:00 2001
From: Iulia Tanasescu <iulia.tanasescu@nxp.com>
Date: Fri, 23 Feb 2024 15:14:41 +0200
Subject: Bluetooth: ISO: Add hcon for listening bis sk

This creates a hcon instance at bis listen, before the PA sync
procedure is started.

Signed-off-by: Iulia Tanasescu <iulia.tanasescu@nxp.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_core.h |  6 +++---
 net/bluetooth/hci_conn.c         | 32 +++++++++++++++++++++++++------
 net/bluetooth/iso.c              | 41 ++++++++++++++++++++++++++++++----------
 3 files changed, 60 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 317d495cfcf5..199a9f81cf50 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -1,7 +1,7 @@
 /*
    BlueZ - Bluetooth protocol stack for Linux
    Copyright (c) 2000-2001, 2010, Code Aurora Forum. All rights reserved.
-   Copyright 2023 NXP
+   Copyright 2023-2024 NXP
 
    Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com>
 
@@ -1528,8 +1528,8 @@ struct hci_conn *hci_connect_cis(struct hci_dev *hdev, bdaddr_t *dst,
 struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst,
 				 __u8 dst_type, struct bt_iso_qos *qos,
 				 __u8 data_len, __u8 *data);
-int hci_pa_create_sync(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type,
-		       __u8 sid, struct bt_iso_qos *qos);
+struct hci_conn *hci_pa_create_sync(struct hci_dev *hdev, bdaddr_t *dst,
+		       __u8 dst_type, __u8 sid, struct bt_iso_qos *qos);
 int hci_le_big_create_sync(struct hci_dev *hdev, struct hci_conn *hcon,
 			   struct bt_iso_qos *qos,
 			   __u16 sync_handle, __u8 num_bis, __u8 bis[]);
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index 21e0b4064d05..54d4189c198a 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -1,7 +1,7 @@
 /*
    BlueZ - Bluetooth protocol stack for Linux
    Copyright (c) 2000-2001, 2010, Code Aurora Forum. All rights reserved.
-   Copyright 2023 NXP
+   Copyright 2023-2024 NXP
 
    Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com>
 
@@ -2057,18 +2057,31 @@ static int create_pa_sync(struct hci_dev *hdev, void *data)
 	return hci_update_passive_scan_sync(hdev);
 }
 
-int hci_pa_create_sync(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type,
-		       __u8 sid, struct bt_iso_qos *qos)
+struct hci_conn *hci_pa_create_sync(struct hci_dev *hdev, bdaddr_t *dst,
+				    __u8 dst_type, __u8 sid,
+				    struct bt_iso_qos *qos)
 {
 	struct hci_cp_le_pa_create_sync *cp;
+	struct hci_conn *conn;
+	int err;
 
 	if (hci_dev_test_and_set_flag(hdev, HCI_PA_SYNC))
-		return -EBUSY;
+		return ERR_PTR(-EBUSY);
+
+	conn = hci_conn_add_unset(hdev, ISO_LINK, dst, HCI_ROLE_SLAVE);
+	if (!conn)
+		return ERR_PTR(-ENOMEM);
+
+	conn->iso_qos = *qos;
+	conn->state = BT_LISTEN;
+
+	hci_conn_hold(conn);
 
 	cp = kzalloc(sizeof(*cp), GFP_KERNEL);
 	if (!cp) {
 		hci_dev_clear_flag(hdev, HCI_PA_SYNC);
-		return -ENOMEM;
+		hci_conn_drop(conn);
+		return ERR_PTR(-ENOMEM);
 	}
 
 	cp->options = qos->bcast.options;
@@ -2080,7 +2093,14 @@ int hci_pa_create_sync(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type,
 	cp->sync_cte_type = qos->bcast.sync_cte_type;
 
 	/* Queue start pa_create_sync and scan */
-	return hci_cmd_sync_queue(hdev, create_pa_sync, cp, create_pa_complete);
+	err = hci_cmd_sync_queue(hdev, create_pa_sync, cp, create_pa_complete);
+	if (err < 0) {
+		hci_conn_drop(conn);
+		kfree(cp);
+		return ERR_PTR(err);
+	}
+
+	return conn;
 }
 
 int hci_le_big_create_sync(struct hci_dev *hdev, struct hci_conn *hcon,
diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c
index 04f6572d35f1..d9ee69fa1384 100644
--- a/net/bluetooth/iso.c
+++ b/net/bluetooth/iso.c
@@ -3,7 +3,7 @@
  * BlueZ - Bluetooth protocol stack for Linux
  *
  * Copyright (C) 2022 Intel Corporation
- * Copyright 2023 NXP
+ * Copyright 2023-2024 NXP
  */
 
 #include <linux/module.h>
@@ -690,11 +690,8 @@ static void iso_sock_cleanup_listen(struct sock *parent)
 		iso_sock_kill(sk);
 	}
 
-	/* If listening socket stands for a PA sync connection,
-	 * properly disconnect the hcon and socket.
-	 */
-	if (iso_pi(parent)->conn && iso_pi(parent)->conn->hcon &&
-	    test_bit(HCI_CONN_PA_SYNC, &iso_pi(parent)->conn->hcon->flags)) {
+	/* If listening socket has a hcon, properly disconnect it */
+	if (iso_pi(parent)->conn && iso_pi(parent)->conn->hcon) {
 		iso_sock_disconn(parent);
 		return;
 	}
@@ -1076,6 +1073,8 @@ static int iso_listen_bis(struct sock *sk)
 {
 	struct hci_dev *hdev;
 	int err = 0;
+	struct iso_conn *conn;
+	struct hci_conn *hcon;
 
 	BT_DBG("%pMR -> %pMR (SID 0x%2.2x)", &iso_pi(sk)->src,
 	       &iso_pi(sk)->dst, iso_pi(sk)->bc_sid);
@@ -1096,18 +1095,40 @@ static int iso_listen_bis(struct sock *sk)
 	if (!hdev)
 		return -EHOSTUNREACH;
 
+	hci_dev_lock(hdev);
+
 	/* Fail if user set invalid QoS */
 	if (iso_pi(sk)->qos_user_set && !check_bcast_qos(&iso_pi(sk)->qos)) {
 		iso_pi(sk)->qos = default_qos;
-		return -EINVAL;
+		err = -EINVAL;
+		goto unlock;
 	}
 
-	err = hci_pa_create_sync(hdev, &iso_pi(sk)->dst,
-				 le_addr_type(iso_pi(sk)->dst_type),
-				 iso_pi(sk)->bc_sid, &iso_pi(sk)->qos);
+	hcon = hci_pa_create_sync(hdev, &iso_pi(sk)->dst,
+				  le_addr_type(iso_pi(sk)->dst_type),
+				  iso_pi(sk)->bc_sid, &iso_pi(sk)->qos);
+	if (IS_ERR(hcon)) {
+		err = PTR_ERR(hcon);
+		goto unlock;
+	}
+
+	conn = iso_conn_add(hcon);
+	if (!conn) {
+		hci_conn_drop(hcon);
+		err = -ENOMEM;
+		goto unlock;
+	}
+
+	err = iso_chan_add(conn, sk, NULL);
+	if (err) {
+		hci_conn_drop(hcon);
+		goto unlock;
+	}
 
 	hci_dev_put(hdev);
 
+unlock:
+	hci_dev_unlock(hdev);
 	return err;
 }
 
-- 
cgit v1.2.3


From 168d9bf9c7f01df71e6404cfff66d9c2a8e968fb Mon Sep 17 00:00:00 2001
From: Iulia Tanasescu <iulia.tanasescu@nxp.com>
Date: Fri, 23 Feb 2024 15:14:42 +0200
Subject: Bluetooth: ISO: Reassemble PA data for bcast sink

This adds support to reassemble PA data for a Broadcast Sink
listening socket. This is needed in case the BASE is received
fragmented in multiple PA reports.

PA data is first reassembled inside the hcon, before the BASE
is extracted and stored inside the socket. The length of the
le_per_adv_data hcon array has been raised to 1650, to accommodate
the maximum PA data length that can come fragmented, according to
spec.

Signed-off-by: Iulia Tanasescu <iulia.tanasescu@nxp.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci.h      |  5 ++++
 include/net/bluetooth/hci_core.h |  5 ++--
 net/bluetooth/iso.c              | 50 ++++++++++++++++++++++++++++++++++++----
 3 files changed, 54 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index 08cb5cb249a4..21099bd3c8bc 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -2037,6 +2037,7 @@ struct hci_cp_le_set_per_adv_params {
 } __packed;
 
 #define HCI_MAX_PER_AD_LENGTH	252
+#define HCI_MAX_PER_AD_TOT_LEN	1650
 
 #define HCI_OP_LE_SET_PER_ADV_DATA		0x203f
 struct hci_cp_le_set_per_adv_data {
@@ -2797,6 +2798,10 @@ struct hci_ev_le_per_adv_report {
 	__u8     data[];
 } __packed;
 
+#define LE_PA_DATA_COMPLETE	0x00
+#define LE_PA_DATA_MORE_TO_COME	0x01
+#define LE_PA_DATA_TRUNCATED	0x02
+
 #define HCI_EV_LE_EXT_ADV_SET_TERM	0x12
 struct hci_evt_le_ext_adv_set_term {
 	__u8	status;
diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 199a9f81cf50..da6aa6549b81 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -734,8 +734,9 @@ struct hci_conn {
 	__u16		le_supv_timeout;
 	__u8		le_adv_data[HCI_MAX_EXT_AD_LENGTH];
 	__u8		le_adv_data_len;
-	__u8		le_per_adv_data[HCI_MAX_PER_AD_LENGTH];
-	__u8		le_per_adv_data_len;
+	__u8		le_per_adv_data[HCI_MAX_PER_AD_TOT_LEN];
+	__u16		le_per_adv_data_len;
+	__u16		le_per_adv_data_offset;
 	__u8		le_tx_phy;
 	__u8		le_rx_phy;
 	__s8		rssi;
diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c
index d9ee69fa1384..30c777c469f9 100644
--- a/net/bluetooth/iso.c
+++ b/net/bluetooth/iso.c
@@ -1982,16 +1982,58 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags)
 
 	ev3 = hci_recv_event_data(hdev, HCI_EV_LE_PER_ADV_REPORT);
 	if (ev3) {
-		size_t base_len = ev3->length;
+		size_t base_len = 0;
 		u8 *base;
+		struct hci_conn *hcon;
 
 		sk = iso_get_sock_listen(&hdev->bdaddr, bdaddr,
 					 iso_match_sync_handle_pa_report, ev3);
-		base = eir_get_service_data(ev3->data, ev3->length,
-					    EIR_BAA_SERVICE_UUID, &base_len);
-		if (base && sk && base_len <= sizeof(iso_pi(sk)->base)) {
+		if (!sk)
+			goto done;
+
+		hcon = iso_pi(sk)->conn->hcon;
+		if (!hcon)
+			goto done;
+
+		if (ev3->data_status == LE_PA_DATA_TRUNCATED) {
+			/* The controller was unable to retrieve PA data. */
+			memset(hcon->le_per_adv_data, 0,
+			       HCI_MAX_PER_AD_TOT_LEN);
+			hcon->le_per_adv_data_len = 0;
+			hcon->le_per_adv_data_offset = 0;
+			goto done;
+		}
+
+		if (hcon->le_per_adv_data_offset + ev3->length >
+		    HCI_MAX_PER_AD_TOT_LEN)
+			goto done;
+
+		memcpy(hcon->le_per_adv_data + hcon->le_per_adv_data_offset,
+		       ev3->data, ev3->length);
+		hcon->le_per_adv_data_offset += ev3->length;
+
+		if (ev3->data_status == LE_PA_DATA_COMPLETE) {
+			/* All PA data has been received. */
+			hcon->le_per_adv_data_len =
+				hcon->le_per_adv_data_offset;
+			hcon->le_per_adv_data_offset = 0;
+
+			/* Extract BASE */
+			base = eir_get_service_data(hcon->le_per_adv_data,
+						    hcon->le_per_adv_data_len,
+						    EIR_BAA_SERVICE_UUID,
+						    &base_len);
+
+			if (!base || base_len > BASE_MAX_LENGTH)
+				goto done;
+
 			memcpy(iso_pi(sk)->base, base, base_len);
 			iso_pi(sk)->base_len = base_len;
+		} else {
+			/* This is a PA data fragment. Keep pa_data_len set to 0
+			 * until all data has been reassembled.
+			 */
+			hcon->le_per_adv_data_len = 0;
 		}
 	} else {
 		sk = iso_get_sock_listen(&hdev->bdaddr, BDADDR_ANY, NULL, NULL);
-- 
cgit v1.2.3


From 2615fd9a7c2507eb3be3fbe49dcec88a2f56454a Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Fri, 16 Feb 2024 16:20:11 -0500
Subject: Bluetooth: hci_sync: Fix overwriting request callback

In a few cases the stack may generate commands as responses to events
which would happen to overwrite the sent_cmd, so this attempts to store
the request in req_skb so even if sent_cmd is replaced with a new
command the pending request will remain in stored in req_skb.

Fixes: 6a98e3836fa2 ("Bluetooth: Add helper for serialized HCI command execution")
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_core.h |  1 +
 net/bluetooth/hci_conn.c         |  2 +-
 net/bluetooth/hci_core.c         | 46 ++++++++++++++++++++++++++++------------
 net/bluetooth/hci_event.c        | 18 ++++++++--------
 net/bluetooth/hci_sync.c         | 21 +++++++++++++++---
 5 files changed, 61 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index da6aa6549b81..56fb42df44a3 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -552,6 +552,7 @@ struct hci_dev {
 	__u32			req_status;
 	__u32			req_result;
 	struct sk_buff		*req_skb;
+	struct sk_buff		*req_rsp;
 
 	void			*smp_data;
 	void			*smp_bredr_data;
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index 54d4189c198a..3ad74f76983b 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -2918,7 +2918,7 @@ int hci_abort_conn(struct hci_conn *conn, u8 reason)
 		case HCI_EV_LE_CONN_COMPLETE:
 		case HCI_EV_LE_ENHANCED_CONN_COMPLETE:
 		case HCI_EVT_LE_CIS_ESTABLISHED:
-			hci_cmd_sync_cancel(hdev, -ECANCELED);
+			hci_cmd_sync_cancel(hdev, ECANCELED);
 			break;
 		}
 	/* Cancel connect attempt if still queued/pending */
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 34c8dca2069f..6ca4c0df9f9c 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -1522,8 +1522,8 @@ static void hci_cmd_timeout(struct work_struct *work)
 	struct hci_dev *hdev = container_of(work, struct hci_dev,
 					    cmd_timer.work);
 
-	if (hdev->sent_cmd) {
-		u16 opcode = hci_skb_opcode(hdev->sent_cmd);
+	if (hdev->req_skb) {
+		u16 opcode = hci_skb_opcode(hdev->req_skb);
 
 		bt_dev_err(hdev, "command 0x%4.4x tx timeout", opcode);
 
@@ -2828,6 +2828,7 @@ void hci_release_dev(struct hci_dev *hdev)
 	ida_destroy(&hdev->unset_handle_ida);
 	ida_free(&hci_index_ida, hdev->id);
 	kfree_skb(hdev->sent_cmd);
+	kfree_skb(hdev->req_skb);
 	kfree_skb(hdev->recv_event);
 	kfree(hdev);
 }
@@ -3157,21 +3158,33 @@ int __hci_cmd_send(struct hci_dev *hdev, u16 opcode, u32 plen,
 EXPORT_SYMBOL(__hci_cmd_send);
 
 /* Get data from the previously sent command */
-void *hci_sent_cmd_data(struct hci_dev *hdev, __u16 opcode)
+static void *hci_cmd_data(struct sk_buff *skb, __u16 opcode)
 {
 	struct hci_command_hdr *hdr;
 
-	if (!hdev->sent_cmd)
+	if (!skb || skb->len < HCI_COMMAND_HDR_SIZE)
 		return NULL;
 
-	hdr = (void *) hdev->sent_cmd->data;
+	hdr = (void *)skb->data;
 
 	if (hdr->opcode != cpu_to_le16(opcode))
 		return NULL;
 
-	BT_DBG("%s opcode 0x%4.4x", hdev->name, opcode);
+	return skb->data + HCI_COMMAND_HDR_SIZE;
+}
 
-	return hdev->sent_cmd->data + HCI_COMMAND_HDR_SIZE;
+/* Get data from the previously sent command */
+void *hci_sent_cmd_data(struct hci_dev *hdev, __u16 opcode)
+{
+	void *data;
+
+	/* Check if opcode matches last sent command */
+	data = hci_cmd_data(hdev->sent_cmd, opcode);
+	if (!data)
+		/* Check if opcode matches last request */
+		data = hci_cmd_data(hdev->req_skb, opcode);
+
+	return data;
 }
 
 /* Get data from last received event */
@@ -4072,17 +4085,19 @@ void hci_req_cmd_complete(struct hci_dev *hdev, u16 opcode, u8 status,
 	if (!status && !hci_req_is_complete(hdev))
 		return;
 
+	skb = hdev->req_skb;
+
 	/* If this was the last command in a request the complete
-	 * callback would be found in hdev->sent_cmd instead of the
+	 * callback would be found in hdev->req_skb instead of the
 	 * command queue (hdev->cmd_q).
 	 */
-	if (bt_cb(hdev->sent_cmd)->hci.req_flags & HCI_REQ_SKB) {
-		*req_complete_skb = bt_cb(hdev->sent_cmd)->hci.req_complete_skb;
+	if (skb && bt_cb(skb)->hci.req_flags & HCI_REQ_SKB) {
+		*req_complete_skb = bt_cb(skb)->hci.req_complete_skb;
 		return;
 	}
 
-	if (bt_cb(hdev->sent_cmd)->hci.req_complete) {
-		*req_complete = bt_cb(hdev->sent_cmd)->hci.req_complete;
+	if (skb && bt_cb(skb)->hci.req_complete) {
+		*req_complete = bt_cb(skb)->hci.req_complete;
 		return;
 	}
 
@@ -4199,8 +4214,11 @@ static void hci_send_cmd_sync(struct hci_dev *hdev, struct sk_buff *skb)
 		return;
 	}
 
-	if (hci_req_status_pend(hdev))
-		hci_dev_set_flag(hdev, HCI_CMD_PENDING);
+	if (hci_req_status_pend(hdev) &&
+	    !hci_dev_test_and_set_flag(hdev, HCI_CMD_PENDING)) {
+		kfree_skb(hdev->req_skb);
+		hdev->req_skb = skb_clone(skb, GFP_KERNEL);
+	}
 
 	atomic_dec(&hdev->cmd_cnt);
 }
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 6071a1226e1b..bffd2c7ff608 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -4368,7 +4368,7 @@ static void hci_cmd_status_evt(struct hci_dev *hdev, void *data,
 	 * (since for this kind of commands there will not be a command
 	 * complete event).
 	 */
-	if (ev->status || (hdev->sent_cmd && !hci_skb_event(hdev->sent_cmd))) {
+	if (ev->status || (hdev->req_skb && !hci_skb_event(hdev->req_skb))) {
 		hci_req_cmd_complete(hdev, *opcode, ev->status, req_complete,
 				     req_complete_skb);
 		if (hci_dev_test_flag(hdev, HCI_CMD_PENDING)) {
@@ -7170,10 +7170,10 @@ static void hci_le_meta_evt(struct hci_dev *hdev, void *data,
 	bt_dev_dbg(hdev, "subevent 0x%2.2x", ev->subevent);
 
 	/* Only match event if command OGF is for LE */
-	if (hdev->sent_cmd &&
-	    hci_opcode_ogf(hci_skb_opcode(hdev->sent_cmd)) == 0x08 &&
-	    hci_skb_event(hdev->sent_cmd) == ev->subevent) {
-		*opcode = hci_skb_opcode(hdev->sent_cmd);
+	if (hdev->req_skb &&
+	    hci_opcode_ogf(hci_skb_opcode(hdev->req_skb)) == 0x08 &&
+	    hci_skb_event(hdev->req_skb) == ev->subevent) {
+		*opcode = hci_skb_opcode(hdev->req_skb);
 		hci_req_cmd_complete(hdev, *opcode, 0x00, req_complete,
 				     req_complete_skb);
 	}
@@ -7541,10 +7541,10 @@ void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb)
 	}
 
 	/* Only match event if command OGF is not for LE */
-	if (hdev->sent_cmd &&
-	    hci_opcode_ogf(hci_skb_opcode(hdev->sent_cmd)) != 0x08 &&
-	    hci_skb_event(hdev->sent_cmd) == event) {
-		hci_req_cmd_complete(hdev, hci_skb_opcode(hdev->sent_cmd),
+	if (hdev->req_skb &&
+	    hci_opcode_ogf(hci_skb_opcode(hdev->req_skb)) != 0x08 &&
+	    hci_skb_event(hdev->req_skb) == event) {
+		hci_req_cmd_complete(hdev, hci_skb_opcode(hdev->req_skb),
 				     status, &req_complete, &req_complete_skb);
 		req_evt = event;
 	}
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index 10aa59da735b..8a3d0d1f7871 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -32,6 +32,10 @@ static void hci_cmd_sync_complete(struct hci_dev *hdev, u8 result, u16 opcode,
 	hdev->req_result = result;
 	hdev->req_status = HCI_REQ_DONE;
 
+	/* Free the request command so it is not used as response */
+	kfree_skb(hdev->req_skb);
+	hdev->req_skb = NULL;
+
 	if (skb) {
 		struct sock *sk = hci_skb_sk(skb);
 
@@ -39,7 +43,7 @@ static void hci_cmd_sync_complete(struct hci_dev *hdev, u8 result, u16 opcode,
 		if (sk)
 			sock_put(sk);
 
-		hdev->req_skb = skb_get(skb);
+		hdev->req_rsp = skb_get(skb);
 	}
 
 	wake_up_interruptible(&hdev->req_wait_q);
@@ -187,8 +191,8 @@ struct sk_buff *__hci_cmd_sync_sk(struct hci_dev *hdev, u16 opcode, u32 plen,
 
 	hdev->req_status = 0;
 	hdev->req_result = 0;
-	skb = hdev->req_skb;
-	hdev->req_skb = NULL;
+	skb = hdev->req_rsp;
+	hdev->req_rsp = NULL;
 
 	bt_dev_dbg(hdev, "end: err %d", err);
 
@@ -5021,6 +5025,11 @@ int hci_dev_open_sync(struct hci_dev *hdev)
 			hdev->sent_cmd = NULL;
 		}
 
+		if (hdev->req_skb) {
+			kfree_skb(hdev->req_skb);
+			hdev->req_skb = NULL;
+		}
+
 		clear_bit(HCI_RUNNING, &hdev->flags);
 		hci_sock_dev_event(hdev, HCI_DEV_CLOSE);
 
@@ -5181,6 +5190,12 @@ int hci_dev_close_sync(struct hci_dev *hdev)
 		hdev->sent_cmd = NULL;
 	}
 
+	/* Drop last request */
+	if (hdev->req_skb) {
+		kfree_skb(hdev->req_skb);
+		hdev->req_skb = NULL;
+	}
+
 	clear_bit(HCI_RUNNING, &hdev->flags);
 	hci_sock_dev_event(hdev, HCI_DEV_CLOSE);
 
-- 
cgit v1.2.3


From 48201a3b3f398be6a01f78a14b18bd5d31c47458 Mon Sep 17 00:00:00 2001
From: Vinicius Peixoto <nukelet64@gmail.com>
Date: Mon, 26 Feb 2024 22:43:26 -0300
Subject: Bluetooth: Add new quirk for broken read key length on ATS2851

The ATS2851 controller erroneously reports support for the "Read
Encryption Key Length" HCI command. This makes it unable to connect
to any devices, since this command is issued by the kernel during the
connection process in response to an "Encryption Change" HCI event.

Add a new quirk (HCI_QUIRK_BROKEN_ENC_KEY_SIZE) to hint that the command
is unsupported, preventing it from interrupting the connection process.

This is the error log from btmon before this patch:

> HCI Event: Encryption Change (0x08) plen 4
        Status: Success (0x00)
        Handle: 2048 Address: ...
        Encryption: Enabled with E0 (0x01)
< HCI Command: Read Encryption Key Size (0x05|0x0008) plen 2
        Handle: 2048 Address: ...
> HCI Event: Command Status (0x0f) plen 4
      Read Encryption Key Size (0x05|0x0008) ncmd 1
        Status: Unknown HCI Command (0x01)

Signed-off-by: Vinicius Peixoto <nukelet64@gmail.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 drivers/bluetooth/btusb.c   | 1 +
 include/net/bluetooth/hci.h | 8 ++++++++
 net/bluetooth/hci_event.c   | 3 ++-
 3 files changed, 11 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c
index b09fe480e708..06e915b57283 100644
--- a/drivers/bluetooth/btusb.c
+++ b/drivers/bluetooth/btusb.c
@@ -4500,6 +4500,7 @@ static int btusb_probe(struct usb_interface *intf,
 		set_bit(HCI_QUIRK_BROKEN_READ_TRANSMIT_POWER, &hdev->quirks);
 		set_bit(HCI_QUIRK_BROKEN_SET_RPA_TIMEOUT, &hdev->quirks);
 		set_bit(HCI_QUIRK_BROKEN_EXT_SCAN, &hdev->quirks);
+		set_bit(HCI_QUIRK_BROKEN_READ_ENC_KEY_SIZE, &hdev->quirks);
 	}
 
 	if (!reset)
diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index 21099bd3c8bc..8701ca5f31ee 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -330,6 +330,14 @@ enum {
 	 * during the hdev->setup vendor callback.
 	 */
 	HCI_QUIRK_BROKEN_LE_CODED,
+
+	/*
+	 * When this quirk is set, the HCI_OP_READ_ENC_KEY_SIZE command is
+	 * skipped during an HCI_EV_ENCRYPT_CHANGE event. This is required
+	 * for Actions Semiconductor ATS2851 based controllers, which erroneously
+	 * claim to support it.
+	 */
+	HCI_QUIRK_BROKEN_READ_ENC_KEY_SIZE,
 };
 
 /* HCI device flags */
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index bffd2c7ff608..4ae224824012 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -3641,7 +3641,8 @@ static void hci_encrypt_change_evt(struct hci_dev *hdev, void *data,
 		 * controller really supports it. If it doesn't, assume
 		 * the default size (16).
 		 */
-		if (!(hdev->commands[20] & 0x10)) {
+		if (!(hdev->commands[20] & 0x10) ||
+		    test_bit(HCI_QUIRK_BROKEN_READ_ENC_KEY_SIZE, &hdev->quirks)) {
 			conn->enc_key_size = HCI_LINK_KEY_SIZE;
 			goto notify;
 		}
-- 
cgit v1.2.3


From e6f798225a31485e47a6e4f6aa07ee9fdf80c2cb Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Mon, 4 Mar 2024 19:05:16 -0800
Subject: mm: Introduce VM_SPARSE kind and vm_area_[un]map_pages().

vmap/vmalloc APIs are used to map a set of pages into contiguous kernel
virtual space.

get_vm_area() with appropriate flag is used to request an area of kernel
address range. It's used for vmalloc, vmap, ioremap, xen use cases.
- vmalloc use case dominates the usage. Such vm areas have VM_ALLOC flag.
- the areas created by vmap() function should be tagged with VM_MAP.
- ioremap areas are tagged with VM_IOREMAP.

BPF would like to extend the vmap API to implement a lazily-populated
sparse, yet contiguous kernel virtual space. Introduce VM_SPARSE flag
and vm_area_map_pages(area, start_addr, count, pages) API to map a set
of pages within a given area.
It has the same sanity checks as vmap() does.
It also checks that get_vm_area() was created with VM_SPARSE flag
which identifies such areas in /proc/vmallocinfo
and returns zero pages on read through /proc/kcore.

The next commits will introduce bpf_arena which is a sparsely populated
shared memory region between bpf program and user space process. It will
map privately-managed pages into a sparse vm area with the following steps:

  // request virtual memory region during bpf prog verification
  area = get_vm_area(area_size, VM_SPARSE);

  // on demand
  vm_area_map_pages(area, kaddr, kend, pages);
  vm_area_unmap_pages(area, kaddr, kend);

  // after bpf program is detached and unloaded
  free_vm_area(area);

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Link: https://lore.kernel.org/bpf/20240305030516.41519-3-alexei.starovoitov@gmail.com
---
 include/linux/vmalloc.h |  5 +++++
 mm/vmalloc.c            | 59 +++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 62 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index c720be70c8dd..0f72c85a377b 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -35,6 +35,7 @@ struct iov_iter;		/* in uio.h */
 #else
 #define VM_DEFER_KMEMLEAK	0
 #endif
+#define VM_SPARSE		0x00001000	/* sparse vm_area. not all pages are present. */
 
 /* bits [20..32] reserved for arch specific ioremap internals */
 
@@ -232,6 +233,10 @@ static inline bool is_vm_area_hugepages(const void *addr)
 }
 
 #ifdef CONFIG_MMU
+int vm_area_map_pages(struct vm_struct *area, unsigned long start,
+		      unsigned long end, struct page **pages);
+void vm_area_unmap_pages(struct vm_struct *area, unsigned long start,
+			 unsigned long end);
 void vunmap_range(unsigned long addr, unsigned long end);
 static inline void set_vm_flush_reset_perms(void *addr)
 {
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index f42f98a127d5..e5b8c70950bc 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -648,6 +648,58 @@ static int vmap_pages_range(unsigned long addr, unsigned long end,
 	return err;
 }
 
+static int check_sparse_vm_area(struct vm_struct *area, unsigned long start,
+				unsigned long end)
+{
+	might_sleep();
+	if (WARN_ON_ONCE(area->flags & VM_FLUSH_RESET_PERMS))
+		return -EINVAL;
+	if (WARN_ON_ONCE(area->flags & VM_NO_GUARD))
+		return -EINVAL;
+	if (WARN_ON_ONCE(!(area->flags & VM_SPARSE)))
+		return -EINVAL;
+	if ((end - start) >> PAGE_SHIFT > totalram_pages())
+		return -E2BIG;
+	if (start < (unsigned long)area->addr ||
+	    (void *)end > area->addr + get_vm_area_size(area))
+		return -ERANGE;
+	return 0;
+}
+
+/**
+ * vm_area_map_pages - map pages inside given sparse vm_area
+ * @area: vm_area
+ * @start: start address inside vm_area
+ * @end: end address inside vm_area
+ * @pages: pages to map (always PAGE_SIZE pages)
+ */
+int vm_area_map_pages(struct vm_struct *area, unsigned long start,
+		      unsigned long end, struct page **pages)
+{
+	int err;
+
+	err = check_sparse_vm_area(area, start, end);
+	if (err)
+		return err;
+
+	return vmap_pages_range(start, end, PAGE_KERNEL, pages, PAGE_SHIFT);
+}
+
+/**
+ * vm_area_unmap_pages - unmap pages inside given sparse vm_area
+ * @area: vm_area
+ * @start: start address inside vm_area
+ * @end: end address inside vm_area
+ */
+void vm_area_unmap_pages(struct vm_struct *area, unsigned long start,
+			 unsigned long end)
+{
+	if (check_sparse_vm_area(area, start, end))
+		return;
+
+	vunmap_range(start, end);
+}
+
 int is_vmalloc_or_module_addr(const void *x)
 {
 	/*
@@ -3822,9 +3874,9 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
 
 		if (flags & VMAP_RAM)
 			copied = vmap_ram_vread_iter(iter, addr, n, flags);
-		else if (!(vm && (vm->flags & VM_IOREMAP)))
+		else if (!(vm && (vm->flags & (VM_IOREMAP | VM_SPARSE))))
 			copied = aligned_vread_iter(iter, addr, n);
-		else /* IOREMAP area is treated as memory hole */
+		else /* IOREMAP | SPARSE area is treated as memory hole */
 			copied = zero_iter(iter, n);
 
 		addr += copied;
@@ -4415,6 +4467,9 @@ static int s_show(struct seq_file *m, void *p)
 	if (v->flags & VM_IOREMAP)
 		seq_puts(m, " ioremap");
 
+	if (v->flags & VM_SPARSE)
+		seq_puts(m, " sparse");
+
 	if (v->flags & VM_ALLOC)
 		seq_puts(m, " vmalloc");
 
-- 
cgit v1.2.3


From 011832b97b311bb9e3c27945bc0d1089a14209c9 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Tue, 5 Mar 2024 19:19:26 -0800
Subject: bpf: Introduce may_goto instruction

Introduce may_goto instruction that from the verifier pov is similar to
open coded iterators bpf_for()/bpf_repeat() and bpf_loop() helper, but it
doesn't iterate any objects.
In assembly 'may_goto' is a nop most of the time until bpf runtime has to
terminate the program for whatever reason. In the current implementation
may_goto has a hidden counter, but other mechanisms can be used.
For programs written in C the later patch introduces 'cond_break' macro
that combines 'may_goto' with 'break' statement and has similar semantics:
cond_break is a nop until bpf runtime has to break out of this loop.
It can be used in any normal "for" or "while" loop, like

  for (i = zero; i < cnt; cond_break, i++) {

The verifier recognizes that may_goto is used in the program, reserves
additional 8 bytes of stack, initializes them in subprog prologue, and
replaces may_goto instruction with:
aux_reg = *(u64 *)(fp - 40)
if aux_reg == 0 goto pc+off
aux_reg -= 1
*(u64 *)(fp - 40) = aux_reg

may_goto instruction can be used by LLVM to implement __builtin_memcpy,
__builtin_strcmp.

may_goto is not a full substitute for bpf_for() macro.
bpf_for() doesn't have induction variable that verifiers sees,
so 'i' in bpf_for(i, 0, 100) is seen as imprecise and bounded.

But when the code is written as:
for (i = 0; i < 100; cond_break, i++)
the verifier see 'i' as precise constant zero,
hence cond_break (aka may_goto) doesn't help to converge the loop.
A static or global variable can be used as a workaround:
static int zero = 0;
for (i = zero; i < 100; cond_break, i++) // works!

may_goto works well with arena pointers that don't need to be bounds
checked on access. Load/store from arena returns imprecise unbounded
scalar and loops with may_goto pass the verifier.

Reserve new opcode BPF_JMP | BPF_JCOND for may_goto insn.
JCOND stands for conditional pseudo jump.
Since goto_or_nop insn was proposed, it may use the same opcode.
may_goto vs goto_or_nop can be distinguished by src_reg:
code = BPF_JMP | BPF_JCOND
src_reg = 0 - may_goto
src_reg = 1 - goto_or_nop

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Tested-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20240306031929.42666-2-alexei.starovoitov@gmail.com
---
 include/linux/bpf_verifier.h   |   2 +
 include/uapi/linux/bpf.h       |   5 ++
 kernel/bpf/core.c              |   1 +
 kernel/bpf/disasm.c            |   4 +
 kernel/bpf/verifier.c          | 163 +++++++++++++++++++++++++++++++++--------
 tools/include/uapi/linux/bpf.h |   5 ++
 6 files changed, 150 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 84365e6dd85d..4b0f6600e499 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -449,6 +449,7 @@ struct bpf_verifier_state {
 	u32 jmp_history_cnt;
 	u32 dfs_depth;
 	u32 callback_unroll_depth;
+	u32 may_goto_depth;
 };
 
 #define bpf_get_spilled_reg(slot, frame, mask)				\
@@ -619,6 +620,7 @@ struct bpf_subprog_info {
 	u32 start; /* insn idx of function entry point */
 	u32 linfo_idx; /* The idx to the main_prog->aux->linfo */
 	u16 stack_depth; /* max. stack depth used by this function */
+	u16 stack_extra;
 	bool has_tail_call: 1;
 	bool tail_call_reachable: 1;
 	bool has_ld_abs: 1;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index a241f407c234..85ec7fc799d7 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -42,6 +42,7 @@
 #define BPF_JSGE	0x70	/* SGE is signed '>=', GE in x86 */
 #define BPF_JSLT	0xc0	/* SLT is signed, '<' */
 #define BPF_JSLE	0xd0	/* SLE is signed, '<=' */
+#define BPF_JCOND	0xe0	/* conditional pseudo jumps: may_goto, goto_or_nop */
 #define BPF_CALL	0x80	/* function call */
 #define BPF_EXIT	0x90	/* function return */
 
@@ -50,6 +51,10 @@
 #define BPF_XCHG	(0xe0 | BPF_FETCH)	/* atomic exchange */
 #define BPF_CMPXCHG	(0xf0 | BPF_FETCH)	/* atomic compare-and-write */
 
+enum bpf_cond_pseudo_jmp {
+	BPF_MAY_GOTO = 0,
+};
+
 /* Register numbers */
 enum {
 	BPF_REG_0 = 0,
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 71c459a51d9e..9ee4536d0a09 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1675,6 +1675,7 @@ bool bpf_opcode_in_insntable(u8 code)
 		[BPF_LD | BPF_IND | BPF_B] = true,
 		[BPF_LD | BPF_IND | BPF_H] = true,
 		[BPF_LD | BPF_IND | BPF_W] = true,
+		[BPF_JMP | BPF_JCOND] = true,
 	};
 #undef BPF_INSN_3_TBL
 #undef BPF_INSN_2_TBL
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index 49940c26a227..82b2dbdd048f 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -322,6 +322,10 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
 		} else if (insn->code == (BPF_JMP | BPF_JA)) {
 			verbose(cbs->private_data, "(%02x) goto pc%+d\n",
 				insn->code, insn->off);
+		} else if (insn->code == (BPF_JMP | BPF_JCOND) &&
+			   insn->src_reg == BPF_MAY_GOTO) {
+			verbose(cbs->private_data, "(%02x) may_goto pc%+d\n",
+				insn->code, insn->off);
 		} else if (insn->code == (BPF_JMP32 | BPF_JA)) {
 			verbose(cbs->private_data, "(%02x) gotol pc%+d\n",
 				insn->code, insn->imm);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 4dd84e13bbfe..8030b50d3b45 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -533,6 +533,16 @@ static bool is_async_callback_calling_insn(struct bpf_insn *insn)
 	return bpf_helper_call(insn) && is_async_callback_calling_function(insn->imm);
 }
 
+static bool is_may_goto_insn(struct bpf_insn *insn)
+{
+	return insn->code == (BPF_JMP | BPF_JCOND) && insn->src_reg == BPF_MAY_GOTO;
+}
+
+static bool is_may_goto_insn_at(struct bpf_verifier_env *env, int insn_idx)
+{
+	return is_may_goto_insn(&env->prog->insnsi[insn_idx]);
+}
+
 static bool is_storage_get_function(enum bpf_func_id func_id)
 {
 	return func_id == BPF_FUNC_sk_storage_get ||
@@ -1429,6 +1439,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
 	dst_state->dfs_depth = src->dfs_depth;
 	dst_state->callback_unroll_depth = src->callback_unroll_depth;
 	dst_state->used_as_loop_entry = src->used_as_loop_entry;
+	dst_state->may_goto_depth = src->may_goto_depth;
 	for (i = 0; i <= src->curframe; i++) {
 		dst = dst_state->frame[i];
 		if (!dst) {
@@ -14871,11 +14882,36 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 	int err;
 
 	/* Only conditional jumps are expected to reach here. */
-	if (opcode == BPF_JA || opcode > BPF_JSLE) {
+	if (opcode == BPF_JA || opcode > BPF_JCOND) {
 		verbose(env, "invalid BPF_JMP/JMP32 opcode %x\n", opcode);
 		return -EINVAL;
 	}
 
+	if (opcode == BPF_JCOND) {
+		struct bpf_verifier_state *cur_st = env->cur_state, *queued_st, *prev_st;
+		int idx = *insn_idx;
+
+		if (insn->code != (BPF_JMP | BPF_JCOND) ||
+		    insn->src_reg != BPF_MAY_GOTO ||
+		    insn->dst_reg || insn->imm || insn->off == 0) {
+			verbose(env, "invalid may_goto off %d imm %d\n",
+				insn->off, insn->imm);
+			return -EINVAL;
+		}
+		prev_st = find_prev_entry(env, cur_st->parent, idx);
+
+		/* branch out 'fallthrough' insn as a new state to explore */
+		queued_st = push_stack(env, idx + 1, idx, false);
+		if (!queued_st)
+			return -ENOMEM;
+
+		queued_st->may_goto_depth++;
+		if (prev_st)
+			widen_imprecise_scalars(env, prev_st, queued_st);
+		*insn_idx += insn->off;
+		return 0;
+	}
+
 	/* check src2 operand */
 	err = check_reg_arg(env, insn->dst_reg, SRC_OP);
 	if (err)
@@ -15659,6 +15695,8 @@ static int visit_insn(int t, struct bpf_verifier_env *env)
 	default:
 		/* conditional jump with two edges */
 		mark_prune_point(env, t);
+		if (is_may_goto_insn(insn))
+			mark_force_checkpoint(env, t);
 
 		ret = push_insn(t, t + 1, FALLTHROUGH, env);
 		if (ret)
@@ -17135,6 +17173,13 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 				}
 				goto skip_inf_loop_check;
 			}
+			if (is_may_goto_insn_at(env, insn_idx)) {
+				if (states_equal(env, &sl->state, cur, true)) {
+					update_loop_entry(cur, &sl->state);
+					goto hit;
+				}
+				goto skip_inf_loop_check;
+			}
 			if (calls_callback(env, insn_idx)) {
 				if (states_equal(env, &sl->state, cur, true))
 					goto hit;
@@ -17144,6 +17189,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 			if (states_maybe_looping(&sl->state, cur) &&
 			    states_equal(env, &sl->state, cur, true) &&
 			    !iter_active_depths_differ(&sl->state, cur) &&
+			    sl->state.may_goto_depth == cur->may_goto_depth &&
 			    sl->state.callback_unroll_depth == cur->callback_unroll_depth) {
 				verbose_linfo(env, insn_idx, "; ");
 				verbose(env, "infinite loop detected at insn %d\n", insn_idx);
@@ -19408,7 +19454,10 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 	struct bpf_insn insn_buf[16];
 	struct bpf_prog *new_prog;
 	struct bpf_map *map_ptr;
-	int i, ret, cnt, delta = 0;
+	int i, ret, cnt, delta = 0, cur_subprog = 0;
+	struct bpf_subprog_info *subprogs = env->subprog_info;
+	u16 stack_depth = subprogs[cur_subprog].stack_depth;
+	u16 stack_depth_extra = 0;
 
 	if (env->seen_exception && !env->exception_callback_subprog) {
 		struct bpf_insn patch[] = {
@@ -19428,7 +19477,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 		mark_subprog_exc_cb(env, env->exception_callback_subprog);
 	}
 
-	for (i = 0; i < insn_cnt; i++, insn++) {
+	for (i = 0; i < insn_cnt;) {
 		/* Make divide-by-zero exceptions impossible. */
 		if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) ||
 		    insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) ||
@@ -19467,7 +19516,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 			delta    += cnt - 1;
 			env->prog = prog = new_prog;
 			insn      = new_prog->insnsi + i + delta;
-			continue;
+			goto next_insn;
 		}
 
 		/* Implement LD_ABS and LD_IND with a rewrite, if supported by the program type. */
@@ -19487,7 +19536,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 			delta    += cnt - 1;
 			env->prog = prog = new_prog;
 			insn      = new_prog->insnsi + i + delta;
-			continue;
+			goto next_insn;
 		}
 
 		/* Rewrite pointer arithmetic to mitigate speculation attacks. */
@@ -19502,7 +19551,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 			aux = &env->insn_aux_data[i + delta];
 			if (!aux->alu_state ||
 			    aux->alu_state == BPF_ALU_NON_POINTER)
-				continue;
+				goto next_insn;
 
 			isneg = aux->alu_state & BPF_ALU_NEG_VALUE;
 			issrc = (aux->alu_state & BPF_ALU_SANITIZE) ==
@@ -19540,19 +19589,39 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 			delta    += cnt - 1;
 			env->prog = prog = new_prog;
 			insn      = new_prog->insnsi + i + delta;
-			continue;
+			goto next_insn;
+		}
+
+		if (is_may_goto_insn(insn)) {
+			int stack_off = -stack_depth - 8;
+
+			stack_depth_extra = 8;
+			insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_AX, BPF_REG_10, stack_off);
+			insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off + 2);
+			insn_buf[2] = BPF_ALU64_IMM(BPF_SUB, BPF_REG_AX, 1);
+			insn_buf[3] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_AX, stack_off);
+			cnt = 4;
+
+			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+			if (!new_prog)
+				return -ENOMEM;
+
+			delta += cnt - 1;
+			env->prog = prog = new_prog;
+			insn = new_prog->insnsi + i + delta;
+			goto next_insn;
 		}
 
 		if (insn->code != (BPF_JMP | BPF_CALL))
-			continue;
+			goto next_insn;
 		if (insn->src_reg == BPF_PSEUDO_CALL)
-			continue;
+			goto next_insn;
 		if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
 			ret = fixup_kfunc_call(env, insn, insn_buf, i + delta, &cnt);
 			if (ret)
 				return ret;
 			if (cnt == 0)
-				continue;
+				goto next_insn;
 
 			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
 			if (!new_prog)
@@ -19561,7 +19630,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 			delta	 += cnt - 1;
 			env->prog = prog = new_prog;
 			insn	  = new_prog->insnsi + i + delta;
-			continue;
+			goto next_insn;
 		}
 
 		if (insn->imm == BPF_FUNC_get_route_realm)
@@ -19609,11 +19678,11 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 				}
 
 				insn->imm = ret + 1;
-				continue;
+				goto next_insn;
 			}
 
 			if (!bpf_map_ptr_unpriv(aux))
-				continue;
+				goto next_insn;
 
 			/* instead of changing every JIT dealing with tail_call
 			 * emit two extra insns:
@@ -19642,7 +19711,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 			delta    += cnt - 1;
 			env->prog = prog = new_prog;
 			insn      = new_prog->insnsi + i + delta;
-			continue;
+			goto next_insn;
 		}
 
 		if (insn->imm == BPF_FUNC_timer_set_callback) {
@@ -19754,7 +19823,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 				delta    += cnt - 1;
 				env->prog = prog = new_prog;
 				insn      = new_prog->insnsi + i + delta;
-				continue;
+				goto next_insn;
 			}
 
 			BUILD_BUG_ON(!__same_type(ops->map_lookup_elem,
@@ -19785,31 +19854,31 @@ patch_map_ops_generic:
 			switch (insn->imm) {
 			case BPF_FUNC_map_lookup_elem:
 				insn->imm = BPF_CALL_IMM(ops->map_lookup_elem);
-				continue;
+				goto next_insn;
 			case BPF_FUNC_map_update_elem:
 				insn->imm = BPF_CALL_IMM(ops->map_update_elem);
-				continue;
+				goto next_insn;
 			case BPF_FUNC_map_delete_elem:
 				insn->imm = BPF_CALL_IMM(ops->map_delete_elem);
-				continue;
+				goto next_insn;
 			case BPF_FUNC_map_push_elem:
 				insn->imm = BPF_CALL_IMM(ops->map_push_elem);
-				continue;
+				goto next_insn;
 			case BPF_FUNC_map_pop_elem:
 				insn->imm = BPF_CALL_IMM(ops->map_pop_elem);
-				continue;
+				goto next_insn;
 			case BPF_FUNC_map_peek_elem:
 				insn->imm = BPF_CALL_IMM(ops->map_peek_elem);
-				continue;
+				goto next_insn;
 			case BPF_FUNC_redirect_map:
 				insn->imm = BPF_CALL_IMM(ops->map_redirect);
-				continue;
+				goto next_insn;
 			case BPF_FUNC_for_each_map_elem:
 				insn->imm = BPF_CALL_IMM(ops->map_for_each_callback);
-				continue;
+				goto next_insn;
 			case BPF_FUNC_map_lookup_percpu_elem:
 				insn->imm = BPF_CALL_IMM(ops->map_lookup_percpu_elem);
-				continue;
+				goto next_insn;
 			}
 
 			goto patch_call_imm;
@@ -19837,7 +19906,7 @@ patch_map_ops_generic:
 			delta    += cnt - 1;
 			env->prog = prog = new_prog;
 			insn      = new_prog->insnsi + i + delta;
-			continue;
+			goto next_insn;
 		}
 
 		/* Implement bpf_get_func_arg inline. */
@@ -19862,7 +19931,7 @@ patch_map_ops_generic:
 			delta    += cnt - 1;
 			env->prog = prog = new_prog;
 			insn      = new_prog->insnsi + i + delta;
-			continue;
+			goto next_insn;
 		}
 
 		/* Implement bpf_get_func_ret inline. */
@@ -19890,7 +19959,7 @@ patch_map_ops_generic:
 			delta    += cnt - 1;
 			env->prog = prog = new_prog;
 			insn      = new_prog->insnsi + i + delta;
-			continue;
+			goto next_insn;
 		}
 
 		/* Implement get_func_arg_cnt inline. */
@@ -19905,7 +19974,7 @@ patch_map_ops_generic:
 
 			env->prog = prog = new_prog;
 			insn      = new_prog->insnsi + i + delta;
-			continue;
+			goto next_insn;
 		}
 
 		/* Implement bpf_get_func_ip inline. */
@@ -19920,7 +19989,7 @@ patch_map_ops_generic:
 
 			env->prog = prog = new_prog;
 			insn      = new_prog->insnsi + i + delta;
-			continue;
+			goto next_insn;
 		}
 
 		/* Implement bpf_kptr_xchg inline */
@@ -19938,7 +20007,7 @@ patch_map_ops_generic:
 			delta    += cnt - 1;
 			env->prog = prog = new_prog;
 			insn      = new_prog->insnsi + i + delta;
-			continue;
+			goto next_insn;
 		}
 patch_call_imm:
 		fn = env->ops->get_func_proto(insn->imm, env->prog);
@@ -19952,6 +20021,40 @@ patch_call_imm:
 			return -EFAULT;
 		}
 		insn->imm = fn->func - __bpf_call_base;
+next_insn:
+		if (subprogs[cur_subprog + 1].start == i + delta + 1) {
+			subprogs[cur_subprog].stack_depth += stack_depth_extra;
+			subprogs[cur_subprog].stack_extra = stack_depth_extra;
+			cur_subprog++;
+			stack_depth = subprogs[cur_subprog].stack_depth;
+			stack_depth_extra = 0;
+		}
+		i++;
+		insn++;
+	}
+
+	env->prog->aux->stack_depth = subprogs[0].stack_depth;
+	for (i = 0; i < env->subprog_cnt; i++) {
+		int subprog_start = subprogs[i].start;
+		int stack_slots = subprogs[i].stack_extra / 8;
+
+		if (!stack_slots)
+			continue;
+		if (stack_slots > 1) {
+			verbose(env, "verifier bug: stack_slots supports may_goto only\n");
+			return -EFAULT;
+		}
+
+		/* Add ST insn to subprog prologue to init extra stack */
+		insn_buf[0] = BPF_ST_MEM(BPF_DW, BPF_REG_FP,
+					 -subprogs[i].stack_depth, BPF_MAX_LOOPS);
+		/* Copy first actual insn to preserve it */
+		insn_buf[1] = env->prog->insnsi[subprog_start];
+
+		new_prog = bpf_patch_insn_data(env, subprog_start, insn_buf, 2);
+		if (!new_prog)
+			return -ENOMEM;
+		env->prog = prog = new_prog;
 	}
 
 	/* Since poke tab is now finalized, publish aux to tracker. */
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index a241f407c234..85ec7fc799d7 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -42,6 +42,7 @@
 #define BPF_JSGE	0x70	/* SGE is signed '>=', GE in x86 */
 #define BPF_JSLT	0xc0	/* SLT is signed, '<' */
 #define BPF_JSLE	0xd0	/* SLE is signed, '<=' */
+#define BPF_JCOND	0xe0	/* conditional pseudo jumps: may_goto, goto_or_nop */
 #define BPF_CALL	0x80	/* function call */
 #define BPF_EXIT	0x90	/* function return */
 
@@ -50,6 +51,10 @@
 #define BPF_XCHG	(0xe0 | BPF_FETCH)	/* atomic exchange */
 #define BPF_CMPXCHG	(0xf0 | BPF_FETCH)	/* atomic compare-and-write */
 
+enum bpf_cond_pseudo_jmp {
+	BPF_MAY_GOTO = 0,
+};
+
 /* Register numbers */
 enum {
 	BPF_REG_0 = 0,
-- 
cgit v1.2.3


From a0873a5d542559698edfd4c8fc6e6636d338eea2 Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@nvidia.com>
Date: Wed, 14 Feb 2024 19:08:00 -0800
Subject: net/mlx5: Add MPIR bit in mcam_access_reg

Add a cap bit in mcam_access_reg to check for MPIR support.

Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Gal Pressman <gal@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/mlx5_ifc.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 628a3aa7a7e0..2756bdb654b4 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -10253,7 +10253,9 @@ struct mlx5_ifc_mcam_access_reg_bits {
 	u8         mcqi[0x1];
 	u8         mcqs[0x1];
 
-	u8         regs_95_to_87[0x9];
+	u8         regs_95_to_90[0x6];
+	u8         mpir[0x1];
+	u8         regs_88_to_87[0x2];
 	u8         mpegc[0x1];
 	u8         mtutc[0x1];
 	u8         regs_84_to_68[0x11];
-- 
cgit v1.2.3


From ed29705e4ed1d5c1b2184fecc4684bd56c5d24ee Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@nvidia.com>
Date: Wed, 14 Feb 2024 19:08:13 -0800
Subject: net/mlx5: Enable SD feature

Have an actual mlx5_sd instance in the core device, and fix the getter
accordingly. This allows SD stuff to flow, the feature becomes supported
only here.

Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Gal Pressman <gal@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h | 3 ++-
 include/linux/mlx5/driver.h                        | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h
index 0810b92b48d0..37d5f445598c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h
@@ -59,10 +59,11 @@ struct mlx5_sd;
 
 static inline struct mlx5_sd *mlx5_get_sd(struct mlx5_core_dev *dev)
 {
-	return NULL;
+	return dev->sd;
 }
 
 static inline void mlx5_set_sd(struct mlx5_core_dev *dev, struct mlx5_sd *sd)
 {
+	dev->sd = sd;
 }
 #endif
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 41f03b352401..bf9324a31ae9 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -823,6 +823,7 @@ struct mlx5_core_dev {
 	struct blocking_notifier_head macsec_nh;
 #endif
 	u64 num_ipsec_offloads;
+	struct mlx5_sd          *sd;
 };
 
 struct mlx5_db {
-- 
cgit v1.2.3


From 68ac1e46425c54653ddb5f559bc37abe19071024 Mon Sep 17 00:00:00 2001
From: Maxime Chevallier <maxime.chevallier@bootlin.com>
Date: Fri, 1 Mar 2024 17:43:08 +0100
Subject: net: phylink: clean the pcs_get_state documentation

commit 4d72c3bb60dd ("net: phylink: strip out pre-March 2020 legacy code")
dropped the mac_pcs_get_state ops in phylink_mac_ops in favor of
dedicated PCS operation pcs_get_state. However, the documentation for
the pcs_get_state ops was incorrectly converted and now self-references.

Drop the extra comment.

Signed-off-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/phylink.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/phylink.h b/include/linux/phylink.h
index 6ba411732a0d..9a57deefcb07 100644
--- a/include/linux/phylink.h
+++ b/include/linux/phylink.h
@@ -480,9 +480,6 @@ void pcs_disable(struct phylink_pcs *pcs);
  * negotiation completion state in @state->an_complete, and link up state
  * in @state->link. If possible, @state->lp_advertising should also be
  * populated.
- *
- * When present, this overrides pcs_get_state() in &struct
- * phylink_pcs_ops.
  */
 void pcs_get_state(struct phylink_pcs *pcs,
 		   struct phylink_link_state *state);
-- 
cgit v1.2.3


From 4e441bb8aca1964460e0210bdfd9b89f18a0fd16 Mon Sep 17 00:00:00 2001
From: Jason Xing <kernelxing@tencent.com>
Date: Mon, 4 Mar 2024 17:29:33 +0800
Subject: tcp: add tracing of skb/skaddr in tcp_event_sk_skb class

Printing the addresses can help us identify the exact skb/sk
for those system in which it's not that easy to run BPF program.
As we can see, it already fetches those, then use it directly
and it will print like below:

...tcp_retransmit_skb: skbaddr=XXX skaddr=XXX family=AF_INET...

Signed-off-by: Jason Xing <kernelxing@tencent.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/trace/events/tcp.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index 7b1ddffa3dfc..ac36067ae066 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -88,7 +88,8 @@ DECLARE_EVENT_CLASS(tcp_event_sk_skb,
 			      sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);
 	),
 
-	TP_printk("family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c state=%s",
+	TP_printk("skbaddr=%p skaddr=%p family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c state=%s",
+		  __entry->skbaddr, __entry->skaddr,
 		  show_family_name(__entry->family),
 		  __entry->sport, __entry->dport, __entry->saddr, __entry->daddr,
 		  __entry->saddr_v6, __entry->daddr_v6,
-- 
cgit v1.2.3


From 0ab544b6f055f234f708f294f4a6e47262573a4c Mon Sep 17 00:00:00 2001
From: Jason Xing <kernelxing@tencent.com>
Date: Mon, 4 Mar 2024 17:29:34 +0800
Subject: tcp: add tracing of skbaddr in tcp_event_skb class

Use the existing parameter and print the address of skbaddr
as other trace functions do.

Signed-off-by: Jason Xing <kernelxing@tencent.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/trace/events/tcp.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index ac36067ae066..6ca3e0343666 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -362,7 +362,8 @@ DECLARE_EVENT_CLASS(tcp_event_skb,
 		TP_STORE_ADDR_PORTS_SKB(__entry, skb);
 	),
 
-	TP_printk("src=%pISpc dest=%pISpc", __entry->saddr, __entry->daddr)
+	TP_printk("skbaddr=%p src=%pISpc dest=%pISpc",
+		  __entry->skbaddr, __entry->saddr, __entry->daddr)
 );
 
 DEFINE_EVENT(tcp_event_skb, tcp_bad_csum,
-- 
cgit v1.2.3


From 42ed95de82c01184a88945d3ca274be6a7ea607d Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Thu, 7 Mar 2024 11:58:17 -0500
Subject: Bluetooth: ISO: Align broadcast sync_timeout with connection timeout

This aligns broadcast sync_timeout with existing connection timeouts
which are 20 seconds long.

Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/bluetooth.h | 2 ++
 net/bluetooth/iso.c               | 4 ++--
 2 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h
index 7ffa8c192c3f..9fe95a22abeb 100644
--- a/include/net/bluetooth/bluetooth.h
+++ b/include/net/bluetooth/bluetooth.h
@@ -164,6 +164,8 @@ struct bt_voice {
 #define BT_ISO_QOS_BIG_UNSET	0xff
 #define BT_ISO_QOS_BIS_UNSET	0xff
 
+#define BT_ISO_SYNC_TIMEOUT	0x07d0 /* 20 secs */
+
 struct bt_iso_io_qos {
 	__u32 interval;
 	__u16 latency;
diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c
index 8af75d37b14c..c8793e57f4b5 100644
--- a/net/bluetooth/iso.c
+++ b/net/bluetooth/iso.c
@@ -834,10 +834,10 @@ static struct bt_iso_qos default_qos = {
 		.bcode			= {0x00},
 		.options		= 0x00,
 		.skip			= 0x0000,
-		.sync_timeout		= 0x4000,
+		.sync_timeout		= BT_ISO_SYNC_TIMEOUT,
 		.sync_cte_type		= 0x00,
 		.mse			= 0x00,
-		.timeout		= 0x4000,
+		.timeout		= BT_ISO_SYNC_TIMEOUT,
 	},
 };
 
-- 
cgit v1.2.3


From cf2c2e4a3d910270903d50462aaa75140cdb2c96 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Wed, 6 Mar 2024 19:12:25 -0800
Subject: bpf: Plumb get_unmapped_area() callback into bpf_map_ops

Subsequent patches introduce bpf_arena that imposes special alignment
requirements on address selection.

Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/r/20240307031228.42896-4-alexei.starovoitov@gmail.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/linux/bpf.h  |  3 +++
 kernel/bpf/syscall.c | 16 ++++++++++++++++
 2 files changed, 19 insertions(+)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 785660810e6a..95e07673cdc1 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -139,6 +139,9 @@ struct bpf_map_ops {
 	int (*map_mmap)(struct bpf_map *map, struct vm_area_struct *vma);
 	__poll_t (*map_poll)(struct bpf_map *map, struct file *filp,
 			     struct poll_table_struct *pts);
+	unsigned long (*map_get_unmapped_area)(struct file *filep, unsigned long addr,
+					       unsigned long len, unsigned long pgoff,
+					       unsigned long flags);
 
 	/* Functions called by bpf_local_storage maps */
 	int (*map_local_storage_charge)(struct bpf_local_storage_map *smap,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index b2750b79ac80..f63f4da4db5e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -937,6 +937,21 @@ static __poll_t bpf_map_poll(struct file *filp, struct poll_table_struct *pts)
 	return EPOLLERR;
 }
 
+static unsigned long bpf_get_unmapped_area(struct file *filp, unsigned long addr,
+					   unsigned long len, unsigned long pgoff,
+					   unsigned long flags)
+{
+	struct bpf_map *map = filp->private_data;
+
+	if (map->ops->map_get_unmapped_area)
+		return map->ops->map_get_unmapped_area(filp, addr, len, pgoff, flags);
+#ifdef CONFIG_MMU
+	return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
+#else
+	return addr;
+#endif
+}
+
 const struct file_operations bpf_map_fops = {
 #ifdef CONFIG_PROC_FS
 	.show_fdinfo	= bpf_map_show_fdinfo,
@@ -946,6 +961,7 @@ const struct file_operations bpf_map_fops = {
 	.write		= bpf_dummy_write,
 	.mmap		= bpf_map_mmap,
 	.poll		= bpf_map_poll,
+	.get_unmapped_area = bpf_get_unmapped_area,
 };
 
 int bpf_map_new_fd(struct bpf_map *map, int flags)
-- 
cgit v1.2.3


From 2658b5a8a4eee5fad378d0bde2f221deacbc58f1 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 6 Mar 2024 16:00:14 +0000
Subject: net: introduce struct net_hotdata

Instead of spreading networking critical fields
all over the places, add a custom net_hotdata
structure so that we can precisely control its layout.

In this first patch, move :

- gro_normal_batch used in rx (GRO stack)
- offload_base used in rx and tx (GRO and TSO stacks)

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20240306160031.874438-2-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h  |  1 -
 include/net/gro.h          |  5 ++---
 include/net/hotdata.h      | 15 +++++++++++++++
 net/core/Makefile          |  1 +
 net/core/gro.c             | 15 ++++++---------
 net/core/gso.c             |  4 ++--
 net/core/hotdata.c         |  9 +++++++++
 net/core/sysctl_net_core.c |  3 ++-
 8 files changed, 37 insertions(+), 16 deletions(-)
 create mode 100644 include/net/hotdata.h
 create mode 100644 net/core/hotdata.c

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2767467138a0..6643452af543 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4796,7 +4796,6 @@ void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s);
 extern int		netdev_max_backlog;
 extern int		dev_rx_weight;
 extern int		dev_tx_weight;
-extern int		gro_normal_batch;
 
 enum {
 	NESTED_SYNC_IMM_BIT,
diff --git a/include/net/gro.h b/include/net/gro.h
index 2b58671a6549..d6fc8fbd3730 100644
--- a/include/net/gro.h
+++ b/include/net/gro.h
@@ -9,6 +9,7 @@
 #include <net/ip6_checksum.h>
 #include <linux/skbuff.h>
 #include <net/udp.h>
+#include <net/hotdata.h>
 
 struct napi_gro_cb {
 	union {
@@ -446,7 +447,7 @@ static inline void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb,
 {
 	list_add_tail(&skb->list, &napi->rx_list);
 	napi->rx_count += segs;
-	if (napi->rx_count >= READ_ONCE(gro_normal_batch))
+	if (napi->rx_count >= READ_ONCE(net_hotdata.gro_normal_batch))
 		gro_normal_list(napi);
 }
 
@@ -493,6 +494,4 @@ static inline void inet6_get_iif_sdif(const struct sk_buff *skb, int *iif, int *
 #endif
 }
 
-extern struct list_head offload_base;
-
 #endif /* _NET_IPV6_GRO_H */
diff --git a/include/net/hotdata.h b/include/net/hotdata.h
new file mode 100644
index 000000000000..6ed32e4e34aa
--- /dev/null
+++ b/include/net/hotdata.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _NET_HOTDATA_H
+#define _NET_HOTDATA_H
+
+#include <linux/types.h>
+
+/* Read mostly data used in network fast paths. */
+struct net_hotdata {
+	struct list_head	offload_base;
+	int			gro_normal_batch;
+};
+
+extern struct net_hotdata net_hotdata;
+
+#endif /* _NET_HOTDATA_H */
diff --git a/net/core/Makefile b/net/core/Makefile
index 821aec06abf1..6e6548011fae 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -18,6 +18,7 @@ obj-y		     += dev.o dev_addr_lists.o dst.o netevent.o \
 obj-$(CONFIG_NETDEV_ADDR_LIST_TEST) += dev_addr_lists_test.o
 
 obj-y += net-sysfs.o
+obj-y += hotdata.o
 obj-$(CONFIG_PAGE_POOL) += page_pool.o page_pool_user.o
 obj-$(CONFIG_PROC_FS) += net-procfs.o
 obj-$(CONFIG_NET_PKTGEN) += pktgen.o
diff --git a/net/core/gro.c b/net/core/gro.c
index 6a0edbd826a1..ee30d4f0c038 100644
--- a/net/core/gro.c
+++ b/net/core/gro.c
@@ -10,9 +10,6 @@
 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 
 static DEFINE_SPINLOCK(offload_lock);
-struct list_head offload_base __read_mostly = LIST_HEAD_INIT(offload_base);
-/* Maximum number of GRO_NORMAL skbs to batch up for list-RX */
-int gro_normal_batch __read_mostly = 8;
 
 /**
  *	dev_add_offload - register offload handlers
@@ -31,7 +28,7 @@ void dev_add_offload(struct packet_offload *po)
 	struct packet_offload *elem;
 
 	spin_lock(&offload_lock);
-	list_for_each_entry(elem, &offload_base, list) {
+	list_for_each_entry(elem, &net_hotdata.offload_base, list) {
 		if (po->priority < elem->priority)
 			break;
 	}
@@ -55,7 +52,7 @@ EXPORT_SYMBOL(dev_add_offload);
  */
 static void __dev_remove_offload(struct packet_offload *po)
 {
-	struct list_head *head = &offload_base;
+	struct list_head *head = &net_hotdata.offload_base;
 	struct packet_offload *po1;
 
 	spin_lock(&offload_lock);
@@ -235,9 +232,9 @@ done:
 
 static void napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb)
 {
+	struct list_head *head = &net_hotdata.offload_base;
 	struct packet_offload *ptype;
 	__be16 type = skb->protocol;
-	struct list_head *head = &offload_base;
 	int err = -ENOENT;
 
 	BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
@@ -444,7 +441,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
 {
 	u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
 	struct gro_list *gro_list = &napi->gro_hash[bucket];
-	struct list_head *head = &offload_base;
+	struct list_head *head = &net_hotdata.offload_base;
 	struct packet_offload *ptype;
 	__be16 type = skb->protocol;
 	struct sk_buff *pp = NULL;
@@ -550,7 +547,7 @@ normal:
 
 struct packet_offload *gro_find_receive_by_type(__be16 type)
 {
-	struct list_head *offload_head = &offload_base;
+	struct list_head *offload_head = &net_hotdata.offload_base;
 	struct packet_offload *ptype;
 
 	list_for_each_entry_rcu(ptype, offload_head, list) {
@@ -564,7 +561,7 @@ EXPORT_SYMBOL(gro_find_receive_by_type);
 
 struct packet_offload *gro_find_complete_by_type(__be16 type)
 {
-	struct list_head *offload_head = &offload_base;
+	struct list_head *offload_head = &net_hotdata.offload_base;
 	struct packet_offload *ptype;
 
 	list_for_each_entry_rcu(ptype, offload_head, list) {
diff --git a/net/core/gso.c b/net/core/gso.c
index 9e1803bfc9c6..bcd156372f4d 100644
--- a/net/core/gso.c
+++ b/net/core/gso.c
@@ -17,7 +17,7 @@ struct sk_buff *skb_eth_gso_segment(struct sk_buff *skb,
 	struct packet_offload *ptype;
 
 	rcu_read_lock();
-	list_for_each_entry_rcu(ptype, &offload_base, list) {
+	list_for_each_entry_rcu(ptype, &net_hotdata.offload_base, list) {
 		if (ptype->type == type && ptype->callbacks.gso_segment) {
 			segs = ptype->callbacks.gso_segment(skb, features);
 			break;
@@ -48,7 +48,7 @@ struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
 	__skb_pull(skb, vlan_depth);
 
 	rcu_read_lock();
-	list_for_each_entry_rcu(ptype, &offload_base, list) {
+	list_for_each_entry_rcu(ptype, &net_hotdata.offload_base, list) {
 		if (ptype->type == type && ptype->callbacks.gso_segment) {
 			segs = ptype->callbacks.gso_segment(skb, features);
 			break;
diff --git a/net/core/hotdata.c b/net/core/hotdata.c
new file mode 100644
index 000000000000..abb8ad19d59a
--- /dev/null
+++ b/net/core/hotdata.c
@@ -0,0 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <net/hotdata.h>
+#include <linux/cache.h>
+#include <linux/list.h>
+
+struct net_hotdata net_hotdata __cacheline_aligned = {
+	.offload_base = LIST_HEAD_INIT(net_hotdata.offload_base),
+	.gro_normal_batch = 8,
+};
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 986f15e5d6c4..0eb1242eabbe 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -23,6 +23,7 @@
 #include <net/net_ratelimit.h>
 #include <net/busy_poll.h>
 #include <net/pkt_sched.h>
+#include <net/hotdata.h>
 
 #include "dev.h"
 
@@ -632,7 +633,7 @@ static struct ctl_table net_core_table[] = {
 	},
 	{
 		.procname	= "gro_normal_batch",
-		.data		= &gro_normal_batch,
+		.data		= &net_hotdata.gro_normal_batch,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-- 
cgit v1.2.3


From ae6e22f7b7f0702015d86cfa036492b94be92f04 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 6 Mar 2024 16:00:15 +0000
Subject: net: move netdev_budget and netdev_budget to net_hotdata

netdev_budget and netdev_budget are used in rx path (net_rx_action())

Move them into net_hotdata for better cache locality.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20240306160031.874438-3-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/hotdata.h      | 2 ++
 net/core/dev.c             | 7 ++-----
 net/core/dev.h             | 2 --
 net/core/hotdata.c         | 6 ++++++
 net/core/sysctl_net_core.c | 4 ++--
 5 files changed, 12 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/net/hotdata.h b/include/net/hotdata.h
index 6ed32e4e34aa..72170223385e 100644
--- a/include/net/hotdata.h
+++ b/include/net/hotdata.h
@@ -8,6 +8,8 @@
 struct net_hotdata {
 	struct list_head	offload_base;
 	int			gro_normal_batch;
+	int			netdev_budget;
+	int			netdev_budget_usecs;
 };
 
 extern struct net_hotdata net_hotdata;
diff --git a/net/core/dev.c b/net/core/dev.c
index bf933eeaa688..9ccb48618dba 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4410,9 +4410,6 @@ EXPORT_SYMBOL(netdev_max_backlog);
 
 int netdev_tstamp_prequeue __read_mostly = 1;
 unsigned int sysctl_skb_defer_max __read_mostly = 64;
-int netdev_budget __read_mostly = 300;
-/* Must be at least 2 jiffes to guarantee 1 jiffy timeout */
-unsigned int __read_mostly netdev_budget_usecs = 2 * USEC_PER_SEC / HZ;
 int weight_p __read_mostly = 64;           /* old backlog weight */
 int dev_weight_rx_bias __read_mostly = 1;  /* bias for backlog weight */
 int dev_weight_tx_bias __read_mostly = 1;  /* bias for output_queue quota */
@@ -6790,8 +6787,8 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
 {
 	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
 	unsigned long time_limit = jiffies +
-		usecs_to_jiffies(READ_ONCE(netdev_budget_usecs));
-	int budget = READ_ONCE(netdev_budget);
+		usecs_to_jiffies(READ_ONCE(net_hotdata.netdev_budget_usecs));
+	int budget = READ_ONCE(net_hotdata.netdev_budget);
 	LIST_HEAD(list);
 	LIST_HEAD(repoll);
 
diff --git a/net/core/dev.h b/net/core/dev.h
index 45892267848d..9a6170530850 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -38,8 +38,6 @@ int dev_addr_init(struct net_device *dev);
 void dev_addr_check(struct net_device *dev);
 
 /* sysctls not referred to from outside net/core/ */
-extern int		netdev_budget;
-extern unsigned int	netdev_budget_usecs;
 extern unsigned int	sysctl_skb_defer_max;
 extern int		netdev_tstamp_prequeue;
 extern int		netdev_unregister_timeout_secs;
diff --git a/net/core/hotdata.c b/net/core/hotdata.c
index abb8ad19d59a..907d69120397 100644
--- a/net/core/hotdata.c
+++ b/net/core/hotdata.c
@@ -1,9 +1,15 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
 #include <net/hotdata.h>
 #include <linux/cache.h>
+#include <linux/jiffies.h>
 #include <linux/list.h>
 
+
 struct net_hotdata net_hotdata __cacheline_aligned = {
 	.offload_base = LIST_HEAD_INIT(net_hotdata.offload_base),
 	.gro_normal_batch = 8,
+
+	.netdev_budget = 300,
+	/* Must be at least 2 jiffes to guarantee 1 jiffy timeout */
+	.netdev_budget_usecs = 2 * USEC_PER_SEC / HZ,
 };
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 0eb1242eabbe..a9c2d798b219 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -577,7 +577,7 @@ static struct ctl_table net_core_table[] = {
 #endif
 	{
 		.procname	= "netdev_budget",
-		.data		= &netdev_budget,
+		.data		= &net_hotdata.netdev_budget,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
@@ -600,7 +600,7 @@ static struct ctl_table net_core_table[] = {
 	},
 	{
 		.procname	= "netdev_budget_usecs",
-		.data		= &netdev_budget_usecs,
+		.data		= &net_hotdata.netdev_budget_usecs,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-- 
cgit v1.2.3


From f59b5416c396ac4910dd7a0cdf26cbb0e1faf529 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 6 Mar 2024 16:00:16 +0000
Subject: net: move netdev_tstamp_prequeue into net_hotdata

netdev_tstamp_prequeue is used in rx path.

Move it to net_hotdata for better cache locality.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20240306160031.874438-4-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/hotdata.h      |  1 +
 net/core/dev.c             | 10 +++++-----
 net/core/dev.h             |  1 -
 net/core/hotdata.c         |  2 ++
 net/core/sysctl_net_core.c |  2 +-
 5 files changed, 9 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/net/hotdata.h b/include/net/hotdata.h
index 72170223385e..149e56528537 100644
--- a/include/net/hotdata.h
+++ b/include/net/hotdata.h
@@ -10,6 +10,7 @@ struct net_hotdata {
 	int			gro_normal_batch;
 	int			netdev_budget;
 	int			netdev_budget_usecs;
+	int			tstamp_prequeue;
 };
 
 extern struct net_hotdata net_hotdata;
diff --git a/net/core/dev.c b/net/core/dev.c
index 9ccb48618dba..f2f44303c035 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4408,7 +4408,6 @@ EXPORT_SYMBOL(__dev_direct_xmit);
 int netdev_max_backlog __read_mostly = 1000;
 EXPORT_SYMBOL(netdev_max_backlog);
 
-int netdev_tstamp_prequeue __read_mostly = 1;
 unsigned int sysctl_skb_defer_max __read_mostly = 64;
 int weight_p __read_mostly = 64;           /* old backlog weight */
 int dev_weight_rx_bias __read_mostly = 1;  /* bias for backlog weight */
@@ -5052,7 +5051,7 @@ static int netif_rx_internal(struct sk_buff *skb)
 {
 	int ret;
 
-	net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb);
+	net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb);
 
 	trace_netif_rx(skb);
 
@@ -5344,7 +5343,7 @@ static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
 	int ret = NET_RX_DROP;
 	__be16 type;
 
-	net_timestamp_check(!READ_ONCE(netdev_tstamp_prequeue), skb);
+	net_timestamp_check(!READ_ONCE(net_hotdata.tstamp_prequeue), skb);
 
 	trace_netif_receive_skb(skb);
 
@@ -5728,7 +5727,7 @@ static int netif_receive_skb_internal(struct sk_buff *skb)
 {
 	int ret;
 
-	net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb);
+	net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb);
 
 	if (skb_defer_rx_timestamp(skb))
 		return NET_RX_SUCCESS;
@@ -5758,7 +5757,8 @@ void netif_receive_skb_list_internal(struct list_head *head)
 
 	INIT_LIST_HEAD(&sublist);
 	list_for_each_entry_safe(skb, next, head, list) {
-		net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb);
+		net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue),
+				    skb);
 		skb_list_del_init(skb);
 		if (!skb_defer_rx_timestamp(skb))
 			list_add_tail(&skb->list, &sublist);
diff --git a/net/core/dev.h b/net/core/dev.h
index 9a6170530850..2bcaf8eee50c 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -39,7 +39,6 @@ void dev_addr_check(struct net_device *dev);
 
 /* sysctls not referred to from outside net/core/ */
 extern unsigned int	sysctl_skb_defer_max;
-extern int		netdev_tstamp_prequeue;
 extern int		netdev_unregister_timeout_secs;
 extern int		weight_p;
 extern int		dev_weight_rx_bias;
diff --git a/net/core/hotdata.c b/net/core/hotdata.c
index 907d69120397..087c4c84987d 100644
--- a/net/core/hotdata.c
+++ b/net/core/hotdata.c
@@ -12,4 +12,6 @@ struct net_hotdata net_hotdata __cacheline_aligned = {
 	.netdev_budget = 300,
 	/* Must be at least 2 jiffes to guarantee 1 jiffy timeout */
 	.netdev_budget_usecs = 2 * USEC_PER_SEC / HZ,
+
+	.tstamp_prequeue = 1,
 };
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index a9c2d798b219..bddd07da0998 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -499,7 +499,7 @@ static struct ctl_table net_core_table[] = {
 #endif
 	{
 		.procname	= "netdev_tstamp_prequeue",
-		.data		= &netdev_tstamp_prequeue,
+		.data		= &net_hotdata.tstamp_prequeue,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
-- 
cgit v1.2.3


From 0b91fa4bfb1caedd01cb6eb3b733cbc77c9edb0e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 6 Mar 2024 16:00:17 +0000
Subject: net: move ptype_all into net_hotdata

ptype_all is used in rx/tx fast paths.

Move it to net_hotdata for better cache locality.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20240306160031.874438-5-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h |  1 -
 include/net/hotdata.h     |  1 +
 net/core/dev.c            | 16 +++++++---------
 net/core/hotdata.c        |  1 +
 net/core/net-procfs.c     |  7 ++++---
 5 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 6643452af543..b18ac8072f18 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -5306,7 +5306,6 @@ static inline const char *netdev_reg_state(const struct net_device *dev)
 #define PTYPE_HASH_SIZE	(16)
 #define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)
 
-extern struct list_head ptype_all __read_mostly;
 extern struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 
 extern struct net_device *blackhole_netdev;
diff --git a/include/net/hotdata.h b/include/net/hotdata.h
index 149e56528537..d462cb8f16ba 100644
--- a/include/net/hotdata.h
+++ b/include/net/hotdata.h
@@ -7,6 +7,7 @@
 /* Read mostly data used in network fast paths. */
 struct net_hotdata {
 	struct list_head	offload_base;
+	struct list_head	ptype_all;
 	int			gro_normal_batch;
 	int			netdev_budget;
 	int			netdev_budget_usecs;
diff --git a/net/core/dev.c b/net/core/dev.c
index f2f44303c035..e1493e558407 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -161,7 +161,6 @@
 
 static DEFINE_SPINLOCK(ptype_lock);
 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
-struct list_head ptype_all __read_mostly;	/* Taps */
 
 static int netif_rx_internal(struct sk_buff *skb);
 static int call_netdevice_notifiers_extack(unsigned long val,
@@ -540,7 +539,7 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 static inline struct list_head *ptype_head(const struct packet_type *pt)
 {
 	if (pt->type == htons(ETH_P_ALL))
-		return pt->dev ? &pt->dev->ptype_all : &ptype_all;
+		return pt->dev ? &pt->dev->ptype_all : &net_hotdata.ptype_all;
 	else
 		return pt->dev ? &pt->dev->ptype_specific :
 				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
@@ -2226,7 +2225,8 @@ static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
  */
 bool dev_nit_active(struct net_device *dev)
 {
-	return !list_empty(&ptype_all) || !list_empty(&dev->ptype_all);
+	return !list_empty(&net_hotdata.ptype_all) ||
+	       !list_empty(&dev->ptype_all);
 }
 EXPORT_SYMBOL_GPL(dev_nit_active);
 
@@ -2237,10 +2237,9 @@ EXPORT_SYMBOL_GPL(dev_nit_active);
 
 void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
 {
-	struct packet_type *ptype;
+	struct list_head *ptype_list = &net_hotdata.ptype_all;
+	struct packet_type *ptype, *pt_prev = NULL;
 	struct sk_buff *skb2 = NULL;
-	struct packet_type *pt_prev = NULL;
-	struct list_head *ptype_list = &ptype_all;
 
 	rcu_read_lock();
 again:
@@ -2286,7 +2285,7 @@ again:
 		pt_prev = ptype;
 	}
 
-	if (ptype_list == &ptype_all) {
+	if (ptype_list == &net_hotdata.ptype_all) {
 		ptype_list = &dev->ptype_all;
 		goto again;
 	}
@@ -5387,7 +5386,7 @@ another_round:
 	if (pfmemalloc)
 		goto skip_taps;
 
-	list_for_each_entry_rcu(ptype, &ptype_all, list) {
+	list_for_each_entry_rcu(ptype, &net_hotdata.ptype_all, list) {
 		if (pt_prev)
 			ret = deliver_skb(skb, pt_prev, orig_dev);
 		pt_prev = ptype;
@@ -11749,7 +11748,6 @@ static int __init net_dev_init(void)
 	if (netdev_kobject_init())
 		goto out;
 
-	INIT_LIST_HEAD(&ptype_all);
 	for (i = 0; i < PTYPE_HASH_SIZE; i++)
 		INIT_LIST_HEAD(&ptype_base[i]);
 
diff --git a/net/core/hotdata.c b/net/core/hotdata.c
index 087c4c84987d..29fcfe89fd9a 100644
--- a/net/core/hotdata.c
+++ b/net/core/hotdata.c
@@ -7,6 +7,7 @@
 
 struct net_hotdata net_hotdata __cacheline_aligned = {
 	.offload_base = LIST_HEAD_INIT(net_hotdata.offload_base),
+	.ptype_all = LIST_HEAD_INIT(net_hotdata.ptype_all),
 	.gro_normal_batch = 8,
 
 	.netdev_budget = 300,
diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
index 2e4e96d30ee1..a97eceb84e61 100644
--- a/net/core/net-procfs.c
+++ b/net/core/net-procfs.c
@@ -3,6 +3,7 @@
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <net/wext.h>
+#include <net/hotdata.h>
 
 #include "dev.h"
 
@@ -183,7 +184,7 @@ static void *ptype_get_idx(struct seq_file *seq, loff_t pos)
 		}
 	}
 
-	list_for_each_entry_rcu(pt, &ptype_all, list) {
+	list_for_each_entry_rcu(pt, &net_hotdata.ptype_all, list) {
 		if (i == pos)
 			return pt;
 		++i;
@@ -231,13 +232,13 @@ static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 			}
 		}
 
-		nxt = ptype_all.next;
+		nxt = net_hotdata.ptype_all.next;
 		goto ptype_all;
 	}
 
 	if (pt->type == htons(ETH_P_ALL)) {
 ptype_all:
-		if (nxt != &ptype_all)
+		if (nxt != &net_hotdata.ptype_all)
 			goto found;
 		hash = 0;
 		nxt = ptype_base[0].next;
-- 
cgit v1.2.3


From edbc666cdcbf4a80ada4311c272a2078af87b880 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 6 Mar 2024 16:00:18 +0000
Subject: net: move netdev_max_backlog to net_hotdata

netdev_max_backlog is used in rx fat path.

Move it to net_hodata for better cache locality.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20240306160031.874438-6-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h  | 1 -
 include/net/hotdata.h      | 1 +
 net/core/dev.c             | 8 +++-----
 net/core/gro_cells.c       | 3 ++-
 net/core/hotdata.c         | 2 ++
 net/core/sysctl_net_core.c | 2 +-
 net/xfrm/espintcp.c        | 4 +++-
 net/xfrm/xfrm_input.c      | 3 ++-
 8 files changed, 14 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index b18ac8072f18..c9a671b7bb37 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4793,7 +4793,6 @@ void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s,
 			   const struct pcpu_sw_netstats __percpu *netstats);
 void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s);
 
-extern int		netdev_max_backlog;
 extern int		dev_rx_weight;
 extern int		dev_tx_weight;
 
diff --git a/include/net/hotdata.h b/include/net/hotdata.h
index d462cb8f16ba..dc50b200a94b 100644
--- a/include/net/hotdata.h
+++ b/include/net/hotdata.h
@@ -12,6 +12,7 @@ struct net_hotdata {
 	int			netdev_budget;
 	int			netdev_budget_usecs;
 	int			tstamp_prequeue;
+	int			max_backlog;
 };
 
 extern struct net_hotdata net_hotdata;
diff --git a/net/core/dev.c b/net/core/dev.c
index e1493e558407..e23edbaff392 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4404,9 +4404,6 @@ EXPORT_SYMBOL(__dev_direct_xmit);
  *			Receiver routines
  *************************************************************************/
 
-int netdev_max_backlog __read_mostly = 1000;
-EXPORT_SYMBOL(netdev_max_backlog);
-
 unsigned int sysctl_skb_defer_max __read_mostly = 64;
 int weight_p __read_mostly = 64;           /* old backlog weight */
 int dev_weight_rx_bias __read_mostly = 1;  /* bias for backlog weight */
@@ -4713,7 +4710,7 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
 	struct softnet_data *sd;
 	unsigned int old_flow, new_flow;
 
-	if (qlen < (READ_ONCE(netdev_max_backlog) >> 1))
+	if (qlen < (READ_ONCE(net_hotdata.max_backlog) >> 1))
 		return false;
 
 	sd = this_cpu_ptr(&softnet_data);
@@ -4761,7 +4758,8 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
 	if (!netif_running(skb->dev))
 		goto drop;
 	qlen = skb_queue_len(&sd->input_pkt_queue);
-	if (qlen <= READ_ONCE(netdev_max_backlog) && !skb_flow_limit(skb, qlen)) {
+	if (qlen <= READ_ONCE(net_hotdata.max_backlog) &&
+	    !skb_flow_limit(skb, qlen)) {
 		if (qlen) {
 enqueue:
 			__skb_queue_tail(&sd->input_pkt_queue, skb);
diff --git a/net/core/gro_cells.c b/net/core/gro_cells.c
index ed5ec5de47f6..ff8e5b64bf6b 100644
--- a/net/core/gro_cells.c
+++ b/net/core/gro_cells.c
@@ -3,6 +3,7 @@
 #include <linux/slab.h>
 #include <linux/netdevice.h>
 #include <net/gro_cells.h>
+#include <net/hotdata.h>
 
 struct gro_cell {
 	struct sk_buff_head	napi_skbs;
@@ -26,7 +27,7 @@ int gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb)
 
 	cell = this_cpu_ptr(gcells->cells);
 
-	if (skb_queue_len(&cell->napi_skbs) > READ_ONCE(netdev_max_backlog)) {
+	if (skb_queue_len(&cell->napi_skbs) > READ_ONCE(net_hotdata.max_backlog)) {
 drop:
 		dev_core_stats_rx_dropped_inc(dev);
 		kfree_skb(skb);
diff --git a/net/core/hotdata.c b/net/core/hotdata.c
index 29fcfe89fd9a..35ed5a83ecc7 100644
--- a/net/core/hotdata.c
+++ b/net/core/hotdata.c
@@ -15,4 +15,6 @@ struct net_hotdata net_hotdata __cacheline_aligned = {
 	.netdev_budget_usecs = 2 * USEC_PER_SEC / HZ,
 
 	.tstamp_prequeue = 1,
+	.max_backlog = 1000,
 };
+EXPORT_SYMBOL(net_hotdata);
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index bddd07da0998..8eaeeb289914 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -440,7 +440,7 @@ static struct ctl_table net_core_table[] = {
 	},
 	{
 		.procname	= "netdev_max_backlog",
-		.data		= &netdev_max_backlog,
+		.data		= &net_hotdata.max_backlog,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
diff --git a/net/xfrm/espintcp.c b/net/xfrm/espintcp.c
index d3b3f9e720b3..fe82e2d07300 100644
--- a/net/xfrm/espintcp.c
+++ b/net/xfrm/espintcp.c
@@ -10,6 +10,7 @@
 #if IS_ENABLED(CONFIG_IPV6)
 #include <net/ipv6_stubs.h>
 #endif
+#include <net/hotdata.h>
 
 static void handle_nonesp(struct espintcp_ctx *ctx, struct sk_buff *skb,
 			  struct sock *sk)
@@ -169,7 +170,8 @@ int espintcp_queue_out(struct sock *sk, struct sk_buff *skb)
 {
 	struct espintcp_ctx *ctx = espintcp_getctx(sk);
 
-	if (skb_queue_len(&ctx->out_queue) >= READ_ONCE(netdev_max_backlog))
+	if (skb_queue_len(&ctx->out_queue) >=
+	    READ_ONCE(net_hotdata.max_backlog))
 		return -ENOBUFS;
 
 	__skb_queue_tail(&ctx->out_queue, skb);
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index bd4ce21d76d7..161f535c8b94 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -21,6 +21,7 @@
 #include <net/ip_tunnels.h>
 #include <net/ip6_tunnel.h>
 #include <net/dst_metadata.h>
+#include <net/hotdata.h>
 
 #include "xfrm_inout.h"
 
@@ -764,7 +765,7 @@ int xfrm_trans_queue_net(struct net *net, struct sk_buff *skb,
 
 	trans = this_cpu_ptr(&xfrm_trans_tasklet);
 
-	if (skb_queue_len(&trans->queue) >= READ_ONCE(netdev_max_backlog))
+	if (skb_queue_len(&trans->queue) >= READ_ONCE(net_hotdata.max_backlog))
 		return -ENOBUFS;
 
 	BUILD_BUG_ON(sizeof(struct xfrm_trans_cb) > sizeof(skb->cb));
-- 
cgit v1.2.3


From 61a0be1a5342045059ce53eabfe6500d499d2f89 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 6 Mar 2024 16:00:19 +0000
Subject: net: move ip_packet_offload and ipv6_packet_offload to net_hotdata

These structures are used in GRO and GSO paths.

v2: ipv6_packet_offload definition depends on CONFIG_INET

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20240306160031.874438-7-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/hotdata.h  |  5 +++++
 net/ipv4/af_inet.c     | 18 +++++++++---------
 net/ipv6/ip6_offload.c | 18 +++++++++---------
 3 files changed, 23 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/net/hotdata.h b/include/net/hotdata.h
index dc50b200a94b..d8ce20d3215d 100644
--- a/include/net/hotdata.h
+++ b/include/net/hotdata.h
@@ -3,9 +3,14 @@
 #define _NET_HOTDATA_H
 
 #include <linux/types.h>
+#include <linux/netdevice.h>
 
 /* Read mostly data used in network fast paths. */
 struct net_hotdata {
+#if IS_ENABLED(CONFIG_INET)
+	struct packet_offload	ip_packet_offload;
+	struct packet_offload	ipv6_packet_offload;
+#endif
 	struct list_head	offload_base;
 	struct list_head	ptype_all;
 	int			gro_normal_batch;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 5daebdcbca32..08dda6955562 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1904,14 +1904,6 @@ static int ipv4_proc_init(void);
  *	IP protocol layer initialiser
  */
 
-static struct packet_offload ip_packet_offload __read_mostly = {
-	.type = cpu_to_be16(ETH_P_IP),
-	.callbacks = {
-		.gso_segment = inet_gso_segment,
-		.gro_receive = inet_gro_receive,
-		.gro_complete = inet_gro_complete,
-	},
-};
 
 static const struct net_offload ipip_offload = {
 	.callbacks = {
@@ -1938,7 +1930,15 @@ static int __init ipv4_offload_init(void)
 	if (ipip_offload_init() < 0)
 		pr_crit("%s: Cannot add IPIP protocol offload\n", __func__);
 
-	dev_add_offload(&ip_packet_offload);
+	net_hotdata.ip_packet_offload = (struct packet_offload) {
+		.type = cpu_to_be16(ETH_P_IP),
+		.callbacks = {
+			.gso_segment = inet_gso_segment,
+			.gro_receive = inet_gro_receive,
+			.gro_complete = inet_gro_complete,
+		},
+	};
+	dev_add_offload(&net_hotdata.ip_packet_offload);
 	return 0;
 }
 
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index cca64c7809be..b41e35af69ea 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -419,14 +419,6 @@ static int ip4ip6_gro_complete(struct sk_buff *skb, int nhoff)
 	return inet_gro_complete(skb, nhoff);
 }
 
-static struct packet_offload ipv6_packet_offload __read_mostly = {
-	.type = cpu_to_be16(ETH_P_IPV6),
-	.callbacks = {
-		.gso_segment = ipv6_gso_segment,
-		.gro_receive = ipv6_gro_receive,
-		.gro_complete = ipv6_gro_complete,
-	},
-};
 
 static struct sk_buff *sit_gso_segment(struct sk_buff *skb,
 				       netdev_features_t features)
@@ -486,7 +478,15 @@ static int __init ipv6_offload_init(void)
 	if (ipv6_exthdrs_offload_init() < 0)
 		pr_crit("%s: Cannot add EXTHDRS protocol offload\n", __func__);
 
-	dev_add_offload(&ipv6_packet_offload);
+	net_hotdata.ipv6_packet_offload = (struct packet_offload) {
+		.type = cpu_to_be16(ETH_P_IPV6),
+		.callbacks = {
+			.gso_segment = ipv6_gso_segment,
+			.gro_receive = ipv6_gro_receive,
+			.gro_complete = ipv6_gro_complete,
+		},
+	};
+	dev_add_offload(&net_hotdata.ipv6_packet_offload);
 
 	inet_add_offload(&sit_offload, IPPROTO_IPV6);
 	inet6_add_offload(&ip6ip6_offload, IPPROTO_IPV6);
-- 
cgit v1.2.3


From 0139806eebd6bcf6a3df98950cd0691aff216304 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 6 Mar 2024 16:00:20 +0000
Subject: net: move tcpv4_offload and tcpv6_offload to net_hotdata

These are used in TCP fast paths.

Move them into net_hotdata for better cache locality.

v2: tcpv6_offload definition depends on CONFIG_INET

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20240306160031.874438-8-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/hotdata.h    |  3 +++
 net/ipv4/tcp_offload.c   | 17 ++++++++---------
 net/ipv6/tcpv6_offload.c | 16 ++++++++--------
 3 files changed, 19 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/net/hotdata.h b/include/net/hotdata.h
index d8ce20d3215d..d86d02f156fc 100644
--- a/include/net/hotdata.h
+++ b/include/net/hotdata.h
@@ -4,12 +4,15 @@
 
 #include <linux/types.h>
 #include <linux/netdevice.h>
+#include <net/protocol.h>
 
 /* Read mostly data used in network fast paths. */
 struct net_hotdata {
 #if IS_ENABLED(CONFIG_INET)
 	struct packet_offload	ip_packet_offload;
+	struct net_offload	tcpv4_offload;
 	struct packet_offload	ipv6_packet_offload;
+	struct net_offload	tcpv6_offload;
 #endif
 	struct list_head	offload_base;
 	struct list_head	ptype_all;
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index b955ab3b236d..ebe4722bb020 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -345,15 +345,14 @@ INDIRECT_CALLABLE_SCOPE int tcp4_gro_complete(struct sk_buff *skb, int thoff)
 	return 0;
 }
 
-static const struct net_offload tcpv4_offload = {
-	.callbacks = {
-		.gso_segment	=	tcp4_gso_segment,
-		.gro_receive	=	tcp4_gro_receive,
-		.gro_complete	=	tcp4_gro_complete,
-	},
-};
-
 int __init tcpv4_offload_init(void)
 {
-	return inet_add_offload(&tcpv4_offload, IPPROTO_TCP);
+	net_hotdata.tcpv4_offload = (struct net_offload) {
+		.callbacks = {
+			.gso_segment	=	tcp4_gso_segment,
+			.gro_receive	=	tcp4_gro_receive,
+			.gro_complete	=	tcp4_gro_complete,
+		},
+	};
+	return inet_add_offload(&net_hotdata.tcpv4_offload, IPPROTO_TCP);
 }
diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c
index bf0c957e4b5e..4b07d1e6c952 100644
--- a/net/ipv6/tcpv6_offload.c
+++ b/net/ipv6/tcpv6_offload.c
@@ -66,15 +66,15 @@ static struct sk_buff *tcp6_gso_segment(struct sk_buff *skb,
 
 	return tcp_gso_segment(skb, features);
 }
-static const struct net_offload tcpv6_offload = {
-	.callbacks = {
-		.gso_segment	=	tcp6_gso_segment,
-		.gro_receive	=	tcp6_gro_receive,
-		.gro_complete	=	tcp6_gro_complete,
-	},
-};
 
 int __init tcpv6_offload_init(void)
 {
-	return inet6_add_offload(&tcpv6_offload, IPPROTO_TCP);
+	net_hotdata.tcpv6_offload = (struct net_offload) {
+		.callbacks = {
+			.gso_segment	=	tcp6_gso_segment,
+			.gro_receive	=	tcp6_gro_receive,
+			.gro_complete	=	tcp6_gro_complete,
+		},
+	};
+	return inet6_add_offload(&net_hotdata.tcpv6_offload, IPPROTO_TCP);
 }
-- 
cgit v1.2.3


From 26722dc74bf08fd79564cbcad1e5f3e2aa3bf9cc Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 6 Mar 2024 16:00:21 +0000
Subject: net: move dev_tx_weight to net_hotdata

dev_tx_weight is used in tx fast path.

Move it to net_hotdata for better cache locality.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20240306160031.874438-9-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h  | 1 -
 include/net/hotdata.h      | 1 +
 net/core/dev.c             | 1 -
 net/core/hotdata.c         | 1 +
 net/core/sysctl_net_core.c | 2 +-
 net/sched/sch_generic.c    | 3 ++-
 6 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c9a671b7bb37..ad4b031098ff 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4794,7 +4794,6 @@ void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s,
 void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s);
 
 extern int		dev_rx_weight;
-extern int		dev_tx_weight;
 
 enum {
 	NESTED_SYNC_IMM_BIT,
diff --git a/include/net/hotdata.h b/include/net/hotdata.h
index d86d02f156fc..ffea9cc263e5 100644
--- a/include/net/hotdata.h
+++ b/include/net/hotdata.h
@@ -21,6 +21,7 @@ struct net_hotdata {
 	int			netdev_budget_usecs;
 	int			tstamp_prequeue;
 	int			max_backlog;
+	int			dev_tx_weight;
 };
 
 extern struct net_hotdata net_hotdata;
diff --git a/net/core/dev.c b/net/core/dev.c
index e23edbaff392..5d6bd481103f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4409,7 +4409,6 @@ int weight_p __read_mostly = 64;           /* old backlog weight */
 int dev_weight_rx_bias __read_mostly = 1;  /* bias for backlog weight */
 int dev_weight_tx_bias __read_mostly = 1;  /* bias for output_queue quota */
 int dev_rx_weight __read_mostly = 64;
-int dev_tx_weight __read_mostly = 64;
 
 /* Called with irq disabled */
 static inline void ____napi_schedule(struct softnet_data *sd,
diff --git a/net/core/hotdata.c b/net/core/hotdata.c
index 35ed5a83ecc7..ec8c3b48e8fe 100644
--- a/net/core/hotdata.c
+++ b/net/core/hotdata.c
@@ -16,5 +16,6 @@ struct net_hotdata net_hotdata __cacheline_aligned = {
 
 	.tstamp_prequeue = 1,
 	.max_backlog = 1000,
+	.dev_tx_weight = 64,
 };
 EXPORT_SYMBOL(net_hotdata);
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 8eaeeb289914..a30016a8660e 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -302,7 +302,7 @@ static int proc_do_dev_weight(struct ctl_table *table, int write,
 	if (!ret && write) {
 		weight = READ_ONCE(weight_p);
 		WRITE_ONCE(dev_rx_weight, weight * dev_weight_rx_bias);
-		WRITE_ONCE(dev_tx_weight, weight * dev_weight_tx_bias);
+		WRITE_ONCE(net_hotdata.dev_tx_weight, weight * dev_weight_tx_bias);
 	}
 	mutex_unlock(&dev_weight_mutex);
 
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 9b3e9262040b..ff5336493777 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -27,6 +27,7 @@
 #include <net/sch_generic.h>
 #include <net/pkt_sched.h>
 #include <net/dst.h>
+#include <net/hotdata.h>
 #include <trace/events/qdisc.h>
 #include <trace/events/net.h>
 #include <net/xfrm.h>
@@ -409,7 +410,7 @@ static inline bool qdisc_restart(struct Qdisc *q, int *packets)
 
 void __qdisc_run(struct Qdisc *q)
 {
-	int quota = READ_ONCE(dev_tx_weight);
+	int quota = READ_ONCE(net_hotdata.dev_tx_weight);
 	int packets;
 
 	while (qdisc_restart(q, &packets)) {
-- 
cgit v1.2.3


From 71c0de9bac9c1dda503322c86be4924f055dc6c9 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 6 Mar 2024 16:00:22 +0000
Subject: net: move dev_rx_weight to net_hotdata

dev_rx_weight is read from process_backlog().

Move it to net_hotdata for better cache locality.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20240306160031.874438-10-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h  | 2 --
 include/net/hotdata.h      | 1 +
 net/core/dev.c             | 3 +--
 net/core/hotdata.c         | 1 +
 net/core/sysctl_net_core.c | 2 +-
 5 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index ad4b031098ff..dd641297e807 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4793,8 +4793,6 @@ void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s,
 			   const struct pcpu_sw_netstats __percpu *netstats);
 void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s);
 
-extern int		dev_rx_weight;
-
 enum {
 	NESTED_SYNC_IMM_BIT,
 	NESTED_SYNC_TODO_BIT,
diff --git a/include/net/hotdata.h b/include/net/hotdata.h
index ffea9cc263e5..e6595ed2c3be 100644
--- a/include/net/hotdata.h
+++ b/include/net/hotdata.h
@@ -22,6 +22,7 @@ struct net_hotdata {
 	int			tstamp_prequeue;
 	int			max_backlog;
 	int			dev_tx_weight;
+	int			dev_rx_weight;
 };
 
 extern struct net_hotdata net_hotdata;
diff --git a/net/core/dev.c b/net/core/dev.c
index 5d6bd481103f..40ba02e04bcb 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4408,7 +4408,6 @@ unsigned int sysctl_skb_defer_max __read_mostly = 64;
 int weight_p __read_mostly = 64;           /* old backlog weight */
 int dev_weight_rx_bias __read_mostly = 1;  /* bias for backlog weight */
 int dev_weight_tx_bias __read_mostly = 1;  /* bias for output_queue quota */
-int dev_rx_weight __read_mostly = 64;
 
 /* Called with irq disabled */
 static inline void ____napi_schedule(struct softnet_data *sd,
@@ -5978,7 +5977,7 @@ static int process_backlog(struct napi_struct *napi, int quota)
 		net_rps_action_and_irq_enable(sd);
 	}
 
-	napi->weight = READ_ONCE(dev_rx_weight);
+	napi->weight = READ_ONCE(net_hotdata.dev_rx_weight);
 	while (again) {
 		struct sk_buff *skb;
 
diff --git a/net/core/hotdata.c b/net/core/hotdata.c
index ec8c3b48e8fe..c8a7a451c18a 100644
--- a/net/core/hotdata.c
+++ b/net/core/hotdata.c
@@ -17,5 +17,6 @@ struct net_hotdata net_hotdata __cacheline_aligned = {
 	.tstamp_prequeue = 1,
 	.max_backlog = 1000,
 	.dev_tx_weight = 64,
+	.dev_rx_weight = 64,
 };
 EXPORT_SYMBOL(net_hotdata);
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index a30016a8660e..8a4c698dad9c 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -301,7 +301,7 @@ static int proc_do_dev_weight(struct ctl_table *table, int write,
 	ret = proc_dointvec(table, write, buffer, lenp, ppos);
 	if (!ret && write) {
 		weight = READ_ONCE(weight_p);
-		WRITE_ONCE(dev_rx_weight, weight * dev_weight_rx_bias);
+		WRITE_ONCE(net_hotdata.dev_rx_weight, weight * dev_weight_rx_bias);
 		WRITE_ONCE(net_hotdata.dev_tx_weight, weight * dev_weight_tx_bias);
 	}
 	mutex_unlock(&dev_weight_mutex);
-- 
cgit v1.2.3


From aa70d2d16f280efe8aa52afc25a33b2ec8d346b6 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 6 Mar 2024 16:00:23 +0000
Subject: net: move skbuff_cache(s) to net_hotdata

skbuff_cache, skbuff_fclone_cache and skb_small_head_cache
are used in rx/tx fast paths.

Move them to net_hotdata for better cache locality.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20240306160031.874438-11-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h |  1 -
 include/net/hotdata.h  |  3 +++
 kernel/bpf/cpumap.c    |  4 +++-
 net/bpf/test_run.c     |  4 +++-
 net/core/skbuff.c      | 44 ++++++++++++++++++++------------------------
 net/core/xdp.c         |  5 +++--
 6 files changed, 32 insertions(+), 29 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 3013355b63f5..d0508f90bed5 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1271,7 +1271,6 @@ static inline void consume_skb(struct sk_buff *skb)
 
 void __consume_stateless_skb(struct sk_buff *skb);
 void  __kfree_skb(struct sk_buff *skb);
-extern struct kmem_cache *skbuff_cache;
 
 void kfree_skb_partial(struct sk_buff *skb, bool head_stolen);
 bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
diff --git a/include/net/hotdata.h b/include/net/hotdata.h
index e6595ed2c3be..a8f7e5e826fb 100644
--- a/include/net/hotdata.h
+++ b/include/net/hotdata.h
@@ -16,6 +16,9 @@ struct net_hotdata {
 #endif
 	struct list_head	offload_base;
 	struct list_head	ptype_all;
+	struct kmem_cache	*skbuff_cache;
+	struct kmem_cache	*skbuff_fclone_cache;
+	struct kmem_cache	*skb_small_head_cache;
 	int			gro_normal_batch;
 	int			netdev_budget;
 	int			netdev_budget_usecs;
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index ef82ffc90cbe..9ee8da477465 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -24,6 +24,7 @@
 #include <linux/filter.h>
 #include <linux/ptr_ring.h>
 #include <net/xdp.h>
+#include <net/hotdata.h>
 
 #include <linux/sched.h>
 #include <linux/workqueue.h>
@@ -326,7 +327,8 @@ static int cpu_map_kthread_run(void *data)
 		/* Support running another XDP prog on this CPU */
 		nframes = cpu_map_bpf_prog_run(rcpu, frames, xdp_n, &stats, &list);
 		if (nframes) {
-			m = kmem_cache_alloc_bulk(skbuff_cache, gfp, nframes, skbs);
+			m = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
+						  gfp, nframes, skbs);
 			if (unlikely(m == 0)) {
 				for (i = 0; i < nframes; i++)
 					skbs[i] = NULL; /* effect: xdp_return_frame */
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 5535f9adc658..61efeadaff8d 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -12,6 +12,7 @@
 #include <linux/rcupdate_trace.h>
 #include <linux/sched/signal.h>
 #include <net/bpf_sk_storage.h>
+#include <net/hotdata.h>
 #include <net/sock.h>
 #include <net/tcp.h>
 #include <net/net_namespace.h>
@@ -254,7 +255,8 @@ static int xdp_recv_frames(struct xdp_frame **frames, int nframes,
 	int i, n;
 	LIST_HEAD(list);
 
-	n = kmem_cache_alloc_bulk(skbuff_cache, gfp, nframes, (void **)skbs);
+	n = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, gfp, nframes,
+				  (void **)skbs);
 	if (unlikely(n == 0)) {
 		for (i = 0; i < nframes; i++)
 			xdp_return_frame(frames[i]);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 43d7fc150acc..766219011aea 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -69,6 +69,7 @@
 #include <net/sock.h>
 #include <net/checksum.h>
 #include <net/gso.h>
+#include <net/hotdata.h>
 #include <net/ip6_checksum.h>
 #include <net/xfrm.h>
 #include <net/mpls.h>
@@ -88,15 +89,10 @@
 #include "dev.h"
 #include "sock_destructor.h"
 
-struct kmem_cache *skbuff_cache __ro_after_init;
-static struct kmem_cache *skbuff_fclone_cache __ro_after_init;
 #ifdef CONFIG_SKB_EXTENSIONS
 static struct kmem_cache *skbuff_ext_cache __ro_after_init;
 #endif
 
-
-static struct kmem_cache *skb_small_head_cache __ro_after_init;
-
 #define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(MAX_TCP_HEADER)
 
 /* We want SKB_SMALL_HEAD_CACHE_SIZE to not be a power of two.
@@ -349,7 +345,7 @@ static struct sk_buff *napi_skb_cache_get(void)
 	struct sk_buff *skb;
 
 	if (unlikely(!nc->skb_count)) {
-		nc->skb_count = kmem_cache_alloc_bulk(skbuff_cache,
+		nc->skb_count = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
 						      GFP_ATOMIC,
 						      NAPI_SKB_CACHE_BULK,
 						      nc->skb_cache);
@@ -358,7 +354,7 @@ static struct sk_buff *napi_skb_cache_get(void)
 	}
 
 	skb = nc->skb_cache[--nc->skb_count];
-	kasan_mempool_unpoison_object(skb, kmem_cache_size(skbuff_cache));
+	kasan_mempool_unpoison_object(skb, kmem_cache_size(net_hotdata.skbuff_cache));
 
 	return skb;
 }
@@ -416,7 +412,7 @@ struct sk_buff *slab_build_skb(void *data)
 	struct sk_buff *skb;
 	unsigned int size;
 
-	skb = kmem_cache_alloc(skbuff_cache, GFP_ATOMIC);
+	skb = kmem_cache_alloc(net_hotdata.skbuff_cache, GFP_ATOMIC);
 	if (unlikely(!skb))
 		return NULL;
 
@@ -467,7 +463,7 @@ struct sk_buff *__build_skb(void *data, unsigned int frag_size)
 {
 	struct sk_buff *skb;
 
-	skb = kmem_cache_alloc(skbuff_cache, GFP_ATOMIC);
+	skb = kmem_cache_alloc(net_hotdata.skbuff_cache, GFP_ATOMIC);
 	if (unlikely(!skb))
 		return NULL;
 
@@ -578,7 +574,7 @@ static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node,
 	obj_size = SKB_HEAD_ALIGN(*size);
 	if (obj_size <= SKB_SMALL_HEAD_CACHE_SIZE &&
 	    !(flags & KMALLOC_NOT_NORMAL_BITS)) {
-		obj = kmem_cache_alloc_node(skb_small_head_cache,
+		obj = kmem_cache_alloc_node(net_hotdata.skb_small_head_cache,
 				flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
 				node);
 		*size = SKB_SMALL_HEAD_CACHE_SIZE;
@@ -586,7 +582,7 @@ static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node,
 			goto out;
 		/* Try again but now we are using pfmemalloc reserves */
 		ret_pfmemalloc = true;
-		obj = kmem_cache_alloc_node(skb_small_head_cache, flags, node);
+		obj = kmem_cache_alloc_node(net_hotdata.skb_small_head_cache, flags, node);
 		goto out;
 	}
 
@@ -649,7 +645,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 	u8 *data;
 
 	cache = (flags & SKB_ALLOC_FCLONE)
-		? skbuff_fclone_cache : skbuff_cache;
+		? net_hotdata.skbuff_fclone_cache : net_hotdata.skbuff_cache;
 
 	if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
 		gfp_mask |= __GFP_MEMALLOC;
@@ -1095,7 +1091,7 @@ static int skb_pp_frag_ref(struct sk_buff *skb)
 static void skb_kfree_head(void *head, unsigned int end_offset)
 {
 	if (end_offset == SKB_SMALL_HEAD_HEADROOM)
-		kmem_cache_free(skb_small_head_cache, head);
+		kmem_cache_free(net_hotdata.skb_small_head_cache, head);
 	else
 		kfree(head);
 }
@@ -1162,7 +1158,7 @@ static void kfree_skbmem(struct sk_buff *skb)
 
 	switch (skb->fclone) {
 	case SKB_FCLONE_UNAVAILABLE:
-		kmem_cache_free(skbuff_cache, skb);
+		kmem_cache_free(net_hotdata.skbuff_cache, skb);
 		return;
 
 	case SKB_FCLONE_ORIG:
@@ -1183,7 +1179,7 @@ static void kfree_skbmem(struct sk_buff *skb)
 	if (!refcount_dec_and_test(&fclones->fclone_ref))
 		return;
 fastpath:
-	kmem_cache_free(skbuff_fclone_cache, fclones);
+	kmem_cache_free(net_hotdata.skbuff_fclone_cache, fclones);
 }
 
 void skb_release_head_state(struct sk_buff *skb)
@@ -1280,7 +1276,7 @@ static void kfree_skb_add_bulk(struct sk_buff *skb,
 	sa->skb_array[sa->skb_count++] = skb;
 
 	if (unlikely(sa->skb_count == KFREE_SKB_BULK_SIZE)) {
-		kmem_cache_free_bulk(skbuff_cache, KFREE_SKB_BULK_SIZE,
+		kmem_cache_free_bulk(net_hotdata.skbuff_cache, KFREE_SKB_BULK_SIZE,
 				     sa->skb_array);
 		sa->skb_count = 0;
 	}
@@ -1305,7 +1301,7 @@ kfree_skb_list_reason(struct sk_buff *segs, enum skb_drop_reason reason)
 	}
 
 	if (sa.skb_count)
-		kmem_cache_free_bulk(skbuff_cache, sa.skb_count, sa.skb_array);
+		kmem_cache_free_bulk(net_hotdata.skbuff_cache, sa.skb_count, sa.skb_array);
 }
 EXPORT_SYMBOL(kfree_skb_list_reason);
 
@@ -1467,9 +1463,9 @@ static void napi_skb_cache_put(struct sk_buff *skb)
 	if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) {
 		for (i = NAPI_SKB_CACHE_HALF; i < NAPI_SKB_CACHE_SIZE; i++)
 			kasan_mempool_unpoison_object(nc->skb_cache[i],
-						kmem_cache_size(skbuff_cache));
+						kmem_cache_size(net_hotdata.skbuff_cache));
 
-		kmem_cache_free_bulk(skbuff_cache, NAPI_SKB_CACHE_HALF,
+		kmem_cache_free_bulk(net_hotdata.skbuff_cache, NAPI_SKB_CACHE_HALF,
 				     nc->skb_cache + NAPI_SKB_CACHE_HALF);
 		nc->skb_count = NAPI_SKB_CACHE_HALF;
 	}
@@ -2066,7 +2062,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
 		if (skb_pfmemalloc(skb))
 			gfp_mask |= __GFP_MEMALLOC;
 
-		n = kmem_cache_alloc(skbuff_cache, gfp_mask);
+		n = kmem_cache_alloc(net_hotdata.skbuff_cache, gfp_mask);
 		if (!n)
 			return NULL;
 
@@ -5005,7 +5001,7 @@ static void skb_extensions_init(void) {}
 
 void __init skb_init(void)
 {
-	skbuff_cache = kmem_cache_create_usercopy("skbuff_head_cache",
+	net_hotdata.skbuff_cache = kmem_cache_create_usercopy("skbuff_head_cache",
 					      sizeof(struct sk_buff),
 					      0,
 					      SLAB_HWCACHE_ALIGN|SLAB_PANIC|
@@ -5013,7 +5009,7 @@ void __init skb_init(void)
 					      offsetof(struct sk_buff, cb),
 					      sizeof_field(struct sk_buff, cb),
 					      NULL);
-	skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache",
+	net_hotdata.skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache",
 						sizeof(struct sk_buff_fclones),
 						0,
 						SLAB_HWCACHE_ALIGN|SLAB_PANIC,
@@ -5022,7 +5018,7 @@ void __init skb_init(void)
 	 * struct skb_shared_info is located at the end of skb->head,
 	 * and should not be copied to/from user.
 	 */
-	skb_small_head_cache = kmem_cache_create_usercopy("skbuff_small_head",
+	net_hotdata.skb_small_head_cache = kmem_cache_create_usercopy("skbuff_small_head",
 						SKB_SMALL_HEAD_CACHE_SIZE,
 						0,
 						SLAB_HWCACHE_ALIGN | SLAB_PANIC,
@@ -5895,7 +5891,7 @@ void kfree_skb_partial(struct sk_buff *skb, bool head_stolen)
 {
 	if (head_stolen) {
 		skb_release_head_state(skb);
-		kmem_cache_free(skbuff_cache, skb);
+		kmem_cache_free(net_hotdata.skbuff_cache, skb);
 	} else {
 		__kfree_skb(skb);
 	}
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 0e3709a29175..41693154e426 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -16,6 +16,7 @@
 #include <linux/bug.h>
 #include <net/page_pool/helpers.h>
 
+#include <net/hotdata.h>
 #include <net/xdp.h>
 #include <net/xdp_priv.h> /* struct xdp_mem_allocator */
 #include <trace/events/xdp.h>
@@ -589,7 +590,7 @@ EXPORT_SYMBOL_GPL(xdp_warn);
 
 int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp)
 {
-	n_skb = kmem_cache_alloc_bulk(skbuff_cache, gfp, n_skb, skbs);
+	n_skb = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, gfp, n_skb, skbs);
 	if (unlikely(!n_skb))
 		return -ENOMEM;
 
@@ -658,7 +659,7 @@ struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf,
 {
 	struct sk_buff *skb;
 
-	skb = kmem_cache_alloc(skbuff_cache, GFP_ATOMIC);
+	skb = kmem_cache_alloc(net_hotdata.skbuff_cache, GFP_ATOMIC);
 	if (unlikely(!skb))
 		return NULL;
 
-- 
cgit v1.2.3


From 6a55ca6b0122d4678e3ab54a8553361aae5082f1 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 6 Mar 2024 16:00:24 +0000
Subject: udp: move udpv4_offload and udpv6_offload to net_hotdata

These structures are used in GRO and GSO paths.
Move them to net_hodata for better cache locality.

v2: udpv6_offload definition depends on CONFIG_INET=y

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20240306160031.874438-12-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/hotdata.h  |  2 ++
 net/ipv4/udp_offload.c | 17 ++++++++---------
 net/ipv6/udp_offload.c | 21 ++++++++++-----------
 3 files changed, 20 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/include/net/hotdata.h b/include/net/hotdata.h
index a8f7e5e826fb..daeee8ce8084 100644
--- a/include/net/hotdata.h
+++ b/include/net/hotdata.h
@@ -11,8 +11,10 @@ struct net_hotdata {
 #if IS_ENABLED(CONFIG_INET)
 	struct packet_offload	ip_packet_offload;
 	struct net_offload	tcpv4_offload;
+	struct net_offload 	udpv4_offload;
 	struct packet_offload	ipv6_packet_offload;
 	struct net_offload	tcpv6_offload;
+	struct net_offload	udpv6_offload;
 #endif
 	struct list_head	offload_base;
 	struct list_head	ptype_all;
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 6c95d28d0c4a..b9880743765c 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -737,15 +737,14 @@ INDIRECT_CALLABLE_SCOPE int udp4_gro_complete(struct sk_buff *skb, int nhoff)
 	return udp_gro_complete(skb, nhoff, udp4_lib_lookup_skb);
 }
 
-static const struct net_offload udpv4_offload = {
-	.callbacks = {
-		.gso_segment = udp4_ufo_fragment,
-		.gro_receive  =	udp4_gro_receive,
-		.gro_complete =	udp4_gro_complete,
-	},
-};
-
 int __init udpv4_offload_init(void)
 {
-	return inet_add_offload(&udpv4_offload, IPPROTO_UDP);
+	net_hotdata.udpv4_offload = (struct net_offload) {
+		.callbacks = {
+			.gso_segment = udp4_ufo_fragment,
+			.gro_receive  =	udp4_gro_receive,
+			.gro_complete =	udp4_gro_complete,
+		},
+	};
+	return inet_add_offload(&net_hotdata.udpv4_offload, IPPROTO_UDP);
 }
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 6b95ba241ebe..312bcaeea96f 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -192,20 +192,19 @@ INDIRECT_CALLABLE_SCOPE int udp6_gro_complete(struct sk_buff *skb, int nhoff)
 	return udp_gro_complete(skb, nhoff, udp6_lib_lookup_skb);
 }
 
-static const struct net_offload udpv6_offload = {
-	.callbacks = {
-		.gso_segment	=	udp6_ufo_fragment,
-		.gro_receive	=	udp6_gro_receive,
-		.gro_complete	=	udp6_gro_complete,
-	},
-};
-
-int udpv6_offload_init(void)
+int __init udpv6_offload_init(void)
 {
-	return inet6_add_offload(&udpv6_offload, IPPROTO_UDP);
+	net_hotdata.udpv6_offload = (struct net_offload) {
+		.callbacks = {
+			.gso_segment	=	udp6_ufo_fragment,
+			.gro_receive	=	udp6_gro_receive,
+			.gro_complete	=	udp6_gro_complete,
+		},
+	};
+	return inet6_add_offload(&net_hotdata.udpv6_offload, IPPROTO_UDP);
 }
 
 int udpv6_offload_exit(void)
 {
-	return inet6_del_offload(&udpv6_offload, IPPROTO_UDP);
+	return inet6_del_offload(&net_hotdata.udpv6_offload, IPPROTO_UDP);
 }
-- 
cgit v1.2.3


From 4ea0875b9d897e3c64cdb486788509f1f062285b Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 6 Mar 2024 16:00:25 +0000
Subject: ipv6: move tcpv6_protocol and udpv6_protocol to net_hotdata

These structures are read in rx path, move them to net_hotdata
for better cache locality.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20240306160031.874438-13-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/hotdata.h |  4 ++++
 net/ipv6/tcp_ipv6.c   | 17 +++++++++--------
 net/ipv6/udp.c        | 16 ++++++++--------
 3 files changed, 21 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/net/hotdata.h b/include/net/hotdata.h
index daeee8ce8084..03d758d25c02 100644
--- a/include/net/hotdata.h
+++ b/include/net/hotdata.h
@@ -14,6 +14,10 @@ struct net_hotdata {
 	struct net_offload 	udpv4_offload;
 	struct packet_offload	ipv6_packet_offload;
 	struct net_offload	tcpv6_offload;
+#if IS_ENABLED(CONFIG_IPV6)
+	struct inet6_protocol	tcpv6_protocol;
+	struct inet6_protocol	udpv6_protocol;
+#endif
 	struct net_offload	udpv6_offload;
 #endif
 	struct list_head	offload_base;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index f677f0fa5196..3f4cba49e9ee 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -58,6 +58,7 @@
 #include <net/timewait_sock.h>
 #include <net/inet_common.h>
 #include <net/secure_seq.h>
+#include <net/hotdata.h>
 #include <net/busy_poll.h>
 
 #include <linux/proc_fs.h>
@@ -2367,11 +2368,6 @@ struct proto tcpv6_prot = {
 };
 EXPORT_SYMBOL_GPL(tcpv6_prot);
 
-static const struct inet6_protocol tcpv6_protocol = {
-	.handler	=	tcp_v6_rcv,
-	.err_handler	=	tcp_v6_err,
-	.flags		=	INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
-};
 
 static struct inet_protosw tcpv6_protosw = {
 	.type		=	SOCK_STREAM,
@@ -2408,7 +2404,12 @@ int __init tcpv6_init(void)
 {
 	int ret;
 
-	ret = inet6_add_protocol(&tcpv6_protocol, IPPROTO_TCP);
+	net_hotdata.tcpv6_protocol = (struct inet6_protocol) {
+		.handler     = tcp_v6_rcv,
+		.err_handler = tcp_v6_err,
+		.flags	     = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL,
+	};
+	ret = inet6_add_protocol(&net_hotdata.tcpv6_protocol, IPPROTO_TCP);
 	if (ret)
 		goto out;
 
@@ -2433,7 +2434,7 @@ out_tcpv6_pernet_subsys:
 out_tcpv6_protosw:
 	inet6_unregister_protosw(&tcpv6_protosw);
 out_tcpv6_protocol:
-	inet6_del_protocol(&tcpv6_protocol, IPPROTO_TCP);
+	inet6_del_protocol(&net_hotdata.tcpv6_protocol, IPPROTO_TCP);
 	goto out;
 }
 
@@ -2441,5 +2442,5 @@ void tcpv6_exit(void)
 {
 	unregister_pernet_subsys(&tcpv6_net_ops);
 	inet6_unregister_protosw(&tcpv6_protosw);
-	inet6_del_protocol(&tcpv6_protocol, IPPROTO_TCP);
+	inet6_del_protocol(&net_hotdata.tcpv6_protocol, IPPROTO_TCP);
 }
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 3f2249b4cd5f..97d86909aabb 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1702,11 +1702,6 @@ int udpv6_getsockopt(struct sock *sk, int level, int optname,
 	return ipv6_getsockopt(sk, level, optname, optval, optlen);
 }
 
-static const struct inet6_protocol udpv6_protocol = {
-	.handler	=	udpv6_rcv,
-	.err_handler	=	udpv6_err,
-	.flags		=	INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
-};
 
 /* ------------------------------------------------------------------------ */
 #ifdef CONFIG_PROC_FS
@@ -1803,7 +1798,12 @@ int __init udpv6_init(void)
 {
 	int ret;
 
-	ret = inet6_add_protocol(&udpv6_protocol, IPPROTO_UDP);
+	net_hotdata.udpv6_protocol = (struct inet6_protocol) {
+		.handler     = udpv6_rcv,
+		.err_handler = udpv6_err,
+		.flags	     = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL,
+	};
+	ret = inet6_add_protocol(&net_hotdata.udpv6_protocol, IPPROTO_UDP);
 	if (ret)
 		goto out;
 
@@ -1814,12 +1814,12 @@ out:
 	return ret;
 
 out_udpv6_protocol:
-	inet6_del_protocol(&udpv6_protocol, IPPROTO_UDP);
+	inet6_del_protocol(&net_hotdata.udpv6_protocol, IPPROTO_UDP);
 	goto out;
 }
 
 void udpv6_exit(void)
 {
 	inet6_unregister_protosw(&udpv6_protosw);
-	inet6_del_protocol(&udpv6_protocol, IPPROTO_UDP);
+	inet6_del_protocol(&net_hotdata.udpv6_protocol, IPPROTO_UDP);
 }
-- 
cgit v1.2.3


From 571bf020be9c3b135e8b6dd87421919953268c1f Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 6 Mar 2024 16:00:26 +0000
Subject: inet: move tcp_protocol and udp_protocol to net_hotdata

These structures are read in rx path, move them to net_hotdata
for better cache locality.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20240306160031.874438-14-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/hotdata.h |  2 ++
 net/ipv4/af_inet.c    | 30 +++++++++++++++---------------
 2 files changed, 17 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/net/hotdata.h b/include/net/hotdata.h
index 03d758d25c02..87215f7ac200 100644
--- a/include/net/hotdata.h
+++ b/include/net/hotdata.h
@@ -11,7 +11,9 @@ struct net_hotdata {
 #if IS_ENABLED(CONFIG_INET)
 	struct packet_offload	ip_packet_offload;
 	struct net_offload	tcpv4_offload;
+	struct net_protocol	tcp_protocol;
 	struct net_offload 	udpv4_offload;
+	struct net_protocol	udp_protocol;
 	struct packet_offload	ipv6_packet_offload;
 	struct net_offload	tcpv6_offload;
 #if IS_ENABLED(CONFIG_IPV6)
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 08dda6955562..6f1cfd176e7b 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1751,19 +1751,6 @@ static const struct net_protocol igmp_protocol = {
 };
 #endif
 
-static const struct net_protocol tcp_protocol = {
-	.handler	=	tcp_v4_rcv,
-	.err_handler	=	tcp_v4_err,
-	.no_policy	=	1,
-	.icmp_strict_tag_validation = 1,
-};
-
-static const struct net_protocol udp_protocol = {
-	.handler =	udp_rcv,
-	.err_handler =	udp_err,
-	.no_policy =	1,
-};
-
 static const struct net_protocol icmp_protocol = {
 	.handler =	icmp_rcv,
 	.err_handler =	icmp_err,
@@ -1992,9 +1979,22 @@ static int __init inet_init(void)
 
 	if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
 		pr_crit("%s: Cannot add ICMP protocol\n", __func__);
-	if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
+
+	net_hotdata.udp_protocol = (struct net_protocol) {
+		.handler =	udp_rcv,
+		.err_handler =	udp_err,
+		.no_policy =	1,
+	};
+	if (inet_add_protocol(&net_hotdata.udp_protocol, IPPROTO_UDP) < 0)
 		pr_crit("%s: Cannot add UDP protocol\n", __func__);
-	if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
+
+	net_hotdata.tcp_protocol = (struct net_protocol) {
+		.handler	=	tcp_v4_rcv,
+		.err_handler	=	tcp_v4_err,
+		.no_policy	=	1,
+		.icmp_strict_tag_validation = 1,
+	};
+	if (inet_add_protocol(&net_hotdata.tcp_protocol, IPPROTO_TCP) < 0)
 		pr_crit("%s: Cannot add TCP protocol\n", __func__);
 #ifdef CONFIG_IP_MULTICAST
 	if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0)
-- 
cgit v1.2.3


From 6e0735723ab437793cfab02d50b3ae3539aeb520 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 6 Mar 2024 16:00:27 +0000
Subject: inet: move inet_ehash_secret and udp_ehash_secret into net_hotdata

"struct net_protocol" has a 32bit hole in 32bit arches.

Use it to store the 32bit secret used by UDP and TCP,
to increase cache locality in rx path.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20240306160031.874438-15-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/hotdata.h      | 3 +++
 include/net/protocol.h     | 1 +
 net/ipv4/inet_hashtables.c | 3 +--
 net/ipv4/udp.c             | 2 --
 4 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/hotdata.h b/include/net/hotdata.h
index 87215f7ac200..7a210ea6899c 100644
--- a/include/net/hotdata.h
+++ b/include/net/hotdata.h
@@ -36,6 +36,9 @@ struct net_hotdata {
 	int			dev_rx_weight;
 };
 
+#define inet_ehash_secret	net_hotdata.tcp_protocol.secret
+#define udp_ehash_secret	net_hotdata.udp_protocol.secret
+
 extern struct net_hotdata net_hotdata;
 
 #endif /* _NET_HOTDATA_H */
diff --git a/include/net/protocol.h b/include/net/protocol.h
index 6aef8cb11cc8..3ff26e66735c 100644
--- a/include/net/protocol.h
+++ b/include/net/protocol.h
@@ -46,6 +46,7 @@ struct net_protocol {
 				 * socket lookup?
 				 */
 				icmp_strict_tag_validation:1;
+	u32			secret;
 };
 
 #if IS_ENABLED(CONFIG_IPV6)
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 308ff34002ea..7498af320164 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -24,6 +24,7 @@
 #include <net/inet6_hashtables.h>
 #endif
 #include <net/secure_seq.h>
+#include <net/hotdata.h>
 #include <net/ip.h>
 #include <net/tcp.h>
 #include <net/sock_reuseport.h>
@@ -32,8 +33,6 @@ u32 inet_ehashfn(const struct net *net, const __be32 laddr,
 		 const __u16 lport, const __be32 faddr,
 		 const __be16 fport)
 {
-	static u32 inet_ehash_secret __read_mostly;
-
 	net_get_random_once(&inet_ehash_secret, sizeof(inet_ehash_secret));
 
 	return __inet_ehashfn(laddr, lport, faddr, fport,
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index a8acea17b4e5..2beabf5b2d86 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -411,8 +411,6 @@ INDIRECT_CALLABLE_SCOPE
 u32 udp_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport,
 		const __be32 faddr, const __be16 fport)
 {
-	static u32 udp_ehash_secret __read_mostly;
-
 	net_get_random_once(&udp_ehash_secret, sizeof(udp_ehash_secret));
 
 	return __inet_ehashfn(laddr, lport, faddr, fport,
-- 
cgit v1.2.3


From 5af674bb90a030317a02419e04b66ec0dc892dcd Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 6 Mar 2024 16:00:28 +0000
Subject: ipv6: move inet6_ehash_secret and udp6_ehash_secret into net_hotdata

"struct inet6_protocol" has a 32bit hole in 32bit arches.

Use it to store the 32bit secret used by UDP and TCP,
to increase cache locality in rx path.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20240306160031.874438-16-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/hotdata.h       | 2 ++
 include/net/protocol.h      | 1 +
 net/ipv6/inet6_hashtables.c | 2 +-
 net/ipv6/udp.c              | 1 -
 4 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/hotdata.h b/include/net/hotdata.h
index 7a210ea6899c..6d5cd967183a 100644
--- a/include/net/hotdata.h
+++ b/include/net/hotdata.h
@@ -38,6 +38,8 @@ struct net_hotdata {
 
 #define inet_ehash_secret	net_hotdata.tcp_protocol.secret
 #define udp_ehash_secret	net_hotdata.udp_protocol.secret
+#define inet6_ehash_secret	net_hotdata.tcpv6_protocol.secret
+#define udp6_ehash_secret	net_hotdata.udpv6_protocol.secret
 
 extern struct net_hotdata net_hotdata;
 
diff --git a/include/net/protocol.h b/include/net/protocol.h
index 3ff26e66735c..213649d2ab09 100644
--- a/include/net/protocol.h
+++ b/include/net/protocol.h
@@ -60,6 +60,7 @@ struct inet6_protocol {
 			       __be32 info);
 
 	unsigned int	flags;	/* INET6_PROTO_xxx */
+	u32		secret;
 };
 
 #define INET6_PROTO_NOPOLICY	0x1
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index b0e8d278e8a9..0fee97f3166c 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -14,6 +14,7 @@
 #include <linux/random.h>
 
 #include <net/addrconf.h>
+#include <net/hotdata.h>
 #include <net/inet_connection_sock.h>
 #include <net/inet_hashtables.h>
 #include <net/inet6_hashtables.h>
@@ -25,7 +26,6 @@ u32 inet6_ehashfn(const struct net *net,
 		  const struct in6_addr *laddr, const u16 lport,
 		  const struct in6_addr *faddr, const __be16 fport)
 {
-	static u32 inet6_ehash_secret __read_mostly;
 	static u32 ipv6_hash_secret __read_mostly;
 
 	u32 lhash, fhash;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 97d86909aabb..1e1c67a51675 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -79,7 +79,6 @@ u32 udp6_ehashfn(const struct net *net,
 		 const struct in6_addr *faddr,
 		 const __be16 fport)
 {
-	static u32 udp6_ehash_secret __read_mostly;
 	static u32 udp_ipv6_hash_secret __read_mostly;
 
 	u32 lhash, fhash;
-- 
cgit v1.2.3


From df51b84564159cdd91a67ee0f9e30b42b3a73cef Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 6 Mar 2024 16:00:29 +0000
Subject: ipv6: move tcp_ipv6_hash_secret and udp_ipv6_hash_secret to
 net_hotdata

Use a 32bit hole in "struct net_offload" to store
the remaining 32bit secrets used by TCPv6 and UDPv6.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20240306160031.874438-17-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/hotdata.h       | 2 ++
 include/net/protocol.h      | 1 +
 net/ipv6/inet6_hashtables.c | 6 ++----
 net/ipv6/udp.c              | 2 --
 4 files changed, 5 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/net/hotdata.h b/include/net/hotdata.h
index 6d5cd967183a..b0b847585f7e 100644
--- a/include/net/hotdata.h
+++ b/include/net/hotdata.h
@@ -39,7 +39,9 @@ struct net_hotdata {
 #define inet_ehash_secret	net_hotdata.tcp_protocol.secret
 #define udp_ehash_secret	net_hotdata.udp_protocol.secret
 #define inet6_ehash_secret	net_hotdata.tcpv6_protocol.secret
+#define tcp_ipv6_hash_secret	net_hotdata.tcpv6_offload.secret
 #define udp6_ehash_secret	net_hotdata.udpv6_protocol.secret
+#define udp_ipv6_hash_secret	net_hotdata.udpv6_offload.secret
 
 extern struct net_hotdata net_hotdata;
 
diff --git a/include/net/protocol.h b/include/net/protocol.h
index 213649d2ab09..b2499f88f8f8 100644
--- a/include/net/protocol.h
+++ b/include/net/protocol.h
@@ -70,6 +70,7 @@ struct inet6_protocol {
 struct net_offload {
 	struct offload_callbacks callbacks;
 	unsigned int		 flags;	/* Flags used by IPv6 for now */
+	u32			 secret;
 };
 /* This should be set for any extension header which is compatible with GSO. */
 #define INET6_PROTO_GSO_EXTHDR	0x1
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 0fee97f3166c..2e81383b663b 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -26,15 +26,13 @@ u32 inet6_ehashfn(const struct net *net,
 		  const struct in6_addr *laddr, const u16 lport,
 		  const struct in6_addr *faddr, const __be16 fport)
 {
-	static u32 ipv6_hash_secret __read_mostly;
-
 	u32 lhash, fhash;
 
 	net_get_random_once(&inet6_ehash_secret, sizeof(inet6_ehash_secret));
-	net_get_random_once(&ipv6_hash_secret, sizeof(ipv6_hash_secret));
+	net_get_random_once(&tcp_ipv6_hash_secret, sizeof(tcp_ipv6_hash_secret));
 
 	lhash = (__force u32)laddr->s6_addr32[3];
-	fhash = __ipv6_addr_jhash(faddr, ipv6_hash_secret);
+	fhash = __ipv6_addr_jhash(faddr, tcp_ipv6_hash_secret);
 
 	return __inet6_ehashfn(lhash, lport, fhash, fport,
 			       inet6_ehash_secret + net_hash_mix(net));
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 1e1c67a51675..80ad8f436b17 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -79,8 +79,6 @@ u32 udp6_ehashfn(const struct net *net,
 		 const struct in6_addr *faddr,
 		 const __be16 fport)
 {
-	static u32 udp_ipv6_hash_secret __read_mostly;
-
 	u32 lhash, fhash;
 
 	net_get_random_once(&udp6_ehash_secret,
-- 
cgit v1.2.3


From 490a79faf95e705ba0ffd9ebf04a624b379e53c9 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 6 Mar 2024 16:00:30 +0000
Subject: net: introduce include/net/rps.h

Move RPS related structures and helpers from include/linux/netdevice.h
and include/net/sock.h to a new include file.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20240306160031.874438-18-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/intel/ice/ice_arfs.c         |   1 +
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c    |   1 +
 drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c |   1 +
 drivers/net/ethernet/sfc/rx_common.c              |   1 +
 drivers/net/ethernet/sfc/siena/rx_common.c        |   1 +
 drivers/net/tun.c                                 |   1 +
 include/linux/netdevice.h                         |  82 --------------
 include/net/rps.h                                 | 127 ++++++++++++++++++++++
 include/net/sock.h                                |  35 ------
 net/core/dev.c                                    |   1 +
 net/core/net-sysfs.c                              |   1 +
 net/core/sysctl_net_core.c                        |   1 +
 net/ipv4/af_inet.c                                |   1 +
 net/ipv4/tcp.c                                    |   1 +
 net/ipv6/af_inet6.c                               |   1 +
 net/sctp/socket.c                                 |   1 +
 16 files changed, 140 insertions(+), 117 deletions(-)
 create mode 100644 include/net/rps.h

(limited to 'include')

diff --git a/drivers/net/ethernet/intel/ice/ice_arfs.c b/drivers/net/ethernet/intel/ice/ice_arfs.c
index cca0e753f38f..7cee365cc7d1 100644
--- a/drivers/net/ethernet/intel/ice/ice_arfs.c
+++ b/drivers/net/ethernet/intel/ice/ice_arfs.c
@@ -2,6 +2,7 @@
 /* Copyright (C) 2018-2020, Intel Corporation. */
 
 #include "ice.h"
+#include <net/rps.h>
 
 /**
  * ice_is_arfs_active - helper to check is aRFS is active
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index d7da62cda821..5d3fde63b273 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -42,6 +42,7 @@
 #include <net/ip.h>
 #include <net/vxlan.h>
 #include <net/devlink.h>
+#include <net/rps.h>
 
 #include <linux/mlx4/driver.h>
 #include <linux/mlx4/device.h>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
index e66f486faafe..c7f542d0b8f0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
@@ -34,6 +34,7 @@
 #include <linux/mlx5/fs.h>
 #include <linux/ip.h>
 #include <linux/ipv6.h>
+#include <net/rps.h>
 #include "en.h"
 
 #define ARFS_HASH_SHIFT BITS_PER_BYTE
diff --git a/drivers/net/ethernet/sfc/rx_common.c b/drivers/net/ethernet/sfc/rx_common.c
index fac227d372db..dcd901eccfc8 100644
--- a/drivers/net/ethernet/sfc/rx_common.c
+++ b/drivers/net/ethernet/sfc/rx_common.c
@@ -11,6 +11,7 @@
 #include "net_driver.h"
 #include <linux/module.h>
 #include <linux/iommu.h>
+#include <net/rps.h>
 #include "efx.h"
 #include "nic.h"
 #include "rx_common.h"
diff --git a/drivers/net/ethernet/sfc/siena/rx_common.c b/drivers/net/ethernet/sfc/siena/rx_common.c
index 4579f43484c3..219fb358a646 100644
--- a/drivers/net/ethernet/sfc/siena/rx_common.c
+++ b/drivers/net/ethernet/sfc/siena/rx_common.c
@@ -11,6 +11,7 @@
 #include "net_driver.h"
 #include <linux/module.h>
 #include <linux/iommu.h>
+#include <net/rps.h>
 #include "efx.h"
 #include "nic.h"
 #include "rx_common.h"
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 8d258e263f54..0b3f21cba552 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -78,6 +78,7 @@
 #include <net/ax25.h>
 #include <net/rose.h>
 #include <net/6lowpan.h>
+#include <net/rps.h>
 
 #include <linux/uaccess.h>
 #include <linux/proc_fs.h>
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index dd641297e807..416a800d72ba 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -225,12 +225,6 @@ struct net_device_core_stats {
 #include <linux/cache.h>
 #include <linux/skbuff.h>
 
-#ifdef CONFIG_RPS
-#include <linux/static_key.h>
-extern struct static_key_false rps_needed;
-extern struct static_key_false rfs_needed;
-#endif
-
 struct neighbour;
 struct neigh_parms;
 struct sk_buff;
@@ -730,86 +724,10 @@ static inline void netdev_queue_numa_node_write(struct netdev_queue *q, int node
 #endif
 }
 
-#ifdef CONFIG_RPS
-/*
- * This structure holds an RPS map which can be of variable length.  The
- * map is an array of CPUs.
- */
-struct rps_map {
-	unsigned int len;
-	struct rcu_head rcu;
-	u16 cpus[];
-};
-#define RPS_MAP_SIZE(_num) (sizeof(struct rps_map) + ((_num) * sizeof(u16)))
-
-/*
- * The rps_dev_flow structure contains the mapping of a flow to a CPU, the
- * tail pointer for that CPU's input queue at the time of last enqueue, and
- * a hardware filter index.
- */
-struct rps_dev_flow {
-	u16 cpu;
-	u16 filter;
-	unsigned int last_qtail;
-};
-#define RPS_NO_FILTER 0xffff
-
-/*
- * The rps_dev_flow_table structure contains a table of flow mappings.
- */
-struct rps_dev_flow_table {
-	unsigned int mask;
-	struct rcu_head rcu;
-	struct rps_dev_flow flows[];
-};
-#define RPS_DEV_FLOW_TABLE_SIZE(_num) (sizeof(struct rps_dev_flow_table) + \
-    ((_num) * sizeof(struct rps_dev_flow)))
-
-/*
- * The rps_sock_flow_table contains mappings of flows to the last CPU
- * on which they were processed by the application (set in recvmsg).
- * Each entry is a 32bit value. Upper part is the high-order bits
- * of flow hash, lower part is CPU number.
- * rps_cpu_mask is used to partition the space, depending on number of
- * possible CPUs : rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1
- * For example, if 64 CPUs are possible, rps_cpu_mask = 0x3f,
- * meaning we use 32-6=26 bits for the hash.
- */
-struct rps_sock_flow_table {
-	u32	mask;
-
-	u32	ents[] ____cacheline_aligned_in_smp;
-};
-#define	RPS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct rps_sock_flow_table, ents[_num]))
-
-#define RPS_NO_CPU 0xffff
-
-extern u32 rps_cpu_mask;
-extern struct rps_sock_flow_table __rcu *rps_sock_flow_table;
-
-static inline void rps_record_sock_flow(struct rps_sock_flow_table *table,
-					u32 hash)
-{
-	if (table && hash) {
-		unsigned int index = hash & table->mask;
-		u32 val = hash & ~rps_cpu_mask;
-
-		/* We only give a hint, preemption can change CPU under us */
-		val |= raw_smp_processor_id();
-
-		/* The following WRITE_ONCE() is paired with the READ_ONCE()
-		 * here, and another one in get_rps_cpu().
-		 */
-		if (READ_ONCE(table->ents[index]) != val)
-			WRITE_ONCE(table->ents[index], val);
-	}
-}
-
 #ifdef CONFIG_RFS_ACCEL
 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, u32 flow_id,
 			 u16 filter_id);
 #endif
-#endif /* CONFIG_RPS */
 
 /* XPS map type and offset of the xps map within net_device->xps_maps[]. */
 enum xps_map_type {
diff --git a/include/net/rps.h b/include/net/rps.h
new file mode 100644
index 000000000000..6081d817d245
--- /dev/null
+++ b/include/net/rps.h
@@ -0,0 +1,127 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _NET_RPS_H
+#define _NET_RPS_H
+
+#include <linux/types.h>
+#include <linux/static_key.h>
+#include <net/sock.h>
+
+#ifdef CONFIG_RPS
+
+extern struct static_key_false rps_needed;
+extern struct static_key_false rfs_needed;
+
+/*
+ * This structure holds an RPS map which can be of variable length.  The
+ * map is an array of CPUs.
+ */
+struct rps_map {
+	unsigned int	len;
+	struct rcu_head	rcu;
+	u16		cpus[];
+};
+#define RPS_MAP_SIZE(_num) (sizeof(struct rps_map) + ((_num) * sizeof(u16)))
+
+/*
+ * The rps_dev_flow structure contains the mapping of a flow to a CPU, the
+ * tail pointer for that CPU's input queue at the time of last enqueue, and
+ * a hardware filter index.
+ */
+struct rps_dev_flow {
+	u16		cpu;
+	u16		filter;
+	unsigned int	last_qtail;
+};
+#define RPS_NO_FILTER 0xffff
+
+/*
+ * The rps_dev_flow_table structure contains a table of flow mappings.
+ */
+struct rps_dev_flow_table {
+	unsigned int		mask;
+	struct rcu_head		rcu;
+	struct rps_dev_flow	flows[];
+};
+#define RPS_DEV_FLOW_TABLE_SIZE(_num) (sizeof(struct rps_dev_flow_table) + \
+    ((_num) * sizeof(struct rps_dev_flow)))
+
+/*
+ * The rps_sock_flow_table contains mappings of flows to the last CPU
+ * on which they were processed by the application (set in recvmsg).
+ * Each entry is a 32bit value. Upper part is the high-order bits
+ * of flow hash, lower part is CPU number.
+ * rps_cpu_mask is used to partition the space, depending on number of
+ * possible CPUs : rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1
+ * For example, if 64 CPUs are possible, rps_cpu_mask = 0x3f,
+ * meaning we use 32-6=26 bits for the hash.
+ */
+struct rps_sock_flow_table {
+	u32	mask;
+
+	u32	ents[] ____cacheline_aligned_in_smp;
+};
+#define	RPS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct rps_sock_flow_table, ents[_num]))
+
+#define RPS_NO_CPU 0xffff
+
+extern u32 rps_cpu_mask;
+extern struct rps_sock_flow_table __rcu *rps_sock_flow_table;
+
+static inline void rps_record_sock_flow(struct rps_sock_flow_table *table,
+					u32 hash)
+{
+	unsigned int index = hash & table->mask;
+	u32 val = hash & ~rps_cpu_mask;
+
+	/* We only give a hint, preemption can change CPU under us */
+	val |= raw_smp_processor_id();
+
+	/* The following WRITE_ONCE() is paired with the READ_ONCE()
+	 * here, and another one in get_rps_cpu().
+	 */
+	if (READ_ONCE(table->ents[index]) != val)
+		WRITE_ONCE(table->ents[index], val);
+}
+
+#endif /* CONFIG_RPS */
+
+static inline void sock_rps_record_flow_hash(__u32 hash)
+{
+#ifdef CONFIG_RPS
+	struct rps_sock_flow_table *sock_flow_table;
+
+	if (!hash)
+		return;
+	rcu_read_lock();
+	sock_flow_table = rcu_dereference(rps_sock_flow_table);
+	if (sock_flow_table)
+		rps_record_sock_flow(sock_flow_table, hash);
+	rcu_read_unlock();
+#endif
+}
+
+static inline void sock_rps_record_flow(const struct sock *sk)
+{
+#ifdef CONFIG_RPS
+	if (static_branch_unlikely(&rfs_needed)) {
+		/* Reading sk->sk_rxhash might incur an expensive cache line
+		 * miss.
+		 *
+		 * TCP_ESTABLISHED does cover almost all states where RFS
+		 * might be useful, and is cheaper [1] than testing :
+		 *	IPv4: inet_sk(sk)->inet_daddr
+		 * 	IPv6: ipv6_addr_any(&sk->sk_v6_daddr)
+		 * OR	an additional socket flag
+		 * [1] : sk_state and sk_prot are in the same cache line.
+		 */
+		if (sk->sk_state == TCP_ESTABLISHED) {
+			/* This READ_ONCE() is paired with the WRITE_ONCE()
+			 * from sock_rps_save_rxhash() and sock_rps_reset_rxhash().
+			 */
+			sock_rps_record_flow_hash(READ_ONCE(sk->sk_rxhash));
+		}
+	}
+#endif
+}
+
+#endif /* _NET_RPS_H */
diff --git a/include/net/sock.h b/include/net/sock.h
index 09a0cde8bf52..b5e00702acc1 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1117,41 +1117,6 @@ static inline void sk_incoming_cpu_update(struct sock *sk)
 		WRITE_ONCE(sk->sk_incoming_cpu, cpu);
 }
 
-static inline void sock_rps_record_flow_hash(__u32 hash)
-{
-#ifdef CONFIG_RPS
-	struct rps_sock_flow_table *sock_flow_table;
-
-	rcu_read_lock();
-	sock_flow_table = rcu_dereference(rps_sock_flow_table);
-	rps_record_sock_flow(sock_flow_table, hash);
-	rcu_read_unlock();
-#endif
-}
-
-static inline void sock_rps_record_flow(const struct sock *sk)
-{
-#ifdef CONFIG_RPS
-	if (static_branch_unlikely(&rfs_needed)) {
-		/* Reading sk->sk_rxhash might incur an expensive cache line
-		 * miss.
-		 *
-		 * TCP_ESTABLISHED does cover almost all states where RFS
-		 * might be useful, and is cheaper [1] than testing :
-		 *	IPv4: inet_sk(sk)->inet_daddr
-		 * 	IPv6: ipv6_addr_any(&sk->sk_v6_daddr)
-		 * OR	an additional socket flag
-		 * [1] : sk_state and sk_prot are in the same cache line.
-		 */
-		if (sk->sk_state == TCP_ESTABLISHED) {
-			/* This READ_ONCE() is paired with the WRITE_ONCE()
-			 * from sock_rps_save_rxhash() and sock_rps_reset_rxhash().
-			 */
-			sock_rps_record_flow_hash(READ_ONCE(sk->sk_rxhash));
-		}
-	}
-#endif
-}
 
 static inline void sock_rps_save_rxhash(struct sock *sk,
 					const struct sk_buff *skb)
diff --git a/net/core/dev.c b/net/core/dev.c
index 40ba02e04bcb..bcf49b0393d2 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -155,6 +155,7 @@
 #include <net/netdev_rx_queue.h>
 #include <net/page_pool/types.h>
 #include <net/page_pool/helpers.h>
+#include <net/rps.h>
 
 #include "dev.h"
 #include "net-sysfs.h"
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index af238026ac3c..5560083774b1 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -24,6 +24,7 @@
 #include <linux/of_net.h>
 #include <linux/cpu.h>
 #include <net/netdev_rx_queue.h>
+#include <net/rps.h>
 
 #include "dev.h"
 #include "net-sysfs.h"
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 8a4c698dad9c..4b93e27404e8 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -24,6 +24,7 @@
 #include <net/busy_poll.h>
 #include <net/pkt_sched.h>
 #include <net/hotdata.h>
+#include <net/rps.h>
 
 #include "dev.h"
 
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 6f1cfd176e7b..55bd72997b31 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -119,6 +119,7 @@
 #endif
 #include <net/l3mdev.h>
 #include <net/compat.h>
+#include <net/rps.h>
 
 #include <trace/events/sock.h>
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 7e1b848398d0..c5b83875411a 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -279,6 +279,7 @@
 #include <linux/uaccess.h>
 #include <asm/ioctls.h>
 #include <net/busy_poll.h>
+#include <net/rps.h>
 
 /* Track pending CMSGs. */
 enum {
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index b90d46533cdc..8041dc181bd4 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -64,6 +64,7 @@
 #include <net/xfrm.h>
 #include <net/ioam6.h>
 #include <net/rawv6.h>
+#include <net/rps.h>
 
 #include <linux/uaccess.h>
 #include <linux/mroute6.h>
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 6b9fcdb0952a..c67679a41044 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -67,6 +67,7 @@
 #include <net/sctp/sctp.h>
 #include <net/sctp/sm.h>
 #include <net/sctp/stream_sched.h>
+#include <net/rps.h>
 
 /* Forward declarations for internal helper functions. */
 static bool sctp_writeable(const struct sock *sk);
-- 
cgit v1.2.3


From ce7f49ab741591d83e33e56948bac2f12de6e14e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 6 Mar 2024 16:00:31 +0000
Subject: net: move rps_sock_flow_table to net_hotdata

rps_sock_flow_table and rps_cpu_mask are used in fast path.

Move them to net_hotdata for better cache locality.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20240306160031.874438-19-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/hotdata.h      |  4 ++++
 include/net/rps.h          |  8 +++-----
 net/core/dev.c             | 12 +++---------
 net/core/sysctl_net_core.c |  9 ++++++---
 4 files changed, 16 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/net/hotdata.h b/include/net/hotdata.h
index b0b847585f7e..003667a1efd6 100644
--- a/include/net/hotdata.h
+++ b/include/net/hotdata.h
@@ -27,6 +27,10 @@ struct net_hotdata {
 	struct kmem_cache	*skbuff_cache;
 	struct kmem_cache	*skbuff_fclone_cache;
 	struct kmem_cache	*skb_small_head_cache;
+#ifdef CONFIG_RPS
+	struct rps_sock_flow_table __rcu *rps_sock_flow_table;
+	u32			rps_cpu_mask;
+#endif
 	int			gro_normal_batch;
 	int			netdev_budget;
 	int			netdev_budget_usecs;
diff --git a/include/net/rps.h b/include/net/rps.h
index 6081d817d245..7660243e905b 100644
--- a/include/net/rps.h
+++ b/include/net/rps.h
@@ -5,6 +5,7 @@
 #include <linux/types.h>
 #include <linux/static_key.h>
 #include <net/sock.h>
+#include <net/hotdata.h>
 
 #ifdef CONFIG_RPS
 
@@ -64,14 +65,11 @@ struct rps_sock_flow_table {
 
 #define RPS_NO_CPU 0xffff
 
-extern u32 rps_cpu_mask;
-extern struct rps_sock_flow_table __rcu *rps_sock_flow_table;
-
 static inline void rps_record_sock_flow(struct rps_sock_flow_table *table,
 					u32 hash)
 {
 	unsigned int index = hash & table->mask;
-	u32 val = hash & ~rps_cpu_mask;
+	u32 val = hash & ~net_hotdata.rps_cpu_mask;
 
 	/* We only give a hint, preemption can change CPU under us */
 	val |= raw_smp_processor_id();
@@ -93,7 +91,7 @@ static inline void sock_rps_record_flow_hash(__u32 hash)
 	if (!hash)
 		return;
 	rcu_read_lock();
-	sock_flow_table = rcu_dereference(rps_sock_flow_table);
+	sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table);
 	if (sock_flow_table)
 		rps_record_sock_flow(sock_flow_table, hash);
 	rcu_read_unlock();
diff --git a/net/core/dev.c b/net/core/dev.c
index bcf49b0393d2..0766a245816b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4450,12 +4450,6 @@ static inline void ____napi_schedule(struct softnet_data *sd,
 
 #ifdef CONFIG_RPS
 
-/* One global table that all flow-based protocols share. */
-struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
-EXPORT_SYMBOL(rps_sock_flow_table);
-u32 rps_cpu_mask __read_mostly;
-EXPORT_SYMBOL(rps_cpu_mask);
-
 struct static_key_false rps_needed __read_mostly;
 EXPORT_SYMBOL(rps_needed);
 struct static_key_false rfs_needed __read_mostly;
@@ -4547,7 +4541,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 	if (!hash)
 		goto done;
 
-	sock_flow_table = rcu_dereference(rps_sock_flow_table);
+	sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table);
 	if (flow_table && sock_flow_table) {
 		struct rps_dev_flow *rflow;
 		u32 next_cpu;
@@ -4557,10 +4551,10 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 		 * This READ_ONCE() pairs with WRITE_ONCE() from rps_record_sock_flow().
 		 */
 		ident = READ_ONCE(sock_flow_table->ents[hash & sock_flow_table->mask]);
-		if ((ident ^ hash) & ~rps_cpu_mask)
+		if ((ident ^ hash) & ~net_hotdata.rps_cpu_mask)
 			goto try_rps;
 
-		next_cpu = ident & rps_cpu_mask;
+		next_cpu = ident & net_hotdata.rps_cpu_mask;
 
 		/* OK, now we know there is a match,
 		 * we can look at the local (per receive queue) flow table
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 4b93e27404e8..6973dda3abda 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -140,7 +140,8 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
 
 	mutex_lock(&sock_flow_mutex);
 
-	orig_sock_table = rcu_dereference_protected(rps_sock_flow_table,
+	orig_sock_table = rcu_dereference_protected(
+					net_hotdata.rps_sock_flow_table,
 					lockdep_is_held(&sock_flow_mutex));
 	size = orig_size = orig_sock_table ? orig_sock_table->mask + 1 : 0;
 
@@ -161,7 +162,8 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
 					mutex_unlock(&sock_flow_mutex);
 					return -ENOMEM;
 				}
-				rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1;
+				net_hotdata.rps_cpu_mask =
+					roundup_pow_of_two(nr_cpu_ids) - 1;
 				sock_table->mask = size - 1;
 			} else
 				sock_table = orig_sock_table;
@@ -172,7 +174,8 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
 			sock_table = NULL;
 
 		if (sock_table != orig_sock_table) {
-			rcu_assign_pointer(rps_sock_flow_table, sock_table);
+			rcu_assign_pointer(net_hotdata.rps_sock_flow_table,
+					   sock_table);
 			if (sock_table) {
 				static_branch_inc(&rps_needed);
 				static_branch_inc(&rfs_needed);
-- 
cgit v1.2.3


From ab63a2387cb906d43b72a8effb611bbaecb2d0cd Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 6 Mar 2024 11:55:07 -0800
Subject: netdev: add per-queue statistics

The ethtool-nl family does a good job exposing various protocol
related and IEEE/IETF statistics which used to get dumped under
ethtool -S, with creative names. Queue stats don't have a netlink
API, yet, and remain a lion's share of ethtool -S output for new
drivers. Not only is that bad because the names differ driver to
driver but it's also bug-prone. Intuitively drivers try to report
only the stats for active queues, but querying ethtool stats
involves multiple system calls, and the number of stats is
read separately from the stats themselves. Worse still when user
space asks for values of the stats, it doesn't inform the kernel
how big the buffer is. If number of stats increases in the meantime
kernel will overflow user buffer.

Add a netlink API for dumping queue stats. Queue information is
exposed via the netdev-genl family, so add the stats there.
Support per-queue and sum-for-device dumps. Latter will be useful
when subsequent patches add more interesting common stats than
just bytes and packets.

The API does not currently distinguish between HW and SW stats.
The expectation is that the source of the stats will either not
matter much (good packets) or be obvious (skb alloc errors).

Acked-by: Stanislav Fomichev <sdf@google.com>
Reviewed-by: Amritha Nambiar <amritha.nambiar@intel.com>
Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Link: https://lore.kernel.org/r/20240306195509.1502746-2-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/netlink/specs/netdev.yaml |  84 +++++++++++++
 Documentation/networking/statistics.rst |  15 +++
 include/linux/netdevice.h               |   3 +
 include/net/netdev_queues.h             |  54 ++++++++
 include/uapi/linux/netdev.h             |  19 +++
 net/core/netdev-genl-gen.c              |  12 ++
 net/core/netdev-genl-gen.h              |   2 +
 net/core/netdev-genl.c                  | 213 ++++++++++++++++++++++++++++++++
 tools/include/uapi/linux/netdev.h       |  19 +++
 9 files changed, 421 insertions(+)

(limited to 'include')

diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml
index 3addac970680..a1e48c3c84c9 100644
--- a/Documentation/netlink/specs/netdev.yaml
+++ b/Documentation/netlink/specs/netdev.yaml
@@ -74,6 +74,10 @@ definitions:
     name: queue-type
     type: enum
     entries: [ rx, tx ]
+  -
+    name: qstats-scope
+    type: flags
+    entries: [ queue ]
 
 attribute-sets:
   -
@@ -265,6 +269,66 @@ attribute-sets:
         doc: ID of the NAPI instance which services this queue.
         type: u32
 
+  -
+    name: qstats
+    doc: |
+      Get device statistics, scoped to a device or a queue.
+      These statistics extend (and partially duplicate) statistics available
+      in struct rtnl_link_stats64.
+      Value of the `scope` attribute determines how statistics are
+      aggregated. When aggregated for the entire device the statistics
+      represent the total number of events since last explicit reset of
+      the device (i.e. not a reconfiguration like changing queue count).
+      When reported per-queue, however, the statistics may not add
+      up to the total number of events, will only be reported for currently
+      active objects, and will likely report the number of events since last
+      reconfiguration.
+    attributes:
+      -
+        name: ifindex
+        doc: ifindex of the netdevice to which stats belong.
+        type: u32
+        checks:
+          min: 1
+      -
+        name: queue-type
+        doc: Queue type as rx, tx, for queue-id.
+        type: u32
+        enum: queue-type
+      -
+        name: queue-id
+        doc: Queue ID, if stats are scoped to a single queue instance.
+        type: u32
+      -
+        name: scope
+        doc: |
+          What object type should be used to iterate over the stats.
+        type: uint
+        enum: qstats-scope
+      -
+        name: rx-packets
+        doc: |
+          Number of wire packets successfully received and passed to the stack.
+          For drivers supporting XDP, XDP is considered the first layer
+          of the stack, so packets consumed by XDP are still counted here.
+        type: uint
+        value: 8 # reserve some attr ids in case we need more metadata later
+      -
+        name: rx-bytes
+        doc: Successfully received bytes, see `rx-packets`.
+        type: uint
+      -
+        name: tx-packets
+        doc: |
+          Number of wire packets successfully sent. Packet is considered to be
+          successfully sent once it is in device memory (usually this means
+          the device has issued a DMA completion for the packet).
+        type: uint
+      -
+        name: tx-bytes
+        doc: Successfully sent bytes, see `tx-packets`.
+        type: uint
+
 operations:
   list:
     -
@@ -405,6 +469,26 @@ operations:
           attributes:
             - ifindex
         reply: *napi-get-op
+    -
+      name: qstats-get
+      doc: |
+        Get / dump fine grained statistics. Which statistics are reported
+        depends on the device and the driver, and whether the driver stores
+        software counters per-queue.
+      attribute-set: qstats
+      dump:
+        request:
+          attributes:
+            - scope
+        reply:
+          attributes:
+            - ifindex
+            - queue-type
+            - queue-id
+            - rx-packets
+            - rx-bytes
+            - tx-packets
+            - tx-bytes
 
 mcast-groups:
   list:
diff --git a/Documentation/networking/statistics.rst b/Documentation/networking/statistics.rst
index 551b3cc29a41..75e017dfa825 100644
--- a/Documentation/networking/statistics.rst
+++ b/Documentation/networking/statistics.rst
@@ -41,6 +41,15 @@ If `-s` is specified once the detailed errors won't be shown.
 
 `ip` supports JSON formatting via the `-j` option.
 
+Queue statistics
+~~~~~~~~~~~~~~~~
+
+Queue statistics are accessible via the netdev netlink family.
+
+Currently no widely distributed CLI exists to access those statistics.
+Kernel development tools (ynl) can be used to experiment with them,
+see `Documentation/userspace-api/netlink/intro-specs.rst`.
+
 Protocol-specific statistics
 ----------------------------
 
@@ -147,6 +156,12 @@ Statistics are reported both in the responses to link information
 requests (`RTM_GETLINK`) and statistic requests (`RTM_GETSTATS`,
 when `IFLA_STATS_LINK_64` bit is set in the `.filter_mask` of the request).
 
+netdev (netlink)
+~~~~~~~~~~~~~~~~
+
+`netdev` generic netlink family allows accessing page pool and per queue
+statistics.
+
 ethtool
 -------
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 416a800d72ba..4230c7f3b959 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1955,6 +1955,7 @@ enum netdev_reg_state {
  *
  *	@sysfs_rx_queue_group:	Space for optional per-rx queue attributes
  *	@rtnl_link_ops:	Rtnl_link_ops
+ *	@stat_ops:	Optional ops for queue-aware statistics
  *
  *	@gso_max_size:	Maximum size of generic segmentation offload
  *	@tso_max_size:	Device (as in HW) limit on the max TSO request size
@@ -2335,6 +2336,8 @@ struct net_device {
 
 	const struct rtnl_link_ops *rtnl_link_ops;
 
+	const struct netdev_stat_ops *stat_ops;
+
 	/* for setting kernel sock attribute on TCP connection setup */
 #define GSO_MAX_SEGS		65535u
 #define GSO_LEGACY_MAX_SIZE	65536u
diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h
index 8b8ed4e13d74..d633347eeda5 100644
--- a/include/net/netdev_queues.h
+++ b/include/net/netdev_queues.h
@@ -4,6 +4,60 @@
 
 #include <linux/netdevice.h>
 
+struct netdev_queue_stats_rx {
+	u64 bytes;
+	u64 packets;
+};
+
+struct netdev_queue_stats_tx {
+	u64 bytes;
+	u64 packets;
+};
+
+/**
+ * struct netdev_stat_ops - netdev ops for fine grained stats
+ * @get_queue_stats_rx:	get stats for a given Rx queue
+ * @get_queue_stats_tx:	get stats for a given Tx queue
+ * @get_base_stats:	get base stats (not belonging to any live instance)
+ *
+ * Query stats for a given object. The values of the statistics are undefined
+ * on entry (specifically they are *not* zero-initialized). Drivers should
+ * assign values only to the statistics they collect. Statistics which are not
+ * collected must be left undefined.
+ *
+ * Queue objects are not necessarily persistent, and only currently active
+ * queues are queried by the per-queue callbacks. This means that per-queue
+ * statistics will not generally add up to the total number of events for
+ * the device. The @get_base_stats callback allows filling in the delta
+ * between events for currently live queues and overall device history.
+ * When the statistics for the entire device are queried, first @get_base_stats
+ * is issued to collect the delta, and then a series of per-queue callbacks.
+ * Only statistics which are set in @get_base_stats will be reported
+ * at the device level, meaning that unlike in queue callbacks, setting
+ * a statistic to zero in @get_base_stats is a legitimate thing to do.
+ * This is because @get_base_stats has a second function of designating which
+ * statistics are in fact correct for the entire device (e.g. when history
+ * for some of the events is not maintained, and reliable "total" cannot
+ * be provided).
+ *
+ * Device drivers can assume that when collecting total device stats,
+ * the @get_base_stats and subsequent per-queue calls are performed
+ * "atomically" (without releasing the rtnl_lock).
+ *
+ * Device drivers are encouraged to reset the per-queue statistics when
+ * number of queues change. This is because the primary use case for
+ * per-queue statistics is currently to detect traffic imbalance.
+ */
+struct netdev_stat_ops {
+	void (*get_queue_stats_rx)(struct net_device *dev, int idx,
+				   struct netdev_queue_stats_rx *stats);
+	void (*get_queue_stats_tx)(struct net_device *dev, int idx,
+				   struct netdev_queue_stats_tx *stats);
+	void (*get_base_stats)(struct net_device *dev,
+			       struct netdev_queue_stats_rx *rx,
+			       struct netdev_queue_stats_tx *tx);
+};
+
 /**
  * DOC: Lockless queue stopping / waking helpers.
  *
diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h
index 93cb411adf72..639ffa04c172 100644
--- a/include/uapi/linux/netdev.h
+++ b/include/uapi/linux/netdev.h
@@ -70,6 +70,10 @@ enum netdev_queue_type {
 	NETDEV_QUEUE_TYPE_TX,
 };
 
+enum netdev_qstats_scope {
+	NETDEV_QSTATS_SCOPE_QUEUE = 1,
+};
+
 enum {
 	NETDEV_A_DEV_IFINDEX = 1,
 	NETDEV_A_DEV_PAD,
@@ -132,6 +136,20 @@ enum {
 	NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1)
 };
 
+enum {
+	NETDEV_A_QSTATS_IFINDEX = 1,
+	NETDEV_A_QSTATS_QUEUE_TYPE,
+	NETDEV_A_QSTATS_QUEUE_ID,
+	NETDEV_A_QSTATS_SCOPE,
+	NETDEV_A_QSTATS_RX_PACKETS = 8,
+	NETDEV_A_QSTATS_RX_BYTES,
+	NETDEV_A_QSTATS_TX_PACKETS,
+	NETDEV_A_QSTATS_TX_BYTES,
+
+	__NETDEV_A_QSTATS_MAX,
+	NETDEV_A_QSTATS_MAX = (__NETDEV_A_QSTATS_MAX - 1)
+};
+
 enum {
 	NETDEV_CMD_DEV_GET = 1,
 	NETDEV_CMD_DEV_ADD_NTF,
@@ -144,6 +162,7 @@ enum {
 	NETDEV_CMD_PAGE_POOL_STATS_GET,
 	NETDEV_CMD_QUEUE_GET,
 	NETDEV_CMD_NAPI_GET,
+	NETDEV_CMD_QSTATS_GET,
 
 	__NETDEV_CMD_MAX,
 	NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1)
diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c
index be7f2ebd61b2..8d8ace9ef87f 100644
--- a/net/core/netdev-genl-gen.c
+++ b/net/core/netdev-genl-gen.c
@@ -68,6 +68,11 @@ static const struct nla_policy netdev_napi_get_dump_nl_policy[NETDEV_A_NAPI_IFIN
 	[NETDEV_A_NAPI_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1),
 };
 
+/* NETDEV_CMD_QSTATS_GET - dump */
+static const struct nla_policy netdev_qstats_get_nl_policy[NETDEV_A_QSTATS_SCOPE + 1] = {
+	[NETDEV_A_QSTATS_SCOPE] = NLA_POLICY_MASK(NLA_UINT, 0x1),
+};
+
 /* Ops table for netdev */
 static const struct genl_split_ops netdev_nl_ops[] = {
 	{
@@ -138,6 +143,13 @@ static const struct genl_split_ops netdev_nl_ops[] = {
 		.maxattr	= NETDEV_A_NAPI_IFINDEX,
 		.flags		= GENL_CMD_CAP_DUMP,
 	},
+	{
+		.cmd		= NETDEV_CMD_QSTATS_GET,
+		.dumpit		= netdev_nl_qstats_get_dumpit,
+		.policy		= netdev_qstats_get_nl_policy,
+		.maxattr	= NETDEV_A_QSTATS_SCOPE,
+		.flags		= GENL_CMD_CAP_DUMP,
+	},
 };
 
 static const struct genl_multicast_group netdev_nl_mcgrps[] = {
diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h
index a47f2bcbe4fa..4db40fd5b4a9 100644
--- a/net/core/netdev-genl-gen.h
+++ b/net/core/netdev-genl-gen.h
@@ -28,6 +28,8 @@ int netdev_nl_queue_get_dumpit(struct sk_buff *skb,
 			       struct netlink_callback *cb);
 int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info);
 int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb);
+int netdev_nl_qstats_get_dumpit(struct sk_buff *skb,
+				struct netlink_callback *cb);
 
 enum {
 	NETDEV_NLGRP_MGMT,
diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
index 918b109e0cf4..7fa75e13dc6d 100644
--- a/net/core/netdev-genl.c
+++ b/net/core/netdev-genl.c
@@ -8,6 +8,7 @@
 #include <net/xdp.h>
 #include <net/xdp_sock.h>
 #include <net/netdev_rx_queue.h>
+#include <net/netdev_queues.h>
 #include <net/busy_poll.h>
 
 #include "netdev-genl-gen.h"
@@ -460,6 +461,218 @@ int netdev_nl_queue_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
 	return err;
 }
 
+#define NETDEV_STAT_NOT_SET		(~0ULL)
+
+static void netdev_nl_stats_add(void *_sum, const void *_add, size_t size)
+{
+	const u64 *add = _add;
+	u64 *sum = _sum;
+
+	while (size) {
+		if (*add != NETDEV_STAT_NOT_SET && *sum != NETDEV_STAT_NOT_SET)
+			*sum += *add;
+		sum++;
+		add++;
+		size -= 8;
+	}
+}
+
+static int netdev_stat_put(struct sk_buff *rsp, unsigned int attr_id, u64 value)
+{
+	if (value == NETDEV_STAT_NOT_SET)
+		return 0;
+	return nla_put_uint(rsp, attr_id, value);
+}
+
+static int
+netdev_nl_stats_write_rx(struct sk_buff *rsp, struct netdev_queue_stats_rx *rx)
+{
+	if (netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_PACKETS, rx->packets) ||
+	    netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_BYTES, rx->bytes))
+		return -EMSGSIZE;
+	return 0;
+}
+
+static int
+netdev_nl_stats_write_tx(struct sk_buff *rsp, struct netdev_queue_stats_tx *tx)
+{
+	if (netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_PACKETS, tx->packets) ||
+	    netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_BYTES, tx->bytes))
+		return -EMSGSIZE;
+	return 0;
+}
+
+static int
+netdev_nl_stats_queue(struct net_device *netdev, struct sk_buff *rsp,
+		      u32 q_type, int i, const struct genl_info *info)
+{
+	const struct netdev_stat_ops *ops = netdev->stat_ops;
+	struct netdev_queue_stats_rx rx;
+	struct netdev_queue_stats_tx tx;
+	void *hdr;
+
+	hdr = genlmsg_iput(rsp, info);
+	if (!hdr)
+		return -EMSGSIZE;
+	if (nla_put_u32(rsp, NETDEV_A_QSTATS_IFINDEX, netdev->ifindex) ||
+	    nla_put_u32(rsp, NETDEV_A_QSTATS_QUEUE_TYPE, q_type) ||
+	    nla_put_u32(rsp, NETDEV_A_QSTATS_QUEUE_ID, i))
+		goto nla_put_failure;
+
+	switch (q_type) {
+	case NETDEV_QUEUE_TYPE_RX:
+		memset(&rx, 0xff, sizeof(rx));
+		ops->get_queue_stats_rx(netdev, i, &rx);
+		if (!memchr_inv(&rx, 0xff, sizeof(rx)))
+			goto nla_cancel;
+		if (netdev_nl_stats_write_rx(rsp, &rx))
+			goto nla_put_failure;
+		break;
+	case NETDEV_QUEUE_TYPE_TX:
+		memset(&tx, 0xff, sizeof(tx));
+		ops->get_queue_stats_tx(netdev, i, &tx);
+		if (!memchr_inv(&tx, 0xff, sizeof(tx)))
+			goto nla_cancel;
+		if (netdev_nl_stats_write_tx(rsp, &tx))
+			goto nla_put_failure;
+		break;
+	}
+
+	genlmsg_end(rsp, hdr);
+	return 0;
+
+nla_cancel:
+	genlmsg_cancel(rsp, hdr);
+	return 0;
+nla_put_failure:
+	genlmsg_cancel(rsp, hdr);
+	return -EMSGSIZE;
+}
+
+static int
+netdev_nl_stats_by_queue(struct net_device *netdev, struct sk_buff *rsp,
+			 const struct genl_info *info,
+			 struct netdev_nl_dump_ctx *ctx)
+{
+	const struct netdev_stat_ops *ops = netdev->stat_ops;
+	int i, err;
+
+	if (!(netdev->flags & IFF_UP))
+		return 0;
+
+	i = ctx->rxq_idx;
+	while (ops->get_queue_stats_rx && i < netdev->real_num_rx_queues) {
+		err = netdev_nl_stats_queue(netdev, rsp, NETDEV_QUEUE_TYPE_RX,
+					    i, info);
+		if (err)
+			return err;
+		ctx->rxq_idx = i++;
+	}
+	i = ctx->txq_idx;
+	while (ops->get_queue_stats_tx && i < netdev->real_num_tx_queues) {
+		err = netdev_nl_stats_queue(netdev, rsp, NETDEV_QUEUE_TYPE_TX,
+					    i, info);
+		if (err)
+			return err;
+		ctx->txq_idx = i++;
+	}
+
+	ctx->rxq_idx = 0;
+	ctx->txq_idx = 0;
+	return 0;
+}
+
+static int
+netdev_nl_stats_by_netdev(struct net_device *netdev, struct sk_buff *rsp,
+			  const struct genl_info *info)
+{
+	struct netdev_queue_stats_rx rx_sum, rx;
+	struct netdev_queue_stats_tx tx_sum, tx;
+	const struct netdev_stat_ops *ops;
+	void *hdr;
+	int i;
+
+	ops = netdev->stat_ops;
+	/* Netdev can't guarantee any complete counters */
+	if (!ops->get_base_stats)
+		return 0;
+
+	memset(&rx_sum, 0xff, sizeof(rx_sum));
+	memset(&tx_sum, 0xff, sizeof(tx_sum));
+
+	ops->get_base_stats(netdev, &rx_sum, &tx_sum);
+
+	/* The op was there, but nothing reported, don't bother */
+	if (!memchr_inv(&rx_sum, 0xff, sizeof(rx_sum)) &&
+	    !memchr_inv(&tx_sum, 0xff, sizeof(tx_sum)))
+		return 0;
+
+	hdr = genlmsg_iput(rsp, info);
+	if (!hdr)
+		return -EMSGSIZE;
+	if (nla_put_u32(rsp, NETDEV_A_QSTATS_IFINDEX, netdev->ifindex))
+		goto nla_put_failure;
+
+	for (i = 0; i < netdev->real_num_rx_queues; i++) {
+		memset(&rx, 0xff, sizeof(rx));
+		if (ops->get_queue_stats_rx)
+			ops->get_queue_stats_rx(netdev, i, &rx);
+		netdev_nl_stats_add(&rx_sum, &rx, sizeof(rx));
+	}
+	for (i = 0; i < netdev->real_num_tx_queues; i++) {
+		memset(&tx, 0xff, sizeof(tx));
+		if (ops->get_queue_stats_tx)
+			ops->get_queue_stats_tx(netdev, i, &tx);
+		netdev_nl_stats_add(&tx_sum, &tx, sizeof(tx));
+	}
+
+	if (netdev_nl_stats_write_rx(rsp, &rx_sum) ||
+	    netdev_nl_stats_write_tx(rsp, &tx_sum))
+		goto nla_put_failure;
+
+	genlmsg_end(rsp, hdr);
+	return 0;
+
+nla_put_failure:
+	genlmsg_cancel(rsp, hdr);
+	return -EMSGSIZE;
+}
+
+int netdev_nl_qstats_get_dumpit(struct sk_buff *skb,
+				struct netlink_callback *cb)
+{
+	struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb);
+	const struct genl_info *info = genl_info_dump(cb);
+	struct net *net = sock_net(skb->sk);
+	struct net_device *netdev;
+	unsigned int scope;
+	int err = 0;
+
+	scope = 0;
+	if (info->attrs[NETDEV_A_QSTATS_SCOPE])
+		scope = nla_get_uint(info->attrs[NETDEV_A_QSTATS_SCOPE]);
+
+	rtnl_lock();
+	for_each_netdev_dump(net, netdev, ctx->ifindex) {
+		if (!netdev->stat_ops)
+			continue;
+
+		switch (scope) {
+		case 0:
+			err = netdev_nl_stats_by_netdev(netdev, skb, info);
+			break;
+		case NETDEV_QSTATS_SCOPE_QUEUE:
+			err = netdev_nl_stats_by_queue(netdev, skb, info, ctx);
+			break;
+		}
+		if (err < 0)
+			break;
+	}
+	rtnl_unlock();
+
+	return err;
+}
+
 static int netdev_genl_netdevice_event(struct notifier_block *nb,
 				       unsigned long event, void *ptr)
 {
diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h
index 93cb411adf72..639ffa04c172 100644
--- a/tools/include/uapi/linux/netdev.h
+++ b/tools/include/uapi/linux/netdev.h
@@ -70,6 +70,10 @@ enum netdev_queue_type {
 	NETDEV_QUEUE_TYPE_TX,
 };
 
+enum netdev_qstats_scope {
+	NETDEV_QSTATS_SCOPE_QUEUE = 1,
+};
+
 enum {
 	NETDEV_A_DEV_IFINDEX = 1,
 	NETDEV_A_DEV_PAD,
@@ -132,6 +136,20 @@ enum {
 	NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1)
 };
 
+enum {
+	NETDEV_A_QSTATS_IFINDEX = 1,
+	NETDEV_A_QSTATS_QUEUE_TYPE,
+	NETDEV_A_QSTATS_QUEUE_ID,
+	NETDEV_A_QSTATS_SCOPE,
+	NETDEV_A_QSTATS_RX_PACKETS = 8,
+	NETDEV_A_QSTATS_RX_BYTES,
+	NETDEV_A_QSTATS_TX_PACKETS,
+	NETDEV_A_QSTATS_TX_BYTES,
+
+	__NETDEV_A_QSTATS_MAX,
+	NETDEV_A_QSTATS_MAX = (__NETDEV_A_QSTATS_MAX - 1)
+};
+
 enum {
 	NETDEV_CMD_DEV_GET = 1,
 	NETDEV_CMD_DEV_ADD_NTF,
@@ -144,6 +162,7 @@ enum {
 	NETDEV_CMD_PAGE_POOL_STATS_GET,
 	NETDEV_CMD_QUEUE_GET,
 	NETDEV_CMD_NAPI_GET,
+	NETDEV_CMD_QSTATS_GET,
 
 	__NETDEV_CMD_MAX,
 	NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1)
-- 
cgit v1.2.3


From 92f8b1f5ca0f157f564e75cef4c63641c172e0f1 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 6 Mar 2024 11:55:08 -0800
Subject: netdev: add queue stat for alloc failures

Rx alloc failures are commonly counted by drivers.
Support reporting those via netdev-genl queue stats.

Acked-by: Stanislav Fomichev <sdf@google.com>
Reviewed-by: Amritha Nambiar <amritha.nambiar@intel.com>
Reviewed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Link: https://lore.kernel.org/r/20240306195509.1502746-3-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/netlink/specs/netdev.yaml | 7 +++++++
 include/net/netdev_queues.h             | 2 ++
 include/uapi/linux/netdev.h             | 1 +
 net/core/netdev-genl.c                  | 3 ++-
 tools/include/uapi/linux/netdev.h       | 1 +
 5 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml
index a1e48c3c84c9..76352dbd2be4 100644
--- a/Documentation/netlink/specs/netdev.yaml
+++ b/Documentation/netlink/specs/netdev.yaml
@@ -328,6 +328,13 @@ attribute-sets:
         name: tx-bytes
         doc: Successfully sent bytes, see `tx-packets`.
         type: uint
+      -
+        name: rx-alloc-fail
+        doc: |
+          Number of times skb or buffer allocation failed on the Rx datapath.
+          Allocation failure may, or may not result in a packet drop, depending
+          on driver implementation and whether system recovers quickly.
+        type: uint
 
 operations:
   list:
diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h
index d633347eeda5..1ec408585373 100644
--- a/include/net/netdev_queues.h
+++ b/include/net/netdev_queues.h
@@ -4,9 +4,11 @@
 
 #include <linux/netdevice.h>
 
+/* See the netdev.yaml spec for definition of each statistic */
 struct netdev_queue_stats_rx {
 	u64 bytes;
 	u64 packets;
+	u64 alloc_fail;
 };
 
 struct netdev_queue_stats_tx {
diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h
index 639ffa04c172..bb65ee840cda 100644
--- a/include/uapi/linux/netdev.h
+++ b/include/uapi/linux/netdev.h
@@ -145,6 +145,7 @@ enum {
 	NETDEV_A_QSTATS_RX_BYTES,
 	NETDEV_A_QSTATS_TX_PACKETS,
 	NETDEV_A_QSTATS_TX_BYTES,
+	NETDEV_A_QSTATS_RX_ALLOC_FAIL,
 
 	__NETDEV_A_QSTATS_MAX,
 	NETDEV_A_QSTATS_MAX = (__NETDEV_A_QSTATS_MAX - 1)
diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
index 7fa75e13dc6d..7004b3399c2b 100644
--- a/net/core/netdev-genl.c
+++ b/net/core/netdev-genl.c
@@ -488,7 +488,8 @@ static int
 netdev_nl_stats_write_rx(struct sk_buff *rsp, struct netdev_queue_stats_rx *rx)
 {
 	if (netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_PACKETS, rx->packets) ||
-	    netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_BYTES, rx->bytes))
+	    netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_BYTES, rx->bytes) ||
+	    netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_ALLOC_FAIL, rx->alloc_fail))
 		return -EMSGSIZE;
 	return 0;
 }
diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h
index 639ffa04c172..bb65ee840cda 100644
--- a/tools/include/uapi/linux/netdev.h
+++ b/tools/include/uapi/linux/netdev.h
@@ -145,6 +145,7 @@ enum {
 	NETDEV_A_QSTATS_RX_BYTES,
 	NETDEV_A_QSTATS_TX_PACKETS,
 	NETDEV_A_QSTATS_TX_BYTES,
+	NETDEV_A_QSTATS_RX_ALLOC_FAIL,
 
 	__NETDEV_A_QSTATS_MAX,
 	NETDEV_A_QSTATS_MAX = (__NETDEV_A_QSTATS_MAX - 1)
-- 
cgit v1.2.3


From 6025b9135f7a8b46826a5fcf947259da43bac281 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 4 Mar 2024 06:08:47 -0800
Subject: net: dqs: add NIC stall detector based on BQL

softnet_data->time_squeeze is sometimes used as a proxy for
host overload or indication of scheduling problems. In practice
this statistic is very noisy and has hard to grasp units -
e.g. is 10 squeezes a second to be expected, or high?

Delaying network (NAPI) processing leads to drops on NIC queues
but also RTT bloat, impacting pacing and CA decisions.
Stalls are a little hard to detect on the Rx side, because
there may simply have not been any packets received in given
period of time. Packet timestamps help a little bit, but
again we don't know if packets are stale because we're
not keeping up or because someone (*cough* cgroups)
disabled IRQs for a long time.

We can, however, use Tx as a proxy for Rx stalls. Most drivers
use combined Rx+Tx NAPIs so if Tx gets starved so will Rx.
On the Tx side we know exactly when packets get queued,
and completed, so there is no uncertainty.

This patch adds stall checks to BQL. Why BQL? Because
it's a convenient place to add such checks, already
called by most drivers, and it has copious free space
in its structures (this patch adds no extra cache
references or dirtying to the fast path).

The algorithm takes one parameter - max delay AKA stall
threshold and increments a counter whenever NAPI got delayed
for at least that amount of time. It also records the length
of the longest stall.

To be precise every time NAPI has not polled for at least
stall thrs we check if there were any Tx packets queued
between last NAPI run and now - stall_thrs/2.

Unlike the classic Tx watchdog this mechanism does not
ignore stalls caused by Tx being disabled, or loss of link.
I don't think the check is worth the complexity, and
stall is a stall, whether due to host overload, flow
control, link down... doesn't matter much to the application.

We have been running this detector in production at Meta
for 2 years, with the threshold of 8ms. It's the lowest
value where false positives become rare. There's still
a constant stream of reported stalls (especially without
the ksoftirqd deferral patches reverted), those who like
their stall metrics to be 0 may prefer higher value.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Breno Leitao <leitao@debian.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/ABI/testing/sysfs-class-net-queues | 23 ++++++++
 include/linux/dynamic_queue_limits.h             | 45 ++++++++++++++
 include/trace/events/napi.h                      | 33 +++++++++++
 lib/dynamic_queue_limits.c                       | 74 ++++++++++++++++++++++++
 net/core/net-sysfs.c                             | 62 ++++++++++++++++++++
 5 files changed, 237 insertions(+)

(limited to 'include')

diff --git a/Documentation/ABI/testing/sysfs-class-net-queues b/Documentation/ABI/testing/sysfs-class-net-queues
index 5bff64d256c2..84aa25e0d14d 100644
--- a/Documentation/ABI/testing/sysfs-class-net-queues
+++ b/Documentation/ABI/testing/sysfs-class-net-queues
@@ -96,3 +96,26 @@ Description:
 		Indicates the absolute minimum limit of bytes allowed to be
 		queued on this network device transmit queue. Default value is
 		0.
+
+What:		/sys/class/net/<iface>/queues/tx-<queue>/byte_queue_limits/stall_thrs
+Date:		Jan 2024
+KernelVersion:	6.9
+Contact:	netdev@vger.kernel.org
+Description:
+		Tx completion stall detection threshold in ms. Kernel will
+		guarantee to detect all stalls longer than this threshold but
+		may also detect stalls longer than half of the threshold.
+
+What:		/sys/class/net/<iface>/queues/tx-<queue>/byte_queue_limits/stall_cnt
+Date:		Jan 2024
+KernelVersion:	6.9
+Contact:	netdev@vger.kernel.org
+Description:
+		Number of detected Tx completion stalls.
+
+What:		/sys/class/net/<iface>/queues/tx-<queue>/byte_queue_limits/stall_max
+Date:		Jan 2024
+KernelVersion:	6.9
+Contact:	netdev@vger.kernel.org
+Description:
+		Longest detected Tx completion stall. Write 0 to clear.
diff --git a/include/linux/dynamic_queue_limits.h b/include/linux/dynamic_queue_limits.h
index 407c2f281b64..5693a4be0d9a 100644
--- a/include/linux/dynamic_queue_limits.h
+++ b/include/linux/dynamic_queue_limits.h
@@ -38,14 +38,22 @@
 
 #ifdef __KERNEL__
 
+#include <linux/bitops.h>
 #include <asm/bug.h>
 
+#define DQL_HIST_LEN		4
+#define DQL_HIST_ENT(dql, idx)	((dql)->history[(idx) % DQL_HIST_LEN])
+
 struct dql {
 	/* Fields accessed in enqueue path (dql_queued) */
 	unsigned int	num_queued;		/* Total ever queued */
 	unsigned int	adj_limit;		/* limit + num_completed */
 	unsigned int	last_obj_cnt;		/* Count at last queuing */
 
+	unsigned long	history_head;		/* top 58 bits of jiffies */
+	/* stall entries, a bit per entry */
+	unsigned long	history[DQL_HIST_LEN];
+
 	/* Fields accessed only by completion path (dql_completed) */
 
 	unsigned int	limit ____cacheline_aligned_in_smp; /* Current limit */
@@ -62,6 +70,13 @@ struct dql {
 	unsigned int	max_limit;		/* Max limit */
 	unsigned int	min_limit;		/* Minimum limit */
 	unsigned int	slack_hold_time;	/* Time to measure slack */
+
+	/* Stall threshold (in jiffies), defined by user */
+	unsigned short	stall_thrs;
+	/* Longest stall detected, reported to user */
+	unsigned short	stall_max;
+	unsigned long	last_reap;		/* Last reap (in jiffies) */
+	unsigned long	stall_cnt;		/* Number of stalls */
 };
 
 /* Set some static maximums */
@@ -74,6 +89,8 @@ struct dql {
  */
 static inline void dql_queued(struct dql *dql, unsigned int count)
 {
+	unsigned long map, now, now_hi, i;
+
 	BUG_ON(count > DQL_MAX_OBJECT);
 
 	dql->last_obj_cnt = count;
@@ -86,6 +103,34 @@ static inline void dql_queued(struct dql *dql, unsigned int count)
 	barrier();
 
 	dql->num_queued += count;
+
+	now = jiffies;
+	now_hi = now / BITS_PER_LONG;
+
+	/* The following code set a bit in the ring buffer, where each
+	 * bit trackes time the packet was queued. The dql->history buffer
+	 * tracks DQL_HIST_LEN * BITS_PER_LONG time (jiffies) slot
+	 */
+	if (unlikely(now_hi != dql->history_head)) {
+		/* About to reuse slots, clear them */
+		for (i = 0; i < DQL_HIST_LEN; i++) {
+			/* Multiplication masks high bits */
+			if (now_hi * BITS_PER_LONG ==
+			    (dql->history_head + i) * BITS_PER_LONG)
+				break;
+			DQL_HIST_ENT(dql, dql->history_head + i + 1) = 0;
+		}
+		/* pairs with smp_rmb() in dql_check_stall() */
+		smp_wmb();
+		WRITE_ONCE(dql->history_head, now_hi);
+	}
+
+	/* __set_bit() does not guarantee WRITE_ONCE() semantics */
+	map = DQL_HIST_ENT(dql, now_hi);
+
+	/* Populate the history with an entry (bit) per queued */
+	if (!(map & BIT_MASK(now)))
+		WRITE_ONCE(DQL_HIST_ENT(dql, now_hi), map | BIT_MASK(now));
 }
 
 /* Returns how many objects can be queued, < 0 indicates over limit. */
diff --git a/include/trace/events/napi.h b/include/trace/events/napi.h
index 6678cf8b235b..dc03cf8e0369 100644
--- a/include/trace/events/napi.h
+++ b/include/trace/events/napi.h
@@ -36,6 +36,39 @@ TRACE_EVENT(napi_poll,
 		  __entry->work, __entry->budget)
 );
 
+TRACE_EVENT(dql_stall_detected,
+
+	TP_PROTO(unsigned short thrs, unsigned int len,
+		 unsigned long last_reap, unsigned long hist_head,
+		 unsigned long now, unsigned long *hist),
+
+	TP_ARGS(thrs, len, last_reap, hist_head, now, hist),
+
+	TP_STRUCT__entry(
+		__field(	unsigned short,		thrs)
+		__field(	unsigned int,		len)
+		__field(	unsigned long,		last_reap)
+		__field(	unsigned long,		hist_head)
+		__field(	unsigned long,		now)
+		__array(	unsigned long,		hist, 4)
+	),
+
+	TP_fast_assign(
+		__entry->thrs = thrs;
+		__entry->len = len;
+		__entry->last_reap = last_reap;
+		__entry->hist_head = hist_head * BITS_PER_LONG;
+		__entry->now = now;
+		memcpy(__entry->hist, hist, sizeof(entry->hist));
+	),
+
+	TP_printk("thrs %u  len %u  last_reap %lu  hist_head %lu  now %lu  hist %016lx %016lx %016lx %016lx",
+		  __entry->thrs, __entry->len,
+		  __entry->last_reap, __entry->hist_head, __entry->now,
+		  __entry->hist[0], __entry->hist[1],
+		  __entry->hist[2], __entry->hist[3])
+);
+
 #undef NO_DEV
 
 #endif /* _TRACE_NAPI_H */
diff --git a/lib/dynamic_queue_limits.c b/lib/dynamic_queue_limits.c
index fde0aa244148..a1389db1c30a 100644
--- a/lib/dynamic_queue_limits.c
+++ b/lib/dynamic_queue_limits.c
@@ -10,10 +10,77 @@
 #include <linux/dynamic_queue_limits.h>
 #include <linux/compiler.h>
 #include <linux/export.h>
+#include <trace/events/napi.h>
 
 #define POSDIFF(A, B) ((int)((A) - (B)) > 0 ? (A) - (B) : 0)
 #define AFTER_EQ(A, B) ((int)((A) - (B)) >= 0)
 
+static void dql_check_stall(struct dql *dql)
+{
+	unsigned short stall_thrs;
+	unsigned long now;
+
+	stall_thrs = READ_ONCE(dql->stall_thrs);
+	if (!stall_thrs)
+		return;
+
+	now = jiffies;
+	/* Check for a potential stall */
+	if (time_after_eq(now, dql->last_reap + stall_thrs)) {
+		unsigned long hist_head, t, start, end;
+
+		/* We are trying to detect a period of at least @stall_thrs
+		 * jiffies without any Tx completions, but during first half
+		 * of which some Tx was posted.
+		 */
+dqs_again:
+		hist_head = READ_ONCE(dql->history_head);
+		/* pairs with smp_wmb() in dql_queued() */
+		smp_rmb();
+
+		/* Get the previous entry in the ring buffer, which is the
+		 * oldest sample.
+		 */
+		start = (hist_head - DQL_HIST_LEN + 1) * BITS_PER_LONG;
+
+		/* Advance start to continue from the last reap time */
+		if (time_before(start, dql->last_reap + 1))
+			start = dql->last_reap + 1;
+
+		/* Newest sample we should have already seen a completion for */
+		end = hist_head * BITS_PER_LONG + (BITS_PER_LONG - 1);
+
+		/* Shrink the search space to [start, (now - start_thrs/2)] if
+		 * `end` is beyond the stall zone
+		 */
+		if (time_before(now, end + stall_thrs / 2))
+			end = now - stall_thrs / 2;
+
+		/* Search for the queued time in [t, end] */
+		for (t = start; time_before_eq(t, end); t++)
+			if (test_bit(t % (DQL_HIST_LEN * BITS_PER_LONG),
+				     dql->history))
+				break;
+
+		/* Variable t contains the time of the queue */
+		if (!time_before_eq(t, end))
+			goto no_stall;
+
+		/* The ring buffer was modified in the meantime, retry */
+		if (hist_head != READ_ONCE(dql->history_head))
+			goto dqs_again;
+
+		dql->stall_cnt++;
+		dql->stall_max = max_t(unsigned short, dql->stall_max, now - t);
+
+		trace_dql_stall_detected(dql->stall_thrs, now - t,
+					 dql->last_reap, dql->history_head,
+					 now, dql->history);
+	}
+no_stall:
+	dql->last_reap = now;
+}
+
 /* Records completed count and recalculates the queue limit */
 void dql_completed(struct dql *dql, unsigned int count)
 {
@@ -110,6 +177,8 @@ void dql_completed(struct dql *dql, unsigned int count)
 	dql->prev_last_obj_cnt = dql->last_obj_cnt;
 	dql->num_completed = completed;
 	dql->prev_num_queued = num_queued;
+
+	dql_check_stall(dql);
 }
 EXPORT_SYMBOL(dql_completed);
 
@@ -125,6 +194,10 @@ void dql_reset(struct dql *dql)
 	dql->prev_ovlimit = 0;
 	dql->lowest_slack = UINT_MAX;
 	dql->slack_start_time = jiffies;
+
+	dql->last_reap = jiffies;
+	dql->history_head = jiffies / BITS_PER_LONG;
+	memset(dql->history, 0, sizeof(dql->history));
 }
 EXPORT_SYMBOL(dql_reset);
 
@@ -133,6 +206,7 @@ void dql_init(struct dql *dql, unsigned int hold_time)
 	dql->max_limit = DQL_MAX_LIMIT;
 	dql->min_limit = 0;
 	dql->slack_hold_time = hold_time;
+	dql->stall_thrs = 0;
 	dql_reset(dql);
 }
 EXPORT_SYMBOL(dql_init);
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 5560083774b1..e3d7a8cfa20b 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -1415,6 +1415,65 @@ static struct netdev_queue_attribute bql_hold_time_attribute __ro_after_init
 	= __ATTR(hold_time, 0644,
 		 bql_show_hold_time, bql_set_hold_time);
 
+static ssize_t bql_show_stall_thrs(struct netdev_queue *queue, char *buf)
+{
+	struct dql *dql = &queue->dql;
+
+	return sprintf(buf, "%u\n", jiffies_to_msecs(dql->stall_thrs));
+}
+
+static ssize_t bql_set_stall_thrs(struct netdev_queue *queue,
+				  const char *buf, size_t len)
+{
+	struct dql *dql = &queue->dql;
+	unsigned int value;
+	int err;
+
+	err = kstrtouint(buf, 10, &value);
+	if (err < 0)
+		return err;
+
+	value = msecs_to_jiffies(value);
+	if (value && (value < 4 || value > 4 / 2 * BITS_PER_LONG))
+		return -ERANGE;
+
+	if (!dql->stall_thrs && value)
+		dql->last_reap = jiffies;
+	/* Force last_reap to be live */
+	smp_wmb();
+	dql->stall_thrs = value;
+
+	return len;
+}
+
+static struct netdev_queue_attribute bql_stall_thrs_attribute __ro_after_init =
+	__ATTR(stall_thrs, 0644, bql_show_stall_thrs, bql_set_stall_thrs);
+
+static ssize_t bql_show_stall_max(struct netdev_queue *queue, char *buf)
+{
+	return sprintf(buf, "%u\n", READ_ONCE(queue->dql.stall_max));
+}
+
+static ssize_t bql_set_stall_max(struct netdev_queue *queue,
+				 const char *buf, size_t len)
+{
+	WRITE_ONCE(queue->dql.stall_max, 0);
+	return len;
+}
+
+static struct netdev_queue_attribute bql_stall_max_attribute __ro_after_init =
+	__ATTR(stall_max, 0644, bql_show_stall_max, bql_set_stall_max);
+
+static ssize_t bql_show_stall_cnt(struct netdev_queue *queue, char *buf)
+{
+	struct dql *dql = &queue->dql;
+
+	return sprintf(buf, "%lu\n", dql->stall_cnt);
+}
+
+static struct netdev_queue_attribute bql_stall_cnt_attribute __ro_after_init =
+	__ATTR(stall_cnt, 0444, bql_show_stall_cnt, NULL);
+
 static ssize_t bql_show_inflight(struct netdev_queue *queue,
 				 char *buf)
 {
@@ -1453,6 +1512,9 @@ static struct attribute *dql_attrs[] __ro_after_init = {
 	&bql_limit_min_attribute.attr,
 	&bql_hold_time_attribute.attr,
 	&bql_inflight_attribute.attr,
+	&bql_stall_thrs_attribute.attr,
+	&bql_stall_cnt_attribute.attr,
+	&bql_stall_max_attribute.attr,
 	NULL
 };
 
-- 
cgit v1.2.3


From caabd859c41b50a571cfdf7747de9f245c5d531b Mon Sep 17 00:00:00 2001
From: fuyuanli <fuyuanli@didiglobal.com>
Date: Tue, 5 Mar 2024 11:04:17 +0800
Subject: tcp: Add skb addr and sock addr to arguments of tracepoint tcp_probe.

It is useful to expose skb addr and sock addr to user in tracepoint
tcp_probe, so that we can get more information while monitoring
receiving of tcp data, by ebpf or other ways.

For example, we need to identify a packet by seq and end_seq when
calculate transmit latency between layer 2 and layer 4 by ebpf, but which is
not available in tcp_probe, so we can only use kprobe hooking
tcp_rcv_established to get them. But we can use tcp_probe directly if skb
addr and sock addr are available, which is more efficient.

Signed-off-by: fuyuanli <fuyuanli@didiglobal.com>
Reviewed-by: Jason Xing <kerneljasonxing@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/trace/events/tcp.h | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index 6ca3e0343666..699dafd204ea 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -259,6 +259,8 @@ TRACE_EVENT(tcp_probe,
 		__field(__u32, srtt)
 		__field(__u32, rcv_wnd)
 		__field(__u64, sock_cookie)
+		__field(const void *, skbaddr)
+		__field(const void *, skaddr)
 	),
 
 	TP_fast_assign(
@@ -286,14 +288,18 @@ TRACE_EVENT(tcp_probe,
 		__entry->ssthresh = tcp_current_ssthresh(sk);
 		__entry->srtt = tp->srtt_us >> 3;
 		__entry->sock_cookie = sock_gen_cookie(sk);
+
+		__entry->skbaddr = skb;
+		__entry->skaddr = sk;
 	),
 
-	TP_printk("family=%s src=%pISpc dest=%pISpc mark=%#x data_len=%d snd_nxt=%#x snd_una=%#x snd_cwnd=%u ssthresh=%u snd_wnd=%u srtt=%u rcv_wnd=%u sock_cookie=%llx",
+	TP_printk("family=%s src=%pISpc dest=%pISpc mark=%#x data_len=%d snd_nxt=%#x snd_una=%#x snd_cwnd=%u ssthresh=%u snd_wnd=%u srtt=%u rcv_wnd=%u sock_cookie=%llx skbaddr=%p skaddr=%p",
 		  show_family_name(__entry->family),
 		  __entry->saddr, __entry->daddr, __entry->mark,
 		  __entry->data_len, __entry->snd_nxt, __entry->snd_una,
 		  __entry->snd_cwnd, __entry->ssthresh, __entry->snd_wnd,
-		  __entry->srtt, __entry->rcv_wnd, __entry->sock_cookie)
+		  __entry->srtt, __entry->rcv_wnd, __entry->sock_cookie,
+		  __entry->skbaddr, __entry->skaddr)
 );
 
 #define TP_STORE_ADDR_PORTS_SKB_V4(__entry, skb)			\
-- 
cgit v1.2.3


From a207eab1039b501daddc8e729c9cc5d99fe93d18 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@nvidia.com>
Date: Wed, 6 Mar 2024 13:49:16 +0100
Subject: net: nexthop: Add NHA_OP_FLAGS

In order to add per-nexthop statistics, but still not increase netlink
message size for consumers that do not care about them, there needs to be a
toggle through which the user indicates their desire to get the statistics.
To that end, add a new attribute, NHA_OP_FLAGS. The idea is to be able to
use the attribute for carrying of arbitrary operation-specific flags, i.e.
not make it specific for get / dump.

Add the new attribute to get and dump policies, but do not actually allow
any flags yet -- those will come later as the flags themselves are defined.
Add the necessary parsing code.

Signed-off-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/nexthop.h |  3 +++
 net/ipv4/nexthop.c           | 24 ++++++++++++++++++++----
 2 files changed, 23 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/nexthop.h b/include/uapi/linux/nexthop.h
index d8ffa8c9ca78..086444e2946c 100644
--- a/include/uapi/linux/nexthop.h
+++ b/include/uapi/linux/nexthop.h
@@ -60,6 +60,9 @@ enum {
 	/* nested; nexthop bucket attributes */
 	NHA_RES_BUCKET,
 
+	/* u32; operation-specific flags */
+	NHA_OP_FLAGS,
+
 	__NHA_MAX,
 };
 
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index bcd4df2f1cad..576981f4ca49 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -41,6 +41,7 @@ static const struct nla_policy rtm_nh_policy_new[] = {
 
 static const struct nla_policy rtm_nh_policy_get[] = {
 	[NHA_ID]		= { .type = NLA_U32 },
+	[NHA_OP_FLAGS]		= NLA_POLICY_MASK(NLA_U32, 0),
 };
 
 static const struct nla_policy rtm_nh_policy_del[] = {
@@ -52,6 +53,7 @@ static const struct nla_policy rtm_nh_policy_dump[] = {
 	[NHA_GROUPS]		= { .type = NLA_FLAG },
 	[NHA_MASTER]		= { .type = NLA_U32 },
 	[NHA_FDB]		= { .type = NLA_FLAG },
+	[NHA_OP_FLAGS]		= NLA_POLICY_MASK(NLA_U32, 0),
 };
 
 static const struct nla_policy rtm_nh_res_policy_new[] = {
@@ -2971,7 +2973,7 @@ static int rtm_new_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh,
 }
 
 static int nh_valid_get_del_req(const struct nlmsghdr *nlh,
-				struct nlattr **tb, u32 *id,
+				struct nlattr **tb, u32 *id, u32 *op_flags,
 				struct netlink_ext_ack *extack)
 {
 	struct nhmsg *nhm = nlmsg_data(nlh);
@@ -2992,6 +2994,11 @@ static int nh_valid_get_del_req(const struct nlmsghdr *nlh,
 		return -EINVAL;
 	}
 
+	if (tb[NHA_OP_FLAGS])
+		*op_flags = nla_get_u32(tb[NHA_OP_FLAGS]);
+	else
+		*op_flags = 0;
+
 	return 0;
 }
 
@@ -3007,6 +3014,7 @@ static int rtm_del_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh,
 		.portid = NETLINK_CB(skb).portid,
 	};
 	struct nexthop *nh;
+	u32 op_flags;
 	int err;
 	u32 id;
 
@@ -3015,7 +3023,7 @@ static int rtm_del_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (err < 0)
 		return err;
 
-	err = nh_valid_get_del_req(nlh, tb, &id, extack);
+	err = nh_valid_get_del_req(nlh, tb, &id, &op_flags, extack);
 	if (err)
 		return err;
 
@@ -3036,6 +3044,7 @@ static int rtm_get_nexthop(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 	struct nlattr *tb[NHA_MAX + 1];
 	struct sk_buff *skb = NULL;
 	struct nexthop *nh;
+	u32 op_flags;
 	int err;
 	u32 id;
 
@@ -3044,7 +3053,7 @@ static int rtm_get_nexthop(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 	if (err < 0)
 		return err;
 
-	err = nh_valid_get_del_req(nlh, tb, &id, extack);
+	err = nh_valid_get_del_req(nlh, tb, &id, &op_flags, extack);
 	if (err)
 		return err;
 
@@ -3080,6 +3089,7 @@ struct nh_dump_filter {
 	bool group_filter;
 	bool fdb_filter;
 	u32 res_bucket_nh_id;
+	u32 op_flags;
 };
 
 static bool nh_dump_filtered(struct nexthop *nh,
@@ -3151,6 +3161,11 @@ static int __nh_valid_dump_req(const struct nlmsghdr *nlh, struct nlattr **tb,
 		return -EINVAL;
 	}
 
+	if (tb[NHA_OP_FLAGS])
+		filter->op_flags = nla_get_u32(tb[NHA_OP_FLAGS]);
+	else
+		filter->op_flags = 0;
+
 	return 0;
 }
 
@@ -3474,6 +3489,7 @@ static int nh_valid_get_bucket_req(const struct nlmsghdr *nlh,
 				   struct netlink_ext_ack *extack)
 {
 	struct nlattr *tb[NHA_MAX + 1];
+	u32 op_flags;
 	int err;
 
 	err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, NHA_MAX,
@@ -3481,7 +3497,7 @@ static int nh_valid_get_bucket_req(const struct nlmsghdr *nlh,
 	if (err < 0)
 		return err;
 
-	err = nh_valid_get_del_req(nlh, tb, id, extack);
+	err = nh_valid_get_del_req(nlh, tb, id, &op_flags, extack);
 	if (err)
 		return err;
 
-- 
cgit v1.2.3


From f4676ea74b8549cd88dbfe2a592ce4530039e61f Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Wed, 6 Mar 2024 13:49:17 +0100
Subject: net: nexthop: Add nexthop group entry stats

Add nexthop group entry stats to count the number of packets forwarded
via each nexthop in the group. The stats will be exposed to user space
for better data path observability in the next patch.

The per-CPU stats pointer is placed at the beginning of 'struct
nh_grp_entry', so that all the fields accessed for the data path reside
on the same cache line:

struct nh_grp_entry {
        struct nexthop *           nh;                   /*     0     8 */
        struct nh_grp_entry_stats * stats;               /*     8     8 */
        u8                         weight;               /*    16     1 */

        /* XXX 7 bytes hole, try to pack */

        union {
                struct {
                        atomic_t   upper_bound;          /*    24     4 */
                } hthr;                                  /*    24     4 */
                struct {
                        struct list_head uw_nh_entry;    /*    24    16 */
                        u16        count_buckets;        /*    40     2 */
                        u16        wants_buckets;        /*    42     2 */
                } res;                                   /*    24    24 */
        };                                               /*    24    24 */
        struct list_head           nh_list;              /*    48    16 */
        /* --- cacheline 1 boundary (64 bytes) --- */
        struct nexthop *           nh_parent;            /*    64     8 */

        /* size: 72, cachelines: 2, members: 6 */
        /* sum members: 65, holes: 1, sum holes: 7 */
        /* last cacheline: 8 bytes */
};

Co-developed-by: Petr Machata <petrm@nvidia.com>
Signed-off-by: Petr Machata <petrm@nvidia.com>
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/nexthop.h |  6 ++++++
 net/ipv4/nexthop.c    | 35 +++++++++++++++++++++++++++++++----
 2 files changed, 37 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/nexthop.h b/include/net/nexthop.h
index 77e99cba60ad..6e6a36fee51e 100644
--- a/include/net/nexthop.h
+++ b/include/net/nexthop.h
@@ -95,8 +95,14 @@ struct nh_res_table {
 	struct nh_res_bucket	nh_buckets[] __counted_by(num_nh_buckets);
 };
 
+struct nh_grp_entry_stats {
+	u64_stats_t packets;
+	struct u64_stats_sync syncp;
+};
+
 struct nh_grp_entry {
 	struct nexthop	*nh;
+	struct nh_grp_entry_stats __percpu	*stats;
 	u8		weight;
 
 	union {
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index 576981f4ca49..92dc21a231f8 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -480,6 +480,7 @@ static void nexthop_free_group(struct nexthop *nh)
 		struct nh_grp_entry *nhge = &nhg->nh_entries[i];
 
 		WARN_ON(!list_empty(&nhge->nh_list));
+		free_percpu(nhge->stats);
 		nexthop_put(nhge->nh);
 	}
 
@@ -660,6 +661,16 @@ nla_put_failure:
 	return -EMSGSIZE;
 }
 
+static void nh_grp_entry_stats_inc(struct nh_grp_entry *nhge)
+{
+	struct nh_grp_entry_stats *cpu_stats;
+
+	cpu_stats = this_cpu_ptr(nhge->stats);
+	u64_stats_update_begin(&cpu_stats->syncp);
+	u64_stats_inc(&cpu_stats->packets);
+	u64_stats_update_end(&cpu_stats->syncp);
+}
+
 static int nla_put_nh_group(struct sk_buff *skb, struct nh_group *nhg)
 {
 	struct nexthop_grp *p;
@@ -1182,6 +1193,7 @@ static struct nexthop *nexthop_select_path_fdb(struct nh_group *nhg, int hash)
 		if (hash > atomic_read(&nhge->hthr.upper_bound))
 			continue;
 
+		nh_grp_entry_stats_inc(nhge);
 		return nhge->nh;
 	}
 
@@ -1191,7 +1203,7 @@ static struct nexthop *nexthop_select_path_fdb(struct nh_group *nhg, int hash)
 
 static struct nexthop *nexthop_select_path_hthr(struct nh_group *nhg, int hash)
 {
-	struct nexthop *rc = NULL;
+	struct nh_grp_entry *nhge0 = NULL;
 	int i;
 
 	if (nhg->fdb_nh)
@@ -1206,16 +1218,20 @@ static struct nexthop *nexthop_select_path_hthr(struct nh_group *nhg, int hash)
 		if (!nexthop_is_good_nh(nhge->nh))
 			continue;
 
-		if (!rc)
-			rc = nhge->nh;
+		if (!nhge0)
+			nhge0 = nhge;
 
 		if (hash > atomic_read(&nhge->hthr.upper_bound))
 			continue;
 
+		nh_grp_entry_stats_inc(nhge);
 		return nhge->nh;
 	}
 
-	return rc ? : nhg->nh_entries[0].nh;
+	if (!nhge0)
+		nhge0 = &nhg->nh_entries[0];
+	nh_grp_entry_stats_inc(nhge0);
+	return nhge0->nh;
 }
 
 static struct nexthop *nexthop_select_path_res(struct nh_group *nhg, int hash)
@@ -1231,6 +1247,7 @@ static struct nexthop *nexthop_select_path_res(struct nh_group *nhg, int hash)
 	bucket = &res_table->nh_buckets[bucket_index];
 	nh_res_bucket_set_busy(bucket);
 	nhge = rcu_dereference(bucket->nh_entry);
+	nh_grp_entry_stats_inc(nhge);
 	return nhge->nh;
 }
 
@@ -1804,6 +1821,7 @@ static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge,
 			newg->has_v4 = true;
 
 		list_del(&nhges[i].nh_list);
+		new_nhges[j].stats = nhges[i].stats;
 		new_nhges[j].nh_parent = nhges[i].nh_parent;
 		new_nhges[j].nh = nhges[i].nh;
 		new_nhges[j].weight = nhges[i].weight;
@@ -1819,6 +1837,7 @@ static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge,
 	rcu_assign_pointer(nhp->nh_grp, newg);
 
 	list_del(&nhge->nh_list);
+	free_percpu(nhge->stats);
 	nexthop_put(nhge->nh);
 
 	/* Removal of a NH from a resilient group is notified through
@@ -2483,6 +2502,13 @@ static struct nexthop *nexthop_create_group(struct net *net,
 		if (nhi->family == AF_INET)
 			nhg->has_v4 = true;
 
+		nhg->nh_entries[i].stats =
+			netdev_alloc_pcpu_stats(struct nh_grp_entry_stats);
+		if (!nhg->nh_entries[i].stats) {
+			err = -ENOMEM;
+			nexthop_put(nhe);
+			goto out_no_nh;
+		}
 		nhg->nh_entries[i].nh = nhe;
 		nhg->nh_entries[i].weight = entry[i].weight + 1;
 		list_add(&nhg->nh_entries[i].nh_list, &nhe->grp_list);
@@ -2522,6 +2548,7 @@ static struct nexthop *nexthop_create_group(struct net *net,
 out_no_nh:
 	for (i--; i >= 0; --i) {
 		list_del(&nhg->nh_entries[i].nh_list);
+		free_percpu(nhg->nh_entries[i].stats);
 		nexthop_put(nhg->nh_entries[i].nh);
 	}
 
-- 
cgit v1.2.3


From 95fedd7685912f96304615efe50485588b1c3567 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Wed, 6 Mar 2024 13:49:18 +0100
Subject: net: nexthop: Expose nexthop group stats to user space

Add netlink support for reading NH group stats.

This data is only for statistics of the traffic in the SW datapath. HW
nexthop group statistics will be added in the following patches.

Emission of the stats is keyed to a new op_stats flag to avoid cluttering
the netlink message with stats if the user doesn't need them:
NHA_OP_FLAG_DUMP_STATS.

Co-developed-by: Petr Machata <petrm@nvidia.com>
Signed-off-by: Petr Machata <petrm@nvidia.com>
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/nexthop.h | 30 ++++++++++++++
 net/ipv4/nexthop.c           | 95 ++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 117 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/nexthop.h b/include/uapi/linux/nexthop.h
index 086444e2946c..f4db63c17085 100644
--- a/include/uapi/linux/nexthop.h
+++ b/include/uapi/linux/nexthop.h
@@ -30,6 +30,8 @@ enum {
 
 #define NEXTHOP_GRP_TYPE_MAX (__NEXTHOP_GRP_TYPE_MAX - 1)
 
+#define NHA_OP_FLAG_DUMP_STATS		BIT(0)
+
 enum {
 	NHA_UNSPEC,
 	NHA_ID,		/* u32; id for nexthop. id == 0 means auto-assign */
@@ -63,6 +65,9 @@ enum {
 	/* u32; operation-specific flags */
 	NHA_OP_FLAGS,
 
+	/* nested; nexthop group stats */
+	NHA_GROUP_STATS,
+
 	__NHA_MAX,
 };
 
@@ -104,4 +109,29 @@ enum {
 
 #define NHA_RES_BUCKET_MAX	(__NHA_RES_BUCKET_MAX - 1)
 
+enum {
+	NHA_GROUP_STATS_UNSPEC,
+
+	/* nested; nexthop group entry stats */
+	NHA_GROUP_STATS_ENTRY,
+
+	__NHA_GROUP_STATS_MAX,
+};
+
+#define NHA_GROUP_STATS_MAX	(__NHA_GROUP_STATS_MAX - 1)
+
+enum {
+	NHA_GROUP_STATS_ENTRY_UNSPEC,
+
+	/* u32; nexthop id of the nexthop group entry */
+	NHA_GROUP_STATS_ENTRY_ID,
+
+	/* uint; number of packets forwarded via the nexthop group entry */
+	NHA_GROUP_STATS_ENTRY_PACKETS,
+
+	__NHA_GROUP_STATS_ENTRY_MAX,
+};
+
+#define NHA_GROUP_STATS_ENTRY_MAX	(__NHA_GROUP_STATS_ENTRY_MAX - 1)
+
 #endif
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index 92dc21a231f8..3365c41eee9a 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -26,6 +26,8 @@ static void remove_nexthop(struct net *net, struct nexthop *nh,
 #define NH_DEV_HASHBITS  8
 #define NH_DEV_HASHSIZE (1U << NH_DEV_HASHBITS)
 
+#define NHA_OP_FLAGS_DUMP_ALL (NHA_OP_FLAG_DUMP_STATS)
+
 static const struct nla_policy rtm_nh_policy_new[] = {
 	[NHA_ID]		= { .type = NLA_U32 },
 	[NHA_GROUP]		= { .type = NLA_BINARY },
@@ -41,7 +43,8 @@ static const struct nla_policy rtm_nh_policy_new[] = {
 
 static const struct nla_policy rtm_nh_policy_get[] = {
 	[NHA_ID]		= { .type = NLA_U32 },
-	[NHA_OP_FLAGS]		= NLA_POLICY_MASK(NLA_U32, 0),
+	[NHA_OP_FLAGS]		= NLA_POLICY_MASK(NLA_U32,
+						  NHA_OP_FLAGS_DUMP_ALL),
 };
 
 static const struct nla_policy rtm_nh_policy_del[] = {
@@ -53,7 +56,8 @@ static const struct nla_policy rtm_nh_policy_dump[] = {
 	[NHA_GROUPS]		= { .type = NLA_FLAG },
 	[NHA_MASTER]		= { .type = NLA_U32 },
 	[NHA_FDB]		= { .type = NLA_FLAG },
-	[NHA_OP_FLAGS]		= NLA_POLICY_MASK(NLA_U32, 0),
+	[NHA_OP_FLAGS]		= NLA_POLICY_MASK(NLA_U32,
+						  NHA_OP_FLAGS_DUMP_ALL),
 };
 
 static const struct nla_policy rtm_nh_res_policy_new[] = {
@@ -671,8 +675,78 @@ static void nh_grp_entry_stats_inc(struct nh_grp_entry *nhge)
 	u64_stats_update_end(&cpu_stats->syncp);
 }
 
-static int nla_put_nh_group(struct sk_buff *skb, struct nh_group *nhg)
+static void nh_grp_entry_stats_read(struct nh_grp_entry *nhge,
+				    u64 *ret_packets)
+{
+	int i;
+
+	*ret_packets = 0;
+
+	for_each_possible_cpu(i) {
+		struct nh_grp_entry_stats *cpu_stats;
+		unsigned int start;
+		u64 packets;
+
+		cpu_stats = per_cpu_ptr(nhge->stats, i);
+		do {
+			start = u64_stats_fetch_begin(&cpu_stats->syncp);
+			packets = u64_stats_read(&cpu_stats->packets);
+		} while (u64_stats_fetch_retry(&cpu_stats->syncp, start));
+
+		*ret_packets += packets;
+	}
+}
+
+static int nla_put_nh_group_stats_entry(struct sk_buff *skb,
+					struct nh_grp_entry *nhge)
+{
+	struct nlattr *nest;
+	u64 packets;
+
+	nh_grp_entry_stats_read(nhge, &packets);
+
+	nest = nla_nest_start(skb, NHA_GROUP_STATS_ENTRY);
+	if (!nest)
+		return -EMSGSIZE;
+
+	if (nla_put_u32(skb, NHA_GROUP_STATS_ENTRY_ID, nhge->nh->id) ||
+	    nla_put_uint(skb, NHA_GROUP_STATS_ENTRY_PACKETS, packets))
+		goto nla_put_failure;
+
+	nla_nest_end(skb, nest);
+	return 0;
+
+nla_put_failure:
+	nla_nest_cancel(skb, nest);
+	return -EMSGSIZE;
+}
+
+static int nla_put_nh_group_stats(struct sk_buff *skb, struct nexthop *nh)
 {
+	struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
+	struct nlattr *nest;
+	int i;
+
+	nest = nla_nest_start(skb, NHA_GROUP_STATS);
+	if (!nest)
+		return -EMSGSIZE;
+
+	for (i = 0; i < nhg->num_nh; i++)
+		if (nla_put_nh_group_stats_entry(skb, &nhg->nh_entries[i]))
+			goto cancel_out;
+
+	nla_nest_end(skb, nest);
+	return 0;
+
+cancel_out:
+	nla_nest_cancel(skb, nest);
+	return -EMSGSIZE;
+}
+
+static int nla_put_nh_group(struct sk_buff *skb, struct nexthop *nh,
+			    u32 op_flags)
+{
+	struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
 	struct nexthop_grp *p;
 	size_t len = nhg->num_nh * sizeof(*p);
 	struct nlattr *nla;
@@ -701,6 +775,10 @@ static int nla_put_nh_group(struct sk_buff *skb, struct nh_group *nhg)
 	if (nhg->resilient && nla_put_nh_group_res(skb, nhg))
 		goto nla_put_failure;
 
+	if (op_flags & NHA_OP_FLAG_DUMP_STATS &&
+	    nla_put_nh_group_stats(skb, nh))
+		goto nla_put_failure;
+
 	return 0;
 
 nla_put_failure:
@@ -708,7 +786,8 @@ nla_put_failure:
 }
 
 static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh,
-			int event, u32 portid, u32 seq, unsigned int nlflags)
+			int event, u32 portid, u32 seq, unsigned int nlflags,
+			u32 op_flags)
 {
 	struct fib6_nh *fib6_nh;
 	struct fib_nh *fib_nh;
@@ -735,7 +814,7 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh,
 
 		if (nhg->fdb_nh && nla_put_flag(skb, NHA_FDB))
 			goto nla_put_failure;
-		if (nla_put_nh_group(skb, nhg))
+		if (nla_put_nh_group(skb, nh, op_flags))
 			goto nla_put_failure;
 		goto out;
 	}
@@ -866,7 +945,7 @@ static void nexthop_notify(int event, struct nexthop *nh, struct nl_info *info)
 	if (!skb)
 		goto errout;
 
-	err = nh_fill_node(skb, nh, event, info->portid, seq, nlflags);
+	err = nh_fill_node(skb, nh, event, info->portid, seq, nlflags, 0);
 	if (err < 0) {
 		/* -EMSGSIZE implies BUG in nh_nlmsg_size() */
 		WARN_ON(err == -EMSGSIZE);
@@ -3095,7 +3174,7 @@ static int rtm_get_nexthop(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 		goto errout_free;
 
 	err = nh_fill_node(skb, nh, RTM_NEWNEXTHOP, NETLINK_CB(in_skb).portid,
-			   nlh->nlmsg_seq, 0);
+			   nlh->nlmsg_seq, 0, op_flags);
 	if (err < 0) {
 		WARN_ON(err == -EMSGSIZE);
 		goto errout_free;
@@ -3265,7 +3344,7 @@ static int rtm_dump_nexthop_cb(struct sk_buff *skb, struct netlink_callback *cb,
 
 	return nh_fill_node(skb, nh, RTM_NEWNEXTHOP,
 			    NETLINK_CB(cb->skb).portid,
-			    cb->nlh->nlmsg_seq, NLM_F_MULTI);
+			    cb->nlh->nlmsg_seq, NLM_F_MULTI, filter->op_flags);
 }
 
 /* rtnl */
-- 
cgit v1.2.3


From 5877786fcf52d1b255afcd61832753d1619f0738 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Wed, 6 Mar 2024 13:49:19 +0100
Subject: net: nexthop: Add hardware statistics notifications

Add hw_stats field to several notifier structures to communicate to the
drivers that HW statistics should be configured for nexthops within a given
group.

Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: Petr Machata <petrm@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/nexthop.h | 3 +++
 net/ipv4/nexthop.c    | 2 ++
 2 files changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/net/nexthop.h b/include/net/nexthop.h
index 6e6a36fee51e..584c37120c20 100644
--- a/include/net/nexthop.h
+++ b/include/net/nexthop.h
@@ -130,6 +130,7 @@ struct nh_group {
 	bool			resilient;
 	bool			fdb_nh;
 	bool			has_v4;
+	bool			hw_stats;
 
 	struct nh_res_table __rcu *res_table;
 	struct nh_grp_entry	nh_entries[] __counted_by(num_nh);
@@ -193,6 +194,7 @@ struct nh_notifier_grp_entry_info {
 struct nh_notifier_grp_info {
 	u16 num_nh;
 	bool is_fdb;
+	bool hw_stats;
 	struct nh_notifier_grp_entry_info nh_entries[] __counted_by(num_nh);
 };
 
@@ -206,6 +208,7 @@ struct nh_notifier_res_bucket_info {
 
 struct nh_notifier_res_table_info {
 	u16 num_nh_buckets;
+	bool hw_stats;
 	struct nh_notifier_single_info nhs[] __counted_by(num_nh_buckets);
 };
 
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index 3365c41eee9a..c0d14e16e9c8 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -141,6 +141,7 @@ static int nh_notifier_mpath_info_init(struct nh_notifier_info *info,
 
 	info->nh_grp->num_nh = num_nh;
 	info->nh_grp->is_fdb = nhg->fdb_nh;
+	info->nh_grp->hw_stats = nhg->hw_stats;
 
 	for (i = 0; i < num_nh; i++) {
 		struct nh_grp_entry *nhge = &nhg->nh_entries[i];
@@ -172,6 +173,7 @@ static int nh_notifier_res_table_info_init(struct nh_notifier_info *info,
 		return -ENOMEM;
 
 	info->nh_res_table->num_nh_buckets = num_nh_buckets;
+	info->nh_res_table->hw_stats = nhg->hw_stats;
 
 	for (i = 0; i < num_nh_buckets; i++) {
 		struct nh_res_bucket *bucket = &res_table->nh_buckets[i];
-- 
cgit v1.2.3


From 746c19a52ec50b81422fd4772254d55e588d7df6 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Wed, 6 Mar 2024 13:49:20 +0100
Subject: net: nexthop: Add ability to enable / disable hardware statistics

Add netlink support for enabling collection of HW statistics on nexthop
groups.

Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: Petr Machata <petrm@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/nexthop.h        |  2 ++
 include/uapi/linux/nexthop.h |  3 +++
 net/ipv4/nexthop.c           | 15 ++++++++++++++-
 3 files changed, 19 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/nexthop.h b/include/net/nexthop.h
index 584c37120c20..1ba6d3668518 100644
--- a/include/net/nexthop.h
+++ b/include/net/nexthop.h
@@ -47,6 +47,8 @@ struct nh_config {
 	bool		nh_grp_res_has_idle_timer;
 	bool		nh_grp_res_has_unbalanced_timer;
 
+	bool		nh_hw_stats;
+
 	struct nlattr	*nh_encap;
 	u16		nh_encap_type;
 
diff --git a/include/uapi/linux/nexthop.h b/include/uapi/linux/nexthop.h
index f4db63c17085..b86af80d4e09 100644
--- a/include/uapi/linux/nexthop.h
+++ b/include/uapi/linux/nexthop.h
@@ -68,6 +68,9 @@ enum {
 	/* nested; nexthop group stats */
 	NHA_GROUP_STATS,
 
+	/* u32; nexthop hardware stats enable */
+	NHA_HW_STATS_ENABLE,
+
 	__NHA_MAX,
 };
 
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index c0d14e16e9c8..e75b1aae5a81 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -39,6 +39,7 @@ static const struct nla_policy rtm_nh_policy_new[] = {
 	[NHA_ENCAP]		= { .type = NLA_NESTED },
 	[NHA_FDB]		= { .type = NLA_FLAG },
 	[NHA_RES_GROUP]		= { .type = NLA_NESTED },
+	[NHA_HW_STATS_ENABLE]	= NLA_POLICY_MAX(NLA_U32, true),
 };
 
 static const struct nla_policy rtm_nh_policy_get[] = {
@@ -778,7 +779,8 @@ static int nla_put_nh_group(struct sk_buff *skb, struct nexthop *nh,
 		goto nla_put_failure;
 
 	if (op_flags & NHA_OP_FLAG_DUMP_STATS &&
-	    nla_put_nh_group_stats(skb, nh))
+	    (nla_put_u32(skb, NHA_HW_STATS_ENABLE, nhg->hw_stats) ||
+	     nla_put_nh_group_stats(skb, nh)))
 		goto nla_put_failure;
 
 	return 0;
@@ -1202,6 +1204,7 @@ static int nh_check_attr_group(struct net *net,
 		if (!tb[i])
 			continue;
 		switch (i) {
+		case NHA_HW_STATS_ENABLE:
 		case NHA_FDB:
 			continue;
 		case NHA_RES_GROUP:
@@ -2622,6 +2625,9 @@ static struct nexthop *nexthop_create_group(struct net *net,
 	if (cfg->nh_fdb)
 		nhg->fdb_nh = 1;
 
+	if (cfg->nh_hw_stats)
+		nhg->hw_stats = true;
+
 	rcu_assign_pointer(nh->nh_grp, nhg);
 
 	return nh;
@@ -2964,6 +2970,9 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
 			err = rtm_to_nh_config_grp_res(tb[NHA_RES_GROUP],
 						       cfg, extack);
 
+		if (tb[NHA_HW_STATS_ENABLE])
+			cfg->nh_hw_stats = nla_get_u32(tb[NHA_HW_STATS_ENABLE]);
+
 		/* no other attributes should be set */
 		goto out;
 	}
@@ -3055,6 +3064,10 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
 		goto out;
 	}
 
+	if (tb[NHA_HW_STATS_ENABLE]) {
+		NL_SET_ERR_MSG(extack, "Cannot enable nexthop hardware statistics for non-group nexthops");
+		goto out;
+	}
 
 	err = 0;
 out:
-- 
cgit v1.2.3


From 5072ae00aea434d922cabd1c3e6236350a77c4d7 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Wed, 6 Mar 2024 13:49:21 +0100
Subject: net: nexthop: Expose nexthop group HW stats to user space

Add netlink support for reading NH group hardware stats.

Stats collection is done through a new notifier,
NEXTHOP_EVENT_HW_STATS_REPORT_DELTA. Drivers that implement HW counters for
a given NH group are thereby asked to collect the stats and report back to
core by calling nh_grp_hw_stats_report_delta(). This is similar to what
netdevice L3 stats do.

Besides exposing number of packets that passed in the HW datapath, also
include information on whether any driver actually realizes the counters.
The core can tell based on whether it got any _report_delta() reports from
the drivers. This allows enabling the statistics at the group at any time,
with drivers opting into supporting them. This is also in line with what
netdevice L3 stats are doing.

So as not to waste time and space, tie the collection and reporting of HW
stats with a new op flag, NHA_OP_FLAG_DUMP_HW_STATS.

Co-developed-by: Petr Machata <petrm@nvidia.com>
Signed-off-by: Petr Machata <petrm@nvidia.com>
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: Kees Cook <keescook@chromium.org> # For the __counted_by bits
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/nexthop.h        |  18 ++++++
 include/uapi/linux/nexthop.h |   9 +++
 net/ipv4/nexthop.c           | 130 ++++++++++++++++++++++++++++++++++++++++---
 3 files changed, 149 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/net/nexthop.h b/include/net/nexthop.h
index 1ba6d3668518..7ec9cc80f11c 100644
--- a/include/net/nexthop.h
+++ b/include/net/nexthop.h
@@ -122,6 +122,7 @@ struct nh_grp_entry {
 
 	struct list_head nh_list;
 	struct nexthop	*nh_parent;  /* nexthop of group with this entry */
+	u64		packets_hw;
 };
 
 struct nh_group {
@@ -166,6 +167,7 @@ enum nexthop_event_type {
 	NEXTHOP_EVENT_REPLACE,
 	NEXTHOP_EVENT_RES_TABLE_PRE_REPLACE,
 	NEXTHOP_EVENT_BUCKET_REPLACE,
+	NEXTHOP_EVENT_HW_STATS_REPORT_DELTA,
 };
 
 enum nh_notifier_info_type {
@@ -173,6 +175,7 @@ enum nh_notifier_info_type {
 	NH_NOTIFIER_INFO_TYPE_GRP,
 	NH_NOTIFIER_INFO_TYPE_RES_TABLE,
 	NH_NOTIFIER_INFO_TYPE_RES_BUCKET,
+	NH_NOTIFIER_INFO_TYPE_GRP_HW_STATS,
 };
 
 struct nh_notifier_single_info {
@@ -214,6 +217,17 @@ struct nh_notifier_res_table_info {
 	struct nh_notifier_single_info nhs[] __counted_by(num_nh_buckets);
 };
 
+struct nh_notifier_grp_hw_stats_entry_info {
+	u32 id;
+	u64 packets;
+};
+
+struct nh_notifier_grp_hw_stats_info {
+	u16 num_nh;
+	bool hw_stats_used;
+	struct nh_notifier_grp_hw_stats_entry_info stats[] __counted_by(num_nh);
+};
+
 struct nh_notifier_info {
 	struct net *net;
 	struct netlink_ext_ack *extack;
@@ -224,6 +238,7 @@ struct nh_notifier_info {
 		struct nh_notifier_grp_info *nh_grp;
 		struct nh_notifier_res_table_info *nh_res_table;
 		struct nh_notifier_res_bucket_info *nh_res_bucket;
+		struct nh_notifier_grp_hw_stats_info *nh_grp_hw_stats;
 	};
 };
 
@@ -236,6 +251,9 @@ void nexthop_bucket_set_hw_flags(struct net *net, u32 id, u16 bucket_index,
 				 bool offload, bool trap);
 void nexthop_res_grp_activity_update(struct net *net, u32 id, u16 num_buckets,
 				     unsigned long *activity);
+void nh_grp_hw_stats_report_delta(struct nh_notifier_grp_hw_stats_info *info,
+				  unsigned int nh_idx,
+				  u64 delta_packets);
 
 /* caller is holding rcu or rtnl; no reference taken to nexthop */
 struct nexthop *nexthop_find_by_id(struct net *net, u32 id);
diff --git a/include/uapi/linux/nexthop.h b/include/uapi/linux/nexthop.h
index b86af80d4e09..dd8787f9cf39 100644
--- a/include/uapi/linux/nexthop.h
+++ b/include/uapi/linux/nexthop.h
@@ -31,6 +31,7 @@ enum {
 #define NEXTHOP_GRP_TYPE_MAX (__NEXTHOP_GRP_TYPE_MAX - 1)
 
 #define NHA_OP_FLAG_DUMP_STATS		BIT(0)
+#define NHA_OP_FLAG_DUMP_HW_STATS	BIT(1)
 
 enum {
 	NHA_UNSPEC,
@@ -71,6 +72,9 @@ enum {
 	/* u32; nexthop hardware stats enable */
 	NHA_HW_STATS_ENABLE,
 
+	/* u32; read-only; whether any driver collects HW stats */
+	NHA_HW_STATS_USED,
+
 	__NHA_MAX,
 };
 
@@ -132,6 +136,11 @@ enum {
 	/* uint; number of packets forwarded via the nexthop group entry */
 	NHA_GROUP_STATS_ENTRY_PACKETS,
 
+	/* uint; number of packets forwarded via the nexthop group entry in
+	 * hardware
+	 */
+	NHA_GROUP_STATS_ENTRY_PACKETS_HW,
+
 	__NHA_GROUP_STATS_ENTRY_MAX,
 };
 
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index e75b1aae5a81..e34466751d7b 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -26,7 +26,8 @@ static void remove_nexthop(struct net *net, struct nexthop *nh,
 #define NH_DEV_HASHBITS  8
 #define NH_DEV_HASHSIZE (1U << NH_DEV_HASHBITS)
 
-#define NHA_OP_FLAGS_DUMP_ALL (NHA_OP_FLAG_DUMP_STATS)
+#define NHA_OP_FLAGS_DUMP_ALL (NHA_OP_FLAG_DUMP_STATS |		\
+			       NHA_OP_FLAG_DUMP_HW_STATS)
 
 static const struct nla_policy rtm_nh_policy_new[] = {
 	[NHA_ID]		= { .type = NLA_U32 },
@@ -700,8 +701,95 @@ static void nh_grp_entry_stats_read(struct nh_grp_entry *nhge,
 	}
 }
 
+static int nh_notifier_grp_hw_stats_init(struct nh_notifier_info *info,
+					 const struct nexthop *nh)
+{
+	struct nh_group *nhg;
+	int i;
+
+	ASSERT_RTNL();
+	nhg = rtnl_dereference(nh->nh_grp);
+
+	info->id = nh->id;
+	info->type = NH_NOTIFIER_INFO_TYPE_GRP_HW_STATS;
+	info->nh_grp_hw_stats = kzalloc(struct_size(info->nh_grp_hw_stats,
+						    stats, nhg->num_nh),
+					GFP_KERNEL);
+	if (!info->nh_grp_hw_stats)
+		return -ENOMEM;
+
+	info->nh_grp_hw_stats->num_nh = nhg->num_nh;
+	for (i = 0; i < nhg->num_nh; i++) {
+		struct nh_grp_entry *nhge = &nhg->nh_entries[i];
+
+		info->nh_grp_hw_stats->stats[i].id = nhge->nh->id;
+	}
+
+	return 0;
+}
+
+static void nh_notifier_grp_hw_stats_fini(struct nh_notifier_info *info)
+{
+	kfree(info->nh_grp_hw_stats);
+}
+
+void nh_grp_hw_stats_report_delta(struct nh_notifier_grp_hw_stats_info *info,
+				  unsigned int nh_idx,
+				  u64 delta_packets)
+{
+	info->hw_stats_used = true;
+	info->stats[nh_idx].packets += delta_packets;
+}
+EXPORT_SYMBOL(nh_grp_hw_stats_report_delta);
+
+static void nh_grp_hw_stats_apply_update(struct nexthop *nh,
+					 struct nh_notifier_info *info)
+{
+	struct nh_group *nhg;
+	int i;
+
+	ASSERT_RTNL();
+	nhg = rtnl_dereference(nh->nh_grp);
+
+	for (i = 0; i < nhg->num_nh; i++) {
+		struct nh_grp_entry *nhge = &nhg->nh_entries[i];
+
+		nhge->packets_hw += info->nh_grp_hw_stats->stats[i].packets;
+	}
+}
+
+static int nh_grp_hw_stats_update(struct nexthop *nh, bool *hw_stats_used)
+{
+	struct nh_notifier_info info = {
+		.net = nh->net,
+	};
+	struct net *net = nh->net;
+	int err;
+
+	if (nexthop_notifiers_is_empty(net))
+		return 0;
+
+	err = nh_notifier_grp_hw_stats_init(&info, nh);
+	if (err)
+		return err;
+
+	err = blocking_notifier_call_chain(&net->nexthop.notifier_chain,
+					   NEXTHOP_EVENT_HW_STATS_REPORT_DELTA,
+					   &info);
+
+	/* Cache whatever we got, even if there was an error, otherwise the
+	 * successful stats retrievals would get lost.
+	 */
+	nh_grp_hw_stats_apply_update(nh, &info);
+	*hw_stats_used = info.nh_grp_hw_stats->hw_stats_used;
+
+	nh_notifier_grp_hw_stats_fini(&info);
+	return notifier_to_errno(err);
+}
+
 static int nla_put_nh_group_stats_entry(struct sk_buff *skb,
-					struct nh_grp_entry *nhge)
+					struct nh_grp_entry *nhge,
+					u32 op_flags)
 {
 	struct nlattr *nest;
 	u64 packets;
@@ -713,7 +801,13 @@ static int nla_put_nh_group_stats_entry(struct sk_buff *skb,
 		return -EMSGSIZE;
 
 	if (nla_put_u32(skb, NHA_GROUP_STATS_ENTRY_ID, nhge->nh->id) ||
-	    nla_put_uint(skb, NHA_GROUP_STATS_ENTRY_PACKETS, packets))
+	    nla_put_uint(skb, NHA_GROUP_STATS_ENTRY_PACKETS,
+			 packets + nhge->packets_hw))
+		goto nla_put_failure;
+
+	if (op_flags & NHA_OP_FLAG_DUMP_HW_STATS &&
+	    nla_put_uint(skb, NHA_GROUP_STATS_ENTRY_PACKETS_HW,
+			 nhge->packets_hw))
 		goto nla_put_failure;
 
 	nla_nest_end(skb, nest);
@@ -724,18 +818,35 @@ nla_put_failure:
 	return -EMSGSIZE;
 }
 
-static int nla_put_nh_group_stats(struct sk_buff *skb, struct nexthop *nh)
+static int nla_put_nh_group_stats(struct sk_buff *skb, struct nexthop *nh,
+				  u32 op_flags)
 {
 	struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
 	struct nlattr *nest;
+	bool hw_stats_used;
+	int err;
 	int i;
 
+	if (nla_put_u32(skb, NHA_HW_STATS_ENABLE, nhg->hw_stats))
+		goto err_out;
+
+	if (op_flags & NHA_OP_FLAG_DUMP_HW_STATS &&
+	    nhg->hw_stats) {
+		err = nh_grp_hw_stats_update(nh, &hw_stats_used);
+		if (err)
+			goto out;
+
+		if (nla_put_u32(skb, NHA_HW_STATS_USED, hw_stats_used))
+			goto err_out;
+	}
+
 	nest = nla_nest_start(skb, NHA_GROUP_STATS);
 	if (!nest)
-		return -EMSGSIZE;
+		goto err_out;
 
 	for (i = 0; i < nhg->num_nh; i++)
-		if (nla_put_nh_group_stats_entry(skb, &nhg->nh_entries[i]))
+		if (nla_put_nh_group_stats_entry(skb, &nhg->nh_entries[i],
+						 op_flags))
 			goto cancel_out;
 
 	nla_nest_end(skb, nest);
@@ -743,7 +854,10 @@ static int nla_put_nh_group_stats(struct sk_buff *skb, struct nexthop *nh)
 
 cancel_out:
 	nla_nest_cancel(skb, nest);
-	return -EMSGSIZE;
+err_out:
+	err = -EMSGSIZE;
+out:
+	return err;
 }
 
 static int nla_put_nh_group(struct sk_buff *skb, struct nexthop *nh,
@@ -780,7 +894,7 @@ static int nla_put_nh_group(struct sk_buff *skb, struct nexthop *nh,
 
 	if (op_flags & NHA_OP_FLAG_DUMP_STATS &&
 	    (nla_put_u32(skb, NHA_HW_STATS_ENABLE, nhg->hw_stats) ||
-	     nla_put_nh_group_stats(skb, nh)))
+	     nla_put_nh_group_stats(skb, nh, op_flags)))
 		goto nla_put_failure;
 
 	return 0;
-- 
cgit v1.2.3


From 1cface552a5b5f6e53a855de1a503ff958e2e253 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 7 Mar 2024 12:34:46 +0000
Subject: net: add skb_data_unref() helper

Similar to skb_unref(), add skb_data_unref() to save an expensive
atomic operation (and cache line dirtying) when last reference
on shinfo->dataref is released.

I saw this opportunity on hosts with RAW sockets accidentally
bound to UDP protocol, forcing an skb_clone() on all received packets.

These RAW sockets had their receive queue full, so all clone
packets were immediately dropped.

When UDP recvmsg() consumes later the original skb, skb_release_data()
is hitting atomic_sub_return() quite badly, because skb->clone
has been set permanently.

Note that this patch helps TCP TX performance, because
TCP stack also use (fast) clones.

This means that at least one of the two packets (the main skb or
its clone) will no longer have to perform this atomic operation
in skb_release_data().

Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20240307123446.2302230-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h | 18 ++++++++++++++++++
 net/core/skbuff.c      |  4 +---
 2 files changed, 19 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index d0508f90bed5..3023bc2be6a1 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1237,6 +1237,24 @@ static inline bool skb_unref(struct sk_buff *skb)
 	return true;
 }
 
+static inline bool skb_data_unref(const struct sk_buff *skb,
+				  struct skb_shared_info *shinfo)
+{
+	int bias;
+
+	if (!skb->cloned)
+		return true;
+
+	bias = skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1;
+
+	if (atomic_read(&shinfo->dataref) == bias)
+		smp_rmb();
+	else if (atomic_sub_return(bias, &shinfo->dataref))
+		return false;
+
+	return true;
+}
+
 void __fix_address
 kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason);
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 766219011aea..b99127712e67 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1115,9 +1115,7 @@ static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason,
 	struct skb_shared_info *shinfo = skb_shinfo(skb);
 	int i;
 
-	if (skb->cloned &&
-	    atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
-			      &shinfo->dataref))
+	if (!skb_data_unref(skb, shinfo))
 		goto exit;
 
 	if (skb_zcopy(skb)) {
-- 
cgit v1.2.3


From de5f84338970815b9fdd3497a975fb572d11e0b5 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Thu, 7 Mar 2024 12:39:06 +0100
Subject: lib/bitmap: Introduce bitmap_scatter() and bitmap_gather() helpers

These helpers scatters or gathers a bitmap with the help of the mask
position bits parameter.

bitmap_scatter() does the following:
  src:  0000000001011010
                  ||||||
           +------+|||||
           |  +----+||||
           |  |+----+|||
           |  ||   +-+||
           |  ||   |  ||
  mask: ...v..vv...v..vv
        ...0..11...0..10
  dst:  0000001100000010

and bitmap_gather() performs this one:
   mask: ...v..vv...v..vv
   src:  0000001100000010
            ^  ^^   ^   0
            |  ||   |  10
            |  ||   > 010
            |  |+--> 1010
            |  +--> 11010
            +----> 011010
   dst:  0000000000011010

bitmap_gather() can the seen as the reverse bitmap_scatter() operation.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/lkml/20230926052007.3917389-3-andriy.shevchenko@linux.intel.com/
Co-developed-by: Herve Codina <herve.codina@bootlin.com>
Signed-off-by: Herve Codina <herve.codina@bootlin.com>
Acked-by: Yury Norov <yury.norov@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bitmap.h | 101 +++++++++++++++++++++++++++++++++++++++++++++++++
 lib/test_bitmap.c      |  42 ++++++++++++++++++++
 2 files changed, 143 insertions(+)

(limited to 'include')

diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index 5814e9ee40ba..00b8c6d4355c 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -63,6 +63,8 @@ struct device;
  *  bitmap_shift_left(dst, src, n, nbits)       *dst = *src << n
  *  bitmap_cut(dst, src, first, n, nbits)       Cut n bits from first, copy rest
  *  bitmap_replace(dst, old, new, mask, nbits)  *dst = (*old & ~(*mask)) | (*new & *mask)
+ *  bitmap_scatter(dst, src, mask, nbits)	*dst = map(dense, sparse)(src)
+ *  bitmap_gather(dst, src, mask, nbits)	*dst = map(sparse, dense)(src)
  *  bitmap_remap(dst, src, old, new, nbits)     *dst = map(old, new)(src)
  *  bitmap_bitremap(oldbit, old, new, nbits)    newbit = map(old, new)(oldbit)
  *  bitmap_onto(dst, orig, relmap, nbits)       *dst = orig relative to relmap
@@ -499,6 +501,105 @@ static inline void bitmap_replace(unsigned long *dst,
 		__bitmap_replace(dst, old, new, mask, nbits);
 }
 
+/**
+ * bitmap_scatter - Scatter a bitmap according to the given mask
+ * @dst: scattered bitmap
+ * @src: gathered bitmap
+ * @mask: mask representing bits to assign to in the scattered bitmap
+ * @nbits: number of bits in each of these bitmaps
+ *
+ * Scatters bitmap with sequential bits according to the given @mask.
+ *
+ * Example:
+ * If @src bitmap = 0x005a, with @mask = 0x1313, @dst will be 0x0302.
+ *
+ * Or in binary form
+ * @src			@mask			@dst
+ * 0000000001011010	0001001100010011	0000001100000010
+ *
+ * (Bits 0, 1, 2, 3, 4, 5 are copied to the bits 0, 1, 4, 8, 9, 12)
+ *
+ * A more 'visual' description of the operation:
+ * src:  0000000001011010
+ *                 ||||||
+ *          +------+|||||
+ *          |  +----+||||
+ *          |  |+----+|||
+ *          |  ||   +-+||
+ *          |  ||   |  ||
+ * mask: ...v..vv...v..vv
+ *       ...0..11...0..10
+ * dst:  0000001100000010
+ *
+ * A relationship exists between bitmap_scatter() and bitmap_gather().
+ * bitmap_gather() can be seen as the 'reverse' bitmap_scatter() operation.
+ * See bitmap_scatter() for details related to this relationship.
+ */
+static inline void bitmap_scatter(unsigned long *dst, const unsigned long *src,
+				  const unsigned long *mask, unsigned int nbits)
+{
+	unsigned int n = 0;
+	unsigned int bit;
+
+	bitmap_zero(dst, nbits);
+
+	for_each_set_bit(bit, mask, nbits)
+		__assign_bit(bit, dst, test_bit(n++, src));
+}
+
+/**
+ * bitmap_gather - Gather a bitmap according to given mask
+ * @dst: gathered bitmap
+ * @src: scattered bitmap
+ * @mask: mask representing bits to extract from in the scattered bitmap
+ * @nbits: number of bits in each of these bitmaps
+ *
+ * Gathers bitmap with sparse bits according to the given @mask.
+ *
+ * Example:
+ * If @src bitmap = 0x0302, with @mask = 0x1313, @dst will be 0x001a.
+ *
+ * Or in binary form
+ * @src			@mask			@dst
+ * 0000001100000010	0001001100010011	0000000000011010
+ *
+ * (Bits 0, 1, 4, 8, 9, 12 are copied to the bits 0, 1, 2, 3, 4, 5)
+ *
+ * A more 'visual' description of the operation:
+ * mask: ...v..vv...v..vv
+ * src:  0000001100000010
+ *          ^  ^^   ^   0
+ *          |  ||   |  10
+ *          |  ||   > 010
+ *          |  |+--> 1010
+ *          |  +--> 11010
+ *          +----> 011010
+ * dst:  0000000000011010
+ *
+ * A relationship exists between bitmap_gather() and bitmap_scatter(). See
+ * bitmap_scatter() for the bitmap scatter detailed operations.
+ * Suppose scattered computed using bitmap_scatter(scattered, src, mask, n).
+ * The operation bitmap_gather(result, scattered, mask, n) leads to a result
+ * equal or equivalent to src.
+ *
+ * The result can be 'equivalent' because bitmap_scatter() and bitmap_gather()
+ * are not bijective.
+ * The result and src values are equivalent in that sense that a call to
+ * bitmap_scatter(res, src, mask, n) and a call to
+ * bitmap_scatter(res, result, mask, n) will lead to the same res value.
+ */
+static inline void bitmap_gather(unsigned long *dst, const unsigned long *src,
+				 const unsigned long *mask, unsigned int nbits)
+{
+	unsigned int n = 0;
+	unsigned int bit;
+
+	bitmap_zero(dst, nbits);
+
+	for_each_set_bit(bit, mask, nbits)
+		__assign_bit(n++, dst, test_bit(bit, src));
+}
+
 static inline void bitmap_next_set_region(unsigned long *bitmap,
 					  unsigned int *rs, unsigned int *re,
 					  unsigned int end)
diff --git a/lib/test_bitmap.c b/lib/test_bitmap.c
index 65f22c2578b0..6b2b33579f56 100644
--- a/lib/test_bitmap.c
+++ b/lib/test_bitmap.c
@@ -380,6 +380,47 @@ static void __init test_replace(void)
 	expect_eq_bitmap(bmap, exp3_1_0, nbits);
 }
 
+static const unsigned long sg_mask[] __initconst = {
+	BITMAP_FROM_U64(0x000000000000035aULL),
+};
+
+static const unsigned long sg_src[] __initconst = {
+	BITMAP_FROM_U64(0x0000000000000667ULL),
+};
+
+static const unsigned long sg_gather_exp[] __initconst = {
+	BITMAP_FROM_U64(0x0000000000000029ULL),
+};
+
+static const unsigned long sg_scatter_exp[] __initconst = {
+	BITMAP_FROM_U64(0x000000000000021aULL),
+};
+
+static void __init test_bitmap_sg(void)
+{
+	unsigned int nbits = 64;
+	DECLARE_BITMAP(bmap_gather, 100);
+	DECLARE_BITMAP(bmap_scatter, 100);
+	DECLARE_BITMAP(bmap_tmp, 100);
+	DECLARE_BITMAP(bmap_res, 100);
+
+	/* Simple gather call */
+	bitmap_zero(bmap_gather, 100);
+	bitmap_gather(bmap_gather, sg_src, sg_mask, nbits);
+	expect_eq_bitmap(sg_gather_exp, bmap_gather, nbits);
+
+	/* Simple scatter call */
+	bitmap_zero(bmap_scatter, 100);
+	bitmap_scatter(bmap_scatter, sg_src, sg_mask, nbits);
+	expect_eq_bitmap(sg_scatter_exp, bmap_scatter, nbits);
+
+	/* Scatter/gather relationship */
+	bitmap_zero(bmap_tmp, 100);
+	bitmap_gather(bmap_tmp, bmap_scatter, sg_mask, nbits);
+	bitmap_scatter(bmap_res, bmap_tmp, sg_mask, nbits);
+	expect_eq_bitmap(bmap_scatter, bmap_res, nbits);
+}
+
 #define PARSE_TIME	0x1
 #define NO_LEN		0x2
 
@@ -1252,6 +1293,7 @@ static void __init selftest(void)
 	test_copy();
 	test_bitmap_region();
 	test_replace();
+	test_bitmap_sg();
 	test_bitmap_arr32();
 	test_bitmap_arr64();
 	test_bitmap_parse();
-- 
cgit v1.2.3


From d7bca9199a27b8690ae1c71dc11f825154af7234 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Fri, 8 Mar 2024 09:12:54 -0800
Subject: mm: Introduce vmap_page_range() to map pages in PCI address space

ioremap_page_range() should be used for ranges within vmalloc range only.
The vmalloc ranges are allocated by get_vm_area(). PCI has "resource"
allocator that manages PCI_IOBASE, IO_SPACE_LIMIT address range, hence
introduce vmap_page_range() to be used exclusively to map pages
in PCI address space.

Fixes: 3e49a866c9dc ("mm: Enforce VM_IOREMAP flag and range in ioremap_page_range.")
Reported-by: Miguel Ojeda <ojeda@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Tested-by: Miguel Ojeda <ojeda@kernel.org>
Link: https://lore.kernel.org/bpf/CANiq72ka4rir+RTN2FQoT=Vvprp_Ao-CvoYEkSNqtSY+RZj+AA@mail.gmail.com
---
 arch/arm/mm/ioremap.c            |  8 ++++----
 arch/loongarch/kernel/setup.c    |  2 +-
 arch/mips/loongson64/init.c      |  2 +-
 arch/powerpc/kernel/isa-bridge.c |  4 ++--
 drivers/pci/pci.c                |  4 ++--
 include/linux/io.h               |  7 +++++++
 mm/vmalloc.c                     | 23 +++++++++++++++--------
 7 files changed, 32 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/arch/arm/mm/ioremap.c b/arch/arm/mm/ioremap.c
index 2129070065c3..794cfea9f9d4 100644
--- a/arch/arm/mm/ioremap.c
+++ b/arch/arm/mm/ioremap.c
@@ -110,8 +110,8 @@ void __init add_static_vm_early(struct static_vm *svm)
 int ioremap_page(unsigned long virt, unsigned long phys,
 		 const struct mem_type *mtype)
 {
-	return ioremap_page_range(virt, virt + PAGE_SIZE, phys,
-				  __pgprot(mtype->prot_pte));
+	return vmap_page_range(virt, virt + PAGE_SIZE, phys,
+			       __pgprot(mtype->prot_pte));
 }
 EXPORT_SYMBOL(ioremap_page);
 
@@ -466,8 +466,8 @@ int pci_remap_iospace(const struct resource *res, phys_addr_t phys_addr)
 	if (res->end > IO_SPACE_LIMIT)
 		return -EINVAL;
 
-	return ioremap_page_range(vaddr, vaddr + resource_size(res), phys_addr,
-				  __pgprot(get_mem_type(pci_ioremap_mem_type)->prot_pte));
+	return vmap_page_range(vaddr, vaddr + resource_size(res), phys_addr,
+			       __pgprot(get_mem_type(pci_ioremap_mem_type)->prot_pte));
 }
 EXPORT_SYMBOL(pci_remap_iospace);
 
diff --git a/arch/loongarch/kernel/setup.c b/arch/loongarch/kernel/setup.c
index 634ef17fd38b..fd915ad69c09 100644
--- a/arch/loongarch/kernel/setup.c
+++ b/arch/loongarch/kernel/setup.c
@@ -490,7 +490,7 @@ static int __init add_legacy_isa_io(struct fwnode_handle *fwnode,
 	}
 
 	vaddr = (unsigned long)(PCI_IOBASE + range->io_start);
-	ioremap_page_range(vaddr, vaddr + size, hw_start, pgprot_device(PAGE_KERNEL));
+	vmap_page_range(vaddr, vaddr + size, hw_start, pgprot_device(PAGE_KERNEL));
 
 	return 0;
 }
diff --git a/arch/mips/loongson64/init.c b/arch/mips/loongson64/init.c
index 553142c1f14f..a35dd7311795 100644
--- a/arch/mips/loongson64/init.c
+++ b/arch/mips/loongson64/init.c
@@ -180,7 +180,7 @@ static int __init add_legacy_isa_io(struct fwnode_handle *fwnode, resource_size_
 
 	vaddr = PCI_IOBASE + range->io_start;
 
-	ioremap_page_range(vaddr, vaddr + size, hw_start, pgprot_device(PAGE_KERNEL));
+	vmap_page_range(vaddr, vaddr + size, hw_start, pgprot_device(PAGE_KERNEL));
 
 	return 0;
 }
diff --git a/arch/powerpc/kernel/isa-bridge.c b/arch/powerpc/kernel/isa-bridge.c
index 48e0eaf1ad61..5c064485197a 100644
--- a/arch/powerpc/kernel/isa-bridge.c
+++ b/arch/powerpc/kernel/isa-bridge.c
@@ -46,8 +46,8 @@ static void remap_isa_base(phys_addr_t pa, unsigned long size)
 	WARN_ON_ONCE(size & ~PAGE_MASK);
 
 	if (slab_is_available()) {
-		if (ioremap_page_range(ISA_IO_BASE, ISA_IO_BASE + size, pa,
-				pgprot_noncached(PAGE_KERNEL)))
+		if (vmap_page_range(ISA_IO_BASE, ISA_IO_BASE + size, pa,
+				    pgprot_noncached(PAGE_KERNEL)))
 			vunmap_range(ISA_IO_BASE, ISA_IO_BASE + size);
 	} else {
 		early_ioremap_range(ISA_IO_BASE, pa, size,
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index c3585229c12a..ccee56615f78 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -4353,8 +4353,8 @@ int pci_remap_iospace(const struct resource *res, phys_addr_t phys_addr)
 	if (res->end > IO_SPACE_LIMIT)
 		return -EINVAL;
 
-	return ioremap_page_range(vaddr, vaddr + resource_size(res), phys_addr,
-				  pgprot_device(PAGE_KERNEL));
+	return vmap_page_range(vaddr, vaddr + resource_size(res), phys_addr,
+			       pgprot_device(PAGE_KERNEL));
 #else
 	/*
 	 * This architecture does not have memory mapped I/O space,
diff --git a/include/linux/io.h b/include/linux/io.h
index 7304f2a69960..235ba7d80a8f 100644
--- a/include/linux/io.h
+++ b/include/linux/io.h
@@ -23,12 +23,19 @@ void __iowrite64_copy(void __iomem *to, const void *from, size_t count);
 #ifdef CONFIG_MMU
 int ioremap_page_range(unsigned long addr, unsigned long end,
 		       phys_addr_t phys_addr, pgprot_t prot);
+int vmap_page_range(unsigned long addr, unsigned long end,
+		    phys_addr_t phys_addr, pgprot_t prot);
 #else
 static inline int ioremap_page_range(unsigned long addr, unsigned long end,
 				     phys_addr_t phys_addr, pgprot_t prot)
 {
 	return 0;
 }
+static inline int vmap_page_range(unsigned long addr, unsigned long end,
+				  phys_addr_t phys_addr, pgprot_t prot)
+{
+	return 0;
+}
 #endif
 
 /*
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index e5b8c70950bc..1e36322d83d8 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -304,11 +304,24 @@ static int vmap_range_noflush(unsigned long addr, unsigned long end,
 	return err;
 }
 
+int vmap_page_range(unsigned long addr, unsigned long end,
+		    phys_addr_t phys_addr, pgprot_t prot)
+{
+	int err;
+
+	err = vmap_range_noflush(addr, end, phys_addr, pgprot_nx(prot),
+				 ioremap_max_page_shift);
+	flush_cache_vmap(addr, end);
+	if (!err)
+		err = kmsan_ioremap_page_range(addr, end, phys_addr, prot,
+					       ioremap_max_page_shift);
+	return err;
+}
+
 int ioremap_page_range(unsigned long addr, unsigned long end,
 		phys_addr_t phys_addr, pgprot_t prot)
 {
 	struct vm_struct *area;
-	int err;
 
 	area = find_vm_area((void *)addr);
 	if (!area || !(area->flags & VM_IOREMAP)) {
@@ -322,13 +335,7 @@ int ioremap_page_range(unsigned long addr, unsigned long end,
 			  (long)area->addr + get_vm_area_size(area));
 		return -ERANGE;
 	}
-	err = vmap_range_noflush(addr, end, phys_addr, pgprot_nx(prot),
-				 ioremap_max_page_shift);
-	flush_cache_vmap(addr, end);
-	if (!err)
-		err = kmsan_ioremap_page_range(addr, end, phys_addr, prot,
-					       ioremap_max_page_shift);
-	return err;
+	return vmap_page_range(addr, end, phys_addr, prot);
 }
 
 static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
-- 
cgit v1.2.3


From e5b7aefe38f7f6258935d8a10c36552dd957048a Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 8 Mar 2024 10:22:30 +0000
Subject: net: gro: move two declarations to include/net/gro.h

Move gro_find_receive_by_type() and gro_find_complete_by_type()
to include/net/gro.h where they belong.

Also use _NET_GRO_H instead of _NET_IPV6_GRO_H to protect
include/net/gro.h from multiple inclusions.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://lore.kernel.org/r/20240308102230.296224-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h | 2 --
 include/net/gro.h         | 9 ++++++---
 2 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 4230c7f3b959..c6f6ac779b34 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3901,8 +3901,6 @@ void napi_gro_flush(struct napi_struct *napi, bool flush_old);
 struct sk_buff *napi_get_frags(struct napi_struct *napi);
 void napi_get_frags_check(struct napi_struct *napi);
 gro_result_t napi_gro_frags(struct napi_struct *napi);
-struct packet_offload *gro_find_receive_by_type(__be16 type);
-struct packet_offload *gro_find_complete_by_type(__be16 type);
 
 static inline void napi_free_frags(struct napi_struct *napi)
 {
diff --git a/include/net/gro.h b/include/net/gro.h
index d6fc8fbd3730..50f1e403dbbb 100644
--- a/include/net/gro.h
+++ b/include/net/gro.h
@@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
 
-#ifndef _NET_IPV6_GRO_H
-#define _NET_IPV6_GRO_H
+#ifndef _NET_GRO_H
+#define _NET_GRO_H
 
 #include <linux/indirect_call_wrapper.h>
 #include <linux/ip.h>
@@ -494,4 +494,7 @@ static inline void inet6_get_iif_sdif(const struct sk_buff *skb, int *iif, int *
 #endif
 }
 
-#endif /* _NET_IPV6_GRO_H */
+struct packet_offload *gro_find_receive_by_type(__be16 type);
+struct packet_offload *gro_find_complete_by_type(__be16 type);
+
+#endif /* _NET_GRO_H */
-- 
cgit v1.2.3


From e99eb57e9b14d830a571c5255248d4d7eb08b27e Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@nvidia.com>
Date: Fri, 8 Mar 2024 13:59:46 +0100
Subject: net: nexthop: Have all NH notifiers carry NH ID

When sending the notifications to collect NH statistics for resilient
groups, the driver will need to know the nexthop IDs in individual buckets
to look up the right counter. To that end, move the nexthop ID from struct
nh_notifier_grp_entry_info to nh_notifier_single_info.

Signed-off-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Link: https://lore.kernel.org/r/8f964cd50b1a56d3606ce7ab4c50354ae019c43b.1709901020.git.petrm@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/nexthop.h | 2 +-
 net/ipv4/nexthop.c    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/nexthop.h b/include/net/nexthop.h
index 7ec9cc80f11c..7ca315ad500e 100644
--- a/include/net/nexthop.h
+++ b/include/net/nexthop.h
@@ -185,6 +185,7 @@ struct nh_notifier_single_info {
 		__be32 ipv4;
 		struct in6_addr ipv6;
 	};
+	u32 id;
 	u8 is_reject:1,
 	   is_fdb:1,
 	   has_encap:1;
@@ -192,7 +193,6 @@ struct nh_notifier_single_info {
 
 struct nh_notifier_grp_entry_info {
 	u8 weight;
-	u32 id;
 	struct nh_notifier_single_info nh;
 };
 
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index aa1b36650282..73b849be6504 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -104,6 +104,7 @@ __nh_notifier_single_info_init(struct nh_notifier_single_info *nh_info,
 	else if (nh_info->gw_family == AF_INET6)
 		nh_info->ipv6 = nhi->fib_nhc.nhc_gw.ipv6;
 
+	nh_info->id = nhi->nh_parent->id;
 	nh_info->is_reject = nhi->reject_nh;
 	nh_info->is_fdb = nhi->fdb_nh;
 	nh_info->has_encap = !!nhi->fib_nhc.nhc_lwtstate;
@@ -150,7 +151,6 @@ static int nh_notifier_mpath_info_init(struct nh_notifier_info *info,
 		struct nh_info *nhi;
 
 		nhi = rtnl_dereference(nhge->nh->nh_info);
-		info->nh_grp->nh_entries[i].id = nhge->nh->id;
 		info->nh_grp->nh_entries[i].weight = nhge->weight;
 		__nh_notifier_single_info_init(&info->nh_grp->nh_entries[i].nh,
 					       nhi);
-- 
cgit v1.2.3


From 317460317a02a1af512697e6e964298dedd8a163 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Thu, 7 Mar 2024 17:07:59 -0800
Subject: bpf: Introduce bpf_arena.

Introduce bpf_arena, which is a sparse shared memory region between the bpf
program and user space.

Use cases:
1. User space mmap-s bpf_arena and uses it as a traditional mmap-ed
   anonymous region, like memcached or any key/value storage. The bpf
   program implements an in-kernel accelerator. XDP prog can search for
   a key in bpf_arena and return a value without going to user space.
2. The bpf program builds arbitrary data structures in bpf_arena (hash
   tables, rb-trees, sparse arrays), while user space consumes it.
3. bpf_arena is a "heap" of memory from the bpf program's point of view.
   The user space may mmap it, but bpf program will not convert pointers
   to user base at run-time to improve bpf program speed.

Initially, the kernel vm_area and user vma are not populated. User space
can fault in pages within the range. While servicing a page fault,
bpf_arena logic will insert a new page into the kernel and user vmas. The
bpf program can allocate pages from that region via
bpf_arena_alloc_pages(). This kernel function will insert pages into the
kernel vm_area. The subsequent fault-in from user space will populate that
page into the user vma. The BPF_F_SEGV_ON_FAULT flag at arena creation time
can be used to prevent fault-in from user space. In such a case, if a page
is not allocated by the bpf program and not present in the kernel vm_area,
the user process will segfault. This is useful for use cases 2 and 3 above.

bpf_arena_alloc_pages() is similar to user space mmap(). It allocates pages
either at a specific address within the arena or allocates a range with the
maple tree. bpf_arena_free_pages() is analogous to munmap(), which frees
pages and removes the range from the kernel vm_area and from user process
vmas.

bpf_arena can be used as a bpf program "heap" of up to 4GB. The speed of
bpf program is more important than ease of sharing with user space. This is
use case 3. In such a case, the BPF_F_NO_USER_CONV flag is recommended.
It will tell the verifier to treat the rX = bpf_arena_cast_user(rY)
instruction as a 32-bit move wX = wY, which will improve bpf prog
performance. Otherwise, bpf_arena_cast_user is translated by JIT to
conditionally add the upper 32 bits of user vm_start (if the pointer is not
NULL) to arena pointers before they are stored into memory. This way, user
space sees them as valid 64-bit pointers.

Diff https://github.com/llvm/llvm-project/pull/84410 enables LLVM BPF
backend generate the bpf_addr_space_cast() instruction to cast pointers
between address_space(1) which is reserved for bpf_arena pointers and
default address space zero. All arena pointers in a bpf program written in
C language are tagged as __attribute__((address_space(1))). Hence, clang
provides helpful diagnostics when pointers cross address space. Libbpf and
the kernel support only address_space == 1. All other address space
identifiers are reserved.

rX = bpf_addr_space_cast(rY, /* dst_as */ 1, /* src_as */ 0) tells the
verifier that rX->type = PTR_TO_ARENA. Any further operations on
PTR_TO_ARENA register have to be in the 32-bit domain. The verifier will
mark load/store through PTR_TO_ARENA with PROBE_MEM32. JIT will generate
them as kern_vm_start + 32bit_addr memory accesses. The behavior is similar
to copy_from_kernel_nofault() except that no address checks are necessary.
The address is guaranteed to be in the 4GB range. If the page is not
present, the destination register is zeroed on read, and the operation is
ignored on write.

rX = bpf_addr_space_cast(rY, 0, 1) tells the verifier that rX->type =
unknown scalar. If arena->map_flags has BPF_F_NO_USER_CONV set, then the
verifier converts such cast instructions to mov32. Otherwise, JIT will emit
native code equivalent to:
rX = (u32)rY;
if (rY)
  rX |= clear_lo32_bits(arena->user_vm_start); /* replace hi32 bits in rX */

After such conversion, the pointer becomes a valid user pointer within
bpf_arena range. The user process can access data structures created in
bpf_arena without any additional computations. For example, a linked list
built by a bpf program can be walked natively by user space.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Reviewed-by: Barret Rhoden <brho@google.com>
Link: https://lore.kernel.org/bpf/20240308010812.89848-2-alexei.starovoitov@gmail.com
---
 include/linux/bpf.h            |   7 +-
 include/linux/bpf_types.h      |   1 +
 include/uapi/linux/bpf.h       |  10 +
 kernel/bpf/Makefile            |   3 +
 kernel/bpf/arena.c             | 558 +++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/core.c              |  11 +
 kernel/bpf/syscall.c           |  36 +++
 kernel/bpf/verifier.c          |   1 +
 tools/include/uapi/linux/bpf.h |  10 +
 9 files changed, 635 insertions(+), 2 deletions(-)
 create mode 100644 kernel/bpf/arena.c

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 95e07673cdc1..ea6ab6e0eef9 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -37,6 +37,7 @@ struct perf_event;
 struct bpf_prog;
 struct bpf_prog_aux;
 struct bpf_map;
+struct bpf_arena;
 struct sock;
 struct seq_file;
 struct btf;
@@ -528,8 +529,8 @@ void bpf_list_head_free(const struct btf_field *field, void *list_head,
 			struct bpf_spin_lock *spin_lock);
 void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
 		      struct bpf_spin_lock *spin_lock);
-
-
+u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena);
+u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena);
 int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size);
 
 struct bpf_offload_dev;
@@ -2215,6 +2216,8 @@ int  generic_map_delete_batch(struct bpf_map *map,
 struct bpf_map *bpf_map_get_curr_or_next(u32 *id);
 struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id);
 
+int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid,
+			unsigned long nr_pages, struct page **page_array);
 #ifdef CONFIG_MEMCG_KMEM
 void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
 			   int node);
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 94baced5a1ad..9f2a6b83b49e 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -132,6 +132,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_STRUCT_OPS, bpf_struct_ops_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_RINGBUF, ringbuf_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_BLOOM_FILTER, bloom_filter_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_USER_RINGBUF, user_ringbuf_map_ops)
+BPF_MAP_TYPE(BPF_MAP_TYPE_ARENA, arena_map_ops)
 
 BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint)
 BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 85ec7fc799d7..e30d943db8a4 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1009,6 +1009,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_BLOOM_FILTER,
 	BPF_MAP_TYPE_USER_RINGBUF,
 	BPF_MAP_TYPE_CGRP_STORAGE,
+	BPF_MAP_TYPE_ARENA,
 	__MAX_BPF_MAP_TYPE
 };
 
@@ -1396,6 +1397,12 @@ enum {
 
 /* BPF token FD is passed in a corresponding command's token_fd field */
 	BPF_F_TOKEN_FD          = (1U << 16),
+
+/* When user space page faults in bpf_arena send SIGSEGV instead of inserting new page */
+	BPF_F_SEGV_ON_FAULT	= (1U << 17),
+
+/* Do not translate kernel bpf_arena pointers to user pointers */
+	BPF_F_NO_USER_CONV	= (1U << 18),
 };
 
 /* Flags for BPF_PROG_QUERY. */
@@ -1467,6 +1474,9 @@ union bpf_attr {
 		 * BPF_MAP_TYPE_BLOOM_FILTER - the lowest 4 bits indicate the
 		 * number of hash functions (if 0, the bloom filter will default
 		 * to using 5 hash functions).
+		 *
+		 * BPF_MAP_TYPE_ARENA - contains the address where user space
+		 * is going to mmap() the arena. It has to be page aligned.
 		 */
 		__u64	map_extra;
 
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 4ce95acfcaa7..368c5d86b5b7 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -15,6 +15,9 @@ obj-${CONFIG_BPF_LSM}	  += bpf_inode_storage.o
 obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o
 obj-$(CONFIG_BPF_JIT) += trampoline.o
 obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o
+ifeq ($(CONFIG_MMU)$(CONFIG_64BIT),yy)
+obj-$(CONFIG_BPF_SYSCALL) += arena.o
+endif
 obj-$(CONFIG_BPF_JIT) += dispatcher.o
 ifeq ($(CONFIG_NET),y)
 obj-$(CONFIG_BPF_SYSCALL) += devmap.o
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
new file mode 100644
index 000000000000..86571e760dd6
--- /dev/null
+++ b/kernel/bpf/arena.c
@@ -0,0 +1,558 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/err.h>
+#include <linux/btf_ids.h>
+#include <linux/vmalloc.h>
+#include <linux/pagemap.h>
+
+/*
+ * bpf_arena is a sparsely populated shared memory region between bpf program and
+ * user space process.
+ *
+ * For example on x86-64 the values could be:
+ * user_vm_start 7f7d26200000     // picked by mmap()
+ * kern_vm_start ffffc90001e69000 // picked by get_vm_area()
+ * For user space all pointers within the arena are normal 8-byte addresses.
+ * In this example 7f7d26200000 is the address of the first page (pgoff=0).
+ * The bpf program will access it as: kern_vm_start + lower_32bit_of_user_ptr
+ * (u32)7f7d26200000 -> 26200000
+ * hence
+ * ffffc90001e69000 + 26200000 == ffffc90028069000 is "pgoff=0" within 4Gb
+ * kernel memory region.
+ *
+ * BPF JITs generate the following code to access arena:
+ *   mov eax, eax  // eax has lower 32-bit of user pointer
+ *   mov word ptr [rax + r12 + off], bx
+ * where r12 == kern_vm_start and off is s16.
+ * Hence allocate 4Gb + GUARD_SZ/2 on each side.
+ *
+ * Initially kernel vm_area and user vma are not populated.
+ * User space can fault-in any address which will insert the page
+ * into kernel and user vma.
+ * bpf program can allocate a page via bpf_arena_alloc_pages() kfunc
+ * which will insert it into kernel vm_area.
+ * The later fault-in from user space will populate that page into user vma.
+ */
+
+/* number of bytes addressable by LDX/STX insn with 16-bit 'off' field */
+#define GUARD_SZ (1ull << sizeof(((struct bpf_insn *)0)->off) * 8)
+#define KERN_VM_SZ ((1ull << 32) + GUARD_SZ)
+
+struct bpf_arena {
+	struct bpf_map map;
+	u64 user_vm_start;
+	u64 user_vm_end;
+	struct vm_struct *kern_vm;
+	struct maple_tree mt;
+	struct list_head vma_list;
+	struct mutex lock;
+};
+
+u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena)
+{
+	return arena ? (u64) (long) arena->kern_vm->addr + GUARD_SZ / 2 : 0;
+}
+
+u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena)
+{
+	return arena ? arena->user_vm_start : 0;
+}
+
+static long arena_map_peek_elem(struct bpf_map *map, void *value)
+{
+	return -EOPNOTSUPP;
+}
+
+static long arena_map_push_elem(struct bpf_map *map, void *value, u64 flags)
+{
+	return -EOPNOTSUPP;
+}
+
+static long arena_map_pop_elem(struct bpf_map *map, void *value)
+{
+	return -EOPNOTSUPP;
+}
+
+static long arena_map_delete_elem(struct bpf_map *map, void *value)
+{
+	return -EOPNOTSUPP;
+}
+
+static int arena_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+	return -EOPNOTSUPP;
+}
+
+static long compute_pgoff(struct bpf_arena *arena, long uaddr)
+{
+	return (u32)(uaddr - (u32)arena->user_vm_start) >> PAGE_SHIFT;
+}
+
+static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
+{
+	struct vm_struct *kern_vm;
+	int numa_node = bpf_map_attr_numa_node(attr);
+	struct bpf_arena *arena;
+	u64 vm_range;
+	int err = -ENOMEM;
+
+	if (attr->key_size || attr->value_size || attr->max_entries == 0 ||
+	    /* BPF_F_MMAPABLE must be set */
+	    !(attr->map_flags & BPF_F_MMAPABLE) ||
+	    /* No unsupported flags present */
+	    (attr->map_flags & ~(BPF_F_SEGV_ON_FAULT | BPF_F_MMAPABLE | BPF_F_NO_USER_CONV)))
+		return ERR_PTR(-EINVAL);
+
+	if (attr->map_extra & ~PAGE_MASK)
+		/* If non-zero the map_extra is an expected user VMA start address */
+		return ERR_PTR(-EINVAL);
+
+	vm_range = (u64)attr->max_entries * PAGE_SIZE;
+	if (vm_range > (1ull << 32))
+		return ERR_PTR(-E2BIG);
+
+	if ((attr->map_extra >> 32) != ((attr->map_extra + vm_range - 1) >> 32))
+		/* user vma must not cross 32-bit boundary */
+		return ERR_PTR(-ERANGE);
+
+	kern_vm = get_vm_area(KERN_VM_SZ, VM_SPARSE | VM_USERMAP);
+	if (!kern_vm)
+		return ERR_PTR(-ENOMEM);
+
+	arena = bpf_map_area_alloc(sizeof(*arena), numa_node);
+	if (!arena)
+		goto err;
+
+	arena->kern_vm = kern_vm;
+	arena->user_vm_start = attr->map_extra;
+	if (arena->user_vm_start)
+		arena->user_vm_end = arena->user_vm_start + vm_range;
+
+	INIT_LIST_HEAD(&arena->vma_list);
+	bpf_map_init_from_attr(&arena->map, attr);
+	mt_init_flags(&arena->mt, MT_FLAGS_ALLOC_RANGE);
+	mutex_init(&arena->lock);
+
+	return &arena->map;
+err:
+	free_vm_area(kern_vm);
+	return ERR_PTR(err);
+}
+
+static int existing_page_cb(pte_t *ptep, unsigned long addr, void *data)
+{
+	struct page *page;
+	pte_t pte;
+
+	pte = ptep_get(ptep);
+	if (!pte_present(pte)) /* sanity check */
+		return 0;
+	page = pte_page(pte);
+	/*
+	 * We do not update pte here:
+	 * 1. Nobody should be accessing bpf_arena's range outside of a kernel bug
+	 * 2. TLB flushing is batched or deferred. Even if we clear pte,
+	 * the TLB entries can stick around and continue to permit access to
+	 * the freed page. So it all relies on 1.
+	 */
+	__free_page(page);
+	return 0;
+}
+
+static void arena_map_free(struct bpf_map *map)
+{
+	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+
+	/*
+	 * Check that user vma-s are not around when bpf map is freed.
+	 * mmap() holds vm_file which holds bpf_map refcnt.
+	 * munmap() must have happened on vma followed by arena_vm_close()
+	 * which would clear arena->vma_list.
+	 */
+	if (WARN_ON_ONCE(!list_empty(&arena->vma_list)))
+		return;
+
+	/*
+	 * free_vm_area() calls remove_vm_area() that calls free_unmap_vmap_area().
+	 * It unmaps everything from vmalloc area and clears pgtables.
+	 * Call apply_to_existing_page_range() first to find populated ptes and
+	 * free those pages.
+	 */
+	apply_to_existing_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena),
+				     KERN_VM_SZ - GUARD_SZ, existing_page_cb, NULL);
+	free_vm_area(arena->kern_vm);
+	mtree_destroy(&arena->mt);
+	bpf_map_area_free(arena);
+}
+
+static void *arena_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	return ERR_PTR(-EINVAL);
+}
+
+static long arena_map_update_elem(struct bpf_map *map, void *key,
+				  void *value, u64 flags)
+{
+	return -EOPNOTSUPP;
+}
+
+static int arena_map_check_btf(const struct bpf_map *map, const struct btf *btf,
+			       const struct btf_type *key_type, const struct btf_type *value_type)
+{
+	return 0;
+}
+
+static u64 arena_map_mem_usage(const struct bpf_map *map)
+{
+	return 0;
+}
+
+struct vma_list {
+	struct vm_area_struct *vma;
+	struct list_head head;
+};
+
+static int remember_vma(struct bpf_arena *arena, struct vm_area_struct *vma)
+{
+	struct vma_list *vml;
+
+	vml = kmalloc(sizeof(*vml), GFP_KERNEL);
+	if (!vml)
+		return -ENOMEM;
+	vma->vm_private_data = vml;
+	vml->vma = vma;
+	list_add(&vml->head, &arena->vma_list);
+	return 0;
+}
+
+static void arena_vm_close(struct vm_area_struct *vma)
+{
+	struct bpf_map *map = vma->vm_file->private_data;
+	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+	struct vma_list *vml;
+
+	guard(mutex)(&arena->lock);
+	vml = vma->vm_private_data;
+	list_del(&vml->head);
+	vma->vm_private_data = NULL;
+	kfree(vml);
+}
+
+#define MT_ENTRY ((void *)&arena_map_ops) /* unused. has to be valid pointer */
+
+static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
+{
+	struct bpf_map *map = vmf->vma->vm_file->private_data;
+	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+	struct page *page;
+	long kbase, kaddr;
+	int ret;
+
+	kbase = bpf_arena_get_kern_vm_start(arena);
+	kaddr = kbase + (u32)(vmf->address & PAGE_MASK);
+
+	guard(mutex)(&arena->lock);
+	page = vmalloc_to_page((void *)kaddr);
+	if (page)
+		/* already have a page vmap-ed */
+		goto out;
+
+	if (arena->map.map_flags & BPF_F_SEGV_ON_FAULT)
+		/* User space requested to segfault when page is not allocated by bpf prog */
+		return VM_FAULT_SIGSEGV;
+
+	ret = mtree_insert(&arena->mt, vmf->pgoff, MT_ENTRY, GFP_KERNEL);
+	if (ret)
+		return VM_FAULT_SIGSEGV;
+
+	/* Account into memcg of the process that created bpf_arena */
+	ret = bpf_map_alloc_pages(map, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, 1, &page);
+	if (ret) {
+		mtree_erase(&arena->mt, vmf->pgoff);
+		return VM_FAULT_SIGSEGV;
+	}
+
+	ret = vm_area_map_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE, &page);
+	if (ret) {
+		mtree_erase(&arena->mt, vmf->pgoff);
+		__free_page(page);
+		return VM_FAULT_SIGSEGV;
+	}
+out:
+	page_ref_add(page, 1);
+	vmf->page = page;
+	return 0;
+}
+
+static const struct vm_operations_struct arena_vm_ops = {
+	.close		= arena_vm_close,
+	.fault          = arena_vm_fault,
+};
+
+static unsigned long arena_get_unmapped_area(struct file *filp, unsigned long addr,
+					     unsigned long len, unsigned long pgoff,
+					     unsigned long flags)
+{
+	struct bpf_map *map = filp->private_data;
+	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+	long ret;
+
+	if (pgoff)
+		return -EINVAL;
+	if (len > (1ull << 32))
+		return -E2BIG;
+
+	/* if user_vm_start was specified at arena creation time */
+	if (arena->user_vm_start) {
+		if (len > arena->user_vm_end - arena->user_vm_start)
+			return -E2BIG;
+		if (len != arena->user_vm_end - arena->user_vm_start)
+			return -EINVAL;
+		if (addr != arena->user_vm_start)
+			return -EINVAL;
+	}
+
+	ret = current->mm->get_unmapped_area(filp, addr, len * 2, 0, flags);
+	if (IS_ERR_VALUE(ret))
+		return ret;
+	if ((ret >> 32) == ((ret + len - 1) >> 32))
+		return ret;
+	if (WARN_ON_ONCE(arena->user_vm_start))
+		/* checks at map creation time should prevent this */
+		return -EFAULT;
+	return round_up(ret, 1ull << 32);
+}
+
+static int arena_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
+{
+	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+
+	guard(mutex)(&arena->lock);
+	if (arena->user_vm_start && arena->user_vm_start != vma->vm_start)
+		/*
+		 * If map_extra was not specified at arena creation time then
+		 * 1st user process can do mmap(NULL, ...) to pick user_vm_start
+		 * 2nd user process must pass the same addr to mmap(addr, MAP_FIXED..);
+		 *   or
+		 * specify addr in map_extra and
+		 * use the same addr later with mmap(addr, MAP_FIXED..);
+		 */
+		return -EBUSY;
+
+	if (arena->user_vm_end && arena->user_vm_end != vma->vm_end)
+		/* all user processes must have the same size of mmap-ed region */
+		return -EBUSY;
+
+	/* Earlier checks should prevent this */
+	if (WARN_ON_ONCE(vma->vm_end - vma->vm_start > (1ull << 32) || vma->vm_pgoff))
+		return -EFAULT;
+
+	if (remember_vma(arena, vma))
+		return -ENOMEM;
+
+	arena->user_vm_start = vma->vm_start;
+	arena->user_vm_end = vma->vm_end;
+	/*
+	 * bpf_map_mmap() checks that it's being mmaped as VM_SHARED and
+	 * clears VM_MAYEXEC. Set VM_DONTEXPAND as well to avoid
+	 * potential change of user_vm_start.
+	 */
+	vm_flags_set(vma, VM_DONTEXPAND);
+	vma->vm_ops = &arena_vm_ops;
+	return 0;
+}
+
+static int arena_map_direct_value_addr(const struct bpf_map *map, u64 *imm, u32 off)
+{
+	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+
+	if ((u64)off > arena->user_vm_end - arena->user_vm_start)
+		return -ERANGE;
+	*imm = (unsigned long)arena->user_vm_start;
+	return 0;
+}
+
+BTF_ID_LIST_SINGLE(bpf_arena_map_btf_ids, struct, bpf_arena)
+const struct bpf_map_ops arena_map_ops = {
+	.map_meta_equal = bpf_map_meta_equal,
+	.map_alloc = arena_map_alloc,
+	.map_free = arena_map_free,
+	.map_direct_value_addr = arena_map_direct_value_addr,
+	.map_mmap = arena_map_mmap,
+	.map_get_unmapped_area = arena_get_unmapped_area,
+	.map_get_next_key = arena_map_get_next_key,
+	.map_push_elem = arena_map_push_elem,
+	.map_peek_elem = arena_map_peek_elem,
+	.map_pop_elem = arena_map_pop_elem,
+	.map_lookup_elem = arena_map_lookup_elem,
+	.map_update_elem = arena_map_update_elem,
+	.map_delete_elem = arena_map_delete_elem,
+	.map_check_btf = arena_map_check_btf,
+	.map_mem_usage = arena_map_mem_usage,
+	.map_btf_id = &bpf_arena_map_btf_ids[0],
+};
+
+static u64 clear_lo32(u64 val)
+{
+	return val & ~(u64)~0U;
+}
+
+/*
+ * Allocate pages and vmap them into kernel vmalloc area.
+ * Later the pages will be mmaped into user space vma.
+ */
+static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt, int node_id)
+{
+	/* user_vm_end/start are fixed before bpf prog runs */
+	long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT;
+	u64 kern_vm_start = bpf_arena_get_kern_vm_start(arena);
+	struct page **pages;
+	long pgoff = 0;
+	u32 uaddr32;
+	int ret, i;
+
+	if (page_cnt > page_cnt_max)
+		return 0;
+
+	if (uaddr) {
+		if (uaddr & ~PAGE_MASK)
+			return 0;
+		pgoff = compute_pgoff(arena, uaddr);
+		if (pgoff + page_cnt > page_cnt_max)
+			/* requested address will be outside of user VMA */
+			return 0;
+	}
+
+	/* zeroing is needed, since alloc_pages_bulk_array() only fills in non-zero entries */
+	pages = kvcalloc(page_cnt, sizeof(struct page *), GFP_KERNEL);
+	if (!pages)
+		return 0;
+
+	guard(mutex)(&arena->lock);
+
+	if (uaddr)
+		ret = mtree_insert_range(&arena->mt, pgoff, pgoff + page_cnt - 1,
+					 MT_ENTRY, GFP_KERNEL);
+	else
+		ret = mtree_alloc_range(&arena->mt, &pgoff, MT_ENTRY,
+					page_cnt, 0, page_cnt_max - 1, GFP_KERNEL);
+	if (ret)
+		goto out_free_pages;
+
+	ret = bpf_map_alloc_pages(&arena->map, GFP_KERNEL | __GFP_ZERO,
+				  node_id, page_cnt, pages);
+	if (ret)
+		goto out;
+
+	uaddr32 = (u32)(arena->user_vm_start + pgoff * PAGE_SIZE);
+	/* Earlier checks make sure that uaddr32 + page_cnt * PAGE_SIZE will not overflow 32-bit */
+	ret = vm_area_map_pages(arena->kern_vm, kern_vm_start + uaddr32,
+				kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE, pages);
+	if (ret) {
+		for (i = 0; i < page_cnt; i++)
+			__free_page(pages[i]);
+		goto out;
+	}
+	kvfree(pages);
+	return clear_lo32(arena->user_vm_start) + uaddr32;
+out:
+	mtree_erase(&arena->mt, pgoff);
+out_free_pages:
+	kvfree(pages);
+	return 0;
+}
+
+/*
+ * If page is present in vmalloc area, unmap it from vmalloc area,
+ * unmap it from all user space vma-s,
+ * and free it.
+ */
+static void zap_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
+{
+	struct vma_list *vml;
+
+	list_for_each_entry(vml, &arena->vma_list, head)
+		zap_page_range_single(vml->vma, uaddr,
+				      PAGE_SIZE * page_cnt, NULL);
+}
+
+static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
+{
+	u64 full_uaddr, uaddr_end;
+	long kaddr, pgoff, i;
+	struct page *page;
+
+	/* only aligned lower 32-bit are relevant */
+	uaddr = (u32)uaddr;
+	uaddr &= PAGE_MASK;
+	full_uaddr = clear_lo32(arena->user_vm_start) + uaddr;
+	uaddr_end = min(arena->user_vm_end, full_uaddr + (page_cnt << PAGE_SHIFT));
+	if (full_uaddr >= uaddr_end)
+		return;
+
+	page_cnt = (uaddr_end - full_uaddr) >> PAGE_SHIFT;
+
+	guard(mutex)(&arena->lock);
+
+	pgoff = compute_pgoff(arena, uaddr);
+	/* clear range */
+	mtree_store_range(&arena->mt, pgoff, pgoff + page_cnt - 1, NULL, GFP_KERNEL);
+
+	if (page_cnt > 1)
+		/* bulk zap if multiple pages being freed */
+		zap_pages(arena, full_uaddr, page_cnt);
+
+	kaddr = bpf_arena_get_kern_vm_start(arena) + uaddr;
+	for (i = 0; i < page_cnt; i++, kaddr += PAGE_SIZE, full_uaddr += PAGE_SIZE) {
+		page = vmalloc_to_page((void *)kaddr);
+		if (!page)
+			continue;
+		if (page_cnt == 1 && page_mapped(page)) /* mapped by some user process */
+			zap_pages(arena, full_uaddr, 1);
+		vm_area_unmap_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE);
+		__free_page(page);
+	}
+}
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc void *bpf_arena_alloc_pages(void *p__map, void *addr__ign, u32 page_cnt,
+					int node_id, u64 flags)
+{
+	struct bpf_map *map = p__map;
+	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+
+	if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt)
+		return NULL;
+
+	return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id);
+}
+
+__bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt)
+{
+	struct bpf_map *map = p__map;
+	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+
+	if (map->map_type != BPF_MAP_TYPE_ARENA || !page_cnt || !ptr__ign)
+		return;
+	arena_free_pages(arena, (long)ptr__ign, page_cnt);
+}
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(arena_kfuncs)
+BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE)
+BTF_KFUNCS_END(arena_kfuncs)
+
+static const struct btf_kfunc_id_set common_kfunc_set = {
+	.owner = THIS_MODULE,
+	.set   = &arena_kfuncs,
+};
+
+static int __init kfunc_init(void)
+{
+	return register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &common_kfunc_set);
+}
+late_initcall(kfunc_init);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 134b7979f537..a8ecf69c7754 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2976,6 +2976,17 @@ void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp,
 {
 }
 
+/* for configs without MMU or 32-bit */
+__weak const struct bpf_map_ops arena_map_ops;
+__weak u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena)
+{
+	return 0;
+}
+__weak u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena)
+{
+	return 0;
+}
+
 #ifdef CONFIG_BPF_SYSCALL
 static int __init bpf_global_ma_init(void)
 {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index f63f4da4db5e..67923e41a07e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -164,6 +164,7 @@ static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
 	if (bpf_map_is_offloaded(map)) {
 		return bpf_map_offload_update_elem(map, key, value, flags);
 	} else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
+		   map->map_type == BPF_MAP_TYPE_ARENA ||
 		   map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
 		return map->ops->map_update_elem(map, key, value, flags);
 	} else if (map->map_type == BPF_MAP_TYPE_SOCKHASH ||
@@ -479,6 +480,39 @@ static void bpf_map_release_memcg(struct bpf_map *map)
 }
 #endif
 
+int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid,
+			unsigned long nr_pages, struct page **pages)
+{
+	unsigned long i, j;
+	struct page *pg;
+	int ret = 0;
+#ifdef CONFIG_MEMCG_KMEM
+	struct mem_cgroup *memcg, *old_memcg;
+
+	memcg = bpf_map_get_memcg(map);
+	old_memcg = set_active_memcg(memcg);
+#endif
+	for (i = 0; i < nr_pages; i++) {
+		pg = alloc_pages_node(nid, gfp | __GFP_ACCOUNT, 0);
+
+		if (pg) {
+			pages[i] = pg;
+			continue;
+		}
+		for (j = 0; j < i; j++)
+			__free_page(pages[j]);
+		ret = -ENOMEM;
+		break;
+	}
+
+#ifdef CONFIG_MEMCG_KMEM
+	set_active_memcg(old_memcg);
+	mem_cgroup_put(memcg);
+#endif
+	return ret;
+}
+
+
 static int btf_field_cmp(const void *a, const void *b)
 {
 	const struct btf_field *f1 = a, *f2 = b;
@@ -1176,6 +1210,7 @@ static int map_create(union bpf_attr *attr)
 	}
 
 	if (attr->map_type != BPF_MAP_TYPE_BLOOM_FILTER &&
+	    attr->map_type != BPF_MAP_TYPE_ARENA &&
 	    attr->map_extra != 0)
 		return -EINVAL;
 
@@ -1265,6 +1300,7 @@ static int map_create(union bpf_attr *attr)
 	case BPF_MAP_TYPE_LRU_PERCPU_HASH:
 	case BPF_MAP_TYPE_STRUCT_OPS:
 	case BPF_MAP_TYPE_CPUMAP:
+	case BPF_MAP_TYPE_ARENA:
 		if (!bpf_token_capable(token, CAP_BPF))
 			goto put_token;
 		break;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index bf084c693507..fbcf2e5e635a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -18108,6 +18108,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
 		case BPF_MAP_TYPE_CGRP_STORAGE:
 		case BPF_MAP_TYPE_QUEUE:
 		case BPF_MAP_TYPE_STACK:
+		case BPF_MAP_TYPE_ARENA:
 			break;
 		default:
 			verbose(env,
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 85ec7fc799d7..e30d943db8a4 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1009,6 +1009,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_BLOOM_FILTER,
 	BPF_MAP_TYPE_USER_RINGBUF,
 	BPF_MAP_TYPE_CGRP_STORAGE,
+	BPF_MAP_TYPE_ARENA,
 	__MAX_BPF_MAP_TYPE
 };
 
@@ -1396,6 +1397,12 @@ enum {
 
 /* BPF token FD is passed in a corresponding command's token_fd field */
 	BPF_F_TOKEN_FD          = (1U << 16),
+
+/* When user space page faults in bpf_arena send SIGSEGV instead of inserting new page */
+	BPF_F_SEGV_ON_FAULT	= (1U << 17),
+
+/* Do not translate kernel bpf_arena pointers to user pointers */
+	BPF_F_NO_USER_CONV	= (1U << 18),
 };
 
 /* Flags for BPF_PROG_QUERY. */
@@ -1467,6 +1474,9 @@ union bpf_attr {
 		 * BPF_MAP_TYPE_BLOOM_FILTER - the lowest 4 bits indicate the
 		 * number of hash functions (if 0, the bloom filter will default
 		 * to using 5 hash functions).
+		 *
+		 * BPF_MAP_TYPE_ARENA - contains the address where user space
+		 * is going to mmap() the arena. It has to be page aligned.
 		 */
 		__u64	map_extra;
 
-- 
cgit v1.2.3


From 667a86ad9b71d934c444eec193cf3508016f35c5 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Thu, 7 Mar 2024 17:08:00 -0800
Subject: bpf: Disasm support for addr_space_cast instruction.

LLVM generates rX = addr_space_cast(rY, dst_addr_space, src_addr_space)
instruction when pointers in non-zero address space are used by the bpf
program. Recognize this insn in uapi and in bpf disassembler.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/bpf/20240308010812.89848-3-alexei.starovoitov@gmail.com
---
 include/uapi/linux/bpf.h       |  4 ++++
 kernel/bpf/disasm.c            | 10 ++++++++++
 tools/include/uapi/linux/bpf.h |  4 ++++
 3 files changed, 18 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index e30d943db8a4..3c42b9f1bada 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1339,6 +1339,10 @@ enum {
  */
 #define BPF_PSEUDO_KFUNC_CALL	2
 
+enum bpf_addr_space_cast {
+	BPF_ADDR_SPACE_CAST = 1,
+};
+
 /* flags for BPF_MAP_UPDATE_ELEM command */
 enum {
 	BPF_ANY		= 0, /* create new element or update existing */
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index 82b2dbdd048f..bd2e2dd04740 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -166,6 +166,12 @@ static bool is_movsx(const struct bpf_insn *insn)
 	       (insn->off == 8 || insn->off == 16 || insn->off == 32);
 }
 
+static bool is_addr_space_cast(const struct bpf_insn *insn)
+{
+	return insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) &&
+		insn->off == BPF_ADDR_SPACE_CAST;
+}
+
 void print_bpf_insn(const struct bpf_insn_cbs *cbs,
 		    const struct bpf_insn *insn,
 		    bool allow_ptr_leaks)
@@ -184,6 +190,10 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
 				insn->code, class == BPF_ALU ? 'w' : 'r',
 				insn->dst_reg, class == BPF_ALU ? 'w' : 'r',
 				insn->dst_reg);
+		} else if (is_addr_space_cast(insn)) {
+			verbose(cbs->private_data, "(%02x) r%d = addr_space_cast(r%d, %d, %d)\n",
+				insn->code, insn->dst_reg,
+				insn->src_reg, ((u32)insn->imm) >> 16, (u16)insn->imm);
 		} else if (BPF_SRC(insn->code) == BPF_X) {
 			verbose(cbs->private_data, "(%02x) %c%d %s %s%c%d\n",
 				insn->code, class == BPF_ALU ? 'w' : 'r',
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index e30d943db8a4..3c42b9f1bada 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1339,6 +1339,10 @@ enum {
  */
 #define BPF_PSEUDO_KFUNC_CALL	2
 
+enum bpf_addr_space_cast {
+	BPF_ADDR_SPACE_CAST = 1,
+};
+
 /* flags for BPF_MAP_UPDATE_ELEM command */
 enum {
 	BPF_ANY		= 0, /* create new element or update existing */
-- 
cgit v1.2.3


From 2fe99eb0ccf2bb73df65ebcbbf2f2ff70e63547b Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Thu, 7 Mar 2024 17:08:01 -0800
Subject: bpf: Add x86-64 JIT support for PROBE_MEM32 pseudo instructions.

Add support for [LDX | STX | ST], PROBE_MEM32, [B | H | W | DW] instructions.
They are similar to PROBE_MEM instructions with the following differences:
- PROBE_MEM has to check that the address is in the kernel range with
  src_reg + insn->off >= TASK_SIZE_MAX + PAGE_SIZE check
- PROBE_MEM doesn't support store
- PROBE_MEM32 relies on the verifier to clear upper 32-bit in the register
- PROBE_MEM32 adds 64-bit kern_vm_start address (which is stored in %r12 in the prologue)
  Due to bpf_arena constructions such %r12 + %reg + off16 access is guaranteed
  to be within arena virtual range, so no address check at run-time.
- PROBE_MEM32 allows STX and ST. If they fault the store is a nop.
  When LDX faults the destination register is zeroed.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/bpf/20240308010812.89848-4-alexei.starovoitov@gmail.com
---
 arch/x86/net/bpf_jit_comp.c | 191 +++++++++++++++++++++++++++++++++++++++++++-
 include/linux/bpf.h         |   1 +
 include/linux/filter.h      |   3 +
 3 files changed, 194 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index e1390d1e331b..38705a1abe62 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -113,6 +113,7 @@ static int bpf_size_to_x86_bytes(int bpf_size)
 /* Pick a register outside of BPF range for JIT internal work */
 #define AUX_REG (MAX_BPF_JIT_REG + 1)
 #define X86_REG_R9 (MAX_BPF_JIT_REG + 2)
+#define X86_REG_R12 (MAX_BPF_JIT_REG + 3)
 
 /*
  * The following table maps BPF registers to x86-64 registers.
@@ -139,6 +140,7 @@ static const int reg2hex[] = {
 	[BPF_REG_AX] = 2, /* R10 temp register */
 	[AUX_REG] = 3,    /* R11 temp register */
 	[X86_REG_R9] = 1, /* R9 register, 6th function argument */
+	[X86_REG_R12] = 4, /* R12 callee saved */
 };
 
 static const int reg2pt_regs[] = {
@@ -167,6 +169,7 @@ static bool is_ereg(u32 reg)
 			     BIT(BPF_REG_8) |
 			     BIT(BPF_REG_9) |
 			     BIT(X86_REG_R9) |
+			     BIT(X86_REG_R12) |
 			     BIT(BPF_REG_AX));
 }
 
@@ -205,6 +208,17 @@ static u8 add_2mod(u8 byte, u32 r1, u32 r2)
 	return byte;
 }
 
+static u8 add_3mod(u8 byte, u32 r1, u32 r2, u32 index)
+{
+	if (is_ereg(r1))
+		byte |= 1;
+	if (is_ereg(index))
+		byte |= 2;
+	if (is_ereg(r2))
+		byte |= 4;
+	return byte;
+}
+
 /* Encode 'dst_reg' register into x86-64 opcode 'byte' */
 static u8 add_1reg(u8 byte, u32 dst_reg)
 {
@@ -645,6 +659,8 @@ static void emit_bpf_tail_call_indirect(struct bpf_prog *bpf_prog,
 		pop_r12(&prog);
 	} else {
 		pop_callee_regs(&prog, callee_regs_used);
+		if (bpf_arena_get_kern_vm_start(bpf_prog->aux->arena))
+			pop_r12(&prog);
 	}
 
 	EMIT1(0x58);                              /* pop rax */
@@ -704,6 +720,8 @@ static void emit_bpf_tail_call_direct(struct bpf_prog *bpf_prog,
 		pop_r12(&prog);
 	} else {
 		pop_callee_regs(&prog, callee_regs_used);
+		if (bpf_arena_get_kern_vm_start(bpf_prog->aux->arena))
+			pop_r12(&prog);
 	}
 
 	EMIT1(0x58);                                  /* pop rax */
@@ -887,6 +905,18 @@ static void emit_insn_suffix(u8 **pprog, u32 ptr_reg, u32 val_reg, int off)
 	*pprog = prog;
 }
 
+static void emit_insn_suffix_SIB(u8 **pprog, u32 ptr_reg, u32 val_reg, u32 index_reg, int off)
+{
+	u8 *prog = *pprog;
+
+	if (is_imm8(off)) {
+		EMIT3(add_2reg(0x44, BPF_REG_0, val_reg), add_2reg(0, ptr_reg, index_reg) /* SIB */, off);
+	} else {
+		EMIT2_off32(add_2reg(0x84, BPF_REG_0, val_reg), add_2reg(0, ptr_reg, index_reg) /* SIB */, off);
+	}
+	*pprog = prog;
+}
+
 /*
  * Emit a REX byte if it will be necessary to address these registers
  */
@@ -968,6 +998,37 @@ static void emit_ldsx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
 	*pprog = prog;
 }
 
+static void emit_ldx_index(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, u32 index_reg, int off)
+{
+	u8 *prog = *pprog;
+
+	switch (size) {
+	case BPF_B:
+		/* movzx rax, byte ptr [rax + r12 + off] */
+		EMIT3(add_3mod(0x40, src_reg, dst_reg, index_reg), 0x0F, 0xB6);
+		break;
+	case BPF_H:
+		/* movzx rax, word ptr [rax + r12 + off] */
+		EMIT3(add_3mod(0x40, src_reg, dst_reg, index_reg), 0x0F, 0xB7);
+		break;
+	case BPF_W:
+		/* mov eax, dword ptr [rax + r12 + off] */
+		EMIT2(add_3mod(0x40, src_reg, dst_reg, index_reg), 0x8B);
+		break;
+	case BPF_DW:
+		/* mov rax, qword ptr [rax + r12 + off] */
+		EMIT2(add_3mod(0x48, src_reg, dst_reg, index_reg), 0x8B);
+		break;
+	}
+	emit_insn_suffix_SIB(&prog, src_reg, dst_reg, index_reg, off);
+	*pprog = prog;
+}
+
+static void emit_ldx_r12(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
+{
+	emit_ldx_index(pprog, size, dst_reg, src_reg, X86_REG_R12, off);
+}
+
 /* STX: *(u8*)(dst_reg + off) = src_reg */
 static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
 {
@@ -1002,6 +1063,71 @@ static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
 	*pprog = prog;
 }
 
+/* STX: *(u8*)(dst_reg + index_reg + off) = src_reg */
+static void emit_stx_index(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, u32 index_reg, int off)
+{
+	u8 *prog = *pprog;
+
+	switch (size) {
+	case BPF_B:
+		/* mov byte ptr [rax + r12 + off], al */
+		EMIT2(add_3mod(0x40, dst_reg, src_reg, index_reg), 0x88);
+		break;
+	case BPF_H:
+		/* mov word ptr [rax + r12 + off], ax */
+		EMIT3(0x66, add_3mod(0x40, dst_reg, src_reg, index_reg), 0x89);
+		break;
+	case BPF_W:
+		/* mov dword ptr [rax + r12 + 1], eax */
+		EMIT2(add_3mod(0x40, dst_reg, src_reg, index_reg), 0x89);
+		break;
+	case BPF_DW:
+		/* mov qword ptr [rax + r12 + 1], rax */
+		EMIT2(add_3mod(0x48, dst_reg, src_reg, index_reg), 0x89);
+		break;
+	}
+	emit_insn_suffix_SIB(&prog, dst_reg, src_reg, index_reg, off);
+	*pprog = prog;
+}
+
+static void emit_stx_r12(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
+{
+	emit_stx_index(pprog, size, dst_reg, src_reg, X86_REG_R12, off);
+}
+
+/* ST: *(u8*)(dst_reg + index_reg + off) = imm32 */
+static void emit_st_index(u8 **pprog, u32 size, u32 dst_reg, u32 index_reg, int off, int imm)
+{
+	u8 *prog = *pprog;
+
+	switch (size) {
+	case BPF_B:
+		/* mov byte ptr [rax + r12 + off], imm8 */
+		EMIT2(add_3mod(0x40, dst_reg, 0, index_reg), 0xC6);
+		break;
+	case BPF_H:
+		/* mov word ptr [rax + r12 + off], imm16 */
+		EMIT3(0x66, add_3mod(0x40, dst_reg, 0, index_reg), 0xC7);
+		break;
+	case BPF_W:
+		/* mov dword ptr [rax + r12 + 1], imm32 */
+		EMIT2(add_3mod(0x40, dst_reg, 0, index_reg), 0xC7);
+		break;
+	case BPF_DW:
+		/* mov qword ptr [rax + r12 + 1], imm32 */
+		EMIT2(add_3mod(0x48, dst_reg, 0, index_reg), 0xC7);
+		break;
+	}
+	emit_insn_suffix_SIB(&prog, dst_reg, 0, index_reg, off);
+	EMIT(imm, bpf_size_to_x86_bytes(size));
+	*pprog = prog;
+}
+
+static void emit_st_r12(u8 **pprog, u32 size, u32 dst_reg, int off, int imm)
+{
+	emit_st_index(pprog, size, dst_reg, X86_REG_R12, off, imm);
+}
+
 static int emit_atomic(u8 **pprog, u8 atomic_op,
 		       u32 dst_reg, u32 src_reg, s16 off, u8 bpf_size)
 {
@@ -1043,12 +1169,15 @@ static int emit_atomic(u8 **pprog, u8 atomic_op,
 	return 0;
 }
 
+#define DONT_CLEAR 1
+
 bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs)
 {
 	u32 reg = x->fixup >> 8;
 
 	/* jump over faulting load and clear dest register */
-	*(unsigned long *)((void *)regs + reg) = 0;
+	if (reg != DONT_CLEAR)
+		*(unsigned long *)((void *)regs + reg) = 0;
 	regs->ip += x->fixup & 0xff;
 	return true;
 }
@@ -1147,11 +1276,14 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
 	bool tail_call_seen = false;
 	bool seen_exit = false;
 	u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
+	u64 arena_vm_start;
 	int i, excnt = 0;
 	int ilen, proglen = 0;
 	u8 *prog = temp;
 	int err;
 
+	arena_vm_start = bpf_arena_get_kern_vm_start(bpf_prog->aux->arena);
+
 	detect_reg_usage(insn, insn_cnt, callee_regs_used,
 			 &tail_call_seen);
 
@@ -1172,8 +1304,13 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
 		push_r12(&prog);
 		push_callee_regs(&prog, all_callee_regs_used);
 	} else {
+		if (arena_vm_start)
+			push_r12(&prog);
 		push_callee_regs(&prog, callee_regs_used);
 	}
+	if (arena_vm_start)
+		emit_mov_imm64(&prog, X86_REG_R12,
+			       arena_vm_start >> 32, (u32) arena_vm_start);
 
 	ilen = prog - temp;
 	if (rw_image)
@@ -1564,6 +1701,56 @@ st:			if (is_imm8(insn->off))
 			emit_stx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off);
 			break;
 
+		case BPF_ST | BPF_PROBE_MEM32 | BPF_B:
+		case BPF_ST | BPF_PROBE_MEM32 | BPF_H:
+		case BPF_ST | BPF_PROBE_MEM32 | BPF_W:
+		case BPF_ST | BPF_PROBE_MEM32 | BPF_DW:
+			start_of_ldx = prog;
+			emit_st_r12(&prog, BPF_SIZE(insn->code), dst_reg, insn->off, insn->imm);
+			goto populate_extable;
+
+			/* LDX: dst_reg = *(u8*)(src_reg + r12 + off) */
+		case BPF_LDX | BPF_PROBE_MEM32 | BPF_B:
+		case BPF_LDX | BPF_PROBE_MEM32 | BPF_H:
+		case BPF_LDX | BPF_PROBE_MEM32 | BPF_W:
+		case BPF_LDX | BPF_PROBE_MEM32 | BPF_DW:
+		case BPF_STX | BPF_PROBE_MEM32 | BPF_B:
+		case BPF_STX | BPF_PROBE_MEM32 | BPF_H:
+		case BPF_STX | BPF_PROBE_MEM32 | BPF_W:
+		case BPF_STX | BPF_PROBE_MEM32 | BPF_DW:
+			start_of_ldx = prog;
+			if (BPF_CLASS(insn->code) == BPF_LDX)
+				emit_ldx_r12(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off);
+			else
+				emit_stx_r12(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off);
+populate_extable:
+			{
+				struct exception_table_entry *ex;
+				u8 *_insn = image + proglen + (start_of_ldx - temp);
+				s64 delta;
+
+				if (!bpf_prog->aux->extable)
+					break;
+
+				if (excnt >= bpf_prog->aux->num_exentries) {
+					pr_err("mem32 extable bug\n");
+					return -EFAULT;
+				}
+				ex = &bpf_prog->aux->extable[excnt++];
+
+				delta = _insn - (u8 *)&ex->insn;
+				/* switch ex to rw buffer for writes */
+				ex = (void *)rw_image + ((void *)ex - (void *)image);
+
+				ex->insn = delta;
+
+				ex->data = EX_TYPE_BPF;
+
+				ex->fixup = (prog - start_of_ldx) |
+					((BPF_CLASS(insn->code) == BPF_LDX ? reg2pt_regs[dst_reg] : DONT_CLEAR) << 8);
+			}
+			break;
+
 			/* LDX: dst_reg = *(u8*)(src_reg + off) */
 		case BPF_LDX | BPF_MEM | BPF_B:
 		case BPF_LDX | BPF_PROBE_MEM | BPF_B:
@@ -2036,6 +2223,8 @@ emit_jmp:
 				pop_r12(&prog);
 			} else {
 				pop_callee_regs(&prog, callee_regs_used);
+				if (arena_vm_start)
+					pop_r12(&prog);
 			}
 			EMIT1(0xC9);         /* leave */
 			emit_return(&prog, image + addrs[i - 1] + (prog - temp));
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index ea6ab6e0eef9..8904d1606125 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1458,6 +1458,7 @@ struct bpf_prog_aux {
 	bool xdp_has_frags;
 	bool exception_cb;
 	bool exception_boundary;
+	struct bpf_arena *arena;
 	/* BTF_KIND_FUNC_PROTO for valid attach_btf_id */
 	const struct btf_type *attach_func_proto;
 	/* function name for valid attach_btf_id */
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 36cc29a2934c..b119f04ecb0b 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -72,6 +72,9 @@ struct ctl_table_header;
 /* unused opcode to mark special ldsx instruction. Same as BPF_IND */
 #define BPF_PROBE_MEMSX	0x40
 
+/* unused opcode to mark special load instruction. Same as BPF_MSH */
+#define BPF_PROBE_MEM32	0xa0
+
 /* unused opcode to mark call to interpreter with arguments */
 #define BPF_CALL_ARGS	0xe0
 
-- 
cgit v1.2.3


From 142fd4d2dcf58b1720a6af644f31de1a5551f219 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Thu, 7 Mar 2024 17:08:02 -0800
Subject: bpf: Add x86-64 JIT support for bpf_addr_space_cast instruction.

LLVM generates bpf_addr_space_cast instruction while translating
pointers between native (zero) address space and
__attribute__((address_space(N))).
The addr_space=1 is reserved as bpf_arena address space.

rY = addr_space_cast(rX, 0, 1) is processed by the verifier and
converted to normal 32-bit move: wX = wY

rY = addr_space_cast(rX, 1, 0) has to be converted by JIT:

aux_reg = upper_32_bits of arena->user_vm_start
aux_reg <<= 32
wX = wY // clear upper 32 bits of dst register
if (wX) // if not zero add upper bits of user_vm_start
  wX |= aux_reg

JIT can do it more efficiently:

mov dst_reg32, src_reg32  // 32-bit move
shl dst_reg, 32
or dst_reg, user_vm_start
rol dst_reg, 32
xor r11, r11
test dst_reg32, dst_reg32 // check if lower 32-bit are zero
cmove r11, dst_reg	  // if so, set dst_reg to zero
			  // Intel swapped src/dst register encoding in CMOVcc

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/bpf/20240308010812.89848-5-alexei.starovoitov@gmail.com
---
 arch/x86/net/bpf_jit_comp.c | 42 +++++++++++++++++++++++++++++++++++++++++-
 include/linux/filter.h      |  1 +
 kernel/bpf/core.c           |  5 +++++
 3 files changed, 47 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 38705a1abe62..27058d7395f6 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1276,13 +1276,14 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
 	bool tail_call_seen = false;
 	bool seen_exit = false;
 	u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
-	u64 arena_vm_start;
+	u64 arena_vm_start, user_vm_start;
 	int i, excnt = 0;
 	int ilen, proglen = 0;
 	u8 *prog = temp;
 	int err;
 
 	arena_vm_start = bpf_arena_get_kern_vm_start(bpf_prog->aux->arena);
+	user_vm_start = bpf_arena_get_user_vm_start(bpf_prog->aux->arena);
 
 	detect_reg_usage(insn, insn_cnt, callee_regs_used,
 			 &tail_call_seen);
@@ -1350,6 +1351,40 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
 			break;
 
 		case BPF_ALU64 | BPF_MOV | BPF_X:
+			if (insn->off == BPF_ADDR_SPACE_CAST &&
+			    insn->imm == 1U << 16) {
+				if (dst_reg != src_reg)
+					/* 32-bit mov */
+					emit_mov_reg(&prog, false, dst_reg, src_reg);
+				/* shl dst_reg, 32 */
+				maybe_emit_1mod(&prog, dst_reg, true);
+				EMIT3(0xC1, add_1reg(0xE0, dst_reg), 32);
+
+				/* or dst_reg, user_vm_start */
+				maybe_emit_1mod(&prog, dst_reg, true);
+				if (is_axreg(dst_reg))
+					EMIT1_off32(0x0D,  user_vm_start >> 32);
+				else
+					EMIT2_off32(0x81, add_1reg(0xC8, dst_reg),  user_vm_start >> 32);
+
+				/* rol dst_reg, 32 */
+				maybe_emit_1mod(&prog, dst_reg, true);
+				EMIT3(0xC1, add_1reg(0xC0, dst_reg), 32);
+
+				/* xor r11, r11 */
+				EMIT3(0x4D, 0x31, 0xDB);
+
+				/* test dst_reg32, dst_reg32; check if lower 32-bit are zero */
+				maybe_emit_mod(&prog, dst_reg, dst_reg, false);
+				EMIT2(0x85, add_2reg(0xC0, dst_reg, dst_reg));
+
+				/* cmove r11, dst_reg; if so, set dst_reg to zero */
+				/* WARNING: Intel swapped src/dst register encoding in CMOVcc !!! */
+				maybe_emit_mod(&prog, AUX_REG, dst_reg, true);
+				EMIT3(0x0F, 0x44, add_2reg(0xC0, AUX_REG, dst_reg));
+				break;
+			}
+			fallthrough;
 		case BPF_ALU | BPF_MOV | BPF_X:
 			if (insn->off == 0)
 				emit_mov_reg(&prog,
@@ -3432,6 +3467,11 @@ void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke,
 	}
 }
 
+bool bpf_jit_supports_arena(void)
+{
+	return true;
+}
+
 bool bpf_jit_supports_ptr_xchg(void)
 {
 	return true;
diff --git a/include/linux/filter.h b/include/linux/filter.h
index b119f04ecb0b..c99bc3df2d28 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -962,6 +962,7 @@ bool bpf_jit_supports_kfunc_call(void);
 bool bpf_jit_supports_far_kfunc_call(void);
 bool bpf_jit_supports_exceptions(void);
 bool bpf_jit_supports_ptr_xchg(void);
+bool bpf_jit_supports_arena(void);
 void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie);
 bool bpf_helper_changes_pkt_data(void *func);
 
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index a8ecf69c7754..bdbdc75cdcd5 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2932,6 +2932,11 @@ bool __weak bpf_jit_supports_far_kfunc_call(void)
 	return false;
 }
 
+bool __weak bpf_jit_supports_arena(void)
+{
+	return false;
+}
+
 /* Return TRUE if the JIT backend satisfies the following two conditions:
  * 1) JIT backend supports atomic_xchg() on pointer-sized words.
  * 2) Under the specific arch, the implementation of xchg() is the same
-- 
cgit v1.2.3


From 6082b6c328b5486da2b356eae94b8b83c98b5565 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Thu, 7 Mar 2024 17:08:03 -0800
Subject: bpf: Recognize addr_space_cast instruction in the verifier.

rY = addr_space_cast(rX, 0, 1) tells the verifier that rY->type = PTR_TO_ARENA.
Any further operations on PTR_TO_ARENA register have to be in 32-bit domain.

The verifier will mark load/store through PTR_TO_ARENA with PROBE_MEM32.
JIT will generate them as kern_vm_start + 32bit_addr memory accesses.

rY = addr_space_cast(rX, 1, 0) tells the verifier that rY->type = unknown scalar.
If arena->map_flags has BPF_F_NO_USER_CONV set then convert cast_user to mov32 as well.
Otherwise JIT will convert it to:
  rY = (u32)rX;
  if (rY)
     rY |= arena->user_vm_start & ~(u64)~0U;

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20240308010812.89848-6-alexei.starovoitov@gmail.com
---
 include/linux/bpf.h          |   1 +
 include/linux/bpf_verifier.h |   1 +
 kernel/bpf/log.c             |   3 ++
 kernel/bpf/syscall.c         |   6 +++
 kernel/bpf/verifier.c        | 107 +++++++++++++++++++++++++++++++++++++++----
 5 files changed, 109 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 8904d1606125..d0c836ba009d 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -883,6 +883,7 @@ enum bpf_reg_type {
 	 * an explicit null check is required for this struct.
 	 */
 	PTR_TO_MEM,		 /* reg points to valid memory region */
+	PTR_TO_ARENA,
 	PTR_TO_BUF,		 /* reg points to a read/write buffer */
 	PTR_TO_FUNC,		 /* reg points to a bpf program function */
 	CONST_PTR_TO_DYNPTR,	 /* reg points to a const struct bpf_dynptr */
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 4b0f6600e499..7cb1b75eee38 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -548,6 +548,7 @@ struct bpf_insn_aux_data {
 	u32 seen; /* this insn was processed by the verifier at env->pass_cnt */
 	bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */
 	bool zext_dst; /* this insn zero extends dst reg */
+	bool needs_zext; /* alu op needs to clear upper bits */
 	bool storage_get_func_atomic; /* bpf_*_storage_get() with atomic memory alloc */
 	bool is_iter_next; /* bpf_iter_<type>_next() kfunc call */
 	bool call_with_percpu_alloc_ptr; /* {this,per}_cpu_ptr() with prog percpu alloc */
diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
index 63c34e7b0715..2a243cf37c60 100644
--- a/kernel/bpf/log.c
+++ b/kernel/bpf/log.c
@@ -458,6 +458,7 @@ const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type)
 		[PTR_TO_XDP_SOCK]	= "xdp_sock",
 		[PTR_TO_BTF_ID]		= "ptr_",
 		[PTR_TO_MEM]		= "mem",
+		[PTR_TO_ARENA]		= "arena",
 		[PTR_TO_BUF]		= "buf",
 		[PTR_TO_FUNC]		= "func",
 		[PTR_TO_MAP_KEY]	= "map_key",
@@ -693,6 +694,8 @@ static void print_reg_state(struct bpf_verifier_env *env,
 	}
 
 	verbose(env, "%s", reg_type_str(env, t));
+	if (t == PTR_TO_ARENA)
+		return;
 	if (t == PTR_TO_STACK) {
 		if (state->frameno != reg->frameno)
 			verbose(env, "[%d]", reg->frameno);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 67923e41a07e..07f2a4db4511 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -4453,6 +4453,12 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog,
 			continue;
 		}
 
+		if ((BPF_CLASS(code) == BPF_LDX || BPF_CLASS(code) == BPF_STX ||
+		     BPF_CLASS(code) == BPF_ST) && BPF_MODE(code) == BPF_PROBE_MEM32) {
+			insns[i].code = BPF_CLASS(code) | BPF_SIZE(code) | BPF_MEM;
+			continue;
+		}
+
 		if (code != (BPF_LD | BPF_IMM | BPF_DW))
 			continue;
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index fbcf2e5e635a..1358e20d315a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4386,6 +4386,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
 	case PTR_TO_MEM:
 	case PTR_TO_FUNC:
 	case PTR_TO_MAP_KEY:
+	case PTR_TO_ARENA:
 		return true;
 	default:
 		return false;
@@ -5828,6 +5829,8 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
 	case PTR_TO_XDP_SOCK:
 		pointer_desc = "xdp_sock ";
 		break;
+	case PTR_TO_ARENA:
+		return 0;
 	default:
 		break;
 	}
@@ -6937,6 +6940,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 
 		if (!err && value_regno >= 0 && (rdonly_mem || t == BPF_READ))
 			mark_reg_unknown(env, regs, value_regno);
+	} else if (reg->type == PTR_TO_ARENA) {
+		if (t == BPF_READ && value_regno >= 0)
+			mark_reg_unknown(env, regs, value_regno);
 	} else {
 		verbose(env, "R%d invalid mem access '%s'\n", regno,
 			reg_type_str(env, reg->type));
@@ -8408,6 +8414,7 @@ static int check_func_arg_reg_off(struct bpf_verifier_env *env,
 	case PTR_TO_MEM | MEM_RINGBUF:
 	case PTR_TO_BUF:
 	case PTR_TO_BUF | MEM_RDONLY:
+	case PTR_TO_ARENA:
 	case SCALAR_VALUE:
 		return 0;
 	/* All the rest must be rejected, except PTR_TO_BTF_ID which allows
@@ -13852,6 +13859,21 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
 
 	dst_reg = &regs[insn->dst_reg];
 	src_reg = NULL;
+
+	if (dst_reg->type == PTR_TO_ARENA) {
+		struct bpf_insn_aux_data *aux = cur_aux(env);
+
+		if (BPF_CLASS(insn->code) == BPF_ALU64)
+			/*
+			 * 32-bit operations zero upper bits automatically.
+			 * 64-bit operations need to be converted to 32.
+			 */
+			aux->needs_zext = true;
+
+		/* Any arithmetic operations are allowed on arena pointers */
+		return 0;
+	}
+
 	if (dst_reg->type != SCALAR_VALUE)
 		ptr_reg = dst_reg;
 	else
@@ -13969,19 +13991,20 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 	} else if (opcode == BPF_MOV) {
 
 		if (BPF_SRC(insn->code) == BPF_X) {
-			if (insn->imm != 0) {
-				verbose(env, "BPF_MOV uses reserved fields\n");
-				return -EINVAL;
-			}
-
 			if (BPF_CLASS(insn->code) == BPF_ALU) {
-				if (insn->off != 0 && insn->off != 8 && insn->off != 16) {
+				if ((insn->off != 0 && insn->off != 8 && insn->off != 16) ||
+				    insn->imm) {
 					verbose(env, "BPF_MOV uses reserved fields\n");
 					return -EINVAL;
 				}
+			} else if (insn->off == BPF_ADDR_SPACE_CAST) {
+				if (insn->imm != 1 && insn->imm != 1u << 16) {
+					verbose(env, "addr_space_cast insn can only convert between address space 1 and 0\n");
+					return -EINVAL;
+				}
 			} else {
-				if (insn->off != 0 && insn->off != 8 && insn->off != 16 &&
-				    insn->off != 32) {
+				if ((insn->off != 0 && insn->off != 8 && insn->off != 16 &&
+				     insn->off != 32) || insn->imm) {
 					verbose(env, "BPF_MOV uses reserved fields\n");
 					return -EINVAL;
 				}
@@ -14008,7 +14031,12 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 			struct bpf_reg_state *dst_reg = regs + insn->dst_reg;
 
 			if (BPF_CLASS(insn->code) == BPF_ALU64) {
-				if (insn->off == 0) {
+				if (insn->imm) {
+					/* off == BPF_ADDR_SPACE_CAST */
+					mark_reg_unknown(env, regs, insn->dst_reg);
+					if (insn->imm == 1) /* cast from as(1) to as(0) */
+						dst_reg->type = PTR_TO_ARENA;
+				} else if (insn->off == 0) {
 					/* case: R1 = R2
 					 * copy register state to dest reg
 					 */
@@ -15182,6 +15210,10 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
 
 	if (insn->src_reg == BPF_PSEUDO_MAP_VALUE ||
 	    insn->src_reg == BPF_PSEUDO_MAP_IDX_VALUE) {
+		if (map->map_type == BPF_MAP_TYPE_ARENA) {
+			__mark_reg_unknown(env, dst_reg);
+			return 0;
+		}
 		dst_reg->type = PTR_TO_MAP_VALUE;
 		dst_reg->off = aux->map_off;
 		WARN_ON_ONCE(map->max_entries != 1);
@@ -16568,6 +16600,8 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
 		 * the same stack frame, since fp-8 in foo != fp-8 in bar
 		 */
 		return regs_exact(rold, rcur, idmap) && rold->frameno == rcur->frameno;
+	case PTR_TO_ARENA:
+		return true;
 	default:
 		return regs_exact(rold, rcur, idmap);
 	}
@@ -17443,6 +17477,7 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type)
 	case PTR_TO_TCP_SOCK:
 	case PTR_TO_XDP_SOCK:
 	case PTR_TO_BTF_ID:
+	case PTR_TO_ARENA:
 		return false;
 	default:
 		return true;
@@ -18296,6 +18331,31 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
 				fdput(f);
 				return -EBUSY;
 			}
+			if (map->map_type == BPF_MAP_TYPE_ARENA) {
+				if (env->prog->aux->arena) {
+					verbose(env, "Only one arena per program\n");
+					fdput(f);
+					return -EBUSY;
+				}
+				if (!env->allow_ptr_leaks || !env->bpf_capable) {
+					verbose(env, "CAP_BPF and CAP_PERFMON are required to use arena\n");
+					fdput(f);
+					return -EPERM;
+				}
+				if (!env->prog->jit_requested) {
+					verbose(env, "JIT is required to use arena\n");
+					return -EOPNOTSUPP;
+				}
+				if (!bpf_jit_supports_arena()) {
+					verbose(env, "JIT doesn't support arena\n");
+					return -EOPNOTSUPP;
+				}
+				env->prog->aux->arena = (void *)map;
+				if (!bpf_arena_get_user_vm_start(env->prog->aux->arena)) {
+					verbose(env, "arena's user address must be set via map_extra or mmap()\n");
+					return -EINVAL;
+				}
+			}
 
 			fdput(f);
 next_insn:
@@ -18917,6 +18977,14 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 				env->prog->aux->num_exentries++;
 			}
 			continue;
+		case PTR_TO_ARENA:
+			if (BPF_MODE(insn->code) == BPF_MEMSX) {
+				verbose(env, "sign extending loads from arena are not supported yet\n");
+				return -EOPNOTSUPP;
+			}
+			insn->code = BPF_CLASS(insn->code) | BPF_PROBE_MEM32 | BPF_SIZE(insn->code);
+			env->prog->aux->num_exentries++;
+			continue;
 		default:
 			continue;
 		}
@@ -19102,13 +19170,19 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 		func[i]->aux->nr_linfo = prog->aux->nr_linfo;
 		func[i]->aux->jited_linfo = prog->aux->jited_linfo;
 		func[i]->aux->linfo_idx = env->subprog_info[i].linfo_idx;
+		func[i]->aux->arena = prog->aux->arena;
 		num_exentries = 0;
 		insn = func[i]->insnsi;
 		for (j = 0; j < func[i]->len; j++, insn++) {
 			if (BPF_CLASS(insn->code) == BPF_LDX &&
 			    (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
+			     BPF_MODE(insn->code) == BPF_PROBE_MEM32 ||
 			     BPF_MODE(insn->code) == BPF_PROBE_MEMSX))
 				num_exentries++;
+			if ((BPF_CLASS(insn->code) == BPF_STX ||
+			     BPF_CLASS(insn->code) == BPF_ST) &&
+			     BPF_MODE(insn->code) == BPF_PROBE_MEM32)
+				num_exentries++;
 		}
 		func[i]->aux->num_exentries = num_exentries;
 		func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable;
@@ -19507,6 +19581,21 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 	}
 
 	for (i = 0; i < insn_cnt;) {
+		if (insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) && insn->imm) {
+			if ((insn->off == BPF_ADDR_SPACE_CAST && insn->imm == 1) ||
+			    (((struct bpf_map *)env->prog->aux->arena)->map_flags & BPF_F_NO_USER_CONV)) {
+				/* convert to 32-bit mov that clears upper 32-bit */
+				insn->code = BPF_ALU | BPF_MOV | BPF_X;
+				/* clear off, so it's a normal 'wX = wY' from JIT pov */
+				insn->off = 0;
+			} /* cast from as(0) to as(1) should be handled by JIT */
+			goto next_insn;
+		}
+
+		if (env->insn_aux_data[i + delta].needs_zext)
+			/* Convert BPF_CLASS(insn->code) == BPF_ALU64 to 32-bit ALU */
+			insn->code = BPF_ALU | BPF_OP(insn->code) | BPF_SRC(insn->code);
+
 		/* Make divide-by-zero exceptions impossible. */
 		if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) ||
 		    insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) ||
-- 
cgit v1.2.3


From 2edc3de6fb650924a87fffebebc3b7572cbf6e38 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Thu, 7 Mar 2024 17:08:04 -0800
Subject: bpf: Recognize btf_decl_tag("arg: Arena") as PTR_TO_ARENA.

In global bpf functions recognize btf_decl_tag("arg:arena") as PTR_TO_ARENA.

Note, when the verifier sees:

__weak void foo(struct bar *p)

it recognizes 'p' as PTR_TO_MEM and 'struct bar' has to be a struct with scalars.
Hence the only way to use arena pointers in global functions is to tag them with "arg:arena".

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/bpf/20240308010812.89848-7-alexei.starovoitov@gmail.com
---
 include/linux/bpf.h   |  1 +
 kernel/bpf/btf.c      | 19 +++++++++++++++----
 kernel/bpf/verifier.c | 15 +++++++++++++++
 3 files changed, 31 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index d0c836ba009d..08ad265cb195 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -712,6 +712,7 @@ enum bpf_arg_type {
 	 * on eBPF program stack
 	 */
 	ARG_PTR_TO_MEM,		/* pointer to valid memory (stack, packet, map value) */
+	ARG_PTR_TO_ARENA,
 
 	ARG_CONST_SIZE,		/* number of bytes accessed from memory */
 	ARG_CONST_SIZE_OR_ZERO,	/* number of bytes accessed from memory or 0 */
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 170d017e8e4a..90c4a32d89ff 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -7111,10 +7111,11 @@ cand_cache_unlock:
 }
 
 enum btf_arg_tag {
-	ARG_TAG_CTX = 0x1,
-	ARG_TAG_NONNULL = 0x2,
-	ARG_TAG_TRUSTED = 0x4,
-	ARG_TAG_NULLABLE = 0x8,
+	ARG_TAG_CTX	 = BIT_ULL(0),
+	ARG_TAG_NONNULL  = BIT_ULL(1),
+	ARG_TAG_TRUSTED  = BIT_ULL(2),
+	ARG_TAG_NULLABLE = BIT_ULL(3),
+	ARG_TAG_ARENA	 = BIT_ULL(4),
 };
 
 /* Process BTF of a function to produce high-level expectation of function
@@ -7226,6 +7227,8 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
 				tags |= ARG_TAG_NONNULL;
 			} else if (strcmp(tag, "nullable") == 0) {
 				tags |= ARG_TAG_NULLABLE;
+			} else if (strcmp(tag, "arena") == 0) {
+				tags |= ARG_TAG_ARENA;
 			} else {
 				bpf_log(log, "arg#%d has unsupported set of tags\n", i);
 				return -EOPNOTSUPP;
@@ -7280,6 +7283,14 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
 			sub->args[i].btf_id = kern_type_id;
 			continue;
 		}
+		if (tags & ARG_TAG_ARENA) {
+			if (tags & ~ARG_TAG_ARENA) {
+				bpf_log(log, "arg#%d arena cannot be combined with any other tags\n", i);
+				return -EINVAL;
+			}
+			sub->args[i].arg_type = ARG_PTR_TO_ARENA;
+			continue;
+		}
 		if (is_global) { /* generic user data pointer */
 			u32 mem_size;
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 1358e20d315a..d64f7a9b60e8 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -9379,6 +9379,18 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
 				bpf_log(log, "arg#%d is expected to be non-NULL\n", i);
 				return -EINVAL;
 			}
+		} else if (base_type(arg->arg_type) == ARG_PTR_TO_ARENA) {
+			/*
+			 * Can pass any value and the kernel won't crash, but
+			 * only PTR_TO_ARENA or SCALAR make sense. Everything
+			 * else is a bug in the bpf program. Point it out to
+			 * the user at the verification time instead of
+			 * run-time debug nightmare.
+			 */
+			if (reg->type != PTR_TO_ARENA && reg->type != SCALAR_VALUE) {
+				bpf_log(log, "R%d is not a pointer to arena or scalar.\n", regno);
+				return -EINVAL;
+			}
 		} else if (arg->arg_type == (ARG_PTR_TO_DYNPTR | MEM_RDONLY)) {
 			ret = process_dynptr_func(env, regno, -1, arg->arg_type, 0);
 			if (ret)
@@ -20448,6 +20460,9 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
 				reg->btf = bpf_get_btf_vmlinux(); /* can't fail at this point */
 				reg->btf_id = arg->btf_id;
 				reg->id = ++env->id_gen;
+			} else if (base_type(arg->arg_type) == ARG_PTR_TO_ARENA) {
+				/* caller can pass either PTR_TO_ARENA or SCALAR */
+				mark_reg_unknown(env, regs, i);
 			} else {
 				WARN_ONCE(1, "BUG: unhandled arg#%d type %d\n",
 					  i - BPF_REG_1, arg->arg_type);
-- 
cgit v1.2.3


From eaf657f7adba8984509db7403ac6bdaa219e5722 Mon Sep 17 00:00:00 2001
From: William Tu <witu@nvidia.com>
Date: Sun, 10 Mar 2024 16:55:03 +0200
Subject: devlink: Add comments to use netlink gen tool

Add the comment to remind people not to manually modify
the net/devlink/netlink_gen.c, but to use tools/net/ynl/ynl-regen.sh
to generate it.

Signed-off-by: William Tu <witu@nvidia.com>
Suggested-by: Jiri Pirko <jiri@nvidia.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Link: https://lore.kernel.org/r/20240310145503.32721-1-witu@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/devlink.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 130cae0d3e20..2da0c7eb6710 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -614,7 +614,10 @@ enum devlink_attr {
 
 	DEVLINK_ATTR_REGION_DIRECT,		/* flag */
 
-	/* add new attributes above here, update the policy in devlink.c */
+	/* Add new attributes above here, update the spec in
+	 * Documentation/netlink/specs/devlink.yaml and re-generate
+	 * net/devlink/netlink_gen.c.
+	 */
 
 	__DEVLINK_ATTR_MAX,
 	DEVLINK_ATTR_MAX = __DEVLINK_ATTR_MAX - 1
-- 
cgit v1.2.3


From 66c8473135c62f478301a0e5b3012f203562dfa6 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 8 Mar 2024 16:47:39 -0800
Subject: bpf: move sleepable flag from bpf_prog_aux to bpf_prog

prog->aux->sleepable is checked very frequently as part of (some) BPF
program run hot paths. So this extra aux indirection seems wasteful and
on busy systems might cause unnecessary memory cache misses.

Let's move sleepable flag into prog itself to eliminate unnecessary
pointer dereference.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Message-ID: <20240309004739.2961431-1-andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h            |  8 ++++----
 kernel/bpf/bpf_iter.c          |  4 ++--
 kernel/bpf/core.c              |  2 +-
 kernel/bpf/syscall.c           |  6 +++---
 kernel/bpf/trampoline.c        |  4 ++--
 kernel/bpf/verifier.c          | 12 ++++++------
 kernel/events/core.c           |  2 +-
 kernel/trace/bpf_trace.c       |  2 +-
 net/bpf/bpf_dummy_struct_ops.c |  2 +-
 9 files changed, 21 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 08ad265cb195..4f20f62f9d63 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1455,7 +1455,6 @@ struct bpf_prog_aux {
 	bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */
 	bool attach_tracing_prog; /* true if tracing another tracing program */
 	bool func_proto_unreliable;
-	bool sleepable;
 	bool tail_call_reachable;
 	bool xdp_has_frags;
 	bool exception_cb;
@@ -1541,7 +1540,8 @@ struct bpf_prog {
 				enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */
 				call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */
 				call_get_func_ip:1, /* Do we call get_func_ip() */
-				tstamp_type_access:1; /* Accessed __sk_buff->tstamp_type */
+				tstamp_type_access:1, /* Accessed __sk_buff->tstamp_type */
+				sleepable:1;	/* BPF program is sleepable */
 	enum bpf_prog_type	type;		/* Type of BPF program */
 	enum bpf_attach_type	expected_attach_type; /* For some prog types */
 	u32			len;		/* Number of filter blocks */
@@ -2112,14 +2112,14 @@ bpf_prog_run_array_uprobe(const struct bpf_prog_array __rcu *array_rcu,
 	old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
 	item = &array->items[0];
 	while ((prog = READ_ONCE(item->prog))) {
-		if (!prog->aux->sleepable)
+		if (!prog->sleepable)
 			rcu_read_lock();
 
 		run_ctx.bpf_cookie = item->bpf_cookie;
 		ret &= run_prog(prog, ctx);
 		item++;
 
-		if (!prog->aux->sleepable)
+		if (!prog->sleepable)
 			rcu_read_unlock();
 	}
 	bpf_reset_run_ctx(old_run_ctx);
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 0fae79164187..112581cf97e7 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -548,7 +548,7 @@ int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr,
 		return -ENOENT;
 
 	/* Only allow sleepable program for resched-able iterator */
-	if (prog->aux->sleepable && !bpf_iter_target_support_resched(tinfo))
+	if (prog->sleepable && !bpf_iter_target_support_resched(tinfo))
 		return -EINVAL;
 
 	link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN);
@@ -697,7 +697,7 @@ int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx)
 	struct bpf_run_ctx run_ctx, *old_run_ctx;
 	int ret;
 
-	if (prog->aux->sleepable) {
+	if (prog->sleepable) {
 		rcu_read_lock_trace();
 		migrate_disable();
 		might_fault();
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 297184baeb2d..696bc55de8e8 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2706,7 +2706,7 @@ void __bpf_free_used_maps(struct bpf_prog_aux *aux,
 	bool sleepable;
 	u32 i;
 
-	sleepable = aux->sleepable;
+	sleepable = aux->prog->sleepable;
 	for (i = 0; i < len; i++) {
 		map = used_maps[i];
 		if (map->ops->map_poke_untrack)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 07f2a4db4511..ae2ff73bde7e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2248,7 +2248,7 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred)
 		btf_put(prog->aux->attach_btf);
 
 	if (deferred) {
-		if (prog->aux->sleepable)
+		if (prog->sleepable)
 			call_rcu_tasks_trace(&prog->aux->rcu, __bpf_prog_put_rcu);
 		else
 			call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
@@ -2813,11 +2813,11 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 	}
 
 	prog->expected_attach_type = attr->expected_attach_type;
+	prog->sleepable = !!(attr->prog_flags & BPF_F_SLEEPABLE);
 	prog->aux->attach_btf = attach_btf;
 	prog->aux->attach_btf_id = attr->attach_btf_id;
 	prog->aux->dst_prog = dst_prog;
 	prog->aux->dev_bound = !!attr->prog_ifindex;
-	prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE;
 	prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS;
 
 	/* move token into prog->aux, reuse taken refcnt */
@@ -5554,7 +5554,7 @@ static int bpf_prog_bind_map(union bpf_attr *attr)
 	/* The bpf program will not access the bpf map, but for the sake of
 	 * simplicity, increase sleepable_refcnt for sleepable program as well.
 	 */
-	if (prog->aux->sleepable)
+	if (prog->sleepable)
 		atomic64_inc(&map->sleepable_refcnt);
 	memcpy(used_maps_new, used_maps_old,
 	       sizeof(used_maps_old[0]) * prog->aux->used_map_cnt);
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index d382f5ebe06c..db7599c59c78 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -1014,7 +1014,7 @@ void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr)
 
 bpf_trampoline_enter_t bpf_trampoline_enter(const struct bpf_prog *prog)
 {
-	bool sleepable = prog->aux->sleepable;
+	bool sleepable = prog->sleepable;
 
 	if (bpf_prog_check_recur(prog))
 		return sleepable ? __bpf_prog_enter_sleepable_recur :
@@ -1029,7 +1029,7 @@ bpf_trampoline_enter_t bpf_trampoline_enter(const struct bpf_prog *prog)
 
 bpf_trampoline_exit_t bpf_trampoline_exit(const struct bpf_prog *prog)
 {
-	bool sleepable = prog->aux->sleepable;
+	bool sleepable = prog->sleepable;
 
 	if (bpf_prog_check_recur(prog))
 		return sleepable ? __bpf_prog_exit_sleepable_recur :
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d64f7a9b60e8..d501c90f8ab5 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5274,7 +5274,7 @@ bad_type:
 
 static bool in_sleepable(struct bpf_verifier_env *env)
 {
-	return env->prog->aux->sleepable;
+	return env->prog->sleepable;
 }
 
 /* The non-sleepable programs and sleepable programs with explicit bpf_rcu_read_lock()
@@ -18137,7 +18137,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
 		return -EINVAL;
 	}
 
-	if (prog->aux->sleepable)
+	if (prog->sleepable)
 		switch (map->map_type) {
 		case BPF_MAP_TYPE_HASH:
 		case BPF_MAP_TYPE_LRU_HASH:
@@ -18325,7 +18325,7 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
 				return -E2BIG;
 			}
 
-			if (env->prog->aux->sleepable)
+			if (env->prog->sleepable)
 				atomic64_inc(&map->sleepable_refcnt);
 			/* hold the map. If the program is rejected by verifier,
 			 * the map will be released by release_maps() or it
@@ -20938,7 +20938,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
 			}
 		}
 
-		if (prog->aux->sleepable) {
+		if (prog->sleepable) {
 			ret = -EINVAL;
 			switch (prog->type) {
 			case BPF_PROG_TYPE_TRACING:
@@ -21049,14 +21049,14 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
 	u64 key;
 
 	if (prog->type == BPF_PROG_TYPE_SYSCALL) {
-		if (prog->aux->sleepable)
+		if (prog->sleepable)
 			/* attach_btf_id checked to be zero already */
 			return 0;
 		verbose(env, "Syscall programs can only be sleepable\n");
 		return -EINVAL;
 	}
 
-	if (prog->aux->sleepable && !can_be_sleepable(prog)) {
+	if (prog->sleepable && !can_be_sleepable(prog)) {
 		verbose(env, "Only fentry/fexit/fmod_ret, lsm, iter, uprobe, and struct_ops programs can be sleepable\n");
 		return -EINVAL;
 	}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 5ecfa57e3b97..724e6d7e128f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -10553,7 +10553,7 @@ int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
 	    (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT))
 		return -EINVAL;
 
-	if (prog->type == BPF_PROG_TYPE_KPROBE && prog->aux->sleepable && !is_uprobe)
+	if (prog->type == BPF_PROG_TYPE_KPROBE && prog->sleepable && !is_uprobe)
 		/* only uprobe programs are allowed to be sleepable */
 		return -EINVAL;
 
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 241ddf5e3895..0a5c4efc73c3 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -3256,7 +3256,7 @@ static int uprobe_prog_run(struct bpf_uprobe *uprobe,
 		.uprobe = uprobe,
 	};
 	struct bpf_prog *prog = link->link.prog;
-	bool sleepable = prog->aux->sleepable;
+	bool sleepable = prog->sleepable;
 	struct bpf_run_ctx *old_run_ctx;
 	int err = 0;
 
diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c
index 1b5f812e6972..de33dc1b0daa 100644
--- a/net/bpf/bpf_dummy_struct_ops.c
+++ b/net/bpf/bpf_dummy_struct_ops.c
@@ -174,7 +174,7 @@ static int bpf_dummy_ops_check_member(const struct btf_type *t,
 	case offsetof(struct bpf_dummy_ops, test_sleepable):
 		break;
 	default:
-		if (prog->aux->sleepable)
+		if (prog->sleepable)
 			return -EINVAL;
 	}
 
-- 
cgit v1.2.3