From a94fd40a18ae76ba76dbaa8eca0e3c46aa1142c1 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 6 Jun 2023 13:23:06 +0300 Subject: xfrm: delete not-needed clear to zero of encap_oa After commit 2f4796518315 ("af_key: Fix heap information leak"), there is no need to clear encap_oa again as it is already initialized to zero. Signed-off-by: Leon Romanovsky Reviewed-by: Simon Horman Signed-off-by: Steffen Klassert --- net/key/af_key.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/key/af_key.c b/net/key/af_key.c index a815f5ab4c49..1cb4560afd44 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -1281,7 +1281,6 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct net *net, ext_hdrs[SADB_X_EXT_NAT_T_DPORT-1]; natt->encap_dport = n_port->sadb_x_nat_t_port_port; } - memset(&natt->encap_oa, 0, sizeof(natt->encap_oa)); } err = xfrm_init_state(x); -- cgit v1.2.3 From 2597a25cb865b17c1c9c2a0dcf4925e9d1a696bb Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 26 Jun 2023 14:25:22 -0700 Subject: selftests/bpf: Add test to exercise typedef walking Add new bpf_fentry_test_sinfo with skb_shared_info argument and try to access frags. Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20230626212522.2414485-2-sdf@google.com --- net/bpf/test_run.c | 4 ++++ tools/testing/selftests/bpf/prog_tests/verifier.c | 2 ++ .../testing/selftests/bpf/progs/verifier_typedef.c | 23 ++++++++++++++++++++++ 3 files changed, 29 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/verifier_typedef.c (limited to 'net') diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 2321bd2f9964..63b11f7a5392 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -555,6 +555,10 @@ __bpf_kfunc u32 bpf_fentry_test9(u32 *a) return *a; } +void noinline bpf_fentry_test_sinfo(struct skb_shared_info *sinfo) +{ +} + __bpf_kfunc int bpf_modify_return_test(int a, int *b) { *b += 1; diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index 070a13833c3f..c375e59ff28d 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -58,6 +58,7 @@ #include "verifier_stack_ptr.skel.h" #include "verifier_subprog_precision.skel.h" #include "verifier_subreg.skel.h" +#include "verifier_typedef.skel.h" #include "verifier_uninit.skel.h" #include "verifier_unpriv.skel.h" #include "verifier_unpriv_perf.skel.h" @@ -159,6 +160,7 @@ void test_verifier_spin_lock(void) { RUN(verifier_spin_lock); } void test_verifier_stack_ptr(void) { RUN(verifier_stack_ptr); } void test_verifier_subprog_precision(void) { RUN(verifier_subprog_precision); } void test_verifier_subreg(void) { RUN(verifier_subreg); } +void test_verifier_typedef(void) { RUN(verifier_typedef); } void test_verifier_uninit(void) { RUN(verifier_uninit); } void test_verifier_unpriv(void) { RUN(verifier_unpriv); } void test_verifier_unpriv_perf(void) { RUN(verifier_unpriv_perf); } diff --git a/tools/testing/selftests/bpf/progs/verifier_typedef.c b/tools/testing/selftests/bpf/progs/verifier_typedef.c new file mode 100644 index 000000000000..08481cfaac4b --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_typedef.c @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include "bpf_misc.h" + +SEC("fentry/bpf_fentry_test_sinfo") +__description("typedef: resolve") +__success __retval(0) +__naked void 
resolve_typedef(void) +{ + asm volatile (" \ + r1 = *(u64 *)(r1 +0); \ + r2 = *(u64 *)(r1 +%[frags_offs]); \ + r0 = 0; \ + exit; \ +" : + : __imm_const(frags_offs, + offsetof(struct skb_shared_info, frags)) + : __clobber_all); +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 274c4a6d529cff48b241b15627b46b0f65987ade Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 10 Jul 2023 13:08:29 +0300 Subject: net/core: Make use of assign_bit() API We have for some time the assign_bit() API to replace open coded if (foo) set_bit(n, bar); else clear_bit(n, bar); Use this API in the code. No functional change intended. Signed-off-by: Andy Shevchenko Reviewed-by: Alexander Lobakin Message-ID: <20230710100830.89936-1-andriy.shevchenko@linux.intel.com> Signed-off-by: Paolo Abeni --- net/core/dev.c | 8 ++------ net/core/sock.c | 15 +++------------ 2 files changed, 5 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 69a3e544676c..d6e1b786c5c5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6316,12 +6316,8 @@ int dev_set_threaded(struct net_device *dev, bool threaded) * softirq mode will happen in the next round of napi_schedule(). * This should not cause hiccups/stalls to the live traffic. */ - list_for_each_entry(napi, &dev->napi_list, dev_list) { - if (threaded) - set_bit(NAPI_STATE_THREADED, &napi->state); - else - clear_bit(NAPI_STATE_THREADED, &napi->state); - } + list_for_each_entry(napi, &dev->napi_list, dev_list) + assign_bit(NAPI_STATE_THREADED, &napi->state, threaded); return err; } diff --git a/net/core/sock.c b/net/core/sock.c index 9370fd50aa2c..ab1e8d1bd5a1 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1244,17 +1244,11 @@ set_sndbuf: break; case SO_PASSCRED: - if (valbool) - set_bit(SOCK_PASSCRED, &sock->flags); - else - clear_bit(SOCK_PASSCRED, &sock->flags); + assign_bit(SOCK_PASSCRED, &sock->flags, valbool); break; case SO_PASSPIDFD: - if (valbool) - set_bit(SOCK_PASSPIDFD, &sock->flags); - else - clear_bit(SOCK_PASSPIDFD, &sock->flags); + assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool); break; case SO_TIMESTAMP_OLD: @@ -1358,10 +1352,7 @@ set_sndbuf: break; case SO_PASSSEC: - if (valbool) - set_bit(SOCK_PASSSEC, &sock->flags); - else - clear_bit(SOCK_PASSSEC, &sock->flags); + assign_bit(SOCK_PASSSEC, &sock->flags, valbool); break; case SO_MARK: if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && -- cgit v1.2.3 From b8e39b38487e68c6503419db6e4a851a0ef56de7 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 10 Jul 2023 13:08:30 +0300 Subject: netlink: Make use of __assign_bit() API We have for some time the __assign_bit() API to replace open coded if (foo) __set_bit(n, bar); else __clear_bit(n, bar); Use this API in the code. No functional change intended. 
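For readers less familiar with the helper, a minimal sketch of the equivalence (illustrative only, not part of the patch; assumes <linux/bitops.h>):

/* Illustrative sketch -- __assign_bit() is the non-atomic variant of
 * assign_bit() and folds the open-coded pattern into a single call.
 */
static void example_update_group(unsigned long *groups, int group, bool new)
{
	/* before: open coded */
	if (new)
		__set_bit(group - 1, groups);
	else
		__clear_bit(group - 1, groups);

	/* after: same effect */
	__assign_bit(group - 1, groups, new);
}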
Signed-off-by: Andy Shevchenko Reviewed-by: Alexander Lobakin Message-ID: <20230710100830.89936-2-andriy.shevchenko@linux.intel.com> Signed-off-by: Paolo Abeni --- net/netlink/af_netlink.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 383631873748..9c9df143a2ec 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1629,10 +1629,7 @@ static void netlink_update_socket_mc(struct netlink_sock *nlk, old = test_bit(group - 1, nlk->groups); subscriptions = nlk->subscriptions - old + new; - if (new) - __set_bit(group - 1, nlk->groups); - else - __clear_bit(group - 1, nlk->groups); + __assign_bit(group - 1, nlk->groups, new); netlink_update_subscriptions(&nlk->sk, subscriptions); netlink_update_listeners(&nlk->sk); } -- cgit v1.2.3 From 9f4a7c930284bf2b5b84d3636a8e88857149328f Mon Sep 17 00:00:00 2001 From: Jian Wen Date: Tue, 11 Jul 2023 11:24:05 +0800 Subject: tcp: add a scheduling point in established_get_first() Kubernetes[1] is going to stick with /proc/net/tcp for a while. This commit reduces the scheduling latency introduced by established_get_first(), similar to commit acffb584cda7 ("net: diag: add a scheduling point in inet_diag_dump_icsk()"). In our environment, the scheduling latency affects the performance of latency-sensitive services like Redis. Changes in V2 : - call cond_resched() before checking if a bucket is empty as suggested by Eric Dumazet - removed the delay of synchronize_net() from the commit message [1] https://github.com/google/cadvisor/blob/v0.47.2/container/libcontainer/handler.go#L130 Signed-off-by: Jian Wen Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230711032405.3253025-1-wenjian1@xiaomi.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_ipv4.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index fd365de4d5ff..cecd5a135e64 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -57,6 +57,7 @@ #include #include #include +#include #include #include @@ -2446,6 +2447,8 @@ static void *established_get_first(struct seq_file *seq) struct hlist_nulls_node *node; spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket); + cond_resched(); + /* Lockless fast path for the common case of empty buckets */ if (empty_bucket(hinfo, st)) continue; -- cgit v1.2.3 From c5ec13e38af5527f69f647c46ce15ebaa6b35e6c Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 10 Jul 2023 14:35:11 -0700 Subject: ipv6: rpl: Remove redundant skb_dst_drop(). RPL code has a pattern where skb_dst_drop() is called before ip6_route_input(). However, ip6_route_input() calls skb_dst_drop() internally, so we need not call skb_dst_drop() before ip6_route_input(). 
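For context, a simplified paraphrase of why the call is redundant (illustrative only, details elided; see net/ipv6/route.c for the real code):

/* ip6_route_input() already releases any dst attached to the skb before
 * installing the looked-up route, roughly:
 */
void ip6_route_input(struct sk_buff *skb)
{
	struct flowi6 fl6;
	int flags = 0;		/* the real code sets RT6_LOOKUP_F_* flags */

	/* ... fill fl6 from the IPv6 header ... */

	skb_dst_drop(skb);	/* any old dst is dropped here */
	skb_dst_set_noref(skb, ip6_route_input_lookup(dev_net(skb->dev),
						      skb->dev, &fl6,
						      skb, flags));
}

so callers such as rpl_input() do not need their own skb_dst_drop() first.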
Signed-off-by: Kuniyuki Iwashima Reviewed-by: Simon Horman Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20230710213511.5364-1-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/ipv6/exthdrs.c | 2 -- net/ipv6/rpl_iptunnel.c | 3 +-- 2 files changed, 1 insertion(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 202fc3aaa83c..f4bfccae003c 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -612,8 +612,6 @@ looped_back: kfree(buf); - skb_dst_drop(skb); - ip6_route_input(skb); if (skb_dst(skb)->error) { diff --git a/net/ipv6/rpl_iptunnel.c b/net/ipv6/rpl_iptunnel.c index b1c028df686e..a013b92cbb86 100644 --- a/net/ipv6/rpl_iptunnel.c +++ b/net/ipv6/rpl_iptunnel.c @@ -272,8 +272,6 @@ static int rpl_input(struct sk_buff *skb) dst = dst_cache_get(&rlwt->cache); preempt_enable(); - skb_dst_drop(skb); - if (!dst) { ip6_route_input(skb); dst = skb_dst(skb); @@ -284,6 +282,7 @@ static int rpl_input(struct sk_buff *skb) preempt_enable(); } } else { + skb_dst_drop(skb); skb_dst_set(skb, dst); } -- cgit v1.2.3 From 5e9cf77d81f9ba51019157dc55a3b73dc89a7cf4 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 13 Jul 2023 12:07:38 +0800 Subject: selftests/bpf: add testcase for TRACING with 6+ arguments Add fentry_many_args.c and fexit_many_args.c to test the fentry/fexit with 7/11 arguments. As this feature is not supported by arm64 yet, we disable these testcases for arm64 in DENYLIST.aarch64. We can combine them with fentry_test.c/fexit_test.c when arm64 is supported too. Correspondingly, add bpf_testmod_fentry_test7() and bpf_testmod_fentry_test11() to bpf_testmod.c Meanwhile, add bpf_modify_return_test2() to test_run.c to test the MODIFY_RETURN with 7 arguments. Add bpf_testmod_test_struct_arg_7/bpf_testmod_test_struct_arg_7 in bpf_testmod.c to test the struct in the arguments. 
And the testcases passed on x86_64: ./test_progs -t fexit Summary: 5/14 PASSED, 0 SKIPPED, 0 FAILED ./test_progs -t fentry Summary: 3/2 PASSED, 0 SKIPPED, 0 FAILED ./test_progs -t modify_return Summary: 1/0 PASSED, 0 SKIPPED, 0 FAILED ./test_progs -t tracing_struct Summary: 1/0 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Menglong Dong Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20230713040738.1789742-4-imagedong@tencent.com Signed-off-by: Alexei Starovoitov --- net/bpf/test_run.c | 14 +++++- tools/testing/selftests/bpf/DENYLIST.aarch64 | 2 + .../selftests/bpf/bpf_testmod/bpf_testmod.c | 49 +++++++++++++++++++- .../testing/selftests/bpf/prog_tests/fentry_test.c | 43 +++++++++++++++-- .../testing/selftests/bpf/prog_tests/fexit_test.c | 43 +++++++++++++++-- .../selftests/bpf/prog_tests/get_func_args_test.c | 4 +- .../selftests/bpf/prog_tests/modify_return.c | 10 ++-- .../selftests/bpf/prog_tests/tracing_struct.c | 19 ++++++++ .../selftests/bpf/prog_tests/trampoline_count.c | 4 +- .../testing/selftests/bpf/progs/fentry_many_args.c | 39 ++++++++++++++++ .../testing/selftests/bpf/progs/fexit_many_args.c | 40 ++++++++++++++++ tools/testing/selftests/bpf/progs/modify_return.c | 40 ++++++++++++++++ tools/testing/selftests/bpf/progs/tracing_struct.c | 54 ++++++++++++++++++++++ 13 files changed, 345 insertions(+), 16 deletions(-) create mode 100644 tools/testing/selftests/bpf/progs/fentry_many_args.c create mode 100644 tools/testing/selftests/bpf/progs/fexit_many_args.c (limited to 'net') diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 63b11f7a5392..7d47f53f20c1 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -565,6 +565,13 @@ __bpf_kfunc int bpf_modify_return_test(int a, int *b) return a + *b; } +__bpf_kfunc int bpf_modify_return_test2(int a, int *b, short c, int d, + void *e, char f, int g) +{ + *b += 1; + return a + *b + c + d + (long)e + f + g; +} + int noinline bpf_fentry_shadow_test(int a) { return a + 1; @@ -600,6 +607,7 @@ __diag_pop(); BTF_SET8_START(bpf_test_modify_return_ids) BTF_ID_FLAGS(func, bpf_modify_return_test) +BTF_ID_FLAGS(func, bpf_modify_return_test2) BTF_ID_FLAGS(func, bpf_fentry_test1, KF_SLEEPABLE) BTF_SET8_END(bpf_test_modify_return_ids) @@ -667,7 +675,11 @@ int bpf_prog_test_run_tracing(struct bpf_prog *prog, case BPF_MODIFY_RETURN: ret = bpf_modify_return_test(1, &b); if (b != 2) - side_effect = 1; + side_effect++; + b = 2; + ret += bpf_modify_return_test2(1, &b, 3, 4, (void *)5, 6, 7); + if (b != 2) + side_effect++; break; default: goto out; diff --git a/tools/testing/selftests/bpf/DENYLIST.aarch64 b/tools/testing/selftests/bpf/DENYLIST.aarch64 index 08adc805878b..3b61e8b35d62 100644 --- a/tools/testing/selftests/bpf/DENYLIST.aarch64 +++ b/tools/testing/selftests/bpf/DENYLIST.aarch64 @@ -10,3 +10,5 @@ kprobe_multi_test/link_api_addrs # link_fd unexpected link_fd: a kprobe_multi_test/link_api_syms # link_fd unexpected link_fd: actual -95 < expected 0 kprobe_multi_test/skel_api # libbpf: failed to load BPF skeleton 'kprobe_multi': -3 module_attach # prog 'kprobe_multi': failed to auto-attach: -95 +fentry_test/fentry_many_args # fentry_many_args:FAIL:fentry_many_args_attach unexpected error: -524 +fexit_test/fexit_many_args # fexit_many_args:FAIL:fexit_many_args_attach unexpected error: -524 diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c index aaf6ef1201c7..a6f991b56345 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c +++ 
b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c @@ -34,6 +34,11 @@ struct bpf_testmod_struct_arg_3 { int b[]; }; +struct bpf_testmod_struct_arg_4 { + u64 a; + int b; +}; + __diag_push(); __diag_ignore_all("-Wmissing-prototypes", "Global functions as their definitions will be in bpf_testmod.ko BTF"); @@ -75,6 +80,24 @@ bpf_testmod_test_struct_arg_6(struct bpf_testmod_struct_arg_3 *a) { return bpf_testmod_test_struct_arg_result; } +noinline int +bpf_testmod_test_struct_arg_7(u64 a, void *b, short c, int d, void *e, + struct bpf_testmod_struct_arg_4 f) +{ + bpf_testmod_test_struct_arg_result = a + (long)b + c + d + + (long)e + f.a + f.b; + return bpf_testmod_test_struct_arg_result; +} + +noinline int +bpf_testmod_test_struct_arg_8(u64 a, void *b, short c, int d, void *e, + struct bpf_testmod_struct_arg_4 f, int g) +{ + bpf_testmod_test_struct_arg_result = a + (long)b + c + d + + (long)e + f.a + f.b + g; + return bpf_testmod_test_struct_arg_result; +} + __bpf_kfunc void bpf_testmod_test_mod_kfunc(int i) { @@ -191,6 +214,20 @@ noinline int bpf_testmod_fentry_test3(char a, int b, u64 c) return a + b + c; } +noinline int bpf_testmod_fentry_test7(u64 a, void *b, short c, int d, + void *e, char f, int g) +{ + return a + (long)b + c + d + (long)e + f + g; +} + +noinline int bpf_testmod_fentry_test11(u64 a, void *b, short c, int d, + void *e, char f, int g, + unsigned int h, long i, __u64 j, + unsigned long k) +{ + return a + (long)b + c + d + (long)e + f + g + h + i + j + k; +} + int bpf_testmod_fentry_ok; noinline ssize_t @@ -206,6 +243,7 @@ bpf_testmod_test_read(struct file *file, struct kobject *kobj, struct bpf_testmod_struct_arg_1 struct_arg1 = {10}; struct bpf_testmod_struct_arg_2 struct_arg2 = {2, 3}; struct bpf_testmod_struct_arg_3 *struct_arg3; + struct bpf_testmod_struct_arg_4 struct_arg4 = {21, 22}; int i = 1; while (bpf_testmod_return_ptr(i)) @@ -216,6 +254,11 @@ bpf_testmod_test_read(struct file *file, struct kobject *kobj, (void)bpf_testmod_test_struct_arg_3(1, 4, struct_arg2); (void)bpf_testmod_test_struct_arg_4(struct_arg1, 1, 2, 3, struct_arg2); (void)bpf_testmod_test_struct_arg_5(); + (void)bpf_testmod_test_struct_arg_7(16, (void *)17, 18, 19, + (void *)20, struct_arg4); + (void)bpf_testmod_test_struct_arg_8(16, (void *)17, 18, 19, + (void *)20, struct_arg4, 23); + struct_arg3 = kmalloc((sizeof(struct bpf_testmod_struct_arg_3) + sizeof(int)), GFP_KERNEL); @@ -243,7 +286,11 @@ bpf_testmod_test_read(struct file *file, struct kobject *kobj, if (bpf_testmod_fentry_test1(1) != 2 || bpf_testmod_fentry_test2(2, 3) != 5 || - bpf_testmod_fentry_test3(4, 5, 6) != 15) + bpf_testmod_fentry_test3(4, 5, 6) != 15 || + bpf_testmod_fentry_test7(16, (void *)17, 18, 19, (void *)20, + 21, 22) != 133 || + bpf_testmod_fentry_test11(16, (void *)17, 18, 19, (void *)20, + 21, 22, 23, 24, 25, 26) != 231) goto out; bpf_testmod_fentry_ok = 1; diff --git a/tools/testing/selftests/bpf/prog_tests/fentry_test.c b/tools/testing/selftests/bpf/prog_tests/fentry_test.c index c0d1d61d5f66..aee1bc77a17f 100644 --- a/tools/testing/selftests/bpf/prog_tests/fentry_test.c +++ b/tools/testing/selftests/bpf/prog_tests/fentry_test.c @@ -2,8 +2,9 @@ /* Copyright (c) 2019 Facebook */ #include #include "fentry_test.lskel.h" +#include "fentry_many_args.skel.h" -static int fentry_test(struct fentry_test_lskel *fentry_skel) +static int fentry_test_common(struct fentry_test_lskel *fentry_skel) { int err, prog_fd, i; int link_fd; @@ -37,7 +38,7 @@ static int fentry_test(struct fentry_test_lskel *fentry_skel) return 0; } -void 
test_fentry_test(void) +static void fentry_test(void) { struct fentry_test_lskel *fentry_skel = NULL; int err; @@ -46,13 +47,47 @@ void test_fentry_test(void) if (!ASSERT_OK_PTR(fentry_skel, "fentry_skel_load")) goto cleanup; - err = fentry_test(fentry_skel); + err = fentry_test_common(fentry_skel); if (!ASSERT_OK(err, "fentry_first_attach")) goto cleanup; - err = fentry_test(fentry_skel); + err = fentry_test_common(fentry_skel); ASSERT_OK(err, "fentry_second_attach"); cleanup: fentry_test_lskel__destroy(fentry_skel); } + +static void fentry_many_args(void) +{ + struct fentry_many_args *fentry_skel = NULL; + int err; + + fentry_skel = fentry_many_args__open_and_load(); + if (!ASSERT_OK_PTR(fentry_skel, "fentry_many_args_skel_load")) + goto cleanup; + + err = fentry_many_args__attach(fentry_skel); + if (!ASSERT_OK(err, "fentry_many_args_attach")) + goto cleanup; + + ASSERT_OK(trigger_module_test_read(1), "trigger_read"); + + ASSERT_EQ(fentry_skel->bss->test1_result, 1, + "fentry_many_args_result1"); + ASSERT_EQ(fentry_skel->bss->test2_result, 1, + "fentry_many_args_result2"); + ASSERT_EQ(fentry_skel->bss->test3_result, 1, + "fentry_many_args_result3"); + +cleanup: + fentry_many_args__destroy(fentry_skel); +} + +void test_fentry_test(void) +{ + if (test__start_subtest("fentry")) + fentry_test(); + if (test__start_subtest("fentry_many_args")) + fentry_many_args(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/fexit_test.c b/tools/testing/selftests/bpf/prog_tests/fexit_test.c index 101b7343036b..1c13007e37dd 100644 --- a/tools/testing/selftests/bpf/prog_tests/fexit_test.c +++ b/tools/testing/selftests/bpf/prog_tests/fexit_test.c @@ -2,8 +2,9 @@ /* Copyright (c) 2019 Facebook */ #include #include "fexit_test.lskel.h" +#include "fexit_many_args.skel.h" -static int fexit_test(struct fexit_test_lskel *fexit_skel) +static int fexit_test_common(struct fexit_test_lskel *fexit_skel) { int err, prog_fd, i; int link_fd; @@ -37,7 +38,7 @@ static int fexit_test(struct fexit_test_lskel *fexit_skel) return 0; } -void test_fexit_test(void) +static void fexit_test(void) { struct fexit_test_lskel *fexit_skel = NULL; int err; @@ -46,13 +47,47 @@ void test_fexit_test(void) if (!ASSERT_OK_PTR(fexit_skel, "fexit_skel_load")) goto cleanup; - err = fexit_test(fexit_skel); + err = fexit_test_common(fexit_skel); if (!ASSERT_OK(err, "fexit_first_attach")) goto cleanup; - err = fexit_test(fexit_skel); + err = fexit_test_common(fexit_skel); ASSERT_OK(err, "fexit_second_attach"); cleanup: fexit_test_lskel__destroy(fexit_skel); } + +static void fexit_many_args(void) +{ + struct fexit_many_args *fexit_skel = NULL; + int err; + + fexit_skel = fexit_many_args__open_and_load(); + if (!ASSERT_OK_PTR(fexit_skel, "fexit_many_args_skel_load")) + goto cleanup; + + err = fexit_many_args__attach(fexit_skel); + if (!ASSERT_OK(err, "fexit_many_args_attach")) + goto cleanup; + + ASSERT_OK(trigger_module_test_read(1), "trigger_read"); + + ASSERT_EQ(fexit_skel->bss->test1_result, 1, + "fexit_many_args_result1"); + ASSERT_EQ(fexit_skel->bss->test2_result, 1, + "fexit_many_args_result2"); + ASSERT_EQ(fexit_skel->bss->test3_result, 1, + "fexit_many_args_result3"); + +cleanup: + fexit_many_args__destroy(fexit_skel); +} + +void test_fexit_test(void) +{ + if (test__start_subtest("fexit")) + fexit_test(); + if (test__start_subtest("fexit_many_args")) + fexit_many_args(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/get_func_args_test.c b/tools/testing/selftests/bpf/prog_tests/get_func_args_test.c index 
28cf63963cb7..64a9c95d4acf 100644 --- a/tools/testing/selftests/bpf/prog_tests/get_func_args_test.c +++ b/tools/testing/selftests/bpf/prog_tests/get_func_args_test.c @@ -30,7 +30,9 @@ void test_get_func_args_test(void) prog_fd = bpf_program__fd(skel->progs.fmod_ret_test); err = bpf_prog_test_run_opts(prog_fd, &topts); ASSERT_OK(err, "test_run"); - ASSERT_EQ(topts.retval, 1234, "test_run"); + + ASSERT_EQ(topts.retval >> 16, 1, "test_run"); + ASSERT_EQ(topts.retval & 0xffff, 1234 + 29, "test_run"); ASSERT_EQ(skel->bss->test1_result, 1, "test1_result"); ASSERT_EQ(skel->bss->test2_result, 1, "test2_result"); diff --git a/tools/testing/selftests/bpf/prog_tests/modify_return.c b/tools/testing/selftests/bpf/prog_tests/modify_return.c index 5d9955af6247..a70c99c2f8c8 100644 --- a/tools/testing/selftests/bpf/prog_tests/modify_return.c +++ b/tools/testing/selftests/bpf/prog_tests/modify_return.c @@ -41,6 +41,10 @@ static void run_test(__u32 input_retval, __u16 want_side_effect, __s16 want_ret) ASSERT_EQ(skel->bss->fexit_result, 1, "modify_return fexit_result"); ASSERT_EQ(skel->bss->fmod_ret_result, 1, "modify_return fmod_ret_result"); + ASSERT_EQ(skel->bss->fentry_result2, 1, "modify_return fentry_result2"); + ASSERT_EQ(skel->bss->fexit_result2, 1, "modify_return fexit_result2"); + ASSERT_EQ(skel->bss->fmod_ret_result2, 1, "modify_return fmod_ret_result2"); + cleanup: modify_return__destroy(skel); } @@ -49,9 +53,9 @@ cleanup: void serial_test_modify_return(void) { run_test(0 /* input_retval */, - 1 /* want_side_effect */, - 4 /* want_ret */); + 2 /* want_side_effect */, + 33 /* want_ret */); run_test(-EINVAL /* input_retval */, 0 /* want_side_effect */, - -EINVAL /* want_ret */); + -EINVAL * 2 /* want_ret */); } diff --git a/tools/testing/selftests/bpf/prog_tests/tracing_struct.c b/tools/testing/selftests/bpf/prog_tests/tracing_struct.c index 1c75a32186d6..fe0fb0c9849a 100644 --- a/tools/testing/selftests/bpf/prog_tests/tracing_struct.c +++ b/tools/testing/selftests/bpf/prog_tests/tracing_struct.c @@ -55,6 +55,25 @@ static void test_fentry(void) ASSERT_EQ(skel->bss->t6, 1, "t6 ret"); + ASSERT_EQ(skel->bss->t7_a, 16, "t7:a"); + ASSERT_EQ(skel->bss->t7_b, 17, "t7:b"); + ASSERT_EQ(skel->bss->t7_c, 18, "t7:c"); + ASSERT_EQ(skel->bss->t7_d, 19, "t7:d"); + ASSERT_EQ(skel->bss->t7_e, 20, "t7:e"); + ASSERT_EQ(skel->bss->t7_f_a, 21, "t7:f.a"); + ASSERT_EQ(skel->bss->t7_f_b, 22, "t7:f.b"); + ASSERT_EQ(skel->bss->t7_ret, 133, "t7 ret"); + + ASSERT_EQ(skel->bss->t8_a, 16, "t8:a"); + ASSERT_EQ(skel->bss->t8_b, 17, "t8:b"); + ASSERT_EQ(skel->bss->t8_c, 18, "t8:c"); + ASSERT_EQ(skel->bss->t8_d, 19, "t8:d"); + ASSERT_EQ(skel->bss->t8_e, 20, "t8:e"); + ASSERT_EQ(skel->bss->t8_f_a, 21, "t8:f.a"); + ASSERT_EQ(skel->bss->t8_f_b, 22, "t8:f.b"); + ASSERT_EQ(skel->bss->t8_g, 23, "t8:g"); + ASSERT_EQ(skel->bss->t8_ret, 156, "t8 ret"); + tracing_struct__detach(skel); destroy_skel: tracing_struct__destroy(skel); diff --git a/tools/testing/selftests/bpf/prog_tests/trampoline_count.c b/tools/testing/selftests/bpf/prog_tests/trampoline_count.c index e91d0d1769f1..6cd7349d4a2b 100644 --- a/tools/testing/selftests/bpf/prog_tests/trampoline_count.c +++ b/tools/testing/selftests/bpf/prog_tests/trampoline_count.c @@ -88,8 +88,8 @@ void serial_test_trampoline_count(void) if (!ASSERT_OK(err, "bpf_prog_test_run_opts")) goto cleanup; - ASSERT_EQ(opts.retval & 0xffff, 4, "bpf_modify_return_test.result"); - ASSERT_EQ(opts.retval >> 16, 1, "bpf_modify_return_test.side_effect"); + ASSERT_EQ(opts.retval & 0xffff, 33, 
"bpf_modify_return_test.result"); + ASSERT_EQ(opts.retval >> 16, 2, "bpf_modify_return_test.side_effect"); cleanup: for (; i >= 0; i--) { diff --git a/tools/testing/selftests/bpf/progs/fentry_many_args.c b/tools/testing/selftests/bpf/progs/fentry_many_args.c new file mode 100644 index 000000000000..b61bb92fee2c --- /dev/null +++ b/tools/testing/selftests/bpf/progs/fentry_many_args.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2023 Tencent */ +#include +#include +#include + +char _license[] SEC("license") = "GPL"; + +__u64 test1_result = 0; +SEC("fentry/bpf_testmod_fentry_test7") +int BPF_PROG(test1, __u64 a, void *b, short c, int d, void *e, char f, + int g) +{ + test1_result = a == 16 && b == (void *)17 && c == 18 && d == 19 && + e == (void *)20 && f == 21 && g == 22; + return 0; +} + +__u64 test2_result = 0; +SEC("fentry/bpf_testmod_fentry_test11") +int BPF_PROG(test2, __u64 a, void *b, short c, int d, void *e, char f, + int g, unsigned int h, long i, __u64 j, unsigned long k) +{ + test2_result = a == 16 && b == (void *)17 && c == 18 && d == 19 && + e == (void *)20 && f == 21 && g == 22 && h == 23 && + i == 24 && j == 25 && k == 26; + return 0; +} + +__u64 test3_result = 0; +SEC("fentry/bpf_testmod_fentry_test11") +int BPF_PROG(test3, __u64 a, __u64 b, __u64 c, __u64 d, __u64 e, __u64 f, + __u64 g, __u64 h, __u64 i, __u64 j, __u64 k) +{ + test3_result = a == 16 && b == 17 && c == 18 && d == 19 && + e == 20 && f == 21 && g == 22 && h == 23 && + i == 24 && j == 25 && k == 26; + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/fexit_many_args.c b/tools/testing/selftests/bpf/progs/fexit_many_args.c new file mode 100644 index 000000000000..53b335c2dafb --- /dev/null +++ b/tools/testing/selftests/bpf/progs/fexit_many_args.c @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2023 Tencent */ +#include +#include +#include + +char _license[] SEC("license") = "GPL"; + +__u64 test1_result = 0; +SEC("fexit/bpf_testmod_fentry_test7") +int BPF_PROG(test1, __u64 a, void *b, short c, int d, void *e, char f, + int g, int ret) +{ + test1_result = a == 16 && b == (void *)17 && c == 18 && d == 19 && + e == (void *)20 && f == 21 && g == 22 && ret == 133; + return 0; +} + +__u64 test2_result = 0; +SEC("fexit/bpf_testmod_fentry_test11") +int BPF_PROG(test2, __u64 a, void *b, short c, int d, void *e, char f, + int g, unsigned int h, long i, __u64 j, unsigned long k, + int ret) +{ + test2_result = a == 16 && b == (void *)17 && c == 18 && d == 19 && + e == (void *)20 && f == 21 && g == 22 && h == 23 && + i == 24 && j == 25 && k == 26 && ret == 231; + return 0; +} + +__u64 test3_result = 0; +SEC("fexit/bpf_testmod_fentry_test11") +int BPF_PROG(test3, __u64 a, __u64 b, __u64 c, __u64 d, __u64 e, __u64 f, + __u64 g, __u64 h, __u64 i, __u64 j, __u64 k, __u64 ret) +{ + test3_result = a == 16 && b == 17 && c == 18 && d == 19 && + e == 20 && f == 21 && g == 22 && h == 23 && + i == 24 && j == 25 && k == 26 && ret == 231; + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/modify_return.c b/tools/testing/selftests/bpf/progs/modify_return.c index 8b7466a15c6b..3376d4849f58 100644 --- a/tools/testing/selftests/bpf/progs/modify_return.c +++ b/tools/testing/selftests/bpf/progs/modify_return.c @@ -47,3 +47,43 @@ int BPF_PROG(fexit_test, int a, __u64 b, int ret) return 0; } + +static int sequence2; + +__u64 fentry_result2 = 0; +SEC("fentry/bpf_modify_return_test2") +int BPF_PROG(fentry_test2, int a, int *b, short c, int d, void *e, char f, + int g) +{ + 
sequence2++; + fentry_result2 = (sequence2 == 1); + return 0; +} + +__u64 fmod_ret_result2 = 0; +SEC("fmod_ret/bpf_modify_return_test2") +int BPF_PROG(fmod_ret_test2, int a, int *b, short c, int d, void *e, char f, + int g, int ret) +{ + sequence2++; + /* This is the first fmod_ret program, the ret passed should be 0 */ + fmod_ret_result2 = (sequence2 == 2 && ret == 0); + return input_retval; +} + +__u64 fexit_result2 = 0; +SEC("fexit/bpf_modify_return_test2") +int BPF_PROG(fexit_test2, int a, int *b, short c, int d, void *e, char f, + int g, int ret) +{ + sequence2++; + /* If the input_reval is non-zero a successful modification should have + * occurred. + */ + if (input_retval) + fexit_result2 = (sequence2 == 3 && ret == input_retval); + else + fexit_result2 = (sequence2 == 3 && ret == 29); + + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/tracing_struct.c b/tools/testing/selftests/bpf/progs/tracing_struct.c index c435a3a8328a..515daef3c84b 100644 --- a/tools/testing/selftests/bpf/progs/tracing_struct.c +++ b/tools/testing/selftests/bpf/progs/tracing_struct.c @@ -18,6 +18,11 @@ struct bpf_testmod_struct_arg_3 { int b[]; }; +struct bpf_testmod_struct_arg_4 { + u64 a; + int b; +}; + long t1_a_a, t1_a_b, t1_b, t1_c, t1_ret, t1_nregs; __u64 t1_reg0, t1_reg1, t1_reg2, t1_reg3; long t2_a, t2_b_a, t2_b_b, t2_c, t2_ret; @@ -25,6 +30,9 @@ long t3_a, t3_b, t3_c_a, t3_c_b, t3_ret; long t4_a_a, t4_b, t4_c, t4_d, t4_e_a, t4_e_b, t4_ret; long t5_ret; int t6; +long t7_a, t7_b, t7_c, t7_d, t7_e, t7_f_a, t7_f_b, t7_ret; +long t8_a, t8_b, t8_c, t8_d, t8_e, t8_f_a, t8_f_b, t8_g, t8_ret; + SEC("fentry/bpf_testmod_test_struct_arg_1") int BPF_PROG2(test_struct_arg_1, struct bpf_testmod_struct_arg_2, a, int, b, int, c) @@ -130,4 +138,50 @@ int BPF_PROG2(test_struct_arg_11, struct bpf_testmod_struct_arg_3 *, a) return 0; } +SEC("fentry/bpf_testmod_test_struct_arg_7") +int BPF_PROG2(test_struct_arg_12, __u64, a, void *, b, short, c, int, d, + void *, e, struct bpf_testmod_struct_arg_4, f) +{ + t7_a = a; + t7_b = (long)b; + t7_c = c; + t7_d = d; + t7_e = (long)e; + t7_f_a = f.a; + t7_f_b = f.b; + return 0; +} + +SEC("fexit/bpf_testmod_test_struct_arg_7") +int BPF_PROG2(test_struct_arg_13, __u64, a, void *, b, short, c, int, d, + void *, e, struct bpf_testmod_struct_arg_4, f, int, ret) +{ + t7_ret = ret; + return 0; +} + +SEC("fentry/bpf_testmod_test_struct_arg_8") +int BPF_PROG2(test_struct_arg_14, __u64, a, void *, b, short, c, int, d, + void *, e, struct bpf_testmod_struct_arg_4, f, int, g) +{ + t8_a = a; + t8_b = (long)b; + t8_c = c; + t8_d = d; + t8_e = (long)e; + t8_f_a = f.a; + t8_f_b = f.b; + t8_g = g; + return 0; +} + +SEC("fexit/bpf_testmod_test_struct_arg_8") +int BPF_PROG2(test_struct_arg_15, __u64, a, void *, b, short, c, int, d, + void *, e, struct bpf_testmod_struct_arg_4, f, int, g, + int, ret) +{ + t8_ret = ret; + return 0; +} + char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 5bc67a854cb4982aa7746e8d2206a00b46a9cc0f Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Tue, 11 Jul 2023 15:06:21 +0200 Subject: ipv6: Constify the sk parameter of several helper functions. icmpv6_flow_init(), ip6_datagram_flow_key_init() and ip6_mc_hdr() don't need to modify their sk argument. Make that explicit using const. Signed-off-by: Guillaume Nault Reviewed-by: Simon Horman Reviewed-by: David Ahern Signed-off-by: David S. 
Miller --- include/linux/icmpv6.h | 10 ++++------ net/ipv6/datagram.c | 7 ++++--- net/ipv6/icmp.c | 6 ++---- net/ipv6/mcast.c | 8 +++----- 4 files changed, 13 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/include/linux/icmpv6.h b/include/linux/icmpv6.h index db0f4fcfdaf4..e3b3b0fa2a8f 100644 --- a/include/linux/icmpv6.h +++ b/include/linux/icmpv6.h @@ -85,12 +85,10 @@ extern void icmpv6_param_prob_reason(struct sk_buff *skb, struct flowi6; struct in6_addr; -extern void icmpv6_flow_init(struct sock *sk, - struct flowi6 *fl6, - u8 type, - const struct in6_addr *saddr, - const struct in6_addr *daddr, - int oif); + +void icmpv6_flow_init(const struct sock *sk, struct flowi6 *fl6, u8 type, + const struct in6_addr *saddr, + const struct in6_addr *daddr, int oif); static inline void icmpv6_param_prob(struct sk_buff *skb, u8 code, int pos) { diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 9b6818453afe..d80d6024cafa 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -38,10 +38,11 @@ static bool ipv6_mapped_addr_any(const struct in6_addr *a) return ipv6_addr_v4mapped(a) && (a->s6_addr32[3] == 0); } -static void ip6_datagram_flow_key_init(struct flowi6 *fl6, struct sock *sk) +static void ip6_datagram_flow_key_init(struct flowi6 *fl6, + const struct sock *sk) { - struct inet_sock *inet = inet_sk(sk); - struct ipv6_pinfo *np = inet6_sk(sk); + const struct inet_sock *inet = inet_sk(sk); + const struct ipv6_pinfo *np = inet6_sk(sk); int oif = sk->sk_bound_dev_if; memset(fl6, 0, sizeof(*fl6)); diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 65fa5014bc85..6d88f5248c1f 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -1034,11 +1034,9 @@ drop_no_count: return 0; } -void icmpv6_flow_init(struct sock *sk, struct flowi6 *fl6, - u8 type, +void icmpv6_flow_init(const struct sock *sk, struct flowi6 *fl6, u8 type, const struct in6_addr *saddr, - const struct in6_addr *daddr, - int oif) + const struct in6_addr *daddr, int oif) { memset(fl6, 0, sizeof(*fl6)); fl6->saddr = *saddr; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 714cdc9e2b8e..5ce25bcb9974 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1699,11 +1699,9 @@ mld_scount(struct ifmcaddr6 *pmc, int type, int gdeleted, int sdeleted) return scount; } -static void ip6_mc_hdr(struct sock *sk, struct sk_buff *skb, - struct net_device *dev, - const struct in6_addr *saddr, - const struct in6_addr *daddr, - int proto, int len) +static void ip6_mc_hdr(const struct sock *sk, struct sk_buff *skb, + struct net_device *dev, const struct in6_addr *saddr, + const struct in6_addr *daddr, int proto, int len) { struct ipv6hdr *hdr; -- cgit v1.2.3 From 633d76ad01ad0321a1ace3e5cc4fed06753d7ac4 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 13 Jul 2023 11:44:19 +0200 Subject: devlink: remove reload failed checks in params get/set callbacks The checks in question were introduced by: commit 6b4db2e528f6 ("devlink: Fix use-after-free after a failed reload"). That fixed an issue of reload with mlxsw driver. Back then, that was a valid fix, because there was a limitation in place that prevented drivers from registering/unregistering params when devlink instance was registered. It was possible to do the fix differently by changing drivers to register/unregister params in appropriate places making sure the ops operate only on memory which is allocated and initialized. But that, as a dependency, would require to remove the limitation mentioned above. 
Eventually, this limitation was lifted by: commit 1d18bb1a4ddd ("devlink: allow registering parameters after the instance") Also, the alternative fix (which also fixed another issue) was done by: commit 74cbc3c03c82 ("mlxsw: spectrum_acl_tcam: Move devlink param to TCAM code"). Therefore, the checks are no longer relevant. Each driver should make sure to have the params registered only when the memory the ops are working with is allocated and initialized. So remove the checks. Signed-off-by: Jiri Pirko Reviewed-by: Ido Schimmel Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/devlink/leftover.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/devlink/leftover.c b/net/devlink/leftover.c index 1f00f874471f..5128b9c7eea8 100644 --- a/net/devlink/leftover.c +++ b/net/devlink/leftover.c @@ -3946,7 +3946,7 @@ static int devlink_param_get(struct devlink *devlink, const struct devlink_param *param, struct devlink_param_gset_ctx *ctx) { - if (!param->get || devlink->reload_failed) + if (!param->get) return -EOPNOTSUPP; return param->get(devlink, param->id, ctx); } @@ -3955,7 +3955,7 @@ static int devlink_param_set(struct devlink *devlink, const struct devlink_param *param, struct devlink_param_gset_ctx *ctx) { - if (!param->set || devlink->reload_failed) + if (!param->set) return -EOPNOTSUPP; return param->set(devlink, param->id, ctx); } -- cgit v1.2.3 From 608a147a88728f84bbd2efdde3d4984339f1d872 Mon Sep 17 00:00:00 2001 From: Vignesh Viswanathan Date: Fri, 14 Jul 2023 11:28:44 +0530 Subject: net: qrtr: ns: Change servers radix tree to xarray There is a use after free scenario while iterating through the servers radix tree despite the ns being a single threaded process. This can happen when the radix tree APIs are not synchronized with the rcu_read_lock() APIs. Convert the radix tree for servers to xarray to take advantage of the built in rcu lock usage provided by xarray. Signed-off-by: Chris Lew Signed-off-by: Vignesh Viswanathan Reviewed-by: Simon Horman Signed-off-by: David S. 
Miller --- net/qrtr/ns.c | 133 +++++++++++----------------------------------------------- 1 file changed, 24 insertions(+), 109 deletions(-) (limited to 'net') diff --git a/net/qrtr/ns.c b/net/qrtr/ns.c index 0f7a729f1a1f..af28d9e6b53f 100644 --- a/net/qrtr/ns.c +++ b/net/qrtr/ns.c @@ -66,7 +66,7 @@ struct qrtr_server { struct qrtr_node { unsigned int id; - struct radix_tree_root servers; + struct xarray servers; }; static struct qrtr_node *node_get(unsigned int node_id) @@ -83,6 +83,7 @@ static struct qrtr_node *node_get(unsigned int node_id) return NULL; node->id = node_id; + xa_init(&node->servers); if (radix_tree_insert(&nodes, node_id, node)) { kfree(node); @@ -193,40 +194,23 @@ static void lookup_notify(struct sockaddr_qrtr *to, struct qrtr_server *srv, static int announce_servers(struct sockaddr_qrtr *sq) { - struct radix_tree_iter iter; struct qrtr_server *srv; struct qrtr_node *node; - void __rcu **slot; + unsigned long index; int ret; node = node_get(qrtr_ns.local_node); if (!node) return 0; - rcu_read_lock(); /* Announce the list of servers registered in this node */ - radix_tree_for_each_slot(slot, &node->servers, &iter, 0) { - srv = radix_tree_deref_slot(slot); - if (!srv) - continue; - if (radix_tree_deref_retry(srv)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - slot = radix_tree_iter_resume(slot, &iter); - rcu_read_unlock(); - + xa_for_each(&node->servers, index, srv) { ret = service_announce_new(sq, srv); if (ret < 0) { pr_err("failed to announce new service\n"); return ret; } - - rcu_read_lock(); } - - rcu_read_unlock(); - return 0; } @@ -256,14 +240,17 @@ static struct qrtr_server *server_add(unsigned int service, goto err; /* Delete the old server on the same port */ - old = radix_tree_lookup(&node->servers, port); + old = xa_store(&node->servers, port, srv, GFP_KERNEL); if (old) { - radix_tree_delete(&node->servers, port); - kfree(old); + if (xa_is_err(old)) { + pr_err("failed to add server [0x%x:0x%x] ret:%d\n", + srv->service, srv->instance, xa_err(old)); + goto err; + } else { + kfree(old); + } } - radix_tree_insert(&node->servers, port, srv); - trace_qrtr_ns_server_add(srv->service, srv->instance, srv->node, srv->port); @@ -280,11 +267,11 @@ static int server_del(struct qrtr_node *node, unsigned int port, bool bcast) struct qrtr_server *srv; struct list_head *li; - srv = radix_tree_lookup(&node->servers, port); + srv = xa_load(&node->servers, port); if (!srv) return -ENOENT; - radix_tree_delete(&node->servers, port); + xa_erase(&node->servers, port); /* Broadcast the removal of local servers */ if (srv->node == qrtr_ns.local_node && bcast) @@ -344,13 +331,12 @@ static int ctrl_cmd_hello(struct sockaddr_qrtr *sq) static int ctrl_cmd_bye(struct sockaddr_qrtr *from) { struct qrtr_node *local_node; - struct radix_tree_iter iter; struct qrtr_ctrl_pkt pkt; struct qrtr_server *srv; struct sockaddr_qrtr sq; struct msghdr msg = { }; struct qrtr_node *node; - void __rcu **slot; + unsigned long index; struct kvec iv; int ret; @@ -361,22 +347,9 @@ static int ctrl_cmd_bye(struct sockaddr_qrtr *from) if (!node) return 0; - rcu_read_lock(); /* Advertise removal of this client to all servers of remote node */ - radix_tree_for_each_slot(slot, &node->servers, &iter, 0) { - srv = radix_tree_deref_slot(slot); - if (!srv) - continue; - if (radix_tree_deref_retry(srv)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - slot = radix_tree_iter_resume(slot, &iter); - rcu_read_unlock(); + xa_for_each(&node->servers, index, srv) server_del(node, srv->port, true); - 
rcu_read_lock(); - } - rcu_read_unlock(); /* Advertise the removal of this client to all local servers */ local_node = node_get(qrtr_ns.local_node); @@ -387,18 +360,7 @@ static int ctrl_cmd_bye(struct sockaddr_qrtr *from) pkt.cmd = cpu_to_le32(QRTR_TYPE_BYE); pkt.client.node = cpu_to_le32(from->sq_node); - rcu_read_lock(); - radix_tree_for_each_slot(slot, &local_node->servers, &iter, 0) { - srv = radix_tree_deref_slot(slot); - if (!srv) - continue; - if (radix_tree_deref_retry(srv)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - slot = radix_tree_iter_resume(slot, &iter); - rcu_read_unlock(); - + xa_for_each(&local_node->servers, index, srv) { sq.sq_family = AF_QIPCRTR; sq.sq_node = srv->node; sq.sq_port = srv->port; @@ -411,11 +373,7 @@ static int ctrl_cmd_bye(struct sockaddr_qrtr *from) pr_err("failed to send bye cmd\n"); return ret; } - rcu_read_lock(); } - - rcu_read_unlock(); - return 0; } @@ -423,7 +381,6 @@ static int ctrl_cmd_del_client(struct sockaddr_qrtr *from, unsigned int node_id, unsigned int port) { struct qrtr_node *local_node; - struct radix_tree_iter iter; struct qrtr_lookup *lookup; struct qrtr_ctrl_pkt pkt; struct msghdr msg = { }; @@ -432,7 +389,7 @@ static int ctrl_cmd_del_client(struct sockaddr_qrtr *from, struct qrtr_node *node; struct list_head *tmp; struct list_head *li; - void __rcu **slot; + unsigned long index; struct kvec iv; int ret; @@ -477,18 +434,7 @@ static int ctrl_cmd_del_client(struct sockaddr_qrtr *from, pkt.client.node = cpu_to_le32(node_id); pkt.client.port = cpu_to_le32(port); - rcu_read_lock(); - radix_tree_for_each_slot(slot, &local_node->servers, &iter, 0) { - srv = radix_tree_deref_slot(slot); - if (!srv) - continue; - if (radix_tree_deref_retry(srv)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - slot = radix_tree_iter_resume(slot, &iter); - rcu_read_unlock(); - + xa_for_each(&local_node->servers, index, srv) { sq.sq_family = AF_QIPCRTR; sq.sq_node = srv->node; sq.sq_port = srv->port; @@ -501,11 +447,7 @@ static int ctrl_cmd_del_client(struct sockaddr_qrtr *from, pr_err("failed to send del client cmd\n"); return ret; } - rcu_read_lock(); } - - rcu_read_unlock(); - return 0; } @@ -576,13 +518,12 @@ static int ctrl_cmd_del_server(struct sockaddr_qrtr *from, static int ctrl_cmd_new_lookup(struct sockaddr_qrtr *from, unsigned int service, unsigned int instance) { - struct radix_tree_iter node_iter; struct qrtr_server_filter filter; - struct radix_tree_iter srv_iter; struct qrtr_lookup *lookup; + struct qrtr_server *srv; struct qrtr_node *node; - void __rcu **node_slot; - void __rcu **srv_slot; + unsigned long node_idx; + unsigned long srv_idx; /* Accept only local observers */ if (from->sq_node != qrtr_ns.local_node) @@ -601,40 +542,14 @@ static int ctrl_cmd_new_lookup(struct sockaddr_qrtr *from, filter.service = service; filter.instance = instance; - rcu_read_lock(); - radix_tree_for_each_slot(node_slot, &nodes, &node_iter, 0) { - node = radix_tree_deref_slot(node_slot); - if (!node) - continue; - if (radix_tree_deref_retry(node)) { - node_slot = radix_tree_iter_retry(&node_iter); - continue; - } - node_slot = radix_tree_iter_resume(node_slot, &node_iter); - - radix_tree_for_each_slot(srv_slot, &node->servers, - &srv_iter, 0) { - struct qrtr_server *srv; - - srv = radix_tree_deref_slot(srv_slot); - if (!srv) - continue; - if (radix_tree_deref_retry(srv)) { - srv_slot = radix_tree_iter_retry(&srv_iter); - continue; - } - + xa_for_each(&nodes, node_idx, node) { + xa_for_each(&node->servers, srv_idx, srv) { if 
(!server_match(srv, &filter)) continue; - srv_slot = radix_tree_iter_resume(srv_slot, &srv_iter); - - rcu_read_unlock(); lookup_notify(from, srv, true); - rcu_read_lock(); } } - rcu_read_unlock(); /* Empty notification, to indicate end of listing */ lookup_notify(from, NULL, true); -- cgit v1.2.3 From f26b32ef2fe6f7ae0ba56854e666339e105610ad Mon Sep 17 00:00:00 2001 From: Vignesh Viswanathan Date: Fri, 14 Jul 2023 11:28:45 +0530 Subject: net: qrtr: ns: Change nodes radix tree to xarray There is a use after free scenario while iterating through the nodes radix tree despite the ns being a single threaded process. This can happen when the radix tree APIs are not synchronized with the rcu_read_lock() APIs. Convert the radix tree for nodes to xarray to take advantage of the built in rcu lock usage provided by xarray. Signed-off-by: Chris Lew Signed-off-by: Vignesh Viswanathan Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/qrtr/ns.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/qrtr/ns.c b/net/qrtr/ns.c index af28d9e6b53f..b1db0b519179 100644 --- a/net/qrtr/ns.c +++ b/net/qrtr/ns.c @@ -16,7 +16,7 @@ #define CREATE_TRACE_POINTS #include -static RADIX_TREE(nodes, GFP_KERNEL); +static DEFINE_XARRAY(nodes); static struct { struct socket *sock; @@ -73,7 +73,7 @@ static struct qrtr_node *node_get(unsigned int node_id) { struct qrtr_node *node; - node = radix_tree_lookup(&nodes, node_id); + node = xa_load(&nodes, node_id); if (node) return node; @@ -85,7 +85,7 @@ static struct qrtr_node *node_get(unsigned int node_id) node->id = node_id; xa_init(&node->servers); - if (radix_tree_insert(&nodes, node_id, node)) { + if (xa_store(&nodes, node_id, node, GFP_KERNEL)) { kfree(node); return NULL; } -- cgit v1.2.3 From 69940b888e3502868efd1acf3719fd1af810c123 Mon Sep 17 00:00:00 2001 From: Vignesh Viswanathan Date: Fri, 14 Jul 2023 11:28:46 +0530 Subject: net: qrtr: Handle IPCR control port format of older targets The destination port value in the IPCR control buffer on older targets is 0xFFFF. Handle the same by updating the dst_port to QRTR_PORT_CTRL. Signed-off-by: Vignesh Viswanathan Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/qrtr/af_qrtr.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/qrtr/af_qrtr.c b/net/qrtr/af_qrtr.c index 78beb74146e7..41ece61eb57a 100644 --- a/net/qrtr/af_qrtr.c +++ b/net/qrtr/af_qrtr.c @@ -23,6 +23,8 @@ #define QRTR_EPH_PORT_RANGE \ XA_LIMIT(QRTR_MIN_EPH_SOCKET, QRTR_MAX_EPH_SOCKET) +#define QRTR_PORT_CTRL_LEGACY 0xffff + /** * struct qrtr_hdr_v1 - (I|R)PCrouter packet header version 1 * @version: protocol version @@ -495,6 +497,9 @@ int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len) goto err; } + if (cb->dst_port == QRTR_PORT_CTRL_LEGACY) + cb->dst_port = QRTR_PORT_CTRL; + if (!size || len != ALIGN(size, 4) + hdrlen) goto err; -- cgit v1.2.3 From a88dd7538461b2daf6883e5392957b21270638ed Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Fri, 14 Jul 2023 10:12:07 +0100 Subject: net: dsa: remove legacy_pre_march2020 detection All drivers are now updated for the March 2020 changes, and no longer make use of the mac_pcs_get_state() or mac_an_restart() operations, which are now NULL across all DSA drivers. All DSA drivers don't look at speed, duplex, pause or advertisement in their phylink_mac_config() method either. Remove support for these operations from DSA, and stop marking DSA as a legacy driver by default. 
Signed-off-by: Russell King (Oracle) Reviewed-by: Florian Fainelli Reviewed-by: Vladimir Oltean Signed-off-by: Paolo Abeni --- include/net/dsa.h | 3 --- net/dsa/port.c | 41 ----------------------------------------- 2 files changed, 44 deletions(-) (limited to 'net') diff --git a/include/net/dsa.h b/include/net/dsa.h index d309ee7ed04b..0b9c6aa27047 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -873,8 +873,6 @@ struct dsa_switch_ops { struct phylink_pcs *(*phylink_mac_select_pcs)(struct dsa_switch *ds, int port, phy_interface_t iface); - int (*phylink_mac_link_state)(struct dsa_switch *ds, int port, - struct phylink_link_state *state); int (*phylink_mac_prepare)(struct dsa_switch *ds, int port, unsigned int mode, phy_interface_t interface); @@ -884,7 +882,6 @@ struct dsa_switch_ops { int (*phylink_mac_finish)(struct dsa_switch *ds, int port, unsigned int mode, phy_interface_t interface); - void (*phylink_mac_an_restart)(struct dsa_switch *ds, int port); void (*phylink_mac_link_down)(struct dsa_switch *ds, int port, unsigned int mode, phy_interface_t interface); diff --git a/net/dsa/port.c b/net/dsa/port.c index 0ce8fd311c78..c63cbfbe6489 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -1568,27 +1568,6 @@ static void dsa_port_phylink_validate(struct phylink_config *config, phylink_generic_validate(config, supported, state); } -static void dsa_port_phylink_mac_pcs_get_state(struct phylink_config *config, - struct phylink_link_state *state) -{ - struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); - struct dsa_switch *ds = dp->ds; - int err; - - /* Only called for inband modes */ - if (!ds->ops->phylink_mac_link_state) { - state->link = 0; - return; - } - - err = ds->ops->phylink_mac_link_state(ds, dp->index, state); - if (err < 0) { - dev_err(ds->dev, "p%d: phylink_mac_link_state() failed: %d\n", - dp->index, err); - state->link = 0; - } -} - static struct phylink_pcs * dsa_port_phylink_mac_select_pcs(struct phylink_config *config, phy_interface_t interface) @@ -1646,17 +1625,6 @@ static int dsa_port_phylink_mac_finish(struct phylink_config *config, return err; } -static void dsa_port_phylink_mac_an_restart(struct phylink_config *config) -{ - struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); - struct dsa_switch *ds = dp->ds; - - if (!ds->ops->phylink_mac_an_restart) - return; - - ds->ops->phylink_mac_an_restart(ds, dp->index); -} - static void dsa_port_phylink_mac_link_down(struct phylink_config *config, unsigned int mode, phy_interface_t interface) @@ -1700,11 +1668,9 @@ static void dsa_port_phylink_mac_link_up(struct phylink_config *config, static const struct phylink_mac_ops dsa_port_phylink_mac_ops = { .validate = dsa_port_phylink_validate, .mac_select_pcs = dsa_port_phylink_mac_select_pcs, - .mac_pcs_get_state = dsa_port_phylink_mac_pcs_get_state, .mac_prepare = dsa_port_phylink_mac_prepare, .mac_config = dsa_port_phylink_mac_config, .mac_finish = dsa_port_phylink_mac_finish, - .mac_an_restart = dsa_port_phylink_mac_an_restart, .mac_link_down = dsa_port_phylink_mac_link_down, .mac_link_up = dsa_port_phylink_mac_link_up, }; @@ -1720,13 +1686,6 @@ int dsa_port_phylink_create(struct dsa_port *dp) if (err) mode = PHY_INTERFACE_MODE_NA; - /* Presence of phylink_mac_link_state or phylink_mac_an_restart is - * an indicator of a legacy phylink driver. 
- */ - if (ds->ops->phylink_mac_link_state || - ds->ops->phylink_mac_an_restart) - dp->pl_config.legacy_pre_march2020 = true; - if (ds->ops->phylink_get_caps) ds->ops->phylink_get_caps(ds, dp->index, &dp->pl_config); -- cgit v1.2.3 From 4a59cdfd66998259061b86437b9439d186f89d5a Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Sun, 16 Jul 2023 10:24:40 +0300 Subject: rtnetlink: Move nesting cancellation rollback to proper function Make rtnl_fill_vf() cancel the vfinfo attribute on error instead of the inner rtnl_fill_vfinfo(), as it is the function that starts it. Signed-off-by: Gal Pressman Link: https://lore.kernel.org/r/20230716072440.2372567-1-gal@nvidia.com Signed-off-by: Paolo Abeni --- net/core/rtnetlink.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 3ad4e030846d..70838d7e5b32 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1273,7 +1273,6 @@ static noinline_for_stack int rtnl_fill_stats(struct sk_buff *skb, static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, struct net_device *dev, int vfs_num, - struct nlattr *vfinfo, u32 ext_filter_mask) { struct ifla_vf_rss_query_en vf_rss_query_en; @@ -1343,7 +1342,7 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, vf_trust.setting = ivi.trusted; vf = nla_nest_start_noflag(skb, IFLA_VF_INFO); if (!vf) - goto nla_put_vfinfo_failure; + return -EMSGSIZE; if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) || nla_put(skb, IFLA_VF_BROADCAST, sizeof(vf_broadcast), &vf_broadcast) || nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) || @@ -1414,8 +1413,6 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, nla_put_vf_failure: nla_nest_cancel(skb, vf); -nla_put_vfinfo_failure: - nla_nest_cancel(skb, vfinfo); return -EMSGSIZE; } @@ -1441,8 +1438,10 @@ static noinline_for_stack int rtnl_fill_vf(struct sk_buff *skb, return -EMSGSIZE; for (i = 0; i < num_vfs; i++) { - if (rtnl_fill_vfinfo(skb, dev, i, vfinfo, ext_filter_mask)) + if (rtnl_fill_vfinfo(skb, dev, i, ext_filter_mask)) { + nla_nest_cancel(skb, vfinfo); return -EMSGSIZE; + } } nla_nest_end(skb, vfinfo); -- cgit v1.2.3 From 8daf847714ec55fe590d1135e26c531658b2ac1c Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Mon, 17 Jul 2023 16:32:52 +0800 Subject: bpf: Drop useless btf_vmlinux in bpf_tcp_ca The code using btf_vmlinux in bpf_tcp_ca is removed by the commit 9f0265e921de ("bpf: Require only one of cong_avoid() and cong_control() from a TCP CC") so drop this useless btf_vmlinux declaration. 
Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/4d38da4eadaba476bd92ffcd7a5a03a5e28745c0.1689582557.git.geliang.tang@suse.com Signed-off-by: Alexei Starovoitov --- net/ipv4/bpf_tcp_ca.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 4406d796cc2f..39dcccf0f174 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -51,8 +51,6 @@ static bool is_unsupported(u32 member_offset) return false; } -extern struct btf *btf_vmlinux; - static bool bpf_tcp_ca_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, -- cgit v1.2.3 From dfa2f0483360d4d6f2324405464c9f281156bd87 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 17 Jul 2023 15:29:17 +0000 Subject: tcp: get rid of sysctl_tcp_adv_win_scale With modern NIC drivers shifting to full page allocations per received frame, we face the following issue: TCP has one per-netns sysctl used to tweak how to translate a memory use into an expected payload (RWIN), in RX path. tcp_win_from_space() implementation is limited to few cases. For hosts dealing with various MSS, we either under estimate or over estimate the RWIN we send to the remote peers. For instance with the default sysctl_tcp_adv_win_scale value, we expect to store 50% of payload per allocated chunk of memory. For the typical use of MTU=1500 traffic, and order-0 pages allocations by NIC drivers, we are sending too big RWIN, leading to potential tcp collapse operations, which are extremely expensive and source of latency spikes. This patch makes sysctl_tcp_adv_win_scale obsolete, and instead uses a per socket scaling factor, so that we can precisely adjust the RWIN based on effective skb->len/skb->truesize ratio. This patch alone can double TCP receive performance when receivers are too slow to drain their receive queue, or by allowing a bigger RWIN when MSS is close to PAGE_SIZE. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Link: https://lore.kernel.org/r/20230717152917.751987-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- Documentation/networking/ip-sysctl.rst | 1 + include/linux/tcp.h | 4 +++- include/net/netns/ipv4.h | 2 +- include/net/tcp.h | 24 ++++++++++++++++++++---- net/ipv4/tcp.c | 11 ++++++----- net/ipv4/tcp_input.c | 19 ++++++++++++------- 6 files changed, 43 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 4a010a7cde7f..82f2117cf2b3 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -321,6 +321,7 @@ tcp_abort_on_overflow - BOOLEAN option can harm clients of your server. tcp_adv_win_scale - INTEGER + Obsolete since linux-6.6 Count buffering overhead as bytes/2^tcp_adv_win_scale (if tcp_adv_win_scale > 0) or bytes-bytes/2^(-tcp_adv_win_scale), if it is <= 0. 
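A worked example of the new formula (the skb sizes below are hypothetical, not taken from the patch): the receive window estimate becomes

	scaling_ratio = (skb->len << TCP_RMEM_TO_WIN_SCALE) / skb->truesize
	tcp_win_from_space(sk, space) = (space * scaling_ratio) >> TCP_RMEM_TO_WIN_SCALE

so with, say, skb->len = 1500 and skb->truesize = 4416, scaling_ratio = (1500 << 8) / 4416 ≈ 86, and roughly 34% of the receive buffer is advertised as window, instead of the fixed 50% implied by the old default tcp_adv_win_scale = 1.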
diff --git a/include/linux/tcp.h b/include/linux/tcp.h index b4c08ac86983..fbcb0ce13171 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -172,6 +172,8 @@ static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req) return (struct tcp_request_sock *)req; } +#define TCP_RMEM_TO_WIN_SCALE 8 + struct tcp_sock { /* inet_connection_sock has to be the first member of tcp_sock */ struct inet_connection_sock inet_conn; @@ -238,7 +240,7 @@ struct tcp_sock { u32 window_clamp; /* Maximal window to advertise */ u32 rcv_ssthresh; /* Current window clamp */ - + u8 scaling_ratio; /* see tcp_win_from_space() */ /* Information of the most recently (s)acked skb */ struct tcp_rack { u64 mstamp; /* (Re)sent time of the skb */ diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index f00374718159..7a41c4791536 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -152,7 +152,7 @@ struct netns_ipv4 { u8 sysctl_tcp_abort_on_overflow; u8 sysctl_tcp_fack; /* obsolete */ int sysctl_tcp_max_reordering; - int sysctl_tcp_adv_win_scale; + int sysctl_tcp_adv_win_scale; /* obsolete */ u8 sysctl_tcp_dsack; u8 sysctl_tcp_app_win; u8 sysctl_tcp_frto; diff --git a/include/net/tcp.h b/include/net/tcp.h index 226bce6d1e8c..2104a71c75ba 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1434,11 +1434,27 @@ void tcp_select_initial_window(const struct sock *sk, int __space, static inline int tcp_win_from_space(const struct sock *sk, int space) { - int tcp_adv_win_scale = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_adv_win_scale); + s64 scaled_space = (s64)space * tcp_sk(sk)->scaling_ratio; - return tcp_adv_win_scale <= 0 ? - (space>>(-tcp_adv_win_scale)) : - space - (space>>tcp_adv_win_scale); + return scaled_space >> TCP_RMEM_TO_WIN_SCALE; +} + +/* inverse of tcp_win_from_space() */ +static inline int tcp_space_from_win(const struct sock *sk, int win) +{ + u64 val = (u64)win << TCP_RMEM_TO_WIN_SCALE; + + do_div(val, tcp_sk(sk)->scaling_ratio); + return val; +} + +static inline void tcp_scaling_ratio_init(struct sock *sk) +{ + /* Assume a conservative default of 1200 bytes of payload per 4K page. + * This may be adjusted later in tcp_measure_rcv_mss(). 
+ */ + tcp_sk(sk)->scaling_ratio = (1200 << TCP_RMEM_TO_WIN_SCALE) / + SKB_TRUESIZE(4096); } /* Note: caller must be prepared to deal with negative returns */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e03e08745308..88f4ebab12ac 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -457,6 +457,7 @@ void tcp_init_sock(struct sock *sk) WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1])); WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1])); + tcp_scaling_ratio_init(sk); set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); sk_sockets_allocated_inc(sk); @@ -1700,7 +1701,7 @@ EXPORT_SYMBOL(tcp_peek_len); /* Make sure sk_rcvbuf is big enough to satisfy SO_RCVLOWAT hint */ int tcp_set_rcvlowat(struct sock *sk, int val) { - int cap; + int space, cap; if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) cap = sk->sk_rcvbuf >> 1; @@ -1715,10 +1716,10 @@ int tcp_set_rcvlowat(struct sock *sk, int val) if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) return 0; - val <<= 1; - if (val > sk->sk_rcvbuf) { - WRITE_ONCE(sk->sk_rcvbuf, val); - tcp_sk(sk)->window_clamp = tcp_win_from_space(sk, val); + space = tcp_space_from_win(sk, val); + if (space > sk->sk_rcvbuf) { + WRITE_ONCE(sk->sk_rcvbuf, space); + tcp_sk(sk)->window_clamp = val; } return 0; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 57c8af1859c1..3cd92035e090 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -237,6 +237,16 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb) */ len = skb_shinfo(skb)->gso_size ? : skb->len; if (len >= icsk->icsk_ack.rcv_mss) { + /* Note: divides are still a bit expensive. + * For the moment, only adjust scaling_ratio + * when we update icsk_ack.rcv_mss. + */ + if (unlikely(len != icsk->icsk_ack.rcv_mss)) { + u64 val = (u64)skb->len << TCP_RMEM_TO_WIN_SCALE; + + do_div(val, skb->truesize); + tcp_sk(sk)->scaling_ratio = val ? val : 1; + } icsk->icsk_ack.rcv_mss = min_t(unsigned int, len, tcp_sk(sk)->advmss); /* Account for possibly-removed options */ @@ -727,8 +737,8 @@ void tcp_rcv_space_adjust(struct sock *sk) if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { - int rcvmem, rcvbuf; u64 rcvwin, grow; + int rcvbuf; /* minimal window to cope with packet losses, assuming * steady state. Add some cushion because of small variations. @@ -740,12 +750,7 @@ void tcp_rcv_space_adjust(struct sock *sk) do_div(grow, tp->rcvq_space.space); rcvwin += (grow << 1); - rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER); - while (tcp_win_from_space(sk, rcvmem) < tp->advmss) - rcvmem += 128; - - do_div(rcvwin, tp->advmss); - rcvbuf = min_t(u64, rcvwin * rcvmem, + rcvbuf = min_t(u64, tcp_space_from_win(sk, rcvwin), READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); if (rcvbuf > sk->sk_rcvbuf) { WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); -- cgit v1.2.3 From 29cfb2aaa4425a608651a05b9b875bc445394443 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 17 Jul 2023 11:12:28 +0300 Subject: bridge: Add backup nexthop ID support Add a new bridge port attribute that allows attaching a nexthop object ID to an skb that is redirected to a backup bridge port with VLAN tunneling enabled. Specifically, when redirecting a known unicast packet, read the backup nexthop ID from the bridge port that lost its carrier and set it in the bridge control block of the skb before forwarding it via the backup port. 
Note that reading the ID from the bridge port should not result in a cache miss as the ID is added next to the 'backup_port' field that was already accessed. After this change, the 'state' field still stays on the first cache line, together with other data path related fields such as 'flags' and 'vlgrp':

struct net_bridge_port {
	struct net_bridge *            br;              /*  0  8 */
	struct net_device *            dev;             /*  8  8 */
	netdevice_tracker              dev_tracker;     /* 16  0 */
	struct list_head               list;            /* 16 16 */
	long unsigned int              flags;           /* 32  8 */
	struct net_bridge_vlan_group * vlgrp;           /* 40  8 */
	struct net_bridge_port *       backup_port;     /* 48  8 */
	u32                            backup_nhid;     /* 56  4 */
	u8                             priority;        /* 60  1 */
	u8                             state;           /* 61  1 */
	u16                            port_no;         /* 62  2 */
	/* --- cacheline 1 boundary (64 bytes) --- */
	[...]
} __attribute__((__aligned__(8)));

When forwarding an skb via a bridge port that has VLAN tunneling enabled, check if the backup nexthop ID stored in the bridge control block is valid (i.e., not zero). If so, instead of attaching the pre-allocated metadata (that only has the tunnel key set), allocate a new metadata, set both the tunnel key and the nexthop object ID and attach it to the skb. By default, do not dump the new attribute to user space as a value of zero is an invalid nexthop object ID. The above is useful for EVPN multihoming. When one of the links composing an Ethernet Segment (ES) fails, traffic needs to be redirected towards the host via one of the other ES peers. For example, if a host is multihomed to three different VTEPs, the backup port of each ES link needs to be set to the VXLAN device and the backup nexthop ID needs to point to an FDB nexthop group that includes the IP addresses of the other two VTEPs. The VXLAN driver will extract the ID from the metadata of the redirected skb, calculate its flow hash and forward it towards one of the other VTEPs. If the ID does not exist, or represents an invalid nexthop object, the VXLAN driver will drop the skb. This relieves the bridge driver from the need to validate the ID. Signed-off-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Signed-off-by: David S.
Miller --- include/uapi/linux/if_link.h | 1 + net/bridge/br_forward.c | 1 + net/bridge/br_netlink.c | 12 ++++++++++++ net/bridge/br_private.h | 3 +++ net/bridge/br_vlan_tunnel.c | 15 +++++++++++++++ net/core/rtnetlink.c | 2 +- 6 files changed, 33 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 0f6a0fe09bdb..ce3117df9cec 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -570,6 +570,7 @@ enum { IFLA_BRPORT_MCAST_N_GROUPS, IFLA_BRPORT_MCAST_MAX_GROUPS, IFLA_BRPORT_NEIGH_VLAN_SUPPRESS, + IFLA_BRPORT_BACKUP_NHID, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 6116eba1bd89..9d7bc8b96b53 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -154,6 +154,7 @@ void br_forward(const struct net_bridge_port *to, backup_port = rcu_dereference(to->backup_port); if (unlikely(!backup_port)) goto out; + BR_INPUT_SKB_CB(skb)->backup_nhid = READ_ONCE(to->backup_nhid); to = backup_port; } diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 05c5863d2e20..10f0d33d8ccf 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -211,6 +211,7 @@ static inline size_t br_port_info_size(void) + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MRP_IN_OPEN */ + nla_total_size(sizeof(u32)) /* IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT */ + nla_total_size(sizeof(u32)) /* IFLA_BRPORT_MCAST_EHT_HOSTS_CNT */ + + nla_total_size(sizeof(u32)) /* IFLA_BRPORT_BACKUP_NHID */ + 0; } @@ -319,6 +320,10 @@ static int br_port_fill_attrs(struct sk_buff *skb, backup_p->dev->ifindex); rcu_read_unlock(); + if (p->backup_nhid && + nla_put_u32(skb, IFLA_BRPORT_BACKUP_NHID, p->backup_nhid)) + return -EMSGSIZE; + return 0; } @@ -895,6 +900,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = { [IFLA_BRPORT_MCAST_N_GROUPS] = { .type = NLA_REJECT }, [IFLA_BRPORT_MCAST_MAX_GROUPS] = { .type = NLA_U32 }, [IFLA_BRPORT_NEIGH_VLAN_SUPPRESS] = NLA_POLICY_MAX(NLA_U8, 1), + [IFLA_BRPORT_BACKUP_NHID] = { .type = NLA_U32 }, }; /* Change the state of the port and notify spanning tree */ @@ -1065,6 +1071,12 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[], return err; } + if (tb[IFLA_BRPORT_BACKUP_NHID]) { + u32 backup_nhid = nla_get_u32(tb[IFLA_BRPORT_BACKUP_NHID]); + + WRITE_ONCE(p->backup_nhid, backup_nhid); + } + return 0; } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index a63b32c1638e..05a965ef76f1 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -387,6 +387,7 @@ struct net_bridge_port { struct net_bridge_vlan_group __rcu *vlgrp; #endif struct net_bridge_port __rcu *backup_port; + u32 backup_nhid; /* STP */ u8 priority; @@ -605,6 +606,8 @@ struct br_input_skb_cb { */ unsigned long fwd_hwdoms; #endif + + u32 backup_nhid; }; #define BR_INPUT_SKB_CB(__skb) ((struct br_input_skb_cb *)(__skb)->cb) diff --git a/net/bridge/br_vlan_tunnel.c b/net/bridge/br_vlan_tunnel.c index 6399a8a69d07..81833ca7a2c7 100644 --- a/net/bridge/br_vlan_tunnel.c +++ b/net/bridge/br_vlan_tunnel.c @@ -201,6 +201,21 @@ int br_handle_egress_vlan_tunnel(struct sk_buff *skb, if (err) return err; + if (BR_INPUT_SKB_CB(skb)->backup_nhid) { + tunnel_dst = __ip_tun_set_dst(0, 0, 0, 0, 0, TUNNEL_KEY, + tunnel_id, 0); + if (!tunnel_dst) + return -ENOMEM; + + tunnel_dst->u.tun_info.mode |= IP_TUNNEL_INFO_TX | + IP_TUNNEL_INFO_BRIDGE; + tunnel_dst->u.tun_info.key.nhid = + 
BR_INPUT_SKB_CB(skb)->backup_nhid; + skb_dst_set(skb, &tunnel_dst->dst); + + return 0; + } + tunnel_dst = rcu_dereference(vlan->tinfo.tunnel_dst); if (tunnel_dst && dst_hold_safe(&tunnel_dst->dst)) skb_dst_set(skb, &tunnel_dst->dst); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 70838d7e5b32..1777a5e1830b 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -61,7 +61,7 @@ #include "dev.h" #define RTNL_MAX_TYPE 50 -#define RTNL_SLAVE_MAX_TYPE 43 +#define RTNL_SLAVE_MAX_TYPE 44 struct rtnl_link { rtnl_doit_func doit; -- cgit v1.2.3 From 2d6c85ca3eb8e175e4b5b508b1203b909af5189c Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Mon, 17 Jul 2023 15:53:35 +0200 Subject: dccp: Set TOS and routing scope independently for fib lookups. There's no reason for setting the RTO_ONLINK flag in ->flowi4_tos as RT_CONN_FLAGS() does. We can easily set ->flowi4_scope properly instead. This makes the code more explicit and will allow to convert ->flowi4_tos to dscp_t in the future. Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/dccp/ipv4.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index fa8079303cb0..8e919cfe6e23 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -474,7 +474,8 @@ static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk, .flowi4_oif = inet_iif(skb), .daddr = iph->saddr, .saddr = iph->daddr, - .flowi4_tos = RT_CONN_FLAGS(sk), + .flowi4_tos = ip_sock_rt_tos(sk), + .flowi4_scope = ip_sock_rt_scope(sk), .flowi4_proto = sk->sk_protocol, .fl4_sport = dccp_hdr(skb)->dccph_dport, .fl4_dport = dccp_hdr(skb)->dccph_sport, -- cgit v1.2.3 From ba80e20d7f3f87dab3f9f0c0ca66e4b1fcc7be9f Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Mon, 17 Jul 2023 15:53:40 +0200 Subject: sctp: Set TOS and routing scope independently for fib lookups. There's no reason for setting the RTO_ONLINK flag in ->flowi4_tos as RT_CONN_FLAGS() does. We can easily set ->flowi4_scope properly instead. This makes the code more explicit and will allow to convert ->flowi4_tos to dscp_t in the future. Signed-off-by: Guillaume Nault Reviewed-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/protocol.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 274d07bd774f..33c0895e101c 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -435,7 +435,8 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr, fl4->fl4_dport = daddr->v4.sin_port; fl4->flowi4_proto = IPPROTO_SCTP; if (asoc) { - fl4->flowi4_tos = RT_CONN_FLAGS_TOS(asoc->base.sk, tos); + fl4->flowi4_tos = RT_TOS(tos); + fl4->flowi4_scope = ip_sock_rt_scope(asoc->base.sk); fl4->flowi4_oif = asoc->base.sk->sk_bound_dev_if; fl4->fl4_sport = htons(asoc->base.bind_addr.port); } -- cgit v1.2.3 From 63a64a56bc3f77c74085047ee45356ac850da3e8 Mon Sep 17 00:00:00 2001 From: Tirthendu Sarkar Date: Wed, 19 Jul 2023 15:23:58 +0200 Subject: xsk: prepare 'options' in xdp_desc for multi-buffer use Use the 'options' field in xdp_desc as a packet continuity marker. Since 'options' field was unused till now and was expected to be set to 0, the 'eop' descriptor will have it set to 0, while the non-eop descriptors will have to set it to 1. This ensures legacy applications continue to work without needing any change for single-buffer packets. Add helper functions and extend xskq_prod_reserve_desc() to use the 'options' field. 
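For reference, a hedged sketch of what a multi-buffer aware consumer is expected to do with this bit; the helpers are the xsk_ring_cons accessors from libxdp's xsk.h, socket/umem setup and fill-ring refill are omitted, and handle_frag()/packet_done() are placeholders:

/* Consumer-side sketch (not from the kernel tree): walk a batch of Rx
 * descriptors and treat a cleared XDP_PKT_CONTD bit as end-of-packet.
 */
#include <linux/if_xdp.h>
#include <xdp/xsk.h>		/* xsk_ring_cons__* helpers from libxdp */

void handle_frag(void *data, __u32 len)	/* application-specific placeholder */
{
	(void)data; (void)len;
}

void packet_done(void)			/* application-specific placeholder */
{
}

void rx_batch(struct xsk_ring_cons *rx, void *umem_area)
{
	__u32 idx, i, rcvd;

	rcvd = xsk_ring_cons__peek(rx, 64, &idx);	/* batch size is arbitrary */
	for (i = 0; i < rcvd; i++) {
		const struct xdp_desc *desc = xsk_ring_cons__rx_desc(rx, idx + i);

		handle_frag(xsk_umem__get_data(umem_area, desc->addr), desc->len);
		if (!(desc->options & XDP_PKT_CONTD))
			packet_done();		/* EOP: continuity bit cleared */
	}
	xsk_ring_cons__release(rx, rcvd);
}

An application that never looks at 'options' keeps working for single-buffer traffic, since those descriptors still carry 0 there.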
Signed-off-by: Tirthendu Sarkar Link: https://lore.kernel.org/r/20230719132421.584801-2-maciej.fijalkowski@intel.com Signed-off-by: Alexei Starovoitov --- include/uapi/linux/if_xdp.h | 7 +++++++ net/xdp/xsk.c | 8 ++++---- net/xdp/xsk_queue.h | 12 +++++++++--- 3 files changed, 20 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index a78a8096f4ce..434f313dc26c 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -108,4 +108,11 @@ struct xdp_desc { /* UMEM descriptor is __u64 */ +/* Flag indicating that the packet continues with the buffer pointed out by the + * next frame in the ring. The end of the packet is signalled by setting this + * bit to zero. For single buffer packets, every descriptor has 'options' set + * to 0 and this maintains backward compatibility. + */ +#define XDP_PKT_CONTD (1 << 0) + #endif /* _LINUX_IF_XDP_H */ diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 31dca4ecb2c5..914a80cd55d3 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -135,14 +135,14 @@ int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool, return 0; } -static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) +static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len, u32 flags) { struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); u64 addr; int err; addr = xp_get_handle(xskb); - err = xskq_prod_reserve_desc(xs->rx, addr, len); + err = xskq_prod_reserve_desc(xs->rx, addr, len, flags); if (err) { xs->rx_queue_full++; return err; @@ -189,7 +189,7 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) } xsk_copy_xdp(xsk_xdp, xdp, len); - err = __xsk_rcv_zc(xs, xsk_xdp, len); + err = __xsk_rcv_zc(xs, xsk_xdp, len, 0); if (err) { xsk_buff_free(xsk_xdp); return err; @@ -259,7 +259,7 @@ static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) { len = xdp->data_end - xdp->data; - return __xsk_rcv_zc(xs, xdp, len); + return __xsk_rcv_zc(xs, xdp, len, 0); } err = __xsk_rcv(xs, xdp); diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 6d40a77fccbe..2f1aacae81af 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -130,6 +130,11 @@ static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr) return false; } +static inline bool xp_unused_options_set(u32 options) +{ + return options & ~XDP_PKT_CONTD; +} + static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) { @@ -141,7 +146,7 @@ static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool, if (desc->addr >= pool->addrs_cnt) return false; - if (desc->options) + if (xp_unused_options_set(desc->options)) return false; return true; } @@ -158,7 +163,7 @@ static inline bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool, xp_desc_crosses_non_contig_pg(pool, addr, desc->len)) return false; - if (desc->options) + if (xp_unused_options_set(desc->options)) return false; return true; } @@ -360,7 +365,7 @@ static inline void xskq_prod_write_addr_batch(struct xsk_queue *q, struct xdp_de } static inline int xskq_prod_reserve_desc(struct xsk_queue *q, - u64 addr, u32 len) + u64 addr, u32 len, u32 flags) { struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; u32 idx; @@ -372,6 +377,7 @@ static inline int xskq_prod_reserve_desc(struct xsk_queue *q, idx = q->cached_prod++ & q->ring_mask; ring->desc[idx].addr = addr; ring->desc[idx].len = len; + 
ring->desc[idx].options = flags; return 0; } -- cgit v1.2.3 From 81470b5c3c6649eef8e5f282cd06793f788ae165 Mon Sep 17 00:00:00 2001 From: Tirthendu Sarkar Date: Wed, 19 Jul 2023 15:23:59 +0200 Subject: xsk: introduce XSK_USE_SG bind flag for xsk socket As of now xsk core drops any xdp_buff with data size greater than the xsk frame_size as set by the af_xdp application. With multi-buffer support introduced in the next patch xsk core can now split those buffers into multiple descriptors provided the af_xdp application can handle them. Such capability of the application needs to be independent of the xdp_prog's frag support capability since there are cases where even a single xdp_buffer may need to be split into multiple descriptors owing to a smaller xsk frame size. For e.g., with NIC rx_buffer size set to 4kB, a 3kB packet will constitute of a single buffer and so will be sent as such to AF_XDP layer irrespective of 'xdp.frags' capability of the XDP program. Now if the xsk frame size is set to 2kB by the AF_XDP application, then the packet will need to be split into 2 descriptors if AF_XDP application can handle multi-buffer, else it needs to be dropped. Applications can now advertise their frag handling capability to xsk core so that xsk core can decide if it should drop or split xdp_buffs that exceed xsk frame size. This is done using a new 'XSK_USE_SG' bind flag for the xdp socket. Signed-off-by: Tirthendu Sarkar Link: https://lore.kernel.org/r/20230719132421.584801-3-maciej.fijalkowski@intel.com Signed-off-by: Alexei Starovoitov --- include/net/xdp_sock.h | 1 + include/uapi/linux/if_xdp.h | 6 ++++++ net/xdp/xsk.c | 5 +++-- 3 files changed, 10 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index e96a1151ec75..36b0411a0d1b 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -52,6 +52,7 @@ struct xdp_sock { struct xsk_buff_pool *pool; u16 queue_id; bool zc; + bool sg; enum { XSK_READY = 0, XSK_BOUND, diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index 434f313dc26c..8d48863472b9 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -25,6 +25,12 @@ * application. */ #define XDP_USE_NEED_WAKEUP (1 << 3) +/* By setting this option, userspace application indicates that it can + * handle multiple descriptors per packet thus enabling AF_XDP to split + * multi-buffer XDP frames into multiple Rx descriptors. Without this set + * such frames will be dropped. + */ +#define XDP_USE_SG (1 << 4) /* Flags for xsk_umem_config flags */ #define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 914a80cd55d3..7b709e4e7ec4 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -897,7 +897,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) flags = sxdp->sxdp_flags; if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY | - XDP_USE_NEED_WAKEUP)) + XDP_USE_NEED_WAKEUP | XDP_USE_SG)) return -EINVAL; bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); @@ -929,7 +929,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) struct socket *sock; if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) || - (flags & XDP_USE_NEED_WAKEUP)) { + (flags & XDP_USE_NEED_WAKEUP) || (flags & XDP_USE_SG)) { /* Cannot specify flags for shared sockets. 
*/ err = -EINVAL; goto out_unlock; @@ -1028,6 +1028,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) xs->dev = dev; xs->zc = xs->umem->zc; + xs->sg = !!(flags & XDP_USE_SG); xs->queue_id = qid; xp_add_xsk(xs->pool, xs); -- cgit v1.2.3 From 556444c4e683e73531766d4a577b8b94c42ac160 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 19 Jul 2023 15:24:00 +0200 Subject: xsk: prepare both copy and zero-copy modes to co-exist Currently, __xsk_rcv_zc() is a function that is responsible for producing AF_XDP Rx descriptors. It is used by both copy and zero-copy mode. Both of these modes are going to differ when multi-buffer support is going to be added. ZC will work on a chain of xdp_buff_xsk structs whereas copy-mode is going to utilize skb_shared_info contents. This means that ZC-specific changes would affect the copy mode. Let's modify __xsk_rcv_zc() to work directly on xdp_buff_xsk so the callsites have to retrieve this from xdp_buff. Also, introduce xsk_rcv_zc() which will carry all the needed later changes for supporting multi-buffer on ZC side that do not apply to copy mode. Signed-off-by: Maciej Fijalkowski Link: https://lore.kernel.org/r/20230719132421.584801-4-maciej.fijalkowski@intel.com Signed-off-by: Alexei Starovoitov --- net/xdp/xsk.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 7b709e4e7ec4..029a092582da 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -135,9 +135,9 @@ int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool, return 0; } -static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len, u32 flags) +static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff_xsk *xskb, u32 len, + u32 flags) { - struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); u64 addr; int err; @@ -152,6 +152,13 @@ static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len, u32 return 0; } +static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) +{ + struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); + + return __xsk_rcv_zc(xs, xskb, len, 0); +} + static void xsk_copy_xdp(struct xdp_buff *to, struct xdp_buff *from, u32 len) { void *from_buf, *to_buf; @@ -172,6 +179,7 @@ static void xsk_copy_xdp(struct xdp_buff *to, struct xdp_buff *from, u32 len) static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { + struct xdp_buff_xsk *xskb; struct xdp_buff *xsk_xdp; int err; u32 len; @@ -189,7 +197,8 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) } xsk_copy_xdp(xsk_xdp, xdp, len); - err = __xsk_rcv_zc(xs, xsk_xdp, len, 0); + xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp); + err = __xsk_rcv_zc(xs, xskb, len, 0); if (err) { xsk_buff_free(xsk_xdp); return err; @@ -259,7 +268,7 @@ static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) { len = xdp->data_end - xdp->data; - return __xsk_rcv_zc(xs, xdp, len, 0); + return xsk_rcv_zc(xs, xdp, len); } err = __xsk_rcv(xs, xdp); -- cgit v1.2.3 From faa91b839b09b15d6f9655fc866fa433bb65454c Mon Sep 17 00:00:00 2001 From: Tirthendu Sarkar Date: Wed, 19 Jul 2023 15:24:01 +0200 Subject: xsk: move xdp_buff's data length check to xsk_rcv_check If the data in xdp_buff exceeds the xsk frame length, the packet needs to be dropped. This check is currently being done in __xsk_rcv(). 
Move the described logic to xsk_rcv_check() so that such a xdp_buff will only be dropped if the application does not support multi-buffer (absence of XDP_USE_SG bind flag). This is applicable for all cases: copy mode, zero copy mode as well as skb mode. Signed-off-by: Tirthendu Sarkar Link: https://lore.kernel.org/r/20230719132421.584801-5-maciej.fijalkowski@intel.com Signed-off-by: Alexei Starovoitov --- net/xdp/xsk.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 029a092582da..a968ac506f4a 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -177,18 +177,11 @@ static void xsk_copy_xdp(struct xdp_buff *to, struct xdp_buff *from, u32 len) memcpy(to_buf, from_buf, len + metalen); } -static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { struct xdp_buff_xsk *xskb; struct xdp_buff *xsk_xdp; int err; - u32 len; - - len = xdp->data_end - xdp->data; - if (len > xsk_pool_get_rx_frame_size(xs->pool)) { - xs->rx_dropped++; - return -ENOSPC; - } xsk_xdp = xsk_buff_alloc(xs->pool); if (!xsk_xdp) { @@ -224,7 +217,7 @@ static bool xsk_is_bound(struct xdp_sock *xs) return false; } -static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp) +static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { if (!xsk_is_bound(xs)) return -ENXIO; @@ -232,6 +225,11 @@ static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp) if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) return -EINVAL; + if (len > xsk_pool_get_rx_frame_size(xs->pool)) { + xs->rx_dropped++; + return -ENOSPC; + } + sk_mark_napi_id_once_xdp(&xs->sk, xdp); return 0; } @@ -245,12 +243,13 @@ static void xsk_flush(struct xdp_sock *xs) int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { + u32 len = xdp_get_buff_len(xdp); int err; spin_lock_bh(&xs->rx_lock); - err = xsk_rcv_check(xs, xdp); + err = xsk_rcv_check(xs, xdp, len); if (!err) { - err = __xsk_rcv(xs, xdp); + err = __xsk_rcv(xs, xdp, len); xsk_flush(xs); } spin_unlock_bh(&xs->rx_lock); @@ -259,10 +258,10 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { + u32 len = xdp_get_buff_len(xdp); int err; - u32 len; - err = xsk_rcv_check(xs, xdp); + err = xsk_rcv_check(xs, xdp, len); if (err) return err; @@ -271,7 +270,7 @@ static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) return xsk_rcv_zc(xs, xdp, len); } - err = __xsk_rcv(xs, xdp); + err = __xsk_rcv(xs, xdp, len); if (!err) xdp_return_buff(xdp); return err; -- cgit v1.2.3 From 804627751b4281dd95148e7564759145da67855e Mon Sep 17 00:00:00 2001 From: Tirthendu Sarkar Date: Wed, 19 Jul 2023 15:24:02 +0200 Subject: xsk: add support for AF_XDP multi-buffer on Rx path Add multi-buffer support for AF_XDP by extending the XDP multi-buffer support to be reflected in user-space when a packet is redirected to an AF_XDP socket. In the XDP implementation, the NIC driver builds the xdp_buff from the first frag of the packet and adds any subsequent frags in the skb_shinfo area of the xdp_buff. In AF_XDP core, XDP buffers are allocated from xdp_sock's pool and data is copied from the driver's xdp_buff and frags. Once an allocated XDP buffer is full and there is still data to be copied, the 'XDP_PKT_CONTD' flag in'options' field of the corresponding xdp ring descriptor is set and passed to the application. 
When application sees the aforementioned flag set it knows there is pending data for this packet that will be carried in the following descriptors. If there is no more data to be copied, the flag in 'options' field is cleared for that descriptor signalling EOP to the application. If application reads a batch of descriptors using for example the libxdp interfaces, it is not guaranteed that the batch will end with a full packet. It might end in the middle of a packet and the rest of the frames of that packet will arrive at the beginning of the next batch. AF_XDP ensures that only a complete packet (along with all its frags) is sent to application. Signed-off-by: Tirthendu Sarkar Link: https://lore.kernel.org/r/20230719132421.584801-6-maciej.fijalkowski@intel.com Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 7 +--- net/xdp/xsk.c | 110 ++++++++++++++++++++++++++++++++++++++++++------------ 2 files changed, 88 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 06ba0e56e369..b4410dc841a0 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4345,13 +4345,8 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); enum bpf_map_type map_type = ri->map_type; - if (map_type == BPF_MAP_TYPE_XSKMAP) { - /* XDP_REDIRECT is not supported AF_XDP yet. */ - if (unlikely(xdp_buff_has_frags(xdp))) - return -EOPNOTSUPP; - + if (map_type == BPF_MAP_TYPE_XSKMAP) return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog); - } return __xdp_do_redirect_frame(ri, dev, xdp_convert_buff_to_frame(xdp), xdp_prog); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index a968ac506f4a..01c134b1186e 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -159,43 +159,107 @@ static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) return __xsk_rcv_zc(xs, xskb, len, 0); } -static void xsk_copy_xdp(struct xdp_buff *to, struct xdp_buff *from, u32 len) +static void *xsk_copy_xdp_start(struct xdp_buff *from) { - void *from_buf, *to_buf; - u32 metalen; + if (unlikely(xdp_data_meta_unsupported(from))) + return from->data; + else + return from->data_meta; +} - if (unlikely(xdp_data_meta_unsupported(from))) { - from_buf = from->data; - to_buf = to->data; - metalen = 0; - } else { - from_buf = from->data_meta; - metalen = from->data - from->data_meta; - to_buf = to->data - metalen; - } +static u32 xsk_copy_xdp(void *to, void **from, u32 to_len, + u32 *from_len, skb_frag_t **frag, u32 rem) +{ + u32 copied = 0; + + while (1) { + u32 copy_len = min_t(u32, *from_len, to_len); + + memcpy(to, *from, copy_len); + copied += copy_len; + if (rem == copied) + return copied; + + if (*from_len == copy_len) { + *from = skb_frag_address(*frag); + *from_len = skb_frag_size((*frag)++); + } else { + *from += copy_len; + *from_len -= copy_len; + } + if (to_len == copy_len) + return copied; - memcpy(to_buf, from_buf, len + metalen); + to_len -= copy_len; + to += copy_len; + } } static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { + u32 frame_size = xsk_pool_get_rx_frame_size(xs->pool); + void *copy_from = xsk_copy_xdp_start(xdp), *copy_to; + u32 from_len, meta_len, rem, num_desc; struct xdp_buff_xsk *xskb; struct xdp_buff *xsk_xdp; - int err; + skb_frag_t *frag; + + from_len = xdp->data_end - copy_from; + meta_len = xdp->data - copy_from; + rem = len + meta_len; + + if (len <= frame_size && !xdp_buff_has_frags(xdp)) { + int err; - xsk_xdp = xsk_buff_alloc(xs->pool); - if (!xsk_xdp) { + 
xsk_xdp = xsk_buff_alloc(xs->pool); + if (!xsk_xdp) { + xs->rx_dropped++; + return -ENOMEM; + } + memcpy(xsk_xdp->data - meta_len, copy_from, rem); + xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp); + err = __xsk_rcv_zc(xs, xskb, len, 0); + if (err) { + xsk_buff_free(xsk_xdp); + return err; + } + + return 0; + } + + num_desc = (len - 1) / frame_size + 1; + + if (!xsk_buff_can_alloc(xs->pool, num_desc)) { xs->rx_dropped++; return -ENOMEM; } + if (xskq_prod_nb_free(xs->rx, num_desc) < num_desc) { + xs->rx_queue_full++; + return -ENOBUFS; + } - xsk_copy_xdp(xsk_xdp, xdp, len); - xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp); - err = __xsk_rcv_zc(xs, xskb, len, 0); - if (err) { - xsk_buff_free(xsk_xdp); - return err; + if (xdp_buff_has_frags(xdp)) { + struct skb_shared_info *sinfo; + + sinfo = xdp_get_shared_info_from_buff(xdp); + frag = &sinfo->frags[0]; } + + do { + u32 to_len = frame_size + meta_len; + u32 copied; + + xsk_xdp = xsk_buff_alloc(xs->pool); + copy_to = xsk_xdp->data - meta_len; + + copied = xsk_copy_xdp(copy_to, ©_from, to_len, &from_len, &frag, rem); + rem -= copied; + + xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp); + __xsk_rcv_zc(xs, xskb, copied - meta_len, rem ? XDP_PKT_CONTD : 0); + meta_len = 0; + } while (rem); + return 0; } @@ -225,7 +289,7 @@ static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) return -EINVAL; - if (len > xsk_pool_get_rx_frame_size(xs->pool)) { + if (len > xsk_pool_get_rx_frame_size(xs->pool) && !xs->sg) { xs->rx_dropped++; return -ENOSPC; } -- cgit v1.2.3 From b7f72a30e9ac2555b05afc6cfddc9dbc98e1eb8d Mon Sep 17 00:00:00 2001 From: Tirthendu Sarkar Date: Wed, 19 Jul 2023 15:24:03 +0200 Subject: xsk: introduce wrappers and helpers for supporting multi-buffer in Tx path In Tx path, xsk core reserves space for each desc to be transmitted in the completion queue and it's address contained in it is stored in the skb destructor arg. After successful transmission the skb destructor submits the addr marking completion. To handle multiple descriptors per packet, now along with reserving space for each descriptor, the corresponding address is also stored in completion queue. The number of pending descriptors are stored in skb destructor arg and is used by the skb destructor to update completions. Introduce 'skb' in xdp_sock to store a partially built packet when __xsk_generic_xmit() must return before it sees the EOP descriptor for the current packet so that packet building can resume in next call of __xsk_generic_xmit(). Helper functions are introduced to set and get the pending descriptors in the skb destructor arg. Also, wrappers are introduced for storing descriptor addresses, submitting and cancelling (for unsuccessful transmissions) the number of completions. 
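The destructor-arg bookkeeping boils down to treating a pointer-sized field as a plain integer; a toy userspace illustration of the trick (not kernel code, all names made up):

#include <assert.h>

struct fake_skb {
	void *destructor_arg;	/* stands in for skb_shinfo(skb)->destructor_arg */
};

static unsigned long get_num_desc(const struct fake_skb *skb)
{
	return skb ? (unsigned long)skb->destructor_arg : 0;
}

static void account_one_desc(struct fake_skb *skb)
{
	skb->destructor_arg = (void *)(get_num_desc(skb) + 1);
}

int main(void)
{
	struct fake_skb skb = { 0 };

	account_one_desc(&skb);	/* first descriptor of the packet */
	account_one_desc(&skb);	/* middle frag */
	account_one_desc(&skb);	/* EOP frag */
	assert(get_num_desc(&skb) == 3);	/* destructor submits 3 completions */
	return 0;
}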
Signed-off-by: Tirthendu Sarkar Link: https://lore.kernel.org/r/20230719132421.584801-7-maciej.fijalkowski@intel.com Signed-off-by: Alexei Starovoitov --- include/net/xdp_sock.h | 6 ++++ net/xdp/xsk.c | 74 ++++++++++++++++++++++++++++++++++++-------------- net/xdp/xsk_queue.h | 19 +++++-------- 3 files changed, 67 insertions(+), 32 deletions(-) (limited to 'net') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 36b0411a0d1b..1617af380162 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -68,6 +68,12 @@ struct xdp_sock { u64 rx_dropped; u64 rx_queue_full; + /* When __xsk_generic_xmit() must return before it sees the EOP descriptor for the current + * packet, the partially built skb is saved here so that packet building can resume in next + * call of __xsk_generic_xmit(). + */ + struct sk_buff *skb; + struct list_head map_list; /* Protects map_list */ spinlock_t map_list_lock; diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 01c134b1186e..ed7f86300087 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -480,19 +480,65 @@ static int xsk_wakeup(struct xdp_sock *xs, u8 flags) return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags); } -static void xsk_destruct_skb(struct sk_buff *skb) +static int xsk_cq_reserve_addr_locked(struct xdp_sock *xs, u64 addr) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&xs->pool->cq_lock, flags); + ret = xskq_prod_reserve_addr(xs->pool->cq, addr); + spin_unlock_irqrestore(&xs->pool->cq_lock, flags); + + return ret; +} + +static void xsk_cq_submit_locked(struct xdp_sock *xs, u32 n) +{ + unsigned long flags; + + spin_lock_irqsave(&xs->pool->cq_lock, flags); + xskq_prod_submit_n(xs->pool->cq, n); + spin_unlock_irqrestore(&xs->pool->cq_lock, flags); +} + +static void xsk_cq_cancel_locked(struct xdp_sock *xs, u32 n) { - u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg; - struct xdp_sock *xs = xdp_sk(skb->sk); unsigned long flags; spin_lock_irqsave(&xs->pool->cq_lock, flags); - xskq_prod_submit_addr(xs->pool->cq, addr); + xskq_prod_cancel_n(xs->pool->cq, n); spin_unlock_irqrestore(&xs->pool->cq_lock, flags); +} + +static u32 xsk_get_num_desc(struct sk_buff *skb) +{ + return skb ? (long)skb_shinfo(skb)->destructor_arg : 0; +} +static void xsk_destruct_skb(struct sk_buff *skb) +{ + xsk_cq_submit_locked(xdp_sk(skb->sk), xsk_get_num_desc(skb)); sock_wfree(skb); } +static void xsk_set_destructor_arg(struct sk_buff *skb) +{ + long num = xsk_get_num_desc(xdp_sk(skb->sk)->skb) + 1; + + skb_shinfo(skb)->destructor_arg = (void *)num; +} + +static void xsk_consume_skb(struct sk_buff *skb) +{ + struct xdp_sock *xs = xdp_sk(skb->sk); + + skb->destructor = sock_wfree; + xsk_cq_cancel_locked(xs, xsk_get_num_desc(skb)); + /* Free skb without triggering the perf drop trace */ + consume_skb(skb); + xs->skb = NULL; +} + static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, struct xdp_desc *desc) { @@ -578,8 +624,8 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, skb->dev = dev; skb->priority = xs->sk.sk_priority; skb->mark = xs->sk.sk_mark; - skb_shinfo(skb)->destructor_arg = (void *)(long)desc->addr; skb->destructor = xsk_destruct_skb; + xsk_set_destructor_arg(skb); return skb; } @@ -591,7 +637,6 @@ static int __xsk_generic_xmit(struct sock *sk) bool sent_frame = false; struct xdp_desc desc; struct sk_buff *skb; - unsigned long flags; int err = 0; mutex_lock(&xs->mutex); @@ -616,31 +661,20 @@ static int __xsk_generic_xmit(struct sock *sk) * if there is space in it. 
This avoids having to implement * any buffering in the Tx path. */ - spin_lock_irqsave(&xs->pool->cq_lock, flags); - if (xskq_prod_reserve(xs->pool->cq)) { - spin_unlock_irqrestore(&xs->pool->cq_lock, flags); + if (xsk_cq_reserve_addr_locked(xs, desc.addr)) goto out; - } - spin_unlock_irqrestore(&xs->pool->cq_lock, flags); skb = xsk_build_skb(xs, &desc); if (IS_ERR(skb)) { err = PTR_ERR(skb); - spin_lock_irqsave(&xs->pool->cq_lock, flags); - xskq_prod_cancel(xs->pool->cq); - spin_unlock_irqrestore(&xs->pool->cq_lock, flags); + xsk_cq_cancel_locked(xs, 1); goto out; } err = __dev_direct_xmit(skb, xs->queue_id); if (err == NETDEV_TX_BUSY) { /* Tell user-space to retry the send */ - skb->destructor = sock_wfree; - spin_lock_irqsave(&xs->pool->cq_lock, flags); - xskq_prod_cancel(xs->pool->cq); - spin_unlock_irqrestore(&xs->pool->cq_lock, flags); - /* Free skb without triggering the perf drop trace */ - consume_skb(skb); + xsk_consume_skb(skb); err = -EAGAIN; goto out; } diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 2f1aacae81af..423ad7368fa2 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -297,6 +297,11 @@ static inline void xskq_cons_release(struct xsk_queue *q) q->cached_cons++; } +static inline void xskq_cons_cancel_n(struct xsk_queue *q, u32 cnt) +{ + q->cached_cons -= cnt; +} + static inline u32 xskq_cons_present_entries(struct xsk_queue *q) { /* No barriers needed since data is not accessed */ @@ -324,9 +329,9 @@ static inline bool xskq_prod_is_full(struct xsk_queue *q) return xskq_prod_nb_free(q, 1) ? false : true; } -static inline void xskq_prod_cancel(struct xsk_queue *q) +static inline void xskq_prod_cancel_n(struct xsk_queue *q, u32 cnt) { - q->cached_prod--; + q->cached_prod -= cnt; } static inline int xskq_prod_reserve(struct xsk_queue *q) @@ -392,16 +397,6 @@ static inline void xskq_prod_submit(struct xsk_queue *q) __xskq_prod_submit(q, q->cached_prod); } -static inline void xskq_prod_submit_addr(struct xsk_queue *q, u64 addr) -{ - struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; - u32 idx = q->ring->producer; - - ring->desc[idx++ & q->ring_mask] = addr; - - __xskq_prod_submit(q, idx); -} - static inline void xskq_prod_submit_n(struct xsk_queue *q, u32 nb_entries) { __xskq_prod_submit(q, q->ring->producer + nb_entries); -- cgit v1.2.3 From cf24f5a5feeaae34c1a34d1e04f8ac697290427a Mon Sep 17 00:00:00 2001 From: Tirthendu Sarkar Date: Wed, 19 Jul 2023 15:24:05 +0200 Subject: xsk: add support for AF_XDP multi-buffer on Tx path For transmitting an AF_XDP packet, allocate skb while processing the first desc and copy data to it. The 'XDP_PKT_CONTD' flag in 'options' field of the desc indicates the EOP status of the packet. If the current desc is not EOP, store the skb, release the current desc and go on to read the next descs. Allocate a page for each subsequent desc, copy data to it and add it as a frag in the skb stored in xsk. On processing EOP, transmit the skb with frags. Addresses contained in descs have been already queued in consumer queue and skb destructor updated the completion count. On transmit failure cancel the releases, clear the descs from the completion queue and consume the skb for retrying packet transmission. For any invalid descriptor (invalid length/address/options) in the middle of a packet, all pending descriptors will be dropped by xsk core along with the invalid one and the next descriptor is treated as the start of a new packet. Maximum supported frames for a packet is MAX_SKB_FRAGS + 1. 
If it is exceeded, all descriptors accumulated so far are dropped. Signed-off-by: Tirthendu Sarkar Link: https://lore.kernel.org/r/20230719132421.584801-9-maciej.fijalkowski@intel.com Signed-off-by: Alexei Starovoitov --- net/xdp/xsk.c | 120 ++++++++++++++++++++++++++++++++++++++++------------ net/xdp/xsk_queue.h | 13 +++--- 2 files changed, 100 insertions(+), 33 deletions(-) (limited to 'net') diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index ed7f86300087..ba755fed3750 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -393,7 +393,8 @@ bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) rcu_read_lock(); list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) { if (!xskq_cons_peek_desc(xs->tx, desc, pool)) { - xs->tx->queue_empty_descs++; + if (xskq_has_descs(xs->tx)) + xskq_cons_release(xs->tx); continue; } @@ -539,24 +540,32 @@ static void xsk_consume_skb(struct sk_buff *skb) xs->skb = NULL; } +static void xsk_drop_skb(struct sk_buff *skb) +{ + xdp_sk(skb->sk)->tx->invalid_descs += xsk_get_num_desc(skb); + xsk_consume_skb(skb); +} + static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, struct xdp_desc *desc) { struct xsk_buff_pool *pool = xs->pool; u32 hr, len, ts, offset, copy, copied; - struct sk_buff *skb; + struct sk_buff *skb = xs->skb; struct page *page; void *buffer; int err, i; u64 addr; - hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom)); + if (!skb) { + hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom)); - skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err); - if (unlikely(!skb)) - return ERR_PTR(err); + skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err); + if (unlikely(!skb)) + return ERR_PTR(err); - skb_reserve(skb, hr); + skb_reserve(skb, hr); + } addr = desc->addr; len = desc->len; @@ -566,7 +575,10 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, offset = offset_in_page(buffer); addr = buffer - pool->addrs; - for (copied = 0, i = 0; copied < len; i++) { + for (copied = 0, i = skb_shinfo(skb)->nr_frags; copied < len; i++) { + if (unlikely(i >= MAX_SKB_FRAGS)) + return ERR_PTR(-EFAULT); + page = pool->umem->pgs[addr >> PAGE_SHIFT]; get_page(page); @@ -591,33 +603,56 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, struct xdp_desc *desc) { struct net_device *dev = xs->dev; - struct sk_buff *skb; + struct sk_buff *skb = xs->skb; + int err; if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) { skb = xsk_build_skb_zerocopy(xs, desc); - if (IS_ERR(skb)) - return skb; + if (IS_ERR(skb)) { + err = PTR_ERR(skb); + goto free_err; + } } else { u32 hr, tr, len; void *buffer; - int err; - hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom)); - tr = dev->needed_tailroom; + buffer = xsk_buff_raw_get_data(xs->pool, desc->addr); len = desc->len; - skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err); - if (unlikely(!skb)) - return ERR_PTR(err); + if (!skb) { + hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom)); + tr = dev->needed_tailroom; + skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err); + if (unlikely(!skb)) + goto free_err; - skb_reserve(skb, hr); - skb_put(skb, len); + skb_reserve(skb, hr); + skb_put(skb, len); - buffer = xsk_buff_raw_get_data(xs->pool, desc->addr); - err = skb_store_bits(skb, 0, buffer, len); - if (unlikely(err)) { - kfree_skb(skb); - return ERR_PTR(err); + err = skb_store_bits(skb, 0, buffer, len); + if (unlikely(err)) + goto free_err; + } else { + int nr_frags = skb_shinfo(skb)->nr_frags; + struct page *page; + u8 *vaddr; + + if (unlikely(nr_frags == 
(MAX_SKB_FRAGS - 1) && xp_mb_desc(desc))) { + err = -EFAULT; + goto free_err; + } + + page = alloc_page(xs->sk.sk_allocation); + if (unlikely(!page)) { + err = -EAGAIN; + goto free_err; + } + + vaddr = kmap_local_page(page); + memcpy(vaddr, buffer, len); + kunmap_local(vaddr); + + skb_add_rx_frag(skb, nr_frags, page, 0, len, 0); } } @@ -628,6 +663,17 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, xsk_set_destructor_arg(skb); return skb; + +free_err: + if (err == -EAGAIN) { + xsk_cq_cancel_locked(xs, 1); + } else { + xsk_set_destructor_arg(skb); + xsk_drop_skb(skb); + xskq_cons_release(xs->tx); + } + + return ERR_PTR(err); } static int __xsk_generic_xmit(struct sock *sk) @@ -667,30 +713,45 @@ static int __xsk_generic_xmit(struct sock *sk) skb = xsk_build_skb(xs, &desc); if (IS_ERR(skb)) { err = PTR_ERR(skb); - xsk_cq_cancel_locked(xs, 1); - goto out; + if (err == -EAGAIN) + goto out; + err = 0; + continue; + } + + xskq_cons_release(xs->tx); + + if (xp_mb_desc(&desc)) { + xs->skb = skb; + continue; } err = __dev_direct_xmit(skb, xs->queue_id); if (err == NETDEV_TX_BUSY) { /* Tell user-space to retry the send */ + xskq_cons_cancel_n(xs->tx, xsk_get_num_desc(skb)); xsk_consume_skb(skb); err = -EAGAIN; goto out; } - xskq_cons_release(xs->tx); /* Ignore NET_XMIT_CN as packet might have been sent */ if (err == NET_XMIT_DROP) { /* SKB completed but not sent */ err = -EBUSY; + xs->skb = NULL; goto out; } sent_frame = true; + xs->skb = NULL; } - xs->tx->queue_empty_descs++; + if (xskq_has_descs(xs->tx)) { + if (xs->skb) + xsk_drop_skb(xs->skb); + xskq_cons_release(xs->tx); + } out: if (sent_frame) @@ -940,6 +1001,9 @@ static int xsk_release(struct socket *sock) net = sock_net(sk); + if (xs->skb) + xsk_drop_skb(xs->skb); + mutex_lock(&net->xdp.lock); sk_del_node_init_rcu(sk); mutex_unlock(&net->xdp.lock); diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 423ad7368fa2..5752c9fd8ce7 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -175,6 +175,11 @@ static inline bool xp_validate_desc(struct xsk_buff_pool *pool, xp_aligned_validate_desc(pool, desc); } +static inline bool xskq_has_descs(struct xsk_queue *q) +{ + return q->cached_cons != q->cached_prod; +} + static inline bool xskq_cons_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d, struct xsk_buff_pool *pool) @@ -190,17 +195,15 @@ static inline bool xskq_cons_read_desc(struct xsk_queue *q, struct xdp_desc *desc, struct xsk_buff_pool *pool) { - while (q->cached_cons != q->cached_prod) { + if (q->cached_cons != q->cached_prod) { struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; u32 idx = q->cached_cons & q->ring_mask; *desc = ring->desc[idx]; - if (xskq_cons_is_valid_desc(q, desc, pool)) - return true; - - q->cached_cons++; + return xskq_cons_is_valid_desc(q, desc, pool); } + q->queue_empty_descs++; return false; } -- cgit v1.2.3 From 07428da9e25a5dfae7252cd554c90557f9086a73 Mon Sep 17 00:00:00 2001 From: Tirthendu Sarkar Date: Wed, 19 Jul 2023 15:24:06 +0200 Subject: xsk: discard zero length descriptors in Tx path Descriptors with zero length are not supported by many NICs. To preserve uniform behavior discard any zero length desc as invvalid desc. 
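Putting the Tx-side rules together (bind with XDP_USE_SG, non-zero length on every descriptor, XDP_PKT_CONTD on everything but the last frag, frag count below the advertised limit), here is a hedged producer sketch queueing one multi-descriptor packet; the xsk_ring_prod helpers are libxdp's, and addrs/lens stand for umem chunks the application already filled:

#include <linux/if_xdp.h>
#include <xdp/xsk.h>		/* xsk_ring_prod__* helpers from libxdp */

int tx_one_packet(struct xsk_ring_prod *tx, const __u64 *addrs,
		  const __u32 *lens, unsigned int nfrags)
{
	__u32 idx, i;

	if (xsk_ring_prod__reserve(tx, nfrags, &idx) != nfrags)
		return -1;		/* not enough free Tx slots, retry later */

	for (i = 0; i < nfrags; i++) {
		struct xdp_desc *desc = xsk_ring_prod__tx_desc(tx, idx + i);

		desc->addr = addrs[i];
		desc->len = lens[i];	/* must be non-zero */
		desc->options = (i + 1 < nfrags) ? XDP_PKT_CONTD : 0;
	}
	xsk_ring_prod__submit(tx, nfrags);
	/* the application still kicks the kernel (e.g. via sendto()) as before */
	return 0;
}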
Signed-off-by: Tirthendu Sarkar Signed-off-by: Maciej Fijalkowski Link: https://lore.kernel.org/r/20230719132421.584801-10-maciej.fijalkowski@intel.com Signed-off-by: Alexei Starovoitov --- net/xdp/xsk_queue.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 5752c9fd8ce7..bac32027f865 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -140,6 +140,9 @@ static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool, { u64 offset = desc->addr & (pool->chunk_size - 1); + if (!desc->len) + return false; + if (offset + desc->len > pool->chunk_size) return false; @@ -156,6 +159,9 @@ static inline bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool, { u64 addr = xp_unaligned_add_offset_to_addr(desc->addr); + if (!desc->len) + return false; + if (desc->len > pool->chunk_size) return false; -- cgit v1.2.3 From 13ce2daa259a3bfbc9a5aeeee8b9a87058703731 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 19 Jul 2023 15:24:07 +0200 Subject: xsk: add new netlink attribute dedicated for ZC max frags Introduce new netlink attribute NETDEV_A_DEV_XDP_ZC_MAX_SEGS that will carry maximum fragments that underlying ZC driver is able to handle on TX side. It is going to be included in netlink response only when driver supports ZC. Any value higher than 1 implies multi-buffer ZC support on underlying device. Signed-off-by: Maciej Fijalkowski Link: https://lore.kernel.org/r/20230719132421.584801-11-maciej.fijalkowski@intel.com Signed-off-by: Alexei Starovoitov --- Documentation/netlink/specs/netdev.yaml | 6 ++++++ include/linux/netdevice.h | 1 + include/uapi/linux/netdev.h | 1 + net/core/dev.c | 1 + net/core/netdev-genl.c | 8 ++++++++ tools/include/uapi/linux/netdev.h | 1 + tools/lib/bpf/libbpf.h | 3 ++- tools/lib/bpf/netlink.c | 5 +++++ 8 files changed, 25 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml index b99e7ffef7a1..e41015310a6e 100644 --- a/Documentation/netlink/specs/netdev.yaml +++ b/Documentation/netlink/specs/netdev.yaml @@ -62,6 +62,12 @@ attribute-sets: type: u64 enum: xdp-act enum-as-flags: true + - + name: xdp_zc_max_segs + doc: max fragment count supported by ZC driver + type: u32 + checks: + min: 1 operations: list: diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b828c7a75be2..b12477ea4032 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2250,6 +2250,7 @@ struct net_device { #define GRO_MAX_SIZE (8 * 65535u) unsigned int gro_max_size; unsigned int gro_ipv4_max_size; + unsigned int xdp_zc_max_segs; rx_handler_func_t __rcu *rx_handler; void __rcu *rx_handler_data; diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index 639524b59930..bf71698a1e82 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -41,6 +41,7 @@ enum { NETDEV_A_DEV_IFINDEX = 1, NETDEV_A_DEV_PAD, NETDEV_A_DEV_XDP_FEATURES, + NETDEV_A_DEV_XDP_ZC_MAX_SEGS, __NETDEV_A_DEV_MAX, NETDEV_A_DEV_MAX = (__NETDEV_A_DEV_MAX - 1) diff --git a/net/core/dev.c b/net/core/dev.c index d6e1b786c5c5..dd4f114a7cbf 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10613,6 +10613,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev_net_set(dev, &init_net); dev->gso_max_size = GSO_LEGACY_MAX_SIZE; + dev->xdp_zc_max_segs = 1; dev->gso_max_segs = GSO_MAX_SEGS; dev->gro_max_size = GRO_LEGACY_MAX_SIZE; dev->gso_ipv4_max_size = 
GSO_LEGACY_MAX_SIZE; diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index a4270fafdf11..65ef4867fc49 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -25,6 +25,14 @@ netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp, return -EINVAL; } + if (netdev->xdp_features & NETDEV_XDP_ACT_XSK_ZEROCOPY) { + if (nla_put_u32(rsp, NETDEV_A_DEV_XDP_ZC_MAX_SEGS, + netdev->xdp_zc_max_segs)) { + genlmsg_cancel(rsp, hdr); + return -EINVAL; + } + } + genlmsg_end(rsp, hdr); return 0; diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index 639524b59930..bf71698a1e82 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -41,6 +41,7 @@ enum { NETDEV_A_DEV_IFINDEX = 1, NETDEV_A_DEV_PAD, NETDEV_A_DEV_XDP_FEATURES, + NETDEV_A_DEV_XDP_ZC_MAX_SEGS, __NETDEV_A_DEV_MAX, NETDEV_A_DEV_MAX = (__NETDEV_A_DEV_MAX - 1) diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 10642ad69d76..674e5788eb10 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -1105,9 +1105,10 @@ struct bpf_xdp_query_opts { __u32 skb_prog_id; /* output */ __u8 attach_mode; /* output */ __u64 feature_flags; /* output */ + __u32 xdp_zc_max_segs; /* output */ size_t :0; }; -#define bpf_xdp_query_opts__last_field feature_flags +#define bpf_xdp_query_opts__last_field xdp_zc_max_segs LIBBPF_API int bpf_xdp_attach(int ifindex, int prog_fd, __u32 flags, const struct bpf_xdp_attach_opts *opts); diff --git a/tools/lib/bpf/netlink.c b/tools/lib/bpf/netlink.c index 84dd5fa14905..090bcf6e3b3d 100644 --- a/tools/lib/bpf/netlink.c +++ b/tools/lib/bpf/netlink.c @@ -45,6 +45,7 @@ struct xdp_id_md { struct xdp_features_md { int ifindex; + __u32 xdp_zc_max_segs; __u64 flags; }; @@ -421,6 +422,9 @@ static int parse_xdp_features(struct nlmsghdr *nh, libbpf_dump_nlmsg_t fn, return NL_CONT; md->flags = libbpf_nla_getattr_u64(tb[NETDEV_A_DEV_XDP_FEATURES]); + if (tb[NETDEV_A_DEV_XDP_ZC_MAX_SEGS]) + md->xdp_zc_max_segs = + libbpf_nla_getattr_u32(tb[NETDEV_A_DEV_XDP_ZC_MAX_SEGS]); return NL_DONE; } @@ -493,6 +497,7 @@ int bpf_xdp_query(int ifindex, int xdp_flags, struct bpf_xdp_query_opts *opts) return libbpf_err(err); opts->feature_flags = md.flags; + opts->xdp_zc_max_segs = md.xdp_zc_max_segs; skip_feature_flags: return 0; -- cgit v1.2.3 From 24ea50127ecf0efe819c1f6230add27abc6ca9d9 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 19 Jul 2023 15:24:08 +0200 Subject: xsk: support mbuf on ZC RX Given that skb_shared_info relies on skb_frag_t, in order to support xskb chaining, introduce xdp_buff_xsk::xskb_list_node and xsk_buff_pool::xskb_list. This is needed so ZC drivers can add frags as xskb nodes which will make it possible to handle it both when producing AF_XDP Rx descriptors as well as freeing/recycling all the frags that a single frame carries. Speaking of latter, update xsk_buff_free() to take care of list nodes. For the former (adding as frags), introduce xsk_buff_add_frag() for ZC drivers usage that is going to be used to add a frag to xskb list from pool. xsk_buff_get_frag() will be utilized by XDP_TX and, on contrary, will return xdp_buff. One of the previous patches added a wrapper for ZC Rx so implement xskb list walk and production of Rx descriptors there. On bind() path, bail out if socket wants to use ZC multi-buffer but underlying netdev does not support it. 
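Before attempting such a bind, userspace can read the limit through the bpf_xdp_query() field added earlier in this series; a hedged sketch follows ("eth0" is an illustrative interface name, error handling is trimmed):

#include <stdio.h>
#include <net/if.h>
#include <bpf/libbpf.h>

int main(void)
{
	LIBBPF_OPTS(bpf_xdp_query_opts, opts);
	int ifindex = if_nametoindex("eth0");

	if (!ifindex || bpf_xdp_query(ifindex, 0, &opts))
		return 1;

	/* xdp_zc_max_segs stays 0 when the kernel did not report the attribute
	 * (no AF_XDP zero-copy), 1 means single-buffer ZC only, and anything
	 * bigger means the driver handles multi-buffer frames in ZC mode.
	 */
	printf("xdp_zc_max_segs: %u\n", opts.xdp_zc_max_segs);
	return 0;
}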
Signed-off-by: Maciej Fijalkowski Link: https://lore.kernel.org/r/20230719132421.584801-12-maciej.fijalkowski@intel.com Signed-off-by: Alexei Starovoitov --- include/net/xdp_sock_drv.h | 44 ++++++++++++++++++++++++++++++++++++++++++++ include/net/xsk_buff_pool.h | 2 ++ net/xdp/xsk.c | 26 +++++++++++++++++++++++++- net/xdp/xsk_buff_pool.c | 7 +++++++ 4 files changed, 78 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 0d34cdb5567d..1f6fc8c7a84c 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -108,10 +108,45 @@ static inline bool xsk_buff_can_alloc(struct xsk_buff_pool *pool, u32 count) static inline void xsk_buff_free(struct xdp_buff *xdp) { struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); + struct list_head *xskb_list = &xskb->pool->xskb_list; + struct xdp_buff_xsk *pos, *tmp; + if (likely(!xdp_buff_has_frags(xdp))) + goto out; + + list_for_each_entry_safe(pos, tmp, xskb_list, xskb_list_node) { + list_del(&pos->xskb_list_node); + xp_free(pos); + } + + xdp_get_shared_info_from_buff(xdp)->nr_frags = 0; +out: xp_free(xskb); } +static inline void xsk_buff_add_frag(struct xdp_buff *xdp) +{ + struct xdp_buff_xsk *frag = container_of(xdp, struct xdp_buff_xsk, xdp); + + list_add_tail(&frag->xskb_list_node, &frag->pool->xskb_list); +} + +static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first) +{ + struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp); + struct xdp_buff *ret = NULL; + struct xdp_buff_xsk *frag; + + frag = list_first_entry_or_null(&xskb->pool->xskb_list, + struct xdp_buff_xsk, xskb_list_node); + if (frag) { + list_del(&frag->xskb_list_node); + ret = &frag->xdp; + } + + return ret; +} + static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size) { xdp->data = xdp->data_hard_start + XDP_PACKET_HEADROOM; @@ -265,6 +300,15 @@ static inline void xsk_buff_free(struct xdp_buff *xdp) { } +static inline void xsk_buff_add_frag(struct xdp_buff *xdp) +{ +} + +static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first) +{ + return NULL; +} + static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size) { } diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index 4dcca163e076..b0bdff26fc88 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -29,6 +29,7 @@ struct xdp_buff_xsk { struct xsk_buff_pool *pool; u64 orig_addr; struct list_head free_list_node; + struct list_head xskb_list_node; }; #define XSK_CHECK_PRIV_TYPE(t) BUILD_BUG_ON(sizeof(t) > offsetofend(struct xdp_buff_xsk, cb)) @@ -54,6 +55,7 @@ struct xsk_buff_pool { struct xdp_umem *umem; struct work_struct work; struct list_head free_list; + struct list_head xskb_list; u32 heads_cnt; u16 queue_id; diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index ba755fed3750..4f1e0599146e 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -155,8 +155,32 @@ static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff_xsk *xskb, u32 len, static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); + u32 frags = xdp_buff_has_frags(xdp); + struct xdp_buff_xsk *pos, *tmp; + struct list_head *xskb_list; + u32 contd = 0; + int err; + + if (frags) + contd = XDP_PKT_CONTD; - return __xsk_rcv_zc(xs, xskb, len, 0); + err = __xsk_rcv_zc(xs, xskb, len, contd); + if (err || likely(!frags)) + goto out; + + xskb_list = &xskb->pool->xskb_list; + 
list_for_each_entry_safe(pos, tmp, xskb_list, xskb_list_node) { + if (list_is_singular(xskb_list)) + contd = 0; + len = pos->xdp.data_end - pos->xdp.data; + err = __xsk_rcv_zc(xs, pos, len, contd); + if (err) + return err; + list_del(&pos->xskb_list_node); + } + +out: + return err; } static void *xsk_copy_xdp_start(struct xdp_buff *from) diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 26f6d304451e..b3f7b310811e 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -86,6 +86,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, pool->umem = umem; pool->addrs = umem->addrs; INIT_LIST_HEAD(&pool->free_list); + INIT_LIST_HEAD(&pool->xskb_list); INIT_LIST_HEAD(&pool->xsk_tx_list); spin_lock_init(&pool->xsk_tx_list_lock); spin_lock_init(&pool->cq_lock); @@ -99,6 +100,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, xskb->pool = pool; xskb->xdp.frame_sz = umem->chunk_size - umem->headroom; INIT_LIST_HEAD(&xskb->free_list_node); + INIT_LIST_HEAD(&xskb->xskb_list_node); if (pool->unaligned) pool->free_heads[i] = xskb; else @@ -187,6 +189,11 @@ int xp_assign_dev(struct xsk_buff_pool *pool, goto err_unreg_pool; } + if (netdev->xdp_zc_max_segs == 1 && (flags & XDP_USE_SG)) { + err = -EOPNOTSUPP; + goto err_unreg_pool; + } + bpf.command = XDP_SETUP_XSK_POOL; bpf.xsk.pool = pool; bpf.xsk.queue_id = queue_id; -- cgit v1.2.3 From d5581966040f313c76b0f54ad76b3fb4095fddcd Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 19 Jul 2023 15:24:11 +0200 Subject: xsk: support ZC Tx multi-buffer in batch API Modify xskq_cons_read_desc_batch() in a way that each processed descriptor will be checked if it is an EOP one or not and act accordingly to that. Change the behavior of mentioned function to break the processing when stumbling upon invalid descriptor instead of skipping it. Furthermore, let us give only full packets down to ZC driver. With these two assumptions ZC drivers will not have to take care of an intermediate state of incomplete frames, which will simplify its implementations a lot. Last but not least, stop processing when count of frags would exceed max supported segments on underlying device. Signed-off-by: Maciej Fijalkowski Link: https://lore.kernel.org/r/20230719132421.584801-15-maciej.fijalkowski@intel.com Signed-off-by: Alexei Starovoitov --- net/xdp/xsk_queue.h | 45 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 36 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index bac32027f865..13354a1e4280 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -48,6 +48,11 @@ struct xsk_queue { size_t ring_vmalloc_size; }; +struct parsed_desc { + u32 mb; + u32 valid; +}; + /* The structure of the shared state of the rings are a simple * circular buffer, as outlined in * Documentation/core-api/circular-buffers.rst. 
For the Rx and @@ -218,30 +223,52 @@ static inline void xskq_cons_release_n(struct xsk_queue *q, u32 cnt) q->cached_cons += cnt; } -static inline u32 xskq_cons_read_desc_batch(struct xsk_queue *q, struct xsk_buff_pool *pool, - u32 max) +static inline void parse_desc(struct xsk_queue *q, struct xsk_buff_pool *pool, + struct xdp_desc *desc, struct parsed_desc *parsed) +{ + parsed->valid = xskq_cons_is_valid_desc(q, desc, pool); + parsed->mb = xp_mb_desc(desc); +} + +static inline +u32 xskq_cons_read_desc_batch(struct xsk_queue *q, struct xsk_buff_pool *pool, + u32 max) { u32 cached_cons = q->cached_cons, nb_entries = 0; struct xdp_desc *descs = pool->tx_descs; + u32 total_descs = 0, nr_frags = 0; + /* track first entry, if stumble upon *any* invalid descriptor, rewind + * current packet that consists of frags and stop the processing + */ while (cached_cons != q->cached_prod && nb_entries < max) { struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; u32 idx = cached_cons & q->ring_mask; + struct parsed_desc parsed; descs[nb_entries] = ring->desc[idx]; - if (unlikely(!xskq_cons_is_valid_desc(q, &descs[nb_entries], pool))) { - /* Skip the entry */ - cached_cons++; - continue; + cached_cons++; + parse_desc(q, pool, &descs[nb_entries], &parsed); + if (unlikely(!parsed.valid)) + break; + + if (likely(!parsed.mb)) { + total_descs += (nr_frags + 1); + nr_frags = 0; + } else { + nr_frags++; + if (nr_frags == pool->netdev->xdp_zc_max_segs) { + nr_frags = 0; + break; + } } - nb_entries++; - cached_cons++; } + cached_cons -= nr_frags; /* Release valid plus any invalid entries */ xskq_cons_release_n(q, cached_cons - q->cached_cons); - return nb_entries; + return total_descs; } /* Functions for consumers */ -- cgit v1.2.3 From e420bed025071a623d2720a92bc2245c84757ecb Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 19 Jul 2023 16:08:52 +0200 Subject: bpf: Add fd-based tcx multi-prog infra with link support This work refactors and adds a lightweight extension ("tcx") to the tc BPF ingress and egress data path side for allowing BPF program management based on fds via bpf() syscall through the newly added generic multi-prog API. The main goal behind this work which we also presented at LPC [0] last year and a recent update at LSF/MM/BPF this year [3] is to support long-awaited BPF link functionality for tc BPF programs, which allows for a model of safe ownership and program detachment. Given the rise in tc BPF users in cloud native environments, this becomes necessary to avoid hard to debug incidents either through stale leftover programs or 3rd party applications accidentally stepping on each others toes. As a recap, a BPF link represents the attachment of a BPF program to a BPF hook point. The BPF link holds a single reference to keep BPF program alive. Moreover, hook points do not reference a BPF link, only the application's fd or pinning does. A BPF link holds meta-data specific to attachment and implements operations for link creation, (atomic) BPF program update, detachment and introspection. The motivation for BPF links for tc BPF programs is multi-fold, for example: - From Meta: "It's especially important for applications that are deployed fleet-wide and that don't "control" hosts they are deployed to. If such application crashes and no one notices and does anything about that, BPF program will keep running draining resources or even just, say, dropping packets. We at FB had outages due to such permanent BPF attachment semantics. 
With fd-based BPF link we are getting a framework, which allows safe, auto-detachable behavior by default, unless application explicitly opts in by pinning the BPF link." [1] - From the Cilium side, the tc BPF programs we attach to host-facing veth devices and phys devices build the core datapath for Kubernetes Pods, and they implement forwarding, load-balancing, policy, EDT-management, etc, within BPF. Currently there is no concept of 'safe' ownership; e.g. we've recently experienced hard-to-debug issues in a user's staging environment where another Kubernetes application using tc BPF attached to the same prio/handle of cls_bpf accidentally wiped all Cilium-based BPF programs from underneath it. The goal is to establish a clear/safe ownership model via links which cannot accidentally be overridden. [0,2] BPF links for tc can co-exist with non-link attachments, and the semantics are in line also with XDP links: BPF links cannot replace other BPF links, BPF links cannot replace non-BPF links, non-BPF links cannot replace BPF links and lastly only non-BPF links can replace non-BPF links. In the case of Cilium, this would solve the mentioned safe-ownership issue, as 3rd party applications would not be able to accidentally wipe Cilium programs, even if they are not BPF link aware. Earlier attempts [4] have tried to integrate BPF links into the core tc machinery to solve cls_bpf, which has been intrusive to the generic tc kernel API with extensions only specific to cls_bpf and suboptimal/complex since cls_bpf could be wiped from the qdisc also. Locking a tc BPF program in place this way gets into layering hacks given the two object models are vastly different. We instead implemented the tcx (tc 'express') layer which is an fd-based tc BPF attach API, so that the BPF link implementation blends in naturally, similar to other link types which are fd-based, and without the need for changing core tc internal APIs. BPF programs for tc can then be successively migrated from classic cls_bpf to the new tc BPF link without needing to change the program's source code; only the BPF loader mechanics for attaching need to change. For the current tc framework, there is no change in behavior with this change and neither does this change touch on tc core kernel APIs. The gist of this patch is that the ingress and egress hooks have a lightweight, qdisc-less extension for BPF to attach its tc BPF programs, in other words, a minimal entry point for tc BPF. The name tcx has been suggested from discussion of earlier revisions of this work as a good fit, and to more easily distinguish between the classic cls_bpf attachment and the fd-based one. For the ingress and egress tcx points, the device holds a cache-friendly array with program pointers which is separated from control-plane (slow-path) data. Earlier versions of this work used priority to determine ordering and expression of dependencies similar to classic tc, but it was challenged that for something more future-proof a better user experience is required. Hence this resulted in the design and development of the generic attach/detach/query API for multi-progs. See the prior patch with its discussion on the API design. tcx is the first user and later we plan to integrate also others; for example, one candidate is multi-prog support for XDP which would benefit and have the same 'look and feel' from an API perspective. The goal with tcx is to have maximum compatibility with existing tc BPF programs, so they don't need to be rewritten specifically. 
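To illustrate only the loader-side mechanics, a minimal sketch of creating such a tcx link through the raw bpf(2) syscall could look as follows; prog_fd and ifindex are assumed to have been obtained elsewhere and error handling is omitted:

	union bpf_attr attr;
	int link_fd;

	memset(&attr, 0, sizeof(attr));
	attr.link_create.prog_fd = prog_fd;		/* BPF_PROG_TYPE_SCHED_CLS prog */
	attr.link_create.target_ifindex = ifindex;
	attr.link_create.attach_type = BPF_TCX_INGRESS;	/* or BPF_TCX_EGRESS */
	/* optionally: attr.link_create.flags plus link_create.tcx.relative_fd /
	 * link_create.tcx.expected_revision for ordered multi-prog insertion
	 */
	link_fd = syscall(__NR_bpf, BPF_LINK_CREATE, &attr, sizeof(attr));

Closing the returned link_fd without pinning it detaches the program again, which is exactly the auto-detach property described above.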
Compatibility to call into classic tcf_classify() is also provided in order to allow successive migration or both to cleanly co-exist where needed given its all one logical tc layer and the tcx plus classic tc cls/act build one logical overall processing pipeline. tcx supports the simplified return codes TCX_NEXT which is non-terminating (go to next program) and terminating ones with TCX_PASS, TCX_DROP, TCX_REDIRECT. The fd-based API is behind a static key, so that when unused the code is also not entered. The struct tcx_entry's program array is currently static, but could be made dynamic if necessary at a point in future. The a/b pair swap design has been chosen so that for detachment there are no allocations which otherwise could fail. The work has been tested with tc-testing selftest suite which all passes, as well as the tc BPF tests from the BPF CI, and also with Cilium's L4LB. Thanks also to Nikolay Aleksandrov and Martin Lau for in-depth early reviews of this work. [0] https://lpc.events/event/16/contributions/1353/ [1] https://lore.kernel.org/bpf/CAEf4BzbokCJN33Nw_kg82sO=xppXnKWEncGTWCTB9vGCmLB6pw@mail.gmail.com [2] https://colocatedeventseu2023.sched.com/event/1Jo6O/tales-from-an-ebpf-programs-murder-mystery-hemanth-malla-guillaume-fournier-datadog [3] http://vger.kernel.org/bpfconf2023_material/tcx_meta_netdev_borkmann.pdf [4] https://lore.kernel.org/bpf/20210604063116.234316-1-memxor@gmail.com Signed-off-by: Daniel Borkmann Acked-by: Jakub Kicinski Link: https://lore.kernel.org/r/20230719140858.13224-3-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- MAINTAINERS | 4 +- include/linux/bpf_mprog.h | 9 ++ include/linux/netdevice.h | 15 +- include/linux/skbuff.h | 4 +- include/net/sch_generic.h | 2 +- include/net/tcx.h | 206 ++++++++++++++++++++++++ include/uapi/linux/bpf.h | 34 +++- kernel/bpf/Kconfig | 1 + kernel/bpf/Makefile | 1 + kernel/bpf/syscall.c | 82 ++++++++-- kernel/bpf/tcx.c | 348 +++++++++++++++++++++++++++++++++++++++++ net/Kconfig | 5 + net/core/dev.c | 265 +++++++++++++++++++------------ net/core/filter.c | 4 +- net/sched/Kconfig | 4 +- net/sched/sch_ingress.c | 61 +++++++- tools/include/uapi/linux/bpf.h | 34 +++- 17 files changed, 935 insertions(+), 144 deletions(-) create mode 100644 include/net/tcx.h create mode 100644 kernel/bpf/tcx.c (limited to 'net') diff --git a/MAINTAINERS b/MAINTAINERS index 678bef9f60b4..990e3fce753c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3778,13 +3778,15 @@ L: netdev@vger.kernel.org S: Maintained F: kernel/bpf/bpf_struct* -BPF [NETWORKING] (tc BPF, sock_addr) +BPF [NETWORKING] (tcx & tc BPF, sock_addr) M: Martin KaFai Lau M: Daniel Borkmann R: John Fastabend L: bpf@vger.kernel.org L: netdev@vger.kernel.org S: Maintained +F: include/net/tcx.h +F: kernel/bpf/tcx.c F: net/core/filter.c F: net/sched/act_bpf.c F: net/sched/cls_bpf.c diff --git a/include/linux/bpf_mprog.h b/include/linux/bpf_mprog.h index 6feefec43422..2b429488f840 100644 --- a/include/linux/bpf_mprog.h +++ b/include/linux/bpf_mprog.h @@ -315,4 +315,13 @@ int bpf_mprog_detach(struct bpf_mprog_entry *entry, int bpf_mprog_query(const union bpf_attr *attr, union bpf_attr __user *uattr, struct bpf_mprog_entry *entry); +static inline bool bpf_mprog_supported(enum bpf_prog_type type) +{ + switch (type) { + case BPF_PROG_TYPE_SCHED_CLS: + return true; + default: + return false; + } +} #endif /* __BPF_MPROG_H */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b12477ea4032..3800d0479698 100644 --- a/include/linux/netdevice.h +++ 
b/include/linux/netdevice.h @@ -1930,8 +1930,7 @@ enum netdev_ml_priv_type { * * @rx_handler: handler for received packets * @rx_handler_data: XXX: need comments on this one - * @miniq_ingress: ingress/clsact qdisc specific data for - * ingress processing + * @tcx_ingress: BPF & clsact qdisc specific data for ingress processing * @ingress_queue: XXX: need comments on this one * @nf_hooks_ingress: netfilter hooks executed for ingress packets * @broadcast: hw bcast address @@ -1952,8 +1951,7 @@ enum netdev_ml_priv_type { * @xps_maps: all CPUs/RXQs maps for XPS device * * @xps_maps: XXX: need comments on this one - * @miniq_egress: clsact qdisc specific data for - * egress processing + * @tcx_egress: BPF & clsact qdisc specific data for egress processing * @nf_hooks_egress: netfilter hooks executed for egress packets * @qdisc_hash: qdisc hash table * @watchdog_timeo: Represents the timeout that is used by @@ -2253,9 +2251,8 @@ struct net_device { unsigned int xdp_zc_max_segs; rx_handler_func_t __rcu *rx_handler; void __rcu *rx_handler_data; - -#ifdef CONFIG_NET_CLS_ACT - struct mini_Qdisc __rcu *miniq_ingress; +#ifdef CONFIG_NET_XGRESS + struct bpf_mprog_entry __rcu *tcx_ingress; #endif struct netdev_queue __rcu *ingress_queue; #ifdef CONFIG_NETFILTER_INGRESS @@ -2283,8 +2280,8 @@ struct net_device { #ifdef CONFIG_XPS struct xps_dev_maps __rcu *xps_maps[XPS_MAPS_MAX]; #endif -#ifdef CONFIG_NET_CLS_ACT - struct mini_Qdisc __rcu *miniq_egress; +#ifdef CONFIG_NET_XGRESS + struct bpf_mprog_entry __rcu *tcx_egress; #endif #ifdef CONFIG_NETFILTER_EGRESS struct nf_hook_entries __rcu *nf_hooks_egress; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 91ed66952580..ed83f1c5fc1f 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -944,7 +944,7 @@ struct sk_buff { __u8 __mono_tc_offset[0]; /* public: */ __u8 mono_delivery_time:1; /* See SKB_MONO_DELIVERY_TIME_MASK */ -#ifdef CONFIG_NET_CLS_ACT +#ifdef CONFIG_NET_XGRESS __u8 tc_at_ingress:1; /* See TC_AT_INGRESS_MASK */ __u8 tc_skip_classify:1; #endif @@ -993,7 +993,7 @@ struct sk_buff { __u8 csum_not_inet:1; #endif -#ifdef CONFIG_NET_SCHED +#if defined(CONFIG_NET_SCHED) || defined(CONFIG_NET_XGRESS) __u16 tc_index; /* traffic control index */ #endif diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index e92f73bb3198..15be2d96b06d 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -703,7 +703,7 @@ int skb_do_redirect(struct sk_buff *); static inline bool skb_at_tc_ingress(const struct sk_buff *skb) { -#ifdef CONFIG_NET_CLS_ACT +#ifdef CONFIG_NET_XGRESS return skb->tc_at_ingress; #else return false; diff --git a/include/net/tcx.h b/include/net/tcx.h new file mode 100644 index 000000000000..264f147953ba --- /dev/null +++ b/include/net/tcx.h @@ -0,0 +1,206 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2023 Isovalent */ +#ifndef __NET_TCX_H +#define __NET_TCX_H + +#include +#include + +#include + +struct mini_Qdisc; + +struct tcx_entry { + struct mini_Qdisc __rcu *miniq; + struct bpf_mprog_bundle bundle; + bool miniq_active; + struct rcu_head rcu; +}; + +struct tcx_link { + struct bpf_link link; + struct net_device *dev; + u32 location; +}; + +static inline void tcx_set_ingress(struct sk_buff *skb, bool ingress) +{ +#ifdef CONFIG_NET_XGRESS + skb->tc_at_ingress = ingress; +#endif +} + +#ifdef CONFIG_NET_XGRESS +static inline struct tcx_entry *tcx_entry(struct bpf_mprog_entry *entry) +{ + struct bpf_mprog_bundle *bundle = entry->parent; + + return container_of(bundle, 
struct tcx_entry, bundle); +} + +static inline struct tcx_link *tcx_link(struct bpf_link *link) +{ + return container_of(link, struct tcx_link, link); +} + +static inline const struct tcx_link *tcx_link_const(const struct bpf_link *link) +{ + return tcx_link((struct bpf_link *)link); +} + +void tcx_inc(void); +void tcx_dec(void); + +static inline void tcx_entry_sync(void) +{ + /* bpf_mprog_entry got a/b swapped, therefore ensure that + * there are no inflight users on the old one anymore. + */ + synchronize_rcu(); +} + +static inline void +tcx_entry_update(struct net_device *dev, struct bpf_mprog_entry *entry, + bool ingress) +{ + ASSERT_RTNL(); + if (ingress) + rcu_assign_pointer(dev->tcx_ingress, entry); + else + rcu_assign_pointer(dev->tcx_egress, entry); +} + +static inline struct bpf_mprog_entry * +tcx_entry_fetch(struct net_device *dev, bool ingress) +{ + ASSERT_RTNL(); + if (ingress) + return rcu_dereference_rtnl(dev->tcx_ingress); + else + return rcu_dereference_rtnl(dev->tcx_egress); +} + +static inline struct bpf_mprog_entry *tcx_entry_create(void) +{ + struct tcx_entry *tcx = kzalloc(sizeof(*tcx), GFP_KERNEL); + + if (tcx) { + bpf_mprog_bundle_init(&tcx->bundle); + return &tcx->bundle.a; + } + return NULL; +} + +static inline void tcx_entry_free(struct bpf_mprog_entry *entry) +{ + kfree_rcu(tcx_entry(entry), rcu); +} + +static inline struct bpf_mprog_entry * +tcx_entry_fetch_or_create(struct net_device *dev, bool ingress, bool *created) +{ + struct bpf_mprog_entry *entry = tcx_entry_fetch(dev, ingress); + + *created = false; + if (!entry) { + entry = tcx_entry_create(); + if (!entry) + return NULL; + *created = true; + } + return entry; +} + +static inline void tcx_skeys_inc(bool ingress) +{ + tcx_inc(); + if (ingress) + net_inc_ingress_queue(); + else + net_inc_egress_queue(); +} + +static inline void tcx_skeys_dec(bool ingress) +{ + if (ingress) + net_dec_ingress_queue(); + else + net_dec_egress_queue(); + tcx_dec(); +} + +static inline void tcx_miniq_set_active(struct bpf_mprog_entry *entry, + const bool active) +{ + ASSERT_RTNL(); + tcx_entry(entry)->miniq_active = active; +} + +static inline bool tcx_entry_is_active(struct bpf_mprog_entry *entry) +{ + ASSERT_RTNL(); + return bpf_mprog_total(entry) || tcx_entry(entry)->miniq_active; +} + +static inline enum tcx_action_base tcx_action_code(struct sk_buff *skb, + int code) +{ + switch (code) { + case TCX_PASS: + skb->tc_index = qdisc_skb_cb(skb)->tc_classid; + fallthrough; + case TCX_DROP: + case TCX_REDIRECT: + return code; + case TCX_NEXT: + default: + return TCX_NEXT; + } +} +#endif /* CONFIG_NET_XGRESS */ + +#if defined(CONFIG_NET_XGRESS) && defined(CONFIG_BPF_SYSCALL) +int tcx_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog); +int tcx_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); +int tcx_prog_detach(const union bpf_attr *attr, struct bpf_prog *prog); +void tcx_uninstall(struct net_device *dev, bool ingress); + +int tcx_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr); + +static inline void dev_tcx_uninstall(struct net_device *dev) +{ + ASSERT_RTNL(); + tcx_uninstall(dev, true); + tcx_uninstall(dev, false); +} +#else +static inline int tcx_prog_attach(const union bpf_attr *attr, + struct bpf_prog *prog) +{ + return -EINVAL; +} + +static inline int tcx_link_attach(const union bpf_attr *attr, + struct bpf_prog *prog) +{ + return -EINVAL; +} + +static inline int tcx_prog_detach(const union bpf_attr *attr, + struct bpf_prog *prog) +{ + return -EINVAL; +} + +static inline 
int tcx_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return -EINVAL; +} + +static inline void dev_tcx_uninstall(struct net_device *dev) +{ +} +#endif /* CONFIG_NET_XGRESS && CONFIG_BPF_SYSCALL */ +#endif /* __NET_TCX_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d4c07e435336..739c15906a65 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1036,6 +1036,8 @@ enum bpf_attach_type { BPF_LSM_CGROUP, BPF_STRUCT_OPS, BPF_NETFILTER, + BPF_TCX_INGRESS, + BPF_TCX_EGRESS, __MAX_BPF_ATTACH_TYPE }; @@ -1053,7 +1055,7 @@ enum bpf_link_type { BPF_LINK_TYPE_KPROBE_MULTI = 8, BPF_LINK_TYPE_STRUCT_OPS = 9, BPF_LINK_TYPE_NETFILTER = 10, - + BPF_LINK_TYPE_TCX = 11, MAX_BPF_LINK_TYPE, }; @@ -1569,13 +1571,13 @@ union bpf_attr { __u32 map_fd; /* struct_ops to attach */ }; union { - __u32 target_fd; /* object to attach to */ - __u32 target_ifindex; /* target ifindex */ + __u32 target_fd; /* target object to attach to or ... */ + __u32 target_ifindex; /* target ifindex */ }; __u32 attach_type; /* attach type */ __u32 flags; /* extra flags */ union { - __u32 target_btf_id; /* btf_id of target to attach to */ + __u32 target_btf_id; /* btf_id of target to attach to */ struct { __aligned_u64 iter_info; /* extra bpf_iter_link_info */ __u32 iter_info_len; /* iter_info length */ @@ -1609,6 +1611,13 @@ union bpf_attr { __s32 priority; __u32 flags; } netfilter; + struct { + union { + __u32 relative_fd; + __u32 relative_id; + }; + __u64 expected_revision; + } tcx; }; } link_create; @@ -6217,6 +6226,19 @@ struct bpf_sock_tuple { }; }; +/* (Simplified) user return codes for tcx prog type. + * A valid tcx program must return one of these defined values. All other + * return codes are reserved for future use. Must remain compatible with + * their TC_ACT_* counter-parts. For compatibility in behavior, unknown + * return codes are mapped to TCX_NEXT. 
+ */ +enum tcx_action_base { + TCX_NEXT = -1, + TCX_PASS = 0, + TCX_DROP = 2, + TCX_REDIRECT = 7, +}; + struct bpf_xdp_sock { __u32 queue_id; }; @@ -6499,6 +6521,10 @@ struct bpf_link_info { } event; /* BPF_PERF_EVENT_EVENT */ }; } perf_event; + struct { + __u32 ifindex; + __u32 attach_type; + } tcx; }; } __attribute__((aligned(8))); diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig index 2dfe1079f772..6a906ff93006 100644 --- a/kernel/bpf/Kconfig +++ b/kernel/bpf/Kconfig @@ -31,6 +31,7 @@ config BPF_SYSCALL select TASKS_TRACE_RCU select BINARY_PRINTF select NET_SOCK_MSG if NET + select NET_XGRESS if NET select PAGE_POOL if NET default n help diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 1bea2eb912cd..f526b7573e97 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -21,6 +21,7 @@ obj-$(CONFIG_BPF_SYSCALL) += devmap.o obj-$(CONFIG_BPF_SYSCALL) += cpumap.o obj-$(CONFIG_BPF_SYSCALL) += offload.o obj-$(CONFIG_BPF_SYSCALL) += net_namespace.o +obj-$(CONFIG_BPF_SYSCALL) += tcx.o endif ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ee8cb1a174aa..7f4e8c357a6a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -37,6 +37,8 @@ #include #include +#include + #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) @@ -3740,31 +3742,45 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) return BPF_PROG_TYPE_XDP; case BPF_LSM_CGROUP: return BPF_PROG_TYPE_LSM; + case BPF_TCX_INGRESS: + case BPF_TCX_EGRESS: + return BPF_PROG_TYPE_SCHED_CLS; default: return BPF_PROG_TYPE_UNSPEC; } } -#define BPF_PROG_ATTACH_LAST_FIELD replace_bpf_fd +#define BPF_PROG_ATTACH_LAST_FIELD expected_revision + +#define BPF_F_ATTACH_MASK_BASE \ + (BPF_F_ALLOW_OVERRIDE | \ + BPF_F_ALLOW_MULTI | \ + BPF_F_REPLACE) -#define BPF_F_ATTACH_MASK \ - (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI | BPF_F_REPLACE) +#define BPF_F_ATTACH_MASK_MPROG \ + (BPF_F_REPLACE | \ + BPF_F_BEFORE | \ + BPF_F_AFTER | \ + BPF_F_ID | \ + BPF_F_LINK) static int bpf_prog_attach(const union bpf_attr *attr) { enum bpf_prog_type ptype; struct bpf_prog *prog; + u32 mask; int ret; if (CHECK_ATTR(BPF_PROG_ATTACH)) return -EINVAL; - if (attr->attach_flags & ~BPF_F_ATTACH_MASK) - return -EINVAL; - ptype = attach_type_to_prog_type(attr->attach_type); if (ptype == BPF_PROG_TYPE_UNSPEC) return -EINVAL; + mask = bpf_mprog_supported(ptype) ? 
+ BPF_F_ATTACH_MASK_MPROG : BPF_F_ATTACH_MASK_BASE; + if (attr->attach_flags & ~mask) + return -EINVAL; prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); if (IS_ERR(prog)) @@ -3800,6 +3816,9 @@ static int bpf_prog_attach(const union bpf_attr *attr) else ret = cgroup_bpf_prog_attach(attr, ptype, prog); break; + case BPF_PROG_TYPE_SCHED_CLS: + ret = tcx_prog_attach(attr, prog); + break; default: ret = -EINVAL; } @@ -3809,25 +3828,41 @@ static int bpf_prog_attach(const union bpf_attr *attr) return ret; } -#define BPF_PROG_DETACH_LAST_FIELD attach_type +#define BPF_PROG_DETACH_LAST_FIELD expected_revision static int bpf_prog_detach(const union bpf_attr *attr) { + struct bpf_prog *prog = NULL; enum bpf_prog_type ptype; + int ret; if (CHECK_ATTR(BPF_PROG_DETACH)) return -EINVAL; ptype = attach_type_to_prog_type(attr->attach_type); + if (bpf_mprog_supported(ptype)) { + if (ptype == BPF_PROG_TYPE_UNSPEC) + return -EINVAL; + if (attr->attach_flags & ~BPF_F_ATTACH_MASK_MPROG) + return -EINVAL; + if (attr->attach_bpf_fd) { + prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); + if (IS_ERR(prog)) + return PTR_ERR(prog); + } + } switch (ptype) { case BPF_PROG_TYPE_SK_MSG: case BPF_PROG_TYPE_SK_SKB: - return sock_map_prog_detach(attr, ptype); + ret = sock_map_prog_detach(attr, ptype); + break; case BPF_PROG_TYPE_LIRC_MODE2: - return lirc_prog_detach(attr); + ret = lirc_prog_detach(attr); + break; case BPF_PROG_TYPE_FLOW_DISSECTOR: - return netns_bpf_prog_detach(attr, ptype); + ret = netns_bpf_prog_detach(attr, ptype); + break; case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SKB: case BPF_PROG_TYPE_CGROUP_SOCK: @@ -3836,13 +3871,21 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_PROG_TYPE_CGROUP_SYSCTL: case BPF_PROG_TYPE_SOCK_OPS: case BPF_PROG_TYPE_LSM: - return cgroup_bpf_prog_detach(attr, ptype); + ret = cgroup_bpf_prog_detach(attr, ptype); + break; + case BPF_PROG_TYPE_SCHED_CLS: + ret = tcx_prog_detach(attr, prog); + break; default: - return -EINVAL; + ret = -EINVAL; } + + if (prog) + bpf_prog_put(prog); + return ret; } -#define BPF_PROG_QUERY_LAST_FIELD query.prog_attach_flags +#define BPF_PROG_QUERY_LAST_FIELD query.link_attach_flags static int bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) @@ -3890,6 +3933,9 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_SK_MSG_VERDICT: case BPF_SK_SKB_VERDICT: return sock_map_bpf_prog_query(attr, uattr); + case BPF_TCX_INGRESS: + case BPF_TCX_EGRESS: + return tcx_prog_query(attr, uattr); default: return -EINVAL; } @@ -4852,6 +4898,13 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) goto out; } break; + case BPF_PROG_TYPE_SCHED_CLS: + if (attr->link_create.attach_type != BPF_TCX_INGRESS && + attr->link_create.attach_type != BPF_TCX_EGRESS) { + ret = -EINVAL; + goto out; + } + break; default: ptype = attach_type_to_prog_type(attr->link_create.attach_type); if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type) { @@ -4903,6 +4956,9 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) case BPF_PROG_TYPE_XDP: ret = bpf_xdp_link_attach(attr, prog); break; + case BPF_PROG_TYPE_SCHED_CLS: + ret = tcx_link_attach(attr, prog); + break; case BPF_PROG_TYPE_NETFILTER: ret = bpf_nf_link_attach(attr, prog); break; diff --git a/kernel/bpf/tcx.c b/kernel/bpf/tcx.c new file mode 100644 index 000000000000..69a272712b29 --- /dev/null +++ b/kernel/bpf/tcx.c @@ -0,0 +1,348 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2023 Isovalent */ + +#include +#include 
+#include + +#include + +int tcx_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + bool created, ingress = attr->attach_type == BPF_TCX_INGRESS; + struct net *net = current->nsproxy->net_ns; + struct bpf_mprog_entry *entry, *entry_new; + struct bpf_prog *replace_prog = NULL; + struct net_device *dev; + int ret; + + rtnl_lock(); + dev = __dev_get_by_index(net, attr->target_ifindex); + if (!dev) { + ret = -ENODEV; + goto out; + } + if (attr->attach_flags & BPF_F_REPLACE) { + replace_prog = bpf_prog_get_type(attr->replace_bpf_fd, + prog->type); + if (IS_ERR(replace_prog)) { + ret = PTR_ERR(replace_prog); + replace_prog = NULL; + goto out; + } + } + entry = tcx_entry_fetch_or_create(dev, ingress, &created); + if (!entry) { + ret = -ENOMEM; + goto out; + } + ret = bpf_mprog_attach(entry, &entry_new, prog, NULL, replace_prog, + attr->attach_flags, attr->relative_fd, + attr->expected_revision); + if (!ret) { + if (entry != entry_new) { + tcx_entry_update(dev, entry_new, ingress); + tcx_entry_sync(); + tcx_skeys_inc(ingress); + } + bpf_mprog_commit(entry); + } else if (created) { + tcx_entry_free(entry); + } +out: + if (replace_prog) + bpf_prog_put(replace_prog); + rtnl_unlock(); + return ret; +} + +int tcx_prog_detach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + bool ingress = attr->attach_type == BPF_TCX_INGRESS; + struct net *net = current->nsproxy->net_ns; + struct bpf_mprog_entry *entry, *entry_new; + struct net_device *dev; + int ret; + + rtnl_lock(); + dev = __dev_get_by_index(net, attr->target_ifindex); + if (!dev) { + ret = -ENODEV; + goto out; + } + entry = tcx_entry_fetch(dev, ingress); + if (!entry) { + ret = -ENOENT; + goto out; + } + ret = bpf_mprog_detach(entry, &entry_new, prog, NULL, attr->attach_flags, + attr->relative_fd, attr->expected_revision); + if (!ret) { + if (!tcx_entry_is_active(entry_new)) + entry_new = NULL; + tcx_entry_update(dev, entry_new, ingress); + tcx_entry_sync(); + tcx_skeys_dec(ingress); + bpf_mprog_commit(entry); + if (!entry_new) + tcx_entry_free(entry); + } +out: + rtnl_unlock(); + return ret; +} + +void tcx_uninstall(struct net_device *dev, bool ingress) +{ + struct bpf_tuple tuple = {}; + struct bpf_mprog_entry *entry; + struct bpf_mprog_fp *fp; + struct bpf_mprog_cp *cp; + + entry = tcx_entry_fetch(dev, ingress); + if (!entry) + return; + tcx_entry_update(dev, NULL, ingress); + tcx_entry_sync(); + bpf_mprog_foreach_tuple(entry, fp, cp, tuple) { + if (tuple.link) + tcx_link(tuple.link)->dev = NULL; + else + bpf_prog_put(tuple.prog); + tcx_skeys_dec(ingress); + } + WARN_ON_ONCE(tcx_entry(entry)->miniq_active); + tcx_entry_free(entry); +} + +int tcx_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) +{ + bool ingress = attr->query.attach_type == BPF_TCX_INGRESS; + struct net *net = current->nsproxy->net_ns; + struct bpf_mprog_entry *entry; + struct net_device *dev; + int ret; + + rtnl_lock(); + dev = __dev_get_by_index(net, attr->query.target_ifindex); + if (!dev) { + ret = -ENODEV; + goto out; + } + entry = tcx_entry_fetch(dev, ingress); + if (!entry) { + ret = -ENOENT; + goto out; + } + ret = bpf_mprog_query(attr, uattr, entry); +out: + rtnl_unlock(); + return ret; +} + +static int tcx_link_prog_attach(struct bpf_link *link, u32 flags, u32 id_or_fd, + u64 revision) +{ + struct tcx_link *tcx = tcx_link(link); + bool created, ingress = tcx->location == BPF_TCX_INGRESS; + struct bpf_mprog_entry *entry, *entry_new; + struct net_device *dev = tcx->dev; + int ret; + + ASSERT_RTNL(); + entry = 
tcx_entry_fetch_or_create(dev, ingress, &created); + if (!entry) + return -ENOMEM; + ret = bpf_mprog_attach(entry, &entry_new, link->prog, link, NULL, flags, + id_or_fd, revision); + if (!ret) { + if (entry != entry_new) { + tcx_entry_update(dev, entry_new, ingress); + tcx_entry_sync(); + tcx_skeys_inc(ingress); + } + bpf_mprog_commit(entry); + } else if (created) { + tcx_entry_free(entry); + } + return ret; +} + +static void tcx_link_release(struct bpf_link *link) +{ + struct tcx_link *tcx = tcx_link(link); + bool ingress = tcx->location == BPF_TCX_INGRESS; + struct bpf_mprog_entry *entry, *entry_new; + struct net_device *dev; + int ret = 0; + + rtnl_lock(); + dev = tcx->dev; + if (!dev) + goto out; + entry = tcx_entry_fetch(dev, ingress); + if (!entry) { + ret = -ENOENT; + goto out; + } + ret = bpf_mprog_detach(entry, &entry_new, link->prog, link, 0, 0, 0); + if (!ret) { + if (!tcx_entry_is_active(entry_new)) + entry_new = NULL; + tcx_entry_update(dev, entry_new, ingress); + tcx_entry_sync(); + tcx_skeys_dec(ingress); + bpf_mprog_commit(entry); + if (!entry_new) + tcx_entry_free(entry); + tcx->dev = NULL; + } +out: + WARN_ON_ONCE(ret); + rtnl_unlock(); +} + +static int tcx_link_update(struct bpf_link *link, struct bpf_prog *nprog, + struct bpf_prog *oprog) +{ + struct tcx_link *tcx = tcx_link(link); + bool ingress = tcx->location == BPF_TCX_INGRESS; + struct bpf_mprog_entry *entry, *entry_new; + struct net_device *dev; + int ret = 0; + + rtnl_lock(); + dev = tcx->dev; + if (!dev) { + ret = -ENOLINK; + goto out; + } + if (oprog && link->prog != oprog) { + ret = -EPERM; + goto out; + } + oprog = link->prog; + if (oprog == nprog) { + bpf_prog_put(nprog); + goto out; + } + entry = tcx_entry_fetch(dev, ingress); + if (!entry) { + ret = -ENOENT; + goto out; + } + ret = bpf_mprog_attach(entry, &entry_new, nprog, link, oprog, + BPF_F_REPLACE | BPF_F_ID, + link->prog->aux->id, 0); + if (!ret) { + WARN_ON_ONCE(entry != entry_new); + oprog = xchg(&link->prog, nprog); + bpf_prog_put(oprog); + bpf_mprog_commit(entry); + } +out: + rtnl_unlock(); + return ret; +} + +static void tcx_link_dealloc(struct bpf_link *link) +{ + kfree(tcx_link(link)); +} + +static void tcx_link_fdinfo(const struct bpf_link *link, struct seq_file *seq) +{ + const struct tcx_link *tcx = tcx_link_const(link); + u32 ifindex = 0; + + rtnl_lock(); + if (tcx->dev) + ifindex = tcx->dev->ifindex; + rtnl_unlock(); + + seq_printf(seq, "ifindex:\t%u\n", ifindex); + seq_printf(seq, "attach_type:\t%u (%s)\n", + tcx->location, + tcx->location == BPF_TCX_INGRESS ? 
"ingress" : "egress"); +} + +static int tcx_link_fill_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + const struct tcx_link *tcx = tcx_link_const(link); + u32 ifindex = 0; + + rtnl_lock(); + if (tcx->dev) + ifindex = tcx->dev->ifindex; + rtnl_unlock(); + + info->tcx.ifindex = ifindex; + info->tcx.attach_type = tcx->location; + return 0; +} + +static int tcx_link_detach(struct bpf_link *link) +{ + tcx_link_release(link); + return 0; +} + +static const struct bpf_link_ops tcx_link_lops = { + .release = tcx_link_release, + .detach = tcx_link_detach, + .dealloc = tcx_link_dealloc, + .update_prog = tcx_link_update, + .show_fdinfo = tcx_link_fdinfo, + .fill_link_info = tcx_link_fill_info, +}; + +static int tcx_link_init(struct tcx_link *tcx, + struct bpf_link_primer *link_primer, + const union bpf_attr *attr, + struct net_device *dev, + struct bpf_prog *prog) +{ + bpf_link_init(&tcx->link, BPF_LINK_TYPE_TCX, &tcx_link_lops, prog); + tcx->location = attr->link_create.attach_type; + tcx->dev = dev; + return bpf_link_prime(&tcx->link, link_primer); +} + +int tcx_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct net *net = current->nsproxy->net_ns; + struct bpf_link_primer link_primer; + struct net_device *dev; + struct tcx_link *tcx; + int ret; + + rtnl_lock(); + dev = __dev_get_by_index(net, attr->link_create.target_ifindex); + if (!dev) { + ret = -ENODEV; + goto out; + } + tcx = kzalloc(sizeof(*tcx), GFP_USER); + if (!tcx) { + ret = -ENOMEM; + goto out; + } + ret = tcx_link_init(tcx, &link_primer, attr, dev, prog); + if (ret) { + kfree(tcx); + goto out; + } + ret = tcx_link_prog_attach(&tcx->link, attr->link_create.flags, + attr->link_create.tcx.relative_fd, + attr->link_create.tcx.expected_revision); + if (ret) { + tcx->dev = NULL; + bpf_link_cleanup(&link_primer); + goto out; + } + ret = bpf_link_settle(&link_primer); +out: + rtnl_unlock(); + return ret; +} diff --git a/net/Kconfig b/net/Kconfig index 2fb25b534df5..d532ec33f1fe 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -52,6 +52,11 @@ config NET_INGRESS config NET_EGRESS bool +config NET_XGRESS + select NET_INGRESS + select NET_EGRESS + bool + config NET_REDIRECT bool diff --git a/net/core/dev.c b/net/core/dev.c index dd4f114a7cbf..8e7d0cb540cd 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -107,6 +107,7 @@ #include #include #include +#include #include #include #include @@ -154,7 +155,6 @@ #include "dev.h" #include "net-sysfs.h" - static DEFINE_SPINLOCK(ptype_lock); struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; struct list_head ptype_all __read_mostly; /* Taps */ @@ -3882,69 +3882,198 @@ int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) EXPORT_SYMBOL(dev_loopback_xmit); #ifdef CONFIG_NET_EGRESS -static struct sk_buff * -sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) +static struct netdev_queue * +netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb) +{ + int qm = skb_get_queue_mapping(skb); + + return netdev_get_tx_queue(dev, netdev_cap_txqueue(dev, qm)); +} + +static bool netdev_xmit_txqueue_skipped(void) { + return __this_cpu_read(softnet_data.xmit.skip_txqueue); +} + +void netdev_xmit_skip_txqueue(bool skip) +{ + __this_cpu_write(softnet_data.xmit.skip_txqueue, skip); +} +EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue); +#endif /* CONFIG_NET_EGRESS */ + +#ifdef CONFIG_NET_XGRESS +static int tc_run(struct tcx_entry *entry, struct sk_buff *skb) +{ + int ret = TC_ACT_UNSPEC; #ifdef CONFIG_NET_CLS_ACT - struct 
mini_Qdisc *miniq = rcu_dereference_bh(dev->miniq_egress); - struct tcf_result cl_res; + struct mini_Qdisc *miniq = rcu_dereference_bh(entry->miniq); + struct tcf_result res; if (!miniq) - return skb; + return ret; - /* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */ tc_skb_cb(skb)->mru = 0; tc_skb_cb(skb)->post_ct = false; - mini_qdisc_bstats_cpu_update(miniq, skb); - switch (tcf_classify(skb, miniq->block, miniq->filter_list, &cl_res, false)) { + mini_qdisc_bstats_cpu_update(miniq, skb); + ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false); + /* Only tcf related quirks below. */ + switch (ret) { + case TC_ACT_SHOT: + mini_qdisc_qstats_cpu_drop(miniq); + break; case TC_ACT_OK: case TC_ACT_RECLASSIFY: - skb->tc_index = TC_H_MIN(cl_res.classid); + skb->tc_index = TC_H_MIN(res.classid); break; + } +#endif /* CONFIG_NET_CLS_ACT */ + return ret; +} + +static DEFINE_STATIC_KEY_FALSE(tcx_needed_key); + +void tcx_inc(void) +{ + static_branch_inc(&tcx_needed_key); +} + +void tcx_dec(void) +{ + static_branch_dec(&tcx_needed_key); +} + +static __always_inline enum tcx_action_base +tcx_run(const struct bpf_mprog_entry *entry, struct sk_buff *skb, + const bool needs_mac) +{ + const struct bpf_mprog_fp *fp; + const struct bpf_prog *prog; + int ret = TCX_NEXT; + + if (needs_mac) + __skb_push(skb, skb->mac_len); + bpf_mprog_foreach_prog(entry, fp, prog) { + bpf_compute_data_pointers(skb); + ret = bpf_prog_run(prog, skb); + if (ret != TCX_NEXT) + break; + } + if (needs_mac) + __skb_pull(skb, skb->mac_len); + return tcx_action_code(skb, ret); +} + +static __always_inline struct sk_buff * +sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, + struct net_device *orig_dev, bool *another) +{ + struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress); + int sch_ret; + + if (!entry) + return skb; + if (*pt_prev) { + *ret = deliver_skb(skb, *pt_prev, orig_dev); + *pt_prev = NULL; + } + + qdisc_skb_cb(skb)->pkt_len = skb->len; + tcx_set_ingress(skb, true); + + if (static_branch_unlikely(&tcx_needed_key)) { + sch_ret = tcx_run(entry, skb, true); + if (sch_ret != TC_ACT_UNSPEC) + goto ingress_verdict; + } + sch_ret = tc_run(tcx_entry(entry), skb); +ingress_verdict: + switch (sch_ret) { + case TC_ACT_REDIRECT: + /* skb_mac_header check was done by BPF, so we can safely + * push the L2 header back before redirecting to another + * netdev. + */ + __skb_push(skb, skb->mac_len); + if (skb_do_redirect(skb) == -EAGAIN) { + __skb_pull(skb, skb->mac_len); + *another = true; + break; + } + *ret = NET_RX_SUCCESS; + return NULL; case TC_ACT_SHOT: - mini_qdisc_qstats_cpu_drop(miniq); - *ret = NET_XMIT_DROP; - kfree_skb_reason(skb, SKB_DROP_REASON_TC_EGRESS); + kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS); + *ret = NET_RX_DROP; return NULL; + /* used by tc_run */ case TC_ACT_STOLEN: case TC_ACT_QUEUED: case TC_ACT_TRAP: - *ret = NET_XMIT_SUCCESS; consume_skb(skb); + fallthrough; + case TC_ACT_CONSUMED: + *ret = NET_RX_SUCCESS; return NULL; + } + + return skb; +} + +static __always_inline struct sk_buff * +sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) +{ + struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress); + int sch_ret; + + if (!entry) + return skb; + + /* qdisc_skb_cb(skb)->pkt_len & tcx_set_ingress() was + * already set by the caller. 
+ */ + if (static_branch_unlikely(&tcx_needed_key)) { + sch_ret = tcx_run(entry, skb, false); + if (sch_ret != TC_ACT_UNSPEC) + goto egress_verdict; + } + sch_ret = tc_run(tcx_entry(entry), skb); +egress_verdict: + switch (sch_ret) { case TC_ACT_REDIRECT: /* No need to push/pop skb's mac_header here on egress! */ skb_do_redirect(skb); *ret = NET_XMIT_SUCCESS; return NULL; - default: - break; + case TC_ACT_SHOT: + kfree_skb_reason(skb, SKB_DROP_REASON_TC_EGRESS); + *ret = NET_XMIT_DROP; + return NULL; + /* used by tc_run */ + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + case TC_ACT_TRAP: + *ret = NET_XMIT_SUCCESS; + return NULL; } -#endif /* CONFIG_NET_CLS_ACT */ return skb; } - -static struct netdev_queue * -netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb) -{ - int qm = skb_get_queue_mapping(skb); - - return netdev_get_tx_queue(dev, netdev_cap_txqueue(dev, qm)); -} - -static bool netdev_xmit_txqueue_skipped(void) +#else +static __always_inline struct sk_buff * +sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, + struct net_device *orig_dev, bool *another) { - return __this_cpu_read(softnet_data.xmit.skip_txqueue); + return skb; } -void netdev_xmit_skip_txqueue(bool skip) +static __always_inline struct sk_buff * +sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) { - __this_cpu_write(softnet_data.xmit.skip_txqueue, skip); + return skb; } -EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue); -#endif /* CONFIG_NET_EGRESS */ +#endif /* CONFIG_NET_XGRESS */ #ifdef CONFIG_XPS static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb, @@ -4128,9 +4257,7 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) skb_update_prio(skb); qdisc_pkt_len_init(skb); -#ifdef CONFIG_NET_CLS_ACT - skb->tc_at_ingress = 0; -#endif + tcx_set_ingress(skb, false); #ifdef CONFIG_NET_EGRESS if (static_branch_unlikely(&egress_needed_key)) { if (nf_hook_egress_active()) { @@ -5064,72 +5191,6 @@ int (*br_fdb_test_addr_hook)(struct net_device *dev, EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); #endif -static inline struct sk_buff * -sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, - struct net_device *orig_dev, bool *another) -{ -#ifdef CONFIG_NET_CLS_ACT - struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress); - struct tcf_result cl_res; - - /* If there's at least one ingress present somewhere (so - * we get here via enabled static key), remaining devices - * that are not configured with an ingress qdisc will bail - * out here. 
- */ - if (!miniq) - return skb; - - if (*pt_prev) { - *ret = deliver_skb(skb, *pt_prev, orig_dev); - *pt_prev = NULL; - } - - qdisc_skb_cb(skb)->pkt_len = skb->len; - tc_skb_cb(skb)->mru = 0; - tc_skb_cb(skb)->post_ct = false; - skb->tc_at_ingress = 1; - mini_qdisc_bstats_cpu_update(miniq, skb); - - switch (tcf_classify(skb, miniq->block, miniq->filter_list, &cl_res, false)) { - case TC_ACT_OK: - case TC_ACT_RECLASSIFY: - skb->tc_index = TC_H_MIN(cl_res.classid); - break; - case TC_ACT_SHOT: - mini_qdisc_qstats_cpu_drop(miniq); - kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS); - *ret = NET_RX_DROP; - return NULL; - case TC_ACT_STOLEN: - case TC_ACT_QUEUED: - case TC_ACT_TRAP: - consume_skb(skb); - *ret = NET_RX_SUCCESS; - return NULL; - case TC_ACT_REDIRECT: - /* skb_mac_header check was done by cls/act_bpf, so - * we can safely push the L2 header back before - * redirecting to another netdev - */ - __skb_push(skb, skb->mac_len); - if (skb_do_redirect(skb) == -EAGAIN) { - __skb_pull(skb, skb->mac_len); - *another = true; - break; - } - *ret = NET_RX_SUCCESS; - return NULL; - case TC_ACT_CONSUMED: - *ret = NET_RX_SUCCESS; - return NULL; - default: - break; - } -#endif /* CONFIG_NET_CLS_ACT */ - return skb; -} - /** * netdev_is_rx_handler_busy - check if receive handler is registered * @dev: device to check @@ -10835,7 +10896,7 @@ void unregister_netdevice_many_notify(struct list_head *head, /* Shutdown queueing discipline. */ dev_shutdown(dev); - + dev_tcx_uninstall(dev); dev_xdp_uninstall(dev); bpf_dev_bound_netdev_unregister(dev); diff --git a/net/core/filter.c b/net/core/filter.c index b4410dc841a0..797e8f039696 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -9307,7 +9307,7 @@ static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog, __u8 value_reg = si->dst_reg; __u8 skb_reg = si->src_reg; -#ifdef CONFIG_NET_CLS_ACT +#ifdef CONFIG_NET_XGRESS /* If the tstamp_type is read, * the bpf prog is aware the tstamp could have delivery time. * Thus, read skb->tstamp as is if tstamp_type_access is true. @@ -9341,7 +9341,7 @@ static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog, __u8 value_reg = si->src_reg; __u8 skb_reg = si->dst_reg; -#ifdef CONFIG_NET_CLS_ACT +#ifdef CONFIG_NET_XGRESS /* If the tstamp_type is read, * the bpf prog is aware the tstamp could have delivery time. * Thus, write skb->tstamp as is if tstamp_type_access is true. diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 4b95cb1ac435..470c70deffe2 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -347,8 +347,7 @@ config NET_SCH_FQ_PIE config NET_SCH_INGRESS tristate "Ingress/classifier-action Qdisc" depends on NET_CLS_ACT - select NET_INGRESS - select NET_EGRESS + select NET_XGRESS help Say Y here if you want to use classifiers for incoming and/or outgoing packets. This qdisc doesn't do anything else besides running classifiers, @@ -679,6 +678,7 @@ config NET_EMATCH_IPT config NET_CLS_ACT bool "Actions" select NET_CLS + select NET_XGRESS help Say Y here if you want to use traffic control actions. 
Actions get attached to classifiers and are invoked after a successful diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index e43a45499372..04e886f6cee4 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -13,6 +13,7 @@ #include #include #include +#include struct ingress_sched_data { struct tcf_block *block; @@ -78,6 +79,8 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt, { struct ingress_sched_data *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); + struct bpf_mprog_entry *entry; + bool created; int err; if (sch->parent != TC_H_INGRESS) @@ -85,7 +88,13 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt, net_inc_ingress_queue(); - mini_qdisc_pair_init(&q->miniqp, sch, &dev->miniq_ingress); + entry = tcx_entry_fetch_or_create(dev, true, &created); + if (!entry) + return -ENOMEM; + tcx_miniq_set_active(entry, true); + mini_qdisc_pair_init(&q->miniqp, sch, &tcx_entry(entry)->miniq); + if (created) + tcx_entry_update(dev, entry, true); q->block_info.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS; q->block_info.chain_head_change = clsact_chain_head_change; @@ -103,11 +112,22 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt, static void ingress_destroy(struct Qdisc *sch) { struct ingress_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct bpf_mprog_entry *entry = rtnl_dereference(dev->tcx_ingress); if (sch->parent != TC_H_INGRESS) return; tcf_block_put_ext(q->block, sch, &q->block_info); + + if (entry) { + tcx_miniq_set_active(entry, false); + if (!tcx_entry_is_active(entry)) { + tcx_entry_update(dev, NULL, false); + tcx_entry_free(entry); + } + } + net_dec_ingress_queue(); } @@ -223,6 +243,8 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt, { struct clsact_sched_data *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); + struct bpf_mprog_entry *entry; + bool created; int err; if (sch->parent != TC_H_CLSACT) @@ -231,7 +253,13 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt, net_inc_ingress_queue(); net_inc_egress_queue(); - mini_qdisc_pair_init(&q->miniqp_ingress, sch, &dev->miniq_ingress); + entry = tcx_entry_fetch_or_create(dev, true, &created); + if (!entry) + return -ENOMEM; + tcx_miniq_set_active(entry, true); + mini_qdisc_pair_init(&q->miniqp_ingress, sch, &tcx_entry(entry)->miniq); + if (created) + tcx_entry_update(dev, entry, true); q->ingress_block_info.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS; q->ingress_block_info.chain_head_change = clsact_chain_head_change; @@ -244,7 +272,13 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt, mini_qdisc_pair_block_init(&q->miniqp_ingress, q->ingress_block); - mini_qdisc_pair_init(&q->miniqp_egress, sch, &dev->miniq_egress); + entry = tcx_entry_fetch_or_create(dev, false, &created); + if (!entry) + return -ENOMEM; + tcx_miniq_set_active(entry, true); + mini_qdisc_pair_init(&q->miniqp_egress, sch, &tcx_entry(entry)->miniq); + if (created) + tcx_entry_update(dev, entry, false); q->egress_block_info.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS; q->egress_block_info.chain_head_change = clsact_chain_head_change; @@ -256,12 +290,31 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt, static void clsact_destroy(struct Qdisc *sch) { struct clsact_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct bpf_mprog_entry *ingress_entry = rtnl_dereference(dev->tcx_ingress); + struct bpf_mprog_entry *egress_entry = 
rtnl_dereference(dev->tcx_egress); if (sch->parent != TC_H_CLSACT) return; - tcf_block_put_ext(q->egress_block, sch, &q->egress_block_info); tcf_block_put_ext(q->ingress_block, sch, &q->ingress_block_info); + tcf_block_put_ext(q->egress_block, sch, &q->egress_block_info); + + if (ingress_entry) { + tcx_miniq_set_active(ingress_entry, false); + if (!tcx_entry_is_active(ingress_entry)) { + tcx_entry_update(dev, NULL, true); + tcx_entry_free(ingress_entry); + } + } + + if (egress_entry) { + tcx_miniq_set_active(egress_entry, false); + if (!tcx_entry_is_active(egress_entry)) { + tcx_entry_update(dev, NULL, false); + tcx_entry_free(egress_entry); + } + } net_dec_ingress_queue(); net_dec_egress_queue(); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 1c166870cdf3..47b76925189f 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1036,6 +1036,8 @@ enum bpf_attach_type { BPF_LSM_CGROUP, BPF_STRUCT_OPS, BPF_NETFILTER, + BPF_TCX_INGRESS, + BPF_TCX_EGRESS, __MAX_BPF_ATTACH_TYPE }; @@ -1053,7 +1055,7 @@ enum bpf_link_type { BPF_LINK_TYPE_KPROBE_MULTI = 8, BPF_LINK_TYPE_STRUCT_OPS = 9, BPF_LINK_TYPE_NETFILTER = 10, - + BPF_LINK_TYPE_TCX = 11, MAX_BPF_LINK_TYPE, }; @@ -1569,13 +1571,13 @@ union bpf_attr { __u32 map_fd; /* struct_ops to attach */ }; union { - __u32 target_fd; /* object to attach to */ - __u32 target_ifindex; /* target ifindex */ + __u32 target_fd; /* target object to attach to or ... */ + __u32 target_ifindex; /* target ifindex */ }; __u32 attach_type; /* attach type */ __u32 flags; /* extra flags */ union { - __u32 target_btf_id; /* btf_id of target to attach to */ + __u32 target_btf_id; /* btf_id of target to attach to */ struct { __aligned_u64 iter_info; /* extra bpf_iter_link_info */ __u32 iter_info_len; /* iter_info length */ @@ -1609,6 +1611,13 @@ union bpf_attr { __s32 priority; __u32 flags; } netfilter; + struct { + union { + __u32 relative_fd; + __u32 relative_id; + }; + __u64 expected_revision; + } tcx; }; } link_create; @@ -6217,6 +6226,19 @@ struct bpf_sock_tuple { }; }; +/* (Simplified) user return codes for tcx prog type. + * A valid tcx program must return one of these defined values. All other + * return codes are reserved for future use. Must remain compatible with + * their TC_ACT_* counter-parts. For compatibility in behavior, unknown + * return codes are mapped to TCX_NEXT. + */ +enum tcx_action_base { + TCX_NEXT = -1, + TCX_PASS = 0, + TCX_DROP = 2, + TCX_REDIRECT = 7, +}; + struct bpf_xdp_sock { __u32 queue_id; }; @@ -6499,6 +6521,10 @@ struct bpf_link_info { } event; /* BPF_PERF_EVENT_EVENT */ }; } perf_event; + struct { + __u32 ifindex; + __u32 attach_type; + } tcx; }; } __attribute__((aligned(8))); -- cgit v1.2.3 From 0558e1674598ec9029c1d3bceb787c8340272b51 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 18 Jul 2023 16:38:09 +0200 Subject: udp: use indirect call wrapper for data ready() In most cases UDP sockets use the default data ready callback. Leverage the indirect call wrapper for such callback to avoid an indirect call in fastpath. The above gives small but measurable performance gain under UDP flood. 
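For context, with CONFIG_RETPOLINE enabled INDIRECT_CALL_1() (include/linux/indirect_call_wrapper.h) roughly expands to a pointer comparison plus a direct call, so the common case avoids the indirect branch; sketched:

	/* approximate expansion of
	 * INDIRECT_CALL_1(sk->sk_data_ready, sock_def_readable, sk)
	 */
	if (likely(sk->sk_data_ready == sock_def_readable))
		sock_def_readable(sk);		/* direct call in the common case */
	else
		sk->sk_data_ready(sk);		/* indirect call for custom callbacks */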
Signed-off-by: Paolo Abeni Reviewed-by: Willem de Bruijn Link: https://lore.kernel.org/r/d47d53e6f8ee7a11228ca2f025d6243cc04b77f3.1689691004.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- net/ipv4/udp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 42a96b3547c9..8c3ebd95f5b9 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1553,7 +1553,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) spin_unlock(&list->lock); if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk); + INDIRECT_CALL_1(sk->sk_data_ready, sock_def_readable, sk); busylock_release(busy); return 0; -- cgit v1.2.3 From 03b123debcbc8db987bda17ed8412cc011064c22 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 18 Jul 2023 16:20:49 +0000 Subject: tcp: tcp_enter_quickack_mode() should be static After commit d2ccd7bc8acd ("tcp: avoid resetting ACK timer in DCTCP"), tcp_enter_quickack_mode() is only used from net/ipv4/tcp_input.c. Fixes: d2ccd7bc8acd ("tcp: avoid resetting ACK timer in DCTCP") Signed-off-by: Eric Dumazet Cc: Yuchung Cheng Cc: Neal Cardwell Link: https://lore.kernel.org/r/20230718162049.1444938-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 1 - net/ipv4/tcp_input.c | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 8d1f1af5e02a..c5fb90079920 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -350,7 +350,6 @@ ssize_t tcp_splice_read(struct socket *sk, loff_t *ppos, struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, gfp_t gfp, bool force_schedule); -void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks); static inline void tcp_dec_quickack_mode(struct sock *sk, const unsigned int pkts) { diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3cd92035e090..e2affdd147da 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -297,7 +297,7 @@ static void tcp_incr_quickack(struct sock *sk, unsigned int max_quickacks) icsk->icsk_ack.quick = quickacks; } -void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks) +static void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -305,7 +305,6 @@ void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks) inet_csk_exit_pingpong_mode(sk); icsk->icsk_ack.ato = TCP_ATO_MIN; } -EXPORT_SYMBOL(tcp_enter_quickack_mode); /* Send ACKs quickly, if "quick" count is not exhausted * and the session is not interactive. -- cgit v1.2.3 From 4914109a8e1e494c6aa9852f9e84ec77a5fc643f Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 16 Jul 2023 17:09:17 -0400 Subject: netfilter: allow exp not to be removed in nf_ct_find_expectation Currently nf_conntrack_in() calling nf_ct_find_expectation() will remove the exp from the hash table. However, in some scenario, we expect the exp not to be removed when the created ct will not be confirmed, like in OVS and TC conntrack in the following patches. This patch allows exp not to be removed by setting IPS_CONFIRMED in the status of the tmpl. 
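As a hedged illustration of the intended usage, a caller that wants the expectation to survive the lookup simply leaves IPS_CONFIRMED cleared on its template conntrack ("removal_wanted" below is a placeholder condition):

	struct nf_conn *tmpl = nf_ct_tmpl_alloc(net, &zone, GFP_KERNEL);

	if (tmpl && removal_wanted)
		__set_bit(IPS_CONFIRMED_BIT, &tmpl->status);
	/* nf_conntrack_in() then passes nf_ct_is_confirmed(tmpl) as the new
	 * "unlink" argument to nf_ct_find_expectation(), so an unconfirmed
	 * template keeps the exp in the hash table for a later lookup.
	 */

The following act_ct and openvswitch patches rely on exactly this to defer the removal until the connection is actually committed.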
Signed-off-by: Xin Long Acked-by: Aaron Conole Acked-by: Florian Westphal Signed-off-by: Paolo Abeni --- include/net/netfilter/nf_conntrack_expect.h | 2 +- net/netfilter/nf_conntrack_core.c | 2 +- net/netfilter/nf_conntrack_expect.c | 4 ++-- net/netfilter/nft_ct.c | 2 ++ 4 files changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_conntrack_expect.h b/include/net/netfilter/nf_conntrack_expect.h index cf0d81be5a96..165e7a03b8e9 100644 --- a/include/net/netfilter/nf_conntrack_expect.h +++ b/include/net/netfilter/nf_conntrack_expect.h @@ -100,7 +100,7 @@ nf_ct_expect_find_get(struct net *net, struct nf_conntrack_expect * nf_ct_find_expectation(struct net *net, const struct nf_conntrack_zone *zone, - const struct nf_conntrack_tuple *tuple); + const struct nf_conntrack_tuple *tuple, bool unlink); void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, u32 portid, int report); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 992393102d5f..9f6f2e643575 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1756,7 +1756,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, cnet = nf_ct_pernet(net); if (cnet->expect_count) { spin_lock_bh(&nf_conntrack_expect_lock); - exp = nf_ct_find_expectation(net, zone, tuple); + exp = nf_ct_find_expectation(net, zone, tuple, !tmpl || nf_ct_is_confirmed(tmpl)); if (exp) { /* Welcome, Mr. Bond. We've been expecting you... */ __set_bit(IPS_EXPECTED_BIT, &ct->status); diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 96948e98ec53..81ca348915c9 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -171,7 +171,7 @@ EXPORT_SYMBOL_GPL(nf_ct_expect_find_get); struct nf_conntrack_expect * nf_ct_find_expectation(struct net *net, const struct nf_conntrack_zone *zone, - const struct nf_conntrack_tuple *tuple) + const struct nf_conntrack_tuple *tuple, bool unlink) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); struct nf_conntrack_expect *i, *exp = NULL; @@ -211,7 +211,7 @@ nf_ct_find_expectation(struct net *net, !refcount_inc_not_zero(&exp->master->ct_general.use))) return NULL; - if (exp->flags & NF_CT_EXPECT_PERMANENT) { + if (exp->flags & NF_CT_EXPECT_PERMANENT || !unlink) { refcount_inc(&exp->use); return exp; } else if (del_timer(&exp->timeout)) { diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 38958e067aa8..e87fd4314c68 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -262,6 +262,7 @@ static void nft_ct_set_zone_eval(const struct nft_expr *expr, regs->verdict.code = NF_DROP; return; } + __set_bit(IPS_CONFIRMED_BIT, &ct->status); } nf_ct_set(skb, ct, IP_CT_NEW); @@ -368,6 +369,7 @@ static bool nft_ct_tmpl_alloc_pcpu(void) return false; } + __set_bit(IPS_CONFIRMED_BIT, &tmp->status); per_cpu(nft_ct_pcpu_template, cpu) = tmp; } -- cgit v1.2.3 From 76622ced50a103e278ad560566b5e43d82f718c6 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 16 Jul 2023 17:09:18 -0400 Subject: net: sched: set IPS_CONFIRMED in tmpl status only when commit is set in act_ct With the following flows, the packets will be dropped if OVS TC offload is enabled. 
'ip,ct_state=-trk,in_port=1 actions=ct(zone=1)' 'ip,ct_state=+trk+new+rel,in_port=1 actions=ct(commit,zone=1)' 'ip,ct_state=+trk+new+rel,in_port=1 actions=ct(commit,zone=2),normal' In the 1st flow, it finds the exp from the hashtable and removes it then creates the ct with this exp in act_ct. However, in the 2nd flow it goes to the OVS upcall at the 1st time. When the skb comes back from userspace, it has to create the ct again without exp(the exp was removed last time). With no 'rel' set in the ct, the 3rd flow can never get matched. In OVS conntrack, it works around it by adding its own exp lookup function ovs_ct_expect_find() where it doesn't remove the exp. Instead of creating a real ct, it only updates its keys with the exp and its master info. So when the skb comes back, the exp is still in the hashtable. However, we can't do this trick in act_ct, as tc flower match is using a real ct, and passing the exp and its master info to flower parsing via tc_skb_cb is also not possible (tc_skb_cb size is not big enough). The simple and clear fix is to not remove the exp at the 1st flow, namely, not set IPS_CONFIRMED in tmpl when commit is not set in act_ct. Reported-by: Shuang Li Signed-off-by: Xin Long Acked-by: Aaron Conole Reviewed-by: Davide Caratti Acked-by: Florian Westphal Signed-off-by: Paolo Abeni --- net/sched/act_ct.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index abc71a06d634..7c652d14528b 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -1238,7 +1238,8 @@ static int tcf_ct_fill_params(struct net *net, } } - __set_bit(IPS_CONFIRMED_BIT, &tmpl->status); + if (p->ct_action & TCA_CT_ACT_COMMIT) + __set_bit(IPS_CONFIRMED_BIT, &tmpl->status); return 0; err: nf_ct_put(p->tmpl); -- cgit v1.2.3 From 8c8b733208058702da451b7d60a12c0ff90b6879 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 16 Jul 2023 17:09:19 -0400 Subject: openvswitch: set IPS_CONFIRMED in tmpl status only when commit is set in conntrack By not setting IPS_CONFIRMED in tmpl that allows the exp not to be removed from the hashtable when lookup, we can simplify the exp processing code a lot in openvswitch conntrack. Signed-off-by: Xin Long Acked-by: Aaron Conole Acked-by: Florian Westphal Signed-off-by: Paolo Abeni --- net/openvswitch/conntrack.c | 78 ++++++--------------------------------------- 1 file changed, 10 insertions(+), 68 deletions(-) (limited to 'net') diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 331730fd3580..fa955e892210 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -455,45 +455,6 @@ static int ovs_ct_handle_fragments(struct net *net, struct sw_flow_key *key, return 0; } -static struct nf_conntrack_expect * -ovs_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone, - u16 proto, const struct sk_buff *skb) -{ - struct nf_conntrack_tuple tuple; - struct nf_conntrack_expect *exp; - - if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), proto, net, &tuple)) - return NULL; - - exp = __nf_ct_expect_find(net, zone, &tuple); - if (exp) { - struct nf_conntrack_tuple_hash *h; - - /* Delete existing conntrack entry, if it clashes with the - * expectation. This can happen since conntrack ALGs do not - * check for clashes between (new) expectations and existing - * conntrack entries. 
nf_conntrack_in() will check the - * expectations only if a conntrack entry can not be found, - * which can lead to OVS finding the expectation (here) in the - * init direction, but which will not be removed by the - * nf_conntrack_in() call, if a matching conntrack entry is - * found instead. In this case all init direction packets - * would be reported as new related packets, while reply - * direction packets would be reported as un-related - * established packets. - */ - h = nf_conntrack_find_get(net, zone, &tuple); - if (h) { - struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); - - nf_ct_delete(ct, 0, 0); - nf_ct_put(ct); - } - } - - return exp; -} - /* This replicates logic from nf_conntrack_core.c that is not exported. */ static enum ip_conntrack_info ovs_ct_get_info(const struct nf_conntrack_tuple_hash *h) @@ -852,36 +813,16 @@ static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key, const struct ovs_conntrack_info *info, struct sk_buff *skb) { - struct nf_conntrack_expect *exp; - - /* If we pass an expected packet through nf_conntrack_in() the - * expectation is typically removed, but the packet could still be - * lost in upcall processing. To prevent this from happening we - * perform an explicit expectation lookup. Expected connections are - * always new, and will be passed through conntrack only when they are - * committed, as it is OK to remove the expectation at that time. - */ - exp = ovs_ct_expect_find(net, &info->zone, info->family, skb); - if (exp) { - u8 state; - - /* NOTE: New connections are NATted and Helped only when - * committed, so we are not calling into NAT here. - */ - state = OVS_CS_F_TRACKED | OVS_CS_F_NEW | OVS_CS_F_RELATED; - __ovs_ct_update_key(key, state, &info->zone, exp->master); - } else { - struct nf_conn *ct; - int err; + struct nf_conn *ct; + int err; - err = __ovs_ct_lookup(net, key, info, skb); - if (err) - return err; + err = __ovs_ct_lookup(net, key, info, skb); + if (err) + return err; - ct = (struct nf_conn *)skb_nfct(skb); - if (ct) - nf_ct_deliver_cached_events(ct); - } + ct = (struct nf_conn *)skb_nfct(skb); + if (ct) + nf_ct_deliver_cached_events(ct); return 0; } @@ -1460,7 +1401,8 @@ int ovs_ct_copy_action(struct net *net, const struct nlattr *attr, if (err) goto err_free_ct; - __set_bit(IPS_CONFIRMED_BIT, &ct_info.ct->status); + if (ct_info.commit) + __set_bit(IPS_CONFIRMED_BIT, &ct_info.ct->status); return 0; err_free_ct: __ovs_ct_free_action(&ct_info); -- cgit v1.2.3 From b44693495af8f309b8ddec4b30833085d1c2d0c4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 19 Jul 2023 06:47:54 +0000 Subject: tcp: add TCP_OLD_SEQUENCE drop reason tcp_sequence() uses two conditions to decide to drop a packet, and we currently report generic TCP_INVALID_SEQUENCE drop reason. Duplicates are common, we need to distinguish them from the other case. I chose to not reuse TCP_OLD_DATA, and instead added TCP_OLD_SEQUENCE drop reason. 
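A small standalone illustration of the two checks being separated follows; it assumes the usual wrap-around sequence comparison, and the window value and reason names are demo stand-ins, not the kernel's enum.

  #include <stdint.h>
  #include <stdio.h>

  static int seq_before(uint32_t a, uint32_t b) { return (int32_t)(a - b) < 0; }
  static int seq_after(uint32_t a, uint32_t b)  { return seq_before(b, a); }

  enum drop_reason { NOT_DROPPED = 0, OLD_SEQUENCE, INVALID_SEQUENCE };

  static enum drop_reason classify(uint32_t seq, uint32_t end_seq,
                                   uint32_t rcv_wup, uint32_t rcv_nxt,
                                   uint32_t window)
  {
      if (seq_before(end_seq, rcv_wup))
          return OLD_SEQUENCE;              /* duplicate of already-ACKed data */
      if (seq_after(seq, rcv_nxt + window))
          return INVALID_SEQUENCE;          /* beyond the advertised window */
      return NOT_DROPPED;
  }

  int main(void)
  {
      uint32_t rcv_wup = 1000, rcv_nxt = 1000, window = 65535;

      printf("%d\n", classify(100, 900, rcv_wup, rcv_nxt, window));       /* OLD_SEQUENCE */
      printf("%d\n", classify(200000, 201000, rcv_wup, rcv_nxt, window)); /* INVALID_SEQUENCE */
      printf("%d\n", classify(1000, 2000, rcv_wup, rcv_nxt, window));     /* NOT_DROPPED */
      return 0;
  }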
Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20230719064754.2794106-1-edumazet@google.com Signed-off-by: Paolo Abeni --- include/net/dropreason-core.h | 3 +++ net/ipv4/tcp_input.c | 16 +++++++++++----- 2 files changed, 14 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index a2b953b57689..f291a3b0f9e5 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -30,6 +30,7 @@ FN(TCP_OVERWINDOW) \ FN(TCP_OFOMERGE) \ FN(TCP_RFC7323_PAWS) \ + FN(TCP_OLD_SEQUENCE) \ FN(TCP_INVALID_SEQUENCE) \ FN(TCP_RESET) \ FN(TCP_INVALID_SYN) \ @@ -188,6 +189,8 @@ enum skb_drop_reason { * LINUX_MIB_PAWSESTABREJECTED */ SKB_DROP_REASON_TCP_RFC7323_PAWS, + /** @SKB_DROP_REASON_TCP_OLD_SEQUENCE: Old SEQ field (duplicate packet) */ + SKB_DROP_REASON_TCP_OLD_SEQUENCE, /** @SKB_DROP_REASON_TCP_INVALID_SEQUENCE: Not acceptable SEQ field */ SKB_DROP_REASON_TCP_INVALID_SEQUENCE, /** @SKB_DROP_REASON_TCP_RESET: Invalid RST packet */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e2affdd147da..670c3dab24f2 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4312,10 +4312,16 @@ static inline bool tcp_paws_discard(const struct sock *sk, * (borrowed from freebsd) */ -static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq) +static enum skb_drop_reason tcp_sequence(const struct tcp_sock *tp, + u32 seq, u32 end_seq) { - return !before(end_seq, tp->rcv_wup) && - !after(seq, tp->rcv_nxt + tcp_receive_window(tp)); + if (before(end_seq, tp->rcv_wup)) + return SKB_DROP_REASON_TCP_OLD_SEQUENCE; + + if (after(seq, tp->rcv_nxt + tcp_receive_window(tp))) + return SKB_DROP_REASON_TCP_INVALID_SEQUENCE; + + return SKB_NOT_DROPPED_YET; } /* When we get a reset we do this. */ @@ -5738,7 +5744,8 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, } /* Step 1: check sequence number */ - if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) { + reason = tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); + if (reason) { /* RFC793, page 37: "In all states except SYN-SENT, all reset * (RST) segments are validated by checking their SEQ-fields." * And page 69: "If an incoming segment is not acceptable, @@ -5755,7 +5762,6 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, } else if (tcp_reset_check(sk, skb)) { goto reset; } - SKB_DR_SET(reason, TCP_INVALID_SEQUENCE); goto discard; } -- cgit v1.2.3 From eedd47a6ec9f683f0b8d931aacca81985be55eec Mon Sep 17 00:00:00 2001 From: Benjamin Poirier Date: Wed, 19 Jul 2023 13:57:11 +0000 Subject: nexthop: Factor out hash threshold fdb nexthop selection The loop in nexthop_select_path_hthr() includes code to check for neighbor validity. Since this does not apply to fdb nexthops, simplify the loop by moving the fdb nexthop selection to its own function. 
Signed-off-by: Ido Schimmel Signed-off-by: Benjamin Poirier Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20230719-nh_select-v2-1-04383e89f868@nvidia.com Signed-off-by: Jakub Kicinski --- net/ipv4/nexthop.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index f95142e56da0..27089dea0ed0 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -1152,11 +1152,31 @@ static bool ipv4_good_nh(const struct fib_nh *nh) return !!(state & NUD_VALID); } +static struct nexthop *nexthop_select_path_fdb(struct nh_group *nhg, int hash) +{ + int i; + + for (i = 0; i < nhg->num_nh; i++) { + struct nh_grp_entry *nhge = &nhg->nh_entries[i]; + + if (hash > atomic_read(&nhge->hthr.upper_bound)) + continue; + + return nhge->nh; + } + + WARN_ON_ONCE(1); + return NULL; +} + static struct nexthop *nexthop_select_path_hthr(struct nh_group *nhg, int hash) { struct nexthop *rc = NULL; int i; + if (nhg->fdb_nh) + return nexthop_select_path_fdb(nhg, hash); + for (i = 0; i < nhg->num_nh; ++i) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; struct nh_info *nhi; @@ -1165,8 +1185,6 @@ static struct nexthop *nexthop_select_path_hthr(struct nh_group *nhg, int hash) continue; nhi = rcu_dereference(nhge->nh->nh_info); - if (nhi->fdb_nh) - return nhge->nh; /* nexthops always check if it is good and does * not rely on a sysctl for this behavior -- cgit v1.2.3 From 4bb5239b43348f4a1cd73f44be3dc57ac653ae9c Mon Sep 17 00:00:00 2001 From: Benjamin Poirier Date: Wed, 19 Jul 2023 13:57:16 +0000 Subject: nexthop: Factor out neighbor validity check For legacy nexthops, there is fib_good_nh() to check the neighbor validity. In order to make the nexthop object code more similar to the legacy nexthop code, factor out the nexthop object neighbor validity check into its own function. 
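A toy standalone model of the factored-out check is below; the DEMO_* names and the boolean neighbour state are invented for illustration and do not match the kernel's NUD_* encoding.

  #include <stdbool.h>
  #include <stdio.h>

  enum demo_family { DEMO_INET, DEMO_INET6 };

  struct demo_nh {
      enum demo_family family;
      bool neigh_valid;     /* stands in for a NUD_VALID neighbour state */
  };

  static bool demo_ipv4_good_nh(const struct demo_nh *nh) { return nh->neigh_valid; }
  static bool demo_ipv6_good_nh(const struct demo_nh *nh) { return nh->neigh_valid; }

  static bool demo_nexthop_is_good(const struct demo_nh *nh)
  {
      switch (nh->family) {
      case DEMO_INET:
          return demo_ipv4_good_nh(nh);
      case DEMO_INET6:
          return demo_ipv6_good_nh(nh);
      }
      return false;
  }

  int main(void)
  {
      struct demo_nh nh = { .family = DEMO_INET, .neigh_valid = false };

      printf("good? %d\n", demo_nexthop_is_good(&nh));  /* 0: skip this nexthop */
      return 0;
  }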
Signed-off-by: Ido Schimmel Signed-off-by: Benjamin Poirier Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20230719-nh_select-v2-2-04383e89f868@nvidia.com Signed-off-by: Jakub Kicinski --- net/ipv4/nexthop.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 27089dea0ed0..c12acbf39659 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -1152,6 +1152,20 @@ static bool ipv4_good_nh(const struct fib_nh *nh) return !!(state & NUD_VALID); } +static bool nexthop_is_good_nh(const struct nexthop *nh) +{ + struct nh_info *nhi = rcu_dereference(nh->nh_info); + + switch (nhi->family) { + case AF_INET: + return ipv4_good_nh(&nhi->fib_nh); + case AF_INET6: + return ipv6_good_nh(&nhi->fib6_nh); + } + + return false; +} + static struct nexthop *nexthop_select_path_fdb(struct nh_group *nhg, int hash) { int i; @@ -1179,26 +1193,15 @@ static struct nexthop *nexthop_select_path_hthr(struct nh_group *nhg, int hash) for (i = 0; i < nhg->num_nh; ++i) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; - struct nh_info *nhi; if (hash > atomic_read(&nhge->hthr.upper_bound)) continue; - nhi = rcu_dereference(nhge->nh->nh_info); - /* nexthops always check if it is good and does * not rely on a sysctl for this behavior */ - switch (nhi->family) { - case AF_INET: - if (ipv4_good_nh(&nhi->fib_nh)) - return nhge->nh; - break; - case AF_INET6: - if (ipv6_good_nh(&nhi->fib6_nh)) - return nhge->nh; - break; - } + if (nexthop_is_good_nh(nhge->nh)) + return nhge->nh; if (!rc) rc = nhge->nh; -- cgit v1.2.3 From 75f5f04c7bd2163044f80d5f7bea6b79edc288fa Mon Sep 17 00:00:00 2001 From: Benjamin Poirier Date: Wed, 19 Jul 2023 13:57:22 +0000 Subject: nexthop: Do not return invalid nexthop object during multipath selection With legacy nexthops, when net.ipv4.fib_multipath_use_neigh is set, fib_select_multipath() will never set res->nhc to a nexthop that is not good (as per fib_good_nh()). OTOH, with nexthop objects, nexthop_select_path_hthr() may return a nexthop that failed the nexthop_is_good_nh() test even if there was one that passed. Refactor nexthop_select_path_hthr() to follow a selection logic more similar to fib_select_multipath(). The issue can be demonstrated with the following sequence of commands. The first block shows that things work as expected with legacy nexthops. The last sequence of `ip rou get` in the second block shows the problem case - some routes still use the .2 nexthop. 
sysctl net.ipv4.fib_multipath_use_neigh=1 ip link add dummy1 up type dummy ip rou add 198.51.100.0/24 nexthop via 192.0.2.1 dev dummy1 onlink nexthop via 192.0.2.2 dev dummy1 onlink for i in {10..19}; do ip -o rou get 198.51.100.$i; done ip neigh add 192.0.2.1 dev dummy1 nud failed echo ".1 failed:" # results should not use .1 for i in {10..19}; do ip -o rou get 198.51.100.$i; done ip neigh del 192.0.2.1 dev dummy1 ip neigh add 192.0.2.2 dev dummy1 nud failed echo ".2 failed:" # results should not use .2 for i in {10..19}; do ip -o rou get 198.51.100.$i; done ip link del dummy1 ip link add dummy1 up type dummy ip nexthop add id 1 via 192.0.2.1 dev dummy1 onlink ip nexthop add id 2 via 192.0.2.2 dev dummy1 onlink ip nexthop add id 1001 group 1/2 ip rou add 198.51.100.0/24 nhid 1001 for i in {10..19}; do ip -o rou get 198.51.100.$i; done ip neigh add 192.0.2.1 dev dummy1 nud failed echo ".1 failed:" # results should not use .1 for i in {10..19}; do ip -o rou get 198.51.100.$i; done ip neigh del 192.0.2.1 dev dummy1 ip neigh add 192.0.2.2 dev dummy1 nud failed echo ".2 failed:" # results should not use .2 for i in {10..19}; do ip -o rou get 198.51.100.$i; done ip link del dummy1 Signed-off-by: Ido Schimmel Signed-off-by: Benjamin Poirier Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20230719-nh_select-v2-3-04383e89f868@nvidia.com Signed-off-by: Jakub Kicinski --- net/ipv4/nexthop.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index c12acbf39659..93f14d39fef6 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -1194,20 +1194,22 @@ static struct nexthop *nexthop_select_path_hthr(struct nh_group *nhg, int hash) for (i = 0; i < nhg->num_nh; ++i) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; - if (hash > atomic_read(&nhge->hthr.upper_bound)) - continue; - /* nexthops always check if it is good and does * not rely on a sysctl for this behavior */ - if (nexthop_is_good_nh(nhge->nh)) - return nhge->nh; + if (!nexthop_is_good_nh(nhge->nh)) + continue; if (!rc) rc = nhge->nh; + + if (hash > atomic_read(&nhge->hthr.upper_bound)) + continue; + + return nhge->nh; } - return rc; + return rc ? : nhg->nh_entries[0].nh; } static struct nexthop *nexthop_select_path_res(struct nh_group *nhg, int hash) -- cgit v1.2.3 From 989280d6ea70b2d91b8b0e20ac11a6529a37ac08 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Wed, 19 Jul 2023 13:01:16 +0200 Subject: net: bridge: br_switchdev: Tolerate -EOPNOTSUPP when replaying MDB There are two kinds of MDB entries to be replayed: port MDB entries, and host MDB entries. They are both replayed by br_switchdev_mdb_replay(). If the driver supports one kind, but lacks the other, the first -EOPNOTSUPP returned terminates the whole replay, including any further still-supported objects in the list. For this to cause issues, there must be MDB entries for both the host and the port being replayed. In that case, if the driver bails out from handling the host entry, the port entries are never replayed. However, the replay is currently only done when a switchdev port joins a bridge. There would be no port memberships at that point. Thus despite being erroneous, the code does not cause observable bugs. This is not an issue with other object kinds either, because there, each function replays one object kind. If a driver does not support that kind, it makes sense to bail out early. -EOPNOTSUPP is then ignored in nbp_switchdev_sync_objs(). 
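The fix, described next, instead suppresses -EOPNOTSUPP per replayed object so that later objects in the list are still offered to the driver. A minimal standalone sketch of that pattern follows; replay_one() is a made-up stand-in for the driver notifier call, not a real kernel function.

  #include <errno.h>
  #include <stdio.h>

  static int replay_one(int obj)
  {
      /* pretend the driver only supports even-numbered object kinds */
      return (obj & 1) ? -EOPNOTSUPP : 0;
  }

  static int replay_all(const int *objs, int n)
  {
      for (int i = 0; i < n; i++) {
          int err = replay_one(objs[i]);

          if (err == -EOPNOTSUPP)   /* skip unsupported kinds, keep going */
              err = 0;
          if (err)
              return err;           /* real errors still abort the replay */
      }
      return 0;
  }

  int main(void)
  {
      int objs[] = { 1, 2, 3, 4 };

      printf("replay_all -> %d\n", replay_all(objs, 4));  /* 0: whole list walked */
      return 0;
  }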
For MDB, suppress the -EOPNOTSUPP error code in br_switchdev_mdb_replay() already, so that the whole list gets replayed. The reason we need this patch is that a future patch will introduce a replay that should be used when a front-panel port netdevice is enslaved to a bridge lower, in particular a LAG. The LAG netdevice can already have both host and port MDB entries. The port entries need to be replayed so that they are offloaded on the port that joins the LAG. Cc: Jiri Pirko Cc: Ivan Vecera Cc: Roopa Prabhu Cc: Nikolay Aleksandrov Cc: bridge@lists.linux-foundation.org Signed-off-by: Petr Machata Reviewed-by: Danielle Ratson Signed-off-by: David S. Miller --- net/bridge/br_switchdev.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c index ba95c4d74a60..e92e0338afee 100644 --- a/net/bridge/br_switchdev.c +++ b/net/bridge/br_switchdev.c @@ -727,6 +727,8 @@ br_switchdev_mdb_replay(struct net_device *br_dev, struct net_device *dev, err = br_switchdev_mdb_replay_one(nb, dev, SWITCHDEV_OBJ_PORT_MDB(obj), action, ctx, extack); + if (err == -EOPNOTSUPP) + err = 0; if (err) goto out_free_mdb; } @@ -759,8 +761,10 @@ static int nbp_switchdev_sync_objs(struct net_bridge_port *p, const void *ctx, err = br_switchdev_mdb_replay(br_dev, dev, ctx, true, blocking_nb, extack); - if (err && err != -EOPNOTSUPP) + if (err) { + /* -EOPNOTSUPP not propagated from MDB replay. */ return err; + } err = br_switchdev_fdb_replay(br_dev, ctx, true, atomic_nb); if (err && err != -EOPNOTSUPP) -- cgit v1.2.3 From f2e2857b352277a451e2f91409e461fa7ebf2d15 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Wed, 19 Jul 2023 13:01:17 +0200 Subject: net: switchdev: Add a helper to replay objects on a bridge port When a front panel joins a bridge via another netdevice (typically a LAG), the driver needs to learn about the objects configured on the bridge port. When the bridge port is offloaded by the driver for the first time, this can be achieved by passing a notifier to switchdev_bridge_port_offload(). The notifier is then invoked for the individual objects (such as VLANs) configured on the bridge, and can look for the interesting ones. Calling switchdev_bridge_port_offload() when the second port joins the bridge lower is unnecessary, but the replay is still needed. To that end, add a new function, switchdev_bridge_port_replay(), which does only the replay part of the _offload() function in exactly the same way as that function. Cc: Jiri Pirko Cc: Ivan Vecera Cc: Roopa Prabhu Cc: Nikolay Aleksandrov Cc: bridge@lists.linux-foundation.org Signed-off-by: Petr Machata Reviewed-by: Danielle Ratson Signed-off-by: David S. 
Miller --- include/net/switchdev.h | 6 ++++++ net/bridge/br.c | 8 ++++++++ net/bridge/br_private.h | 16 ++++++++++++++++ net/bridge/br_switchdev.c | 9 +++++++++ net/switchdev/switchdev.c | 25 +++++++++++++++++++++++++ 5 files changed, 64 insertions(+) (limited to 'net') diff --git a/include/net/switchdev.h b/include/net/switchdev.h index ca0312b78294..4d324e2a2eef 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -231,6 +231,7 @@ enum switchdev_notifier_type { SWITCHDEV_BRPORT_OFFLOADED, SWITCHDEV_BRPORT_UNOFFLOADED, + SWITCHDEV_BRPORT_REPLAY, }; struct switchdev_notifier_info { @@ -299,6 +300,11 @@ void switchdev_bridge_port_unoffload(struct net_device *brport_dev, const void *ctx, struct notifier_block *atomic_nb, struct notifier_block *blocking_nb); +int switchdev_bridge_port_replay(struct net_device *brport_dev, + struct net_device *dev, const void *ctx, + struct notifier_block *atomic_nb, + struct notifier_block *blocking_nb, + struct netlink_ext_ack *extack); void switchdev_deferred_process(void); int switchdev_port_attr_set(struct net_device *dev, diff --git a/net/bridge/br.c b/net/bridge/br.c index 4f5098d33a46..a6e94ceb7c9a 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -234,6 +234,14 @@ static int br_switchdev_blocking_event(struct notifier_block *nb, br_switchdev_port_unoffload(p, b->ctx, b->atomic_nb, b->blocking_nb); break; + case SWITCHDEV_BRPORT_REPLAY: + brport_info = ptr; + b = &brport_info->brport; + + err = br_switchdev_port_replay(p, b->dev, b->ctx, b->atomic_nb, + b->blocking_nb, extack); + err = notifier_from_errno(err); + break; } out: diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 05a965ef76f1..51e4ca54b537 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -2118,6 +2118,12 @@ void br_switchdev_port_unoffload(struct net_bridge_port *p, const void *ctx, struct notifier_block *atomic_nb, struct notifier_block *blocking_nb); +int br_switchdev_port_replay(struct net_bridge_port *p, + struct net_device *dev, const void *ctx, + struct notifier_block *atomic_nb, + struct notifier_block *blocking_nb, + struct netlink_ext_ack *extack); + bool br_switchdev_frame_uses_tx_fwd_offload(struct sk_buff *skb); void br_switchdev_frame_set_offload_fwd_mark(struct sk_buff *skb); @@ -2168,6 +2174,16 @@ br_switchdev_port_unoffload(struct net_bridge_port *p, const void *ctx, { } +static inline int +br_switchdev_port_replay(struct net_bridge_port *p, + struct net_device *dev, const void *ctx, + struct notifier_block *atomic_nb, + struct notifier_block *blocking_nb, + struct netlink_ext_ack *extack) +{ + return -EOPNOTSUPP; +} + static inline bool br_switchdev_frame_uses_tx_fwd_offload(struct sk_buff *skb) { return false; diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c index e92e0338afee..ee84e783e1df 100644 --- a/net/bridge/br_switchdev.c +++ b/net/bridge/br_switchdev.c @@ -829,3 +829,12 @@ void br_switchdev_port_unoffload(struct net_bridge_port *p, const void *ctx, nbp_switchdev_del(p); } + +int br_switchdev_port_replay(struct net_bridge_port *p, + struct net_device *dev, const void *ctx, + struct notifier_block *atomic_nb, + struct notifier_block *blocking_nb, + struct netlink_ext_ack *extack) +{ + return nbp_switchdev_sync_objs(p, ctx, atomic_nb, blocking_nb, extack); +} diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 8cc42aea19c7..5b045284849e 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -862,3 +862,28 @@ void 
switchdev_bridge_port_unoffload(struct net_device *brport_dev, NULL); } EXPORT_SYMBOL_GPL(switchdev_bridge_port_unoffload); + +int switchdev_bridge_port_replay(struct net_device *brport_dev, + struct net_device *dev, const void *ctx, + struct notifier_block *atomic_nb, + struct notifier_block *blocking_nb, + struct netlink_ext_ack *extack) +{ + struct switchdev_notifier_brport_info brport_info = { + .brport = { + .dev = dev, + .ctx = ctx, + .atomic_nb = atomic_nb, + .blocking_nb = blocking_nb, + }, + }; + int err; + + ASSERT_RTNL(); + + err = call_switchdev_blocking_notifiers(SWITCHDEV_BRPORT_REPLAY, + brport_dev, &brport_info.info, + extack); + return notifier_to_errno(err); +} +EXPORT_SYMBOL_GPL(switchdev_bridge_port_replay); -- cgit v1.2.3 From 9fe63d5f1da939855bdfaebfd9e95c96938b6411 Mon Sep 17 00:00:00 2001 From: Naveen Mamindlapalli Date: Wed, 19 Jul 2023 16:34:41 +0530 Subject: sch_htb: Allow HTB quantum parameter in offload mode The current implementation of HTB offload returns the EINVAL error for quantum parameter. This patch removes the error returning checks for 'quantum' parameter and populates its value to tc_htb_qopt_offload structure such that driver can use the same. Add quantum parameter check in mlx5 driver, as mlx5 devices are not capable of supporting the quantum parameter when htb offload is used. Report error if quantum parameter is set to a non-default value. Signed-off-by: Naveen Mamindlapalli Signed-off-by: Hariprasad Kelam Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en/qos.c | 4 ++-- include/net/pkt_cls.h | 1 + net/sched/sch_htb.c | 7 +++---- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/en/qos.c index 1874c2f0587f..244bc15a42ab 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/qos.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/qos.c @@ -379,9 +379,9 @@ int mlx5e_htb_setup_tc(struct mlx5e_priv *priv, struct tc_htb_qopt_offload *htb_ if (!htb && htb_qopt->command != TC_HTB_CREATE) return -EINVAL; - if (htb_qopt->prio) { + if (htb_qopt->prio || htb_qopt->quantum) { NL_SET_ERR_MSG_MOD(htb_qopt->extack, - "prio parameter is not supported by device with HTB offload enabled."); + "prio and quantum parameters are not supported by device with HTB offload enabled."); return -EOPNOTSUPP; } diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index a2ea45c7b53e..139cd09828af 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -866,6 +866,7 @@ struct tc_htb_qopt_offload { u32 parent_classid; u16 classid; u16 qid; + u32 quantum; u64 rate; u64 ceil; u8 prio; diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 325c29041c7d..333800a7d4eb 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -1810,10 +1810,6 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, NL_SET_ERR_MSG(extack, "HTB offload doesn't support the mpu parameter"); goto failure; } - if (hopt->quantum) { - NL_SET_ERR_MSG(extack, "HTB offload doesn't support the quantum parameter"); - goto failure; - } } /* Keeping backward compatible with rate_table based iproute2 tc */ @@ -1910,6 +1906,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, .rate = max_t(u64, hopt->rate.rate, rate64), .ceil = max_t(u64, hopt->ceil.rate, ceil64), .prio = hopt->prio, + .quantum = hopt->quantum, .extack = extack, }; err = htb_offload(dev, &offload_opt); @@ -1931,6 +1928,7 @@ static int htb_change_class(struct 
Qdisc *sch, u32 classid, .rate = max_t(u64, hopt->rate.rate, rate64), .ceil = max_t(u64, hopt->ceil.rate, ceil64), .prio = hopt->prio, + .quantum = hopt->quantum, .extack = extack, }; err = htb_offload(dev, &offload_opt); @@ -2017,6 +2015,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, .rate = max_t(u64, hopt->rate.rate, rate64), .ceil = max_t(u64, hopt->ceil.rate, ceil64), .prio = hopt->prio, + .quantum = hopt->quantum, .extack = extack, }; err = htb_offload(dev, &offload_opt); -- cgit v1.2.3 From 5766946ea5117e4edeb78c80cac367fb06854cc1 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 20 Jul 2023 13:13:54 +0200 Subject: genetlink: add explicit ordering break check for split ops Currently, if cmd in the split ops array is of lower value than the previous one, genl_validate_ops() continues to do the checks as if the values are equal. This may result in non-obvious WARN_ON() hit in these check. Instead, check the incorrect ordering explicitly and put a WARN_ON() in case it is broken. Signed-off-by: Jiri Pirko Reviewed-by: Jacob Keller Link: https://lore.kernel.org/r/20230720111354.562242-1-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- net/netlink/genetlink.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index a157247a1e45..6bd2ce51271f 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -593,8 +593,12 @@ static int genl_validate_ops(const struct genl_family *family) return -EINVAL; /* Check sort order */ - if (a->cmd < b->cmd) + if (a->cmd < b->cmd) { continue; + } else if (a->cmd > b->cmd) { + WARN_ON(1); + return -EINVAL; + } if (a->internal_flags != b->internal_flags || ((a->flags ^ b->flags) & ~(GENL_CMD_CAP_DO | -- cgit v1.2.3 From 535b9c61bdef6017228c708128b7849a476f8da5 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 19 Jul 2023 18:04:08 -0700 Subject: net: page_pool: hide page_pool_release_page() There seems to be no user calling page_pool_release_page() for legit reasons, all the users simply haven't been converted to skb-based recycling, yet. Previous changes converted them. Update the docs, and unexport the function. Link: https://lore.kernel.org/r/20230720010409.1967072-4-kuba@kernel.org Reviewed-by: Alexander Lobakin Signed-off-by: Jakub Kicinski --- Documentation/networking/page_pool.rst | 11 ++++------- include/net/page_pool.h | 10 ++-------- net/core/page_pool.c | 3 +-- 3 files changed, 7 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/Documentation/networking/page_pool.rst b/Documentation/networking/page_pool.rst index 873efd97f822..0aa850cf4447 100644 --- a/Documentation/networking/page_pool.rst +++ b/Documentation/networking/page_pool.rst @@ -13,9 +13,9 @@ replacing dev_alloc_pages(). API keeps track of in-flight pages, in order to let API user know when it is safe to free a page_pool object. Thus, API users -must run page_pool_release_page() when a page is leaving the page_pool or -call page_pool_put_page() where appropriate in order to maintain correct -accounting. +must call page_pool_put_page() to free the page, or attach +the page to a page_pool-aware objects like skbs marked with +skb_mark_for_recycle(). API user must call page_pool_put_page() once on a page, as it will either recycle the page, or in case of refcnt > 1, it will @@ -87,9 +87,6 @@ a page will cause no race conditions is enough. must guarantee safe context (e.g NAPI), since it will recycle the page directly into the pool fast cache. 
-* page_pool_release_page(): Unmap the page (if mapped) and account for it on - in-flight counters. - * page_pool_dev_alloc_pages(): Get a page from the page allocator or page_pool caches. @@ -194,7 +191,7 @@ NAPI poller if XDP_DROP: page_pool_recycle_direct(page_pool, page); } else (packet_is_skb) { - page_pool_release_page(page_pool, page); + skb_mark_for_recycle(skb); new_page = page_pool_dev_alloc_pages(page_pool); } } diff --git a/include/net/page_pool.h b/include/net/page_pool.h index 126f9e294389..f1d5cc1fa13b 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -18,9 +18,8 @@ * * API keeps track of in-flight pages, in-order to let API user know * when it is safe to dealloactor page_pool object. Thus, API users - * must make sure to call page_pool_release_page() when a page is - * "leaving" the page_pool. Or call page_pool_put_page() where - * appropiate. For maintaining correct accounting. + * must call page_pool_put_page() where appropriate and only attach + * the page to a page_pool-aware objects, like skbs marked for recycling. * * API user must only call page_pool_put_page() once on a page, as it * will either recycle the page, or in case of elevated refcnt, it @@ -251,7 +250,6 @@ void page_pool_unlink_napi(struct page_pool *pool); void page_pool_destroy(struct page_pool *pool); void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *), struct xdp_mem_info *mem); -void page_pool_release_page(struct page_pool *pool, struct page *page); void page_pool_put_page_bulk(struct page_pool *pool, void **data, int count); #else @@ -268,10 +266,6 @@ static inline void page_pool_use_xdp_mem(struct page_pool *pool, struct xdp_mem_info *mem) { } -static inline void page_pool_release_page(struct page_pool *pool, - struct page *page) -{ -} static inline void page_pool_put_page_bulk(struct page_pool *pool, void **data, int count) diff --git a/net/core/page_pool.c b/net/core/page_pool.c index a3e12a61d456..2c7cf5f2bcb8 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -492,7 +492,7 @@ static s32 page_pool_inflight(struct page_pool *pool) * a regular page (that will eventually be returned to the normal * page-allocator via put_page). */ -void page_pool_release_page(struct page_pool *pool, struct page *page) +static void page_pool_release_page(struct page_pool *pool, struct page *page) { dma_addr_t dma; int count; @@ -519,7 +519,6 @@ skip_dma_unmap: count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt); trace_page_pool_state_release(pool, page, count); } -EXPORT_SYMBOL(page_pool_release_page); /* Return a page to the page allocator, cleaning up our state */ static void page_pool_return_page(struct page_pool *pool, struct page *page) -- cgit v1.2.3 From 07e0c7d3179da5d06132f3d71b740aa91bde52aa Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 19 Jul 2023 18:04:09 -0700 Subject: net: page_pool: merge page_pool_release_page() with page_pool_return_page() Now that page_pool_release_page() is not exported we can merge it with page_pool_return_page(). I believe that the "Do not replace this with page_pool_return_page()" comment was there in case page_pool_return_page() was not inlined, to avoid two function calls. 
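Taken together, the two page_pool changes above mean drivers only deal with page_pool_put_page()/page_pool_recycle_direct() and skb_mark_for_recycle(). The fragment below is a rough kernel-context sketch of that pattern, not a complete or compilable driver path; build_skb_from_page() is a hypothetical helper standing in for the driver's skb construction.

  static void demo_rx_one(struct page_pool *pool, struct page *page, u32 act)
  {
      if (act == XDP_DROP) {
          /* NAPI context: fast recycle directly into the pool cache */
          page_pool_recycle_direct(pool, page);
          return;
      }

      {
          struct sk_buff *skb = build_skb_from_page(page); /* hypothetical helper */

          if (skb)
              skb_mark_for_recycle(skb);  /* pool reclaims the page on kfree_skb() */
      }
  }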
Acked-by: Jesper Dangaard Brouer Reviewed-by: Yunsheng Lin Link: https://lore.kernel.org/r/20230720010409.1967072-5-kuba@kernel.org Reviewed-by: Alexander Lobakin Signed-off-by: Jakub Kicinski --- net/core/page_pool.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 2c7cf5f2bcb8..7ca456bfab71 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -492,7 +492,7 @@ static s32 page_pool_inflight(struct page_pool *pool) * a regular page (that will eventually be returned to the normal * page-allocator via put_page). */ -static void page_pool_release_page(struct page_pool *pool, struct page *page) +static void page_pool_return_page(struct page_pool *pool, struct page *page) { dma_addr_t dma; int count; @@ -518,12 +518,6 @@ skip_dma_unmap: */ count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt); trace_page_pool_state_release(pool, page, count); -} - -/* Return a page to the page allocator, cleaning up our state */ -static void page_pool_return_page(struct page_pool *pool, struct page *page) -{ - page_pool_release_page(pool, page); put_page(page); /* An optimization would be to call __free_pages(page, pool->p.order) @@ -615,9 +609,7 @@ __page_pool_put_page(struct page_pool *pool, struct page *page, * will be invoking put_page. */ recycle_stat_inc(pool, released_refcnt); - /* Do not replace this with page_pool_return_page() */ - page_pool_release_page(pool, page); - put_page(page); + page_pool_return_page(pool, page); return NULL; } -- cgit v1.2.3 From a3377386b56420d78a4c0a931a40f9a25c3ca2bd Mon Sep 17 00:00:00 2001 From: Anjali Kulkarni Date: Wed, 19 Jul 2023 13:18:16 -0700 Subject: netlink: Reverse the patch which removed filtering To use filtering at the connector & cn_proc layers, we need to enable filtering in the netlink layer. This reverses the patch which removed netlink filtering - commit ID for that patch: 549017aa1bb7 (netlink: remove netlink_broadcast_filtered). Signed-off-by: Anjali Kulkarni Reviewed-by: Liam R. Howlett Acked-by: Jakub Kicinski Signed-off-by: David S. 
Miller --- include/linux/netlink.h | 5 +++++ net/netlink/af_netlink.c | 27 +++++++++++++++++++++++++-- 2 files changed, 30 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 9eec3f4f5351..3a6563681b50 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -227,6 +227,11 @@ bool netlink_strict_get_check(struct sk_buff *skb); int netlink_unicast(struct sock *ssk, struct sk_buff *skb, __u32 portid, int nonblock); int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, __u32 portid, __u32 group, gfp_t allocation); +int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, + __u32 portid, __u32 group, gfp_t allocation, + int (*filter)(struct sock *dsk, + struct sk_buff *skb, void *data), + void *filter_data); int netlink_set_err(struct sock *ssk, __u32 portid, __u32 group, int code); int netlink_register_notifier(struct notifier_block *nb); int netlink_unregister_notifier(struct notifier_block *nb); diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 9c9df143a2ec..6c0bcde620e8 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1432,6 +1432,8 @@ struct netlink_broadcast_data { int delivered; gfp_t allocation; struct sk_buff *skb, *skb2; + int (*tx_filter)(struct sock *dsk, struct sk_buff *skb, void *data); + void *tx_data; }; static void do_one_broadcast(struct sock *sk, @@ -1485,6 +1487,13 @@ static void do_one_broadcast(struct sock *sk, p->delivery_failure = 1; goto out; } + + if (p->tx_filter && p->tx_filter(sk, p->skb2, p->tx_data)) { + kfree_skb(p->skb2); + p->skb2 = NULL; + goto out; + } + if (sk_filter(sk, p->skb2)) { kfree_skb(p->skb2); p->skb2 = NULL; @@ -1507,8 +1516,12 @@ out: sock_put(sk); } -int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 portid, - u32 group, gfp_t allocation) +int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, + u32 portid, + u32 group, gfp_t allocation, + int (*filter)(struct sock *dsk, + struct sk_buff *skb, void *data), + void *filter_data) { struct net *net = sock_net(ssk); struct netlink_broadcast_data info; @@ -1527,6 +1540,8 @@ int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 portid, info.allocation = allocation; info.skb = skb; info.skb2 = NULL; + info.tx_filter = filter; + info.tx_data = filter_data; /* While we sleep in clone, do not allow to change socket list */ @@ -1552,6 +1567,14 @@ int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 portid, } return -ESRCH; } +EXPORT_SYMBOL(netlink_broadcast_filtered); + +int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 portid, + u32 group, gfp_t allocation) +{ + return netlink_broadcast_filtered(ssk, skb, portid, group, allocation, + NULL, NULL); +} EXPORT_SYMBOL(netlink_broadcast); struct netlink_set_err_data { -- cgit v1.2.3 From a4c9a56e6a2cdeeab7caef1f496b7bfefd95b50e Mon Sep 17 00:00:00 2001 From: Anjali Kulkarni Date: Wed, 19 Jul 2023 13:18:17 -0700 Subject: netlink: Add new netlink_release function A new function netlink_release is added in netlink_sock to store the protocol's release function. This is called when the socket is deleted. This can be supplied by the protocol via the release function in netlink_kernel_cfg. This is being added for the NETLINK_CONNECTOR protocol, so it can free it's data when socket is deleted. Signed-off-by: Anjali Kulkarni Reviewed-by: Liam R. Howlett Acked-by: Jakub Kicinski Signed-off-by: David S. 
Miller --- include/linux/netlink.h | 1 + net/netlink/af_netlink.c | 6 ++++++ net/netlink/af_netlink.h | 4 ++++ 3 files changed, 11 insertions(+) (limited to 'net') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 3a6563681b50..75d7de34c908 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -50,6 +50,7 @@ struct netlink_kernel_cfg { struct mutex *cb_mutex; int (*bind)(struct net *net, int group); void (*unbind)(struct net *net, int group); + void (*release) (struct sock *sk, unsigned long *groups); }; struct sock *__netlink_kernel_create(struct net *net, int unit, diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 6c0bcde620e8..96c605e45235 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -677,6 +677,7 @@ static int netlink_create(struct net *net, struct socket *sock, int protocol, struct netlink_sock *nlk; int (*bind)(struct net *net, int group); void (*unbind)(struct net *net, int group); + void (*release)(struct sock *sock, unsigned long *groups); int err = 0; sock->state = SS_UNCONNECTED; @@ -704,6 +705,7 @@ static int netlink_create(struct net *net, struct socket *sock, int protocol, cb_mutex = nl_table[protocol].cb_mutex; bind = nl_table[protocol].bind; unbind = nl_table[protocol].unbind; + release = nl_table[protocol].release; netlink_unlock_table(); if (err < 0) @@ -719,6 +721,7 @@ static int netlink_create(struct net *net, struct socket *sock, int protocol, nlk->module = module; nlk->netlink_bind = bind; nlk->netlink_unbind = unbind; + nlk->netlink_release = release; out: return err; @@ -763,6 +766,8 @@ static int netlink_release(struct socket *sock) * OK. Socket is unlinked, any packets that arrive now * will be purged. */ + if (nlk->netlink_release) + nlk->netlink_release(sk, nlk->groups); /* must not acquire netlink_table_lock in any way again before unbind * and notifying genetlink is done as otherwise it might deadlock @@ -2089,6 +2094,7 @@ __netlink_kernel_create(struct net *net, int unit, struct module *module, if (cfg) { nl_table[unit].bind = cfg->bind; nl_table[unit].unbind = cfg->unbind; + nl_table[unit].release = cfg->release; nl_table[unit].flags = cfg->flags; } nl_table[unit].registered = 1; diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h index 90a3198a9b7f..fd424cd63f31 100644 --- a/net/netlink/af_netlink.h +++ b/net/netlink/af_netlink.h @@ -42,6 +42,8 @@ struct netlink_sock { void (*netlink_rcv)(struct sk_buff *skb); int (*netlink_bind)(struct net *net, int group); void (*netlink_unbind)(struct net *net, int group); + void (*netlink_release)(struct sock *sk, + unsigned long *groups); struct module *module; struct rhash_head node; @@ -64,6 +66,8 @@ struct netlink_table { struct module *module; int (*bind)(struct net *net, int group); void (*unbind)(struct net *net, int group); + void (*release)(struct sock *sk, + unsigned long *groups); int registered; }; -- cgit v1.2.3 From 5c9f7b04aadf79f783542d096dfbb8184e7c91f5 Mon Sep 17 00:00:00 2001 From: "justinstitt@google.com" Date: Tue, 18 Jul 2023 22:56:38 +0000 Subject: net: dsa: remove deprecated strncpy `strncpy` is deprecated for use on NUL-terminated destination strings [1]. Even call sites utilizing length-bounded destination buffers should switch over to using `strtomem` or `strtomem_pad`. In this case, however, the compiler is unable to determine the size of the `data` buffer which renders `strtomem` unusable. Due to this, `strscpy` should be used. 
It should be noted that most call sites already zero-initialize the destination buffer. However, I've opted to use `strscpy_pad` to maintain the same exact behavior that `strncpy` produced (zero-padded tail up to `len`). Also see [3]. [1]: www.kernel.org/doc/html/latest/process/deprecated.html#strncpy-on-nul-terminated-strings [2]: elixir.bootlin.com/linux/v6.3/source/net/ethtool/ioctl.c#L1944 [3]: manpages.debian.org/testing/linux-manual-4.8/strscpy.9.en.html Link: https://github.com/KSPP/linux/issues/90 Reviewed-by: Nick Desaulniers Reviewed-by: Kees Cook Signed-off-by: Justin Stitt Signed-off-by: David S. Miller --- net/dsa/slave.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 527b1d576460..48db91b33390 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "dsa.h" #include "port.h" @@ -1056,10 +1057,10 @@ static void dsa_slave_get_strings(struct net_device *dev, if (stringset == ETH_SS_STATS) { int len = ETH_GSTRING_LEN; - strncpy(data, "tx_packets", len); - strncpy(data + len, "tx_bytes", len); - strncpy(data + 2 * len, "rx_packets", len); - strncpy(data + 3 * len, "rx_bytes", len); + strscpy_pad(data, "tx_packets", len); + strscpy_pad(data + len, "tx_bytes", len); + strscpy_pad(data + 2 * len, "rx_packets", len); + strscpy_pad(data + 3 * len, "rx_bytes", len); if (ds->ops->get_strings) ds->ops->get_strings(ds, dp->index, stringset, data + 4 * len); -- cgit v1.2.3 From 1671bcfd76fdc0b9e65153cf759153083755fe4c Mon Sep 17 00:00:00 2001 From: Patrick Rohr Date: Wed, 19 Jul 2023 07:52:13 -0700 Subject: net: add sysctl accept_ra_min_rtr_lft MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change adds a new sysctl accept_ra_min_rtr_lft to specify the minimum acceptable router lifetime in an RA. If the received RA router lifetime is less than the configured value (and not 0), the RA is ignored. This is useful for mobile devices, whose battery life can be impacted by networks that configure RAs with a short lifetime. On such networks, the device should never gain IPv6 provisioning and should attempt to drop RAs via hardware offload, if available. Signed-off-by: Patrick Rohr Cc: Maciej Å»enczykowski Cc: Lorenzo Colitti Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.rst | 8 ++++++++ include/linux/ipv6.h | 1 + include/uapi/linux/ipv6.h | 1 + net/ipv6/addrconf.c | 10 ++++++++++ net/ipv6/ndisc.c | 18 ++++++++++++++++-- 5 files changed, 36 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 82f2117cf2b3..37603ad6126b 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -2288,6 +2288,14 @@ accept_ra_min_hop_limit - INTEGER Default: 1 +accept_ra_min_rtr_lft - INTEGER + Minimum acceptable router lifetime in Router Advertisement. + + RAs with a router lifetime less than this value shall be + ignored. RAs with a router lifetime of 0 are unaffected. + + Default: 0 + accept_ra_pinfo - BOOLEAN Learn Prefix Information in Router Advertisement. 
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 839247a4f48e..ed3d110c2eb5 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -33,6 +33,7 @@ struct ipv6_devconf { __s32 accept_ra_defrtr; __u32 ra_defrtr_metric; __s32 accept_ra_min_hop_limit; + __s32 accept_ra_min_rtr_lft; __s32 accept_ra_pinfo; __s32 ignore_routes_with_linkdown; #ifdef CONFIG_IPV6_ROUTER_PREF diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index ac56605fe9bc..8b6bcbf6ed4a 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -198,6 +198,7 @@ enum { DEVCONF_IOAM6_ID_WIDE, DEVCONF_NDISC_EVICT_NOCARRIER, DEVCONF_ACCEPT_UNTRACKED_NA, + DEVCONF_ACCEPT_RA_MIN_RTR_LFT, DEVCONF_MAX }; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index e5213e598a04..19eb4b3d26ea 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -202,6 +202,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .ra_defrtr_metric = IP6_RT_PRIO_USER, .accept_ra_from_local = 0, .accept_ra_min_hop_limit= 1, + .accept_ra_min_rtr_lft = 0, .accept_ra_pinfo = 1, #ifdef CONFIG_IPV6_ROUTER_PREF .accept_ra_rtr_pref = 1, @@ -262,6 +263,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .ra_defrtr_metric = IP6_RT_PRIO_USER, .accept_ra_from_local = 0, .accept_ra_min_hop_limit= 1, + .accept_ra_min_rtr_lft = 0, .accept_ra_pinfo = 1, #ifdef CONFIG_IPV6_ROUTER_PREF .accept_ra_rtr_pref = 1, @@ -5596,6 +5598,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_IOAM6_ID_WIDE] = cnf->ioam6_id_wide; array[DEVCONF_NDISC_EVICT_NOCARRIER] = cnf->ndisc_evict_nocarrier; array[DEVCONF_ACCEPT_UNTRACKED_NA] = cnf->accept_untracked_na; + array[DEVCONF_ACCEPT_RA_MIN_RTR_LFT] = cnf->accept_ra_min_rtr_lft; } static inline size_t inet6_ifla6_size(void) @@ -6789,6 +6792,13 @@ static const struct ctl_table addrconf_sysctl[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "accept_ra_min_rtr_lft", + .data = &ipv6_devconf.accept_ra_min_rtr_lft, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { .procname = "accept_ra_pinfo", .data = &ipv6_devconf.accept_ra_pinfo, diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 18634ebd20a4..29ddad1c1a2f 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1280,6 +1280,8 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb) if (!ndisc_parse_options(skb->dev, opt, optlen, &ndopts)) return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS; + lifetime = ntohs(ra_msg->icmph.icmp6_rt_lifetime); + if (!ipv6_accept_ra(in6_dev)) { ND_PRINTK(2, info, "RA: %s, did not accept ra for dev: %s\n", @@ -1287,6 +1289,13 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb) goto skip_linkparms; } + if (lifetime != 0 && lifetime < in6_dev->cnf.accept_ra_min_rtr_lft) { + ND_PRINTK(2, info, + "RA: router lifetime (%ds) is too short: %s\n", + lifetime, skb->dev->name); + goto skip_linkparms; + } + #ifdef CONFIG_IPV6_NDISC_NODETYPE /* skip link-specific parameters from interior routers */ if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT) { @@ -1339,8 +1348,6 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb) goto skip_defrtr; } - lifetime = ntohs(ra_msg->icmph.icmp6_rt_lifetime); - #ifdef CONFIG_IPV6_ROUTER_PREF pref = ra_msg->icmph.icmp6_router_pref; /* 10b is handled as if it were 00b (medium) */ @@ -1492,6 +1499,13 @@ skip_linkparms: goto out; } + if (lifetime != 0 && lifetime < in6_dev->cnf.accept_ra_min_rtr_lft) { + ND_PRINTK(2, 
info, + "RA: router lifetime (%ds) is too short: %s\n", + lifetime, skb->dev->name); + goto out; + } + #ifdef CONFIG_IPV6_ROUTE_INFO if (!in6_dev->cnf.accept_ra_from_local && ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr, -- cgit v1.2.3 From f5f80e32de12fad2813d37270e8364a03e6d3ef0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 20 Jul 2023 11:09:01 +0000 Subject: ipv6: remove hard coded limitation on ipv6_pinfo IPv6 inet sockets are supposed to have a "struct ipv6_pinfo" field at the end of their definition, so that inet6_sk_generic() can derive from socket size the offset of the "struct ipv6_pinfo". This is very fragile, and prevents adding bigger alignment in sockets, because inet6_sk_generic() does not work if the compiler adds padding after the ipv6_pinfo component. We are currently working on a patch series to reorganize TCP structures for better data locality and found issues similar to the one fixed in commit f5d547676ca0 ("tcp: fix tcp_inet6_sk() for 32bit kernels") Alternative would be to force an alignment on "struct ipv6_pinfo", greater or equal to __alignof__(any ipv6 sock) to ensure there is no padding. This does not look great. v2: fix typo in mptcp_proto_v6_init() (Paolo) Signed-off-by: Eric Dumazet Cc: Chao Wu Cc: Wei Wang Cc: Coco Li Cc: YiFei Zhu Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/linux/ipv6.h | 15 ++++----------- include/net/sock.h | 1 + net/dccp/ipv6.c | 1 + net/dccp/ipv6.h | 4 ---- net/ipv6/af_inet6.c | 4 ++-- net/ipv6/ping.c | 1 + net/ipv6/raw.c | 1 + net/ipv6/tcp_ipv6.c | 1 + net/ipv6/udp.c | 1 + net/ipv6/udplite.c | 1 + net/l2tp/l2tp_ip6.c | 4 +--- net/mptcp/protocol.c | 1 + net/sctp/socket.c | 1 + 13 files changed, 16 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index ed3d110c2eb5..0295b47c10a3 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -200,14 +200,7 @@ struct inet6_cork { u8 tclass; }; -/** - * struct ipv6_pinfo - ipv6 private area - * - * In the struct sock hierarchy (tcp6_sock, upd6_sock, etc) - * this _must_ be the last member, so that inet6_sk_generic - * is able to calculate its offset from the base struct sock - * by using the struct proto->slab_obj_size member. 
-acme - */ +/* struct ipv6_pinfo - ipv6 private area */ struct ipv6_pinfo { struct in6_addr saddr; struct in6_pktinfo sticky_pktinfo; @@ -307,19 +300,19 @@ struct raw6_sock { __u32 offset; /* checksum offset */ struct icmp6_filter filter; __u32 ip6mr_table; - /* ipv6_pinfo has to be the last member of raw6_sock, see inet6_sk_generic */ + struct ipv6_pinfo inet6; }; struct udp6_sock { struct udp_sock udp; - /* ipv6_pinfo has to be the last member of udp6_sock, see inet6_sk_generic */ + struct ipv6_pinfo inet6; }; struct tcp6_sock { struct tcp_sock tcp; - /* ipv6_pinfo has to be the last member of tcp6_sock, see inet6_sk_generic */ + struct ipv6_pinfo inet6; }; diff --git a/include/net/sock.h b/include/net/sock.h index 2eb916d1ff64..7ae44bf866af 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1339,6 +1339,7 @@ struct proto { struct kmem_cache *slab; unsigned int obj_size; + unsigned int ipv6_pinfo_offset; slab_flags_t slab_flags; unsigned int useroffset; /* Usercopy region offset */ unsigned int usersize; /* Usercopy region size */ diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 7249ef218178..e03b5331df6d 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -1056,6 +1056,7 @@ static struct proto dccp_v6_prot = { .orphan_count = &dccp_orphan_count, .max_header = MAX_DCCP_HEADER, .obj_size = sizeof(struct dccp6_sock), + .ipv6_pinfo_offset = offsetof(struct dccp6_sock, inet6), .slab_flags = SLAB_TYPESAFE_BY_RCU, .rsk_prot = &dccp6_request_sock_ops, .twsk_prot = &dccp6_timewait_sock_ops, diff --git a/net/dccp/ipv6.h b/net/dccp/ipv6.h index 7e4c2a3b322b..c5d14c48def1 100644 --- a/net/dccp/ipv6.h +++ b/net/dccp/ipv6.h @@ -13,10 +13,6 @@ struct dccp6_sock { struct dccp_sock dccp; - /* - * ipv6_pinfo has to be the last member of dccp6_sock, - * see inet6_sk_generic. 
- */ struct ipv6_pinfo inet6; }; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 5d593ddc0347..9f9c4b838664 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -102,9 +102,9 @@ bool ipv6_mod_enabled(void) } EXPORT_SYMBOL_GPL(ipv6_mod_enabled); -static __inline__ struct ipv6_pinfo *inet6_sk_generic(struct sock *sk) +static struct ipv6_pinfo *inet6_sk_generic(struct sock *sk) { - const int offset = sk->sk_prot->obj_size - sizeof(struct ipv6_pinfo); + const int offset = sk->sk_prot->ipv6_pinfo_offset; return (struct ipv6_pinfo *)(((u8 *)sk) + offset); } diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index f804c11e2146..2a0e8bc07398 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -215,6 +215,7 @@ struct proto pingv6_prot = { .get_port = ping_get_port, .put_port = ping_unhash, .obj_size = sizeof(struct raw6_sock), + .ipv6_pinfo_offset = offsetof(struct raw6_sock, inet6), }; EXPORT_SYMBOL_GPL(pingv6_prot); diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index ac1cef094c5f..0fcf1b890807 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -1216,6 +1216,7 @@ struct proto rawv6_prot = { .hash = raw_hash_sk, .unhash = raw_unhash_sk, .obj_size = sizeof(struct raw6_sock), + .ipv6_pinfo_offset = offsetof(struct raw6_sock, inet6), .useroffset = offsetof(struct raw6_sock, filter), .usersize = sizeof_field(struct raw6_sock, filter), .h.raw_hash = &raw_v6_hashinfo, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 4714eb695913..1b4529e833a1 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2175,6 +2175,7 @@ struct proto tcpv6_prot = { .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp6_sock), + .ipv6_pinfo_offset = offsetof(struct tcp6_sock, inet6), .slab_flags = SLAB_TYPESAFE_BY_RCU, .twsk_prot = &tcp6_timewait_sock_ops, .rsk_prot = &tcp6_request_sock_ops, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index b7c972aa09a7..95c75d8f73d5 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1798,6 +1798,7 @@ struct proto udpv6_prot = { .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min), .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min), .obj_size = sizeof(struct udp6_sock), + .ipv6_pinfo_offset = offsetof(struct udp6_sock, inet6), .h.udp_table = NULL, .diag_destroy = udp_abort, }; diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c index 8e010d07917a..267d491e9707 100644 --- a/net/ipv6/udplite.c +++ b/net/ipv6/udplite.c @@ -67,6 +67,7 @@ struct proto udplitev6_prot = { .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min), .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min), .obj_size = sizeof(struct udp6_sock), + .ipv6_pinfo_offset = offsetof(struct udp6_sock, inet6), .h.udp_table = &udplite_table, }; diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index b1623f9c4f92..2eee95a00c05 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -36,9 +36,6 @@ struct l2tp_ip6_sock { u32 conn_id; u32 peer_conn_id; - /* ipv6_pinfo has to be the last member of l2tp_ip6_sock, see - * inet6_sk_generic - */ struct ipv6_pinfo inet6; }; @@ -730,6 +727,7 @@ static struct proto l2tp_ip6_prot = { .hash = l2tp_ip6_hash, .unhash = l2tp_ip6_unhash, .obj_size = sizeof(struct l2tp_ip6_sock), + .ipv6_pinfo_offset = offsetof(struct l2tp_ip6_sock, inet6), }; static const struct proto_ops l2tp_ip6_ops = { diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 3613489eb6e3..22323b1fa184 100644 --- a/net/mptcp/protocol.c +++ 
b/net/mptcp/protocol.c @@ -3988,6 +3988,7 @@ int __init mptcp_proto_v6_init(void) strcpy(mptcp_v6_prot.name, "MPTCPv6"); mptcp_v6_prot.slab = NULL; mptcp_v6_prot.obj_size = sizeof(struct mptcp6_sock); + mptcp_v6_prot.ipv6_pinfo_offset = offsetof(struct mptcp6_sock, np); err = proto_register(&mptcp_v6_prot, 1); if (err) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 9388d98aebc0..6e3d28aa587c 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -9732,6 +9732,7 @@ struct proto sctpv6_prot = { .unhash = sctp_unhash, .no_autobind = true, .obj_size = sizeof(struct sctp6_sock), + .ipv6_pinfo_offset = offsetof(struct sctp6_sock, inet6), .useroffset = offsetof(struct sctp6_sock, sctp.subscribe), .usersize = offsetof(struct sctp6_sock, sctp.initmsg) - offsetof(struct sctp6_sock, sctp.subscribe) + -- cgit v1.2.3 From dc644b540a2d2874112706591234be3d3fbf9ef7 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 22 Jul 2023 01:33:30 +0200 Subject: tcx: Fix splat in ingress_destroy upon tcx_entry_free On qdisc destruction, the ingress_destroy() needs to update the correct entry, that is, tcx_entry_update must NULL the dev->tcx_ingress pointer. Therefore, fix the typo. Fixes: e420bed02507 ("bpf: Add fd-based tcx multi-prog infra with link support") Reported-by: syzbot+bdcf141f362ef83335cf@syzkaller.appspotmail.com Reported-by: syzbot+b202b7208664142954fa@syzkaller.appspotmail.com Reported-by: syzbot+14736e249bce46091c18@syzkaller.appspotmail.com Signed-off-by: Daniel Borkmann Tested-by: syzbot+bdcf141f362ef83335cf@syzkaller.appspotmail.com Tested-by: syzbot+b202b7208664142954fa@syzkaller.appspotmail.com Tested-by: syzbot+14736e249bce46091c18@syzkaller.appspotmail.com Tested-by: Petr Machata Link: https://lore.kernel.org/r/20230721233330.5678-1-daniel@iogearbox.net Signed-off-by: Jakub Kicinski --- net/sched/sch_ingress.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index 04e886f6cee4..a463a63192c3 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -123,7 +123,7 @@ static void ingress_destroy(struct Qdisc *sch) if (entry) { tcx_miniq_set_active(entry, false); if (!tcx_entry_is_active(entry)) { - tcx_entry_update(dev, NULL, false); + tcx_entry_update(dev, NULL, true); tcx_entry_free(entry); } } -- cgit v1.2.3 From b8dc6d6ce93142ccd4c976003bb6c25d63aac2ce Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 20 Jul 2023 20:47:50 +0200 Subject: mptcp: fix rcv buffer auto-tuning The MPTCP code uses the assumption that the tcp_win_from_space() helper does not use any TCP-specific field, and thus works correctly operating on an MPTCP socket. The commit dfa2f0483360 ("tcp: get rid of sysctl_tcp_adv_win_scale") broke such assumption, and as a consequence most MPTCP connections stall on zero-window event due to auto-tuning changing the rcv buffer size quite randomly. Address the issue syncing again the MPTCP auto-tuning code with the TCP one. To achieve that, factor out the windows size logic in socket independent helpers, and reuse them in mptcp_rcv_space_adjust(). The MPTCP level scaling_ratio is selected as the minimum one from the all the subflows, as a worst-case estimate. 
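[Editor's note: the arithmetic behind the new socket-independent helpers can be shown with a small standalone userspace sketch. It assumes TCP_RMEM_TO_WIN_SCALE is 8 and U8_MAX is 255, mirrors __tcp_win_from_space()/__tcp_space_from_win() only in spirit, and uses made-up per-subflow scaling ratios; it is not the kernel implementation.]

#include <stdint.h>
#include <stdio.h>

#define RMEM_TO_WIN_SCALE 8	/* assumed value of TCP_RMEM_TO_WIN_SCALE */

static int64_t win_from_space(uint8_t scaling_ratio, int64_t space)
{
	return (space * scaling_ratio) >> RMEM_TO_WIN_SCALE;
}

static int64_t space_from_win(uint8_t scaling_ratio, int64_t win)
{
	return (win << RMEM_TO_WIN_SCALE) / scaling_ratio;
}

int main(void)
{
	uint8_t subflow_ratio[] = { 230, 192, 214 };	/* hypothetical subflows */
	uint8_t msk_ratio = 255;			/* start from U8_MAX */
	unsigned int i;

	/* worst-case estimate: take the minimum ratio across all subflows */
	for (i = 0; i < sizeof(subflow_ratio); i++)
		if (subflow_ratio[i] < msk_ratio)
			msk_ratio = subflow_ratio[i];

	printf("msk ratio %u: 64KiB rcvbuf -> %lld window, 64KiB window -> %lld rcvbuf\n",
	       msk_ratio, (long long)win_from_space(msk_ratio, 65536),
	       (long long)space_from_win(msk_ratio, 65536));
	return 0;
}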
Fixes: dfa2f0483360 ("tcp: get rid of sysctl_tcp_adv_win_scale") Signed-off-by: Paolo Abeni Co-developed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Reviewed-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Link: https://lore.kernel.org/r/20230720-upstream-net-next-20230720-mptcp-fix-rcv-buffer-auto-tuning-v1-1-175ef12b8380@tessares.net Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 20 +++++++++++++++----- net/mptcp/protocol.c | 15 +++++++-------- net/mptcp/protocol.h | 8 +++++++- net/mptcp/subflow.c | 2 +- 4 files changed, 30 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index d17cb8ab4c48..6ebf54992ffe 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1430,22 +1430,32 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 *window_clamp, int wscale_ok, __u8 *rcv_wscale, __u32 init_rcv_wnd); -static inline int tcp_win_from_space(const struct sock *sk, int space) +static inline int __tcp_win_from_space(u8 scaling_ratio, int space) { - s64 scaled_space = (s64)space * tcp_sk(sk)->scaling_ratio; + s64 scaled_space = (s64)space * scaling_ratio; return scaled_space >> TCP_RMEM_TO_WIN_SCALE; } -/* inverse of tcp_win_from_space() */ -static inline int tcp_space_from_win(const struct sock *sk, int win) +static inline int tcp_win_from_space(const struct sock *sk, int space) +{ + return __tcp_win_from_space(tcp_sk(sk)->scaling_ratio, space); +} + +/* inverse of __tcp_win_from_space() */ +static inline int __tcp_space_from_win(u8 scaling_ratio, int win) { u64 val = (u64)win << TCP_RMEM_TO_WIN_SCALE; - do_div(val, tcp_sk(sk)->scaling_ratio); + do_div(val, scaling_ratio); return val; } +static inline int tcp_space_from_win(const struct sock *sk, int win) +{ + return __tcp_space_from_win(tcp_sk(sk)->scaling_ratio, win); +} + static inline void tcp_scaling_ratio_init(struct sock *sk) { /* Assume a conservative default of 1200 bytes of payload per 4K page. 
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 22323b1fa184..68da26539970 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -90,6 +90,7 @@ static int __mptcp_socket_create(struct mptcp_sock *msk) if (err) return err; + msk->scaling_ratio = tcp_sk(ssock->sk)->scaling_ratio; WRITE_ONCE(msk->first, ssock->sk); WRITE_ONCE(msk->subflow, ssock); subflow = mptcp_subflow_ctx(ssock->sk); @@ -1881,6 +1882,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) { struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; + u8 scaling_ratio = U8_MAX; u32 time, advmss = 1; u64 rtt_us, mstamp; @@ -1911,9 +1913,11 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) rtt_us = max(sf_rtt_us, rtt_us); advmss = max(sf_advmss, advmss); + scaling_ratio = min(tp->scaling_ratio, scaling_ratio); } msk->rcvq_space.rtt_us = rtt_us; + msk->scaling_ratio = scaling_ratio; if (time < (rtt_us >> 3) || rtt_us == 0) return; @@ -1922,8 +1926,8 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { - int rcvmem, rcvbuf; u64 rcvwin, grow; + int rcvbuf; rcvwin = ((u64)msk->rcvq_space.copied << 1) + 16 * advmss; @@ -1932,18 +1936,13 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) do_div(grow, msk->rcvq_space.space); rcvwin += (grow << 1); - rcvmem = SKB_TRUESIZE(advmss + MAX_TCP_HEADER); - while (tcp_win_from_space(sk, rcvmem) < advmss) - rcvmem += 128; - - do_div(rcvwin, advmss); - rcvbuf = min_t(u64, rcvwin * rcvmem, + rcvbuf = min_t(u64, __tcp_space_from_win(scaling_ratio, rcvwin), READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); if (rcvbuf > sk->sk_rcvbuf) { u32 window_clamp; - window_clamp = tcp_win_from_space(sk, rcvbuf); + window_clamp = __tcp_win_from_space(scaling_ratio, rcvbuf); WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); /* Make subflows follow along. 
If we do not do this, we diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 37fbe22e2433..795f422e8597 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -321,6 +321,7 @@ struct mptcp_sock { u64 time; /* start time of measurement window */ u64 rtt_us; /* last maximum rtt of subflows */ } rcvq_space; + u8 scaling_ratio; u32 subflow_id; u32 setsockopt_seq; @@ -351,9 +352,14 @@ static inline int __mptcp_rmem(const struct sock *sk) return atomic_read(&sk->sk_rmem_alloc) - READ_ONCE(mptcp_sk(sk)->rmem_released); } +static inline int mptcp_win_from_space(const struct sock *sk, int space) +{ + return __tcp_win_from_space(mptcp_sk(sk)->scaling_ratio, space); +} + static inline int __mptcp_space(const struct sock *sk) { - return tcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - __mptcp_rmem(sk)); + return mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - __mptcp_rmem(sk)); } static inline struct mptcp_data_frag *mptcp_send_head(const struct sock *sk) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 9ee3b7abbaf6..ad7080fabb2f 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1359,7 +1359,7 @@ void mptcp_space(const struct sock *ssk, int *space, int *full_space) const struct sock *sk = subflow->conn; *space = __mptcp_space(sk); - *full_space = tcp_full_space(sk); + *full_space = mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf)); } void __mptcp_error_report(struct sock *sk) -- cgit v1.2.3 From 89edf40220be8e68922beb54a06fdfc66f743c39 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 19 Jul 2023 12:26:56 +0300 Subject: xfrm: Support UDP encapsulation in packet offload mode Since mlx5 supports UDP encapsulation in packet offload, change the XFRM core to allow users to configure it. Signed-off-by: Leon Romanovsky Acked-by: Steffen Klassert Signed-off-by: Paolo Abeni --- net/xfrm/xfrm_device.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 533697e2488f..3784534c9185 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -247,12 +247,6 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, return -EINVAL; } - /* We don't yet support UDP encapsulation and TFC padding. */ - if (x->encap || x->tfcpad) { - NL_SET_ERR_MSG(extack, "Encapsulation and TFC padding can't be offloaded"); - return -EINVAL; - } - if (xuo->flags & ~(XFRM_OFFLOAD_IPV6 | XFRM_OFFLOAD_INBOUND | XFRM_OFFLOAD_PACKET)) { NL_SET_ERR_MSG(extack, "Unrecognized flags in offload request"); @@ -260,6 +254,13 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, } is_packet_offload = xuo->flags & XFRM_OFFLOAD_PACKET; + + /* We don't yet support UDP encapsulation and TFC padding. */ + if ((!is_packet_offload && x->encap) || x->tfcpad) { + NL_SET_ERR_MSG(extack, "Encapsulation and TFC padding can't be offloaded"); + return -EINVAL; + } + dev = dev_get_by_index(net, xuo->ifindex); if (!dev) { if (!(xuo->flags & XFRM_OFFLOAD_INBOUND)) { -- cgit v1.2.3 From f0ea27e7bfe1c34e1f451a63eb68faa1d4c3a86d Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Thu, 20 Jul 2023 17:30:05 +0200 Subject: udp: re-score reuseport groups when connected sockets are present Contrary to TCP, UDP reuseport groups can contain TCP_ESTABLISHED sockets. To support these properly we remember whether a group has a connected socket and skip the fast reuseport early-return. In effect we continue scoring all reuseport sockets and then choose the one with the highest score. 
The current code fails to re-calculate the score for the result of lookup_reuseport. According to Kuniyuki Iwashima: 1) SO_INCOMING_CPU is set -> selected sk might have +1 score 2) BPF prog returns ESTABLISHED and/or SO_INCOMING_CPU sk -> selected sk will have more than 8 Using the old score could trigger more lookups depending on the order that sockets are created. sk -> sk (SO_INCOMING_CPU) -> sk (ESTABLISHED) | | `-> select the next SO_INCOMING_CPU sk | `-> select itself (We should save this lookup) Fixes: efc6b6f6c311 ("udp: Improve load balancing for SO_REUSEPORT.") Reviewed-by: Kuniyuki Iwashima Signed-off-by: Lorenz Bauer Link: https://lore.kernel.org/r/20230720-so-reuseport-v6-1-7021b683cdae@isovalent.com Signed-off-by: Martin KaFai Lau --- net/ipv4/udp.c | 20 +++++++++++++++----- net/ipv6/udp.c | 19 ++++++++++++++----- 2 files changed, 29 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 8c3ebd95f5b9..0d81339bda2f 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -451,14 +451,24 @@ static struct sock *udp4_lib_lookup2(struct net *net, score = compute_score(sk, net, saddr, sport, daddr, hnum, dif, sdif); if (score > badness) { - result = lookup_reuseport(net, sk, skb, - saddr, sport, daddr, hnum); + badness = score; + result = lookup_reuseport(net, sk, skb, saddr, sport, daddr, hnum); + if (!result) { + result = sk; + continue; + } + /* Fall back to scoring if group has connections */ - if (result && !reuseport_has_conns(sk)) + if (!reuseport_has_conns(sk)) return result; - result = result ? : sk; - badness = score; + /* Reuseport logic returned an error, keep original score. */ + if (IS_ERR(result)) + continue; + + badness = compute_score(result, net, saddr, sport, + daddr, hnum, dif, sdif); + } } return result; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 95c75d8f73d5..556d74a7d7ab 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -194,14 +194,23 @@ static struct sock *udp6_lib_lookup2(struct net *net, score = compute_score(sk, net, saddr, sport, daddr, hnum, dif, sdif); if (score > badness) { - result = lookup_reuseport(net, sk, skb, - saddr, sport, daddr, hnum); + badness = score; + result = lookup_reuseport(net, sk, skb, saddr, sport, daddr, hnum); + if (!result) { + result = sk; + continue; + } + /* Fall back to scoring if group has connections */ - if (result && !reuseport_has_conns(sk)) + if (!reuseport_has_conns(sk)) return result; - result = result ? : sk; - badness = score; + /* Reuseport logic returned an error, keep original score. */ + if (IS_ERR(result)) + continue; + + badness = compute_score(sk, net, saddr, sport, + daddr, hnum, dif, sdif); } } return result; -- cgit v1.2.3 From 67312adc96b5a585970d03b62412847afe2c6b01 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Thu, 20 Jul 2023 17:30:06 +0200 Subject: bpf: reject unhashed sockets in bpf_sk_assign The semantics for bpf_sk_assign are as follows: sk = some_lookup_func() bpf_sk_assign(skb, sk) bpf_sk_release(sk) That is, the sk is not consumed by bpf_sk_assign. The function therefore needs to make sure that sk lives long enough to be consumed from __inet_lookup_skb. 
The path through the stack for a TCPv4 packet is roughly: netif_receive_skb_core: takes RCU read lock __netif_receive_skb_core: sch_handle_ingress: tcf_classify: bpf_sk_assign() deliver_ptype_list_skb: deliver_skb: ip_packet_type->func == ip_rcv: ip_rcv_core: ip_rcv_finish_core: dst_input: ip_local_deliver: ip_local_deliver_finish: ip_protocol_deliver_rcu: tcp_v4_rcv: __inet_lookup_skb: skb_steal_sock The existing helper takes advantage of the fact that everything happens in the same RCU critical section: for sockets with SOCK_RCU_FREE set bpf_sk_assign never takes a reference. skb_steal_sock then checks SOCK_RCU_FREE again and does sock_put if necessary. This approach assumes that SOCK_RCU_FREE is never set on a sk between bpf_sk_assign and skb_steal_sock, but this invariant is violated by unhashed UDP sockets. A new UDP socket is created in TCP_CLOSE state but without SOCK_RCU_FREE set. That flag is only added in udp_lib_get_port() which happens when a socket is bound. When bpf_sk_assign was added it wasn't possible to access unhashed UDP sockets from BPF, so this wasn't a problem. This changed in commit 0c48eefae712 ("sock_map: Lift socket state restriction for datagram sockets"), but the helper wasn't adjusted accordingly. The following sequence of events will therefore lead to a refcount leak: 1. Add socket(AF_INET, SOCK_DGRAM) to a sockmap. 2. Pull socket out of sockmap and bpf_sk_assign it. Since SOCK_RCU_FREE is not set we increment the refcount. 3. bind() or connect() the socket, setting SOCK_RCU_FREE. 4. skb_steal_sock will now set refcounted = false due to SOCK_RCU_FREE. 5. tcp_v4_rcv() skips sock_put(). Fix the problem by rejecting unhashed sockets in bpf_sk_assign(). This matches the behaviour of __inet_lookup_skb which is ultimately the goal of bpf_sk_assign(). Fixes: cf7fbe660f2d ("bpf: Add socket assign support") Cc: Joe Stringer Signed-off-by: Lorenz Bauer Reviewed-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20230720-so-reuseport-v6-2-7021b683cdae@isovalent.com Signed-off-by: Martin KaFai Lau --- net/core/filter.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 797e8f039696..b5b51ef48c5f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7353,6 +7353,8 @@ BPF_CALL_3(bpf_sk_assign, struct sk_buff *, skb, struct sock *, sk, u64, flags) return -ENETUNREACH; if (unlikely(sk_fullsock(sk) && sk->sk_reuseport)) return -ESOCKTNOSUPPORT; + if (sk_unhashed(sk)) + return -EOPNOTSUPP; if (sk_is_refcounted(sk) && unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) return -ENOENT; -- cgit v1.2.3 From ce796e60b3b196b61fcc565df195443cbb846ef0 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Thu, 20 Jul 2023 17:30:07 +0200 Subject: net: export inet_lookup_reuseport and inet6_lookup_reuseport Rename the existing reuseport helpers for IPv4 and IPv6 so that they can be invoked in the follow up commit. Export them so that building DCCP and IPv6 as a module works. No change in functionality. 
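[Editor's note: a hedged sketch of what the export enables. The call below uses the helper signature as introduced in this patch (before the ehashfn parameter added later in the series); the wrapper name is made up and only illustrates that a GPL module such as dccp_ipv6 or ipv6 built as =m can now link against the exported symbol.]

#include <net/inet_hashtables.h>

/* hypothetical module-side wrapper, for illustration only */
static struct sock *example_reuseport_pick(struct net *net, struct sock *sk,
					   struct sk_buff *skb, int doff,
					   __be32 saddr, __be16 sport,
					   __be32 daddr, unsigned short hnum)
{
	/* resolves at module load time thanks to EXPORT_SYMBOL_GPL() */
	return inet_lookup_reuseport(net, sk, skb, doff,
				     saddr, sport, daddr, hnum);
}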
Reviewed-by: Kuniyuki Iwashima Signed-off-by: Lorenz Bauer Link: https://lore.kernel.org/r/20230720-so-reuseport-v6-3-7021b683cdae@isovalent.com Signed-off-by: Martin KaFai Lau --- include/net/inet6_hashtables.h | 7 +++++++ include/net/inet_hashtables.h | 5 +++++ net/ipv4/inet_hashtables.c | 15 ++++++++------- net/ipv6/inet6_hashtables.c | 19 ++++++++++--------- 4 files changed, 30 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h index 56f1286583d3..032ddab48f8f 100644 --- a/include/net/inet6_hashtables.h +++ b/include/net/inet6_hashtables.h @@ -48,6 +48,13 @@ struct sock *__inet6_lookup_established(struct net *net, const u16 hnum, const int dif, const int sdif); +struct sock *inet6_lookup_reuseport(struct net *net, struct sock *sk, + struct sk_buff *skb, int doff, + const struct in6_addr *saddr, + __be16 sport, + const struct in6_addr *daddr, + unsigned short hnum); + struct sock *inet6_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 99bd823e97f6..8734f3488f5d 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -379,6 +379,11 @@ struct sock *__inet_lookup_established(struct net *net, const __be32 daddr, const u16 hnum, const int dif, const int sdif); +struct sock *inet_lookup_reuseport(struct net *net, struct sock *sk, + struct sk_buff *skb, int doff, + __be32 saddr, __be16 sport, + __be32 daddr, unsigned short hnum); + static inline struct sock * inet_lookup_established(struct net *net, struct inet_hashinfo *hashinfo, const __be32 saddr, const __be16 sport, diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 0819d6001b9a..ecb838460629 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -332,10 +332,10 @@ static inline int compute_score(struct sock *sk, struct net *net, return score; } -static inline struct sock *lookup_reuseport(struct net *net, struct sock *sk, - struct sk_buff *skb, int doff, - __be32 saddr, __be16 sport, - __be32 daddr, unsigned short hnum) +struct sock *inet_lookup_reuseport(struct net *net, struct sock *sk, + struct sk_buff *skb, int doff, + __be32 saddr, __be16 sport, + __be32 daddr, unsigned short hnum) { struct sock *reuse_sk = NULL; u32 phash; @@ -346,6 +346,7 @@ static inline struct sock *lookup_reuseport(struct net *net, struct sock *sk, } return reuse_sk; } +EXPORT_SYMBOL_GPL(inet_lookup_reuseport); /* * Here are some nice properties to exploit here. 
The BSD API @@ -369,8 +370,8 @@ static struct sock *inet_lhash2_lookup(struct net *net, sk_nulls_for_each_rcu(sk, node, &ilb2->nulls_head) { score = compute_score(sk, net, hnum, daddr, dif, sdif); if (score > hiscore) { - result = lookup_reuseport(net, sk, skb, doff, - saddr, sport, daddr, hnum); + result = inet_lookup_reuseport(net, sk, skb, doff, + saddr, sport, daddr, hnum); if (result) return result; @@ -399,7 +400,7 @@ static inline struct sock *inet_lookup_run_bpf(struct net *net, if (no_reuseport || IS_ERR_OR_NULL(sk)) return sk; - reuse_sk = lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum); + reuse_sk = inet_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum); if (reuse_sk) sk = reuse_sk; return sk; diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index b64b49012655..b7c56867314e 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -111,12 +111,12 @@ static inline int compute_score(struct sock *sk, struct net *net, return score; } -static inline struct sock *lookup_reuseport(struct net *net, struct sock *sk, - struct sk_buff *skb, int doff, - const struct in6_addr *saddr, - __be16 sport, - const struct in6_addr *daddr, - unsigned short hnum) +struct sock *inet6_lookup_reuseport(struct net *net, struct sock *sk, + struct sk_buff *skb, int doff, + const struct in6_addr *saddr, + __be16 sport, + const struct in6_addr *daddr, + unsigned short hnum) { struct sock *reuse_sk = NULL; u32 phash; @@ -127,6 +127,7 @@ static inline struct sock *lookup_reuseport(struct net *net, struct sock *sk, } return reuse_sk; } +EXPORT_SYMBOL_GPL(inet6_lookup_reuseport); /* called with rcu_read_lock() */ static struct sock *inet6_lhash2_lookup(struct net *net, @@ -143,8 +144,8 @@ static struct sock *inet6_lhash2_lookup(struct net *net, sk_nulls_for_each_rcu(sk, node, &ilb2->nulls_head) { score = compute_score(sk, net, hnum, daddr, dif, sdif); if (score > hiscore) { - result = lookup_reuseport(net, sk, skb, doff, - saddr, sport, daddr, hnum); + result = inet6_lookup_reuseport(net, sk, skb, doff, + saddr, sport, daddr, hnum); if (result) return result; @@ -175,7 +176,7 @@ static inline struct sock *inet6_lookup_run_bpf(struct net *net, if (no_reuseport || IS_ERR_OR_NULL(sk)) return sk; - reuse_sk = lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum); + reuse_sk = inet6_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum); if (reuse_sk) sk = reuse_sk; return sk; -- cgit v1.2.3 From 0f495f7617229772403e683033abc473f0f0553c Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Thu, 20 Jul 2023 17:30:08 +0200 Subject: net: remove duplicate reuseport_lookup functions There are currently four copies of reuseport_lookup: one each for (TCP, UDP)x(IPv4, IPv6). This forces us to duplicate all callers of those functions as well. This is already the case for sk_lookup helpers (inet,inet6,udp4,udp6)_lookup_run_bpf. There are two differences between the reuseport_lookup helpers: 1. They call different hash functions depending on protocol 2. UDP reuseport_lookup checks that sk_state != TCP_ESTABLISHED Move the check for sk_state into the caller and use the INDIRECT_CALL infrastructure to cut down the helpers to one per IP version. 
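[Editor's note: userspace illustration of the INDIRECT_CALL_2() idea this series relies on. The macro bodies below are a simplified restatement of include/linux/indirect_call_wrapper.h (without the likely() hints) and the hash functions are made up; the point is that the expected ehashfn targets are compared and called directly, so the hot paths avoid a retpoline-protected indirect call.]

#include <stdio.h>

#define INDIRECT_CALL_1(f, f1, ...) \
	((f) == (f1) ? f1(__VA_ARGS__) : (f)(__VA_ARGS__))
#define INDIRECT_CALL_2(f, f2, f1, ...) \
	((f) == (f2) ? f2(__VA_ARGS__) : INDIRECT_CALL_1(f, f1, __VA_ARGS__))

static unsigned int hash_a(unsigned int x) { return x * 2654435761u; }
static unsigned int hash_b(unsigned int x) { return x ^ 0x9e3779b9u; }

static unsigned int pick(unsigned int (*ehashfn)(unsigned int), unsigned int x)
{
	/* known targets are tested and called directly, not indirectly */
	return INDIRECT_CALL_2(ehashfn, hash_a, hash_b, x);
}

int main(void)
{
	printf("%u %u\n", pick(hash_a, 1), pick(hash_b, 1));
	return 0;
}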
Reviewed-by: Kuniyuki Iwashima Signed-off-by: Lorenz Bauer Link: https://lore.kernel.org/r/20230720-so-reuseport-v6-4-7021b683cdae@isovalent.com Signed-off-by: Martin KaFai Lau --- include/net/inet6_hashtables.h | 11 ++++++++++- include/net/inet_hashtables.h | 15 ++++++++++----- net/ipv4/inet_hashtables.c | 20 +++++++++++++------- net/ipv4/udp.c | 34 +++++++++++++--------------------- net/ipv6/inet6_hashtables.c | 14 ++++++++++---- net/ipv6/udp.c | 41 ++++++++++++++++------------------------- 6 files changed, 72 insertions(+), 63 deletions(-) (limited to 'net') diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h index 032ddab48f8f..f89320b6fee3 100644 --- a/include/net/inet6_hashtables.h +++ b/include/net/inet6_hashtables.h @@ -48,12 +48,21 @@ struct sock *__inet6_lookup_established(struct net *net, const u16 hnum, const int dif, const int sdif); +typedef u32 (inet6_ehashfn_t)(const struct net *net, + const struct in6_addr *laddr, const u16 lport, + const struct in6_addr *faddr, const __be16 fport); + +inet6_ehashfn_t inet6_ehashfn; + +INDIRECT_CALLABLE_DECLARE(inet6_ehashfn_t udp6_ehashfn); + struct sock *inet6_lookup_reuseport(struct net *net, struct sock *sk, struct sk_buff *skb, int doff, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, - unsigned short hnum); + unsigned short hnum, + inet6_ehashfn_t *ehashfn); struct sock *inet6_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 8734f3488f5d..ddfa2e67fdb5 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -379,10 +379,19 @@ struct sock *__inet_lookup_established(struct net *net, const __be32 daddr, const u16 hnum, const int dif, const int sdif); +typedef u32 (inet_ehashfn_t)(const struct net *net, + const __be32 laddr, const __u16 lport, + const __be32 faddr, const __be16 fport); + +inet_ehashfn_t inet_ehashfn; + +INDIRECT_CALLABLE_DECLARE(inet_ehashfn_t udp_ehashfn); + struct sock *inet_lookup_reuseport(struct net *net, struct sock *sk, struct sk_buff *skb, int doff, __be32 saddr, __be16 sport, - __be32 daddr, unsigned short hnum); + __be32 daddr, unsigned short hnum, + inet_ehashfn_t *ehashfn); static inline struct sock * inet_lookup_established(struct net *net, struct inet_hashinfo *hashinfo, @@ -453,10 +462,6 @@ static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo, refcounted); } -u32 inet6_ehashfn(const struct net *net, - const struct in6_addr *laddr, const u16 lport, - const struct in6_addr *faddr, const __be16 fport); - static inline void sk_daddr_set(struct sock *sk, __be32 addr) { sk->sk_daddr = addr; /* alias of inet_daddr */ diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index ecb838460629..0a7a726464d9 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -28,9 +28,9 @@ #include #include -static u32 inet_ehashfn(const struct net *net, const __be32 laddr, - const __u16 lport, const __be32 faddr, - const __be16 fport) +u32 inet_ehashfn(const struct net *net, const __be32 laddr, + const __u16 lport, const __be32 faddr, + const __be16 fport) { static u32 inet_ehash_secret __read_mostly; @@ -39,6 +39,7 @@ static u32 inet_ehashfn(const struct net *net, const __be32 laddr, return __inet_ehashfn(laddr, lport, faddr, fport, inet_ehash_secret + net_hash_mix(net)); } +EXPORT_SYMBOL_GPL(inet_ehashfn); /* This function handles inet_sock, but also timewait and request sockets * for IPv4/IPv6. 
@@ -332,16 +333,20 @@ static inline int compute_score(struct sock *sk, struct net *net, return score; } +INDIRECT_CALLABLE_DECLARE(inet_ehashfn_t udp_ehashfn); + struct sock *inet_lookup_reuseport(struct net *net, struct sock *sk, struct sk_buff *skb, int doff, __be32 saddr, __be16 sport, - __be32 daddr, unsigned short hnum) + __be32 daddr, unsigned short hnum, + inet_ehashfn_t *ehashfn) { struct sock *reuse_sk = NULL; u32 phash; if (sk->sk_reuseport) { - phash = inet_ehashfn(net, daddr, hnum, saddr, sport); + phash = INDIRECT_CALL_2(ehashfn, udp_ehashfn, inet_ehashfn, + net, daddr, hnum, saddr, sport); reuse_sk = reuseport_select_sock(sk, phash, skb, doff); } return reuse_sk; @@ -371,7 +376,7 @@ static struct sock *inet_lhash2_lookup(struct net *net, score = compute_score(sk, net, hnum, daddr, dif, sdif); if (score > hiscore) { result = inet_lookup_reuseport(net, sk, skb, doff, - saddr, sport, daddr, hnum); + saddr, sport, daddr, hnum, inet_ehashfn); if (result) return result; @@ -400,7 +405,8 @@ static inline struct sock *inet_lookup_run_bpf(struct net *net, if (no_reuseport || IS_ERR_OR_NULL(sk)) return sk; - reuse_sk = inet_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum); + reuse_sk = inet_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum, + inet_ehashfn); if (reuse_sk) sk = reuse_sk; return sk; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 0d81339bda2f..3abe8a1520da 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -406,9 +406,9 @@ static int compute_score(struct sock *sk, struct net *net, return score; } -static u32 udp_ehashfn(const struct net *net, const __be32 laddr, - const __u16 lport, const __be32 faddr, - const __be16 fport) +INDIRECT_CALLABLE_SCOPE +u32 udp_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport, + const __be32 faddr, const __be16 fport) { static u32 udp_ehash_secret __read_mostly; @@ -418,22 +418,6 @@ static u32 udp_ehashfn(const struct net *net, const __be32 laddr, udp_ehash_secret + net_hash_mix(net)); } -static struct sock *lookup_reuseport(struct net *net, struct sock *sk, - struct sk_buff *skb, - __be32 saddr, __be16 sport, - __be32 daddr, unsigned short hnum) -{ - struct sock *reuse_sk = NULL; - u32 hash; - - if (sk->sk_reuseport && sk->sk_state != TCP_ESTABLISHED) { - hash = udp_ehashfn(net, daddr, hnum, saddr, sport); - reuse_sk = reuseport_select_sock(sk, hash, skb, - sizeof(struct udphdr)); - } - return reuse_sk; -} - /* called with rcu_read_lock() */ static struct sock *udp4_lib_lookup2(struct net *net, __be32 saddr, __be16 sport, @@ -452,7 +436,14 @@ static struct sock *udp4_lib_lookup2(struct net *net, daddr, hnum, dif, sdif); if (score > badness) { badness = score; - result = lookup_reuseport(net, sk, skb, saddr, sport, daddr, hnum); + + if (sk->sk_state == TCP_ESTABLISHED) { + result = sk; + continue; + } + + result = inet_lookup_reuseport(net, sk, skb, sizeof(struct udphdr), + saddr, sport, daddr, hnum, udp_ehashfn); if (!result) { result = sk; continue; @@ -491,7 +482,8 @@ static struct sock *udp4_lookup_run_bpf(struct net *net, if (no_reuseport || IS_ERR_OR_NULL(sk)) return sk; - reuse_sk = lookup_reuseport(net, sk, skb, saddr, sport, daddr, hnum); + reuse_sk = inet_lookup_reuseport(net, sk, skb, sizeof(struct udphdr), + saddr, sport, daddr, hnum, udp_ehashfn); if (reuse_sk) sk = reuse_sk; return sk; diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index b7c56867314e..3616225c89ef 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -39,6 +39,7 
@@ u32 inet6_ehashfn(const struct net *net, return __inet6_ehashfn(lhash, lport, fhash, fport, inet6_ehash_secret + net_hash_mix(net)); } +EXPORT_SYMBOL_GPL(inet6_ehashfn); /* * Sockets in TCP_CLOSE state are _always_ taken out of the hash, so @@ -111,18 +112,22 @@ static inline int compute_score(struct sock *sk, struct net *net, return score; } +INDIRECT_CALLABLE_DECLARE(inet6_ehashfn_t udp6_ehashfn); + struct sock *inet6_lookup_reuseport(struct net *net, struct sock *sk, struct sk_buff *skb, int doff, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, - unsigned short hnum) + unsigned short hnum, + inet6_ehashfn_t *ehashfn) { struct sock *reuse_sk = NULL; u32 phash; if (sk->sk_reuseport) { - phash = inet6_ehashfn(net, daddr, hnum, saddr, sport); + phash = INDIRECT_CALL_INET(ehashfn, udp6_ehashfn, inet6_ehashfn, + net, daddr, hnum, saddr, sport); reuse_sk = reuseport_select_sock(sk, phash, skb, doff); } return reuse_sk; @@ -145,7 +150,7 @@ static struct sock *inet6_lhash2_lookup(struct net *net, score = compute_score(sk, net, hnum, daddr, dif, sdif); if (score > hiscore) { result = inet6_lookup_reuseport(net, sk, skb, doff, - saddr, sport, daddr, hnum); + saddr, sport, daddr, hnum, inet6_ehashfn); if (result) return result; @@ -176,7 +181,8 @@ static inline struct sock *inet6_lookup_run_bpf(struct net *net, if (no_reuseport || IS_ERR_OR_NULL(sk)) return sk; - reuse_sk = inet6_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum); + reuse_sk = inet6_lookup_reuseport(net, sk, skb, doff, + saddr, sport, daddr, hnum, inet6_ehashfn); if (reuse_sk) sk = reuse_sk; return sk; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 556d74a7d7ab..918ca1771710 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -71,11 +71,12 @@ int udpv6_init_sock(struct sock *sk) return 0; } -static u32 udp6_ehashfn(const struct net *net, - const struct in6_addr *laddr, - const u16 lport, - const struct in6_addr *faddr, - const __be16 fport) +INDIRECT_CALLABLE_SCOPE +u32 udp6_ehashfn(const struct net *net, + const struct in6_addr *laddr, + const u16 lport, + const struct in6_addr *faddr, + const __be16 fport) { static u32 udp6_ehash_secret __read_mostly; static u32 udp_ipv6_hash_secret __read_mostly; @@ -160,24 +161,6 @@ static int compute_score(struct sock *sk, struct net *net, return score; } -static struct sock *lookup_reuseport(struct net *net, struct sock *sk, - struct sk_buff *skb, - const struct in6_addr *saddr, - __be16 sport, - const struct in6_addr *daddr, - unsigned int hnum) -{ - struct sock *reuse_sk = NULL; - u32 hash; - - if (sk->sk_reuseport && sk->sk_state != TCP_ESTABLISHED) { - hash = udp6_ehashfn(net, daddr, hnum, saddr, sport); - reuse_sk = reuseport_select_sock(sk, hash, skb, - sizeof(struct udphdr)); - } - return reuse_sk; -} - /* called with rcu_read_lock() */ static struct sock *udp6_lib_lookup2(struct net *net, const struct in6_addr *saddr, __be16 sport, @@ -195,7 +178,14 @@ static struct sock *udp6_lib_lookup2(struct net *net, daddr, hnum, dif, sdif); if (score > badness) { badness = score; - result = lookup_reuseport(net, sk, skb, saddr, sport, daddr, hnum); + + if (sk->sk_state == TCP_ESTABLISHED) { + result = sk; + continue; + } + + result = inet6_lookup_reuseport(net, sk, skb, sizeof(struct udphdr), + saddr, sport, daddr, hnum, udp6_ehashfn); if (!result) { result = sk; continue; @@ -235,7 +225,8 @@ static inline struct sock *udp6_lookup_run_bpf(struct net *net, if (no_reuseport || IS_ERR_OR_NULL(sk)) return sk; - reuse_sk = lookup_reuseport(net, sk, skb, 
saddr, sport, daddr, hnum); + reuse_sk = inet6_lookup_reuseport(net, sk, skb, sizeof(struct udphdr), + saddr, sport, daddr, hnum, udp6_ehashfn); if (reuse_sk) sk = reuse_sk; return sk; -- cgit v1.2.3 From 2a61776366bd17db9ac8c93ec32b8e0ae6b11cf2 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Thu, 20 Jul 2023 17:30:09 +0200 Subject: net: document inet[6]_lookup_reuseport sk_state requirements The current implementation was extracted from inet[6]_lhash2_lookup in commit 80b373f74f9e ("inet: Extract helper for selecting socket from reuseport group") and commit 5df6531292b5 ("inet6: Extract helper for selecting socket from reuseport group"). In the original context, sk is always in TCP_LISTEN state and so did not have a separate check. Add documentation that specifies which sk_state are valid to pass to the function. Reviewed-by: Kuniyuki Iwashima Signed-off-by: Lorenz Bauer Link: https://lore.kernel.org/r/20230720-so-reuseport-v6-5-7021b683cdae@isovalent.com Signed-off-by: Martin KaFai Lau --- net/ipv4/inet_hashtables.c | 15 +++++++++++++++ net/ipv6/inet6_hashtables.c | 15 +++++++++++++++ 2 files changed, 30 insertions(+) (limited to 'net') diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 0a7a726464d9..e9cec4807694 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -335,6 +335,21 @@ static inline int compute_score(struct sock *sk, struct net *net, INDIRECT_CALLABLE_DECLARE(inet_ehashfn_t udp_ehashfn); +/** + * inet_lookup_reuseport() - execute reuseport logic on AF_INET socket if necessary. + * @net: network namespace. + * @sk: AF_INET socket, must be in TCP_LISTEN state for TCP or TCP_CLOSE for UDP. + * @skb: context for a potential SK_REUSEPORT program. + * @doff: header offset. + * @saddr: source address. + * @sport: source port. + * @daddr: destination address. + * @hnum: destination port in host byte order. + * @ehashfn: hash function used to generate the fallback hash. + * + * Return: NULL if sk doesn't have SO_REUSEPORT set, otherwise a pointer to + * the selected sock or an error. + */ struct sock *inet_lookup_reuseport(struct net *net, struct sock *sk, struct sk_buff *skb, int doff, __be32 saddr, __be16 sport, diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 3616225c89ef..f76dbbb29332 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -114,6 +114,21 @@ static inline int compute_score(struct sock *sk, struct net *net, INDIRECT_CALLABLE_DECLARE(inet6_ehashfn_t udp6_ehashfn); +/** + * inet6_lookup_reuseport() - execute reuseport logic on AF_INET6 socket if necessary. + * @net: network namespace. + * @sk: AF_INET6 socket, must be in TCP_LISTEN state for TCP or TCP_CLOSE for UDP. + * @skb: context for a potential SK_REUSEPORT program. + * @doff: header offset. + * @saddr: source address. + * @sport: source port. + * @daddr: destination address. + * @hnum: destination port in host byte order. + * @ehashfn: hash function used to generate the fallback hash. + * + * Return: NULL if sk doesn't have SO_REUSEPORT set, otherwise a pointer to + * the selected sock or an error. 
+ */ struct sock *inet6_lookup_reuseport(struct net *net, struct sock *sk, struct sk_buff *skb, int doff, const struct in6_addr *saddr, -- cgit v1.2.3 From 6c886db2e78ce1dee163d07240467770a235f33e Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Thu, 20 Jul 2023 17:30:10 +0200 Subject: net: remove duplicate sk_lookup helpers Now that inet[6]_lookup_reuseport are parameterised on the ehashfn we can remove two sk_lookup helpers. Reviewed-by: Kuniyuki Iwashima Signed-off-by: Lorenz Bauer Link: https://lore.kernel.org/r/20230720-so-reuseport-v6-6-7021b683cdae@isovalent.com Signed-off-by: Martin KaFai Lau --- include/net/inet6_hashtables.h | 9 +++++++++ include/net/inet_hashtables.h | 7 +++++++ net/ipv4/inet_hashtables.c | 26 +++++++++++++------------- net/ipv4/udp.c | 32 +++++--------------------------- net/ipv6/inet6_hashtables.c | 31 ++++++++++++++++--------------- net/ipv6/udp.c | 34 +++++----------------------------- 6 files changed, 55 insertions(+), 84 deletions(-) (limited to 'net') diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h index f89320b6fee3..a6722d6ef80f 100644 --- a/include/net/inet6_hashtables.h +++ b/include/net/inet6_hashtables.h @@ -73,6 +73,15 @@ struct sock *inet6_lookup_listener(struct net *net, const unsigned short hnum, const int dif, const int sdif); +struct sock *inet6_lookup_run_sk_lookup(struct net *net, + int protocol, + struct sk_buff *skb, int doff, + const struct in6_addr *saddr, + const __be16 sport, + const struct in6_addr *daddr, + const u16 hnum, const int dif, + inet6_ehashfn_t *ehashfn); + static inline struct sock *__inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index ddfa2e67fdb5..c0532cc7587f 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -393,6 +393,13 @@ struct sock *inet_lookup_reuseport(struct net *net, struct sock *sk, __be32 daddr, unsigned short hnum, inet_ehashfn_t *ehashfn); +struct sock *inet_lookup_run_sk_lookup(struct net *net, + int protocol, + struct sk_buff *skb, int doff, + __be32 saddr, __be16 sport, + __be32 daddr, u16 hnum, const int dif, + inet_ehashfn_t *ehashfn); + static inline struct sock * inet_lookup_established(struct net *net, struct inet_hashinfo *hashinfo, const __be32 saddr, const __be16 sport, diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index e9cec4807694..6a872b8fb0d3 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -403,25 +403,23 @@ static struct sock *inet_lhash2_lookup(struct net *net, return result; } -static inline struct sock *inet_lookup_run_bpf(struct net *net, - struct inet_hashinfo *hashinfo, - struct sk_buff *skb, int doff, - __be32 saddr, __be16 sport, - __be32 daddr, u16 hnum, const int dif) +struct sock *inet_lookup_run_sk_lookup(struct net *net, + int protocol, + struct sk_buff *skb, int doff, + __be32 saddr, __be16 sport, + __be32 daddr, u16 hnum, const int dif, + inet_ehashfn_t *ehashfn) { struct sock *sk, *reuse_sk; bool no_reuseport; - if (hashinfo != net->ipv4.tcp_death_row.hashinfo) - return NULL; /* only TCP is supported */ - - no_reuseport = bpf_sk_lookup_run_v4(net, IPPROTO_TCP, saddr, sport, + no_reuseport = bpf_sk_lookup_run_v4(net, protocol, saddr, sport, daddr, hnum, dif, &sk); if (no_reuseport || IS_ERR_OR_NULL(sk)) return sk; reuse_sk = inet_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum, - inet_ehashfn); + ehashfn); if (reuse_sk) sk = 
reuse_sk; return sk; @@ -439,9 +437,11 @@ struct sock *__inet_lookup_listener(struct net *net, unsigned int hash2; /* Lookup redirect from BPF */ - if (static_branch_unlikely(&bpf_sk_lookup_enabled)) { - result = inet_lookup_run_bpf(net, hashinfo, skb, doff, - saddr, sport, daddr, hnum, dif); + if (static_branch_unlikely(&bpf_sk_lookup_enabled) && + hashinfo == net->ipv4.tcp_death_row.hashinfo) { + result = inet_lookup_run_sk_lookup(net, IPPROTO_TCP, skb, doff, + saddr, sport, daddr, hnum, dif, + inet_ehashfn); if (result) goto done; } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 3abe8a1520da..456fb6ee6ee8 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -465,30 +465,6 @@ static struct sock *udp4_lib_lookup2(struct net *net, return result; } -static struct sock *udp4_lookup_run_bpf(struct net *net, - struct udp_table *udptable, - struct sk_buff *skb, - __be32 saddr, __be16 sport, - __be32 daddr, u16 hnum, const int dif) -{ - struct sock *sk, *reuse_sk; - bool no_reuseport; - - if (udptable != net->ipv4.udp_table) - return NULL; /* only UDP is supported */ - - no_reuseport = bpf_sk_lookup_run_v4(net, IPPROTO_UDP, saddr, sport, - daddr, hnum, dif, &sk); - if (no_reuseport || IS_ERR_OR_NULL(sk)) - return sk; - - reuse_sk = inet_lookup_reuseport(net, sk, skb, sizeof(struct udphdr), - saddr, sport, daddr, hnum, udp_ehashfn); - if (reuse_sk) - sk = reuse_sk; - return sk; -} - /* UDP is nearly always wildcards out the wazoo, it makes no sense to try * harder than this. -DaveM */ @@ -513,9 +489,11 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, goto done; /* Lookup redirect from BPF */ - if (static_branch_unlikely(&bpf_sk_lookup_enabled)) { - sk = udp4_lookup_run_bpf(net, udptable, skb, - saddr, sport, daddr, hnum, dif); + if (static_branch_unlikely(&bpf_sk_lookup_enabled) && + udptable == net->ipv4.udp_table) { + sk = inet_lookup_run_sk_lookup(net, IPPROTO_UDP, skb, sizeof(struct udphdr), + saddr, sport, daddr, hnum, dif, + udp_ehashfn); if (sk) { result = sk; goto done; diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index f76dbbb29332..7c9700c7c9c8 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -177,31 +177,30 @@ static struct sock *inet6_lhash2_lookup(struct net *net, return result; } -static inline struct sock *inet6_lookup_run_bpf(struct net *net, - struct inet_hashinfo *hashinfo, - struct sk_buff *skb, int doff, - const struct in6_addr *saddr, - const __be16 sport, - const struct in6_addr *daddr, - const u16 hnum, const int dif) +struct sock *inet6_lookup_run_sk_lookup(struct net *net, + int protocol, + struct sk_buff *skb, int doff, + const struct in6_addr *saddr, + const __be16 sport, + const struct in6_addr *daddr, + const u16 hnum, const int dif, + inet6_ehashfn_t *ehashfn) { struct sock *sk, *reuse_sk; bool no_reuseport; - if (hashinfo != net->ipv4.tcp_death_row.hashinfo) - return NULL; /* only TCP is supported */ - - no_reuseport = bpf_sk_lookup_run_v6(net, IPPROTO_TCP, saddr, sport, + no_reuseport = bpf_sk_lookup_run_v6(net, protocol, saddr, sport, daddr, hnum, dif, &sk); if (no_reuseport || IS_ERR_OR_NULL(sk)) return sk; reuse_sk = inet6_lookup_reuseport(net, sk, skb, doff, - saddr, sport, daddr, hnum, inet6_ehashfn); + saddr, sport, daddr, hnum, ehashfn); if (reuse_sk) sk = reuse_sk; return sk; } +EXPORT_SYMBOL_GPL(inet6_lookup_run_sk_lookup); struct sock *inet6_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, @@ -215,9 +214,11 @@ struct sock *inet6_lookup_listener(struct net *net, 
unsigned int hash2; /* Lookup redirect from BPF */ - if (static_branch_unlikely(&bpf_sk_lookup_enabled)) { - result = inet6_lookup_run_bpf(net, hashinfo, skb, doff, - saddr, sport, daddr, hnum, dif); + if (static_branch_unlikely(&bpf_sk_lookup_enabled) && + hashinfo == net->ipv4.tcp_death_row.hashinfo) { + result = inet6_lookup_run_sk_lookup(net, IPPROTO_TCP, skb, doff, + saddr, sport, daddr, hnum, dif, + inet6_ehashfn); if (result) goto done; } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 918ca1771710..4b8974a8c7a3 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -206,32 +206,6 @@ static struct sock *udp6_lib_lookup2(struct net *net, return result; } -static inline struct sock *udp6_lookup_run_bpf(struct net *net, - struct udp_table *udptable, - struct sk_buff *skb, - const struct in6_addr *saddr, - __be16 sport, - const struct in6_addr *daddr, - u16 hnum, const int dif) -{ - struct sock *sk, *reuse_sk; - bool no_reuseport; - - if (udptable != net->ipv4.udp_table) - return NULL; /* only UDP is supported */ - - no_reuseport = bpf_sk_lookup_run_v6(net, IPPROTO_UDP, saddr, sport, - daddr, hnum, dif, &sk); - if (no_reuseport || IS_ERR_OR_NULL(sk)) - return sk; - - reuse_sk = inet6_lookup_reuseport(net, sk, skb, sizeof(struct udphdr), - saddr, sport, daddr, hnum, udp6_ehashfn); - if (reuse_sk) - sk = reuse_sk; - return sk; -} - /* rcu_read_lock() must be held */ struct sock *__udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport, @@ -256,9 +230,11 @@ struct sock *__udp6_lib_lookup(struct net *net, goto done; /* Lookup redirect from BPF */ - if (static_branch_unlikely(&bpf_sk_lookup_enabled)) { - sk = udp6_lookup_run_bpf(net, udptable, skb, - saddr, sport, daddr, hnum, dif); + if (static_branch_unlikely(&bpf_sk_lookup_enabled) && + udptable == net->ipv4.udp_table) { + sk = inet6_lookup_run_sk_lookup(net, IPPROTO_UDP, skb, sizeof(struct udphdr), + saddr, sport, daddr, hnum, dif, + udp6_ehashfn); if (sk) { result = sk; goto done; -- cgit v1.2.3 From 9c02bec95954252c3c01bfbb3f7560e0b95ca955 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Thu, 20 Jul 2023 17:30:11 +0200 Subject: bpf, net: Support SO_REUSEPORT sockets with bpf_sk_assign Currently the bpf_sk_assign helper in tc BPF context refuses SO_REUSEPORT sockets. This means we can't use the helper to steer traffic to Envoy, which configures SO_REUSEPORT on its sockets. In turn, we're blocked from removing TPROXY from our setup. The reason that bpf_sk_assign refuses such sockets is that the bpf_sk_lookup helpers don't execute SK_REUSEPORT programs. Instead, one of the reuseport sockets is selected by hash. This could cause dispatch to the "wrong" socket: sk = bpf_sk_lookup_tcp(...) // select SO_REUSEPORT by hash bpf_sk_assign(skb, sk) // SK_REUSEPORT wasn't executed Fixing this isn't as simple as invoking SK_REUSEPORT from the lookup helpers unfortunately. In the tc context, L2 headers are at the start of the skb, while SK_REUSEPORT expects L3 headers instead. Instead, we execute the SK_REUSEPORT program when the assigned socket is pulled out of the skb, further up the stack. This creates some trickiness with regards to refcounting as bpf_sk_assign will put both refcounted and RCU freed sockets in skb->sk. reuseport sockets are RCU freed. We can infer that the sk_assigned socket is RCU freed if the reuseport lookup succeeds, but convincing yourself of this fact isn't straight forward. Therefore we defensively check refcounting on the sk_assign sock even though it's probably not required in practice. 
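[Editor's note: a minimal tc ingress program showing the lookup/assign/release pattern that this series now allows for SO_REUSEPORT sockets. The destination address/port and section name are illustrative, error handling is reduced to the bare minimum, and this is a sketch of the calling convention, not anyone's production steering program.]

#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

SEC("tc")
int steer_ingress(struct __sk_buff *skb)
{
	struct bpf_sock_tuple tuple = {
		.ipv4.daddr = bpf_htonl(0x7f000001),	/* 127.0.0.1, made up */
		.ipv4.dport = bpf_htons(8080),		/* made-up listener port */
	};
	struct bpf_sock *sk;

	sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4),
			       BPF_F_CURRENT_NETNS, 0);
	if (!sk)
		return TC_ACT_OK;

	/* bpf_sk_assign() does not consume the reference; with this series
	 * an SK_REUSEPORT program attached to sk's group still runs later,
	 * when skb_steal_sock()/inet_steal_sock() pull the socket back out.
	 */
	bpf_sk_assign(skb, sk, 0);
	bpf_sk_release(sk);
	return TC_ACT_OK;
}

char _license[] SEC("license") = "GPL";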
Fixes: 8e368dc72e86 ("bpf: Fix use of sk->sk_reuseport from sk_assign") Fixes: cf7fbe660f2d ("bpf: Add socket assign support") Co-developed-by: Daniel Borkmann Signed-off-by: Daniel Borkmann Cc: Joe Stringer Link: https://lore.kernel.org/bpf/CACAyw98+qycmpQzKupquhkxbvWK4OFyDuuLMBNROnfWMZxUWeA@mail.gmail.com/ Reviewed-by: Kuniyuki Iwashima Signed-off-by: Lorenz Bauer Link: https://lore.kernel.org/r/20230720-so-reuseport-v6-7-7021b683cdae@isovalent.com Signed-off-by: Martin KaFai Lau --- include/net/inet6_hashtables.h | 56 ++++++++++++++++++++++++++++++++++++++---- include/net/inet_hashtables.h | 49 ++++++++++++++++++++++++++++++++++-- include/net/sock.h | 7 ++++-- include/uapi/linux/bpf.h | 3 --- net/core/filter.c | 2 -- net/ipv4/udp.c | 8 ++++-- net/ipv6/udp.c | 8 ++++-- tools/include/uapi/linux/bpf.h | 3 --- 8 files changed, 115 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h index a6722d6ef80f..284b5ce7205d 100644 --- a/include/net/inet6_hashtables.h +++ b/include/net/inet6_hashtables.h @@ -103,6 +103,46 @@ static inline struct sock *__inet6_lookup(struct net *net, daddr, hnum, dif, sdif); } +static inline +struct sock *inet6_steal_sock(struct net *net, struct sk_buff *skb, int doff, + const struct in6_addr *saddr, const __be16 sport, + const struct in6_addr *daddr, const __be16 dport, + bool *refcounted, inet6_ehashfn_t *ehashfn) +{ + struct sock *sk, *reuse_sk; + bool prefetched; + + sk = skb_steal_sock(skb, refcounted, &prefetched); + if (!sk) + return NULL; + + if (!prefetched) + return sk; + + if (sk->sk_protocol == IPPROTO_TCP) { + if (sk->sk_state != TCP_LISTEN) + return sk; + } else if (sk->sk_protocol == IPPROTO_UDP) { + if (sk->sk_state != TCP_CLOSE) + return sk; + } else { + return sk; + } + + reuse_sk = inet6_lookup_reuseport(net, sk, skb, doff, + saddr, sport, daddr, ntohs(dport), + ehashfn); + if (!reuse_sk) + return sk; + + /* We've chosen a new reuseport sock which is never refcounted. This + * implies that sk also isn't refcounted. 
+ */ + WARN_ON_ONCE(*refcounted); + + return reuse_sk; +} + static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const __be16 sport, @@ -110,14 +150,20 @@ static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo, int iif, int sdif, bool *refcounted) { - struct sock *sk = skb_steal_sock(skb, refcounted); - + struct net *net = dev_net(skb_dst(skb)->dev); + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + struct sock *sk; + + sk = inet6_steal_sock(net, skb, doff, &ip6h->saddr, sport, &ip6h->daddr, dport, + refcounted, inet6_ehashfn); + if (IS_ERR(sk)) + return NULL; if (sk) return sk; - return __inet6_lookup(dev_net(skb_dst(skb)->dev), hashinfo, skb, - doff, &ipv6_hdr(skb)->saddr, sport, - &ipv6_hdr(skb)->daddr, ntohs(dport), + return __inet6_lookup(net, hashinfo, skb, + doff, &ip6h->saddr, sport, + &ip6h->daddr, ntohs(dport), iif, sdif, refcounted); } diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index c0532cc7587f..1177effabed3 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -449,6 +449,46 @@ static inline struct sock *inet_lookup(struct net *net, return sk; } +static inline +struct sock *inet_steal_sock(struct net *net, struct sk_buff *skb, int doff, + const __be32 saddr, const __be16 sport, + const __be32 daddr, const __be16 dport, + bool *refcounted, inet_ehashfn_t *ehashfn) +{ + struct sock *sk, *reuse_sk; + bool prefetched; + + sk = skb_steal_sock(skb, refcounted, &prefetched); + if (!sk) + return NULL; + + if (!prefetched) + return sk; + + if (sk->sk_protocol == IPPROTO_TCP) { + if (sk->sk_state != TCP_LISTEN) + return sk; + } else if (sk->sk_protocol == IPPROTO_UDP) { + if (sk->sk_state != TCP_CLOSE) + return sk; + } else { + return sk; + } + + reuse_sk = inet_lookup_reuseport(net, sk, skb, doff, + saddr, sport, daddr, ntohs(dport), + ehashfn); + if (!reuse_sk) + return sk; + + /* We've chosen a new reuseport sock which is never refcounted. This + * implies that sk also isn't refcounted. 
+ */ + WARN_ON_ONCE(*refcounted); + + return reuse_sk; +} + static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, @@ -457,13 +497,18 @@ static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo, const int sdif, bool *refcounted) { - struct sock *sk = skb_steal_sock(skb, refcounted); + struct net *net = dev_net(skb_dst(skb)->dev); const struct iphdr *iph = ip_hdr(skb); + struct sock *sk; + sk = inet_steal_sock(net, skb, doff, iph->saddr, sport, iph->daddr, dport, + refcounted, inet_ehashfn); + if (IS_ERR(sk)) + return NULL; if (sk) return sk; - return __inet_lookup(dev_net(skb_dst(skb)->dev), hashinfo, skb, + return __inet_lookup(net, hashinfo, skb, doff, iph->saddr, sport, iph->daddr, dport, inet_iif(skb), sdif, refcounted); diff --git a/include/net/sock.h b/include/net/sock.h index 7ae44bf866af..74cbfb15d289 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2815,20 +2815,23 @@ sk_is_refcounted(struct sock *sk) * skb_steal_sock - steal a socket from an sk_buff * @skb: sk_buff to steal the socket from * @refcounted: is set to true if the socket is reference-counted + * @prefetched: is set to true if the socket was assigned from bpf */ static inline struct sock * -skb_steal_sock(struct sk_buff *skb, bool *refcounted) +skb_steal_sock(struct sk_buff *skb, bool *refcounted, bool *prefetched) { if (skb->sk) { struct sock *sk = skb->sk; *refcounted = true; - if (skb_sk_is_prefetched(skb)) + *prefetched = skb_sk_is_prefetched(skb); + if (*prefetched) *refcounted = sk_is_refcounted(sk); skb->destructor = NULL; skb->sk = NULL; return sk; } + *prefetched = false; *refcounted = false; return NULL; } diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 739c15906a65..7fc98f4b63e9 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4198,9 +4198,6 @@ union bpf_attr { * **-EOPNOTSUPP** if the operation is not supported, for example * a call from outside of TC ingress. * - * **-ESOCKTNOSUPPORT** if the socket type is not supported - * (reuseport). - * * long bpf_sk_assign(struct bpf_sk_lookup *ctx, struct bpf_sock *sk, u64 flags) * Description * Helper is overloaded depending on BPF program type. 
This diff --git a/net/core/filter.c b/net/core/filter.c index b5b51ef48c5f..7c37f4646c20 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7351,8 +7351,6 @@ BPF_CALL_3(bpf_sk_assign, struct sk_buff *, skb, struct sock *, sk, u64, flags) return -EOPNOTSUPP; if (unlikely(dev_net(skb->dev) != sock_net(sk))) return -ENETUNREACH; - if (unlikely(sk_fullsock(sk) && sk->sk_reuseport)) - return -ESOCKTNOSUPPORT; if (sk_unhashed(sk)) return -EOPNOTSUPP; if (sk_is_refcounted(sk) && diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 456fb6ee6ee8..d89c4a39cf27 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2388,7 +2388,11 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, if (udp4_csum_init(skb, uh, proto)) goto csum_error; - sk = skb_steal_sock(skb, &refcounted); + sk = inet_steal_sock(net, skb, sizeof(struct udphdr), saddr, uh->source, daddr, uh->dest, + &refcounted, udp_ehashfn); + if (IS_ERR(sk)) + goto no_sk; + if (sk) { struct dst_entry *dst = skb_dst(skb); int ret; @@ -2409,7 +2413,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable); if (sk) return udp_unicast_rcv_skb(sk, skb, uh); - +no_sk: if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; nf_reset_ct(skb); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 4b8974a8c7a3..00996f0f7cfe 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -964,7 +964,11 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, goto csum_error; /* Check if the socket is already available, e.g. due to early demux */ - sk = skb_steal_sock(skb, &refcounted); + sk = inet6_steal_sock(net, skb, sizeof(struct udphdr), saddr, uh->source, daddr, uh->dest, + &refcounted, udp6_ehashfn); + if (IS_ERR(sk)) + goto no_sk; + if (sk) { struct dst_entry *dst = skb_dst(skb); int ret; @@ -998,7 +1002,7 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, goto report_csum_error; return udp6_unicast_rcv_skb(sk, skb, uh); } - +no_sk: reason = SKB_DROP_REASON_NO_SOCKET; if (!uh->check) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 739c15906a65..7fc98f4b63e9 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4198,9 +4198,6 @@ union bpf_attr { * **-EOPNOTSUPP** if the operation is not supported, for example * a call from outside of TC ingress. * - * **-ESOCKTNOSUPPORT** if the socket type is not supported - * (reuseport). - * * long bpf_sk_assign(struct bpf_sk_lookup *ctx, struct bpf_sock *sk, u64 flags) * Description * Helper is overloaded depending on BPF program type. This -- cgit v1.2.3 From f080864a9d906678e050f10f0e81add711b86fbc Mon Sep 17 00:00:00 2001 From: Zhengchao Shao Date: Mon, 24 Jul 2023 10:37:35 +0800 Subject: net: remove redundant NULL check in remove_xps_queue() There are currently two paths that call remove_xps_queue(): 1. __netif_set_xps_queue -> remove_xps_queue 2. clean_xps_maps -> remove_xps_queue_cpu -> remove_xps_queue There is no need to check dev_maps in remove_xps_queue() because dev_maps has been checked on these two paths. 
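As a plain-C illustration of the reasoning above (toy userspace code, not the kernel implementation): once every caller tests the pointer, as __netif_set_xps_queue() and remove_xps_queue_cpu() already do for dev_maps, the callee's own NULL check is dead code and removing it is a pure simplification.

	#include <stdbool.h>
	#include <stdio.h>

	struct map { int len; };

	/* Callee: relies on callers passing a non-NULL map, mirroring the
	 * patched remove_xps_queue(), which no longer re-checks dev_maps. */
	static bool remove_entry(struct map *m)
	{
		return m->len > 0 && m->len--;
	}

	/* Both call paths guard the pointer before calling the callee. */
	static void path_a(struct map *m)
	{
		if (m)
			remove_entry(m);
	}

	static void path_b(struct map *m)
	{
		if (!m)
			return;
		while (remove_entry(m))
			;
	}

	int main(void)
	{
		struct map m = { .len = 3 };

		path_a(&m);
		path_b(&m);
		path_a(NULL);	/* NULL never reaches remove_entry() */
		printf("remaining: %d\n", m.len);
		return 0;
	}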
Signed-off-by: Zhengchao Shao Link: https://lore.kernel.org/r/20230724023735.2751602-1-shaozhengchao@huawei.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 8e7d0cb540cd..e7ffcfa037f7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2384,8 +2384,7 @@ static bool remove_xps_queue(struct xps_dev_maps *dev_maps, struct xps_map *map = NULL; int pos; - if (dev_maps) - map = xmap_dereference(dev_maps->attr_map[tci]); + map = xmap_dereference(dev_maps->attr_map[tci]); if (!map) return false; -- cgit v1.2.3 From a927d77778e380ffd6ffea1f2037c6bdb23af99a Mon Sep 17 00:00:00 2001 From: Zhu Wang Date: Wed, 26 Jul 2023 18:15:31 +0800 Subject: nf_conntrack: fix -Wunused-const-variable= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When building with W=1, the following warning occurs. net/netfilter/nf_conntrack_proto_dccp.c:72:27: warning: ‘dccp_state_names’ defined but not used [-Wunused-const-variable=] static const char * const dccp_state_names[] = { We include dccp_state_names in the macro CONFIG_NF_CONNTRACK_PROCFS, since it is only used in the place which is included in the macro CONFIG_NF_CONNTRACK_PROCFS. Fixes: 2bc780499aa3 ("[NETFILTER]: nf_conntrack: add DCCP protocol support") Signed-off-by: Zhu Wang Reviewed-by: Simon Horman Signed-off-by: Florian Westphal --- net/netfilter/nf_conntrack_proto_dccp.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index d4fd626d2b8c..e2db1f4ec2df 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -69,6 +69,7 @@ #define DCCP_MSL (2 * 60 * HZ) +#ifdef CONFIG_NF_CONNTRACK_PROCFS static const char * const dccp_state_names[] = { [CT_DCCP_NONE] = "NONE", [CT_DCCP_REQUEST] = "REQUEST", @@ -81,6 +82,7 @@ static const char * const dccp_state_names[] = { [CT_DCCP_IGNORE] = "IGNORE", [CT_DCCP_INVALID] = "INVALID", }; +#endif #define sNO CT_DCCP_NONE #define sRQ CT_DCCP_REQUEST -- cgit v1.2.3 From 100a11b69842ab568cff2a59959daf317be525c9 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 18 Jul 2023 09:52:30 +0200 Subject: netfilter: nf_tables: use NLA_POLICY_MASK to test for valid flag options nf_tables relies on manual test of netlink attributes coming from userspace even in cases where this could be handled via netlink policy. Convert a bunch of 'flag' attributes to use NLA_POLICY_MASK checks. 
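To make the conversion pattern concrete, here is a minimal sketch built around a made-up expression ("foo" and the NFTA_FOO_*/NFT_FOO_F_* names do not exist); only NLA_POLICY_MASK(), NLA_BE32 and nla_get_be32() are taken from the patch below. The open-coded flag test moves out of ->init() and into the netlink policy, so the core rejects unknown flag bits while parsing, before the expression code ever runs.

	/* Hypothetical fragment for a kernel build context; illustrative only. */
	#include <net/netlink.h>

	#define NFTA_FOO_FLAGS	1
	#define NFTA_FOO_MAX	1
	#define NFT_FOO_F_INV	0x1
	#define NFT_FOO_F_ALL	NFT_FOO_F_INV

	static const struct nla_policy nft_foo_policy[NFTA_FOO_MAX + 1] = {
		/* Before: { .type = NLA_U32 } plus a manual mask check in ->init(). */
		[NFTA_FOO_FLAGS]	= NLA_POLICY_MASK(NLA_BE32, NFT_FOO_F_ALL),
	};

	static int nft_foo_parse_flags(const struct nlattr * const tb[], u32 *flags)
	{
		if (tb[NFTA_FOO_FLAGS]) {
			*flags = ntohl(nla_get_be32(tb[NFTA_FOO_FLAGS]));
			/* No "flags & ~NFT_FOO_F_ALL" test needed any more:
			 * the policy check already rejected unknown bits
			 * during attribute parsing. */
		}
		return 0;
	}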
Signed-off-by: Florian Westphal --- net/netfilter/nft_fib.c | 13 +++++++------ net/netfilter/nft_lookup.c | 6 ++---- net/netfilter/nft_masq.c | 8 +++----- net/netfilter/nft_nat.c | 8 +++----- net/netfilter/nft_redir.c | 8 +++----- 5 files changed, 18 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_fib.c b/net/netfilter/nft_fib.c index 6e049fd48760..601c9e09d07a 100644 --- a/net/netfilter/nft_fib.c +++ b/net/netfilter/nft_fib.c @@ -14,17 +14,18 @@ #include #include +#define NFTA_FIB_F_ALL (NFTA_FIB_F_SADDR | NFTA_FIB_F_DADDR | \ + NFTA_FIB_F_MARK | NFTA_FIB_F_IIF | NFTA_FIB_F_OIF | \ + NFTA_FIB_F_PRESENT) + const struct nla_policy nft_fib_policy[NFTA_FIB_MAX + 1] = { [NFTA_FIB_DREG] = { .type = NLA_U32 }, [NFTA_FIB_RESULT] = { .type = NLA_U32 }, - [NFTA_FIB_FLAGS] = { .type = NLA_U32 }, + [NFTA_FIB_FLAGS] = + NLA_POLICY_MASK(NLA_BE32, NFTA_FIB_F_ALL), }; EXPORT_SYMBOL(nft_fib_policy); -#define NFTA_FIB_F_ALL (NFTA_FIB_F_SADDR | NFTA_FIB_F_DADDR | \ - NFTA_FIB_F_MARK | NFTA_FIB_F_IIF | NFTA_FIB_F_OIF | \ - NFTA_FIB_F_PRESENT) - int nft_fib_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nft_data **data) { @@ -77,7 +78,7 @@ int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr, priv->flags = ntohl(nla_get_be32(tb[NFTA_FIB_FLAGS])); - if (priv->flags == 0 || (priv->flags & ~NFTA_FIB_F_ALL)) + if (priv->flags == 0) return -EINVAL; if ((priv->flags & (NFTA_FIB_F_SADDR | NFTA_FIB_F_DADDR)) == diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index 29ac48cdd6db..870e5b113d13 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -90,7 +90,8 @@ static const struct nla_policy nft_lookup_policy[NFTA_LOOKUP_MAX + 1] = { [NFTA_LOOKUP_SET_ID] = { .type = NLA_U32 }, [NFTA_LOOKUP_SREG] = { .type = NLA_U32 }, [NFTA_LOOKUP_DREG] = { .type = NLA_U32 }, - [NFTA_LOOKUP_FLAGS] = { .type = NLA_U32 }, + [NFTA_LOOKUP_FLAGS] = + NLA_POLICY_MASK(NLA_BE32, NFT_LOOKUP_F_INV), }; static int nft_lookup_init(const struct nft_ctx *ctx, @@ -120,9 +121,6 @@ static int nft_lookup_init(const struct nft_ctx *ctx, if (tb[NFTA_LOOKUP_FLAGS]) { flags = ntohl(nla_get_be32(tb[NFTA_LOOKUP_FLAGS])); - if (flags & ~NFT_LOOKUP_F_INV) - return -EINVAL; - if (flags & NFT_LOOKUP_F_INV) priv->invert = true; } diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c index b115d77fbbc7..8a14aaca93bb 100644 --- a/net/netfilter/nft_masq.c +++ b/net/netfilter/nft_masq.c @@ -20,7 +20,8 @@ struct nft_masq { }; static const struct nla_policy nft_masq_policy[NFTA_MASQ_MAX + 1] = { - [NFTA_MASQ_FLAGS] = { .type = NLA_U32 }, + [NFTA_MASQ_FLAGS] = + NLA_POLICY_MASK(NLA_BE32, NF_NAT_RANGE_MASK), [NFTA_MASQ_REG_PROTO_MIN] = { .type = NLA_U32 }, [NFTA_MASQ_REG_PROTO_MAX] = { .type = NLA_U32 }, }; @@ -47,11 +48,8 @@ static int nft_masq_init(const struct nft_ctx *ctx, struct nft_masq *priv = nft_expr_priv(expr); int err; - if (tb[NFTA_MASQ_FLAGS]) { + if (tb[NFTA_MASQ_FLAGS]) priv->flags = ntohl(nla_get_be32(tb[NFTA_MASQ_FLAGS])); - if (priv->flags & ~NF_NAT_RANGE_MASK) - return -EINVAL; - } if (tb[NFTA_MASQ_REG_PROTO_MIN]) { err = nft_parse_register_load(tb[NFTA_MASQ_REG_PROTO_MIN], diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index 5c29915ab028..583885ce7232 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -132,7 +132,8 @@ static const struct nla_policy nft_nat_policy[NFTA_NAT_MAX + 1] = { [NFTA_NAT_REG_ADDR_MAX] = { .type = NLA_U32 }, [NFTA_NAT_REG_PROTO_MIN] = { .type = NLA_U32 }, 
[NFTA_NAT_REG_PROTO_MAX] = { .type = NLA_U32 }, - [NFTA_NAT_FLAGS] = { .type = NLA_U32 }, + [NFTA_NAT_FLAGS] = + NLA_POLICY_MASK(NLA_BE32, NF_NAT_RANGE_MASK), }; static int nft_nat_validate(const struct nft_ctx *ctx, @@ -246,11 +247,8 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, priv->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } - if (tb[NFTA_NAT_FLAGS]) { + if (tb[NFTA_NAT_FLAGS]) priv->flags |= ntohl(nla_get_be32(tb[NFTA_NAT_FLAGS])); - if (priv->flags & ~NF_NAT_RANGE_MASK) - return -EOPNOTSUPP; - } return nf_ct_netns_get(ctx->net, family); } diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c index a70196ffcb1e..a58bd8d291ff 100644 --- a/net/netfilter/nft_redir.c +++ b/net/netfilter/nft_redir.c @@ -22,7 +22,8 @@ struct nft_redir { static const struct nla_policy nft_redir_policy[NFTA_REDIR_MAX + 1] = { [NFTA_REDIR_REG_PROTO_MIN] = { .type = NLA_U32 }, [NFTA_REDIR_REG_PROTO_MAX] = { .type = NLA_U32 }, - [NFTA_REDIR_FLAGS] = { .type = NLA_U32 }, + [NFTA_REDIR_FLAGS] = + NLA_POLICY_MASK(NLA_BE32, NF_NAT_RANGE_MASK), }; static int nft_redir_validate(const struct nft_ctx *ctx, @@ -68,11 +69,8 @@ static int nft_redir_init(const struct nft_ctx *ctx, priv->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } - if (tb[NFTA_REDIR_FLAGS]) { + if (tb[NFTA_REDIR_FLAGS]) priv->flags = ntohl(nla_get_be32(tb[NFTA_REDIR_FLAGS])); - if (priv->flags & ~NF_NAT_RANGE_MASK) - return -EINVAL; - } return nf_ct_netns_get(ctx->net, ctx->family); } -- cgit v1.2.3 From 0c805e80e35d042a41c8702fa13f453a504d2ede Mon Sep 17 00:00:00 2001 From: Lin Ma Date: Wed, 12 Jul 2023 21:32:36 +0800 Subject: netfilter: conntrack: validate cta_ip via parsing In current ctnetlink_parse_tuple_ip() function, nested parsing and validation is splitting as two parts, which could be cleanup to a simplified form. As the nla_parse_nested_deprecated function supports validation in the fly. These two finially reach same place __nla_validate_parse with same validate flag. nla_parse_nested_deprecated __nla_parse(.., NL_VALIDATE_LIBERAL, ..) __nla_validate_parse nla_validate_nested_deprecated __nla_validate_nested(.., NL_VALIDATE_LIBERAL, ..) __nla_validate __nla_validate_parse This commit removes the call to nla_validate_nested_deprecated and pass cta_ip_nla_policy when do parsing. Signed-off-by: Lin Ma Reviewed-by: Simon Horman Signed-off-by: Florian Westphal --- net/netfilter/nf_conntrack_netlink.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 69c8c8c7e9b8..334db22199c1 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1321,15 +1321,11 @@ static int ctnetlink_parse_tuple_ip(struct nlattr *attr, struct nlattr *tb[CTA_IP_MAX+1]; int ret = 0; - ret = nla_parse_nested_deprecated(tb, CTA_IP_MAX, attr, NULL, NULL); + ret = nla_parse_nested_deprecated(tb, CTA_IP_MAX, attr, + cta_ip_nla_policy, NULL); if (ret < 0) return ret; - ret = nla_validate_nested_deprecated(attr, CTA_IP_MAX, - cta_ip_nla_policy, NULL); - if (ret) - return ret; - switch (tuple->src.l3num) { case NFPROTO_IPV4: ret = ipv4_nlattr_to_tuple(tb, tuple, flags); -- cgit v1.2.3 From 051e77e33946f932e6879bf4df2773e3c998c3a7 Mon Sep 17 00:00:00 2001 From: Arseniy Krasnov Date: Tue, 25 Jul 2023 20:29:09 +0300 Subject: virtio/vsock: rework MSG_PEEK for SOCK_STREAM This reworks current implementation of MSG_PEEK logic: 1) Replaces 'skb_queue_walk_safe()' with 'skb_queue_walk()'. 
There is no need in the first one, as there are no removes of skb in loop. 2) Removes nested while loop - MSG_PEEK logic could be implemented without it: just iterate over skbs without removing it and copy data from each until destination buffer is not full. Signed-off-by: Arseniy Krasnov Reviewed-by: Bobby Eshleman Reviewed-by: Stefano Garzarella Acked-by: Michael S. Tsirkin Signed-off-by: Paolo Abeni --- net/vmw_vsock/virtio_transport_common.c | 41 +++++++++++++++------------------ 1 file changed, 19 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index b769fc258931..2ee40574c339 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -348,37 +348,34 @@ virtio_transport_stream_do_peek(struct vsock_sock *vsk, size_t len) { struct virtio_vsock_sock *vvs = vsk->trans; - size_t bytes, total = 0, off; - struct sk_buff *skb, *tmp; - int err = -EFAULT; + struct sk_buff *skb; + size_t total = 0; + int err; spin_lock_bh(&vvs->rx_lock); - skb_queue_walk_safe(&vvs->rx_queue, skb, tmp) { - off = 0; + skb_queue_walk(&vvs->rx_queue, skb) { + size_t bytes; - if (total == len) - break; + bytes = len - total; + if (bytes > skb->len) + bytes = skb->len; - while (total < len && off < skb->len) { - bytes = len - total; - if (bytes > skb->len - off) - bytes = skb->len - off; + spin_unlock_bh(&vvs->rx_lock); - /* sk_lock is held by caller so no one else can dequeue. - * Unlock rx_lock since memcpy_to_msg() may sleep. - */ - spin_unlock_bh(&vvs->rx_lock); + /* sk_lock is held by caller so no one else can dequeue. + * Unlock rx_lock since memcpy_to_msg() may sleep. + */ + err = memcpy_to_msg(msg, skb->data, bytes); + if (err) + goto out; - err = memcpy_to_msg(msg, skb->data + off, bytes); - if (err) - goto out; + total += bytes; - spin_lock_bh(&vvs->rx_lock); + spin_lock_bh(&vvs->rx_lock); - total += bytes; - off += bytes; - } + if (total == len) + break; } spin_unlock_bh(&vvs->rx_lock); -- cgit v1.2.3 From a75f501de88e58db24dc8822505f924cd1085ac6 Mon Sep 17 00:00:00 2001 From: Arseniy Krasnov Date: Tue, 25 Jul 2023 20:29:10 +0300 Subject: virtio/vsock: support MSG_PEEK for SOCK_SEQPACKET This adds support of MSG_PEEK flag for SOCK_SEQPACKET type of socket. Difference with SOCK_STREAM is that this callback returns either length of the message or error. Signed-off-by: Arseniy Krasnov Reviewed-by: Stefano Garzarella Acked-by: Michael S. 
Tsirkin Signed-off-by: Paolo Abeni --- net/vmw_vsock/virtio_transport_common.c | 63 +++++++++++++++++++++++++++++++-- 1 file changed, 60 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 2ee40574c339..352d042b130b 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -460,6 +460,63 @@ out: return err; } +static ssize_t +virtio_transport_seqpacket_do_peek(struct vsock_sock *vsk, + struct msghdr *msg) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + struct sk_buff *skb; + size_t total, len; + + spin_lock_bh(&vvs->rx_lock); + + if (!vvs->msg_count) { + spin_unlock_bh(&vvs->rx_lock); + return 0; + } + + total = 0; + len = msg_data_left(msg); + + skb_queue_walk(&vvs->rx_queue, skb) { + struct virtio_vsock_hdr *hdr; + + if (total < len) { + size_t bytes; + int err; + + bytes = len - total; + if (bytes > skb->len) + bytes = skb->len; + + spin_unlock_bh(&vvs->rx_lock); + + /* sk_lock is held by caller so no one else can dequeue. + * Unlock rx_lock since memcpy_to_msg() may sleep. + */ + err = memcpy_to_msg(msg, skb->data, bytes); + if (err) + return err; + + spin_lock_bh(&vvs->rx_lock); + } + + total += skb->len; + hdr = virtio_vsock_hdr(skb); + + if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOM) { + if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOR) + msg->msg_flags |= MSG_EOR; + + break; + } + } + + spin_unlock_bh(&vvs->rx_lock); + + return total; +} + static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk, struct msghdr *msg, int flags) @@ -554,9 +611,9 @@ virtio_transport_seqpacket_dequeue(struct vsock_sock *vsk, int flags) { if (flags & MSG_PEEK) - return -EOPNOTSUPP; - - return virtio_transport_seqpacket_do_dequeue(vsk, msg, flags); + return virtio_transport_seqpacket_do_peek(vsk, msg); + else + return virtio_transport_seqpacket_do_dequeue(vsk, msg, flags); } EXPORT_SYMBOL_GPL(virtio_transport_seqpacket_dequeue); -- cgit v1.2.3 From ef27ba5c845d22de79447d7da8e12471168a8e77 Mon Sep 17 00:00:00 2001 From: Patrick Rohr Date: Wed, 26 Jul 2023 11:47:42 -0700 Subject: net: remove comment in ndisc_router_discovery Removes superfluous (and misplaced) comment from ndisc_router_discovery. Signed-off-by: Patrick Rohr Reviewed-by: Simon Horman Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20230726184742.342825-1-prohr@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/ndisc.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'net') diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 29ddad1c1a2f..95750fbf6782 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1266,10 +1266,6 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb) } #endif - /* - * set the RA_RECV flag in the interface - */ - in6_dev = __in6_dev_get(skb->dev); if (!in6_dev) { ND_PRINTK(0, err, "RA: can't find inet6 device for %s\n", -- cgit v1.2.3 From 4d66f235c7904576a7df396791b96bc7d259507b Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 26 Jul 2023 22:31:41 +0800 Subject: bridge: Remove unused declaration br_multicast_set_hash_max() Since commit 19e3a9c90c53 ("net: bridge: convert multicast to generic rhashtable") this is not used, so can remove it. 
Signed-off-by: YueHaibing Reviewed-by: Simon Horman Acked-by: Nikolay Aleksandrov Link: https://lore.kernel.org/r/20230726143141.11704-1-yuehaibing@huawei.com Signed-off-by: Jakub Kicinski --- net/bridge/br_private.h | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 51e4ca54b537..a1f4acfa6994 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -974,7 +974,6 @@ int br_multicast_set_vlan_router(struct net_bridge_vlan *v, u8 mcast_router); int br_multicast_toggle(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack); int br_multicast_set_querier(struct net_bridge_mcast *brmctx, unsigned long val); -int br_multicast_set_hash_max(struct net_bridge *br, unsigned long val); int br_multicast_set_igmp_version(struct net_bridge_mcast *brmctx, unsigned long val); #if IS_ENABLED(CONFIG_IPV6) -- cgit v1.2.3 From d4a80cc69aea74fe220dc9faa0d6d62e4dfa871a Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 26 Jul 2023 22:32:39 +0800 Subject: dccp: Remove unused declaration dccp_feat_initialise_sysctls() This is never used, so can remove it. Signed-off-by: YueHaibing Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230726143239.9904-1-yuehaibing@huawei.com Signed-off-by: Jakub Kicinski --- net/dccp/feat.h | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/dccp/feat.h b/net/dccp/feat.h index d76c9be5bfca..57d9c026aa3f 100644 --- a/net/dccp/feat.h +++ b/net/dccp/feat.h @@ -105,7 +105,6 @@ extern int sysctl_dccp_rx_ccid; extern int sysctl_dccp_tx_ccid; int dccp_feat_init(struct sock *sk); -void dccp_feat_initialise_sysctls(void); int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local, u8 const *list, u8 len); int dccp_feat_parse_options(struct sock *, struct dccp_request_sock *, -- cgit v1.2.3 From e22e358bbeb3256e2eb854493b46c815402087bb Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Wed, 26 Jul 2023 21:15:51 +0200 Subject: net/tls: handle MSG_EOR for tls_sw TX flow tls_sw_sendmsg() already handles MSG_MORE, but bails out on MSG_EOR. Seeing that MSG_EOR is basically the opposite of MSG_MORE this patch adds handling MSG_EOR by treating it as the negation of MSG_MORE. And erroring out if MSG_EOR is specified with MSG_MORE. Signed-off-by: Hannes Reinecke Reviewed-by: Jakub Kicinski Link: https://lore.kernel.org/r/20230726191556.41714-2-hare@suse.de Signed-off-by: Jakub Kicinski --- net/tls/tls_sw.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 53f944e6d8ef..9aef45e870a5 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -984,6 +984,9 @@ static int tls_sw_sendmsg_locked(struct sock *sk, struct msghdr *msg, int ret = 0; int pending; + if (!eor && (msg->msg_flags & MSG_EOR)) + return -EINVAL; + if (unlikely(msg->msg_controllen)) { ret = tls_process_cmsg(sk, msg, &record_type); if (ret) { @@ -1193,7 +1196,7 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) int ret; if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | - MSG_CMSG_COMPAT | MSG_SPLICE_PAGES | + MSG_CMSG_COMPAT | MSG_SPLICE_PAGES | MSG_EOR | MSG_SENDPAGE_NOPOLICY)) return -EOPNOTSUPP; -- cgit v1.2.3 From c004b0e00c94322a2f82a8b0b7711ed938097774 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Wed, 26 Jul 2023 21:15:52 +0200 Subject: net/tls: handle MSG_EOR for tls_device TX flow tls_push_data() MSG_MORE, but bails out on MSG_EOR. 
Seeing that MSG_EOR is basically the opposite of MSG_MORE this patch adds handling MSG_EOR by treating it as the absence of MSG_MORE. Consequently we should return an error when both are set. Signed-off-by: Hannes Reinecke Reviewed-by: Jakub Kicinski Link: https://lore.kernel.org/r/20230726191556.41714-3-hare@suse.de Signed-off-by: Jakub Kicinski --- net/tls/tls_device.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 2021fe557e50..5df18f696d7f 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -441,9 +441,13 @@ static int tls_push_data(struct sock *sk, long timeo; if (flags & - ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | MSG_SPLICE_PAGES)) + ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | + MSG_SPLICE_PAGES | MSG_EOR)) return -EOPNOTSUPP; + if ((flags & (MSG_MORE | MSG_EOR)) == (MSG_MORE | MSG_EOR)) + return -EINVAL; + if (unlikely(sk->sk_err)) return -sk->sk_err; -- cgit v1.2.3 From 11863c6d440d34c4b967e517739b38a7e68ed092 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Wed, 26 Jul 2023 21:15:54 +0200 Subject: net/tls: Use tcp_read_sock() instead of ops->read_sock() TLS resets the protocol operations, so the read_sock() callback might be changed, too. In this case using sock->ops->readsock() in tls_strp_read_copyin() will enter an infinite recursion if the read_sock() callback is calling tls_rx_rec_wait() which will call into sock->ops->readsock() via tls_strp_read_copyin(). But as tls_strp_read_copyin() is supposed to produce data from the consumed socket and that socket is always a TCP socket we can call tcp_read_sock() directly without having to deal with callbacks. Signed-off-by: Hannes Reinecke Reviewed-by: Jakub Kicinski Link: https://lore.kernel.org/r/20230726191556.41714-5-hare@suse.de Signed-off-by: Jakub Kicinski --- net/tls/tls_strp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/tls/tls_strp.c b/net/tls/tls_strp.c index f37f4a0fcd3c..ca1e0e198ceb 100644 --- a/net/tls/tls_strp.c +++ b/net/tls/tls_strp.c @@ -369,7 +369,6 @@ static int tls_strp_copyin(read_descriptor_t *desc, struct sk_buff *in_skb, static int tls_strp_read_copyin(struct tls_strparser *strp) { - struct socket *sock = strp->sk->sk_socket; read_descriptor_t desc; desc.arg.data = strp; @@ -377,7 +376,7 @@ static int tls_strp_read_copyin(struct tls_strparser *strp) desc.count = 1; /* give more than one skb per call */ /* sk should be locked here, so okay to do read_sock */ - sock->ops->read_sock(strp->sk, &desc, tls_strp_copyin); + tcp_read_sock(strp->sk, &desc, tls_strp_copyin); return desc.error; } -- cgit v1.2.3 From f9ae3204fb45d0749befc1cdff50f691c7461e5a Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Wed, 26 Jul 2023 21:15:55 +0200 Subject: net/tls: split tls_rx_reader_lock Split tls_rx_reader_{lock,unlock} into an 'acquire/release' and the actual locking part. With that we can use the tls_rx_reader_lock in situations where the socket is already locked. 
Suggested-by: Sagi Grimberg Signed-off-by: Hannes Reinecke Reviewed-by: Jakub Kicinski Link: https://lore.kernel.org/r/20230726191556.41714-6-hare@suse.de Signed-off-by: Jakub Kicinski --- net/tls/tls_sw.c | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 9aef45e870a5..d0636ea13009 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1848,13 +1848,10 @@ tls_read_flush_backlog(struct sock *sk, struct tls_prot_info *prot, return sk_flush_backlog(sk); } -static int tls_rx_reader_lock(struct sock *sk, struct tls_sw_context_rx *ctx, - bool nonblock) +static int tls_rx_reader_acquire(struct sock *sk, struct tls_sw_context_rx *ctx, + bool nonblock) { long timeo; - int err; - - lock_sock(sk); timeo = sock_rcvtimeo(sk, nonblock); @@ -1868,26 +1865,30 @@ static int tls_rx_reader_lock(struct sock *sk, struct tls_sw_context_rx *ctx, !READ_ONCE(ctx->reader_present), &wait); remove_wait_queue(&ctx->wq, &wait); - if (timeo <= 0) { - err = -EAGAIN; - goto err_unlock; - } - if (signal_pending(current)) { - err = sock_intr_errno(timeo); - goto err_unlock; - } + if (timeo <= 0) + return -EAGAIN; + if (signal_pending(current)) + return sock_intr_errno(timeo); } WRITE_ONCE(ctx->reader_present, 1); return 0; +} -err_unlock: - release_sock(sk); +static int tls_rx_reader_lock(struct sock *sk, struct tls_sw_context_rx *ctx, + bool nonblock) +{ + int err; + + lock_sock(sk); + err = tls_rx_reader_acquire(sk, ctx, nonblock); + if (err) + release_sock(sk); return err; } -static void tls_rx_reader_unlock(struct sock *sk, struct tls_sw_context_rx *ctx) +static void tls_rx_reader_release(struct sock *sk, struct tls_sw_context_rx *ctx) { if (unlikely(ctx->reader_contended)) { if (wq_has_sleeper(&ctx->wq)) @@ -1899,6 +1900,11 @@ static void tls_rx_reader_unlock(struct sock *sk, struct tls_sw_context_rx *ctx) } WRITE_ONCE(ctx->reader_present, 0); +} + +static void tls_rx_reader_unlock(struct sock *sk, struct tls_sw_context_rx *ctx) +{ + tls_rx_reader_release(sk, ctx); release_sock(sk); } -- cgit v1.2.3 From 662fbcec32f4af6bdcf5b4006b792ebe9543d945 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Wed, 26 Jul 2023 21:15:56 +0200 Subject: net/tls: implement ->read_sock() Implement ->read_sock() function for use with nvme-tcp. 
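For context, a minimal sketch of how a kernel consumer drives ->read_sock(), nvme-tcp being the motivating user; the demo_* names are invented, while read_descriptor_t, sk_read_actor_t and the proto_ops->read_sock() signature follow the existing API. With kTLS on the socket, the same call now resolves to tls_sw_read_sock() and the actor is handed decrypted record payload instead of raw TCP bytes.

	/* Illustrative consumer; not nvme-tcp code. */
	#include <linux/net.h>
	#include <linux/skbuff.h>
	#include <net/sock.h>

	static int demo_recv_actor(read_descriptor_t *desc, struct sk_buff *skb,
				   unsigned int offset, size_t len)
	{
		size_t *total = desc->arg.data;

		/* Accept everything offered; returning <= 0 would stop the
		 * loop inside ->read_sock(). */
		*total += len;
		return len;
	}

	static int demo_drain(struct socket *sock)
	{
		read_descriptor_t desc;
		size_t total = 0;
		int ret;

		desc.arg.data = &total;
		desc.error = 0;
		desc.count = 1;		/* allow more than one skb per call */

		lock_sock(sock->sk);	/* read_sock() callbacks expect the caller to hold it */
		ret = sock->ops->read_sock(sock->sk, &desc, demo_recv_actor);
		release_sock(sock->sk);

		return ret < 0 ? ret : (int)total;
	}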
Signed-off-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Reviewed-by: Jakub Kicinski Cc: Boris Pismenny Link: https://lore.kernel.org/r/20230726191556.41714-7-hare@suse.de Signed-off-by: Jakub Kicinski --- net/tls/tls.h | 2 ++ net/tls/tls_main.c | 2 ++ net/tls/tls_sw.c | 99 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 103 insertions(+) (limited to 'net') diff --git a/net/tls/tls.h b/net/tls/tls.h index 86cef1c68e03..7e4d45537deb 100644 --- a/net/tls/tls.h +++ b/net/tls/tls.h @@ -110,6 +110,8 @@ bool tls_sw_sock_is_readable(struct sock *sk); ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); +int tls_sw_read_sock(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t read_actor); int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); void tls_device_splice_eof(struct socket *sock); diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index b6896126bb92..7dbb8cd8f809 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -962,10 +962,12 @@ static void build_proto_ops(struct proto_ops ops[TLS_NUM_CONFIG][TLS_NUM_CONFIG] ops[TLS_BASE][TLS_SW ] = ops[TLS_BASE][TLS_BASE]; ops[TLS_BASE][TLS_SW ].splice_read = tls_sw_splice_read; ops[TLS_BASE][TLS_SW ].poll = tls_sk_poll; + ops[TLS_BASE][TLS_SW ].read_sock = tls_sw_read_sock; ops[TLS_SW ][TLS_SW ] = ops[TLS_SW ][TLS_BASE]; ops[TLS_SW ][TLS_SW ].splice_read = tls_sw_splice_read; ops[TLS_SW ][TLS_SW ].poll = tls_sk_poll; + ops[TLS_SW ][TLS_SW ].read_sock = tls_sw_read_sock; #ifdef CONFIG_TLS_DEVICE ops[TLS_HW ][TLS_BASE] = ops[TLS_BASE][TLS_BASE]; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index d0636ea13009..9c1f13541708 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -2202,6 +2202,105 @@ splice_requeue: goto splice_read_end; } +int tls_sw_read_sock(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t read_actor) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); + struct tls_prot_info *prot = &tls_ctx->prot_info; + struct strp_msg *rxm = NULL; + struct sk_buff *skb = NULL; + struct sk_psock *psock; + size_t flushed_at = 0; + bool released = true; + struct tls_msg *tlm; + ssize_t copied = 0; + ssize_t decrypted; + int err, used; + + psock = sk_psock_get(sk); + if (psock) { + sk_psock_put(sk, psock); + return -EINVAL; + } + err = tls_rx_reader_acquire(sk, ctx, true); + if (err < 0) + return err; + + /* If crypto failed the connection is broken */ + err = ctx->async_wait.err; + if (err) + goto read_sock_end; + + decrypted = 0; + do { + if (!skb_queue_empty(&ctx->rx_list)) { + skb = __skb_dequeue(&ctx->rx_list); + rxm = strp_msg(skb); + tlm = tls_msg(skb); + } else { + struct tls_decrypt_arg darg; + int to_decrypt; + + err = tls_rx_rec_wait(sk, NULL, true, released); + if (err <= 0) + goto read_sock_end; + + memset(&darg.inargs, 0, sizeof(darg.inargs)); + + rxm = strp_msg(tls_strp_msg(ctx)); + tlm = tls_msg(tls_strp_msg(ctx)); + + to_decrypt = rxm->full_len - prot->overhead_size; + + err = tls_rx_one_record(sk, NULL, &darg); + if (err < 0) { + tls_err_abort(sk, -EBADMSG); + goto read_sock_end; + } + + released = tls_read_flush_backlog(sk, prot, rxm->full_len, to_decrypt, + decrypted, &flushed_at); + skb = darg.skb; + decrypted += rxm->full_len; + + tls_rx_rec_done(ctx); + } + + /* read_sock does not support reading control messages */ + if (tlm->control != TLS_RECORD_TYPE_DATA) { + err = -EINVAL; + goto read_sock_requeue; + } + + used = 
read_actor(desc, skb, rxm->offset, rxm->full_len); + if (used <= 0) { + if (!copied) + err = used; + goto read_sock_requeue; + } + copied += used; + if (used < rxm->full_len) { + rxm->offset += used; + rxm->full_len -= used; + if (!desc->count) + goto read_sock_requeue; + } else { + consume_skb(skb); + if (!desc->count) + skb = NULL; + } + } while (skb); + +read_sock_end: + tls_rx_reader_release(sk, ctx); + return copied ? : err; + +read_sock_requeue: + __skb_queue_head(&ctx->rx_list, skb); + goto read_sock_end; +} + bool tls_sw_sock_is_readable(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); -- cgit v1.2.3 From 3d40aed862874db14e1dd41fd6f12636dcfdcc3e Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Wed, 26 Jul 2023 19:49:39 -0600 Subject: net: Explicitly include correct DT includes The DT of_device.h and of_platform.h date back to the separate of_platform_bus_type before it as merged into the regular platform bus. As part of that merge prepping Arm DT support 13 years ago, they "temporarily" include each other. They also include platform_device.h and of.h. As a result, there's a pretty much random mix of those include files used throughout the tree. In order to detangle these headers and replace the implicit includes with struct declarations, users need to explicitly include the correct includes. Acked-by: Alex Elder Reviewed-by: Bhupesh Sharma Reviewed-by: Wei Fang Signed-off-by: Rob Herring Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230727014944.3972546-1-robh@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/aeroflex/greth.c | 4 ++-- drivers/net/ethernet/amd/sunlance.c | 2 +- drivers/net/ethernet/apm/xgene-v2/main.h | 1 + drivers/net/ethernet/arc/emac_main.c | 2 +- drivers/net/ethernet/atheros/ag71xx.c | 3 ++- drivers/net/ethernet/cadence/macb_main.c | 1 - drivers/net/ethernet/cirrus/cs89x0.c | 1 - drivers/net/ethernet/ezchip/nps_enet.c | 5 ++--- drivers/net/ethernet/freescale/dpaa/dpaa_eth.c | 3 ++- drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c | 2 ++ drivers/net/ethernet/freescale/enetc/enetc_ierb.c | 2 +- drivers/net/ethernet/freescale/fec_mpc52xx.c | 4 ++-- drivers/net/ethernet/freescale/fec_mpc52xx_phy.c | 3 ++- drivers/net/ethernet/freescale/fec_ptp.c | 1 - drivers/net/ethernet/freescale/fman/fman.c | 1 + drivers/net/ethernet/freescale/fman/fman_port.c | 1 + drivers/net/ethernet/freescale/fman/mac.c | 2 ++ drivers/net/ethernet/freescale/fs_enet/mac-fcc.c | 1 - drivers/net/ethernet/freescale/fs_enet/mac-fec.c | 1 - drivers/net/ethernet/freescale/fs_enet/mac-scc.c | 1 - drivers/net/ethernet/freescale/fsl_pq_mdio.c | 1 + drivers/net/ethernet/freescale/gianfar.c | 2 +- drivers/net/ethernet/freescale/gianfar_ethtool.c | 2 ++ drivers/net/ethernet/freescale/ucc_geth.c | 3 ++- drivers/net/ethernet/freescale/xgmac_mdio.c | 4 ++-- drivers/net/ethernet/hisilicon/hns/hns_dsaf_ppe.c | 3 --- drivers/net/ethernet/hisilicon/hns/hns_dsaf_rcb.c | 4 ---- drivers/net/ethernet/ibm/ehea/ehea_main.c | 1 + drivers/net/ethernet/ibm/emac/core.c | 1 + drivers/net/ethernet/ibm/emac/core.h | 1 - drivers/net/ethernet/ibm/emac/mal.c | 2 ++ drivers/net/ethernet/ibm/emac/rgmii.c | 2 ++ drivers/net/ethernet/ibm/emac/tah.c | 2 ++ drivers/net/ethernet/ibm/emac/zmii.c | 2 ++ drivers/net/ethernet/korina.c | 2 +- drivers/net/ethernet/marvell/mvmdio.c | 2 +- drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 1 - drivers/net/ethernet/marvell/prestera/prestera_rxtx.c | 3 --- drivers/net/ethernet/marvell/sky2.c | 1 - drivers/net/ethernet/mediatek/mtk_eth_soc.c | 3 
++- drivers/net/ethernet/mediatek/mtk_star_emac.c | 1 - drivers/net/ethernet/mediatek/mtk_wed.c | 1 + drivers/net/ethernet/mediatek/mtk_wed_wo.c | 3 +-- drivers/net/ethernet/microchip/lan966x/lan966x_main.c | 3 ++- drivers/net/ethernet/mscc/ocelot_fdma.c | 1 - drivers/net/ethernet/mscc/ocelot_vsc7514.c | 3 ++- drivers/net/ethernet/ni/nixge.c | 5 ++--- drivers/net/ethernet/qualcomm/emac/emac-sgmii.c | 2 ++ drivers/net/ethernet/qualcomm/emac/emac.c | 1 - drivers/net/ethernet/qualcomm/qca_spi.c | 1 - drivers/net/ethernet/qualcomm/qca_uart.c | 1 - drivers/net/ethernet/renesas/ravb_main.c | 3 +-- drivers/net/ethernet/renesas/rswitch.c | 3 +-- drivers/net/ethernet/renesas/sh_eth.c | 2 -- drivers/net/ethernet/smsc/smsc911x.c | 1 - drivers/net/ethernet/socionext/sni_ave.c | 3 ++- drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c | 2 +- drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c | 1 - drivers/net/ethernet/stmicro/stmmac/dwmac-ingenic.c | 1 - drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c | 2 +- drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c | 2 +- drivers/net/ethernet/stmicro/stmmac/dwmac-oxnas.c | 1 - drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c | 1 - drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c | 2 +- drivers/net/ethernet/stmicro/stmmac/dwmac-starfive.c | 4 +++- drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c | 1 - drivers/net/ethernet/stmicro/stmmac/dwmac-stm32.c | 1 - drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c | 3 ++- drivers/net/ethernet/stmicro/stmmac/dwmac-tegra.c | 2 +- drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c | 3 ++- drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c | 1 - drivers/net/ethernet/sun/niu.c | 2 +- drivers/net/ethernet/sun/sunbmac.c | 2 +- drivers/net/ethernet/sun/sungem.c | 1 + drivers/net/ethernet/sun/sunhme.c | 3 ++- drivers/net/ethernet/sun/sunqe.c | 2 +- drivers/net/ethernet/ti/am65-cpsw-nuss.c | 1 + drivers/net/ethernet/ti/cpsw-common.c | 1 - drivers/net/ethernet/ti/cpsw-phy-sel.c | 1 - drivers/net/ethernet/ti/davinci_mdio.c | 1 - drivers/net/ethernet/via/via-rhine.c | 2 +- drivers/net/ethernet/via/via-velocity.c | 2 +- drivers/net/ethernet/xilinx/ll_temac.h | 1 + drivers/net/ethernet/xilinx/ll_temac_main.c | 4 +--- drivers/net/ethernet/xilinx/ll_temac_mdio.c | 2 +- drivers/net/ethernet/xilinx/xilinx_axienet_main.c | 3 ++- drivers/net/ethernet/xilinx/xilinx_emaclite.c | 4 ++-- drivers/net/ieee802154/ca8210.c | 1 - drivers/net/ipa/ipa_main.c | 2 +- net/core/of_net.c | 1 + 90 files changed, 88 insertions(+), 92 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/aeroflex/greth.c b/drivers/net/ethernet/aeroflex/greth.c index aa0d2f3aaeaa..597a02c75d52 100644 --- a/drivers/net/ethernet/aeroflex/greth.c +++ b/drivers/net/ethernet/aeroflex/greth.c @@ -29,9 +29,9 @@ #include #include #include -#include +#include #include -#include +#include #include #include #include diff --git a/drivers/net/ethernet/amd/sunlance.c b/drivers/net/ethernet/amd/sunlance.c index 68ca1225eedc..33bb539ad70a 100644 --- a/drivers/net/ethernet/amd/sunlance.c +++ b/drivers/net/ethernet/amd/sunlance.c @@ -92,7 +92,7 @@ static char lancestr[] = "LANCE"; #include #include #include -#include +#include #include #include diff --git a/drivers/net/ethernet/apm/xgene-v2/main.h b/drivers/net/ethernet/apm/xgene-v2/main.h index b3985a7be59d..7be6f83e22fe 100644 --- a/drivers/net/ethernet/apm/xgene-v2/main.h +++ b/drivers/net/ethernet/apm/xgene-v2/main.h @@ -22,6 +22,7 @@ #include #include #include +#include #include #include "mac.h" #include 
"enet.h" diff --git a/drivers/net/ethernet/arc/emac_main.c b/drivers/net/ethernet/arc/emac_main.c index 2b427d8a1831..31ee477dd131 100644 --- a/drivers/net/ethernet/arc/emac_main.c +++ b/drivers/net/ethernet/arc/emac_main.c @@ -15,11 +15,11 @@ #include #include #include +#include #include #include #include #include -#include #include "emac.h" diff --git a/drivers/net/ethernet/atheros/ag71xx.c b/drivers/net/ethernet/atheros/ag71xx.c index ff1a5edf8df1..009e0b3066fa 100644 --- a/drivers/net/ethernet/atheros/ag71xx.c +++ b/drivers/net/ethernet/atheros/ag71xx.c @@ -29,9 +29,10 @@ #include #include +#include #include #include -#include +#include #include #include #include diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index f6a0f12a6d52..b61566afb2f4 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/net/ethernet/cirrus/cs89x0.c b/drivers/net/ethernet/cirrus/cs89x0.c index 276c32c3926a..7c51fd9fc9be 100644 --- a/drivers/net/ethernet/cirrus/cs89x0.c +++ b/drivers/net/ethernet/cirrus/cs89x0.c @@ -54,7 +54,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/net/ethernet/ezchip/nps_enet.c b/drivers/net/ethernet/ezchip/nps_enet.c index f1eb660aaee2..edf000e7bab4 100644 --- a/drivers/net/ethernet/ezchip/nps_enet.c +++ b/drivers/net/ethernet/ezchip/nps_enet.c @@ -6,10 +6,9 @@ #include #include #include -#include -#include +#include #include -#include +#include #include "nps_enet.h" #define DRV_NAME "nps_mgt_enet" diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c index 85e38a1ec22a..dcbc598b11c6 100644 --- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c +++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c @@ -7,8 +7,8 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include +#include #include -#include #include #include #include @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c b/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c index 9c71cbbb13d8..5bd0b36d1feb 100644 --- a/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c +++ b/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c @@ -6,7 +6,9 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include +#include #include +#include #include #include diff --git a/drivers/net/ethernet/freescale/enetc/enetc_ierb.c b/drivers/net/ethernet/freescale/enetc/enetc_ierb.c index b307bef4dc29..d39617ab9306 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_ierb.c +++ b/drivers/net/ethernet/freescale/enetc/enetc_ierb.c @@ -18,8 +18,8 @@ */ #include +#include #include -#include #include #include #include "enetc.h" diff --git a/drivers/net/ethernet/freescale/fec_mpc52xx.c b/drivers/net/ethernet/freescale/fec_mpc52xx.c index 984ece5cd64f..ebae71ec26c6 100644 --- a/drivers/net/ethernet/freescale/fec_mpc52xx.c +++ b/drivers/net/ethernet/freescale/fec_mpc52xx.c @@ -29,12 +29,12 @@ #include #include #include +#include #include -#include #include #include #include -#include +#include #include #include diff --git a/drivers/net/ethernet/freescale/fec_mpc52xx_phy.c b/drivers/net/ethernet/freescale/fec_mpc52xx_phy.c index 61d3776d6750..39689826cc8f 100644 --- a/drivers/net/ethernet/freescale/fec_mpc52xx_phy.c +++ b/drivers/net/ethernet/freescale/fec_mpc52xx_phy.c @@ -13,10 +13,11 @@ 
#include #include #include -#include #include +#include #include #include +#include #include #include #include "fec_mpc52xx.h" diff --git a/drivers/net/ethernet/freescale/fec_ptp.c b/drivers/net/ethernet/freescale/fec_ptp.c index afc658d2c271..fc4674cb65be 100644 --- a/drivers/net/ethernet/freescale/fec_ptp.c +++ b/drivers/net/ethernet/freescale/fec_ptp.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include diff --git a/drivers/net/ethernet/freescale/fman/fman.c b/drivers/net/ethernet/freescale/fman/fman.c index 9d85fb136e34..d96028f01770 100644 --- a/drivers/net/ethernet/freescale/fman/fman.c +++ b/drivers/net/ethernet/freescale/fman/fman.c @@ -7,6 +7,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include +#include #include #include #include diff --git a/drivers/net/ethernet/freescale/fman/fman_port.c b/drivers/net/ethernet/freescale/fman/fman_port.c index ab90fe2bee5e..406e75e9e5ea 100644 --- a/drivers/net/ethernet/freescale/fman/fman_port.c +++ b/drivers/net/ethernet/freescale/fman/fman_port.c @@ -6,6 +6,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include +#include #include #include #include diff --git a/drivers/net/ethernet/freescale/fman/mac.c b/drivers/net/ethernet/freescale/fman/mac.c index c5045891d694..b6c7c4c0b367 100644 --- a/drivers/net/ethernet/freescale/fman/mac.c +++ b/drivers/net/ethernet/freescale/fman/mac.c @@ -12,12 +12,14 @@ #include #include #include +#include #include #include #include #include #include #include +#include #include "mac.h" #include "fman_mac.h" diff --git a/drivers/net/ethernet/freescale/fs_enet/mac-fcc.c b/drivers/net/ethernet/freescale/fs_enet/mac-fcc.c index b47490be872c..925428f1b0c8 100644 --- a/drivers/net/ethernet/freescale/fs_enet/mac-fcc.c +++ b/drivers/net/ethernet/freescale/fs_enet/mac-fcc.c @@ -32,7 +32,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/net/ethernet/freescale/fs_enet/mac-fec.c b/drivers/net/ethernet/freescale/fs_enet/mac-fec.c index 61f4b6e50d29..f609dc112458 100644 --- a/drivers/net/ethernet/freescale/fs_enet/mac-fec.c +++ b/drivers/net/ethernet/freescale/fs_enet/mac-fec.c @@ -32,7 +32,6 @@ #include #include #include -#include #include #include diff --git a/drivers/net/ethernet/freescale/fs_enet/mac-scc.c b/drivers/net/ethernet/freescale/fs_enet/mac-scc.c index 64300ac13e02..66d40da5cde0 100644 --- a/drivers/net/ethernet/freescale/fs_enet/mac-scc.c +++ b/drivers/net/ethernet/freescale/fs_enet/mac-scc.c @@ -32,7 +32,6 @@ #include #include #include -#include #include #include diff --git a/drivers/net/ethernet/freescale/fsl_pq_mdio.c b/drivers/net/ethernet/freescale/fsl_pq_mdio.c index 01d594886f52..eee675a25b2c 100644 --- a/drivers/net/ethernet/freescale/fsl_pq_mdio.c +++ b/drivers/net/ethernet/freescale/fsl_pq_mdio.c @@ -12,6 +12,7 @@ */ #include +#include #include #include #include diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c index 0c898f9ee358..e3dfbd7a4236 100644 --- a/drivers/net/ethernet/freescale/gianfar.c +++ b/drivers/net/ethernet/freescale/gianfar.c @@ -60,6 +60,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include +#include #include #include #include @@ -75,7 +76,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/net/ethernet/freescale/gianfar_ethtool.c b/drivers/net/ethernet/freescale/gianfar_ethtool.c index b2b0d3c26fcc..7a15b9245698 100644 --- a/drivers/net/ethernet/freescale/gianfar_ethtool.c +++ b/drivers/net/ethernet/freescale/gianfar_ethtool.c @@ 
-38,7 +38,9 @@ #include #include #include +#include #include +#include #include #include "gianfar.h" diff --git a/drivers/net/ethernet/freescale/ucc_geth.c b/drivers/net/ethernet/freescale/ucc_geth.c index 2b3a15f24e7c..ab421243a419 100644 --- a/drivers/net/ethernet/freescale/ucc_geth.c +++ b/drivers/net/ethernet/freescale/ucc_geth.c @@ -28,11 +28,12 @@ #include #include #include +#include #include #include #include #include -#include +#include #include #include diff --git a/drivers/net/ethernet/freescale/xgmac_mdio.c b/drivers/net/ethernet/freescale/xgmac_mdio.c index a13b4ba4d6e1..65dc07d0df0f 100644 --- a/drivers/net/ethernet/freescale/xgmac_mdio.c +++ b/drivers/net/ethernet/freescale/xgmac_mdio.c @@ -19,10 +19,10 @@ #include #include #include -#include +#include #include -#include #include +#include #include /* Number of microseconds to wait for a register to respond */ diff --git a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_ppe.c b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_ppe.c index a7eb87da4e70..a08d1f0a5a16 100644 --- a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_ppe.c +++ b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_ppe.c @@ -9,9 +9,6 @@ #include #include #include -#include -#include -#include #include "hns_dsaf_ppe.h" diff --git a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_rcb.c b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_rcb.c index e2ff3ca198d1..93344563a259 100644 --- a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_rcb.c +++ b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_rcb.c @@ -11,10 +11,6 @@ #include #include #include -#include -#include -#include -#include #include #include "hns_dsaf_main.h" diff --git a/drivers/net/ethernet/ibm/ehea/ehea_main.c b/drivers/net/ethernet/ibm/ehea/ehea_main.c index b4aff59b3eb4..0a56e9752464 100644 --- a/drivers/net/ethernet/ibm/ehea/ehea_main.c +++ b/drivers/net/ethernet/ibm/ehea/ehea_main.c @@ -31,6 +31,7 @@ #include #include #include +#include #include diff --git a/drivers/net/ethernet/ibm/emac/core.c b/drivers/net/ethernet/ibm/emac/core.c index c97095abd26a..0c314bf97480 100644 --- a/drivers/net/ethernet/ibm/emac/core.c +++ b/drivers/net/ethernet/ibm/emac/core.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include diff --git a/drivers/net/ethernet/ibm/emac/core.h b/drivers/net/ethernet/ibm/emac/core.h index 89a1b0fea158..295516b07662 100644 --- a/drivers/net/ethernet/ibm/emac/core.h +++ b/drivers/net/ethernet/ibm/emac/core.h @@ -27,7 +27,6 @@ #include #include #include -#include #include #include diff --git a/drivers/net/ethernet/ibm/emac/mal.c b/drivers/net/ethernet/ibm/emac/mal.c index ff5487bbebe3..c3236b59e7e9 100644 --- a/drivers/net/ethernet/ibm/emac/mal.c +++ b/drivers/net/ethernet/ibm/emac/mal.c @@ -22,7 +22,9 @@ #include #include +#include #include +#include #include "core.h" #include diff --git a/drivers/net/ethernet/ibm/emac/rgmii.c b/drivers/net/ethernet/ibm/emac/rgmii.c index 50358cf00130..fd437f986edf 100644 --- a/drivers/net/ethernet/ibm/emac/rgmii.c +++ b/drivers/net/ethernet/ibm/emac/rgmii.c @@ -19,7 +19,9 @@ #include #include #include +#include #include +#include #include #include "emac.h" diff --git a/drivers/net/ethernet/ibm/emac/tah.c b/drivers/net/ethernet/ibm/emac/tah.c index 008bbdaf1204..aae9a88d95d7 100644 --- a/drivers/net/ethernet/ibm/emac/tah.c +++ b/drivers/net/ethernet/ibm/emac/tah.c @@ -14,7 +14,9 @@ * * Copyright (c) 2005 Eugene Surovegin */ +#include #include +#include #include #include "emac.h" diff --git a/drivers/net/ethernet/ibm/emac/zmii.c 
b/drivers/net/ethernet/ibm/emac/zmii.c index 57a25c7a9e70..6337388ee5f4 100644 --- a/drivers/net/ethernet/ibm/emac/zmii.c +++ b/drivers/net/ethernet/ibm/emac/zmii.c @@ -19,7 +19,9 @@ #include #include #include +#include #include +#include #include #include "emac.h" diff --git a/drivers/net/ethernet/korina.c b/drivers/net/ethernet/korina.c index 2b9335cb4bb3..25f7a2a3b847 100644 --- a/drivers/net/ethernet/korina.c +++ b/drivers/net/ethernet/korina.c @@ -43,7 +43,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/net/ethernet/marvell/mvmdio.c b/drivers/net/ethernet/marvell/mvmdio.c index 8662543ca5c8..a1a80f13b1e8 100644 --- a/drivers/net/ethernet/marvell/mvmdio.c +++ b/drivers/net/ethernet/marvell/mvmdio.c @@ -24,8 +24,8 @@ #include #include #include +#include #include -#include #include #include #include diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index 1fec84b4c068..9e1b596c8f08 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/net/ethernet/marvell/prestera/prestera_rxtx.c b/drivers/net/ethernet/marvell/prestera/prestera_rxtx.c index 9277a8fd1339..cc2a9ae794be 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_rxtx.c +++ b/drivers/net/ethernet/marvell/prestera/prestera_rxtx.c @@ -5,9 +5,6 @@ #include #include #include -#include -#include -#include #include #include "prestera_dsa.h" diff --git a/drivers/net/ethernet/marvell/sky2.c b/drivers/net/ethernet/marvell/sky2.c index 7c487f9b36ec..c4cca27fb0d5 100644 --- a/drivers/net/ethernet/marvell/sky2.c +++ b/drivers/net/ethernet/marvell/sky2.c @@ -32,7 +32,6 @@ #include #include #include -#include #include #include diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c index 30e3935e83f9..38f39be04eae 100644 --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c @@ -6,11 +6,12 @@ * Copyright (C) 2013-2016 Michael Lee */ -#include +#include #include #include #include #include +#include #include #include #include diff --git a/drivers/net/ethernet/mediatek/mtk_star_emac.c b/drivers/net/ethernet/mediatek/mtk_star_emac.c index 02c03325911f..31aebeb2e285 100644 --- a/drivers/net/ethernet/mediatek/mtk_star_emac.c +++ b/drivers/net/ethernet/mediatek/mtk_star_emac.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/net/ethernet/mediatek/mtk_wed.c b/drivers/net/ethernet/mediatek/mtk_wed.c index 5f062ecb402c..00aeee0d5e45 100644 --- a/drivers/net/ethernet/mediatek/mtk_wed.c +++ b/drivers/net/ethernet/mediatek/mtk_wed.c @@ -2,6 +2,7 @@ /* Copyright (C) 2021 Felix Fietkau */ #include +#include #include #include #include diff --git a/drivers/net/ethernet/mediatek/mtk_wed_wo.c b/drivers/net/ethernet/mediatek/mtk_wed_wo.c index 69fba29055e9..3bd51a3d6650 100644 --- a/drivers/net/ethernet/mediatek/mtk_wed_wo.c +++ b/drivers/net/ethernet/mediatek/mtk_wed_wo.c @@ -7,10 +7,9 @@ #include #include -#include #include -#include #include +#include #include #include diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_main.c b/drivers/net/ethernet/microchip/lan966x/lan966x_main.c index fbb0bb4594cd..73f20683210e 100644 --- a/drivers/net/ethernet/microchip/lan966x/lan966x_main.c +++ 
b/drivers/net/ethernet/microchip/lan966x/lan966x_main.c @@ -5,9 +5,10 @@ #include #include #include -#include +#include #include #include +#include #include #include diff --git a/drivers/net/ethernet/mscc/ocelot_fdma.c b/drivers/net/ethernet/mscc/ocelot_fdma.c index 83a3ce0c568e..312a46832154 100644 --- a/drivers/net/ethernet/mscc/ocelot_fdma.c +++ b/drivers/net/ethernet/mscc/ocelot_fdma.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include "ocelot_fdma.h" diff --git a/drivers/net/ethernet/mscc/ocelot_vsc7514.c b/drivers/net/ethernet/mscc/ocelot_vsc7514.c index 97e90e2869d4..151b42465348 100644 --- a/drivers/net/ethernet/mscc/ocelot_vsc7514.c +++ b/drivers/net/ethernet/mscc/ocelot_vsc7514.c @@ -10,8 +10,9 @@ #include #include #include +#include #include -#include +#include #include #include #include diff --git a/drivers/net/ethernet/ni/nixge.c b/drivers/net/ethernet/ni/nixge.c index 0fd156286d4d..ba27bbc68f85 100644 --- a/drivers/net/ethernet/ni/nixge.c +++ b/drivers/net/ethernet/ni/nixge.c @@ -7,11 +7,10 @@ #include #include #include -#include +#include #include #include -#include -#include +#include #include #include #include diff --git a/drivers/net/ethernet/qualcomm/emac/emac-sgmii.c b/drivers/net/ethernet/qualcomm/emac/emac-sgmii.c index 802ef81493e0..e4bc18009d08 100644 --- a/drivers/net/ethernet/qualcomm/emac/emac-sgmii.c +++ b/drivers/net/ethernet/qualcomm/emac/emac-sgmii.c @@ -8,7 +8,9 @@ #include #include #include +#include #include +#include #include "emac.h" #include "emac-mac.h" #include "emac-sgmii.h" diff --git a/drivers/net/ethernet/qualcomm/emac/emac.c b/drivers/net/ethernet/qualcomm/emac/emac.c index eaa50050aa0b..19bb16daf4e7 100644 --- a/drivers/net/ethernet/qualcomm/emac/emac.c +++ b/drivers/net/ethernet/qualcomm/emac/emac.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/net/ethernet/qualcomm/qca_spi.c b/drivers/net/ethernet/qualcomm/qca_spi.c index 4a1b94e5a8ea..bec723028e96 100644 --- a/drivers/net/ethernet/qualcomm/qca_spi.c +++ b/drivers/net/ethernet/qualcomm/qca_spi.c @@ -35,7 +35,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/net/ethernet/qualcomm/qca_uart.c b/drivers/net/ethernet/qualcomm/qca_uart.c index 26646cb6a20a..ace99c62d03a 100644 --- a/drivers/net/ethernet/qualcomm/qca_uart.c +++ b/drivers/net/ethernet/qualcomm/qca_uart.c @@ -32,7 +32,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index 4d6b3b7d6abb..7df9f9f8e134 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -21,10 +21,9 @@ #include #include #include -#include -#include #include #include +#include #include #include #include diff --git a/drivers/net/ethernet/renesas/rswitch.c b/drivers/net/ethernet/renesas/rswitch.c index 4e412ac0965a..0ba7fb75d589 100644 --- a/drivers/net/ethernet/renesas/rswitch.c +++ b/drivers/net/ethernet/renesas/rswitch.c @@ -12,11 +12,10 @@ #include #include #include -#include -#include #include #include #include +#include #include #include #include diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c index d8ec729825be..274ea16c0a1f 100644 --- a/drivers/net/ethernet/renesas/sh_eth.c +++ b/drivers/net/ethernet/renesas/sh_eth.c @@ -19,8 +19,6 @@ #include #include #include -#include -#include #include #include #include diff --git 
a/drivers/net/ethernet/smsc/smsc911x.c b/drivers/net/ethernet/smsc/smsc911x.c index c362bff3cb83..cb590db625e8 100644 --- a/drivers/net/ethernet/smsc/smsc911x.c +++ b/drivers/net/ethernet/smsc/smsc911x.c @@ -43,7 +43,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/net/ethernet/socionext/sni_ave.c b/drivers/net/ethernet/socionext/sni_ave.c index 492c39c08af1..4838d2383a43 100644 --- a/drivers/net/ethernet/socionext/sni_ave.c +++ b/drivers/net/ethernet/socionext/sni_ave.c @@ -15,10 +15,11 @@ #include #include #include +#include #include #include -#include #include +#include #include #include #include diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c index b5efd9c2eac7..74c10373cc39 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c index b9378a63f0e8..92e06a96757a 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-ingenic.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-ingenic.c index 8063ba1c3ce8..e22ef0d6bc73 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-ingenic.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-ingenic.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c index 4d877d75642d..7580077383c0 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c @@ -7,8 +7,8 @@ #include #include #include -#include #include +#include #include #include diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c index 92b16048f91c..0b159dc0d5f6 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-oxnas.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-oxnas.c index 42954020de2c..f91753565d60 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-oxnas.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-oxnas.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c index a85501874801..5b3df27035ec 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c @@ -3,7 +3,6 @@ #include #include -#include #include #include #include diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c index d81591b470a2..35789cb549f7 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c @@ -14,8 +14,8 @@ #include #include #include +#include #include -#include #include #include #include diff --git 
a/drivers/net/ethernet/stmicro/stmmac/dwmac-starfive.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-starfive.c index d3a39d2fb3a9..b1c5f2527c0d 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-starfive.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-starfive.c @@ -7,8 +7,10 @@ * */ +#include +#include +#include #include -#include #include #include "stmmac_platform.h" diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c index dcbb17c4f07a..ef03e4669491 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include "stmmac_platform.h" diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-stm32.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-stm32.c index bdb4de59a672..3a09085819dc 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-stm32.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-stm32.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c index b607279e8cbd..c23420863a8d 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c @@ -11,9 +11,10 @@ #include #include #include -#include +#include #include #include +#include #include #include #include diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-tegra.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-tegra.c index 99e2e5a5cd60..2bc6ef1eca1f 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-tegra.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-tegra.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only #include -#include +#include #include #include #include diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c index acbb284be174..a8731ce0fff0 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c @@ -6,7 +6,8 @@ */ #include -#include +#include +#include #include #include diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c index 23d53ea04b24..be8e79c7aa34 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include "stmmac.h" diff --git a/drivers/net/ethernet/sun/niu.c b/drivers/net/ethernet/sun/niu.c index 7a2e76776297..011d74087f86 100644 --- a/drivers/net/ethernet/sun/niu.c +++ b/drivers/net/ethernet/sun/niu.c @@ -31,7 +31,7 @@ #include #include -#include +#include #include "niu.h" diff --git a/drivers/net/ethernet/sun/sunbmac.c b/drivers/net/ethernet/sun/sunbmac.c index 34b94153bf0c..cc34d92d2e3d 100644 --- a/drivers/net/ethernet/sun/sunbmac.c +++ b/drivers/net/ethernet/sun/sunbmac.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/net/ethernet/sun/sungem.c b/drivers/net/ethernet/sun/sungem.c index 4154e68639ac..9bd1df8308d2 100644 --- a/drivers/net/ethernet/sun/sungem.c +++ b/drivers/net/ethernet/sun/sungem.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include diff --git a/drivers/net/ethernet/sun/sunhme.c b/drivers/net/ethernet/sun/sunhme.c index b93613cd1994..b983b9c23be6 100644 --- 
a/drivers/net/ethernet/sun/sunhme.c +++ b/drivers/net/ethernet/sun/sunhme.c @@ -32,9 +32,10 @@ #include #include #include -#include #include +#include #include +#include #include #include #include diff --git a/drivers/net/ethernet/sun/sunqe.c b/drivers/net/ethernet/sun/sunqe.c index 6418fcc3139f..b37360f44972 100644 --- a/drivers/net/ethernet/sun/sunqe.c +++ b/drivers/net/ethernet/sun/sunqe.c @@ -27,8 +27,8 @@ #include #include #include -#include #include +#include #include #include diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index bebcfd5e6b57..bea6fc0f324c 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/ethernet/ti/cpsw-common.c b/drivers/net/ethernet/ti/cpsw-common.c index bfa81bbfce3f..26dc906eae90 100644 --- a/drivers/net/ethernet/ti/cpsw-common.c +++ b/drivers/net/ethernet/ti/cpsw-common.c @@ -3,7 +3,6 @@ #include #include #include -#include #include #include diff --git a/drivers/net/ethernet/ti/cpsw-phy-sel.c b/drivers/net/ethernet/ti/cpsw-phy-sel.c index 25e707d7b87c..4edb7963f856 100644 --- a/drivers/net/ethernet/ti/cpsw-phy-sel.c +++ b/drivers/net/ethernet/ti/cpsw-phy-sel.c @@ -12,7 +12,6 @@ #include #include #include -#include #include "cpsw.h" diff --git a/drivers/net/ethernet/ti/davinci_mdio.c b/drivers/net/ethernet/ti/davinci_mdio.c index 23169e36a3d4..89b6d23e9937 100644 --- a/drivers/net/ethernet/ti/davinci_mdio.c +++ b/drivers/net/ethernet/ti/davinci_mdio.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/net/ethernet/via/via-rhine.c b/drivers/net/ethernet/via/via-rhine.c index d716e6fe26e1..3e09e5036490 100644 --- a/drivers/net/ethernet/via/via-rhine.c +++ b/drivers/net/ethernet/via/via-rhine.c @@ -94,7 +94,7 @@ static const int multicast_filter_limit = 32; #include #include #include -#include +#include #include #include #include diff --git a/drivers/net/ethernet/via/via-velocity.c b/drivers/net/ethernet/via/via-velocity.c index 86f7843b4591..731f689412e6 100644 --- a/drivers/net/ethernet/via/via-velocity.c +++ b/drivers/net/ethernet/via/via-velocity.c @@ -57,8 +57,8 @@ #include #include #include +#include #include -#include #include #include #include diff --git a/drivers/net/ethernet/xilinx/ll_temac.h b/drivers/net/ethernet/xilinx/ll_temac.h index 6668d1b760d8..90d122d5475c 100644 --- a/drivers/net/ethernet/xilinx/ll_temac.h +++ b/drivers/net/ethernet/xilinx/ll_temac.h @@ -5,6 +5,7 @@ #include #include +#include #include #ifdef CONFIG_PPC_DCR diff --git a/drivers/net/ethernet/xilinx/ll_temac_main.c b/drivers/net/ethernet/xilinx/ll_temac_main.c index e0ac1bcd9925..0bf9cdee1df2 100644 --- a/drivers/net/ethernet/xilinx/ll_temac_main.c +++ b/drivers/net/ethernet/xilinx/ll_temac_main.c @@ -35,12 +35,10 @@ #include #include #include -#include #include #include #include -#include -#include +#include #include #include #include /* needed for sizeof(tcphdr) */ diff --git a/drivers/net/ethernet/xilinx/ll_temac_mdio.c b/drivers/net/ethernet/xilinx/ll_temac_mdio.c index 2371c072b53f..07a9fb49eda1 100644 --- a/drivers/net/ethernet/xilinx/ll_temac_mdio.c +++ b/drivers/net/ethernet/xilinx/ll_temac_mdio.c @@ -10,8 +10,8 @@ #include #include #include -#include #include +#include #include #include #include diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c index 
8e32dc50a408..b7ec4dafae90 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c @@ -27,11 +27,12 @@ #include #include #include +#include #include #include -#include #include #include +#include #include #include #include diff --git a/drivers/net/ethernet/xilinx/xilinx_emaclite.c b/drivers/net/ethernet/xilinx/xilinx_emaclite.c index ad2c30d9a482..b358ecc67227 100644 --- a/drivers/net/ethernet/xilinx/xilinx_emaclite.c +++ b/drivers/net/ethernet/xilinx/xilinx_emaclite.c @@ -8,6 +8,7 @@ */ #include +#include #include #include #include @@ -15,9 +16,8 @@ #include #include #include +#include #include -#include -#include #include #include #include diff --git a/drivers/net/ieee802154/ca8210.c b/drivers/net/ieee802154/ca8210.c index f9b10e84de06..aebb19f1b3a4 100644 --- a/drivers/net/ieee802154/ca8210.c +++ b/drivers/net/ieee802154/ca8210.c @@ -57,7 +57,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/net/ipa/ipa_main.c b/drivers/net/ipa/ipa_main.c index 6a2f2fc2f501..da853353a5c7 100644 --- a/drivers/net/ipa/ipa_main.c +++ b/drivers/net/ipa/ipa_main.c @@ -13,8 +13,8 @@ #include #include #include -#include #include +#include #include #include #include diff --git a/net/core/of_net.c b/net/core/of_net.c index 55d3fe229269..93ea425b9248 100644 --- a/net/core/of_net.c +++ b/net/core/of_net.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From 801b27e88046cb0b34df7d645afb6ba5fc6f677c Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Tue, 25 Jul 2023 20:56:54 +0000 Subject: net: ethtool: Unify ETHTOOL_{G,S}RXFH rxnfc copy ETHTOOL_GRXFH correctly copies in the full struct ethtool_rxnfc when FLOW_RSS is set; ETHTOOL_SRXFH needs a similar code path to handle the FLOW_RSS case so that ethtool can set the flow hash for custom RSS contexts (if supported by the driver). The copy code from ETHTOOL_GRXFH has been pulled out in to a helper so that it can be called in both ETHTOOL_{G,S}RXFH code paths. Acked-by: Edward Cree Signed-off-by: Joe Damato Signed-off-by: David S. Miller --- net/ethtool/ioctl.c | 75 +++++++++++++++++++++++++++-------------------------- 1 file changed, 38 insertions(+), 37 deletions(-) (limited to 'net') diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 4a51e0ec295c..7d40e7913e76 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -907,6 +907,38 @@ static int ethtool_rxnfc_copy_to_compat(void __user *useraddr, return 0; } +static int ethtool_rxnfc_copy_struct(u32 cmd, struct ethtool_rxnfc *info, + size_t *info_size, void __user *useraddr) +{ + /* struct ethtool_rxnfc was originally defined for + * ETHTOOL_{G,S}RXFH with only the cmd, flow_type and data + * members. User-space might still be using that + * definition. + */ + if (cmd == ETHTOOL_GRXFH || cmd == ETHTOOL_SRXFH) + *info_size = (offsetof(struct ethtool_rxnfc, data) + + sizeof(info->data)); + + if (ethtool_rxnfc_copy_from_user(info, useraddr, *info_size)) + return -EFAULT; + + if ((cmd == ETHTOOL_GRXFH || cmd == ETHTOOL_SRXFH) && info->flow_type & FLOW_RSS) { + *info_size = sizeof(*info); + if (ethtool_rxnfc_copy_from_user(info, useraddr, *info_size)) + return -EFAULT; + /* Since malicious users may modify the original data, + * we need to check whether FLOW_RSS is still requested. 
+ */ + if (!(info->flow_type & FLOW_RSS)) + return -EINVAL; + } + + if (info->cmd != cmd) + return -EINVAL; + + return 0; +} + static int ethtool_rxnfc_copy_to_user(void __user *useraddr, const struct ethtool_rxnfc *rxnfc, size_t size, const u32 *rule_buf) @@ -944,16 +976,9 @@ static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev, if (!dev->ethtool_ops->set_rxnfc) return -EOPNOTSUPP; - /* struct ethtool_rxnfc was originally defined for - * ETHTOOL_{G,S}RXFH with only the cmd, flow_type and data - * members. User-space might still be using that - * definition. */ - if (cmd == ETHTOOL_SRXFH) - info_size = (offsetof(struct ethtool_rxnfc, data) + - sizeof(info.data)); - - if (ethtool_rxnfc_copy_from_user(&info, useraddr, info_size)) - return -EFAULT; + rc = ethtool_rxnfc_copy_struct(cmd, &info, &info_size, useraddr); + if (rc) + return rc; rc = dev->ethtool_ops->set_rxnfc(dev, &info); if (rc) @@ -978,33 +1003,9 @@ static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev, if (!ops->get_rxnfc) return -EOPNOTSUPP; - /* struct ethtool_rxnfc was originally defined for - * ETHTOOL_{G,S}RXFH with only the cmd, flow_type and data - * members. User-space might still be using that - * definition. */ - if (cmd == ETHTOOL_GRXFH) - info_size = (offsetof(struct ethtool_rxnfc, data) + - sizeof(info.data)); - - if (ethtool_rxnfc_copy_from_user(&info, useraddr, info_size)) - return -EFAULT; - - /* If FLOW_RSS was requested then user-space must be using the - * new definition, as FLOW_RSS is newer. - */ - if (cmd == ETHTOOL_GRXFH && info.flow_type & FLOW_RSS) { - info_size = sizeof(info); - if (ethtool_rxnfc_copy_from_user(&info, useraddr, info_size)) - return -EFAULT; - /* Since malicious users may modify the original data, - * we need to check whether FLOW_RSS is still requested. - */ - if (!(info.flow_type & FLOW_RSS)) - return -EINVAL; - } - - if (info.cmd != cmd) - return -EINVAL; + ret = ethtool_rxnfc_copy_struct(cmd, &info, &info_size, useraddr); + if (ret) + return ret; if (info.cmd == ETHTOOL_GRXCLSRLALL) { if (info.rule_cnt > 0) { -- cgit v1.2.3 From 7f6c40391a048c5d0f593f285bee45f7f98a3ca4 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Wed, 26 Jul 2023 10:39:05 +0800 Subject: IPv6: add extack info for IPv6 address add/delete Add extack info for IPv6 address add/delete, which would be useful for users to understand the problem without having to read kernel code. Suggested-by: Beniamino Galvani Reviewed-by: Ido Schimmel Reviewed-by: David Ahern Signed-off-by: Hangbin Liu Reviewed-by: Simon Horman Signed-off-by: David S. 
Miller --- include/net/ip6_route.h | 2 +- net/ipv6/addrconf.c | 77 ++++++++++++++++++++++++++++++++++--------------- net/ipv6/anycast.c | 2 +- net/ipv6/route.c | 5 ++-- 4 files changed, 59 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 3556595ce59a..b32539bb0fb0 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -156,7 +156,7 @@ void fib6_force_start_gc(struct net *net); struct fib6_info *addrconf_f6i_alloc(struct net *net, struct inet6_dev *idev, const struct in6_addr *addr, bool anycast, - gfp_t gfp_flags); + gfp_t gfp_flags, struct netlink_ext_ack *extack); struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, int flags); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 94d1fdb0393f..5daf6a4986f8 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1063,20 +1063,28 @@ ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config *cfg, struct fib6_info *f6i = NULL; int err = 0; - if (addr_type == IPV6_ADDR_ANY || - (addr_type & IPV6_ADDR_MULTICAST && - !(cfg->ifa_flags & IFA_F_MCAUTOJOIN)) || - (!(idev->dev->flags & IFF_LOOPBACK) && - !netif_is_l3_master(idev->dev) && - addr_type & IPV6_ADDR_LOOPBACK)) + if (addr_type == IPV6_ADDR_ANY) { + NL_SET_ERR_MSG_MOD(extack, "Invalid address"); return ERR_PTR(-EADDRNOTAVAIL); + } else if (addr_type & IPV6_ADDR_MULTICAST && + !(cfg->ifa_flags & IFA_F_MCAUTOJOIN)) { + NL_SET_ERR_MSG_MOD(extack, "Cannot assign multicast address without \"IFA_F_MCAUTOJOIN\" flag"); + return ERR_PTR(-EADDRNOTAVAIL); + } else if (!(idev->dev->flags & IFF_LOOPBACK) && + !netif_is_l3_master(idev->dev) && + addr_type & IPV6_ADDR_LOOPBACK) { + NL_SET_ERR_MSG_MOD(extack, "Cannot assign loopback address on this device"); + return ERR_PTR(-EADDRNOTAVAIL); + } if (idev->dead) { - err = -ENODEV; /*XXX*/ + NL_SET_ERR_MSG_MOD(extack, "device is going away"); + err = -ENODEV; goto out; } if (idev->cnf.disable_ipv6) { + NL_SET_ERR_MSG_MOD(extack, "IPv6 is disabled on this device"); err = -EACCES; goto out; } @@ -1103,7 +1111,7 @@ ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config *cfg, goto out; } - f6i = addrconf_f6i_alloc(net, idev, cfg->pfx, false, gfp_flags); + f6i = addrconf_f6i_alloc(net, idev, cfg->pfx, false, gfp_flags, extack); if (IS_ERR(f6i)) { err = PTR_ERR(f6i); f6i = NULL; @@ -2927,30 +2935,40 @@ static int inet6_addr_add(struct net *net, int ifindex, ASSERT_RTNL(); - if (cfg->plen > 128) + if (cfg->plen > 128) { + NL_SET_ERR_MSG_MOD(extack, "Invalid prefix length"); return -EINVAL; + } /* check the lifetime */ - if (!cfg->valid_lft || cfg->preferred_lft > cfg->valid_lft) + if (!cfg->valid_lft || cfg->preferred_lft > cfg->valid_lft) { + NL_SET_ERR_MSG_MOD(extack, "address lifetime invalid"); return -EINVAL; + } - if (cfg->ifa_flags & IFA_F_MANAGETEMPADDR && cfg->plen != 64) + if (cfg->ifa_flags & IFA_F_MANAGETEMPADDR && cfg->plen != 64) { + NL_SET_ERR_MSG_MOD(extack, "address with \"mngtmpaddr\" flag must have a prefix length of 64"); return -EINVAL; + } dev = __dev_get_by_index(net, ifindex); if (!dev) return -ENODEV; idev = addrconf_add_dev(dev); - if (IS_ERR(idev)) + if (IS_ERR(idev)) { + NL_SET_ERR_MSG_MOD(extack, "IPv6 is disabled on this device"); return PTR_ERR(idev); + } if (cfg->ifa_flags & IFA_F_MCAUTOJOIN) { int ret = ipv6_mc_config(net->ipv6.mc_autojoin_sk, true, cfg->pfx, ifindex); - if (ret < 0) + if (ret < 0) { + NL_SET_ERR_MSG_MOD(extack, "Multicast auto join failed"); return ret; + } } cfg->scope = 
ipv6_addr_scope(cfg->pfx); @@ -3007,22 +3025,29 @@ static int inet6_addr_add(struct net *net, int ifindex, } static int inet6_addr_del(struct net *net, int ifindex, u32 ifa_flags, - const struct in6_addr *pfx, unsigned int plen) + const struct in6_addr *pfx, unsigned int plen, + struct netlink_ext_ack *extack) { struct inet6_ifaddr *ifp; struct inet6_dev *idev; struct net_device *dev; - if (plen > 128) + if (plen > 128) { + NL_SET_ERR_MSG_MOD(extack, "Invalid prefix length"); return -EINVAL; + } dev = __dev_get_by_index(net, ifindex); - if (!dev) + if (!dev) { + NL_SET_ERR_MSG_MOD(extack, "Unable to find the interface"); return -ENODEV; + } idev = __in6_dev_get(dev); - if (!idev) + if (!idev) { + NL_SET_ERR_MSG_MOD(extack, "IPv6 is disabled on this device"); return -ENXIO; + } read_lock_bh(&idev->lock); list_for_each_entry(ifp, &idev->addr_list, if_list) { @@ -3045,6 +3070,8 @@ static int inet6_addr_del(struct net *net, int ifindex, u32 ifa_flags, } } read_unlock_bh(&idev->lock); + + NL_SET_ERR_MSG_MOD(extack, "address not found"); return -EADDRNOTAVAIL; } @@ -3087,7 +3114,7 @@ int addrconf_del_ifaddr(struct net *net, void __user *arg) rtnl_lock(); err = inet6_addr_del(net, ireq.ifr6_ifindex, 0, &ireq.ifr6_addr, - ireq.ifr6_prefixlen); + ireq.ifr6_prefixlen, NULL); rtnl_unlock(); return err; } @@ -3490,7 +3517,7 @@ static int fixup_permanent_addr(struct net *net, struct fib6_info *f6i, *prev; f6i = addrconf_f6i_alloc(net, idev, &ifp->addr, false, - GFP_ATOMIC); + GFP_ATOMIC, NULL); if (IS_ERR(f6i)) return PTR_ERR(f6i); @@ -4700,7 +4727,7 @@ inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, ifa_flags &= IFA_F_MANAGETEMPADDR; return inet6_addr_del(net, ifm->ifa_index, ifa_flags, pfx, - ifm->ifa_prefixlen); + ifm->ifa_prefixlen, extack); } static int modify_prefix_route(struct inet6_ifaddr *ifp, @@ -4905,8 +4932,10 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, } dev = __dev_get_by_index(net, ifm->ifa_index); - if (!dev) + if (!dev) { + NL_SET_ERR_MSG_MOD(extack, "Unable to find the interface"); return -ENODEV; + } if (tb[IFA_FLAGS]) cfg.ifa_flags = nla_get_u32(tb[IFA_FLAGS]); @@ -4941,10 +4970,12 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, } if (nlh->nlmsg_flags & NLM_F_EXCL || - !(nlh->nlmsg_flags & NLM_F_REPLACE)) + !(nlh->nlmsg_flags & NLM_F_REPLACE)) { + NL_SET_ERR_MSG_MOD(extack, "address already assigned"); err = -EEXIST; - else + } else { err = inet6_addr_modify(net, ifa, &cfg); + } in6_ifa_put(ifa); diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index dacdea7fcb62..bb17f484ee2c 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -305,7 +305,7 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) } net = dev_net(idev->dev); - f6i = addrconf_f6i_alloc(net, idev, addr, true, GFP_ATOMIC); + f6i = addrconf_f6i_alloc(net, idev, addr, true, GFP_ATOMIC, NULL); if (IS_ERR(f6i)) { err = PTR_ERR(f6i); goto out; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 64e873f5895f..c90700aed3a1 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -4543,7 +4543,8 @@ static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff struct fib6_info *addrconf_f6i_alloc(struct net *net, struct inet6_dev *idev, const struct in6_addr *addr, - bool anycast, gfp_t gfp_flags) + bool anycast, gfp_t gfp_flags, + struct netlink_ext_ack *extack) { struct fib6_config cfg = { .fc_table = l3mdev_fib_table(idev->dev) ? 
: RT6_TABLE_LOCAL, @@ -4565,7 +4566,7 @@ struct fib6_info *addrconf_f6i_alloc(struct net *net, cfg.fc_flags |= RTF_LOCAL; } - f6i = ip6_route_info_create(&cfg, gfp_flags, NULL); + f6i = ip6_route_info_create(&cfg, gfp_flags, extack); if (!IS_ERR(f6i)) { f6i->dst_nocount = true; -- cgit v1.2.3 From 759ab1edb56c88906830fd6b2e7b12514dd32758 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 26 Jul 2023 11:55:29 -0700 Subject: net: store netdevs in an xarray Iterating over the netdev hash table for netlink dumps is hard. Dumps are done in "chunks" so we need to save the position after each chunk, so we know where to restart from. Because netdevs are stored in a hash table we remember which bucket we were in and how many devices we dumped. Since we don't hold any locks across the "chunks" - devices may come and go while we're dumping. If that happens we may miss a device (if device is deleted from the bucket we were in). We indicate to user space that this may have happened by setting NLM_F_DUMP_INTR. User space is supposed to dump again (I think) if it sees that. Somehow I doubt most user space gets this right.. To illustrate let's look at an example: System state: start: # [A, B, C] del: B # [A, C] with the hash table we may dump [A, B], missing C completely even tho it existed both before and after the "del B". Add an xarray and use it to allocate ifindexes. This way we can iterate ifindexes in order, without the worry that we'll skip one. We may still generate a dump of a state which "never existed", for example for a set of values and sequence of ops: System state: start: # [A, B] add: C # [A, C, B] del: B # [A, C] we may generate a dump of [A], if C got an index between A and B. System has never been in such state. But I'm 90% sure that's perfectly fine, important part is that we can't _miss_ devices which exist before and after. User space which wants to mirror kernel's state subscribes to notifications and does periodic dumps so it will know that C exists from the notification about its creation or from the next dump (next dump is _guaranteed_ to include C, if it doesn't get removed). To avoid any perf regressions keep the hash table for now. Most net namespaces have very few devices and microbenchmarking 1M lookups on Skylake I get the following results (not counting loopback to number of devs): #devs | hash | xa | delta 2 | 18.3 | 20.1 | + 9.8% 16 | 18.3 | 20.1 | + 9.5% 64 | 18.3 | 26.3 | +43.8% 128 | 20.4 | 26.3 | +28.6% 256 | 20.0 | 26.4 | +32.1% 1024 | 26.6 | 26.7 | + 0.2% 8192 |541.3 | 33.5 | -93.8% No surprises since the hash table has 256 entries. The microbenchmark scans indexes in order, if the pattern is more random xa starts to win at 512 devices already. But that's a lot of devices, in practice. 
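For readers skimming the diff that follows, the reservation scheme boils down to the condensed sketch below. The demo_* names are purely illustrative stand-ins, not the actual net/core/dev.c symbols, and the sketch assumes only the xarray API used by the patch itself.

/* Condensed sketch of cyclic index reservation with an xarray; the
 * demo_* symbols are illustrative, not kernel interfaces.
 */
#include <linux/bug.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/xarray.h>

static DEFINE_XARRAY_ALLOC(demo_by_index);      /* XA_FLAGS_ALLOC */
static u32 demo_next_index = 1;                 /* mirrors net->ifindex */

/* Reserve an index; pass 0 to have one allocated cyclically. */
static int demo_index_reserve(u32 index)
{
        int err;

        if (!index)
                err = xa_alloc_cyclic(&demo_by_index, &index, NULL,
                                      xa_limit_31b, &demo_next_index,
                                      GFP_KERNEL);
        else
                err = xa_insert(&demo_by_index, index, NULL, GFP_KERNEL);
        return err < 0 ? err : index;
}

/* Publish the object once registration can no longer fail; the slot
 * was reserved above, so storing into it cannot fail either.
 */
static void demo_index_publish(u32 index, void *obj)
{
        WARN_ON(xa_store(&demo_by_index, index, obj, GFP_KERNEL));
}

/* Give back either an unused reservation or a published entry. */
static void demo_index_release(u32 index)
{
        xa_erase(&demo_by_index, index);
}

The dump-friendly ordering falls out of xa_for_each_start() walking indexes in increasing order, so a saved ifindex is always a safe resume point.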
Reviewed-by: Leon Romanovsky Link: https://lore.kernel.org/r/20230726185530.2247698-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/net/net_namespace.h | 4 ++- net/core/dev.c | 82 +++++++++++++++++++++++++++++---------------- 2 files changed, 57 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 78beaa765c73..9f6add96de2d 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -42,6 +42,7 @@ #include #include #include +#include struct user_namespace; struct proc_dir_entry; @@ -69,7 +70,7 @@ struct net { atomic_t dev_unreg_count; unsigned int dev_base_seq; /* protected by rtnl_mutex */ - int ifindex; + u32 ifindex; spinlock_t nsid_lock; atomic_t fnhe_genid; @@ -110,6 +111,7 @@ struct net { struct hlist_head *dev_name_head; struct hlist_head *dev_index_head; + struct xarray dev_by_index; struct raw_notifier_head netdev_chain; /* Note that @hash_mix can be read millions times per second, diff --git a/net/core/dev.c b/net/core/dev.c index e7ffcfa037f7..b58674774a57 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -388,6 +388,8 @@ static void list_netdevice(struct net_device *dev) hlist_add_head_rcu(&dev->index_hlist, dev_index_hash(net, dev->ifindex)); write_unlock(&dev_base_lock); + /* We reserved the ifindex, this can't fail */ + WARN_ON(xa_store(&net->dev_by_index, dev->ifindex, dev, GFP_KERNEL)); dev_base_seq_inc(net); } @@ -397,8 +399,12 @@ static void list_netdevice(struct net_device *dev) */ static void unlist_netdevice(struct net_device *dev, bool lock) { + struct net *net = dev_net(dev); + ASSERT_RTNL(); + xa_erase(&net->dev_by_index, dev->ifindex); + /* Unlink dev from the device chain */ if (lock) write_lock(&dev_base_lock); @@ -9565,23 +9571,35 @@ err_out: } /** - * dev_new_index - allocate an ifindex - * @net: the applicable net namespace + * dev_index_reserve() - allocate an ifindex in a namespace + * @net: the applicable net namespace + * @ifindex: requested ifindex, pass %0 to get one allocated + * + * Allocate a ifindex for a new device. Caller must either use the ifindex + * to store the device (via list_netdevice()) or call dev_index_release() + * to give the index up. * - * Returns a suitable unique value for a new device interface - * number. The caller must hold the rtnl semaphore or the - * dev_base_lock to be sure it remains unique. + * Return: a suitable unique value for a new device interface number or -errno. 
*/ -static int dev_new_index(struct net *net) +static int dev_index_reserve(struct net *net, u32 ifindex) { - int ifindex = net->ifindex; + int err; - for (;;) { - if (++ifindex <= 0) - ifindex = 1; - if (!__dev_get_by_index(net, ifindex)) - return net->ifindex = ifindex; - } + if (!ifindex) + err = xa_alloc_cyclic(&net->dev_by_index, &ifindex, NULL, + xa_limit_31b, &net->ifindex, GFP_KERNEL); + else + err = xa_insert(&net->dev_by_index, ifindex, NULL, GFP_KERNEL); + if (err < 0) + return err; + + return ifindex; +} + +static void dev_index_release(struct net *net, int ifindex) +{ + /* Expect only unused indexes, unlist_netdevice() removes the used */ + WARN_ON(xa_erase(&net->dev_by_index, ifindex)); } /* Delayed registration/unregisteration */ @@ -10051,11 +10069,10 @@ int register_netdevice(struct net_device *dev) goto err_uninit; } - ret = -EBUSY; - if (!dev->ifindex) - dev->ifindex = dev_new_index(net); - else if (__dev_get_by_index(net, dev->ifindex)) + ret = dev_index_reserve(net, dev->ifindex); + if (ret < 0) goto err_uninit; + dev->ifindex = ret; /* Transfer changeable features to wanted_features and enable * software offloads (GSO and GRO). @@ -10102,7 +10119,7 @@ int register_netdevice(struct net_device *dev) ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); ret = notifier_to_errno(ret); if (ret) - goto err_uninit; + goto err_ifindex_release; ret = netdev_register_kobject(dev); write_lock(&dev_base_lock); @@ -10158,6 +10175,8 @@ out: err_uninit_notify: call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev); +err_ifindex_release: + dev_index_release(net, dev->ifindex); err_uninit: if (dev->netdev_ops->ndo_uninit) dev->netdev_ops->ndo_uninit(dev); @@ -11035,9 +11054,19 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, } /* Check that new_ifindex isn't used yet. */ - err = -EBUSY; - if (new_ifindex && __dev_get_by_index(net, new_ifindex)) - goto out; + if (new_ifindex) { + err = dev_index_reserve(net, new_ifindex); + if (err < 0) + goto out; + } else { + /* If there is an ifindex conflict assign a new one */ + err = dev_index_reserve(net, dev->ifindex); + if (err == -EBUSY) + err = dev_index_reserve(net, 0); + if (err < 0) + goto out; + new_ifindex = err; + } /* * And now a mini version of register_netdevice unregister_netdevice. @@ -11065,13 +11094,6 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, rcu_barrier(); new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL); - /* If there is an ifindex conflict assign a new one */ - if (!new_ifindex) { - if (__dev_get_by_index(net, dev->ifindex)) - new_ifindex = dev_new_index(net); - else - new_ifindex = dev->ifindex; - } rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid, new_ifindex); @@ -11249,6 +11271,9 @@ static int __net_init netdev_init(struct net *net) if (net->dev_index_head == NULL) goto err_idx; + net->ifindex = 1; + xa_init_flags(&net->dev_by_index, XA_FLAGS_ALLOC); + RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain); return 0; @@ -11346,6 +11371,7 @@ static void __net_exit netdev_exit(struct net *net) { kfree(net->dev_name_head); kfree(net->dev_index_head); + xa_destroy(&net->dev_by_index); if (net != &init_net) WARN_ON_ONCE(!list_empty(&net->dev_base_head)); } -- cgit v1.2.3 From 84e00d9bd4e472bd9b145ed40dbd132dd7a15462 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 26 Jul 2023 11:55:30 -0700 Subject: net: convert some netlink netdev iterators to depend on the xarray Reap the benefits of easier iteration thanks to the xarray. 
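A converted dump handler collapses to roughly the shape below; demo_dumpit() and demo_fill() are hypothetical stand-ins for a real family's handler and fill helper, and the sketch only assumes the for_each_netdev_dump() macro added by this patch.

/* Illustrative shape of a netlink dump handler after the conversion. */
#include <linux/netdevice.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <net/sock.h>

int demo_fill(struct sk_buff *skb, struct net_device *netdev,
              struct netlink_callback *cb);    /* hypothetical fill helper */

static int demo_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct net *net = sock_net(skb->sk);
        unsigned long ifindex = cb->args[0];    /* resume point from last chunk */
        struct net_device *netdev;
        int err = 0;

        rtnl_lock();
        for_each_netdev_dump(net, netdev, ifindex) {
                err = demo_fill(skb, netdev, cb);
                if (err < 0)
                        break;
        }
        rtnl_unlock();

        cb->args[0] = ifindex;                  /* next call resumes from here */
        if (err != -EMSGSIZE)
                return err;
        /* -EMSGSIZE means the skb filled up; report progress and continue */
        return skb->len;
}

No hash buckets, no dev_base_seq consistency dance: the saved ifindex alone is enough to resume.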
Convert just the genetlink ones, those are easier to test. Reviewed-by: Leon Romanovsky Link: https://lore.kernel.org/r/20230726185530.2247698-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 3 ++ net/core/netdev-genl.c | 37 ++++++------------------ net/ethtool/netlink.c | 65 ++++++++++++------------------------------- net/ethtool/tunnels.c | 71 +++++++++++++++++------------------------------ 4 files changed, 54 insertions(+), 122 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 32a4cdf37dd4..84c36a7f873f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3016,6 +3016,9 @@ extern rwlock_t dev_base_lock; /* Device list lock */ if (netdev_master_upper_dev_get_rcu(slave) == (bond)) #define net_device_entry(lh) list_entry(lh, struct net_device, dev_list) +#define for_each_netdev_dump(net, d, ifindex) \ + xa_for_each_start(&(net)->dev_by_index, (ifindex), (d), (ifindex)) + static inline struct net_device *next_net_device(struct net_device *dev) { struct list_head *lh; diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 65ef4867fc49..797c813c7c77 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -101,43 +101,22 @@ int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct net_device *netdev; - int idx = 0, s_idx; - int h, s_h; - int err; - - s_h = cb->args[0]; - s_idx = cb->args[1]; + int err = 0; rtnl_lock(); - - for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { - struct hlist_head *head; - - idx = 0; - head = &net->dev_index_head[h]; - hlist_for_each_entry(netdev, head, index_hlist) { - if (idx < s_idx) - goto cont; - err = netdev_nl_dev_fill(netdev, skb, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, 0, - NETDEV_CMD_DEV_GET); - if (err < 0) - break; -cont: - idx++; - } + for_each_netdev_dump(net, netdev, cb->args[0]) { + err = netdev_nl_dev_fill(netdev, skb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, 0, + NETDEV_CMD_DEV_GET); + if (err < 0) + break; } - rtnl_unlock(); if (err != -EMSGSIZE) return err; - cb->args[1] = idx; - cb->args[0] = h; - cb->seq = net->dev_base_seq; - return skb->len; } diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 39a459b0111b..ae344f1b0bbd 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -252,8 +252,7 @@ int ethnl_multicast(struct sk_buff *skb, struct net_device *dev) * @ops: request ops of currently processed message type * @req_info: parsed request header of processed request * @reply_data: data needed to compose the reply - * @pos_hash: saved iteration position - hashbucket - * @pos_idx: saved iteration position - index + * @pos_ifindex: saved iteration position - ifindex * * These parameters are kept in struct netlink_callback as context preserved * between iterations. 
They are initialized by ethnl_default_start() and used @@ -263,8 +262,7 @@ struct ethnl_dump_ctx { const struct ethnl_request_ops *ops; struct ethnl_req_info *req_info; struct ethnl_reply_data *reply_data; - int pos_hash; - int pos_idx; + unsigned long pos_ifindex; }; static const struct ethnl_request_ops * @@ -490,55 +488,27 @@ static int ethnl_default_dumpit(struct sk_buff *skb, { struct ethnl_dump_ctx *ctx = ethnl_dump_context(cb); struct net *net = sock_net(skb->sk); - int s_idx = ctx->pos_idx; - int h, idx = 0; + struct net_device *dev; int ret = 0; rtnl_lock(); - for (h = ctx->pos_hash; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { - struct hlist_head *head; - struct net_device *dev; - unsigned int seq; - - head = &net->dev_index_head[h]; - -restart_chain: - seq = net->dev_base_seq; - cb->seq = seq; - idx = 0; - hlist_for_each_entry(dev, head, index_hlist) { - if (idx < s_idx) - goto cont; - dev_hold(dev); - rtnl_unlock(); - - ret = ethnl_default_dump_one(skb, dev, ctx, cb); - dev_put(dev); - if (ret < 0) { - if (ret == -EOPNOTSUPP) - goto lock_and_cont; - if (likely(skb->len)) - ret = skb->len; - goto out; - } -lock_and_cont: - rtnl_lock(); - if (net->dev_base_seq != seq) { - s_idx = idx + 1; - goto restart_chain; - } -cont: - idx++; - } + for_each_netdev_dump(net, dev, ctx->pos_ifindex) { + dev_hold(dev); + rtnl_unlock(); + + ret = ethnl_default_dump_one(skb, dev, ctx, cb); + + rtnl_lock(); + dev_put(dev); + if (ret < 0 && ret != -EOPNOTSUPP) { + if (likely(skb->len)) + ret = skb->len; + break; + } } rtnl_unlock(); -out: - ctx->pos_hash = h; - ctx->pos_idx = idx; - nl_dump_check_consistent(cb, nlmsg_hdr(skb)); - return ret; } @@ -584,8 +554,7 @@ static int ethnl_default_start(struct netlink_callback *cb) ctx->ops = ops; ctx->req_info = req_info; ctx->reply_data = reply_data; - ctx->pos_hash = 0; - ctx->pos_idx = 0; + ctx->pos_ifindex = 0; return 0; diff --git a/net/ethtool/tunnels.c b/net/ethtool/tunnels.c index 67fb414ca859..05f752557b5e 100644 --- a/net/ethtool/tunnels.c +++ b/net/ethtool/tunnels.c @@ -212,8 +212,7 @@ err_unlock_rtnl: struct ethnl_tunnel_info_dump_ctx { struct ethnl_req_info req_info; - int pos_hash; - int pos_idx; + unsigned long ifindex; }; int ethnl_tunnel_info_start(struct netlink_callback *cb) @@ -243,57 +242,39 @@ int ethnl_tunnel_info_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct ethnl_tunnel_info_dump_ctx *ctx = (void *)cb->ctx; struct net *net = sock_net(skb->sk); - int s_idx = ctx->pos_idx; - int h, idx = 0; + struct net_device *dev; int ret = 0; void *ehdr; rtnl_lock(); - cb->seq = net->dev_base_seq; - for (h = ctx->pos_hash; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { - struct hlist_head *head; - struct net_device *dev; - - head = &net->dev_index_head[h]; - idx = 0; - hlist_for_each_entry(dev, head, index_hlist) { - if (idx < s_idx) - goto cont; - - ehdr = ethnl_dump_put(skb, cb, - ETHTOOL_MSG_TUNNEL_INFO_GET_REPLY); - if (!ehdr) { - ret = -EMSGSIZE; - goto out; - } - - ret = ethnl_fill_reply_header(skb, dev, ETHTOOL_A_TUNNEL_INFO_HEADER); - if (ret < 0) { - genlmsg_cancel(skb, ehdr); - goto out; - } - - ctx->req_info.dev = dev; - ret = ethnl_tunnel_info_fill_reply(&ctx->req_info, skb); - ctx->req_info.dev = NULL; - if (ret < 0) { - genlmsg_cancel(skb, ehdr); - if (ret == -EOPNOTSUPP) - goto cont; - goto out; - } - genlmsg_end(skb, ehdr); -cont: - idx++; + for_each_netdev_dump(net, dev, ctx->ifindex) { + ehdr = ethnl_dump_put(skb, cb, + ETHTOOL_MSG_TUNNEL_INFO_GET_REPLY); + if (!ehdr) { + ret = -EMSGSIZE; + break; } + + ret = 
ethnl_fill_reply_header(skb, dev, + ETHTOOL_A_TUNNEL_INFO_HEADER); + if (ret < 0) { + genlmsg_cancel(skb, ehdr); + break; + } + + ctx->req_info.dev = dev; + ret = ethnl_tunnel_info_fill_reply(&ctx->req_info, skb); + ctx->req_info.dev = NULL; + if (ret < 0) { + genlmsg_cancel(skb, ehdr); + if (ret == -EOPNOTSUPP) + continue; + break; + } + genlmsg_end(skb, ehdr); } -out: rtnl_unlock(); - ctx->pos_hash = h; - ctx->pos_idx = idx; - nl_dump_check_consistent(cb, nlmsg_hdr(skb)); - if (ret == -EMSGSIZE && skb->len) return skb->len; return ret; -- cgit v1.2.3 From 5027d54a9c30bc7ec808360378e2b4753f053f25 Mon Sep 17 00:00:00 2001 From: Patrick Rohr Date: Wed, 26 Jul 2023 16:07:01 -0700 Subject: net: change accept_ra_min_rtr_lft to affect all RA lifetimes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit accept_ra_min_rtr_lft only considered the lifetime of the default route and discarded entire RAs accordingly. This change renames accept_ra_min_rtr_lft to accept_ra_min_lft, and applies the value to individual RA sections; in particular, router lifetime, PIO preferred lifetime, and RIO lifetime. If any of those lifetimes are lower than the configured value, the specific RA section is ignored. In order for the sysctl to be useful to Android, it should really apply to all lifetimes in the RA, since that is what determines the minimum frequency at which RAs must be processed by the kernel. Android uses hardware offloads to drop RAs for a fraction of the minimum of all lifetimes present in the RA (some networks have very frequent RAs (5s) with high lifetimes (2h)). Despite this, we have encountered networks that set the router lifetime to 30s which results in very frequent CPU wakeups. Instead of disabling IPv6 (and dropping IPv6 ethertype in the WiFi firmware) entirely on such networks, it seems better to ignore the misconfigured routers while still processing RAs from other IPv6 routers on the same network (i.e. to support IoT applications). The previous implementation dropped the entire RA based on router lifetime. This turned out to be hard to expand to the other lifetimes present in the RA in a consistent manner; dropping the entire RA based on RIO/PIO lifetimes would essentially require parsing the whole thing twice. Fixes: 1671bcfd76fd ("net: add sysctl accept_ra_min_rtr_lft") Cc: Lorenzo Colitti Signed-off-by: Patrick Rohr Reviewed-by: Maciej Å»enczykowski Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20230726230701.919212-1-prohr@google.com Signed-off-by: Jakub Kicinski --- Documentation/networking/ip-sysctl.rst | 8 ++++---- include/linux/ipv6.h | 2 +- include/uapi/linux/ipv6.h | 2 +- net/ipv6/addrconf.c | 13 ++++++++----- net/ipv6/ndisc.c | 27 +++++++++++---------------- 5 files changed, 25 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 37603ad6126b..a66054d0763a 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -2288,11 +2288,11 @@ accept_ra_min_hop_limit - INTEGER Default: 1 -accept_ra_min_rtr_lft - INTEGER - Minimum acceptable router lifetime in Router Advertisement. +accept_ra_min_lft - INTEGER + Minimum acceptable lifetime value in Router Advertisement. - RAs with a router lifetime less than this value shall be - ignored. RAs with a router lifetime of 0 are unaffected. + RA sections with a lifetime less than this value shall be + ignored. Zero lifetimes stay unaffected. 
Default: 0 diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 0295b47c10a3..5883551b1ee8 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -33,7 +33,7 @@ struct ipv6_devconf { __s32 accept_ra_defrtr; __u32 ra_defrtr_metric; __s32 accept_ra_min_hop_limit; - __s32 accept_ra_min_rtr_lft; + __s32 accept_ra_min_lft; __s32 accept_ra_pinfo; __s32 ignore_routes_with_linkdown; #ifdef CONFIG_IPV6_ROUTER_PREF diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index 8b6bcbf6ed4a..cf592d7b630f 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -198,7 +198,7 @@ enum { DEVCONF_IOAM6_ID_WIDE, DEVCONF_NDISC_EVICT_NOCARRIER, DEVCONF_ACCEPT_UNTRACKED_NA, - DEVCONF_ACCEPT_RA_MIN_RTR_LFT, + DEVCONF_ACCEPT_RA_MIN_LFT, DEVCONF_MAX }; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 5daf6a4986f8..5184bd0ceb12 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -202,7 +202,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .ra_defrtr_metric = IP6_RT_PRIO_USER, .accept_ra_from_local = 0, .accept_ra_min_hop_limit= 1, - .accept_ra_min_rtr_lft = 0, + .accept_ra_min_lft = 0, .accept_ra_pinfo = 1, #ifdef CONFIG_IPV6_ROUTER_PREF .accept_ra_rtr_pref = 1, @@ -263,7 +263,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .ra_defrtr_metric = IP6_RT_PRIO_USER, .accept_ra_from_local = 0, .accept_ra_min_hop_limit= 1, - .accept_ra_min_rtr_lft = 0, + .accept_ra_min_lft = 0, .accept_ra_pinfo = 1, #ifdef CONFIG_IPV6_ROUTER_PREF .accept_ra_rtr_pref = 1, @@ -2741,6 +2741,9 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) return; } + if (valid_lft != 0 && valid_lft < in6_dev->cnf.accept_ra_min_lft) + return; + /* * Two things going on here: * 1) Add routes for on-link prefixes @@ -5635,7 +5638,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_IOAM6_ID_WIDE] = cnf->ioam6_id_wide; array[DEVCONF_NDISC_EVICT_NOCARRIER] = cnf->ndisc_evict_nocarrier; array[DEVCONF_ACCEPT_UNTRACKED_NA] = cnf->accept_untracked_na; - array[DEVCONF_ACCEPT_RA_MIN_RTR_LFT] = cnf->accept_ra_min_rtr_lft; + array[DEVCONF_ACCEPT_RA_MIN_LFT] = cnf->accept_ra_min_lft; } static inline size_t inet6_ifla6_size(void) @@ -6830,8 +6833,8 @@ static const struct ctl_table addrconf_sysctl[] = { .proc_handler = proc_dointvec, }, { - .procname = "accept_ra_min_rtr_lft", - .data = &ipv6_devconf.accept_ra_min_rtr_lft, + .procname = "accept_ra_min_lft", + .data = &ipv6_devconf.accept_ra_min_lft, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 95750fbf6782..c394799be38f 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1276,8 +1276,6 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb) if (!ndisc_parse_options(skb->dev, opt, optlen, &ndopts)) return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS; - lifetime = ntohs(ra_msg->icmph.icmp6_rt_lifetime); - if (!ipv6_accept_ra(in6_dev)) { ND_PRINTK(2, info, "RA: %s, did not accept ra for dev: %s\n", @@ -1285,13 +1283,6 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb) goto skip_linkparms; } - if (lifetime != 0 && lifetime < in6_dev->cnf.accept_ra_min_rtr_lft) { - ND_PRINTK(2, info, - "RA: router lifetime (%ds) is too short: %s\n", - lifetime, skb->dev->name); - goto skip_linkparms; - } - #ifdef CONFIG_IPV6_NDISC_NODETYPE /* skip link-specific parameters from interior routers */ if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT) { @@ -1332,6 
+1323,14 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb) goto skip_defrtr; } + lifetime = ntohs(ra_msg->icmph.icmp6_rt_lifetime); + if (lifetime != 0 && lifetime < in6_dev->cnf.accept_ra_min_lft) { + ND_PRINTK(2, info, + "RA: router lifetime (%ds) is too short: %s\n", + lifetime, skb->dev->name); + goto skip_defrtr; + } + /* Do not accept RA with source-addr found on local machine unless * accept_ra_from_local is set to true. */ @@ -1495,13 +1494,6 @@ skip_linkparms: goto out; } - if (lifetime != 0 && lifetime < in6_dev->cnf.accept_ra_min_rtr_lft) { - ND_PRINTK(2, info, - "RA: router lifetime (%ds) is too short: %s\n", - lifetime, skb->dev->name); - goto out; - } - #ifdef CONFIG_IPV6_ROUTE_INFO if (!in6_dev->cnf.accept_ra_from_local && ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr, @@ -1526,6 +1518,9 @@ skip_linkparms: if (ri->prefix_len == 0 && !in6_dev->cnf.accept_ra_defrtr) continue; + if (ri->lifetime != 0 && + ntohl(ri->lifetime) < in6_dev->cnf.accept_ra_min_lft) + continue; if (ri->prefix_len < in6_dev->cnf.accept_ra_rt_info_min_plen) continue; if (ri->prefix_len > in6_dev->cnf.accept_ra_rt_info_max_plen) -- cgit v1.2.3 From 6a7eccef47b205ae66371a26d36dfb2529835075 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 27 Jul 2023 13:35:23 -0400 Subject: net/tls: Move TLS protocol elements to a separate header Kernel TLS consumers will need definitions of various parts of the TLS protocol, but often do not need the function declarations and other infrastructure provided in . Break out existing standardized protocol elements into a separate header, and make room for a few more elements in subsequent patches. Signed-off-by: Chuck Lever Link: https://lore.kernel.org/r/169047931374.5241.7713175865185969309.stgit@oracle-102.nfsv4bat.org Signed-off-by: Jakub Kicinski --- .../ethernet/chelsio/inline_crypto/chtls/chtls.h | 1 + include/net/tls.h | 4 ---- include/net/tls_prot.h | 26 ++++++++++++++++++++++ net/sunrpc/svcsock.c | 1 + net/sunrpc/xprtsock.c | 1 + net/tls/tls.h | 1 + 6 files changed, 30 insertions(+), 4 deletions(-) create mode 100644 include/net/tls_prot.h (limited to 'net') diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls.h b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls.h index 68562a82d036..62f62bff74a5 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls.h +++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls.h @@ -22,6 +22,7 @@ #include #include #include +#include #include #include "t4fw_api.h" diff --git a/include/net/tls.h b/include/net/tls.h index 5e71dd3df8ca..06fca9160346 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -69,10 +69,6 @@ extern const struct tls_cipher_size_desc tls_cipher_size_desc[]; #define TLS_CRYPTO_INFO_READY(info) ((info)->cipher_type) -#define TLS_RECORD_TYPE_ALERT 0x15 -#define TLS_RECORD_TYPE_HANDSHAKE 0x16 -#define TLS_RECORD_TYPE_DATA 0x17 - #define TLS_AAD_SPACE_SIZE 13 #define MAX_IV_SIZE 16 diff --git a/include/net/tls_prot.h b/include/net/tls_prot.h new file mode 100644 index 000000000000..47d6cfd1619e --- /dev/null +++ b/include/net/tls_prot.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ +/* + * Copyright (c) 2023, Oracle and/or its affiliates. 
+ * + * TLS Protocol definitions + * + * From https://www.iana.org/assignments/tls-parameters/tls-parameters.xhtml + */ + +#ifndef _TLS_PROT_H +#define _TLS_PROT_H + +/* + * TLS Record protocol: ContentType + */ +enum { + TLS_RECORD_TYPE_CHANGE_CIPHER_SPEC = 20, + TLS_RECORD_TYPE_ALERT = 21, + TLS_RECORD_TYPE_HANDSHAKE = 22, + TLS_RECORD_TYPE_DATA = 23, + TLS_RECORD_TYPE_HEARTBEAT = 24, + TLS_RECORD_TYPE_TLS12_CID = 25, + TLS_RECORD_TYPE_ACK = 26, +}; + +#endif /* _TLS_PROT_H */ diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index e43f26382411..449df8cabfcb 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include #include diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 9f010369100a..9457ebf22fb1 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -48,6 +48,7 @@ #include #include #include +#include #include #include diff --git a/net/tls/tls.h b/net/tls/tls.h index 86cef1c68e03..26a0358f6f49 100644 --- a/net/tls/tls.h +++ b/net/tls/tls.h @@ -39,6 +39,7 @@ #include #include #include +#include #define TLS_PAGE_ORDER (min_t(unsigned int, PAGE_ALLOC_COSTLY_ORDER, \ TLS_MAX_PAYLOAD_SIZE >> PAGE_SHIFT)) -- cgit v1.2.3 From 35b1b538d422fd765d88fbdaaa6e06ee466d9f93 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 27 Jul 2023 13:36:17 -0400 Subject: net/handshake: Add API for sending TLS Closure alerts This helper sends an alert only if a TLS session was established. Signed-off-by: Chuck Lever Link: https://lore.kernel.org/r/169047936730.5241.618595693821012638.stgit@oracle-102.nfsv4bat.org Signed-off-by: Jakub Kicinski --- include/net/handshake.h | 1 + net/handshake/Makefile | 2 +- net/handshake/alert.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++ net/handshake/handshake.h | 6 +++++ net/handshake/tlshd.c | 23 ++++++++++++++++++ 5 files changed, 92 insertions(+), 1 deletion(-) create mode 100644 net/handshake/alert.c (limited to 'net') diff --git a/include/net/handshake.h b/include/net/handshake.h index 2e26e436e85f..bb88dfa6e3c9 100644 --- a/include/net/handshake.h +++ b/include/net/handshake.h @@ -40,5 +40,6 @@ int tls_server_hello_x509(const struct tls_handshake_args *args, gfp_t flags); int tls_server_hello_psk(const struct tls_handshake_args *args, gfp_t flags); bool tls_handshake_cancel(struct sock *sk); +void tls_handshake_close(struct socket *sock); #endif /* _NET_HANDSHAKE_H */ diff --git a/net/handshake/Makefile b/net/handshake/Makefile index 247d73c6ff6e..ef4d9a2112bd 100644 --- a/net/handshake/Makefile +++ b/net/handshake/Makefile @@ -8,6 +8,6 @@ # obj-y += handshake.o -handshake-y := genl.o netlink.o request.o tlshd.o trace.o +handshake-y := alert.o genl.o netlink.o request.o tlshd.o trace.o obj-$(CONFIG_NET_HANDSHAKE_KUNIT_TEST) += handshake-test.o diff --git a/net/handshake/alert.c b/net/handshake/alert.c new file mode 100644 index 000000000000..2f1a16868ff6 --- /dev/null +++ b/net/handshake/alert.c @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Handle the TLS Alert protocol + * + * Author: Chuck Lever + * + * Copyright (c) 2023, Oracle and/or its affiliates. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "handshake.h" + +/** + * tls_alert_send - send a TLS Alert on a kTLS socket + * @sock: open kTLS socket to send on + * @level: TLS Alert level + * @description: TLS Alert description + * + * Returns zero on success or a negative errno. 
+ */ +int tls_alert_send(struct socket *sock, u8 level, u8 description) +{ + u8 record_type = TLS_RECORD_TYPE_ALERT; + u8 buf[CMSG_SPACE(sizeof(record_type))]; + struct msghdr msg = { 0 }; + struct cmsghdr *cmsg; + struct kvec iov; + u8 alert[2]; + int ret; + + alert[0] = level; + alert[1] = description; + iov.iov_base = alert; + iov.iov_len = sizeof(alert); + + memset(buf, 0, sizeof(buf)); + msg.msg_control = buf; + msg.msg_controllen = sizeof(buf); + msg.msg_flags = MSG_DONTWAIT; + + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_TLS; + cmsg->cmsg_type = TLS_SET_RECORD_TYPE; + cmsg->cmsg_len = CMSG_LEN(sizeof(record_type)); + memcpy(CMSG_DATA(cmsg), &record_type, sizeof(record_type)); + + iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &iov, 1, iov.iov_len); + ret = sock_sendmsg(sock, &msg); + return ret < 0 ? ret : 0; +} diff --git a/net/handshake/handshake.h b/net/handshake/handshake.h index 4dac965c99df..a48163765a7a 100644 --- a/net/handshake/handshake.h +++ b/net/handshake/handshake.h @@ -41,8 +41,11 @@ struct handshake_req { enum hr_flags_bits { HANDSHAKE_F_REQ_COMPLETED, + HANDSHAKE_F_REQ_SESSION, }; +struct genl_info; + /* Invariants for all handshake requests for one transport layer * security protocol */ @@ -63,6 +66,9 @@ enum hp_flags_bits { HANDSHAKE_F_PROTO_NOTIFY, }; +/* alert.c */ +int tls_alert_send(struct socket *sock, u8 level, u8 description); + /* netlink.c */ int handshake_genl_notify(struct net *net, const struct handshake_proto *proto, gfp_t flags); diff --git a/net/handshake/tlshd.c b/net/handshake/tlshd.c index b735f5cced2f..bbfb4095ddd6 100644 --- a/net/handshake/tlshd.c +++ b/net/handshake/tlshd.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -100,6 +101,9 @@ static void tls_handshake_done(struct handshake_req *req, if (info) tls_handshake_remote_peerids(treq, info); + if (!status) + set_bit(HANDSHAKE_F_REQ_SESSION, &req->hr_flags); + treq->th_consumer_done(treq->th_consumer_data, -status, treq->th_peerid[0]); } @@ -424,3 +428,22 @@ bool tls_handshake_cancel(struct sock *sk) return handshake_req_cancel(sk); } EXPORT_SYMBOL(tls_handshake_cancel); + +/** + * tls_handshake_close - send a Closure alert + * @sock: an open socket + * + */ +void tls_handshake_close(struct socket *sock) +{ + struct handshake_req *req; + + req = handshake_req_hash_lookup(sock->sk); + if (!req) + return; + if (!test_and_clear_bit(HANDSHAKE_F_REQ_SESSION, &req->hr_flags)) + return; + tls_alert_send(sock, TLS_ALERT_LEVEL_WARNING, + TLS_ALERT_DESC_CLOSE_NOTIFY); +} +EXPORT_SYMBOL(tls_handshake_close); -- cgit v1.2.3 From 5dd5ad682cfe9550a37634a8f55780ebd74edbe8 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 27 Jul 2023 13:36:44 -0400 Subject: SUNRPC: Send TLS Closure alerts before closing a TCP socket Before closing a TCP connection, the TLS protocol wants peers to send session close Alert notifications. Add those in both the RPC client and server. 
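The consumer-side call is intentionally tiny; a minimal sketch of the pattern, with demo_transport_shutdown() as a purely illustrative name rather than sunrpc code, looks like this.

/* Emit close_notify (if a TLS session was set up) before shutting the
 * underlying TCP socket down. demo_* is illustrative only.
 */
#include <linux/net.h>
#include <net/handshake.h>

static void demo_transport_shutdown(struct socket *sock)
{
        /* No-op unless a handshake completed earlier on this socket. */
        tls_handshake_close(sock);

        kernel_sock_shutdown(sock, SHUT_RDWR);
}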
Signed-off-by: Chuck Lever Link: https://lore.kernel.org/r/169047939404.5241.14392506226409865832.stgit@oracle-102.nfsv4bat.org Signed-off-by: Jakub Kicinski --- net/sunrpc/svcsock.c | 2 ++ net/sunrpc/xprtsock.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'net') diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 449df8cabfcb..cca6ee716020 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1622,6 +1622,8 @@ static void svc_tcp_sock_detach(struct svc_xprt *xprt) { struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); + tls_handshake_close(svsk->sk_sock); + svc_sock_detach(xprt); if (!test_bit(XPT_LISTENER, &xprt->xpt_flags)) { diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 9457ebf22fb1..5096aa62de5c 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1293,6 +1293,8 @@ static void xs_close(struct rpc_xprt *xprt) dprintk("RPC: xs_close xprt %p\n", xprt); + if (transport->sock) + tls_handshake_close(transport->sock); xs_reset_transport(transport); xprt->reestablish_timeout = 0; } -- cgit v1.2.3 From 39d0e38dcced8d4da92cd11f3ff618bacc42d8a9 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 27 Jul 2023 13:37:10 -0400 Subject: net/handshake: Add helpers for parsing incoming TLS Alerts Kernel TLS consumers can replace common TLS Alert parsing code with these helpers. Signed-off-by: Chuck Lever Link: https://lore.kernel.org/r/169047942074.5241.13791647439480672048.stgit@oracle-102.nfsv4bat.org Signed-off-by: Jakub Kicinski --- include/net/handshake.h | 4 ++++ net/handshake/alert.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+) (limited to 'net') diff --git a/include/net/handshake.h b/include/net/handshake.h index bb88dfa6e3c9..8ebd4f9ed26e 100644 --- a/include/net/handshake.h +++ b/include/net/handshake.h @@ -42,4 +42,8 @@ int tls_server_hello_psk(const struct tls_handshake_args *args, gfp_t flags); bool tls_handshake_cancel(struct sock *sk); void tls_handshake_close(struct socket *sock); +u8 tls_get_record_type(const struct sock *sk, const struct cmsghdr *msg); +void tls_alert_recv(const struct sock *sk, const struct msghdr *msg, + u8 *level, u8 *description); + #endif /* _NET_HANDSHAKE_H */ diff --git a/net/handshake/alert.c b/net/handshake/alert.c index 2f1a16868ff6..286bf94d93a3 100644 --- a/net/handshake/alert.c +++ b/net/handshake/alert.c @@ -59,3 +59,45 @@ int tls_alert_send(struct socket *sock, u8 level, u8 description) ret = sock_sendmsg(sock, &msg); return ret < 0 ? ret : 0; } + +/** + * tls_get_record_type - Look for TLS RECORD_TYPE information + * @sk: socket (for IP address information) + * @cmsg: incoming message to be parsed + * + * Returns zero or a TLS_RECORD_TYPE value. 
+ */ +u8 tls_get_record_type(const struct sock *sk, const struct cmsghdr *cmsg) +{ + u8 record_type; + + if (cmsg->cmsg_level != SOL_TLS) + return 0; + if (cmsg->cmsg_type != TLS_GET_RECORD_TYPE) + return 0; + + record_type = *((u8 *)CMSG_DATA(cmsg)); + return record_type; +} +EXPORT_SYMBOL(tls_get_record_type); + +/** + * tls_alert_recv - Parse TLS Alert messages + * @sk: socket (for IP address information) + * @msg: incoming message to be parsed + * @level: OUT - TLS AlertLevel value + * @description: OUT - TLS AlertDescription value + * + */ +void tls_alert_recv(const struct sock *sk, const struct msghdr *msg, + u8 *level, u8 *description) +{ + const struct kvec *iov; + u8 *data; + + iov = msg->msg_iter.kvec; + data = iov->iov_base; + *level = data[0]; + *description = data[1]; +} +EXPORT_SYMBOL(tls_alert_recv); -- cgit v1.2.3 From 39067dda1d865d7fac1f56c18479e67b0917bbe4 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 27 Jul 2023 13:37:37 -0400 Subject: SUNRPC: Use new helpers to handle TLS Alerts Use the helpers to parse the level and description fields in incoming alerts. "Warning" alerts are discarded, and "fatal" alerts mean the session is no longer valid. Signed-off-by: Chuck Lever Link: https://lore.kernel.org/r/169047944747.5241.1974889594004407123.stgit@oracle-102.nfsv4bat.org Signed-off-by: Jakub Kicinski --- net/sunrpc/svcsock.c | 47 +++++++++++++++++++++++++---------------------- net/sunrpc/xprtsock.c | 42 +++++++++++++++++++++++------------------- 2 files changed, 48 insertions(+), 41 deletions(-) (limited to 'net') diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index cca6ee716020..2ed29e40c6a9 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -43,7 +43,6 @@ #include #include #include -#include #include #include #include @@ -227,27 +226,30 @@ static int svc_one_sock_name(struct svc_sock *svsk, char *buf, int remaining) } static int -svc_tcp_sock_process_cmsg(struct svc_sock *svsk, struct msghdr *msg, +svc_tcp_sock_process_cmsg(struct socket *sock, struct msghdr *msg, struct cmsghdr *cmsg, int ret) { - if (cmsg->cmsg_level == SOL_TLS && - cmsg->cmsg_type == TLS_GET_RECORD_TYPE) { - u8 content_type = *((u8 *)CMSG_DATA(cmsg)); - - switch (content_type) { - case TLS_RECORD_TYPE_DATA: - /* TLS sets EOR at the end of each application data - * record, even though there might be more frames - * waiting to be decrypted. - */ - msg->msg_flags &= ~MSG_EOR; - break; - case TLS_RECORD_TYPE_ALERT: - ret = -ENOTCONN; - break; - default: - ret = -EAGAIN; - } + u8 content_type = tls_get_record_type(sock->sk, cmsg); + u8 level, description; + + switch (content_type) { + case 0: + break; + case TLS_RECORD_TYPE_DATA: + /* TLS sets EOR at the end of each application data + * record, even though there might be more frames + * waiting to be decrypted. + */ + msg->msg_flags &= ~MSG_EOR; + break; + case TLS_RECORD_TYPE_ALERT: + tls_alert_recv(sock->sk, msg, &level, &description); + ret = (level == TLS_ALERT_LEVEL_FATAL) ? 
+ -ENOTCONN : -EAGAIN; + break; + default: + /* discard this record type */ + ret = -EAGAIN; } return ret; } @@ -259,13 +261,14 @@ svc_tcp_sock_recv_cmsg(struct svc_sock *svsk, struct msghdr *msg) struct cmsghdr cmsg; u8 buf[CMSG_SPACE(sizeof(u8))]; } u; + struct socket *sock = svsk->sk_sock; int ret; msg->msg_control = &u; msg->msg_controllen = sizeof(u); - ret = sock_recvmsg(svsk->sk_sock, msg, MSG_DONTWAIT); + ret = sock_recvmsg(sock, msg, MSG_DONTWAIT); if (unlikely(msg->msg_controllen != sizeof(u))) - ret = svc_tcp_sock_process_cmsg(svsk, msg, &u.cmsg, ret); + ret = svc_tcp_sock_process_cmsg(sock, msg, &u.cmsg, ret); return ret; } diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 5096aa62de5c..268a2cc61acd 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -47,7 +47,6 @@ #include #include #include -#include #include #include @@ -361,24 +360,27 @@ static int xs_sock_process_cmsg(struct socket *sock, struct msghdr *msg, struct cmsghdr *cmsg, int ret) { - if (cmsg->cmsg_level == SOL_TLS && - cmsg->cmsg_type == TLS_GET_RECORD_TYPE) { - u8 content_type = *((u8 *)CMSG_DATA(cmsg)); - - switch (content_type) { - case TLS_RECORD_TYPE_DATA: - /* TLS sets EOR at the end of each application data - * record, even though there might be more frames - * waiting to be decrypted. - */ - msg->msg_flags &= ~MSG_EOR; - break; - case TLS_RECORD_TYPE_ALERT: - ret = -ENOTCONN; - break; - default: - ret = -EAGAIN; - } + u8 content_type = tls_get_record_type(sock->sk, cmsg); + u8 level, description; + + switch (content_type) { + case 0: + break; + case TLS_RECORD_TYPE_DATA: + /* TLS sets EOR at the end of each application data + * record, even though there might be more frames + * waiting to be decrypted. + */ + msg->msg_flags &= ~MSG_EOR; + break; + case TLS_RECORD_TYPE_ALERT: + tls_alert_recv(sock->sk, msg, &level, &description); + ret = (level == TLS_ALERT_LEVEL_FATAL) ? + -EACCES : -EAGAIN; + break; + default: + /* discard this record type */ + ret = -EAGAIN; } return ret; } @@ -778,6 +780,8 @@ static void xs_stream_data_receive(struct sock_xprt *transport) } if (ret == -ESHUTDOWN) kernel_sock_shutdown(transport->sock, SHUT_RDWR); + else if (ret == -EACCES) + xprt_wake_pending_tasks(&transport->xprt, -EACCES); else xs_poll_check_readable(transport); out: -- cgit v1.2.3 From b470985c76df6d53a9454670fb7551e1197f55e2 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 27 Jul 2023 13:38:04 -0400 Subject: net/handshake: Trace events for TLS Alert helpers Add observability for the new TLS Alert infrastructure. 
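For reference, the consumer-side pattern that the new helpers (and the trace points below) cover looks like the following sketch. This is an illustrative summary, not part of the applied patches: the function name example_process_tls_cmsg() is invented, while the helpers, record-type constants, and error conventions are taken from the svcsock.c hunk above (the helper declarations live in include/net/handshake.h).

static int example_process_tls_cmsg(struct socket *sock, struct msghdr *msg,
				    struct cmsghdr *cmsg, int ret)
{
	u8 level, description;

	switch (tls_get_record_type(sock->sk, cmsg)) {
	case 0:
		/* no TLS record type attached; leave ret untouched */
		break;
	case TLS_RECORD_TYPE_DATA:
		/* more frames may still be waiting to be decrypted */
		msg->msg_flags &= ~MSG_EOR;
		break;
	case TLS_RECORD_TYPE_ALERT:
		/* warnings are discarded, fatal alerts end the session */
		tls_alert_recv(sock->sk, msg, &level, &description);
		ret = (level == TLS_ALERT_LEVEL_FATAL) ? -ENOTCONN : -EAGAIN;
		break;
	default:
		/* discard unrecognized record types */
		ret = -EAGAIN;
	}
	return ret;
}

The xprtsock.c variant in the patch above maps a fatal alert to -EACCES instead, so xs_stream_data_receive() can wake pending RPC tasks with a distinct error rather than treating the condition as a transport disconnect.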
Reviewed-by: Hannes Reinecke Signed-off-by: Chuck Lever Link: https://lore.kernel.org/r/169047947409.5241.14548832149596892717.stgit@oracle-102.nfsv4bat.org Signed-off-by: Jakub Kicinski --- include/trace/events/handshake.h | 160 +++++++++++++++++++++++++++++++++++++++ net/handshake/alert.c | 7 ++ net/handshake/trace.c | 2 + 3 files changed, 169 insertions(+) (limited to 'net') diff --git a/include/trace/events/handshake.h b/include/trace/events/handshake.h index 8dadcab5f12a..bdd8a03cf5ba 100644 --- a/include/trace/events/handshake.h +++ b/include/trace/events/handshake.h @@ -6,7 +6,86 @@ #define _TRACE_HANDSHAKE_H #include +#include #include +#include + +#define TLS_RECORD_TYPE_LIST \ + record_type(CHANGE_CIPHER_SPEC) \ + record_type(ALERT) \ + record_type(HANDSHAKE) \ + record_type(DATA) \ + record_type(HEARTBEAT) \ + record_type(TLS12_CID) \ + record_type_end(ACK) + +#undef record_type +#undef record_type_end +#define record_type(x) TRACE_DEFINE_ENUM(TLS_RECORD_TYPE_##x); +#define record_type_end(x) TRACE_DEFINE_ENUM(TLS_RECORD_TYPE_##x); + +TLS_RECORD_TYPE_LIST + +#undef record_type +#undef record_type_end +#define record_type(x) { TLS_RECORD_TYPE_##x, #x }, +#define record_type_end(x) { TLS_RECORD_TYPE_##x, #x } + +#define show_tls_content_type(type) \ + __print_symbolic(type, TLS_RECORD_TYPE_LIST) + +TRACE_DEFINE_ENUM(TLS_ALERT_LEVEL_WARNING); +TRACE_DEFINE_ENUM(TLS_ALERT_LEVEL_FATAL); + +#define show_tls_alert_level(level) \ + __print_symbolic(level, \ + { TLS_ALERT_LEVEL_WARNING, "Warning" }, \ + { TLS_ALERT_LEVEL_FATAL, "Fatal" }) + +#define TLS_ALERT_DESCRIPTION_LIST \ + alert_description(CLOSE_NOTIFY) \ + alert_description(UNEXPECTED_MESSAGE) \ + alert_description(BAD_RECORD_MAC) \ + alert_description(RECORD_OVERFLOW) \ + alert_description(HANDSHAKE_FAILURE) \ + alert_description(BAD_CERTIFICATE) \ + alert_description(UNSUPPORTED_CERTIFICATE) \ + alert_description(CERTIFICATE_REVOKED) \ + alert_description(CERTIFICATE_EXPIRED) \ + alert_description(CERTIFICATE_UNKNOWN) \ + alert_description(ILLEGAL_PARAMETER) \ + alert_description(UNKNOWN_CA) \ + alert_description(ACCESS_DENIED) \ + alert_description(DECODE_ERROR) \ + alert_description(DECRYPT_ERROR) \ + alert_description(TOO_MANY_CIDS_REQUESTED) \ + alert_description(PROTOCOL_VERSION) \ + alert_description(INSUFFICIENT_SECURITY) \ + alert_description(INTERNAL_ERROR) \ + alert_description(INAPPROPRIATE_FALLBACK) \ + alert_description(USER_CANCELED) \ + alert_description(MISSING_EXTENSION) \ + alert_description(UNSUPPORTED_EXTENSION) \ + alert_description(UNRECOGNIZED_NAME) \ + alert_description(BAD_CERTIFICATE_STATUS_RESPONSE) \ + alert_description(UNKNOWN_PSK_IDENTITY) \ + alert_description(CERTIFICATE_REQUIRED) \ + alert_description_end(NO_APPLICATION_PROTOCOL) + +#undef alert_description +#undef alert_description_end +#define alert_description(x) TRACE_DEFINE_ENUM(TLS_ALERT_DESC_##x); +#define alert_description_end(x) TRACE_DEFINE_ENUM(TLS_ALERT_DESC_##x); + +TLS_ALERT_DESCRIPTION_LIST + +#undef alert_description +#undef alert_description_end +#define alert_description(x) { TLS_ALERT_DESC_##x, #x }, +#define alert_description_end(x) { TLS_ALERT_DESC_##x, #x } + +#define show_tls_alert_description(desc) \ + __print_symbolic(desc, TLS_ALERT_DESCRIPTION_LIST) DECLARE_EVENT_CLASS(handshake_event_class, TP_PROTO( @@ -106,6 +185,47 @@ DECLARE_EVENT_CLASS(handshake_error_class, ), \ TP_ARGS(net, req, sk, err)) +DECLARE_EVENT_CLASS(handshake_alert_class, + TP_PROTO( + const struct sock *sk, + unsigned char level, + unsigned char 
description + ), + TP_ARGS(sk, level, description), + TP_STRUCT__entry( + /* sockaddr_in6 is always bigger than sockaddr_in */ + __array(__u8, saddr, sizeof(struct sockaddr_in6)) + __array(__u8, daddr, sizeof(struct sockaddr_in6)) + __field(unsigned int, netns_ino) + __field(unsigned long, level) + __field(unsigned long, description) + ), + TP_fast_assign( + const struct inet_sock *inet = inet_sk(sk); + + memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); + memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); + TP_STORE_ADDR_PORTS(__entry, inet, sk); + + __entry->netns_ino = sock_net(sk)->ns.inum; + __entry->level = level; + __entry->description = description; + ), + TP_printk("src=%pISpc dest=%pISpc %s: %s", + __entry->saddr, __entry->daddr, + show_tls_alert_level(__entry->level), + show_tls_alert_description(__entry->description) + ) +); +#define DEFINE_HANDSHAKE_ALERT(name) \ + DEFINE_EVENT(handshake_alert_class, name, \ + TP_PROTO( \ + const struct sock *sk, \ + unsigned char level, \ + unsigned char description \ + ), \ + TP_ARGS(sk, level, description)) + /* * Request lifetime events @@ -154,6 +274,46 @@ DEFINE_HANDSHAKE_ERROR(handshake_cmd_accept_err); DEFINE_HANDSHAKE_FD_EVENT(handshake_cmd_done); DEFINE_HANDSHAKE_ERROR(handshake_cmd_done_err); +/* + * TLS Record events + */ + +TRACE_EVENT(tls_contenttype, + TP_PROTO( + const struct sock *sk, + unsigned char type + ), + TP_ARGS(sk, type), + TP_STRUCT__entry( + /* sockaddr_in6 is always bigger than sockaddr_in */ + __array(__u8, saddr, sizeof(struct sockaddr_in6)) + __array(__u8, daddr, sizeof(struct sockaddr_in6)) + __field(unsigned int, netns_ino) + __field(unsigned long, type) + ), + TP_fast_assign( + const struct inet_sock *inet = inet_sk(sk); + + memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); + memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); + TP_STORE_ADDR_PORTS(__entry, inet, sk); + + __entry->netns_ino = sock_net(sk)->ns.inum; + __entry->type = type; + ), + TP_printk("src=%pISpc dest=%pISpc %s", + __entry->saddr, __entry->daddr, + show_tls_content_type(__entry->type) + ) +); + +/* + * TLS Alert events + */ + +DEFINE_HANDSHAKE_ALERT(tls_alert_send); +DEFINE_HANDSHAKE_ALERT(tls_alert_recv); + #endif /* _TRACE_HANDSHAKE_H */ #include diff --git a/net/handshake/alert.c b/net/handshake/alert.c index 286bf94d93a3..329d91984683 100644 --- a/net/handshake/alert.c +++ b/net/handshake/alert.c @@ -21,6 +21,8 @@ #include "handshake.h" +#include + /** * tls_alert_send - send a TLS Alert on a kTLS socket * @sock: open kTLS socket to send on @@ -39,6 +41,8 @@ int tls_alert_send(struct socket *sock, u8 level, u8 description) u8 alert[2]; int ret; + trace_tls_alert_send(sock->sk, level, description); + alert[0] = level; alert[1] = description; iov.iov_base = alert; @@ -77,6 +81,7 @@ u8 tls_get_record_type(const struct sock *sk, const struct cmsghdr *cmsg) return 0; record_type = *((u8 *)CMSG_DATA(cmsg)); + trace_tls_contenttype(sk, record_type); return record_type; } EXPORT_SYMBOL(tls_get_record_type); @@ -99,5 +104,7 @@ void tls_alert_recv(const struct sock *sk, const struct msghdr *msg, data = iov->iov_base; *level = data[0]; *description = data[1]; + + trace_tls_alert_recv(sk, *level, *description); } EXPORT_SYMBOL(tls_alert_recv); diff --git a/net/handshake/trace.c b/net/handshake/trace.c index 1c4d8e27e17a..44432d0857b9 100644 --- a/net/handshake/trace.c +++ b/net/handshake/trace.c @@ -8,8 +8,10 @@ */ #include +#include #include +#include #include #include -- cgit v1.2.3 From 
9abddac583d68e16258d5e0b95dc1b3ca1886173 Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Fri, 21 Jul 2023 14:22:45 -0600 Subject: netfilter: defrag: Add glue hooks for enabling/disabling defrag We want to be able to enable/disable IP packet defrag from core bpf/netfilter code. In other words, execute code from core that could possibly be built as a module. To help avoid symbol resolution errors, use glue hooks that the modules will register callbacks with during module init. Signed-off-by: Daniel Xu Reviewed-by: Florian Westphal Link: https://lore.kernel.org/r/f6a8824052441b72afe5285acedbd634bd3384c1.1689970773.git.dxu@dxuuu.xyz Signed-off-by: Alexei Starovoitov --- include/linux/netfilter.h | 10 ++++++++++ net/ipv4/netfilter/nf_defrag_ipv4.c | 17 ++++++++++++++++- net/ipv6/netfilter/nf_defrag_ipv6_hooks.c | 11 +++++++++++ net/netfilter/core.c | 6 ++++++ 4 files changed, 43 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index d4fed4c508ca..d68644b7c299 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -481,6 +482,15 @@ struct nfnl_ct_hook { }; extern const struct nfnl_ct_hook __rcu *nfnl_ct_hook; +struct nf_defrag_hook { + struct module *owner; + int (*enable)(struct net *net); + void (*disable)(struct net *net); +}; + +extern const struct nf_defrag_hook __rcu *nf_defrag_v4_hook; +extern const struct nf_defrag_hook __rcu *nf_defrag_v6_hook; + /* * nf_skb_duplicated - TEE target has sent a packet * diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index e61ea428ea18..a9ba7de092c4 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -113,17 +114,31 @@ static void __net_exit defrag4_net_exit(struct net *net) } } +static const struct nf_defrag_hook defrag_hook = { + .owner = THIS_MODULE, + .enable = nf_defrag_ipv4_enable, + .disable = nf_defrag_ipv4_disable, +}; + static struct pernet_operations defrag4_net_ops = { .exit = defrag4_net_exit, }; static int __init nf_defrag_init(void) { - return register_pernet_subsys(&defrag4_net_ops); + int err; + + err = register_pernet_subsys(&defrag4_net_ops); + if (err) + return err; + + rcu_assign_pointer(nf_defrag_v4_hook, &defrag_hook); + return err; } static void __exit nf_defrag_fini(void) { + rcu_assign_pointer(nf_defrag_v4_hook, NULL); unregister_pernet_subsys(&defrag4_net_ops); } diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c index cb4eb1d2c620..d59b296b4f51 100644 --- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c +++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -96,6 +97,12 @@ static void __net_exit defrag6_net_exit(struct net *net) } } +static const struct nf_defrag_hook defrag_hook = { + .owner = THIS_MODULE, + .enable = nf_defrag_ipv6_enable, + .disable = nf_defrag_ipv6_disable, +}; + static struct pernet_operations defrag6_net_ops = { .exit = defrag6_net_exit, }; @@ -114,6 +121,9 @@ static int __init nf_defrag_init(void) pr_err("nf_defrag_ipv6: can't register pernet ops\n"); goto cleanup_frag6; } + + rcu_assign_pointer(nf_defrag_v6_hook, &defrag_hook); + return ret; cleanup_frag6: @@ -124,6 +134,7 @@ cleanup_frag6: static void __exit nf_defrag_fini(void) { + rcu_assign_pointer(nf_defrag_v6_hook, 
NULL); unregister_pernet_subsys(&defrag6_net_ops); nf_ct_frag6_cleanup(); } diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 5f76ae86a656..ef4e76e5aef9 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -680,6 +680,12 @@ EXPORT_SYMBOL_GPL(nfnl_ct_hook); const struct nf_ct_hook __rcu *nf_ct_hook __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_hook); +const struct nf_defrag_hook __rcu *nf_defrag_v4_hook __read_mostly; +EXPORT_SYMBOL_GPL(nf_defrag_v4_hook); + +const struct nf_defrag_hook __rcu *nf_defrag_v6_hook __read_mostly; +EXPORT_SYMBOL_GPL(nf_defrag_v6_hook); + #if IS_ENABLED(CONFIG_NF_CONNTRACK) u8 nf_ctnetlink_has_listener; EXPORT_SYMBOL_GPL(nf_ctnetlink_has_listener); -- cgit v1.2.3 From 91721c2d02d3a0141df8a4787c7079b89b0d0607 Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Fri, 21 Jul 2023 14:22:46 -0600 Subject: netfilter: bpf: Support BPF_F_NETFILTER_IP_DEFRAG in netfilter link This commit adds support for enabling IP defrag using pre-existing netfilter defrag support. Basically all the flag does is bump a refcnt while the link the active. Checks are also added to ensure the prog requesting defrag support is run _after_ netfilter defrag hooks. We also take care to avoid any issues w.r.t. module unloading -- while defrag is active on a link, the module is prevented from unloading. Signed-off-by: Daniel Xu Reviewed-by: Florian Westphal Link: https://lore.kernel.org/r/5cff26f97e55161b7d56b09ddcf5f8888a5add1d.1689970773.git.dxu@dxuuu.xyz Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 5 ++ net/netfilter/nf_bpf_link.c | 123 ++++++++++++++++++++++++++++++++++++----- tools/include/uapi/linux/bpf.h | 5 ++ 3 files changed, 118 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 14fd26b09e4b..70da85200695 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1188,6 +1188,11 @@ enum bpf_perf_event_type { */ #define BPF_F_KPROBE_MULTI_RETURN (1U << 0) +/* link_create.netfilter.flags used in LINK_CREATE command for + * BPF_PROG_TYPE_NETFILTER to enable IP packet defragmentation. + */ +#define BPF_F_NETFILTER_IP_DEFRAG (1U << 0) + /* When BPF ldimm64's insn[0].src_reg != 0 then this can have * the following extensions: * diff --git a/net/netfilter/nf_bpf_link.c b/net/netfilter/nf_bpf_link.c index c36da56d756f..8fe594bbc7e2 100644 --- a/net/netfilter/nf_bpf_link.c +++ b/net/netfilter/nf_bpf_link.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 #include #include +#include +#include #include #include @@ -23,8 +25,88 @@ struct bpf_nf_link { struct nf_hook_ops hook_ops; struct net *net; u32 dead; + const struct nf_defrag_hook *defrag_hook; }; +static const struct nf_defrag_hook * +get_proto_defrag_hook(struct bpf_nf_link *link, + const struct nf_defrag_hook __rcu *global_hook, + const char *mod) +{ + const struct nf_defrag_hook *hook; + int err; + + /* RCU protects us from races against module unloading */ + rcu_read_lock(); + hook = rcu_dereference(global_hook); + if (!hook) { + rcu_read_unlock(); + err = request_module(mod); + if (err) + return ERR_PTR(err < 0 ? 
err : -EINVAL); + + rcu_read_lock(); + hook = rcu_dereference(global_hook); + } + + if (hook && try_module_get(hook->owner)) { + /* Once we have a refcnt on the module, we no longer need RCU */ + hook = rcu_pointer_handoff(hook); + } else { + WARN_ONCE(!hook, "%s has bad registration", mod); + hook = ERR_PTR(-ENOENT); + } + rcu_read_unlock(); + + if (!IS_ERR(hook)) { + err = hook->enable(link->net); + if (err) { + module_put(hook->owner); + hook = ERR_PTR(err); + } + } + + return hook; +} + +static int bpf_nf_enable_defrag(struct bpf_nf_link *link) +{ + const struct nf_defrag_hook __maybe_unused *hook; + + switch (link->hook_ops.pf) { +#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) + case NFPROTO_IPV4: + hook = get_proto_defrag_hook(link, nf_defrag_v4_hook, "nf_defrag_ipv4"); + if (IS_ERR(hook)) + return PTR_ERR(hook); + + link->defrag_hook = hook; + return 0; +#endif +#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) + case NFPROTO_IPV6: + hook = get_proto_defrag_hook(link, nf_defrag_v6_hook, "nf_defrag_ipv6"); + if (IS_ERR(hook)) + return PTR_ERR(hook); + + link->defrag_hook = hook; + return 0; +#endif + default: + return -EAFNOSUPPORT; + } +} + +static void bpf_nf_disable_defrag(struct bpf_nf_link *link) +{ + const struct nf_defrag_hook *hook = link->defrag_hook; + + if (!hook) + return; + hook->disable(link->net); + module_put(hook->owner); +} + static void bpf_nf_link_release(struct bpf_link *link) { struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link); @@ -32,11 +114,11 @@ static void bpf_nf_link_release(struct bpf_link *link) if (nf_link->dead) return; - /* prevent hook-not-found warning splat from netfilter core when - * .detach was already called - */ - if (!cmpxchg(&nf_link->dead, 0, 1)) + /* do not double release in case .detach was already called */ + if (!cmpxchg(&nf_link->dead, 0, 1)) { nf_unregister_net_hook(nf_link->net, &nf_link->hook_ops); + bpf_nf_disable_defrag(nf_link); + } } static void bpf_nf_link_dealloc(struct bpf_link *link) @@ -92,6 +174,8 @@ static const struct bpf_link_ops bpf_nf_link_lops = { static int bpf_nf_check_pf_and_hooks(const union bpf_attr *attr) { + int prio; + switch (attr->link_create.netfilter.pf) { case NFPROTO_IPV4: case NFPROTO_IPV6: @@ -102,19 +186,18 @@ static int bpf_nf_check_pf_and_hooks(const union bpf_attr *attr) return -EAFNOSUPPORT; } - if (attr->link_create.netfilter.flags) + if (attr->link_create.netfilter.flags & ~BPF_F_NETFILTER_IP_DEFRAG) return -EOPNOTSUPP; - /* make sure conntrack confirm is always last. - * - * In the future, if userspace can e.g. request defrag, then - * "defrag_requested && prio before NF_IP_PRI_CONNTRACK_DEFRAG" - * should fail. - */ - switch (attr->link_create.netfilter.priority) { - case NF_IP_PRI_FIRST: return -ERANGE; /* sabotage_in and other warts */ - case NF_IP_PRI_LAST: return -ERANGE; /* e.g. conntrack confirm */ - } + /* make sure conntrack confirm is always last */ + prio = attr->link_create.netfilter.priority; + if (prio == NF_IP_PRI_FIRST) + return -ERANGE; /* sabotage_in and other warts */ + else if (prio == NF_IP_PRI_LAST) + return -ERANGE; /* e.g. 
conntrack confirm */ + else if ((attr->link_create.netfilter.flags & BPF_F_NETFILTER_IP_DEFRAG) && + prio <= NF_IP_PRI_CONNTRACK_DEFRAG) + return -ERANGE; /* cannot use defrag if prog runs before nf_defrag */ return 0; } @@ -149,6 +232,7 @@ int bpf_nf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) link->net = net; link->dead = false; + link->defrag_hook = NULL; err = bpf_link_prime(&link->link, &link_primer); if (err) { @@ -156,8 +240,17 @@ int bpf_nf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) return err; } + if (attr->link_create.netfilter.flags & BPF_F_NETFILTER_IP_DEFRAG) { + err = bpf_nf_enable_defrag(link); + if (err) { + bpf_link_cleanup(&link_primer); + return err; + } + } + err = nf_register_net_hook(net, &link->hook_ops); if (err) { + bpf_nf_disable_defrag(link); bpf_link_cleanup(&link_primer); return err; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 14fd26b09e4b..70da85200695 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1188,6 +1188,11 @@ enum bpf_perf_event_type { */ #define BPF_F_KPROBE_MULTI_RETURN (1U << 0) +/* link_create.netfilter.flags used in LINK_CREATE command for + * BPF_PROG_TYPE_NETFILTER to enable IP packet defragmentation. + */ +#define BPF_F_NETFILTER_IP_DEFRAG (1U << 0) + /* When BPF ldimm64's insn[0].src_reg != 0 then this can have * the following extensions: * -- cgit v1.2.3 From 2b3082c6ef3b0104d822f6f18d2afbe5fc9a5c2c Mon Sep 17 00:00:00 2001 From: Ratheesh Kannoth Date: Sat, 29 Jul 2023 04:52:15 +0530 Subject: net: flow_dissector: Use 64bits for used_keys As 32bits of dissector->used_keys are exhausted, increase the size to 64bits. This is base change for ESP/AH flow dissector patch. Please find patch and discussions at https://lore.kernel.org/netdev/ZMDNjD46BvZ5zp5I@corigine.com/T/#t Signed-off-by: Ratheesh Kannoth Reviewed-by: Petr Machata # for mlxsw Tested-by: Petr Machata Reviewed-by: Martin Habets Reviewed-by: Simon Horman Reviewed-by: Vladimir Oltean Signed-off-by: David S. 
Miller --- drivers/net/dsa/ocelot/felix_vsc9959.c | 8 +-- drivers/net/dsa/sja1105/sja1105_flower.c | 8 +-- drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c | 6 +- .../net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c | 18 +++--- .../ethernet/freescale/dpaa2/dpaa2-switch-flower.c | 22 +++---- drivers/net/ethernet/freescale/enetc/enetc_qos.c | 8 +-- .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 16 +++--- drivers/net/ethernet/intel/i40e/i40e_main.c | 18 +++--- drivers/net/ethernet/intel/iavf/iavf_main.c | 18 +++--- drivers/net/ethernet/intel/ice/ice_tc_lib.c | 44 +++++++------- drivers/net/ethernet/intel/igb/igb_main.c | 8 +-- .../net/ethernet/marvell/octeontx2/nic/otx2_tc.c | 18 +++--- .../ethernet/marvell/prestera/prestera_flower.c | 20 +++---- .../ethernet/mellanox/mlx5/core/en/tc/ct_fs_smfs.c | 25 ++++---- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 44 +++++++------- .../net/ethernet/mellanox/mlxsw/spectrum_flower.c | 22 +++---- .../ethernet/microchip/lan966x/lan966x_tc_flower.c | 4 +- .../ethernet/microchip/sparx5/sparx5_tc_flower.c | 4 +- drivers/net/ethernet/microchip/vcap/vcap_tc.c | 18 +++--- drivers/net/ethernet/microchip/vcap/vcap_tc.h | 2 +- drivers/net/ethernet/mscc/ocelot_flower.c | 28 ++++----- .../net/ethernet/netronome/nfp/flower/conntrack.c | 43 +++++++------- .../net/ethernet/netronome/nfp/flower/offload.c | 64 ++++++++++----------- drivers/net/ethernet/qlogic/qede/qede_filter.c | 12 ++-- drivers/net/ethernet/sfc/tc.c | 67 +++++++++++----------- .../net/ethernet/stmicro/stmmac/stmmac_selftests.c | 6 +- drivers/net/ethernet/ti/am65-cpsw-qos.c | 6 +- drivers/net/ethernet/ti/cpsw_priv.c | 6 +- include/net/flow_dissector.h | 5 +- net/core/flow_dissector.c | 2 +- net/ethtool/ioctl.c | 16 +++--- net/netfilter/nf_flow_table_offload.c | 22 +++---- net/netfilter/nf_tables_offload.c | 13 +++-- net/netfilter/nft_cmp.c | 2 +- 34 files changed, 317 insertions(+), 306 deletions(-) (limited to 'net') diff --git a/drivers/net/dsa/ocelot/felix_vsc9959.c b/drivers/net/dsa/ocelot/felix_vsc9959.c index dce3548dcd05..4a6e52929d25 100644 --- a/drivers/net/dsa/ocelot/felix_vsc9959.c +++ b/drivers/net/dsa/ocelot/felix_vsc9959.c @@ -1746,10 +1746,10 @@ static int vsc9959_stream_identify(struct flow_cls_offload *f, struct flow_dissector *dissector = rule->match.dissector; if (dissector->used_keys & - ~(BIT(FLOW_DISSECTOR_KEY_CONTROL) | - BIT(FLOW_DISSECTOR_KEY_BASIC) | - BIT(FLOW_DISSECTOR_KEY_VLAN) | - BIT(FLOW_DISSECTOR_KEY_ETH_ADDRS))) + ~(BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL) | + BIT_ULL(FLOW_DISSECTOR_KEY_BASIC) | + BIT_ULL(FLOW_DISSECTOR_KEY_VLAN) | + BIT_ULL(FLOW_DISSECTOR_KEY_ETH_ADDRS))) return -EOPNOTSUPP; if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ETH_ADDRS)) { diff --git a/drivers/net/dsa/sja1105/sja1105_flower.c b/drivers/net/dsa/sja1105/sja1105_flower.c index fad5afe3819c..9e8ca182c722 100644 --- a/drivers/net/dsa/sja1105/sja1105_flower.c +++ b/drivers/net/dsa/sja1105/sja1105_flower.c @@ -205,10 +205,10 @@ static int sja1105_flower_parse_key(struct sja1105_private *priv, u16 pcp = U16_MAX; if (dissector->used_keys & - ~(BIT(FLOW_DISSECTOR_KEY_BASIC) | - BIT(FLOW_DISSECTOR_KEY_CONTROL) | - BIT(FLOW_DISSECTOR_KEY_VLAN) | - BIT(FLOW_DISSECTOR_KEY_ETH_ADDRS))) { + ~(BIT_ULL(FLOW_DISSECTOR_KEY_BASIC) | + BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL) | + BIT_ULL(FLOW_DISSECTOR_KEY_VLAN) | + BIT_ULL(FLOW_DISSECTOR_KEY_ETH_ADDRS))) { NL_SET_ERR_MSG_MOD(extack, "Unsupported keys used"); return -EOPNOTSUPP; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c index d8afcf8d6b30..38d89d80b4a9 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c @@ -373,9 +373,9 @@ static int bnxt_tc_parse_flow(struct bnxt *bp, struct flow_dissector *dissector = rule->match.dissector; /* KEY_CONTROL and KEY_BASIC are needed for forming a meaningful key */ - if ((dissector->used_keys & BIT(FLOW_DISSECTOR_KEY_CONTROL)) == 0 || - (dissector->used_keys & BIT(FLOW_DISSECTOR_KEY_BASIC)) == 0) { - netdev_info(bp->dev, "cannot form TC key: used_keys = 0x%x\n", + if ((dissector->used_keys & BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL)) == 0 || + (dissector->used_keys & BIT_ULL(FLOW_DISSECTOR_KEY_BASIC)) == 0) { + netdev_info(bp->dev, "cannot form TC key: used_keys = 0x%llx\n", dissector->used_keys); return -EOPNOTSUPP; } diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c index d3541159487d..72ac4a34424b 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c @@ -313,15 +313,15 @@ static int cxgb4_validate_flow_match(struct net_device *dev, u16 ethtype_key = 0; if (dissector->used_keys & - ~(BIT(FLOW_DISSECTOR_KEY_CONTROL) | - BIT(FLOW_DISSECTOR_KEY_BASIC) | - BIT(FLOW_DISSECTOR_KEY_IPV4_ADDRS) | - BIT(FLOW_DISSECTOR_KEY_IPV6_ADDRS) | - BIT(FLOW_DISSECTOR_KEY_PORTS) | - BIT(FLOW_DISSECTOR_KEY_ENC_KEYID) | - BIT(FLOW_DISSECTOR_KEY_VLAN) | - BIT(FLOW_DISSECTOR_KEY_IP))) { - netdev_warn(dev, "Unsupported key used: 0x%x\n", + ~(BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL) | + BIT_ULL(FLOW_DISSECTOR_KEY_BASIC) | + BIT_ULL(FLOW_DISSECTOR_KEY_IPV4_ADDRS) | + BIT_ULL(FLOW_DISSECTOR_KEY_IPV6_ADDRS) | + BIT_ULL(FLOW_DISSECTOR_KEY_PORTS) | + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_KEYID) | + BIT_ULL(FLOW_DISSECTOR_KEY_VLAN) | + BIT_ULL(FLOW_DISSECTOR_KEY_IP))) { + netdev_warn(dev, "Unsupported key used: 0x%llx\n", dissector->used_keys); return -EOPNOTSUPP; } diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch-flower.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch-flower.c index c39b866e2582..4798fb7fe35d 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch-flower.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch-flower.c @@ -17,14 +17,14 @@ static int dpaa2_switch_flower_parse_key(struct flow_cls_offload *cls, struct dpsw_acl_fields *acl_h, *acl_m; if (dissector->used_keys & - ~(BIT(FLOW_DISSECTOR_KEY_BASIC) | - BIT(FLOW_DISSECTOR_KEY_CONTROL) | - BIT(FLOW_DISSECTOR_KEY_ETH_ADDRS) | - BIT(FLOW_DISSECTOR_KEY_VLAN) | - BIT(FLOW_DISSECTOR_KEY_PORTS) | - BIT(FLOW_DISSECTOR_KEY_IP) | - BIT(FLOW_DISSECTOR_KEY_IPV6_ADDRS) | - BIT(FLOW_DISSECTOR_KEY_IPV4_ADDRS))) { + ~(BIT_ULL(FLOW_DISSECTOR_KEY_BASIC) | + BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL) | + BIT_ULL(FLOW_DISSECTOR_KEY_ETH_ADDRS) | + BIT_ULL(FLOW_DISSECTOR_KEY_VLAN) | + BIT_ULL(FLOW_DISSECTOR_KEY_PORTS) | + BIT_ULL(FLOW_DISSECTOR_KEY_IP) | + BIT_ULL(FLOW_DISSECTOR_KEY_IPV6_ADDRS) | + BIT_ULL(FLOW_DISSECTOR_KEY_IPV4_ADDRS))) { NL_SET_ERR_MSG_MOD(extack, "Unsupported keys used"); return -EOPNOTSUPP; @@ -539,9 +539,9 @@ static int dpaa2_switch_flower_parse_mirror_key(struct flow_cls_offload *cls, int ret = -EOPNOTSUPP; if (dissector->used_keys & - ~(BIT(FLOW_DISSECTOR_KEY_BASIC) | - BIT(FLOW_DISSECTOR_KEY_CONTROL) | - BIT(FLOW_DISSECTOR_KEY_VLAN))) { + ~(BIT_ULL(FLOW_DISSECTOR_KEY_BASIC) | + BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL) | + BIT_ULL(FLOW_DISSECTOR_KEY_VLAN))) { NL_SET_ERR_MSG_MOD(extack, 
"Mirroring is supported only per VLAN"); return -EOPNOTSUPP; diff --git a/drivers/net/ethernet/freescale/enetc/enetc_qos.c b/drivers/net/ethernet/freescale/enetc/enetc_qos.c index 270cbd5e8684..2513b44056c1 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_qos.c +++ b/drivers/net/ethernet/freescale/enetc/enetc_qos.c @@ -483,13 +483,13 @@ struct enetc_psfp { static struct actions_fwd enetc_act_fwd[] = { { BIT(FLOW_ACTION_GATE), - BIT(FLOW_DISSECTOR_KEY_ETH_ADDRS), + BIT_ULL(FLOW_DISSECTOR_KEY_ETH_ADDRS), FILTER_ACTION_TYPE_PSFP }, { BIT(FLOW_ACTION_POLICE) | BIT(FLOW_ACTION_GATE), - BIT(FLOW_DISSECTOR_KEY_ETH_ADDRS), + BIT_ULL(FLOW_DISSECTOR_KEY_ETH_ADDRS), FILTER_ACTION_TYPE_PSFP }, /* example for ACL actions */ @@ -1069,8 +1069,8 @@ revert_sid: return err; } -static struct actions_fwd *enetc_check_flow_actions(u64 acts, - unsigned int inputkeys) +static struct actions_fwd * +enetc_check_flow_actions(u64 acts, unsigned long long inputkeys) { int i; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index bf675c15fbb9..83ab89f44250 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -7315,14 +7315,14 @@ static int hclge_parse_cls_flower(struct hclge_dev *hdev, struct flow_dissector *dissector = flow->match.dissector; if (dissector->used_keys & - ~(BIT(FLOW_DISSECTOR_KEY_CONTROL) | - BIT(FLOW_DISSECTOR_KEY_BASIC) | - BIT(FLOW_DISSECTOR_KEY_ETH_ADDRS) | - BIT(FLOW_DISSECTOR_KEY_VLAN) | - BIT(FLOW_DISSECTOR_KEY_IPV4_ADDRS) | - BIT(FLOW_DISSECTOR_KEY_IPV6_ADDRS) | - BIT(FLOW_DISSECTOR_KEY_PORTS))) { - dev_err(&hdev->pdev->dev, "unsupported key set: %#x\n", + ~(BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL) | + BIT_ULL(FLOW_DISSECTOR_KEY_BASIC) | + BIT_ULL(FLOW_DISSECTOR_KEY_ETH_ADDRS) | + BIT_ULL(FLOW_DISSECTOR_KEY_VLAN) | + BIT_ULL(FLOW_DISSECTOR_KEY_IPV4_ADDRS) | + BIT_ULL(FLOW_DISSECTOR_KEY_IPV6_ADDRS) | + BIT_ULL(FLOW_DISSECTOR_KEY_PORTS))) { + dev_err(&hdev->pdev->dev, "unsupported key set: %#llx\n", dissector->used_keys); return -EOPNOTSUPP; } diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 982ae70c51e8..3d0d6974c2a7 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -8525,15 +8525,15 @@ static int i40e_parse_cls_flower(struct i40e_vsi *vsi, u8 field_flags = 0; if (dissector->used_keys & - ~(BIT(FLOW_DISSECTOR_KEY_CONTROL) | - BIT(FLOW_DISSECTOR_KEY_BASIC) | - BIT(FLOW_DISSECTOR_KEY_ETH_ADDRS) | - BIT(FLOW_DISSECTOR_KEY_VLAN) | - BIT(FLOW_DISSECTOR_KEY_IPV4_ADDRS) | - BIT(FLOW_DISSECTOR_KEY_IPV6_ADDRS) | - BIT(FLOW_DISSECTOR_KEY_PORTS) | - BIT(FLOW_DISSECTOR_KEY_ENC_KEYID))) { - dev_err(&pf->pdev->dev, "Unsupported key used: 0x%x\n", + ~(BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL) | + BIT_ULL(FLOW_DISSECTOR_KEY_BASIC) | + BIT_ULL(FLOW_DISSECTOR_KEY_ETH_ADDRS) | + BIT_ULL(FLOW_DISSECTOR_KEY_VLAN) | + BIT_ULL(FLOW_DISSECTOR_KEY_IPV4_ADDRS) | + BIT_ULL(FLOW_DISSECTOR_KEY_IPV6_ADDRS) | + BIT_ULL(FLOW_DISSECTOR_KEY_PORTS) | + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_KEYID))) { + dev_err(&pf->pdev->dev, "Unsupported key used: 0x%llx\n", dissector->used_keys); return -EOPNOTSUPP; } diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 9610ca770349..7b300c86ceda 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -3744,15 +3744,15 @@ 
static int iavf_parse_cls_flower(struct iavf_adapter *adapter, struct virtchnl_filter *vf = &filter->f; if (dissector->used_keys & - ~(BIT(FLOW_DISSECTOR_KEY_CONTROL) | - BIT(FLOW_DISSECTOR_KEY_BASIC) | - BIT(FLOW_DISSECTOR_KEY_ETH_ADDRS) | - BIT(FLOW_DISSECTOR_KEY_VLAN) | - BIT(FLOW_DISSECTOR_KEY_IPV4_ADDRS) | - BIT(FLOW_DISSECTOR_KEY_IPV6_ADDRS) | - BIT(FLOW_DISSECTOR_KEY_PORTS) | - BIT(FLOW_DISSECTOR_KEY_ENC_KEYID))) { - dev_err(&adapter->pdev->dev, "Unsupported key used: 0x%x\n", + ~(BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL) | + BIT_ULL(FLOW_DISSECTOR_KEY_BASIC) | + BIT_ULL(FLOW_DISSECTOR_KEY_ETH_ADDRS) | + BIT_ULL(FLOW_DISSECTOR_KEY_VLAN) | + BIT_ULL(FLOW_DISSECTOR_KEY_IPV4_ADDRS) | + BIT_ULL(FLOW_DISSECTOR_KEY_IPV6_ADDRS) | + BIT_ULL(FLOW_DISSECTOR_KEY_PORTS) | + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_KEYID))) { + dev_err(&adapter->pdev->dev, "Unsupported key used: 0x%llx\n", dissector->used_keys); return -EOPNOTSUPP; } diff --git a/drivers/net/ethernet/intel/ice/ice_tc_lib.c b/drivers/net/ethernet/intel/ice/ice_tc_lib.c index 4a34ef5f58d3..38547db1ec4e 100644 --- a/drivers/net/ethernet/intel/ice/ice_tc_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_tc_lib.c @@ -1343,24 +1343,24 @@ ice_parse_cls_flower(struct net_device *filter_dev, struct ice_vsi *vsi, dissector = rule->match.dissector; if (dissector->used_keys & - ~(BIT(FLOW_DISSECTOR_KEY_CONTROL) | - BIT(FLOW_DISSECTOR_KEY_BASIC) | - BIT(FLOW_DISSECTOR_KEY_ETH_ADDRS) | - BIT(FLOW_DISSECTOR_KEY_VLAN) | - BIT(FLOW_DISSECTOR_KEY_CVLAN) | - BIT(FLOW_DISSECTOR_KEY_IPV4_ADDRS) | - BIT(FLOW_DISSECTOR_KEY_IPV6_ADDRS) | - BIT(FLOW_DISSECTOR_KEY_ENC_CONTROL) | - BIT(FLOW_DISSECTOR_KEY_ENC_KEYID) | - BIT(FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS) | - BIT(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS) | - BIT(FLOW_DISSECTOR_KEY_ENC_PORTS) | - BIT(FLOW_DISSECTOR_KEY_ENC_OPTS) | - BIT(FLOW_DISSECTOR_KEY_IP) | - BIT(FLOW_DISSECTOR_KEY_ENC_IP) | - BIT(FLOW_DISSECTOR_KEY_PORTS) | - BIT(FLOW_DISSECTOR_KEY_PPPOE) | - BIT(FLOW_DISSECTOR_KEY_L2TPV3))) { + ~(BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL) | + BIT_ULL(FLOW_DISSECTOR_KEY_BASIC) | + BIT_ULL(FLOW_DISSECTOR_KEY_ETH_ADDRS) | + BIT_ULL(FLOW_DISSECTOR_KEY_VLAN) | + BIT_ULL(FLOW_DISSECTOR_KEY_CVLAN) | + BIT_ULL(FLOW_DISSECTOR_KEY_IPV4_ADDRS) | + BIT_ULL(FLOW_DISSECTOR_KEY_IPV6_ADDRS) | + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_CONTROL) | + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_KEYID) | + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS) | + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS) | + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_PORTS) | + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_OPTS) | + BIT_ULL(FLOW_DISSECTOR_KEY_IP) | + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_IP) | + BIT_ULL(FLOW_DISSECTOR_KEY_PORTS) | + BIT_ULL(FLOW_DISSECTOR_KEY_PPPOE) | + BIT_ULL(FLOW_DISSECTOR_KEY_L2TPV3))) { NL_SET_ERR_MSG_MOD(fltr->extack, "Unsupported key used"); return -EOPNOTSUPP; } @@ -1382,10 +1382,10 @@ ice_parse_cls_flower(struct net_device *filter_dev, struct ice_vsi *vsi, */ headers = &fltr->inner_headers; } else if (dissector->used_keys & - (BIT(FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS) | - BIT(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS) | - BIT(FLOW_DISSECTOR_KEY_ENC_KEYID) | - BIT(FLOW_DISSECTOR_KEY_ENC_PORTS))) { + (BIT_ULL(FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS) | + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS) | + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_KEYID) | + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_PORTS))) { NL_SET_ERR_MSG_MOD(fltr->extack, "Tunnel key used, but device isn't a tunnel"); return -EOPNOTSUPP; } else { diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 
9a2561409b06..9f63a10c6f80 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -2615,10 +2615,10 @@ static int igb_parse_cls_flower(struct igb_adapter *adapter, struct netlink_ext_ack *extack = f->common.extack; if (dissector->used_keys & - ~(BIT(FLOW_DISSECTOR_KEY_BASIC) | - BIT(FLOW_DISSECTOR_KEY_CONTROL) | - BIT(FLOW_DISSECTOR_KEY_ETH_ADDRS) | - BIT(FLOW_DISSECTOR_KEY_VLAN))) { + ~(BIT_ULL(FLOW_DISSECTOR_KEY_BASIC) | + BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL) | + BIT_ULL(FLOW_DISSECTOR_KEY_ETH_ADDRS) | + BIT_ULL(FLOW_DISSECTOR_KEY_VLAN))) { NL_SET_ERR_MSG_MOD(extack, "Unsupported key used, only BASIC, CONTROL, ETH_ADDRS and VLAN are supported"); return -EOPNOTSUPP; diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c index 0915a0121316..5a44e9b96fc0 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c @@ -454,15 +454,15 @@ static int otx2_tc_prepare_flow(struct otx2_nic *nic, struct otx2_tc_flow *node, dissector = rule->match.dissector; if ((dissector->used_keys & - ~(BIT(FLOW_DISSECTOR_KEY_CONTROL) | - BIT(FLOW_DISSECTOR_KEY_BASIC) | - BIT(FLOW_DISSECTOR_KEY_ETH_ADDRS) | - BIT(FLOW_DISSECTOR_KEY_VLAN) | - BIT(FLOW_DISSECTOR_KEY_IPV4_ADDRS) | - BIT(FLOW_DISSECTOR_KEY_IPV6_ADDRS) | - BIT(FLOW_DISSECTOR_KEY_PORTS) | - BIT(FLOW_DISSECTOR_KEY_IP)))) { - netdev_info(nic->netdev, "unsupported flow used key 0x%x", + ~(BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL) | + BIT_ULL(FLOW_DISSECTOR_KEY_BASIC) | + BIT_ULL(FLOW_DISSECTOR_KEY_ETH_ADDRS) | + BIT_ULL(FLOW_DISSECTOR_KEY_VLAN) | + BIT_ULL(FLOW_DISSECTOR_KEY_IPV4_ADDRS) | + BIT_ULL(FLOW_DISSECTOR_KEY_IPV6_ADDRS) | + BIT_ULL(FLOW_DISSECTOR_KEY_PORTS) | + BIT_ULL(FLOW_DISSECTOR_KEY_IP)))) { + netdev_info(nic->netdev, "unsupported flow used key 0x%llx", dissector->used_keys); return -EOPNOTSUPP; } diff --git a/drivers/net/ethernet/marvell/prestera/prestera_flower.c b/drivers/net/ethernet/marvell/prestera/prestera_flower.c index 3e20e71b0f81..8b9455d8a4f7 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_flower.c +++ b/drivers/net/ethernet/marvell/prestera/prestera_flower.c @@ -202,16 +202,16 @@ static int prestera_flower_parse(struct prestera_flow_block *block, int err; if (dissector->used_keys & - ~(BIT(FLOW_DISSECTOR_KEY_META) | - BIT(FLOW_DISSECTOR_KEY_CONTROL) | - BIT(FLOW_DISSECTOR_KEY_BASIC) | - BIT(FLOW_DISSECTOR_KEY_ETH_ADDRS) | - BIT(FLOW_DISSECTOR_KEY_IPV4_ADDRS) | - BIT(FLOW_DISSECTOR_KEY_IPV6_ADDRS) | - BIT(FLOW_DISSECTOR_KEY_ICMP) | - BIT(FLOW_DISSECTOR_KEY_PORTS) | - BIT(FLOW_DISSECTOR_KEY_PORTS_RANGE) | - BIT(FLOW_DISSECTOR_KEY_VLAN))) { + ~(BIT_ULL(FLOW_DISSECTOR_KEY_META) | + BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL) | + BIT_ULL(FLOW_DISSECTOR_KEY_BASIC) | + BIT_ULL(FLOW_DISSECTOR_KEY_ETH_ADDRS) | + BIT_ULL(FLOW_DISSECTOR_KEY_IPV4_ADDRS) | + BIT_ULL(FLOW_DISSECTOR_KEY_IPV6_ADDRS) | + BIT_ULL(FLOW_DISSECTOR_KEY_ICMP) | + BIT_ULL(FLOW_DISSECTOR_KEY_PORTS) | + BIT_ULL(FLOW_DISSECTOR_KEY_PORTS_RANGE) | + BIT_ULL(FLOW_DISSECTOR_KEY_VLAN))) { NL_SET_ERR_MSG_MOD(f->common.extack, "Unsupported key"); return -EOPNOTSUPP; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/ct_fs_smfs.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/ct_fs_smfs.c index 2b80fe73549d..8c531f4ec912 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/ct_fs_smfs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/ct_fs_smfs.c @@ -221,16 +221,21 @@ 
mlx5_ct_fs_smfs_destroy(struct mlx5_ct_fs *fs) } static inline bool -mlx5_tc_ct_valid_used_dissector_keys(const u32 used_keys) +mlx5_tc_ct_valid_used_dissector_keys(const u64 used_keys) { -#define DISS_BIT(name) BIT(FLOW_DISSECTOR_KEY_ ## name) - const u32 basic_keys = DISS_BIT(BASIC) | DISS_BIT(CONTROL) | DISS_BIT(META); - const u32 ipv4_tcp = basic_keys | DISS_BIT(IPV4_ADDRS) | DISS_BIT(PORTS) | DISS_BIT(TCP); - const u32 ipv6_tcp = basic_keys | DISS_BIT(IPV6_ADDRS) | DISS_BIT(PORTS) | DISS_BIT(TCP); - const u32 ipv4_udp = basic_keys | DISS_BIT(IPV4_ADDRS) | DISS_BIT(PORTS); - const u32 ipv6_udp = basic_keys | DISS_BIT(IPV6_ADDRS) | DISS_BIT(PORTS); - const u32 ipv4_gre = basic_keys | DISS_BIT(IPV4_ADDRS); - const u32 ipv6_gre = basic_keys | DISS_BIT(IPV6_ADDRS); +#define DISS_BIT(name) BIT_ULL(FLOW_DISSECTOR_KEY_ ## name) + const u64 basic_keys = DISS_BIT(BASIC) | DISS_BIT(CONTROL) | + DISS_BIT(META); + const u64 ipv4_tcp = basic_keys | DISS_BIT(IPV4_ADDRS) | + DISS_BIT(PORTS) | DISS_BIT(TCP); + const u64 ipv6_tcp = basic_keys | DISS_BIT(IPV6_ADDRS) | + DISS_BIT(PORTS) | DISS_BIT(TCP); + const u64 ipv4_udp = basic_keys | DISS_BIT(IPV4_ADDRS) | + DISS_BIT(PORTS); + const u64 ipv6_udp = basic_keys | DISS_BIT(IPV6_ADDRS) | + DISS_BIT(PORTS); + const u64 ipv4_gre = basic_keys | DISS_BIT(IPV4_ADDRS); + const u64 ipv6_gre = basic_keys | DISS_BIT(IPV6_ADDRS); return (used_keys == ipv4_tcp || used_keys == ipv4_udp || used_keys == ipv6_tcp || used_keys == ipv6_udp || used_keys == ipv4_gre || used_keys == ipv6_gre); @@ -247,7 +252,7 @@ mlx5_ct_fs_smfs_ct_validate_flow_rule(struct mlx5_ct_fs *fs, struct flow_rule *f struct flow_match_tcp tcp; if (!mlx5_tc_ct_valid_used_dissector_keys(flow_rule->match.dissector->used_keys)) { - ct_dbg("rule uses unexpected dissectors (0x%08x)", + ct_dbg("rule uses unexpected dissectors (0x%016llx)", flow_rule->match.dissector->used_keys); return false; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 507825a1abc8..dfd282319753 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -2590,29 +2590,29 @@ static int __parse_cls_flower(struct mlx5e_priv *priv, match_level = outer_match_level; if (dissector->used_keys & - ~(BIT(FLOW_DISSECTOR_KEY_META) | - BIT(FLOW_DISSECTOR_KEY_CONTROL) | - BIT(FLOW_DISSECTOR_KEY_BASIC) | - BIT(FLOW_DISSECTOR_KEY_ETH_ADDRS) | - BIT(FLOW_DISSECTOR_KEY_VLAN) | - BIT(FLOW_DISSECTOR_KEY_CVLAN) | - BIT(FLOW_DISSECTOR_KEY_IPV4_ADDRS) | - BIT(FLOW_DISSECTOR_KEY_IPV6_ADDRS) | - BIT(FLOW_DISSECTOR_KEY_PORTS) | - BIT(FLOW_DISSECTOR_KEY_ENC_KEYID) | - BIT(FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS) | - BIT(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS) | - BIT(FLOW_DISSECTOR_KEY_ENC_PORTS) | - BIT(FLOW_DISSECTOR_KEY_ENC_CONTROL) | - BIT(FLOW_DISSECTOR_KEY_TCP) | - BIT(FLOW_DISSECTOR_KEY_IP) | - BIT(FLOW_DISSECTOR_KEY_CT) | - BIT(FLOW_DISSECTOR_KEY_ENC_IP) | - BIT(FLOW_DISSECTOR_KEY_ENC_OPTS) | - BIT(FLOW_DISSECTOR_KEY_ICMP) | - BIT(FLOW_DISSECTOR_KEY_MPLS))) { + ~(BIT_ULL(FLOW_DISSECTOR_KEY_META) | + BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL) | + BIT_ULL(FLOW_DISSECTOR_KEY_BASIC) | + BIT_ULL(FLOW_DISSECTOR_KEY_ETH_ADDRS) | + BIT_ULL(FLOW_DISSECTOR_KEY_VLAN) | + BIT_ULL(FLOW_DISSECTOR_KEY_CVLAN) | + BIT_ULL(FLOW_DISSECTOR_KEY_IPV4_ADDRS) | + BIT_ULL(FLOW_DISSECTOR_KEY_IPV6_ADDRS) | + BIT_ULL(FLOW_DISSECTOR_KEY_PORTS) | + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_KEYID) | + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS) | + 
BIT_ULL(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS) | + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_PORTS) | + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_CONTROL) | + BIT_ULL(FLOW_DISSECTOR_KEY_TCP) | + BIT_ULL(FLOW_DISSECTOR_KEY_IP) | + BIT_ULL(FLOW_DISSECTOR_KEY_CT) | + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_IP) | + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_OPTS) | + BIT_ULL(FLOW_DISSECTOR_KEY_ICMP) | + BIT_ULL(FLOW_DISSECTOR_KEY_MPLS))) { NL_SET_ERR_MSG_MOD(extack, "Unsupported key"); - netdev_dbg(priv->netdev, "Unsupported key used: 0x%x\n", + netdev_dbg(priv->netdev, "Unsupported key used: 0x%llx\n", dissector->used_keys); return -EOPNOTSUPP; } diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c index 8329100479b3..af3f57d017ec 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c @@ -558,17 +558,17 @@ static int mlxsw_sp_flower_parse(struct mlxsw_sp *mlxsw_sp, int err; if (dissector->used_keys & - ~(BIT(FLOW_DISSECTOR_KEY_META) | - BIT(FLOW_DISSECTOR_KEY_CONTROL) | - BIT(FLOW_DISSECTOR_KEY_BASIC) | - BIT(FLOW_DISSECTOR_KEY_ETH_ADDRS) | - BIT(FLOW_DISSECTOR_KEY_IPV4_ADDRS) | - BIT(FLOW_DISSECTOR_KEY_IPV6_ADDRS) | - BIT(FLOW_DISSECTOR_KEY_PORTS) | - BIT(FLOW_DISSECTOR_KEY_PORTS_RANGE) | - BIT(FLOW_DISSECTOR_KEY_TCP) | - BIT(FLOW_DISSECTOR_KEY_IP) | - BIT(FLOW_DISSECTOR_KEY_VLAN))) { + ~(BIT_ULL(FLOW_DISSECTOR_KEY_META) | + BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL) | + BIT_ULL(FLOW_DISSECTOR_KEY_BASIC) | + BIT_ULL(FLOW_DISSECTOR_KEY_ETH_ADDRS) | + BIT_ULL(FLOW_DISSECTOR_KEY_IPV4_ADDRS) | + BIT_ULL(FLOW_DISSECTOR_KEY_IPV6_ADDRS) | + BIT_ULL(FLOW_DISSECTOR_KEY_PORTS) | + BIT_ULL(FLOW_DISSECTOR_KEY_PORTS_RANGE) | + BIT_ULL(FLOW_DISSECTOR_KEY_TCP) | + BIT_ULL(FLOW_DISSECTOR_KEY_IP) | + BIT_ULL(FLOW_DISSECTOR_KEY_VLAN))) { dev_err(mlxsw_sp->bus_info->dev, "Unsupported key\n"); NL_SET_ERR_MSG_MOD(f->common.extack, "Unsupported key"); return -EOPNOTSUPP; diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_tc_flower.c b/drivers/net/ethernet/microchip/lan966x/lan966x_tc_flower.c index 96b3def6c474..d696cf9dbd19 100644 --- a/drivers/net/ethernet/microchip/lan966x/lan966x_tc_flower.c +++ b/drivers/net/ethernet/microchip/lan966x/lan966x_tc_flower.c @@ -75,7 +75,7 @@ lan966x_tc_flower_handler_control_usage(struct vcap_tc_flower_parse_usage *st) goto out; } - st->used_keys |= BIT(FLOW_DISSECTOR_KEY_CONTROL); + st->used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL); return err; @@ -172,7 +172,7 @@ lan966x_tc_flower_handler_basic_usage(struct vcap_tc_flower_parse_usage *st) } } - st->used_keys |= BIT(FLOW_DISSECTOR_KEY_BASIC); + st->used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_BASIC); return err; out: NL_SET_ERR_MSG_MOD(st->fco->common.extack, "ip_proto parse error"); diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_tc_flower.c b/drivers/net/ethernet/microchip/sparx5/sparx5_tc_flower.c index 3f87a5285a6d..906299ad8425 100644 --- a/drivers/net/ethernet/microchip/sparx5/sparx5_tc_flower.c +++ b/drivers/net/ethernet/microchip/sparx5/sparx5_tc_flower.c @@ -126,7 +126,7 @@ sparx5_tc_flower_handler_basic_usage(struct vcap_tc_flower_parse_usage *st) } } - st->used_keys |= BIT(FLOW_DISSECTOR_KEY_BASIC); + st->used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_BASIC); return err; @@ -175,7 +175,7 @@ sparx5_tc_flower_handler_control_usage(struct vcap_tc_flower_parse_usage *st) goto out; } - st->used_keys |= BIT(FLOW_DISSECTOR_KEY_CONTROL); + st->used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL); return err; diff --git 
a/drivers/net/ethernet/microchip/vcap/vcap_tc.c b/drivers/net/ethernet/microchip/vcap/vcap_tc.c index 09abe7944af6..27e2dffb65e6 100644 --- a/drivers/net/ethernet/microchip/vcap/vcap_tc.c +++ b/drivers/net/ethernet/microchip/vcap/vcap_tc.c @@ -50,7 +50,7 @@ int vcap_tc_flower_handler_ethaddr_usage(struct vcap_tc_flower_parse_usage *st) goto out; } - st->used_keys |= BIT(FLOW_DISSECTOR_KEY_ETH_ADDRS); + st->used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_ETH_ADDRS); return err; @@ -86,7 +86,7 @@ int vcap_tc_flower_handler_ipv4_usage(struct vcap_tc_flower_parse_usage *st) } } - st->used_keys |= BIT(FLOW_DISSECTOR_KEY_IPV4_ADDRS); + st->used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_IPV4_ADDRS); return err; @@ -124,7 +124,7 @@ int vcap_tc_flower_handler_ipv6_usage(struct vcap_tc_flower_parse_usage *st) goto out; } } - st->used_keys |= BIT(FLOW_DISSECTOR_KEY_IPV6_ADDRS); + st->used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_IPV6_ADDRS); return err; out: NL_SET_ERR_MSG_MOD(st->fco->common.extack, "ipv6_addr parse error"); @@ -158,7 +158,7 @@ int vcap_tc_flower_handler_portnum_usage(struct vcap_tc_flower_parse_usage *st) goto out; } - st->used_keys |= BIT(FLOW_DISSECTOR_KEY_PORTS); + st->used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_PORTS); return err; @@ -201,7 +201,7 @@ int vcap_tc_flower_handler_cvlan_usage(struct vcap_tc_flower_parse_usage *st) goto out; } - st->used_keys |= BIT(FLOW_DISSECTOR_KEY_CVLAN); + st->used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_CVLAN); return 0; out: @@ -238,7 +238,7 @@ int vcap_tc_flower_handler_vlan_usage(struct vcap_tc_flower_parse_usage *st, if (mt.mask->vlan_tpid) st->tpid = be16_to_cpu(mt.key->vlan_tpid); - st->used_keys |= BIT(FLOW_DISSECTOR_KEY_VLAN); + st->used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_VLAN); return 0; out: @@ -313,7 +313,7 @@ int vcap_tc_flower_handler_tcp_usage(struct vcap_tc_flower_parse_usage *st) goto out; } - st->used_keys |= BIT(FLOW_DISSECTOR_KEY_TCP); + st->used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_TCP); return err; @@ -376,7 +376,7 @@ int vcap_tc_flower_handler_arp_usage(struct vcap_tc_flower_parse_usage *st) goto out; } - st->used_keys |= BIT(FLOW_DISSECTOR_KEY_ARP); + st->used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_ARP); return 0; @@ -401,7 +401,7 @@ int vcap_tc_flower_handler_ip_usage(struct vcap_tc_flower_parse_usage *st) goto out; } - st->used_keys |= BIT(FLOW_DISSECTOR_KEY_IP); + st->used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_IP); return err; diff --git a/drivers/net/ethernet/microchip/vcap/vcap_tc.h b/drivers/net/ethernet/microchip/vcap/vcap_tc.h index 071f892f9aa4..49b02d032906 100644 --- a/drivers/net/ethernet/microchip/vcap/vcap_tc.h +++ b/drivers/net/ethernet/microchip/vcap/vcap_tc.h @@ -14,7 +14,7 @@ struct vcap_tc_flower_parse_usage { u16 l3_proto; u8 l4_proto; u16 tpid; - unsigned int used_keys; + unsigned long long used_keys; }; int vcap_tc_flower_handler_ethaddr_usage(struct vcap_tc_flower_parse_usage *st); diff --git a/drivers/net/ethernet/mscc/ocelot_flower.c b/drivers/net/ethernet/mscc/ocelot_flower.c index e0916afcddfb..33b438c6aec5 100644 --- a/drivers/net/ethernet/mscc/ocelot_flower.c +++ b/drivers/net/ethernet/mscc/ocelot_flower.c @@ -581,14 +581,14 @@ ocelot_flower_parse_key(struct ocelot *ocelot, int port, bool ingress, int ret; if (dissector->used_keys & - ~(BIT(FLOW_DISSECTOR_KEY_CONTROL) | - BIT(FLOW_DISSECTOR_KEY_BASIC) | - BIT(FLOW_DISSECTOR_KEY_META) | - BIT(FLOW_DISSECTOR_KEY_PORTS) | - BIT(FLOW_DISSECTOR_KEY_VLAN) | - BIT(FLOW_DISSECTOR_KEY_IPV4_ADDRS) | - BIT(FLOW_DISSECTOR_KEY_IPV6_ADDRS) | - BIT(FLOW_DISSECTOR_KEY_ETH_ADDRS))) { + 
~(BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL) | + BIT_ULL(FLOW_DISSECTOR_KEY_BASIC) | + BIT_ULL(FLOW_DISSECTOR_KEY_META) | + BIT_ULL(FLOW_DISSECTOR_KEY_PORTS) | + BIT_ULL(FLOW_DISSECTOR_KEY_VLAN) | + BIT_ULL(FLOW_DISSECTOR_KEY_IPV4_ADDRS) | + BIT_ULL(FLOW_DISSECTOR_KEY_IPV6_ADDRS) | + BIT_ULL(FLOW_DISSECTOR_KEY_ETH_ADDRS))) { return -EOPNOTSUPP; } @@ -641,12 +641,12 @@ ocelot_flower_parse_key(struct ocelot *ocelot, int port, bool ingress, * then just bail out */ if ((dissector->used_keys & - (BIT(FLOW_DISSECTOR_KEY_ETH_ADDRS) | - BIT(FLOW_DISSECTOR_KEY_BASIC) | - BIT(FLOW_DISSECTOR_KEY_CONTROL))) != - (BIT(FLOW_DISSECTOR_KEY_ETH_ADDRS) | - BIT(FLOW_DISSECTOR_KEY_BASIC) | - BIT(FLOW_DISSECTOR_KEY_CONTROL))) + (BIT_ULL(FLOW_DISSECTOR_KEY_ETH_ADDRS) | + BIT_ULL(FLOW_DISSECTOR_KEY_BASIC) | + BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL))) != + (BIT_ULL(FLOW_DISSECTOR_KEY_ETH_ADDRS) | + BIT_ULL(FLOW_DISSECTOR_KEY_BASIC) | + BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL))) return -EOPNOTSUPP; flow_rule_match_eth_addrs(rule, &match); diff --git a/drivers/net/ethernet/netronome/nfp/flower/conntrack.c b/drivers/net/ethernet/netronome/nfp/flower/conntrack.c index 73032173ac4e..2643c4b3ff1f 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/conntrack.c +++ b/drivers/net/ethernet/netronome/nfp/flower/conntrack.c @@ -61,7 +61,7 @@ bool is_pre_ct_flow(struct flow_cls_offload *flow) struct flow_match_ct ct; int i; - if (dissector->used_keys & BIT(FLOW_DISSECTOR_KEY_CT)) { + if (dissector->used_keys & BIT_ULL(FLOW_DISSECTOR_KEY_CT)) { flow_rule_match_ct(rule, &ct); if (ct.key->ct_state) return false; @@ -94,7 +94,7 @@ bool is_post_ct_flow(struct flow_cls_offload *flow) struct flow_match_ct ct; int i; - if (dissector->used_keys & BIT(FLOW_DISSECTOR_KEY_CT)) { + if (dissector->used_keys & BIT_ULL(FLOW_DISSECTOR_KEY_CT)) { flow_rule_match_ct(rule, &ct); if (ct.key->ct_state & TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED) return true; @@ -236,10 +236,11 @@ static bool nfp_ct_merge_check_cannot_skip(struct nfp_fl_ct_flow_entry *entry1, static int nfp_ct_merge_check(struct nfp_fl_ct_flow_entry *entry1, struct nfp_fl_ct_flow_entry *entry2) { - unsigned int ovlp_keys = entry1->rule->match.dissector->used_keys & - entry2->rule->match.dissector->used_keys; + unsigned long long ovlp_keys; bool out, is_v6 = false; u8 ip_proto = 0; + ovlp_keys = entry1->rule->match.dissector->used_keys & + entry2->rule->match.dissector->used_keys; /* Temporary buffer for mangling keys, 64 is enough to cover max * struct size of key in various fields that may be mangled. * Supported fields to mangle: @@ -257,7 +258,7 @@ static int nfp_ct_merge_check(struct nfp_fl_ct_flow_entry *entry1, /* Check the overlapped fields one by one, the unmasked part * should not conflict with each other. */ - if (ovlp_keys & BIT(FLOW_DISSECTOR_KEY_CONTROL)) { + if (ovlp_keys & BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL)) { struct flow_match_control match1, match2; flow_rule_match_control(entry1->rule, &match1); @@ -267,7 +268,7 @@ static int nfp_ct_merge_check(struct nfp_fl_ct_flow_entry *entry1, goto check_failed; } - if (ovlp_keys & BIT(FLOW_DISSECTOR_KEY_BASIC)) { + if (ovlp_keys & BIT_ULL(FLOW_DISSECTOR_KEY_BASIC)) { struct flow_match_basic match1, match2; flow_rule_match_basic(entry1->rule, &match1); @@ -289,7 +290,7 @@ static int nfp_ct_merge_check(struct nfp_fl_ct_flow_entry *entry1, * will be do merge check when do nft and post ct merge, * so skip this ip merge check here. 
*/ - if ((ovlp_keys & BIT(FLOW_DISSECTOR_KEY_IPV4_ADDRS)) && + if ((ovlp_keys & BIT_ULL(FLOW_DISSECTOR_KEY_IPV4_ADDRS)) && nfp_ct_merge_check_cannot_skip(entry1, entry2)) { struct flow_match_ipv4_addrs match1, match2; @@ -311,7 +312,7 @@ static int nfp_ct_merge_check(struct nfp_fl_ct_flow_entry *entry1, * will be do merge check when do nft and post ct merge, * so skip this ip merge check here. */ - if ((ovlp_keys & BIT(FLOW_DISSECTOR_KEY_IPV6_ADDRS)) && + if ((ovlp_keys & BIT_ULL(FLOW_DISSECTOR_KEY_IPV6_ADDRS)) && nfp_ct_merge_check_cannot_skip(entry1, entry2)) { struct flow_match_ipv6_addrs match1, match2; @@ -333,7 +334,7 @@ static int nfp_ct_merge_check(struct nfp_fl_ct_flow_entry *entry1, * will be do merge check when do nft and post ct merge, * so skip this tport merge check here. */ - if ((ovlp_keys & BIT(FLOW_DISSECTOR_KEY_PORTS)) && + if ((ovlp_keys & BIT_ULL(FLOW_DISSECTOR_KEY_PORTS)) && nfp_ct_merge_check_cannot_skip(entry1, entry2)) { enum flow_action_mangle_base htype = FLOW_ACT_MANGLE_UNSPEC; struct flow_match_ports match1, match2; @@ -355,7 +356,7 @@ static int nfp_ct_merge_check(struct nfp_fl_ct_flow_entry *entry1, goto check_failed; } - if (ovlp_keys & BIT(FLOW_DISSECTOR_KEY_ETH_ADDRS)) { + if (ovlp_keys & BIT_ULL(FLOW_DISSECTOR_KEY_ETH_ADDRS)) { struct flow_match_eth_addrs match1, match2; flow_rule_match_eth_addrs(entry1->rule, &match1); @@ -371,7 +372,7 @@ static int nfp_ct_merge_check(struct nfp_fl_ct_flow_entry *entry1, goto check_failed; } - if (ovlp_keys & BIT(FLOW_DISSECTOR_KEY_VLAN)) { + if (ovlp_keys & BIT_ULL(FLOW_DISSECTOR_KEY_VLAN)) { struct flow_match_vlan match1, match2; flow_rule_match_vlan(entry1->rule, &match1); @@ -381,7 +382,7 @@ static int nfp_ct_merge_check(struct nfp_fl_ct_flow_entry *entry1, goto check_failed; } - if (ovlp_keys & BIT(FLOW_DISSECTOR_KEY_MPLS)) { + if (ovlp_keys & BIT_ULL(FLOW_DISSECTOR_KEY_MPLS)) { struct flow_match_mpls match1, match2; flow_rule_match_mpls(entry1->rule, &match1); @@ -391,7 +392,7 @@ static int nfp_ct_merge_check(struct nfp_fl_ct_flow_entry *entry1, goto check_failed; } - if (ovlp_keys & BIT(FLOW_DISSECTOR_KEY_TCP)) { + if (ovlp_keys & BIT_ULL(FLOW_DISSECTOR_KEY_TCP)) { struct flow_match_tcp match1, match2; flow_rule_match_tcp(entry1->rule, &match1); @@ -401,7 +402,7 @@ static int nfp_ct_merge_check(struct nfp_fl_ct_flow_entry *entry1, goto check_failed; } - if (ovlp_keys & BIT(FLOW_DISSECTOR_KEY_IP)) { + if (ovlp_keys & BIT_ULL(FLOW_DISSECTOR_KEY_IP)) { struct flow_match_ip match1, match2; flow_rule_match_ip(entry1->rule, &match1); @@ -413,7 +414,7 @@ static int nfp_ct_merge_check(struct nfp_fl_ct_flow_entry *entry1, goto check_failed; } - if (ovlp_keys & BIT(FLOW_DISSECTOR_KEY_ENC_KEYID)) { + if (ovlp_keys & BIT_ULL(FLOW_DISSECTOR_KEY_ENC_KEYID)) { struct flow_match_enc_keyid match1, match2; flow_rule_match_enc_keyid(entry1->rule, &match1); @@ -423,7 +424,7 @@ static int nfp_ct_merge_check(struct nfp_fl_ct_flow_entry *entry1, goto check_failed; } - if (ovlp_keys & BIT(FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS)) { + if (ovlp_keys & BIT_ULL(FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS)) { struct flow_match_ipv4_addrs match1, match2; flow_rule_match_enc_ipv4_addrs(entry1->rule, &match1); @@ -433,7 +434,7 @@ static int nfp_ct_merge_check(struct nfp_fl_ct_flow_entry *entry1, goto check_failed; } - if (ovlp_keys & BIT(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS)) { + if (ovlp_keys & BIT_ULL(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS)) { struct flow_match_ipv6_addrs match1, match2; flow_rule_match_enc_ipv6_addrs(entry1->rule, &match1); @@ -443,7 +444,7 @@ 
static int nfp_ct_merge_check(struct nfp_fl_ct_flow_entry *entry1, goto check_failed; } - if (ovlp_keys & BIT(FLOW_DISSECTOR_KEY_ENC_CONTROL)) { + if (ovlp_keys & BIT_ULL(FLOW_DISSECTOR_KEY_ENC_CONTROL)) { struct flow_match_control match1, match2; flow_rule_match_enc_control(entry1->rule, &match1); @@ -453,7 +454,7 @@ static int nfp_ct_merge_check(struct nfp_fl_ct_flow_entry *entry1, goto check_failed; } - if (ovlp_keys & BIT(FLOW_DISSECTOR_KEY_ENC_IP)) { + if (ovlp_keys & BIT_ULL(FLOW_DISSECTOR_KEY_ENC_IP)) { struct flow_match_ip match1, match2; flow_rule_match_enc_ip(entry1->rule, &match1); @@ -463,7 +464,7 @@ static int nfp_ct_merge_check(struct nfp_fl_ct_flow_entry *entry1, goto check_failed; } - if (ovlp_keys & BIT(FLOW_DISSECTOR_KEY_ENC_OPTS)) { + if (ovlp_keys & BIT_ULL(FLOW_DISSECTOR_KEY_ENC_OPTS)) { struct flow_match_enc_opts match1, match2; flow_rule_match_enc_opts(entry1->rule, &match1); @@ -589,7 +590,7 @@ static int nfp_ct_check_meta(struct nfp_fl_ct_flow_entry *post_ct_entry, int i; ct_met = get_flow_act(nft_entry->rule, FLOW_ACTION_CT_METADATA); - if (ct_met && (dissector->used_keys & BIT(FLOW_DISSECTOR_KEY_CT))) { + if (ct_met && (dissector->used_keys & BIT_ULL(FLOW_DISSECTOR_KEY_CT))) { u32 *act_lbl; act_lbl = ct_met->ct_metadata.labels; diff --git a/drivers/net/ethernet/netronome/nfp/flower/offload.c b/drivers/net/ethernet/netronome/nfp/flower/offload.c index 18328eb7f5c3..c153f0575b92 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/offload.c +++ b/drivers/net/ethernet/netronome/nfp/flower/offload.c @@ -24,43 +24,43 @@ FLOW_DIS_FIRST_FRAG) #define NFP_FLOWER_WHITELIST_DISSECTOR \ - (BIT(FLOW_DISSECTOR_KEY_CONTROL) | \ - BIT(FLOW_DISSECTOR_KEY_BASIC) | \ - BIT(FLOW_DISSECTOR_KEY_IPV4_ADDRS) | \ - BIT(FLOW_DISSECTOR_KEY_IPV6_ADDRS) | \ - BIT(FLOW_DISSECTOR_KEY_TCP) | \ - BIT(FLOW_DISSECTOR_KEY_PORTS) | \ - BIT(FLOW_DISSECTOR_KEY_ETH_ADDRS) | \ - BIT(FLOW_DISSECTOR_KEY_VLAN) | \ - BIT(FLOW_DISSECTOR_KEY_CVLAN) | \ - BIT(FLOW_DISSECTOR_KEY_ENC_KEYID) | \ - BIT(FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS) | \ - BIT(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS) | \ - BIT(FLOW_DISSECTOR_KEY_ENC_CONTROL) | \ - BIT(FLOW_DISSECTOR_KEY_ENC_PORTS) | \ - BIT(FLOW_DISSECTOR_KEY_ENC_OPTS) | \ - BIT(FLOW_DISSECTOR_KEY_ENC_IP) | \ - BIT(FLOW_DISSECTOR_KEY_MPLS) | \ - BIT(FLOW_DISSECTOR_KEY_CT) | \ - BIT(FLOW_DISSECTOR_KEY_META) | \ - BIT(FLOW_DISSECTOR_KEY_IP)) + (BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL) | \ + BIT_ULL(FLOW_DISSECTOR_KEY_BASIC) | \ + BIT_ULL(FLOW_DISSECTOR_KEY_IPV4_ADDRS) | \ + BIT_ULL(FLOW_DISSECTOR_KEY_IPV6_ADDRS) | \ + BIT_ULL(FLOW_DISSECTOR_KEY_TCP) | \ + BIT_ULL(FLOW_DISSECTOR_KEY_PORTS) | \ + BIT_ULL(FLOW_DISSECTOR_KEY_ETH_ADDRS) | \ + BIT_ULL(FLOW_DISSECTOR_KEY_VLAN) | \ + BIT_ULL(FLOW_DISSECTOR_KEY_CVLAN) | \ + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_KEYID) | \ + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS) | \ + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS) | \ + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_CONTROL) | \ + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_PORTS) | \ + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_OPTS) | \ + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_IP) | \ + BIT_ULL(FLOW_DISSECTOR_KEY_MPLS) | \ + BIT_ULL(FLOW_DISSECTOR_KEY_CT) | \ + BIT_ULL(FLOW_DISSECTOR_KEY_META) | \ + BIT_ULL(FLOW_DISSECTOR_KEY_IP)) #define NFP_FLOWER_WHITELIST_TUN_DISSECTOR \ - (BIT(FLOW_DISSECTOR_KEY_ENC_CONTROL) | \ - BIT(FLOW_DISSECTOR_KEY_ENC_KEYID) | \ - BIT(FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS) | \ - BIT(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS) | \ - BIT(FLOW_DISSECTOR_KEY_ENC_OPTS) | \ - BIT(FLOW_DISSECTOR_KEY_ENC_PORTS) | \ - 
BIT(FLOW_DISSECTOR_KEY_ENC_IP)) + (BIT_ULL(FLOW_DISSECTOR_KEY_ENC_CONTROL) | \ + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_KEYID) | \ + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS) | \ + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS) | \ + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_OPTS) | \ + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_PORTS) | \ + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_IP)) #define NFP_FLOWER_WHITELIST_TUN_DISSECTOR_R \ - (BIT(FLOW_DISSECTOR_KEY_ENC_CONTROL) | \ - BIT(FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS)) + (BIT_ULL(FLOW_DISSECTOR_KEY_ENC_CONTROL) | \ + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS)) #define NFP_FLOWER_WHITELIST_TUN_DISSECTOR_V6_R \ - (BIT(FLOW_DISSECTOR_KEY_ENC_CONTROL) | \ - BIT(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS)) + (BIT_ULL(FLOW_DISSECTOR_KEY_ENC_CONTROL) | \ + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS)) #define NFP_FLOWER_MERGE_FIELDS \ (NFP_FLOWER_LAYER_PORT | \ @@ -1303,7 +1303,7 @@ static bool offload_pre_check(struct flow_cls_offload *flow) struct flow_dissector *dissector = rule->match.dissector; struct flow_match_ct ct; - if (dissector->used_keys & BIT(FLOW_DISSECTOR_KEY_CT)) { + if (dissector->used_keys & BIT_ULL(FLOW_DISSECTOR_KEY_CT)) { flow_rule_match_ct(rule, &ct); /* Allow special case where CT match is all 0 */ if (memchr_inv(ct.key, 0, sizeof(*ct.key))) diff --git a/drivers/net/ethernet/qlogic/qede/qede_filter.c b/drivers/net/ethernet/qlogic/qede/qede_filter.c index 3010833ddde3..a5ac21a0ee33 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_filter.c +++ b/drivers/net/ethernet/qlogic/qede/qede_filter.c @@ -1827,12 +1827,12 @@ qede_parse_flow_attr(struct qede_dev *edev, __be16 proto, memset(tuple, 0, sizeof(*tuple)); if (dissector->used_keys & - ~(BIT(FLOW_DISSECTOR_KEY_CONTROL) | - BIT(FLOW_DISSECTOR_KEY_IPV4_ADDRS) | - BIT(FLOW_DISSECTOR_KEY_BASIC) | - BIT(FLOW_DISSECTOR_KEY_IPV6_ADDRS) | - BIT(FLOW_DISSECTOR_KEY_PORTS))) { - DP_NOTICE(edev, "Unsupported key set:0x%x\n", + ~(BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL) | + BIT_ULL(FLOW_DISSECTOR_KEY_IPV4_ADDRS) | + BIT_ULL(FLOW_DISSECTOR_KEY_BASIC) | + BIT_ULL(FLOW_DISSECTOR_KEY_IPV6_ADDRS) | + BIT_ULL(FLOW_DISSECTOR_KEY_PORTS))) { + DP_NOTICE(edev, "Unsupported key set:0x%llx\n", dissector->used_keys); return -EOPNOTSUPP; } diff --git a/drivers/net/ethernet/sfc/tc.c b/drivers/net/ethernet/sfc/tc.c index 15ebd3973922..4dc881159246 100644 --- a/drivers/net/ethernet/sfc/tc.c +++ b/drivers/net/ethernet/sfc/tc.c @@ -201,23 +201,23 @@ static int efx_tc_flower_parse_match(struct efx_nic *efx, } } if (dissector->used_keys & - ~(BIT(FLOW_DISSECTOR_KEY_CONTROL) | - BIT(FLOW_DISSECTOR_KEY_BASIC) | - BIT(FLOW_DISSECTOR_KEY_ETH_ADDRS) | - BIT(FLOW_DISSECTOR_KEY_VLAN) | - BIT(FLOW_DISSECTOR_KEY_CVLAN) | - BIT(FLOW_DISSECTOR_KEY_IPV4_ADDRS) | - BIT(FLOW_DISSECTOR_KEY_IPV6_ADDRS) | - BIT(FLOW_DISSECTOR_KEY_PORTS) | - BIT(FLOW_DISSECTOR_KEY_ENC_KEYID) | - BIT(FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS) | - BIT(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS) | - BIT(FLOW_DISSECTOR_KEY_ENC_IP) | - BIT(FLOW_DISSECTOR_KEY_ENC_PORTS) | - BIT(FLOW_DISSECTOR_KEY_ENC_CONTROL) | - BIT(FLOW_DISSECTOR_KEY_TCP) | - BIT(FLOW_DISSECTOR_KEY_IP))) { - NL_SET_ERR_MSG_FMT_MOD(extack, "Unsupported flower keys %#x", + ~(BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL) | + BIT_ULL(FLOW_DISSECTOR_KEY_BASIC) | + BIT_ULL(FLOW_DISSECTOR_KEY_ETH_ADDRS) | + BIT_ULL(FLOW_DISSECTOR_KEY_VLAN) | + BIT_ULL(FLOW_DISSECTOR_KEY_CVLAN) | + BIT_ULL(FLOW_DISSECTOR_KEY_IPV4_ADDRS) | + BIT_ULL(FLOW_DISSECTOR_KEY_IPV6_ADDRS) | + BIT_ULL(FLOW_DISSECTOR_KEY_PORTS) | + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_KEYID) | + 
BIT_ULL(FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS) | + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS) | + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_IP) | + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_PORTS) | + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_CONTROL) | + BIT_ULL(FLOW_DISSECTOR_KEY_TCP) | + BIT_ULL(FLOW_DISSECTOR_KEY_IP))) { + NL_SET_ERR_MSG_FMT_MOD(extack, "Unsupported flower keys %#llx", dissector->used_keys); return -EOPNOTSUPP; } @@ -228,12 +228,13 @@ static int efx_tc_flower_parse_match(struct efx_nic *efx, !(match->value.eth_proto == htons(ETH_P_IP) || match->value.eth_proto == htons(ETH_P_IPV6))) if (dissector->used_keys & - (BIT(FLOW_DISSECTOR_KEY_IPV4_ADDRS) | - BIT(FLOW_DISSECTOR_KEY_IPV6_ADDRS) | - BIT(FLOW_DISSECTOR_KEY_PORTS) | - BIT(FLOW_DISSECTOR_KEY_IP) | - BIT(FLOW_DISSECTOR_KEY_TCP))) { - NL_SET_ERR_MSG_FMT_MOD(extack, "L3/L4 flower keys %#x require protocol ipv[46]", + (BIT_ULL(FLOW_DISSECTOR_KEY_IPV4_ADDRS) | + BIT_ULL(FLOW_DISSECTOR_KEY_IPV6_ADDRS) | + BIT_ULL(FLOW_DISSECTOR_KEY_PORTS) | + BIT_ULL(FLOW_DISSECTOR_KEY_IP) | + BIT_ULL(FLOW_DISSECTOR_KEY_TCP))) { + NL_SET_ERR_MSG_FMT_MOD(extack, + "L3/L4 flower keys %#llx require protocol ipv[46]", dissector->used_keys); return -EINVAL; } @@ -281,9 +282,10 @@ static int efx_tc_flower_parse_match(struct efx_nic *efx, if ((match->value.ip_proto != IPPROTO_UDP && match->value.ip_proto != IPPROTO_TCP) || !IS_ALL_ONES(match->mask.ip_proto)) if (dissector->used_keys & - (BIT(FLOW_DISSECTOR_KEY_PORTS) | - BIT(FLOW_DISSECTOR_KEY_TCP))) { - NL_SET_ERR_MSG_FMT_MOD(extack, "L4 flower keys %#x require ipproto udp or tcp", + (BIT_ULL(FLOW_DISSECTOR_KEY_PORTS) | + BIT_ULL(FLOW_DISSECTOR_KEY_TCP))) { + NL_SET_ERR_MSG_FMT_MOD(extack, + "L4 flower keys %#llx require ipproto udp or tcp", dissector->used_keys); return -EINVAL; } @@ -344,12 +346,13 @@ static int efx_tc_flower_parse_match(struct efx_nic *efx, MAP_ENC_KEY_AND_MASK(PORTS, ports, enc_ports, dst, enc_dport); MAP_ENC_KEY_AND_MASK(KEYID, enc_keyid, enc_keyid, keyid, enc_keyid); } else if (dissector->used_keys & - (BIT(FLOW_DISSECTOR_KEY_ENC_KEYID) | - BIT(FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS) | - BIT(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS) | - BIT(FLOW_DISSECTOR_KEY_ENC_IP) | - BIT(FLOW_DISSECTOR_KEY_ENC_PORTS))) { - NL_SET_ERR_MSG_FMT_MOD(extack, "Flower enc keys require enc_control (keys: %#x)", + (BIT_ULL(FLOW_DISSECTOR_KEY_ENC_KEYID) | + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS) | + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS) | + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_IP) | + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_PORTS))) { + NL_SET_ERR_MSG_FMT_MOD(extack, + "Flower enc keys require enc_control (keys: %#llx)", dissector->used_keys); return -EOPNOTSUPP; } diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c index 687f43cd466c..f9e43fc32ee8 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c @@ -1355,7 +1355,7 @@ static int __stmmac_test_l3filt(struct stmmac_priv *priv, u32 dst, u32 src, goto cleanup_rss; } - dissector->used_keys |= (1 << FLOW_DISSECTOR_KEY_IPV4_ADDRS); + dissector->used_keys |= (1ULL << FLOW_DISSECTOR_KEY_IPV4_ADDRS); dissector->offset[FLOW_DISSECTOR_KEY_IPV4_ADDRS] = 0; cls = kzalloc(sizeof(*cls), GFP_KERNEL); @@ -1481,8 +1481,8 @@ static int __stmmac_test_l4filt(struct stmmac_priv *priv, u32 dst, u32 src, goto cleanup_rss; } - dissector->used_keys |= (1 << FLOW_DISSECTOR_KEY_BASIC); - dissector->used_keys |= (1 << FLOW_DISSECTOR_KEY_PORTS); + dissector->used_keys |= (1ULL 
<< FLOW_DISSECTOR_KEY_BASIC); + dissector->used_keys |= (1ULL << FLOW_DISSECTOR_KEY_PORTS); dissector->offset[FLOW_DISSECTOR_KEY_BASIC] = 0; dissector->offset[FLOW_DISSECTOR_KEY_PORTS] = offsetof(typeof(keys), key); diff --git a/drivers/net/ethernet/ti/am65-cpsw-qos.c b/drivers/net/ethernet/ti/am65-cpsw-qos.c index eced87fa261c..9ac2ff05d501 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-qos.c +++ b/drivers/net/ethernet/ti/am65-cpsw-qos.c @@ -624,9 +624,9 @@ static int am65_cpsw_qos_clsflower_add_policer(struct am65_cpsw_port *port, int ret; if (dissector->used_keys & - ~(BIT(FLOW_DISSECTOR_KEY_BASIC) | - BIT(FLOW_DISSECTOR_KEY_CONTROL) | - BIT(FLOW_DISSECTOR_KEY_ETH_ADDRS))) { + ~(BIT_ULL(FLOW_DISSECTOR_KEY_BASIC) | + BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL) | + BIT_ULL(FLOW_DISSECTOR_KEY_ETH_ADDRS))) { NL_SET_ERR_MSG_MOD(extack, "Unsupported keys used"); return -EOPNOTSUPP; diff --git a/drivers/net/ethernet/ti/cpsw_priv.c b/drivers/net/ethernet/ti/cpsw_priv.c index e966dd47e2db..ae52cdbcf8cc 100644 --- a/drivers/net/ethernet/ti/cpsw_priv.c +++ b/drivers/net/ethernet/ti/cpsw_priv.c @@ -1396,9 +1396,9 @@ static int cpsw_qos_clsflower_add_policer(struct cpsw_priv *priv, int ret; if (dissector->used_keys & - ~(BIT(FLOW_DISSECTOR_KEY_BASIC) | - BIT(FLOW_DISSECTOR_KEY_CONTROL) | - BIT(FLOW_DISSECTOR_KEY_ETH_ADDRS))) { + ~(BIT_ULL(FLOW_DISSECTOR_KEY_BASIC) | + BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL) | + BIT_ULL(FLOW_DISSECTOR_KEY_ETH_ADDRS))) { NL_SET_ERR_MSG_MOD(extack, "Unsupported keys used"); return -EOPNOTSUPP; diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 8664ed4fbbdf..830f06b2f36d 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -370,7 +370,8 @@ struct flow_dissector_key { }; struct flow_dissector { - unsigned int used_keys; /* each bit repesents presence of one key id */ + unsigned long long used_keys; + /* each bit represents presence of one key id */ unsigned short int offset[FLOW_DISSECTOR_KEY_MAX]; }; @@ -430,7 +431,7 @@ void skb_flow_get_icmp_tci(const struct sk_buff *skb, static inline bool dissector_uses_key(const struct flow_dissector *flow_dissector, enum flow_dissector_key_id key_id) { - return flow_dissector->used_keys & (1 << key_id); + return flow_dissector->used_keys & (1ULL << key_id); } static inline void *skb_flow_dissector_target(struct flow_dissector *flow_dissector, diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 85a2d0d9bd39..ed5dfa376024 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -40,7 +40,7 @@ static void dissector_set_key(struct flow_dissector *flow_dissector, enum flow_dissector_key_id key_id) { - flow_dissector->used_keys |= (1 << key_id); + flow_dissector->used_keys |= (1ULL << key_id); } void skb_flow_dissector_init(struct flow_dissector *flow_dissector, diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 7d40e7913e76..0b0ce4f81c01 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -3208,7 +3208,7 @@ ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input) if (v4_m_spec->ip4src || v4_m_spec->ip4dst) { match->dissector.used_keys |= - BIT(FLOW_DISSECTOR_KEY_IPV4_ADDRS); + BIT_ULL(FLOW_DISSECTOR_KEY_IPV4_ADDRS); match->dissector.offset[FLOW_DISSECTOR_KEY_IPV4_ADDRS] = offsetof(struct ethtool_rx_flow_key, ipv4); } @@ -3223,7 +3223,7 @@ ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input) if (v4_m_spec->psrc || v4_m_spec->pdst) { match->dissector.used_keys |= - BIT(FLOW_DISSECTOR_KEY_PORTS); + 
BIT_ULL(FLOW_DISSECTOR_KEY_PORTS); match->dissector.offset[FLOW_DISSECTOR_KEY_PORTS] = offsetof(struct ethtool_rx_flow_key, tp); } @@ -3260,7 +3260,7 @@ ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input) if (!ipv6_addr_any((struct in6_addr *)v6_m_spec->ip6src) || !ipv6_addr_any((struct in6_addr *)v6_m_spec->ip6dst)) { match->dissector.used_keys |= - BIT(FLOW_DISSECTOR_KEY_IPV6_ADDRS); + BIT_ULL(FLOW_DISSECTOR_KEY_IPV6_ADDRS); match->dissector.offset[FLOW_DISSECTOR_KEY_IPV6_ADDRS] = offsetof(struct ethtool_rx_flow_key, ipv6); } @@ -3275,7 +3275,7 @@ ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input) if (v6_m_spec->psrc || v6_m_spec->pdst) { match->dissector.used_keys |= - BIT(FLOW_DISSECTOR_KEY_PORTS); + BIT_ULL(FLOW_DISSECTOR_KEY_PORTS); match->dissector.offset[FLOW_DISSECTOR_KEY_PORTS] = offsetof(struct ethtool_rx_flow_key, tp); } @@ -3283,7 +3283,7 @@ ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input) match->key.ip.tos = v6_spec->tclass; match->mask.ip.tos = v6_m_spec->tclass; match->dissector.used_keys |= - BIT(FLOW_DISSECTOR_KEY_IP); + BIT_ULL(FLOW_DISSECTOR_KEY_IP); match->dissector.offset[FLOW_DISSECTOR_KEY_IP] = offsetof(struct ethtool_rx_flow_key, ip); } @@ -3307,7 +3307,7 @@ ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input) break; } - match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_BASIC); + match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_BASIC); match->dissector.offset[FLOW_DISSECTOR_KEY_BASIC] = offsetof(struct ethtool_rx_flow_key, basic); @@ -3340,7 +3340,7 @@ ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input) if (ext_m_spec->vlan_etype || ext_m_spec->vlan_tci) { match->dissector.used_keys |= - BIT(FLOW_DISSECTOR_KEY_VLAN); + BIT_ULL(FLOW_DISSECTOR_KEY_VLAN); match->dissector.offset[FLOW_DISSECTOR_KEY_VLAN] = offsetof(struct ethtool_rx_flow_key, vlan); } @@ -3355,7 +3355,7 @@ ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input) ETH_ALEN); match->dissector.used_keys |= - BIT(FLOW_DISSECTOR_KEY_ETH_ADDRS); + BIT_ULL(FLOW_DISSECTOR_KEY_ETH_ADDRS); match->dissector.offset[FLOW_DISSECTOR_KEY_ETH_ADDRS] = offsetof(struct ethtool_rx_flow_key, eth_addrs); } diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index 1c26f03fc661..a010b25076ca 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -34,7 +34,7 @@ static void nf_flow_rule_lwt_match(struct nf_flow_match *match, { struct nf_flow_key *mask = &match->mask; struct nf_flow_key *key = &match->key; - unsigned int enc_keys; + unsigned long long enc_keys; if (!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX)) return; @@ -43,8 +43,8 @@ static void nf_flow_rule_lwt_match(struct nf_flow_match *match, NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_ENC_KEYID, enc_key_id); key->enc_key_id.keyid = tunnel_id_to_key32(tun_info->key.tun_id); mask->enc_key_id.keyid = 0xffffffff; - enc_keys = BIT(FLOW_DISSECTOR_KEY_ENC_KEYID) | - BIT(FLOW_DISSECTOR_KEY_ENC_CONTROL); + enc_keys = BIT_ULL(FLOW_DISSECTOR_KEY_ENC_KEYID) | + BIT_ULL(FLOW_DISSECTOR_KEY_ENC_CONTROL); if (ip_tunnel_info_af(tun_info) == AF_INET) { NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, @@ -55,7 +55,7 @@ static void nf_flow_rule_lwt_match(struct nf_flow_match *match, mask->enc_ipv4.src = 0xffffffff; if (key->enc_ipv4.dst) mask->enc_ipv4.dst = 0xffffffff; - enc_keys |= BIT(FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS); + enc_keys |= 
BIT_ULL(FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS); key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; } else { memcpy(&key->enc_ipv6.src, &tun_info->key.u.ipv6.dst, @@ -70,7 +70,7 @@ static void nf_flow_rule_lwt_match(struct nf_flow_match *match, sizeof(struct in6_addr))) memset(&mask->enc_ipv6.dst, 0xff, sizeof(struct in6_addr)); - enc_keys |= BIT(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS); + enc_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS); key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; } @@ -163,14 +163,14 @@ static int nf_flow_rule_match(struct nf_flow_match *match, return -EOPNOTSUPP; } mask->control.addr_type = 0xffff; - match->dissector.used_keys |= BIT(key->control.addr_type); + match->dissector.used_keys |= BIT_ULL(key->control.addr_type); mask->basic.n_proto = 0xffff; switch (tuple->l4proto) { case IPPROTO_TCP: key->tcp.flags = 0; mask->tcp.flags = cpu_to_be16(be32_to_cpu(TCP_FLAG_RST | TCP_FLAG_FIN) >> 16); - match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_TCP); + match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_TCP); break; case IPPROTO_UDP: case IPPROTO_GRE: @@ -182,9 +182,9 @@ static int nf_flow_rule_match(struct nf_flow_match *match, key->basic.ip_proto = tuple->l4proto; mask->basic.ip_proto = 0xff; - match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_META) | - BIT(FLOW_DISSECTOR_KEY_CONTROL) | - BIT(FLOW_DISSECTOR_KEY_BASIC); + match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_META) | + BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL) | + BIT_ULL(FLOW_DISSECTOR_KEY_BASIC); switch (tuple->l4proto) { case IPPROTO_TCP: @@ -194,7 +194,7 @@ static int nf_flow_rule_match(struct nf_flow_match *match, key->tp.dst = tuple->dst_port; mask->tp.dst = 0xffff; - match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_PORTS); + match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_PORTS); break; } diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index 910ef881c3b8..12ab78fa5d84 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -35,12 +35,12 @@ void nft_flow_rule_set_addr_type(struct nft_flow_rule *flow, struct nft_flow_key *mask = &match->mask; struct nft_flow_key *key = &match->key; - if (match->dissector.used_keys & BIT(FLOW_DISSECTOR_KEY_CONTROL)) + if (match->dissector.used_keys & BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL)) return; key->control.addr_type = addr_type; mask->control.addr_type = 0xffff; - match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_CONTROL); + match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL); match->dissector.offset[FLOW_DISSECTOR_KEY_CONTROL] = offsetof(struct nft_flow_key, control); } @@ -59,7 +59,7 @@ static void nft_flow_rule_transfer_vlan(struct nft_offload_ctx *ctx, .mask = match->mask.basic.n_proto, }; - if (match->dissector.used_keys & BIT(FLOW_DISSECTOR_KEY_VLAN) && + if (match->dissector.used_keys & BIT_ULL(FLOW_DISSECTOR_KEY_VLAN) && (match->key.vlan.vlan_tpid == htons(ETH_P_8021Q) || match->key.vlan.vlan_tpid == htons(ETH_P_8021AD))) { match->key.basic.n_proto = match->key.cvlan.vlan_tpid; @@ -70,8 +70,9 @@ static void nft_flow_rule_transfer_vlan(struct nft_offload_ctx *ctx, match->mask.vlan.vlan_tpid = ethertype.mask; match->dissector.offset[FLOW_DISSECTOR_KEY_CVLAN] = offsetof(struct nft_flow_key, cvlan); - match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_CVLAN); - } else if (match->dissector.used_keys & BIT(FLOW_DISSECTOR_KEY_BASIC) && + match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_CVLAN); + } else if 
(match->dissector.used_keys & + BIT_ULL(FLOW_DISSECTOR_KEY_BASIC) && (match->key.basic.n_proto == htons(ETH_P_8021Q) || match->key.basic.n_proto == htons(ETH_P_8021AD))) { match->key.basic.n_proto = match->key.vlan.vlan_tpid; @@ -80,7 +81,7 @@ static void nft_flow_rule_transfer_vlan(struct nft_offload_ctx *ctx, match->mask.vlan.vlan_tpid = ethertype.mask; match->dissector.offset[FLOW_DISSECTOR_KEY_VLAN] = offsetof(struct nft_flow_key, vlan); - match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_VLAN); + match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_VLAN); } } diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c index 6eb21a4f5698..cd4652259095 100644 --- a/net/netfilter/nft_cmp.c +++ b/net/netfilter/nft_cmp.c @@ -162,7 +162,7 @@ static int __nft_cmp_offload(struct nft_offload_ctx *ctx, memcpy(key + reg->offset, data, reg->len); memcpy(mask + reg->offset, datamask, reg->len); - flow->match.dissector.used_keys |= BIT(reg->key); + flow->match.dissector.used_keys |= BIT_ULL(reg->key); flow->match.dissector.offset[reg->key] = reg->base_offset; if (reg->key == FLOW_DISSECTOR_KEY_META && -- cgit v1.2.3 From 8936bf53a091ad6a34b480c22002f1cb2422ab38 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 28 Jul 2023 17:48:13 -0700 Subject: net: Use sockaddr_storage for getsockopt(SO_PEERNAME). Commit df8fc4e934c1 ("kbuild: Enable -fstrict-flex-arrays=3") started applying strict rules to standard string functions. It does not work well with conventional socket code around each protocol- specific sockaddr_XXX struct, which is cast from sockaddr_storage and has a bigger size than fortified functions expect. See these commits: commit 06d4c8a80836 ("af_unix: Fix fortify_panic() in unix_bind_bsd().") commit ecb4534b6a1c ("af_unix: Terminate sun_path when bind()ing pathname socket.") commit a0ade8404c3b ("af_packet: Fix warning of fortified memcpy() in packet_getname().") We must cast the protocol-specific address back to sockaddr_storage to call such functions. However, in the case of getsockaddr(SO_PEERNAME), the rationale is a bit unclear as the buffer is defined by char[128] which is the same size as sockaddr_storage. Let's use sockaddr_storage explicitly. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/core/sock.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index ab1e8d1bd5a1..4ad267ba0099 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1806,14 +1806,14 @@ int sk_getsockopt(struct sock *sk, int level, int optname, case SO_PEERNAME: { - char address[128]; + struct sockaddr_storage address; - lv = sock->ops->getname(sock, (struct sockaddr *)address, 2); + lv = sock->ops->getname(sock, (struct sockaddr *)&address, 2); if (lv < 0) return -ENOTCONN; if (lv < len) return -EINVAL; - if (copy_to_sockptr(optval, address, len)) + if (copy_to_sockptr(optval, &address, len)) return -EFAULT; goto lenout; } -- cgit v1.2.3 From 74bdfab4fd7c641e55f7fe9d1be9687eeb01df67 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 31 Jul 2023 11:42:53 +0200 Subject: net: remove duplicate INDIRECT_CALLABLE_DECLARE of udp[6]_ehashfn There are already INDIRECT_CALLABLE_DECLARE in the hashtable headers, no need to declare them again. 
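As background (illustrative note, not part of the patch): INDIRECT_CALLABLE_DECLARE() only emits a declaration when the indirect-call wrappers are active (retpoline builds) and expands to nothing otherwise, so the single declaration already present in the hashtable headers is all that the INDIRECT_CALL_*() call sites need. A simplified sketch of the macro pair; the exact config guard and macro bodies are paraphrased from memory and are an assumption here:

/* Simplified sketch; the real definitions live in
 * include/linux/indirect_call_wrapper.h.
 */
#ifdef CONFIG_RETPOLINE
#define INDIRECT_CALLABLE_DECLARE(f)	f	/* keep the declaration */
#define INDIRECT_CALL_1(f, f1, ...) \
	(likely((f) == (f1)) ? f1(__VA_ARGS__) : f(__VA_ARGS__))
#else
#define INDIRECT_CALLABLE_DECLARE(f)		/* compiled out */
#define INDIRECT_CALL_1(f, f1, ...)	f(__VA_ARGS__)
#endif

/* One declaration in the shared header is therefore enough: */
INDIRECT_CALLABLE_DECLARE(inet_ehashfn_t udp_ehashfn);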
Fixes: 0f495f761722 ("net: remove duplicate reuseport_lookup functions") Suggested-by: Martin Lau Signed-off-by: Lorenz Bauer Reviewed-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20230731-indir-call-v1-1-4cd0aeaee64f@isovalent.com Signed-off-by: Martin KaFai Lau --- net/ipv4/inet_hashtables.c | 2 -- net/ipv6/inet6_hashtables.c | 2 -- 2 files changed, 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 6a872b8fb0d3..7876b7d703cb 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -333,8 +333,6 @@ static inline int compute_score(struct sock *sk, struct net *net, return score; } -INDIRECT_CALLABLE_DECLARE(inet_ehashfn_t udp_ehashfn); - /** * inet_lookup_reuseport() - execute reuseport logic on AF_INET socket if necessary. * @net: network namespace. diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 7c9700c7c9c8..b0e8d278e8a9 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -112,8 +112,6 @@ static inline int compute_score(struct sock *sk, struct net *net, return score; } -INDIRECT_CALLABLE_DECLARE(inet6_ehashfn_t udp6_ehashfn); - /** * inet6_lookup_reuseport() - execute reuseport logic on AF_INET6 socket if necessary. * @net: network namespace. -- cgit v1.2.3 From 4cbc32a8a2b44fc7e91ae3006e5da3eaf630e383 Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Sat, 29 Jul 2023 20:19:29 +0800 Subject: net/smc: Remove unused function declarations commit f9aab6f2ce57 ("net/smc: immediate freeing in smc_lgr_cleanup_early()") left behind smc_lgr_schedule_free_work_fast() declaration. And since commit 349d43127dac ("net/smc: fix kernel panic caused by race of smc_sock") smc_ib_modify_qp_reset() is not used anymore. Signed-off-by: Yue Haibing Reviewed-by: Simon Horman Reviewed-by: Tony Lu Reviewed-by: Wenjia Zhang Link: https://lore.kernel.org/r/20230729121929.17180-1-yuehaibing@huawei.com Signed-off-by: Jakub Kicinski --- net/smc/smc_core.h | 1 - net/smc/smc_ib.h | 1 - 2 files changed, 2 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 1645fba0d2d3..3c1b31bfa1cf 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -539,7 +539,6 @@ int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini); void smc_conn_free(struct smc_connection *conn); int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini); -void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr); int smc_core_init(void); void smc_core_exit(void); diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h index 034295676e88..4df5f8c8a0a1 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -96,7 +96,6 @@ void smc_ib_destroy_queue_pair(struct smc_link *lnk); int smc_ib_create_queue_pair(struct smc_link *lnk); int smc_ib_ready_link(struct smc_link *lnk); int smc_ib_modify_qp_rts(struct smc_link *lnk); -int smc_ib_modify_qp_reset(struct smc_link *lnk); int smc_ib_modify_qp_error(struct smc_link *lnk); long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev); int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags, -- cgit v1.2.3 From 634e44971981b1c197fa3152d7a6930c8150060e Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Sat, 29 Jul 2023 20:20:36 +0800 Subject: vsock: Remove unused function declarations These are never implemented since introduction in commit d021c344051a ("VSOCK: Introduce VM Sockets") Signed-off-by: Yue Haibing Reviewed-by: Simon Horman Reviewed-by: Stefano Garzarella Link: 
https://lore.kernel.org/r/20230729122036.32988-1-yuehaibing@huawei.com Signed-off-by: Jakub Kicinski --- net/vmw_vsock/vmci_transport.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/vmw_vsock/vmci_transport.h b/net/vmw_vsock/vmci_transport.h index b7b072194282..dbda3ababa14 100644 --- a/net/vmw_vsock/vmci_transport.h +++ b/net/vmw_vsock/vmci_transport.h @@ -116,9 +116,6 @@ struct vmci_transport { spinlock_t lock; /* protects sk. */ }; -int vmci_transport_register(void); -void vmci_transport_unregister(void); - int vmci_transport_send_wrote_bh(struct sockaddr_vm *dst, struct sockaddr_vm *src); int vmci_transport_send_read_bh(struct sockaddr_vm *dst, -- cgit v1.2.3 From 81584c23f249ac2d809e2f89b76a7a9a02c09d8a Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Mon, 31 Jul 2023 15:55:00 -0600 Subject: netfilter: bpf: Only define get_proto_defrag_hook() if necessary Before, we were getting this warning: net/netfilter/nf_bpf_link.c:32:1: warning: 'get_proto_defrag_hook' defined but not used [-Wunused-function] Guard the definition with CONFIG_NF_DEFRAG_IPV[4|6]. Fixes: 91721c2d02d3 ("netfilter: bpf: Support BPF_F_NETFILTER_IP_DEFRAG in netfilter link") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202307291213.fZ0zDmoG-lkp@intel.com/ Signed-off-by: Daniel Xu Link: https://lore.kernel.org/r/b128b6489f0066db32c4772ae4aaee1480495929.1690840454.git.dxu@dxuuu.xyz Signed-off-by: Alexei Starovoitov --- net/netfilter/nf_bpf_link.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_bpf_link.c b/net/netfilter/nf_bpf_link.c index 8fe594bbc7e2..e502ec00b2fe 100644 --- a/net/netfilter/nf_bpf_link.c +++ b/net/netfilter/nf_bpf_link.c @@ -28,6 +28,7 @@ struct bpf_nf_link { const struct nf_defrag_hook *defrag_hook; }; +#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) || IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) static const struct nf_defrag_hook * get_proto_defrag_hook(struct bpf_nf_link *link, const struct nf_defrag_hook __rcu *global_hook, @@ -68,6 +69,7 @@ get_proto_defrag_hook(struct bpf_nf_link *link, return hook; } +#endif static int bpf_nf_enable_defrag(struct bpf_nf_link *link) { -- cgit v1.2.3 From 2f48401dd0f2d93a41d28bb8dc3474158f4e906c Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Sat, 29 Jul 2023 20:34:56 +0800 Subject: net/hsr: Remove unused function declarations commit f421436a591d ("net/hsr: Add support for the High-availability Seamless Redundancy protocol (HSRv0)") introducted these but never implemented. Signed-off-by: Yue Haibing Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230729123456.36340-1-yuehaibing@huawei.com Signed-off-by: Jakub Kicinski --- net/hsr/hsr_netlink.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/hsr/hsr_netlink.h b/net/hsr/hsr_netlink.h index 501552d9753b..8c99e64e1cea 100644 --- a/net/hsr/hsr_netlink.h +++ b/net/hsr/hsr_netlink.h @@ -23,7 +23,5 @@ void __exit hsr_netlink_exit(void); void hsr_nl_ringerror(struct hsr_priv *hsr, unsigned char addr[ETH_ALEN], struct hsr_port *port); void hsr_nl_nodedown(struct hsr_priv *hsr, unsigned char addr[ETH_ALEN]); -void hsr_nl_framedrop(int dropcount, int dev_idx); -void hsr_nl_linkdown(int dev_idx); #endif /* __HSR_NETLINK_H */ -- cgit v1.2.3 From 8798481b667fa7c9bbd5aa843bf1557ada699964 Mon Sep 17 00:00:00 2001 From: Pedro Tammela Date: Fri, 28 Jul 2023 12:35:33 -0300 Subject: net/sched: wrap open coded Qdics class filter counter The 'filter_cnt' counter is used to control a Qdisc class lifetime. 
Each filter referecing this class by its id will eventually increment/decrement this counter in their respective 'add/update/delete' routines. As these operations are always serialized under rtnl lock, we don't need an atomic type like 'refcount_t'. It also means that we lose the overflow/underflow checks already present in refcount_t, which are valuable to hunt down bugs where the unsigned counter wraps around as it aids automated tools like syzkaller to scream in such situations. Wrap the open coded increment/decrement into helper functions and add overflow checks to the operations. Acked-by: Jamal Hadi Salim Signed-off-by: Pedro Tammela Reviewed-by: Simon Horman Signed-off-by: Paolo Abeni --- include/net/sch_generic.h | 26 ++++++++++++++++++++++++++ net/sched/sch_drr.c | 9 ++++----- net/sched/sch_hfsc.c | 8 ++++---- net/sched/sch_htb.c | 8 +++----- net/sched/sch_qfq.c | 10 ++++------ 5 files changed, 41 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 15be2d96b06d..f232512505f8 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -599,6 +599,7 @@ get_default_qdisc_ops(const struct net_device *dev, int ntx) struct Qdisc_class_common { u32 classid; + unsigned int filter_cnt; struct hlist_node hnode; }; @@ -633,6 +634,31 @@ qdisc_class_find(const struct Qdisc_class_hash *hash, u32 id) return NULL; } +static inline bool qdisc_class_in_use(const struct Qdisc_class_common *cl) +{ + return cl->filter_cnt > 0; +} + +static inline void qdisc_class_get(struct Qdisc_class_common *cl) +{ + unsigned int res; + + if (check_add_overflow(cl->filter_cnt, 1, &res)) + WARN(1, "Qdisc class overflow"); + + cl->filter_cnt = res; +} + +static inline void qdisc_class_put(struct Qdisc_class_common *cl) +{ + unsigned int res; + + if (check_sub_overflow(cl->filter_cnt, 1, &res)) + WARN(1, "Qdisc class underflow"); + + cl->filter_cnt = res; +} + static inline int tc_classid_to_hwtc(struct net_device *dev, u32 classid) { u32 hwtc = TC_H_MIN(classid) - TC_H_MIN_PRIORITY; diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index e35a4e90f4e6..1b22b3b741c9 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -17,7 +17,6 @@ struct drr_class { struct Qdisc_class_common common; - unsigned int filter_cnt; struct gnet_stats_basic_sync bstats; struct gnet_stats_queue qstats; @@ -150,7 +149,7 @@ static int drr_delete_class(struct Qdisc *sch, unsigned long arg, struct drr_sched *q = qdisc_priv(sch); struct drr_class *cl = (struct drr_class *)arg; - if (cl->filter_cnt > 0) + if (qdisc_class_in_use(&cl->common)) return -EBUSY; sch_tree_lock(sch); @@ -187,8 +186,8 @@ static unsigned long drr_bind_tcf(struct Qdisc *sch, unsigned long parent, { struct drr_class *cl = drr_find_class(sch, classid); - if (cl != NULL) - cl->filter_cnt++; + if (cl) + qdisc_class_get(&cl->common); return (unsigned long)cl; } @@ -197,7 +196,7 @@ static void drr_unbind_tcf(struct Qdisc *sch, unsigned long arg) { struct drr_class *cl = (struct drr_class *)arg; - cl->filter_cnt--; + qdisc_class_put(&cl->common); } static int drr_graft_class(struct Qdisc *sch, unsigned long arg, diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 70b0c5873d32..896cb401fdb9 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -116,7 +116,6 @@ struct hfsc_class { struct net_rate_estimator __rcu *rate_est; struct tcf_proto __rcu *filter_list; /* filter list */ struct tcf_block *block; - unsigned int filter_cnt; /* filter count */ unsigned int level; 
/* class level in hierarchy */ struct hfsc_sched *sched; /* scheduler data */ @@ -1094,7 +1093,8 @@ hfsc_delete_class(struct Qdisc *sch, unsigned long arg, struct hfsc_sched *q = qdisc_priv(sch); struct hfsc_class *cl = (struct hfsc_class *)arg; - if (cl->level > 0 || cl->filter_cnt > 0 || cl == &q->root) + if (cl->level > 0 || qdisc_class_in_use(&cl->cl_common) || + cl == &q->root) return -EBUSY; sch_tree_lock(sch); @@ -1223,7 +1223,7 @@ hfsc_bind_tcf(struct Qdisc *sch, unsigned long parent, u32 classid) if (cl != NULL) { if (p != NULL && p->level <= cl->level) return 0; - cl->filter_cnt++; + qdisc_class_get(&cl->cl_common); } return (unsigned long)cl; @@ -1234,7 +1234,7 @@ hfsc_unbind_tcf(struct Qdisc *sch, unsigned long arg) { struct hfsc_class *cl = (struct hfsc_class *)arg; - cl->filter_cnt--; + qdisc_class_put(&cl->cl_common); } static struct tcf_block *hfsc_tcf_block(struct Qdisc *sch, unsigned long arg, diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 333800a7d4eb..05c8291865ae 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -102,7 +102,6 @@ struct htb_class { struct tcf_proto __rcu *filter_list; /* class attached filters */ struct tcf_block *block; - int filter_cnt; int level; /* our level (see above) */ unsigned int children; @@ -1710,7 +1709,7 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg, * tc subsys guarantee us that in htb_destroy it holds no class * refs so that we can remove children safely there ? */ - if (cl->children || cl->filter_cnt) + if (cl->children || qdisc_class_in_use(&cl->common)) return -EBUSY; if (!cl->level && htb_parent_last_child(cl)) @@ -2107,7 +2106,7 @@ static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent, * be broken by class during destroy IIUC. */ if (cl) - cl->filter_cnt++; + qdisc_class_get(&cl->common); return (unsigned long)cl; } @@ -2115,8 +2114,7 @@ static void htb_unbind_filter(struct Qdisc *sch, unsigned long arg) { struct htb_class *cl = (struct htb_class *)arg; - if (cl) - cl->filter_cnt--; + qdisc_class_put(&cl->common); } static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg) diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index befaf74b33ca..7addc15f01b5 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -130,8 +130,6 @@ struct qfq_aggregate; struct qfq_class { struct Qdisc_class_common common; - unsigned int filter_cnt; - struct gnet_stats_basic_sync bstats; struct gnet_stats_queue qstats; struct net_rate_estimator __rcu *rate_est; @@ -545,7 +543,7 @@ static int qfq_delete_class(struct Qdisc *sch, unsigned long arg, struct qfq_sched *q = qdisc_priv(sch); struct qfq_class *cl = (struct qfq_class *)arg; - if (cl->filter_cnt > 0) + if (qdisc_class_in_use(&cl->common)) return -EBUSY; sch_tree_lock(sch); @@ -580,8 +578,8 @@ static unsigned long qfq_bind_tcf(struct Qdisc *sch, unsigned long parent, { struct qfq_class *cl = qfq_find_class(sch, classid); - if (cl != NULL) - cl->filter_cnt++; + if (cl) + qdisc_class_get(&cl->common); return (unsigned long)cl; } @@ -590,7 +588,7 @@ static void qfq_unbind_tcf(struct Qdisc *sch, unsigned long arg) { struct qfq_class *cl = (struct qfq_class *)arg; - cl->filter_cnt--; + qdisc_class_put(&cl->common); } static int qfq_graft_class(struct Qdisc *sch, unsigned long arg, -- cgit v1.2.3 From daf8d9181b9b4b58d3b182d1fea10dec75c5de88 Mon Sep 17 00:00:00 2001 From: Pedro Tammela Date: Fri, 28 Jul 2023 12:35:34 -0300 Subject: net/sched: sch_drr: warn about class in use while deleting Add extack to warn that delete was 
rejected because the class is still in use Acked-by: Jamal Hadi Salim Signed-off-by: Pedro Tammela Reviewed-by: Simon Horman Signed-off-by: Paolo Abeni --- net/sched/sch_drr.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index 1b22b3b741c9..19901e77cd3b 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -149,8 +149,10 @@ static int drr_delete_class(struct Qdisc *sch, unsigned long arg, struct drr_sched *q = qdisc_priv(sch); struct drr_class *cl = (struct drr_class *)arg; - if (qdisc_class_in_use(&cl->common)) + if (qdisc_class_in_use(&cl->common)) { + NL_SET_ERR_MSG(extack, "DRR class is in use"); return -EBUSY; + } sch_tree_lock(sch); -- cgit v1.2.3 From 8e4553ef3ed5ea6d58e7ab89955e2a56be4aa4f4 Mon Sep 17 00:00:00 2001 From: Pedro Tammela Date: Fri, 28 Jul 2023 12:35:35 -0300 Subject: net/sched: sch_hfsc: warn about class in use while deleting Add extack to warn that delete was rejected because the class is still in use Acked-by: Jamal Hadi Salim Signed-off-by: Pedro Tammela Reviewed-by: Simon Horman Signed-off-by: Paolo Abeni --- net/sched/sch_hfsc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 896cb401fdb9..98805303218d 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1094,8 +1094,10 @@ hfsc_delete_class(struct Qdisc *sch, unsigned long arg, struct hfsc_class *cl = (struct hfsc_class *)arg; if (cl->level > 0 || qdisc_class_in_use(&cl->cl_common) || - cl == &q->root) + cl == &q->root) { + NL_SET_ERR_MSG(extack, "HFSC class in use"); return -EBUSY; + } sch_tree_lock(sch); -- cgit v1.2.3 From 7118f56e04d4f89318ad222e210ba70da19a8d15 Mon Sep 17 00:00:00 2001 From: Pedro Tammela Date: Fri, 28 Jul 2023 12:35:36 -0300 Subject: net/sched: sch_htb: warn about class in use while deleting Add extack to warn that delete was rejected because the class is still in use Acked-by: Jamal Hadi Salim Signed-off-by: Pedro Tammela Reviewed-by: Simon Horman Signed-off-by: Paolo Abeni --- net/sched/sch_htb.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 05c8291865ae..0d947414e616 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -1709,8 +1709,10 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg, * tc subsys guarantee us that in htb_destroy it holds no class * refs so that we can remove children safely there ? 
*/ - if (cl->children || qdisc_class_in_use(&cl->common)) + if (cl->children || qdisc_class_in_use(&cl->common)) { + NL_SET_ERR_MSG(extack, "HTB class in use"); return -EBUSY; + } if (!cl->level && htb_parent_last_child(cl)) last_child = 1; -- cgit v1.2.3 From e20e75017c5a3dbc4a3cc505d8ad57487b500bbb Mon Sep 17 00:00:00 2001 From: Pedro Tammela Date: Fri, 28 Jul 2023 12:35:37 -0300 Subject: net/sched: sch_qfq: warn about class in use while deleting Add extack to warn that delete was rejected because the class is still in use Acked-by: Jamal Hadi Salim Signed-off-by: Pedro Tammela Reviewed-by: Simon Horman Signed-off-by: Paolo Abeni --- net/sched/sch_qfq.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 7addc15f01b5..1a25752f1a9a 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -543,8 +543,10 @@ static int qfq_delete_class(struct Qdisc *sch, unsigned long arg, struct qfq_sched *q = qdisc_priv(sch); struct qfq_class *cl = (struct qfq_class *)arg; - if (qdisc_class_in_use(&cl->common)) + if (qdisc_class_in_use(&cl->common)) { + NL_SET_ERR_MSG_MOD(extack, "QFQ class in use"); return -EBUSY; + } sch_tree_lock(sch); -- cgit v1.2.3 From de9db136dcc3b60ba99f1f5034fc6ae7af45fa99 Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Sun, 30 Jul 2023 09:41:09 +0200 Subject: net: dsa: tag_qca: return early if dev is not found Currently checksum is recalculated and dsa tag stripped even if we later don't find the dev. To improve code, exit early if we don't find the dev and skip additional operation on the skb since it will be freed anyway. Signed-off-by: Christian Marangi Reviewed-by: Simon Horman Reviewed-by: Vladimir Oltean Reviewed-by: Florian Fainelli Link: https://lore.kernel.org/r/20230730074113.21889-1-ansuelsmth@gmail.com Signed-off-by: Paolo Abeni --- net/dsa/tag_qca.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c index e757c8de06f1..e5ff7c34e577 100644 --- a/net/dsa/tag_qca.c +++ b/net/dsa/tag_qca.c @@ -75,10 +75,6 @@ static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev) return NULL; } - /* Remove QCA tag and recalculate checksum */ - skb_pull_rcsum(skb, QCA_HDR_LEN); - dsa_strip_etype_header(skb, QCA_HDR_LEN); - /* Get source port information */ port = FIELD_GET(QCA_HDR_RECV_SOURCE_PORT, hdr); @@ -86,6 +82,10 @@ static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev) if (!skb->dev) return NULL; + /* Remove QCA tag and recalculate checksum */ + skb_pull_rcsum(skb, QCA_HDR_LEN); + dsa_strip_etype_header(skb, QCA_HDR_LEN); + return skb; } -- cgit v1.2.3 From ceaac91dcd065db781d1ed5dfaef0686b8ec44dc Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 31 Jul 2023 10:11:58 -0700 Subject: net: make sure we never create ifindex = 0 Instead of allocating from 1 use proper xa_init flag, to protect ourselves from IDs wrapping back to 0. 
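The practical difference is easy to show with a minimal xarray sketch (illustrative only; the real allocation in net/core/dev.c is somewhat more involved): XA_FLAGS_ALLOC hands out IDs starting at 0, while XA_FLAGS_ALLOC1 starts at 1, so even a wrapped allocator can never produce ifindex 0.

/* Illustrative sketch, not part of the patch. */
#include <linux/xarray.h>

static DEFINE_XARRAY_FLAGS(ifindex_xa, XA_FLAGS_ALLOC1);	/* IDs start at 1 */

static int foo_reserve_ifindex(void *dev_entry, u32 *index)
{
	/* With XA_FLAGS_ALLOC1, xa_alloc() never stores 0 in *index,
	 * so ifindex == 0 (which userspace treats as "no interface")
	 * cannot appear, even after the ID space wraps around.
	 */
	return xa_alloc(&ifindex_xa, index, dev_entry, xa_limit_31b,
			GFP_KERNEL);
}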
Fixes: 759ab1edb56c ("net: store netdevs in an xarray") Reported-by: Stephen Hemminger Link: https://lore.kernel.org/all/20230728162350.2a6d4979@hermes.local/ Reviewed-by: Leon Romanovsky Link: https://lore.kernel.org/r/20230731171159.988962-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/core/dev.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index b58674774a57..10e5a036c706 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -11271,8 +11271,7 @@ static int __net_init netdev_init(struct net *net) if (net->dev_index_head == NULL) goto err_idx; - net->ifindex = 1; - xa_init_flags(&net->dev_by_index, XA_FLAGS_ALLOC); + xa_init_flags(&net->dev_by_index, XA_FLAGS_ALLOC1); RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain); -- cgit v1.2.3 From a57c34a80cbe15e36e12d42a4ddc5160a5bbb1a4 Mon Sep 17 00:00:00 2001 From: Ratheesh Kannoth Date: Tue, 1 Aug 2023 07:10:58 +0530 Subject: net: flow_dissector: Add IPSEC dissector Support for dissecting IPSEC field SPI (which is 32bits in size) for ESP and AH packets. Signed-off-by: Ratheesh Kannoth Signed-off-by: David S. Miller --- include/net/flow_dissector.h | 9 ++++++++ net/core/flow_dissector.c | 53 +++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 61 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 830f06b2f36d..1a7131d6cb0e 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -301,6 +301,14 @@ struct flow_dissector_key_l2tpv3 { __be32 session_id; }; +/** + * struct flow_dissector_key_ipsec: + * @spi: identifier for a ipsec connection + */ +struct flow_dissector_key_ipsec { + __be32 spi; +}; + /** * struct flow_dissector_key_cfm * @mdl_ver: maintenance domain level (mdl) and cfm protocol version @@ -354,6 +362,7 @@ enum flow_dissector_key_id { FLOW_DISSECTOR_KEY_PPPOE, /* struct flow_dissector_key_pppoe */ FLOW_DISSECTOR_KEY_L2TPV3, /* struct flow_dissector_key_l2tpv3 */ FLOW_DISSECTOR_KEY_CFM, /* struct flow_dissector_key_cfm */ + FLOW_DISSECTOR_KEY_IPSEC, /* struct flow_dissector_key_ipsec */ FLOW_DISSECTOR_KEY_MAX, }; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index ed5dfa376024..89d15ceaf9af 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -205,6 +205,50 @@ static void __skb_flow_dissect_icmp(const struct sk_buff *skb, skb_flow_get_icmp_tci(skb, key_icmp, data, thoff, hlen); } +static void __skb_flow_dissect_ah(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, const void *data, + int nhoff, int hlen) +{ + struct flow_dissector_key_ipsec *key_ah; + struct ip_auth_hdr _hdr, *hdr; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPSEC)) + return; + + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); + if (!hdr) + return; + + key_ah = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IPSEC, + target_container); + + key_ah->spi = hdr->spi; +} + +static void __skb_flow_dissect_esp(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, const void *data, + int nhoff, int hlen) +{ + struct flow_dissector_key_ipsec *key_esp; + struct ip_esp_hdr _hdr, *hdr; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPSEC)) + return; + + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); + if (!hdr) + return; + + key_esp = skb_flow_dissector_target(flow_dissector, + 
FLOW_DISSECTOR_KEY_IPSEC, + target_container); + + key_esp->spi = hdr->spi; +} + static void __skb_flow_dissect_l2tpv3(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, @@ -1571,7 +1615,14 @@ ip_proto_again: __skb_flow_dissect_l2tpv3(skb, flow_dissector, target_container, data, nhoff, hlen); break; - + case IPPROTO_ESP: + __skb_flow_dissect_esp(skb, flow_dissector, target_container, + data, nhoff, hlen); + break; + case IPPROTO_AH: + __skb_flow_dissect_ah(skb, flow_dissector, target_container, + data, nhoff, hlen); + break; default: break; } -- cgit v1.2.3 From 4c13eda757e3ca72f523d07ed9e5f3e72b374299 Mon Sep 17 00:00:00 2001 From: Ratheesh Kannoth Date: Tue, 1 Aug 2023 07:10:59 +0530 Subject: tc: flower: support for SPI tc flower rules support to classify ESP/AH packets matching SPI field. Signed-off-by: Ratheesh Kannoth Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 3 +++ net/sched/cls_flower.c | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 7865f5a9885b..75506f157340 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -598,6 +598,9 @@ enum { TCA_FLOWER_KEY_CFM, /* nested */ + TCA_FLOWER_KEY_SPI, /* be32 */ + TCA_FLOWER_KEY_SPI_MASK, /* be32 */ + __TCA_FLOWER_MAX, }; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 8da9d039d964..eca260272845 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -72,6 +72,7 @@ struct fl_flow_key { struct flow_dissector_key_num_of_vlans num_of_vlans; struct flow_dissector_key_pppoe pppoe; struct flow_dissector_key_l2tpv3 l2tpv3; + struct flow_dissector_key_ipsec ipsec; struct flow_dissector_key_cfm cfm; } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. 
*/ @@ -726,6 +727,8 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_PPPOE_SID] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_PPP_PROTO] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_L2TPV3_SID] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_SPI] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_SPI_MASK] = { .type = NLA_U32 }, [TCA_FLOWER_L2_MISS] = NLA_POLICY_MAX(NLA_U8, 1), [TCA_FLOWER_KEY_CFM] = { .type = NLA_NESTED }, }; @@ -795,6 +798,24 @@ static void fl_set_key_val(struct nlattr **tb, nla_memcpy(mask, tb[mask_type], len); } +static int fl_set_key_spi(struct nlattr **tb, struct fl_flow_key *key, + struct fl_flow_key *mask, + struct netlink_ext_ack *extack) +{ + if (key->basic.ip_proto != IPPROTO_ESP && + key->basic.ip_proto != IPPROTO_AH) { + NL_SET_ERR_MSG(extack, + "Protocol must be either ESP or AH"); + return -EINVAL; + } + + fl_set_key_val(tb, &key->ipsec.spi, + TCA_FLOWER_KEY_SPI, + &mask->ipsec.spi, TCA_FLOWER_KEY_SPI_MASK, + sizeof(key->ipsec.spi)); + return 0; +} + static int fl_set_key_port_range(struct nlattr **tb, struct fl_flow_key *key, struct fl_flow_key *mask, struct netlink_ext_ack *extack) @@ -1894,6 +1915,12 @@ static int fl_set_key(struct net *net, struct nlattr **tb, return ret; } + if (tb[TCA_FLOWER_KEY_SPI]) { + ret = fl_set_key_spi(tb, key, mask, extack); + if (ret) + return ret; + } + if (tb[TCA_FLOWER_KEY_ENC_IPV4_SRC] || tb[TCA_FLOWER_KEY_ENC_IPV4_DST]) { key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; @@ -2066,6 +2093,8 @@ static void fl_init_dissector(struct flow_dissector *dissector, FLOW_DISSECTOR_KEY_PPPOE, pppoe); FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_L2TPV3, l2tpv3); + FL_KEY_SET_IF_MASKED(mask, keys, cnt, + FLOW_DISSECTOR_KEY_IPSEC, ipsec); FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_CFM, cfm); @@ -3364,6 +3393,12 @@ static int fl_dump_key(struct sk_buff *skb, struct net *net, sizeof(key->l2tpv3.session_id))) goto nla_put_failure; + if (key->ipsec.spi && + fl_dump_key_val(skb, &key->ipsec.spi, TCA_FLOWER_KEY_SPI, + &mask->ipsec.spi, TCA_FLOWER_KEY_SPI_MASK, + sizeof(key->ipsec.spi))) + goto nla_put_failure; + if ((key->basic.ip_proto == IPPROTO_TCP || key->basic.ip_proto == IPPROTO_UDP || key->basic.ip_proto == IPPROTO_SCTP) && -- cgit v1.2.3 From c8915d7329d6e9164c5c847dc1c56a2c0437053f Mon Sep 17 00:00:00 2001 From: Ratheesh Kannoth Date: Tue, 1 Aug 2023 07:11:00 +0530 Subject: tc: flower: Enable offload support IPSEC SPI field. This patch enables offload for TC classifier flower rules which matches against SPI field. Signed-off-by: Ratheesh Kannoth Signed-off-by: David S. 
Miller --- include/net/flow_offload.h | 6 ++++++ net/core/flow_offload.c | 7 +++++++ 2 files changed, 13 insertions(+) (limited to 'net') diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index 118082eae48c..9efa9a59e81f 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -64,6 +64,10 @@ struct flow_match_tcp { struct flow_dissector_key_tcp *key, *mask; }; +struct flow_match_ipsec { + struct flow_dissector_key_ipsec *key, *mask; +}; + struct flow_match_mpls { struct flow_dissector_key_mpls *key, *mask; }; @@ -116,6 +120,8 @@ void flow_rule_match_ports_range(const struct flow_rule *rule, struct flow_match_ports_range *out); void flow_rule_match_tcp(const struct flow_rule *rule, struct flow_match_tcp *out); +void flow_rule_match_ipsec(const struct flow_rule *rule, + struct flow_match_ipsec *out); void flow_rule_match_icmp(const struct flow_rule *rule, struct flow_match_icmp *out); void flow_rule_match_mpls(const struct flow_rule *rule, diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c index acfc1f88ea79..bc5169482710 100644 --- a/net/core/flow_offload.c +++ b/net/core/flow_offload.c @@ -146,6 +146,13 @@ void flow_rule_match_tcp(const struct flow_rule *rule, } EXPORT_SYMBOL(flow_rule_match_tcp); +void flow_rule_match_ipsec(const struct flow_rule *rule, + struct flow_match_ipsec *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_IPSEC, out); +} +EXPORT_SYMBOL(flow_rule_match_ipsec); + void flow_rule_match_icmp(const struct flow_rule *rule, struct flow_match_icmp *out) { -- cgit v1.2.3 From 2fca1b5ef898b131c430eed6f07b8972fb6c9673 Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Tue, 1 Aug 2023 22:31:29 +0800 Subject: ila: Remove unnecessary file net/ila.h Commit 642c2c95585d ("ila: xlat changes") removed ila_xlat_outgoing() and ila_xlat_incoming() functions, then this file became unnecessary. 
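Returning to the IPsec SPI offload patches above (this note does not relate to the ila header removal), a driver's cls_flower offload callback would consume the new match roughly as follows. The helper below and its caller are hypothetical; only flow_rule_match_ipsec(), struct flow_match_ipsec and FLOW_DISSECTOR_KEY_IPSEC come from the patches:

/* Hypothetical driver helper extracting the SPI match, if any. */
static bool foo_parse_ipsec_spi(struct flow_cls_offload *cls_flower,
				__be32 *spi, __be32 *spi_mask)
{
	struct flow_rule *rule = flow_cls_offload_flow_rule(cls_flower);
	struct flow_match_ipsec match;

	if (!flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_IPSEC))
		return false;			/* no SPI match requested */

	flow_rule_match_ipsec(rule, &match);
	*spi = match.key->spi;			/* network byte order */
	*spi_mask = match.mask->spi;
	return true;
}

On the configuration side this corresponds to a flower rule that matches ip_proto esp (or ah) together with an SPI value via the new TCA_FLOWER_KEY_SPI attribute; the matching iproute2 keyword is not part of these patches.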
Signed-off-by: Yue Haibing Reviewed-by: Larysa Zaremba Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20230801143129.40652-1-yuehaibing@huawei.com Signed-off-by: Jakub Kicinski --- include/net/ila.h | 16 ---------------- net/ipv6/ila/ila_main.c | 1 - net/ipv6/ila/ila_xlat.c | 1 - 3 files changed, 18 deletions(-) delete mode 100644 include/net/ila.h (limited to 'net') diff --git a/include/net/ila.h b/include/net/ila.h deleted file mode 100644 index 73ebe5eab272..000000000000 --- a/include/net/ila.h +++ /dev/null @@ -1,16 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * ILA kernel interface - * - * Copyright (c) 2015 Tom Herbert - */ - -#ifndef _NET_ILA_H -#define _NET_ILA_H - -struct sk_buff; - -int ila_xlat_outgoing(struct sk_buff *skb); -int ila_xlat_incoming(struct sk_buff *skb); - -#endif /* _NET_ILA_H */ diff --git a/net/ipv6/ila/ila_main.c b/net/ipv6/ila/ila_main.c index 3faf62530d6a..69caed07315f 100644 --- a/net/ipv6/ila/ila_main.c +++ b/net/ipv6/ila/ila_main.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 #include -#include #include #include #include "ila.h" diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index bee45dfeb187..67e8c9440977 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -5,7 +5,6 @@ #include #include #include -#include #include #include #include "ila.h" -- cgit v1.2.3 From e12f2a6d1b9e10cfa1a5c520a620cd5c3a7ceba3 Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Tue, 1 Aug 2023 22:34:53 +0800 Subject: netlabel: Remove unused declaration netlbl_cipsov4_doi_free() Since commit b1edeb102397 ("netlabel: Replace protocol/NetLabel linking with refrerence counts") this declaration is unused and can be removed. Signed-off-by: Yue Haibing Acked-by: Paul Moore Link: https://lore.kernel.org/r/20230801143453.24452-1-yuehaibing@huawei.com Signed-off-by: Jakub Kicinski --- net/netlabel/netlabel_cipso_v4.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/netlabel/netlabel_cipso_v4.h b/net/netlabel/netlabel_cipso_v4.h index 85d7ecb05728..9518ab56ec98 100644 --- a/net/netlabel/netlabel_cipso_v4.h +++ b/net/netlabel/netlabel_cipso_v4.h @@ -149,7 +149,4 @@ enum { /* NetLabel protocol functions */ int netlbl_cipsov4_genl_init(void); -/* Free the memory associated with a CIPSOv4 DOI definition */ -void netlbl_cipsov4_doi_free(struct rcu_head *entry); - #endif -- cgit v1.2.3 From bf4ea1d0b2cb2251f9e5619c81daa98591087c33 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Tue, 1 Aug 2023 22:26:20 +0800 Subject: bpf, xdp: Add tracepoint to xdp attaching failure When error happens in dev_xdp_attach(), it should have a way to tell users the error message like the netlink approach. To avoid breaking uapi, adding a tracepoint in bpf_xdp_link_attach() is an appropriate way to notify users the error message. Hence, bpf libraries are able to retrieve the error message by this tracepoint, and then report the error message to users. 
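Concretely, a driver rejecting an XDP program in its ndo_bpf handler just fills in the extack as before; with this patch that same message also fires the xdp:bpf_xdp_link_attach_failed tracepoint when the attach came through a BPF link. A hypothetical driver-side sketch (foo_xdp_setup() and FOO_MAX_XDP_MTU are made up for illustration):

/* Hypothetical ndo_bpf handler: the extack message set here is what
 * trace_bpf_xdp_link_attach_failed() reports if a BPF-link attach fails.
 */
static int foo_xdp_setup(struct net_device *dev, struct netdev_bpf *bpf)
{
	switch (bpf->command) {
	case XDP_SETUP_PROG:
		if (dev->mtu > FOO_MAX_XDP_MTU) {
			NL_SET_ERR_MSG_MOD(bpf->extack,
					   "MTU too large for XDP");
			return -EOPNOTSUPP;
		}
		/* swap in bpf->prog here */
		return 0;
	default:
		return -EINVAL;
	}
}

Userspace can then read the failure reason from the tracepoint (for example via tracefs) instead of getting only a bare errno back from the bpf() syscall.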
Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20230801142621.7925-2-hffilwlqm@gmail.com Signed-off-by: Alexei Starovoitov --- include/trace/events/xdp.h | 17 +++++++++++++++++ net/core/dev.c | 5 ++++- 2 files changed, 21 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index c40fc97f9417..cd89f1d5ce7b 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -404,6 +404,23 @@ TRACE_EVENT(mem_return_failed, ) ); +TRACE_EVENT(bpf_xdp_link_attach_failed, + + TP_PROTO(const char *msg), + + TP_ARGS(msg), + + TP_STRUCT__entry( + __string(msg, msg) + ), + + TP_fast_assign( + __assign_str(msg, msg); + ), + + TP_printk("errmsg=%s", __get_str(msg)) +); + #endif /* _TRACE_XDP_H */ #include diff --git a/net/core/dev.c b/net/core/dev.c index 8e7d0cb540cd..002fec07de73 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -133,6 +133,7 @@ #include #include #include +#include #include #include #include @@ -9470,6 +9471,7 @@ int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct net *net = current->nsproxy->net_ns; struct bpf_link_primer link_primer; + struct netlink_ext_ack extack = {}; struct bpf_xdp_link *link; struct net_device *dev; int err, fd; @@ -9497,12 +9499,13 @@ int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) goto unlock; } - err = dev_xdp_attach_link(dev, NULL, link); + err = dev_xdp_attach_link(dev, &extack, link); rtnl_unlock(); if (err) { link->dev = NULL; bpf_link_cleanup(&link_primer); + trace_bpf_xdp_link_attach_failed(extack._msg); goto out_put_dev; } -- cgit v1.2.3 From 09c2c90705bb64986d499c4a4f3fd659761b637c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 1 Aug 2023 20:52:51 +0000 Subject: net: allow alloc_skb_with_frags() to allocate bigger packets Refactor alloc_skb_with_frags() to allow bigger packets allocations. Instead of assuming that only order-0 allocations will be attempted, use the caller supplied max order. v2: try harder to use high-order pages, per Willem feedback. Link: https://lore.kernel.org/netdev/CANn89iJQfmc_KeUr3TeXvsLQwo3ZymyoCr7Y6AnHrkWSuz0yAg@mail.gmail.com/ Signed-off-by: Eric Dumazet Cc: Tahsin Erdogan Reviewed-by: Willem de Bruijn Link: https://lore.kernel.org/r/20230801205254.400094-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/core/skbuff.c | 56 +++++++++++++++++++++++++------------------------------ 1 file changed, 25 insertions(+), 31 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index a298992060e6..c6f98245582c 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -6204,7 +6204,7 @@ EXPORT_SYMBOL_GPL(skb_mpls_dec_ttl); * * @header_len: size of linear part * @data_len: needed length in frags - * @max_page_order: max page order desired. + * @order: max page order desired. * @errcode: pointer to error code if any * @gfp_mask: allocation mask * @@ -6212,21 +6212,17 @@ EXPORT_SYMBOL_GPL(skb_mpls_dec_ttl); */ struct sk_buff *alloc_skb_with_frags(unsigned long header_len, unsigned long data_len, - int max_page_order, + int order, int *errcode, gfp_t gfp_mask) { - int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; unsigned long chunk; struct sk_buff *skb; struct page *page; - int i; + int nr_frags = 0; *errcode = -EMSGSIZE; - /* Note this test could be relaxed, if we succeed to allocate - * high order pages... 
- */ - if (npages > MAX_SKB_FRAGS) + if (unlikely(data_len > MAX_SKB_FRAGS * (PAGE_SIZE << order))) return NULL; *errcode = -ENOBUFS; @@ -6234,34 +6230,32 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len, if (!skb) return NULL; - skb->truesize += npages << PAGE_SHIFT; - - for (i = 0; npages > 0; i++) { - int order = max_page_order; - - while (order) { - if (npages >= 1 << order) { - page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) | - __GFP_COMP | - __GFP_NOWARN, - order); - if (page) - goto fill_page; - /* Do not retry other high order allocations */ - order = 1; - max_page_order = 0; - } + while (data_len) { + if (nr_frags == MAX_SKB_FRAGS - 1) + goto failure; + while (order && PAGE_ALIGN(data_len) < (PAGE_SIZE << order)) order--; + + if (order) { + page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) | + __GFP_COMP | + __GFP_NOWARN, + order); + if (!page) { + order--; + continue; + } + } else { + page = alloc_page(gfp_mask); + if (!page) + goto failure; } - page = alloc_page(gfp_mask); - if (!page) - goto failure; -fill_page: chunk = min_t(unsigned long, data_len, PAGE_SIZE << order); - skb_fill_page_desc(skb, i, page, 0, chunk); + skb_fill_page_desc(skb, nr_frags, page, 0, chunk); + nr_frags++; + skb->truesize += (PAGE_SIZE << order); data_len -= chunk; - npages -= 1 << order; } return skb; -- cgit v1.2.3 From ae6db08f8b5606ccd95ef2aadd741905d3c13bc0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 1 Aug 2023 20:52:53 +0000 Subject: net/packet: change packet_alloc_skb() to allow bigger paged allocations packet_alloc_skb() is currently calling sock_alloc_send_pskb() forcing order-0 page allocations. Switch to PAGE_ALLOC_COSTLY_ORDER, to increase max size by 8x. Also add logic to increase the linear part if needed. Signed-off-by: Eric Dumazet Cc: Tahsin Erdogan Reviewed-by: Willem de Bruijn Link: https://lore.kernel.org/r/20230801205254.400094-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/packet/af_packet.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 8e3ddec4c3d5..3b77d255d22d 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2927,8 +2927,10 @@ static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad, if (prepad + len < PAGE_SIZE || !linear) linear = len; + if (len - linear > MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) + linear = len - MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER); skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock, - err, 0); + err, PAGE_ALLOC_COSTLY_ORDER); if (!skb) return NULL; -- cgit v1.2.3 From 66f7223039c0cef81221e9779520479895995815 Mon Sep 17 00:00:00 2001 From: Maxim Georgiev Date: Tue, 1 Aug 2023 17:28:13 +0300 Subject: net: add NDOs for configuring hardware timestamping Current hardware timestamping API for NICs requires implementing .ndo_eth_ioctl() for SIOCGHWTSTAMP and SIOCSHWTSTAMP. That API has some boilerplate such as request parameter translation between user and kernel address spaces, handling possible translation failures correctly, etc. Since it is the same all across the board, it would be desirable to handle it through generic code. Here we introduce .ndo_hwtstamp_get() and .ndo_hwtstamp_set(), which implement that boilerplate and allow drivers to just act upon requests. 
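For illustration only, a driver converted to the new ops would look roughly like the sketch below; the driver name, private struct and fields are hypothetical:

        static int foo_hwtstamp_get(struct net_device *dev,
                                    struct kernel_hwtstamp_config *cfg)
        {
                struct foo_priv *priv = netdev_priv(dev);

                /* No ifreq translation or copy_to_user() boilerplate here. */
                cfg->tx_type = priv->hwts_tx_type;
                cfg->rx_filter = priv->hwts_rx_filter;
                return 0;
        }

        static int foo_hwtstamp_set(struct net_device *dev,
                                    struct kernel_hwtstamp_config *cfg,
                                    struct netlink_ext_ack *extack)
        {
                struct foo_priv *priv = netdev_priv(dev);

                if (cfg->tx_type != HWTSTAMP_TX_OFF &&
                    cfg->tx_type != HWTSTAMP_TX_ON) {
                        NL_SET_ERR_MSG(extack, "Unsupported tx_type");
                        return -ERANGE;
                }

                /* Program the hardware, then record the accepted state. */
                priv->hwts_tx_type = cfg->tx_type;
                priv->hwts_rx_filter = cfg->rx_filter;
                return 0;
        }

        static const struct net_device_ops foo_netdev_ops = {
                /* ... other ops ... */
                .ndo_hwtstamp_get = foo_hwtstamp_get,
                .ndo_hwtstamp_set = foo_hwtstamp_set,
        };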
Suggested-by: Jakub Kicinski Signed-off-by: Maxim Georgiev Signed-off-by: Vladimir Oltean Reviewed-by: Jacob Keller Tested-by: Horatiu Vultur Link: https://lore.kernel.org/r/20230801142824.1772134-2-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- include/linux/net_tstamp.h | 8 ++++++++ include/linux/netdevice.h | 16 ++++++++++++++++ net/core/dev_ioctl.c | 46 ++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 68 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/linux/net_tstamp.h b/include/linux/net_tstamp.h index fd67f3cc0c4b..7c59824f43f5 100644 --- a/include/linux/net_tstamp.h +++ b/include/linux/net_tstamp.h @@ -30,4 +30,12 @@ static inline void hwtstamp_config_to_kernel(struct kernel_hwtstamp_config *kern kernel_cfg->rx_filter = cfg->rx_filter; } +static inline void hwtstamp_config_from_kernel(struct hwtstamp_config *cfg, + const struct kernel_hwtstamp_config *kernel_cfg) +{ + cfg->flags = kernel_cfg->flags; + cfg->tx_type = kernel_cfg->tx_type; + cfg->rx_filter = kernel_cfg->rx_filter; +} + #endif /* _LINUX_NET_TIMESTAMPING_H_ */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 84c36a7f873f..08a0b8d45dc9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -57,6 +57,7 @@ struct netpoll_info; struct device; struct ethtool_ops; +struct kernel_hwtstamp_config; struct phy_device; struct dsa_port; struct ip_tunnel_parm; @@ -1418,6 +1419,16 @@ struct netdev_net_notifier { * Get hardware timestamp based on normal/adjustable time or free running * cycle counter. This function is required if physical clock supports a * free running cycle counter. + * + * int (*ndo_hwtstamp_get)(struct net_device *dev, + * struct kernel_hwtstamp_config *kernel_config); + * Get the currently configured hardware timestamping parameters for the + * NIC device. + * + * int (*ndo_hwtstamp_set)(struct net_device *dev, + * struct kernel_hwtstamp_config *kernel_config, + * struct netlink_ext_ack *extack); + * Change the hardware timestamping parameters for NIC device. 
*/ struct net_device_ops { int (*ndo_init)(struct net_device *dev); @@ -1652,6 +1663,11 @@ struct net_device_ops { ktime_t (*ndo_get_tstamp)(struct net_device *dev, const struct skb_shared_hwtstamps *hwtstamps, bool cycles); + int (*ndo_hwtstamp_get)(struct net_device *dev, + struct kernel_hwtstamp_config *kernel_config); + int (*ndo_hwtstamp_set)(struct net_device *dev, + struct kernel_hwtstamp_config *kernel_config, + struct netlink_ext_ack *extack); }; struct xdp_metadata_ops { diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 3730945ee294..10c0e173b38b 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -254,11 +254,32 @@ static int dev_eth_ioctl(struct net_device *dev, static int dev_get_hwtstamp(struct net_device *dev, struct ifreq *ifr) { - return dev_eth_ioctl(dev, ifr, SIOCGHWTSTAMP); + const struct net_device_ops *ops = dev->netdev_ops; + struct kernel_hwtstamp_config kernel_cfg = {}; + struct hwtstamp_config cfg; + int err; + + if (!ops->ndo_hwtstamp_get) + return dev_eth_ioctl(dev, ifr, SIOCGHWTSTAMP); /* legacy */ + + if (!netif_device_present(dev)) + return -ENODEV; + + err = ops->ndo_hwtstamp_get(dev, &kernel_cfg); + if (err) + return err; + + hwtstamp_config_from_kernel(&cfg, &kernel_cfg); + + if (copy_to_user(ifr->ifr_data, &cfg, sizeof(cfg))) + return -EFAULT; + + return 0; } static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr) { + const struct net_device_ops *ops = dev->netdev_ops; struct kernel_hwtstamp_config kernel_cfg; struct netlink_ext_ack extack = {}; struct hwtstamp_config cfg; @@ -280,7 +301,28 @@ static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr) return err; } - return dev_eth_ioctl(dev, ifr, SIOCSHWTSTAMP); + if (!ops->ndo_hwtstamp_set) + return dev_eth_ioctl(dev, ifr, SIOCSHWTSTAMP); /* legacy */ + + if (!netif_device_present(dev)) + return -ENODEV; + + err = ops->ndo_hwtstamp_set(dev, &kernel_cfg, &extack); + if (err) { + if (extack._msg) + netdev_err(dev, "%s\n", extack._msg); + return err; + } + + /* The driver may have modified the configuration, so copy the + * updated version of it back to user space + */ + hwtstamp_config_from_kernel(&cfg, &kernel_cfg); + + if (copy_to_user(ifr->ifr_data, &cfg, sizeof(cfg))) + return -EFAULT; + + return 0; } static int dev_siocbond(struct net_device *dev, -- cgit v1.2.3 From e47d01fea663b1fe58d0a493efc9ed667f70242e Mon Sep 17 00:00:00 2001 From: Maxim Georgiev Date: Tue, 1 Aug 2023 17:28:14 +0300 Subject: net: add hwtstamping helpers for stackable net devices The stackable net devices with hwtstamping support (vlan, macvlan, bonding) only pass the hwtstamping ops to the lower (real) device. These drivers are the first that need to be converted to the new timestamping API, because if they aren't prepared to handle that, then no real device driver cannot be converted to the new API either. After studying what vlan_dev_ioctl(), macvlan_eth_ioctl() and bond_eth_ioctl() have in common, here we propose two generic implementations of ndo_hwtstamp_get() and ndo_hwtstamp_set() which can be called by those 3 drivers, with "dev" being their lower device. These helpers cover both cases, when the lower driver is converted to the new API or unconverted. We need some hacks in case of an unconverted driver, namely to stuff some pointers in struct kernel_hwtstamp_config which shouldn't have been there (since the new API isn't supposed to need it). These will be removed when all drivers will have been converted to the new API. 
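The delegation pattern these helpers enable is roughly the following sketch (the upper device and its lower-device accessor are hypothetical; the 8021q conversion in a later patch is a real instance):

        static int upper_hwtstamp_set(struct net_device *dev,
                                      struct kernel_hwtstamp_config *cfg,
                                      struct netlink_ext_ack *extack)
        {
                struct net_device *lower = upper_get_lower_dev(dev); /* hypothetical */

                /* Works whether the lower driver is converted or legacy: in the
                 * legacy case the helper falls back to SIOCSHWTSTAMP through
                 * ndo_eth_ioctl(), using the ifreq stashed in cfg->ifr.
                 */
                return generic_hwtstamp_set_lower(lower, cfg, extack);
        }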
Signed-off-by: Maxim Georgiev Signed-off-by: Vladimir Oltean Reviewed-by: Jacob Keller Link: https://lore.kernel.org/r/20230801142824.1772134-3-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- include/linux/net_tstamp.h | 6 ++++ include/linux/netdevice.h | 5 ++++ net/core/dev_ioctl.c | 75 +++++++++++++++++++++++++++++++++++++++++----- 3 files changed, 79 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/linux/net_tstamp.h b/include/linux/net_tstamp.h index 7c59824f43f5..03e922814851 100644 --- a/include/linux/net_tstamp.h +++ b/include/linux/net_tstamp.h @@ -11,6 +11,10 @@ * @flags: see struct hwtstamp_config * @tx_type: see struct hwtstamp_config * @rx_filter: see struct hwtstamp_config + * @ifr: pointer to ifreq structure from the original ioctl request, to pass to + * a legacy implementation of a lower driver + * @copied_to_user: request was passed to a legacy implementation which already + * copied the ioctl request back to user space * * Prefer using this structure for in-kernel processing of hardware * timestamping configuration, over the inextensible struct hwtstamp_config @@ -20,6 +24,8 @@ struct kernel_hwtstamp_config { int flags; int tx_type; int rx_filter; + struct ifreq *ifr; + bool copied_to_user; }; static inline void hwtstamp_config_to_kernel(struct kernel_hwtstamp_config *kernel_cfg, diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 08a0b8d45dc9..23e335f245cf 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3950,6 +3950,11 @@ int put_user_ifreq(struct ifreq *ifr, void __user *arg); int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, void __user *data, bool *need_copyout); int dev_ifconf(struct net *net, struct ifconf __user *ifc); +int generic_hwtstamp_get_lower(struct net_device *dev, + struct kernel_hwtstamp_config *kernel_cfg); +int generic_hwtstamp_set_lower(struct net_device *dev, + struct kernel_hwtstamp_config *kernel_cfg, + struct netlink_ext_ack *extack); int dev_ethtool(struct net *net, struct ifreq *ifr, void __user *userdata); unsigned int dev_get_flags(const struct net_device *); int __dev_change_flags(struct net_device *dev, unsigned int flags, diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 10c0e173b38b..d0223ecd6f6f 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -265,14 +265,20 @@ static int dev_get_hwtstamp(struct net_device *dev, struct ifreq *ifr) if (!netif_device_present(dev)) return -ENODEV; + kernel_cfg.ifr = ifr; err = ops->ndo_hwtstamp_get(dev, &kernel_cfg); if (err) return err; - hwtstamp_config_from_kernel(&cfg, &kernel_cfg); + /* If the request was resolved through an unconverted driver, omit + * the copy_to_user(), since the implementation has already done that + */ + if (!kernel_cfg.copied_to_user) { + hwtstamp_config_from_kernel(&cfg, &kernel_cfg); - if (copy_to_user(ifr->ifr_data, &cfg, sizeof(cfg))) - return -EFAULT; + if (copy_to_user(ifr->ifr_data, &cfg, sizeof(cfg))) + return -EFAULT; + } return 0; } @@ -280,7 +286,7 @@ static int dev_get_hwtstamp(struct net_device *dev, struct ifreq *ifr) static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr) { const struct net_device_ops *ops = dev->netdev_ops; - struct kernel_hwtstamp_config kernel_cfg; + struct kernel_hwtstamp_config kernel_cfg = {}; struct netlink_ext_ack extack = {}; struct hwtstamp_config cfg; int err; @@ -289,6 +295,7 @@ static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr) return -EFAULT; 
hwtstamp_config_to_kernel(&kernel_cfg, &cfg); + kernel_cfg.ifr = ifr; err = net_hwtstamp_validate(&kernel_cfg); if (err) @@ -317,14 +324,68 @@ static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr) /* The driver may have modified the configuration, so copy the * updated version of it back to user space */ - hwtstamp_config_from_kernel(&cfg, &kernel_cfg); + if (!kernel_cfg.copied_to_user) { + hwtstamp_config_from_kernel(&cfg, &kernel_cfg); - if (copy_to_user(ifr->ifr_data, &cfg, sizeof(cfg))) - return -EFAULT; + if (copy_to_user(ifr->ifr_data, &cfg, sizeof(cfg))) + return -EFAULT; + } + + return 0; +} + +static int generic_hwtstamp_ioctl_lower(struct net_device *dev, int cmd, + struct kernel_hwtstamp_config *kernel_cfg) +{ + struct ifreq ifrr; + int err; + + strscpy_pad(ifrr.ifr_name, dev->name, IFNAMSIZ); + ifrr.ifr_ifru = kernel_cfg->ifr->ifr_ifru; + + err = dev_eth_ioctl(dev, &ifrr, cmd); + if (err) + return err; + + kernel_cfg->ifr->ifr_ifru = ifrr.ifr_ifru; + kernel_cfg->copied_to_user = true; return 0; } +int generic_hwtstamp_get_lower(struct net_device *dev, + struct kernel_hwtstamp_config *kernel_cfg) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (!netif_device_present(dev)) + return -ENODEV; + + if (ops->ndo_hwtstamp_get) + return ops->ndo_hwtstamp_get(dev, kernel_cfg); + + /* Legacy path: unconverted lower driver */ + return generic_hwtstamp_ioctl_lower(dev, SIOCGHWTSTAMP, kernel_cfg); +} +EXPORT_SYMBOL(generic_hwtstamp_get_lower); + +int generic_hwtstamp_set_lower(struct net_device *dev, + struct kernel_hwtstamp_config *kernel_cfg, + struct netlink_ext_ack *extack) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (!netif_device_present(dev)) + return -ENODEV; + + if (ops->ndo_hwtstamp_set) + return ops->ndo_hwtstamp_set(dev, kernel_cfg, extack); + + /* Legacy path: unconverted lower driver */ + return generic_hwtstamp_ioctl_lower(dev, SIOCSHWTSTAMP, kernel_cfg); +} +EXPORT_SYMBOL(generic_hwtstamp_set_lower); + static int dev_siocbond(struct net_device *dev, struct ifreq *ifr, unsigned int cmd) { -- cgit v1.2.3 From 65c9fde15a651a673637075cc1e9e3aa6aaff203 Mon Sep 17 00:00:00 2001 From: Maxim Georgiev Date: Tue, 1 Aug 2023 17:28:15 +0300 Subject: net: vlan: convert to ndo_hwtstamp_get() / ndo_hwtstamp_set() 8021q is one of the stackable net devices which pass the hardware timestamping ops to the real device through ndo_eth_ioctl(). This prevents converting any device driver to the new hwtimestamping API without regressions. Remove that limitation in the vlan driver by using the newly introduced helpers for timestamping through lower devices, that handle both the new and the old driver API. 
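The user-visible contract is unchanged: a request issued against the 8021q upper device still looks like the standalone userspace sketch below (interface name and filter choice are just examples):

        #include <string.h>
        #include <sys/ioctl.h>
        #include <net/if.h>
        #include <linux/sockios.h>
        #include <linux/net_tstamp.h>

        static int enable_hwts(int fd, const char *ifname) /* e.g. "eth0.100" */
        {
                struct hwtstamp_config cfg = {
                        .tx_type   = HWTSTAMP_TX_ON,
                        .rx_filter = HWTSTAMP_FILTER_ALL,
                };
                struct ifreq ifr;

                memset(&ifr, 0, sizeof(ifr));
                strncpy(ifr.ifr_name, ifname, IFNAMSIZ - 1);
                ifr.ifr_data = (char *)&cfg;

                /* The vlan driver forwards this to the real device. */
                return ioctl(fd, SIOCSHWTSTAMP, &ifr);
        }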
Signed-off-by: Maxim Georgiev Signed-off-by: Vladimir Oltean Reviewed-by: Jacob Keller Link: https://lore.kernel.org/r/20230801142824.1772134-4-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- net/8021q/vlan_dev.c | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index b90781b9ece6..2a7f1b15714a 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -354,6 +354,26 @@ out: return 0; } +static int vlan_hwtstamp_get(struct net_device *dev, + struct kernel_hwtstamp_config *cfg) +{ + struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; + + return generic_hwtstamp_get_lower(real_dev, cfg); +} + +static int vlan_hwtstamp_set(struct net_device *dev, + struct kernel_hwtstamp_config *cfg, + struct netlink_ext_ack *extack) +{ + struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; + + if (!net_eq(dev_net(dev), dev_net(real_dev))) + return -EOPNOTSUPP; + + return generic_hwtstamp_set_lower(real_dev, cfg, extack); +} + static int vlan_dev_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; @@ -365,14 +385,9 @@ static int vlan_dev_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) ifrr.ifr_ifru = ifr->ifr_ifru; switch (cmd) { - case SIOCSHWTSTAMP: - if (!net_eq(dev_net(dev), dev_net(real_dev))) - break; - fallthrough; case SIOCGMIIPHY: case SIOCGMIIREG: case SIOCSMIIREG: - case SIOCGHWTSTAMP: if (netif_device_present(real_dev) && ops->ndo_eth_ioctl) err = ops->ndo_eth_ioctl(real_dev, &ifrr, cmd); break; @@ -1081,6 +1096,8 @@ static const struct net_device_ops vlan_netdev_ops = { .ndo_fix_features = vlan_dev_fix_features, .ndo_get_iflink = vlan_dev_get_iflink, .ndo_fill_forward_path = vlan_dev_fill_forward_path, + .ndo_hwtstamp_get = vlan_hwtstamp_get, + .ndo_hwtstamp_set = vlan_hwtstamp_set, }; static void vlan_dev_free(struct net_device *dev) -- cgit v1.2.3 From 70ef7d87f62a86674c21a99341dabc175c19681a Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 1 Aug 2023 17:28:22 +0300 Subject: net: transfer rtnl_lock() requirement from ethtool_set_ethtool_phy_ops() to caller phy_init() and phy_exit() will have to do more stuff under rtnl_lock() in a future change. Since rtnl_unlock() -> netdev_run_todo() does a lot of stuff under the hood, it's a pity to lock and unlock the rtnetlink mutex twice in a row. Change the calling convention such that the only caller of ethtool_set_ethtool_phy_ops(), phy_device.c, provides a context where the rtnl_mutex is already acquired. 
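The gain is that a caller can now batch other RTNL-protected work into the same critical section, roughly as in this sketch (the caller and the extra setup helper are hypothetical future work):

        static int __init foo_phy_subsystem_init(void)        /* hypothetical caller */
        {
                int err;

                rtnl_lock();
                ethtool_set_ethtool_phy_ops(&phy_ethtool_phy_ops);
                err = foo_more_rtnl_protected_setup();        /* hypothetical */
                rtnl_unlock();

                return err;
        }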
Signed-off-by: Vladimir Oltean Link: https://lore.kernel.org/r/20230801142824.1772134-11-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/phy_device.c | 7 +++++++ net/ethtool/common.c | 3 +-- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 61921d4dbb13..98b8ac28e5a1 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -3451,7 +3452,9 @@ static int __init phy_init(void) { int rc; + rtnl_lock(); ethtool_set_ethtool_phy_ops(&phy_ethtool_phy_ops); + rtnl_unlock(); rc = mdio_bus_init(); if (rc) @@ -3474,7 +3477,9 @@ err_c45: err_mdio_bus: mdio_bus_exit(); err_ethtool_phy_ops: + rtnl_lock(); ethtool_set_ethtool_phy_ops(NULL); + rtnl_unlock(); return rc; } @@ -3484,7 +3489,9 @@ static void __exit phy_exit(void) phy_driver_unregister(&genphy_c45_driver); phy_driver_unregister(&genphy_driver); mdio_bus_exit(); + rtnl_lock(); ethtool_set_ethtool_phy_ops(NULL); + rtnl_unlock(); } subsys_initcall(phy_init); diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 5fb19050991e..f5598c5f50de 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -665,9 +665,8 @@ const struct ethtool_phy_ops *ethtool_phy_ops; void ethtool_set_ethtool_phy_ops(const struct ethtool_phy_ops *ops) { - rtnl_lock(); + ASSERT_RTNL(); ethtool_phy_ops = ops; - rtnl_unlock(); } EXPORT_SYMBOL_GPL(ethtool_set_ethtool_phy_ops); -- cgit v1.2.3 From fd770e856e226f80fe6e1dc9d1861bcb135cdf0b Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 1 Aug 2023 17:28:24 +0300 Subject: net: remove phy_has_hwtstamp() -> phy_mii_ioctl() decision from converted drivers It is desirable that the new .ndo_hwtstamp_set() API gives more uniformity, less overhead and future flexibility w.r.t. the PHY timestamping behavior. Currently there are some drivers which allow PHY timestamping through the procedure mentioned in Documentation/networking/timestamping.rst. They don't do anything locally if phy_has_hwtstamp() is set, except for lan966x which installs PTP packet traps. Centralize that behavior in a new dev_set_hwtstamp_phylib() code function, which calls either phy_mii_ioctl() for the phylib PHY, or .ndo_hwtstamp_set() of the netdev, based on a single policy (currently simplistic: phy_has_hwtstamp()). Any driver converted to .ndo_hwtstamp_set() will automatically opt into the centralized phylib timestamping policy. Unconverted drivers still get to choose whether they let the PHY handle timestamping or not. Netdev drivers with integrated PHY drivers that don't use phylib presumably don't set dev->phydev, and those will always see HWTSTAMP_SOURCE_NETDEV requests even when converted. The timestamping policy will remain 100% up to them. 
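For a driver that must act even on PHY-bound requests (for example to install packet traps), the resulting shape is roughly the sketch below; the foo_* helpers are hypothetical, and the lan966x hunks in this patch are the real example:

        /* The driver sets IFF_SEE_ALL_HWTSTAMP_REQUESTS in dev->priv_flags at
         * probe time, so this runs for both request sources.
         */
        static int foo_hwtstamp_set(struct net_device *dev,
                                    struct kernel_hwtstamp_config *cfg,
                                    struct netlink_ext_ack *extack)
        {
                int err;

                err = foo_setup_ptp_traps(dev, cfg);        /* hypothetical */
                if (err)
                        return err;

                if (cfg->source == HWTSTAMP_SOURCE_PHYLIB)
                        return 0;        /* the core then calls into the PHY driver */

                return foo_program_mac_timestamping(dev, cfg, extack); /* hypothetical */
        }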
Signed-off-by: Vladimir Oltean Reviewed-by: Jacob Keller Tested-by: Horatiu Vultur Link: https://lore.kernel.org/r/20230801142824.1772134-13-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/fec_main.c | 8 -- .../net/ethernet/microchip/lan966x/lan966x_main.c | 25 +++--- .../net/ethernet/microchip/sparx5/sparx5_netdev.c | 6 -- include/linux/net_tstamp.h | 16 ++++ include/linux/netdevice.h | 4 + net/core/dev_ioctl.c | 91 ++++++++++++++++++++-- 6 files changed, 117 insertions(+), 33 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index 6d81fff0227e..43f14cec91e9 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -3872,10 +3872,6 @@ static int fec_hwtstamp_get(struct net_device *ndev, struct kernel_hwtstamp_config *config) { struct fec_enet_private *fep = netdev_priv(ndev); - struct phy_device *phydev = ndev->phydev; - - if (phy_has_hwtstamp(phydev)) - return phy_mii_ioctl(phydev, config->ifr, SIOCGHWTSTAMP); if (!netif_running(ndev)) return -EINVAL; @@ -3893,10 +3889,6 @@ static int fec_hwtstamp_set(struct net_device *ndev, struct netlink_ext_ack *extack) { struct fec_enet_private *fep = netdev_priv(ndev); - struct phy_device *phydev = ndev->phydev; - - if (phy_has_hwtstamp(phydev)) - return phy_mii_ioctl(phydev, config->ifr, SIOCSHWTSTAMP); if (!netif_running(ndev)) return -EINVAL; diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_main.c b/drivers/net/ethernet/microchip/lan966x/lan966x_main.c index 1baa94a98fe3..4a1acc7234f6 100644 --- a/drivers/net/ethernet/microchip/lan966x/lan966x_main.c +++ b/drivers/net/ethernet/microchip/lan966x/lan966x_main.c @@ -455,9 +455,6 @@ static int lan966x_port_hwtstamp_get(struct net_device *dev, { struct lan966x_port *port = netdev_priv(dev); - if (phy_has_hwtstamp(dev->phydev)) - return phy_mii_ioctl(dev->phydev, cfg->ifr, SIOCGHWTSTAMP); - if (!port->lan966x->ptp) return -EOPNOTSUPP; @@ -473,21 +470,26 @@ static int lan966x_port_hwtstamp_set(struct net_device *dev, struct lan966x_port *port = netdev_priv(dev); int err; + if (cfg->source != HWTSTAMP_SOURCE_NETDEV && + cfg->source != HWTSTAMP_SOURCE_PHYLIB) + return -EOPNOTSUPP; + err = lan966x_ptp_setup_traps(port, cfg); if (err) return err; - if (phy_has_hwtstamp(dev->phydev)) { - err = phy_mii_ioctl(dev->phydev, cfg->ifr, SIOCSHWTSTAMP); - if (err) + if (cfg->source == HWTSTAMP_SOURCE_NETDEV) { + if (!port->lan966x->ptp) + return -EOPNOTSUPP; + + err = lan966x_ptp_hwtstamp_set(port, cfg, extack); + if (err) { lan966x_ptp_del_traps(port); - return err; + return err; + } } - if (!port->lan966x->ptp) - return -EOPNOTSUPP; - - return lan966x_ptp_hwtstamp_set(port, cfg, extack); + return 0; } static const struct net_device_ops lan966x_port_netdev_ops = { @@ -815,6 +817,7 @@ static int lan966x_probe_port(struct lan966x *lan966x, u32 p, NETIF_F_HW_VLAN_STAG_TX | NETIF_F_HW_TC; dev->hw_features |= NETIF_F_HW_TC; + dev->priv_flags |= IFF_SEE_ALL_HWTSTAMP_REQUESTS; dev->needed_headroom = IFH_LEN_BYTES; eth_hw_addr_gen(dev, lan966x->base_mac, p + 1); diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_netdev.c b/drivers/net/ethernet/microchip/sparx5/sparx5_netdev.c index e01d3b1e17e0..705a004b324f 100644 --- a/drivers/net/ethernet/microchip/sparx5/sparx5_netdev.c +++ b/drivers/net/ethernet/microchip/sparx5/sparx5_netdev.c @@ -216,9 +216,6 @@ static int sparx5_port_hwtstamp_get(struct net_device *dev, struct sparx5_port 
*sparx5_port = netdev_priv(dev); struct sparx5 *sparx5 = sparx5_port->sparx5; - if (phy_has_hwtstamp(dev->phydev)) - return phy_mii_ioctl(dev->phydev, cfg->ifr, SIOCGHWTSTAMP); - if (!sparx5->ptp) return -EOPNOTSUPP; @@ -234,9 +231,6 @@ static int sparx5_port_hwtstamp_set(struct net_device *dev, struct sparx5_port *sparx5_port = netdev_priv(dev); struct sparx5 *sparx5 = sparx5_port->sparx5; - if (phy_has_hwtstamp(dev->phydev)) - return phy_mii_ioctl(dev->phydev, cfg->ifr, SIOCSHWTSTAMP); - if (!sparx5->ptp) return -EOPNOTSUPP; diff --git a/include/linux/net_tstamp.h b/include/linux/net_tstamp.h index 03e922814851..eb01c37e71e0 100644 --- a/include/linux/net_tstamp.h +++ b/include/linux/net_tstamp.h @@ -5,6 +5,11 @@ #include +enum hwtstamp_source { + HWTSTAMP_SOURCE_NETDEV, + HWTSTAMP_SOURCE_PHYLIB, +}; + /** * struct kernel_hwtstamp_config - Kernel copy of struct hwtstamp_config * @@ -15,6 +20,8 @@ * a legacy implementation of a lower driver * @copied_to_user: request was passed to a legacy implementation which already * copied the ioctl request back to user space + * @source: indication whether timestamps should come from the netdev or from + * an attached phylib PHY * * Prefer using this structure for in-kernel processing of hardware * timestamping configuration, over the inextensible struct hwtstamp_config @@ -26,6 +33,7 @@ struct kernel_hwtstamp_config { int rx_filter; struct ifreq *ifr; bool copied_to_user; + enum hwtstamp_source source; }; static inline void hwtstamp_config_to_kernel(struct kernel_hwtstamp_config *kernel_cfg, @@ -44,4 +52,12 @@ static inline void hwtstamp_config_from_kernel(struct hwtstamp_config *cfg, cfg->rx_filter = kernel_cfg->rx_filter; } +static inline bool kernel_hwtstamp_config_changed(const struct kernel_hwtstamp_config *a, + const struct kernel_hwtstamp_config *b) +{ + return a->flags != b->flags || + a->tx_type != b->tx_type || + a->rx_filter != b->rx_filter; +} + #endif /* _LINUX_NET_TIMESTAMPING_H_ */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 23e335f245cf..85d594460c66 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1724,6 +1724,9 @@ struct xdp_metadata_ops { * @IFF_TX_SKB_NO_LINEAR: device/driver is capable of xmitting frames with * skb_headlen(skb) == 0 (data starts from frag0) * @IFF_CHANGE_PROTO_DOWN: device supports setting carrier via IFLA_PROTO_DOWN + * @IFF_SEE_ALL_HWTSTAMP_REQUESTS: device wants to see calls to + * ndo_hwtstamp_set() for all timestamp requests regardless of source, + * even if those aren't HWTSTAMP_SOURCE_NETDEV. 
*/ enum netdev_priv_flags { IFF_802_1Q_VLAN = 1<<0, @@ -1759,6 +1762,7 @@ enum netdev_priv_flags { IFF_NO_ADDRCONF = BIT_ULL(30), IFF_TX_SKB_NO_LINEAR = BIT_ULL(31), IFF_CHANGE_PROTO_DOWN = BIT_ULL(32), + IFF_SEE_ALL_HWTSTAMP_REQUESTS = BIT_ULL(33), }; #define IFF_802_1Q_VLAN IFF_802_1Q_VLAN diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index d0223ecd6f6f..72e077022348 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -252,6 +253,30 @@ static int dev_eth_ioctl(struct net_device *dev, return ops->ndo_eth_ioctl(dev, ifr, cmd); } +/** + * dev_get_hwtstamp_phylib() - Get hardware timestamping settings of NIC + * or of attached phylib PHY + * @dev: Network device + * @cfg: Timestamping configuration structure + * + * Helper for enforcing a common policy that phylib timestamping, if available, + * should take precedence in front of hardware timestamping provided by the + * netdev. + * + * Note: phy_mii_ioctl() only handles SIOCSHWTSTAMP (not SIOCGHWTSTAMP), and + * there only exists a phydev->mii_ts->hwtstamp() method. So this will return + * -EOPNOTSUPP for phylib for now, which is still more accurate than letting + * the netdev handle the GET request. + */ +static int dev_get_hwtstamp_phylib(struct net_device *dev, + struct kernel_hwtstamp_config *cfg) +{ + if (phy_has_hwtstamp(dev->phydev)) + return phy_hwtstamp_get(dev->phydev, cfg); + + return dev->netdev_ops->ndo_hwtstamp_get(dev, cfg); +} + static int dev_get_hwtstamp(struct net_device *dev, struct ifreq *ifr) { const struct net_device_ops *ops = dev->netdev_ops; @@ -266,7 +291,7 @@ static int dev_get_hwtstamp(struct net_device *dev, struct ifreq *ifr) return -ENODEV; kernel_cfg.ifr = ifr; - err = ops->ndo_hwtstamp_get(dev, &kernel_cfg); + err = dev_get_hwtstamp_phylib(dev, &kernel_cfg); if (err) return err; @@ -283,6 +308,59 @@ static int dev_get_hwtstamp(struct net_device *dev, struct ifreq *ifr) return 0; } +/** + * dev_set_hwtstamp_phylib() - Change hardware timestamping of NIC + * or of attached phylib PHY + * @dev: Network device + * @cfg: Timestamping configuration structure + * @extack: Netlink extended ack message structure, for error reporting + * + * Helper for enforcing a common policy that phylib timestamping, if available, + * should take precedence in front of hardware timestamping provided by the + * netdev. If the netdev driver needs to perform specific actions even for PHY + * timestamping to work properly (a switch port must trap the timestamped + * frames and not forward them), it must set IFF_SEE_ALL_HWTSTAMP_REQUESTS in + * dev->priv_flags. + */ +static int dev_set_hwtstamp_phylib(struct net_device *dev, + struct kernel_hwtstamp_config *cfg, + struct netlink_ext_ack *extack) +{ + const struct net_device_ops *ops = dev->netdev_ops; + bool phy_ts = phy_has_hwtstamp(dev->phydev); + struct kernel_hwtstamp_config old_cfg = {}; + bool changed = false; + int err; + + cfg->source = phy_ts ? 
HWTSTAMP_SOURCE_PHYLIB : HWTSTAMP_SOURCE_NETDEV; + + if (!phy_ts || (dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS)) { + err = ops->ndo_hwtstamp_get(dev, &old_cfg); + if (err) + return err; + + err = ops->ndo_hwtstamp_set(dev, cfg, extack); + if (err) { + if (extack->_msg) + netdev_err(dev, "%s\n", extack->_msg); + return err; + } + + changed = kernel_hwtstamp_config_changed(&old_cfg, cfg); + } + + if (phy_ts) { + err = phy_hwtstamp_set(dev->phydev, cfg, extack); + if (err) { + if (changed) + ops->ndo_hwtstamp_set(dev, &old_cfg, NULL); + return err; + } + } + + return 0; +} + static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr) { const struct net_device_ops *ops = dev->netdev_ops; @@ -314,12 +392,9 @@ static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr) if (!netif_device_present(dev)) return -ENODEV; - err = ops->ndo_hwtstamp_set(dev, &kernel_cfg, &extack); - if (err) { - if (extack._msg) - netdev_err(dev, "%s\n", extack._msg); + err = dev_set_hwtstamp_phylib(dev, &kernel_cfg, &extack); + if (err) return err; - } /* The driver may have modified the configuration, so copy the * updated version of it back to user space @@ -362,7 +437,7 @@ int generic_hwtstamp_get_lower(struct net_device *dev, return -ENODEV; if (ops->ndo_hwtstamp_get) - return ops->ndo_hwtstamp_get(dev, kernel_cfg); + return dev_get_hwtstamp_phylib(dev, kernel_cfg); /* Legacy path: unconverted lower driver */ return generic_hwtstamp_ioctl_lower(dev, SIOCGHWTSTAMP, kernel_cfg); @@ -379,7 +454,7 @@ int generic_hwtstamp_set_lower(struct net_device *dev, return -ENODEV; if (ops->ndo_hwtstamp_set) - return ops->ndo_hwtstamp_set(dev, kernel_cfg, extack); + return dev_set_hwtstamp_phylib(dev, kernel_cfg, extack); /* Legacy path: unconverted lower driver */ return generic_hwtstamp_ioctl_lower(dev, SIOCSHWTSTAMP, kernel_cfg); -- cgit v1.2.3 From c956910d5af1e5df0f94fddb1bdae1ecaf74b3c5 Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Wed, 2 Aug 2023 11:46:59 +0800 Subject: tipc: Remove unused function declarations Commit d50ccc2d3909 ("tipc: add 128-bit node identifier") declared but never implemented tipc_node_id2hash(). Also commit 5c216e1d28c8 ("tipc: Allow run-time alteration of default link settings") never implemented tipc_media_set_priority() and tipc_media_set_window(), commit cad2929dc432 ("tipc: update a binding service via broadcast") only declared tipc_named_bcast(). Since commit be07f056396d ("tipc: simplify the finalize work queue") tipc_sched_net_finalize() is removed and declaration is unused. 
Signed-off-by: Yue Haibing Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230802034659.39840-1-yuehaibing@huawei.com Signed-off-by: Paolo Abeni --- net/tipc/addr.h | 1 - net/tipc/bearer.h | 2 -- net/tipc/name_distr.h | 1 - net/tipc/net.h | 1 - 4 files changed, 5 deletions(-) (limited to 'net') diff --git a/net/tipc/addr.h b/net/tipc/addr.h index 0772cfadaa0d..93f82398283d 100644 --- a/net/tipc/addr.h +++ b/net/tipc/addr.h @@ -131,6 +131,5 @@ bool tipc_in_scope(bool legacy_format, u32 domain, u32 addr); void tipc_set_node_id(struct net *net, u8 *id); void tipc_set_node_addr(struct net *net, u32 addr); char *tipc_nodeid2string(char *str, u8 *id); -u32 tipc_node_id2hash(u8 *id128); #endif diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index 1ee60649bd17..41eac1ee0c09 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h @@ -214,8 +214,6 @@ int tipc_nl_media_get(struct sk_buff *skb, struct genl_info *info); int tipc_nl_media_set(struct sk_buff *skb, struct genl_info *info); int __tipc_nl_media_set(struct sk_buff *skb, struct genl_info *info); -int tipc_media_set_priority(const char *name, u32 new_value); -int tipc_media_set_window(const char *name, u32 new_value); int tipc_media_addr_printf(char *buf, int len, struct tipc_media_addr *a); int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b, struct nlattr *attrs[]); diff --git a/net/tipc/name_distr.h b/net/tipc/name_distr.h index e231e6964d61..c677f6f082df 100644 --- a/net/tipc/name_distr.h +++ b/net/tipc/name_distr.h @@ -67,7 +67,6 @@ struct distr_item { __be32 key; }; -void tipc_named_bcast(struct net *net, struct sk_buff *skb); struct sk_buff *tipc_named_publish(struct net *net, struct publication *publ); struct sk_buff *tipc_named_withdraw(struct net *net, struct publication *publ); void tipc_named_node_up(struct net *net, u32 dnode, u16 capabilities); diff --git a/net/tipc/net.h b/net/tipc/net.h index d0c91d2df20a..1cb1e43cf34a 100644 --- a/net/tipc/net.h +++ b/net/tipc/net.h @@ -43,7 +43,6 @@ extern const struct nla_policy tipc_nl_net_policy[]; int tipc_net_init(struct net *net, u8 *node_id, u32 addr); void tipc_net_finalize_work(struct work_struct *work); -void tipc_sched_net_finalize(struct net *net, u32 addr); void tipc_net_stop(struct net *net); int tipc_nl_net_dump(struct sk_buff *skb, struct netlink_callback *cb); int tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info); -- cgit v1.2.3 From ce650a1663354a6cad7145e7f5131008458b39d4 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 2 Aug 2023 08:36:50 +0100 Subject: udp6: Fix __ip6_append_data()'s handling of MSG_SPLICE_PAGES __ip6_append_data() can have a similar problem to __ip_append_data()[1] when asked to splice into a partially-built UDP message that has more than the frag-limit data and up to the MTU limit, but in the ipv6 case, it errors out with EINVAL. This can be triggered with something like: pipe(pfd); sfd = socket(AF_INET6, SOCK_DGRAM, 0); connect(sfd, ...); send(sfd, buffer, 8137, MSG_CONFIRM|MSG_MORE); write(pfd[1], buffer, 8); splice(pfd[0], 0, sfd, 0, 0x4ffe0ul, 0); where the amount of data given to send() is dependent on the MTU size (in this instance an interface with an MTU of 8192). The problem is that the calculation of the amount to copy in __ip6_append_data() goes negative in two places, but a check has been put in to give an error in this case.
This happens because when pagedlen > 0 (which happens for MSG_ZEROCOPY and MSG_SPLICE_PAGES), the terms in: copy = datalen - transhdrlen - fraggap - pagedlen; then mostly cancel when pagedlen is substituted for, leaving just -fraggap. Fix this by: (1) Insert a note about the dodgy calculation of 'copy'. (2) If MSG_SPLICE_PAGES, clear copy if it is negative from the above equation, so that 'offset' isn't regressed and 'length' isn't increased, which will mean that length and thus copy should match the amount left in the iterator. (3) When handling MSG_SPLICE_PAGES, give a warning and return -EIO if we're asked to splice more than is in the iterator. It might be better to not give the warning or even just give a 'short' write. (4) If MSG_SPLICE_PAGES, override the copy<0 check. [!] Note that this should also affect MSG_ZEROCOPY, but that will return -EINVAL for the range of send sizes that requires the skbuff to be split. Signed-off-by: David Howells cc: Willem de Bruijn cc: "David S. Miller" cc: Eric Dumazet cc: Jakub Kicinski cc: Paolo Abeni cc: David Ahern cc: Jens Axboe cc: Matthew Wilcox cc: netdev@vger.kernel.org Link: https://lore.kernel.org/r/000000000000881d0606004541d1@google.com/ [1] Reviewed-by: Willem de Bruijn Link: https://lore.kernel.org/r/1580952.1690961810@warthog.procyon.org.uk Signed-off-by: Paolo Abeni --- net/ipv6/ip6_output.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 1e8c90e97608..bc96559bbf0f 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1693,7 +1693,10 @@ alloc_new_skb: fraglen = datalen + fragheaderlen; copy = datalen - transhdrlen - fraggap - pagedlen; - if (copy < 0) { + /* [!] NOTE: copy may be negative if pagedlen>0 + * because then the equation may reduces to -fraggap. + */ + if (copy < 0 && !(flags & MSG_SPLICE_PAGES)) { err = -EINVAL; goto error; } @@ -1744,6 +1747,8 @@ alloc_new_skb: err = -EFAULT; kfree_skb(skb); goto error; + } else if (flags & MSG_SPLICE_PAGES) { + copy = 0; } offset += copy; @@ -1791,6 +1796,10 @@ alloc_new_skb: } else if (flags & MSG_SPLICE_PAGES) { struct msghdr *msg = from; + err = -EIO; + if (WARN_ON_ONCE(copy > msg->msg_iter.count)) + goto error; + err = skb_splice_from_iter(skb, &msg->msg_iter, copy, sk->sk_allocation); if (err < 0) -- cgit v1.2.3 From 49e47a5b6145d86c30022fe0e949bbb24bae28ba Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 2 Aug 2023 18:02:29 -0700 Subject: net: move struct netdev_rx_queue out of netdevice.h struct netdev_rx_queue is touched in only a few places and having it defined in netdevice.h brings in the dependency on xdp.h, because struct xdp_rxq_info gets embedded in struct netdev_rx_queue. In prep for removal of xdp.h from netdevice.h move all the netdev_rx_queue stuff to a new header. We could technically break the new header up to avoid the sysfs.h include but it's so rarely included it doesn't seem to be worth it at this point. 
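After the move, code that dereferences struct netdev_rx_queue includes the new header explicitly, along the lines of this sketch (the helper name is invented):

        #include <net/netdev_rx_queue.h>

        static struct xsk_buff_pool *foo_rxq_pool(struct net_device *dev,
                                                  unsigned int qid)
        {
                struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, qid);

        #ifdef CONFIG_XDP_SOCKETS
                return rxq->pool;
        #else
                return NULL;
        #endif
        }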
Reviewed-by: Amritha Nambiar Signed-off-by: Jakub Kicinski Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/r/20230803010230.1755386-3-kuba@kernel.org Signed-off-by: Martin KaFai Lau --- drivers/net/virtio_net.c | 1 + include/linux/netdevice.h | 44 ----------------------------------- include/net/netdev_rx_queue.h | 53 +++++++++++++++++++++++++++++++++++++++++++ net/bpf/test_run.c | 1 + net/core/dev.c | 1 + net/core/net-sysfs.c | 1 + net/xdp/xsk.c | 1 + 7 files changed, 58 insertions(+), 44 deletions(-) create mode 100644 include/net/netdev_rx_queue.h (limited to 'net') diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 0db14f6b87d3..5bcfd69333ea 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -22,6 +22,7 @@ #include #include #include +#include static int napi_weight = NAPI_POLL_WEIGHT; module_param(napi_weight, int, 0444); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3800d0479698..5563c8a210b5 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -782,32 +782,6 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, u32 flow_id, #endif #endif /* CONFIG_RPS */ -/* This structure contains an instance of an RX queue. */ -struct netdev_rx_queue { - struct xdp_rxq_info xdp_rxq; -#ifdef CONFIG_RPS - struct rps_map __rcu *rps_map; - struct rps_dev_flow_table __rcu *rps_flow_table; -#endif - struct kobject kobj; - struct net_device *dev; - netdevice_tracker dev_tracker; - -#ifdef CONFIG_XDP_SOCKETS - struct xsk_buff_pool *pool; -#endif -} ____cacheline_aligned_in_smp; - -/* - * RX queue sysfs structures and functions. - */ -struct rx_queue_attribute { - struct attribute attr; - ssize_t (*show)(struct netdev_rx_queue *queue, char *buf); - ssize_t (*store)(struct netdev_rx_queue *queue, - const char *buf, size_t len); -}; - /* XPS map type and offset of the xps map within net_device->xps_maps[]. */ enum xps_map_type { XPS_CPUS = 0, @@ -3828,24 +3802,6 @@ static inline int netif_set_real_num_rx_queues(struct net_device *dev, int netif_set_real_num_queues(struct net_device *dev, unsigned int txq, unsigned int rxq); -static inline struct netdev_rx_queue * -__netif_get_rx_queue(struct net_device *dev, unsigned int rxq) -{ - return dev->_rx + rxq; -} - -#ifdef CONFIG_SYSFS -static inline unsigned int get_netdev_rx_queue_index( - struct netdev_rx_queue *queue) -{ - struct net_device *dev = queue->dev; - int index = queue - dev->_rx; - - BUG_ON(index >= dev->num_rx_queues); - return index; -} -#endif - int netif_get_num_default_rss_queues(void); void dev_kfree_skb_irq_reason(struct sk_buff *skb, enum skb_drop_reason reason); diff --git a/include/net/netdev_rx_queue.h b/include/net/netdev_rx_queue.h new file mode 100644 index 000000000000..cdcafb30d437 --- /dev/null +++ b/include/net/netdev_rx_queue.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_NETDEV_RX_QUEUE_H +#define _LINUX_NETDEV_RX_QUEUE_H + +#include +#include +#include +#include + +/* This structure contains an instance of an RX queue. */ +struct netdev_rx_queue { + struct xdp_rxq_info xdp_rxq; +#ifdef CONFIG_RPS + struct rps_map __rcu *rps_map; + struct rps_dev_flow_table __rcu *rps_flow_table; +#endif + struct kobject kobj; + struct net_device *dev; + netdevice_tracker dev_tracker; + +#ifdef CONFIG_XDP_SOCKETS + struct xsk_buff_pool *pool; +#endif +} ____cacheline_aligned_in_smp; + +/* + * RX queue sysfs structures and functions. 
+ */ +struct rx_queue_attribute { + struct attribute attr; + ssize_t (*show)(struct netdev_rx_queue *queue, char *buf); + ssize_t (*store)(struct netdev_rx_queue *queue, + const char *buf, size_t len); +}; + +static inline struct netdev_rx_queue * +__netif_get_rx_queue(struct net_device *dev, unsigned int rxq) +{ + return dev->_rx + rxq; +} + +#ifdef CONFIG_SYSFS +static inline unsigned int +get_netdev_rx_queue_index(struct netdev_rx_queue *queue) +{ + struct net_device *dev = queue->dev; + int index = queue - dev->_rx; + + BUG_ON(index >= dev->num_rx_queues); + return index; +} +#endif +#endif diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 7d47f53f20c1..0aac76c13fd4 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include diff --git a/net/core/dev.c b/net/core/dev.c index 002fec07de73..1916ec990f58 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -152,6 +152,7 @@ #include #include #include +#include #include "dev.h" #include "net-sysfs.h" diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 15e3f4606b5f..fccaa5bac0ed 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -23,6 +23,7 @@ #include #include #include +#include #include "dev.h" #include "net-sysfs.h" diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 4f1e0599146e..82aaec1b079f 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include "xsk_queue.h" -- cgit v1.2.3 From 680ee0456a5712309db9ec2692e908ea1d6b1644 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 2 Aug 2023 18:02:30 -0700 Subject: net: invert the netdevice.h vs xdp.h dependency xdp.h is far more specific and is included in only 67 other files vs netdevice.h's 1538 include sites. Make xdp.h include netdevice.h, instead of the other way around. This decreases the incremental allmodconfig builds size when xdp.h is touched from 5947 to 662 objects. Move bpf_prog_run_xdp() to xdp.h, seems appropriate and filter.h is a mega-header in its own right so it's nice to avoid xdp.h getting included there as well. The only unfortunate part is that the typedef for xdp_features_t has to move to netdevice.h, since its embedded in struct netdevice. Signed-off-by: Jakub Kicinski Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/r/20230803010230.1755386-4-kuba@kernel.org Signed-off-by: Martin KaFai Lau --- include/linux/filter.h | 17 ----------------- include/linux/netdevice.h | 11 ++++------- include/net/busy_poll.h | 1 + include/net/xdp.h | 29 +++++++++++++++++++++++++---- include/trace/events/xdp.h | 1 + kernel/bpf/btf.c | 1 + kernel/bpf/offload.c | 1 + kernel/bpf/verifier.c | 1 + net/netfilter/nf_conntrack_bpf.c | 1 + 9 files changed, 35 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/include/linux/filter.h b/include/linux/filter.h index f5eabe3fa5e8..2d6fe30bad5f 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -774,23 +774,6 @@ DECLARE_STATIC_KEY_FALSE(bpf_master_redirect_enabled_key); u32 xdp_master_redirect(struct xdp_buff *xdp); -static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog, - struct xdp_buff *xdp) -{ - /* Driver XDP hooks are invoked within a single NAPI poll cycle and thus - * under local_bh_disable(), which provides the needed RCU protection - * for accessing map entries. 
- */ - u32 act = __bpf_prog_run(prog, xdp, BPF_DISPATCHER_FUNC(xdp)); - - if (static_branch_unlikely(&bpf_master_redirect_enabled_key)) { - if (act == XDP_TX && netif_is_bond_slave(xdp->rxq->dev)) - act = xdp_master_redirect(xdp); - } - - return act; -} - void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog); static inline u32 bpf_prog_insn_size(const struct bpf_prog *prog) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5563c8a210b5..d8ed85183fe4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -40,7 +40,6 @@ #include #endif #include -#include #include #include @@ -76,8 +75,12 @@ struct udp_tunnel_nic_info; struct udp_tunnel_nic; struct bpf_prog; struct xdp_buff; +struct xdp_frame; +struct xdp_metadata_ops; struct xdp_md; +typedef u32 xdp_features_t; + void synchronize_net(void); void netdev_set_default_ethtool_ops(struct net_device *dev, const struct ethtool_ops *ops); @@ -1628,12 +1631,6 @@ struct net_device_ops { bool cycles); }; -struct xdp_metadata_ops { - int (*xmo_rx_timestamp)(const struct xdp_md *ctx, u64 *timestamp); - int (*xmo_rx_hash)(const struct xdp_md *ctx, u32 *hash, - enum xdp_rss_hash_type *rss_type); -}; - /** * enum netdev_priv_flags - &struct net_device priv_flags * diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index f90f0021f5f2..4dabeb6c76d3 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -16,6 +16,7 @@ #include #include #include +#include /* 0 - Reserved to indicate value not set * 1..NR_CPUS - Reserved for sender_cpu diff --git a/include/net/xdp.h b/include/net/xdp.h index d1c5381fc95f..de08c8e0d134 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -6,9 +6,10 @@ #ifndef __LINUX_NET_XDP_H__ #define __LINUX_NET_XDP_H__ -#include /* skb_shared_info */ -#include #include +#include +#include +#include /* skb_shared_info */ /** * DOC: XDP RX-queue information @@ -45,8 +46,6 @@ enum xdp_mem_type { MEM_TYPE_MAX, }; -typedef u32 xdp_features_t; - /* XDP flags for ndo_xdp_xmit */ #define XDP_XMIT_FLUSH (1U << 0) /* doorbell signal consumer */ #define XDP_XMIT_FLAGS_MASK XDP_XMIT_FLUSH @@ -443,6 +442,12 @@ enum xdp_rss_hash_type { XDP_RSS_TYPE_L4_IPV6_SCTP_EX = XDP_RSS_TYPE_L4_IPV6_SCTP | XDP_RSS_L3_DYNHDR, }; +struct xdp_metadata_ops { + int (*xmo_rx_timestamp)(const struct xdp_md *ctx, u64 *timestamp); + int (*xmo_rx_hash)(const struct xdp_md *ctx, u32 *hash, + enum xdp_rss_hash_type *rss_type); +}; + #ifdef CONFIG_NET u32 bpf_xdp_metadata_kfunc_id(int id); bool bpf_dev_bound_kfunc_id(u32 btf_id); @@ -474,4 +479,20 @@ static inline void xdp_clear_features_flag(struct net_device *dev) xdp_set_features_flag(dev, 0); } +static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog, + struct xdp_buff *xdp) +{ + /* Driver XDP hooks are invoked within a single NAPI poll cycle and thus + * under local_bh_disable(), which provides the needed RCU protection + * for accessing map entries. 
+ */ + u32 act = __bpf_prog_run(prog, xdp, BPF_DISPATCHER_FUNC(xdp)); + + if (static_branch_unlikely(&bpf_master_redirect_enabled_key)) { + if (act == XDP_TX && netif_is_bond_slave(xdp->rxq->dev)) + act = xdp_master_redirect(xdp); + } + + return act; +} #endif /* __LINUX_NET_XDP_H__ */ diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index cd89f1d5ce7b..9adc2bdf2f94 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -9,6 +9,7 @@ #include #include #include +#include #define __XDP_ACT_MAP(FN) \ FN(ABORTED) \ diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index ef9581a580e2..249657c466dd 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -29,6 +29,7 @@ #include #include +#include #include "../tools/lib/bpf/relo_core.h" /* BTF (BPF Type Format) is the meta data format which describes diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 8a26cd8814c1..3e4f2ec1af06 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -25,6 +25,7 @@ #include #include #include +#include /* Protects offdevs, members of bpf_offload_netdev and offload members * of all progs. diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e7b1af016841..132f25dab931 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "disasm.h" diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c index 0d36d7285e3f..c7a6114091ae 100644 --- a/net/netfilter/nf_conntrack_bpf.c +++ b/net/netfilter/nf_conntrack_bpf.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include -- cgit v1.2.3 From 82e896d992fa631cda1f63239fd47b3ab781ffa6 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 2 Aug 2023 09:18:21 -0700 Subject: docs: net: page_pool: use kdoc to avoid duplicating the information All struct members of the driver-facing APIs are documented twice, in the code and under Documentation. This is a bit tedious. I also get the feeling that a lot of developers will read the header when coding, rather than the doc. Bring the two a little closer together by using kdoc for structs and functions. Using kdoc also gives us links (mentioning a function or struct in the text gets replaced by a link to its doc). Reviewed-by: Randy Dunlap Tested-by: Randy Dunlap Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/r/20230802161821.3621985-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/networking/page_pool.rst | 92 ++++++---------------- include/net/page_pool.h | 134 +++++++++++++++++++++++++-------- net/core/page_pool.c | 31 +++++++- 3 files changed, 155 insertions(+), 102 deletions(-) (limited to 'net') diff --git a/Documentation/networking/page_pool.rst b/Documentation/networking/page_pool.rst index eb96a592ec6b..53b5448cc0f1 100644 --- a/Documentation/networking/page_pool.rst +++ b/Documentation/networking/page_pool.rst @@ -64,50 +64,19 @@ This lockless guarantee naturally comes from running under a NAPI softirq. The protection doesn't strictly have to be NAPI, any guarantee that allocating a page will cause no race conditions is enough. -* page_pool_create(): Create a pool. - * flags: PP_FLAG_DMA_MAP, PP_FLAG_DMA_SYNC_DEV - * order: 2^order pages on allocation - * pool_size: size of the ptr_ring - * nid: preferred NUMA node for allocation - * dev: struct device. 
Used on DMA operations - * dma_dir: DMA direction - * max_len: max DMA sync memory size - * offset: DMA address offset - -* page_pool_put_page(): The outcome of this depends on the page refcnt. If the - driver bumps the refcnt > 1 this will unmap the page. If the page refcnt is 1 - the allocator owns the page and will try to recycle it in one of the pool - caches. If PP_FLAG_DMA_SYNC_DEV is set, the page will be synced for_device - using dma_sync_single_range_for_device(). - -* page_pool_put_full_page(): Similar to page_pool_put_page(), but will DMA sync - for the entire memory area configured in area pool->max_len. - -* page_pool_recycle_direct(): Similar to page_pool_put_full_page() but caller - must guarantee safe context (e.g NAPI), since it will recycle the page - directly into the pool fast cache. - -* page_pool_dev_alloc_pages(): Get a page from the page allocator or page_pool - caches. - -* page_pool_get_dma_addr(): Retrieve the stored DMA address. - -* page_pool_get_dma_dir(): Retrieve the stored DMA direction. - -* page_pool_put_page_bulk(): Tries to refill a number of pages into the - ptr_ring cache holding ptr_ring producer lock. If the ptr_ring is full, - page_pool_put_page_bulk() will release leftover pages to the page allocator. - page_pool_put_page_bulk() is suitable to be run inside the driver NAPI tx - completion loop for the XDP_REDIRECT use case. - Please note the caller must not use data area after running - page_pool_put_page_bulk(), as this function overwrites it. - -* page_pool_get_stats(): Retrieve statistics about the page_pool. This API - is only available if the kernel has been configured with - ``CONFIG_PAGE_POOL_STATS=y``. A pointer to a caller allocated ``struct - page_pool_stats`` structure is passed to this API which is filled in. The - caller can then report those stats to the user (perhaps via ethtool, - debugfs, etc.). See below for an example usage of this API. +.. kernel-doc:: net/core/page_pool.c + :identifiers: page_pool_create + +.. kernel-doc:: include/net/page_pool.h + :identifiers: struct page_pool_params + +.. kernel-doc:: include/net/page_pool.h + :identifiers: page_pool_put_page page_pool_put_full_page + page_pool_recycle_direct page_pool_dev_alloc_pages + page_pool_get_dma_addr page_pool_get_dma_dir + +.. kernel-doc:: net/core/page_pool.c + :identifiers: page_pool_put_page_bulk page_pool_get_stats DMA sync -------- @@ -146,36 +115,17 @@ with fragmented page pools. Stats API and structures ------------------------ If the kernel is configured with ``CONFIG_PAGE_POOL_STATS=y``, the API -``page_pool_get_stats()`` and structures described below are available. It -takes a pointer to a ``struct page_pool`` and a pointer to a ``struct -page_pool_stats`` allocated by the caller. +page_pool_get_stats() and structures described below are available. +It takes a pointer to a ``struct page_pool`` and a pointer to a struct +page_pool_stats allocated by the caller. -The API will fill in the provided ``struct page_pool_stats`` with +The API will fill in the provided struct page_pool_stats with statistics about the page_pool. 
-The stats structure has the following fields:: - - struct page_pool_stats { - struct page_pool_alloc_stats alloc_stats; - struct page_pool_recycle_stats recycle_stats; - }; - - -The ``struct page_pool_alloc_stats`` has the following fields: - * ``fast``: successful fast path allocations - * ``slow``: slow path order-0 allocations - * ``slow_high_order``: slow path high order allocations - * ``empty``: ptr ring is empty, so a slow path allocation was forced. - * ``refill``: an allocation which triggered a refill of the cache - * ``waive``: pages obtained from the ptr ring that cannot be added to - the cache due to a NUMA mismatch. - -The ``struct page_pool_recycle_stats`` has the following fields: - * ``cached``: recycling placed page in the page pool cache - * ``cache_full``: page pool cache was full - * ``ring``: page placed into the ptr ring - * ``ring_full``: page released from page pool because the ptr ring was full - * ``released_refcnt``: page released (and not recycled) because refcnt > 1 +.. kernel-doc:: include/net/page_pool.h + :identifiers: struct page_pool_recycle_stats + struct page_pool_alloc_stats + struct page_pool_stats Coding examples =============== diff --git a/include/net/page_pool.h b/include/net/page_pool.h index f1d5cc1fa13b..73d4f786418d 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -70,47 +70,76 @@ struct pp_alloc_cache { struct page *cache[PP_ALLOC_CACHE_SIZE]; }; +/** + * struct page_pool_params - page pool parameters + * @flags: PP_FLAG_DMA_MAP, PP_FLAG_DMA_SYNC_DEV, PP_FLAG_PAGE_FRAG + * @order: 2^order pages on allocation + * @pool_size: size of the ptr_ring + * @nid: NUMA node id to allocate from pages from + * @dev: device, for DMA pre-mapping purposes + * @napi: NAPI which is the sole consumer of pages, otherwise NULL + * @dma_dir: DMA mapping direction + * @max_len: max DMA sync memory size for PP_FLAG_DMA_SYNC_DEV + * @offset: DMA sync address offset for PP_FLAG_DMA_SYNC_DEV + */ struct page_pool_params { unsigned int flags; unsigned int order; unsigned int pool_size; - int nid; /* Numa node id to allocate from pages from */ - struct device *dev; /* device, for DMA pre-mapping purposes */ - struct napi_struct *napi; /* Sole consumer of pages, otherwise NULL */ - enum dma_data_direction dma_dir; /* DMA mapping direction */ - unsigned int max_len; /* max DMA sync memory size */ - unsigned int offset; /* DMA addr offset */ + int nid; + struct device *dev; + struct napi_struct *napi; + enum dma_data_direction dma_dir; + unsigned int max_len; + unsigned int offset; +/* private: used by test code only */ void (*init_callback)(struct page *page, void *arg); void *init_arg; }; #ifdef CONFIG_PAGE_POOL_STATS +/** + * struct page_pool_alloc_stats - allocation statistics + * @fast: successful fast path allocations + * @slow: slow path order-0 allocations + * @slow_high_order: slow path high order allocations + * @empty: ptr ring is empty, so a slow path allocation was forced + * @refill: an allocation which triggered a refill of the cache + * @waive: pages obtained from the ptr ring that cannot be added to + * the cache due to a NUMA mismatch + */ struct page_pool_alloc_stats { - u64 fast; /* fast path allocations */ - u64 slow; /* slow-path order 0 allocations */ - u64 slow_high_order; /* slow-path high order allocations */ - u64 empty; /* failed refills due to empty ptr ring, forcing - * slow path allocation - */ - u64 refill; /* allocations via successful refill */ - u64 waive; /* failed refills due to numa zone mismatch */ + u64 fast; + 
u64 slow; + u64 slow_high_order; + u64 empty; + u64 refill; + u64 waive; }; +/** + * struct page_pool_recycle_stats - recycling (freeing) statistics + * @cached: recycling placed page in the page pool cache + * @cache_full: page pool cache was full + * @ring: page placed into the ptr ring + * @ring_full: page released from page pool because the ptr ring was full + * @released_refcnt: page released (and not recycled) because refcnt > 1 + */ struct page_pool_recycle_stats { - u64 cached; /* recycling placed page in the cache. */ - u64 cache_full; /* cache was full */ - u64 ring; /* recycling placed page back into ptr ring */ - u64 ring_full; /* page was released from page-pool because - * PTR ring was full. - */ - u64 released_refcnt; /* page released because of elevated - * refcnt - */ + u64 cached; + u64 cache_full; + u64 ring; + u64 ring_full; + u64 released_refcnt; }; -/* This struct wraps the above stats structs so users of the - * page_pool_get_stats API can pass a single argument when requesting the - * stats for the page pool. +/** + * struct page_pool_stats - combined page pool use statistics + * @alloc_stats: see struct page_pool_alloc_stats + * @recycle_stats: see struct page_pool_recycle_stats + * + * Wrapper struct for combining page pool stats with different storage + * requirements. */ struct page_pool_stats { struct page_pool_alloc_stats alloc_stats; @@ -211,6 +240,12 @@ struct page_pool { struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp); +/** + * page_pool_dev_alloc_pages() - allocate a page. + * @pool: pool from which to allocate + * + * Get a page from the page allocator or page_pool caches. + */ static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool) { gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); @@ -230,8 +265,12 @@ static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool, return page_pool_alloc_frag(pool, offset, size, gfp); } -/* get the stored dma direction. A driver might decide to treat this locally and - * avoid the extra cache line from page_pool to determine the direction +/** + * page_pool_get_dma_dir() - Retrieve the stored DMA direction. + * @pool: pool from which page was allocated + * + * Get the stored dma direction. A driver might decide to store this locally + * and avoid the extra cache line from page_pool to determine the direction. */ static inline enum dma_data_direction page_pool_get_dma_dir(struct page_pool *pool) @@ -321,6 +360,19 @@ static inline bool page_pool_is_last_frag(struct page_pool *pool, (page_pool_defrag_page(page, 1) == 0); } +/** + * page_pool_put_page() - release a reference to a page pool page + * @pool: pool from which page was allocated + * @page: page to release a reference on + * @dma_sync_size: how much of the page may have been touched by the device + * @allow_direct: released by the consumer, allow lockless caching + * + * The outcome of this depends on the page refcnt. If the driver bumps + * the refcnt > 1 this will unmap the page. If the page refcnt is 1 + * the allocator owns the page and will try to recycle it in one of the pool + * caches. If PP_FLAG_DMA_SYNC_DEV is set, the page will be synced for_device + * using dma_sync_single_range_for_device(). 
+ */ static inline void page_pool_put_page(struct page_pool *pool, struct page *page, unsigned int dma_sync_size, @@ -337,14 +389,29 @@ static inline void page_pool_put_page(struct page_pool *pool, #endif } -/* Same as above but will try to sync the entire area pool->max_len */ +/** + * page_pool_put_full_page() - release a reference on a page pool page + * @pool: pool from which page was allocated + * @page: page to release a reference on + * @allow_direct: released by the consumer, allow lockless caching + * + * Similar to page_pool_put_page(), but will DMA sync the entire memory area + * as configured in &page_pool_params.max_len. + */ static inline void page_pool_put_full_page(struct page_pool *pool, struct page *page, bool allow_direct) { page_pool_put_page(pool, page, -1, allow_direct); } -/* Same as above but the caller must guarantee safe context. e.g NAPI */ +/** + * page_pool_recycle_direct() - release a reference on a page pool page + * @pool: pool from which page was allocated + * @page: page to release a reference on + * + * Similar to page_pool_put_full_page() but caller must guarantee safe context + * (e.g NAPI), since it will recycle the page directly into the pool fast cache. + */ static inline void page_pool_recycle_direct(struct page_pool *pool, struct page *page) { @@ -354,6 +421,13 @@ static inline void page_pool_recycle_direct(struct page_pool *pool, #define PAGE_POOL_DMA_USE_PP_FRAG_COUNT \ (sizeof(dma_addr_t) > sizeof(unsigned long)) +/** + * page_pool_get_dma_addr() - Retrieve the stored DMA address. + * @page: page allocated from a page pool + * + * Fetch the DMA address of the page. The page pool to which the page belongs + * must had been created with PP_FLAG_DMA_MAP. + */ static inline dma_addr_t page_pool_get_dma_addr(struct page *page) { dma_addr_t ret = page->dma_addr; diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 7ca456bfab71..5d615a169718 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -58,6 +58,17 @@ static const char pp_stats[][ETH_GSTRING_LEN] = { "rx_pp_recycle_released_ref", }; +/** + * page_pool_get_stats() - fetch page pool stats + * @pool: pool from which page was allocated + * @stats: struct page_pool_stats to fill in + * + * Retrieve statistics about the page_pool. This API is only available + * if the kernel has been configured with ``CONFIG_PAGE_POOL_STATS=y``. + * A pointer to a caller allocated struct page_pool_stats structure + * is passed to this API which is filled in. The caller can then report + * those stats to the user (perhaps via ethtool, debugfs, etc.). + */ bool page_pool_get_stats(struct page_pool *pool, struct page_pool_stats *stats) { @@ -224,6 +235,10 @@ static int page_pool_init(struct page_pool *pool, return 0; } +/** + * page_pool_create() - create a page pool. + * @params: parameters, see struct page_pool_params + */ struct page_pool *page_pool_create(const struct page_pool_params *params) { struct page_pool *pool; @@ -626,7 +641,21 @@ void page_pool_put_defragged_page(struct page_pool *pool, struct page *page, } EXPORT_SYMBOL(page_pool_put_defragged_page); -/* Caller must not use data area after call, as this function overwrites it */ +/** + * page_pool_put_page_bulk() - release references on multiple pages + * @pool: pool from which pages were allocated + * @data: array holding page pointers + * @count: number of pages in @data + * + * Tries to refill a number of pages into the ptr_ring cache holding ptr_ring + * producer lock. 
If the ptr_ring is full, page_pool_put_page_bulk() + * will release leftover pages to the page allocator. + * page_pool_put_page_bulk() is suitable to be run inside the driver NAPI tx + * completion loop for the XDP_REDIRECT use case. + * + * Please note the caller must not use data area after running + * page_pool_put_page_bulk(), as this function overwrites it. + */ void page_pool_put_page_bulk(struct page_pool *pool, void **data, int count) { -- cgit v1.2.3 From 2744cefe03371cb6dda18649635e5dcbbb227ec9 Mon Sep 17 00:00:00 2001 From: Simon Wunderlich Date: Thu, 3 Aug 2023 21:09:39 +0200 Subject: batman-adv: Start new development cycle This version will contain all the (major or even only minor) changes for Linux 6.6. The version number isn't a semantic version number with major and minor information. It is just encoding the year of the expected publishing as Linux -rc1 and the number of published versions this year (starting at 0). Signed-off-by: Simon Wunderlich --- net/batman-adv/main.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 156ed39eded1..10007c5894a1 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -13,7 +13,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2023.1" +#define BATADV_SOURCE_VERSION "2023.3" #endif /* B.A.T.M.A.N. parameters */ -- cgit v1.2.3 From bbfb428a0cf6fbce4bcb34510a8eb0cf9b3841a4 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 26 Jul 2023 22:25:25 +0800 Subject: batman-adv: Remove unused declarations Since commit 335fbe0f5d25 ("batman-adv: tvlv - convert tt query packet to use tvlv unicast packets") batadv_recv_tt_query() is not used. And commit 122edaa05940 ("batman-adv: tvlv - convert roaming adv packet to use tvlv unicast packets") left behind batadv_recv_roam_adv(). Signed-off-by: YueHaibing Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/routing.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'net') diff --git a/net/batman-adv/routing.h b/net/batman-adv/routing.h index 5f387786e9a7..afd15b3879f1 100644 --- a/net/batman-adv/routing.h +++ b/net/batman-adv/routing.h @@ -27,10 +27,6 @@ int batadv_recv_frag_packet(struct sk_buff *skb, struct batadv_hard_iface *iface); int batadv_recv_bcast_packet(struct sk_buff *skb, struct batadv_hard_iface *recv_if); -int batadv_recv_tt_query(struct sk_buff *skb, - struct batadv_hard_iface *recv_if); -int batadv_recv_roam_adv(struct sk_buff *skb, - struct batadv_hard_iface *recv_if); int batadv_recv_unicast_tvlv(struct sk_buff *skb, struct batadv_hard_iface *recv_if); int batadv_recv_unhandled_unicast_packet(struct sk_buff *skb, -- cgit v1.2.3 From e4b817804579624b47823d87e647ec6c3d3ab827 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Thu, 27 Jul 2023 10:13:42 +0200 Subject: batman-adv: Avoid magic value for minimum MTU The header linux/if_ether.h already defines a constant for the minimum MTU. So simply use it instead of having a magic constant in the code. 
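For reference, include/uapi/linux/if_ether.h defines ETH_MIN_MTU as 68, so the check ends up following this rough pattern (a sketch with a generic max_mtu placeholder, not the exact batman-adv code):

	/* before: magic constant for the minimum Ethernet MTU */
	if (new_mtu < 68 || new_mtu > max_mtu)
		return -EINVAL;

	/* after: named constant from linux/if_ether.h (ETH_MIN_MTU == 68) */
	if (new_mtu < ETH_MIN_MTU || new_mtu > max_mtu)
		return -EINVAL;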
Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/soft-interface.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index d3fdf82282af..f7947fad06f2 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -154,7 +154,7 @@ static int batadv_interface_set_mac_addr(struct net_device *dev, void *p) static int batadv_interface_change_mtu(struct net_device *dev, int new_mtu) { /* check ranges */ - if (new_mtu < 68 || new_mtu > batadv_hardif_min_mtu(dev)) + if (new_mtu < ETH_MIN_MTU || new_mtu > batadv_hardif_min_mtu(dev)) return -EINVAL; dev->mtu = new_mtu; -- cgit v1.2.3 From 112cbcb4af90c4b108b120a28089872d2234a350 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Thu, 27 Jul 2023 10:13:43 +0200 Subject: batman-adv: Check hardif MTU against runtime MTU If the MTU of the soft/mesh interface was already reduced (enough), it is not necessary to print a warning about a hard interface not having a MTU to transport ethernet payloads of 1500 bytes. Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/hard-interface.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 41c1ad33d009..5a4ff9a81e74 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -9,6 +9,7 @@ #include #include +#include #include #include #include @@ -699,9 +700,14 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, struct batadv_priv *bat_priv; __be16 ethertype = htons(ETH_P_BATMAN); int max_header_len = batadv_max_header_len(); + unsigned int required_mtu; + unsigned int hardif_mtu; int ret; - if (hard_iface->net_dev->mtu < ETH_MIN_MTU + max_header_len) + hardif_mtu = READ_ONCE(hard_iface->net_dev->mtu); + required_mtu = READ_ONCE(soft_iface->mtu) + max_header_len; + + if (hardif_mtu < ETH_MIN_MTU + max_header_len) return -EINVAL; if (hard_iface->if_status != BATADV_IF_NOT_IN_USE) @@ -734,18 +740,18 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, hard_iface->net_dev->name); if (atomic_read(&bat_priv->fragmentation) && - hard_iface->net_dev->mtu < ETH_DATA_LEN + max_header_len) + hardif_mtu < required_mtu) batadv_info(hard_iface->soft_iface, "The MTU of interface %s is too small (%i) to handle the transport of batman-adv packets. Packets going over this interface will be fragmented on layer2 which could impact the performance. Setting the MTU to %i would solve the problem.\n", - hard_iface->net_dev->name, hard_iface->net_dev->mtu, - ETH_DATA_LEN + max_header_len); + hard_iface->net_dev->name, hardif_mtu, + required_mtu); if (!atomic_read(&bat_priv->fragmentation) && - hard_iface->net_dev->mtu < ETH_DATA_LEN + max_header_len) + hardif_mtu < required_mtu) batadv_info(hard_iface->soft_iface, "The MTU of interface %s is too small (%i) to handle the transport of batman-adv packets. 
If you experience problems getting traffic through try increasing the MTU to %i.\n", - hard_iface->net_dev->name, hard_iface->net_dev->mtu, - ETH_DATA_LEN + max_header_len); + hard_iface->net_dev->name, hardif_mtu, + required_mtu); if (batadv_hardif_is_iface_up(hard_iface)) batadv_hardif_activate_interface(hard_iface); -- cgit v1.2.3 From ba0f66c95fa670d99ad40e4a6c1ede425446a291 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Aug 2023 13:13:33 +0200 Subject: devlink: rename devlink_nl_ops to devlink_nl_small_ops In order to avoid name collision with the generated split ops array which is going to be introduced as a follow-up patch, rename the existing ops array to devlink_nl_small_ops. Signed-off-by: Jiri Pirko Reviewed-by: Jakub Kicinski Link: https://lore.kernel.org/r/20230803111340.1074067-6-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- net/devlink/devl_internal.h | 2 +- net/devlink/leftover.c | 2 +- net/devlink/netlink.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/devlink/devl_internal.h b/net/devlink/devl_internal.h index 62921b2eb0d3..c67f074641d4 100644 --- a/net/devlink/devl_internal.h +++ b/net/devlink/devl_internal.h @@ -119,7 +119,7 @@ struct devlink_cmd { struct netlink_callback *cb); }; -extern const struct genl_small_ops devlink_nl_ops[56]; +extern const struct genl_small_ops devlink_nl_small_ops[56]; struct devlink * devlink_get_from_attrs_lock(struct net *net, struct nlattr **attrs); diff --git a/net/devlink/leftover.c b/net/devlink/leftover.c index 5128b9c7eea8..8f42f1f45705 100644 --- a/net/devlink/leftover.c +++ b/net/devlink/leftover.c @@ -6278,7 +6278,7 @@ static int devlink_nl_cmd_trap_policer_set_doit(struct sk_buff *skb, return devlink_trap_policer_set(devlink, policer_item, info); } -const struct genl_small_ops devlink_nl_ops[56] = { +const struct genl_small_ops devlink_nl_small_ops[56] = { { .cmd = DEVLINK_CMD_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, diff --git a/net/devlink/netlink.c b/net/devlink/netlink.c index 7a332eb70f70..82a3238d5344 100644 --- a/net/devlink/netlink.c +++ b/net/devlink/netlink.c @@ -243,8 +243,8 @@ struct genl_family devlink_nl_family __ro_after_init = { .pre_doit = devlink_nl_pre_doit, .post_doit = devlink_nl_post_doit, .module = THIS_MODULE, - .small_ops = devlink_nl_ops, - .n_small_ops = ARRAY_SIZE(devlink_nl_ops), + .small_ops = devlink_nl_small_ops, + .n_small_ops = ARRAY_SIZE(devlink_nl_small_ops), .resv_start_op = DEVLINK_CMD_SELFTESTS_RUN + 1, .mcgrps = devlink_nl_mcgrps, .n_mcgrps = ARRAY_SIZE(devlink_nl_mcgrps), -- cgit v1.2.3 From d61aedcf628ef7467d69638c3b3e5e33cafd75aa Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Aug 2023 13:13:34 +0200 Subject: devlink: rename couple of doit netlink callbacks to match generated names The generated names of the doit netlink callback are missing "cmd" in their names. Change names to be ready to switch to generated split ops header. 
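As a quick illustration of the naming scheme (only the two callbacks touched here; the generator derives roughly <family>_nl_<command>_doit from the spec, without the extra "cmd"):

	/* hand-written names used so far */
	int devlink_nl_cmd_get_doit(struct sk_buff *skb, struct genl_info *info);
	int devlink_nl_cmd_info_get_doit(struct sk_buff *skb, struct genl_info *info);

	/* names the generated split ops expect */
	int devlink_nl_get_doit(struct sk_buff *skb, struct genl_info *info);
	int devlink_nl_info_get_doit(struct sk_buff *skb, struct genl_info *info);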
Signed-off-by: Jiri Pirko Reviewed-by: Jakub Kicinski Link: https://lore.kernel.org/r/20230803111340.1074067-7-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- net/devlink/dev.c | 4 ++-- net/devlink/devl_internal.h | 4 ++-- net/devlink/leftover.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/devlink/dev.c b/net/devlink/dev.c index bf1d6f1bcfc7..2e120b3da220 100644 --- a/net/devlink/dev.c +++ b/net/devlink/dev.c @@ -196,7 +196,7 @@ void devlink_notify(struct devlink *devlink, enum devlink_command cmd) msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); } -int devlink_nl_cmd_get_doit(struct sk_buff *skb, struct genl_info *info) +int devlink_nl_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct sk_buff *msg; @@ -804,7 +804,7 @@ err_cancel_msg: return err; } -int devlink_nl_cmd_info_get_doit(struct sk_buff *skb, struct genl_info *info) +int devlink_nl_info_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct sk_buff *msg; diff --git a/net/devlink/devl_internal.h b/net/devlink/devl_internal.h index c67f074641d4..d32a32705430 100644 --- a/net/devlink/devl_internal.h +++ b/net/devlink/devl_internal.h @@ -214,11 +214,11 @@ struct devlink_rate * devlink_rate_node_get_from_info(struct devlink *devlink, struct genl_info *info); /* Devlink nl cmds */ -int devlink_nl_cmd_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_get_doit(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_eswitch_get_doit(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb, struct genl_info *info); -int devlink_nl_cmd_info_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_info_get_doit(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_flash_update(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_selftests_get_doit(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_selftests_run(struct sk_buff *skb, struct genl_info *info); diff --git a/net/devlink/leftover.c b/net/devlink/leftover.c index 8f42f1f45705..094cd0e8224e 100644 --- a/net/devlink/leftover.c +++ b/net/devlink/leftover.c @@ -6282,7 +6282,7 @@ const struct genl_small_ops devlink_nl_small_ops[56] = { { .cmd = DEVLINK_CMD_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_get_doit, + .doit = devlink_nl_get_doit, .dumpit = devlink_nl_instance_iter_dumpit, /* can be retrieved by unprivileged users */ }, @@ -6536,7 +6536,7 @@ const struct genl_small_ops devlink_nl_small_ops[56] = { { .cmd = DEVLINK_CMD_INFO_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_info_get_doit, + .doit = devlink_nl_info_get_doit, .dumpit = devlink_nl_instance_iter_dumpit, /* can be retrieved by unprivileged users */ }, -- cgit v1.2.3 From 491a24872a64cbb8edf996b6113a97586702e5ca Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Aug 2023 13:13:35 +0200 Subject: devlink: introduce couple of dumpit callbacks for split ops Introduce couple of dumpit callbacks for generated split ops. Have them as a thin wrapper around iteration function and allow to pass dump_one() function pointer directly without need to store in devlink_cmd structs. 
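For the get command the resulting wrapper is just a one-liner (as in the change below):

	int devlink_nl_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb)
	{
		/* hand the per-command dump_one() helper straight to the iterator */
		return devlink_nl_dumpit(msg, cb, devlink_nl_get_dump_one);
	}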
Signed-off-by: Jiri Pirko Reviewed-by: Jakub Kicinski Link: https://lore.kernel.org/r/20230803111340.1074067-8-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- net/devlink/dev.c | 22 ++++++++++++---------- net/devlink/devl_internal.h | 17 +++++++++++------ net/devlink/leftover.c | 4 ++-- net/devlink/netlink.c | 21 ++++++++++++--------- 4 files changed, 37 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/net/devlink/dev.c b/net/devlink/dev.c index 2e120b3da220..5dfba2248b90 100644 --- a/net/devlink/dev.c +++ b/net/devlink/dev.c @@ -217,17 +217,18 @@ int devlink_nl_get_doit(struct sk_buff *skb, struct genl_info *info) } static int -devlink_nl_cmd_get_dump_one(struct sk_buff *msg, struct devlink *devlink, - struct netlink_callback *cb) +devlink_nl_get_dump_one(struct sk_buff *msg, struct devlink *devlink, + struct netlink_callback *cb) { return devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI); } -const struct devlink_cmd devl_cmd_get = { - .dump_one = devlink_nl_cmd_get_dump_one, -}; +int devlink_nl_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb) +{ + return devlink_nl_dumpit(msg, cb, devlink_nl_get_dump_one); +} static void devlink_reload_failed_set(struct devlink *devlink, bool reload_failed) @@ -826,8 +827,8 @@ int devlink_nl_info_get_doit(struct sk_buff *skb, struct genl_info *info) } static int -devlink_nl_cmd_info_get_dump_one(struct sk_buff *msg, struct devlink *devlink, - struct netlink_callback *cb) +devlink_nl_info_get_dump_one(struct sk_buff *msg, struct devlink *devlink, + struct netlink_callback *cb) { int err; @@ -840,9 +841,10 @@ devlink_nl_cmd_info_get_dump_one(struct sk_buff *msg, struct devlink *devlink, return err; } -const struct devlink_cmd devl_cmd_info_get = { - .dump_one = devlink_nl_cmd_info_get_dump_one, -}; +int devlink_nl_info_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb) +{ + return devlink_nl_dumpit(msg, cb, devlink_nl_info_get_dump_one); +} static int devlink_nl_flash_update_fill(struct sk_buff *msg, struct devlink *devlink, diff --git a/net/devlink/devl_internal.h b/net/devlink/devl_internal.h index d32a32705430..7d0a9dd1aceb 100644 --- a/net/devlink/devl_internal.h +++ b/net/devlink/devl_internal.h @@ -114,9 +114,12 @@ struct devlink_nl_dump_state { }; }; +typedef int devlink_nl_dump_one_func_t(struct sk_buff *msg, + struct devlink *devlink, + struct netlink_callback *cb); + struct devlink_cmd { - int (*dump_one)(struct sk_buff *msg, struct devlink *devlink, - struct netlink_callback *cb); + devlink_nl_dump_one_func_t *dump_one; }; extern const struct genl_small_ops devlink_nl_small_ops[56]; @@ -127,8 +130,9 @@ devlink_get_from_attrs_lock(struct net *net, struct nlattr **attrs); void devlink_notify_unregister(struct devlink *devlink); void devlink_notify_register(struct devlink *devlink); -int devlink_nl_instance_iter_dumpit(struct sk_buff *msg, - struct netlink_callback *cb); +int devlink_nl_dumpit(struct sk_buff *msg, struct netlink_callback *cb, + devlink_nl_dump_one_func_t *dump_one); +int devlink_nl_instance_iter_dumpit(struct sk_buff *msg, struct netlink_callback *cb); static inline struct devlink_nl_dump_state * devlink_dump_state(struct netlink_callback *cb) @@ -149,7 +153,6 @@ devlink_nl_put_handle(struct sk_buff *msg, struct devlink *devlink) } /* Commands */ -extern const struct devlink_cmd devl_cmd_get; extern const struct devlink_cmd devl_cmd_port_get; extern const struct devlink_cmd devl_cmd_sb_get; extern const struct devlink_cmd 
devl_cmd_sb_pool_get; @@ -157,7 +160,6 @@ extern const struct devlink_cmd devl_cmd_sb_port_pool_get; extern const struct devlink_cmd devl_cmd_sb_tc_pool_bind_get; extern const struct devlink_cmd devl_cmd_param_get; extern const struct devlink_cmd devl_cmd_region_get; -extern const struct devlink_cmd devl_cmd_info_get; extern const struct devlink_cmd devl_cmd_health_reporter_get; extern const struct devlink_cmd devl_cmd_trap_get; extern const struct devlink_cmd devl_cmd_trap_group_get; @@ -215,10 +217,13 @@ devlink_rate_node_get_from_info(struct devlink *devlink, struct genl_info *info); /* Devlink nl cmds */ int devlink_nl_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_eswitch_get_doit(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb, struct genl_info *info); int devlink_nl_info_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_info_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); int devlink_nl_cmd_flash_update(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_selftests_get_doit(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_selftests_run(struct sk_buff *skb, struct genl_info *info); diff --git a/net/devlink/leftover.c b/net/devlink/leftover.c index 094cd0e8224e..895b732bed8e 100644 --- a/net/devlink/leftover.c +++ b/net/devlink/leftover.c @@ -6283,7 +6283,7 @@ const struct genl_small_ops devlink_nl_small_ops[56] = { .cmd = DEVLINK_CMD_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_get_doit, - .dumpit = devlink_nl_instance_iter_dumpit, + .dumpit = devlink_nl_get_dumpit, /* can be retrieved by unprivileged users */ }, { @@ -6537,7 +6537,7 @@ const struct genl_small_ops devlink_nl_small_ops[56] = { .cmd = DEVLINK_CMD_INFO_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_info_get_doit, - .dumpit = devlink_nl_instance_iter_dumpit, + .dumpit = devlink_nl_info_get_dumpit, /* can be retrieved by unprivileged users */ }, { diff --git a/net/devlink/netlink.c b/net/devlink/netlink.c index 82a3238d5344..624d0db7e229 100644 --- a/net/devlink/netlink.c +++ b/net/devlink/netlink.c @@ -178,7 +178,6 @@ static void devlink_nl_post_doit(const struct genl_split_ops *ops, } static const struct devlink_cmd *devl_cmds[] = { - [DEVLINK_CMD_GET] = &devl_cmd_get, [DEVLINK_CMD_PORT_GET] = &devl_cmd_port_get, [DEVLINK_CMD_SB_GET] = &devl_cmd_sb_get, [DEVLINK_CMD_SB_POOL_GET] = &devl_cmd_sb_pool_get, @@ -186,7 +185,6 @@ static const struct devlink_cmd *devl_cmds[] = { [DEVLINK_CMD_SB_TC_POOL_BIND_GET] = &devl_cmd_sb_tc_pool_bind_get, [DEVLINK_CMD_PARAM_GET] = &devl_cmd_param_get, [DEVLINK_CMD_REGION_GET] = &devl_cmd_region_get, - [DEVLINK_CMD_INFO_GET] = &devl_cmd_info_get, [DEVLINK_CMD_HEALTH_REPORTER_GET] = &devl_cmd_health_reporter_get, [DEVLINK_CMD_TRAP_GET] = &devl_cmd_trap_get, [DEVLINK_CMD_TRAP_GROUP_GET] = &devl_cmd_trap_group_get, @@ -196,23 +194,19 @@ static const struct devlink_cmd *devl_cmds[] = { [DEVLINK_CMD_SELFTESTS_GET] = &devl_cmd_selftests_get, }; -int devlink_nl_instance_iter_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) +int devlink_nl_dumpit(struct sk_buff *msg, struct netlink_callback *cb, + devlink_nl_dump_one_func_t *dump_one) { - const struct genl_dumpit_info *info = genl_dumpit_info(cb); struct 
devlink_nl_dump_state *state = devlink_dump_state(cb); - const struct devlink_cmd *cmd; struct devlink *devlink; int err = 0; - cmd = devl_cmds[info->op.cmd]; - while ((devlink = devlinks_xa_find_get(sock_net(msg->sk), &state->instance))) { devl_lock(devlink); if (devl_is_registered(devlink)) - err = cmd->dump_one(msg, devlink, cb); + err = dump_one(msg, devlink, cb); else err = 0; @@ -233,6 +227,15 @@ int devlink_nl_instance_iter_dumpit(struct sk_buff *msg, return msg->len; } +int devlink_nl_instance_iter_dumpit(struct sk_buff *msg, + struct netlink_callback *cb) +{ + const struct genl_dumpit_info *info = genl_dumpit_info(cb); + const struct devlink_cmd *cmd = devl_cmds[info->op.cmd]; + + return devlink_nl_dumpit(msg, cb, cmd->dump_one); +} + struct genl_family devlink_nl_family __ro_after_init = { .name = DEVLINK_GENL_NAME, .version = DEVLINK_GENL_VERSION, -- cgit v1.2.3 From 8300dce542e45c50ef318c4493bbb2e7966a883c Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Aug 2023 13:13:36 +0200 Subject: devlink: un-static devlink_nl_pre/post_doit() To be prepared for the follow-up generated split ops addition, make the functions devlink_nl_pre_doit() and devlink_nl_post_doit() usable outside of netlink.c. Introduce temporary prototypes which are going to be removed once the generated header will be included. Signed-off-by: Jiri Pirko Reviewed-by: Jakub Kicinski Link: https://lore.kernel.org/r/20230803111340.1074067-9-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- net/devlink/devl_internal.h | 4 ++++ net/devlink/netlink.c | 8 ++++---- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/devlink/devl_internal.h b/net/devlink/devl_internal.h index 7d0a9dd1aceb..0befa1869fde 100644 --- a/net/devlink/devl_internal.h +++ b/net/devlink/devl_internal.h @@ -216,6 +216,10 @@ struct devlink_rate * devlink_rate_node_get_from_info(struct devlink *devlink, struct genl_info *info); /* Devlink nl cmds */ +int devlink_nl_pre_doit(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info); +void devlink_nl_post_doit(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info); int devlink_nl_get_doit(struct sk_buff *skb, struct genl_info *info); int devlink_nl_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info); diff --git a/net/devlink/netlink.c b/net/devlink/netlink.c index 624d0db7e229..98d5c6b0acd8 100644 --- a/net/devlink/netlink.c +++ b/net/devlink/netlink.c @@ -109,8 +109,8 @@ devlink_get_from_attrs_lock(struct net *net, struct nlattr **attrs) return ERR_PTR(-ENODEV); } -static int devlink_nl_pre_doit(const struct genl_split_ops *ops, - struct sk_buff *skb, struct genl_info *info) +int devlink_nl_pre_doit(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) { struct devlink_linecard *linecard; struct devlink_port *devlink_port; @@ -167,8 +167,8 @@ unlock: return err; } -static void devlink_nl_post_doit(const struct genl_split_ops *ops, - struct sk_buff *skb, struct genl_info *info) +void devlink_nl_post_doit(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink; -- cgit v1.2.3 From 6b7c486cae8136050df8fca2fbebedb685c49b2d Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Aug 2023 13:13:38 +0200 Subject: devlink: add split ops generated according to spec Improve the existing devlink spec in order to serve as a source for generation of valid devlink split ops 
for the existing commands. Add the generated sources. Note that the policies are narrowed down only to the attributes that are actually parsed. The dont-validate-strict parsing policy makes sure that other possibly passed garbage attributes from userspace are ignored during validation. Signed-off-by: Jiri Pirko Reviewed-by: Jakub Kicinski Link: https://lore.kernel.org/r/20230803111340.1074067-11-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/devlink.yaml | 10 ++++++ net/devlink/Makefile | 2 +- net/devlink/netlink_gen.c | 59 ++++++++++++++++++++++++++++++++ net/devlink/netlink_gen.h | 29 ++++++++++++++++ 4 files changed, 99 insertions(+), 1 deletion(-) create mode 100644 net/devlink/netlink_gen.c create mode 100644 net/devlink/netlink_gen.h (limited to 'net') diff --git a/Documentation/netlink/specs/devlink.yaml b/Documentation/netlink/specs/devlink.yaml index 12699b7ce292..f6df0b3fd502 100644 --- a/Documentation/netlink/specs/devlink.yaml +++ b/Documentation/netlink/specs/devlink.yaml @@ -165,8 +165,13 @@ operations: name: get doc: Get devlink instances. attribute-set: devlink + dont-validate: + - strict + - dump do: + pre: devlink-nl-pre-doit + post: devlink-nl-post-doit request: value: 1 attributes: &dev-id-attrs @@ -189,8 +194,13 @@ operations: name: info-get doc: Get device information, like driver name, hardware and firmware versions etc. attribute-set: devlink + dont-validate: + - strict + - dump do: + pre: devlink-nl-pre-doit + post: devlink-nl-post-doit request: value: 51 attributes: *dev-id-attrs diff --git a/net/devlink/Makefile b/net/devlink/Makefile index ef91a76646a3..a087af581847 100644 --- a/net/devlink/Makefile +++ b/net/devlink/Makefile @@ -1,3 +1,3 @@ # SPDX-License-Identifier: GPL-2.0 -obj-y := leftover.o core.o netlink.o dev.o health.o +obj-y := leftover.o core.o netlink.o netlink_gen.o dev.o health.o diff --git a/net/devlink/netlink_gen.c b/net/devlink/netlink_gen.c new file mode 100644 index 000000000000..32d8cbed0c30 --- /dev/null +++ b/net/devlink/netlink_gen.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/devlink.yaml */ +/* YNL-GEN kernel source */ + +#include +#include + +#include "netlink_gen.h" + +#include + +/* DEVLINK_CMD_GET - do */ +static const struct nla_policy devlink_get_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, +}; + +/* DEVLINK_CMD_INFO_GET - do */ +static const struct nla_policy devlink_info_get_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, +}; + +/* Ops table for devlink */ +const struct genl_split_ops devlink_nl_ops[4] = { + { + .cmd = DEVLINK_CMD_GET, + .validate = GENL_DONT_VALIDATE_STRICT, + .pre_doit = devlink_nl_pre_doit, + .doit = devlink_nl_get_doit, + .post_doit = devlink_nl_post_doit, + .policy = devlink_get_nl_policy, + .maxattr = DEVLINK_ATTR_DEV_NAME, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = DEVLINK_CMD_GET, + .validate = GENL_DONT_VALIDATE_DUMP, + .dumpit = devlink_nl_get_dumpit, + .flags = GENL_CMD_CAP_DUMP, + }, + { + .cmd = DEVLINK_CMD_INFO_GET, + .validate = GENL_DONT_VALIDATE_STRICT, + .pre_doit = devlink_nl_pre_doit, + .doit = devlink_nl_info_get_doit, + .post_doit = devlink_nl_post_doit, + .policy = devlink_info_get_nl_policy, + .maxattr =
DEVLINK_ATTR_DEV_NAME, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = DEVLINK_CMD_INFO_GET, + .validate = GENL_DONT_VALIDATE_DUMP, + .dumpit = devlink_nl_info_get_dumpit, + .flags = GENL_CMD_CAP_DUMP, + }, +}; diff --git a/net/devlink/netlink_gen.h b/net/devlink/netlink_gen.h new file mode 100644 index 000000000000..11980e04a718 --- /dev/null +++ b/net/devlink/netlink_gen.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/devlink.yaml */ +/* YNL-GEN kernel header */ + +#ifndef _LINUX_DEVLINK_GEN_H +#define _LINUX_DEVLINK_GEN_H + +#include +#include + +#include + +/* Ops table for devlink */ +extern const struct genl_split_ops devlink_nl_ops[4]; + +int devlink_nl_pre_doit(const struct genl_split_ops *ops, struct sk_buff *skb, + struct genl_info *info); +void +devlink_nl_post_doit(const struct genl_split_ops *ops, struct sk_buff *skb, + struct genl_info *info); + +int devlink_nl_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); +int devlink_nl_info_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_info_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); + +#endif /* _LINUX_DEVLINK_GEN_H */ -- cgit v1.2.3 From b2551b1517d86ff5e258e7bc834e5be0e1ce08de Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Aug 2023 13:13:39 +0200 Subject: devlink: include the generated netlink header Put the newly added generated header to the include list. Remove the duplicated temporary function prototypes. Signed-off-by: Jiri Pirko Reviewed-by: Jakub Kicinski Link: https://lore.kernel.org/r/20230803111340.1074067-12-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- net/devlink/devl_internal.h | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/devlink/devl_internal.h b/net/devlink/devl_internal.h index 0befa1869fde..51de0e1fc769 100644 --- a/net/devlink/devl_internal.h +++ b/net/devlink/devl_internal.h @@ -12,6 +12,8 @@ #include #include +#include "netlink_gen.h" + #define DEVLINK_REGISTERED XA_MARK_1 #define DEVLINK_RELOAD_STATS_ARRAY_SIZE \ @@ -216,18 +218,9 @@ struct devlink_rate * devlink_rate_node_get_from_info(struct devlink *devlink, struct genl_info *info); /* Devlink nl cmds */ -int devlink_nl_pre_doit(const struct genl_split_ops *ops, - struct sk_buff *skb, struct genl_info *info); -void devlink_nl_post_doit(const struct genl_split_ops *ops, - struct sk_buff *skb, struct genl_info *info); -int devlink_nl_get_doit(struct sk_buff *skb, struct genl_info *info); -int devlink_nl_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_eswitch_get_doit(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb, struct genl_info *info); -int devlink_nl_info_get_doit(struct sk_buff *skb, struct genl_info *info); -int devlink_nl_info_get_dumpit(struct sk_buff *skb, - struct netlink_callback *cb); int devlink_nl_cmd_flash_update(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_selftests_get_doit(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_selftests_run(struct sk_buff *skb, struct genl_info *info); -- cgit v1.2.3 From 6e067d0cab68a64ef3488b5639a66b65e91b146d Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Aug 2023 13:13:40 +0200 
Subject: devlink: use generated split ops and remove duplicated commands from small ops Do the switch and use generated split ops for get and info_get commands. Remove those from small ops array. Signed-off-by: Jiri Pirko Reviewed-by: Jakub Kicinski Link: https://lore.kernel.org/r/20230803111340.1074067-13-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- net/devlink/devl_internal.h | 2 +- net/devlink/leftover.c | 16 +--------------- net/devlink/netlink.c | 2 ++ 3 files changed, 4 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/devlink/devl_internal.h b/net/devlink/devl_internal.h index 51de0e1fc769..7fdd956ff992 100644 --- a/net/devlink/devl_internal.h +++ b/net/devlink/devl_internal.h @@ -124,7 +124,7 @@ struct devlink_cmd { devlink_nl_dump_one_func_t *dump_one; }; -extern const struct genl_small_ops devlink_nl_small_ops[56]; +extern const struct genl_small_ops devlink_nl_small_ops[54]; struct devlink * devlink_get_from_attrs_lock(struct net *net, struct nlattr **attrs); diff --git a/net/devlink/leftover.c b/net/devlink/leftover.c index 895b732bed8e..3bf42f5335ed 100644 --- a/net/devlink/leftover.c +++ b/net/devlink/leftover.c @@ -6278,14 +6278,7 @@ static int devlink_nl_cmd_trap_policer_set_doit(struct sk_buff *skb, return devlink_trap_policer_set(devlink, policer_item, info); } -const struct genl_small_ops devlink_nl_small_ops[56] = { - { - .cmd = DEVLINK_CMD_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_get_doit, - .dumpit = devlink_nl_get_dumpit, - /* can be retrieved by unprivileged users */ - }, +const struct genl_small_ops devlink_nl_small_ops[54] = { { .cmd = DEVLINK_CMD_PORT_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, @@ -6533,13 +6526,6 @@ const struct genl_small_ops devlink_nl_small_ops[56] = { .dumpit = devlink_nl_cmd_region_read_dumpit, .flags = GENL_ADMIN_PERM, }, - { - .cmd = DEVLINK_CMD_INFO_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_info_get_doit, - .dumpit = devlink_nl_info_get_dumpit, - /* can be retrieved by unprivileged users */ - }, { .cmd = DEVLINK_CMD_HEALTH_REPORTER_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, diff --git a/net/devlink/netlink.c b/net/devlink/netlink.c index 98d5c6b0acd8..bada2819827b 100644 --- a/net/devlink/netlink.c +++ b/net/devlink/netlink.c @@ -248,6 +248,8 @@ struct genl_family devlink_nl_family __ro_after_init = { .module = THIS_MODULE, .small_ops = devlink_nl_small_ops, .n_small_ops = ARRAY_SIZE(devlink_nl_small_ops), + .split_ops = devlink_nl_ops, + .n_split_ops = ARRAY_SIZE(devlink_nl_ops), .resv_start_op = DEVLINK_CMD_SELFTESTS_RUN + 1, .mcgrps = devlink_nl_mcgrps, .n_mcgrps = ARRAY_SIZE(devlink_nl_mcgrps), -- cgit v1.2.3 From c4a6b2da4b59d4819e1d35446ca9363166388051 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 3 Aug 2023 13:54:16 +0000 Subject: tcp_metrics: hash table allocation cleanup After commit 098a697b497e ("tcp_metrics: Use a single hash table for all network namespaces.") we can avoid calling tcp_net_metrics_init() for each new netns. Instead, rename tcp_net_metrics_init() to tcp_metrics_hash_alloc(), and move it to __init section. Also move tcpmhash_entries to __initdata section. 
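As a generic reminder of what the init annotations buy (a minimal sketch with placeholder example_* names, not the tcp_metrics code itself): __init functions and __initdata variables are discarded once boot completes, which is fine for values that are only consumed while sizing boot-time tables.

	static unsigned int example_entries __initdata;	/* freed after boot */

	static int __init set_example_entries(char *str)
	{
		if (kstrtouint(str, 0, &example_entries))
			return 0;	/* not handled */
		return 1;		/* handled */
	}
	__setup("example_entries=", set_example_entries);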
Signed-off-by: Eric Dumazet Reviewed-by: Simon Horman Reviewed-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20230803135417.2716879-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_metrics.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 99ac5efe244d..c196759f1d3b 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -990,7 +990,7 @@ static struct genl_family tcp_metrics_nl_family __ro_after_init = { .resv_start_op = TCP_METRICS_CMD_DEL + 1, }; -static unsigned int tcpmhash_entries; +static unsigned int tcpmhash_entries __initdata; static int __init set_tcpmhash_entries(char *str) { ssize_t ret; @@ -1006,15 +1006,11 @@ static int __init set_tcpmhash_entries(char *str) } __setup("tcpmhash_entries=", set_tcpmhash_entries); -static int __net_init tcp_net_metrics_init(struct net *net) +static void __init tcp_metrics_hash_alloc(void) { + unsigned int slots = tcpmhash_entries; size_t size; - unsigned int slots; - if (!net_eq(net, &init_net)) - return 0; - - slots = tcpmhash_entries; if (!slots) { if (totalram_pages() >= 128 * 1024) slots = 16 * 1024; @@ -1027,9 +1023,7 @@ static int __net_init tcp_net_metrics_init(struct net *net) tcp_metrics_hash = kvzalloc(size, GFP_KERNEL); if (!tcp_metrics_hash) - return -ENOMEM; - - return 0; + panic("Could not allocate the tcp_metrics hash table\n"); } static void __net_exit tcp_net_metrics_exit_batch(struct list_head *net_exit_list) @@ -1038,7 +1032,6 @@ static void __net_exit tcp_net_metrics_exit_batch(struct list_head *net_exit_lis } static __net_initdata struct pernet_operations tcp_net_metrics_ops = { - .init = tcp_net_metrics_init, .exit_batch = tcp_net_metrics_exit_batch, }; @@ -1046,9 +1039,11 @@ void __init tcp_metrics_init(void) { int ret; + tcp_metrics_hash_alloc(); + ret = register_pernet_subsys(&tcp_net_metrics_ops); if (ret < 0) - panic("Could not allocate the tcp_metrics hash table\n"); + panic("Could not register tcp_net_metrics_ops\n"); ret = genl_register_family(&tcp_metrics_nl_family); if (ret < 0) -- cgit v1.2.3 From d0f2b7a9ca0a1a89d65ee145815f89b54b7f977f Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 3 Aug 2023 15:45:51 -0700 Subject: tcp: Disable header prediction for MD5 flow. TCP socket saves the minimum required header length in tcp_header_len of struct tcp_sock, and later the value is used in __tcp_fast_path_on() to generate a part of TCP header in tcp_sock(sk)->pred_flags. In tcp_rcv_established(), if the incoming packet has the same pattern with pred_flags, we enter the fast path and skip full option parsing. The MD5 option is parsed in tcp_v[46]_rcv(), so we need not parse it again later in tcp_rcv_established() unless other options exist. We add TCPOLEN_MD5SIG_ALIGNED to tcp_header_len in two paths to avoid the slow path. For passive open connections with MD5, we add TCPOLEN_MD5SIG_ALIGNED to tcp_header_len in tcp_create_openreq_child() after 3WHS. On the other hand, we do it in tcp_connect_init() for active open connections. However, the value is overwritten while processing SYN+ACK or crossed SYN in tcp_rcv_synsent_state_process(). These two cases will have the wrong value in pred_flags and never go into the fast path. We could update tcp_header_len in tcp_rcv_synsent_state_process(), but a test with slightly modified netperf which uses MD5 for each flow shows that the slow path is actually a bit faster than the fast path. 
On c5.4xlarge EC2 instance (16 vCPU, 32 GiB mem) $ for i in {1..10}; do ./super_netperf $(nproc) -H localhost -l 10 -- -m 256 -M 256; done Avg of 10 * 36e68eadd303 : 10.376 Gbps * all fast path : 10.374 Gbps (patch v2, See Link) * all slow path : 10.394 Gbps The header prediction is not worth adding complexity for MD5, so let's disable it for MD5. Link: https://lore.kernel.org/netdev/20230803042214.38309-1-kuniyu@amazon.com/ Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20230803224552.69398-2-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_minisocks.c | 2 -- net/ipv4/tcp_output.c | 5 ----- 2 files changed, 7 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index c8f2aa003387..ddba3b67f7c8 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -570,8 +570,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->tsoffset = treq->ts_off; #ifdef CONFIG_TCP_MD5SIG newtp->md5sig_info = NULL; /*XXX*/ - if (treq->af_specific->req_md5_lookup(sk, req_to_sk(req))) - newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED; #endif if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len) newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 51d8638d4b4c..c5412ee77fc8 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3741,11 +3741,6 @@ static void tcp_connect_init(struct sock *sk) if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps)) tp->tcp_header_len += TCPOLEN_TSTAMP_ALIGNED; -#ifdef CONFIG_TCP_MD5SIG - if (tp->af_specific->md5_lookup(sk, sk)) - tp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED; -#endif - /* If user gave his TCP_MAXSEG, record it to clamp */ if (tp->rx_opt.user_mss) tp->rx_opt.mss_clamp = tp->rx_opt.user_mss; -- cgit v1.2.3 From b205153689326d22e48b22f21d1ae593df9e85d3 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 3 Aug 2023 15:45:52 -0700 Subject: tcp: Update stale comment for MD5 in tcp_parse_options(). Since commit 9ea88a153001 ("tcp: md5: check md5 signature without socket lock"), the MD5 option is checked in tcp_v[46]_rcv(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20230803224552.69398-3-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_input.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 670c3dab24f2..2995802126e4 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4126,9 +4126,8 @@ void tcp_parse_options(const struct net *net, break; #ifdef CONFIG_TCP_MD5SIG case TCPOPT_MD5SIG: - /* - * The MD5 Hash has already been - * checked (see tcp_v{4,6}_do_rcv()). + /* The MD5 Hash has already been + * checked (see tcp_v{4,6}_rcv()). */ break; #endif -- cgit v1.2.3 From d44fd4a767b3755899f8ad1df3e8eca3961ba708 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 4 Aug 2023 14:46:11 +0000 Subject: tcp: set TCP_SYNCNT locklessly icsk->icsk_syn_retries can safely be set without locking the socket. We have to add READ_ONCE() annotations in tcp_fastopen_synack_timer() and tcp_write_timeout(). Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/ipv4/tcp.c | 15 ++++++--------- net/ipv4/tcp_timer.c | 9 ++++++--- 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index aca5620cf3ba..bcbb33a8c152 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3291,9 +3291,7 @@ int tcp_sock_set_syncnt(struct sock *sk, int val) if (val < 1 || val > MAX_TCP_SYNCNT) return -EINVAL; - lock_sock(sk); WRITE_ONCE(inet_csk(sk)->icsk_syn_retries, val); - release_sock(sk); return 0; } EXPORT_SYMBOL(tcp_sock_set_syncnt); @@ -3462,6 +3460,12 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname, if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; + /* Handle options that can be set without locking the socket. */ + switch (optname) { + case TCP_SYNCNT: + return tcp_sock_set_syncnt(sk, val); + } + sockopt_lock_sock(sk); switch (optname) { @@ -3569,13 +3573,6 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname, else WRITE_ONCE(tp->keepalive_probes, val); break; - case TCP_SYNCNT: - if (val < 1 || val > MAX_TCP_SYNCNT) - err = -EINVAL; - else - WRITE_ONCE(icsk->icsk_syn_retries, val); - break; - case TCP_SAVE_SYN: /* 0: disable, 1: enable, 2: start from ether_header */ if (val < 0 || val > 2) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 470f581eedd4..66040ab457d4 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -239,7 +239,8 @@ static int tcp_write_timeout(struct sock *sk) if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { if (icsk->icsk_retransmits) __dst_negative_advice(sk); - retry_until = icsk->icsk_syn_retries ? : + /* Paired with WRITE_ONCE() in tcp_sock_set_syncnt() */ + retry_until = READ_ONCE(icsk->icsk_syn_retries) ? : READ_ONCE(net->ipv4.sysctl_tcp_syn_retries); max_retransmits = retry_until; @@ -421,8 +422,10 @@ static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req) req->rsk_ops->syn_ack_timeout(req); - /* add one more retry for fastopen */ - max_retries = icsk->icsk_syn_retries ? : + /* Add one more retry for fastopen. + * Paired with WRITE_ONCE() in tcp_sock_set_syncnt() + */ + max_retries = READ_ONCE(icsk->icsk_syn_retries) ? : READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_synack_retries) + 1; if (req->num_timeout >= max_retries) { -- cgit v1.2.3 From d58f2e15aa0c07f6f03ec71f64d7697ca43d04a1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 4 Aug 2023 14:46:12 +0000 Subject: tcp: set TCP_USER_TIMEOUT locklessly icsk->icsk_user_timeout can be set locklessly, if all read sides use READ_ONCE(). Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. 
Miller --- include/linux/tcp.h | 2 +- net/ipv4/tcp.c | 23 ++++++++++------------- net/ipv4/tcp_timer.c | 37 ++++++++++++++++++++++--------------- 3 files changed, 33 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index d16abdb3541a..3c5efeeb024f 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -564,6 +564,6 @@ void __tcp_sock_set_nodelay(struct sock *sk, bool on); void tcp_sock_set_nodelay(struct sock *sk); void tcp_sock_set_quickack(struct sock *sk, int val); int tcp_sock_set_syncnt(struct sock *sk, int val); -void tcp_sock_set_user_timeout(struct sock *sk, u32 val); +int tcp_sock_set_user_timeout(struct sock *sk, int val); #endif /* _LINUX_TCP_H */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index bcbb33a8c152..34c2a40b0247 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3296,11 +3296,16 @@ int tcp_sock_set_syncnt(struct sock *sk, int val) } EXPORT_SYMBOL(tcp_sock_set_syncnt); -void tcp_sock_set_user_timeout(struct sock *sk, u32 val) +int tcp_sock_set_user_timeout(struct sock *sk, int val) { - lock_sock(sk); + /* Cap the max time in ms TCP will retry or probe the window + * before giving up and aborting (ETIMEDOUT) a connection. + */ + if (val < 0) + return -EINVAL; + WRITE_ONCE(inet_csk(sk)->icsk_user_timeout, val); - release_sock(sk); + return 0; } EXPORT_SYMBOL(tcp_sock_set_user_timeout); @@ -3464,6 +3469,8 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname, switch (optname) { case TCP_SYNCNT: return tcp_sock_set_syncnt(sk, val); + case TCP_USER_TIMEOUT: + return tcp_sock_set_user_timeout(sk, val); } sockopt_lock_sock(sk); @@ -3611,16 +3618,6 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname, err = tp->af_specific->md5_parse(sk, optname, optval, optlen); break; #endif - case TCP_USER_TIMEOUT: - /* Cap the max time in ms TCP will retry or probe the window - * before giving up and aborting (ETIMEDOUT) a connection. 
- */ - if (val < 0) - err = -EINVAL; - else - WRITE_ONCE(icsk->icsk_user_timeout, val); - break; - case TCP_FASTOPEN: if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 66040ab457d4..f99e2d06ae7c 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -26,14 +26,15 @@ static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); - u32 elapsed, start_ts; + u32 elapsed, start_ts, user_timeout; s32 remaining; start_ts = tcp_sk(sk)->retrans_stamp; - if (!icsk->icsk_user_timeout) + user_timeout = READ_ONCE(icsk->icsk_user_timeout); + if (!user_timeout) return icsk->icsk_rto; elapsed = tcp_time_stamp(tcp_sk(sk)) - start_ts; - remaining = icsk->icsk_user_timeout - elapsed; + remaining = user_timeout - elapsed; if (remaining <= 0) return 1; /* user timeout has passed; fire ASAP */ @@ -43,16 +44,17 @@ static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk) u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when) { struct inet_connection_sock *icsk = inet_csk(sk); - u32 remaining; + u32 remaining, user_timeout; s32 elapsed; - if (!icsk->icsk_user_timeout || !icsk->icsk_probes_tstamp) + user_timeout = READ_ONCE(icsk->icsk_user_timeout); + if (!user_timeout || !icsk->icsk_probes_tstamp) return when; elapsed = tcp_jiffies32 - icsk->icsk_probes_tstamp; if (unlikely(elapsed < 0)) elapsed = 0; - remaining = msecs_to_jiffies(icsk->icsk_user_timeout) - elapsed; + remaining = msecs_to_jiffies(user_timeout) - elapsed; remaining = max_t(u32, remaining, TCP_TIMEOUT_MIN); return min_t(u32, remaining, when); @@ -270,7 +272,7 @@ static int tcp_write_timeout(struct sock *sk) } if (!expired) expired = retransmits_timed_out(sk, retry_until, - icsk->icsk_user_timeout); + READ_ONCE(icsk->icsk_user_timeout)); tcp_fastopen_active_detect_blackhole(sk, expired); if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RTO_CB_FLAG)) @@ -384,13 +386,16 @@ static void tcp_probe_timer(struct sock *sk) * corresponding system limit. We also implement similar policy when * we use RTO to probe window in tcp_retransmit_timer(). */ - if (!icsk->icsk_probes_tstamp) + if (!icsk->icsk_probes_tstamp) { icsk->icsk_probes_tstamp = tcp_jiffies32; - else if (icsk->icsk_user_timeout && - (s32)(tcp_jiffies32 - icsk->icsk_probes_tstamp) >= - msecs_to_jiffies(icsk->icsk_user_timeout)) - goto abort; + } else { + u32 user_timeout = READ_ONCE(icsk->icsk_user_timeout); + if (user_timeout && + (s32)(tcp_jiffies32 - icsk->icsk_probes_tstamp) >= + msecs_to_jiffies(user_timeout)) + goto abort; + } max_probes = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_retries2); if (sock_flag(sk, SOCK_DEAD)) { const bool alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX; @@ -734,13 +739,15 @@ static void tcp_keepalive_timer (struct timer_list *t) elapsed = keepalive_time_elapsed(tp); if (elapsed >= keepalive_time_when(tp)) { + u32 user_timeout = READ_ONCE(icsk->icsk_user_timeout); + /* If the TCP_USER_TIMEOUT option is enabled, use that * to determine when to timeout instead. 
*/ - if ((icsk->icsk_user_timeout != 0 && - elapsed >= msecs_to_jiffies(icsk->icsk_user_timeout) && + if ((user_timeout != 0 && + elapsed >= msecs_to_jiffies(user_timeout) && icsk->icsk_probes_out > 0) || - (icsk->icsk_user_timeout == 0 && + (user_timeout == 0 && icsk->icsk_probes_out >= keepalive_probes(tp))) { tcp_send_active_reset(sk, GFP_ATOMIC); tcp_write_err(sk); -- cgit v1.2.3 From 6fd70a6b4e6f58482a43da46d12323795bbc5f68 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 4 Aug 2023 14:46:13 +0000 Subject: tcp: set TCP_KEEPINTVL locklessly tp->keepalive_intvl can be set locklessly, readers are already taking care of this field being potentially set by other threads. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 34c2a40b0247..75d6359ee575 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3348,9 +3348,7 @@ int tcp_sock_set_keepintvl(struct sock *sk, int val) if (val < 1 || val > MAX_TCP_KEEPINTVL) return -EINVAL; - lock_sock(sk); WRITE_ONCE(tcp_sk(sk)->keepalive_intvl, val * HZ); - release_sock(sk); return 0; } EXPORT_SYMBOL(tcp_sock_set_keepintvl); @@ -3471,6 +3469,8 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname, return tcp_sock_set_syncnt(sk, val); case TCP_USER_TIMEOUT: return tcp_sock_set_user_timeout(sk, val); + case TCP_KEEPINTVL: + return tcp_sock_set_keepintvl(sk, val); } sockopt_lock_sock(sk); @@ -3568,12 +3568,6 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname, case TCP_KEEPIDLE: err = tcp_sock_set_keepidle_locked(sk, val); break; - case TCP_KEEPINTVL: - if (val < 1 || val > MAX_TCP_KEEPINTVL) - err = -EINVAL; - else - WRITE_ONCE(tp->keepalive_intvl, val * HZ); - break; case TCP_KEEPCNT: if (val < 1 || val > MAX_TCP_KEEPCNT) err = -EINVAL; -- cgit v1.2.3 From 84485080cbc1e5a011e7549966739df4cec158b1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 4 Aug 2023 14:46:14 +0000 Subject: tcp: set TCP_KEEPCNT locklessly tp->keepalive_probes can be set locklessly, readers are already taking care of this field being potentially set by other threads. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. 
Miller --- net/ipv4/tcp.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 75d6359ee575..e74a9593283c 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3358,10 +3358,8 @@ int tcp_sock_set_keepcnt(struct sock *sk, int val) if (val < 1 || val > MAX_TCP_KEEPCNT) return -EINVAL; - lock_sock(sk); /* Paired with READ_ONCE() in keepalive_probes() */ WRITE_ONCE(tcp_sk(sk)->keepalive_probes, val); - release_sock(sk); return 0; } EXPORT_SYMBOL(tcp_sock_set_keepcnt); @@ -3471,6 +3469,8 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname, return tcp_sock_set_user_timeout(sk, val); case TCP_KEEPINTVL: return tcp_sock_set_keepintvl(sk, val); + case TCP_KEEPCNT: + return tcp_sock_set_keepcnt(sk, val); } sockopt_lock_sock(sk); @@ -3568,12 +3568,6 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname, case TCP_KEEPIDLE: err = tcp_sock_set_keepidle_locked(sk, val); break; - case TCP_KEEPCNT: - if (val < 1 || val > MAX_TCP_KEEPCNT) - err = -EINVAL; - else - WRITE_ONCE(tp->keepalive_probes, val); - break; case TCP_SAVE_SYN: /* 0: disable, 1: enable, 2: start from ether_header */ if (val < 0 || val > 2) -- cgit v1.2.3 From a81722ddd7e4d76c9bbff078d29416e18c6d7f71 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 4 Aug 2023 14:46:15 +0000 Subject: tcp: set TCP_LINGER2 locklessly tp->linger2 can be set locklessly as long as readers use READ_ONCE(). Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 19 +++++++++---------- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_timer.c | 2 +- 3 files changed, 11 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e74a9593283c..5c71b4fe11d1 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2865,7 +2865,7 @@ adjudge_to_death: if (sk->sk_state == TCP_FIN_WAIT2) { struct tcp_sock *tp = tcp_sk(sk); - if (tp->linger2 < 0) { + if (READ_ONCE(tp->linger2) < 0) { tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, GFP_ATOMIC); __NET_INC_STATS(sock_net(sk), @@ -3471,6 +3471,14 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname, return tcp_sock_set_keepintvl(sk, val); case TCP_KEEPCNT: return tcp_sock_set_keepcnt(sk, val); + case TCP_LINGER2: + if (val < 0) + WRITE_ONCE(tp->linger2, -1); + else if (val > TCP_FIN_TIMEOUT_MAX / HZ) + WRITE_ONCE(tp->linger2, TCP_FIN_TIMEOUT_MAX); + else + WRITE_ONCE(tp->linger2, val * HZ); + return 0; } sockopt_lock_sock(sk); @@ -3576,15 +3584,6 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname, tp->save_syn = val; break; - case TCP_LINGER2: - if (val < 0) - WRITE_ONCE(tp->linger2, -1); - else if (val > TCP_FIN_TIMEOUT_MAX / HZ) - WRITE_ONCE(tp->linger2, TCP_FIN_TIMEOUT_MAX); - else - WRITE_ONCE(tp->linger2, val * HZ); - break; - case TCP_DEFER_ACCEPT: /* Translate value in seconds to number of retransmits */ WRITE_ONCE(icsk->icsk_accept_queue.rskq_defer_accept, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2995802126e4..deac708250e6 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6624,7 +6624,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) break; } - if (tp->linger2 < 0) { + if (READ_ONCE(tp->linger2) < 0) { tcp_done(sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA); return 1; diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index f99e2d06ae7c..d45c96c7f5a4 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -714,7 
+714,7 @@ static void tcp_keepalive_timer (struct timer_list *t) tcp_mstamp_refresh(tp); if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) { - if (tp->linger2 >= 0) { + if (READ_ONCE(tp->linger2) >= 0) { const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN; if (tmo > 0) { -- cgit v1.2.3 From 6e97ba552b8d3dd074a28b8600740b8bed42267b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 4 Aug 2023 14:46:16 +0000 Subject: tcp: set TCP_DEFER_ACCEPT locklessly rskq_defer_accept field can be read/written without the need of holding the socket lock. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 13 ++++++------- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_minisocks.c | 2 +- 3 files changed, 8 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5c71b4fe11d1..4fbc7ff8c53c 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3479,6 +3479,12 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname, else WRITE_ONCE(tp->linger2, val * HZ); return 0; + case TCP_DEFER_ACCEPT: + /* Translate value in seconds to number of retransmits */ + WRITE_ONCE(icsk->icsk_accept_queue.rskq_defer_accept, + secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ, + TCP_RTO_MAX / HZ)); + return 0; } sockopt_lock_sock(sk); @@ -3584,13 +3590,6 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname, tp->save_syn = val; break; - case TCP_DEFER_ACCEPT: - /* Translate value in seconds to number of retransmits */ - WRITE_ONCE(icsk->icsk_accept_queue.rskq_defer_accept, - secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ, - TCP_RTO_MAX / HZ)); - break; - case TCP_WINDOW_CLAMP: err = tcp_set_window_clamp(sk, val); break; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index deac708250e6..8e96ebe373d7 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6324,7 +6324,7 @@ consume: if (fastopen_fail) return -1; if (sk->sk_write_pending || - icsk->icsk_accept_queue.rskq_defer_accept || + READ_ONCE(icsk->icsk_accept_queue.rskq_defer_accept) || inet_csk_in_pingpong_mode(sk)) { /* Save one ACK. Data will be ready after * several ticks, if write_pending is set. diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index ddba3b67f7c8..13ee12983c42 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -792,7 +792,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, return sk; /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */ - if (req->num_timeout < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && + if (req->num_timeout < READ_ONCE(inet_csk(sk)->icsk_accept_queue.rskq_defer_accept) && TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { inet_rsk(req)->acked = 1; __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP); -- cgit v1.2.3 From c35e927cbe09d38b2d72183bb215901183927c68 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 4 Aug 2023 16:49:39 +0300 Subject: net: omit ndo_hwtstamp_get() call when possible in dev_set_hwtstamp_phylib() Setting dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS is only legal for drivers which were converted to ndo_hwtstamp_get() and ndo_hwtstamp_set(), and it is only there that we call ndo_hwtstamp_set() for a request that otherwise goes to phylib (for stuff like packet traps, which need to be undone if phylib failed, hence the old_cfg logic). 
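
To make the three-way split described below easier to follow, here is a condensed sketch of the control flow that dev_set_hwtstamp_phylib() ends up with after this patch. It is only an illustration: phy_ts is true when the request targets the PHY (phylib), see_all is shorthand for the IFF_SEE_ALL_HWTSTAMP_REQUESTS test, and the extack error reporting plus the phylib call that follows are trimmed; the authoritative version is the hunk at the end of this patch.

	/* Condensed sketch of dev_set_hwtstamp_phylib() after the change. */
	bool see_all = dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS;

	/* 1) Snapshot the old config only when it can be needed later,
	 *    i.e. for phylib requests a converted driver asked to see. */
	if (phy_ts && see_all) {
		err = ops->ndo_hwtstamp_get(dev, &old_cfg);
		if (err)
			return err;
	}

	/* 2) Call the netdev driver when it is the target of the request,
	 *    or when it wants to see phylib-bound requests as well. */
	if (!phy_ts || see_all) {
		err = ops->ndo_hwtstamp_set(dev, cfg, extack);
		if (err)
			return err;	/* extack message printing trimmed */
	}

	/* 3) Only the phylib + see_all case compares old vs. new, so a later
	 *    phylib failure knows whether driver state must be undone. */
	if (phy_ts && see_all)
		changed = kernel_hwtstamp_config_changed(&old_cfg, cfg);
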
The problem is that we end up calling ndo_hwtstamp_get() when we don't need to (even if the SIOCSHWTSTAMP wasn't intended for phylib, or if it was, but the driver didn't set IFF_SEE_ALL_HWTSTAMP_REQUESTS). For those unnecessary conditions, we share a code path with virtual drivers (vlan, macvlan, bonding) where ndo_hwtstamp_get() is implemented as generic_hwtstamp_get_lower(), and may be resolved through generic_hwtstamp_ioctl_lower() if the lower device is unconverted. I.e. this situation: $ ip link add link eno0 name eno0.100 type vlan id 100 $ hwstamp_ctl -i eno0.100 -t 1 We are unprepared to deal with this, because if ndo_hwtstamp_get() is resolved through a legacy ndo_eth_ioctl(SIOCGHWTSTAMP) lower_dev implementation, that needs a non-NULL old_cfg.ifr pointer, and we don't have it. But we don't even need to deal with it either. In the general case, drivers may not even implement SIOCGHWTSTAMP handling, only SIOCSHWTSTAMP, so it makes sense to completely avoid a SIOCGHWTSTAMP call if we can. The solution is to split the single "if" condition into 3 smaller ones, thus separating the decision to call ndo_hwtstamp_get() from the decision to call ndo_hwtstamp_set(). The third "if" condition is identical to the first one, and both are subsets of the second one. Thus, the "cfg" argument of kernel_hwtstamp_config_changed() is always valid. Reported-by: Eric Dumazet Closes: https://lore.kernel.org/netdev/CANn89iLOspJsvjPj+y8jikg7erXDomWe8sqHMdfL_2LQSFrPAg@mail.gmail.com/ Fixes: fd770e856e22 ("net: remove phy_has_hwtstamp() -> phy_mii_ioctl() decision from converted drivers") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- net/core/dev_ioctl.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 72e077022348..b46aedc36939 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -334,20 +334,23 @@ static int dev_set_hwtstamp_phylib(struct net_device *dev, cfg->source = phy_ts ? HWTSTAMP_SOURCE_PHYLIB : HWTSTAMP_SOURCE_NETDEV; - if (!phy_ts || (dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS)) { + if (phy_ts && (dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS)) { err = ops->ndo_hwtstamp_get(dev, &old_cfg); if (err) return err; + } + if (!phy_ts || (dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS)) { err = ops->ndo_hwtstamp_set(dev, cfg, extack); if (err) { if (extack->_msg) netdev_err(dev, "%s\n", extack->_msg); return err; } + } + if (phy_ts && (dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS)) changed = kernel_hwtstamp_config_changed(&old_cfg, cfg); - } if (phy_ts) { err = phy_hwtstamp_set(dev->phydev, cfg, extack); -- cgit v1.2.3 From f6ecb68b38a5c23f20160dd49718d1a9d395a86d Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Sat, 5 Aug 2023 18:48:11 +0800 Subject: net/tls: Remove unused function declarations Commit 3c4d7559159b ("tls: kernel TLS support") declared but never implemented these functions. Signed-off-by: Yue Haibing Signed-off-by: David S. 
Miller --- net/tls/tls.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'net') diff --git a/net/tls/tls.h b/net/tls/tls.h index 37539ac3ac2a..164d6a955e26 100644 --- a/net/tls/tls.h +++ b/net/tls/tls.h @@ -87,10 +87,6 @@ void tls_ctx_free(struct sock *sk, struct tls_context *ctx); void update_sk_prot(struct sock *sk, struct tls_context *ctx); int wait_on_pending_writer(struct sock *sk, long *timeo); -int tls_sk_query(struct sock *sk, int optname, char __user *optval, - int __user *optlen); -int tls_sk_attach(struct sock *sk, int optname, char __user *optval, - unsigned int optlen); void tls_err_abort(struct sock *sk, int err); int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx); -- cgit v1.2.3 From a9ca9f9ceff382b58b488248f0c0da9e157f5d06 Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Fri, 4 Aug 2023 20:05:24 +0200 Subject: page_pool: split types and declarations from page_pool.h Split types and pure function declarations from page_pool.h and add them in page_page/types.h, so that C sources can include page_pool.h and headers should generally only include page_pool/types.h as suggested by jakub. Rename page_pool.h to page_pool/helpers.h to have both in one place. Signed-off-by: Yunsheng Lin Suggested-by: Jakub Kicinski Signed-off-by: Alexander Lobakin Reviewed-by: Alexander Duyck Link: https://lore.kernel.org/r/20230804180529.2483231-2-aleksander.lobakin@intel.com [Jakub: change microsoft/mana, fix kdoc paths in Documentation] Signed-off-by: Jakub Kicinski --- Documentation/networking/page_pool.rst | 6 +- MAINTAINERS | 2 +- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 +- drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c | 2 +- drivers/net/ethernet/engleder/tsnep_main.c | 1 + drivers/net/ethernet/freescale/fec_main.c | 1 + drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 1 + drivers/net/ethernet/hisilicon/hns3/hns3_enet.h | 2 +- drivers/net/ethernet/marvell/mvneta.c | 2 +- drivers/net/ethernet/marvell/mvpp2/mvpp2.h | 2 +- drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 1 + .../ethernet/marvell/octeontx2/nic/otx2_common.c | 1 + .../net/ethernet/marvell/octeontx2/nic/otx2_pf.c | 1 + drivers/net/ethernet/mediatek/mtk_eth_soc.c | 1 + drivers/net/ethernet/mediatek/mtk_eth_soc.h | 2 +- .../net/ethernet/mellanox/mlx5/core/en/params.c | 1 + drivers/net/ethernet/mellanox/mlx5/core/en/trap.c | 1 - drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c | 1 + drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en_stats.c | 2 +- .../net/ethernet/microchip/lan966x/lan966x_fdma.c | 1 + .../net/ethernet/microchip/lan966x/lan966x_main.h | 2 +- drivers/net/ethernet/microsoft/mana/mana_en.c | 1 + drivers/net/ethernet/socionext/netsec.c | 2 +- drivers/net/ethernet/stmicro/stmmac/stmmac.h | 2 +- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 1 + drivers/net/ethernet/ti/cpsw.c | 2 +- drivers/net/ethernet/ti/cpsw_new.c | 2 +- drivers/net/ethernet/ti/cpsw_priv.c | 2 +- drivers/net/ethernet/wangxun/libwx/wx_lib.c | 2 +- drivers/net/veth.c | 2 +- drivers/net/wireless/mediatek/mt76/mac80211.c | 1 - drivers/net/wireless/mediatek/mt76/mt76.h | 1 + drivers/net/xen-netfront.c | 2 +- include/linux/skbuff.h | 2 +- include/net/page_pool.h | 470 --------------------- include/net/page_pool/helpers.h | 238 +++++++++++ include/net/page_pool/types.h | 238 +++++++++++ include/trace/events/page_pool.h | 2 +- net/bpf/test_run.c | 2 +- net/core/page_pool.c | 2 +- net/core/skbuff.c | 2 +- net/core/xdp.c | 2 
+- 44 files changed, 517 insertions(+), 500 deletions(-) delete mode 100644 include/net/page_pool.h create mode 100644 include/net/page_pool/helpers.h create mode 100644 include/net/page_pool/types.h (limited to 'net') diff --git a/Documentation/networking/page_pool.rst b/Documentation/networking/page_pool.rst index 53b5448cc0f1..68b82cea13e4 100644 --- a/Documentation/networking/page_pool.rst +++ b/Documentation/networking/page_pool.rst @@ -67,10 +67,10 @@ a page will cause no race conditions is enough. .. kernel-doc:: net/core/page_pool.c :identifiers: page_pool_create -.. kernel-doc:: include/net/page_pool.h +.. kernel-doc:: include/net/page_pool/types.h :identifiers: struct page_pool_params -.. kernel-doc:: include/net/page_pool.h +.. kernel-doc:: include/net/page_pool/helpers.h :identifiers: page_pool_put_page page_pool_put_full_page page_pool_recycle_direct page_pool_dev_alloc_pages page_pool_get_dma_addr page_pool_get_dma_dir @@ -122,7 +122,7 @@ page_pool_stats allocated by the caller. The API will fill in the provided struct page_pool_stats with statistics about the page_pool. -.. kernel-doc:: include/net/page_pool.h +.. kernel-doc:: include/net/page_pool/types.h :identifiers: struct page_pool_recycle_stats struct page_pool_alloc_stats struct page_pool_stats diff --git a/MAINTAINERS b/MAINTAINERS index 5e2bb1059ab6..08bcf3a7c482 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16020,7 +16020,7 @@ M: Ilias Apalodimas L: netdev@vger.kernel.org S: Supported F: Documentation/networking/page_pool.rst -F: include/net/page_pool.h +F: include/net/page_pool/ F: include/trace/events/page_pool.h F: net/core/page_pool.c diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 6a643aae7802..eb168ca983b7 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -54,7 +54,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c index 2ce46d7affe4..96f5ca778c67 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include "bnxt_hsi.h" #include "bnxt.h" #include "bnxt_xdp.h" diff --git a/drivers/net/ethernet/engleder/tsnep_main.c b/drivers/net/ethernet/engleder/tsnep_main.c index 079f9f6ae21a..f61bd89734c5 100644 --- a/drivers/net/ethernet/engleder/tsnep_main.c +++ b/drivers/net/ethernet/engleder/tsnep_main.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #define TSNEP_RX_OFFSET (max(NET_SKB_PAD, XDP_PACKET_HEADROOM) + NET_IP_ALIGN) diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index 43f14cec91e9..3bd0bf03aedb 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index 9f6890059666..e5e37a33fd81 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h index 88af34bbee34..acd756b0c7c9 100644 --- 
a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h @@ -6,7 +6,7 @@ #include #include -#include +#include #include #include "hnae3.h" diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index acf4f6ba73a6..d483b8c00ec0 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -37,7 +37,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2.h b/drivers/net/ethernet/marvell/mvpp2/mvpp2.h index 11e603686a27..e809f91c08fb 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2.h +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2.h @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index 9e1b596c8f08..eb74ccddb440 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c index 8cdd92dd9762..8336cea16aff 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c @@ -7,6 +7,7 @@ #include #include +#include #include #include diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c index 61f62a6ec662..70b9065f7d10 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "otx2_reg.h" #include "otx2_common.h" diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c index 1b89f800f6df..fe05c9020269 100644 --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "mtk_eth_soc.h" #include "mtk_wed.h" diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.h b/drivers/net/ethernet/mediatek/mtk_eth_soc.h index 80d17729e557..4a2470fbad2c 100644 --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.h +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.h @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include "mtk_ppe.h" diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c index 5ce28ff7685f..e097f336e1c4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c @@ -6,6 +6,7 @@ #include "en/port.h" #include "en_accel/en_accel.h" #include "en_accel/ipsec.h" +#include #include static u8 mlx5e_mpwrq_min_page_shift(struct mlx5_core_dev *mdev) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/trap.c b/drivers/net/ethernet/mellanox/mlx5/core/en/trap.c index 201ac7dd338f..698647cc8c0f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/trap.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/trap.c @@ -1,7 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* Copyright (c) 2020 Mellanox Technologies */ -#include #include "en/txrx.h" #include "en/params.h" #include "en/trap.h" diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c index 40589cebb773..12f56d0db0af 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c @@ -35,6 +35,7 @@ #include "en/xdp.h" #include "en/params.h" #include +#include int mlx5e_xdp_max_mtu(struct mlx5e_params *params, struct mlx5e_xsk_param *xsk) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 1c820119e438..c8ec6467d4d1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -38,7 +38,7 @@ #include #include #include -#include +#include #include #include #include "eswitch.h" diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index f7bb5f4aaaca..3fd11b0761e0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -36,7 +36,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c index 4d77055abd4b..07b84d668fcc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c @@ -38,7 +38,7 @@ #include "en/port.h" #ifdef CONFIG_PAGE_POOL_STATS -#include +#include #endif static unsigned int stats_grps_num(struct mlx5e_priv *priv) diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_fdma.c b/drivers/net/ethernet/microchip/lan966x/lan966x_fdma.c index bd72fbc2220f..3960534ac2ad 100644 --- a/drivers/net/ethernet/microchip/lan966x/lan966x_fdma.c +++ b/drivers/net/ethernet/microchip/lan966x/lan966x_fdma.c @@ -2,6 +2,7 @@ #include #include +#include #include "lan966x_main.h" diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_main.h b/drivers/net/ethernet/microchip/lan966x/lan966x_main.h index aebc9154693a..caa9e0533c96 100644 --- a/drivers/net/ethernet/microchip/lan966x/lan966x_main.h +++ b/drivers/net/ethernet/microchip/lan966x/lan966x_main.h @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index a08023c57e25..31e2f2c74e15 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -11,6 +11,7 @@ #include #include +#include #include #include diff --git a/drivers/net/ethernet/socionext/netsec.c b/drivers/net/ethernet/socionext/netsec.c index 0dcd6a568b06..f358ea003193 100644 --- a/drivers/net/ethernet/socionext/netsec.c +++ b/drivers/net/ethernet/socionext/netsec.c @@ -15,7 +15,7 @@ #include #include -#include +#include #include #define NETSEC_REG_SOFT_RST 0x104 diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h index a6d034968654..3401e888a9f6 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 99aa5360b3ff..fcab363d8dfa 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include 
#include "stmmac_ptp.h" diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index f9cd566d1c9b..ca4d4548f85e 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/net/ethernet/ti/cpsw_new.c b/drivers/net/ethernet/ti/cpsw_new.c index c61e4e44a78f..0e4f526b1753 100644 --- a/drivers/net/ethernet/ti/cpsw_new.c +++ b/drivers/net/ethernet/ti/cpsw_new.c @@ -30,7 +30,7 @@ #include #include -#include +#include #include #include diff --git a/drivers/net/ethernet/ti/cpsw_priv.c b/drivers/net/ethernet/ti/cpsw_priv.c index ae52cdbcf8cc..0ec85635dfd6 100644 --- a/drivers/net/ethernet/ti/cpsw_priv.c +++ b/drivers/net/ethernet/ti/cpsw_priv.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/net/ethernet/wangxun/libwx/wx_lib.c b/drivers/net/ethernet/wangxun/libwx/wx_lib.c index 2c3f08be8c37..e04d4a5eed7b 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_lib.c +++ b/drivers/net/ethernet/wangxun/libwx/wx_lib.c @@ -3,7 +3,7 @@ #include #include -#include +#include #include #include #include diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 614f3e3efab0..953f6d8f8db0 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #define DRV_NAME "veth" #define DRV_VERSION "1.0" diff --git a/drivers/net/wireless/mediatek/mt76/mac80211.c b/drivers/net/wireless/mediatek/mt76/mac80211.c index c0ff36a98bed..d158320bc15d 100644 --- a/drivers/net/wireless/mediatek/mt76/mac80211.c +++ b/drivers/net/wireless/mediatek/mt76/mac80211.c @@ -4,7 +4,6 @@ */ #include #include -#include #include "mt76.h" #define CHAN2G(_idx, _freq) { \ diff --git a/drivers/net/wireless/mediatek/mt76/mt76.h b/drivers/net/wireless/mediatek/mt76/mt76.h index 878087257ea7..e8757865a3d0 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76.h +++ b/drivers/net/wireless/mediatek/mt76/mt76.h @@ -15,6 +15,7 @@ #include #include #include +#include #include "util.h" #include "testmode.h" diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index 47d54d8ea59d..ad29f370034e 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -45,7 +45,7 @@ #include #include #include -#include +#include #include #include diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 16a49ba534e4..888e3d7e74c1 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -32,7 +32,7 @@ #include #include #include -#include +#include #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include #endif diff --git a/include/net/page_pool.h b/include/net/page_pool.h deleted file mode 100644 index 73d4f786418d..000000000000 --- a/include/net/page_pool.h +++ /dev/null @@ -1,470 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 - * - * page_pool.h - * Author: Jesper Dangaard Brouer - * Copyright (C) 2016 Red Hat, Inc. - */ - -/** - * DOC: page_pool allocator - * - * This page_pool allocator is optimized for the XDP mode that - * uses one-frame-per-page, but have fallbacks that act like the - * regular page allocator APIs. - * - * Basic use involve replacing alloc_pages() calls with the - * page_pool_alloc_pages() call. Drivers should likely use - * page_pool_dev_alloc_pages() replacing dev_alloc_pages(). - * - * API keeps track of in-flight pages, in-order to let API user know - * when it is safe to dealloactor page_pool object. 
Thus, API users - * must call page_pool_put_page() where appropriate and only attach - * the page to a page_pool-aware objects, like skbs marked for recycling. - * - * API user must only call page_pool_put_page() once on a page, as it - * will either recycle the page, or in case of elevated refcnt, it - * will release the DMA mapping and in-flight state accounting. We - * hope to lift this requirement in the future. - */ -#ifndef _NET_PAGE_POOL_H -#define _NET_PAGE_POOL_H - -#include /* Needed by ptr_ring */ -#include -#include - -#define PP_FLAG_DMA_MAP BIT(0) /* Should page_pool do the DMA - * map/unmap - */ -#define PP_FLAG_DMA_SYNC_DEV BIT(1) /* If set all pages that the driver gets - * from page_pool will be - * DMA-synced-for-device according to - * the length provided by the device - * driver. - * Please note DMA-sync-for-CPU is still - * device driver responsibility - */ -#define PP_FLAG_PAGE_FRAG BIT(2) /* for page frag feature */ -#define PP_FLAG_ALL (PP_FLAG_DMA_MAP |\ - PP_FLAG_DMA_SYNC_DEV |\ - PP_FLAG_PAGE_FRAG) - -/* - * Fast allocation side cache array/stack - * - * The cache size and refill watermark is related to the network - * use-case. The NAPI budget is 64 packets. After a NAPI poll the RX - * ring is usually refilled and the max consumed elements will be 64, - * thus a natural max size of objects needed in the cache. - * - * Keeping room for more objects, is due to XDP_DROP use-case. As - * XDP_DROP allows the opportunity to recycle objects directly into - * this array, as it shares the same softirq/NAPI protection. If - * cache is already full (or partly full) then the XDP_DROP recycles - * would have to take a slower code path. - */ -#define PP_ALLOC_CACHE_SIZE 128 -#define PP_ALLOC_CACHE_REFILL 64 -struct pp_alloc_cache { - u32 count; - struct page *cache[PP_ALLOC_CACHE_SIZE]; -}; - -/** - * struct page_pool_params - page pool parameters - * @flags: PP_FLAG_DMA_MAP, PP_FLAG_DMA_SYNC_DEV, PP_FLAG_PAGE_FRAG - * @order: 2^order pages on allocation - * @pool_size: size of the ptr_ring - * @nid: NUMA node id to allocate from pages from - * @dev: device, for DMA pre-mapping purposes - * @napi: NAPI which is the sole consumer of pages, otherwise NULL - * @dma_dir: DMA mapping direction - * @max_len: max DMA sync memory size for PP_FLAG_DMA_SYNC_DEV - * @offset: DMA sync address offset for PP_FLAG_DMA_SYNC_DEV - */ -struct page_pool_params { - unsigned int flags; - unsigned int order; - unsigned int pool_size; - int nid; - struct device *dev; - struct napi_struct *napi; - enum dma_data_direction dma_dir; - unsigned int max_len; - unsigned int offset; -/* private: used by test code only */ - void (*init_callback)(struct page *page, void *arg); - void *init_arg; -}; - -#ifdef CONFIG_PAGE_POOL_STATS -/** - * struct page_pool_alloc_stats - allocation statistics - * @fast: successful fast path allocations - * @slow: slow path order-0 allocations - * @slow_high_order: slow path high order allocations - * @empty: ptr ring is empty, so a slow path allocation was forced - * @refill: an allocation which triggered a refill of the cache - * @waive: pages obtained from the ptr ring that cannot be added to - * the cache due to a NUMA mismatch - */ -struct page_pool_alloc_stats { - u64 fast; - u64 slow; - u64 slow_high_order; - u64 empty; - u64 refill; - u64 waive; -}; - -/** - * struct page_pool_recycle_stats - recycling (freeing) statistics - * @cached: recycling placed page in the page pool cache - * @cache_full: page pool cache was full - * @ring: page placed into the ptr ring - * 
@ring_full: page released from page pool because the ptr ring was full - * @released_refcnt: page released (and not recycled) because refcnt > 1 - */ -struct page_pool_recycle_stats { - u64 cached; - u64 cache_full; - u64 ring; - u64 ring_full; - u64 released_refcnt; -}; - -/** - * struct page_pool_stats - combined page pool use statistics - * @alloc_stats: see struct page_pool_alloc_stats - * @recycle_stats: see struct page_pool_recycle_stats - * - * Wrapper struct for combining page pool stats with different storage - * requirements. - */ -struct page_pool_stats { - struct page_pool_alloc_stats alloc_stats; - struct page_pool_recycle_stats recycle_stats; -}; - -int page_pool_ethtool_stats_get_count(void); -u8 *page_pool_ethtool_stats_get_strings(u8 *data); -u64 *page_pool_ethtool_stats_get(u64 *data, void *stats); - -/* - * Drivers that wish to harvest page pool stats and report them to users - * (perhaps via ethtool, debugfs, or another mechanism) can allocate a - * struct page_pool_stats call page_pool_get_stats to get stats for the specified pool. - */ -bool page_pool_get_stats(struct page_pool *pool, - struct page_pool_stats *stats); -#else - -static inline int page_pool_ethtool_stats_get_count(void) -{ - return 0; -} - -static inline u8 *page_pool_ethtool_stats_get_strings(u8 *data) -{ - return data; -} - -static inline u64 *page_pool_ethtool_stats_get(u64 *data, void *stats) -{ - return data; -} - -#endif - -struct page_pool { - struct page_pool_params p; - - struct delayed_work release_dw; - void (*disconnect)(void *); - unsigned long defer_start; - unsigned long defer_warn; - - u32 pages_state_hold_cnt; - unsigned int frag_offset; - struct page *frag_page; - long frag_users; - -#ifdef CONFIG_PAGE_POOL_STATS - /* these stats are incremented while in softirq context */ - struct page_pool_alloc_stats alloc_stats; -#endif - u32 xdp_mem_id; - - /* - * Data structure for allocation side - * - * Drivers allocation side usually already perform some kind - * of resource protection. Piggyback on this protection, and - * require driver to protect allocation side. - * - * For NIC drivers this means, allocate a page_pool per - * RX-queue. As the RX-queue is already protected by - * Softirq/BH scheduling and napi_schedule. NAPI schedule - * guarantee that a single napi_struct will only be scheduled - * on a single CPU (see napi_schedule). - */ - struct pp_alloc_cache alloc ____cacheline_aligned_in_smp; - - /* Data structure for storing recycled pages. - * - * Returning/freeing pages is more complicated synchronization - * wise, because free's can happen on remote CPUs, with no - * association with allocation resource. - * - * Use ptr_ring, as it separates consumer and producer - * effeciently, it a way that doesn't bounce cache-lines. - * - * TODO: Implement bulk return pages into this structure. - */ - struct ptr_ring ring; - -#ifdef CONFIG_PAGE_POOL_STATS - /* recycle stats are per-cpu to avoid locking */ - struct page_pool_recycle_stats __percpu *recycle_stats; -#endif - atomic_t pages_state_release_cnt; - - /* A page_pool is strictly tied to a single RX-queue being - * protected by NAPI, due to above pp_alloc_cache. This - * refcnt serves purpose is to simplify drivers error handling. - */ - refcount_t user_cnt; - - u64 destroy_cnt; -}; - -struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp); - -/** - * page_pool_dev_alloc_pages() - allocate a page. - * @pool: pool from which to allocate - * - * Get a page from the page allocator or page_pool caches. 
- */ -static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool) -{ - gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); - - return page_pool_alloc_pages(pool, gfp); -} - -struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int *offset, - unsigned int size, gfp_t gfp); - -static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool, - unsigned int *offset, - unsigned int size) -{ - gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); - - return page_pool_alloc_frag(pool, offset, size, gfp); -} - -/** - * page_pool_get_dma_dir() - Retrieve the stored DMA direction. - * @pool: pool from which page was allocated - * - * Get the stored dma direction. A driver might decide to store this locally - * and avoid the extra cache line from page_pool to determine the direction. - */ -static -inline enum dma_data_direction page_pool_get_dma_dir(struct page_pool *pool) -{ - return pool->p.dma_dir; -} - -bool page_pool_return_skb_page(struct page *page, bool napi_safe); - -struct page_pool *page_pool_create(const struct page_pool_params *params); - -struct xdp_mem_info; - -#ifdef CONFIG_PAGE_POOL -void page_pool_unlink_napi(struct page_pool *pool); -void page_pool_destroy(struct page_pool *pool); -void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *), - struct xdp_mem_info *mem); -void page_pool_put_page_bulk(struct page_pool *pool, void **data, - int count); -#else -static inline void page_pool_unlink_napi(struct page_pool *pool) -{ -} - -static inline void page_pool_destroy(struct page_pool *pool) -{ -} - -static inline void page_pool_use_xdp_mem(struct page_pool *pool, - void (*disconnect)(void *), - struct xdp_mem_info *mem) -{ -} - -static inline void page_pool_put_page_bulk(struct page_pool *pool, void **data, - int count) -{ -} -#endif - -void page_pool_put_defragged_page(struct page_pool *pool, struct page *page, - unsigned int dma_sync_size, - bool allow_direct); - -/* pp_frag_count represents the number of writers who can update the page - * either by updating skb->data or via DMA mappings for the device. - * We can't rely on the page refcnt for that as we don't know who might be - * holding page references and we can't reliably destroy or sync DMA mappings - * of the fragments. - * - * When pp_frag_count reaches 0 we can either recycle the page if the page - * refcnt is 1 or return it back to the memory allocator and destroy any - * mappings we have. - */ -static inline void page_pool_fragment_page(struct page *page, long nr) -{ - atomic_long_set(&page->pp_frag_count, nr); -} - -static inline long page_pool_defrag_page(struct page *page, long nr) -{ - long ret; - - /* If nr == pp_frag_count then we have cleared all remaining - * references to the page. No need to actually overwrite it, instead - * we can leave this to be overwritten by the calling function. - * - * The main advantage to doing this is that an atomic_read is - * generally a much cheaper operation than an atomic update, - * especially when dealing with a page that may be partitioned - * into only 2 or 3 pieces. 
- */ - if (atomic_long_read(&page->pp_frag_count) == nr) - return 0; - - ret = atomic_long_sub_return(nr, &page->pp_frag_count); - WARN_ON(ret < 0); - return ret; -} - -static inline bool page_pool_is_last_frag(struct page_pool *pool, - struct page *page) -{ - /* If fragments aren't enabled or count is 0 we were the last user */ - return !(pool->p.flags & PP_FLAG_PAGE_FRAG) || - (page_pool_defrag_page(page, 1) == 0); -} - -/** - * page_pool_put_page() - release a reference to a page pool page - * @pool: pool from which page was allocated - * @page: page to release a reference on - * @dma_sync_size: how much of the page may have been touched by the device - * @allow_direct: released by the consumer, allow lockless caching - * - * The outcome of this depends on the page refcnt. If the driver bumps - * the refcnt > 1 this will unmap the page. If the page refcnt is 1 - * the allocator owns the page and will try to recycle it in one of the pool - * caches. If PP_FLAG_DMA_SYNC_DEV is set, the page will be synced for_device - * using dma_sync_single_range_for_device(). - */ -static inline void page_pool_put_page(struct page_pool *pool, - struct page *page, - unsigned int dma_sync_size, - bool allow_direct) -{ - /* When page_pool isn't compiled-in, net/core/xdp.c doesn't - * allow registering MEM_TYPE_PAGE_POOL, but shield linker. - */ -#ifdef CONFIG_PAGE_POOL - if (!page_pool_is_last_frag(pool, page)) - return; - - page_pool_put_defragged_page(pool, page, dma_sync_size, allow_direct); -#endif -} - -/** - * page_pool_put_full_page() - release a reference on a page pool page - * @pool: pool from which page was allocated - * @page: page to release a reference on - * @allow_direct: released by the consumer, allow lockless caching - * - * Similar to page_pool_put_page(), but will DMA sync the entire memory area - * as configured in &page_pool_params.max_len. - */ -static inline void page_pool_put_full_page(struct page_pool *pool, - struct page *page, bool allow_direct) -{ - page_pool_put_page(pool, page, -1, allow_direct); -} - -/** - * page_pool_recycle_direct() - release a reference on a page pool page - * @pool: pool from which page was allocated - * @page: page to release a reference on - * - * Similar to page_pool_put_full_page() but caller must guarantee safe context - * (e.g NAPI), since it will recycle the page directly into the pool fast cache. - */ -static inline void page_pool_recycle_direct(struct page_pool *pool, - struct page *page) -{ - page_pool_put_full_page(pool, page, true); -} - -#define PAGE_POOL_DMA_USE_PP_FRAG_COUNT \ - (sizeof(dma_addr_t) > sizeof(unsigned long)) - -/** - * page_pool_get_dma_addr() - Retrieve the stored DMA address. - * @page: page allocated from a page pool - * - * Fetch the DMA address of the page. The page pool to which the page belongs - * must had been created with PP_FLAG_DMA_MAP. 
- */ -static inline dma_addr_t page_pool_get_dma_addr(struct page *page) -{ - dma_addr_t ret = page->dma_addr; - - if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT) - ret |= (dma_addr_t)page->dma_addr_upper << 16 << 16; - - return ret; -} - -static inline void page_pool_set_dma_addr(struct page *page, dma_addr_t addr) -{ - page->dma_addr = addr; - if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT) - page->dma_addr_upper = upper_32_bits(addr); -} - -static inline bool is_page_pool_compiled_in(void) -{ -#ifdef CONFIG_PAGE_POOL - return true; -#else - return false; -#endif -} - -static inline bool page_pool_put(struct page_pool *pool) -{ - return refcount_dec_and_test(&pool->user_cnt); -} - -/* Caller must provide appropriate safe context, e.g. NAPI. */ -void page_pool_update_nid(struct page_pool *pool, int new_nid); -static inline void page_pool_nid_changed(struct page_pool *pool, int new_nid) -{ - if (unlikely(pool->p.nid != new_nid)) - page_pool_update_nid(pool, new_nid); -} - -#endif /* _NET_PAGE_POOL_H */ diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h new file mode 100644 index 000000000000..78df91804c87 --- /dev/null +++ b/include/net/page_pool/helpers.h @@ -0,0 +1,238 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * page_pool/helpers.h + * Author: Jesper Dangaard Brouer + * Copyright (C) 2016 Red Hat, Inc. + */ + +/** + * DOC: page_pool allocator + * + * This page_pool allocator is optimized for the XDP mode that + * uses one-frame-per-page, but have fallbacks that act like the + * regular page allocator APIs. + * + * Basic use involve replacing alloc_pages() calls with the + * page_pool_alloc_pages() call. Drivers should likely use + * page_pool_dev_alloc_pages() replacing dev_alloc_pages(). + * + * API keeps track of in-flight pages, in-order to let API user know + * when it is safe to dealloactor page_pool object. Thus, API users + * must call page_pool_put_page() where appropriate and only attach + * the page to a page_pool-aware objects, like skbs marked for recycling. + * + * API user must only call page_pool_put_page() once on a page, as it + * will either recycle the page, or in case of elevated refcnt, it + * will release the DMA mapping and in-flight state accounting. We + * hope to lift this requirement in the future. + */ +#ifndef _NET_PAGE_POOL_HELPERS_H +#define _NET_PAGE_POOL_HELPERS_H + +#include + +#ifdef CONFIG_PAGE_POOL_STATS +int page_pool_ethtool_stats_get_count(void); +u8 *page_pool_ethtool_stats_get_strings(u8 *data); +u64 *page_pool_ethtool_stats_get(u64 *data, void *stats); + +/* + * Drivers that wish to harvest page pool stats and report them to users + * (perhaps via ethtool, debugfs, or another mechanism) can allocate a + * struct page_pool_stats call page_pool_get_stats to get stats for the specified pool. + */ +bool page_pool_get_stats(struct page_pool *pool, + struct page_pool_stats *stats); +#else +static inline int page_pool_ethtool_stats_get_count(void) +{ + return 0; +} + +static inline u8 *page_pool_ethtool_stats_get_strings(u8 *data) +{ + return data; +} + +static inline u64 *page_pool_ethtool_stats_get(u64 *data, void *stats) +{ + return data; +} +#endif + +/** + * page_pool_dev_alloc_pages() - allocate a page. + * @pool: pool from which to allocate + * + * Get a page from the page allocator or page_pool caches. 
+ */ +static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool) +{ + gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); + + return page_pool_alloc_pages(pool, gfp); +} + +static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool, + unsigned int *offset, + unsigned int size) +{ + gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); + + return page_pool_alloc_frag(pool, offset, size, gfp); +} + +/** + * page_pool_get_dma_dir() - Retrieve the stored DMA direction. + * @pool: pool from which page was allocated + * + * Get the stored dma direction. A driver might decide to store this locally + * and avoid the extra cache line from page_pool to determine the direction. + */ +static +inline enum dma_data_direction page_pool_get_dma_dir(struct page_pool *pool) +{ + return pool->p.dma_dir; +} + +/* pp_frag_count represents the number of writers who can update the page + * either by updating skb->data or via DMA mappings for the device. + * We can't rely on the page refcnt for that as we don't know who might be + * holding page references and we can't reliably destroy or sync DMA mappings + * of the fragments. + * + * When pp_frag_count reaches 0 we can either recycle the page if the page + * refcnt is 1 or return it back to the memory allocator and destroy any + * mappings we have. + */ +static inline void page_pool_fragment_page(struct page *page, long nr) +{ + atomic_long_set(&page->pp_frag_count, nr); +} + +static inline long page_pool_defrag_page(struct page *page, long nr) +{ + long ret; + + /* If nr == pp_frag_count then we have cleared all remaining + * references to the page. No need to actually overwrite it, instead + * we can leave this to be overwritten by the calling function. + * + * The main advantage to doing this is that an atomic_read is + * generally a much cheaper operation than an atomic update, + * especially when dealing with a page that may be partitioned + * into only 2 or 3 pieces. + */ + if (atomic_long_read(&page->pp_frag_count) == nr) + return 0; + + ret = atomic_long_sub_return(nr, &page->pp_frag_count); + WARN_ON(ret < 0); + return ret; +} + +static inline bool page_pool_is_last_frag(struct page_pool *pool, + struct page *page) +{ + /* If fragments aren't enabled or count is 0 we were the last user */ + return !(pool->p.flags & PP_FLAG_PAGE_FRAG) || + (page_pool_defrag_page(page, 1) == 0); +} + +/** + * page_pool_put_page() - release a reference to a page pool page + * @pool: pool from which page was allocated + * @page: page to release a reference on + * @dma_sync_size: how much of the page may have been touched by the device + * @allow_direct: released by the consumer, allow lockless caching + * + * The outcome of this depends on the page refcnt. If the driver bumps + * the refcnt > 1 this will unmap the page. If the page refcnt is 1 + * the allocator owns the page and will try to recycle it in one of the pool + * caches. If PP_FLAG_DMA_SYNC_DEV is set, the page will be synced for_device + * using dma_sync_single_range_for_device(). + */ +static inline void page_pool_put_page(struct page_pool *pool, + struct page *page, + unsigned int dma_sync_size, + bool allow_direct) +{ + /* When page_pool isn't compiled-in, net/core/xdp.c doesn't + * allow registering MEM_TYPE_PAGE_POOL, but shield linker. 
+ */ +#ifdef CONFIG_PAGE_POOL + if (!page_pool_is_last_frag(pool, page)) + return; + + page_pool_put_defragged_page(pool, page, dma_sync_size, allow_direct); +#endif +} + +/** + * page_pool_put_full_page() - release a reference on a page pool page + * @pool: pool from which page was allocated + * @page: page to release a reference on + * @allow_direct: released by the consumer, allow lockless caching + * + * Similar to page_pool_put_page(), but will DMA sync the entire memory area + * as configured in &page_pool_params.max_len. + */ +static inline void page_pool_put_full_page(struct page_pool *pool, + struct page *page, bool allow_direct) +{ + page_pool_put_page(pool, page, -1, allow_direct); +} + +/** + * page_pool_recycle_direct() - release a reference on a page pool page + * @pool: pool from which page was allocated + * @page: page to release a reference on + * + * Similar to page_pool_put_full_page() but caller must guarantee safe context + * (e.g NAPI), since it will recycle the page directly into the pool fast cache. + */ +static inline void page_pool_recycle_direct(struct page_pool *pool, + struct page *page) +{ + page_pool_put_full_page(pool, page, true); +} + +#define PAGE_POOL_DMA_USE_PP_FRAG_COUNT \ + (sizeof(dma_addr_t) > sizeof(unsigned long)) + +/** + * page_pool_get_dma_addr() - Retrieve the stored DMA address. + * @page: page allocated from a page pool + * + * Fetch the DMA address of the page. The page pool to which the page belongs + * must had been created with PP_FLAG_DMA_MAP. + */ +static inline dma_addr_t page_pool_get_dma_addr(struct page *page) +{ + dma_addr_t ret = page->dma_addr; + + if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT) + ret |= (dma_addr_t)page->dma_addr_upper << 16 << 16; + + return ret; +} + +static inline void page_pool_set_dma_addr(struct page *page, dma_addr_t addr) +{ + page->dma_addr = addr; + if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT) + page->dma_addr_upper = upper_32_bits(addr); +} + +static inline bool page_pool_put(struct page_pool *pool) +{ + return refcount_dec_and_test(&pool->user_cnt); +} + +static inline void page_pool_nid_changed(struct page_pool *pool, int new_nid) +{ + if (unlikely(pool->p.nid != new_nid)) + page_pool_update_nid(pool, new_nid); +} + +#endif /* _NET_PAGE_POOL_HELPERS_H */ diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h new file mode 100644 index 000000000000..9ac39191bed7 --- /dev/null +++ b/include/net/page_pool/types.h @@ -0,0 +1,238 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _NET_PAGE_POOL_TYPES_H +#define _NET_PAGE_POOL_TYPES_H + +#include +#include + +#define PP_FLAG_DMA_MAP BIT(0) /* Should page_pool do the DMA + * map/unmap + */ +#define PP_FLAG_DMA_SYNC_DEV BIT(1) /* If set all pages that the driver gets + * from page_pool will be + * DMA-synced-for-device according to + * the length provided by the device + * driver. + * Please note DMA-sync-for-CPU is still + * device driver responsibility + */ +#define PP_FLAG_PAGE_FRAG BIT(2) /* for page frag feature */ +#define PP_FLAG_ALL (PP_FLAG_DMA_MAP |\ + PP_FLAG_DMA_SYNC_DEV |\ + PP_FLAG_PAGE_FRAG) + +/* + * Fast allocation side cache array/stack + * + * The cache size and refill watermark is related to the network + * use-case. The NAPI budget is 64 packets. After a NAPI poll the RX + * ring is usually refilled and the max consumed elements will be 64, + * thus a natural max size of objects needed in the cache. + * + * Keeping room for more objects, is due to XDP_DROP use-case. 
As + * XDP_DROP allows the opportunity to recycle objects directly into + * this array, as it shares the same softirq/NAPI protection. If + * cache is already full (or partly full) then the XDP_DROP recycles + * would have to take a slower code path. + */ +#define PP_ALLOC_CACHE_SIZE 128 +#define PP_ALLOC_CACHE_REFILL 64 +struct pp_alloc_cache { + u32 count; + struct page *cache[PP_ALLOC_CACHE_SIZE]; +}; + +/** + * struct page_pool_params - page pool parameters + * @flags: PP_FLAG_DMA_MAP, PP_FLAG_DMA_SYNC_DEV, PP_FLAG_PAGE_FRAG + * @order: 2^order pages on allocation + * @pool_size: size of the ptr_ring + * @nid: NUMA node id to allocate from pages from + * @dev: device, for DMA pre-mapping purposes + * @napi: NAPI which is the sole consumer of pages, otherwise NULL + * @dma_dir: DMA mapping direction + * @max_len: max DMA sync memory size for PP_FLAG_DMA_SYNC_DEV + * @offset: DMA sync address offset for PP_FLAG_DMA_SYNC_DEV + */ +struct page_pool_params { + unsigned int flags; + unsigned int order; + unsigned int pool_size; + int nid; + struct device *dev; + struct napi_struct *napi; + enum dma_data_direction dma_dir; + unsigned int max_len; + unsigned int offset; +/* private: used by test code only */ + void (*init_callback)(struct page *page, void *arg); + void *init_arg; +}; + +#ifdef CONFIG_PAGE_POOL_STATS +/** + * struct page_pool_alloc_stats - allocation statistics + * @fast: successful fast path allocations + * @slow: slow path order-0 allocations + * @slow_high_order: slow path high order allocations + * @empty: ptr ring is empty, so a slow path allocation was forced + * @refill: an allocation which triggered a refill of the cache + * @waive: pages obtained from the ptr ring that cannot be added to + * the cache due to a NUMA mismatch + */ +struct page_pool_alloc_stats { + u64 fast; + u64 slow; + u64 slow_high_order; + u64 empty; + u64 refill; + u64 waive; +}; + +/** + * struct page_pool_recycle_stats - recycling (freeing) statistics + * @cached: recycling placed page in the page pool cache + * @cache_full: page pool cache was full + * @ring: page placed into the ptr ring + * @ring_full: page released from page pool because the ptr ring was full + * @released_refcnt: page released (and not recycled) because refcnt > 1 + */ +struct page_pool_recycle_stats { + u64 cached; + u64 cache_full; + u64 ring; + u64 ring_full; + u64 released_refcnt; +}; + +/** + * struct page_pool_stats - combined page pool use statistics + * @alloc_stats: see struct page_pool_alloc_stats + * @recycle_stats: see struct page_pool_recycle_stats + * + * Wrapper struct for combining page pool stats with different storage + * requirements. + */ +struct page_pool_stats { + struct page_pool_alloc_stats alloc_stats; + struct page_pool_recycle_stats recycle_stats; +}; +#endif + +struct page_pool { + struct page_pool_params p; + + struct delayed_work release_dw; + void (*disconnect)(void *pool); + unsigned long defer_start; + unsigned long defer_warn; + + u32 pages_state_hold_cnt; + unsigned int frag_offset; + struct page *frag_page; + long frag_users; + +#ifdef CONFIG_PAGE_POOL_STATS + /* these stats are incremented while in softirq context */ + struct page_pool_alloc_stats alloc_stats; +#endif + u32 xdp_mem_id; + + /* + * Data structure for allocation side + * + * Drivers allocation side usually already perform some kind + * of resource protection. Piggyback on this protection, and + * require driver to protect allocation side. + * + * For NIC drivers this means, allocate a page_pool per + * RX-queue. 
As the RX-queue is already protected by + * Softirq/BH scheduling and napi_schedule. NAPI schedule + * guarantee that a single napi_struct will only be scheduled + * on a single CPU (see napi_schedule). + */ + struct pp_alloc_cache alloc ____cacheline_aligned_in_smp; + + /* Data structure for storing recycled pages. + * + * Returning/freeing pages is more complicated synchronization + * wise, because free's can happen on remote CPUs, with no + * association with allocation resource. + * + * Use ptr_ring, as it separates consumer and producer + * efficiently, it a way that doesn't bounce cache-lines. + * + * TODO: Implement bulk return pages into this structure. + */ + struct ptr_ring ring; + +#ifdef CONFIG_PAGE_POOL_STATS + /* recycle stats are per-cpu to avoid locking */ + struct page_pool_recycle_stats __percpu *recycle_stats; +#endif + atomic_t pages_state_release_cnt; + + /* A page_pool is strictly tied to a single RX-queue being + * protected by NAPI, due to above pp_alloc_cache. This + * refcnt serves purpose is to simplify drivers error handling. + */ + refcount_t user_cnt; + + u64 destroy_cnt; +}; + +struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp); +struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int *offset, + unsigned int size, gfp_t gfp); +bool page_pool_return_skb_page(struct page *page, bool napi_safe); + +struct page_pool *page_pool_create(const struct page_pool_params *params); + +struct xdp_mem_info; + +#ifdef CONFIG_PAGE_POOL +void page_pool_unlink_napi(struct page_pool *pool); +void page_pool_destroy(struct page_pool *pool); +void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *), + struct xdp_mem_info *mem); +void page_pool_put_page_bulk(struct page_pool *pool, void **data, + int count); +#else +static inline void page_pool_unlink_napi(struct page_pool *pool) +{ +} + +static inline void page_pool_destroy(struct page_pool *pool) +{ +} + +static inline void page_pool_use_xdp_mem(struct page_pool *pool, + void (*disconnect)(void *), + struct xdp_mem_info *mem) +{ +} + +static inline void page_pool_put_page_bulk(struct page_pool *pool, void **data, + int count) +{ +} +#endif + +void page_pool_put_defragged_page(struct page_pool *pool, struct page *page, + unsigned int dma_sync_size, + bool allow_direct); + +static inline bool is_page_pool_compiled_in(void) +{ +#ifdef CONFIG_PAGE_POOL + return true; +#else + return false; +#endif +} + +/* Caller must provide appropriate safe context, e.g. NAPI. 
*/ +void page_pool_update_nid(struct page_pool *pool, int new_nid); + +#endif /* _NET_PAGE_POOL_H */ diff --git a/include/trace/events/page_pool.h b/include/trace/events/page_pool.h index ca534501158b..6834356b2d2a 100644 --- a/include/trace/events/page_pool.h +++ b/include/trace/events/page_pool.h @@ -9,7 +9,7 @@ #include #include -#include +#include TRACE_EVENT(page_pool_release, diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 0aac76c13fd4..57a7a64b84ed 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 5d615a169718..cd28c1f14002 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -10,7 +10,7 @@ #include #include -#include +#include #include #include diff --git a/net/core/skbuff.c b/net/core/skbuff.c index c6f98245582c..d3bed964123c 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -73,7 +73,7 @@ #include #include #include -#include +#include #include #include diff --git a/net/core/xdp.c b/net/core/xdp.c index 8362130bf085..a70670fe9a2d 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include /* struct xdp_mem_allocator */ -- cgit v1.2.3 From 75eaf63ea7afeafd026ffef03bdc69e31f10829b Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Fri, 4 Aug 2023 20:05:25 +0200 Subject: net: skbuff: don't include to Currently, touching triggers a rebuild of more than half of the kernel. That's because it's included in . And each new include to page_pool/types.h adds more [useless] data for the toolchain to process per each source file from that pile. In commit 6a5bcd84e886 ("page_pool: Allow drivers to hint on SKB recycling"), Matteo included it to be able to call a couple of functions defined there. Then, in commit 57f05bc2ab24 ("page_pool: keep pp info as long as page pool owns the page") one of the calls was removed, so only one was left. It's the call to page_pool_return_skb_page() in napi_frag_unref(). The function is external and doesn't have any dependencies. Having very niche page_pool_types.h included only for that looks like an overkill. As %PP_SIGNATURE is not local to page_pool.c (was only in the early submissions), nothing holds this function there. Teleport page_pool_return_skb_page() to skbuff.c, just next to the main consumer, skb_pp_recycle(), and rename it to napi_pp_put_page(), as it doesn't work with skbs at all and the former name tells nothing. The #if guards here are only to not compile and have it in the vmlinux when not needed -- both call sites are already guarded. Now, touching page_pool_types.h only triggers rebuilding of the drivers using it and a couple of core networking files. 
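
As a rough, hypothetical illustration of what the header split means for driver code (the my_rxq structure and function names below are invented; only the header names, flags and page_pool calls come from the patches and kdoc above): a .c file that allocates and recycles pages includes the helpers header, while a driver-private header that merely holds a struct page_pool pointer can include page_pool/types.h alone.

	#include <net/page_pool/helpers.h>	/* inline helpers on top of the types */

	struct my_rxq {				/* hypothetical driver RX queue */
		struct page_pool *pp;
		/* descriptor ring, etc. */
	};

	static int my_rxq_create_pool(struct my_rxq *rxq, struct device *dev,
				      struct napi_struct *napi)
	{
		struct page_pool_params pp_params = {
			.flags		= PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
			.order		= 0,
			.pool_size	= 256,
			.nid		= NUMA_NO_NODE,
			.dev		= dev,
			.napi		= napi,
			.dma_dir	= DMA_FROM_DEVICE,
			.max_len	= PAGE_SIZE,
		};

		rxq->pp = page_pool_create(&pp_params);
		if (IS_ERR(rxq->pp))
			return PTR_ERR(rxq->pp);
		return 0;
	}

	static struct page *my_rxq_get_page(struct my_rxq *rxq, dma_addr_t *dma)
	{
		struct page *page = page_pool_dev_alloc_pages(rxq->pp);

		if (page)
			*dma = page_pool_get_dma_addr(page);	/* needs PP_FLAG_DMA_MAP */
		return page;
	}

	static void my_rxq_drop_page(struct my_rxq *rxq, struct page *page)
	{
		/* Recycle into the pool; with PP_FLAG_DMA_SYNC_DEV the area up
		 * to max_len is synced for the device, per the kdoc above. */
		page_pool_put_full_page(rxq->pp, page, false);
	}
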
Suggested-by: Jakub Kicinski # make skbuff.h less heavy Suggested-by: Alexander Duyck # move to skbuff.c Signed-off-by: Alexander Lobakin Reviewed-by: Alexander Duyck Link: https://lore.kernel.org/r/20230804180529.2483231-3-aleksander.lobakin@intel.com Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 5 +++-- include/net/page_pool/types.h | 2 -- net/core/page_pool.c | 39 ------------------------------------- net/core/skbuff.c | 45 +++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 46 insertions(+), 45 deletions(-) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 888e3d7e74c1..aa57e2eca33b 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -32,7 +32,6 @@ #include #include #include -#include #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include #endif @@ -3421,13 +3420,15 @@ static inline void skb_frag_ref(struct sk_buff *skb, int f) __skb_frag_ref(&skb_shinfo(skb)->frags[f]); } +bool napi_pp_put_page(struct page *page, bool napi_safe); + static inline void napi_frag_unref(skb_frag_t *frag, bool recycle, bool napi_safe) { struct page *page = skb_frag_page(frag); #ifdef CONFIG_PAGE_POOL - if (recycle && page_pool_return_skb_page(page, napi_safe)) + if (recycle && napi_pp_put_page(page, napi_safe)) return; #endif put_page(page); diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h index 9ac39191bed7..fcb846523398 100644 --- a/include/net/page_pool/types.h +++ b/include/net/page_pool/types.h @@ -185,8 +185,6 @@ struct page_pool { struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp); struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int *offset, unsigned int size, gfp_t gfp); -bool page_pool_return_skb_page(struct page *page, bool napi_safe); - struct page_pool *page_pool_create(const struct page_pool_params *params); struct xdp_mem_info; diff --git a/net/core/page_pool.c b/net/core/page_pool.c index cd28c1f14002..03ad74d25959 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -935,42 +935,3 @@ void page_pool_update_nid(struct page_pool *pool, int new_nid) } } EXPORT_SYMBOL(page_pool_update_nid); - -bool page_pool_return_skb_page(struct page *page, bool napi_safe) -{ - struct napi_struct *napi; - struct page_pool *pp; - bool allow_direct; - - page = compound_head(page); - - /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation - * in order to preserve any existing bits, such as bit 0 for the - * head page of compound page and bit 1 for pfmemalloc page, so - * mask those bits for freeing side when doing below checking, - * and page_is_pfmemalloc() is checked in __page_pool_put_page() - * to avoid recycling the pfmemalloc page. - */ - if (unlikely((page->pp_magic & ~0x3UL) != PP_SIGNATURE)) - return false; - - pp = page->pp; - - /* Allow direct recycle if we have reasons to believe that we are - * in the same context as the consumer would run, so there's - * no possible race. - */ - napi = READ_ONCE(pp->p.napi); - allow_direct = napi_safe && napi && - READ_ONCE(napi->list_owner) == smp_processor_id(); - - /* Driver set this to memory recycling info. Reset it on recycle. - * This will *not* work for NIC using a split-page memory model. - * The page will be returned to the pool here regardless of the - * 'flipped' fragment being in use or not. 
- */ - page_pool_put_full_page(pp, page, allow_direct); - - return true; -} -EXPORT_SYMBOL(page_pool_return_skb_page); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index d3bed964123c..acc5844a0de1 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -73,7 +73,7 @@ #include #include #include -#include +#include #include #include @@ -879,11 +879,52 @@ static void skb_clone_fraglist(struct sk_buff *skb) skb_get(list); } +#if IS_ENABLED(CONFIG_PAGE_POOL) +bool napi_pp_put_page(struct page *page, bool napi_safe) +{ + struct napi_struct *napi; + struct page_pool *pp; + bool allow_direct; + + page = compound_head(page); + + /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation + * in order to preserve any existing bits, such as bit 0 for the + * head page of compound page and bit 1 for pfmemalloc page, so + * mask those bits for freeing side when doing below checking, + * and page_is_pfmemalloc() is checked in __page_pool_put_page() + * to avoid recycling the pfmemalloc page. + */ + if (unlikely((page->pp_magic & ~0x3UL) != PP_SIGNATURE)) + return false; + + pp = page->pp; + + /* Allow direct recycle if we have reasons to believe that we are + * in the same context as the consumer would run, so there's + * no possible race. + */ + napi = READ_ONCE(pp->p.napi); + allow_direct = napi_safe && napi && + READ_ONCE(napi->list_owner) == smp_processor_id(); + + /* Driver set this to memory recycling info. Reset it on recycle. + * This will *not* work for NIC using a split-page memory model. + * The page will be returned to the pool here regardless of the + * 'flipped' fragment being in use or not. + */ + page_pool_put_full_page(pp, page, allow_direct); + + return true; +} +EXPORT_SYMBOL(napi_pp_put_page); +#endif + static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe) { if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle) return false; - return page_pool_return_skb_page(virt_to_page(data), napi_safe); + return napi_pp_put_page(virt_to_page(data), napi_safe); } static void skb_kfree_head(void *head, unsigned int end_offset) -- cgit v1.2.3 From 5b899c33b3b852b9559b724cfee67801324a0886 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Fri, 4 Aug 2023 20:05:27 +0200 Subject: net: skbuff: avoid accessing page_pool if !napi_safe when returning page Currently, pp->p.napi is always read, but the actual variable it gets assigned to is read-only when @napi_safe is true. For the !napi_safe cases, which yet is still a pack, it's an unneeded operation. Moreover, it can lead to premature or even redundant page_pool cacheline access. For example, when page_pool_is_last_frag() returns false (with the recent frag improvements). Thus, read it only when @napi_safe is true. This also allows moving @napi inside the condition block itself. Constify it while we are here, because why not. 
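To make the intended access pattern concrete, here is a small hedged sketch (the helper name is invented for the example; in the patch the logic sits inline in napi_pp_put_page()):

	/* Only the napi_safe path may dereference pp->p.napi, so the extra
	 * page_pool cacheline stays untouched for every other caller.
	 */
	static bool can_recycle_direct(const struct page_pool *pp, bool napi_safe)
	{
		const struct napi_struct *napi;

		if (!napi_safe)
			return false;	/* don't even read pp->p.napi */

		napi = READ_ONCE(pp->p.napi);
		return napi && READ_ONCE(napi->list_owner) == smp_processor_id();
	}
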
Signed-off-by: Alexander Lobakin Reviewed-by: Alexander Duyck Link: https://lore.kernel.org/r/20230804180529.2483231-5-aleksander.lobakin@intel.com Signed-off-by: Jakub Kicinski --- net/core/skbuff.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index acc5844a0de1..85f82a6a08dc 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -882,9 +882,8 @@ static void skb_clone_fraglist(struct sk_buff *skb) #if IS_ENABLED(CONFIG_PAGE_POOL) bool napi_pp_put_page(struct page *page, bool napi_safe) { - struct napi_struct *napi; + bool allow_direct = false; struct page_pool *pp; - bool allow_direct; page = compound_head(page); @@ -904,9 +903,12 @@ bool napi_pp_put_page(struct page *page, bool napi_safe) * in the same context as the consumer would run, so there's * no possible race. */ - napi = READ_ONCE(pp->p.napi); - allow_direct = napi_safe && napi && - READ_ONCE(napi->list_owner) == smp_processor_id(); + if (napi_safe) { + const struct napi_struct *napi = READ_ONCE(pp->p.napi); + + allow_direct = napi && + READ_ONCE(napi->list_owner) == smp_processor_id(); + } /* Driver set this to memory recycling info. Reset it on recycle. * This will *not* work for NIC using a split-page memory model. -- cgit v1.2.3 From ff4e538c8c3e675a15e1e49509c55951832e0451 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 4 Aug 2023 20:05:28 +0200 Subject: page_pool: add a lockdep check for recycling in hardirq Page pool use in hardirq is prohibited, add debug checks to catch misuses. IIRC we previously discussed using DEBUG_NET_WARN_ON_ONCE() for this, but there were concerns that people will have DEBUG_NET enabled in perf testing. I don't think anyone enables lockdep in perf testing, so use lockdep to avoid pushback and arguing :) Acked-by: Jesper Dangaard Brouer Signed-off-by: Alexander Lobakin Reviewed-by: Alexander Duyck Link: https://lore.kernel.org/r/20230804180529.2483231-6-aleksander.lobakin@intel.com Signed-off-by: Jakub Kicinski --- include/linux/lockdep.h | 7 +++++++ net/core/page_pool.c | 2 ++ 2 files changed, 9 insertions(+) (limited to 'net') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 310f85903c91..dc2844b071c2 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -625,6 +625,12 @@ do { \ WARN_ON_ONCE(__lockdep_enabled && !this_cpu_read(hardirq_context)); \ } while (0) +#define lockdep_assert_no_hardirq() \ +do { \ + WARN_ON_ONCE(__lockdep_enabled && (this_cpu_read(hardirq_context) || \ + !this_cpu_read(hardirqs_enabled))); \ +} while (0) + #define lockdep_assert_preemption_enabled() \ do { \ WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_COUNT) && \ @@ -659,6 +665,7 @@ do { \ # define lockdep_assert_irqs_enabled() do { } while (0) # define lockdep_assert_irqs_disabled() do { } while (0) # define lockdep_assert_in_irq() do { } while (0) +# define lockdep_assert_no_hardirq() do { } while (0) # define lockdep_assert_preemption_enabled() do { } while (0) # define lockdep_assert_preemption_disabled() do { } while (0) diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 03ad74d25959..77cb75e63aca 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -587,6 +587,8 @@ static __always_inline struct page * __page_pool_put_page(struct page_pool *pool, struct page *page, unsigned int dma_sync_size, bool allow_direct) { + lockdep_assert_no_hardirq(); + /* This allocator is optimized for the XDP mode that uses * one-frame-per-page, but have fallbacks that act like the * 
regular page allocator APIs. -- cgit v1.2.3 From 4a36d0180c452c3482792e0ff14e2bcf536a9284 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Fri, 4 Aug 2023 20:05:29 +0200 Subject: net: skbuff: always try to recycle PP pages directly when in softirq Commit 8c48eea3adf3 ("page_pool: allow caching from safely localized NAPI") allowed direct recycling of skb pages to their PP for some cases, but unfortunately missed a couple of other majors. For example, %XDP_DROP in skb mode. The netstack just calls kfree_skb(), which unconditionally passes `false` as @napi_safe. Thus, all pages go through ptr_ring and locks, although most of time we're actually inside the NAPI polling this PP is linked with, so that it would be perfectly safe to recycle pages directly. Let's address such. If @napi_safe is true, we're fine, don't change anything for this path. But if it's false, check whether we are in the softirq context. It will most likely be so and then if ->list_owner is our current CPU, we're good to use direct recycling, even though @napi_safe is false -- concurrent access is excluded. in_softirq() protection is needed mostly due to we can hit this place in the process context (not the hardirq though). For the mentioned xdp-drop-skb-mode case, the improvement I got is 3-4% in Mpps. As for page_pool stats, recycle_ring is now 0 and alloc_slow counter doesn't change most of time, which means the MM layer is not even called to allocate any new pages. Suggested-by: Jakub Kicinski # in_softirq() Signed-off-by: Alexander Lobakin Reviewed-by: Alexander Duyck Link: https://lore.kernel.org/r/20230804180529.2483231-7-aleksander.lobakin@intel.com Signed-off-by: Jakub Kicinski --- net/core/skbuff.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 85f82a6a08dc..33fdf04d4334 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -902,8 +902,10 @@ bool napi_pp_put_page(struct page *page, bool napi_safe) /* Allow direct recycle if we have reasons to believe that we are * in the same context as the consumer would run, so there's * no possible race. + * __page_pool_put_page() makes sure we're not in hardirq context + * and interrupts are enabled prior to accessing the cache. */ - if (napi_safe) { + if (napi_safe || in_softirq()) { const struct napi_struct *napi = READ_ONCE(pp->p.napi); allow_direct = napi && -- cgit v1.2.3 From 1d85594fd3e7e39e63b53b1bdc2d89db43b6ecd5 Mon Sep 17 00:00:00 2001 From: Maciej Å»enczykowski Date: Tue, 25 Jul 2023 01:54:43 -0700 Subject: netfilter: nfnetlink_log: always add a timestamp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Compared to all the other work we're already doing to deliver an skb to userspace this is very cheap - at worse an extra call to ktime_get_real() - and very useful. 
(and indeed it may even be cheaper if we're running from other hooks) (background: Android occasionally logs packets which caused wake from sleep/suspend and we'd like to have timestamps reliably associated with these events) Cc: Pablo Neira Ayuso Cc: Martin KaFai Lau Cc: Florian Westphal Signed-off-by: Maciej Å»enczykowski Signed-off-by: Florian Westphal --- net/netfilter/nfnetlink_log.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index e57eb168ee13..53c9e76473ba 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -470,7 +470,6 @@ __build_packet_message(struct nfnl_log_net *log, sk_buff_data_t old_tail = inst->skb->tail; struct sock *sk; const unsigned char *hwhdrp; - ktime_t tstamp; nlh = nfnl_msg_put(inst->skb, 0, 0, nfnl_msg_type(NFNL_SUBSYS_ULOG, NFULNL_MSG_PACKET), @@ -599,10 +598,9 @@ __build_packet_message(struct nfnl_log_net *log, goto nla_put_failure; } - tstamp = skb_tstamp_cond(skb, false); - if (hooknum <= NF_INET_FORWARD && tstamp) { + if (hooknum <= NF_INET_FORWARD) { + struct timespec64 kts = ktime_to_timespec64(skb_tstamp_cond(skb, true)); struct nfulnl_msg_packet_timestamp ts; - struct timespec64 kts = ktime_to_timespec64(tstamp); ts.sec = cpu_to_be64(kts.tv_sec); ts.usec = cpu_to_be64(kts.tv_nsec / NSEC_PER_USEC); -- cgit v1.2.3 From c67180efc507e04a87f22aa68bd7dd832db006b7 Mon Sep 17 00:00:00 2001 From: xu xin Date: Mon, 7 Aug 2023 01:54:08 +0000 Subject: net/ipv4: return the real errno instead of -EINVAL For now, No matter what error pointer ip_neigh_for_gw() returns, ip_finish_output2() always return -EINVAL, which may mislead the upper users. For exemple, an application uses sendto to send an UDP packet, but when the neighbor table overflows, sendto() will get a value of -EINVAL, and it will cause users to waste a lot of time checking parameters for errors. Return the real errno instead of -EINVAL. Signed-off-by: xu xin Reviewed-by: Yang Yang Cc: Si Hao Reviewed-by: Kuniyuki Iwashima Reviewed-by: Vadim Fedorenko Link: https://lore.kernel.org/r/20230807015408.248237-1-xu.xin16@zte.com.cn Signed-off-by: Jakub Kicinski --- net/ipv4/ip_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 6ba1a0fafbaa..f28c87533a46 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -236,7 +236,7 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s net_dbg_ratelimited("%s: No header cache and no neighbour!\n", __func__); kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL); - return -EINVAL; + return PTR_ERR(neigh); } static int ip_finish_output_gso(struct net *net, struct sock *sk, -- cgit v1.2.3 From 794529c448008d8bc24d6e06c7528b7dbec99dfd Mon Sep 17 00:00:00 2001 From: Ziyang Xuan Date: Mon, 7 Aug 2023 10:09:47 +0800 Subject: ipv6: exthdrs: Replace opencoded swap() implementation Get a coccinelle warning as follows: net/ipv6/exthdrs.c:800:29-30: WARNING opportunity for swap() Use swap() to replace opencoded implementation. 
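As a simplified illustration of the transformation (context trimmed; swap() is the generic kernel macro, provided by the core headers, linux/minmax.h in recent trees):

	/* Before: open-coded swap through a local temporary */
	struct in6_addr daddr = *addr;

	*addr = ipv6_hdr(skb)->daddr;
	ipv6_hdr(skb)->daddr = daddr;

	/* After: same effect, the temporary is hidden inside the macro */
	swap(*addr, ipv6_hdr(skb)->daddr);
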
Signed-off-by: Ziyang Xuan Reviewed-by: Pavan Chebbi Reviewed-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20230807020947.1991716-1-william.xuanziyang@huawei.com Signed-off-by: Jakub Kicinski --- net/ipv6/exthdrs.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index f4bfccae003c..4952ae792450 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -648,7 +648,6 @@ static int ipv6_rthdr_rcv(struct sk_buff *skb) struct inet6_dev *idev = __in6_dev_get(skb->dev); struct inet6_skb_parm *opt = IP6CB(skb); struct in6_addr *addr = NULL; - struct in6_addr daddr; int n, i; struct ipv6_rt_hdr *hdr; struct rt0_hdr *rthdr; @@ -796,9 +795,7 @@ looped_back: return -1; } - daddr = *addr; - *addr = ipv6_hdr(skb)->daddr; - ipv6_hdr(skb)->daddr = daddr; + swap(*addr, ipv6_hdr(skb)->daddr); ip6_route_input(skb); if (skb_dst(skb)->error) { -- cgit v1.2.3 From ba4a734e1aa073b06412fc80ad81b54c4644d093 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 7 Aug 2023 09:10:22 +0200 Subject: net/tls: avoid TCP window full during ->read_sock() When flushing the backlog after decoding a record we don't really know how much data the caller want us to evaluate, so use INT_MAX and 0 as arguments to tls_read_flush_backlog() to ensure we flush at 128k of data. Otherwise we might be reading too much data and trigger a TCP window full. Suggested-by: Jakub Kicinski Signed-off-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Link: https://lore.kernel.org/r/20230807071022.10091-1-hare@suse.de Signed-off-by: Jakub Kicinski --- net/tls/tls_sw.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 9c1f13541708..5c122d7bb784 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -2240,7 +2240,6 @@ int tls_sw_read_sock(struct sock *sk, read_descriptor_t *desc, tlm = tls_msg(skb); } else { struct tls_decrypt_arg darg; - int to_decrypt; err = tls_rx_rec_wait(sk, NULL, true, released); if (err <= 0) @@ -2248,20 +2247,18 @@ int tls_sw_read_sock(struct sock *sk, read_descriptor_t *desc, memset(&darg.inargs, 0, sizeof(darg.inargs)); - rxm = strp_msg(tls_strp_msg(ctx)); - tlm = tls_msg(tls_strp_msg(ctx)); - - to_decrypt = rxm->full_len - prot->overhead_size; - err = tls_rx_one_record(sk, NULL, &darg); if (err < 0) { tls_err_abort(sk, -EBADMSG); goto read_sock_end; } - released = tls_read_flush_backlog(sk, prot, rxm->full_len, to_decrypt, - decrypted, &flushed_at); + released = tls_read_flush_backlog(sk, prot, INT_MAX, + 0, decrypted, + &flushed_at); skb = darg.skb; + rxm = strp_msg(skb); + tlm = tls_msg(skb); decrypted += rxm->full_len; tls_rx_rec_done(ctx); -- cgit v1.2.3 From ca76b386d46fdcdf2d8829f008744e57f819a581 Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Mon, 7 Aug 2023 22:29:26 +0800 Subject: tipc: Remove unused declaration tipc_link_build_bc_sync_msg() Commit 526669866140 ("tipc: let broadcast packet reception use new link receive function") declared but never implemented this. 
Signed-off-by: Yue Haibing Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230807142926.45752-1-yuehaibing@huawei.com Signed-off-by: Jakub Kicinski --- net/tipc/link.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/tipc/link.h b/net/tipc/link.h index a16f401fdabd..d80f5649b395 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -148,8 +148,6 @@ int tipc_link_bc_ack_rcv(struct tipc_link *l, u16 acked, u16 gap, struct tipc_gap_ack_blks *ga, struct sk_buff_head *xmitq, struct sk_buff_head *retrq); -void tipc_link_build_bc_sync_msg(struct tipc_link *l, - struct sk_buff_head *xmitq); void tipc_link_bc_init_rcv(struct tipc_link *l, struct tipc_msg *hdr); int tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr, struct sk_buff_head *xmitq); -- cgit v1.2.3 From 832140804e3b3ad19d73adebd25f69ed98778c58 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 8 Aug 2023 10:20:20 +0200 Subject: devlink: clear flag on port register error path When xarray insertion fails, clear the flag. Signed-off-by: Jiri Pirko Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230808082020.1363497-1-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- net/devlink/leftover.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/devlink/leftover.c b/net/devlink/leftover.c index 3bf42f5335ed..e7900d9fa205 100644 --- a/net/devlink/leftover.c +++ b/net/devlink/leftover.c @@ -6829,8 +6829,10 @@ int devl_port_register_with_ops(struct devlink *devlink, spin_lock_init(&devlink_port->type_lock); INIT_LIST_HEAD(&devlink_port->reporter_list); err = xa_insert(&devlink->ports, port_index, devlink_port, GFP_KERNEL); - if (err) + if (err) { + devlink_port->registered = false; return err; + } INIT_DELAYED_WORK(&devlink_port->type_warn_dw, &devlink_port_type_warn); devlink_port_type_warn_schedule(devlink_port); -- cgit v1.2.3 From 145622771d22a7ad5061278eb6fb16396c29d308 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 8 Aug 2023 12:12:16 +0100 Subject: net: dsa: mark parsed interface mode for legacy switch drivers If we successfully parsed an interface mode with a legacy switch driver, populate that mode into phylink's supported interfaces rather than defaulting to the internal and gmii interfaces. This hasn't caused an issue so far, because when the interface doesn't match a supported one, phylink_validate() doesn't clear the supported mask, but instead returns -EINVAL. phylink_parse_fixedlink() doesn't check this return value, and merely relies on the supported ethtool link modes mask being cleared. Therefore, the fixed link settings end up being allowed despite validation failing. Before this causes a problem, arrange for DSA to more accurately populate phylink's supported interfaces mask so validation can correctly succeed. 
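A minimal sketch of the idea described above, assuming a legacy driver without a phylink_get_caps() hook (the helper name is illustrative and not part of the actual patch):

	static void legacy_fill_supported(struct phylink_config *cfg,
					  phy_interface_t mode)
	{
		if (mode != PHY_INTERFACE_MODE_NA) {
			/* DT/firmware gave a concrete phy-mode: advertise
			 * only that, so phylink_validate() can succeed.
			 */
			__set_bit(mode, cfg->supported_interfaces);
		} else {
			/* Unknown mode: keep the permissive legacy defaults */
			__set_bit(PHY_INTERFACE_MODE_INTERNAL,
				  cfg->supported_interfaces);
			__set_bit(PHY_INTERFACE_MODE_GMII,
				  cfg->supported_interfaces);
		}
	}
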
Signed-off-by: Russell King (Oracle) Reviewed-by: Vladimir Oltean Link: https://lore.kernel.org/r/E1qTKdM-003Cpx-Eh@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- net/dsa/port.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/dsa/port.c b/net/dsa/port.c index 24015e11255f..37ab238e8304 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -1690,10 +1690,14 @@ int dsa_port_phylink_create(struct dsa_port *dp) ds->ops->phylink_get_caps(ds, dp->index, &dp->pl_config); } else { /* For legacy drivers */ - __set_bit(PHY_INTERFACE_MODE_INTERNAL, - dp->pl_config.supported_interfaces); - __set_bit(PHY_INTERFACE_MODE_GMII, - dp->pl_config.supported_interfaces); + if (mode != PHY_INTERFACE_MODE_NA) { + __set_bit(mode, dp->pl_config.supported_interfaces); + } else { + __set_bit(PHY_INTERFACE_MODE_INTERNAL, + dp->pl_config.supported_interfaces); + __set_bit(PHY_INTERFACE_MODE_GMII, + dp->pl_config.supported_interfaces); + } } pl = phylink_create(&dp->pl_config, of_fwnode_handle(dp->dn), -- cgit v1.2.3 From 1ded5e5a5931bb8b31e15b63b655fe232e3416b2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 8 Aug 2023 13:58:09 +0000 Subject: net: annotate data-races around sock->ops IPV6_ADDRFORM socket option is evil, because it can change sock->ops while other threads might read it. Same issue for sk->sk_family being set to AF_INET. Adding READ_ONCE() over sock->ops reads is needed for sockets that might be impacted by IPV6_ADDRFORM. Note that mptcp_is_tcpsk() can also overwrite sock->ops. Adding annotations for all sk->sk_family reads will require more patches :/ BUG: KCSAN: data-race in ____sys_sendmsg / do_ipv6_setsockopt write to 0xffff888109f24ca0 of 8 bytes by task 4470 on cpu 0: do_ipv6_setsockopt+0x2c5e/0x2ce0 net/ipv6/ipv6_sockglue.c:491 ipv6_setsockopt+0x57/0x130 net/ipv6/ipv6_sockglue.c:1012 udpv6_setsockopt+0x95/0xa0 net/ipv6/udp.c:1690 sock_common_setsockopt+0x61/0x70 net/core/sock.c:3663 __sys_setsockopt+0x1c3/0x230 net/socket.c:2273 __do_sys_setsockopt net/socket.c:2284 [inline] __se_sys_setsockopt net/socket.c:2281 [inline] __x64_sys_setsockopt+0x66/0x80 net/socket.c:2281 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd read to 0xffff888109f24ca0 of 8 bytes by task 4469 on cpu 1: sock_sendmsg_nosec net/socket.c:724 [inline] sock_sendmsg net/socket.c:747 [inline] ____sys_sendmsg+0x349/0x4c0 net/socket.c:2503 ___sys_sendmsg net/socket.c:2557 [inline] __sys_sendmmsg+0x263/0x500 net/socket.c:2643 __do_sys_sendmmsg net/socket.c:2672 [inline] __se_sys_sendmmsg net/socket.c:2669 [inline] __x64_sys_sendmmsg+0x57/0x60 net/socket.c:2669 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd value changed: 0xffffffff850e32b8 -> 0xffffffff850da890 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 4469 Comm: syz-executor.1 Not tainted 6.4.0-rc5-syzkaller-00313-g4c605260bc60 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 05/25/2023 Reported-by: syzbot Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20230808135809.2300241-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/net.h | 2 +- net/9p/trans_fd.c | 4 +- net/core/scm.c | 3 +- net/core/skmsg.c | 8 ++- net/core/sock.c | 24 ++++++--- net/ipv6/ipv6_sockglue.c | 8 +-- net/mptcp/protocol.c | 8 +-- net/socket.c | 136 
++++++++++++++++++++++++++++------------------- net/unix/scm.c | 3 +- 9 files changed, 118 insertions(+), 78 deletions(-) (limited to 'net') diff --git a/include/linux/net.h b/include/linux/net.h index 41c608c1b02c..c9b4a63791a4 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -123,7 +123,7 @@ struct socket { struct file *file; struct sock *sk; - const struct proto_ops *ops; + const struct proto_ops *ops; /* Might change with IPV6_ADDRFORM or MPTCP. */ struct socket_wq wq; }; diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index 00b684616e8d..c4015f30f9fa 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -1019,7 +1019,7 @@ p9_fd_create_tcp(struct p9_client *client, const char *addr, char *args) } } - err = csocket->ops->connect(csocket, + err = READ_ONCE(csocket->ops)->connect(csocket, (struct sockaddr *)&sin_server, sizeof(struct sockaddr_in), 0); if (err < 0) { @@ -1060,7 +1060,7 @@ p9_fd_create_unix(struct p9_client *client, const char *addr, char *args) return err; } - err = csocket->ops->connect(csocket, (struct sockaddr *)&sun_server, + err = READ_ONCE(csocket->ops)->connect(csocket, (struct sockaddr *)&sun_server, sizeof(struct sockaddr_un) - 1, 0); if (err < 0) { pr_err("%s (%d): problem connecting socket: %s: %d\n", diff --git a/net/core/scm.c b/net/core/scm.c index 3cd7dd377e53..880027ecf516 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -130,6 +130,7 @@ EXPORT_SYMBOL(__scm_destroy); int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) { + const struct proto_ops *ops = READ_ONCE(sock->ops); struct cmsghdr *cmsg; int err; @@ -153,7 +154,7 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) switch (cmsg->cmsg_type) { case SCM_RIGHTS: - if (!sock->ops || sock->ops->family != PF_UNIX) + if (!ops || ops->family != PF_UNIX) goto error; err=scm_fp_copy(cmsg, &p->fp); if (err<0) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index a29508e1ff35..e6dfc846018f 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -1198,13 +1198,17 @@ out: static void sk_psock_verdict_data_ready(struct sock *sk) { struct socket *sock = sk->sk_socket; + const struct proto_ops *ops; int copied; trace_sk_data_ready(sk); - if (unlikely(!sock || !sock->ops || !sock->ops->read_skb)) + if (unlikely(!sock)) return; - copied = sock->ops->read_skb(sk, sk_psock_verdict_recv); + ops = READ_ONCE(sock->ops); + if (!ops || !ops->read_skb) + return; + copied = ops->read_skb(sk, sk_psock_verdict_recv); if (copied >= 0) { struct sk_psock *psock; diff --git a/net/core/sock.c b/net/core/sock.c index 49915801d53a..51f7d94eccf7 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1277,14 +1277,19 @@ set_sndbuf: break; case SO_RCVLOWAT: + { + int (*set_rcvlowat)(struct sock *sk, int val) = NULL; + if (val < 0) val = INT_MAX; - if (sock && sock->ops->set_rcvlowat) - ret = sock->ops->set_rcvlowat(sk, val); + if (sock) + set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat; + if (set_rcvlowat) + ret = set_rcvlowat(sk, val); else WRITE_ONCE(sk->sk_rcvlowat, val ? 
: 1); break; - + } case SO_RCVTIMEO_OLD: case SO_RCVTIMEO_NEW: ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, @@ -1379,11 +1384,16 @@ set_sndbuf: break; case SO_PEEK_OFF: - if (sock->ops->set_peek_off) - ret = sock->ops->set_peek_off(sk, val); + { + int (*set_peek_off)(struct sock *sk, int val); + + set_peek_off = READ_ONCE(sock->ops)->set_peek_off; + if (set_peek_off) + ret = set_peek_off(sk, val); else ret = -EOPNOTSUPP; break; + } case SO_NOFCS: sock_valbool_flag(sk, SOCK_NOFCS, valbool); @@ -1816,7 +1826,7 @@ int sk_getsockopt(struct sock *sk, int level, int optname, { struct sockaddr_storage address; - lv = sock->ops->getname(sock, (struct sockaddr *)&address, 2); + lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2); if (lv < 0) return -ENOTCONN; if (lv < len) @@ -1858,7 +1868,7 @@ int sk_getsockopt(struct sock *sk, int level, int optname, break; case SO_PEEK_OFF: - if (!sock->ops->set_peek_off) + if (!READ_ONCE(sock->ops)->set_peek_off) return -EOPNOTSUPP; v.val = READ_ONCE(sk->sk_peek_off); diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index ae818ff46224..ca377159967c 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -474,8 +474,8 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, WRITE_ONCE(sk->sk_prot, &tcp_prot); /* Paired with READ_ONCE() in tcp_(get|set)sockopt() */ WRITE_ONCE(icsk->icsk_af_ops, &ipv4_specific); - sk->sk_socket->ops = &inet_stream_ops; - sk->sk_family = PF_INET; + WRITE_ONCE(sk->sk_socket->ops, &inet_stream_ops); + WRITE_ONCE(sk->sk_family, PF_INET); tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); } else { struct proto *prot = &udp_prot; @@ -488,8 +488,8 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, /* Paired with READ_ONCE(sk->sk_prot) in inet6_dgram_ops */ WRITE_ONCE(sk->sk_prot, prot); - sk->sk_socket->ops = &inet_dgram_ops; - sk->sk_family = PF_INET; + WRITE_ONCE(sk->sk_socket->ops, &inet_dgram_ops); + WRITE_ONCE(sk->sk_family, PF_INET); } /* Disable all options not to allocate memory anymore, diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 65ee949a8a44..1c079e83481e 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -67,11 +67,11 @@ static bool mptcp_is_tcpsk(struct sock *sk) * Hand the socket over to tcp so all further socket ops * bypass mptcp. 
*/ - sock->ops = &inet_stream_ops; + WRITE_ONCE(sock->ops, &inet_stream_ops); return true; #if IS_ENABLED(CONFIG_MPTCP_IPV6) } else if (unlikely(sk->sk_prot == &tcpv6_prot)) { - sock->ops = &inet6_stream_ops; + WRITE_ONCE(sock->ops, &inet6_stream_ops); return true; #endif } @@ -3683,7 +3683,7 @@ static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) goto unlock; } - err = ssock->ops->bind(ssock, uaddr, addr_len); + err = READ_ONCE(ssock->ops)->bind(ssock, uaddr, addr_len); if (!err) mptcp_copy_inaddrs(sock->sk, ssock->sk); @@ -3717,7 +3717,7 @@ static int mptcp_listen(struct socket *sock, int backlog) inet_sk_state_store(sk, TCP_LISTEN); sock_set_flag(sk, SOCK_RCU_FREE); - err = ssock->ops->listen(ssock, backlog); + err = READ_ONCE(ssock->ops)->listen(ssock, backlog); inet_sk_state_store(sk, inet_sk_state_load(ssock->sk)); if (!err) { sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); diff --git a/net/socket.c b/net/socket.c index 2b0e54b2405c..5d4e37595e9a 100644 --- a/net/socket.c +++ b/net/socket.c @@ -136,9 +136,10 @@ static void sock_splice_eof(struct file *file); static void sock_show_fdinfo(struct seq_file *m, struct file *f) { struct socket *sock = f->private_data; + const struct proto_ops *ops = READ_ONCE(sock->ops); - if (sock->ops->show_fdinfo) - sock->ops->show_fdinfo(m, sock); + if (ops->show_fdinfo) + ops->show_fdinfo(m, sock); } #else #define sock_show_fdinfo NULL @@ -646,12 +647,14 @@ EXPORT_SYMBOL(sock_alloc); static void __sock_release(struct socket *sock, struct inode *inode) { - if (sock->ops) { - struct module *owner = sock->ops->owner; + const struct proto_ops *ops = READ_ONCE(sock->ops); + + if (ops) { + struct module *owner = ops->owner; if (inode) inode_lock(inode); - sock->ops->release(sock); + ops->release(sock); sock->sk = NULL; if (inode) inode_unlock(inode); @@ -722,7 +725,7 @@ static noinline void call_trace_sock_send_length(struct sock *sk, int ret, static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg) { - int ret = INDIRECT_CALL_INET(sock->ops->sendmsg, inet6_sendmsg, + int ret = INDIRECT_CALL_INET(READ_ONCE(sock->ops)->sendmsg, inet6_sendmsg, inet_sendmsg, sock, msg, msg_data_left(msg)); BUG_ON(ret == -EIOCBQUEUED); @@ -786,13 +789,14 @@ int kernel_sendmsg_locked(struct sock *sk, struct msghdr *msg, struct kvec *vec, size_t num, size_t size) { struct socket *sock = sk->sk_socket; + const struct proto_ops *ops = READ_ONCE(sock->ops); - if (!sock->ops->sendmsg_locked) + if (!ops->sendmsg_locked) return sock_no_sendmsg_locked(sk, msg, size); iov_iter_kvec(&msg->msg_iter, ITER_SOURCE, vec, num, size); - return sock->ops->sendmsg_locked(sk, msg, msg_data_left(msg)); + return ops->sendmsg_locked(sk, msg, msg_data_left(msg)); } EXPORT_SYMBOL(kernel_sendmsg_locked); @@ -1017,7 +1021,8 @@ static noinline void call_trace_sock_recv_length(struct sock *sk, int ret, int f static inline int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg, int flags) { - int ret = INDIRECT_CALL_INET(sock->ops->recvmsg, inet6_recvmsg, + int ret = INDIRECT_CALL_INET(READ_ONCE(sock->ops)->recvmsg, + inet6_recvmsg, inet_recvmsg, sock, msg, msg_data_left(msg), flags); if (trace_sock_recv_length_enabled()) @@ -1072,19 +1077,23 @@ static ssize_t sock_splice_read(struct file *file, loff_t *ppos, unsigned int flags) { struct socket *sock = file->private_data; + const struct proto_ops *ops; - if (unlikely(!sock->ops->splice_read)) + ops = READ_ONCE(sock->ops); + if (unlikely(!ops->splice_read)) return copy_splice_read(file, ppos, pipe, 
len, flags); - return sock->ops->splice_read(sock, ppos, pipe, len, flags); + return ops->splice_read(sock, ppos, pipe, len, flags); } static void sock_splice_eof(struct file *file) { struct socket *sock = file->private_data; + const struct proto_ops *ops; - if (sock->ops->splice_eof) - sock->ops->splice_eof(sock); + ops = READ_ONCE(sock->ops); + if (ops->splice_eof) + ops->splice_eof(sock); } static ssize_t sock_read_iter(struct kiocb *iocb, struct iov_iter *to) @@ -1181,13 +1190,14 @@ EXPORT_SYMBOL(vlan_ioctl_set); static long sock_do_ioctl(struct net *net, struct socket *sock, unsigned int cmd, unsigned long arg) { + const struct proto_ops *ops = READ_ONCE(sock->ops); struct ifreq ifr; bool need_copyout; int err; void __user *argp = (void __user *)arg; void __user *data; - err = sock->ops->ioctl(sock, cmd, arg); + err = ops->ioctl(sock, cmd, arg); /* * If this ioctl is unknown try to hand it down @@ -1216,6 +1226,7 @@ static long sock_do_ioctl(struct net *net, struct socket *sock, static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg) { + const struct proto_ops *ops; struct socket *sock; struct sock *sk; void __user *argp = (void __user *)arg; @@ -1223,6 +1234,7 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg) struct net *net; sock = file->private_data; + ops = READ_ONCE(sock->ops); sk = sock->sk; net = sock_net(sk); if (unlikely(cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15))) { @@ -1280,23 +1292,23 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg) break; case SIOCGSTAMP_OLD: case SIOCGSTAMPNS_OLD: - if (!sock->ops->gettstamp) { + if (!ops->gettstamp) { err = -ENOIOCTLCMD; break; } - err = sock->ops->gettstamp(sock, argp, - cmd == SIOCGSTAMP_OLD, - !IS_ENABLED(CONFIG_64BIT)); + err = ops->gettstamp(sock, argp, + cmd == SIOCGSTAMP_OLD, + !IS_ENABLED(CONFIG_64BIT)); break; case SIOCGSTAMP_NEW: case SIOCGSTAMPNS_NEW: - if (!sock->ops->gettstamp) { + if (!ops->gettstamp) { err = -ENOIOCTLCMD; break; } - err = sock->ops->gettstamp(sock, argp, - cmd == SIOCGSTAMP_NEW, - false); + err = ops->gettstamp(sock, argp, + cmd == SIOCGSTAMP_NEW, + false); break; case SIOCGIFCONF: @@ -1357,9 +1369,10 @@ EXPORT_SYMBOL(sock_create_lite); static __poll_t sock_poll(struct file *file, poll_table *wait) { struct socket *sock = file->private_data; + const struct proto_ops *ops = READ_ONCE(sock->ops); __poll_t events = poll_requested_events(wait), flag = 0; - if (!sock->ops->poll) + if (!ops->poll) return 0; if (sk_can_busy_loop(sock->sk)) { @@ -1371,14 +1384,14 @@ static __poll_t sock_poll(struct file *file, poll_table *wait) flag = POLL_BUSY_LOOP; } - return sock->ops->poll(file, sock, wait) | flag; + return ops->poll(file, sock, wait) | flag; } static int sock_mmap(struct file *file, struct vm_area_struct *vma) { struct socket *sock = file->private_data; - return sock->ops->mmap(file, sock, vma); + return READ_ONCE(sock->ops)->mmap(file, sock, vma); } static int sock_close(struct inode *inode, struct file *filp) @@ -1728,7 +1741,7 @@ int __sys_socketpair(int family, int type, int protocol, int __user *usockvec) goto out; } - err = sock1->ops->socketpair(sock1, sock2); + err = READ_ONCE(sock1->ops)->socketpair(sock1, sock2); if (unlikely(err < 0)) { sock_release(sock2); sock_release(sock1); @@ -1789,7 +1802,7 @@ int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen) (struct sockaddr *)&address, addrlen); if (!err) - err = sock->ops->bind(sock, + err = READ_ONCE(sock->ops)->bind(sock, (struct sockaddr *) 
&address, addrlen); } @@ -1823,7 +1836,7 @@ int __sys_listen(int fd, int backlog) err = security_socket_listen(sock, backlog); if (!err) - err = sock->ops->listen(sock, backlog); + err = READ_ONCE(sock->ops)->listen(sock, backlog); fput_light(sock->file, fput_needed); } @@ -1843,6 +1856,7 @@ struct file *do_accept(struct file *file, unsigned file_flags, struct file *newfile; int err, len; struct sockaddr_storage address; + const struct proto_ops *ops; sock = sock_from_file(file); if (!sock) @@ -1851,15 +1865,16 @@ struct file *do_accept(struct file *file, unsigned file_flags, newsock = sock_alloc(); if (!newsock) return ERR_PTR(-ENFILE); + ops = READ_ONCE(sock->ops); newsock->type = sock->type; - newsock->ops = sock->ops; + newsock->ops = ops; /* * We don't need try_module_get here, as the listening socket (sock) * has the protocol module (sock->ops->owner) held. */ - __module_get(newsock->ops->owner); + __module_get(ops->owner); newfile = sock_alloc_file(newsock, flags, sock->sk->sk_prot_creator->name); if (IS_ERR(newfile)) @@ -1869,14 +1884,13 @@ struct file *do_accept(struct file *file, unsigned file_flags, if (err) goto out_fd; - err = sock->ops->accept(sock, newsock, sock->file->f_flags | file_flags, + err = ops->accept(sock, newsock, sock->file->f_flags | file_flags, false); if (err < 0) goto out_fd; if (upeer_sockaddr) { - len = newsock->ops->getname(newsock, - (struct sockaddr *)&address, 2); + len = ops->getname(newsock, (struct sockaddr *)&address, 2); if (len < 0) { err = -ECONNABORTED; goto out_fd; @@ -1989,8 +2003,8 @@ int __sys_connect_file(struct file *file, struct sockaddr_storage *address, if (err) goto out; - err = sock->ops->connect(sock, (struct sockaddr *)address, addrlen, - sock->file->f_flags | file_flags); + err = READ_ONCE(sock->ops)->connect(sock, (struct sockaddr *)address, + addrlen, sock->file->f_flags | file_flags); out: return err; } @@ -2039,7 +2053,7 @@ int __sys_getsockname(int fd, struct sockaddr __user *usockaddr, if (err) goto out_put; - err = sock->ops->getname(sock, (struct sockaddr *)&address, 0); + err = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 0); if (err < 0) goto out_put; /* "err" is actually length in this case */ @@ -2071,13 +2085,15 @@ int __sys_getpeername(int fd, struct sockaddr __user *usockaddr, sock = sockfd_lookup_light(fd, &err, &fput_needed); if (sock != NULL) { + const struct proto_ops *ops = READ_ONCE(sock->ops); + err = security_socket_getpeername(sock); if (err) { fput_light(sock->file, fput_needed); return err; } - err = sock->ops->getname(sock, (struct sockaddr *)&address, 1); + err = ops->getname(sock, (struct sockaddr *)&address, 1); if (err >= 0) /* "err" is actually length in this case */ err = move_addr_to_user(&address, err, usockaddr, @@ -2227,6 +2243,7 @@ int __sys_setsockopt(int fd, int level, int optname, char __user *user_optval, int optlen) { sockptr_t optval = USER_SOCKPTR(user_optval); + const struct proto_ops *ops; char *kernel_optval = NULL; int err, fput_needed; struct socket *sock; @@ -2255,12 +2272,13 @@ int __sys_setsockopt(int fd, int level, int optname, char __user *user_optval, if (kernel_optval) optval = KERNEL_SOCKPTR(kernel_optval); + ops = READ_ONCE(sock->ops); if (level == SOL_SOCKET && !sock_use_custom_sol_socket(sock)) err = sock_setsockopt(sock, level, optname, optval, optlen); - else if (unlikely(!sock->ops->setsockopt)) + else if (unlikely(!ops->setsockopt)) err = -EOPNOTSUPP; else - err = sock->ops->setsockopt(sock, level, optname, optval, + err = ops->setsockopt(sock, 
level, optname, optval, optlen); kfree(kernel_optval); out_put: @@ -2285,6 +2303,7 @@ int __sys_getsockopt(int fd, int level, int optname, char __user *optval, int __user *optlen) { int max_optlen __maybe_unused; + const struct proto_ops *ops; int err, fput_needed; struct socket *sock; @@ -2299,12 +2318,13 @@ int __sys_getsockopt(int fd, int level, int optname, char __user *optval, if (!in_compat_syscall()) max_optlen = BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen); + ops = READ_ONCE(sock->ops); if (level == SOL_SOCKET) err = sock_getsockopt(sock, level, optname, optval, optlen); - else if (unlikely(!sock->ops->getsockopt)) + else if (unlikely(!ops->getsockopt)) err = -EOPNOTSUPP; else - err = sock->ops->getsockopt(sock, level, optname, optval, + err = ops->getsockopt(sock, level, optname, optval, optlen); if (!in_compat_syscall()) @@ -2332,7 +2352,7 @@ int __sys_shutdown_sock(struct socket *sock, int how) err = security_socket_shutdown(sock, how); if (!err) - err = sock->ops->shutdown(sock, how); + err = READ_ONCE(sock->ops)->shutdown(sock, how); return err; } @@ -3324,6 +3344,7 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock, void __user *argp = compat_ptr(arg); struct sock *sk = sock->sk; struct net *net = sock_net(sk); + const struct proto_ops *ops; if (cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15)) return sock_ioctl(file, cmd, (unsigned long)argp); @@ -3333,10 +3354,11 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock, return compat_siocwandev(net, argp); case SIOCGSTAMP_OLD: case SIOCGSTAMPNS_OLD: - if (!sock->ops->gettstamp) + ops = READ_ONCE(sock->ops); + if (!ops->gettstamp) return -ENOIOCTLCMD; - return sock->ops->gettstamp(sock, argp, cmd == SIOCGSTAMP_OLD, - !COMPAT_USE_64BIT_TIME); + return ops->gettstamp(sock, argp, cmd == SIOCGSTAMP_OLD, + !COMPAT_USE_64BIT_TIME); case SIOCETHTOOL: case SIOCBONDSLAVEINFOQUERY: @@ -3417,6 +3439,7 @@ static long compat_sock_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct socket *sock = file->private_data; + const struct proto_ops *ops = READ_ONCE(sock->ops); int ret = -ENOIOCTLCMD; struct sock *sk; struct net *net; @@ -3424,8 +3447,8 @@ static long compat_sock_ioctl(struct file *file, unsigned int cmd, sk = sock->sk; net = sock_net(sk); - if (sock->ops->compat_ioctl) - ret = sock->ops->compat_ioctl(sock, cmd, arg); + if (ops->compat_ioctl) + ret = ops->compat_ioctl(sock, cmd, arg); if (ret == -ENOIOCTLCMD && (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)) @@ -3449,7 +3472,7 @@ static long compat_sock_ioctl(struct file *file, unsigned int cmd, int kernel_bind(struct socket *sock, struct sockaddr *addr, int addrlen) { - return sock->ops->bind(sock, addr, addrlen); + return READ_ONCE(sock->ops)->bind(sock, addr, addrlen); } EXPORT_SYMBOL(kernel_bind); @@ -3463,7 +3486,7 @@ EXPORT_SYMBOL(kernel_bind); int kernel_listen(struct socket *sock, int backlog) { - return sock->ops->listen(sock, backlog); + return READ_ONCE(sock->ops)->listen(sock, backlog); } EXPORT_SYMBOL(kernel_listen); @@ -3481,6 +3504,7 @@ EXPORT_SYMBOL(kernel_listen); int kernel_accept(struct socket *sock, struct socket **newsock, int flags) { struct sock *sk = sock->sk; + const struct proto_ops *ops = READ_ONCE(sock->ops); int err; err = sock_create_lite(sk->sk_family, sk->sk_type, sk->sk_protocol, @@ -3488,15 +3512,15 @@ int kernel_accept(struct socket *sock, struct socket **newsock, int flags) if (err < 0) goto done; - err = sock->ops->accept(sock, *newsock, flags, true); + err = ops->accept(sock, 
*newsock, flags, true); if (err < 0) { sock_release(*newsock); *newsock = NULL; goto done; } - (*newsock)->ops = sock->ops; - __module_get((*newsock)->ops->owner); + (*newsock)->ops = ops; + __module_get(ops->owner); done: return err; @@ -3519,7 +3543,7 @@ EXPORT_SYMBOL(kernel_accept); int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen, int flags) { - return sock->ops->connect(sock, addr, addrlen, flags); + return READ_ONCE(sock->ops)->connect(sock, addr, addrlen, flags); } EXPORT_SYMBOL(kernel_connect); @@ -3534,7 +3558,7 @@ EXPORT_SYMBOL(kernel_connect); int kernel_getsockname(struct socket *sock, struct sockaddr *addr) { - return sock->ops->getname(sock, addr, 0); + return READ_ONCE(sock->ops)->getname(sock, addr, 0); } EXPORT_SYMBOL(kernel_getsockname); @@ -3549,7 +3573,7 @@ EXPORT_SYMBOL(kernel_getsockname); int kernel_getpeername(struct socket *sock, struct sockaddr *addr) { - return sock->ops->getname(sock, addr, 1); + return READ_ONCE(sock->ops)->getname(sock, addr, 1); } EXPORT_SYMBOL(kernel_getpeername); @@ -3563,7 +3587,7 @@ EXPORT_SYMBOL(kernel_getpeername); int kernel_sock_shutdown(struct socket *sock, enum sock_shutdown_cmd how) { - return sock->ops->shutdown(sock, how); + return READ_ONCE(sock->ops)->shutdown(sock, how); } EXPORT_SYMBOL(kernel_sock_shutdown); diff --git a/net/unix/scm.c b/net/unix/scm.c index f9152881d77f..e9dde7176c8a 100644 --- a/net/unix/scm.c +++ b/net/unix/scm.c @@ -29,10 +29,11 @@ struct sock *unix_get_socket(struct file *filp) /* Socket ? */ if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) { struct socket *sock = SOCKET_I(inode); + const struct proto_ops *ops = READ_ONCE(sock->ops); struct sock *s = sock->sk; /* PF_UNIX ? */ - if (s && sock->ops && sock->ops->family == PF_UNIX) + if (s && ops && ops->family == PF_UNIX) u_sock = s; } else { /* Could be an io_uring instance */ -- cgit v1.2.3 From fa1891aeb7620b51992d0db86c72c0cd3cf114ab Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Tue, 8 Aug 2023 09:43:09 -0700 Subject: net/llc/llc_conn.c: fix 4 instances of -Wmissing-variable-declarations I'm looking to enable -Wmissing-variable-declarations behind W=1. 
0day bot spotted the following instances: net/llc/llc_conn.c:44:5: warning: no previous extern declaration for non-static variable 'sysctl_llc2_ack_timeout' [-Wmissing-variable-declarations] 44 | int sysctl_llc2_ack_timeout = LLC2_ACK_TIME * HZ; | ^ net/llc/llc_conn.c:44:1: note: declare 'static' if the variable is not intended to be used outside of this translation unit 44 | int sysctl_llc2_ack_timeout = LLC2_ACK_TIME * HZ; | ^ net/llc/llc_conn.c:45:5: warning: no previous extern declaration for non-static variable 'sysctl_llc2_p_timeout' [-Wmissing-variable-declarations] 45 | int sysctl_llc2_p_timeout = LLC2_P_TIME * HZ; | ^ net/llc/llc_conn.c:45:1: note: declare 'static' if the variable is not intended to be used outside of this translation unit 45 | int sysctl_llc2_p_timeout = LLC2_P_TIME * HZ; | ^ net/llc/llc_conn.c:46:5: warning: no previous extern declaration for non-static variable 'sysctl_llc2_rej_timeout' [-Wmissing-variable-declarations] 46 | int sysctl_llc2_rej_timeout = LLC2_REJ_TIME * HZ; | ^ net/llc/llc_conn.c:46:1: note: declare 'static' if the variable is not intended to be used outside of this translation unit 46 | int sysctl_llc2_rej_timeout = LLC2_REJ_TIME * HZ; | ^ net/llc/llc_conn.c:47:5: warning: no previous extern declaration for non-static variable 'sysctl_llc2_busy_timeout' [-Wmissing-variable-declarations] 47 | int sysctl_llc2_busy_timeout = LLC2_BUSY_TIME * HZ; | ^ net/llc/llc_conn.c:47:1: note: declare 'static' if the variable is not intended to be used outside of this translation unit 47 | int sysctl_llc2_busy_timeout = LLC2_BUSY_TIME * HZ; | ^ These symbols are referenced by more than one translation unit, so make include the correct header for their declarations. Finally, sort the list of includes to help keep them tidy. Reported-by: kernel test robot Closes: https://lore.kernel.org/llvm/202308081000.tTL1ElTr-lkp@intel.com/ Signed-off-by: Nick Desaulniers Reviewed-by: Simon Horman Reviewed-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20230808-llc_static-v1-1-c140c4c297e4@google.com Signed-off-by: Jakub Kicinski --- net/llc/llc_conn.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c index d037009ee10f..0a3f5e0bec00 100644 --- a/net/llc/llc_conn.c +++ b/net/llc/llc_conn.c @@ -14,14 +14,15 @@ #include #include -#include -#include -#include -#include -#include +#include #include +#include #include +#include #include +#include +#include +#include #if 0 #define dprintk(args...) printk(KERN_DEBUG args) -- cgit v1.2.3 From 09e0c3bbde901f17837ad5088a13c3985534b860 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 7 Aug 2023 22:33:14 +0300 Subject: net/sched: taprio: don't access q->qdiscs[] in unoffloaded mode during attach() This is a simple code transformation with no intended behavior change, just to make it absolutely clear that q->qdiscs[] is only attached to the child taprio classes in full offload mode. Right now we use the q->qdiscs[] variable in taprio_attach() for software mode too, but that is quite confusing and avoidable. We use it only to reach the netdev TX queue, but we could as well just use netdev_get_tx_queue() for that. 
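For reference, netdev_get_tx_queue() is a trivial accessor, so reaching the TX queue directly is cheap; a hedged sketch of the equivalent is below (consult include/linux/netdevice.h for the authoritative definition):

	/* Roughly what netdev_get_tx_queue(dev, ntx) resolves to: an index
	 * into the device's TX queue array, with no dependency on whatever
	 * child qdisc happens to sit in q->qdiscs[ntx].
	 */
	static inline struct netdev_queue *get_txq_sketch(struct net_device *dev,
							  unsigned int ntx)
	{
		return &dev->_tx[ntx];
	}
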
Signed-off-by: Vladimir Oltean Link: https://lore.kernel.org/r/20230807193324.4128292-2-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- net/sched/sch_taprio.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 8c9cfff7fd05..1cbc7fcd56c0 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -2145,14 +2145,20 @@ static void taprio_attach(struct Qdisc *sch) /* Attach underlying qdisc */ for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { - struct Qdisc *qdisc = q->qdiscs[ntx]; + struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx); struct Qdisc *old; if (FULL_OFFLOAD_IS_ENABLED(q->flags)) { + struct Qdisc *qdisc = q->qdiscs[ntx]; + qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; - old = dev_graft_qdisc(qdisc->dev_queue, qdisc); + old = dev_graft_qdisc(dev_queue, qdisc); } else { - old = dev_graft_qdisc(qdisc->dev_queue, sch); + /* In software mode, attach the root taprio qdisc + * to all netdev TX queues, so that dev_qdisc_enqueue() + * goes through taprio_enqueue(). + */ + old = dev_graft_qdisc(dev_queue, sch); qdisc_refcount_inc(sch); } if (old) -- cgit v1.2.3 From 25b0d4e4e41fcf0d2f43c431a82d344149e91258 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 7 Aug 2023 22:33:15 +0300 Subject: net/sched: taprio: keep child Qdisc refcount elevated at 2 in offload mode Normally, Qdiscs have one reference on them held by their owner and one held for each TXQ to which they are attached, however this is not the case with the children of an offloaded taprio. Instead, the taprio qdisc currently lives in the following fragile equilibrium. In the software scheduling case, taprio attaches itself (the root Qdisc) to all TXQs, thus having a refcount of 1 + the number of TX queues. In this mode, the q->qdiscs[] children are not visible directly to the Qdisc API. The lifetime of the Qdiscs from this private array lasts until qdisc_destroy() -> taprio_destroy(). In the fully offloaded case, the root taprio has a refcount of 1, and all child q->qdiscs[] also have a refcount of 1. The child q->qdiscs[] are attached to the netdev TXQs directly and thus are visible to the Qdisc API, however taprio loses a reference to them very early - during qdisc_graft(parent==NULL) -> taprio_attach(). At that time, taprio frees the q->qdiscs[] array to not leak memory, but interestingly, it does not release a reference on these qdiscs because it doesn't effectively own them - they are created by taprio but owned by the Qdisc core, and will be freed by qdisc_graft(parent==NULL, new==NULL) -> qdisc_put(old) when the Qdisc is deleted or when the child Qdisc is replaced with something else. My interest is to change this equilibrium such that taprio also owns a reference on the q->qdiscs[] child Qdiscs for the lifetime of the root Qdisc, including in full offload mode. I want this because I would like taprio_leaf(), taprio_dump_class(), taprio_dump_class_stats() to have insight into q->qdiscs[] for the software scheduling mode - currently they look at dev_queue->qdisc_sleeping, which is, as mentioned, the same as the root taprio. The following set of changes is necessary: - don't free q->qdiscs[] early in taprio_attach(), free it late in taprio_destroy() for consistency with software mode. But: - currently that's not possible, because taprio doesn't own a reference on q->qdiscs[]. 
So hold that reference - once during the initial attach() and once during subsequent graft() calls when the child is changed. - always keep track of the current child in q->qdiscs[], even for full offload mode, so that we free in taprio_destroy() what we should, and not something stale. Signed-off-by: Vladimir Oltean Link: https://lore.kernel.org/r/20230807193324.4128292-3-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- net/sched/sch_taprio.c | 39 +++++++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 1cbc7fcd56c0..7c1fc3dc3e55 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -2146,30 +2146,31 @@ static void taprio_attach(struct Qdisc *sch) /* Attach underlying qdisc */ for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx); - struct Qdisc *old; + struct Qdisc *old, *dev_queue_qdisc; if (FULL_OFFLOAD_IS_ENABLED(q->flags)) { struct Qdisc *qdisc = q->qdiscs[ntx]; + /* In offload mode, the root taprio qdisc is bypassed + * and the netdev TX queues see the children directly + */ qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; - old = dev_graft_qdisc(dev_queue, qdisc); + dev_queue_qdisc = qdisc; } else { /* In software mode, attach the root taprio qdisc * to all netdev TX queues, so that dev_qdisc_enqueue() * goes through taprio_enqueue(). */ - old = dev_graft_qdisc(dev_queue, sch); - qdisc_refcount_inc(sch); + dev_queue_qdisc = sch; } + old = dev_graft_qdisc(dev_queue, dev_queue_qdisc); + /* The qdisc's refcount requires to be elevated once + * for each netdev TX queue it is grafted onto + */ + qdisc_refcount_inc(dev_queue_qdisc); if (old) qdisc_put(old); } - - /* access to the child qdiscs is not needed in offload mode */ - if (FULL_OFFLOAD_IS_ENABLED(q->flags)) { - kfree(q->qdiscs); - q->qdiscs = NULL; - } } static struct netdev_queue *taprio_queue_get(struct Qdisc *sch, @@ -2198,13 +2199,23 @@ static int taprio_graft(struct Qdisc *sch, unsigned long cl, if (dev->flags & IFF_UP) dev_deactivate(dev); + /* In offload mode, the child Qdisc is directly attached to the netdev + * TX queue, and thus, we need to keep its refcount elevated in order + * to counteract qdisc_graft()'s call to qdisc_put() once per TX queue. + * However, save the reference to the new qdisc in the private array in + * both software and offload cases, to have an up-to-date reference to + * our children. + */ + *old = q->qdiscs[cl - 1]; if (FULL_OFFLOAD_IS_ENABLED(q->flags)) { - *old = dev_graft_qdisc(dev_queue, new); - } else { - *old = q->qdiscs[cl - 1]; - q->qdiscs[cl - 1] = new; + WARN_ON_ONCE(dev_graft_qdisc(dev_queue, new) != *old); + if (new) + qdisc_refcount_inc(new); + if (*old) + qdisc_put(*old); } + q->qdiscs[cl - 1] = new; if (new) new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; -- cgit v1.2.3 From 98766add2d55194be9f7d88628a058ceab3b06e6 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 7 Aug 2023 22:33:16 +0300 Subject: net/sched: taprio: try again to report q->qdiscs[] to qdisc_leaf() This is another stab at commit 1461d212ab27 ("net/sched: taprio: make qdisc_leaf() see the per-netdev-queue pfifo child qdiscs"), later reverted in commit af7b29b1deaa ("Revert "net/sched: taprio: make qdisc_leaf() see the per-netdev-queue pfifo child qdiscs""). I believe that the problems that caused the revert were fixed, and thus, this change is identical to the original patch. 
Its purpose is to properly reject attaching a software taprio child qdisc to a software taprio parent. Because unoffloaded taprio currently reports itself (the root Qdisc) as the return value from qdisc_leaf(), the process of attaching another taprio as a child to a Qdisc class of the root will just result in a Qdisc_ops :: change() call for the root. Whereas that's not what we want. We want Qdisc_ops :: init() to be called for the taprio child, in order to give the taprio child a chance to check whether its sch->parent is TC_H_ROOT or not (and reject this configuration). Signed-off-by: Vladimir Oltean Link: https://lore.kernel.org/r/20230807193324.4128292-4-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- net/sched/sch_taprio.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 7c1fc3dc3e55..831e6f3fda2a 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -2453,12 +2453,14 @@ start_error: static struct Qdisc *taprio_leaf(struct Qdisc *sch, unsigned long cl) { - struct netdev_queue *dev_queue = taprio_queue_get(sch, cl); + struct taprio_sched *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + unsigned int ntx = cl - 1; - if (!dev_queue) + if (ntx >= dev->num_tx_queues) return NULL; - return rtnl_dereference(dev_queue->qdisc_sleeping); + return q->qdiscs[ntx]; } static unsigned long taprio_find(struct Qdisc *sch, u32 classid) -- cgit v1.2.3 From 6e0ec800c1740372a6bbada0d98e78c2e69ba4a8 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 7 Aug 2023 22:33:17 +0300 Subject: net/sched: taprio: delete misleading comment about preallocating child qdiscs As mentioned in commit af7b29b1deaa ("Revert "net/sched: taprio: make qdisc_leaf() see the per-netdev-queue pfifo child qdiscs"") - unlike mqprio, taprio doesn't use q->qdiscs[] only as a temporary transport between Qdisc_ops :: init() and Qdisc_ops :: attach(). Delete the comment, which is just stolen from mqprio; there, the usage patterns are a lot different, and here it is nothing but confusing. Signed-off-by: Vladimir Oltean Link: https://lore.kernel.org/r/20230807193324.4128292-5-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- net/sched/sch_taprio.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'net') diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 831e6f3fda2a..d5693bd52e93 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -2099,11 +2099,8 @@ static int taprio_init(struct Qdisc *sch, struct nlattr *opt, return -EOPNOTSUPP; } - /* pre-allocate qdisc, attachment can't fail */ - q->qdiscs = kcalloc(dev->num_tx_queues, - sizeof(q->qdiscs[0]), + q->qdiscs = kcalloc(dev->num_tx_queues, sizeof(q->qdiscs[0]), GFP_KERNEL); - if (!q->qdiscs) return -ENOMEM; -- cgit v1.2.3 From 665338b2a7a0139337d1f85be65ed16e487f84c1 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 7 Aug 2023 22:33:18 +0300 Subject: net/sched: taprio: dump class stats for the actual q->qdiscs[] This makes a difference for the software scheduling mode, where dev_queue->qdisc_sleeping is the same as the taprio root Qdisc itself, but when we're talking about what Qdisc and stats get reported for a traffic class, the root taprio isn't what comes to mind, but q->qdiscs[] is. To understand the difference, I've attempted to send 100 packets in software mode through class 8001:5, and recorded the stats before and after the change.
Here is before: $ tc -s class show dev eth0 class taprio 8001:1 root leaf 8001: Sent 9400 bytes 100 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 window_drops 0 class taprio 8001:2 root leaf 8001: Sent 9400 bytes 100 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 window_drops 0 class taprio 8001:3 root leaf 8001: Sent 9400 bytes 100 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 window_drops 0 class taprio 8001:4 root leaf 8001: Sent 9400 bytes 100 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 window_drops 0 class taprio 8001:5 root leaf 8001: Sent 9400 bytes 100 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 window_drops 0 class taprio 8001:6 root leaf 8001: Sent 9400 bytes 100 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 window_drops 0 class taprio 8001:7 root leaf 8001: Sent 9400 bytes 100 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 window_drops 0 class taprio 8001:8 root leaf 8001: Sent 9400 bytes 100 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 window_drops 0 and here is after: class taprio 8001:1 root Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 window_drops 0 class taprio 8001:2 root Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 window_drops 0 class taprio 8001:3 root Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 window_drops 0 class taprio 8001:4 root Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 window_drops 0 class taprio 8001:5 root Sent 9400 bytes 100 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 window_drops 0 class taprio 8001:6 root Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 window_drops 0 class taprio 8001:7 root Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 window_drops 0 class taprio 8001:8 root leaf 800d: Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 window_drops 0 The most glaring (and expected) difference is that before, all class stats reported the global stats, whereas now, they really report just the counters for that traffic class. Finally, Pedro Tammela points out that there is a tc selftest which checks specifically which handle do the child Qdiscs corresponding to each class have. That's changing here - taprio no longer reports tcm->tcm_info as the same handle "1:" as itself (the root Qdisc), but 0 (the handle of the default pfifo child Qdiscs). Since iproute2 does not print a child Qdisc handle of 0, adjust the test's expected output. 
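For reference, one simple way to generate such per-class test traffic is to set SO_PRIORITY on the sending socket, since software taprio derives the traffic class from skb->priority through its priority map. The sketch below is only an illustration, not the tool actually used for the numbers above; it assumes the configured map steers priority 4 to the class under test, and that the route to the (example) destination 192.0.2.1 leaves through the taprio-equipped interface:

        #include <arpa/inet.h>
        #include <sys/socket.h>
        #include <unistd.h>

        int main(void)
        {
                struct sockaddr_in dst = {
                        .sin_family = AF_INET,
                        .sin_port   = htons(9),         /* discard port */
                };
                int prio = 4;   /* assumption: taprio's "map" sends prio 4 to the class under test */
                char payload[22] = { 0 };
                int sk = socket(AF_INET, SOCK_DGRAM, 0);

                inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);  /* example peer */
                setsockopt(sk, SOL_SOCKET, SO_PRIORITY, &prio, sizeof(prio));

                /* 100 packets, all classified into the same traffic class */
                for (int i = 0; i < 100; i++)
                        sendto(sk, payload, sizeof(payload), 0,
                               (struct sockaddr *)&dst, sizeof(dst));

                close(sk);
                return 0;
        }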
Link: https://lore.kernel.org/netdev/3b83fcf6-a5e8-26fb-8c8a-ec34ec4c3342@mojatatu.com/ Signed-off-by: Vladimir Oltean Link: https://lore.kernel.org/r/20230807193324.4128292-6-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- net/sched/sch_taprio.c | 8 +++----- tools/testing/selftests/tc-testing/tc-tests/qdiscs/taprio.json | 2 +- 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index d5693bd52e93..1cb5e41c0ec7 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -2472,11 +2472,11 @@ static unsigned long taprio_find(struct Qdisc *sch, u32 classid) static int taprio_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm) { - struct netdev_queue *dev_queue = taprio_queue_get(sch, cl); + struct Qdisc *child = taprio_leaf(sch, cl); tcm->tcm_parent = TC_H_ROOT; tcm->tcm_handle |= TC_H_MIN(cl); - tcm->tcm_info = rtnl_dereference(dev_queue->qdisc_sleeping)->handle; + tcm->tcm_info = child->handle; return 0; } @@ -2486,16 +2486,14 @@ static int taprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, __releases(d->lock) __acquires(d->lock) { - struct netdev_queue *dev_queue = taprio_queue_get(sch, cl); + struct Qdisc *child = taprio_leaf(sch, cl); struct tc_taprio_qopt_offload offload = { .cmd = TAPRIO_CMD_QUEUE_STATS, .queue_stats = { .queue = cl - 1, }, }; - struct Qdisc *child; - child = rtnl_dereference(dev_queue->qdisc_sleeping); if (gnet_stats_copy_basic(d, NULL, &child->bstats, true) < 0 || qdisc_qstats_copy(d, child) < 0) return -1; diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/taprio.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/taprio.json index 08d4861c2e78..860b55df0d6d 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/taprio.json +++ b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/taprio.json @@ -104,7 +104,7 @@ "cmdUnderTest": "$TC qdisc add dev $ETH root handle 1: taprio num_tc 3 map 2 2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 queues 1@0 1@0 1@0 base-time 1000000000 sched-entry S 01 300000 flags 0x1 clockid CLOCK_TAI", "expExitCode": "0", "verifyCmd": "$TC class show dev $ETH", - "matchPattern": "class taprio 1:[0-9]+ root leaf 1:", + "matchPattern": "class taprio 1:[0-9]+ root", "matchCount": "8", "teardown": [ "echo \"1\" > /sys/bus/netdevsim/del_device" -- cgit v1.2.3 From ae75336131337f926d61b7fd86a0cca3146a7620 Mon Sep 17 00:00:00 2001 From: Claudia Draghicescu Date: Wed, 10 May 2023 16:45:57 +0300 Subject: Bluetooth: Check for ISO support in controller This patch checks for ISO_BROADCASTER and ISO_SYNC_RECEIVER in controller. 
Signed-off-by: Claudia Draghicescu Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci.h | 1 + include/net/bluetooth/hci_core.h | 1 + include/net/bluetooth/mgmt.h | 2 ++ net/bluetooth/mgmt.c | 6 ++++++ 4 files changed, 10 insertions(+) (limited to 'net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 872dcb91a540..ab2f8f1817cf 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -577,6 +577,7 @@ enum { #define HCI_LE_CIS_CENTRAL 0x10 #define HCI_LE_CIS_PERIPHERAL 0x20 #define HCI_LE_ISO_BROADCASTER 0x40 +#define HCI_LE_ISO_SYNC_RECEIVER 0x80 /* Connection modes */ #define HCI_CM_ACTIVE 0x0000 diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index e01d52cb668c..da871581ef87 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1765,6 +1765,7 @@ void hci_conn_del_sysfs(struct hci_conn *conn); #define cis_peripheral_capable(dev) \ ((dev)->le_features[3] & HCI_LE_CIS_PERIPHERAL) #define bis_capable(dev) ((dev)->le_features[3] & HCI_LE_ISO_BROADCASTER) +#define sync_recv_capable(dev) ((dev)->le_features[3] & HCI_LE_ISO_SYNC_RECEIVER) #define mws_transport_config_capable(dev) (((dev)->commands[30] & 0x08) && \ (!test_bit(HCI_QUIRK_BROKEN_MWS_TRANSPORT_CONFIG, &(dev)->quirks))) diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index 5e68b3dd4422..d382679efd2b 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -111,6 +111,8 @@ struct mgmt_rp_read_index_list { #define MGMT_SETTING_WIDEBAND_SPEECH BIT(17) #define MGMT_SETTING_CIS_CENTRAL BIT(18) #define MGMT_SETTING_CIS_PERIPHERAL BIT(19) +#define MGMT_SETTING_ISO_BROADCASTER BIT(20) +#define MGMT_SETTING_ISO_SYNC_RECEIVER BIT(21) #define MGMT_OP_READ_INFO 0x0004 #define MGMT_READ_INFO_SIZE 0 diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index d4498037fadc..4c352abe063b 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -944,6 +944,12 @@ static u32 get_current_settings(struct hci_dev *hdev) if (cis_peripheral_capable(hdev)) settings |= MGMT_SETTING_CIS_PERIPHERAL; + if (bis_capable(hdev)) + settings |= MGMT_SETTING_ISO_BROADCASTER; + + if (sync_recv_capable(hdev)) + settings |= MGMT_SETTING_ISO_SYNC_RECEIVER; + return settings; } -- cgit v1.2.3 From a0bfde167b506423111ddb8cd71930497a40fc54 Mon Sep 17 00:00:00 2001 From: Iulia Tanasescu Date: Tue, 30 May 2023 17:21:59 +0300 Subject: Bluetooth: ISO: Add support for connecting multiple BISes It is required for some configurations to have multiple BISes as part of the same BIG. Similar to the flow implemented for unicast, DEFER_SETUP will also be used to bind multiple BISes for the same BIG, before starting Periodic Advertising and creating the BIG. The user will have to open a new socket for each BIS. By setting the BT_DEFER_SETUP socket option and calling connect, a new connection will be added for the BIG and advertising handle set by the socket QoS parameters. Since all BISes will be bound for the same BIG and advertising handle, the socket QoS options and base parameters should match for all connections. By calling connect on a socket that does not have the BT_DEFER_SETUP option set, periodic advertising will be started and the BIG will be created, with a BIS for each previously bound connection. 
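As a rough illustration of this flow (a sketch, not code taken from isotest), a broadcaster could bind two BISes into one BIG as below. The ISO socket definitions (struct sockaddr_iso, BTPROTO_ISO) are assumed to come from the BlueZ tree; adapter selection, QoS setup and error handling are omitted:

        #include <stdbool.h>
        #include <string.h>
        #include <sys/socket.h>
        #include <bluetooth/bluetooth.h>  /* bacpy(), BDADDR_ANY, BT_DEFER_SETUP */
        #include "iso.h"                  /* struct sockaddr_iso, BTPROTO_ISO (BlueZ lib/) */

        static int bis_socket(bool defer)
        {
                struct sockaddr_iso addr;
                int sk, opt = defer;

                sk = socket(PF_BLUETOOTH, SOCK_SEQPACKET, BTPROTO_ISO);
                if (sk < 0)
                        return -1;

                /* With BT_DEFER_SETUP set, connect() only binds this BIS to the BIG */
                setsockopt(sk, SOL_BLUETOOTH, BT_DEFER_SETUP, &opt, sizeof(opt));

                memset(&addr, 0, sizeof(addr));
                addr.iso_family = AF_BLUETOOTH;
                bacpy(&addr.iso_bdaddr, BDADDR_ANY);    /* broadcast: no peer address */
                addr.iso_bdaddr_type = BDADDR_LE_PUBLIC;

                if (connect(sk, (struct sockaddr *)&addr, sizeof(addr)) < 0)
                        return -1;

                return sk;
        }

        int main(void)
        {
                int bis1 = bis_socket(true);   /* BIS #1: bound only */
                int bis2 = bis_socket(false);  /* BIS #2: starts PA and creates the
                                                * BIG with both bound BISes */
                return bis1 < 0 || bis2 < 0;
        }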
Since a BIG cannot be reconfigured with additional BISes after creation, no more connections can be bound for the BIG after the start periodic advertising and create BIG commands have been queued. The bis_cleanup function has also been updated, so that the advertising set and the BIG will not be terminated unless there are no more bound or connected BISes. The HCI_CONN_BIG_CREATED connection flag has been added to indicate that the BIG has been successfully created. This flag is checked at bis_cleanup, so that the BIG is only terminated if the HCI_LE_Create_BIG_Complete has been received. This implementation has been tested on hardware, using the "isotest" tool with an additional command line option, to specify the number of BISes to create as part of the desired BIG: tools/isotest -i hci0 -s 00:00:00:00:00:00 -N 2 -G 1 -T 1 The btmon log shows that a BIG containing 2 BISes has been created: < HCI Command: LE Create Broadcast Isochronous Group (0x08|0x0068) plen 31 Handle: 0x01 Advertising Handle: 0x01 Number of BIS: 2 SDU Interval: 10000 us (0x002710) Maximum SDU size: 40 Maximum Latency: 10 ms (0x000a) RTN: 0x02 PHY: LE 2M (0x02) Packing: Sequential (0x00) Framing: Unframed (0x00) Encryption: 0x00 Broadcast Code: 00000000000000000000000000000000 > HCI Event: Command Status (0x0f) plen 4 LE Create Broadcast Isochronous Group (0x08|0x0068) ncmd 1 Status: Success (0x00) > HCI Event: LE Meta Event (0x3e) plen 23 LE Broadcast Isochronous Group Complete (0x1b) Status: Success (0x00) Handle: 0x01 BIG Synchronization Delay: 1974 us (0x0007b6) Transport Latency: 1974 us (0x0007b6) PHY: LE 2M (0x02) NSE: 3 BN: 1 PTO: 1 IRC: 3 Maximum PDU: 40 ISO Interval: 10.00 msec (0x0008) Connection Handle #0: 10 Connection Handle #1: 11 < HCI Command: LE Setup Isochronous Data Path (0x08|0x006e) plen 13 Handle: 10 Data Path Direction: Input (Host to Controller) (0x00) Data Path: HCI (0x00) Coding Format: Transparent (0x03) Company Codec ID: Ericsson Technology Licensing (0) Vendor Codec ID: 0 Controller Delay: 0 us (0x000000) Codec Configuration Length: 0 Codec Configuration: > HCI Event: Command Complete (0x0e) plen 6 LE Setup Isochronous Data Path (0x08|0x006e) ncmd 1 Status: Success (0x00) Handle: 10 < HCI Command: LE Setup Isochronous Data Path (0x08|0x006e) plen 13 Handle: 11 Data Path Direction: Input (Host to Controller) (0x00) Data Path: HCI (0x00) Coding Format: Transparent (0x03) Company Codec ID: Ericsson Technology Licensing (0) Vendor Codec ID: 0 Controller Delay: 0 us (0x000000) Codec Configuration Length: 0 Codec Configuration: > HCI Event: Command Complete (0x0e) plen 6 LE Setup Isochronous Data Path (0x08|0x006e) ncmd 1 Status: Success (0x00) Handle: 11 < ISO Data TX: Handle 10 flags 0x02 dlen 44 < ISO Data TX: Handle 11 flags 0x02 dlen 44 > HCI Event: Number of Completed Packets (0x13) plen 5 Num handles: 1 Handle: 10 Count: 1 > HCI Event: Number of Completed Packets (0x13) plen 5 Num handles: 1 Handle: 11 Count: 1 Signed-off-by: Iulia Tanasescu Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 30 ++++++++ net/bluetooth/hci_conn.c | 152 +++++++++++++++++++++++++++------------ net/bluetooth/hci_event.c | 52 ++++++++------ net/bluetooth/iso.c | 28 ++++++-- 4 files changed, 189 insertions(+), 73 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index da871581ef87..c0bb58f1e86f 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -974,6 +974,7 @@ enum { HCI_CONN_SCANNING, 
HCI_CONN_AUTH_FAILURE, HCI_CONN_PER_ADV, + HCI_CONN_BIG_CREATED, }; static inline bool hci_conn_ssp_enabled(struct hci_conn *conn) @@ -1115,6 +1116,32 @@ static inline struct hci_conn *hci_conn_hash_lookup_bis(struct hci_dev *hdev, return NULL; } +static inline struct hci_conn * +hci_conn_hash_lookup_per_adv_bis(struct hci_dev *hdev, + bdaddr_t *ba, + __u8 big, __u8 bis) +{ + struct hci_conn_hash *h = &hdev->conn_hash; + struct hci_conn *c; + + rcu_read_lock(); + + list_for_each_entry_rcu(c, &h->list, list) { + if (bacmp(&c->dst, ba) || c->type != ISO_LINK || + !test_bit(HCI_CONN_PER_ADV, &c->flags)) + continue; + + if (c->iso_qos.bcast.big == big && + c->iso_qos.bcast.bis == bis) { + rcu_read_unlock(); + return c; + } + } + rcu_read_unlock(); + + return NULL; +} + static inline struct hci_conn *hci_conn_hash_lookup_handle(struct hci_dev *hdev, __u16 handle) { @@ -1351,6 +1378,9 @@ struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst, __u16 setting, struct bt_codec *codec); struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type, struct bt_iso_qos *qos); +struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, + struct bt_iso_qos *qos, + __u8 base_len, __u8 *base); struct hci_conn *hci_connect_cis(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type, struct bt_iso_qos *qos); struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst, diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 76222565e2df..13c266dbee67 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -792,6 +792,7 @@ struct iso_list_data { }; int count; struct iso_cig_params pdu; + bool big_term; }; static void bis_list(struct hci_conn *conn, void *data) @@ -828,11 +829,8 @@ static int terminate_big_sync(struct hci_dev *hdev, void *data) hci_remove_ext_adv_instance_sync(hdev, d->bis, NULL); - /* Check if ISO connection is a BIS and terminate BIG if there are - * no other connections using it. 
- */ - hci_conn_hash_list_state(hdev, find_bis, ISO_LINK, BT_CONNECTED, d); - if (d->count) + /* Only terminate BIG if it has been created */ + if (!d->big_term) return 0; return hci_le_terminate_big_sync(hdev, d->big, @@ -844,19 +842,21 @@ static void terminate_big_destroy(struct hci_dev *hdev, void *data, int err) kfree(data); } -static int hci_le_terminate_big(struct hci_dev *hdev, u8 big, u8 bis) +static int hci_le_terminate_big(struct hci_dev *hdev, struct hci_conn *conn) { struct iso_list_data *d; int ret; - bt_dev_dbg(hdev, "big 0x%2.2x bis 0x%2.2x", big, bis); + bt_dev_dbg(hdev, "big 0x%2.2x bis 0x%2.2x", conn->iso_qos.bcast.big, + conn->iso_qos.bcast.bis); d = kzalloc(sizeof(*d), GFP_KERNEL); if (!d) return -ENOMEM; - d->big = big; - d->bis = bis; + d->big = conn->iso_qos.bcast.big; + d->bis = conn->iso_qos.bcast.bis; + d->big_term = test_and_clear_bit(HCI_CONN_BIG_CREATED, &conn->flags); ret = hci_cmd_sync_queue(hdev, terminate_big_sync, d, terminate_big_destroy); @@ -916,6 +916,7 @@ static int hci_le_big_terminate(struct hci_dev *hdev, u8 big, u16 sync_handle) static void bis_cleanup(struct hci_conn *conn) { struct hci_dev *hdev = conn->hdev; + struct hci_conn *bis; bt_dev_dbg(hdev, "conn %p", conn); @@ -923,8 +924,16 @@ static void bis_cleanup(struct hci_conn *conn) if (!test_and_clear_bit(HCI_CONN_PER_ADV, &conn->flags)) return; - hci_le_terminate_big(hdev, conn->iso_qos.bcast.big, - conn->iso_qos.bcast.bis); + /* Check if ISO connection is a BIS and terminate advertising + * set and BIG if there are no other connections using it. + */ + bis = hci_conn_hash_lookup_bis(hdev, BDADDR_ANY, + conn->iso_qos.bcast.big, + conn->iso_qos.bcast.bis); + if (bis) + return; + + hci_le_terminate_big(hdev, conn); } else { hci_le_big_terminate(hdev, conn->iso_qos.bcast.big, conn->sync_handle); @@ -1495,10 +1504,10 @@ static int qos_set_bis(struct hci_dev *hdev, struct bt_iso_qos *qos) /* This function requires the caller holds hdev->lock */ static struct hci_conn *hci_add_bis(struct hci_dev *hdev, bdaddr_t *dst, - struct bt_iso_qos *qos) + struct bt_iso_qos *qos, __u8 base_len, + __u8 *base) { struct hci_conn *conn; - struct iso_list_data data; int err; /* Let's make sure that le is enabled.*/ @@ -1516,24 +1525,27 @@ static struct hci_conn *hci_add_bis(struct hci_dev *hdev, bdaddr_t *dst, if (err) return ERR_PTR(err); - data.big = qos->bcast.big; - data.bis = qos->bcast.bis; - data.count = 0; - - /* Check if there is already a matching BIG/BIS */ - hci_conn_hash_list_state(hdev, bis_list, ISO_LINK, BT_BOUND, &data); - if (data.count) + /* Check if the LE Create BIG command has already been sent */ + conn = hci_conn_hash_lookup_per_adv_bis(hdev, dst, qos->bcast.big, + qos->bcast.big); + if (conn) return ERR_PTR(-EADDRINUSE); - conn = hci_conn_hash_lookup_bis(hdev, dst, qos->bcast.big, qos->bcast.bis); - if (conn) + /* Check BIS settings against other bound BISes, since all + * BISes in a BIG must have the same value for all parameters + */ + conn = hci_conn_hash_lookup_bis(hdev, dst, qos->bcast.big, + qos->bcast.bis); + + if (conn && (memcmp(qos, &conn->iso_qos, sizeof(*qos)) || + base_len != conn->le_per_adv_data_len || + memcmp(conn->le_per_adv_data, base, base_len))) return ERR_PTR(-EADDRINUSE); conn = hci_conn_add(hdev, ISO_LINK, dst, HCI_ROLE_MASTER); if (!conn) return ERR_PTR(-ENOMEM); - set_bit(HCI_CONN_PER_ADV, &conn->flags); conn->state = BT_CONNECT; hci_conn_hold(conn); @@ -1747,12 +1759,21 @@ static int hci_le_create_big(struct hci_conn *conn, struct bt_iso_qos *qos) { struct hci_dev 
*hdev = conn->hdev; struct hci_cp_le_create_big cp; + struct iso_list_data data; memset(&cp, 0, sizeof(cp)); + data.big = qos->bcast.big; + data.bis = qos->bcast.bis; + data.count = 0; + + /* Create a BIS for each bound connection */ + hci_conn_hash_list_state(hdev, bis_list, ISO_LINK, + BT_BOUND, &data); + cp.handle = qos->bcast.big; cp.adv_handle = qos->bcast.bis; - cp.num_bis = 0x01; + cp.num_bis = data.count; hci_cpu_to_le24(qos->bcast.out.interval, cp.bis.sdu_interval); cp.bis.sdu = cpu_to_le16(qos->bcast.out.sdu); cp.bis.latency = cpu_to_le16(qos->bcast.out.latency); @@ -2051,16 +2072,6 @@ static void hci_iso_qos_setup(struct hci_dev *hdev, struct hci_conn *conn, qos->latency = conn->le_conn_latency; } -static void hci_bind_bis(struct hci_conn *conn, - struct bt_iso_qos *qos) -{ - /* Update LINK PHYs according to QoS preference */ - conn->le_tx_phy = qos->bcast.out.phy; - conn->le_tx_phy = qos->bcast.out.phy; - conn->iso_qos = *qos; - conn->state = BT_BOUND; -} - static int create_big_sync(struct hci_dev *hdev, void *data) { struct hci_conn *conn = data; @@ -2183,27 +2194,80 @@ static void create_big_complete(struct hci_dev *hdev, void *data, int err) } } -struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst, - __u8 dst_type, struct bt_iso_qos *qos, - __u8 base_len, __u8 *base) +struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, + struct bt_iso_qos *qos, + __u8 base_len, __u8 *base) { struct hci_conn *conn; - int err; + __u8 eir[HCI_MAX_PER_AD_LENGTH]; + + if (base_len && base) + base_len = eir_append_service_data(eir, 0, 0x1851, + base, base_len); /* We need hci_conn object using the BDADDR_ANY as dst */ - conn = hci_add_bis(hdev, dst, qos); + conn = hci_add_bis(hdev, dst, qos, base_len, eir); if (IS_ERR(conn)) return conn; - hci_bind_bis(conn, qos); + /* Update LINK PHYs according to QoS preference */ + conn->le_tx_phy = qos->bcast.out.phy; + conn->le_tx_phy = qos->bcast.out.phy; /* Add Basic Announcement into Peridic Adv Data if BASE is set */ if (base_len && base) { - base_len = eir_append_service_data(conn->le_per_adv_data, 0, - 0x1851, base, base_len); + memcpy(conn->le_per_adv_data, eir, sizeof(eir)); conn->le_per_adv_data_len = base_len; } + hci_iso_qos_setup(hdev, conn, &qos->bcast.out, + conn->le_tx_phy ? 
conn->le_tx_phy : + hdev->le_tx_def_phys); + + conn->iso_qos = *qos; + conn->state = BT_BOUND; + + return conn; +} + +static void bis_mark_per_adv(struct hci_conn *conn, void *data) +{ + struct iso_list_data *d = data; + + /* Skip if not broadcast/ANY address */ + if (bacmp(&conn->dst, BDADDR_ANY)) + return; + + if (d->big != conn->iso_qos.bcast.big || + d->bis == BT_ISO_QOS_BIS_UNSET || + d->bis != conn->iso_qos.bcast.bis) + return; + + set_bit(HCI_CONN_PER_ADV, &conn->flags); +} + +struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst, + __u8 dst_type, struct bt_iso_qos *qos, + __u8 base_len, __u8 *base) +{ + struct hci_conn *conn; + int err; + struct iso_list_data data; + + conn = hci_bind_bis(hdev, dst, qos, base_len, base); + if (IS_ERR(conn)) + return conn; + + data.big = qos->bcast.big; + data.bis = qos->bcast.bis; + + /* Set HCI_CONN_PER_ADV for all bound connections, to mark that + * the start periodic advertising and create BIG commands have + * been queued + */ + hci_conn_hash_list_state(hdev, bis_mark_per_adv, ISO_LINK, + BT_BOUND, &data); + /* Queue start periodic advertising and create BIG */ err = hci_cmd_sync_queue(hdev, create_big_sync, conn, create_big_complete); @@ -2212,10 +2276,6 @@ struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst, return ERR_PTR(err); } - hci_iso_qos_setup(hdev, conn, &qos->bcast.out, - conn->le_tx_phy ? conn->le_tx_phy : - hdev->le_tx_def_phys); - return conn; } diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 31ca320ce38d..86a7e4b3b98d 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -6936,6 +6936,7 @@ static void hci_le_create_big_complete_evt(struct hci_dev *hdev, void *data, { struct hci_evt_le_create_big_complete *ev = data; struct hci_conn *conn; + __u8 bis_idx = 0; BT_DBG("%s status 0x%2.2x", hdev->name, ev->status); @@ -6944,33 +6945,44 @@ static void hci_le_create_big_complete_evt(struct hci_dev *hdev, void *data, return; hci_dev_lock(hdev); + rcu_read_lock(); - conn = hci_conn_hash_lookup_big(hdev, ev->handle); - if (!conn) - goto unlock; + /* Connect all BISes that are bound to the BIG */ + list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) { + if (bacmp(&conn->dst, BDADDR_ANY) || + conn->type != ISO_LINK || + conn->iso_qos.bcast.big != ev->handle) + continue; - if (conn->type != ISO_LINK) { - bt_dev_err(hdev, - "Invalid connection link type handle 0x%2.2x", - ev->handle); - goto unlock; - } + conn->handle = __le16_to_cpu(ev->bis_handle[bis_idx++]); - if (ev->num_bis) - conn->handle = __le16_to_cpu(ev->bis_handle[0]); + if (!ev->status) { + conn->state = BT_CONNECTED; + set_bit(HCI_CONN_BIG_CREATED, &conn->flags); + rcu_read_unlock(); + hci_debugfs_create_conn(conn); + hci_conn_add_sysfs(conn); + hci_iso_setup_path(conn); + rcu_read_lock(); + continue; + } - if (!ev->status) { - conn->state = BT_CONNECTED; - hci_debugfs_create_conn(conn); - hci_conn_add_sysfs(conn); - hci_iso_setup_path(conn); - goto unlock; + hci_connect_cfm(conn, ev->status); + rcu_read_unlock(); + hci_conn_del(conn); + rcu_read_lock(); } - hci_connect_cfm(conn, ev->status); - hci_conn_del(conn); + if (!ev->status && !bis_idx) + /* If no BISes have been connected for the BIG, + * terminate. This is in case all bound connections + * have been closed before the BIG creation + * has completed. 
+ */ + hci_le_terminate_big_sync(hdev, ev->handle, + HCI_ERROR_LOCAL_HOST_TERM); -unlock: + rcu_read_unlock(); hci_dev_unlock(hdev); } diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index 505d62247268..5808d57c1d7b 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -287,13 +287,24 @@ static int iso_connect_bis(struct sock *sk) goto unlock; } - hcon = hci_connect_bis(hdev, &iso_pi(sk)->dst, - le_addr_type(iso_pi(sk)->dst_type), - &iso_pi(sk)->qos, iso_pi(sk)->base_len, - iso_pi(sk)->base); - if (IS_ERR(hcon)) { - err = PTR_ERR(hcon); - goto unlock; + /* Just bind if DEFER_SETUP has been set */ + if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) { + hcon = hci_bind_bis(hdev, &iso_pi(sk)->dst, + &iso_pi(sk)->qos, iso_pi(sk)->base_len, + iso_pi(sk)->base); + if (IS_ERR(hcon)) { + err = PTR_ERR(hcon); + goto unlock; + } + } else { + hcon = hci_connect_bis(hdev, &iso_pi(sk)->dst, + le_addr_type(iso_pi(sk)->dst_type), + &iso_pi(sk)->qos, iso_pi(sk)->base_len, + iso_pi(sk)->base); + if (IS_ERR(hcon)) { + err = PTR_ERR(hcon); + goto unlock; + } } conn = iso_conn_add(hcon); @@ -317,6 +328,9 @@ static int iso_connect_bis(struct sock *sk) if (hcon->state == BT_CONNECTED) { iso_sock_clear_timer(sk); sk->sk_state = BT_CONNECTED; + } else if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) { + iso_sock_clear_timer(sk); + sk->sk_state = BT_CONNECT; } else { sk->sk_state = BT_CONNECT; iso_sock_set_timer(sk, sk->sk_sndtimeo); -- cgit v1.2.3 From 7f74563e6140e42b4ffae62adbef7a65967a3f98 Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Thu, 1 Jun 2023 09:34:46 +0300 Subject: Bluetooth: ISO: do not emit new LE Create CIS if previous is pending LE Create CIS command shall not be sent before all CIS Established events from its previous invocation have been processed. Currently it is sent via hci_sync but that only waits for the first event, but there can be multiple. Make it wait for all events, and simplify the CIS creation as follows: Add new flag HCI_CONN_CREATE_CIS, which is set if Create CIS has been sent for the connection but it is not yet completed. Make BT_CONNECT state to mean the connection wants Create CIS. On events after which new Create CIS may need to be sent, send it if possible and some connections need it. These events are: hci_connect_cis, iso_connect_cfm, hci_cs_le_create_cis, hci_le_cis_estabilished_evt. The Create CIS status/completion events shall queue new Create CIS only if at least one of the connections transitions away from BT_CONNECT, so that we don't loop if controller is sending bogus events. This fixes sending multiple CIS Create for the same CIS in the "ISO AC 6(i) - Success" BlueZ test case: < HCI Command: LE Create Co.. (0x08|0x0064) plen 9 #129 [hci0] Number of CIS: 2 CIS Handle: 257 ACL Handle: 42 CIS Handle: 258 ACL Handle: 42 > HCI Event: Command Status (0x0f) plen 4 #130 [hci0] LE Create Connected Isochronous Stream (0x08|0x0064) ncmd 1 Status: Success (0x00) > HCI Event: LE Meta Event (0x3e) plen 29 #131 [hci0] LE Connected Isochronous Stream Established (0x19) Status: Success (0x00) Connection Handle: 257 ... < HCI Command: LE Setup Is.. (0x08|0x006e) plen 13 #132 [hci0] ... > HCI Event: Command Complete (0x0e) plen 6 #133 [hci0] LE Setup Isochronous Data Path (0x08|0x006e) ncmd 1 ... < HCI Command: LE Create Co.. 
(0x08|0x0064) plen 5 #134 [hci0] Number of CIS: 1 CIS Handle: 258 ACL Handle: 42 > HCI Event: Command Status (0x0f) plen 4 #135 [hci0] LE Create Connected Isochronous Stream (0x08|0x0064) ncmd 1 Status: ACL Connection Already Exists (0x0b) > HCI Event: LE Meta Event (0x3e) plen 29 #136 [hci0] LE Connected Isochronous Stream Established (0x19) Status: Success (0x00) Connection Handle: 258 ... Fixes: c09b80be6ffc ("Bluetooth: hci_conn: Fix not waiting for HCI_EVT_LE_CIS_ESTABLISHED") Signed-off-by: Pauli Virtanen Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 4 +- include/net/bluetooth/hci_sync.h | 2 +- net/bluetooth/hci_conn.c | 74 ++++++++++++++------------------- net/bluetooth/hci_event.c | 25 +++++++++-- net/bluetooth/hci_sync.c | 90 ++++++++++++++++++++++++++++------------ net/bluetooth/iso.c | 2 +- 6 files changed, 119 insertions(+), 78 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index c0bb58f1e86f..ad39d09e9bd6 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -975,6 +975,7 @@ enum { HCI_CONN_AUTH_FAILURE, HCI_CONN_PER_ADV, HCI_CONN_BIG_CREATED, + HCI_CONN_CREATE_CIS, }; static inline bool hci_conn_ssp_enabled(struct hci_conn *conn) @@ -1351,7 +1352,8 @@ int hci_disconnect(struct hci_conn *conn, __u8 reason); bool hci_setup_sync(struct hci_conn *conn, __u16 handle); void hci_sco_setup(struct hci_conn *conn, __u8 status); bool hci_iso_setup_path(struct hci_conn *conn); -int hci_le_create_cis(struct hci_conn *conn); +int hci_le_create_cis_pending(struct hci_dev *hdev); +int hci_conn_check_create_cis(struct hci_conn *conn); struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst, u8 role); diff --git a/include/net/bluetooth/hci_sync.h b/include/net/bluetooth/hci_sync.h index 2495be4d8b82..b516a0f4a55b 100644 --- a/include/net/bluetooth/hci_sync.h +++ b/include/net/bluetooth/hci_sync.h @@ -124,7 +124,7 @@ int hci_abort_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason); int hci_le_create_conn_sync(struct hci_dev *hdev, struct hci_conn *conn); -int hci_le_create_cis_sync(struct hci_dev *hdev, struct hci_conn *conn); +int hci_le_create_cis_sync(struct hci_dev *hdev); int hci_le_remove_cig_sync(struct hci_dev *hdev, u8 handle); diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 13c266dbee67..4b5223e62141 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -1990,59 +1990,47 @@ bool hci_iso_setup_path(struct hci_conn *conn) return true; } -static int hci_create_cis_sync(struct hci_dev *hdev, void *data) +int hci_conn_check_create_cis(struct hci_conn *conn) { - return hci_le_create_cis_sync(hdev, data); -} + if (conn->type != ISO_LINK || !bacmp(&conn->dst, BDADDR_ANY)) + return -EINVAL; -int hci_le_create_cis(struct hci_conn *conn) -{ - struct hci_conn *cis; - struct hci_link *link, *t; - struct hci_dev *hdev = conn->hdev; - int err; + if (!conn->parent || conn->parent->state != BT_CONNECTED || + conn->state != BT_CONNECT || conn->handle == HCI_CONN_HANDLE_UNSET) + return 1; - bt_dev_dbg(hdev, "hcon %p", conn); + return 0; +} - switch (conn->type) { - case LE_LINK: - if (conn->state != BT_CONNECTED || list_empty(&conn->link_list)) - return -EINVAL; +static int hci_create_cis_sync(struct hci_dev *hdev, void *data) +{ + return hci_le_create_cis_sync(hdev); +} - cis = NULL; +int hci_le_create_cis_pending(struct hci_dev *hdev) +{ + struct hci_conn *conn; + bool pending = false; - /* 
hci_conn_link uses list_add_tail_rcu so the list is in - * the same order as the connections are requested. - */ - list_for_each_entry_safe(link, t, &conn->link_list, list) { - if (link->conn->state == BT_BOUND) { - err = hci_le_create_cis(link->conn); - if (err) - return err; + rcu_read_lock(); - cis = link->conn; - } + list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) { + if (test_bit(HCI_CONN_CREATE_CIS, &conn->flags)) { + rcu_read_unlock(); + return -EBUSY; } - return cis ? 0 : -EINVAL; - case ISO_LINK: - cis = conn; - break; - default: - return -EINVAL; + if (!hci_conn_check_create_cis(conn)) + pending = true; } - if (cis->state == BT_CONNECT) + rcu_read_unlock(); + + if (!pending) return 0; /* Queue Create CIS */ - err = hci_cmd_sync_queue(hdev, hci_create_cis_sync, cis, NULL); - if (err) - return err; - - cis->state = BT_CONNECT; - - return 0; + return hci_cmd_sync_queue(hdev, hci_create_cis_sync, NULL, NULL); } static void hci_iso_qos_setup(struct hci_dev *hdev, struct hci_conn *conn, @@ -2317,11 +2305,9 @@ struct hci_conn *hci_connect_cis(struct hci_dev *hdev, bdaddr_t *dst, return ERR_PTR(-ENOLINK); } - /* If LE is already connected and CIS handle is already set proceed to - * Create CIS immediately. - */ - if (le->state == BT_CONNECTED && cis->handle != HCI_CONN_HANDLE_UNSET) - hci_le_create_cis(cis); + cis->state = BT_CONNECT; + + hci_le_create_cis_pending(hdev); return cis; } diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 86a7e4b3b98d..e7249ed3f5f2 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -3810,6 +3810,7 @@ static u8 hci_cc_le_set_cig_params(struct hci_dev *hdev, void *data, struct hci_cp_le_set_cig_params *cp; struct hci_conn *conn; u8 status = rp->status; + bool pending = false; int i; bt_dev_dbg(hdev, "status 0x%2.2x", rp->status); @@ -3852,13 +3853,15 @@ static u8 hci_cc_le_set_cig_params(struct hci_dev *hdev, void *data, bt_dev_dbg(hdev, "%p handle 0x%4.4x parent %p", conn, conn->handle, conn->parent); - - /* Create CIS if LE is already connected */ - if (conn->parent && conn->parent->state == BT_CONNECTED) - hci_le_create_cis(conn); + + if (conn->state == BT_CONNECT) + pending = true; } unlock: + if (pending) + hci_le_create_cis_pending(hdev); + hci_dev_unlock(hdev); return rp->status; @@ -4224,6 +4227,7 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, void *data, static void hci_cs_le_create_cis(struct hci_dev *hdev, u8 status) { struct hci_cp_le_create_cis *cp; + bool pending = false; int i; bt_dev_dbg(hdev, "status 0x%2.2x", status); @@ -4246,12 +4250,18 @@ static void hci_cs_le_create_cis(struct hci_dev *hdev, u8 status) conn = hci_conn_hash_lookup_handle(hdev, handle); if (conn) { + if (test_and_clear_bit(HCI_CONN_CREATE_CIS, + &conn->flags)) + pending = true; conn->state = BT_CLOSED; hci_connect_cfm(conn, status); hci_conn_del(conn); } } + if (pending) + hci_le_create_cis_pending(hdev); + hci_dev_unlock(hdev); } @@ -6790,6 +6800,7 @@ static void hci_le_cis_estabilished_evt(struct hci_dev *hdev, void *data, struct hci_evt_le_cis_established *ev = data; struct hci_conn *conn; struct bt_iso_qos *qos; + bool pending = false; u16 handle = __le16_to_cpu(ev->handle); bt_dev_dbg(hdev, "status 0x%2.2x", ev->status); @@ -6813,6 +6824,8 @@ static void hci_le_cis_estabilished_evt(struct hci_dev *hdev, void *data, qos = &conn->iso_qos; + pending = test_and_clear_bit(HCI_CONN_CREATE_CIS, &conn->flags); + /* Convert ISO Interval (1.25 ms slots) to SDU Interval (us) */ qos->ucast.in.interval = 
le16_to_cpu(ev->interval) * 1250; qos->ucast.out.interval = qos->ucast.in.interval; @@ -6854,10 +6867,14 @@ static void hci_le_cis_estabilished_evt(struct hci_dev *hdev, void *data, goto unlock; } + conn->state = BT_CLOSED; hci_connect_cfm(conn, ev->status); hci_conn_del(conn); unlock: + if (pending) + hci_le_create_cis_pending(hdev); + hci_dev_unlock(hdev); } diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 4d1e32bb6a9c..b617d1dd247a 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -6260,56 +6260,92 @@ done: return err; } -int hci_le_create_cis_sync(struct hci_dev *hdev, struct hci_conn *conn) +int hci_le_create_cis_sync(struct hci_dev *hdev) { struct { struct hci_cp_le_create_cis cp; struct hci_cis cis[0x1f]; } cmd; - u8 cig; - struct hci_conn *hcon = conn; + struct hci_conn *conn; + u8 cig = BT_ISO_QOS_CIG_UNSET; + + /* The spec allows only one pending LE Create CIS command at a time. If + * the command is pending now, don't do anything. We check for pending + * connections after each CIS Established event. + * + * BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E + * page 2566: + * + * If the Host issues this command before all the + * HCI_LE_CIS_Established events from the previous use of the + * command have been generated, the Controller shall return the + * error code Command Disallowed (0x0C). + * + * BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E + * page 2567: + * + * When the Controller receives the HCI_LE_Create_CIS command, the + * Controller sends the HCI_Command_Status event to the Host. An + * HCI_LE_CIS_Established event will be generated for each CIS when it + * is established or if it is disconnected or considered lost before + * being established; until all the events are generated, the command + * remains pending. + */ memset(&cmd, 0, sizeof(cmd)); - cmd.cis[0].acl_handle = cpu_to_le16(conn->parent->handle); - cmd.cis[0].cis_handle = cpu_to_le16(conn->handle); - cmd.cp.num_cis++; - cig = conn->iso_qos.ucast.cig; hci_dev_lock(hdev); rcu_read_lock(); + /* Wait until previous Create CIS has completed */ list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) { - struct hci_cis *cis = &cmd.cis[cmd.cp.num_cis]; + if (test_bit(HCI_CONN_CREATE_CIS, &conn->flags)) + goto done; + } - if (conn == hcon || conn->type != ISO_LINK || - conn->state == BT_CONNECTED || - conn->iso_qos.ucast.cig != cig) + /* Find CIG with all CIS ready */ + list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) { + struct hci_conn *link; + + if (hci_conn_check_create_cis(conn)) continue; - /* Check if all CIS(s) belonging to a CIG are ready */ - if (!conn->parent || conn->parent->state != BT_CONNECTED || - conn->state != BT_CONNECT) { - cmd.cp.num_cis = 0; - break; + cig = conn->iso_qos.ucast.cig; + + list_for_each_entry_rcu(link, &hdev->conn_hash.list, list) { + if (hci_conn_check_create_cis(link) > 0 && + link->iso_qos.ucast.cig == cig && + link->state != BT_CONNECTED) { + cig = BT_ISO_QOS_CIG_UNSET; + break; + } } - /* Group all CIS with state BT_CONNECT since the spec don't - * allow to send them individually: - * - * BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E - * page 2566: - * - * If the Host issues this command before all the - * HCI_LE_CIS_Established events from the previous use of the - * command have been generated, the Controller shall return the - * error code Command Disallowed (0x0C). 
- */ + if (cig != BT_ISO_QOS_CIG_UNSET) + break; + } + + if (cig == BT_ISO_QOS_CIG_UNSET) + goto done; + + list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) { + struct hci_cis *cis = &cmd.cis[cmd.cp.num_cis]; + + if (hci_conn_check_create_cis(conn) || + conn->iso_qos.ucast.cig != cig) + continue; + + set_bit(HCI_CONN_CREATE_CIS, &conn->flags); cis->acl_handle = cpu_to_le16(conn->parent->handle); cis->cis_handle = cpu_to_le16(conn->handle); cmd.cp.num_cis++; + + if (cmd.cp.num_cis >= ARRAY_SIZE(cmd.cis)) + break; } +done: rcu_read_unlock(); hci_dev_unlock(hdev); diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index 5808d57c1d7b..5db4d68c96d5 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -1690,7 +1690,7 @@ static void iso_connect_cfm(struct hci_conn *hcon, __u8 status) } /* Create CIS if pending */ - hci_le_create_cis(hcon); + hci_le_create_cis_pending(hcon->hdev); return; } -- cgit v1.2.3 From 6bfa273e533d7b25eee3d74e28a7fe8e6a8e7a93 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 25 May 2023 16:46:41 -0700 Subject: Bluetooth: Consolidate code around sk_alloc into a helper function This consolidates code around sk_alloc into bt_sock_alloc which does take care of common initialization. Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/bluetooth.h | 2 ++ net/bluetooth/af_bluetooth.c | 21 +++++++++++++++++++++ net/bluetooth/bnep/sock.c | 10 +--------- net/bluetooth/hci_sock.c | 10 ++-------- net/bluetooth/iso.c | 10 +--------- net/bluetooth/l2cap_sock.c | 10 +--------- net/bluetooth/rfcomm/sock.c | 13 +++---------- net/bluetooth/sco.c | 10 +--------- 8 files changed, 32 insertions(+), 54 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index af729859385e..60689a07b82c 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -400,6 +400,8 @@ int bt_sock_register(int proto, const struct net_proto_family *ops); void bt_sock_unregister(int proto); void bt_sock_link(struct bt_sock_list *l, struct sock *s); void bt_sock_unlink(struct bt_sock_list *l, struct sock *s); +struct sock *bt_sock_alloc(struct net *net, struct socket *sock, + struct proto *prot, int proto, gfp_t prio, int kern); int bt_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags); int bt_sock_stream_recvmsg(struct socket *sock, struct msghdr *msg, diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 1c3c7ff5c3c6..6035422e13da 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -140,6 +140,27 @@ static int bt_sock_create(struct net *net, struct socket *sock, int proto, return err; } +struct sock *bt_sock_alloc(struct net *net, struct socket *sock, + struct proto *prot, int proto, gfp_t prio, int kern) +{ + struct sock *sk; + + sk = sk_alloc(net, PF_BLUETOOTH, prio, prot, kern); + if (!sk) + return NULL; + + sock_init_data(sock, sk); + INIT_LIST_HEAD(&bt_sk(sk)->accept_q); + + sock_reset_flag(sk, SOCK_ZAPPED); + + sk->sk_protocol = proto; + sk->sk_state = BT_OPEN; + + return sk; +} +EXPORT_SYMBOL(bt_sock_alloc); + void bt_sock_link(struct bt_sock_list *l, struct sock *sk) { write_lock(&l->lock); diff --git a/net/bluetooth/bnep/sock.c b/net/bluetooth/bnep/sock.c index 57d509d77cb4..00d47bcf4d7d 100644 --- a/net/bluetooth/bnep/sock.c +++ b/net/bluetooth/bnep/sock.c @@ -205,21 +205,13 @@ static int bnep_sock_create(struct net *net, struct socket *sock, int protocol, if (sock->type != SOCK_RAW) return 
-ESOCKTNOSUPPORT; - sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &bnep_proto, kern); + sk = bt_sock_alloc(net, sock, &bnep_proto, protocol, GFP_ATOMIC, kern); if (!sk) return -ENOMEM; - sock_init_data(sock, sk); - sock->ops = &bnep_sock_ops; - sock->state = SS_UNCONNECTED; - sock_reset_flag(sk, SOCK_ZAPPED); - - sk->sk_protocol = protocol; - sk->sk_state = BT_OPEN; - bt_sock_link(&bnep_sk_list, sk); return 0; } diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 1d249d839819..9c45586f5818 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -2143,18 +2143,12 @@ static int hci_sock_create(struct net *net, struct socket *sock, int protocol, sock->ops = &hci_sock_ops; - sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &hci_sk_proto, kern); + sk = bt_sock_alloc(net, sock, &hci_sk_proto, protocol, GFP_ATOMIC, + kern); if (!sk) return -ENOMEM; - sock_init_data(sock, sk); - - sock_reset_flag(sk, SOCK_ZAPPED); - - sk->sk_protocol = protocol; - sock->state = SS_UNCONNECTED; - sk->sk_state = BT_OPEN; sk->sk_destruct = hci_sock_destruct; bt_sock_link(&hci_sk_list, sk); diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index 5db4d68c96d5..84d238d0639a 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -738,21 +738,13 @@ static struct sock *iso_sock_alloc(struct net *net, struct socket *sock, { struct sock *sk; - sk = sk_alloc(net, PF_BLUETOOTH, prio, &iso_proto, kern); + sk = bt_sock_alloc(net, sock, &iso_proto, proto, prio, kern); if (!sk) return NULL; - sock_init_data(sock, sk); - INIT_LIST_HEAD(&bt_sk(sk)->accept_q); - sk->sk_destruct = iso_sock_destruct; sk->sk_sndtimeo = ISO_CONN_TIMEOUT; - sock_reset_flag(sk, SOCK_ZAPPED); - - sk->sk_protocol = proto; - sk->sk_state = BT_OPEN; - /* Set address type as public as default src address is BDADDR_ANY */ iso_pi(sk)->src_type = BDADDR_LE_PUBLIC; diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 947ca580bb9a..9ef936f27a6a 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -1858,21 +1858,13 @@ static struct sock *l2cap_sock_alloc(struct net *net, struct socket *sock, struct sock *sk; struct l2cap_chan *chan; - sk = sk_alloc(net, PF_BLUETOOTH, prio, &l2cap_proto, kern); + sk = bt_sock_alloc(net, sock, &l2cap_proto, proto, prio, kern); if (!sk) return NULL; - sock_init_data(sock, sk); - INIT_LIST_HEAD(&bt_sk(sk)->accept_q); - sk->sk_destruct = l2cap_sock_destruct; sk->sk_sndtimeo = L2CAP_CONN_TIMEOUT; - sock_reset_flag(sk, SOCK_ZAPPED); - - sk->sk_protocol = proto; - sk->sk_state = BT_OPEN; - chan = l2cap_chan_create(); if (!chan) { sk_free(sk); diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index 4397e14ff560..b54e8a530f55 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -268,18 +268,16 @@ static struct proto rfcomm_proto = { .obj_size = sizeof(struct rfcomm_pinfo) }; -static struct sock *rfcomm_sock_alloc(struct net *net, struct socket *sock, int proto, gfp_t prio, int kern) +static struct sock *rfcomm_sock_alloc(struct net *net, struct socket *sock, + int proto, gfp_t prio, int kern) { struct rfcomm_dlc *d; struct sock *sk; - sk = sk_alloc(net, PF_BLUETOOTH, prio, &rfcomm_proto, kern); + sk = bt_sock_alloc(net, sock, &rfcomm_proto, proto, prio, kern); if (!sk) return NULL; - sock_init_data(sock, sk); - INIT_LIST_HEAD(&bt_sk(sk)->accept_q); - d = rfcomm_dlc_alloc(prio); if (!d) { sk_free(sk); @@ -298,11 +296,6 @@ static struct sock *rfcomm_sock_alloc(struct net *net, struct socket *sock, int sk->sk_sndbuf 
= RFCOMM_MAX_CREDITS * RFCOMM_DEFAULT_MTU * 10; sk->sk_rcvbuf = RFCOMM_MAX_CREDITS * RFCOMM_DEFAULT_MTU * 10; - sock_reset_flag(sk, SOCK_ZAPPED); - - sk->sk_protocol = proto; - sk->sk_state = BT_OPEN; - bt_sock_link(&rfcomm_sk_list, sk); BT_DBG("sk %p", sk); diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 7762604ddfc0..ec6dce488a40 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -504,21 +504,13 @@ static struct sock *sco_sock_alloc(struct net *net, struct socket *sock, { struct sock *sk; - sk = sk_alloc(net, PF_BLUETOOTH, prio, &sco_proto, kern); + sk = bt_sock_alloc(net, sock, &sco_proto, proto, prio, kern); if (!sk) return NULL; - sock_init_data(sock, sk); - INIT_LIST_HEAD(&bt_sk(sk)->accept_q); - sk->sk_destruct = sco_sock_destruct; sk->sk_sndtimeo = SCO_CONN_TIMEOUT; - sock_reset_flag(sk, SOCK_ZAPPED); - - sk->sk_protocol = proto; - sk->sk_state = BT_OPEN; - sco_pi(sk)->setting = BT_VOICE_CVSD_16BIT; sco_pi(sk)->codec.id = BT_CODEC_CVSD; sco_pi(sk)->codec.cid = 0xffff; -- cgit v1.2.3 From 464c702fb9374ff8f3f816f24fb7ac719dd20e1e Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 25 May 2023 16:46:42 -0700 Subject: Bluetooth: Init sk_peer_* on bt_sock_alloc This makes sure peer information is always available via sock when using bt_sock_alloc. Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/af_bluetooth.c | 24 ++++++++++++++++++++++++ net/bluetooth/hidp/sock.c | 10 +--------- net/bluetooth/l2cap_sock.c | 19 ------------------- 3 files changed, 25 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 6035422e13da..647afb187147 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -157,6 +157,14 @@ struct sock *bt_sock_alloc(struct net *net, struct socket *sock, sk->sk_protocol = proto; sk->sk_state = BT_OPEN; + /* Init peer information so it can be properly monitored */ + if (!kern) { + spin_lock(&sk->sk_peer_lock); + sk->sk_peer_pid = get_pid(task_tgid(current)); + sk->sk_peer_cred = get_current_cred(); + spin_unlock(&sk->sk_peer_lock); + } + return sk; } EXPORT_SYMBOL(bt_sock_alloc); @@ -179,6 +187,9 @@ EXPORT_SYMBOL(bt_sock_unlink); void bt_accept_enqueue(struct sock *parent, struct sock *sk, bool bh) { + const struct cred *old_cred; + struct pid *old_pid; + BT_DBG("parent %p, sk %p", parent, sk); sock_hold(sk); @@ -191,6 +202,19 @@ void bt_accept_enqueue(struct sock *parent, struct sock *sk, bool bh) list_add_tail(&bt_sk(sk)->accept_q, &bt_sk(parent)->accept_q); bt_sk(sk)->parent = parent; + /* Copy credentials from parent since for incoming connections the + * socket is allocated by the kernel. 
+ */ + spin_lock(&sk->sk_peer_lock); + old_pid = sk->sk_peer_pid; + old_cred = sk->sk_peer_cred; + sk->sk_peer_pid = get_pid(parent->sk_peer_pid); + sk->sk_peer_cred = get_cred(parent->sk_peer_cred); + spin_unlock(&sk->sk_peer_lock); + + put_pid(old_pid); + put_cred(old_cred); + if (bh) bh_unlock_sock(sk); else diff --git a/net/bluetooth/hidp/sock.c b/net/bluetooth/hidp/sock.c index 369ed92dac99..c93aaeb3a3fa 100644 --- a/net/bluetooth/hidp/sock.c +++ b/net/bluetooth/hidp/sock.c @@ -256,21 +256,13 @@ static int hidp_sock_create(struct net *net, struct socket *sock, int protocol, if (sock->type != SOCK_RAW) return -ESOCKTNOSUPPORT; - sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &hidp_proto, kern); + sk = bt_sock_alloc(net, sock, &hidp_proto, protocol, GFP_ATOMIC, kern); if (!sk) return -ENOMEM; - sock_init_data(sock, sk); - sock->ops = &hidp_sock_ops; - sock->state = SS_UNCONNECTED; - sock_reset_flag(sk, SOCK_ZAPPED); - - sk->sk_protocol = protocol; - sk->sk_state = BT_OPEN; - bt_sock_link(&hidp_sk_list, sk); return 0; diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 9ef936f27a6a..3bdfc3f1e73d 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -178,21 +178,6 @@ done: return err; } -static void l2cap_sock_init_pid(struct sock *sk) -{ - struct l2cap_chan *chan = l2cap_pi(sk)->chan; - - /* Only L2CAP_MODE_EXT_FLOWCTL ever need to access the PID in order to - * group the channels being requested. - */ - if (chan->mode != L2CAP_MODE_EXT_FLOWCTL) - return; - - spin_lock(&sk->sk_peer_lock); - sk->sk_peer_pid = get_pid(task_tgid(current)); - spin_unlock(&sk->sk_peer_lock); -} - static int l2cap_sock_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags) { @@ -268,8 +253,6 @@ static int l2cap_sock_connect(struct socket *sock, struct sockaddr *addr, chan->mode != L2CAP_MODE_EXT_FLOWCTL) chan->mode = L2CAP_MODE_LE_FLOWCTL; - l2cap_sock_init_pid(sk); - err = l2cap_chan_connect(chan, la.l2_psm, __le16_to_cpu(la.l2_cid), &la.l2_bdaddr, la.l2_bdaddr_type); if (err) @@ -325,8 +308,6 @@ static int l2cap_sock_listen(struct socket *sock, int backlog) goto done; } - l2cap_sock_init_pid(sk); - sk->sk_max_ack_backlog = backlog; sk->sk_ack_backlog = 0; -- cgit v1.2.3 From 69ae5065061c53ded4f49e821646b9bc60ab302a Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 25 May 2023 16:46:43 -0700 Subject: Bluetooth: hci_sock: Forward credentials to monitor This stores scm_creds into hci_skb_cb so they can be properly forwarded to the likes of btmon which is then able to print information about the process who is originating the traffic: bluetoothd[35]: @ MGMT Command: Rea.. (0x0001) plen 0 {0x0001} @ MGMT Event: Command Complete (0x0001) plen 6 {0x0001} Read Management Version Information (0x0001) plen 3 bluetoothd[35]: < ACL Data T.. 
flags 0x00 dlen 41 ATT: Write Command (0x52) len 36 Handle: 0x0043 Type: ASE Control Point (0x2bc6) Data: 020203000110270000022800020a00409c0001000110270000022800020a00409c00 Opcode: QoS Configuration (0x02) Number of ASE(s): 2 ASE: #0 ASE ID: 0x03 CIG ID: 0x00 CIS ID: 0x01 SDU Interval: 10000 usec Framing: Unframed (0x00) PHY: 0x02 LE 2M PHY (0x02) Max SDU: 40 RTN: 2 Max Transport Latency: 10 Presentation Delay: 40000 us ASE: #1 ASE ID: 0x01 CIG ID: 0x00 CIS ID: 0x01 SDU Interval: 10000 usec Framing: Unframed (0x00) PHY: 0x02 LE 2M PHY (0x02) Max SDU: 40 RTN: 2 Max Transport Latency: 10 Presentation Delay: 40000 us Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/bluetooth.h | 1 + net/bluetooth/hci_sock.c | 67 +++++++++++++++++++++++++++++++++++++-- 2 files changed, 66 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index 60689a07b82c..34998ae8ed78 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -471,6 +471,7 @@ struct bt_skb_cb { struct sco_ctrl sco; struct hci_ctrl hci; struct mgmt_ctrl mgmt; + struct scm_creds creds; }; }; #define bt_cb(skb) ((struct bt_skb_cb *)((skb)->cb)) diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 9c45586f5818..5e4f718073b7 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -264,6 +264,53 @@ void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb) kfree_skb(skb_copy); } +static void hci_sock_copy_creds(struct sock *sk, struct sk_buff *skb) +{ + struct scm_creds *creds; + + if (!sk || WARN_ON(!skb)) + return; + + creds = &bt_cb(skb)->creds; + + /* Check if peer credentials is set */ + if (!sk->sk_peer_pid) { + /* Check if parent peer credentials is set */ + if (bt_sk(sk)->parent && bt_sk(sk)->parent->sk_peer_pid) + sk = bt_sk(sk)->parent; + else + return; + } + + /* Check if scm_creds already set */ + if (creds->pid == pid_vnr(sk->sk_peer_pid)) + return; + + memset(creds, 0, sizeof(*creds)); + + creds->pid = pid_vnr(sk->sk_peer_pid); + if (sk->sk_peer_cred) { + creds->uid = sk->sk_peer_cred->uid; + creds->gid = sk->sk_peer_cred->gid; + } +} + +static struct sk_buff *hci_skb_clone(struct sk_buff *skb) +{ + struct sk_buff *nskb; + + if (!skb) + return NULL; + + nskb = skb_clone(skb, GFP_ATOMIC); + if (!nskb) + return NULL; + + hci_sock_copy_creds(skb->sk, nskb); + + return nskb; +} + /* Send frame to sockets with specific channel */ static void __hci_send_to_channel(unsigned short channel, struct sk_buff *skb, int flag, struct sock *skip_sk) @@ -289,7 +336,7 @@ static void __hci_send_to_channel(unsigned short channel, struct sk_buff *skb, if (hci_pi(sk)->channel != channel) continue; - nskb = skb_clone(skb, GFP_ATOMIC); + nskb = hci_skb_clone(skb); if (!nskb) continue; @@ -356,6 +403,8 @@ void hci_send_to_monitor(struct hci_dev *hdev, struct sk_buff *skb) if (!skb_copy) return; + hci_sock_copy_creds(skb->sk, skb_copy); + /* Put header before the data */ hdr = skb_push(skb_copy, HCI_MON_HDR_SIZE); hdr->opcode = opcode; @@ -531,10 +580,12 @@ static struct sk_buff *create_monitor_ctrl_open(struct sock *sk) return NULL; } - skb = bt_skb_alloc(14 + TASK_COMM_LEN , GFP_ATOMIC); + skb = bt_skb_alloc(14 + TASK_COMM_LEN, GFP_ATOMIC); if (!skb) return NULL; + hci_sock_copy_creds(sk, skb); + flags = hci_sock_test_flag(sk, HCI_SOCK_TRUSTED) ? 
0x1 : 0x0; put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4)); @@ -580,6 +631,8 @@ static struct sk_buff *create_monitor_ctrl_close(struct sock *sk) if (!skb) return NULL; + hci_sock_copy_creds(sk, skb); + put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4)); __net_timestamp(skb); @@ -606,6 +659,8 @@ static struct sk_buff *create_monitor_ctrl_command(struct sock *sk, u16 index, if (!skb) return NULL; + hci_sock_copy_creds(sk, skb); + put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4)); put_unaligned_le16(opcode, skb_put(skb, 2)); @@ -638,6 +693,8 @@ send_monitor_note(struct sock *sk, const char *fmt, ...) if (!skb) return; + hci_sock_copy_creds(sk, skb); + va_start(args, fmt); vsprintf(skb_put(skb, len), fmt, args); *(u8 *)skb_put(skb, 1) = 0; @@ -1494,6 +1551,7 @@ static void hci_sock_cmsg(struct sock *sk, struct msghdr *msg, static int hci_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { + struct scm_cookie scm; struct sock *sk = sock->sk; struct sk_buff *skb; int copied, err; @@ -1538,11 +1596,16 @@ static int hci_sock_recvmsg(struct socket *sock, struct msghdr *msg, break; } + memset(&scm, 0, sizeof(scm)); + scm.creds = bt_cb(skb)->creds; + skb_free_datagram(sk, skb); if (flags & MSG_TRUNC) copied = skblen; + scm_recv(sock, msg, &scm, flags); + return err ? : copied; } -- cgit v1.2.3 From 6a42e9bfd17f7135d59701f93942a3392da482f4 Mon Sep 17 00:00:00 2001 From: Iulia Tanasescu Date: Mon, 19 Jun 2023 17:53:16 +0300 Subject: Bluetooth: ISO: Support multiple BIGs This adds support for creating multiple BIGs. According to spec, each BIG shall have an unique handle, and each BIG should be associated with a different advertising handle. Otherwise, the LE Create BIG command will fail, with error code Command Disallowed (for reusing a BIG handle), or Unknown Advertising Identifier (for reusing an advertising handle). 
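As a rough illustration (not part of this change), the "first free handle" search it adopts can be modelled in standalone C as below; big_in_use() and BIG_HANDLE_LIMIT are stand-ins for the kernel's hci_conn_hash_lookup_big() lookup and its 0x00..0xee search, so treat this as a sketch rather than the actual implementation.

    /* Minimal sketch: pick the lowest BIG handle with no existing connection. */
    #include <stdbool.h>
    #include <stdio.h>

    #define BIG_HANDLE_LIMIT 0xef    /* the kernel searches handles 0x00..0xee */

    /* Stand-in for hci_conn_hash_lookup_big(): pretend 0x00 and 0x01 are taken. */
    static bool big_in_use(unsigned char big)
    {
            return big == 0x00 || big == 0x01;
    }

    static int alloc_big_handle(void)
    {
            unsigned char big;

            for (big = 0x00; big < BIG_HANDLE_LIMIT; big++)
                    if (!big_in_use(big))
                            return big;

            return -1;    /* no free handle; the patch returns -EADDRNOTAVAIL */
    }

    int main(void)
    {
            printf("allocated BIG handle 0x%02x\n", alloc_big_handle());
            return 0;
    }

The same pattern is applied to the advertising handle used for a BIS, starting from 0x01 because adv instance 0x00 is reserved as the general purpose set.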
The btmon snippet below shows an exercise for creating two BIGs for the same controller, by opening two isotest instances with the following command: tools/isotest -i hci0 -s 00:00:00:00:00:00 < HCI Command: LE Create Broadcast Isochronous Group (0x08|0x0068) plen 31 Handle: 0x00 Advertising Handle: 0x01 Number of BIS: 1 SDU Interval: 10000 us (0x002710) Maximum SDU size: 40 Maximum Latency: 10 ms (0x000a) RTN: 0x02 PHY: LE 2M (0x02) Packing: Sequential (0x00) Framing: Unframed (0x00) Encryption: 0x00 Broadcast Code: 00000000000000000000000000000000 > HCI Event: Command Status (0x0f) plen 4 LE Create Broadcast Isochronous Group (0x08|0x0068) ncmd 1 Status: Success (0x00) > HCI Event: LE Meta Event (0x3e) plen 21 LE Broadcast Isochronous Group Complete (0x1b) Status: Success (0x00) Handle: 0x00 BIG Synchronization Delay: 912 us (0x000390) Transport Latency: 912 us (0x000390) PHY: LE 2M (0x02) NSE: 3 BN: 1 PTO: 1 IRC: 3 Maximum PDU: 40 ISO Interval: 10.00 msec (0x0008) Connection Handle #0: 10 < HCI Command: LE Create Broadcast Isochronous Group (0x08|0x0068) Handle: 0x01 Advertising Handle: 0x02 Number of BIS: 1 SDU Interval: 10000 us (0x002710) Maximum SDU size: 40 Maximum Latency: 10 ms (0x000a) RTN: 0x02 PHY: LE 2M (0x02) Packing: Sequential (0x00) Framing: Unframed (0x00) Encryption: 0x00 Broadcast Code: 00000000000000000000000000000000 > HCI Event: Command Status (0x0f) plen 4 LE Create Broadcast Isochronous Group (0x08|0x0068) ncmd 1 Status: Success (0x00) Signed-off-by: Iulia Tanasescu Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 5 ++--- net/bluetooth/hci_conn.c | 40 +++++++++++++++++----------------------- net/bluetooth/hci_event.c | 35 +++++++++++++++++++++++++++++------ net/bluetooth/hci_sync.c | 28 +++++++++++++++------------- 4 files changed, 63 insertions(+), 45 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index ad39d09e9bd6..9140d4a80e38 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1095,8 +1095,7 @@ static inline __u8 hci_conn_lookup_type(struct hci_dev *hdev, __u16 handle) } static inline struct hci_conn *hci_conn_hash_lookup_bis(struct hci_dev *hdev, - bdaddr_t *ba, - __u8 big, __u8 bis) + bdaddr_t *ba, __u8 bis) { struct hci_conn_hash *h = &hdev->conn_hash; struct hci_conn *c; @@ -1107,7 +1106,7 @@ static inline struct hci_conn *hci_conn_hash_lookup_bis(struct hci_dev *hdev, if (bacmp(&c->dst, ba) || c->type != ISO_LINK) continue; - if (c->iso_qos.bcast.big == big && c->iso_qos.bcast.bis == bis) { + if (c->iso_qos.bcast.bis == bis) { rcu_read_unlock(); return c; } diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 4b5223e62141..0f89daafe194 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -927,9 +927,7 @@ static void bis_cleanup(struct hci_conn *conn) /* Check if ISO connection is a BIS and terminate advertising * set and BIG if there are no other connections using it. 
*/ - bis = hci_conn_hash_lookup_bis(hdev, BDADDR_ANY, - conn->iso_qos.bcast.big, - conn->iso_qos.bcast.bis); + bis = hci_conn_hash_lookup_big(hdev, conn->iso_qos.bcast.big); if (bis) return; @@ -1449,25 +1447,23 @@ static int hci_explicit_conn_params_set(struct hci_dev *hdev, static int qos_set_big(struct hci_dev *hdev, struct bt_iso_qos *qos) { - struct iso_list_data data; + struct hci_conn *conn; + u8 big; /* Allocate a BIG if not set */ if (qos->bcast.big == BT_ISO_QOS_BIG_UNSET) { - for (data.big = 0x00; data.big < 0xef; data.big++) { - data.count = 0; - data.bis = 0xff; + for (big = 0x00; big < 0xef; big++) { - hci_conn_hash_list_state(hdev, bis_list, ISO_LINK, - BT_BOUND, &data); - if (!data.count) + conn = hci_conn_hash_lookup_big(hdev, big); + if (!conn) break; } - if (data.big == 0xef) + if (big == 0xef) return -EADDRNOTAVAIL; /* Update BIG */ - qos->bcast.big = data.big; + qos->bcast.big = big; } return 0; @@ -1475,28 +1471,27 @@ static int qos_set_big(struct hci_dev *hdev, struct bt_iso_qos *qos) static int qos_set_bis(struct hci_dev *hdev, struct bt_iso_qos *qos) { - struct iso_list_data data; + struct hci_conn *conn; + u8 bis; /* Allocate BIS if not set */ if (qos->bcast.bis == BT_ISO_QOS_BIS_UNSET) { /* Find an unused adv set to advertise BIS, skip instance 0x00 * since it is reserved as general purpose set. */ - for (data.bis = 0x01; data.bis < hdev->le_num_of_adv_sets; - data.bis++) { - data.count = 0; + for (bis = 0x01; bis < hdev->le_num_of_adv_sets; + bis++) { - hci_conn_hash_list_state(hdev, bis_list, ISO_LINK, - BT_BOUND, &data); - if (!data.count) + conn = hci_conn_hash_lookup_bis(hdev, BDADDR_ANY, bis); + if (!conn) break; } - if (data.bis == hdev->le_num_of_adv_sets) + if (bis == hdev->le_num_of_adv_sets) return -EADDRNOTAVAIL; /* Update BIS */ - qos->bcast.bis = data.bis; + qos->bcast.bis = bis; } return 0; @@ -1534,8 +1529,7 @@ static struct hci_conn *hci_add_bis(struct hci_dev *hdev, bdaddr_t *dst, /* Check BIS settings against other bound BISes, since all * BISes in a BIG must have the same value for all parameters */ - conn = hci_conn_hash_lookup_bis(hdev, dst, qos->bcast.big, - qos->bcast.bis); + conn = hci_conn_hash_lookup_big(hdev, qos->bcast.big); if (conn && (memcmp(qos, &conn->iso_qos, sizeof(*qos)) || base_len != conn->le_per_adv_data_len || diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index e7249ed3f5f2..c67612c99f89 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1639,7 +1639,7 @@ static u8 hci_cc_le_set_ext_adv_enable(struct hci_dev *hdev, void *data, hci_dev_set_flag(hdev, HCI_LE_ADV); - if (adv) + if (adv && !adv->periodic) adv->enabled = true; conn = hci_lookup_le_connect(hdev); @@ -3941,24 +3941,47 @@ static u8 hci_cc_le_set_per_adv_enable(struct hci_dev *hdev, void *data, struct sk_buff *skb) { struct hci_ev_status *rp = data; - __u8 *sent; + struct hci_cp_le_set_per_adv_enable *cp; + struct adv_info *adv = NULL, *n; + u8 per_adv_cnt = 0; bt_dev_dbg(hdev, "status 0x%2.2x", rp->status); if (rp->status) return rp->status; - sent = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_PER_ADV_ENABLE); - if (!sent) + cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_PER_ADV_ENABLE); + if (!cp) return rp->status; hci_dev_lock(hdev); - if (*sent) + adv = hci_find_adv_instance(hdev, cp->handle); + + if (cp->enable) { hci_dev_set_flag(hdev, HCI_LE_PER_ADV); - else + + if (adv) + adv->enabled = true; + } else { + /* If just one instance was disabled check if there are + * any other instance enabled before clearing HCI_LE_PER_ADV. 
+ * The current periodic adv instance will be marked as + * disabled once extended advertising is also disabled. + */ + list_for_each_entry_safe(adv, n, &hdev->adv_instances, + list) { + if (adv->periodic && adv->enabled) + per_adv_cnt++; + } + + if (per_adv_cnt > 1) + goto unlock; + hci_dev_clear_flag(hdev, HCI_LE_PER_ADV); + } +unlock: hci_dev_unlock(hdev); return rp->status; diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index b617d1dd247a..afb8e970e62c 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -3,6 +3,7 @@ * BlueZ - Bluetooth protocol stack for Linux * * Copyright (C) 2021 Intel Corporation + * Copyright 2023 NXP */ #include @@ -1319,9 +1320,11 @@ int hci_start_ext_adv_sync(struct hci_dev *hdev, u8 instance) static int hci_disable_per_advertising_sync(struct hci_dev *hdev, u8 instance) { struct hci_cp_le_set_per_adv_enable cp; + struct adv_info *adv = NULL; /* If periodic advertising already disabled there is nothing to do. */ - if (!hci_dev_test_flag(hdev, HCI_LE_PER_ADV)) + adv = hci_find_adv_instance(hdev, instance); + if (!adv || !adv->periodic || !adv->enabled) return 0; memset(&cp, 0, sizeof(cp)); @@ -1386,9 +1389,11 @@ static int hci_set_per_adv_data_sync(struct hci_dev *hdev, u8 instance) static int hci_enable_per_advertising_sync(struct hci_dev *hdev, u8 instance) { struct hci_cp_le_set_per_adv_enable cp; + struct adv_info *adv = NULL; /* If periodic advertising already enabled there is nothing to do. */ - if (hci_dev_test_flag(hdev, HCI_LE_PER_ADV)) + adv = hci_find_adv_instance(hdev, instance); + if (adv && adv->periodic && adv->enabled) return 0; memset(&cp, 0, sizeof(cp)); @@ -1458,22 +1463,19 @@ int hci_start_per_adv_sync(struct hci_dev *hdev, u8 instance, u8 data_len, sync_interval); if (IS_ERR(adv)) return PTR_ERR(adv); + adv->pending = false; added = true; } } - /* Only start advertising if instance 0 or if a dedicated instance has - * been added. - */ - if (!adv || added) { - err = hci_start_ext_adv_sync(hdev, instance); - if (err < 0) - goto fail; + /* Start advertising */ + err = hci_start_ext_adv_sync(hdev, instance); + if (err < 0) + goto fail; - err = hci_adv_bcast_annoucement(hdev, adv); - if (err < 0) - goto fail; - } + err = hci_adv_bcast_annoucement(hdev, adv); + if (err < 0) + goto fail; err = hci_set_per_adv_params_sync(hdev, instance, min_interval, max_interval); -- cgit v1.2.3 From 9e14606d8f38ea52a38c27692a9c1513c987a5da Mon Sep 17 00:00:00 2001 From: Hilda Wu Date: Wed, 21 Jun 2023 18:00:31 +0800 Subject: Bluetooth: msft: Extended monitor tracking by address filter Since limited tracking device per condition, this feature is to support tracking multiple devices concurrently. When a pattern monitor detects the device, this feature issues an address monitor for tracking that device. Let pattern monitor can keep monitor new devices. This feature adds an address filter when receiving a LE monitor device event which monitor handle is for a pattern, and the controller started monitoring the device. And this feature also has cancelled the monitor advertisement from address filters when receiving a LE monitor device event when the controller stopped monitoring the device specified by an address and monitor handle. Below is an example to know the feature adds the address filter. 
//Add MSFT pattern monitor < HCI Command: Vendor (0x3f|0x00f0) plen 14 #142 [hci0] 55.552420 03 b8 a4 03 ff 01 01 06 09 05 5f 52 45 46 .........._REF > HCI Event: Command Complete (0x0e) plen 6 #143 [hci0] 55.653960 Vendor (0x3f|0x00f0) ncmd 2 Status: Success (0x00) 03 00 //Got event from the pattern monitor > HCI Event: Vendor (0xff) plen 18 #148 [hci0] 58.384953 23 79 54 33 77 88 97 68 02 00 fb c1 29 eb 27 b8 #yT3w..h....).'. 00 01 .. //Add MSFT address monitor (Sample address: B8:27:EB:29:C1:FB) < HCI Command: Vendor (0x3f|0x00f0) plen 13 #149 [hci0] 58.385067 03 b8 a4 03 ff 04 00 fb c1 29 eb 27 b8 .........).'. //Report to userspace about found device (ADV Monitor Device Found) @ MGMT Event: Unknown (0x002f) plen 38 {0x0003} [hci0] 58.680042 01 00 fb c1 29 eb 27 b8 01 ce 00 00 00 00 16 00 ....).'......... 0a 09 4b 45 59 42 44 5f 52 45 46 02 01 06 03 19 ..KEYBD_REF..... c1 03 03 03 12 18 ...... //Got event from address monitor > HCI Event: Vendor (0xff) plen 18 #152 [hci0] 58.672956 23 79 54 33 77 88 97 68 02 00 fb c1 29 eb 27 b8 #yT3w..h....).'. 01 01 Signed-off-by: Alex Lu Signed-off-by: Hilda Wu Reviewed-by: Simon Horman Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btrtl.c | 4 + include/net/bluetooth/hci.h | 10 ++ net/bluetooth/msft.c | 412 ++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 411 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/drivers/bluetooth/btrtl.c b/drivers/bluetooth/btrtl.c index 04399b3c39a0..ddae6524106d 100644 --- a/drivers/bluetooth/btrtl.c +++ b/drivers/bluetooth/btrtl.c @@ -1269,6 +1269,10 @@ void btrtl_set_quirks(struct hci_dev *hdev, struct btrtl_device_info *btrtl_dev) if (btrtl_dev->project_id == CHIP_ID_8852C) btrealtek_set_flag(hdev, REALTEK_ALT6_CONTINUOUS_TX_CHIP); + if (btrtl_dev->project_id == CHIP_ID_8852A || + btrtl_dev->project_id == CHIP_ID_8852C) + set_bit(HCI_QUIRK_USE_MSFT_EXT_ADDRESS_FILTER, &hdev->quirks); + hci_set_aosp_capable(hdev); break; default: diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index ab2f8f1817cf..5723405b833e 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -309,6 +309,16 @@ enum { * to support it. */ HCI_QUIRK_BROKEN_SET_RPA_TIMEOUT, + + /* When this quirk is set, MSFT extension monitor tracking by + * address filter is supported. Since tracking quantity of each + * pattern is limited, this feature supports tracking multiple + * devices concurrently if controller supports multiple + * address filters. + * + * This quirk must be set before hci_register_dev is called. 
+ */ + HCI_QUIRK_USE_MSFT_EXT_ADDRESS_FILTER, }; /* HCI device flags */ diff --git a/net/bluetooth/msft.c b/net/bluetooth/msft.c index bf5cee48916c..b80a2162a5c3 100644 --- a/net/bluetooth/msft.c +++ b/net/bluetooth/msft.c @@ -91,6 +91,33 @@ struct msft_ev_le_monitor_device { struct msft_monitor_advertisement_handle_data { __u8 msft_handle; __u16 mgmt_handle; + __s8 rssi_high; + __s8 rssi_low; + __u8 rssi_low_interval; + __u8 rssi_sampling_period; + __u8 cond_type; + struct list_head list; +}; + +enum monitor_addr_filter_state { + AF_STATE_IDLE, + AF_STATE_ADDING, + AF_STATE_ADDED, + AF_STATE_REMOVING, +}; + +#define MSFT_MONITOR_ADVERTISEMENT_TYPE_ADDR 0x04 +struct msft_monitor_addr_filter_data { + __u8 msft_handle; + __u8 pattern_handle; /* address filters pertain to */ + __u16 mgmt_handle; + int state; + __s8 rssi_high; + __s8 rssi_low; + __u8 rssi_low_interval; + __u8 rssi_sampling_period; + __u8 addr_type; + bdaddr_t bdaddr; struct list_head list; }; @@ -99,9 +126,12 @@ struct msft_data { __u8 evt_prefix_len; __u8 *evt_prefix; struct list_head handle_map; + struct list_head address_filters; __u8 resuming; __u8 suspending; __u8 filter_enabled; + /* To synchronize add/remove address filter and monitor device event.*/ + struct mutex filter_lock; }; bool msft_monitor_supported(struct hci_dev *hdev) @@ -180,6 +210,24 @@ static struct msft_monitor_advertisement_handle_data *msft_find_handle_data return NULL; } +/* This function requires the caller holds msft->filter_lock */ +static struct msft_monitor_addr_filter_data *msft_find_address_data + (struct hci_dev *hdev, u8 addr_type, bdaddr_t *addr, + u8 pattern_handle) +{ + struct msft_monitor_addr_filter_data *entry; + struct msft_data *msft = hdev->msft_data; + + list_for_each_entry(entry, &msft->address_filters, list) { + if (entry->pattern_handle == pattern_handle && + addr_type == entry->addr_type && + !bacmp(addr, &entry->bdaddr)) + return entry; + } + + return NULL; +} + /* This function requires the caller holds hdev->lock */ static int msft_monitor_device_del(struct hci_dev *hdev, __u16 mgmt_handle, bdaddr_t *bdaddr, __u8 addr_type, @@ -240,6 +288,7 @@ static int msft_le_monitor_advertisement_cb(struct hci_dev *hdev, u16 opcode, handle_data->mgmt_handle = monitor->handle; handle_data->msft_handle = rp->handle; + handle_data->cond_type = MSFT_MONITOR_ADVERTISEMENT_TYPE_PATTERN; INIT_LIST_HEAD(&handle_data->list); list_add(&handle_data->list, &msft->handle_map); @@ -254,6 +303,70 @@ unlock: return status; } +/* This function requires the caller holds hci_req_sync_lock */ +static void msft_remove_addr_filters_sync(struct hci_dev *hdev, u8 handle) +{ + struct msft_monitor_addr_filter_data *address_filter, *n; + struct msft_cp_le_cancel_monitor_advertisement cp; + struct msft_data *msft = hdev->msft_data; + struct list_head head; + struct sk_buff *skb; + + INIT_LIST_HEAD(&head); + + /* Cancel all corresponding address monitors */ + mutex_lock(&msft->filter_lock); + + list_for_each_entry_safe(address_filter, n, &msft->address_filters, + list) { + if (address_filter->pattern_handle != handle) + continue; + + list_del(&address_filter->list); + + /* Keep the address filter and let + * msft_add_address_filter_sync() remove and free the address + * filter. 
+ */ + if (address_filter->state == AF_STATE_ADDING) { + address_filter->state = AF_STATE_REMOVING; + continue; + } + + /* Keep the address filter and let + * msft_cancel_address_filter_sync() remove and free the address + * filter + */ + if (address_filter->state == AF_STATE_REMOVING) + continue; + + list_add_tail(&address_filter->list, &head); + } + + mutex_unlock(&msft->filter_lock); + + list_for_each_entry_safe(address_filter, n, &head, list) { + list_del(&address_filter->list); + + cp.sub_opcode = MSFT_OP_LE_CANCEL_MONITOR_ADVERTISEMENT; + cp.handle = address_filter->msft_handle; + + skb = __hci_cmd_sync(hdev, hdev->msft_opcode, sizeof(cp), &cp, + HCI_CMD_TIMEOUT); + if (IS_ERR_OR_NULL(skb)) { + kfree(address_filter); + continue; + } + + kfree_skb(skb); + + bt_dev_dbg(hdev, "MSFT: Canceled device %pMR address filter", + &address_filter->bdaddr); + + kfree(address_filter); + } +} + static int msft_le_cancel_monitor_advertisement_cb(struct hci_dev *hdev, u16 opcode, struct adv_monitor *monitor, @@ -263,6 +376,7 @@ static int msft_le_cancel_monitor_advertisement_cb(struct hci_dev *hdev, struct msft_monitor_advertisement_handle_data *handle_data; struct msft_data *msft = hdev->msft_data; int status = 0; + u8 msft_handle; rp = (struct msft_rp_le_cancel_monitor_advertisement *)skb->data; if (skb->len < sizeof(*rp)) { @@ -293,11 +407,17 @@ static int msft_le_cancel_monitor_advertisement_cb(struct hci_dev *hdev, NULL, 0, false); } + msft_handle = handle_data->msft_handle; + list_del(&handle_data->list); kfree(handle_data); - } - hci_dev_unlock(hdev); + hci_dev_unlock(hdev); + + msft_remove_addr_filters_sync(hdev, msft_handle); + } else { + hci_dev_unlock(hdev); + } done: return status; @@ -394,12 +514,14 @@ static int msft_add_monitor_sync(struct hci_dev *hdev, { struct msft_cp_le_monitor_advertisement *cp; struct msft_le_monitor_advertisement_pattern_data *pattern_data; + struct msft_monitor_advertisement_handle_data *handle_data; struct msft_le_monitor_advertisement_pattern *pattern; struct adv_pattern *entry; size_t total_size = sizeof(*cp) + sizeof(*pattern_data); ptrdiff_t offset = 0; u8 pattern_count = 0; struct sk_buff *skb; + int err; if (!msft_monitor_pattern_valid(monitor)) return -EINVAL; @@ -436,16 +558,31 @@ static int msft_add_monitor_sync(struct hci_dev *hdev, skb = __hci_cmd_sync(hdev, hdev->msft_opcode, total_size, cp, HCI_CMD_TIMEOUT); - kfree(cp); if (IS_ERR_OR_NULL(skb)) { - if (!skb) - return -EIO; - return PTR_ERR(skb); + err = PTR_ERR(skb); + goto out_free; } - return msft_le_monitor_advertisement_cb(hdev, hdev->msft_opcode, - monitor, skb); + err = msft_le_monitor_advertisement_cb(hdev, hdev->msft_opcode, + monitor, skb); + if (err) + goto out_free; + + handle_data = msft_find_handle_data(hdev, monitor->handle, true); + if (!handle_data) { + err = -ENODATA; + goto out_free; + } + + handle_data->rssi_high = cp->rssi_high; + handle_data->rssi_low = cp->rssi_low; + handle_data->rssi_low_interval = cp->rssi_low_interval; + handle_data->rssi_sampling_period = cp->rssi_sampling_period; + +out_free: + kfree(cp); + return err; } /* This function requires the caller holds hci_req_sync_lock */ @@ -538,6 +675,7 @@ void msft_do_close(struct hci_dev *hdev) { struct msft_data *msft = hdev->msft_data; struct msft_monitor_advertisement_handle_data *handle_data, *tmp; + struct msft_monitor_addr_filter_data *address_filter, *n; struct adv_monitor *monitor; if (!msft) @@ -559,6 +697,14 @@ void msft_do_close(struct hci_dev *hdev) kfree(handle_data); } + mutex_lock(&msft->filter_lock); + 
list_for_each_entry_safe(address_filter, n, &msft->address_filters, + list) { + list_del(&address_filter->list); + kfree(address_filter); + } + mutex_unlock(&msft->filter_lock); + hci_dev_lock(hdev); /* Clear any devices that are being monitored and notify device lost */ @@ -568,6 +714,49 @@ void msft_do_close(struct hci_dev *hdev) hci_dev_unlock(hdev); } +static int msft_cancel_address_filter_sync(struct hci_dev *hdev, void *data) +{ + struct msft_monitor_addr_filter_data *address_filter = data; + struct msft_cp_le_cancel_monitor_advertisement cp; + struct msft_data *msft = hdev->msft_data; + struct sk_buff *skb; + int err = 0; + + if (!msft) { + bt_dev_err(hdev, "MSFT: msft data is freed"); + return -EINVAL; + } + + /* The address filter has been removed by hci dev close */ + if (!test_bit(HCI_UP, &hdev->flags)) + return 0; + + mutex_lock(&msft->filter_lock); + list_del(&address_filter->list); + mutex_unlock(&msft->filter_lock); + + cp.sub_opcode = MSFT_OP_LE_CANCEL_MONITOR_ADVERTISEMENT; + cp.handle = address_filter->msft_handle; + + skb = __hci_cmd_sync(hdev, hdev->msft_opcode, sizeof(cp), &cp, + HCI_CMD_TIMEOUT); + if (IS_ERR_OR_NULL(skb)) { + bt_dev_err(hdev, "MSFT: Failed to cancel address (%pMR) filter", + &address_filter->bdaddr); + err = EIO; + goto done; + } + kfree_skb(skb); + + bt_dev_dbg(hdev, "MSFT: Canceled device %pMR address filter", + &address_filter->bdaddr); + +done: + kfree(address_filter); + + return err; +} + void msft_register(struct hci_dev *hdev) { struct msft_data *msft = NULL; @@ -581,7 +770,9 @@ void msft_register(struct hci_dev *hdev) } INIT_LIST_HEAD(&msft->handle_map); + INIT_LIST_HEAD(&msft->address_filters); hdev->msft_data = msft; + mutex_init(&msft->filter_lock); } void msft_unregister(struct hci_dev *hdev) @@ -596,6 +787,7 @@ void msft_unregister(struct hci_dev *hdev) hdev->msft_data = NULL; kfree(msft->evt_prefix); + mutex_destroy(&msft->filter_lock); kfree(msft); } @@ -645,11 +837,149 @@ static void *msft_skb_pull(struct hci_dev *hdev, struct sk_buff *skb, return data; } +static int msft_add_address_filter_sync(struct hci_dev *hdev, void *data) +{ + struct msft_monitor_addr_filter_data *address_filter = data; + struct msft_rp_le_monitor_advertisement *rp; + struct msft_cp_le_monitor_advertisement *cp; + struct msft_data *msft = hdev->msft_data; + struct sk_buff *skb = NULL; + bool remove = false; + size_t size; + + if (!msft) { + bt_dev_err(hdev, "MSFT: msft data is freed"); + return -EINVAL; + } + + /* The address filter has been removed by hci dev close */ + if (!test_bit(HCI_UP, &hdev->flags)) + return -ENODEV; + + /* We are safe to use the address filter from now on. + * msft_monitor_device_evt() wouldn't delete this filter because it's + * not been added by now. + * And all other functions that requiring hci_req_sync_lock wouldn't + * touch this filter before this func completes because it's protected + * by hci_req_sync_lock. 
+ */ + + if (address_filter->state == AF_STATE_REMOVING) { + mutex_lock(&msft->filter_lock); + list_del(&address_filter->list); + mutex_unlock(&msft->filter_lock); + kfree(address_filter); + return 0; + } + + size = sizeof(*cp) + + sizeof(address_filter->addr_type) + + sizeof(address_filter->bdaddr); + cp = kzalloc(size, GFP_KERNEL); + if (!cp) { + bt_dev_err(hdev, "MSFT: Alloc cmd param err"); + remove = true; + goto done; + } + cp->sub_opcode = MSFT_OP_LE_MONITOR_ADVERTISEMENT; + cp->rssi_high = address_filter->rssi_high; + cp->rssi_low = address_filter->rssi_low; + cp->rssi_low_interval = address_filter->rssi_low_interval; + cp->rssi_sampling_period = address_filter->rssi_sampling_period; + cp->cond_type = MSFT_MONITOR_ADVERTISEMENT_TYPE_ADDR; + cp->data[0] = address_filter->addr_type; + memcpy(&cp->data[1], &address_filter->bdaddr, + sizeof(address_filter->bdaddr)); + + skb = __hci_cmd_sync(hdev, hdev->msft_opcode, size, cp, + HCI_CMD_TIMEOUT); + if (IS_ERR_OR_NULL(skb)) { + bt_dev_err(hdev, "Failed to enable address %pMR filter", + &address_filter->bdaddr); + skb = NULL; + remove = true; + goto done; + } + + rp = skb_pull_data(skb, sizeof(*rp)); + if (!rp || rp->sub_opcode != MSFT_OP_LE_MONITOR_ADVERTISEMENT || + rp->status) + remove = true; + +done: + mutex_lock(&msft->filter_lock); + + if (remove) { + bt_dev_warn(hdev, "MSFT: Remove address (%pMR) filter", + &address_filter->bdaddr); + list_del(&address_filter->list); + kfree(address_filter); + } else { + address_filter->state = AF_STATE_ADDED; + address_filter->msft_handle = rp->handle; + bt_dev_dbg(hdev, "MSFT: Address %pMR filter enabled", + &address_filter->bdaddr); + } + mutex_unlock(&msft->filter_lock); + + kfree_skb(skb); + + return 0; +} + +/* This function requires the caller holds msft->filter_lock */ +static struct msft_monitor_addr_filter_data *msft_add_address_filter + (struct hci_dev *hdev, u8 addr_type, bdaddr_t *bdaddr, + struct msft_monitor_advertisement_handle_data *handle_data) +{ + struct msft_monitor_addr_filter_data *address_filter = NULL; + struct msft_data *msft = hdev->msft_data; + int err; + + address_filter = kzalloc(sizeof(*address_filter), GFP_KERNEL); + if (!address_filter) + return NULL; + + address_filter->state = AF_STATE_ADDING; + address_filter->msft_handle = 0xff; + address_filter->pattern_handle = handle_data->msft_handle; + address_filter->mgmt_handle = handle_data->mgmt_handle; + address_filter->rssi_high = handle_data->rssi_high; + address_filter->rssi_low = handle_data->rssi_low; + address_filter->rssi_low_interval = handle_data->rssi_low_interval; + address_filter->rssi_sampling_period = handle_data->rssi_sampling_period; + address_filter->addr_type = addr_type; + bacpy(&address_filter->bdaddr, bdaddr); + + /* With the above AF_STATE_ADDING, duplicated address filter can be + * avoided when receiving monitor device event (found/lost) frequently + * for the same device. 
+ */ + list_add_tail(&address_filter->list, &msft->address_filters); + + err = hci_cmd_sync_queue(hdev, msft_add_address_filter_sync, + address_filter, NULL); + if (err < 0) { + bt_dev_err(hdev, "MSFT: Add address %pMR filter err", bdaddr); + list_del(&address_filter->list); + kfree(address_filter); + return NULL; + } + + bt_dev_dbg(hdev, "MSFT: Add device %pMR address filter", + &address_filter->bdaddr); + + return address_filter; +} + /* This function requires the caller holds hdev->lock */ static void msft_monitor_device_evt(struct hci_dev *hdev, struct sk_buff *skb) { + struct msft_monitor_addr_filter_data *n, *address_filter = NULL; struct msft_ev_le_monitor_device *ev; struct msft_monitor_advertisement_handle_data *handle_data; + struct msft_data *msft = hdev->msft_data; + u16 mgmt_handle = 0xffff; u8 addr_type; ev = msft_skb_pull(hdev, skb, MSFT_EV_LE_MONITOR_DEVICE, sizeof(*ev)); @@ -662,9 +992,53 @@ static void msft_monitor_device_evt(struct hci_dev *hdev, struct sk_buff *skb) ev->monitor_state, &ev->bdaddr); handle_data = msft_find_handle_data(hdev, ev->monitor_handle, false); - if (!handle_data) + + if (!test_bit(HCI_QUIRK_USE_MSFT_EXT_ADDRESS_FILTER, &hdev->quirks)) { + if (!handle_data) + return; + mgmt_handle = handle_data->mgmt_handle; + goto report_state; + } + + if (handle_data) { + /* Don't report any device found/lost event from pattern + * monitors. Pattern monitor always has its address filters for + * tracking devices. + */ + + address_filter = msft_find_address_data(hdev, ev->addr_type, + &ev->bdaddr, + handle_data->msft_handle); + if (address_filter) + return; + + if (ev->monitor_state && handle_data->cond_type == + MSFT_MONITOR_ADVERTISEMENT_TYPE_PATTERN) + msft_add_address_filter(hdev, ev->addr_type, + &ev->bdaddr, handle_data); + return; + } + /* This device event is not from pattern monitor. + * Report it if there is a corresponding address_filter for it. 
+ */ + list_for_each_entry(n, &msft->address_filters, list) { + if (n->state == AF_STATE_ADDED && + n->msft_handle == ev->monitor_handle) { + mgmt_handle = n->mgmt_handle; + address_filter = n; + break; + } + } + + if (!address_filter) { + bt_dev_warn(hdev, "MSFT: Unexpected device event %pMR, %u, %u", + &ev->bdaddr, ev->monitor_handle, ev->monitor_state); + return; + } + +report_state: switch (ev->addr_type) { case ADDR_LE_DEV_PUBLIC: addr_type = BDADDR_LE_PUBLIC; @@ -681,12 +1055,18 @@ static void msft_monitor_device_evt(struct hci_dev *hdev, struct sk_buff *skb) return; } - if (ev->monitor_state) - msft_device_found(hdev, &ev->bdaddr, addr_type, - handle_data->mgmt_handle); - else - msft_device_lost(hdev, &ev->bdaddr, addr_type, - handle_data->mgmt_handle); + if (ev->monitor_state) { + msft_device_found(hdev, &ev->bdaddr, addr_type, mgmt_handle); + } else { + if (address_filter && address_filter->state == AF_STATE_ADDED) { + address_filter->state = AF_STATE_REMOVING; + hci_cmd_sync_queue(hdev, + msft_cancel_address_filter_sync, + address_filter, + NULL); + } + msft_device_lost(hdev, &ev->bdaddr, addr_type, mgmt_handle); + } } void msft_vendor_evt(struct hci_dev *hdev, void *data, struct sk_buff *skb) @@ -724,7 +1104,9 @@ void msft_vendor_evt(struct hci_dev *hdev, void *data, struct sk_buff *skb) switch (*evt) { case MSFT_EV_LE_MONITOR_DEVICE: + mutex_lock(&msft->filter_lock); msft_monitor_device_evt(hdev, skb); + mutex_unlock(&msft->filter_lock); break; default: -- cgit v1.2.3 From c33362a528d968ed7fce21e6a9d4b1334bbbb25c Mon Sep 17 00:00:00 2001 From: Claudia Draghicescu Date: Thu, 29 Jun 2023 16:45:18 +0300 Subject: Bluetooth: hci_sync: Enable events for BIS capable devices In the case of a Synchronized Receiver capable device, enable at start-up the events for PA reports, PA Sync Established and Big Info Adv reports. Signed-off-by: Claudia Draghicescu Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_sync.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index afb8e970e62c..73b49d0c1e11 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -4135,10 +4135,13 @@ static int hci_le_set_event_mask_sync(struct hci_dev *hdev) } if (bis_capable(hdev)) { + events[1] |= 0x20; /* LE PA Report */ + events[1] |= 0x40; /* LE PA Sync Established */ events[3] |= 0x04; /* LE Create BIG Complete */ events[3] |= 0x08; /* LE Terminate BIG Complete */ events[3] |= 0x10; /* LE BIG Sync Established */ events[3] |= 0x20; /* LE BIG Sync Loss */ + events[4] |= 0x02; /* LE BIG Info Advertising Report */ } return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EVENT_MASK, -- cgit v1.2.3 From a13f316e90fdb1fb6df6582e845aa9b3270f3581 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 26 Jun 2023 17:25:06 -0700 Subject: Bluetooth: hci_conn: Consolidate code for aborting connections This consolidates code for aborting connections using hci_cmd_sync_queue so it is synchronized with other threads, but because of the fact that some commands may block the cmd_sync_queue while waiting specific events this attempt to cancel those requests by using hci_cmd_sync_cancel. 
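A rough standalone model of the resulting flow is sketched below; the _stub names are illustrative placeholders for the hci_cmd_sync_* helpers, not the real kernel API. The point is the ordering: record the abort reason once, cancel a connect command that may be blocking the sync queue, then queue the actual abort work.

    #include <stdio.h>

    struct conn {
            int state;           /* 0 = connected, 1 = connecting */
            int abort_reason;    /* 0 means "no abort requested yet" */
    };

    static int req_pending = 1;  /* models hdev->req_status == HCI_REQ_PEND */

    static void cmd_sync_cancel_stub(void)
    {
            req_pending = 0;
            printf("cancelled pending blocking command\n");
    }

    static void cmd_sync_queue_stub(struct conn *c)
    {
            printf("queued abort, reason 0x%02x\n", c->abort_reason);
    }

    static int abort_conn_stub(struct conn *c, int reason)
    {
            /* An abort is already in flight; don't overwrite its reason. */
            if (c->abort_reason)
                    return 0;

            c->abort_reason = reason;

            /* A connect attempt may block the sync queue while waiting for
             * its completion event, so cancel it before queueing the abort. */
            if (c->state == 1 && req_pending)
                    cmd_sync_cancel_stub();

            cmd_sync_queue_stub(c);
            return 0;
    }

    int main(void)
    {
            struct conn c = { .state = 1, .abort_reason = 0 };

            abort_conn_stub(&c, 0x13);  /* e.g. remote user terminated */
            abort_conn_stub(&c, 0x16);  /* second abort is a no-op */
            return 0;
    }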
Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 2 +- net/bluetooth/hci_conn.c | 154 ++++++++------------------------------- net/bluetooth/hci_sync.c | 23 ++++-- net/bluetooth/mgmt.c | 15 +--- 4 files changed, 47 insertions(+), 147 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 9140d4a80e38..2dd59e3a51b3 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -739,6 +739,7 @@ struct hci_conn { unsigned long flags; enum conn_reasons conn_reason; + __u8 abort_reason; __u32 clock; __u16 clock_accuracy; @@ -758,7 +759,6 @@ struct hci_conn { struct delayed_work auto_accept_work; struct delayed_work idle_work; struct delayed_work le_conn_timeout; - struct work_struct le_scan_cleanup; struct device dev; struct dentry *debugfs; diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 0f89daafe194..fa9236dfba3e 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -178,57 +178,6 @@ static void hci_conn_cleanup(struct hci_conn *conn) hci_conn_put(conn); } -static void le_scan_cleanup(struct work_struct *work) -{ - struct hci_conn *conn = container_of(work, struct hci_conn, - le_scan_cleanup); - struct hci_dev *hdev = conn->hdev; - struct hci_conn *c = NULL; - - BT_DBG("%s hcon %p", hdev->name, conn); - - hci_dev_lock(hdev); - - /* Check that the hci_conn is still around */ - rcu_read_lock(); - list_for_each_entry_rcu(c, &hdev->conn_hash.list, list) { - if (c == conn) - break; - } - rcu_read_unlock(); - - if (c == conn) { - hci_connect_le_scan_cleanup(conn, 0x00); - hci_conn_cleanup(conn); - } - - hci_dev_unlock(hdev); - hci_dev_put(hdev); - hci_conn_put(conn); -} - -static void hci_connect_le_scan_remove(struct hci_conn *conn) -{ - BT_DBG("%s hcon %p", conn->hdev->name, conn); - - /* We can't call hci_conn_del/hci_conn_cleanup here since that - * could deadlock with another hci_conn_del() call that's holding - * hci_dev_lock and doing cancel_delayed_work_sync(&conn->disc_work). - * Instead, grab temporary extra references to the hci_dev and - * hci_conn and perform the necessary cleanup in a separate work - * callback. - */ - - hci_dev_hold(conn->hdev); - hci_conn_get(conn); - - /* Even though we hold a reference to the hdev, many other - * things might get cleaned up meanwhile, including the hdev's - * own workqueue, so we can't use that for scheduling. 
- */ - schedule_work(&conn->le_scan_cleanup); -} - static void hci_acl_create_connection(struct hci_conn *conn) { struct hci_dev *hdev = conn->hdev; @@ -679,13 +628,6 @@ static void hci_conn_timeout(struct work_struct *work) if (refcnt > 0) return; - /* LE connections in scanning state need special handling */ - if (conn->state == BT_CONNECT && conn->type == LE_LINK && - test_bit(HCI_CONN_SCANNING, &conn->flags)) { - hci_connect_le_scan_remove(conn); - return; - } - hci_abort_conn(conn, hci_proto_disconn_ind(conn)); } @@ -1066,7 +1008,6 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst, INIT_DELAYED_WORK(&conn->auto_accept_work, hci_conn_auto_accept); INIT_DELAYED_WORK(&conn->idle_work, hci_conn_idle); INIT_DELAYED_WORK(&conn->le_conn_timeout, le_conn_timeout); - INIT_WORK(&conn->le_scan_cleanup, le_scan_cleanup); atomic_set(&conn->refcnt, 0); @@ -2888,81 +2829,46 @@ u32 hci_conn_get_phy(struct hci_conn *conn) return phys; } -int hci_abort_conn(struct hci_conn *conn, u8 reason) +static int abort_conn_sync(struct hci_dev *hdev, void *data) { - int r = 0; + struct hci_conn *conn; + u16 handle = PTR_ERR(data); - if (test_and_set_bit(HCI_CONN_CANCEL, &conn->flags)) + conn = hci_conn_hash_lookup_handle(hdev, handle); + if (!conn) return 0; - switch (conn->state) { - case BT_CONNECTED: - case BT_CONFIG: - if (conn->type == AMP_LINK) { - struct hci_cp_disconn_phy_link cp; + return hci_abort_conn_sync(hdev, conn, conn->abort_reason); +} - cp.phy_handle = HCI_PHY_HANDLE(conn->handle); - cp.reason = reason; - r = hci_send_cmd(conn->hdev, HCI_OP_DISCONN_PHY_LINK, - sizeof(cp), &cp); - } else { - struct hci_cp_disconnect dc; +int hci_abort_conn(struct hci_conn *conn, u8 reason) +{ + struct hci_dev *hdev = conn->hdev; - dc.handle = cpu_to_le16(conn->handle); - dc.reason = reason; - r = hci_send_cmd(conn->hdev, HCI_OP_DISCONNECT, - sizeof(dc), &dc); - } + /* If abort_reason has already been set it means the connection is + * already being aborted so don't attempt to overwrite it. + */ + if (conn->abort_reason) + return 0; - conn->state = BT_DISCONN; + bt_dev_dbg(hdev, "handle 0x%2.2x reason 0x%2.2x", conn->handle, reason); - break; - case BT_CONNECT: - if (conn->type == LE_LINK) { - if (test_bit(HCI_CONN_SCANNING, &conn->flags)) - break; - r = hci_send_cmd(conn->hdev, - HCI_OP_LE_CREATE_CONN_CANCEL, 0, NULL); - } else if (conn->type == ACL_LINK) { - if (conn->hdev->hci_ver < BLUETOOTH_VER_1_2) - break; - r = hci_send_cmd(conn->hdev, - HCI_OP_CREATE_CONN_CANCEL, - 6, &conn->dst); - } - break; - case BT_CONNECT2: - if (conn->type == ACL_LINK) { - struct hci_cp_reject_conn_req rej; - - bacpy(&rej.bdaddr, &conn->dst); - rej.reason = reason; - - r = hci_send_cmd(conn->hdev, - HCI_OP_REJECT_CONN_REQ, - sizeof(rej), &rej); - } else if (conn->type == SCO_LINK || conn->type == ESCO_LINK) { - struct hci_cp_reject_sync_conn_req rej; - - bacpy(&rej.bdaddr, &conn->dst); - - /* SCO rejection has its own limited set of - * allowed error values (0x0D-0x0F) which isn't - * compatible with most values passed to this - * function. To be safe hard-code one of the - * values that's suitable for SCO. - */ - rej.reason = HCI_ERROR_REJ_LIMITED_RESOURCES; + conn->abort_reason = reason; - r = hci_send_cmd(conn->hdev, - HCI_OP_REJECT_SYNC_CONN_REQ, - sizeof(rej), &rej); + /* If the connection is pending check the command opcode since that + * might be blocking on hci_cmd_sync_work while waiting its respective + * event so we need to hci_cmd_sync_cancel to cancel it. 
+ */ + if (conn->state == BT_CONNECT && hdev->req_status == HCI_REQ_PEND) { + switch (hci_skb_event(hdev->sent_cmd)) { + case HCI_EV_LE_CONN_COMPLETE: + case HCI_EV_LE_ENHANCED_CONN_COMPLETE: + case HCI_EVT_LE_CIS_ESTABLISHED: + hci_cmd_sync_cancel(hdev, -ECANCELED); + break; } - break; - default: - conn->state = BT_CLOSED; - break; } - return r; + return hci_cmd_sync_queue(hdev, abort_conn_sync, ERR_PTR(conn->handle), + NULL); } diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 73b49d0c1e11..5f7a901709b5 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -5274,22 +5274,27 @@ static int hci_disconnect_sync(struct hci_dev *hdev, struct hci_conn *conn, } static int hci_le_connect_cancel_sync(struct hci_dev *hdev, - struct hci_conn *conn) + struct hci_conn *conn, u8 reason) { + /* Return reason if scanning since the connection shall probably be + * cleanup directly. + */ if (test_bit(HCI_CONN_SCANNING, &conn->flags)) - return 0; + return reason; - if (test_and_set_bit(HCI_CONN_CANCEL, &conn->flags)) + if (conn->role == HCI_ROLE_SLAVE || + test_and_set_bit(HCI_CONN_CANCEL, &conn->flags)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_LE_CREATE_CONN_CANCEL, 0, NULL, HCI_CMD_TIMEOUT); } -static int hci_connect_cancel_sync(struct hci_dev *hdev, struct hci_conn *conn) +static int hci_connect_cancel_sync(struct hci_dev *hdev, struct hci_conn *conn, + u8 reason) { if (conn->type == LE_LINK) - return hci_le_connect_cancel_sync(hdev, conn); + return hci_le_connect_cancel_sync(hdev, conn, reason); if (hdev->hci_ver < BLUETOOTH_VER_1_2) return 0; @@ -5342,9 +5347,11 @@ int hci_abort_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason) case BT_CONFIG: return hci_disconnect_sync(hdev, conn, reason); case BT_CONNECT: - err = hci_connect_cancel_sync(hdev, conn); + err = hci_connect_cancel_sync(hdev, conn, reason); /* Cleanup hci_conn object if it cannot be cancelled as it - * likelly means the controller and host stack are out of sync. + * likelly means the controller and host stack are out of sync + * or in case of LE it was still scanning so it can be cleanup + * safely. */ if (err) { hci_dev_lock(hdev); @@ -6258,7 +6265,7 @@ int hci_le_create_conn_sync(struct hci_dev *hdev, struct hci_conn *conn) done: if (err == -ETIMEDOUT) - hci_le_connect_cancel_sync(hdev, conn); + hci_le_connect_cancel_sync(hdev, conn, 0x00); /* Re-enable advertising after the connection attempt is finished. 
*/ hci_resume_advertising_sync(hdev); diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 4c352abe063b..5707ff468842 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -3586,18 +3586,6 @@ unlock: return err; } -static int abort_conn_sync(struct hci_dev *hdev, void *data) -{ - struct hci_conn *conn; - u16 handle = PTR_ERR(data); - - conn = hci_conn_hash_lookup_handle(hdev, handle); - if (!conn) - return 0; - - return hci_abort_conn_sync(hdev, conn, HCI_ERROR_REMOTE_USER_TERM); -} - static int cancel_pair_device(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { @@ -3648,8 +3636,7 @@ static int cancel_pair_device(struct sock *sk, struct hci_dev *hdev, void *data, le_addr_type(addr->type)); if (conn->conn_reason == CONN_REASON_PAIR_DEVICE) - hci_cmd_sync_queue(hdev, abort_conn_sync, ERR_PTR(conn->handle), - NULL); + hci_abort_conn(conn, HCI_ERROR_REMOTE_USER_TERM); unlock: hci_dev_unlock(hdev); -- cgit v1.2.3 From 04a51d616929eb96b7a3e547bc11d3bb46af2c9f Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Tue, 27 Jun 2023 15:55:47 -0700 Subject: Bluetooth: hci_sync: Fix not handling ISO_LINK in hci_abort_conn_sync ISO_LINK connections where not being handled properly on hci_abort_conn_sync which sometimes resulted in sending the wrong commands, or in case of having the reject command being sent by the socket code (iso.c) which is sort of a layer violation. Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_conn.c | 23 +++++++++++++++++++---- net/bluetooth/hci_sync.c | 34 ++++++++++++++++++++++++++++++++++ net/bluetooth/iso.c | 14 -------------- 3 files changed, 53 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index fa9236dfba3e..a0ffe7db412b 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -1223,7 +1223,12 @@ void hci_conn_failed(struct hci_conn *conn, u8 status) static void create_le_conn_complete(struct hci_dev *hdev, void *data, int err) { - struct hci_conn *conn = data; + struct hci_conn *conn; + u16 handle = PTR_ERR(data); + + conn = hci_conn_hash_lookup_handle(hdev, handle); + if (!conn) + return; bt_dev_dbg(hdev, "err %d", err); @@ -1248,10 +1253,17 @@ done: static int hci_connect_le_sync(struct hci_dev *hdev, void *data) { - struct hci_conn *conn = data; + struct hci_conn *conn; + u16 handle = PTR_ERR(data); + + conn = hci_conn_hash_lookup_handle(hdev, handle); + if (!conn) + return 0; bt_dev_dbg(hdev, "conn %p", conn); + conn->state = BT_CONNECT; + return hci_le_create_conn_sync(hdev, conn); } @@ -1321,10 +1333,10 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, conn->sec_level = BT_SECURITY_LOW; conn->conn_timeout = conn_timeout; - conn->state = BT_CONNECT; clear_bit(HCI_CONN_SCANNING, &conn->flags); - err = hci_cmd_sync_queue(hdev, hci_connect_le_sync, conn, + err = hci_cmd_sync_queue(hdev, hci_connect_le_sync, + ERR_PTR(conn->handle), create_le_conn_complete); if (err) { hci_conn_del(conn); @@ -2858,6 +2870,9 @@ int hci_abort_conn(struct hci_conn *conn, u8 reason) /* If the connection is pending check the command opcode since that * might be blocking on hci_cmd_sync_work while waiting its respective * event so we need to hci_cmd_sync_cancel to cancel it. + * + * hci_connect_le serializes the connection attempts so only one + * connection can be in BT_CONNECT at time. 
*/ if (conn->state == BT_CONNECT && hdev->req_status == HCI_REQ_PEND) { switch (hci_skb_event(hdev->sent_cmd)) { diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 5f7a901709b5..c085b54d158b 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -5296,6 +5296,24 @@ static int hci_connect_cancel_sync(struct hci_dev *hdev, struct hci_conn *conn, if (conn->type == LE_LINK) return hci_le_connect_cancel_sync(hdev, conn, reason); + if (conn->type == ISO_LINK) { + /* BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E + * page 1857: + * + * If this command is issued for a CIS on the Central and the + * CIS is successfully terminated before being established, + * then an HCI_LE_CIS_Established event shall also be sent for + * this CIS with the Status Operation Cancelled by Host (0x44). + */ + if (test_bit(HCI_CONN_CREATE_CIS, &conn->flags)) + return hci_disconnect_sync(hdev, conn, reason); + + /* There is no way to cancel a BIS without terminating the BIG + * which is done later on connection cleanup. + */ + return 0; + } + if (hdev->hci_ver < BLUETOOTH_VER_1_2) return 0; @@ -5322,11 +5340,27 @@ static int hci_reject_sco_sync(struct hci_dev *hdev, struct hci_conn *conn, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } +static int hci_le_reject_cis_sync(struct hci_dev *hdev, struct hci_conn *conn, + u8 reason) +{ + struct hci_cp_le_reject_cis cp; + + memset(&cp, 0, sizeof(cp)); + cp.handle = cpu_to_le16(conn->handle); + cp.reason = reason; + + return __hci_cmd_sync_status(hdev, HCI_OP_LE_REJECT_CIS, + sizeof(cp), &cp, HCI_CMD_TIMEOUT); +} + static int hci_reject_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason) { struct hci_cp_reject_conn_req cp; + if (conn->type == ISO_LINK) + return hci_le_reject_cis_sync(hdev, conn, reason); + if (conn->type == SCO_LINK || conn->type == ESCO_LINK) return hci_reject_sco_sync(hdev, conn, reason); diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index 84d238d0639a..9c41af55f2c7 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -614,18 +614,6 @@ static void iso_sock_kill(struct sock *sk) sock_put(sk); } -static void iso_conn_defer_reject(struct hci_conn *conn) -{ - struct hci_cp_le_reject_cis cp; - - BT_DBG("conn %p", conn); - - memset(&cp, 0, sizeof(cp)); - cp.handle = cpu_to_le16(conn->handle); - cp.reason = HCI_ERROR_REJ_BAD_ADDR; - hci_send_cmd(conn->hdev, HCI_OP_LE_REJECT_CIS, sizeof(cp), &cp); -} - static void __iso_sock_close(struct sock *sk) { BT_DBG("sk %p state %d socket %p", sk, sk->sk_state, sk->sk_socket); @@ -650,8 +638,6 @@ static void __iso_sock_close(struct sock *sk) break; case BT_CONNECT2: - if (iso_pi(sk)->conn->hcon) - iso_conn_defer_reject(iso_pi(sk)->conn->hcon); iso_chan_del(sk, ECONNRESET); break; case BT_CONNECT: -- cgit v1.2.3 From 9f78191cc9f1b34c2e2afd7b554a83bf034092dd Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 28 Jun 2023 12:15:53 -0700 Subject: Bluetooth: hci_conn: Always allocate unique handles This attempts to always allocate a unique handle for connections so they can be properly aborted by the likes of hci_abort_conn, so this uses the invalid range as a pool of unset handles that way if userspace is trying to create multiple connections at once each will be given a unique handle which will be considered unset. 
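Sketched in standalone C, the idea looks roughly like this; the sorted in_use[] array is only a stand-in for the connection hash, and HANDLE_UNSET mirrors the new HCI_CONN_HANDLE_UNSET() macro. This is an illustration of the allocation logic, not the kernel code itself.

    #include <stdio.h>

    #define HCI_CONN_HANDLE_MAX  0x0eff
    #define HANDLE_UNSET(h)      ((h) > HCI_CONN_HANDLE_MAX)

    static unsigned short alloc_unset_handle(const unsigned short *in_use, int n)
    {
            unsigned short handle = HCI_CONN_HANDLE_MAX + 1;
            int i;

            /* Walk the (sorted) placeholders already handed out and take the
             * first gap, mirroring hci_conn_hash_alloc_unset(). */
            for (i = 0; i < n; i++) {
                    if (handle == 0xffff || in_use[i] != handle)
                            break;
                    handle++;
            }

            return handle;
    }

    int main(void)
    {
            unsigned short in_use[] = { 0x0f00, 0x0f01 };  /* two pending conns */
            unsigned short h = alloc_unset_handle(in_use, 2);

            printf("placeholder handle 0x%04x, unset=%d\n", h, HANDLE_UNSET(h));
            return 0;
    }

Handles above HCI_CONN_HANDLE_MAX are never valid on the air, so they can safely serve as a pool of unique "unset" placeholders until the controller assigns the real handle.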
Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 2 +- net/bluetooth/hci_conn.c | 25 ++++++++++++++++++++++--- net/bluetooth/hci_event.c | 6 +++--- 3 files changed, 26 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 2dd59e3a51b3..491ab83ccafc 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -321,8 +321,8 @@ struct adv_monitor { #define HCI_MAX_SHORT_NAME_LENGTH 10 -#define HCI_CONN_HANDLE_UNSET 0xffff #define HCI_CONN_HANDLE_MAX 0x0eff +#define HCI_CONN_HANDLE_UNSET(_handle) (_handle > HCI_CONN_HANDLE_MAX) /* Min encryption key size to match with SMP */ #define HCI_MIN_ENC_KEY_SIZE 7 diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index a0ffe7db412b..af7dc8131a8c 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -932,6 +932,25 @@ static void cis_cleanup(struct hci_conn *conn) hci_le_remove_cig(hdev, conn->iso_qos.ucast.cig); } +static u16 hci_conn_hash_alloc_unset(struct hci_dev *hdev) +{ + struct hci_conn_hash *h = &hdev->conn_hash; + struct hci_conn *c; + u16 handle = HCI_CONN_HANDLE_MAX + 1; + + rcu_read_lock(); + + list_for_each_entry_rcu(c, &h->list, list) { + /* Find the first unused handle */ + if (handle == 0xffff || c->handle != handle) + break; + handle++; + } + rcu_read_unlock(); + + return handle; +} + struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst, u8 role) { @@ -945,7 +964,7 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst, bacpy(&conn->dst, dst); bacpy(&conn->src, &hdev->bdaddr); - conn->handle = HCI_CONN_HANDLE_UNSET; + conn->handle = hci_conn_hash_alloc_unset(hdev); conn->hdev = hdev; conn->type = type; conn->role = role; @@ -1057,7 +1076,7 @@ static void hci_conn_unlink(struct hci_conn *conn) */ if ((child->type == SCO_LINK || child->type == ESCO_LINK) && - child->handle == HCI_CONN_HANDLE_UNSET) + HCI_CONN_HANDLE_UNSET(child->handle)) hci_conn_del(child); } @@ -1943,7 +1962,7 @@ int hci_conn_check_create_cis(struct hci_conn *conn) return -EINVAL; if (!conn->parent || conn->parent->state != BT_CONNECTED || - conn->state != BT_CONNECT || conn->handle == HCI_CONN_HANDLE_UNSET) + conn->state != BT_CONNECT || HCI_CONN_HANDLE_UNSET(conn->handle)) return 1; return 0; diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index c67612c99f89..90cfd30616f5 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -3173,7 +3173,7 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, void *data, * As the connection handle is set here for the first time, it indicates * whether the connection is already set up. */ - if (conn->handle != HCI_CONN_HANDLE_UNSET) { + if (!HCI_CONN_HANDLE_UNSET(conn->handle)) { bt_dev_err(hdev, "Ignoring HCI_Connection_Complete for existing connection"); goto unlock; } @@ -5032,7 +5032,7 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev, void *data, * As the connection handle is set here for the first time, it indicates * whether the connection is already set up. */ - if (conn->handle != HCI_CONN_HANDLE_UNSET) { + if (!HCI_CONN_HANDLE_UNSET(conn->handle)) { bt_dev_err(hdev, "Ignoring HCI_Sync_Conn_Complete event for existing connection"); goto unlock; } @@ -5896,7 +5896,7 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status, * As the connection handle is set here for the first time, it indicates * whether the connection is already set up. 
*/ - if (conn->handle != HCI_CONN_HANDLE_UNSET) { + if (!HCI_CONN_HANDLE_UNSET(conn->handle)) { bt_dev_err(hdev, "Ignoring HCI_Connection_Complete for existing connection"); goto unlock; } -- cgit v1.2.3 From f777d88278170410b06a1f6633f3b9375a4ddd6b Mon Sep 17 00:00:00 2001 From: Iulia Tanasescu Date: Mon, 3 Jul 2023 10:02:38 +0300 Subject: Bluetooth: ISO: Notify user space about failed bis connections Some use cases require the user to be informed if BIG synchronization fails. This commit makes it so that even if the BIG sync established event arrives with error status, a new hconn is added for each BIS, and the iso layer is notified about the failed connections. Unsuccesful bis connections will be marked using the HCI_CONN_BIG_SYNC_FAILED flag. From the iso layer, the POLLERR event is triggered on the newly allocated bis sockets, before adding them to the accept list of the parent socket. From user space, a new fd for each failed bis connection will be obtained by calling accept. The user should check for the POLLERR event on the new socket, to determine if the connection was successful or not. The HCI_CONN_BIG_SYNC flag has been added to mark whether the BIG sync has been successfully established. This flag is checked at bis cleanup, so the HCI LE BIG Terminate Sync command is only issued if needed. The BT_SK_BIG_SYNC flag indicates if BIG create sync has been called for a listening socket, to avoid issuing the command everytime a BIGInfo advertising report is received. Signed-off-by: Iulia Tanasescu Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 25 +++++++++++++++++++++++++ net/bluetooth/hci_conn.c | 37 ++++++++++++++----------------------- net/bluetooth/hci_event.c | 21 +++++++++++++++++---- net/bluetooth/hci_sync.c | 8 ++++++++ net/bluetooth/iso.c | 40 +++++++++++++++++++++++++++++----------- 5 files changed, 93 insertions(+), 38 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 491ab83ccafc..105c1c394f82 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -976,6 +976,8 @@ enum { HCI_CONN_PER_ADV, HCI_CONN_BIG_CREATED, HCI_CONN_CREATE_CIS, + HCI_CONN_BIG_SYNC, + HCI_CONN_BIG_SYNC_FAILED, }; static inline bool hci_conn_ssp_enabled(struct hci_conn *conn) @@ -1286,6 +1288,29 @@ static inline struct hci_conn *hci_conn_hash_lookup_big(struct hci_dev *hdev, return NULL; } +static inline struct hci_conn *hci_conn_hash_lookup_big_any_dst(struct hci_dev *hdev, + __u8 handle) +{ + struct hci_conn_hash *h = &hdev->conn_hash; + struct hci_conn *c; + + rcu_read_lock(); + + list_for_each_entry_rcu(c, &h->list, list) { + if (c->type != ISO_LINK) + continue; + + if (handle == c->iso_qos.bcast.big) { + rcu_read_unlock(); + return c; + } + } + + rcu_read_unlock(); + + return NULL; +} + static inline struct hci_conn *hci_conn_hash_lookup_state(struct hci_dev *hdev, __u8 type, __u16 state) { diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index af7dc8131a8c..cccc2b8b60a8 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -735,6 +735,7 @@ struct iso_list_data { int count; struct iso_cig_params pdu; bool big_term; + bool big_sync_term; }; static void bis_list(struct hci_conn *conn, void *data) @@ -752,17 +753,6 @@ static void bis_list(struct hci_conn *conn, void *data) d->count++; } -static void find_bis(struct hci_conn *conn, void *data) -{ - struct iso_list_data *d = data; - - /* Ignore unicast */ - if (bacmp(&conn->dst, BDADDR_ANY)) 
- return; - - d->count++; -} - static int terminate_big_sync(struct hci_dev *hdev, void *data) { struct iso_list_data *d = data; @@ -815,31 +805,26 @@ static int big_terminate_sync(struct hci_dev *hdev, void *data) bt_dev_dbg(hdev, "big 0x%2.2x sync_handle 0x%4.4x", d->big, d->sync_handle); - /* Check if ISO connection is a BIS and terminate BIG if there are - * no other connections using it. - */ - hci_conn_hash_list_state(hdev, find_bis, ISO_LINK, BT_CONNECTED, d); - if (d->count) - return 0; - - hci_le_big_terminate_sync(hdev, d->big); + if (d->big_sync_term) + hci_le_big_terminate_sync(hdev, d->big); return hci_le_pa_terminate_sync(hdev, d->sync_handle); } -static int hci_le_big_terminate(struct hci_dev *hdev, u8 big, u16 sync_handle) +static int hci_le_big_terminate(struct hci_dev *hdev, u8 big, struct hci_conn *conn) { struct iso_list_data *d; int ret; - bt_dev_dbg(hdev, "big 0x%2.2x sync_handle 0x%4.4x", big, sync_handle); + bt_dev_dbg(hdev, "big 0x%2.2x sync_handle 0x%4.4x", big, conn->sync_handle); d = kzalloc(sizeof(*d), GFP_KERNEL); if (!d) return -ENOMEM; d->big = big; - d->sync_handle = sync_handle; + d->sync_handle = conn->sync_handle; + d->big_sync_term = test_and_clear_bit(HCI_CONN_BIG_SYNC, &conn->flags); ret = hci_cmd_sync_queue(hdev, big_terminate_sync, d, terminate_big_destroy); @@ -875,8 +860,14 @@ static void bis_cleanup(struct hci_conn *conn) hci_le_terminate_big(hdev, conn); } else { + bis = hci_conn_hash_lookup_big_any_dst(hdev, + conn->iso_qos.bcast.big); + + if (bis) + return; + hci_le_big_terminate(hdev, conn->iso_qos.bcast.big, - conn->sync_handle); + conn); } } diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 90cfd30616f5..c29eece88d2c 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -7039,9 +7039,6 @@ static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data, flex_array_size(ev, bis, ev->num_bis))) return; - if (ev->status) - return; - hci_dev_lock(hdev); for (i = 0; i < ev->num_bis; i++) { @@ -7065,9 +7062,25 @@ static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data, bis->iso_qos.bcast.in.latency = le16_to_cpu(ev->interval) * 125 / 100; bis->iso_qos.bcast.in.sdu = le16_to_cpu(ev->max_pdu); - hci_iso_setup_path(bis); + if (!ev->status) { + set_bit(HCI_CONN_BIG_SYNC, &bis->flags); + hci_iso_setup_path(bis); + } } + /* In case BIG sync failed, notify each failed connection to + * the user after all hci connections have been added + */ + if (ev->status) + for (i = 0; i < ev->num_bis; i++) { + u16 handle = le16_to_cpu(ev->bis[i]); + + bis = hci_conn_hash_lookup_handle(hdev, handle); + + set_bit(HCI_CONN_BIG_SYNC_FAILED, &bis->flags); + hci_connect_cfm(bis, ev->status); + } + hci_dev_unlock(hdev); } diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index c085b54d158b..3348a1b0e3f7 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -5395,6 +5395,14 @@ int hci_abort_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason) return err; case BT_CONNECT2: return hci_reject_conn_sync(hdev, conn, reason); + case BT_OPEN: + /* Cleanup bises that failed to be established */ + if (test_and_clear_bit(HCI_CONN_BIG_SYNC_FAILED, &conn->flags)) { + hci_dev_lock(hdev); + hci_conn_failed(conn, reason); + hci_dev_unlock(hdev); + } + break; default: conn->state = BT_CLOSED; break; diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index 9c41af55f2c7..efac284badbc 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -48,6 +48,11 @@ static 
void iso_sock_kill(struct sock *sk); #define EIR_SERVICE_DATA_LENGTH 4 #define BASE_MAX_LENGTH (HCI_MAX_PER_AD_LENGTH - EIR_SERVICE_DATA_LENGTH) +/* iso_pinfo flags values */ +enum { + BT_SK_BIG_SYNC, +}; + struct iso_pinfo { struct bt_sock bt; bdaddr_t src; @@ -58,7 +63,7 @@ struct iso_pinfo { __u8 bc_num_bis; __u8 bc_bis[ISO_MAX_NUM_BIS]; __u16 sync_handle; - __u32 flags; + unsigned long flags; struct bt_iso_qos qos; bool qos_user_set; __u8 base_len; @@ -1555,6 +1560,12 @@ static void iso_conn_ready(struct iso_conn *conn) hci_conn_hold(hcon); iso_chan_add(conn, sk, parent); + if (ev && ((struct hci_evt_le_big_sync_estabilished *)ev)->status) { + /* Trigger error signal on child socket */ + sk->sk_err = ECONNREFUSED; + sk->sk_error_report(sk); + } + if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(parent)->flags)) sk->sk_state = BT_CONNECT2; else @@ -1623,15 +1634,17 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) if (ev2->num_bis < iso_pi(sk)->bc_num_bis) iso_pi(sk)->bc_num_bis = ev2->num_bis; - err = hci_le_big_create_sync(hdev, - &iso_pi(sk)->qos, - iso_pi(sk)->sync_handle, - iso_pi(sk)->bc_num_bis, - iso_pi(sk)->bc_bis); - if (err) { - bt_dev_err(hdev, "hci_le_big_create_sync: %d", - err); - sk = NULL; + if (!test_and_set_bit(BT_SK_BIG_SYNC, &iso_pi(sk)->flags)) { + err = hci_le_big_create_sync(hdev, + &iso_pi(sk)->qos, + iso_pi(sk)->sync_handle, + iso_pi(sk)->bc_num_bis, + iso_pi(sk)->bc_bis); + if (err) { + bt_dev_err(hdev, "hci_le_big_create_sync: %d", + err); + sk = NULL; + } } } } else { @@ -1674,7 +1687,12 @@ static void iso_connect_cfm(struct hci_conn *hcon, __u8 status) BT_DBG("hcon %p bdaddr %pMR status %d", hcon, &hcon->dst, status); - if (!status) { + /* Similar to the success case, if HCI_CONN_BIG_SYNC_FAILED is set, + * queue the failed bis connection into the accept queue of the + * listening socket and wake up userspace, to inform the user about + * the BIG sync failed event. + */ + if (!status || test_bit(HCI_CONN_BIG_SYNC_FAILED, &hcon->flags)) { struct iso_conn *conn; conn = iso_conn_add(hcon); -- cgit v1.2.3 From 528b2acf434bf4a0fb2969e5222fbe790b95f422 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 3 Jul 2023 17:17:16 +0300 Subject: Bluetooth: msft: Fix error code in msft_cancel_address_filter_sync() Return negative -EIO instead of positive EIO. Fixes: 926df8962f3f ("Bluetooth: msft: Extended monitor tracking by address filter") Signed-off-by: Dan Carpenter Reviewed-by: Simon Horman Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/msft.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/msft.c b/net/bluetooth/msft.c index b80a2162a5c3..abbafa6194ca 100644 --- a/net/bluetooth/msft.c +++ b/net/bluetooth/msft.c @@ -743,7 +743,7 @@ static int msft_cancel_address_filter_sync(struct hci_dev *hdev, void *data) if (IS_ERR_OR_NULL(skb)) { bt_dev_err(hdev, "MSFT: Failed to cancel address (%pMR) filter", &address_filter->bdaddr); - err = EIO; + err = -EIO; goto done; } kfree_skb(skb); -- cgit v1.2.3 From 6f55eea116ba3646fb5fbb31de703f8cf79d8214 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Fri, 30 Jun 2023 15:33:15 -0700 Subject: Bluetooth: hci_sync: Don't double print name in add/remove adv_monitor The hci_add_adv_monitor() hci_remove_adv_monitor() functions call bt_dev_dbg() to print some debug statements. The bt_dev_dbg() macro automatically adds in the device's name. That means that we shouldn't include the name in the bt_dev_dbg() calls. 
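Illustration (not part of the patch, values made up): bt_dev_dbg() already prefixes every message with the device name, so passing hdev->name again duplicates it in the log output, roughly as follows.

	/* before: name printed twice, e.g. "hci0: hci0 add monitor 1 status 0" */
	bt_dev_dbg(hdev, "%s add monitor %d status %d", hdev->name,
		   monitor->handle, status);

	/* after: only the macro's own prefix remains, e.g. "hci0: add monitor 1 status 0" */
	bt_dev_dbg(hdev, "add monitor %d status %d", monitor->handle, status);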
Suggested-by: Luiz Augusto von Dentz Signed-off-by: Douglas Anderson Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_core.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 1ec83985f1ab..821ae737e85b 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -1949,14 +1949,14 @@ int hci_add_adv_monitor(struct hci_dev *hdev, struct adv_monitor *monitor) switch (hci_get_adv_monitor_offload_ext(hdev)) { case HCI_ADV_MONITOR_EXT_NONE: - bt_dev_dbg(hdev, "%s add monitor %d status %d", hdev->name, + bt_dev_dbg(hdev, "add monitor %d status %d", monitor->handle, status); /* Message was not forwarded to controller - not an error */ break; case HCI_ADV_MONITOR_EXT_MSFT: status = msft_add_monitor_pattern(hdev, monitor); - bt_dev_dbg(hdev, "%s add monitor %d msft status %d", hdev->name, + bt_dev_dbg(hdev, "add monitor %d msft status %d", monitor->handle, status); break; } @@ -1976,15 +1976,15 @@ static int hci_remove_adv_monitor(struct hci_dev *hdev, switch (hci_get_adv_monitor_offload_ext(hdev)) { case HCI_ADV_MONITOR_EXT_NONE: /* also goes here when powered off */ - bt_dev_dbg(hdev, "%s remove monitor %d status %d", hdev->name, + bt_dev_dbg(hdev, "remove monitor %d status %d", monitor->handle, status); goto free_monitor; case HCI_ADV_MONITOR_EXT_MSFT: handle = monitor->handle; status = msft_remove_monitor(hdev, monitor); - bt_dev_dbg(hdev, "%s remove monitor %d msft status %d", - hdev->name, handle, status); + bt_dev_dbg(hdev, "remove monitor %d msft status %d", + handle, status); break; } -- cgit v1.2.3 From 112b5090c21905531314fee41f691f0317bbf4f6 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 6 Jul 2023 12:06:32 -0700 Subject: Bluetooth: MGMT: Fix always using HCI_MAX_AD_LENGTH HCI_MAX_AD_LENGTH shall only be used if the controller doesn't support extended advertising, otherwise HCI_MAX_EXT_AD_LENGTH shall be used instead. Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 4 ++++ net/bluetooth/hci_event.c | 12 +++++++----- net/bluetooth/mgmt.c | 6 +++--- 3 files changed, 14 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 105c1c394f82..8200a6689b39 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1801,6 +1801,10 @@ void hci_conn_del_sysfs(struct hci_conn *conn); /* Extended advertising support */ #define ext_adv_capable(dev) (((dev)->le_features[1] & HCI_LE_EXT_ADV)) +/* Maximum advertising length */ +#define max_adv_len(dev) \ + (ext_adv_capable(dev) ? 
HCI_MAX_EXT_AD_LENGTH : HCI_MAX_AD_LENGTH) + /* BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E page 1789: * * C24: Mandatory if the LE Controller supports Connection State and either diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index c29eece88d2c..f1fcece29e7d 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1747,7 +1747,7 @@ static void store_pending_adv_report(struct hci_dev *hdev, bdaddr_t *bdaddr, { struct discovery_state *d = &hdev->discovery; - if (len > HCI_MAX_AD_LENGTH) + if (len > max_adv_len(hdev)) return; bacpy(&d->last_adv_addr, bdaddr); @@ -6249,8 +6249,9 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr, return; } - if (!ext_adv && len > HCI_MAX_AD_LENGTH) { - bt_dev_err_ratelimited(hdev, "legacy adv larger than 31 bytes"); + if (len > max_adv_len(hdev)) { + bt_dev_err_ratelimited(hdev, + "adv larger than maximum supported"); return; } @@ -6315,7 +6316,8 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr, */ conn = check_pending_le_conn(hdev, bdaddr, bdaddr_type, bdaddr_resolved, type); - if (!ext_adv && conn && type == LE_ADV_IND && len <= HCI_MAX_AD_LENGTH) { + if (!ext_adv && conn && type == LE_ADV_IND && + len <= max_adv_len(hdev)) { /* Store report for later inclusion by * mgmt_device_connected */ @@ -6456,7 +6458,7 @@ static void hci_le_adv_report_evt(struct hci_dev *hdev, void *data, info->length + 1)) break; - if (info->length <= HCI_MAX_AD_LENGTH) { + if (info->length <= max_adv_len(hdev)) { rssi = info->data[info->length]; process_adv_report(hdev, info->type, &info->bdaddr, info->bdaddr_type, NULL, 0, rssi, diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 5707ff468842..d6c9b7bc8592 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -8428,8 +8428,8 @@ static int read_adv_features(struct sock *sk, struct hci_dev *hdev, supported_flags = get_supported_adv_flags(hdev); rp->supported_flags = cpu_to_le32(supported_flags); - rp->max_adv_data_len = HCI_MAX_AD_LENGTH; - rp->max_scan_rsp_len = HCI_MAX_AD_LENGTH; + rp->max_adv_data_len = max_adv_len(hdev); + rp->max_scan_rsp_len = max_adv_len(hdev); rp->max_instances = hdev->le_num_of_adv_sets; rp->num_instances = hdev->adv_instance_cnt; @@ -8465,7 +8465,7 @@ static u8 calculate_name_len(struct hci_dev *hdev) static u8 tlv_data_max_len(struct hci_dev *hdev, u32 adv_flags, bool is_adv_data) { - u8 max_len = HCI_MAX_AD_LENGTH; + u8 max_len = max_adv_len(hdev); if (is_adv_data) { if (adv_flags & (MGMT_ADV_FLAG_DISCOV | -- cgit v1.2.3 From 82eae9dc438cd7932b5a1c79057378839f1e61e0 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Tue, 11 Jul 2023 19:41:10 +0200 Subject: Bluetooth: hci_debugfs: Use kstrtobool() instead of strtobool() strtobool() is the same as kstrtobool(). However, the latter is more used within the kernel. In order to remove strtobool() and slightly simplify kstrtox.h, switch to the other function name. 
While at it, include the corresponding header file () Signed-off-by: Christophe JAILLET Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_debugfs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/hci_debugfs.c b/net/bluetooth/hci_debugfs.c index ec0df2f9188e..6b7741f6e95b 100644 --- a/net/bluetooth/hci_debugfs.c +++ b/net/bluetooth/hci_debugfs.c @@ -22,6 +22,7 @@ */ #include +#include #include #include @@ -1152,7 +1153,7 @@ static ssize_t force_no_mitm_write(struct file *file, return -EFAULT; buf[buf_size] = '\0'; - if (strtobool(buf, &enable)) + if (kstrtobool(buf, &enable)) return -EINVAL; if (enable == hci_dev_test_flag(hdev, HCI_FORCE_NO_MITM)) -- cgit v1.2.3 From 573ebae162111063eedc6c838a659ba628f66a0f Mon Sep 17 00:00:00 2001 From: Ying Hsu Date: Wed, 5 Jul 2023 21:06:47 +0000 Subject: Bluetooth: Fix hci_suspend_sync crash If hci_unregister_dev() frees the hci_dev object but hci_suspend_notifier may still be accessing it, it can cause the program to crash. Here's the call trace: <4>[102152.653246] Call Trace: <4>[102152.653254] hci_suspend_sync+0x109/0x301 [bluetooth] <4>[102152.653259] hci_suspend_dev+0x78/0xcd [bluetooth] <4>[102152.653263] hci_suspend_notifier+0x42/0x7a [bluetooth] <4>[102152.653268] notifier_call_chain+0x43/0x6b <4>[102152.653271] __blocking_notifier_call_chain+0x48/0x69 <4>[102152.653273] __pm_notifier_call_chain+0x22/0x39 <4>[102152.653276] pm_suspend+0x287/0x57c <4>[102152.653278] state_store+0xae/0xe5 <4>[102152.653281] kernfs_fop_write+0x109/0x173 <4>[102152.653284] __vfs_write+0x16f/0x1a2 <4>[102152.653287] ? selinux_file_permission+0xca/0x16f <4>[102152.653289] ? security_file_permission+0x36/0x109 <4>[102152.653291] vfs_write+0x114/0x21d <4>[102152.653293] __x64_sys_write+0x7b/0xdb <4>[102152.653296] do_syscall_64+0x59/0x194 <4>[102152.653299] entry_SYSCALL_64_after_hwframe+0x5c/0xc1 This patch holds the reference count of the hci_dev object while processing it in hci_suspend_notifier to avoid potential crash caused by the race condition. Signed-off-by: Ying Hsu Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_core.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 821ae737e85b..bd90caad4804 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -2436,6 +2436,9 @@ static int hci_suspend_notifier(struct notifier_block *nb, unsigned long action, if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) return NOTIFY_DONE; + /* To avoid a potential race with hci_unregister_dev. */ + hci_dev_hold(hdev); + if (action == PM_SUSPEND_PREPARE) ret = hci_suspend_dev(hdev); else if (action == PM_POST_SUSPEND) @@ -2445,6 +2448,7 @@ static int hci_suspend_notifier(struct notifier_block *nb, unsigned long action, bt_dev_err(hdev, "Suspend notifier action (%lu) failed: %d", action, ret); + hci_dev_put(hdev); return NOTIFY_DONE; } -- cgit v1.2.3 From 3f19ffb2f924db5b0925c77818d18ac1f6f08a44 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 13 Jul 2023 13:41:31 -0700 Subject: Bluetooth: af_bluetooth: Make BT_PKT_STATUS generic This makes the handling of BT_PKT_STATUS more generic so it can be reused by sockets other than SCO like BT_DEFER_SETUP, etc. 
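Illustration (not part of the patch): a minimal userspace sketch of how the now-generic packet status would be consumed. It assumes fd is an already-connected SCO (or, with the following patch, ISO) socket and that BT_PKT_STATUS, BT_SCM_PKT_STATUS and SOL_BLUETOOTH come from <bluetooth/bluetooth.h>, plus <sys/socket.h>, <stdint.h> and <string.h>; the meaning of the status bits is defined by the HCI specification.

	int one = 1;
	uint8_t buf[256], ctrl[CMSG_SPACE(sizeof(uint8_t))], pkt_status;
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
	struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1,
			      .msg_control = ctrl, .msg_controllen = sizeof(ctrl) };
	struct cmsghdr *cmsg;

	/* ask the kernel to attach the packet status to every received frame */
	setsockopt(fd, SOL_BLUETOOTH, BT_PKT_STATUS, &one, sizeof(one));

	if (recvmsg(fd, &msg, 0) >= 0) {
		for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
			if (cmsg->cmsg_level == SOL_BLUETOOTH &&
			    cmsg->cmsg_type == BT_SCM_PKT_STATUS) {
				/* controller-reported Packet_Status_Flag for this frame */
				memcpy(&pkt_status, CMSG_DATA(cmsg), sizeof(pkt_status));
			}
		}
	}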
Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/bluetooth.h | 8 +++----- include/net/bluetooth/sco.h | 2 -- net/bluetooth/af_bluetooth.c | 8 ++++++-- net/bluetooth/hci_core.c | 2 +- net/bluetooth/sco.c | 22 ++++------------------ 5 files changed, 14 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index 34998ae8ed78..aa90adc3b2a4 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -386,6 +386,7 @@ struct bt_sock { enum { BT_SK_DEFER_SETUP, BT_SK_SUSPEND, + BT_SK_PKT_STATUS }; struct bt_sock_list { @@ -432,10 +433,6 @@ struct l2cap_ctrl { struct l2cap_chan *chan; }; -struct sco_ctrl { - u8 pkt_status; -}; - struct hci_dev; typedef void (*hci_req_complete_t)(struct hci_dev *hdev, u8 status, u16 opcode); @@ -466,9 +463,9 @@ struct bt_skb_cb { u8 force_active; u16 expect; u8 incoming:1; + u8 pkt_status:2; union { struct l2cap_ctrl l2cap; - struct sco_ctrl sco; struct hci_ctrl hci; struct mgmt_ctrl mgmt; struct scm_creds creds; @@ -477,6 +474,7 @@ struct bt_skb_cb { #define bt_cb(skb) ((struct bt_skb_cb *)((skb)->cb)) #define hci_skb_pkt_type(skb) bt_cb((skb))->pkt_type +#define hci_skb_pkt_status(skb) bt_cb((skb))->pkt_status #define hci_skb_expect(skb) bt_cb((skb))->expect #define hci_skb_opcode(skb) bt_cb((skb))->hci.opcode #define hci_skb_event(skb) bt_cb((skb))->hci.req_event diff --git a/include/net/bluetooth/sco.h b/include/net/bluetooth/sco.h index 1aa2e14b6c94..f40ddb4264fc 100644 --- a/include/net/bluetooth/sco.h +++ b/include/net/bluetooth/sco.h @@ -46,6 +46,4 @@ struct sco_conninfo { __u8 dev_class[3]; }; -#define SCO_CMSG_PKT_STATUS 0x01 - #endif /* __SCO_H */ diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 647afb187147..336a76165454 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -333,8 +333,12 @@ int bt_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, bt_sk(sk)->skb_msg_name(skb, msg->msg_name, &msg->msg_namelen); - if (bt_sk(sk)->skb_put_cmsg) - bt_sk(sk)->skb_put_cmsg(skb, msg, sk); + if (test_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags)) { + u8 pkt_status = hci_skb_pkt_status(skb); + + put_cmsg(msg, SOL_BLUETOOTH, BT_SCM_PKT_STATUS, + sizeof(pkt_status), &pkt_status); + } } skb_free_datagram(sk, skb); diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index bd90caad4804..0fefa6788911 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -3895,7 +3895,7 @@ static void hci_scodata_packet(struct hci_dev *hdev, struct sk_buff *skb) if (conn) { /* Send to upper protocol */ - bt_cb(skb)->sco.pkt_status = flags & 0x03; + hci_skb_pkt_status(skb) = flags & 0x03; sco_recv_scodata(conn, skb); return; } else { diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index ec6dce488a40..50ad5935ae47 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -68,7 +68,6 @@ struct sco_pinfo { bdaddr_t dst; __u32 flags; __u16 setting; - __u8 cmsg_mask; struct bt_codec codec; struct sco_conn *conn; }; @@ -471,15 +470,6 @@ static void sco_sock_close(struct sock *sk) release_sock(sk); } -static void sco_skb_put_cmsg(struct sk_buff *skb, struct msghdr *msg, - struct sock *sk) -{ - if (sco_pi(sk)->cmsg_mask & SCO_CMSG_PKT_STATUS) - put_cmsg(msg, SOL_BLUETOOTH, BT_SCM_PKT_STATUS, - sizeof(bt_cb(skb)->sco.pkt_status), - &bt_cb(skb)->sco.pkt_status); -} - static void sco_sock_init(struct sock *sk, struct sock *parent) { BT_DBG("sk %p", sk); @@ -488,8 +478,6 
@@ static void sco_sock_init(struct sock *sk, struct sock *parent) sk->sk_type = parent->sk_type; bt_sk(sk)->flags = bt_sk(parent)->flags; security_sk_clone(parent, sk); - } else { - bt_sk(sk)->skb_put_cmsg = sco_skb_put_cmsg; } } @@ -907,9 +895,9 @@ static int sco_sock_setsockopt(struct socket *sock, int level, int optname, } if (opt) - sco_pi(sk)->cmsg_mask |= SCO_CMSG_PKT_STATUS; + set_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags); else - sco_pi(sk)->cmsg_mask &= SCO_CMSG_PKT_STATUS; + clear_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags); break; case BT_CODEC: @@ -1040,7 +1028,6 @@ static int sco_sock_getsockopt(struct socket *sock, int level, int optname, int len, err = 0; struct bt_voice voice; u32 phys; - int pkt_status; int buf_len; struct codec_list *c; u8 num_codecs, i, __user *ptr; @@ -1094,9 +1081,8 @@ static int sco_sock_getsockopt(struct socket *sock, int level, int optname, break; case BT_PKT_STATUS: - pkt_status = (sco_pi(sk)->cmsg_mask & SCO_CMSG_PKT_STATUS); - - if (put_user(pkt_status, (int __user *)optval)) + if (put_user(test_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags), + (int __user *)optval)) err = -EFAULT; break; -- cgit v1.2.3 From 0731c5ab4d510501a6a2da491ac68aea858bf834 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 13 Jul 2023 14:02:37 -0700 Subject: Bluetooth: ISO: Add support for BT_PKT_STATUS This adds support for BT_PKT_STATUS socketopt by setting BT_SK_PKT_STATUS. Then upon receiving an ISO packet the code would attempt to store the Packet_Status_Flag to hci_skb_pkt_status which is then forward to userspace in the form of BT_SCM_PKT_STATUS whenever BT_PKT_STATUS has been enabled/set. Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/iso.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'net') diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index efac284badbc..cbe3299b4a41 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -1288,6 +1288,18 @@ static int iso_sock_setsockopt(struct socket *sock, int level, int optname, clear_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags); break; + case BT_PKT_STATUS: + if (copy_from_sockptr(&opt, optval, sizeof(u32))) { + err = -EFAULT; + break; + } + + if (opt) + set_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags); + else + clear_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags); + break; + case BT_ISO_QOS: if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND && sk->sk_state != BT_CONNECT2) { @@ -1373,6 +1385,12 @@ static int iso_sock_getsockopt(struct socket *sock, int level, int optname, break; + case BT_PKT_STATUS: + if (put_user(test_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags), + (int __user *)optval)) + err = -EFAULT; + break; + case BT_ISO_QOS: qos = iso_sock_get_qos(sk); @@ -1767,6 +1785,7 @@ void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) if (len == skb->len) { /* Complete frame received */ + hci_skb_pkt_status(skb) = flags & 0x03; iso_recv_frame(conn, skb); return; } @@ -1788,6 +1807,7 @@ void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) if (!conn->rx_skb) goto drop; + hci_skb_pkt_status(conn->rx_skb) = flags & 0x03; skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len), skb->len); conn->rx_len = len - skb->len; -- cgit v1.2.3 From 90005880a68cc8908885f5c9c9e2e60deaf78700 Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Fri, 28 Jul 2023 11:30:11 +0800 Subject: Bluetooth: Remove unused declaration amp_read_loc_info() This is introduced in commit 903e45411099 but was never implemented. 
Fixes: 903e45411099 ("Bluetooth: AMP: Use HCI cmd to Read Loc AMP Assoc") Signed-off-by: Yue Haibing Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/amp.h | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/amp.h b/net/bluetooth/amp.h index 832764dfbfb3..97c87abd129f 100644 --- a/net/bluetooth/amp.h +++ b/net/bluetooth/amp.h @@ -28,7 +28,6 @@ struct hci_conn *phylink_add(struct hci_dev *hdev, struct amp_mgr *mgr, int phylink_gen_key(struct hci_conn *hcon, u8 *data, u8 *len, u8 *type); -void amp_read_loc_info(struct hci_dev *hdev, struct amp_mgr *mgr); void amp_read_loc_assoc_frag(struct hci_dev *hdev, u8 phy_handle); void amp_read_loc_assoc(struct hci_dev *hdev, struct amp_mgr *mgr); void amp_read_loc_assoc_final_data(struct hci_dev *hdev, -- cgit v1.2.3 From 69997d50ec574be816b4ee8f9cee52ebbd53f8bd Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Thu, 27 Jul 2023 00:25:26 +0300 Subject: Bluetooth: ISO: handle bound CIS cleanup via hci_conn Calling hci_conn_del in __iso_sock_close is invalid. It needs hdev->lock, but it cannot be acquired there due to lock ordering. Fix this by doing cleanup via hci_conn_drop. Return hci_conn with refcount 1 from hci_bind_cis and hci_connect_cis, so that the iso_conn always holds one reference. This also fixes refcounting when error handling. Since hci_conn_abort shall handle termination of connections in any state properly, we can handle BT_CONNECT socket state in the same way as BT_CONNECTED. Signed-off-by: Pauli Virtanen Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_conn.c | 5 +++++ net/bluetooth/iso.c | 14 +------------- 2 files changed, 6 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index cccc2b8b60a8..923bb7e7be2b 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -1909,6 +1909,8 @@ struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst, return ERR_PTR(-EINVAL); } + hci_conn_hold(cis); + cis->iso_qos = *qos; cis->state = BT_BOUND; @@ -2262,6 +2264,9 @@ struct hci_conn *hci_connect_cis(struct hci_dev *hdev, bdaddr_t *dst, return ERR_PTR(-ENOLINK); } + /* Link takes the refcount */ + hci_conn_drop(cis); + cis->state = BT_CONNECT; hci_le_create_cis_pending(hdev); diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index cbe3299b4a41..358954bfbb32 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -628,6 +628,7 @@ static void __iso_sock_close(struct sock *sk) iso_sock_cleanup_listen(sk); break; + case BT_CONNECT: case BT_CONNECTED: case BT_CONFIG: if (iso_pi(sk)->conn->hcon) { @@ -643,19 +644,6 @@ static void __iso_sock_close(struct sock *sk) break; case BT_CONNECT2: - iso_chan_del(sk, ECONNRESET); - break; - case BT_CONNECT: - /* In case of DEFER_SETUP the hcon would be bound to CIG which - * needs to be removed so just call hci_conn_del so the cleanup - * callback do what is needed. - */ - if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags) && - iso_pi(sk)->conn->hcon) { - hci_conn_del(iso_pi(sk)->conn->hcon); - iso_pi(sk)->conn->hcon = NULL; - } - iso_chan_del(sk, ECONNRESET); break; case BT_DISCONN: -- cgit v1.2.3 From 2889bdd0a9a195533c2103e7b39ab0de844d72f6 Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Thu, 27 Jul 2023 00:25:25 +0300 Subject: Bluetooth: hci_sync: delete CIS in BT_OPEN/CONNECT/BOUND when aborting Dropped CIS that are in state BT_OPEN/BT_BOUND, and in state BT_CONNECT with HCI_CONN_CREATE_CIS unset, should be cleaned up immediately. 
Closing CIS ISO sockets should result in the hci_conn being deleted, so that potentially pending CIG removal can run. hci_abort_conn cannot refer to them by handle, since their handle is still unset if Set CIG Parameters has not yet completed. This fixes CIS not being terminated if the socket is shut down immediately after connecting, such that hci_abort_conn runs before Set CIG Parameters completes. See new BlueZ test "ISO Connect Close - Success" Signed-off-by: Pauli Virtanen Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_sync.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 3348a1b0e3f7..e114409628d1 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -5308,6 +5308,10 @@ static int hci_connect_cancel_sync(struct hci_dev *hdev, struct hci_conn *conn, if (test_bit(HCI_CONN_CREATE_CIS, &conn->flags)) return hci_disconnect_sync(hdev, conn, reason); + /* CIS with no Create CIS sent have nothing to cancel */ + if (bacmp(&conn->dst, BDADDR_ANY)) + return HCI_ERROR_LOCAL_HOST_TERM; + /* There is no way to cancel a BIS without terminating the BIG * which is done later on connection cleanup. */ @@ -5396,13 +5400,11 @@ int hci_abort_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason) case BT_CONNECT2: return hci_reject_conn_sync(hdev, conn, reason); case BT_OPEN: - /* Cleanup bises that failed to be established */ - if (test_and_clear_bit(HCI_CONN_BIG_SYNC_FAILED, &conn->flags)) { - hci_dev_lock(hdev); - hci_conn_failed(conn, reason); - hci_dev_unlock(hdev); - } - break; + case BT_BOUND: + hci_dev_lock(hdev); + hci_conn_failed(conn, reason); + hci_dev_unlock(hdev); + return 0; default: conn->state = BT_CLOSED; break; -- cgit v1.2.3 From 094e3639623ee3b8a043e2b5285498b036a4dc09 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 2 Aug 2023 21:08:53 -0700 Subject: Bluetooth: hci_sync: Fix handling of HCI_OP_CREATE_CONN_CANCEL When sending HCI_OP_CREATE_CONN_CANCEL it shall wait for HCI_EV_CONN_COMPLETE, not HCI_EV_CMD_STATUS, when the reason is anything but HCI_ERROR_REMOTE_POWER_OFF. This reason is used when suspending or powering off, where we don't want to wait for the peer's response. Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_sync.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'net') diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index e114409628d1..a9b048d7b419 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -5321,6 +5321,17 @@ static int hci_connect_cancel_sync(struct hci_dev *hdev, struct hci_conn *conn, if (hdev->hci_ver < BLUETOOTH_VER_1_2) return 0; + /* Wait for HCI_EV_CONN_COMPLETE, not HCI_EV_CMD_STATUS, when the + * reason is anything but HCI_ERROR_REMOTE_POWER_OFF. This reason is + * used when suspending or powering off, where we don't want to wait + * for the peer's response.
+ */ + if (reason != HCI_ERROR_REMOTE_POWER_OFF) + return __hci_cmd_sync_status_sk(hdev, HCI_OP_CREATE_CONN_CANCEL, + 6, &conn->dst, + HCI_EV_CONN_COMPLETE, + HCI_CMD_TIMEOUT, NULL); + return __hci_cmd_sync_status(hdev, HCI_OP_CREATE_CONN_CANCEL, 6, &conn->dst, HCI_CMD_TIMEOUT); } -- cgit v1.2.3 From 5af1f84ed13a416297ab9ced7537f4d5ae7f329a Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 3 Aug 2023 11:04:51 -0700 Subject: Bluetooth: hci_sync: Fix UAF on hci_abort_conn_sync Connections may be cleanup while waiting for the commands to complete so this attempts to check if the connection handle remains valid in case of errors that would lead to call hci_conn_failed: BUG: KASAN: slab-use-after-free in hci_conn_failed+0x1f/0x160 Read of size 8 at addr ffff888001376958 by task kworker/u3:0/52 CPU: 0 PID: 52 Comm: kworker/u3:0 Not tainted 6.5.0-rc1-00527-g2dfe76d58d3a #5615 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-1.fc38 04/01/2014 Workqueue: hci0 hci_cmd_sync_work Call Trace: dump_stack_lvl+0x1d/0x70 print_report+0xce/0x620 ? __virt_addr_valid+0xd4/0x150 ? hci_conn_failed+0x1f/0x160 kasan_report+0xd1/0x100 ? hci_conn_failed+0x1f/0x160 hci_conn_failed+0x1f/0x160 hci_abort_conn_sync+0x237/0x360 Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_sync.c | 45 +++++++++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index a9b048d7b419..ec8929e79502 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -5389,27 +5389,20 @@ static int hci_reject_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, int hci_abort_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason) { - int err; + int err = 0; + u16 handle = conn->handle; switch (conn->state) { case BT_CONNECTED: case BT_CONFIG: - return hci_disconnect_sync(hdev, conn, reason); + err = hci_disconnect_sync(hdev, conn, reason); + break; case BT_CONNECT: err = hci_connect_cancel_sync(hdev, conn, reason); - /* Cleanup hci_conn object if it cannot be cancelled as it - * likelly means the controller and host stack are out of sync - * or in case of LE it was still scanning so it can be cleanup - * safely. - */ - if (err) { - hci_dev_lock(hdev); - hci_conn_failed(conn, err); - hci_dev_unlock(hdev); - } - return err; + break; case BT_CONNECT2: - return hci_reject_conn_sync(hdev, conn, reason); + err = hci_reject_conn_sync(hdev, conn, reason); + break; case BT_OPEN: case BT_BOUND: hci_dev_lock(hdev); @@ -5418,10 +5411,30 @@ int hci_abort_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason) return 0; default: conn->state = BT_CLOSED; - break; + return 0; } - return 0; + /* Cleanup hci_conn object if it cannot be cancelled as it + * likelly means the controller and host stack are out of sync + * or in case of LE it was still scanning so it can be cleanup + * safely. + */ + if (err) { + struct hci_conn *c; + + /* Check if the connection hasn't been cleanup while waiting + * commands to complete. 
+ */ + c = hci_conn_hash_lookup_handle(hdev, handle); + if (!c || c != conn) + return 0; + + hci_dev_lock(hdev); + hci_conn_failed(conn, err); + hci_dev_unlock(hdev); + } + + return err; } static int hci_disconnect_all_sync(struct hci_dev *hdev, u8 reason) -- cgit v1.2.3 From b7f923b1ef6a2e76013089d30c9552257056360a Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 3 Aug 2023 14:41:46 -0700 Subject: Bluetooth: ISO: Fix not checking for valid CIG/CIS IDs Valid range of CIG/CIS are 0x00 to 0xEF, so this checks they are properly checked before attempting to use HCI_OP_LE_SET_CIG_PARAMS. Fixes: ccf74f2390d6 ("Bluetooth: Add BTPROTO_ISO socket type") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/iso.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index 358954bfbb32..6b66d6a88b9a 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -1187,6 +1187,12 @@ static bool check_io_qos(struct bt_iso_io_qos *qos) static bool check_ucast_qos(struct bt_iso_qos *qos) { + if (qos->ucast.cig > 0xef && qos->ucast.cig != BT_ISO_QOS_CIG_UNSET) + return false; + + if (qos->ucast.cis > 0xef && qos->ucast.cis != BT_ISO_QOS_CIS_UNSET) + return false; + if (qos->ucast.sca > 0x07) return false; -- cgit v1.2.3 From 16e3b6429159795a87add7584eb100b19aa1d70b Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 3 Aug 2023 14:49:14 -0700 Subject: Bluetooth: hci_conn: Fix modifying handle while aborting This introduces hci_conn_set_handle which takes care of verifying the conditions where the hci_conn handle can be modified, including when hci_conn_abort has been called and also checks that the handles is valid as well. Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 1 + net/bluetooth/hci_conn.c | 27 +++++++++++++++++++++++++++ net/bluetooth/hci_event.c | 29 +++++++++++------------------ 3 files changed, 39 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 8200a6689b39..d2a3a2a9fd7d 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1425,6 +1425,7 @@ int hci_conn_switch_role(struct hci_conn *conn, __u8 role); void hci_conn_enter_active_mode(struct hci_conn *conn, __u8 force_active); void hci_conn_failed(struct hci_conn *conn, u8 status); +u8 hci_conn_set_handle(struct hci_conn *conn, u16 handle); /* * hci_conn_get() and hci_conn_put() are used to control the life-time of an diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 923bb7e7be2b..6132eed7e8dc 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -1231,6 +1231,33 @@ void hci_conn_failed(struct hci_conn *conn, u8 status) hci_conn_del(conn); } +/* This function requires the caller holds hdev->lock */ +u8 hci_conn_set_handle(struct hci_conn *conn, u16 handle) +{ + struct hci_dev *hdev = conn->hdev; + + bt_dev_dbg(hdev, "hcon %p handle 0x%4.4x", conn, handle); + + if (conn->handle == handle) + return 0; + + if (handle > HCI_CONN_HANDLE_MAX) { + bt_dev_err(hdev, "Invalid handle: 0x%4.4x > 0x%4.4x", + handle, HCI_CONN_HANDLE_MAX); + return HCI_ERROR_INVALID_PARAMETERS; + } + + /* If abort_reason has been sent it means the connection is being + * aborted and the handle shall not be changed. 
+ */ + if (conn->abort_reason) + return conn->abort_reason; + + conn->handle = handle; + + return 0; +} + static void create_le_conn_complete(struct hci_dev *hdev, void *data, int err) { struct hci_conn *conn; diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index f1fcece29e7d..218da9b0fe8f 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -3179,13 +3179,9 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, void *data, } if (!status) { - conn->handle = __le16_to_cpu(ev->handle); - if (conn->handle > HCI_CONN_HANDLE_MAX) { - bt_dev_err(hdev, "Invalid handle: 0x%4.4x > 0x%4.4x", - conn->handle, HCI_CONN_HANDLE_MAX); - status = HCI_ERROR_INVALID_PARAMETERS; + status = hci_conn_set_handle(conn, __le16_to_cpu(ev->handle)); + if (status) goto done; - } if (conn->type == ACL_LINK) { conn->state = BT_CONFIG; @@ -3849,11 +3845,9 @@ static u8 hci_cc_le_set_cig_params(struct hci_dev *hdev, void *data, if (conn->state != BT_BOUND && conn->state != BT_CONNECT) continue; - conn->handle = __le16_to_cpu(rp->handle[i]); + if (hci_conn_set_handle(conn, __le16_to_cpu(rp->handle[i]))) + continue; - bt_dev_dbg(hdev, "%p handle 0x%4.4x parent %p", conn, - conn->handle, conn->parent); - if (conn->state == BT_CONNECT) pending = true; } @@ -5039,11 +5033,8 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev, void *data, switch (status) { case 0x00: - conn->handle = __le16_to_cpu(ev->handle); - if (conn->handle > HCI_CONN_HANDLE_MAX) { - bt_dev_err(hdev, "Invalid handle: 0x%4.4x > 0x%4.4x", - conn->handle, HCI_CONN_HANDLE_MAX); - status = HCI_ERROR_INVALID_PARAMETERS; + status = hci_conn_set_handle(conn, __le16_to_cpu(ev->handle)); + if (status) { conn->state = BT_CLOSED; break; } @@ -6978,7 +6969,7 @@ static void hci_le_create_big_complete_evt(struct hci_dev *hdev, void *data, { struct hci_evt_le_create_big_complete *ev = data; struct hci_conn *conn; - __u8 bis_idx = 0; + __u8 i = 0; BT_DBG("%s status 0x%2.2x", hdev->name, ev->status); @@ -6996,7 +6987,9 @@ static void hci_le_create_big_complete_evt(struct hci_dev *hdev, void *data, conn->iso_qos.bcast.big != ev->handle) continue; - conn->handle = __le16_to_cpu(ev->bis_handle[bis_idx++]); + if (hci_conn_set_handle(conn, + __le16_to_cpu(ev->bis_handle[i++]))) + continue; if (!ev->status) { conn->state = BT_CONNECTED; @@ -7015,7 +7008,7 @@ static void hci_le_create_big_complete_evt(struct hci_dev *hdev, void *data, rcu_read_lock(); } - if (!ev->status && !bis_idx) + if (!ev->status && !i) /* If no BISes have been connected for the BIG, * terminate. This is in case all bound connections * have been closed before the BIG creation -- cgit v1.2.3 From f2f84a70f9d0c9a3263194ca9d82e7bc6027d356 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 3 Aug 2023 16:41:34 -0700 Subject: Bluetooth: hci_conn: Fix not allowing valid CIS ID Only the number of CIS shall be limited to 0x1f, the CIS ID in the other hand is up to 0xef. 
Fixes: 26afbd826ee3 ("Bluetooth: Add initial implementation of CIS connections") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_conn.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 6132eed7e8dc..71e5a4c2e523 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -1846,9 +1846,12 @@ static bool hci_le_set_cig_params(struct hci_conn *conn, struct bt_iso_qos *qos) cis_add(&data, qos); } - /* Reprogram all CIS(s) with the same CIG */ - for (data.cig = qos->ucast.cig, data.cis = 0x00; data.cis < 0x11; - data.cis++) { + /* Reprogram all CIS(s) with the same CIG, valid range are: + * num_cis: 0x00 to 0x1F + * cis_id: 0x00 to 0xEF + */ + for (data.cig = qos->ucast.cig, data.cis = 0x00; data.cis < 0xf0 && + data.pdu.cp.num_cis < ARRAY_SIZE(data.pdu.cis); data.cis++) { data.count = 0; hci_conn_hash_list_state(hdev, cis_list, ISO_LINK, BT_BOUND, -- cgit v1.2.3 From f88670161eb205f842989df555d0dd2f9fe2d4b5 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 4 Aug 2023 11:03:43 -0700 Subject: Bluetooth: hci_core: Make hci_is_le_conn_scanning public This moves hci_is_le_conn_scanning to hci_core.h so it can be used by different files without having to duplicate its code. Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 21 +++++++++++++++++++++ net/bluetooth/hci_request.c | 21 --------------------- net/bluetooth/hci_sync.c | 21 --------------------- 3 files changed, 21 insertions(+), 42 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index d2a3a2a9fd7d..f4462c325e2a 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1372,6 +1372,27 @@ static inline struct hci_conn *hci_lookup_le_connect(struct hci_dev *hdev) return NULL; } +/* Returns true if an le connection is in the scanning state */ +static inline bool hci_is_le_conn_scanning(struct hci_dev *hdev) +{ + struct hci_conn_hash *h = &hdev->conn_hash; + struct hci_conn *c; + + rcu_read_lock(); + + list_for_each_entry_rcu(c, &h->list, list) { + if (c->type == LE_LINK && c->state == BT_CONNECT && + test_bit(HCI_CONN_SCANNING, &c->flags)) { + rcu_read_unlock(); + return true; + } + } + + rcu_read_unlock(); + + return false; +} + int hci_disconnect(struct hci_conn *conn, __u8 reason); bool hci_setup_sync(struct hci_conn *conn, __u16 handle); void hci_sco_setup(struct hci_conn *conn, __u8 status); diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index f7e006a36382..6e023b0104b0 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -629,27 +629,6 @@ static void hci_req_start_scan(struct hci_request *req, u8 type, u16 interval, } } -/* Returns true if an le connection is in the scanning state */ -static inline bool hci_is_le_conn_scanning(struct hci_dev *hdev) -{ - struct hci_conn_hash *h = &hdev->conn_hash; - struct hci_conn *c; - - rcu_read_lock(); - - list_for_each_entry_rcu(c, &h->list, list) { - if (c->type == LE_LINK && c->state == BT_CONNECT && - test_bit(HCI_CONN_SCANNING, &c->flags)) { - rcu_read_unlock(); - return true; - } - } - - rcu_read_unlock(); - - return false; -} - static void set_random_addr(struct hci_request *req, bdaddr_t *rpa); static int hci_update_random_address(struct hci_request *req, bool require_privacy, bool use_rpa, diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index ec8929e79502..41a8e57d8267 
100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -2672,27 +2672,6 @@ done: return filter_policy; } -/* Returns true if an le connection is in the scanning state */ -static inline bool hci_is_le_conn_scanning(struct hci_dev *hdev) -{ - struct hci_conn_hash *h = &hdev->conn_hash; - struct hci_conn *c; - - rcu_read_lock(); - - list_for_each_entry_rcu(c, &h->list, list) { - if (c->type == LE_LINK && c->state == BT_CONNECT && - test_bit(HCI_CONN_SCANNING, &c->flags)) { - rcu_read_unlock(); - return true; - } - } - - rcu_read_unlock(); - - return false; -} - static int hci_le_set_ext_scan_param_sync(struct hci_dev *hdev, u8 type, u16 interval, u16 window, u8 own_addr_type, u8 filter_policy) -- cgit v1.2.3 From a091289218202bc09d9b9caa8afcde1018584aec Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 4 Aug 2023 14:54:09 -0700 Subject: Bluetooth: hci_conn: Fix hci_le_set_cig_params When running with concurrent task only one CIS was being assigned so this attempts to rework the way the PDU is constructed so it is handled later at the callback instead of in place. Fixes: 26afbd826ee3 ("Bluetooth: Add initial implementation of CIS connections") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_conn.c | 157 +++++++++++++++++++---------------------------- 1 file changed, 63 insertions(+), 94 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 71e5a4c2e523..ae206eb551f7 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -733,7 +733,6 @@ struct iso_list_data { u16 sync_handle; }; int count; - struct iso_cig_params pdu; bool big_term; bool big_sync_term; }; @@ -1703,42 +1702,6 @@ struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst, return sco; } -static void cis_add(struct iso_list_data *d, struct bt_iso_qos *qos) -{ - struct hci_cis_params *cis = &d->pdu.cis[d->pdu.cp.num_cis]; - - cis->cis_id = qos->ucast.cis; - cis->c_sdu = cpu_to_le16(qos->ucast.out.sdu); - cis->p_sdu = cpu_to_le16(qos->ucast.in.sdu); - cis->c_phy = qos->ucast.out.phy ? qos->ucast.out.phy : qos->ucast.in.phy; - cis->p_phy = qos->ucast.in.phy ? 
qos->ucast.in.phy : qos->ucast.out.phy; - cis->c_rtn = qos->ucast.out.rtn; - cis->p_rtn = qos->ucast.in.rtn; - - d->pdu.cp.num_cis++; -} - -static void cis_list(struct hci_conn *conn, void *data) -{ - struct iso_list_data *d = data; - - /* Skip if broadcast/ANY address */ - if (!bacmp(&conn->dst, BDADDR_ANY)) - return; - - if (d->cig != conn->iso_qos.ucast.cig || d->cis == BT_ISO_QOS_CIS_UNSET || - d->cis != conn->iso_qos.ucast.cis) - return; - - d->count++; - - if (d->pdu.cp.cig_id == BT_ISO_QOS_CIG_UNSET || - d->count >= ARRAY_SIZE(d->pdu.cis)) - return; - - cis_add(d, &conn->iso_qos); -} - static int hci_le_create_big(struct hci_conn *conn, struct bt_iso_qos *qos) { struct hci_dev *hdev = conn->hdev; @@ -1771,25 +1734,62 @@ static int hci_le_create_big(struct hci_conn *conn, struct bt_iso_qos *qos) return hci_send_cmd(hdev, HCI_OP_LE_CREATE_BIG, sizeof(cp), &cp); } -static void set_cig_params_complete(struct hci_dev *hdev, void *data, int err) +static int set_cig_params_sync(struct hci_dev *hdev, void *data) { - struct iso_cig_params *pdu = data; + u8 cig_id = PTR_ERR(data); + struct hci_conn *conn; + struct bt_iso_qos *qos; + struct iso_cig_params pdu; + u8 cis_id; - bt_dev_dbg(hdev, ""); + conn = hci_conn_hash_lookup_cig(hdev, cig_id); + if (!conn) + return 0; - if (err) - bt_dev_err(hdev, "Unable to set CIG parameters: %d", err); + memset(&pdu, 0, sizeof(pdu)); - kfree(pdu); -} + qos = &conn->iso_qos; + pdu.cp.cig_id = cig_id; + hci_cpu_to_le24(qos->ucast.out.interval, pdu.cp.c_interval); + hci_cpu_to_le24(qos->ucast.in.interval, pdu.cp.p_interval); + pdu.cp.sca = qos->ucast.sca; + pdu.cp.packing = qos->ucast.packing; + pdu.cp.framing = qos->ucast.framing; + pdu.cp.c_latency = cpu_to_le16(qos->ucast.out.latency); + pdu.cp.p_latency = cpu_to_le16(qos->ucast.in.latency); -static int set_cig_params_sync(struct hci_dev *hdev, void *data) -{ - struct iso_cig_params *pdu = data; - u32 plen; + /* Reprogram all CIS(s) with the same CIG, valid range are: + * num_cis: 0x00 to 0x1F + * cis_id: 0x00 to 0xEF + */ + for (cis_id = 0x00; cis_id < 0xf0 && + pdu.cp.num_cis < ARRAY_SIZE(pdu.cis); cis_id++) { + struct hci_cis_params *cis; + + conn = hci_conn_hash_lookup_cis(hdev, NULL, 0, cig_id, cis_id); + if (!conn) + continue; - plen = sizeof(pdu->cp) + pdu->cp.num_cis * sizeof(pdu->cis[0]); - return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_CIG_PARAMS, plen, pdu, + qos = &conn->iso_qos; + + cis = &pdu.cis[pdu.cp.num_cis++]; + cis->cis_id = cis_id; + cis->c_sdu = cpu_to_le16(conn->iso_qos.ucast.out.sdu); + cis->p_sdu = cpu_to_le16(conn->iso_qos.ucast.in.sdu); + cis->c_phy = qos->ucast.out.phy ? qos->ucast.out.phy : + qos->ucast.in.phy; + cis->p_phy = qos->ucast.in.phy ? 
qos->ucast.in.phy : + qos->ucast.out.phy; + cis->c_rtn = qos->ucast.out.rtn; + cis->p_rtn = qos->ucast.in.rtn; + } + + if (!pdu.cp.num_cis) + return 0; + + return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_CIG_PARAMS, + sizeof(pdu.cp) + + pdu.cp.num_cis * sizeof(pdu.cis[0]), &pdu, HCI_CMD_TIMEOUT); } @@ -1797,7 +1797,6 @@ static bool hci_le_set_cig_params(struct hci_conn *conn, struct bt_iso_qos *qos) { struct hci_dev *hdev = conn->hdev; struct iso_list_data data; - struct iso_cig_params *pdu; memset(&data, 0, sizeof(data)); @@ -1824,61 +1823,31 @@ static bool hci_le_set_cig_params(struct hci_conn *conn, struct bt_iso_qos *qos) qos->ucast.cig = data.cig; } - data.pdu.cp.cig_id = qos->ucast.cig; - hci_cpu_to_le24(qos->ucast.out.interval, data.pdu.cp.c_interval); - hci_cpu_to_le24(qos->ucast.in.interval, data.pdu.cp.p_interval); - data.pdu.cp.sca = qos->ucast.sca; - data.pdu.cp.packing = qos->ucast.packing; - data.pdu.cp.framing = qos->ucast.framing; - data.pdu.cp.c_latency = cpu_to_le16(qos->ucast.out.latency); - data.pdu.cp.p_latency = cpu_to_le16(qos->ucast.in.latency); - if (qos->ucast.cis != BT_ISO_QOS_CIS_UNSET) { - data.count = 0; - data.cig = qos->ucast.cig; - data.cis = qos->ucast.cis; - - hci_conn_hash_list_state(hdev, cis_list, ISO_LINK, BT_BOUND, - &data); - if (data.count) + if (hci_conn_hash_lookup_cis(hdev, NULL, 0, qos->ucast.cig, + qos->ucast.cis)) return false; - - cis_add(&data, qos); + goto done; } - /* Reprogram all CIS(s) with the same CIG, valid range are: - * num_cis: 0x00 to 0x1F - * cis_id: 0x00 to 0xEF - */ - for (data.cig = qos->ucast.cig, data.cis = 0x00; data.cis < 0xf0 && - data.pdu.cp.num_cis < ARRAY_SIZE(data.pdu.cis); data.cis++) { - data.count = 0; - - hci_conn_hash_list_state(hdev, cis_list, ISO_LINK, BT_BOUND, - &data); - if (data.count) - continue; - - /* Allocate a CIS if not set */ - if (qos->ucast.cis == BT_ISO_QOS_CIS_UNSET) { + /* Allocate first available CIS if not set */ + for (data.cig = qos->ucast.cig, data.cis = 0x00; data.cis < 0xf0; + data.cis++) { + if (!hci_conn_hash_lookup_cis(hdev, NULL, 0, data.cig, + data.cis)) { /* Update CIS */ qos->ucast.cis = data.cis; - cis_add(&data, qos); + break; } } - if (qos->ucast.cis == BT_ISO_QOS_CIS_UNSET || !data.pdu.cp.num_cis) + if (qos->ucast.cis == BT_ISO_QOS_CIS_UNSET) return false; - pdu = kmemdup(&data.pdu, sizeof(*pdu), GFP_KERNEL); - if (!pdu) - return false; - - if (hci_cmd_sync_queue(hdev, set_cig_params_sync, pdu, - set_cig_params_complete) < 0) { - kfree(pdu); +done: + if (hci_cmd_sync_queue(hdev, set_cig_params_sync, + ERR_PTR(qos->ucast.cig), NULL) < 0) return false; - } return true; } -- cgit v1.2.3 From a1f6c3aef13c9e7f8d459bd464e9e34da1342c0c Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 4 Aug 2023 16:23:41 -0700 Subject: Bluetooth: hci_sync: Introduce PTR_UINT/UINT_PTR macros This introduces PTR_UINT/UINT_PTR macros and replace the use of PTR_ERR/ERR_PTR. 
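Illustration (not part of the patch): the new helpers simply round-trip an unsigned integer through the opaque void * argument of a queued callback, instead of pressing ERR_PTR()/PTR_ERR(), which exist for encoding negative errno values, into that role.

	void *data = UINT_PTR(conn->handle);	/* encode: integer -> void * */
	u16 handle = PTR_UINT(data);		/* decode: void * -> integer (== conn->handle) */
	/* no error-pointer semantics are implied, so readers no longer have to
	 * wonder whether the value might actually be a real ERR_PTR() */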
Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_sync.h | 3 +++ net/bluetooth/hci_conn.c | 19 ++++++++++--------- net/bluetooth/hci_sync.c | 4 ++-- 3 files changed, 15 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci_sync.h b/include/net/bluetooth/hci_sync.h index b516a0f4a55b..57eeb07aeb25 100644 --- a/include/net/bluetooth/hci_sync.h +++ b/include/net/bluetooth/hci_sync.h @@ -5,6 +5,9 @@ * Copyright (C) 2021 Intel Corporation */ +#define UINT_PTR(_handle) ((void *)((uintptr_t)_handle)) +#define PTR_UINT(_ptr) ((uintptr_t)((void *)_ptr)) + typedef int (*hci_cmd_sync_work_func_t)(struct hci_dev *hdev, void *data); typedef void (*hci_cmd_sync_work_destroy_t)(struct hci_dev *hdev, void *data, int err); diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index ae206eb551f7..4ad6af4e3145 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -872,7 +872,7 @@ static void bis_cleanup(struct hci_conn *conn) static int remove_cig_sync(struct hci_dev *hdev, void *data) { - u8 handle = PTR_ERR(data); + u8 handle = PTR_UINT(data); return hci_le_remove_cig_sync(hdev, handle); } @@ -881,7 +881,8 @@ static int hci_le_remove_cig(struct hci_dev *hdev, u8 handle) { bt_dev_dbg(hdev, "handle 0x%2.2x", handle); - return hci_cmd_sync_queue(hdev, remove_cig_sync, ERR_PTR(handle), NULL); + return hci_cmd_sync_queue(hdev, remove_cig_sync, UINT_PTR(handle), + NULL); } static void find_cis(struct hci_conn *conn, void *data) @@ -1260,7 +1261,7 @@ u8 hci_conn_set_handle(struct hci_conn *conn, u16 handle) static void create_le_conn_complete(struct hci_dev *hdev, void *data, int err) { struct hci_conn *conn; - u16 handle = PTR_ERR(data); + u16 handle = PTR_UINT(data); conn = hci_conn_hash_lookup_handle(hdev, handle); if (!conn) @@ -1290,7 +1291,7 @@ done: static int hci_connect_le_sync(struct hci_dev *hdev, void *data) { struct hci_conn *conn; - u16 handle = PTR_ERR(data); + u16 handle = PTR_UINT(data); conn = hci_conn_hash_lookup_handle(hdev, handle); if (!conn) @@ -1372,7 +1373,7 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, clear_bit(HCI_CONN_SCANNING, &conn->flags); err = hci_cmd_sync_queue(hdev, hci_connect_le_sync, - ERR_PTR(conn->handle), + UINT_PTR(conn->handle), create_le_conn_complete); if (err) { hci_conn_del(conn); @@ -1736,7 +1737,7 @@ static int hci_le_create_big(struct hci_conn *conn, struct bt_iso_qos *qos) static int set_cig_params_sync(struct hci_dev *hdev, void *data) { - u8 cig_id = PTR_ERR(data); + u8 cig_id = PTR_UINT(data); struct hci_conn *conn; struct bt_iso_qos *qos; struct iso_cig_params pdu; @@ -1846,7 +1847,7 @@ static bool hci_le_set_cig_params(struct hci_conn *conn, struct bt_iso_qos *qos) done: if (hci_cmd_sync_queue(hdev, set_cig_params_sync, - ERR_PTR(qos->ucast.cig), NULL) < 0) + UINT_PTR(qos->ucast.cig), NULL) < 0) return false; return true; @@ -2858,7 +2859,7 @@ u32 hci_conn_get_phy(struct hci_conn *conn) static int abort_conn_sync(struct hci_dev *hdev, void *data) { struct hci_conn *conn; - u16 handle = PTR_ERR(data); + u16 handle = PTR_UINT(data); conn = hci_conn_hash_lookup_handle(hdev, handle); if (!conn) @@ -2898,6 +2899,6 @@ int hci_abort_conn(struct hci_conn *conn, u8 reason) } } - return hci_cmd_sync_queue(hdev, abort_conn_sync, ERR_PTR(conn->handle), + return hci_cmd_sync_queue(hdev, abort_conn_sync, UINT_PTR(conn->handle), NULL); } diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 41a8e57d8267..5eb30ba21370 100644 --- 
a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -6528,7 +6528,7 @@ int hci_get_random_address(struct hci_dev *hdev, bool require_privacy, static int _update_adv_data_sync(struct hci_dev *hdev, void *data) { - u8 instance = PTR_ERR(data); + u8 instance = PTR_UINT(data); return hci_update_adv_data_sync(hdev, instance); } @@ -6536,5 +6536,5 @@ static int _update_adv_data_sync(struct hci_dev *hdev, void *data) int hci_update_adv_data(struct hci_dev *hdev, u8 instance) { return hci_cmd_sync_queue(hdev, _update_adv_data_sync, - ERR_PTR(instance), NULL); + UINT_PTR(instance), NULL); } -- cgit v1.2.3 From 3673952cf0c6cf81b06c66a0b788abeeb02ff3ae Mon Sep 17 00:00:00 2001 From: Min Li Date: Mon, 7 Aug 2023 19:07:41 +0800 Subject: Bluetooth: Fix potential use-after-free when clear keys Similar to commit c5d2b6fa26b5 ("Bluetooth: Fix use-after-free in hci_remove_ltk/hci_remove_irk"). We can not access k after kfree_rcu() call. Fixes: d7d41682efc2 ("Bluetooth: Fix Suspicious RCU usage warnings") Signed-off-by: Min Li Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_core.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 0fefa6788911..d7552b394ac2 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -1074,9 +1074,9 @@ void hci_uuids_clear(struct hci_dev *hdev) void hci_link_keys_clear(struct hci_dev *hdev) { - struct link_key *key; + struct link_key *key, *tmp; - list_for_each_entry(key, &hdev->link_keys, list) { + list_for_each_entry_safe(key, tmp, &hdev->link_keys, list) { list_del_rcu(&key->list); kfree_rcu(key, rcu); } @@ -1084,9 +1084,9 @@ void hci_link_keys_clear(struct hci_dev *hdev) void hci_smp_ltks_clear(struct hci_dev *hdev) { - struct smp_ltk *k; + struct smp_ltk *k, *tmp; - list_for_each_entry(k, &hdev->long_term_keys, list) { + list_for_each_entry_safe(k, tmp, &hdev->long_term_keys, list) { list_del_rcu(&k->list); kfree_rcu(k, rcu); } @@ -1094,9 +1094,9 @@ void hci_smp_ltks_clear(struct hci_dev *hdev) void hci_smp_irks_clear(struct hci_dev *hdev) { - struct smp_irk *k; + struct smp_irk *k, *tmp; - list_for_each_entry(k, &hdev->identity_resolving_keys, list) { + list_for_each_entry_safe(k, tmp, &hdev->identity_resolving_keys, list) { list_del_rcu(&k->list); kfree_rcu(k, rcu); } @@ -1104,9 +1104,9 @@ void hci_smp_irks_clear(struct hci_dev *hdev) void hci_blocked_keys_clear(struct hci_dev *hdev) { - struct blocked_key *b; + struct blocked_key *b, *tmp; - list_for_each_entry(b, &hdev->blocked_keys, list) { + list_for_each_entry_safe(b, tmp, &hdev->blocked_keys, list) { list_del_rcu(&b->list); kfree_rcu(b, rcu); } -- cgit v1.2.3 From a2bcd2b63271a93a695fabbfbf459c603d956d48 Mon Sep 17 00:00:00 2001 From: Manish Mandlik Date: Fri, 4 Aug 2023 11:14:45 -0700 Subject: Bluetooth: hci_sync: Avoid use-after-free in dbg for hci_add_adv_monitor() KSAN reports use-after-free in hci_add_adv_monitor(). While adding an adv monitor, hci_add_adv_monitor() calls -> msft_add_monitor_pattern() calls -> msft_add_monitor_sync() calls -> msft_le_monitor_advertisement_cb() calls in an error case -> hci_free_adv_monitor() which frees the *moniter. This is referenced by bt_dev_dbg() in hci_add_adv_monitor(). Fix the bt_dev_dbg() by using handle instead of monitor->handle. 
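Illustration (not part of the patch, simplified from the diff that follows): copy the value that is still needed before calling into code that may free the object, and log only the copy.

	handle = monitor->handle;			/* taken while 'monitor' is still valid */
	status = msft_add_monitor_pattern(hdev, monitor);	/* may free 'monitor' on error */
	bt_dev_dbg(hdev, "add monitor %d msft status %d", handle, status);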
Fixes: b747a83690c8 ("Bluetooth: hci_sync: Refactor add Adv Monitor") Signed-off-by: Manish Mandlik Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index d7552b394ac2..a5992f1b3c9b 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -1957,7 +1957,7 @@ int hci_add_adv_monitor(struct hci_dev *hdev, struct adv_monitor *monitor) case HCI_ADV_MONITOR_EXT_MSFT: status = msft_add_monitor_pattern(hdev, monitor); bt_dev_dbg(hdev, "add monitor %d msft status %d", - monitor->handle, status); + handle, status); break; } -- cgit v1.2.3 From 3cd43dd15f9dbfb67a60889992bf03b92370b202 Mon Sep 17 00:00:00 2001 From: Ziyang Xuan Date: Mon, 7 Aug 2023 10:32:06 +0800 Subject: Bluetooth: Remove unnecessary NULL check before vfree() Remove unnecessary NULL check which causes coccinelle warning: net/bluetooth/coredump.c:104:2-7: WARNING: NULL check before some freeing functions is not needed. Signed-off-by: Ziyang Xuan Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/coredump.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/coredump.c b/net/bluetooth/coredump.c index d2d2624ec708..ec97a4bab1c9 100644 --- a/net/bluetooth/coredump.c +++ b/net/bluetooth/coredump.c @@ -100,8 +100,7 @@ void hci_devcd_reset(struct hci_dev *hdev) /* Call with hci_dev_lock only. */ static void hci_devcd_free(struct hci_dev *hdev) { - if (hdev->dump.head) - vfree(hdev->dump.head); + vfree(hdev->dump.head); hci_devcd_reset(hdev); } -- cgit v1.2.3 From 66dee21524d9ac6461ec3052652b7bc0603ee0c5 Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Sat, 5 Aug 2023 19:08:41 +0300 Subject: Bluetooth: hci_event: drop only unbound CIS if Set CIG Parameters fails When user tries to connect a new CIS when its CIG is not configurable, that connection shall fail, but pre-existing connections shall not be affected. However, currently hci_cc_le_set_cig_params deletes all CIS of the CIG on error so it doesn't work, even though controller shall not change CIG/CIS configuration if the command fails. Fix by failing on command error only the connections that are not yet bound, so that we keep the previous CIS configuration like the controller does. 
Fixes: 26afbd826ee3 ("Bluetooth: Add initial implementation of CIS connections") Signed-off-by: Pauli Virtanen Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_event.c | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 218da9b0fe8f..559b6080706c 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -3799,6 +3799,22 @@ static u8 hci_cc_le_read_buffer_size_v2(struct hci_dev *hdev, void *data, return rp->status; } +static void hci_unbound_cis_failed(struct hci_dev *hdev, u8 cig, u8 status) +{ + struct hci_conn *conn, *tmp; + + lockdep_assert_held(&hdev->lock); + + list_for_each_entry_safe(conn, tmp, &hdev->conn_hash.list, list) { + if (conn->type != ISO_LINK || !bacmp(&conn->dst, BDADDR_ANY) || + conn->state == BT_OPEN || conn->iso_qos.ucast.cig != cig) + continue; + + if (HCI_CONN_HANDLE_UNSET(conn->handle)) + hci_conn_failed(conn, status); + } +} + static u8 hci_cc_le_set_cig_params(struct hci_dev *hdev, void *data, struct sk_buff *skb) { @@ -3820,12 +3836,15 @@ static u8 hci_cc_le_set_cig_params(struct hci_dev *hdev, void *data, hci_dev_lock(hdev); + /* BLUETOOTH CORE SPECIFICATION Version 5.4 | Vol 4, Part E page 2554 + * + * If the Status return parameter is non-zero, then the state of the CIG + * and its CIS configurations shall not be changed by the command. If + * the CIG did not already exist, it shall not be created. + */ if (status) { - while ((conn = hci_conn_hash_lookup_cig(hdev, rp->cig_id))) { - conn->state = BT_CLOSED; - hci_connect_cfm(conn, status); - hci_conn_del(conn); - } + /* Keep current configuration, fail only the unbound CIS */ + hci_unbound_cis_failed(hdev, rp->cig_id, status); goto unlock; } -- cgit v1.2.3 From b5793de3cfaefef34a1fc9305c9fe3dbcd0ac792 Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Sat, 5 Aug 2023 19:08:42 +0300 Subject: Bluetooth: hci_conn: avoid checking uninitialized CIG/CIS ids The CIS/CIG ids of ISO connections are defined only when the connection is unicast. Fix the lookup functions to check for unicast first. Ensure CIG/CIS IDs have valid value also in state BT_OPEN. 
Signed-off-by: Pauli Virtanen Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 4 ++-- net/bluetooth/hci_conn.c | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index f4462c325e2a..c53d74236e3a 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1219,7 +1219,7 @@ static inline struct hci_conn *hci_conn_hash_lookup_cis(struct hci_dev *hdev, rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (c->type != ISO_LINK) + if (c->type != ISO_LINK || !bacmp(&c->dst, BDADDR_ANY)) continue; /* Match CIG ID if set */ @@ -1251,7 +1251,7 @@ static inline struct hci_conn *hci_conn_hash_lookup_cig(struct hci_dev *hdev, rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (c->type != ISO_LINK) + if (c->type != ISO_LINK || !bacmp(&c->dst, BDADDR_ANY)) continue; if (handle == c->iso_qos.ucast.cig) { diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 4ad6af4e3145..234746721047 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -1866,6 +1866,8 @@ struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst, return ERR_PTR(-ENOMEM); cis->cleanup = cis_cleanup; cis->dst_type = dst_type; + cis->iso_qos.ucast.cig = BT_ISO_QOS_CIG_UNSET; + cis->iso_qos.ucast.cis = BT_ISO_QOS_CIS_UNSET; } if (cis->state == BT_CONNECTED) -- cgit v1.2.3 From e2142825c120d4317abf7160a0fc34b3de532586 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Fri, 11 Aug 2023 10:55:27 +0800 Subject: net: tcp: send zero-window ACK when no memory For now, skb will be dropped when no memory, which makes client keep retrans util timeout and it's not friendly to the users. In this patch, we reply an ACK with zero-window in this case to update the snd_wnd of the sender to 0. Therefore, the sender won't timeout the connection and will probe the zero-window with the retransmits. Signed-off-by: Menglong Dong Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_connection_sock.h | 3 ++- net/ipv4/tcp_input.c | 18 ++++++++++++------ net/ipv4/tcp_output.c | 14 +++++++++++--- 3 files changed, 25 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index c2b15f7e5516..be3c858a2ebb 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -164,7 +164,8 @@ enum inet_csk_ack_state_t { ICSK_ACK_TIMER = 2, ICSK_ACK_PUSHED = 4, ICSK_ACK_PUSHED2 = 8, - ICSK_ACK_NOW = 16 /* Send the next ACK immediately (once) */ + ICSK_ACK_NOW = 16, /* Send the next ACK immediately (once) */ + ICSK_ACK_NOMEM = 32, }; void inet_csk_init_xmit_timers(struct sock *sk, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8e96ebe373d7..2ac059483410 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5059,13 +5059,19 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) /* Ok. In sequence. In window. */ queue_and_out: - if (skb_queue_len(&sk->sk_receive_queue) == 0) - sk_forced_mem_schedule(sk, skb->truesize); - else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) { - reason = SKB_DROP_REASON_PROTO_MEM; - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP); + if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) { + /* TODO: maybe ratelimit these WIN 0 ACK ? 
*/ + inet_csk(sk)->icsk_ack.pending |= + (ICSK_ACK_NOMEM | ICSK_ACK_NOW); + inet_csk_schedule_ack(sk); sk->sk_data_ready(sk); - goto drop; + + if (skb_queue_len(&sk->sk_receive_queue)) { + reason = SKB_DROP_REASON_PROTO_MEM; + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP); + goto drop; + } + sk_forced_mem_schedule(sk, skb->truesize); } eaten = tcp_queue_rcv(sk, skb, &fragstolen); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index c5412ee77fc8..769a558159ee 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -257,11 +257,19 @@ EXPORT_SYMBOL(tcp_select_initial_window); static u16 tcp_select_window(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - u32 old_win = tp->rcv_wnd; - u32 cur_win = tcp_receive_window(tp); - u32 new_win = __tcp_select_window(sk); struct net *net = sock_net(sk); + u32 old_win = tp->rcv_wnd; + u32 cur_win, new_win; + + /* Make the window 0 if we failed to queue the data because we + * are out of memory. The window is temporary, so we don't store + * it on the socket. + */ + if (unlikely(inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOMEM)) + return 0; + cur_win = tcp_receive_window(tp); + new_win = __tcp_select_window(sk); if (new_win < cur_win) { /* Danger Will Robinson! * Don't update rcv_wup/rcv_wnd here or else -- cgit v1.2.3 From 800a666141de75baebb59b347b9c9e43d963e4da Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Fri, 11 Aug 2023 10:55:28 +0800 Subject: net: tcp: allow zero-window ACK to update the window For now, an ACK can update the window in the following cases, according to tcp_may_update_window(): 1. the ACK acknowledged new data 2. the ACK has new data 3. the ACK expands the window and its seq is valid Now, we allow the ACK to update the window if the window is 0 and its seq/ack is valid. This is for the case where the receiver replies with a zero-window ACK when it is under memory stress and can't queue the new data. Signed-off-by: Menglong Dong Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2ac059483410..d34d52fdfdb1 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3525,7 +3525,7 @@ static inline bool tcp_may_update_window(const struct tcp_sock *tp, { return after(ack, tp->snd_una) || after(ack_seq, tp->snd_wl1) || - (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd); + (ack_seq == tp->snd_wl1 && (nwin > tp->snd_wnd || !nwin)); } /* If we update tp->snd_una, also update tp->bytes_acked */ -- cgit v1.2.3 From e89688e3e97868451a5d05b38a9d2633d6785cd4 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Fri, 11 Aug 2023 10:55:29 +0800 Subject: net: tcp: fix unexpected socket death when snd_wnd is 0 In tcp_retransmit_timer(), a window-shrunk connection will be regarded as timed out if 'tcp_jiffies32 - tp->rcv_tstamp > TCP_RTO_MAX'. This is not right all the time. The retransmits will become zero-window probes in tcp_retransmit_timer() if 'snd_wnd == 0'. Therefore, the icsk->icsk_rto will come up to TCP_RTO_MAX sooner or later. However, the timer can be delayed and be triggered after 122877ms, not TCP_RTO_MAX, as I tested. Therefore, 'tcp_jiffies32 - tp->rcv_tstamp > TCP_RTO_MAX' is always true once the RTO comes up to TCP_RTO_MAX, and the socket will die. Fix this by replacing 'tcp_jiffies32' with '(u32)icsk->icsk_timeout', which is exactly the timestamp of the timeout.
However, "tp->rcv_tstamp" can restart from idle, then tp->rcv_tstamp could already be a long time (minutes or hours) in the past even on the first RTO. So we double check the timeout with the duration of the retransmission. Meanwhile, making "2 * TCP_RTO_MAX" as the timeout to avoid the socket dying too soon. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Link: https://lore.kernel.org/netdev/CADxym3YyMiO+zMD4zj03YPM3FBi-1LHi6gSD2XT8pyAMM096pg@mail.gmail.com/ Signed-off-by: Menglong Dong Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_timer.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index d45c96c7f5a4..f2a52c11e044 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -454,6 +454,22 @@ static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req) req->timeout << req->num_timeout, TCP_RTO_MAX); } +static bool tcp_rtx_probe0_timed_out(const struct sock *sk, + const struct sk_buff *skb) +{ + const struct tcp_sock *tp = tcp_sk(sk); + const int timeout = TCP_RTO_MAX * 2; + u32 rcv_delta, rtx_delta; + + rcv_delta = inet_csk(sk)->icsk_timeout - tp->rcv_tstamp; + if (rcv_delta <= timeout) + return false; + + rtx_delta = (u32)msecs_to_jiffies(tcp_time_stamp(tp) - + (tp->retrans_stamp ?: tcp_skb_timestamp(skb))); + + return rtx_delta > timeout; +} /** * tcp_retransmit_timer() - The TCP retransmit timeout handler @@ -519,7 +535,7 @@ void tcp_retransmit_timer(struct sock *sk) tp->snd_una, tp->snd_nxt); } #endif - if (tcp_jiffies32 - tp->rcv_tstamp > TCP_RTO_MAX) { + if (tcp_rtx_probe0_timed_out(sk, skb)) { tcp_write_err(sk); goto out; } -- cgit v1.2.3 From 031c44b7527aec2f22ddaae4bcd8b085ff810ec4 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Fri, 11 Aug 2023 10:55:30 +0800 Subject: net: tcp: refactor the dbg message in tcp_retransmit_timer() The debug message in tcp_retransmit_timer() is slightly wrong, because they could be printed even if we did not receive a new ACK packet from the remote peer. Change it to probing zero-window, as it is a expected case now. The description may be not correct. Adding the duration since the last ACK we received, and the duration of the retransmission, which are useful for debugging. And the message now like this: Probing zero-window on 127.0.0.1:9999/46946, seq=3737778959:3737791503, recv 209ms ago, lasting 209ms Probing zero-window on 127.0.0.1:9999/46946, seq=3737778959:3737791503, recv 404ms ago, lasting 408ms Probing zero-window on 127.0.0.1:9999/46946, seq=3737778959:3737791503, recv 812ms ago, lasting 1224ms Signed-off-by: Menglong Dong Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_timer.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index f2a52c11e044..74c70fc1003c 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -519,20 +519,23 @@ void tcp_retransmit_timer(struct sock *sk) * we cannot allow such beasts to hang infinitely. 
*/ struct inet_sock *inet = inet_sk(sk); + u32 rtx_delta; + + rtx_delta = tcp_time_stamp(tp) - (tp->retrans_stamp ?: tcp_skb_timestamp(skb)); if (sk->sk_family == AF_INET) { - net_dbg_ratelimited("Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n", - &inet->inet_daddr, - ntohs(inet->inet_dport), - inet->inet_num, - tp->snd_una, tp->snd_nxt); + net_dbg_ratelimited("Probing zero-window on %pI4:%u/%u, seq=%u:%u, recv %ums ago, lasting %ums\n", + &inet->inet_daddr, ntohs(inet->inet_dport), + inet->inet_num, tp->snd_una, tp->snd_nxt, + jiffies_to_msecs(jiffies - tp->rcv_tstamp), + rtx_delta); } #if IS_ENABLED(CONFIG_IPV6) else if (sk->sk_family == AF_INET6) { - net_dbg_ratelimited("Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n", - &sk->sk_v6_daddr, - ntohs(inet->inet_dport), - inet->inet_num, - tp->snd_una, tp->snd_nxt); + net_dbg_ratelimited("Probing zero-window on %pI6:%u/%u, seq=%u:%u, recv %ums ago, lasting %ums\n", + &sk->sk_v6_daddr, ntohs(inet->inet_dport), + inet->inet_num, tp->snd_una, tp->snd_nxt, + jiffies_to_msecs(jiffies - tp->rcv_tstamp), + rtx_delta); } #endif if (tcp_rtx_probe0_timed_out(sk, skb)) { -- cgit v1.2.3 From 8fe08d70a2b61b35a0a1235c78cf321e7528351f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 11 Aug 2023 07:22:26 +0000 Subject: netlink: convert nlk->flags to atomic flags sk_diag_put_flags(), netlink_setsockopt(), netlink_getsockopt() and others use nlk->flags without correct locking. Use set_bit(), clear_bit(), test_bit(), assign_bit() to remove data-races. Reported-by: syzbot Signed-off-by: Eric Dumazet Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 90 ++++++++++++++++-------------------------------- net/netlink/af_netlink.h | 22 +++++++----- net/netlink/diag.c | 10 +++--- 3 files changed, 48 insertions(+), 74 deletions(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 96c605e45235..642b9d382fb4 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -84,7 +84,7 @@ struct listeners { static inline int netlink_is_kernel(struct sock *sk) { - return nlk_sk(sk)->flags & NETLINK_F_KERNEL_SOCKET; + return nlk_test_bit(KERNEL_SOCKET, sk); } struct netlink_table *nl_table __read_mostly; @@ -349,9 +349,7 @@ static void netlink_deliver_tap_kernel(struct sock *dst, struct sock *src, static void netlink_overrun(struct sock *sk) { - struct netlink_sock *nlk = nlk_sk(sk); - - if (!(nlk->flags & NETLINK_F_RECV_NO_ENOBUFS)) { + if (!nlk_test_bit(RECV_NO_ENOBUFS, sk)) { if (!test_and_set_bit(NETLINK_S_CONGESTED, &nlk_sk(sk)->state)) { sk->sk_err = ENOBUFS; @@ -1407,9 +1405,7 @@ EXPORT_SYMBOL_GPL(netlink_has_listeners); bool netlink_strict_get_check(struct sk_buff *skb) { - const struct netlink_sock *nlk = nlk_sk(NETLINK_CB(skb).sk); - - return nlk->flags & NETLINK_F_STRICT_CHK; + return nlk_test_bit(STRICT_CHK, NETLINK_CB(skb).sk); } EXPORT_SYMBOL_GPL(netlink_strict_get_check); @@ -1455,7 +1451,7 @@ static void do_one_broadcast(struct sock *sk, return; if (!net_eq(sock_net(sk), p->net)) { - if (!(nlk->flags & NETLINK_F_LISTEN_ALL_NSID)) + if (!nlk_test_bit(LISTEN_ALL_NSID, sk)) return; if (!peernet_has_id(sock_net(sk), p->net)) @@ -1488,7 +1484,7 @@ static void do_one_broadcast(struct sock *sk, netlink_overrun(sk); /* Clone failed. Notify ALL listeners. 
*/ p->failure = 1; - if (nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR) + if (nlk_test_bit(BROADCAST_SEND_ERROR, sk)) p->delivery_failure = 1; goto out; } @@ -1510,7 +1506,7 @@ static void do_one_broadcast(struct sock *sk, val = netlink_broadcast_deliver(sk, p->skb2); if (val < 0) { netlink_overrun(sk); - if (nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR) + if (nlk_test_bit(BROADCAST_SEND_ERROR, sk)) p->delivery_failure = 1; } else { p->congested |= val; @@ -1604,7 +1600,7 @@ static int do_one_set_err(struct sock *sk, struct netlink_set_err_data *p) !test_bit(p->group - 1, nlk->groups)) goto out; - if (p->code == ENOBUFS && nlk->flags & NETLINK_F_RECV_NO_ENOBUFS) { + if (p->code == ENOBUFS && nlk_test_bit(RECV_NO_ENOBUFS, sk)) { ret = 1; goto out; } @@ -1668,7 +1664,7 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); unsigned int val = 0; - int err; + int nr = -1; if (level != SOL_NETLINK) return -ENOPROTOOPT; @@ -1679,14 +1675,12 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, switch (optname) { case NETLINK_PKTINFO: - if (val) - nlk->flags |= NETLINK_F_RECV_PKTINFO; - else - nlk->flags &= ~NETLINK_F_RECV_PKTINFO; - err = 0; + nr = NETLINK_F_RECV_PKTINFO; break; case NETLINK_ADD_MEMBERSHIP: case NETLINK_DROP_MEMBERSHIP: { + int err; + if (!netlink_allowed(sock, NL_CFG_F_NONROOT_RECV)) return -EPERM; err = netlink_realloc_groups(sk); @@ -1706,61 +1700,38 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, if (optname == NETLINK_DROP_MEMBERSHIP && nlk->netlink_unbind) nlk->netlink_unbind(sock_net(sk), val); - err = 0; break; } case NETLINK_BROADCAST_ERROR: - if (val) - nlk->flags |= NETLINK_F_BROADCAST_SEND_ERROR; - else - nlk->flags &= ~NETLINK_F_BROADCAST_SEND_ERROR; - err = 0; + nr = NETLINK_F_BROADCAST_SEND_ERROR; break; case NETLINK_NO_ENOBUFS: + assign_bit(NETLINK_F_RECV_NO_ENOBUFS, &nlk->flags, val); if (val) { - nlk->flags |= NETLINK_F_RECV_NO_ENOBUFS; clear_bit(NETLINK_S_CONGESTED, &nlk->state); wake_up_interruptible(&nlk->wait); - } else { - nlk->flags &= ~NETLINK_F_RECV_NO_ENOBUFS; } - err = 0; break; case NETLINK_LISTEN_ALL_NSID: if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_BROADCAST)) return -EPERM; - - if (val) - nlk->flags |= NETLINK_F_LISTEN_ALL_NSID; - else - nlk->flags &= ~NETLINK_F_LISTEN_ALL_NSID; - err = 0; + nr = NETLINK_F_LISTEN_ALL_NSID; break; case NETLINK_CAP_ACK: - if (val) - nlk->flags |= NETLINK_F_CAP_ACK; - else - nlk->flags &= ~NETLINK_F_CAP_ACK; - err = 0; + nr = NETLINK_F_CAP_ACK; break; case NETLINK_EXT_ACK: - if (val) - nlk->flags |= NETLINK_F_EXT_ACK; - else - nlk->flags &= ~NETLINK_F_EXT_ACK; - err = 0; + nr = NETLINK_F_EXT_ACK; break; case NETLINK_GET_STRICT_CHK: - if (val) - nlk->flags |= NETLINK_F_STRICT_CHK; - else - nlk->flags &= ~NETLINK_F_STRICT_CHK; - err = 0; + nr = NETLINK_F_STRICT_CHK; break; default: - err = -ENOPROTOOPT; + return -ENOPROTOOPT; } - return err; + if (nr >= 0) + assign_bit(nr, &nlk->flags, val); + return 0; } static int netlink_getsockopt(struct socket *sock, int level, int optname, @@ -1827,7 +1798,7 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname, return -EINVAL; len = sizeof(int); - val = nlk->flags & flag ? 
1 : 0; + val = test_bit(flag, &nlk->flags); if (put_user(len, optlen) || copy_to_user(optval, &val, len)) @@ -2004,9 +1975,9 @@ static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, msg->msg_namelen = sizeof(*addr); } - if (nlk->flags & NETLINK_F_RECV_PKTINFO) + if (nlk_test_bit(RECV_PKTINFO, sk)) netlink_cmsg_recv_pktinfo(msg, skb); - if (nlk->flags & NETLINK_F_LISTEN_ALL_NSID) + if (nlk_test_bit(LISTEN_ALL_NSID, sk)) netlink_cmsg_listen_all_nsid(sk, msg, skb); memset(&scm, 0, sizeof(scm)); @@ -2083,7 +2054,7 @@ __netlink_kernel_create(struct net *net, int unit, struct module *module, goto out_sock_release; nlk = nlk_sk(sk); - nlk->flags |= NETLINK_F_KERNEL_SOCKET; + set_bit(NETLINK_F_KERNEL_SOCKET, &nlk->flags); netlink_table_grab(); if (!nl_table[unit].registered) { @@ -2218,7 +2189,7 @@ static int netlink_dump_done(struct netlink_sock *nlk, struct sk_buff *skb, nl_dump_check_consistent(cb, nlh); memcpy(nlmsg_data(nlh), &nlk->dump_done_errno, sizeof(nlk->dump_done_errno)); - if (extack->_msg && nlk->flags & NETLINK_F_EXT_ACK) { + if (extack->_msg && test_bit(NETLINK_F_EXT_ACK, &nlk->flags)) { nlh->nlmsg_flags |= NLM_F_ACK_TLVS; if (!nla_put_string(skb, NLMSGERR_ATTR_MSG, extack->_msg)) nlmsg_end(skb, nlh); @@ -2347,8 +2318,8 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, const struct nlmsghdr *nlh, struct netlink_dump_control *control) { - struct netlink_sock *nlk, *nlk2; struct netlink_callback *cb; + struct netlink_sock *nlk; struct sock *sk; int ret; @@ -2383,8 +2354,7 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, cb->min_dump_alloc = control->min_dump_alloc; cb->skb = skb; - nlk2 = nlk_sk(NETLINK_CB(skb).sk); - cb->strict_check = !!(nlk2->flags & NETLINK_F_STRICT_CHK); + cb->strict_check = nlk_test_bit(STRICT_CHK, NETLINK_CB(skb).sk); if (control->start) { cb->extack = control->extack; @@ -2428,7 +2398,7 @@ netlink_ack_tlv_len(struct netlink_sock *nlk, int err, { size_t tlvlen; - if (!extack || !(nlk->flags & NETLINK_F_EXT_ACK)) + if (!extack || !test_bit(NETLINK_F_EXT_ACK, &nlk->flags)) return 0; tlvlen = 0; @@ -2500,7 +2470,7 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, * requests to cap the error message, and get extra error data if * requested. 
*/ - if (err && !(nlk->flags & NETLINK_F_CAP_ACK)) + if (err && !test_bit(NETLINK_F_CAP_ACK, &nlk->flags)) payload += nlmsg_len(nlh); else flags |= NLM_F_CAPPED; diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h index fd424cd63f31..2145979b9986 100644 --- a/net/netlink/af_netlink.h +++ b/net/netlink/af_netlink.h @@ -8,14 +8,16 @@ #include /* flags */ -#define NETLINK_F_KERNEL_SOCKET 0x1 -#define NETLINK_F_RECV_PKTINFO 0x2 -#define NETLINK_F_BROADCAST_SEND_ERROR 0x4 -#define NETLINK_F_RECV_NO_ENOBUFS 0x8 -#define NETLINK_F_LISTEN_ALL_NSID 0x10 -#define NETLINK_F_CAP_ACK 0x20 -#define NETLINK_F_EXT_ACK 0x40 -#define NETLINK_F_STRICT_CHK 0x80 +enum { + NETLINK_F_KERNEL_SOCKET, + NETLINK_F_RECV_PKTINFO, + NETLINK_F_BROADCAST_SEND_ERROR, + NETLINK_F_RECV_NO_ENOBUFS, + NETLINK_F_LISTEN_ALL_NSID, + NETLINK_F_CAP_ACK, + NETLINK_F_EXT_ACK, + NETLINK_F_STRICT_CHK, +}; #define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8) #define NLGRPLONGS(x) (NLGRPSZ(x)/sizeof(unsigned long)) @@ -23,10 +25,10 @@ struct netlink_sock { /* struct sock has to be the first member of netlink_sock */ struct sock sk; + unsigned long flags; u32 portid; u32 dst_portid; u32 dst_group; - u32 flags; u32 subscriptions; u32 ngroups; unsigned long *groups; @@ -56,6 +58,8 @@ static inline struct netlink_sock *nlk_sk(struct sock *sk) return container_of(sk, struct netlink_sock, sk); } +#define nlk_test_bit(nr, sk) test_bit(NETLINK_F_##nr, &nlk_sk(sk)->flags) + struct netlink_table { struct rhashtable hash; struct hlist_head mc_list; diff --git a/net/netlink/diag.c b/net/netlink/diag.c index e4f21b1067bc..9c4f231be275 100644 --- a/net/netlink/diag.c +++ b/net/netlink/diag.c @@ -27,15 +27,15 @@ static int sk_diag_put_flags(struct sock *sk, struct sk_buff *skb) if (nlk->cb_running) flags |= NDIAG_FLAG_CB_RUNNING; - if (nlk->flags & NETLINK_F_RECV_PKTINFO) + if (nlk_test_bit(RECV_PKTINFO, sk)) flags |= NDIAG_FLAG_PKTINFO; - if (nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR) + if (nlk_test_bit(BROADCAST_SEND_ERROR, sk)) flags |= NDIAG_FLAG_BROADCAST_ERROR; - if (nlk->flags & NETLINK_F_RECV_NO_ENOBUFS) + if (nlk_test_bit(RECV_NO_ENOBUFS, sk)) flags |= NDIAG_FLAG_NO_ENOBUFS; - if (nlk->flags & NETLINK_F_LISTEN_ALL_NSID) + if (nlk_test_bit(LISTEN_ALL_NSID, sk)) flags |= NDIAG_FLAG_LISTEN_ALL_NSID; - if (nlk->flags & NETLINK_F_CAP_ACK) + if (nlk_test_bit(CAP_ACK, sk)) flags |= NDIAG_FLAG_CAP_ACK; return nla_put_u32(skb, NETLINK_DIAG_FLAGS, flags); -- cgit v1.2.3 From 2b8893b639e4ad38fa0a941d665e827963ede727 Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Fri, 11 Aug 2023 17:50:10 +0800 Subject: net/rds: Remove unused function declarations Commit 39de8281791c ("RDS: Main header file") declared but never implemented rds_trans_init() and rds_trans_exit(), remove it. Commit d37c9359056f ("RDS: Move loop-only function to loop.c") removed the implementation rds_message_inc_free() but not the declaration. Since commit 55b7ed0b582f ("RDS: Common RDMA transport code") rds_rdma_conn_connect() is never implemented and used. rds_tcp_map_seq() is never implemented and used since commit 70041088e3b9 ("RDS: Add TCP transport to RDS"). Signed-off-by: Yue Haibing Reviewed-by: Simon Horman Signed-off-by: David S. 
Miller --- net/rds/rdma_transport.h | 1 - net/rds/rds.h | 3 --- net/rds/tcp.h | 1 - 3 files changed, 5 deletions(-) (limited to 'net') diff --git a/net/rds/rdma_transport.h b/net/rds/rdma_transport.h index ca4c3a667091..d2fdb1529585 100644 --- a/net/rds/rdma_transport.h +++ b/net/rds/rdma_transport.h @@ -17,7 +17,6 @@ */ #define RDS_RDMA_REJ_INCOMPAT 1 -int rds_rdma_conn_connect(struct rds_connection *conn); int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, struct rdma_cm_event *event); int rds6_rdma_cm_event_handler(struct rdma_cm_id *cm_id, diff --git a/net/rds/rds.h b/net/rds/rds.h index d35d1fc39807..dc360252c515 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -863,7 +863,6 @@ int rds_message_next_extension(struct rds_header *hdr, unsigned int *pos, void *buf, unsigned int *buflen); int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset); int rds_message_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to); -void rds_message_inc_free(struct rds_incoming *inc); void rds_message_addref(struct rds_message *rm); void rds_message_put(struct rds_message *rm); void rds_message_wait(struct rds_message *rm); @@ -1013,7 +1012,5 @@ void rds_trans_put(struct rds_transport *trans); unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter, unsigned int avail); struct rds_transport *rds_trans_get(int t_type); -int rds_trans_init(void); -void rds_trans_exit(void); #endif diff --git a/net/rds/tcp.h b/net/rds/tcp.h index f8b5930d7b34..053aa7da87ef 100644 --- a/net/rds/tcp.h +++ b/net/rds/tcp.h @@ -56,7 +56,6 @@ void rds_tcp_restore_callbacks(struct socket *sock, struct rds_tcp_connection *tc); u32 rds_tcp_write_seq(struct rds_tcp_connection *tc); u32 rds_tcp_snd_una(struct rds_tcp_connection *tc); -u64 rds_tcp_map_seq(struct rds_tcp_connection *tc, u32 seq); extern struct rds_transport rds_tcp_transport; void rds_tcp_accept_work(struct sock *sk); int rds_tcp_laddr_check(struct net *net, const struct in6_addr *addr, -- cgit v1.2.3 From 131a627751e3474bc8e33009c032feecfe8d879f Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 11 Aug 2023 17:57:14 +0200 Subject: mptcp: avoid unneeded mptcp_token_destroy() calls The MPTCP protocol currently clears the msk token both at connect() and listen() time. That is needed to deal with failing connect() calls that can create a new token while leaving the sk in TCP_CLOSE,SS_UNCONNECTED status and thus allowing later connect() and/or listen() calls. Let's deal with such failures explicitly, cleaning the token in a timely manner and avoid the confusing early mptcp_token_destroy(). Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts Signed-off-by: David S. 
Miller --- net/mptcp/protocol.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 48e649fe2360..abb310548c37 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3594,7 +3594,6 @@ static int mptcp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (IS_ERR(ssock)) return PTR_ERR(ssock); - mptcp_token_destroy(msk); inet_sk_state_store(sk, TCP_SYN_SENT); subflow = mptcp_subflow_ctx(ssock->sk); #ifdef CONFIG_TCP_MD5SIG @@ -3624,6 +3623,8 @@ static int mptcp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) * subflow_finish_connect() */ if (unlikely(err && err != -EINPROGRESS)) { + /* avoid leaving a dangling token in an unconnected socket */ + mptcp_token_destroy(msk); inet_sk_state_store(sk, inet_sk_state_load(ssock->sk)); return err; } @@ -3713,7 +3714,6 @@ static int mptcp_listen(struct socket *sock, int backlog) goto unlock; } - mptcp_token_destroy(msk); inet_sk_state_store(sk, TCP_LISTEN); sock_set_flag(sk, SOCK_RCU_FREE); -- cgit v1.2.3 From ccae357c1c6a76c3b46f12ae18d7679d871c9390 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 11 Aug 2023 17:57:15 +0200 Subject: mptcp: avoid additional __inet_stream_connect() call The mptcp protocol maintains an additional socket just to easily invoke a few stream operations on the first subflow. One of them is __inet_stream_connect(). We are going to remove the first subflow socket soon, so avoid the additional indirection via at connect time, calling directly into the sock-level connect() ops. The sk-level connect never return -EINPROGRESS, cleanup the error path accordingly. Additionally, the ssk status on error is always TCP_CLOSE. Avoid unneeded access to the subflow sk state. No functional change intended. Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 49 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index abb310548c37..b888d6339c80 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3589,22 +3589,24 @@ static int mptcp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) struct mptcp_sock *msk = mptcp_sk(sk); struct socket *ssock; int err = -EINVAL; + struct sock *ssk; ssock = __mptcp_nmpc_socket(msk); if (IS_ERR(ssock)) return PTR_ERR(ssock); inet_sk_state_store(sk, TCP_SYN_SENT); - subflow = mptcp_subflow_ctx(ssock->sk); + ssk = msk->first; + subflow = mptcp_subflow_ctx(ssk); #ifdef CONFIG_TCP_MD5SIG /* no MPTCP if MD5SIG is enabled on this socket or we may run out of * TCP option space. */ - if (rcu_access_pointer(tcp_sk(ssock->sk)->md5sig_info)) + if (rcu_access_pointer(tcp_sk(ssk)->md5sig_info)) mptcp_subflow_early_fallback(msk, subflow); #endif - if (subflow->request_mptcp && mptcp_token_new_connect(ssock->sk)) { - MPTCP_INC_STATS(sock_net(ssock->sk), MPTCP_MIB_TOKENFALLBACKINIT); + if (subflow->request_mptcp && mptcp_token_new_connect(ssk)) { + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_TOKENFALLBACKINIT); mptcp_subflow_early_fallback(msk, subflow); } if (likely(!__mptcp_check_fallback(msk))) @@ -3613,27 +3615,42 @@ static int mptcp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) /* if reaching here via the fastopen/sendmsg path, the caller already * acquired the subflow socket lock, too. 
*/ - if (msk->fastopening) - err = __inet_stream_connect(ssock, uaddr, addr_len, O_NONBLOCK, 1); - else - err = inet_stream_connect(ssock, uaddr, addr_len, O_NONBLOCK); - inet_sk(sk)->defer_connect = inet_sk(ssock->sk)->defer_connect; + if (!msk->fastopening) + lock_sock(ssk); + + /* the following mirrors closely a very small chunk of code from + * __inet_stream_connect() + */ + if (ssk->sk_state != TCP_CLOSE) + goto out; + + if (BPF_CGROUP_PRE_CONNECT_ENABLED(ssk)) { + err = ssk->sk_prot->pre_connect(ssk, uaddr, addr_len); + if (err) + goto out; + } + + err = ssk->sk_prot->connect(ssk, uaddr, addr_len); + if (err < 0) + goto out; + + inet_sk(sk)->defer_connect = inet_sk(ssk)->defer_connect; + +out: + if (!msk->fastopening) + release_sock(ssk); /* on successful connect, the msk state will be moved to established by * subflow_finish_connect() */ - if (unlikely(err && err != -EINPROGRESS)) { + if (unlikely(err)) { /* avoid leaving a dangling token in an unconnected socket */ mptcp_token_destroy(msk); - inet_sk_state_store(sk, inet_sk_state_load(ssock->sk)); + inet_sk_state_store(sk, TCP_CLOSE); return err; } - mptcp_copy_inaddrs(sk, ssock->sk); - - /* silence EINPROGRESS and let the caller inet_stream_connect - * handle the connection in progress - */ + mptcp_copy_inaddrs(sk, ssk); return 0; } -- cgit v1.2.3 From cfb63e50d319ee5d11254c83d3ad2b135b8735db Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 11 Aug 2023 17:57:16 +0200 Subject: mptcp: avoid subflow socket usage in mptcp_get_port() We are going to remove the first subflow socket soon, so avoid accessing it in mptcp_get_port(). Instead, access directly the first subflow sock. Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index b888d6339c80..891f49722263 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3405,14 +3405,12 @@ static void mptcp_unhash(struct sock *sk) static int mptcp_get_port(struct sock *sk, unsigned short snum) { struct mptcp_sock *msk = mptcp_sk(sk); - struct socket *ssock; - ssock = msk->subflow; - pr_debug("msk=%p, subflow=%p", msk, ssock); - if (WARN_ON_ONCE(!ssock)) + pr_debug("msk=%p, ssk=%p", msk, msk->first); + if (WARN_ON_ONCE(!msk->first)) return -EINVAL; - return inet_csk_get_port(ssock->sk, snum); + return inet_csk_get_port(msk->first, snum); } void mptcp_finish_connect(struct sock *ssk) -- cgit v1.2.3 From e6d360ff87f005e5b28edc26cb43718244ae7e73 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 11 Aug 2023 17:57:17 +0200 Subject: net: factor out inet{,6}_bind_sk helpers The mptcp protocol maintains an additional socket just to easily invoke a few stream operations on the first subflow. One of them is bind(). Factor out the helpers operating directly on the struct sock, to allow get rid of the above dependency in the next patch without duplicating the existing code. No functional changes intended. Signed-off-by: Paolo Abeni Acked-by: Mat Martineau Signed-off-by: Matthieu Baerts Signed-off-by: David S. 
Miller --- include/net/inet_common.h | 1 + include/net/ipv6.h | 1 + net/ipv4/af_inet.c | 8 ++++++-- net/ipv6/af_inet6.c | 10 +++++++--- 4 files changed, 15 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/net/inet_common.h b/include/net/inet_common.h index b86b8e21de7f..8e97de700991 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -42,6 +42,7 @@ int inet_shutdown(struct socket *sock, int how); int inet_listen(struct socket *sock, int backlog); void inet_sock_destruct(struct sock *sk); int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); +int inet_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len); /* Don't allocate port at this moment, defer to connect. */ #define BIND_FORCE_ADDRESS_NO_PORT (1 << 0) /* Grab and release socket lock. */ diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 2acc4c808d45..22643ffc2df8 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1216,6 +1216,7 @@ void inet6_cleanup_sock(struct sock *sk); void inet6_sock_destruct(struct sock *sk); int inet6_release(struct socket *sock); int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); +int inet6_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len); int inet6_getname(struct socket *sock, struct sockaddr *uaddr, int peer); int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 9b2ca2fcc5a1..2fd23437c1d2 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -431,9 +431,8 @@ int inet_release(struct socket *sock) } EXPORT_SYMBOL(inet_release); -int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +int inet_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len) { - struct sock *sk = sock->sk; u32 flags = BIND_WITH_LOCK; int err; @@ -454,6 +453,11 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) return __inet_bind(sk, uaddr, addr_len, flags); } + +int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + return inet_bind_sk(sock->sk, uaddr, addr_len); +} EXPORT_SYMBOL(inet_bind); int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 9f9c4b838664..3ec0359d5c1f 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -435,10 +435,8 @@ out_unlock: goto out; } -/* bind for INET6 API */ -int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +int inet6_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len) { - struct sock *sk = sock->sk; u32 flags = BIND_WITH_LOCK; const struct proto *prot; int err = 0; @@ -462,6 +460,12 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) return __inet6_bind(sk, uaddr, addr_len, flags); } + +/* bind for INET6 API */ +int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + return inet6_bind_sk(sock->sk, uaddr, addr_len); +} EXPORT_SYMBOL(inet6_bind); int inet6_release(struct socket *sock) -- cgit v1.2.3 From 8cf2ebdc0078a841fd54fcd50574bae1ae910d94 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 11 Aug 2023 17:57:18 +0200 Subject: mptcp: avoid additional indirection in mptcp_bind() We are going to remove the first subflow socket soon, so avoid the additional indirection at bind() time. Instead, call the recently introduced helpers directly on the first subflow sock.
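The resulting bind path, shown in full in the diff below, boils down to dispatching on the socket family and operating on the first subflow sock; this sketch restates the patch and uses only names taken from it:

	ssk = msk->first;
	if (sk->sk_family == AF_INET)
		err = inet_bind_sk(ssk, uaddr, addr_len);
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	else if (sk->sk_family == AF_INET6)
		err = inet6_bind_sk(ssk, uaddr, addr_len);
#endif
	if (!err)
		mptcp_copy_inaddrs(sk, ssk);	/* mirror the bound address on the msk */
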
Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 891f49722263..5b4d6f0628a7 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3689,22 +3689,29 @@ static struct proto mptcp_prot = { static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct mptcp_sock *msk = mptcp_sk(sock->sk); + struct sock *ssk, *sk = sock->sk; struct socket *ssock; - int err; + int err = -EINVAL; - lock_sock(sock->sk); + lock_sock(sk); ssock = __mptcp_nmpc_socket(msk); if (IS_ERR(ssock)) { err = PTR_ERR(ssock); goto unlock; } - err = READ_ONCE(ssock->ops)->bind(ssock, uaddr, addr_len); + ssk = msk->first; + if (sk->sk_family == AF_INET) + err = inet_bind_sk(ssk, uaddr, addr_len); +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else if (sk->sk_family == AF_INET6) + err = inet6_bind_sk(ssk, uaddr, addr_len); +#endif if (!err) - mptcp_copy_inaddrs(sock->sk, ssock->sk); + mptcp_copy_inaddrs(sk, ssk); unlock: - release_sock(sock->sk); + release_sock(sk); return err; } -- cgit v1.2.3 From 71a9a874cd6beb074896519f762741fbc448f5be Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 11 Aug 2023 17:57:19 +0200 Subject: net: factor out __inet_listen_sk() helper The mptcp protocol maintains an additional socket just to easily invoke a few stream operations on the first subflow. One of them is inet_listen(). Factor out an helper operating directly on the (locked) struct sock, to allow get rid of the above dependency in the next patch without duplicating the existing code. No functional changes intended. Signed-off-by: Paolo Abeni Acked-by: Mat Martineau Signed-off-by: Matthieu Baerts Signed-off-by: David S. Miller --- include/net/inet_common.h | 1 + net/ipv4/af_inet.c | 38 ++++++++++++++++++++++---------------- 2 files changed, 23 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/include/net/inet_common.h b/include/net/inet_common.h index 8e97de700991..f50a644d87a9 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -40,6 +40,7 @@ int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags); int inet_shutdown(struct socket *sock, int how); int inet_listen(struct socket *sock, int backlog); +int __inet_listen_sk(struct sock *sk, int backlog); void inet_sock_destruct(struct sock *sk); int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); int inet_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 2fd23437c1d2..c59da65f19d2 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -187,24 +187,13 @@ static int inet_autobind(struct sock *sk) return 0; } -/* - * Move a socket into listening state. 
- */ -int inet_listen(struct socket *sock, int backlog) +int __inet_listen_sk(struct sock *sk, int backlog) { - struct sock *sk = sock->sk; - unsigned char old_state; + unsigned char old_state = sk->sk_state; int err, tcp_fastopen; - lock_sock(sk); - - err = -EINVAL; - if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM) - goto out; - - old_state = sk->sk_state; if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN))) - goto out; + return -EINVAL; WRITE_ONCE(sk->sk_max_ack_backlog, backlog); /* Really, if the socket is already in listen state @@ -227,10 +216,27 @@ int inet_listen(struct socket *sock, int backlog) err = inet_csk_listen_start(sk); if (err) - goto out; + return err; + tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_LISTEN_CB, 0, NULL); } - err = 0; + return 0; +} + +/* + * Move a socket into listening state. + */ +int inet_listen(struct socket *sock, int backlog) +{ + struct sock *sk = sock->sk; + int err = -EINVAL; + + lock_sock(sk); + + if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM) + goto out; + + err = __inet_listen_sk(sk, backlog); out: release_sock(sk); -- cgit v1.2.3 From 40f56d0c7043d16e9d49d1df6c02804c09ed5cd6 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 11 Aug 2023 17:57:20 +0200 Subject: mptcp: avoid additional indirection in mptcp_listen() We are going to remove the first subflow socket soon, so avoid the additional indirection via at listen() time. Instead call directly the recently introduced helper on the first subflow sock. No functional changes intended. Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 5b4d6f0628a7..d8b75fbc4f24 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3720,6 +3720,7 @@ static int mptcp_listen(struct socket *sock, int backlog) struct mptcp_sock *msk = mptcp_sk(sock->sk); struct sock *sk = sock->sk; struct socket *ssock; + struct sock *ssk; int err; pr_debug("msk=%p", msk); @@ -3736,15 +3737,19 @@ static int mptcp_listen(struct socket *sock, int backlog) goto unlock; } + ssk = msk->first; inet_sk_state_store(sk, TCP_LISTEN); sock_set_flag(sk, SOCK_RCU_FREE); - err = READ_ONCE(ssock->ops)->listen(ssock, backlog); - inet_sk_state_store(sk, inet_sk_state_load(ssock->sk)); + lock_sock(ssk); + err = __inet_listen_sk(ssk, backlog); + release_sock(ssk); + inet_sk_state_store(sk, inet_sk_state_load(ssk)); + if (!err) { sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); - mptcp_copy_inaddrs(sk, ssock->sk); - mptcp_event_pm_listener(ssock->sk, MPTCP_EVENT_LISTENER_CREATED); + mptcp_copy_inaddrs(sk, ssk); + mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CREATED); } unlock: -- cgit v1.2.3 From 5426a4ef6455c93e76a553eda0a1811bb0bbd214 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 11 Aug 2023 17:57:21 +0200 Subject: mptcp: avoid additional indirection in mptcp_poll() We are going to remove the first subflow socket soon, so avoid the additional indirection at poll() time. Instead access directly the first subflow sock. No functional changes intended. Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts Signed-off-by: David S. 
Miller --- net/mptcp/protocol.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index d8b75fbc4f24..e89d1bf44f77 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3844,12 +3844,12 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, state = inet_sk_state_load(sk); pr_debug("msk=%p state=%d flags=%lx", msk, state, msk->flags); if (state == TCP_LISTEN) { - struct socket *ssock = READ_ONCE(msk->subflow); + struct sock *ssk = READ_ONCE(msk->first); - if (WARN_ON_ONCE(!ssock || !ssock->sk)) + if (WARN_ON_ONCE(!ssk)) return 0; - return inet_csk_listen_poll(ssock->sk); + return inet_csk_listen_poll(ssk); } shutdown = READ_ONCE(sk->sk_shutdown); -- cgit v1.2.3 From 1f6610b92ac3c4cd4b7904b1d00b776f9bf08ed3 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 11 Aug 2023 17:57:22 +0200 Subject: mptcp: avoid unneeded indirection in mptcp_stream_accept() We are going to remove the first subflow socket soon, so avoid the additional indirection at accept() time. Instead access directly the first subflow sock, and update mptcp_accept() to operate on it. This allows dropping a duplicated check in mptcp_accept(). No functional changes intended. Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 29 ++++++++++------------------- 1 file changed, 10 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index e89d1bf44f77..e5ebd170d316 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3174,25 +3174,17 @@ void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk) WRITE_ONCE(msk->wnd_end, msk->snd_nxt + tcp_sk(ssk)->snd_wnd); } -static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, +static struct sock *mptcp_accept(struct sock *ssk, int flags, int *err, bool kern) { - struct mptcp_sock *msk = mptcp_sk(sk); - struct socket *listener; struct sock *newsk; - listener = READ_ONCE(msk->subflow); - if (WARN_ON_ONCE(!listener)) { - *err = -EINVAL; - return NULL; - } - - pr_debug("msk=%p, listener=%p", msk, mptcp_subflow_ctx(listener->sk)); - newsk = inet_csk_accept(listener->sk, flags, err, kern); + pr_debug("ssk=%p, listener=%p", ssk, mptcp_subflow_ctx(ssk)); + newsk = inet_csk_accept(ssk, flags, err, kern); if (!newsk) return NULL; - pr_debug("msk=%p, subflow is mptcp=%d", msk, sk_is_mptcp(newsk)); + pr_debug("newsk=%p, subflow is mptcp=%d", newsk, sk_is_mptcp(newsk)); if (sk_is_mptcp(newsk)) { struct mptcp_subflow_context *subflow; struct sock *new_mptcp_sock; @@ -3209,9 +3201,9 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, } newsk = new_mptcp_sock; - MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVEACK); + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPCAPABLEPASSIVEACK); } else { - MPTCP_INC_STATS(sock_net(sk), + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK); } @@ -3761,8 +3753,7 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, int flags, bool kern) { struct mptcp_sock *msk = mptcp_sk(sock->sk); - struct socket *ssock; - struct sock *newsk; + struct sock *ssk, *newsk; int err; pr_debug("msk=%p", msk); @@ -3770,11 +3761,11 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, /* Buggy applications can call accept on socket states other then LISTEN * but no need to allocate the first subflow just to error 
out. */ - ssock = READ_ONCE(msk->subflow); - if (!ssock) + ssk = READ_ONCE(msk->first); + if (!ssk) return -EINVAL; - newsk = mptcp_accept(sock->sk, flags, &err, kern); + newsk = mptcp_accept(ssk, flags, &err, kern); if (!newsk) return err; -- cgit v1.2.3 From f0bc514bd5c1396d1f189e4aafd2c24372ce535a Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 11 Aug 2023 17:57:23 +0200 Subject: mptcp: avoid additional indirection in sockopt The mptcp sockopt infrastructure unneedly uses the first subflow socket struct in a few spots. We are going to remove such field soon, so use directly the first subflow sock instead. No functional changes intended. Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts Signed-off-by: David S. Miller --- net/mptcp/sockopt.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index a3f1fe810cc9..6661852f8d97 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -293,6 +293,7 @@ static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname, { struct sock *sk = (struct sock *)msk; struct socket *ssock; + struct sock *ssk; int ret; switch (optname) { @@ -307,16 +308,17 @@ static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname, return PTR_ERR(ssock); } - ret = sock_setsockopt(ssock, SOL_SOCKET, optname, optval, optlen); + ssk = msk->first; + ret = sk_setsockopt(ssk, SOL_SOCKET, optname, optval, optlen); if (ret == 0) { if (optname == SO_REUSEPORT) - sk->sk_reuseport = ssock->sk->sk_reuseport; + sk->sk_reuseport = ssk->sk_reuseport; else if (optname == SO_REUSEADDR) - sk->sk_reuse = ssock->sk->sk_reuse; + sk->sk_reuse = ssk->sk_reuse; else if (optname == SO_BINDTODEVICE) - sk->sk_bound_dev_if = ssock->sk->sk_bound_dev_if; + sk->sk_bound_dev_if = ssk->sk_bound_dev_if; else if (optname == SO_BINDTOIFINDEX) - sk->sk_bound_dev_if = ssock->sk->sk_bound_dev_if; + sk->sk_bound_dev_if = ssk->sk_bound_dev_if; } release_sock(sk); return ret; @@ -391,6 +393,7 @@ static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname, struct sock *sk = (struct sock *)msk; int ret = -EOPNOTSUPP; struct socket *ssock; + struct sock *ssk; switch (optname) { case IPV6_V6ONLY: @@ -403,7 +406,8 @@ static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname, return PTR_ERR(ssock); } - ret = tcp_setsockopt(ssock->sk, SOL_IPV6, optname, optval, optlen); + ssk = msk->first; + ret = tcp_setsockopt(ssk, SOL_IPV6, optname, optval, optlen); if (ret != 0) { release_sock(sk); return ret; @@ -413,13 +417,13 @@ static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname, switch (optname) { case IPV6_V6ONLY: - sk->sk_ipv6only = ssock->sk->sk_ipv6only; + sk->sk_ipv6only = ssk->sk_ipv6only; break; case IPV6_TRANSPARENT: - inet_sk(sk)->transparent = inet_sk(ssock->sk)->transparent; + inet_sk(sk)->transparent = inet_sk(ssk)->transparent; break; case IPV6_FREEBIND: - inet_sk(sk)->freebind = inet_sk(ssock->sk)->freebind; + inet_sk(sk)->freebind = inet_sk(ssk)->freebind; break; } @@ -700,7 +704,7 @@ static int mptcp_setsockopt_sol_ip_set_transparent(struct mptcp_sock *msk, int o return PTR_ERR(ssock); } - issk = inet_sk(ssock->sk); + issk = inet_sk(msk->first); switch (optname) { case IP_FREEBIND: @@ -865,8 +869,8 @@ static int mptcp_getsockopt_first_sf_only(struct mptcp_sock *msk, int level, int { struct sock *sk = (struct sock *)msk; struct socket *ssock; - int ret; struct sock *ssk; + int ret; lock_sock(sk); ssk = 
msk->first; @@ -881,7 +885,7 @@ static int mptcp_getsockopt_first_sf_only(struct mptcp_sock *msk, int level, int goto out; } - ret = tcp_getsockopt(ssock->sk, level, optname, optval, optlen); + ret = tcp_getsockopt(ssk, level, optname, optval, optlen); out: release_sock(sk); -- cgit v1.2.3 From 3aa362494170142406228e45860d6f4e48595d54 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 11 Aug 2023 17:57:24 +0200 Subject: mptcp: avoid ssock usage in mptcp_pm_nl_create_listen_socket() This is one of the few remaining spots actually manipulating the first subflow socket. We can leverage the recently introduced inet helpers to get rid of ssock there. No functional changes intended. Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts Signed-off-by: David S. Miller --- net/mptcp/pm_netlink.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 5692daf57a4d..ae36155ff128 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -1005,8 +1006,8 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk, bool is_ipv6 = sk->sk_family == AF_INET6; int addrlen = sizeof(struct sockaddr_in); struct sockaddr_storage addr; + struct sock *newsk, *ssk; struct socket *ssock; - struct sock *newsk; int backlog = 1024; int err; @@ -1042,18 +1043,23 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk, if (entry->addr.family == AF_INET6) addrlen = sizeof(struct sockaddr_in6); #endif - err = kernel_bind(ssock, (struct sockaddr *)&addr, addrlen); + ssk = mptcp_sk(newsk)->first; + if (ssk->sk_family == AF_INET) + err = inet_bind_sk(ssk, (struct sockaddr *)&addr, addrlen); +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else if (ssk->sk_family == AF_INET6) + err = inet6_bind_sk(ssk, (struct sockaddr *)&addr, addrlen); +#endif if (err) return err; inet_sk_state_store(newsk, TCP_LISTEN); - err = kernel_listen(ssock, backlog); - if (err) - return err; - - mptcp_event_pm_listener(ssock->sk, MPTCP_EVENT_LISTENER_CREATED); - - return 0; + lock_sock(ssk); + err = __inet_listen_sk(ssk, backlog); + if (!err) + mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CREATED); + release_sock(ssk); + return err; } int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc) -- cgit v1.2.3 From 3f326a821b99812edb6d3c24bcb78377cae6e432 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 11 Aug 2023 17:57:25 +0200 Subject: mptcp: change the mpc check helper to return a sk After the previous patch the __mptcp_nmpc_socket helper is used only to ensure that the MPTCP socket is a suitable status - that is, the mptcp capable handshake is not started yet. Change the return value to the relevant subflow sock, to finally remove the last references to first subflow socket in the MPTCP stack. As a bonus, we can get rid of a few local variables in different functions. No functional change intended. Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts Signed-off-by: David S. 
Miller --- net/mptcp/pm_netlink.c | 8 +++----- net/mptcp/protocol.c | 40 +++++++++++++++------------------------- net/mptcp/protocol.h | 2 +- net/mptcp/sockopt.c | 43 +++++++++++++++++++------------------------ 4 files changed, 38 insertions(+), 55 deletions(-) (limited to 'net') diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index ae36155ff128..c75d9d88a053 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -1007,7 +1007,6 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk, int addrlen = sizeof(struct sockaddr_in); struct sockaddr_storage addr; struct sock *newsk, *ssk; - struct socket *ssock; int backlog = 1024; int err; @@ -1033,17 +1032,16 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk, &mptcp_keys[is_ipv6]); lock_sock(newsk); - ssock = __mptcp_nmpc_socket(mptcp_sk(newsk)); + ssk = __mptcp_nmpc_sk(mptcp_sk(newsk)); release_sock(newsk); - if (IS_ERR(ssock)) - return PTR_ERR(ssock); + if (IS_ERR(ssk)) + return PTR_ERR(ssk); mptcp_info2sockaddr(&entry->addr, &addr, entry->addr.family); #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (entry->addr.family == AF_INET6) addrlen = sizeof(struct sockaddr_in6); #endif - ssk = mptcp_sk(newsk)->first; if (ssk->sk_family == AF_INET) err = inet_bind_sk(ssk, (struct sockaddr *)&addr, addrlen); #if IS_ENABLED(CONFIG_MPTCP_IPV6) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index e5ebd170d316..fafa83ee4a72 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -109,7 +109,7 @@ static int __mptcp_socket_create(struct mptcp_sock *msk) /* If the MPC handshake is not started, returns the first subflow, * eventually allocating it. */ -struct socket *__mptcp_nmpc_socket(struct mptcp_sock *msk) +struct sock *__mptcp_nmpc_sk(struct mptcp_sock *msk) { struct sock *sk = (struct sock *)msk; int ret; @@ -117,10 +117,7 @@ struct socket *__mptcp_nmpc_socket(struct mptcp_sock *msk) if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) return ERR_PTR(-EINVAL); - if (!msk->subflow) { - if (msk->first) - return ERR_PTR(-EINVAL); - + if (!msk->first) { ret = __mptcp_socket_create(msk); if (ret) return ERR_PTR(ret); @@ -128,7 +125,7 @@ struct socket *__mptcp_nmpc_socket(struct mptcp_sock *msk) mptcp_sockopt_sync(msk, msk->first); } - return msk->subflow; + return msk->first; } static void mptcp_drop(struct sock *sk, struct sk_buff *skb) @@ -1643,7 +1640,6 @@ static int mptcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, { unsigned int saved_flags = msg->msg_flags; struct mptcp_sock *msk = mptcp_sk(sk); - struct socket *ssock; struct sock *ssk; int ret; @@ -1654,9 +1650,9 @@ static int mptcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, * fastopen attempt, no need to check for additional subflow status. 
*/ if (msg->msg_flags & MSG_FASTOPEN) { - ssock = __mptcp_nmpc_socket(msk); - if (IS_ERR(ssock)) - return PTR_ERR(ssock); + ssk = __mptcp_nmpc_sk(msk); + if (IS_ERR(ssk)) + return PTR_ERR(ssk); } if (!msk->first) return -EINVAL; @@ -3577,16 +3573,14 @@ static int mptcp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); - struct socket *ssock; int err = -EINVAL; struct sock *ssk; - ssock = __mptcp_nmpc_socket(msk); - if (IS_ERR(ssock)) - return PTR_ERR(ssock); + ssk = __mptcp_nmpc_sk(msk); + if (IS_ERR(ssk)) + return PTR_ERR(ssk); inet_sk_state_store(sk, TCP_SYN_SENT); - ssk = msk->first; subflow = mptcp_subflow_ctx(ssk); #ifdef CONFIG_TCP_MD5SIG /* no MPTCP if MD5SIG is enabled on this socket or we may run out of @@ -3682,17 +3676,15 @@ static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct mptcp_sock *msk = mptcp_sk(sock->sk); struct sock *ssk, *sk = sock->sk; - struct socket *ssock; int err = -EINVAL; lock_sock(sk); - ssock = __mptcp_nmpc_socket(msk); - if (IS_ERR(ssock)) { - err = PTR_ERR(ssock); + ssk = __mptcp_nmpc_sk(msk); + if (IS_ERR(ssk)) { + err = PTR_ERR(ssk); goto unlock; } - ssk = msk->first; if (sk->sk_family == AF_INET) err = inet_bind_sk(ssk, uaddr, addr_len); #if IS_ENABLED(CONFIG_MPTCP_IPV6) @@ -3711,7 +3703,6 @@ static int mptcp_listen(struct socket *sock, int backlog) { struct mptcp_sock *msk = mptcp_sk(sock->sk); struct sock *sk = sock->sk; - struct socket *ssock; struct sock *ssk; int err; @@ -3723,13 +3714,12 @@ static int mptcp_listen(struct socket *sock, int backlog) if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM) goto unlock; - ssock = __mptcp_nmpc_socket(msk); - if (IS_ERR(ssock)) { - err = PTR_ERR(ssock); + ssk = __mptcp_nmpc_sk(msk); + if (IS_ERR(ssk)) { + err = PTR_ERR(ssk); goto unlock; } - ssk = msk->first; inet_sk_state_store(sk, TCP_LISTEN); sock_set_flag(sk, SOCK_RCU_FREE); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 79fc5cdb67bc..dccc96dc2d6b 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -640,7 +640,7 @@ void __mptcp_subflow_send_ack(struct sock *ssk); void mptcp_subflow_reset(struct sock *ssk); void mptcp_subflow_queue_clean(struct sock *sk, struct sock *ssk); void mptcp_sock_graft(struct sock *sk, struct socket *parent); -struct socket *__mptcp_nmpc_socket(struct mptcp_sock *msk); +struct sock *__mptcp_nmpc_sk(struct mptcp_sock *msk); bool __mptcp_close(struct sock *sk, long timeout); void mptcp_cancel_work(struct sock *sk); void __mptcp_unaccepted_force_close(struct sock *sk); diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 6661852f8d97..21bc46acbe38 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -292,7 +292,6 @@ static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = (struct sock *)msk; - struct socket *ssock; struct sock *ssk; int ret; @@ -302,13 +301,12 @@ static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname, case SO_BINDTODEVICE: case SO_BINDTOIFINDEX: lock_sock(sk); - ssock = __mptcp_nmpc_socket(msk); - if (IS_ERR(ssock)) { + ssk = __mptcp_nmpc_sk(msk); + if (IS_ERR(ssk)) { release_sock(sk); - return PTR_ERR(ssock); + return PTR_ERR(ssk); } - ssk = msk->first; ret = sk_setsockopt(ssk, SOL_SOCKET, optname, optval, optlen); if (ret == 0) { if (optname == SO_REUSEPORT) @@ -392,7 +390,6 @@ static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int 
optname, { struct sock *sk = (struct sock *)msk; int ret = -EOPNOTSUPP; - struct socket *ssock; struct sock *ssk; switch (optname) { @@ -400,13 +397,12 @@ static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname, case IPV6_TRANSPARENT: case IPV6_FREEBIND: lock_sock(sk); - ssock = __mptcp_nmpc_socket(msk); - if (IS_ERR(ssock)) { + ssk = __mptcp_nmpc_sk(msk); + if (IS_ERR(ssk)) { release_sock(sk); - return PTR_ERR(ssock); + return PTR_ERR(ssk); } - ssk = msk->first; ret = tcp_setsockopt(ssk, SOL_IPV6, optname, optval, optlen); if (ret != 0) { release_sock(sk); @@ -689,7 +685,7 @@ static int mptcp_setsockopt_sol_ip_set_transparent(struct mptcp_sock *msk, int o { struct sock *sk = (struct sock *)msk; struct inet_sock *issk; - struct socket *ssock; + struct sock *ssk; int err; err = ip_setsockopt(sk, SOL_IP, optname, optval, optlen); @@ -698,13 +694,13 @@ static int mptcp_setsockopt_sol_ip_set_transparent(struct mptcp_sock *msk, int o lock_sock(sk); - ssock = __mptcp_nmpc_socket(msk); - if (IS_ERR(ssock)) { + ssk = __mptcp_nmpc_sk(msk); + if (IS_ERR(ssk)) { release_sock(sk); - return PTR_ERR(ssock); + return PTR_ERR(ssk); } - issk = inet_sk(msk->first); + issk = inet_sk(ssk); switch (optname) { case IP_FREEBIND: @@ -767,18 +763,18 @@ static int mptcp_setsockopt_first_sf_only(struct mptcp_sock *msk, int level, int sockptr_t optval, unsigned int optlen) { struct sock *sk = (struct sock *)msk; - struct socket *sock; + struct sock *ssk; int ret; /* Limit to first subflow, before the connection establishment */ lock_sock(sk); - sock = __mptcp_nmpc_socket(msk); - if (IS_ERR(sock)) { - ret = PTR_ERR(sock); + ssk = __mptcp_nmpc_sk(msk); + if (IS_ERR(ssk)) { + ret = PTR_ERR(ssk); goto unlock; } - ret = tcp_setsockopt(sock->sk, level, optname, optval, optlen); + ret = tcp_setsockopt(ssk, level, optname, optval, optlen); unlock: release_sock(sk); @@ -868,7 +864,6 @@ static int mptcp_getsockopt_first_sf_only(struct mptcp_sock *msk, int level, int char __user *optval, int __user *optlen) { struct sock *sk = (struct sock *)msk; - struct socket *ssock; struct sock *ssk; int ret; @@ -879,9 +874,9 @@ static int mptcp_getsockopt_first_sf_only(struct mptcp_sock *msk, int level, int goto out; } - ssock = __mptcp_nmpc_socket(msk); - if (IS_ERR(ssock)) { - ret = PTR_ERR(ssock); + ssk = __mptcp_nmpc_sk(msk); + if (IS_ERR(ssk)) { + ret = PTR_ERR(ssk); goto out; } -- cgit v1.2.3 From 39880bd808ad2ddfb9b7fee129568c3b814f0609 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 11 Aug 2023 17:57:26 +0200 Subject: mptcp: get rid of msk->subflow Such field is now unused just as a flag to control the first subflow deletion at close() time. Introduce a new bit flag for that and finally drop the mentioned field. As an intended side effect, now the first subflow sock is not freed before close() even for passive sockets. The msk has no open/active subflows if the first one is closed and the subflow list is singular, update accordingly the state check in mptcp_stream_accept(). Among other benefits, the subflow removal, reduces the amount of memory used on the client side for each mptcp connection, allows passive sockets to go through successful accept()/disconnect()/connect() and makes return error code consistent for failing both passive and active sockets. Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/290 Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts Signed-off-by: David S. 
Miller --- net/mptcp/protocol.c | 25 ++++++------------------- net/mptcp/protocol.h | 13 ++++++------- 2 files changed, 12 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index fafa83ee4a72..e715771ded7c 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -92,7 +92,6 @@ static int __mptcp_socket_create(struct mptcp_sock *msk) msk->scaling_ratio = tcp_sk(ssock->sk)->scaling_ratio; WRITE_ONCE(msk->first, ssock->sk); - WRITE_ONCE(msk->subflow, ssock); subflow = mptcp_subflow_ctx(ssock->sk); list_add(&subflow->node, &msk->conn_list); sock_hold(ssock->sk); @@ -102,6 +101,7 @@ static int __mptcp_socket_create(struct mptcp_sock *msk) /* This is the first subflow, always with id 0 */ subflow->local_id_valid = 1; mptcp_sock_graft(msk->first, sk->sk_socket); + iput(SOCK_INODE(ssock)); return 0; } @@ -2238,14 +2238,6 @@ static struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk) return min_stale_count > 1 ? backup : NULL; } -static void mptcp_dispose_initial_subflow(struct mptcp_sock *msk) -{ - if (msk->subflow) { - iput(SOCK_INODE(msk->subflow)); - WRITE_ONCE(msk->subflow, NULL); - } -} - bool __mptcp_retransmit_pending_data(struct sock *sk) { struct mptcp_data_frag *cur, *rtx_head; @@ -2324,7 +2316,7 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, goto out_release; } - dispose_it = !msk->subflow || ssk != msk->subflow->sk; + dispose_it = msk->free_first || ssk != msk->first; if (dispose_it) list_del(&subflow->node); @@ -2345,7 +2337,6 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, * disconnect should never fail */ WARN_ON_ONCE(tcp_disconnect(ssk, 0)); - msk->subflow->state = SS_UNCONNECTED; mptcp_subflow_ctx_reset(subflow); release_sock(ssk); @@ -3106,7 +3097,6 @@ struct sock *mptcp_sk_clone_init(const struct sock *sk, msk = mptcp_sk(nsk); msk->local_key = subflow_req->local_key; msk->token = subflow_req->token; - WRITE_ONCE(msk->subflow, NULL); msk->in_accept_queue = 1; WRITE_ONCE(msk->fully_established, false); if (mp_opt->suboptions & OPTION_MPTCP_CSUMREQD) @@ -3240,10 +3230,8 @@ static void mptcp_destroy(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); - /* clears msk->subflow, allowing the following to close - * even the initial subflow - */ - mptcp_dispose_initial_subflow(msk); + /* allow the following to close even the initial subflow */ + msk->free_first = 1; mptcp_destroy_common(msk, 0); sk_sockets_allocated_dec(sk); } @@ -3782,11 +3770,10 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, /* Do late cleanup for the first subflow as necessary. Also * deal with bad peers not doing a complete shutdown. 
*/ - if (msk->first && - unlikely(inet_sk_state_load(msk->first) == TCP_CLOSE)) { + if (unlikely(inet_sk_state_load(msk->first) == TCP_CLOSE)) { __mptcp_close_ssk(newsk, msk->first, mptcp_subflow_ctx(msk->first), 0); - if (unlikely(list_empty(&msk->conn_list))) + if (unlikely(list_is_singular(&msk->conn_list))) inet_sk_state_store(newsk, TCP_CLOSE); } } diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index dccc96dc2d6b..38c7ea013361 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -299,7 +299,8 @@ struct mptcp_sock { cork:1, nodelay:1, fastopening:1, - in_accept_queue:1; + in_accept_queue:1, + free_first:1; struct work_struct work; struct sk_buff *ooo_last_skb; struct rb_root out_of_order_queue; @@ -308,12 +309,10 @@ struct mptcp_sock { struct list_head rtx_queue; struct mptcp_data_frag *first_pending; struct list_head join_list; - struct socket *subflow; /* outgoing connect/listener/!mp_capable - * The mptcp ops can safely dereference, using suitable - * ONCE annotation, the subflow outside the socket - * lock as such sock is freed after close(). - */ - struct sock *first; + struct sock *first; /* The mptcp ops can safely dereference, using suitable + * ONCE annotation, the subflow outside the socket + * lock as such sock is freed after close(). + */ struct mptcp_pm_data pm; struct { u32 space; /* bytes copied in last measurement window */ -- cgit v1.2.3 From e263691773cd67d7c824eeee8b802f50c6e0c118 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 11 Aug 2023 17:57:27 +0200 Subject: mptcp: Remove unnecessary test for __mptcp_init_sock() __mptcp_init_sock() always returns 0 because mptcp_init_sock() used to return the value directly. But after commit 18b683bff89d ("mptcp: queue data for mptcp level retransmission"), __mptcp_init_sock() need not return value anymore. Let's remove the unnecessary test for __mptcp_init_sock() and make it return void. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index e715771ded7c..6ea0a1da8068 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2649,7 +2649,7 @@ unlock: sock_put(sk); } -static int __mptcp_init_sock(struct sock *sk) +static void __mptcp_init_sock(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); @@ -2676,8 +2676,6 @@ static int __mptcp_init_sock(struct sock *sk) /* re-use the csk retrans timer for MPTCP-level retrans */ timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0); timer_setup(&sk->sk_timer, mptcp_timeout_timer, 0); - - return 0; } static void mptcp_ca_reset(struct sock *sk) @@ -2695,11 +2693,8 @@ static void mptcp_ca_reset(struct sock *sk) static int mptcp_init_sock(struct sock *sk) { struct net *net = sock_net(sk); - int ret; - ret = __mptcp_init_sock(sk); - if (ret) - return ret; + __mptcp_init_sock(sk); if (!mptcp_is_enabled(net)) return -ENOPROTOOPT; -- cgit v1.2.3 From 9d802da40b7c820deb9c60fc394457ea565cafc8 Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Fri, 11 Aug 2023 16:12:48 +0200 Subject: net: openvswitch: add last-action drop reason Create a new drop reason subsystem for openvswitch and add the first drop reason to represent last-action drops. Last-action drops happen when a flow has an empty action list or there is no action that consumes the packet (output, userspace, recirc, etc). 
It is the most common way in which OVS drops packets. Implementation-wise, most of these skb-consuming actions already call "consume_skb" internally and return directly from within the do_execute_actions() loop so with minimal changes we can assume that any skb that exits the loop normally is a packet drop. Signed-off-by: Adrian Moreno Signed-off-by: David S. Miller --- include/net/dropreason.h | 6 ++++++ net/openvswitch/actions.c | 12 ++++++++++-- net/openvswitch/datapath.c | 16 ++++++++++++++++ net/openvswitch/drop.h | 31 +++++++++++++++++++++++++++++++ 4 files changed, 63 insertions(+), 2 deletions(-) create mode 100644 net/openvswitch/drop.h (limited to 'net') diff --git a/include/net/dropreason.h b/include/net/dropreason.h index 685fb37df8e8..56cb7be92244 100644 --- a/include/net/dropreason.h +++ b/include/net/dropreason.h @@ -23,6 +23,12 @@ enum skb_drop_reason_subsys { */ SKB_DROP_REASON_SUBSYS_MAC80211_MONITOR, + /** + * @SKB_DROP_REASON_SUBSYS_OPENVSWITCH: openvswitch drop reasons, + * see net/openvswitch/drop.h + */ + SKB_DROP_REASON_SUBSYS_OPENVSWITCH, + /** @SKB_DROP_REASON_SUBSYS_NUM: number of subsystems defined */ SKB_DROP_REASON_SUBSYS_NUM }; diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index cab1e02b63e0..8c8a7a82f76f 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -27,6 +27,7 @@ #include #include "datapath.h" +#include "drop.h" #include "flow.h" #include "conntrack.h" #include "vport.h" @@ -1036,7 +1037,7 @@ static int sample(struct datapath *dp, struct sk_buff *skb, if ((arg->probability != U32_MAX) && (!arg->probability || get_random_u32() > arg->probability)) { if (last) - consume_skb(skb); + ovs_kfree_skb_reason(skb, OVS_DROP_LAST_ACTION); return 0; } @@ -1297,6 +1298,9 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, if (trace_ovs_do_execute_action_enabled()) trace_ovs_do_execute_action(dp, skb, key, a, rem); + /* Actions that rightfully have to consume the skb should do it + * and return directly. 
+ */ switch (nla_type(a)) { case OVS_ACTION_ATTR_OUTPUT: { int port = nla_get_u32(a); @@ -1332,6 +1336,10 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, output_userspace(dp, skb, key, a, attr, len, OVS_CB(skb)->cutlen); OVS_CB(skb)->cutlen = 0; + if (nla_is_last(a, rem)) { + consume_skb(skb); + return 0; + } break; case OVS_ACTION_ATTR_HASH: @@ -1485,7 +1493,7 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, } } - consume_skb(skb); + ovs_kfree_skb_reason(skb, OVS_DROP_LAST_ACTION); return 0; } diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index a6d2a0b1aa21..d33cb739883f 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -41,6 +41,7 @@ #include #include "datapath.h" +#include "drop.h" #include "flow.h" #include "flow_table.h" #include "flow_netlink.h" @@ -2702,6 +2703,17 @@ static struct pernet_operations ovs_net_ops = { .size = sizeof(struct ovs_net), }; +static const char * const ovs_drop_reasons[] = { +#define S(x) (#x), + OVS_DROP_REASONS(S) +#undef S +}; + +static struct drop_reason_list drop_reason_list_ovs = { + .reasons = ovs_drop_reasons, + .n_reasons = ARRAY_SIZE(ovs_drop_reasons), +}; + static int __init dp_init(void) { int err; @@ -2743,6 +2755,9 @@ static int __init dp_init(void) if (err < 0) goto error_unreg_netdev; + drop_reasons_register_subsys(SKB_DROP_REASON_SUBSYS_OPENVSWITCH, + &drop_reason_list_ovs); + return 0; error_unreg_netdev: @@ -2769,6 +2784,7 @@ static void dp_cleanup(void) ovs_netdev_exit(); unregister_netdevice_notifier(&ovs_dp_device_notifier); unregister_pernet_device(&ovs_net_ops); + drop_reasons_unregister_subsys(SKB_DROP_REASON_SUBSYS_OPENVSWITCH); rcu_barrier(); ovs_vport_exit(); ovs_flow_exit(); diff --git a/net/openvswitch/drop.h b/net/openvswitch/drop.h new file mode 100644 index 000000000000..a5b2b901249b --- /dev/null +++ b/net/openvswitch/drop.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * OpenvSwitch drop reason list. + */ + +#ifndef OPENVSWITCH_DROP_H +#define OPENVSWITCH_DROP_H +#include +#include + +#define OVS_DROP_REASONS(R) \ + R(OVS_DROP_LAST_ACTION) \ + /* deliberate comment for trailing \ */ + +enum ovs_drop_reason { + __OVS_DROP_REASON = SKB_DROP_REASON_SUBSYS_OPENVSWITCH << + SKB_DROP_REASON_SUBSYS_SHIFT, +#define ENUM(x) x, + OVS_DROP_REASONS(ENUM) +#undef ENUM + + OVS_DROP_MAX, +}; + +static inline void +ovs_kfree_skb_reason(struct sk_buff *skb, enum ovs_drop_reason reason) +{ + kfree_skb_reason(skb, (u32)reason); +} + +#endif /* OPENVSWITCH_DROP_H */ -- cgit v1.2.3 From ec7bfb5e5a054f1178e8bdbf4f145fdafa5bf804 Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Fri, 11 Aug 2023 16:12:49 +0200 Subject: net: openvswitch: add action error drop reason Add a drop reason for packets that are dropped because an action returns a non-zero error code. Acked-by: Aaron Conole Signed-off-by: Adrian Moreno Signed-off-by: David S. 
Miller --- net/openvswitch/actions.c | 2 +- net/openvswitch/drop.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 8c8a7a82f76f..bb7aa181da30 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -1488,7 +1488,7 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, } if (unlikely(err)) { - kfree_skb(skb); + ovs_kfree_skb_reason(skb, OVS_DROP_ACTION_ERROR); return err; } } diff --git a/net/openvswitch/drop.h b/net/openvswitch/drop.h index a5b2b901249b..b87613ced713 100644 --- a/net/openvswitch/drop.h +++ b/net/openvswitch/drop.h @@ -10,6 +10,7 @@ #define OVS_DROP_REASONS(R) \ R(OVS_DROP_LAST_ACTION) \ + R(OVS_DROP_ACTION_ERROR) \ /* deliberate comment for trailing \ */ enum ovs_drop_reason { -- cgit v1.2.3 From e7bc7db9ba463e763ac6113279cade19da9cb939 Mon Sep 17 00:00:00 2001 From: Eric Garver Date: Fri, 11 Aug 2023 16:12:50 +0200 Subject: net: openvswitch: add explicit drop action From: Eric Garver This adds an explicit drop action. This is used by OVS to drop packets for which it cannot determine what to do. An explicit action in the kernel allows passing the reason _why_ the packet is being dropped or zero to indicate no particular error happened (i.e: OVS intentionally dropped the packet). Since the error codes coming from userspace mean nothing for the kernel, we squash all of them into only two drop reasons: - OVS_DROP_EXPLICIT_WITH_ERROR to indicate a non-zero value was passed - OVS_DROP_EXPLICIT to indicate a zero value was passed (no error) e.g. trace all OVS dropped skbs # perf trace -e skb:kfree_skb --filter="reason >= 0x30000" [..] 106.023 ping/2465 skb:kfree_skb(skbaddr: 0xffffa0e8765f2000, \ location:0xffffffffc0d9b462, protocol: 2048, reason: 196611) reason: 196611 --> 0x30003 (OVS_DROP_EXPLICIT) Also, this patch allows ovs-dpctl.py to add explicit drop actions as: "drop" -> implicit empty-action drop "drop(0)" -> explicit non-error action drop "drop(42)" -> explicit error action drop Signed-off-by: Eric Garver Co-developed-by: Adrian Moreno Signed-off-by: Adrian Moreno Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 2 ++ net/openvswitch/actions.c | 9 +++++++++ net/openvswitch/drop.h | 2 ++ net/openvswitch/flow_netlink.c | 10 +++++++++- .../testing/selftests/net/openvswitch/ovs-dpctl.py | 22 ++++++++++++++++++---- 5 files changed, 40 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index e94870e77ee9..efc82c318fa2 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -965,6 +965,7 @@ struct check_pkt_len_arg { * start of the packet or at the start of the l3 header depending on the value * of l3 tunnel flag in the tun_flags field of OVS_ACTION_ATTR_ADD_MPLS * argument. + * @OVS_ACTION_ATTR_DROP: Explicit drop action. * * Only a single header can be set with a single %OVS_ACTION_ATTR_SET. Not all * fields within a header are modifiable, e.g. the IPv4 protocol and fragment @@ -1002,6 +1003,7 @@ enum ovs_action_attr { OVS_ACTION_ATTR_CHECK_PKT_LEN, /* Nested OVS_CHECK_PKT_LEN_ATTR_*. */ OVS_ACTION_ATTR_ADD_MPLS, /* struct ovs_action_add_mpls. */ OVS_ACTION_ATTR_DEC_TTL, /* Nested OVS_DEC_TTL_ATTR_*. */ + OVS_ACTION_ATTR_DROP, /* u32 error code. */ __OVS_ACTION_ATTR_MAX, /* Nothing past this will be accepted * from userspace. 
*/ diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index bb7aa181da30..e188f32ea96f 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -1485,6 +1485,15 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, return dec_ttl_exception_handler(dp, skb, key, a); break; + + case OVS_ACTION_ATTR_DROP: { + enum ovs_drop_reason reason = nla_get_u32(a) + ? OVS_DROP_EXPLICIT_WITH_ERROR + : OVS_DROP_EXPLICIT; + + ovs_kfree_skb_reason(skb, reason); + return 0; + } } if (unlikely(err)) { diff --git a/net/openvswitch/drop.h b/net/openvswitch/drop.h index b87613ced713..22e66d28c0ca 100644 --- a/net/openvswitch/drop.h +++ b/net/openvswitch/drop.h @@ -11,6 +11,8 @@ #define OVS_DROP_REASONS(R) \ R(OVS_DROP_LAST_ACTION) \ R(OVS_DROP_ACTION_ERROR) \ + R(OVS_DROP_EXPLICIT) \ + R(OVS_DROP_EXPLICIT_WITH_ERROR) \ /* deliberate comment for trailing \ */ enum ovs_drop_reason { diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 41116361433d..88965e2068ac 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -38,6 +38,7 @@ #include #include +#include "drop.h" #include "flow_netlink.h" struct ovs_len_tbl { @@ -61,6 +62,7 @@ static bool actions_may_change_flow(const struct nlattr *actions) case OVS_ACTION_ATTR_RECIRC: case OVS_ACTION_ATTR_TRUNC: case OVS_ACTION_ATTR_USERSPACE: + case OVS_ACTION_ATTR_DROP: break; case OVS_ACTION_ATTR_CT: @@ -2394,7 +2396,7 @@ static void ovs_nla_free_nested_actions(const struct nlattr *actions, int len) /* Whenever new actions are added, the need to update this * function should be considered. */ - BUILD_BUG_ON(OVS_ACTION_ATTR_MAX != 23); + BUILD_BUG_ON(OVS_ACTION_ATTR_MAX != 24); if (!actions) return; @@ -3182,6 +3184,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, [OVS_ACTION_ATTR_CHECK_PKT_LEN] = (u32)-1, [OVS_ACTION_ATTR_ADD_MPLS] = sizeof(struct ovs_action_add_mpls), [OVS_ACTION_ATTR_DEC_TTL] = (u32)-1, + [OVS_ACTION_ATTR_DROP] = sizeof(u32), }; const struct ovs_action_push_vlan *vlan; int type = nla_type(a); @@ -3453,6 +3456,11 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, skip_copy = true; break; + case OVS_ACTION_ATTR_DROP: + if (!nla_is_last(a, rem)) + return -EINVAL; + break; + default: OVS_NLERR(log, "Unknown Action type %d", type); return -EINVAL; diff --git a/tools/testing/selftests/net/openvswitch/ovs-dpctl.py b/tools/testing/selftests/net/openvswitch/ovs-dpctl.py index fbdac15e3134..912dc8c49085 100644 --- a/tools/testing/selftests/net/openvswitch/ovs-dpctl.py +++ b/tools/testing/selftests/net/openvswitch/ovs-dpctl.py @@ -301,6 +301,7 @@ class ovsactions(nla): ("OVS_ACTION_ATTR_CHECK_PKT_LEN", "none"), ("OVS_ACTION_ATTR_ADD_MPLS", "none"), ("OVS_ACTION_ATTR_DEC_TTL", "none"), + ("OVS_ACTION_ATTR_DROP", "uint32"), ) class ctact(nla): @@ -447,6 +448,8 @@ class ovsactions(nla): print_str += "recirc(0x%x)" % int(self.get_attr(field[0])) elif field[0] == "OVS_ACTION_ATTR_TRUNC": print_str += "trunc(%d)" % int(self.get_attr(field[0])) + elif field[0] == "OVS_ACTION_ATTR_DROP": + print_str += "drop(%d)" % int(self.get_attr(field[0])) elif field[1] == "flag": if field[0] == "OVS_ACTION_ATTR_CT_CLEAR": print_str += "ct_clear" @@ -468,10 +471,21 @@ class ovsactions(nla): while len(actstr) != 0: parsed = False if actstr.startswith("drop"): - # for now, drops have no explicit action, so we - # don't need to set any attributes. 
The final - # act of the processing chain will just drop the packet - return + # If no reason is provided, the implicit drop is used (i.e no + # action). If some reason is given, an explicit action is used. + actstr, reason = parse_extract_field( + actstr, + "drop(", + "([0-9]+)", + lambda x: int(x, 0), + False, + None, + ) + if reason is not None: + self["attrs"].append(["OVS_ACTION_ATTR_DROP", reason]) + parsed = True + else: + return elif parse_starts_block(actstr, "^(\d+)", False, True): actstr, output = parse_extract_field( -- cgit v1.2.3 From f329d1bc1a4580e0f8a402b14a6fd024ec8e5c7b Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Fri, 11 Aug 2023 16:12:51 +0200 Subject: net: openvswitch: add meter drop reason By using an independent drop reason it makes it easy to distinguish between QoS-triggered or flow-triggered drop. Acked-by: Aaron Conole Signed-off-by: Adrian Moreno Signed-off-by: David S. Miller --- net/openvswitch/actions.c | 2 +- net/openvswitch/drop.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index e188f32ea96f..122211478606 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -1454,7 +1454,7 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, case OVS_ACTION_ATTR_METER: if (ovs_meter_execute(dp, skb, key, nla_get_u32(a))) { - consume_skb(skb); + ovs_kfree_skb_reason(skb, OVS_DROP_METER); return 0; } break; diff --git a/net/openvswitch/drop.h b/net/openvswitch/drop.h index 22e66d28c0ca..a35e63d71094 100644 --- a/net/openvswitch/drop.h +++ b/net/openvswitch/drop.h @@ -13,6 +13,7 @@ R(OVS_DROP_ACTION_ERROR) \ R(OVS_DROP_EXPLICIT) \ R(OVS_DROP_EXPLICIT_WITH_ERROR) \ + R(OVS_DROP_METER) \ /* deliberate comment for trailing \ */ enum ovs_drop_reason { -- cgit v1.2.3 From 43d95b30cf5793cdd3c7b1c1cd5fead9b469bd60 Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Fri, 11 Aug 2023 16:12:52 +0200 Subject: net: openvswitch: add misc error drop reasons Use drop reasons from include/net/dropreason-core.h when a reasonable candidate exists. Acked-by: Aaron Conole Signed-off-by: Adrian Moreno Signed-off-by: David S. 
Miller --- net/openvswitch/actions.c | 17 ++++++++++------- net/openvswitch/conntrack.c | 3 ++- net/openvswitch/drop.h | 6 ++++++ 3 files changed, 18 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 122211478606..fd66014d8a76 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -782,7 +782,7 @@ static int ovs_vport_output(struct net *net, struct sock *sk, struct vport *vport = data->vport; if (skb_cow_head(skb, data->l2_len) < 0) { - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_NOMEM); return -ENOMEM; } @@ -853,6 +853,7 @@ static void ovs_fragment(struct net *net, struct vport *vport, struct sk_buff *skb, u16 mru, struct sw_flow_key *key) { + enum ovs_drop_reason reason; u16 orig_network_offset = 0; if (eth_p_mpls(skb->protocol)) { @@ -862,6 +863,7 @@ static void ovs_fragment(struct net *net, struct vport *vport, if (skb_network_offset(skb) > MAX_L2_LEN) { OVS_NLERR(1, "L2 header too long to fragment"); + reason = OVS_DROP_FRAG_L2_TOO_LONG; goto err; } @@ -902,12 +904,13 @@ static void ovs_fragment(struct net *net, struct vport *vport, WARN_ONCE(1, "Failed fragment ->%s: eth=%04x, MRU=%d, MTU=%d.", ovs_vport_name(vport), ntohs(key->eth.type), mru, vport->dev->mtu); + reason = OVS_DROP_FRAG_INVALID_PROTO; goto err; } return; err: - kfree_skb(skb); + ovs_kfree_skb_reason(skb, reason); } static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port, @@ -934,10 +937,10 @@ static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port, ovs_fragment(net, vport, skb, mru, key); } else { - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG); } } else { - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_DEV_READY); } } @@ -1011,7 +1014,7 @@ static int dec_ttl_exception_handler(struct datapath *dp, struct sk_buff *skb, return clone_execute(dp, skb, key, 0, nla_data(actions), nla_len(actions), true, false); - consume_skb(skb); + ovs_kfree_skb_reason(skb, OVS_DROP_IP_TTL); return 0; } @@ -1564,7 +1567,7 @@ static int clone_execute(struct datapath *dp, struct sk_buff *skb, /* Out of per CPU action FIFO space. Drop the 'skb' and * log an error. 
*/ - kfree_skb(skb); + ovs_kfree_skb_reason(skb, OVS_DROP_DEFERRED_LIMIT); if (net_ratelimit()) { if (actions) { /* Sample action */ @@ -1616,7 +1619,7 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, if (unlikely(level > OVS_RECURSION_LIMIT)) { net_crit_ratelimited("ovs: recursion limit reached on datapath %s, probable configuration error\n", ovs_dp_name(dp)); - kfree_skb(skb); + ovs_kfree_skb_reason(skb, OVS_DROP_RECURSION_LIMIT); err = -ENETDOWN; goto out; } diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index fa955e892210..0cfa1e9482e6 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -29,6 +29,7 @@ #include #include "datapath.h" +#include "drop.h" #include "conntrack.h" #include "flow.h" #include "flow_netlink.h" @@ -1035,7 +1036,7 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb, skb_push_rcsum(skb, nh_ofs); if (err) - kfree_skb(skb); + ovs_kfree_skb_reason(skb, OVS_DROP_CONNTRACK); return err; } diff --git a/net/openvswitch/drop.h b/net/openvswitch/drop.h index a35e63d71094..cedf9b7b5796 100644 --- a/net/openvswitch/drop.h +++ b/net/openvswitch/drop.h @@ -14,6 +14,12 @@ R(OVS_DROP_EXPLICIT) \ R(OVS_DROP_EXPLICIT_WITH_ERROR) \ R(OVS_DROP_METER) \ + R(OVS_DROP_RECURSION_LIMIT) \ + R(OVS_DROP_DEFERRED_LIMIT) \ + R(OVS_DROP_FRAG_L2_TOO_LONG) \ + R(OVS_DROP_FRAG_INVALID_PROTO) \ + R(OVS_DROP_CONNTRACK) \ + R(OVS_DROP_IP_TTL) \ /* deliberate comment for trailing \ */ enum ovs_drop_reason { -- cgit v1.2.3 From 950c92bbaa8f16fc71485f4d564d429a9e4cc9fd Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Wed, 2 Aug 2023 10:16:49 +0200 Subject: batman-adv: Drop unused function batadv_gw_bandwidth_set This function is no longer used since the sysfs support was removed from batman-adv. Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/gateway_common.c | 88 ----------------------------------------- net/batman-adv/gateway_common.h | 2 - 2 files changed, 90 deletions(-) (limited to 'net') diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c index 6a964a773f57..d9632607f92b 100644 --- a/net/batman-adv/gateway_common.c +++ b/net/batman-adv/gateway_common.c @@ -9,7 +9,6 @@ #include #include -#include #include #include #include @@ -90,42 +89,6 @@ bool batadv_parse_throughput(struct net_device *net_dev, char *buff, return true; } -/** - * batadv_parse_gw_bandwidth() - parse supplied string buffer to extract - * download and upload bandwidth information - * @net_dev: the soft interface net device - * @buff: string buffer to parse - * @down: pointer holding the returned download bandwidth information - * @up: pointer holding the returned upload bandwidth information - * - * Return: false on parse error and true otherwise. 
- */ -static bool batadv_parse_gw_bandwidth(struct net_device *net_dev, char *buff, - u32 *down, u32 *up) -{ - char *slash_ptr; - bool ret; - - slash_ptr = strchr(buff, '/'); - if (slash_ptr) - *slash_ptr = 0; - - ret = batadv_parse_throughput(net_dev, buff, "download gateway speed", - down); - if (!ret) - return false; - - /* we also got some upload info */ - if (slash_ptr) { - ret = batadv_parse_throughput(net_dev, slash_ptr + 1, - "upload gateway speed", up); - if (!ret) - return false; - } - - return true; -} - /** * batadv_gw_tvlv_container_update() - update the gw tvlv container after * gateway setting change @@ -155,57 +118,6 @@ void batadv_gw_tvlv_container_update(struct batadv_priv *bat_priv) } } -/** - * batadv_gw_bandwidth_set() - Parse and set download/upload gateway bandwidth - * from supplied string buffer - * @net_dev: netdev struct of the soft interface - * @buff: the buffer containing the user data - * @count: number of bytes in the buffer - * - * Return: 'count' on success or a negative error code in case of failure - */ -ssize_t batadv_gw_bandwidth_set(struct net_device *net_dev, char *buff, - size_t count) -{ - struct batadv_priv *bat_priv = netdev_priv(net_dev); - u32 down_curr; - u32 up_curr; - u32 down_new = 0; - u32 up_new = 0; - bool ret; - - down_curr = (unsigned int)atomic_read(&bat_priv->gw.bandwidth_down); - up_curr = (unsigned int)atomic_read(&bat_priv->gw.bandwidth_up); - - ret = batadv_parse_gw_bandwidth(net_dev, buff, &down_new, &up_new); - if (!ret) - return -EINVAL; - - if (!down_new) - down_new = 1; - - if (!up_new) - up_new = down_new / 5; - - if (!up_new) - up_new = 1; - - if (down_curr == down_new && up_curr == up_new) - return count; - - batadv_gw_reselect(bat_priv); - batadv_info(net_dev, - "Changing gateway bandwidth from: '%u.%u/%u.%u MBit' to: '%u.%u/%u.%u MBit'\n", - down_curr / 10, down_curr % 10, up_curr / 10, up_curr % 10, - down_new / 10, down_new % 10, up_new / 10, up_new % 10); - - atomic_set(&bat_priv->gw.bandwidth_down, down_new); - atomic_set(&bat_priv->gw.bandwidth_up, up_new); - batadv_gw_tvlv_container_update(bat_priv); - - return count; -} - /** * batadv_gw_tvlv_ogm_handler_v1() - process incoming gateway tvlv container * @bat_priv: the bat priv with all the soft interface information diff --git a/net/batman-adv/gateway_common.h b/net/batman-adv/gateway_common.h index 87c37f907261..cb2e72d7ab14 100644 --- a/net/batman-adv/gateway_common.h +++ b/net/batman-adv/gateway_common.h @@ -27,8 +27,6 @@ enum batadv_bandwidth_units { #define BATADV_GW_MODE_CLIENT_NAME "client" #define BATADV_GW_MODE_SERVER_NAME "server" -ssize_t batadv_gw_bandwidth_set(struct net_device *net_dev, char *buff, - size_t count); void batadv_gw_tvlv_container_update(struct batadv_priv *bat_priv); void batadv_gw_init(struct batadv_priv *bat_priv); void batadv_gw_free(struct batadv_priv *bat_priv); -- cgit v1.2.3 From 02e61f06a97e7ba73fa44046d6da097e5c2aacc8 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Wed, 2 Aug 2023 10:24:20 +0200 Subject: batman-adv: Keep batadv_netlink_notify_* static The batadv_netlink_notify_*() functions are not used by any other source file. Just keep them local to netlink.c to get informed by the compiler when they are not used anymore. 
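Illustration (a stand-alone sketch with hypothetical names, not batman-adv code): the practical effect of keeping such helpers static is that an ordinary build with -Wall (which enables -Wunused-function) reports them as soon as the last caller disappears, feedback an exported function never produces.

	/* compile with: gcc -Wall -c sketch.c
	 * notify_vlan() has no caller in this file, so the compiler warns about
	 * it being defined but not used; notify_mesh() is called and stays silent.
	 */
	#include <stdio.h>

	static void notify_vlan(int vid)
	{
		printf("vlan %d changed\n", vid);
	}

	static void notify_mesh(void)
	{
		printf("mesh changed\n");
	}

	int main(void)
	{
		notify_mesh();
		return 0;
	}
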
Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/netlink.c | 10 +++++----- net/batman-adv/netlink.h | 6 ------ 2 files changed, 5 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index ad5714f737be..b6c512ce6704 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -377,7 +377,7 @@ nla_put_failure: * * Return: 0 on success, < 0 on error */ -int batadv_netlink_notify_mesh(struct batadv_priv *bat_priv) +static int batadv_netlink_notify_mesh(struct batadv_priv *bat_priv) { struct sk_buff *msg; int ret; @@ -858,8 +858,8 @@ nla_put_failure: * * Return: 0 on success, < 0 on error */ -int batadv_netlink_notify_hardif(struct batadv_priv *bat_priv, - struct batadv_hard_iface *hard_iface) +static int batadv_netlink_notify_hardif(struct batadv_priv *bat_priv, + struct batadv_hard_iface *hard_iface) { struct sk_buff *msg; int ret; @@ -1073,8 +1073,8 @@ nla_put_failure: * * Return: 0 on success, < 0 on error */ -int batadv_netlink_notify_vlan(struct batadv_priv *bat_priv, - struct batadv_softif_vlan *vlan) +static int batadv_netlink_notify_vlan(struct batadv_priv *bat_priv, + struct batadv_softif_vlan *vlan) { struct sk_buff *msg; int ret; diff --git a/net/batman-adv/netlink.h b/net/batman-adv/netlink.h index 48102cc7490c..876d2806a67d 100644 --- a/net/batman-adv/netlink.h +++ b/net/batman-adv/netlink.h @@ -21,12 +21,6 @@ int batadv_netlink_tpmeter_notify(struct batadv_priv *bat_priv, const u8 *dst, u8 result, u32 test_time, u64 total_bytes, u32 cookie); -int batadv_netlink_notify_mesh(struct batadv_priv *bat_priv); -int batadv_netlink_notify_hardif(struct batadv_priv *bat_priv, - struct batadv_hard_iface *hard_iface); -int batadv_netlink_notify_vlan(struct batadv_priv *bat_priv, - struct batadv_softif_vlan *vlan); - extern struct genl_family batadv_netlink_family; #endif /* _NET_BATMAN_ADV_NETLINK_H_ */ -- cgit v1.2.3 From 6f96d46f9a1ad1c02589b598c56ea881094129d8 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Wed, 2 Aug 2023 10:39:28 +0200 Subject: batman-adv: Drop per algo GW section class code This code was only used in the past for the sysfs interface. But since this was replace with netlink, it was never executed. The function pointer was only checked to figure out whether the limit 255 (B.A.T.M.A.N. IV) or 2**32-1 (B.A.T.M.A.N. V) should be used as limit. So instead of keeping the function pointer, just store the limits directly in struct batadv_algo_gw_ops. 
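A minimal stand-alone sketch of the pattern applied here (hypothetical types and names, not the actual batman-adv code): the per-algorithm limit becomes plain data in the ops structure, and the selection-class check compares against that value instead of testing whether an optional callback is set.

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	struct algo_gw_ops {
		uint32_t sel_class_max;	/* limit stored as data, no callback needed */
	};

	static const struct algo_gw_ops algo_iv = { .sel_class_max = 255 };
	static const struct algo_gw_ops algo_v  = { .sel_class_max = UINT32_MAX };

	static bool sel_class_valid(const struct algo_gw_ops *ops, uint32_t sel_class)
	{
		/* same shape as the range check performed at the netlink level */
		return sel_class >= 1 && sel_class <= ops->sel_class_max;
	}

	int main(void)
	{
		/* prints "0 1": 300 is out of range for the 255 limit only */
		printf("%d %d\n", sel_class_valid(&algo_iv, 300),
		       sel_class_valid(&algo_v, 300));
		return 0;
	}
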
Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/bat_iv_ogm.c | 1 + net/batman-adv/bat_v.c | 23 ++----------- net/batman-adv/gateway_common.c | 74 +---------------------------------------- net/batman-adv/gateway_common.h | 5 --- net/batman-adv/netlink.c | 5 +-- net/batman-adv/types.h | 7 ++-- 6 files changed, 8 insertions(+), 107 deletions(-) (limited to 'net') diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 828fb393ee94..74b49c35ddc1 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -2516,6 +2516,7 @@ static struct batadv_algo_ops batadv_batman_iv __read_mostly = { }, .gw = { .init_sel_class = batadv_iv_init_sel_class, + .sel_class_max = BATADV_TQ_MAX_VALUE, .get_best_gw_node = batadv_iv_gw_get_best_gw_node, .is_eligible = batadv_iv_gw_is_eligible, .dump = batadv_iv_gw_dump, diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index 54e41fc709c3..ac11f1f08db0 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -34,7 +35,6 @@ #include "bat_v_elp.h" #include "bat_v_ogm.h" #include "gateway_client.h" -#include "gateway_common.h" #include "hard-interface.h" #include "hash.h" #include "log.h" @@ -512,25 +512,6 @@ static void batadv_v_init_sel_class(struct batadv_priv *bat_priv) atomic_set(&bat_priv->gw.sel_class, 50); } -static ssize_t batadv_v_store_sel_class(struct batadv_priv *bat_priv, - char *buff, size_t count) -{ - u32 old_class, class; - - if (!batadv_parse_throughput(bat_priv->soft_iface, buff, - "B.A.T.M.A.N. V GW selection class", - &class)) - return -EINVAL; - - old_class = atomic_read(&bat_priv->gw.sel_class); - atomic_set(&bat_priv->gw.sel_class, class); - - if (old_class != class) - batadv_gw_reselect(bat_priv); - - return count; -} - /** * batadv_v_gw_throughput_get() - retrieve the GW-bandwidth for a given GW * @gw_node: the GW to retrieve the metric for @@ -818,7 +799,7 @@ static struct batadv_algo_ops batadv_batman_v __read_mostly = { }, .gw = { .init_sel_class = batadv_v_init_sel_class, - .store_sel_class = batadv_v_store_sel_class, + .sel_class_max = U32_MAX, .get_best_gw_node = batadv_v_gw_get_best_gw_node, .is_eligible = batadv_v_gw_is_eligible, .dump = batadv_v_gw_dump, diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c index d9632607f92b..2dd36ef03c84 100644 --- a/net/batman-adv/gateway_common.c +++ b/net/batman-adv/gateway_common.c @@ -9,86 +9,14 @@ #include #include -#include -#include -#include -#include #include -#include +#include #include #include #include "gateway_client.h" -#include "log.h" #include "tvlv.h" -/** - * batadv_parse_throughput() - parse supplied string buffer to extract - * throughput information - * @net_dev: the soft interface net device - * @buff: string buffer to parse - * @description: text shown when throughput string cannot be parsed - * @throughput: pointer holding the returned throughput information - * - * Return: false on parse error and true otherwise. 
- */ -bool batadv_parse_throughput(struct net_device *net_dev, char *buff, - const char *description, u32 *throughput) -{ - enum batadv_bandwidth_units bw_unit_type = BATADV_BW_UNIT_KBIT; - u64 lthroughput; - char *tmp_ptr; - int ret; - - if (strlen(buff) > 4) { - tmp_ptr = buff + strlen(buff) - 4; - - if (strncasecmp(tmp_ptr, "mbit", 4) == 0) - bw_unit_type = BATADV_BW_UNIT_MBIT; - - if (strncasecmp(tmp_ptr, "kbit", 4) == 0 || - bw_unit_type == BATADV_BW_UNIT_MBIT) - *tmp_ptr = '\0'; - } - - ret = kstrtou64(buff, 10, <hroughput); - if (ret) { - batadv_err(net_dev, - "Invalid throughput speed for %s: %s\n", - description, buff); - return false; - } - - switch (bw_unit_type) { - case BATADV_BW_UNIT_MBIT: - /* prevent overflow */ - if (U64_MAX / 10 < lthroughput) { - batadv_err(net_dev, - "Throughput speed for %s too large: %s\n", - description, buff); - return false; - } - - lthroughput *= 10; - break; - case BATADV_BW_UNIT_KBIT: - default: - lthroughput = div_u64(lthroughput, 100); - break; - } - - if (lthroughput > U32_MAX) { - batadv_err(net_dev, - "Throughput speed for %s too large: %s\n", - description, buff); - return false; - } - - *throughput = lthroughput; - - return true; -} - /** * batadv_gw_tvlv_container_update() - update the gw tvlv container after * gateway setting change diff --git a/net/batman-adv/gateway_common.h b/net/batman-adv/gateway_common.h index cb2e72d7ab14..5d097d6a1dd9 100644 --- a/net/batman-adv/gateway_common.h +++ b/net/batman-adv/gateway_common.h @@ -9,9 +9,6 @@ #include "main.h" -#include -#include - /** * enum batadv_bandwidth_units - bandwidth unit types */ @@ -30,7 +27,5 @@ enum batadv_bandwidth_units { void batadv_gw_tvlv_container_update(struct batadv_priv *bat_priv); void batadv_gw_init(struct batadv_priv *bat_priv); void batadv_gw_free(struct batadv_priv *bat_priv); -bool batadv_parse_throughput(struct net_device *net_dev, char *buff, - const char *description, u32 *throughput); #endif /* _NET_BATMAN_ADV_GATEWAY_COMMON_H_ */ diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index b6c512ce6704..d37872b34281 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -548,15 +548,12 @@ static int batadv_netlink_set_mesh(struct sk_buff *skb, struct genl_info *info) * algorithm in use implements the GW API */ - u32 sel_class_max = 0xffffffffu; + u32 sel_class_max = bat_priv->algo_ops->gw.sel_class_max; u32 sel_class; attr = info->attrs[BATADV_ATTR_GW_SEL_CLASS]; sel_class = nla_get_u32(attr); - if (!bat_priv->algo_ops->gw.store_sel_class) - sel_class_max = BATADV_TQ_MAX_VALUE; - if (sel_class >= 1 && sel_class <= sel_class_max) { atomic_set(&bat_priv->gw.sel_class, sel_class); batadv_gw_reselect(bat_priv); diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index ca9449ec9836..54c2b8fa48cc 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -2191,11 +2191,10 @@ struct batadv_algo_gw_ops { void (*init_sel_class)(struct batadv_priv *bat_priv); /** - * @store_sel_class: parse and stores a new GW selection class - * (optional) + * @sel_class_max: maximum allowed GW selection class */ - ssize_t (*store_sel_class)(struct batadv_priv *bat_priv, char *buff, - size_t count); + u32 sel_class_max; + /** * @get_best_gw_node: select the best GW from the list of available * nodes (optional) -- cgit v1.2.3 From 63618463cb94bda3ee68dc2ada998c0ef3c00e67 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 11 Aug 2023 17:57:02 +0200 Subject: devlink: parse linecard attr in doit() callbacks No need to give the linecards 
any special treatment in netlink attribute parsing, as unlike for ports, there is only a couple of commands benefiting from that. Remove DEVLINK_NL_FLAG_NEED_LINECARD, make pre_doit() callback simpler by moving the linecard attribute parsing to linecard_[gs]et_doit() ops. Signed-off-by: Jiri Pirko Acked-by: Jakub Kicinski Link: https://lore.kernel.org/r/20230811155714.1736405-2-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- net/devlink/devl_internal.h | 7 ------- net/devlink/leftover.c | 19 +++++++++++++------ net/devlink/netlink.c | 8 -------- 3 files changed, 13 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/devlink/devl_internal.h b/net/devlink/devl_internal.h index 7fdd956ff992..3bbecebf192d 100644 --- a/net/devlink/devl_internal.h +++ b/net/devlink/devl_internal.h @@ -94,7 +94,6 @@ static inline bool devl_is_registered(struct devlink *devlink) #define DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT BIT(1) #define DEVLINK_NL_FLAG_NEED_RATE BIT(2) #define DEVLINK_NL_FLAG_NEED_RATE_NODE BIT(3) -#define DEVLINK_NL_FLAG_NEED_LINECARD BIT(4) enum devlink_multicast_groups { DEVLINK_MCGRP_CONFIG, @@ -203,12 +202,6 @@ int devlink_resources_validate(struct devlink *devlink, struct devlink_resource *resource, struct genl_info *info); -/* Line cards */ -struct devlink_linecard; - -struct devlink_linecard * -devlink_linecard_get_from_info(struct devlink *devlink, struct genl_info *info); - /* Rates */ int devlink_rate_nodes_check(struct devlink *devlink, u16 mode, struct netlink_ext_ack *extack); diff --git a/net/devlink/leftover.c b/net/devlink/leftover.c index e7900d9fa205..46cdd5d88583 100644 --- a/net/devlink/leftover.c +++ b/net/devlink/leftover.c @@ -285,7 +285,7 @@ devlink_linecard_get_from_attrs(struct devlink *devlink, struct nlattr **attrs) return ERR_PTR(-EINVAL); } -struct devlink_linecard * +static struct devlink_linecard * devlink_linecard_get_from_info(struct devlink *devlink, struct genl_info *info) { return devlink_linecard_get_from_attrs(devlink, info->attrs); @@ -1814,11 +1814,15 @@ static void devlink_linecard_notify(struct devlink_linecard *linecard, static int devlink_nl_cmd_linecard_get_doit(struct sk_buff *skb, struct genl_info *info) { - struct devlink_linecard *linecard = info->user_ptr[1]; - struct devlink *devlink = linecard->devlink; + struct devlink *devlink = info->user_ptr[0]; + struct devlink_linecard *linecard; struct sk_buff *msg; int err; + linecard = devlink_linecard_get_from_info(devlink, info); + if (IS_ERR(linecard)) + return PTR_ERR(linecard); + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; @@ -2008,10 +2012,15 @@ out: static int devlink_nl_cmd_linecard_set_doit(struct sk_buff *skb, struct genl_info *info) { - struct devlink_linecard *linecard = info->user_ptr[1]; struct netlink_ext_ack *extack = info->extack; + struct devlink *devlink = info->user_ptr[0]; + struct devlink_linecard *linecard; int err; + linecard = devlink_linecard_get_from_info(devlink, info); + if (IS_ERR(linecard)) + return PTR_ERR(linecard); + if (info->attrs[DEVLINK_ATTR_LINECARD_TYPE]) { const char *type; @@ -6347,14 +6356,12 @@ const struct genl_small_ops devlink_nl_small_ops[54] = { .cmd = DEVLINK_CMD_LINECARD_GET, .doit = devlink_nl_cmd_linecard_get_doit, .dumpit = devlink_nl_instance_iter_dumpit, - .internal_flags = DEVLINK_NL_FLAG_NEED_LINECARD, /* can be retrieved by unprivileged users */ }, { .cmd = DEVLINK_CMD_LINECARD_SET, .doit = devlink_nl_cmd_linecard_set_doit, .flags = GENL_ADMIN_PERM, - .internal_flags = 
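A reduced stand-alone sketch of the arrangement described above (hypothetical names and types, not the real devlink code): one core helper receives the flags explicitly, and the per-case entry points are one-line wrappers passing fixed flag values.

	#include <stdio.h>

	#define FLAG_NEED_PORT		(1u << 0)
	#define FLAG_NEED_PORT_OPTIONAL	(1u << 1)

	struct request {
		int port_given;
	};

	/* core helper: behaviour is driven entirely by the flags argument */
	static int pre_doit(const struct request *req, unsigned int flags)
	{
		if ((flags & FLAG_NEED_PORT) && !req->port_given)
			return -1;	/* mandatory port attribute missing */
		return 0;
	}

	/* thin wrappers with fixed flags, usable as per-command callbacks */
	static int pre_doit_plain(const struct request *req)
	{
		return pre_doit(req, 0);
	}

	static int pre_doit_port(const struct request *req)
	{
		return pre_doit(req, FLAG_NEED_PORT);
	}

	static int pre_doit_port_optional(const struct request *req)
	{
		return pre_doit(req, FLAG_NEED_PORT_OPTIONAL);
	}

	int main(void)
	{
		struct request req = { .port_given = 0 };

		/* prints "plain=0 port=-1 optional=0" */
		printf("plain=%d port=%d optional=%d\n",
		       pre_doit_plain(&req), pre_doit_port(&req),
		       pre_doit_port_optional(&req));
		return 0;
	}
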
DEVLINK_NL_FLAG_NEED_LINECARD, }, { .cmd = DEVLINK_CMD_SB_GET, diff --git a/net/devlink/netlink.c b/net/devlink/netlink.c index bada2819827b..9fd683f38a53 100644 --- a/net/devlink/netlink.c +++ b/net/devlink/netlink.c @@ -112,7 +112,6 @@ devlink_get_from_attrs_lock(struct net *net, struct nlattr **attrs) int devlink_nl_pre_doit(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info) { - struct devlink_linecard *linecard; struct devlink_port *devlink_port; struct devlink *devlink; int err; @@ -151,13 +150,6 @@ int devlink_nl_pre_doit(const struct genl_split_ops *ops, goto unlock; } info->user_ptr[1] = rate_node; - } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_LINECARD) { - linecard = devlink_linecard_get_from_info(devlink, info); - if (IS_ERR(linecard)) { - err = PTR_ERR(linecard); - goto unlock; - } - info->user_ptr[1] = linecard; } return 0; -- cgit v1.2.3 From 41a1d4d1399af0f4ba9754086b23f7b9576eb25f Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 11 Aug 2023 17:57:03 +0200 Subject: devlink: parse rate attrs in doit() callbacks No need to give the rate any special treatment in netlink attributes parsing, as unlike for ports, there is only a couple of commands benefiting from that. Remove DEVLINK_NL_FLAG_NEED_RATE*, make pre_doit() callback simpler by moving the rate attributes parsing to rate_*_doit() ops. Signed-off-by: Jiri Pirko Acked-by: Jakub Kicinski Link: https://lore.kernel.org/r/20230811155714.1736405-3-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- net/devlink/devl_internal.h | 8 +------- net/devlink/leftover.c | 37 ++++++++++++++++++++++++------------- net/devlink/netlink.c | 18 ------------------ 3 files changed, 25 insertions(+), 38 deletions(-) (limited to 'net') diff --git a/net/devlink/devl_internal.h b/net/devlink/devl_internal.h index 3bbecebf192d..ca04e4ee9427 100644 --- a/net/devlink/devl_internal.h +++ b/net/devlink/devl_internal.h @@ -92,8 +92,6 @@ static inline bool devl_is_registered(struct devlink *devlink) /* Netlink */ #define DEVLINK_NL_FLAG_NEED_PORT BIT(0) #define DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT BIT(1) -#define DEVLINK_NL_FLAG_NEED_RATE BIT(2) -#define DEVLINK_NL_FLAG_NEED_RATE_NODE BIT(3) enum devlink_multicast_groups { DEVLINK_MCGRP_CONFIG, @@ -205,11 +203,7 @@ int devlink_resources_validate(struct devlink *devlink, /* Rates */ int devlink_rate_nodes_check(struct devlink *devlink, u16 mode, struct netlink_ext_ack *extack); -struct devlink_rate * -devlink_rate_get_from_info(struct devlink *devlink, struct genl_info *info); -struct devlink_rate * -devlink_rate_node_get_from_info(struct devlink *devlink, - struct genl_info *info); + /* Devlink nl cmds */ int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_eswitch_get_doit(struct sk_buff *skb, struct genl_info *info); diff --git a/net/devlink/leftover.c b/net/devlink/leftover.c index 46cdd5d88583..b7b6850e26d8 100644 --- a/net/devlink/leftover.c +++ b/net/devlink/leftover.c @@ -232,13 +232,13 @@ devlink_rate_node_get_from_attrs(struct devlink *devlink, struct nlattr **attrs) return devlink_rate_node_get_by_name(devlink, rate_node_name); } -struct devlink_rate * +static struct devlink_rate * devlink_rate_node_get_from_info(struct devlink *devlink, struct genl_info *info) { return devlink_rate_node_get_from_attrs(devlink, info->attrs); } -struct devlink_rate * +static struct devlink_rate * devlink_rate_get_from_info(struct devlink *devlink, struct genl_info *info) { struct nlattr **attrs = info->attrs; @@ -1041,10 +1041,15 @@ 
const struct devlink_cmd devl_cmd_rate_get = { static int devlink_nl_cmd_rate_get_doit(struct sk_buff *skb, struct genl_info *info) { - struct devlink_rate *devlink_rate = info->user_ptr[1]; + struct devlink *devlink = info->user_ptr[0]; + struct devlink_rate *devlink_rate; struct sk_buff *msg; int err; + devlink_rate = devlink_rate_get_from_info(devlink, info); + if (IS_ERR(devlink_rate)) + return PTR_ERR(devlink_rate); + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; @@ -1629,11 +1634,16 @@ static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops, static int devlink_nl_cmd_rate_set_doit(struct sk_buff *skb, struct genl_info *info) { - struct devlink_rate *devlink_rate = info->user_ptr[1]; - struct devlink *devlink = devlink_rate->devlink; - const struct devlink_ops *ops = devlink->ops; + struct devlink *devlink = info->user_ptr[0]; + struct devlink_rate *devlink_rate; + const struct devlink_ops *ops; int err; + devlink_rate = devlink_rate_get_from_info(devlink, info); + if (IS_ERR(devlink_rate)) + return PTR_ERR(devlink_rate); + + ops = devlink->ops; if (!ops || !devlink_rate_set_ops_supported(ops, info, devlink_rate->type)) return -EOPNOTSUPP; @@ -1704,18 +1714,22 @@ err_strdup: static int devlink_nl_cmd_rate_del_doit(struct sk_buff *skb, struct genl_info *info) { - struct devlink_rate *rate_node = info->user_ptr[1]; - struct devlink *devlink = rate_node->devlink; - const struct devlink_ops *ops = devlink->ops; + struct devlink *devlink = info->user_ptr[0]; + struct devlink_rate *rate_node; int err; + rate_node = devlink_rate_node_get_from_info(devlink, info); + if (IS_ERR(rate_node)) + return PTR_ERR(rate_node); + if (refcount_read(&rate_node->refcnt) > 1) { NL_SET_ERR_MSG(info->extack, "Node has children. 
Cannot delete node."); return -EBUSY; } devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_DEL); - err = ops->rate_node_del(rate_node, rate_node->priv, info->extack); + err = devlink->ops->rate_node_del(rate_node, rate_node->priv, + info->extack); if (rate_node->parent) refcount_dec(&rate_node->parent->refcnt); list_del(&rate_node->list); @@ -6307,14 +6321,12 @@ const struct genl_small_ops devlink_nl_small_ops[54] = { .cmd = DEVLINK_CMD_RATE_GET, .doit = devlink_nl_cmd_rate_get_doit, .dumpit = devlink_nl_instance_iter_dumpit, - .internal_flags = DEVLINK_NL_FLAG_NEED_RATE, /* can be retrieved by unprivileged users */ }, { .cmd = DEVLINK_CMD_RATE_SET, .doit = devlink_nl_cmd_rate_set_doit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_RATE, }, { .cmd = DEVLINK_CMD_RATE_NEW, @@ -6325,7 +6337,6 @@ const struct genl_small_ops devlink_nl_small_ops[54] = { .cmd = DEVLINK_CMD_RATE_DEL, .doit = devlink_nl_cmd_rate_del_doit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_RATE_NODE, }, { .cmd = DEVLINK_CMD_PORT_SPLIT, diff --git a/net/devlink/netlink.c b/net/devlink/netlink.c index 9fd683f38a53..96cf8a1b8bce 100644 --- a/net/devlink/netlink.c +++ b/net/devlink/netlink.c @@ -132,24 +132,6 @@ int devlink_nl_pre_doit(const struct genl_split_ops *ops, devlink_port = devlink_port_get_from_info(devlink, info); if (!IS_ERR(devlink_port)) info->user_ptr[1] = devlink_port; - } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_RATE) { - struct devlink_rate *devlink_rate; - - devlink_rate = devlink_rate_get_from_info(devlink, info); - if (IS_ERR(devlink_rate)) { - err = PTR_ERR(devlink_rate); - goto unlock; - } - info->user_ptr[1] = devlink_rate; - } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_RATE_NODE) { - struct devlink_rate *rate_node; - - rate_node = devlink_rate_node_get_from_info(devlink, info); - if (IS_ERR(rate_node)) { - err = PTR_ERR(rate_node); - goto unlock; - } - info->user_ptr[1] = rate_node; } return 0; -- cgit v1.2.3 From ee6d78ac28c708c62fd83595e9366b0eb0deb014 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 11 Aug 2023 17:57:04 +0200 Subject: devlink: introduce devlink_nl_pre_doit_port*() helper functions Define port handling helpers what don't rely on internal_flags. Have __devlink_nl_pre_doit() to accept the flags as a function arg and make devlink_nl_pre_doit() a wrapper helper function calling it. Introduce new helpers devlink_nl_pre_doit_port() and devlink_nl_pre_doit_port_optional() to be used by split ops in follow-up patch. Note that the function prototypes are temporary until the generated ones will replace them in a follow-up patch. 
Signed-off-by: Jiri Pirko Acked-by: Jakub Kicinski Link: https://lore.kernel.org/r/20230811155714.1736405-4-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- net/devlink/devl_internal.h | 5 +++++ net/devlink/netlink.c | 27 +++++++++++++++++++++++---- 2 files changed, 28 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/devlink/devl_internal.h b/net/devlink/devl_internal.h index ca04e4ee9427..9851fd48ab59 100644 --- a/net/devlink/devl_internal.h +++ b/net/devlink/devl_internal.h @@ -205,6 +205,11 @@ int devlink_rate_nodes_check(struct devlink *devlink, u16 mode, struct netlink_ext_ack *extack); /* Devlink nl cmds */ +int devlink_nl_pre_doit_port(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info); +int devlink_nl_pre_doit_port_optional(const struct genl_split_ops *ops, + struct sk_buff *skb, + struct genl_info *info); int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_eswitch_get_doit(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb, struct genl_info *info); diff --git a/net/devlink/netlink.c b/net/devlink/netlink.c index 96cf8a1b8bce..3c59b9c49150 100644 --- a/net/devlink/netlink.c +++ b/net/devlink/netlink.c @@ -109,8 +109,8 @@ devlink_get_from_attrs_lock(struct net *net, struct nlattr **attrs) return ERR_PTR(-ENODEV); } -int devlink_nl_pre_doit(const struct genl_split_ops *ops, - struct sk_buff *skb, struct genl_info *info) +static int __devlink_nl_pre_doit(struct sk_buff *skb, struct genl_info *info, + u8 flags) { struct devlink_port *devlink_port; struct devlink *devlink; @@ -121,14 +121,14 @@ int devlink_nl_pre_doit(const struct genl_split_ops *ops, return PTR_ERR(devlink); info->user_ptr[0] = devlink; - if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT) { + if (flags & DEVLINK_NL_FLAG_NEED_PORT) { devlink_port = devlink_port_get_from_info(devlink, info); if (IS_ERR(devlink_port)) { err = PTR_ERR(devlink_port); goto unlock; } info->user_ptr[1] = devlink_port; - } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT) { + } else if (flags & DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT) { devlink_port = devlink_port_get_from_info(devlink, info); if (!IS_ERR(devlink_port)) info->user_ptr[1] = devlink_port; @@ -141,6 +141,25 @@ unlock: return err; } +int devlink_nl_pre_doit(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ + return __devlink_nl_pre_doit(skb, info, ops->internal_flags); +} + +int devlink_nl_pre_doit_port(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ + return __devlink_nl_pre_doit(skb, info, DEVLINK_NL_FLAG_NEED_PORT); +} + +int devlink_nl_pre_doit_port_optional(const struct genl_split_ops *ops, + struct sk_buff *skb, + struct genl_info *info) +{ + return __devlink_nl_pre_doit(skb, info, DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT); +} + void devlink_nl_post_doit(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info) { -- cgit v1.2.3 From 8fa995ad1f7f8f615b479edcf540cc12fe7b87ba Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 11 Aug 2023 17:57:05 +0200 Subject: devlink: rename doit callbacks for per-instance dump commands Rename netlink doit callback functions for the commands that do implement per-instance dump to match the generated names that are going to be introduce in the follow-up patch. Note that the function prototypes are temporary until the generated ones will replace them in a follow-up patch. 
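In short, the rename drops the "cmd_" infix so the hand-written doit callbacks already carry the generated-style names, e.g. (taken from the diff that follows) devlink_nl_cmd_selftests_get_doit() becomes devlink_nl_selftests_get_doit() and devlink_nl_cmd_port_get_doit() becomes devlink_nl_port_get_doit(). A stand-alone sketch of why the naming matters (hypothetical types and names, not generated devlink code): an ops table emitted by a generator references the callbacks strictly by its own naming scheme, so the existing functions must use those names.

	/* sketch: a generator-style ops table wired to callbacks that follow
	 * the <family>_nl_<cmd>_doit naming convention.
	 */
	struct nl_op {
		const char *name;
		int (*doit)(void);
	};

	static int devlink_nl_port_get_doit(void) { return 0; }
	static int devlink_nl_rate_get_doit(void) { return 0; }

	static const struct nl_op generated_ops[] = {
		{ "port_get", devlink_nl_port_get_doit },
		{ "rate_get", devlink_nl_rate_get_doit },
	};

	int main(void)
	{
		int i, err = 0;

		for (i = 0; i < 2; i++)
			err |= generated_ops[i].doit();
		return err;
	}
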
Signed-off-by: Jiri Pirko Acked-by: Jakub Kicinski Link: https://lore.kernel.org/r/20230811155714.1736405-5-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- net/devlink/dev.c | 3 +- net/devlink/devl_internal.h | 22 +++++++++++++-- net/devlink/health.c | 4 +-- net/devlink/leftover.c | 68 ++++++++++++++++++++------------------------- 4 files changed, 52 insertions(+), 45 deletions(-) (limited to 'net') diff --git a/net/devlink/dev.c b/net/devlink/dev.c index 5dfba2248b90..167fe6188d21 100644 --- a/net/devlink/dev.c +++ b/net/devlink/dev.c @@ -1206,8 +1206,7 @@ err_cancel_msg: return err; } -int devlink_nl_cmd_selftests_get_doit(struct sk_buff *skb, - struct genl_info *info) +int devlink_nl_selftests_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct sk_buff *msg; diff --git a/net/devlink/devl_internal.h b/net/devlink/devl_internal.h index 9851fd48ab59..f29ec0bfc559 100644 --- a/net/devlink/devl_internal.h +++ b/net/devlink/devl_internal.h @@ -214,10 +214,22 @@ int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_eswitch_get_doit(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_flash_update(struct sk_buff *skb, struct genl_info *info); -int devlink_nl_cmd_selftests_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_selftests_get_doit(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_selftests_run(struct sk_buff *skb, struct genl_info *info); -int devlink_nl_cmd_health_reporter_get_doit(struct sk_buff *skb, - struct genl_info *info); +int devlink_nl_port_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_rate_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_linecard_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_sb_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_sb_pool_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_sb_port_pool_get_doit(struct sk_buff *skb, + struct genl_info *info); +int devlink_nl_sb_tc_pool_bind_get_doit(struct sk_buff *skb, + struct genl_info *info); +int devlink_nl_param_get_doit(struct sk_buff *skb, + struct genl_info *info); +int devlink_nl_region_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_health_reporter_get_doit(struct sk_buff *skb, + struct genl_info *info); int devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_health_reporter_recover_doit(struct sk_buff *skb, @@ -230,3 +242,7 @@ int devlink_nl_cmd_health_reporter_dump_clear_doit(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_health_reporter_test_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_trap_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_trap_group_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_trap_policer_get_doit(struct sk_buff *skb, + struct genl_info *info); diff --git a/net/devlink/health.c b/net/devlink/health.c index 194340a8bb86..6df4e343d8c2 100644 --- a/net/devlink/health.c +++ b/net/devlink/health.c @@ -356,8 +356,8 @@ devlink_health_reporter_get_from_info(struct devlink *devlink, return devlink_health_reporter_get_from_attrs(devlink, info->attrs); } -int devlink_nl_cmd_health_reporter_get_doit(struct sk_buff *skb, - struct genl_info *info) +int devlink_nl_health_reporter_get_doit(struct sk_buff 
*skb, + struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_health_reporter *reporter; diff --git a/net/devlink/leftover.c b/net/devlink/leftover.c index b7b6850e26d8..bd8fa2f9a05b 100644 --- a/net/devlink/leftover.c +++ b/net/devlink/leftover.c @@ -1038,8 +1038,7 @@ const struct devlink_cmd devl_cmd_rate_get = { .dump_one = devlink_nl_cmd_rate_get_dump_one, }; -static int devlink_nl_cmd_rate_get_doit(struct sk_buff *skb, - struct genl_info *info) +int devlink_nl_rate_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_rate *devlink_rate; @@ -1077,8 +1076,7 @@ devlink_rate_is_parent_node(struct devlink_rate *devlink_rate, return false; } -static int devlink_nl_cmd_port_get_doit(struct sk_buff *skb, - struct genl_info *info) +int devlink_nl_port_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink_port *devlink_port = info->user_ptr[1]; struct sk_buff *msg; @@ -1825,8 +1823,7 @@ static void devlink_linecard_notify(struct devlink_linecard *linecard, msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); } -static int devlink_nl_cmd_linecard_get_doit(struct sk_buff *skb, - struct genl_info *info) +int devlink_nl_linecard_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_linecard *linecard; @@ -2091,8 +2088,7 @@ nla_put_failure: return -EMSGSIZE; } -static int devlink_nl_cmd_sb_get_doit(struct sk_buff *skb, - struct genl_info *info) +int devlink_nl_sb_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_sb *devlink_sb; @@ -2194,8 +2190,7 @@ nla_put_failure: return -EMSGSIZE; } -static int devlink_nl_cmd_sb_pool_get_doit(struct sk_buff *skb, - struct genl_info *info) +int devlink_nl_sb_pool_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_sb *devlink_sb; @@ -2394,8 +2389,8 @@ sb_occ_get_failure: return err; } -static int devlink_nl_cmd_sb_port_pool_get_doit(struct sk_buff *skb, - struct genl_info *info) +int devlink_nl_sb_port_pool_get_doit(struct sk_buff *skb, + struct genl_info *info) { struct devlink_port *devlink_port = info->user_ptr[1]; struct devlink *devlink = devlink_port->devlink; @@ -2603,8 +2598,8 @@ nla_put_failure: return -EMSGSIZE; } -static int devlink_nl_cmd_sb_tc_pool_bind_get_doit(struct sk_buff *skb, - struct genl_info *info) +int devlink_nl_sb_tc_pool_bind_get_doit(struct sk_buff *skb, + struct genl_info *info) { struct devlink_port *devlink_port = info->user_ptr[1]; struct devlink *devlink = devlink_port->devlink; @@ -4295,8 +4290,8 @@ devlink_param_get_from_info(struct xarray *params, struct genl_info *info) return devlink_param_find_by_name(params, param_name); } -static int devlink_nl_cmd_param_get_doit(struct sk_buff *skb, - struct genl_info *info) +int devlink_nl_param_get_doit(struct sk_buff *skb, + struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_param_item *param_item; @@ -4793,8 +4788,7 @@ static void devlink_region_snapshot_del(struct devlink_region *region, kfree(snapshot); } -static int devlink_nl_cmd_region_get_doit(struct sk_buff *skb, - struct genl_info *info) +int devlink_nl_region_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_port *port = NULL; @@ -5655,8 +5649,7 @@ nla_put_failure: return -EMSGSIZE; } -static int devlink_nl_cmd_trap_get_doit(struct sk_buff 
*skb, - struct genl_info *info) +int devlink_nl_trap_get_doit(struct sk_buff *skb, struct genl_info *info) { struct netlink_ext_ack *extack = info->extack; struct devlink *devlink = info->user_ptr[0]; @@ -5866,8 +5859,7 @@ nla_put_failure: return -EMSGSIZE; } -static int devlink_nl_cmd_trap_group_get_doit(struct sk_buff *skb, - struct genl_info *info) +int devlink_nl_trap_group_get_doit(struct sk_buff *skb, struct genl_info *info) { struct netlink_ext_ack *extack = info->extack; struct devlink *devlink = info->user_ptr[0]; @@ -6160,8 +6152,8 @@ nla_put_failure: return -EMSGSIZE; } -static int devlink_nl_cmd_trap_policer_get_doit(struct sk_buff *skb, - struct genl_info *info) +int devlink_nl_trap_policer_get_doit(struct sk_buff *skb, + struct genl_info *info) { struct devlink_trap_policer_item *policer_item; struct netlink_ext_ack *extack = info->extack; @@ -6305,7 +6297,7 @@ const struct genl_small_ops devlink_nl_small_ops[54] = { { .cmd = DEVLINK_CMD_PORT_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_port_get_doit, + .doit = devlink_nl_port_get_doit, .dumpit = devlink_nl_instance_iter_dumpit, .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, /* can be retrieved by unprivileged users */ @@ -6319,7 +6311,7 @@ const struct genl_small_ops devlink_nl_small_ops[54] = { }, { .cmd = DEVLINK_CMD_RATE_GET, - .doit = devlink_nl_cmd_rate_get_doit, + .doit = devlink_nl_rate_get_doit, .dumpit = devlink_nl_instance_iter_dumpit, /* can be retrieved by unprivileged users */ }, @@ -6365,7 +6357,7 @@ const struct genl_small_ops devlink_nl_small_ops[54] = { }, { .cmd = DEVLINK_CMD_LINECARD_GET, - .doit = devlink_nl_cmd_linecard_get_doit, + .doit = devlink_nl_linecard_get_doit, .dumpit = devlink_nl_instance_iter_dumpit, /* can be retrieved by unprivileged users */ }, @@ -6377,14 +6369,14 @@ const struct genl_small_ops devlink_nl_small_ops[54] = { { .cmd = DEVLINK_CMD_SB_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_sb_get_doit, + .doit = devlink_nl_sb_get_doit, .dumpit = devlink_nl_instance_iter_dumpit, /* can be retrieved by unprivileged users */ }, { .cmd = DEVLINK_CMD_SB_POOL_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_sb_pool_get_doit, + .doit = devlink_nl_sb_pool_get_doit, .dumpit = devlink_nl_instance_iter_dumpit, /* can be retrieved by unprivileged users */ }, @@ -6397,7 +6389,7 @@ const struct genl_small_ops devlink_nl_small_ops[54] = { { .cmd = DEVLINK_CMD_SB_PORT_POOL_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_sb_port_pool_get_doit, + .doit = devlink_nl_sb_port_pool_get_doit, .dumpit = devlink_nl_instance_iter_dumpit, .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, /* can be retrieved by unprivileged users */ @@ -6412,7 +6404,7 @@ const struct genl_small_ops devlink_nl_small_ops[54] = { { .cmd = DEVLINK_CMD_SB_TC_POOL_BIND_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_sb_tc_pool_bind_get_doit, + .doit = devlink_nl_sb_tc_pool_bind_get_doit, .dumpit = devlink_nl_instance_iter_dumpit, .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, /* can be retrieved by unprivileged users */ @@ -6493,7 +6485,7 @@ const struct genl_small_ops devlink_nl_small_ops[54] = { { .cmd = DEVLINK_CMD_PARAM_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_param_get_doit, + .doit = devlink_nl_param_get_doit, .dumpit = devlink_nl_instance_iter_dumpit, 
/* can be retrieved by unprivileged users */ }, @@ -6521,7 +6513,7 @@ const struct genl_small_ops devlink_nl_small_ops[54] = { { .cmd = DEVLINK_CMD_REGION_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_region_get_doit, + .doit = devlink_nl_region_get_doit, .dumpit = devlink_nl_instance_iter_dumpit, .flags = GENL_ADMIN_PERM, }, @@ -6547,7 +6539,7 @@ const struct genl_small_ops devlink_nl_small_ops[54] = { { .cmd = DEVLINK_CMD_HEALTH_REPORTER_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_health_reporter_get_doit, + .doit = devlink_nl_health_reporter_get_doit, .dumpit = devlink_nl_instance_iter_dumpit, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT, /* can be retrieved by unprivileged users */ @@ -6602,7 +6594,7 @@ const struct genl_small_ops devlink_nl_small_ops[54] = { }, { .cmd = DEVLINK_CMD_TRAP_GET, - .doit = devlink_nl_cmd_trap_get_doit, + .doit = devlink_nl_trap_get_doit, .dumpit = devlink_nl_instance_iter_dumpit, /* can be retrieved by unprivileged users */ }, @@ -6613,7 +6605,7 @@ const struct genl_small_ops devlink_nl_small_ops[54] = { }, { .cmd = DEVLINK_CMD_TRAP_GROUP_GET, - .doit = devlink_nl_cmd_trap_group_get_doit, + .doit = devlink_nl_trap_group_get_doit, .dumpit = devlink_nl_instance_iter_dumpit, /* can be retrieved by unprivileged users */ }, @@ -6624,7 +6616,7 @@ const struct genl_small_ops devlink_nl_small_ops[54] = { }, { .cmd = DEVLINK_CMD_TRAP_POLICER_GET, - .doit = devlink_nl_cmd_trap_policer_get_doit, + .doit = devlink_nl_trap_policer_get_doit, .dumpit = devlink_nl_instance_iter_dumpit, /* can be retrieved by unprivileged users */ }, @@ -6635,7 +6627,7 @@ const struct genl_small_ops devlink_nl_small_ops[54] = { }, { .cmd = DEVLINK_CMD_SELFTESTS_GET, - .doit = devlink_nl_cmd_selftests_get_doit, + .doit = devlink_nl_selftests_get_doit, .dumpit = devlink_nl_instance_iter_dumpit, /* can be retrieved by unprivileged users */ }, -- cgit v1.2.3 From 24c8e56d4f983527cc5e1f6d771fef8ec7ce352e Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 11 Aug 2023 17:57:06 +0200 Subject: devlink: introduce dumpit callbacks for split ops Introduce dumpit callbacks for generated split ops. Have them as a thin wrapper around iteration function and allow to pass dump_one() function pointer directly without need to store in devlink_cmd structs. Note that the function prototypes are temporary until the generated ones will replace them in a follow-up patch. 
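For illustration, the wrapper pattern this patch applies to every per-instance GET command can be condensed as follows. This is only a sketch; the "foo" names are placeholders and not part of the patch. Each command keeps its dump_one() helper and gains a one-line dumpit wrapper that passes that helper straight to the shared iterator, so the devlink_cmd lookup table is no longer needed:

	/* shared pieces (see devl_internal.h / netlink.c in the diff below) */
	typedef int devlink_nl_dump_one_func_t(struct sk_buff *msg,
					       struct devlink *devlink,
					       struct netlink_callback *cb);
	int devlink_nl_dumpit(struct sk_buff *msg, struct netlink_callback *cb,
			      devlink_nl_dump_one_func_t *dump_one);

	/* hypothetical command following the pattern introduced here */
	static int devlink_nl_foo_get_dump_one(struct sk_buff *msg,
					       struct devlink *devlink,
					       struct netlink_callback *cb)
	{
		/* fill one reply message for this devlink instance */
		return 0;
	}

	int devlink_nl_foo_get_dumpit(struct sk_buff *skb,
				      struct netlink_callback *cb)
	{
		/* thin wrapper: the dump_one() pointer is handed over directly,
		 * no devlink_cmd struct is needed to look it up
		 */
		return devlink_nl_dumpit(skb, cb, devlink_nl_foo_get_dump_one);
	}

The generic netlink op entry then points at the wrapper (.dumpit = devlink_nl_foo_get_dumpit) instead of the generic devlink_nl_instance_iter_dumpit(), which the final hunk of this patch removes together with the devl_cmds[] table.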
Signed-off-by: Jiri Pirko Acked-by: Jakub Kicinski Link: https://lore.kernel.org/r/20230811155714.1736405-6-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- net/devlink/dev.c | 15 ++-- net/devlink/devl_internal.h | 45 ++++++----- net/devlink/health.c | 16 ++-- net/devlink/leftover.c | 189 ++++++++++++++++++++++++-------------------- net/devlink/netlink.c | 26 ------ 5 files changed, 144 insertions(+), 147 deletions(-) (limited to 'net') diff --git a/net/devlink/dev.c b/net/devlink/dev.c index 167fe6188d21..22e8ab3eaaa2 100644 --- a/net/devlink/dev.c +++ b/net/devlink/dev.c @@ -1229,10 +1229,9 @@ int devlink_nl_selftests_get_doit(struct sk_buff *skb, struct genl_info *info) return genlmsg_reply(msg, info); } -static int -devlink_nl_cmd_selftests_get_dump_one(struct sk_buff *msg, - struct devlink *devlink, - struct netlink_callback *cb) +static int devlink_nl_selftests_get_dump_one(struct sk_buff *msg, + struct devlink *devlink, + struct netlink_callback *cb) { if (!devlink->ops->selftest_check) return 0; @@ -1243,9 +1242,11 @@ devlink_nl_cmd_selftests_get_dump_one(struct sk_buff *msg, cb->extack); } -const struct devlink_cmd devl_cmd_selftests_get = { - .dump_one = devlink_nl_cmd_selftests_get_dump_one, -}; +int devlink_nl_selftests_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return devlink_nl_dumpit(skb, cb, devlink_nl_selftests_get_dump_one); +} static int devlink_selftest_result_put(struct sk_buff *skb, unsigned int id, enum devlink_selftest_status test_status) diff --git a/net/devlink/devl_internal.h b/net/devlink/devl_internal.h index f29ec0bfc559..500c91c61b2d 100644 --- a/net/devlink/devl_internal.h +++ b/net/devlink/devl_internal.h @@ -117,10 +117,6 @@ typedef int devlink_nl_dump_one_func_t(struct sk_buff *msg, struct devlink *devlink, struct netlink_callback *cb); -struct devlink_cmd { - devlink_nl_dump_one_func_t *dump_one; -}; - extern const struct genl_small_ops devlink_nl_small_ops[54]; struct devlink * @@ -131,7 +127,6 @@ void devlink_notify_register(struct devlink *devlink); int devlink_nl_dumpit(struct sk_buff *msg, struct netlink_callback *cb, devlink_nl_dump_one_func_t *dump_one); -int devlink_nl_instance_iter_dumpit(struct sk_buff *msg, struct netlink_callback *cb); static inline struct devlink_nl_dump_state * devlink_dump_state(struct netlink_callback *cb) @@ -151,22 +146,6 @@ devlink_nl_put_handle(struct sk_buff *msg, struct devlink *devlink) return 0; } -/* Commands */ -extern const struct devlink_cmd devl_cmd_port_get; -extern const struct devlink_cmd devl_cmd_sb_get; -extern const struct devlink_cmd devl_cmd_sb_pool_get; -extern const struct devlink_cmd devl_cmd_sb_port_pool_get; -extern const struct devlink_cmd devl_cmd_sb_tc_pool_bind_get; -extern const struct devlink_cmd devl_cmd_param_get; -extern const struct devlink_cmd devl_cmd_region_get; -extern const struct devlink_cmd devl_cmd_health_reporter_get; -extern const struct devlink_cmd devl_cmd_trap_get; -extern const struct devlink_cmd devl_cmd_trap_group_get; -extern const struct devlink_cmd devl_cmd_trap_policer_get; -extern const struct devlink_cmd devl_cmd_rate_get; -extern const struct devlink_cmd devl_cmd_linecard_get; -extern const struct devlink_cmd devl_cmd_selftests_get; - /* Notify */ void devlink_notify(struct devlink *devlink, enum devlink_command cmd); @@ -215,21 +194,40 @@ int devlink_nl_cmd_eswitch_get_doit(struct sk_buff *skb, struct genl_info *info) int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_flash_update(struct 
sk_buff *skb, struct genl_info *info); int devlink_nl_selftests_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_selftests_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); int devlink_nl_cmd_selftests_run(struct sk_buff *skb, struct genl_info *info); int devlink_nl_port_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_port_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int devlink_nl_rate_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_rate_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int devlink_nl_linecard_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_linecard_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); int devlink_nl_sb_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_sb_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int devlink_nl_sb_pool_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_sb_pool_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); int devlink_nl_sb_port_pool_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_sb_port_pool_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); int devlink_nl_sb_tc_pool_bind_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_sb_tc_pool_bind_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); int devlink_nl_param_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_param_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); int devlink_nl_region_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_region_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); int devlink_nl_health_reporter_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_health_reporter_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); int devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_health_reporter_recover_doit(struct sk_buff *skb, @@ -243,6 +241,11 @@ int devlink_nl_cmd_health_reporter_dump_clear_doit(struct sk_buff *skb, int devlink_nl_cmd_health_reporter_test_doit(struct sk_buff *skb, struct genl_info *info); int devlink_nl_trap_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_trap_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); +int devlink_nl_trap_group_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); int devlink_nl_trap_group_get_doit(struct sk_buff *skb, struct genl_info *info); int devlink_nl_trap_policer_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_trap_policer_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); diff --git a/net/devlink/health.c b/net/devlink/health.c index 6df4e343d8c2..dbe2d6a1df3b 100644 --- a/net/devlink/health.c +++ b/net/devlink/health.c @@ -384,10 +384,9 @@ int devlink_nl_health_reporter_get_doit(struct sk_buff *skb, return genlmsg_reply(msg, info); } -static int -devlink_nl_cmd_health_reporter_get_dump_one(struct sk_buff *msg, - struct devlink *devlink, - struct netlink_callback *cb) +static int devlink_nl_health_reporter_get_dump_one(struct sk_buff *msg, + struct devlink *devlink, + struct netlink_callback *cb) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_health_reporter *reporter; @@ -434,9 +433,12 @@ devlink_nl_cmd_health_reporter_get_dump_one(struct sk_buff *msg, return 0; } -const 
struct devlink_cmd devl_cmd_health_reporter_get = { - .dump_one = devlink_nl_cmd_health_reporter_get_dump_one, -}; +int devlink_nl_health_reporter_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return devlink_nl_dumpit(skb, cb, + devlink_nl_health_reporter_get_dump_one); +} int devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb, struct genl_info *info) diff --git a/net/devlink/leftover.c b/net/devlink/leftover.c index bd8fa2f9a05b..21f1058ef14d 100644 --- a/net/devlink/leftover.c +++ b/net/devlink/leftover.c @@ -1005,8 +1005,8 @@ static void devlink_rate_notify(struct devlink_rate *devlink_rate, } static int -devlink_nl_cmd_rate_get_dump_one(struct sk_buff *msg, struct devlink *devlink, - struct netlink_callback *cb) +devlink_nl_rate_get_dump_one(struct sk_buff *msg, struct devlink *devlink, + struct netlink_callback *cb) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_rate *devlink_rate; @@ -1034,9 +1034,10 @@ devlink_nl_cmd_rate_get_dump_one(struct sk_buff *msg, struct devlink *devlink, return err; } -const struct devlink_cmd devl_cmd_rate_get = { - .dump_one = devlink_nl_cmd_rate_get_dump_one, -}; +int devlink_nl_rate_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) +{ + return devlink_nl_dumpit(skb, cb, devlink_nl_rate_get_dump_one); +} int devlink_nl_rate_get_doit(struct sk_buff *skb, struct genl_info *info) { @@ -1098,8 +1099,8 @@ int devlink_nl_port_get_doit(struct sk_buff *skb, struct genl_info *info) } static int -devlink_nl_cmd_port_get_dump_one(struct sk_buff *msg, struct devlink *devlink, - struct netlink_callback *cb) +devlink_nl_port_get_dump_one(struct sk_buff *msg, struct devlink *devlink, + struct netlink_callback *cb) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_port *devlink_port; @@ -1121,9 +1122,10 @@ devlink_nl_cmd_port_get_dump_one(struct sk_buff *msg, struct devlink *devlink, return err; } -const struct devlink_cmd devl_cmd_port_get = { - .dump_one = devlink_nl_cmd_port_get_dump_one, -}; +int devlink_nl_port_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) +{ + return devlink_nl_dumpit(skb, cb, devlink_nl_port_get_dump_one); +} static int devlink_port_type_set(struct devlink_port *devlink_port, enum devlink_port_type port_type) @@ -1852,9 +1854,9 @@ int devlink_nl_linecard_get_doit(struct sk_buff *skb, struct genl_info *info) return genlmsg_reply(msg, info); } -static int devlink_nl_cmd_linecard_get_dump_one(struct sk_buff *msg, - struct devlink *devlink, - struct netlink_callback *cb) +static int devlink_nl_linecard_get_dump_one(struct sk_buff *msg, + struct devlink *devlink, + struct netlink_callback *cb) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_linecard *linecard; @@ -1884,9 +1886,11 @@ static int devlink_nl_cmd_linecard_get_dump_one(struct sk_buff *msg, return err; } -const struct devlink_cmd devl_cmd_linecard_get = { - .dump_one = devlink_nl_cmd_linecard_get_dump_one, -}; +int devlink_nl_linecard_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return devlink_nl_dumpit(skb, cb, devlink_nl_linecard_get_dump_one); +} static struct devlink_linecard_type * devlink_linecard_type_lookup(struct devlink_linecard *linecard, @@ -2115,8 +2119,8 @@ int devlink_nl_sb_get_doit(struct sk_buff *skb, struct genl_info *info) } static int -devlink_nl_cmd_sb_get_dump_one(struct sk_buff *msg, struct devlink *devlink, - struct netlink_callback *cb) +devlink_nl_sb_get_dump_one(struct sk_buff *msg, struct 
devlink *devlink, + struct netlink_callback *cb) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_sb *devlink_sb; @@ -2143,9 +2147,10 @@ devlink_nl_cmd_sb_get_dump_one(struct sk_buff *msg, struct devlink *devlink, return err; } -const struct devlink_cmd devl_cmd_sb_get = { - .dump_one = devlink_nl_cmd_sb_get_dump_one, -}; +int devlink_nl_sb_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) +{ + return devlink_nl_dumpit(skb, cb, devlink_nl_sb_get_dump_one); +} static int devlink_nl_sb_pool_fill(struct sk_buff *msg, struct devlink *devlink, struct devlink_sb *devlink_sb, @@ -2252,9 +2257,8 @@ static int __sb_pool_get_dumpit(struct sk_buff *msg, int start, int *p_idx, } static int -devlink_nl_cmd_sb_pool_get_dump_one(struct sk_buff *msg, - struct devlink *devlink, - struct netlink_callback *cb) +devlink_nl_sb_pool_get_dump_one(struct sk_buff *msg, struct devlink *devlink, + struct netlink_callback *cb) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_sb *devlink_sb; @@ -2280,9 +2284,11 @@ devlink_nl_cmd_sb_pool_get_dump_one(struct sk_buff *msg, return err; } -const struct devlink_cmd devl_cmd_sb_pool_get = { - .dump_one = devlink_nl_cmd_sb_pool_get_dump_one, -}; +int devlink_nl_sb_pool_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return devlink_nl_dumpit(skb, cb, devlink_nl_sb_pool_get_dump_one); +} static int devlink_sb_pool_set(struct devlink *devlink, unsigned int sb_index, u16 pool_index, u32 size, @@ -2460,9 +2466,9 @@ static int __sb_port_pool_get_dumpit(struct sk_buff *msg, int start, int *p_idx, } static int -devlink_nl_cmd_sb_port_pool_get_dump_one(struct sk_buff *msg, - struct devlink *devlink, - struct netlink_callback *cb) +devlink_nl_sb_port_pool_get_dump_one(struct sk_buff *msg, + struct devlink *devlink, + struct netlink_callback *cb) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_sb *devlink_sb; @@ -2488,9 +2494,11 @@ devlink_nl_cmd_sb_port_pool_get_dump_one(struct sk_buff *msg, return err; } -const struct devlink_cmd devl_cmd_sb_port_pool_get = { - .dump_one = devlink_nl_cmd_sb_port_pool_get_dump_one, -}; +int devlink_nl_sb_port_pool_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return devlink_nl_dumpit(skb, cb, devlink_nl_sb_port_pool_get_dump_one); +} static int devlink_sb_port_pool_set(struct devlink_port *devlink_port, unsigned int sb_index, u16 pool_index, @@ -2694,10 +2702,9 @@ static int __sb_tc_pool_bind_get_dumpit(struct sk_buff *msg, return 0; } -static int -devlink_nl_cmd_sb_tc_pool_bind_get_dump_one(struct sk_buff *msg, - struct devlink *devlink, - struct netlink_callback *cb) +static int devlink_nl_sb_tc_pool_bind_get_dump_one(struct sk_buff *msg, + struct devlink *devlink, + struct netlink_callback *cb) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_sb *devlink_sb; @@ -2723,9 +2730,12 @@ devlink_nl_cmd_sb_tc_pool_bind_get_dump_one(struct sk_buff *msg, return err; } -const struct devlink_cmd devl_cmd_sb_tc_pool_bind_get = { - .dump_one = devlink_nl_cmd_sb_tc_pool_bind_get_dump_one, -}; +int devlink_nl_sb_tc_pool_bind_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return devlink_nl_dumpit(skb, cb, + devlink_nl_sb_tc_pool_bind_get_dump_one); +} static int devlink_sb_tc_pool_bind_set(struct devlink_port *devlink_port, unsigned int sb_index, u16 tc_index, @@ -4173,9 +4183,9 @@ static void devlink_param_notify(struct devlink *devlink, msg, 0, DEVLINK_MCGRP_CONFIG, 
GFP_KERNEL); } -static int -devlink_nl_cmd_param_get_dump_one(struct sk_buff *msg, struct devlink *devlink, - struct netlink_callback *cb) +static int devlink_nl_param_get_dump_one(struct sk_buff *msg, + struct devlink *devlink, + struct netlink_callback *cb) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_param_item *param_item; @@ -4199,9 +4209,11 @@ devlink_nl_cmd_param_get_dump_one(struct sk_buff *msg, struct devlink *devlink, return err; } -const struct devlink_cmd devl_cmd_param_get = { - .dump_one = devlink_nl_cmd_param_get_dump_one, -}; +int devlink_nl_param_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return devlink_nl_dumpit(skb, cb, devlink_nl_param_get_dump_one); +} static int devlink_param_type_get_from_info(struct genl_info *info, @@ -4861,9 +4873,9 @@ out: return err; } -static int -devlink_nl_cmd_region_get_dump_one(struct sk_buff *msg, struct devlink *devlink, - struct netlink_callback *cb) +static int devlink_nl_region_get_dump_one(struct sk_buff *msg, + struct devlink *devlink, + struct netlink_callback *cb) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_region *region; @@ -4901,9 +4913,11 @@ devlink_nl_cmd_region_get_dump_one(struct sk_buff *msg, struct devlink *devlink, return 0; } -const struct devlink_cmd devl_cmd_region_get = { - .dump_one = devlink_nl_cmd_region_get_dump_one, -}; +int devlink_nl_region_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return devlink_nl_dumpit(skb, cb, devlink_nl_region_get_dump_one); +} static int devlink_nl_cmd_region_del(struct sk_buff *skb, struct genl_info *info) @@ -5683,9 +5697,9 @@ err_trap_fill: return err; } -static int -devlink_nl_cmd_trap_get_dump_one(struct sk_buff *msg, struct devlink *devlink, - struct netlink_callback *cb) +static int devlink_nl_trap_get_dump_one(struct sk_buff *msg, + struct devlink *devlink, + struct netlink_callback *cb) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_trap_item *trap_item; @@ -5712,9 +5726,10 @@ devlink_nl_cmd_trap_get_dump_one(struct sk_buff *msg, struct devlink *devlink, return err; } -const struct devlink_cmd devl_cmd_trap_get = { - .dump_one = devlink_nl_cmd_trap_get_dump_one, -}; +int devlink_nl_trap_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) +{ + return devlink_nl_dumpit(skb, cb, devlink_nl_trap_get_dump_one); +} static int __devlink_trap_action_set(struct devlink *devlink, struct devlink_trap_item *trap_item, @@ -5893,10 +5908,9 @@ err_trap_group_fill: return err; } -static int -devlink_nl_cmd_trap_group_get_dump_one(struct sk_buff *msg, - struct devlink *devlink, - struct netlink_callback *cb) +static int devlink_nl_trap_group_get_dump_one(struct sk_buff *msg, + struct devlink *devlink, + struct netlink_callback *cb) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_trap_group_item *group_item; @@ -5924,9 +5938,11 @@ devlink_nl_cmd_trap_group_get_dump_one(struct sk_buff *msg, return err; } -const struct devlink_cmd devl_cmd_trap_group_get = { - .dump_one = devlink_nl_cmd_trap_group_get_dump_one, -}; +int devlink_nl_trap_group_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return devlink_nl_dumpit(skb, cb, devlink_nl_trap_group_get_dump_one); +} static int __devlink_trap_group_action_set(struct devlink *devlink, @@ -6187,10 +6203,9 @@ err_trap_policer_fill: return err; } -static int -devlink_nl_cmd_trap_policer_get_dump_one(struct sk_buff *msg, - struct devlink *devlink, - 
struct netlink_callback *cb) +static int devlink_nl_trap_policer_get_dump_one(struct sk_buff *msg, + struct devlink *devlink, + struct netlink_callback *cb) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_trap_policer_item *policer_item; @@ -6217,9 +6232,11 @@ devlink_nl_cmd_trap_policer_get_dump_one(struct sk_buff *msg, return err; } -const struct devlink_cmd devl_cmd_trap_policer_get = { - .dump_one = devlink_nl_cmd_trap_policer_get_dump_one, -}; +int devlink_nl_trap_policer_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return devlink_nl_dumpit(skb, cb, devlink_nl_trap_policer_get_dump_one); +} static int devlink_trap_policer_set(struct devlink *devlink, @@ -6298,7 +6315,7 @@ const struct genl_small_ops devlink_nl_small_ops[54] = { .cmd = DEVLINK_CMD_PORT_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_port_get_doit, - .dumpit = devlink_nl_instance_iter_dumpit, + .dumpit = devlink_nl_port_get_dumpit, .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, /* can be retrieved by unprivileged users */ }, @@ -6312,7 +6329,7 @@ const struct genl_small_ops devlink_nl_small_ops[54] = { { .cmd = DEVLINK_CMD_RATE_GET, .doit = devlink_nl_rate_get_doit, - .dumpit = devlink_nl_instance_iter_dumpit, + .dumpit = devlink_nl_rate_get_dumpit, /* can be retrieved by unprivileged users */ }, { @@ -6358,7 +6375,7 @@ const struct genl_small_ops devlink_nl_small_ops[54] = { { .cmd = DEVLINK_CMD_LINECARD_GET, .doit = devlink_nl_linecard_get_doit, - .dumpit = devlink_nl_instance_iter_dumpit, + .dumpit = devlink_nl_linecard_get_dumpit, /* can be retrieved by unprivileged users */ }, { @@ -6370,14 +6387,14 @@ const struct genl_small_ops devlink_nl_small_ops[54] = { .cmd = DEVLINK_CMD_SB_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_sb_get_doit, - .dumpit = devlink_nl_instance_iter_dumpit, + .dumpit = devlink_nl_sb_get_dumpit, /* can be retrieved by unprivileged users */ }, { .cmd = DEVLINK_CMD_SB_POOL_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_sb_pool_get_doit, - .dumpit = devlink_nl_instance_iter_dumpit, + .dumpit = devlink_nl_sb_pool_get_dumpit, /* can be retrieved by unprivileged users */ }, { @@ -6390,7 +6407,7 @@ const struct genl_small_ops devlink_nl_small_ops[54] = { .cmd = DEVLINK_CMD_SB_PORT_POOL_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_sb_port_pool_get_doit, - .dumpit = devlink_nl_instance_iter_dumpit, + .dumpit = devlink_nl_sb_port_pool_get_dumpit, .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, /* can be retrieved by unprivileged users */ }, @@ -6405,7 +6422,7 @@ const struct genl_small_ops devlink_nl_small_ops[54] = { .cmd = DEVLINK_CMD_SB_TC_POOL_BIND_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_sb_tc_pool_bind_get_doit, - .dumpit = devlink_nl_instance_iter_dumpit, + .dumpit = devlink_nl_sb_tc_pool_bind_get_dumpit, .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, /* can be retrieved by unprivileged users */ }, @@ -6486,7 +6503,7 @@ const struct genl_small_ops devlink_nl_small_ops[54] = { .cmd = DEVLINK_CMD_PARAM_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_param_get_doit, - .dumpit = devlink_nl_instance_iter_dumpit, + .dumpit = devlink_nl_param_get_dumpit, /* can be retrieved by unprivileged users */ }, { @@ -6514,7 +6531,7 @@ const struct genl_small_ops devlink_nl_small_ops[54] = { .cmd = 
DEVLINK_CMD_REGION_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_region_get_doit, - .dumpit = devlink_nl_instance_iter_dumpit, + .dumpit = devlink_nl_region_get_dumpit, .flags = GENL_ADMIN_PERM, }, { @@ -6540,7 +6557,7 @@ const struct genl_small_ops devlink_nl_small_ops[54] = { .cmd = DEVLINK_CMD_HEALTH_REPORTER_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_health_reporter_get_doit, - .dumpit = devlink_nl_instance_iter_dumpit, + .dumpit = devlink_nl_health_reporter_get_dumpit, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT, /* can be retrieved by unprivileged users */ }, @@ -6595,7 +6612,7 @@ const struct genl_small_ops devlink_nl_small_ops[54] = { { .cmd = DEVLINK_CMD_TRAP_GET, .doit = devlink_nl_trap_get_doit, - .dumpit = devlink_nl_instance_iter_dumpit, + .dumpit = devlink_nl_trap_get_dumpit, /* can be retrieved by unprivileged users */ }, { @@ -6606,7 +6623,7 @@ const struct genl_small_ops devlink_nl_small_ops[54] = { { .cmd = DEVLINK_CMD_TRAP_GROUP_GET, .doit = devlink_nl_trap_group_get_doit, - .dumpit = devlink_nl_instance_iter_dumpit, + .dumpit = devlink_nl_trap_group_get_dumpit, /* can be retrieved by unprivileged users */ }, { @@ -6617,7 +6634,7 @@ const struct genl_small_ops devlink_nl_small_ops[54] = { { .cmd = DEVLINK_CMD_TRAP_POLICER_GET, .doit = devlink_nl_trap_policer_get_doit, - .dumpit = devlink_nl_instance_iter_dumpit, + .dumpit = devlink_nl_trap_policer_get_dumpit, /* can be retrieved by unprivileged users */ }, { @@ -6628,7 +6645,7 @@ const struct genl_small_ops devlink_nl_small_ops[54] = { { .cmd = DEVLINK_CMD_SELFTESTS_GET, .doit = devlink_nl_selftests_get_doit, - .dumpit = devlink_nl_instance_iter_dumpit, + .dumpit = devlink_nl_selftests_get_dumpit, /* can be retrieved by unprivileged users */ }, { diff --git a/net/devlink/netlink.c b/net/devlink/netlink.c index 3c59b9c49150..13388665f319 100644 --- a/net/devlink/netlink.c +++ b/net/devlink/netlink.c @@ -170,23 +170,6 @@ void devlink_nl_post_doit(const struct genl_split_ops *ops, devlink_put(devlink); } -static const struct devlink_cmd *devl_cmds[] = { - [DEVLINK_CMD_PORT_GET] = &devl_cmd_port_get, - [DEVLINK_CMD_SB_GET] = &devl_cmd_sb_get, - [DEVLINK_CMD_SB_POOL_GET] = &devl_cmd_sb_pool_get, - [DEVLINK_CMD_SB_PORT_POOL_GET] = &devl_cmd_sb_port_pool_get, - [DEVLINK_CMD_SB_TC_POOL_BIND_GET] = &devl_cmd_sb_tc_pool_bind_get, - [DEVLINK_CMD_PARAM_GET] = &devl_cmd_param_get, - [DEVLINK_CMD_REGION_GET] = &devl_cmd_region_get, - [DEVLINK_CMD_HEALTH_REPORTER_GET] = &devl_cmd_health_reporter_get, - [DEVLINK_CMD_TRAP_GET] = &devl_cmd_trap_get, - [DEVLINK_CMD_TRAP_GROUP_GET] = &devl_cmd_trap_group_get, - [DEVLINK_CMD_TRAP_POLICER_GET] = &devl_cmd_trap_policer_get, - [DEVLINK_CMD_RATE_GET] = &devl_cmd_rate_get, - [DEVLINK_CMD_LINECARD_GET] = &devl_cmd_linecard_get, - [DEVLINK_CMD_SELFTESTS_GET] = &devl_cmd_selftests_get, -}; - int devlink_nl_dumpit(struct sk_buff *msg, struct netlink_callback *cb, devlink_nl_dump_one_func_t *dump_one) { @@ -220,15 +203,6 @@ int devlink_nl_dumpit(struct sk_buff *msg, struct netlink_callback *cb, return msg->len; } -int devlink_nl_instance_iter_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) -{ - const struct genl_dumpit_info *info = genl_dumpit_info(cb); - const struct devlink_cmd *cmd = devl_cmds[info->op.cmd]; - - return devlink_nl_dumpit(msg, cb, cmd->dump_one); -} - struct genl_family devlink_nl_family __ro_after_init = { .name = DEVLINK_GENL_NAME, .version = DEVLINK_GENL_VERSION, -- cgit 
v1.2.3 From 7d3c6fec6135e10842587f38a15d7d06fd02c21f Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 11 Aug 2023 17:57:07 +0200 Subject: devlink: pass flags as an arg of dump_one() callback In order to easily set NLM_F_DUMP_FILTERED for partial dumps, pass the flags as an arg of dump_one() callback. Currently, it is always NLM_F_MULTI. Signed-off-by: Jiri Pirko Acked-by: Jakub Kicinski Link: https://lore.kernel.org/r/20230811155714.1736405-7-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- net/devlink/dev.c | 13 +++---- net/devlink/devl_internal.h | 3 +- net/devlink/health.c | 7 ++-- net/devlink/leftover.c | 87 ++++++++++++++++++++++----------------------- net/devlink/netlink.c | 2 +- 5 files changed, 56 insertions(+), 56 deletions(-) (limited to 'net') diff --git a/net/devlink/dev.c b/net/devlink/dev.c index 22e8ab3eaaa2..abf3393a7a17 100644 --- a/net/devlink/dev.c +++ b/net/devlink/dev.c @@ -218,11 +218,11 @@ int devlink_nl_get_doit(struct sk_buff *skb, struct genl_info *info) static int devlink_nl_get_dump_one(struct sk_buff *msg, struct devlink *devlink, - struct netlink_callback *cb) + struct netlink_callback *cb, int flags) { return devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI); + cb->nlh->nlmsg_seq, flags); } int devlink_nl_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb) @@ -828,13 +828,13 @@ int devlink_nl_info_get_doit(struct sk_buff *skb, struct genl_info *info) static int devlink_nl_info_get_dump_one(struct sk_buff *msg, struct devlink *devlink, - struct netlink_callback *cb) + struct netlink_callback *cb, int flags) { int err; err = devlink_nl_info_fill(msg, devlink, DEVLINK_CMD_INFO_GET, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, + cb->nlh->nlmsg_seq, flags, cb->extack); if (err == -EOPNOTSUPP) err = 0; @@ -1231,14 +1231,15 @@ int devlink_nl_selftests_get_doit(struct sk_buff *skb, struct genl_info *info) static int devlink_nl_selftests_get_dump_one(struct sk_buff *msg, struct devlink *devlink, - struct netlink_callback *cb) + struct netlink_callback *cb, + int flags) { if (!devlink->ops->selftest_check) return 0; return devlink_nl_selftests_fill(msg, devlink, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, + cb->nlh->nlmsg_seq, flags, cb->extack); } diff --git a/net/devlink/devl_internal.h b/net/devlink/devl_internal.h index 500c91c61b2d..f8af6ffdbb3a 100644 --- a/net/devlink/devl_internal.h +++ b/net/devlink/devl_internal.h @@ -115,7 +115,8 @@ struct devlink_nl_dump_state { typedef int devlink_nl_dump_one_func_t(struct sk_buff *msg, struct devlink *devlink, - struct netlink_callback *cb); + struct netlink_callback *cb, + int flags); extern const struct genl_small_ops devlink_nl_small_ops[54]; diff --git a/net/devlink/health.c b/net/devlink/health.c index dbe2d6a1df3b..b9b3e68d9043 100644 --- a/net/devlink/health.c +++ b/net/devlink/health.c @@ -386,7 +386,8 @@ int devlink_nl_health_reporter_get_doit(struct sk_buff *skb, static int devlink_nl_health_reporter_get_dump_one(struct sk_buff *msg, struct devlink *devlink, - struct netlink_callback *cb) + struct netlink_callback *cb, + int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_health_reporter *reporter; @@ -404,7 +405,7 @@ static int devlink_nl_health_reporter_get_dump_one(struct sk_buff *msg, DEVLINK_CMD_HEALTH_REPORTER_GET, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - NLM_F_MULTI); + flags); if (err) { state->idx = idx; return err; @@ -421,7 +422,7 @@ static int 
devlink_nl_health_reporter_get_dump_one(struct sk_buff *msg, DEVLINK_CMD_HEALTH_REPORTER_GET, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - NLM_F_MULTI); + flags); if (err) { state->idx = idx; return err; diff --git a/net/devlink/leftover.c b/net/devlink/leftover.c index 21f1058ef14d..883c65545d26 100644 --- a/net/devlink/leftover.c +++ b/net/devlink/leftover.c @@ -1006,7 +1006,7 @@ static void devlink_rate_notify(struct devlink_rate *devlink_rate, static int devlink_nl_rate_get_dump_one(struct sk_buff *msg, struct devlink *devlink, - struct netlink_callback *cb) + struct netlink_callback *cb, int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_rate *devlink_rate; @@ -1022,8 +1022,7 @@ devlink_nl_rate_get_dump_one(struct sk_buff *msg, struct devlink *devlink, continue; } err = devlink_nl_rate_fill(msg, devlink_rate, cmd, id, - cb->nlh->nlmsg_seq, - NLM_F_MULTI, NULL); + cb->nlh->nlmsg_seq, flags, NULL); if (err) { state->idx = idx; break; @@ -1100,7 +1099,7 @@ int devlink_nl_port_get_doit(struct sk_buff *skb, struct genl_info *info) static int devlink_nl_port_get_dump_one(struct sk_buff *msg, struct devlink *devlink, - struct netlink_callback *cb) + struct netlink_callback *cb, int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_port *devlink_port; @@ -1111,8 +1110,8 @@ devlink_nl_port_get_dump_one(struct sk_buff *msg, struct devlink *devlink, err = devlink_nl_port_fill(msg, devlink_port, DEVLINK_CMD_NEW, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI, cb->extack); + cb->nlh->nlmsg_seq, flags, + cb->extack); if (err) { state->idx = port_index; break; @@ -1856,7 +1855,8 @@ int devlink_nl_linecard_get_doit(struct sk_buff *skb, struct genl_info *info) static int devlink_nl_linecard_get_dump_one(struct sk_buff *msg, struct devlink *devlink, - struct netlink_callback *cb) + struct netlink_callback *cb, + int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_linecard *linecard; @@ -1872,8 +1872,7 @@ static int devlink_nl_linecard_get_dump_one(struct sk_buff *msg, err = devlink_nl_linecard_fill(msg, devlink, linecard, DEVLINK_CMD_LINECARD_NEW, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI, + cb->nlh->nlmsg_seq, flags, cb->extack); mutex_unlock(&linecard->state_lock); if (err) { @@ -2120,7 +2119,7 @@ int devlink_nl_sb_get_doit(struct sk_buff *skb, struct genl_info *info) static int devlink_nl_sb_get_dump_one(struct sk_buff *msg, struct devlink *devlink, - struct netlink_callback *cb) + struct netlink_callback *cb, int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_sb *devlink_sb; @@ -2135,8 +2134,7 @@ devlink_nl_sb_get_dump_one(struct sk_buff *msg, struct devlink *devlink, err = devlink_nl_sb_fill(msg, devlink, devlink_sb, DEVLINK_CMD_SB_NEW, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI); + cb->nlh->nlmsg_seq, flags); if (err) { state->idx = idx; break; @@ -2233,7 +2231,7 @@ int devlink_nl_sb_pool_get_doit(struct sk_buff *skb, struct genl_info *info) static int __sb_pool_get_dumpit(struct sk_buff *msg, int start, int *p_idx, struct devlink *devlink, struct devlink_sb *devlink_sb, - u32 portid, u32 seq) + u32 portid, u32 seq, int flags) { u16 pool_count = devlink_sb_pool_count(devlink_sb); u16 pool_index; @@ -2248,7 +2246,7 @@ static int __sb_pool_get_dumpit(struct sk_buff *msg, int start, int *p_idx, devlink_sb, pool_index, DEVLINK_CMD_SB_POOL_NEW, - portid, seq, NLM_F_MULTI); + portid, seq, 
flags); if (err) return err; (*p_idx)++; @@ -2258,7 +2256,7 @@ static int __sb_pool_get_dumpit(struct sk_buff *msg, int start, int *p_idx, static int devlink_nl_sb_pool_get_dump_one(struct sk_buff *msg, struct devlink *devlink, - struct netlink_callback *cb) + struct netlink_callback *cb, int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_sb *devlink_sb; @@ -2272,7 +2270,7 @@ devlink_nl_sb_pool_get_dump_one(struct sk_buff *msg, struct devlink *devlink, err = __sb_pool_get_dumpit(msg, state->idx, &idx, devlink, devlink_sb, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq); + cb->nlh->nlmsg_seq, flags); if (err == -EOPNOTSUPP) { err = 0; } else if (err) { @@ -2436,7 +2434,7 @@ int devlink_nl_sb_port_pool_get_doit(struct sk_buff *skb, static int __sb_port_pool_get_dumpit(struct sk_buff *msg, int start, int *p_idx, struct devlink *devlink, struct devlink_sb *devlink_sb, - u32 portid, u32 seq) + u32 portid, u32 seq, int flags) { struct devlink_port *devlink_port; u16 pool_count = devlink_sb_pool_count(devlink_sb); @@ -2455,8 +2453,7 @@ static int __sb_port_pool_get_dumpit(struct sk_buff *msg, int start, int *p_idx, devlink_sb, pool_index, DEVLINK_CMD_SB_PORT_POOL_NEW, - portid, seq, - NLM_F_MULTI); + portid, seq, flags); if (err) return err; (*p_idx)++; @@ -2468,7 +2465,7 @@ static int __sb_port_pool_get_dumpit(struct sk_buff *msg, int start, int *p_idx, static int devlink_nl_sb_port_pool_get_dump_one(struct sk_buff *msg, struct devlink *devlink, - struct netlink_callback *cb) + struct netlink_callback *cb, int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_sb *devlink_sb; @@ -2482,7 +2479,7 @@ devlink_nl_sb_port_pool_get_dump_one(struct sk_buff *msg, err = __sb_port_pool_get_dumpit(msg, state->idx, &idx, devlink, devlink_sb, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq); + cb->nlh->nlmsg_seq, flags); if (err == -EOPNOTSUPP) { err = 0; } else if (err) { @@ -2654,7 +2651,7 @@ static int __sb_tc_pool_bind_get_dumpit(struct sk_buff *msg, int start, int *p_idx, struct devlink *devlink, struct devlink_sb *devlink_sb, - u32 portid, u32 seq) + u32 portid, u32 seq, int flags) { struct devlink_port *devlink_port; unsigned long port_index; @@ -2675,7 +2672,7 @@ static int __sb_tc_pool_bind_get_dumpit(struct sk_buff *msg, DEVLINK_SB_POOL_TYPE_INGRESS, DEVLINK_CMD_SB_TC_POOL_BIND_NEW, portid, seq, - NLM_F_MULTI); + flags); if (err) return err; (*p_idx)++; @@ -2693,7 +2690,7 @@ static int __sb_tc_pool_bind_get_dumpit(struct sk_buff *msg, DEVLINK_SB_POOL_TYPE_EGRESS, DEVLINK_CMD_SB_TC_POOL_BIND_NEW, portid, seq, - NLM_F_MULTI); + flags); if (err) return err; (*p_idx)++; @@ -2704,7 +2701,8 @@ static int __sb_tc_pool_bind_get_dumpit(struct sk_buff *msg, static int devlink_nl_sb_tc_pool_bind_get_dump_one(struct sk_buff *msg, struct devlink *devlink, - struct netlink_callback *cb) + struct netlink_callback *cb, + int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_sb *devlink_sb; @@ -2718,7 +2716,7 @@ static int devlink_nl_sb_tc_pool_bind_get_dump_one(struct sk_buff *msg, err = __sb_tc_pool_bind_get_dumpit(msg, state->idx, &idx, devlink, devlink_sb, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq); + cb->nlh->nlmsg_seq, flags); if (err == -EOPNOTSUPP) { err = 0; } else if (err) { @@ -4185,7 +4183,8 @@ static void devlink_param_notify(struct devlink *devlink, static int devlink_nl_param_get_dump_one(struct sk_buff *msg, struct devlink *devlink, - struct netlink_callback *cb) + struct 
netlink_callback *cb, + int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_param_item *param_item; @@ -4196,8 +4195,7 @@ static int devlink_nl_param_get_dump_one(struct sk_buff *msg, err = devlink_nl_param_fill(msg, devlink, 0, param_item, DEVLINK_CMD_PARAM_GET, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI); + cb->nlh->nlmsg_seq, flags); if (err == -EOPNOTSUPP) { err = 0; } else if (err) { @@ -4848,8 +4846,7 @@ int devlink_nl_region_get_doit(struct sk_buff *skb, struct genl_info *info) static int devlink_nl_cmd_region_get_port_dumpit(struct sk_buff *msg, struct netlink_callback *cb, struct devlink_port *port, - int *idx, - int start) + int *idx, int start, int flags) { struct devlink_region *region; int err = 0; @@ -4863,7 +4860,7 @@ static int devlink_nl_cmd_region_get_port_dumpit(struct sk_buff *msg, DEVLINK_CMD_REGION_GET, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - NLM_F_MULTI, region); + flags, region); if (err) goto out; (*idx)++; @@ -4875,7 +4872,8 @@ out: static int devlink_nl_region_get_dump_one(struct sk_buff *msg, struct devlink *devlink, - struct netlink_callback *cb) + struct netlink_callback *cb, + int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_region *region; @@ -4892,8 +4890,8 @@ static int devlink_nl_region_get_dump_one(struct sk_buff *msg, err = devlink_nl_region_fill(msg, devlink, DEVLINK_CMD_REGION_GET, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI, region); + cb->nlh->nlmsg_seq, flags, + region); if (err) { state->idx = idx; return err; @@ -4903,7 +4901,7 @@ static int devlink_nl_region_get_dump_one(struct sk_buff *msg, xa_for_each(&devlink->ports, port_index, port) { err = devlink_nl_cmd_region_get_port_dumpit(msg, cb, port, &idx, - state->idx); + state->idx, flags); if (err) { state->idx = idx; return err; @@ -5699,7 +5697,7 @@ err_trap_fill: static int devlink_nl_trap_get_dump_one(struct sk_buff *msg, struct devlink *devlink, - struct netlink_callback *cb) + struct netlink_callback *cb, int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_trap_item *trap_item; @@ -5714,8 +5712,7 @@ static int devlink_nl_trap_get_dump_one(struct sk_buff *msg, err = devlink_nl_trap_fill(msg, devlink, trap_item, DEVLINK_CMD_TRAP_NEW, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI); + cb->nlh->nlmsg_seq, flags); if (err) { state->idx = idx; break; @@ -5910,7 +5907,8 @@ err_trap_group_fill: static int devlink_nl_trap_group_get_dump_one(struct sk_buff *msg, struct devlink *devlink, - struct netlink_callback *cb) + struct netlink_callback *cb, + int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_trap_group_item *group_item; @@ -5926,8 +5924,7 @@ static int devlink_nl_trap_group_get_dump_one(struct sk_buff *msg, err = devlink_nl_trap_group_fill(msg, devlink, group_item, DEVLINK_CMD_TRAP_GROUP_NEW, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI); + cb->nlh->nlmsg_seq, flags); if (err) { state->idx = idx; break; @@ -6205,7 +6202,8 @@ err_trap_policer_fill: static int devlink_nl_trap_policer_get_dump_one(struct sk_buff *msg, struct devlink *devlink, - struct netlink_callback *cb) + struct netlink_callback *cb, + int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_trap_policer_item *policer_item; @@ -6220,8 +6218,7 @@ static int devlink_nl_trap_policer_get_dump_one(struct sk_buff *msg, err = devlink_nl_trap_policer_fill(msg, 
devlink, policer_item, DEVLINK_CMD_TRAP_POLICER_NEW, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI); + cb->nlh->nlmsg_seq, flags); if (err) { state->idx = idx; break; diff --git a/net/devlink/netlink.c b/net/devlink/netlink.c index 13388665f319..47e44fb45815 100644 --- a/net/devlink/netlink.c +++ b/net/devlink/netlink.c @@ -182,7 +182,7 @@ int devlink_nl_dumpit(struct sk_buff *msg, struct netlink_callback *cb, devl_lock(devlink); if (devl_is_registered(devlink)) - err = dump_one(msg, devlink, cb); + err = dump_one(msg, devlink, cb, NLM_F_MULTI); else err = 0; -- cgit v1.2.3 From 7199c86247e9618981a2916900d3fcb83e5df157 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 11 Aug 2023 17:57:08 +0200 Subject: netlink: specs: devlink: add commands that do per-instance dump Add the definitions for the commands that do per-instance dump and re-generate the related code. Signed-off-by: Jiri Pirko Acked-by: Jakub Kicinski Link: https://lore.kernel.org/r/20230811155714.1736405-8-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/devlink.yaml | 444 +++++- net/devlink/netlink_gen.c | 332 ++++- net/devlink/netlink_gen.h | 52 +- tools/net/ynl/generated/devlink-user.c | 2341 ++++++++++++++++++++++++++++-- tools/net/ynl/generated/devlink-user.h | 1295 ++++++++++++++++- 5 files changed, 4319 insertions(+), 145 deletions(-) (limited to 'net') diff --git a/Documentation/netlink/specs/devlink.yaml b/Documentation/netlink/specs/devlink.yaml index f6df0b3fd502..4f2aa48017a7 100644 --- a/Documentation/netlink/specs/devlink.yaml +++ b/Documentation/netlink/specs/devlink.yaml @@ -6,6 +6,16 @@ protocol: genetlink-legacy doc: Partial family for Devlink. +definitions: + - + type: enum + name: sb-pool-type + entries: + - + name: ingress + - + name: egress + attribute-sets: - name: devlink @@ -24,6 +34,46 @@ attribute-sets: # TODO: fill in the attributes in between + - + name: sb-index + type: u32 + value: 11 + + # TODO: fill in the attributes in between + + - + name: sb-pool-index + type: u16 + value: 17 + + - + name: sb-pool-type + type: u8 + enum: sb-pool-type + + # TODO: fill in the attributes in between + + - + name: sb-tc-index + type: u16 + value: 22 + + # TODO: fill in the attributes in between + + - + name: param-name + type: string + value: 81 + + # TODO: fill in the attributes in between + + - + name: region-name + type: string + value: 88 + + # TODO: fill in the attributes in between + - name: info-driver-name type: string @@ -55,10 +105,35 @@ attribute-sets: # TODO: fill in the attributes in between + - + name: health-reporter-name + type: string + value: 115 + + # TODO: fill in the attributes in between + + - + name: trap-name + type: string + value: 130 + + # TODO: fill in the attributes in between + + - + name: trap-group-name + type: string + value: 135 + - name: reload-failed type: u8 - value: 136 + + # TODO: fill in the attributes in between + + - + name: trap-policer-id + type: u32 + value: 142 # TODO: fill in the attributes in between @@ -103,6 +178,21 @@ attribute-sets: type: nest multi-attr: true nested-attributes: dl-reload-act-stats + + # TODO: fill in the attributes in between + + - + name: rate-node-name + type: string + value: 168 + + # TODO: fill in the attributes in between + + - + name: linecard-index + type: u32 + value: 171 + - name: dl-dev-stats subset-of: devlink @@ -188,6 +278,188 @@ operations: dump: reply: *get-reply + - + name: port-get + doc: Get devlink port instances. 
+ attribute-set: devlink + dont-validate: + - strict + - dump + + do: + pre: devlink-nl-pre-doit-port + post: devlink-nl-post-doit + request: + value: 5 + attributes: &port-id-attrs + - bus-name + - dev-name + - port-index + reply: + value: 7 + attributes: *port-id-attrs + dump: + reply: + value: 3 # due to a bug, port dump returns DEVLINK_CMD_NEW + attributes: *port-id-attrs + + # TODO: fill in the operations in between + + - + name: sb-get + doc: Get shared buffer instances. + attribute-set: devlink + dont-validate: + - strict + - dump + + do: + pre: devlink-nl-pre-doit + post: devlink-nl-post-doit + request: + value: 11 + attributes: &sb-id-attrs + - bus-name + - dev-name + - sb-index + reply: &sb-get-reply + value: 11 + attributes: *sb-id-attrs + dump: + reply: *sb-get-reply + + # TODO: fill in the operations in between + + - + name: sb-pool-get + doc: Get shared buffer pool instances. + attribute-set: devlink + dont-validate: + - strict + - dump + + do: + pre: devlink-nl-pre-doit + post: devlink-nl-post-doit + request: + value: 15 + attributes: &sb-pool-id-attrs + - bus-name + - dev-name + - sb-index + - sb-pool-index + reply: &sb-pool-get-reply + value: 15 + attributes: *sb-pool-id-attrs + dump: + reply: *sb-pool-get-reply + + # TODO: fill in the operations in between + + - + name: sb-port-pool-get + doc: Get shared buffer port-pool combinations and threshold. + attribute-set: devlink + dont-validate: + - strict + - dump + + do: + pre: devlink-nl-pre-doit-port + post: devlink-nl-post-doit + request: + value: 19 + attributes: &sb-port-pool-id-attrs + - bus-name + - dev-name + - port-index + - sb-index + - sb-pool-index + reply: &sb-port-pool-get-reply + value: 19 + attributes: *sb-port-pool-id-attrs + dump: + reply: *sb-port-pool-get-reply + + # TODO: fill in the operations in between + + - + name: sb-tc-pool-bind-get + doc: Get shared buffer port-TC to pool bindings and threshold. + attribute-set: devlink + dont-validate: + - strict + - dump + + do: + pre: devlink-nl-pre-doit-port + post: devlink-nl-post-doit + request: + value: 23 + attributes: &sb-tc-pool-bind-id-attrs + - bus-name + - dev-name + - port-index + - sb-index + - sb-pool-type + - sb-tc-index + reply: &sb-tc-pool-bind-get-reply + value: 23 + attributes: *sb-tc-pool-bind-id-attrs + dump: + reply: *sb-tc-pool-bind-get-reply + + # TODO: fill in the operations in between + + - + name: param-get + doc: Get param instances. + attribute-set: devlink + dont-validate: + - strict + - dump + + do: + pre: devlink-nl-pre-doit + post: devlink-nl-post-doit + request: + value: 38 + attributes: ¶m-id-attrs + - bus-name + - dev-name + - param-name + reply: ¶m-get-reply + value: 38 + attributes: *param-id-attrs + dump: + reply: *param-get-reply + + # TODO: fill in the operations in between + + - + name: region-get + doc: Get region instances. + attribute-set: devlink + dont-validate: + - strict + - dump + + do: + pre: devlink-nl-pre-doit-port-optional + post: devlink-nl-post-doit + request: + value: 42 + attributes: ®ion-id-attrs + - bus-name + - dev-name + - port-index + - region-name + reply: ®ion-get-reply + value: 42 + attributes: *region-id-attrs + dump: + reply: *region-get-reply + # TODO: fill in the operations in between - @@ -216,3 +488,173 @@ operations: - info-version-stored dump: reply: *info-get-reply + + - + name: health-reporter-get + doc: Get health reporter instances. 
+ attribute-set: devlink + dont-validate: + - strict + - dump + + do: + pre: devlink-nl-pre-doit-port-optional + post: devlink-nl-post-doit + request: + attributes: &health-reporter-id-attrs + - bus-name + - dev-name + - port-index + - health-reporter-name + reply: &health-reporter-get-reply + attributes: *health-reporter-id-attrs + dump: + reply: *health-reporter-get-reply + + # TODO: fill in the operations in between + + - + name: trap-get + doc: Get trap instances. + attribute-set: devlink + dont-validate: + - strict + - dump + + do: + pre: devlink-nl-pre-doit + post: devlink-nl-post-doit + request: + value: 61 + attributes: &trap-id-attrs + - bus-name + - dev-name + - trap-name + reply: &trap-get-reply + value: 61 + attributes: *trap-id-attrs + dump: + reply: *trap-get-reply + + # TODO: fill in the operations in between + + - + name: trap-group-get + doc: Get trap group instances. + attribute-set: devlink + dont-validate: + - strict + - dump + + do: + pre: devlink-nl-pre-doit + post: devlink-nl-post-doit + request: + value: 65 + attributes: &trap-group-id-attrs + - bus-name + - dev-name + - trap-group-name + reply: &trap-group-get-reply + value: 65 + attributes: *trap-group-id-attrs + dump: + reply: *trap-group-get-reply + + # TODO: fill in the operations in between + + - + name: trap-policer-get + doc: Get trap policer instances. + attribute-set: devlink + dont-validate: + - strict + - dump + + do: + pre: devlink-nl-pre-doit + post: devlink-nl-post-doit + request: + value: 69 + attributes: &trap-policer-id-attrs + - bus-name + - dev-name + - trap-policer-id + reply: &trap-policer-get-reply + value: 69 + attributes: *trap-policer-id-attrs + dump: + reply: *trap-policer-get-reply + + # TODO: fill in the operations in between + + - + name: rate-get + doc: Get rate instances. + attribute-set: devlink + dont-validate: + - strict + - dump + + do: + pre: devlink-nl-pre-doit + post: devlink-nl-post-doit + request: + value: 74 + attributes: &rate-id-attrs + - bus-name + - dev-name + - port-index + - rate-node-name + reply: &rate-get-reply + value: 74 + attributes: *rate-id-attrs + dump: + reply: *rate-get-reply + + # TODO: fill in the operations in between + + - + name: linecard-get + doc: Get line card instances. + attribute-set: devlink + dont-validate: + - strict + - dump + + do: + pre: devlink-nl-pre-doit + post: devlink-nl-post-doit + request: + value: 78 + attributes: &linecard-id-attrs + - bus-name + - dev-name + - linecard-index + reply: &linecard-get-reply + value: 78 + attributes: *linecard-id-attrs + dump: + reply: *linecard-get-reply + + # TODO: fill in the operations in between + + - + name: selftests-get + doc: Get device selftest instances. 
+ attribute-set: devlink + dont-validate: + - strict + - dump + + do: + pre: devlink-nl-pre-doit + post: devlink-nl-post-doit + request: + value: 82 + attributes: *dev-id-attrs + reply: &selftests-get-reply + value: 82 + attributes: *dev-id-attrs + dump: + reply: *selftests-get-reply diff --git a/net/devlink/netlink_gen.c b/net/devlink/netlink_gen.c index 32d8cbed0c30..e384b91b2e3b 100644 --- a/net/devlink/netlink_gen.c +++ b/net/devlink/netlink_gen.c @@ -16,14 +16,120 @@ static const struct nla_policy devlink_get_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, }; +/* DEVLINK_CMD_PORT_GET - do */ +static const struct nla_policy devlink_port_get_nl_policy[DEVLINK_ATTR_PORT_INDEX + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, }, +}; + +/* DEVLINK_CMD_SB_GET - do */ +static const struct nla_policy devlink_sb_get_nl_policy[DEVLINK_ATTR_SB_INDEX + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_SB_INDEX] = { .type = NLA_U32, }, +}; + +/* DEVLINK_CMD_SB_POOL_GET - do */ +static const struct nla_policy devlink_sb_pool_get_nl_policy[DEVLINK_ATTR_SB_POOL_INDEX + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_SB_INDEX] = { .type = NLA_U32, }, + [DEVLINK_ATTR_SB_POOL_INDEX] = { .type = NLA_U16, }, +}; + +/* DEVLINK_CMD_SB_PORT_POOL_GET - do */ +static const struct nla_policy devlink_sb_port_pool_get_nl_policy[DEVLINK_ATTR_SB_POOL_INDEX + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, }, + [DEVLINK_ATTR_SB_INDEX] = { .type = NLA_U32, }, + [DEVLINK_ATTR_SB_POOL_INDEX] = { .type = NLA_U16, }, +}; + +/* DEVLINK_CMD_SB_TC_POOL_BIND_GET - do */ +static const struct nla_policy devlink_sb_tc_pool_bind_get_nl_policy[DEVLINK_ATTR_SB_TC_INDEX + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, }, + [DEVLINK_ATTR_SB_INDEX] = { .type = NLA_U32, }, + [DEVLINK_ATTR_SB_POOL_TYPE] = NLA_POLICY_MAX(NLA_U8, 1), + [DEVLINK_ATTR_SB_TC_INDEX] = { .type = NLA_U16, }, +}; + +/* DEVLINK_CMD_PARAM_GET - do */ +static const struct nla_policy devlink_param_get_nl_policy[DEVLINK_ATTR_PARAM_NAME + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_PARAM_NAME] = { .type = NLA_NUL_STRING, }, +}; + +/* DEVLINK_CMD_REGION_GET - do */ +static const struct nla_policy devlink_region_get_nl_policy[DEVLINK_ATTR_REGION_NAME + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, }, + [DEVLINK_ATTR_REGION_NAME] = { .type = NLA_NUL_STRING, }, +}; + /* DEVLINK_CMD_INFO_GET - do */ static const struct nla_policy devlink_info_get_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, }; +/* DEVLINK_CMD_HEALTH_REPORTER_GET - do */ +static const struct nla_policy devlink_health_reporter_get_nl_policy[DEVLINK_ATTR_HEALTH_REPORTER_NAME + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { 
.type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, }, + [DEVLINK_ATTR_HEALTH_REPORTER_NAME] = { .type = NLA_NUL_STRING, }, +}; + +/* DEVLINK_CMD_TRAP_GET - do */ +static const struct nla_policy devlink_trap_get_nl_policy[DEVLINK_ATTR_TRAP_NAME + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_TRAP_NAME] = { .type = NLA_NUL_STRING, }, +}; + +/* DEVLINK_CMD_TRAP_GROUP_GET - do */ +static const struct nla_policy devlink_trap_group_get_nl_policy[DEVLINK_ATTR_TRAP_GROUP_NAME + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_TRAP_GROUP_NAME] = { .type = NLA_NUL_STRING, }, +}; + +/* DEVLINK_CMD_TRAP_POLICER_GET - do */ +static const struct nla_policy devlink_trap_policer_get_nl_policy[DEVLINK_ATTR_TRAP_POLICER_ID + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_TRAP_POLICER_ID] = { .type = NLA_U32, }, +}; + +/* DEVLINK_CMD_RATE_GET - do */ +static const struct nla_policy devlink_rate_get_nl_policy[DEVLINK_ATTR_RATE_NODE_NAME + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, }, + [DEVLINK_ATTR_RATE_NODE_NAME] = { .type = NLA_NUL_STRING, }, +}; + +/* DEVLINK_CMD_LINECARD_GET - do */ +static const struct nla_policy devlink_linecard_get_nl_policy[DEVLINK_ATTR_LINECARD_INDEX + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_LINECARD_INDEX] = { .type = NLA_U32, }, +}; + +/* DEVLINK_CMD_SELFTESTS_GET - do */ +static const struct nla_policy devlink_selftests_get_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, +}; + /* Ops table for devlink */ -const struct genl_split_ops devlink_nl_ops[4] = { +const struct genl_split_ops devlink_nl_ops[32] = { { .cmd = DEVLINK_CMD_GET, .validate = GENL_DONT_VALIDATE_STRICT, @@ -40,6 +146,118 @@ const struct genl_split_ops devlink_nl_ops[4] = { .dumpit = devlink_nl_get_dumpit, .flags = GENL_CMD_CAP_DUMP, }, + { + .cmd = DEVLINK_CMD_PORT_GET, + .validate = GENL_DONT_VALIDATE_STRICT, + .pre_doit = devlink_nl_pre_doit_port, + .doit = devlink_nl_port_get_doit, + .post_doit = devlink_nl_post_doit, + .policy = devlink_port_get_nl_policy, + .maxattr = DEVLINK_ATTR_PORT_INDEX, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = DEVLINK_CMD_PORT_GET, + .validate = GENL_DONT_VALIDATE_DUMP, + .dumpit = devlink_nl_port_get_dumpit, + .flags = GENL_CMD_CAP_DUMP, + }, + { + .cmd = DEVLINK_CMD_SB_GET, + .validate = GENL_DONT_VALIDATE_STRICT, + .pre_doit = devlink_nl_pre_doit, + .doit = devlink_nl_sb_get_doit, + .post_doit = devlink_nl_post_doit, + .policy = devlink_sb_get_nl_policy, + .maxattr = DEVLINK_ATTR_SB_INDEX, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = DEVLINK_CMD_SB_GET, + .validate = GENL_DONT_VALIDATE_DUMP, + .dumpit = devlink_nl_sb_get_dumpit, + .flags = GENL_CMD_CAP_DUMP, + }, + { + .cmd = DEVLINK_CMD_SB_POOL_GET, + .validate = GENL_DONT_VALIDATE_STRICT, + .pre_doit = devlink_nl_pre_doit, + .doit = devlink_nl_sb_pool_get_doit, + .post_doit = devlink_nl_post_doit, + .policy = devlink_sb_pool_get_nl_policy, + .maxattr = 
DEVLINK_ATTR_SB_POOL_INDEX, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = DEVLINK_CMD_SB_POOL_GET, + .validate = GENL_DONT_VALIDATE_DUMP, + .dumpit = devlink_nl_sb_pool_get_dumpit, + .flags = GENL_CMD_CAP_DUMP, + }, + { + .cmd = DEVLINK_CMD_SB_PORT_POOL_GET, + .validate = GENL_DONT_VALIDATE_STRICT, + .pre_doit = devlink_nl_pre_doit_port, + .doit = devlink_nl_sb_port_pool_get_doit, + .post_doit = devlink_nl_post_doit, + .policy = devlink_sb_port_pool_get_nl_policy, + .maxattr = DEVLINK_ATTR_SB_POOL_INDEX, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = DEVLINK_CMD_SB_PORT_POOL_GET, + .validate = GENL_DONT_VALIDATE_DUMP, + .dumpit = devlink_nl_sb_port_pool_get_dumpit, + .flags = GENL_CMD_CAP_DUMP, + }, + { + .cmd = DEVLINK_CMD_SB_TC_POOL_BIND_GET, + .validate = GENL_DONT_VALIDATE_STRICT, + .pre_doit = devlink_nl_pre_doit_port, + .doit = devlink_nl_sb_tc_pool_bind_get_doit, + .post_doit = devlink_nl_post_doit, + .policy = devlink_sb_tc_pool_bind_get_nl_policy, + .maxattr = DEVLINK_ATTR_SB_TC_INDEX, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = DEVLINK_CMD_SB_TC_POOL_BIND_GET, + .validate = GENL_DONT_VALIDATE_DUMP, + .dumpit = devlink_nl_sb_tc_pool_bind_get_dumpit, + .flags = GENL_CMD_CAP_DUMP, + }, + { + .cmd = DEVLINK_CMD_PARAM_GET, + .validate = GENL_DONT_VALIDATE_STRICT, + .pre_doit = devlink_nl_pre_doit, + .doit = devlink_nl_param_get_doit, + .post_doit = devlink_nl_post_doit, + .policy = devlink_param_get_nl_policy, + .maxattr = DEVLINK_ATTR_PARAM_NAME, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = DEVLINK_CMD_PARAM_GET, + .validate = GENL_DONT_VALIDATE_DUMP, + .dumpit = devlink_nl_param_get_dumpit, + .flags = GENL_CMD_CAP_DUMP, + }, + { + .cmd = DEVLINK_CMD_REGION_GET, + .validate = GENL_DONT_VALIDATE_STRICT, + .pre_doit = devlink_nl_pre_doit_port_optional, + .doit = devlink_nl_region_get_doit, + .post_doit = devlink_nl_post_doit, + .policy = devlink_region_get_nl_policy, + .maxattr = DEVLINK_ATTR_REGION_NAME, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = DEVLINK_CMD_REGION_GET, + .validate = GENL_DONT_VALIDATE_DUMP, + .dumpit = devlink_nl_region_get_dumpit, + .flags = GENL_CMD_CAP_DUMP, + }, { .cmd = DEVLINK_CMD_INFO_GET, .validate = GENL_DONT_VALIDATE_STRICT, @@ -56,4 +274,116 @@ const struct genl_split_ops devlink_nl_ops[4] = { .dumpit = devlink_nl_info_get_dumpit, .flags = GENL_CMD_CAP_DUMP, }, + { + .cmd = DEVLINK_CMD_HEALTH_REPORTER_GET, + .validate = GENL_DONT_VALIDATE_STRICT, + .pre_doit = devlink_nl_pre_doit_port_optional, + .doit = devlink_nl_health_reporter_get_doit, + .post_doit = devlink_nl_post_doit, + .policy = devlink_health_reporter_get_nl_policy, + .maxattr = DEVLINK_ATTR_HEALTH_REPORTER_NAME, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = DEVLINK_CMD_HEALTH_REPORTER_GET, + .validate = GENL_DONT_VALIDATE_DUMP, + .dumpit = devlink_nl_health_reporter_get_dumpit, + .flags = GENL_CMD_CAP_DUMP, + }, + { + .cmd = DEVLINK_CMD_TRAP_GET, + .validate = GENL_DONT_VALIDATE_STRICT, + .pre_doit = devlink_nl_pre_doit, + .doit = devlink_nl_trap_get_doit, + .post_doit = devlink_nl_post_doit, + .policy = devlink_trap_get_nl_policy, + .maxattr = DEVLINK_ATTR_TRAP_NAME, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = DEVLINK_CMD_TRAP_GET, + .validate = GENL_DONT_VALIDATE_DUMP, + .dumpit = devlink_nl_trap_get_dumpit, + .flags = GENL_CMD_CAP_DUMP, + }, + { + .cmd = DEVLINK_CMD_TRAP_GROUP_GET, + .validate = GENL_DONT_VALIDATE_STRICT, + .pre_doit = devlink_nl_pre_doit, + .doit = devlink_nl_trap_group_get_doit, + .post_doit = devlink_nl_post_doit, + .policy = devlink_trap_group_get_nl_policy, + 
.maxattr = DEVLINK_ATTR_TRAP_GROUP_NAME, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = DEVLINK_CMD_TRAP_GROUP_GET, + .validate = GENL_DONT_VALIDATE_DUMP, + .dumpit = devlink_nl_trap_group_get_dumpit, + .flags = GENL_CMD_CAP_DUMP, + }, + { + .cmd = DEVLINK_CMD_TRAP_POLICER_GET, + .validate = GENL_DONT_VALIDATE_STRICT, + .pre_doit = devlink_nl_pre_doit, + .doit = devlink_nl_trap_policer_get_doit, + .post_doit = devlink_nl_post_doit, + .policy = devlink_trap_policer_get_nl_policy, + .maxattr = DEVLINK_ATTR_TRAP_POLICER_ID, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = DEVLINK_CMD_TRAP_POLICER_GET, + .validate = GENL_DONT_VALIDATE_DUMP, + .dumpit = devlink_nl_trap_policer_get_dumpit, + .flags = GENL_CMD_CAP_DUMP, + }, + { + .cmd = DEVLINK_CMD_RATE_GET, + .validate = GENL_DONT_VALIDATE_STRICT, + .pre_doit = devlink_nl_pre_doit, + .doit = devlink_nl_rate_get_doit, + .post_doit = devlink_nl_post_doit, + .policy = devlink_rate_get_nl_policy, + .maxattr = DEVLINK_ATTR_RATE_NODE_NAME, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = DEVLINK_CMD_RATE_GET, + .validate = GENL_DONT_VALIDATE_DUMP, + .dumpit = devlink_nl_rate_get_dumpit, + .flags = GENL_CMD_CAP_DUMP, + }, + { + .cmd = DEVLINK_CMD_LINECARD_GET, + .validate = GENL_DONT_VALIDATE_STRICT, + .pre_doit = devlink_nl_pre_doit, + .doit = devlink_nl_linecard_get_doit, + .post_doit = devlink_nl_post_doit, + .policy = devlink_linecard_get_nl_policy, + .maxattr = DEVLINK_ATTR_LINECARD_INDEX, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = DEVLINK_CMD_LINECARD_GET, + .validate = GENL_DONT_VALIDATE_DUMP, + .dumpit = devlink_nl_linecard_get_dumpit, + .flags = GENL_CMD_CAP_DUMP, + }, + { + .cmd = DEVLINK_CMD_SELFTESTS_GET, + .validate = GENL_DONT_VALIDATE_STRICT, + .pre_doit = devlink_nl_pre_doit, + .doit = devlink_nl_selftests_get_doit, + .post_doit = devlink_nl_post_doit, + .policy = devlink_selftests_get_nl_policy, + .maxattr = DEVLINK_ATTR_DEV_NAME, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = DEVLINK_CMD_SELFTESTS_GET, + .validate = GENL_DONT_VALIDATE_DUMP, + .dumpit = devlink_nl_selftests_get_dumpit, + .flags = GENL_CMD_CAP_DUMP, + }, }; diff --git a/net/devlink/netlink_gen.h b/net/devlink/netlink_gen.h index 11980e04a718..f8bbc93e39be 100644 --- a/net/devlink/netlink_gen.h +++ b/net/devlink/netlink_gen.h @@ -12,18 +12,68 @@ #include /* Ops table for devlink */ -extern const struct genl_split_ops devlink_nl_ops[4]; +extern const struct genl_split_ops devlink_nl_ops[32]; int devlink_nl_pre_doit(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info); +int devlink_nl_pre_doit_port(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info); +int devlink_nl_pre_doit_port_optional(const struct genl_split_ops *ops, + struct sk_buff *skb, + struct genl_info *info); void devlink_nl_post_doit(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info); int devlink_nl_get_doit(struct sk_buff *skb, struct genl_info *info); int devlink_nl_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); +int devlink_nl_port_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_port_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); +int devlink_nl_sb_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_sb_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); +int devlink_nl_sb_pool_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_sb_pool_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); +int 
devlink_nl_sb_port_pool_get_doit(struct sk_buff *skb, + struct genl_info *info); +int devlink_nl_sb_port_pool_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); +int devlink_nl_sb_tc_pool_bind_get_doit(struct sk_buff *skb, + struct genl_info *info); +int devlink_nl_sb_tc_pool_bind_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); +int devlink_nl_param_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_param_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); +int devlink_nl_region_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_region_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); int devlink_nl_info_get_doit(struct sk_buff *skb, struct genl_info *info); int devlink_nl_info_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); +int devlink_nl_health_reporter_get_doit(struct sk_buff *skb, + struct genl_info *info); +int devlink_nl_health_reporter_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); +int devlink_nl_trap_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_trap_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); +int devlink_nl_trap_group_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_trap_group_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); +int devlink_nl_trap_policer_get_doit(struct sk_buff *skb, + struct genl_info *info); +int devlink_nl_trap_policer_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); +int devlink_nl_rate_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_rate_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); +int devlink_nl_linecard_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_linecard_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); +int devlink_nl_selftests_get_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_selftests_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); #endif /* _LINUX_DEVLINK_GEN_H */ diff --git a/tools/net/ynl/generated/devlink-user.c b/tools/net/ynl/generated/devlink-user.c index 8492789433b9..e9ab98b5ffe3 100644 --- a/tools/net/ynl/generated/devlink-user.c +++ b/tools/net/ynl/generated/devlink-user.c @@ -15,7 +15,21 @@ /* Enums */ static const char * const devlink_op_strmap[] = { [3] = "get", + [7] = "port-get", + [DEVLINK_CMD_SB_GET] = "sb-get", + [DEVLINK_CMD_SB_POOL_GET] = "sb-pool-get", + [DEVLINK_CMD_SB_PORT_POOL_GET] = "sb-port-pool-get", + [DEVLINK_CMD_SB_TC_POOL_BIND_GET] = "sb-tc-pool-bind-get", + [DEVLINK_CMD_PARAM_GET] = "param-get", + [DEVLINK_CMD_REGION_GET] = "region-get", [DEVLINK_CMD_INFO_GET] = "info-get", + [DEVLINK_CMD_HEALTH_REPORTER_GET] = "health-reporter-get", + [DEVLINK_CMD_TRAP_GET] = "trap-get", + [DEVLINK_CMD_TRAP_GROUP_GET] = "trap-group-get", + [DEVLINK_CMD_TRAP_POLICER_GET] = "trap-policer-get", + [DEVLINK_CMD_RATE_GET] = "rate-get", + [DEVLINK_CMD_LINECARD_GET] = "linecard-get", + [DEVLINK_CMD_SELFTESTS_GET] = "selftests-get", }; const char *devlink_op_str(int op) @@ -25,6 +39,18 @@ const char *devlink_op_str(int op) return devlink_op_strmap[op]; } +static const char * const devlink_sb_pool_type_strmap[] = { + [0] = "ingress", + [1] = "egress", +}; + +const char *devlink_sb_pool_type_str(enum devlink_sb_pool_type value) +{ + if (value < 0 || value >= (int)MNL_ARRAY_SIZE(devlink_sb_pool_type_strmap)) + return NULL; + return devlink_sb_pool_type_strmap[value]; +} + /* Policies */ struct ynl_policy_attr 
devlink_dl_info_version_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_INFO_VERSION_NAME] = { .name = "info-version-name", .type = YNL_PT_NUL_STR, }, @@ -88,6 +114,12 @@ struct ynl_policy_attr devlink_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .name = "bus-name", .type = YNL_PT_NUL_STR, }, [DEVLINK_ATTR_DEV_NAME] = { .name = "dev-name", .type = YNL_PT_NUL_STR, }, [DEVLINK_ATTR_PORT_INDEX] = { .name = "port-index", .type = YNL_PT_U32, }, + [DEVLINK_ATTR_SB_INDEX] = { .name = "sb-index", .type = YNL_PT_U32, }, + [DEVLINK_ATTR_SB_POOL_INDEX] = { .name = "sb-pool-index", .type = YNL_PT_U16, }, + [DEVLINK_ATTR_SB_POOL_TYPE] = { .name = "sb-pool-type", .type = YNL_PT_U8, }, + [DEVLINK_ATTR_SB_TC_INDEX] = { .name = "sb-tc-index", .type = YNL_PT_U16, }, + [DEVLINK_ATTR_PARAM_NAME] = { .name = "param-name", .type = YNL_PT_NUL_STR, }, + [DEVLINK_ATTR_REGION_NAME] = { .name = "region-name", .type = YNL_PT_NUL_STR, }, [DEVLINK_ATTR_INFO_DRIVER_NAME] = { .name = "info-driver-name", .type = YNL_PT_NUL_STR, }, [DEVLINK_ATTR_INFO_SERIAL_NUMBER] = { .name = "info-serial-number", .type = YNL_PT_NUL_STR, }, [DEVLINK_ATTR_INFO_VERSION_FIXED] = { .name = "info-version-fixed", .type = YNL_PT_NEST, .nest = &devlink_dl_info_version_nest, }, @@ -95,7 +127,11 @@ struct ynl_policy_attr devlink_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_INFO_VERSION_STORED] = { .name = "info-version-stored", .type = YNL_PT_NEST, .nest = &devlink_dl_info_version_nest, }, [DEVLINK_ATTR_INFO_VERSION_NAME] = { .name = "info-version-name", .type = YNL_PT_NUL_STR, }, [DEVLINK_ATTR_INFO_VERSION_VALUE] = { .name = "info-version-value", .type = YNL_PT_NUL_STR, }, + [DEVLINK_ATTR_HEALTH_REPORTER_NAME] = { .name = "health-reporter-name", .type = YNL_PT_NUL_STR, }, + [DEVLINK_ATTR_TRAP_NAME] = { .name = "trap-name", .type = YNL_PT_NUL_STR, }, + [DEVLINK_ATTR_TRAP_GROUP_NAME] = { .name = "trap-group-name", .type = YNL_PT_NUL_STR, }, [DEVLINK_ATTR_RELOAD_FAILED] = { .name = "reload-failed", .type = YNL_PT_U8, }, + [DEVLINK_ATTR_TRAP_POLICER_ID] = { .name = "trap-policer-id", .type = YNL_PT_U32, }, [DEVLINK_ATTR_RELOAD_ACTION] = { .name = "reload-action", .type = YNL_PT_U8, }, [DEVLINK_ATTR_DEV_STATS] = { .name = "dev-stats", .type = YNL_PT_NEST, .nest = &devlink_dl_dev_stats_nest, }, [DEVLINK_ATTR_RELOAD_STATS] = { .name = "reload-stats", .type = YNL_PT_NEST, .nest = &devlink_dl_reload_stats_nest, }, @@ -105,6 +141,8 @@ struct ynl_policy_attr devlink_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_REMOTE_RELOAD_STATS] = { .name = "remote-reload-stats", .type = YNL_PT_NEST, .nest = &devlink_dl_reload_stats_nest, }, [DEVLINK_ATTR_RELOAD_ACTION_INFO] = { .name = "reload-action-info", .type = YNL_PT_NEST, .nest = &devlink_dl_reload_act_info_nest, }, [DEVLINK_ATTR_RELOAD_ACTION_STATS] = { .name = "reload-action-stats", .type = YNL_PT_NEST, .nest = &devlink_dl_reload_act_stats_nest, }, + [DEVLINK_ATTR_RATE_NODE_NAME] = { .name = "rate-node-name", .type = YNL_PT_NUL_STR, }, + [DEVLINK_ATTR_LINECARD_INDEX] = { .name = "linecard-index", .type = YNL_PT_U32, }, }; struct ynl_policy_nest devlink_nest = { @@ -531,55 +569,29 @@ free_list: return NULL; } -/* ============== DEVLINK_CMD_INFO_GET ============== */ -/* DEVLINK_CMD_INFO_GET - do */ -void devlink_info_get_req_free(struct devlink_info_get_req *req) +/* ============== DEVLINK_CMD_PORT_GET ============== */ +/* DEVLINK_CMD_PORT_GET - do */ +void devlink_port_get_req_free(struct devlink_port_get_req *req) { free(req->bus_name); free(req->dev_name); free(req); } -void 
devlink_info_get_rsp_free(struct devlink_info_get_rsp *rsp) +void devlink_port_get_rsp_free(struct devlink_port_get_rsp *rsp) { - unsigned int i; - free(rsp->bus_name); free(rsp->dev_name); - free(rsp->info_driver_name); - free(rsp->info_serial_number); - for (i = 0; i < rsp->n_info_version_fixed; i++) - devlink_dl_info_version_free(&rsp->info_version_fixed[i]); - free(rsp->info_version_fixed); - for (i = 0; i < rsp->n_info_version_running; i++) - devlink_dl_info_version_free(&rsp->info_version_running[i]); - free(rsp->info_version_running); - for (i = 0; i < rsp->n_info_version_stored; i++) - devlink_dl_info_version_free(&rsp->info_version_stored[i]); - free(rsp->info_version_stored); free(rsp); } -int devlink_info_get_rsp_parse(const struct nlmsghdr *nlh, void *data) +int devlink_port_get_rsp_parse(const struct nlmsghdr *nlh, void *data) { - unsigned int n_info_version_running = 0; - unsigned int n_info_version_stored = 0; - unsigned int n_info_version_fixed = 0; struct ynl_parse_arg *yarg = data; - struct devlink_info_get_rsp *dst; + struct devlink_port_get_rsp *dst; const struct nlattr *attr; - struct ynl_parse_arg parg; - int i; dst = yarg->data; - parg.ys = yarg->ys; - - if (dst->info_version_fixed) - return ynl_error_parse(yarg, "attribute already present (devlink.info-version-fixed)"); - if (dst->info_version_running) - return ynl_error_parse(yarg, "attribute already present (devlink.info-version-running)"); - if (dst->info_version_stored) - return ynl_error_parse(yarg, "attribute already present (devlink.info-version-stored)"); mnl_attr_for_each(attr, nlh, sizeof(struct genlmsghdr)) { unsigned int type = mnl_attr_get_type(attr); @@ -606,92 +618,205 @@ int devlink_info_get_rsp_parse(const struct nlmsghdr *nlh, void *data) dst->dev_name = malloc(len + 1); memcpy(dst->dev_name, mnl_attr_get_str(attr), len); dst->dev_name[len] = 0; - } else if (type == DEVLINK_ATTR_INFO_DRIVER_NAME) { + } else if (type == DEVLINK_ATTR_PORT_INDEX) { + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + dst->_present.port_index = 1; + dst->port_index = mnl_attr_get_u32(attr); + } + } + + return MNL_CB_OK; +} + +struct devlink_port_get_rsp * +devlink_port_get(struct ynl_sock *ys, struct devlink_port_get_req *req) +{ + struct ynl_req_state yrs = { .yarg = { .ys = ys, }, }; + struct devlink_port_get_rsp *rsp; + struct nlmsghdr *nlh; + int err; + + nlh = ynl_gemsg_start_req(ys, ys->family_id, DEVLINK_CMD_PORT_GET, 1); + ys->req_policy = &devlink_nest; + yrs.yarg.rsp_policy = &devlink_nest; + + if (req->_present.bus_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_BUS_NAME, req->bus_name); + if (req->_present.dev_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_DEV_NAME, req->dev_name); + if (req->_present.port_index) + mnl_attr_put_u32(nlh, DEVLINK_ATTR_PORT_INDEX, req->port_index); + + rsp = calloc(1, sizeof(*rsp)); + yrs.yarg.data = rsp; + yrs.cb = devlink_port_get_rsp_parse; + yrs.rsp_cmd = 7; + + err = ynl_exec(ys, nlh, &yrs); + if (err < 0) + goto err_free; + + return rsp; + +err_free: + devlink_port_get_rsp_free(rsp); + return NULL; +} + +/* DEVLINK_CMD_PORT_GET - dump */ +int devlink_port_get_rsp_dump_parse(const struct nlmsghdr *nlh, void *data) +{ + struct devlink_port_get_rsp_dump *dst; + struct ynl_parse_arg *yarg = data; + const struct nlattr *attr; + + dst = yarg->data; + + mnl_attr_for_each(attr, nlh, sizeof(struct genlmsghdr)) { + unsigned int type = mnl_attr_get_type(attr); + + if (type == DEVLINK_ATTR_BUS_NAME) { unsigned int len; if (ynl_attr_validate(yarg, attr)) return 
MNL_CB_ERROR; len = strnlen(mnl_attr_get_str(attr), mnl_attr_get_payload_len(attr)); - dst->_present.info_driver_name_len = len; - dst->info_driver_name = malloc(len + 1); - memcpy(dst->info_driver_name, mnl_attr_get_str(attr), len); - dst->info_driver_name[len] = 0; - } else if (type == DEVLINK_ATTR_INFO_SERIAL_NUMBER) { + dst->_present.bus_name_len = len; + dst->bus_name = malloc(len + 1); + memcpy(dst->bus_name, mnl_attr_get_str(attr), len); + dst->bus_name[len] = 0; + } else if (type == DEVLINK_ATTR_DEV_NAME) { unsigned int len; if (ynl_attr_validate(yarg, attr)) return MNL_CB_ERROR; len = strnlen(mnl_attr_get_str(attr), mnl_attr_get_payload_len(attr)); - dst->_present.info_serial_number_len = len; - dst->info_serial_number = malloc(len + 1); - memcpy(dst->info_serial_number, mnl_attr_get_str(attr), len); - dst->info_serial_number[len] = 0; - } else if (type == DEVLINK_ATTR_INFO_VERSION_FIXED) { - n_info_version_fixed++; - } else if (type == DEVLINK_ATTR_INFO_VERSION_RUNNING) { - n_info_version_running++; - } else if (type == DEVLINK_ATTR_INFO_VERSION_STORED) { - n_info_version_stored++; + dst->_present.dev_name_len = len; + dst->dev_name = malloc(len + 1); + memcpy(dst->dev_name, mnl_attr_get_str(attr), len); + dst->dev_name[len] = 0; + } else if (type == DEVLINK_ATTR_PORT_INDEX) { + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + dst->_present.port_index = 1; + dst->port_index = mnl_attr_get_u32(attr); } } - if (n_info_version_fixed) { - dst->info_version_fixed = calloc(n_info_version_fixed, sizeof(*dst->info_version_fixed)); - dst->n_info_version_fixed = n_info_version_fixed; - i = 0; - parg.rsp_policy = &devlink_dl_info_version_nest; - mnl_attr_for_each(attr, nlh, sizeof(struct genlmsghdr)) { - if (mnl_attr_get_type(attr) == DEVLINK_ATTR_INFO_VERSION_FIXED) { - parg.data = &dst->info_version_fixed[i]; - if (devlink_dl_info_version_parse(&parg, attr)) - return MNL_CB_ERROR; - i++; - } - } - } - if (n_info_version_running) { - dst->info_version_running = calloc(n_info_version_running, sizeof(*dst->info_version_running)); - dst->n_info_version_running = n_info_version_running; - i = 0; - parg.rsp_policy = &devlink_dl_info_version_nest; - mnl_attr_for_each(attr, nlh, sizeof(struct genlmsghdr)) { - if (mnl_attr_get_type(attr) == DEVLINK_ATTR_INFO_VERSION_RUNNING) { - parg.data = &dst->info_version_running[i]; - if (devlink_dl_info_version_parse(&parg, attr)) - return MNL_CB_ERROR; - i++; - } - } + return MNL_CB_OK; +} + +void devlink_port_get_rsp_list_free(struct devlink_port_get_rsp_list *rsp) +{ + struct devlink_port_get_rsp_list *next = rsp; + + while ((void *)next != YNL_LIST_END) { + rsp = next; + next = rsp->next; + + free(rsp->obj.bus_name); + free(rsp->obj.dev_name); + free(rsp); } - if (n_info_version_stored) { - dst->info_version_stored = calloc(n_info_version_stored, sizeof(*dst->info_version_stored)); - dst->n_info_version_stored = n_info_version_stored; - i = 0; - parg.rsp_policy = &devlink_dl_info_version_nest; - mnl_attr_for_each(attr, nlh, sizeof(struct genlmsghdr)) { - if (mnl_attr_get_type(attr) == DEVLINK_ATTR_INFO_VERSION_STORED) { - parg.data = &dst->info_version_stored[i]; - if (devlink_dl_info_version_parse(&parg, attr)) - return MNL_CB_ERROR; - i++; - } +} + +struct devlink_port_get_rsp_list *devlink_port_get_dump(struct ynl_sock *ys) +{ + struct ynl_dump_state yds = {}; + struct nlmsghdr *nlh; + int err; + + yds.ys = ys; + yds.alloc_sz = sizeof(struct devlink_port_get_rsp_list); + yds.cb = devlink_port_get_rsp_dump_parse; + yds.rsp_cmd = 7; + 
yds.rsp_policy = &devlink_nest; + + nlh = ynl_gemsg_start_dump(ys, ys->family_id, DEVLINK_CMD_PORT_GET, 1); + + err = ynl_exec_dump(ys, nlh, &yds); + if (err < 0) + goto free_list; + + return yds.first; + +free_list: + devlink_port_get_rsp_list_free(yds.first); + return NULL; +} + +/* ============== DEVLINK_CMD_SB_GET ============== */ +/* DEVLINK_CMD_SB_GET - do */ +void devlink_sb_get_req_free(struct devlink_sb_get_req *req) +{ + free(req->bus_name); + free(req->dev_name); + free(req); +} + +void devlink_sb_get_rsp_free(struct devlink_sb_get_rsp *rsp) +{ + free(rsp->bus_name); + free(rsp->dev_name); + free(rsp); +} + +int devlink_sb_get_rsp_parse(const struct nlmsghdr *nlh, void *data) +{ + struct ynl_parse_arg *yarg = data; + struct devlink_sb_get_rsp *dst; + const struct nlattr *attr; + + dst = yarg->data; + + mnl_attr_for_each(attr, nlh, sizeof(struct genlmsghdr)) { + unsigned int type = mnl_attr_get_type(attr); + + if (type == DEVLINK_ATTR_BUS_NAME) { + unsigned int len; + + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + + len = strnlen(mnl_attr_get_str(attr), mnl_attr_get_payload_len(attr)); + dst->_present.bus_name_len = len; + dst->bus_name = malloc(len + 1); + memcpy(dst->bus_name, mnl_attr_get_str(attr), len); + dst->bus_name[len] = 0; + } else if (type == DEVLINK_ATTR_DEV_NAME) { + unsigned int len; + + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + + len = strnlen(mnl_attr_get_str(attr), mnl_attr_get_payload_len(attr)); + dst->_present.dev_name_len = len; + dst->dev_name = malloc(len + 1); + memcpy(dst->dev_name, mnl_attr_get_str(attr), len); + dst->dev_name[len] = 0; + } else if (type == DEVLINK_ATTR_SB_INDEX) { + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + dst->_present.sb_index = 1; + dst->sb_index = mnl_attr_get_u32(attr); } } return MNL_CB_OK; } -struct devlink_info_get_rsp * -devlink_info_get(struct ynl_sock *ys, struct devlink_info_get_req *req) +struct devlink_sb_get_rsp * +devlink_sb_get(struct ynl_sock *ys, struct devlink_sb_get_req *req) { struct ynl_req_state yrs = { .yarg = { .ys = ys, }, }; - struct devlink_info_get_rsp *rsp; + struct devlink_sb_get_rsp *rsp; struct nlmsghdr *nlh; int err; - nlh = ynl_gemsg_start_req(ys, ys->family_id, DEVLINK_CMD_INFO_GET, 1); + nlh = ynl_gemsg_start_req(ys, ys->family_id, DEVLINK_CMD_SB_GET, 1); ys->req_policy = &devlink_nest; yrs.yarg.rsp_policy = &devlink_nest; @@ -699,11 +824,13 @@ devlink_info_get(struct ynl_sock *ys, struct devlink_info_get_req *req) mnl_attr_put_strz(nlh, DEVLINK_ATTR_BUS_NAME, req->bus_name); if (req->_present.dev_name_len) mnl_attr_put_strz(nlh, DEVLINK_ATTR_DEV_NAME, req->dev_name); + if (req->_present.sb_index) + mnl_attr_put_u32(nlh, DEVLINK_ATTR_SB_INDEX, req->sb_index); rsp = calloc(1, sizeof(*rsp)); yrs.yarg.data = rsp; - yrs.cb = devlink_info_get_rsp_parse; - yrs.rsp_cmd = DEVLINK_CMD_INFO_GET; + yrs.cb = devlink_sb_get_rsp_parse; + yrs.rsp_cmd = DEVLINK_CMD_SB_GET; err = ynl_exec(ys, nlh, &yrs); if (err < 0) @@ -712,51 +839,38 @@ devlink_info_get(struct ynl_sock *ys, struct devlink_info_get_req *req) return rsp; err_free: - devlink_info_get_rsp_free(rsp); + devlink_sb_get_rsp_free(rsp); return NULL; } -/* DEVLINK_CMD_INFO_GET - dump */ -void devlink_info_get_list_free(struct devlink_info_get_list *rsp) +/* DEVLINK_CMD_SB_GET - dump */ +void devlink_sb_get_list_free(struct devlink_sb_get_list *rsp) { - struct devlink_info_get_list *next = rsp; + struct devlink_sb_get_list *next = rsp; while ((void *)next != YNL_LIST_END) { - unsigned int i; - rsp = next; 
next = rsp->next; free(rsp->obj.bus_name); free(rsp->obj.dev_name); - free(rsp->obj.info_driver_name); - free(rsp->obj.info_serial_number); - for (i = 0; i < rsp->obj.n_info_version_fixed; i++) - devlink_dl_info_version_free(&rsp->obj.info_version_fixed[i]); - free(rsp->obj.info_version_fixed); - for (i = 0; i < rsp->obj.n_info_version_running; i++) - devlink_dl_info_version_free(&rsp->obj.info_version_running[i]); - free(rsp->obj.info_version_running); - for (i = 0; i < rsp->obj.n_info_version_stored; i++) - devlink_dl_info_version_free(&rsp->obj.info_version_stored[i]); - free(rsp->obj.info_version_stored); free(rsp); } } -struct devlink_info_get_list *devlink_info_get_dump(struct ynl_sock *ys) +struct devlink_sb_get_list *devlink_sb_get_dump(struct ynl_sock *ys) { struct ynl_dump_state yds = {}; struct nlmsghdr *nlh; int err; yds.ys = ys; - yds.alloc_sz = sizeof(struct devlink_info_get_list); - yds.cb = devlink_info_get_rsp_parse; - yds.rsp_cmd = DEVLINK_CMD_INFO_GET; + yds.alloc_sz = sizeof(struct devlink_sb_get_list); + yds.cb = devlink_sb_get_rsp_parse; + yds.rsp_cmd = DEVLINK_CMD_SB_GET; yds.rsp_policy = &devlink_nest; - nlh = ynl_gemsg_start_dump(ys, ys->family_id, DEVLINK_CMD_INFO_GET, 1); + nlh = ynl_gemsg_start_dump(ys, ys->family_id, DEVLINK_CMD_SB_GET, 1); err = ynl_exec_dump(ys, nlh, &yds); if (err < 0) @@ -765,7 +879,2002 @@ struct devlink_info_get_list *devlink_info_get_dump(struct ynl_sock *ys) return yds.first; free_list: - devlink_info_get_list_free(yds.first); + devlink_sb_get_list_free(yds.first); + return NULL; +} + +/* ============== DEVLINK_CMD_SB_POOL_GET ============== */ +/* DEVLINK_CMD_SB_POOL_GET - do */ +void devlink_sb_pool_get_req_free(struct devlink_sb_pool_get_req *req) +{ + free(req->bus_name); + free(req->dev_name); + free(req); +} + +void devlink_sb_pool_get_rsp_free(struct devlink_sb_pool_get_rsp *rsp) +{ + free(rsp->bus_name); + free(rsp->dev_name); + free(rsp); +} + +int devlink_sb_pool_get_rsp_parse(const struct nlmsghdr *nlh, void *data) +{ + struct devlink_sb_pool_get_rsp *dst; + struct ynl_parse_arg *yarg = data; + const struct nlattr *attr; + + dst = yarg->data; + + mnl_attr_for_each(attr, nlh, sizeof(struct genlmsghdr)) { + unsigned int type = mnl_attr_get_type(attr); + + if (type == DEVLINK_ATTR_BUS_NAME) { + unsigned int len; + + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + + len = strnlen(mnl_attr_get_str(attr), mnl_attr_get_payload_len(attr)); + dst->_present.bus_name_len = len; + dst->bus_name = malloc(len + 1); + memcpy(dst->bus_name, mnl_attr_get_str(attr), len); + dst->bus_name[len] = 0; + } else if (type == DEVLINK_ATTR_DEV_NAME) { + unsigned int len; + + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + + len = strnlen(mnl_attr_get_str(attr), mnl_attr_get_payload_len(attr)); + dst->_present.dev_name_len = len; + dst->dev_name = malloc(len + 1); + memcpy(dst->dev_name, mnl_attr_get_str(attr), len); + dst->dev_name[len] = 0; + } else if (type == DEVLINK_ATTR_SB_INDEX) { + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + dst->_present.sb_index = 1; + dst->sb_index = mnl_attr_get_u32(attr); + } else if (type == DEVLINK_ATTR_SB_POOL_INDEX) { + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + dst->_present.sb_pool_index = 1; + dst->sb_pool_index = mnl_attr_get_u16(attr); + } + } + + return MNL_CB_OK; +} + +struct devlink_sb_pool_get_rsp * +devlink_sb_pool_get(struct ynl_sock *ys, struct devlink_sb_pool_get_req *req) +{ + struct ynl_req_state yrs = { .yarg = { .ys = ys, }, }; + struct 
devlink_sb_pool_get_rsp *rsp; + struct nlmsghdr *nlh; + int err; + + nlh = ynl_gemsg_start_req(ys, ys->family_id, DEVLINK_CMD_SB_POOL_GET, 1); + ys->req_policy = &devlink_nest; + yrs.yarg.rsp_policy = &devlink_nest; + + if (req->_present.bus_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_BUS_NAME, req->bus_name); + if (req->_present.dev_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_DEV_NAME, req->dev_name); + if (req->_present.sb_index) + mnl_attr_put_u32(nlh, DEVLINK_ATTR_SB_INDEX, req->sb_index); + if (req->_present.sb_pool_index) + mnl_attr_put_u16(nlh, DEVLINK_ATTR_SB_POOL_INDEX, req->sb_pool_index); + + rsp = calloc(1, sizeof(*rsp)); + yrs.yarg.data = rsp; + yrs.cb = devlink_sb_pool_get_rsp_parse; + yrs.rsp_cmd = DEVLINK_CMD_SB_POOL_GET; + + err = ynl_exec(ys, nlh, &yrs); + if (err < 0) + goto err_free; + + return rsp; + +err_free: + devlink_sb_pool_get_rsp_free(rsp); + return NULL; +} + +/* DEVLINK_CMD_SB_POOL_GET - dump */ +void devlink_sb_pool_get_list_free(struct devlink_sb_pool_get_list *rsp) +{ + struct devlink_sb_pool_get_list *next = rsp; + + while ((void *)next != YNL_LIST_END) { + rsp = next; + next = rsp->next; + + free(rsp->obj.bus_name); + free(rsp->obj.dev_name); + free(rsp); + } +} + +struct devlink_sb_pool_get_list *devlink_sb_pool_get_dump(struct ynl_sock *ys) +{ + struct ynl_dump_state yds = {}; + struct nlmsghdr *nlh; + int err; + + yds.ys = ys; + yds.alloc_sz = sizeof(struct devlink_sb_pool_get_list); + yds.cb = devlink_sb_pool_get_rsp_parse; + yds.rsp_cmd = DEVLINK_CMD_SB_POOL_GET; + yds.rsp_policy = &devlink_nest; + + nlh = ynl_gemsg_start_dump(ys, ys->family_id, DEVLINK_CMD_SB_POOL_GET, 1); + + err = ynl_exec_dump(ys, nlh, &yds); + if (err < 0) + goto free_list; + + return yds.first; + +free_list: + devlink_sb_pool_get_list_free(yds.first); + return NULL; +} + +/* ============== DEVLINK_CMD_SB_PORT_POOL_GET ============== */ +/* DEVLINK_CMD_SB_PORT_POOL_GET - do */ +void +devlink_sb_port_pool_get_req_free(struct devlink_sb_port_pool_get_req *req) +{ + free(req->bus_name); + free(req->dev_name); + free(req); +} + +void +devlink_sb_port_pool_get_rsp_free(struct devlink_sb_port_pool_get_rsp *rsp) +{ + free(rsp->bus_name); + free(rsp->dev_name); + free(rsp); +} + +int devlink_sb_port_pool_get_rsp_parse(const struct nlmsghdr *nlh, void *data) +{ + struct devlink_sb_port_pool_get_rsp *dst; + struct ynl_parse_arg *yarg = data; + const struct nlattr *attr; + + dst = yarg->data; + + mnl_attr_for_each(attr, nlh, sizeof(struct genlmsghdr)) { + unsigned int type = mnl_attr_get_type(attr); + + if (type == DEVLINK_ATTR_BUS_NAME) { + unsigned int len; + + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + + len = strnlen(mnl_attr_get_str(attr), mnl_attr_get_payload_len(attr)); + dst->_present.bus_name_len = len; + dst->bus_name = malloc(len + 1); + memcpy(dst->bus_name, mnl_attr_get_str(attr), len); + dst->bus_name[len] = 0; + } else if (type == DEVLINK_ATTR_DEV_NAME) { + unsigned int len; + + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + + len = strnlen(mnl_attr_get_str(attr), mnl_attr_get_payload_len(attr)); + dst->_present.dev_name_len = len; + dst->dev_name = malloc(len + 1); + memcpy(dst->dev_name, mnl_attr_get_str(attr), len); + dst->dev_name[len] = 0; + } else if (type == DEVLINK_ATTR_PORT_INDEX) { + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + dst->_present.port_index = 1; + dst->port_index = mnl_attr_get_u32(attr); + } else if (type == DEVLINK_ATTR_SB_INDEX) { + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + 
dst->_present.sb_index = 1; + dst->sb_index = mnl_attr_get_u32(attr); + } else if (type == DEVLINK_ATTR_SB_POOL_INDEX) { + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + dst->_present.sb_pool_index = 1; + dst->sb_pool_index = mnl_attr_get_u16(attr); + } + } + + return MNL_CB_OK; +} + +struct devlink_sb_port_pool_get_rsp * +devlink_sb_port_pool_get(struct ynl_sock *ys, + struct devlink_sb_port_pool_get_req *req) +{ + struct ynl_req_state yrs = { .yarg = { .ys = ys, }, }; + struct devlink_sb_port_pool_get_rsp *rsp; + struct nlmsghdr *nlh; + int err; + + nlh = ynl_gemsg_start_req(ys, ys->family_id, DEVLINK_CMD_SB_PORT_POOL_GET, 1); + ys->req_policy = &devlink_nest; + yrs.yarg.rsp_policy = &devlink_nest; + + if (req->_present.bus_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_BUS_NAME, req->bus_name); + if (req->_present.dev_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_DEV_NAME, req->dev_name); + if (req->_present.port_index) + mnl_attr_put_u32(nlh, DEVLINK_ATTR_PORT_INDEX, req->port_index); + if (req->_present.sb_index) + mnl_attr_put_u32(nlh, DEVLINK_ATTR_SB_INDEX, req->sb_index); + if (req->_present.sb_pool_index) + mnl_attr_put_u16(nlh, DEVLINK_ATTR_SB_POOL_INDEX, req->sb_pool_index); + + rsp = calloc(1, sizeof(*rsp)); + yrs.yarg.data = rsp; + yrs.cb = devlink_sb_port_pool_get_rsp_parse; + yrs.rsp_cmd = DEVLINK_CMD_SB_PORT_POOL_GET; + + err = ynl_exec(ys, nlh, &yrs); + if (err < 0) + goto err_free; + + return rsp; + +err_free: + devlink_sb_port_pool_get_rsp_free(rsp); + return NULL; +} + +/* DEVLINK_CMD_SB_PORT_POOL_GET - dump */ +void +devlink_sb_port_pool_get_list_free(struct devlink_sb_port_pool_get_list *rsp) +{ + struct devlink_sb_port_pool_get_list *next = rsp; + + while ((void *)next != YNL_LIST_END) { + rsp = next; + next = rsp->next; + + free(rsp->obj.bus_name); + free(rsp->obj.dev_name); + free(rsp); + } +} + +struct devlink_sb_port_pool_get_list * +devlink_sb_port_pool_get_dump(struct ynl_sock *ys) +{ + struct ynl_dump_state yds = {}; + struct nlmsghdr *nlh; + int err; + + yds.ys = ys; + yds.alloc_sz = sizeof(struct devlink_sb_port_pool_get_list); + yds.cb = devlink_sb_port_pool_get_rsp_parse; + yds.rsp_cmd = DEVLINK_CMD_SB_PORT_POOL_GET; + yds.rsp_policy = &devlink_nest; + + nlh = ynl_gemsg_start_dump(ys, ys->family_id, DEVLINK_CMD_SB_PORT_POOL_GET, 1); + + err = ynl_exec_dump(ys, nlh, &yds); + if (err < 0) + goto free_list; + + return yds.first; + +free_list: + devlink_sb_port_pool_get_list_free(yds.first); + return NULL; +} + +/* ============== DEVLINK_CMD_SB_TC_POOL_BIND_GET ============== */ +/* DEVLINK_CMD_SB_TC_POOL_BIND_GET - do */ +void +devlink_sb_tc_pool_bind_get_req_free(struct devlink_sb_tc_pool_bind_get_req *req) +{ + free(req->bus_name); + free(req->dev_name); + free(req); +} + +void +devlink_sb_tc_pool_bind_get_rsp_free(struct devlink_sb_tc_pool_bind_get_rsp *rsp) +{ + free(rsp->bus_name); + free(rsp->dev_name); + free(rsp); +} + +int devlink_sb_tc_pool_bind_get_rsp_parse(const struct nlmsghdr *nlh, + void *data) +{ + struct devlink_sb_tc_pool_bind_get_rsp *dst; + struct ynl_parse_arg *yarg = data; + const struct nlattr *attr; + + dst = yarg->data; + + mnl_attr_for_each(attr, nlh, sizeof(struct genlmsghdr)) { + unsigned int type = mnl_attr_get_type(attr); + + if (type == DEVLINK_ATTR_BUS_NAME) { + unsigned int len; + + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + + len = strnlen(mnl_attr_get_str(attr), mnl_attr_get_payload_len(attr)); + dst->_present.bus_name_len = len; + dst->bus_name = malloc(len + 1); + memcpy(dst->bus_name, 
mnl_attr_get_str(attr), len); + dst->bus_name[len] = 0; + } else if (type == DEVLINK_ATTR_DEV_NAME) { + unsigned int len; + + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + + len = strnlen(mnl_attr_get_str(attr), mnl_attr_get_payload_len(attr)); + dst->_present.dev_name_len = len; + dst->dev_name = malloc(len + 1); + memcpy(dst->dev_name, mnl_attr_get_str(attr), len); + dst->dev_name[len] = 0; + } else if (type == DEVLINK_ATTR_PORT_INDEX) { + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + dst->_present.port_index = 1; + dst->port_index = mnl_attr_get_u32(attr); + } else if (type == DEVLINK_ATTR_SB_INDEX) { + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + dst->_present.sb_index = 1; + dst->sb_index = mnl_attr_get_u32(attr); + } else if (type == DEVLINK_ATTR_SB_POOL_TYPE) { + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + dst->_present.sb_pool_type = 1; + dst->sb_pool_type = mnl_attr_get_u8(attr); + } else if (type == DEVLINK_ATTR_SB_TC_INDEX) { + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + dst->_present.sb_tc_index = 1; + dst->sb_tc_index = mnl_attr_get_u16(attr); + } + } + + return MNL_CB_OK; +} + +struct devlink_sb_tc_pool_bind_get_rsp * +devlink_sb_tc_pool_bind_get(struct ynl_sock *ys, + struct devlink_sb_tc_pool_bind_get_req *req) +{ + struct ynl_req_state yrs = { .yarg = { .ys = ys, }, }; + struct devlink_sb_tc_pool_bind_get_rsp *rsp; + struct nlmsghdr *nlh; + int err; + + nlh = ynl_gemsg_start_req(ys, ys->family_id, DEVLINK_CMD_SB_TC_POOL_BIND_GET, 1); + ys->req_policy = &devlink_nest; + yrs.yarg.rsp_policy = &devlink_nest; + + if (req->_present.bus_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_BUS_NAME, req->bus_name); + if (req->_present.dev_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_DEV_NAME, req->dev_name); + if (req->_present.port_index) + mnl_attr_put_u32(nlh, DEVLINK_ATTR_PORT_INDEX, req->port_index); + if (req->_present.sb_index) + mnl_attr_put_u32(nlh, DEVLINK_ATTR_SB_INDEX, req->sb_index); + if (req->_present.sb_pool_type) + mnl_attr_put_u8(nlh, DEVLINK_ATTR_SB_POOL_TYPE, req->sb_pool_type); + if (req->_present.sb_tc_index) + mnl_attr_put_u16(nlh, DEVLINK_ATTR_SB_TC_INDEX, req->sb_tc_index); + + rsp = calloc(1, sizeof(*rsp)); + yrs.yarg.data = rsp; + yrs.cb = devlink_sb_tc_pool_bind_get_rsp_parse; + yrs.rsp_cmd = DEVLINK_CMD_SB_TC_POOL_BIND_GET; + + err = ynl_exec(ys, nlh, &yrs); + if (err < 0) + goto err_free; + + return rsp; + +err_free: + devlink_sb_tc_pool_bind_get_rsp_free(rsp); + return NULL; +} + +/* DEVLINK_CMD_SB_TC_POOL_BIND_GET - dump */ +void +devlink_sb_tc_pool_bind_get_list_free(struct devlink_sb_tc_pool_bind_get_list *rsp) +{ + struct devlink_sb_tc_pool_bind_get_list *next = rsp; + + while ((void *)next != YNL_LIST_END) { + rsp = next; + next = rsp->next; + + free(rsp->obj.bus_name); + free(rsp->obj.dev_name); + free(rsp); + } +} + +struct devlink_sb_tc_pool_bind_get_list * +devlink_sb_tc_pool_bind_get_dump(struct ynl_sock *ys) +{ + struct ynl_dump_state yds = {}; + struct nlmsghdr *nlh; + int err; + + yds.ys = ys; + yds.alloc_sz = sizeof(struct devlink_sb_tc_pool_bind_get_list); + yds.cb = devlink_sb_tc_pool_bind_get_rsp_parse; + yds.rsp_cmd = DEVLINK_CMD_SB_TC_POOL_BIND_GET; + yds.rsp_policy = &devlink_nest; + + nlh = ynl_gemsg_start_dump(ys, ys->family_id, DEVLINK_CMD_SB_TC_POOL_BIND_GET, 1); + + err = ynl_exec_dump(ys, nlh, &yds); + if (err < 0) + goto free_list; + + return yds.first; + +free_list: + devlink_sb_tc_pool_bind_get_list_free(yds.first); + return NULL; +} + +/* 
============== DEVLINK_CMD_PARAM_GET ============== */ +/* DEVLINK_CMD_PARAM_GET - do */ +void devlink_param_get_req_free(struct devlink_param_get_req *req) +{ + free(req->bus_name); + free(req->dev_name); + free(req->param_name); + free(req); +} + +void devlink_param_get_rsp_free(struct devlink_param_get_rsp *rsp) +{ + free(rsp->bus_name); + free(rsp->dev_name); + free(rsp->param_name); + free(rsp); +} + +int devlink_param_get_rsp_parse(const struct nlmsghdr *nlh, void *data) +{ + struct devlink_param_get_rsp *dst; + struct ynl_parse_arg *yarg = data; + const struct nlattr *attr; + + dst = yarg->data; + + mnl_attr_for_each(attr, nlh, sizeof(struct genlmsghdr)) { + unsigned int type = mnl_attr_get_type(attr); + + if (type == DEVLINK_ATTR_BUS_NAME) { + unsigned int len; + + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + + len = strnlen(mnl_attr_get_str(attr), mnl_attr_get_payload_len(attr)); + dst->_present.bus_name_len = len; + dst->bus_name = malloc(len + 1); + memcpy(dst->bus_name, mnl_attr_get_str(attr), len); + dst->bus_name[len] = 0; + } else if (type == DEVLINK_ATTR_DEV_NAME) { + unsigned int len; + + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + + len = strnlen(mnl_attr_get_str(attr), mnl_attr_get_payload_len(attr)); + dst->_present.dev_name_len = len; + dst->dev_name = malloc(len + 1); + memcpy(dst->dev_name, mnl_attr_get_str(attr), len); + dst->dev_name[len] = 0; + } else if (type == DEVLINK_ATTR_PARAM_NAME) { + unsigned int len; + + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + + len = strnlen(mnl_attr_get_str(attr), mnl_attr_get_payload_len(attr)); + dst->_present.param_name_len = len; + dst->param_name = malloc(len + 1); + memcpy(dst->param_name, mnl_attr_get_str(attr), len); + dst->param_name[len] = 0; + } + } + + return MNL_CB_OK; +} + +struct devlink_param_get_rsp * +devlink_param_get(struct ynl_sock *ys, struct devlink_param_get_req *req) +{ + struct ynl_req_state yrs = { .yarg = { .ys = ys, }, }; + struct devlink_param_get_rsp *rsp; + struct nlmsghdr *nlh; + int err; + + nlh = ynl_gemsg_start_req(ys, ys->family_id, DEVLINK_CMD_PARAM_GET, 1); + ys->req_policy = &devlink_nest; + yrs.yarg.rsp_policy = &devlink_nest; + + if (req->_present.bus_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_BUS_NAME, req->bus_name); + if (req->_present.dev_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_DEV_NAME, req->dev_name); + if (req->_present.param_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_PARAM_NAME, req->param_name); + + rsp = calloc(1, sizeof(*rsp)); + yrs.yarg.data = rsp; + yrs.cb = devlink_param_get_rsp_parse; + yrs.rsp_cmd = DEVLINK_CMD_PARAM_GET; + + err = ynl_exec(ys, nlh, &yrs); + if (err < 0) + goto err_free; + + return rsp; + +err_free: + devlink_param_get_rsp_free(rsp); + return NULL; +} + +/* DEVLINK_CMD_PARAM_GET - dump */ +void devlink_param_get_list_free(struct devlink_param_get_list *rsp) +{ + struct devlink_param_get_list *next = rsp; + + while ((void *)next != YNL_LIST_END) { + rsp = next; + next = rsp->next; + + free(rsp->obj.bus_name); + free(rsp->obj.dev_name); + free(rsp->obj.param_name); + free(rsp); + } +} + +struct devlink_param_get_list *devlink_param_get_dump(struct ynl_sock *ys) +{ + struct ynl_dump_state yds = {}; + struct nlmsghdr *nlh; + int err; + + yds.ys = ys; + yds.alloc_sz = sizeof(struct devlink_param_get_list); + yds.cb = devlink_param_get_rsp_parse; + yds.rsp_cmd = DEVLINK_CMD_PARAM_GET; + yds.rsp_policy = &devlink_nest; + + nlh = ynl_gemsg_start_dump(ys, ys->family_id, DEVLINK_CMD_PARAM_GET, 1); + + 
err = ynl_exec_dump(ys, nlh, &yds); + if (err < 0) + goto free_list; + + return yds.first; + +free_list: + devlink_param_get_list_free(yds.first); + return NULL; +} + +/* ============== DEVLINK_CMD_REGION_GET ============== */ +/* DEVLINK_CMD_REGION_GET - do */ +void devlink_region_get_req_free(struct devlink_region_get_req *req) +{ + free(req->bus_name); + free(req->dev_name); + free(req->region_name); + free(req); +} + +void devlink_region_get_rsp_free(struct devlink_region_get_rsp *rsp) +{ + free(rsp->bus_name); + free(rsp->dev_name); + free(rsp->region_name); + free(rsp); +} + +int devlink_region_get_rsp_parse(const struct nlmsghdr *nlh, void *data) +{ + struct devlink_region_get_rsp *dst; + struct ynl_parse_arg *yarg = data; + const struct nlattr *attr; + + dst = yarg->data; + + mnl_attr_for_each(attr, nlh, sizeof(struct genlmsghdr)) { + unsigned int type = mnl_attr_get_type(attr); + + if (type == DEVLINK_ATTR_BUS_NAME) { + unsigned int len; + + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + + len = strnlen(mnl_attr_get_str(attr), mnl_attr_get_payload_len(attr)); + dst->_present.bus_name_len = len; + dst->bus_name = malloc(len + 1); + memcpy(dst->bus_name, mnl_attr_get_str(attr), len); + dst->bus_name[len] = 0; + } else if (type == DEVLINK_ATTR_DEV_NAME) { + unsigned int len; + + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + + len = strnlen(mnl_attr_get_str(attr), mnl_attr_get_payload_len(attr)); + dst->_present.dev_name_len = len; + dst->dev_name = malloc(len + 1); + memcpy(dst->dev_name, mnl_attr_get_str(attr), len); + dst->dev_name[len] = 0; + } else if (type == DEVLINK_ATTR_PORT_INDEX) { + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + dst->_present.port_index = 1; + dst->port_index = mnl_attr_get_u32(attr); + } else if (type == DEVLINK_ATTR_REGION_NAME) { + unsigned int len; + + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + + len = strnlen(mnl_attr_get_str(attr), mnl_attr_get_payload_len(attr)); + dst->_present.region_name_len = len; + dst->region_name = malloc(len + 1); + memcpy(dst->region_name, mnl_attr_get_str(attr), len); + dst->region_name[len] = 0; + } + } + + return MNL_CB_OK; +} + +struct devlink_region_get_rsp * +devlink_region_get(struct ynl_sock *ys, struct devlink_region_get_req *req) +{ + struct ynl_req_state yrs = { .yarg = { .ys = ys, }, }; + struct devlink_region_get_rsp *rsp; + struct nlmsghdr *nlh; + int err; + + nlh = ynl_gemsg_start_req(ys, ys->family_id, DEVLINK_CMD_REGION_GET, 1); + ys->req_policy = &devlink_nest; + yrs.yarg.rsp_policy = &devlink_nest; + + if (req->_present.bus_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_BUS_NAME, req->bus_name); + if (req->_present.dev_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_DEV_NAME, req->dev_name); + if (req->_present.port_index) + mnl_attr_put_u32(nlh, DEVLINK_ATTR_PORT_INDEX, req->port_index); + if (req->_present.region_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_REGION_NAME, req->region_name); + + rsp = calloc(1, sizeof(*rsp)); + yrs.yarg.data = rsp; + yrs.cb = devlink_region_get_rsp_parse; + yrs.rsp_cmd = DEVLINK_CMD_REGION_GET; + + err = ynl_exec(ys, nlh, &yrs); + if (err < 0) + goto err_free; + + return rsp; + +err_free: + devlink_region_get_rsp_free(rsp); + return NULL; +} + +/* DEVLINK_CMD_REGION_GET - dump */ +void devlink_region_get_list_free(struct devlink_region_get_list *rsp) +{ + struct devlink_region_get_list *next = rsp; + + while ((void *)next != YNL_LIST_END) { + rsp = next; + next = rsp->next; + + free(rsp->obj.bus_name); + 
free(rsp->obj.dev_name); + free(rsp->obj.region_name); + free(rsp); + } +} + +struct devlink_region_get_list *devlink_region_get_dump(struct ynl_sock *ys) +{ + struct ynl_dump_state yds = {}; + struct nlmsghdr *nlh; + int err; + + yds.ys = ys; + yds.alloc_sz = sizeof(struct devlink_region_get_list); + yds.cb = devlink_region_get_rsp_parse; + yds.rsp_cmd = DEVLINK_CMD_REGION_GET; + yds.rsp_policy = &devlink_nest; + + nlh = ynl_gemsg_start_dump(ys, ys->family_id, DEVLINK_CMD_REGION_GET, 1); + + err = ynl_exec_dump(ys, nlh, &yds); + if (err < 0) + goto free_list; + + return yds.first; + +free_list: + devlink_region_get_list_free(yds.first); + return NULL; +} + +/* ============== DEVLINK_CMD_INFO_GET ============== */ +/* DEVLINK_CMD_INFO_GET - do */ +void devlink_info_get_req_free(struct devlink_info_get_req *req) +{ + free(req->bus_name); + free(req->dev_name); + free(req); +} + +void devlink_info_get_rsp_free(struct devlink_info_get_rsp *rsp) +{ + unsigned int i; + + free(rsp->bus_name); + free(rsp->dev_name); + free(rsp->info_driver_name); + free(rsp->info_serial_number); + for (i = 0; i < rsp->n_info_version_fixed; i++) + devlink_dl_info_version_free(&rsp->info_version_fixed[i]); + free(rsp->info_version_fixed); + for (i = 0; i < rsp->n_info_version_running; i++) + devlink_dl_info_version_free(&rsp->info_version_running[i]); + free(rsp->info_version_running); + for (i = 0; i < rsp->n_info_version_stored; i++) + devlink_dl_info_version_free(&rsp->info_version_stored[i]); + free(rsp->info_version_stored); + free(rsp); +} + +int devlink_info_get_rsp_parse(const struct nlmsghdr *nlh, void *data) +{ + unsigned int n_info_version_running = 0; + unsigned int n_info_version_stored = 0; + unsigned int n_info_version_fixed = 0; + struct ynl_parse_arg *yarg = data; + struct devlink_info_get_rsp *dst; + const struct nlattr *attr; + struct ynl_parse_arg parg; + int i; + + dst = yarg->data; + parg.ys = yarg->ys; + + if (dst->info_version_fixed) + return ynl_error_parse(yarg, "attribute already present (devlink.info-version-fixed)"); + if (dst->info_version_running) + return ynl_error_parse(yarg, "attribute already present (devlink.info-version-running)"); + if (dst->info_version_stored) + return ynl_error_parse(yarg, "attribute already present (devlink.info-version-stored)"); + + mnl_attr_for_each(attr, nlh, sizeof(struct genlmsghdr)) { + unsigned int type = mnl_attr_get_type(attr); + + if (type == DEVLINK_ATTR_BUS_NAME) { + unsigned int len; + + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + + len = strnlen(mnl_attr_get_str(attr), mnl_attr_get_payload_len(attr)); + dst->_present.bus_name_len = len; + dst->bus_name = malloc(len + 1); + memcpy(dst->bus_name, mnl_attr_get_str(attr), len); + dst->bus_name[len] = 0; + } else if (type == DEVLINK_ATTR_DEV_NAME) { + unsigned int len; + + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + + len = strnlen(mnl_attr_get_str(attr), mnl_attr_get_payload_len(attr)); + dst->_present.dev_name_len = len; + dst->dev_name = malloc(len + 1); + memcpy(dst->dev_name, mnl_attr_get_str(attr), len); + dst->dev_name[len] = 0; + } else if (type == DEVLINK_ATTR_INFO_DRIVER_NAME) { + unsigned int len; + + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + + len = strnlen(mnl_attr_get_str(attr), mnl_attr_get_payload_len(attr)); + dst->_present.info_driver_name_len = len; + dst->info_driver_name = malloc(len + 1); + memcpy(dst->info_driver_name, mnl_attr_get_str(attr), len); + dst->info_driver_name[len] = 0; + } else if (type == 
DEVLINK_ATTR_INFO_SERIAL_NUMBER) { + unsigned int len; + + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + + len = strnlen(mnl_attr_get_str(attr), mnl_attr_get_payload_len(attr)); + dst->_present.info_serial_number_len = len; + dst->info_serial_number = malloc(len + 1); + memcpy(dst->info_serial_number, mnl_attr_get_str(attr), len); + dst->info_serial_number[len] = 0; + } else if (type == DEVLINK_ATTR_INFO_VERSION_FIXED) { + n_info_version_fixed++; + } else if (type == DEVLINK_ATTR_INFO_VERSION_RUNNING) { + n_info_version_running++; + } else if (type == DEVLINK_ATTR_INFO_VERSION_STORED) { + n_info_version_stored++; + } + } + + if (n_info_version_fixed) { + dst->info_version_fixed = calloc(n_info_version_fixed, sizeof(*dst->info_version_fixed)); + dst->n_info_version_fixed = n_info_version_fixed; + i = 0; + parg.rsp_policy = &devlink_dl_info_version_nest; + mnl_attr_for_each(attr, nlh, sizeof(struct genlmsghdr)) { + if (mnl_attr_get_type(attr) == DEVLINK_ATTR_INFO_VERSION_FIXED) { + parg.data = &dst->info_version_fixed[i]; + if (devlink_dl_info_version_parse(&parg, attr)) + return MNL_CB_ERROR; + i++; + } + } + } + if (n_info_version_running) { + dst->info_version_running = calloc(n_info_version_running, sizeof(*dst->info_version_running)); + dst->n_info_version_running = n_info_version_running; + i = 0; + parg.rsp_policy = &devlink_dl_info_version_nest; + mnl_attr_for_each(attr, nlh, sizeof(struct genlmsghdr)) { + if (mnl_attr_get_type(attr) == DEVLINK_ATTR_INFO_VERSION_RUNNING) { + parg.data = &dst->info_version_running[i]; + if (devlink_dl_info_version_parse(&parg, attr)) + return MNL_CB_ERROR; + i++; + } + } + } + if (n_info_version_stored) { + dst->info_version_stored = calloc(n_info_version_stored, sizeof(*dst->info_version_stored)); + dst->n_info_version_stored = n_info_version_stored; + i = 0; + parg.rsp_policy = &devlink_dl_info_version_nest; + mnl_attr_for_each(attr, nlh, sizeof(struct genlmsghdr)) { + if (mnl_attr_get_type(attr) == DEVLINK_ATTR_INFO_VERSION_STORED) { + parg.data = &dst->info_version_stored[i]; + if (devlink_dl_info_version_parse(&parg, attr)) + return MNL_CB_ERROR; + i++; + } + } + } + + return MNL_CB_OK; +} + +struct devlink_info_get_rsp * +devlink_info_get(struct ynl_sock *ys, struct devlink_info_get_req *req) +{ + struct ynl_req_state yrs = { .yarg = { .ys = ys, }, }; + struct devlink_info_get_rsp *rsp; + struct nlmsghdr *nlh; + int err; + + nlh = ynl_gemsg_start_req(ys, ys->family_id, DEVLINK_CMD_INFO_GET, 1); + ys->req_policy = &devlink_nest; + yrs.yarg.rsp_policy = &devlink_nest; + + if (req->_present.bus_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_BUS_NAME, req->bus_name); + if (req->_present.dev_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_DEV_NAME, req->dev_name); + + rsp = calloc(1, sizeof(*rsp)); + yrs.yarg.data = rsp; + yrs.cb = devlink_info_get_rsp_parse; + yrs.rsp_cmd = DEVLINK_CMD_INFO_GET; + + err = ynl_exec(ys, nlh, &yrs); + if (err < 0) + goto err_free; + + return rsp; + +err_free: + devlink_info_get_rsp_free(rsp); + return NULL; +} + +/* DEVLINK_CMD_INFO_GET - dump */ +void devlink_info_get_list_free(struct devlink_info_get_list *rsp) +{ + struct devlink_info_get_list *next = rsp; + + while ((void *)next != YNL_LIST_END) { + unsigned int i; + + rsp = next; + next = rsp->next; + + free(rsp->obj.bus_name); + free(rsp->obj.dev_name); + free(rsp->obj.info_driver_name); + free(rsp->obj.info_serial_number); + for (i = 0; i < rsp->obj.n_info_version_fixed; i++) + devlink_dl_info_version_free(&rsp->obj.info_version_fixed[i]); + 
free(rsp->obj.info_version_fixed); + for (i = 0; i < rsp->obj.n_info_version_running; i++) + devlink_dl_info_version_free(&rsp->obj.info_version_running[i]); + free(rsp->obj.info_version_running); + for (i = 0; i < rsp->obj.n_info_version_stored; i++) + devlink_dl_info_version_free(&rsp->obj.info_version_stored[i]); + free(rsp->obj.info_version_stored); + free(rsp); + } +} + +struct devlink_info_get_list *devlink_info_get_dump(struct ynl_sock *ys) +{ + struct ynl_dump_state yds = {}; + struct nlmsghdr *nlh; + int err; + + yds.ys = ys; + yds.alloc_sz = sizeof(struct devlink_info_get_list); + yds.cb = devlink_info_get_rsp_parse; + yds.rsp_cmd = DEVLINK_CMD_INFO_GET; + yds.rsp_policy = &devlink_nest; + + nlh = ynl_gemsg_start_dump(ys, ys->family_id, DEVLINK_CMD_INFO_GET, 1); + + err = ynl_exec_dump(ys, nlh, &yds); + if (err < 0) + goto free_list; + + return yds.first; + +free_list: + devlink_info_get_list_free(yds.first); + return NULL; +} + +/* ============== DEVLINK_CMD_HEALTH_REPORTER_GET ============== */ +/* DEVLINK_CMD_HEALTH_REPORTER_GET - do */ +void +devlink_health_reporter_get_req_free(struct devlink_health_reporter_get_req *req) +{ + free(req->bus_name); + free(req->dev_name); + free(req->health_reporter_name); + free(req); +} + +void +devlink_health_reporter_get_rsp_free(struct devlink_health_reporter_get_rsp *rsp) +{ + free(rsp->bus_name); + free(rsp->dev_name); + free(rsp->health_reporter_name); + free(rsp); +} + +int devlink_health_reporter_get_rsp_parse(const struct nlmsghdr *nlh, + void *data) +{ + struct devlink_health_reporter_get_rsp *dst; + struct ynl_parse_arg *yarg = data; + const struct nlattr *attr; + + dst = yarg->data; + + mnl_attr_for_each(attr, nlh, sizeof(struct genlmsghdr)) { + unsigned int type = mnl_attr_get_type(attr); + + if (type == DEVLINK_ATTR_BUS_NAME) { + unsigned int len; + + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + + len = strnlen(mnl_attr_get_str(attr), mnl_attr_get_payload_len(attr)); + dst->_present.bus_name_len = len; + dst->bus_name = malloc(len + 1); + memcpy(dst->bus_name, mnl_attr_get_str(attr), len); + dst->bus_name[len] = 0; + } else if (type == DEVLINK_ATTR_DEV_NAME) { + unsigned int len; + + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + + len = strnlen(mnl_attr_get_str(attr), mnl_attr_get_payload_len(attr)); + dst->_present.dev_name_len = len; + dst->dev_name = malloc(len + 1); + memcpy(dst->dev_name, mnl_attr_get_str(attr), len); + dst->dev_name[len] = 0; + } else if (type == DEVLINK_ATTR_PORT_INDEX) { + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + dst->_present.port_index = 1; + dst->port_index = mnl_attr_get_u32(attr); + } else if (type == DEVLINK_ATTR_HEALTH_REPORTER_NAME) { + unsigned int len; + + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + + len = strnlen(mnl_attr_get_str(attr), mnl_attr_get_payload_len(attr)); + dst->_present.health_reporter_name_len = len; + dst->health_reporter_name = malloc(len + 1); + memcpy(dst->health_reporter_name, mnl_attr_get_str(attr), len); + dst->health_reporter_name[len] = 0; + } + } + + return MNL_CB_OK; +} + +struct devlink_health_reporter_get_rsp * +devlink_health_reporter_get(struct ynl_sock *ys, + struct devlink_health_reporter_get_req *req) +{ + struct ynl_req_state yrs = { .yarg = { .ys = ys, }, }; + struct devlink_health_reporter_get_rsp *rsp; + struct nlmsghdr *nlh; + int err; + + nlh = ynl_gemsg_start_req(ys, ys->family_id, DEVLINK_CMD_HEALTH_REPORTER_GET, 1); + ys->req_policy = &devlink_nest; + yrs.yarg.rsp_policy = 
&devlink_nest; + + if (req->_present.bus_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_BUS_NAME, req->bus_name); + if (req->_present.dev_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_DEV_NAME, req->dev_name); + if (req->_present.port_index) + mnl_attr_put_u32(nlh, DEVLINK_ATTR_PORT_INDEX, req->port_index); + if (req->_present.health_reporter_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_HEALTH_REPORTER_NAME, req->health_reporter_name); + + rsp = calloc(1, sizeof(*rsp)); + yrs.yarg.data = rsp; + yrs.cb = devlink_health_reporter_get_rsp_parse; + yrs.rsp_cmd = DEVLINK_CMD_HEALTH_REPORTER_GET; + + err = ynl_exec(ys, nlh, &yrs); + if (err < 0) + goto err_free; + + return rsp; + +err_free: + devlink_health_reporter_get_rsp_free(rsp); + return NULL; +} + +/* DEVLINK_CMD_HEALTH_REPORTER_GET - dump */ +void +devlink_health_reporter_get_list_free(struct devlink_health_reporter_get_list *rsp) +{ + struct devlink_health_reporter_get_list *next = rsp; + + while ((void *)next != YNL_LIST_END) { + rsp = next; + next = rsp->next; + + free(rsp->obj.bus_name); + free(rsp->obj.dev_name); + free(rsp->obj.health_reporter_name); + free(rsp); + } +} + +struct devlink_health_reporter_get_list * +devlink_health_reporter_get_dump(struct ynl_sock *ys) +{ + struct ynl_dump_state yds = {}; + struct nlmsghdr *nlh; + int err; + + yds.ys = ys; + yds.alloc_sz = sizeof(struct devlink_health_reporter_get_list); + yds.cb = devlink_health_reporter_get_rsp_parse; + yds.rsp_cmd = DEVLINK_CMD_HEALTH_REPORTER_GET; + yds.rsp_policy = &devlink_nest; + + nlh = ynl_gemsg_start_dump(ys, ys->family_id, DEVLINK_CMD_HEALTH_REPORTER_GET, 1); + + err = ynl_exec_dump(ys, nlh, &yds); + if (err < 0) + goto free_list; + + return yds.first; + +free_list: + devlink_health_reporter_get_list_free(yds.first); + return NULL; +} + +/* ============== DEVLINK_CMD_TRAP_GET ============== */ +/* DEVLINK_CMD_TRAP_GET - do */ +void devlink_trap_get_req_free(struct devlink_trap_get_req *req) +{ + free(req->bus_name); + free(req->dev_name); + free(req->trap_name); + free(req); +} + +void devlink_trap_get_rsp_free(struct devlink_trap_get_rsp *rsp) +{ + free(rsp->bus_name); + free(rsp->dev_name); + free(rsp->trap_name); + free(rsp); +} + +int devlink_trap_get_rsp_parse(const struct nlmsghdr *nlh, void *data) +{ + struct ynl_parse_arg *yarg = data; + struct devlink_trap_get_rsp *dst; + const struct nlattr *attr; + + dst = yarg->data; + + mnl_attr_for_each(attr, nlh, sizeof(struct genlmsghdr)) { + unsigned int type = mnl_attr_get_type(attr); + + if (type == DEVLINK_ATTR_BUS_NAME) { + unsigned int len; + + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + + len = strnlen(mnl_attr_get_str(attr), mnl_attr_get_payload_len(attr)); + dst->_present.bus_name_len = len; + dst->bus_name = malloc(len + 1); + memcpy(dst->bus_name, mnl_attr_get_str(attr), len); + dst->bus_name[len] = 0; + } else if (type == DEVLINK_ATTR_DEV_NAME) { + unsigned int len; + + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + + len = strnlen(mnl_attr_get_str(attr), mnl_attr_get_payload_len(attr)); + dst->_present.dev_name_len = len; + dst->dev_name = malloc(len + 1); + memcpy(dst->dev_name, mnl_attr_get_str(attr), len); + dst->dev_name[len] = 0; + } else if (type == DEVLINK_ATTR_TRAP_NAME) { + unsigned int len; + + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + + len = strnlen(mnl_attr_get_str(attr), mnl_attr_get_payload_len(attr)); + dst->_present.trap_name_len = len; + dst->trap_name = malloc(len + 1); + memcpy(dst->trap_name, mnl_attr_get_str(attr), len); 
+ dst->trap_name[len] = 0; + } + } + + return MNL_CB_OK; +} + +struct devlink_trap_get_rsp * +devlink_trap_get(struct ynl_sock *ys, struct devlink_trap_get_req *req) +{ + struct ynl_req_state yrs = { .yarg = { .ys = ys, }, }; + struct devlink_trap_get_rsp *rsp; + struct nlmsghdr *nlh; + int err; + + nlh = ynl_gemsg_start_req(ys, ys->family_id, DEVLINK_CMD_TRAP_GET, 1); + ys->req_policy = &devlink_nest; + yrs.yarg.rsp_policy = &devlink_nest; + + if (req->_present.bus_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_BUS_NAME, req->bus_name); + if (req->_present.dev_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_DEV_NAME, req->dev_name); + if (req->_present.trap_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_TRAP_NAME, req->trap_name); + + rsp = calloc(1, sizeof(*rsp)); + yrs.yarg.data = rsp; + yrs.cb = devlink_trap_get_rsp_parse; + yrs.rsp_cmd = DEVLINK_CMD_TRAP_GET; + + err = ynl_exec(ys, nlh, &yrs); + if (err < 0) + goto err_free; + + return rsp; + +err_free: + devlink_trap_get_rsp_free(rsp); + return NULL; +} + +/* DEVLINK_CMD_TRAP_GET - dump */ +void devlink_trap_get_list_free(struct devlink_trap_get_list *rsp) +{ + struct devlink_trap_get_list *next = rsp; + + while ((void *)next != YNL_LIST_END) { + rsp = next; + next = rsp->next; + + free(rsp->obj.bus_name); + free(rsp->obj.dev_name); + free(rsp->obj.trap_name); + free(rsp); + } +} + +struct devlink_trap_get_list *devlink_trap_get_dump(struct ynl_sock *ys) +{ + struct ynl_dump_state yds = {}; + struct nlmsghdr *nlh; + int err; + + yds.ys = ys; + yds.alloc_sz = sizeof(struct devlink_trap_get_list); + yds.cb = devlink_trap_get_rsp_parse; + yds.rsp_cmd = DEVLINK_CMD_TRAP_GET; + yds.rsp_policy = &devlink_nest; + + nlh = ynl_gemsg_start_dump(ys, ys->family_id, DEVLINK_CMD_TRAP_GET, 1); + + err = ynl_exec_dump(ys, nlh, &yds); + if (err < 0) + goto free_list; + + return yds.first; + +free_list: + devlink_trap_get_list_free(yds.first); + return NULL; +} + +/* ============== DEVLINK_CMD_TRAP_GROUP_GET ============== */ +/* DEVLINK_CMD_TRAP_GROUP_GET - do */ +void devlink_trap_group_get_req_free(struct devlink_trap_group_get_req *req) +{ + free(req->bus_name); + free(req->dev_name); + free(req->trap_group_name); + free(req); +} + +void devlink_trap_group_get_rsp_free(struct devlink_trap_group_get_rsp *rsp) +{ + free(rsp->bus_name); + free(rsp->dev_name); + free(rsp->trap_group_name); + free(rsp); +} + +int devlink_trap_group_get_rsp_parse(const struct nlmsghdr *nlh, void *data) +{ + struct devlink_trap_group_get_rsp *dst; + struct ynl_parse_arg *yarg = data; + const struct nlattr *attr; + + dst = yarg->data; + + mnl_attr_for_each(attr, nlh, sizeof(struct genlmsghdr)) { + unsigned int type = mnl_attr_get_type(attr); + + if (type == DEVLINK_ATTR_BUS_NAME) { + unsigned int len; + + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + + len = strnlen(mnl_attr_get_str(attr), mnl_attr_get_payload_len(attr)); + dst->_present.bus_name_len = len; + dst->bus_name = malloc(len + 1); + memcpy(dst->bus_name, mnl_attr_get_str(attr), len); + dst->bus_name[len] = 0; + } else if (type == DEVLINK_ATTR_DEV_NAME) { + unsigned int len; + + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + + len = strnlen(mnl_attr_get_str(attr), mnl_attr_get_payload_len(attr)); + dst->_present.dev_name_len = len; + dst->dev_name = malloc(len + 1); + memcpy(dst->dev_name, mnl_attr_get_str(attr), len); + dst->dev_name[len] = 0; + } else if (type == DEVLINK_ATTR_TRAP_GROUP_NAME) { + unsigned int len; + + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + + 
len = strnlen(mnl_attr_get_str(attr), mnl_attr_get_payload_len(attr)); + dst->_present.trap_group_name_len = len; + dst->trap_group_name = malloc(len + 1); + memcpy(dst->trap_group_name, mnl_attr_get_str(attr), len); + dst->trap_group_name[len] = 0; + } + } + + return MNL_CB_OK; +} + +struct devlink_trap_group_get_rsp * +devlink_trap_group_get(struct ynl_sock *ys, + struct devlink_trap_group_get_req *req) +{ + struct ynl_req_state yrs = { .yarg = { .ys = ys, }, }; + struct devlink_trap_group_get_rsp *rsp; + struct nlmsghdr *nlh; + int err; + + nlh = ynl_gemsg_start_req(ys, ys->family_id, DEVLINK_CMD_TRAP_GROUP_GET, 1); + ys->req_policy = &devlink_nest; + yrs.yarg.rsp_policy = &devlink_nest; + + if (req->_present.bus_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_BUS_NAME, req->bus_name); + if (req->_present.dev_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_DEV_NAME, req->dev_name); + if (req->_present.trap_group_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_TRAP_GROUP_NAME, req->trap_group_name); + + rsp = calloc(1, sizeof(*rsp)); + yrs.yarg.data = rsp; + yrs.cb = devlink_trap_group_get_rsp_parse; + yrs.rsp_cmd = DEVLINK_CMD_TRAP_GROUP_GET; + + err = ynl_exec(ys, nlh, &yrs); + if (err < 0) + goto err_free; + + return rsp; + +err_free: + devlink_trap_group_get_rsp_free(rsp); + return NULL; +} + +/* DEVLINK_CMD_TRAP_GROUP_GET - dump */ +void devlink_trap_group_get_list_free(struct devlink_trap_group_get_list *rsp) +{ + struct devlink_trap_group_get_list *next = rsp; + + while ((void *)next != YNL_LIST_END) { + rsp = next; + next = rsp->next; + + free(rsp->obj.bus_name); + free(rsp->obj.dev_name); + free(rsp->obj.trap_group_name); + free(rsp); + } +} + +struct devlink_trap_group_get_list * +devlink_trap_group_get_dump(struct ynl_sock *ys) +{ + struct ynl_dump_state yds = {}; + struct nlmsghdr *nlh; + int err; + + yds.ys = ys; + yds.alloc_sz = sizeof(struct devlink_trap_group_get_list); + yds.cb = devlink_trap_group_get_rsp_parse; + yds.rsp_cmd = DEVLINK_CMD_TRAP_GROUP_GET; + yds.rsp_policy = &devlink_nest; + + nlh = ynl_gemsg_start_dump(ys, ys->family_id, DEVLINK_CMD_TRAP_GROUP_GET, 1); + + err = ynl_exec_dump(ys, nlh, &yds); + if (err < 0) + goto free_list; + + return yds.first; + +free_list: + devlink_trap_group_get_list_free(yds.first); + return NULL; +} + +/* ============== DEVLINK_CMD_TRAP_POLICER_GET ============== */ +/* DEVLINK_CMD_TRAP_POLICER_GET - do */ +void +devlink_trap_policer_get_req_free(struct devlink_trap_policer_get_req *req) +{ + free(req->bus_name); + free(req->dev_name); + free(req); +} + +void +devlink_trap_policer_get_rsp_free(struct devlink_trap_policer_get_rsp *rsp) +{ + free(rsp->bus_name); + free(rsp->dev_name); + free(rsp); +} + +int devlink_trap_policer_get_rsp_parse(const struct nlmsghdr *nlh, void *data) +{ + struct devlink_trap_policer_get_rsp *dst; + struct ynl_parse_arg *yarg = data; + const struct nlattr *attr; + + dst = yarg->data; + + mnl_attr_for_each(attr, nlh, sizeof(struct genlmsghdr)) { + unsigned int type = mnl_attr_get_type(attr); + + if (type == DEVLINK_ATTR_BUS_NAME) { + unsigned int len; + + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + + len = strnlen(mnl_attr_get_str(attr), mnl_attr_get_payload_len(attr)); + dst->_present.bus_name_len = len; + dst->bus_name = malloc(len + 1); + memcpy(dst->bus_name, mnl_attr_get_str(attr), len); + dst->bus_name[len] = 0; + } else if (type == DEVLINK_ATTR_DEV_NAME) { + unsigned int len; + + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + + len = strnlen(mnl_attr_get_str(attr), 
mnl_attr_get_payload_len(attr)); + dst->_present.dev_name_len = len; + dst->dev_name = malloc(len + 1); + memcpy(dst->dev_name, mnl_attr_get_str(attr), len); + dst->dev_name[len] = 0; + } else if (type == DEVLINK_ATTR_TRAP_POLICER_ID) { + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + dst->_present.trap_policer_id = 1; + dst->trap_policer_id = mnl_attr_get_u32(attr); + } + } + + return MNL_CB_OK; +} + +struct devlink_trap_policer_get_rsp * +devlink_trap_policer_get(struct ynl_sock *ys, + struct devlink_trap_policer_get_req *req) +{ + struct ynl_req_state yrs = { .yarg = { .ys = ys, }, }; + struct devlink_trap_policer_get_rsp *rsp; + struct nlmsghdr *nlh; + int err; + + nlh = ynl_gemsg_start_req(ys, ys->family_id, DEVLINK_CMD_TRAP_POLICER_GET, 1); + ys->req_policy = &devlink_nest; + yrs.yarg.rsp_policy = &devlink_nest; + + if (req->_present.bus_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_BUS_NAME, req->bus_name); + if (req->_present.dev_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_DEV_NAME, req->dev_name); + if (req->_present.trap_policer_id) + mnl_attr_put_u32(nlh, DEVLINK_ATTR_TRAP_POLICER_ID, req->trap_policer_id); + + rsp = calloc(1, sizeof(*rsp)); + yrs.yarg.data = rsp; + yrs.cb = devlink_trap_policer_get_rsp_parse; + yrs.rsp_cmd = DEVLINK_CMD_TRAP_POLICER_GET; + + err = ynl_exec(ys, nlh, &yrs); + if (err < 0) + goto err_free; + + return rsp; + +err_free: + devlink_trap_policer_get_rsp_free(rsp); + return NULL; +} + +/* DEVLINK_CMD_TRAP_POLICER_GET - dump */ +void +devlink_trap_policer_get_list_free(struct devlink_trap_policer_get_list *rsp) +{ + struct devlink_trap_policer_get_list *next = rsp; + + while ((void *)next != YNL_LIST_END) { + rsp = next; + next = rsp->next; + + free(rsp->obj.bus_name); + free(rsp->obj.dev_name); + free(rsp); + } +} + +struct devlink_trap_policer_get_list * +devlink_trap_policer_get_dump(struct ynl_sock *ys) +{ + struct ynl_dump_state yds = {}; + struct nlmsghdr *nlh; + int err; + + yds.ys = ys; + yds.alloc_sz = sizeof(struct devlink_trap_policer_get_list); + yds.cb = devlink_trap_policer_get_rsp_parse; + yds.rsp_cmd = DEVLINK_CMD_TRAP_POLICER_GET; + yds.rsp_policy = &devlink_nest; + + nlh = ynl_gemsg_start_dump(ys, ys->family_id, DEVLINK_CMD_TRAP_POLICER_GET, 1); + + err = ynl_exec_dump(ys, nlh, &yds); + if (err < 0) + goto free_list; + + return yds.first; + +free_list: + devlink_trap_policer_get_list_free(yds.first); + return NULL; +} + +/* ============== DEVLINK_CMD_RATE_GET ============== */ +/* DEVLINK_CMD_RATE_GET - do */ +void devlink_rate_get_req_free(struct devlink_rate_get_req *req) +{ + free(req->bus_name); + free(req->dev_name); + free(req->rate_node_name); + free(req); +} + +void devlink_rate_get_rsp_free(struct devlink_rate_get_rsp *rsp) +{ + free(rsp->bus_name); + free(rsp->dev_name); + free(rsp->rate_node_name); + free(rsp); +} + +int devlink_rate_get_rsp_parse(const struct nlmsghdr *nlh, void *data) +{ + struct ynl_parse_arg *yarg = data; + struct devlink_rate_get_rsp *dst; + const struct nlattr *attr; + + dst = yarg->data; + + mnl_attr_for_each(attr, nlh, sizeof(struct genlmsghdr)) { + unsigned int type = mnl_attr_get_type(attr); + + if (type == DEVLINK_ATTR_BUS_NAME) { + unsigned int len; + + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + + len = strnlen(mnl_attr_get_str(attr), mnl_attr_get_payload_len(attr)); + dst->_present.bus_name_len = len; + dst->bus_name = malloc(len + 1); + memcpy(dst->bus_name, mnl_attr_get_str(attr), len); + dst->bus_name[len] = 0; + } else if (type == DEVLINK_ATTR_DEV_NAME) { + 
unsigned int len; + + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + + len = strnlen(mnl_attr_get_str(attr), mnl_attr_get_payload_len(attr)); + dst->_present.dev_name_len = len; + dst->dev_name = malloc(len + 1); + memcpy(dst->dev_name, mnl_attr_get_str(attr), len); + dst->dev_name[len] = 0; + } else if (type == DEVLINK_ATTR_PORT_INDEX) { + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + dst->_present.port_index = 1; + dst->port_index = mnl_attr_get_u32(attr); + } else if (type == DEVLINK_ATTR_RATE_NODE_NAME) { + unsigned int len; + + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + + len = strnlen(mnl_attr_get_str(attr), mnl_attr_get_payload_len(attr)); + dst->_present.rate_node_name_len = len; + dst->rate_node_name = malloc(len + 1); + memcpy(dst->rate_node_name, mnl_attr_get_str(attr), len); + dst->rate_node_name[len] = 0; + } + } + + return MNL_CB_OK; +} + +struct devlink_rate_get_rsp * +devlink_rate_get(struct ynl_sock *ys, struct devlink_rate_get_req *req) +{ + struct ynl_req_state yrs = { .yarg = { .ys = ys, }, }; + struct devlink_rate_get_rsp *rsp; + struct nlmsghdr *nlh; + int err; + + nlh = ynl_gemsg_start_req(ys, ys->family_id, DEVLINK_CMD_RATE_GET, 1); + ys->req_policy = &devlink_nest; + yrs.yarg.rsp_policy = &devlink_nest; + + if (req->_present.bus_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_BUS_NAME, req->bus_name); + if (req->_present.dev_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_DEV_NAME, req->dev_name); + if (req->_present.port_index) + mnl_attr_put_u32(nlh, DEVLINK_ATTR_PORT_INDEX, req->port_index); + if (req->_present.rate_node_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_RATE_NODE_NAME, req->rate_node_name); + + rsp = calloc(1, sizeof(*rsp)); + yrs.yarg.data = rsp; + yrs.cb = devlink_rate_get_rsp_parse; + yrs.rsp_cmd = DEVLINK_CMD_RATE_GET; + + err = ynl_exec(ys, nlh, &yrs); + if (err < 0) + goto err_free; + + return rsp; + +err_free: + devlink_rate_get_rsp_free(rsp); + return NULL; +} + +/* DEVLINK_CMD_RATE_GET - dump */ +void devlink_rate_get_list_free(struct devlink_rate_get_list *rsp) +{ + struct devlink_rate_get_list *next = rsp; + + while ((void *)next != YNL_LIST_END) { + rsp = next; + next = rsp->next; + + free(rsp->obj.bus_name); + free(rsp->obj.dev_name); + free(rsp->obj.rate_node_name); + free(rsp); + } +} + +struct devlink_rate_get_list *devlink_rate_get_dump(struct ynl_sock *ys) +{ + struct ynl_dump_state yds = {}; + struct nlmsghdr *nlh; + int err; + + yds.ys = ys; + yds.alloc_sz = sizeof(struct devlink_rate_get_list); + yds.cb = devlink_rate_get_rsp_parse; + yds.rsp_cmd = DEVLINK_CMD_RATE_GET; + yds.rsp_policy = &devlink_nest; + + nlh = ynl_gemsg_start_dump(ys, ys->family_id, DEVLINK_CMD_RATE_GET, 1); + + err = ynl_exec_dump(ys, nlh, &yds); + if (err < 0) + goto free_list; + + return yds.first; + +free_list: + devlink_rate_get_list_free(yds.first); + return NULL; +} + +/* ============== DEVLINK_CMD_LINECARD_GET ============== */ +/* DEVLINK_CMD_LINECARD_GET - do */ +void devlink_linecard_get_req_free(struct devlink_linecard_get_req *req) +{ + free(req->bus_name); + free(req->dev_name); + free(req); +} + +void devlink_linecard_get_rsp_free(struct devlink_linecard_get_rsp *rsp) +{ + free(rsp->bus_name); + free(rsp->dev_name); + free(rsp); +} + +int devlink_linecard_get_rsp_parse(const struct nlmsghdr *nlh, void *data) +{ + struct devlink_linecard_get_rsp *dst; + struct ynl_parse_arg *yarg = data; + const struct nlattr *attr; + + dst = yarg->data; + + mnl_attr_for_each(attr, nlh, sizeof(struct genlmsghdr)) { + 
unsigned int type = mnl_attr_get_type(attr); + + if (type == DEVLINK_ATTR_BUS_NAME) { + unsigned int len; + + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + + len = strnlen(mnl_attr_get_str(attr), mnl_attr_get_payload_len(attr)); + dst->_present.bus_name_len = len; + dst->bus_name = malloc(len + 1); + memcpy(dst->bus_name, mnl_attr_get_str(attr), len); + dst->bus_name[len] = 0; + } else if (type == DEVLINK_ATTR_DEV_NAME) { + unsigned int len; + + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + + len = strnlen(mnl_attr_get_str(attr), mnl_attr_get_payload_len(attr)); + dst->_present.dev_name_len = len; + dst->dev_name = malloc(len + 1); + memcpy(dst->dev_name, mnl_attr_get_str(attr), len); + dst->dev_name[len] = 0; + } else if (type == DEVLINK_ATTR_LINECARD_INDEX) { + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + dst->_present.linecard_index = 1; + dst->linecard_index = mnl_attr_get_u32(attr); + } + } + + return MNL_CB_OK; +} + +struct devlink_linecard_get_rsp * +devlink_linecard_get(struct ynl_sock *ys, struct devlink_linecard_get_req *req) +{ + struct ynl_req_state yrs = { .yarg = { .ys = ys, }, }; + struct devlink_linecard_get_rsp *rsp; + struct nlmsghdr *nlh; + int err; + + nlh = ynl_gemsg_start_req(ys, ys->family_id, DEVLINK_CMD_LINECARD_GET, 1); + ys->req_policy = &devlink_nest; + yrs.yarg.rsp_policy = &devlink_nest; + + if (req->_present.bus_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_BUS_NAME, req->bus_name); + if (req->_present.dev_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_DEV_NAME, req->dev_name); + if (req->_present.linecard_index) + mnl_attr_put_u32(nlh, DEVLINK_ATTR_LINECARD_INDEX, req->linecard_index); + + rsp = calloc(1, sizeof(*rsp)); + yrs.yarg.data = rsp; + yrs.cb = devlink_linecard_get_rsp_parse; + yrs.rsp_cmd = DEVLINK_CMD_LINECARD_GET; + + err = ynl_exec(ys, nlh, &yrs); + if (err < 0) + goto err_free; + + return rsp; + +err_free: + devlink_linecard_get_rsp_free(rsp); + return NULL; +} + +/* DEVLINK_CMD_LINECARD_GET - dump */ +void devlink_linecard_get_list_free(struct devlink_linecard_get_list *rsp) +{ + struct devlink_linecard_get_list *next = rsp; + + while ((void *)next != YNL_LIST_END) { + rsp = next; + next = rsp->next; + + free(rsp->obj.bus_name); + free(rsp->obj.dev_name); + free(rsp); + } +} + +struct devlink_linecard_get_list * +devlink_linecard_get_dump(struct ynl_sock *ys) +{ + struct ynl_dump_state yds = {}; + struct nlmsghdr *nlh; + int err; + + yds.ys = ys; + yds.alloc_sz = sizeof(struct devlink_linecard_get_list); + yds.cb = devlink_linecard_get_rsp_parse; + yds.rsp_cmd = DEVLINK_CMD_LINECARD_GET; + yds.rsp_policy = &devlink_nest; + + nlh = ynl_gemsg_start_dump(ys, ys->family_id, DEVLINK_CMD_LINECARD_GET, 1); + + err = ynl_exec_dump(ys, nlh, &yds); + if (err < 0) + goto free_list; + + return yds.first; + +free_list: + devlink_linecard_get_list_free(yds.first); + return NULL; +} + +/* ============== DEVLINK_CMD_SELFTESTS_GET ============== */ +/* DEVLINK_CMD_SELFTESTS_GET - do */ +void devlink_selftests_get_req_free(struct devlink_selftests_get_req *req) +{ + free(req->bus_name); + free(req->dev_name); + free(req); +} + +void devlink_selftests_get_rsp_free(struct devlink_selftests_get_rsp *rsp) +{ + free(rsp->bus_name); + free(rsp->dev_name); + free(rsp); +} + +int devlink_selftests_get_rsp_parse(const struct nlmsghdr *nlh, void *data) +{ + struct devlink_selftests_get_rsp *dst; + struct ynl_parse_arg *yarg = data; + const struct nlattr *attr; + + dst = yarg->data; + + mnl_attr_for_each(attr, nlh, 
sizeof(struct genlmsghdr)) { + unsigned int type = mnl_attr_get_type(attr); + + if (type == DEVLINK_ATTR_BUS_NAME) { + unsigned int len; + + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + + len = strnlen(mnl_attr_get_str(attr), mnl_attr_get_payload_len(attr)); + dst->_present.bus_name_len = len; + dst->bus_name = malloc(len + 1); + memcpy(dst->bus_name, mnl_attr_get_str(attr), len); + dst->bus_name[len] = 0; + } else if (type == DEVLINK_ATTR_DEV_NAME) { + unsigned int len; + + if (ynl_attr_validate(yarg, attr)) + return MNL_CB_ERROR; + + len = strnlen(mnl_attr_get_str(attr), mnl_attr_get_payload_len(attr)); + dst->_present.dev_name_len = len; + dst->dev_name = malloc(len + 1); + memcpy(dst->dev_name, mnl_attr_get_str(attr), len); + dst->dev_name[len] = 0; + } + } + + return MNL_CB_OK; +} + +struct devlink_selftests_get_rsp * +devlink_selftests_get(struct ynl_sock *ys, + struct devlink_selftests_get_req *req) +{ + struct ynl_req_state yrs = { .yarg = { .ys = ys, }, }; + struct devlink_selftests_get_rsp *rsp; + struct nlmsghdr *nlh; + int err; + + nlh = ynl_gemsg_start_req(ys, ys->family_id, DEVLINK_CMD_SELFTESTS_GET, 1); + ys->req_policy = &devlink_nest; + yrs.yarg.rsp_policy = &devlink_nest; + + if (req->_present.bus_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_BUS_NAME, req->bus_name); + if (req->_present.dev_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_DEV_NAME, req->dev_name); + + rsp = calloc(1, sizeof(*rsp)); + yrs.yarg.data = rsp; + yrs.cb = devlink_selftests_get_rsp_parse; + yrs.rsp_cmd = DEVLINK_CMD_SELFTESTS_GET; + + err = ynl_exec(ys, nlh, &yrs); + if (err < 0) + goto err_free; + + return rsp; + +err_free: + devlink_selftests_get_rsp_free(rsp); + return NULL; +} + +/* DEVLINK_CMD_SELFTESTS_GET - dump */ +void devlink_selftests_get_list_free(struct devlink_selftests_get_list *rsp) +{ + struct devlink_selftests_get_list *next = rsp; + + while ((void *)next != YNL_LIST_END) { + rsp = next; + next = rsp->next; + + free(rsp->obj.bus_name); + free(rsp->obj.dev_name); + free(rsp); + } +} + +struct devlink_selftests_get_list * +devlink_selftests_get_dump(struct ynl_sock *ys) +{ + struct ynl_dump_state yds = {}; + struct nlmsghdr *nlh; + int err; + + yds.ys = ys; + yds.alloc_sz = sizeof(struct devlink_selftests_get_list); + yds.cb = devlink_selftests_get_rsp_parse; + yds.rsp_cmd = DEVLINK_CMD_SELFTESTS_GET; + yds.rsp_policy = &devlink_nest; + + nlh = ynl_gemsg_start_dump(ys, ys->family_id, DEVLINK_CMD_SELFTESTS_GET, 1); + + err = ynl_exec_dump(ys, nlh, &yds); + if (err < 0) + goto free_list; + + return yds.first; + +free_list: + devlink_selftests_get_list_free(yds.first); return NULL; } diff --git a/tools/net/ynl/generated/devlink-user.h b/tools/net/ynl/generated/devlink-user.h index af65e2f2f529..f6ec5b6e5d26 100644 --- a/tools/net/ynl/generated/devlink-user.h +++ b/tools/net/ynl/generated/devlink-user.h @@ -17,6 +17,7 @@ extern const struct ynl_family ynl_devlink_family; /* Enums */ const char *devlink_op_str(int op); +const char *devlink_sb_pool_type_str(enum devlink_sb_pool_type value); /* Common nested types */ struct devlink_dl_info_version { @@ -140,6 +141,659 @@ void devlink_get_list_free(struct devlink_get_list *rsp); struct devlink_get_list *devlink_get_dump(struct ynl_sock *ys); +/* ============== DEVLINK_CMD_PORT_GET ============== */ +/* DEVLINK_CMD_PORT_GET - do */ +struct devlink_port_get_req { + struct { + __u32 bus_name_len; + __u32 dev_name_len; + __u32 port_index:1; + } _present; + + char *bus_name; + char *dev_name; + __u32 port_index; +}; + +static 
inline struct devlink_port_get_req *devlink_port_get_req_alloc(void) +{ + return calloc(1, sizeof(struct devlink_port_get_req)); +} +void devlink_port_get_req_free(struct devlink_port_get_req *req); + +static inline void +devlink_port_get_req_set_bus_name(struct devlink_port_get_req *req, + const char *bus_name) +{ + free(req->bus_name); + req->_present.bus_name_len = strlen(bus_name); + req->bus_name = malloc(req->_present.bus_name_len + 1); + memcpy(req->bus_name, bus_name, req->_present.bus_name_len); + req->bus_name[req->_present.bus_name_len] = 0; +} +static inline void +devlink_port_get_req_set_dev_name(struct devlink_port_get_req *req, + const char *dev_name) +{ + free(req->dev_name); + req->_present.dev_name_len = strlen(dev_name); + req->dev_name = malloc(req->_present.dev_name_len + 1); + memcpy(req->dev_name, dev_name, req->_present.dev_name_len); + req->dev_name[req->_present.dev_name_len] = 0; +} +static inline void +devlink_port_get_req_set_port_index(struct devlink_port_get_req *req, + __u32 port_index) +{ + req->_present.port_index = 1; + req->port_index = port_index; +} + +struct devlink_port_get_rsp { + struct { + __u32 bus_name_len; + __u32 dev_name_len; + __u32 port_index:1; + } _present; + + char *bus_name; + char *dev_name; + __u32 port_index; +}; + +void devlink_port_get_rsp_free(struct devlink_port_get_rsp *rsp); + +/* + * Get devlink port instances. + */ +struct devlink_port_get_rsp * +devlink_port_get(struct ynl_sock *ys, struct devlink_port_get_req *req); + +/* DEVLINK_CMD_PORT_GET - dump */ +struct devlink_port_get_rsp_dump { + struct { + __u32 bus_name_len; + __u32 dev_name_len; + __u32 port_index:1; + } _present; + + char *bus_name; + char *dev_name; + __u32 port_index; +}; + +struct devlink_port_get_rsp_list { + struct devlink_port_get_rsp_list *next; + struct devlink_port_get_rsp_dump obj __attribute__ ((aligned (8))); +}; + +void devlink_port_get_rsp_list_free(struct devlink_port_get_rsp_list *rsp); + +struct devlink_port_get_rsp_list *devlink_port_get_dump(struct ynl_sock *ys); + +/* ============== DEVLINK_CMD_SB_GET ============== */ +/* DEVLINK_CMD_SB_GET - do */ +struct devlink_sb_get_req { + struct { + __u32 bus_name_len; + __u32 dev_name_len; + __u32 sb_index:1; + } _present; + + char *bus_name; + char *dev_name; + __u32 sb_index; +}; + +static inline struct devlink_sb_get_req *devlink_sb_get_req_alloc(void) +{ + return calloc(1, sizeof(struct devlink_sb_get_req)); +} +void devlink_sb_get_req_free(struct devlink_sb_get_req *req); + +static inline void +devlink_sb_get_req_set_bus_name(struct devlink_sb_get_req *req, + const char *bus_name) +{ + free(req->bus_name); + req->_present.bus_name_len = strlen(bus_name); + req->bus_name = malloc(req->_present.bus_name_len + 1); + memcpy(req->bus_name, bus_name, req->_present.bus_name_len); + req->bus_name[req->_present.bus_name_len] = 0; +} +static inline void +devlink_sb_get_req_set_dev_name(struct devlink_sb_get_req *req, + const char *dev_name) +{ + free(req->dev_name); + req->_present.dev_name_len = strlen(dev_name); + req->dev_name = malloc(req->_present.dev_name_len + 1); + memcpy(req->dev_name, dev_name, req->_present.dev_name_len); + req->dev_name[req->_present.dev_name_len] = 0; +} +static inline void +devlink_sb_get_req_set_sb_index(struct devlink_sb_get_req *req, __u32 sb_index) +{ + req->_present.sb_index = 1; + req->sb_index = sb_index; +} + +struct devlink_sb_get_rsp { + struct { + __u32 bus_name_len; + __u32 dev_name_len; + __u32 sb_index:1; + } _present; + + char *bus_name; + char *dev_name; + 
__u32 sb_index; +}; + +void devlink_sb_get_rsp_free(struct devlink_sb_get_rsp *rsp); + +/* + * Get shared buffer instances. + */ +struct devlink_sb_get_rsp * +devlink_sb_get(struct ynl_sock *ys, struct devlink_sb_get_req *req); + +/* DEVLINK_CMD_SB_GET - dump */ +struct devlink_sb_get_list { + struct devlink_sb_get_list *next; + struct devlink_sb_get_rsp obj __attribute__ ((aligned (8))); +}; + +void devlink_sb_get_list_free(struct devlink_sb_get_list *rsp); + +struct devlink_sb_get_list *devlink_sb_get_dump(struct ynl_sock *ys); + +/* ============== DEVLINK_CMD_SB_POOL_GET ============== */ +/* DEVLINK_CMD_SB_POOL_GET - do */ +struct devlink_sb_pool_get_req { + struct { + __u32 bus_name_len; + __u32 dev_name_len; + __u32 sb_index:1; + __u32 sb_pool_index:1; + } _present; + + char *bus_name; + char *dev_name; + __u32 sb_index; + __u16 sb_pool_index; +}; + +static inline struct devlink_sb_pool_get_req * +devlink_sb_pool_get_req_alloc(void) +{ + return calloc(1, sizeof(struct devlink_sb_pool_get_req)); +} +void devlink_sb_pool_get_req_free(struct devlink_sb_pool_get_req *req); + +static inline void +devlink_sb_pool_get_req_set_bus_name(struct devlink_sb_pool_get_req *req, + const char *bus_name) +{ + free(req->bus_name); + req->_present.bus_name_len = strlen(bus_name); + req->bus_name = malloc(req->_present.bus_name_len + 1); + memcpy(req->bus_name, bus_name, req->_present.bus_name_len); + req->bus_name[req->_present.bus_name_len] = 0; +} +static inline void +devlink_sb_pool_get_req_set_dev_name(struct devlink_sb_pool_get_req *req, + const char *dev_name) +{ + free(req->dev_name); + req->_present.dev_name_len = strlen(dev_name); + req->dev_name = malloc(req->_present.dev_name_len + 1); + memcpy(req->dev_name, dev_name, req->_present.dev_name_len); + req->dev_name[req->_present.dev_name_len] = 0; +} +static inline void +devlink_sb_pool_get_req_set_sb_index(struct devlink_sb_pool_get_req *req, + __u32 sb_index) +{ + req->_present.sb_index = 1; + req->sb_index = sb_index; +} +static inline void +devlink_sb_pool_get_req_set_sb_pool_index(struct devlink_sb_pool_get_req *req, + __u16 sb_pool_index) +{ + req->_present.sb_pool_index = 1; + req->sb_pool_index = sb_pool_index; +} + +struct devlink_sb_pool_get_rsp { + struct { + __u32 bus_name_len; + __u32 dev_name_len; + __u32 sb_index:1; + __u32 sb_pool_index:1; + } _present; + + char *bus_name; + char *dev_name; + __u32 sb_index; + __u16 sb_pool_index; +}; + +void devlink_sb_pool_get_rsp_free(struct devlink_sb_pool_get_rsp *rsp); + +/* + * Get shared buffer pool instances. 
+ */ +struct devlink_sb_pool_get_rsp * +devlink_sb_pool_get(struct ynl_sock *ys, struct devlink_sb_pool_get_req *req); + +/* DEVLINK_CMD_SB_POOL_GET - dump */ +struct devlink_sb_pool_get_list { + struct devlink_sb_pool_get_list *next; + struct devlink_sb_pool_get_rsp obj __attribute__ ((aligned (8))); +}; + +void devlink_sb_pool_get_list_free(struct devlink_sb_pool_get_list *rsp); + +struct devlink_sb_pool_get_list *devlink_sb_pool_get_dump(struct ynl_sock *ys); + +/* ============== DEVLINK_CMD_SB_PORT_POOL_GET ============== */ +/* DEVLINK_CMD_SB_PORT_POOL_GET - do */ +struct devlink_sb_port_pool_get_req { + struct { + __u32 bus_name_len; + __u32 dev_name_len; + __u32 port_index:1; + __u32 sb_index:1; + __u32 sb_pool_index:1; + } _present; + + char *bus_name; + char *dev_name; + __u32 port_index; + __u32 sb_index; + __u16 sb_pool_index; +}; + +static inline struct devlink_sb_port_pool_get_req * +devlink_sb_port_pool_get_req_alloc(void) +{ + return calloc(1, sizeof(struct devlink_sb_port_pool_get_req)); +} +void +devlink_sb_port_pool_get_req_free(struct devlink_sb_port_pool_get_req *req); + +static inline void +devlink_sb_port_pool_get_req_set_bus_name(struct devlink_sb_port_pool_get_req *req, + const char *bus_name) +{ + free(req->bus_name); + req->_present.bus_name_len = strlen(bus_name); + req->bus_name = malloc(req->_present.bus_name_len + 1); + memcpy(req->bus_name, bus_name, req->_present.bus_name_len); + req->bus_name[req->_present.bus_name_len] = 0; +} +static inline void +devlink_sb_port_pool_get_req_set_dev_name(struct devlink_sb_port_pool_get_req *req, + const char *dev_name) +{ + free(req->dev_name); + req->_present.dev_name_len = strlen(dev_name); + req->dev_name = malloc(req->_present.dev_name_len + 1); + memcpy(req->dev_name, dev_name, req->_present.dev_name_len); + req->dev_name[req->_present.dev_name_len] = 0; +} +static inline void +devlink_sb_port_pool_get_req_set_port_index(struct devlink_sb_port_pool_get_req *req, + __u32 port_index) +{ + req->_present.port_index = 1; + req->port_index = port_index; +} +static inline void +devlink_sb_port_pool_get_req_set_sb_index(struct devlink_sb_port_pool_get_req *req, + __u32 sb_index) +{ + req->_present.sb_index = 1; + req->sb_index = sb_index; +} +static inline void +devlink_sb_port_pool_get_req_set_sb_pool_index(struct devlink_sb_port_pool_get_req *req, + __u16 sb_pool_index) +{ + req->_present.sb_pool_index = 1; + req->sb_pool_index = sb_pool_index; +} + +struct devlink_sb_port_pool_get_rsp { + struct { + __u32 bus_name_len; + __u32 dev_name_len; + __u32 port_index:1; + __u32 sb_index:1; + __u32 sb_pool_index:1; + } _present; + + char *bus_name; + char *dev_name; + __u32 port_index; + __u32 sb_index; + __u16 sb_pool_index; +}; + +void +devlink_sb_port_pool_get_rsp_free(struct devlink_sb_port_pool_get_rsp *rsp); + +/* + * Get shared buffer port-pool combinations and threshold. 
+ */ +struct devlink_sb_port_pool_get_rsp * +devlink_sb_port_pool_get(struct ynl_sock *ys, + struct devlink_sb_port_pool_get_req *req); + +/* DEVLINK_CMD_SB_PORT_POOL_GET - dump */ +struct devlink_sb_port_pool_get_list { + struct devlink_sb_port_pool_get_list *next; + struct devlink_sb_port_pool_get_rsp obj __attribute__ ((aligned (8))); +}; + +void +devlink_sb_port_pool_get_list_free(struct devlink_sb_port_pool_get_list *rsp); + +struct devlink_sb_port_pool_get_list * +devlink_sb_port_pool_get_dump(struct ynl_sock *ys); + +/* ============== DEVLINK_CMD_SB_TC_POOL_BIND_GET ============== */ +/* DEVLINK_CMD_SB_TC_POOL_BIND_GET - do */ +struct devlink_sb_tc_pool_bind_get_req { + struct { + __u32 bus_name_len; + __u32 dev_name_len; + __u32 port_index:1; + __u32 sb_index:1; + __u32 sb_pool_type:1; + __u32 sb_tc_index:1; + } _present; + + char *bus_name; + char *dev_name; + __u32 port_index; + __u32 sb_index; + enum devlink_sb_pool_type sb_pool_type; + __u16 sb_tc_index; +}; + +static inline struct devlink_sb_tc_pool_bind_get_req * +devlink_sb_tc_pool_bind_get_req_alloc(void) +{ + return calloc(1, sizeof(struct devlink_sb_tc_pool_bind_get_req)); +} +void +devlink_sb_tc_pool_bind_get_req_free(struct devlink_sb_tc_pool_bind_get_req *req); + +static inline void +devlink_sb_tc_pool_bind_get_req_set_bus_name(struct devlink_sb_tc_pool_bind_get_req *req, + const char *bus_name) +{ + free(req->bus_name); + req->_present.bus_name_len = strlen(bus_name); + req->bus_name = malloc(req->_present.bus_name_len + 1); + memcpy(req->bus_name, bus_name, req->_present.bus_name_len); + req->bus_name[req->_present.bus_name_len] = 0; +} +static inline void +devlink_sb_tc_pool_bind_get_req_set_dev_name(struct devlink_sb_tc_pool_bind_get_req *req, + const char *dev_name) +{ + free(req->dev_name); + req->_present.dev_name_len = strlen(dev_name); + req->dev_name = malloc(req->_present.dev_name_len + 1); + memcpy(req->dev_name, dev_name, req->_present.dev_name_len); + req->dev_name[req->_present.dev_name_len] = 0; +} +static inline void +devlink_sb_tc_pool_bind_get_req_set_port_index(struct devlink_sb_tc_pool_bind_get_req *req, + __u32 port_index) +{ + req->_present.port_index = 1; + req->port_index = port_index; +} +static inline void +devlink_sb_tc_pool_bind_get_req_set_sb_index(struct devlink_sb_tc_pool_bind_get_req *req, + __u32 sb_index) +{ + req->_present.sb_index = 1; + req->sb_index = sb_index; +} +static inline void +devlink_sb_tc_pool_bind_get_req_set_sb_pool_type(struct devlink_sb_tc_pool_bind_get_req *req, + enum devlink_sb_pool_type sb_pool_type) +{ + req->_present.sb_pool_type = 1; + req->sb_pool_type = sb_pool_type; +} +static inline void +devlink_sb_tc_pool_bind_get_req_set_sb_tc_index(struct devlink_sb_tc_pool_bind_get_req *req, + __u16 sb_tc_index) +{ + req->_present.sb_tc_index = 1; + req->sb_tc_index = sb_tc_index; +} + +struct devlink_sb_tc_pool_bind_get_rsp { + struct { + __u32 bus_name_len; + __u32 dev_name_len; + __u32 port_index:1; + __u32 sb_index:1; + __u32 sb_pool_type:1; + __u32 sb_tc_index:1; + } _present; + + char *bus_name; + char *dev_name; + __u32 port_index; + __u32 sb_index; + enum devlink_sb_pool_type sb_pool_type; + __u16 sb_tc_index; +}; + +void +devlink_sb_tc_pool_bind_get_rsp_free(struct devlink_sb_tc_pool_bind_get_rsp *rsp); + +/* + * Get shared buffer port-TC to pool bindings and threshold. 
+ */ +struct devlink_sb_tc_pool_bind_get_rsp * +devlink_sb_tc_pool_bind_get(struct ynl_sock *ys, + struct devlink_sb_tc_pool_bind_get_req *req); + +/* DEVLINK_CMD_SB_TC_POOL_BIND_GET - dump */ +struct devlink_sb_tc_pool_bind_get_list { + struct devlink_sb_tc_pool_bind_get_list *next; + struct devlink_sb_tc_pool_bind_get_rsp obj __attribute__ ((aligned (8))); +}; + +void +devlink_sb_tc_pool_bind_get_list_free(struct devlink_sb_tc_pool_bind_get_list *rsp); + +struct devlink_sb_tc_pool_bind_get_list * +devlink_sb_tc_pool_bind_get_dump(struct ynl_sock *ys); + +/* ============== DEVLINK_CMD_PARAM_GET ============== */ +/* DEVLINK_CMD_PARAM_GET - do */ +struct devlink_param_get_req { + struct { + __u32 bus_name_len; + __u32 dev_name_len; + __u32 param_name_len; + } _present; + + char *bus_name; + char *dev_name; + char *param_name; +}; + +static inline struct devlink_param_get_req *devlink_param_get_req_alloc(void) +{ + return calloc(1, sizeof(struct devlink_param_get_req)); +} +void devlink_param_get_req_free(struct devlink_param_get_req *req); + +static inline void +devlink_param_get_req_set_bus_name(struct devlink_param_get_req *req, + const char *bus_name) +{ + free(req->bus_name); + req->_present.bus_name_len = strlen(bus_name); + req->bus_name = malloc(req->_present.bus_name_len + 1); + memcpy(req->bus_name, bus_name, req->_present.bus_name_len); + req->bus_name[req->_present.bus_name_len] = 0; +} +static inline void +devlink_param_get_req_set_dev_name(struct devlink_param_get_req *req, + const char *dev_name) +{ + free(req->dev_name); + req->_present.dev_name_len = strlen(dev_name); + req->dev_name = malloc(req->_present.dev_name_len + 1); + memcpy(req->dev_name, dev_name, req->_present.dev_name_len); + req->dev_name[req->_present.dev_name_len] = 0; +} +static inline void +devlink_param_get_req_set_param_name(struct devlink_param_get_req *req, + const char *param_name) +{ + free(req->param_name); + req->_present.param_name_len = strlen(param_name); + req->param_name = malloc(req->_present.param_name_len + 1); + memcpy(req->param_name, param_name, req->_present.param_name_len); + req->param_name[req->_present.param_name_len] = 0; +} + +struct devlink_param_get_rsp { + struct { + __u32 bus_name_len; + __u32 dev_name_len; + __u32 param_name_len; + } _present; + + char *bus_name; + char *dev_name; + char *param_name; +}; + +void devlink_param_get_rsp_free(struct devlink_param_get_rsp *rsp); + +/* + * Get param instances. 
+ */ +struct devlink_param_get_rsp * +devlink_param_get(struct ynl_sock *ys, struct devlink_param_get_req *req); + +/* DEVLINK_CMD_PARAM_GET - dump */ +struct devlink_param_get_list { + struct devlink_param_get_list *next; + struct devlink_param_get_rsp obj __attribute__ ((aligned (8))); +}; + +void devlink_param_get_list_free(struct devlink_param_get_list *rsp); + +struct devlink_param_get_list *devlink_param_get_dump(struct ynl_sock *ys); + +/* ============== DEVLINK_CMD_REGION_GET ============== */ +/* DEVLINK_CMD_REGION_GET - do */ +struct devlink_region_get_req { + struct { + __u32 bus_name_len; + __u32 dev_name_len; + __u32 port_index:1; + __u32 region_name_len; + } _present; + + char *bus_name; + char *dev_name; + __u32 port_index; + char *region_name; +}; + +static inline struct devlink_region_get_req *devlink_region_get_req_alloc(void) +{ + return calloc(1, sizeof(struct devlink_region_get_req)); +} +void devlink_region_get_req_free(struct devlink_region_get_req *req); + +static inline void +devlink_region_get_req_set_bus_name(struct devlink_region_get_req *req, + const char *bus_name) +{ + free(req->bus_name); + req->_present.bus_name_len = strlen(bus_name); + req->bus_name = malloc(req->_present.bus_name_len + 1); + memcpy(req->bus_name, bus_name, req->_present.bus_name_len); + req->bus_name[req->_present.bus_name_len] = 0; +} +static inline void +devlink_region_get_req_set_dev_name(struct devlink_region_get_req *req, + const char *dev_name) +{ + free(req->dev_name); + req->_present.dev_name_len = strlen(dev_name); + req->dev_name = malloc(req->_present.dev_name_len + 1); + memcpy(req->dev_name, dev_name, req->_present.dev_name_len); + req->dev_name[req->_present.dev_name_len] = 0; +} +static inline void +devlink_region_get_req_set_port_index(struct devlink_region_get_req *req, + __u32 port_index) +{ + req->_present.port_index = 1; + req->port_index = port_index; +} +static inline void +devlink_region_get_req_set_region_name(struct devlink_region_get_req *req, + const char *region_name) +{ + free(req->region_name); + req->_present.region_name_len = strlen(region_name); + req->region_name = malloc(req->_present.region_name_len + 1); + memcpy(req->region_name, region_name, req->_present.region_name_len); + req->region_name[req->_present.region_name_len] = 0; +} + +struct devlink_region_get_rsp { + struct { + __u32 bus_name_len; + __u32 dev_name_len; + __u32 port_index:1; + __u32 region_name_len; + } _present; + + char *bus_name; + char *dev_name; + __u32 port_index; + char *region_name; +}; + +void devlink_region_get_rsp_free(struct devlink_region_get_rsp *rsp); + +/* + * Get region instances. 
+ */ +struct devlink_region_get_rsp * +devlink_region_get(struct ynl_sock *ys, struct devlink_region_get_req *req); + +/* DEVLINK_CMD_REGION_GET - dump */ +struct devlink_region_get_list { + struct devlink_region_get_list *next; + struct devlink_region_get_rsp obj __attribute__ ((aligned (8))); +}; + +void devlink_region_get_list_free(struct devlink_region_get_list *rsp); + +struct devlink_region_get_list *devlink_region_get_dump(struct ynl_sock *ys); + /* ============== DEVLINK_CMD_INFO_GET ============== */ /* DEVLINK_CMD_INFO_GET - do */ struct devlink_info_get_req { @@ -152,14 +806,442 @@ struct devlink_info_get_req { char *dev_name; }; -static inline struct devlink_info_get_req *devlink_info_get_req_alloc(void) +static inline struct devlink_info_get_req *devlink_info_get_req_alloc(void) +{ + return calloc(1, sizeof(struct devlink_info_get_req)); +} +void devlink_info_get_req_free(struct devlink_info_get_req *req); + +static inline void +devlink_info_get_req_set_bus_name(struct devlink_info_get_req *req, + const char *bus_name) +{ + free(req->bus_name); + req->_present.bus_name_len = strlen(bus_name); + req->bus_name = malloc(req->_present.bus_name_len + 1); + memcpy(req->bus_name, bus_name, req->_present.bus_name_len); + req->bus_name[req->_present.bus_name_len] = 0; +} +static inline void +devlink_info_get_req_set_dev_name(struct devlink_info_get_req *req, + const char *dev_name) +{ + free(req->dev_name); + req->_present.dev_name_len = strlen(dev_name); + req->dev_name = malloc(req->_present.dev_name_len + 1); + memcpy(req->dev_name, dev_name, req->_present.dev_name_len); + req->dev_name[req->_present.dev_name_len] = 0; +} + +struct devlink_info_get_rsp { + struct { + __u32 bus_name_len; + __u32 dev_name_len; + __u32 info_driver_name_len; + __u32 info_serial_number_len; + } _present; + + char *bus_name; + char *dev_name; + char *info_driver_name; + char *info_serial_number; + unsigned int n_info_version_fixed; + struct devlink_dl_info_version *info_version_fixed; + unsigned int n_info_version_running; + struct devlink_dl_info_version *info_version_running; + unsigned int n_info_version_stored; + struct devlink_dl_info_version *info_version_stored; +}; + +void devlink_info_get_rsp_free(struct devlink_info_get_rsp *rsp); + +/* + * Get device information, like driver name, hardware and firmware versions etc. 
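As a rough illustration (not taken from the patch itself) of how the generated info-get request helpers above are meant to be driven, a sketch along these lines should work, assuming a struct ynl_sock *ys already set up for the devlink family, <stdio.h> plus the generated devlink-user.h included, and placeholder bus/device names; response fields are only populated when the kernel reported the corresponding attribute:

	struct devlink_info_get_req *req;
	struct devlink_info_get_rsp *rsp;

	req = devlink_info_get_req_alloc();
	devlink_info_get_req_set_bus_name(req, "pci");			/* placeholder */
	devlink_info_get_req_set_dev_name(req, "0000:01:00.0");	/* placeholder */

	rsp = devlink_info_get(ys, req);
	devlink_info_get_req_free(req);
	if (rsp) {
		printf("driver: %s, running versions: %u\n",
		       rsp->info_driver_name, rsp->n_info_version_running);
		devlink_info_get_rsp_free(rsp);
	}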
+ */ +struct devlink_info_get_rsp * +devlink_info_get(struct ynl_sock *ys, struct devlink_info_get_req *req); + +/* DEVLINK_CMD_INFO_GET - dump */ +struct devlink_info_get_list { + struct devlink_info_get_list *next; + struct devlink_info_get_rsp obj __attribute__ ((aligned (8))); +}; + +void devlink_info_get_list_free(struct devlink_info_get_list *rsp); + +struct devlink_info_get_list *devlink_info_get_dump(struct ynl_sock *ys); + +/* ============== DEVLINK_CMD_HEALTH_REPORTER_GET ============== */ +/* DEVLINK_CMD_HEALTH_REPORTER_GET - do */ +struct devlink_health_reporter_get_req { + struct { + __u32 bus_name_len; + __u32 dev_name_len; + __u32 port_index:1; + __u32 health_reporter_name_len; + } _present; + + char *bus_name; + char *dev_name; + __u32 port_index; + char *health_reporter_name; +}; + +static inline struct devlink_health_reporter_get_req * +devlink_health_reporter_get_req_alloc(void) +{ + return calloc(1, sizeof(struct devlink_health_reporter_get_req)); +} +void +devlink_health_reporter_get_req_free(struct devlink_health_reporter_get_req *req); + +static inline void +devlink_health_reporter_get_req_set_bus_name(struct devlink_health_reporter_get_req *req, + const char *bus_name) +{ + free(req->bus_name); + req->_present.bus_name_len = strlen(bus_name); + req->bus_name = malloc(req->_present.bus_name_len + 1); + memcpy(req->bus_name, bus_name, req->_present.bus_name_len); + req->bus_name[req->_present.bus_name_len] = 0; +} +static inline void +devlink_health_reporter_get_req_set_dev_name(struct devlink_health_reporter_get_req *req, + const char *dev_name) +{ + free(req->dev_name); + req->_present.dev_name_len = strlen(dev_name); + req->dev_name = malloc(req->_present.dev_name_len + 1); + memcpy(req->dev_name, dev_name, req->_present.dev_name_len); + req->dev_name[req->_present.dev_name_len] = 0; +} +static inline void +devlink_health_reporter_get_req_set_port_index(struct devlink_health_reporter_get_req *req, + __u32 port_index) +{ + req->_present.port_index = 1; + req->port_index = port_index; +} +static inline void +devlink_health_reporter_get_req_set_health_reporter_name(struct devlink_health_reporter_get_req *req, + const char *health_reporter_name) +{ + free(req->health_reporter_name); + req->_present.health_reporter_name_len = strlen(health_reporter_name); + req->health_reporter_name = malloc(req->_present.health_reporter_name_len + 1); + memcpy(req->health_reporter_name, health_reporter_name, req->_present.health_reporter_name_len); + req->health_reporter_name[req->_present.health_reporter_name_len] = 0; +} + +struct devlink_health_reporter_get_rsp { + struct { + __u32 bus_name_len; + __u32 dev_name_len; + __u32 port_index:1; + __u32 health_reporter_name_len; + } _present; + + char *bus_name; + char *dev_name; + __u32 port_index; + char *health_reporter_name; +}; + +void +devlink_health_reporter_get_rsp_free(struct devlink_health_reporter_get_rsp *rsp); + +/* + * Get health reporter instances. 
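A similar sketch (again illustrative only, with placeholder names) for fetching a single health reporter; the optional __u32 port-index key is shown commented out since it is only needed for per-port reporters:

	struct devlink_health_reporter_get_req *req;
	struct devlink_health_reporter_get_rsp *rsp;

	req = devlink_health_reporter_get_req_alloc();
	devlink_health_reporter_get_req_set_bus_name(req, "pci");		/* placeholder */
	devlink_health_reporter_get_req_set_dev_name(req, "0000:01:00.0");	/* placeholder */
	devlink_health_reporter_get_req_set_health_reporter_name(req, "fw");	/* placeholder */
	/* devlink_health_reporter_get_req_set_port_index(req, 0); - per-port reporters only */

	rsp = devlink_health_reporter_get(ys, req);
	devlink_health_reporter_get_req_free(req);
	if (rsp) {
		printf("reporter: %s\n", rsp->health_reporter_name);
		devlink_health_reporter_get_rsp_free(rsp);
	}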
+ */ +struct devlink_health_reporter_get_rsp * +devlink_health_reporter_get(struct ynl_sock *ys, + struct devlink_health_reporter_get_req *req); + +/* DEVLINK_CMD_HEALTH_REPORTER_GET - dump */ +struct devlink_health_reporter_get_list { + struct devlink_health_reporter_get_list *next; + struct devlink_health_reporter_get_rsp obj __attribute__ ((aligned (8))); +}; + +void +devlink_health_reporter_get_list_free(struct devlink_health_reporter_get_list *rsp); + +struct devlink_health_reporter_get_list * +devlink_health_reporter_get_dump(struct ynl_sock *ys); + +/* ============== DEVLINK_CMD_TRAP_GET ============== */ +/* DEVLINK_CMD_TRAP_GET - do */ +struct devlink_trap_get_req { + struct { + __u32 bus_name_len; + __u32 dev_name_len; + __u32 trap_name_len; + } _present; + + char *bus_name; + char *dev_name; + char *trap_name; +}; + +static inline struct devlink_trap_get_req *devlink_trap_get_req_alloc(void) +{ + return calloc(1, sizeof(struct devlink_trap_get_req)); +} +void devlink_trap_get_req_free(struct devlink_trap_get_req *req); + +static inline void +devlink_trap_get_req_set_bus_name(struct devlink_trap_get_req *req, + const char *bus_name) +{ + free(req->bus_name); + req->_present.bus_name_len = strlen(bus_name); + req->bus_name = malloc(req->_present.bus_name_len + 1); + memcpy(req->bus_name, bus_name, req->_present.bus_name_len); + req->bus_name[req->_present.bus_name_len] = 0; +} +static inline void +devlink_trap_get_req_set_dev_name(struct devlink_trap_get_req *req, + const char *dev_name) +{ + free(req->dev_name); + req->_present.dev_name_len = strlen(dev_name); + req->dev_name = malloc(req->_present.dev_name_len + 1); + memcpy(req->dev_name, dev_name, req->_present.dev_name_len); + req->dev_name[req->_present.dev_name_len] = 0; +} +static inline void +devlink_trap_get_req_set_trap_name(struct devlink_trap_get_req *req, + const char *trap_name) +{ + free(req->trap_name); + req->_present.trap_name_len = strlen(trap_name); + req->trap_name = malloc(req->_present.trap_name_len + 1); + memcpy(req->trap_name, trap_name, req->_present.trap_name_len); + req->trap_name[req->_present.trap_name_len] = 0; +} + +struct devlink_trap_get_rsp { + struct { + __u32 bus_name_len; + __u32 dev_name_len; + __u32 trap_name_len; + } _present; + + char *bus_name; + char *dev_name; + char *trap_name; +}; + +void devlink_trap_get_rsp_free(struct devlink_trap_get_rsp *rsp); + +/* + * Get trap instances. 
+ */ +struct devlink_trap_get_rsp * +devlink_trap_get(struct ynl_sock *ys, struct devlink_trap_get_req *req); + +/* DEVLINK_CMD_TRAP_GET - dump */ +struct devlink_trap_get_list { + struct devlink_trap_get_list *next; + struct devlink_trap_get_rsp obj __attribute__ ((aligned (8))); +}; + +void devlink_trap_get_list_free(struct devlink_trap_get_list *rsp); + +struct devlink_trap_get_list *devlink_trap_get_dump(struct ynl_sock *ys); + +/* ============== DEVLINK_CMD_TRAP_GROUP_GET ============== */ +/* DEVLINK_CMD_TRAP_GROUP_GET - do */ +struct devlink_trap_group_get_req { + struct { + __u32 bus_name_len; + __u32 dev_name_len; + __u32 trap_group_name_len; + } _present; + + char *bus_name; + char *dev_name; + char *trap_group_name; +}; + +static inline struct devlink_trap_group_get_req * +devlink_trap_group_get_req_alloc(void) { - return calloc(1, sizeof(struct devlink_info_get_req)); + return calloc(1, sizeof(struct devlink_trap_group_get_req)); } -void devlink_info_get_req_free(struct devlink_info_get_req *req); +void devlink_trap_group_get_req_free(struct devlink_trap_group_get_req *req); static inline void -devlink_info_get_req_set_bus_name(struct devlink_info_get_req *req, +devlink_trap_group_get_req_set_bus_name(struct devlink_trap_group_get_req *req, + const char *bus_name) +{ + free(req->bus_name); + req->_present.bus_name_len = strlen(bus_name); + req->bus_name = malloc(req->_present.bus_name_len + 1); + memcpy(req->bus_name, bus_name, req->_present.bus_name_len); + req->bus_name[req->_present.bus_name_len] = 0; +} +static inline void +devlink_trap_group_get_req_set_dev_name(struct devlink_trap_group_get_req *req, + const char *dev_name) +{ + free(req->dev_name); + req->_present.dev_name_len = strlen(dev_name); + req->dev_name = malloc(req->_present.dev_name_len + 1); + memcpy(req->dev_name, dev_name, req->_present.dev_name_len); + req->dev_name[req->_present.dev_name_len] = 0; +} +static inline void +devlink_trap_group_get_req_set_trap_group_name(struct devlink_trap_group_get_req *req, + const char *trap_group_name) +{ + free(req->trap_group_name); + req->_present.trap_group_name_len = strlen(trap_group_name); + req->trap_group_name = malloc(req->_present.trap_group_name_len + 1); + memcpy(req->trap_group_name, trap_group_name, req->_present.trap_group_name_len); + req->trap_group_name[req->_present.trap_group_name_len] = 0; +} + +struct devlink_trap_group_get_rsp { + struct { + __u32 bus_name_len; + __u32 dev_name_len; + __u32 trap_group_name_len; + } _present; + + char *bus_name; + char *dev_name; + char *trap_group_name; +}; + +void devlink_trap_group_get_rsp_free(struct devlink_trap_group_get_rsp *rsp); + +/* + * Get trap group instances. 
+ */ +struct devlink_trap_group_get_rsp * +devlink_trap_group_get(struct ynl_sock *ys, + struct devlink_trap_group_get_req *req); + +/* DEVLINK_CMD_TRAP_GROUP_GET - dump */ +struct devlink_trap_group_get_list { + struct devlink_trap_group_get_list *next; + struct devlink_trap_group_get_rsp obj __attribute__ ((aligned (8))); +}; + +void devlink_trap_group_get_list_free(struct devlink_trap_group_get_list *rsp); + +struct devlink_trap_group_get_list * +devlink_trap_group_get_dump(struct ynl_sock *ys); + +/* ============== DEVLINK_CMD_TRAP_POLICER_GET ============== */ +/* DEVLINK_CMD_TRAP_POLICER_GET - do */ +struct devlink_trap_policer_get_req { + struct { + __u32 bus_name_len; + __u32 dev_name_len; + __u32 trap_policer_id:1; + } _present; + + char *bus_name; + char *dev_name; + __u32 trap_policer_id; +}; + +static inline struct devlink_trap_policer_get_req * +devlink_trap_policer_get_req_alloc(void) +{ + return calloc(1, sizeof(struct devlink_trap_policer_get_req)); +} +void +devlink_trap_policer_get_req_free(struct devlink_trap_policer_get_req *req); + +static inline void +devlink_trap_policer_get_req_set_bus_name(struct devlink_trap_policer_get_req *req, + const char *bus_name) +{ + free(req->bus_name); + req->_present.bus_name_len = strlen(bus_name); + req->bus_name = malloc(req->_present.bus_name_len + 1); + memcpy(req->bus_name, bus_name, req->_present.bus_name_len); + req->bus_name[req->_present.bus_name_len] = 0; +} +static inline void +devlink_trap_policer_get_req_set_dev_name(struct devlink_trap_policer_get_req *req, + const char *dev_name) +{ + free(req->dev_name); + req->_present.dev_name_len = strlen(dev_name); + req->dev_name = malloc(req->_present.dev_name_len + 1); + memcpy(req->dev_name, dev_name, req->_present.dev_name_len); + req->dev_name[req->_present.dev_name_len] = 0; +} +static inline void +devlink_trap_policer_get_req_set_trap_policer_id(struct devlink_trap_policer_get_req *req, + __u32 trap_policer_id) +{ + req->_present.trap_policer_id = 1; + req->trap_policer_id = trap_policer_id; +} + +struct devlink_trap_policer_get_rsp { + struct { + __u32 bus_name_len; + __u32 dev_name_len; + __u32 trap_policer_id:1; + } _present; + + char *bus_name; + char *dev_name; + __u32 trap_policer_id; +}; + +void +devlink_trap_policer_get_rsp_free(struct devlink_trap_policer_get_rsp *rsp); + +/* + * Get trap policer instances. 
+ */ +struct devlink_trap_policer_get_rsp * +devlink_trap_policer_get(struct ynl_sock *ys, + struct devlink_trap_policer_get_req *req); + +/* DEVLINK_CMD_TRAP_POLICER_GET - dump */ +struct devlink_trap_policer_get_list { + struct devlink_trap_policer_get_list *next; + struct devlink_trap_policer_get_rsp obj __attribute__ ((aligned (8))); +}; + +void +devlink_trap_policer_get_list_free(struct devlink_trap_policer_get_list *rsp); + +struct devlink_trap_policer_get_list * +devlink_trap_policer_get_dump(struct ynl_sock *ys); + +/* ============== DEVLINK_CMD_RATE_GET ============== */ +/* DEVLINK_CMD_RATE_GET - do */ +struct devlink_rate_get_req { + struct { + __u32 bus_name_len; + __u32 dev_name_len; + __u32 port_index:1; + __u32 rate_node_name_len; + } _present; + + char *bus_name; + char *dev_name; + __u32 port_index; + char *rate_node_name; +}; + +static inline struct devlink_rate_get_req *devlink_rate_get_req_alloc(void) +{ + return calloc(1, sizeof(struct devlink_rate_get_req)); +} +void devlink_rate_get_req_free(struct devlink_rate_get_req *req); + +static inline void +devlink_rate_get_req_set_bus_name(struct devlink_rate_get_req *req, const char *bus_name) { free(req->bus_name); @@ -169,7 +1251,7 @@ devlink_info_get_req_set_bus_name(struct devlink_info_get_req *req, req->bus_name[req->_present.bus_name_len] = 0; } static inline void -devlink_info_get_req_set_dev_name(struct devlink_info_get_req *req, +devlink_rate_get_req_set_dev_name(struct devlink_rate_get_req *req, const char *dev_name) { free(req->dev_name); @@ -178,43 +1260,204 @@ devlink_info_get_req_set_dev_name(struct devlink_info_get_req *req, memcpy(req->dev_name, dev_name, req->_present.dev_name_len); req->dev_name[req->_present.dev_name_len] = 0; } +static inline void +devlink_rate_get_req_set_port_index(struct devlink_rate_get_req *req, + __u32 port_index) +{ + req->_present.port_index = 1; + req->port_index = port_index; +} +static inline void +devlink_rate_get_req_set_rate_node_name(struct devlink_rate_get_req *req, + const char *rate_node_name) +{ + free(req->rate_node_name); + req->_present.rate_node_name_len = strlen(rate_node_name); + req->rate_node_name = malloc(req->_present.rate_node_name_len + 1); + memcpy(req->rate_node_name, rate_node_name, req->_present.rate_node_name_len); + req->rate_node_name[req->_present.rate_node_name_len] = 0; +} -struct devlink_info_get_rsp { +struct devlink_rate_get_rsp { struct { __u32 bus_name_len; __u32 dev_name_len; - __u32 info_driver_name_len; - __u32 info_serial_number_len; + __u32 port_index:1; + __u32 rate_node_name_len; } _present; char *bus_name; char *dev_name; - char *info_driver_name; - char *info_serial_number; - unsigned int n_info_version_fixed; - struct devlink_dl_info_version *info_version_fixed; - unsigned int n_info_version_running; - struct devlink_dl_info_version *info_version_running; - unsigned int n_info_version_stored; - struct devlink_dl_info_version *info_version_stored; + __u32 port_index; + char *rate_node_name; }; -void devlink_info_get_rsp_free(struct devlink_info_get_rsp *rsp); +void devlink_rate_get_rsp_free(struct devlink_rate_get_rsp *rsp); /* - * Get device information, like driver name, hardware and firmware versions etc. + * Get rate instances. 
*/ -struct devlink_info_get_rsp * -devlink_info_get(struct ynl_sock *ys, struct devlink_info_get_req *req); +struct devlink_rate_get_rsp * +devlink_rate_get(struct ynl_sock *ys, struct devlink_rate_get_req *req); -/* DEVLINK_CMD_INFO_GET - dump */ -struct devlink_info_get_list { - struct devlink_info_get_list *next; - struct devlink_info_get_rsp obj __attribute__ ((aligned (8))); +/* DEVLINK_CMD_RATE_GET - dump */ +struct devlink_rate_get_list { + struct devlink_rate_get_list *next; + struct devlink_rate_get_rsp obj __attribute__ ((aligned (8))); }; -void devlink_info_get_list_free(struct devlink_info_get_list *rsp); +void devlink_rate_get_list_free(struct devlink_rate_get_list *rsp); -struct devlink_info_get_list *devlink_info_get_dump(struct ynl_sock *ys); +struct devlink_rate_get_list *devlink_rate_get_dump(struct ynl_sock *ys); + +/* ============== DEVLINK_CMD_LINECARD_GET ============== */ +/* DEVLINK_CMD_LINECARD_GET - do */ +struct devlink_linecard_get_req { + struct { + __u32 bus_name_len; + __u32 dev_name_len; + __u32 linecard_index:1; + } _present; + + char *bus_name; + char *dev_name; + __u32 linecard_index; +}; + +static inline struct devlink_linecard_get_req * +devlink_linecard_get_req_alloc(void) +{ + return calloc(1, sizeof(struct devlink_linecard_get_req)); +} +void devlink_linecard_get_req_free(struct devlink_linecard_get_req *req); + +static inline void +devlink_linecard_get_req_set_bus_name(struct devlink_linecard_get_req *req, + const char *bus_name) +{ + free(req->bus_name); + req->_present.bus_name_len = strlen(bus_name); + req->bus_name = malloc(req->_present.bus_name_len + 1); + memcpy(req->bus_name, bus_name, req->_present.bus_name_len); + req->bus_name[req->_present.bus_name_len] = 0; +} +static inline void +devlink_linecard_get_req_set_dev_name(struct devlink_linecard_get_req *req, + const char *dev_name) +{ + free(req->dev_name); + req->_present.dev_name_len = strlen(dev_name); + req->dev_name = malloc(req->_present.dev_name_len + 1); + memcpy(req->dev_name, dev_name, req->_present.dev_name_len); + req->dev_name[req->_present.dev_name_len] = 0; +} +static inline void +devlink_linecard_get_req_set_linecard_index(struct devlink_linecard_get_req *req, + __u32 linecard_index) +{ + req->_present.linecard_index = 1; + req->linecard_index = linecard_index; +} + +struct devlink_linecard_get_rsp { + struct { + __u32 bus_name_len; + __u32 dev_name_len; + __u32 linecard_index:1; + } _present; + + char *bus_name; + char *dev_name; + __u32 linecard_index; +}; + +void devlink_linecard_get_rsp_free(struct devlink_linecard_get_rsp *rsp); + +/* + * Get line card instances. 
+ */ +struct devlink_linecard_get_rsp * +devlink_linecard_get(struct ynl_sock *ys, struct devlink_linecard_get_req *req); + +/* DEVLINK_CMD_LINECARD_GET - dump */ +struct devlink_linecard_get_list { + struct devlink_linecard_get_list *next; + struct devlink_linecard_get_rsp obj __attribute__ ((aligned (8))); +}; + +void devlink_linecard_get_list_free(struct devlink_linecard_get_list *rsp); + +struct devlink_linecard_get_list * +devlink_linecard_get_dump(struct ynl_sock *ys); + +/* ============== DEVLINK_CMD_SELFTESTS_GET ============== */ +/* DEVLINK_CMD_SELFTESTS_GET - do */ +struct devlink_selftests_get_req { + struct { + __u32 bus_name_len; + __u32 dev_name_len; + } _present; + + char *bus_name; + char *dev_name; +}; + +static inline struct devlink_selftests_get_req * +devlink_selftests_get_req_alloc(void) +{ + return calloc(1, sizeof(struct devlink_selftests_get_req)); +} +void devlink_selftests_get_req_free(struct devlink_selftests_get_req *req); + +static inline void +devlink_selftests_get_req_set_bus_name(struct devlink_selftests_get_req *req, + const char *bus_name) +{ + free(req->bus_name); + req->_present.bus_name_len = strlen(bus_name); + req->bus_name = malloc(req->_present.bus_name_len + 1); + memcpy(req->bus_name, bus_name, req->_present.bus_name_len); + req->bus_name[req->_present.bus_name_len] = 0; +} +static inline void +devlink_selftests_get_req_set_dev_name(struct devlink_selftests_get_req *req, + const char *dev_name) +{ + free(req->dev_name); + req->_present.dev_name_len = strlen(dev_name); + req->dev_name = malloc(req->_present.dev_name_len + 1); + memcpy(req->dev_name, dev_name, req->_present.dev_name_len); + req->dev_name[req->_present.dev_name_len] = 0; +} + +struct devlink_selftests_get_rsp { + struct { + __u32 bus_name_len; + __u32 dev_name_len; + } _present; + + char *bus_name; + char *dev_name; +}; + +void devlink_selftests_get_rsp_free(struct devlink_selftests_get_rsp *rsp); + +/* + * Get device selftest instances. + */ +struct devlink_selftests_get_rsp * +devlink_selftests_get(struct ynl_sock *ys, + struct devlink_selftests_get_req *req); + +/* DEVLINK_CMD_SELFTESTS_GET - dump */ +struct devlink_selftests_get_list { + struct devlink_selftests_get_list *next; + struct devlink_selftests_get_rsp obj __attribute__ ((aligned (8))); +}; + +void devlink_selftests_get_list_free(struct devlink_selftests_get_list *rsp); + +struct devlink_selftests_get_list * +devlink_selftests_get_dump(struct ynl_sock *ys); #endif /* _LINUX_DEVLINK_GEN_H */ -- cgit v1.2.3 From ddff283280ba0a6b84929f357859f7ea74cecfbe Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 11 Aug 2023 17:57:09 +0200 Subject: devlink: remove duplicate temporary netlink callback prototypes Remove the duplicate temporary netlink callback prototype as the generated ones are already in place. 
Signed-off-by: Jiri Pirko Acked-by: Jakub Kicinski Link: https://lore.kernel.org/r/20230811155714.1736405-9-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- net/devlink/devl_internal.h | 48 --------------------------------------------- 1 file changed, 48 deletions(-) (limited to 'net') diff --git a/net/devlink/devl_internal.h b/net/devlink/devl_internal.h index f8af6ffdbb3a..7caa385703de 100644 --- a/net/devlink/devl_internal.h +++ b/net/devlink/devl_internal.h @@ -185,50 +185,11 @@ int devlink_rate_nodes_check(struct devlink *devlink, u16 mode, struct netlink_ext_ack *extack); /* Devlink nl cmds */ -int devlink_nl_pre_doit_port(const struct genl_split_ops *ops, - struct sk_buff *skb, struct genl_info *info); -int devlink_nl_pre_doit_port_optional(const struct genl_split_ops *ops, - struct sk_buff *skb, - struct genl_info *info); int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_eswitch_get_doit(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_flash_update(struct sk_buff *skb, struct genl_info *info); -int devlink_nl_selftests_get_doit(struct sk_buff *skb, struct genl_info *info); -int devlink_nl_selftests_get_dumpit(struct sk_buff *skb, - struct netlink_callback *cb); int devlink_nl_cmd_selftests_run(struct sk_buff *skb, struct genl_info *info); -int devlink_nl_port_get_doit(struct sk_buff *skb, struct genl_info *info); -int devlink_nl_port_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); -int devlink_nl_rate_get_doit(struct sk_buff *skb, struct genl_info *info); -int devlink_nl_rate_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); -int devlink_nl_linecard_get_doit(struct sk_buff *skb, struct genl_info *info); -int devlink_nl_linecard_get_dumpit(struct sk_buff *skb, - struct netlink_callback *cb); -int devlink_nl_sb_get_doit(struct sk_buff *skb, struct genl_info *info); -int devlink_nl_sb_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); -int devlink_nl_sb_pool_get_doit(struct sk_buff *skb, struct genl_info *info); -int devlink_nl_sb_pool_get_dumpit(struct sk_buff *skb, - struct netlink_callback *cb); -int devlink_nl_sb_port_pool_get_doit(struct sk_buff *skb, - struct genl_info *info); -int devlink_nl_sb_port_pool_get_dumpit(struct sk_buff *skb, - struct netlink_callback *cb); -int devlink_nl_sb_tc_pool_bind_get_doit(struct sk_buff *skb, - struct genl_info *info); -int devlink_nl_sb_tc_pool_bind_get_dumpit(struct sk_buff *skb, - struct netlink_callback *cb); -int devlink_nl_param_get_doit(struct sk_buff *skb, - struct genl_info *info); -int devlink_nl_param_get_dumpit(struct sk_buff *skb, - struct netlink_callback *cb); -int devlink_nl_region_get_doit(struct sk_buff *skb, struct genl_info *info); -int devlink_nl_region_get_dumpit(struct sk_buff *skb, - struct netlink_callback *cb); -int devlink_nl_health_reporter_get_doit(struct sk_buff *skb, - struct genl_info *info); -int devlink_nl_health_reporter_get_dumpit(struct sk_buff *skb, - struct netlink_callback *cb); int devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_health_reporter_recover_doit(struct sk_buff *skb, @@ -241,12 +202,3 @@ int devlink_nl_cmd_health_reporter_dump_clear_doit(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_health_reporter_test_doit(struct sk_buff *skb, struct genl_info *info); -int devlink_nl_trap_get_doit(struct sk_buff *skb, struct genl_info *info); -int 
devlink_nl_trap_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); -int devlink_nl_trap_group_get_dumpit(struct sk_buff *skb, - struct netlink_callback *cb); -int devlink_nl_trap_group_get_doit(struct sk_buff *skb, struct genl_info *info); -int devlink_nl_trap_policer_get_doit(struct sk_buff *skb, - struct genl_info *info); -int devlink_nl_trap_policer_get_dumpit(struct sk_buff *skb, - struct netlink_callback *cb); -- cgit v1.2.3 From 833e479d330c29aa4cfd35b8647d5993d5a78643 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 11 Aug 2023 17:57:10 +0200 Subject: devlink: remove converted commands from small ops As the commands are already defined in split ops, remove them from small ops. Signed-off-by: Jiri Pirko Acked-by: Jakub Kicinski Link: https://lore.kernel.org/r/20230811155714.1736405-10-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- net/devlink/devl_internal.h | 2 +- net/devlink/leftover.c | 99 +-------------------------------------------- 2 files changed, 3 insertions(+), 98 deletions(-) (limited to 'net') diff --git a/net/devlink/devl_internal.h b/net/devlink/devl_internal.h index 7caa385703de..eb1d5066c73f 100644 --- a/net/devlink/devl_internal.h +++ b/net/devlink/devl_internal.h @@ -118,7 +118,7 @@ typedef int devlink_nl_dump_one_func_t(struct sk_buff *msg, struct netlink_callback *cb, int flags); -extern const struct genl_small_ops devlink_nl_small_ops[54]; +extern const struct genl_small_ops devlink_nl_small_ops[40]; struct devlink * devlink_get_from_attrs_lock(struct net *net, struct nlattr **attrs); diff --git a/net/devlink/leftover.c b/net/devlink/leftover.c index 883c65545d26..3883a90d32bb 100644 --- a/net/devlink/leftover.c +++ b/net/devlink/leftover.c @@ -6307,15 +6307,7 @@ static int devlink_nl_cmd_trap_policer_set_doit(struct sk_buff *skb, return devlink_trap_policer_set(devlink, policer_item, info); } -const struct genl_small_ops devlink_nl_small_ops[54] = { - { - .cmd = DEVLINK_CMD_PORT_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_port_get_doit, - .dumpit = devlink_nl_port_get_dumpit, - .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, - /* can be retrieved by unprivileged users */ - }, +const struct genl_small_ops devlink_nl_small_ops[40] = { { .cmd = DEVLINK_CMD_PORT_SET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, @@ -6323,12 +6315,6 @@ const struct genl_small_ops devlink_nl_small_ops[54] = { .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, }, - { - .cmd = DEVLINK_CMD_RATE_GET, - .doit = devlink_nl_rate_get_doit, - .dumpit = devlink_nl_rate_get_dumpit, - /* can be retrieved by unprivileged users */ - }, { .cmd = DEVLINK_CMD_RATE_SET, .doit = devlink_nl_cmd_rate_set_doit, @@ -6369,45 +6355,18 @@ const struct genl_small_ops devlink_nl_small_ops[54] = { .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, }, - { - .cmd = DEVLINK_CMD_LINECARD_GET, - .doit = devlink_nl_linecard_get_doit, - .dumpit = devlink_nl_linecard_get_dumpit, - /* can be retrieved by unprivileged users */ - }, + { .cmd = DEVLINK_CMD_LINECARD_SET, .doit = devlink_nl_cmd_linecard_set_doit, .flags = GENL_ADMIN_PERM, }, - { - .cmd = DEVLINK_CMD_SB_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_sb_get_doit, - .dumpit = devlink_nl_sb_get_dumpit, - /* can be retrieved by unprivileged users */ - }, - { - .cmd = DEVLINK_CMD_SB_POOL_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_sb_pool_get_doit, - 
.dumpit = devlink_nl_sb_pool_get_dumpit, - /* can be retrieved by unprivileged users */ - }, { .cmd = DEVLINK_CMD_SB_POOL_SET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_sb_pool_set_doit, .flags = GENL_ADMIN_PERM, }, - { - .cmd = DEVLINK_CMD_SB_PORT_POOL_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_sb_port_pool_get_doit, - .dumpit = devlink_nl_sb_port_pool_get_dumpit, - .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, - /* can be retrieved by unprivileged users */ - }, { .cmd = DEVLINK_CMD_SB_PORT_POOL_SET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, @@ -6415,14 +6374,6 @@ const struct genl_small_ops devlink_nl_small_ops[54] = { .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, }, - { - .cmd = DEVLINK_CMD_SB_TC_POOL_BIND_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_sb_tc_pool_bind_get_doit, - .dumpit = devlink_nl_sb_tc_pool_bind_get_dumpit, - .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, - /* can be retrieved by unprivileged users */ - }, { .cmd = DEVLINK_CMD_SB_TC_POOL_BIND_SET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, @@ -6496,13 +6447,6 @@ const struct genl_small_ops devlink_nl_small_ops[54] = { .doit = devlink_nl_cmd_reload, .flags = GENL_ADMIN_PERM, }, - { - .cmd = DEVLINK_CMD_PARAM_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_param_get_doit, - .dumpit = devlink_nl_param_get_dumpit, - /* can be retrieved by unprivileged users */ - }, { .cmd = DEVLINK_CMD_PARAM_SET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, @@ -6524,13 +6468,6 @@ const struct genl_small_ops devlink_nl_small_ops[54] = { .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, }, - { - .cmd = DEVLINK_CMD_REGION_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_region_get_doit, - .dumpit = devlink_nl_region_get_dumpit, - .flags = GENL_ADMIN_PERM, - }, { .cmd = DEVLINK_CMD_REGION_NEW, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, @@ -6550,14 +6487,6 @@ const struct genl_small_ops devlink_nl_small_ops[54] = { .dumpit = devlink_nl_cmd_region_read_dumpit, .flags = GENL_ADMIN_PERM, }, - { - .cmd = DEVLINK_CMD_HEALTH_REPORTER_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_health_reporter_get_doit, - .dumpit = devlink_nl_health_reporter_get_dumpit, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT, - /* can be retrieved by unprivileged users */ - }, { .cmd = DEVLINK_CMD_HEALTH_REPORTER_SET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, @@ -6606,45 +6535,21 @@ const struct genl_small_ops devlink_nl_small_ops[54] = { .doit = devlink_nl_cmd_flash_update, .flags = GENL_ADMIN_PERM, }, - { - .cmd = DEVLINK_CMD_TRAP_GET, - .doit = devlink_nl_trap_get_doit, - .dumpit = devlink_nl_trap_get_dumpit, - /* can be retrieved by unprivileged users */ - }, { .cmd = DEVLINK_CMD_TRAP_SET, .doit = devlink_nl_cmd_trap_set_doit, .flags = GENL_ADMIN_PERM, }, - { - .cmd = DEVLINK_CMD_TRAP_GROUP_GET, - .doit = devlink_nl_trap_group_get_doit, - .dumpit = devlink_nl_trap_group_get_dumpit, - /* can be retrieved by unprivileged users */ - }, { .cmd = DEVLINK_CMD_TRAP_GROUP_SET, .doit = devlink_nl_cmd_trap_group_set_doit, .flags = GENL_ADMIN_PERM, }, - { - .cmd = DEVLINK_CMD_TRAP_POLICER_GET, - .doit = devlink_nl_trap_policer_get_doit, - .dumpit 
= devlink_nl_trap_policer_get_dumpit, - /* can be retrieved by unprivileged users */ - }, { .cmd = DEVLINK_CMD_TRAP_POLICER_SET, .doit = devlink_nl_cmd_trap_policer_set_doit, .flags = GENL_ADMIN_PERM, }, - { - .cmd = DEVLINK_CMD_SELFTESTS_GET, - .doit = devlink_nl_selftests_get_doit, - .dumpit = devlink_nl_selftests_get_dumpit, - /* can be retrieved by unprivileged users */ - }, { .cmd = DEVLINK_CMD_SELFTESTS_RUN, .doit = devlink_nl_cmd_selftests_run, -- cgit v1.2.3 From 4a1b5aa8b5c7433ad6e0c8b52469980ae94a4398 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 11 Aug 2023 17:57:11 +0200 Subject: devlink: allow user to narrow per-instance dumps by passing handle attrs For SFs, one devlink instance per SF is created. There might be thousands of these on a single host. When a user needs to know port handle for specific SF, he needs to dump all devlink ports on the host which does not scale good. Allow user to pass devlink handle attributes alongside the dump command and dump only objects which are under selected devlink instance. Example: $ devlink port show auxiliary/mlx5_core.eth.0/65535: type eth netdev eth2 flavour physical port 0 splittable false auxiliary/mlx5_core.eth.1/131071: type eth netdev eth3 flavour physical port 1 splittable false $ devlink port show auxiliary/mlx5_core.eth.0 auxiliary/mlx5_core.eth.0/65535: type eth netdev eth2 flavour physical port 0 splittable false $ devlink port show auxiliary/mlx5_core.eth.1 auxiliary/mlx5_core.eth.1/131071: type eth netdev eth3 flavour physical port 1 splittable false Signed-off-by: Jiri Pirko Acked-by: Jakub Kicinski Link: https://lore.kernel.org/r/20230811155714.1736405-11-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- net/devlink/netlink.c | 43 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 40 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/devlink/netlink.c b/net/devlink/netlink.c index 47e44fb45815..a9b43b0c5959 100644 --- a/net/devlink/netlink.c +++ b/net/devlink/netlink.c @@ -170,8 +170,30 @@ void devlink_nl_post_doit(const struct genl_split_ops *ops, devlink_put(devlink); } -int devlink_nl_dumpit(struct sk_buff *msg, struct netlink_callback *cb, - devlink_nl_dump_one_func_t *dump_one) +static int devlink_nl_inst_single_dumpit(struct sk_buff *msg, + struct netlink_callback *cb, int flags, + devlink_nl_dump_one_func_t *dump_one, + struct nlattr **attrs) +{ + struct devlink *devlink; + int err; + + devlink = devlink_get_from_attrs_lock(sock_net(msg->sk), attrs); + if (IS_ERR(devlink)) + return PTR_ERR(devlink); + err = dump_one(msg, devlink, cb, flags | NLM_F_DUMP_FILTERED); + + devl_unlock(devlink); + devlink_put(devlink); + + if (err != -EMSGSIZE) + return err; + return msg->len; +} + +static int devlink_nl_inst_iter_dumpit(struct sk_buff *msg, + struct netlink_callback *cb, int flags, + devlink_nl_dump_one_func_t *dump_one) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink *devlink; @@ -182,7 +204,7 @@ int devlink_nl_dumpit(struct sk_buff *msg, struct netlink_callback *cb, devl_lock(devlink); if (devl_is_registered(devlink)) - err = dump_one(msg, devlink, cb, NLM_F_MULTI); + err = dump_one(msg, devlink, cb, flags); else err = 0; @@ -203,6 +225,21 @@ int devlink_nl_dumpit(struct sk_buff *msg, struct netlink_callback *cb, return msg->len; } +int devlink_nl_dumpit(struct sk_buff *msg, struct netlink_callback *cb, + devlink_nl_dump_one_func_t *dump_one) +{ + const struct genl_dumpit_info *info = genl_dumpit_info(cb); + struct nlattr **attrs = info->attrs; + int flags 
= NLM_F_MULTI; + + if (attrs && + (attrs[DEVLINK_ATTR_BUS_NAME] || attrs[DEVLINK_ATTR_DEV_NAME])) + return devlink_nl_inst_single_dumpit(msg, cb, flags, dump_one, + attrs); + else + return devlink_nl_inst_iter_dumpit(msg, cb, flags, dump_one); +} + struct genl_family devlink_nl_family __ro_after_init = { .name = DEVLINK_GENL_NAME, .version = DEVLINK_GENL_VERSION, -- cgit v1.2.3 From 34493336e7d3d38a8ec6472243baa6660214d990 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 11 Aug 2023 17:57:12 +0200 Subject: netlink: specs: devlink: extend per-instance dump commands to accept instance attributes Extend per-instance dump command definitions to accept instance attributes. Allow parsing of devlink handle attributes so they could be used for instance selection. Re-generate the related code. Signed-off-by: Jiri Pirko Acked-by: Jakub Kicinski Link: https://lore.kernel.org/r/20230811155714.1736405-12-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/devlink.yaml | 39 ++- net/devlink/netlink_gen.c | 169 +++++++--- tools/net/ynl/generated/devlink-user.c | 123 ++++++- tools/net/ynl/generated/devlink-user.h | 546 ++++++++++++++++++++++++++++++- 4 files changed, 799 insertions(+), 78 deletions(-) (limited to 'net') diff --git a/Documentation/netlink/specs/devlink.yaml b/Documentation/netlink/specs/devlink.yaml index 4f2aa48017a7..6dbebeebd63e 100644 --- a/Documentation/netlink/specs/devlink.yaml +++ b/Documentation/netlink/specs/devlink.yaml @@ -284,7 +284,6 @@ operations: attribute-set: devlink dont-validate: - strict - - dump do: pre: devlink-nl-pre-doit-port @@ -299,6 +298,8 @@ operations: value: 7 attributes: *port-id-attrs dump: + request: + attributes: *dev-id-attrs reply: value: 3 # due to a bug, port dump returns DEVLINK_CMD_NEW attributes: *port-id-attrs @@ -311,7 +312,6 @@ operations: attribute-set: devlink dont-validate: - strict - - dump do: pre: devlink-nl-pre-doit @@ -326,6 +326,8 @@ operations: value: 11 attributes: *sb-id-attrs dump: + request: + attributes: *dev-id-attrs reply: *sb-get-reply # TODO: fill in the operations in between @@ -336,7 +338,6 @@ operations: attribute-set: devlink dont-validate: - strict - - dump do: pre: devlink-nl-pre-doit @@ -352,6 +353,8 @@ operations: value: 15 attributes: *sb-pool-id-attrs dump: + request: + attributes: *dev-id-attrs reply: *sb-pool-get-reply # TODO: fill in the operations in between @@ -362,7 +365,6 @@ operations: attribute-set: devlink dont-validate: - strict - - dump do: pre: devlink-nl-pre-doit-port @@ -379,6 +381,8 @@ operations: value: 19 attributes: *sb-port-pool-id-attrs dump: + request: + attributes: *dev-id-attrs reply: *sb-port-pool-get-reply # TODO: fill in the operations in between @@ -389,7 +393,6 @@ operations: attribute-set: devlink dont-validate: - strict - - dump do: pre: devlink-nl-pre-doit-port @@ -407,6 +410,8 @@ operations: value: 23 attributes: *sb-tc-pool-bind-id-attrs dump: + request: + attributes: *dev-id-attrs reply: *sb-tc-pool-bind-get-reply # TODO: fill in the operations in between @@ -417,7 +422,6 @@ operations: attribute-set: devlink dont-validate: - strict - - dump do: pre: devlink-nl-pre-doit @@ -432,6 +436,8 @@ operations: value: 38 attributes: *param-id-attrs dump: + request: + attributes: *dev-id-attrs reply: *param-get-reply # TODO: fill in the operations in between @@ -442,7 +448,6 @@ operations: attribute-set: devlink dont-validate: - strict - - dump do: pre: devlink-nl-pre-doit-port-optional @@ -458,6 +463,8 @@ operations: value: 42 attributes: *region-id-attrs dump: + 
request: + attributes: *dev-id-attrs reply: *region-get-reply # TODO: fill in the operations in between @@ -495,7 +502,6 @@ operations: attribute-set: devlink dont-validate: - strict - - dump do: pre: devlink-nl-pre-doit-port-optional @@ -509,6 +515,8 @@ operations: reply: &health-reporter-get-reply attributes: *health-reporter-id-attrs dump: + request: + attributes: *dev-id-attrs reply: *health-reporter-get-reply # TODO: fill in the operations in between @@ -519,7 +527,6 @@ operations: attribute-set: devlink dont-validate: - strict - - dump do: pre: devlink-nl-pre-doit @@ -534,6 +541,8 @@ operations: value: 61 attributes: *trap-id-attrs dump: + request: + attributes: *dev-id-attrs reply: *trap-get-reply # TODO: fill in the operations in between @@ -544,7 +553,6 @@ operations: attribute-set: devlink dont-validate: - strict - - dump do: pre: devlink-nl-pre-doit @@ -559,6 +567,8 @@ operations: value: 65 attributes: *trap-group-id-attrs dump: + request: + attributes: *dev-id-attrs reply: *trap-group-get-reply # TODO: fill in the operations in between @@ -569,7 +579,6 @@ operations: attribute-set: devlink dont-validate: - strict - - dump do: pre: devlink-nl-pre-doit @@ -584,6 +593,8 @@ operations: value: 69 attributes: *trap-policer-id-attrs dump: + request: + attributes: *dev-id-attrs reply: *trap-policer-get-reply # TODO: fill in the operations in between @@ -594,7 +605,6 @@ operations: attribute-set: devlink dont-validate: - strict - - dump do: pre: devlink-nl-pre-doit @@ -610,6 +620,8 @@ operations: value: 74 attributes: *rate-id-attrs dump: + request: + attributes: *dev-id-attrs reply: *rate-get-reply # TODO: fill in the operations in between @@ -620,7 +632,6 @@ operations: attribute-set: devlink dont-validate: - strict - - dump do: pre: devlink-nl-pre-doit @@ -635,6 +646,8 @@ operations: value: 78 attributes: *linecard-id-attrs dump: + request: + attributes: *dev-id-attrs reply: *linecard-get-reply # TODO: fill in the operations in between diff --git a/net/devlink/netlink_gen.c b/net/devlink/netlink_gen.c index e384b91b2e3b..5f3e980c6a7c 100644 --- a/net/devlink/netlink_gen.c +++ b/net/devlink/netlink_gen.c @@ -17,29 +17,47 @@ static const struct nla_policy devlink_get_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] }; /* DEVLINK_CMD_PORT_GET - do */ -static const struct nla_policy devlink_port_get_nl_policy[DEVLINK_ATTR_PORT_INDEX + 1] = { +static const struct nla_policy devlink_port_get_do_nl_policy[DEVLINK_ATTR_PORT_INDEX + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, }, }; +/* DEVLINK_CMD_PORT_GET - dump */ +static const struct nla_policy devlink_port_get_dump_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, +}; + /* DEVLINK_CMD_SB_GET - do */ -static const struct nla_policy devlink_sb_get_nl_policy[DEVLINK_ATTR_SB_INDEX + 1] = { +static const struct nla_policy devlink_sb_get_do_nl_policy[DEVLINK_ATTR_SB_INDEX + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_SB_INDEX] = { .type = NLA_U32, }, }; +/* DEVLINK_CMD_SB_GET - dump */ +static const struct nla_policy devlink_sb_get_dump_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, +}; + /* DEVLINK_CMD_SB_POOL_GET - do */ -static const struct 
nla_policy devlink_sb_pool_get_nl_policy[DEVLINK_ATTR_SB_POOL_INDEX + 1] = { +static const struct nla_policy devlink_sb_pool_get_do_nl_policy[DEVLINK_ATTR_SB_POOL_INDEX + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_SB_INDEX] = { .type = NLA_U32, }, [DEVLINK_ATTR_SB_POOL_INDEX] = { .type = NLA_U16, }, }; +/* DEVLINK_CMD_SB_POOL_GET - dump */ +static const struct nla_policy devlink_sb_pool_get_dump_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, +}; + /* DEVLINK_CMD_SB_PORT_POOL_GET - do */ -static const struct nla_policy devlink_sb_port_pool_get_nl_policy[DEVLINK_ATTR_SB_POOL_INDEX + 1] = { +static const struct nla_policy devlink_sb_port_pool_get_do_nl_policy[DEVLINK_ATTR_SB_POOL_INDEX + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, }, @@ -47,8 +65,14 @@ static const struct nla_policy devlink_sb_port_pool_get_nl_policy[DEVLINK_ATTR_S [DEVLINK_ATTR_SB_POOL_INDEX] = { .type = NLA_U16, }, }; +/* DEVLINK_CMD_SB_PORT_POOL_GET - dump */ +static const struct nla_policy devlink_sb_port_pool_get_dump_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, +}; + /* DEVLINK_CMD_SB_TC_POOL_BIND_GET - do */ -static const struct nla_policy devlink_sb_tc_pool_bind_get_nl_policy[DEVLINK_ATTR_SB_TC_INDEX + 1] = { +static const struct nla_policy devlink_sb_tc_pool_bind_get_do_nl_policy[DEVLINK_ATTR_SB_TC_INDEX + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, }, @@ -57,21 +81,39 @@ static const struct nla_policy devlink_sb_tc_pool_bind_get_nl_policy[DEVLINK_ATT [DEVLINK_ATTR_SB_TC_INDEX] = { .type = NLA_U16, }, }; +/* DEVLINK_CMD_SB_TC_POOL_BIND_GET - dump */ +static const struct nla_policy devlink_sb_tc_pool_bind_get_dump_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, +}; + /* DEVLINK_CMD_PARAM_GET - do */ -static const struct nla_policy devlink_param_get_nl_policy[DEVLINK_ATTR_PARAM_NAME + 1] = { +static const struct nla_policy devlink_param_get_do_nl_policy[DEVLINK_ATTR_PARAM_NAME + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_PARAM_NAME] = { .type = NLA_NUL_STRING, }, }; +/* DEVLINK_CMD_PARAM_GET - dump */ +static const struct nla_policy devlink_param_get_dump_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, +}; + /* DEVLINK_CMD_REGION_GET - do */ -static const struct nla_policy devlink_region_get_nl_policy[DEVLINK_ATTR_REGION_NAME + 1] = { +static const struct nla_policy devlink_region_get_do_nl_policy[DEVLINK_ATTR_REGION_NAME + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, }, [DEVLINK_ATTR_REGION_NAME] = { .type = NLA_NUL_STRING, }, }; +/* DEVLINK_CMD_REGION_GET - dump */ +static const struct nla_policy devlink_region_get_dump_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = { + 
[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, +}; + /* DEVLINK_CMD_INFO_GET - do */ static const struct nla_policy devlink_info_get_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, @@ -79,49 +121,85 @@ static const struct nla_policy devlink_info_get_nl_policy[DEVLINK_ATTR_DEV_NAME }; /* DEVLINK_CMD_HEALTH_REPORTER_GET - do */ -static const struct nla_policy devlink_health_reporter_get_nl_policy[DEVLINK_ATTR_HEALTH_REPORTER_NAME + 1] = { +static const struct nla_policy devlink_health_reporter_get_do_nl_policy[DEVLINK_ATTR_HEALTH_REPORTER_NAME + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, }, [DEVLINK_ATTR_HEALTH_REPORTER_NAME] = { .type = NLA_NUL_STRING, }, }; +/* DEVLINK_CMD_HEALTH_REPORTER_GET - dump */ +static const struct nla_policy devlink_health_reporter_get_dump_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, +}; + /* DEVLINK_CMD_TRAP_GET - do */ -static const struct nla_policy devlink_trap_get_nl_policy[DEVLINK_ATTR_TRAP_NAME + 1] = { +static const struct nla_policy devlink_trap_get_do_nl_policy[DEVLINK_ATTR_TRAP_NAME + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_TRAP_NAME] = { .type = NLA_NUL_STRING, }, }; +/* DEVLINK_CMD_TRAP_GET - dump */ +static const struct nla_policy devlink_trap_get_dump_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, +}; + /* DEVLINK_CMD_TRAP_GROUP_GET - do */ -static const struct nla_policy devlink_trap_group_get_nl_policy[DEVLINK_ATTR_TRAP_GROUP_NAME + 1] = { +static const struct nla_policy devlink_trap_group_get_do_nl_policy[DEVLINK_ATTR_TRAP_GROUP_NAME + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_TRAP_GROUP_NAME] = { .type = NLA_NUL_STRING, }, }; +/* DEVLINK_CMD_TRAP_GROUP_GET - dump */ +static const struct nla_policy devlink_trap_group_get_dump_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, +}; + /* DEVLINK_CMD_TRAP_POLICER_GET - do */ -static const struct nla_policy devlink_trap_policer_get_nl_policy[DEVLINK_ATTR_TRAP_POLICER_ID + 1] = { +static const struct nla_policy devlink_trap_policer_get_do_nl_policy[DEVLINK_ATTR_TRAP_POLICER_ID + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_TRAP_POLICER_ID] = { .type = NLA_U32, }, }; +/* DEVLINK_CMD_TRAP_POLICER_GET - dump */ +static const struct nla_policy devlink_trap_policer_get_dump_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, +}; + /* DEVLINK_CMD_RATE_GET - do */ -static const struct nla_policy devlink_rate_get_nl_policy[DEVLINK_ATTR_RATE_NODE_NAME + 1] = { +static const struct nla_policy devlink_rate_get_do_nl_policy[DEVLINK_ATTR_RATE_NODE_NAME + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_PORT_INDEX] = { .type = 
NLA_U32, }, [DEVLINK_ATTR_RATE_NODE_NAME] = { .type = NLA_NUL_STRING, }, }; +/* DEVLINK_CMD_RATE_GET - dump */ +static const struct nla_policy devlink_rate_get_dump_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, +}; + /* DEVLINK_CMD_LINECARD_GET - do */ -static const struct nla_policy devlink_linecard_get_nl_policy[DEVLINK_ATTR_LINECARD_INDEX + 1] = { +static const struct nla_policy devlink_linecard_get_do_nl_policy[DEVLINK_ATTR_LINECARD_INDEX + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_LINECARD_INDEX] = { .type = NLA_U32, }, }; +/* DEVLINK_CMD_LINECARD_GET - dump */ +static const struct nla_policy devlink_linecard_get_dump_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = { + [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, +}; + /* DEVLINK_CMD_SELFTESTS_GET - do */ static const struct nla_policy devlink_selftests_get_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, @@ -152,14 +230,15 @@ const struct genl_split_ops devlink_nl_ops[32] = { .pre_doit = devlink_nl_pre_doit_port, .doit = devlink_nl_port_get_doit, .post_doit = devlink_nl_post_doit, - .policy = devlink_port_get_nl_policy, + .policy = devlink_port_get_do_nl_policy, .maxattr = DEVLINK_ATTR_PORT_INDEX, .flags = GENL_CMD_CAP_DO, }, { .cmd = DEVLINK_CMD_PORT_GET, - .validate = GENL_DONT_VALIDATE_DUMP, .dumpit = devlink_nl_port_get_dumpit, + .policy = devlink_port_get_dump_nl_policy, + .maxattr = DEVLINK_ATTR_DEV_NAME, .flags = GENL_CMD_CAP_DUMP, }, { @@ -168,14 +247,15 @@ const struct genl_split_ops devlink_nl_ops[32] = { .pre_doit = devlink_nl_pre_doit, .doit = devlink_nl_sb_get_doit, .post_doit = devlink_nl_post_doit, - .policy = devlink_sb_get_nl_policy, + .policy = devlink_sb_get_do_nl_policy, .maxattr = DEVLINK_ATTR_SB_INDEX, .flags = GENL_CMD_CAP_DO, }, { .cmd = DEVLINK_CMD_SB_GET, - .validate = GENL_DONT_VALIDATE_DUMP, .dumpit = devlink_nl_sb_get_dumpit, + .policy = devlink_sb_get_dump_nl_policy, + .maxattr = DEVLINK_ATTR_DEV_NAME, .flags = GENL_CMD_CAP_DUMP, }, { @@ -184,14 +264,15 @@ const struct genl_split_ops devlink_nl_ops[32] = { .pre_doit = devlink_nl_pre_doit, .doit = devlink_nl_sb_pool_get_doit, .post_doit = devlink_nl_post_doit, - .policy = devlink_sb_pool_get_nl_policy, + .policy = devlink_sb_pool_get_do_nl_policy, .maxattr = DEVLINK_ATTR_SB_POOL_INDEX, .flags = GENL_CMD_CAP_DO, }, { .cmd = DEVLINK_CMD_SB_POOL_GET, - .validate = GENL_DONT_VALIDATE_DUMP, .dumpit = devlink_nl_sb_pool_get_dumpit, + .policy = devlink_sb_pool_get_dump_nl_policy, + .maxattr = DEVLINK_ATTR_DEV_NAME, .flags = GENL_CMD_CAP_DUMP, }, { @@ -200,14 +281,15 @@ const struct genl_split_ops devlink_nl_ops[32] = { .pre_doit = devlink_nl_pre_doit_port, .doit = devlink_nl_sb_port_pool_get_doit, .post_doit = devlink_nl_post_doit, - .policy = devlink_sb_port_pool_get_nl_policy, + .policy = devlink_sb_port_pool_get_do_nl_policy, .maxattr = DEVLINK_ATTR_SB_POOL_INDEX, .flags = GENL_CMD_CAP_DO, }, { .cmd = DEVLINK_CMD_SB_PORT_POOL_GET, - .validate = GENL_DONT_VALIDATE_DUMP, .dumpit = devlink_nl_sb_port_pool_get_dumpit, + .policy = devlink_sb_port_pool_get_dump_nl_policy, + .maxattr = DEVLINK_ATTR_DEV_NAME, .flags = GENL_CMD_CAP_DUMP, }, { @@ -216,14 +298,15 @@ const struct genl_split_ops devlink_nl_ops[32] = { .pre_doit = devlink_nl_pre_doit_port, .doit = 
devlink_nl_sb_tc_pool_bind_get_doit, .post_doit = devlink_nl_post_doit, - .policy = devlink_sb_tc_pool_bind_get_nl_policy, + .policy = devlink_sb_tc_pool_bind_get_do_nl_policy, .maxattr = DEVLINK_ATTR_SB_TC_INDEX, .flags = GENL_CMD_CAP_DO, }, { .cmd = DEVLINK_CMD_SB_TC_POOL_BIND_GET, - .validate = GENL_DONT_VALIDATE_DUMP, .dumpit = devlink_nl_sb_tc_pool_bind_get_dumpit, + .policy = devlink_sb_tc_pool_bind_get_dump_nl_policy, + .maxattr = DEVLINK_ATTR_DEV_NAME, .flags = GENL_CMD_CAP_DUMP, }, { @@ -232,14 +315,15 @@ const struct genl_split_ops devlink_nl_ops[32] = { .pre_doit = devlink_nl_pre_doit, .doit = devlink_nl_param_get_doit, .post_doit = devlink_nl_post_doit, - .policy = devlink_param_get_nl_policy, + .policy = devlink_param_get_do_nl_policy, .maxattr = DEVLINK_ATTR_PARAM_NAME, .flags = GENL_CMD_CAP_DO, }, { .cmd = DEVLINK_CMD_PARAM_GET, - .validate = GENL_DONT_VALIDATE_DUMP, .dumpit = devlink_nl_param_get_dumpit, + .policy = devlink_param_get_dump_nl_policy, + .maxattr = DEVLINK_ATTR_DEV_NAME, .flags = GENL_CMD_CAP_DUMP, }, { @@ -248,14 +332,15 @@ const struct genl_split_ops devlink_nl_ops[32] = { .pre_doit = devlink_nl_pre_doit_port_optional, .doit = devlink_nl_region_get_doit, .post_doit = devlink_nl_post_doit, - .policy = devlink_region_get_nl_policy, + .policy = devlink_region_get_do_nl_policy, .maxattr = DEVLINK_ATTR_REGION_NAME, .flags = GENL_CMD_CAP_DO, }, { .cmd = DEVLINK_CMD_REGION_GET, - .validate = GENL_DONT_VALIDATE_DUMP, .dumpit = devlink_nl_region_get_dumpit, + .policy = devlink_region_get_dump_nl_policy, + .maxattr = DEVLINK_ATTR_DEV_NAME, .flags = GENL_CMD_CAP_DUMP, }, { @@ -280,14 +365,15 @@ const struct genl_split_ops devlink_nl_ops[32] = { .pre_doit = devlink_nl_pre_doit_port_optional, .doit = devlink_nl_health_reporter_get_doit, .post_doit = devlink_nl_post_doit, - .policy = devlink_health_reporter_get_nl_policy, + .policy = devlink_health_reporter_get_do_nl_policy, .maxattr = DEVLINK_ATTR_HEALTH_REPORTER_NAME, .flags = GENL_CMD_CAP_DO, }, { .cmd = DEVLINK_CMD_HEALTH_REPORTER_GET, - .validate = GENL_DONT_VALIDATE_DUMP, .dumpit = devlink_nl_health_reporter_get_dumpit, + .policy = devlink_health_reporter_get_dump_nl_policy, + .maxattr = DEVLINK_ATTR_DEV_NAME, .flags = GENL_CMD_CAP_DUMP, }, { @@ -296,14 +382,15 @@ const struct genl_split_ops devlink_nl_ops[32] = { .pre_doit = devlink_nl_pre_doit, .doit = devlink_nl_trap_get_doit, .post_doit = devlink_nl_post_doit, - .policy = devlink_trap_get_nl_policy, + .policy = devlink_trap_get_do_nl_policy, .maxattr = DEVLINK_ATTR_TRAP_NAME, .flags = GENL_CMD_CAP_DO, }, { .cmd = DEVLINK_CMD_TRAP_GET, - .validate = GENL_DONT_VALIDATE_DUMP, .dumpit = devlink_nl_trap_get_dumpit, + .policy = devlink_trap_get_dump_nl_policy, + .maxattr = DEVLINK_ATTR_DEV_NAME, .flags = GENL_CMD_CAP_DUMP, }, { @@ -312,14 +399,15 @@ const struct genl_split_ops devlink_nl_ops[32] = { .pre_doit = devlink_nl_pre_doit, .doit = devlink_nl_trap_group_get_doit, .post_doit = devlink_nl_post_doit, - .policy = devlink_trap_group_get_nl_policy, + .policy = devlink_trap_group_get_do_nl_policy, .maxattr = DEVLINK_ATTR_TRAP_GROUP_NAME, .flags = GENL_CMD_CAP_DO, }, { .cmd = DEVLINK_CMD_TRAP_GROUP_GET, - .validate = GENL_DONT_VALIDATE_DUMP, .dumpit = devlink_nl_trap_group_get_dumpit, + .policy = devlink_trap_group_get_dump_nl_policy, + .maxattr = DEVLINK_ATTR_DEV_NAME, .flags = GENL_CMD_CAP_DUMP, }, { @@ -328,14 +416,15 @@ const struct genl_split_ops devlink_nl_ops[32] = { .pre_doit = devlink_nl_pre_doit, .doit = devlink_nl_trap_policer_get_doit, .post_doit = 
devlink_nl_post_doit, - .policy = devlink_trap_policer_get_nl_policy, + .policy = devlink_trap_policer_get_do_nl_policy, .maxattr = DEVLINK_ATTR_TRAP_POLICER_ID, .flags = GENL_CMD_CAP_DO, }, { .cmd = DEVLINK_CMD_TRAP_POLICER_GET, - .validate = GENL_DONT_VALIDATE_DUMP, .dumpit = devlink_nl_trap_policer_get_dumpit, + .policy = devlink_trap_policer_get_dump_nl_policy, + .maxattr = DEVLINK_ATTR_DEV_NAME, .flags = GENL_CMD_CAP_DUMP, }, { @@ -344,14 +433,15 @@ const struct genl_split_ops devlink_nl_ops[32] = { .pre_doit = devlink_nl_pre_doit, .doit = devlink_nl_rate_get_doit, .post_doit = devlink_nl_post_doit, - .policy = devlink_rate_get_nl_policy, + .policy = devlink_rate_get_do_nl_policy, .maxattr = DEVLINK_ATTR_RATE_NODE_NAME, .flags = GENL_CMD_CAP_DO, }, { .cmd = DEVLINK_CMD_RATE_GET, - .validate = GENL_DONT_VALIDATE_DUMP, .dumpit = devlink_nl_rate_get_dumpit, + .policy = devlink_rate_get_dump_nl_policy, + .maxattr = DEVLINK_ATTR_DEV_NAME, .flags = GENL_CMD_CAP_DUMP, }, { @@ -360,14 +450,15 @@ const struct genl_split_ops devlink_nl_ops[32] = { .pre_doit = devlink_nl_pre_doit, .doit = devlink_nl_linecard_get_doit, .post_doit = devlink_nl_post_doit, - .policy = devlink_linecard_get_nl_policy, + .policy = devlink_linecard_get_do_nl_policy, .maxattr = DEVLINK_ATTR_LINECARD_INDEX, .flags = GENL_CMD_CAP_DO, }, { .cmd = DEVLINK_CMD_LINECARD_GET, - .validate = GENL_DONT_VALIDATE_DUMP, .dumpit = devlink_nl_linecard_get_dumpit, + .policy = devlink_linecard_get_dump_nl_policy, + .maxattr = DEVLINK_ATTR_DEV_NAME, .flags = GENL_CMD_CAP_DUMP, }, { diff --git a/tools/net/ynl/generated/devlink-user.c b/tools/net/ynl/generated/devlink-user.c index e9ab98b5ffe3..80ee9d24eb16 100644 --- a/tools/net/ynl/generated/devlink-user.c +++ b/tools/net/ynl/generated/devlink-user.c @@ -723,7 +723,9 @@ void devlink_port_get_rsp_list_free(struct devlink_port_get_rsp_list *rsp) } } -struct devlink_port_get_rsp_list *devlink_port_get_dump(struct ynl_sock *ys) +struct devlink_port_get_rsp_list * +devlink_port_get_dump(struct ynl_sock *ys, + struct devlink_port_get_req_dump *req) { struct ynl_dump_state yds = {}; struct nlmsghdr *nlh; @@ -736,6 +738,12 @@ struct devlink_port_get_rsp_list *devlink_port_get_dump(struct ynl_sock *ys) yds.rsp_policy = &devlink_nest; nlh = ynl_gemsg_start_dump(ys, ys->family_id, DEVLINK_CMD_PORT_GET, 1); + ys->req_policy = &devlink_nest; + + if (req->_present.bus_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_BUS_NAME, req->bus_name); + if (req->_present.dev_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_DEV_NAME, req->dev_name); err = ynl_exec_dump(ys, nlh, &yds); if (err < 0) @@ -858,7 +866,8 @@ void devlink_sb_get_list_free(struct devlink_sb_get_list *rsp) } } -struct devlink_sb_get_list *devlink_sb_get_dump(struct ynl_sock *ys) +struct devlink_sb_get_list * +devlink_sb_get_dump(struct ynl_sock *ys, struct devlink_sb_get_req_dump *req) { struct ynl_dump_state yds = {}; struct nlmsghdr *nlh; @@ -871,6 +880,12 @@ struct devlink_sb_get_list *devlink_sb_get_dump(struct ynl_sock *ys) yds.rsp_policy = &devlink_nest; nlh = ynl_gemsg_start_dump(ys, ys->family_id, DEVLINK_CMD_SB_GET, 1); + ys->req_policy = &devlink_nest; + + if (req->_present.bus_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_BUS_NAME, req->bus_name); + if (req->_present.dev_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_DEV_NAME, req->dev_name); err = ynl_exec_dump(ys, nlh, &yds); if (err < 0) @@ -1000,7 +1015,9 @@ void devlink_sb_pool_get_list_free(struct devlink_sb_pool_get_list *rsp) } } -struct devlink_sb_pool_get_list 
*devlink_sb_pool_get_dump(struct ynl_sock *ys) +struct devlink_sb_pool_get_list * +devlink_sb_pool_get_dump(struct ynl_sock *ys, + struct devlink_sb_pool_get_req_dump *req) { struct ynl_dump_state yds = {}; struct nlmsghdr *nlh; @@ -1013,6 +1030,12 @@ struct devlink_sb_pool_get_list *devlink_sb_pool_get_dump(struct ynl_sock *ys) yds.rsp_policy = &devlink_nest; nlh = ynl_gemsg_start_dump(ys, ys->family_id, DEVLINK_CMD_SB_POOL_GET, 1); + ys->req_policy = &devlink_nest; + + if (req->_present.bus_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_BUS_NAME, req->bus_name); + if (req->_present.dev_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_DEV_NAME, req->dev_name); err = ynl_exec_dump(ys, nlh, &yds); if (err < 0) @@ -1154,7 +1177,8 @@ devlink_sb_port_pool_get_list_free(struct devlink_sb_port_pool_get_list *rsp) } struct devlink_sb_port_pool_get_list * -devlink_sb_port_pool_get_dump(struct ynl_sock *ys) +devlink_sb_port_pool_get_dump(struct ynl_sock *ys, + struct devlink_sb_port_pool_get_req_dump *req) { struct ynl_dump_state yds = {}; struct nlmsghdr *nlh; @@ -1167,6 +1191,12 @@ devlink_sb_port_pool_get_dump(struct ynl_sock *ys) yds.rsp_policy = &devlink_nest; nlh = ynl_gemsg_start_dump(ys, ys->family_id, DEVLINK_CMD_SB_PORT_POOL_GET, 1); + ys->req_policy = &devlink_nest; + + if (req->_present.bus_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_BUS_NAME, req->bus_name); + if (req->_present.dev_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_DEV_NAME, req->dev_name); err = ynl_exec_dump(ys, nlh, &yds); if (err < 0) @@ -1316,7 +1346,8 @@ devlink_sb_tc_pool_bind_get_list_free(struct devlink_sb_tc_pool_bind_get_list *r } struct devlink_sb_tc_pool_bind_get_list * -devlink_sb_tc_pool_bind_get_dump(struct ynl_sock *ys) +devlink_sb_tc_pool_bind_get_dump(struct ynl_sock *ys, + struct devlink_sb_tc_pool_bind_get_req_dump *req) { struct ynl_dump_state yds = {}; struct nlmsghdr *nlh; @@ -1329,6 +1360,12 @@ devlink_sb_tc_pool_bind_get_dump(struct ynl_sock *ys) yds.rsp_policy = &devlink_nest; nlh = ynl_gemsg_start_dump(ys, ys->family_id, DEVLINK_CMD_SB_TC_POOL_BIND_GET, 1); + ys->req_policy = &devlink_nest; + + if (req->_present.bus_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_BUS_NAME, req->bus_name); + if (req->_present.dev_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_DEV_NAME, req->dev_name); err = ynl_exec_dump(ys, nlh, &yds); if (err < 0) @@ -1460,7 +1497,9 @@ void devlink_param_get_list_free(struct devlink_param_get_list *rsp) } } -struct devlink_param_get_list *devlink_param_get_dump(struct ynl_sock *ys) +struct devlink_param_get_list * +devlink_param_get_dump(struct ynl_sock *ys, + struct devlink_param_get_req_dump *req) { struct ynl_dump_state yds = {}; struct nlmsghdr *nlh; @@ -1473,6 +1512,12 @@ struct devlink_param_get_list *devlink_param_get_dump(struct ynl_sock *ys) yds.rsp_policy = &devlink_nest; nlh = ynl_gemsg_start_dump(ys, ys->family_id, DEVLINK_CMD_PARAM_GET, 1); + ys->req_policy = &devlink_nest; + + if (req->_present.bus_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_BUS_NAME, req->bus_name); + if (req->_present.dev_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_DEV_NAME, req->dev_name); err = ynl_exec_dump(ys, nlh, &yds); if (err < 0) @@ -1611,7 +1656,9 @@ void devlink_region_get_list_free(struct devlink_region_get_list *rsp) } } -struct devlink_region_get_list *devlink_region_get_dump(struct ynl_sock *ys) +struct devlink_region_get_list * +devlink_region_get_dump(struct ynl_sock *ys, + struct devlink_region_get_req_dump *req) { struct ynl_dump_state yds = {}; struct nlmsghdr 
*nlh; @@ -1624,6 +1671,12 @@ struct devlink_region_get_list *devlink_region_get_dump(struct ynl_sock *ys) yds.rsp_policy = &devlink_nest; nlh = ynl_gemsg_start_dump(ys, ys->family_id, DEVLINK_CMD_REGION_GET, 1); + ys->req_policy = &devlink_nest; + + if (req->_present.bus_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_BUS_NAME, req->bus_name); + if (req->_present.dev_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_DEV_NAME, req->dev_name); err = ynl_exec_dump(ys, nlh, &yds); if (err < 0) @@ -2006,7 +2059,8 @@ devlink_health_reporter_get_list_free(struct devlink_health_reporter_get_list *r } struct devlink_health_reporter_get_list * -devlink_health_reporter_get_dump(struct ynl_sock *ys) +devlink_health_reporter_get_dump(struct ynl_sock *ys, + struct devlink_health_reporter_get_req_dump *req) { struct ynl_dump_state yds = {}; struct nlmsghdr *nlh; @@ -2019,6 +2073,12 @@ devlink_health_reporter_get_dump(struct ynl_sock *ys) yds.rsp_policy = &devlink_nest; nlh = ynl_gemsg_start_dump(ys, ys->family_id, DEVLINK_CMD_HEALTH_REPORTER_GET, 1); + ys->req_policy = &devlink_nest; + + if (req->_present.bus_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_BUS_NAME, req->bus_name); + if (req->_present.dev_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_DEV_NAME, req->dev_name); err = ynl_exec_dump(ys, nlh, &yds); if (err < 0) @@ -2150,7 +2210,9 @@ void devlink_trap_get_list_free(struct devlink_trap_get_list *rsp) } } -struct devlink_trap_get_list *devlink_trap_get_dump(struct ynl_sock *ys) +struct devlink_trap_get_list * +devlink_trap_get_dump(struct ynl_sock *ys, + struct devlink_trap_get_req_dump *req) { struct ynl_dump_state yds = {}; struct nlmsghdr *nlh; @@ -2163,6 +2225,12 @@ struct devlink_trap_get_list *devlink_trap_get_dump(struct ynl_sock *ys) yds.rsp_policy = &devlink_nest; nlh = ynl_gemsg_start_dump(ys, ys->family_id, DEVLINK_CMD_TRAP_GET, 1); + ys->req_policy = &devlink_nest; + + if (req->_present.bus_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_BUS_NAME, req->bus_name); + if (req->_present.dev_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_DEV_NAME, req->dev_name); err = ynl_exec_dump(ys, nlh, &yds); if (err < 0) @@ -2296,7 +2364,8 @@ void devlink_trap_group_get_list_free(struct devlink_trap_group_get_list *rsp) } struct devlink_trap_group_get_list * -devlink_trap_group_get_dump(struct ynl_sock *ys) +devlink_trap_group_get_dump(struct ynl_sock *ys, + struct devlink_trap_group_get_req_dump *req) { struct ynl_dump_state yds = {}; struct nlmsghdr *nlh; @@ -2309,6 +2378,12 @@ devlink_trap_group_get_dump(struct ynl_sock *ys) yds.rsp_policy = &devlink_nest; nlh = ynl_gemsg_start_dump(ys, ys->family_id, DEVLINK_CMD_TRAP_GROUP_GET, 1); + ys->req_policy = &devlink_nest; + + if (req->_present.bus_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_BUS_NAME, req->bus_name); + if (req->_present.dev_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_DEV_NAME, req->dev_name); err = ynl_exec_dump(ys, nlh, &yds); if (err < 0) @@ -2436,7 +2511,8 @@ devlink_trap_policer_get_list_free(struct devlink_trap_policer_get_list *rsp) } struct devlink_trap_policer_get_list * -devlink_trap_policer_get_dump(struct ynl_sock *ys) +devlink_trap_policer_get_dump(struct ynl_sock *ys, + struct devlink_trap_policer_get_req_dump *req) { struct ynl_dump_state yds = {}; struct nlmsghdr *nlh; @@ -2449,6 +2525,12 @@ devlink_trap_policer_get_dump(struct ynl_sock *ys) yds.rsp_policy = &devlink_nest; nlh = ynl_gemsg_start_dump(ys, ys->family_id, DEVLINK_CMD_TRAP_POLICER_GET, 1); + ys->req_policy = &devlink_nest; + + if 
(req->_present.bus_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_BUS_NAME, req->bus_name); + if (req->_present.dev_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_DEV_NAME, req->dev_name); err = ynl_exec_dump(ys, nlh, &yds); if (err < 0) @@ -2587,7 +2669,9 @@ void devlink_rate_get_list_free(struct devlink_rate_get_list *rsp) } } -struct devlink_rate_get_list *devlink_rate_get_dump(struct ynl_sock *ys) +struct devlink_rate_get_list * +devlink_rate_get_dump(struct ynl_sock *ys, + struct devlink_rate_get_req_dump *req) { struct ynl_dump_state yds = {}; struct nlmsghdr *nlh; @@ -2600,6 +2684,12 @@ struct devlink_rate_get_list *devlink_rate_get_dump(struct ynl_sock *ys) yds.rsp_policy = &devlink_nest; nlh = ynl_gemsg_start_dump(ys, ys->family_id, DEVLINK_CMD_RATE_GET, 1); + ys->req_policy = &devlink_nest; + + if (req->_present.bus_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_BUS_NAME, req->bus_name); + if (req->_present.dev_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_DEV_NAME, req->dev_name); err = ynl_exec_dump(ys, nlh, &yds); if (err < 0) @@ -2723,7 +2813,8 @@ void devlink_linecard_get_list_free(struct devlink_linecard_get_list *rsp) } struct devlink_linecard_get_list * -devlink_linecard_get_dump(struct ynl_sock *ys) +devlink_linecard_get_dump(struct ynl_sock *ys, + struct devlink_linecard_get_req_dump *req) { struct ynl_dump_state yds = {}; struct nlmsghdr *nlh; @@ -2736,6 +2827,12 @@ devlink_linecard_get_dump(struct ynl_sock *ys) yds.rsp_policy = &devlink_nest; nlh = ynl_gemsg_start_dump(ys, ys->family_id, DEVLINK_CMD_LINECARD_GET, 1); + ys->req_policy = &devlink_nest; + + if (req->_present.bus_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_BUS_NAME, req->bus_name); + if (req->_present.dev_name_len) + mnl_attr_put_strz(nlh, DEVLINK_ATTR_DEV_NAME, req->dev_name); err = ynl_exec_dump(ys, nlh, &yds); if (err < 0) diff --git a/tools/net/ynl/generated/devlink-user.h b/tools/net/ynl/generated/devlink-user.h index f6ec5b6e5d26..12530f1795ca 100644 --- a/tools/net/ynl/generated/devlink-user.h +++ b/tools/net/ynl/generated/devlink-user.h @@ -210,6 +210,44 @@ struct devlink_port_get_rsp * devlink_port_get(struct ynl_sock *ys, struct devlink_port_get_req *req); /* DEVLINK_CMD_PORT_GET - dump */ +struct devlink_port_get_req_dump { + struct { + __u32 bus_name_len; + __u32 dev_name_len; + } _present; + + char *bus_name; + char *dev_name; +}; + +static inline struct devlink_port_get_req_dump * +devlink_port_get_req_dump_alloc(void) +{ + return calloc(1, sizeof(struct devlink_port_get_req_dump)); +} +void devlink_port_get_req_dump_free(struct devlink_port_get_req_dump *req); + +static inline void +devlink_port_get_req_dump_set_bus_name(struct devlink_port_get_req_dump *req, + const char *bus_name) +{ + free(req->bus_name); + req->_present.bus_name_len = strlen(bus_name); + req->bus_name = malloc(req->_present.bus_name_len + 1); + memcpy(req->bus_name, bus_name, req->_present.bus_name_len); + req->bus_name[req->_present.bus_name_len] = 0; +} +static inline void +devlink_port_get_req_dump_set_dev_name(struct devlink_port_get_req_dump *req, + const char *dev_name) +{ + free(req->dev_name); + req->_present.dev_name_len = strlen(dev_name); + req->dev_name = malloc(req->_present.dev_name_len + 1); + memcpy(req->dev_name, dev_name, req->_present.dev_name_len); + req->dev_name[req->_present.dev_name_len] = 0; +} + struct devlink_port_get_rsp_dump { struct { __u32 bus_name_len; @@ -229,7 +267,9 @@ struct devlink_port_get_rsp_list { void devlink_port_get_rsp_list_free(struct devlink_port_get_rsp_list 
*rsp); -struct devlink_port_get_rsp_list *devlink_port_get_dump(struct ynl_sock *ys); +struct devlink_port_get_rsp_list * +devlink_port_get_dump(struct ynl_sock *ys, + struct devlink_port_get_req_dump *req); /* ============== DEVLINK_CMD_SB_GET ============== */ /* DEVLINK_CMD_SB_GET - do */ @@ -299,6 +339,44 @@ struct devlink_sb_get_rsp * devlink_sb_get(struct ynl_sock *ys, struct devlink_sb_get_req *req); /* DEVLINK_CMD_SB_GET - dump */ +struct devlink_sb_get_req_dump { + struct { + __u32 bus_name_len; + __u32 dev_name_len; + } _present; + + char *bus_name; + char *dev_name; +}; + +static inline struct devlink_sb_get_req_dump * +devlink_sb_get_req_dump_alloc(void) +{ + return calloc(1, sizeof(struct devlink_sb_get_req_dump)); +} +void devlink_sb_get_req_dump_free(struct devlink_sb_get_req_dump *req); + +static inline void +devlink_sb_get_req_dump_set_bus_name(struct devlink_sb_get_req_dump *req, + const char *bus_name) +{ + free(req->bus_name); + req->_present.bus_name_len = strlen(bus_name); + req->bus_name = malloc(req->_present.bus_name_len + 1); + memcpy(req->bus_name, bus_name, req->_present.bus_name_len); + req->bus_name[req->_present.bus_name_len] = 0; +} +static inline void +devlink_sb_get_req_dump_set_dev_name(struct devlink_sb_get_req_dump *req, + const char *dev_name) +{ + free(req->dev_name); + req->_present.dev_name_len = strlen(dev_name); + req->dev_name = malloc(req->_present.dev_name_len + 1); + memcpy(req->dev_name, dev_name, req->_present.dev_name_len); + req->dev_name[req->_present.dev_name_len] = 0; +} + struct devlink_sb_get_list { struct devlink_sb_get_list *next; struct devlink_sb_get_rsp obj __attribute__ ((aligned (8))); @@ -306,7 +384,8 @@ struct devlink_sb_get_list { void devlink_sb_get_list_free(struct devlink_sb_get_list *rsp); -struct devlink_sb_get_list *devlink_sb_get_dump(struct ynl_sock *ys); +struct devlink_sb_get_list * +devlink_sb_get_dump(struct ynl_sock *ys, struct devlink_sb_get_req_dump *req); /* ============== DEVLINK_CMD_SB_POOL_GET ============== */ /* DEVLINK_CMD_SB_POOL_GET - do */ @@ -389,6 +468,45 @@ struct devlink_sb_pool_get_rsp * devlink_sb_pool_get(struct ynl_sock *ys, struct devlink_sb_pool_get_req *req); /* DEVLINK_CMD_SB_POOL_GET - dump */ +struct devlink_sb_pool_get_req_dump { + struct { + __u32 bus_name_len; + __u32 dev_name_len; + } _present; + + char *bus_name; + char *dev_name; +}; + +static inline struct devlink_sb_pool_get_req_dump * +devlink_sb_pool_get_req_dump_alloc(void) +{ + return calloc(1, sizeof(struct devlink_sb_pool_get_req_dump)); +} +void +devlink_sb_pool_get_req_dump_free(struct devlink_sb_pool_get_req_dump *req); + +static inline void +devlink_sb_pool_get_req_dump_set_bus_name(struct devlink_sb_pool_get_req_dump *req, + const char *bus_name) +{ + free(req->bus_name); + req->_present.bus_name_len = strlen(bus_name); + req->bus_name = malloc(req->_present.bus_name_len + 1); + memcpy(req->bus_name, bus_name, req->_present.bus_name_len); + req->bus_name[req->_present.bus_name_len] = 0; +} +static inline void +devlink_sb_pool_get_req_dump_set_dev_name(struct devlink_sb_pool_get_req_dump *req, + const char *dev_name) +{ + free(req->dev_name); + req->_present.dev_name_len = strlen(dev_name); + req->dev_name = malloc(req->_present.dev_name_len + 1); + memcpy(req->dev_name, dev_name, req->_present.dev_name_len); + req->dev_name[req->_present.dev_name_len] = 0; +} + struct devlink_sb_pool_get_list { struct devlink_sb_pool_get_list *next; struct devlink_sb_pool_get_rsp obj __attribute__ ((aligned (8))); @@ -396,7 +514,9 @@ 
struct devlink_sb_pool_get_list { void devlink_sb_pool_get_list_free(struct devlink_sb_pool_get_list *rsp); -struct devlink_sb_pool_get_list *devlink_sb_pool_get_dump(struct ynl_sock *ys); +struct devlink_sb_pool_get_list * +devlink_sb_pool_get_dump(struct ynl_sock *ys, + struct devlink_sb_pool_get_req_dump *req); /* ============== DEVLINK_CMD_SB_PORT_POOL_GET ============== */ /* DEVLINK_CMD_SB_PORT_POOL_GET - do */ @@ -493,6 +613,45 @@ devlink_sb_port_pool_get(struct ynl_sock *ys, struct devlink_sb_port_pool_get_req *req); /* DEVLINK_CMD_SB_PORT_POOL_GET - dump */ +struct devlink_sb_port_pool_get_req_dump { + struct { + __u32 bus_name_len; + __u32 dev_name_len; + } _present; + + char *bus_name; + char *dev_name; +}; + +static inline struct devlink_sb_port_pool_get_req_dump * +devlink_sb_port_pool_get_req_dump_alloc(void) +{ + return calloc(1, sizeof(struct devlink_sb_port_pool_get_req_dump)); +} +void +devlink_sb_port_pool_get_req_dump_free(struct devlink_sb_port_pool_get_req_dump *req); + +static inline void +devlink_sb_port_pool_get_req_dump_set_bus_name(struct devlink_sb_port_pool_get_req_dump *req, + const char *bus_name) +{ + free(req->bus_name); + req->_present.bus_name_len = strlen(bus_name); + req->bus_name = malloc(req->_present.bus_name_len + 1); + memcpy(req->bus_name, bus_name, req->_present.bus_name_len); + req->bus_name[req->_present.bus_name_len] = 0; +} +static inline void +devlink_sb_port_pool_get_req_dump_set_dev_name(struct devlink_sb_port_pool_get_req_dump *req, + const char *dev_name) +{ + free(req->dev_name); + req->_present.dev_name_len = strlen(dev_name); + req->dev_name = malloc(req->_present.dev_name_len + 1); + memcpy(req->dev_name, dev_name, req->_present.dev_name_len); + req->dev_name[req->_present.dev_name_len] = 0; +} + struct devlink_sb_port_pool_get_list { struct devlink_sb_port_pool_get_list *next; struct devlink_sb_port_pool_get_rsp obj __attribute__ ((aligned (8))); @@ -502,7 +661,8 @@ void devlink_sb_port_pool_get_list_free(struct devlink_sb_port_pool_get_list *rsp); struct devlink_sb_port_pool_get_list * -devlink_sb_port_pool_get_dump(struct ynl_sock *ys); +devlink_sb_port_pool_get_dump(struct ynl_sock *ys, + struct devlink_sb_port_pool_get_req_dump *req); /* ============== DEVLINK_CMD_SB_TC_POOL_BIND_GET ============== */ /* DEVLINK_CMD_SB_TC_POOL_BIND_GET - do */ @@ -610,6 +770,45 @@ devlink_sb_tc_pool_bind_get(struct ynl_sock *ys, struct devlink_sb_tc_pool_bind_get_req *req); /* DEVLINK_CMD_SB_TC_POOL_BIND_GET - dump */ +struct devlink_sb_tc_pool_bind_get_req_dump { + struct { + __u32 bus_name_len; + __u32 dev_name_len; + } _present; + + char *bus_name; + char *dev_name; +}; + +static inline struct devlink_sb_tc_pool_bind_get_req_dump * +devlink_sb_tc_pool_bind_get_req_dump_alloc(void) +{ + return calloc(1, sizeof(struct devlink_sb_tc_pool_bind_get_req_dump)); +} +void +devlink_sb_tc_pool_bind_get_req_dump_free(struct devlink_sb_tc_pool_bind_get_req_dump *req); + +static inline void +devlink_sb_tc_pool_bind_get_req_dump_set_bus_name(struct devlink_sb_tc_pool_bind_get_req_dump *req, + const char *bus_name) +{ + free(req->bus_name); + req->_present.bus_name_len = strlen(bus_name); + req->bus_name = malloc(req->_present.bus_name_len + 1); + memcpy(req->bus_name, bus_name, req->_present.bus_name_len); + req->bus_name[req->_present.bus_name_len] = 0; +} +static inline void +devlink_sb_tc_pool_bind_get_req_dump_set_dev_name(struct devlink_sb_tc_pool_bind_get_req_dump *req, + const char *dev_name) +{ + free(req->dev_name); + req->_present.dev_name_len = 
strlen(dev_name); + req->dev_name = malloc(req->_present.dev_name_len + 1); + memcpy(req->dev_name, dev_name, req->_present.dev_name_len); + req->dev_name[req->_present.dev_name_len] = 0; +} + struct devlink_sb_tc_pool_bind_get_list { struct devlink_sb_tc_pool_bind_get_list *next; struct devlink_sb_tc_pool_bind_get_rsp obj __attribute__ ((aligned (8))); @@ -619,7 +818,8 @@ void devlink_sb_tc_pool_bind_get_list_free(struct devlink_sb_tc_pool_bind_get_list *rsp); struct devlink_sb_tc_pool_bind_get_list * -devlink_sb_tc_pool_bind_get_dump(struct ynl_sock *ys); +devlink_sb_tc_pool_bind_get_dump(struct ynl_sock *ys, + struct devlink_sb_tc_pool_bind_get_req_dump *req); /* ============== DEVLINK_CMD_PARAM_GET ============== */ /* DEVLINK_CMD_PARAM_GET - do */ @@ -693,6 +893,44 @@ struct devlink_param_get_rsp * devlink_param_get(struct ynl_sock *ys, struct devlink_param_get_req *req); /* DEVLINK_CMD_PARAM_GET - dump */ +struct devlink_param_get_req_dump { + struct { + __u32 bus_name_len; + __u32 dev_name_len; + } _present; + + char *bus_name; + char *dev_name; +}; + +static inline struct devlink_param_get_req_dump * +devlink_param_get_req_dump_alloc(void) +{ + return calloc(1, sizeof(struct devlink_param_get_req_dump)); +} +void devlink_param_get_req_dump_free(struct devlink_param_get_req_dump *req); + +static inline void +devlink_param_get_req_dump_set_bus_name(struct devlink_param_get_req_dump *req, + const char *bus_name) +{ + free(req->bus_name); + req->_present.bus_name_len = strlen(bus_name); + req->bus_name = malloc(req->_present.bus_name_len + 1); + memcpy(req->bus_name, bus_name, req->_present.bus_name_len); + req->bus_name[req->_present.bus_name_len] = 0; +} +static inline void +devlink_param_get_req_dump_set_dev_name(struct devlink_param_get_req_dump *req, + const char *dev_name) +{ + free(req->dev_name); + req->_present.dev_name_len = strlen(dev_name); + req->dev_name = malloc(req->_present.dev_name_len + 1); + memcpy(req->dev_name, dev_name, req->_present.dev_name_len); + req->dev_name[req->_present.dev_name_len] = 0; +} + struct devlink_param_get_list { struct devlink_param_get_list *next; struct devlink_param_get_rsp obj __attribute__ ((aligned (8))); @@ -700,7 +938,9 @@ struct devlink_param_get_list { void devlink_param_get_list_free(struct devlink_param_get_list *rsp); -struct devlink_param_get_list *devlink_param_get_dump(struct ynl_sock *ys); +struct devlink_param_get_list * +devlink_param_get_dump(struct ynl_sock *ys, + struct devlink_param_get_req_dump *req); /* ============== DEVLINK_CMD_REGION_GET ============== */ /* DEVLINK_CMD_REGION_GET - do */ @@ -785,6 +1025,44 @@ struct devlink_region_get_rsp * devlink_region_get(struct ynl_sock *ys, struct devlink_region_get_req *req); /* DEVLINK_CMD_REGION_GET - dump */ +struct devlink_region_get_req_dump { + struct { + __u32 bus_name_len; + __u32 dev_name_len; + } _present; + + char *bus_name; + char *dev_name; +}; + +static inline struct devlink_region_get_req_dump * +devlink_region_get_req_dump_alloc(void) +{ + return calloc(1, sizeof(struct devlink_region_get_req_dump)); +} +void devlink_region_get_req_dump_free(struct devlink_region_get_req_dump *req); + +static inline void +devlink_region_get_req_dump_set_bus_name(struct devlink_region_get_req_dump *req, + const char *bus_name) +{ + free(req->bus_name); + req->_present.bus_name_len = strlen(bus_name); + req->bus_name = malloc(req->_present.bus_name_len + 1); + memcpy(req->bus_name, bus_name, req->_present.bus_name_len); + req->bus_name[req->_present.bus_name_len] = 0; +} 
+static inline void +devlink_region_get_req_dump_set_dev_name(struct devlink_region_get_req_dump *req, + const char *dev_name) +{ + free(req->dev_name); + req->_present.dev_name_len = strlen(dev_name); + req->dev_name = malloc(req->_present.dev_name_len + 1); + memcpy(req->dev_name, dev_name, req->_present.dev_name_len); + req->dev_name[req->_present.dev_name_len] = 0; +} + struct devlink_region_get_list { struct devlink_region_get_list *next; struct devlink_region_get_rsp obj __attribute__ ((aligned (8))); @@ -792,7 +1070,9 @@ struct devlink_region_get_list { void devlink_region_get_list_free(struct devlink_region_get_list *rsp); -struct devlink_region_get_list *devlink_region_get_dump(struct ynl_sock *ys); +struct devlink_region_get_list * +devlink_region_get_dump(struct ynl_sock *ys, + struct devlink_region_get_req_dump *req); /* ============== DEVLINK_CMD_INFO_GET ============== */ /* DEVLINK_CMD_INFO_GET - do */ @@ -958,6 +1238,45 @@ devlink_health_reporter_get(struct ynl_sock *ys, struct devlink_health_reporter_get_req *req); /* DEVLINK_CMD_HEALTH_REPORTER_GET - dump */ +struct devlink_health_reporter_get_req_dump { + struct { + __u32 bus_name_len; + __u32 dev_name_len; + } _present; + + char *bus_name; + char *dev_name; +}; + +static inline struct devlink_health_reporter_get_req_dump * +devlink_health_reporter_get_req_dump_alloc(void) +{ + return calloc(1, sizeof(struct devlink_health_reporter_get_req_dump)); +} +void +devlink_health_reporter_get_req_dump_free(struct devlink_health_reporter_get_req_dump *req); + +static inline void +devlink_health_reporter_get_req_dump_set_bus_name(struct devlink_health_reporter_get_req_dump *req, + const char *bus_name) +{ + free(req->bus_name); + req->_present.bus_name_len = strlen(bus_name); + req->bus_name = malloc(req->_present.bus_name_len + 1); + memcpy(req->bus_name, bus_name, req->_present.bus_name_len); + req->bus_name[req->_present.bus_name_len] = 0; +} +static inline void +devlink_health_reporter_get_req_dump_set_dev_name(struct devlink_health_reporter_get_req_dump *req, + const char *dev_name) +{ + free(req->dev_name); + req->_present.dev_name_len = strlen(dev_name); + req->dev_name = malloc(req->_present.dev_name_len + 1); + memcpy(req->dev_name, dev_name, req->_present.dev_name_len); + req->dev_name[req->_present.dev_name_len] = 0; +} + struct devlink_health_reporter_get_list { struct devlink_health_reporter_get_list *next; struct devlink_health_reporter_get_rsp obj __attribute__ ((aligned (8))); @@ -967,7 +1286,8 @@ void devlink_health_reporter_get_list_free(struct devlink_health_reporter_get_list *rsp); struct devlink_health_reporter_get_list * -devlink_health_reporter_get_dump(struct ynl_sock *ys); +devlink_health_reporter_get_dump(struct ynl_sock *ys, + struct devlink_health_reporter_get_req_dump *req); /* ============== DEVLINK_CMD_TRAP_GET ============== */ /* DEVLINK_CMD_TRAP_GET - do */ @@ -1041,6 +1361,44 @@ struct devlink_trap_get_rsp * devlink_trap_get(struct ynl_sock *ys, struct devlink_trap_get_req *req); /* DEVLINK_CMD_TRAP_GET - dump */ +struct devlink_trap_get_req_dump { + struct { + __u32 bus_name_len; + __u32 dev_name_len; + } _present; + + char *bus_name; + char *dev_name; +}; + +static inline struct devlink_trap_get_req_dump * +devlink_trap_get_req_dump_alloc(void) +{ + return calloc(1, sizeof(struct devlink_trap_get_req_dump)); +} +void devlink_trap_get_req_dump_free(struct devlink_trap_get_req_dump *req); + +static inline void +devlink_trap_get_req_dump_set_bus_name(struct devlink_trap_get_req_dump *req, + const 
char *bus_name) +{ + free(req->bus_name); + req->_present.bus_name_len = strlen(bus_name); + req->bus_name = malloc(req->_present.bus_name_len + 1); + memcpy(req->bus_name, bus_name, req->_present.bus_name_len); + req->bus_name[req->_present.bus_name_len] = 0; +} +static inline void +devlink_trap_get_req_dump_set_dev_name(struct devlink_trap_get_req_dump *req, + const char *dev_name) +{ + free(req->dev_name); + req->_present.dev_name_len = strlen(dev_name); + req->dev_name = malloc(req->_present.dev_name_len + 1); + memcpy(req->dev_name, dev_name, req->_present.dev_name_len); + req->dev_name[req->_present.dev_name_len] = 0; +} + struct devlink_trap_get_list { struct devlink_trap_get_list *next; struct devlink_trap_get_rsp obj __attribute__ ((aligned (8))); @@ -1048,7 +1406,9 @@ struct devlink_trap_get_list { void devlink_trap_get_list_free(struct devlink_trap_get_list *rsp); -struct devlink_trap_get_list *devlink_trap_get_dump(struct ynl_sock *ys); +struct devlink_trap_get_list * +devlink_trap_get_dump(struct ynl_sock *ys, + struct devlink_trap_get_req_dump *req); /* ============== DEVLINK_CMD_TRAP_GROUP_GET ============== */ /* DEVLINK_CMD_TRAP_GROUP_GET - do */ @@ -1124,6 +1484,45 @@ devlink_trap_group_get(struct ynl_sock *ys, struct devlink_trap_group_get_req *req); /* DEVLINK_CMD_TRAP_GROUP_GET - dump */ +struct devlink_trap_group_get_req_dump { + struct { + __u32 bus_name_len; + __u32 dev_name_len; + } _present; + + char *bus_name; + char *dev_name; +}; + +static inline struct devlink_trap_group_get_req_dump * +devlink_trap_group_get_req_dump_alloc(void) +{ + return calloc(1, sizeof(struct devlink_trap_group_get_req_dump)); +} +void +devlink_trap_group_get_req_dump_free(struct devlink_trap_group_get_req_dump *req); + +static inline void +devlink_trap_group_get_req_dump_set_bus_name(struct devlink_trap_group_get_req_dump *req, + const char *bus_name) +{ + free(req->bus_name); + req->_present.bus_name_len = strlen(bus_name); + req->bus_name = malloc(req->_present.bus_name_len + 1); + memcpy(req->bus_name, bus_name, req->_present.bus_name_len); + req->bus_name[req->_present.bus_name_len] = 0; +} +static inline void +devlink_trap_group_get_req_dump_set_dev_name(struct devlink_trap_group_get_req_dump *req, + const char *dev_name) +{ + free(req->dev_name); + req->_present.dev_name_len = strlen(dev_name); + req->dev_name = malloc(req->_present.dev_name_len + 1); + memcpy(req->dev_name, dev_name, req->_present.dev_name_len); + req->dev_name[req->_present.dev_name_len] = 0; +} + struct devlink_trap_group_get_list { struct devlink_trap_group_get_list *next; struct devlink_trap_group_get_rsp obj __attribute__ ((aligned (8))); @@ -1132,7 +1531,8 @@ struct devlink_trap_group_get_list { void devlink_trap_group_get_list_free(struct devlink_trap_group_get_list *rsp); struct devlink_trap_group_get_list * -devlink_trap_group_get_dump(struct ynl_sock *ys); +devlink_trap_group_get_dump(struct ynl_sock *ys, + struct devlink_trap_group_get_req_dump *req); /* ============== DEVLINK_CMD_TRAP_POLICER_GET ============== */ /* DEVLINK_CMD_TRAP_POLICER_GET - do */ @@ -1207,6 +1607,45 @@ devlink_trap_policer_get(struct ynl_sock *ys, struct devlink_trap_policer_get_req *req); /* DEVLINK_CMD_TRAP_POLICER_GET - dump */ +struct devlink_trap_policer_get_req_dump { + struct { + __u32 bus_name_len; + __u32 dev_name_len; + } _present; + + char *bus_name; + char *dev_name; +}; + +static inline struct devlink_trap_policer_get_req_dump * +devlink_trap_policer_get_req_dump_alloc(void) +{ + return calloc(1, sizeof(struct 
devlink_trap_policer_get_req_dump)); +} +void +devlink_trap_policer_get_req_dump_free(struct devlink_trap_policer_get_req_dump *req); + +static inline void +devlink_trap_policer_get_req_dump_set_bus_name(struct devlink_trap_policer_get_req_dump *req, + const char *bus_name) +{ + free(req->bus_name); + req->_present.bus_name_len = strlen(bus_name); + req->bus_name = malloc(req->_present.bus_name_len + 1); + memcpy(req->bus_name, bus_name, req->_present.bus_name_len); + req->bus_name[req->_present.bus_name_len] = 0; +} +static inline void +devlink_trap_policer_get_req_dump_set_dev_name(struct devlink_trap_policer_get_req_dump *req, + const char *dev_name) +{ + free(req->dev_name); + req->_present.dev_name_len = strlen(dev_name); + req->dev_name = malloc(req->_present.dev_name_len + 1); + memcpy(req->dev_name, dev_name, req->_present.dev_name_len); + req->dev_name[req->_present.dev_name_len] = 0; +} + struct devlink_trap_policer_get_list { struct devlink_trap_policer_get_list *next; struct devlink_trap_policer_get_rsp obj __attribute__ ((aligned (8))); @@ -1216,7 +1655,8 @@ void devlink_trap_policer_get_list_free(struct devlink_trap_policer_get_list *rsp); struct devlink_trap_policer_get_list * -devlink_trap_policer_get_dump(struct ynl_sock *ys); +devlink_trap_policer_get_dump(struct ynl_sock *ys, + struct devlink_trap_policer_get_req_dump *req); /* ============== DEVLINK_CMD_RATE_GET ============== */ /* DEVLINK_CMD_RATE_GET - do */ @@ -1301,6 +1741,44 @@ struct devlink_rate_get_rsp * devlink_rate_get(struct ynl_sock *ys, struct devlink_rate_get_req *req); /* DEVLINK_CMD_RATE_GET - dump */ +struct devlink_rate_get_req_dump { + struct { + __u32 bus_name_len; + __u32 dev_name_len; + } _present; + + char *bus_name; + char *dev_name; +}; + +static inline struct devlink_rate_get_req_dump * +devlink_rate_get_req_dump_alloc(void) +{ + return calloc(1, sizeof(struct devlink_rate_get_req_dump)); +} +void devlink_rate_get_req_dump_free(struct devlink_rate_get_req_dump *req); + +static inline void +devlink_rate_get_req_dump_set_bus_name(struct devlink_rate_get_req_dump *req, + const char *bus_name) +{ + free(req->bus_name); + req->_present.bus_name_len = strlen(bus_name); + req->bus_name = malloc(req->_present.bus_name_len + 1); + memcpy(req->bus_name, bus_name, req->_present.bus_name_len); + req->bus_name[req->_present.bus_name_len] = 0; +} +static inline void +devlink_rate_get_req_dump_set_dev_name(struct devlink_rate_get_req_dump *req, + const char *dev_name) +{ + free(req->dev_name); + req->_present.dev_name_len = strlen(dev_name); + req->dev_name = malloc(req->_present.dev_name_len + 1); + memcpy(req->dev_name, dev_name, req->_present.dev_name_len); + req->dev_name[req->_present.dev_name_len] = 0; +} + struct devlink_rate_get_list { struct devlink_rate_get_list *next; struct devlink_rate_get_rsp obj __attribute__ ((aligned (8))); @@ -1308,7 +1786,9 @@ struct devlink_rate_get_list { void devlink_rate_get_list_free(struct devlink_rate_get_list *rsp); -struct devlink_rate_get_list *devlink_rate_get_dump(struct ynl_sock *ys); +struct devlink_rate_get_list * +devlink_rate_get_dump(struct ynl_sock *ys, + struct devlink_rate_get_req_dump *req); /* ============== DEVLINK_CMD_LINECARD_GET ============== */ /* DEVLINK_CMD_LINECARD_GET - do */ @@ -1380,6 +1860,45 @@ struct devlink_linecard_get_rsp * devlink_linecard_get(struct ynl_sock *ys, struct devlink_linecard_get_req *req); /* DEVLINK_CMD_LINECARD_GET - dump */ +struct devlink_linecard_get_req_dump { + struct { + __u32 bus_name_len; + __u32 
dev_name_len; + } _present; + + char *bus_name; + char *dev_name; +}; + +static inline struct devlink_linecard_get_req_dump * +devlink_linecard_get_req_dump_alloc(void) +{ + return calloc(1, sizeof(struct devlink_linecard_get_req_dump)); +} +void +devlink_linecard_get_req_dump_free(struct devlink_linecard_get_req_dump *req); + +static inline void +devlink_linecard_get_req_dump_set_bus_name(struct devlink_linecard_get_req_dump *req, + const char *bus_name) +{ + free(req->bus_name); + req->_present.bus_name_len = strlen(bus_name); + req->bus_name = malloc(req->_present.bus_name_len + 1); + memcpy(req->bus_name, bus_name, req->_present.bus_name_len); + req->bus_name[req->_present.bus_name_len] = 0; +} +static inline void +devlink_linecard_get_req_dump_set_dev_name(struct devlink_linecard_get_req_dump *req, + const char *dev_name) +{ + free(req->dev_name); + req->_present.dev_name_len = strlen(dev_name); + req->dev_name = malloc(req->_present.dev_name_len + 1); + memcpy(req->dev_name, dev_name, req->_present.dev_name_len); + req->dev_name[req->_present.dev_name_len] = 0; +} + struct devlink_linecard_get_list { struct devlink_linecard_get_list *next; struct devlink_linecard_get_rsp obj __attribute__ ((aligned (8))); @@ -1388,7 +1907,8 @@ struct devlink_linecard_get_list { void devlink_linecard_get_list_free(struct devlink_linecard_get_list *rsp); struct devlink_linecard_get_list * -devlink_linecard_get_dump(struct ynl_sock *ys); +devlink_linecard_get_dump(struct ynl_sock *ys, + struct devlink_linecard_get_req_dump *req); /* ============== DEVLINK_CMD_SELFTESTS_GET ============== */ /* DEVLINK_CMD_SELFTESTS_GET - do */ -- cgit v1.2.3 From b03f13cb67a59175ec7bb7201e5e746e50f2331e Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 11 Aug 2023 17:57:13 +0200 Subject: devlink: extend health reporter dump selector by port index Introduce a possibility for devlink object to expose attributes it supports for selection of dumped objects. Use this by health reporter to indicate it supports port index based selection of dump objects. 
Implement this selection mechanism in devlink_nl_cmd_health_reporter_get_dump_one() Example: $ devlink health pci/0000:08:00.0: reporter fw state healthy error 0 recover 0 auto_dump true reporter fw_fatal state healthy error 0 recover 0 grace_period 60000 auto_recover true auto_dump true reporter vnic state healthy error 0 recover 0 pci/0000:08:00.0/32768: reporter vnic state healthy error 0 recover 0 pci/0000:08:00.0/32769: reporter vnic state healthy error 0 recover 0 pci/0000:08:00.0/32770: reporter vnic state healthy error 0 recover 0 pci/0000:08:00.1: reporter fw state healthy error 0 recover 0 auto_dump true reporter fw_fatal state healthy error 0 recover 0 grace_period 60000 auto_recover true auto_dump true reporter vnic state healthy error 0 recover 0 pci/0000:08:00.1/98304: reporter vnic state healthy error 0 recover 0 pci/0000:08:00.1/98305: reporter vnic state healthy error 0 recover 0 pci/0000:08:00.1/98306: reporter vnic state healthy error 0 recover 0 $ devlink health show pci/0000:08:00.0 pci/0000:08:00.0: reporter fw state healthy error 0 recover 0 auto_dump true reporter fw_fatal state healthy error 0 recover 0 grace_period 60000 auto_recover true auto_dump true reporter vnic state healthy error 0 recover 0 pci/0000:08:00.0/32768: reporter vnic state healthy error 0 recover 0 pci/0000:08:00.0/32769: reporter vnic state healthy error 0 recover 0 pci/0000:08:00.0/32770: reporter vnic state healthy error 0 recover 0 $ devlink health show pci/0000:08:00.0/32768 pci/0000:08:00.0/32768: reporter vnic state healthy error 0 recover 0 The last command is possible because of this patch. Signed-off-by: Jiri Pirko Acked-by: Jakub Kicinski Link: https://lore.kernel.org/r/20230811155714.1736405-13-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- net/devlink/health.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/devlink/health.c b/net/devlink/health.c index b9b3e68d9043..a85bdec34801 100644 --- a/net/devlink/health.c +++ b/net/devlink/health.c @@ -390,12 +390,23 @@ static int devlink_nl_health_reporter_get_dump_one(struct sk_buff *msg, int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); + const struct genl_dumpit_info *info = genl_dumpit_info(cb); struct devlink_health_reporter *reporter; + unsigned long port_index_end = ULONG_MAX; + struct nlattr **attrs = info->attrs; + unsigned long port_index_start = 0; struct devlink_port *port; unsigned long port_index; int idx = 0; int err; + if (attrs && attrs[DEVLINK_ATTR_PORT_INDEX]) { + port_index_start = nla_get_u32(attrs[DEVLINK_ATTR_PORT_INDEX]); + port_index_end = port_index_start; + flags |= NLM_F_DUMP_FILTERED; + goto per_port_dump; + } + list_for_each_entry(reporter, &devlink->reporter_list, list) { if (idx < state->idx) { idx++; @@ -412,7 +423,9 @@ static int devlink_nl_health_reporter_get_dump_one(struct sk_buff *msg, } idx++; } - xa_for_each(&devlink->ports, port_index, port) { +per_port_dump: + xa_for_each_range(&devlink->ports, port_index, port, + port_index_start, port_index_end) { list_for_each_entry(reporter, &port->reporter_list, list) { if (idx < state->idx) { idx++; -- cgit v1.2.3 From 0149bca172622d802c0caff8cf0928fb0ded767b Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 11 Aug 2023 17:57:14 +0200 Subject: netlink: specs: devlink: extend health reporter dump attributes by port index Allow user to pass port index for health reporter dump request. Re-generate the related code. 
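[Editor's usage sketch, not part of the patch: the snippet below shows how the regenerated bindings could be driven from user space to request the port-filtered dump, using the bus name, dev name and port-index setters added in the devlink-user.h hunk further down. It assumes the ynl user-space library's ynl_sock_create()/ynl_sock_destroy() helpers and the ynl_devlink_family descriptor, which are not shown in this patch; the "pci"/"0000:08:00.0" names and port index 32768 are placeholders borrowed from the example output above.]

/* Minimal sketch: dump the health reporters of a single devlink port via
 * the regenerated YNL API. Error handling is trimmed for brevity.
 */
#include <stdio.h>
#include "devlink-user.h"
#include "ynl.h"

static int dump_port_reporters(void)
{
	struct devlink_health_reporter_get_req_dump *req;
	struct devlink_health_reporter_get_list *rsp, *i;
	struct ynl_error yerr;
	struct ynl_sock *ys;
	int n = 0;

	ys = ynl_sock_create(&ynl_devlink_family, &yerr);
	if (!ys)
		return -1;

	req = devlink_health_reporter_get_req_dump_alloc();
	devlink_health_reporter_get_req_dump_set_bus_name(req, "pci");
	devlink_health_reporter_get_req_dump_set_dev_name(req, "0000:08:00.0");
	devlink_health_reporter_get_req_dump_set_port_index(req, 32768);

	rsp = devlink_health_reporter_get_dump(ys, req);
	devlink_health_reporter_get_req_dump_free(req);
	if (rsp) {
		for (i = rsp; i; i = i->next)
			n++;	/* one list entry per reporter returned */
		printf("got %d reporter(s)\n", n);
		devlink_health_reporter_get_list_free(rsp);
	}

	ynl_sock_destroy(ys);
	return 0;
}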
Signed-off-by: Jiri Pirko Acked-by: Jakub Kicinski Link: https://lore.kernel.org/r/20230811155714.1736405-14-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/devlink.yaml | 2 +- net/devlink/netlink_gen.c | 5 +++-- tools/net/ynl/generated/devlink-user.c | 2 ++ tools/net/ynl/generated/devlink-user.h | 9 +++++++++ 4 files changed, 15 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/Documentation/netlink/specs/devlink.yaml b/Documentation/netlink/specs/devlink.yaml index 6dbebeebd63e..d1ebcd927149 100644 --- a/Documentation/netlink/specs/devlink.yaml +++ b/Documentation/netlink/specs/devlink.yaml @@ -516,7 +516,7 @@ operations: attributes: *health-reporter-id-attrs dump: request: - attributes: *dev-id-attrs + attributes: *port-id-attrs reply: *health-reporter-get-reply # TODO: fill in the operations in between diff --git a/net/devlink/netlink_gen.c b/net/devlink/netlink_gen.c index 5f3e980c6a7c..467b7a431de1 100644 --- a/net/devlink/netlink_gen.c +++ b/net/devlink/netlink_gen.c @@ -129,9 +129,10 @@ static const struct nla_policy devlink_health_reporter_get_do_nl_policy[DEVLINK_ }; /* DEVLINK_CMD_HEALTH_REPORTER_GET - dump */ -static const struct nla_policy devlink_health_reporter_get_dump_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = { +static const struct nla_policy devlink_health_reporter_get_dump_nl_policy[DEVLINK_ATTR_PORT_INDEX + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, }, }; /* DEVLINK_CMD_TRAP_GET - do */ @@ -373,7 +374,7 @@ const struct genl_split_ops devlink_nl_ops[32] = { .cmd = DEVLINK_CMD_HEALTH_REPORTER_GET, .dumpit = devlink_nl_health_reporter_get_dumpit, .policy = devlink_health_reporter_get_dump_nl_policy, - .maxattr = DEVLINK_ATTR_DEV_NAME, + .maxattr = DEVLINK_ATTR_PORT_INDEX, .flags = GENL_CMD_CAP_DUMP, }, { diff --git a/tools/net/ynl/generated/devlink-user.c b/tools/net/ynl/generated/devlink-user.c index 80ee9d24eb16..3a8d8499fab6 100644 --- a/tools/net/ynl/generated/devlink-user.c +++ b/tools/net/ynl/generated/devlink-user.c @@ -2079,6 +2079,8 @@ devlink_health_reporter_get_dump(struct ynl_sock *ys, mnl_attr_put_strz(nlh, DEVLINK_ATTR_BUS_NAME, req->bus_name); if (req->_present.dev_name_len) mnl_attr_put_strz(nlh, DEVLINK_ATTR_DEV_NAME, req->dev_name); + if (req->_present.port_index) + mnl_attr_put_u32(nlh, DEVLINK_ATTR_PORT_INDEX, req->port_index); err = ynl_exec_dump(ys, nlh, &yds); if (err < 0) diff --git a/tools/net/ynl/generated/devlink-user.h b/tools/net/ynl/generated/devlink-user.h index 12530f1795ca..4b686d147613 100644 --- a/tools/net/ynl/generated/devlink-user.h +++ b/tools/net/ynl/generated/devlink-user.h @@ -1242,10 +1242,12 @@ struct devlink_health_reporter_get_req_dump { struct { __u32 bus_name_len; __u32 dev_name_len; + __u32 port_index:1; } _present; char *bus_name; char *dev_name; + __u32 port_index; }; static inline struct devlink_health_reporter_get_req_dump * @@ -1276,6 +1278,13 @@ devlink_health_reporter_get_req_dump_set_dev_name(struct devlink_health_reporter memcpy(req->dev_name, dev_name, req->_present.dev_name_len); req->dev_name[req->_present.dev_name_len] = 0; } +static inline void +devlink_health_reporter_get_req_dump_set_port_index(struct devlink_health_reporter_get_req_dump *req, + __u32 port_index) +{ + req->_present.port_index = 1; + req->port_index = port_index; +} struct devlink_health_reporter_get_list { struct devlink_health_reporter_get_list *next; -- cgit v1.2.3 From 
84817d8c6042e6261ea45c53fe8b5a0bd55c3993 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 14 Aug 2023 14:47:14 -0700 Subject: genetlink: push conditional locking into dumpit/done Add helpers which take/release the genl mutex based on family->parallel_ops. Remove the separation between handling of ops in locked and parallel families. Future patches would make the duplicated code grow even more. Reviewed-by: Johannes Berg Reviewed-by: Jiri Pirko Link: https://lore.kernel.org/r/20230814214723.2924989-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/netlink/genetlink.c | 90 +++++++++++++++++++------------------------------ 1 file changed, 35 insertions(+), 55 deletions(-) (limited to 'net') diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 6bd2ce51271f..0d4285688ab9 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -52,6 +52,18 @@ static void genl_unlock_all(void) up_write(&cb_lock); } +static void genl_op_lock(const struct genl_family *family) +{ + if (!family->parallel_ops) + genl_lock(); +} + +static void genl_op_unlock(const struct genl_family *family) +{ + if (!family->parallel_ops) + genl_unlock(); +} + static DEFINE_IDR(genl_fam_idr); /* @@ -838,11 +850,9 @@ static int genl_start(struct netlink_callback *cb) cb->data = info; if (ops->start) { - if (!ctx->family->parallel_ops) - genl_lock(); + genl_op_lock(ctx->family); rc = ops->start(cb); - if (!ctx->family->parallel_ops) - genl_unlock(); + genl_op_unlock(ctx->family); } if (rc) { @@ -853,46 +863,34 @@ static int genl_start(struct netlink_callback *cb) return rc; } -static int genl_lock_dumpit(struct sk_buff *skb, struct netlink_callback *cb) +static int genl_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { - const struct genl_split_ops *ops = &genl_dumpit_info(cb)->op; + const struct genl_dumpit_info *info = genl_dumpit_info(cb); + const struct genl_split_ops *ops = &info->op; int rc; - genl_lock(); + genl_op_lock(info->family); rc = ops->dumpit(skb, cb); - genl_unlock(); + genl_op_unlock(info->family); return rc; } -static int genl_lock_done(struct netlink_callback *cb) +static int genl_done(struct netlink_callback *cb) { const struct genl_dumpit_info *info = genl_dumpit_info(cb); const struct genl_split_ops *ops = &info->op; int rc = 0; if (ops->done) { - genl_lock(); + genl_op_lock(info->family); rc = ops->done(cb); - genl_unlock(); + genl_op_unlock(info->family); } genl_family_rcv_msg_attrs_free(info->attrs); genl_dumpit_info_free(info); return rc; } -static int genl_parallel_done(struct netlink_callback *cb) -{ - const struct genl_dumpit_info *info = genl_dumpit_info(cb); - const struct genl_split_ops *ops = &info->op; - int rc = 0; - - if (ops->done) - rc = ops->done(cb); - genl_family_rcv_msg_attrs_free(info->attrs); - genl_dumpit_info_free(info); - return rc; -} - static int genl_family_rcv_msg_dumpit(const struct genl_family *family, struct sk_buff *skb, struct nlmsghdr *nlh, @@ -901,6 +899,14 @@ static int genl_family_rcv_msg_dumpit(const struct genl_family *family, int hdrlen, struct net *net) { struct genl_start_context ctx; + struct netlink_dump_control c = { + .module = family->module, + .data = &ctx, + .start = genl_start, + .dump = genl_dumpit, + .done = genl_done, + .extack = extack, + }; int err; ctx.family = family; @@ -909,31 +915,9 @@ static int genl_family_rcv_msg_dumpit(const struct genl_family *family, ctx.ops = ops; ctx.hdrlen = hdrlen; - if (!family->parallel_ops) { - struct netlink_dump_control c = { - .module = family->module, - .data = &ctx, - 
.start = genl_start, - .dump = genl_lock_dumpit, - .done = genl_lock_done, - .extack = extack, - }; - - genl_unlock(); - err = __netlink_dump_start(net->genl_sock, skb, nlh, &c); - genl_lock(); - } else { - struct netlink_dump_control c = { - .module = family->module, - .data = &ctx, - .start = genl_start, - .dump = ops->dumpit, - .done = genl_parallel_done, - .extack = extack, - }; - - err = __netlink_dump_start(net->genl_sock, skb, nlh, &c); - } + genl_op_unlock(family); + err = __netlink_dump_start(net->genl_sock, skb, nlh, &c); + genl_op_lock(family); return err; } @@ -1065,13 +1049,9 @@ static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, if (family == NULL) return -ENOENT; - if (!family->parallel_ops) - genl_lock(); - + genl_op_lock(family); err = genl_family_rcv_msg(family, skb, nlh, extack); - - if (!family->parallel_ops) - genl_unlock(); + genl_op_unlock(family); return err; } -- cgit v1.2.3 From fde9bd4a4d41b65a936d65eb416c1de27cb562f1 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 14 Aug 2023 14:47:15 -0700 Subject: genetlink: make genl_info->nlhdr const struct netlink_callback has a const nlh pointer, make the pointer in struct genl_info const as well, to make copying between the two easier. Reviewed-by: Jiri Pirko Link: https://lore.kernel.org/r/20230814214723.2924989-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/net/genetlink.h | 2 +- net/ncsi/ncsi-netlink.c | 2 +- net/ncsi/ncsi-netlink.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index ed4622dd4828..0366d0925596 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -104,7 +104,7 @@ struct genl_family { struct genl_info { u32 snd_seq; u32 snd_portid; - struct nlmsghdr * nlhdr; + const struct nlmsghdr * nlhdr; struct genlmsghdr * genlhdr; void * userhdr; struct nlattr ** attrs; diff --git a/net/ncsi/ncsi-netlink.c b/net/ncsi/ncsi-netlink.c index d27f4eccce6d..a3a6753a1db7 100644 --- a/net/ncsi/ncsi-netlink.c +++ b/net/ncsi/ncsi-netlink.c @@ -563,7 +563,7 @@ int ncsi_send_netlink_timeout(struct ncsi_request *nr, int ncsi_send_netlink_err(struct net_device *dev, u32 snd_seq, u32 snd_portid, - struct nlmsghdr *nlhdr, + const struct nlmsghdr *nlhdr, int err) { struct nlmsghdr *nlh; diff --git a/net/ncsi/ncsi-netlink.h b/net/ncsi/ncsi-netlink.h index 39a1a9d7bf77..747767ea0aae 100644 --- a/net/ncsi/ncsi-netlink.h +++ b/net/ncsi/ncsi-netlink.h @@ -19,7 +19,7 @@ int ncsi_send_netlink_timeout(struct ncsi_request *nr, int ncsi_send_netlink_err(struct net_device *dev, u32 snd_seq, u32 snd_portid, - struct nlmsghdr *nlhdr, + const struct nlmsghdr *nlhdr, int err); #endif /* __NCSI_NETLINK_H__ */ -- cgit v1.2.3 From bffcc6882a1bb2be8c9420184966f4c2c822078e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 14 Aug 2023 14:47:16 -0700 Subject: genetlink: remove userhdr from struct genl_info Only three families use info->userhdr today and going forward we discourage using fixed headers in new families. So having the pointer to user header in struct genl_info is an overkill. Compute the header pointer at runtime. 
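[Editor's illustration, not part of the patch: a hedged sketch of what a family that still carries a fixed header looks like after this change; my_genl_hdr and my_family_cmd_doit are hypothetical names introduced only for the example.]

#include <net/genetlink.h>

/* Hypothetical fixed header that follows the genlmsghdr on the wire. */
struct my_genl_hdr {
	__u32 ifindex;
};

/* .doit handler: the fixed header is no longer cached in struct genl_info,
 * it is recomputed as (u8 *)info->genlhdr + GENL_HDRLEN by
 * genl_info_userhdr().
 */
static int my_family_cmd_doit(struct sk_buff *skb, struct genl_info *info)
{
	struct my_genl_hdr *hdr = genl_info_userhdr(info);

	return hdr->ifindex ? 0 : -EINVAL;
}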
Reviewed-by: Johannes Berg Reviewed-by: Jiri Pirko Reviewed-by: Aaron Conole Link: https://lore.kernel.org/r/20230814214723.2924989-4-kuba@kernel.org Signed-off-by: Jakub Kicinski --- drivers/block/drbd/drbd_nl.c | 9 +++++---- include/net/genetlink.h | 7 +++++-- net/netlink/genetlink.c | 1 - net/openvswitch/conntrack.c | 2 +- net/openvswitch/datapath.c | 29 ++++++++++++++++------------- net/openvswitch/meter.c | 10 +++++----- net/tipc/netlink_compat.c | 2 +- 7 files changed, 33 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index cddae6f4b00f..d3538bd83fb3 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -159,7 +159,7 @@ static int drbd_msg_sprintf_info(struct sk_buff *skb, const char *fmt, ...) static int drbd_adm_prepare(struct drbd_config_context *adm_ctx, struct sk_buff *skb, struct genl_info *info, unsigned flags) { - struct drbd_genlmsghdr *d_in = info->userhdr; + struct drbd_genlmsghdr *d_in = genl_info_userhdr(info); const u8 cmd = info->genlhdr->cmd; int err; @@ -1396,8 +1396,9 @@ static void drbd_suspend_al(struct drbd_device *device) static bool should_set_defaults(struct genl_info *info) { - unsigned flags = ((struct drbd_genlmsghdr*)info->userhdr)->flags; - return 0 != (flags & DRBD_GENL_F_SET_DEFAULTS); + struct drbd_genlmsghdr *dh = genl_info_userhdr(info); + + return 0 != (dh->flags & DRBD_GENL_F_SET_DEFAULTS); } static unsigned int drbd_al_extents_max(struct drbd_backing_dev *bdev) @@ -4276,7 +4277,7 @@ static void device_to_info(struct device_info *info, int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info) { struct drbd_config_context adm_ctx; - struct drbd_genlmsghdr *dh = info->userhdr; + struct drbd_genlmsghdr *dh = genl_info_userhdr(info); enum drbd_ret_code retcode; retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE); diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 0366d0925596..9dc21ec15734 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -95,7 +95,6 @@ struct genl_family { * @snd_portid: netlink portid of sender * @nlhdr: netlink message header * @genlhdr: generic netlink message header - * @userhdr: user specific header * @attrs: netlink attributes * @_net: network namespace * @user_ptr: user pointers @@ -106,7 +105,6 @@ struct genl_info { u32 snd_portid; const struct nlmsghdr * nlhdr; struct genlmsghdr * genlhdr; - void * userhdr; struct nlattr ** attrs; possible_net_t _net; void * user_ptr[2]; @@ -123,6 +121,11 @@ static inline void genl_info_net_set(struct genl_info *info, struct net *net) write_pnet(&info->_net, net); } +static inline void *genl_info_userhdr(const struct genl_info *info) +{ + return (u8 *)info->genlhdr + GENL_HDRLEN; +} + #define GENL_SET_ERR_MSG(info, msg) NL_SET_ERR_MSG((info)->extack, msg) #define GENL_SET_ERR_MSG_FMT(info, msg, args...) 
\ diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 0d4285688ab9..f98f730bb245 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -943,7 +943,6 @@ static int genl_family_rcv_msg_doit(const struct genl_family *family, info.snd_portid = NETLINK_CB(skb).portid; info.nlhdr = nlh; info.genlhdr = nlmsg_data(nlh); - info.userhdr = nlmsg_data(nlh) + GENL_HDRLEN; info.attrs = attrbuf; info.extack = extack; genl_info_net_set(&info, net); diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 0cfa1e9482e6..0b9a785dea45 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -1605,7 +1605,7 @@ static struct sk_buff * ovs_ct_limit_cmd_reply_start(struct genl_info *info, u8 cmd, struct ovs_header **ovs_reply_header) { - struct ovs_header *ovs_header = info->userhdr; + struct ovs_header *ovs_header = genl_info_userhdr(info); struct sk_buff *skb; skb = genlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index d33cb739883f..0a974eeef76e 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -590,7 +590,7 @@ out: static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) { - struct ovs_header *ovs_header = info->userhdr; + struct ovs_header *ovs_header = genl_info_userhdr(info); struct net *net = sock_net(skb->sk); struct nlattr **a = info->attrs; struct sw_flow_actions *acts; @@ -967,7 +967,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) { struct net *net = sock_net(skb->sk); struct nlattr **a = info->attrs; - struct ovs_header *ovs_header = info->userhdr; + struct ovs_header *ovs_header = genl_info_userhdr(info); struct sw_flow *flow = NULL, *new_flow; struct sw_flow_mask mask; struct sk_buff *reply; @@ -1214,7 +1214,7 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) { struct net *net = sock_net(skb->sk); struct nlattr **a = info->attrs; - struct ovs_header *ovs_header = info->userhdr; + struct ovs_header *ovs_header = genl_info_userhdr(info); struct sw_flow_key key; struct sw_flow *flow; struct sk_buff *reply = NULL; @@ -1315,7 +1315,7 @@ error: static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; - struct ovs_header *ovs_header = info->userhdr; + struct ovs_header *ovs_header = genl_info_userhdr(info); struct net *net = sock_net(skb->sk); struct sw_flow_key key; struct sk_buff *reply; @@ -1374,7 +1374,7 @@ unlock: static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; - struct ovs_header *ovs_header = info->userhdr; + struct ovs_header *ovs_header = genl_info_userhdr(info); struct net *net = sock_net(skb->sk); struct sw_flow_key key; struct sk_buff *reply; @@ -1642,7 +1642,7 @@ static void ovs_dp_reset_user_features(struct sk_buff *skb, { struct datapath *dp; - dp = lookup_datapath(sock_net(skb->sk), info->userhdr, + dp = lookup_datapath(sock_net(skb->sk), genl_info_userhdr(info), info->attrs); if (IS_ERR(dp)) return; @@ -1935,7 +1935,8 @@ static int ovs_dp_cmd_del(struct sk_buff *skb, struct genl_info *info) return -ENOMEM; ovs_lock(); - dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs); + dp = lookup_datapath(sock_net(skb->sk), genl_info_userhdr(info), + info->attrs); err = PTR_ERR(dp); if (IS_ERR(dp)) goto err_unlock_free; @@ -1968,7 +1969,8 @@ static int ovs_dp_cmd_set(struct sk_buff *skb, struct genl_info *info) return -ENOMEM; 
ovs_lock(); - dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs); + dp = lookup_datapath(sock_net(skb->sk), genl_info_userhdr(info), + info->attrs); err = PTR_ERR(dp); if (IS_ERR(dp)) goto err_unlock_free; @@ -2003,7 +2005,8 @@ static int ovs_dp_cmd_get(struct sk_buff *skb, struct genl_info *info) return -ENOMEM; ovs_lock(); - dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs); + dp = lookup_datapath(sock_net(skb->sk), genl_info_userhdr(info), + info->attrs); if (IS_ERR(dp)) { err = PTR_ERR(dp); goto err_unlock_free; @@ -2246,7 +2249,7 @@ static void ovs_update_headroom(struct datapath *dp, unsigned int new_headroom) static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; - struct ovs_header *ovs_header = info->userhdr; + struct ovs_header *ovs_header = genl_info_userhdr(info); struct vport_parms parms; struct sk_buff *reply; struct vport *vport; @@ -2348,7 +2351,7 @@ static int ovs_vport_cmd_set(struct sk_buff *skb, struct genl_info *info) return -ENOMEM; ovs_lock(); - vport = lookup_vport(sock_net(skb->sk), info->userhdr, a); + vport = lookup_vport(sock_net(skb->sk), genl_info_userhdr(info), a); err = PTR_ERR(vport); if (IS_ERR(vport)) goto exit_unlock_free; @@ -2404,7 +2407,7 @@ static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info) return -ENOMEM; ovs_lock(); - vport = lookup_vport(sock_net(skb->sk), info->userhdr, a); + vport = lookup_vport(sock_net(skb->sk), genl_info_userhdr(info), a); err = PTR_ERR(vport); if (IS_ERR(vport)) goto exit_unlock_free; @@ -2447,7 +2450,7 @@ exit_unlock_free: static int ovs_vport_cmd_get(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; - struct ovs_header *ovs_header = info->userhdr; + struct ovs_header *ovs_header = genl_info_userhdr(info); struct sk_buff *reply; struct vport *vport; int err; diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c index c4ebf810e4b1..cc08e0403909 100644 --- a/net/openvswitch/meter.c +++ b/net/openvswitch/meter.c @@ -211,7 +211,7 @@ ovs_meter_cmd_reply_start(struct genl_info *info, u8 cmd, struct ovs_header **ovs_reply_header) { struct sk_buff *skb; - struct ovs_header *ovs_header = info->userhdr; + struct ovs_header *ovs_header = genl_info_userhdr(info); skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!skb) @@ -272,7 +272,7 @@ error: static int ovs_meter_cmd_features(struct sk_buff *skb, struct genl_info *info) { - struct ovs_header *ovs_header = info->userhdr; + struct ovs_header *ovs_header = genl_info_userhdr(info); struct ovs_header *ovs_reply_header; struct nlattr *nla, *band_nla; struct sk_buff *reply; @@ -409,7 +409,7 @@ static int ovs_meter_cmd_set(struct sk_buff *skb, struct genl_info *info) struct dp_meter *meter, *old_meter; struct sk_buff *reply; struct ovs_header *ovs_reply_header; - struct ovs_header *ovs_header = info->userhdr; + struct ovs_header *ovs_header = genl_info_userhdr(info); struct dp_meter_table *meter_tbl; struct datapath *dp; int err; @@ -482,7 +482,7 @@ exit_free_meter: static int ovs_meter_cmd_get(struct sk_buff *skb, struct genl_info *info) { - struct ovs_header *ovs_header = info->userhdr; + struct ovs_header *ovs_header = genl_info_userhdr(info); struct ovs_header *ovs_reply_header; struct nlattr **a = info->attrs; struct dp_meter *meter; @@ -535,7 +535,7 @@ exit_unlock: static int ovs_meter_cmd_del(struct sk_buff *skb, struct genl_info *info) { - struct ovs_header *ovs_header = info->userhdr; + struct ovs_header *ovs_header = 
genl_info_userhdr(info); struct ovs_header *ovs_reply_header; struct nlattr **a = info->attrs; struct dp_meter *old_meter; diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index 9b47c8409231..299cd6754f14 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -1294,7 +1294,7 @@ static int tipc_nl_compat_recv(struct sk_buff *skb, struct genl_info *info) struct tipc_nl_compat_msg msg; struct nlmsghdr *req_nlh; struct nlmsghdr *rep_nlh; - struct tipc_genlmsghdr *req_userhdr = info->userhdr; + struct tipc_genlmsghdr *req_userhdr = genl_info_userhdr(info); memset(&msg, 0, sizeof(msg)); -- cgit v1.2.3 From 9272af109fe65d1a13f28c5c13777b62d3e97e8c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 14 Aug 2023 14:47:17 -0700 Subject: genetlink: add struct genl_info to struct genl_dumpit_info Netlink GET implementations must currently juggle struct genl_info and struct netlink_callback, depending on whether they were called from doit or dumpit. Add genl_info to the dump state and populate the fields. This way implementations can simply pass struct genl_info around. Reviewed-by: Jiri Pirko Link: https://lore.kernel.org/r/20230814214723.2924989-5-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/net/genetlink.h | 8 ++++++++ net/netlink/genetlink.c | 16 ++++++++++++++-- 2 files changed, 22 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 9dc21ec15734..86c8eaaa3a43 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -250,11 +250,13 @@ struct genl_split_ops { * @family: generic netlink family - for internal genl code usage * @op: generic netlink ops - for internal genl code usage * @attrs: netlink attributes + * @info: struct genl_info describing the request */ struct genl_dumpit_info { const struct genl_family *family; struct genl_split_ops op; struct nlattr **attrs; + struct genl_info info; }; static inline const struct genl_dumpit_info * @@ -263,6 +265,12 @@ genl_dumpit_info(struct netlink_callback *cb) return cb->data; } +static inline const struct genl_info * +genl_info_dump(struct netlink_callback *cb) +{ + return &genl_dumpit_info(cb)->info; +} + int genl_register_family(struct genl_family *family); int genl_unregister_family(const struct genl_family *family); void genl_notify(const struct genl_family *family, struct sk_buff *skb, diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index f98f730bb245..82ad26970b9b 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -847,6 +847,14 @@ static int genl_start(struct netlink_callback *cb) info->family = ctx->family; info->op = *ops; info->attrs = attrs; + info->info.snd_seq = cb->nlh->nlmsg_seq; + info->info.snd_portid = NETLINK_CB(cb->skb).portid; + info->info.nlhdr = cb->nlh; + info->info.genlhdr = nlmsg_data(cb->nlh); + info->info.attrs = attrs; + genl_info_net_set(&info->info, sock_net(cb->skb->sk)); + info->info.extack = cb->extack; + memset(&info->info.user_ptr, 0, sizeof(info->info.user_ptr)); cb->data = info; if (ops->start) { @@ -865,10 +873,12 @@ static int genl_start(struct netlink_callback *cb) static int genl_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { - const struct genl_dumpit_info *info = genl_dumpit_info(cb); + struct genl_dumpit_info *info = cb->data; const struct genl_split_ops *ops = &info->op; int rc; + info->info.extack = cb->extack; + genl_op_lock(info->family); rc = ops->dumpit(skb, cb); genl_op_unlock(info->family); @@ -877,10 +887,12 @@ static int 
genl_dumpit(struct sk_buff *skb, struct netlink_callback *cb) static int genl_done(struct netlink_callback *cb) { - const struct genl_dumpit_info *info = genl_dumpit_info(cb); + struct genl_dumpit_info *info = cb->data; const struct genl_split_ops *ops = &info->op; int rc = 0; + info->info.extack = cb->extack; + if (ops->done) { genl_op_lock(info->family); rc = ops->done(cb); -- cgit v1.2.3 From 7288dd2fd4888c85c687f8ded69c280938d1a7b6 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 14 Aug 2023 14:47:18 -0700 Subject: genetlink: use attrs from struct genl_info Since dumps carry struct genl_info now, use the attrs pointer from genl_info and remove the one in struct genl_dumpit_info. Reviewed-by: Johannes Berg Reviewed-by: Miquel Raynal Reviewed-by: Jiri Pirko Link: https://lore.kernel.org/r/20230814214723.2924989-6-kuba@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/wireguard/netlink.c | 2 +- include/net/genetlink.h | 1 - net/devlink/health.c | 4 ++-- net/devlink/leftover.c | 6 +++--- net/devlink/netlink.c | 2 +- net/ethtool/netlink.c | 3 ++- net/ethtool/tunnels.c | 2 +- net/ieee802154/nl802154.c | 4 ++-- net/netlink/genetlink.c | 7 +++---- net/nfc/netlink.c | 4 ++-- net/tipc/netlink_compat.c | 2 +- net/tipc/node.c | 4 ++-- net/tipc/socket.c | 2 +- net/tipc/udp_media.c | 2 +- 14 files changed, 22 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/drivers/net/wireguard/netlink.c b/drivers/net/wireguard/netlink.c index 6d1bd9f52d02..dc09b75a3248 100644 --- a/drivers/net/wireguard/netlink.c +++ b/drivers/net/wireguard/netlink.c @@ -200,7 +200,7 @@ static int wg_get_device_start(struct netlink_callback *cb) { struct wg_device *wg; - wg = lookup_interface(genl_dumpit_info(cb)->attrs, cb->skb); + wg = lookup_interface(genl_info_dump(cb)->attrs, cb->skb); if (IS_ERR(wg)) return PTR_ERR(wg); DUMP_CTX(cb)->wg = wg; diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 86c8eaaa3a43..a8a15b9c22c8 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -255,7 +255,6 @@ struct genl_split_ops { struct genl_dumpit_info { const struct genl_family *family; struct genl_split_ops op; - struct nlattr **attrs; struct genl_info info; }; diff --git a/net/devlink/health.c b/net/devlink/health.c index a85bdec34801..638cad8d5c65 100644 --- a/net/devlink/health.c +++ b/net/devlink/health.c @@ -390,7 +390,7 @@ static int devlink_nl_health_reporter_get_dump_one(struct sk_buff *msg, int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); - const struct genl_dumpit_info *info = genl_dumpit_info(cb); + const struct genl_info *info = genl_info_dump(cb); struct devlink_health_reporter *reporter; unsigned long port_index_end = ULONG_MAX; struct nlattr **attrs = info->attrs; @@ -1264,7 +1264,7 @@ out: static struct devlink_health_reporter * devlink_health_reporter_get_from_cb(struct netlink_callback *cb) { - const struct genl_dumpit_info *info = genl_dumpit_info(cb); + const struct genl_info *info = genl_info_dump(cb); struct devlink_health_reporter *reporter; struct nlattr **attrs = info->attrs; struct devlink *devlink; diff --git a/net/devlink/leftover.c b/net/devlink/leftover.c index 3883a90d32bb..72ba8a716525 100644 --- a/net/devlink/leftover.c +++ b/net/devlink/leftover.c @@ -5201,7 +5201,7 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct nlattr *chunks_attr, *region_attr, *snapshot_attr; u64 ret_offset, start_offset, end_offset = U64_MAX; - struct nlattr 
**attrs = info->attrs; + struct nlattr **attrs = info->info.attrs; struct devlink_port *port = NULL; devlink_chunk_fill_t *region_cb; struct devlink_region *region; @@ -5224,8 +5224,8 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, goto out_unlock; } - if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) { - index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); + if (attrs[DEVLINK_ATTR_PORT_INDEX]) { + index = nla_get_u32(attrs[DEVLINK_ATTR_PORT_INDEX]); port = devlink_port_get_by_index(devlink, index); if (!port) { diff --git a/net/devlink/netlink.c b/net/devlink/netlink.c index a9b43b0c5959..72a5005a64cd 100644 --- a/net/devlink/netlink.c +++ b/net/devlink/netlink.c @@ -228,7 +228,7 @@ static int devlink_nl_inst_iter_dumpit(struct sk_buff *msg, int devlink_nl_dumpit(struct sk_buff *msg, struct netlink_callback *cb, devlink_nl_dump_one_func_t *dump_one) { - const struct genl_dumpit_info *info = genl_dumpit_info(cb); + const struct genl_info *info = genl_info_dump(cb); struct nlattr **attrs = info->attrs; int flags = NLM_F_MULTI; diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index ae344f1b0bbd..9fc7c41f4786 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -538,7 +538,8 @@ static int ethnl_default_start(struct netlink_callback *cb) goto free_req_info; } - ret = ethnl_default_parse(req_info, info->attrs, sock_net(cb->skb->sk), + ret = ethnl_default_parse(req_info, info->info.attrs, + sock_net(cb->skb->sk), ops, cb->extack, false); if (req_info->dev) { /* We ignore device specification in dump requests but as the diff --git a/net/ethtool/tunnels.c b/net/ethtool/tunnels.c index 05f752557b5e..b4ce47dd2aa6 100644 --- a/net/ethtool/tunnels.c +++ b/net/ethtool/tunnels.c @@ -219,7 +219,7 @@ int ethnl_tunnel_info_start(struct netlink_callback *cb) { const struct genl_dumpit_info *info = genl_dumpit_info(cb); struct ethnl_tunnel_info_dump_ctx *ctx = (void *)cb->ctx; - struct nlattr **tb = info->attrs; + struct nlattr **tb = info->info.attrs; int ret; BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c index d610c1886160..1a265a421308 100644 --- a/net/ieee802154/nl802154.c +++ b/net/ieee802154/nl802154.c @@ -262,7 +262,7 @@ nl802154_prepare_wpan_dev_dump(struct sk_buff *skb, if (!cb->args[0]) { *wpan_dev = __cfg802154_wpan_dev_from_attrs(sock_net(skb->sk), - info->attrs); + info->info.attrs); if (IS_ERR(*wpan_dev)) { err = PTR_ERR(*wpan_dev); goto out_unlock; @@ -570,7 +570,7 @@ static int nl802154_dump_wpan_phy_parse(struct sk_buff *skb, struct nl802154_dump_wpan_phy_state *state) { const struct genl_dumpit_info *info = genl_dumpit_info(cb); - struct nlattr **tb = info->attrs; + struct nlattr **tb = info->info.attrs; if (tb[NL802154_ATTR_WPAN_PHY]) state->filter_wpan_phy = nla_get_u32(tb[NL802154_ATTR_WPAN_PHY]); diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 82ad26970b9b..d47879d5a74c 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -846,7 +846,6 @@ static int genl_start(struct netlink_callback *cb) } info->family = ctx->family; info->op = *ops; - info->attrs = attrs; info->info.snd_seq = cb->nlh->nlmsg_seq; info->info.snd_portid = NETLINK_CB(cb->skb).portid; info->info.nlhdr = cb->nlh; @@ -864,7 +863,7 @@ static int genl_start(struct netlink_callback *cb) } if (rc) { - genl_family_rcv_msg_attrs_free(info->attrs); + genl_family_rcv_msg_attrs_free(info->info.attrs); genl_dumpit_info_free(info); cb->data = NULL; } @@ -898,7 +897,7 @@ static int 
genl_done(struct netlink_callback *cb) rc = ops->done(cb); genl_op_unlock(info->family); } - genl_family_rcv_msg_attrs_free(info->attrs); + genl_family_rcv_msg_attrs_free(info->info.attrs); genl_dumpit_info_free(info); return rc; } @@ -1387,7 +1386,7 @@ static int ctrl_dumppolicy_start(struct netlink_callback *cb) { const struct genl_dumpit_info *info = genl_dumpit_info(cb); struct ctrl_dump_policy_ctx *ctx = (void *)cb->ctx; - struct nlattr **tb = info->attrs; + struct nlattr **tb = info->info.attrs; const struct genl_family *rt; struct genl_op_iter i; int err; diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index e9ac6a6f934e..aa1dbf654c3e 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -110,10 +110,10 @@ static struct nfc_dev *__get_device_from_cb(struct netlink_callback *cb) struct nfc_dev *dev; u32 idx; - if (!info->attrs[NFC_ATTR_DEVICE_INDEX]) + if (!info->info.attrs[NFC_ATTR_DEVICE_INDEX]) return ERR_PTR(-EINVAL); - idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); + idx = nla_get_u32(info->info.attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index 299cd6754f14..5bc076f2fa74 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -208,7 +208,7 @@ static int __tipc_nl_compat_dumpit(struct tipc_nl_compat_cmd_dump *cmd, goto err_out; } - info.attrs = attrbuf; + info.info.attrs = attrbuf; if (nlmsg_len(cb.nlh) > 0) { err = nlmsg_parse_deprecated(cb.nlh, GENL_HDRLEN, attrbuf, diff --git a/net/tipc/node.c b/net/tipc/node.c index a9c5b6594889..3105abe97bb9 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -2662,7 +2662,7 @@ static int __tipc_nl_add_node_links(struct net *net, struct tipc_nl_msg *msg, int tipc_nl_node_dump_link(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); - struct nlattr **attrs = genl_dumpit_info(cb)->attrs; + struct nlattr **attrs = genl_dumpit_info(cb)->info.attrs; struct nlattr *link[TIPC_NLA_LINK_MAX + 1]; struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_node *node; @@ -2870,7 +2870,7 @@ int tipc_nl_node_dump_monitor_peer(struct sk_buff *skb, int err; if (!prev_node) { - struct nlattr **attrs = genl_dumpit_info(cb)->attrs; + struct nlattr **attrs = genl_dumpit_info(cb)->info.attrs; struct nlattr *mon[TIPC_NLA_MON_MAX + 1]; if (!attrs[TIPC_NLA_MON]) diff --git a/net/tipc/socket.c b/net/tipc/socket.c index ef8e5139a873..bb1118d02f95 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -3791,7 +3791,7 @@ int tipc_nl_publ_dump(struct sk_buff *skb, struct netlink_callback *cb) struct tipc_sock *tsk; if (!tsk_portid) { - struct nlattr **attrs = genl_dumpit_info(cb)->attrs; + struct nlattr **attrs = genl_dumpit_info(cb)->info.attrs; struct nlattr *sock[TIPC_NLA_SOCK_MAX + 1]; if (!attrs[TIPC_NLA_SOCK]) diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index 926232557e77..f892b0903dba 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -465,7 +465,7 @@ int tipc_udp_nl_dump_remoteip(struct sk_buff *skb, struct netlink_callback *cb) int i; if (!bid && !skip_cnt) { - struct nlattr **attrs = genl_dumpit_info(cb)->attrs; + struct nlattr **attrs = genl_dumpit_info(cb)->info.attrs; struct net *net = sock_net(skb->sk); struct nlattr *battrs[TIPC_NLA_BEARER_MAX + 1]; char *bname; -- cgit v1.2.3 From 5c670a010de46687ed27553602d8131ce4d7a9fb Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 14 Aug 2023 14:47:19 -0700 Subject: genetlink: add a family pointer to struct 
genl_info Having family in struct genl_info is quite useful. It cuts down the number of arguments which need to be passed to helpers which already take struct genl_info. Reviewed-by: Jiri Pirko Link: https://lore.kernel.org/r/20230814214723.2924989-7-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/net/genetlink.h | 4 ++-- net/netlink/genetlink.c | 21 ++++++++++++--------- 2 files changed, 14 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index a8a15b9c22c8..6b858c4cba5b 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -93,6 +93,7 @@ struct genl_family { * struct genl_info - receiving information * @snd_seq: sending sequence number * @snd_portid: netlink portid of sender + * @family: generic netlink family * @nlhdr: netlink message header * @genlhdr: generic netlink message header * @attrs: netlink attributes @@ -103,6 +104,7 @@ struct genl_family { struct genl_info { u32 snd_seq; u32 snd_portid; + const struct genl_family *family; const struct nlmsghdr * nlhdr; struct genlmsghdr * genlhdr; struct nlattr ** attrs; @@ -247,13 +249,11 @@ struct genl_split_ops { /** * struct genl_dumpit_info - info that is available during dumpit op call - * @family: generic netlink family - for internal genl code usage * @op: generic netlink ops - for internal genl code usage * @attrs: netlink attributes * @info: struct genl_info describing the request */ struct genl_dumpit_info { - const struct genl_family *family; struct genl_split_ops op; struct genl_info info; }; diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index d47879d5a74c..8315d31b53db 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -844,8 +844,8 @@ static int genl_start(struct netlink_callback *cb) genl_family_rcv_msg_attrs_free(attrs); return -ENOMEM; } - info->family = ctx->family; info->op = *ops; + info->info.family = ctx->family; info->info.snd_seq = cb->nlh->nlmsg_seq; info->info.snd_portid = NETLINK_CB(cb->skb).portid; info->info.nlhdr = cb->nlh; @@ -872,11 +872,12 @@ static int genl_start(struct netlink_callback *cb) static int genl_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { - struct genl_dumpit_info *info = cb->data; - const struct genl_split_ops *ops = &info->op; + struct genl_dumpit_info *dump_info = cb->data; + const struct genl_split_ops *ops = &dump_info->op; + struct genl_info *info = &dump_info->info; int rc; - info->info.extack = cb->extack; + info->extack = cb->extack; genl_op_lock(info->family); rc = ops->dumpit(skb, cb); @@ -886,19 +887,20 @@ static int genl_dumpit(struct sk_buff *skb, struct netlink_callback *cb) static int genl_done(struct netlink_callback *cb) { - struct genl_dumpit_info *info = cb->data; - const struct genl_split_ops *ops = &info->op; + struct genl_dumpit_info *dump_info = cb->data; + const struct genl_split_ops *ops = &dump_info->op; + struct genl_info *info = &dump_info->info; int rc = 0; - info->info.extack = cb->extack; + info->extack = cb->extack; if (ops->done) { genl_op_lock(info->family); rc = ops->done(cb); genl_op_unlock(info->family); } - genl_family_rcv_msg_attrs_free(info->info.attrs); - genl_dumpit_info_free(info); + genl_family_rcv_msg_attrs_free(info->attrs); + genl_dumpit_info_free(dump_info); return rc; } @@ -952,6 +954,7 @@ static int genl_family_rcv_msg_doit(const struct genl_family *family, info.snd_seq = nlh->nlmsg_seq; info.snd_portid = NETLINK_CB(skb).portid; + info.family = family; info.nlhdr = nlh; info.genlhdr = nlmsg_data(nlh); 
info.attrs = attrbuf; -- cgit v1.2.3 From 0e19d3108aeab6a9edcef07bf0f123fa2961d0d6 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 14 Aug 2023 14:47:21 -0700 Subject: netdev-genl: use struct genl_info for reply construction Use the just added APIs to make the code simpler. Reviewed-by: Jiri Pirko Link: https://lore.kernel.org/r/20230814214723.2924989-9-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/core/netdev-genl.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 797c813c7c77..c1aea8b756b6 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -10,11 +10,11 @@ static int netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp, - u32 portid, u32 seq, int flags, u32 cmd) + const struct genl_info *info) { void *hdr; - hdr = genlmsg_put(rsp, portid, seq, &netdev_nl_family, flags, cmd); + hdr = genlmsg_iput(rsp, info); if (!hdr) return -EMSGSIZE; @@ -41,17 +41,20 @@ netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp, static void netdev_genl_dev_notify(struct net_device *netdev, int cmd) { + struct genl_info info; struct sk_buff *ntf; if (!genl_has_listeners(&netdev_nl_family, dev_net(netdev), NETDEV_NLGRP_MGMT)) return; + genl_info_init_ntf(&info, &netdev_nl_family, cmd); + ntf = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!ntf) return; - if (netdev_nl_dev_fill(netdev, ntf, 0, 0, 0, cmd)) { + if (netdev_nl_dev_fill(netdev, ntf, &info)) { nlmsg_free(ntf); return; } @@ -80,8 +83,7 @@ int netdev_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info) netdev = __dev_get_by_index(genl_info_net(info), ifindex); if (netdev) - err = netdev_nl_dev_fill(netdev, rsp, info->snd_portid, - info->snd_seq, 0, info->genlhdr->cmd); + err = netdev_nl_dev_fill(netdev, rsp, info); else err = -ENODEV; @@ -105,10 +107,7 @@ int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) rtnl_lock(); for_each_netdev_dump(net, netdev, cb->args[0]) { - err = netdev_nl_dev_fill(netdev, skb, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, 0, - NETDEV_CMD_DEV_GET); + err = netdev_nl_dev_fill(netdev, skb, genl_info_dump(cb)); if (err < 0) break; } -- cgit v1.2.3 From ec0e5b09b834da3889be8458bb0451c3baa803d9 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 14 Aug 2023 14:47:22 -0700 Subject: ethtool: netlink: simplify arguments to ethnl_default_parse() Pass struct genl_info directly instead of its members. 
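Purely for illustration (not part of this patch): once a helper takes the genl_info itself, the pieces that used to travel as separate parameters can all be derived from it inside the helper. A minimal sketch under that assumption -- the helper name is invented, while info->attrs, info->extack and genl_info_net() are the existing genl_info accessors the diff below relies on:

	/* sketch: one genl_info argument replaces attrs + netns + extack */
	static int example_parse(struct ethnl_req_info *req_info,
				 const struct genl_info *info)
	{
		struct nlattr **tb = info->attrs;		/* parsed attributes */
		struct net *net = genl_info_net(info);		/* request netns */
		struct netlink_ext_ack *extack = info->extack;	/* error reporting */

		/* ... parse tb, resolve the device in net, report via extack ... */
		return 0;
	}
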
Reviewed-by: Jiri Pirko Link: https://lore.kernel.org/r/20230814214723.2924989-10-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/ethtool/netlink.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 9fc7c41f4786..f7b3171a0aad 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -316,10 +316,8 @@ static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb) /** * ethnl_default_parse() - Parse request message * @req_info: pointer to structure to put data into - * @tb: parsed attributes - * @net: request netns + * @info: genl_info from the request * @request_ops: struct request_ops for request type - * @extack: netlink extack for error reporting * @require_dev: fail if no device identified in header * * Parse universal request header and call request specific ->parse_request() @@ -328,19 +326,21 @@ static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb) * Return: 0 on success or negative error code */ static int ethnl_default_parse(struct ethnl_req_info *req_info, - struct nlattr **tb, struct net *net, + const struct genl_info *info, const struct ethnl_request_ops *request_ops, - struct netlink_ext_ack *extack, bool require_dev) + bool require_dev) { + struct nlattr **tb = info->attrs; int ret; ret = ethnl_parse_header_dev_get(req_info, tb[request_ops->hdr_attr], - net, extack, require_dev); + genl_info_net(info), info->extack, + require_dev); if (ret < 0) return ret; if (request_ops->parse_request) { - ret = request_ops->parse_request(req_info, tb, extack); + ret = request_ops->parse_request(req_info, tb, info->extack); if (ret < 0) return ret; } @@ -393,8 +393,7 @@ static int ethnl_default_doit(struct sk_buff *skb, struct genl_info *info) return -ENOMEM; } - ret = ethnl_default_parse(req_info, info->attrs, genl_info_net(info), - ops, info->extack, !ops->allow_nodev_do); + ret = ethnl_default_parse(req_info, info, ops, !ops->allow_nodev_do); if (ret < 0) goto err_dev; ethnl_init_reply_data(reply_data, ops, req_info->dev); @@ -538,9 +537,7 @@ static int ethnl_default_start(struct netlink_callback *cb) goto free_req_info; } - ret = ethnl_default_parse(req_info, info->info.attrs, - sock_net(cb->skb->sk), - ops, cb->extack, false); + ret = ethnl_default_parse(req_info, &info->info, ops, false); if (req_info->dev) { /* We ignore device specification in dump requests but as the * same parser as for non-dump (doit) requests is used, it -- cgit v1.2.3 From f946270d05c26044c67511ef5a9d91e06962d79f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 14 Aug 2023 14:47:23 -0700 Subject: ethtool: netlink: always pass genl_info to .prepare_data We had a number of bugs in the past because developers forgot to fully test dumps, which pass NULL as info to .prepare_data. .prepare_data implementations would try to access info->extack leading to a null-deref. Now that dumps and notifications can access struct genl_info we can pass it in, and remove the info null checks. 
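Purely for illustration (not part of this patch), a sketch of what a converted callback can now look like; the function name and the WoL check are invented, while const struct genl_info, info->extack, genl_info_is_ntf(), ethnl_ops_begin()/ethnl_ops_complete() and NL_SET_ERR_MSG_MOD() are the interfaces the series itself uses:

	static int example_prepare_data(const struct ethnl_req_info *req_base,
					struct ethnl_reply_data *reply_base,
					const struct genl_info *info)
	{
		struct net_device *dev = reply_base->dev;
		int ret;

		ret = ethnl_ops_begin(dev);
		if (ret < 0)
			return ret;
		if (!dev->ethtool_ops->get_wol) {
			/* info is never NULL, so extack is passed through directly */
			NL_SET_ERR_MSG_MOD(info->extack, "WoL not supported by device");
			ret = -EOPNOTSUPP;
		}
		/* notifications are detected explicitly rather than via info == NULL */
		if (!ret && genl_info_is_ntf(info))
			netdev_dbg(dev, "filling notification data\n");
		ethnl_ops_complete(dev);
		return ret;
	}
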
Reviewed-by: Vladimir Oltean Tested-by: Vladimir Oltean # pause Reviewed-by: Jiri Pirko Link: https://lore.kernel.org/r/20230814214723.2924989-11-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/ethtool/channels.c | 2 +- net/ethtool/coalesce.c | 6 +++--- net/ethtool/debug.c | 2 +- net/ethtool/eee.c | 2 +- net/ethtool/eeprom.c | 9 ++++----- net/ethtool/features.c | 2 +- net/ethtool/fec.c | 2 +- net/ethtool/linkinfo.c | 2 +- net/ethtool/linkmodes.c | 2 +- net/ethtool/linkstate.c | 2 +- net/ethtool/mm.c | 2 +- net/ethtool/module.c | 5 ++--- net/ethtool/netlink.c | 13 ++++++++----- net/ethtool/netlink.h | 2 +- net/ethtool/pause.c | 5 ++--- net/ethtool/phc_vclocks.c | 2 +- net/ethtool/plca.c | 4 ++-- net/ethtool/privflags.c | 2 +- net/ethtool/pse-pd.c | 6 +++--- net/ethtool/rings.c | 5 ++--- net/ethtool/rss.c | 3 ++- net/ethtool/stats.c | 5 ++--- net/ethtool/strset.c | 2 +- net/ethtool/tsinfo.c | 2 +- net/ethtool/wol.c | 5 +++-- 25 files changed, 47 insertions(+), 47 deletions(-) (limited to 'net') diff --git a/net/ethtool/channels.c b/net/ethtool/channels.c index 61c40e889a4d..7b4bbd674bae 100644 --- a/net/ethtool/channels.c +++ b/net/ethtool/channels.c @@ -24,7 +24,7 @@ const struct nla_policy ethnl_channels_get_policy[] = { static int channels_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, - struct genl_info *info) + const struct genl_info *info) { struct channels_reply_data *data = CHANNELS_REPDATA(reply_base); struct net_device *dev = reply_base->dev; diff --git a/net/ethtool/coalesce.c b/net/ethtool/coalesce.c index 01a59ce211c8..83112c1a71ae 100644 --- a/net/ethtool/coalesce.c +++ b/net/ethtool/coalesce.c @@ -59,10 +59,9 @@ const struct nla_policy ethnl_coalesce_get_policy[] = { static int coalesce_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, - struct genl_info *info) + const struct genl_info *info) { struct coalesce_reply_data *data = COALESCE_REPDATA(reply_base); - struct netlink_ext_ack *extack = info ? 
info->extack : NULL; struct net_device *dev = reply_base->dev; int ret; @@ -73,7 +72,8 @@ static int coalesce_prepare_data(const struct ethnl_req_info *req_base, if (ret < 0) return ret; ret = dev->ethtool_ops->get_coalesce(dev, &data->coalesce, - &data->kernel_coalesce, extack); + &data->kernel_coalesce, + info->extack); ethnl_ops_complete(dev); return ret; diff --git a/net/ethtool/debug.c b/net/ethtool/debug.c index e4369769817e..0b2dea56d461 100644 --- a/net/ethtool/debug.c +++ b/net/ethtool/debug.c @@ -23,7 +23,7 @@ const struct nla_policy ethnl_debug_get_policy[] = { static int debug_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, - struct genl_info *info) + const struct genl_info *info) { struct debug_reply_data *data = DEBUG_REPDATA(reply_base); struct net_device *dev = reply_base->dev; diff --git a/net/ethtool/eee.c b/net/ethtool/eee.c index 42104bcb0e47..2853394d06a8 100644 --- a/net/ethtool/eee.c +++ b/net/ethtool/eee.c @@ -26,7 +26,7 @@ const struct nla_policy ethnl_eee_get_policy[] = { static int eee_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, - struct genl_info *info) + const struct genl_info *info) { struct eee_reply_data *data = EEE_REPDATA(reply_base); struct net_device *dev = reply_base->dev; diff --git a/net/ethtool/eeprom.c b/net/ethtool/eeprom.c index 49c0a2a77f02..6209c3a9c8f7 100644 --- a/net/ethtool/eeprom.c +++ b/net/ethtool/eeprom.c @@ -51,8 +51,7 @@ static int fallback_set_params(struct eeprom_req_info *request, } static int eeprom_fallback(struct eeprom_req_info *request, - struct eeprom_reply_data *reply, - struct genl_info *info) + struct eeprom_reply_data *reply) { struct net_device *dev = reply->base.dev; struct ethtool_modinfo modinfo = {0}; @@ -103,7 +102,7 @@ static int get_module_eeprom_by_page(struct net_device *dev, static int eeprom_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, - struct genl_info *info) + const struct genl_info *info) { struct eeprom_reply_data *reply = MODULE_EEPROM_REPDATA(reply_base); struct eeprom_req_info *request = MODULE_EEPROM_REQINFO(req_base); @@ -124,7 +123,7 @@ static int eeprom_prepare_data(const struct ethnl_req_info *req_base, if (ret) goto err_free; - ret = get_module_eeprom_by_page(dev, &page_data, info ? 
info->extack : NULL); + ret = get_module_eeprom_by_page(dev, &page_data, info->extack); if (ret < 0) goto err_ops; @@ -140,7 +139,7 @@ err_free: kfree(page_data.data); if (ret == -EOPNOTSUPP) - return eeprom_fallback(request, reply, info); + return eeprom_fallback(request, reply); return ret; } diff --git a/net/ethtool/features.c b/net/ethtool/features.c index 55d449a2d3fc..a79af8c25a07 100644 --- a/net/ethtool/features.c +++ b/net/ethtool/features.c @@ -35,7 +35,7 @@ static void ethnl_features_to_bitmap32(u32 *dest, netdev_features_t src) static int features_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, - struct genl_info *info) + const struct genl_info *info) { struct features_reply_data *data = FEATURES_REPDATA(reply_base); struct net_device *dev = reply_base->dev; diff --git a/net/ethtool/fec.c b/net/ethtool/fec.c index 0d9a3d153170..e7d3f2c352a3 100644 --- a/net/ethtool/fec.c +++ b/net/ethtool/fec.c @@ -92,7 +92,7 @@ fec_stats_recalc(struct fec_stat_grp *grp, struct ethtool_fec_stat *stats) static int fec_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, - struct genl_info *info) + const struct genl_info *info) { __ETHTOOL_DECLARE_LINK_MODE_MASK(active_fec_modes) = {}; struct fec_reply_data *data = FEC_REPDATA(reply_base); diff --git a/net/ethtool/linkinfo.c b/net/ethtool/linkinfo.c index 310dfe63292a..5c317d23787b 100644 --- a/net/ethtool/linkinfo.c +++ b/net/ethtool/linkinfo.c @@ -23,7 +23,7 @@ const struct nla_policy ethnl_linkinfo_get_policy[] = { static int linkinfo_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, - struct genl_info *info) + const struct genl_info *info) { struct linkinfo_reply_data *data = LINKINFO_REPDATA(reply_base); struct net_device *dev = reply_base->dev; diff --git a/net/ethtool/linkmodes.c b/net/ethtool/linkmodes.c index 20165e07ef90..b2591db49f7d 100644 --- a/net/ethtool/linkmodes.c +++ b/net/ethtool/linkmodes.c @@ -27,7 +27,7 @@ const struct nla_policy ethnl_linkmodes_get_policy[] = { static int linkmodes_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, - struct genl_info *info) + const struct genl_info *info) { struct linkmodes_reply_data *data = LINKMODES_REPDATA(reply_base); struct net_device *dev = reply_base->dev; diff --git a/net/ethtool/linkstate.c b/net/ethtool/linkstate.c index 2158c17a0b32..b2de2108b356 100644 --- a/net/ethtool/linkstate.c +++ b/net/ethtool/linkstate.c @@ -81,7 +81,7 @@ static int linkstate_get_link_ext_state(struct net_device *dev, static int linkstate_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, - struct genl_info *info) + const struct genl_info *info) { struct linkstate_reply_data *data = LINKSTATE_REPDATA(reply_base); struct net_device *dev = reply_base->dev; diff --git a/net/ethtool/mm.c b/net/ethtool/mm.c index 4058a557b5a4..2816bb23c3ad 100644 --- a/net/ethtool/mm.c +++ b/net/ethtool/mm.c @@ -27,7 +27,7 @@ const struct nla_policy ethnl_mm_get_policy[ETHTOOL_A_MM_HEADER + 1] = { static int mm_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, - struct genl_info *info) + const struct genl_info *info) { struct mm_reply_data *data = MM_REPDATA(reply_base); struct net_device *dev = reply_base->dev; diff --git a/net/ethtool/module.c b/net/ethtool/module.c index e0d539b21423..ceb575efc290 100644 --- a/net/ethtool/module.c +++ b/net/ethtool/module.c @@ -38,10 +38,9 @@ static int 
module_get_power_mode(struct net_device *dev, static int module_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, - struct genl_info *info) + const struct genl_info *info) { struct module_reply_data *data = MODULE_REPDATA(reply_base); - struct netlink_ext_ack *extack = info ? info->extack : NULL; struct net_device *dev = reply_base->dev; int ret; @@ -49,7 +48,7 @@ static int module_prepare_data(const struct ethnl_req_info *req_base, if (ret < 0) return ret; - ret = module_get_power_mode(dev, data, extack); + ret = module_get_power_mode(dev, data, info->extack); if (ret < 0) goto out_complete; diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index f7b3171a0aad..3bbd5afb7b31 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -444,12 +444,12 @@ err_dev: static int ethnl_default_dump_one(struct sk_buff *skb, struct net_device *dev, const struct ethnl_dump_ctx *ctx, - struct netlink_callback *cb) + const struct genl_info *info) { void *ehdr; int ret; - ehdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + ehdr = genlmsg_put(skb, info->snd_portid, info->snd_seq, ðtool_genl_family, NLM_F_MULTI, ctx->ops->reply_cmd); if (!ehdr) @@ -457,7 +457,7 @@ static int ethnl_default_dump_one(struct sk_buff *skb, struct net_device *dev, ethnl_init_reply_data(ctx->reply_data, ctx->ops, dev); rtnl_lock(); - ret = ctx->ops->prepare_data(ctx->req_info, ctx->reply_data, NULL); + ret = ctx->ops->prepare_data(ctx->req_info, ctx->reply_data, info); rtnl_unlock(); if (ret < 0) goto out; @@ -495,7 +495,7 @@ static int ethnl_default_dumpit(struct sk_buff *skb, dev_hold(dev); rtnl_unlock(); - ret = ethnl_default_dump_one(skb, dev, ctx, cb); + ret = ethnl_default_dump_one(skb, dev, ctx, genl_info_dump(cb)); rtnl_lock(); dev_put(dev); @@ -647,11 +647,14 @@ static void ethnl_default_notify(struct net_device *dev, unsigned int cmd, struct ethnl_reply_data *reply_data; const struct ethnl_request_ops *ops; struct ethnl_req_info *req_info; + struct genl_info info; struct sk_buff *skb; void *reply_payload; int reply_len; int ret; + genl_info_init_ntf(&info, ðtool_genl_family, cmd); + if (WARN_ONCE(cmd > ETHTOOL_MSG_KERNEL_MAX || !ethnl_default_notify_ops[cmd], "unexpected notification type %u\n", cmd)) @@ -670,7 +673,7 @@ static void ethnl_default_notify(struct net_device *dev, unsigned int cmd, req_info->flags |= ETHTOOL_FLAG_COMPACT_BITSETS; ethnl_init_reply_data(reply_data, ops, dev); - ret = ops->prepare_data(req_info, reply_data, NULL); + ret = ops->prepare_data(req_info, reply_data, &info); if (ret < 0) goto err_cleanup; ret = ops->reply_size(req_info, reply_data); diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 79424b34b553..9a333a8d04c1 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -355,7 +355,7 @@ struct ethnl_request_ops { struct netlink_ext_ack *extack); int (*prepare_data)(const struct ethnl_req_info *req_info, struct ethnl_reply_data *reply_data, - struct genl_info *info); + const struct genl_info *info); int (*reply_size)(const struct ethnl_req_info *req_info, const struct ethnl_reply_data *reply_data); int (*fill_reply)(struct sk_buff *skb, diff --git a/net/ethtool/pause.c b/net/ethtool/pause.c index 6657d0b888d8..f7c847aeb1a2 100644 --- a/net/ethtool/pause.c +++ b/net/ethtool/pause.c @@ -51,10 +51,9 @@ static int pause_parse_request(struct ethnl_req_info *req_base, static int pause_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, - struct genl_info *info) 
+ const struct genl_info *info) { const struct pause_req_info *req_info = PAUSE_REQINFO(req_base); - struct netlink_ext_ack *extack = info ? info->extack : NULL; struct pause_reply_data *data = PAUSE_REPDATA(reply_base); enum ethtool_mac_stats_src src = req_info->src; struct net_device *dev = reply_base->dev; @@ -74,7 +73,7 @@ static int pause_prepare_data(const struct ethnl_req_info *req_base, if ((src == ETHTOOL_MAC_STATS_SRC_EMAC || src == ETHTOOL_MAC_STATS_SRC_PMAC) && !__ethtool_dev_mm_supported(dev)) { - NL_SET_ERR_MSG_MOD(extack, + NL_SET_ERR_MSG_MOD(info->extack, "Device does not support MAC merge layer"); ethnl_ops_complete(dev); return -EOPNOTSUPP; diff --git a/net/ethtool/phc_vclocks.c b/net/ethtool/phc_vclocks.c index 637b2f5297d5..cadaabed60bd 100644 --- a/net/ethtool/phc_vclocks.c +++ b/net/ethtool/phc_vclocks.c @@ -24,7 +24,7 @@ const struct nla_policy ethnl_phc_vclocks_get_policy[] = { static int phc_vclocks_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, - struct genl_info *info) + const struct genl_info *info) { struct phc_vclocks_reply_data *data = PHC_VCLOCKS_REPDATA(reply_base); struct net_device *dev = reply_base->dev; diff --git a/net/ethtool/plca.c b/net/ethtool/plca.c index 5a8cab4df0c9..b238a1afe9ae 100644 --- a/net/ethtool/plca.c +++ b/net/ethtool/plca.c @@ -40,7 +40,7 @@ const struct nla_policy ethnl_plca_get_cfg_policy[] = { static int plca_get_cfg_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, - struct genl_info *info) + const struct genl_info *info) { struct plca_reply_data *data = PLCA_REPDATA(reply_base); struct net_device *dev = reply_base->dev; @@ -183,7 +183,7 @@ const struct nla_policy ethnl_plca_get_status_policy[] = { static int plca_get_status_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, - struct genl_info *info) + const struct genl_info *info) { struct plca_reply_data *data = PLCA_REPDATA(reply_base); struct net_device *dev = reply_base->dev; diff --git a/net/ethtool/privflags.c b/net/ethtool/privflags.c index 23264a1ebf12..297be6a13ab9 100644 --- a/net/ethtool/privflags.c +++ b/net/ethtool/privflags.c @@ -57,7 +57,7 @@ static int ethnl_get_priv_flags_info(struct net_device *dev, static int privflags_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, - struct genl_info *info) + const struct genl_info *info) { struct privflags_reply_data *data = PRIVFLAGS_REPDATA(reply_base); struct net_device *dev = reply_base->dev; diff --git a/net/ethtool/pse-pd.c b/net/ethtool/pse-pd.c index 530b8b99e6df..cc478af77111 100644 --- a/net/ethtool/pse-pd.c +++ b/net/ethtool/pse-pd.c @@ -53,8 +53,8 @@ static int pse_get_pse_attributes(struct net_device *dev, } static int pse_prepare_data(const struct ethnl_req_info *req_base, - struct ethnl_reply_data *reply_base, - struct genl_info *info) + struct ethnl_reply_data *reply_base, + const struct genl_info *info) { struct pse_reply_data *data = PSE_REPDATA(reply_base); struct net_device *dev = reply_base->dev; @@ -64,7 +64,7 @@ static int pse_prepare_data(const struct ethnl_req_info *req_base, if (ret < 0) return ret; - ret = pse_get_pse_attributes(dev, info ? 
info->extack : NULL, data); + ret = pse_get_pse_attributes(dev, info->extack, data); ethnl_ops_complete(dev); diff --git a/net/ethtool/rings.c b/net/ethtool/rings.c index 1c4972526142..fb09f774ea01 100644 --- a/net/ethtool/rings.c +++ b/net/ethtool/rings.c @@ -24,10 +24,9 @@ const struct nla_policy ethnl_rings_get_policy[] = { static int rings_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, - struct genl_info *info) + const struct genl_info *info) { struct rings_reply_data *data = RINGS_REPDATA(reply_base); - struct netlink_ext_ack *extack = info ? info->extack : NULL; struct net_device *dev = reply_base->dev; int ret; @@ -39,7 +38,7 @@ static int rings_prepare_data(const struct ethnl_req_info *req_base, if (ret < 0) return ret; dev->ethtool_ops->get_ringparam(dev, &data->ringparam, - &data->kernel_ringparam, extack); + &data->kernel_ringparam, info->extack); ethnl_ops_complete(dev); return 0; diff --git a/net/ethtool/rss.c b/net/ethtool/rss.c index be260ab34e58..5764202e6cb6 100644 --- a/net/ethtool/rss.c +++ b/net/ethtool/rss.c @@ -42,7 +42,8 @@ rss_parse_request(struct ethnl_req_info *req_info, struct nlattr **tb, static int rss_prepare_data(const struct ethnl_req_info *req_base, - struct ethnl_reply_data *reply_base, struct genl_info *info) + struct ethnl_reply_data *reply_base, + const struct genl_info *info) { struct rss_reply_data *data = RSS_REPDATA(reply_base); struct rss_req_info *request = RSS_REQINFO(req_base); diff --git a/net/ethtool/stats.c b/net/ethtool/stats.c index 010ed19ccc99..912f0c4fff2f 100644 --- a/net/ethtool/stats.c +++ b/net/ethtool/stats.c @@ -114,10 +114,9 @@ static int stats_parse_request(struct ethnl_req_info *req_base, static int stats_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, - struct genl_info *info) + const struct genl_info *info) { const struct stats_req_info *req_info = STATS_REQINFO(req_base); - struct netlink_ext_ack *extack = info ? 
info->extack : NULL; struct stats_reply_data *data = STATS_REPDATA(reply_base); enum ethtool_mac_stats_src src = req_info->src; struct net_device *dev = reply_base->dev; @@ -130,7 +129,7 @@ static int stats_prepare_data(const struct ethnl_req_info *req_base, if ((src == ETHTOOL_MAC_STATS_SRC_EMAC || src == ETHTOOL_MAC_STATS_SRC_PMAC) && !__ethtool_dev_mm_supported(dev)) { - NL_SET_ERR_MSG_MOD(extack, + NL_SET_ERR_MSG_MOD(info->extack, "Device does not support MAC merge layer"); ethnl_ops_complete(dev); return -EOPNOTSUPP; diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c index 3f7de54d85fb..c678b484a079 100644 --- a/net/ethtool/strset.c +++ b/net/ethtool/strset.c @@ -274,7 +274,7 @@ static int strset_prepare_set(struct strset_info *info, struct net_device *dev, static int strset_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, - struct genl_info *info) + const struct genl_info *info) { const struct strset_req_info *req_info = STRSET_REQINFO(req_base); struct strset_reply_data *data = STRSET_REPDATA(reply_base); diff --git a/net/ethtool/tsinfo.c b/net/ethtool/tsinfo.c index 63b5814bd460..9daed0aab162 100644 --- a/net/ethtool/tsinfo.c +++ b/net/ethtool/tsinfo.c @@ -25,7 +25,7 @@ const struct nla_policy ethnl_tsinfo_get_policy[] = { static int tsinfo_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, - struct genl_info *info) + const struct genl_info *info) { struct tsinfo_reply_data *data = TSINFO_REPDATA(reply_base); struct net_device *dev = reply_base->dev; diff --git a/net/ethtool/wol.c b/net/ethtool/wol.c index a4a43d9e6e9d..0ed56c9ac1bc 100644 --- a/net/ethtool/wol.c +++ b/net/ethtool/wol.c @@ -24,7 +24,7 @@ const struct nla_policy ethnl_wol_get_policy[] = { static int wol_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, - struct genl_info *info) + const struct genl_info *info) { struct wol_reply_data *data = WOL_REPDATA(reply_base); struct net_device *dev = reply_base->dev; @@ -39,7 +39,8 @@ static int wol_prepare_data(const struct ethnl_req_info *req_base, dev->ethtool_ops->get_wol(dev, &data->wol); ethnl_ops_complete(dev); /* do not include password in notifications */ - data->show_sopass = info && (data->wol.supported & WAKE_MAGICSECURE); + data->show_sopass = !genl_info_is_ntf(info) && + (data->wol.supported & WAKE_MAGICSECURE); return 0; } -- cgit v1.2.3 From 7458575a07f1c6768e46b8eac6e09a358804c9f6 Mon Sep 17 00:00:00 2001 From: Andrea Mayer Date: Sat, 12 Aug 2023 20:09:25 +0200 Subject: seg6: add NEXT-C-SID support for SRv6 End.X behavior The NEXT-C-SID mechanism described in [1] offers the possibility of encoding several SRv6 segments within a single 128 bit SID address. Such a SID address is called a Compressed SID (C-SID) container. In this way, the length of the SID List can be drastically reduced. A SID instantiated with the NEXT-C-SID flavor considers an IPv6 address logically structured in three main blocks: i) Locator-Block; ii) Locator-Node Function; iii) Argument. 
C-SID container +------------------------------------------------------------------+ | Locator-Block |Loc-Node| Argument | | |Function| | +------------------------------------------------------------------+ <--------- B -----------> <- NF -> <------------- A ---------------> (i) The Locator-Block can be any IPv6 prefix available to the provider; (ii) The Locator-Node Function represents the node and the function to be triggered when a packet is received on the node; (iii) The Argument carries the remaining C-SIDs in the current C-SID container. This patch leverages the NEXT-C-SID mechanism previously introduced in the Linux SRv6 subsystem [2] to support SID compression capabilities in the SRv6 End.X behavior [3]. An SRv6 End.X behavior with NEXT-C-SID flavor works as an End.X behavior but it is capable of processing the compressed SID List encoded in C-SID containers. An SRv6 End.X behavior with NEXT-C-SID flavor can be configured to support user-provided Locator-Block and Locator-Node Function lengths. In this implementation, such lengths must be evenly divisible by 8 (i.e. must be byte-aligned), otherwise the kernel informs the user about invalid values with a meaningful error code and message through netlink_ext_ack. If Locator-Block and/or Locator-Node Function lengths are not provided by the user during configuration of an SRv6 End.X behavior instance with NEXT-C-SID flavor, the kernel will choose their default values i.e., 32-bit Locator-Block and 16-bit Locator-Node Function. [1] - https://datatracker.ietf.org/doc/html/draft-ietf-spring-srv6-srh-compression [2] - https://lore.kernel.org/all/20220912171619.16943-1-andrea.mayer@uniroma2.it/ [3] - https://datatracker.ietf.org/doc/html/rfc8986#name-endx-l3-cross-connect Signed-off-by: Andrea Mayer Reviewed-by: Hangbin Liu Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20230812180926.16689-2-andrea.mayer@uniroma2.it Signed-off-by: Jakub Kicinski --- net/ipv6/seg6_local.c | 108 ++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 88 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index dd433cc265c8..24e2b4b494cb 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -109,15 +109,19 @@ struct bpf_lwt_prog { #define next_csid_chk_lcnode_fn_bits(flen) \ next_csid_chk_lcblock_bits(flen) +/* flag indicating that flavors are set up for a given End* behavior */ +#define SEG6_F_LOCAL_FLAVORS SEG6_F_ATTR(SEG6_LOCAL_FLAVORS) + #define SEG6_F_LOCAL_FLV_OP(flvname) BIT(SEG6_LOCAL_FLV_OP_##flvname) +#define SEG6_F_LOCAL_FLV_NEXT_CSID SEG6_F_LOCAL_FLV_OP(NEXT_CSID) #define SEG6_F_LOCAL_FLV_PSP SEG6_F_LOCAL_FLV_OP(PSP) /* Supported RFC8986 Flavor operations are reported in this bitmask */ #define SEG6_LOCAL_FLV8986_SUPP_OPS SEG6_F_LOCAL_FLV_PSP -/* Supported Flavor operations are reported in this bitmask */ -#define SEG6_LOCAL_FLV_SUPP_OPS (SEG6_F_LOCAL_FLV_OP(NEXT_CSID) | \ +#define SEG6_LOCAL_END_FLV_SUPP_OPS (SEG6_F_LOCAL_FLV_NEXT_CSID | \ SEG6_LOCAL_FLV8986_SUPP_OPS) +#define SEG6_LOCAL_END_X_FLV_SUPP_OPS SEG6_F_LOCAL_FLV_NEXT_CSID struct seg6_flavors_info { /* Flavor operations */ @@ -411,9 +415,72 @@ static int end_next_csid_core(struct sk_buff *skb, struct seg6_local_lwt *slwt) return input_action_end_finish(skb, slwt); } +static int input_action_end_x_finish(struct sk_buff *skb, + struct seg6_local_lwt *slwt) +{ + seg6_lookup_nexthop(skb, &slwt->nh6, 0); + + return dst_input(skb); +} + +static int input_action_end_x_core(struct sk_buff *skb, + struct 
seg6_local_lwt *slwt) +{ + struct ipv6_sr_hdr *srh; + + srh = get_and_validate_srh(skb); + if (!srh) + goto drop; + + advance_nextseg(srh, &ipv6_hdr(skb)->daddr); + + return input_action_end_x_finish(skb, slwt); + +drop: + kfree_skb(skb); + return -EINVAL; +} + +static int end_x_next_csid_core(struct sk_buff *skb, + struct seg6_local_lwt *slwt) +{ + const struct seg6_flavors_info *finfo = &slwt->flv_info; + struct in6_addr *daddr = &ipv6_hdr(skb)->daddr; + + if (seg6_next_csid_is_arg_zero(daddr, finfo)) + return input_action_end_x_core(skb, slwt); + + /* update DA */ + seg6_next_csid_advance_arg(daddr, finfo); + + return input_action_end_x_finish(skb, slwt); +} + static bool seg6_next_csid_enabled(__u32 fops) { - return fops & BIT(SEG6_LOCAL_FLV_OP_NEXT_CSID); + return fops & SEG6_F_LOCAL_FLV_NEXT_CSID; +} + +/* Processing of SRv6 End, End.X, and End.T behaviors can be extended through + * the flavors framework. These behaviors must report the subset of (flavor) + * operations they currently implement. In this way, if a user specifies a + * flavor combination that is not supported by a given End* behavior, the + * kernel refuses to instantiate the tunnel reporting the error. + */ +static int seg6_flv_supp_ops_by_action(int action, __u32 *fops) +{ + switch (action) { + case SEG6_LOCAL_ACTION_END: + *fops = SEG6_LOCAL_END_FLV_SUPP_OPS; + break; + case SEG6_LOCAL_ACTION_END_X: + *fops = SEG6_LOCAL_END_X_FLV_SUPP_OPS; + break; + default: + return -EOPNOTSUPP; + } + + return 0; } /* We describe the packet state in relation to the absence/presence of the SRH @@ -746,21 +813,14 @@ static int input_action_end(struct sk_buff *skb, struct seg6_local_lwt *slwt) /* regular endpoint, and forward to specified nexthop */ static int input_action_end_x(struct sk_buff *skb, struct seg6_local_lwt *slwt) { - struct ipv6_sr_hdr *srh; - - srh = get_and_validate_srh(skb); - if (!srh) - goto drop; - - advance_nextseg(srh, &ipv6_hdr(skb)->daddr); - - seg6_lookup_nexthop(skb, &slwt->nh6, 0); + const struct seg6_flavors_info *finfo = &slwt->flv_info; + __u32 fops = finfo->flv_ops; - return dst_input(skb); + /* check for the presence of NEXT-C-SID since it applies first */ + if (seg6_next_csid_enabled(fops)) + return end_x_next_csid_core(skb, slwt); -drop: - kfree_skb(skb); - return -EINVAL; + return input_action_end_x_core(skb, slwt); } static int input_action_end_t(struct sk_buff *skb, struct seg6_local_lwt *slwt) @@ -1404,13 +1464,14 @@ static struct seg6_action_desc seg6_action_table[] = { .action = SEG6_LOCAL_ACTION_END, .attrs = 0, .optattrs = SEG6_F_LOCAL_COUNTERS | - SEG6_F_ATTR(SEG6_LOCAL_FLAVORS), + SEG6_F_LOCAL_FLAVORS, .input = input_action_end, }, { .action = SEG6_LOCAL_ACTION_END_X, .attrs = SEG6_F_ATTR(SEG6_LOCAL_NH6), - .optattrs = SEG6_F_LOCAL_COUNTERS, + .optattrs = SEG6_F_LOCAL_COUNTERS | + SEG6_F_LOCAL_FLAVORS, .input = input_action_end_x, }, { @@ -2070,7 +2131,8 @@ static int parse_nla_flavors(struct nlattr **attrs, struct seg6_local_lwt *slwt, { struct seg6_flavors_info *finfo = &slwt->flv_info; struct nlattr *tb[SEG6_LOCAL_FLV_MAX + 1]; - unsigned long fops; + int action = slwt->action; + __u32 fops, supp_fops; int rc; rc = nla_parse_nested_deprecated(tb, SEG6_LOCAL_FLV_MAX, @@ -2086,7 +2148,8 @@ static int parse_nla_flavors(struct nlattr **attrs, struct seg6_local_lwt *slwt, return -EINVAL; fops = nla_get_u32(tb[SEG6_LOCAL_FLV_OPERATION]); - if (fops & ~SEG6_LOCAL_FLV_SUPP_OPS) { + rc = seg6_flv_supp_ops_by_action(action, &supp_fops); + if (rc < 0 || (fops & ~supp_fops)) { 
NL_SET_ERR_MSG(extack, "Unsupported Flavor operation(s)"); return -EOPNOTSUPP; } @@ -2618,6 +2681,11 @@ int __init seg6_local_init(void) */ BUILD_BUG_ON(SEG6_LOCAL_MAX + 1 > BITS_PER_TYPE(unsigned long)); + /* Check whether the number of defined flavors exceeds the maximum + * allowed value. + */ + BUILD_BUG_ON(SEG6_LOCAL_FLV_OP_MAX + 1 > BITS_PER_TYPE(__u32)); + /* If the default NEXT-C-SID Locator-Block/Node Function lengths (in * bits) have been changed with invalid values, kernel build stops * here. -- cgit v1.2.3 From 23ab9324fd260277f83a07c51fdc625442e98265 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 13 Aug 2023 19:48:55 +0300 Subject: nexthop: Simplify nexthop bucket dump Before commit f10d3d9df49d ("nexthop: Make nexthop bucket dump more efficient"), rtm_dump_nexthop_bucket_nh() returned a non-zero return code for each resilient nexthop group whose buckets it dumped, regardless if it encountered an error or not. This meant that the sentinel ('dd->ctx->nh.idx') used by the function that walked the different nexthops could not be used as a sentinel for the bucket dump, as otherwise buckets from the same group would be dumped over and over again. This was dealt with by adding another sentinel ('dd->ctx->done_nh_idx') that was incremented by rtm_dump_nexthop_bucket_nh() after successfully dumping all the buckets from a given group. After the previously mentioned commit this sentinel is no longer necessary since the function no longer returns a non-zero return code when successfully dumping all the buckets from a given group. Remove this sentinel and simplify the code. Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20230813164856.2379822-2-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- net/ipv4/nexthop.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'net') diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 09d36bcbd7d4..7e8bb85e9dcb 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -3337,7 +3337,6 @@ static int nh_valid_dump_bucket_req(const struct nlmsghdr *nlh, struct rtm_dump_res_bucket_ctx { struct rtm_dump_nh_ctx nh; u16 bucket_index; - u32 done_nh_idx; /* 1 + the index of the last fully processed NH. */ }; static struct rtm_dump_res_bucket_ctx * @@ -3366,9 +3365,6 @@ static int rtm_dump_nexthop_bucket_nh(struct sk_buff *skb, u16 bucket_index; int err; - if (dd->ctx->nh.idx < dd->ctx->done_nh_idx) - return 0; - nhg = rtnl_dereference(nh->nh_grp); res_table = rtnl_dereference(nhg->res_table); for (bucket_index = dd->ctx->bucket_index; @@ -3395,7 +3391,6 @@ static int rtm_dump_nexthop_bucket_nh(struct sk_buff *skb, return err; } - dd->ctx->done_nh_idx = dd->ctx->nh.idx + 1; dd->ctx->bucket_index = 0; return 0; -- cgit v1.2.3 From db1428f66a8c97793e6596e7c62047211dd6db79 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 13 Aug 2023 19:48:56 +0300 Subject: nexthop: Do not increment dump sentinel at the end of the dump The nexthop and nexthop bucket dump callbacks previously returned a positive return code even when the dump was complete, prompting the core netlink code to invoke the callback again, until returning zero. Zero was only returned by these callbacks when no information was filled in the provided skb, which was achieved by incrementing the dump sentinel at the end of the dump beyond the ID of the last nexthop. This is no longer necessary as when the dump is complete these callbacks return zero. Remove the unnecessary increment. 
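For readers less familiar with the dump convention relied on here, a purely illustrative sketch (the ctx structure, fill helper and EXAMPLE_MAX_ID limit are invented; this is not the nexthop code): the netlink core keeps calling ->dumpit() as long as it returns a positive value, so a callback that returns zero once the walk is finished only needs its resume cursor to point at the next unfilled entry -- there is nothing to gain from bumping it past the end:

	struct example_dump_ctx {	/* lives in cb->ctx between invocations */
		u32 idx;
	};

	static int example_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
	{
		struct example_dump_ctx *ctx = (void *)cb->ctx;
		int err;

		for (; ctx->idx < EXAMPLE_MAX_ID; ctx->idx++) {
			err = example_fill_one(skb, ctx->idx);
			if (err == -EMSGSIZE)
				return skb->len;	/* partial: resume later at ctx->idx */
			if (err)
				return err;
		}
		return 0;			/* complete: no final ctx->idx increment */
	}
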
Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20230813164856.2379822-3-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- net/ipv4/nexthop.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 7e8bb85e9dcb..bbff68b5b5d4 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -3209,7 +3209,6 @@ static int rtm_dump_walk_nexthops(struct sk_buff *skb, return err; } - ctx->idx++; return 0; } -- cgit v1.2.3 From 956db0a13b47df7f3d6d624394e602e8bf9b057e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 14 Aug 2023 13:56:25 -0700 Subject: net: warn about attempts to register negative ifindex Since the xarray changes we mix returning valid ifindex and negative errno in a single int returned from dev_index_reserve(). This depends on the fact that ifindexes can't be negative. Otherwise we may insert into the xarray and return a very large negative value. This in turn may break ERR_PTR(). OvS is susceptible to this problem and lacking validation (fix posted separately for net). Reject negative ifindex explicitly. Add a warning because the input validation is better handled by the caller. Reviewed-by: Leon Romanovsky Link: https://lore.kernel.org/r/20230814205627.2914583-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/core/dev.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 636b41f0b32d..17e6281e408c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -9589,6 +9589,11 @@ static int dev_index_reserve(struct net *net, u32 ifindex) { int err; + if (ifindex > INT_MAX) { + DEBUG_NET_WARN_ON_ONCE(1); + return -EINVAL; + } + if (!ifindex) err = xa_alloc_cyclic(&net->dev_by_index, &ifindex, NULL, xa_limit_31b, &net->ifindex, GFP_KERNEL); -- cgit v1.2.3 From c274af2242693f59f00851b3660a21b10bcd76cc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 16 Aug 2023 08:15:33 +0000 Subject: inet: introduce inet->inet_flags Various inet fields are currently racy. do_ip_setsockopt() and do_ip_getsockopt() are mostly holding the socket lock, but some (fast) paths do not. Use a new inet->inet_flags to hold atomic bits in the series. Remove inet->cmsg_flags, and use instead 9 bits from inet_flags. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Reviewed-by: Simon Horman Signed-off-by: David S. 
Miller --- include/net/inet_sock.h | 55 +++++++++++++++++++++++++------- net/ipv4/ip_sockglue.c | 84 ++++++++++++++++++------------------------------- net/ipv4/ping.c | 5 +-- net/ipv4/raw.c | 2 +- net/ipv4/udp.c | 2 +- net/ipv6/datagram.c | 2 +- net/ipv6/udp.c | 2 +- net/l2tp/l2tp_ip.c | 2 +- 8 files changed, 83 insertions(+), 71 deletions(-) (limited to 'net') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 0bb32bfc6183..e3b35b0015f3 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -194,6 +194,7 @@ struct rtable; * @inet_rcv_saddr - Bound local IPv4 addr * @inet_dport - Destination port * @inet_num - Local port + * @inet_flags - various atomic flags * @inet_saddr - Sending source * @uc_ttl - Unicast TTL * @inet_sport - Source port @@ -218,11 +219,11 @@ struct inet_sock { #define inet_dport sk.__sk_common.skc_dport #define inet_num sk.__sk_common.skc_num + unsigned long inet_flags; __be32 inet_saddr; __s16 uc_ttl; - __u16 cmsg_flags; - struct ip_options_rcu __rcu *inet_opt; __be16 inet_sport; + struct ip_options_rcu __rcu *inet_opt; __u16 inet_id; __u8 tos; @@ -259,16 +260,48 @@ struct inet_sock { #define IPCORK_OPT 1 /* ip-options has been held in ipcork.opt */ #define IPCORK_ALLFRAG 2 /* always fragment (for ipv6 for now) */ +enum { + INET_FLAGS_PKTINFO = 0, + INET_FLAGS_TTL = 1, + INET_FLAGS_TOS = 2, + INET_FLAGS_RECVOPTS = 3, + INET_FLAGS_RETOPTS = 4, + INET_FLAGS_PASSSEC = 5, + INET_FLAGS_ORIGDSTADDR = 6, + INET_FLAGS_CHECKSUM = 7, + INET_FLAGS_RECVFRAGSIZE = 8, +}; + /* cmsg flags for inet */ -#define IP_CMSG_PKTINFO BIT(0) -#define IP_CMSG_TTL BIT(1) -#define IP_CMSG_TOS BIT(2) -#define IP_CMSG_RECVOPTS BIT(3) -#define IP_CMSG_RETOPTS BIT(4) -#define IP_CMSG_PASSSEC BIT(5) -#define IP_CMSG_ORIGDSTADDR BIT(6) -#define IP_CMSG_CHECKSUM BIT(7) -#define IP_CMSG_RECVFRAGSIZE BIT(8) +#define IP_CMSG_PKTINFO BIT(INET_FLAGS_PKTINFO) +#define IP_CMSG_TTL BIT(INET_FLAGS_TTL) +#define IP_CMSG_TOS BIT(INET_FLAGS_TOS) +#define IP_CMSG_RECVOPTS BIT(INET_FLAGS_RECVOPTS) +#define IP_CMSG_RETOPTS BIT(INET_FLAGS_RETOPTS) +#define IP_CMSG_PASSSEC BIT(INET_FLAGS_PASSSEC) +#define IP_CMSG_ORIGDSTADDR BIT(INET_FLAGS_ORIGDSTADDR) +#define IP_CMSG_CHECKSUM BIT(INET_FLAGS_CHECKSUM) +#define IP_CMSG_RECVFRAGSIZE BIT(INET_FLAGS_RECVFRAGSIZE) + +#define IP_CMSG_ALL (IP_CMSG_PKTINFO | IP_CMSG_TTL | \ + IP_CMSG_TOS | IP_CMSG_RECVOPTS | \ + IP_CMSG_RETOPTS | IP_CMSG_PASSSEC | \ + IP_CMSG_ORIGDSTADDR | IP_CMSG_CHECKSUM | \ + IP_CMSG_RECVFRAGSIZE) + +static inline unsigned long inet_cmsg_flags(const struct inet_sock *inet) +{ + return READ_ONCE(inet->inet_flags) & IP_CMSG_ALL; +} + +#define inet_test_bit(nr, sk) \ + test_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags) +#define inet_set_bit(nr, sk) \ + set_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags) +#define inet_clear_bit(nr, sk) \ + clear_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags) +#define inet_assign_bit(nr, sk, val) \ + assign_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags, val) static inline bool sk_is_inet(struct sock *sk) { diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index d41bce8927b2..66f55f3db5ec 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -171,8 +171,10 @@ static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb) void ip_cmsg_recv_offset(struct msghdr *msg, struct sock *sk, struct sk_buff *skb, int tlen, int offset) { - struct inet_sock *inet = inet_sk(sk); - unsigned int flags = inet->cmsg_flags; + unsigned long flags = inet_cmsg_flags(inet_sk(sk)); + 
+ if (!flags) + return; /* Ordered by supposed usage frequency */ if (flags & IP_CMSG_PKTINFO) { @@ -568,7 +570,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) if (ipv4_datagram_support_cmsg(sk, skb, serr->ee.ee_origin)) { sin->sin_family = AF_INET; sin->sin_addr.s_addr = ip_hdr(skb)->saddr; - if (inet_sk(sk)->cmsg_flags) + if (inet_cmsg_flags(inet_sk(sk))) ip_cmsg_recv(msg, skb); } @@ -635,7 +637,7 @@ EXPORT_SYMBOL(ip_sock_set_mtu_discover); void ip_sock_set_pktinfo(struct sock *sk) { lock_sock(sk); - inet_sk(sk)->cmsg_flags |= IP_CMSG_PKTINFO; + inet_set_bit(PKTINFO, sk); release_sock(sk); } EXPORT_SYMBOL(ip_sock_set_pktinfo); @@ -990,67 +992,43 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname, break; } case IP_PKTINFO: - if (val) - inet->cmsg_flags |= IP_CMSG_PKTINFO; - else - inet->cmsg_flags &= ~IP_CMSG_PKTINFO; + inet_assign_bit(PKTINFO, sk, val); break; case IP_RECVTTL: - if (val) - inet->cmsg_flags |= IP_CMSG_TTL; - else - inet->cmsg_flags &= ~IP_CMSG_TTL; + inet_assign_bit(TTL, sk, val); break; case IP_RECVTOS: - if (val) - inet->cmsg_flags |= IP_CMSG_TOS; - else - inet->cmsg_flags &= ~IP_CMSG_TOS; + inet_assign_bit(TOS, sk, val); break; case IP_RECVOPTS: - if (val) - inet->cmsg_flags |= IP_CMSG_RECVOPTS; - else - inet->cmsg_flags &= ~IP_CMSG_RECVOPTS; + inet_assign_bit(RECVOPTS, sk, val); break; case IP_RETOPTS: - if (val) - inet->cmsg_flags |= IP_CMSG_RETOPTS; - else - inet->cmsg_flags &= ~IP_CMSG_RETOPTS; + inet_assign_bit(RETOPTS, sk, val); break; case IP_PASSSEC: - if (val) - inet->cmsg_flags |= IP_CMSG_PASSSEC; - else - inet->cmsg_flags &= ~IP_CMSG_PASSSEC; + inet_assign_bit(PASSSEC, sk, val); break; case IP_RECVORIGDSTADDR: - if (val) - inet->cmsg_flags |= IP_CMSG_ORIGDSTADDR; - else - inet->cmsg_flags &= ~IP_CMSG_ORIGDSTADDR; + inet_assign_bit(ORIGDSTADDR, sk, val); break; case IP_CHECKSUM: if (val) { - if (!(inet->cmsg_flags & IP_CMSG_CHECKSUM)) { + if (!(inet_test_bit(CHECKSUM, sk))) { inet_inc_convert_csum(sk); - inet->cmsg_flags |= IP_CMSG_CHECKSUM; + inet_set_bit(CHECKSUM, sk); } } else { - if (inet->cmsg_flags & IP_CMSG_CHECKSUM) { + if (inet_test_bit(CHECKSUM, sk)) { inet_dec_convert_csum(sk); - inet->cmsg_flags &= ~IP_CMSG_CHECKSUM; + inet_clear_bit(CHECKSUM, sk); } } break; case IP_RECVFRAGSIZE: if (sk->sk_type != SOCK_RAW && sk->sk_type != SOCK_DGRAM) goto e_inval; - if (val) - inet->cmsg_flags |= IP_CMSG_RECVFRAGSIZE; - else - inet->cmsg_flags &= ~IP_CMSG_RECVFRAGSIZE; + inet_assign_bit(RECVFRAGSIZE, sk, val); break; case IP_TOS: /* This sets both TOS and Precedence */ __ip_sock_set_tos(sk, val); @@ -1415,7 +1393,7 @@ e_inval: void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb) { struct in_pktinfo *pktinfo = PKTINFO_SKB_CB(skb); - bool prepare = (inet_sk(sk)->cmsg_flags & IP_CMSG_PKTINFO) || + bool prepare = inet_test_bit(PKTINFO, sk) || ipv6_sk_rxinfo(sk); if (prepare && skb_rtable(skb)) { @@ -1601,31 +1579,31 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, return 0; } case IP_PKTINFO: - val = (inet->cmsg_flags & IP_CMSG_PKTINFO) != 0; + val = inet_test_bit(PKTINFO, sk); break; case IP_RECVTTL: - val = (inet->cmsg_flags & IP_CMSG_TTL) != 0; + val = inet_test_bit(TTL, sk); break; case IP_RECVTOS: - val = (inet->cmsg_flags & IP_CMSG_TOS) != 0; + val = inet_test_bit(TOS, sk); break; case IP_RECVOPTS: - val = (inet->cmsg_flags & IP_CMSG_RECVOPTS) != 0; + val = inet_test_bit(RECVOPTS, sk); break; case IP_RETOPTS: - val = (inet->cmsg_flags & IP_CMSG_RETOPTS) != 0; + val = 
inet_test_bit(RETOPTS, sk); break; case IP_PASSSEC: - val = (inet->cmsg_flags & IP_CMSG_PASSSEC) != 0; + val = inet_test_bit(PASSSEC, sk); break; case IP_RECVORIGDSTADDR: - val = (inet->cmsg_flags & IP_CMSG_ORIGDSTADDR) != 0; + val = inet_test_bit(ORIGDSTADDR, sk); break; case IP_CHECKSUM: - val = (inet->cmsg_flags & IP_CMSG_CHECKSUM) != 0; + val = inet_test_bit(CHECKSUM, sk); break; case IP_RECVFRAGSIZE: - val = (inet->cmsg_flags & IP_CMSG_RECVFRAGSIZE) != 0; + val = inet_test_bit(RECVFRAGSIZE, sk); break; case IP_TOS: val = inet->tos; @@ -1737,7 +1715,7 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, msg.msg_controllen = len; msg.msg_flags = in_compat_syscall() ? MSG_CMSG_COMPAT : 0; - if (inet->cmsg_flags & IP_CMSG_PKTINFO) { + if (inet_test_bit(PKTINFO, sk)) { struct in_pktinfo info; info.ipi_addr.s_addr = inet->inet_rcv_saddr; @@ -1745,11 +1723,11 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, info.ipi_ifindex = inet->mc_index; put_cmsg(&msg, SOL_IP, IP_PKTINFO, sizeof(info), &info); } - if (inet->cmsg_flags & IP_CMSG_TTL) { + if (inet_test_bit(TTL, sk)) { int hlim = inet->mc_ttl; put_cmsg(&msg, SOL_IP, IP_TTL, sizeof(hlim), &hlim); } - if (inet->cmsg_flags & IP_CMSG_TOS) { + if (inet_test_bit(TOS, sk)) { int tos = inet->rcv_tos; put_cmsg(&msg, SOL_IP, IP_TOS, sizeof(tos), &tos); } diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 25dd78cee179..7e8702cb6634 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -894,7 +894,7 @@ int ping_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, *addr_len = sizeof(*sin); } - if (isk->cmsg_flags) + if (inet_cmsg_flags(isk)) ip_cmsg_recv(msg, skb); #if IS_ENABLED(CONFIG_IPV6) @@ -921,7 +921,8 @@ int ping_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, if (skb->protocol == htons(ETH_P_IPV6) && inet6_sk(sk)->rxopt.all) pingv6_ops.ip6_datagram_recv_specific_ctl(sk, msg, skb); - else if (skb->protocol == htons(ETH_P_IP) && isk->cmsg_flags) + else if (skb->protocol == htons(ETH_P_IP) && + inet_cmsg_flags(isk)) ip_cmsg_recv(msg, skb); #endif } else { diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index cb381f5aa464..e6e813f4aa31 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -767,7 +767,7 @@ static int raw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); *addr_len = sizeof(*sin); } - if (inet->cmsg_flags) + if (inet_cmsg_flags(inet)) ip_cmsg_recv(msg, skb); if (flags & MSG_TRUNC) copied = skb->len; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 3e2f29c14fa8..4b791133989c 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1870,7 +1870,7 @@ try_again: if (udp_sk(sk)->gro_enabled) udp_cmsg_recv(msg, sk, skb); - if (inet->cmsg_flags) + if (inet_cmsg_flags(inet)) ip_cmsg_recv_offset(msg, sk, skb, sizeof(struct udphdr), off); err = copied; diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index d80d6024cafa..41ebc4e57473 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -524,7 +524,7 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) } else { ipv6_addr_set_v4mapped(ip_hdr(skb)->saddr, &sin->sin6_addr); - if (inet_sk(sk)->cmsg_flags) + if (inet_cmsg_flags(inet_sk(sk))) ip_cmsg_recv(msg, skb); } } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 1ea01b0d9be3..ebc6ae47cfea 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -420,7 +420,7 @@ try_again: ip6_datagram_recv_common_ctl(sk, msg, skb); if (is_udp4) { - if (inet->cmsg_flags) + if (inet_cmsg_flags(inet)) 
ip_cmsg_recv_offset(msg, sk, skb, sizeof(struct udphdr), off); } else { diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index f9073bc7281f..9a2a9ed3ba47 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -552,7 +552,7 @@ static int l2tp_ip_recvmsg(struct sock *sk, struct msghdr *msg, memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); *addr_len = sizeof(*sin); } - if (inet->cmsg_flags) + if (inet_cmsg_flags(inet)) ip_cmsg_recv(msg, skb); if (flags & MSG_TRUNC) copied = skb->len; -- cgit v1.2.3 From b4d84bce4c437c4ac1b6f7ef4d612d7d52f62cfe Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 16 Aug 2023 08:15:34 +0000 Subject: inet: set/get simple options locklessly Now we have inet->inet_flags, we can set following options without having to hold the socket lock: IP_PKTINFO, IP_RECVTTL, IP_RECVTOS, IP_RECVOPTS, IP_RETOPTS, IP_PASSSEC, IP_RECVORIGDSTADDR, IP_RECVFRAGSIZE. ip_sock_set_pktinfo() no longer hold the socket lock. Similarly we can get the following options whithout holding the socket lock: IP_PKTINFO, IP_RECVTTL, IP_RECVTOS, IP_RECVOPTS, IP_RETOPTS, IP_PASSSEC, IP_RECVORIGDSTADDR, IP_CHECKSUM, IP_RECVFRAGSIZE. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/ipv4/ip_sockglue.c | 118 ++++++++++++++++++++++++++----------------------- 1 file changed, 62 insertions(+), 56 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 66f55f3db5ec..69b87518348a 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -636,9 +636,7 @@ EXPORT_SYMBOL(ip_sock_set_mtu_discover); void ip_sock_set_pktinfo(struct sock *sk) { - lock_sock(sk); inet_set_bit(PKTINFO, sk); - release_sock(sk); } EXPORT_SYMBOL(ip_sock_set_pktinfo); @@ -952,6 +950,36 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname, if (ip_mroute_opt(optname)) return ip_mroute_setsockopt(sk, optname, optval, optlen); + /* Handle options that can be set without locking the socket. 
*/ + switch (optname) { + case IP_PKTINFO: + inet_assign_bit(PKTINFO, sk, val); + return 0; + case IP_RECVTTL: + inet_assign_bit(TTL, sk, val); + return 0; + case IP_RECVTOS: + inet_assign_bit(TOS, sk, val); + return 0; + case IP_RECVOPTS: + inet_assign_bit(RECVOPTS, sk, val); + return 0; + case IP_RETOPTS: + inet_assign_bit(RETOPTS, sk, val); + return 0; + case IP_PASSSEC: + inet_assign_bit(PASSSEC, sk, val); + return 0; + case IP_RECVORIGDSTADDR: + inet_assign_bit(ORIGDSTADDR, sk, val); + return 0; + case IP_RECVFRAGSIZE: + if (sk->sk_type != SOCK_RAW && sk->sk_type != SOCK_DGRAM) + return -EINVAL; + inet_assign_bit(RECVFRAGSIZE, sk, val); + return 0; + } + err = 0; if (needs_rtnl) rtnl_lock(); @@ -991,27 +1019,6 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname, kfree_rcu(old, rcu); break; } - case IP_PKTINFO: - inet_assign_bit(PKTINFO, sk, val); - break; - case IP_RECVTTL: - inet_assign_bit(TTL, sk, val); - break; - case IP_RECVTOS: - inet_assign_bit(TOS, sk, val); - break; - case IP_RECVOPTS: - inet_assign_bit(RECVOPTS, sk, val); - break; - case IP_RETOPTS: - inet_assign_bit(RETOPTS, sk, val); - break; - case IP_PASSSEC: - inet_assign_bit(PASSSEC, sk, val); - break; - case IP_RECVORIGDSTADDR: - inet_assign_bit(ORIGDSTADDR, sk, val); - break; case IP_CHECKSUM: if (val) { if (!(inet_test_bit(CHECKSUM, sk))) { @@ -1025,11 +1032,6 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname, } } break; - case IP_RECVFRAGSIZE: - if (sk->sk_type != SOCK_RAW && sk->sk_type != SOCK_DGRAM) - goto e_inval; - inet_assign_bit(RECVFRAGSIZE, sk, val); - break; case IP_TOS: /* This sets both TOS and Precedence */ __ip_sock_set_tos(sk, val); break; @@ -1544,6 +1546,37 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, if (len < 0) return -EINVAL; + /* Handle options that can be read without locking the socket. 
*/ + switch (optname) { + case IP_PKTINFO: + val = inet_test_bit(PKTINFO, sk); + goto copyval; + case IP_RECVTTL: + val = inet_test_bit(TTL, sk); + goto copyval; + case IP_RECVTOS: + val = inet_test_bit(TOS, sk); + goto copyval; + case IP_RECVOPTS: + val = inet_test_bit(RECVOPTS, sk); + goto copyval; + case IP_RETOPTS: + val = inet_test_bit(RETOPTS, sk); + goto copyval; + case IP_PASSSEC: + val = inet_test_bit(PASSSEC, sk); + goto copyval; + case IP_RECVORIGDSTADDR: + val = inet_test_bit(ORIGDSTADDR, sk); + goto copyval; + case IP_CHECKSUM: + val = inet_test_bit(CHECKSUM, sk); + goto copyval; + case IP_RECVFRAGSIZE: + val = inet_test_bit(RECVFRAGSIZE, sk); + goto copyval; + } + if (needs_rtnl) rtnl_lock(); sockopt_lock_sock(sk); @@ -1578,33 +1611,6 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, return -EFAULT; return 0; } - case IP_PKTINFO: - val = inet_test_bit(PKTINFO, sk); - break; - case IP_RECVTTL: - val = inet_test_bit(TTL, sk); - break; - case IP_RECVTOS: - val = inet_test_bit(TOS, sk); - break; - case IP_RECVOPTS: - val = inet_test_bit(RECVOPTS, sk); - break; - case IP_RETOPTS: - val = inet_test_bit(RETOPTS, sk); - break; - case IP_PASSSEC: - val = inet_test_bit(PASSSEC, sk); - break; - case IP_RECVORIGDSTADDR: - val = inet_test_bit(ORIGDSTADDR, sk); - break; - case IP_CHECKSUM: - val = inet_test_bit(CHECKSUM, sk); - break; - case IP_RECVFRAGSIZE: - val = inet_test_bit(RECVFRAGSIZE, sk); - break; case IP_TOS: val = inet->tos; break; @@ -1754,7 +1760,7 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, return -ENOPROTOOPT; } sockopt_release_sock(sk); - +copyval: if (len < sizeof(int) && len > 0 && val >= 0 && val <= 255) { unsigned char ucval = (unsigned char)val; len = 1; -- cgit v1.2.3 From 6b5f43ea08150e7ff72f734545101c58489ead5b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 16 Aug 2023 08:15:35 +0000 Subject: inet: move inet->recverr to inet->inet_flags IP_RECVERR socket option can now be set/get without locking the socket. This patch potentially avoids data-races around inet->recverr. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Reviewed-by: Simon Horman Signed-off-by: David S.
Miller --- include/net/inet_sock.h | 5 +++-- net/dccp/ipv4.c | 4 +--- net/ipv4/inet_diag.c | 2 +- net/ipv4/ip_sockglue.c | 23 ++++++++++------------- net/ipv4/ping.c | 2 +- net/ipv4/raw.c | 14 ++++++++------ net/ipv4/tcp_ipv4.c | 5 ++--- net/ipv4/udp.c | 5 +++-- net/sctp/input.c | 2 +- 9 files changed, 30 insertions(+), 32 deletions(-) (limited to 'net') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index e3b35b0015f3..552188aa5a2d 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -230,8 +230,7 @@ struct inet_sock { __u8 min_ttl; __u8 mc_ttl; __u8 pmtudisc; - __u8 recverr:1, - is_icsk:1, + __u8 is_icsk:1, freebind:1, hdrincl:1, mc_loop:1, @@ -270,6 +269,8 @@ enum { INET_FLAGS_ORIGDSTADDR = 6, INET_FLAGS_CHECKSUM = 7, INET_FLAGS_RECVFRAGSIZE = 8, + + INET_FLAGS_RECVERR = 9, }; /* cmsg flags for inet */ diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 8e919cfe6e23..8dd6837c476a 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -247,7 +247,6 @@ static int dccp_v4_err(struct sk_buff *skb, u32 info) const u8 offset = iph->ihl << 2; const struct dccp_hdr *dh; struct dccp_sock *dp; - struct inet_sock *inet; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; struct sock *sk; @@ -361,8 +360,7 @@ static int dccp_v4_err(struct sk_buff *skb, u32 info) * --ANK (980905) */ - inet = inet_sk(sk); - if (!sock_owned_by_user(sk) && inet->recverr) { + if (!sock_owned_by_user(sk) && inet_test_bit(RECVERR, sk)) { sk->sk_err = err; sk_error_report(sk); } else { /* Only an error on timeout */ diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index f7426926a104..25d5f76b66bd 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -182,7 +182,7 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, r->idiag_inode = sock_i_ino(sk); memset(&inet_sockopt, 0, sizeof(inet_sockopt)); - inet_sockopt.recverr = inet->recverr; + inet_sockopt.recverr = inet_test_bit(RECVERR, sk); inet_sockopt.is_icsk = inet->is_icsk; inet_sockopt.freebind = inet->freebind; inet_sockopt.hdrincl = inet->hdrincl; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 69b87518348a..8283d862a9db 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -446,12 +446,11 @@ EXPORT_SYMBOL_GPL(ip_icmp_error); void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 info) { - struct inet_sock *inet = inet_sk(sk); struct sock_exterr_skb *serr; struct iphdr *iph; struct sk_buff *skb; - if (!inet->recverr) + if (!inet_test_bit(RECVERR, sk)) return; skb = alloc_skb(sizeof(struct iphdr), GFP_ATOMIC); @@ -617,9 +616,7 @@ EXPORT_SYMBOL(ip_sock_set_freebind); void ip_sock_set_recverr(struct sock *sk) { - lock_sock(sk); - inet_sk(sk)->recverr = true; - release_sock(sk); + inet_set_bit(RECVERR, sk); } EXPORT_SYMBOL(ip_sock_set_recverr); @@ -978,6 +975,11 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname, return -EINVAL; inet_assign_bit(RECVFRAGSIZE, sk, val); return 0; + case IP_RECVERR: + inet_assign_bit(RECVERR, sk, val); + if (!val) + skb_queue_purge(&sk->sk_error_queue); + return 0; } err = 0; @@ -1064,11 +1066,6 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname, goto e_inval; inet->pmtudisc = val; break; - case IP_RECVERR: - inet->recverr = !!val; - if (!val) - skb_queue_purge(&sk->sk_error_queue); - break; case IP_RECVERR_RFC4884: if (val < 0 || val > 1) goto e_inval; @@ -1575,6 +1572,9 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, case IP_RECVFRAGSIZE: val = 
inet_test_bit(RECVFRAGSIZE, sk); goto copyval; + case IP_RECVERR: + val = inet_test_bit(RECVERR, sk); + goto copyval; } if (needs_rtnl) @@ -1649,9 +1649,6 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, } break; } - case IP_RECVERR: - val = inet->recverr; - break; case IP_RECVERR_RFC4884: val = inet->recverr_rfc4884; break; diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 7e8702cb6634..75e0aee35eb7 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -580,7 +580,7 @@ void ping_err(struct sk_buff *skb, int offset, u32 info) * RFC1122: OK. Passes ICMP errors back to application, as per * 4.1.3.3. */ - if ((family == AF_INET && !inet_sock->recverr) || + if ((family == AF_INET && !inet_test_bit(RECVERR, sk)) || (family == AF_INET6 && !inet6_sk(sk)->recverr)) { if (!harderr || sk->sk_state != TCP_ESTABLISHED) goto out; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index e6e813f4aa31..f4c27dc5714b 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -203,8 +203,9 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info) struct inet_sock *inet = inet_sk(sk); const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; - int err = 0; int harderr = 0; + bool recverr; + int err = 0; if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) ipv4_sk_update_pmtu(skb, sk, info); @@ -218,7 +219,8 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info) 2. Socket is connected (otherwise the error indication is useless without ip_recverr and error is hard. */ - if (!inet->recverr && sk->sk_state != TCP_ESTABLISHED) + recverr = inet_test_bit(RECVERR, sk); + if (!recverr && sk->sk_state != TCP_ESTABLISHED) return; switch (type) { @@ -245,7 +247,7 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info) } } - if (inet->recverr) { + if (recverr) { const struct iphdr *iph = (const struct iphdr *)skb->data; u8 *payload = skb->data + (iph->ihl << 2); @@ -254,7 +256,7 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info) ip_icmp_error(sk, skb, err, 0, info, payload); } - if (inet->recverr || harderr) { + if (recverr || harderr) { sk->sk_err = err; sk_error_report(sk); } @@ -413,7 +415,7 @@ error_free: kfree_skb(skb); error: IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS); - if (err == -ENOBUFS && !inet->recverr) + if (err == -ENOBUFS && !inet_test_bit(RECVERR, sk)) err = 0; return err; } @@ -645,7 +647,7 @@ back_from_confirm: ip_flush_pending_frames(sk); else if (!(msg->msg_flags & MSG_MORE)) { err = ip_push_pending_frames(sk, &fl4); - if (err == -ENOBUFS && !inet->recverr) + if (err == -ENOBUFS && !inet_test_bit(RECVERR, sk)) err = 0; } release_sock(sk); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 5b18a048f613..2a662d5f3072 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -477,7 +477,6 @@ int tcp_v4_err(struct sk_buff *skb, u32 info) const struct iphdr *iph = (const struct iphdr *)skb->data; struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2)); struct tcp_sock *tp; - struct inet_sock *inet; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; struct sock *sk; @@ -625,8 +624,8 @@ int tcp_v4_err(struct sk_buff *skb, u32 info) * --ANK (980905) */ - inet = inet_sk(sk); - if (!sock_owned_by_user(sk) && inet->recverr) { + if (!sock_owned_by_user(sk) && + inet_test_bit(RECVERR, sk)) { WRITE_ONCE(sk->sk_err, err); sk_error_report(sk); } else { /* Only an error on timeout */ diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 4b791133989c..0794a2c46a56 100644 --- 
a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -779,7 +779,7 @@ int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) (u8 *)(uh+1)); goto out; } - if (!inet->recverr) { + if (!inet_test_bit(RECVERR, sk)) { if (!harderr || sk->sk_state != TCP_ESTABLISHED) goto out; } else @@ -962,7 +962,8 @@ csum_partial: send: err = ip_send_skb(sock_net(sk), skb); if (err) { - if (err == -ENOBUFS && !inet->recverr) { + if (err == -ENOBUFS && + !inet_test_bit(RECVERR, sk)) { UDP_INC_STATS(sock_net(sk), UDP_MIB_SNDBUFERRORS, is_udplite); err = 0; diff --git a/net/sctp/input.c b/net/sctp/input.c index 2613c4d74b16..17fcaa9b0df9 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -581,7 +581,7 @@ static void sctp_v4_err_handle(struct sctp_transport *t, struct sk_buff *skb, default: return; } - if (!sock_owned_by_user(sk) && inet_sk(sk)->recverr) { + if (!sock_owned_by_user(sk) && inet_test_bit(RECVERR, sk)) { sk->sk_err = err; sk_error_report(sk); } else { /* Only an error on timeout */ -- cgit v1.2.3 From 8e8cfb114d9f1e2efddb892f993c1ad61635c85e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 16 Aug 2023 08:15:36 +0000 Subject: inet: move inet->recverr_rfc4884 to inet->inet_flags IP_RECVERR_RFC4884 socket option can now be set/read without locking the socket. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/net/inet_sock.h | 2 +- net/ipv4/inet_diag.c | 2 +- net/ipv4/ip_sockglue.c | 18 +++++++++--------- 3 files changed, 11 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 552188aa5a2d..c01f1f64a861 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -238,7 +238,6 @@ struct inet_sock { mc_all:1, nodefrag:1; __u8 bind_address_no_port:1, - recverr_rfc4884:1, defer_connect:1; /* Indicates that fastopen_connect is set * and cookie exists so we defer connect * until first data frame is written @@ -271,6 +270,7 @@ enum { INET_FLAGS_RECVFRAGSIZE = 8, INET_FLAGS_RECVERR = 9, + INET_FLAGS_RECVERR_RFC4884 = 10, }; /* cmsg flags for inet */ diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 25d5f76b66bd..6255d6fdbc80 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -191,7 +191,7 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, inet_sockopt.mc_all = inet->mc_all; inet_sockopt.nodefrag = inet->nodefrag; inet_sockopt.bind_address_no_port = inet->bind_address_no_port; - inet_sockopt.recverr_rfc4884 = inet->recverr_rfc4884; + inet_sockopt.recverr_rfc4884 = inet_test_bit(RECVERR_RFC4884, sk); inet_sockopt.defer_connect = inet->defer_connect; if (nla_put(skb, INET_DIAG_SOCKOPT, sizeof(inet_sockopt), &inet_sockopt)) diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 8283d862a9db..f75f44ad7b11 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -433,7 +433,7 @@ void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err, serr->port = port; if (skb_pull(skb, payload - skb->data)) { - if (inet_sk(sk)->recverr_rfc4884) + if (inet_test_bit(RECVERR_RFC4884, sk)) ipv4_icmp_error_rfc4884(skb, &serr->ee.ee_rfc4884); skb_reset_transport_header(skb); @@ -980,6 +980,11 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname, if (!val) skb_queue_purge(&sk->sk_error_queue); return 0; + case IP_RECVERR_RFC4884: + if (val < 0 || val > 1) + return -EINVAL; + inet_assign_bit(RECVERR_RFC4884, sk, val); + return 0; } err = 0; @@ -1066,11 +1071,6 @@ int 
do_ip_setsockopt(struct sock *sk, int level, int optname, goto e_inval; inet->pmtudisc = val; break; - case IP_RECVERR_RFC4884: - if (val < 0 || val > 1) - goto e_inval; - inet->recverr_rfc4884 = !!val; - break; case IP_MULTICAST_TTL: if (sk->sk_type == SOCK_STREAM) goto e_inval; @@ -1575,6 +1575,9 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, case IP_RECVERR: val = inet_test_bit(RECVERR, sk); goto copyval; + case IP_RECVERR_RFC4884: + val = inet_test_bit(RECVERR_RFC4884, sk); + goto copyval; } if (needs_rtnl) @@ -1649,9 +1652,6 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, } break; } - case IP_RECVERR_RFC4884: - val = inet->recverr_rfc4884; - break; case IP_MULTICAST_TTL: val = inet->mc_ttl; break; -- cgit v1.2.3 From 3f7e753206bb20fc098b44ec40001befd1fe18d1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 16 Aug 2023 08:15:37 +0000 Subject: inet: move inet->freebind to inet->inet_flags IP_FREEBIND socket option can now be set/read without locking the socket. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Reviewed-by: Simon Horman Reviewed-by: Matthieu Baerts Signed-off-by: David S. Miller --- include/net/inet_sock.h | 5 +++-- include/net/ipv6.h | 3 ++- net/ipv4/inet_diag.c | 2 +- net/ipv4/ip_sockglue.c | 21 +++++++++------------ net/ipv6/ipv6_sockglue.c | 4 ++-- net/mptcp/sockopt.c | 7 ++++--- net/sctp/protocol.c | 2 +- 7 files changed, 22 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index c01f1f64a861..d6ba963534b4 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -231,7 +231,6 @@ struct inet_sock { __u8 mc_ttl; __u8 pmtudisc; __u8 is_icsk:1, - freebind:1, hdrincl:1, mc_loop:1, transparent:1, @@ -271,6 +270,7 @@ enum { INET_FLAGS_RECVERR = 9, INET_FLAGS_RECVERR_RFC4884 = 10, + INET_FLAGS_FREEBIND = 11, }; /* cmsg flags for inet */ @@ -423,7 +423,8 @@ static inline bool inet_can_nonlocal_bind(struct net *net, struct inet_sock *inet) { return READ_ONCE(net->ipv4.sysctl_ip_nonlocal_bind) || - inet->freebind || inet->transparent; + test_bit(INET_FLAGS_FREEBIND, &inet->inet_flags) || + inet->transparent; } static inline bool inet_addr_valid_or_nonlocal(struct net *net, diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 22643ffc2df8..fd570d77588e 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -937,7 +937,8 @@ static inline bool ipv6_can_nonlocal_bind(struct net *net, struct inet_sock *inet) { return net->ipv6.sysctl.ip_nonlocal_bind || - inet->freebind || inet->transparent; + test_bit(INET_FLAGS_FREEBIND, &inet->inet_flags) || + inet->transparent; } /* Sysctl settings for net ipv6.auto_flowlabels */ diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 6255d6fdbc80..5a96f4f28eca 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -184,7 +184,7 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, memset(&inet_sockopt, 0, sizeof(inet_sockopt)); inet_sockopt.recverr = inet_test_bit(RECVERR, sk); inet_sockopt.is_icsk = inet->is_icsk; - inet_sockopt.freebind = inet->freebind; + inet_sockopt.freebind = inet_test_bit(FREEBIND, sk); inet_sockopt.hdrincl = inet->hdrincl; inet_sockopt.mc_loop = inet->mc_loop; inet_sockopt.transparent = inet->transparent; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index f75f44ad7b11..6af843106312 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -608,9 +608,7 @@ EXPORT_SYMBOL(ip_sock_set_tos); void ip_sock_set_freebind(struct sock 
*sk) { - lock_sock(sk); - inet_sk(sk)->freebind = true; - release_sock(sk); + inet_set_bit(FREEBIND, sk); } EXPORT_SYMBOL(ip_sock_set_freebind); @@ -985,6 +983,11 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname, return -EINVAL; inet_assign_bit(RECVERR_RFC4884, sk, val); return 0; + case IP_FREEBIND: + if (optlen < 1) + return -EINVAL; + inet_assign_bit(FREEBIND, sk, val); + return 0; } err = 0; @@ -1310,12 +1313,6 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname, inet->mc_all = val; break; - case IP_FREEBIND: - if (optlen < 1) - goto e_inval; - inet->freebind = !!val; - break; - case IP_IPSEC_POLICY: case IP_XFRM_POLICY: err = -EPERM; @@ -1578,6 +1575,9 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, case IP_RECVERR_RFC4884: val = inet_test_bit(RECVERR_RFC4884, sk); goto copyval; + case IP_FREEBIND: + val = inet_test_bit(FREEBIND, sk); + goto copyval; } if (needs_rtnl) @@ -1737,9 +1737,6 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, len -= msg.msg_controllen; return copy_to_sockptr(optlen, &len, sizeof(int)); } - case IP_FREEBIND: - val = inet->freebind; - break; case IP_TRANSPARENT: val = inet->transparent; break; diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index ca377159967c..3eb38436f8d4 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -641,7 +641,7 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, if (optlen < sizeof(int)) goto e_inval; /* we also don't have a separate freebind bit for IPV6 */ - inet_sk(sk)->freebind = valbool; + inet_assign_bit(FREEBIND, sk, valbool); retv = 0; break; @@ -1334,7 +1334,7 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname, break; case IPV6_FREEBIND: - val = inet_sk(sk)->freebind; + val = inet_test_bit(FREEBIND, sk); break; case IPV6_RECVORIGDSTADDR: diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 21bc46acbe38..ffbe2f5f5b44 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -419,7 +419,8 @@ static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname, inet_sk(sk)->transparent = inet_sk(ssk)->transparent; break; case IPV6_FREEBIND: - inet_sk(sk)->freebind = inet_sk(ssk)->freebind; + inet_assign_bit(FREEBIND, sk, + inet_test_bit(FREEBIND, ssk)); break; } @@ -704,7 +705,7 @@ static int mptcp_setsockopt_sol_ip_set_transparent(struct mptcp_sock *msk, int o switch (optname) { case IP_FREEBIND: - issk->freebind = inet_sk(sk)->freebind; + inet_assign_bit(FREEBIND, ssk, inet_test_bit(FREEBIND, sk)); break; case IP_TRANSPARENT: issk->transparent = inet_sk(sk)->transparent; @@ -1441,7 +1442,7 @@ static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk) __tcp_sock_set_nodelay(ssk, !!msk->nodelay); inet_sk(ssk)->transparent = inet_sk(sk)->transparent; - inet_sk(ssk)->freebind = inet_sk(sk)->freebind; + inet_assign_bit(FREEBIND, ssk, inet_test_bit(FREEBIND, sk)); } static void __mptcp_sockopt_sync(struct mptcp_sock *msk, struct sock *ssk) diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 33c0895e101c..2185f44198de 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -360,7 +360,7 @@ static int sctp_v4_available(union sctp_addr *addr, struct sctp_sock *sp) ret = inet_addr_type_table(net, addr->v4.sin_addr.s_addr, tb_id); if (addr->v4.sin_addr.s_addr != htonl(INADDR_ANY) && ret != RTN_LOCAL && - !sp->inet.freebind && + !inet_test_bit(FREEBIND, sk) && !READ_ONCE(net->ipv4.sysctl_ip_nonlocal_bind)) return 0; -- cgit v1.2.3 From 
cafbe182a467bf6799242fd7468438cf1ab833dc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 16 Aug 2023 08:15:38 +0000 Subject: inet: move inet->hdrincl to inet->inet_flags IP_HDRINCL socket option can now be set/read without locking the socket. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/net/inet_sock.h | 4 ++-- net/ipv4/af_inet.c | 2 +- net/ipv4/inet_diag.c | 2 +- net/ipv4/ip_output.c | 5 +++-- net/ipv4/ip_sockglue.c | 18 ++++++++---------- net/ipv4/raw.c | 10 +++------- net/ipv4/route.c | 8 ++++---- net/ipv6/af_inet6.c | 2 +- net/ipv6/ip6_output.c | 5 +++-- net/ipv6/raw.c | 16 +++++----------- 10 files changed, 31 insertions(+), 41 deletions(-) (limited to 'net') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index d6ba963534b4..ad1895e32e7d 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -231,7 +231,6 @@ struct inet_sock { __u8 mc_ttl; __u8 pmtudisc; __u8 is_icsk:1, - hdrincl:1, mc_loop:1, transparent:1, mc_all:1, @@ -271,6 +270,7 @@ enum { INET_FLAGS_RECVERR = 9, INET_FLAGS_RECVERR_RFC4884 = 10, INET_FLAGS_FREEBIND = 11, + INET_FLAGS_HDRINCL = 12, }; /* cmsg flags for inet */ @@ -397,7 +397,7 @@ static inline __u8 inet_sk_flowi_flags(const struct sock *sk) { __u8 flags = 0; - if (inet_sk(sk)->transparent || inet_sk(sk)->hdrincl) + if (inet_sk(sk)->transparent || inet_test_bit(HDRINCL, sk)) flags |= FLOWI_FLAG_ANYSRC; return flags; } diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index c59da65f19d2..5785fe9de58c 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -338,7 +338,7 @@ lookup_protocol: if (SOCK_RAW == sock->type) { inet->inet_num = protocol; if (IPPROTO_RAW == protocol) - inet->hdrincl = 1; + inet_set_bit(HDRINCL, sk); } if (READ_ONCE(net->ipv4.sysctl_ip_no_pmtu_disc)) diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 5a96f4f28eca..98f3eb0ce16a 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -185,7 +185,7 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, inet_sockopt.recverr = inet_test_bit(RECVERR, sk); inet_sockopt.is_icsk = inet->is_icsk; inet_sockopt.freebind = inet_test_bit(FREEBIND, sk); - inet_sockopt.hdrincl = inet->hdrincl; + inet_sockopt.hdrincl = inet_test_bit(HDRINCL, sk); inet_sockopt.mc_loop = inet->mc_loop; inet_sockopt.transparent = inet->transparent; inet_sockopt.mc_all = inet->mc_all; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index f28c87533a46..8f396eada1b6 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1039,7 +1039,7 @@ static int __ip_append_data(struct sock *sk, } } } else if ((flags & MSG_SPLICE_PAGES) && length) { - if (inet->hdrincl) + if (inet_test_bit(HDRINCL, sk)) return -EPERM; if (rt->dst.dev->features & NETIF_F_SG && getfrag == ip_generic_getfrag) @@ -1467,7 +1467,8 @@ struct sk_buff *__ip_make_skb(struct sock *sk, * so icmphdr does not in skb linear region and can not get icmp_type * by icmp_hdr(skb)->type. 
*/ - if (sk->sk_type == SOCK_RAW && !inet_sk(sk)->hdrincl) + if (sk->sk_type == SOCK_RAW && + !inet_test_bit(HDRINCL, sk)) icmp_type = fl4->fl4_icmp_type; else icmp_type = icmp_hdr(skb)->type; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 6af843106312..763456fd4f4f 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -988,6 +988,11 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname, return -EINVAL; inet_assign_bit(FREEBIND, sk, val); return 0; + case IP_HDRINCL: + if (sk->sk_type != SOCK_RAW) + return -ENOPROTOOPT; + inet_assign_bit(HDRINCL, sk, val); + return 0; } err = 0; @@ -1052,13 +1057,6 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname, goto e_inval; inet->uc_ttl = val; break; - case IP_HDRINCL: - if (sk->sk_type != SOCK_RAW) { - err = -ENOPROTOOPT; - break; - } - inet->hdrincl = val ? 1 : 0; - break; case IP_NODEFRAG: if (sk->sk_type != SOCK_RAW) { err = -ENOPROTOOPT; @@ -1578,6 +1576,9 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, case IP_FREEBIND: val = inet_test_bit(FREEBIND, sk); goto copyval; + case IP_HDRINCL: + val = inet_test_bit(HDRINCL, sk); + goto copyval; } if (needs_rtnl) @@ -1625,9 +1626,6 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, inet->uc_ttl); break; } - case IP_HDRINCL: - val = inet->hdrincl; - break; case IP_NODEFRAG: val = inet->nodefrag; break; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index f4c27dc5714b..4b5db5d1edc2 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -251,7 +251,7 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info) const struct iphdr *iph = (const struct iphdr *)skb->data; u8 *payload = skb->data + (iph->ihl << 2); - if (inet->hdrincl) + if (inet_test_bit(HDRINCL, sk)) payload = skb->data; ip_icmp_error(sk, skb, err, 0, info, payload); } @@ -491,12 +491,8 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (len > 0xFFFF) goto out; - /* hdrincl should be READ_ONCE(inet->hdrincl) - * but READ_ONCE() doesn't work with bit fields. - * Doing this indirectly yields the same result. - */ - hdrincl = inet->hdrincl; - hdrincl = READ_ONCE(hdrincl); + hdrincl = inet_test_bit(HDRINCL, sk); + /* * Check the flags. */ diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 92fede388d52..a4e153dd615b 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -515,13 +515,12 @@ static void __build_flow_key(const struct net *net, struct flowi4 *fl4, __u8 scope = RT_SCOPE_UNIVERSE; if (sk) { - const struct inet_sock *inet = inet_sk(sk); - oif = sk->sk_bound_dev_if; mark = READ_ONCE(sk->sk_mark); tos = ip_sock_rt_tos(sk); scope = ip_sock_rt_scope(sk); - prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol; + prot = inet_test_bit(HDRINCL, sk) ? IPPROTO_RAW : + sk->sk_protocol; } flowi4_init_output(fl4, oif, mark, tos & IPTOS_RT_MASK, scope, @@ -555,7 +554,8 @@ static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk) flowi4_init_output(fl4, sk->sk_bound_dev_if, READ_ONCE(sk->sk_mark), ip_sock_rt_tos(sk) & IPTOS_RT_MASK, ip_sock_rt_scope(sk), - inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, + inet_test_bit(HDRINCL, sk) ? 
+ IPPROTO_RAW : sk->sk_protocol, inet_sk_flowi_flags(sk), daddr, inet->inet_saddr, 0, 0, sk->sk_uid); rcu_read_unlock(); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 3ec0359d5c1f..ed9d207080ac 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -205,7 +205,7 @@ lookup_protocol: if (SOCK_RAW == sock->type) { inet->inet_num = protocol; if (IPPROTO_RAW == protocol) - inet->hdrincl = 1; + inet_set_bit(HDRINCL, sk); } sk->sk_destruct = inet6_sock_destruct; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index bc96559bbf0f..f8a1f6bb3f87 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1591,7 +1591,7 @@ emsgsize: } } } else if ((flags & MSG_SPLICE_PAGES) && length) { - if (inet_sk(sk)->hdrincl) + if (inet_test_bit(HDRINCL, sk)) return -EPERM; if (rt->dst.dev->features & NETIF_F_SG && getfrag == ip_generic_getfrag) @@ -1995,7 +1995,8 @@ struct sk_buff *__ip6_make_skb(struct sock *sk, struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); u8 icmp6_type; - if (sk->sk_socket->type == SOCK_RAW && !inet_sk(sk)->hdrincl) + if (sk->sk_socket->type == SOCK_RAW && + !inet_test_bit(HDRINCL, sk)) icmp6_type = fl6->fl6_icmp_type; else icmp6_type = icmp6_hdr(skb)->icmp6_type; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index ea16734f5e1f..0eae7661a85c 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -291,7 +291,6 @@ static void rawv6_err(struct sock *sk, struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { - struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); int err; int harderr; @@ -315,7 +314,7 @@ static void rawv6_err(struct sock *sk, struct sk_buff *skb, } if (np->recverr) { u8 *payload = skb->data; - if (!inet->hdrincl) + if (!inet_test_bit(HDRINCL, sk)) payload += offset; ipv6_icmp_error(sk, skb, err, 0, ntohl(info), payload); } @@ -406,7 +405,7 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb) skb->len, inet->inet_num, 0)); - if (inet->hdrincl) { + if (inet_test_bit(HDRINCL, sk)) { if (skb_checksum_complete(skb)) { atomic_inc(&sk->sk_drops); kfree_skb_reason(skb, SKB_DROP_REASON_SKB_CSUM); @@ -762,12 +761,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; - /* hdrincl should be READ_ONCE(inet->hdrincl) - * but READ_ONCE() doesn't work with bit fields. - * Doing this indirectly yields the same result. - */ - hdrincl = inet->hdrincl; - hdrincl = READ_ONCE(hdrincl); + hdrincl = inet_test_bit(HDRINCL, sk); /* * Get and verify the address. @@ -1000,7 +994,7 @@ static int do_rawv6_setsockopt(struct sock *sk, int level, int optname, case IPV6_HDRINCL: if (sk->sk_type != SOCK_RAW) return -EINVAL; - inet_sk(sk)->hdrincl = !!val; + inet_assign_bit(HDRINCL, sk, val); return 0; case IPV6_CHECKSUM: if (inet_sk(sk)->inet_num == IPPROTO_ICMPV6 && @@ -1068,7 +1062,7 @@ static int do_rawv6_getsockopt(struct sock *sk, int level, int optname, switch (optname) { case IPV6_HDRINCL: - val = inet_sk(sk)->hdrincl; + val = inet_test_bit(HDRINCL, sk); break; case IPV6_CHECKSUM: /* -- cgit v1.2.3 From b09bde5c3554d58cb711dac55a10ecc56d436966 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 16 Aug 2023 08:15:39 +0000 Subject: inet: move inet->mc_loop to inet->inet_frags IP_MULTICAST_LOOP socket option can now be set/read without locking the socket. v3: fix build bot error reported in ipvs set_mcast_loop() Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Reviewed-by: Simon Horman Signed-off-by: David S. 
Miller --- include/net/inet_sock.h | 2 +- net/core/sock.c | 2 +- net/ipv4/af_inet.c | 2 +- net/ipv4/inet_diag.c | 2 +- net/ipv4/ip_sockglue.c | 16 ++++++++-------- net/ipv4/udp_tunnel_core.c | 2 +- net/ipv6/af_inet6.c | 2 +- net/netfilter/ipvs/ip_vs_sync.c | 4 +--- net/sctp/socket.c | 2 +- 9 files changed, 16 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index ad1895e32e7d..6c4eeca59f60 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -231,7 +231,6 @@ struct inet_sock { __u8 mc_ttl; __u8 pmtudisc; __u8 is_icsk:1, - mc_loop:1, transparent:1, mc_all:1, nodefrag:1; @@ -271,6 +270,7 @@ enum { INET_FLAGS_RECVERR_RFC4884 = 10, INET_FLAGS_FREEBIND = 11, INET_FLAGS_HDRINCL = 12, + INET_FLAGS_MC_LOOP = 13, }; /* cmsg flags for inet */ diff --git a/net/core/sock.c b/net/core/sock.c index 525619776c6f..22d94394335f 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -767,7 +767,7 @@ bool sk_mc_loop(struct sock *sk) return true; switch (sk->sk_family) { case AF_INET: - return inet_sk(sk)->mc_loop; + return inet_test_bit(MC_LOOP, sk); #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: return inet6_sk(sk)->mc_loop; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 5785fe9de58c..e847822df9b6 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -356,7 +356,7 @@ lookup_protocol: sk->sk_txrehash = READ_ONCE(net->core.sysctl_txrehash); inet->uc_ttl = -1; - inet->mc_loop = 1; + inet_set_bit(MC_LOOP, sk); inet->mc_ttl = 1; inet->mc_all = 1; inet->mc_index = 0; diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 98f3eb0ce16a..cc797261893b 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -186,7 +186,7 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, inet_sockopt.is_icsk = inet->is_icsk; inet_sockopt.freebind = inet_test_bit(FREEBIND, sk); inet_sockopt.hdrincl = inet_test_bit(HDRINCL, sk); - inet_sockopt.mc_loop = inet->mc_loop; + inet_sockopt.mc_loop = inet_test_bit(MC_LOOP, sk); inet_sockopt.transparent = inet->transparent; inet_sockopt.mc_all = inet->mc_all; inet_sockopt.nodefrag = inet->nodefrag; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 763456fd4f4f..be569032b612 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -993,6 +993,11 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname, return -ENOPROTOOPT; inet_assign_bit(HDRINCL, sk, val); return 0; + case IP_MULTICAST_LOOP: + if (optlen < 1) + return -EINVAL; + inet_assign_bit(MC_LOOP, sk, val); + return 0; } err = 0; @@ -1083,11 +1088,6 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname, goto e_inval; inet->mc_ttl = val; break; - case IP_MULTICAST_LOOP: - if (optlen < 1) - goto e_inval; - inet->mc_loop = !!val; - break; case IP_UNICAST_IF: { struct net_device *dev = NULL; @@ -1579,6 +1579,9 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, case IP_HDRINCL: val = inet_test_bit(HDRINCL, sk); goto copyval; + case IP_MULTICAST_LOOP: + val = inet_test_bit(MC_LOOP, sk); + goto copyval; } if (needs_rtnl) @@ -1653,9 +1656,6 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, case IP_MULTICAST_TTL: val = inet->mc_ttl; break; - case IP_MULTICAST_LOOP: - val = inet->mc_loop; - break; case IP_UNICAST_IF: val = (__force int)htonl((__u32) inet->uc_index); break; diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c index 5f8104cf082d..9b18f371af0d 100644 --- a/net/ipv4/udp_tunnel_core.c +++ b/net/ipv4/udp_tunnel_core.c @@ 
-63,7 +63,7 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock, struct sock *sk = sock->sk; /* Disable multicast loopback */ - inet_sk(sk)->mc_loop = 0; + inet_clear_bit(MC_LOOP, sk); /* Enable CHECKSUM_UNNECESSARY to CHECKSUM_COMPLETE conversion */ inet_inc_convert_csum(sk); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index ed9d207080ac..8e9d4c1f0e83 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -229,7 +229,7 @@ lookup_protocol: */ inet->uc_ttl = -1; - inet->mc_loop = 1; + inet_set_bit(MC_LOOP, sk); inet->mc_ttl = 1; inet->mc_index = 0; RCU_INIT_POINTER(inet->mc_list, NULL); diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 264f2f87a437..da5af28ff57b 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1297,11 +1297,9 @@ static void set_sock_size(struct sock *sk, int mode, int val) */ static void set_mcast_loop(struct sock *sk, u_char loop) { - struct inet_sock *inet = inet_sk(sk); - /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */ lock_sock(sk); - inet->mc_loop = loop ? 1 : 0; + inet_assign_bit(MC_LOOP, sk, loop); #ifdef CONFIG_IP_VS_IPV6 if (sk->sk_family == AF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 6e3d28aa587c..04b390892827 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -9482,7 +9482,7 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk, newinet->inet_id = get_random_u16(); newinet->uc_ttl = inet->uc_ttl; - newinet->mc_loop = 1; + inet_set_bit(MC_LOOP, newsk); newinet->mc_ttl = 1; newinet->mc_index = 0; newinet->mc_list = NULL; -- cgit v1.2.3 From 307b4ac6dc18436076cdd314aa3e556be077bf2d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 16 Aug 2023 08:15:40 +0000 Subject: inet: move inet->mc_all to inet->inet_frags IP_MULTICAST_ALL socket option can now be set/read without locking the socket. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Reviewed-by: Simon Horman Signed-off-by: David S. 
Miller --- include/net/inet_sock.h | 2 +- net/ipv4/af_inet.c | 2 +- net/ipv4/igmp.c | 2 +- net/ipv4/inet_diag.c | 2 +- net/ipv4/ip_sockglue.c | 22 +++++++++++----------- 5 files changed, 15 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 6c4eeca59f60..fffd34fa6a7c 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -232,7 +232,6 @@ struct inet_sock { __u8 pmtudisc; __u8 is_icsk:1, transparent:1, - mc_all:1, nodefrag:1; __u8 bind_address_no_port:1, defer_connect:1; /* Indicates that fastopen_connect is set @@ -271,6 +270,7 @@ enum { INET_FLAGS_FREEBIND = 11, INET_FLAGS_HDRINCL = 12, INET_FLAGS_MC_LOOP = 13, + INET_FLAGS_MC_ALL = 14, }; /* cmsg flags for inet */ diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index e847822df9b6..47c9cdd7687f 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -358,7 +358,7 @@ lookup_protocol: inet->uc_ttl = -1; inet_set_bit(MC_LOOP, sk); inet->mc_ttl = 1; - inet->mc_all = 1; + inet_set_bit(MC_ALL, sk); inet->mc_index = 0; inet->mc_list = NULL; inet->rcv_tos = 0; diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 48ff5f13e797..0c9e768e5628 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -2658,7 +2658,7 @@ int ip_mc_sf_allow(const struct sock *sk, __be32 loc_addr, __be32 rmt_addr, (sdif && pmc->multi.imr_ifindex == sdif))) break; } - ret = inet->mc_all; + ret = inet_test_bit(MC_ALL, sk); if (!pmc) goto unlock; psl = rcu_dereference(pmc->sflist); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index cc797261893b..e009dab80c35 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -188,7 +188,7 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, inet_sockopt.hdrincl = inet_test_bit(HDRINCL, sk); inet_sockopt.mc_loop = inet_test_bit(MC_LOOP, sk); inet_sockopt.transparent = inet->transparent; - inet_sockopt.mc_all = inet->mc_all; + inet_sockopt.mc_all = inet_test_bit(MC_ALL, sk); inet_sockopt.nodefrag = inet->nodefrag; inet_sockopt.bind_address_no_port = inet->bind_address_no_port; inet_sockopt.recverr_rfc4884 = inet_test_bit(RECVERR_RFC4884, sk); diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index be569032b612..2f27c30a4ecc 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -998,6 +998,14 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname, return -EINVAL; inet_assign_bit(MC_LOOP, sk, val); return 0; + case IP_MULTICAST_ALL: + if (optlen < 1) + return -EINVAL; + if (val != 0 && val != 1) + return -EINVAL; + inet_assign_bit(MC_ALL, sk, val); + return 0; + } err = 0; @@ -1303,14 +1311,6 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname, else err = ip_set_mcast_msfilter(sk, optval, optlen); break; - case IP_MULTICAST_ALL: - if (optlen < 1) - goto e_inval; - if (val != 0 && val != 1) - goto e_inval; - inet->mc_all = val; - break; - case IP_IPSEC_POLICY: case IP_XFRM_POLICY: err = -EPERM; @@ -1582,6 +1582,9 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, case IP_MULTICAST_LOOP: val = inet_test_bit(MC_LOOP, sk); goto copyval; + case IP_MULTICAST_ALL: + val = inet_test_bit(MC_ALL, sk); + goto copyval; } if (needs_rtnl) @@ -1694,9 +1697,6 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, else err = ip_get_mcast_msfilter(sk, optval, optlen, len); goto out; - case IP_MULTICAST_ALL: - val = inet->mc_all; - break; case IP_PKTOPTIONS: { struct msghdr msg; -- cgit v1.2.3 From 4bd0623f04eef65c0a324000fad73c4d3a677f8e Mon Sep 17 00:00:00 2001 
From: Eric Dumazet Date: Wed, 16 Aug 2023 08:15:41 +0000 Subject: inet: move inet->transparent to inet->inet_flags IP_TRANSPARENT socket option can now be set/read without locking the socket. v2: removed unused issk variable in mptcp_setsockopt_sol_ip_set_transparent() v4: rebased after commit 3f326a821b99 ("mptcp: change the mpc check helper to return a sk") Signed-off-by: Eric Dumazet Cc: Paolo Abeni Acked-by: Soheil Hassas Yeganeh Reviewed-by: Simon Horman Reviewed-by: Matthieu Baerts Signed-off-by: David S. Miller --- include/net/inet_sock.h | 6 +++--- include/net/ipv6.h | 2 +- include/net/route.h | 2 +- include/net/tcp.h | 2 +- net/ipv4/inet_diag.c | 2 +- net/ipv4/inet_timewait_sock.c | 2 +- net/ipv4/ip_sockglue.c | 28 +++++++++++++--------------- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_minisocks.c | 3 +-- net/ipv6/ipv6_sockglue.c | 4 ++-- net/mptcp/sockopt.c | 11 +++++------ 11 files changed, 30 insertions(+), 34 deletions(-) (limited to 'net') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index fffd34fa6a7c..cefd9a60dc6d 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -231,7 +231,6 @@ struct inet_sock { __u8 mc_ttl; __u8 pmtudisc; __u8 is_icsk:1, - transparent:1, nodefrag:1; __u8 bind_address_no_port:1, defer_connect:1; /* Indicates that fastopen_connect is set @@ -271,6 +270,7 @@ enum { INET_FLAGS_HDRINCL = 12, INET_FLAGS_MC_LOOP = 13, INET_FLAGS_MC_ALL = 14, + INET_FLAGS_TRANSPARENT = 15, }; /* cmsg flags for inet */ @@ -397,7 +397,7 @@ static inline __u8 inet_sk_flowi_flags(const struct sock *sk) { __u8 flags = 0; - if (inet_sk(sk)->transparent || inet_test_bit(HDRINCL, sk)) + if (inet_test_bit(TRANSPARENT, sk) || inet_test_bit(HDRINCL, sk)) flags |= FLOWI_FLAG_ANYSRC; return flags; } @@ -424,7 +424,7 @@ static inline bool inet_can_nonlocal_bind(struct net *net, { return READ_ONCE(net->ipv4.sysctl_ip_nonlocal_bind) || test_bit(INET_FLAGS_FREEBIND, &inet->inet_flags) || - inet->transparent; + test_bit(INET_FLAGS_TRANSPARENT, &inet->inet_flags); } static inline bool inet_addr_valid_or_nonlocal(struct net *net, diff --git a/include/net/ipv6.h b/include/net/ipv6.h index fd570d77588e..d40d8238d4c2 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -938,7 +938,7 @@ static inline bool ipv6_can_nonlocal_bind(struct net *net, { return net->ipv6.sysctl.ip_nonlocal_bind || test_bit(INET_FLAGS_FREEBIND, &inet->inet_flags) || - inet->transparent; + test_bit(INET_FLAGS_TRANSPARENT, &inet->inet_flags); } /* Sysctl settings for net ipv6.auto_flowlabels */ diff --git a/include/net/route.h b/include/net/route.h index d9ca98d2366f..51a45b1887b5 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -298,7 +298,7 @@ static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, { __u8 flow_flags = 0; - if (inet_sk(sk)->transparent) + if (inet_test_bit(TRANSPARENT, sk)) flow_flags |= FLOWI_FLAG_ANYSRC; flowi4_init_output(fl4, oif, READ_ONCE(sk->sk_mark), ip_sock_rt_tos(sk), diff --git a/include/net/tcp.h b/include/net/tcp.h index 6d77c08d83b7..07b21d9a9620 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2031,7 +2031,7 @@ static inline bool inet_sk_transparent(const struct sock *sk) case TCP_NEW_SYN_RECV: return inet_rsk(inet_reqsk(sk))->no_srccheck; } - return inet_sk(sk)->transparent; + return inet_test_bit(TRANSPARENT, sk); } /* Determines whether this is a thin stream (which may suffer from diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index e009dab80c35..45fefd2f31fd 100644 --- a/net/ipv4/inet_diag.c +++ 
b/net/ipv4/inet_diag.c @@ -187,7 +187,7 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, inet_sockopt.freebind = inet_test_bit(FREEBIND, sk); inet_sockopt.hdrincl = inet_test_bit(HDRINCL, sk); inet_sockopt.mc_loop = inet_test_bit(MC_LOOP, sk); - inet_sockopt.transparent = inet->transparent; + inet_sockopt.transparent = inet_test_bit(TRANSPARENT, sk); inet_sockopt.mc_all = inet_test_bit(MC_ALL, sk); inet_sockopt.nodefrag = inet->nodefrag; inet_sockopt.bind_address_no_port = inet->bind_address_no_port; diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 2c1b245dba8e..dd37a5bf6881 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -203,7 +203,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, tw->tw_reuseport = sk->sk_reuseport; tw->tw_hash = sk->sk_hash; tw->tw_ipv6only = 0; - tw->tw_transparent = inet->transparent; + tw->tw_transparent = inet_test_bit(TRANSPARENT, sk); tw->tw_prot = sk->sk_prot_creator; atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie)); twsk_net_set(tw, sock_net(sk)); diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 2f27c30a4ecc..3f5323a230b3 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1005,7 +1005,16 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname, return -EINVAL; inet_assign_bit(MC_ALL, sk, val); return 0; - + case IP_TRANSPARENT: + if (!!val && !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && + !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { + err = -EPERM; + break; + } + if (optlen < 1) + goto e_inval; + inet_assign_bit(TRANSPARENT, sk, val); + return 0; } err = 0; @@ -1319,17 +1328,6 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname, err = xfrm_user_policy(sk, optname, optval, optlen); break; - case IP_TRANSPARENT: - if (!!val && !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && - !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { - err = -EPERM; - break; - } - if (optlen < 1) - goto e_inval; - inet->transparent = !!val; - break; - case IP_MINTTL: if (optlen < 1) goto e_inval; @@ -1585,6 +1583,9 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, case IP_MULTICAST_ALL: val = inet_test_bit(MC_ALL, sk); goto copyval; + case IP_TRANSPARENT: + val = inet_test_bit(TRANSPARENT, sk); + goto copyval; } if (needs_rtnl) @@ -1735,9 +1736,6 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, len -= msg.msg_controllen; return copy_to_sockptr(optlen, &len, sizeof(int)); } - case IP_TRANSPARENT: - val = inet->transparent; - break; case IP_MINTTL: val = inet->min_ttl; break; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d34d52fdfdb1..06fe1cf645d5 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -7000,7 +7000,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; tcp_openreq_init(req, &tmp_opt, skb, sk); - inet_rsk(req)->no_srccheck = inet_sk(sk)->transparent; + inet_rsk(req)->no_srccheck = inet_test_bit(TRANSPARENT, sk); /* Note: tcp_v6_init_req() might override ir_iif for link locals */ inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 13ee12983c42..b98d476f1594 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -289,9 +289,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) if (tw) { struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); 
const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); - struct inet_sock *inet = inet_sk(sk); - tw->tw_transparent = inet->transparent; + tw->tw_transparent = inet_test_bit(TRANSPARENT, sk); tw->tw_mark = sk->sk_mark; tw->tw_priority = sk->sk_priority; tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 3eb38436f8d4..eb334122512c 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -633,7 +633,7 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, if (optlen < sizeof(int)) goto e_inval; /* we don't have a separate transparent bit for IPV6 we use the one in the IPv4 socket */ - inet_sk(sk)->transparent = valbool; + inet_assign_bit(TRANSPARENT, sk, valbool); retv = 0; break; @@ -1330,7 +1330,7 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname, } case IPV6_TRANSPARENT: - val = inet_sk(sk)->transparent; + val = inet_test_bit(TRANSPARENT, sk); break; case IPV6_FREEBIND: diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index ffbe2f5f5b44..8260202c0066 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -416,7 +416,8 @@ static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname, sk->sk_ipv6only = ssk->sk_ipv6only; break; case IPV6_TRANSPARENT: - inet_sk(sk)->transparent = inet_sk(ssk)->transparent; + inet_assign_bit(TRANSPARENT, sk, + inet_test_bit(TRANSPARENT, ssk)); break; case IPV6_FREEBIND: inet_assign_bit(FREEBIND, sk, @@ -685,7 +686,6 @@ static int mptcp_setsockopt_sol_ip_set_transparent(struct mptcp_sock *msk, int o sockptr_t optval, unsigned int optlen) { struct sock *sk = (struct sock *)msk; - struct inet_sock *issk; struct sock *ssk; int err; @@ -701,14 +701,13 @@ static int mptcp_setsockopt_sol_ip_set_transparent(struct mptcp_sock *msk, int o return PTR_ERR(ssk); } - issk = inet_sk(ssk); - switch (optname) { case IP_FREEBIND: inet_assign_bit(FREEBIND, ssk, inet_test_bit(FREEBIND, sk)); break; case IP_TRANSPARENT: - issk->transparent = inet_sk(sk)->transparent; + inet_assign_bit(TRANSPARENT, ssk, + inet_test_bit(TRANSPARENT, sk)); break; default: release_sock(sk); @@ -1441,7 +1440,7 @@ static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk) __tcp_sock_set_cork(ssk, !!msk->cork); __tcp_sock_set_nodelay(ssk, !!msk->nodelay); - inet_sk(ssk)->transparent = inet_sk(sk)->transparent; + inet_assign_bit(TRANSPARENT, ssk, inet_test_bit(TRANSPARENT, sk)); inet_assign_bit(FREEBIND, ssk, inet_test_bit(FREEBIND, sk)); } -- cgit v1.2.3 From b1c0356a58577ce0a583e3044bf801877fc0b5de Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 16 Aug 2023 08:15:42 +0000 Subject: inet: move inet->is_icsk to inet->inet_flags We move single bit fields to inet->inet_flags to avoid races. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Reviewed-by: Simon Horman Signed-off-by: David S. 
Miller --- include/net/inet_connection_sock.h | 4 ++-- include/net/inet_sock.h | 5 ++--- net/ipv4/af_inet.c | 2 +- net/ipv4/cipso_ipv4.c | 4 ++-- net/ipv4/inet_diag.c | 2 +- net/ipv4/ip_sockglue.c | 4 ++-- net/ipv6/af_inet6.c | 2 +- net/ipv6/ipv6_sockglue.c | 4 ++-- 8 files changed, 13 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index be3c858a2ebb..5d2fcc137b88 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -342,9 +342,9 @@ static inline bool inet_csk_in_pingpong_mode(struct sock *sk) return inet_csk(sk)->icsk_ack.pingpong >= TCP_PINGPONG_THRESH; } -static inline bool inet_csk_has_ulp(struct sock *sk) +static inline bool inet_csk_has_ulp(const struct sock *sk) { - return inet_sk(sk)->is_icsk && !!inet_csk(sk)->icsk_ulp_ops; + return inet_test_bit(IS_ICSK, sk) && !!inet_csk(sk)->icsk_ulp_ops; } #endif /* _INET_CONNECTION_SOCK_H */ diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index cefd9a60dc6d..38f7fc1c4dac 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -201,7 +201,6 @@ struct rtable; * @inet_id - ID counter for DF pkts * @tos - TOS * @mc_ttl - Multicasting TTL - * @is_icsk - is this an inet_connection_sock? * @uc_index - Unicast outgoing device index * @mc_index - Multicast device index * @mc_list - Group array @@ -230,8 +229,7 @@ struct inet_sock { __u8 min_ttl; __u8 mc_ttl; __u8 pmtudisc; - __u8 is_icsk:1, - nodefrag:1; + __u8 nodefrag:1; __u8 bind_address_no_port:1, defer_connect:1; /* Indicates that fastopen_connect is set * and cookie exists so we defer connect @@ -271,6 +269,7 @@ enum { INET_FLAGS_MC_LOOP = 13, INET_FLAGS_MC_ALL = 14, INET_FLAGS_TRANSPARENT = 15, + INET_FLAGS_IS_ICSK = 16, }; /* cmsg flags for inet */ diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 47c9cdd7687f..0f46b71f8a98 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -331,7 +331,7 @@ lookup_protocol: sk->sk_reuse = SK_CAN_REUSE; inet = inet_sk(sk); - inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0; + inet_assign_bit(IS_ICSK, sk, INET_PROTOSW_ICSK & answer_flags); inet->nodefrag = 0; diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 79ae7204e8ed..d048aa833293 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -1881,7 +1881,7 @@ int cipso_v4_sock_setattr(struct sock *sk, old = rcu_dereference_protected(sk_inet->inet_opt, lockdep_sock_is_held(sk)); - if (sk_inet->is_icsk) { + if (inet_test_bit(IS_ICSK, sk)) { sk_conn = inet_csk(sk); if (old) sk_conn->icsk_ext_hdr_len -= old->opt.optlen; @@ -2051,7 +2051,7 @@ void cipso_v4_sock_delattr(struct sock *sk) sk_inet = inet_sk(sk); hdr_delta = cipso_v4_delopt(&sk_inet->inet_opt); - if (sk_inet->is_icsk && hdr_delta > 0) { + if (inet_test_bit(IS_ICSK, sk) && hdr_delta > 0) { struct inet_connection_sock *sk_conn = inet_csk(sk); sk_conn->icsk_ext_hdr_len -= hdr_delta; sk_conn->icsk_sync_mss(sk, sk_conn->icsk_pmtu_cookie); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 45fefd2f31fd..ada198fc1a92 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -183,7 +183,7 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, memset(&inet_sockopt, 0, sizeof(inet_sockopt)); inet_sockopt.recverr = inet_test_bit(RECVERR, sk); - inet_sockopt.is_icsk = inet->is_icsk; + inet_sockopt.is_icsk = inet_test_bit(IS_ICSK, sk); inet_sockopt.freebind = inet_test_bit(FREEBIND, sk); inet_sockopt.hdrincl = inet_test_bit(HDRINCL, 
sk); inet_sockopt.mc_loop = inet_test_bit(MC_LOOP, sk); diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 3f5323a230b3..dac471ed067b 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1034,7 +1034,7 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname, break; old = rcu_dereference_protected(inet->inet_opt, lockdep_sock_is_held(sk)); - if (inet->is_icsk) { + if (inet_test_bit(IS_ICSK, sk)) { struct inet_connection_sock *icsk = inet_csk(sk); #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == PF_INET || @@ -1209,7 +1209,7 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname, struct ip_mreqn mreq; err = -EPROTO; - if (inet_sk(sk)->is_icsk) + if (inet_test_bit(IS_ICSK, sk)) break; if (optlen < sizeof(struct ip_mreq)) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 8e9d4c1f0e83..d8c56c7aba96 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -200,7 +200,7 @@ lookup_protocol: sk->sk_reuse = SK_CAN_REUSE; inet = inet_sk(sk); - inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0; + inet_assign_bit(IS_ICSK, sk, INET_PROTOSW_ICSK & answer_flags); if (SOCK_RAW == sock->type) { inet->inet_num = protocol; diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index eb334122512c..d19577a94bcc 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -102,7 +102,7 @@ int ip6_ra_control(struct sock *sk, int sel) struct ipv6_txoptions *ipv6_update_options(struct sock *sk, struct ipv6_txoptions *opt) { - if (inet_sk(sk)->is_icsk) { + if (inet_test_bit(IS_ICSK, sk)) { if (opt && !((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) && inet_sk(sk)->inet_daddr != LOOPBACK4_IPV6) { @@ -831,7 +831,7 @@ done: goto e_inval; retv = -EPROTO; - if (inet_sk(sk)->is_icsk) + if (inet_test_bit(IS_ICSK, sk)) break; retv = -EFAULT; -- cgit v1.2.3 From f04b8d3478a3a63f17d8dc0dc6da16a828b48df0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 16 Aug 2023 08:15:43 +0000 Subject: inet: move inet->nodefrag to inet->inet_flags IP_NODEFRAG socket option can now be set/read without locking the socket. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Reviewed-by: Simon Horman Signed-off-by: David S. 
Miller --- include/net/inet_sock.h | 2 +- net/ipv4/af_inet.c | 2 +- net/ipv4/inet_diag.c | 2 +- net/ipv4/ip_sockglue.c | 18 ++++++++---------- net/ipv4/netfilter/nf_defrag_ipv4.c | 2 +- net/netfilter/ipvs/ip_vs_core.c | 4 ++-- 6 files changed, 14 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 38f7fc1c4dac..0e6e1b017efb 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -229,7 +229,6 @@ struct inet_sock { __u8 min_ttl; __u8 mc_ttl; __u8 pmtudisc; - __u8 nodefrag:1; __u8 bind_address_no_port:1, defer_connect:1; /* Indicates that fastopen_connect is set * and cookie exists so we defer connect @@ -270,6 +269,7 @@ enum { INET_FLAGS_MC_ALL = 14, INET_FLAGS_TRANSPARENT = 15, INET_FLAGS_IS_ICSK = 16, + INET_FLAGS_NODEFRAG = 17, }; /* cmsg flags for inet */ diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 0f46b71f8a98..2fb9948b5230 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -333,7 +333,7 @@ lookup_protocol: inet = inet_sk(sk); inet_assign_bit(IS_ICSK, sk, INET_PROTOSW_ICSK & answer_flags); - inet->nodefrag = 0; + inet_clear_bit(NODEFRAG, sk); if (SOCK_RAW == sock->type) { inet->inet_num = protocol; diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index ada198fc1a92..39606caad484 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -189,7 +189,7 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, inet_sockopt.mc_loop = inet_test_bit(MC_LOOP, sk); inet_sockopt.transparent = inet_test_bit(TRANSPARENT, sk); inet_sockopt.mc_all = inet_test_bit(MC_ALL, sk); - inet_sockopt.nodefrag = inet->nodefrag; + inet_sockopt.nodefrag = inet_test_bit(NODEFRAG, sk); inet_sockopt.bind_address_no_port = inet->bind_address_no_port; inet_sockopt.recverr_rfc4884 = inet_test_bit(RECVERR_RFC4884, sk); inet_sockopt.defer_connect = inet->defer_connect; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index dac471ed067b..ec946c13ea20 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1015,6 +1015,11 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname, goto e_inval; inet_assign_bit(TRANSPARENT, sk, val); return 0; + case IP_NODEFRAG: + if (sk->sk_type != SOCK_RAW) + return -ENOPROTOOPT; + inet_assign_bit(NODEFRAG, sk, val); + return 0; } err = 0; @@ -1079,13 +1084,6 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname, goto e_inval; inet->uc_ttl = val; break; - case IP_NODEFRAG: - if (sk->sk_type != SOCK_RAW) { - err = -ENOPROTOOPT; - break; - } - inet->nodefrag = val ? 1 : 0; - break; case IP_BIND_ADDRESS_NO_PORT: inet->bind_address_no_port = val ? 
1 : 0; break; @@ -1586,6 +1584,9 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, case IP_TRANSPARENT: val = inet_test_bit(TRANSPARENT, sk); goto copyval; + case IP_NODEFRAG: + val = inet_test_bit(NODEFRAG, sk); + goto copyval; } if (needs_rtnl) @@ -1633,9 +1634,6 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, inet->uc_ttl); break; } - case IP_NODEFRAG: - val = inet->nodefrag; - break; case IP_BIND_ADDRESS_NO_PORT: val = inet->bind_address_no_port; break; diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index a9ba7de092c4..265b39bc435b 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -66,7 +66,7 @@ static unsigned int ipv4_conntrack_defrag(void *priv, struct sock *sk = skb->sk; if (sk && sk_fullsock(sk) && (sk->sk_family == PF_INET) && - inet_sk(sk)->nodefrag) + inet_test_bit(NODEFRAG, sk)) return NF_ACCEPT; #if IS_ENABLED(CONFIG_NF_CONNTRACK) diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index cb83ca506c5c..3230506ae3ff 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -1346,7 +1346,7 @@ ip_vs_out_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *stat if (unlikely(sk && hooknum == NF_INET_LOCAL_OUT && af == AF_INET)) { - if (sk->sk_family == PF_INET && inet_sk(sk)->nodefrag) + if (sk->sk_family == PF_INET && inet_test_bit(NODEFRAG, sk)) return NF_ACCEPT; } @@ -1946,7 +1946,7 @@ ip_vs_in_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state if (unlikely(sk && hooknum == NF_INET_LOCAL_OUT && af == AF_INET)) { - if (sk->sk_family == PF_INET && inet_sk(sk)->nodefrag) + if (sk->sk_family == PF_INET && inet_test_bit(NODEFRAG, sk)) return NF_ACCEPT; } -- cgit v1.2.3 From ca571e2eb7eb6202aa367e01c811aab023272f38 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 16 Aug 2023 08:15:44 +0000 Subject: inet: move inet->bind_address_no_port to inet->inet_flags IP_BIND_ADDRESS_NO_PORT socket option can now be set/read without locking the socket. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/net/inet_sock.h | 4 ++-- net/ipv4/af_inet.c | 2 +- net/ipv4/inet_diag.c | 2 +- net/ipv4/ip_sockglue.c | 12 ++++++------ net/ipv6/af_inet6.c | 2 +- 5 files changed, 11 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 0e6e1b017efb..5eca2e70cbb2 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -229,8 +229,7 @@ struct inet_sock { __u8 min_ttl; __u8 mc_ttl; __u8 pmtudisc; - __u8 bind_address_no_port:1, - defer_connect:1; /* Indicates that fastopen_connect is set + __u8 defer_connect:1; /* Indicates that fastopen_connect is set * and cookie exists so we defer connect * until first data frame is written */ @@ -270,6 +269,7 @@ enum { INET_FLAGS_TRANSPARENT = 15, INET_FLAGS_IS_ICSK = 16, INET_FLAGS_NODEFRAG = 17, + INET_FLAGS_BIND_ADDRESS_NO_PORT = 18, }; /* cmsg flags for inet */ diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 2fb9948b5230..26e7cd9cb059 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -529,7 +529,7 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, inet->inet_saddr = 0; /* Use device */ /* Make sure we are allowed to bind here. 
*/ - if (snum || !(inet->bind_address_no_port || + if (snum || !(inet_test_bit(BIND_ADDRESS_NO_PORT, sk) || (flags & BIND_FORCE_ADDRESS_NO_PORT))) { err = sk->sk_prot->get_port(sk, snum); if (err) { diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 39606caad484..128966dea554 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -190,7 +190,7 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, inet_sockopt.transparent = inet_test_bit(TRANSPARENT, sk); inet_sockopt.mc_all = inet_test_bit(MC_ALL, sk); inet_sockopt.nodefrag = inet_test_bit(NODEFRAG, sk); - inet_sockopt.bind_address_no_port = inet->bind_address_no_port; + inet_sockopt.bind_address_no_port = inet_test_bit(BIND_ADDRESS_NO_PORT, sk); inet_sockopt.recverr_rfc4884 = inet_test_bit(RECVERR_RFC4884, sk); inet_sockopt.defer_connect = inet->defer_connect; if (nla_put(skb, INET_DIAG_SOCKOPT, sizeof(inet_sockopt), diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index ec946c13ea20..cfa65a0b0900 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1020,6 +1020,9 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname, return -ENOPROTOOPT; inet_assign_bit(NODEFRAG, sk, val); return 0; + case IP_BIND_ADDRESS_NO_PORT: + inet_assign_bit(BIND_ADDRESS_NO_PORT, sk, val); + return 0; } err = 0; @@ -1084,9 +1087,6 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname, goto e_inval; inet->uc_ttl = val; break; - case IP_BIND_ADDRESS_NO_PORT: - inet->bind_address_no_port = val ? 1 : 0; - break; case IP_MTU_DISCOVER: if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_OMIT) goto e_inval; @@ -1587,6 +1587,9 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, case IP_NODEFRAG: val = inet_test_bit(NODEFRAG, sk); goto copyval; + case IP_BIND_ADDRESS_NO_PORT: + val = inet_test_bit(BIND_ADDRESS_NO_PORT, sk); + goto copyval; } if (needs_rtnl) @@ -1634,9 +1637,6 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, inet->uc_ttl); break; } - case IP_BIND_ADDRESS_NO_PORT: - val = inet->bind_address_no_port; - break; case IP_MTU_DISCOVER: val = inet->pmtudisc; break; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index d8c56c7aba96..368824fe9719 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -399,7 +399,7 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, sk->sk_ipv6only = 1; /* Make sure we are allowed to bind here. */ - if (snum || !(inet->bind_address_no_port || + if (snum || !(inet_test_bit(BIND_ADDRESS_NO_PORT, sk) || (flags & BIND_FORCE_ADDRESS_NO_PORT))) { err = sk->sk_prot->get_port(sk, snum); if (err) { -- cgit v1.2.3 From 08e39c0dfa29f233f5a621f7d274b793a080c769 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 16 Aug 2023 08:15:45 +0000 Subject: inet: move inet->defer_connect to inet->inet_flags Make room in struct inet_sock by removing this bit field, using one available bit in inet_flags instead. Also move local_port_range to fill the resulting hole, saving 8 bytes on 64bit arches. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Reviewed-by: Simon Horman Reviewed-by: Matthieu Baerts Signed-off-by: David S. 
Miller --- include/net/inet_sock.h | 10 ++++------ net/ipv4/af_inet.c | 4 ++-- net/ipv4/inet_diag.c | 2 +- net/ipv4/tcp.c | 12 +++++++----- net/ipv4/tcp_fastopen.c | 2 +- net/mptcp/protocol.c | 10 ++++++---- 6 files changed, 21 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 5eca2e70cbb2..acbb93d7607a 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -229,21 +229,18 @@ struct inet_sock { __u8 min_ttl; __u8 mc_ttl; __u8 pmtudisc; - __u8 defer_connect:1; /* Indicates that fastopen_connect is set - * and cookie exists so we defer connect - * until first data frame is written - */ __u8 rcv_tos; __u8 convert_csum; int uc_index; int mc_index; __be32 mc_addr; - struct ip_mc_socklist __rcu *mc_list; - struct inet_cork_full cork; struct { __u16 lo; __u16 hi; } local_port_range; + + struct ip_mc_socklist __rcu *mc_list; + struct inet_cork_full cork; }; #define IPCORK_OPT 1 /* ip-options has been held in ipcork.opt */ @@ -270,6 +267,7 @@ enum { INET_FLAGS_IS_ICSK = 16, INET_FLAGS_NODEFRAG = 17, INET_FLAGS_BIND_ADDRESS_NO_PORT = 18, + INET_FLAGS_DEFER_CONNECT = 19, }; /* cmsg flags for inet */ diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 26e7cd9cb059..e07ee60625d9 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -656,7 +656,7 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, err = -EISCONN; goto out; case SS_CONNECTING: - if (inet_sk(sk)->defer_connect) + if (inet_test_bit(DEFER_CONNECT, sk)) err = is_sendmsg ? -EINPROGRESS : -EISCONN; else err = -EALREADY; @@ -679,7 +679,7 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, sock->state = SS_CONNECTING; - if (!err && inet_sk(sk)->defer_connect) + if (!err && inet_test_bit(DEFER_CONNECT, sk)) goto out; /* Just entered SS_CONNECTING state; the only diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 128966dea554..e13a84433413 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -192,7 +192,7 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, inet_sockopt.nodefrag = inet_test_bit(NODEFRAG, sk); inet_sockopt.bind_address_no_port = inet_test_bit(BIND_ADDRESS_NO_PORT, sk); inet_sockopt.recverr_rfc4884 = inet_test_bit(RECVERR_RFC4884, sk); - inet_sockopt.defer_connect = inet->defer_connect; + inet_sockopt.defer_connect = inet_test_bit(DEFER_CONNECT, sk); if (nla_put(skb, INET_DIAG_SOCKOPT, sizeof(inet_sockopt), &inet_sockopt)) goto errout; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4fbc7ff8c53c..cee1e548660c 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -583,7 +583,8 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) if (urg_data & TCP_URG_VALID) mask |= EPOLLPRI; - } else if (state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) { + } else if (state == TCP_SYN_SENT && + inet_test_bit(DEFER_CONNECT, sk)) { /* Active TCP fastopen socket with defer_connect * Return EPOLLOUT so application can call write() * in order for kernel to generate SYN+data @@ -1007,7 +1008,7 @@ int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied, tp->fastopen_req->size = size; tp->fastopen_req->uarg = uarg; - if (inet->defer_connect) { + if (inet_test_bit(DEFER_CONNECT, sk)) { err = tcp_connect(sk); /* Same failure procedure as in tcp_v4/6_connect */ if (err) { @@ -1025,7 +1026,7 @@ int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied, if (tp->fastopen_req) { *copied = tp->fastopen_req->copied; 
tcp_free_fastopen_req(tp); - inet->defer_connect = 0; + inet_clear_bit(DEFER_CONNECT, sk); } return err; } @@ -1066,7 +1067,8 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) zc = MSG_SPLICE_PAGES; } - if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect) && + if (unlikely(flags & MSG_FASTOPEN || + inet_test_bit(DEFER_CONNECT, sk)) && !tp->repair) { err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size, uarg); if (err == -EINPROGRESS && copied_syn > 0) @@ -3088,7 +3090,7 @@ int tcp_disconnect(struct sock *sk, int flags) /* Clean up fastopen related fields */ tcp_free_fastopen_req(tp); - inet->defer_connect = 0; + inet_clear_bit(DEFER_CONNECT, sk); tp->fastopen_client_fail = 0; WARN_ON(inet->inet_num && !icsk->icsk_bind_hash); diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 85e4953f1182..8ed54e7334a9 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -451,7 +451,7 @@ bool tcp_fastopen_defer_connect(struct sock *sk, int *err) if (tp->fastopen_connect && !tp->fastopen_req) { if (tcp_fastopen_cookie_check(sk, &mss, &cookie)) { - inet_sk(sk)->defer_connect = 1; + inet_set_bit(DEFER_CONNECT, sk); return true; } diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 6ea0a1da8068..6019a3cf1625 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1686,7 +1686,7 @@ static int mptcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, if (!mptcp_disconnect(sk, 0)) sk->sk_socket->state = SS_UNCONNECTED; } - inet_sk(sk)->defer_connect = 0; + inet_clear_bit(DEFER_CONNECT, sk); return ret; } @@ -1704,7 +1704,8 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) lock_sock(sk); - if (unlikely(inet_sk(sk)->defer_connect || msg->msg_flags & MSG_FASTOPEN)) { + if (unlikely(inet_test_bit(DEFER_CONNECT, sk) || + msg->msg_flags & MSG_FASTOPEN)) { int copied_syn = 0; ret = mptcp_sendmsg_fastopen(sk, msg, len, &copied_syn); @@ -3601,7 +3602,7 @@ static int mptcp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (err < 0) goto out; - inet_sk(sk)->defer_connect = inet_sk(ssk)->defer_connect; + inet_assign_bit(DEFER_CONNECT, sk, inet_test_bit(DEFER_CONNECT, ssk)); out: if (!msk->fastopening) @@ -3827,7 +3828,8 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, mask |= EPOLLOUT | EPOLLWRNORM; else mask |= mptcp_check_writeable(msk); - } else if (state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) { + } else if (state == TCP_SYN_SENT && + inet_test_bit(DEFER_CONNECT, sk)) { /* cf tcp_poll() note about TFO */ mask |= EPOLLOUT | EPOLLWRNORM; } -- cgit v1.2.3 From 10f42426e5bcea728d7fae96609b29b4fb1f7518 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 16 Aug 2023 08:15:46 +0000 Subject: inet: implement lockless IP_TTL ip_select_ttl() is racy, because it reads inet->uc_ttl without proper locking. Add READ_ONCE()/WRITE_ONCE() annotations while allowing IP_TTL socket option to be set/read without holding the socket lock. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Reviewed-by: Simon Horman Signed-off-by: David S. 
Miller --- net/ipv4/ip_output.c | 2 +- net/ipv4/ip_sockglue.c | 27 ++++++++++++--------------- 2 files changed, 13 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 8f396eada1b6..ce6257860a40 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -133,7 +133,7 @@ EXPORT_SYMBOL_GPL(ip_local_out); static inline int ip_select_ttl(const struct inet_sock *inet, const struct dst_entry *dst) { - int ttl = inet->uc_ttl; + int ttl = READ_ONCE(inet->uc_ttl); if (ttl < 0) ttl = ip4_dst_hoplimit(dst); diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index cfa65a0b0900..dbb2d2342ebf 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1023,6 +1023,13 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname, case IP_BIND_ADDRESS_NO_PORT: inet_assign_bit(BIND_ADDRESS_NO_PORT, sk, val); return 0; + case IP_TTL: + if (optlen < 1) + return -EINVAL; + if (val != -1 && (val < 1 || val > 255)) + return -EINVAL; + WRITE_ONCE(inet->uc_ttl, val); + return 0; } err = 0; @@ -1080,13 +1087,6 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname, case IP_TOS: /* This sets both TOS and Precedence */ __ip_sock_set_tos(sk, val); break; - case IP_TTL: - if (optlen < 1) - goto e_inval; - if (val != -1 && (val < 1 || val > 255)) - goto e_inval; - inet->uc_ttl = val; - break; case IP_MTU_DISCOVER: if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_OMIT) goto e_inval; @@ -1590,6 +1590,11 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, case IP_BIND_ADDRESS_NO_PORT: val = inet_test_bit(BIND_ADDRESS_NO_PORT, sk); goto copyval; + case IP_TTL: + val = READ_ONCE(inet->uc_ttl); + if (val < 0) + val = READ_ONCE(sock_net(sk)->ipv4.sysctl_ip_default_ttl); + goto copyval; } if (needs_rtnl) @@ -1629,14 +1634,6 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, case IP_TOS: val = inet->tos; break; - case IP_TTL: - { - struct net *net = sock_net(sk); - val = (inet->uc_ttl == -1 ? - READ_ONCE(net->ipv4.sysctl_ip_default_ttl) : - inet->uc_ttl); - break; - } case IP_MTU_DISCOVER: val = inet->pmtudisc; break; -- cgit v1.2.3 From 12af73269fd942c98ff9999a2ce0c04165aae136 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 16 Aug 2023 08:15:47 +0000 Subject: inet: implement lockless IP_MINTTL inet->min_ttl is already read with READ_ONCE(). Implementing IP_MINTTL socket option set/read without holding the socket lock is easy. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Reviewed-by: Simon Horman Signed-off-by: David S. 
Miller --- net/ipv4/ip_sockglue.c | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index dbb2d2342ebf..61b2e7bc7031 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1030,6 +1030,17 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname, return -EINVAL; WRITE_ONCE(inet->uc_ttl, val); return 0; + case IP_MINTTL: + if (optlen < 1) + return -EINVAL; + if (val < 0 || val > 255) + return -EINVAL; + + if (val) + static_branch_enable(&ip4_min_ttl); + + WRITE_ONCE(inet->min_ttl, val); + return 0; } err = 0; @@ -1326,21 +1337,6 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname, err = xfrm_user_policy(sk, optname, optval, optlen); break; - case IP_MINTTL: - if (optlen < 1) - goto e_inval; - if (val < 0 || val > 255) - goto e_inval; - - if (val) - static_branch_enable(&ip4_min_ttl); - - /* tcp_v4_err() and tcp_v4_rcv() might read min_ttl - * while we are changint it. - */ - WRITE_ONCE(inet->min_ttl, val); - break; - case IP_LOCAL_PORT_RANGE: { const __u16 lo = val; @@ -1595,6 +1591,9 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, if (val < 0) val = READ_ONCE(sock_net(sk)->ipv4.sysctl_ip_default_ttl); goto copyval; + case IP_MINTTL: + val = READ_ONCE(inet->min_ttl); + goto copyval; } if (needs_rtnl) @@ -1731,9 +1730,6 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, len -= msg.msg_controllen; return copy_to_sockptr(optlen, &len, sizeof(int)); } - case IP_MINTTL: - val = inet->min_ttl; - break; case IP_LOCAL_PORT_RANGE: val = inet->local_port_range.hi << 16 | inet->local_port_range.lo; break; -- cgit v1.2.3 From 3dec89b14d37ee635e772636dad3f09f78f1ab87 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Tue, 15 Aug 2023 11:07:05 -0700 Subject: net/ipv6: Remove expired routes with a separated list of routes. FIB6 GC walks trees of fib6_tables to remove expired routes. Walking a tree can be expensive if the number of routes in a table is big, even if most of them are permanent. Checking routes in a separated list of routes having expiration will avoid this potential issue. Signed-off-by: Kui-Feng Lee Reviewed-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/ip6_fib.h | 64 ++++++++++++++++++++++++++++++++++++++++----------- net/ipv6/ip6_fib.c | 55 ++++++++++++++++++++++++++++++++++++++----- net/ipv6/route.c | 6 ++--- 3 files changed, 103 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 05e6f756feaf..c9ff23cf313e 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -179,6 +179,9 @@ struct fib6_info { refcount_t fib6_ref; unsigned long expires; + + struct hlist_node gc_link; + struct dst_metrics *fib6_metrics; #define fib6_pmtu fib6_metrics->metrics[RTAX_MTU-1] @@ -247,19 +250,6 @@ static inline bool fib6_requires_src(const struct fib6_info *rt) return rt->fib6_src.plen > 0; } -static inline void fib6_clean_expires(struct fib6_info *f6i) -{ - f6i->fib6_flags &= ~RTF_EXPIRES; - f6i->expires = 0; -} - -static inline void fib6_set_expires(struct fib6_info *f6i, - unsigned long expires) -{ - f6i->expires = expires; - f6i->fib6_flags |= RTF_EXPIRES; -} - static inline bool fib6_check_expired(const struct fib6_info *f6i) { if (f6i->fib6_flags & RTF_EXPIRES) @@ -267,6 +257,11 @@ static inline bool fib6_check_expired(const struct fib6_info *f6i) return false; } +static inline bool fib6_has_expires(const struct fib6_info *f6i) +{ + return f6i->fib6_flags & RTF_EXPIRES; +} + /* Function to safely get fn->fn_sernum for passed in rt * and store result in passed in cookie. * Return true if we can get cookie safely @@ -388,6 +383,7 @@ struct fib6_table { struct inet_peer_base tb6_peers; unsigned int flags; unsigned int fib_seq; + struct hlist_head tb6_gc_hlist; /* GC candidates */ #define RT6_TABLE_HAS_DFLT_ROUTER BIT(0) }; @@ -504,6 +500,48 @@ void fib6_gc_cleanup(void); int fib6_init(void); +/* fib6_info must be locked by the caller, and fib6_info->fib6_table can be + * NULL. + */ +static inline void fib6_set_expires_locked(struct fib6_info *f6i, + unsigned long expires) +{ + struct fib6_table *tb6; + + tb6 = f6i->fib6_table; + f6i->expires = expires; + if (tb6 && !fib6_has_expires(f6i)) + hlist_add_head(&f6i->gc_link, &tb6->tb6_gc_hlist); + f6i->fib6_flags |= RTF_EXPIRES; +} + +/* fib6_info must be locked by the caller, and fib6_info->fib6_table can be + * NULL. If fib6_table is NULL, the fib6_info will no be inserted into the + * list of GC candidates until it is inserted into a table. 
+ */ +static inline void fib6_set_expires(struct fib6_info *f6i, + unsigned long expires) +{ + spin_lock_bh(&f6i->fib6_table->tb6_lock); + fib6_set_expires_locked(f6i, expires); + spin_unlock_bh(&f6i->fib6_table->tb6_lock); +} + +static inline void fib6_clean_expires_locked(struct fib6_info *f6i) +{ + if (fib6_has_expires(f6i)) + hlist_del_init(&f6i->gc_link); + f6i->fib6_flags &= ~RTF_EXPIRES; + f6i->expires = 0; +} + +static inline void fib6_clean_expires(struct fib6_info *f6i) +{ + spin_lock_bh(&f6i->fib6_table->tb6_lock); + fib6_clean_expires_locked(f6i); + spin_unlock_bh(&f6i->fib6_table->tb6_lock); +} + struct ipv6_route_iter { struct seq_net_private p; struct fib6_walker w; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index bac768d36cc1..28b01a068412 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -160,6 +160,8 @@ struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh) INIT_LIST_HEAD(&f6i->fib6_siblings); refcount_set(&f6i->fib6_ref, 1); + INIT_HLIST_NODE(&f6i->gc_link); + return f6i; } @@ -246,6 +248,7 @@ static struct fib6_table *fib6_alloc_table(struct net *net, u32 id) net->ipv6.fib6_null_entry); table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&table->tb6_peers); + INIT_HLIST_HEAD(&table->tb6_gc_hlist); } return table; @@ -1057,6 +1060,8 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, lockdep_is_held(&table->tb6_lock)); } } + + fib6_clean_expires_locked(rt); } /* @@ -1118,9 +1123,10 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, if (!(iter->fib6_flags & RTF_EXPIRES)) return -EEXIST; if (!(rt->fib6_flags & RTF_EXPIRES)) - fib6_clean_expires(iter); + fib6_clean_expires_locked(iter); else - fib6_set_expires(iter, rt->expires); + fib6_set_expires_locked(iter, + rt->expires); if (rt->fib6_pmtu) fib6_metric_set(iter, RTAX_MTU, @@ -1479,6 +1485,10 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, if (rt->nh) list_add(&rt->nh_list, &rt->nh->f6i_list); __fib6_update_sernum_upto_root(rt, fib6_new_sernum(info->nl_net)); + + if (fib6_has_expires(rt)) + hlist_add_head(&rt->gc_link, &table->tb6_gc_hlist); + fib6_start_gc(info->nl_net, rt); } @@ -2285,9 +2295,8 @@ static void fib6_flush_trees(struct net *net) * Garbage collection */ -static int fib6_age(struct fib6_info *rt, void *arg) +static int fib6_age(struct fib6_info *rt, struct fib6_gc_args *gc_args) { - struct fib6_gc_args *gc_args = arg; unsigned long now = jiffies; /* @@ -2295,7 +2304,7 @@ static int fib6_age(struct fib6_info *rt, void *arg) * Routes are expired even if they are in use. 
*/ - if (rt->fib6_flags & RTF_EXPIRES && rt->expires) { + if (fib6_has_expires(rt) && rt->expires) { if (time_after(now, rt->expires)) { RT6_TRACE("expiring %p\n", rt); return -1; @@ -2312,6 +2321,40 @@ static int fib6_age(struct fib6_info *rt, void *arg) return 0; } +static void fib6_gc_table(struct net *net, + struct fib6_table *tb6, + struct fib6_gc_args *gc_args) +{ + struct fib6_info *rt; + struct hlist_node *n; + struct nl_info info = { + .nl_net = net, + .skip_notify = false, + }; + + hlist_for_each_entry_safe(rt, n, &tb6->tb6_gc_hlist, gc_link) + if (fib6_age(rt, gc_args) == -1) + fib6_del(rt, &info); +} + +static void fib6_gc_all(struct net *net, struct fib6_gc_args *gc_args) +{ + struct fib6_table *table; + struct hlist_head *head; + unsigned int h; + + rcu_read_lock(); + for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { + head = &net->ipv6.fib_table_hash[h]; + hlist_for_each_entry_rcu(table, head, tb6_hlist) { + spin_lock_bh(&table->tb6_lock); + fib6_gc_table(net, table, gc_args); + spin_unlock_bh(&table->tb6_lock); + } + } + rcu_read_unlock(); +} + void fib6_run_gc(unsigned long expires, struct net *net, bool force) { struct fib6_gc_args gc_args; @@ -2327,7 +2370,7 @@ void fib6_run_gc(unsigned long expires, struct net *net, bool force) net->ipv6.sysctl.ip6_rt_gc_interval; gc_args.more = 0; - fib6_clean_all(net, fib6_age, &gc_args); + fib6_gc_all(net, &gc_args); now = jiffies; net->ipv6.ip6_rt_last_gc = now; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 10751df16dab..db10c36f34bb 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3761,10 +3761,10 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, rt->dst_nocount = true; if (cfg->fc_flags & RTF_EXPIRES) - fib6_set_expires(rt, jiffies + - clock_t_to_jiffies(cfg->fc_expires)); + fib6_set_expires_locked(rt, jiffies + + clock_t_to_jiffies(cfg->fc_expires)); else - fib6_clean_expires(rt); + fib6_clean_expires_locked(rt); if (cfg->fc_protocol == RTPROT_UNSPEC) cfg->fc_protocol = RTPROT_BOOT; -- cgit v1.2.3 From 0dd061a6a115f25132989cbd591a25afb2dee086 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Wed, 16 Aug 2023 09:11:56 +0800 Subject: bpf: Add update_socket_protocol hook Add a hook named update_socket_protocol in __sys_socket(), for bpf progs to attach to and update socket protocol. One user case is to force legacy TCP apps to create and use MPTCP sockets instead of TCP ones. Define a fmod_ret set named bpf_mptcp_fmodret_ids, add the hook update_socket_protocol into this set, and register it in bpf_mptcp_kfunc_init(). 
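As an illustration of the MPTCP use case mentioned above (not part of this patch), a minimal fmod_ret program could look roughly like the sketch below; the program name, the spelled-out uapi constants and the boilerplate are assumptions made for the example, only the update_socket_protocol hook and the fmod_ret attach point come from this patch. Note that 'type' may still carry SOCK_NONBLOCK/SOCK_CLOEXEC flags at this point, which the sketch ignores for brevity.

	// SPDX-License-Identifier: GPL-2.0
	/* Hypothetical sketch: create MPTCP sockets for plain TCP stream sockets. */
	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>
	#include <bpf/bpf_tracing.h>

	/* uapi values, repeated here so the sketch is self-contained */
	#define AF_INET		2
	#define AF_INET6	10
	#define SOCK_STREAM	1
	#define IPPROTO_TCP	6
	#define IPPROTO_MPTCP	262

	char _license[] SEC("license") = "GPL";

	SEC("fmod_ret/update_socket_protocol")
	int BPF_PROG(mptcpify, int family, int type, int protocol)
	{
		if ((family == AF_INET || family == AF_INET6) &&
		    type == SOCK_STREAM &&
		    (!protocol || protocol == IPPROTO_TCP))
			return IPPROTO_MPTCP;	/* rewrite the requested protocol */

		return protocol;	/* leave everything else untouched */
	}
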
Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/79 Acked-by: Matthieu Baerts Acked-by: Yonghong Song Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/ac84be00f97072a46f8a72b4e2be46cbb7fa5053.1692147782.git.geliang.tang@suse.com Signed-off-by: Martin KaFai Lau --- net/mptcp/bpf.c | 15 +++++++++++++++ net/socket.c | 26 +++++++++++++++++++++++++- 2 files changed, 40 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mptcp/bpf.c b/net/mptcp/bpf.c index 5a0a84ad94af..8a16672b94e2 100644 --- a/net/mptcp/bpf.c +++ b/net/mptcp/bpf.c @@ -19,3 +19,18 @@ struct mptcp_sock *bpf_mptcp_sock_from_subflow(struct sock *sk) return NULL; } + +BTF_SET8_START(bpf_mptcp_fmodret_ids) +BTF_ID_FLAGS(func, update_socket_protocol) +BTF_SET8_END(bpf_mptcp_fmodret_ids) + +static const struct btf_kfunc_id_set bpf_mptcp_fmodret_set = { + .owner = THIS_MODULE, + .set = &bpf_mptcp_fmodret_ids, +}; + +static int __init bpf_mptcp_kfunc_init(void) +{ + return register_btf_fmodret_id_set(&bpf_mptcp_fmodret_set); +} +late_initcall(bpf_mptcp_kfunc_init); diff --git a/net/socket.c b/net/socket.c index 5d4e37595e9a..fdb5233bf560 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1657,12 +1657,36 @@ struct file *__sys_socket_file(int family, int type, int protocol) return sock_alloc_file(sock, flags, NULL); } +/* A hook for bpf progs to attach to and update socket protocol. + * + * A static noinline declaration here could cause the compiler to + * optimize away the function. A global noinline declaration will + * keep the definition, but may optimize away the callsite. + * Therefore, __weak is needed to ensure that the call is still + * emitted, by telling the compiler that we don't know what the + * function might eventually be. + * + * __diag_* below are needed to dismiss the missing prototype warning. + */ + +__diag_push(); +__diag_ignore_all("-Wmissing-prototypes", + "A fmod_ret entry point for BPF programs"); + +__weak noinline int update_socket_protocol(int family, int type, int protocol) +{ + return protocol; +} + +__diag_pop(); + int __sys_socket(int family, int type, int protocol) { struct socket *sock; int flags; - sock = __sys_socket_create(family, type, protocol); + sock = __sys_socket_create(family, type, + update_socket_protocol(family, type, protocol)); if (IS_ERR(sock)) return PTR_ERR(sock); -- cgit v1.2.3 From 4072d97ddc447ce9dd8f7a39cdf6f92d2031bb01 Mon Sep 17 00:00:00 2001 From: François Michel Date: Tue, 15 Aug 2023 11:23:38 +0200 Subject: netem: add prng attribute to netem_sched_data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add prng attribute to struct netem_sched_data and allows setting the seed of the PRNG through netlink using the new TCA_NETEM_PRNG_SEED attribute. The PRNG attribute is not actually used yet. 
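For intuition (not part of this patch): the point of a user-settable seed is reproducibility; two PRNG states seeded with the same value produce the same sequence, so the loss/corruption pattern applied by netem can be replayed across runs. A minimal kernel-style sketch, assuming only the existing prandom_seed_state()/prandom_u32_state() helpers that the follow-up patches switch netem to; the demo function itself is hypothetical:

	#include <linux/prandom.h>
	#include <linux/bug.h>

	/* hypothetical demo, not part of the tree */
	static void netem_seed_demo(void)
	{
		struct rnd_state a, b;
		u64 seed = 42;	/* example value, e.g. taken from TCA_NETEM_PRNG_SEED */
		int i;

		prandom_seed_state(&a, seed);
		prandom_seed_state(&b, seed);

		for (i = 0; i < 4; i++)
			/* identical seeds produce identical draws */
			WARN_ON(prandom_u32_state(&a) != prandom_u32_state(&b));
	}
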
Signed-off-by: François Michel Reviewed-by: Simon Horman Acked-by: Stephen Hemminger Link: https://lore.kernel.org/r/20230815092348.1449179-2-francois.michel@uclouvain.be Signed-off-by: Jakub Kicinski --- include/uapi/linux/pkt_sched.h | 1 + net/sched/sch_netem.c | 16 ++++++++++++++++ 2 files changed, 17 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 00f6ff0aff1f..3f85ae578056 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -603,6 +603,7 @@ enum { TCA_NETEM_JITTER64, TCA_NETEM_SLOT, TCA_NETEM_SLOT_DIST, + TCA_NETEM_PRNG_SEED, __TCA_NETEM_MAX, }; diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 38d9aa0cd30e..621c6acfd644 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -105,6 +105,11 @@ struct netem_sched_data { u32 rho; } delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor; + struct prng { + u64 seed; + struct rnd_state prng_state; + } prng; + struct disttable *delay_dist; enum { @@ -922,6 +927,7 @@ static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = { [TCA_NETEM_LATENCY64] = { .type = NLA_S64 }, [TCA_NETEM_JITTER64] = { .type = NLA_S64 }, [TCA_NETEM_SLOT] = { .len = sizeof(struct tc_netem_slot) }, + [TCA_NETEM_PRNG_SEED] = { .type = NLA_U64 }, }; static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla, @@ -1040,6 +1046,12 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt, /* capping jitter to the range acceptable by tabledist() */ q->jitter = min_t(s64, abs(q->jitter), INT_MAX); + if (tb[TCA_NETEM_PRNG_SEED]) + q->prng.seed = nla_get_u64(tb[TCA_NETEM_PRNG_SEED]); + else + q->prng.seed = get_random_u64(); + prandom_seed_state(&q->prng.prng_state, q->prng.seed); + unlock: sch_tree_unlock(sch); @@ -1203,6 +1215,10 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) goto nla_put_failure; } + if (nla_put_u64_64bit(skb, TCA_NETEM_PRNG_SEED, q->prng.seed, + TCA_NETEM_PAD)) + goto nla_put_failure; + return nla_nest_end(skb, nla); nla_put_failure: -- cgit v1.2.3 From 9c87b2aeccf174bce220e527d48391439981273b Mon Sep 17 00:00:00 2001 From: François Michel Date: Tue, 15 Aug 2023 11:23:39 +0200 Subject: netem: use a seeded PRNG for generating random losses MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use prandom_u32_state() instead of get_random_u32() to generate the random loss events of netem. The state of the prng is part of the prng attribute of struct netem_sched_data. 
Signed-off-by: François Michel Reviewed-by: Simon Horman Acked-by: Stephen Hemminger Link: https://lore.kernel.org/r/20230815092348.1449179-3-francois.michel@uclouvain.be Signed-off-by: Jakub Kicinski --- net/sched/sch_netem.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 621c6acfd644..8b54b1005a10 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -206,7 +206,7 @@ static u32 get_crandom(struct crndstate *state) static bool loss_4state(struct netem_sched_data *q) { struct clgstate *clg = &q->clg; - u32 rnd = get_random_u32(); + u32 rnd = prandom_u32_state(&q->prng.prng_state); /* * Makes a comparison between rnd and the transition @@ -271,18 +271,19 @@ static bool loss_4state(struct netem_sched_data *q) static bool loss_gilb_ell(struct netem_sched_data *q) { struct clgstate *clg = &q->clg; + struct rnd_state *s = &q->prng.prng_state; switch (clg->state) { case GOOD_STATE: - if (get_random_u32() < clg->a1) + if (prandom_u32_state(s) < clg->a1) clg->state = BAD_STATE; - if (get_random_u32() < clg->a4) + if (prandom_u32_state(s) < clg->a4) return true; break; case BAD_STATE: - if (get_random_u32() < clg->a2) + if (prandom_u32_state(s) < clg->a2) clg->state = GOOD_STATE; - if (get_random_u32() > clg->a3) + if (prandom_u32_state(s) > clg->a3) return true; } -- cgit v1.2.3 From 3cad70bc74ef8471e30a05a90798904ce8f8feb5 Mon Sep 17 00:00:00 2001 From: François Michel Date: Tue, 15 Aug 2023 11:23:40 +0200 Subject: netem: use seeded PRNG for correlated loss events MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use prandom_u32_state() instead of get_random_u32() to generate the correlated loss events of netem. Signed-off-by: François Michel Reviewed-by: Simon Horman Acked-by: Stephen Hemminger Link: https://lore.kernel.org/r/20230815092348.1449179-4-francois.michel@uclouvain.be Signed-off-by: Jakub Kicinski --- net/sched/sch_netem.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 8b54b1005a10..4ad39a4a3cf5 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -184,15 +184,16 @@ static void init_crandom(struct crndstate *state, unsigned long rho) * Next number depends on last value. * rho is scaled to avoid floating point. 
*/ -static u32 get_crandom(struct crndstate *state) +static u32 get_crandom(struct crndstate *state, struct prng *p) { u64 value, rho; unsigned long answer; + struct rnd_state *s = &p->prng_state; if (!state || state->rho == 0) /* no correlation */ - return get_random_u32(); + return prandom_u32_state(s); - value = get_random_u32(); + value = prandom_u32_state(s); rho = (u64)state->rho + 1; answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32; state->last = answer; @@ -295,7 +296,7 @@ static bool loss_event(struct netem_sched_data *q) switch (q->loss_model) { case CLG_RANDOM: /* Random packet drop 0 => none, ~0 => all */ - return q->loss && q->loss >= get_crandom(&q->loss_cor); + return q->loss && q->loss >= get_crandom(&q->loss_cor, &q->prng); case CLG_4_STATES: /* 4state loss model algorithm (used also for GI model) @@ -324,6 +325,7 @@ static bool loss_event(struct netem_sched_data *q) */ static s64 tabledist(s64 mu, s32 sigma, struct crndstate *state, + struct prng *prng, const struct disttable *dist) { s64 x; @@ -333,7 +335,7 @@ static s64 tabledist(s64 mu, s32 sigma, if (sigma == 0) return mu; - rnd = get_crandom(state); + rnd = get_crandom(state, prng); /* default uniform distribution */ if (dist == NULL) @@ -455,7 +457,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, skb->prev = NULL; /* Random duplication */ - if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor)) + if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor, &q->prng)) ++count; /* Drop packet? */ @@ -498,7 +500,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, * If packet is going to be hardware checksummed, then * do it now in software before we mangle it. */ - if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) { + if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor, &q->prng)) { if (skb_is_gso(skb)) { skb = netem_segment(skb, sch, to_free); if (!skb) @@ -536,12 +538,12 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, cb = netem_skb_cb(skb); if (q->gap == 0 || /* not doing reordering */ q->counter < q->gap - 1 || /* inside last reordering gap */ - q->reorder < get_crandom(&q->reorder_cor)) { + q->reorder < get_crandom(&q->reorder_cor, &q->prng)) { u64 now; s64 delay; delay = tabledist(q->latency, q->jitter, - &q->delay_cor, q->delay_dist); + &q->delay_cor, &q->prng, q->delay_dist); now = ktime_get_ns(); @@ -645,7 +647,7 @@ static void get_slot_next(struct netem_sched_data *q, u64 now) else next_delay = tabledist(q->slot_config.dist_delay, (s32)(q->slot_config.dist_jitter), - NULL, q->slot_dist); + NULL, &q->prng, q->slot_dist); q->slot.slot_next = now + next_delay; q->slot.packets_left = q->slot_config.max_packets; -- cgit v1.2.3 From 29b22badb7a84b783e3a4fffca16f7768fb31205 Mon Sep 17 00:00:00 2001 From: Yan Zhai Date: Thu, 17 Aug 2023 19:58:11 -0700 Subject: lwt: Fix return values of BPF xmit ops BPF encap ops can return different types of positive values, such as NET_RX_DROP, NET_XMIT_CN, NETDEV_TX_BUSY, and so on, from the functions skb_do_redirect and bpf_lwt_xmit_reroute. At the xmit hook, such return values would be treated implicitly as LWTUNNEL_XMIT_CONTINUE in ip(6)_finish_output2. When this happens, skbs that have been freed would continue to the neighbor subsystem, causing use-after-free bugs and kernel crashes. To fix the incorrect behavior, skb_do_redirect return values can be simply discarded, the same as tc-egress behavior.
On the other hand, bpf_lwt_xmit_reroute returns useful errors to local senders, e.g. PMTU information. Thus convert its return values to avoid the conflict with LWTUNNEL_XMIT_CONTINUE. Fixes: 3a0af8fd61f9 ("bpf: BPF for lightweight tunnel infrastructure") Reported-by: Jordan Griege Suggested-by: Martin KaFai Lau Suggested-by: Stanislav Fomichev Signed-off-by: Yan Zhai Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/0d2b878186cfe215fec6b45769c1cd0591d3628d.1692326837.git.yan@cloudflare.com --- net/core/lwt_bpf.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index 8b6b5e72b217..4a0797f0a154 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -60,9 +60,8 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt, ret = BPF_OK; } else { skb_reset_mac_header(skb); - ret = skb_do_redirect(skb); - if (ret == 0) - ret = BPF_REDIRECT; + skb_do_redirect(skb); + ret = BPF_REDIRECT; } break; @@ -255,7 +254,7 @@ static int bpf_lwt_xmit_reroute(struct sk_buff *skb) err = dst_output(dev_net(skb_dst(skb)->dev), skb->sk, skb); if (unlikely(err)) - return err; + return net_xmit_errno(err); /* ip[6]_finish_output2 understand LWTUNNEL_XMIT_DONE */ return LWTUNNEL_XMIT_DONE; -- cgit v1.2.3 From a171fbec88a2c730b108c7147ac5e7b2f5a02b47 Mon Sep 17 00:00:00 2001 From: Yan Zhai Date: Thu, 17 Aug 2023 19:58:14 -0700 Subject: lwt: Check LWTUNNEL_XMIT_CONTINUE strictly LWTUNNEL_XMIT_CONTINUE is implicitly assumed in ip(6)_finish_output2, such that any positive return value from a xmit hook could cause unexpected continue behavior, despite that related skb may have been freed. This could be error-prone for future xmit hook ops. One of the possible errors is to return statuses of dst_output directly. To make the code safer, redefine LWTUNNEL_XMIT_CONTINUE value to distinguish from dst_output statuses and check the continue condition explicitly. Fixes: 3a0af8fd61f9 ("bpf: BPF for lightweight tunnel infrastructure") Suggested-by: Dan Carpenter Signed-off-by: Yan Zhai Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/96b939b85eda00e8df4f7c080f770970a4c5f698.1692326837.git.yan@cloudflare.com --- include/net/lwtunnel.h | 5 ++++- net/ipv4/ip_output.c | 2 +- net/ipv6/ip6_output.c | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h index 6f15e6fa154e..53bd2d02a4f0 100644 --- a/include/net/lwtunnel.h +++ b/include/net/lwtunnel.h @@ -16,9 +16,12 @@ #define LWTUNNEL_STATE_INPUT_REDIRECT BIT(1) #define LWTUNNEL_STATE_XMIT_REDIRECT BIT(2) +/* LWTUNNEL_XMIT_CONTINUE should be distinguishable from dst_output return + * values (NET_XMIT_xxx and NETDEV_TX_xxx in linux/netdevice.h) for safety. 
+ */ enum { LWTUNNEL_XMIT_DONE, - LWTUNNEL_XMIT_CONTINUE, + LWTUNNEL_XMIT_CONTINUE = 0x100, }; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index ce6257860a40..43ba4b77b248 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -216,7 +216,7 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s if (lwtunnel_xmit_redirect(dst->lwtstate)) { int res = lwtunnel_xmit(skb); - if (res < 0 || res == LWTUNNEL_XMIT_DONE) + if (res != LWTUNNEL_XMIT_CONTINUE) return res; } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index f8a1f6bb3f87..0665e8b09968 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -113,7 +113,7 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * if (lwtunnel_xmit_redirect(dst->lwtstate)) { int res = lwtunnel_xmit(skb); - if (res < 0 || res == LWTUNNEL_XMIT_DONE) + if (res != LWTUNNEL_XMIT_CONTINUE) return res; } -- cgit v1.2.3 From 0a0643164da4a1976455aa12f0a96d08ee290752 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 15 Aug 2023 17:17:36 +0200 Subject: net: use SLAB_NO_MERGE for kmem_cache skbuff_head_cache Since v6.5-rc1 MM-tree is merged and contains a new flag SLAB_NO_MERGE in commit d0bf7d5759c1 ("mm/slab: introduce kmem_cache flag SLAB_NO_MERGE") now is the time to use this flag for networking as proposed earlier see link. The SKB (sk_buff) kmem_cache slab is critical for network performance. Network stack uses kmem_cache_{alloc,free}_bulk APIs to gain performance by amortising the alloc/free cost. For the bulk API to perform efficiently the slub fragmentation need to be low. Especially for the SLUB allocator, the efficiency of bulk free API depend on objects belonging to the same slab (page). When running different network performance microbenchmarks, I started to notice that performance was reduced (slightly) when machines had longer uptimes. I believe the cause was 'skbuff_head_cache' got aliased/merged into the general slub for 256 bytes sized objects (with my kernel config, without CONFIG_HARDENED_USERCOPY). For SKB kmem_cache network stack have other various reasons for not merging, but it varies depending on kernel config (e.g. CONFIG_HARDENED_USERCOPY). We want to explicitly set SLAB_NO_MERGE for this kmem_cache to get most out of kmem_cache_{alloc,free}_bulk APIs. When CONFIG_SLUB_TINY is configured the bulk APIs are essentially disabled. Thus, for this case drop the SLAB_NO_MERGE flag. Link: https://lore.kernel.org/all/167396280045.539803.7540459812377220500.stgit@firesoul/ Signed-off-by: Jesper Dangaard Brouer Acked-by: Vlastimil Babka Link: https://lore.kernel.org/r/169211265663.1491038.8580163757548985946.stgit@firesoul Signed-off-by: Jakub Kicinski --- net/core/skbuff.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 33fdf04d4334..9b5bc621daf7 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4795,12 +4795,23 @@ static void skb_extensions_init(void) static void skb_extensions_init(void) {} #endif +/* The SKB kmem_cache slab is critical for network performance. Never + * merge/alias the slab with similar sized objects. This avoids fragmentation + * that hurts performance of kmem_cache_{alloc,free}_bulk APIs. 
+ */ +#ifndef CONFIG_SLUB_TINY +#define FLAG_SKB_NO_MERGE SLAB_NO_MERGE +#else /* CONFIG_SLUB_TINY - simple loop in kmem_cache_alloc_bulk */ +#define FLAG_SKB_NO_MERGE 0 +#endif + void __init skb_init(void) { skbuff_cache = kmem_cache_create_usercopy("skbuff_head_cache", sizeof(struct sk_buff), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, + SLAB_HWCACHE_ALIGN|SLAB_PANIC| + FLAG_SKB_NO_MERGE, offsetof(struct sk_buff, cb), sizeof_field(struct sk_buff, cb), NULL); -- cgit v1.2.3 From 726e9e8b94b92d69004af17a68f9f37ffc0358b9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 17 Aug 2023 18:23:53 +0000 Subject: tcp: refine skb->ooo_okay setting Enabling BIG TCP on a low end platform apparently increased chances of getting flows locked on one busy TX queue. A similar problem was handled in commit 9b462d02d6dd ("tcp: TCP Small Queues and strange attractors"), but the strategy worked for either bulk flows, or 'large enough' RPC. BIG TCP changed how large RPC needed to be to enable the work around: If RPC fits in a single skb, TSQ never triggers. Root cause for the problem is a busy TX queue, with delayed TX completions. This patch changes how we set skb->ooo_okay to detect the case TX completion was not done, but incoming ACK already was processed and emptied rtx queue. Update the comment to explain the tricky details. Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20230817182353.2523746-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_output.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 769a558159ee..e6b4fbd642f7 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1301,14 +1301,21 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, } tcp_header_size = tcp_options_size + sizeof(struct tcphdr); - /* if no packet is in qdisc/device queue, then allow XPS to select - * another queue. We can be called from tcp_tsq_handler() - * which holds one reference to sk. - * - * TODO: Ideally, in-flight pure ACK packets should not matter here. - * One way to get this would be to set skb->truesize = 2 on them. + /* We set skb->ooo_okay to one if this packet can select + * a different TX queue than prior packets of this flow, + * to avoid self inflicted reorders. + * The 'other' queue decision is based on current cpu number + * if XPS is enabled, or sk->sk_txhash otherwise. + * We can switch to another (and better) queue if: + * 1) No packet with payload is in qdisc/device queues. + * Delays in TX completion can defeat the test + * even if packets were already sent. + * 2) Or rtx queue is empty. + * This mitigates above case if ACK packets for + * all prior packets were already processed. */ - skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1); + skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1) || + tcp_rtx_queue_empty(sk); /* If we had to use memory reserve to allocate this skb, * this might cause drops if packet is looped back : -- cgit v1.2.3 From 1e700948c9db0d09f691f715e8f4b947e51e35b5 Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Thu, 17 Aug 2023 21:20:27 +0800 Subject: net/smc: support smc release version negotiation in clc handshake Support smc release version negotiation in clc handshake based on SMC v2, where no negotiation process for different releases, but for different versions. The latest smc release version was updated to v2.1. And currently there are two release versions of SMCv2, v2.0 and v2.1. 
In the release version negotiation, the client sends its preferred release version in the CLC Proposal Message; the server decides which release version to use based on the client's preferred release version and its own supported release version (choosing the minimum of the client's preferred version and the server's latest supported version), then returns the decision to the client in the CLC Accept Message. The client confirms the decision in the CLC Confirm Message. Client Server Proposal(preferred release version) ------------------------------------> Accept(accepted release version) min(client preferred, server latest supported) <------------------------------------ Confirm(accepted release version) ------------------------------------> Signed-off-by: Guangguan Wang Reviewed-by: Tony Lu Reviewed-by: Jan Karcher Signed-off-by: David S. Miller --- net/smc/af_smc.c | 21 +++++++++++++++++---- net/smc/smc.h | 5 ++++- net/smc/smc_clc.c | 14 +++++++------- net/smc/smc_clc.h | 23 ++++++++++++++++++++++- net/smc/smc_core.h | 1 + 5 files changed, 51 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index f5834af5fad5..fff196b557ca 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1198,8 +1198,7 @@ static int smc_connect_rdma_v2_prepare(struct smc_sock *smc, struct smc_clc_msg_accept_confirm_v2 *clc_v2 = (struct smc_clc_msg_accept_confirm_v2 *)aclc; struct smc_clc_first_contact_ext *fce = - (struct smc_clc_first_contact_ext *) - (((u8 *)clc_v2) + sizeof(*clc_v2)); + smc_get_clc_first_contact_ext(clc_v2, false); if (!ini->first_contact_peer || aclc->hdr.version == SMC_V1) return 0; @@ -1218,6 +1217,9 @@ static int smc_connect_rdma_v2_prepare(struct smc_sock *smc, return SMC_CLC_DECL_NOINDIRECT; } } + + ini->release_nr = fce->release; + return 0; } @@ -1386,6 +1388,13 @@ static int smc_connect_ism(struct smc_sock *smc, struct smc_clc_msg_accept_confirm_v2 *aclc_v2 = (struct smc_clc_msg_accept_confirm_v2 *)aclc; + if (ini->first_contact_peer) { + struct smc_clc_first_contact_ext *fce = + smc_get_clc_first_contact_ext(aclc_v2, true); + + ini->release_nr = fce->release; + } + rc = smc_v2_determine_accepted_chid(aclc_v2, ini); if (rc) return rc; @@ -1420,7 +1429,7 @@ static int smc_connect_ism(struct smc_sock *smc, } rc = smc_clc_send_confirm(smc, ini->first_contact_local, - aclc->hdr.version, eid, NULL); + aclc->hdr.version, eid, ini); if (rc) goto connect_abort; mutex_unlock(&smc_server_lgr_pending); @@ -1996,6 +2005,10 @@ static int smc_listen_v2_check(struct smc_sock *new_smc, } } + ini->release_nr = pclc_v2_ext->hdr.flag.release; + if (pclc_v2_ext->hdr.flag.release > SMC_RELEASE) + ini->release_nr = SMC_RELEASE; + out: if (!ini->smcd_version && !ini->smcr_version) return rc; @@ -2443,7 +2456,7 @@ static void smc_listen_work(struct work_struct *work) /* send SMC Accept CLC message */ accept_version = ini->is_smcd ?
ini->smcd_version : ini->smcr_version; rc = smc_clc_send_accept(new_smc, ini->first_contact_local, - accept_version, ini->negotiated_eid); + accept_version, ini->negotiated_eid, ini); if (rc) goto out_unlock; diff --git a/net/smc/smc.h b/net/smc/smc.h index 1f2b912c43d1..24745fde4ac2 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -21,7 +21,10 @@ #define SMC_V1 1 /* SMC version V1 */ #define SMC_V2 2 /* SMC version V2 */ -#define SMC_RELEASE 0 + +#define SMC_RELEASE_0 0 +#define SMC_RELEASE_1 1 +#define SMC_RELEASE SMC_RELEASE_1 /* the latest release version */ #define SMCPROTO_SMC 0 /* SMC protocol, IPv4 */ #define SMCPROTO_SMC6 1 /* SMC protocol, IPv6 */ diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index c90d9e5dda54..fb0be0817e8a 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -420,11 +420,11 @@ smc_clc_msg_decl_valid(struct smc_clc_msg_decline *dclc) return true; } -static void smc_clc_fill_fce(struct smc_clc_first_contact_ext *fce, int *len) +static void smc_clc_fill_fce(struct smc_clc_first_contact_ext *fce, int *len, int release_nr) { memset(fce, 0, sizeof(*fce)); fce->os_type = SMC_CLC_OS_LINUX; - fce->release = SMC_RELEASE; + fce->release = release_nr; memcpy(fce->hostname, smc_hostname, sizeof(smc_hostname)); (*len) += sizeof(*fce); } @@ -1019,7 +1019,7 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, memcpy(clc_v2->d1.eid, eid, SMC_MAX_EID_LEN); len = SMCD_CLC_ACCEPT_CONFIRM_LEN_V2; if (first_contact) - smc_clc_fill_fce(&fce, &len); + smc_clc_fill_fce(&fce, &len, ini->release_nr); clc_v2->hdr.length = htons(len); } memcpy(trl.eyecatcher, SMCD_EYECATCHER, @@ -1063,10 +1063,10 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, memcpy(clc_v2->r1.eid, eid, SMC_MAX_EID_LEN); len = SMCR_CLC_ACCEPT_CONFIRM_LEN_V2; if (first_contact) { - smc_clc_fill_fce(&fce, &len); + smc_clc_fill_fce(&fce, &len, ini->release_nr); fce.v2_direct = !link->lgr->uses_gateway; memset(&gle, 0, sizeof(gle)); - if (ini && clc->hdr.type == SMC_CLC_CONFIRM) { + if (clc->hdr.type == SMC_CLC_CONFIRM) { gle.gid_cnt = ini->smcrv2.gidlist.len; len += sizeof(gle); len += gle.gid_cnt * sizeof(gle.gid[0]); @@ -1141,7 +1141,7 @@ int smc_clc_send_confirm(struct smc_sock *smc, bool clnt_first_contact, /* send CLC ACCEPT message across internal TCP socket */ int smc_clc_send_accept(struct smc_sock *new_smc, bool srv_first_contact, - u8 version, u8 *negotiated_eid) + u8 version, u8 *negotiated_eid, struct smc_init_info *ini) { struct smc_clc_msg_accept_confirm_v2 aclc_v2; int len; @@ -1149,7 +1149,7 @@ int smc_clc_send_accept(struct smc_sock *new_smc, bool srv_first_contact, memset(&aclc_v2, 0, sizeof(aclc_v2)); aclc_v2.hdr.type = SMC_CLC_ACCEPT; len = smc_clc_send_confirm_accept(new_smc, &aclc_v2, srv_first_contact, - version, negotiated_eid, NULL); + version, negotiated_eid, ini); if (len < ntohs(aclc_v2.hdr.length)) len = len >= 0 ? 
-EPROTO : -new_smc->clcsock->sk->sk_err; diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 5fee545c9a10..b923e89acafb 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -370,6 +370,27 @@ smc_get_clc_smcd_v2_ext(struct smc_clc_v2_extension *prop_v2ext) ntohs(prop_v2ext->hdr.smcd_v2_ext_offset)); } +static inline struct smc_clc_first_contact_ext * +smc_get_clc_first_contact_ext(struct smc_clc_msg_accept_confirm_v2 *clc_v2, + bool is_smcd) +{ + int clc_v2_len; + + if (clc_v2->hdr.version == SMC_V1 || + !(clc_v2->hdr.typev2 & SMC_FIRST_CONTACT_MASK)) + return NULL; + + if (is_smcd) + clc_v2_len = + offsetofend(struct smc_clc_msg_accept_confirm_v2, d1); + else + clc_v2_len = + offsetofend(struct smc_clc_msg_accept_confirm_v2, r1); + + return (struct smc_clc_first_contact_ext *)(((u8 *)clc_v2) + + clc_v2_len); +} + struct smcd_dev; struct smc_init_info; @@ -382,7 +403,7 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini); int smc_clc_send_confirm(struct smc_sock *smc, bool clnt_first_contact, u8 version, u8 *eid, struct smc_init_info *ini); int smc_clc_send_accept(struct smc_sock *smc, bool srv_first_contact, - u8 version, u8 *negotiated_eid); + u8 version, u8 *negotiated_eid, struct smc_init_info *ini); void smc_clc_init(void) __init; void smc_clc_exit(void); void smc_clc_get_hostname(u8 **host); diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 3c1b31bfa1cf..b6c68db61f23 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -374,6 +374,7 @@ struct smc_init_info { u8 is_smcd; u8 smc_type_v1; u8 smc_type_v2; + u8 release_nr; u8 first_contact_peer; u8 first_contact_local; unsigned short vlan_id; -- cgit v1.2.3 From 7290178a82fcc71694816ffe3c0c56453f0cf59c Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Thu, 17 Aug 2023 21:20:28 +0800 Subject: net/smc: add vendor unique experimental options area in clc handshake Add a vendor unique experimental options area in the clc handshake. In the clc accept and confirm msg, the vendor unique experimental options use the 16-Bytes reserved field, which was defined in struct smc_clc_fce_gid_ext in the previous version. Because struct smc_clc_first_contact_ext is widely used, and to limit the scope of the modification, this patch moves the 16-Bytes reserved field out of struct smc_clc_fce_gid_ext and places it after struct smc_clc_first_contact_ext in a new struct named struct smc_clc_first_contact_ext_v2x. For an SMC-R first connection, in the previous version, struct smc_clc_first_contact_ext and the 16-Bytes reserved field were already included in the clc accept and confirm msg. Thus, this patch uses struct smc_clc_first_contact_ext_v2x instead, and the 16-Bytes reserved field in the SMC-R clc accept and confirm msg remains compatible with the previous version. For an SMC-D first connection, in the previous version, only struct smc_clc_first_contact_ext is included in the clc accept and confirm msg, and the 16-Bytes reserved field is not included. Thus, when the negotiated smc release version is a version before v2.1, we still use struct smc_clc_first_contact_ext for compatibility considerations. If the negotiated smc release version is v2.1 or later, struct smc_clc_first_contact_ext_v2x is used instead. Signed-off-by: Guangguan Wang Reviewed-by: Tony Lu Reviewed-by: Jan Karcher Signed-off-by: David S.
Miller --- net/smc/af_smc.c | 2 +- net/smc/smc_clc.c | 44 +++++++++++++++++++++++--------------------- net/smc/smc_clc.h | 15 +++++++++++++-- 3 files changed, 37 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index fff196b557ca..b80c5902f703 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1144,7 +1144,7 @@ static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc, #define SMC_CLC_MAX_ACCEPT_LEN \ (sizeof(struct smc_clc_msg_accept_confirm_v2) + \ - sizeof(struct smc_clc_first_contact_ext) + \ + sizeof(struct smc_clc_first_contact_ext_v2x) + \ sizeof(struct smc_clc_msg_trail)) /* CLC handshake during connect */ diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index fb0be0817e8a..a5b28f9c037d 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -391,9 +391,7 @@ smc_clc_msg_acc_conf_valid(struct smc_clc_msg_accept_confirm_v2 *clc_v2) return false; } else { if (hdr->typev1 == SMC_TYPE_D && - ntohs(hdr->length) != SMCD_CLC_ACCEPT_CONFIRM_LEN_V2 && - (ntohs(hdr->length) != SMCD_CLC_ACCEPT_CONFIRM_LEN_V2 + - sizeof(struct smc_clc_first_contact_ext))) + ntohs(hdr->length) < SMCD_CLC_ACCEPT_CONFIRM_LEN_V2) return false; if (hdr->typev1 == SMC_TYPE_R && ntohs(hdr->length) < SMCR_CLC_ACCEPT_CONFIRM_LEN_V2) @@ -420,13 +418,19 @@ smc_clc_msg_decl_valid(struct smc_clc_msg_decline *dclc) return true; } -static void smc_clc_fill_fce(struct smc_clc_first_contact_ext *fce, int *len, int release_nr) +static int smc_clc_fill_fce(struct smc_clc_first_contact_ext_v2x *fce, + struct smc_init_info *ini) { + int ret = sizeof(*fce); + memset(fce, 0, sizeof(*fce)); - fce->os_type = SMC_CLC_OS_LINUX; - fce->release = release_nr; - memcpy(fce->hostname, smc_hostname, sizeof(smc_hostname)); - (*len) += sizeof(*fce); + fce->fce_v2_base.os_type = SMC_CLC_OS_LINUX; + fce->fce_v2_base.release = ini->release_nr; + memcpy(fce->fce_v2_base.hostname, smc_hostname, sizeof(smc_hostname)); + if (ini->is_smcd && ini->release_nr < SMC_RELEASE_1) + ret = sizeof(struct smc_clc_first_contact_ext); + + return ret; } /* check if received message has a correct header length and contains valid @@ -986,13 +990,13 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, u8 *eid, struct smc_init_info *ini) { struct smc_connection *conn = &smc->conn; + struct smc_clc_first_contact_ext_v2x fce; struct smc_clc_msg_accept_confirm *clc; - struct smc_clc_first_contact_ext fce; struct smc_clc_fce_gid_ext gle; struct smc_clc_msg_trail trl; + int i, len, fce_len; struct kvec vec[5]; struct msghdr msg; - int i, len; /* send SMC Confirm CLC msg */ clc = (struct smc_clc_msg_accept_confirm *)clc_v2; @@ -1018,8 +1022,10 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, if (eid && eid[0]) memcpy(clc_v2->d1.eid, eid, SMC_MAX_EID_LEN); len = SMCD_CLC_ACCEPT_CONFIRM_LEN_V2; - if (first_contact) - smc_clc_fill_fce(&fce, &len, ini->release_nr); + if (first_contact) { + fce_len = smc_clc_fill_fce(&fce, ini); + len += fce_len; + } clc_v2->hdr.length = htons(len); } memcpy(trl.eyecatcher, SMCD_EYECATCHER, @@ -1063,15 +1069,14 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, memcpy(clc_v2->r1.eid, eid, SMC_MAX_EID_LEN); len = SMCR_CLC_ACCEPT_CONFIRM_LEN_V2; if (first_contact) { - smc_clc_fill_fce(&fce, &len, ini->release_nr); - fce.v2_direct = !link->lgr->uses_gateway; - memset(&gle, 0, sizeof(gle)); + fce_len = smc_clc_fill_fce(&fce, ini); + len += fce_len; + fce.fce_v2_base.v2_direct = !link->lgr->uses_gateway; if (clc->hdr.type == SMC_CLC_CONFIRM) { + 
memset(&gle, 0, sizeof(gle)); gle.gid_cnt = ini->smcrv2.gidlist.len; len += sizeof(gle); len += gle.gid_cnt * sizeof(gle.gid[0]); - } else { - len += sizeof(gle.reserved); } } clc_v2->hdr.length = htons(len); @@ -1094,7 +1099,7 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, sizeof(trl); if (version > SMC_V1 && first_contact) { vec[i].iov_base = &fce; - vec[i++].iov_len = sizeof(fce); + vec[i++].iov_len = fce_len; if (!conn->lgr->is_smcd) { if (clc->hdr.type == SMC_CLC_CONFIRM) { vec[i].iov_base = &gle; @@ -1102,9 +1107,6 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, vec[i].iov_base = &ini->smcrv2.gidlist.list; vec[i++].iov_len = gle.gid_cnt * sizeof(gle.gid[0]); - } else { - vec[i].iov_base = &gle.reserved; - vec[i++].iov_len = sizeof(gle.reserved); } } } diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index b923e89acafb..bd75382f374d 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -147,7 +147,9 @@ struct smc_clc_msg_proposal_prefix { /* prefix part of clc proposal message*/ struct smc_clc_msg_smcd { /* SMC-D GID information */ struct smc_clc_smcd_gid_chid ism; /* ISM native GID+CHID of requestor */ __be16 v2_ext_offset; /* SMC Version 2 Extension Offset */ - u8 reserved[28]; + u8 vendor_oui[3]; /* vendor organizationally unique identifier */ + u8 vendor_exp_options[5]; + u8 reserved[20]; }; struct smc_clc_smcd_v2_extension { @@ -231,8 +233,17 @@ struct smc_clc_first_contact_ext { u8 hostname[SMC_MAX_HOSTNAME_LEN]; }; +struct smc_clc_first_contact_ext_v2x { + struct smc_clc_first_contact_ext fce_v2_base; + u8 reserved3[4]; + __be32 vendor_exp_options; + u8 reserved4[8]; +} __packed; /* format defined in + * IBM Shared Memory Communications Version 2 (Third Edition) + * (https://www.ibm.com/support/pages/node/7009315) + */ + struct smc_clc_fce_gid_ext { - u8 reserved[16]; u8 gid_cnt; u8 reserved2[3]; u8 gid[][SMC_GID_SIZE]; -- cgit v1.2.3 From 6ac1e6563f5915cd38b6bc6a8b26964b2252f751 Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Thu, 17 Aug 2023 21:20:29 +0800 Subject: net/smc: support smc v2.x features validate Support SMC v2.x features validate for SMC v2.1. This is the frame code for SMC v2.x features validate, and will take effects only when the negotiated release version is v2.1 or later. For Server, v2.x features' validation should be done in smc_clc_srv_ v2x_features_validate when receiving v2.1 or later CLC Proposal Message, such as max conns, max links negotiation, the decision of the final value of max conns and max links should be made in this function. And final check for server when receiving v2.1 or later CLC Confirm Message should be done in smc_clc_v2x_features_confirm_check. For client, v2.x features' validation should be done in smc_clc_clnt_ v2x_features_validate when receiving v2.1 or later CLC Accept Message, for example, the decision to accpt the accepted value or to decline should be made in this function. Signed-off-by: Guangguan Wang Reviewed-by: Tony Lu Reviewed-by: Jan Karcher Signed-off-by: David S. 
Miller --- net/smc/af_smc.c | 18 ++++++++++++++++++ net/smc/smc_clc.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ net/smc/smc_clc.h | 7 +++++++ 3 files changed, 71 insertions(+) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index b80c5902f703..337303b2a64e 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1199,6 +1199,7 @@ static int smc_connect_rdma_v2_prepare(struct smc_sock *smc, (struct smc_clc_msg_accept_confirm_v2 *)aclc; struct smc_clc_first_contact_ext *fce = smc_get_clc_first_contact_ext(clc_v2, false); + int rc; if (!ini->first_contact_peer || aclc->hdr.version == SMC_V1) return 0; @@ -1219,6 +1220,9 @@ static int smc_connect_rdma_v2_prepare(struct smc_sock *smc, } ini->release_nr = fce->release; + rc = smc_clc_clnt_v2x_features_validate(fce, ini); + if (rc) + return rc; return 0; } @@ -1393,6 +1397,9 @@ static int smc_connect_ism(struct smc_sock *smc, smc_get_clc_first_contact_ext(aclc_v2, true); ini->release_nr = fce->release; + rc = smc_clc_clnt_v2x_features_validate(fce, ini); + if (rc) + return rc; } rc = smc_v2_determine_accepted_chid(aclc_v2, ini); @@ -2443,6 +2450,10 @@ static void smc_listen_work(struct work_struct *work) if (rc) goto out_decl; + rc = smc_clc_srv_v2x_features_validate(pclc, ini); + if (rc) + goto out_decl; + mutex_lock(&smc_server_lgr_pending); smc_close_init(new_smc); smc_rx_init(new_smc); @@ -2475,6 +2486,13 @@ static void smc_listen_work(struct work_struct *work) goto out_decl; } + rc = smc_clc_v2x_features_confirm_check(cclc, ini); + if (rc) { + if (!ini->is_smcd) + goto out_unlock; + goto out_decl; + } + /* finish worker */ if (!ini->is_smcd) { rc = smc_listen_rdma_finish(new_smc, cclc, diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index a5b28f9c037d..0232f189dad9 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -1158,6 +1158,52 @@ int smc_clc_send_accept(struct smc_sock *new_smc, bool srv_first_contact, return len > 0 ? 
0 : len; } +int smc_clc_srv_v2x_features_validate(struct smc_clc_msg_proposal *pclc, + struct smc_init_info *ini) +{ + struct smc_clc_v2_extension *pclc_v2_ext; + + if ((!(ini->smcd_version & SMC_V2) && !(ini->smcr_version & SMC_V2)) || + ini->release_nr < SMC_RELEASE_1) + return 0; + + pclc_v2_ext = smc_get_clc_v2_ext(pclc); + if (!pclc_v2_ext) + return SMC_CLC_DECL_NOV2EXT; + + return 0; +} + +int smc_clc_clnt_v2x_features_validate(struct smc_clc_first_contact_ext *fce, + struct smc_init_info *ini) +{ + if (ini->release_nr < SMC_RELEASE_1) + return 0; + + return 0; +} + +int smc_clc_v2x_features_confirm_check(struct smc_clc_msg_accept_confirm *cclc, + struct smc_init_info *ini) +{ + struct smc_clc_msg_accept_confirm_v2 *clc_v2 = + (struct smc_clc_msg_accept_confirm_v2 *)cclc; + struct smc_clc_first_contact_ext *fce = + smc_get_clc_first_contact_ext(clc_v2, ini->is_smcd); + + if (cclc->hdr.version == SMC_V1 || + !(cclc->hdr.typev2 & SMC_FIRST_CONTACT_MASK)) + return 0; + + if (ini->release_nr != fce->release) + return SMC_CLC_DECL_RELEASEERR; + + if (fce->release < SMC_RELEASE_1) + return 0; + + return 0; +} + void smc_clc_get_hostname(u8 **host) { *host = &smc_hostname[0]; diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index bd75382f374d..552d0656252c 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -45,6 +45,7 @@ #define SMC_CLC_DECL_NOSEID 0x03030006 /* peer sent no SEID */ #define SMC_CLC_DECL_NOSMCD2DEV 0x03030007 /* no SMC-Dv2 device found */ #define SMC_CLC_DECL_NOUEID 0x03030008 /* peer sent no UEID */ +#define SMC_CLC_DECL_RELEASEERR 0x03030009 /* release version negotiate failed */ #define SMC_CLC_DECL_MODEUNSUPP 0x03040000 /* smc modes do not match (R or D)*/ #define SMC_CLC_DECL_RMBE_EC 0x03050000 /* peer has eyecatcher in RMBE */ #define SMC_CLC_DECL_OPTUNSUPP 0x03060000 /* fastopen sockopt not supported */ @@ -415,6 +416,12 @@ int smc_clc_send_confirm(struct smc_sock *smc, bool clnt_first_contact, u8 version, u8 *eid, struct smc_init_info *ini); int smc_clc_send_accept(struct smc_sock *smc, bool srv_first_contact, u8 version, u8 *negotiated_eid, struct smc_init_info *ini); +int smc_clc_srv_v2x_features_validate(struct smc_clc_msg_proposal *pclc, + struct smc_init_info *ini); +int smc_clc_clnt_v2x_features_validate(struct smc_clc_first_contact_ext *fce, + struct smc_init_info *ini); +int smc_clc_v2x_features_confirm_check(struct smc_clc_msg_accept_confirm *cclc, + struct smc_init_info *ini); void smc_clc_init(void) __init; void smc_clc_exit(void); void smc_clc_get_hostname(u8 **host); -- cgit v1.2.3 From 7f0620b9940b3b4c5d83a815d46b97ac0b88a595 Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Thu, 17 Aug 2023 21:20:30 +0800 Subject: net/smc: support max connections per lgr negotiation Support max connections per lgr negotiation for SMCR v2.1, which is one of smc v2.1 features. Server makes decision for the final value of max conns based on the client preferred max conns and self-preferred max conns. Here use the minimum value of client preferred max conns and server preferred max conns. Client Server Proposal(max conns(client preferred)) ------------------------------------> Accept(max conns(accepted value)) accepted value=min(client preferred, server preferred) <----------------------------------- Confirm(max conns(accepted value)) -----------------------------------> Signed-off-by: Guangguan Wang Reviewed-by: Tony Lu Reviewed-by: Jan Karcher Signed-off-by: David S. 
Miller --- net/smc/af_smc.c | 1 + net/smc/smc_clc.c | 38 ++++++++++++++++++++++++++++++++++++-- net/smc/smc_clc.h | 7 +++++-- net/smc/smc_core.c | 4 +++- net/smc/smc_core.h | 12 ++++++++++++ net/smc/smc_llc.c | 6 +++++- 6 files changed, 62 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 337303b2a64e..9a72eb8d3038 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1242,6 +1242,7 @@ static int smc_connect_rdma(struct smc_sock *smc, memcpy(ini->peer_systemid, aclc->r0.lcl.id_for_peer, SMC_SYSTEMID_LEN); memcpy(ini->peer_gid, aclc->r0.lcl.gid, SMC_GID_SIZE); memcpy(ini->peer_mac, aclc->r0.lcl.mac, ETH_ALEN); + ini->max_conns = SMC_CONN_PER_LGR_MAX; reason_code = smc_connect_rdma_v2_prepare(smc, aclc, ini); if (reason_code) diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 0232f189dad9..c88f5e2e65a2 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -427,9 +427,17 @@ static int smc_clc_fill_fce(struct smc_clc_first_contact_ext_v2x *fce, fce->fce_v2_base.os_type = SMC_CLC_OS_LINUX; fce->fce_v2_base.release = ini->release_nr; memcpy(fce->fce_v2_base.hostname, smc_hostname, sizeof(smc_hostname)); - if (ini->is_smcd && ini->release_nr < SMC_RELEASE_1) + if (ini->is_smcd && ini->release_nr < SMC_RELEASE_1) { ret = sizeof(struct smc_clc_first_contact_ext); + goto out; + } + + if (ini->release_nr >= SMC_RELEASE_1) { + if (!ini->is_smcd) + fce->max_conns = ini->max_conns; + } +out: return ret; } @@ -931,8 +939,10 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) sizeof(struct smc_clc_smcd_gid_chid); } } - if (smcr_indicated(ini->smc_type_v2)) + if (smcr_indicated(ini->smc_type_v2)) { memcpy(v2_ext->roce, ini->smcrv2.ib_gid_v2, SMC_GID_SIZE); + v2_ext->max_conns = SMC_CONN_PER_LGR_PREFER; + } pclc_base->hdr.length = htons(plen); memcpy(trl->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); @@ -1163,6 +1173,8 @@ int smc_clc_srv_v2x_features_validate(struct smc_clc_msg_proposal *pclc, { struct smc_clc_v2_extension *pclc_v2_ext; + ini->max_conns = SMC_CONN_PER_LGR_MAX; + if ((!(ini->smcd_version & SMC_V2) && !(ini->smcr_version & SMC_V2)) || ini->release_nr < SMC_RELEASE_1) return 0; @@ -1171,15 +1183,30 @@ int smc_clc_srv_v2x_features_validate(struct smc_clc_msg_proposal *pclc, if (!pclc_v2_ext) return SMC_CLC_DECL_NOV2EXT; + if (ini->smcr_version & SMC_V2) { + ini->max_conns = min_t(u8, pclc_v2_ext->max_conns, SMC_CONN_PER_LGR_PREFER); + if (ini->max_conns < SMC_CONN_PER_LGR_MIN) + return SMC_CLC_DECL_MAXCONNERR; + } + return 0; } int smc_clc_clnt_v2x_features_validate(struct smc_clc_first_contact_ext *fce, struct smc_init_info *ini) { + struct smc_clc_first_contact_ext_v2x *fce_v2x = + (struct smc_clc_first_contact_ext_v2x *)fce; + if (ini->release_nr < SMC_RELEASE_1) return 0; + if (!ini->is_smcd) { + if (fce_v2x->max_conns < SMC_CONN_PER_LGR_MIN) + return SMC_CLC_DECL_MAXCONNERR; + ini->max_conns = fce_v2x->max_conns; + } + return 0; } @@ -1190,6 +1217,8 @@ int smc_clc_v2x_features_confirm_check(struct smc_clc_msg_accept_confirm *cclc, (struct smc_clc_msg_accept_confirm_v2 *)cclc; struct smc_clc_first_contact_ext *fce = smc_get_clc_first_contact_ext(clc_v2, ini->is_smcd); + struct smc_clc_first_contact_ext_v2x *fce_v2x = + (struct smc_clc_first_contact_ext_v2x *)fce; if (cclc->hdr.version == SMC_V1 || !(cclc->hdr.typev2 & SMC_FIRST_CONTACT_MASK)) @@ -1201,6 +1230,11 @@ int smc_clc_v2x_features_confirm_check(struct smc_clc_msg_accept_confirm *cclc, if (fce->release < SMC_RELEASE_1) return 0; + if 
(!ini->is_smcd) { + if (fce_v2x->max_conns != ini->max_conns) + return SMC_CLC_DECL_MAXCONNERR; + } + return 0; } diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 552d0656252c..464b93b46047 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -46,6 +46,7 @@ #define SMC_CLC_DECL_NOSMCD2DEV 0x03030007 /* no SMC-Dv2 device found */ #define SMC_CLC_DECL_NOUEID 0x03030008 /* peer sent no UEID */ #define SMC_CLC_DECL_RELEASEERR 0x03030009 /* release version negotiate failed */ +#define SMC_CLC_DECL_MAXCONNERR 0x0303000a /* max connections negotiate failed */ #define SMC_CLC_DECL_MODEUNSUPP 0x03040000 /* smc modes do not match (R or D)*/ #define SMC_CLC_DECL_RMBE_EC 0x03050000 /* peer has eyecatcher in RMBE */ #define SMC_CLC_DECL_OPTUNSUPP 0x03060000 /* fastopen sockopt not supported */ @@ -134,7 +135,8 @@ struct smc_clc_smcd_gid_chid { struct smc_clc_v2_extension { struct smc_clnt_opts_area_hdr hdr; u8 roce[16]; /* RoCEv2 GID */ - u8 reserved[16]; + u8 max_conns; + u8 reserved[15]; u8 user_eids[][SMC_MAX_EID_LEN]; }; @@ -236,7 +238,8 @@ struct smc_clc_first_contact_ext { struct smc_clc_first_contact_ext_v2x { struct smc_clc_first_contact_ext fce_v2_base; - u8 reserved3[4]; + u8 max_conns; /* for SMC-R only */ + u8 reserved3[3]; __be32 vendor_exp_options; u8 reserved4[8]; } __packed; /* format defined in diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 6b78075404d7..8c69cbb70f6e 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -895,9 +895,11 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) lgr->uses_gateway = ini->smcrv2.uses_gateway; memcpy(lgr->nexthop_mac, ini->smcrv2.nexthop_mac, ETH_ALEN); + lgr->max_conns = ini->max_conns; } else { ibdev = ini->ib_dev; ibport = ini->ib_port; + lgr->max_conns = SMC_CONN_PER_LGR_MAX; } memcpy(lgr->pnet_id, ibdev->pnetid[ibport - 1], SMC_MAX_PNETID_LEN); @@ -1888,7 +1890,7 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) (ini->smcd_version == SMC_V2 || lgr->vlan_id == ini->vlan_id) && (role == SMC_CLNT || ini->is_smcd || - (lgr->conns_num < SMC_RMBS_PER_LGR_MAX && + (lgr->conns_num < lgr->max_conns && !bitmap_full(lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX)))) { /* link group found */ ini->first_contact_local = 0; diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index b6c68db61f23..32b199477ef3 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -22,6 +22,15 @@ #include "smc_ib.h" #define SMC_RMBS_PER_LGR_MAX 255 /* max. # of RMBs per link group */ +#define SMC_CONN_PER_LGR_MIN 16 /* min. # of connections per link group */ +#define SMC_CONN_PER_LGR_MAX 255 /* max. # of connections per link group, + * also is the default value for SMC-R v1 and v2.0 + */ +#define SMC_CONN_PER_LGR_PREFER 255 /* Preferred connections per link group used for + * SMC-R v2.1 and later negotiation, vendors or + * distrubutions may modify it to a value between + * 16-255 as needed. 
+ */ struct smc_lgr_list { /* list of link group definition */ struct list_head list; @@ -331,6 +340,8 @@ struct smc_link_group { __be32 saddr; /* net namespace */ struct net *net; + u8 max_conns; + /* max conn can be assigned to lgr */ }; struct { /* SMC-D */ u64 peer_gid; @@ -375,6 +386,7 @@ struct smc_init_info { u8 smc_type_v1; u8 smc_type_v2; u8 release_nr; + u8 max_conns; u8 first_contact_peer; u8 first_contact_local; unsigned short vlan_id; diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 90f0b60b196a..5347b62f1518 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -52,7 +52,8 @@ struct smc_llc_msg_confirm_link { /* type 0x01 */ u8 link_num; u8 link_uid[SMC_LGR_ID_SIZE]; u8 max_links; - u8 reserved[9]; + u8 max_conns; + u8 reserved[8]; }; #define SMC_LLC_FLAG_ADD_LNK_REJ 0x40 @@ -472,6 +473,9 @@ int smc_llc_send_confirm_link(struct smc_link *link, confllc->link_num = link->link_id; memcpy(confllc->link_uid, link->link_uid, SMC_LGR_ID_SIZE); confllc->max_links = SMC_LLC_ADD_LNK_MAX_LINKS; + if (link->lgr->smc_version == SMC_V2 && + link->lgr->peer_smc_release >= SMC_RELEASE_1) + confllc->max_conns = link->lgr->max_conns; /* send llc message */ rc = smc_wr_tx_send(link, pend); put_out: -- cgit v1.2.3 From 69b888e3bb4b186ec597a368bc8348e0e4bc6e65 Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Thu, 17 Aug 2023 21:20:31 +0800 Subject: net/smc: support max links per lgr negotiation in clc handshake Support max links per lgr negotiation in clc handshake for SMCR v2.1, which is one of smc v2.1 features. Server makes decision for the final value of max links based on the client preferred max links and self-preferred max links. Here use the minimum value of the client preferred max links and server preferred max links. Client Server Proposal(max links(client preferred)) --------------------------------------> Accept(max links(accepted value)) accepted value=min(client preferred, server preferred) <------------------------------------- Confirm(max links(accepted value)) -------------------------------------> Signed-off-by: Guangguan Wang Reviewed-by: Tony Lu Reviewed-by: Jan Karcher Signed-off-by: David S. 
Miller --- net/smc/af_smc.c | 44 +++++++++++++++++++++++++++----------------- net/smc/smc_clc.c | 17 ++++++++++++++++- net/smc/smc_clc.h | 7 +++++-- net/smc/smc_core.c | 5 +++++ net/smc/smc_core.h | 12 ++++++++++++ net/smc/smc_llc.c | 21 +++++++++++++++++---- 6 files changed, 82 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 9a72eb8d3038..c4f664fb2bcb 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -641,20 +641,22 @@ static int smcr_clnt_conf_first_link(struct smc_sock *smc) smc_llc_link_active(link); smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE); - /* optional 2nd link, receive ADD LINK request from server */ - qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME, - SMC_LLC_ADD_LINK); - if (!qentry) { - struct smc_clc_msg_decline dclc; - - rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), - SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); - if (rc == -EAGAIN) - rc = 0; /* no DECLINE received, go with one link */ - return rc; + if (link->lgr->max_links > 1) { + /* optional 2nd link, receive ADD LINK request from server */ + qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME, + SMC_LLC_ADD_LINK); + if (!qentry) { + struct smc_clc_msg_decline dclc; + + rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), + SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); + if (rc == -EAGAIN) + rc = 0; /* no DECLINE received, go with one link */ + return rc; + } + smc_llc_flow_qentry_clr(&link->lgr->llc_flow_lcl); + smc_llc_cli_add_link(link, qentry); } - smc_llc_flow_qentry_clr(&link->lgr->llc_flow_lcl); - smc_llc_cli_add_link(link, qentry); return 0; } @@ -1243,6 +1245,7 @@ static int smc_connect_rdma(struct smc_sock *smc, memcpy(ini->peer_gid, aclc->r0.lcl.gid, SMC_GID_SIZE); memcpy(ini->peer_mac, aclc->r0.lcl.mac, ETH_ALEN); ini->max_conns = SMC_CONN_PER_LGR_MAX; + ini->max_links = SMC_LINKS_ADD_LNK_MAX; reason_code = smc_connect_rdma_v2_prepare(smc, aclc, ini); if (reason_code) @@ -1887,10 +1890,12 @@ static int smcr_serv_conf_first_link(struct smc_sock *smc) smc_llc_link_active(link); smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE); - down_write(&link->lgr->llc_conf_mutex); - /* initial contact - try to establish second link */ - smc_llc_srv_add_link(link, NULL); - up_write(&link->lgr->llc_conf_mutex); + if (link->lgr->max_links > 1) { + down_write(&link->lgr->llc_conf_mutex); + /* initial contact - try to establish second link */ + smc_llc_srv_add_link(link, NULL); + up_write(&link->lgr->llc_conf_mutex); + } return 0; } @@ -2494,6 +2499,11 @@ static void smc_listen_work(struct work_struct *work) goto out_decl; } + /* fce smc release version is needed in smc_listen_rdma_finish, + * so save fce info here. 
+ */ + smc_conn_save_peer_info_fce(new_smc, cclc); + /* finish worker */ if (!ini->is_smcd) { rc = smc_listen_rdma_finish(new_smc, cclc, diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index c88f5e2e65a2..8deb46c28f1d 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -433,8 +433,10 @@ static int smc_clc_fill_fce(struct smc_clc_first_contact_ext_v2x *fce, } if (ini->release_nr >= SMC_RELEASE_1) { - if (!ini->is_smcd) + if (!ini->is_smcd) { fce->max_conns = ini->max_conns; + fce->max_links = ini->max_links; + } } out: @@ -942,6 +944,7 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) if (smcr_indicated(ini->smc_type_v2)) { memcpy(v2_ext->roce, ini->smcrv2.ib_gid_v2, SMC_GID_SIZE); v2_ext->max_conns = SMC_CONN_PER_LGR_PREFER; + v2_ext->max_links = SMC_LINKS_PER_LGR_MAX_PREFER; } pclc_base->hdr.length = htons(plen); @@ -1174,6 +1177,7 @@ int smc_clc_srv_v2x_features_validate(struct smc_clc_msg_proposal *pclc, struct smc_clc_v2_extension *pclc_v2_ext; ini->max_conns = SMC_CONN_PER_LGR_MAX; + ini->max_links = SMC_LINKS_ADD_LNK_MAX; if ((!(ini->smcd_version & SMC_V2) && !(ini->smcr_version & SMC_V2)) || ini->release_nr < SMC_RELEASE_1) @@ -1187,6 +1191,10 @@ int smc_clc_srv_v2x_features_validate(struct smc_clc_msg_proposal *pclc, ini->max_conns = min_t(u8, pclc_v2_ext->max_conns, SMC_CONN_PER_LGR_PREFER); if (ini->max_conns < SMC_CONN_PER_LGR_MIN) return SMC_CLC_DECL_MAXCONNERR; + + ini->max_links = min_t(u8, pclc_v2_ext->max_links, SMC_LINKS_PER_LGR_MAX_PREFER); + if (ini->max_links < SMC_LINKS_ADD_LNK_MIN) + return SMC_CLC_DECL_MAXLINKERR; } return 0; @@ -1205,6 +1213,11 @@ int smc_clc_clnt_v2x_features_validate(struct smc_clc_first_contact_ext *fce, if (fce_v2x->max_conns < SMC_CONN_PER_LGR_MIN) return SMC_CLC_DECL_MAXCONNERR; ini->max_conns = fce_v2x->max_conns; + + if (fce_v2x->max_links > SMC_LINKS_ADD_LNK_MAX || + fce_v2x->max_links < SMC_LINKS_ADD_LNK_MIN) + return SMC_CLC_DECL_MAXLINKERR; + ini->max_links = fce_v2x->max_links; } return 0; @@ -1233,6 +1246,8 @@ int smc_clc_v2x_features_confirm_check(struct smc_clc_msg_accept_confirm *cclc, if (!ini->is_smcd) { if (fce_v2x->max_conns != ini->max_conns) return SMC_CLC_DECL_MAXCONNERR; + if (fce_v2x->max_links != ini->max_links) + return SMC_CLC_DECL_MAXLINKERR; } return 0; diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 464b93b46047..c5c8e7db775a 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -47,6 +47,7 @@ #define SMC_CLC_DECL_NOUEID 0x03030008 /* peer sent no UEID */ #define SMC_CLC_DECL_RELEASEERR 0x03030009 /* release version negotiate failed */ #define SMC_CLC_DECL_MAXCONNERR 0x0303000a /* max connections negotiate failed */ +#define SMC_CLC_DECL_MAXLINKERR 0x0303000b /* max links negotiate failed */ #define SMC_CLC_DECL_MODEUNSUPP 0x03040000 /* smc modes do not match (R or D)*/ #define SMC_CLC_DECL_RMBE_EC 0x03050000 /* peer has eyecatcher in RMBE */ #define SMC_CLC_DECL_OPTUNSUPP 0x03060000 /* fastopen sockopt not supported */ @@ -136,7 +137,8 @@ struct smc_clc_v2_extension { struct smc_clnt_opts_area_hdr hdr; u8 roce[16]; /* RoCEv2 GID */ u8 max_conns; - u8 reserved[15]; + u8 max_links; + u8 reserved[14]; u8 user_eids[][SMC_MAX_EID_LEN]; }; @@ -239,7 +241,8 @@ struct smc_clc_first_contact_ext { struct smc_clc_first_contact_ext_v2x { struct smc_clc_first_contact_ext fce_v2_base; u8 max_conns; /* for SMC-R only */ - u8 reserved3[3]; + u8 max_links; /* for SMC-R only */ + u8 reserved3[2]; __be32 vendor_exp_options; u8 reserved4[8]; } __packed; /* format defined in diff --git 
a/net/smc/smc_core.c b/net/smc/smc_core.c index 8c69cbb70f6e..aae8d3f5c3cf 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -896,10 +896,12 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) memcpy(lgr->nexthop_mac, ini->smcrv2.nexthop_mac, ETH_ALEN); lgr->max_conns = ini->max_conns; + lgr->max_links = ini->max_links; } else { ibdev = ini->ib_dev; ibport = ini->ib_port; lgr->max_conns = SMC_CONN_PER_LGR_MAX; + lgr->max_links = SMC_LINKS_ADD_LNK_MAX; } memcpy(lgr->pnet_id, ibdev->pnetid[ibport - 1], SMC_MAX_PNETID_LEN); @@ -1666,6 +1668,9 @@ void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport) !rdma_dev_access_netns(smcibdev->ibdev, lgr->net)) continue; + if (lgr->type == SMC_LGR_SINGLE && lgr->max_links <= 1) + continue; + /* trigger local add link processing */ link = smc_llc_usable_link(lgr); if (link) diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 32b199477ef3..120027d40469 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -173,6 +173,15 @@ struct smc_link { */ #define SMC_LINKS_PER_LGR_MAX 3 #define SMC_SINGLE_LINK 0 +#define SMC_LINKS_ADD_LNK_MIN 1 /* min. # of links per link group */ +#define SMC_LINKS_ADD_LNK_MAX 2 /* max. # of links per link group, also is the + * default value for smc-r v1.0 and v2.0 + */ +#define SMC_LINKS_PER_LGR_MAX_PREFER 2 /* Preferred max links per link group used for + * SMC-R v2.1 and later negotiation, vendors or + * distrubutions may modify it to a value between + * 1-2 as needed. + */ /* tx/rx buffer list element for sndbufs list and rmbs list of a lgr */ struct smc_buf_desc { @@ -342,6 +351,8 @@ struct smc_link_group { struct net *net; u8 max_conns; /* max conn can be assigned to lgr */ + u8 max_links; + /* max links can be added in lgr */ }; struct { /* SMC-D */ u64 peer_gid; @@ -387,6 +398,7 @@ struct smc_init_info { u8 smc_type_v2; u8 release_nr; u8 max_conns; + u8 max_links; u8 first_contact_peer; u8 first_contact_local; unsigned short vlan_id; diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 5347b62f1518..018ce8133b02 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -59,8 +59,6 @@ struct smc_llc_msg_confirm_link { /* type 0x01 */ #define SMC_LLC_FLAG_ADD_LNK_REJ 0x40 #define SMC_LLC_REJ_RSN_NO_ALT_PATH 1 -#define SMC_LLC_ADD_LNK_MAX_LINKS 2 - struct smc_llc_msg_add_link { /* type 0x02 */ struct smc_llc_hdr hd; u8 sender_mac[ETH_ALEN]; @@ -472,10 +470,12 @@ int smc_llc_send_confirm_link(struct smc_link *link, hton24(confllc->sender_qp_num, link->roce_qp->qp_num); confllc->link_num = link->link_id; memcpy(confllc->link_uid, link->link_uid, SMC_LGR_ID_SIZE); - confllc->max_links = SMC_LLC_ADD_LNK_MAX_LINKS; + confllc->max_links = SMC_LINKS_ADD_LNK_MAX; if (link->lgr->smc_version == SMC_V2 && - link->lgr->peer_smc_release >= SMC_RELEASE_1) + link->lgr->peer_smc_release >= SMC_RELEASE_1) { confllc->max_conns = link->lgr->max_conns; + confllc->max_links = link->lgr->max_links; + } /* send llc message */ rc = smc_wr_tx_send(link, pend); put_out: @@ -1045,6 +1045,11 @@ int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry) goto out_reject; } + if (lgr->type == SMC_LGR_SINGLE && lgr->max_links <= 1) { + rc = 0; + goto out_reject; + } + ini->vlan_id = lgr->vlan_id; if (lgr->smc_version == SMC_V2) { ini->check_smcrv2 = true; @@ -1169,6 +1174,9 @@ static void smc_llc_cli_add_link_invite(struct smc_link *link, lgr->type == SMC_LGR_ASYMMETRIC_PEER) goto out; + if (lgr->type == SMC_LGR_SINGLE && lgr->max_links <= 1) + goto out; + ini = 
kzalloc(sizeof(*ini), GFP_KERNEL); if (!ini) goto out; @@ -1414,6 +1422,11 @@ int smc_llc_srv_add_link(struct smc_link *link, goto out; } + if (lgr->type == SMC_LGR_SINGLE && lgr->max_links <= 1) { + rc = 0; + goto out; + } + /* ignore client add link recommendation, start new flow */ ini->vlan_id = lgr->vlan_id; if (lgr->smc_version == SMC_V2) { -- cgit v1.2.3 From bbed596c74a527e0d0d30bc56732f26407f12d6e Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Thu, 17 Aug 2023 21:20:32 +0800 Subject: net/smc: Extend SMCR v2 linkgroup netlink attribute Add SMC_NLA_LGR_R_V2_MAX_CONNS and SMC_NLA_LGR_R_V2_MAX_LINKS to SMCR v2 linkgroup netlink attribute SMC_NLA_LGR_R_V2 for linkgroup's detail info showing. Signed-off-by: Guangguan Wang Reviewed-by: Jan Karcher Signed-off-by: David S. Miller --- include/uapi/linux/smc.h | 2 ++ net/smc/smc_core.c | 4 ++++ 2 files changed, 6 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/smc.h b/include/uapi/linux/smc.h index bb4dacca31e7..837fcd4b0abc 100644 --- a/include/uapi/linux/smc.h +++ b/include/uapi/linux/smc.h @@ -107,6 +107,8 @@ enum { enum { SMC_NLA_LGR_R_V2_UNSPEC, SMC_NLA_LGR_R_V2_DIRECT, /* u8 */ + SMC_NLA_LGR_R_V2_MAX_CONNS, /* u8 */ + SMC_NLA_LGR_R_V2_MAX_LINKS, /* u8 */ __SMC_NLA_LGR_R_V2_MAX, SMC_NLA_LGR_R_V2_MAX = __SMC_NLA_LGR_R_V2_MAX - 1 }; diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index aae8d3f5c3cf..bd01dd31e4bd 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -319,6 +319,10 @@ static int smc_nl_fill_smcr_lgr_v2(struct smc_link_group *lgr, goto errattr; if (nla_put_u8(skb, SMC_NLA_LGR_R_V2_DIRECT, !lgr->uses_gateway)) goto errv2attr; + if (nla_put_u8(skb, SMC_NLA_LGR_R_V2_MAX_CONNS, lgr->max_conns)) + goto errv2attr; + if (nla_put_u8(skb, SMC_NLA_LGR_R_V2_MAX_LINKS, lgr->max_links)) + goto errv2attr; nla_nest_end(skb, v2_attrs); return 0; -- cgit v1.2.3 From 4025d3e73abde4f65f4b04d4b1d8449b00e31473 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 18 Aug 2023 09:40:39 +0000 Subject: net: add skb_queue_purge_reason and __skb_queue_purge_reason skb_queue_purge() and __skb_queue_purge() become wrappers around the new generic functions. New SKB_DROP_REASON_QUEUE_PURGE drop reason is added, but users can start adding more specific reasons. Signed-off-by: Eric Dumazet Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/linux/skbuff.h | 23 +++++++++++++++++++---- include/net/dropreason-core.h | 3 +++ net/core/skbuff.c | 11 +++++++---- 3 files changed, 29 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index aa57e2eca33b..9aec136bc690 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3149,20 +3149,35 @@ static inline int skb_orphan_frags_rx(struct sk_buff *skb, gfp_t gfp_mask) } /** - * __skb_queue_purge - empty a list + * __skb_queue_purge_reason - empty a list * @list: list to empty + * @reason: drop reason * * Delete all buffers on an &sk_buff list. Each buffer is removed from * the list and one reference dropped. This function does not take the * list lock and the caller must hold the relevant locks to use it. 
*/ -static inline void __skb_queue_purge(struct sk_buff_head *list) +static inline void __skb_queue_purge_reason(struct sk_buff_head *list, + enum skb_drop_reason reason) { struct sk_buff *skb; + while ((skb = __skb_dequeue(list)) != NULL) - kfree_skb(skb); + kfree_skb_reason(skb, reason); +} + +static inline void __skb_queue_purge(struct sk_buff_head *list) +{ + __skb_queue_purge_reason(list, SKB_DROP_REASON_QUEUE_PURGE); +} + +void skb_queue_purge_reason(struct sk_buff_head *list, + enum skb_drop_reason reason); + +static inline void skb_queue_purge(struct sk_buff_head *list) +{ + skb_queue_purge_reason(list, SKB_DROP_REASON_QUEUE_PURGE); } -void skb_queue_purge(struct sk_buff_head *list); unsigned int skb_rbtree_purge(struct rb_root *root); diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index f291a3b0f9e5..a587e83fc169 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -79,6 +79,7 @@ FN(IPV6_NDISC_BAD_CODE) \ FN(IPV6_NDISC_BAD_OPTIONS) \ FN(IPV6_NDISC_NS_OTHERHOST) \ + FN(QUEUE_PURGE) \ FNe(MAX) /** @@ -342,6 +343,8 @@ enum skb_drop_reason { * for another host. */ SKB_DROP_REASON_IPV6_NDISC_NS_OTHERHOST, + /** @SKB_DROP_REASON_QUEUE_PURGE: bulk free. */ + SKB_DROP_REASON_QUEUE_PURGE, /** * @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which * shouldn't be used as a real 'reason' - only for tracing code gen diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 9b5bc621daf7..117d36b1fae3 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3701,20 +3701,23 @@ struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list) EXPORT_SYMBOL(skb_dequeue_tail); /** - * skb_queue_purge - empty a list + * skb_queue_purge_reason - empty a list * @list: list to empty + * @reason: drop reason * * Delete all buffers on an &sk_buff list. Each buffer is removed from * the list and one reference dropped. This function takes the list * lock and is atomic with respect to other list locking functions. */ -void skb_queue_purge(struct sk_buff_head *list) +void skb_queue_purge_reason(struct sk_buff_head *list, + enum skb_drop_reason reason) { struct sk_buff *skb; + while ((skb = skb_dequeue(list)) != NULL) - kfree_skb(skb); + kfree_skb_reason(skb, reason); } -EXPORT_SYMBOL(skb_queue_purge); +EXPORT_SYMBOL(skb_queue_purge_reason); /** * skb_rbtree_purge - empty a skb rbtree -- cgit v1.2.3 From 0f158b32a9b146c9b86783efccfd9ed02c744623 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 18 Aug 2023 17:41:45 +0000 Subject: net: selectively purge error queue in IP_RECVERR / IPV6_RECVERR Setting IP_RECVERR and IPV6_RECVERR options to zero currently purges the socket error queue, which was probably not expected for zerocopy and tx_timestamp users. I discovered this issue while preparing commit 6b5f43ea0815 ("inet: move inet->recverr to inet->inet_flags"), I presume this change does not need to be backported to stable kernels. Add skb_errqueue_purge() helper to purge error messages only. Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Cc: Soheil Hassas Yeganeh Reviewed-by: Willem de Bruijn Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 1 + net/core/skbuff.c | 21 +++++++++++++++++++++ net/ipv4/ip_sockglue.c | 2 +- net/ipv6/ipv6_sockglue.c | 2 +- 4 files changed, 24 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 9aec136bc690..4174c4b82d13 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3180,6 +3180,7 @@ static inline void skb_queue_purge(struct sk_buff_head *list) } unsigned int skb_rbtree_purge(struct rb_root *root); +void skb_errqueue_purge(struct sk_buff_head *list); void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 117d36b1fae3..faa6c86da2a5 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3745,6 +3745,27 @@ unsigned int skb_rbtree_purge(struct rb_root *root) return sum; } +void skb_errqueue_purge(struct sk_buff_head *list) +{ + struct sk_buff *skb, *next; + struct sk_buff_head kill; + unsigned long flags; + + __skb_queue_head_init(&kill); + + spin_lock_irqsave(&list->lock, flags); + skb_queue_walk_safe(list, skb, next) { + if (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ZEROCOPY || + SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING) + continue; + __skb_unlink(skb, list); + __skb_queue_tail(&kill, skb); + } + spin_unlock_irqrestore(&list->lock, flags); + __skb_queue_purge(&kill); +} +EXPORT_SYMBOL(skb_errqueue_purge); + /** * skb_queue_head - queue a buffer at the list head * @list: list to use diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 61b2e7bc7031..54ad0f0d5c2d 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -976,7 +976,7 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname, case IP_RECVERR: inet_assign_bit(RECVERR, sk, val); if (!val) - skb_queue_purge(&sk->sk_error_queue); + skb_errqueue_purge(&sk->sk_error_queue); return 0; case IP_RECVERR_RFC4884: if (val < 0 || val > 1) diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index d19577a94bcc..0e2a0847b387 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -923,7 +923,7 @@ done: goto e_inval; np->recverr = valbool; if (!val) - skb_queue_purge(&sk->sk_error_queue); + skb_errqueue_purge(&sk->sk_error_queue); retv = 0; break; case IPV6_FLOWINFO_SEND: -- cgit v1.2.3 From 5cb249686e67dbef3ffe53887fa725eefc5a7144 Mon Sep 17 00:00:00 2001 From: Patrick Rohr Date: Fri, 18 Aug 2023 11:22:49 -0700 Subject: net: release reference to inet6_dev pointer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit addrconf_prefix_rcv returned early without releasing the inet6_dev pointer when the PIO lifetime is less than accept_ra_min_lft. Fixes: 5027d54a9c30 ("net: change accept_ra_min_rtr_lft to affect all RA lifetimes") Cc: Maciej Å»enczykowski Cc: Lorenzo Colitti Cc: David Ahern Cc: Simon Horman Reviewed-by: Simon Horman Reviewed-by: Maciej Å»enczykowski Signed-off-by: Patrick Rohr Reviewed-by: Leon Romanovsky Signed-off-by: David S. 
Miller --- net/ipv6/addrconf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 5184bd0ceb12..47d1dd8501b7 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2742,7 +2742,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) } if (valid_lft != 0 && valid_lft < in6_dev->cnf.accept_ra_min_lft) - return; + goto put; /* * Two things going on here: -- cgit v1.2.3 From b358f57f7db620f5c25e60d70015f4e4bb050688 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 18 Aug 2023 16:29:01 +0800 Subject: ipv6: do not match device when remove source route After deleting an IPv6 address on an interface and cleaning up the related preferred source entries, it is important to ensure that all routes associated with the deleted address are properly cleared. The current implementation of rt6_remove_prefsrc() only checks the preferred source addresses bound to the current device. However, there may be routes that are bound to other devices but still utilize the same preferred source address. To address this issue, it is necessary to also delete entries that are bound to other interfaces but share the same source address with the current device. Failure to delete these entries would leave routes that are bound to the deleted address unclear. Here is an example reproducer (I have omitted unrelated routes): + ip link add dummy1 type dummy + ip link add dummy2 type dummy + ip link set dummy1 up + ip link set dummy2 up + ip addr add 1:2:3:4::5/64 dev dummy1 + ip route add 7:7:7:0::1 dev dummy1 src 1:2:3:4::5 + ip route add 7:7:7:0::2 dev dummy2 src 1:2:3:4::5 + ip -6 route show 1:2:3:4::/64 dev dummy1 proto kernel metric 256 pref medium 7:7:7::1 dev dummy1 src 1:2:3:4::5 metric 1024 pref medium 7:7:7::2 dev dummy2 src 1:2:3:4::5 metric 1024 pref medium + ip addr del 1:2:3:4::5/64 dev dummy1 + ip -6 route show 7:7:7::1 dev dummy1 metric 1024 pref medium 7:7:7::2 dev dummy2 src 1:2:3:4::5 metric 1024 pref medium As Ido reminds, in IPv6, the preferred source address is looked up in the same VRF as the first nexthop device, which is different with IPv4. So, while removing the device checking, we also need to add an ipv6_chk_addr() check to make sure the address does not exist on the other devices of the rt nexthop device's VRF. After fix: + ip addr del 1:2:3:4::5/64 dev dummy1 + ip -6 route show 7:7:7::1 dev dummy1 metric 1024 pref medium 7:7:7::2 dev dummy2 metric 1024 pref medium Reported-by: Thomas Haller Closes: https://bugzilla.redhat.com/show_bug.cgi?id=2170513 Reviewed-by: Ido Schimmel Signed-off-by: Hangbin Liu Reviewed-by: David Ahern Signed-off-by: David S. 
Miller --- net/ipv6/route.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index db10c36f34bb..a5b74b91c8f3 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -4582,21 +4582,19 @@ struct fib6_info *addrconf_f6i_alloc(struct net *net, /* remove deleted ip from prefsrc entries */ struct arg_dev_net_ip { - struct net_device *dev; struct net *net; struct in6_addr *addr; }; static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg) { - struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev; struct net *net = ((struct arg_dev_net_ip *)arg)->net; struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; if (!rt->nh && - ((void *)rt->fib6_nh->fib_nh_dev == dev || !dev) && rt != net->ipv6.fib6_null_entry && - ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) { + ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr) && + !ipv6_chk_addr(net, addr, rt->fib6_nh->fib_nh_dev, 0)) { spin_lock_bh(&rt6_exception_lock); /* remove prefsrc entry */ rt->fib6_prefsrc.plen = 0; @@ -4609,7 +4607,6 @@ void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) { struct net *net = dev_net(ifp->idev->dev); struct arg_dev_net_ip adni = { - .dev = ifp->idev->dev, .net = net, .addr = &ifp->addr, }; -- cgit v1.2.3 From b4672c733713f3bc9029c83efa7a2f1ef42ddf5b Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 18 Aug 2023 16:25:23 +0800 Subject: IPv4: add extack info for IPv4 address add/delete Add extack info for IPv4 address add/delete, which would be useful for users to understand the problem without having to read kernel code. No extack message for the ifa_local checking in __inet_insert_ifa() as it has been checked in find_matching_ifa(). Suggested-by: Ido Schimmel Signed-off-by: Hangbin Liu Reviewed-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. 
Miller --- net/ipv4/devinet.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 5deac0517ef7..c3658b8755bc 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -509,6 +509,7 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, return -EEXIST; } if (ifa1->ifa_scope != ifa->ifa_scope) { + NL_SET_ERR_MSG(extack, "ipv4: Invalid scope value"); inet_free_ifa(ifa); return -EINVAL; } @@ -664,6 +665,7 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, ifm = nlmsg_data(nlh); in_dev = inetdev_by_index(net, ifm->ifa_index); if (!in_dev) { + NL_SET_ERR_MSG(extack, "ipv4: Device not found"); err = -ENODEV; goto errout; } @@ -688,6 +690,7 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, return 0; } + NL_SET_ERR_MSG(extack, "ipv4: Address not found"); err = -EADDRNOTAVAIL; errout: return err; @@ -839,13 +842,23 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh, ifm = nlmsg_data(nlh); err = -EINVAL; - if (ifm->ifa_prefixlen > 32 || !tb[IFA_LOCAL]) + + if (ifm->ifa_prefixlen > 32) { + NL_SET_ERR_MSG(extack, "ipv4: Invalid prefix length"); + goto errout; + } + + if (!tb[IFA_LOCAL]) { + NL_SET_ERR_MSG(extack, "ipv4: Local address is not supplied"); goto errout; + } dev = __dev_get_by_index(net, ifm->ifa_index); err = -ENODEV; - if (!dev) + if (!dev) { + NL_SET_ERR_MSG(extack, "ipv4: Device not found"); goto errout; + } in_dev = __in_dev_get_rtnl(dev); err = -ENOBUFS; @@ -897,6 +910,7 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh, ci = nla_data(tb[IFA_CACHEINFO]); if (!ci->ifa_valid || ci->ifa_prefered > ci->ifa_valid) { + NL_SET_ERR_MSG(extack, "ipv4: address lifetime invalid"); err = -EINVAL; goto errout_free; } @@ -954,6 +968,7 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, int ret = ip_mc_autojoin_config(net, true, ifa); if (ret < 0) { + NL_SET_ERR_MSG(extack, "ipv4: Multicast auto join failed"); inet_free_ifa(ifa); return ret; } @@ -967,8 +982,10 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, inet_free_ifa(ifa); if (nlh->nlmsg_flags & NLM_F_EXCL || - !(nlh->nlmsg_flags & NLM_F_REPLACE)) + !(nlh->nlmsg_flags & NLM_F_REPLACE)) { + NL_SET_ERR_MSG(extack, "ipv4: Address already assigned"); return -EEXIST; + } ifa = ifa_existing; if (ifa->ifa_rt_priority != new_metric) { -- cgit v1.2.3 From bc1fb82ae11753c5dec53c667a055dc37796dbd2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 19 Aug 2023 04:06:46 +0000 Subject: net: annotate data-races around sk->sk_lingertime sk_getsockopt() runs locklessly. This means sk->sk_lingertime can be read while other threads are changing its value. Other reads also happen without socket lock being held, and must be annotated. Remove preprocessor logic using BITS_PER_LONG, compilers are smart enough to figure this by themselves. v2: fixed a clang W=1 (-Wtautological-constant-out-of-range-compare) warning (Jakub) Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/bluetooth/iso.c | 2 +- net/bluetooth/sco.c | 2 +- net/core/sock.c | 18 +++++++++--------- net/sched/em_meta.c | 2 +- net/smc/af_smc.c | 2 +- 5 files changed, 13 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index 6b66d6a88b9a..3c03e49422c7 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -1475,7 +1475,7 @@ static int iso_sock_release(struct socket *sock) iso_sock_close(sk); - if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime && + if (sock_flag(sk, SOCK_LINGER) && READ_ONCE(sk->sk_lingertime) && !(current->flags & PF_EXITING)) { lock_sock(sk); err = bt_sock_wait_state(sk, BT_CLOSED, sk->sk_lingertime); diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 50ad5935ae47..c736186aba26 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -1245,7 +1245,7 @@ static int sco_sock_release(struct socket *sock) sco_sock_close(sk); - if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime && + if (sock_flag(sk, SOCK_LINGER) && READ_ONCE(sk->sk_lingertime) && !(current->flags & PF_EXITING)) { lock_sock(sk); err = bt_sock_wait_state(sk, BT_CLOSED, sk->sk_lingertime); diff --git a/net/core/sock.c b/net/core/sock.c index a0722954e4a8..666a17cab4f5 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -797,7 +797,7 @@ EXPORT_SYMBOL(sock_set_reuseport); void sock_no_linger(struct sock *sk) { lock_sock(sk); - sk->sk_lingertime = 0; + WRITE_ONCE(sk->sk_lingertime, 0); sock_set_flag(sk, SOCK_LINGER); release_sock(sk); } @@ -1230,15 +1230,15 @@ set_sndbuf: ret = -EFAULT; break; } - if (!ling.l_onoff) + if (!ling.l_onoff) { sock_reset_flag(sk, SOCK_LINGER); - else { -#if (BITS_PER_LONG == 32) - if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ) - sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT; + } else { + unsigned long t_sec = ling.l_linger; + + if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ) + WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT); else -#endif - sk->sk_lingertime = (unsigned int)ling.l_linger * HZ; + WRITE_ONCE(sk->sk_lingertime, t_sec * HZ); sock_set_flag(sk, SOCK_LINGER); } break; @@ -1692,7 +1692,7 @@ int sk_getsockopt(struct sock *sk, int level, int optname, case SO_LINGER: lv = sizeof(v.ling); v.ling.l_onoff = sock_flag(sk, SOCK_LINGER); - v.ling.l_linger = sk->sk_lingertime / HZ; + v.ling.l_linger = READ_ONCE(sk->sk_lingertime) / HZ; break; case SO_BSDCOMPAT: diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index 6fdba069f6bf..da34fd4c9269 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -502,7 +502,7 @@ META_COLLECTOR(int_sk_lingertime) *err = -1; return; } - dst->value = sk->sk_lingertime / HZ; + dst->value = READ_ONCE(sk->sk_lingertime) / HZ; } META_COLLECTOR(int_sk_err_qlen) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index c4f664fb2bcb..bacdd971615e 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1840,7 +1840,7 @@ void smc_close_non_accepted(struct sock *sk) lock_sock(sk); if (!sk->sk_lingertime) /* wait for peer closing */ - sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT; + WRITE_ONCE(sk->sk_lingertime, SMC_MAX_STREAM_WAIT_TIMEOUT); __smc_release(smc); release_sock(sk); sock_put(sk); /* sock_hold above */ -- cgit v1.2.3 From 93ca82447c3eab149b5b2588ab8bd9aa2eacef7c Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 17 Aug 2023 14:15:23 -0700 Subject: wifi: cfg80211: Annotate struct cfg80211_acl_data with __counted_by Prepare for the coming implementation by GCC and Clang of the __counted_by attribute. 
Flexible array members annotated with __counted_by can have their accesses bounds-checked at run-time checking via CONFIG_UBSAN_BOUNDS (for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family functions). As found with Coccinelle[1], add __counted_by for struct cfg80211_acl_data. Additionally, since the element count member must be set before accessing the annotated flexible array member, move its initialization earlier. [1] https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci Cc: Johannes Berg Cc: "David S. Miller" Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: linux-wireless@vger.kernel.org Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Reviewed-by: Justin Stitt Reviewed-by: Gustavo A. R. Silva Reviewed-by: Jeff Johnson Link: https://lore.kernel.org/r/20230817211531.4193219-1-keescook@chromium.org Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 +- net/wireless/nl80211.c | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index d6fa7c8767ad..eb73b5af5d04 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1282,7 +1282,7 @@ struct cfg80211_acl_data { int n_acl_entries; /* Keep it last */ - struct mac_address mac_addrs[]; + struct mac_address mac_addrs[] __counted_by(n_acl_entries); }; /** diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 8bcf8e293308..80633e815311 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -4889,13 +4889,12 @@ static struct cfg80211_acl_data *parse_acl_data(struct wiphy *wiphy, acl = kzalloc(struct_size(acl, mac_addrs, n_entries), GFP_KERNEL); if (!acl) return ERR_PTR(-ENOMEM); + acl->n_acl_entries = n_entries; nla_for_each_nested(attr, info->attrs[NL80211_ATTR_MAC_ADDRS], tmp) { memcpy(acl->mac_addrs[i].addr, nla_data(attr), ETH_ALEN); i++; } - - acl->n_acl_entries = n_entries; acl->acl_policy = acl_policy; return acl; -- cgit v1.2.3 From d4d3aaf25a6600d58774978cea9068974a8ea674 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 17 Aug 2023 14:15:24 -0700 Subject: wifi: cfg80211: Annotate struct cfg80211_cqm_config with __counted_by Prepare for the coming implementation by GCC and Clang of the __counted_by attribute. Flexible array members annotated with __counted_by can have their accesses bounds-checked at run-time checking via CONFIG_UBSAN_BOUNDS (for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family functions). As found with Coccinelle[1], add __counted_by for struct cfg80211_cqm_config. [1] https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci Cc: Johannes Berg Cc: "David S. Miller" Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: linux-wireless@vger.kernel.org Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Reviewed-by: Gustavo A. R. 
Silva Reviewed-by: Jeff Johnson Link: https://lore.kernel.org/r/20230817211531.4193219-2-keescook@chromium.org Signed-off-by: Johannes Berg --- net/wireless/core.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/core.h b/net/wireless/core.h index 8a807b609ef7..507d184b8b40 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -298,7 +298,7 @@ struct cfg80211_cqm_config { u32 rssi_hyst; s32 last_rssi_event_value; int n_rssi_thresholds; - s32 rssi_thresholds[]; + s32 rssi_thresholds[] __counted_by(n_rssi_thresholds); }; void cfg80211_destroy_ifaces(struct cfg80211_registered_device *rdev); -- cgit v1.2.3 From c14679d7005a4d26289eadb4f5fe499aca78ef7c Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 17 Aug 2023 14:15:25 -0700 Subject: wifi: cfg80211: Annotate struct cfg80211_mbssid_elems with __counted_by Prepare for the coming implementation by GCC and Clang of the __counted_by attribute. Flexible array members annotated with __counted_by can have their accesses bounds-checked at run-time checking via CONFIG_UBSAN_BOUNDS (for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family functions). As found with Coccinelle[1], add __counted_by for struct cfg80211_mbssid_elems. Additionally, since the element count member must be set before accessing the annotated flexible array member, move its initialization earlier. [1] https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci Cc: Johannes Berg Cc: "David S. Miller" Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: linux-wireless@vger.kernel.org Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Reviewed-by: Gustavo A. R. Silva Reviewed-by: Jeff Johnson Link: https://lore.kernel.org/r/20230817211531.4193219-3-keescook@chromium.org Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 +- net/wireless/nl80211.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index eb73b5af5d04..5c7d091b3925 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1187,7 +1187,7 @@ struct cfg80211_mbssid_elems { struct { const u8 *data; size_t len; - } elem[]; + } elem[] __counted_by(cnt); }; /** diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 80633e815311..9ba4266368db 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -5438,13 +5438,13 @@ nl80211_parse_mbssid_elems(struct wiphy *wiphy, struct nlattr *attrs) elems = kzalloc(struct_size(elems, elem, num_elems), GFP_KERNEL); if (!elems) return ERR_PTR(-ENOMEM); + elems->cnt = num_elems; nla_for_each_nested(nl_elems, attrs, rem_elems) { elems->elem[i].data = nla_data(nl_elems); elems->elem[i].len = nla_len(nl_elems); i++; } - elems->cnt = num_elems; return elems; } -- cgit v1.2.3 From 342bc7c9e877847d602c4a67c3135051df07cc4d Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 17 Aug 2023 14:15:26 -0700 Subject: wifi: cfg80211: Annotate struct cfg80211_pmsr_request with __counted_by Prepare for the coming implementation by GCC and Clang of the __counted_by attribute. Flexible array members annotated with __counted_by can have their accesses bounds-checked at run-time checking via CONFIG_UBSAN_BOUNDS (for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family functions). As found with Coccinelle[1], add __counted_by for struct cfg80211_pmsr_request. 
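The pattern these annotations follow, sketched here with a made-up structure and field names rather than one of the cfg80211 structures touched by this series, is roughly:

	struct item_list {
		int n_items;			/* bound for the array below */
		u32 items[] __counted_by(n_items);
	};

	static struct item_list *item_list_alloc(int n, gfp_t gfp)
	{
		struct item_list *list;

		list = kzalloc(struct_size(list, items, n), gfp);
		if (!list)
			return NULL;
		/* the count must be valid before items[] is touched */
		list->n_items = n;
		return list;
	}
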
Additionally, since the element count member must be set before accessing the annotated flexible array member, move its initialization earlier. [1] https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci Cc: Johannes Berg Cc: "David S. Miller" Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: linux-wireless@vger.kernel.org Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Reviewed-by: Gustavo A. R. Silva Reviewed-by: Jeff Johnson Link: https://lore.kernel.org/r/20230817211531.4193219-4-keescook@chromium.org Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 +- net/wireless/pmsr.c | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 5c7d091b3925..e9ca4726a732 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -3948,7 +3948,7 @@ struct cfg80211_pmsr_request { struct list_head list; - struct cfg80211_pmsr_request_peer peers[]; + struct cfg80211_pmsr_request_peer peers[] __counted_by(n_peers); }; /** diff --git a/net/wireless/pmsr.c b/net/wireless/pmsr.c index 77000a264855..9611aa0bd051 100644 --- a/net/wireless/pmsr.c +++ b/net/wireless/pmsr.c @@ -291,6 +291,7 @@ int nl80211_pmsr_start(struct sk_buff *skb, struct genl_info *info) req = kzalloc(struct_size(req, peers, count), GFP_KERNEL); if (!req) return -ENOMEM; + req->n_peers = count; if (info->attrs[NL80211_ATTR_TIMEOUT]) req->timeout = nla_get_u32(info->attrs[NL80211_ATTR_TIMEOUT]); @@ -321,8 +322,6 @@ int nl80211_pmsr_start(struct sk_buff *skb, struct genl_info *info) goto out_err; idx++; } - - req->n_peers = count; req->cookie = cfg80211_assign_cookie(rdev); req->nl_portid = info->snd_portid; -- cgit v1.2.3 From 7b6d7087031b69f0694cbef4485616c905664feb Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 17 Aug 2023 14:15:27 -0700 Subject: wifi: cfg80211: Annotate struct cfg80211_rnr_elems with __counted_by Prepare for the coming implementation by GCC and Clang of the __counted_by attribute. Flexible array members annotated with __counted_by can have their accesses bounds-checked at run-time checking via CONFIG_UBSAN_BOUNDS (for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family functions). As found with Coccinelle[1], add __counted_by for struct cfg80211_rnr_elems. Additionally, since the element count member must be set before accessing the annotated flexible array member, move its initialization earlier. [1] https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci Cc: Johannes Berg Cc: "David S. Miller" Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: linux-wireless@vger.kernel.org Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Reviewed-by: Gustavo A. R. 
Silva Reviewed-by: Jeff Johnson Link: https://lore.kernel.org/r/20230817211531.4193219-5-keescook@chromium.org Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 +- net/wireless/nl80211.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index e9ca4726a732..6efe216c01d2 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1204,7 +1204,7 @@ struct cfg80211_rnr_elems { struct { const u8 *data; size_t len; - } elem[]; + } elem[] __counted_by(cnt); }; /** diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 9ba4266368db..0ffebf1a1eb6 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -5470,13 +5470,13 @@ nl80211_parse_rnr_elems(struct wiphy *wiphy, struct nlattr *attrs, elems = kzalloc(struct_size(elems, elem, num_elems), GFP_KERNEL); if (!elems) return ERR_PTR(-ENOMEM); + elems->cnt = num_elems; nla_for_each_nested(nl_elems, attrs, rem_elems) { elems->elem[i].data = nla_data(nl_elems); elems->elem[i].len = nla_len(nl_elems); i++; } - elems->cnt = num_elems; return elems; } -- cgit v1.2.3 From 43c2817225fce05701f062a996255007481935e2 Mon Sep 17 00:00:00 2001 From: Zhengchao Shao Date: Mon, 21 Aug 2023 16:41:04 +0800 Subject: net: remove unnecessary input parameter 'how' in ifdown function When the ifdown function in the dst_ops structure is referenced, the input parameter 'how' is always true. In the current implementation of the ifdown interface, ip6_dst_ifdown does not use the input parameter 'how', xfrm6_dst_ifdown and xfrm4_dst_ifdown functions use the input parameter 'unregister'. But false judgment on 'unregister' in xfrm6_dst_ifdown and xfrm4_dst_ifdown is false, so remove the input parameter 'how' in ifdown function. 
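The resulting change to the callback prototype in struct dst_ops (the full diff is below) boils down to:

	/* before */
	void (*ifdown)(struct dst_entry *, struct net_device *dev, int how);

	/* after */
	void (*ifdown)(struct dst_entry *, struct net_device *dev);

and the call site in dst_dev_put() simply drops the always-true last argument.
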
Signed-off-by: Zhengchao Shao Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230821084104.3812233-1-shaozhengchao@huawei.com Signed-off-by: Paolo Abeni --- include/net/dst_ops.h | 2 +- net/core/dst.c | 2 +- net/ipv4/xfrm4_policy.c | 11 +---------- net/ipv6/route.c | 5 ++--- net/ipv6/xfrm6_policy.c | 6 +----- 5 files changed, 6 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/include/net/dst_ops.h b/include/net/dst_ops.h index 632086b2f644..6d1c8541183d 100644 --- a/include/net/dst_ops.h +++ b/include/net/dst_ops.h @@ -23,7 +23,7 @@ struct dst_ops { u32 * (*cow_metrics)(struct dst_entry *, unsigned long); void (*destroy)(struct dst_entry *); void (*ifdown)(struct dst_entry *, - struct net_device *dev, int how); + struct net_device *dev); struct dst_entry * (*negative_advice)(struct dst_entry *); void (*link_failure)(struct sk_buff *); void (*update_pmtu)(struct dst_entry *dst, struct sock *sk, diff --git a/net/core/dst.c b/net/core/dst.c index 79d9306ad1ee..980e2fd2f013 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -152,7 +152,7 @@ void dst_dev_put(struct dst_entry *dst) dst->obsolete = DST_OBSOLETE_DEAD; if (dst->ops->ifdown) - dst->ops->ifdown(dst, dev, true); + dst->ops->ifdown(dst, dev); dst->input = dst_discard; dst->output = dst_discard_out; dst->dev = blackhole_netdev; diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 9403bbaf1b61..cdcc0f6b4f0a 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -124,22 +124,13 @@ static void xfrm4_dst_destroy(struct dst_entry *dst) xfrm_dst_destroy(xdst); } -static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, - int unregister) -{ - if (!unregister) - return; - - xfrm_dst_ifdown(dst, dev); -} - static struct dst_ops xfrm4_dst_ops_template = { .family = AF_INET, .update_pmtu = xfrm4_update_pmtu, .redirect = xfrm4_redirect, .cow_metrics = dst_cow_metrics_generic, .destroy = xfrm4_dst_destroy, - .ifdown = xfrm4_dst_ifdown, + .ifdown = xfrm_dst_ifdown, .local_out = __ip_local_out, .gc_thresh = 32768, }; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index a5b74b91c8f3..846aec8e0093 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -90,7 +90,7 @@ unsigned int ip6_mtu(const struct dst_entry *dst); static struct dst_entry *ip6_negative_advice(struct dst_entry *); static void ip6_dst_destroy(struct dst_entry *); static void ip6_dst_ifdown(struct dst_entry *, - struct net_device *dev, int how); + struct net_device *dev); static void ip6_dst_gc(struct dst_ops *ops); static int ip6_pkt_discard(struct sk_buff *skb); @@ -371,8 +371,7 @@ static void ip6_dst_destroy(struct dst_entry *dst) fib6_info_release(from); } -static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, - int how) +static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev) { struct rt6_info *rt = (struct rt6_info *)dst; struct inet6_dev *idev = rt->rt6i_idev; diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index eecc5e59da17..188224a76685 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -124,14 +124,10 @@ static void xfrm6_dst_destroy(struct dst_entry *dst) xfrm_dst_destroy(xdst); } -static void xfrm6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, - int unregister) +static void xfrm6_dst_ifdown(struct dst_entry *dst, struct net_device *dev) { struct xfrm_dst *xdst; - if (!unregister) - return; - xdst = (struct xfrm_dst *)dst; if (xdst->u.rt6.rt6i_idev->dev == dev) { struct inet6_dev *loopback_idev = -- cgit v1.2.3 From 
19e4a47ee74718a22e963e8a647c8c3bfe8bb05c Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 15 Aug 2023 17:51:05 +0200 Subject: wifi: mac80211: check S1G action frame size Before checking the action code, check that it even exists in the frame. Reported-by: syzbot+be9c824e6f269d608288@syzkaller.appspotmail.com Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 4f707d2a160f..33f9764b94de 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -3732,6 +3732,10 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) break; goto queue; case WLAN_CATEGORY_S1G: + if (len < offsetofend(typeof(*mgmt), + u.action.u.s1g.action_code)) + break; + switch (mgmt->u.action.u.s1g.action_code) { case WLAN_S1G_TWT_SETUP: case WLAN_S1G_TWT_TEARDOWN: -- cgit v1.2.3 From a7ed3465daa240bdf01a5420f64336fee879c09d Mon Sep 17 00:00:00 2001 From: "GONG, Ruiqi" Date: Wed, 9 Aug 2023 15:45:03 +0800 Subject: netfilter: ebtables: fix fortify warnings in size_entry_mwt() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When compiling with gcc 13 and CONFIG_FORTIFY_SOURCE=y, the following warning appears: In function ‘fortify_memcpy_chk’, inlined from ‘size_entry_mwt’ at net/bridge/netfilter/ebtables.c:2118:2: ./include/linux/fortify-string.h:592:25: error: call to ‘__read_overflow2_field’ declared with attribute warning: detected read beyond size of field (2nd parameter); maybe use struct_group()? [-Werror=attribute-warning] 592 | __read_overflow2_field(q_size_field, size); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The compiler is complaining: memcpy(&offsets[1], &entry->watchers_offset, sizeof(offsets) - sizeof(offsets[0])); where memcpy reads beyong &entry->watchers_offset to copy {watchers,target,next}_offset altogether into offsets[]. Silence the warning by wrapping these three up via struct_group(). Signed-off-by: GONG, Ruiqi Reviewed-by: Gustavo A. R. 
Silva Reviewed-by: Kees Cook Signed-off-by: Florian Westphal --- include/uapi/linux/netfilter_bridge/ebtables.h | 14 ++++++++------ net/bridge/netfilter/ebtables.c | 3 +-- 2 files changed, 9 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/netfilter_bridge/ebtables.h b/include/uapi/linux/netfilter_bridge/ebtables.h index a494cf43a755..b0caad82b693 100644 --- a/include/uapi/linux/netfilter_bridge/ebtables.h +++ b/include/uapi/linux/netfilter_bridge/ebtables.h @@ -182,12 +182,14 @@ struct ebt_entry { unsigned char sourcemsk[ETH_ALEN]; unsigned char destmac[ETH_ALEN]; unsigned char destmsk[ETH_ALEN]; - /* sizeof ebt_entry + matches */ - unsigned int watchers_offset; - /* sizeof ebt_entry + matches + watchers */ - unsigned int target_offset; - /* sizeof ebt_entry + matches + watchers + target */ - unsigned int next_offset; + __struct_group(/* no tag */, offsets, /* no attrs */, + /* sizeof ebt_entry + matches */ + unsigned int watchers_offset; + /* sizeof ebt_entry + matches + watchers */ + unsigned int target_offset; + /* sizeof ebt_entry + matches + watchers + target */ + unsigned int next_offset; + ); unsigned char elems[0] __attribute__ ((aligned (__alignof__(struct ebt_replace)))); }; diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 757ec46fc45a..aa23479b20b2 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -2115,8 +2115,7 @@ static int size_entry_mwt(const struct ebt_entry *entry, const unsigned char *ba return ret; offsets[0] = sizeof(struct ebt_entry); /* matches come first */ - memcpy(&offsets[1], &entry->watchers_offset, - sizeof(offsets) - sizeof(offsets[0])); + memcpy(&offsets[1], &entry->offsets, sizeof(entry->offsets)); if (state->buf_kern_start) { buf_start = state->buf_kern_start + state->buf_kern_offset; -- cgit v1.2.3 From e53314034b2374cfe9a426a5edab51e30cfe8f02 Mon Sep 17 00:00:00 2001 From: Justin Stitt Date: Wed, 9 Aug 2023 01:06:04 +0000 Subject: netfilter: ipset: refactor deprecated strncpy Use `strscpy_pad` instead of `strncpy`. 
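A minimal sketch (not from the patch) of the behavioural difference these conversions rely on; example_copy_set_name() and 'src' are placeholders:

#include <linux/string.h>
#include <linux/netfilter/ipset/ip_set.h>      /* IPSET_MAXNAMELEN */

static void example_copy_set_name(char name[IPSET_MAXNAMELEN], const char *src)
{
        /* strncpy() copies at most IPSET_MAXNAMELEN bytes and leaves 'name'
         * without a NUL terminator when 'src' is that long or longer;
         * strscpy_pad() always NUL-terminates (truncating if necessary) and
         * zero-fills the rest of the buffer, preserving strncpy's padding.
         */
        strscpy_pad(name, src, IPSET_MAXNAMELEN);
}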
Link: https://github.com/KSPP/linux/issues/90 Cc: linux-hardening@vger.kernel.org Signed-off-by: Justin Stitt Signed-off-by: Florian Westphal --- net/netfilter/ipset/ip_set_core.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 0b68e2e2824e..e564b5174261 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -872,7 +872,7 @@ ip_set_name_byindex(struct net *net, ip_set_id_t index, char *name) BUG_ON(!set); read_lock_bh(&ip_set_ref_lock); - strncpy(name, set->name, IPSET_MAXNAMELEN); + strscpy_pad(name, set->name, IPSET_MAXNAMELEN); read_unlock_bh(&ip_set_ref_lock); } EXPORT_SYMBOL_GPL(ip_set_name_byindex); @@ -1326,7 +1326,7 @@ static int ip_set_rename(struct sk_buff *skb, const struct nfnl_info *info, goto out; } } - strncpy(set->name, name2, IPSET_MAXNAMELEN); + strscpy_pad(set->name, name2, IPSET_MAXNAMELEN); out: write_unlock_bh(&ip_set_ref_lock); @@ -1380,9 +1380,9 @@ static int ip_set_swap(struct sk_buff *skb, const struct nfnl_info *info, return -EBUSY; } - strncpy(from_name, from->name, IPSET_MAXNAMELEN); - strncpy(from->name, to->name, IPSET_MAXNAMELEN); - strncpy(to->name, from_name, IPSET_MAXNAMELEN); + strscpy_pad(from_name, from->name, IPSET_MAXNAMELEN); + strscpy_pad(from->name, to->name, IPSET_MAXNAMELEN); + strscpy_pad(to->name, from_name, IPSET_MAXNAMELEN); swap(from->ref, to->ref); ip_set(inst, from_id) = to; -- cgit v1.2.3 From 6cdd75a4a66b7c9b2a2667d87619816d6e744d0e Mon Sep 17 00:00:00 2001 From: Justin Stitt Date: Wed, 9 Aug 2023 01:06:05 +0000 Subject: netfilter: nf_tables: refactor deprecated strncpy Prefer `strscpy_pad` over `strncpy`. Signed-off-by: Justin Stitt Signed-off-by: Florian Westphal --- net/netfilter/nft_ct.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index e87fd4314c68..86bb9d7797d9 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -108,7 +108,7 @@ static void nft_ct_get_eval(const struct nft_expr *expr, helper = rcu_dereference(help->helper); if (helper == NULL) goto err; - strncpy((char *)dest, helper->name, NF_CT_HELPER_NAME_LEN); + strscpy_pad((char *)dest, helper->name, NF_CT_HELPER_NAME_LEN); return; #ifdef CONFIG_NF_CONNTRACK_LABELS case NFT_CT_LABELS: { -- cgit v1.2.3 From 7457af8bf9948aeb31de21e4be3bc1fef2730958 Mon Sep 17 00:00:00 2001 From: Justin Stitt Date: Wed, 9 Aug 2023 01:06:06 +0000 Subject: netfilter: nf_tables: refactor deprecated strncpy Prefer `strscpy_pad` over `strncpy`. Signed-off-by: Justin Stitt Signed-off-by: Florian Westphal --- net/netfilter/nft_fib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nft_fib.c b/net/netfilter/nft_fib.c index 601c9e09d07a..04b51f285332 100644 --- a/net/netfilter/nft_fib.c +++ b/net/netfilter/nft_fib.c @@ -151,7 +151,7 @@ void nft_fib_store_result(void *reg, const struct nft_fib *priv, if (priv->flags & NFTA_FIB_F_PRESENT) *dreg = !!dev; else - strncpy(reg, dev ? dev->name : "", IFNAMSIZ); + strscpy_pad(reg, dev ? dev->name : "", IFNAMSIZ); break; default: WARN_ON_ONCE(1); -- cgit v1.2.3 From 6d87a4eae89e2ae015b40fd0adc90e9a7dc9fccb Mon Sep 17 00:00:00 2001 From: Justin Stitt Date: Wed, 9 Aug 2023 01:06:08 +0000 Subject: netfilter: nft_osf: refactor deprecated strncpy Use `strscpy_pad` over `strncpy` for NUL-terminated strings. 
We can also drop the + 1 from `NFT_OSF_MAXGENRELEN + 1` since `strscpy` will guarantee NUL-termination. Signed-off-by: Justin Stitt Signed-off-by: Florian Westphal --- net/netfilter/nft_osf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c index 70820c66b591..7f61506e5b44 100644 --- a/net/netfilter/nft_osf.c +++ b/net/netfilter/nft_osf.c @@ -23,7 +23,7 @@ static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs, struct nft_osf *priv = nft_expr_priv(expr); u32 *dest = ®s->data[priv->dreg]; struct sk_buff *skb = pkt->skb; - char os_match[NFT_OSF_MAXGENRELEN + 1]; + char os_match[NFT_OSF_MAXGENRELEN]; const struct tcphdr *tcp; struct nf_osf_data data; struct tcphdr _tcph; @@ -45,7 +45,7 @@ static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs, } if (!nf_osf_find(skb, nf_osf_fingers, priv->ttl, &data)) { - strncpy((char *)dest, "unknown", NFT_OSF_MAXGENRELEN); + strscpy_pad((char *)dest, "unknown", NFT_OSF_MAXGENRELEN); } else { if (priv->flags & NFT_OSF_F_VERSION) snprintf(os_match, NFT_OSF_MAXGENRELEN, "%s:%s", @@ -53,7 +53,7 @@ static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs, else strscpy(os_match, data.genre, NFT_OSF_MAXGENRELEN); - strncpy((char *)dest, os_match, NFT_OSF_MAXGENRELEN); + strscpy_pad((char *)dest, os_match, NFT_OSF_MAXGENRELEN); } } -- cgit v1.2.3 From ad156c23d65c1f44f8b4616c2a7505832e323069 Mon Sep 17 00:00:00 2001 From: Justin Stitt Date: Wed, 9 Aug 2023 01:06:07 +0000 Subject: netfilter: nft_meta: refactor deprecated strncpy Prefer `strscpy_pad` to `strncpy`. Signed-off-by: Justin Stitt Signed-off-by: Florian Westphal --- net/netfilter/nft_meta.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 8fdc7318c03c..f7da7c43333b 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -185,12 +185,12 @@ static noinline bool nft_meta_get_eval_kind(enum nft_meta_keys key, case NFT_META_IIFKIND: if (!in || !in->rtnl_link_ops) return false; - strncpy((char *)dest, in->rtnl_link_ops->kind, IFNAMSIZ); + strscpy_pad((char *)dest, in->rtnl_link_ops->kind, IFNAMSIZ); break; case NFT_META_OIFKIND: if (!out || !out->rtnl_link_ops) return false; - strncpy((char *)dest, out->rtnl_link_ops->kind, IFNAMSIZ); + strscpy_pad((char *)dest, out->rtnl_link_ops->kind, IFNAMSIZ); break; default: return false; @@ -206,7 +206,7 @@ static void nft_meta_store_ifindex(u32 *dest, const struct net_device *dev) static void nft_meta_store_ifname(u32 *dest, const struct net_device *dev) { - strncpy((char *)dest, dev ? dev->name : "", IFNAMSIZ); + strscpy_pad((char *)dest, dev ? dev->name : "", IFNAMSIZ); } static bool nft_meta_store_iftype(u32 *dest, const struct net_device *dev) -- cgit v1.2.3 From 06f7d3c3f82c3a53de32a2c0c08e96c0d9ef69ec Mon Sep 17 00:00:00 2001 From: Justin Stitt Date: Wed, 9 Aug 2023 01:06:09 +0000 Subject: netfilter: x_tables: refactor deprecated strncpy Prefer `strscpy_pad` to `strncpy`. 
Signed-off-by: Justin Stitt Signed-off-by: Florian Westphal --- net/netfilter/x_tables.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 470282cf3fae..21624d68314f 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -768,7 +768,7 @@ void xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr, m->u.user.match_size = msize; strscpy(name, match->name, sizeof(name)); module_put(match->me); - strncpy(m->u.user.name, name, sizeof(m->u.user.name)); + strscpy_pad(m->u.user.name, name, sizeof(m->u.user.name)); *size += off; *dstptr += msize; @@ -1148,7 +1148,7 @@ void xt_compat_target_from_user(struct xt_entry_target *t, void **dstptr, t->u.user.target_size = tsize; strscpy(name, target->name, sizeof(name)); module_put(target->me); - strncpy(t->u.user.name, name, sizeof(t->u.user.name)); + strscpy_pad(t->u.user.name, name, sizeof(t->u.user.name)); *size += off; *dstptr += tsize; @@ -2014,4 +2014,3 @@ static void __exit xt_fini(void) module_init(xt_init); module_exit(xt_fini); - -- cgit v1.2.3 From aa222dd190d69ca8e0e2b990ec171c67a74d5383 Mon Sep 17 00:00:00 2001 From: Justin Stitt Date: Wed, 9 Aug 2023 01:06:10 +0000 Subject: netfilter: xtables: refactor deprecated strncpy Prefer `strscpy_pad` as it's a more robust interface whilst maintaing zero-padding behavior. There may have existed a bug here due to both `tbl->repl.name` and `info->name` having a size of 32 as defined below: | #define XT_TABLE_MAXNAMELEN 32 This may lead to buffer overreads in some situations -- `strscpy` solves this by guaranteeing NUL-termination of the dest buffer. Signed-off-by: Justin Stitt Signed-off-by: Florian Westphal --- net/netfilter/xt_repldata.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/xt_repldata.h b/net/netfilter/xt_repldata.h index 68ccbe50bb1e..5d1fb7018dba 100644 --- a/net/netfilter/xt_repldata.h +++ b/net/netfilter/xt_repldata.h @@ -29,7 +29,7 @@ if (tbl == NULL) \ return NULL; \ term = (struct type##_error *)&(((char *)tbl)[term_offset]); \ - strncpy(tbl->repl.name, info->name, sizeof(tbl->repl.name)); \ + strscpy_pad(tbl->repl.name, info->name, sizeof(tbl->repl.name)); \ *term = (struct type##_error)typ2##_ERROR_INIT; \ tbl->repl.valid_hooks = hook_mask; \ tbl->repl.num_entries = nhooks + 1; \ -- cgit v1.2.3 From 169384fbe8513185499bcbb817d198e6a63eb37e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 21 Jun 2023 10:36:26 +0200 Subject: netfilter: nf_tables: allow loop termination for pending fatal signal abort early so task can exit faster if a fatal signal is pending, no need to continue validation in that case. 
Signed-off-by: Florian Westphal --- net/netfilter/nf_tables_api.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 3e841e45f2c0..f00a1dff85e8 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3675,6 +3675,9 @@ int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain) return -EMLINK; list_for_each_entry(rule, &chain->rules, list) { + if (fatal_signal_pending(current)) + return -EINTR; + if (!nft_is_active_next(ctx->net, rule)) continue; @@ -10479,6 +10482,9 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx, if (ctx->chain == chain) return -ELOOP; + if (fatal_signal_pending(current)) + return -EINTR; + list_for_each_entry(rule, &chain->rules, list) { nft_rule_for_each_expr(expr, last, rule) { struct nft_immediate_expr *priv; -- cgit v1.2.3 From 5d4e04bf3a0f098bd9033de3a5291810fa14c7a6 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 15 Aug 2023 18:09:00 +0200 Subject: wifi: cfg80211: reject auth/assoc to AP with our address If the AP uses our own address as its MLD address or BSSID, then clearly something's wrong. Reject such connections so we don't try and fail later. Reported-by: syzbot+2676771ed06a6df166ad@syzkaller.appspotmail.com Signed-off-by: Johannes Berg --- net/wireless/mlme.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'net') diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index ac059cefbeb3..775cac4d6100 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -281,6 +281,11 @@ int cfg80211_mlme_auth(struct cfg80211_registered_device *rdev, ether_addr_equal(req->bss->bssid, wdev->u.client.connected_addr)) return -EALREADY; + if (ether_addr_equal(req->bss->bssid, dev->dev_addr) || + (req->link_id >= 0 && + ether_addr_equal(req->ap_mld_addr, dev->dev_addr))) + return -EINVAL; + return rdev_auth(rdev, dev, req); } @@ -335,6 +340,9 @@ int cfg80211_mlme_assoc(struct cfg80211_registered_device *rdev, if (req->links[i].bss == req->links[j].bss) return -EINVAL; } + + if (ether_addr_equal(req->links[i].bss->bssid, dev->dev_addr)) + return -EINVAL; } if (wdev->connected && @@ -342,6 +350,11 @@ int cfg80211_mlme_assoc(struct cfg80211_registered_device *rdev, !ether_addr_equal(wdev->u.client.connected_addr, req->prev_bssid))) return -EALREADY; + if ((req->bss && ether_addr_equal(req->bss->bssid, dev->dev_addr)) || + (req->link_id >= 0 && + ether_addr_equal(req->ap_mld_addr, dev->dev_addr))) + return -EINVAL; + cfg80211_oper_and_ht_capa(&req->ht_capa_mask, rdev->wiphy.ht_capa_mod_mask); cfg80211_oper_and_vht_capa(&req->vht_capa_mask, -- cgit v1.2.3 From abc76cf552e13cfa88a204b362a86b0e08e95228 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 15 Aug 2023 18:32:03 +0200 Subject: wifi: cfg80211: ocb: don't leave if not joined If there's no OCB state, don't ask the driver/mac80211 to leave, since that's just confusing. Since set/clear the chandef state, that's a simple check. 
Reported-by: syzbot+09d1cd2f71e6dd3bfd2c@syzkaller.appspotmail.com Signed-off-by: Johannes Berg --- net/wireless/ocb.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/wireless/ocb.c b/net/wireless/ocb.c index 27a1732264f9..29afaf3da54f 100644 --- a/net/wireless/ocb.c +++ b/net/wireless/ocb.c @@ -68,6 +68,9 @@ int __cfg80211_leave_ocb(struct cfg80211_registered_device *rdev, if (!rdev->ops->leave_ocb) return -EOPNOTSUPP; + if (!wdev->u.ocb.chandef.chan) + return -ENOTCONN; + err = rdev_leave_ocb(rdev, dev); if (!err) memset(&wdev->u.ocb.chandef, 0, sizeof(wdev->u.ocb.chandef)); -- cgit v1.2.3 From 67dfa589aa8806c7959cbca2f4613b8d41c75a06 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 15 Aug 2023 18:41:32 +0200 Subject: wifi: mac80211: check for station first in client probe When probing a client, first check if we have it, and then check for the channel context, otherwise you can trigger the warning there easily by probing when the AP isn't even started yet. Since a client existing means the AP is also operating, we can then keep the warning. Also simplify the moved code a bit. Reported-by: syzbot+999fac712d84878a7379@syzkaller.appspotmail.com Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index e7ac24603892..953f24166ffc 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -4133,19 +4133,20 @@ static int ieee80211_probe_client(struct wiphy *wiphy, struct net_device *dev, mutex_lock(&local->mtx); rcu_read_lock(); + sta = sta_info_get_bss(sdata, peer); + if (!sta) { + ret = -ENOLINK; + goto unlock; + } + + qos = sta->sta.wme; + chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf); if (WARN_ON(!chanctx_conf)) { ret = -EINVAL; goto unlock; } band = chanctx_conf->def.chan->band; - sta = sta_info_get_bss(sdata, peer); - if (sta) { - qos = sta->sta.wme; - } else { - ret = -ENOLINK; - goto unlock; - } if (qos) { fc = cpu_to_le16(IEEE80211_FTYPE_DATA | -- cgit v1.2.3 From 927521170c4a18c620f97865f7bad48f17c48967 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 16 Aug 2023 12:13:36 +0200 Subject: wifi: mac80211: fix puncturing bitmap handling in CSA Code inspection reveals that we switch the puncturing bitmap before the real channel switch, since that happens only in the second round of the worker after the channel context is switched by ieee80211_link_use_reserved_context(). Fixes: 2cc25e4b2a04 ("wifi: mac80211: configure puncturing bitmap") Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 953f24166ffc..45e7a5d9c7d9 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -3648,12 +3648,6 @@ static int __ieee80211_csa_finalize(struct ieee80211_sub_if_data *sdata) lockdep_assert_held(&local->mtx); lockdep_assert_held(&local->chanctx_mtx); - if (sdata->vif.bss_conf.eht_puncturing != sdata->vif.bss_conf.csa_punct_bitmap) { - sdata->vif.bss_conf.eht_puncturing = - sdata->vif.bss_conf.csa_punct_bitmap; - changed |= BSS_CHANGED_EHT_PUNCTURING; - } - /* * using reservation isn't immediate as it may be deferred until later * with multi-vif. 
once reservation is complete it will re-schedule the @@ -3683,6 +3677,12 @@ static int __ieee80211_csa_finalize(struct ieee80211_sub_if_data *sdata) if (err) return err; + if (sdata->vif.bss_conf.eht_puncturing != sdata->vif.bss_conf.csa_punct_bitmap) { + sdata->vif.bss_conf.eht_puncturing = + sdata->vif.bss_conf.csa_punct_bitmap; + changed |= BSS_CHANGED_EHT_PUNCTURING; + } + ieee80211_link_info_change_notify(sdata, &sdata->deflink, changed); if (sdata->deflink.csa_block_tx) { -- cgit v1.2.3 From 218d690c49b7e9c94ad0d317adbdd4af846ea0dc Mon Sep 17 00:00:00 2001 From: Lin Ma Date: Wed, 9 Aug 2023 11:31:51 +0800 Subject: wifi: nl80211/cfg80211: add forgotten nla_policy for BSS color attribute The previous commit dd3e4fc75b4a ("nl80211/cfg80211: add BSS color to NDP ranging parameters") adds a parameter for NDP ranging by introducing a new attribute type named NL80211_PMSR_FTM_REQ_ATTR_BSS_COLOR. However, the author forgot to also describe the nla_policy at nl80211_pmsr_ftm_req_attr_policy (net/wireless/nl80211.c). Just complement it to avoid malformed attribute that causes out-of-attribute access. Fixes: dd3e4fc75b4a ("nl80211/cfg80211: add BSS color to NDP ranging parameters") Signed-off-by: Lin Ma Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230809033151.768910-1-linma@zju.edu.cn Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 0ffebf1a1eb6..de47838aca4f 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -323,6 +323,7 @@ nl80211_pmsr_ftm_req_attr_policy[NL80211_PMSR_FTM_REQ_ATTR_MAX + 1] = { [NL80211_PMSR_FTM_REQ_ATTR_TRIGGER_BASED] = { .type = NLA_FLAG }, [NL80211_PMSR_FTM_REQ_ATTR_NON_TRIGGER_BASED] = { .type = NLA_FLAG }, [NL80211_PMSR_FTM_REQ_ATTR_LMR_FEEDBACK] = { .type = NLA_FLAG }, + [NL80211_PMSR_FTM_REQ_ATTR_BSS_COLOR] = { .type = NLA_U8 }, }; static const struct nla_policy -- cgit v1.2.3 From f14cef00456fb7cd6d2ca7389c149b5f079f0091 Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Tue, 1 Aug 2023 21:43:37 +0800 Subject: wifi: mac80211: Remove unused function declarations Commit 685429623f88 ("mac80211: Fix circular locking dependency in ARP filter handling") left the ieee80211_set_arp_filter() declaration unused. And commit 164eb02d070a ("mac80211: add radar detection command/event") introducted ieee80211_dfs_cac_timer() declaration but never implemented it. 
Signed-off-by: Yue Haibing Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230801134337.24452-1-yuehaibing@huawei.com [reword commit message] Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 91633a0b723e..06bd406846d2 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1872,7 +1872,6 @@ void ieee80211_send_pspoll(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata); void ieee80211_recalc_ps(struct ieee80211_local *local); void ieee80211_recalc_ps_vif(struct ieee80211_sub_if_data *sdata); -int ieee80211_set_arp_filter(struct ieee80211_sub_if_data *sdata); void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata); void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); @@ -2564,7 +2563,6 @@ void ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local, struct ieee80211_link_data *rsvd_for); bool ieee80211_is_radar_required(struct ieee80211_local *local); -void ieee80211_dfs_cac_timer(unsigned long data); void ieee80211_dfs_cac_timer_work(struct work_struct *work); void ieee80211_dfs_cac_cancel(struct ieee80211_local *local); void ieee80211_dfs_radar_detected_work(struct work_struct *work); -- cgit v1.2.3 From a3d9c4f7c43dd9b0accc26573b54e4a0bce1f4a6 Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Mon, 31 Jul 2023 22:07:12 +0800 Subject: wifi: mac80211: mesh: Remove unused function declaration mesh_ids_set_default() Commit ccf80ddfe492 ("mac80211: mesh function and data structures definitions") introducted this but never implemented it. Signed-off-by: Yue Haibing Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230731140712.1204-1-yuehaibing@huawei.com Signed-off-by: Johannes Berg --- net/mac80211/mesh.h | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h index 6c94222a9df5..ad8469293d71 100644 --- a/net/mac80211/mesh.h +++ b/net/mac80211/mesh.h @@ -212,7 +212,6 @@ int mesh_rmc_check(struct ieee80211_sub_if_data *sdata, const u8 *addr, struct ieee80211s_hdr *mesh_hdr); bool mesh_matches_local(struct ieee80211_sub_if_data *sdata, struct ieee802_11_elems *ie); -void mesh_ids_set_default(struct ieee80211_if_mesh *mesh); int mesh_add_meshconf_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); int mesh_add_meshid_ie(struct ieee80211_sub_if_data *sdata, -- cgit v1.2.3 From 9265f78b69a70a03b57f41d8da3c194d0c8aad22 Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Sat, 29 Jul 2023 20:16:51 +0800 Subject: wifi: nl80211: Remove unused declaration nl80211_pmsr_dump_results() nl80211_pmsr_dump_results() is never implemented since it was added in commit 9bb7e0f24e7e ("cfg80211: add peer measurement with FTM initiator API"). 
Signed-off-by: Yue Haibing Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230729121651.36836-1-yuehaibing@huawei.com Signed-off-by: Johannes Berg --- net/wireless/nl80211.h | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h index 0278d817bb02..b4af53f9b227 100644 --- a/net/wireless/nl80211.h +++ b/net/wireless/nl80211.h @@ -120,6 +120,5 @@ void cfg80211_rdev_free_coalesce(struct cfg80211_registered_device *rdev); /* peer measurement */ int nl80211_pmsr_start(struct sk_buff *skb, struct genl_info *info); -int nl80211_pmsr_dump_results(struct sk_buff *skb, struct netlink_callback *cb); #endif /* __NET_WIRELESS_NL80211_H */ -- cgit v1.2.3 From c5b4297dee9110b63f2c8202e4876bdcad43d4c4 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Mon, 21 Aug 2023 15:25:12 -0700 Subject: mptcp: refactor push_pending logic To support redundant package schedulers more easily, this patch refactors __mptcp_push_pending() logic from: For each dfrag: While sends succeed: Call the scheduler (selects subflow and msk->snd_burst) Update subflow locks (push/release/acquire as needed) Send the dfrag data with mptcp_sendmsg_frag() Update already_sent, snd_nxt, snd_burst Update msk->first_pending Push/release on final subflow -> While first_pending isn't empty: Call the scheduler (selects subflow and msk->snd_burst) Update subflow locks (push/release/acquire as needed) For each pending dfrag: While sends succeed: Send the dfrag data with mptcp_sendmsg_frag() Update already_sent, snd_nxt, snd_burst Update msk->first_pending Break if required by msk->snd_burst / etc Push/release on final subflow Refactors __mptcp_subflow_push_pending logic from: For each dfrag: While sends succeed: Call the scheduler (selects subflow and msk->snd_burst) Send the dfrag data with mptcp_subflow_delegate(), break Send the dfrag data with mptcp_sendmsg_frag() Update dfrag->already_sent, msk->snd_nxt, msk->snd_burst Update msk->first_pending -> While first_pending isn't empty: Call the scheduler (selects subflow and msk->snd_burst) Send the dfrag data with mptcp_subflow_delegate(), break Send the dfrag data with mptcp_sendmsg_frag() For each pending dfrag: While sends succeed: Send the dfrag data with mptcp_sendmsg_frag() Update already_sent, snd_nxt, snd_burst Update msk->first_pending Break if required by msk->snd_burst / etc Move the duplicate code from __mptcp_push_pending() and __mptcp_subflow_push_pending() into a new helper function, named __subflow_push_pending(). Simplify __mptcp_push_pending() and __mptcp_subflow_push_pending() by invoking this helper. Also move the burst check conditions out of the function mptcp_subflow_get_send(), check them in __subflow_push_pending() in the inner "for each pending dfrag" loop. Reviewed-by: Mat Martineau Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Link: https://lore.kernel.org/r/20230821-upstream-net-next-20230818-v1-1-0c860fb256a8@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 153 +++++++++++++++++++++++++++------------------------ 1 file changed, 81 insertions(+), 72 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 6019a3cf1625..29c662ffcd05 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1386,14 +1386,6 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) sk_stream_memory_free(msk->first) ? 
msk->first : NULL; } - /* re-use last subflow, if the burst allow that */ - if (msk->last_snd && msk->snd_burst > 0 && - sk_stream_memory_free(msk->last_snd) && - mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd))) { - mptcp_set_timeout(sk); - return msk->last_snd; - } - /* pick the subflow with the lower wmem/wspace ratio */ for (i = 0; i < SSK_MODE_MAX; ++i) { send_info[i].ssk = NULL; @@ -1499,57 +1491,86 @@ void mptcp_check_and_set_pending(struct sock *sk) mptcp_sk(sk)->push_pending |= BIT(MPTCP_PUSH_PENDING); } -void __mptcp_push_pending(struct sock *sk, unsigned int flags) +static int __subflow_push_pending(struct sock *sk, struct sock *ssk, + struct mptcp_sendmsg_info *info) { - struct sock *prev_ssk = NULL, *ssk = NULL; struct mptcp_sock *msk = mptcp_sk(sk); - struct mptcp_sendmsg_info info = { - .flags = flags, - }; - bool do_check_data_fin = false; struct mptcp_data_frag *dfrag; - int len; + int len, copied = 0, err = 0; while ((dfrag = mptcp_send_head(sk))) { - info.sent = dfrag->already_sent; - info.limit = dfrag->data_len; + info->sent = dfrag->already_sent; + info->limit = dfrag->data_len; len = dfrag->data_len - dfrag->already_sent; while (len > 0) { int ret = 0; - prev_ssk = ssk; - ssk = mptcp_subflow_get_send(msk); - - /* First check. If the ssk has changed since - * the last round, release prev_ssk - */ - if (ssk != prev_ssk && prev_ssk) - mptcp_push_release(prev_ssk, &info); - if (!ssk) - goto out; - - /* Need to lock the new subflow only if different - * from the previous one, otherwise we are still - * helding the relevant lock - */ - if (ssk != prev_ssk) - lock_sock(ssk); - - ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); + ret = mptcp_sendmsg_frag(sk, ssk, dfrag, info); if (ret <= 0) { - if (ret == -EAGAIN) - continue; - mptcp_push_release(ssk, &info); + err = copied ? : ret; goto out; } - do_check_data_fin = true; - info.sent += ret; + info->sent += ret; + copied += ret; len -= ret; mptcp_update_post_push(msk, dfrag, ret); } WRITE_ONCE(msk->first_pending, mptcp_send_next(sk)); + + if (msk->snd_burst <= 0 || + !sk_stream_memory_free(ssk) || + !mptcp_subflow_active(mptcp_subflow_ctx(ssk))) { + err = copied; + goto out; + } + mptcp_set_timeout(sk); + } + err = copied; + +out: + return err; +} + +void __mptcp_push_pending(struct sock *sk, unsigned int flags) +{ + struct sock *prev_ssk = NULL, *ssk = NULL; + struct mptcp_sock *msk = mptcp_sk(sk); + struct mptcp_sendmsg_info info = { + .flags = flags, + }; + bool do_check_data_fin = false; + + while (mptcp_send_head(sk)) { + int ret = 0; + + prev_ssk = ssk; + ssk = mptcp_subflow_get_send(msk); + + /* First check. 
If the ssk has changed since + * the last round, release prev_ssk + */ + if (ssk != prev_ssk && prev_ssk) + mptcp_push_release(prev_ssk, &info); + if (!ssk) + goto out; + + /* Need to lock the new subflow only if different + * from the previous one, otherwise we are still + * helding the relevant lock + */ + if (ssk != prev_ssk) + lock_sock(ssk); + + ret = __subflow_push_pending(sk, ssk, &info); + if (ret <= 0) { + if (ret == -EAGAIN) + continue; + mptcp_push_release(ssk, &info); + goto out; + } + do_check_data_fin = true; } /* at this point we held the socket lock for the last subflow we used */ @@ -1570,42 +1591,30 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk, bool struct mptcp_sendmsg_info info = { .data_lock_held = true, }; - struct mptcp_data_frag *dfrag; struct sock *xmit_ssk; - int len, copied = 0; + int copied = 0; info.flags = 0; - while ((dfrag = mptcp_send_head(sk))) { - info.sent = dfrag->already_sent; - info.limit = dfrag->data_len; - len = dfrag->data_len - dfrag->already_sent; - while (len > 0) { - int ret = 0; - - /* check for a different subflow usage only after - * spooling the first chunk of data - */ - xmit_ssk = first ? ssk : mptcp_subflow_get_send(msk); - if (!xmit_ssk) - goto out; - if (xmit_ssk != ssk) { - mptcp_subflow_delegate(mptcp_subflow_ctx(xmit_ssk), - MPTCP_DELEGATE_SEND); - goto out; - } - - ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); - if (ret <= 0) - goto out; + while (mptcp_send_head(sk)) { + int ret = 0; - info.sent += ret; - copied += ret; - len -= ret; - first = false; - - mptcp_update_post_push(msk, dfrag, ret); + /* check for a different subflow usage only after + * spooling the first chunk of data + */ + xmit_ssk = first ? ssk : mptcp_subflow_get_send(msk); + if (!xmit_ssk) + goto out; + if (xmit_ssk != ssk) { + mptcp_subflow_delegate(mptcp_subflow_ctx(xmit_ssk), + MPTCP_DELEGATE_SEND); + goto out; } - WRITE_ONCE(msk->first_pending, mptcp_send_next(sk)); + + ret = __subflow_push_pending(sk, ssk, &info); + first = false; + if (ret <= 0) + break; + copied += ret; } out: -- cgit v1.2.3 From ebc1e08f01ebedbf962e6417bbf6952bd4ca2142 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Mon, 21 Aug 2023 15:25:13 -0700 Subject: mptcp: drop last_snd and MPTCP_RESET_SCHEDULER Since the burst check conditions have moved out of the function mptcp_subflow_get_send(), it makes all msk->last_snd useless. This patch drops them as well as the macro MPTCP_RESET_SCHEDULER. 
Reviewed-by: Mat Martineau Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Link: https://lore.kernel.org/r/20230821-upstream-net-next-20230818-v1-2-0c860fb256a8@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm.c | 9 +-------- net/mptcp/pm_netlink.c | 3 --- net/mptcp/protocol.c | 11 +---------- net/mptcp/protocol.h | 2 -- 4 files changed, 2 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 7dbbad1e4f55..d8da5374d9e1 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -299,15 +299,8 @@ void mptcp_pm_mp_prio_received(struct sock *ssk, u8 bkup) pr_debug("subflow->backup=%d, bkup=%d\n", subflow->backup, bkup); msk = mptcp_sk(sk); - if (subflow->backup != bkup) { + if (subflow->backup != bkup) subflow->backup = bkup; - mptcp_data_lock(sk); - if (!sock_owned_by_user(sk)) - msk->last_snd = NULL; - else - __set_bit(MPTCP_RESET_SCHEDULER, &msk->cb_flags); - mptcp_data_unlock(sk); - } mptcp_event(MPTCP_EVENT_SUB_PRIORITY, msk, ssk, GFP_ATOMIC); } diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index c75d9d88a053..9661f3812682 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -472,9 +472,6 @@ static void __mptcp_pm_send_ack(struct mptcp_sock *msk, struct mptcp_subflow_con slow = lock_sock_fast(ssk); if (prio) { - if (subflow->backup != backup) - msk->last_snd = NULL; - subflow->send_mp_prio = 1; subflow->backup = backup; subflow->request_bkup = backup; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 29c662ffcd05..f15ff80be30f 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1438,16 +1438,13 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) burst = min_t(int, MPTCP_SEND_BURST_SIZE, mptcp_wnd_end(msk) - msk->snd_nxt); wmem = READ_ONCE(ssk->sk_wmem_queued); - if (!burst) { - msk->last_snd = NULL; + if (!burst) return ssk; - } subflow = mptcp_subflow_ctx(ssk); subflow->avg_pacing_rate = div_u64((u64)subflow->avg_pacing_rate * wmem + READ_ONCE(ssk->sk_pacing_rate) * burst, burst + wmem); - msk->last_snd = ssk; msk->snd_burst = burst; return ssk; } @@ -2379,9 +2376,6 @@ out_release: WRITE_ONCE(msk->first, NULL); out: - if (ssk == msk->last_snd) - msk->last_snd = NULL; - if (need_push) __mptcp_push_pending(sk, 0); } @@ -3046,7 +3040,6 @@ static int mptcp_disconnect(struct sock *sk, int flags) * subflow */ mptcp_destroy_common(msk, MPTCP_CF_FASTCLOSE); - msk->last_snd = NULL; WRITE_ONCE(msk->flags, 0); msk->cb_flags = 0; msk->push_pending = 0; @@ -3316,8 +3309,6 @@ static void mptcp_release_cb(struct sock *sk) __mptcp_set_connected(sk); if (__test_and_clear_bit(MPTCP_ERROR_REPORT, &msk->cb_flags)) __mptcp_error_report(sk); - if (__test_and_clear_bit(MPTCP_RESET_SCHEDULER, &msk->cb_flags)) - msk->last_snd = NULL; } __mptcp_update_rmem(sk); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 38c7ea013361..cbf9a9e176b2 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -123,7 +123,6 @@ #define MPTCP_RETRANSMIT 4 #define MPTCP_FLUSH_JOIN_LIST 5 #define MPTCP_CONNECTED 6 -#define MPTCP_RESET_SCHEDULER 7 struct mptcp_skb_cb { u64 map_seq; @@ -269,7 +268,6 @@ struct mptcp_sock { u64 rcv_data_fin_seq; u64 bytes_retrans; int rmem_fwd_alloc; - struct sock *last_snd; int snd_burst; int old_wspace; u64 recovery_snd_nxt; /* in recovery mode accept up to this seq; -- cgit v1.2.3 From 740ebe35bd3f5c4ff8ec60e5e521e47ea8f5492c Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Mon, 21 Aug 2023 15:25:14 -0700 Subject: mptcp: add struct mptcp_sched_ops This 
patch defines struct mptcp_sched_ops, which has three struct members, name, owner and list, and four function pointers: init(), release() and get_subflow(). The scheduler function get_subflow() have a struct mptcp_sched_data parameter, which contains a reinject flag for retrans or not, a subflows number and a mptcp_subflow_context array. Add the scheduler registering, unregistering and finding functions to add, delete and find a packet scheduler on the global list mptcp_sched_list. Acked-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Link: https://lore.kernel.org/r/20230821-upstream-net-next-20230818-v1-3-0c860fb256a8@kernel.org Signed-off-by: Jakub Kicinski --- include/net/mptcp.h | 21 ++++++++++++++++++++ net/mptcp/Makefile | 2 +- net/mptcp/protocol.h | 3 +++ net/mptcp/sched.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 81 insertions(+), 1 deletion(-) create mode 100644 net/mptcp/sched.c (limited to 'net') diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 3c5c68618fcc..fb996124b3d5 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -96,6 +96,27 @@ struct mptcp_out_options { #endif }; +#define MPTCP_SCHED_NAME_MAX 16 +#define MPTCP_SUBFLOWS_MAX 8 + +struct mptcp_sched_data { + bool reinject; + u8 subflows; + struct mptcp_subflow_context *contexts[MPTCP_SUBFLOWS_MAX]; +}; + +struct mptcp_sched_ops { + int (*get_subflow)(struct mptcp_sock *msk, + struct mptcp_sched_data *data); + + char name[MPTCP_SCHED_NAME_MAX]; + struct module *owner; + struct list_head list; + + void (*init)(struct mptcp_sock *msk); + void (*release)(struct mptcp_sock *msk); +} ____cacheline_aligned_in_smp; + #ifdef CONFIG_MPTCP void mptcp_init(void); diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile index a3829ce548f9..84e531f86b82 100644 --- a/net/mptcp/Makefile +++ b/net/mptcp/Makefile @@ -2,7 +2,7 @@ obj-$(CONFIG_MPTCP) += mptcp.o mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diag.o \ - mib.o pm_netlink.o sockopt.o pm_userspace.o fastopen.o + mib.o pm_netlink.o sockopt.o pm_userspace.o fastopen.o sched.o obj-$(CONFIG_SYN_COOKIES) += syncookies.o obj-$(CONFIG_INET_MPTCP_DIAG) += mptcp_diag.o diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index cbf9a9e176b2..985e8f86668d 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -655,6 +655,9 @@ int mptcp_subflow_create_socket(struct sock *sk, unsigned short family, void mptcp_info2sockaddr(const struct mptcp_addr_info *info, struct sockaddr_storage *addr, unsigned short family); +struct mptcp_sched_ops *mptcp_sched_find(const char *name); +int mptcp_register_scheduler(struct mptcp_sched_ops *sched); +void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched); static inline bool __tcp_can_send(const struct sock *ssk) { diff --git a/net/mptcp/sched.c b/net/mptcp/sched.c new file mode 100644 index 000000000000..c5d3bbafba71 --- /dev/null +++ b/net/mptcp/sched.c @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Multipath TCP + * + * Copyright (c) 2022, SUSE. 
+ */ + +#define pr_fmt(fmt) "MPTCP: " fmt + +#include +#include +#include +#include +#include +#include "protocol.h" + +static DEFINE_SPINLOCK(mptcp_sched_list_lock); +static LIST_HEAD(mptcp_sched_list); + +/* Must be called with rcu read lock held */ +struct mptcp_sched_ops *mptcp_sched_find(const char *name) +{ + struct mptcp_sched_ops *sched, *ret = NULL; + + list_for_each_entry_rcu(sched, &mptcp_sched_list, list) { + if (!strcmp(sched->name, name)) { + ret = sched; + break; + } + } + + return ret; +} + +int mptcp_register_scheduler(struct mptcp_sched_ops *sched) +{ + if (!sched->get_subflow) + return -EINVAL; + + spin_lock(&mptcp_sched_list_lock); + if (mptcp_sched_find(sched->name)) { + spin_unlock(&mptcp_sched_list_lock); + return -EEXIST; + } + list_add_tail_rcu(&sched->list, &mptcp_sched_list); + spin_unlock(&mptcp_sched_list_lock); + + pr_debug("%s registered", sched->name); + return 0; +} + +void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched) +{ + spin_lock(&mptcp_sched_list_lock); + list_del_rcu(&sched->list); + spin_unlock(&mptcp_sched_list_lock); +} -- cgit v1.2.3 From e3b2870b6d220d1cbd2d52d7acc9f0de9fdfeccf Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Mon, 21 Aug 2023 15:25:15 -0700 Subject: mptcp: add a new sysctl scheduler This patch adds a new sysctl, named scheduler, to support for selection of different schedulers. Export mptcp_get_scheduler helper to get this sysctl. Acked-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Link: https://lore.kernel.org/r/20230821-upstream-net-next-20230818-v1-4-0c860fb256a8@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/networking/mptcp-sysctl.rst | 8 ++++++++ net/mptcp/ctrl.c | 14 ++++++++++++++ net/mptcp/protocol.h | 1 + 3 files changed, 23 insertions(+) (limited to 'net') diff --git a/Documentation/networking/mptcp-sysctl.rst b/Documentation/networking/mptcp-sysctl.rst index 213510698014..15f1919d640c 100644 --- a/Documentation/networking/mptcp-sysctl.rst +++ b/Documentation/networking/mptcp-sysctl.rst @@ -74,3 +74,11 @@ stale_loss_cnt - INTEGER This is a per-namespace sysctl. Default: 4 + +scheduler - STRING + Select the scheduler of your choice. + + Support for selection of different schedulers. This is a per-namespace + sysctl. 
+ + Default: "default" diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c index ae20b7d92e28..c46c22a84d23 100644 --- a/net/mptcp/ctrl.c +++ b/net/mptcp/ctrl.c @@ -32,6 +32,7 @@ struct mptcp_pernet { u8 checksum_enabled; u8 allow_join_initial_addr_port; u8 pm_type; + char scheduler[MPTCP_SCHED_NAME_MAX]; }; static struct mptcp_pernet *mptcp_get_pernet(const struct net *net) @@ -69,6 +70,11 @@ int mptcp_get_pm_type(const struct net *net) return mptcp_get_pernet(net)->pm_type; } +const char *mptcp_get_scheduler(const struct net *net) +{ + return mptcp_get_pernet(net)->scheduler; +} + static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet) { pernet->mptcp_enabled = 1; @@ -77,6 +83,7 @@ static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet) pernet->allow_join_initial_addr_port = 1; pernet->stale_loss_cnt = 4; pernet->pm_type = MPTCP_PM_TYPE_KERNEL; + strcpy(pernet->scheduler, "default"); } #ifdef CONFIG_SYSCTL @@ -128,6 +135,12 @@ static struct ctl_table mptcp_sysctl_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = &mptcp_pm_type_max }, + { + .procname = "scheduler", + .maxlen = MPTCP_SCHED_NAME_MAX, + .mode = 0644, + .proc_handler = proc_dostring, + }, {} }; @@ -149,6 +162,7 @@ static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet) table[3].data = &pernet->allow_join_initial_addr_port; table[4].data = &pernet->stale_loss_cnt; table[5].data = &pernet->pm_type; + table[6].data = &pernet->scheduler; hdr = register_net_sysctl(net, MPTCP_SYSCTL_PATH, table); if (!hdr) diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 985e8f86668d..bfa13a50f276 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -623,6 +623,7 @@ int mptcp_is_checksum_enabled(const struct net *net); int mptcp_allow_join_id0(const struct net *net); unsigned int mptcp_stale_loss_cnt(const struct net *net); int mptcp_get_pm_type(const struct net *net); +const char *mptcp_get_scheduler(const struct net *net); void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow, const struct mptcp_options_received *mp_opt); bool __mptcp_retransmit_pending_data(struct sock *sk); -- cgit v1.2.3 From 1730b2b2c5a5a886007b247366aebe0976dc8881 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Mon, 21 Aug 2023 15:25:16 -0700 Subject: mptcp: add sched in mptcp_sock This patch adds a new struct member sched in struct mptcp_sock. And two helpers mptcp_init_sched() and mptcp_release_sched() to init and release it. Init it with the sysctl scheduler in mptcp_init_sock(), copy the scheduler from the parent in mptcp_sk_clone(), and release it in __mptcp_destroy_sock(). 
Acked-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Link: https://lore.kernel.org/r/20230821-upstream-net-next-20230818-v1-5-0c860fb256a8@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 8 ++++++++ net/mptcp/protocol.h | 4 ++++ net/mptcp/sched.c | 33 +++++++++++++++++++++++++++++++++ 3 files changed, 45 insertions(+) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index f15ff80be30f..54a3eccfa731 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2697,6 +2697,7 @@ static void mptcp_ca_reset(struct sock *sk) static int mptcp_init_sock(struct sock *sk) { struct net *net = sock_net(sk); + int ret; __mptcp_init_sock(sk); @@ -2706,6 +2707,11 @@ static int mptcp_init_sock(struct sock *sk) if (unlikely(!net->mib.mptcp_statistics) && !mptcp_mib_alloc(net)) return -ENOMEM; + ret = mptcp_init_sched(mptcp_sk(sk), + mptcp_sched_find(mptcp_get_scheduler(net))); + if (ret) + return ret; + set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags); /* fetch the ca name; do it outside __mptcp_init_sock(), so that clone will @@ -2851,6 +2857,7 @@ static void __mptcp_destroy_sock(struct sock *sk) mptcp_stop_timer(sk); sk_stop_timer(sk, &sk->sk_timer); msk->pm.status = 0; + mptcp_release_sched(msk); sk->sk_prot->destroy(sk); @@ -3105,6 +3112,7 @@ struct sock *mptcp_sk_clone_init(const struct sock *sk, msk->snd_una = msk->write_seq; msk->wnd_end = msk->snd_nxt + req->rsk_rcv_wnd; msk->setsockopt_seq = mptcp_sk(sk)->setsockopt_seq; + mptcp_init_sched(msk, mptcp_sk(sk)->sched); /* passive msk is created after the first/MPC subflow */ msk->subflow_id = 2; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index bfa13a50f276..548c302a757e 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -312,6 +312,7 @@ struct mptcp_sock { * lock as such sock is freed after close(). 
*/ struct mptcp_pm_data pm; + struct mptcp_sched_ops *sched; struct { u32 space; /* bytes copied in last measurement window */ u32 copied; /* bytes copied in this measurement window */ @@ -659,6 +660,9 @@ void mptcp_info2sockaddr(const struct mptcp_addr_info *info, struct mptcp_sched_ops *mptcp_sched_find(const char *name); int mptcp_register_scheduler(struct mptcp_sched_ops *sched); void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched); +int mptcp_init_sched(struct mptcp_sock *msk, + struct mptcp_sched_ops *sched); +void mptcp_release_sched(struct mptcp_sock *msk); static inline bool __tcp_can_send(const struct sock *ssk) { diff --git a/net/mptcp/sched.c b/net/mptcp/sched.c index c5d3bbafba71..53773668b5ee 100644 --- a/net/mptcp/sched.c +++ b/net/mptcp/sched.c @@ -54,3 +54,36 @@ void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched) list_del_rcu(&sched->list); spin_unlock(&mptcp_sched_list_lock); } + +int mptcp_init_sched(struct mptcp_sock *msk, + struct mptcp_sched_ops *sched) +{ + if (!sched) + goto out; + + if (!bpf_try_module_get(sched, sched->owner)) + return -EBUSY; + + msk->sched = sched; + if (msk->sched->init) + msk->sched->init(msk); + + pr_debug("sched=%s", msk->sched->name); + +out: + return 0; +} + +void mptcp_release_sched(struct mptcp_sock *msk) +{ + struct mptcp_sched_ops *sched = msk->sched; + + if (!sched) + return; + + msk->sched = NULL; + if (sched->release) + sched->release(msk); + + bpf_module_put(sched, sched->owner); +} -- cgit v1.2.3 From fce68b03086fd00eb5a8ba4744f36f0d007d0f9d Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Mon, 21 Aug 2023 15:25:17 -0700 Subject: mptcp: add scheduled in mptcp_subflow_context This patch adds a new member scheduled in struct mptcp_subflow_context, which will be set in the MPTCP scheduler context when the scheduler picks this subflow to send data. Add a new helper mptcp_subflow_set_scheduled() to set this flag using WRITE_ONCE(). 
Reviewed-by: Mat Martineau Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Link: https://lore.kernel.org/r/20230821-upstream-net-next-20230818-v1-6-0c860fb256a8@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.h | 3 +++ net/mptcp/sched.c | 6 ++++++ 2 files changed, 9 insertions(+) (limited to 'net') diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 548c302a757e..e7523a40132f 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -491,6 +491,7 @@ struct mptcp_subflow_context { is_mptfo : 1, /* subflow is doing TFO */ __unused : 9; enum mptcp_data_avail data_avail; + bool scheduled; u32 remote_nonce; u64 thmac; u32 local_nonce; @@ -663,6 +664,8 @@ void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched); int mptcp_init_sched(struct mptcp_sock *msk, struct mptcp_sched_ops *sched); void mptcp_release_sched(struct mptcp_sock *msk); +void mptcp_subflow_set_scheduled(struct mptcp_subflow_context *subflow, + bool scheduled); static inline bool __tcp_can_send(const struct sock *ssk) { diff --git a/net/mptcp/sched.c b/net/mptcp/sched.c index 53773668b5ee..d295b92a5789 100644 --- a/net/mptcp/sched.c +++ b/net/mptcp/sched.c @@ -87,3 +87,9 @@ void mptcp_release_sched(struct mptcp_sock *msk) bpf_module_put(sched, sched->owner); } + +void mptcp_subflow_set_scheduled(struct mptcp_subflow_context *subflow, + bool scheduled) +{ + WRITE_ONCE(subflow->scheduled, scheduled); +} -- cgit v1.2.3 From 07336a87fe871518a7b3508e29a21ca1735b3edc Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Mon, 21 Aug 2023 15:25:18 -0700 Subject: mptcp: add scheduler wrappers This patch defines two packet scheduler wrappers mptcp_sched_get_send() and mptcp_sched_get_retrans(), invoke get_subflow() of msk->sched in them. Set data->reinject to true in mptcp_sched_get_retrans(), set it false in mptcp_sched_get_send(). If msk->sched is NULL, use default functions mptcp_subflow_get_send() and mptcp_subflow_get_retrans() to send data. Reviewed-by: Mat Martineau Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Link: https://lore.kernel.org/r/20230821-upstream-net-next-20230818-v1-7-0c860fb256a8@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 4 ++-- net/mptcp/protocol.h | 4 ++++ net/mptcp/sched.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 54 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 54a3eccfa731..9cd172d2c8d6 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1366,7 +1366,7 @@ bool mptcp_subflow_active(struct mptcp_subflow_context *subflow) * returns the subflow that will transmit the next DSS * additionally updates the rtx timeout */ -static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) +struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) { struct subflow_send_info send_info[SSK_MODE_MAX]; struct mptcp_subflow_context *subflow; @@ -2204,7 +2204,7 @@ static void mptcp_timeout_timer(struct timer_list *t) * * A backup subflow is returned only if that is the only kind available. 
*/ -static struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk) +struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk) { struct sock *backup = NULL, *pick = NULL; struct mptcp_subflow_context *subflow; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index e7523a40132f..78562f695c46 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -666,6 +666,10 @@ int mptcp_init_sched(struct mptcp_sock *msk, void mptcp_release_sched(struct mptcp_sock *msk); void mptcp_subflow_set_scheduled(struct mptcp_subflow_context *subflow, bool scheduled); +struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk); +struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk); +int mptcp_sched_get_send(struct mptcp_sock *msk); +int mptcp_sched_get_retrans(struct mptcp_sock *msk); static inline bool __tcp_can_send(const struct sock *ssk) { diff --git a/net/mptcp/sched.c b/net/mptcp/sched.c index d295b92a5789..884606686cfe 100644 --- a/net/mptcp/sched.c +++ b/net/mptcp/sched.c @@ -93,3 +93,51 @@ void mptcp_subflow_set_scheduled(struct mptcp_subflow_context *subflow, { WRITE_ONCE(subflow->scheduled, scheduled); } + +int mptcp_sched_get_send(struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow; + struct mptcp_sched_data data; + + mptcp_for_each_subflow(msk, subflow) { + if (READ_ONCE(subflow->scheduled)) + return 0; + } + + if (!msk->sched) { + struct sock *ssk; + + ssk = mptcp_subflow_get_send(msk); + if (!ssk) + return -EINVAL; + mptcp_subflow_set_scheduled(mptcp_subflow_ctx(ssk), true); + return 0; + } + + data.reinject = false; + return msk->sched->get_subflow(msk, &data); +} + +int mptcp_sched_get_retrans(struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow; + struct mptcp_sched_data data; + + mptcp_for_each_subflow(msk, subflow) { + if (READ_ONCE(subflow->scheduled)) + return 0; + } + + if (!msk->sched) { + struct sock *ssk; + + ssk = mptcp_subflow_get_retrans(msk); + if (!ssk) + return -EINVAL; + mptcp_subflow_set_scheduled(mptcp_subflow_ctx(ssk), true); + return 0; + } + + data.reinject = true; + return msk->sched->get_subflow(msk, &data); +} -- cgit v1.2.3 From 0fa1b3783a17d75a4aa1651a18ede041ffca5750 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Mon, 21 Aug 2023 15:25:19 -0700 Subject: mptcp: use get_send wrapper This patch adds the multiple subflows support for __mptcp_push_pending and __mptcp_subflow_push_pending. Use get_send() wrapper instead of mptcp_subflow_get_send() in them. Check the subflow scheduled flags to test which subflow or subflows are picked by the scheduler, use them to send data. Move msk_owned_by_me() and fallback checks into get_send() wrapper from mptcp_subflow_get_send(). This commit allows the scheduler to set the subflow->scheduled bit in multiple subflows, but it does not allow for sending redundant data. Multiple scheduled subflows will send sequential data on each subflow. 
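A minimal sketch (not part of the series) of how the new pieces fit together for an in-tree scheduler; the example_first_* names are hypothetical, and a real scheduler would also check subflow state (e.g. with mptcp_subflow_active()) and could distinguish data->reinject:

#include <linux/module.h>
#include "protocol.h"

static int example_first_get_subflow(struct mptcp_sock *msk,
                                     struct mptcp_sched_data *data)
{
        struct mptcp_subflow_context *subflow;

        /* Mark the first subflow on the conn_list; the get_send/get_retrans
         * wrappers then transmit (or retransmit, when data->reinject is set)
         * on every subflow whose 'scheduled' bit is true.
         */
        mptcp_for_each_subflow(msk, subflow) {
                mptcp_subflow_set_scheduled(subflow, true);
                return 0;
        }
        return -EINVAL;
}

static struct mptcp_sched_ops example_first_sched = {
        .get_subflow    = example_first_get_subflow,
        .name           = "example_first",
        .owner          = THIS_MODULE,
};

static int __init example_first_register(void)
{
        return mptcp_register_scheduler(&example_first_sched);
}
late_initcall(example_first_register);

Once registered, such a scheduler could be selected per namespace through the new net.mptcp.scheduler sysctl introduced earlier in the series.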
Reviewed-by: Mat Martineau Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Link: https://lore.kernel.org/r/20230821-upstream-net-next-20230818-v1-8-0c860fb256a8@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 113 +++++++++++++++++++++++++++++++-------------------- net/mptcp/sched.c | 13 ++++++ 2 files changed, 81 insertions(+), 45 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 9cd172d2c8d6..77e94ee82859 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1377,15 +1377,6 @@ struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) u64 linger_time; long tout = 0; - msk_owned_by_me(msk); - - if (__mptcp_check_fallback(msk)) { - if (!msk->first) - return NULL; - return __tcp_can_send(msk->first) && - sk_stream_memory_free(msk->first) ? msk->first : NULL; - } - /* pick the subflow with the lower wmem/wspace ratio */ for (i = 0; i < SSK_MODE_MAX; ++i) { send_info[i].ssk = NULL; @@ -1538,43 +1529,56 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags) .flags = flags, }; bool do_check_data_fin = false; + int push_count = 1; - while (mptcp_send_head(sk)) { + while (mptcp_send_head(sk) && (push_count > 0)) { + struct mptcp_subflow_context *subflow; int ret = 0; - prev_ssk = ssk; - ssk = mptcp_subflow_get_send(msk); + if (mptcp_sched_get_send(msk)) + break; - /* First check. If the ssk has changed since - * the last round, release prev_ssk - */ - if (ssk != prev_ssk && prev_ssk) - mptcp_push_release(prev_ssk, &info); - if (!ssk) - goto out; + push_count = 0; - /* Need to lock the new subflow only if different - * from the previous one, otherwise we are still - * helding the relevant lock - */ - if (ssk != prev_ssk) - lock_sock(ssk); + mptcp_for_each_subflow(msk, subflow) { + if (READ_ONCE(subflow->scheduled)) { + mptcp_subflow_set_scheduled(subflow, false); - ret = __subflow_push_pending(sk, ssk, &info); - if (ret <= 0) { - if (ret == -EAGAIN) - continue; - mptcp_push_release(ssk, &info); - goto out; + prev_ssk = ssk; + ssk = mptcp_subflow_tcp_sock(subflow); + if (ssk != prev_ssk) { + /* First check. If the ssk has changed since + * the last round, release prev_ssk + */ + if (prev_ssk) + mptcp_push_release(prev_ssk, &info); + + /* Need to lock the new subflow only if different + * from the previous one, otherwise we are still + * helding the relevant lock + */ + lock_sock(ssk); + } + + push_count++; + + ret = __subflow_push_pending(sk, ssk, &info); + if (ret <= 0) { + if (ret != -EAGAIN || + (1 << ssk->sk_state) & + (TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSE)) + push_count--; + continue; + } + do_check_data_fin = true; + } } - do_check_data_fin = true; } /* at this point we held the socket lock for the last subflow we used */ if (ssk) mptcp_push_release(ssk, &info); -out: /* ensure the rtx timer is running */ if (!mptcp_timer_pending(sk)) mptcp_reset_timer(sk); @@ -1588,30 +1592,49 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk, bool struct mptcp_sendmsg_info info = { .data_lock_held = true, }; + bool keep_pushing = true; struct sock *xmit_ssk; int copied = 0; info.flags = 0; - while (mptcp_send_head(sk)) { + while (mptcp_send_head(sk) && keep_pushing) { + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); int ret = 0; /* check for a different subflow usage only after * spooling the first chunk of data */ - xmit_ssk = first ? 
ssk : mptcp_subflow_get_send(msk); - if (!xmit_ssk) - goto out; - if (xmit_ssk != ssk) { - mptcp_subflow_delegate(mptcp_subflow_ctx(xmit_ssk), - MPTCP_DELEGATE_SEND); + if (first) { + mptcp_subflow_set_scheduled(subflow, false); + ret = __subflow_push_pending(sk, ssk, &info); + first = false; + if (ret <= 0) + break; + copied += ret; + continue; + } + + if (mptcp_sched_get_send(msk)) goto out; + + if (READ_ONCE(subflow->scheduled)) { + mptcp_subflow_set_scheduled(subflow, false); + ret = __subflow_push_pending(sk, ssk, &info); + if (ret <= 0) + keep_pushing = false; + copied += ret; } - ret = __subflow_push_pending(sk, ssk, &info); - first = false; - if (ret <= 0) - break; - copied += ret; + mptcp_for_each_subflow(msk, subflow) { + if (READ_ONCE(subflow->scheduled)) { + xmit_ssk = mptcp_subflow_tcp_sock(subflow); + if (xmit_ssk != ssk) { + mptcp_subflow_delegate(subflow, + MPTCP_DELEGATE_SEND); + keep_pushing = false; + } + } + } } out: diff --git a/net/mptcp/sched.c b/net/mptcp/sched.c index 884606686cfe..078b5d44978d 100644 --- a/net/mptcp/sched.c +++ b/net/mptcp/sched.c @@ -99,6 +99,19 @@ int mptcp_sched_get_send(struct mptcp_sock *msk) struct mptcp_subflow_context *subflow; struct mptcp_sched_data data; + msk_owned_by_me(msk); + + /* the following check is moved out of mptcp_subflow_get_send */ + if (__mptcp_check_fallback(msk)) { + if (msk->first && + __tcp_can_send(msk->first) && + sk_stream_memory_free(msk->first)) { + mptcp_subflow_set_scheduled(mptcp_subflow_ctx(msk->first), true); + return 0; + } + return -EINVAL; + } + mptcp_for_each_subflow(msk, subflow) { if (READ_ONCE(subflow->scheduled)) return 0; -- cgit v1.2.3 From ee2708aedad00544d38dba6df88efeb4a330bd66 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Mon, 21 Aug 2023 15:25:20 -0700 Subject: mptcp: use get_retrans wrapper This patch adds the multiple subflows support for __mptcp_retrans(). Use get_retrans() wrapper instead of mptcp_subflow_get_retrans() in it. Check the subflow scheduled flags to test which subflow or subflows are picked by the scheduler, use them to send data. Move msk_owned_by_me() and fallback checks into get_retrans() wrapper from mptcp_subflow_get_retrans(). 
Reviewed-by: Mat Martineau Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Link: https://lore.kernel.org/r/20230821-upstream-net-next-20230818-v1-9-0c860fb256a8@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 65 ++++++++++++++++++++++++++++++---------------------- net/mptcp/sched.c | 6 +++++ 2 files changed, 43 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 77e94ee82859..61590ff2b9ee 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2233,11 +2233,6 @@ struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk) struct mptcp_subflow_context *subflow; int min_stale_count = INT_MAX; - msk_owned_by_me(msk); - - if (__mptcp_check_fallback(msk)) - return NULL; - mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); @@ -2515,16 +2510,17 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk) static void __mptcp_retrans(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); + struct mptcp_subflow_context *subflow; struct mptcp_sendmsg_info info = {}; struct mptcp_data_frag *dfrag; - size_t copied = 0; struct sock *ssk; - int ret; + int ret, err; + u16 len = 0; mptcp_clean_una_wakeup(sk); /* first check ssk: need to kick "stale" logic */ - ssk = mptcp_subflow_get_retrans(msk); + err = mptcp_sched_get_retrans(msk); dfrag = mptcp_rtx_head(sk); if (!dfrag) { if (mptcp_data_fin_enabled(msk)) { @@ -2543,32 +2539,45 @@ static void __mptcp_retrans(struct sock *sk) goto reset_timer; } - if (!ssk) + if (err) goto reset_timer; - lock_sock(ssk); + mptcp_for_each_subflow(msk, subflow) { + if (READ_ONCE(subflow->scheduled)) { + u16 copied = 0; - /* limit retransmission to the bytes already sent on some subflows */ - info.sent = 0; - info.limit = READ_ONCE(msk->csum_enabled) ? dfrag->data_len : dfrag->already_sent; - while (info.sent < info.limit) { - ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); - if (ret <= 0) - break; + mptcp_subflow_set_scheduled(subflow, false); - MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS); - copied += ret; - info.sent += ret; - } - if (copied) { - dfrag->already_sent = max(dfrag->already_sent, info.sent); - msk->bytes_retrans += copied; - tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, - info.size_goal); - WRITE_ONCE(msk->allow_infinite_fallback, false); + ssk = mptcp_subflow_tcp_sock(subflow); + + lock_sock(ssk); + + /* limit retransmission to the bytes already sent on some subflows */ + info.sent = 0; + info.limit = READ_ONCE(msk->csum_enabled) ? 
dfrag->data_len : + dfrag->already_sent; + while (info.sent < info.limit) { + ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); + if (ret <= 0) + break; + + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS); + copied += ret; + info.sent += ret; + } + if (copied) { + len = max(copied, len); + tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, + info.size_goal); + WRITE_ONCE(msk->allow_infinite_fallback, false); + } + + release_sock(ssk); + } } - release_sock(ssk); + msk->bytes_retrans += len; + dfrag->already_sent = max(dfrag->already_sent, len); reset_timer: mptcp_check_and_set_pending(sk); diff --git a/net/mptcp/sched.c b/net/mptcp/sched.c index 078b5d44978d..cac1cc1fa3b0 100644 --- a/net/mptcp/sched.c +++ b/net/mptcp/sched.c @@ -136,6 +136,12 @@ int mptcp_sched_get_retrans(struct mptcp_sock *msk) struct mptcp_subflow_context *subflow; struct mptcp_sched_data data; + msk_owned_by_me(msk); + + /* the following check is moved out of mptcp_subflow_get_retrans */ + if (__mptcp_check_fallback(msk)) + return -EINVAL; + mptcp_for_each_subflow(msk, subflow) { if (READ_ONCE(subflow->scheduled)) return 0; -- cgit v1.2.3 From ed1ad86b8527f8f864df3c182adbfcd12a445de6 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Mon, 21 Aug 2023 15:25:21 -0700 Subject: mptcp: register default scheduler This patch defines the default packet scheduler mptcp_sched_default. Register it in mptcp_sched_init(), which is invoked in mptcp_proto_init(). Skip deleting this default scheduler in mptcp_unregister_scheduler(). Set msk->sched to the default scheduler when the input parameter of mptcp_init_sched() is NULL. Invoke mptcp_sched_default_get_subflow in get_send() and get_retrans() if the defaut scheduler is set or msk->sched is NULL. Acked-by: Paolo Abeni Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Link: https://lore.kernel.org/r/20230821-upstream-net-next-20230818-v1-10-0c860fb256a8@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 1 + net/mptcp/protocol.h | 1 + net/mptcp/sched.c | 55 +++++++++++++++++++++++++++++++--------------------- 3 files changed, 35 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 61590ff2b9ee..933b257eee02 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3965,6 +3965,7 @@ void __init mptcp_proto_init(void) mptcp_subflow_init(); mptcp_pm_init(); + mptcp_sched_init(); mptcp_token_init(); if (proto_register(&mptcp_prot, 1) != 0) diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 78562f695c46..7254b3562575 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -661,6 +661,7 @@ void mptcp_info2sockaddr(const struct mptcp_addr_info *info, struct mptcp_sched_ops *mptcp_sched_find(const char *name); int mptcp_register_scheduler(struct mptcp_sched_ops *sched); void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched); +void mptcp_sched_init(void); int mptcp_init_sched(struct mptcp_sock *msk, struct mptcp_sched_ops *sched); void mptcp_release_sched(struct mptcp_sock *msk); diff --git a/net/mptcp/sched.c b/net/mptcp/sched.c index cac1cc1fa3b0..4ab0693c069c 100644 --- a/net/mptcp/sched.c +++ b/net/mptcp/sched.c @@ -16,6 +16,26 @@ static DEFINE_SPINLOCK(mptcp_sched_list_lock); static LIST_HEAD(mptcp_sched_list); +static int mptcp_sched_default_get_subflow(struct mptcp_sock *msk, + struct mptcp_sched_data *data) +{ + struct sock *ssk; + + ssk = data->reinject ? 
mptcp_subflow_get_retrans(msk) : + mptcp_subflow_get_send(msk); + if (!ssk) + return -EINVAL; + + mptcp_subflow_set_scheduled(mptcp_subflow_ctx(ssk), true); + return 0; +} + +static struct mptcp_sched_ops mptcp_sched_default = { + .get_subflow = mptcp_sched_default_get_subflow, + .name = "default", + .owner = THIS_MODULE, +}; + /* Must be called with rcu read lock held */ struct mptcp_sched_ops *mptcp_sched_find(const char *name) { @@ -50,16 +70,24 @@ int mptcp_register_scheduler(struct mptcp_sched_ops *sched) void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched) { + if (sched == &mptcp_sched_default) + return; + spin_lock(&mptcp_sched_list_lock); list_del_rcu(&sched->list); spin_unlock(&mptcp_sched_list_lock); } +void mptcp_sched_init(void) +{ + mptcp_register_scheduler(&mptcp_sched_default); +} + int mptcp_init_sched(struct mptcp_sock *msk, struct mptcp_sched_ops *sched) { if (!sched) - goto out; + sched = &mptcp_sched_default; if (!bpf_try_module_get(sched, sched->owner)) return -EBUSY; @@ -70,7 +98,6 @@ int mptcp_init_sched(struct mptcp_sock *msk, pr_debug("sched=%s", msk->sched->name); -out: return 0; } @@ -117,17 +144,9 @@ int mptcp_sched_get_send(struct mptcp_sock *msk) return 0; } - if (!msk->sched) { - struct sock *ssk; - - ssk = mptcp_subflow_get_send(msk); - if (!ssk) - return -EINVAL; - mptcp_subflow_set_scheduled(mptcp_subflow_ctx(ssk), true); - return 0; - } - data.reinject = false; + if (msk->sched == &mptcp_sched_default || !msk->sched) + return mptcp_sched_default_get_subflow(msk, &data); return msk->sched->get_subflow(msk, &data); } @@ -147,16 +166,8 @@ int mptcp_sched_get_retrans(struct mptcp_sock *msk) return 0; } - if (!msk->sched) { - struct sock *ssk; - - ssk = mptcp_subflow_get_retrans(msk); - if (!ssk) - return -EINVAL; - mptcp_subflow_set_scheduled(mptcp_subflow_ctx(ssk), true); - return 0; - } - data.reinject = true; + if (msk->sched == &mptcp_sched_default || !msk->sched) + return mptcp_sched_default_get_subflow(msk, &data); return msk->sched->get_subflow(msk, &data); } -- cgit v1.2.3 From 0bdf399342c5acbd817c9098b6c7ed21f1974312 Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Mon, 21 Aug 2023 16:45:23 -0500 Subject: net: Avoid address overwrite in kernel_connect BPF programs that run on connect can rewrite the connect address. For the connect system call this isn't a problem, because a copy of the address is made when it is moved into kernel space. However, kernel_connect simply passes through the address it is given, so the caller may observe its address value unexpectedly change. A practical example where this is problematic is where NFS is combined with a system such as Cilium which implements BPF-based load balancing. A common pattern in software-defined storage systems is to have an NFS mount that connects to a persistent virtual IP which in turn maps to an ephemeral server IP. This is usually done to achieve high availability: if your server goes down you can quickly spin up a replacement and remap the virtual IP to that endpoint. With BPF-based load balancing, mounts will forget the virtual IP address when the address rewrite occurs because a pointer to the only copy of that address is passed down the stack. Server failover then breaks, because clients have forgotten the virtual IP address. Reconnects fail and mounts remain broken. This patch was tested by setting up a scenario like this and ensuring that NFS reconnects worked after applying the patch. Signed-off-by: Jordan Rife Signed-off-by: David S. 
Miller --- net/socket.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/socket.c b/net/socket.c index fdb5233bf560..848116d06b51 100644 --- a/net/socket.c +++ b/net/socket.c @@ -3567,7 +3567,12 @@ EXPORT_SYMBOL(kernel_accept); int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen, int flags) { - return READ_ONCE(sock->ops)->connect(sock, addr, addrlen, flags); + struct sockaddr_storage address; + + memcpy(&address, addr, addrlen); + + return READ_ONCE(sock->ops)->connect(sock, (struct sockaddr *)&address, + addrlen, flags); } EXPORT_SYMBOL(kernel_connect); -- cgit v1.2.3 From 8da1985ff75226fd758ef379f9dd98986c811704 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 23 Aug 2023 18:32:25 +0800 Subject: wifi: mac80211: Do not include crypto/algapi.h The header file crypto/algapi.h is for internal use only. Use the header file crypto/utils.h instead. Signed-off-by: Herbert Xu Link: https://lore.kernel.org/r/E1qYlA0-006vFr-Ts@formenos.hmeau.com Signed-off-by: Johannes Berg --- net/mac80211/fils_aead.c | 2 +- net/mac80211/key.c | 2 +- net/mac80211/wpa.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mac80211/fils_aead.c b/net/mac80211/fils_aead.c index e1d4cfd99128..912c46f74d24 100644 --- a/net/mac80211/fils_aead.c +++ b/net/mac80211/fils_aead.c @@ -5,9 +5,9 @@ */ #include -#include #include #include +#include #include "ieee80211_i.h" #include "aes_cmac.h" diff --git a/net/mac80211/key.c b/net/mac80211/key.c index 21cf5a208910..13050dc9321f 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -9,6 +9,7 @@ * Copyright 2018-2020, 2022-2023 Intel Corporation */ +#include #include #include #include @@ -17,7 +18,6 @@ #include #include #include -#include #include #include "ieee80211_i.h" #include "driver-ops.h" diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c index 4133496da378..2d8e38b3bcb5 100644 --- a/net/mac80211/wpa.c +++ b/net/mac80211/wpa.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include "ieee80211_i.h" #include "michael.h" -- cgit v1.2.3 From 5d21d0a65b573507bae774708199328b38dedfe6 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Wed, 23 Aug 2023 11:28:38 +0200 Subject: net: generalize calculation of skb extensions length MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the necessity to modify skb_ext_total_length() when new extension types are added. Also reduces the line count a bit. With optimizations enabled the function is folded down to the same constant value as before during compilation. This has been validated on x86 with GCC 6.5.0 and 13.2.1. Also a similar construct has been validated on godbolt.org with GCC 5.1. In any case the compiler has to be able to evaluate the construct at compile-time for the BUILD_BUG_ON() in skb_extensions_init(). Even if not evaluated at compile-time this function would only ever be executed once at run-time, so the overhead would be very minuscule. 
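The trick relied on here is that, with optimizations enabled, the compiler folds a loop over a small constant array into a single constant, which is what lets the BUILD_BUG_ON() in skb_extensions_init() keep working. A stand-alone user-space analogue of the same pattern (a sketch, not kernel code; the array values are arbitrary):

#include <assert.h>
#include <stddef.h>

static const unsigned char type_len[] = { 4, 8, 2, 16 };

static inline unsigned int total_length(void)
{
	unsigned int l = 0;
	size_t i;

	/* with -O2, GCC evaluates this loop at compile time and the
	 * function collapses to the constant 30 */
	for (i = 0; i < sizeof(type_len) / sizeof(type_len[0]); i++)
		l += type_len[i];

	return l;
}

int main(void)
{
	assert(total_length() == 4 + 8 + 2 + 16);
	return 0;
}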
Signed-off-by: Thomas Weißschuh Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230823-skb_ext-simplify-v2-1-66e26cd66860@weissschuh.net Signed-off-by: Jakub Kicinski --- net/core/skbuff.c | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index faa6c86da2a5..45707059082f 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4785,23 +4785,13 @@ static const u8 skb_ext_type_len[] = { static __always_inline unsigned int skb_ext_total_length(void) { - return SKB_EXT_CHUNKSIZEOF(struct skb_ext) + -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - skb_ext_type_len[SKB_EXT_BRIDGE_NF] + -#endif -#ifdef CONFIG_XFRM - skb_ext_type_len[SKB_EXT_SEC_PATH] + -#endif -#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) - skb_ext_type_len[TC_SKB_EXT] + -#endif -#if IS_ENABLED(CONFIG_MPTCP) - skb_ext_type_len[SKB_EXT_MPTCP] + -#endif -#if IS_ENABLED(CONFIG_MCTP_FLOWS) - skb_ext_type_len[SKB_EXT_MCTP] + -#endif - 0; + unsigned int l = SKB_EXT_CHUNKSIZEOF(struct skb_ext); + int i; + + for (i = 0; i < ARRAY_SIZE(skb_ext_type_len); i++) + l += skb_ext_type_len[i]; + + return l; } static void skb_extensions_init(void) -- cgit v1.2.3 From 94d9ba9f9888b748d4abd2aa1547af56ae85f772 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 9 Aug 2023 16:49:33 -0700 Subject: Bluetooth: hci_sync: Fix UAF in hci_disconnect_all_sync Use-after-free can occur in hci_disconnect_all_sync if a connection is deleted by concurrent processing of a controller event. To prevent this the code now tries to iterate over the list backwards to ensure the links are cleanup before its parents, also it no longer relies on a cursor, instead it always uses the last element since hci_abort_conn_sync is guaranteed to call hci_conn_del. UAF crash log: ================================================================== BUG: KASAN: slab-use-after-free in hci_set_powered_sync (net/bluetooth/hci_sync.c:5424) [bluetooth] Read of size 8 at addr ffff888009d9c000 by task kworker/u9:0/124 CPU: 0 PID: 124 Comm: kworker/u9:0 Tainted: G W 6.5.0-rc1+ #10 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-1.fc38 04/01/2014 Workqueue: hci0 hci_cmd_sync_work [bluetooth] Call Trace: dump_stack_lvl+0x5b/0x90 print_report+0xcf/0x670 ? __virt_addr_valid+0xdd/0x160 ? hci_set_powered_sync+0x2c9/0x4a0 [bluetooth] kasan_report+0xa6/0xe0 ? hci_set_powered_sync+0x2c9/0x4a0 [bluetooth] ? __pfx_set_powered_sync+0x10/0x10 [bluetooth] hci_set_powered_sync+0x2c9/0x4a0 [bluetooth] ? __pfx_hci_set_powered_sync+0x10/0x10 [bluetooth] ? __pfx_lock_release+0x10/0x10 ? __pfx_set_powered_sync+0x10/0x10 [bluetooth] hci_cmd_sync_work+0x137/0x220 [bluetooth] process_one_work+0x526/0x9d0 ? __pfx_process_one_work+0x10/0x10 ? __pfx_do_raw_spin_lock+0x10/0x10 ? mark_held_locks+0x1a/0x90 worker_thread+0x92/0x630 ? __pfx_worker_thread+0x10/0x10 kthread+0x196/0x1e0 ? 
__pfx_kthread+0x10/0x10 ret_from_fork+0x2c/0x50 Allocated by task 1782: kasan_save_stack+0x33/0x60 kasan_set_track+0x25/0x30 __kasan_kmalloc+0x8f/0xa0 hci_conn_add+0xa5/0xa80 [bluetooth] hci_bind_cis+0x881/0x9b0 [bluetooth] iso_connect_cis+0x121/0x520 [bluetooth] iso_sock_connect+0x3f6/0x790 [bluetooth] __sys_connect+0x109/0x130 __x64_sys_connect+0x40/0x50 do_syscall_64+0x60/0x90 entry_SYSCALL_64_after_hwframe+0x6e/0xd8 Freed by task 695: kasan_save_stack+0x33/0x60 kasan_set_track+0x25/0x30 kasan_save_free_info+0x2b/0x50 __kasan_slab_free+0x10a/0x180 __kmem_cache_free+0x14d/0x2e0 device_release+0x5d/0xf0 kobject_put+0xdf/0x270 hci_disconn_complete_evt+0x274/0x3a0 [bluetooth] hci_event_packet+0x579/0x7e0 [bluetooth] hci_rx_work+0x287/0xaa0 [bluetooth] process_one_work+0x526/0x9d0 worker_thread+0x92/0x630 kthread+0x196/0x1e0 ret_from_fork+0x2c/0x50 ================================================================== Fixes: 182ee45da083 ("Bluetooth: hci_sync: Rework hci_suspend_notifier") Signed-off-by: Pauli Virtanen Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_sync.c | 55 ++++++++++++++++++++++++++++++------------------ 1 file changed, 35 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 5eb30ba21370..d10a0f36b947 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -5370,6 +5370,7 @@ int hci_abort_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason) { int err = 0; u16 handle = conn->handle; + struct hci_conn *c; switch (conn->state) { case BT_CONNECTED: @@ -5389,43 +5390,57 @@ int hci_abort_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason) hci_dev_unlock(hdev); return 0; default: + hci_dev_lock(hdev); conn->state = BT_CLOSED; + hci_disconn_cfm(conn, reason); + hci_conn_del(conn); + hci_dev_unlock(hdev); return 0; } + hci_dev_lock(hdev); + + /* Check if the connection hasn't been cleanup while waiting + * commands to complete. + */ + c = hci_conn_hash_lookup_handle(hdev, handle); + if (!c || c != conn) { + err = 0; + goto unlock; + } + /* Cleanup hci_conn object if it cannot be cancelled as it * likelly means the controller and host stack are out of sync * or in case of LE it was still scanning so it can be cleanup * safely. */ - if (err) { - struct hci_conn *c; - - /* Check if the connection hasn't been cleanup while waiting - * commands to complete. - */ - c = hci_conn_hash_lookup_handle(hdev, handle); - if (!c || c != conn) - return 0; - - hci_dev_lock(hdev); - hci_conn_failed(conn, err); - hci_dev_unlock(hdev); - } + hci_conn_failed(conn, reason); +unlock: + hci_dev_unlock(hdev); return err; } static int hci_disconnect_all_sync(struct hci_dev *hdev, u8 reason) { - struct hci_conn *conn, *tmp; - int err; + struct list_head *head = &hdev->conn_hash.list; + struct hci_conn *conn; - list_for_each_entry_safe(conn, tmp, &hdev->conn_hash.list, list) { - err = hci_abort_conn_sync(hdev, conn, reason); - if (err) - return err; + rcu_read_lock(); + while ((conn = list_first_or_null_rcu(head, struct hci_conn, list))) { + /* Make sure the connection is not freed while unlocking */ + conn = hci_conn_get(conn); + rcu_read_unlock(); + /* Disregard possible errors since hci_conn_del shall have been + * called even in case of errors had occurred since it would + * then cause hci_conn_failed to be called which calls + * hci_conn_del internally. 
+ */ + hci_abort_conn_sync(hdev, conn, reason); + hci_conn_put(conn); + rcu_read_lock(); } + rcu_read_unlock(); return 0; } -- cgit v1.2.3 From 3a15324fd4bb94c31a5fb893413b04e634567957 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Tue, 15 Aug 2023 14:15:05 -0700 Subject: Bluetooth: hci_conn: Fix sending BT_HCI_CMD_LE_CREATE_CONN_CANCEL This fixes sending BT_HCI_CMD_LE_CREATE_CONN_CANCEL when hci_le_create_conn_sync has not been called because HCI_CONN_SCANNING has been clear too early before its cmd_sync callback has been run. Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_conn.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 234746721047..95339623883c 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -1299,6 +1299,7 @@ static int hci_connect_le_sync(struct hci_dev *hdev, void *data) bt_dev_dbg(hdev, "conn %p", conn); + clear_bit(HCI_CONN_SCANNING, &conn->flags); conn->state = BT_CONNECT; return hci_le_create_conn_sync(hdev, conn); @@ -1370,8 +1371,6 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, conn->sec_level = BT_SECURITY_LOW; conn->conn_timeout = conn_timeout; - clear_bit(HCI_CONN_SCANNING, &conn->flags); - err = hci_cmd_sync_queue(hdev, hci_connect_le_sync, UINT_PTR(conn->handle), create_le_conn_complete); -- cgit v1.2.3 From fbdc4bc47268953c80853489f696e02d61f9a2c6 Mon Sep 17 00:00:00 2001 From: Iulia Tanasescu Date: Thu, 17 Aug 2023 09:44:27 +0300 Subject: Bluetooth: ISO: Use defer setup to separate PA sync and BIG sync This commit implements defer setup support for the Broadcast Sink scenario: By setting defer setup on a broadcast socket before calling listen, the user is able to trigger the PA sync and BIG sync procedures separately. This is useful if the user first wants to synchronize to the periodic advertising transmitted by a Broadcast Source, and trigger the BIG sync procedure later on. If defer setup is set, once a PA sync established event arrives, a new hcon is created and notified to the ISO layer. A child socket associated with the PA sync connection will be added to the accept queue of the listening socket. Once the accept call returns the fd for the PA sync child socket, the user should call read on that fd. This will trigger the BIG create sync procedure, and the PA sync socket will become a listening socket itself. When the BIG sync established event is notified to the ISO layer, the bis connections will be added to the accept queue of the PA sync parent. The user should call accept on the PA sync socket to get the final bis connections. 
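From the application side, the resulting Broadcast Sink sequence looks roughly like the sketch below. It only illustrates the flow described above: AF_BLUETOOTH, BTPROTO_ISO, SOL_BLUETOOTH and BT_DEFER_SETUP are real constants from the Bluetooth headers, but the header path and the caller-supplied broadcast address are assumptions, and all error handling is omitted.

#include <sys/socket.h>
#include <unistd.h>
#include <bluetooth/bluetooth.h>	/* assumed header location for AF_BLUETOOTH,
					 * BTPROTO_ISO, SOL_BLUETOOTH, BT_DEFER_SETUP */

/* 'bc_addr'/'bc_len' describe the Broadcast Source to sync to
 * (struct sockaddr_iso setup not shown) */
static int broadcast_sink(const struct sockaddr *bc_addr, socklen_t bc_len)
{
	int defer = 1, fd, pa_fd, bis_fd;
	char c;

	fd = socket(AF_BLUETOOTH, SOCK_SEQPACKET, BTPROTO_ISO);

	/* defer setup before listen() splits PA sync from BIG sync */
	setsockopt(fd, SOL_BLUETOOTH, BT_DEFER_SETUP, &defer, sizeof(defer));

	bind(fd, bc_addr, bc_len);
	listen(fd, 1);

	/* returns once PA sync is established */
	pa_fd = accept(fd, NULL, NULL);

	/* read() triggers BIG create sync; pa_fd then acts as a listener */
	read(pa_fd, &c, sizeof(c));

	/* the BIS connection(s) are delivered on the PA sync socket */
	bis_fd = accept(pa_fd, NULL, NULL);

	return bis_fd;
}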
Signed-off-by: Iulia Tanasescu Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 30 +++++++- net/bluetooth/hci_conn.c | 13 +++- net/bluetooth/hci_event.c | 41 +++++++++- net/bluetooth/hci_sync.c | 15 ++++ net/bluetooth/iso.c | 160 +++++++++++++++++++++++++++++++-------- 5 files changed, 218 insertions(+), 41 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index c53d74236e3a..6fb055e3c595 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -978,6 +978,8 @@ enum { HCI_CONN_CREATE_CIS, HCI_CONN_BIG_SYNC, HCI_CONN_BIG_SYNC_FAILED, + HCI_CONN_PA_SYNC, + HCI_CONN_PA_SYNC_FAILED, }; static inline bool hci_conn_ssp_enabled(struct hci_conn *conn) @@ -1300,7 +1302,7 @@ static inline struct hci_conn *hci_conn_hash_lookup_big_any_dst(struct hci_dev * if (c->type != ISO_LINK) continue; - if (handle == c->iso_qos.bcast.big) { + if (handle != BT_ISO_QOS_BIG_UNSET && handle == c->iso_qos.bcast.big) { rcu_read_unlock(); return c; } @@ -1311,6 +1313,29 @@ static inline struct hci_conn *hci_conn_hash_lookup_big_any_dst(struct hci_dev * return NULL; } +static inline struct hci_conn * +hci_conn_hash_lookup_pa_sync(struct hci_dev *hdev, __u8 big) +{ + struct hci_conn_hash *h = &hdev->conn_hash; + struct hci_conn *c; + + rcu_read_lock(); + + list_for_each_entry_rcu(c, &h->list, list) { + if (c->type != ISO_LINK || + !test_bit(HCI_CONN_PA_SYNC, &c->flags)) + continue; + + if (c->iso_qos.bcast.big == big) { + rcu_read_unlock(); + return c; + } + } + rcu_read_unlock(); + + return NULL; +} + static inline struct hci_conn *hci_conn_hash_lookup_state(struct hci_dev *hdev, __u8 type, __u16 state) { @@ -1435,7 +1460,8 @@ struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 data_len, __u8 *data); int hci_pa_create_sync(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type, __u8 sid, struct bt_iso_qos *qos); -int hci_le_big_create_sync(struct hci_dev *hdev, struct bt_iso_qos *qos, +int hci_le_big_create_sync(struct hci_dev *hdev, struct hci_conn *hcon, + struct bt_iso_qos *qos, __u16 sync_handle, __u8 num_bis, __u8 bis[]); int hci_conn_check_link_mode(struct hci_conn *conn); int hci_conn_check_secure(struct hci_conn *conn, __u8 sec_level); diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 95339623883c..8b0c8e631324 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -734,6 +734,7 @@ struct iso_list_data { }; int count; bool big_term; + bool pa_sync_term; bool big_sync_term; }; @@ -807,7 +808,10 @@ static int big_terminate_sync(struct hci_dev *hdev, void *data) if (d->big_sync_term) hci_le_big_terminate_sync(hdev, d->big); - return hci_le_pa_terminate_sync(hdev, d->sync_handle); + if (d->pa_sync_term) + return hci_le_pa_terminate_sync(hdev, d->sync_handle); + + return 0; } static int hci_le_big_terminate(struct hci_dev *hdev, u8 big, struct hci_conn *conn) @@ -823,6 +827,7 @@ static int hci_le_big_terminate(struct hci_dev *hdev, u8 big, struct hci_conn *c d->big = big; d->sync_handle = conn->sync_handle; + d->pa_sync_term = test_and_clear_bit(HCI_CONN_PA_SYNC, &conn->flags); d->big_sync_term = test_and_clear_bit(HCI_CONN_BIG_SYNC, &conn->flags); ret = hci_cmd_sync_queue(hdev, big_terminate_sync, d, @@ -2099,7 +2104,8 @@ int hci_pa_create_sync(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type, return hci_cmd_sync_queue(hdev, create_pa_sync, cp, create_pa_complete); } -int hci_le_big_create_sync(struct hci_dev *hdev, struct bt_iso_qos 
*qos, +int hci_le_big_create_sync(struct hci_dev *hdev, struct hci_conn *hcon, + struct bt_iso_qos *qos, __u16 sync_handle, __u8 num_bis, __u8 bis[]) { struct _packed { @@ -2115,6 +2121,9 @@ int hci_le_big_create_sync(struct hci_dev *hdev, struct bt_iso_qos *qos, if (err) return err; + if (hcon) + hcon->iso_qos.bcast.big = qos->bcast.big; + memset(&pdu, 0, sizeof(pdu)); pdu.cp.handle = qos->bcast.big; pdu.cp.sync_handle = cpu_to_le16(sync_handle); diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 559b6080706c..b4b72070f5f6 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -6581,20 +6581,39 @@ static void hci_le_pa_sync_estabilished_evt(struct hci_dev *hdev, void *data, struct hci_ev_le_pa_sync_established *ev = data; int mask = hdev->link_mode; __u8 flags = 0; + struct hci_conn *bis; bt_dev_dbg(hdev, "status 0x%2.2x", ev->status); - if (ev->status) - return; - hci_dev_lock(hdev); hci_dev_clear_flag(hdev, HCI_PA_SYNC); mask |= hci_proto_connect_ind(hdev, &ev->bdaddr, ISO_LINK, &flags); - if (!(mask & HCI_LM_ACCEPT)) + if (!(mask & HCI_LM_ACCEPT)) { hci_le_pa_term_sync(hdev, ev->handle); + goto unlock; + } + + if (!(flags & HCI_PROTO_DEFER)) + goto unlock; + + /* Add connection to indicate the PA sync event */ + bis = hci_conn_add(hdev, ISO_LINK, BDADDR_ANY, + HCI_ROLE_SLAVE); + if (!bis) + goto unlock; + + if (ev->status) + set_bit(HCI_CONN_PA_SYNC_FAILED, &bis->flags); + else + set_bit(HCI_CONN_PA_SYNC, &bis->flags); + + /* Notify connection to iso layer */ + hci_connect_cfm(bis, ev->status); + +unlock: hci_dev_unlock(hdev); } @@ -7045,6 +7064,7 @@ static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data, { struct hci_evt_le_big_sync_estabilished *ev = data; struct hci_conn *bis; + struct hci_conn *pa_sync; int i; bt_dev_dbg(hdev, "status 0x%2.2x", ev->status); @@ -7055,6 +7075,15 @@ static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data, hci_dev_lock(hdev); + if (!ev->status) { + pa_sync = hci_conn_hash_lookup_pa_sync(hdev, ev->handle); + if (pa_sync) + /* Also mark the BIG sync established event on the + * associated PA sync hcon + */ + set_bit(HCI_CONN_BIG_SYNC, &pa_sync->flags); + } + for (i = 0; i < ev->num_bis; i++) { u16 handle = le16_to_cpu(ev->bis[i]); __le32 interval; @@ -7068,6 +7097,10 @@ static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data, bis->handle = handle; } + if (ev->status != 0x42) + /* Mark PA sync as established */ + set_bit(HCI_CONN_PA_SYNC, &bis->flags); + bis->iso_qos.bcast.big = ev->handle; memset(&interval, 0, sizeof(interval)); memcpy(&interval, ev->latency, sizeof(ev->latency)); diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index d10a0f36b947..0cb780817198 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -5384,6 +5384,21 @@ int hci_abort_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason) err = hci_reject_conn_sync(hdev, conn, reason); break; case BT_OPEN: + hci_dev_lock(hdev); + + /* Cleanup bis or pa sync connections */ + if (test_and_clear_bit(HCI_CONN_BIG_SYNC_FAILED, &conn->flags) || + test_and_clear_bit(HCI_CONN_PA_SYNC_FAILED, &conn->flags)) { + hci_conn_failed(conn, reason); + } else if (test_bit(HCI_CONN_PA_SYNC, &conn->flags) || + test_bit(HCI_CONN_BIG_SYNC, &conn->flags)) { + conn->state = BT_CLOSED; + hci_disconn_cfm(conn, reason); + hci_conn_del(conn); + } + + hci_dev_unlock(hdev); + return 0; case BT_BOUND: hci_dev_lock(hdev); hci_conn_failed(conn, reason); diff --git 
a/net/bluetooth/iso.c b/net/bluetooth/iso.c index 3c03e49422c7..2a2c37789e1f 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -51,6 +51,7 @@ static void iso_sock_kill(struct sock *sk); /* iso_pinfo flags values */ enum { BT_SK_BIG_SYNC, + BT_SK_PA_SYNC, }; struct iso_pinfo { @@ -75,6 +76,8 @@ static struct bt_iso_qos default_qos; static bool check_ucast_qos(struct bt_iso_qos *qos); static bool check_bcast_qos(struct bt_iso_qos *qos); +static bool iso_match_sid(struct sock *sk, void *data); +static void iso_sock_disconn(struct sock *sk); /* ---- ISO timers ---- */ #define ISO_CONN_TIMEOUT (HZ * 40) @@ -598,6 +601,15 @@ static void iso_sock_cleanup_listen(struct sock *parent) iso_sock_kill(sk); } + /* If listening socket stands for a PA sync connection, + * properly disconnect the hcon and socket. + */ + if (iso_pi(parent)->conn && iso_pi(parent)->conn->hcon && + test_bit(HCI_CONN_PA_SYNC, &iso_pi(parent)->conn->hcon->flags)) { + iso_sock_disconn(parent); + return; + } + parent->sk_state = BT_CLOSED; sock_set_flag(parent, SOCK_ZAPPED); } @@ -619,6 +631,16 @@ static void iso_sock_kill(struct sock *sk) sock_put(sk); } +static void iso_sock_disconn(struct sock *sk) +{ + sk->sk_state = BT_DISCONN; + iso_sock_set_timer(sk, ISO_DISCONN_TIMEOUT); + iso_conn_lock(iso_pi(sk)->conn); + hci_conn_drop(iso_pi(sk)->conn->hcon); + iso_pi(sk)->conn->hcon = NULL; + iso_conn_unlock(iso_pi(sk)->conn); +} + static void __iso_sock_close(struct sock *sk) { BT_DBG("sk %p state %d socket %p", sk, sk->sk_state, sk->sk_socket); @@ -631,20 +653,19 @@ static void __iso_sock_close(struct sock *sk) case BT_CONNECT: case BT_CONNECTED: case BT_CONFIG: - if (iso_pi(sk)->conn->hcon) { - sk->sk_state = BT_DISCONN; - iso_sock_set_timer(sk, ISO_DISCONN_TIMEOUT); - iso_conn_lock(iso_pi(sk)->conn); - hci_conn_drop(iso_pi(sk)->conn->hcon); - iso_pi(sk)->conn->hcon = NULL; - iso_conn_unlock(iso_pi(sk)->conn); - } else { + if (iso_pi(sk)->conn->hcon) + iso_sock_disconn(sk); + else iso_chan_del(sk, ECONNRESET); - } break; case BT_CONNECT2: - iso_chan_del(sk, ECONNRESET); + if (iso_pi(sk)->conn->hcon && + (test_bit(HCI_CONN_PA_SYNC, &iso_pi(sk)->conn->hcon->flags) || + test_bit(HCI_CONN_PA_SYNC_FAILED, &iso_pi(sk)->conn->hcon->flags))) + iso_sock_disconn(sk); + else + iso_chan_del(sk, ECONNRESET); break; case BT_DISCONN: iso_chan_del(sk, ECONNRESET); @@ -1139,6 +1160,29 @@ static void iso_conn_defer_accept(struct hci_conn *conn) hci_send_cmd(hdev, HCI_OP_LE_ACCEPT_CIS, sizeof(cp), &cp); } +static void iso_conn_big_sync(struct sock *sk) +{ + int err; + struct hci_dev *hdev; + + hdev = hci_get_route(&iso_pi(sk)->dst, &iso_pi(sk)->src, + iso_pi(sk)->src_type); + + if (!hdev) + return; + + if (!test_and_set_bit(BT_SK_BIG_SYNC, &iso_pi(sk)->flags)) { + err = hci_le_big_create_sync(hdev, iso_pi(sk)->conn->hcon, + &iso_pi(sk)->qos, + iso_pi(sk)->sync_handle, + iso_pi(sk)->bc_num_bis, + iso_pi(sk)->bc_bis); + if (err) + bt_dev_err(hdev, "hci_le_big_create_sync: %d", + err); + } +} + static int iso_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { @@ -1151,8 +1195,15 @@ static int iso_sock_recvmsg(struct socket *sock, struct msghdr *msg, lock_sock(sk); switch (sk->sk_state) { case BT_CONNECT2: - iso_conn_defer_accept(pi->conn->hcon); - sk->sk_state = BT_CONFIG; + if (pi->conn->hcon && + test_bit(HCI_CONN_PA_SYNC, &pi->conn->hcon->flags)) { + iso_conn_big_sync(sk); + sk->sk_state = BT_LISTEN; + set_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags); + } else { + iso_conn_defer_accept(pi->conn->hcon); + sk->sk_state 
= BT_CONFIG; + } release_sock(sk); return 0; case BT_CONNECT: @@ -1513,11 +1564,17 @@ static bool iso_match_big(struct sock *sk, void *data) return ev->handle == iso_pi(sk)->qos.bcast.big; } +static bool iso_match_pa_sync_flag(struct sock *sk, void *data) +{ + return test_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags); +} + static void iso_conn_ready(struct iso_conn *conn) { - struct sock *parent; + struct sock *parent = NULL; struct sock *sk = conn->sk; - struct hci_ev_le_big_sync_estabilished *ev; + struct hci_ev_le_big_sync_estabilished *ev = NULL; + struct hci_ev_le_pa_sync_established *ev2 = NULL; struct hci_conn *hcon; BT_DBG("conn %p", conn); @@ -1529,15 +1586,32 @@ static void iso_conn_ready(struct iso_conn *conn) if (!hcon) return; - ev = hci_recv_event_data(hcon->hdev, - HCI_EVT_LE_BIG_SYNC_ESTABILISHED); - if (ev) + if (test_bit(HCI_CONN_BIG_SYNC, &hcon->flags) || + test_bit(HCI_CONN_BIG_SYNC_FAILED, &hcon->flags)) { + ev = hci_recv_event_data(hcon->hdev, + HCI_EVT_LE_BIG_SYNC_ESTABILISHED); + + /* Get reference to PA sync parent socket, if it exists */ parent = iso_get_sock_listen(&hcon->src, &hcon->dst, - iso_match_big, ev); - else + iso_match_pa_sync_flag, NULL); + if (!parent && ev) + parent = iso_get_sock_listen(&hcon->src, + &hcon->dst, + iso_match_big, ev); + } else if (test_bit(HCI_CONN_PA_SYNC, &hcon->flags) || + test_bit(HCI_CONN_PA_SYNC_FAILED, &hcon->flags)) { + ev2 = hci_recv_event_data(hcon->hdev, + HCI_EV_LE_PA_SYNC_ESTABLISHED); + if (ev2) + parent = iso_get_sock_listen(&hcon->src, + &hcon->dst, + iso_match_sid, ev2); + } + + if (!parent) parent = iso_get_sock_listen(&hcon->src, - BDADDR_ANY, NULL, NULL); + BDADDR_ANY, NULL, NULL); if (!parent) return; @@ -1554,11 +1628,17 @@ static void iso_conn_ready(struct iso_conn *conn) iso_sock_init(sk, parent); bacpy(&iso_pi(sk)->src, &hcon->src); - iso_pi(sk)->src_type = hcon->src_type; + + /* Convert from HCI to three-value type */ + if (hcon->src_type == ADDR_LE_DEV_PUBLIC) + iso_pi(sk)->src_type = BDADDR_LE_PUBLIC; + else + iso_pi(sk)->src_type = BDADDR_LE_RANDOM; /* If hcon has no destination address (BDADDR_ANY) it means it - * was created by HCI_EV_LE_BIG_SYNC_ESTABILISHED so we need to - * initialize using the parent socket destination address. + * was created by HCI_EV_LE_BIG_SYNC_ESTABILISHED or + * HCI_EV_LE_PA_SYNC_ESTABLISHED so we need to initialize using + * the parent socket destination address. 
*/ if (!bacmp(&hcon->dst, BDADDR_ANY)) { bacpy(&hcon->dst, &iso_pi(parent)->dst); @@ -1566,13 +1646,21 @@ static void iso_conn_ready(struct iso_conn *conn) hcon->sync_handle = iso_pi(parent)->sync_handle; } + if (ev2 && !ev2->status) { + iso_pi(sk)->sync_handle = iso_pi(parent)->sync_handle; + iso_pi(sk)->qos = iso_pi(parent)->qos; + iso_pi(sk)->bc_num_bis = iso_pi(parent)->bc_num_bis; + memcpy(iso_pi(sk)->bc_bis, iso_pi(parent)->bc_bis, ISO_MAX_NUM_BIS); + } + bacpy(&iso_pi(sk)->dst, &hcon->dst); iso_pi(sk)->dst_type = hcon->dst_type; hci_conn_hold(hcon); iso_chan_add(conn, sk, parent); - if (ev && ((struct hci_evt_le_big_sync_estabilished *)ev)->status) { + if ((ev && ((struct hci_evt_le_big_sync_estabilished *)ev)->status) || + (ev2 && ev2->status)) { /* Trigger error signal on child socket */ sk->sk_err = ECONNREFUSED; sk->sk_error_report(sk); @@ -1630,7 +1718,7 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) if (ev1) { sk = iso_get_sock_listen(&hdev->bdaddr, bdaddr, iso_match_sid, ev1); - if (sk) + if (sk && !ev1->status) iso_pi(sk)->sync_handle = le16_to_cpu(ev1->handle); goto done; @@ -1638,16 +1726,21 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) ev2 = hci_recv_event_data(hdev, HCI_EVT_LE_BIG_INFO_ADV_REPORT); if (ev2) { + /* Try to get PA sync listening socket, if it exists */ sk = iso_get_sock_listen(&hdev->bdaddr, bdaddr, - iso_match_sync_handle, ev2); + iso_match_pa_sync_flag, NULL); + if (!sk) + sk = iso_get_sock_listen(&hdev->bdaddr, bdaddr, + iso_match_sync_handle, ev2); if (sk) { int err; if (ev2->num_bis < iso_pi(sk)->bc_num_bis) iso_pi(sk)->bc_num_bis = ev2->num_bis; - if (!test_and_set_bit(BT_SK_BIG_SYNC, &iso_pi(sk)->flags)) { - err = hci_le_big_create_sync(hdev, + if (!test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags) && + !test_and_set_bit(BT_SK_BIG_SYNC, &iso_pi(sk)->flags)) { + err = hci_le_big_create_sync(hdev, NULL, &iso_pi(sk)->qos, iso_pi(sk)->sync_handle, iso_pi(sk)->bc_num_bis, @@ -1699,12 +1792,13 @@ static void iso_connect_cfm(struct hci_conn *hcon, __u8 status) BT_DBG("hcon %p bdaddr %pMR status %d", hcon, &hcon->dst, status); - /* Similar to the success case, if HCI_CONN_BIG_SYNC_FAILED is set, - * queue the failed bis connection into the accept queue of the - * listening socket and wake up userspace, to inform the user about - * the BIG sync failed event. + /* Similar to the success case, if HCI_CONN_BIG_SYNC_FAILED or + * HCI_CONN_PA_SYNC_FAILED is set, queue the failed connection + * into the accept queue of the listening socket and wake up + * userspace, to inform the user about the event. */ - if (!status || test_bit(HCI_CONN_BIG_SYNC_FAILED, &hcon->flags)) { + if (!status || test_bit(HCI_CONN_BIG_SYNC_FAILED, &hcon->flags) || + test_bit(HCI_CONN_PA_SYNC_FAILED, &hcon->flags)) { struct iso_conn *conn; conn = iso_conn_add(hcon); -- cgit v1.2.3 From db08722fc7d46168fe31d9b8a7b29229dd959f9f Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 18 Aug 2023 14:19:27 -0700 Subject: Bluetooth: hci_core: Fix missing instances using HCI_MAX_AD_LENGTH There a few instances still using HCI_MAX_AD_LENGTH instead of using max_adv_len which takes care of detecting what is the actual maximum length depending on if the controller supports EA or not. 
Fixes: 112b5090c219 ("Bluetooth: MGMT: Fix always using HCI_MAX_AD_LENGTH") Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 6 +++--- net/bluetooth/eir.c | 2 +- net/bluetooth/mgmt.c | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 6fb055e3c595..6e2988b11f99 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -83,7 +83,7 @@ struct discovery_state { u8 last_adv_addr_type; s8 last_adv_rssi; u32 last_adv_flags; - u8 last_adv_data[HCI_MAX_AD_LENGTH]; + u8 last_adv_data[HCI_MAX_EXT_AD_LENGTH]; u8 last_adv_data_len; bool report_invalid_rssi; bool result_filtering; @@ -290,7 +290,7 @@ struct adv_pattern { __u8 ad_type; __u8 offset; __u8 length; - __u8 value[HCI_MAX_AD_LENGTH]; + __u8 value[HCI_MAX_EXT_AD_LENGTH]; }; struct adv_rssi_thresholds { @@ -726,7 +726,7 @@ struct hci_conn { __u16 le_conn_interval; __u16 le_conn_latency; __u16 le_supv_timeout; - __u8 le_adv_data[HCI_MAX_AD_LENGTH]; + __u8 le_adv_data[HCI_MAX_EXT_AD_LENGTH]; __u8 le_adv_data_len; __u8 le_per_adv_data[HCI_MAX_PER_AD_LENGTH]; __u8 le_per_adv_data_len; diff --git a/net/bluetooth/eir.c b/net/bluetooth/eir.c index 8a85f6cdfbc1..9214189279e8 100644 --- a/net/bluetooth/eir.c +++ b/net/bluetooth/eir.c @@ -33,7 +33,7 @@ u8 eir_append_local_name(struct hci_dev *hdev, u8 *ptr, u8 ad_len) size_t complete_len; /* no space left for name (+ NULL + type + len) */ - if ((HCI_MAX_AD_LENGTH - ad_len) < HCI_MAX_SHORT_NAME_LENGTH + 3) + if ((max_adv_len(hdev) - ad_len) < HCI_MAX_SHORT_NAME_LENGTH + 3) return ad_len; /* use complete name if present and fits */ diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index d6c9b7bc8592..ba2e00646e8e 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -5381,9 +5381,9 @@ static u8 parse_adv_monitor_pattern(struct adv_monitor *m, u8 pattern_count, for (i = 0; i < pattern_count; i++) { offset = patterns[i].offset; length = patterns[i].length; - if (offset >= HCI_MAX_AD_LENGTH || - length > HCI_MAX_AD_LENGTH || - (offset + length) > HCI_MAX_AD_LENGTH) + if (offset >= HCI_MAX_EXT_AD_LENGTH || + length > HCI_MAX_EXT_AD_LENGTH || + (offset + length) > HCI_MAX_EXT_AD_LENGTH) return MGMT_STATUS_INVALID_PARAMS; p = kmalloc(sizeof(*p), GFP_KERNEL); -- cgit v1.2.3 From 3344d318337d9dca928fd448e966557ec5063f85 Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Sat, 19 Aug 2023 16:33:36 +0300 Subject: Bluetooth: hci_conn: fail SCO/ISO via hci_conn_failed if ACL gone early Not calling hci_(dis)connect_cfm before deleting conn referred to by a socket generally results to use-after-free. When cleaning up SCO connections when the parent ACL is deleted too early, use hci_conn_failed to do the connection cleanup properly. We also need to clean up ISO connections in a similar situation when connecting has started but LE Create CIS is not yet sent, so do it too here. 
Fixes: ca1fd42e7dbf ("Bluetooth: Fix potential double free caused by hci_conn_unlink") Reported-by: syzbot+cf54c1da6574b6c1b049@syzkaller.appspotmail.com Closes: https://lore.kernel.org/linux-bluetooth/00000000000013b93805fbbadc50@google.com/ Signed-off-by: Pauli Virtanen Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_conn.c | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 8b0c8e631324..9d5057cef30a 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -1044,6 +1044,29 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst, return conn; } +static void hci_conn_cleanup_child(struct hci_conn *conn, u8 reason) +{ + if (!reason) + reason = HCI_ERROR_REMOTE_USER_TERM; + + /* Due to race, SCO/ISO conn might be not established yet at this point, + * and nothing else will clean it up. In other cases it is done via HCI + * events. + */ + switch (conn->type) { + case SCO_LINK: + case ESCO_LINK: + if (HCI_CONN_HANDLE_UNSET(conn->handle)) + hci_conn_failed(conn, reason); + break; + case ISO_LINK: + if (conn->state != BT_CONNECTED && + !test_bit(HCI_CONN_CREATE_CIS, &conn->flags)) + hci_conn_failed(conn, reason); + break; + } +} + static void hci_conn_unlink(struct hci_conn *conn) { struct hci_dev *hdev = conn->hdev; @@ -1066,14 +1089,7 @@ static void hci_conn_unlink(struct hci_conn *conn) if (!test_bit(HCI_UP, &hdev->flags)) continue; - /* Due to race, SCO connection might be not established - * yet at this point. Delete it now, otherwise it is - * possible for it to be stuck and can't be deleted. - */ - if ((child->type == SCO_LINK || - child->type == ESCO_LINK) && - HCI_CONN_HANDLE_UNSET(child->handle)) - hci_conn_del(child); + hci_conn_cleanup_child(child, conn->abort_reason); } return; -- cgit v1.2.3 From 9c0826310bfb784c9bac7d1d9454e304185446c5 Mon Sep 17 00:00:00 2001 From: Claudia Draghicescu Date: Fri, 30 Jun 2023 12:59:28 +0300 Subject: Bluetooth: ISO: Add support for periodic adv reports processing In the case of a Periodic Synchronized Receiver, the PA report received from a Broadcaster contains the BASE, which has information about codec and other parameters of a BIG. This information is stored and the application can retrieve it using getsockopt(BT_ISO_BASE).
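For reference, retrieving the stored BASE from user space is a thin wrapper around getsockopt(); the sketch below is illustrative only (SOL_BLUETOOTH and BT_ISO_BASE come from the Bluetooth headers, the header path is an assumption, and error handling is left to the caller).

#include <stdint.h>
#include <sys/socket.h>
#include <bluetooth/bluetooth.h>	/* assumed header location for SOL_BLUETOOTH, BT_ISO_BASE */

/* Read the BASE stored by the kernel on a connected Broadcast Sink socket.
 * On success *len holds the number of valid bytes written into base[]. */
static int read_base(int fd, uint8_t *base, socklen_t *len)
{
	return getsockopt(fd, SOL_BLUETOOTH, BT_ISO_BASE, base, len);
}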
Signed-off-by: Claudia Draghicescu Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci.h | 11 +++++++++++ net/bluetooth/hci_event.c | 23 +++++++++++++++++++++++ net/bluetooth/iso.c | 28 +++++++++++++++++++++++++++- 3 files changed, 61 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 5723405b833e..c58425d4c592 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -2771,6 +2771,17 @@ struct hci_ev_le_enh_conn_complete { __u8 clk_accurancy; } __packed; +#define HCI_EV_LE_PER_ADV_REPORT 0x0f +struct hci_ev_le_per_adv_report { + __le16 sync_handle; + __u8 tx_power; + __u8 rssi; + __u8 cte_type; + __u8 data_status; + __u8 length; + __u8 data[]; +} __packed; + #define HCI_EV_LE_EXT_ADV_SET_TERM 0x12 struct hci_evt_le_ext_adv_set_term { __u8 status; diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index b4b72070f5f6..35f251041eeb 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -6617,6 +6617,24 @@ unlock: hci_dev_unlock(hdev); } +static void hci_le_per_adv_report_evt(struct hci_dev *hdev, void *data, + struct sk_buff *skb) +{ + struct hci_ev_le_per_adv_report *ev = data; + int mask = hdev->link_mode; + __u8 flags = 0; + + bt_dev_dbg(hdev, "sync_handle 0x%4.4x", le16_to_cpu(ev->sync_handle)); + + hci_dev_lock(hdev); + + mask |= hci_proto_connect_ind(hdev, BDADDR_ANY, ISO_LINK, &flags); + if (!(mask & HCI_LM_ACCEPT)) + hci_le_pa_term_sync(hdev, ev->sync_handle); + + hci_dev_unlock(hdev); +} + static void hci_le_remote_feat_complete_evt(struct hci_dev *hdev, void *data, struct sk_buff *skb) { @@ -7213,6 +7231,11 @@ static const struct hci_le_ev { HCI_LE_EV(HCI_EV_LE_PA_SYNC_ESTABLISHED, hci_le_pa_sync_estabilished_evt, sizeof(struct hci_ev_le_pa_sync_established)), + /* [0x0f = HCI_EV_LE_PER_ADV_REPORT] */ + HCI_LE_EV_VL(HCI_EV_LE_PER_ADV_REPORT, + hci_le_per_adv_report_evt, + sizeof(struct hci_ev_le_per_adv_report), + HCI_MAX_EVENT_SIZE), /* [0x12 = HCI_EV_LE_EXT_ADV_SET_TERM] */ HCI_LE_EV(HCI_EV_LE_EXT_ADV_SET_TERM, hci_le_ext_adv_term_evt, sizeof(struct hci_evt_le_ext_adv_set_term)), diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index 2a2c37789e1f..16da946f5881 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -1446,7 +1446,8 @@ static int iso_sock_getsockopt(struct socket *sock, int level, int optname, break; case BT_ISO_BASE: - if (sk->sk_state == BT_CONNECTED) { + if (sk->sk_state == BT_CONNECTED && + !bacmp(&iso_pi(sk)->dst, BDADDR_ANY)) { base_len = iso_pi(sk)->conn->hcon->le_per_adv_data_len; base = iso_pi(sk)->conn->hcon->le_per_adv_data; } else { @@ -1655,6 +1656,9 @@ static void iso_conn_ready(struct iso_conn *conn) bacpy(&iso_pi(sk)->dst, &hcon->dst); iso_pi(sk)->dst_type = hcon->dst_type; + iso_pi(sk)->sync_handle = iso_pi(parent)->sync_handle; + memcpy(iso_pi(sk)->base, iso_pi(parent)->base, iso_pi(parent)->base_len); + iso_pi(sk)->base_len = iso_pi(parent)->base_len; hci_conn_hold(hcon); iso_chan_add(conn, sk, parent); @@ -1692,12 +1696,20 @@ static bool iso_match_sync_handle(struct sock *sk, void *data) return le16_to_cpu(ev->sync_handle) == iso_pi(sk)->sync_handle; } +static bool iso_match_sync_handle_pa_report(struct sock *sk, void *data) +{ + struct hci_ev_le_per_adv_report *ev = data; + + return le16_to_cpu(ev->sync_handle) == iso_pi(sk)->sync_handle; +} + /* ----- ISO interface with lower layer (HCI) ----- */ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) { struct 
hci_ev_le_pa_sync_established *ev1; struct hci_evt_le_big_info_adv_report *ev2; + struct hci_ev_le_per_adv_report *ev3; struct sock *sk; int lm = 0; @@ -1713,6 +1725,9 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) * 2. HCI_EVT_LE_BIG_INFO_ADV_REPORT: When connect_ind is triggered by a * a BIG Info it attempts to check if there any listening socket with * the same sync_handle and if it does then attempt to create a sync. + * 3. HCI_EV_LE_PER_ADV_REPORT: When a PA report is received, it is stored + * in iso_pi(sk)->base so it can be passed up to user, in the case of a + * broadcast sink. */ ev1 = hci_recv_event_data(hdev, HCI_EV_LE_PA_SYNC_ESTABLISHED); if (ev1) { @@ -1752,6 +1767,17 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) } } } + } + + ev3 = hci_recv_event_data(hdev, HCI_EV_LE_PER_ADV_REPORT); + if (ev3) { + sk = iso_get_sock_listen(&hdev->bdaddr, bdaddr, + iso_match_sync_handle_pa_report, ev3); + + if (sk) { + memcpy(iso_pi(sk)->base, ev3->data, ev3->length); + iso_pi(sk)->base_len = ev3->length; + } } else { sk = iso_get_sock_listen(&hdev->bdaddr, BDADDR_ANY, NULL, NULL); } -- cgit v1.2.3 From 253f3399f4c09ce6f4e67350f839be0361b4d5ff Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Tue, 22 Aug 2023 12:02:03 -0700 Subject: Bluetooth: HCI: Introduce HCI_QUIRK_BROKEN_LE_CODED This introduces HCI_QUIRK_BROKEN_LE_CODED which is used to indicate that LE Coded PHY shall not be used, it is then set for some Intel models that claim to support it but when used causes many problems. Cc: stable@vger.kernel.org # 6.4.y+ Link: https://github.com/bluez/bluez/issues/577 Link: https://github.com/bluez/bluez/issues/582 Link: https://lore.kernel.org/linux-bluetooth/CABBYNZKco-v7wkjHHexxQbgwwSz-S=GZ=dZKbRE1qxT1h4fFbQ@mail.gmail.com/T/# Fixes: 288c90224eec ("Bluetooth: Enable all supported LE PHY by default") Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btintel.c | 6 ++++++ include/net/bluetooth/hci.h | 10 ++++++++++ include/net/bluetooth/hci_core.h | 4 +++- net/bluetooth/hci_sync.c | 5 ++++- 4 files changed, 23 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/drivers/bluetooth/btintel.c b/drivers/bluetooth/btintel.c index 9b239ce96fa4..2462796a512a 100644 --- a/drivers/bluetooth/btintel.c +++ b/drivers/bluetooth/btintel.c @@ -2787,6 +2787,9 @@ static int btintel_setup_combined(struct hci_dev *hdev) set_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, &hdev->quirks); + /* These variants don't seem to support LE Coded PHY */ + set_bit(HCI_QUIRK_BROKEN_LE_CODED, &hdev->quirks); + /* Setup MSFT Extension support */ btintel_set_msft_opcode(hdev, ver.hw_variant); @@ -2858,6 +2861,9 @@ static int btintel_setup_combined(struct hci_dev *hdev) */ set_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, &hdev->quirks); + /* These variants don't seem to support LE Coded PHY */ + set_bit(HCI_QUIRK_BROKEN_LE_CODED, &hdev->quirks); + /* Set Valid LE States quirk */ set_bit(HCI_QUIRK_VALID_LE_STATES, &hdev->quirks); diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index c58425d4c592..87d92accc26e 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -319,6 +319,16 @@ enum { * This quirk must be set before hci_register_dev is called. */ HCI_QUIRK_USE_MSFT_EXT_ADDRESS_FILTER, + + /* + * When this quirk is set, LE Coded PHY shall not be used. This is + * required for some Intel controllers which erroneously claim to + * support it but it causes problems with extended scanning. 
+ * + * This quirk can be set before hci_register_dev is called or + * during the hdev->setup vendor callback. + */ + HCI_QUIRK_BROKEN_LE_CODED, }; /* HCI device flags */ diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 6e2988b11f99..e6359f7346f1 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1817,7 +1817,9 @@ void hci_conn_del_sysfs(struct hci_conn *conn); #define scan_2m(dev) (((dev)->le_tx_def_phys & HCI_LE_SET_PHY_2M) || \ ((dev)->le_rx_def_phys & HCI_LE_SET_PHY_2M)) -#define le_coded_capable(dev) (((dev)->le_features[1] & HCI_LE_PHY_CODED)) +#define le_coded_capable(dev) (((dev)->le_features[1] & HCI_LE_PHY_CODED) && \ + !test_bit(HCI_QUIRK_BROKEN_LE_CODED, \ + &(dev)->quirks)) #define scan_coded(dev) (((dev)->le_tx_def_phys & HCI_LE_SET_PHY_CODED) || \ ((dev)->le_rx_def_phys & HCI_LE_SET_PHY_CODED)) diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 0cb780817198..9b93653c6197 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -4668,7 +4668,10 @@ static const struct { "advertised, but not supported."), HCI_QUIRK_BROKEN(SET_RPA_TIMEOUT, "HCI LE Set Random Private Address Timeout command is " - "advertised, but not supported.") + "advertised, but not supported."), + HCI_QUIRK_BROKEN(LE_CODED, + "HCI LE Coded PHY feature bit is set, " + "but its usage is not supported.") }; /* This function handles hdev setup stage: -- cgit v1.2.3 From b3d26c5702c7d6c45456326e56d2ccf3f103e60f Mon Sep 17 00:00:00 2001 From: Budimir Markovic Date: Thu, 24 Aug 2023 01:49:05 -0700 Subject: net/sched: sch_hfsc: Ensure inner classes have fsc curve HFSC assumes that inner classes have an fsc curve, but it is currently possible for classes without an fsc curve to become parents. This leads to bugs including a use-after-free. Don't allow non-root classes without HFSC_FSC to become parents. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: Budimir Markovic Signed-off-by: Budimir Markovic Acked-by: Jamal Hadi Salim Link: https://lore.kernel.org/r/20230824084905.422-1-markovicbudimir@gmail.com Signed-off-by: Jakub Kicinski --- net/sched/sch_hfsc.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 70b0c5873d32..61d52594ff6d 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1012,6 +1012,10 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (parent == NULL) return -ENOENT; } + if (!(parent->cl_flags & HFSC_FSC) && parent != &q->root) { + NL_SET_ERR_MSG(extack, "Invalid parent - parent class must have FSC"); + return -EINVAL; + } if (classid == 0 || TC_H_MAJ(classid ^ sch->handle) != 0) return -EINVAL; -- cgit v1.2.3 From 62b6442c58dc17b168f69b37b398a9cab7cd90c9 Mon Sep 17 00:00:00 2001 From: Dima Chumak Date: Thu, 24 Aug 2023 23:28:29 -0700 Subject: devlink: Expose port function commands to control IPsec crypto offloads Expose port function commands to enable / disable IPsec crypto offloads, this is used to control the port IPsec capabilities. When IPsec crypto is disabled for a function of the port (default), function cannot offload any IPsec crypto operations (Encrypt/Decrypt and XFRM state offloading). When enabled, IPsec crypto operations can be offloaded by the function of the port. 
Example of a PCI VF port which supports IPsec crypto offloads: $ devlink port show pci/0000:06:00.0/1 pci/0000:06:00.0/1: type eth netdev enp6s0pf0vf0 flavour pcivf pfnum 0 vfnum 0 function: hw_addr 00:00:00:00:00:00 roce enable ipsec_crypto disable $ devlink port function set pci/0000:06:00.0/1 ipsec_crypto enable $ devlink port show pci/0000:06:00.0/1 pci/0000:06:00.0/1: type eth netdev enp6s0pf0vf0 flavour pcivf pfnum 0 vfnum 0 function: hw_addr 00:00:00:00:00:00 roce enable ipsec_crypto enable Signed-off-by: Dima Chumak Signed-off-by: Leon Romanovsky Signed-off-by: Saeed Mahameed Reviewed-by: Jiri Pirko Link: https://lore.kernel.org/r/20230825062836.103744-2-saeed@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/networking/devlink/devlink-port.rst | 27 ++++++++++++ include/net/devlink.h | 15 +++++++ include/uapi/linux/devlink.h | 2 + net/devlink/leftover.c | 52 +++++++++++++++++++++++ 4 files changed, 96 insertions(+) (limited to 'net') diff --git a/Documentation/networking/devlink/devlink-port.rst b/Documentation/networking/devlink/devlink-port.rst index 3da590953ce8..6983b11559cb 100644 --- a/Documentation/networking/devlink/devlink-port.rst +++ b/Documentation/networking/devlink/devlink-port.rst @@ -128,6 +128,9 @@ Users may also set the RoCE capability of the function using Users may also set the function as migratable using 'devlink port function set migratable' command. +Users may also set the IPsec crypto capability of the function using +`devlink port function set ipsec_crypto` command. + Function attributes =================== @@ -240,6 +243,30 @@ Attach VF to the VM. Start the VM. Perform live migration. +IPsec crypto capability setup +----------------------------- +When user enables IPsec crypto capability for a VF, user application can offload +XFRM state crypto operation (Encrypt/Decrypt) to this VF. + +When IPsec crypto capability is disabled (default) for a VF, the XFRM state is +processed in software by the kernel. + +- Get IPsec crypto capability of the VF device:: + + $ devlink port show pci/0000:06:00.0/2 + pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1 + function: + hw_addr 00:00:00:00:00:00 ipsec_crypto disabled + +- Set IPsec crypto capability of the VF device:: + + $ devlink port function set pci/0000:06:00.0/2 ipsec_crypto enable + + $ devlink port show pci/0000:06:00.0/2 + pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1 + function: + hw_addr 00:00:00:00:00:00 ipsec_crypto enabled + Subfunction ============ diff --git a/include/net/devlink.h b/include/net/devlink.h index f7fec0791acc..1cf07a820a0e 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1583,6 +1583,15 @@ void devlink_free(struct devlink *devlink); * Should be used by device drivers set * the admin state of a function managed * by the devlink port. + * @port_fn_ipsec_crypto_get: Callback used to get port function's ipsec_crypto + * capability. Should be used by device drivers + * to report the current state of ipsec_crypto + * capability of a function managed by the devlink + * port. + * @port_fn_ipsec_crypto_set: Callback used to set port function's ipsec_crypto + * capability. Should be used by device drivers to + * enable/disable ipsec_crypto capability of a + * function managed by the devlink port. * * Note: Driver should return -EOPNOTSUPP if it doesn't support * port function (@port_fn_*) handling for a particular port. 
@@ -1620,6 +1629,12 @@ struct devlink_port_ops { int (*port_fn_state_set)(struct devlink_port *port, enum devlink_port_fn_state state, struct netlink_ext_ack *extack); + int (*port_fn_ipsec_crypto_get)(struct devlink_port *devlink_port, + bool *is_enable, + struct netlink_ext_ack *extack); + int (*port_fn_ipsec_crypto_set)(struct devlink_port *devlink_port, + bool enable, + struct netlink_ext_ack *extack); }; void devlink_port_init(struct devlink *devlink, diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 3782d4219ac9..f9ae9a058ad2 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -661,6 +661,7 @@ enum devlink_resource_unit { enum devlink_port_fn_attr_cap { DEVLINK_PORT_FN_ATTR_CAP_ROCE_BIT, DEVLINK_PORT_FN_ATTR_CAP_MIGRATABLE_BIT, + DEVLINK_PORT_FN_ATTR_CAP_IPSEC_CRYPTO_BIT, /* Add new caps above */ __DEVLINK_PORT_FN_ATTR_CAPS_MAX, @@ -669,6 +670,7 @@ enum devlink_port_fn_attr_cap { #define DEVLINK_PORT_FN_CAP_ROCE _BITUL(DEVLINK_PORT_FN_ATTR_CAP_ROCE_BIT) #define DEVLINK_PORT_FN_CAP_MIGRATABLE \ _BITUL(DEVLINK_PORT_FN_ATTR_CAP_MIGRATABLE_BIT) +#define DEVLINK_PORT_FN_CAP_IPSEC_CRYPTO _BITUL(DEVLINK_PORT_FN_ATTR_CAP_IPSEC_CRYPTO_BIT) enum devlink_port_function_attr { DEVLINK_PORT_FUNCTION_ATTR_UNSPEC, diff --git a/net/devlink/leftover.c b/net/devlink/leftover.c index e2cd13958cc2..fcc1a06cae48 100644 --- a/net/devlink/leftover.c +++ b/net/devlink/leftover.c @@ -492,6 +492,28 @@ static int devlink_port_fn_migratable_fill(struct devlink_port *devlink_port, return 0; } +static int devlink_port_fn_ipsec_crypto_fill(struct devlink_port *devlink_port, + struct nla_bitfield32 *caps, + struct netlink_ext_ack *extack) +{ + bool is_enable; + int err; + + if (!devlink_port->ops->port_fn_ipsec_crypto_get || + devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) + return 0; + + err = devlink_port->ops->port_fn_ipsec_crypto_get(devlink_port, &is_enable, extack); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + + devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_IPSEC_CRYPTO, is_enable); + return 0; +} + static int devlink_port_fn_caps_fill(struct devlink_port *devlink_port, struct sk_buff *msg, struct netlink_ext_ack *extack, @@ -508,6 +530,10 @@ static int devlink_port_fn_caps_fill(struct devlink_port *devlink_port, if (err) return err; + err = devlink_port_fn_ipsec_crypto_fill(devlink_port, &caps, extack); + if (err) + return err; + if (!caps.selector) return 0; err = nla_put_bitfield32(msg, DEVLINK_PORT_FN_ATTR_CAPS, caps.value, @@ -838,6 +864,13 @@ devlink_port_fn_roce_set(struct devlink_port *devlink_port, bool enable, extack); } +static int +devlink_port_fn_ipsec_crypto_set(struct devlink_port *devlink_port, bool enable, + struct netlink_ext_ack *extack) +{ + return devlink_port->ops->port_fn_ipsec_crypto_set(devlink_port, enable, extack); +} + static int devlink_port_fn_caps_set(struct devlink_port *devlink_port, const struct nlattr *attr, struct netlink_ext_ack *extack) @@ -862,6 +895,13 @@ static int devlink_port_fn_caps_set(struct devlink_port *devlink_port, if (err) return err; } + if (caps.selector & DEVLINK_PORT_FN_CAP_IPSEC_CRYPTO) { + err = devlink_port_fn_ipsec_crypto_set(devlink_port, caps_value & + DEVLINK_PORT_FN_CAP_IPSEC_CRYPTO, + extack); + if (err) + return err; + } return 0; } @@ -1226,6 +1266,18 @@ static int devlink_port_function_validate(struct devlink_port *devlink_port, return -EOPNOTSUPP; } } + if (caps.selector & DEVLINK_PORT_FN_CAP_IPSEC_CRYPTO) { + if 
(!ops->port_fn_ipsec_crypto_set) { + NL_SET_ERR_MSG_ATTR(extack, attr, + "Port doesn't support ipsec_crypto function attribute"); + return -EOPNOTSUPP; + } + if (devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) { + NL_SET_ERR_MSG_ATTR(extack, attr, + "ipsec_crypto function attribute supported for VFs only"); + return -EOPNOTSUPP; + } + } } return 0; } -- cgit v1.2.3 From 390a24cbc39626a8a38c6d877a59f758fe209f2d Mon Sep 17 00:00:00 2001 From: Dima Chumak Date: Thu, 24 Aug 2023 23:28:30 -0700 Subject: devlink: Expose port function commands to control IPsec packet offloads Expose port function commands to enable / disable IPsec packet offloads, this is used to control the port IPsec capabilities. When IPsec packet is disabled for a function of the port (default), function cannot offload IPsec packet operations (encapsulation and XFRM policy offload). When enabled, IPsec packet operations can be offloaded by the function of the port, which includes crypto operation (Encrypt/Decrypt), IPsec encapsulation and XFRM state and policy offload. Example of a PCI VF port which supports IPsec packet offloads: $ devlink port show pci/0000:06:00.0/1 pci/0000:06:00.0/1: type eth netdev enp6s0pf0vf0 flavour pcivf pfnum 0 vfnum 0 function: hw_addr 00:00:00:00:00:00 roce enable ipsec_packet disable $ devlink port function set pci/0000:06:00.0/1 ipsec_packet enable $ devlink port show pci/0000:06:00.0/1 pci/0000:06:00.0/1: type eth netdev enp6s0pf0vf0 flavour pcivf pfnum 0 vfnum 0 function: hw_addr 00:00:00:00:00:00 roce enable ipsec_packet enable Signed-off-by: Dima Chumak Signed-off-by: Leon Romanovsky Signed-off-by: Saeed Mahameed Reviewed-by: Jiri Pirko Link: https://lore.kernel.org/r/20230825062836.103744-3-saeed@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/networking/devlink/devlink-port.rst | 28 ++++++++++++ include/net/devlink.h | 15 +++++++ include/uapi/linux/devlink.h | 2 + net/devlink/leftover.c | 52 +++++++++++++++++++++++ 4 files changed, 97 insertions(+) (limited to 'net') diff --git a/Documentation/networking/devlink/devlink-port.rst b/Documentation/networking/devlink/devlink-port.rst index 6983b11559cb..f5adb910427a 100644 --- a/Documentation/networking/devlink/devlink-port.rst +++ b/Documentation/networking/devlink/devlink-port.rst @@ -131,6 +131,9 @@ Users may also set the function as migratable using Users may also set the IPsec crypto capability of the function using `devlink port function set ipsec_crypto` command. +Users may also set the IPsec packet capability of the function using +`devlink port function set ipsec_packet` command. + Function attributes =================== @@ -267,6 +270,31 @@ processed in software by the kernel. function: hw_addr 00:00:00:00:00:00 ipsec_crypto enabled +IPsec packet capability setup +----------------------------- +When user enables IPsec packet capability for a VF, user application can offload +XFRM state and policy crypto operation (Encrypt/Decrypt) to this VF, as well as +IPsec encapsulation. + +When IPsec packet capability is disabled (default) for a VF, the XFRM state and +policy is processed in software by the kernel. 
+ +- Get IPsec packet capability of the VF device:: + + $ devlink port show pci/0000:06:00.0/2 + pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1 + function: + hw_addr 00:00:00:00:00:00 ipsec_packet disabled + +- Set IPsec packet capability of the VF device:: + + $ devlink port function set pci/0000:06:00.0/2 ipsec_packet enable + + $ devlink port show pci/0000:06:00.0/2 + pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1 + function: + hw_addr 00:00:00:00:00:00 ipsec_packet enabled + Subfunction ============ diff --git a/include/net/devlink.h b/include/net/devlink.h index 1cf07a820a0e..29fd1b4ee654 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1592,6 +1592,15 @@ void devlink_free(struct devlink *devlink); * capability. Should be used by device drivers to * enable/disable ipsec_crypto capability of a * function managed by the devlink port. + * @port_fn_ipsec_packet_get: Callback used to get port function's ipsec_packet + * capability. Should be used by device drivers + * to report the current state of ipsec_packet + * capability of a function managed by the devlink + * port. + * @port_fn_ipsec_packet_set: Callback used to set port function's ipsec_packet + * capability. Should be used by device drivers to + * enable/disable ipsec_packet capability of a + * function managed by the devlink port. * * Note: Driver should return -EOPNOTSUPP if it doesn't support * port function (@port_fn_*) handling for a particular port. @@ -1635,6 +1644,12 @@ struct devlink_port_ops { int (*port_fn_ipsec_crypto_set)(struct devlink_port *devlink_port, bool enable, struct netlink_ext_ack *extack); + int (*port_fn_ipsec_packet_get)(struct devlink_port *devlink_port, + bool *is_enable, + struct netlink_ext_ack *extack); + int (*port_fn_ipsec_packet_set)(struct devlink_port *devlink_port, + bool enable, + struct netlink_ext_ack *extack); }; void devlink_port_init(struct devlink *devlink, diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index f9ae9a058ad2..03875e078be8 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -662,6 +662,7 @@ enum devlink_port_fn_attr_cap { DEVLINK_PORT_FN_ATTR_CAP_ROCE_BIT, DEVLINK_PORT_FN_ATTR_CAP_MIGRATABLE_BIT, DEVLINK_PORT_FN_ATTR_CAP_IPSEC_CRYPTO_BIT, + DEVLINK_PORT_FN_ATTR_CAP_IPSEC_PACKET_BIT, /* Add new caps above */ __DEVLINK_PORT_FN_ATTR_CAPS_MAX, @@ -671,6 +672,7 @@ enum devlink_port_fn_attr_cap { #define DEVLINK_PORT_FN_CAP_MIGRATABLE \ _BITUL(DEVLINK_PORT_FN_ATTR_CAP_MIGRATABLE_BIT) #define DEVLINK_PORT_FN_CAP_IPSEC_CRYPTO _BITUL(DEVLINK_PORT_FN_ATTR_CAP_IPSEC_CRYPTO_BIT) +#define DEVLINK_PORT_FN_CAP_IPSEC_PACKET _BITUL(DEVLINK_PORT_FN_ATTR_CAP_IPSEC_PACKET_BIT) enum devlink_port_function_attr { DEVLINK_PORT_FUNCTION_ATTR_UNSPEC, diff --git a/net/devlink/leftover.c b/net/devlink/leftover.c index fcc1a06cae48..149752bc9f2d 100644 --- a/net/devlink/leftover.c +++ b/net/devlink/leftover.c @@ -514,6 +514,28 @@ static int devlink_port_fn_ipsec_crypto_fill(struct devlink_port *devlink_port, return 0; } +static int devlink_port_fn_ipsec_packet_fill(struct devlink_port *devlink_port, + struct nla_bitfield32 *caps, + struct netlink_ext_ack *extack) +{ + bool is_enable; + int err; + + if (!devlink_port->ops->port_fn_ipsec_packet_get || + devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) + return 0; + + err = devlink_port->ops->port_fn_ipsec_packet_get(devlink_port, &is_enable, extack); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return 
err; + } + + devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_IPSEC_PACKET, is_enable); + return 0; +} + static int devlink_port_fn_caps_fill(struct devlink_port *devlink_port, struct sk_buff *msg, struct netlink_ext_ack *extack, @@ -534,6 +556,10 @@ static int devlink_port_fn_caps_fill(struct devlink_port *devlink_port, if (err) return err; + err = devlink_port_fn_ipsec_packet_fill(devlink_port, &caps, extack); + if (err) + return err; + if (!caps.selector) return 0; err = nla_put_bitfield32(msg, DEVLINK_PORT_FN_ATTR_CAPS, caps.value, @@ -871,6 +897,13 @@ devlink_port_fn_ipsec_crypto_set(struct devlink_port *devlink_port, bool enable, return devlink_port->ops->port_fn_ipsec_crypto_set(devlink_port, enable, extack); } +static int +devlink_port_fn_ipsec_packet_set(struct devlink_port *devlink_port, bool enable, + struct netlink_ext_ack *extack) +{ + return devlink_port->ops->port_fn_ipsec_packet_set(devlink_port, enable, extack); +} + static int devlink_port_fn_caps_set(struct devlink_port *devlink_port, const struct nlattr *attr, struct netlink_ext_ack *extack) @@ -902,6 +935,13 @@ static int devlink_port_fn_caps_set(struct devlink_port *devlink_port, if (err) return err; } + if (caps.selector & DEVLINK_PORT_FN_CAP_IPSEC_PACKET) { + err = devlink_port_fn_ipsec_packet_set(devlink_port, caps_value & + DEVLINK_PORT_FN_CAP_IPSEC_PACKET, + extack); + if (err) + return err; + } return 0; } @@ -1278,6 +1318,18 @@ static int devlink_port_function_validate(struct devlink_port *devlink_port, return -EOPNOTSUPP; } } + if (caps.selector & DEVLINK_PORT_FN_CAP_IPSEC_PACKET) { + if (!ops->port_fn_ipsec_packet_set) { + NL_SET_ERR_MSG_ATTR(extack, attr, + "Port doesn't support ipsec_packet function attribute"); + return -EOPNOTSUPP; + } + if (devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) { + NL_SET_ERR_MSG_ATTR(extack, attr, + "ipsec_packet function attribute supported for VFs only"); + return -EOPNOTSUPP; + } + } } return 0; } -- cgit v1.2.3 From fd0fc6fdd8896a195cb7b0210a5ee46774718fc8 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 25 Aug 2023 23:35:09 +0200 Subject: tls: move tls_cipher_size_desc to net/tls/tls.h It's only used in net/tls/*, no need to bloat include/net/tls.h. Signed-off-by: Sabrina Dubroca Link: https://lore.kernel.org/r/dd9fad80415e5b3575b41f56b331871038362eab.1692977948.git.sd@queasysnail.net Signed-off-by: Jakub Kicinski --- include/net/tls.h | 10 ---------- net/tls/tls.h | 10 ++++++++++ 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/net/tls.h b/include/net/tls.h index 06fca9160346..a2b44578dcb7 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -51,16 +51,6 @@ struct tls_rec; -struct tls_cipher_size_desc { - unsigned int iv; - unsigned int key; - unsigned int salt; - unsigned int tag; - unsigned int rec_seq; -}; - -extern const struct tls_cipher_size_desc tls_cipher_size_desc[]; - /* Maximum data size carried in a TLS record */ #define TLS_MAX_PAYLOAD_SIZE ((size_t)1 << 14) diff --git a/net/tls/tls.h b/net/tls/tls.h index 164d6a955e26..7aae92972e00 100644 --- a/net/tls/tls.h +++ b/net/tls/tls.h @@ -51,6 +51,16 @@ #define TLS_DEC_STATS(net, field) \ SNMP_DEC_STATS((net)->mib.tls_statistics, field) +struct tls_cipher_size_desc { + unsigned int iv; + unsigned int key; + unsigned int salt; + unsigned int tag; + unsigned int rec_seq; +}; + +extern const struct tls_cipher_size_desc tls_cipher_size_desc[]; + /* TLS records are maintained in 'struct tls_rec'. 
It stores the memory pages * allocated or mapped for each TLS record. After encryption, the records are * stores in a linked list. -- cgit v1.2.3 From 200e23165109a173ffde3310dffa5ef5e502d97f Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 25 Aug 2023 23:35:10 +0200 Subject: tls: add TLS_CIPHER_ARIA_GCM_* to tls_cipher_size_desc Signed-off-by: Sabrina Dubroca Link: https://lore.kernel.org/r/b2e0fb79e6d0a4478be9bf33781dc9c9281c9d56.1692977948.git.sd@queasysnail.net Signed-off-by: Jakub Kicinski --- net/tls/tls_main.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index f550c84f3408..9843c2af994f 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -73,6 +73,8 @@ const struct tls_cipher_size_desc tls_cipher_size_desc[] = { CIPHER_SIZE_DESC(TLS_CIPHER_CHACHA20_POLY1305), CIPHER_SIZE_DESC(TLS_CIPHER_SM4_GCM), CIPHER_SIZE_DESC(TLS_CIPHER_SM4_CCM), + CIPHER_SIZE_DESC(TLS_CIPHER_ARIA_GCM_128), + CIPHER_SIZE_DESC(TLS_CIPHER_ARIA_GCM_256), }; static const struct proto *saved_tcpv6_prot; -- cgit v1.2.3 From 037303d6760751fdb95ba62cf448ecbc1ac29c98 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 25 Aug 2023 23:35:11 +0200 Subject: tls: reduce size of tls_cipher_size_desc tls_cipher_size_desc indexes ciphers by their type, but we're not using indices 0..50 of the array. Each struct tls_cipher_size_desc is 20B, so that's a lot of unused memory. We can reindex the array starting at the lowest used cipher_type. Introduce the get_cipher_size_desc helper to find the right item and avoid out-of-bounds accesses, and make tls_cipher_size_desc's size explicit so that gcc reminds us to update TLS_CIPHER_MIN/MAX when we add a new cipher. Signed-off-by: Sabrina Dubroca Link: https://lore.kernel.org/r/5e054e370e240247a5d37881a1cd93a67c15f4ca.1692977948.git.sd@queasysnail.net Signed-off-by: Jakub Kicinski --- net/tls/tls.h | 13 ++++++++++++- net/tls/tls_device.c | 4 ++-- net/tls/tls_device_fallback.c | 8 ++++---- net/tls/tls_main.c | 4 ++-- 4 files changed, 20 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/tls/tls.h b/net/tls/tls.h index 7aae92972e00..ea799ef77bf8 100644 --- a/net/tls/tls.h +++ b/net/tls/tls.h @@ -59,7 +59,18 @@ struct tls_cipher_size_desc { unsigned int rec_seq; }; -extern const struct tls_cipher_size_desc tls_cipher_size_desc[]; +#define TLS_CIPHER_MIN TLS_CIPHER_AES_GCM_128 +#define TLS_CIPHER_MAX TLS_CIPHER_ARIA_GCM_256 +extern const struct tls_cipher_size_desc tls_cipher_size_desc[TLS_CIPHER_MAX + 1 - TLS_CIPHER_MIN]; + +static inline const struct tls_cipher_size_desc *get_cipher_size_desc(u16 cipher_type) +{ + if (cipher_type < TLS_CIPHER_MIN || cipher_type > TLS_CIPHER_MAX) + return NULL; + + return &tls_cipher_size_desc[cipher_type - TLS_CIPHER_MIN]; +} + /* TLS records are maintained in 'struct tls_rec'. It stores the memory pages * allocated or mapped for each TLS record. 
After encryption, the records are diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 2392d06845aa..9bc42041c2ce 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -898,7 +898,7 @@ tls_device_reencrypt(struct sock *sk, struct tls_context *tls_ctx) default: return -EINVAL; } - cipher_sz = &tls_cipher_size_desc[tls_ctx->crypto_recv.info.cipher_type]; + cipher_sz = get_cipher_size_desc(tls_ctx->crypto_recv.info.cipher_type); rxm = strp_msg(tls_strp_msg(sw_ctx)); orig_buf = kmalloc(rxm->full_len + TLS_HEADER_SIZE + cipher_sz->iv, @@ -1094,7 +1094,7 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) rc = -EINVAL; goto release_netdev; } - cipher_sz = &tls_cipher_size_desc[crypto_info->cipher_type]; + cipher_sz = get_cipher_size_desc(crypto_info->cipher_type); /* Sanity-check the rec_seq_size for stack allocations */ if (cipher_sz->rec_seq > TLS_MAX_REC_SEQ_SIZE) { diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c index b28c5e296dfd..dd21fa4961b6 100644 --- a/net/tls/tls_device_fallback.c +++ b/net/tls/tls_device_fallback.c @@ -69,7 +69,7 @@ static int tls_enc_record(struct aead_request *aead_req, default: return -EINVAL; } - cipher_sz = &tls_cipher_size_desc[prot->cipher_type]; + cipher_sz = get_cipher_size_desc(prot->cipher_type); buf_size = TLS_HEADER_SIZE + cipher_sz->iv; len = min_t(int, *in_len, buf_size); @@ -310,7 +310,7 @@ static void fill_sg_out(struct scatterlist sg_out[3], void *buf, void *dummy_buf) { const struct tls_cipher_size_desc *cipher_sz = - &tls_cipher_size_desc[tls_ctx->crypto_send.info.cipher_type]; + get_cipher_size_desc(tls_ctx->crypto_send.info.cipher_type); sg_set_buf(&sg_out[0], dummy_buf, sync_size); sg_set_buf(&sg_out[1], nskb->data + tcp_payload_offset, payload_len); @@ -348,7 +348,7 @@ static struct sk_buff *tls_enc_skb(struct tls_context *tls_ctx, default: goto free_req; } - cipher_sz = &tls_cipher_size_desc[tls_ctx->crypto_send.info.cipher_type]; + cipher_sz = get_cipher_size_desc(tls_ctx->crypto_send.info.cipher_type); buf_len = cipher_sz->salt + cipher_sz->iv + TLS_AAD_SPACE_SIZE + sync_size + cipher_sz->tag; buf = kmalloc(buf_len, GFP_ATOMIC); @@ -495,7 +495,7 @@ int tls_sw_fallback_init(struct sock *sk, rc = -EINVAL; goto free_aead; } - cipher_sz = &tls_cipher_size_desc[crypto_info->cipher_type]; + cipher_sz = get_cipher_size_desc(crypto_info->cipher_type); rc = crypto_aead_setkey(offload_ctx->aead_send, key, cipher_sz->key); if (rc) diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 9843c2af994f..1bf04636948d 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -58,7 +58,7 @@ enum { TLS_NUM_PROTS, }; -#define CIPHER_SIZE_DESC(cipher) [cipher] = { \ +#define CIPHER_SIZE_DESC(cipher) [cipher - TLS_CIPHER_MIN] = { \ .iv = cipher ## _IV_SIZE, \ .key = cipher ## _KEY_SIZE, \ .salt = cipher ## _SALT_SIZE, \ @@ -66,7 +66,7 @@ enum { .rec_seq = cipher ## _REC_SEQ_SIZE, \ } -const struct tls_cipher_size_desc tls_cipher_size_desc[] = { +const struct tls_cipher_size_desc tls_cipher_size_desc[TLS_CIPHER_MAX + 1 - TLS_CIPHER_MIN] = { CIPHER_SIZE_DESC(TLS_CIPHER_AES_GCM_128), CIPHER_SIZE_DESC(TLS_CIPHER_AES_GCM_256), CIPHER_SIZE_DESC(TLS_CIPHER_AES_CCM_128), -- cgit v1.2.3 From 8db44ab26bebe969851468bea6072d9a094b2ace Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 25 Aug 2023 23:35:12 +0200 Subject: tls: rename tls_cipher_size_desc to tls_cipher_desc We're going to add other fields to it to fully describe a cipher, so the "_size" name won't match the contents. 
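For orientation, a caller-side sketch after the rename (at this point the descriptor still carries only sizes; the extra fields arrive in the follow-up patches):

    const struct tls_cipher_desc *cipher_desc;

    cipher_desc = get_cipher_desc(crypto_info->cipher_type);
    if (!cipher_desc)
            return -EINVAL;

    /* Size-only contents for now: iv, key, salt, tag, rec_seq. */
    prot->prepend_size = TLS_HEADER_SIZE + cipher_desc->iv;
    prot->tag_size     = cipher_desc->tag;
    prot->iv_size      = cipher_desc->iv;
    prot->salt_size    = cipher_desc->salt;
    prot->rec_seq_size = cipher_desc->rec_seq;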
Signed-off-by: Sabrina Dubroca Link: https://lore.kernel.org/r/76ca6c7686bd6d1534dfa188fb0f1f6fabebc791.1692977948.git.sd@queasysnail.net Signed-off-by: Jakub Kicinski --- net/tls/tls.h | 8 ++++---- net/tls/tls_device.c | 34 +++++++++++++++++----------------- net/tls/tls_device_fallback.c | 42 +++++++++++++++++++++--------------------- net/tls/tls_main.c | 20 ++++++++++---------- 4 files changed, 52 insertions(+), 52 deletions(-) (limited to 'net') diff --git a/net/tls/tls.h b/net/tls/tls.h index ea799ef77bf8..d4b56ca9d267 100644 --- a/net/tls/tls.h +++ b/net/tls/tls.h @@ -51,7 +51,7 @@ #define TLS_DEC_STATS(net, field) \ SNMP_DEC_STATS((net)->mib.tls_statistics, field) -struct tls_cipher_size_desc { +struct tls_cipher_desc { unsigned int iv; unsigned int key; unsigned int salt; @@ -61,14 +61,14 @@ struct tls_cipher_size_desc { #define TLS_CIPHER_MIN TLS_CIPHER_AES_GCM_128 #define TLS_CIPHER_MAX TLS_CIPHER_ARIA_GCM_256 -extern const struct tls_cipher_size_desc tls_cipher_size_desc[TLS_CIPHER_MAX + 1 - TLS_CIPHER_MIN]; +extern const struct tls_cipher_desc tls_cipher_desc[TLS_CIPHER_MAX + 1 - TLS_CIPHER_MIN]; -static inline const struct tls_cipher_size_desc *get_cipher_size_desc(u16 cipher_type) +static inline const struct tls_cipher_desc *get_cipher_desc(u16 cipher_type) { if (cipher_type < TLS_CIPHER_MIN || cipher_type > TLS_CIPHER_MAX) return NULL; - return &tls_cipher_size_desc[cipher_type - TLS_CIPHER_MIN]; + return &tls_cipher_desc[cipher_type - TLS_CIPHER_MIN]; } diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 9bc42041c2ce..98885d872d4c 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -884,7 +884,7 @@ static int tls_device_reencrypt(struct sock *sk, struct tls_context *tls_ctx) { struct tls_sw_context_rx *sw_ctx = tls_sw_ctx_rx(tls_ctx); - const struct tls_cipher_size_desc *cipher_sz; + const struct tls_cipher_desc *cipher_desc; int err, offset, copy, data_len, pos; struct sk_buff *skb, *skb_iter; struct scatterlist sg[1]; @@ -898,10 +898,10 @@ tls_device_reencrypt(struct sock *sk, struct tls_context *tls_ctx) default: return -EINVAL; } - cipher_sz = get_cipher_size_desc(tls_ctx->crypto_recv.info.cipher_type); + cipher_desc = get_cipher_desc(tls_ctx->crypto_recv.info.cipher_type); rxm = strp_msg(tls_strp_msg(sw_ctx)); - orig_buf = kmalloc(rxm->full_len + TLS_HEADER_SIZE + cipher_sz->iv, + orig_buf = kmalloc(rxm->full_len + TLS_HEADER_SIZE + cipher_desc->iv, sk->sk_allocation); if (!orig_buf) return -ENOMEM; @@ -917,8 +917,8 @@ tls_device_reencrypt(struct sock *sk, struct tls_context *tls_ctx) sg_init_table(sg, 1); sg_set_buf(&sg[0], buf, - rxm->full_len + TLS_HEADER_SIZE + cipher_sz->iv); - err = skb_copy_bits(skb, offset, buf, TLS_HEADER_SIZE + cipher_sz->iv); + rxm->full_len + TLS_HEADER_SIZE + cipher_desc->iv); + err = skb_copy_bits(skb, offset, buf, TLS_HEADER_SIZE + cipher_desc->iv); if (err) goto free_buf; @@ -929,7 +929,7 @@ tls_device_reencrypt(struct sock *sk, struct tls_context *tls_ctx) else err = 0; - data_len = rxm->full_len - cipher_sz->tag; + data_len = rxm->full_len - cipher_desc->tag; if (skb_pagelen(skb) > offset) { copy = min_t(int, skb_pagelen(skb) - offset, data_len); @@ -1046,7 +1046,7 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_prot_info *prot = &tls_ctx->prot_info; - const struct tls_cipher_size_desc *cipher_sz; + const struct tls_cipher_desc *cipher_desc; struct tls_record_info *start_marker_record; struct tls_offload_context_tx *offload_ctx; 
struct tls_crypto_info *crypto_info; @@ -1094,31 +1094,31 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) rc = -EINVAL; goto release_netdev; } - cipher_sz = get_cipher_size_desc(crypto_info->cipher_type); + cipher_desc = get_cipher_desc(crypto_info->cipher_type); /* Sanity-check the rec_seq_size for stack allocations */ - if (cipher_sz->rec_seq > TLS_MAX_REC_SEQ_SIZE) { + if (cipher_desc->rec_seq > TLS_MAX_REC_SEQ_SIZE) { rc = -EINVAL; goto release_netdev; } prot->version = crypto_info->version; prot->cipher_type = crypto_info->cipher_type; - prot->prepend_size = TLS_HEADER_SIZE + cipher_sz->iv; - prot->tag_size = cipher_sz->tag; + prot->prepend_size = TLS_HEADER_SIZE + cipher_desc->iv; + prot->tag_size = cipher_desc->tag; prot->overhead_size = prot->prepend_size + prot->tag_size; - prot->iv_size = cipher_sz->iv; - prot->salt_size = cipher_sz->salt; - ctx->tx.iv = kmalloc(cipher_sz->iv + cipher_sz->salt, GFP_KERNEL); + prot->iv_size = cipher_desc->iv; + prot->salt_size = cipher_desc->salt; + ctx->tx.iv = kmalloc(cipher_desc->iv + cipher_desc->salt, GFP_KERNEL); if (!ctx->tx.iv) { rc = -ENOMEM; goto release_netdev; } - memcpy(ctx->tx.iv + cipher_sz->salt, iv, cipher_sz->iv); + memcpy(ctx->tx.iv + cipher_desc->salt, iv, cipher_desc->iv); - prot->rec_seq_size = cipher_sz->rec_seq; - ctx->tx.rec_seq = kmemdup(rec_seq, cipher_sz->rec_seq, GFP_KERNEL); + prot->rec_seq_size = cipher_desc->rec_seq; + ctx->tx.rec_seq = kmemdup(rec_seq, cipher_desc->rec_seq, GFP_KERNEL); if (!ctx->tx.rec_seq) { rc = -ENOMEM; goto free_iv; diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c index dd21fa4961b6..cb224fb2a394 100644 --- a/net/tls/tls_device_fallback.c +++ b/net/tls/tls_device_fallback.c @@ -55,7 +55,7 @@ static int tls_enc_record(struct aead_request *aead_req, struct tls_prot_info *prot) { unsigned char buf[TLS_HEADER_SIZE + MAX_IV_SIZE]; - const struct tls_cipher_size_desc *cipher_sz; + const struct tls_cipher_desc *cipher_desc; struct scatterlist sg_in[3]; struct scatterlist sg_out[3]; unsigned int buf_size; @@ -69,9 +69,9 @@ static int tls_enc_record(struct aead_request *aead_req, default: return -EINVAL; } - cipher_sz = get_cipher_size_desc(prot->cipher_type); + cipher_desc = get_cipher_desc(prot->cipher_type); - buf_size = TLS_HEADER_SIZE + cipher_sz->iv; + buf_size = TLS_HEADER_SIZE + cipher_desc->iv; len = min_t(int, *in_len, buf_size); scatterwalk_copychunks(buf, in, len, 0); @@ -85,11 +85,11 @@ static int tls_enc_record(struct aead_request *aead_req, scatterwalk_pagedone(out, 1, 1); len = buf[4] | (buf[3] << 8); - len -= cipher_sz->iv; + len -= cipher_desc->iv; - tls_make_aad(aad, len - cipher_sz->tag, (char *)&rcd_sn, buf[0], prot); + tls_make_aad(aad, len - cipher_desc->tag, (char *)&rcd_sn, buf[0], prot); - memcpy(iv + cipher_sz->salt, buf + TLS_HEADER_SIZE, cipher_sz->iv); + memcpy(iv + cipher_desc->salt, buf + TLS_HEADER_SIZE, cipher_desc->iv); sg_init_table(sg_in, ARRAY_SIZE(sg_in)); sg_init_table(sg_out, ARRAY_SIZE(sg_out)); @@ -100,7 +100,7 @@ static int tls_enc_record(struct aead_request *aead_req, *in_len -= len; if (*in_len < 0) { - *in_len += cipher_sz->tag; + *in_len += cipher_desc->tag; /* the input buffer doesn't contain the entire record. * trim len accordingly. 
The resulting authentication tag * will contain garbage, but we don't care, so we won't @@ -121,7 +121,7 @@ static int tls_enc_record(struct aead_request *aead_req, scatterwalk_pagedone(out, 1, 1); } - len -= cipher_sz->tag; + len -= cipher_desc->tag; aead_request_set_crypt(aead_req, sg_in, sg_out, len, iv); rc = crypto_aead_encrypt(aead_req); @@ -309,14 +309,14 @@ static void fill_sg_out(struct scatterlist sg_out[3], void *buf, int sync_size, void *dummy_buf) { - const struct tls_cipher_size_desc *cipher_sz = - get_cipher_size_desc(tls_ctx->crypto_send.info.cipher_type); + const struct tls_cipher_desc *cipher_desc = + get_cipher_desc(tls_ctx->crypto_send.info.cipher_type); sg_set_buf(&sg_out[0], dummy_buf, sync_size); sg_set_buf(&sg_out[1], nskb->data + tcp_payload_offset, payload_len); /* Add room for authentication tag produced by crypto */ dummy_buf += sync_size; - sg_set_buf(&sg_out[2], dummy_buf, cipher_sz->tag); + sg_set_buf(&sg_out[2], dummy_buf, cipher_desc->tag); } static struct sk_buff *tls_enc_skb(struct tls_context *tls_ctx, @@ -328,7 +328,7 @@ static struct sk_buff *tls_enc_skb(struct tls_context *tls_ctx, struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx); int tcp_payload_offset = skb_tcp_all_headers(skb); int payload_len = skb->len - tcp_payload_offset; - const struct tls_cipher_size_desc *cipher_sz; + const struct tls_cipher_desc *cipher_desc; void *buf, *iv, *aad, *dummy_buf, *salt; struct aead_request *aead_req; struct sk_buff *nskb = NULL; @@ -348,16 +348,16 @@ static struct sk_buff *tls_enc_skb(struct tls_context *tls_ctx, default: goto free_req; } - cipher_sz = get_cipher_size_desc(tls_ctx->crypto_send.info.cipher_type); - buf_len = cipher_sz->salt + cipher_sz->iv + TLS_AAD_SPACE_SIZE + - sync_size + cipher_sz->tag; + cipher_desc = get_cipher_desc(tls_ctx->crypto_send.info.cipher_type); + buf_len = cipher_desc->salt + cipher_desc->iv + TLS_AAD_SPACE_SIZE + + sync_size + cipher_desc->tag; buf = kmalloc(buf_len, GFP_ATOMIC); if (!buf) goto free_req; iv = buf; - memcpy(iv, salt, cipher_sz->salt); - aad = buf + cipher_sz->salt + cipher_sz->iv; + memcpy(iv, salt, cipher_desc->salt); + aad = buf + cipher_desc->salt + cipher_desc->iv; dummy_buf = aad + TLS_AAD_SPACE_SIZE; nskb = alloc_skb(skb_headroom(skb) + skb->len, GFP_ATOMIC); @@ -471,7 +471,7 @@ int tls_sw_fallback_init(struct sock *sk, struct tls_offload_context_tx *offload_ctx, struct tls_crypto_info *crypto_info) { - const struct tls_cipher_size_desc *cipher_sz; + const struct tls_cipher_desc *cipher_desc; const u8 *key; int rc; @@ -495,13 +495,13 @@ int tls_sw_fallback_init(struct sock *sk, rc = -EINVAL; goto free_aead; } - cipher_sz = get_cipher_size_desc(crypto_info->cipher_type); + cipher_desc = get_cipher_desc(crypto_info->cipher_type); - rc = crypto_aead_setkey(offload_ctx->aead_send, key, cipher_sz->key); + rc = crypto_aead_setkey(offload_ctx->aead_send, key, cipher_desc->key); if (rc) goto free_aead; - rc = crypto_aead_setauthsize(offload_ctx->aead_send, cipher_sz->tag); + rc = crypto_aead_setauthsize(offload_ctx->aead_send, cipher_desc->tag); if (rc) goto free_aead; diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 1bf04636948d..217c2aa004dc 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -58,7 +58,7 @@ enum { TLS_NUM_PROTS, }; -#define CIPHER_SIZE_DESC(cipher) [cipher - TLS_CIPHER_MIN] = { \ +#define CIPHER_DESC(cipher) [cipher - TLS_CIPHER_MIN] = { \ .iv = cipher ## _IV_SIZE, \ .key = cipher ## _KEY_SIZE, \ .salt = cipher ## _SALT_SIZE, \ @@ -66,15 +66,15 @@ enum { 
.rec_seq = cipher ## _REC_SEQ_SIZE, \ } -const struct tls_cipher_size_desc tls_cipher_size_desc[TLS_CIPHER_MAX + 1 - TLS_CIPHER_MIN] = { - CIPHER_SIZE_DESC(TLS_CIPHER_AES_GCM_128), - CIPHER_SIZE_DESC(TLS_CIPHER_AES_GCM_256), - CIPHER_SIZE_DESC(TLS_CIPHER_AES_CCM_128), - CIPHER_SIZE_DESC(TLS_CIPHER_CHACHA20_POLY1305), - CIPHER_SIZE_DESC(TLS_CIPHER_SM4_GCM), - CIPHER_SIZE_DESC(TLS_CIPHER_SM4_CCM), - CIPHER_SIZE_DESC(TLS_CIPHER_ARIA_GCM_128), - CIPHER_SIZE_DESC(TLS_CIPHER_ARIA_GCM_256), +const struct tls_cipher_desc tls_cipher_desc[TLS_CIPHER_MAX + 1 - TLS_CIPHER_MIN] = { + CIPHER_DESC(TLS_CIPHER_AES_GCM_128), + CIPHER_DESC(TLS_CIPHER_AES_GCM_256), + CIPHER_DESC(TLS_CIPHER_AES_CCM_128), + CIPHER_DESC(TLS_CIPHER_CHACHA20_POLY1305), + CIPHER_DESC(TLS_CIPHER_SM4_GCM), + CIPHER_DESC(TLS_CIPHER_SM4_CCM), + CIPHER_DESC(TLS_CIPHER_ARIA_GCM_128), + CIPHER_DESC(TLS_CIPHER_ARIA_GCM_256), }; static const struct proto *saved_tcpv6_prot; -- cgit v1.2.3 From 176a3f50bc6a327c82c6b051b0bedd19917081a2 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 25 Aug 2023 23:35:13 +0200 Subject: tls: extend tls_cipher_desc to fully describe the ciphers - add nonce, usually equal to iv_size but not for chacha - add offsets into the crypto_info for each field - add algorithm name - add offloadable flag Also add helpers to access each field of a crypto_info struct described by a tls_cipher_desc. Signed-off-by: Sabrina Dubroca Link: https://lore.kernel.org/r/39d5f476d63c171097764e8d38f6f158b7c109ae.1692977948.git.sd@queasysnail.net Signed-off-by: Jakub Kicinski --- net/tls/tls.h | 32 ++++++++++++++++++++++++++++++++ net/tls/tls_main.c | 41 ++++++++++++++++++++++++++++++++--------- 2 files changed, 64 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/tls/tls.h b/net/tls/tls.h index d4b56ca9d267..28a8c0e80e3c 100644 --- a/net/tls/tls.h +++ b/net/tls/tls.h @@ -52,11 +52,19 @@ SNMP_DEC_STATS((net)->mib.tls_statistics, field) struct tls_cipher_desc { + unsigned int nonce; unsigned int iv; unsigned int key; unsigned int salt; unsigned int tag; unsigned int rec_seq; + unsigned int iv_offset; + unsigned int key_offset; + unsigned int salt_offset; + unsigned int rec_seq_offset; + char *cipher_name; + bool offloadable; + size_t crypto_info; }; #define TLS_CIPHER_MIN TLS_CIPHER_AES_GCM_128 @@ -71,6 +79,30 @@ static inline const struct tls_cipher_desc *get_cipher_desc(u16 cipher_type) return &tls_cipher_desc[cipher_type - TLS_CIPHER_MIN]; } +static inline char *crypto_info_iv(struct tls_crypto_info *crypto_info, + const struct tls_cipher_desc *cipher_desc) +{ + return (char *)crypto_info + cipher_desc->iv_offset; +} + +static inline char *crypto_info_key(struct tls_crypto_info *crypto_info, + const struct tls_cipher_desc *cipher_desc) +{ + return (char *)crypto_info + cipher_desc->key_offset; +} + +static inline char *crypto_info_salt(struct tls_crypto_info *crypto_info, + const struct tls_cipher_desc *cipher_desc) +{ + return (char *)crypto_info + cipher_desc->salt_offset; +} + +static inline char *crypto_info_rec_seq(struct tls_crypto_info *crypto_info, + const struct tls_cipher_desc *cipher_desc) +{ + return (char *)crypto_info + cipher_desc->rec_seq_offset; +} + /* TLS records are maintained in 'struct tls_rec'. It stores the memory pages * allocated or mapped for each TLS record. 
After encryption, the records are diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 217c2aa004dc..bbdf211cc898 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -58,23 +58,46 @@ enum { TLS_NUM_PROTS, }; -#define CIPHER_DESC(cipher) [cipher - TLS_CIPHER_MIN] = { \ +#define __CIPHER_DESC(ci) \ + .iv_offset = offsetof(struct ci, iv), \ + .key_offset = offsetof(struct ci, key), \ + .salt_offset = offsetof(struct ci, salt), \ + .rec_seq_offset = offsetof(struct ci, rec_seq), \ + .crypto_info = sizeof(struct ci) + +#define CIPHER_DESC(cipher,ci,algname,_offloadable) [cipher - TLS_CIPHER_MIN] = { \ + .nonce = cipher ## _IV_SIZE, \ .iv = cipher ## _IV_SIZE, \ .key = cipher ## _KEY_SIZE, \ .salt = cipher ## _SALT_SIZE, \ .tag = cipher ## _TAG_SIZE, \ .rec_seq = cipher ## _REC_SEQ_SIZE, \ + .cipher_name = algname, \ + .offloadable = _offloadable, \ + __CIPHER_DESC(ci), \ +} + +#define CIPHER_DESC_NONCE0(cipher,ci,algname,_offloadable) [cipher - TLS_CIPHER_MIN] = { \ + .nonce = 0, \ + .iv = cipher ## _IV_SIZE, \ + .key = cipher ## _KEY_SIZE, \ + .salt = cipher ## _SALT_SIZE, \ + .tag = cipher ## _TAG_SIZE, \ + .rec_seq = cipher ## _REC_SEQ_SIZE, \ + .cipher_name = algname, \ + .offloadable = _offloadable, \ + __CIPHER_DESC(ci), \ } const struct tls_cipher_desc tls_cipher_desc[TLS_CIPHER_MAX + 1 - TLS_CIPHER_MIN] = { - CIPHER_DESC(TLS_CIPHER_AES_GCM_128), - CIPHER_DESC(TLS_CIPHER_AES_GCM_256), - CIPHER_DESC(TLS_CIPHER_AES_CCM_128), - CIPHER_DESC(TLS_CIPHER_CHACHA20_POLY1305), - CIPHER_DESC(TLS_CIPHER_SM4_GCM), - CIPHER_DESC(TLS_CIPHER_SM4_CCM), - CIPHER_DESC(TLS_CIPHER_ARIA_GCM_128), - CIPHER_DESC(TLS_CIPHER_ARIA_GCM_256), + CIPHER_DESC(TLS_CIPHER_AES_GCM_128, tls12_crypto_info_aes_gcm_128, "gcm(aes)", true), + CIPHER_DESC(TLS_CIPHER_AES_GCM_256, tls12_crypto_info_aes_gcm_256, "gcm(aes)", true), + CIPHER_DESC(TLS_CIPHER_AES_CCM_128, tls12_crypto_info_aes_ccm_128, "ccm(aes)", false), + CIPHER_DESC_NONCE0(TLS_CIPHER_CHACHA20_POLY1305, tls12_crypto_info_chacha20_poly1305, "rfc7539(chacha20,poly1305)", false), + CIPHER_DESC(TLS_CIPHER_SM4_GCM, tls12_crypto_info_sm4_gcm, "gcm(sm4)", false), + CIPHER_DESC(TLS_CIPHER_SM4_CCM, tls12_crypto_info_sm4_ccm, "ccm(sm4)", false), + CIPHER_DESC(TLS_CIPHER_ARIA_GCM_128, tls12_crypto_info_aria_gcm_128, "gcm(aria)", false), + CIPHER_DESC(TLS_CIPHER_ARIA_GCM_256, tls12_crypto_info_aria_gcm_256, "gcm(aria)", false), }; static const struct proto *saved_tcpv6_prot; -- cgit v1.2.3 From 0d98cc02022d60004f78f6e7e6cc1bd39db80ef9 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 25 Aug 2023 23:35:14 +0200 Subject: tls: validate cipher descriptions at compile time Signed-off-by: Sabrina Dubroca Link: https://lore.kernel.org/r/b38fb8cf60e099e82ae9979c3c9c92421042417c.1692977948.git.sd@queasysnail.net Signed-off-by: Jakub Kicinski --- net/tls/tls_main.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'net') diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index bbdf211cc898..9d8629be7017 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -58,6 +58,15 @@ enum { TLS_NUM_PROTS, }; +#define CHECK_CIPHER_DESC(cipher,ci) \ + static_assert(cipher ## _IV_SIZE <= MAX_IV_SIZE); \ + static_assert(cipher ## _REC_SEQ_SIZE <= TLS_MAX_REC_SEQ_SIZE); \ + static_assert(cipher ## _TAG_SIZE == TLS_TAG_SIZE); \ + static_assert(sizeof_field(struct ci, iv) == cipher ## _IV_SIZE); \ + static_assert(sizeof_field(struct ci, key) == cipher ## _KEY_SIZE); \ + static_assert(sizeof_field(struct ci, salt) == cipher ## _SALT_SIZE); \ + 
static_assert(sizeof_field(struct ci, rec_seq) == cipher ## _REC_SEQ_SIZE); + #define __CIPHER_DESC(ci) \ .iv_offset = offsetof(struct ci, iv), \ .key_offset = offsetof(struct ci, key), \ @@ -100,6 +109,15 @@ const struct tls_cipher_desc tls_cipher_desc[TLS_CIPHER_MAX + 1 - TLS_CIPHER_MIN CIPHER_DESC(TLS_CIPHER_ARIA_GCM_256, tls12_crypto_info_aria_gcm_256, "gcm(aria)", false), }; +CHECK_CIPHER_DESC(TLS_CIPHER_AES_GCM_128, tls12_crypto_info_aes_gcm_128); +CHECK_CIPHER_DESC(TLS_CIPHER_AES_GCM_256, tls12_crypto_info_aes_gcm_256); +CHECK_CIPHER_DESC(TLS_CIPHER_AES_CCM_128, tls12_crypto_info_aes_ccm_128); +CHECK_CIPHER_DESC(TLS_CIPHER_CHACHA20_POLY1305, tls12_crypto_info_chacha20_poly1305); +CHECK_CIPHER_DESC(TLS_CIPHER_SM4_GCM, tls12_crypto_info_sm4_gcm); +CHECK_CIPHER_DESC(TLS_CIPHER_SM4_CCM, tls12_crypto_info_sm4_ccm); +CHECK_CIPHER_DESC(TLS_CIPHER_ARIA_GCM_128, tls12_crypto_info_aria_gcm_128); +CHECK_CIPHER_DESC(TLS_CIPHER_ARIA_GCM_256, tls12_crypto_info_aria_gcm_256); + static const struct proto *saved_tcpv6_prot; static DEFINE_MUTEX(tcpv6_prot_mutex); static const struct proto *saved_tcpv4_prot; -- cgit v1.2.3 From 3524dd4d5f1fb9e75fdfaf280822a34fa82059bd Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 25 Aug 2023 23:35:15 +0200 Subject: tls: expand use of tls_cipher_desc in tls_set_device_offload tls_set_device_offload is already getting iv and rec_seq sizes from tls_cipher_desc. We can now also check if the cipher_type coming from userspace is valid and can be offloaded. We can also remove the runtime check on rec_seq, since we validate it at compile time. Signed-off-by: Sabrina Dubroca Link: https://lore.kernel.org/r/8ab71b8eca856c7aaf981a45fe91ac649eb0e2e9.1692977948.git.sd@queasysnail.net Signed-off-by: Jakub Kicinski --- net/tls/tls_device.c | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 98885d872d4c..8c94c926606a 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -1079,29 +1079,15 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) goto release_netdev; } - switch (crypto_info->cipher_type) { - case TLS_CIPHER_AES_GCM_128: - iv = ((struct tls12_crypto_info_aes_gcm_128 *)crypto_info)->iv; - rec_seq = - ((struct tls12_crypto_info_aes_gcm_128 *)crypto_info)->rec_seq; - break; - case TLS_CIPHER_AES_GCM_256: - iv = ((struct tls12_crypto_info_aes_gcm_256 *)crypto_info)->iv; - rec_seq = - ((struct tls12_crypto_info_aes_gcm_256 *)crypto_info)->rec_seq; - break; - default: - rc = -EINVAL; - goto release_netdev; - } cipher_desc = get_cipher_desc(crypto_info->cipher_type); - - /* Sanity-check the rec_seq_size for stack allocations */ - if (cipher_desc->rec_seq > TLS_MAX_REC_SEQ_SIZE) { + if (!cipher_desc || !cipher_desc->offloadable) { rc = -EINVAL; goto release_netdev; } + iv = crypto_info_iv(crypto_info, cipher_desc); + rec_seq = crypto_info_rec_seq(crypto_info, cipher_desc); + prot->version = crypto_info->version; prot->cipher_type = crypto_info->cipher_type; prot->prepend_size = TLS_HEADER_SIZE + cipher_desc->iv; -- cgit v1.2.3 From d2322cf5ed59f084ac86d9339f7c3acccd177bfd Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 25 Aug 2023 23:35:16 +0200 Subject: tls: allocate the fallback aead after checking that the cipher is valid No need to allocate the aead if we're going to fail afterwards. 
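Condensed sketch of the resulting order in tls_sw_fallback_init (error labels trimmed; the full context is in the diff below):

    /* Validate the cipher and fetch the key first... */
    switch (crypto_info->cipher_type) {
    case TLS_CIPHER_AES_GCM_128:
            key = ((struct tls12_crypto_info_aes_gcm_128 *)crypto_info)->key;
            break;
    case TLS_CIPHER_AES_GCM_256:
            key = ((struct tls12_crypto_info_aes_gcm_256 *)crypto_info)->key;
            break;
    default:
            return -EINVAL;         /* nothing allocated yet, nothing to free */
    }
    cipher_desc = get_cipher_desc(crypto_info->cipher_type);

    /* ...and only then allocate the aead. */
    offload_ctx->aead_send = crypto_alloc_aead("gcm(aes)", 0, CRYPTO_ALG_ASYNC);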
Signed-off-by: Sabrina Dubroca Link: https://lore.kernel.org/r/335e32511ed55a0b30f3f81a78fa8f323b3bdf8f.1692977948.git.sd@queasysnail.net Signed-off-by: Jakub Kicinski --- net/tls/tls_device_fallback.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c index cb224fb2a394..4de9061f38f5 100644 --- a/net/tls/tls_device_fallback.c +++ b/net/tls/tls_device_fallback.c @@ -475,15 +475,6 @@ int tls_sw_fallback_init(struct sock *sk, const u8 *key; int rc; - offload_ctx->aead_send = - crypto_alloc_aead("gcm(aes)", 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(offload_ctx->aead_send)) { - rc = PTR_ERR(offload_ctx->aead_send); - pr_err_ratelimited("crypto_alloc_aead failed rc=%d\n", rc); - offload_ctx->aead_send = NULL; - goto err_out; - } - switch (crypto_info->cipher_type) { case TLS_CIPHER_AES_GCM_128: key = ((struct tls12_crypto_info_aes_gcm_128 *)crypto_info)->key; @@ -493,10 +484,19 @@ int tls_sw_fallback_init(struct sock *sk, break; default: rc = -EINVAL; - goto free_aead; + goto err_out; } cipher_desc = get_cipher_desc(crypto_info->cipher_type); + offload_ctx->aead_send = + crypto_alloc_aead("gcm(aes)", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(offload_ctx->aead_send)) { + rc = PTR_ERR(offload_ctx->aead_send); + pr_err_ratelimited("crypto_alloc_aead failed rc=%d\n", rc); + offload_ctx->aead_send = NULL; + goto err_out; + } + rc = crypto_aead_setkey(offload_ctx->aead_send, key, cipher_desc->key); if (rc) goto free_aead; -- cgit v1.2.3 From e907277aeb6caad1c4be96e20195f24531fcfefc Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 25 Aug 2023 23:35:17 +0200 Subject: tls: expand use of tls_cipher_desc in tls_sw_fallback_init tls_sw_fallback_init already gets the key and tag size from tls_cipher_desc. We can now also check that the cipher type is valid, and stop hard-coding the algorithm name passed to crypto_alloc_aead. 
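Condensed, the initialization then reads roughly as follows (error paths trimmed; the exact code is in the diff below):

    cipher_desc = get_cipher_desc(crypto_info->cipher_type);
    if (!cipher_desc || !cipher_desc->offloadable)
            return -EINVAL;

    offload_ctx->aead_send = crypto_alloc_aead(cipher_desc->cipher_name, 0,
                                               CRYPTO_ALG_ASYNC);
    if (IS_ERR(offload_ctx->aead_send))
            return PTR_ERR(offload_ctx->aead_send);

    rc = crypto_aead_setkey(offload_ctx->aead_send,
                            crypto_info_key(crypto_info, cipher_desc),
                            cipher_desc->key);
    if (!rc)
            rc = crypto_aead_setauthsize(offload_ctx->aead_send,
                                         cipher_desc->tag);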
Signed-off-by: Sabrina Dubroca Link: https://lore.kernel.org/r/c8c94b8fcafbfb558e09589c1f1ad48dbdf92f76.1692977948.git.sd@queasysnail.net Signed-off-by: Jakub Kicinski --- net/tls/tls_device_fallback.c | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c index 4de9061f38f5..1d743f310f4f 100644 --- a/net/tls/tls_device_fallback.c +++ b/net/tls/tls_device_fallback.c @@ -472,24 +472,14 @@ int tls_sw_fallback_init(struct sock *sk, struct tls_crypto_info *crypto_info) { const struct tls_cipher_desc *cipher_desc; - const u8 *key; int rc; - switch (crypto_info->cipher_type) { - case TLS_CIPHER_AES_GCM_128: - key = ((struct tls12_crypto_info_aes_gcm_128 *)crypto_info)->key; - break; - case TLS_CIPHER_AES_GCM_256: - key = ((struct tls12_crypto_info_aes_gcm_256 *)crypto_info)->key; - break; - default: - rc = -EINVAL; - goto err_out; - } cipher_desc = get_cipher_desc(crypto_info->cipher_type); + if (!cipher_desc || !cipher_desc->offloadable) + return -EINVAL; offload_ctx->aead_send = - crypto_alloc_aead("gcm(aes)", 0, CRYPTO_ALG_ASYNC); + crypto_alloc_aead(cipher_desc->cipher_name, 0, CRYPTO_ALG_ASYNC); if (IS_ERR(offload_ctx->aead_send)) { rc = PTR_ERR(offload_ctx->aead_send); pr_err_ratelimited("crypto_alloc_aead failed rc=%d\n", rc); @@ -497,7 +487,9 @@ int tls_sw_fallback_init(struct sock *sk, goto err_out; } - rc = crypto_aead_setkey(offload_ctx->aead_send, key, cipher_desc->key); + rc = crypto_aead_setkey(offload_ctx->aead_send, + crypto_info_key(crypto_info, cipher_desc), + cipher_desc->key); if (rc) goto free_aead; -- cgit v1.2.3 From 5f309ade49c7068b1149ecf825c4c16e56a3b865 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 25 Aug 2023 23:35:18 +0200 Subject: tls: get crypto_info size from tls_cipher_desc in do_tls_setsockopt_conf We can simplify do_tls_setsockopt_conf using tls_cipher_desc. Also use get_cipher_desc's result to check if the cipher_type coming from userspace is valid. 
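In short, the per-cipher optsize switch collapses to (sketch of the resulting code, matching the diff below):

    cipher_desc = get_cipher_desc(crypto_info->cipher_type);
    if (!cipher_desc) {
            rc = -EINVAL;
            goto err_crypto_info;
    }

    /* ARIA-GCM remains restricted to TLS 1.2. */
    switch (crypto_info->cipher_type) {
    case TLS_CIPHER_ARIA_GCM_128:
    case TLS_CIPHER_ARIA_GCM_256:
            if (crypto_info->version != TLS_1_2_VERSION) {
                    rc = -EINVAL;
                    goto err_crypto_info;
            }
            break;
    }

    if (optlen != cipher_desc->crypto_info) {
            rc = -EINVAL;
            goto err_crypto_info;
    }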
Signed-off-by: Sabrina Dubroca Link: https://lore.kernel.org/r/e97658eb4c6a5832f8ba20a06c4f36a77763c59e.1692977948.git.sd@queasysnail.net Signed-off-by: Jakub Kicinski --- net/tls/tls_main.c | 39 ++++++++------------------------------- 1 file changed, 8 insertions(+), 31 deletions(-) (limited to 'net') diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 9d8629be7017..73cae5dec392 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -739,7 +739,7 @@ static int do_tls_setsockopt_conf(struct sock *sk, sockptr_t optval, struct tls_crypto_info *crypto_info; struct tls_crypto_info *alt_crypto_info; struct tls_context *ctx = tls_get_ctx(sk); - size_t optsize; + const struct tls_cipher_desc *cipher_desc; int rc = 0; int conf; @@ -780,46 +780,23 @@ static int do_tls_setsockopt_conf(struct sock *sk, sockptr_t optval, } } - switch (crypto_info->cipher_type) { - case TLS_CIPHER_AES_GCM_128: - optsize = sizeof(struct tls12_crypto_info_aes_gcm_128); - break; - case TLS_CIPHER_AES_GCM_256: { - optsize = sizeof(struct tls12_crypto_info_aes_gcm_256); - break; + cipher_desc = get_cipher_desc(crypto_info->cipher_type); + if (!cipher_desc) { + rc = -EINVAL; + goto err_crypto_info; } - case TLS_CIPHER_AES_CCM_128: - optsize = sizeof(struct tls12_crypto_info_aes_ccm_128); - break; - case TLS_CIPHER_CHACHA20_POLY1305: - optsize = sizeof(struct tls12_crypto_info_chacha20_poly1305); - break; - case TLS_CIPHER_SM4_GCM: - optsize = sizeof(struct tls12_crypto_info_sm4_gcm); - break; - case TLS_CIPHER_SM4_CCM: - optsize = sizeof(struct tls12_crypto_info_sm4_ccm); - break; + + switch (crypto_info->cipher_type) { case TLS_CIPHER_ARIA_GCM_128: - if (crypto_info->version != TLS_1_2_VERSION) { - rc = -EINVAL; - goto err_crypto_info; - } - optsize = sizeof(struct tls12_crypto_info_aria_gcm_128); - break; case TLS_CIPHER_ARIA_GCM_256: if (crypto_info->version != TLS_1_2_VERSION) { rc = -EINVAL; goto err_crypto_info; } - optsize = sizeof(struct tls12_crypto_info_aria_gcm_256); break; - default: - rc = -EINVAL; - goto err_crypto_info; } - if (optlen != optsize) { + if (optlen != cipher_desc->crypto_info) { rc = -EINVAL; goto err_crypto_info; } -- cgit v1.2.3 From 077e05d135489e144d9e0d01454886bf613d32a4 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 25 Aug 2023 23:35:19 +0200 Subject: tls: use tls_cipher_desc to simplify do_tls_getsockopt_conf Every cipher uses the same code to update its crypto_info struct based on the values contained in the cctx, with only the struct type and size/offset changing. We can get those from tls_cipher_desc, and use a single pair of memcpy and final copy_to_user. 
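That is, the roughly 160 lines of per-cipher cases reduce to (sketch of the resulting code, as in the diff below):

    cipher_desc = get_cipher_desc(crypto_info->cipher_type);
    if (!cipher_desc || len != cipher_desc->crypto_info) {
            rc = -EINVAL;
            goto out;
    }

    memcpy(crypto_info_iv(crypto_info, cipher_desc),
           cctx->iv + cipher_desc->salt, cipher_desc->iv);
    memcpy(crypto_info_rec_seq(crypto_info, cipher_desc),
           cctx->rec_seq, cipher_desc->rec_seq);

    if (copy_to_user(optval, crypto_info, cipher_desc->crypto_info))
            rc = -EFAULT;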
Signed-off-by: Sabrina Dubroca Link: https://lore.kernel.org/r/c21a904b91e972bdbbf9d1c6d2731ccfa1eedf72.1692977948.git.sd@queasysnail.net Signed-off-by: Jakub Kicinski --- net/tls/tls_main.c | 174 ++++------------------------------------------------- 1 file changed, 11 insertions(+), 163 deletions(-) (limited to 'net') diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 73cae5dec392..02f583ff9239 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -435,6 +435,7 @@ static int do_tls_getsockopt_conf(struct sock *sk, char __user *optval, int __user *optlen, int tx) { int rc = 0; + const struct tls_cipher_desc *cipher_desc; struct tls_context *ctx = tls_get_ctx(sk); struct tls_crypto_info *crypto_info; struct cipher_context *cctx; @@ -473,172 +474,19 @@ static int do_tls_getsockopt_conf(struct sock *sk, char __user *optval, goto out; } - switch (crypto_info->cipher_type) { - case TLS_CIPHER_AES_GCM_128: { - struct tls12_crypto_info_aes_gcm_128 * - crypto_info_aes_gcm_128 = - container_of(crypto_info, - struct tls12_crypto_info_aes_gcm_128, - info); - - if (len != sizeof(*crypto_info_aes_gcm_128)) { - rc = -EINVAL; - goto out; - } - memcpy(crypto_info_aes_gcm_128->iv, - cctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, - TLS_CIPHER_AES_GCM_128_IV_SIZE); - memcpy(crypto_info_aes_gcm_128->rec_seq, cctx->rec_seq, - TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE); - if (copy_to_user(optval, - crypto_info_aes_gcm_128, - sizeof(*crypto_info_aes_gcm_128))) - rc = -EFAULT; - break; - } - case TLS_CIPHER_AES_GCM_256: { - struct tls12_crypto_info_aes_gcm_256 * - crypto_info_aes_gcm_256 = - container_of(crypto_info, - struct tls12_crypto_info_aes_gcm_256, - info); - - if (len != sizeof(*crypto_info_aes_gcm_256)) { - rc = -EINVAL; - goto out; - } - memcpy(crypto_info_aes_gcm_256->iv, - cctx->iv + TLS_CIPHER_AES_GCM_256_SALT_SIZE, - TLS_CIPHER_AES_GCM_256_IV_SIZE); - memcpy(crypto_info_aes_gcm_256->rec_seq, cctx->rec_seq, - TLS_CIPHER_AES_GCM_256_REC_SEQ_SIZE); - if (copy_to_user(optval, - crypto_info_aes_gcm_256, - sizeof(*crypto_info_aes_gcm_256))) - rc = -EFAULT; - break; - } - case TLS_CIPHER_AES_CCM_128: { - struct tls12_crypto_info_aes_ccm_128 *aes_ccm_128 = - container_of(crypto_info, - struct tls12_crypto_info_aes_ccm_128, info); - - if (len != sizeof(*aes_ccm_128)) { - rc = -EINVAL; - goto out; - } - memcpy(aes_ccm_128->iv, - cctx->iv + TLS_CIPHER_AES_CCM_128_SALT_SIZE, - TLS_CIPHER_AES_CCM_128_IV_SIZE); - memcpy(aes_ccm_128->rec_seq, cctx->rec_seq, - TLS_CIPHER_AES_CCM_128_REC_SEQ_SIZE); - if (copy_to_user(optval, aes_ccm_128, sizeof(*aes_ccm_128))) - rc = -EFAULT; - break; - } - case TLS_CIPHER_CHACHA20_POLY1305: { - struct tls12_crypto_info_chacha20_poly1305 *chacha20_poly1305 = - container_of(crypto_info, - struct tls12_crypto_info_chacha20_poly1305, - info); - - if (len != sizeof(*chacha20_poly1305)) { - rc = -EINVAL; - goto out; - } - memcpy(chacha20_poly1305->iv, - cctx->iv + TLS_CIPHER_CHACHA20_POLY1305_SALT_SIZE, - TLS_CIPHER_CHACHA20_POLY1305_IV_SIZE); - memcpy(chacha20_poly1305->rec_seq, cctx->rec_seq, - TLS_CIPHER_CHACHA20_POLY1305_REC_SEQ_SIZE); - if (copy_to_user(optval, chacha20_poly1305, - sizeof(*chacha20_poly1305))) - rc = -EFAULT; - break; + cipher_desc = get_cipher_desc(crypto_info->cipher_type); + if (!cipher_desc || len != cipher_desc->crypto_info) { + rc = -EINVAL; + goto out; } - case TLS_CIPHER_SM4_GCM: { - struct tls12_crypto_info_sm4_gcm *sm4_gcm_info = - container_of(crypto_info, - struct tls12_crypto_info_sm4_gcm, info); - if (len != sizeof(*sm4_gcm_info)) { - rc = 
-EINVAL; - goto out; - } - memcpy(sm4_gcm_info->iv, - cctx->iv + TLS_CIPHER_SM4_GCM_SALT_SIZE, - TLS_CIPHER_SM4_GCM_IV_SIZE); - memcpy(sm4_gcm_info->rec_seq, cctx->rec_seq, - TLS_CIPHER_SM4_GCM_REC_SEQ_SIZE); - if (copy_to_user(optval, sm4_gcm_info, sizeof(*sm4_gcm_info))) - rc = -EFAULT; - break; - } - case TLS_CIPHER_SM4_CCM: { - struct tls12_crypto_info_sm4_ccm *sm4_ccm_info = - container_of(crypto_info, - struct tls12_crypto_info_sm4_ccm, info); + memcpy(crypto_info_iv(crypto_info, cipher_desc), + cctx->iv + cipher_desc->salt, cipher_desc->iv); + memcpy(crypto_info_rec_seq(crypto_info, cipher_desc), + cctx->rec_seq, cipher_desc->rec_seq); - if (len != sizeof(*sm4_ccm_info)) { - rc = -EINVAL; - goto out; - } - memcpy(sm4_ccm_info->iv, - cctx->iv + TLS_CIPHER_SM4_CCM_SALT_SIZE, - TLS_CIPHER_SM4_CCM_IV_SIZE); - memcpy(sm4_ccm_info->rec_seq, cctx->rec_seq, - TLS_CIPHER_SM4_CCM_REC_SEQ_SIZE); - if (copy_to_user(optval, sm4_ccm_info, sizeof(*sm4_ccm_info))) - rc = -EFAULT; - break; - } - case TLS_CIPHER_ARIA_GCM_128: { - struct tls12_crypto_info_aria_gcm_128 * - crypto_info_aria_gcm_128 = - container_of(crypto_info, - struct tls12_crypto_info_aria_gcm_128, - info); - - if (len != sizeof(*crypto_info_aria_gcm_128)) { - rc = -EINVAL; - goto out; - } - memcpy(crypto_info_aria_gcm_128->iv, - cctx->iv + TLS_CIPHER_ARIA_GCM_128_SALT_SIZE, - TLS_CIPHER_ARIA_GCM_128_IV_SIZE); - memcpy(crypto_info_aria_gcm_128->rec_seq, cctx->rec_seq, - TLS_CIPHER_ARIA_GCM_128_REC_SEQ_SIZE); - if (copy_to_user(optval, - crypto_info_aria_gcm_128, - sizeof(*crypto_info_aria_gcm_128))) - rc = -EFAULT; - break; - } - case TLS_CIPHER_ARIA_GCM_256: { - struct tls12_crypto_info_aria_gcm_256 * - crypto_info_aria_gcm_256 = - container_of(crypto_info, - struct tls12_crypto_info_aria_gcm_256, - info); - - if (len != sizeof(*crypto_info_aria_gcm_256)) { - rc = -EINVAL; - goto out; - } - memcpy(crypto_info_aria_gcm_256->iv, - cctx->iv + TLS_CIPHER_ARIA_GCM_256_SALT_SIZE, - TLS_CIPHER_ARIA_GCM_256_IV_SIZE); - memcpy(crypto_info_aria_gcm_256->rec_seq, cctx->rec_seq, - TLS_CIPHER_ARIA_GCM_256_REC_SEQ_SIZE); - if (copy_to_user(optval, - crypto_info_aria_gcm_256, - sizeof(*crypto_info_aria_gcm_256))) - rc = -EFAULT; - break; - } - default: - rc = -EINVAL; - } + if (copy_to_user(optval, crypto_info, cipher_desc->crypto_info)) + rc = -EFAULT; out: return rc; -- cgit v1.2.3 From d9a6ca1a975839e61c0463a5e29dd8053fd3e8a4 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 25 Aug 2023 23:35:20 +0200 Subject: tls: use tls_cipher_desc to get per-cipher sizes in tls_set_sw_offload We can get rid of some local variables, but we have to keep nonce_size because tls1.3 uses nonce_size = 0 for all ciphers. We can also drop the runtime sanity checks on iv/rec_seq/tag size, since we have compile time checks on those values. 
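The gist of the resulting setup, as a sketch (only the size fields move to the descriptor in this patch; iv, rec_seq, key, salt and cipher_name are still taken from the per-cipher switch):

    cipher_desc = get_cipher_desc(crypto_info->cipher_type);
    nonce_size = cipher_desc->nonce;

    if (crypto_info->version == TLS_1_3_VERSION) {
            /* TLS 1.3 sends no explicit per-record nonce */
            nonce_size = 0;
            prot->aad_size = TLS_HEADER_SIZE;
    }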
Signed-off-by: Sabrina Dubroca Link: https://lore.kernel.org/r/deed9c4430a62c31751a72b8c03ad66ffe710717.1692977948.git.sd@queasysnail.net Signed-off-by: Jakub Kicinski --- net/tls/tls_sw.c | 79 ++++++++++++-------------------------------------------- 1 file changed, 16 insertions(+), 63 deletions(-) (limited to 'net') diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 5c122d7bb784..85708656dcd4 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -2590,10 +2590,10 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) struct tls_sw_context_rx *sw_ctx_rx = NULL; struct cipher_context *cctx; struct crypto_aead **aead; - u16 nonce_size, tag_size, iv_size, rec_seq_size, salt_size; struct crypto_tfm *tfm; char *iv, *rec_seq, *key, *salt, *cipher_name; - size_t keysize; + const struct tls_cipher_desc *cipher_desc; + u16 nonce_size; int rc = 0; if (!ctx) { @@ -2652,16 +2652,10 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) struct tls12_crypto_info_aes_gcm_128 *gcm_128_info; gcm_128_info = (void *)crypto_info; - nonce_size = TLS_CIPHER_AES_GCM_128_IV_SIZE; - tag_size = TLS_CIPHER_AES_GCM_128_TAG_SIZE; - iv_size = TLS_CIPHER_AES_GCM_128_IV_SIZE; iv = gcm_128_info->iv; - rec_seq_size = TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE; rec_seq = gcm_128_info->rec_seq; - keysize = TLS_CIPHER_AES_GCM_128_KEY_SIZE; key = gcm_128_info->key; salt = gcm_128_info->salt; - salt_size = TLS_CIPHER_AES_GCM_128_SALT_SIZE; cipher_name = "gcm(aes)"; break; } @@ -2669,16 +2663,10 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) struct tls12_crypto_info_aes_gcm_256 *gcm_256_info; gcm_256_info = (void *)crypto_info; - nonce_size = TLS_CIPHER_AES_GCM_256_IV_SIZE; - tag_size = TLS_CIPHER_AES_GCM_256_TAG_SIZE; - iv_size = TLS_CIPHER_AES_GCM_256_IV_SIZE; iv = gcm_256_info->iv; - rec_seq_size = TLS_CIPHER_AES_GCM_256_REC_SEQ_SIZE; rec_seq = gcm_256_info->rec_seq; - keysize = TLS_CIPHER_AES_GCM_256_KEY_SIZE; key = gcm_256_info->key; salt = gcm_256_info->salt; - salt_size = TLS_CIPHER_AES_GCM_256_SALT_SIZE; cipher_name = "gcm(aes)"; break; } @@ -2686,16 +2674,10 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) struct tls12_crypto_info_aes_ccm_128 *ccm_128_info; ccm_128_info = (void *)crypto_info; - nonce_size = TLS_CIPHER_AES_CCM_128_IV_SIZE; - tag_size = TLS_CIPHER_AES_CCM_128_TAG_SIZE; - iv_size = TLS_CIPHER_AES_CCM_128_IV_SIZE; iv = ccm_128_info->iv; - rec_seq_size = TLS_CIPHER_AES_CCM_128_REC_SEQ_SIZE; rec_seq = ccm_128_info->rec_seq; - keysize = TLS_CIPHER_AES_CCM_128_KEY_SIZE; key = ccm_128_info->key; salt = ccm_128_info->salt; - salt_size = TLS_CIPHER_AES_CCM_128_SALT_SIZE; cipher_name = "ccm(aes)"; break; } @@ -2703,16 +2685,10 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) struct tls12_crypto_info_chacha20_poly1305 *chacha20_poly1305_info; chacha20_poly1305_info = (void *)crypto_info; - nonce_size = 0; - tag_size = TLS_CIPHER_CHACHA20_POLY1305_TAG_SIZE; - iv_size = TLS_CIPHER_CHACHA20_POLY1305_IV_SIZE; iv = chacha20_poly1305_info->iv; - rec_seq_size = TLS_CIPHER_CHACHA20_POLY1305_REC_SEQ_SIZE; rec_seq = chacha20_poly1305_info->rec_seq; - keysize = TLS_CIPHER_CHACHA20_POLY1305_KEY_SIZE; key = chacha20_poly1305_info->key; salt = chacha20_poly1305_info->salt; - salt_size = TLS_CIPHER_CHACHA20_POLY1305_SALT_SIZE; cipher_name = "rfc7539(chacha20,poly1305)"; break; } @@ -2720,16 +2696,10 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) struct tls12_crypto_info_sm4_gcm 
*sm4_gcm_info; sm4_gcm_info = (void *)crypto_info; - nonce_size = TLS_CIPHER_SM4_GCM_IV_SIZE; - tag_size = TLS_CIPHER_SM4_GCM_TAG_SIZE; - iv_size = TLS_CIPHER_SM4_GCM_IV_SIZE; iv = sm4_gcm_info->iv; - rec_seq_size = TLS_CIPHER_SM4_GCM_REC_SEQ_SIZE; rec_seq = sm4_gcm_info->rec_seq; - keysize = TLS_CIPHER_SM4_GCM_KEY_SIZE; key = sm4_gcm_info->key; salt = sm4_gcm_info->salt; - salt_size = TLS_CIPHER_SM4_GCM_SALT_SIZE; cipher_name = "gcm(sm4)"; break; } @@ -2737,16 +2707,10 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) struct tls12_crypto_info_sm4_ccm *sm4_ccm_info; sm4_ccm_info = (void *)crypto_info; - nonce_size = TLS_CIPHER_SM4_CCM_IV_SIZE; - tag_size = TLS_CIPHER_SM4_CCM_TAG_SIZE; - iv_size = TLS_CIPHER_SM4_CCM_IV_SIZE; iv = sm4_ccm_info->iv; - rec_seq_size = TLS_CIPHER_SM4_CCM_REC_SEQ_SIZE; rec_seq = sm4_ccm_info->rec_seq; - keysize = TLS_CIPHER_SM4_CCM_KEY_SIZE; key = sm4_ccm_info->key; salt = sm4_ccm_info->salt; - salt_size = TLS_CIPHER_SM4_CCM_SALT_SIZE; cipher_name = "ccm(sm4)"; break; } @@ -2754,16 +2718,10 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) struct tls12_crypto_info_aria_gcm_128 *aria_gcm_128_info; aria_gcm_128_info = (void *)crypto_info; - nonce_size = TLS_CIPHER_ARIA_GCM_128_IV_SIZE; - tag_size = TLS_CIPHER_ARIA_GCM_128_TAG_SIZE; - iv_size = TLS_CIPHER_ARIA_GCM_128_IV_SIZE; iv = aria_gcm_128_info->iv; - rec_seq_size = TLS_CIPHER_ARIA_GCM_128_REC_SEQ_SIZE; rec_seq = aria_gcm_128_info->rec_seq; - keysize = TLS_CIPHER_ARIA_GCM_128_KEY_SIZE; key = aria_gcm_128_info->key; salt = aria_gcm_128_info->salt; - salt_size = TLS_CIPHER_ARIA_GCM_128_SALT_SIZE; cipher_name = "gcm(aria)"; break; } @@ -2771,16 +2729,10 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) struct tls12_crypto_info_aria_gcm_256 *gcm_256_info; gcm_256_info = (void *)crypto_info; - nonce_size = TLS_CIPHER_ARIA_GCM_256_IV_SIZE; - tag_size = TLS_CIPHER_ARIA_GCM_256_TAG_SIZE; - iv_size = TLS_CIPHER_ARIA_GCM_256_IV_SIZE; iv = gcm_256_info->iv; - rec_seq_size = TLS_CIPHER_ARIA_GCM_256_REC_SEQ_SIZE; rec_seq = gcm_256_info->rec_seq; - keysize = TLS_CIPHER_ARIA_GCM_256_KEY_SIZE; key = gcm_256_info->key; salt = gcm_256_info->salt; - salt_size = TLS_CIPHER_ARIA_GCM_256_SALT_SIZE; cipher_name = "gcm(aria)"; break; } @@ -2789,6 +2741,9 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) goto free_priv; } + cipher_desc = get_cipher_desc(crypto_info->cipher_type); + nonce_size = cipher_desc->nonce; + if (crypto_info->version == TLS_1_3_VERSION) { nonce_size = 0; prot->aad_size = TLS_HEADER_SIZE; @@ -2799,9 +2754,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) } /* Sanity-check the sizes for stack allocations. 
*/ - if (iv_size > MAX_IV_SIZE || nonce_size > MAX_IV_SIZE || - rec_seq_size > TLS_MAX_REC_SEQ_SIZE || tag_size != TLS_TAG_SIZE || - prot->aad_size > TLS_MAX_AAD_SIZE) { + if (nonce_size > MAX_IV_SIZE || prot->aad_size > TLS_MAX_AAD_SIZE) { rc = -EINVAL; goto free_priv; } @@ -2809,21 +2762,22 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) prot->version = crypto_info->version; prot->cipher_type = crypto_info->cipher_type; prot->prepend_size = TLS_HEADER_SIZE + nonce_size; - prot->tag_size = tag_size; + prot->tag_size = cipher_desc->tag; prot->overhead_size = prot->prepend_size + prot->tag_size + prot->tail_size; - prot->iv_size = iv_size; - prot->salt_size = salt_size; - cctx->iv = kmalloc(iv_size + salt_size, GFP_KERNEL); + prot->iv_size = cipher_desc->iv; + prot->salt_size = cipher_desc->salt; + cctx->iv = kmalloc(cipher_desc->iv + cipher_desc->salt, GFP_KERNEL); if (!cctx->iv) { rc = -ENOMEM; goto free_priv; } /* Note: 128 & 256 bit salt are the same size */ - prot->rec_seq_size = rec_seq_size; - memcpy(cctx->iv, salt, salt_size); - memcpy(cctx->iv + salt_size, iv, iv_size); - cctx->rec_seq = kmemdup(rec_seq, rec_seq_size, GFP_KERNEL); + prot->rec_seq_size = cipher_desc->rec_seq; + memcpy(cctx->iv, salt, cipher_desc->salt); + memcpy(cctx->iv + cipher_desc->salt, iv, cipher_desc->iv); + + cctx->rec_seq = kmemdup(rec_seq, cipher_desc->rec_seq, GFP_KERNEL); if (!cctx->rec_seq) { rc = -ENOMEM; goto free_iv; @@ -2840,8 +2794,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) ctx->push_pending_record = tls_sw_push_pending_record; - rc = crypto_aead_setkey(*aead, key, keysize); - + rc = crypto_aead_setkey(*aead, key, cipher_desc->key); if (rc) goto free_aead; -- cgit v1.2.3 From 48dfad27fd4045df82eee54186c2513cbfc54c1d Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 25 Aug 2023 23:35:21 +0200 Subject: tls: use tls_cipher_desc to access per-cipher crypto_info in tls_set_sw_offload The crypto_info_* helpers allow us to fetch pointers into the per-cipher crypto_info's data. 
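For readers not following the whole series, here is a simplified, standalone sketch of what offset-based accessors of this kind can look like. The struct layout, descriptor and offsets below are illustrative stand-ins rather than the kernel's definitions; only the helper names (crypto_info_iv() and crypto_info_key()) match the ones the patch uses:

#include <stddef.h>
#include <stdio.h>

/* Stand-in for one per-cipher crypto_info layout: each cipher has its own
 * struct, but the iv/key/salt/rec_seq members sit at cipher-specific
 * offsets that a descriptor can record once. */
struct crypto_info_aes_gcm_128 {
	unsigned short version;
	unsigned short cipher_type;
	unsigned char iv[8];
	unsigned char key[16];
	unsigned char salt[4];
	unsigned char rec_seq[8];
};

struct cipher_desc {
	size_t offset_iv;
	size_t offset_key;
};

static const struct cipher_desc aes_gcm_128_desc = {
	.offset_iv  = offsetof(struct crypto_info_aes_gcm_128, iv),
	.offset_key = offsetof(struct crypto_info_aes_gcm_128, key),
};

/* Generic accessors: given the (cipher-specific) crypto_info blob and its
 * descriptor, return a pointer to the requested field. Salt and rec_seq
 * accessors would follow the exact same shape. */
static char *crypto_info_iv(void *crypto_info, const struct cipher_desc *d)
{
	return (char *)crypto_info + d->offset_iv;
}

static char *crypto_info_key(void *crypto_info, const struct cipher_desc *d)
{
	return (char *)crypto_info + d->offset_key;
}

int main(void)
{
	struct crypto_info_aes_gcm_128 info = { .iv = "ivdata", .key = "keymaterial" };

	/* Callers no longer need a per-cipher switch just to reach iv/key. */
	printf("iv[0]=%c key[0]=%c\n",
	       *crypto_info_iv(&info, &aes_gcm_128_desc),
	       *crypto_info_key(&info, &aes_gcm_128_desc));
	return 0;
}
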
Signed-off-by: Sabrina Dubroca Link: https://lore.kernel.org/r/c23af110caf0af6b68de2f86c58064913e2e902a.1692977948.git.sd@queasysnail.net Signed-off-by: Jakub Kicinski --- net/tls/tls_sw.c | 89 +++++++++----------------------------------------------- 1 file changed, 13 insertions(+), 76 deletions(-) (limited to 'net') diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 85708656dcd4..9c18ddf0d568 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -2648,94 +2648,26 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) } switch (crypto_info->cipher_type) { - case TLS_CIPHER_AES_GCM_128: { - struct tls12_crypto_info_aes_gcm_128 *gcm_128_info; - - gcm_128_info = (void *)crypto_info; - iv = gcm_128_info->iv; - rec_seq = gcm_128_info->rec_seq; - key = gcm_128_info->key; - salt = gcm_128_info->salt; + case TLS_CIPHER_AES_GCM_128: + case TLS_CIPHER_AES_GCM_256: cipher_name = "gcm(aes)"; break; - } - case TLS_CIPHER_AES_GCM_256: { - struct tls12_crypto_info_aes_gcm_256 *gcm_256_info; - - gcm_256_info = (void *)crypto_info; - iv = gcm_256_info->iv; - rec_seq = gcm_256_info->rec_seq; - key = gcm_256_info->key; - salt = gcm_256_info->salt; - cipher_name = "gcm(aes)"; - break; - } - case TLS_CIPHER_AES_CCM_128: { - struct tls12_crypto_info_aes_ccm_128 *ccm_128_info; - - ccm_128_info = (void *)crypto_info; - iv = ccm_128_info->iv; - rec_seq = ccm_128_info->rec_seq; - key = ccm_128_info->key; - salt = ccm_128_info->salt; + case TLS_CIPHER_AES_CCM_128: cipher_name = "ccm(aes)"; break; - } - case TLS_CIPHER_CHACHA20_POLY1305: { - struct tls12_crypto_info_chacha20_poly1305 *chacha20_poly1305_info; - - chacha20_poly1305_info = (void *)crypto_info; - iv = chacha20_poly1305_info->iv; - rec_seq = chacha20_poly1305_info->rec_seq; - key = chacha20_poly1305_info->key; - salt = chacha20_poly1305_info->salt; + case TLS_CIPHER_CHACHA20_POLY1305: cipher_name = "rfc7539(chacha20,poly1305)"; break; - } - case TLS_CIPHER_SM4_GCM: { - struct tls12_crypto_info_sm4_gcm *sm4_gcm_info; - - sm4_gcm_info = (void *)crypto_info; - iv = sm4_gcm_info->iv; - rec_seq = sm4_gcm_info->rec_seq; - key = sm4_gcm_info->key; - salt = sm4_gcm_info->salt; + case TLS_CIPHER_SM4_GCM: cipher_name = "gcm(sm4)"; break; - } - case TLS_CIPHER_SM4_CCM: { - struct tls12_crypto_info_sm4_ccm *sm4_ccm_info; - - sm4_ccm_info = (void *)crypto_info; - iv = sm4_ccm_info->iv; - rec_seq = sm4_ccm_info->rec_seq; - key = sm4_ccm_info->key; - salt = sm4_ccm_info->salt; + case TLS_CIPHER_SM4_CCM: cipher_name = "ccm(sm4)"; break; - } - case TLS_CIPHER_ARIA_GCM_128: { - struct tls12_crypto_info_aria_gcm_128 *aria_gcm_128_info; - - aria_gcm_128_info = (void *)crypto_info; - iv = aria_gcm_128_info->iv; - rec_seq = aria_gcm_128_info->rec_seq; - key = aria_gcm_128_info->key; - salt = aria_gcm_128_info->salt; + case TLS_CIPHER_ARIA_GCM_128: + case TLS_CIPHER_ARIA_GCM_256: cipher_name = "gcm(aria)"; break; - } - case TLS_CIPHER_ARIA_GCM_256: { - struct tls12_crypto_info_aria_gcm_256 *gcm_256_info; - - gcm_256_info = (void *)crypto_info; - iv = gcm_256_info->iv; - rec_seq = gcm_256_info->rec_seq; - key = gcm_256_info->key; - salt = gcm_256_info->salt; - cipher_name = "gcm(aria)"; - break; - } default: rc = -EINVAL; goto free_priv; @@ -2744,6 +2676,11 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) cipher_desc = get_cipher_desc(crypto_info->cipher_type); nonce_size = cipher_desc->nonce; + iv = crypto_info_iv(crypto_info, cipher_desc); + key = crypto_info_key(crypto_info, cipher_desc); + salt = 
crypto_info_salt(crypto_info, cipher_desc); + rec_seq = crypto_info_rec_seq(crypto_info, cipher_desc); + if (crypto_info->version == TLS_1_3_VERSION) { nonce_size = 0; prot->aad_size = TLS_HEADER_SIZE; -- cgit v1.2.3 From f3e444e31f9fa45ad59cc2d0e0c1bcc86eef4780 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 25 Aug 2023 23:35:22 +0200 Subject: tls: get cipher_name from cipher_desc in tls_set_sw_offload tls_cipher_desc also contains the algorithm name needed by crypto_alloc_aead, use it. Finally, use get_cipher_desc to check if the cipher_type coming from userspace is valid, and remove the cipher_type switch. Signed-off-by: Sabrina Dubroca Link: https://lore.kernel.org/r/53d021d80138aa125a9cef4468aa5ce531975a7b.1692977948.git.sd@queasysnail.net Signed-off-by: Jakub Kicinski --- net/tls/tls_sw.c | 29 ++++------------------------- 1 file changed, 4 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 9c18ddf0d568..1ed4a611631f 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -2591,7 +2591,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) struct cipher_context *cctx; struct crypto_aead **aead; struct crypto_tfm *tfm; - char *iv, *rec_seq, *key, *salt, *cipher_name; + char *iv, *rec_seq, *key, *salt; const struct tls_cipher_desc *cipher_desc; u16 nonce_size; int rc = 0; @@ -2647,33 +2647,12 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) aead = &sw_ctx_rx->aead_recv; } - switch (crypto_info->cipher_type) { - case TLS_CIPHER_AES_GCM_128: - case TLS_CIPHER_AES_GCM_256: - cipher_name = "gcm(aes)"; - break; - case TLS_CIPHER_AES_CCM_128: - cipher_name = "ccm(aes)"; - break; - case TLS_CIPHER_CHACHA20_POLY1305: - cipher_name = "rfc7539(chacha20,poly1305)"; - break; - case TLS_CIPHER_SM4_GCM: - cipher_name = "gcm(sm4)"; - break; - case TLS_CIPHER_SM4_CCM: - cipher_name = "ccm(sm4)"; - break; - case TLS_CIPHER_ARIA_GCM_128: - case TLS_CIPHER_ARIA_GCM_256: - cipher_name = "gcm(aria)"; - break; - default: + cipher_desc = get_cipher_desc(crypto_info->cipher_type); + if (!cipher_desc) { rc = -EINVAL; goto free_priv; } - cipher_desc = get_cipher_desc(crypto_info->cipher_type); nonce_size = cipher_desc->nonce; iv = crypto_info_iv(crypto_info, cipher_desc); @@ -2721,7 +2700,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) } if (!*aead) { - *aead = crypto_alloc_aead(cipher_name, 0, 0); + *aead = crypto_alloc_aead(cipher_desc->cipher_name, 0, 0); if (IS_ERR(*aead)) { rc = PTR_ERR(*aead); *aead = NULL; -- cgit v1.2.3 From c2f8fd7949603efb03908e05abbf7726748c8de3 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 24 Aug 2023 09:50:59 -0700 Subject: netrom: Deny concurrent connect(). syzkaller reported null-ptr-deref [0] related to AF_NETROM. This is another self-accept issue from the strace log. [1] syz-executor creates an AF_NETROM socket and calls connect(), which is blocked at that time. Then, sk->sk_state is TCP_SYN_SENT and sock->state is SS_CONNECTING. [pid 5059] socket(AF_NETROM, SOCK_SEQPACKET, 0) = 4 [pid 5059] connect(4, {sa_family=AF_NETROM, sa_data="..." Another thread calls connect() concurrently, which finally fails with -EINVAL. However, the problem here is the socket state is reset even while the first connect() is blocked. [pid 5060] connect(4, NULL, 0 [pid 5060] <... connect resumed>) = -1 EINVAL (Invalid argument) As sk->state is TCP_CLOSE and sock->state is SS_UNCONNECTED, the following listen() succeeds. 
Then, the first connect() looks up itself as a listener and puts skb into the queue with skb->sk itself. As a result, the next accept() gets another FD of itself as 3, and the first connect() finishes. [pid 5060] listen(4, 0 [pid 5060] <... listen resumed>) = 0 [pid 5060] accept(4, NULL, NULL [pid 5060] <... accept resumed>) = 3 [pid 5059] <... connect resumed>) = 0 Then, accept4() is called but blocked, which causes the general protection fault later. [pid 5059] accept4(4, NULL, 0x20000400, SOCK_NONBLOCK After that, another self-accept occurs by accept() and writev(). [pid 5060] accept(4, NULL, NULL [pid 5061] writev(3, [{iov_base=...}] [pid 5061] <... writev resumed>) = 99 [pid 5060] <... accept resumed>) = 6 Finally, the leader thread close()s all FDs. Since the three FDs reference the same socket, nr_release() does the cleanup for it three times, and the remaining accept4() causes the following fault. [pid 5058] close(3) = 0 [pid 5058] close(4) = 0 [pid 5058] close(5) = -1 EBADF (Bad file descriptor) [pid 5058] close(6) = 0 [pid 5058] <... exit_group resumed>) = ? [ 83.456055][ T5059] general protection fault, probably for non-canonical address 0xdffffc0000000003: 0000 [#1] PREEMPT SMP KASAN To avoid the issue, we need to return an error for connect() if another connect() is in progress, as done in __inet_stream_connect(). [0]: general protection fault, probably for non-canonical address 0xdffffc0000000003: 0000 [#1] PREEMPT SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000018-0x000000000000001f] CPU: 0 PID: 5059 Comm: syz-executor.0 Not tainted 6.5.0-rc5-syzkaller-00194-gace0ab3a4b54 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 07/26/2023 RIP: 0010:__lock_acquire+0x109/0x5de0 kernel/locking/lockdep.c:5012 Code: 45 85 c9 0f 84 cc 0e 00 00 44 8b 05 11 6e 23 0b 45 85 c0 0f 84 be 0d 00 00 48 ba 00 00 00 00 00 fc ff df 4c 89 d1 48 c1 e9 03 <80> 3c 11 00 0f 85 e8 40 00 00 49 81 3a a0 69 48 90 0f 84 96 0d 00 RSP: 0018:ffffc90003d6f9e0 EFLAGS: 00010006 RAX: ffff8880244c8000 RBX: 1ffff920007adf6c RCX: 0000000000000003 RDX: dffffc0000000000 RSI: 0000000000000000 RDI: 0000000000000018 RBP: 0000000000000001 R08: 0000000000000001 R09: 0000000000000001 R10: 0000000000000018 R11: 0000000000000000 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 FS: 00007f51d519a6c0(0000) GS:ffff8880b9800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f51d5158d58 CR3: 000000002943f000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: lock_acquire kernel/locking/lockdep.c:5761 [inline] lock_acquire+0x1ae/0x510 kernel/locking/lockdep.c:5726 __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline] _raw_spin_lock_irqsave+0x3a/0x50 kernel/locking/spinlock.c:162 prepare_to_wait+0x47/0x380 kernel/sched/wait.c:269 nr_accept+0x20d/0x650 net/netrom/af_netrom.c:798 do_accept+0x3a6/0x570 net/socket.c:1872 __sys_accept4_file net/socket.c:1913 [inline] __sys_accept4+0x99/0x120 net/socket.c:1943 __do_sys_accept4 net/socket.c:1954 [inline] __se_sys_accept4 net/socket.c:1951 [inline] __x64_sys_accept4+0x96/0x100 net/socket.c:1951 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x38/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd RIP: 0033:0x7f51d447cae9 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 e1 20 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 
89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b0 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f51d519a0c8 EFLAGS: 00000246 ORIG_RAX: 0000000000000120 RAX: ffffffffffffffda RBX: 00007f51d459bf80 RCX: 00007f51d447cae9 RDX: 0000000020000400 RSI: 0000000000000000 RDI: 0000000000000004 RBP: 00007f51d44c847a R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000800 R11: 0000000000000246 R12: 0000000000000000 R13: 000000000000000b R14: 00007f51d459bf80 R15: 00007ffc25c34e48 Link: https://syzkaller.appspot.com/text?tag=CrashLog&x=152cdb63a80000 [1] Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot+666c97e4686410e79649@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=666c97e4686410e79649 Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/netrom/af_netrom.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index eb8ccbd58df7..96e91ab71573 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -660,6 +660,11 @@ static int nr_connect(struct socket *sock, struct sockaddr *uaddr, goto out_release; } + if (sock->state == SS_CONNECTING) { + err = -EALREADY; + goto out_release; + } + sk->sk_state = TCP_CLOSE; sock->state = SS_UNCONNECTED; -- cgit v1.2.3 From 977ad86c2a1bcaf58f01ab98df5cc145083c489c Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Fri, 25 Aug 2023 15:32:41 +0200 Subject: dccp: Fix out of bounds access in DCCP error handler There was a previous attempt to fix an out-of-bounds access in the DCCP error handlers, but that fix assumed that the error handlers only want to access the first 8 bytes of the DCCP header. Actually, they also look at the DCCP sequence number, which is stored beyond 8 bytes, so an explicit pskb_may_pull() is required. Fixes: 6706a97fec96 ("dccp: fix out of bound access in dccp_v4_err()") Fixes: 1aa9d1a0e7ee ("ipv6: dccp: fix out of bound access in dccp_v6_err()") Cc: stable@vger.kernel.org Signed-off-by: Jann Horn Reviewed-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/dccp/ipv4.c | 13 +++++++++---- net/dccp/ipv6.c | 15 ++++++++++----- 2 files changed, 19 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index a545ad71201c..a5361fb7a415 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -255,12 +255,17 @@ static int dccp_v4_err(struct sk_buff *skb, u32 info) int err; struct net *net = dev_net(skb->dev); - /* Only need dccph_dport & dccph_sport which are the first - * 4 bytes in dccp header. + /* For the first __dccp_basic_hdr_len() check, we only need dh->dccph_x, + * which is in byte 7 of the dccp header. * Our caller (icmp_socket_deliver()) already pulled 8 bytes for us. + * + * Later on, we want to access the sequence number fields, which are + * beyond 8 bytes, so we have to pskb_may_pull() ourselves. 
*/ - BUILD_BUG_ON(offsetofend(struct dccp_hdr, dccph_sport) > 8); - BUILD_BUG_ON(offsetofend(struct dccp_hdr, dccph_dport) > 8); + dh = (struct dccp_hdr *)(skb->data + offset); + if (!pskb_may_pull(skb, offset + __dccp_basic_hdr_len(dh))) + return -EINVAL; + iph = (struct iphdr *)skb->data; dh = (struct dccp_hdr *)(skb->data + offset); sk = __inet_lookup_established(net, &dccp_hashinfo, diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index d29d1163203d..25816e790527 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -74,7 +74,7 @@ static inline __u64 dccp_v6_init_sequence(struct sk_buff *skb) static int dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { - const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data; + const struct ipv6hdr *hdr; const struct dccp_hdr *dh; struct dccp_sock *dp; struct ipv6_pinfo *np; @@ -83,12 +83,17 @@ static int dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, __u64 seq; struct net *net = dev_net(skb->dev); - /* Only need dccph_dport & dccph_sport which are the first - * 4 bytes in dccp header. + /* For the first __dccp_basic_hdr_len() check, we only need dh->dccph_x, + * which is in byte 7 of the dccp header. * Our caller (icmpv6_notify()) already pulled 8 bytes for us. + * + * Later on, we want to access the sequence number fields, which are + * beyond 8 bytes, so we have to pskb_may_pull() ourselves. */ - BUILD_BUG_ON(offsetofend(struct dccp_hdr, dccph_sport) > 8); - BUILD_BUG_ON(offsetofend(struct dccp_hdr, dccph_dport) > 8); + dh = (struct dccp_hdr *)(skb->data + offset); + if (!pskb_may_pull(skb, offset + __dccp_basic_hdr_len(dh))) + return -EINVAL; + hdr = (const struct ipv6hdr *)skb->data; dh = (struct dccp_hdr *)(skb->data + offset); sk = __inet6_lookup_established(net, &dccp_hashinfo, -- cgit v1.2.3 From 28d18b673ffa2d13112ddb6e4c32c60d9b0cda50 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 25 Aug 2023 15:49:45 +0200 Subject: net: Fix skb consume leak in sch_handle_egress Fix a memory leak for the tc egress path with TC_ACT_{STOLEN,QUEUED,TRAP}: [...] unreferenced object 0xffff88818bcb4f00 (size 232): comm "softirq", pid 0, jiffies 4299085078 (age 134.028s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 80 70 61 81 88 ff ff 00 41 31 14 81 88 ff ff ..pa.....A1..... backtrace: [] kmem_cache_alloc_node+0x268/0x400 [] __alloc_skb+0x211/0x2c0 [] alloc_skb_with_frags+0xbe/0x6b0 [] sock_alloc_send_pskb+0x6a9/0x870 [] __ip_append_data+0x14d0/0x3bf0 [] ip_append_data+0xee/0x190 [] icmp_push_reply+0xa6/0x470 [] icmp_reply+0x900/0xa00 [] icmp_echo.part.0+0x1a3/0x230 [] icmp_echo+0xcd/0x190 [] icmp_rcv+0x806/0xe10 [] ip_protocol_deliver_rcu+0x351/0x3d0 [] ip_local_deliver_finish+0x2b4/0x450 [] ip_local_deliver+0x174/0x1f0 [] ip_sublist_rcv_finish+0x1f2/0x420 [] ip_sublist_rcv+0x466/0x920 [...] I was able to reproduce this via: ip link add dev dummy0 type dummy ip link set dev dummy0 up tc qdisc add dev eth0 clsact tc filter add dev eth0 egress protocol ip prio 1 u32 match ip protocol 1 0xff action mirred egress redirect dev dummy0 ping 1.1.1.1 After the fix, there are no kmemleak reports with the reproducer. This is in line with what is also done on the ingress side, and from debugging the skb_unref(skb) on dummy xmit and sch_handle_egress() side, it is visible that these are two different skbs with both skb_unref(skb) as true. 
The two seen skbs are due to mirred doing a skb_clone() internally as use_reinsert is false in tcf_mirred_act() for egress. This was initially reported by Gal. Fixes: e420bed02507 ("bpf: Add fd-based tcx multi-prog infra with link support") Reported-by: Gal Pressman Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/bdfc2640-8f65-5b56-4472-db8e2b161aab@nvidia.com Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/core/dev.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 17e6281e408c..9f6ed6d97f89 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4061,6 +4061,7 @@ egress_verdict: case TC_ACT_STOLEN: case TC_ACT_QUEUED: case TC_ACT_TRAP: + consume_skb(skb); *ret = NET_XMIT_SUCCESS; return NULL; } -- cgit v1.2.3 From 3a1e2f43985af0dea5750c6436f8cb979780c084 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 25 Aug 2023 15:49:46 +0200 Subject: net: Make consumed action consistent in sch_handle_egress While looking at TC_ACT_* handling, the TC_ACT_CONSUMED is only handled in sch_handle_ingress but not sch_handle_egress. This was added via cd11b164073b ("net/tc: introduce TC_ACT_REINSERT.") and e5cf1baf92cb ("act_mirred: use TC_ACT_REINSERT when possible") and later got renamed into TC_ACT_CONSUMED via 720f22fed81b ("net: sched: refactor reinsert action"). The initial work was targeted for ovs back then and only needed on ingress, and the mirred action module also restricts it to only that. However, given it's an API contract it would still make sense to make this consistent to sch_handle_ingress and handle it on egress side in the same way, that is, setting return code to "success" and returning NULL back to the caller as otherwise an action module sitting on egress returning TC_ACT_CONSUMED could lead to an UAF when untreated. Signed-off-by: Daniel Borkmann Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/core/dev.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 9f6ed6d97f89..ccff2b6ef958 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4062,6 +4062,8 @@ egress_verdict: case TC_ACT_QUEUED: case TC_ACT_TRAP: consume_skb(skb); + fallthrough; + case TC_ACT_CONSUMED: *ret = NET_XMIT_SUCCESS; return NULL; } -- cgit v1.2.3 From 8be6f88b9d3fe1f6724daec4a70d6023742c9df7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 28 Aug 2023 08:47:32 +0000 Subject: inet: fix IP_TRANSPARENT error handling My recent patch forgot to change error handling for IP_TRANSPARENT socket option. WARNING: bad unlock balance detected! 6.5.0-rc7-syzkaller-01717-g59da9885767a #0 Not tainted ------------------------------------- syz-executor151/5028 is trying to release lock (sk_lock-AF_INET) at: [] sockopt_release_sock+0x53/0x70 net/core/sock.c:1073 but there are no more locks to release! 
other info that might help us debug this: 1 lock held by syz-executor151/5028: stack backtrace: CPU: 0 PID: 5028 Comm: syz-executor151 Not tainted 6.5.0-rc7-syzkaller-01717-g59da9885767a #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 07/26/2023 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xd9/0x1b0 lib/dump_stack.c:106 __lock_release kernel/locking/lockdep.c:5438 [inline] lock_release+0x4b5/0x680 kernel/locking/lockdep.c:5781 sock_release_ownership include/net/sock.h:1824 [inline] release_sock+0x175/0x1b0 net/core/sock.c:3527 sockopt_release_sock+0x53/0x70 net/core/sock.c:1073 do_ip_setsockopt+0x12c1/0x3640 net/ipv4/ip_sockglue.c:1364 ip_setsockopt+0x59/0xe0 net/ipv4/ip_sockglue.c:1419 raw_setsockopt+0x218/0x290 net/ipv4/raw.c:833 __sys_setsockopt+0x2cd/0x5b0 net/socket.c:2305 __do_sys_setsockopt net/socket.c:2316 [inline] __se_sys_setsockopt net/socket.c:2313 [inline] Fixes: 4bd0623f04ee ("inet: move inet->transparent to inet->inet_flags") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: Soheil Hassas Yeganeh Cc: Simon Horman Cc: Matthieu Baerts Signed-off-by: David S. Miller --- net/ipv4/ip_sockglue.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 54ad0f0d5c2d..d1c73660b844 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1007,12 +1007,10 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname, return 0; case IP_TRANSPARENT: if (!!val && !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && - !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { - err = -EPERM; - break; - } + !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) + return -EPERM; if (optlen < 1) - goto e_inval; + return -EINVAL; inet_assign_bit(TRANSPARENT, sk, val); return 0; case IP_NODEFRAG: -- cgit v1.2.3 From 56e65312830ec45e3ccbc929de6f5b86bc301546 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 28 Aug 2023 08:16:43 +0200 Subject: devlink: push object register/unregister notifications into separate helpers In preparations of leftover.c split to individual files, avoid need to have object structures exposed in devl_internal.h and allow to have them maintained in object files. The register/unregister notifications need to know the structures to iterate lists. To avoid the need, introduce per-object register/unregister notification helpers and use them. 
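For context, a minimal standalone sketch of this pattern follows. The object type, storage and function names are toy stand-ins (the real helpers are the devlink_*_notify_register()/_unregister() pairs added in the diff); the point is simply that the core code calls per-object helpers and no longer needs to see the object structures or walk their lists itself:

#include <stddef.h>
#include <stdio.h>

enum cmd { CMD_NEW, CMD_DEL };

/* Object-private state: only this object's code knows how its instances
 * are stored and how a single notification is emitted. */
struct port { int index; };

static struct port ports[] = { { 1 }, { 2 }, { 3 } };
#define NUM_PORTS (sizeof(ports) / sizeof(ports[0]))

static void port_notify(const struct port *p, enum cmd cmd)
{
	printf("port %d: %s\n", p->index, cmd == CMD_NEW ? "NEW" : "DEL");
}

/* Per-object helpers exported to the core: the core no longer iterates the
 * port list or touches struct port directly. */
static void ports_notify_register(void)
{
	for (size_t i = 0; i < NUM_PORTS; i++)
		port_notify(&ports[i], CMD_NEW);
}

static void ports_notify_unregister(void)
{
	/* Mirror the pattern of walking the list in reverse on teardown. */
	for (size_t i = NUM_PORTS; i-- > 0; )
		port_notify(&ports[i], CMD_DEL);
}

/* Core code reduces to a sequence of per-object helper calls. */
int main(void)
{
	ports_notify_register();
	ports_notify_unregister();
	return 0;
}
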
Signed-off-by: Jiri Pirko Link: https://lore.kernel.org/r/20230828061657.300667-2-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- net/devlink/leftover.c | 247 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 163 insertions(+), 84 deletions(-) (limited to 'net') diff --git a/net/devlink/leftover.c b/net/devlink/leftover.c index 149752bc9f2d..890eafac9f8f 100644 --- a/net/devlink/leftover.c +++ b/net/devlink/leftover.c @@ -1058,6 +1058,26 @@ static void devlink_port_notify(struct devlink_port *devlink_port, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); } +static void devlink_ports_notify(struct devlink *devlink, + enum devlink_command cmd) +{ + struct devlink_port *devlink_port; + unsigned long port_index; + + xa_for_each(&devlink->ports, port_index, devlink_port) + devlink_port_notify(devlink_port, cmd); +} + +static void devlink_ports_notify_register(struct devlink *devlink) +{ + devlink_ports_notify(devlink, DEVLINK_CMD_PORT_NEW); +} + +static void devlink_ports_notify_unregister(struct devlink *devlink) +{ + devlink_ports_notify(devlink, DEVLINK_CMD_PORT_DEL); +} + static void devlink_rate_notify(struct devlink_rate *devlink_rate, enum devlink_command cmd) { @@ -1084,6 +1104,22 @@ static void devlink_rate_notify(struct devlink_rate *devlink_rate, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); } +static void devlink_rates_notify_register(struct devlink *devlink) +{ + struct devlink_rate *rate_node; + + list_for_each_entry(rate_node, &devlink->rate_list, list) + devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_NEW); +} + +static void devlink_rates_notify_unregister(struct devlink *devlink) +{ + struct devlink_rate *rate_node; + + list_for_each_entry_reverse(rate_node, &devlink->rate_list, list) + devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_DEL); +} + static int devlink_nl_rate_get_dump_one(struct sk_buff *msg, struct devlink *devlink, struct netlink_callback *cb, int flags) @@ -1928,6 +1964,22 @@ static void devlink_linecard_notify(struct devlink_linecard *linecard, msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); } +static void devlink_linecards_notify_register(struct devlink *devlink) +{ + struct devlink_linecard *linecard; + + list_for_each_entry(linecard, &devlink->linecard_list, list) + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); +} + +static void devlink_linecards_notify_unregister(struct devlink *devlink) +{ + struct devlink_linecard *linecard; + + list_for_each_entry_reverse(linecard, &devlink->linecard_list, list) + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_DEL); +} + int devlink_nl_linecard_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; @@ -4285,6 +4337,26 @@ static void devlink_param_notify(struct devlink *devlink, msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); } +static void devlink_params_notify(struct devlink *devlink, + enum devlink_command cmd) +{ + struct devlink_param_item *param_item; + unsigned long param_id; + + xa_for_each(&devlink->params, param_id, param_item) + devlink_param_notify(devlink, 0, param_item, cmd); +} + +static void devlink_params_notify_register(struct devlink *devlink) +{ + devlink_params_notify(devlink, DEVLINK_CMD_PARAM_NEW); +} + +static void devlink_params_notify_unregister(struct devlink *devlink) +{ + devlink_params_notify(devlink, DEVLINK_CMD_PARAM_DEL); +} + static int devlink_nl_param_get_dump_one(struct sk_buff *msg, struct devlink *devlink, struct netlink_callback *cb, @@ -4694,6 +4766,22 @@ static void devlink_nl_region_notify(struct devlink_region *region, 0, 
DEVLINK_MCGRP_CONFIG, GFP_KERNEL); } +static void devlink_regions_notify_register(struct devlink *devlink) +{ + struct devlink_region *region; + + list_for_each_entry(region, &devlink->region_list, list) + devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_NEW); +} + +static void devlink_regions_notify_unregister(struct devlink *devlink) +{ + struct devlink_region *region; + + list_for_each_entry_reverse(region, &devlink->region_list, list) + devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_DEL); +} + /** * __devlink_snapshot_id_increment - Increment number of snapshots using an id * @devlink: devlink instance @@ -6662,98 +6750,36 @@ const struct genl_small_ops devlink_nl_small_ops[40] = { /* -- No new ops here! Use split ops going forward! -- */ }; -static void -devlink_trap_policer_notify(struct devlink *devlink, - const struct devlink_trap_policer_item *policer_item, - enum devlink_command cmd); -static void -devlink_trap_group_notify(struct devlink *devlink, - const struct devlink_trap_group_item *group_item, - enum devlink_command cmd); -static void devlink_trap_notify(struct devlink *devlink, - const struct devlink_trap_item *trap_item, - enum devlink_command cmd); +static void devlink_trap_policers_notify_register(struct devlink *devlink); +static void devlink_trap_policers_notify_unregister(struct devlink *devlink); +static void devlink_trap_groups_notify_register(struct devlink *devlink); +static void devlink_trap_groups_notify_unregister(struct devlink *devlink); +static void devlink_traps_notify_register(struct devlink *devlink); +static void devlink_traps_notify_unregister(struct devlink *devlink); void devlink_notify_register(struct devlink *devlink) { - struct devlink_trap_policer_item *policer_item; - struct devlink_trap_group_item *group_item; - struct devlink_param_item *param_item; - struct devlink_trap_item *trap_item; - struct devlink_port *devlink_port; - struct devlink_linecard *linecard; - struct devlink_rate *rate_node; - struct devlink_region *region; - unsigned long port_index; - unsigned long param_id; - devlink_notify(devlink, DEVLINK_CMD_NEW); - list_for_each_entry(linecard, &devlink->linecard_list, list) - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); - - xa_for_each(&devlink->ports, port_index, devlink_port) - devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); - - list_for_each_entry(policer_item, &devlink->trap_policer_list, list) - devlink_trap_policer_notify(devlink, policer_item, - DEVLINK_CMD_TRAP_POLICER_NEW); - - list_for_each_entry(group_item, &devlink->trap_group_list, list) - devlink_trap_group_notify(devlink, group_item, - DEVLINK_CMD_TRAP_GROUP_NEW); - - list_for_each_entry(trap_item, &devlink->trap_list, list) - devlink_trap_notify(devlink, trap_item, DEVLINK_CMD_TRAP_NEW); - - list_for_each_entry(rate_node, &devlink->rate_list, list) - devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_NEW); - - list_for_each_entry(region, &devlink->region_list, list) - devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_NEW); - - xa_for_each(&devlink->params, param_id, param_item) - devlink_param_notify(devlink, 0, param_item, - DEVLINK_CMD_PARAM_NEW); + devlink_linecards_notify_register(devlink); + devlink_ports_notify_register(devlink); + devlink_trap_policers_notify_register(devlink); + devlink_trap_groups_notify_register(devlink); + devlink_traps_notify_register(devlink); + devlink_rates_notify_register(devlink); + devlink_regions_notify_register(devlink); + devlink_params_notify_register(devlink); } void 
devlink_notify_unregister(struct devlink *devlink) { - struct devlink_trap_policer_item *policer_item; - struct devlink_trap_group_item *group_item; - struct devlink_param_item *param_item; - struct devlink_trap_item *trap_item; - struct devlink_port *devlink_port; - struct devlink_linecard *linecard; - struct devlink_rate *rate_node; - struct devlink_region *region; - unsigned long port_index; - unsigned long param_id; - - xa_for_each(&devlink->params, param_id, param_item) - devlink_param_notify(devlink, 0, param_item, - DEVLINK_CMD_PARAM_DEL); - - list_for_each_entry_reverse(region, &devlink->region_list, list) - devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_DEL); - - list_for_each_entry_reverse(rate_node, &devlink->rate_list, list) - devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_DEL); - - list_for_each_entry_reverse(trap_item, &devlink->trap_list, list) - devlink_trap_notify(devlink, trap_item, DEVLINK_CMD_TRAP_DEL); - - list_for_each_entry_reverse(group_item, &devlink->trap_group_list, list) - devlink_trap_group_notify(devlink, group_item, - DEVLINK_CMD_TRAP_GROUP_DEL); - list_for_each_entry_reverse(policer_item, &devlink->trap_policer_list, - list) - devlink_trap_policer_notify(devlink, policer_item, - DEVLINK_CMD_TRAP_POLICER_DEL); - - xa_for_each(&devlink->ports, port_index, devlink_port) - devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL); - list_for_each_entry_reverse(linecard, &devlink->linecard_list, list) - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_DEL); + devlink_params_notify_unregister(devlink); + devlink_regions_notify_unregister(devlink); + devlink_rates_notify_unregister(devlink); + devlink_traps_notify_unregister(devlink); + devlink_trap_groups_notify_unregister(devlink); + devlink_trap_policers_notify_unregister(devlink); + devlink_ports_notify_unregister(devlink); + devlink_linecards_notify_unregister(devlink); devlink_notify(devlink, DEVLINK_CMD_DEL); } @@ -8879,6 +8905,24 @@ devlink_trap_group_notify(struct devlink *devlink, msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); } +static void devlink_trap_groups_notify_register(struct devlink *devlink) +{ + struct devlink_trap_group_item *group_item; + + list_for_each_entry(group_item, &devlink->trap_group_list, list) + devlink_trap_group_notify(devlink, group_item, + DEVLINK_CMD_TRAP_GROUP_NEW); +} + +static void devlink_trap_groups_notify_unregister(struct devlink *devlink) +{ + struct devlink_trap_group_item *group_item; + + list_for_each_entry_reverse(group_item, &devlink->trap_group_list, list) + devlink_trap_group_notify(devlink, group_item, + DEVLINK_CMD_TRAP_GROUP_DEL); +} + static int devlink_trap_item_group_link(struct devlink *devlink, struct devlink_trap_item *trap_item) @@ -8921,6 +8965,22 @@ static void devlink_trap_notify(struct devlink *devlink, msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); } +static void devlink_traps_notify_register(struct devlink *devlink) +{ + struct devlink_trap_item *trap_item; + + list_for_each_entry(trap_item, &devlink->trap_list, list) + devlink_trap_notify(devlink, trap_item, DEVLINK_CMD_TRAP_NEW); +} + +static void devlink_traps_notify_unregister(struct devlink *devlink) +{ + struct devlink_trap_item *trap_item; + + list_for_each_entry_reverse(trap_item, &devlink->trap_list, list) + devlink_trap_notify(devlink, trap_item, DEVLINK_CMD_TRAP_DEL); +} + static int devlink_trap_register(struct devlink *devlink, const struct devlink_trap *trap, void *priv) @@ -9382,6 +9442,25 @@ devlink_trap_policer_notify(struct devlink *devlink, msg, 0, DEVLINK_MCGRP_CONFIG, 
GFP_KERNEL); } +static void devlink_trap_policers_notify_register(struct devlink *devlink) +{ + struct devlink_trap_policer_item *policer_item; + + list_for_each_entry(policer_item, &devlink->trap_policer_list, list) + devlink_trap_policer_notify(devlink, policer_item, + DEVLINK_CMD_TRAP_POLICER_NEW); +} + +static void devlink_trap_policers_notify_unregister(struct devlink *devlink) +{ + struct devlink_trap_policer_item *policer_item; + + list_for_each_entry_reverse(policer_item, &devlink->trap_policer_list, + list) + devlink_trap_policer_notify(devlink, policer_item, + DEVLINK_CMD_TRAP_POLICER_DEL); +} + static int devlink_trap_policer_register(struct devlink *devlink, const struct devlink_trap_policer *policer) -- cgit v1.2.3 From eec1e5ea1d715ce2df8dcdf3dac7112df77a6e17 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 28 Aug 2023 08:16:44 +0200 Subject: devlink: push port related code into separate file Cut out another chunk from leftover.c and put port related code into a separate file. Signed-off-by: Jiri Pirko Link: https://lore.kernel.org/r/20230828061657.300667-3-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- net/devlink/Makefile | 2 +- net/devlink/devl_internal.h | 32 +- net/devlink/leftover.c | 1550 +------------------------------------------ net/devlink/port.c | 1515 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 1558 insertions(+), 1541 deletions(-) create mode 100644 net/devlink/port.c (limited to 'net') diff --git a/net/devlink/Makefile b/net/devlink/Makefile index a087af581847..456bfb336540 100644 --- a/net/devlink/Makefile +++ b/net/devlink/Makefile @@ -1,3 +1,3 @@ # SPDX-License-Identifier: GPL-2.0 -obj-y := leftover.o core.o netlink.o netlink_gen.o dev.o health.o +obj-y := leftover.o core.o netlink.o netlink_gen.o dev.o port.o health.o diff --git a/net/devlink/devl_internal.h b/net/devlink/devl_internal.h index eb1d5066c73f..7d01e2060702 100644 --- a/net/devlink/devl_internal.h +++ b/net/devlink/devl_internal.h @@ -3,6 +3,7 @@ * Copyright (c) 2016 Jiri Pirko */ +#include #include #include #include @@ -11,6 +12,8 @@ #include #include #include +#include +#include #include "netlink_gen.h" @@ -149,16 +152,37 @@ devlink_nl_put_handle(struct sk_buff *msg, struct devlink *devlink) /* Notify */ void devlink_notify(struct devlink *devlink, enum devlink_command cmd); +void devlink_ports_notify_register(struct devlink *devlink); +void devlink_ports_notify_unregister(struct devlink *devlink); /* Ports */ +#define ASSERT_DEVLINK_PORT_INITIALIZED(devlink_port) \ + WARN_ON_ONCE(!(devlink_port)->initialized) + +struct devlink_port *devlink_port_get_by_index(struct devlink *devlink, + unsigned int port_index); int devlink_port_netdevice_event(struct notifier_block *nb, unsigned long event, void *ptr); - struct devlink_port * devlink_port_get_from_info(struct devlink *devlink, struct genl_info *info); struct devlink_port *devlink_port_get_from_attrs(struct devlink *devlink, struct nlattr **attrs); +/* Linecards */ +struct devlink_linecard { + struct list_head list; + struct devlink *devlink; + unsigned int index; + const struct devlink_linecard_ops *ops; + void *priv; + enum devlink_linecard_state state; + struct mutex state_lock; /* Protects state */ + const char *type; + struct devlink_linecard_type *types; + unsigned int types_count; + struct devlink *nested_devlink; +}; + /* Reload */ bool devlink_reload_actions_valid(const struct devlink_ops *ops); int devlink_reload(struct devlink *devlink, struct net *dest_net, @@ -190,6 +214,12 @@ int 
devlink_nl_cmd_eswitch_get_doit(struct sk_buff *skb, struct genl_info *info) int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_flash_update(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_selftests_run(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_cmd_port_set_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_cmd_port_split_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_cmd_port_unsplit_doit(struct sk_buff *skb, + struct genl_info *info); +int devlink_nl_cmd_port_new_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_cmd_port_del_doit(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_health_reporter_recover_doit(struct sk_buff *skb, diff --git a/net/devlink/leftover.c b/net/devlink/leftover.c index 890eafac9f8f..d14b40fb8fdf 100644 --- a/net/devlink/leftover.c +++ b/net/devlink/leftover.c @@ -33,20 +33,6 @@ #include "devl_internal.h" -struct devlink_linecard { - struct list_head list; - struct devlink *devlink; - unsigned int index; - const struct devlink_linecard_ops *ops; - void *priv; - enum devlink_linecard_state state; - struct mutex state_lock; /* Protects state */ - const char *type; - struct devlink_linecard_type *types; - unsigned int types_count; - struct devlink *nested_devlink; -}; - /** * struct devlink_resource - devlink resource * @name: name of the resource @@ -131,52 +117,6 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwmsg); EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwerr); EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_trap_report); -#define DEVLINK_PORT_FN_CAPS_VALID_MASK \ - (_BITUL(__DEVLINK_PORT_FN_ATTR_CAPS_MAX) - 1) - -static const struct nla_policy devlink_function_nl_policy[DEVLINK_PORT_FUNCTION_ATTR_MAX + 1] = { - [DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR] = { .type = NLA_BINARY }, - [DEVLINK_PORT_FN_ATTR_STATE] = - NLA_POLICY_RANGE(NLA_U8, DEVLINK_PORT_FN_STATE_INACTIVE, - DEVLINK_PORT_FN_STATE_ACTIVE), - [DEVLINK_PORT_FN_ATTR_CAPS] = - NLA_POLICY_BITFIELD32(DEVLINK_PORT_FN_CAPS_VALID_MASK), -}; - -#define ASSERT_DEVLINK_PORT_REGISTERED(devlink_port) \ - WARN_ON_ONCE(!(devlink_port)->registered) -#define ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port) \ - WARN_ON_ONCE((devlink_port)->registered) -#define ASSERT_DEVLINK_PORT_INITIALIZED(devlink_port) \ - WARN_ON_ONCE(!(devlink_port)->initialized) - -static struct devlink_port *devlink_port_get_by_index(struct devlink *devlink, - unsigned int port_index) -{ - return xa_load(&devlink->ports, port_index); -} - -struct devlink_port *devlink_port_get_from_attrs(struct devlink *devlink, - struct nlattr **attrs) -{ - if (attrs[DEVLINK_ATTR_PORT_INDEX]) { - u32 port_index = nla_get_u32(attrs[DEVLINK_ATTR_PORT_INDEX]); - struct devlink_port *devlink_port; - - devlink_port = devlink_port_get_by_index(devlink, port_index); - if (!devlink_port) - return ERR_PTR(-ENODEV); - return devlink_port; - } - return ERR_PTR(-EINVAL); -} - -struct devlink_port *devlink_port_get_from_info(struct devlink *devlink, - struct genl_info *info) -{ - return devlink_port_get_from_attrs(devlink, info->attrs); -} - static inline bool devlink_rate_is_leaf(struct devlink_rate *devlink_rate) { @@ -439,138 +379,6 @@ devlink_sb_tc_index_get_from_attrs(struct devlink_sb *devlink_sb, return 0; } -static void devlink_port_fn_cap_fill(struct nla_bitfield32 *caps, - u32 cap, bool is_enable) -{ - caps->selector |= cap; - if (is_enable) - 
caps->value |= cap; -} - -static int devlink_port_fn_roce_fill(struct devlink_port *devlink_port, - struct nla_bitfield32 *caps, - struct netlink_ext_ack *extack) -{ - bool is_enable; - int err; - - if (!devlink_port->ops->port_fn_roce_get) - return 0; - - err = devlink_port->ops->port_fn_roce_get(devlink_port, &is_enable, - extack); - if (err) { - if (err == -EOPNOTSUPP) - return 0; - return err; - } - - devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_ROCE, is_enable); - return 0; -} - -static int devlink_port_fn_migratable_fill(struct devlink_port *devlink_port, - struct nla_bitfield32 *caps, - struct netlink_ext_ack *extack) -{ - bool is_enable; - int err; - - if (!devlink_port->ops->port_fn_migratable_get || - devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) - return 0; - - err = devlink_port->ops->port_fn_migratable_get(devlink_port, - &is_enable, extack); - if (err) { - if (err == -EOPNOTSUPP) - return 0; - return err; - } - - devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_MIGRATABLE, is_enable); - return 0; -} - -static int devlink_port_fn_ipsec_crypto_fill(struct devlink_port *devlink_port, - struct nla_bitfield32 *caps, - struct netlink_ext_ack *extack) -{ - bool is_enable; - int err; - - if (!devlink_port->ops->port_fn_ipsec_crypto_get || - devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) - return 0; - - err = devlink_port->ops->port_fn_ipsec_crypto_get(devlink_port, &is_enable, extack); - if (err) { - if (err == -EOPNOTSUPP) - return 0; - return err; - } - - devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_IPSEC_CRYPTO, is_enable); - return 0; -} - -static int devlink_port_fn_ipsec_packet_fill(struct devlink_port *devlink_port, - struct nla_bitfield32 *caps, - struct netlink_ext_ack *extack) -{ - bool is_enable; - int err; - - if (!devlink_port->ops->port_fn_ipsec_packet_get || - devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) - return 0; - - err = devlink_port->ops->port_fn_ipsec_packet_get(devlink_port, &is_enable, extack); - if (err) { - if (err == -EOPNOTSUPP) - return 0; - return err; - } - - devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_IPSEC_PACKET, is_enable); - return 0; -} - -static int devlink_port_fn_caps_fill(struct devlink_port *devlink_port, - struct sk_buff *msg, - struct netlink_ext_ack *extack, - bool *msg_updated) -{ - struct nla_bitfield32 caps = {}; - int err; - - err = devlink_port_fn_roce_fill(devlink_port, &caps, extack); - if (err) - return err; - - err = devlink_port_fn_migratable_fill(devlink_port, &caps, extack); - if (err) - return err; - - err = devlink_port_fn_ipsec_crypto_fill(devlink_port, &caps, extack); - if (err) - return err; - - err = devlink_port_fn_ipsec_packet_fill(devlink_port, &caps, extack); - if (err) - return err; - - if (!caps.selector) - return 0; - err = nla_put_bitfield32(msg, DEVLINK_PORT_FN_ATTR_CAPS, caps.value, - caps.selector); - if (err) - return err; - - *msg_updated = true; - return 0; -} - static int devlink_sb_tc_index_get_from_info(struct devlink_sb *devlink_sb, struct genl_info *info, @@ -661,113 +469,6 @@ nla_put_failure: return -EMSGSIZE; } -int devlink_nl_port_handle_fill(struct sk_buff *msg, struct devlink_port *devlink_port) -{ - if (devlink_nl_put_handle(msg, devlink_port->devlink)) - return -EMSGSIZE; - if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index)) - return -EMSGSIZE; - return 0; -} - -size_t devlink_nl_port_handle_size(struct devlink_port *devlink_port) -{ - struct devlink *devlink = devlink_port->devlink; - - return 
nla_total_size(strlen(devlink->dev->bus->name) + 1) /* DEVLINK_ATTR_BUS_NAME */ - + nla_total_size(strlen(dev_name(devlink->dev)) + 1) /* DEVLINK_ATTR_DEV_NAME */ - + nla_total_size(4); /* DEVLINK_ATTR_PORT_INDEX */ -} - -static int devlink_nl_port_attrs_put(struct sk_buff *msg, - struct devlink_port *devlink_port) -{ - struct devlink_port_attrs *attrs = &devlink_port->attrs; - - if (!devlink_port->attrs_set) - return 0; - if (attrs->lanes) { - if (nla_put_u32(msg, DEVLINK_ATTR_PORT_LANES, attrs->lanes)) - return -EMSGSIZE; - } - if (nla_put_u8(msg, DEVLINK_ATTR_PORT_SPLITTABLE, attrs->splittable)) - return -EMSGSIZE; - if (nla_put_u16(msg, DEVLINK_ATTR_PORT_FLAVOUR, attrs->flavour)) - return -EMSGSIZE; - switch (devlink_port->attrs.flavour) { - case DEVLINK_PORT_FLAVOUR_PCI_PF: - if (nla_put_u32(msg, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER, - attrs->pci_pf.controller) || - nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, attrs->pci_pf.pf)) - return -EMSGSIZE; - if (nla_put_u8(msg, DEVLINK_ATTR_PORT_EXTERNAL, attrs->pci_pf.external)) - return -EMSGSIZE; - break; - case DEVLINK_PORT_FLAVOUR_PCI_VF: - if (nla_put_u32(msg, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER, - attrs->pci_vf.controller) || - nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, attrs->pci_vf.pf) || - nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_VF_NUMBER, attrs->pci_vf.vf)) - return -EMSGSIZE; - if (nla_put_u8(msg, DEVLINK_ATTR_PORT_EXTERNAL, attrs->pci_vf.external)) - return -EMSGSIZE; - break; - case DEVLINK_PORT_FLAVOUR_PCI_SF: - if (nla_put_u32(msg, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER, - attrs->pci_sf.controller) || - nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, - attrs->pci_sf.pf) || - nla_put_u32(msg, DEVLINK_ATTR_PORT_PCI_SF_NUMBER, - attrs->pci_sf.sf)) - return -EMSGSIZE; - break; - case DEVLINK_PORT_FLAVOUR_PHYSICAL: - case DEVLINK_PORT_FLAVOUR_CPU: - case DEVLINK_PORT_FLAVOUR_DSA: - if (nla_put_u32(msg, DEVLINK_ATTR_PORT_NUMBER, - attrs->phys.port_number)) - return -EMSGSIZE; - if (!attrs->split) - return 0; - if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP, - attrs->phys.port_number)) - return -EMSGSIZE; - if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_SUBPORT_NUMBER, - attrs->phys.split_subport_number)) - return -EMSGSIZE; - break; - default: - break; - } - return 0; -} - -static int devlink_port_fn_hw_addr_fill(struct devlink_port *port, - struct sk_buff *msg, - struct netlink_ext_ack *extack, - bool *msg_updated) -{ - u8 hw_addr[MAX_ADDR_LEN]; - int hw_addr_len; - int err; - - if (!port->ops->port_fn_hw_addr_get) - return 0; - - err = port->ops->port_fn_hw_addr_get(port, hw_addr, &hw_addr_len, - extack); - if (err) { - if (err == -EOPNOTSUPP) - return 0; - return err; - } - err = nla_put(msg, DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR, hw_addr_len, hw_addr); - if (err) - return err; - *msg_updated = true; - return 0; -} - static int devlink_nl_rate_fill(struct sk_buff *msg, struct devlink_rate *devlink_rate, enum devlink_command cmd, u32 portid, u32 seq, @@ -825,259 +526,6 @@ nla_put_failure: return -EMSGSIZE; } -static bool -devlink_port_fn_state_valid(enum devlink_port_fn_state state) -{ - return state == DEVLINK_PORT_FN_STATE_INACTIVE || - state == DEVLINK_PORT_FN_STATE_ACTIVE; -} - -static bool -devlink_port_fn_opstate_valid(enum devlink_port_fn_opstate opstate) -{ - return opstate == DEVLINK_PORT_FN_OPSTATE_DETACHED || - opstate == DEVLINK_PORT_FN_OPSTATE_ATTACHED; -} - -static int devlink_port_fn_state_fill(struct devlink_port *port, - struct sk_buff *msg, - struct netlink_ext_ack *extack, - bool *msg_updated) -{ - enum 
devlink_port_fn_opstate opstate; - enum devlink_port_fn_state state; - int err; - - if (!port->ops->port_fn_state_get) - return 0; - - err = port->ops->port_fn_state_get(port, &state, &opstate, extack); - if (err) { - if (err == -EOPNOTSUPP) - return 0; - return err; - } - if (!devlink_port_fn_state_valid(state)) { - WARN_ON_ONCE(1); - NL_SET_ERR_MSG(extack, "Invalid state read from driver"); - return -EINVAL; - } - if (!devlink_port_fn_opstate_valid(opstate)) { - WARN_ON_ONCE(1); - NL_SET_ERR_MSG(extack, "Invalid operational state read from driver"); - return -EINVAL; - } - if (nla_put_u8(msg, DEVLINK_PORT_FN_ATTR_STATE, state) || - nla_put_u8(msg, DEVLINK_PORT_FN_ATTR_OPSTATE, opstate)) - return -EMSGSIZE; - *msg_updated = true; - return 0; -} - -static int -devlink_port_fn_mig_set(struct devlink_port *devlink_port, bool enable, - struct netlink_ext_ack *extack) -{ - return devlink_port->ops->port_fn_migratable_set(devlink_port, enable, - extack); -} - -static int -devlink_port_fn_roce_set(struct devlink_port *devlink_port, bool enable, - struct netlink_ext_ack *extack) -{ - return devlink_port->ops->port_fn_roce_set(devlink_port, enable, - extack); -} - -static int -devlink_port_fn_ipsec_crypto_set(struct devlink_port *devlink_port, bool enable, - struct netlink_ext_ack *extack) -{ - return devlink_port->ops->port_fn_ipsec_crypto_set(devlink_port, enable, extack); -} - -static int -devlink_port_fn_ipsec_packet_set(struct devlink_port *devlink_port, bool enable, - struct netlink_ext_ack *extack) -{ - return devlink_port->ops->port_fn_ipsec_packet_set(devlink_port, enable, extack); -} - -static int devlink_port_fn_caps_set(struct devlink_port *devlink_port, - const struct nlattr *attr, - struct netlink_ext_ack *extack) -{ - struct nla_bitfield32 caps; - u32 caps_value; - int err; - - caps = nla_get_bitfield32(attr); - caps_value = caps.value & caps.selector; - if (caps.selector & DEVLINK_PORT_FN_CAP_ROCE) { - err = devlink_port_fn_roce_set(devlink_port, - caps_value & DEVLINK_PORT_FN_CAP_ROCE, - extack); - if (err) - return err; - } - if (caps.selector & DEVLINK_PORT_FN_CAP_MIGRATABLE) { - err = devlink_port_fn_mig_set(devlink_port, caps_value & - DEVLINK_PORT_FN_CAP_MIGRATABLE, - extack); - if (err) - return err; - } - if (caps.selector & DEVLINK_PORT_FN_CAP_IPSEC_CRYPTO) { - err = devlink_port_fn_ipsec_crypto_set(devlink_port, caps_value & - DEVLINK_PORT_FN_CAP_IPSEC_CRYPTO, - extack); - if (err) - return err; - } - if (caps.selector & DEVLINK_PORT_FN_CAP_IPSEC_PACKET) { - err = devlink_port_fn_ipsec_packet_set(devlink_port, caps_value & - DEVLINK_PORT_FN_CAP_IPSEC_PACKET, - extack); - if (err) - return err; - } - return 0; -} - -static int -devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *port, - struct netlink_ext_ack *extack) -{ - struct nlattr *function_attr; - bool msg_updated = false; - int err; - - function_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_PORT_FUNCTION); - if (!function_attr) - return -EMSGSIZE; - - err = devlink_port_fn_hw_addr_fill(port, msg, extack, &msg_updated); - if (err) - goto out; - err = devlink_port_fn_caps_fill(port, msg, extack, &msg_updated); - if (err) - goto out; - err = devlink_port_fn_state_fill(port, msg, extack, &msg_updated); -out: - if (err || !msg_updated) - nla_nest_cancel(msg, function_attr); - else - nla_nest_end(msg, function_attr); - return err; -} - -static int devlink_nl_port_fill(struct sk_buff *msg, - struct devlink_port *devlink_port, - enum devlink_command cmd, u32 portid, u32 seq, - int flags, struct 
netlink_ext_ack *extack) -{ - struct devlink *devlink = devlink_port->devlink; - void *hdr; - - hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); - if (!hdr) - return -EMSGSIZE; - - if (devlink_nl_put_handle(msg, devlink)) - goto nla_put_failure; - if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index)) - goto nla_put_failure; - - spin_lock_bh(&devlink_port->type_lock); - if (nla_put_u16(msg, DEVLINK_ATTR_PORT_TYPE, devlink_port->type)) - goto nla_put_failure_type_locked; - if (devlink_port->desired_type != DEVLINK_PORT_TYPE_NOTSET && - nla_put_u16(msg, DEVLINK_ATTR_PORT_DESIRED_TYPE, - devlink_port->desired_type)) - goto nla_put_failure_type_locked; - if (devlink_port->type == DEVLINK_PORT_TYPE_ETH) { - if (devlink_port->type_eth.netdev && - (nla_put_u32(msg, DEVLINK_ATTR_PORT_NETDEV_IFINDEX, - devlink_port->type_eth.ifindex) || - nla_put_string(msg, DEVLINK_ATTR_PORT_NETDEV_NAME, - devlink_port->type_eth.ifname))) - goto nla_put_failure_type_locked; - } - if (devlink_port->type == DEVLINK_PORT_TYPE_IB) { - struct ib_device *ibdev = devlink_port->type_ib.ibdev; - - if (ibdev && - nla_put_string(msg, DEVLINK_ATTR_PORT_IBDEV_NAME, - ibdev->name)) - goto nla_put_failure_type_locked; - } - spin_unlock_bh(&devlink_port->type_lock); - if (devlink_nl_port_attrs_put(msg, devlink_port)) - goto nla_put_failure; - if (devlink_nl_port_function_attrs_put(msg, devlink_port, extack)) - goto nla_put_failure; - if (devlink_port->linecard && - nla_put_u32(msg, DEVLINK_ATTR_LINECARD_INDEX, - devlink_port->linecard->index)) - goto nla_put_failure; - - genlmsg_end(msg, hdr); - return 0; - -nla_put_failure_type_locked: - spin_unlock_bh(&devlink_port->type_lock); -nla_put_failure: - genlmsg_cancel(msg, hdr); - return -EMSGSIZE; -} - -static void devlink_port_notify(struct devlink_port *devlink_port, - enum devlink_command cmd) -{ - struct devlink *devlink = devlink_port->devlink; - struct sk_buff *msg; - int err; - - WARN_ON(cmd != DEVLINK_CMD_PORT_NEW && cmd != DEVLINK_CMD_PORT_DEL); - - if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)) - return; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return; - - err = devlink_nl_port_fill(msg, devlink_port, cmd, 0, 0, 0, NULL); - if (err) { - nlmsg_free(msg); - return; - } - - genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), msg, - 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); -} - -static void devlink_ports_notify(struct devlink *devlink, - enum devlink_command cmd) -{ - struct devlink_port *devlink_port; - unsigned long port_index; - - xa_for_each(&devlink->ports, port_index, devlink_port) - devlink_port_notify(devlink_port, cmd); -} - -static void devlink_ports_notify_register(struct devlink *devlink) -{ - devlink_ports_notify(devlink, DEVLINK_CMD_PORT_NEW); -} - -static void devlink_ports_notify_unregister(struct devlink *devlink) -{ - devlink_ports_notify(devlink, DEVLINK_CMD_PORT_DEL); -} - static void devlink_rate_notify(struct devlink_rate *devlink_rate, enum devlink_command cmd) { @@ -1176,391 +624,20 @@ int devlink_nl_rate_get_doit(struct sk_buff *skb, struct genl_info *info) nlmsg_free(msg); return err; } - - return genlmsg_reply(msg, info); -} - -static bool -devlink_rate_is_parent_node(struct devlink_rate *devlink_rate, - struct devlink_rate *parent) -{ - while (parent) { - if (parent == devlink_rate) - return true; - parent = parent->parent; - } - return false; -} - -int devlink_nl_port_get_doit(struct sk_buff *skb, struct genl_info *info) -{ - struct devlink_port *devlink_port = 
info->user_ptr[1]; - struct sk_buff *msg; - int err; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - err = devlink_nl_port_fill(msg, devlink_port, DEVLINK_CMD_PORT_NEW, - info->snd_portid, info->snd_seq, 0, - info->extack); - if (err) { - nlmsg_free(msg); - return err; - } - - return genlmsg_reply(msg, info); -} - -static int -devlink_nl_port_get_dump_one(struct sk_buff *msg, struct devlink *devlink, - struct netlink_callback *cb, int flags) -{ - struct devlink_nl_dump_state *state = devlink_dump_state(cb); - struct devlink_port *devlink_port; - unsigned long port_index; - int err = 0; - - xa_for_each_start(&devlink->ports, port_index, devlink_port, state->idx) { - err = devlink_nl_port_fill(msg, devlink_port, - DEVLINK_CMD_NEW, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, flags, - cb->extack); - if (err) { - state->idx = port_index; - break; - } - } - - return err; -} - -int devlink_nl_port_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) -{ - return devlink_nl_dumpit(skb, cb, devlink_nl_port_get_dump_one); -} - -static int devlink_port_type_set(struct devlink_port *devlink_port, - enum devlink_port_type port_type) - -{ - int err; - - if (!devlink_port->ops->port_type_set) - return -EOPNOTSUPP; - - if (port_type == devlink_port->type) - return 0; - - err = devlink_port->ops->port_type_set(devlink_port, port_type); - if (err) - return err; - - devlink_port->desired_type = port_type; - devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); - return 0; -} - -static int devlink_port_function_hw_addr_set(struct devlink_port *port, - const struct nlattr *attr, - struct netlink_ext_ack *extack) -{ - const u8 *hw_addr; - int hw_addr_len; - - hw_addr = nla_data(attr); - hw_addr_len = nla_len(attr); - if (hw_addr_len > MAX_ADDR_LEN) { - NL_SET_ERR_MSG(extack, "Port function hardware address too long"); - return -EINVAL; - } - if (port->type == DEVLINK_PORT_TYPE_ETH) { - if (hw_addr_len != ETH_ALEN) { - NL_SET_ERR_MSG(extack, "Address must be 6 bytes for Ethernet device"); - return -EINVAL; - } - if (!is_unicast_ether_addr(hw_addr)) { - NL_SET_ERR_MSG(extack, "Non-unicast hardware address unsupported"); - return -EINVAL; - } - } - - return port->ops->port_fn_hw_addr_set(port, hw_addr, hw_addr_len, - extack); -} - -static int devlink_port_fn_state_set(struct devlink_port *port, - const struct nlattr *attr, - struct netlink_ext_ack *extack) -{ - enum devlink_port_fn_state state; - - state = nla_get_u8(attr); - return port->ops->port_fn_state_set(port, state, extack); -} - -static int devlink_port_function_validate(struct devlink_port *devlink_port, - struct nlattr **tb, - struct netlink_ext_ack *extack) -{ - const struct devlink_port_ops *ops = devlink_port->ops; - struct nlattr *attr; - - if (tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR] && - !ops->port_fn_hw_addr_set) { - NL_SET_ERR_MSG_ATTR(extack, tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR], - "Port doesn't support function attributes"); - return -EOPNOTSUPP; - } - if (tb[DEVLINK_PORT_FN_ATTR_STATE] && !ops->port_fn_state_set) { - NL_SET_ERR_MSG_ATTR(extack, tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR], - "Function does not support state setting"); - return -EOPNOTSUPP; - } - attr = tb[DEVLINK_PORT_FN_ATTR_CAPS]; - if (attr) { - struct nla_bitfield32 caps; - - caps = nla_get_bitfield32(attr); - if (caps.selector & DEVLINK_PORT_FN_CAP_ROCE && - !ops->port_fn_roce_set) { - NL_SET_ERR_MSG_ATTR(extack, attr, - "Port doesn't support RoCE function attribute"); - return -EOPNOTSUPP; - } - if (caps.selector & 
DEVLINK_PORT_FN_CAP_MIGRATABLE) { - if (!ops->port_fn_migratable_set) { - NL_SET_ERR_MSG_ATTR(extack, attr, - "Port doesn't support migratable function attribute"); - return -EOPNOTSUPP; - } - if (devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) { - NL_SET_ERR_MSG_ATTR(extack, attr, - "migratable function attribute supported for VFs only"); - return -EOPNOTSUPP; - } - } - if (caps.selector & DEVLINK_PORT_FN_CAP_IPSEC_CRYPTO) { - if (!ops->port_fn_ipsec_crypto_set) { - NL_SET_ERR_MSG_ATTR(extack, attr, - "Port doesn't support ipsec_crypto function attribute"); - return -EOPNOTSUPP; - } - if (devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) { - NL_SET_ERR_MSG_ATTR(extack, attr, - "ipsec_crypto function attribute supported for VFs only"); - return -EOPNOTSUPP; - } - } - if (caps.selector & DEVLINK_PORT_FN_CAP_IPSEC_PACKET) { - if (!ops->port_fn_ipsec_packet_set) { - NL_SET_ERR_MSG_ATTR(extack, attr, - "Port doesn't support ipsec_packet function attribute"); - return -EOPNOTSUPP; - } - if (devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) { - NL_SET_ERR_MSG_ATTR(extack, attr, - "ipsec_packet function attribute supported for VFs only"); - return -EOPNOTSUPP; - } - } - } - return 0; -} - -static int devlink_port_function_set(struct devlink_port *port, - const struct nlattr *attr, - struct netlink_ext_ack *extack) -{ - struct nlattr *tb[DEVLINK_PORT_FUNCTION_ATTR_MAX + 1]; - int err; - - err = nla_parse_nested(tb, DEVLINK_PORT_FUNCTION_ATTR_MAX, attr, - devlink_function_nl_policy, extack); - if (err < 0) { - NL_SET_ERR_MSG(extack, "Fail to parse port function attributes"); - return err; - } - - err = devlink_port_function_validate(port, tb, extack); - if (err) - return err; - - attr = tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR]; - if (attr) { - err = devlink_port_function_hw_addr_set(port, attr, extack); - if (err) - return err; - } - - attr = tb[DEVLINK_PORT_FN_ATTR_CAPS]; - if (attr) { - err = devlink_port_fn_caps_set(port, attr, extack); - if (err) - return err; - } - - /* Keep this as the last function attribute set, so that when - * multiple port function attributes are set along with state, - * Those can be applied first before activating the state. 
- */ - attr = tb[DEVLINK_PORT_FN_ATTR_STATE]; - if (attr) - err = devlink_port_fn_state_set(port, attr, extack); - - if (!err) - devlink_port_notify(port, DEVLINK_CMD_PORT_NEW); - return err; -} - -static int devlink_nl_cmd_port_set_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink_port *devlink_port = info->user_ptr[1]; - int err; - - if (info->attrs[DEVLINK_ATTR_PORT_TYPE]) { - enum devlink_port_type port_type; - - port_type = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_TYPE]); - err = devlink_port_type_set(devlink_port, port_type); - if (err) - return err; - } - - if (info->attrs[DEVLINK_ATTR_PORT_FUNCTION]) { - struct nlattr *attr = info->attrs[DEVLINK_ATTR_PORT_FUNCTION]; - struct netlink_ext_ack *extack = info->extack; - - err = devlink_port_function_set(devlink_port, attr, extack); - if (err) - return err; - } - - return 0; -} - -static int devlink_nl_cmd_port_split_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink_port *devlink_port = info->user_ptr[1]; - struct devlink *devlink = info->user_ptr[0]; - u32 count; - - if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PORT_SPLIT_COUNT)) - return -EINVAL; - if (!devlink_port->ops->port_split) - return -EOPNOTSUPP; - - count = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT]); - - if (!devlink_port->attrs.splittable) { - /* Split ports cannot be split. */ - if (devlink_port->attrs.split) - NL_SET_ERR_MSG(info->extack, "Port cannot be split further"); - else - NL_SET_ERR_MSG(info->extack, "Port cannot be split"); - return -EINVAL; - } - - if (count < 2 || !is_power_of_2(count) || count > devlink_port->attrs.lanes) { - NL_SET_ERR_MSG(info->extack, "Invalid split count"); - return -EINVAL; - } - - return devlink_port->ops->port_split(devlink, devlink_port, count, - info->extack); -} - -static int devlink_nl_cmd_port_unsplit_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink_port *devlink_port = info->user_ptr[1]; - struct devlink *devlink = info->user_ptr[0]; - - if (!devlink_port->ops->port_unsplit) - return -EOPNOTSUPP; - return devlink_port->ops->port_unsplit(devlink, devlink_port, info->extack); -} - -static int devlink_nl_cmd_port_new_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct netlink_ext_ack *extack = info->extack; - struct devlink_port_new_attrs new_attrs = {}; - struct devlink *devlink = info->user_ptr[0]; - struct devlink_port *devlink_port; - struct sk_buff *msg; - int err; - - if (!devlink->ops->port_new) - return -EOPNOTSUPP; - - if (!info->attrs[DEVLINK_ATTR_PORT_FLAVOUR] || - !info->attrs[DEVLINK_ATTR_PORT_PCI_PF_NUMBER]) { - NL_SET_ERR_MSG(extack, "Port flavour or PCI PF are not specified"); - return -EINVAL; - } - new_attrs.flavour = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_FLAVOUR]); - new_attrs.pfnum = - nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_PCI_PF_NUMBER]); - - if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) { - /* Port index of the new port being created by driver. 
*/ - new_attrs.port_index = - nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); - new_attrs.port_index_valid = true; - } - if (info->attrs[DEVLINK_ATTR_PORT_CONTROLLER_NUMBER]) { - new_attrs.controller = - nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_CONTROLLER_NUMBER]); - new_attrs.controller_valid = true; - } - if (new_attrs.flavour == DEVLINK_PORT_FLAVOUR_PCI_SF && - info->attrs[DEVLINK_ATTR_PORT_PCI_SF_NUMBER]) { - new_attrs.sfnum = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_PCI_SF_NUMBER]); - new_attrs.sfnum_valid = true; - } - - err = devlink->ops->port_new(devlink, &new_attrs, - extack, &devlink_port); - if (err) - return err; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) { - err = -ENOMEM; - goto err_out_port_del; - } - err = devlink_nl_port_fill(msg, devlink_port, DEVLINK_CMD_NEW, - info->snd_portid, info->snd_seq, 0, NULL); - if (WARN_ON_ONCE(err)) - goto err_out_msg_free; - err = genlmsg_reply(msg, info); - if (err) - goto err_out_port_del; - return 0; - -err_out_msg_free: - nlmsg_free(msg); -err_out_port_del: - devlink_port->ops->port_del(devlink, devlink_port, NULL); - return err; + + return genlmsg_reply(msg, info); } -static int devlink_nl_cmd_port_del_doit(struct sk_buff *skb, - struct genl_info *info) +static bool +devlink_rate_is_parent_node(struct devlink_rate *devlink_rate, + struct devlink_rate *parent) { - struct devlink_port *devlink_port = info->user_ptr[1]; - struct netlink_ext_ack *extack = info->extack; - struct devlink *devlink = info->user_ptr[0]; - - if (!devlink_port->ops->port_del) - return -EOPNOTSUPP; - - return devlink_port->ops->port_del(devlink, devlink_port, extack); + while (parent) { + if (parent == devlink_rate) + return true; + parent = parent->parent; + } + return false; } static int @@ -6783,489 +5860,6 @@ void devlink_notify_unregister(struct devlink *devlink) devlink_notify(devlink, DEVLINK_CMD_DEL); } -static void devlink_port_type_warn(struct work_struct *work) -{ - struct devlink_port *port = container_of(to_delayed_work(work), - struct devlink_port, - type_warn_dw); - dev_warn(port->devlink->dev, "Type was not set for devlink port."); -} - -static bool devlink_port_type_should_warn(struct devlink_port *devlink_port) -{ - /* Ignore CPU and DSA flavours. */ - return devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_CPU && - devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_DSA && - devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_UNUSED; -} - -#define DEVLINK_PORT_TYPE_WARN_TIMEOUT (HZ * 3600) - -static void devlink_port_type_warn_schedule(struct devlink_port *devlink_port) -{ - if (!devlink_port_type_should_warn(devlink_port)) - return; - /* Schedule a work to WARN in case driver does not set port - * type within timeout. - */ - schedule_delayed_work(&devlink_port->type_warn_dw, - DEVLINK_PORT_TYPE_WARN_TIMEOUT); -} - -static void devlink_port_type_warn_cancel(struct devlink_port *devlink_port) -{ - if (!devlink_port_type_should_warn(devlink_port)) - return; - cancel_delayed_work_sync(&devlink_port->type_warn_dw); -} - -/** - * devlink_port_init() - Init devlink port - * - * @devlink: devlink - * @devlink_port: devlink port - * - * Initialize essential stuff that is needed for functions - * that may be called before devlink port registration. - * Call to this function is optional and not needed - * in case the driver does not use such functions. 
- */ -void devlink_port_init(struct devlink *devlink, - struct devlink_port *devlink_port) -{ - if (devlink_port->initialized) - return; - devlink_port->devlink = devlink; - INIT_LIST_HEAD(&devlink_port->region_list); - devlink_port->initialized = true; -} -EXPORT_SYMBOL_GPL(devlink_port_init); - -/** - * devlink_port_fini() - Deinitialize devlink port - * - * @devlink_port: devlink port - * - * Deinitialize essential stuff that is in use for functions - * that may be called after devlink port unregistration. - * Call to this function is optional and not needed - * in case the driver does not use such functions. - */ -void devlink_port_fini(struct devlink_port *devlink_port) -{ - WARN_ON(!list_empty(&devlink_port->region_list)); -} -EXPORT_SYMBOL_GPL(devlink_port_fini); - -static const struct devlink_port_ops devlink_port_dummy_ops = {}; - -/** - * devl_port_register_with_ops() - Register devlink port - * - * @devlink: devlink - * @devlink_port: devlink port - * @port_index: driver-specific numerical identifier of the port - * @ops: port ops - * - * Register devlink port with provided port index. User can use - * any indexing, even hw-related one. devlink_port structure - * is convenient to be embedded inside user driver private structure. - * Note that the caller should take care of zeroing the devlink_port - * structure. - */ -int devl_port_register_with_ops(struct devlink *devlink, - struct devlink_port *devlink_port, - unsigned int port_index, - const struct devlink_port_ops *ops) -{ - int err; - - devl_assert_locked(devlink); - - ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); - - devlink_port_init(devlink, devlink_port); - devlink_port->registered = true; - devlink_port->index = port_index; - devlink_port->ops = ops ? ops : &devlink_port_dummy_ops; - spin_lock_init(&devlink_port->type_lock); - INIT_LIST_HEAD(&devlink_port->reporter_list); - err = xa_insert(&devlink->ports, port_index, devlink_port, GFP_KERNEL); - if (err) { - devlink_port->registered = false; - return err; - } - - INIT_DELAYED_WORK(&devlink_port->type_warn_dw, &devlink_port_type_warn); - devlink_port_type_warn_schedule(devlink_port); - devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); - return 0; -} -EXPORT_SYMBOL_GPL(devl_port_register_with_ops); - -/** - * devlink_port_register_with_ops - Register devlink port - * - * @devlink: devlink - * @devlink_port: devlink port - * @port_index: driver-specific numerical identifier of the port - * @ops: port ops - * - * Register devlink port with provided port index. User can use - * any indexing, even hw-related one. devlink_port structure - * is convenient to be embedded inside user driver private structure. - * Note that the caller should take care of zeroing the devlink_port - * structure. - * - * Context: Takes and release devlink->lock . 
- */ -int devlink_port_register_with_ops(struct devlink *devlink, - struct devlink_port *devlink_port, - unsigned int port_index, - const struct devlink_port_ops *ops) -{ - int err; - - devl_lock(devlink); - err = devl_port_register_with_ops(devlink, devlink_port, - port_index, ops); - devl_unlock(devlink); - return err; -} -EXPORT_SYMBOL_GPL(devlink_port_register_with_ops); - -/** - * devl_port_unregister() - Unregister devlink port - * - * @devlink_port: devlink port - */ -void devl_port_unregister(struct devlink_port *devlink_port) -{ - lockdep_assert_held(&devlink_port->devlink->lock); - WARN_ON(devlink_port->type != DEVLINK_PORT_TYPE_NOTSET); - - devlink_port_type_warn_cancel(devlink_port); - devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL); - xa_erase(&devlink_port->devlink->ports, devlink_port->index); - WARN_ON(!list_empty(&devlink_port->reporter_list)); - devlink_port->registered = false; -} -EXPORT_SYMBOL_GPL(devl_port_unregister); - -/** - * devlink_port_unregister - Unregister devlink port - * - * @devlink_port: devlink port - * - * Context: Takes and release devlink->lock . - */ -void devlink_port_unregister(struct devlink_port *devlink_port) -{ - struct devlink *devlink = devlink_port->devlink; - - devl_lock(devlink); - devl_port_unregister(devlink_port); - devl_unlock(devlink); -} -EXPORT_SYMBOL_GPL(devlink_port_unregister); - -static void devlink_port_type_netdev_checks(struct devlink_port *devlink_port, - struct net_device *netdev) -{ - const struct net_device_ops *ops = netdev->netdev_ops; - - /* If driver registers devlink port, it should set devlink port - * attributes accordingly so the compat functions are called - * and the original ops are not used. - */ - if (ops->ndo_get_phys_port_name) { - /* Some drivers use the same set of ndos for netdevs - * that have devlink_port registered and also for - * those who don't. Make sure that ndo_get_phys_port_name - * returns -EOPNOTSUPP here in case it is defined. - * Warn if not. - */ - char name[IFNAMSIZ]; - int err; - - err = ops->ndo_get_phys_port_name(netdev, name, sizeof(name)); - WARN_ON(err != -EOPNOTSUPP); - } - if (ops->ndo_get_port_parent_id) { - /* Some drivers use the same set of ndos for netdevs - * that have devlink_port registered and also for - * those who don't. Make sure that ndo_get_port_parent_id - * returns -EOPNOTSUPP here in case it is defined. - * Warn if not. 
- */ - struct netdev_phys_item_id ppid; - int err; - - err = ops->ndo_get_port_parent_id(netdev, &ppid); - WARN_ON(err != -EOPNOTSUPP); - } -} - -static void __devlink_port_type_set(struct devlink_port *devlink_port, - enum devlink_port_type type, - void *type_dev) -{ - struct net_device *netdev = type_dev; - - ASSERT_DEVLINK_PORT_REGISTERED(devlink_port); - - if (type == DEVLINK_PORT_TYPE_NOTSET) { - devlink_port_type_warn_schedule(devlink_port); - } else { - devlink_port_type_warn_cancel(devlink_port); - if (type == DEVLINK_PORT_TYPE_ETH && netdev) - devlink_port_type_netdev_checks(devlink_port, netdev); - } - - spin_lock_bh(&devlink_port->type_lock); - devlink_port->type = type; - switch (type) { - case DEVLINK_PORT_TYPE_ETH: - devlink_port->type_eth.netdev = netdev; - if (netdev) { - ASSERT_RTNL(); - devlink_port->type_eth.ifindex = netdev->ifindex; - BUILD_BUG_ON(sizeof(devlink_port->type_eth.ifname) != - sizeof(netdev->name)); - strcpy(devlink_port->type_eth.ifname, netdev->name); - } - break; - case DEVLINK_PORT_TYPE_IB: - devlink_port->type_ib.ibdev = type_dev; - break; - default: - break; - } - spin_unlock_bh(&devlink_port->type_lock); - devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); -} - -/** - * devlink_port_type_eth_set - Set port type to Ethernet - * - * @devlink_port: devlink port - * - * If driver is calling this, most likely it is doing something wrong. - */ -void devlink_port_type_eth_set(struct devlink_port *devlink_port) -{ - dev_warn(devlink_port->devlink->dev, - "devlink port type for port %d set to Ethernet without a software interface reference, device type not supported by the kernel?\n", - devlink_port->index); - __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH, NULL); -} -EXPORT_SYMBOL_GPL(devlink_port_type_eth_set); - -/** - * devlink_port_type_ib_set - Set port type to InfiniBand - * - * @devlink_port: devlink port - * @ibdev: related IB device - */ -void devlink_port_type_ib_set(struct devlink_port *devlink_port, - struct ib_device *ibdev) -{ - __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_IB, ibdev); -} -EXPORT_SYMBOL_GPL(devlink_port_type_ib_set); - -/** - * devlink_port_type_clear - Clear port type - * - * @devlink_port: devlink port - * - * If driver is calling this for clearing Ethernet type, most likely - * it is doing something wrong. - */ -void devlink_port_type_clear(struct devlink_port *devlink_port) -{ - if (devlink_port->type == DEVLINK_PORT_TYPE_ETH) - dev_warn(devlink_port->devlink->dev, - "devlink port type for port %d cleared without a software interface reference, device type not supported by the kernel?\n", - devlink_port->index); - __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_NOTSET, NULL); -} -EXPORT_SYMBOL_GPL(devlink_port_type_clear); - -int devlink_port_netdevice_event(struct notifier_block *nb, - unsigned long event, void *ptr) -{ - struct net_device *netdev = netdev_notifier_info_to_dev(ptr); - struct devlink_port *devlink_port = netdev->devlink_port; - struct devlink *devlink; - - if (!devlink_port) - return NOTIFY_OK; - devlink = devlink_port->devlink; - - switch (event) { - case NETDEV_POST_INIT: - /* Set the type but not netdev pointer. It is going to be set - * later on by NETDEV_REGISTER event. Happens once during - * netdevice register - */ - __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH, - NULL); - break; - case NETDEV_REGISTER: - case NETDEV_CHANGENAME: - if (devlink_net(devlink) != dev_net(netdev)) - return NOTIFY_OK; - /* Set the netdev on top of previously set type. 
Note this - * event happens also during net namespace change so here - * we take into account netdev pointer appearing in this - * namespace. - */ - __devlink_port_type_set(devlink_port, devlink_port->type, - netdev); - break; - case NETDEV_UNREGISTER: - if (devlink_net(devlink) != dev_net(netdev)) - return NOTIFY_OK; - /* Clear netdev pointer, but not the type. This event happens - * also during net namespace change so we need to clear - * pointer to netdev that is going to another net namespace. - */ - __devlink_port_type_set(devlink_port, devlink_port->type, - NULL); - break; - case NETDEV_PRE_UNINIT: - /* Clear the type and the netdev pointer. Happens one during - * netdevice unregister. - */ - __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_NOTSET, - NULL); - break; - } - - return NOTIFY_OK; -} - -static int __devlink_port_attrs_set(struct devlink_port *devlink_port, - enum devlink_port_flavour flavour) -{ - struct devlink_port_attrs *attrs = &devlink_port->attrs; - - devlink_port->attrs_set = true; - attrs->flavour = flavour; - if (attrs->switch_id.id_len) { - devlink_port->switch_port = true; - if (WARN_ON(attrs->switch_id.id_len > MAX_PHYS_ITEM_ID_LEN)) - attrs->switch_id.id_len = MAX_PHYS_ITEM_ID_LEN; - } else { - devlink_port->switch_port = false; - } - return 0; -} - -/** - * devlink_port_attrs_set - Set port attributes - * - * @devlink_port: devlink port - * @attrs: devlink port attrs - */ -void devlink_port_attrs_set(struct devlink_port *devlink_port, - struct devlink_port_attrs *attrs) -{ - int ret; - - ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); - - devlink_port->attrs = *attrs; - ret = __devlink_port_attrs_set(devlink_port, attrs->flavour); - if (ret) - return; - WARN_ON(attrs->splittable && attrs->split); -} -EXPORT_SYMBOL_GPL(devlink_port_attrs_set); - -/** - * devlink_port_attrs_pci_pf_set - Set PCI PF port attributes - * - * @devlink_port: devlink port - * @controller: associated controller number for the devlink port instance - * @pf: associated PF for the devlink port instance - * @external: indicates if the port is for an external controller - */ -void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 controller, - u16 pf, bool external) -{ - struct devlink_port_attrs *attrs = &devlink_port->attrs; - int ret; - - ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); - - ret = __devlink_port_attrs_set(devlink_port, - DEVLINK_PORT_FLAVOUR_PCI_PF); - if (ret) - return; - attrs->pci_pf.controller = controller; - attrs->pci_pf.pf = pf; - attrs->pci_pf.external = external; -} -EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_pf_set); - -/** - * devlink_port_attrs_pci_vf_set - Set PCI VF port attributes - * - * @devlink_port: devlink port - * @controller: associated controller number for the devlink port instance - * @pf: associated PF for the devlink port instance - * @vf: associated VF of a PF for the devlink port instance - * @external: indicates if the port is for an external controller - */ -void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 controller, - u16 pf, u16 vf, bool external) -{ - struct devlink_port_attrs *attrs = &devlink_port->attrs; - int ret; - - ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); - - ret = __devlink_port_attrs_set(devlink_port, - DEVLINK_PORT_FLAVOUR_PCI_VF); - if (ret) - return; - attrs->pci_vf.controller = controller; - attrs->pci_vf.pf = pf; - attrs->pci_vf.vf = vf; - attrs->pci_vf.external = external; -} -EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_vf_set); - -/** - * 
devlink_port_attrs_pci_sf_set - Set PCI SF port attributes - * - * @devlink_port: devlink port - * @controller: associated controller number for the devlink port instance - * @pf: associated PF for the devlink port instance - * @sf: associated SF of a PF for the devlink port instance - * @external: indicates if the port is for an external controller - */ -void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 controller, - u16 pf, u32 sf, bool external) -{ - struct devlink_port_attrs *attrs = &devlink_port->attrs; - int ret; - - ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); - - ret = __devlink_port_attrs_set(devlink_port, - DEVLINK_PORT_FLAVOUR_PCI_SF); - if (ret) - return; - attrs->pci_sf.controller = controller; - attrs->pci_sf.pf = pf; - attrs->pci_sf.sf = sf; - attrs->pci_sf.external = external; -} -EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_sf_set); - /** * devl_rate_node_create - create devlink rate node * @devlink: devlink instance @@ -7412,92 +6006,6 @@ void devl_rate_nodes_destroy(struct devlink *devlink) } EXPORT_SYMBOL_GPL(devl_rate_nodes_destroy); -/** - * devlink_port_linecard_set - Link port with a linecard - * - * @devlink_port: devlink port - * @linecard: devlink linecard - */ -void devlink_port_linecard_set(struct devlink_port *devlink_port, - struct devlink_linecard *linecard) -{ - ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); - - devlink_port->linecard = linecard; -} -EXPORT_SYMBOL_GPL(devlink_port_linecard_set); - -static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port, - char *name, size_t len) -{ - struct devlink_port_attrs *attrs = &devlink_port->attrs; - int n = 0; - - if (!devlink_port->attrs_set) - return -EOPNOTSUPP; - - switch (attrs->flavour) { - case DEVLINK_PORT_FLAVOUR_PHYSICAL: - if (devlink_port->linecard) - n = snprintf(name, len, "l%u", - devlink_port->linecard->index); - if (n < len) - n += snprintf(name + n, len - n, "p%u", - attrs->phys.port_number); - if (n < len && attrs->split) - n += snprintf(name + n, len - n, "s%u", - attrs->phys.split_subport_number); - break; - case DEVLINK_PORT_FLAVOUR_CPU: - case DEVLINK_PORT_FLAVOUR_DSA: - case DEVLINK_PORT_FLAVOUR_UNUSED: - /* As CPU and DSA ports do not have a netdevice associated - * case should not ever happen. 
- */ - WARN_ON(1); - return -EINVAL; - case DEVLINK_PORT_FLAVOUR_PCI_PF: - if (attrs->pci_pf.external) { - n = snprintf(name, len, "c%u", attrs->pci_pf.controller); - if (n >= len) - return -EINVAL; - len -= n; - name += n; - } - n = snprintf(name, len, "pf%u", attrs->pci_pf.pf); - break; - case DEVLINK_PORT_FLAVOUR_PCI_VF: - if (attrs->pci_vf.external) { - n = snprintf(name, len, "c%u", attrs->pci_vf.controller); - if (n >= len) - return -EINVAL; - len -= n; - name += n; - } - n = snprintf(name, len, "pf%uvf%u", - attrs->pci_vf.pf, attrs->pci_vf.vf); - break; - case DEVLINK_PORT_FLAVOUR_PCI_SF: - if (attrs->pci_sf.external) { - n = snprintf(name, len, "c%u", attrs->pci_sf.controller); - if (n >= len) - return -EINVAL; - len -= n; - name += n; - } - n = snprintf(name, len, "pf%usf%u", attrs->pci_sf.pf, - attrs->pci_sf.sf); - break; - case DEVLINK_PORT_FLAVOUR_VIRTUAL: - return -EOPNOTSUPP; - } - - if (n >= len) - return -EINVAL; - - return 0; -} - static int devlink_linecard_types_init(struct devlink_linecard *linecard) { struct devlink_linecard_type *linecard_type; @@ -9572,39 +8080,3 @@ devl_trap_policers_unregister(struct devlink *devlink, devlink_trap_policer_unregister(devlink, &policers[i]); } EXPORT_SYMBOL_GPL(devl_trap_policers_unregister); - -int devlink_compat_phys_port_name_get(struct net_device *dev, - char *name, size_t len) -{ - struct devlink_port *devlink_port; - - /* RTNL mutex is held here which ensures that devlink_port - * instance cannot disappear in the middle. No need to take - * any devlink lock as only permanent values are accessed. - */ - ASSERT_RTNL(); - - devlink_port = dev->devlink_port; - if (!devlink_port) - return -EOPNOTSUPP; - - return __devlink_port_phys_port_name_get(devlink_port, name, len); -} - -int devlink_compat_switch_id_get(struct net_device *dev, - struct netdev_phys_item_id *ppid) -{ - struct devlink_port *devlink_port; - - /* Caller must hold RTNL mutex or reference to dev, which ensures that - * devlink_port instance cannot disappear in the middle. No need to take - * any devlink lock as only permanent values are accessed. - */ - devlink_port = dev->devlink_port; - if (!devlink_port || !devlink_port->switch_port) - return -EOPNOTSUPP; - - memcpy(ppid, &devlink_port->attrs.switch_id, sizeof(*ppid)); - - return 0; -} diff --git a/net/devlink/port.c b/net/devlink/port.c new file mode 100644 index 000000000000..4763b42885fb --- /dev/null +++ b/net/devlink/port.c @@ -0,0 +1,1515 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. 
+ * Copyright (c) 2016 Jiri Pirko + */ + +#include "devl_internal.h" + +#define DEVLINK_PORT_FN_CAPS_VALID_MASK \ + (_BITUL(__DEVLINK_PORT_FN_ATTR_CAPS_MAX) - 1) + +static const struct nla_policy devlink_function_nl_policy[DEVLINK_PORT_FUNCTION_ATTR_MAX + 1] = { + [DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR] = { .type = NLA_BINARY }, + [DEVLINK_PORT_FN_ATTR_STATE] = + NLA_POLICY_RANGE(NLA_U8, DEVLINK_PORT_FN_STATE_INACTIVE, + DEVLINK_PORT_FN_STATE_ACTIVE), + [DEVLINK_PORT_FN_ATTR_CAPS] = + NLA_POLICY_BITFIELD32(DEVLINK_PORT_FN_CAPS_VALID_MASK), +}; + +#define ASSERT_DEVLINK_PORT_REGISTERED(devlink_port) \ + WARN_ON_ONCE(!(devlink_port)->registered) +#define ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port) \ + WARN_ON_ONCE((devlink_port)->registered) + +struct devlink_port *devlink_port_get_by_index(struct devlink *devlink, + unsigned int port_index) +{ + return xa_load(&devlink->ports, port_index); +} + +struct devlink_port *devlink_port_get_from_attrs(struct devlink *devlink, + struct nlattr **attrs) +{ + if (attrs[DEVLINK_ATTR_PORT_INDEX]) { + u32 port_index = nla_get_u32(attrs[DEVLINK_ATTR_PORT_INDEX]); + struct devlink_port *devlink_port; + + devlink_port = devlink_port_get_by_index(devlink, port_index); + if (!devlink_port) + return ERR_PTR(-ENODEV); + return devlink_port; + } + return ERR_PTR(-EINVAL); +} + +struct devlink_port *devlink_port_get_from_info(struct devlink *devlink, + struct genl_info *info) +{ + return devlink_port_get_from_attrs(devlink, info->attrs); +} + +static void devlink_port_fn_cap_fill(struct nla_bitfield32 *caps, + u32 cap, bool is_enable) +{ + caps->selector |= cap; + if (is_enable) + caps->value |= cap; +} + +static int devlink_port_fn_roce_fill(struct devlink_port *devlink_port, + struct nla_bitfield32 *caps, + struct netlink_ext_ack *extack) +{ + bool is_enable; + int err; + + if (!devlink_port->ops->port_fn_roce_get) + return 0; + + err = devlink_port->ops->port_fn_roce_get(devlink_port, &is_enable, + extack); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + + devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_ROCE, is_enable); + return 0; +} + +static int devlink_port_fn_migratable_fill(struct devlink_port *devlink_port, + struct nla_bitfield32 *caps, + struct netlink_ext_ack *extack) +{ + bool is_enable; + int err; + + if (!devlink_port->ops->port_fn_migratable_get || + devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) + return 0; + + err = devlink_port->ops->port_fn_migratable_get(devlink_port, + &is_enable, extack); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + + devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_MIGRATABLE, is_enable); + return 0; +} + +static int devlink_port_fn_ipsec_crypto_fill(struct devlink_port *devlink_port, + struct nla_bitfield32 *caps, + struct netlink_ext_ack *extack) +{ + bool is_enable; + int err; + + if (!devlink_port->ops->port_fn_ipsec_crypto_get || + devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) + return 0; + + err = devlink_port->ops->port_fn_ipsec_crypto_get(devlink_port, &is_enable, extack); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + + devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_IPSEC_CRYPTO, is_enable); + return 0; +} + +static int devlink_port_fn_ipsec_packet_fill(struct devlink_port *devlink_port, + struct nla_bitfield32 *caps, + struct netlink_ext_ack *extack) +{ + bool is_enable; + int err; + + if (!devlink_port->ops->port_fn_ipsec_packet_get || + devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) + 
return 0; + + err = devlink_port->ops->port_fn_ipsec_packet_get(devlink_port, &is_enable, extack); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + + devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_IPSEC_PACKET, is_enable); + return 0; +} + +static int devlink_port_fn_caps_fill(struct devlink_port *devlink_port, + struct sk_buff *msg, + struct netlink_ext_ack *extack, + bool *msg_updated) +{ + struct nla_bitfield32 caps = {}; + int err; + + err = devlink_port_fn_roce_fill(devlink_port, &caps, extack); + if (err) + return err; + + err = devlink_port_fn_migratable_fill(devlink_port, &caps, extack); + if (err) + return err; + + err = devlink_port_fn_ipsec_crypto_fill(devlink_port, &caps, extack); + if (err) + return err; + + err = devlink_port_fn_ipsec_packet_fill(devlink_port, &caps, extack); + if (err) + return err; + + if (!caps.selector) + return 0; + err = nla_put_bitfield32(msg, DEVLINK_PORT_FN_ATTR_CAPS, caps.value, + caps.selector); + if (err) + return err; + + *msg_updated = true; + return 0; +} + +int devlink_nl_port_handle_fill(struct sk_buff *msg, struct devlink_port *devlink_port) +{ + if (devlink_nl_put_handle(msg, devlink_port->devlink)) + return -EMSGSIZE; + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index)) + return -EMSGSIZE; + return 0; +} + +size_t devlink_nl_port_handle_size(struct devlink_port *devlink_port) +{ + struct devlink *devlink = devlink_port->devlink; + + return nla_total_size(strlen(devlink->dev->bus->name) + 1) /* DEVLINK_ATTR_BUS_NAME */ + + nla_total_size(strlen(dev_name(devlink->dev)) + 1) /* DEVLINK_ATTR_DEV_NAME */ + + nla_total_size(4); /* DEVLINK_ATTR_PORT_INDEX */ +} + +static int devlink_nl_port_attrs_put(struct sk_buff *msg, + struct devlink_port *devlink_port) +{ + struct devlink_port_attrs *attrs = &devlink_port->attrs; + + if (!devlink_port->attrs_set) + return 0; + if (attrs->lanes) { + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_LANES, attrs->lanes)) + return -EMSGSIZE; + } + if (nla_put_u8(msg, DEVLINK_ATTR_PORT_SPLITTABLE, attrs->splittable)) + return -EMSGSIZE; + if (nla_put_u16(msg, DEVLINK_ATTR_PORT_FLAVOUR, attrs->flavour)) + return -EMSGSIZE; + switch (devlink_port->attrs.flavour) { + case DEVLINK_PORT_FLAVOUR_PCI_PF: + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER, + attrs->pci_pf.controller) || + nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, attrs->pci_pf.pf)) + return -EMSGSIZE; + if (nla_put_u8(msg, DEVLINK_ATTR_PORT_EXTERNAL, attrs->pci_pf.external)) + return -EMSGSIZE; + break; + case DEVLINK_PORT_FLAVOUR_PCI_VF: + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER, + attrs->pci_vf.controller) || + nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, attrs->pci_vf.pf) || + nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_VF_NUMBER, attrs->pci_vf.vf)) + return -EMSGSIZE; + if (nla_put_u8(msg, DEVLINK_ATTR_PORT_EXTERNAL, attrs->pci_vf.external)) + return -EMSGSIZE; + break; + case DEVLINK_PORT_FLAVOUR_PCI_SF: + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER, + attrs->pci_sf.controller) || + nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, + attrs->pci_sf.pf) || + nla_put_u32(msg, DEVLINK_ATTR_PORT_PCI_SF_NUMBER, + attrs->pci_sf.sf)) + return -EMSGSIZE; + break; + case DEVLINK_PORT_FLAVOUR_PHYSICAL: + case DEVLINK_PORT_FLAVOUR_CPU: + case DEVLINK_PORT_FLAVOUR_DSA: + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_NUMBER, + attrs->phys.port_number)) + return -EMSGSIZE; + if (!attrs->split) + return 0; + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP, + attrs->phys.port_number)) + 
return -EMSGSIZE; + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_SUBPORT_NUMBER, + attrs->phys.split_subport_number)) + return -EMSGSIZE; + break; + default: + break; + } + return 0; +} + +static int devlink_port_fn_hw_addr_fill(struct devlink_port *port, + struct sk_buff *msg, + struct netlink_ext_ack *extack, + bool *msg_updated) +{ + u8 hw_addr[MAX_ADDR_LEN]; + int hw_addr_len; + int err; + + if (!port->ops->port_fn_hw_addr_get) + return 0; + + err = port->ops->port_fn_hw_addr_get(port, hw_addr, &hw_addr_len, + extack); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + err = nla_put(msg, DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR, hw_addr_len, hw_addr); + if (err) + return err; + *msg_updated = true; + return 0; +} + +static bool +devlink_port_fn_state_valid(enum devlink_port_fn_state state) +{ + return state == DEVLINK_PORT_FN_STATE_INACTIVE || + state == DEVLINK_PORT_FN_STATE_ACTIVE; +} + +static bool +devlink_port_fn_opstate_valid(enum devlink_port_fn_opstate opstate) +{ + return opstate == DEVLINK_PORT_FN_OPSTATE_DETACHED || + opstate == DEVLINK_PORT_FN_OPSTATE_ATTACHED; +} + +static int devlink_port_fn_state_fill(struct devlink_port *port, + struct sk_buff *msg, + struct netlink_ext_ack *extack, + bool *msg_updated) +{ + enum devlink_port_fn_opstate opstate; + enum devlink_port_fn_state state; + int err; + + if (!port->ops->port_fn_state_get) + return 0; + + err = port->ops->port_fn_state_get(port, &state, &opstate, extack); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + if (!devlink_port_fn_state_valid(state)) { + WARN_ON_ONCE(1); + NL_SET_ERR_MSG(extack, "Invalid state read from driver"); + return -EINVAL; + } + if (!devlink_port_fn_opstate_valid(opstate)) { + WARN_ON_ONCE(1); + NL_SET_ERR_MSG(extack, "Invalid operational state read from driver"); + return -EINVAL; + } + if (nla_put_u8(msg, DEVLINK_PORT_FN_ATTR_STATE, state) || + nla_put_u8(msg, DEVLINK_PORT_FN_ATTR_OPSTATE, opstate)) + return -EMSGSIZE; + *msg_updated = true; + return 0; +} + +static int +devlink_port_fn_mig_set(struct devlink_port *devlink_port, bool enable, + struct netlink_ext_ack *extack) +{ + return devlink_port->ops->port_fn_migratable_set(devlink_port, enable, + extack); +} + +static int +devlink_port_fn_roce_set(struct devlink_port *devlink_port, bool enable, + struct netlink_ext_ack *extack) +{ + return devlink_port->ops->port_fn_roce_set(devlink_port, enable, + extack); +} + +static int +devlink_port_fn_ipsec_crypto_set(struct devlink_port *devlink_port, bool enable, + struct netlink_ext_ack *extack) +{ + return devlink_port->ops->port_fn_ipsec_crypto_set(devlink_port, enable, extack); +} + +static int +devlink_port_fn_ipsec_packet_set(struct devlink_port *devlink_port, bool enable, + struct netlink_ext_ack *extack) +{ + return devlink_port->ops->port_fn_ipsec_packet_set(devlink_port, enable, extack); +} + +static int devlink_port_fn_caps_set(struct devlink_port *devlink_port, + const struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct nla_bitfield32 caps; + u32 caps_value; + int err; + + caps = nla_get_bitfield32(attr); + caps_value = caps.value & caps.selector; + if (caps.selector & DEVLINK_PORT_FN_CAP_ROCE) { + err = devlink_port_fn_roce_set(devlink_port, + caps_value & DEVLINK_PORT_FN_CAP_ROCE, + extack); + if (err) + return err; + } + if (caps.selector & DEVLINK_PORT_FN_CAP_MIGRATABLE) { + err = devlink_port_fn_mig_set(devlink_port, caps_value & + DEVLINK_PORT_FN_CAP_MIGRATABLE, + extack); + if (err) + return err; + } + if (caps.selector & 
DEVLINK_PORT_FN_CAP_IPSEC_CRYPTO) { + err = devlink_port_fn_ipsec_crypto_set(devlink_port, caps_value & + DEVLINK_PORT_FN_CAP_IPSEC_CRYPTO, + extack); + if (err) + return err; + } + if (caps.selector & DEVLINK_PORT_FN_CAP_IPSEC_PACKET) { + err = devlink_port_fn_ipsec_packet_set(devlink_port, caps_value & + DEVLINK_PORT_FN_CAP_IPSEC_PACKET, + extack); + if (err) + return err; + } + return 0; +} + +static int +devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *port, + struct netlink_ext_ack *extack) +{ + struct nlattr *function_attr; + bool msg_updated = false; + int err; + + function_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_PORT_FUNCTION); + if (!function_attr) + return -EMSGSIZE; + + err = devlink_port_fn_hw_addr_fill(port, msg, extack, &msg_updated); + if (err) + goto out; + err = devlink_port_fn_caps_fill(port, msg, extack, &msg_updated); + if (err) + goto out; + err = devlink_port_fn_state_fill(port, msg, extack, &msg_updated); +out: + if (err || !msg_updated) + nla_nest_cancel(msg, function_attr); + else + nla_nest_end(msg, function_attr); + return err; +} + +static int devlink_nl_port_fill(struct sk_buff *msg, + struct devlink_port *devlink_port, + enum devlink_command cmd, u32 portid, u32 seq, + int flags, struct netlink_ext_ack *extack) +{ + struct devlink *devlink = devlink_port->devlink; + void *hdr; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index)) + goto nla_put_failure; + + spin_lock_bh(&devlink_port->type_lock); + if (nla_put_u16(msg, DEVLINK_ATTR_PORT_TYPE, devlink_port->type)) + goto nla_put_failure_type_locked; + if (devlink_port->desired_type != DEVLINK_PORT_TYPE_NOTSET && + nla_put_u16(msg, DEVLINK_ATTR_PORT_DESIRED_TYPE, + devlink_port->desired_type)) + goto nla_put_failure_type_locked; + if (devlink_port->type == DEVLINK_PORT_TYPE_ETH) { + if (devlink_port->type_eth.netdev && + (nla_put_u32(msg, DEVLINK_ATTR_PORT_NETDEV_IFINDEX, + devlink_port->type_eth.ifindex) || + nla_put_string(msg, DEVLINK_ATTR_PORT_NETDEV_NAME, + devlink_port->type_eth.ifname))) + goto nla_put_failure_type_locked; + } + if (devlink_port->type == DEVLINK_PORT_TYPE_IB) { + struct ib_device *ibdev = devlink_port->type_ib.ibdev; + + if (ibdev && + nla_put_string(msg, DEVLINK_ATTR_PORT_IBDEV_NAME, + ibdev->name)) + goto nla_put_failure_type_locked; + } + spin_unlock_bh(&devlink_port->type_lock); + if (devlink_nl_port_attrs_put(msg, devlink_port)) + goto nla_put_failure; + if (devlink_nl_port_function_attrs_put(msg, devlink_port, extack)) + goto nla_put_failure; + if (devlink_port->linecard && + nla_put_u32(msg, DEVLINK_ATTR_LINECARD_INDEX, + devlink_port->linecard->index)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + +nla_put_failure_type_locked: + spin_unlock_bh(&devlink_port->type_lock); +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static void devlink_port_notify(struct devlink_port *devlink_port, + enum devlink_command cmd) +{ + struct devlink *devlink = devlink_port->devlink; + struct sk_buff *msg; + int err; + + WARN_ON(cmd != DEVLINK_CMD_PORT_NEW && cmd != DEVLINK_CMD_PORT_DEL); + + if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)) + return; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + + err = devlink_nl_port_fill(msg, devlink_port, cmd, 0, 0, 0, NULL); + if (err) { + 
nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), msg, + 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); +} + +static void devlink_ports_notify(struct devlink *devlink, + enum devlink_command cmd) +{ + struct devlink_port *devlink_port; + unsigned long port_index; + + xa_for_each(&devlink->ports, port_index, devlink_port) + devlink_port_notify(devlink_port, cmd); +} + +void devlink_ports_notify_register(struct devlink *devlink) +{ + devlink_ports_notify(devlink, DEVLINK_CMD_PORT_NEW); +} + +void devlink_ports_notify_unregister(struct devlink *devlink) +{ + devlink_ports_notify(devlink, DEVLINK_CMD_PORT_DEL); +} + +int devlink_nl_port_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct devlink_port *devlink_port = info->user_ptr[1]; + struct sk_buff *msg; + int err; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_port_fill(msg, devlink_port, DEVLINK_CMD_PORT_NEW, + info->snd_portid, info->snd_seq, 0, + info->extack); + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + +static int +devlink_nl_port_get_dump_one(struct sk_buff *msg, struct devlink *devlink, + struct netlink_callback *cb, int flags) +{ + struct devlink_nl_dump_state *state = devlink_dump_state(cb); + struct devlink_port *devlink_port; + unsigned long port_index; + int err = 0; + + xa_for_each_start(&devlink->ports, port_index, devlink_port, state->idx) { + err = devlink_nl_port_fill(msg, devlink_port, + DEVLINK_CMD_NEW, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, flags, + cb->extack); + if (err) { + state->idx = port_index; + break; + } + } + + return err; +} + +int devlink_nl_port_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) +{ + return devlink_nl_dumpit(skb, cb, devlink_nl_port_get_dump_one); +} + +static int devlink_port_type_set(struct devlink_port *devlink_port, + enum devlink_port_type port_type) + +{ + int err; + + if (!devlink_port->ops->port_type_set) + return -EOPNOTSUPP; + + if (port_type == devlink_port->type) + return 0; + + err = devlink_port->ops->port_type_set(devlink_port, port_type); + if (err) + return err; + + devlink_port->desired_type = port_type; + devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); + return 0; +} + +static int devlink_port_function_hw_addr_set(struct devlink_port *port, + const struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + const u8 *hw_addr; + int hw_addr_len; + + hw_addr = nla_data(attr); + hw_addr_len = nla_len(attr); + if (hw_addr_len > MAX_ADDR_LEN) { + NL_SET_ERR_MSG(extack, "Port function hardware address too long"); + return -EINVAL; + } + if (port->type == DEVLINK_PORT_TYPE_ETH) { + if (hw_addr_len != ETH_ALEN) { + NL_SET_ERR_MSG(extack, "Address must be 6 bytes for Ethernet device"); + return -EINVAL; + } + if (!is_unicast_ether_addr(hw_addr)) { + NL_SET_ERR_MSG(extack, "Non-unicast hardware address unsupported"); + return -EINVAL; + } + } + + return port->ops->port_fn_hw_addr_set(port, hw_addr, hw_addr_len, + extack); +} + +static int devlink_port_fn_state_set(struct devlink_port *port, + const struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + enum devlink_port_fn_state state; + + state = nla_get_u8(attr); + return port->ops->port_fn_state_set(port, state, extack); +} + +static int devlink_port_function_validate(struct devlink_port *devlink_port, + struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + const struct devlink_port_ops *ops = devlink_port->ops; + struct nlattr 
*attr; + + if (tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR] && + !ops->port_fn_hw_addr_set) { + NL_SET_ERR_MSG_ATTR(extack, tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR], + "Port doesn't support function attributes"); + return -EOPNOTSUPP; + } + if (tb[DEVLINK_PORT_FN_ATTR_STATE] && !ops->port_fn_state_set) { + NL_SET_ERR_MSG_ATTR(extack, tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR], + "Function does not support state setting"); + return -EOPNOTSUPP; + } + attr = tb[DEVLINK_PORT_FN_ATTR_CAPS]; + if (attr) { + struct nla_bitfield32 caps; + + caps = nla_get_bitfield32(attr); + if (caps.selector & DEVLINK_PORT_FN_CAP_ROCE && + !ops->port_fn_roce_set) { + NL_SET_ERR_MSG_ATTR(extack, attr, + "Port doesn't support RoCE function attribute"); + return -EOPNOTSUPP; + } + if (caps.selector & DEVLINK_PORT_FN_CAP_MIGRATABLE) { + if (!ops->port_fn_migratable_set) { + NL_SET_ERR_MSG_ATTR(extack, attr, + "Port doesn't support migratable function attribute"); + return -EOPNOTSUPP; + } + if (devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) { + NL_SET_ERR_MSG_ATTR(extack, attr, + "migratable function attribute supported for VFs only"); + return -EOPNOTSUPP; + } + } + if (caps.selector & DEVLINK_PORT_FN_CAP_IPSEC_CRYPTO) { + if (!ops->port_fn_ipsec_crypto_set) { + NL_SET_ERR_MSG_ATTR(extack, attr, + "Port doesn't support ipsec_crypto function attribute"); + return -EOPNOTSUPP; + } + if (devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) { + NL_SET_ERR_MSG_ATTR(extack, attr, + "ipsec_crypto function attribute supported for VFs only"); + return -EOPNOTSUPP; + } + } + if (caps.selector & DEVLINK_PORT_FN_CAP_IPSEC_PACKET) { + if (!ops->port_fn_ipsec_packet_set) { + NL_SET_ERR_MSG_ATTR(extack, attr, + "Port doesn't support ipsec_packet function attribute"); + return -EOPNOTSUPP; + } + if (devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) { + NL_SET_ERR_MSG_ATTR(extack, attr, + "ipsec_packet function attribute supported for VFs only"); + return -EOPNOTSUPP; + } + } + } + return 0; +} + +static int devlink_port_function_set(struct devlink_port *port, + const struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[DEVLINK_PORT_FUNCTION_ATTR_MAX + 1]; + int err; + + err = nla_parse_nested(tb, DEVLINK_PORT_FUNCTION_ATTR_MAX, attr, + devlink_function_nl_policy, extack); + if (err < 0) { + NL_SET_ERR_MSG(extack, "Fail to parse port function attributes"); + return err; + } + + err = devlink_port_function_validate(port, tb, extack); + if (err) + return err; + + attr = tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR]; + if (attr) { + err = devlink_port_function_hw_addr_set(port, attr, extack); + if (err) + return err; + } + + attr = tb[DEVLINK_PORT_FN_ATTR_CAPS]; + if (attr) { + err = devlink_port_fn_caps_set(port, attr, extack); + if (err) + return err; + } + + /* Keep this as the last function attribute set, so that when + * multiple port function attributes are set along with state, + * Those can be applied first before activating the state. 
+ */ + attr = tb[DEVLINK_PORT_FN_ATTR_STATE]; + if (attr) + err = devlink_port_fn_state_set(port, attr, extack); + + if (!err) + devlink_port_notify(port, DEVLINK_CMD_PORT_NEW); + return err; +} + +int devlink_nl_cmd_port_set_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct devlink_port *devlink_port = info->user_ptr[1]; + int err; + + if (info->attrs[DEVLINK_ATTR_PORT_TYPE]) { + enum devlink_port_type port_type; + + port_type = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_TYPE]); + err = devlink_port_type_set(devlink_port, port_type); + if (err) + return err; + } + + if (info->attrs[DEVLINK_ATTR_PORT_FUNCTION]) { + struct nlattr *attr = info->attrs[DEVLINK_ATTR_PORT_FUNCTION]; + struct netlink_ext_ack *extack = info->extack; + + err = devlink_port_function_set(devlink_port, attr, extack); + if (err) + return err; + } + + return 0; +} + +int devlink_nl_cmd_port_split_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct devlink_port *devlink_port = info->user_ptr[1]; + struct devlink *devlink = info->user_ptr[0]; + u32 count; + + if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PORT_SPLIT_COUNT)) + return -EINVAL; + if (!devlink_port->ops->port_split) + return -EOPNOTSUPP; + + count = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT]); + + if (!devlink_port->attrs.splittable) { + /* Split ports cannot be split. */ + if (devlink_port->attrs.split) + NL_SET_ERR_MSG(info->extack, "Port cannot be split further"); + else + NL_SET_ERR_MSG(info->extack, "Port cannot be split"); + return -EINVAL; + } + + if (count < 2 || !is_power_of_2(count) || count > devlink_port->attrs.lanes) { + NL_SET_ERR_MSG(info->extack, "Invalid split count"); + return -EINVAL; + } + + return devlink_port->ops->port_split(devlink, devlink_port, count, + info->extack); +} + +int devlink_nl_cmd_port_unsplit_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink_port *devlink_port = info->user_ptr[1]; + struct devlink *devlink = info->user_ptr[0]; + + if (!devlink_port->ops->port_unsplit) + return -EOPNOTSUPP; + return devlink_port->ops->port_unsplit(devlink, devlink_port, info->extack); +} + +int devlink_nl_cmd_port_new_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct netlink_ext_ack *extack = info->extack; + struct devlink_port_new_attrs new_attrs = {}; + struct devlink *devlink = info->user_ptr[0]; + struct devlink_port *devlink_port; + struct sk_buff *msg; + int err; + + if (!devlink->ops->port_new) + return -EOPNOTSUPP; + + if (!info->attrs[DEVLINK_ATTR_PORT_FLAVOUR] || + !info->attrs[DEVLINK_ATTR_PORT_PCI_PF_NUMBER]) { + NL_SET_ERR_MSG(extack, "Port flavour or PCI PF are not specified"); + return -EINVAL; + } + new_attrs.flavour = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_FLAVOUR]); + new_attrs.pfnum = + nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_PCI_PF_NUMBER]); + + if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) { + /* Port index of the new port being created by driver. 
*/ + new_attrs.port_index = + nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); + new_attrs.port_index_valid = true; + } + if (info->attrs[DEVLINK_ATTR_PORT_CONTROLLER_NUMBER]) { + new_attrs.controller = + nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_CONTROLLER_NUMBER]); + new_attrs.controller_valid = true; + } + if (new_attrs.flavour == DEVLINK_PORT_FLAVOUR_PCI_SF && + info->attrs[DEVLINK_ATTR_PORT_PCI_SF_NUMBER]) { + new_attrs.sfnum = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_PCI_SF_NUMBER]); + new_attrs.sfnum_valid = true; + } + + err = devlink->ops->port_new(devlink, &new_attrs, + extack, &devlink_port); + if (err) + return err; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) { + err = -ENOMEM; + goto err_out_port_del; + } + err = devlink_nl_port_fill(msg, devlink_port, DEVLINK_CMD_NEW, + info->snd_portid, info->snd_seq, 0, NULL); + if (WARN_ON_ONCE(err)) + goto err_out_msg_free; + err = genlmsg_reply(msg, info); + if (err) + goto err_out_port_del; + return 0; + +err_out_msg_free: + nlmsg_free(msg); +err_out_port_del: + devlink_port->ops->port_del(devlink, devlink_port, NULL); + return err; +} + +int devlink_nl_cmd_port_del_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct devlink_port *devlink_port = info->user_ptr[1]; + struct netlink_ext_ack *extack = info->extack; + struct devlink *devlink = info->user_ptr[0]; + + if (!devlink_port->ops->port_del) + return -EOPNOTSUPP; + + return devlink_port->ops->port_del(devlink, devlink_port, extack); +} + +static void devlink_port_type_warn(struct work_struct *work) +{ + struct devlink_port *port = container_of(to_delayed_work(work), + struct devlink_port, + type_warn_dw); + dev_warn(port->devlink->dev, "Type was not set for devlink port."); +} + +static bool devlink_port_type_should_warn(struct devlink_port *devlink_port) +{ + /* Ignore CPU and DSA flavours. */ + return devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_CPU && + devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_DSA && + devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_UNUSED; +} + +#define DEVLINK_PORT_TYPE_WARN_TIMEOUT (HZ * 3600) + +static void devlink_port_type_warn_schedule(struct devlink_port *devlink_port) +{ + if (!devlink_port_type_should_warn(devlink_port)) + return; + /* Schedule a work to WARN in case driver does not set port + * type within timeout. + */ + schedule_delayed_work(&devlink_port->type_warn_dw, + DEVLINK_PORT_TYPE_WARN_TIMEOUT); +} + +static void devlink_port_type_warn_cancel(struct devlink_port *devlink_port) +{ + if (!devlink_port_type_should_warn(devlink_port)) + return; + cancel_delayed_work_sync(&devlink_port->type_warn_dw); +} + +/** + * devlink_port_init() - Init devlink port + * + * @devlink: devlink + * @devlink_port: devlink port + * + * Initialize essential stuff that is needed for functions + * that may be called before devlink port registration. + * Call to this function is optional and not needed + * in case the driver does not use such functions. + */ +void devlink_port_init(struct devlink *devlink, + struct devlink_port *devlink_port) +{ + if (devlink_port->initialized) + return; + devlink_port->devlink = devlink; + INIT_LIST_HEAD(&devlink_port->region_list); + devlink_port->initialized = true; +} +EXPORT_SYMBOL_GPL(devlink_port_init); + +/** + * devlink_port_fini() - Deinitialize devlink port + * + * @devlink_port: devlink port + * + * Deinitialize essential stuff that is in use for functions + * that may be called after devlink port unregistration. 
+ * Call to this function is optional and not needed + * in case the driver does not use such functions. + */ +void devlink_port_fini(struct devlink_port *devlink_port) +{ + WARN_ON(!list_empty(&devlink_port->region_list)); +} +EXPORT_SYMBOL_GPL(devlink_port_fini); + +static const struct devlink_port_ops devlink_port_dummy_ops = {}; + +/** + * devl_port_register_with_ops() - Register devlink port + * + * @devlink: devlink + * @devlink_port: devlink port + * @port_index: driver-specific numerical identifier of the port + * @ops: port ops + * + * Register devlink port with provided port index. User can use + * any indexing, even hw-related one. devlink_port structure + * is convenient to be embedded inside user driver private structure. + * Note that the caller should take care of zeroing the devlink_port + * structure. + */ +int devl_port_register_with_ops(struct devlink *devlink, + struct devlink_port *devlink_port, + unsigned int port_index, + const struct devlink_port_ops *ops) +{ + int err; + + devl_assert_locked(devlink); + + ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); + + devlink_port_init(devlink, devlink_port); + devlink_port->registered = true; + devlink_port->index = port_index; + devlink_port->ops = ops ? ops : &devlink_port_dummy_ops; + spin_lock_init(&devlink_port->type_lock); + INIT_LIST_HEAD(&devlink_port->reporter_list); + err = xa_insert(&devlink->ports, port_index, devlink_port, GFP_KERNEL); + if (err) { + devlink_port->registered = false; + return err; + } + + INIT_DELAYED_WORK(&devlink_port->type_warn_dw, &devlink_port_type_warn); + devlink_port_type_warn_schedule(devlink_port); + devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); + return 0; +} +EXPORT_SYMBOL_GPL(devl_port_register_with_ops); + +/** + * devlink_port_register_with_ops - Register devlink port + * + * @devlink: devlink + * @devlink_port: devlink port + * @port_index: driver-specific numerical identifier of the port + * @ops: port ops + * + * Register devlink port with provided port index. User can use + * any indexing, even hw-related one. devlink_port structure + * is convenient to be embedded inside user driver private structure. + * Note that the caller should take care of zeroing the devlink_port + * structure. + * + * Context: Takes and release devlink->lock . + */ +int devlink_port_register_with_ops(struct devlink *devlink, + struct devlink_port *devlink_port, + unsigned int port_index, + const struct devlink_port_ops *ops) +{ + int err; + + devl_lock(devlink); + err = devl_port_register_with_ops(devlink, devlink_port, + port_index, ops); + devl_unlock(devlink); + return err; +} +EXPORT_SYMBOL_GPL(devlink_port_register_with_ops); + +/** + * devl_port_unregister() - Unregister devlink port + * + * @devlink_port: devlink port + */ +void devl_port_unregister(struct devlink_port *devlink_port) +{ + lockdep_assert_held(&devlink_port->devlink->lock); + WARN_ON(devlink_port->type != DEVLINK_PORT_TYPE_NOTSET); + + devlink_port_type_warn_cancel(devlink_port); + devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL); + xa_erase(&devlink_port->devlink->ports, devlink_port->index); + WARN_ON(!list_empty(&devlink_port->reporter_list)); + devlink_port->registered = false; +} +EXPORT_SYMBOL_GPL(devl_port_unregister); + +/** + * devlink_port_unregister - Unregister devlink port + * + * @devlink_port: devlink port + * + * Context: Takes and release devlink->lock . 
+ */ +void devlink_port_unregister(struct devlink_port *devlink_port) +{ + struct devlink *devlink = devlink_port->devlink; + + devl_lock(devlink); + devl_port_unregister(devlink_port); + devl_unlock(devlink); +} +EXPORT_SYMBOL_GPL(devlink_port_unregister); + +static void devlink_port_type_netdev_checks(struct devlink_port *devlink_port, + struct net_device *netdev) +{ + const struct net_device_ops *ops = netdev->netdev_ops; + + /* If driver registers devlink port, it should set devlink port + * attributes accordingly so the compat functions are called + * and the original ops are not used. + */ + if (ops->ndo_get_phys_port_name) { + /* Some drivers use the same set of ndos for netdevs + * that have devlink_port registered and also for + * those who don't. Make sure that ndo_get_phys_port_name + * returns -EOPNOTSUPP here in case it is defined. + * Warn if not. + */ + char name[IFNAMSIZ]; + int err; + + err = ops->ndo_get_phys_port_name(netdev, name, sizeof(name)); + WARN_ON(err != -EOPNOTSUPP); + } + if (ops->ndo_get_port_parent_id) { + /* Some drivers use the same set of ndos for netdevs + * that have devlink_port registered and also for + * those who don't. Make sure that ndo_get_port_parent_id + * returns -EOPNOTSUPP here in case it is defined. + * Warn if not. + */ + struct netdev_phys_item_id ppid; + int err; + + err = ops->ndo_get_port_parent_id(netdev, &ppid); + WARN_ON(err != -EOPNOTSUPP); + } +} + +static void __devlink_port_type_set(struct devlink_port *devlink_port, + enum devlink_port_type type, + void *type_dev) +{ + struct net_device *netdev = type_dev; + + ASSERT_DEVLINK_PORT_REGISTERED(devlink_port); + + if (type == DEVLINK_PORT_TYPE_NOTSET) { + devlink_port_type_warn_schedule(devlink_port); + } else { + devlink_port_type_warn_cancel(devlink_port); + if (type == DEVLINK_PORT_TYPE_ETH && netdev) + devlink_port_type_netdev_checks(devlink_port, netdev); + } + + spin_lock_bh(&devlink_port->type_lock); + devlink_port->type = type; + switch (type) { + case DEVLINK_PORT_TYPE_ETH: + devlink_port->type_eth.netdev = netdev; + if (netdev) { + ASSERT_RTNL(); + devlink_port->type_eth.ifindex = netdev->ifindex; + BUILD_BUG_ON(sizeof(devlink_port->type_eth.ifname) != + sizeof(netdev->name)); + strcpy(devlink_port->type_eth.ifname, netdev->name); + } + break; + case DEVLINK_PORT_TYPE_IB: + devlink_port->type_ib.ibdev = type_dev; + break; + default: + break; + } + spin_unlock_bh(&devlink_port->type_lock); + devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); +} + +/** + * devlink_port_type_eth_set - Set port type to Ethernet + * + * @devlink_port: devlink port + * + * If driver is calling this, most likely it is doing something wrong. 
+ */ +void devlink_port_type_eth_set(struct devlink_port *devlink_port) +{ + dev_warn(devlink_port->devlink->dev, + "devlink port type for port %d set to Ethernet without a software interface reference, device type not supported by the kernel?\n", + devlink_port->index); + __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH, NULL); +} +EXPORT_SYMBOL_GPL(devlink_port_type_eth_set); + +/** + * devlink_port_type_ib_set - Set port type to InfiniBand + * + * @devlink_port: devlink port + * @ibdev: related IB device + */ +void devlink_port_type_ib_set(struct devlink_port *devlink_port, + struct ib_device *ibdev) +{ + __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_IB, ibdev); +} +EXPORT_SYMBOL_GPL(devlink_port_type_ib_set); + +/** + * devlink_port_type_clear - Clear port type + * + * @devlink_port: devlink port + * + * If driver is calling this for clearing Ethernet type, most likely + * it is doing something wrong. + */ +void devlink_port_type_clear(struct devlink_port *devlink_port) +{ + if (devlink_port->type == DEVLINK_PORT_TYPE_ETH) + dev_warn(devlink_port->devlink->dev, + "devlink port type for port %d cleared without a software interface reference, device type not supported by the kernel?\n", + devlink_port->index); + __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_NOTSET, NULL); +} +EXPORT_SYMBOL_GPL(devlink_port_type_clear); + +int devlink_port_netdevice_event(struct notifier_block *nb, + unsigned long event, void *ptr) +{ + struct net_device *netdev = netdev_notifier_info_to_dev(ptr); + struct devlink_port *devlink_port = netdev->devlink_port; + struct devlink *devlink; + + if (!devlink_port) + return NOTIFY_OK; + devlink = devlink_port->devlink; + + switch (event) { + case NETDEV_POST_INIT: + /* Set the type but not netdev pointer. It is going to be set + * later on by NETDEV_REGISTER event. Happens once during + * netdevice register + */ + __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH, + NULL); + break; + case NETDEV_REGISTER: + case NETDEV_CHANGENAME: + if (devlink_net(devlink) != dev_net(netdev)) + return NOTIFY_OK; + /* Set the netdev on top of previously set type. Note this + * event happens also during net namespace change so here + * we take into account netdev pointer appearing in this + * namespace. + */ + __devlink_port_type_set(devlink_port, devlink_port->type, + netdev); + break; + case NETDEV_UNREGISTER: + if (devlink_net(devlink) != dev_net(netdev)) + return NOTIFY_OK; + /* Clear netdev pointer, but not the type. This event happens + * also during net namespace change so we need to clear + * pointer to netdev that is going to another net namespace. + */ + __devlink_port_type_set(devlink_port, devlink_port->type, + NULL); + break; + case NETDEV_PRE_UNINIT: + /* Clear the type and the netdev pointer. Happens one during + * netdevice unregister. 
+ */ + __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_NOTSET, + NULL); + break; + } + + return NOTIFY_OK; +} + +static int __devlink_port_attrs_set(struct devlink_port *devlink_port, + enum devlink_port_flavour flavour) +{ + struct devlink_port_attrs *attrs = &devlink_port->attrs; + + devlink_port->attrs_set = true; + attrs->flavour = flavour; + if (attrs->switch_id.id_len) { + devlink_port->switch_port = true; + if (WARN_ON(attrs->switch_id.id_len > MAX_PHYS_ITEM_ID_LEN)) + attrs->switch_id.id_len = MAX_PHYS_ITEM_ID_LEN; + } else { + devlink_port->switch_port = false; + } + return 0; +} + +/** + * devlink_port_attrs_set - Set port attributes + * + * @devlink_port: devlink port + * @attrs: devlink port attrs + */ +void devlink_port_attrs_set(struct devlink_port *devlink_port, + struct devlink_port_attrs *attrs) +{ + int ret; + + ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); + + devlink_port->attrs = *attrs; + ret = __devlink_port_attrs_set(devlink_port, attrs->flavour); + if (ret) + return; + WARN_ON(attrs->splittable && attrs->split); +} +EXPORT_SYMBOL_GPL(devlink_port_attrs_set); + +/** + * devlink_port_attrs_pci_pf_set - Set PCI PF port attributes + * + * @devlink_port: devlink port + * @controller: associated controller number for the devlink port instance + * @pf: associated PF for the devlink port instance + * @external: indicates if the port is for an external controller + */ +void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 controller, + u16 pf, bool external) +{ + struct devlink_port_attrs *attrs = &devlink_port->attrs; + int ret; + + ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); + + ret = __devlink_port_attrs_set(devlink_port, + DEVLINK_PORT_FLAVOUR_PCI_PF); + if (ret) + return; + attrs->pci_pf.controller = controller; + attrs->pci_pf.pf = pf; + attrs->pci_pf.external = external; +} +EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_pf_set); + +/** + * devlink_port_attrs_pci_vf_set - Set PCI VF port attributes + * + * @devlink_port: devlink port + * @controller: associated controller number for the devlink port instance + * @pf: associated PF for the devlink port instance + * @vf: associated VF of a PF for the devlink port instance + * @external: indicates if the port is for an external controller + */ +void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 controller, + u16 pf, u16 vf, bool external) +{ + struct devlink_port_attrs *attrs = &devlink_port->attrs; + int ret; + + ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); + + ret = __devlink_port_attrs_set(devlink_port, + DEVLINK_PORT_FLAVOUR_PCI_VF); + if (ret) + return; + attrs->pci_vf.controller = controller; + attrs->pci_vf.pf = pf; + attrs->pci_vf.vf = vf; + attrs->pci_vf.external = external; +} +EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_vf_set); + +/** + * devlink_port_attrs_pci_sf_set - Set PCI SF port attributes + * + * @devlink_port: devlink port + * @controller: associated controller number for the devlink port instance + * @pf: associated PF for the devlink port instance + * @sf: associated SF of a PF for the devlink port instance + * @external: indicates if the port is for an external controller + */ +void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 controller, + u16 pf, u32 sf, bool external) +{ + struct devlink_port_attrs *attrs = &devlink_port->attrs; + int ret; + + ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); + + ret = __devlink_port_attrs_set(devlink_port, + DEVLINK_PORT_FLAVOUR_PCI_SF); + if (ret) + return; + 
attrs->pci_sf.controller = controller; + attrs->pci_sf.pf = pf; + attrs->pci_sf.sf = sf; + attrs->pci_sf.external = external; +} +EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_sf_set); + +/** + * devlink_port_linecard_set - Link port with a linecard + * + * @devlink_port: devlink port + * @linecard: devlink linecard + */ +void devlink_port_linecard_set(struct devlink_port *devlink_port, + struct devlink_linecard *linecard) +{ + ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); + + devlink_port->linecard = linecard; +} +EXPORT_SYMBOL_GPL(devlink_port_linecard_set); + +static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port, + char *name, size_t len) +{ + struct devlink_port_attrs *attrs = &devlink_port->attrs; + int n = 0; + + if (!devlink_port->attrs_set) + return -EOPNOTSUPP; + + switch (attrs->flavour) { + case DEVLINK_PORT_FLAVOUR_PHYSICAL: + if (devlink_port->linecard) + n = snprintf(name, len, "l%u", + devlink_port->linecard->index); + if (n < len) + n += snprintf(name + n, len - n, "p%u", + attrs->phys.port_number); + if (n < len && attrs->split) + n += snprintf(name + n, len - n, "s%u", + attrs->phys.split_subport_number); + break; + case DEVLINK_PORT_FLAVOUR_CPU: + case DEVLINK_PORT_FLAVOUR_DSA: + case DEVLINK_PORT_FLAVOUR_UNUSED: + /* As CPU and DSA ports do not have a netdevice associated + * case should not ever happen. + */ + WARN_ON(1); + return -EINVAL; + case DEVLINK_PORT_FLAVOUR_PCI_PF: + if (attrs->pci_pf.external) { + n = snprintf(name, len, "c%u", attrs->pci_pf.controller); + if (n >= len) + return -EINVAL; + len -= n; + name += n; + } + n = snprintf(name, len, "pf%u", attrs->pci_pf.pf); + break; + case DEVLINK_PORT_FLAVOUR_PCI_VF: + if (attrs->pci_vf.external) { + n = snprintf(name, len, "c%u", attrs->pci_vf.controller); + if (n >= len) + return -EINVAL; + len -= n; + name += n; + } + n = snprintf(name, len, "pf%uvf%u", + attrs->pci_vf.pf, attrs->pci_vf.vf); + break; + case DEVLINK_PORT_FLAVOUR_PCI_SF: + if (attrs->pci_sf.external) { + n = snprintf(name, len, "c%u", attrs->pci_sf.controller); + if (n >= len) + return -EINVAL; + len -= n; + name += n; + } + n = snprintf(name, len, "pf%usf%u", attrs->pci_sf.pf, + attrs->pci_sf.sf); + break; + case DEVLINK_PORT_FLAVOUR_VIRTUAL: + return -EOPNOTSUPP; + } + + if (n >= len) + return -EINVAL; + + return 0; +} + +int devlink_compat_phys_port_name_get(struct net_device *dev, + char *name, size_t len) +{ + struct devlink_port *devlink_port; + + /* RTNL mutex is held here which ensures that devlink_port + * instance cannot disappear in the middle. No need to take + * any devlink lock as only permanent values are accessed. + */ + ASSERT_RTNL(); + + devlink_port = dev->devlink_port; + if (!devlink_port) + return -EOPNOTSUPP; + + return __devlink_port_phys_port_name_get(devlink_port, name, len); +} + +int devlink_compat_switch_id_get(struct net_device *dev, + struct netdev_phys_item_id *ppid) +{ + struct devlink_port *devlink_port; + + /* Caller must hold RTNL mutex or reference to dev, which ensures that + * devlink_port instance cannot disappear in the middle. No need to take + * any devlink lock as only permanent values are accessed. 
+ */ + devlink_port = dev->devlink_port; + if (!devlink_port || !devlink_port->switch_port) + return -EOPNOTSUPP; + + memcpy(ppid, &devlink_port->attrs.switch_id, sizeof(*ppid)); + + return 0; +} -- cgit v1.2.3 From 2b4d8bb0888930b4a26cd46ec067753228877488 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 28 Aug 2023 08:16:45 +0200 Subject: devlink: push shared buffer related code into separate file Cut out another chunk from leftover.c and put sb related code into a separate file. Signed-off-by: Jiri Pirko Link: https://lore.kernel.org/r/20230828061657.300667-4-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- net/devlink/Makefile | 2 +- net/devlink/devl_internal.h | 9 + net/devlink/leftover.c | 1338 ++++++------------------------------------- net/devlink/sb.c | 996 ++++++++++++++++++++++++++++++++ 4 files changed, 1180 insertions(+), 1165 deletions(-) create mode 100644 net/devlink/sb.c (limited to 'net') diff --git a/net/devlink/Makefile b/net/devlink/Makefile index 456bfb336540..dcc508af48a9 100644 --- a/net/devlink/Makefile +++ b/net/devlink/Makefile @@ -1,3 +1,3 @@ # SPDX-License-Identifier: GPL-2.0 -obj-y := leftover.o core.o netlink.o netlink_gen.o dev.o port.o health.o +obj-y := leftover.o core.o netlink.o netlink_gen.o dev.o port.o sb.o health.o \ diff --git a/net/devlink/devl_internal.h b/net/devlink/devl_internal.h index 7d01e2060702..e1a6b7a763b8 100644 --- a/net/devlink/devl_internal.h +++ b/net/devlink/devl_internal.h @@ -220,6 +220,15 @@ int devlink_nl_cmd_port_unsplit_doit(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_port_new_doit(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_port_del_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_cmd_sb_pool_set_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_cmd_sb_port_pool_set_doit(struct sk_buff *skb, + struct genl_info *info); +int devlink_nl_cmd_sb_tc_pool_bind_set_doit(struct sk_buff *skb, + struct genl_info *info); +int devlink_nl_cmd_sb_occ_snapshot_doit(struct sk_buff *skb, + struct genl_info *info); +int devlink_nl_cmd_sb_occ_max_clear_doit(struct sk_buff *skb, + struct genl_info *info); int devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_health_reporter_recover_doit(struct sk_buff *skb, diff --git a/net/devlink/leftover.c b/net/devlink/leftover.c index d14b40fb8fdf..795bfdd41103 100644 --- a/net/devlink/leftover.c +++ b/net/devlink/leftover.c @@ -231,164 +231,6 @@ devlink_linecard_get_from_info(struct devlink *devlink, struct genl_info *info) return devlink_linecard_get_from_attrs(devlink, info->attrs); } -struct devlink_sb { - struct list_head list; - unsigned int index; - u32 size; - u16 ingress_pools_count; - u16 egress_pools_count; - u16 ingress_tc_count; - u16 egress_tc_count; -}; - -static u16 devlink_sb_pool_count(struct devlink_sb *devlink_sb) -{ - return devlink_sb->ingress_pools_count + devlink_sb->egress_pools_count; -} - -static struct devlink_sb *devlink_sb_get_by_index(struct devlink *devlink, - unsigned int sb_index) -{ - struct devlink_sb *devlink_sb; - - list_for_each_entry(devlink_sb, &devlink->sb_list, list) { - if (devlink_sb->index == sb_index) - return devlink_sb; - } - return NULL; -} - -static bool devlink_sb_index_exists(struct devlink *devlink, - unsigned int sb_index) -{ - return devlink_sb_get_by_index(devlink, sb_index); -} - -static struct devlink_sb *devlink_sb_get_from_attrs(struct devlink *devlink, - struct nlattr **attrs) -{ - if 
(attrs[DEVLINK_ATTR_SB_INDEX]) { - u32 sb_index = nla_get_u32(attrs[DEVLINK_ATTR_SB_INDEX]); - struct devlink_sb *devlink_sb; - - devlink_sb = devlink_sb_get_by_index(devlink, sb_index); - if (!devlink_sb) - return ERR_PTR(-ENODEV); - return devlink_sb; - } - return ERR_PTR(-EINVAL); -} - -static struct devlink_sb *devlink_sb_get_from_info(struct devlink *devlink, - struct genl_info *info) -{ - return devlink_sb_get_from_attrs(devlink, info->attrs); -} - -static int devlink_sb_pool_index_get_from_attrs(struct devlink_sb *devlink_sb, - struct nlattr **attrs, - u16 *p_pool_index) -{ - u16 val; - - if (!attrs[DEVLINK_ATTR_SB_POOL_INDEX]) - return -EINVAL; - - val = nla_get_u16(attrs[DEVLINK_ATTR_SB_POOL_INDEX]); - if (val >= devlink_sb_pool_count(devlink_sb)) - return -EINVAL; - *p_pool_index = val; - return 0; -} - -static int devlink_sb_pool_index_get_from_info(struct devlink_sb *devlink_sb, - struct genl_info *info, - u16 *p_pool_index) -{ - return devlink_sb_pool_index_get_from_attrs(devlink_sb, info->attrs, - p_pool_index); -} - -static int -devlink_sb_pool_type_get_from_attrs(struct nlattr **attrs, - enum devlink_sb_pool_type *p_pool_type) -{ - u8 val; - - if (!attrs[DEVLINK_ATTR_SB_POOL_TYPE]) - return -EINVAL; - - val = nla_get_u8(attrs[DEVLINK_ATTR_SB_POOL_TYPE]); - if (val != DEVLINK_SB_POOL_TYPE_INGRESS && - val != DEVLINK_SB_POOL_TYPE_EGRESS) - return -EINVAL; - *p_pool_type = val; - return 0; -} - -static int -devlink_sb_pool_type_get_from_info(struct genl_info *info, - enum devlink_sb_pool_type *p_pool_type) -{ - return devlink_sb_pool_type_get_from_attrs(info->attrs, p_pool_type); -} - -static int -devlink_sb_th_type_get_from_attrs(struct nlattr **attrs, - enum devlink_sb_threshold_type *p_th_type) -{ - u8 val; - - if (!attrs[DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE]) - return -EINVAL; - - val = nla_get_u8(attrs[DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE]); - if (val != DEVLINK_SB_THRESHOLD_TYPE_STATIC && - val != DEVLINK_SB_THRESHOLD_TYPE_DYNAMIC) - return -EINVAL; - *p_th_type = val; - return 0; -} - -static int -devlink_sb_th_type_get_from_info(struct genl_info *info, - enum devlink_sb_threshold_type *p_th_type) -{ - return devlink_sb_th_type_get_from_attrs(info->attrs, p_th_type); -} - -static int -devlink_sb_tc_index_get_from_attrs(struct devlink_sb *devlink_sb, - struct nlattr **attrs, - enum devlink_sb_pool_type pool_type, - u16 *p_tc_index) -{ - u16 val; - - if (!attrs[DEVLINK_ATTR_SB_TC_INDEX]) - return -EINVAL; - - val = nla_get_u16(attrs[DEVLINK_ATTR_SB_TC_INDEX]); - if (pool_type == DEVLINK_SB_POOL_TYPE_INGRESS && - val >= devlink_sb->ingress_tc_count) - return -EINVAL; - if (pool_type == DEVLINK_SB_POOL_TYPE_EGRESS && - val >= devlink_sb->egress_tc_count) - return -EINVAL; - *p_tc_index = val; - return 0; -} - -static int -devlink_sb_tc_index_get_from_info(struct devlink_sb *devlink_sb, - struct genl_info *info, - enum devlink_sb_pool_type pool_type, - u16 *p_tc_index) -{ - return devlink_sb_tc_index_get_from_attrs(devlink_sb, info->attrs, - pool_type, p_tc_index); -} - struct devlink_region { struct devlink *devlink; struct devlink_port *port; @@ -1053,825 +895,31 @@ static void devlink_linecards_notify_unregister(struct devlink *devlink) { struct devlink_linecard *linecard; - list_for_each_entry_reverse(linecard, &devlink->linecard_list, list) - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_DEL); -} - -int devlink_nl_linecard_get_doit(struct sk_buff *skb, struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - struct devlink_linecard *linecard; - 
struct sk_buff *msg; - int err; - - linecard = devlink_linecard_get_from_info(devlink, info); - if (IS_ERR(linecard)) - return PTR_ERR(linecard); - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - mutex_lock(&linecard->state_lock); - err = devlink_nl_linecard_fill(msg, devlink, linecard, - DEVLINK_CMD_LINECARD_NEW, - info->snd_portid, info->snd_seq, 0, - info->extack); - mutex_unlock(&linecard->state_lock); - if (err) { - nlmsg_free(msg); - return err; - } - - return genlmsg_reply(msg, info); -} - -static int devlink_nl_linecard_get_dump_one(struct sk_buff *msg, - struct devlink *devlink, - struct netlink_callback *cb, - int flags) -{ - struct devlink_nl_dump_state *state = devlink_dump_state(cb); - struct devlink_linecard *linecard; - int idx = 0; - int err = 0; - - list_for_each_entry(linecard, &devlink->linecard_list, list) { - if (idx < state->idx) { - idx++; - continue; - } - mutex_lock(&linecard->state_lock); - err = devlink_nl_linecard_fill(msg, devlink, linecard, - DEVLINK_CMD_LINECARD_NEW, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, flags, - cb->extack); - mutex_unlock(&linecard->state_lock); - if (err) { - state->idx = idx; - break; - } - idx++; - } - - return err; -} - -int devlink_nl_linecard_get_dumpit(struct sk_buff *skb, - struct netlink_callback *cb) -{ - return devlink_nl_dumpit(skb, cb, devlink_nl_linecard_get_dump_one); -} - -static struct devlink_linecard_type * -devlink_linecard_type_lookup(struct devlink_linecard *linecard, - const char *type) -{ - struct devlink_linecard_type *linecard_type; - int i; - - for (i = 0; i < linecard->types_count; i++) { - linecard_type = &linecard->types[i]; - if (!strcmp(type, linecard_type->type)) - return linecard_type; - } - return NULL; -} - -static int devlink_linecard_type_set(struct devlink_linecard *linecard, - const char *type, - struct netlink_ext_ack *extack) -{ - const struct devlink_linecard_ops *ops = linecard->ops; - struct devlink_linecard_type *linecard_type; - int err; - - mutex_lock(&linecard->state_lock); - if (linecard->state == DEVLINK_LINECARD_STATE_PROVISIONING) { - NL_SET_ERR_MSG(extack, "Line card is currently being provisioned"); - err = -EBUSY; - goto out; - } - if (linecard->state == DEVLINK_LINECARD_STATE_UNPROVISIONING) { - NL_SET_ERR_MSG(extack, "Line card is currently being unprovisioned"); - err = -EBUSY; - goto out; - } - - linecard_type = devlink_linecard_type_lookup(linecard, type); - if (!linecard_type) { - NL_SET_ERR_MSG(extack, "Unsupported line card type provided"); - err = -EINVAL; - goto out; - } - - if (linecard->state != DEVLINK_LINECARD_STATE_UNPROVISIONED && - linecard->state != DEVLINK_LINECARD_STATE_PROVISIONING_FAILED) { - NL_SET_ERR_MSG(extack, "Line card already provisioned"); - err = -EBUSY; - /* Check if the line card is provisioned in the same - * way the user asks. In case it is, make the operation - * to return success. - */ - if (ops->same_provision && - ops->same_provision(linecard, linecard->priv, - linecard_type->type, - linecard_type->priv)) - err = 0; - goto out; - } - - linecard->state = DEVLINK_LINECARD_STATE_PROVISIONING; - linecard->type = linecard_type->type; - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); - mutex_unlock(&linecard->state_lock); - err = ops->provision(linecard, linecard->priv, linecard_type->type, - linecard_type->priv, extack); - if (err) { - /* Provisioning failed. Assume the linecard is unprovisioned - * for future operations. 
- */ - mutex_lock(&linecard->state_lock); - linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; - linecard->type = NULL; - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); - mutex_unlock(&linecard->state_lock); - } - return err; - -out: - mutex_unlock(&linecard->state_lock); - return err; -} - -static int devlink_linecard_type_unset(struct devlink_linecard *linecard, - struct netlink_ext_ack *extack) -{ - int err; - - mutex_lock(&linecard->state_lock); - if (linecard->state == DEVLINK_LINECARD_STATE_PROVISIONING) { - NL_SET_ERR_MSG(extack, "Line card is currently being provisioned"); - err = -EBUSY; - goto out; - } - if (linecard->state == DEVLINK_LINECARD_STATE_UNPROVISIONING) { - NL_SET_ERR_MSG(extack, "Line card is currently being unprovisioned"); - err = -EBUSY; - goto out; - } - if (linecard->state == DEVLINK_LINECARD_STATE_PROVISIONING_FAILED) { - linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; - linecard->type = NULL; - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); - err = 0; - goto out; - } - - if (linecard->state == DEVLINK_LINECARD_STATE_UNPROVISIONED) { - NL_SET_ERR_MSG(extack, "Line card is not provisioned"); - err = 0; - goto out; - } - linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONING; - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); - mutex_unlock(&linecard->state_lock); - err = linecard->ops->unprovision(linecard, linecard->priv, - extack); - if (err) { - /* Unprovisioning failed. Assume the linecard is unprovisioned - * for future operations. - */ - mutex_lock(&linecard->state_lock); - linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; - linecard->type = NULL; - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); - mutex_unlock(&linecard->state_lock); - } - return err; - -out: - mutex_unlock(&linecard->state_lock); - return err; -} - -static int devlink_nl_cmd_linecard_set_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct netlink_ext_ack *extack = info->extack; - struct devlink *devlink = info->user_ptr[0]; - struct devlink_linecard *linecard; - int err; - - linecard = devlink_linecard_get_from_info(devlink, info); - if (IS_ERR(linecard)) - return PTR_ERR(linecard); - - if (info->attrs[DEVLINK_ATTR_LINECARD_TYPE]) { - const char *type; - - type = nla_data(info->attrs[DEVLINK_ATTR_LINECARD_TYPE]); - if (strcmp(type, "")) { - err = devlink_linecard_type_set(linecard, type, extack); - if (err) - return err; - } else { - err = devlink_linecard_type_unset(linecard, extack); - if (err) - return err; - } - } - - return 0; -} - -static int devlink_nl_sb_fill(struct sk_buff *msg, struct devlink *devlink, - struct devlink_sb *devlink_sb, - enum devlink_command cmd, u32 portid, - u32 seq, int flags) -{ - void *hdr; - - hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); - if (!hdr) - return -EMSGSIZE; - - if (devlink_nl_put_handle(msg, devlink)) - goto nla_put_failure; - if (nla_put_u32(msg, DEVLINK_ATTR_SB_INDEX, devlink_sb->index)) - goto nla_put_failure; - if (nla_put_u32(msg, DEVLINK_ATTR_SB_SIZE, devlink_sb->size)) - goto nla_put_failure; - if (nla_put_u16(msg, DEVLINK_ATTR_SB_INGRESS_POOL_COUNT, - devlink_sb->ingress_pools_count)) - goto nla_put_failure; - if (nla_put_u16(msg, DEVLINK_ATTR_SB_EGRESS_POOL_COUNT, - devlink_sb->egress_pools_count)) - goto nla_put_failure; - if (nla_put_u16(msg, DEVLINK_ATTR_SB_INGRESS_TC_COUNT, - devlink_sb->ingress_tc_count)) - goto nla_put_failure; - if (nla_put_u16(msg, DEVLINK_ATTR_SB_EGRESS_TC_COUNT, - devlink_sb->egress_tc_count)) - goto 
nla_put_failure; - - genlmsg_end(msg, hdr); - return 0; - -nla_put_failure: - genlmsg_cancel(msg, hdr); - return -EMSGSIZE; -} - -int devlink_nl_sb_get_doit(struct sk_buff *skb, struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - struct devlink_sb *devlink_sb; - struct sk_buff *msg; - int err; - - devlink_sb = devlink_sb_get_from_info(devlink, info); - if (IS_ERR(devlink_sb)) - return PTR_ERR(devlink_sb); - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - err = devlink_nl_sb_fill(msg, devlink, devlink_sb, - DEVLINK_CMD_SB_NEW, - info->snd_portid, info->snd_seq, 0); - if (err) { - nlmsg_free(msg); - return err; - } - - return genlmsg_reply(msg, info); -} - -static int -devlink_nl_sb_get_dump_one(struct sk_buff *msg, struct devlink *devlink, - struct netlink_callback *cb, int flags) -{ - struct devlink_nl_dump_state *state = devlink_dump_state(cb); - struct devlink_sb *devlink_sb; - int idx = 0; - int err = 0; - - list_for_each_entry(devlink_sb, &devlink->sb_list, list) { - if (idx < state->idx) { - idx++; - continue; - } - err = devlink_nl_sb_fill(msg, devlink, devlink_sb, - DEVLINK_CMD_SB_NEW, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, flags); - if (err) { - state->idx = idx; - break; - } - idx++; - } - - return err; -} - -int devlink_nl_sb_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) -{ - return devlink_nl_dumpit(skb, cb, devlink_nl_sb_get_dump_one); -} - -static int devlink_nl_sb_pool_fill(struct sk_buff *msg, struct devlink *devlink, - struct devlink_sb *devlink_sb, - u16 pool_index, enum devlink_command cmd, - u32 portid, u32 seq, int flags) -{ - struct devlink_sb_pool_info pool_info; - void *hdr; - int err; - - err = devlink->ops->sb_pool_get(devlink, devlink_sb->index, - pool_index, &pool_info); - if (err) - return err; - - hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); - if (!hdr) - return -EMSGSIZE; - - if (devlink_nl_put_handle(msg, devlink)) - goto nla_put_failure; - if (nla_put_u32(msg, DEVLINK_ATTR_SB_INDEX, devlink_sb->index)) - goto nla_put_failure; - if (nla_put_u16(msg, DEVLINK_ATTR_SB_POOL_INDEX, pool_index)) - goto nla_put_failure; - if (nla_put_u8(msg, DEVLINK_ATTR_SB_POOL_TYPE, pool_info.pool_type)) - goto nla_put_failure; - if (nla_put_u32(msg, DEVLINK_ATTR_SB_POOL_SIZE, pool_info.size)) - goto nla_put_failure; - if (nla_put_u8(msg, DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE, - pool_info.threshold_type)) - goto nla_put_failure; - if (nla_put_u32(msg, DEVLINK_ATTR_SB_POOL_CELL_SIZE, - pool_info.cell_size)) - goto nla_put_failure; - - genlmsg_end(msg, hdr); - return 0; - -nla_put_failure: - genlmsg_cancel(msg, hdr); - return -EMSGSIZE; -} - -int devlink_nl_sb_pool_get_doit(struct sk_buff *skb, struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - struct devlink_sb *devlink_sb; - struct sk_buff *msg; - u16 pool_index; - int err; - - devlink_sb = devlink_sb_get_from_info(devlink, info); - if (IS_ERR(devlink_sb)) - return PTR_ERR(devlink_sb); - - err = devlink_sb_pool_index_get_from_info(devlink_sb, info, - &pool_index); - if (err) - return err; - - if (!devlink->ops->sb_pool_get) - return -EOPNOTSUPP; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - err = devlink_nl_sb_pool_fill(msg, devlink, devlink_sb, pool_index, - DEVLINK_CMD_SB_POOL_NEW, - info->snd_portid, info->snd_seq, 0); - if (err) { - nlmsg_free(msg); - return err; - } - - return genlmsg_reply(msg, info); -} - -static int __sb_pool_get_dumpit(struct 
sk_buff *msg, int start, int *p_idx, - struct devlink *devlink, - struct devlink_sb *devlink_sb, - u32 portid, u32 seq, int flags) -{ - u16 pool_count = devlink_sb_pool_count(devlink_sb); - u16 pool_index; - int err; - - for (pool_index = 0; pool_index < pool_count; pool_index++) { - if (*p_idx < start) { - (*p_idx)++; - continue; - } - err = devlink_nl_sb_pool_fill(msg, devlink, - devlink_sb, - pool_index, - DEVLINK_CMD_SB_POOL_NEW, - portid, seq, flags); - if (err) - return err; - (*p_idx)++; - } - return 0; -} - -static int -devlink_nl_sb_pool_get_dump_one(struct sk_buff *msg, struct devlink *devlink, - struct netlink_callback *cb, int flags) -{ - struct devlink_nl_dump_state *state = devlink_dump_state(cb); - struct devlink_sb *devlink_sb; - int err = 0; - int idx = 0; - - if (!devlink->ops->sb_pool_get) - return 0; - - list_for_each_entry(devlink_sb, &devlink->sb_list, list) { - err = __sb_pool_get_dumpit(msg, state->idx, &idx, - devlink, devlink_sb, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, flags); - if (err == -EOPNOTSUPP) { - err = 0; - } else if (err) { - state->idx = idx; - break; - } - } - - return err; -} - -int devlink_nl_sb_pool_get_dumpit(struct sk_buff *skb, - struct netlink_callback *cb) -{ - return devlink_nl_dumpit(skb, cb, devlink_nl_sb_pool_get_dump_one); -} - -static int devlink_sb_pool_set(struct devlink *devlink, unsigned int sb_index, - u16 pool_index, u32 size, - enum devlink_sb_threshold_type threshold_type, - struct netlink_ext_ack *extack) - -{ - const struct devlink_ops *ops = devlink->ops; - - if (ops->sb_pool_set) - return ops->sb_pool_set(devlink, sb_index, pool_index, - size, threshold_type, extack); - return -EOPNOTSUPP; -} - -static int devlink_nl_cmd_sb_pool_set_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - enum devlink_sb_threshold_type threshold_type; - struct devlink_sb *devlink_sb; - u16 pool_index; - u32 size; - int err; - - devlink_sb = devlink_sb_get_from_info(devlink, info); - if (IS_ERR(devlink_sb)) - return PTR_ERR(devlink_sb); - - err = devlink_sb_pool_index_get_from_info(devlink_sb, info, - &pool_index); - if (err) - return err; - - err = devlink_sb_th_type_get_from_info(info, &threshold_type); - if (err) - return err; - - if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_SB_POOL_SIZE)) - return -EINVAL; - - size = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_POOL_SIZE]); - return devlink_sb_pool_set(devlink, devlink_sb->index, - pool_index, size, threshold_type, - info->extack); -} - -static int devlink_nl_sb_port_pool_fill(struct sk_buff *msg, - struct devlink *devlink, - struct devlink_port *devlink_port, - struct devlink_sb *devlink_sb, - u16 pool_index, - enum devlink_command cmd, - u32 portid, u32 seq, int flags) -{ - const struct devlink_ops *ops = devlink->ops; - u32 threshold; - void *hdr; - int err; - - err = ops->sb_port_pool_get(devlink_port, devlink_sb->index, - pool_index, &threshold); - if (err) - return err; - - hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); - if (!hdr) - return -EMSGSIZE; - - if (devlink_nl_put_handle(msg, devlink)) - goto nla_put_failure; - if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index)) - goto nla_put_failure; - if (nla_put_u32(msg, DEVLINK_ATTR_SB_INDEX, devlink_sb->index)) - goto nla_put_failure; - if (nla_put_u16(msg, DEVLINK_ATTR_SB_POOL_INDEX, pool_index)) - goto nla_put_failure; - if (nla_put_u32(msg, DEVLINK_ATTR_SB_THRESHOLD, threshold)) - goto nla_put_failure; - - if (ops->sb_occ_port_pool_get) { - 
u32 cur; - u32 max; - - err = ops->sb_occ_port_pool_get(devlink_port, devlink_sb->index, - pool_index, &cur, &max); - if (err && err != -EOPNOTSUPP) - goto sb_occ_get_failure; - if (!err) { - if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_CUR, cur)) - goto nla_put_failure; - if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_MAX, max)) - goto nla_put_failure; - } - } - - genlmsg_end(msg, hdr); - return 0; - -nla_put_failure: - err = -EMSGSIZE; -sb_occ_get_failure: - genlmsg_cancel(msg, hdr); - return err; -} - -int devlink_nl_sb_port_pool_get_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink_port *devlink_port = info->user_ptr[1]; - struct devlink *devlink = devlink_port->devlink; - struct devlink_sb *devlink_sb; - struct sk_buff *msg; - u16 pool_index; - int err; - - devlink_sb = devlink_sb_get_from_info(devlink, info); - if (IS_ERR(devlink_sb)) - return PTR_ERR(devlink_sb); - - err = devlink_sb_pool_index_get_from_info(devlink_sb, info, - &pool_index); - if (err) - return err; - - if (!devlink->ops->sb_port_pool_get) - return -EOPNOTSUPP; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - err = devlink_nl_sb_port_pool_fill(msg, devlink, devlink_port, - devlink_sb, pool_index, - DEVLINK_CMD_SB_PORT_POOL_NEW, - info->snd_portid, info->snd_seq, 0); - if (err) { - nlmsg_free(msg); - return err; - } - - return genlmsg_reply(msg, info); -} - -static int __sb_port_pool_get_dumpit(struct sk_buff *msg, int start, int *p_idx, - struct devlink *devlink, - struct devlink_sb *devlink_sb, - u32 portid, u32 seq, int flags) -{ - struct devlink_port *devlink_port; - u16 pool_count = devlink_sb_pool_count(devlink_sb); - unsigned long port_index; - u16 pool_index; - int err; - - xa_for_each(&devlink->ports, port_index, devlink_port) { - for (pool_index = 0; pool_index < pool_count; pool_index++) { - if (*p_idx < start) { - (*p_idx)++; - continue; - } - err = devlink_nl_sb_port_pool_fill(msg, devlink, - devlink_port, - devlink_sb, - pool_index, - DEVLINK_CMD_SB_PORT_POOL_NEW, - portid, seq, flags); - if (err) - return err; - (*p_idx)++; - } - } - return 0; -} - -static int -devlink_nl_sb_port_pool_get_dump_one(struct sk_buff *msg, - struct devlink *devlink, - struct netlink_callback *cb, int flags) -{ - struct devlink_nl_dump_state *state = devlink_dump_state(cb); - struct devlink_sb *devlink_sb; - int idx = 0; - int err = 0; - - if (!devlink->ops->sb_port_pool_get) - return 0; - - list_for_each_entry(devlink_sb, &devlink->sb_list, list) { - err = __sb_port_pool_get_dumpit(msg, state->idx, &idx, - devlink, devlink_sb, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, flags); - if (err == -EOPNOTSUPP) { - err = 0; - } else if (err) { - state->idx = idx; - break; - } - } - - return err; -} - -int devlink_nl_sb_port_pool_get_dumpit(struct sk_buff *skb, - struct netlink_callback *cb) -{ - return devlink_nl_dumpit(skb, cb, devlink_nl_sb_port_pool_get_dump_one); -} - -static int devlink_sb_port_pool_set(struct devlink_port *devlink_port, - unsigned int sb_index, u16 pool_index, - u32 threshold, - struct netlink_ext_ack *extack) - -{ - const struct devlink_ops *ops = devlink_port->devlink->ops; - - if (ops->sb_port_pool_set) - return ops->sb_port_pool_set(devlink_port, sb_index, - pool_index, threshold, extack); - return -EOPNOTSUPP; -} - -static int devlink_nl_cmd_sb_port_pool_set_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink_port *devlink_port = info->user_ptr[1]; - struct devlink *devlink = info->user_ptr[0]; - struct devlink_sb *devlink_sb; 
- u16 pool_index; - u32 threshold; - int err; - - devlink_sb = devlink_sb_get_from_info(devlink, info); - if (IS_ERR(devlink_sb)) - return PTR_ERR(devlink_sb); - - err = devlink_sb_pool_index_get_from_info(devlink_sb, info, - &pool_index); - if (err) - return err; - - if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_SB_THRESHOLD)) - return -EINVAL; - - threshold = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_THRESHOLD]); - return devlink_sb_port_pool_set(devlink_port, devlink_sb->index, - pool_index, threshold, info->extack); -} - -static int -devlink_nl_sb_tc_pool_bind_fill(struct sk_buff *msg, struct devlink *devlink, - struct devlink_port *devlink_port, - struct devlink_sb *devlink_sb, u16 tc_index, - enum devlink_sb_pool_type pool_type, - enum devlink_command cmd, - u32 portid, u32 seq, int flags) -{ - const struct devlink_ops *ops = devlink->ops; - u16 pool_index; - u32 threshold; - void *hdr; - int err; - - err = ops->sb_tc_pool_bind_get(devlink_port, devlink_sb->index, - tc_index, pool_type, - &pool_index, &threshold); - if (err) - return err; - - hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); - if (!hdr) - return -EMSGSIZE; - - if (devlink_nl_put_handle(msg, devlink)) - goto nla_put_failure; - if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index)) - goto nla_put_failure; - if (nla_put_u32(msg, DEVLINK_ATTR_SB_INDEX, devlink_sb->index)) - goto nla_put_failure; - if (nla_put_u16(msg, DEVLINK_ATTR_SB_TC_INDEX, tc_index)) - goto nla_put_failure; - if (nla_put_u8(msg, DEVLINK_ATTR_SB_POOL_TYPE, pool_type)) - goto nla_put_failure; - if (nla_put_u16(msg, DEVLINK_ATTR_SB_POOL_INDEX, pool_index)) - goto nla_put_failure; - if (nla_put_u32(msg, DEVLINK_ATTR_SB_THRESHOLD, threshold)) - goto nla_put_failure; - - if (ops->sb_occ_tc_port_bind_get) { - u32 cur; - u32 max; - - err = ops->sb_occ_tc_port_bind_get(devlink_port, - devlink_sb->index, - tc_index, pool_type, - &cur, &max); - if (err && err != -EOPNOTSUPP) - return err; - if (!err) { - if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_CUR, cur)) - goto nla_put_failure; - if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_MAX, max)) - goto nla_put_failure; - } - } - - genlmsg_end(msg, hdr); - return 0; - -nla_put_failure: - genlmsg_cancel(msg, hdr); - return -EMSGSIZE; + list_for_each_entry_reverse(linecard, &devlink->linecard_list, list) + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_DEL); } -int devlink_nl_sb_tc_pool_bind_get_doit(struct sk_buff *skb, - struct genl_info *info) +int devlink_nl_linecard_get_doit(struct sk_buff *skb, struct genl_info *info) { - struct devlink_port *devlink_port = info->user_ptr[1]; - struct devlink *devlink = devlink_port->devlink; - struct devlink_sb *devlink_sb; + struct devlink *devlink = info->user_ptr[0]; + struct devlink_linecard *linecard; struct sk_buff *msg; - enum devlink_sb_pool_type pool_type; - u16 tc_index; int err; - devlink_sb = devlink_sb_get_from_info(devlink, info); - if (IS_ERR(devlink_sb)) - return PTR_ERR(devlink_sb); - - err = devlink_sb_pool_type_get_from_info(info, &pool_type); - if (err) - return err; - - err = devlink_sb_tc_index_get_from_info(devlink_sb, info, - pool_type, &tc_index); - if (err) - return err; - - if (!devlink->ops->sb_tc_pool_bind_get) - return -EOPNOTSUPP; + linecard = devlink_linecard_get_from_info(devlink, info); + if (IS_ERR(linecard)) + return PTR_ERR(linecard); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; - err = devlink_nl_sb_tc_pool_bind_fill(msg, devlink, devlink_port, - devlink_sb, tc_index, pool_type, - 
DEVLINK_CMD_SB_TC_POOL_BIND_NEW, - info->snd_portid, - info->snd_seq, 0); + mutex_lock(&linecard->state_lock); + err = devlink_nl_linecard_fill(msg, devlink, linecard, + DEVLINK_CMD_LINECARD_NEW, + info->snd_portid, info->snd_seq, 0, + info->extack); + mutex_unlock(&linecard->state_lock); if (err) { nlmsg_free(msg); return err; @@ -1880,179 +928,204 @@ int devlink_nl_sb_tc_pool_bind_get_doit(struct sk_buff *skb, return genlmsg_reply(msg, info); } -static int __sb_tc_pool_bind_get_dumpit(struct sk_buff *msg, - int start, int *p_idx, - struct devlink *devlink, - struct devlink_sb *devlink_sb, - u32 portid, u32 seq, int flags) -{ - struct devlink_port *devlink_port; - unsigned long port_index; - u16 tc_index; - int err; - - xa_for_each(&devlink->ports, port_index, devlink_port) { - for (tc_index = 0; - tc_index < devlink_sb->ingress_tc_count; tc_index++) { - if (*p_idx < start) { - (*p_idx)++; - continue; - } - err = devlink_nl_sb_tc_pool_bind_fill(msg, devlink, - devlink_port, - devlink_sb, - tc_index, - DEVLINK_SB_POOL_TYPE_INGRESS, - DEVLINK_CMD_SB_TC_POOL_BIND_NEW, - portid, seq, - flags); - if (err) - return err; - (*p_idx)++; - } - for (tc_index = 0; - tc_index < devlink_sb->egress_tc_count; tc_index++) { - if (*p_idx < start) { - (*p_idx)++; - continue; - } - err = devlink_nl_sb_tc_pool_bind_fill(msg, devlink, - devlink_port, - devlink_sb, - tc_index, - DEVLINK_SB_POOL_TYPE_EGRESS, - DEVLINK_CMD_SB_TC_POOL_BIND_NEW, - portid, seq, - flags); - if (err) - return err; - (*p_idx)++; - } - } - return 0; -} - -static int devlink_nl_sb_tc_pool_bind_get_dump_one(struct sk_buff *msg, - struct devlink *devlink, - struct netlink_callback *cb, - int flags) +static int devlink_nl_linecard_get_dump_one(struct sk_buff *msg, + struct devlink *devlink, + struct netlink_callback *cb, + int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); - struct devlink_sb *devlink_sb; + struct devlink_linecard *linecard; int idx = 0; int err = 0; - if (!devlink->ops->sb_tc_pool_bind_get) - return 0; - - list_for_each_entry(devlink_sb, &devlink->sb_list, list) { - err = __sb_tc_pool_bind_get_dumpit(msg, state->idx, &idx, - devlink, devlink_sb, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, flags); - if (err == -EOPNOTSUPP) { - err = 0; - } else if (err) { + list_for_each_entry(linecard, &devlink->linecard_list, list) { + if (idx < state->idx) { + idx++; + continue; + } + mutex_lock(&linecard->state_lock); + err = devlink_nl_linecard_fill(msg, devlink, linecard, + DEVLINK_CMD_LINECARD_NEW, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, flags, + cb->extack); + mutex_unlock(&linecard->state_lock); + if (err) { state->idx = idx; break; } + idx++; } return err; } -int devlink_nl_sb_tc_pool_bind_get_dumpit(struct sk_buff *skb, - struct netlink_callback *cb) +int devlink_nl_linecard_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) { - return devlink_nl_dumpit(skb, cb, - devlink_nl_sb_tc_pool_bind_get_dump_one); + return devlink_nl_dumpit(skb, cb, devlink_nl_linecard_get_dump_one); } -static int devlink_sb_tc_pool_bind_set(struct devlink_port *devlink_port, - unsigned int sb_index, u16 tc_index, - enum devlink_sb_pool_type pool_type, - u16 pool_index, u32 threshold, - struct netlink_ext_ack *extack) - +static struct devlink_linecard_type * +devlink_linecard_type_lookup(struct devlink_linecard *linecard, + const char *type) { - const struct devlink_ops *ops = devlink_port->devlink->ops; + struct devlink_linecard_type *linecard_type; + int i; - if (ops->sb_tc_pool_bind_set) - return 
ops->sb_tc_pool_bind_set(devlink_port, sb_index, - tc_index, pool_type, - pool_index, threshold, extack); - return -EOPNOTSUPP; + for (i = 0; i < linecard->types_count; i++) { + linecard_type = &linecard->types[i]; + if (!strcmp(type, linecard_type->type)) + return linecard_type; + } + return NULL; } -static int devlink_nl_cmd_sb_tc_pool_bind_set_doit(struct sk_buff *skb, - struct genl_info *info) +static int devlink_linecard_type_set(struct devlink_linecard *linecard, + const char *type, + struct netlink_ext_ack *extack) { - struct devlink_port *devlink_port = info->user_ptr[1]; - struct devlink *devlink = info->user_ptr[0]; - enum devlink_sb_pool_type pool_type; - struct devlink_sb *devlink_sb; - u16 tc_index; - u16 pool_index; - u32 threshold; + const struct devlink_linecard_ops *ops = linecard->ops; + struct devlink_linecard_type *linecard_type; int err; - devlink_sb = devlink_sb_get_from_info(devlink, info); - if (IS_ERR(devlink_sb)) - return PTR_ERR(devlink_sb); - - err = devlink_sb_pool_type_get_from_info(info, &pool_type); - if (err) - return err; + mutex_lock(&linecard->state_lock); + if (linecard->state == DEVLINK_LINECARD_STATE_PROVISIONING) { + NL_SET_ERR_MSG(extack, "Line card is currently being provisioned"); + err = -EBUSY; + goto out; + } + if (linecard->state == DEVLINK_LINECARD_STATE_UNPROVISIONING) { + NL_SET_ERR_MSG(extack, "Line card is currently being unprovisioned"); + err = -EBUSY; + goto out; + } - err = devlink_sb_tc_index_get_from_info(devlink_sb, info, - pool_type, &tc_index); - if (err) - return err; + linecard_type = devlink_linecard_type_lookup(linecard, type); + if (!linecard_type) { + NL_SET_ERR_MSG(extack, "Unsupported line card type provided"); + err = -EINVAL; + goto out; + } - err = devlink_sb_pool_index_get_from_info(devlink_sb, info, - &pool_index); - if (err) - return err; + if (linecard->state != DEVLINK_LINECARD_STATE_UNPROVISIONED && + linecard->state != DEVLINK_LINECARD_STATE_PROVISIONING_FAILED) { + NL_SET_ERR_MSG(extack, "Line card already provisioned"); + err = -EBUSY; + /* Check if the line card is provisioned in the same + * way the user asks. In case it is, make the operation + * to return success. + */ + if (ops->same_provision && + ops->same_provision(linecard, linecard->priv, + linecard_type->type, + linecard_type->priv)) + err = 0; + goto out; + } - if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_SB_THRESHOLD)) - return -EINVAL; + linecard->state = DEVLINK_LINECARD_STATE_PROVISIONING; + linecard->type = linecard_type->type; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + mutex_unlock(&linecard->state_lock); + err = ops->provision(linecard, linecard->priv, linecard_type->type, + linecard_type->priv, extack); + if (err) { + /* Provisioning failed. Assume the linecard is unprovisioned + * for future operations. 
+ */ + mutex_lock(&linecard->state_lock); + linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; + linecard->type = NULL; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + mutex_unlock(&linecard->state_lock); + } + return err; - threshold = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_THRESHOLD]); - return devlink_sb_tc_pool_bind_set(devlink_port, devlink_sb->index, - tc_index, pool_type, - pool_index, threshold, info->extack); +out: + mutex_unlock(&linecard->state_lock); + return err; } -static int devlink_nl_cmd_sb_occ_snapshot_doit(struct sk_buff *skb, - struct genl_info *info) +static int devlink_linecard_type_unset(struct devlink_linecard *linecard, + struct netlink_ext_ack *extack) { - struct devlink *devlink = info->user_ptr[0]; - const struct devlink_ops *ops = devlink->ops; - struct devlink_sb *devlink_sb; + int err; + + mutex_lock(&linecard->state_lock); + if (linecard->state == DEVLINK_LINECARD_STATE_PROVISIONING) { + NL_SET_ERR_MSG(extack, "Line card is currently being provisioned"); + err = -EBUSY; + goto out; + } + if (linecard->state == DEVLINK_LINECARD_STATE_UNPROVISIONING) { + NL_SET_ERR_MSG(extack, "Line card is currently being unprovisioned"); + err = -EBUSY; + goto out; + } + if (linecard->state == DEVLINK_LINECARD_STATE_PROVISIONING_FAILED) { + linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; + linecard->type = NULL; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + err = 0; + goto out; + } - devlink_sb = devlink_sb_get_from_info(devlink, info); - if (IS_ERR(devlink_sb)) - return PTR_ERR(devlink_sb); + if (linecard->state == DEVLINK_LINECARD_STATE_UNPROVISIONED) { + NL_SET_ERR_MSG(extack, "Line card is not provisioned"); + err = 0; + goto out; + } + linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONING; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + mutex_unlock(&linecard->state_lock); + err = linecard->ops->unprovision(linecard, linecard->priv, + extack); + if (err) { + /* Unprovisioning failed. Assume the linecard is unprovisioned + * for future operations. 
+ */ + mutex_lock(&linecard->state_lock); + linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; + linecard->type = NULL; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + mutex_unlock(&linecard->state_lock); + } + return err; - if (ops->sb_occ_snapshot) - return ops->sb_occ_snapshot(devlink, devlink_sb->index); - return -EOPNOTSUPP; +out: + mutex_unlock(&linecard->state_lock); + return err; } -static int devlink_nl_cmd_sb_occ_max_clear_doit(struct sk_buff *skb, - struct genl_info *info) +static int devlink_nl_cmd_linecard_set_doit(struct sk_buff *skb, + struct genl_info *info) { + struct netlink_ext_ack *extack = info->extack; struct devlink *devlink = info->user_ptr[0]; - const struct devlink_ops *ops = devlink->ops; - struct devlink_sb *devlink_sb; + struct devlink_linecard *linecard; + int err; - devlink_sb = devlink_sb_get_from_info(devlink, info); - if (IS_ERR(devlink_sb)) - return PTR_ERR(devlink_sb); + linecard = devlink_linecard_get_from_info(devlink, info); + if (IS_ERR(linecard)) + return PTR_ERR(linecard); + + if (info->attrs[DEVLINK_ATTR_LINECARD_TYPE]) { + const char *type; + + type = nla_data(info->attrs[DEVLINK_ATTR_LINECARD_TYPE]); + if (strcmp(type, "")) { + err = devlink_linecard_type_set(linecard, type, extack); + if (err) + return err; + } else { + err = devlink_linecard_type_unset(linecard, extack); + if (err) + return err; + } + } - if (ops->sb_occ_max_clear) - return ops->sb_occ_max_clear(devlink, devlink_sb->index); - return -EOPNOTSUPP; + return 0; } int devlink_rate_nodes_check(struct devlink *devlink, u16 mode, @@ -6215,69 +5288,6 @@ void devlink_linecard_nested_dl_set(struct devlink_linecard *linecard, } EXPORT_SYMBOL_GPL(devlink_linecard_nested_dl_set); -int devl_sb_register(struct devlink *devlink, unsigned int sb_index, - u32 size, u16 ingress_pools_count, - u16 egress_pools_count, u16 ingress_tc_count, - u16 egress_tc_count) -{ - struct devlink_sb *devlink_sb; - - lockdep_assert_held(&devlink->lock); - - if (devlink_sb_index_exists(devlink, sb_index)) - return -EEXIST; - - devlink_sb = kzalloc(sizeof(*devlink_sb), GFP_KERNEL); - if (!devlink_sb) - return -ENOMEM; - devlink_sb->index = sb_index; - devlink_sb->size = size; - devlink_sb->ingress_pools_count = ingress_pools_count; - devlink_sb->egress_pools_count = egress_pools_count; - devlink_sb->ingress_tc_count = ingress_tc_count; - devlink_sb->egress_tc_count = egress_tc_count; - list_add_tail(&devlink_sb->list, &devlink->sb_list); - return 0; -} -EXPORT_SYMBOL_GPL(devl_sb_register); - -int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, - u32 size, u16 ingress_pools_count, - u16 egress_pools_count, u16 ingress_tc_count, - u16 egress_tc_count) -{ - int err; - - devl_lock(devlink); - err = devl_sb_register(devlink, sb_index, size, ingress_pools_count, - egress_pools_count, ingress_tc_count, - egress_tc_count); - devl_unlock(devlink); - return err; -} -EXPORT_SYMBOL_GPL(devlink_sb_register); - -void devl_sb_unregister(struct devlink *devlink, unsigned int sb_index) -{ - struct devlink_sb *devlink_sb; - - lockdep_assert_held(&devlink->lock); - - devlink_sb = devlink_sb_get_by_index(devlink, sb_index); - WARN_ON(!devlink_sb); - list_del(&devlink_sb->list); - kfree(devlink_sb); -} -EXPORT_SYMBOL_GPL(devl_sb_unregister); - -void devlink_sb_unregister(struct devlink *devlink, unsigned int sb_index) -{ - devl_lock(devlink); - devl_sb_unregister(devlink, sb_index); - devl_unlock(devlink); -} -EXPORT_SYMBOL_GPL(devlink_sb_unregister); - /** * devl_dpipe_headers_register - 
register dpipe headers * diff --git a/net/devlink/sb.c b/net/devlink/sb.c new file mode 100644 index 000000000000..bd677fff5ec8 --- /dev/null +++ b/net/devlink/sb.c @@ -0,0 +1,996 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * Copyright (c) 2016 Jiri Pirko + */ + +#include "devl_internal.h" + +struct devlink_sb { + struct list_head list; + unsigned int index; + u32 size; + u16 ingress_pools_count; + u16 egress_pools_count; + u16 ingress_tc_count; + u16 egress_tc_count; +}; + +static u16 devlink_sb_pool_count(struct devlink_sb *devlink_sb) +{ + return devlink_sb->ingress_pools_count + devlink_sb->egress_pools_count; +} + +static struct devlink_sb *devlink_sb_get_by_index(struct devlink *devlink, + unsigned int sb_index) +{ + struct devlink_sb *devlink_sb; + + list_for_each_entry(devlink_sb, &devlink->sb_list, list) { + if (devlink_sb->index == sb_index) + return devlink_sb; + } + return NULL; +} + +static bool devlink_sb_index_exists(struct devlink *devlink, + unsigned int sb_index) +{ + return devlink_sb_get_by_index(devlink, sb_index); +} + +static struct devlink_sb *devlink_sb_get_from_attrs(struct devlink *devlink, + struct nlattr **attrs) +{ + if (attrs[DEVLINK_ATTR_SB_INDEX]) { + u32 sb_index = nla_get_u32(attrs[DEVLINK_ATTR_SB_INDEX]); + struct devlink_sb *devlink_sb; + + devlink_sb = devlink_sb_get_by_index(devlink, sb_index); + if (!devlink_sb) + return ERR_PTR(-ENODEV); + return devlink_sb; + } + return ERR_PTR(-EINVAL); +} + +static struct devlink_sb *devlink_sb_get_from_info(struct devlink *devlink, + struct genl_info *info) +{ + return devlink_sb_get_from_attrs(devlink, info->attrs); +} + +static int devlink_sb_pool_index_get_from_attrs(struct devlink_sb *devlink_sb, + struct nlattr **attrs, + u16 *p_pool_index) +{ + u16 val; + + if (!attrs[DEVLINK_ATTR_SB_POOL_INDEX]) + return -EINVAL; + + val = nla_get_u16(attrs[DEVLINK_ATTR_SB_POOL_INDEX]); + if (val >= devlink_sb_pool_count(devlink_sb)) + return -EINVAL; + *p_pool_index = val; + return 0; +} + +static int devlink_sb_pool_index_get_from_info(struct devlink_sb *devlink_sb, + struct genl_info *info, + u16 *p_pool_index) +{ + return devlink_sb_pool_index_get_from_attrs(devlink_sb, info->attrs, + p_pool_index); +} + +static int +devlink_sb_pool_type_get_from_attrs(struct nlattr **attrs, + enum devlink_sb_pool_type *p_pool_type) +{ + u8 val; + + if (!attrs[DEVLINK_ATTR_SB_POOL_TYPE]) + return -EINVAL; + + val = nla_get_u8(attrs[DEVLINK_ATTR_SB_POOL_TYPE]); + if (val != DEVLINK_SB_POOL_TYPE_INGRESS && + val != DEVLINK_SB_POOL_TYPE_EGRESS) + return -EINVAL; + *p_pool_type = val; + return 0; +} + +static int +devlink_sb_pool_type_get_from_info(struct genl_info *info, + enum devlink_sb_pool_type *p_pool_type) +{ + return devlink_sb_pool_type_get_from_attrs(info->attrs, p_pool_type); +} + +static int +devlink_sb_th_type_get_from_attrs(struct nlattr **attrs, + enum devlink_sb_threshold_type *p_th_type) +{ + u8 val; + + if (!attrs[DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE]) + return -EINVAL; + + val = nla_get_u8(attrs[DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE]); + if (val != DEVLINK_SB_THRESHOLD_TYPE_STATIC && + val != DEVLINK_SB_THRESHOLD_TYPE_DYNAMIC) + return -EINVAL; + *p_th_type = val; + return 0; +} + +static int +devlink_sb_th_type_get_from_info(struct genl_info *info, + enum devlink_sb_threshold_type *p_th_type) +{ + return devlink_sb_th_type_get_from_attrs(info->attrs, p_th_type); +} + +static int +devlink_sb_tc_index_get_from_attrs(struct devlink_sb *devlink_sb, 
+ struct nlattr **attrs, + enum devlink_sb_pool_type pool_type, + u16 *p_tc_index) +{ + u16 val; + + if (!attrs[DEVLINK_ATTR_SB_TC_INDEX]) + return -EINVAL; + + val = nla_get_u16(attrs[DEVLINK_ATTR_SB_TC_INDEX]); + if (pool_type == DEVLINK_SB_POOL_TYPE_INGRESS && + val >= devlink_sb->ingress_tc_count) + return -EINVAL; + if (pool_type == DEVLINK_SB_POOL_TYPE_EGRESS && + val >= devlink_sb->egress_tc_count) + return -EINVAL; + *p_tc_index = val; + return 0; +} + +static int +devlink_sb_tc_index_get_from_info(struct devlink_sb *devlink_sb, + struct genl_info *info, + enum devlink_sb_pool_type pool_type, + u16 *p_tc_index) +{ + return devlink_sb_tc_index_get_from_attrs(devlink_sb, info->attrs, + pool_type, p_tc_index); +} + +static int devlink_nl_sb_fill(struct sk_buff *msg, struct devlink *devlink, + struct devlink_sb *devlink_sb, + enum devlink_command cmd, u32 portid, + u32 seq, int flags) +{ + void *hdr; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + if (nla_put_u32(msg, DEVLINK_ATTR_SB_INDEX, devlink_sb->index)) + goto nla_put_failure; + if (nla_put_u32(msg, DEVLINK_ATTR_SB_SIZE, devlink_sb->size)) + goto nla_put_failure; + if (nla_put_u16(msg, DEVLINK_ATTR_SB_INGRESS_POOL_COUNT, + devlink_sb->ingress_pools_count)) + goto nla_put_failure; + if (nla_put_u16(msg, DEVLINK_ATTR_SB_EGRESS_POOL_COUNT, + devlink_sb->egress_pools_count)) + goto nla_put_failure; + if (nla_put_u16(msg, DEVLINK_ATTR_SB_INGRESS_TC_COUNT, + devlink_sb->ingress_tc_count)) + goto nla_put_failure; + if (nla_put_u16(msg, DEVLINK_ATTR_SB_EGRESS_TC_COUNT, + devlink_sb->egress_tc_count)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +int devlink_nl_sb_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_sb *devlink_sb; + struct sk_buff *msg; + int err; + + devlink_sb = devlink_sb_get_from_info(devlink, info); + if (IS_ERR(devlink_sb)) + return PTR_ERR(devlink_sb); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_sb_fill(msg, devlink, devlink_sb, + DEVLINK_CMD_SB_NEW, + info->snd_portid, info->snd_seq, 0); + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + +static int +devlink_nl_sb_get_dump_one(struct sk_buff *msg, struct devlink *devlink, + struct netlink_callback *cb, int flags) +{ + struct devlink_nl_dump_state *state = devlink_dump_state(cb); + struct devlink_sb *devlink_sb; + int idx = 0; + int err = 0; + + list_for_each_entry(devlink_sb, &devlink->sb_list, list) { + if (idx < state->idx) { + idx++; + continue; + } + err = devlink_nl_sb_fill(msg, devlink, devlink_sb, + DEVLINK_CMD_SB_NEW, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, flags); + if (err) { + state->idx = idx; + break; + } + idx++; + } + + return err; +} + +int devlink_nl_sb_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) +{ + return devlink_nl_dumpit(skb, cb, devlink_nl_sb_get_dump_one); +} + +static int devlink_nl_sb_pool_fill(struct sk_buff *msg, struct devlink *devlink, + struct devlink_sb *devlink_sb, + u16 pool_index, enum devlink_command cmd, + u32 portid, u32 seq, int flags) +{ + struct devlink_sb_pool_info pool_info; + void *hdr; + int err; + + err = devlink->ops->sb_pool_get(devlink, devlink_sb->index, + pool_index, &pool_info); + if (err) + return 
err; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + if (nla_put_u32(msg, DEVLINK_ATTR_SB_INDEX, devlink_sb->index)) + goto nla_put_failure; + if (nla_put_u16(msg, DEVLINK_ATTR_SB_POOL_INDEX, pool_index)) + goto nla_put_failure; + if (nla_put_u8(msg, DEVLINK_ATTR_SB_POOL_TYPE, pool_info.pool_type)) + goto nla_put_failure; + if (nla_put_u32(msg, DEVLINK_ATTR_SB_POOL_SIZE, pool_info.size)) + goto nla_put_failure; + if (nla_put_u8(msg, DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE, + pool_info.threshold_type)) + goto nla_put_failure; + if (nla_put_u32(msg, DEVLINK_ATTR_SB_POOL_CELL_SIZE, + pool_info.cell_size)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +int devlink_nl_sb_pool_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_sb *devlink_sb; + struct sk_buff *msg; + u16 pool_index; + int err; + + devlink_sb = devlink_sb_get_from_info(devlink, info); + if (IS_ERR(devlink_sb)) + return PTR_ERR(devlink_sb); + + err = devlink_sb_pool_index_get_from_info(devlink_sb, info, + &pool_index); + if (err) + return err; + + if (!devlink->ops->sb_pool_get) + return -EOPNOTSUPP; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_sb_pool_fill(msg, devlink, devlink_sb, pool_index, + DEVLINK_CMD_SB_POOL_NEW, + info->snd_portid, info->snd_seq, 0); + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + +static int __sb_pool_get_dumpit(struct sk_buff *msg, int start, int *p_idx, + struct devlink *devlink, + struct devlink_sb *devlink_sb, + u32 portid, u32 seq, int flags) +{ + u16 pool_count = devlink_sb_pool_count(devlink_sb); + u16 pool_index; + int err; + + for (pool_index = 0; pool_index < pool_count; pool_index++) { + if (*p_idx < start) { + (*p_idx)++; + continue; + } + err = devlink_nl_sb_pool_fill(msg, devlink, + devlink_sb, + pool_index, + DEVLINK_CMD_SB_POOL_NEW, + portid, seq, flags); + if (err) + return err; + (*p_idx)++; + } + return 0; +} + +static int +devlink_nl_sb_pool_get_dump_one(struct sk_buff *msg, struct devlink *devlink, + struct netlink_callback *cb, int flags) +{ + struct devlink_nl_dump_state *state = devlink_dump_state(cb); + struct devlink_sb *devlink_sb; + int err = 0; + int idx = 0; + + if (!devlink->ops->sb_pool_get) + return 0; + + list_for_each_entry(devlink_sb, &devlink->sb_list, list) { + err = __sb_pool_get_dumpit(msg, state->idx, &idx, + devlink, devlink_sb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, flags); + if (err == -EOPNOTSUPP) { + err = 0; + } else if (err) { + state->idx = idx; + break; + } + } + + return err; +} + +int devlink_nl_sb_pool_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return devlink_nl_dumpit(skb, cb, devlink_nl_sb_pool_get_dump_one); +} + +static int devlink_sb_pool_set(struct devlink *devlink, unsigned int sb_index, + u16 pool_index, u32 size, + enum devlink_sb_threshold_type threshold_type, + struct netlink_ext_ack *extack) + +{ + const struct devlink_ops *ops = devlink->ops; + + if (ops->sb_pool_set) + return ops->sb_pool_set(devlink, sb_index, pool_index, + size, threshold_type, extack); + return -EOPNOTSUPP; +} + +int devlink_nl_cmd_sb_pool_set_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + enum 
devlink_sb_threshold_type threshold_type; + struct devlink_sb *devlink_sb; + u16 pool_index; + u32 size; + int err; + + devlink_sb = devlink_sb_get_from_info(devlink, info); + if (IS_ERR(devlink_sb)) + return PTR_ERR(devlink_sb); + + err = devlink_sb_pool_index_get_from_info(devlink_sb, info, + &pool_index); + if (err) + return err; + + err = devlink_sb_th_type_get_from_info(info, &threshold_type); + if (err) + return err; + + if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_SB_POOL_SIZE)) + return -EINVAL; + + size = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_POOL_SIZE]); + return devlink_sb_pool_set(devlink, devlink_sb->index, + pool_index, size, threshold_type, + info->extack); +} + +static int devlink_nl_sb_port_pool_fill(struct sk_buff *msg, + struct devlink *devlink, + struct devlink_port *devlink_port, + struct devlink_sb *devlink_sb, + u16 pool_index, + enum devlink_command cmd, + u32 portid, u32 seq, int flags) +{ + const struct devlink_ops *ops = devlink->ops; + u32 threshold; + void *hdr; + int err; + + err = ops->sb_port_pool_get(devlink_port, devlink_sb->index, + pool_index, &threshold); + if (err) + return err; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index)) + goto nla_put_failure; + if (nla_put_u32(msg, DEVLINK_ATTR_SB_INDEX, devlink_sb->index)) + goto nla_put_failure; + if (nla_put_u16(msg, DEVLINK_ATTR_SB_POOL_INDEX, pool_index)) + goto nla_put_failure; + if (nla_put_u32(msg, DEVLINK_ATTR_SB_THRESHOLD, threshold)) + goto nla_put_failure; + + if (ops->sb_occ_port_pool_get) { + u32 cur; + u32 max; + + err = ops->sb_occ_port_pool_get(devlink_port, devlink_sb->index, + pool_index, &cur, &max); + if (err && err != -EOPNOTSUPP) + goto sb_occ_get_failure; + if (!err) { + if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_CUR, cur)) + goto nla_put_failure; + if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_MAX, max)) + goto nla_put_failure; + } + } + + genlmsg_end(msg, hdr); + return 0; + +nla_put_failure: + err = -EMSGSIZE; +sb_occ_get_failure: + genlmsg_cancel(msg, hdr); + return err; +} + +int devlink_nl_sb_port_pool_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink_port *devlink_port = info->user_ptr[1]; + struct devlink *devlink = devlink_port->devlink; + struct devlink_sb *devlink_sb; + struct sk_buff *msg; + u16 pool_index; + int err; + + devlink_sb = devlink_sb_get_from_info(devlink, info); + if (IS_ERR(devlink_sb)) + return PTR_ERR(devlink_sb); + + err = devlink_sb_pool_index_get_from_info(devlink_sb, info, + &pool_index); + if (err) + return err; + + if (!devlink->ops->sb_port_pool_get) + return -EOPNOTSUPP; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_sb_port_pool_fill(msg, devlink, devlink_port, + devlink_sb, pool_index, + DEVLINK_CMD_SB_PORT_POOL_NEW, + info->snd_portid, info->snd_seq, 0); + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + +static int __sb_port_pool_get_dumpit(struct sk_buff *msg, int start, int *p_idx, + struct devlink *devlink, + struct devlink_sb *devlink_sb, + u32 portid, u32 seq, int flags) +{ + struct devlink_port *devlink_port; + u16 pool_count = devlink_sb_pool_count(devlink_sb); + unsigned long port_index; + u16 pool_index; + int err; + + xa_for_each(&devlink->ports, port_index, devlink_port) { + for (pool_index = 0; pool_index < pool_count; 
pool_index++) { + if (*p_idx < start) { + (*p_idx)++; + continue; + } + err = devlink_nl_sb_port_pool_fill(msg, devlink, + devlink_port, + devlink_sb, + pool_index, + DEVLINK_CMD_SB_PORT_POOL_NEW, + portid, seq, flags); + if (err) + return err; + (*p_idx)++; + } + } + return 0; +} + +static int +devlink_nl_sb_port_pool_get_dump_one(struct sk_buff *msg, + struct devlink *devlink, + struct netlink_callback *cb, int flags) +{ + struct devlink_nl_dump_state *state = devlink_dump_state(cb); + struct devlink_sb *devlink_sb; + int idx = 0; + int err = 0; + + if (!devlink->ops->sb_port_pool_get) + return 0; + + list_for_each_entry(devlink_sb, &devlink->sb_list, list) { + err = __sb_port_pool_get_dumpit(msg, state->idx, &idx, + devlink, devlink_sb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, flags); + if (err == -EOPNOTSUPP) { + err = 0; + } else if (err) { + state->idx = idx; + break; + } + } + + return err; +} + +int devlink_nl_sb_port_pool_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return devlink_nl_dumpit(skb, cb, devlink_nl_sb_port_pool_get_dump_one); +} + +static int devlink_sb_port_pool_set(struct devlink_port *devlink_port, + unsigned int sb_index, u16 pool_index, + u32 threshold, + struct netlink_ext_ack *extack) + +{ + const struct devlink_ops *ops = devlink_port->devlink->ops; + + if (ops->sb_port_pool_set) + return ops->sb_port_pool_set(devlink_port, sb_index, + pool_index, threshold, extack); + return -EOPNOTSUPP; +} + +int devlink_nl_cmd_sb_port_pool_set_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink_port *devlink_port = info->user_ptr[1]; + struct devlink *devlink = info->user_ptr[0]; + struct devlink_sb *devlink_sb; + u16 pool_index; + u32 threshold; + int err; + + devlink_sb = devlink_sb_get_from_info(devlink, info); + if (IS_ERR(devlink_sb)) + return PTR_ERR(devlink_sb); + + err = devlink_sb_pool_index_get_from_info(devlink_sb, info, + &pool_index); + if (err) + return err; + + if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_SB_THRESHOLD)) + return -EINVAL; + + threshold = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_THRESHOLD]); + return devlink_sb_port_pool_set(devlink_port, devlink_sb->index, + pool_index, threshold, info->extack); +} + +static int +devlink_nl_sb_tc_pool_bind_fill(struct sk_buff *msg, struct devlink *devlink, + struct devlink_port *devlink_port, + struct devlink_sb *devlink_sb, u16 tc_index, + enum devlink_sb_pool_type pool_type, + enum devlink_command cmd, + u32 portid, u32 seq, int flags) +{ + const struct devlink_ops *ops = devlink->ops; + u16 pool_index; + u32 threshold; + void *hdr; + int err; + + err = ops->sb_tc_pool_bind_get(devlink_port, devlink_sb->index, + tc_index, pool_type, + &pool_index, &threshold); + if (err) + return err; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index)) + goto nla_put_failure; + if (nla_put_u32(msg, DEVLINK_ATTR_SB_INDEX, devlink_sb->index)) + goto nla_put_failure; + if (nla_put_u16(msg, DEVLINK_ATTR_SB_TC_INDEX, tc_index)) + goto nla_put_failure; + if (nla_put_u8(msg, DEVLINK_ATTR_SB_POOL_TYPE, pool_type)) + goto nla_put_failure; + if (nla_put_u16(msg, DEVLINK_ATTR_SB_POOL_INDEX, pool_index)) + goto nla_put_failure; + if (nla_put_u32(msg, DEVLINK_ATTR_SB_THRESHOLD, threshold)) + goto nla_put_failure; + + if (ops->sb_occ_tc_port_bind_get) { + u32 cur; + u32 max; + + err = 
ops->sb_occ_tc_port_bind_get(devlink_port, + devlink_sb->index, + tc_index, pool_type, + &cur, &max); + if (err && err != -EOPNOTSUPP) + return err; + if (!err) { + if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_CUR, cur)) + goto nla_put_failure; + if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_MAX, max)) + goto nla_put_failure; + } + } + + genlmsg_end(msg, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +int devlink_nl_sb_tc_pool_bind_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink_port *devlink_port = info->user_ptr[1]; + struct devlink *devlink = devlink_port->devlink; + struct devlink_sb *devlink_sb; + struct sk_buff *msg; + enum devlink_sb_pool_type pool_type; + u16 tc_index; + int err; + + devlink_sb = devlink_sb_get_from_info(devlink, info); + if (IS_ERR(devlink_sb)) + return PTR_ERR(devlink_sb); + + err = devlink_sb_pool_type_get_from_info(info, &pool_type); + if (err) + return err; + + err = devlink_sb_tc_index_get_from_info(devlink_sb, info, + pool_type, &tc_index); + if (err) + return err; + + if (!devlink->ops->sb_tc_pool_bind_get) + return -EOPNOTSUPP; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_sb_tc_pool_bind_fill(msg, devlink, devlink_port, + devlink_sb, tc_index, pool_type, + DEVLINK_CMD_SB_TC_POOL_BIND_NEW, + info->snd_portid, + info->snd_seq, 0); + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + +static int __sb_tc_pool_bind_get_dumpit(struct sk_buff *msg, + int start, int *p_idx, + struct devlink *devlink, + struct devlink_sb *devlink_sb, + u32 portid, u32 seq, int flags) +{ + struct devlink_port *devlink_port; + unsigned long port_index; + u16 tc_index; + int err; + + xa_for_each(&devlink->ports, port_index, devlink_port) { + for (tc_index = 0; + tc_index < devlink_sb->ingress_tc_count; tc_index++) { + if (*p_idx < start) { + (*p_idx)++; + continue; + } + err = devlink_nl_sb_tc_pool_bind_fill(msg, devlink, + devlink_port, + devlink_sb, + tc_index, + DEVLINK_SB_POOL_TYPE_INGRESS, + DEVLINK_CMD_SB_TC_POOL_BIND_NEW, + portid, seq, + flags); + if (err) + return err; + (*p_idx)++; + } + for (tc_index = 0; + tc_index < devlink_sb->egress_tc_count; tc_index++) { + if (*p_idx < start) { + (*p_idx)++; + continue; + } + err = devlink_nl_sb_tc_pool_bind_fill(msg, devlink, + devlink_port, + devlink_sb, + tc_index, + DEVLINK_SB_POOL_TYPE_EGRESS, + DEVLINK_CMD_SB_TC_POOL_BIND_NEW, + portid, seq, + flags); + if (err) + return err; + (*p_idx)++; + } + } + return 0; +} + +static int devlink_nl_sb_tc_pool_bind_get_dump_one(struct sk_buff *msg, + struct devlink *devlink, + struct netlink_callback *cb, + int flags) +{ + struct devlink_nl_dump_state *state = devlink_dump_state(cb); + struct devlink_sb *devlink_sb; + int idx = 0; + int err = 0; + + if (!devlink->ops->sb_tc_pool_bind_get) + return 0; + + list_for_each_entry(devlink_sb, &devlink->sb_list, list) { + err = __sb_tc_pool_bind_get_dumpit(msg, state->idx, &idx, + devlink, devlink_sb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, flags); + if (err == -EOPNOTSUPP) { + err = 0; + } else if (err) { + state->idx = idx; + break; + } + } + + return err; +} + +int devlink_nl_sb_tc_pool_bind_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return devlink_nl_dumpit(skb, cb, + devlink_nl_sb_tc_pool_bind_get_dump_one); +} + +static int devlink_sb_tc_pool_bind_set(struct devlink_port *devlink_port, + unsigned int sb_index, u16 tc_index, + enum 
devlink_sb_pool_type pool_type, + u16 pool_index, u32 threshold, + struct netlink_ext_ack *extack) + +{ + const struct devlink_ops *ops = devlink_port->devlink->ops; + + if (ops->sb_tc_pool_bind_set) + return ops->sb_tc_pool_bind_set(devlink_port, sb_index, + tc_index, pool_type, + pool_index, threshold, extack); + return -EOPNOTSUPP; +} + +int devlink_nl_cmd_sb_tc_pool_bind_set_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink_port *devlink_port = info->user_ptr[1]; + struct devlink *devlink = info->user_ptr[0]; + enum devlink_sb_pool_type pool_type; + struct devlink_sb *devlink_sb; + u16 tc_index; + u16 pool_index; + u32 threshold; + int err; + + devlink_sb = devlink_sb_get_from_info(devlink, info); + if (IS_ERR(devlink_sb)) + return PTR_ERR(devlink_sb); + + err = devlink_sb_pool_type_get_from_info(info, &pool_type); + if (err) + return err; + + err = devlink_sb_tc_index_get_from_info(devlink_sb, info, + pool_type, &tc_index); + if (err) + return err; + + err = devlink_sb_pool_index_get_from_info(devlink_sb, info, + &pool_index); + if (err) + return err; + + if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_SB_THRESHOLD)) + return -EINVAL; + + threshold = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_THRESHOLD]); + return devlink_sb_tc_pool_bind_set(devlink_port, devlink_sb->index, + tc_index, pool_type, + pool_index, threshold, info->extack); +} + +int devlink_nl_cmd_sb_occ_snapshot_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + const struct devlink_ops *ops = devlink->ops; + struct devlink_sb *devlink_sb; + + devlink_sb = devlink_sb_get_from_info(devlink, info); + if (IS_ERR(devlink_sb)) + return PTR_ERR(devlink_sb); + + if (ops->sb_occ_snapshot) + return ops->sb_occ_snapshot(devlink, devlink_sb->index); + return -EOPNOTSUPP; +} + +int devlink_nl_cmd_sb_occ_max_clear_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + const struct devlink_ops *ops = devlink->ops; + struct devlink_sb *devlink_sb; + + devlink_sb = devlink_sb_get_from_info(devlink, info); + if (IS_ERR(devlink_sb)) + return PTR_ERR(devlink_sb); + + if (ops->sb_occ_max_clear) + return ops->sb_occ_max_clear(devlink, devlink_sb->index); + return -EOPNOTSUPP; +} + +int devl_sb_register(struct devlink *devlink, unsigned int sb_index, + u32 size, u16 ingress_pools_count, + u16 egress_pools_count, u16 ingress_tc_count, + u16 egress_tc_count) +{ + struct devlink_sb *devlink_sb; + + lockdep_assert_held(&devlink->lock); + + if (devlink_sb_index_exists(devlink, sb_index)) + return -EEXIST; + + devlink_sb = kzalloc(sizeof(*devlink_sb), GFP_KERNEL); + if (!devlink_sb) + return -ENOMEM; + devlink_sb->index = sb_index; + devlink_sb->size = size; + devlink_sb->ingress_pools_count = ingress_pools_count; + devlink_sb->egress_pools_count = egress_pools_count; + devlink_sb->ingress_tc_count = ingress_tc_count; + devlink_sb->egress_tc_count = egress_tc_count; + list_add_tail(&devlink_sb->list, &devlink->sb_list); + return 0; +} +EXPORT_SYMBOL_GPL(devl_sb_register); + +int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, + u32 size, u16 ingress_pools_count, + u16 egress_pools_count, u16 ingress_tc_count, + u16 egress_tc_count) +{ + int err; + + devl_lock(devlink); + err = devl_sb_register(devlink, sb_index, size, ingress_pools_count, + egress_pools_count, ingress_tc_count, + egress_tc_count); + devl_unlock(devlink); + return err; +} +EXPORT_SYMBOL_GPL(devlink_sb_register); + +void devl_sb_unregister(struct 
devlink *devlink, unsigned int sb_index) +{ + struct devlink_sb *devlink_sb; + + lockdep_assert_held(&devlink->lock); + + devlink_sb = devlink_sb_get_by_index(devlink, sb_index); + WARN_ON(!devlink_sb); + list_del(&devlink_sb->list); + kfree(devlink_sb); +} +EXPORT_SYMBOL_GPL(devl_sb_unregister); + +void devlink_sb_unregister(struct devlink *devlink, unsigned int sb_index) +{ + devl_lock(devlink); + devl_sb_unregister(devlink, sb_index); + devl_unlock(devlink); +} +EXPORT_SYMBOL_GPL(devlink_sb_unregister); -- cgit v1.2.3 From 2475ed158c478c624d8fbc8d639d344a960c1ad8 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 28 Aug 2023 08:16:46 +0200 Subject: devlink: move and rename devlink_dpipe_send_and_alloc_skb() helper Since both dpipe and resource code is using this helper, in preparation for code split to separate files, move devlink_dpipe_send_and_alloc_skb() helper into netlink.c. Rename it on the way. Signed-off-by: Jiri Pirko Link: https://lore.kernel.org/r/20230828061657.300667-5-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- net/devlink/devl_internal.h | 2 ++ net/devlink/leftover.c | 34 +++++++++------------------------- net/devlink/netlink.c | 15 +++++++++++++++ 3 files changed, 26 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/devlink/devl_internal.h b/net/devlink/devl_internal.h index e1a6b7a763b8..66c94957e96c 100644 --- a/net/devlink/devl_internal.h +++ b/net/devlink/devl_internal.h @@ -150,6 +150,8 @@ devlink_nl_put_handle(struct sk_buff *msg, struct devlink *devlink) return 0; } +int devlink_nl_msg_reply_and_new(struct sk_buff **msg, struct genl_info *info); + /* Notify */ void devlink_notify(struct devlink *devlink, enum devlink_command cmd); void devlink_ports_notify_register(struct devlink *devlink); diff --git a/net/devlink/leftover.c b/net/devlink/leftover.c index 795bfdd41103..1bcd4192099e 100644 --- a/net/devlink/leftover.c +++ b/net/devlink/leftover.c @@ -1277,22 +1277,6 @@ nla_put_failure: return -EMSGSIZE; } -static int devlink_dpipe_send_and_alloc_skb(struct sk_buff **pskb, - struct genl_info *info) -{ - int err; - - if (*pskb) { - err = genlmsg_reply(*pskb, info); - if (err) - return err; - } - *pskb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!*pskb) - return -ENOMEM; - return 0; -} - static int devlink_dpipe_tables_fill(struct genl_info *info, enum devlink_command cmd, int flags, struct list_head *dpipe_tables, @@ -1311,7 +1295,7 @@ static int devlink_dpipe_tables_fill(struct genl_info *info, table = list_first_entry(dpipe_tables, struct devlink_dpipe_table, list); start_again: - err = devlink_dpipe_send_and_alloc_skb(&skb, info); + err = devlink_nl_msg_reply_and_new(&skb, info); if (err) return err; @@ -1358,7 +1342,7 @@ send_done: nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq, NLMSG_DONE, 0, flags | NLM_F_MULTI); if (!nlh) { - err = devlink_dpipe_send_and_alloc_skb(&skb, info); + err = devlink_nl_msg_reply_and_new(&skb, info); if (err) return err; goto send_done; @@ -1551,8 +1535,8 @@ int devlink_dpipe_entry_ctx_prepare(struct devlink_dpipe_dump_ctx *dump_ctx) struct devlink *devlink; int err; - err = devlink_dpipe_send_and_alloc_skb(&dump_ctx->skb, - dump_ctx->info); + err = devlink_nl_msg_reply_and_new(&dump_ctx->skb, + dump_ctx->info); if (err) return err; @@ -1638,7 +1622,7 @@ send_done: nlh = nlmsg_put(dump_ctx.skb, info->snd_portid, info->snd_seq, NLMSG_DONE, 0, flags | NLM_F_MULTI); if (!nlh) { - err = devlink_dpipe_send_and_alloc_skb(&dump_ctx.skb, info); + err = devlink_nl_msg_reply_and_new(&dump_ctx.skb, 
info); if (err) return err; goto send_done; @@ -1746,7 +1730,7 @@ static int devlink_dpipe_headers_fill(struct genl_info *info, i = 0; start_again: - err = devlink_dpipe_send_and_alloc_skb(&skb, info); + err = devlink_nl_msg_reply_and_new(&skb, info); if (err) return err; @@ -1782,7 +1766,7 @@ send_done: nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq, NLMSG_DONE, 0, flags | NLM_F_MULTI); if (!nlh) { - err = devlink_dpipe_send_and_alloc_skb(&skb, info); + err = devlink_nl_msg_reply_and_new(&skb, info); if (err) return err; goto send_done; @@ -2047,7 +2031,7 @@ static int devlink_resource_fill(struct genl_info *info, resource = list_first_entry(&devlink->resource_list, struct devlink_resource, list); start_again: - err = devlink_dpipe_send_and_alloc_skb(&skb, info); + err = devlink_nl_msg_reply_and_new(&skb, info); if (err) return err; @@ -2086,7 +2070,7 @@ send_done: nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq, NLMSG_DONE, 0, flags | NLM_F_MULTI); if (!nlh) { - err = devlink_dpipe_send_and_alloc_skb(&skb, info); + err = devlink_nl_msg_reply_and_new(&skb, info); if (err) return err; goto send_done; diff --git a/net/devlink/netlink.c b/net/devlink/netlink.c index 72a5005a64cd..5f57afd31dea 100644 --- a/net/devlink/netlink.c +++ b/net/devlink/netlink.c @@ -82,6 +82,21 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_REGION_DIRECT] = { .type = NLA_FLAG }, }; +int devlink_nl_msg_reply_and_new(struct sk_buff **msg, struct genl_info *info) +{ + int err; + + if (*msg) { + err = genlmsg_reply(*msg, info); + if (err) + return err; + } + *msg = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!*msg) + return -ENOMEM; + return 0; +} + struct devlink * devlink_get_from_attrs_lock(struct net *net, struct nlattr **attrs) { -- cgit v1.2.3 From a9fd44b15fc5625ee0a38661e773444f6bf31b3d Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 28 Aug 2023 08:16:47 +0200 Subject: devlink: push dpipe related code into separate file Cut out another chunk from leftover.c and put dpipe related code into a separate file. 
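[Editor's note, not part of the patch] For readers following the code movement in this series: the fill helpers being relocated (devlink_dpipe_tables_fill(), devlink_dpipe_headers_fill(), devlink_resource_fill()) all share the same multi-part reply shape built around the reply-and-allocate helper shown above. Below is a condensed, illustrative sketch of that pattern only; example_tables_fill() is a made-up name, most arguments and the object-filling loop are elided, and this is not code added by any patch in this series.

	/*
	 * Illustrative only: the start_again/send_done shape used by the
	 * devlink fill routines. Each pass builds one skb; when it fills
	 * up, devlink_nl_msg_reply_and_new() sends it and hands back a
	 * fresh skb so filling can continue where it left off.
	 */
	static int example_tables_fill(struct genl_info *info)
	{
		struct sk_buff *skb = NULL;
		struct nlmsghdr *nlh;
		int err;

	start_again:
		/* Flush any partially built message, then allocate a new one. */
		err = devlink_nl_msg_reply_and_new(&skb, info);
		if (err)
			return err;

		/* ... genlmsg_put() the header and fill as many objects as fit;
		 * if an object does not fit, genlmsg_end() the skb and
		 * goto start_again to continue on a fresh one ...
		 */

	send_done:
		nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq,
				NLMSG_DONE, 0, NLM_F_MULTI);
		if (!nlh) {
			/* Even NLMSG_DONE may not fit: send and retry on a new skb. */
			err = devlink_nl_msg_reply_and_new(&skb, info);
			if (err)
				return err;
			goto send_done;
		}
		return genlmsg_reply(skb, info);
	}

The per-file split below simply moves callers of this pattern; the pattern itself is unchanged.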
Signed-off-by: Jiri Pirko Link: https://lore.kernel.org/r/20230828061657.300667-6-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- net/devlink/Makefile | 3 +- net/devlink/devl_internal.h | 7 + net/devlink/dpipe.c | 917 ++++++++++++++++++++++++++++++++++++++++++++ net/devlink/leftover.c | 894 ------------------------------------------ 4 files changed, 926 insertions(+), 895 deletions(-) create mode 100644 net/devlink/dpipe.c (limited to 'net') diff --git a/net/devlink/Makefile b/net/devlink/Makefile index dcc508af48a9..122de1d565d2 100644 --- a/net/devlink/Makefile +++ b/net/devlink/Makefile @@ -1,3 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -obj-y := leftover.o core.o netlink.o netlink_gen.o dev.o port.o sb.o health.o \ +obj-y := leftover.o core.o netlink.o netlink_gen.o dev.o port.o sb.o dpipe.o \ + health.o diff --git a/net/devlink/devl_internal.h b/net/devlink/devl_internal.h index 66c94957e96c..72653e1dae80 100644 --- a/net/devlink/devl_internal.h +++ b/net/devlink/devl_internal.h @@ -231,6 +231,13 @@ int devlink_nl_cmd_sb_occ_snapshot_doit(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_sb_occ_max_clear_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_cmd_dpipe_table_get(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_cmd_dpipe_entries_get(struct sk_buff *skb, + struct genl_info *info); +int devlink_nl_cmd_dpipe_headers_get(struct sk_buff *skb, + struct genl_info *info); +int devlink_nl_cmd_dpipe_table_counters_set(struct sk_buff *skb, + struct genl_info *info); int devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_health_reporter_recover_doit(struct sk_buff *skb, diff --git a/net/devlink/dpipe.c b/net/devlink/dpipe.c new file mode 100644 index 000000000000..431227c412e5 --- /dev/null +++ b/net/devlink/dpipe.c @@ -0,0 +1,917 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. 
+ * Copyright (c) 2016 Jiri Pirko + */ + +#include "devl_internal.h" + +static struct devlink_dpipe_field devlink_dpipe_fields_ethernet[] = { + { + .name = "destination mac", + .id = DEVLINK_DPIPE_FIELD_ETHERNET_DST_MAC, + .bitwidth = 48, + }, +}; + +struct devlink_dpipe_header devlink_dpipe_header_ethernet = { + .name = "ethernet", + .id = DEVLINK_DPIPE_HEADER_ETHERNET, + .fields = devlink_dpipe_fields_ethernet, + .fields_count = ARRAY_SIZE(devlink_dpipe_fields_ethernet), + .global = true, +}; +EXPORT_SYMBOL_GPL(devlink_dpipe_header_ethernet); + +static struct devlink_dpipe_field devlink_dpipe_fields_ipv4[] = { + { + .name = "destination ip", + .id = DEVLINK_DPIPE_FIELD_IPV4_DST_IP, + .bitwidth = 32, + }, +}; + +struct devlink_dpipe_header devlink_dpipe_header_ipv4 = { + .name = "ipv4", + .id = DEVLINK_DPIPE_HEADER_IPV4, + .fields = devlink_dpipe_fields_ipv4, + .fields_count = ARRAY_SIZE(devlink_dpipe_fields_ipv4), + .global = true, +}; +EXPORT_SYMBOL_GPL(devlink_dpipe_header_ipv4); + +static struct devlink_dpipe_field devlink_dpipe_fields_ipv6[] = { + { + .name = "destination ip", + .id = DEVLINK_DPIPE_FIELD_IPV6_DST_IP, + .bitwidth = 128, + }, +}; + +struct devlink_dpipe_header devlink_dpipe_header_ipv6 = { + .name = "ipv6", + .id = DEVLINK_DPIPE_HEADER_IPV6, + .fields = devlink_dpipe_fields_ipv6, + .fields_count = ARRAY_SIZE(devlink_dpipe_fields_ipv6), + .global = true, +}; +EXPORT_SYMBOL_GPL(devlink_dpipe_header_ipv6); + +int devlink_dpipe_match_put(struct sk_buff *skb, + struct devlink_dpipe_match *match) +{ + struct devlink_dpipe_header *header = match->header; + struct devlink_dpipe_field *field = &header->fields[match->field_id]; + struct nlattr *match_attr; + + match_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_DPIPE_MATCH); + if (!match_attr) + return -EMSGSIZE; + + if (nla_put_u32(skb, DEVLINK_ATTR_DPIPE_MATCH_TYPE, match->type) || + nla_put_u32(skb, DEVLINK_ATTR_DPIPE_HEADER_INDEX, match->header_index) || + nla_put_u32(skb, DEVLINK_ATTR_DPIPE_HEADER_ID, header->id) || + nla_put_u32(skb, DEVLINK_ATTR_DPIPE_FIELD_ID, field->id) || + nla_put_u8(skb, DEVLINK_ATTR_DPIPE_HEADER_GLOBAL, header->global)) + goto nla_put_failure; + + nla_nest_end(skb, match_attr); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, match_attr); + return -EMSGSIZE; +} +EXPORT_SYMBOL_GPL(devlink_dpipe_match_put); + +static int devlink_dpipe_matches_put(struct devlink_dpipe_table *table, + struct sk_buff *skb) +{ + struct nlattr *matches_attr; + + matches_attr = nla_nest_start_noflag(skb, + DEVLINK_ATTR_DPIPE_TABLE_MATCHES); + if (!matches_attr) + return -EMSGSIZE; + + if (table->table_ops->matches_dump(table->priv, skb)) + goto nla_put_failure; + + nla_nest_end(skb, matches_attr); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, matches_attr); + return -EMSGSIZE; +} + +int devlink_dpipe_action_put(struct sk_buff *skb, + struct devlink_dpipe_action *action) +{ + struct devlink_dpipe_header *header = action->header; + struct devlink_dpipe_field *field = &header->fields[action->field_id]; + struct nlattr *action_attr; + + action_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_DPIPE_ACTION); + if (!action_attr) + return -EMSGSIZE; + + if (nla_put_u32(skb, DEVLINK_ATTR_DPIPE_ACTION_TYPE, action->type) || + nla_put_u32(skb, DEVLINK_ATTR_DPIPE_HEADER_INDEX, action->header_index) || + nla_put_u32(skb, DEVLINK_ATTR_DPIPE_HEADER_ID, header->id) || + nla_put_u32(skb, DEVLINK_ATTR_DPIPE_FIELD_ID, field->id) || + nla_put_u8(skb, DEVLINK_ATTR_DPIPE_HEADER_GLOBAL, header->global)) + goto nla_put_failure; + + 
nla_nest_end(skb, action_attr); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, action_attr); + return -EMSGSIZE; +} +EXPORT_SYMBOL_GPL(devlink_dpipe_action_put); + +static int devlink_dpipe_actions_put(struct devlink_dpipe_table *table, + struct sk_buff *skb) +{ + struct nlattr *actions_attr; + + actions_attr = nla_nest_start_noflag(skb, + DEVLINK_ATTR_DPIPE_TABLE_ACTIONS); + if (!actions_attr) + return -EMSGSIZE; + + if (table->table_ops->actions_dump(table->priv, skb)) + goto nla_put_failure; + + nla_nest_end(skb, actions_attr); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, actions_attr); + return -EMSGSIZE; +} + +static int devlink_dpipe_table_put(struct sk_buff *skb, + struct devlink_dpipe_table *table) +{ + struct nlattr *table_attr; + u64 table_size; + + table_size = table->table_ops->size_get(table->priv); + table_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_DPIPE_TABLE); + if (!table_attr) + return -EMSGSIZE; + + if (nla_put_string(skb, DEVLINK_ATTR_DPIPE_TABLE_NAME, table->name) || + nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_TABLE_SIZE, table_size, + DEVLINK_ATTR_PAD)) + goto nla_put_failure; + if (nla_put_u8(skb, DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED, + table->counters_enabled)) + goto nla_put_failure; + + if (table->resource_valid) { + if (nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_ID, + table->resource_id, DEVLINK_ATTR_PAD) || + nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_UNITS, + table->resource_units, DEVLINK_ATTR_PAD)) + goto nla_put_failure; + } + if (devlink_dpipe_matches_put(table, skb)) + goto nla_put_failure; + + if (devlink_dpipe_actions_put(table, skb)) + goto nla_put_failure; + + nla_nest_end(skb, table_attr); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, table_attr); + return -EMSGSIZE; +} + +static int devlink_dpipe_send_and_alloc_skb(struct sk_buff **pskb, + struct genl_info *info) +{ + int err; + + if (*pskb) { + err = genlmsg_reply(*pskb, info); + if (err) + return err; + } + *pskb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!*pskb) + return -ENOMEM; + return 0; +} + +static int devlink_dpipe_tables_fill(struct genl_info *info, + enum devlink_command cmd, int flags, + struct list_head *dpipe_tables, + const char *table_name) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_dpipe_table *table; + struct nlattr *tables_attr; + struct sk_buff *skb = NULL; + struct nlmsghdr *nlh; + bool incomplete; + void *hdr; + int i; + int err; + + table = list_first_entry(dpipe_tables, + struct devlink_dpipe_table, list); +start_again: + err = devlink_dpipe_send_and_alloc_skb(&skb, info); + if (err) + return err; + + hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq, + &devlink_nl_family, NLM_F_MULTI, cmd); + if (!hdr) { + nlmsg_free(skb); + return -EMSGSIZE; + } + + if (devlink_nl_put_handle(skb, devlink)) + goto nla_put_failure; + tables_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_DPIPE_TABLES); + if (!tables_attr) + goto nla_put_failure; + + i = 0; + incomplete = false; + list_for_each_entry_from(table, dpipe_tables, list) { + if (!table_name) { + err = devlink_dpipe_table_put(skb, table); + if (err) { + if (!i) + goto err_table_put; + incomplete = true; + break; + } + } else { + if (!strcmp(table->name, table_name)) { + err = devlink_dpipe_table_put(skb, table); + if (err) + break; + } + } + i++; + } + + nla_nest_end(skb, tables_attr); + genlmsg_end(skb, hdr); + if (incomplete) + goto start_again; + +send_done: + nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq, + NLMSG_DONE, 
0, flags | NLM_F_MULTI); + if (!nlh) { + err = devlink_dpipe_send_and_alloc_skb(&skb, info); + if (err) + return err; + goto send_done; + } + + return genlmsg_reply(skb, info); + +nla_put_failure: + err = -EMSGSIZE; +err_table_put: + nlmsg_free(skb); + return err; +} + +int devlink_nl_cmd_dpipe_table_get(struct sk_buff *skb, struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + const char *table_name = NULL; + + if (info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME]) + table_name = nla_data(info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME]); + + return devlink_dpipe_tables_fill(info, DEVLINK_CMD_DPIPE_TABLE_GET, 0, + &devlink->dpipe_table_list, + table_name); +} + +static int devlink_dpipe_value_put(struct sk_buff *skb, + struct devlink_dpipe_value *value) +{ + if (nla_put(skb, DEVLINK_ATTR_DPIPE_VALUE, + value->value_size, value->value)) + return -EMSGSIZE; + if (value->mask) + if (nla_put(skb, DEVLINK_ATTR_DPIPE_VALUE_MASK, + value->value_size, value->mask)) + return -EMSGSIZE; + if (value->mapping_valid) + if (nla_put_u32(skb, DEVLINK_ATTR_DPIPE_VALUE_MAPPING, + value->mapping_value)) + return -EMSGSIZE; + return 0; +} + +static int devlink_dpipe_action_value_put(struct sk_buff *skb, + struct devlink_dpipe_value *value) +{ + if (!value->action) + return -EINVAL; + if (devlink_dpipe_action_put(skb, value->action)) + return -EMSGSIZE; + if (devlink_dpipe_value_put(skb, value)) + return -EMSGSIZE; + return 0; +} + +static int devlink_dpipe_action_values_put(struct sk_buff *skb, + struct devlink_dpipe_value *values, + unsigned int values_count) +{ + struct nlattr *action_attr; + int i; + int err; + + for (i = 0; i < values_count; i++) { + action_attr = nla_nest_start_noflag(skb, + DEVLINK_ATTR_DPIPE_ACTION_VALUE); + if (!action_attr) + return -EMSGSIZE; + err = devlink_dpipe_action_value_put(skb, &values[i]); + if (err) + goto err_action_value_put; + nla_nest_end(skb, action_attr); + } + return 0; + +err_action_value_put: + nla_nest_cancel(skb, action_attr); + return err; +} + +static int devlink_dpipe_match_value_put(struct sk_buff *skb, + struct devlink_dpipe_value *value) +{ + if (!value->match) + return -EINVAL; + if (devlink_dpipe_match_put(skb, value->match)) + return -EMSGSIZE; + if (devlink_dpipe_value_put(skb, value)) + return -EMSGSIZE; + return 0; +} + +static int devlink_dpipe_match_values_put(struct sk_buff *skb, + struct devlink_dpipe_value *values, + unsigned int values_count) +{ + struct nlattr *match_attr; + int i; + int err; + + for (i = 0; i < values_count; i++) { + match_attr = nla_nest_start_noflag(skb, + DEVLINK_ATTR_DPIPE_MATCH_VALUE); + if (!match_attr) + return -EMSGSIZE; + err = devlink_dpipe_match_value_put(skb, &values[i]); + if (err) + goto err_match_value_put; + nla_nest_end(skb, match_attr); + } + return 0; + +err_match_value_put: + nla_nest_cancel(skb, match_attr); + return err; +} + +static int devlink_dpipe_entry_put(struct sk_buff *skb, + struct devlink_dpipe_entry *entry) +{ + struct nlattr *entry_attr, *matches_attr, *actions_attr; + int err; + + entry_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_DPIPE_ENTRY); + if (!entry_attr) + return -EMSGSIZE; + + if (nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_ENTRY_INDEX, entry->index, + DEVLINK_ATTR_PAD)) + goto nla_put_failure; + if (entry->counter_valid) + if (nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_ENTRY_COUNTER, + entry->counter, DEVLINK_ATTR_PAD)) + goto nla_put_failure; + + matches_attr = nla_nest_start_noflag(skb, + DEVLINK_ATTR_DPIPE_ENTRY_MATCH_VALUES); + if (!matches_attr) + goto 
nla_put_failure; + + err = devlink_dpipe_match_values_put(skb, entry->match_values, + entry->match_values_count); + if (err) { + nla_nest_cancel(skb, matches_attr); + goto err_match_values_put; + } + nla_nest_end(skb, matches_attr); + + actions_attr = nla_nest_start_noflag(skb, + DEVLINK_ATTR_DPIPE_ENTRY_ACTION_VALUES); + if (!actions_attr) + goto nla_put_failure; + + err = devlink_dpipe_action_values_put(skb, entry->action_values, + entry->action_values_count); + if (err) { + nla_nest_cancel(skb, actions_attr); + goto err_action_values_put; + } + nla_nest_end(skb, actions_attr); + + nla_nest_end(skb, entry_attr); + return 0; + +nla_put_failure: + err = -EMSGSIZE; +err_match_values_put: +err_action_values_put: + nla_nest_cancel(skb, entry_attr); + return err; +} + +static struct devlink_dpipe_table * +devlink_dpipe_table_find(struct list_head *dpipe_tables, + const char *table_name, struct devlink *devlink) +{ + struct devlink_dpipe_table *table; + + list_for_each_entry_rcu(table, dpipe_tables, list, + lockdep_is_held(&devlink->lock)) { + if (!strcmp(table->name, table_name)) + return table; + } + return NULL; +} + +int devlink_dpipe_entry_ctx_prepare(struct devlink_dpipe_dump_ctx *dump_ctx) +{ + struct devlink *devlink; + int err; + + err = devlink_dpipe_send_and_alloc_skb(&dump_ctx->skb, + dump_ctx->info); + if (err) + return err; + + dump_ctx->hdr = genlmsg_put(dump_ctx->skb, + dump_ctx->info->snd_portid, + dump_ctx->info->snd_seq, + &devlink_nl_family, NLM_F_MULTI, + dump_ctx->cmd); + if (!dump_ctx->hdr) + goto nla_put_failure; + + devlink = dump_ctx->info->user_ptr[0]; + if (devlink_nl_put_handle(dump_ctx->skb, devlink)) + goto nla_put_failure; + dump_ctx->nest = nla_nest_start_noflag(dump_ctx->skb, + DEVLINK_ATTR_DPIPE_ENTRIES); + if (!dump_ctx->nest) + goto nla_put_failure; + return 0; + +nla_put_failure: + nlmsg_free(dump_ctx->skb); + return -EMSGSIZE; +} +EXPORT_SYMBOL_GPL(devlink_dpipe_entry_ctx_prepare); + +int devlink_dpipe_entry_ctx_append(struct devlink_dpipe_dump_ctx *dump_ctx, + struct devlink_dpipe_entry *entry) +{ + return devlink_dpipe_entry_put(dump_ctx->skb, entry); +} +EXPORT_SYMBOL_GPL(devlink_dpipe_entry_ctx_append); + +int devlink_dpipe_entry_ctx_close(struct devlink_dpipe_dump_ctx *dump_ctx) +{ + nla_nest_end(dump_ctx->skb, dump_ctx->nest); + genlmsg_end(dump_ctx->skb, dump_ctx->hdr); + return 0; +} +EXPORT_SYMBOL_GPL(devlink_dpipe_entry_ctx_close); + +void devlink_dpipe_entry_clear(struct devlink_dpipe_entry *entry) + +{ + unsigned int value_count, value_index; + struct devlink_dpipe_value *value; + + value = entry->action_values; + value_count = entry->action_values_count; + for (value_index = 0; value_index < value_count; value_index++) { + kfree(value[value_index].value); + kfree(value[value_index].mask); + } + + value = entry->match_values; + value_count = entry->match_values_count; + for (value_index = 0; value_index < value_count; value_index++) { + kfree(value[value_index].value); + kfree(value[value_index].mask); + } +} +EXPORT_SYMBOL_GPL(devlink_dpipe_entry_clear); + +static int devlink_dpipe_entries_fill(struct genl_info *info, + enum devlink_command cmd, int flags, + struct devlink_dpipe_table *table) +{ + struct devlink_dpipe_dump_ctx dump_ctx; + struct nlmsghdr *nlh; + int err; + + dump_ctx.skb = NULL; + dump_ctx.cmd = cmd; + dump_ctx.info = info; + + err = table->table_ops->entries_dump(table->priv, + table->counters_enabled, + &dump_ctx); + if (err) + return err; + +send_done: + nlh = nlmsg_put(dump_ctx.skb, info->snd_portid, info->snd_seq, + 
NLMSG_DONE, 0, flags | NLM_F_MULTI); + if (!nlh) { + err = devlink_dpipe_send_and_alloc_skb(&dump_ctx.skb, info); + if (err) + return err; + goto send_done; + } + return genlmsg_reply(dump_ctx.skb, info); +} + +int devlink_nl_cmd_dpipe_entries_get(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_dpipe_table *table; + const char *table_name; + + if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_DPIPE_TABLE_NAME)) + return -EINVAL; + + table_name = nla_data(info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME]); + table = devlink_dpipe_table_find(&devlink->dpipe_table_list, + table_name, devlink); + if (!table) + return -EINVAL; + + if (!table->table_ops->entries_dump) + return -EINVAL; + + return devlink_dpipe_entries_fill(info, DEVLINK_CMD_DPIPE_ENTRIES_GET, + 0, table); +} + +static int devlink_dpipe_fields_put(struct sk_buff *skb, + const struct devlink_dpipe_header *header) +{ + struct devlink_dpipe_field *field; + struct nlattr *field_attr; + int i; + + for (i = 0; i < header->fields_count; i++) { + field = &header->fields[i]; + field_attr = nla_nest_start_noflag(skb, + DEVLINK_ATTR_DPIPE_FIELD); + if (!field_attr) + return -EMSGSIZE; + if (nla_put_string(skb, DEVLINK_ATTR_DPIPE_FIELD_NAME, field->name) || + nla_put_u32(skb, DEVLINK_ATTR_DPIPE_FIELD_ID, field->id) || + nla_put_u32(skb, DEVLINK_ATTR_DPIPE_FIELD_BITWIDTH, field->bitwidth) || + nla_put_u32(skb, DEVLINK_ATTR_DPIPE_FIELD_MAPPING_TYPE, field->mapping_type)) + goto nla_put_failure; + nla_nest_end(skb, field_attr); + } + return 0; + +nla_put_failure: + nla_nest_cancel(skb, field_attr); + return -EMSGSIZE; +} + +static int devlink_dpipe_header_put(struct sk_buff *skb, + struct devlink_dpipe_header *header) +{ + struct nlattr *fields_attr, *header_attr; + int err; + + header_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_DPIPE_HEADER); + if (!header_attr) + return -EMSGSIZE; + + if (nla_put_string(skb, DEVLINK_ATTR_DPIPE_HEADER_NAME, header->name) || + nla_put_u32(skb, DEVLINK_ATTR_DPIPE_HEADER_ID, header->id) || + nla_put_u8(skb, DEVLINK_ATTR_DPIPE_HEADER_GLOBAL, header->global)) + goto nla_put_failure; + + fields_attr = nla_nest_start_noflag(skb, + DEVLINK_ATTR_DPIPE_HEADER_FIELDS); + if (!fields_attr) + goto nla_put_failure; + + err = devlink_dpipe_fields_put(skb, header); + if (err) { + nla_nest_cancel(skb, fields_attr); + goto nla_put_failure; + } + nla_nest_end(skb, fields_attr); + nla_nest_end(skb, header_attr); + return 0; + +nla_put_failure: + err = -EMSGSIZE; + nla_nest_cancel(skb, header_attr); + return err; +} + +static int devlink_dpipe_headers_fill(struct genl_info *info, + enum devlink_command cmd, int flags, + struct devlink_dpipe_headers * + dpipe_headers) +{ + struct devlink *devlink = info->user_ptr[0]; + struct nlattr *headers_attr; + struct sk_buff *skb = NULL; + struct nlmsghdr *nlh; + void *hdr; + int i, j; + int err; + + i = 0; +start_again: + err = devlink_dpipe_send_and_alloc_skb(&skb, info); + if (err) + return err; + + hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq, + &devlink_nl_family, NLM_F_MULTI, cmd); + if (!hdr) { + nlmsg_free(skb); + return -EMSGSIZE; + } + + if (devlink_nl_put_handle(skb, devlink)) + goto nla_put_failure; + headers_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_DPIPE_HEADERS); + if (!headers_attr) + goto nla_put_failure; + + j = 0; + for (; i < dpipe_headers->headers_count; i++) { + err = devlink_dpipe_header_put(skb, dpipe_headers->headers[i]); + if (err) { + if (!j) + goto err_table_put; + break; + } + j++; + } + 
nla_nest_end(skb, headers_attr); + genlmsg_end(skb, hdr); + if (i != dpipe_headers->headers_count) + goto start_again; + +send_done: + nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq, + NLMSG_DONE, 0, flags | NLM_F_MULTI); + if (!nlh) { + err = devlink_dpipe_send_and_alloc_skb(&skb, info); + if (err) + return err; + goto send_done; + } + return genlmsg_reply(skb, info); + +nla_put_failure: + err = -EMSGSIZE; +err_table_put: + nlmsg_free(skb); + return err; +} + +int devlink_nl_cmd_dpipe_headers_get(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + + if (!devlink->dpipe_headers) + return -EOPNOTSUPP; + return devlink_dpipe_headers_fill(info, DEVLINK_CMD_DPIPE_HEADERS_GET, + 0, devlink->dpipe_headers); +} + +static int devlink_dpipe_table_counters_set(struct devlink *devlink, + const char *table_name, + bool enable) +{ + struct devlink_dpipe_table *table; + + table = devlink_dpipe_table_find(&devlink->dpipe_table_list, + table_name, devlink); + if (!table) + return -EINVAL; + + if (table->counter_control_extern) + return -EOPNOTSUPP; + + if (!(table->counters_enabled ^ enable)) + return 0; + + table->counters_enabled = enable; + if (table->table_ops->counters_set_update) + table->table_ops->counters_set_update(table->priv, enable); + return 0; +} + +int devlink_nl_cmd_dpipe_table_counters_set(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + const char *table_name; + bool counters_enable; + + if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_DPIPE_TABLE_NAME) || + GENL_REQ_ATTR_CHECK(info, + DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED)) + return -EINVAL; + + table_name = nla_data(info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME]); + counters_enable = !!nla_get_u8(info->attrs[DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED]); + + return devlink_dpipe_table_counters_set(devlink, table_name, + counters_enable); +} + +/** + * devl_dpipe_headers_register - register dpipe headers + * + * @devlink: devlink + * @dpipe_headers: dpipe header array + * + * Register the headers supported by hardware. + */ +void devl_dpipe_headers_register(struct devlink *devlink, + struct devlink_dpipe_headers *dpipe_headers) +{ + lockdep_assert_held(&devlink->lock); + + devlink->dpipe_headers = dpipe_headers; +} +EXPORT_SYMBOL_GPL(devl_dpipe_headers_register); + +/** + * devl_dpipe_headers_unregister - unregister dpipe headers + * + * @devlink: devlink + * + * Unregister the headers supported by hardware. + */ +void devl_dpipe_headers_unregister(struct devlink *devlink) +{ + lockdep_assert_held(&devlink->lock); + + devlink->dpipe_headers = NULL; +} +EXPORT_SYMBOL_GPL(devl_dpipe_headers_unregister); + +/** + * devlink_dpipe_table_counter_enabled - check if counter allocation + * required + * @devlink: devlink + * @table_name: tables name + * + * Used by driver to check if counter allocation is required. + * After counter allocation is turned on the table entries + * are updated to include counter statistics. + * + * After that point on the driver must respect the counter + * state so that each entry added to the table is added + * with a counter. 
+ */ +bool devlink_dpipe_table_counter_enabled(struct devlink *devlink, + const char *table_name) +{ + struct devlink_dpipe_table *table; + bool enabled; + + rcu_read_lock(); + table = devlink_dpipe_table_find(&devlink->dpipe_table_list, + table_name, devlink); + enabled = false; + if (table) + enabled = table->counters_enabled; + rcu_read_unlock(); + return enabled; +} +EXPORT_SYMBOL_GPL(devlink_dpipe_table_counter_enabled); + +/** + * devl_dpipe_table_register - register dpipe table + * + * @devlink: devlink + * @table_name: table name + * @table_ops: table ops + * @priv: priv + * @counter_control_extern: external control for counters + */ +int devl_dpipe_table_register(struct devlink *devlink, + const char *table_name, + struct devlink_dpipe_table_ops *table_ops, + void *priv, bool counter_control_extern) +{ + struct devlink_dpipe_table *table; + + lockdep_assert_held(&devlink->lock); + + if (WARN_ON(!table_ops->size_get)) + return -EINVAL; + + if (devlink_dpipe_table_find(&devlink->dpipe_table_list, table_name, + devlink)) + return -EEXIST; + + table = kzalloc(sizeof(*table), GFP_KERNEL); + if (!table) + return -ENOMEM; + + table->name = table_name; + table->table_ops = table_ops; + table->priv = priv; + table->counter_control_extern = counter_control_extern; + + list_add_tail_rcu(&table->list, &devlink->dpipe_table_list); + + return 0; +} +EXPORT_SYMBOL_GPL(devl_dpipe_table_register); + +/** + * devl_dpipe_table_unregister - unregister dpipe table + * + * @devlink: devlink + * @table_name: table name + */ +void devl_dpipe_table_unregister(struct devlink *devlink, + const char *table_name) +{ + struct devlink_dpipe_table *table; + + lockdep_assert_held(&devlink->lock); + + table = devlink_dpipe_table_find(&devlink->dpipe_table_list, + table_name, devlink); + if (!table) + return; + list_del_rcu(&table->list); + kfree_rcu(table, rcu); +} +EXPORT_SYMBOL_GPL(devl_dpipe_table_unregister); + +/** + * devl_dpipe_table_resource_set - set the resource id + * + * @devlink: devlink + * @table_name: table name + * @resource_id: resource id + * @resource_units: number of resource's units consumed per table's entry + */ +int devl_dpipe_table_resource_set(struct devlink *devlink, + const char *table_name, u64 resource_id, + u64 resource_units) +{ + struct devlink_dpipe_table *table; + + table = devlink_dpipe_table_find(&devlink->dpipe_table_list, + table_name, devlink); + if (!table) + return -EINVAL; + + table->resource_id = resource_id; + table->resource_units = resource_units; + table->resource_valid = true; + return 0; +} +EXPORT_SYMBOL_GPL(devl_dpipe_table_resource_set); diff --git a/net/devlink/leftover.c b/net/devlink/leftover.c index 1bcd4192099e..f71201e6b5e1 100644 --- a/net/devlink/leftover.c +++ b/net/devlink/leftover.c @@ -62,57 +62,6 @@ struct devlink_resource { void *occ_get_priv; }; -static struct devlink_dpipe_field devlink_dpipe_fields_ethernet[] = { - { - .name = "destination mac", - .id = DEVLINK_DPIPE_FIELD_ETHERNET_DST_MAC, - .bitwidth = 48, - }, -}; - -struct devlink_dpipe_header devlink_dpipe_header_ethernet = { - .name = "ethernet", - .id = DEVLINK_DPIPE_HEADER_ETHERNET, - .fields = devlink_dpipe_fields_ethernet, - .fields_count = ARRAY_SIZE(devlink_dpipe_fields_ethernet), - .global = true, -}; -EXPORT_SYMBOL_GPL(devlink_dpipe_header_ethernet); - -static struct devlink_dpipe_field devlink_dpipe_fields_ipv4[] = { - { - .name = "destination ip", - .id = DEVLINK_DPIPE_FIELD_IPV4_DST_IP, - .bitwidth = 32, - }, -}; - -struct devlink_dpipe_header devlink_dpipe_header_ipv4 = { - 
.name = "ipv4", - .id = DEVLINK_DPIPE_HEADER_IPV4, - .fields = devlink_dpipe_fields_ipv4, - .fields_count = ARRAY_SIZE(devlink_dpipe_fields_ipv4), - .global = true, -}; -EXPORT_SYMBOL_GPL(devlink_dpipe_header_ipv4); - -static struct devlink_dpipe_field devlink_dpipe_fields_ipv6[] = { - { - .name = "destination ip", - .id = DEVLINK_DPIPE_FIELD_IPV6_DST_IP, - .bitwidth = 128, - }, -}; - -struct devlink_dpipe_header devlink_dpipe_header_ipv6 = { - .name = "ipv6", - .id = DEVLINK_DPIPE_HEADER_IPV6, - .fields = devlink_dpipe_fields_ipv6, - .fields_count = ARRAY_SIZE(devlink_dpipe_fields_ipv6), - .global = true, -}; -EXPORT_SYMBOL_GPL(devlink_dpipe_header_ipv6); - EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwmsg); EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwerr); EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_trap_report); @@ -1141,698 +1090,6 @@ int devlink_rate_nodes_check(struct devlink *devlink, u16 mode, return 0; } -int devlink_dpipe_match_put(struct sk_buff *skb, - struct devlink_dpipe_match *match) -{ - struct devlink_dpipe_header *header = match->header; - struct devlink_dpipe_field *field = &header->fields[match->field_id]; - struct nlattr *match_attr; - - match_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_DPIPE_MATCH); - if (!match_attr) - return -EMSGSIZE; - - if (nla_put_u32(skb, DEVLINK_ATTR_DPIPE_MATCH_TYPE, match->type) || - nla_put_u32(skb, DEVLINK_ATTR_DPIPE_HEADER_INDEX, match->header_index) || - nla_put_u32(skb, DEVLINK_ATTR_DPIPE_HEADER_ID, header->id) || - nla_put_u32(skb, DEVLINK_ATTR_DPIPE_FIELD_ID, field->id) || - nla_put_u8(skb, DEVLINK_ATTR_DPIPE_HEADER_GLOBAL, header->global)) - goto nla_put_failure; - - nla_nest_end(skb, match_attr); - return 0; - -nla_put_failure: - nla_nest_cancel(skb, match_attr); - return -EMSGSIZE; -} -EXPORT_SYMBOL_GPL(devlink_dpipe_match_put); - -static int devlink_dpipe_matches_put(struct devlink_dpipe_table *table, - struct sk_buff *skb) -{ - struct nlattr *matches_attr; - - matches_attr = nla_nest_start_noflag(skb, - DEVLINK_ATTR_DPIPE_TABLE_MATCHES); - if (!matches_attr) - return -EMSGSIZE; - - if (table->table_ops->matches_dump(table->priv, skb)) - goto nla_put_failure; - - nla_nest_end(skb, matches_attr); - return 0; - -nla_put_failure: - nla_nest_cancel(skb, matches_attr); - return -EMSGSIZE; -} - -int devlink_dpipe_action_put(struct sk_buff *skb, - struct devlink_dpipe_action *action) -{ - struct devlink_dpipe_header *header = action->header; - struct devlink_dpipe_field *field = &header->fields[action->field_id]; - struct nlattr *action_attr; - - action_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_DPIPE_ACTION); - if (!action_attr) - return -EMSGSIZE; - - if (nla_put_u32(skb, DEVLINK_ATTR_DPIPE_ACTION_TYPE, action->type) || - nla_put_u32(skb, DEVLINK_ATTR_DPIPE_HEADER_INDEX, action->header_index) || - nla_put_u32(skb, DEVLINK_ATTR_DPIPE_HEADER_ID, header->id) || - nla_put_u32(skb, DEVLINK_ATTR_DPIPE_FIELD_ID, field->id) || - nla_put_u8(skb, DEVLINK_ATTR_DPIPE_HEADER_GLOBAL, header->global)) - goto nla_put_failure; - - nla_nest_end(skb, action_attr); - return 0; - -nla_put_failure: - nla_nest_cancel(skb, action_attr); - return -EMSGSIZE; -} -EXPORT_SYMBOL_GPL(devlink_dpipe_action_put); - -static int devlink_dpipe_actions_put(struct devlink_dpipe_table *table, - struct sk_buff *skb) -{ - struct nlattr *actions_attr; - - actions_attr = nla_nest_start_noflag(skb, - DEVLINK_ATTR_DPIPE_TABLE_ACTIONS); - if (!actions_attr) - return -EMSGSIZE; - - if (table->table_ops->actions_dump(table->priv, skb)) - goto nla_put_failure; - - nla_nest_end(skb, 
actions_attr); - return 0; - -nla_put_failure: - nla_nest_cancel(skb, actions_attr); - return -EMSGSIZE; -} - -static int devlink_dpipe_table_put(struct sk_buff *skb, - struct devlink_dpipe_table *table) -{ - struct nlattr *table_attr; - u64 table_size; - - table_size = table->table_ops->size_get(table->priv); - table_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_DPIPE_TABLE); - if (!table_attr) - return -EMSGSIZE; - - if (nla_put_string(skb, DEVLINK_ATTR_DPIPE_TABLE_NAME, table->name) || - nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_TABLE_SIZE, table_size, - DEVLINK_ATTR_PAD)) - goto nla_put_failure; - if (nla_put_u8(skb, DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED, - table->counters_enabled)) - goto nla_put_failure; - - if (table->resource_valid) { - if (nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_ID, - table->resource_id, DEVLINK_ATTR_PAD) || - nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_UNITS, - table->resource_units, DEVLINK_ATTR_PAD)) - goto nla_put_failure; - } - if (devlink_dpipe_matches_put(table, skb)) - goto nla_put_failure; - - if (devlink_dpipe_actions_put(table, skb)) - goto nla_put_failure; - - nla_nest_end(skb, table_attr); - return 0; - -nla_put_failure: - nla_nest_cancel(skb, table_attr); - return -EMSGSIZE; -} - -static int devlink_dpipe_tables_fill(struct genl_info *info, - enum devlink_command cmd, int flags, - struct list_head *dpipe_tables, - const char *table_name) -{ - struct devlink *devlink = info->user_ptr[0]; - struct devlink_dpipe_table *table; - struct nlattr *tables_attr; - struct sk_buff *skb = NULL; - struct nlmsghdr *nlh; - bool incomplete; - void *hdr; - int i; - int err; - - table = list_first_entry(dpipe_tables, - struct devlink_dpipe_table, list); -start_again: - err = devlink_nl_msg_reply_and_new(&skb, info); - if (err) - return err; - - hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq, - &devlink_nl_family, NLM_F_MULTI, cmd); - if (!hdr) { - nlmsg_free(skb); - return -EMSGSIZE; - } - - if (devlink_nl_put_handle(skb, devlink)) - goto nla_put_failure; - tables_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_DPIPE_TABLES); - if (!tables_attr) - goto nla_put_failure; - - i = 0; - incomplete = false; - list_for_each_entry_from(table, dpipe_tables, list) { - if (!table_name) { - err = devlink_dpipe_table_put(skb, table); - if (err) { - if (!i) - goto err_table_put; - incomplete = true; - break; - } - } else { - if (!strcmp(table->name, table_name)) { - err = devlink_dpipe_table_put(skb, table); - if (err) - break; - } - } - i++; - } - - nla_nest_end(skb, tables_attr); - genlmsg_end(skb, hdr); - if (incomplete) - goto start_again; - -send_done: - nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq, - NLMSG_DONE, 0, flags | NLM_F_MULTI); - if (!nlh) { - err = devlink_nl_msg_reply_and_new(&skb, info); - if (err) - return err; - goto send_done; - } - - return genlmsg_reply(skb, info); - -nla_put_failure: - err = -EMSGSIZE; -err_table_put: - nlmsg_free(skb); - return err; -} - -static int devlink_nl_cmd_dpipe_table_get(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - const char *table_name = NULL; - - if (info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME]) - table_name = nla_data(info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME]); - - return devlink_dpipe_tables_fill(info, DEVLINK_CMD_DPIPE_TABLE_GET, 0, - &devlink->dpipe_table_list, - table_name); -} - -static int devlink_dpipe_value_put(struct sk_buff *skb, - struct devlink_dpipe_value *value) -{ - if (nla_put(skb, DEVLINK_ATTR_DPIPE_VALUE, - 
value->value_size, value->value)) - return -EMSGSIZE; - if (value->mask) - if (nla_put(skb, DEVLINK_ATTR_DPIPE_VALUE_MASK, - value->value_size, value->mask)) - return -EMSGSIZE; - if (value->mapping_valid) - if (nla_put_u32(skb, DEVLINK_ATTR_DPIPE_VALUE_MAPPING, - value->mapping_value)) - return -EMSGSIZE; - return 0; -} - -static int devlink_dpipe_action_value_put(struct sk_buff *skb, - struct devlink_dpipe_value *value) -{ - if (!value->action) - return -EINVAL; - if (devlink_dpipe_action_put(skb, value->action)) - return -EMSGSIZE; - if (devlink_dpipe_value_put(skb, value)) - return -EMSGSIZE; - return 0; -} - -static int devlink_dpipe_action_values_put(struct sk_buff *skb, - struct devlink_dpipe_value *values, - unsigned int values_count) -{ - struct nlattr *action_attr; - int i; - int err; - - for (i = 0; i < values_count; i++) { - action_attr = nla_nest_start_noflag(skb, - DEVLINK_ATTR_DPIPE_ACTION_VALUE); - if (!action_attr) - return -EMSGSIZE; - err = devlink_dpipe_action_value_put(skb, &values[i]); - if (err) - goto err_action_value_put; - nla_nest_end(skb, action_attr); - } - return 0; - -err_action_value_put: - nla_nest_cancel(skb, action_attr); - return err; -} - -static int devlink_dpipe_match_value_put(struct sk_buff *skb, - struct devlink_dpipe_value *value) -{ - if (!value->match) - return -EINVAL; - if (devlink_dpipe_match_put(skb, value->match)) - return -EMSGSIZE; - if (devlink_dpipe_value_put(skb, value)) - return -EMSGSIZE; - return 0; -} - -static int devlink_dpipe_match_values_put(struct sk_buff *skb, - struct devlink_dpipe_value *values, - unsigned int values_count) -{ - struct nlattr *match_attr; - int i; - int err; - - for (i = 0; i < values_count; i++) { - match_attr = nla_nest_start_noflag(skb, - DEVLINK_ATTR_DPIPE_MATCH_VALUE); - if (!match_attr) - return -EMSGSIZE; - err = devlink_dpipe_match_value_put(skb, &values[i]); - if (err) - goto err_match_value_put; - nla_nest_end(skb, match_attr); - } - return 0; - -err_match_value_put: - nla_nest_cancel(skb, match_attr); - return err; -} - -static int devlink_dpipe_entry_put(struct sk_buff *skb, - struct devlink_dpipe_entry *entry) -{ - struct nlattr *entry_attr, *matches_attr, *actions_attr; - int err; - - entry_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_DPIPE_ENTRY); - if (!entry_attr) - return -EMSGSIZE; - - if (nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_ENTRY_INDEX, entry->index, - DEVLINK_ATTR_PAD)) - goto nla_put_failure; - if (entry->counter_valid) - if (nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_ENTRY_COUNTER, - entry->counter, DEVLINK_ATTR_PAD)) - goto nla_put_failure; - - matches_attr = nla_nest_start_noflag(skb, - DEVLINK_ATTR_DPIPE_ENTRY_MATCH_VALUES); - if (!matches_attr) - goto nla_put_failure; - - err = devlink_dpipe_match_values_put(skb, entry->match_values, - entry->match_values_count); - if (err) { - nla_nest_cancel(skb, matches_attr); - goto err_match_values_put; - } - nla_nest_end(skb, matches_attr); - - actions_attr = nla_nest_start_noflag(skb, - DEVLINK_ATTR_DPIPE_ENTRY_ACTION_VALUES); - if (!actions_attr) - goto nla_put_failure; - - err = devlink_dpipe_action_values_put(skb, entry->action_values, - entry->action_values_count); - if (err) { - nla_nest_cancel(skb, actions_attr); - goto err_action_values_put; - } - nla_nest_end(skb, actions_attr); - - nla_nest_end(skb, entry_attr); - return 0; - -nla_put_failure: - err = -EMSGSIZE; -err_match_values_put: -err_action_values_put: - nla_nest_cancel(skb, entry_attr); - return err; -} - -static struct devlink_dpipe_table * 
-devlink_dpipe_table_find(struct list_head *dpipe_tables, - const char *table_name, struct devlink *devlink) -{ - struct devlink_dpipe_table *table; - list_for_each_entry_rcu(table, dpipe_tables, list, - lockdep_is_held(&devlink->lock)) { - if (!strcmp(table->name, table_name)) - return table; - } - return NULL; -} - -int devlink_dpipe_entry_ctx_prepare(struct devlink_dpipe_dump_ctx *dump_ctx) -{ - struct devlink *devlink; - int err; - - err = devlink_nl_msg_reply_and_new(&dump_ctx->skb, - dump_ctx->info); - if (err) - return err; - - dump_ctx->hdr = genlmsg_put(dump_ctx->skb, - dump_ctx->info->snd_portid, - dump_ctx->info->snd_seq, - &devlink_nl_family, NLM_F_MULTI, - dump_ctx->cmd); - if (!dump_ctx->hdr) - goto nla_put_failure; - - devlink = dump_ctx->info->user_ptr[0]; - if (devlink_nl_put_handle(dump_ctx->skb, devlink)) - goto nla_put_failure; - dump_ctx->nest = nla_nest_start_noflag(dump_ctx->skb, - DEVLINK_ATTR_DPIPE_ENTRIES); - if (!dump_ctx->nest) - goto nla_put_failure; - return 0; - -nla_put_failure: - nlmsg_free(dump_ctx->skb); - return -EMSGSIZE; -} -EXPORT_SYMBOL_GPL(devlink_dpipe_entry_ctx_prepare); - -int devlink_dpipe_entry_ctx_append(struct devlink_dpipe_dump_ctx *dump_ctx, - struct devlink_dpipe_entry *entry) -{ - return devlink_dpipe_entry_put(dump_ctx->skb, entry); -} -EXPORT_SYMBOL_GPL(devlink_dpipe_entry_ctx_append); - -int devlink_dpipe_entry_ctx_close(struct devlink_dpipe_dump_ctx *dump_ctx) -{ - nla_nest_end(dump_ctx->skb, dump_ctx->nest); - genlmsg_end(dump_ctx->skb, dump_ctx->hdr); - return 0; -} -EXPORT_SYMBOL_GPL(devlink_dpipe_entry_ctx_close); - -void devlink_dpipe_entry_clear(struct devlink_dpipe_entry *entry) - -{ - unsigned int value_count, value_index; - struct devlink_dpipe_value *value; - - value = entry->action_values; - value_count = entry->action_values_count; - for (value_index = 0; value_index < value_count; value_index++) { - kfree(value[value_index].value); - kfree(value[value_index].mask); - } - - value = entry->match_values; - value_count = entry->match_values_count; - for (value_index = 0; value_index < value_count; value_index++) { - kfree(value[value_index].value); - kfree(value[value_index].mask); - } -} -EXPORT_SYMBOL_GPL(devlink_dpipe_entry_clear); - -static int devlink_dpipe_entries_fill(struct genl_info *info, - enum devlink_command cmd, int flags, - struct devlink_dpipe_table *table) -{ - struct devlink_dpipe_dump_ctx dump_ctx; - struct nlmsghdr *nlh; - int err; - - dump_ctx.skb = NULL; - dump_ctx.cmd = cmd; - dump_ctx.info = info; - - err = table->table_ops->entries_dump(table->priv, - table->counters_enabled, - &dump_ctx); - if (err) - return err; - -send_done: - nlh = nlmsg_put(dump_ctx.skb, info->snd_portid, info->snd_seq, - NLMSG_DONE, 0, flags | NLM_F_MULTI); - if (!nlh) { - err = devlink_nl_msg_reply_and_new(&dump_ctx.skb, info); - if (err) - return err; - goto send_done; - } - return genlmsg_reply(dump_ctx.skb, info); -} - -static int devlink_nl_cmd_dpipe_entries_get(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - struct devlink_dpipe_table *table; - const char *table_name; - - if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_DPIPE_TABLE_NAME)) - return -EINVAL; - - table_name = nla_data(info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME]); - table = devlink_dpipe_table_find(&devlink->dpipe_table_list, - table_name, devlink); - if (!table) - return -EINVAL; - - if (!table->table_ops->entries_dump) - return -EINVAL; - - return devlink_dpipe_entries_fill(info, DEVLINK_CMD_DPIPE_ENTRIES_GET, - 0, 
table); -} - -static int devlink_dpipe_fields_put(struct sk_buff *skb, - const struct devlink_dpipe_header *header) -{ - struct devlink_dpipe_field *field; - struct nlattr *field_attr; - int i; - - for (i = 0; i < header->fields_count; i++) { - field = &header->fields[i]; - field_attr = nla_nest_start_noflag(skb, - DEVLINK_ATTR_DPIPE_FIELD); - if (!field_attr) - return -EMSGSIZE; - if (nla_put_string(skb, DEVLINK_ATTR_DPIPE_FIELD_NAME, field->name) || - nla_put_u32(skb, DEVLINK_ATTR_DPIPE_FIELD_ID, field->id) || - nla_put_u32(skb, DEVLINK_ATTR_DPIPE_FIELD_BITWIDTH, field->bitwidth) || - nla_put_u32(skb, DEVLINK_ATTR_DPIPE_FIELD_MAPPING_TYPE, field->mapping_type)) - goto nla_put_failure; - nla_nest_end(skb, field_attr); - } - return 0; - -nla_put_failure: - nla_nest_cancel(skb, field_attr); - return -EMSGSIZE; -} - -static int devlink_dpipe_header_put(struct sk_buff *skb, - struct devlink_dpipe_header *header) -{ - struct nlattr *fields_attr, *header_attr; - int err; - - header_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_DPIPE_HEADER); - if (!header_attr) - return -EMSGSIZE; - - if (nla_put_string(skb, DEVLINK_ATTR_DPIPE_HEADER_NAME, header->name) || - nla_put_u32(skb, DEVLINK_ATTR_DPIPE_HEADER_ID, header->id) || - nla_put_u8(skb, DEVLINK_ATTR_DPIPE_HEADER_GLOBAL, header->global)) - goto nla_put_failure; - - fields_attr = nla_nest_start_noflag(skb, - DEVLINK_ATTR_DPIPE_HEADER_FIELDS); - if (!fields_attr) - goto nla_put_failure; - - err = devlink_dpipe_fields_put(skb, header); - if (err) { - nla_nest_cancel(skb, fields_attr); - goto nla_put_failure; - } - nla_nest_end(skb, fields_attr); - nla_nest_end(skb, header_attr); - return 0; - -nla_put_failure: - err = -EMSGSIZE; - nla_nest_cancel(skb, header_attr); - return err; -} - -static int devlink_dpipe_headers_fill(struct genl_info *info, - enum devlink_command cmd, int flags, - struct devlink_dpipe_headers * - dpipe_headers) -{ - struct devlink *devlink = info->user_ptr[0]; - struct nlattr *headers_attr; - struct sk_buff *skb = NULL; - struct nlmsghdr *nlh; - void *hdr; - int i, j; - int err; - - i = 0; -start_again: - err = devlink_nl_msg_reply_and_new(&skb, info); - if (err) - return err; - - hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq, - &devlink_nl_family, NLM_F_MULTI, cmd); - if (!hdr) { - nlmsg_free(skb); - return -EMSGSIZE; - } - - if (devlink_nl_put_handle(skb, devlink)) - goto nla_put_failure; - headers_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_DPIPE_HEADERS); - if (!headers_attr) - goto nla_put_failure; - - j = 0; - for (; i < dpipe_headers->headers_count; i++) { - err = devlink_dpipe_header_put(skb, dpipe_headers->headers[i]); - if (err) { - if (!j) - goto err_table_put; - break; - } - j++; - } - nla_nest_end(skb, headers_attr); - genlmsg_end(skb, hdr); - if (i != dpipe_headers->headers_count) - goto start_again; - -send_done: - nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq, - NLMSG_DONE, 0, flags | NLM_F_MULTI); - if (!nlh) { - err = devlink_nl_msg_reply_and_new(&skb, info); - if (err) - return err; - goto send_done; - } - return genlmsg_reply(skb, info); - -nla_put_failure: - err = -EMSGSIZE; -err_table_put: - nlmsg_free(skb); - return err; -} - -static int devlink_nl_cmd_dpipe_headers_get(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - - if (!devlink->dpipe_headers) - return -EOPNOTSUPP; - return devlink_dpipe_headers_fill(info, DEVLINK_CMD_DPIPE_HEADERS_GET, - 0, devlink->dpipe_headers); -} - -static int devlink_dpipe_table_counters_set(struct devlink 
*devlink, - const char *table_name, - bool enable) -{ - struct devlink_dpipe_table *table; - - table = devlink_dpipe_table_find(&devlink->dpipe_table_list, - table_name, devlink); - if (!table) - return -EINVAL; - - if (table->counter_control_extern) - return -EOPNOTSUPP; - - if (!(table->counters_enabled ^ enable)) - return 0; - - table->counters_enabled = enable; - if (table->table_ops->counters_set_update) - table->table_ops->counters_set_update(table->priv, enable); - return 0; -} - -static int devlink_nl_cmd_dpipe_table_counters_set(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - const char *table_name; - bool counters_enable; - - if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_DPIPE_TABLE_NAME) || - GENL_REQ_ATTR_CHECK(info, - DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED)) - return -EINVAL; - - table_name = nla_data(info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME]); - counters_enable = !!nla_get_u8(info->attrs[DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED]); - - return devlink_dpipe_table_counters_set(devlink, table_name, - counters_enable); -} - static struct devlink_resource * devlink_resource_find(struct devlink *devlink, struct devlink_resource *resource, u64 resource_id) @@ -5272,131 +4529,6 @@ void devlink_linecard_nested_dl_set(struct devlink_linecard *linecard, } EXPORT_SYMBOL_GPL(devlink_linecard_nested_dl_set); -/** - * devl_dpipe_headers_register - register dpipe headers - * - * @devlink: devlink - * @dpipe_headers: dpipe header array - * - * Register the headers supported by hardware. - */ -void devl_dpipe_headers_register(struct devlink *devlink, - struct devlink_dpipe_headers *dpipe_headers) -{ - lockdep_assert_held(&devlink->lock); - - devlink->dpipe_headers = dpipe_headers; -} -EXPORT_SYMBOL_GPL(devl_dpipe_headers_register); - -/** - * devl_dpipe_headers_unregister - unregister dpipe headers - * - * @devlink: devlink - * - * Unregister the headers supported by hardware. - */ -void devl_dpipe_headers_unregister(struct devlink *devlink) -{ - lockdep_assert_held(&devlink->lock); - - devlink->dpipe_headers = NULL; -} -EXPORT_SYMBOL_GPL(devl_dpipe_headers_unregister); - -/** - * devlink_dpipe_table_counter_enabled - check if counter allocation - * required - * @devlink: devlink - * @table_name: tables name - * - * Used by driver to check if counter allocation is required. - * After counter allocation is turned on the table entries - * are updated to include counter statistics. - * - * After that point on the driver must respect the counter - * state so that each entry added to the table is added - * with a counter. 
- */ -bool devlink_dpipe_table_counter_enabled(struct devlink *devlink, - const char *table_name) -{ - struct devlink_dpipe_table *table; - bool enabled; - - rcu_read_lock(); - table = devlink_dpipe_table_find(&devlink->dpipe_table_list, - table_name, devlink); - enabled = false; - if (table) - enabled = table->counters_enabled; - rcu_read_unlock(); - return enabled; -} -EXPORT_SYMBOL_GPL(devlink_dpipe_table_counter_enabled); - -/** - * devl_dpipe_table_register - register dpipe table - * - * @devlink: devlink - * @table_name: table name - * @table_ops: table ops - * @priv: priv - * @counter_control_extern: external control for counters - */ -int devl_dpipe_table_register(struct devlink *devlink, - const char *table_name, - struct devlink_dpipe_table_ops *table_ops, - void *priv, bool counter_control_extern) -{ - struct devlink_dpipe_table *table; - - lockdep_assert_held(&devlink->lock); - - if (WARN_ON(!table_ops->size_get)) - return -EINVAL; - - if (devlink_dpipe_table_find(&devlink->dpipe_table_list, table_name, - devlink)) - return -EEXIST; - - table = kzalloc(sizeof(*table), GFP_KERNEL); - if (!table) - return -ENOMEM; - - table->name = table_name; - table->table_ops = table_ops; - table->priv = priv; - table->counter_control_extern = counter_control_extern; - - list_add_tail_rcu(&table->list, &devlink->dpipe_table_list); - - return 0; -} -EXPORT_SYMBOL_GPL(devl_dpipe_table_register); - -/** - * devl_dpipe_table_unregister - unregister dpipe table - * - * @devlink: devlink - * @table_name: table name - */ -void devl_dpipe_table_unregister(struct devlink *devlink, - const char *table_name) -{ - struct devlink_dpipe_table *table; - - lockdep_assert_held(&devlink->lock); - - table = devlink_dpipe_table_find(&devlink->dpipe_table_list, - table_name, devlink); - if (!table) - return; - list_del_rcu(&table->list); - kfree_rcu(table, rcu); -} -EXPORT_SYMBOL_GPL(devl_dpipe_table_unregister); - /** * devl_resource_register - devlink resource register * @@ -5569,32 +4701,6 @@ int devl_resource_size_get(struct devlink *devlink, } EXPORT_SYMBOL_GPL(devl_resource_size_get); -/** - * devl_dpipe_table_resource_set - set the resource id - * - * @devlink: devlink - * @table_name: table name - * @resource_id: resource id - * @resource_units: number of resource's units consumed per table's entry - */ -int devl_dpipe_table_resource_set(struct devlink *devlink, - const char *table_name, u64 resource_id, - u64 resource_units) -{ - struct devlink_dpipe_table *table; - - table = devlink_dpipe_table_find(&devlink->dpipe_table_list, - table_name, devlink); - if (!table) - return -EINVAL; - - table->resource_id = resource_id; - table->resource_units = resource_units; - table->resource_valid = true; - return 0; -} -EXPORT_SYMBOL_GPL(devl_dpipe_table_resource_set); - /** * devl_resource_occ_get_register - register occupancy getter * -- cgit v1.2.3 From a9f960074ecd9db292744db2491a0cbbd354e24b Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 28 Aug 2023 08:16:48 +0200 Subject: devlink: push resource related code into separate file Cut out another chunk from leftover.c and put resource related code into a separate file. 
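For reference, a minimal sketch of how a driver side might consume the resource API being relocated here, using devl_resource_register() and devl_resource_occ_get_register() with the signatures shown in this patch. The "kvd" resource name, the FOO_RESOURCE_ID_KVD id, the sizes and the occupancy callback body are illustrative assumptions, not taken from any in-tree driver:

/* Illustrative only: a hypothetical driver registering one top-level
 * resource and hooking an occupancy getter. The devl_* variants expect
 * devlink->lock to be held, so the caller takes it explicitly here.
 */
#include <net/devlink.h>

#define FOO_RESOURCE_ID_KVD	1	/* driver-chosen id, hypothetical */

static u64 foo_kvd_occ_get(void *priv)
{
	/* Return how many units are currently in use. */
	return 0;
}

static int foo_resources_register(struct devlink *devlink)
{
	struct devlink_resource_size_params size_params = {
		.size_min		= 0,
		.size_max		= 1 << 20,
		.size_granularity	= 128,
		.unit			= DEVLINK_RESOURCE_UNIT_ENTRY,
	};
	int err;

	devl_lock(devlink);
	/* Top-level resource: parent is DEVLINK_RESOURCE_ID_PARENT_TOP. */
	err = devl_resource_register(devlink, "kvd", 1 << 20,
				     FOO_RESOURCE_ID_KVD,
				     DEVLINK_RESOURCE_ID_PARENT_TOP,
				     &size_params);
	if (err)
		goto out;

	/* Let "devlink resource show" report current occupancy. */
	devl_resource_occ_get_register(devlink, FOO_RESOURCE_ID_KVD,
				       foo_kvd_occ_get, NULL);
out:
	devl_unlock(devlink);
	return err;
}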
Signed-off-by: Jiri Pirko Link: https://lore.kernel.org/r/20230828061657.300667-7-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- net/devlink/Makefile | 2 +- net/devlink/devl_internal.h | 2 + net/devlink/leftover.c | 574 ------------------------------------------- net/devlink/resource.c | 579 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 582 insertions(+), 575 deletions(-) create mode 100644 net/devlink/resource.c (limited to 'net') diff --git a/net/devlink/Makefile b/net/devlink/Makefile index 122de1d565d2..8864cd933acb 100644 --- a/net/devlink/Makefile +++ b/net/devlink/Makefile @@ -1,4 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 obj-y := leftover.o core.o netlink.o netlink_gen.o dev.o port.o sb.o dpipe.o \ - health.o + resource.o health.o diff --git a/net/devlink/devl_internal.h b/net/devlink/devl_internal.h index 72653e1dae80..e5c0acf7a758 100644 --- a/net/devlink/devl_internal.h +++ b/net/devlink/devl_internal.h @@ -238,6 +238,8 @@ int devlink_nl_cmd_dpipe_headers_get(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_dpipe_table_counters_set(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_cmd_resource_set(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_cmd_resource_dump(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_health_reporter_recover_doit(struct sk_buff *skb, diff --git a/net/devlink/leftover.c b/net/devlink/leftover.c index f71201e6b5e1..1c65449f2252 100644 --- a/net/devlink/leftover.c +++ b/net/devlink/leftover.c @@ -33,35 +33,6 @@ #include "devl_internal.h" -/** - * struct devlink_resource - devlink resource - * @name: name of the resource - * @id: id, per devlink instance - * @size: size of the resource - * @size_new: updated size of the resource, reload is needed - * @size_valid: valid in case the total size of the resource is valid - * including its children - * @parent: parent resource - * @size_params: size parameters - * @list: parent list - * @resource_list: list of child resources - * @occ_get: occupancy getter callback - * @occ_get_priv: occupancy getter callback priv - */ -struct devlink_resource { - const char *name; - u64 id; - u64 size; - u64 size_new; - bool size_valid; - struct devlink_resource *parent; - struct devlink_resource_size_params size_params; - struct list_head list; - struct list_head resource_list; - devlink_resource_occ_get_t *occ_get; - void *occ_get_priv; -}; - EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwmsg); EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwerr); EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_trap_report); @@ -1090,290 +1061,6 @@ int devlink_rate_nodes_check(struct devlink *devlink, u16 mode, return 0; } -static struct devlink_resource * -devlink_resource_find(struct devlink *devlink, - struct devlink_resource *resource, u64 resource_id) -{ - struct list_head *resource_list; - - if (resource) - resource_list = &resource->resource_list; - else - resource_list = &devlink->resource_list; - - list_for_each_entry(resource, resource_list, list) { - struct devlink_resource *child_resource; - - if (resource->id == resource_id) - return resource; - - child_resource = devlink_resource_find(devlink, resource, - resource_id); - if (child_resource) - return child_resource; - } - return NULL; -} - -static void -devlink_resource_validate_children(struct devlink_resource *resource) -{ - struct devlink_resource *child_resource; - bool size_valid = true; - u64 parts_size = 0; - - if 
(list_empty(&resource->resource_list)) - goto out; - - list_for_each_entry(child_resource, &resource->resource_list, list) - parts_size += child_resource->size_new; - - if (parts_size > resource->size_new) - size_valid = false; -out: - resource->size_valid = size_valid; -} - -static int -devlink_resource_validate_size(struct devlink_resource *resource, u64 size, - struct netlink_ext_ack *extack) -{ - u64 reminder; - int err = 0; - - if (size > resource->size_params.size_max) { - NL_SET_ERR_MSG(extack, "Size larger than maximum"); - err = -EINVAL; - } - - if (size < resource->size_params.size_min) { - NL_SET_ERR_MSG(extack, "Size smaller than minimum"); - err = -EINVAL; - } - - div64_u64_rem(size, resource->size_params.size_granularity, &reminder); - if (reminder) { - NL_SET_ERR_MSG(extack, "Wrong granularity"); - err = -EINVAL; - } - - return err; -} - -static int devlink_nl_cmd_resource_set(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - struct devlink_resource *resource; - u64 resource_id; - u64 size; - int err; - - if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_RESOURCE_ID) || - GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_RESOURCE_SIZE)) - return -EINVAL; - resource_id = nla_get_u64(info->attrs[DEVLINK_ATTR_RESOURCE_ID]); - - resource = devlink_resource_find(devlink, NULL, resource_id); - if (!resource) - return -EINVAL; - - size = nla_get_u64(info->attrs[DEVLINK_ATTR_RESOURCE_SIZE]); - err = devlink_resource_validate_size(resource, size, info->extack); - if (err) - return err; - - resource->size_new = size; - devlink_resource_validate_children(resource); - if (resource->parent) - devlink_resource_validate_children(resource->parent); - return 0; -} - -static int -devlink_resource_size_params_put(struct devlink_resource *resource, - struct sk_buff *skb) -{ - struct devlink_resource_size_params *size_params; - - size_params = &resource->size_params; - if (nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_GRAN, - size_params->size_granularity, DEVLINK_ATTR_PAD) || - nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_MAX, - size_params->size_max, DEVLINK_ATTR_PAD) || - nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_MIN, - size_params->size_min, DEVLINK_ATTR_PAD) || - nla_put_u8(skb, DEVLINK_ATTR_RESOURCE_UNIT, size_params->unit)) - return -EMSGSIZE; - return 0; -} - -static int devlink_resource_occ_put(struct devlink_resource *resource, - struct sk_buff *skb) -{ - if (!resource->occ_get) - return 0; - return nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_OCC, - resource->occ_get(resource->occ_get_priv), - DEVLINK_ATTR_PAD); -} - -static int devlink_resource_put(struct devlink *devlink, struct sk_buff *skb, - struct devlink_resource *resource) -{ - struct devlink_resource *child_resource; - struct nlattr *child_resource_attr; - struct nlattr *resource_attr; - - resource_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_RESOURCE); - if (!resource_attr) - return -EMSGSIZE; - - if (nla_put_string(skb, DEVLINK_ATTR_RESOURCE_NAME, resource->name) || - nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE, resource->size, - DEVLINK_ATTR_PAD) || - nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_ID, resource->id, - DEVLINK_ATTR_PAD)) - goto nla_put_failure; - if (resource->size != resource->size_new && - nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_NEW, - resource->size_new, DEVLINK_ATTR_PAD)) - goto nla_put_failure; - if (devlink_resource_occ_put(resource, skb)) - goto nla_put_failure; - if (devlink_resource_size_params_put(resource, skb)) - goto 
nla_put_failure; - if (list_empty(&resource->resource_list)) - goto out; - - if (nla_put_u8(skb, DEVLINK_ATTR_RESOURCE_SIZE_VALID, - resource->size_valid)) - goto nla_put_failure; - - child_resource_attr = nla_nest_start_noflag(skb, - DEVLINK_ATTR_RESOURCE_LIST); - if (!child_resource_attr) - goto nla_put_failure; - - list_for_each_entry(child_resource, &resource->resource_list, list) { - if (devlink_resource_put(devlink, skb, child_resource)) - goto resource_put_failure; - } - - nla_nest_end(skb, child_resource_attr); -out: - nla_nest_end(skb, resource_attr); - return 0; - -resource_put_failure: - nla_nest_cancel(skb, child_resource_attr); -nla_put_failure: - nla_nest_cancel(skb, resource_attr); - return -EMSGSIZE; -} - -static int devlink_resource_fill(struct genl_info *info, - enum devlink_command cmd, int flags) -{ - struct devlink *devlink = info->user_ptr[0]; - struct devlink_resource *resource; - struct nlattr *resources_attr; - struct sk_buff *skb = NULL; - struct nlmsghdr *nlh; - bool incomplete; - void *hdr; - int i; - int err; - - resource = list_first_entry(&devlink->resource_list, - struct devlink_resource, list); -start_again: - err = devlink_nl_msg_reply_and_new(&skb, info); - if (err) - return err; - - hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq, - &devlink_nl_family, NLM_F_MULTI, cmd); - if (!hdr) { - nlmsg_free(skb); - return -EMSGSIZE; - } - - if (devlink_nl_put_handle(skb, devlink)) - goto nla_put_failure; - - resources_attr = nla_nest_start_noflag(skb, - DEVLINK_ATTR_RESOURCE_LIST); - if (!resources_attr) - goto nla_put_failure; - - incomplete = false; - i = 0; - list_for_each_entry_from(resource, &devlink->resource_list, list) { - err = devlink_resource_put(devlink, skb, resource); - if (err) { - if (!i) - goto err_resource_put; - incomplete = true; - break; - } - i++; - } - nla_nest_end(skb, resources_attr); - genlmsg_end(skb, hdr); - if (incomplete) - goto start_again; -send_done: - nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq, - NLMSG_DONE, 0, flags | NLM_F_MULTI); - if (!nlh) { - err = devlink_nl_msg_reply_and_new(&skb, info); - if (err) - return err; - goto send_done; - } - return genlmsg_reply(skb, info); - -nla_put_failure: - err = -EMSGSIZE; -err_resource_put: - nlmsg_free(skb); - return err; -} - -static int devlink_nl_cmd_resource_dump(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - - if (list_empty(&devlink->resource_list)) - return -EOPNOTSUPP; - - return devlink_resource_fill(info, DEVLINK_CMD_RESOURCE_DUMP, 0); -} - -int devlink_resources_validate(struct devlink *devlink, - struct devlink_resource *resource, - struct genl_info *info) -{ - struct list_head *resource_list; - int err = 0; - - if (resource) - resource_list = &resource->resource_list; - else - resource_list = &devlink->resource_list; - - list_for_each_entry(resource, resource_list, list) { - if (!resource->size_valid) - return -EINVAL; - err = devlink_resources_validate(devlink, resource, info); - if (err) - return err; - } - return err; -} - static const struct devlink_param devlink_param_generic[] = { { .id = DEVLINK_PARAM_GENERIC_ID_INT_ERR_RESET, @@ -4529,267 +4216,6 @@ void devlink_linecard_nested_dl_set(struct devlink_linecard *linecard, } EXPORT_SYMBOL_GPL(devlink_linecard_nested_dl_set); -/** - * devl_resource_register - devlink resource register - * - * @devlink: devlink - * @resource_name: resource's name - * @resource_size: resource's size - * @resource_id: resource's id - * @parent_resource_id: resource's parent 
id - * @size_params: size parameters - * - * Generic resources should reuse the same names across drivers. - * Please see the generic resources list at: - * Documentation/networking/devlink/devlink-resource.rst - */ -int devl_resource_register(struct devlink *devlink, - const char *resource_name, - u64 resource_size, - u64 resource_id, - u64 parent_resource_id, - const struct devlink_resource_size_params *size_params) -{ - struct devlink_resource *resource; - struct list_head *resource_list; - bool top_hierarchy; - - lockdep_assert_held(&devlink->lock); - - top_hierarchy = parent_resource_id == DEVLINK_RESOURCE_ID_PARENT_TOP; - - resource = devlink_resource_find(devlink, NULL, resource_id); - if (resource) - return -EINVAL; - - resource = kzalloc(sizeof(*resource), GFP_KERNEL); - if (!resource) - return -ENOMEM; - - if (top_hierarchy) { - resource_list = &devlink->resource_list; - } else { - struct devlink_resource *parent_resource; - - parent_resource = devlink_resource_find(devlink, NULL, - parent_resource_id); - if (parent_resource) { - resource_list = &parent_resource->resource_list; - resource->parent = parent_resource; - } else { - kfree(resource); - return -EINVAL; - } - } - - resource->name = resource_name; - resource->size = resource_size; - resource->size_new = resource_size; - resource->id = resource_id; - resource->size_valid = true; - memcpy(&resource->size_params, size_params, - sizeof(resource->size_params)); - INIT_LIST_HEAD(&resource->resource_list); - list_add_tail(&resource->list, resource_list); - - return 0; -} -EXPORT_SYMBOL_GPL(devl_resource_register); - -/** - * devlink_resource_register - devlink resource register - * - * @devlink: devlink - * @resource_name: resource's name - * @resource_size: resource's size - * @resource_id: resource's id - * @parent_resource_id: resource's parent id - * @size_params: size parameters - * - * Generic resources should reuse the same names across drivers. - * Please see the generic resources list at: - * Documentation/networking/devlink/devlink-resource.rst - * - * Context: Takes and release devlink->lock . 
- */ -int devlink_resource_register(struct devlink *devlink, - const char *resource_name, - u64 resource_size, - u64 resource_id, - u64 parent_resource_id, - const struct devlink_resource_size_params *size_params) -{ - int err; - - devl_lock(devlink); - err = devl_resource_register(devlink, resource_name, resource_size, - resource_id, parent_resource_id, size_params); - devl_unlock(devlink); - return err; -} -EXPORT_SYMBOL_GPL(devlink_resource_register); - -static void devlink_resource_unregister(struct devlink *devlink, - struct devlink_resource *resource) -{ - struct devlink_resource *tmp, *child_resource; - - list_for_each_entry_safe(child_resource, tmp, &resource->resource_list, - list) { - devlink_resource_unregister(devlink, child_resource); - list_del(&child_resource->list); - kfree(child_resource); - } -} - -/** - * devl_resources_unregister - free all resources - * - * @devlink: devlink - */ -void devl_resources_unregister(struct devlink *devlink) -{ - struct devlink_resource *tmp, *child_resource; - - lockdep_assert_held(&devlink->lock); - - list_for_each_entry_safe(child_resource, tmp, &devlink->resource_list, - list) { - devlink_resource_unregister(devlink, child_resource); - list_del(&child_resource->list); - kfree(child_resource); - } -} -EXPORT_SYMBOL_GPL(devl_resources_unregister); - -/** - * devlink_resources_unregister - free all resources - * - * @devlink: devlink - * - * Context: Takes and release devlink->lock . - */ -void devlink_resources_unregister(struct devlink *devlink) -{ - devl_lock(devlink); - devl_resources_unregister(devlink); - devl_unlock(devlink); -} -EXPORT_SYMBOL_GPL(devlink_resources_unregister); - -/** - * devl_resource_size_get - get and update size - * - * @devlink: devlink - * @resource_id: the requested resource id - * @p_resource_size: ptr to update - */ -int devl_resource_size_get(struct devlink *devlink, - u64 resource_id, - u64 *p_resource_size) -{ - struct devlink_resource *resource; - - lockdep_assert_held(&devlink->lock); - - resource = devlink_resource_find(devlink, NULL, resource_id); - if (!resource) - return -EINVAL; - *p_resource_size = resource->size_new; - resource->size = resource->size_new; - return 0; -} -EXPORT_SYMBOL_GPL(devl_resource_size_get); - -/** - * devl_resource_occ_get_register - register occupancy getter - * - * @devlink: devlink - * @resource_id: resource id - * @occ_get: occupancy getter callback - * @occ_get_priv: occupancy getter callback priv - */ -void devl_resource_occ_get_register(struct devlink *devlink, - u64 resource_id, - devlink_resource_occ_get_t *occ_get, - void *occ_get_priv) -{ - struct devlink_resource *resource; - - lockdep_assert_held(&devlink->lock); - - resource = devlink_resource_find(devlink, NULL, resource_id); - if (WARN_ON(!resource)) - return; - WARN_ON(resource->occ_get); - - resource->occ_get = occ_get; - resource->occ_get_priv = occ_get_priv; -} -EXPORT_SYMBOL_GPL(devl_resource_occ_get_register); - -/** - * devlink_resource_occ_get_register - register occupancy getter - * - * @devlink: devlink - * @resource_id: resource id - * @occ_get: occupancy getter callback - * @occ_get_priv: occupancy getter callback priv - * - * Context: Takes and release devlink->lock . 
- */ -void devlink_resource_occ_get_register(struct devlink *devlink, - u64 resource_id, - devlink_resource_occ_get_t *occ_get, - void *occ_get_priv) -{ - devl_lock(devlink); - devl_resource_occ_get_register(devlink, resource_id, - occ_get, occ_get_priv); - devl_unlock(devlink); -} -EXPORT_SYMBOL_GPL(devlink_resource_occ_get_register); - -/** - * devl_resource_occ_get_unregister - unregister occupancy getter - * - * @devlink: devlink - * @resource_id: resource id - */ -void devl_resource_occ_get_unregister(struct devlink *devlink, - u64 resource_id) -{ - struct devlink_resource *resource; - - lockdep_assert_held(&devlink->lock); - - resource = devlink_resource_find(devlink, NULL, resource_id); - if (WARN_ON(!resource)) - return; - WARN_ON(!resource->occ_get); - - resource->occ_get = NULL; - resource->occ_get_priv = NULL; -} -EXPORT_SYMBOL_GPL(devl_resource_occ_get_unregister); - -/** - * devlink_resource_occ_get_unregister - unregister occupancy getter - * - * @devlink: devlink - * @resource_id: resource id - * - * Context: Takes and release devlink->lock . - */ -void devlink_resource_occ_get_unregister(struct devlink *devlink, - u64 resource_id) -{ - devl_lock(devlink); - devl_resource_occ_get_unregister(devlink, resource_id); - devl_unlock(devlink); -} -EXPORT_SYMBOL_GPL(devlink_resource_occ_get_unregister); - static int devlink_param_verify(const struct devlink_param *param) { if (!param || !param->name || !param->supported_cmodes) diff --git a/net/devlink/resource.c b/net/devlink/resource.c new file mode 100644 index 000000000000..c8b615e4c385 --- /dev/null +++ b/net/devlink/resource.c @@ -0,0 +1,579 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * Copyright (c) 2016 Jiri Pirko + */ + +#include "devl_internal.h" + +/** + * struct devlink_resource - devlink resource + * @name: name of the resource + * @id: id, per devlink instance + * @size: size of the resource + * @size_new: updated size of the resource, reload is needed + * @size_valid: valid in case the total size of the resource is valid + * including its children + * @parent: parent resource + * @size_params: size parameters + * @list: parent list + * @resource_list: list of child resources + * @occ_get: occupancy getter callback + * @occ_get_priv: occupancy getter callback priv + */ +struct devlink_resource { + const char *name; + u64 id; + u64 size; + u64 size_new; + bool size_valid; + struct devlink_resource *parent; + struct devlink_resource_size_params size_params; + struct list_head list; + struct list_head resource_list; + devlink_resource_occ_get_t *occ_get; + void *occ_get_priv; +}; + +static struct devlink_resource * +devlink_resource_find(struct devlink *devlink, + struct devlink_resource *resource, u64 resource_id) +{ + struct list_head *resource_list; + + if (resource) + resource_list = &resource->resource_list; + else + resource_list = &devlink->resource_list; + + list_for_each_entry(resource, resource_list, list) { + struct devlink_resource *child_resource; + + if (resource->id == resource_id) + return resource; + + child_resource = devlink_resource_find(devlink, resource, + resource_id); + if (child_resource) + return child_resource; + } + return NULL; +} + +static void +devlink_resource_validate_children(struct devlink_resource *resource) +{ + struct devlink_resource *child_resource; + bool size_valid = true; + u64 parts_size = 0; + + if (list_empty(&resource->resource_list)) + goto out; + + list_for_each_entry(child_resource, 
&resource->resource_list, list) + parts_size += child_resource->size_new; + + if (parts_size > resource->size_new) + size_valid = false; +out: + resource->size_valid = size_valid; +} + +static int +devlink_resource_validate_size(struct devlink_resource *resource, u64 size, + struct netlink_ext_ack *extack) +{ + u64 reminder; + int err = 0; + + if (size > resource->size_params.size_max) { + NL_SET_ERR_MSG(extack, "Size larger than maximum"); + err = -EINVAL; + } + + if (size < resource->size_params.size_min) { + NL_SET_ERR_MSG(extack, "Size smaller than minimum"); + err = -EINVAL; + } + + div64_u64_rem(size, resource->size_params.size_granularity, &reminder); + if (reminder) { + NL_SET_ERR_MSG(extack, "Wrong granularity"); + err = -EINVAL; + } + + return err; +} + +int devlink_nl_cmd_resource_set(struct sk_buff *skb, struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_resource *resource; + u64 resource_id; + u64 size; + int err; + + if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_RESOURCE_ID) || + GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_RESOURCE_SIZE)) + return -EINVAL; + resource_id = nla_get_u64(info->attrs[DEVLINK_ATTR_RESOURCE_ID]); + + resource = devlink_resource_find(devlink, NULL, resource_id); + if (!resource) + return -EINVAL; + + size = nla_get_u64(info->attrs[DEVLINK_ATTR_RESOURCE_SIZE]); + err = devlink_resource_validate_size(resource, size, info->extack); + if (err) + return err; + + resource->size_new = size; + devlink_resource_validate_children(resource); + if (resource->parent) + devlink_resource_validate_children(resource->parent); + return 0; +} + +static int +devlink_resource_size_params_put(struct devlink_resource *resource, + struct sk_buff *skb) +{ + struct devlink_resource_size_params *size_params; + + size_params = &resource->size_params; + if (nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_GRAN, + size_params->size_granularity, DEVLINK_ATTR_PAD) || + nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_MAX, + size_params->size_max, DEVLINK_ATTR_PAD) || + nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_MIN, + size_params->size_min, DEVLINK_ATTR_PAD) || + nla_put_u8(skb, DEVLINK_ATTR_RESOURCE_UNIT, size_params->unit)) + return -EMSGSIZE; + return 0; +} + +static int devlink_resource_occ_put(struct devlink_resource *resource, + struct sk_buff *skb) +{ + if (!resource->occ_get) + return 0; + return nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_OCC, + resource->occ_get(resource->occ_get_priv), + DEVLINK_ATTR_PAD); +} + +static int devlink_resource_put(struct devlink *devlink, struct sk_buff *skb, + struct devlink_resource *resource) +{ + struct devlink_resource *child_resource; + struct nlattr *child_resource_attr; + struct nlattr *resource_attr; + + resource_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_RESOURCE); + if (!resource_attr) + return -EMSGSIZE; + + if (nla_put_string(skb, DEVLINK_ATTR_RESOURCE_NAME, resource->name) || + nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE, resource->size, + DEVLINK_ATTR_PAD) || + nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_ID, resource->id, + DEVLINK_ATTR_PAD)) + goto nla_put_failure; + if (resource->size != resource->size_new && + nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_NEW, + resource->size_new, DEVLINK_ATTR_PAD)) + goto nla_put_failure; + if (devlink_resource_occ_put(resource, skb)) + goto nla_put_failure; + if (devlink_resource_size_params_put(resource, skb)) + goto nla_put_failure; + if (list_empty(&resource->resource_list)) + goto out; + + if (nla_put_u8(skb, 
DEVLINK_ATTR_RESOURCE_SIZE_VALID, + resource->size_valid)) + goto nla_put_failure; + + child_resource_attr = nla_nest_start_noflag(skb, + DEVLINK_ATTR_RESOURCE_LIST); + if (!child_resource_attr) + goto nla_put_failure; + + list_for_each_entry(child_resource, &resource->resource_list, list) { + if (devlink_resource_put(devlink, skb, child_resource)) + goto resource_put_failure; + } + + nla_nest_end(skb, child_resource_attr); +out: + nla_nest_end(skb, resource_attr); + return 0; + +resource_put_failure: + nla_nest_cancel(skb, child_resource_attr); +nla_put_failure: + nla_nest_cancel(skb, resource_attr); + return -EMSGSIZE; +} + +static int devlink_resource_fill(struct genl_info *info, + enum devlink_command cmd, int flags) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_resource *resource; + struct nlattr *resources_attr; + struct sk_buff *skb = NULL; + struct nlmsghdr *nlh; + bool incomplete; + void *hdr; + int i; + int err; + + resource = list_first_entry(&devlink->resource_list, + struct devlink_resource, list); +start_again: + err = devlink_nl_msg_reply_and_new(&skb, info); + if (err) + return err; + + hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq, + &devlink_nl_family, NLM_F_MULTI, cmd); + if (!hdr) { + nlmsg_free(skb); + return -EMSGSIZE; + } + + if (devlink_nl_put_handle(skb, devlink)) + goto nla_put_failure; + + resources_attr = nla_nest_start_noflag(skb, + DEVLINK_ATTR_RESOURCE_LIST); + if (!resources_attr) + goto nla_put_failure; + + incomplete = false; + i = 0; + list_for_each_entry_from(resource, &devlink->resource_list, list) { + err = devlink_resource_put(devlink, skb, resource); + if (err) { + if (!i) + goto err_resource_put; + incomplete = true; + break; + } + i++; + } + nla_nest_end(skb, resources_attr); + genlmsg_end(skb, hdr); + if (incomplete) + goto start_again; +send_done: + nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq, + NLMSG_DONE, 0, flags | NLM_F_MULTI); + if (!nlh) { + err = devlink_nl_msg_reply_and_new(&skb, info); + if (err) + return err; + goto send_done; + } + return genlmsg_reply(skb, info); + +nla_put_failure: + err = -EMSGSIZE; +err_resource_put: + nlmsg_free(skb); + return err; +} + +int devlink_nl_cmd_resource_dump(struct sk_buff *skb, struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + + if (list_empty(&devlink->resource_list)) + return -EOPNOTSUPP; + + return devlink_resource_fill(info, DEVLINK_CMD_RESOURCE_DUMP, 0); +} + +int devlink_resources_validate(struct devlink *devlink, + struct devlink_resource *resource, + struct genl_info *info) +{ + struct list_head *resource_list; + int err = 0; + + if (resource) + resource_list = &resource->resource_list; + else + resource_list = &devlink->resource_list; + + list_for_each_entry(resource, resource_list, list) { + if (!resource->size_valid) + return -EINVAL; + err = devlink_resources_validate(devlink, resource, info); + if (err) + return err; + } + return err; +} + +/** + * devl_resource_register - devlink resource register + * + * @devlink: devlink + * @resource_name: resource's name + * @resource_size: resource's size + * @resource_id: resource's id + * @parent_resource_id: resource's parent id + * @size_params: size parameters + * + * Generic resources should reuse the same names across drivers. 
+ * Please see the generic resources list at: + * Documentation/networking/devlink/devlink-resource.rst + */ +int devl_resource_register(struct devlink *devlink, + const char *resource_name, + u64 resource_size, + u64 resource_id, + u64 parent_resource_id, + const struct devlink_resource_size_params *size_params) +{ + struct devlink_resource *resource; + struct list_head *resource_list; + bool top_hierarchy; + + lockdep_assert_held(&devlink->lock); + + top_hierarchy = parent_resource_id == DEVLINK_RESOURCE_ID_PARENT_TOP; + + resource = devlink_resource_find(devlink, NULL, resource_id); + if (resource) + return -EINVAL; + + resource = kzalloc(sizeof(*resource), GFP_KERNEL); + if (!resource) + return -ENOMEM; + + if (top_hierarchy) { + resource_list = &devlink->resource_list; + } else { + struct devlink_resource *parent_resource; + + parent_resource = devlink_resource_find(devlink, NULL, + parent_resource_id); + if (parent_resource) { + resource_list = &parent_resource->resource_list; + resource->parent = parent_resource; + } else { + kfree(resource); + return -EINVAL; + } + } + + resource->name = resource_name; + resource->size = resource_size; + resource->size_new = resource_size; + resource->id = resource_id; + resource->size_valid = true; + memcpy(&resource->size_params, size_params, + sizeof(resource->size_params)); + INIT_LIST_HEAD(&resource->resource_list); + list_add_tail(&resource->list, resource_list); + + return 0; +} +EXPORT_SYMBOL_GPL(devl_resource_register); + +/** + * devlink_resource_register - devlink resource register + * + * @devlink: devlink + * @resource_name: resource's name + * @resource_size: resource's size + * @resource_id: resource's id + * @parent_resource_id: resource's parent id + * @size_params: size parameters + * + * Generic resources should reuse the same names across drivers. + * Please see the generic resources list at: + * Documentation/networking/devlink/devlink-resource.rst + * + * Context: Takes and release devlink->lock . + */ +int devlink_resource_register(struct devlink *devlink, + const char *resource_name, + u64 resource_size, + u64 resource_id, + u64 parent_resource_id, + const struct devlink_resource_size_params *size_params) +{ + int err; + + devl_lock(devlink); + err = devl_resource_register(devlink, resource_name, resource_size, + resource_id, parent_resource_id, size_params); + devl_unlock(devlink); + return err; +} +EXPORT_SYMBOL_GPL(devlink_resource_register); + +static void devlink_resource_unregister(struct devlink *devlink, + struct devlink_resource *resource) +{ + struct devlink_resource *tmp, *child_resource; + + list_for_each_entry_safe(child_resource, tmp, &resource->resource_list, + list) { + devlink_resource_unregister(devlink, child_resource); + list_del(&child_resource->list); + kfree(child_resource); + } +} + +/** + * devl_resources_unregister - free all resources + * + * @devlink: devlink + */ +void devl_resources_unregister(struct devlink *devlink) +{ + struct devlink_resource *tmp, *child_resource; + + lockdep_assert_held(&devlink->lock); + + list_for_each_entry_safe(child_resource, tmp, &devlink->resource_list, + list) { + devlink_resource_unregister(devlink, child_resource); + list_del(&child_resource->list); + kfree(child_resource); + } +} +EXPORT_SYMBOL_GPL(devl_resources_unregister); + +/** + * devlink_resources_unregister - free all resources + * + * @devlink: devlink + * + * Context: Takes and release devlink->lock . 
+ */ +void devlink_resources_unregister(struct devlink *devlink) +{ + devl_lock(devlink); + devl_resources_unregister(devlink); + devl_unlock(devlink); +} +EXPORT_SYMBOL_GPL(devlink_resources_unregister); + +/** + * devl_resource_size_get - get and update size + * + * @devlink: devlink + * @resource_id: the requested resource id + * @p_resource_size: ptr to update + */ +int devl_resource_size_get(struct devlink *devlink, + u64 resource_id, + u64 *p_resource_size) +{ + struct devlink_resource *resource; + + lockdep_assert_held(&devlink->lock); + + resource = devlink_resource_find(devlink, NULL, resource_id); + if (!resource) + return -EINVAL; + *p_resource_size = resource->size_new; + resource->size = resource->size_new; + return 0; +} +EXPORT_SYMBOL_GPL(devl_resource_size_get); + +/** + * devl_resource_occ_get_register - register occupancy getter + * + * @devlink: devlink + * @resource_id: resource id + * @occ_get: occupancy getter callback + * @occ_get_priv: occupancy getter callback priv + */ +void devl_resource_occ_get_register(struct devlink *devlink, + u64 resource_id, + devlink_resource_occ_get_t *occ_get, + void *occ_get_priv) +{ + struct devlink_resource *resource; + + lockdep_assert_held(&devlink->lock); + + resource = devlink_resource_find(devlink, NULL, resource_id); + if (WARN_ON(!resource)) + return; + WARN_ON(resource->occ_get); + + resource->occ_get = occ_get; + resource->occ_get_priv = occ_get_priv; +} +EXPORT_SYMBOL_GPL(devl_resource_occ_get_register); + +/** + * devlink_resource_occ_get_register - register occupancy getter + * + * @devlink: devlink + * @resource_id: resource id + * @occ_get: occupancy getter callback + * @occ_get_priv: occupancy getter callback priv + * + * Context: Takes and release devlink->lock . + */ +void devlink_resource_occ_get_register(struct devlink *devlink, + u64 resource_id, + devlink_resource_occ_get_t *occ_get, + void *occ_get_priv) +{ + devl_lock(devlink); + devl_resource_occ_get_register(devlink, resource_id, + occ_get, occ_get_priv); + devl_unlock(devlink); +} +EXPORT_SYMBOL_GPL(devlink_resource_occ_get_register); + +/** + * devl_resource_occ_get_unregister - unregister occupancy getter + * + * @devlink: devlink + * @resource_id: resource id + */ +void devl_resource_occ_get_unregister(struct devlink *devlink, + u64 resource_id) +{ + struct devlink_resource *resource; + + lockdep_assert_held(&devlink->lock); + + resource = devlink_resource_find(devlink, NULL, resource_id); + if (WARN_ON(!resource)) + return; + WARN_ON(!resource->occ_get); + + resource->occ_get = NULL; + resource->occ_get_priv = NULL; +} +EXPORT_SYMBOL_GPL(devl_resource_occ_get_unregister); + +/** + * devlink_resource_occ_get_unregister - unregister occupancy getter + * + * @devlink: devlink + * @resource_id: resource id + * + * Context: Takes and release devlink->lock . + */ +void devlink_resource_occ_get_unregister(struct devlink *devlink, + u64 resource_id) +{ + devl_lock(devlink); + devl_resource_occ_get_unregister(devlink, resource_id); + devl_unlock(devlink); +} +EXPORT_SYMBOL_GPL(devlink_resource_occ_get_unregister); -- cgit v1.2.3 From 830c41e1e987d9745a01b06c5f86434df8bd5f58 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 28 Aug 2023 08:16:49 +0200 Subject: devlink: push param related code into separate file Cut out another chunk from leftover.c and put param related code into a separate file. 
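For reference, a minimal sketch of a driver-defined parameter consuming the API moved here. The struct devlink_param field names and the get/set prototypes follow the code in this patch (set is still called without extack at this point); the parameter id and name, the values and the devlink_params_register() registration call are illustrative assumptions:

/* Illustrative only: a hypothetical driver-specific runtime parameter.
 * Driver params must use an id above DEVLINK_PARAM_GENERIC_ID_MAX, as
 * enforced by devlink_param_driver_verify().
 */
#include <net/devlink.h>

enum {
	FOO_DEVLINK_PARAM_ID_QUEUE_DEPTH = DEVLINK_PARAM_GENERIC_ID_MAX + 1,
};

static int foo_queue_depth_get(struct devlink *devlink, u32 id,
			       struct devlink_param_gset_ctx *ctx)
{
	ctx->val.vu32 = 1024;	/* report the currently applied value */
	return 0;
}

static int foo_queue_depth_set(struct devlink *devlink, u32 id,
			       struct devlink_param_gset_ctx *ctx)
{
	/* apply ctx->val.vu32 to the device */
	return 0;
}

static const struct devlink_param foo_devlink_params[] = {
	{
		.id		= FOO_DEVLINK_PARAM_ID_QUEUE_DEPTH,
		.name		= "queue_depth",
		.type		= DEVLINK_PARAM_TYPE_U32,
		.supported_cmodes = BIT(DEVLINK_PARAM_CMODE_RUNTIME),
		.get		= foo_queue_depth_get,
		.set		= foo_queue_depth_set,
	},
};

static int foo_params_register(struct devlink *devlink)
{
	/* Registration helper named here as an assumption; it is outside
	 * the scope of this diff.
	 */
	return devlink_params_register(devlink, foo_devlink_params,
				       ARRAY_SIZE(foo_devlink_params));
}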
Signed-off-by: Jiri Pirko Link: https://lore.kernel.org/r/20230828061657.300667-8-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- net/devlink/Makefile | 2 +- net/devlink/devl_internal.h | 9 + net/devlink/leftover.c | 859 ------------------------------------------- net/devlink/param.c | 865 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 875 insertions(+), 860 deletions(-) create mode 100644 net/devlink/param.c (limited to 'net') diff --git a/net/devlink/Makefile b/net/devlink/Makefile index 8864cd933acb..c4b29f603a1f 100644 --- a/net/devlink/Makefile +++ b/net/devlink/Makefile @@ -1,4 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 obj-y := leftover.o core.o netlink.o netlink_gen.o dev.o port.o sb.o dpipe.o \ - resource.o health.o + resource.o param.o health.o diff --git a/net/devlink/devl_internal.h b/net/devlink/devl_internal.h index e5c0acf7a758..e0304d607978 100644 --- a/net/devlink/devl_internal.h +++ b/net/devlink/devl_internal.h @@ -156,6 +156,8 @@ int devlink_nl_msg_reply_and_new(struct sk_buff **msg, struct genl_info *info); void devlink_notify(struct devlink *devlink, enum devlink_command cmd); void devlink_ports_notify_register(struct devlink *devlink); void devlink_ports_notify_unregister(struct devlink *devlink); +void devlink_params_notify_register(struct devlink *devlink); +void devlink_params_notify_unregister(struct devlink *devlink); /* Ports */ #define ASSERT_DEVLINK_PORT_INITIALIZED(devlink_port) \ @@ -240,6 +242,13 @@ int devlink_nl_cmd_dpipe_table_counters_set(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_resource_set(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_resource_dump(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_cmd_param_set_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_cmd_port_param_get_dumpit(struct sk_buff *msg, + struct netlink_callback *cb); +int devlink_nl_cmd_port_param_get_doit(struct sk_buff *skb, + struct genl_info *info); +int devlink_nl_cmd_port_param_set_doit(struct sk_buff *skb, + struct genl_info *info); int devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_health_reporter_recover_doit(struct sk_buff *skb, diff --git a/net/devlink/leftover.c b/net/devlink/leftover.c index 1c65449f2252..2f1dfd27d1c8 100644 --- a/net/devlink/leftover.c +++ b/net/devlink/leftover.c @@ -1061,611 +1061,6 @@ int devlink_rate_nodes_check(struct devlink *devlink, u16 mode, return 0; } -static const struct devlink_param devlink_param_generic[] = { - { - .id = DEVLINK_PARAM_GENERIC_ID_INT_ERR_RESET, - .name = DEVLINK_PARAM_GENERIC_INT_ERR_RESET_NAME, - .type = DEVLINK_PARAM_GENERIC_INT_ERR_RESET_TYPE, - }, - { - .id = DEVLINK_PARAM_GENERIC_ID_MAX_MACS, - .name = DEVLINK_PARAM_GENERIC_MAX_MACS_NAME, - .type = DEVLINK_PARAM_GENERIC_MAX_MACS_TYPE, - }, - { - .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_SRIOV, - .name = DEVLINK_PARAM_GENERIC_ENABLE_SRIOV_NAME, - .type = DEVLINK_PARAM_GENERIC_ENABLE_SRIOV_TYPE, - }, - { - .id = DEVLINK_PARAM_GENERIC_ID_REGION_SNAPSHOT, - .name = DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_NAME, - .type = DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_TYPE, - }, - { - .id = DEVLINK_PARAM_GENERIC_ID_IGNORE_ARI, - .name = DEVLINK_PARAM_GENERIC_IGNORE_ARI_NAME, - .type = DEVLINK_PARAM_GENERIC_IGNORE_ARI_TYPE, - }, - { - .id = DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MAX, - .name = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MAX_NAME, - .type = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MAX_TYPE, - }, - { - .id = 
DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MIN, - .name = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_NAME, - .type = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_TYPE, - }, - { - .id = DEVLINK_PARAM_GENERIC_ID_FW_LOAD_POLICY, - .name = DEVLINK_PARAM_GENERIC_FW_LOAD_POLICY_NAME, - .type = DEVLINK_PARAM_GENERIC_FW_LOAD_POLICY_TYPE, - }, - { - .id = DEVLINK_PARAM_GENERIC_ID_RESET_DEV_ON_DRV_PROBE, - .name = DEVLINK_PARAM_GENERIC_RESET_DEV_ON_DRV_PROBE_NAME, - .type = DEVLINK_PARAM_GENERIC_RESET_DEV_ON_DRV_PROBE_TYPE, - }, - { - .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE, - .name = DEVLINK_PARAM_GENERIC_ENABLE_ROCE_NAME, - .type = DEVLINK_PARAM_GENERIC_ENABLE_ROCE_TYPE, - }, - { - .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_REMOTE_DEV_RESET, - .name = DEVLINK_PARAM_GENERIC_ENABLE_REMOTE_DEV_RESET_NAME, - .type = DEVLINK_PARAM_GENERIC_ENABLE_REMOTE_DEV_RESET_TYPE, - }, - { - .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_ETH, - .name = DEVLINK_PARAM_GENERIC_ENABLE_ETH_NAME, - .type = DEVLINK_PARAM_GENERIC_ENABLE_ETH_TYPE, - }, - { - .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_RDMA, - .name = DEVLINK_PARAM_GENERIC_ENABLE_RDMA_NAME, - .type = DEVLINK_PARAM_GENERIC_ENABLE_RDMA_TYPE, - }, - { - .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_VNET, - .name = DEVLINK_PARAM_GENERIC_ENABLE_VNET_NAME, - .type = DEVLINK_PARAM_GENERIC_ENABLE_VNET_TYPE, - }, - { - .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_IWARP, - .name = DEVLINK_PARAM_GENERIC_ENABLE_IWARP_NAME, - .type = DEVLINK_PARAM_GENERIC_ENABLE_IWARP_TYPE, - }, - { - .id = DEVLINK_PARAM_GENERIC_ID_IO_EQ_SIZE, - .name = DEVLINK_PARAM_GENERIC_IO_EQ_SIZE_NAME, - .type = DEVLINK_PARAM_GENERIC_IO_EQ_SIZE_TYPE, - }, - { - .id = DEVLINK_PARAM_GENERIC_ID_EVENT_EQ_SIZE, - .name = DEVLINK_PARAM_GENERIC_EVENT_EQ_SIZE_NAME, - .type = DEVLINK_PARAM_GENERIC_EVENT_EQ_SIZE_TYPE, - }, -}; - -static int devlink_param_generic_verify(const struct devlink_param *param) -{ - /* verify it match generic parameter by id and name */ - if (param->id > DEVLINK_PARAM_GENERIC_ID_MAX) - return -EINVAL; - if (strcmp(param->name, devlink_param_generic[param->id].name)) - return -ENOENT; - - WARN_ON(param->type != devlink_param_generic[param->id].type); - - return 0; -} - -static int devlink_param_driver_verify(const struct devlink_param *param) -{ - int i; - - if (param->id <= DEVLINK_PARAM_GENERIC_ID_MAX) - return -EINVAL; - /* verify no such name in generic params */ - for (i = 0; i <= DEVLINK_PARAM_GENERIC_ID_MAX; i++) - if (!strcmp(param->name, devlink_param_generic[i].name)) - return -EEXIST; - - return 0; -} - -static struct devlink_param_item * -devlink_param_find_by_name(struct xarray *params, const char *param_name) -{ - struct devlink_param_item *param_item; - unsigned long param_id; - - xa_for_each(params, param_id, param_item) { - if (!strcmp(param_item->param->name, param_name)) - return param_item; - } - return NULL; -} - -static struct devlink_param_item * -devlink_param_find_by_id(struct xarray *params, u32 param_id) -{ - return xa_load(params, param_id); -} - -static bool -devlink_param_cmode_is_supported(const struct devlink_param *param, - enum devlink_param_cmode cmode) -{ - return test_bit(cmode, ¶m->supported_cmodes); -} - -static int devlink_param_get(struct devlink *devlink, - const struct devlink_param *param, - struct devlink_param_gset_ctx *ctx) -{ - if (!param->get) - return -EOPNOTSUPP; - return param->get(devlink, param->id, ctx); -} - -static int devlink_param_set(struct devlink *devlink, - const struct devlink_param *param, - struct devlink_param_gset_ctx *ctx) -{ - if (!param->set) - return 
-EOPNOTSUPP; - return param->set(devlink, param->id, ctx); -} - -static int -devlink_param_type_to_nla_type(enum devlink_param_type param_type) -{ - switch (param_type) { - case DEVLINK_PARAM_TYPE_U8: - return NLA_U8; - case DEVLINK_PARAM_TYPE_U16: - return NLA_U16; - case DEVLINK_PARAM_TYPE_U32: - return NLA_U32; - case DEVLINK_PARAM_TYPE_STRING: - return NLA_STRING; - case DEVLINK_PARAM_TYPE_BOOL: - return NLA_FLAG; - default: - return -EINVAL; - } -} - -static int -devlink_nl_param_value_fill_one(struct sk_buff *msg, - enum devlink_param_type type, - enum devlink_param_cmode cmode, - union devlink_param_value val) -{ - struct nlattr *param_value_attr; - - param_value_attr = nla_nest_start_noflag(msg, - DEVLINK_ATTR_PARAM_VALUE); - if (!param_value_attr) - goto nla_put_failure; - - if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_VALUE_CMODE, cmode)) - goto value_nest_cancel; - - switch (type) { - case DEVLINK_PARAM_TYPE_U8: - if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu8)) - goto value_nest_cancel; - break; - case DEVLINK_PARAM_TYPE_U16: - if (nla_put_u16(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu16)) - goto value_nest_cancel; - break; - case DEVLINK_PARAM_TYPE_U32: - if (nla_put_u32(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu32)) - goto value_nest_cancel; - break; - case DEVLINK_PARAM_TYPE_STRING: - if (nla_put_string(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, - val.vstr)) - goto value_nest_cancel; - break; - case DEVLINK_PARAM_TYPE_BOOL: - if (val.vbool && - nla_put_flag(msg, DEVLINK_ATTR_PARAM_VALUE_DATA)) - goto value_nest_cancel; - break; - } - - nla_nest_end(msg, param_value_attr); - return 0; - -value_nest_cancel: - nla_nest_cancel(msg, param_value_attr); -nla_put_failure: - return -EMSGSIZE; -} - -static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink *devlink, - unsigned int port_index, - struct devlink_param_item *param_item, - enum devlink_command cmd, - u32 portid, u32 seq, int flags) -{ - union devlink_param_value param_value[DEVLINK_PARAM_CMODE_MAX + 1]; - bool param_value_set[DEVLINK_PARAM_CMODE_MAX + 1] = {}; - const struct devlink_param *param = param_item->param; - struct devlink_param_gset_ctx ctx; - struct nlattr *param_values_list; - struct nlattr *param_attr; - int nla_type; - void *hdr; - int err; - int i; - - /* Get value from driver part to driverinit configuration mode */ - for (i = 0; i <= DEVLINK_PARAM_CMODE_MAX; i++) { - if (!devlink_param_cmode_is_supported(param, i)) - continue; - if (i == DEVLINK_PARAM_CMODE_DRIVERINIT) { - if (param_item->driverinit_value_new_valid) - param_value[i] = param_item->driverinit_value_new; - else if (param_item->driverinit_value_valid) - param_value[i] = param_item->driverinit_value; - else - return -EOPNOTSUPP; - } else { - ctx.cmode = i; - err = devlink_param_get(devlink, param, &ctx); - if (err) - return err; - param_value[i] = ctx.val; - } - param_value_set[i] = true; - } - - hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); - if (!hdr) - return -EMSGSIZE; - - if (devlink_nl_put_handle(msg, devlink)) - goto genlmsg_cancel; - - if (cmd == DEVLINK_CMD_PORT_PARAM_GET || - cmd == DEVLINK_CMD_PORT_PARAM_NEW || - cmd == DEVLINK_CMD_PORT_PARAM_DEL) - if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, port_index)) - goto genlmsg_cancel; - - param_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_PARAM); - if (!param_attr) - goto genlmsg_cancel; - if (nla_put_string(msg, DEVLINK_ATTR_PARAM_NAME, param->name)) - goto param_nest_cancel; - if (param->generic && nla_put_flag(msg, DEVLINK_ATTR_PARAM_GENERIC)) - goto 
param_nest_cancel; - - nla_type = devlink_param_type_to_nla_type(param->type); - if (nla_type < 0) - goto param_nest_cancel; - if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_TYPE, nla_type)) - goto param_nest_cancel; - - param_values_list = nla_nest_start_noflag(msg, - DEVLINK_ATTR_PARAM_VALUES_LIST); - if (!param_values_list) - goto param_nest_cancel; - - for (i = 0; i <= DEVLINK_PARAM_CMODE_MAX; i++) { - if (!param_value_set[i]) - continue; - err = devlink_nl_param_value_fill_one(msg, param->type, - i, param_value[i]); - if (err) - goto values_list_nest_cancel; - } - - nla_nest_end(msg, param_values_list); - nla_nest_end(msg, param_attr); - genlmsg_end(msg, hdr); - return 0; - -values_list_nest_cancel: - nla_nest_end(msg, param_values_list); -param_nest_cancel: - nla_nest_cancel(msg, param_attr); -genlmsg_cancel: - genlmsg_cancel(msg, hdr); - return -EMSGSIZE; -} - -static void devlink_param_notify(struct devlink *devlink, - unsigned int port_index, - struct devlink_param_item *param_item, - enum devlink_command cmd) -{ - struct sk_buff *msg; - int err; - - WARN_ON(cmd != DEVLINK_CMD_PARAM_NEW && cmd != DEVLINK_CMD_PARAM_DEL && - cmd != DEVLINK_CMD_PORT_PARAM_NEW && - cmd != DEVLINK_CMD_PORT_PARAM_DEL); - - /* devlink_notify_register() / devlink_notify_unregister() - * will replay the notifications if the params are added/removed - * outside of the lifetime of the instance. - */ - if (!devl_is_registered(devlink)) - return; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return; - err = devlink_nl_param_fill(msg, devlink, port_index, param_item, cmd, - 0, 0, 0); - if (err) { - nlmsg_free(msg); - return; - } - - genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), - msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); -} - -static void devlink_params_notify(struct devlink *devlink, - enum devlink_command cmd) -{ - struct devlink_param_item *param_item; - unsigned long param_id; - - xa_for_each(&devlink->params, param_id, param_item) - devlink_param_notify(devlink, 0, param_item, cmd); -} - -static void devlink_params_notify_register(struct devlink *devlink) -{ - devlink_params_notify(devlink, DEVLINK_CMD_PARAM_NEW); -} - -static void devlink_params_notify_unregister(struct devlink *devlink) -{ - devlink_params_notify(devlink, DEVLINK_CMD_PARAM_DEL); -} - -static int devlink_nl_param_get_dump_one(struct sk_buff *msg, - struct devlink *devlink, - struct netlink_callback *cb, - int flags) -{ - struct devlink_nl_dump_state *state = devlink_dump_state(cb); - struct devlink_param_item *param_item; - unsigned long param_id; - int err = 0; - - xa_for_each_start(&devlink->params, param_id, param_item, state->idx) { - err = devlink_nl_param_fill(msg, devlink, 0, param_item, - DEVLINK_CMD_PARAM_GET, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, flags); - if (err == -EOPNOTSUPP) { - err = 0; - } else if (err) { - state->idx = param_id; - break; - } - } - - return err; -} - -int devlink_nl_param_get_dumpit(struct sk_buff *skb, - struct netlink_callback *cb) -{ - return devlink_nl_dumpit(skb, cb, devlink_nl_param_get_dump_one); -} - -static int -devlink_param_type_get_from_info(struct genl_info *info, - enum devlink_param_type *param_type) -{ - if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PARAM_TYPE)) - return -EINVAL; - - switch (nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_TYPE])) { - case NLA_U8: - *param_type = DEVLINK_PARAM_TYPE_U8; - break; - case NLA_U16: - *param_type = DEVLINK_PARAM_TYPE_U16; - break; - case NLA_U32: - *param_type = DEVLINK_PARAM_TYPE_U32; - break; - case 
NLA_STRING: - *param_type = DEVLINK_PARAM_TYPE_STRING; - break; - case NLA_FLAG: - *param_type = DEVLINK_PARAM_TYPE_BOOL; - break; - default: - return -EINVAL; - } - - return 0; -} - -static int -devlink_param_value_get_from_info(const struct devlink_param *param, - struct genl_info *info, - union devlink_param_value *value) -{ - struct nlattr *param_data; - int len; - - param_data = info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]; - - if (param->type != DEVLINK_PARAM_TYPE_BOOL && !param_data) - return -EINVAL; - - switch (param->type) { - case DEVLINK_PARAM_TYPE_U8: - if (nla_len(param_data) != sizeof(u8)) - return -EINVAL; - value->vu8 = nla_get_u8(param_data); - break; - case DEVLINK_PARAM_TYPE_U16: - if (nla_len(param_data) != sizeof(u16)) - return -EINVAL; - value->vu16 = nla_get_u16(param_data); - break; - case DEVLINK_PARAM_TYPE_U32: - if (nla_len(param_data) != sizeof(u32)) - return -EINVAL; - value->vu32 = nla_get_u32(param_data); - break; - case DEVLINK_PARAM_TYPE_STRING: - len = strnlen(nla_data(param_data), nla_len(param_data)); - if (len == nla_len(param_data) || - len >= __DEVLINK_PARAM_MAX_STRING_VALUE) - return -EINVAL; - strcpy(value->vstr, nla_data(param_data)); - break; - case DEVLINK_PARAM_TYPE_BOOL: - if (param_data && nla_len(param_data)) - return -EINVAL; - value->vbool = nla_get_flag(param_data); - break; - } - return 0; -} - -static struct devlink_param_item * -devlink_param_get_from_info(struct xarray *params, struct genl_info *info) -{ - char *param_name; - - if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PARAM_NAME)) - return NULL; - - param_name = nla_data(info->attrs[DEVLINK_ATTR_PARAM_NAME]); - return devlink_param_find_by_name(params, param_name); -} - -int devlink_nl_param_get_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - struct devlink_param_item *param_item; - struct sk_buff *msg; - int err; - - param_item = devlink_param_get_from_info(&devlink->params, info); - if (!param_item) - return -EINVAL; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - err = devlink_nl_param_fill(msg, devlink, 0, param_item, - DEVLINK_CMD_PARAM_GET, - info->snd_portid, info->snd_seq, 0); - if (err) { - nlmsg_free(msg); - return err; - } - - return genlmsg_reply(msg, info); -} - -static int __devlink_nl_cmd_param_set_doit(struct devlink *devlink, - unsigned int port_index, - struct xarray *params, - struct genl_info *info, - enum devlink_command cmd) -{ - enum devlink_param_type param_type; - struct devlink_param_gset_ctx ctx; - enum devlink_param_cmode cmode; - struct devlink_param_item *param_item; - const struct devlink_param *param; - union devlink_param_value value; - int err = 0; - - param_item = devlink_param_get_from_info(params, info); - if (!param_item) - return -EINVAL; - param = param_item->param; - err = devlink_param_type_get_from_info(info, ¶m_type); - if (err) - return err; - if (param_type != param->type) - return -EINVAL; - err = devlink_param_value_get_from_info(param, info, &value); - if (err) - return err; - if (param->validate) { - err = param->validate(devlink, param->id, value, info->extack); - if (err) - return err; - } - - if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PARAM_VALUE_CMODE)) - return -EINVAL; - cmode = nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_VALUE_CMODE]); - if (!devlink_param_cmode_is_supported(param, cmode)) - return -EOPNOTSUPP; - - if (cmode == DEVLINK_PARAM_CMODE_DRIVERINIT) { - param_item->driverinit_value_new = value; - 
param_item->driverinit_value_new_valid = true; - } else { - if (!param->set) - return -EOPNOTSUPP; - ctx.val = value; - ctx.cmode = cmode; - err = devlink_param_set(devlink, param, &ctx); - if (err) - return err; - } - - devlink_param_notify(devlink, port_index, param_item, cmd); - return 0; -} - -static int devlink_nl_cmd_param_set_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - - return __devlink_nl_cmd_param_set_doit(devlink, 0, &devlink->params, - info, DEVLINK_CMD_PARAM_NEW); -} - -static int devlink_nl_cmd_port_param_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) -{ - NL_SET_ERR_MSG(cb->extack, "Port params are not supported"); - return msg->len; -} - -static int devlink_nl_cmd_port_param_get_doit(struct sk_buff *skb, - struct genl_info *info) -{ - NL_SET_ERR_MSG(info->extack, "Port params are not supported"); - return -EINVAL; -} - -static int devlink_nl_cmd_port_param_set_doit(struct sk_buff *skb, - struct genl_info *info) -{ - NL_SET_ERR_MSG(info->extack, "Port params are not supported"); - return -EINVAL; -} - static int devlink_nl_region_snapshot_id_put(struct sk_buff *msg, struct devlink *devlink, struct devlink_snapshot *snapshot) @@ -4216,260 +3611,6 @@ void devlink_linecard_nested_dl_set(struct devlink_linecard *linecard, } EXPORT_SYMBOL_GPL(devlink_linecard_nested_dl_set); -static int devlink_param_verify(const struct devlink_param *param) -{ - if (!param || !param->name || !param->supported_cmodes) - return -EINVAL; - if (param->generic) - return devlink_param_generic_verify(param); - else - return devlink_param_driver_verify(param); -} - -static int devlink_param_register(struct devlink *devlink, - const struct devlink_param *param) -{ - struct devlink_param_item *param_item; - int err; - - WARN_ON(devlink_param_verify(param)); - WARN_ON(devlink_param_find_by_name(&devlink->params, param->name)); - - if (param->supported_cmodes == BIT(DEVLINK_PARAM_CMODE_DRIVERINIT)) - WARN_ON(param->get || param->set); - else - WARN_ON(!param->get || !param->set); - - param_item = kzalloc(sizeof(*param_item), GFP_KERNEL); - if (!param_item) - return -ENOMEM; - - param_item->param = param; - - err = xa_insert(&devlink->params, param->id, param_item, GFP_KERNEL); - if (err) - goto err_xa_insert; - - devlink_param_notify(devlink, 0, param_item, DEVLINK_CMD_PARAM_NEW); - return 0; - -err_xa_insert: - kfree(param_item); - return err; -} - -static void devlink_param_unregister(struct devlink *devlink, - const struct devlink_param *param) -{ - struct devlink_param_item *param_item; - - param_item = devlink_param_find_by_id(&devlink->params, param->id); - if (WARN_ON(!param_item)) - return; - devlink_param_notify(devlink, 0, param_item, DEVLINK_CMD_PARAM_DEL); - xa_erase(&devlink->params, param->id); - kfree(param_item); -} - -/** - * devl_params_register - register configuration parameters - * - * @devlink: devlink - * @params: configuration parameters array - * @params_count: number of parameters provided - * - * Register the configuration parameters supported by the driver. 
- */ -int devl_params_register(struct devlink *devlink, - const struct devlink_param *params, - size_t params_count) -{ - const struct devlink_param *param = params; - int i, err; - - lockdep_assert_held(&devlink->lock); - - for (i = 0; i < params_count; i++, param++) { - err = devlink_param_register(devlink, param); - if (err) - goto rollback; - } - return 0; - -rollback: - if (!i) - return err; - - for (param--; i > 0; i--, param--) - devlink_param_unregister(devlink, param); - return err; -} -EXPORT_SYMBOL_GPL(devl_params_register); - -int devlink_params_register(struct devlink *devlink, - const struct devlink_param *params, - size_t params_count) -{ - int err; - - devl_lock(devlink); - err = devl_params_register(devlink, params, params_count); - devl_unlock(devlink); - return err; -} -EXPORT_SYMBOL_GPL(devlink_params_register); - -/** - * devl_params_unregister - unregister configuration parameters - * @devlink: devlink - * @params: configuration parameters to unregister - * @params_count: number of parameters provided - */ -void devl_params_unregister(struct devlink *devlink, - const struct devlink_param *params, - size_t params_count) -{ - const struct devlink_param *param = params; - int i; - - lockdep_assert_held(&devlink->lock); - - for (i = 0; i < params_count; i++, param++) - devlink_param_unregister(devlink, param); -} -EXPORT_SYMBOL_GPL(devl_params_unregister); - -void devlink_params_unregister(struct devlink *devlink, - const struct devlink_param *params, - size_t params_count) -{ - devl_lock(devlink); - devl_params_unregister(devlink, params, params_count); - devl_unlock(devlink); -} -EXPORT_SYMBOL_GPL(devlink_params_unregister); - -/** - * devl_param_driverinit_value_get - get configuration parameter - * value for driver initializing - * - * @devlink: devlink - * @param_id: parameter ID - * @val: pointer to store the value of parameter in driverinit - * configuration mode - * - * This function should be used by the driver to get driverinit - * configuration for initialization after reload command. - * - * Note that lockless call of this function relies on the - * driver to maintain following basic sane behavior: - * 1) Driver ensures a call to this function cannot race with - * registering/unregistering the parameter with the same parameter ID. - * 2) Driver ensures a call to this function cannot race with - * devl_param_driverinit_value_set() call with the same parameter ID. - * 3) Driver ensures a call to this function cannot race with - * reload operation. - * If the driver is not able to comply, it has to take the devlink->lock - * while calling this. 
- */ -int devl_param_driverinit_value_get(struct devlink *devlink, u32 param_id, - union devlink_param_value *val) -{ - struct devlink_param_item *param_item; - - if (WARN_ON(!devlink_reload_supported(devlink->ops))) - return -EOPNOTSUPP; - - param_item = devlink_param_find_by_id(&devlink->params, param_id); - if (!param_item) - return -EINVAL; - - if (!param_item->driverinit_value_valid) - return -EOPNOTSUPP; - - if (WARN_ON(!devlink_param_cmode_is_supported(param_item->param, - DEVLINK_PARAM_CMODE_DRIVERINIT))) - return -EOPNOTSUPP; - - *val = param_item->driverinit_value; - - return 0; -} -EXPORT_SYMBOL_GPL(devl_param_driverinit_value_get); - -/** - * devl_param_driverinit_value_set - set value of configuration - * parameter for driverinit - * configuration mode - * - * @devlink: devlink - * @param_id: parameter ID - * @init_val: value of parameter to set for driverinit configuration mode - * - * This function should be used by the driver to set driverinit - * configuration mode default value. - */ -void devl_param_driverinit_value_set(struct devlink *devlink, u32 param_id, - union devlink_param_value init_val) -{ - struct devlink_param_item *param_item; - - devl_assert_locked(devlink); - - param_item = devlink_param_find_by_id(&devlink->params, param_id); - if (WARN_ON(!param_item)) - return; - - if (WARN_ON(!devlink_param_cmode_is_supported(param_item->param, - DEVLINK_PARAM_CMODE_DRIVERINIT))) - return; - - param_item->driverinit_value = init_val; - param_item->driverinit_value_valid = true; - - devlink_param_notify(devlink, 0, param_item, DEVLINK_CMD_PARAM_NEW); -} -EXPORT_SYMBOL_GPL(devl_param_driverinit_value_set); - -void devlink_params_driverinit_load_new(struct devlink *devlink) -{ - struct devlink_param_item *param_item; - unsigned long param_id; - - xa_for_each(&devlink->params, param_id, param_item) { - if (!devlink_param_cmode_is_supported(param_item->param, - DEVLINK_PARAM_CMODE_DRIVERINIT) || - !param_item->driverinit_value_new_valid) - continue; - param_item->driverinit_value = param_item->driverinit_value_new; - param_item->driverinit_value_valid = true; - param_item->driverinit_value_new_valid = false; - } -} - -/** - * devl_param_value_changed - notify devlink on a parameter's value - * change. Should be called by the driver - * right after the change. - * - * @devlink: devlink - * @param_id: parameter ID - * - * This function should be used by the driver to notify devlink on value - * change, excluding driverinit configuration mode. - * For driverinit configuration mode driver should use the function - */ -void devl_param_value_changed(struct devlink *devlink, u32 param_id) -{ - struct devlink_param_item *param_item; - - param_item = devlink_param_find_by_id(&devlink->params, param_id); - WARN_ON(!param_item); - - devlink_param_notify(devlink, 0, param_item, DEVLINK_CMD_PARAM_NEW); -} -EXPORT_SYMBOL_GPL(devl_param_value_changed); - /** * devl_region_create - create a new address region * diff --git a/net/devlink/param.c b/net/devlink/param.c new file mode 100644 index 000000000000..31275f9d4cb7 --- /dev/null +++ b/net/devlink/param.c @@ -0,0 +1,865 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. 
+ * Copyright (c) 2016 Jiri Pirko + */ + +#include "devl_internal.h" + +static const struct devlink_param devlink_param_generic[] = { + { + .id = DEVLINK_PARAM_GENERIC_ID_INT_ERR_RESET, + .name = DEVLINK_PARAM_GENERIC_INT_ERR_RESET_NAME, + .type = DEVLINK_PARAM_GENERIC_INT_ERR_RESET_TYPE, + }, + { + .id = DEVLINK_PARAM_GENERIC_ID_MAX_MACS, + .name = DEVLINK_PARAM_GENERIC_MAX_MACS_NAME, + .type = DEVLINK_PARAM_GENERIC_MAX_MACS_TYPE, + }, + { + .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_SRIOV, + .name = DEVLINK_PARAM_GENERIC_ENABLE_SRIOV_NAME, + .type = DEVLINK_PARAM_GENERIC_ENABLE_SRIOV_TYPE, + }, + { + .id = DEVLINK_PARAM_GENERIC_ID_REGION_SNAPSHOT, + .name = DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_NAME, + .type = DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_TYPE, + }, + { + .id = DEVLINK_PARAM_GENERIC_ID_IGNORE_ARI, + .name = DEVLINK_PARAM_GENERIC_IGNORE_ARI_NAME, + .type = DEVLINK_PARAM_GENERIC_IGNORE_ARI_TYPE, + }, + { + .id = DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MAX, + .name = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MAX_NAME, + .type = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MAX_TYPE, + }, + { + .id = DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MIN, + .name = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_NAME, + .type = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_TYPE, + }, + { + .id = DEVLINK_PARAM_GENERIC_ID_FW_LOAD_POLICY, + .name = DEVLINK_PARAM_GENERIC_FW_LOAD_POLICY_NAME, + .type = DEVLINK_PARAM_GENERIC_FW_LOAD_POLICY_TYPE, + }, + { + .id = DEVLINK_PARAM_GENERIC_ID_RESET_DEV_ON_DRV_PROBE, + .name = DEVLINK_PARAM_GENERIC_RESET_DEV_ON_DRV_PROBE_NAME, + .type = DEVLINK_PARAM_GENERIC_RESET_DEV_ON_DRV_PROBE_TYPE, + }, + { + .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE, + .name = DEVLINK_PARAM_GENERIC_ENABLE_ROCE_NAME, + .type = DEVLINK_PARAM_GENERIC_ENABLE_ROCE_TYPE, + }, + { + .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_REMOTE_DEV_RESET, + .name = DEVLINK_PARAM_GENERIC_ENABLE_REMOTE_DEV_RESET_NAME, + .type = DEVLINK_PARAM_GENERIC_ENABLE_REMOTE_DEV_RESET_TYPE, + }, + { + .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_ETH, + .name = DEVLINK_PARAM_GENERIC_ENABLE_ETH_NAME, + .type = DEVLINK_PARAM_GENERIC_ENABLE_ETH_TYPE, + }, + { + .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_RDMA, + .name = DEVLINK_PARAM_GENERIC_ENABLE_RDMA_NAME, + .type = DEVLINK_PARAM_GENERIC_ENABLE_RDMA_TYPE, + }, + { + .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_VNET, + .name = DEVLINK_PARAM_GENERIC_ENABLE_VNET_NAME, + .type = DEVLINK_PARAM_GENERIC_ENABLE_VNET_TYPE, + }, + { + .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_IWARP, + .name = DEVLINK_PARAM_GENERIC_ENABLE_IWARP_NAME, + .type = DEVLINK_PARAM_GENERIC_ENABLE_IWARP_TYPE, + }, + { + .id = DEVLINK_PARAM_GENERIC_ID_IO_EQ_SIZE, + .name = DEVLINK_PARAM_GENERIC_IO_EQ_SIZE_NAME, + .type = DEVLINK_PARAM_GENERIC_IO_EQ_SIZE_TYPE, + }, + { + .id = DEVLINK_PARAM_GENERIC_ID_EVENT_EQ_SIZE, + .name = DEVLINK_PARAM_GENERIC_EVENT_EQ_SIZE_NAME, + .type = DEVLINK_PARAM_GENERIC_EVENT_EQ_SIZE_TYPE, + }, +}; + +static int devlink_param_generic_verify(const struct devlink_param *param) +{ + /* verify it match generic parameter by id and name */ + if (param->id > DEVLINK_PARAM_GENERIC_ID_MAX) + return -EINVAL; + if (strcmp(param->name, devlink_param_generic[param->id].name)) + return -ENOENT; + + WARN_ON(param->type != devlink_param_generic[param->id].type); + + return 0; +} + +static int devlink_param_driver_verify(const struct devlink_param *param) +{ + int i; + + if (param->id <= DEVLINK_PARAM_GENERIC_ID_MAX) + return -EINVAL; + /* verify no such name in generic params */ + for (i = 0; i <= DEVLINK_PARAM_GENERIC_ID_MAX; i++) + if 
(!strcmp(param->name, devlink_param_generic[i].name)) + return -EEXIST; + + return 0; +} + +static struct devlink_param_item * +devlink_param_find_by_name(struct xarray *params, const char *param_name) +{ + struct devlink_param_item *param_item; + unsigned long param_id; + + xa_for_each(params, param_id, param_item) { + if (!strcmp(param_item->param->name, param_name)) + return param_item; + } + return NULL; +} + +static struct devlink_param_item * +devlink_param_find_by_id(struct xarray *params, u32 param_id) +{ + return xa_load(params, param_id); +} + +static bool +devlink_param_cmode_is_supported(const struct devlink_param *param, + enum devlink_param_cmode cmode) +{ + return test_bit(cmode, ¶m->supported_cmodes); +} + +static int devlink_param_get(struct devlink *devlink, + const struct devlink_param *param, + struct devlink_param_gset_ctx *ctx) +{ + if (!param->get) + return -EOPNOTSUPP; + return param->get(devlink, param->id, ctx); +} + +static int devlink_param_set(struct devlink *devlink, + const struct devlink_param *param, + struct devlink_param_gset_ctx *ctx) +{ + if (!param->set) + return -EOPNOTSUPP; + return param->set(devlink, param->id, ctx); +} + +static int +devlink_param_type_to_nla_type(enum devlink_param_type param_type) +{ + switch (param_type) { + case DEVLINK_PARAM_TYPE_U8: + return NLA_U8; + case DEVLINK_PARAM_TYPE_U16: + return NLA_U16; + case DEVLINK_PARAM_TYPE_U32: + return NLA_U32; + case DEVLINK_PARAM_TYPE_STRING: + return NLA_STRING; + case DEVLINK_PARAM_TYPE_BOOL: + return NLA_FLAG; + default: + return -EINVAL; + } +} + +static int +devlink_nl_param_value_fill_one(struct sk_buff *msg, + enum devlink_param_type type, + enum devlink_param_cmode cmode, + union devlink_param_value val) +{ + struct nlattr *param_value_attr; + + param_value_attr = nla_nest_start_noflag(msg, + DEVLINK_ATTR_PARAM_VALUE); + if (!param_value_attr) + goto nla_put_failure; + + if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_VALUE_CMODE, cmode)) + goto value_nest_cancel; + + switch (type) { + case DEVLINK_PARAM_TYPE_U8: + if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu8)) + goto value_nest_cancel; + break; + case DEVLINK_PARAM_TYPE_U16: + if (nla_put_u16(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu16)) + goto value_nest_cancel; + break; + case DEVLINK_PARAM_TYPE_U32: + if (nla_put_u32(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu32)) + goto value_nest_cancel; + break; + case DEVLINK_PARAM_TYPE_STRING: + if (nla_put_string(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, + val.vstr)) + goto value_nest_cancel; + break; + case DEVLINK_PARAM_TYPE_BOOL: + if (val.vbool && + nla_put_flag(msg, DEVLINK_ATTR_PARAM_VALUE_DATA)) + goto value_nest_cancel; + break; + } + + nla_nest_end(msg, param_value_attr); + return 0; + +value_nest_cancel: + nla_nest_cancel(msg, param_value_attr); +nla_put_failure: + return -EMSGSIZE; +} + +static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink *devlink, + unsigned int port_index, + struct devlink_param_item *param_item, + enum devlink_command cmd, + u32 portid, u32 seq, int flags) +{ + union devlink_param_value param_value[DEVLINK_PARAM_CMODE_MAX + 1]; + bool param_value_set[DEVLINK_PARAM_CMODE_MAX + 1] = {}; + const struct devlink_param *param = param_item->param; + struct devlink_param_gset_ctx ctx; + struct nlattr *param_values_list; + struct nlattr *param_attr; + int nla_type; + void *hdr; + int err; + int i; + + /* Get value from driver part to driverinit configuration mode */ + for (i = 0; i <= DEVLINK_PARAM_CMODE_MAX; i++) { + if 
(!devlink_param_cmode_is_supported(param, i)) + continue; + if (i == DEVLINK_PARAM_CMODE_DRIVERINIT) { + if (param_item->driverinit_value_new_valid) + param_value[i] = param_item->driverinit_value_new; + else if (param_item->driverinit_value_valid) + param_value[i] = param_item->driverinit_value; + else + return -EOPNOTSUPP; + } else { + ctx.cmode = i; + err = devlink_param_get(devlink, param, &ctx); + if (err) + return err; + param_value[i] = ctx.val; + } + param_value_set[i] = true; + } + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto genlmsg_cancel; + + if (cmd == DEVLINK_CMD_PORT_PARAM_GET || + cmd == DEVLINK_CMD_PORT_PARAM_NEW || + cmd == DEVLINK_CMD_PORT_PARAM_DEL) + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, port_index)) + goto genlmsg_cancel; + + param_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_PARAM); + if (!param_attr) + goto genlmsg_cancel; + if (nla_put_string(msg, DEVLINK_ATTR_PARAM_NAME, param->name)) + goto param_nest_cancel; + if (param->generic && nla_put_flag(msg, DEVLINK_ATTR_PARAM_GENERIC)) + goto param_nest_cancel; + + nla_type = devlink_param_type_to_nla_type(param->type); + if (nla_type < 0) + goto param_nest_cancel; + if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_TYPE, nla_type)) + goto param_nest_cancel; + + param_values_list = nla_nest_start_noflag(msg, + DEVLINK_ATTR_PARAM_VALUES_LIST); + if (!param_values_list) + goto param_nest_cancel; + + for (i = 0; i <= DEVLINK_PARAM_CMODE_MAX; i++) { + if (!param_value_set[i]) + continue; + err = devlink_nl_param_value_fill_one(msg, param->type, + i, param_value[i]); + if (err) + goto values_list_nest_cancel; + } + + nla_nest_end(msg, param_values_list); + nla_nest_end(msg, param_attr); + genlmsg_end(msg, hdr); + return 0; + +values_list_nest_cancel: + nla_nest_end(msg, param_values_list); +param_nest_cancel: + nla_nest_cancel(msg, param_attr); +genlmsg_cancel: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static void devlink_param_notify(struct devlink *devlink, + unsigned int port_index, + struct devlink_param_item *param_item, + enum devlink_command cmd) +{ + struct sk_buff *msg; + int err; + + WARN_ON(cmd != DEVLINK_CMD_PARAM_NEW && cmd != DEVLINK_CMD_PARAM_DEL && + cmd != DEVLINK_CMD_PORT_PARAM_NEW && + cmd != DEVLINK_CMD_PORT_PARAM_DEL); + + /* devlink_notify_register() / devlink_notify_unregister() + * will replay the notifications if the params are added/removed + * outside of the lifetime of the instance. 
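/*
 * Illustrative aside, not part of the patch: a rough sketch of the driver
 * ->get()/->set()/->validate() callbacks that devlink_param_get(),
 * devlink_param_set() and the set path in this file invoke for the
 * non-driverinit cmodes. All "foo_*" names are hypothetical; the argument
 * lists follow the call sites shown here.
 */
static int foo_roce_get(struct devlink *devlink, u32 id,
			struct devlink_param_gset_ctx *ctx)
{
	/* Report the current device state back through ctx->val. */
	ctx->val.vbool = true;
	return 0;
}

static int foo_roce_set(struct devlink *devlink, u32 id,
			struct devlink_param_gset_ctx *ctx)
{
	/* Apply ctx->val.vbool to the device; return 0 or a negative errno. */
	return 0;
}

static int foo_roce_validate(struct devlink *devlink, u32 id,
			     union devlink_param_value val,
			     struct netlink_ext_ack *extack)
{
	/* Reject values the device cannot honour before ->set() is called. */
	return 0;
}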
+ */ + if (!devl_is_registered(devlink)) + return; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + err = devlink_nl_param_fill(msg, devlink, port_index, param_item, cmd, + 0, 0, 0); + if (err) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), + msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); +} + +static void devlink_params_notify(struct devlink *devlink, + enum devlink_command cmd) +{ + struct devlink_param_item *param_item; + unsigned long param_id; + + xa_for_each(&devlink->params, param_id, param_item) + devlink_param_notify(devlink, 0, param_item, cmd); +} + +void devlink_params_notify_register(struct devlink *devlink) +{ + devlink_params_notify(devlink, DEVLINK_CMD_PARAM_NEW); +} + +void devlink_params_notify_unregister(struct devlink *devlink) +{ + devlink_params_notify(devlink, DEVLINK_CMD_PARAM_DEL); +} + +static int devlink_nl_param_get_dump_one(struct sk_buff *msg, + struct devlink *devlink, + struct netlink_callback *cb, + int flags) +{ + struct devlink_nl_dump_state *state = devlink_dump_state(cb); + struct devlink_param_item *param_item; + unsigned long param_id; + int err = 0; + + xa_for_each_start(&devlink->params, param_id, param_item, state->idx) { + err = devlink_nl_param_fill(msg, devlink, 0, param_item, + DEVLINK_CMD_PARAM_GET, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, flags); + if (err == -EOPNOTSUPP) { + err = 0; + } else if (err) { + state->idx = param_id; + break; + } + } + + return err; +} + +int devlink_nl_param_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return devlink_nl_dumpit(skb, cb, devlink_nl_param_get_dump_one); +} + +static int +devlink_param_type_get_from_info(struct genl_info *info, + enum devlink_param_type *param_type) +{ + if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PARAM_TYPE)) + return -EINVAL; + + switch (nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_TYPE])) { + case NLA_U8: + *param_type = DEVLINK_PARAM_TYPE_U8; + break; + case NLA_U16: + *param_type = DEVLINK_PARAM_TYPE_U16; + break; + case NLA_U32: + *param_type = DEVLINK_PARAM_TYPE_U32; + break; + case NLA_STRING: + *param_type = DEVLINK_PARAM_TYPE_STRING; + break; + case NLA_FLAG: + *param_type = DEVLINK_PARAM_TYPE_BOOL; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int +devlink_param_value_get_from_info(const struct devlink_param *param, + struct genl_info *info, + union devlink_param_value *value) +{ + struct nlattr *param_data; + int len; + + param_data = info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]; + + if (param->type != DEVLINK_PARAM_TYPE_BOOL && !param_data) + return -EINVAL; + + switch (param->type) { + case DEVLINK_PARAM_TYPE_U8: + if (nla_len(param_data) != sizeof(u8)) + return -EINVAL; + value->vu8 = nla_get_u8(param_data); + break; + case DEVLINK_PARAM_TYPE_U16: + if (nla_len(param_data) != sizeof(u16)) + return -EINVAL; + value->vu16 = nla_get_u16(param_data); + break; + case DEVLINK_PARAM_TYPE_U32: + if (nla_len(param_data) != sizeof(u32)) + return -EINVAL; + value->vu32 = nla_get_u32(param_data); + break; + case DEVLINK_PARAM_TYPE_STRING: + len = strnlen(nla_data(param_data), nla_len(param_data)); + if (len == nla_len(param_data) || + len >= __DEVLINK_PARAM_MAX_STRING_VALUE) + return -EINVAL; + strcpy(value->vstr, nla_data(param_data)); + break; + case DEVLINK_PARAM_TYPE_BOOL: + if (param_data && nla_len(param_data)) + return -EINVAL; + value->vbool = nla_get_flag(param_data); + break; + } + return 0; +} + +static struct devlink_param_item * 
+devlink_param_get_from_info(struct xarray *params, struct genl_info *info) +{ + char *param_name; + + if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PARAM_NAME)) + return NULL; + + param_name = nla_data(info->attrs[DEVLINK_ATTR_PARAM_NAME]); + return devlink_param_find_by_name(params, param_name); +} + +int devlink_nl_param_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_param_item *param_item; + struct sk_buff *msg; + int err; + + param_item = devlink_param_get_from_info(&devlink->params, info); + if (!param_item) + return -EINVAL; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_param_fill(msg, devlink, 0, param_item, + DEVLINK_CMD_PARAM_GET, + info->snd_portid, info->snd_seq, 0); + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + +static int __devlink_nl_cmd_param_set_doit(struct devlink *devlink, + unsigned int port_index, + struct xarray *params, + struct genl_info *info, + enum devlink_command cmd) +{ + enum devlink_param_type param_type; + struct devlink_param_gset_ctx ctx; + enum devlink_param_cmode cmode; + struct devlink_param_item *param_item; + const struct devlink_param *param; + union devlink_param_value value; + int err = 0; + + param_item = devlink_param_get_from_info(params, info); + if (!param_item) + return -EINVAL; + param = param_item->param; + err = devlink_param_type_get_from_info(info, ¶m_type); + if (err) + return err; + if (param_type != param->type) + return -EINVAL; + err = devlink_param_value_get_from_info(param, info, &value); + if (err) + return err; + if (param->validate) { + err = param->validate(devlink, param->id, value, info->extack); + if (err) + return err; + } + + if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PARAM_VALUE_CMODE)) + return -EINVAL; + cmode = nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_VALUE_CMODE]); + if (!devlink_param_cmode_is_supported(param, cmode)) + return -EOPNOTSUPP; + + if (cmode == DEVLINK_PARAM_CMODE_DRIVERINIT) { + param_item->driverinit_value_new = value; + param_item->driverinit_value_new_valid = true; + } else { + if (!param->set) + return -EOPNOTSUPP; + ctx.val = value; + ctx.cmode = cmode; + err = devlink_param_set(devlink, param, &ctx); + if (err) + return err; + } + + devlink_param_notify(devlink, port_index, param_item, cmd); + return 0; +} + +int devlink_nl_cmd_param_set_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + + return __devlink_nl_cmd_param_set_doit(devlink, 0, &devlink->params, + info, DEVLINK_CMD_PARAM_NEW); +} + +int devlink_nl_cmd_port_param_get_dumpit(struct sk_buff *msg, + struct netlink_callback *cb) +{ + NL_SET_ERR_MSG(cb->extack, "Port params are not supported"); + return msg->len; +} + +int devlink_nl_cmd_port_param_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + NL_SET_ERR_MSG(info->extack, "Port params are not supported"); + return -EINVAL; +} + +int devlink_nl_cmd_port_param_set_doit(struct sk_buff *skb, + struct genl_info *info) +{ + NL_SET_ERR_MSG(info->extack, "Port params are not supported"); + return -EINVAL; +} + +static int devlink_param_verify(const struct devlink_param *param) +{ + if (!param || !param->name || !param->supported_cmodes) + return -EINVAL; + if (param->generic) + return devlink_param_generic_verify(param); + else + return devlink_param_driver_verify(param); +} + +static int devlink_param_register(struct devlink *devlink, + const struct 
devlink_param *param) +{ + struct devlink_param_item *param_item; + int err; + + WARN_ON(devlink_param_verify(param)); + WARN_ON(devlink_param_find_by_name(&devlink->params, param->name)); + + if (param->supported_cmodes == BIT(DEVLINK_PARAM_CMODE_DRIVERINIT)) + WARN_ON(param->get || param->set); + else + WARN_ON(!param->get || !param->set); + + param_item = kzalloc(sizeof(*param_item), GFP_KERNEL); + if (!param_item) + return -ENOMEM; + + param_item->param = param; + + err = xa_insert(&devlink->params, param->id, param_item, GFP_KERNEL); + if (err) + goto err_xa_insert; + + devlink_param_notify(devlink, 0, param_item, DEVLINK_CMD_PARAM_NEW); + return 0; + +err_xa_insert: + kfree(param_item); + return err; +} + +static void devlink_param_unregister(struct devlink *devlink, + const struct devlink_param *param) +{ + struct devlink_param_item *param_item; + + param_item = devlink_param_find_by_id(&devlink->params, param->id); + if (WARN_ON(!param_item)) + return; + devlink_param_notify(devlink, 0, param_item, DEVLINK_CMD_PARAM_DEL); + xa_erase(&devlink->params, param->id); + kfree(param_item); +} + +/** + * devl_params_register - register configuration parameters + * + * @devlink: devlink + * @params: configuration parameters array + * @params_count: number of parameters provided + * + * Register the configuration parameters supported by the driver. + */ +int devl_params_register(struct devlink *devlink, + const struct devlink_param *params, + size_t params_count) +{ + const struct devlink_param *param = params; + int i, err; + + lockdep_assert_held(&devlink->lock); + + for (i = 0; i < params_count; i++, param++) { + err = devlink_param_register(devlink, param); + if (err) + goto rollback; + } + return 0; + +rollback: + if (!i) + return err; + + for (param--; i > 0; i--, param--) + devlink_param_unregister(devlink, param); + return err; +} +EXPORT_SYMBOL_GPL(devl_params_register); + +int devlink_params_register(struct devlink *devlink, + const struct devlink_param *params, + size_t params_count) +{ + int err; + + devl_lock(devlink); + err = devl_params_register(devlink, params, params_count); + devl_unlock(devlink); + return err; +} +EXPORT_SYMBOL_GPL(devlink_params_register); + +/** + * devl_params_unregister - unregister configuration parameters + * @devlink: devlink + * @params: configuration parameters to unregister + * @params_count: number of parameters provided + */ +void devl_params_unregister(struct devlink *devlink, + const struct devlink_param *params, + size_t params_count) +{ + const struct devlink_param *param = params; + int i; + + lockdep_assert_held(&devlink->lock); + + for (i = 0; i < params_count; i++, param++) + devlink_param_unregister(devlink, param); +} +EXPORT_SYMBOL_GPL(devl_params_unregister); + +void devlink_params_unregister(struct devlink *devlink, + const struct devlink_param *params, + size_t params_count) +{ + devl_lock(devlink); + devl_params_unregister(devlink, params, params_count); + devl_unlock(devlink); +} +EXPORT_SYMBOL_GPL(devlink_params_unregister); + +/** + * devl_param_driverinit_value_get - get configuration parameter + * value for driver initializing + * + * @devlink: devlink + * @param_id: parameter ID + * @val: pointer to store the value of parameter in driverinit + * configuration mode + * + * This function should be used by the driver to get driverinit + * configuration for initialization after reload command. 
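/*
 * Illustrative aside, not part of the patch: a minimal sketch of the driver
 * side of devl_params_register(). The "foo_*" names are hypothetical; the
 * devlink calls and struct devlink_param fields are the ones handled above.
 * Both devl_params_register() and devl_param_driverinit_value_set() expect
 * the devlink instance lock to be held by the caller.
 */
static const struct devlink_param foo_devlink_params[] = {
	{
		.id = DEVLINK_PARAM_GENERIC_ID_MAX_MACS,
		.name = DEVLINK_PARAM_GENERIC_MAX_MACS_NAME,
		.type = DEVLINK_PARAM_GENERIC_MAX_MACS_TYPE,
		.generic = true,
		.supported_cmodes = BIT(DEVLINK_PARAM_CMODE_DRIVERINIT),
	},
};

static int foo_params_init(struct devlink *devlink)
{
	union devlink_param_value val;
	int err;

	/* Registers the whole array; rollback on failure is handled above. */
	err = devl_params_register(devlink, foo_devlink_params,
				   ARRAY_SIZE(foo_devlink_params));
	if (err)
		return err;

	/* Publish a driverinit default so the parameter has an initial value. */
	val.vu32 = 128;
	devl_param_driverinit_value_set(devlink,
					DEVLINK_PARAM_GENERIC_ID_MAX_MACS, val);
	return 0;
}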
+ * + * Note that lockless call of this function relies on the + * driver to maintain following basic sane behavior: + * 1) Driver ensures a call to this function cannot race with + * registering/unregistering the parameter with the same parameter ID. + * 2) Driver ensures a call to this function cannot race with + * devl_param_driverinit_value_set() call with the same parameter ID. + * 3) Driver ensures a call to this function cannot race with + * reload operation. + * If the driver is not able to comply, it has to take the devlink->lock + * while calling this. + */ +int devl_param_driverinit_value_get(struct devlink *devlink, u32 param_id, + union devlink_param_value *val) +{ + struct devlink_param_item *param_item; + + if (WARN_ON(!devlink_reload_supported(devlink->ops))) + return -EOPNOTSUPP; + + param_item = devlink_param_find_by_id(&devlink->params, param_id); + if (!param_item) + return -EINVAL; + + if (!param_item->driverinit_value_valid) + return -EOPNOTSUPP; + + if (WARN_ON(!devlink_param_cmode_is_supported(param_item->param, + DEVLINK_PARAM_CMODE_DRIVERINIT))) + return -EOPNOTSUPP; + + *val = param_item->driverinit_value; + + return 0; +} +EXPORT_SYMBOL_GPL(devl_param_driverinit_value_get); + +/** + * devl_param_driverinit_value_set - set value of configuration + * parameter for driverinit + * configuration mode + * + * @devlink: devlink + * @param_id: parameter ID + * @init_val: value of parameter to set for driverinit configuration mode + * + * This function should be used by the driver to set driverinit + * configuration mode default value. + */ +void devl_param_driverinit_value_set(struct devlink *devlink, u32 param_id, + union devlink_param_value init_val) +{ + struct devlink_param_item *param_item; + + devl_assert_locked(devlink); + + param_item = devlink_param_find_by_id(&devlink->params, param_id); + if (WARN_ON(!param_item)) + return; + + if (WARN_ON(!devlink_param_cmode_is_supported(param_item->param, + DEVLINK_PARAM_CMODE_DRIVERINIT))) + return; + + param_item->driverinit_value = init_val; + param_item->driverinit_value_valid = true; + + devlink_param_notify(devlink, 0, param_item, DEVLINK_CMD_PARAM_NEW); +} +EXPORT_SYMBOL_GPL(devl_param_driverinit_value_set); + +void devlink_params_driverinit_load_new(struct devlink *devlink) +{ + struct devlink_param_item *param_item; + unsigned long param_id; + + xa_for_each(&devlink->params, param_id, param_item) { + if (!devlink_param_cmode_is_supported(param_item->param, + DEVLINK_PARAM_CMODE_DRIVERINIT) || + !param_item->driverinit_value_new_valid) + continue; + param_item->driverinit_value = param_item->driverinit_value_new; + param_item->driverinit_value_valid = true; + param_item->driverinit_value_new_valid = false; + } +} + +/** + * devl_param_value_changed - notify devlink on a parameter's value + * change. Should be called by the driver + * right after the change. + * + * @devlink: devlink + * @param_id: parameter ID + * + * This function should be used by the driver to notify devlink on value + * change, excluding driverinit configuration mode. 
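/*
 * Illustrative aside, not part of the patch: how a driver would typically
 * read the driverinit value back while (re)initializing after a reload,
 * pairing with devl_param_driverinit_value_get() above. "struct foo" and
 * foo_reinit() are hypothetical.
 */
struct foo {
	u32 max_macs;
};

static void foo_reinit(struct devlink *devlink, struct foo *foo)
{
	union devlink_param_value val;

	/* Keep the existing foo->max_macs if no driverinit value is available. */
	if (!devl_param_driverinit_value_get(devlink,
					     DEVLINK_PARAM_GENERIC_ID_MAX_MACS,
					     &val))
		foo->max_macs = val.vu32;
}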
+ * For driverinit configuration mode driver should use the function + */ +void devl_param_value_changed(struct devlink *devlink, u32 param_id) +{ + struct devlink_param_item *param_item; + + param_item = devlink_param_find_by_id(&devlink->params, param_id); + WARN_ON(!param_item); + + devlink_param_notify(devlink, 0, param_item, DEVLINK_CMD_PARAM_NEW); +} +EXPORT_SYMBOL_GPL(devl_param_value_changed); -- cgit v1.2.3 From 1aa47ca1f52ee4ef5f553e7ee8fb6749b6c96b9f Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 28 Aug 2023 08:16:50 +0200 Subject: devlink: push region related code into separate file Cut out another chunk from leftover.c and put region related code into a separate file. Signed-off-by: Jiri Pirko Link: https://lore.kernel.org/r/20230828061657.300667-9-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- net/devlink/Makefile | 2 +- net/devlink/devl_internal.h | 6 + net/devlink/leftover.c | 1506 ++++--------------------------------------- net/devlink/region.c | 1260 ++++++++++++++++++++++++++++++++++++ 4 files changed, 1392 insertions(+), 1382 deletions(-) create mode 100644 net/devlink/region.c (limited to 'net') diff --git a/net/devlink/Makefile b/net/devlink/Makefile index c4b29f603a1f..744f64e467b7 100644 --- a/net/devlink/Makefile +++ b/net/devlink/Makefile @@ -1,4 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 obj-y := leftover.o core.o netlink.o netlink_gen.o dev.o port.o sb.o dpipe.o \ - resource.o param.o health.o + resource.o param.o region.o health.o diff --git a/net/devlink/devl_internal.h b/net/devlink/devl_internal.h index e0304d607978..32b66233240f 100644 --- a/net/devlink/devl_internal.h +++ b/net/devlink/devl_internal.h @@ -158,6 +158,8 @@ void devlink_ports_notify_register(struct devlink *devlink); void devlink_ports_notify_unregister(struct devlink *devlink); void devlink_params_notify_register(struct devlink *devlink); void devlink_params_notify_unregister(struct devlink *devlink); +void devlink_regions_notify_register(struct devlink *devlink); +void devlink_regions_notify_unregister(struct devlink *devlink); /* Ports */ #define ASSERT_DEVLINK_PORT_INITIALIZED(devlink_port) \ @@ -249,6 +251,10 @@ int devlink_nl_cmd_port_param_get_doit(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_port_param_set_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_cmd_region_new(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_cmd_region_del(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); int devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_health_reporter_recover_doit(struct sk_buff *skb, diff --git a/net/devlink/leftover.c b/net/devlink/leftover.c index 2f1dfd27d1c8..6d31aaee8ce8 100644 --- a/net/devlink/leftover.c +++ b/net/devlink/leftover.c @@ -151,68 +151,6 @@ devlink_linecard_get_from_info(struct devlink *devlink, struct genl_info *info) return devlink_linecard_get_from_attrs(devlink, info->attrs); } -struct devlink_region { - struct devlink *devlink; - struct devlink_port *port; - struct list_head list; - union { - const struct devlink_region_ops *ops; - const struct devlink_port_region_ops *port_ops; - }; - struct mutex snapshot_lock; /* protects snapshot_list, - * max_snapshots and cur_snapshots - * consistency. 
- */ - struct list_head snapshot_list; - u32 max_snapshots; - u32 cur_snapshots; - u64 size; -}; - -struct devlink_snapshot { - struct list_head list; - struct devlink_region *region; - u8 *data; - u32 id; -}; - -static struct devlink_region * -devlink_region_get_by_name(struct devlink *devlink, const char *region_name) -{ - struct devlink_region *region; - - list_for_each_entry(region, &devlink->region_list, list) - if (!strcmp(region->ops->name, region_name)) - return region; - - return NULL; -} - -static struct devlink_region * -devlink_port_region_get_by_name(struct devlink_port *port, - const char *region_name) -{ - struct devlink_region *region; - - list_for_each_entry(region, &port->region_list, list) - if (!strcmp(region->ops->name, region_name)) - return region; - - return NULL; -} - -static struct devlink_snapshot * -devlink_region_snapshot_get_by_id(struct devlink_region *region, u32 id) -{ - struct devlink_snapshot *snapshot; - - list_for_each_entry(snapshot, ®ion->snapshot_list, list) - if (snapshot->id == id) - return snapshot; - - return NULL; -} - static int devlink_nl_put_nested_handle(struct sk_buff *msg, struct devlink *devlink) { struct nlattr *nested_attr; @@ -909,1125 +847,156 @@ static int devlink_linecard_type_set(struct devlink_linecard *linecard, struct devlink_linecard_type *linecard_type; int err; - mutex_lock(&linecard->state_lock); - if (linecard->state == DEVLINK_LINECARD_STATE_PROVISIONING) { - NL_SET_ERR_MSG(extack, "Line card is currently being provisioned"); - err = -EBUSY; - goto out; - } - if (linecard->state == DEVLINK_LINECARD_STATE_UNPROVISIONING) { - NL_SET_ERR_MSG(extack, "Line card is currently being unprovisioned"); - err = -EBUSY; - goto out; - } - - linecard_type = devlink_linecard_type_lookup(linecard, type); - if (!linecard_type) { - NL_SET_ERR_MSG(extack, "Unsupported line card type provided"); - err = -EINVAL; - goto out; - } - - if (linecard->state != DEVLINK_LINECARD_STATE_UNPROVISIONED && - linecard->state != DEVLINK_LINECARD_STATE_PROVISIONING_FAILED) { - NL_SET_ERR_MSG(extack, "Line card already provisioned"); - err = -EBUSY; - /* Check if the line card is provisioned in the same - * way the user asks. In case it is, make the operation - * to return success. - */ - if (ops->same_provision && - ops->same_provision(linecard, linecard->priv, - linecard_type->type, - linecard_type->priv)) - err = 0; - goto out; - } - - linecard->state = DEVLINK_LINECARD_STATE_PROVISIONING; - linecard->type = linecard_type->type; - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); - mutex_unlock(&linecard->state_lock); - err = ops->provision(linecard, linecard->priv, linecard_type->type, - linecard_type->priv, extack); - if (err) { - /* Provisioning failed. Assume the linecard is unprovisioned - * for future operations. 
- */ - mutex_lock(&linecard->state_lock); - linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; - linecard->type = NULL; - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); - mutex_unlock(&linecard->state_lock); - } - return err; - -out: - mutex_unlock(&linecard->state_lock); - return err; -} - -static int devlink_linecard_type_unset(struct devlink_linecard *linecard, - struct netlink_ext_ack *extack) -{ - int err; - - mutex_lock(&linecard->state_lock); - if (linecard->state == DEVLINK_LINECARD_STATE_PROVISIONING) { - NL_SET_ERR_MSG(extack, "Line card is currently being provisioned"); - err = -EBUSY; - goto out; - } - if (linecard->state == DEVLINK_LINECARD_STATE_UNPROVISIONING) { - NL_SET_ERR_MSG(extack, "Line card is currently being unprovisioned"); - err = -EBUSY; - goto out; - } - if (linecard->state == DEVLINK_LINECARD_STATE_PROVISIONING_FAILED) { - linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; - linecard->type = NULL; - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); - err = 0; - goto out; - } - - if (linecard->state == DEVLINK_LINECARD_STATE_UNPROVISIONED) { - NL_SET_ERR_MSG(extack, "Line card is not provisioned"); - err = 0; - goto out; - } - linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONING; - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); - mutex_unlock(&linecard->state_lock); - err = linecard->ops->unprovision(linecard, linecard->priv, - extack); - if (err) { - /* Unprovisioning failed. Assume the linecard is unprovisioned - * for future operations. - */ - mutex_lock(&linecard->state_lock); - linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; - linecard->type = NULL; - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); - mutex_unlock(&linecard->state_lock); - } - return err; - -out: - mutex_unlock(&linecard->state_lock); - return err; -} - -static int devlink_nl_cmd_linecard_set_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct netlink_ext_ack *extack = info->extack; - struct devlink *devlink = info->user_ptr[0]; - struct devlink_linecard *linecard; - int err; - - linecard = devlink_linecard_get_from_info(devlink, info); - if (IS_ERR(linecard)) - return PTR_ERR(linecard); - - if (info->attrs[DEVLINK_ATTR_LINECARD_TYPE]) { - const char *type; - - type = nla_data(info->attrs[DEVLINK_ATTR_LINECARD_TYPE]); - if (strcmp(type, "")) { - err = devlink_linecard_type_set(linecard, type, extack); - if (err) - return err; - } else { - err = devlink_linecard_type_unset(linecard, extack); - if (err) - return err; - } - } - - return 0; -} - -int devlink_rate_nodes_check(struct devlink *devlink, u16 mode, - struct netlink_ext_ack *extack) -{ - struct devlink_rate *devlink_rate; - - list_for_each_entry(devlink_rate, &devlink->rate_list, list) - if (devlink_rate_is_node(devlink_rate)) { - NL_SET_ERR_MSG(extack, "Rate node(s) exists."); - return -EBUSY; - } - return 0; -} - -static int devlink_nl_region_snapshot_id_put(struct sk_buff *msg, - struct devlink *devlink, - struct devlink_snapshot *snapshot) -{ - struct nlattr *snap_attr; - int err; - - snap_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_REGION_SNAPSHOT); - if (!snap_attr) - return -EINVAL; - - err = nla_put_u32(msg, DEVLINK_ATTR_REGION_SNAPSHOT_ID, snapshot->id); - if (err) - goto nla_put_failure; - - nla_nest_end(msg, snap_attr); - return 0; - -nla_put_failure: - nla_nest_cancel(msg, snap_attr); - return err; -} - -static int devlink_nl_region_snapshots_id_put(struct sk_buff *msg, - struct devlink *devlink, - struct devlink_region *region) -{ - 
struct devlink_snapshot *snapshot; - struct nlattr *snapshots_attr; - int err; - - snapshots_attr = nla_nest_start_noflag(msg, - DEVLINK_ATTR_REGION_SNAPSHOTS); - if (!snapshots_attr) - return -EINVAL; - - list_for_each_entry(snapshot, ®ion->snapshot_list, list) { - err = devlink_nl_region_snapshot_id_put(msg, devlink, snapshot); - if (err) - goto nla_put_failure; - } - - nla_nest_end(msg, snapshots_attr); - return 0; - -nla_put_failure: - nla_nest_cancel(msg, snapshots_attr); - return err; -} - -static int devlink_nl_region_fill(struct sk_buff *msg, struct devlink *devlink, - enum devlink_command cmd, u32 portid, - u32 seq, int flags, - struct devlink_region *region) -{ - void *hdr; - int err; - - hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); - if (!hdr) - return -EMSGSIZE; - - err = devlink_nl_put_handle(msg, devlink); - if (err) - goto nla_put_failure; - - if (region->port) { - err = nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, - region->port->index); - if (err) - goto nla_put_failure; - } - - err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME, region->ops->name); - if (err) - goto nla_put_failure; - - err = nla_put_u64_64bit(msg, DEVLINK_ATTR_REGION_SIZE, - region->size, - DEVLINK_ATTR_PAD); - if (err) - goto nla_put_failure; - - err = nla_put_u32(msg, DEVLINK_ATTR_REGION_MAX_SNAPSHOTS, - region->max_snapshots); - if (err) - goto nla_put_failure; - - err = devlink_nl_region_snapshots_id_put(msg, devlink, region); - if (err) - goto nla_put_failure; - - genlmsg_end(msg, hdr); - return 0; - -nla_put_failure: - genlmsg_cancel(msg, hdr); - return err; -} - -static struct sk_buff * -devlink_nl_region_notify_build(struct devlink_region *region, - struct devlink_snapshot *snapshot, - enum devlink_command cmd, u32 portid, u32 seq) -{ - struct devlink *devlink = region->devlink; - struct sk_buff *msg; - void *hdr; - int err; - - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return ERR_PTR(-ENOMEM); - - hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, 0, cmd); - if (!hdr) { - err = -EMSGSIZE; - goto out_free_msg; - } - - err = devlink_nl_put_handle(msg, devlink); - if (err) - goto out_cancel_msg; - - if (region->port) { - err = nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, - region->port->index); - if (err) - goto out_cancel_msg; - } - - err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME, - region->ops->name); - if (err) - goto out_cancel_msg; - - if (snapshot) { - err = nla_put_u32(msg, DEVLINK_ATTR_REGION_SNAPSHOT_ID, - snapshot->id); - if (err) - goto out_cancel_msg; - } else { - err = nla_put_u64_64bit(msg, DEVLINK_ATTR_REGION_SIZE, - region->size, DEVLINK_ATTR_PAD); - if (err) - goto out_cancel_msg; - } - genlmsg_end(msg, hdr); - - return msg; - -out_cancel_msg: - genlmsg_cancel(msg, hdr); -out_free_msg: - nlmsg_free(msg); - return ERR_PTR(err); -} - -static void devlink_nl_region_notify(struct devlink_region *region, - struct devlink_snapshot *snapshot, - enum devlink_command cmd) -{ - struct devlink *devlink = region->devlink; - struct sk_buff *msg; - - WARN_ON(cmd != DEVLINK_CMD_REGION_NEW && cmd != DEVLINK_CMD_REGION_DEL); - if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)) - return; - - msg = devlink_nl_region_notify_build(region, snapshot, cmd, 0, 0); - if (IS_ERR(msg)) - return; - - genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), msg, - 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); -} - -static void devlink_regions_notify_register(struct devlink *devlink) -{ - struct devlink_region *region; - - 
list_for_each_entry(region, &devlink->region_list, list) - devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_NEW); -} - -static void devlink_regions_notify_unregister(struct devlink *devlink) -{ - struct devlink_region *region; - - list_for_each_entry_reverse(region, &devlink->region_list, list) - devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_DEL); -} - -/** - * __devlink_snapshot_id_increment - Increment number of snapshots using an id - * @devlink: devlink instance - * @id: the snapshot id - * - * Track when a new snapshot begins using an id. Load the count for the - * given id from the snapshot xarray, increment it, and store it back. - * - * Called when a new snapshot is created with the given id. - * - * The id *must* have been previously allocated by - * devlink_region_snapshot_id_get(). - * - * Returns 0 on success, or an error on failure. - */ -static int __devlink_snapshot_id_increment(struct devlink *devlink, u32 id) -{ - unsigned long count; - void *p; - int err; - - xa_lock(&devlink->snapshot_ids); - p = xa_load(&devlink->snapshot_ids, id); - if (WARN_ON(!p)) { - err = -EINVAL; - goto unlock; - } - - if (WARN_ON(!xa_is_value(p))) { - err = -EINVAL; - goto unlock; - } - - count = xa_to_value(p); - count++; - - err = xa_err(__xa_store(&devlink->snapshot_ids, id, xa_mk_value(count), - GFP_ATOMIC)); -unlock: - xa_unlock(&devlink->snapshot_ids); - return err; -} - -/** - * __devlink_snapshot_id_decrement - Decrease number of snapshots using an id - * @devlink: devlink instance - * @id: the snapshot id - * - * Track when a snapshot is deleted and stops using an id. Load the count - * for the given id from the snapshot xarray, decrement it, and store it - * back. - * - * If the count reaches zero, erase this id from the xarray, freeing it - * up for future re-use by devlink_region_snapshot_id_get(). - * - * Called when a snapshot using the given id is deleted, and when the - * initial allocator of the id is finished using it. - */ -static void __devlink_snapshot_id_decrement(struct devlink *devlink, u32 id) -{ - unsigned long count; - void *p; - - xa_lock(&devlink->snapshot_ids); - p = xa_load(&devlink->snapshot_ids, id); - if (WARN_ON(!p)) - goto unlock; - - if (WARN_ON(!xa_is_value(p))) - goto unlock; - - count = xa_to_value(p); - - if (count > 1) { - count--; - __xa_store(&devlink->snapshot_ids, id, xa_mk_value(count), - GFP_ATOMIC); - } else { - /* If this was the last user, we can erase this id */ - __xa_erase(&devlink->snapshot_ids, id); - } -unlock: - xa_unlock(&devlink->snapshot_ids); -} - -/** - * __devlink_snapshot_id_insert - Insert a specific snapshot ID - * @devlink: devlink instance - * @id: the snapshot id - * - * Mark the given snapshot id as used by inserting a zero value into the - * snapshot xarray. - * - * This must be called while holding the devlink instance lock. Unlike - * devlink_snapshot_id_get, the initial reference count is zero, not one. - * It is expected that the id will immediately be used before - * releasing the devlink instance lock. - * - * Returns zero on success, or an error code if the snapshot id could not - * be inserted. 
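/*
 * Illustrative aside, not part of the patch: the driver-side pairing for the
 * snapshot id bookkeeping above, using the public wrappers named in the
 * comments (devlink_region_snapshot_id_get()/_id_put()) together with
 * devlink_region_snapshot_create(). foo_take_snapshot() and the handling of
 * "data" are hypothetical.
 */
static int foo_take_snapshot(struct devlink *devlink,
			     struct devlink_region *region, u8 *data)
{
	u32 snapshot_id;
	int err;

	err = devlink_region_snapshot_id_get(devlink, &snapshot_id);
	if (err)
		return err;

	/* On success devlink owns "data"; on failure the caller still does. */
	err = devlink_region_snapshot_create(region, data, snapshot_id);

	/* Drop the allocator's reference; a created snapshot holds its own. */
	devlink_region_snapshot_id_put(devlink, snapshot_id);
	return err;
}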
- */ -static int __devlink_snapshot_id_insert(struct devlink *devlink, u32 id) -{ - int err; - - xa_lock(&devlink->snapshot_ids); - if (xa_load(&devlink->snapshot_ids, id)) { - xa_unlock(&devlink->snapshot_ids); - return -EEXIST; - } - err = xa_err(__xa_store(&devlink->snapshot_ids, id, xa_mk_value(0), - GFP_ATOMIC)); - xa_unlock(&devlink->snapshot_ids); - return err; -} - -/** - * __devlink_region_snapshot_id_get - get snapshot ID - * @devlink: devlink instance - * @id: storage to return snapshot id - * - * Allocates a new snapshot id. Returns zero on success, or a negative - * error on failure. Must be called while holding the devlink instance - * lock. - * - * Snapshot IDs are tracked using an xarray which stores the number of - * users of the snapshot id. - * - * Note that the caller of this function counts as a 'user', in order to - * avoid race conditions. The caller must release its hold on the - * snapshot by using devlink_region_snapshot_id_put. - */ -static int __devlink_region_snapshot_id_get(struct devlink *devlink, u32 *id) -{ - return xa_alloc(&devlink->snapshot_ids, id, xa_mk_value(1), - xa_limit_32b, GFP_KERNEL); -} - -/** - * __devlink_region_snapshot_create - create a new snapshot - * This will add a new snapshot of a region. The snapshot - * will be stored on the region struct and can be accessed - * from devlink. This is useful for future analyses of snapshots. - * Multiple snapshots can be created on a region. - * The @snapshot_id should be obtained using the getter function. - * - * Must be called only while holding the region snapshot lock. - * - * @region: devlink region of the snapshot - * @data: snapshot data - * @snapshot_id: snapshot id to be created - */ -static int -__devlink_region_snapshot_create(struct devlink_region *region, - u8 *data, u32 snapshot_id) -{ - struct devlink *devlink = region->devlink; - struct devlink_snapshot *snapshot; - int err; - - lockdep_assert_held(®ion->snapshot_lock); - - /* check if region can hold one more snapshot */ - if (region->cur_snapshots == region->max_snapshots) - return -ENOSPC; - - if (devlink_region_snapshot_get_by_id(region, snapshot_id)) - return -EEXIST; - - snapshot = kzalloc(sizeof(*snapshot), GFP_KERNEL); - if (!snapshot) - return -ENOMEM; - - err = __devlink_snapshot_id_increment(devlink, snapshot_id); - if (err) - goto err_snapshot_id_increment; - - snapshot->id = snapshot_id; - snapshot->region = region; - snapshot->data = data; - - list_add_tail(&snapshot->list, ®ion->snapshot_list); - - region->cur_snapshots++; - - devlink_nl_region_notify(region, snapshot, DEVLINK_CMD_REGION_NEW); - return 0; - -err_snapshot_id_increment: - kfree(snapshot); - return err; -} - -static void devlink_region_snapshot_del(struct devlink_region *region, - struct devlink_snapshot *snapshot) -{ - struct devlink *devlink = region->devlink; - - lockdep_assert_held(®ion->snapshot_lock); - - devlink_nl_region_notify(region, snapshot, DEVLINK_CMD_REGION_DEL); - region->cur_snapshots--; - list_del(&snapshot->list); - region->ops->destructor(snapshot->data); - __devlink_snapshot_id_decrement(devlink, snapshot->id); - kfree(snapshot); -} - -int devlink_nl_region_get_doit(struct sk_buff *skb, struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - struct devlink_port *port = NULL; - struct devlink_region *region; - const char *region_name; - struct sk_buff *msg; - unsigned int index; - int err; - - if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_REGION_NAME)) - return -EINVAL; - - if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) { - 
index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); - - port = devlink_port_get_by_index(devlink, index); - if (!port) - return -ENODEV; - } - - region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]); - if (port) - region = devlink_port_region_get_by_name(port, region_name); - else - region = devlink_region_get_by_name(devlink, region_name); - - if (!region) - return -EINVAL; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - err = devlink_nl_region_fill(msg, devlink, DEVLINK_CMD_REGION_GET, - info->snd_portid, info->snd_seq, 0, - region); - if (err) { - nlmsg_free(msg); - return err; - } - - return genlmsg_reply(msg, info); -} - -static int devlink_nl_cmd_region_get_port_dumpit(struct sk_buff *msg, - struct netlink_callback *cb, - struct devlink_port *port, - int *idx, int start, int flags) -{ - struct devlink_region *region; - int err = 0; - - list_for_each_entry(region, &port->region_list, list) { - if (*idx < start) { - (*idx)++; - continue; - } - err = devlink_nl_region_fill(msg, port->devlink, - DEVLINK_CMD_REGION_GET, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - flags, region); - if (err) - goto out; - (*idx)++; - } - -out: - return err; -} - -static int devlink_nl_region_get_dump_one(struct sk_buff *msg, - struct devlink *devlink, - struct netlink_callback *cb, - int flags) -{ - struct devlink_nl_dump_state *state = devlink_dump_state(cb); - struct devlink_region *region; - struct devlink_port *port; - unsigned long port_index; - int idx = 0; - int err; - - list_for_each_entry(region, &devlink->region_list, list) { - if (idx < state->idx) { - idx++; - continue; - } - err = devlink_nl_region_fill(msg, devlink, - DEVLINK_CMD_REGION_GET, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, flags, - region); - if (err) { - state->idx = idx; - return err; - } - idx++; - } - - xa_for_each(&devlink->ports, port_index, port) { - err = devlink_nl_cmd_region_get_port_dumpit(msg, cb, port, &idx, - state->idx, flags); - if (err) { - state->idx = idx; - return err; - } - } - - return 0; -} - -int devlink_nl_region_get_dumpit(struct sk_buff *skb, - struct netlink_callback *cb) -{ - return devlink_nl_dumpit(skb, cb, devlink_nl_region_get_dump_one); -} - -static int devlink_nl_cmd_region_del(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - struct devlink_snapshot *snapshot; - struct devlink_port *port = NULL; - struct devlink_region *region; - const char *region_name; - unsigned int index; - u32 snapshot_id; - - if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_REGION_NAME) || - GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_REGION_SNAPSHOT_ID)) - return -EINVAL; - - region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]); - snapshot_id = nla_get_u32(info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]); - - if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) { - index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); - - port = devlink_port_get_by_index(devlink, index); - if (!port) - return -ENODEV; - } - - if (port) - region = devlink_port_region_get_by_name(port, region_name); - else - region = devlink_region_get_by_name(devlink, region_name); - - if (!region) - return -EINVAL; - - mutex_lock(®ion->snapshot_lock); - snapshot = devlink_region_snapshot_get_by_id(region, snapshot_id); - if (!snapshot) { - mutex_unlock(®ion->snapshot_lock); - return -EINVAL; - } - - devlink_region_snapshot_del(region, snapshot); - mutex_unlock(®ion->snapshot_lock); - return 0; -} - -static int -devlink_nl_cmd_region_new(struct 
sk_buff *skb, struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - struct devlink_snapshot *snapshot; - struct devlink_port *port = NULL; - struct nlattr *snapshot_id_attr; - struct devlink_region *region; - const char *region_name; - unsigned int index; - u32 snapshot_id; - u8 *data; - int err; - - if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_REGION_NAME)) { - NL_SET_ERR_MSG(info->extack, "No region name provided"); - return -EINVAL; - } - - region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]); - - if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) { - index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); - - port = devlink_port_get_by_index(devlink, index); - if (!port) - return -ENODEV; - } - - if (port) - region = devlink_port_region_get_by_name(port, region_name); - else - region = devlink_region_get_by_name(devlink, region_name); - - if (!region) { - NL_SET_ERR_MSG(info->extack, "The requested region does not exist"); - return -EINVAL; - } - - if (!region->ops->snapshot) { - NL_SET_ERR_MSG(info->extack, "The requested region does not support taking an immediate snapshot"); - return -EOPNOTSUPP; - } - - mutex_lock(®ion->snapshot_lock); - - if (region->cur_snapshots == region->max_snapshots) { - NL_SET_ERR_MSG(info->extack, "The region has reached the maximum number of stored snapshots"); - err = -ENOSPC; - goto unlock; - } - - snapshot_id_attr = info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]; - if (snapshot_id_attr) { - snapshot_id = nla_get_u32(snapshot_id_attr); - - if (devlink_region_snapshot_get_by_id(region, snapshot_id)) { - NL_SET_ERR_MSG(info->extack, "The requested snapshot id is already in use"); - err = -EEXIST; - goto unlock; - } - - err = __devlink_snapshot_id_insert(devlink, snapshot_id); - if (err) - goto unlock; - } else { - err = __devlink_region_snapshot_id_get(devlink, &snapshot_id); - if (err) { - NL_SET_ERR_MSG(info->extack, "Failed to allocate a new snapshot id"); - goto unlock; - } - } - - if (port) - err = region->port_ops->snapshot(port, region->port_ops, - info->extack, &data); - else - err = region->ops->snapshot(devlink, region->ops, - info->extack, &data); - if (err) - goto err_snapshot_capture; - - err = __devlink_region_snapshot_create(region, data, snapshot_id); - if (err) - goto err_snapshot_create; - - if (!snapshot_id_attr) { - struct sk_buff *msg; - - snapshot = devlink_region_snapshot_get_by_id(region, - snapshot_id); - if (WARN_ON(!snapshot)) { - err = -EINVAL; - goto unlock; - } - - msg = devlink_nl_region_notify_build(region, snapshot, - DEVLINK_CMD_REGION_NEW, - info->snd_portid, - info->snd_seq); - err = PTR_ERR_OR_ZERO(msg); - if (err) - goto err_notify; - - err = genlmsg_reply(msg, info); - if (err) - goto err_notify; - } - - mutex_unlock(®ion->snapshot_lock); - return 0; - -err_snapshot_create: - region->ops->destructor(data); -err_snapshot_capture: - __devlink_snapshot_id_decrement(devlink, snapshot_id); - mutex_unlock(®ion->snapshot_lock); - return err; - -err_notify: - devlink_region_snapshot_del(region, snapshot); -unlock: - mutex_unlock(®ion->snapshot_lock); - return err; -} - -static int devlink_nl_cmd_region_read_chunk_fill(struct sk_buff *msg, - u8 *chunk, u32 chunk_size, - u64 addr) -{ - struct nlattr *chunk_attr; - int err; - - chunk_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_REGION_CHUNK); - if (!chunk_attr) - return -EINVAL; - - err = nla_put(msg, DEVLINK_ATTR_REGION_CHUNK_DATA, chunk_size, chunk); - if (err) - goto nla_put_failure; - - err = nla_put_u64_64bit(msg, DEVLINK_ATTR_REGION_CHUNK_ADDR, 
addr, - DEVLINK_ATTR_PAD); - if (err) - goto nla_put_failure; - - nla_nest_end(msg, chunk_attr); - return 0; - -nla_put_failure: - nla_nest_cancel(msg, chunk_attr); - return err; -} - -#define DEVLINK_REGION_READ_CHUNK_SIZE 256 - -typedef int devlink_chunk_fill_t(void *cb_priv, u8 *chunk, u32 chunk_size, - u64 curr_offset, - struct netlink_ext_ack *extack); - -static int -devlink_nl_region_read_fill(struct sk_buff *skb, devlink_chunk_fill_t *cb, - void *cb_priv, u64 start_offset, u64 end_offset, - u64 *new_offset, struct netlink_ext_ack *extack) -{ - u64 curr_offset = start_offset; - int err = 0; - u8 *data; - - /* Allocate and re-use a single buffer */ - data = kmalloc(DEVLINK_REGION_READ_CHUNK_SIZE, GFP_KERNEL); - if (!data) - return -ENOMEM; - - *new_offset = start_offset; - - while (curr_offset < end_offset) { - u32 data_size; - - data_size = min_t(u32, end_offset - curr_offset, - DEVLINK_REGION_READ_CHUNK_SIZE); - - err = cb(cb_priv, data, data_size, curr_offset, extack); - if (err) - break; - - err = devlink_nl_cmd_region_read_chunk_fill(skb, data, data_size, curr_offset); - if (err) - break; - - curr_offset += data_size; - } - *new_offset = curr_offset; - - kfree(data); - - return err; -} - -static int -devlink_region_snapshot_fill(void *cb_priv, u8 *chunk, u32 chunk_size, - u64 curr_offset, - struct netlink_ext_ack __always_unused *extack) -{ - struct devlink_snapshot *snapshot = cb_priv; - - memcpy(chunk, &snapshot->data[curr_offset], chunk_size); - - return 0; -} - -static int -devlink_region_port_direct_fill(void *cb_priv, u8 *chunk, u32 chunk_size, - u64 curr_offset, struct netlink_ext_ack *extack) -{ - struct devlink_region *region = cb_priv; - - return region->port_ops->read(region->port, region->port_ops, extack, - curr_offset, chunk_size, chunk); -} - -static int -devlink_region_direct_fill(void *cb_priv, u8 *chunk, u32 chunk_size, - u64 curr_offset, struct netlink_ext_ack *extack) -{ - struct devlink_region *region = cb_priv; - - return region->ops->read(region->devlink, region->ops, extack, - curr_offset, chunk_size, chunk); -} - -static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, - struct netlink_callback *cb) -{ - const struct genl_dumpit_info *info = genl_dumpit_info(cb); - struct devlink_nl_dump_state *state = devlink_dump_state(cb); - struct nlattr *chunks_attr, *region_attr, *snapshot_attr; - u64 ret_offset, start_offset, end_offset = U64_MAX; - struct nlattr **attrs = info->info.attrs; - struct devlink_port *port = NULL; - devlink_chunk_fill_t *region_cb; - struct devlink_region *region; - const char *region_name; - struct devlink *devlink; - unsigned int index; - void *region_cb_priv; - void *hdr; - int err; - - start_offset = state->start_offset; - - devlink = devlink_get_from_attrs_lock(sock_net(cb->skb->sk), attrs); - if (IS_ERR(devlink)) - return PTR_ERR(devlink); - - if (!attrs[DEVLINK_ATTR_REGION_NAME]) { - NL_SET_ERR_MSG(cb->extack, "No region name provided"); - err = -EINVAL; - goto out_unlock; - } - - if (attrs[DEVLINK_ATTR_PORT_INDEX]) { - index = nla_get_u32(attrs[DEVLINK_ATTR_PORT_INDEX]); - - port = devlink_port_get_by_index(devlink, index); - if (!port) { - err = -ENODEV; - goto out_unlock; - } + mutex_lock(&linecard->state_lock); + if (linecard->state == DEVLINK_LINECARD_STATE_PROVISIONING) { + NL_SET_ERR_MSG(extack, "Line card is currently being provisioned"); + err = -EBUSY; + goto out; + } + if (linecard->state == DEVLINK_LINECARD_STATE_UNPROVISIONING) { + NL_SET_ERR_MSG(extack, "Line card is currently being unprovisioned"); + err 
= -EBUSY; + goto out; } - region_attr = attrs[DEVLINK_ATTR_REGION_NAME]; - region_name = nla_data(region_attr); - - if (port) - region = devlink_port_region_get_by_name(port, region_name); - else - region = devlink_region_get_by_name(devlink, region_name); - - if (!region) { - NL_SET_ERR_MSG_ATTR(cb->extack, region_attr, "Requested region does not exist"); + linecard_type = devlink_linecard_type_lookup(linecard, type); + if (!linecard_type) { + NL_SET_ERR_MSG(extack, "Unsupported line card type provided"); err = -EINVAL; - goto out_unlock; + goto out; } - snapshot_attr = attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]; - if (!snapshot_attr) { - if (!nla_get_flag(attrs[DEVLINK_ATTR_REGION_DIRECT])) { - NL_SET_ERR_MSG(cb->extack, "No snapshot id provided"); - err = -EINVAL; - goto out_unlock; - } + if (linecard->state != DEVLINK_LINECARD_STATE_UNPROVISIONED && + linecard->state != DEVLINK_LINECARD_STATE_PROVISIONING_FAILED) { + NL_SET_ERR_MSG(extack, "Line card already provisioned"); + err = -EBUSY; + /* Check if the line card is provisioned in the same + * way the user asks. In case it is, make the operation + * to return success. + */ + if (ops->same_provision && + ops->same_provision(linecard, linecard->priv, + linecard_type->type, + linecard_type->priv)) + err = 0; + goto out; + } - if (!region->ops->read) { - NL_SET_ERR_MSG(cb->extack, "Requested region does not support direct read"); - err = -EOPNOTSUPP; - goto out_unlock; - } + linecard->state = DEVLINK_LINECARD_STATE_PROVISIONING; + linecard->type = linecard_type->type; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + mutex_unlock(&linecard->state_lock); + err = ops->provision(linecard, linecard->priv, linecard_type->type, + linecard_type->priv, extack); + if (err) { + /* Provisioning failed. Assume the linecard is unprovisioned + * for future operations. 
+ */ + mutex_lock(&linecard->state_lock); + linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; + linecard->type = NULL; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + mutex_unlock(&linecard->state_lock); + } + return err; - if (port) - region_cb = &devlink_region_port_direct_fill; - else - region_cb = &devlink_region_direct_fill; - region_cb_priv = region; - } else { - struct devlink_snapshot *snapshot; - u32 snapshot_id; +out: + mutex_unlock(&linecard->state_lock); + return err; +} - if (nla_get_flag(attrs[DEVLINK_ATTR_REGION_DIRECT])) { - NL_SET_ERR_MSG_ATTR(cb->extack, snapshot_attr, "Direct region read does not use snapshot"); - err = -EINVAL; - goto out_unlock; - } +static int devlink_linecard_type_unset(struct devlink_linecard *linecard, + struct netlink_ext_ack *extack) +{ + int err; - snapshot_id = nla_get_u32(snapshot_attr); - snapshot = devlink_region_snapshot_get_by_id(region, snapshot_id); - if (!snapshot) { - NL_SET_ERR_MSG_ATTR(cb->extack, snapshot_attr, "Requested snapshot does not exist"); - err = -EINVAL; - goto out_unlock; - } - region_cb = &devlink_region_snapshot_fill; - region_cb_priv = snapshot; + mutex_lock(&linecard->state_lock); + if (linecard->state == DEVLINK_LINECARD_STATE_PROVISIONING) { + NL_SET_ERR_MSG(extack, "Line card is currently being provisioned"); + err = -EBUSY; + goto out; } - - if (attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR] && - attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]) { - if (!start_offset) - start_offset = - nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]); - - end_offset = nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]); - end_offset += nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]); + if (linecard->state == DEVLINK_LINECARD_STATE_UNPROVISIONING) { + NL_SET_ERR_MSG(extack, "Line card is currently being unprovisioned"); + err = -EBUSY; + goto out; } - - if (end_offset > region->size) - end_offset = region->size; - - /* return 0 if there is no further data to read */ - if (start_offset == end_offset) { + if (linecard->state == DEVLINK_LINECARD_STATE_PROVISIONING_FAILED) { + linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; + linecard->type = NULL; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); err = 0; - goto out_unlock; + goto out; } - hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - &devlink_nl_family, NLM_F_ACK | NLM_F_MULTI, - DEVLINK_CMD_REGION_READ); - if (!hdr) { - err = -EMSGSIZE; - goto out_unlock; + if (linecard->state == DEVLINK_LINECARD_STATE_UNPROVISIONED) { + NL_SET_ERR_MSG(extack, "Line card is not provisioned"); + err = 0; + goto out; } - - err = devlink_nl_put_handle(skb, devlink); - if (err) - goto nla_put_failure; - - if (region->port) { - err = nla_put_u32(skb, DEVLINK_ATTR_PORT_INDEX, - region->port->index); - if (err) - goto nla_put_failure; + linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONING; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + mutex_unlock(&linecard->state_lock); + err = linecard->ops->unprovision(linecard, linecard->priv, + extack); + if (err) { + /* Unprovisioning failed. Assume the linecard is unprovisioned + * for future operations. 
+ */ + mutex_lock(&linecard->state_lock); + linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; + linecard->type = NULL; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + mutex_unlock(&linecard->state_lock); } + return err; - err = nla_put_string(skb, DEVLINK_ATTR_REGION_NAME, region_name); - if (err) - goto nla_put_failure; +out: + mutex_unlock(&linecard->state_lock); + return err; +} - chunks_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_REGION_CHUNKS); - if (!chunks_attr) { - err = -EMSGSIZE; - goto nla_put_failure; - } +static int devlink_nl_cmd_linecard_set_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct netlink_ext_ack *extack = info->extack; + struct devlink *devlink = info->user_ptr[0]; + struct devlink_linecard *linecard; + int err; - err = devlink_nl_region_read_fill(skb, region_cb, region_cb_priv, - start_offset, end_offset, &ret_offset, - cb->extack); + linecard = devlink_linecard_get_from_info(devlink, info); + if (IS_ERR(linecard)) + return PTR_ERR(linecard); - if (err && err != -EMSGSIZE) - goto nla_put_failure; + if (info->attrs[DEVLINK_ATTR_LINECARD_TYPE]) { + const char *type; - /* Check if there was any progress done to prevent infinite loop */ - if (ret_offset == start_offset) { - err = -EINVAL; - goto nla_put_failure; + type = nla_data(info->attrs[DEVLINK_ATTR_LINECARD_TYPE]); + if (strcmp(type, "")) { + err = devlink_linecard_type_set(linecard, type, extack); + if (err) + return err; + } else { + err = devlink_linecard_type_unset(linecard, extack); + if (err) + return err; + } } - state->start_offset = ret_offset; + return 0; +} - nla_nest_end(skb, chunks_attr); - genlmsg_end(skb, hdr); - devl_unlock(devlink); - devlink_put(devlink); - return skb->len; +int devlink_rate_nodes_check(struct devlink *devlink, u16 mode, + struct netlink_ext_ack *extack) +{ + struct devlink_rate *devlink_rate; -nla_put_failure: - genlmsg_cancel(skb, hdr); -out_unlock: - devl_unlock(devlink); - devlink_put(devlink); - return err; + list_for_each_entry(devlink_rate, &devlink->rate_list, list) + if (devlink_rate_is_node(devlink_rate)) { + NL_SET_ERR_MSG(extack, "Rate node(s) exists."); + return -EBUSY; + } + return 0; } struct devlink_stats { @@ -3611,231 +2580,6 @@ void devlink_linecard_nested_dl_set(struct devlink_linecard *linecard, } EXPORT_SYMBOL_GPL(devlink_linecard_nested_dl_set); -/** - * devl_region_create - create a new address region - * - * @devlink: devlink - * @ops: region operations and name - * @region_max_snapshots: Maximum supported number of snapshots for region - * @region_size: size of region - */ -struct devlink_region *devl_region_create(struct devlink *devlink, - const struct devlink_region_ops *ops, - u32 region_max_snapshots, - u64 region_size) -{ - struct devlink_region *region; - - devl_assert_locked(devlink); - - if (WARN_ON(!ops) || WARN_ON(!ops->destructor)) - return ERR_PTR(-EINVAL); - - if (devlink_region_get_by_name(devlink, ops->name)) - return ERR_PTR(-EEXIST); - - region = kzalloc(sizeof(*region), GFP_KERNEL); - if (!region) - return ERR_PTR(-ENOMEM); - - region->devlink = devlink; - region->max_snapshots = region_max_snapshots; - region->ops = ops; - region->size = region_size; - INIT_LIST_HEAD(®ion->snapshot_list); - mutex_init(®ion->snapshot_lock); - list_add_tail(®ion->list, &devlink->region_list); - devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_NEW); - - return region; -} -EXPORT_SYMBOL_GPL(devl_region_create); - -/** - * devlink_region_create - create a new address region - * - * @devlink: devlink - * 
@ops: region operations and name - * @region_max_snapshots: Maximum supported number of snapshots for region - * @region_size: size of region - * - * Context: Takes and release devlink->lock . - */ -struct devlink_region * -devlink_region_create(struct devlink *devlink, - const struct devlink_region_ops *ops, - u32 region_max_snapshots, u64 region_size) -{ - struct devlink_region *region; - - devl_lock(devlink); - region = devl_region_create(devlink, ops, region_max_snapshots, - region_size); - devl_unlock(devlink); - return region; -} -EXPORT_SYMBOL_GPL(devlink_region_create); - -/** - * devlink_port_region_create - create a new address region for a port - * - * @port: devlink port - * @ops: region operations and name - * @region_max_snapshots: Maximum supported number of snapshots for region - * @region_size: size of region - * - * Context: Takes and release devlink->lock . - */ -struct devlink_region * -devlink_port_region_create(struct devlink_port *port, - const struct devlink_port_region_ops *ops, - u32 region_max_snapshots, u64 region_size) -{ - struct devlink *devlink = port->devlink; - struct devlink_region *region; - int err = 0; - - ASSERT_DEVLINK_PORT_INITIALIZED(port); - - if (WARN_ON(!ops) || WARN_ON(!ops->destructor)) - return ERR_PTR(-EINVAL); - - devl_lock(devlink); - - if (devlink_port_region_get_by_name(port, ops->name)) { - err = -EEXIST; - goto unlock; - } - - region = kzalloc(sizeof(*region), GFP_KERNEL); - if (!region) { - err = -ENOMEM; - goto unlock; - } - - region->devlink = devlink; - region->port = port; - region->max_snapshots = region_max_snapshots; - region->port_ops = ops; - region->size = region_size; - INIT_LIST_HEAD(®ion->snapshot_list); - mutex_init(®ion->snapshot_lock); - list_add_tail(®ion->list, &port->region_list); - devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_NEW); - - devl_unlock(devlink); - return region; - -unlock: - devl_unlock(devlink); - return ERR_PTR(err); -} -EXPORT_SYMBOL_GPL(devlink_port_region_create); - -/** - * devl_region_destroy - destroy address region - * - * @region: devlink region to destroy - */ -void devl_region_destroy(struct devlink_region *region) -{ - struct devlink *devlink = region->devlink; - struct devlink_snapshot *snapshot, *ts; - - devl_assert_locked(devlink); - - /* Free all snapshots of region */ - mutex_lock(®ion->snapshot_lock); - list_for_each_entry_safe(snapshot, ts, ®ion->snapshot_list, list) - devlink_region_snapshot_del(region, snapshot); - mutex_unlock(®ion->snapshot_lock); - - list_del(®ion->list); - mutex_destroy(®ion->snapshot_lock); - - devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_DEL); - kfree(region); -} -EXPORT_SYMBOL_GPL(devl_region_destroy); - -/** - * devlink_region_destroy - destroy address region - * - * @region: devlink region to destroy - * - * Context: Takes and release devlink->lock . - */ -void devlink_region_destroy(struct devlink_region *region) -{ - struct devlink *devlink = region->devlink; - - devl_lock(devlink); - devl_region_destroy(region); - devl_unlock(devlink); -} -EXPORT_SYMBOL_GPL(devlink_region_destroy); - -/** - * devlink_region_snapshot_id_get - get snapshot ID - * - * This callback should be called when adding a new snapshot, - * Driver should use the same id for multiple snapshots taken - * on multiple regions at the same time/by the same trigger. - * - * The caller of this function must use devlink_region_snapshot_id_put - * when finished creating regions using this id. - * - * Returns zero on success, or a negative error code on failure. 
- * - * @devlink: devlink - * @id: storage to return id - */ -int devlink_region_snapshot_id_get(struct devlink *devlink, u32 *id) -{ - return __devlink_region_snapshot_id_get(devlink, id); -} -EXPORT_SYMBOL_GPL(devlink_region_snapshot_id_get); - -/** - * devlink_region_snapshot_id_put - put snapshot ID reference - * - * This should be called by a driver after finishing creating snapshots - * with an id. Doing so ensures that the ID can later be released in the - * event that all snapshots using it have been destroyed. - * - * @devlink: devlink - * @id: id to release reference on - */ -void devlink_region_snapshot_id_put(struct devlink *devlink, u32 id) -{ - __devlink_snapshot_id_decrement(devlink, id); -} -EXPORT_SYMBOL_GPL(devlink_region_snapshot_id_put); - -/** - * devlink_region_snapshot_create - create a new snapshot - * This will add a new snapshot of a region. The snapshot - * will be stored on the region struct and can be accessed - * from devlink. This is useful for future analyses of snapshots. - * Multiple snapshots can be created on a region. - * The @snapshot_id should be obtained using the getter function. - * - * @region: devlink region of the snapshot - * @data: snapshot data - * @snapshot_id: snapshot id to be created - */ -int devlink_region_snapshot_create(struct devlink_region *region, - u8 *data, u32 snapshot_id) -{ - int err; - - mutex_lock(®ion->snapshot_lock); - err = __devlink_region_snapshot_create(region, data, snapshot_id); - mutex_unlock(®ion->snapshot_lock); - return err; -} -EXPORT_SYMBOL_GPL(devlink_region_snapshot_create); - #define DEVLINK_TRAP(_id, _type) \ { \ .type = DEVLINK_TRAP_TYPE_##_type, \ diff --git a/net/devlink/region.c b/net/devlink/region.c new file mode 100644 index 000000000000..d197cdb662db --- /dev/null +++ b/net/devlink/region.c @@ -0,0 +1,1260 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * Copyright (c) 2016 Jiri Pirko + */ + +#include "devl_internal.h" + +struct devlink_region { + struct devlink *devlink; + struct devlink_port *port; + struct list_head list; + union { + const struct devlink_region_ops *ops; + const struct devlink_port_region_ops *port_ops; + }; + struct mutex snapshot_lock; /* protects snapshot_list, + * max_snapshots and cur_snapshots + * consistency. 
+ */ + struct list_head snapshot_list; + u32 max_snapshots; + u32 cur_snapshots; + u64 size; +}; + +struct devlink_snapshot { + struct list_head list; + struct devlink_region *region; + u8 *data; + u32 id; +}; + +static struct devlink_region * +devlink_region_get_by_name(struct devlink *devlink, const char *region_name) +{ + struct devlink_region *region; + + list_for_each_entry(region, &devlink->region_list, list) + if (!strcmp(region->ops->name, region_name)) + return region; + + return NULL; +} + +static struct devlink_region * +devlink_port_region_get_by_name(struct devlink_port *port, + const char *region_name) +{ + struct devlink_region *region; + + list_for_each_entry(region, &port->region_list, list) + if (!strcmp(region->ops->name, region_name)) + return region; + + return NULL; +} + +static struct devlink_snapshot * +devlink_region_snapshot_get_by_id(struct devlink_region *region, u32 id) +{ + struct devlink_snapshot *snapshot; + + list_for_each_entry(snapshot, ®ion->snapshot_list, list) + if (snapshot->id == id) + return snapshot; + + return NULL; +} + +static int devlink_nl_region_snapshot_id_put(struct sk_buff *msg, + struct devlink *devlink, + struct devlink_snapshot *snapshot) +{ + struct nlattr *snap_attr; + int err; + + snap_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_REGION_SNAPSHOT); + if (!snap_attr) + return -EINVAL; + + err = nla_put_u32(msg, DEVLINK_ATTR_REGION_SNAPSHOT_ID, snapshot->id); + if (err) + goto nla_put_failure; + + nla_nest_end(msg, snap_attr); + return 0; + +nla_put_failure: + nla_nest_cancel(msg, snap_attr); + return err; +} + +static int devlink_nl_region_snapshots_id_put(struct sk_buff *msg, + struct devlink *devlink, + struct devlink_region *region) +{ + struct devlink_snapshot *snapshot; + struct nlattr *snapshots_attr; + int err; + + snapshots_attr = nla_nest_start_noflag(msg, + DEVLINK_ATTR_REGION_SNAPSHOTS); + if (!snapshots_attr) + return -EINVAL; + + list_for_each_entry(snapshot, ®ion->snapshot_list, list) { + err = devlink_nl_region_snapshot_id_put(msg, devlink, snapshot); + if (err) + goto nla_put_failure; + } + + nla_nest_end(msg, snapshots_attr); + return 0; + +nla_put_failure: + nla_nest_cancel(msg, snapshots_attr); + return err; +} + +static int devlink_nl_region_fill(struct sk_buff *msg, struct devlink *devlink, + enum devlink_command cmd, u32 portid, + u32 seq, int flags, + struct devlink_region *region) +{ + void *hdr; + int err; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + err = devlink_nl_put_handle(msg, devlink); + if (err) + goto nla_put_failure; + + if (region->port) { + err = nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, + region->port->index); + if (err) + goto nla_put_failure; + } + + err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME, region->ops->name); + if (err) + goto nla_put_failure; + + err = nla_put_u64_64bit(msg, DEVLINK_ATTR_REGION_SIZE, + region->size, + DEVLINK_ATTR_PAD); + if (err) + goto nla_put_failure; + + err = nla_put_u32(msg, DEVLINK_ATTR_REGION_MAX_SNAPSHOTS, + region->max_snapshots); + if (err) + goto nla_put_failure; + + err = devlink_nl_region_snapshots_id_put(msg, devlink, region); + if (err) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return err; +} + +static struct sk_buff * +devlink_nl_region_notify_build(struct devlink_region *region, + struct devlink_snapshot *snapshot, + enum devlink_command cmd, u32 portid, u32 seq) +{ + struct devlink *devlink = region->devlink; + 
struct sk_buff *msg; + void *hdr; + int err; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return ERR_PTR(-ENOMEM); + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, 0, cmd); + if (!hdr) { + err = -EMSGSIZE; + goto out_free_msg; + } + + err = devlink_nl_put_handle(msg, devlink); + if (err) + goto out_cancel_msg; + + if (region->port) { + err = nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, + region->port->index); + if (err) + goto out_cancel_msg; + } + + err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME, + region->ops->name); + if (err) + goto out_cancel_msg; + + if (snapshot) { + err = nla_put_u32(msg, DEVLINK_ATTR_REGION_SNAPSHOT_ID, + snapshot->id); + if (err) + goto out_cancel_msg; + } else { + err = nla_put_u64_64bit(msg, DEVLINK_ATTR_REGION_SIZE, + region->size, DEVLINK_ATTR_PAD); + if (err) + goto out_cancel_msg; + } + genlmsg_end(msg, hdr); + + return msg; + +out_cancel_msg: + genlmsg_cancel(msg, hdr); +out_free_msg: + nlmsg_free(msg); + return ERR_PTR(err); +} + +static void devlink_nl_region_notify(struct devlink_region *region, + struct devlink_snapshot *snapshot, + enum devlink_command cmd) +{ + struct devlink *devlink = region->devlink; + struct sk_buff *msg; + + WARN_ON(cmd != DEVLINK_CMD_REGION_NEW && cmd != DEVLINK_CMD_REGION_DEL); + if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)) + return; + + msg = devlink_nl_region_notify_build(region, snapshot, cmd, 0, 0); + if (IS_ERR(msg)) + return; + + genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), msg, + 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); +} + +void devlink_regions_notify_register(struct devlink *devlink) +{ + struct devlink_region *region; + + list_for_each_entry(region, &devlink->region_list, list) + devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_NEW); +} + +void devlink_regions_notify_unregister(struct devlink *devlink) +{ + struct devlink_region *region; + + list_for_each_entry_reverse(region, &devlink->region_list, list) + devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_DEL); +} + +/** + * __devlink_snapshot_id_increment - Increment number of snapshots using an id + * @devlink: devlink instance + * @id: the snapshot id + * + * Track when a new snapshot begins using an id. Load the count for the + * given id from the snapshot xarray, increment it, and store it back. + * + * Called when a new snapshot is created with the given id. + * + * The id *must* have been previously allocated by + * devlink_region_snapshot_id_get(). + * + * Returns 0 on success, or an error on failure. + */ +static int __devlink_snapshot_id_increment(struct devlink *devlink, u32 id) +{ + unsigned long count; + void *p; + int err; + + xa_lock(&devlink->snapshot_ids); + p = xa_load(&devlink->snapshot_ids, id); + if (WARN_ON(!p)) { + err = -EINVAL; + goto unlock; + } + + if (WARN_ON(!xa_is_value(p))) { + err = -EINVAL; + goto unlock; + } + + count = xa_to_value(p); + count++; + + err = xa_err(__xa_store(&devlink->snapshot_ids, id, xa_mk_value(count), + GFP_ATOMIC)); +unlock: + xa_unlock(&devlink->snapshot_ids); + return err; +} + +/** + * __devlink_snapshot_id_decrement - Decrease number of snapshots using an id + * @devlink: devlink instance + * @id: the snapshot id + * + * Track when a snapshot is deleted and stops using an id. Load the count + * for the given id from the snapshot xarray, decrement it, and store it + * back. + * + * If the count reaches zero, erase this id from the xarray, freeing it + * up for future re-use by devlink_region_snapshot_id_get(). 
+ * + * Called when a snapshot using the given id is deleted, and when the + * initial allocator of the id is finished using it. + */ +static void __devlink_snapshot_id_decrement(struct devlink *devlink, u32 id) +{ + unsigned long count; + void *p; + + xa_lock(&devlink->snapshot_ids); + p = xa_load(&devlink->snapshot_ids, id); + if (WARN_ON(!p)) + goto unlock; + + if (WARN_ON(!xa_is_value(p))) + goto unlock; + + count = xa_to_value(p); + + if (count > 1) { + count--; + __xa_store(&devlink->snapshot_ids, id, xa_mk_value(count), + GFP_ATOMIC); + } else { + /* If this was the last user, we can erase this id */ + __xa_erase(&devlink->snapshot_ids, id); + } +unlock: + xa_unlock(&devlink->snapshot_ids); +} + +/** + * __devlink_snapshot_id_insert - Insert a specific snapshot ID + * @devlink: devlink instance + * @id: the snapshot id + * + * Mark the given snapshot id as used by inserting a zero value into the + * snapshot xarray. + * + * This must be called while holding the devlink instance lock. Unlike + * devlink_snapshot_id_get, the initial reference count is zero, not one. + * It is expected that the id will immediately be used before + * releasing the devlink instance lock. + * + * Returns zero on success, or an error code if the snapshot id could not + * be inserted. + */ +static int __devlink_snapshot_id_insert(struct devlink *devlink, u32 id) +{ + int err; + + xa_lock(&devlink->snapshot_ids); + if (xa_load(&devlink->snapshot_ids, id)) { + xa_unlock(&devlink->snapshot_ids); + return -EEXIST; + } + err = xa_err(__xa_store(&devlink->snapshot_ids, id, xa_mk_value(0), + GFP_ATOMIC)); + xa_unlock(&devlink->snapshot_ids); + return err; +} + +/** + * __devlink_region_snapshot_id_get - get snapshot ID + * @devlink: devlink instance + * @id: storage to return snapshot id + * + * Allocates a new snapshot id. Returns zero on success, or a negative + * error on failure. Must be called while holding the devlink instance + * lock. + * + * Snapshot IDs are tracked using an xarray which stores the number of + * users of the snapshot id. + * + * Note that the caller of this function counts as a 'user', in order to + * avoid race conditions. The caller must release its hold on the + * snapshot by using devlink_region_snapshot_id_put. + */ +static int __devlink_region_snapshot_id_get(struct devlink *devlink, u32 *id) +{ + return xa_alloc(&devlink->snapshot_ids, id, xa_mk_value(1), + xa_limit_32b, GFP_KERNEL); +} + +/** + * __devlink_region_snapshot_create - create a new snapshot + * This will add a new snapshot of a region. The snapshot + * will be stored on the region struct and can be accessed + * from devlink. This is useful for future analyses of snapshots. + * Multiple snapshots can be created on a region. + * The @snapshot_id should be obtained using the getter function. + * + * Must be called only while holding the region snapshot lock. 
+ * + * @region: devlink region of the snapshot + * @data: snapshot data + * @snapshot_id: snapshot id to be created + */ +static int +__devlink_region_snapshot_create(struct devlink_region *region, + u8 *data, u32 snapshot_id) +{ + struct devlink *devlink = region->devlink; + struct devlink_snapshot *snapshot; + int err; + + lockdep_assert_held(®ion->snapshot_lock); + + /* check if region can hold one more snapshot */ + if (region->cur_snapshots == region->max_snapshots) + return -ENOSPC; + + if (devlink_region_snapshot_get_by_id(region, snapshot_id)) + return -EEXIST; + + snapshot = kzalloc(sizeof(*snapshot), GFP_KERNEL); + if (!snapshot) + return -ENOMEM; + + err = __devlink_snapshot_id_increment(devlink, snapshot_id); + if (err) + goto err_snapshot_id_increment; + + snapshot->id = snapshot_id; + snapshot->region = region; + snapshot->data = data; + + list_add_tail(&snapshot->list, ®ion->snapshot_list); + + region->cur_snapshots++; + + devlink_nl_region_notify(region, snapshot, DEVLINK_CMD_REGION_NEW); + return 0; + +err_snapshot_id_increment: + kfree(snapshot); + return err; +} + +static void devlink_region_snapshot_del(struct devlink_region *region, + struct devlink_snapshot *snapshot) +{ + struct devlink *devlink = region->devlink; + + lockdep_assert_held(®ion->snapshot_lock); + + devlink_nl_region_notify(region, snapshot, DEVLINK_CMD_REGION_DEL); + region->cur_snapshots--; + list_del(&snapshot->list); + region->ops->destructor(snapshot->data); + __devlink_snapshot_id_decrement(devlink, snapshot->id); + kfree(snapshot); +} + +int devlink_nl_region_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_port *port = NULL; + struct devlink_region *region; + const char *region_name; + struct sk_buff *msg; + unsigned int index; + int err; + + if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_REGION_NAME)) + return -EINVAL; + + if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) { + index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); + + port = devlink_port_get_by_index(devlink, index); + if (!port) + return -ENODEV; + } + + region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]); + if (port) + region = devlink_port_region_get_by_name(port, region_name); + else + region = devlink_region_get_by_name(devlink, region_name); + + if (!region) + return -EINVAL; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_region_fill(msg, devlink, DEVLINK_CMD_REGION_GET, + info->snd_portid, info->snd_seq, 0, + region); + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + +static int devlink_nl_cmd_region_get_port_dumpit(struct sk_buff *msg, + struct netlink_callback *cb, + struct devlink_port *port, + int *idx, int start, int flags) +{ + struct devlink_region *region; + int err = 0; + + list_for_each_entry(region, &port->region_list, list) { + if (*idx < start) { + (*idx)++; + continue; + } + err = devlink_nl_region_fill(msg, port->devlink, + DEVLINK_CMD_REGION_GET, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + flags, region); + if (err) + goto out; + (*idx)++; + } + +out: + return err; +} + +static int devlink_nl_region_get_dump_one(struct sk_buff *msg, + struct devlink *devlink, + struct netlink_callback *cb, + int flags) +{ + struct devlink_nl_dump_state *state = devlink_dump_state(cb); + struct devlink_region *region; + struct devlink_port *port; + unsigned long port_index; + int idx = 0; + int err; + + list_for_each_entry(region, 
&devlink->region_list, list) { + if (idx < state->idx) { + idx++; + continue; + } + err = devlink_nl_region_fill(msg, devlink, + DEVLINK_CMD_REGION_GET, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, flags, + region); + if (err) { + state->idx = idx; + return err; + } + idx++; + } + + xa_for_each(&devlink->ports, port_index, port) { + err = devlink_nl_cmd_region_get_port_dumpit(msg, cb, port, &idx, + state->idx, flags); + if (err) { + state->idx = idx; + return err; + } + } + + return 0; +} + +int devlink_nl_region_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return devlink_nl_dumpit(skb, cb, devlink_nl_region_get_dump_one); +} + +int devlink_nl_cmd_region_del(struct sk_buff *skb, struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_snapshot *snapshot; + struct devlink_port *port = NULL; + struct devlink_region *region; + const char *region_name; + unsigned int index; + u32 snapshot_id; + + if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_REGION_NAME) || + GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_REGION_SNAPSHOT_ID)) + return -EINVAL; + + region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]); + snapshot_id = nla_get_u32(info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]); + + if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) { + index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); + + port = devlink_port_get_by_index(devlink, index); + if (!port) + return -ENODEV; + } + + if (port) + region = devlink_port_region_get_by_name(port, region_name); + else + region = devlink_region_get_by_name(devlink, region_name); + + if (!region) + return -EINVAL; + + mutex_lock(®ion->snapshot_lock); + snapshot = devlink_region_snapshot_get_by_id(region, snapshot_id); + if (!snapshot) { + mutex_unlock(®ion->snapshot_lock); + return -EINVAL; + } + + devlink_region_snapshot_del(region, snapshot); + mutex_unlock(®ion->snapshot_lock); + return 0; +} + +int devlink_nl_cmd_region_new(struct sk_buff *skb, struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_snapshot *snapshot; + struct devlink_port *port = NULL; + struct nlattr *snapshot_id_attr; + struct devlink_region *region; + const char *region_name; + unsigned int index; + u32 snapshot_id; + u8 *data; + int err; + + if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_REGION_NAME)) { + NL_SET_ERR_MSG(info->extack, "No region name provided"); + return -EINVAL; + } + + region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]); + + if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) { + index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); + + port = devlink_port_get_by_index(devlink, index); + if (!port) + return -ENODEV; + } + + if (port) + region = devlink_port_region_get_by_name(port, region_name); + else + region = devlink_region_get_by_name(devlink, region_name); + + if (!region) { + NL_SET_ERR_MSG(info->extack, "The requested region does not exist"); + return -EINVAL; + } + + if (!region->ops->snapshot) { + NL_SET_ERR_MSG(info->extack, "The requested region does not support taking an immediate snapshot"); + return -EOPNOTSUPP; + } + + mutex_lock(®ion->snapshot_lock); + + if (region->cur_snapshots == region->max_snapshots) { + NL_SET_ERR_MSG(info->extack, "The region has reached the maximum number of stored snapshots"); + err = -ENOSPC; + goto unlock; + } + + snapshot_id_attr = info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]; + if (snapshot_id_attr) { + snapshot_id = nla_get_u32(snapshot_id_attr); + + if (devlink_region_snapshot_get_by_id(region, snapshot_id)) { + 
NL_SET_ERR_MSG(info->extack, "The requested snapshot id is already in use"); + err = -EEXIST; + goto unlock; + } + + err = __devlink_snapshot_id_insert(devlink, snapshot_id); + if (err) + goto unlock; + } else { + err = __devlink_region_snapshot_id_get(devlink, &snapshot_id); + if (err) { + NL_SET_ERR_MSG(info->extack, "Failed to allocate a new snapshot id"); + goto unlock; + } + } + + if (port) + err = region->port_ops->snapshot(port, region->port_ops, + info->extack, &data); + else + err = region->ops->snapshot(devlink, region->ops, + info->extack, &data); + if (err) + goto err_snapshot_capture; + + err = __devlink_region_snapshot_create(region, data, snapshot_id); + if (err) + goto err_snapshot_create; + + if (!snapshot_id_attr) { + struct sk_buff *msg; + + snapshot = devlink_region_snapshot_get_by_id(region, + snapshot_id); + if (WARN_ON(!snapshot)) { + err = -EINVAL; + goto unlock; + } + + msg = devlink_nl_region_notify_build(region, snapshot, + DEVLINK_CMD_REGION_NEW, + info->snd_portid, + info->snd_seq); + err = PTR_ERR_OR_ZERO(msg); + if (err) + goto err_notify; + + err = genlmsg_reply(msg, info); + if (err) + goto err_notify; + } + + mutex_unlock(®ion->snapshot_lock); + return 0; + +err_snapshot_create: + region->ops->destructor(data); +err_snapshot_capture: + __devlink_snapshot_id_decrement(devlink, snapshot_id); + mutex_unlock(®ion->snapshot_lock); + return err; + +err_notify: + devlink_region_snapshot_del(region, snapshot); +unlock: + mutex_unlock(®ion->snapshot_lock); + return err; +} + +static int devlink_nl_cmd_region_read_chunk_fill(struct sk_buff *msg, + u8 *chunk, u32 chunk_size, + u64 addr) +{ + struct nlattr *chunk_attr; + int err; + + chunk_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_REGION_CHUNK); + if (!chunk_attr) + return -EINVAL; + + err = nla_put(msg, DEVLINK_ATTR_REGION_CHUNK_DATA, chunk_size, chunk); + if (err) + goto nla_put_failure; + + err = nla_put_u64_64bit(msg, DEVLINK_ATTR_REGION_CHUNK_ADDR, addr, + DEVLINK_ATTR_PAD); + if (err) + goto nla_put_failure; + + nla_nest_end(msg, chunk_attr); + return 0; + +nla_put_failure: + nla_nest_cancel(msg, chunk_attr); + return err; +} + +#define DEVLINK_REGION_READ_CHUNK_SIZE 256 + +typedef int devlink_chunk_fill_t(void *cb_priv, u8 *chunk, u32 chunk_size, + u64 curr_offset, + struct netlink_ext_ack *extack); + +static int +devlink_nl_region_read_fill(struct sk_buff *skb, devlink_chunk_fill_t *cb, + void *cb_priv, u64 start_offset, u64 end_offset, + u64 *new_offset, struct netlink_ext_ack *extack) +{ + u64 curr_offset = start_offset; + int err = 0; + u8 *data; + + /* Allocate and re-use a single buffer */ + data = kmalloc(DEVLINK_REGION_READ_CHUNK_SIZE, GFP_KERNEL); + if (!data) + return -ENOMEM; + + *new_offset = start_offset; + + while (curr_offset < end_offset) { + u32 data_size; + + data_size = min_t(u32, end_offset - curr_offset, + DEVLINK_REGION_READ_CHUNK_SIZE); + + err = cb(cb_priv, data, data_size, curr_offset, extack); + if (err) + break; + + err = devlink_nl_cmd_region_read_chunk_fill(skb, data, data_size, curr_offset); + if (err) + break; + + curr_offset += data_size; + } + *new_offset = curr_offset; + + kfree(data); + + return err; +} + +static int +devlink_region_snapshot_fill(void *cb_priv, u8 *chunk, u32 chunk_size, + u64 curr_offset, + struct netlink_ext_ack __always_unused *extack) +{ + struct devlink_snapshot *snapshot = cb_priv; + + memcpy(chunk, &snapshot->data[curr_offset], chunk_size); + + return 0; +} + +static int +devlink_region_port_direct_fill(void *cb_priv, u8 *chunk, u32 chunk_size, + 
u64 curr_offset, struct netlink_ext_ack *extack) +{ + struct devlink_region *region = cb_priv; + + return region->port_ops->read(region->port, region->port_ops, extack, + curr_offset, chunk_size, chunk); +} + +static int +devlink_region_direct_fill(void *cb_priv, u8 *chunk, u32 chunk_size, + u64 curr_offset, struct netlink_ext_ack *extack) +{ + struct devlink_region *region = cb_priv; + + return region->ops->read(region->devlink, region->ops, extack, + curr_offset, chunk_size, chunk); +} + +int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + const struct genl_dumpit_info *info = genl_dumpit_info(cb); + struct devlink_nl_dump_state *state = devlink_dump_state(cb); + struct nlattr *chunks_attr, *region_attr, *snapshot_attr; + u64 ret_offset, start_offset, end_offset = U64_MAX; + struct nlattr **attrs = info->info.attrs; + struct devlink_port *port = NULL; + devlink_chunk_fill_t *region_cb; + struct devlink_region *region; + const char *region_name; + struct devlink *devlink; + unsigned int index; + void *region_cb_priv; + void *hdr; + int err; + + start_offset = state->start_offset; + + devlink = devlink_get_from_attrs_lock(sock_net(cb->skb->sk), attrs); + if (IS_ERR(devlink)) + return PTR_ERR(devlink); + + if (!attrs[DEVLINK_ATTR_REGION_NAME]) { + NL_SET_ERR_MSG(cb->extack, "No region name provided"); + err = -EINVAL; + goto out_unlock; + } + + if (attrs[DEVLINK_ATTR_PORT_INDEX]) { + index = nla_get_u32(attrs[DEVLINK_ATTR_PORT_INDEX]); + + port = devlink_port_get_by_index(devlink, index); + if (!port) { + err = -ENODEV; + goto out_unlock; + } + } + + region_attr = attrs[DEVLINK_ATTR_REGION_NAME]; + region_name = nla_data(region_attr); + + if (port) + region = devlink_port_region_get_by_name(port, region_name); + else + region = devlink_region_get_by_name(devlink, region_name); + + if (!region) { + NL_SET_ERR_MSG_ATTR(cb->extack, region_attr, "Requested region does not exist"); + err = -EINVAL; + goto out_unlock; + } + + snapshot_attr = attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]; + if (!snapshot_attr) { + if (!nla_get_flag(attrs[DEVLINK_ATTR_REGION_DIRECT])) { + NL_SET_ERR_MSG(cb->extack, "No snapshot id provided"); + err = -EINVAL; + goto out_unlock; + } + + if (!region->ops->read) { + NL_SET_ERR_MSG(cb->extack, "Requested region does not support direct read"); + err = -EOPNOTSUPP; + goto out_unlock; + } + + if (port) + region_cb = &devlink_region_port_direct_fill; + else + region_cb = &devlink_region_direct_fill; + region_cb_priv = region; + } else { + struct devlink_snapshot *snapshot; + u32 snapshot_id; + + if (nla_get_flag(attrs[DEVLINK_ATTR_REGION_DIRECT])) { + NL_SET_ERR_MSG_ATTR(cb->extack, snapshot_attr, "Direct region read does not use snapshot"); + err = -EINVAL; + goto out_unlock; + } + + snapshot_id = nla_get_u32(snapshot_attr); + snapshot = devlink_region_snapshot_get_by_id(region, snapshot_id); + if (!snapshot) { + NL_SET_ERR_MSG_ATTR(cb->extack, snapshot_attr, "Requested snapshot does not exist"); + err = -EINVAL; + goto out_unlock; + } + region_cb = &devlink_region_snapshot_fill; + region_cb_priv = snapshot; + } + + if (attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR] && + attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]) { + if (!start_offset) + start_offset = + nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]); + + end_offset = nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]); + end_offset += nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]); + } + + if (end_offset > region->size) + end_offset = region->size; + + /* return 0 if there is no further data 
to read */ + if (start_offset == end_offset) { + err = 0; + goto out_unlock; + } + + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &devlink_nl_family, NLM_F_ACK | NLM_F_MULTI, + DEVLINK_CMD_REGION_READ); + if (!hdr) { + err = -EMSGSIZE; + goto out_unlock; + } + + err = devlink_nl_put_handle(skb, devlink); + if (err) + goto nla_put_failure; + + if (region->port) { + err = nla_put_u32(skb, DEVLINK_ATTR_PORT_INDEX, + region->port->index); + if (err) + goto nla_put_failure; + } + + err = nla_put_string(skb, DEVLINK_ATTR_REGION_NAME, region_name); + if (err) + goto nla_put_failure; + + chunks_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_REGION_CHUNKS); + if (!chunks_attr) { + err = -EMSGSIZE; + goto nla_put_failure; + } + + err = devlink_nl_region_read_fill(skb, region_cb, region_cb_priv, + start_offset, end_offset, &ret_offset, + cb->extack); + + if (err && err != -EMSGSIZE) + goto nla_put_failure; + + /* Check if there was any progress done to prevent infinite loop */ + if (ret_offset == start_offset) { + err = -EINVAL; + goto nla_put_failure; + } + + state->start_offset = ret_offset; + + nla_nest_end(skb, chunks_attr); + genlmsg_end(skb, hdr); + devl_unlock(devlink); + devlink_put(devlink); + return skb->len; + +nla_put_failure: + genlmsg_cancel(skb, hdr); +out_unlock: + devl_unlock(devlink); + devlink_put(devlink); + return err; +} + +/** + * devl_region_create - create a new address region + * + * @devlink: devlink + * @ops: region operations and name + * @region_max_snapshots: Maximum supported number of snapshots for region + * @region_size: size of region + */ +struct devlink_region *devl_region_create(struct devlink *devlink, + const struct devlink_region_ops *ops, + u32 region_max_snapshots, + u64 region_size) +{ + struct devlink_region *region; + + devl_assert_locked(devlink); + + if (WARN_ON(!ops) || WARN_ON(!ops->destructor)) + return ERR_PTR(-EINVAL); + + if (devlink_region_get_by_name(devlink, ops->name)) + return ERR_PTR(-EEXIST); + + region = kzalloc(sizeof(*region), GFP_KERNEL); + if (!region) + return ERR_PTR(-ENOMEM); + + region->devlink = devlink; + region->max_snapshots = region_max_snapshots; + region->ops = ops; + region->size = region_size; + INIT_LIST_HEAD(®ion->snapshot_list); + mutex_init(®ion->snapshot_lock); + list_add_tail(®ion->list, &devlink->region_list); + devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_NEW); + + return region; +} +EXPORT_SYMBOL_GPL(devl_region_create); + +/** + * devlink_region_create - create a new address region + * + * @devlink: devlink + * @ops: region operations and name + * @region_max_snapshots: Maximum supported number of snapshots for region + * @region_size: size of region + * + * Context: Takes and release devlink->lock . + */ +struct devlink_region * +devlink_region_create(struct devlink *devlink, + const struct devlink_region_ops *ops, + u32 region_max_snapshots, u64 region_size) +{ + struct devlink_region *region; + + devl_lock(devlink); + region = devl_region_create(devlink, ops, region_max_snapshots, + region_size); + devl_unlock(devlink); + return region; +} +EXPORT_SYMBOL_GPL(devlink_region_create); + +/** + * devlink_port_region_create - create a new address region for a port + * + * @port: devlink port + * @ops: region operations and name + * @region_max_snapshots: Maximum supported number of snapshots for region + * @region_size: size of region + * + * Context: Takes and release devlink->lock . 
+ */ +struct devlink_region * +devlink_port_region_create(struct devlink_port *port, + const struct devlink_port_region_ops *ops, + u32 region_max_snapshots, u64 region_size) +{ + struct devlink *devlink = port->devlink; + struct devlink_region *region; + int err = 0; + + ASSERT_DEVLINK_PORT_INITIALIZED(port); + + if (WARN_ON(!ops) || WARN_ON(!ops->destructor)) + return ERR_PTR(-EINVAL); + + devl_lock(devlink); + + if (devlink_port_region_get_by_name(port, ops->name)) { + err = -EEXIST; + goto unlock; + } + + region = kzalloc(sizeof(*region), GFP_KERNEL); + if (!region) { + err = -ENOMEM; + goto unlock; + } + + region->devlink = devlink; + region->port = port; + region->max_snapshots = region_max_snapshots; + region->port_ops = ops; + region->size = region_size; + INIT_LIST_HEAD(®ion->snapshot_list); + mutex_init(®ion->snapshot_lock); + list_add_tail(®ion->list, &port->region_list); + devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_NEW); + + devl_unlock(devlink); + return region; + +unlock: + devl_unlock(devlink); + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(devlink_port_region_create); + +/** + * devl_region_destroy - destroy address region + * + * @region: devlink region to destroy + */ +void devl_region_destroy(struct devlink_region *region) +{ + struct devlink *devlink = region->devlink; + struct devlink_snapshot *snapshot, *ts; + + devl_assert_locked(devlink); + + /* Free all snapshots of region */ + mutex_lock(®ion->snapshot_lock); + list_for_each_entry_safe(snapshot, ts, ®ion->snapshot_list, list) + devlink_region_snapshot_del(region, snapshot); + mutex_unlock(®ion->snapshot_lock); + + list_del(®ion->list); + mutex_destroy(®ion->snapshot_lock); + + devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_DEL); + kfree(region); +} +EXPORT_SYMBOL_GPL(devl_region_destroy); + +/** + * devlink_region_destroy - destroy address region + * + * @region: devlink region to destroy + * + * Context: Takes and release devlink->lock . + */ +void devlink_region_destroy(struct devlink_region *region) +{ + struct devlink *devlink = region->devlink; + + devl_lock(devlink); + devl_region_destroy(region); + devl_unlock(devlink); +} +EXPORT_SYMBOL_GPL(devlink_region_destroy); + +/** + * devlink_region_snapshot_id_get - get snapshot ID + * + * This callback should be called when adding a new snapshot, + * Driver should use the same id for multiple snapshots taken + * on multiple regions at the same time/by the same trigger. + * + * The caller of this function must use devlink_region_snapshot_id_put + * when finished creating regions using this id. + * + * Returns zero on success, or a negative error code on failure. + * + * @devlink: devlink + * @id: storage to return id + */ +int devlink_region_snapshot_id_get(struct devlink *devlink, u32 *id) +{ + return __devlink_region_snapshot_id_get(devlink, id); +} +EXPORT_SYMBOL_GPL(devlink_region_snapshot_id_get); + +/** + * devlink_region_snapshot_id_put - put snapshot ID reference + * + * This should be called by a driver after finishing creating snapshots + * with an id. Doing so ensures that the ID can later be released in the + * event that all snapshots using it have been destroyed. + * + * @devlink: devlink + * @id: id to release reference on + */ +void devlink_region_snapshot_id_put(struct devlink *devlink, u32 id) +{ + __devlink_snapshot_id_decrement(devlink, id); +} +EXPORT_SYMBOL_GPL(devlink_region_snapshot_id_put); + +/** + * devlink_region_snapshot_create - create a new snapshot + * This will add a new snapshot of a region. 
The snapshot + * will be stored on the region struct and can be accessed + * from devlink. This is useful for future analyses of snapshots. + * Multiple snapshots can be created on a region. + * The @snapshot_id should be obtained using the getter function. + * + * @region: devlink region of the snapshot + * @data: snapshot data + * @snapshot_id: snapshot id to be created + */ +int devlink_region_snapshot_create(struct devlink_region *region, + u8 *data, u32 snapshot_id) +{ + int err; + + mutex_lock(®ion->snapshot_lock); + err = __devlink_region_snapshot_create(region, data, snapshot_id); + mutex_unlock(®ion->snapshot_lock); + return err; +} +EXPORT_SYMBOL_GPL(devlink_region_snapshot_create); -- cgit v1.2.3 From 85facf94fd804ef557d5d844b5253859adef10d5 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 28 Aug 2023 08:16:51 +0200 Subject: devlink: use tracepoint_enabled() helper In preparation for the trap code move, use tracepoint_enabled() helper instead of trace_devlink_trap_report_enabled() which would not be defined in that scope. Signed-off-by: Jiri Pirko Link: https://lore.kernel.org/r/20230828061657.300667-10-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- net/devlink/leftover.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/devlink/leftover.c b/net/devlink/leftover.c index 6d31aaee8ce8..1db3a7928465 100644 --- a/net/devlink/leftover.c +++ b/net/devlink/leftover.c @@ -3131,7 +3131,7 @@ void devlink_trap_report(struct devlink *devlink, struct sk_buff *skb, devlink_trap_stats_update(trap_item->stats, skb->len); devlink_trap_stats_update(trap_item->group_item->stats, skb->len); - if (trace_devlink_trap_report_enabled()) { + if (tracepoint_enabled(devlink_trap_report)) { struct devlink_trap_metadata metadata = {}; devlink_trap_report_metadata_set(&metadata, trap_item, -- cgit v1.2.3 From 4bbdec80ff270a69b6b61644c597ad6657f2a58b Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 28 Aug 2023 08:16:52 +0200 Subject: devlink: push trap related code into separate file Cut out another chunk from leftover.c and put trap related code into a separate file. 
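For context on why the previous patch matters for this move: tracepoint_enabled() lets a compilation unit that merely declares a tracepoint test its static key, without the trace_<name>_enabled() helper that only exists where the full trace event header is included. A minimal sketch of that kernel pattern follows; the tracepoint name "subsys_event", the header paths and the do_trace_subsys_event() wrapper are illustrative placeholders, not symbols from this series:

    /* subsys.h: declaration side; only tracepoint-defs.h is required,
     * so callers never need the trace event header itself.
     */
    #include <linux/tracepoint-defs.h>

    DECLARE_TRACEPOINT(subsys_event);

    void do_trace_subsys_event(int err);	/* out-of-line wrapper */

    static inline void subsys_report(int err)
    {
    	/* Static-key test, effectively free while tracing is off. */
    	if (tracepoint_enabled(subsys_event))
    		do_trace_subsys_event(err);
    }

    /* subsys_trace.c: definition side, next to the trace event header. */
    #define CREATE_TRACE_POINTS
    #include <trace/events/subsys.h>

    void do_trace_subsys_event(int err)
    {
    	trace_subsys_event(err);
    }

Because the enabled check compiles down to a static branch, a hot path such as the trap report handler can keep it per packet at no measurable cost, even after the code is moved away from the file that defines the tracepoint.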
Signed-off-by: Jiri Pirko Link: https://lore.kernel.org/r/20230828061657.300667-11-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- net/devlink/Makefile | 2 +- net/devlink/devl_internal.h | 11 + net/devlink/leftover.c | 2915 ++++++++----------------------------------- net/devlink/trap.c | 1861 +++++++++++++++++++++++++++ 4 files changed, 2400 insertions(+), 2389 deletions(-) create mode 100644 net/devlink/trap.c (limited to 'net') diff --git a/net/devlink/Makefile b/net/devlink/Makefile index 744f64e467b7..523efffe522c 100644 --- a/net/devlink/Makefile +++ b/net/devlink/Makefile @@ -1,4 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 obj-y := leftover.o core.o netlink.o netlink_gen.o dev.o port.o sb.o dpipe.o \ - resource.o param.o region.o health.o + resource.o param.o region.o health.o trap.o diff --git a/net/devlink/devl_internal.h b/net/devlink/devl_internal.h index 32b66233240f..148525aa2847 100644 --- a/net/devlink/devl_internal.h +++ b/net/devlink/devl_internal.h @@ -160,6 +160,12 @@ void devlink_params_notify_register(struct devlink *devlink); void devlink_params_notify_unregister(struct devlink *devlink); void devlink_regions_notify_register(struct devlink *devlink); void devlink_regions_notify_unregister(struct devlink *devlink); +void devlink_trap_policers_notify_register(struct devlink *devlink); +void devlink_trap_policers_notify_unregister(struct devlink *devlink); +void devlink_trap_groups_notify_register(struct devlink *devlink); +void devlink_trap_groups_notify_unregister(struct devlink *devlink); +void devlink_traps_notify_register(struct devlink *devlink); +void devlink_traps_notify_unregister(struct devlink *devlink); /* Ports */ #define ASSERT_DEVLINK_PORT_INITIALIZED(devlink_port) \ @@ -267,3 +273,8 @@ int devlink_nl_cmd_health_reporter_dump_clear_doit(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_health_reporter_test_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_cmd_trap_set_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_cmd_trap_group_set_doit(struct sk_buff *skb, + struct genl_info *info); +int devlink_nl_cmd_trap_policer_set_doit(struct sk_buff *skb, + struct genl_info *info); diff --git a/net/devlink/leftover.c b/net/devlink/leftover.c index 1db3a7928465..5b153ce097ab 100644 --- a/net/devlink/leftover.c +++ b/net/devlink/leftover.c @@ -999,2495 +999,634 @@ int devlink_rate_nodes_check(struct devlink *devlink, u16 mode, return 0; } -struct devlink_stats { - u64_stats_t rx_bytes; - u64_stats_t rx_packets; - struct u64_stats_sync syncp; -}; - -/** - * struct devlink_trap_policer_item - Packet trap policer attributes. - * @policer: Immutable packet trap policer attributes. - * @rate: Rate in packets / sec. - * @burst: Burst size in packets. - * @list: trap_policer_list member. - * - * Describes packet trap policer attributes. Created by devlink during trap - * policer registration. - */ -struct devlink_trap_policer_item { - const struct devlink_trap_policer *policer; - u64 rate; - u64 burst; - struct list_head list; -}; - -/** - * struct devlink_trap_group_item - Packet trap group attributes. - * @group: Immutable packet trap group attributes. - * @policer_item: Associated policer item. Can be NULL. - * @list: trap_group_list member. - * @stats: Trap group statistics. - * - * Describes packet trap group attributes. Created by devlink during trap - * group registration. 
- */ -struct devlink_trap_group_item { - const struct devlink_trap_group *group; - struct devlink_trap_policer_item *policer_item; - struct list_head list; - struct devlink_stats __percpu *stats; -}; - -/** - * struct devlink_trap_item - Packet trap attributes. - * @trap: Immutable packet trap attributes. - * @group_item: Associated group item. - * @list: trap_list member. - * @action: Trap action. - * @stats: Trap statistics. - * @priv: Driver private information. - * - * Describes both mutable and immutable packet trap attributes. Created by - * devlink during trap registration and used for all trap related operations. - */ -struct devlink_trap_item { - const struct devlink_trap *trap; - struct devlink_trap_group_item *group_item; - struct list_head list; - enum devlink_trap_action action; - struct devlink_stats __percpu *stats; - void *priv; -}; - -static struct devlink_trap_policer_item * -devlink_trap_policer_item_lookup(struct devlink *devlink, u32 id) -{ - struct devlink_trap_policer_item *policer_item; - - list_for_each_entry(policer_item, &devlink->trap_policer_list, list) { - if (policer_item->policer->id == id) - return policer_item; - } - - return NULL; -} - -static struct devlink_trap_item * -devlink_trap_item_lookup(struct devlink *devlink, const char *name) -{ - struct devlink_trap_item *trap_item; - - list_for_each_entry(trap_item, &devlink->trap_list, list) { - if (!strcmp(trap_item->trap->name, name)) - return trap_item; - } - - return NULL; -} - -static struct devlink_trap_item * -devlink_trap_item_get_from_info(struct devlink *devlink, - struct genl_info *info) -{ - struct nlattr *attr; - - if (!info->attrs[DEVLINK_ATTR_TRAP_NAME]) - return NULL; - attr = info->attrs[DEVLINK_ATTR_TRAP_NAME]; - - return devlink_trap_item_lookup(devlink, nla_data(attr)); -} - -static int -devlink_trap_action_get_from_info(struct genl_info *info, - enum devlink_trap_action *p_trap_action) -{ - u8 val; - - val = nla_get_u8(info->attrs[DEVLINK_ATTR_TRAP_ACTION]); - switch (val) { - case DEVLINK_TRAP_ACTION_DROP: - case DEVLINK_TRAP_ACTION_TRAP: - case DEVLINK_TRAP_ACTION_MIRROR: - *p_trap_action = val; - break; - default: - return -EINVAL; - } - - return 0; -} - -static int devlink_trap_metadata_put(struct sk_buff *msg, - const struct devlink_trap *trap) -{ - struct nlattr *attr; - - attr = nla_nest_start(msg, DEVLINK_ATTR_TRAP_METADATA); - if (!attr) - return -EMSGSIZE; - - if ((trap->metadata_cap & DEVLINK_TRAP_METADATA_TYPE_F_IN_PORT) && - nla_put_flag(msg, DEVLINK_ATTR_TRAP_METADATA_TYPE_IN_PORT)) - goto nla_put_failure; - if ((trap->metadata_cap & DEVLINK_TRAP_METADATA_TYPE_F_FA_COOKIE) && - nla_put_flag(msg, DEVLINK_ATTR_TRAP_METADATA_TYPE_FA_COOKIE)) - goto nla_put_failure; - - nla_nest_end(msg, attr); - - return 0; - -nla_put_failure: - nla_nest_cancel(msg, attr); - return -EMSGSIZE; -} - -static void devlink_trap_stats_read(struct devlink_stats __percpu *trap_stats, - struct devlink_stats *stats) -{ - int i; - - memset(stats, 0, sizeof(*stats)); - for_each_possible_cpu(i) { - struct devlink_stats *cpu_stats; - u64 rx_packets, rx_bytes; - unsigned int start; - - cpu_stats = per_cpu_ptr(trap_stats, i); - do { - start = u64_stats_fetch_begin(&cpu_stats->syncp); - rx_packets = u64_stats_read(&cpu_stats->rx_packets); - rx_bytes = u64_stats_read(&cpu_stats->rx_bytes); - } while (u64_stats_fetch_retry(&cpu_stats->syncp, start)); - - u64_stats_add(&stats->rx_packets, rx_packets); - u64_stats_add(&stats->rx_bytes, rx_bytes); - } -} - -static int -devlink_trap_group_stats_put(struct sk_buff 
*msg, - struct devlink_stats __percpu *trap_stats) -{ - struct devlink_stats stats; - struct nlattr *attr; - - devlink_trap_stats_read(trap_stats, &stats); - - attr = nla_nest_start(msg, DEVLINK_ATTR_STATS); - if (!attr) - return -EMSGSIZE; - - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_PACKETS, - u64_stats_read(&stats.rx_packets), - DEVLINK_ATTR_PAD)) - goto nla_put_failure; - - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_BYTES, - u64_stats_read(&stats.rx_bytes), - DEVLINK_ATTR_PAD)) - goto nla_put_failure; - - nla_nest_end(msg, attr); - - return 0; - -nla_put_failure: - nla_nest_cancel(msg, attr); - return -EMSGSIZE; -} - -static int devlink_trap_stats_put(struct sk_buff *msg, struct devlink *devlink, - const struct devlink_trap_item *trap_item) -{ - struct devlink_stats stats; - struct nlattr *attr; - u64 drops = 0; - int err; - - if (devlink->ops->trap_drop_counter_get) { - err = devlink->ops->trap_drop_counter_get(devlink, - trap_item->trap, - &drops); - if (err) - return err; - } - - devlink_trap_stats_read(trap_item->stats, &stats); - - attr = nla_nest_start(msg, DEVLINK_ATTR_STATS); - if (!attr) - return -EMSGSIZE; - - if (devlink->ops->trap_drop_counter_get && - nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_DROPPED, drops, - DEVLINK_ATTR_PAD)) - goto nla_put_failure; - - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_PACKETS, - u64_stats_read(&stats.rx_packets), - DEVLINK_ATTR_PAD)) - goto nla_put_failure; - - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_BYTES, - u64_stats_read(&stats.rx_bytes), - DEVLINK_ATTR_PAD)) - goto nla_put_failure; - - nla_nest_end(msg, attr); - - return 0; - -nla_put_failure: - nla_nest_cancel(msg, attr); - return -EMSGSIZE; -} - -static int devlink_nl_trap_fill(struct sk_buff *msg, struct devlink *devlink, - const struct devlink_trap_item *trap_item, - enum devlink_command cmd, u32 portid, u32 seq, - int flags) -{ - struct devlink_trap_group_item *group_item = trap_item->group_item; - void *hdr; - int err; - - hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); - if (!hdr) - return -EMSGSIZE; - - if (devlink_nl_put_handle(msg, devlink)) - goto nla_put_failure; - - if (nla_put_string(msg, DEVLINK_ATTR_TRAP_GROUP_NAME, - group_item->group->name)) - goto nla_put_failure; - - if (nla_put_string(msg, DEVLINK_ATTR_TRAP_NAME, trap_item->trap->name)) - goto nla_put_failure; - - if (nla_put_u8(msg, DEVLINK_ATTR_TRAP_TYPE, trap_item->trap->type)) - goto nla_put_failure; - - if (trap_item->trap->generic && - nla_put_flag(msg, DEVLINK_ATTR_TRAP_GENERIC)) - goto nla_put_failure; - - if (nla_put_u8(msg, DEVLINK_ATTR_TRAP_ACTION, trap_item->action)) - goto nla_put_failure; - - err = devlink_trap_metadata_put(msg, trap_item->trap); - if (err) - goto nla_put_failure; - - err = devlink_trap_stats_put(msg, devlink, trap_item); - if (err) - goto nla_put_failure; - - genlmsg_end(msg, hdr); - - return 0; - -nla_put_failure: - genlmsg_cancel(msg, hdr); - return -EMSGSIZE; -} - -int devlink_nl_trap_get_doit(struct sk_buff *skb, struct genl_info *info) -{ - struct netlink_ext_ack *extack = info->extack; - struct devlink *devlink = info->user_ptr[0]; - struct devlink_trap_item *trap_item; - struct sk_buff *msg; - int err; - - if (list_empty(&devlink->trap_list)) - return -EOPNOTSUPP; - - trap_item = devlink_trap_item_get_from_info(devlink, info); - if (!trap_item) { - NL_SET_ERR_MSG(extack, "Device did not register this trap"); - return -ENOENT; - } - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - err = 
devlink_nl_trap_fill(msg, devlink, trap_item, - DEVLINK_CMD_TRAP_NEW, info->snd_portid, - info->snd_seq, 0); - if (err) - goto err_trap_fill; - - return genlmsg_reply(msg, info); - -err_trap_fill: - nlmsg_free(msg); - return err; -} - -static int devlink_nl_trap_get_dump_one(struct sk_buff *msg, - struct devlink *devlink, - struct netlink_callback *cb, int flags) -{ - struct devlink_nl_dump_state *state = devlink_dump_state(cb); - struct devlink_trap_item *trap_item; - int idx = 0; - int err = 0; - - list_for_each_entry(trap_item, &devlink->trap_list, list) { - if (idx < state->idx) { - idx++; - continue; - } - err = devlink_nl_trap_fill(msg, devlink, trap_item, - DEVLINK_CMD_TRAP_NEW, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, flags); - if (err) { - state->idx = idx; - break; - } - idx++; - } - - return err; -} - -int devlink_nl_trap_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) -{ - return devlink_nl_dumpit(skb, cb, devlink_nl_trap_get_dump_one); -} - -static int __devlink_trap_action_set(struct devlink *devlink, - struct devlink_trap_item *trap_item, - enum devlink_trap_action trap_action, - struct netlink_ext_ack *extack) -{ - int err; - - if (trap_item->action != trap_action && - trap_item->trap->type != DEVLINK_TRAP_TYPE_DROP) { - NL_SET_ERR_MSG(extack, "Cannot change action of non-drop traps. Skipping"); - return 0; - } - - err = devlink->ops->trap_action_set(devlink, trap_item->trap, - trap_action, extack); - if (err) - return err; - - trap_item->action = trap_action; - - return 0; -} - -static int devlink_trap_action_set(struct devlink *devlink, - struct devlink_trap_item *trap_item, - struct genl_info *info) -{ - enum devlink_trap_action trap_action; - int err; - - if (!info->attrs[DEVLINK_ATTR_TRAP_ACTION]) - return 0; - - err = devlink_trap_action_get_from_info(info, &trap_action); - if (err) { - NL_SET_ERR_MSG(info->extack, "Invalid trap action"); - return -EINVAL; - } - - return __devlink_trap_action_set(devlink, trap_item, trap_action, - info->extack); -} - -static int devlink_nl_cmd_trap_set_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct netlink_ext_ack *extack = info->extack; - struct devlink *devlink = info->user_ptr[0]; - struct devlink_trap_item *trap_item; - - if (list_empty(&devlink->trap_list)) - return -EOPNOTSUPP; - - trap_item = devlink_trap_item_get_from_info(devlink, info); - if (!trap_item) { - NL_SET_ERR_MSG(extack, "Device did not register this trap"); - return -ENOENT; - } - - return devlink_trap_action_set(devlink, trap_item, info); -} - -static struct devlink_trap_group_item * -devlink_trap_group_item_lookup(struct devlink *devlink, const char *name) -{ - struct devlink_trap_group_item *group_item; - - list_for_each_entry(group_item, &devlink->trap_group_list, list) { - if (!strcmp(group_item->group->name, name)) - return group_item; - } - - return NULL; -} - -static struct devlink_trap_group_item * -devlink_trap_group_item_lookup_by_id(struct devlink *devlink, u16 id) -{ - struct devlink_trap_group_item *group_item; - - list_for_each_entry(group_item, &devlink->trap_group_list, list) { - if (group_item->group->id == id) - return group_item; - } - - return NULL; -} - -static struct devlink_trap_group_item * -devlink_trap_group_item_get_from_info(struct devlink *devlink, - struct genl_info *info) -{ - char *name; - - if (!info->attrs[DEVLINK_ATTR_TRAP_GROUP_NAME]) - return NULL; - name = nla_data(info->attrs[DEVLINK_ATTR_TRAP_GROUP_NAME]); - - return devlink_trap_group_item_lookup(devlink, name); -} - -static int 
-devlink_nl_trap_group_fill(struct sk_buff *msg, struct devlink *devlink, - const struct devlink_trap_group_item *group_item, - enum devlink_command cmd, u32 portid, u32 seq, - int flags) -{ - void *hdr; - int err; - - hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); - if (!hdr) - return -EMSGSIZE; - - if (devlink_nl_put_handle(msg, devlink)) - goto nla_put_failure; - - if (nla_put_string(msg, DEVLINK_ATTR_TRAP_GROUP_NAME, - group_item->group->name)) - goto nla_put_failure; - - if (group_item->group->generic && - nla_put_flag(msg, DEVLINK_ATTR_TRAP_GENERIC)) - goto nla_put_failure; - - if (group_item->policer_item && - nla_put_u32(msg, DEVLINK_ATTR_TRAP_POLICER_ID, - group_item->policer_item->policer->id)) - goto nla_put_failure; - - err = devlink_trap_group_stats_put(msg, group_item->stats); - if (err) - goto nla_put_failure; - - genlmsg_end(msg, hdr); - - return 0; - -nla_put_failure: - genlmsg_cancel(msg, hdr); - return -EMSGSIZE; -} - -int devlink_nl_trap_group_get_doit(struct sk_buff *skb, struct genl_info *info) -{ - struct netlink_ext_ack *extack = info->extack; - struct devlink *devlink = info->user_ptr[0]; - struct devlink_trap_group_item *group_item; - struct sk_buff *msg; - int err; - - if (list_empty(&devlink->trap_group_list)) - return -EOPNOTSUPP; - - group_item = devlink_trap_group_item_get_from_info(devlink, info); - if (!group_item) { - NL_SET_ERR_MSG(extack, "Device did not register this trap group"); - return -ENOENT; - } - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - err = devlink_nl_trap_group_fill(msg, devlink, group_item, - DEVLINK_CMD_TRAP_GROUP_NEW, - info->snd_portid, info->snd_seq, 0); - if (err) - goto err_trap_group_fill; - - return genlmsg_reply(msg, info); - -err_trap_group_fill: - nlmsg_free(msg); - return err; -} - -static int devlink_nl_trap_group_get_dump_one(struct sk_buff *msg, - struct devlink *devlink, - struct netlink_callback *cb, - int flags) -{ - struct devlink_nl_dump_state *state = devlink_dump_state(cb); - struct devlink_trap_group_item *group_item; - int idx = 0; - int err = 0; - - - list_for_each_entry(group_item, &devlink->trap_group_list, list) { - if (idx < state->idx) { - idx++; - continue; - } - err = devlink_nl_trap_group_fill(msg, devlink, group_item, - DEVLINK_CMD_TRAP_GROUP_NEW, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, flags); - if (err) { - state->idx = idx; - break; - } - idx++; - } - - return err; -} - -int devlink_nl_trap_group_get_dumpit(struct sk_buff *skb, - struct netlink_callback *cb) -{ - return devlink_nl_dumpit(skb, cb, devlink_nl_trap_group_get_dump_one); -} - -static int -__devlink_trap_group_action_set(struct devlink *devlink, - struct devlink_trap_group_item *group_item, - enum devlink_trap_action trap_action, - struct netlink_ext_ack *extack) -{ - const char *group_name = group_item->group->name; - struct devlink_trap_item *trap_item; - int err; - - if (devlink->ops->trap_group_action_set) { - err = devlink->ops->trap_group_action_set(devlink, group_item->group, - trap_action, extack); - if (err) - return err; - - list_for_each_entry(trap_item, &devlink->trap_list, list) { - if (strcmp(trap_item->group_item->group->name, group_name)) - continue; - if (trap_item->action != trap_action && - trap_item->trap->type != DEVLINK_TRAP_TYPE_DROP) - continue; - trap_item->action = trap_action; - } - - return 0; - } - - list_for_each_entry(trap_item, &devlink->trap_list, list) { - if (strcmp(trap_item->group_item->group->name, group_name)) - continue; - err = 
__devlink_trap_action_set(devlink, trap_item, - trap_action, extack); - if (err) - return err; - } - - return 0; -} - -static int -devlink_trap_group_action_set(struct devlink *devlink, - struct devlink_trap_group_item *group_item, - struct genl_info *info, bool *p_modified) -{ - enum devlink_trap_action trap_action; - int err; - - if (!info->attrs[DEVLINK_ATTR_TRAP_ACTION]) - return 0; - - err = devlink_trap_action_get_from_info(info, &trap_action); - if (err) { - NL_SET_ERR_MSG(info->extack, "Invalid trap action"); - return -EINVAL; - } - - err = __devlink_trap_group_action_set(devlink, group_item, trap_action, - info->extack); - if (err) - return err; - - *p_modified = true; - - return 0; -} - -static int devlink_trap_group_set(struct devlink *devlink, - struct devlink_trap_group_item *group_item, - struct genl_info *info) -{ - struct devlink_trap_policer_item *policer_item; - struct netlink_ext_ack *extack = info->extack; - const struct devlink_trap_policer *policer; - struct nlattr **attrs = info->attrs; - u32 policer_id; - int err; - - if (!attrs[DEVLINK_ATTR_TRAP_POLICER_ID]) - return 0; - - if (!devlink->ops->trap_group_set) - return -EOPNOTSUPP; - - policer_id = nla_get_u32(attrs[DEVLINK_ATTR_TRAP_POLICER_ID]); - policer_item = devlink_trap_policer_item_lookup(devlink, policer_id); - if (policer_id && !policer_item) { - NL_SET_ERR_MSG(extack, "Device did not register this trap policer"); - return -ENOENT; - } - policer = policer_item ? policer_item->policer : NULL; - - err = devlink->ops->trap_group_set(devlink, group_item->group, policer, - extack); - if (err) - return err; - - group_item->policer_item = policer_item; - - return 0; -} - -static int devlink_nl_cmd_trap_group_set_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct netlink_ext_ack *extack = info->extack; - struct devlink *devlink = info->user_ptr[0]; - struct devlink_trap_group_item *group_item; - bool modified = false; - int err; - - if (list_empty(&devlink->trap_group_list)) - return -EOPNOTSUPP; - - group_item = devlink_trap_group_item_get_from_info(devlink, info); - if (!group_item) { - NL_SET_ERR_MSG(extack, "Device did not register this trap group"); - return -ENOENT; - } - - err = devlink_trap_group_action_set(devlink, group_item, info, - &modified); - if (err) - return err; - - err = devlink_trap_group_set(devlink, group_item, info); - if (err) - goto err_trap_group_set; - - return 0; - -err_trap_group_set: - if (modified) - NL_SET_ERR_MSG(extack, "Trap group set failed, but some changes were committed already"); - return err; -} - -static struct devlink_trap_policer_item * -devlink_trap_policer_item_get_from_info(struct devlink *devlink, - struct genl_info *info) -{ - u32 id; - - if (!info->attrs[DEVLINK_ATTR_TRAP_POLICER_ID]) - return NULL; - id = nla_get_u32(info->attrs[DEVLINK_ATTR_TRAP_POLICER_ID]); - - return devlink_trap_policer_item_lookup(devlink, id); -} - -static int -devlink_trap_policer_stats_put(struct sk_buff *msg, struct devlink *devlink, - const struct devlink_trap_policer *policer) -{ - struct nlattr *attr; - u64 drops; - int err; - - if (!devlink->ops->trap_policer_counter_get) - return 0; - - err = devlink->ops->trap_policer_counter_get(devlink, policer, &drops); - if (err) - return err; - - attr = nla_nest_start(msg, DEVLINK_ATTR_STATS); - if (!attr) - return -EMSGSIZE; - - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_DROPPED, drops, - DEVLINK_ATTR_PAD)) - goto nla_put_failure; - - nla_nest_end(msg, attr); - - return 0; - -nla_put_failure: - nla_nest_cancel(msg, attr); - 
return -EMSGSIZE; -} - -static int -devlink_nl_trap_policer_fill(struct sk_buff *msg, struct devlink *devlink, - const struct devlink_trap_policer_item *policer_item, - enum devlink_command cmd, u32 portid, u32 seq, - int flags) -{ - void *hdr; - int err; - - hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); - if (!hdr) - return -EMSGSIZE; - - if (devlink_nl_put_handle(msg, devlink)) - goto nla_put_failure; - - if (nla_put_u32(msg, DEVLINK_ATTR_TRAP_POLICER_ID, - policer_item->policer->id)) - goto nla_put_failure; - - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_TRAP_POLICER_RATE, - policer_item->rate, DEVLINK_ATTR_PAD)) - goto nla_put_failure; - - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_TRAP_POLICER_BURST, - policer_item->burst, DEVLINK_ATTR_PAD)) - goto nla_put_failure; - - err = devlink_trap_policer_stats_put(msg, devlink, - policer_item->policer); - if (err) - goto nla_put_failure; - - genlmsg_end(msg, hdr); - - return 0; - -nla_put_failure: - genlmsg_cancel(msg, hdr); - return -EMSGSIZE; -} - -int devlink_nl_trap_policer_get_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink_trap_policer_item *policer_item; - struct netlink_ext_ack *extack = info->extack; - struct devlink *devlink = info->user_ptr[0]; - struct sk_buff *msg; - int err; - - if (list_empty(&devlink->trap_policer_list)) - return -EOPNOTSUPP; - - policer_item = devlink_trap_policer_item_get_from_info(devlink, info); - if (!policer_item) { - NL_SET_ERR_MSG(extack, "Device did not register this trap policer"); - return -ENOENT; - } - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - err = devlink_nl_trap_policer_fill(msg, devlink, policer_item, - DEVLINK_CMD_TRAP_POLICER_NEW, - info->snd_portid, info->snd_seq, 0); - if (err) - goto err_trap_policer_fill; - - return genlmsg_reply(msg, info); - -err_trap_policer_fill: - nlmsg_free(msg); - return err; -} - -static int devlink_nl_trap_policer_get_dump_one(struct sk_buff *msg, - struct devlink *devlink, - struct netlink_callback *cb, - int flags) -{ - struct devlink_nl_dump_state *state = devlink_dump_state(cb); - struct devlink_trap_policer_item *policer_item; - int idx = 0; - int err = 0; - - list_for_each_entry(policer_item, &devlink->trap_policer_list, list) { - if (idx < state->idx) { - idx++; - continue; - } - err = devlink_nl_trap_policer_fill(msg, devlink, policer_item, - DEVLINK_CMD_TRAP_POLICER_NEW, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, flags); - if (err) { - state->idx = idx; - break; - } - idx++; - } - - return err; -} - -int devlink_nl_trap_policer_get_dumpit(struct sk_buff *skb, - struct netlink_callback *cb) -{ - return devlink_nl_dumpit(skb, cb, devlink_nl_trap_policer_get_dump_one); -} - -static int -devlink_trap_policer_set(struct devlink *devlink, - struct devlink_trap_policer_item *policer_item, - struct genl_info *info) -{ - struct netlink_ext_ack *extack = info->extack; - struct nlattr **attrs = info->attrs; - u64 rate, burst; - int err; - - rate = policer_item->rate; - burst = policer_item->burst; - - if (attrs[DEVLINK_ATTR_TRAP_POLICER_RATE]) - rate = nla_get_u64(attrs[DEVLINK_ATTR_TRAP_POLICER_RATE]); - - if (attrs[DEVLINK_ATTR_TRAP_POLICER_BURST]) - burst = nla_get_u64(attrs[DEVLINK_ATTR_TRAP_POLICER_BURST]); - - if (rate < policer_item->policer->min_rate) { - NL_SET_ERR_MSG(extack, "Policer rate lower than limit"); - return -EINVAL; - } - - if (rate > policer_item->policer->max_rate) { - NL_SET_ERR_MSG(extack, "Policer rate higher than limit"); - return -EINVAL; - } - - 
if (burst < policer_item->policer->min_burst) { - NL_SET_ERR_MSG(extack, "Policer burst size lower than limit"); - return -EINVAL; - } - - if (burst > policer_item->policer->max_burst) { - NL_SET_ERR_MSG(extack, "Policer burst size higher than limit"); - return -EINVAL; - } - - err = devlink->ops->trap_policer_set(devlink, policer_item->policer, - rate, burst, info->extack); - if (err) - return err; - - policer_item->rate = rate; - policer_item->burst = burst; - - return 0; -} - -static int devlink_nl_cmd_trap_policer_set_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink_trap_policer_item *policer_item; - struct netlink_ext_ack *extack = info->extack; - struct devlink *devlink = info->user_ptr[0]; - - if (list_empty(&devlink->trap_policer_list)) - return -EOPNOTSUPP; - - if (!devlink->ops->trap_policer_set) - return -EOPNOTSUPP; - - policer_item = devlink_trap_policer_item_get_from_info(devlink, info); - if (!policer_item) { - NL_SET_ERR_MSG(extack, "Device did not register this trap policer"); - return -ENOENT; - } - - return devlink_trap_policer_set(devlink, policer_item, info); -} - -const struct genl_small_ops devlink_nl_small_ops[40] = { - { - .cmd = DEVLINK_CMD_PORT_SET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_port_set_doit, - .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, - }, - { - .cmd = DEVLINK_CMD_RATE_SET, - .doit = devlink_nl_cmd_rate_set_doit, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_RATE_NEW, - .doit = devlink_nl_cmd_rate_new_doit, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_RATE_DEL, - .doit = devlink_nl_cmd_rate_del_doit, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_PORT_SPLIT, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_port_split_doit, - .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, - }, - { - .cmd = DEVLINK_CMD_PORT_UNSPLIT, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_port_unsplit_doit, - .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, - }, - { - .cmd = DEVLINK_CMD_PORT_NEW, - .doit = devlink_nl_cmd_port_new_doit, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_PORT_DEL, - .doit = devlink_nl_cmd_port_del_doit, - .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, - }, - - { - .cmd = DEVLINK_CMD_LINECARD_SET, - .doit = devlink_nl_cmd_linecard_set_doit, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_SB_POOL_SET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_sb_pool_set_doit, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_SB_PORT_POOL_SET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_sb_port_pool_set_doit, - .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, - }, - { - .cmd = DEVLINK_CMD_SB_TC_POOL_BIND_SET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_sb_tc_pool_bind_set_doit, - .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, - }, - { - .cmd = DEVLINK_CMD_SB_OCC_SNAPSHOT, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_sb_occ_snapshot_doit, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_SB_OCC_MAX_CLEAR, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = 
devlink_nl_cmd_sb_occ_max_clear_doit, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_ESWITCH_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_eswitch_get_doit, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_ESWITCH_SET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_eswitch_set_doit, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_DPIPE_TABLE_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_dpipe_table_get, - /* can be retrieved by unprivileged users */ - }, - { - .cmd = DEVLINK_CMD_DPIPE_ENTRIES_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_dpipe_entries_get, - /* can be retrieved by unprivileged users */ - }, - { - .cmd = DEVLINK_CMD_DPIPE_HEADERS_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_dpipe_headers_get, - /* can be retrieved by unprivileged users */ - }, - { - .cmd = DEVLINK_CMD_DPIPE_TABLE_COUNTERS_SET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_dpipe_table_counters_set, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_RESOURCE_SET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_resource_set, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_RESOURCE_DUMP, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_resource_dump, - /* can be retrieved by unprivileged users */ - }, - { - .cmd = DEVLINK_CMD_RELOAD, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_reload, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_PARAM_SET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_param_set_doit, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_PORT_PARAM_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_port_param_get_doit, - .dumpit = devlink_nl_cmd_port_param_get_dumpit, - .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, - /* can be retrieved by unprivileged users */ - }, - { - .cmd = DEVLINK_CMD_PORT_PARAM_SET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_port_param_set_doit, - .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, - }, - { - .cmd = DEVLINK_CMD_REGION_NEW, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_region_new, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_REGION_DEL, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_region_del, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_REGION_READ, - .validate = GENL_DONT_VALIDATE_STRICT | - GENL_DONT_VALIDATE_DUMP_STRICT, - .dumpit = devlink_nl_cmd_region_read_dumpit, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_HEALTH_REPORTER_SET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_health_reporter_set_doit, - .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT, - }, - { - .cmd = DEVLINK_CMD_HEALTH_REPORTER_RECOVER, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_health_reporter_recover_doit, - .flags = GENL_ADMIN_PERM, - .internal_flags = 
DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT, - }, - { - .cmd = DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_health_reporter_diagnose_doit, - .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT, - }, - { - .cmd = DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET, - .validate = GENL_DONT_VALIDATE_STRICT | - GENL_DONT_VALIDATE_DUMP_STRICT, - .dumpit = devlink_nl_cmd_health_reporter_dump_get_dumpit, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_HEALTH_REPORTER_DUMP_CLEAR, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_health_reporter_dump_clear_doit, - .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT, - }, - { - .cmd = DEVLINK_CMD_HEALTH_REPORTER_TEST, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_health_reporter_test_doit, - .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT, - }, - { - .cmd = DEVLINK_CMD_FLASH_UPDATE, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_flash_update, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_TRAP_SET, - .doit = devlink_nl_cmd_trap_set_doit, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_TRAP_GROUP_SET, - .doit = devlink_nl_cmd_trap_group_set_doit, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_TRAP_POLICER_SET, - .doit = devlink_nl_cmd_trap_policer_set_doit, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_SELFTESTS_RUN, - .doit = devlink_nl_cmd_selftests_run, - .flags = GENL_ADMIN_PERM, - }, - /* -- No new ops here! Use split ops going forward! -- */ -}; - -static void devlink_trap_policers_notify_register(struct devlink *devlink); -static void devlink_trap_policers_notify_unregister(struct devlink *devlink); -static void devlink_trap_groups_notify_register(struct devlink *devlink); -static void devlink_trap_groups_notify_unregister(struct devlink *devlink); -static void devlink_traps_notify_register(struct devlink *devlink); -static void devlink_traps_notify_unregister(struct devlink *devlink); - -void devlink_notify_register(struct devlink *devlink) -{ - devlink_notify(devlink, DEVLINK_CMD_NEW); - devlink_linecards_notify_register(devlink); - devlink_ports_notify_register(devlink); - devlink_trap_policers_notify_register(devlink); - devlink_trap_groups_notify_register(devlink); - devlink_traps_notify_register(devlink); - devlink_rates_notify_register(devlink); - devlink_regions_notify_register(devlink); - devlink_params_notify_register(devlink); -} - -void devlink_notify_unregister(struct devlink *devlink) -{ - devlink_params_notify_unregister(devlink); - devlink_regions_notify_unregister(devlink); - devlink_rates_notify_unregister(devlink); - devlink_traps_notify_unregister(devlink); - devlink_trap_groups_notify_unregister(devlink); - devlink_trap_policers_notify_unregister(devlink); - devlink_ports_notify_unregister(devlink); - devlink_linecards_notify_unregister(devlink); - devlink_notify(devlink, DEVLINK_CMD_DEL); -} - -/** - * devl_rate_node_create - create devlink rate node - * @devlink: devlink instance - * @priv: driver private data - * @node_name: name of the resulting node - * @parent: parent devlink_rate struct - * - * Create devlink rate object of type node - */ -struct devlink_rate * -devl_rate_node_create(struct devlink *devlink, void *priv, char *node_name, - struct devlink_rate *parent) -{ - 
struct devlink_rate *rate_node; - - rate_node = devlink_rate_node_get_by_name(devlink, node_name); - if (!IS_ERR(rate_node)) - return ERR_PTR(-EEXIST); - - rate_node = kzalloc(sizeof(*rate_node), GFP_KERNEL); - if (!rate_node) - return ERR_PTR(-ENOMEM); - - if (parent) { - rate_node->parent = parent; - refcount_inc(&rate_node->parent->refcnt); - } - - rate_node->type = DEVLINK_RATE_TYPE_NODE; - rate_node->devlink = devlink; - rate_node->priv = priv; - - rate_node->name = kstrdup(node_name, GFP_KERNEL); - if (!rate_node->name) { - kfree(rate_node); - return ERR_PTR(-ENOMEM); - } - - refcount_set(&rate_node->refcnt, 1); - list_add(&rate_node->list, &devlink->rate_list); - devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_NEW); - return rate_node; -} -EXPORT_SYMBOL_GPL(devl_rate_node_create); - -/** - * devl_rate_leaf_create - create devlink rate leaf - * @devlink_port: devlink port object to create rate object on - * @priv: driver private data - * @parent: parent devlink_rate struct - * - * Create devlink rate object of type leaf on provided @devlink_port. - */ -int devl_rate_leaf_create(struct devlink_port *devlink_port, void *priv, - struct devlink_rate *parent) -{ - struct devlink *devlink = devlink_port->devlink; - struct devlink_rate *devlink_rate; - - devl_assert_locked(devlink_port->devlink); - - if (WARN_ON(devlink_port->devlink_rate)) - return -EBUSY; - - devlink_rate = kzalloc(sizeof(*devlink_rate), GFP_KERNEL); - if (!devlink_rate) - return -ENOMEM; - - if (parent) { - devlink_rate->parent = parent; - refcount_inc(&devlink_rate->parent->refcnt); - } - - devlink_rate->type = DEVLINK_RATE_TYPE_LEAF; - devlink_rate->devlink = devlink; - devlink_rate->devlink_port = devlink_port; - devlink_rate->priv = priv; - list_add_tail(&devlink_rate->list, &devlink->rate_list); - devlink_port->devlink_rate = devlink_rate; - devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_NEW); - - return 0; -} -EXPORT_SYMBOL_GPL(devl_rate_leaf_create); - -/** - * devl_rate_leaf_destroy - destroy devlink rate leaf - * - * @devlink_port: devlink port linked to the rate object - * - * Destroy the devlink rate object of type leaf on provided @devlink_port. - */ -void devl_rate_leaf_destroy(struct devlink_port *devlink_port) -{ - struct devlink_rate *devlink_rate = devlink_port->devlink_rate; - - devl_assert_locked(devlink_port->devlink); - if (!devlink_rate) - return; - - devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_DEL); - if (devlink_rate->parent) - refcount_dec(&devlink_rate->parent->refcnt); - list_del(&devlink_rate->list); - devlink_port->devlink_rate = NULL; - kfree(devlink_rate); -} -EXPORT_SYMBOL_GPL(devl_rate_leaf_destroy); - -/** - * devl_rate_nodes_destroy - destroy all devlink rate nodes on device - * @devlink: devlink instance - * - * Unset parent for all rate objects and destroy all rate nodes - * on specified device. 
- */ -void devl_rate_nodes_destroy(struct devlink *devlink) -{ - static struct devlink_rate *devlink_rate, *tmp; - const struct devlink_ops *ops = devlink->ops; - - devl_assert_locked(devlink); - - list_for_each_entry(devlink_rate, &devlink->rate_list, list) { - if (!devlink_rate->parent) - continue; - - refcount_dec(&devlink_rate->parent->refcnt); - if (devlink_rate_is_leaf(devlink_rate)) - ops->rate_leaf_parent_set(devlink_rate, NULL, devlink_rate->priv, - NULL, NULL); - else if (devlink_rate_is_node(devlink_rate)) - ops->rate_node_parent_set(devlink_rate, NULL, devlink_rate->priv, - NULL, NULL); - } - list_for_each_entry_safe(devlink_rate, tmp, &devlink->rate_list, list) { - if (devlink_rate_is_node(devlink_rate)) { - ops->rate_node_del(devlink_rate, devlink_rate->priv, NULL); - list_del(&devlink_rate->list); - kfree(devlink_rate->name); - kfree(devlink_rate); - } - } -} -EXPORT_SYMBOL_GPL(devl_rate_nodes_destroy); - -static int devlink_linecard_types_init(struct devlink_linecard *linecard) -{ - struct devlink_linecard_type *linecard_type; - unsigned int count; - int i; - - count = linecard->ops->types_count(linecard, linecard->priv); - linecard->types = kmalloc_array(count, sizeof(*linecard_type), - GFP_KERNEL); - if (!linecard->types) - return -ENOMEM; - linecard->types_count = count; - - for (i = 0; i < count; i++) { - linecard_type = &linecard->types[i]; - linecard->ops->types_get(linecard, linecard->priv, i, - &linecard_type->type, - &linecard_type->priv); - } - return 0; -} - -static void devlink_linecard_types_fini(struct devlink_linecard *linecard) -{ - kfree(linecard->types); -} - -/** - * devl_linecard_create - Create devlink linecard - * - * @devlink: devlink - * @linecard_index: driver-specific numerical identifier of the linecard - * @ops: linecards ops - * @priv: user priv pointer - * - * Create devlink linecard instance with provided linecard index. - * Caller can use any indexing, even hw-related one. - * - * Return: Line card structure or an ERR_PTR() encoded error code. 
- */ -struct devlink_linecard * -devl_linecard_create(struct devlink *devlink, unsigned int linecard_index, - const struct devlink_linecard_ops *ops, void *priv) -{ - struct devlink_linecard *linecard; - int err; - - if (WARN_ON(!ops || !ops->provision || !ops->unprovision || - !ops->types_count || !ops->types_get)) - return ERR_PTR(-EINVAL); - - if (devlink_linecard_index_exists(devlink, linecard_index)) - return ERR_PTR(-EEXIST); - - linecard = kzalloc(sizeof(*linecard), GFP_KERNEL); - if (!linecard) - return ERR_PTR(-ENOMEM); - - linecard->devlink = devlink; - linecard->index = linecard_index; - linecard->ops = ops; - linecard->priv = priv; - linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; - mutex_init(&linecard->state_lock); - - err = devlink_linecard_types_init(linecard); - if (err) { - mutex_destroy(&linecard->state_lock); - kfree(linecard); - return ERR_PTR(err); - } - - list_add_tail(&linecard->list, &devlink->linecard_list); - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); - return linecard; -} -EXPORT_SYMBOL_GPL(devl_linecard_create); - -/** - * devl_linecard_destroy - Destroy devlink linecard - * - * @linecard: devlink linecard - */ -void devl_linecard_destroy(struct devlink_linecard *linecard) -{ - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_DEL); - list_del(&linecard->list); - devlink_linecard_types_fini(linecard); - mutex_destroy(&linecard->state_lock); - kfree(linecard); -} -EXPORT_SYMBOL_GPL(devl_linecard_destroy); - -/** - * devlink_linecard_provision_set - Set provisioning on linecard - * - * @linecard: devlink linecard - * @type: linecard type - * - * This is either called directly from the provision() op call or - * as a result of the provision() op call asynchronously. - */ -void devlink_linecard_provision_set(struct devlink_linecard *linecard, - const char *type) -{ - mutex_lock(&linecard->state_lock); - WARN_ON(linecard->type && strcmp(linecard->type, type)); - linecard->state = DEVLINK_LINECARD_STATE_PROVISIONED; - linecard->type = type; - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); - mutex_unlock(&linecard->state_lock); -} -EXPORT_SYMBOL_GPL(devlink_linecard_provision_set); - -/** - * devlink_linecard_provision_clear - Clear provisioning on linecard - * - * @linecard: devlink linecard - * - * This is either called directly from the unprovision() op call or - * as a result of the unprovision() op call asynchronously. - */ -void devlink_linecard_provision_clear(struct devlink_linecard *linecard) -{ - mutex_lock(&linecard->state_lock); - WARN_ON(linecard->nested_devlink); - linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; - linecard->type = NULL; - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); - mutex_unlock(&linecard->state_lock); -} -EXPORT_SYMBOL_GPL(devlink_linecard_provision_clear); - -/** - * devlink_linecard_provision_fail - Fail provisioning on linecard - * - * @linecard: devlink linecard - * - * This is either called directly from the provision() op call or - * as a result of the provision() op call asynchronously. 
- */ -void devlink_linecard_provision_fail(struct devlink_linecard *linecard) -{ - mutex_lock(&linecard->state_lock); - WARN_ON(linecard->nested_devlink); - linecard->state = DEVLINK_LINECARD_STATE_PROVISIONING_FAILED; - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); - mutex_unlock(&linecard->state_lock); -} -EXPORT_SYMBOL_GPL(devlink_linecard_provision_fail); - -/** - * devlink_linecard_activate - Set linecard active - * - * @linecard: devlink linecard - */ -void devlink_linecard_activate(struct devlink_linecard *linecard) -{ - mutex_lock(&linecard->state_lock); - WARN_ON(linecard->state != DEVLINK_LINECARD_STATE_PROVISIONED); - linecard->state = DEVLINK_LINECARD_STATE_ACTIVE; - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); - mutex_unlock(&linecard->state_lock); -} -EXPORT_SYMBOL_GPL(devlink_linecard_activate); - -/** - * devlink_linecard_deactivate - Set linecard inactive - * - * @linecard: devlink linecard - */ -void devlink_linecard_deactivate(struct devlink_linecard *linecard) -{ - mutex_lock(&linecard->state_lock); - switch (linecard->state) { - case DEVLINK_LINECARD_STATE_ACTIVE: - linecard->state = DEVLINK_LINECARD_STATE_PROVISIONED; - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); - break; - case DEVLINK_LINECARD_STATE_UNPROVISIONING: - /* Line card is being deactivated as part - * of unprovisioning flow. - */ - break; - default: - WARN_ON(1); - break; - } - mutex_unlock(&linecard->state_lock); -} -EXPORT_SYMBOL_GPL(devlink_linecard_deactivate); - -/** - * devlink_linecard_nested_dl_set - Attach/detach nested devlink - * instance to linecard. - * - * @linecard: devlink linecard - * @nested_devlink: devlink instance to attach or NULL to detach - */ -void devlink_linecard_nested_dl_set(struct devlink_linecard *linecard, - struct devlink *nested_devlink) -{ - mutex_lock(&linecard->state_lock); - linecard->nested_devlink = nested_devlink; - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); - mutex_unlock(&linecard->state_lock); -} -EXPORT_SYMBOL_GPL(devlink_linecard_nested_dl_set); - -#define DEVLINK_TRAP(_id, _type) \ - { \ - .type = DEVLINK_TRAP_TYPE_##_type, \ - .id = DEVLINK_TRAP_GENERIC_ID_##_id, \ - .name = DEVLINK_TRAP_GENERIC_NAME_##_id, \ - } - -static const struct devlink_trap devlink_trap_generic[] = { - DEVLINK_TRAP(SMAC_MC, DROP), - DEVLINK_TRAP(VLAN_TAG_MISMATCH, DROP), - DEVLINK_TRAP(INGRESS_VLAN_FILTER, DROP), - DEVLINK_TRAP(INGRESS_STP_FILTER, DROP), - DEVLINK_TRAP(EMPTY_TX_LIST, DROP), - DEVLINK_TRAP(PORT_LOOPBACK_FILTER, DROP), - DEVLINK_TRAP(BLACKHOLE_ROUTE, DROP), - DEVLINK_TRAP(TTL_ERROR, EXCEPTION), - DEVLINK_TRAP(TAIL_DROP, DROP), - DEVLINK_TRAP(NON_IP_PACKET, DROP), - DEVLINK_TRAP(UC_DIP_MC_DMAC, DROP), - DEVLINK_TRAP(DIP_LB, DROP), - DEVLINK_TRAP(SIP_MC, DROP), - DEVLINK_TRAP(SIP_LB, DROP), - DEVLINK_TRAP(CORRUPTED_IP_HDR, DROP), - DEVLINK_TRAP(IPV4_SIP_BC, DROP), - DEVLINK_TRAP(IPV6_MC_DIP_RESERVED_SCOPE, DROP), - DEVLINK_TRAP(IPV6_MC_DIP_INTERFACE_LOCAL_SCOPE, DROP), - DEVLINK_TRAP(MTU_ERROR, EXCEPTION), - DEVLINK_TRAP(UNRESOLVED_NEIGH, EXCEPTION), - DEVLINK_TRAP(RPF, EXCEPTION), - DEVLINK_TRAP(REJECT_ROUTE, EXCEPTION), - DEVLINK_TRAP(IPV4_LPM_UNICAST_MISS, EXCEPTION), - DEVLINK_TRAP(IPV6_LPM_UNICAST_MISS, EXCEPTION), - DEVLINK_TRAP(NON_ROUTABLE, DROP), - DEVLINK_TRAP(DECAP_ERROR, EXCEPTION), - DEVLINK_TRAP(OVERLAY_SMAC_MC, DROP), - DEVLINK_TRAP(INGRESS_FLOW_ACTION_DROP, DROP), - DEVLINK_TRAP(EGRESS_FLOW_ACTION_DROP, DROP), - DEVLINK_TRAP(STP, CONTROL), - DEVLINK_TRAP(LACP, CONTROL), - DEVLINK_TRAP(LLDP, 
CONTROL), - DEVLINK_TRAP(IGMP_QUERY, CONTROL), - DEVLINK_TRAP(IGMP_V1_REPORT, CONTROL), - DEVLINK_TRAP(IGMP_V2_REPORT, CONTROL), - DEVLINK_TRAP(IGMP_V3_REPORT, CONTROL), - DEVLINK_TRAP(IGMP_V2_LEAVE, CONTROL), - DEVLINK_TRAP(MLD_QUERY, CONTROL), - DEVLINK_TRAP(MLD_V1_REPORT, CONTROL), - DEVLINK_TRAP(MLD_V2_REPORT, CONTROL), - DEVLINK_TRAP(MLD_V1_DONE, CONTROL), - DEVLINK_TRAP(IPV4_DHCP, CONTROL), - DEVLINK_TRAP(IPV6_DHCP, CONTROL), - DEVLINK_TRAP(ARP_REQUEST, CONTROL), - DEVLINK_TRAP(ARP_RESPONSE, CONTROL), - DEVLINK_TRAP(ARP_OVERLAY, CONTROL), - DEVLINK_TRAP(IPV6_NEIGH_SOLICIT, CONTROL), - DEVLINK_TRAP(IPV6_NEIGH_ADVERT, CONTROL), - DEVLINK_TRAP(IPV4_BFD, CONTROL), - DEVLINK_TRAP(IPV6_BFD, CONTROL), - DEVLINK_TRAP(IPV4_OSPF, CONTROL), - DEVLINK_TRAP(IPV6_OSPF, CONTROL), - DEVLINK_TRAP(IPV4_BGP, CONTROL), - DEVLINK_TRAP(IPV6_BGP, CONTROL), - DEVLINK_TRAP(IPV4_VRRP, CONTROL), - DEVLINK_TRAP(IPV6_VRRP, CONTROL), - DEVLINK_TRAP(IPV4_PIM, CONTROL), - DEVLINK_TRAP(IPV6_PIM, CONTROL), - DEVLINK_TRAP(UC_LB, CONTROL), - DEVLINK_TRAP(LOCAL_ROUTE, CONTROL), - DEVLINK_TRAP(EXTERNAL_ROUTE, CONTROL), - DEVLINK_TRAP(IPV6_UC_DIP_LINK_LOCAL_SCOPE, CONTROL), - DEVLINK_TRAP(IPV6_DIP_ALL_NODES, CONTROL), - DEVLINK_TRAP(IPV6_DIP_ALL_ROUTERS, CONTROL), - DEVLINK_TRAP(IPV6_ROUTER_SOLICIT, CONTROL), - DEVLINK_TRAP(IPV6_ROUTER_ADVERT, CONTROL), - DEVLINK_TRAP(IPV6_REDIRECT, CONTROL), - DEVLINK_TRAP(IPV4_ROUTER_ALERT, CONTROL), - DEVLINK_TRAP(IPV6_ROUTER_ALERT, CONTROL), - DEVLINK_TRAP(PTP_EVENT, CONTROL), - DEVLINK_TRAP(PTP_GENERAL, CONTROL), - DEVLINK_TRAP(FLOW_ACTION_SAMPLE, CONTROL), - DEVLINK_TRAP(FLOW_ACTION_TRAP, CONTROL), - DEVLINK_TRAP(EARLY_DROP, DROP), - DEVLINK_TRAP(VXLAN_PARSING, DROP), - DEVLINK_TRAP(LLC_SNAP_PARSING, DROP), - DEVLINK_TRAP(VLAN_PARSING, DROP), - DEVLINK_TRAP(PPPOE_PPP_PARSING, DROP), - DEVLINK_TRAP(MPLS_PARSING, DROP), - DEVLINK_TRAP(ARP_PARSING, DROP), - DEVLINK_TRAP(IP_1_PARSING, DROP), - DEVLINK_TRAP(IP_N_PARSING, DROP), - DEVLINK_TRAP(GRE_PARSING, DROP), - DEVLINK_TRAP(UDP_PARSING, DROP), - DEVLINK_TRAP(TCP_PARSING, DROP), - DEVLINK_TRAP(IPSEC_PARSING, DROP), - DEVLINK_TRAP(SCTP_PARSING, DROP), - DEVLINK_TRAP(DCCP_PARSING, DROP), - DEVLINK_TRAP(GTP_PARSING, DROP), - DEVLINK_TRAP(ESP_PARSING, DROP), - DEVLINK_TRAP(BLACKHOLE_NEXTHOP, DROP), - DEVLINK_TRAP(DMAC_FILTER, DROP), - DEVLINK_TRAP(EAPOL, CONTROL), - DEVLINK_TRAP(LOCKED_PORT, DROP), -}; - -#define DEVLINK_TRAP_GROUP(_id) \ - { \ - .id = DEVLINK_TRAP_GROUP_GENERIC_ID_##_id, \ - .name = DEVLINK_TRAP_GROUP_GENERIC_NAME_##_id, \ - } - -static const struct devlink_trap_group devlink_trap_group_generic[] = { - DEVLINK_TRAP_GROUP(L2_DROPS), - DEVLINK_TRAP_GROUP(L3_DROPS), - DEVLINK_TRAP_GROUP(L3_EXCEPTIONS), - DEVLINK_TRAP_GROUP(BUFFER_DROPS), - DEVLINK_TRAP_GROUP(TUNNEL_DROPS), - DEVLINK_TRAP_GROUP(ACL_DROPS), - DEVLINK_TRAP_GROUP(STP), - DEVLINK_TRAP_GROUP(LACP), - DEVLINK_TRAP_GROUP(LLDP), - DEVLINK_TRAP_GROUP(MC_SNOOPING), - DEVLINK_TRAP_GROUP(DHCP), - DEVLINK_TRAP_GROUP(NEIGH_DISCOVERY), - DEVLINK_TRAP_GROUP(BFD), - DEVLINK_TRAP_GROUP(OSPF), - DEVLINK_TRAP_GROUP(BGP), - DEVLINK_TRAP_GROUP(VRRP), - DEVLINK_TRAP_GROUP(PIM), - DEVLINK_TRAP_GROUP(UC_LB), - DEVLINK_TRAP_GROUP(LOCAL_DELIVERY), - DEVLINK_TRAP_GROUP(EXTERNAL_DELIVERY), - DEVLINK_TRAP_GROUP(IPV6), - DEVLINK_TRAP_GROUP(PTP_EVENT), - DEVLINK_TRAP_GROUP(PTP_GENERAL), - DEVLINK_TRAP_GROUP(ACL_SAMPLE), - DEVLINK_TRAP_GROUP(ACL_TRAP), - DEVLINK_TRAP_GROUP(PARSER_ERROR_DROPS), - DEVLINK_TRAP_GROUP(EAPOL), -}; - -static int devlink_trap_generic_verify(const struct 
devlink_trap *trap) -{ - if (trap->id > DEVLINK_TRAP_GENERIC_ID_MAX) - return -EINVAL; - - if (strcmp(trap->name, devlink_trap_generic[trap->id].name)) - return -EINVAL; - - if (trap->type != devlink_trap_generic[trap->id].type) - return -EINVAL; - - return 0; -} - -static int devlink_trap_driver_verify(const struct devlink_trap *trap) -{ - int i; - - if (trap->id <= DEVLINK_TRAP_GENERIC_ID_MAX) - return -EINVAL; - - for (i = 0; i < ARRAY_SIZE(devlink_trap_generic); i++) { - if (!strcmp(trap->name, devlink_trap_generic[i].name)) - return -EEXIST; - } - - return 0; -} - -static int devlink_trap_verify(const struct devlink_trap *trap) -{ - if (!trap || !trap->name) - return -EINVAL; - - if (trap->generic) - return devlink_trap_generic_verify(trap); - else - return devlink_trap_driver_verify(trap); -} - -static int -devlink_trap_group_generic_verify(const struct devlink_trap_group *group) -{ - if (group->id > DEVLINK_TRAP_GROUP_GENERIC_ID_MAX) - return -EINVAL; - - if (strcmp(group->name, devlink_trap_group_generic[group->id].name)) - return -EINVAL; - - return 0; -} - -static int -devlink_trap_group_driver_verify(const struct devlink_trap_group *group) -{ - int i; - - if (group->id <= DEVLINK_TRAP_GROUP_GENERIC_ID_MAX) - return -EINVAL; - - for (i = 0; i < ARRAY_SIZE(devlink_trap_group_generic); i++) { - if (!strcmp(group->name, devlink_trap_group_generic[i].name)) - return -EEXIST; - } - - return 0; -} - -static int devlink_trap_group_verify(const struct devlink_trap_group *group) -{ - if (group->generic) - return devlink_trap_group_generic_verify(group); - else - return devlink_trap_group_driver_verify(group); -} - -static void -devlink_trap_group_notify(struct devlink *devlink, - const struct devlink_trap_group_item *group_item, - enum devlink_command cmd) -{ - struct sk_buff *msg; - int err; - - WARN_ON_ONCE(cmd != DEVLINK_CMD_TRAP_GROUP_NEW && - cmd != DEVLINK_CMD_TRAP_GROUP_DEL); - if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)) - return; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return; - - err = devlink_nl_trap_group_fill(msg, devlink, group_item, cmd, 0, 0, - 0); - if (err) { - nlmsg_free(msg); - return; - } - - genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), - msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); -} - -static void devlink_trap_groups_notify_register(struct devlink *devlink) -{ - struct devlink_trap_group_item *group_item; +const struct genl_small_ops devlink_nl_small_ops[40] = { + { + .cmd = DEVLINK_CMD_PORT_SET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = devlink_nl_cmd_port_set_doit, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, + }, + { + .cmd = DEVLINK_CMD_RATE_SET, + .doit = devlink_nl_cmd_rate_set_doit, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = DEVLINK_CMD_RATE_NEW, + .doit = devlink_nl_cmd_rate_new_doit, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = DEVLINK_CMD_RATE_DEL, + .doit = devlink_nl_cmd_rate_del_doit, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = DEVLINK_CMD_PORT_SPLIT, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = devlink_nl_cmd_port_split_doit, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, + }, + { + .cmd = DEVLINK_CMD_PORT_UNSPLIT, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = devlink_nl_cmd_port_unsplit_doit, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, + }, + { + .cmd = DEVLINK_CMD_PORT_NEW, + .doit = 
devlink_nl_cmd_port_new_doit, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = DEVLINK_CMD_PORT_DEL, + .doit = devlink_nl_cmd_port_del_doit, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, + }, - list_for_each_entry(group_item, &devlink->trap_group_list, list) - devlink_trap_group_notify(devlink, group_item, - DEVLINK_CMD_TRAP_GROUP_NEW); -} + { + .cmd = DEVLINK_CMD_LINECARD_SET, + .doit = devlink_nl_cmd_linecard_set_doit, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = DEVLINK_CMD_SB_POOL_SET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = devlink_nl_cmd_sb_pool_set_doit, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = DEVLINK_CMD_SB_PORT_POOL_SET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = devlink_nl_cmd_sb_port_pool_set_doit, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, + }, + { + .cmd = DEVLINK_CMD_SB_TC_POOL_BIND_SET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = devlink_nl_cmd_sb_tc_pool_bind_set_doit, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, + }, + { + .cmd = DEVLINK_CMD_SB_OCC_SNAPSHOT, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = devlink_nl_cmd_sb_occ_snapshot_doit, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = DEVLINK_CMD_SB_OCC_MAX_CLEAR, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = devlink_nl_cmd_sb_occ_max_clear_doit, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = DEVLINK_CMD_ESWITCH_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = devlink_nl_cmd_eswitch_get_doit, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = DEVLINK_CMD_ESWITCH_SET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = devlink_nl_cmd_eswitch_set_doit, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = DEVLINK_CMD_DPIPE_TABLE_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = devlink_nl_cmd_dpipe_table_get, + /* can be retrieved by unprivileged users */ + }, + { + .cmd = DEVLINK_CMD_DPIPE_ENTRIES_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = devlink_nl_cmd_dpipe_entries_get, + /* can be retrieved by unprivileged users */ + }, + { + .cmd = DEVLINK_CMD_DPIPE_HEADERS_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = devlink_nl_cmd_dpipe_headers_get, + /* can be retrieved by unprivileged users */ + }, + { + .cmd = DEVLINK_CMD_DPIPE_TABLE_COUNTERS_SET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = devlink_nl_cmd_dpipe_table_counters_set, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = DEVLINK_CMD_RESOURCE_SET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = devlink_nl_cmd_resource_set, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = DEVLINK_CMD_RESOURCE_DUMP, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = devlink_nl_cmd_resource_dump, + /* can be retrieved by unprivileged users */ + }, + { + .cmd = DEVLINK_CMD_RELOAD, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = devlink_nl_cmd_reload, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = DEVLINK_CMD_PARAM_SET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = devlink_nl_cmd_param_set_doit, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = DEVLINK_CMD_PORT_PARAM_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = 
devlink_nl_cmd_port_param_get_doit, + .dumpit = devlink_nl_cmd_port_param_get_dumpit, + .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, + /* can be retrieved by unprivileged users */ + }, + { + .cmd = DEVLINK_CMD_PORT_PARAM_SET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = devlink_nl_cmd_port_param_set_doit, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, + }, + { + .cmd = DEVLINK_CMD_REGION_NEW, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = devlink_nl_cmd_region_new, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = DEVLINK_CMD_REGION_DEL, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = devlink_nl_cmd_region_del, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = DEVLINK_CMD_REGION_READ, + .validate = GENL_DONT_VALIDATE_STRICT | + GENL_DONT_VALIDATE_DUMP_STRICT, + .dumpit = devlink_nl_cmd_region_read_dumpit, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = DEVLINK_CMD_HEALTH_REPORTER_SET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = devlink_nl_cmd_health_reporter_set_doit, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT, + }, + { + .cmd = DEVLINK_CMD_HEALTH_REPORTER_RECOVER, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = devlink_nl_cmd_health_reporter_recover_doit, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT, + }, + { + .cmd = DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = devlink_nl_cmd_health_reporter_diagnose_doit, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT, + }, + { + .cmd = DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET, + .validate = GENL_DONT_VALIDATE_STRICT | + GENL_DONT_VALIDATE_DUMP_STRICT, + .dumpit = devlink_nl_cmd_health_reporter_dump_get_dumpit, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = DEVLINK_CMD_HEALTH_REPORTER_DUMP_CLEAR, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = devlink_nl_cmd_health_reporter_dump_clear_doit, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT, + }, + { + .cmd = DEVLINK_CMD_HEALTH_REPORTER_TEST, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = devlink_nl_cmd_health_reporter_test_doit, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT, + }, + { + .cmd = DEVLINK_CMD_FLASH_UPDATE, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = devlink_nl_cmd_flash_update, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = DEVLINK_CMD_TRAP_SET, + .doit = devlink_nl_cmd_trap_set_doit, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = DEVLINK_CMD_TRAP_GROUP_SET, + .doit = devlink_nl_cmd_trap_group_set_doit, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = DEVLINK_CMD_TRAP_POLICER_SET, + .doit = devlink_nl_cmd_trap_policer_set_doit, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = DEVLINK_CMD_SELFTESTS_RUN, + .doit = devlink_nl_cmd_selftests_run, + .flags = GENL_ADMIN_PERM, + }, + /* -- No new ops here! Use split ops going forward! 
-- */ +}; -static void devlink_trap_groups_notify_unregister(struct devlink *devlink) +void devlink_notify_register(struct devlink *devlink) { - struct devlink_trap_group_item *group_item; - - list_for_each_entry_reverse(group_item, &devlink->trap_group_list, list) - devlink_trap_group_notify(devlink, group_item, - DEVLINK_CMD_TRAP_GROUP_DEL); + devlink_notify(devlink, DEVLINK_CMD_NEW); + devlink_linecards_notify_register(devlink); + devlink_ports_notify_register(devlink); + devlink_trap_policers_notify_register(devlink); + devlink_trap_groups_notify_register(devlink); + devlink_traps_notify_register(devlink); + devlink_rates_notify_register(devlink); + devlink_regions_notify_register(devlink); + devlink_params_notify_register(devlink); } -static int -devlink_trap_item_group_link(struct devlink *devlink, - struct devlink_trap_item *trap_item) +void devlink_notify_unregister(struct devlink *devlink) { - u16 group_id = trap_item->trap->init_group_id; - struct devlink_trap_group_item *group_item; - - group_item = devlink_trap_group_item_lookup_by_id(devlink, group_id); - if (WARN_ON_ONCE(!group_item)) - return -EINVAL; - - trap_item->group_item = group_item; - - return 0; + devlink_params_notify_unregister(devlink); + devlink_regions_notify_unregister(devlink); + devlink_rates_notify_unregister(devlink); + devlink_traps_notify_unregister(devlink); + devlink_trap_groups_notify_unregister(devlink); + devlink_trap_policers_notify_unregister(devlink); + devlink_ports_notify_unregister(devlink); + devlink_linecards_notify_unregister(devlink); + devlink_notify(devlink, DEVLINK_CMD_DEL); } -static void devlink_trap_notify(struct devlink *devlink, - const struct devlink_trap_item *trap_item, - enum devlink_command cmd) +/** + * devl_rate_node_create - create devlink rate node + * @devlink: devlink instance + * @priv: driver private data + * @node_name: name of the resulting node + * @parent: parent devlink_rate struct + * + * Create devlink rate object of type node + */ +struct devlink_rate * +devl_rate_node_create(struct devlink *devlink, void *priv, char *node_name, + struct devlink_rate *parent) { - struct sk_buff *msg; - int err; + struct devlink_rate *rate_node; - WARN_ON_ONCE(cmd != DEVLINK_CMD_TRAP_NEW && - cmd != DEVLINK_CMD_TRAP_DEL); - if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)) - return; + rate_node = devlink_rate_node_get_by_name(devlink, node_name); + if (!IS_ERR(rate_node)) + return ERR_PTR(-EEXIST); - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return; + rate_node = kzalloc(sizeof(*rate_node), GFP_KERNEL); + if (!rate_node) + return ERR_PTR(-ENOMEM); - err = devlink_nl_trap_fill(msg, devlink, trap_item, cmd, 0, 0, 0); - if (err) { - nlmsg_free(msg); - return; + if (parent) { + rate_node->parent = parent; + refcount_inc(&rate_node->parent->refcnt); } - genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), - msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); -} - -static void devlink_traps_notify_register(struct devlink *devlink) -{ - struct devlink_trap_item *trap_item; - - list_for_each_entry(trap_item, &devlink->trap_list, list) - devlink_trap_notify(devlink, trap_item, DEVLINK_CMD_TRAP_NEW); -} - -static void devlink_traps_notify_unregister(struct devlink *devlink) -{ - struct devlink_trap_item *trap_item; - - list_for_each_entry_reverse(trap_item, &devlink->trap_list, list) - devlink_trap_notify(devlink, trap_item, DEVLINK_CMD_TRAP_DEL); -} - -static int -devlink_trap_register(struct devlink *devlink, - const struct devlink_trap *trap, 
void *priv) -{ - struct devlink_trap_item *trap_item; - int err; - - if (devlink_trap_item_lookup(devlink, trap->name)) - return -EEXIST; - - trap_item = kzalloc(sizeof(*trap_item), GFP_KERNEL); - if (!trap_item) - return -ENOMEM; + rate_node->type = DEVLINK_RATE_TYPE_NODE; + rate_node->devlink = devlink; + rate_node->priv = priv; - trap_item->stats = netdev_alloc_pcpu_stats(struct devlink_stats); - if (!trap_item->stats) { - err = -ENOMEM; - goto err_stats_alloc; + rate_node->name = kstrdup(node_name, GFP_KERNEL); + if (!rate_node->name) { + kfree(rate_node); + return ERR_PTR(-ENOMEM); } - trap_item->trap = trap; - trap_item->action = trap->init_action; - trap_item->priv = priv; - - err = devlink_trap_item_group_link(devlink, trap_item); - if (err) - goto err_group_link; - - err = devlink->ops->trap_init(devlink, trap, trap_item); - if (err) - goto err_trap_init; - - list_add_tail(&trap_item->list, &devlink->trap_list); - devlink_trap_notify(devlink, trap_item, DEVLINK_CMD_TRAP_NEW); - - return 0; - -err_trap_init: -err_group_link: - free_percpu(trap_item->stats); -err_stats_alloc: - kfree(trap_item); - return err; -} - -static void devlink_trap_unregister(struct devlink *devlink, - const struct devlink_trap *trap) -{ - struct devlink_trap_item *trap_item; - - trap_item = devlink_trap_item_lookup(devlink, trap->name); - if (WARN_ON_ONCE(!trap_item)) - return; - - devlink_trap_notify(devlink, trap_item, DEVLINK_CMD_TRAP_DEL); - list_del(&trap_item->list); - if (devlink->ops->trap_fini) - devlink->ops->trap_fini(devlink, trap, trap_item); - free_percpu(trap_item->stats); - kfree(trap_item); -} - -static void devlink_trap_disable(struct devlink *devlink, - const struct devlink_trap *trap) -{ - struct devlink_trap_item *trap_item; - - trap_item = devlink_trap_item_lookup(devlink, trap->name); - if (WARN_ON_ONCE(!trap_item)) - return; - - devlink->ops->trap_action_set(devlink, trap, DEVLINK_TRAP_ACTION_DROP, - NULL); - trap_item->action = DEVLINK_TRAP_ACTION_DROP; + refcount_set(&rate_node->refcnt, 1); + list_add(&rate_node->list, &devlink->rate_list); + devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_NEW); + return rate_node; } +EXPORT_SYMBOL_GPL(devl_rate_node_create); /** - * devl_traps_register - Register packet traps with devlink. - * @devlink: devlink. - * @traps: Packet traps. - * @traps_count: Count of provided packet traps. - * @priv: Driver private information. + * devl_rate_leaf_create - create devlink rate leaf + * @devlink_port: devlink port object to create rate object on + * @priv: driver private data + * @parent: parent devlink_rate struct * - * Return: Non-zero value on failure. + * Create devlink rate object of type leaf on provided @devlink_port. 
*/ -int devl_traps_register(struct devlink *devlink, - const struct devlink_trap *traps, - size_t traps_count, void *priv) +int devl_rate_leaf_create(struct devlink_port *devlink_port, void *priv, + struct devlink_rate *parent) { - int i, err; + struct devlink *devlink = devlink_port->devlink; + struct devlink_rate *devlink_rate; - if (!devlink->ops->trap_init || !devlink->ops->trap_action_set) - return -EINVAL; + devl_assert_locked(devlink_port->devlink); - devl_assert_locked(devlink); - for (i = 0; i < traps_count; i++) { - const struct devlink_trap *trap = &traps[i]; + if (WARN_ON(devlink_port->devlink_rate)) + return -EBUSY; - err = devlink_trap_verify(trap); - if (err) - goto err_trap_verify; + devlink_rate = kzalloc(sizeof(*devlink_rate), GFP_KERNEL); + if (!devlink_rate) + return -ENOMEM; - err = devlink_trap_register(devlink, trap, priv); - if (err) - goto err_trap_register; + if (parent) { + devlink_rate->parent = parent; + refcount_inc(&devlink_rate->parent->refcnt); } - return 0; + devlink_rate->type = DEVLINK_RATE_TYPE_LEAF; + devlink_rate->devlink = devlink; + devlink_rate->devlink_port = devlink_port; + devlink_rate->priv = priv; + list_add_tail(&devlink_rate->list, &devlink->rate_list); + devlink_port->devlink_rate = devlink_rate; + devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_NEW); -err_trap_register: -err_trap_verify: - for (i--; i >= 0; i--) - devlink_trap_unregister(devlink, &traps[i]); - return err; + return 0; } -EXPORT_SYMBOL_GPL(devl_traps_register); +EXPORT_SYMBOL_GPL(devl_rate_leaf_create); /** - * devlink_traps_register - Register packet traps with devlink. - * @devlink: devlink. - * @traps: Packet traps. - * @traps_count: Count of provided packet traps. - * @priv: Driver private information. - * - * Context: Takes and release devlink->lock . + * devl_rate_leaf_destroy - destroy devlink rate leaf * - * Return: Non-zero value on failure. - */ -int devlink_traps_register(struct devlink *devlink, - const struct devlink_trap *traps, - size_t traps_count, void *priv) -{ - int err; - - devl_lock(devlink); - err = devl_traps_register(devlink, traps, traps_count, priv); - devl_unlock(devlink); - return err; -} -EXPORT_SYMBOL_GPL(devlink_traps_register); - -/** - * devl_traps_unregister - Unregister packet traps from devlink. - * @devlink: devlink. - * @traps: Packet traps. - * @traps_count: Count of provided packet traps. - */ -void devl_traps_unregister(struct devlink *devlink, - const struct devlink_trap *traps, - size_t traps_count) -{ - int i; - - devl_assert_locked(devlink); - /* Make sure we do not have any packets in-flight while unregistering - * traps by disabling all of them and waiting for a grace period. - */ - for (i = traps_count - 1; i >= 0; i--) - devlink_trap_disable(devlink, &traps[i]); - synchronize_rcu(); - for (i = traps_count - 1; i >= 0; i--) - devlink_trap_unregister(devlink, &traps[i]); -} -EXPORT_SYMBOL_GPL(devl_traps_unregister); - -/** - * devlink_traps_unregister - Unregister packet traps from devlink. - * @devlink: devlink. - * @traps: Packet traps. - * @traps_count: Count of provided packet traps. + * @devlink_port: devlink port linked to the rate object * - * Context: Takes and release devlink->lock . 
- */ -void devlink_traps_unregister(struct devlink *devlink, - const struct devlink_trap *traps, - size_t traps_count) -{ - devl_lock(devlink); - devl_traps_unregister(devlink, traps, traps_count); - devl_unlock(devlink); -} -EXPORT_SYMBOL_GPL(devlink_traps_unregister); - -static void -devlink_trap_stats_update(struct devlink_stats __percpu *trap_stats, - size_t skb_len) -{ - struct devlink_stats *stats; - - stats = this_cpu_ptr(trap_stats); - u64_stats_update_begin(&stats->syncp); - u64_stats_add(&stats->rx_bytes, skb_len); - u64_stats_inc(&stats->rx_packets); - u64_stats_update_end(&stats->syncp); -} - -static void -devlink_trap_report_metadata_set(struct devlink_trap_metadata *metadata, - const struct devlink_trap_item *trap_item, - struct devlink_port *in_devlink_port, - const struct flow_action_cookie *fa_cookie) -{ - metadata->trap_name = trap_item->trap->name; - metadata->trap_group_name = trap_item->group_item->group->name; - metadata->fa_cookie = fa_cookie; - metadata->trap_type = trap_item->trap->type; - - spin_lock(&in_devlink_port->type_lock); - if (in_devlink_port->type == DEVLINK_PORT_TYPE_ETH) - metadata->input_dev = in_devlink_port->type_eth.netdev; - spin_unlock(&in_devlink_port->type_lock); -} - -/** - * devlink_trap_report - Report trapped packet to drop monitor. - * @devlink: devlink. - * @skb: Trapped packet. - * @trap_ctx: Trap context. - * @in_devlink_port: Input devlink port. - * @fa_cookie: Flow action cookie. Could be NULL. + * Destroy the devlink rate object of type leaf on provided @devlink_port. */ -void devlink_trap_report(struct devlink *devlink, struct sk_buff *skb, - void *trap_ctx, struct devlink_port *in_devlink_port, - const struct flow_action_cookie *fa_cookie) - +void devl_rate_leaf_destroy(struct devlink_port *devlink_port) { - struct devlink_trap_item *trap_item = trap_ctx; - - devlink_trap_stats_update(trap_item->stats, skb->len); - devlink_trap_stats_update(trap_item->group_item->stats, skb->len); + struct devlink_rate *devlink_rate = devlink_port->devlink_rate; - if (tracepoint_enabled(devlink_trap_report)) { - struct devlink_trap_metadata metadata = {}; + devl_assert_locked(devlink_port->devlink); + if (!devlink_rate) + return; - devlink_trap_report_metadata_set(&metadata, trap_item, - in_devlink_port, fa_cookie); - trace_devlink_trap_report(devlink, skb, &metadata); - } + devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_DEL); + if (devlink_rate->parent) + refcount_dec(&devlink_rate->parent->refcnt); + list_del(&devlink_rate->list); + devlink_port->devlink_rate = NULL; + kfree(devlink_rate); } -EXPORT_SYMBOL_GPL(devlink_trap_report); +EXPORT_SYMBOL_GPL(devl_rate_leaf_destroy); /** - * devlink_trap_ctx_priv - Trap context to driver private information. - * @trap_ctx: Trap context. + * devl_rate_nodes_destroy - destroy all devlink rate nodes on device + * @devlink: devlink instance * - * Return: Driver private information passed during registration. + * Unset parent for all rate objects and destroy all rate nodes + * on specified device. 
*/ -void *devlink_trap_ctx_priv(void *trap_ctx) -{ - struct devlink_trap_item *trap_item = trap_ctx; - - return trap_item->priv; -} -EXPORT_SYMBOL_GPL(devlink_trap_ctx_priv); - -static int -devlink_trap_group_item_policer_link(struct devlink *devlink, - struct devlink_trap_group_item *group_item) -{ - u32 policer_id = group_item->group->init_policer_id; - struct devlink_trap_policer_item *policer_item; - - if (policer_id == 0) - return 0; - - policer_item = devlink_trap_policer_item_lookup(devlink, policer_id); - if (WARN_ON_ONCE(!policer_item)) - return -EINVAL; - - group_item->policer_item = policer_item; - - return 0; -} - -static int -devlink_trap_group_register(struct devlink *devlink, - const struct devlink_trap_group *group) +void devl_rate_nodes_destroy(struct devlink *devlink) { - struct devlink_trap_group_item *group_item; - int err; + static struct devlink_rate *devlink_rate, *tmp; + const struct devlink_ops *ops = devlink->ops; - if (devlink_trap_group_item_lookup(devlink, group->name)) - return -EEXIST; + devl_assert_locked(devlink); - group_item = kzalloc(sizeof(*group_item), GFP_KERNEL); - if (!group_item) - return -ENOMEM; + list_for_each_entry(devlink_rate, &devlink->rate_list, list) { + if (!devlink_rate->parent) + continue; - group_item->stats = netdev_alloc_pcpu_stats(struct devlink_stats); - if (!group_item->stats) { - err = -ENOMEM; - goto err_stats_alloc; + refcount_dec(&devlink_rate->parent->refcnt); + if (devlink_rate_is_leaf(devlink_rate)) + ops->rate_leaf_parent_set(devlink_rate, NULL, devlink_rate->priv, + NULL, NULL); + else if (devlink_rate_is_node(devlink_rate)) + ops->rate_node_parent_set(devlink_rate, NULL, devlink_rate->priv, + NULL, NULL); + } + list_for_each_entry_safe(devlink_rate, tmp, &devlink->rate_list, list) { + if (devlink_rate_is_node(devlink_rate)) { + ops->rate_node_del(devlink_rate, devlink_rate->priv, NULL); + list_del(&devlink_rate->list); + kfree(devlink_rate->name); + kfree(devlink_rate); + } } +} +EXPORT_SYMBOL_GPL(devl_rate_nodes_destroy); - group_item->group = group; +static int devlink_linecard_types_init(struct devlink_linecard *linecard) +{ + struct devlink_linecard_type *linecard_type; + unsigned int count; + int i; - err = devlink_trap_group_item_policer_link(devlink, group_item); - if (err) - goto err_policer_link; + count = linecard->ops->types_count(linecard, linecard->priv); + linecard->types = kmalloc_array(count, sizeof(*linecard_type), + GFP_KERNEL); + if (!linecard->types) + return -ENOMEM; + linecard->types_count = count; - if (devlink->ops->trap_group_init) { - err = devlink->ops->trap_group_init(devlink, group); - if (err) - goto err_group_init; + for (i = 0; i < count; i++) { + linecard_type = &linecard->types[i]; + linecard->ops->types_get(linecard, linecard->priv, i, + &linecard_type->type, + &linecard_type->priv); } - - list_add_tail(&group_item->list, &devlink->trap_group_list); - devlink_trap_group_notify(devlink, group_item, - DEVLINK_CMD_TRAP_GROUP_NEW); - return 0; - -err_group_init: -err_policer_link: - free_percpu(group_item->stats); -err_stats_alloc: - kfree(group_item); - return err; } -static void -devlink_trap_group_unregister(struct devlink *devlink, - const struct devlink_trap_group *group) +static void devlink_linecard_types_fini(struct devlink_linecard *linecard) { - struct devlink_trap_group_item *group_item; - - group_item = devlink_trap_group_item_lookup(devlink, group->name); - if (WARN_ON_ONCE(!group_item)) - return; - - devlink_trap_group_notify(devlink, group_item, - DEVLINK_CMD_TRAP_GROUP_DEL); 
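/*
 * Illustrative sketch, not part of this patch: how a driver might use the
 * devl_rate_* API documented above. "my_node_priv" and "my_port_priv" are
 * hypothetical driver-private pointers and error handling is abbreviated;
 * teardown would use devl_rate_leaf_destroy() and devl_rate_nodes_destroy().
 */
static int my_driver_rates_init(struct devlink *devlink,
				struct devlink_port *port,
				void *my_node_priv, void *my_port_priv)
{
	struct devlink_rate *node;
	int err;

	devl_lock(devlink);
	/* Create a scheduling node, then hang the port's leaf under it. */
	node = devl_rate_node_create(devlink, my_node_priv, "group0", NULL);
	if (IS_ERR(node)) {
		err = PTR_ERR(node);
		goto out_unlock;
	}
	err = devl_rate_leaf_create(port, my_port_priv, node);
out_unlock:
	devl_unlock(devlink);
	return err;
}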
- list_del(&group_item->list); - free_percpu(group_item->stats); - kfree(group_item); + kfree(linecard->types); } /** - * devl_trap_groups_register - Register packet trap groups with devlink. - * @devlink: devlink. - * @groups: Packet trap groups. - * @groups_count: Count of provided packet trap groups. + * devl_linecard_create - Create devlink linecard + * + * @devlink: devlink + * @linecard_index: driver-specific numerical identifier of the linecard + * @ops: linecards ops + * @priv: user priv pointer * - * Return: Non-zero value on failure. + * Create devlink linecard instance with provided linecard index. + * Caller can use any indexing, even hw-related one. + * + * Return: Line card structure or an ERR_PTR() encoded error code. */ -int devl_trap_groups_register(struct devlink *devlink, - const struct devlink_trap_group *groups, - size_t groups_count) +struct devlink_linecard * +devl_linecard_create(struct devlink *devlink, unsigned int linecard_index, + const struct devlink_linecard_ops *ops, void *priv) { - int i, err; + struct devlink_linecard *linecard; + int err; - devl_assert_locked(devlink); - for (i = 0; i < groups_count; i++) { - const struct devlink_trap_group *group = &groups[i]; + if (WARN_ON(!ops || !ops->provision || !ops->unprovision || + !ops->types_count || !ops->types_get)) + return ERR_PTR(-EINVAL); - err = devlink_trap_group_verify(group); - if (err) - goto err_trap_group_verify; + if (devlink_linecard_index_exists(devlink, linecard_index)) + return ERR_PTR(-EEXIST); - err = devlink_trap_group_register(devlink, group); - if (err) - goto err_trap_group_register; - } + linecard = kzalloc(sizeof(*linecard), GFP_KERNEL); + if (!linecard) + return ERR_PTR(-ENOMEM); - return 0; + linecard->devlink = devlink; + linecard->index = linecard_index; + linecard->ops = ops; + linecard->priv = priv; + linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; + mutex_init(&linecard->state_lock); -err_trap_group_register: -err_trap_group_verify: - for (i--; i >= 0; i--) - devlink_trap_group_unregister(devlink, &groups[i]); - return err; + err = devlink_linecard_types_init(linecard); + if (err) { + mutex_destroy(&linecard->state_lock); + kfree(linecard); + return ERR_PTR(err); + } + + list_add_tail(&linecard->list, &devlink->linecard_list); + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + return linecard; } -EXPORT_SYMBOL_GPL(devl_trap_groups_register); +EXPORT_SYMBOL_GPL(devl_linecard_create); /** - * devlink_trap_groups_register - Register packet trap groups with devlink. - * @devlink: devlink. - * @groups: Packet trap groups. - * @groups_count: Count of provided packet trap groups. - * - * Context: Takes and release devlink->lock . + * devl_linecard_destroy - Destroy devlink linecard * - * Return: Non-zero value on failure. + * @linecard: devlink linecard */ -int devlink_trap_groups_register(struct devlink *devlink, - const struct devlink_trap_group *groups, - size_t groups_count) +void devl_linecard_destroy(struct devlink_linecard *linecard) { - int err; - - devl_lock(devlink); - err = devl_trap_groups_register(devlink, groups, groups_count); - devl_unlock(devlink); - return err; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_DEL); + list_del(&linecard->list); + devlink_linecard_types_fini(linecard); + mutex_destroy(&linecard->state_lock); + kfree(linecard); } -EXPORT_SYMBOL_GPL(devlink_trap_groups_register); +EXPORT_SYMBOL_GPL(devl_linecard_destroy); /** - * devl_trap_groups_unregister - Unregister packet trap groups from devlink. - * @devlink: devlink. 
- * @groups: Packet trap groups. - * @groups_count: Count of provided packet trap groups. + * devlink_linecard_provision_set - Set provisioning on linecard + * + * @linecard: devlink linecard + * @type: linecard type + * + * This is either called directly from the provision() op call or + * as a result of the provision() op call asynchronously. */ -void devl_trap_groups_unregister(struct devlink *devlink, - const struct devlink_trap_group *groups, - size_t groups_count) +void devlink_linecard_provision_set(struct devlink_linecard *linecard, + const char *type) { - int i; - - devl_assert_locked(devlink); - for (i = groups_count - 1; i >= 0; i--) - devlink_trap_group_unregister(devlink, &groups[i]); + mutex_lock(&linecard->state_lock); + WARN_ON(linecard->type && strcmp(linecard->type, type)); + linecard->state = DEVLINK_LINECARD_STATE_PROVISIONED; + linecard->type = type; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + mutex_unlock(&linecard->state_lock); } -EXPORT_SYMBOL_GPL(devl_trap_groups_unregister); +EXPORT_SYMBOL_GPL(devlink_linecard_provision_set); /** - * devlink_trap_groups_unregister - Unregister packet trap groups from devlink. - * @devlink: devlink. - * @groups: Packet trap groups. - * @groups_count: Count of provided packet trap groups. + * devlink_linecard_provision_clear - Clear provisioning on linecard + * + * @linecard: devlink linecard * - * Context: Takes and release devlink->lock . + * This is either called directly from the unprovision() op call or + * as a result of the unprovision() op call asynchronously. */ -void devlink_trap_groups_unregister(struct devlink *devlink, - const struct devlink_trap_group *groups, - size_t groups_count) -{ - devl_lock(devlink); - devl_trap_groups_unregister(devlink, groups, groups_count); - devl_unlock(devlink); -} -EXPORT_SYMBOL_GPL(devlink_trap_groups_unregister); - -static void -devlink_trap_policer_notify(struct devlink *devlink, - const struct devlink_trap_policer_item *policer_item, - enum devlink_command cmd) -{ - struct sk_buff *msg; - int err; - - WARN_ON_ONCE(cmd != DEVLINK_CMD_TRAP_POLICER_NEW && - cmd != DEVLINK_CMD_TRAP_POLICER_DEL); - if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)) - return; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return; - - err = devlink_nl_trap_policer_fill(msg, devlink, policer_item, cmd, 0, - 0, 0); - if (err) { - nlmsg_free(msg); - return; - } - - genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), - msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); -} - -static void devlink_trap_policers_notify_register(struct devlink *devlink) -{ - struct devlink_trap_policer_item *policer_item; - - list_for_each_entry(policer_item, &devlink->trap_policer_list, list) - devlink_trap_policer_notify(devlink, policer_item, - DEVLINK_CMD_TRAP_POLICER_NEW); -} - -static void devlink_trap_policers_notify_unregister(struct devlink *devlink) +void devlink_linecard_provision_clear(struct devlink_linecard *linecard) { - struct devlink_trap_policer_item *policer_item; - - list_for_each_entry_reverse(policer_item, &devlink->trap_policer_list, - list) - devlink_trap_policer_notify(devlink, policer_item, - DEVLINK_CMD_TRAP_POLICER_DEL); + mutex_lock(&linecard->state_lock); + WARN_ON(linecard->nested_devlink); + linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; + linecard->type = NULL; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + mutex_unlock(&linecard->state_lock); } +EXPORT_SYMBOL_GPL(devlink_linecard_provision_clear); -static int 
-devlink_trap_policer_register(struct devlink *devlink, - const struct devlink_trap_policer *policer) +/** + * devlink_linecard_provision_fail - Fail provisioning on linecard + * + * @linecard: devlink linecard + * + * This is either called directly from the provision() op call or + * as a result of the provision() op call asynchronously. + */ +void devlink_linecard_provision_fail(struct devlink_linecard *linecard) { - struct devlink_trap_policer_item *policer_item; - int err; - - if (devlink_trap_policer_item_lookup(devlink, policer->id)) - return -EEXIST; - - policer_item = kzalloc(sizeof(*policer_item), GFP_KERNEL); - if (!policer_item) - return -ENOMEM; - - policer_item->policer = policer; - policer_item->rate = policer->init_rate; - policer_item->burst = policer->init_burst; - - if (devlink->ops->trap_policer_init) { - err = devlink->ops->trap_policer_init(devlink, policer); - if (err) - goto err_policer_init; - } - - list_add_tail(&policer_item->list, &devlink->trap_policer_list); - devlink_trap_policer_notify(devlink, policer_item, - DEVLINK_CMD_TRAP_POLICER_NEW); - - return 0; - -err_policer_init: - kfree(policer_item); - return err; + mutex_lock(&linecard->state_lock); + WARN_ON(linecard->nested_devlink); + linecard->state = DEVLINK_LINECARD_STATE_PROVISIONING_FAILED; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + mutex_unlock(&linecard->state_lock); } +EXPORT_SYMBOL_GPL(devlink_linecard_provision_fail); -static void -devlink_trap_policer_unregister(struct devlink *devlink, - const struct devlink_trap_policer *policer) +/** + * devlink_linecard_activate - Set linecard active + * + * @linecard: devlink linecard + */ +void devlink_linecard_activate(struct devlink_linecard *linecard) { - struct devlink_trap_policer_item *policer_item; - - policer_item = devlink_trap_policer_item_lookup(devlink, policer->id); - if (WARN_ON_ONCE(!policer_item)) - return; - - devlink_trap_policer_notify(devlink, policer_item, - DEVLINK_CMD_TRAP_POLICER_DEL); - list_del(&policer_item->list); - if (devlink->ops->trap_policer_fini) - devlink->ops->trap_policer_fini(devlink, policer); - kfree(policer_item); + mutex_lock(&linecard->state_lock); + WARN_ON(linecard->state != DEVLINK_LINECARD_STATE_PROVISIONED); + linecard->state = DEVLINK_LINECARD_STATE_ACTIVE; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + mutex_unlock(&linecard->state_lock); } +EXPORT_SYMBOL_GPL(devlink_linecard_activate); /** - * devl_trap_policers_register - Register packet trap policers with devlink. - * @devlink: devlink. - * @policers: Packet trap policers. - * @policers_count: Count of provided packet trap policers. + * devlink_linecard_deactivate - Set linecard inactive * - * Return: Non-zero value on failure. 
+ * @linecard: devlink linecard */ -int -devl_trap_policers_register(struct devlink *devlink, - const struct devlink_trap_policer *policers, - size_t policers_count) +void devlink_linecard_deactivate(struct devlink_linecard *linecard) { - int i, err; - - devl_assert_locked(devlink); - for (i = 0; i < policers_count; i++) { - const struct devlink_trap_policer *policer = &policers[i]; - - if (WARN_ON(policer->id == 0 || - policer->max_rate < policer->min_rate || - policer->max_burst < policer->min_burst)) { - err = -EINVAL; - goto err_trap_policer_verify; - } - - err = devlink_trap_policer_register(devlink, policer); - if (err) - goto err_trap_policer_register; + mutex_lock(&linecard->state_lock); + switch (linecard->state) { + case DEVLINK_LINECARD_STATE_ACTIVE: + linecard->state = DEVLINK_LINECARD_STATE_PROVISIONED; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + break; + case DEVLINK_LINECARD_STATE_UNPROVISIONING: + /* Line card is being deactivated as part + * of unprovisioning flow. + */ + break; + default: + WARN_ON(1); + break; } - return 0; - -err_trap_policer_register: -err_trap_policer_verify: - for (i--; i >= 0; i--) - devlink_trap_policer_unregister(devlink, &policers[i]); - return err; + mutex_unlock(&linecard->state_lock); } -EXPORT_SYMBOL_GPL(devl_trap_policers_register); +EXPORT_SYMBOL_GPL(devlink_linecard_deactivate); /** - * devl_trap_policers_unregister - Unregister packet trap policers from devlink. - * @devlink: devlink. - * @policers: Packet trap policers. - * @policers_count: Count of provided packet trap policers. + * devlink_linecard_nested_dl_set - Attach/detach nested devlink + * instance to linecard. + * + * @linecard: devlink linecard + * @nested_devlink: devlink instance to attach or NULL to detach */ -void -devl_trap_policers_unregister(struct devlink *devlink, - const struct devlink_trap_policer *policers, - size_t policers_count) +void devlink_linecard_nested_dl_set(struct devlink_linecard *linecard, + struct devlink *nested_devlink) { - int i; - - devl_assert_locked(devlink); - for (i = policers_count - 1; i >= 0; i--) - devlink_trap_policer_unregister(devlink, &policers[i]); + mutex_lock(&linecard->state_lock); + linecard->nested_devlink = nested_devlink; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + mutex_unlock(&linecard->state_lock); } -EXPORT_SYMBOL_GPL(devl_trap_policers_unregister); +EXPORT_SYMBOL_GPL(devlink_linecard_nested_dl_set); diff --git a/net/devlink/trap.c b/net/devlink/trap.c new file mode 100644 index 000000000000..c26bf9b29bca --- /dev/null +++ b/net/devlink/trap.c @@ -0,0 +1,1861 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * Copyright (c) 2016 Jiri Pirko + */ + +#include + +#include "devl_internal.h" + +struct devlink_stats { + u64_stats_t rx_bytes; + u64_stats_t rx_packets; + struct u64_stats_sync syncp; +}; + +/** + * struct devlink_trap_policer_item - Packet trap policer attributes. + * @policer: Immutable packet trap policer attributes. + * @rate: Rate in packets / sec. + * @burst: Burst size in packets. + * @list: trap_policer_list member. + * + * Describes packet trap policer attributes. Created by devlink during trap + * policer registration. + */ +struct devlink_trap_policer_item { + const struct devlink_trap_policer *policer; + u64 rate; + u64 burst; + struct list_head list; +}; + +/** + * struct devlink_trap_group_item - Packet trap group attributes. + * @group: Immutable packet trap group attributes. 
+ * @policer_item: Associated policer item. Can be NULL. + * @list: trap_group_list member. + * @stats: Trap group statistics. + * + * Describes packet trap group attributes. Created by devlink during trap + * group registration. + */ +struct devlink_trap_group_item { + const struct devlink_trap_group *group; + struct devlink_trap_policer_item *policer_item; + struct list_head list; + struct devlink_stats __percpu *stats; +}; + +/** + * struct devlink_trap_item - Packet trap attributes. + * @trap: Immutable packet trap attributes. + * @group_item: Associated group item. + * @list: trap_list member. + * @action: Trap action. + * @stats: Trap statistics. + * @priv: Driver private information. + * + * Describes both mutable and immutable packet trap attributes. Created by + * devlink during trap registration and used for all trap related operations. + */ +struct devlink_trap_item { + const struct devlink_trap *trap; + struct devlink_trap_group_item *group_item; + struct list_head list; + enum devlink_trap_action action; + struct devlink_stats __percpu *stats; + void *priv; +}; + +static struct devlink_trap_policer_item * +devlink_trap_policer_item_lookup(struct devlink *devlink, u32 id) +{ + struct devlink_trap_policer_item *policer_item; + + list_for_each_entry(policer_item, &devlink->trap_policer_list, list) { + if (policer_item->policer->id == id) + return policer_item; + } + + return NULL; +} + +static struct devlink_trap_item * +devlink_trap_item_lookup(struct devlink *devlink, const char *name) +{ + struct devlink_trap_item *trap_item; + + list_for_each_entry(trap_item, &devlink->trap_list, list) { + if (!strcmp(trap_item->trap->name, name)) + return trap_item; + } + + return NULL; +} + +static struct devlink_trap_item * +devlink_trap_item_get_from_info(struct devlink *devlink, + struct genl_info *info) +{ + struct nlattr *attr; + + if (!info->attrs[DEVLINK_ATTR_TRAP_NAME]) + return NULL; + attr = info->attrs[DEVLINK_ATTR_TRAP_NAME]; + + return devlink_trap_item_lookup(devlink, nla_data(attr)); +} + +static int +devlink_trap_action_get_from_info(struct genl_info *info, + enum devlink_trap_action *p_trap_action) +{ + u8 val; + + val = nla_get_u8(info->attrs[DEVLINK_ATTR_TRAP_ACTION]); + switch (val) { + case DEVLINK_TRAP_ACTION_DROP: + case DEVLINK_TRAP_ACTION_TRAP: + case DEVLINK_TRAP_ACTION_MIRROR: + *p_trap_action = val; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int devlink_trap_metadata_put(struct sk_buff *msg, + const struct devlink_trap *trap) +{ + struct nlattr *attr; + + attr = nla_nest_start(msg, DEVLINK_ATTR_TRAP_METADATA); + if (!attr) + return -EMSGSIZE; + + if ((trap->metadata_cap & DEVLINK_TRAP_METADATA_TYPE_F_IN_PORT) && + nla_put_flag(msg, DEVLINK_ATTR_TRAP_METADATA_TYPE_IN_PORT)) + goto nla_put_failure; + if ((trap->metadata_cap & DEVLINK_TRAP_METADATA_TYPE_F_FA_COOKIE) && + nla_put_flag(msg, DEVLINK_ATTR_TRAP_METADATA_TYPE_FA_COOKIE)) + goto nla_put_failure; + + nla_nest_end(msg, attr); + + return 0; + +nla_put_failure: + nla_nest_cancel(msg, attr); + return -EMSGSIZE; +} + +static void devlink_trap_stats_read(struct devlink_stats __percpu *trap_stats, + struct devlink_stats *stats) +{ + int i; + + memset(stats, 0, sizeof(*stats)); + for_each_possible_cpu(i) { + struct devlink_stats *cpu_stats; + u64 rx_packets, rx_bytes; + unsigned int start; + + cpu_stats = per_cpu_ptr(trap_stats, i); + do { + start = u64_stats_fetch_begin(&cpu_stats->syncp); + rx_packets = u64_stats_read(&cpu_stats->rx_packets); + rx_bytes = 
u64_stats_read(&cpu_stats->rx_bytes); + } while (u64_stats_fetch_retry(&cpu_stats->syncp, start)); + + u64_stats_add(&stats->rx_packets, rx_packets); + u64_stats_add(&stats->rx_bytes, rx_bytes); + } +} + +static int +devlink_trap_group_stats_put(struct sk_buff *msg, + struct devlink_stats __percpu *trap_stats) +{ + struct devlink_stats stats; + struct nlattr *attr; + + devlink_trap_stats_read(trap_stats, &stats); + + attr = nla_nest_start(msg, DEVLINK_ATTR_STATS); + if (!attr) + return -EMSGSIZE; + + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_PACKETS, + u64_stats_read(&stats.rx_packets), + DEVLINK_ATTR_PAD)) + goto nla_put_failure; + + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_BYTES, + u64_stats_read(&stats.rx_bytes), + DEVLINK_ATTR_PAD)) + goto nla_put_failure; + + nla_nest_end(msg, attr); + + return 0; + +nla_put_failure: + nla_nest_cancel(msg, attr); + return -EMSGSIZE; +} + +static int devlink_trap_stats_put(struct sk_buff *msg, struct devlink *devlink, + const struct devlink_trap_item *trap_item) +{ + struct devlink_stats stats; + struct nlattr *attr; + u64 drops = 0; + int err; + + if (devlink->ops->trap_drop_counter_get) { + err = devlink->ops->trap_drop_counter_get(devlink, + trap_item->trap, + &drops); + if (err) + return err; + } + + devlink_trap_stats_read(trap_item->stats, &stats); + + attr = nla_nest_start(msg, DEVLINK_ATTR_STATS); + if (!attr) + return -EMSGSIZE; + + if (devlink->ops->trap_drop_counter_get && + nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_DROPPED, drops, + DEVLINK_ATTR_PAD)) + goto nla_put_failure; + + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_PACKETS, + u64_stats_read(&stats.rx_packets), + DEVLINK_ATTR_PAD)) + goto nla_put_failure; + + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_BYTES, + u64_stats_read(&stats.rx_bytes), + DEVLINK_ATTR_PAD)) + goto nla_put_failure; + + nla_nest_end(msg, attr); + + return 0; + +nla_put_failure: + nla_nest_cancel(msg, attr); + return -EMSGSIZE; +} + +static int devlink_nl_trap_fill(struct sk_buff *msg, struct devlink *devlink, + const struct devlink_trap_item *trap_item, + enum devlink_command cmd, u32 portid, u32 seq, + int flags) +{ + struct devlink_trap_group_item *group_item = trap_item->group_item; + void *hdr; + int err; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + + if (nla_put_string(msg, DEVLINK_ATTR_TRAP_GROUP_NAME, + group_item->group->name)) + goto nla_put_failure; + + if (nla_put_string(msg, DEVLINK_ATTR_TRAP_NAME, trap_item->trap->name)) + goto nla_put_failure; + + if (nla_put_u8(msg, DEVLINK_ATTR_TRAP_TYPE, trap_item->trap->type)) + goto nla_put_failure; + + if (trap_item->trap->generic && + nla_put_flag(msg, DEVLINK_ATTR_TRAP_GENERIC)) + goto nla_put_failure; + + if (nla_put_u8(msg, DEVLINK_ATTR_TRAP_ACTION, trap_item->action)) + goto nla_put_failure; + + err = devlink_trap_metadata_put(msg, trap_item->trap); + if (err) + goto nla_put_failure; + + err = devlink_trap_stats_put(msg, devlink, trap_item); + if (err) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +int devlink_nl_trap_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct netlink_ext_ack *extack = info->extack; + struct devlink *devlink = info->user_ptr[0]; + struct devlink_trap_item *trap_item; + struct sk_buff *msg; + int err; + + if (list_empty(&devlink->trap_list)) + return -EOPNOTSUPP; + + 
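/*
 * For reference, a minimal sketch of the update side that pairs with the
 * read loop in devlink_trap_stats_read() above; the same pattern appears as
 * devlink_trap_stats_update() elsewhere in this file. Writers bump the
 * per-CPU counters inside a u64_stats_update_begin()/end() section, and the
 * reader retries with u64_stats_fetch_begin()/u64_stats_fetch_retry() until
 * it observes a consistent snapshot.
 */
static void example_trap_stats_update(struct devlink_stats __percpu *trap_stats,
				      size_t skb_len)
{
	struct devlink_stats *stats = this_cpu_ptr(trap_stats);

	u64_stats_update_begin(&stats->syncp);
	u64_stats_add(&stats->rx_bytes, skb_len);
	u64_stats_inc(&stats->rx_packets);
	u64_stats_update_end(&stats->syncp);
}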
trap_item = devlink_trap_item_get_from_info(devlink, info); + if (!trap_item) { + NL_SET_ERR_MSG(extack, "Device did not register this trap"); + return -ENOENT; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_trap_fill(msg, devlink, trap_item, + DEVLINK_CMD_TRAP_NEW, info->snd_portid, + info->snd_seq, 0); + if (err) + goto err_trap_fill; + + return genlmsg_reply(msg, info); + +err_trap_fill: + nlmsg_free(msg); + return err; +} + +static int devlink_nl_trap_get_dump_one(struct sk_buff *msg, + struct devlink *devlink, + struct netlink_callback *cb, int flags) +{ + struct devlink_nl_dump_state *state = devlink_dump_state(cb); + struct devlink_trap_item *trap_item; + int idx = 0; + int err = 0; + + list_for_each_entry(trap_item, &devlink->trap_list, list) { + if (idx < state->idx) { + idx++; + continue; + } + err = devlink_nl_trap_fill(msg, devlink, trap_item, + DEVLINK_CMD_TRAP_NEW, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, flags); + if (err) { + state->idx = idx; + break; + } + idx++; + } + + return err; +} + +int devlink_nl_trap_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) +{ + return devlink_nl_dumpit(skb, cb, devlink_nl_trap_get_dump_one); +} + +static int __devlink_trap_action_set(struct devlink *devlink, + struct devlink_trap_item *trap_item, + enum devlink_trap_action trap_action, + struct netlink_ext_ack *extack) +{ + int err; + + if (trap_item->action != trap_action && + trap_item->trap->type != DEVLINK_TRAP_TYPE_DROP) { + NL_SET_ERR_MSG(extack, "Cannot change action of non-drop traps. Skipping"); + return 0; + } + + err = devlink->ops->trap_action_set(devlink, trap_item->trap, + trap_action, extack); + if (err) + return err; + + trap_item->action = trap_action; + + return 0; +} + +static int devlink_trap_action_set(struct devlink *devlink, + struct devlink_trap_item *trap_item, + struct genl_info *info) +{ + enum devlink_trap_action trap_action; + int err; + + if (!info->attrs[DEVLINK_ATTR_TRAP_ACTION]) + return 0; + + err = devlink_trap_action_get_from_info(info, &trap_action); + if (err) { + NL_SET_ERR_MSG(info->extack, "Invalid trap action"); + return -EINVAL; + } + + return __devlink_trap_action_set(devlink, trap_item, trap_action, + info->extack); +} + +int devlink_nl_cmd_trap_set_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct netlink_ext_ack *extack = info->extack; + struct devlink *devlink = info->user_ptr[0]; + struct devlink_trap_item *trap_item; + + if (list_empty(&devlink->trap_list)) + return -EOPNOTSUPP; + + trap_item = devlink_trap_item_get_from_info(devlink, info); + if (!trap_item) { + NL_SET_ERR_MSG(extack, "Device did not register this trap"); + return -ENOENT; + } + + return devlink_trap_action_set(devlink, trap_item, info); +} + +static struct devlink_trap_group_item * +devlink_trap_group_item_lookup(struct devlink *devlink, const char *name) +{ + struct devlink_trap_group_item *group_item; + + list_for_each_entry(group_item, &devlink->trap_group_list, list) { + if (!strcmp(group_item->group->name, name)) + return group_item; + } + + return NULL; +} + +static struct devlink_trap_group_item * +devlink_trap_group_item_lookup_by_id(struct devlink *devlink, u16 id) +{ + struct devlink_trap_group_item *group_item; + + list_for_each_entry(group_item, &devlink->trap_group_list, list) { + if (group_item->group->id == id) + return group_item; + } + + return NULL; +} + +static struct devlink_trap_group_item * +devlink_trap_group_item_get_from_info(struct devlink *devlink, + 
struct genl_info *info) +{ + char *name; + + if (!info->attrs[DEVLINK_ATTR_TRAP_GROUP_NAME]) + return NULL; + name = nla_data(info->attrs[DEVLINK_ATTR_TRAP_GROUP_NAME]); + + return devlink_trap_group_item_lookup(devlink, name); +} + +static int +devlink_nl_trap_group_fill(struct sk_buff *msg, struct devlink *devlink, + const struct devlink_trap_group_item *group_item, + enum devlink_command cmd, u32 portid, u32 seq, + int flags) +{ + void *hdr; + int err; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + + if (nla_put_string(msg, DEVLINK_ATTR_TRAP_GROUP_NAME, + group_item->group->name)) + goto nla_put_failure; + + if (group_item->group->generic && + nla_put_flag(msg, DEVLINK_ATTR_TRAP_GENERIC)) + goto nla_put_failure; + + if (group_item->policer_item && + nla_put_u32(msg, DEVLINK_ATTR_TRAP_POLICER_ID, + group_item->policer_item->policer->id)) + goto nla_put_failure; + + err = devlink_trap_group_stats_put(msg, group_item->stats); + if (err) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +int devlink_nl_trap_group_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct netlink_ext_ack *extack = info->extack; + struct devlink *devlink = info->user_ptr[0]; + struct devlink_trap_group_item *group_item; + struct sk_buff *msg; + int err; + + if (list_empty(&devlink->trap_group_list)) + return -EOPNOTSUPP; + + group_item = devlink_trap_group_item_get_from_info(devlink, info); + if (!group_item) { + NL_SET_ERR_MSG(extack, "Device did not register this trap group"); + return -ENOENT; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_trap_group_fill(msg, devlink, group_item, + DEVLINK_CMD_TRAP_GROUP_NEW, + info->snd_portid, info->snd_seq, 0); + if (err) + goto err_trap_group_fill; + + return genlmsg_reply(msg, info); + +err_trap_group_fill: + nlmsg_free(msg); + return err; +} + +static int devlink_nl_trap_group_get_dump_one(struct sk_buff *msg, + struct devlink *devlink, + struct netlink_callback *cb, + int flags) +{ + struct devlink_nl_dump_state *state = devlink_dump_state(cb); + struct devlink_trap_group_item *group_item; + int idx = 0; + int err = 0; + + list_for_each_entry(group_item, &devlink->trap_group_list, list) { + if (idx < state->idx) { + idx++; + continue; + } + err = devlink_nl_trap_group_fill(msg, devlink, group_item, + DEVLINK_CMD_TRAP_GROUP_NEW, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, flags); + if (err) { + state->idx = idx; + break; + } + idx++; + } + + return err; +} + +int devlink_nl_trap_group_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return devlink_nl_dumpit(skb, cb, devlink_nl_trap_group_get_dump_one); +} + +static int +__devlink_trap_group_action_set(struct devlink *devlink, + struct devlink_trap_group_item *group_item, + enum devlink_trap_action trap_action, + struct netlink_ext_ack *extack) +{ + const char *group_name = group_item->group->name; + struct devlink_trap_item *trap_item; + int err; + + if (devlink->ops->trap_group_action_set) { + err = devlink->ops->trap_group_action_set(devlink, group_item->group, + trap_action, extack); + if (err) + return err; + + list_for_each_entry(trap_item, &devlink->trap_list, list) { + if (strcmp(trap_item->group_item->group->name, group_name)) + continue; + if (trap_item->action != trap_action && + 
trap_item->trap->type != DEVLINK_TRAP_TYPE_DROP) + continue; + trap_item->action = trap_action; + } + + return 0; + } + + list_for_each_entry(trap_item, &devlink->trap_list, list) { + if (strcmp(trap_item->group_item->group->name, group_name)) + continue; + err = __devlink_trap_action_set(devlink, trap_item, + trap_action, extack); + if (err) + return err; + } + + return 0; +} + +static int +devlink_trap_group_action_set(struct devlink *devlink, + struct devlink_trap_group_item *group_item, + struct genl_info *info, bool *p_modified) +{ + enum devlink_trap_action trap_action; + int err; + + if (!info->attrs[DEVLINK_ATTR_TRAP_ACTION]) + return 0; + + err = devlink_trap_action_get_from_info(info, &trap_action); + if (err) { + NL_SET_ERR_MSG(info->extack, "Invalid trap action"); + return -EINVAL; + } + + err = __devlink_trap_group_action_set(devlink, group_item, trap_action, + info->extack); + if (err) + return err; + + *p_modified = true; + + return 0; +} + +static int devlink_trap_group_set(struct devlink *devlink, + struct devlink_trap_group_item *group_item, + struct genl_info *info) +{ + struct devlink_trap_policer_item *policer_item; + struct netlink_ext_ack *extack = info->extack; + const struct devlink_trap_policer *policer; + struct nlattr **attrs = info->attrs; + u32 policer_id; + int err; + + if (!attrs[DEVLINK_ATTR_TRAP_POLICER_ID]) + return 0; + + if (!devlink->ops->trap_group_set) + return -EOPNOTSUPP; + + policer_id = nla_get_u32(attrs[DEVLINK_ATTR_TRAP_POLICER_ID]); + policer_item = devlink_trap_policer_item_lookup(devlink, policer_id); + if (policer_id && !policer_item) { + NL_SET_ERR_MSG(extack, "Device did not register this trap policer"); + return -ENOENT; + } + policer = policer_item ? policer_item->policer : NULL; + + err = devlink->ops->trap_group_set(devlink, group_item->group, policer, + extack); + if (err) + return err; + + group_item->policer_item = policer_item; + + return 0; +} + +int devlink_nl_cmd_trap_group_set_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct netlink_ext_ack *extack = info->extack; + struct devlink *devlink = info->user_ptr[0]; + struct devlink_trap_group_item *group_item; + bool modified = false; + int err; + + if (list_empty(&devlink->trap_group_list)) + return -EOPNOTSUPP; + + group_item = devlink_trap_group_item_get_from_info(devlink, info); + if (!group_item) { + NL_SET_ERR_MSG(extack, "Device did not register this trap group"); + return -ENOENT; + } + + err = devlink_trap_group_action_set(devlink, group_item, info, + &modified); + if (err) + return err; + + err = devlink_trap_group_set(devlink, group_item, info); + if (err) + goto err_trap_group_set; + + return 0; + +err_trap_group_set: + if (modified) + NL_SET_ERR_MSG(extack, "Trap group set failed, but some changes were committed already"); + return err; +} + +static struct devlink_trap_policer_item * +devlink_trap_policer_item_get_from_info(struct devlink *devlink, + struct genl_info *info) +{ + u32 id; + + if (!info->attrs[DEVLINK_ATTR_TRAP_POLICER_ID]) + return NULL; + id = nla_get_u32(info->attrs[DEVLINK_ATTR_TRAP_POLICER_ID]); + + return devlink_trap_policer_item_lookup(devlink, id); +} + +static int +devlink_trap_policer_stats_put(struct sk_buff *msg, struct devlink *devlink, + const struct devlink_trap_policer *policer) +{ + struct nlattr *attr; + u64 drops; + int err; + + if (!devlink->ops->trap_policer_counter_get) + return 0; + + err = devlink->ops->trap_policer_counter_get(devlink, policer, &drops); + if (err) + return err; + + attr = nla_nest_start(msg, 
DEVLINK_ATTR_STATS); + if (!attr) + return -EMSGSIZE; + + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_DROPPED, drops, + DEVLINK_ATTR_PAD)) + goto nla_put_failure; + + nla_nest_end(msg, attr); + + return 0; + +nla_put_failure: + nla_nest_cancel(msg, attr); + return -EMSGSIZE; +} + +static int +devlink_nl_trap_policer_fill(struct sk_buff *msg, struct devlink *devlink, + const struct devlink_trap_policer_item *policer_item, + enum devlink_command cmd, u32 portid, u32 seq, + int flags) +{ + void *hdr; + int err; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + + if (nla_put_u32(msg, DEVLINK_ATTR_TRAP_POLICER_ID, + policer_item->policer->id)) + goto nla_put_failure; + + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_TRAP_POLICER_RATE, + policer_item->rate, DEVLINK_ATTR_PAD)) + goto nla_put_failure; + + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_TRAP_POLICER_BURST, + policer_item->burst, DEVLINK_ATTR_PAD)) + goto nla_put_failure; + + err = devlink_trap_policer_stats_put(msg, devlink, + policer_item->policer); + if (err) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +int devlink_nl_trap_policer_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink_trap_policer_item *policer_item; + struct netlink_ext_ack *extack = info->extack; + struct devlink *devlink = info->user_ptr[0]; + struct sk_buff *msg; + int err; + + if (list_empty(&devlink->trap_policer_list)) + return -EOPNOTSUPP; + + policer_item = devlink_trap_policer_item_get_from_info(devlink, info); + if (!policer_item) { + NL_SET_ERR_MSG(extack, "Device did not register this trap policer"); + return -ENOENT; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_trap_policer_fill(msg, devlink, policer_item, + DEVLINK_CMD_TRAP_POLICER_NEW, + info->snd_portid, info->snd_seq, 0); + if (err) + goto err_trap_policer_fill; + + return genlmsg_reply(msg, info); + +err_trap_policer_fill: + nlmsg_free(msg); + return err; +} + +static int devlink_nl_trap_policer_get_dump_one(struct sk_buff *msg, + struct devlink *devlink, + struct netlink_callback *cb, + int flags) +{ + struct devlink_nl_dump_state *state = devlink_dump_state(cb); + struct devlink_trap_policer_item *policer_item; + int idx = 0; + int err = 0; + + list_for_each_entry(policer_item, &devlink->trap_policer_list, list) { + if (idx < state->idx) { + idx++; + continue; + } + err = devlink_nl_trap_policer_fill(msg, devlink, policer_item, + DEVLINK_CMD_TRAP_POLICER_NEW, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, flags); + if (err) { + state->idx = idx; + break; + } + idx++; + } + + return err; +} + +int devlink_nl_trap_policer_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return devlink_nl_dumpit(skb, cb, devlink_nl_trap_policer_get_dump_one); +} + +static int +devlink_trap_policer_set(struct devlink *devlink, + struct devlink_trap_policer_item *policer_item, + struct genl_info *info) +{ + struct netlink_ext_ack *extack = info->extack; + struct nlattr **attrs = info->attrs; + u64 rate, burst; + int err; + + rate = policer_item->rate; + burst = policer_item->burst; + + if (attrs[DEVLINK_ATTR_TRAP_POLICER_RATE]) + rate = nla_get_u64(attrs[DEVLINK_ATTR_TRAP_POLICER_RATE]); + + if (attrs[DEVLINK_ATTR_TRAP_POLICER_BURST]) + burst = nla_get_u64(attrs[DEVLINK_ATTR_TRAP_POLICER_BURST]); + + 
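/*
 * Illustrative sketch, not part of this patch: a driver-side policer
 * definition of the kind that devlink_trap_policer_set() validates (rate
 * and burst must stay within [min, max], and the id must be non-zero).
 * Field names match struct devlink_trap_policer as used in this file; the
 * values themselves are made up.
 */
static const struct devlink_trap_policer my_trap_policers[] = {
	{
		.id = 1,
		.init_rate = 1000,	/* packets / sec */
		.init_burst = 128,	/* packets */
		.min_rate = 1,
		.max_rate = 8000,
		.min_burst = 1,
		.max_burst = 512,
	},
};

/* Registered under the devlink instance lock, e.g.:
 * err = devl_trap_policers_register(devlink, my_trap_policers,
 *				     ARRAY_SIZE(my_trap_policers));
 */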
if (rate < policer_item->policer->min_rate) { + NL_SET_ERR_MSG(extack, "Policer rate lower than limit"); + return -EINVAL; + } + + if (rate > policer_item->policer->max_rate) { + NL_SET_ERR_MSG(extack, "Policer rate higher than limit"); + return -EINVAL; + } + + if (burst < policer_item->policer->min_burst) { + NL_SET_ERR_MSG(extack, "Policer burst size lower than limit"); + return -EINVAL; + } + + if (burst > policer_item->policer->max_burst) { + NL_SET_ERR_MSG(extack, "Policer burst size higher than limit"); + return -EINVAL; + } + + err = devlink->ops->trap_policer_set(devlink, policer_item->policer, + rate, burst, info->extack); + if (err) + return err; + + policer_item->rate = rate; + policer_item->burst = burst; + + return 0; +} + +int devlink_nl_cmd_trap_policer_set_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink_trap_policer_item *policer_item; + struct netlink_ext_ack *extack = info->extack; + struct devlink *devlink = info->user_ptr[0]; + + if (list_empty(&devlink->trap_policer_list)) + return -EOPNOTSUPP; + + if (!devlink->ops->trap_policer_set) + return -EOPNOTSUPP; + + policer_item = devlink_trap_policer_item_get_from_info(devlink, info); + if (!policer_item) { + NL_SET_ERR_MSG(extack, "Device did not register this trap policer"); + return -ENOENT; + } + + return devlink_trap_policer_set(devlink, policer_item, info); +} + +#define DEVLINK_TRAP(_id, _type) \ + { \ + .type = DEVLINK_TRAP_TYPE_##_type, \ + .id = DEVLINK_TRAP_GENERIC_ID_##_id, \ + .name = DEVLINK_TRAP_GENERIC_NAME_##_id, \ + } + +static const struct devlink_trap devlink_trap_generic[] = { + DEVLINK_TRAP(SMAC_MC, DROP), + DEVLINK_TRAP(VLAN_TAG_MISMATCH, DROP), + DEVLINK_TRAP(INGRESS_VLAN_FILTER, DROP), + DEVLINK_TRAP(INGRESS_STP_FILTER, DROP), + DEVLINK_TRAP(EMPTY_TX_LIST, DROP), + DEVLINK_TRAP(PORT_LOOPBACK_FILTER, DROP), + DEVLINK_TRAP(BLACKHOLE_ROUTE, DROP), + DEVLINK_TRAP(TTL_ERROR, EXCEPTION), + DEVLINK_TRAP(TAIL_DROP, DROP), + DEVLINK_TRAP(NON_IP_PACKET, DROP), + DEVLINK_TRAP(UC_DIP_MC_DMAC, DROP), + DEVLINK_TRAP(DIP_LB, DROP), + DEVLINK_TRAP(SIP_MC, DROP), + DEVLINK_TRAP(SIP_LB, DROP), + DEVLINK_TRAP(CORRUPTED_IP_HDR, DROP), + DEVLINK_TRAP(IPV4_SIP_BC, DROP), + DEVLINK_TRAP(IPV6_MC_DIP_RESERVED_SCOPE, DROP), + DEVLINK_TRAP(IPV6_MC_DIP_INTERFACE_LOCAL_SCOPE, DROP), + DEVLINK_TRAP(MTU_ERROR, EXCEPTION), + DEVLINK_TRAP(UNRESOLVED_NEIGH, EXCEPTION), + DEVLINK_TRAP(RPF, EXCEPTION), + DEVLINK_TRAP(REJECT_ROUTE, EXCEPTION), + DEVLINK_TRAP(IPV4_LPM_UNICAST_MISS, EXCEPTION), + DEVLINK_TRAP(IPV6_LPM_UNICAST_MISS, EXCEPTION), + DEVLINK_TRAP(NON_ROUTABLE, DROP), + DEVLINK_TRAP(DECAP_ERROR, EXCEPTION), + DEVLINK_TRAP(OVERLAY_SMAC_MC, DROP), + DEVLINK_TRAP(INGRESS_FLOW_ACTION_DROP, DROP), + DEVLINK_TRAP(EGRESS_FLOW_ACTION_DROP, DROP), + DEVLINK_TRAP(STP, CONTROL), + DEVLINK_TRAP(LACP, CONTROL), + DEVLINK_TRAP(LLDP, CONTROL), + DEVLINK_TRAP(IGMP_QUERY, CONTROL), + DEVLINK_TRAP(IGMP_V1_REPORT, CONTROL), + DEVLINK_TRAP(IGMP_V2_REPORT, CONTROL), + DEVLINK_TRAP(IGMP_V3_REPORT, CONTROL), + DEVLINK_TRAP(IGMP_V2_LEAVE, CONTROL), + DEVLINK_TRAP(MLD_QUERY, CONTROL), + DEVLINK_TRAP(MLD_V1_REPORT, CONTROL), + DEVLINK_TRAP(MLD_V2_REPORT, CONTROL), + DEVLINK_TRAP(MLD_V1_DONE, CONTROL), + DEVLINK_TRAP(IPV4_DHCP, CONTROL), + DEVLINK_TRAP(IPV6_DHCP, CONTROL), + DEVLINK_TRAP(ARP_REQUEST, CONTROL), + DEVLINK_TRAP(ARP_RESPONSE, CONTROL), + DEVLINK_TRAP(ARP_OVERLAY, CONTROL), + DEVLINK_TRAP(IPV6_NEIGH_SOLICIT, CONTROL), + DEVLINK_TRAP(IPV6_NEIGH_ADVERT, CONTROL), + DEVLINK_TRAP(IPV4_BFD, CONTROL), + 
DEVLINK_TRAP(IPV6_BFD, CONTROL), + DEVLINK_TRAP(IPV4_OSPF, CONTROL), + DEVLINK_TRAP(IPV6_OSPF, CONTROL), + DEVLINK_TRAP(IPV4_BGP, CONTROL), + DEVLINK_TRAP(IPV6_BGP, CONTROL), + DEVLINK_TRAP(IPV4_VRRP, CONTROL), + DEVLINK_TRAP(IPV6_VRRP, CONTROL), + DEVLINK_TRAP(IPV4_PIM, CONTROL), + DEVLINK_TRAP(IPV6_PIM, CONTROL), + DEVLINK_TRAP(UC_LB, CONTROL), + DEVLINK_TRAP(LOCAL_ROUTE, CONTROL), + DEVLINK_TRAP(EXTERNAL_ROUTE, CONTROL), + DEVLINK_TRAP(IPV6_UC_DIP_LINK_LOCAL_SCOPE, CONTROL), + DEVLINK_TRAP(IPV6_DIP_ALL_NODES, CONTROL), + DEVLINK_TRAP(IPV6_DIP_ALL_ROUTERS, CONTROL), + DEVLINK_TRAP(IPV6_ROUTER_SOLICIT, CONTROL), + DEVLINK_TRAP(IPV6_ROUTER_ADVERT, CONTROL), + DEVLINK_TRAP(IPV6_REDIRECT, CONTROL), + DEVLINK_TRAP(IPV4_ROUTER_ALERT, CONTROL), + DEVLINK_TRAP(IPV6_ROUTER_ALERT, CONTROL), + DEVLINK_TRAP(PTP_EVENT, CONTROL), + DEVLINK_TRAP(PTP_GENERAL, CONTROL), + DEVLINK_TRAP(FLOW_ACTION_SAMPLE, CONTROL), + DEVLINK_TRAP(FLOW_ACTION_TRAP, CONTROL), + DEVLINK_TRAP(EARLY_DROP, DROP), + DEVLINK_TRAP(VXLAN_PARSING, DROP), + DEVLINK_TRAP(LLC_SNAP_PARSING, DROP), + DEVLINK_TRAP(VLAN_PARSING, DROP), + DEVLINK_TRAP(PPPOE_PPP_PARSING, DROP), + DEVLINK_TRAP(MPLS_PARSING, DROP), + DEVLINK_TRAP(ARP_PARSING, DROP), + DEVLINK_TRAP(IP_1_PARSING, DROP), + DEVLINK_TRAP(IP_N_PARSING, DROP), + DEVLINK_TRAP(GRE_PARSING, DROP), + DEVLINK_TRAP(UDP_PARSING, DROP), + DEVLINK_TRAP(TCP_PARSING, DROP), + DEVLINK_TRAP(IPSEC_PARSING, DROP), + DEVLINK_TRAP(SCTP_PARSING, DROP), + DEVLINK_TRAP(DCCP_PARSING, DROP), + DEVLINK_TRAP(GTP_PARSING, DROP), + DEVLINK_TRAP(ESP_PARSING, DROP), + DEVLINK_TRAP(BLACKHOLE_NEXTHOP, DROP), + DEVLINK_TRAP(DMAC_FILTER, DROP), + DEVLINK_TRAP(EAPOL, CONTROL), + DEVLINK_TRAP(LOCKED_PORT, DROP), +}; + +#define DEVLINK_TRAP_GROUP(_id) \ + { \ + .id = DEVLINK_TRAP_GROUP_GENERIC_ID_##_id, \ + .name = DEVLINK_TRAP_GROUP_GENERIC_NAME_##_id, \ + } + +static const struct devlink_trap_group devlink_trap_group_generic[] = { + DEVLINK_TRAP_GROUP(L2_DROPS), + DEVLINK_TRAP_GROUP(L3_DROPS), + DEVLINK_TRAP_GROUP(L3_EXCEPTIONS), + DEVLINK_TRAP_GROUP(BUFFER_DROPS), + DEVLINK_TRAP_GROUP(TUNNEL_DROPS), + DEVLINK_TRAP_GROUP(ACL_DROPS), + DEVLINK_TRAP_GROUP(STP), + DEVLINK_TRAP_GROUP(LACP), + DEVLINK_TRAP_GROUP(LLDP), + DEVLINK_TRAP_GROUP(MC_SNOOPING), + DEVLINK_TRAP_GROUP(DHCP), + DEVLINK_TRAP_GROUP(NEIGH_DISCOVERY), + DEVLINK_TRAP_GROUP(BFD), + DEVLINK_TRAP_GROUP(OSPF), + DEVLINK_TRAP_GROUP(BGP), + DEVLINK_TRAP_GROUP(VRRP), + DEVLINK_TRAP_GROUP(PIM), + DEVLINK_TRAP_GROUP(UC_LB), + DEVLINK_TRAP_GROUP(LOCAL_DELIVERY), + DEVLINK_TRAP_GROUP(EXTERNAL_DELIVERY), + DEVLINK_TRAP_GROUP(IPV6), + DEVLINK_TRAP_GROUP(PTP_EVENT), + DEVLINK_TRAP_GROUP(PTP_GENERAL), + DEVLINK_TRAP_GROUP(ACL_SAMPLE), + DEVLINK_TRAP_GROUP(ACL_TRAP), + DEVLINK_TRAP_GROUP(PARSER_ERROR_DROPS), + DEVLINK_TRAP_GROUP(EAPOL), +}; + +static int devlink_trap_generic_verify(const struct devlink_trap *trap) +{ + if (trap->id > DEVLINK_TRAP_GENERIC_ID_MAX) + return -EINVAL; + + if (strcmp(trap->name, devlink_trap_generic[trap->id].name)) + return -EINVAL; + + if (trap->type != devlink_trap_generic[trap->id].type) + return -EINVAL; + + return 0; +} + +static int devlink_trap_driver_verify(const struct devlink_trap *trap) +{ + int i; + + if (trap->id <= DEVLINK_TRAP_GENERIC_ID_MAX) + return -EINVAL; + + for (i = 0; i < ARRAY_SIZE(devlink_trap_generic); i++) { + if (!strcmp(trap->name, devlink_trap_generic[i].name)) + return -EEXIST; + } + + return 0; +} + +static int devlink_trap_verify(const struct devlink_trap *trap) +{ + if (!trap || !trap->name) + return 
-EINVAL; + + if (trap->generic) + return devlink_trap_generic_verify(trap); + else + return devlink_trap_driver_verify(trap); +} + +static int +devlink_trap_group_generic_verify(const struct devlink_trap_group *group) +{ + if (group->id > DEVLINK_TRAP_GROUP_GENERIC_ID_MAX) + return -EINVAL; + + if (strcmp(group->name, devlink_trap_group_generic[group->id].name)) + return -EINVAL; + + return 0; +} + +static int +devlink_trap_group_driver_verify(const struct devlink_trap_group *group) +{ + int i; + + if (group->id <= DEVLINK_TRAP_GROUP_GENERIC_ID_MAX) + return -EINVAL; + + for (i = 0; i < ARRAY_SIZE(devlink_trap_group_generic); i++) { + if (!strcmp(group->name, devlink_trap_group_generic[i].name)) + return -EEXIST; + } + + return 0; +} + +static int devlink_trap_group_verify(const struct devlink_trap_group *group) +{ + if (group->generic) + return devlink_trap_group_generic_verify(group); + else + return devlink_trap_group_driver_verify(group); +} + +static void +devlink_trap_group_notify(struct devlink *devlink, + const struct devlink_trap_group_item *group_item, + enum devlink_command cmd) +{ + struct sk_buff *msg; + int err; + + WARN_ON_ONCE(cmd != DEVLINK_CMD_TRAP_GROUP_NEW && + cmd != DEVLINK_CMD_TRAP_GROUP_DEL); + if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)) + return; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + + err = devlink_nl_trap_group_fill(msg, devlink, group_item, cmd, 0, 0, + 0); + if (err) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), + msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); +} + +void devlink_trap_groups_notify_register(struct devlink *devlink) +{ + struct devlink_trap_group_item *group_item; + + list_for_each_entry(group_item, &devlink->trap_group_list, list) + devlink_trap_group_notify(devlink, group_item, + DEVLINK_CMD_TRAP_GROUP_NEW); +} + +void devlink_trap_groups_notify_unregister(struct devlink *devlink) +{ + struct devlink_trap_group_item *group_item; + + list_for_each_entry_reverse(group_item, &devlink->trap_group_list, list) + devlink_trap_group_notify(devlink, group_item, + DEVLINK_CMD_TRAP_GROUP_DEL); +} + +static int +devlink_trap_item_group_link(struct devlink *devlink, + struct devlink_trap_item *trap_item) +{ + u16 group_id = trap_item->trap->init_group_id; + struct devlink_trap_group_item *group_item; + + group_item = devlink_trap_group_item_lookup_by_id(devlink, group_id); + if (WARN_ON_ONCE(!group_item)) + return -EINVAL; + + trap_item->group_item = group_item; + + return 0; +} + +static void devlink_trap_notify(struct devlink *devlink, + const struct devlink_trap_item *trap_item, + enum devlink_command cmd) +{ + struct sk_buff *msg; + int err; + + WARN_ON_ONCE(cmd != DEVLINK_CMD_TRAP_NEW && + cmd != DEVLINK_CMD_TRAP_DEL); + if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)) + return; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + + err = devlink_nl_trap_fill(msg, devlink, trap_item, cmd, 0, 0, 0); + if (err) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), + msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); +} + +void devlink_traps_notify_register(struct devlink *devlink) +{ + struct devlink_trap_item *trap_item; + + list_for_each_entry(trap_item, &devlink->trap_list, list) + devlink_trap_notify(devlink, trap_item, DEVLINK_CMD_TRAP_NEW); +} + +void devlink_traps_notify_unregister(struct devlink *devlink) +{ + struct devlink_trap_item *trap_item; + + 
list_for_each_entry_reverse(trap_item, &devlink->trap_list, list) + devlink_trap_notify(devlink, trap_item, DEVLINK_CMD_TRAP_DEL); +} + +static int +devlink_trap_register(struct devlink *devlink, + const struct devlink_trap *trap, void *priv) +{ + struct devlink_trap_item *trap_item; + int err; + + if (devlink_trap_item_lookup(devlink, trap->name)) + return -EEXIST; + + trap_item = kzalloc(sizeof(*trap_item), GFP_KERNEL); + if (!trap_item) + return -ENOMEM; + + trap_item->stats = netdev_alloc_pcpu_stats(struct devlink_stats); + if (!trap_item->stats) { + err = -ENOMEM; + goto err_stats_alloc; + } + + trap_item->trap = trap; + trap_item->action = trap->init_action; + trap_item->priv = priv; + + err = devlink_trap_item_group_link(devlink, trap_item); + if (err) + goto err_group_link; + + err = devlink->ops->trap_init(devlink, trap, trap_item); + if (err) + goto err_trap_init; + + list_add_tail(&trap_item->list, &devlink->trap_list); + devlink_trap_notify(devlink, trap_item, DEVLINK_CMD_TRAP_NEW); + + return 0; + +err_trap_init: +err_group_link: + free_percpu(trap_item->stats); +err_stats_alloc: + kfree(trap_item); + return err; +} + +static void devlink_trap_unregister(struct devlink *devlink, + const struct devlink_trap *trap) +{ + struct devlink_trap_item *trap_item; + + trap_item = devlink_trap_item_lookup(devlink, trap->name); + if (WARN_ON_ONCE(!trap_item)) + return; + + devlink_trap_notify(devlink, trap_item, DEVLINK_CMD_TRAP_DEL); + list_del(&trap_item->list); + if (devlink->ops->trap_fini) + devlink->ops->trap_fini(devlink, trap, trap_item); + free_percpu(trap_item->stats); + kfree(trap_item); +} + +static void devlink_trap_disable(struct devlink *devlink, + const struct devlink_trap *trap) +{ + struct devlink_trap_item *trap_item; + + trap_item = devlink_trap_item_lookup(devlink, trap->name); + if (WARN_ON_ONCE(!trap_item)) + return; + + devlink->ops->trap_action_set(devlink, trap, DEVLINK_TRAP_ACTION_DROP, + NULL); + trap_item->action = DEVLINK_TRAP_ACTION_DROP; +} + +/** + * devl_traps_register - Register packet traps with devlink. + * @devlink: devlink. + * @traps: Packet traps. + * @traps_count: Count of provided packet traps. + * @priv: Driver private information. + * + * Return: Non-zero value on failure. + */ +int devl_traps_register(struct devlink *devlink, + const struct devlink_trap *traps, + size_t traps_count, void *priv) +{ + int i, err; + + if (!devlink->ops->trap_init || !devlink->ops->trap_action_set) + return -EINVAL; + + devl_assert_locked(devlink); + for (i = 0; i < traps_count; i++) { + const struct devlink_trap *trap = &traps[i]; + + err = devlink_trap_verify(trap); + if (err) + goto err_trap_verify; + + err = devlink_trap_register(devlink, trap, priv); + if (err) + goto err_trap_register; + } + + return 0; + +err_trap_register: +err_trap_verify: + for (i--; i >= 0; i--) + devlink_trap_unregister(devlink, &traps[i]); + return err; +} +EXPORT_SYMBOL_GPL(devl_traps_register); + +/** + * devlink_traps_register - Register packet traps with devlink. + * @devlink: devlink. + * @traps: Packet traps. + * @traps_count: Count of provided packet traps. + * @priv: Driver private information. + * + * Context: Takes and release devlink->lock . + * + * Return: Non-zero value on failure. 
+ */ +int devlink_traps_register(struct devlink *devlink, + const struct devlink_trap *traps, + size_t traps_count, void *priv) +{ + int err; + + devl_lock(devlink); + err = devl_traps_register(devlink, traps, traps_count, priv); + devl_unlock(devlink); + return err; +} +EXPORT_SYMBOL_GPL(devlink_traps_register); + +/** + * devl_traps_unregister - Unregister packet traps from devlink. + * @devlink: devlink. + * @traps: Packet traps. + * @traps_count: Count of provided packet traps. + */ +void devl_traps_unregister(struct devlink *devlink, + const struct devlink_trap *traps, + size_t traps_count) +{ + int i; + + devl_assert_locked(devlink); + /* Make sure we do not have any packets in-flight while unregistering + * traps by disabling all of them and waiting for a grace period. + */ + for (i = traps_count - 1; i >= 0; i--) + devlink_trap_disable(devlink, &traps[i]); + synchronize_rcu(); + for (i = traps_count - 1; i >= 0; i--) + devlink_trap_unregister(devlink, &traps[i]); +} +EXPORT_SYMBOL_GPL(devl_traps_unregister); + +/** + * devlink_traps_unregister - Unregister packet traps from devlink. + * @devlink: devlink. + * @traps: Packet traps. + * @traps_count: Count of provided packet traps. + * + * Context: Takes and release devlink->lock . + */ +void devlink_traps_unregister(struct devlink *devlink, + const struct devlink_trap *traps, + size_t traps_count) +{ + devl_lock(devlink); + devl_traps_unregister(devlink, traps, traps_count); + devl_unlock(devlink); +} +EXPORT_SYMBOL_GPL(devlink_traps_unregister); + +static void +devlink_trap_stats_update(struct devlink_stats __percpu *trap_stats, + size_t skb_len) +{ + struct devlink_stats *stats; + + stats = this_cpu_ptr(trap_stats); + u64_stats_update_begin(&stats->syncp); + u64_stats_add(&stats->rx_bytes, skb_len); + u64_stats_inc(&stats->rx_packets); + u64_stats_update_end(&stats->syncp); +} + +static void +devlink_trap_report_metadata_set(struct devlink_trap_metadata *metadata, + const struct devlink_trap_item *trap_item, + struct devlink_port *in_devlink_port, + const struct flow_action_cookie *fa_cookie) +{ + metadata->trap_name = trap_item->trap->name; + metadata->trap_group_name = trap_item->group_item->group->name; + metadata->fa_cookie = fa_cookie; + metadata->trap_type = trap_item->trap->type; + + spin_lock(&in_devlink_port->type_lock); + if (in_devlink_port->type == DEVLINK_PORT_TYPE_ETH) + metadata->input_dev = in_devlink_port->type_eth.netdev; + spin_unlock(&in_devlink_port->type_lock); +} + +/** + * devlink_trap_report - Report trapped packet to drop monitor. + * @devlink: devlink. + * @skb: Trapped packet. + * @trap_ctx: Trap context. + * @in_devlink_port: Input devlink port. + * @fa_cookie: Flow action cookie. Could be NULL. + */ +void devlink_trap_report(struct devlink *devlink, struct sk_buff *skb, + void *trap_ctx, struct devlink_port *in_devlink_port, + const struct flow_action_cookie *fa_cookie) + +{ + struct devlink_trap_item *trap_item = trap_ctx; + + devlink_trap_stats_update(trap_item->stats, skb->len); + devlink_trap_stats_update(trap_item->group_item->stats, skb->len); + + if (tracepoint_enabled(devlink_trap_report)) { + struct devlink_trap_metadata metadata = {}; + + devlink_trap_report_metadata_set(&metadata, trap_item, + in_devlink_port, fa_cookie); + trace_devlink_trap_report(devlink, skb, &metadata); + } +} +EXPORT_SYMBOL_GPL(devlink_trap_report); + +/** + * devlink_trap_ctx_priv - Trap context to driver private information. + * @trap_ctx: Trap context. 
+ * + * Return: Driver private information passed during registration. + */ +void *devlink_trap_ctx_priv(void *trap_ctx) +{ + struct devlink_trap_item *trap_item = trap_ctx; + + return trap_item->priv; +} +EXPORT_SYMBOL_GPL(devlink_trap_ctx_priv); + +static int +devlink_trap_group_item_policer_link(struct devlink *devlink, + struct devlink_trap_group_item *group_item) +{ + u32 policer_id = group_item->group->init_policer_id; + struct devlink_trap_policer_item *policer_item; + + if (policer_id == 0) + return 0; + + policer_item = devlink_trap_policer_item_lookup(devlink, policer_id); + if (WARN_ON_ONCE(!policer_item)) + return -EINVAL; + + group_item->policer_item = policer_item; + + return 0; +} + +static int +devlink_trap_group_register(struct devlink *devlink, + const struct devlink_trap_group *group) +{ + struct devlink_trap_group_item *group_item; + int err; + + if (devlink_trap_group_item_lookup(devlink, group->name)) + return -EEXIST; + + group_item = kzalloc(sizeof(*group_item), GFP_KERNEL); + if (!group_item) + return -ENOMEM; + + group_item->stats = netdev_alloc_pcpu_stats(struct devlink_stats); + if (!group_item->stats) { + err = -ENOMEM; + goto err_stats_alloc; + } + + group_item->group = group; + + err = devlink_trap_group_item_policer_link(devlink, group_item); + if (err) + goto err_policer_link; + + if (devlink->ops->trap_group_init) { + err = devlink->ops->trap_group_init(devlink, group); + if (err) + goto err_group_init; + } + + list_add_tail(&group_item->list, &devlink->trap_group_list); + devlink_trap_group_notify(devlink, group_item, + DEVLINK_CMD_TRAP_GROUP_NEW); + + return 0; + +err_group_init: +err_policer_link: + free_percpu(group_item->stats); +err_stats_alloc: + kfree(group_item); + return err; +} + +static void +devlink_trap_group_unregister(struct devlink *devlink, + const struct devlink_trap_group *group) +{ + struct devlink_trap_group_item *group_item; + + group_item = devlink_trap_group_item_lookup(devlink, group->name); + if (WARN_ON_ONCE(!group_item)) + return; + + devlink_trap_group_notify(devlink, group_item, + DEVLINK_CMD_TRAP_GROUP_DEL); + list_del(&group_item->list); + free_percpu(group_item->stats); + kfree(group_item); +} + +/** + * devl_trap_groups_register - Register packet trap groups with devlink. + * @devlink: devlink. + * @groups: Packet trap groups. + * @groups_count: Count of provided packet trap groups. + * + * Return: Non-zero value on failure. + */ +int devl_trap_groups_register(struct devlink *devlink, + const struct devlink_trap_group *groups, + size_t groups_count) +{ + int i, err; + + devl_assert_locked(devlink); + for (i = 0; i < groups_count; i++) { + const struct devlink_trap_group *group = &groups[i]; + + err = devlink_trap_group_verify(group); + if (err) + goto err_trap_group_verify; + + err = devlink_trap_group_register(devlink, group); + if (err) + goto err_trap_group_register; + } + + return 0; + +err_trap_group_register: +err_trap_group_verify: + for (i--; i >= 0; i--) + devlink_trap_group_unregister(devlink, &groups[i]); + return err; +} +EXPORT_SYMBOL_GPL(devl_trap_groups_register); + +/** + * devlink_trap_groups_register - Register packet trap groups with devlink. + * @devlink: devlink. + * @groups: Packet trap groups. + * @groups_count: Count of provided packet trap groups. + * + * Context: Takes and release devlink->lock . + * + * Return: Non-zero value on failure. 
+ */ +int devlink_trap_groups_register(struct devlink *devlink, + const struct devlink_trap_group *groups, + size_t groups_count) +{ + int err; + + devl_lock(devlink); + err = devl_trap_groups_register(devlink, groups, groups_count); + devl_unlock(devlink); + return err; +} +EXPORT_SYMBOL_GPL(devlink_trap_groups_register); + +/** + * devl_trap_groups_unregister - Unregister packet trap groups from devlink. + * @devlink: devlink. + * @groups: Packet trap groups. + * @groups_count: Count of provided packet trap groups. + */ +void devl_trap_groups_unregister(struct devlink *devlink, + const struct devlink_trap_group *groups, + size_t groups_count) +{ + int i; + + devl_assert_locked(devlink); + for (i = groups_count - 1; i >= 0; i--) + devlink_trap_group_unregister(devlink, &groups[i]); +} +EXPORT_SYMBOL_GPL(devl_trap_groups_unregister); + +/** + * devlink_trap_groups_unregister - Unregister packet trap groups from devlink. + * @devlink: devlink. + * @groups: Packet trap groups. + * @groups_count: Count of provided packet trap groups. + * + * Context: Takes and release devlink->lock . + */ +void devlink_trap_groups_unregister(struct devlink *devlink, + const struct devlink_trap_group *groups, + size_t groups_count) +{ + devl_lock(devlink); + devl_trap_groups_unregister(devlink, groups, groups_count); + devl_unlock(devlink); +} +EXPORT_SYMBOL_GPL(devlink_trap_groups_unregister); + +static void +devlink_trap_policer_notify(struct devlink *devlink, + const struct devlink_trap_policer_item *policer_item, + enum devlink_command cmd) +{ + struct sk_buff *msg; + int err; + + WARN_ON_ONCE(cmd != DEVLINK_CMD_TRAP_POLICER_NEW && + cmd != DEVLINK_CMD_TRAP_POLICER_DEL); + if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)) + return; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + + err = devlink_nl_trap_policer_fill(msg, devlink, policer_item, cmd, 0, + 0, 0); + if (err) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), + msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); +} + +void devlink_trap_policers_notify_register(struct devlink *devlink) +{ + struct devlink_trap_policer_item *policer_item; + + list_for_each_entry(policer_item, &devlink->trap_policer_list, list) + devlink_trap_policer_notify(devlink, policer_item, + DEVLINK_CMD_TRAP_POLICER_NEW); +} + +void devlink_trap_policers_notify_unregister(struct devlink *devlink) +{ + struct devlink_trap_policer_item *policer_item; + + list_for_each_entry_reverse(policer_item, &devlink->trap_policer_list, + list) + devlink_trap_policer_notify(devlink, policer_item, + DEVLINK_CMD_TRAP_POLICER_DEL); +} + +static int +devlink_trap_policer_register(struct devlink *devlink, + const struct devlink_trap_policer *policer) +{ + struct devlink_trap_policer_item *policer_item; + int err; + + if (devlink_trap_policer_item_lookup(devlink, policer->id)) + return -EEXIST; + + policer_item = kzalloc(sizeof(*policer_item), GFP_KERNEL); + if (!policer_item) + return -ENOMEM; + + policer_item->policer = policer; + policer_item->rate = policer->init_rate; + policer_item->burst = policer->init_burst; + + if (devlink->ops->trap_policer_init) { + err = devlink->ops->trap_policer_init(devlink, policer); + if (err) + goto err_policer_init; + } + + list_add_tail(&policer_item->list, &devlink->trap_policer_list); + devlink_trap_policer_notify(devlink, policer_item, + DEVLINK_CMD_TRAP_POLICER_NEW); + + return 0; + +err_policer_init: + kfree(policer_item); + return err; +} + +static void 
+devlink_trap_policer_unregister(struct devlink *devlink, + const struct devlink_trap_policer *policer) +{ + struct devlink_trap_policer_item *policer_item; + + policer_item = devlink_trap_policer_item_lookup(devlink, policer->id); + if (WARN_ON_ONCE(!policer_item)) + return; + + devlink_trap_policer_notify(devlink, policer_item, + DEVLINK_CMD_TRAP_POLICER_DEL); + list_del(&policer_item->list); + if (devlink->ops->trap_policer_fini) + devlink->ops->trap_policer_fini(devlink, policer); + kfree(policer_item); +} + +/** + * devl_trap_policers_register - Register packet trap policers with devlink. + * @devlink: devlink. + * @policers: Packet trap policers. + * @policers_count: Count of provided packet trap policers. + * + * Return: Non-zero value on failure. + */ +int +devl_trap_policers_register(struct devlink *devlink, + const struct devlink_trap_policer *policers, + size_t policers_count) +{ + int i, err; + + devl_assert_locked(devlink); + for (i = 0; i < policers_count; i++) { + const struct devlink_trap_policer *policer = &policers[i]; + + if (WARN_ON(policer->id == 0 || + policer->max_rate < policer->min_rate || + policer->max_burst < policer->min_burst)) { + err = -EINVAL; + goto err_trap_policer_verify; + } + + err = devlink_trap_policer_register(devlink, policer); + if (err) + goto err_trap_policer_register; + } + return 0; + +err_trap_policer_register: +err_trap_policer_verify: + for (i--; i >= 0; i--) + devlink_trap_policer_unregister(devlink, &policers[i]); + return err; +} +EXPORT_SYMBOL_GPL(devl_trap_policers_register); + +/** + * devl_trap_policers_unregister - Unregister packet trap policers from devlink. + * @devlink: devlink. + * @policers: Packet trap policers. + * @policers_count: Count of provided packet trap policers. + */ +void +devl_trap_policers_unregister(struct devlink *devlink, + const struct devlink_trap_policer *policers, + size_t policers_count) +{ + int i; + + devl_assert_locked(devlink); + for (i = policers_count - 1; i >= 0; i--) + devlink_trap_policer_unregister(devlink, &policers[i]); +} +EXPORT_SYMBOL_GPL(devl_trap_policers_unregister); -- cgit v1.2.3 From 7cc7194e85ca01185f9d123e68189ef0a40f0c6a Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 28 Aug 2023 08:16:53 +0200 Subject: devlink: push rate related code into separate file Cut out another chunk from leftover.c and put rate related code into a separate file. 
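As a point of reference (an editorial addition, not text from this commit): the relocated devl_rate_node_create() and devl_rate_leaf_create() entry points keep the signatures visible in the leftover.c hunk below, and both expect the devlink instance lock to be held. A minimal, hypothetical driver-side sketch of typical usage, with the foo_* names being illustrative only:

static int foo_port_rates_init(struct devlink *devlink,
			       struct devlink_port *port, void *priv)
{
	char node_name[] = "foo_group";
	struct devlink_rate *node;

	devl_assert_locked(devlink);

	/* Create a named rate node first, then attach the port's leaf
	 * rate object to it as a child.
	 */
	node = devl_rate_node_create(devlink, priv, node_name, NULL);
	if (IS_ERR(node))
		return PTR_ERR(node);

	return devl_rate_leaf_create(port, priv, node);
}

On teardown such a driver would drop the leaf with devl_rate_leaf_destroy() and rely on devl_rate_nodes_destroy() for any remaining nodes; those helpers are part of the code moved into rate.c by this patch as well.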
Signed-off-by: Jiri Pirko Link: https://lore.kernel.org/r/20230828061657.300667-12-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- net/devlink/Makefile | 2 +- net/devlink/devl_internal.h | 5 + net/devlink/leftover.c | 718 ------------------------------------------- net/devlink/rate.c | 722 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 728 insertions(+), 719 deletions(-) create mode 100644 net/devlink/rate.c (limited to 'net') diff --git a/net/devlink/Makefile b/net/devlink/Makefile index 523efffe522c..f61aa97688f5 100644 --- a/net/devlink/Makefile +++ b/net/devlink/Makefile @@ -1,4 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 obj-y := leftover.o core.o netlink.o netlink_gen.o dev.o port.o sb.o dpipe.o \ - resource.o param.o region.o health.o trap.o + resource.o param.o region.o health.o trap.o rate.o diff --git a/net/devlink/devl_internal.h b/net/devlink/devl_internal.h index 148525aa2847..8b530f15c439 100644 --- a/net/devlink/devl_internal.h +++ b/net/devlink/devl_internal.h @@ -166,6 +166,8 @@ void devlink_trap_groups_notify_register(struct devlink *devlink); void devlink_trap_groups_notify_unregister(struct devlink *devlink); void devlink_traps_notify_register(struct devlink *devlink); void devlink_traps_notify_unregister(struct devlink *devlink); +void devlink_rates_notify_register(struct devlink *devlink); +void devlink_rates_notify_unregister(struct devlink *devlink); /* Ports */ #define ASSERT_DEVLINK_PORT_INITIALIZED(devlink_port) \ @@ -278,3 +280,6 @@ int devlink_nl_cmd_trap_group_set_doit(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_trap_policer_set_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_cmd_rate_set_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_cmd_rate_new_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_cmd_rate_del_doit(struct sk_buff *skb, struct genl_info *info); diff --git a/net/devlink/leftover.c b/net/devlink/leftover.c index 5b153ce097ab..13958c3da59a 100644 --- a/net/devlink/leftover.c +++ b/net/devlink/leftover.c @@ -37,80 +37,6 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwmsg); EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwerr); EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_trap_report); -static inline bool -devlink_rate_is_leaf(struct devlink_rate *devlink_rate) -{ - return devlink_rate->type == DEVLINK_RATE_TYPE_LEAF; -} - -static inline bool -devlink_rate_is_node(struct devlink_rate *devlink_rate) -{ - return devlink_rate->type == DEVLINK_RATE_TYPE_NODE; -} - -static struct devlink_rate * -devlink_rate_leaf_get_from_info(struct devlink *devlink, struct genl_info *info) -{ - struct devlink_rate *devlink_rate; - struct devlink_port *devlink_port; - - devlink_port = devlink_port_get_from_attrs(devlink, info->attrs); - if (IS_ERR(devlink_port)) - return ERR_CAST(devlink_port); - devlink_rate = devlink_port->devlink_rate; - return devlink_rate ?: ERR_PTR(-ENODEV); -} - -static struct devlink_rate * -devlink_rate_node_get_by_name(struct devlink *devlink, const char *node_name) -{ - static struct devlink_rate *devlink_rate; - - list_for_each_entry(devlink_rate, &devlink->rate_list, list) { - if (devlink_rate_is_node(devlink_rate) && - !strcmp(node_name, devlink_rate->name)) - return devlink_rate; - } - return ERR_PTR(-ENODEV); -} - -static struct devlink_rate * -devlink_rate_node_get_from_attrs(struct devlink *devlink, struct nlattr **attrs) -{ - const char *rate_node_name; - size_t len; - - if (!attrs[DEVLINK_ATTR_RATE_NODE_NAME]) - return ERR_PTR(-EINVAL); - rate_node_name = 
nla_data(attrs[DEVLINK_ATTR_RATE_NODE_NAME]); - len = strlen(rate_node_name); - /* Name cannot be empty or decimal number */ - if (!len || strspn(rate_node_name, "0123456789") == len) - return ERR_PTR(-EINVAL); - - return devlink_rate_node_get_by_name(devlink, rate_node_name); -} - -static struct devlink_rate * -devlink_rate_node_get_from_info(struct devlink *devlink, struct genl_info *info) -{ - return devlink_rate_node_get_from_attrs(devlink, info->attrs); -} - -static struct devlink_rate * -devlink_rate_get_from_info(struct devlink *devlink, struct genl_info *info) -{ - struct nlattr **attrs = info->attrs; - - if (attrs[DEVLINK_ATTR_PORT_INDEX]) - return devlink_rate_leaf_get_from_info(devlink, info); - else if (attrs[DEVLINK_ATTR_RATE_NODE_NAME]) - return devlink_rate_node_get_from_info(devlink, info); - else - return ERR_PTR(-EINVAL); -} - static struct devlink_linecard * devlink_linecard_get_by_index(struct devlink *devlink, unsigned int linecard_index) @@ -169,491 +95,6 @@ nla_put_failure: return -EMSGSIZE; } -static int devlink_nl_rate_fill(struct sk_buff *msg, - struct devlink_rate *devlink_rate, - enum devlink_command cmd, u32 portid, u32 seq, - int flags, struct netlink_ext_ack *extack) -{ - struct devlink *devlink = devlink_rate->devlink; - void *hdr; - - hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); - if (!hdr) - return -EMSGSIZE; - - if (devlink_nl_put_handle(msg, devlink)) - goto nla_put_failure; - - if (nla_put_u16(msg, DEVLINK_ATTR_RATE_TYPE, devlink_rate->type)) - goto nla_put_failure; - - if (devlink_rate_is_leaf(devlink_rate)) { - if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, - devlink_rate->devlink_port->index)) - goto nla_put_failure; - } else if (devlink_rate_is_node(devlink_rate)) { - if (nla_put_string(msg, DEVLINK_ATTR_RATE_NODE_NAME, - devlink_rate->name)) - goto nla_put_failure; - } - - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_RATE_TX_SHARE, - devlink_rate->tx_share, DEVLINK_ATTR_PAD)) - goto nla_put_failure; - - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_RATE_TX_MAX, - devlink_rate->tx_max, DEVLINK_ATTR_PAD)) - goto nla_put_failure; - - if (nla_put_u32(msg, DEVLINK_ATTR_RATE_TX_PRIORITY, - devlink_rate->tx_priority)) - goto nla_put_failure; - - if (nla_put_u32(msg, DEVLINK_ATTR_RATE_TX_WEIGHT, - devlink_rate->tx_weight)) - goto nla_put_failure; - - if (devlink_rate->parent) - if (nla_put_string(msg, DEVLINK_ATTR_RATE_PARENT_NODE_NAME, - devlink_rate->parent->name)) - goto nla_put_failure; - - genlmsg_end(msg, hdr); - return 0; - -nla_put_failure: - genlmsg_cancel(msg, hdr); - return -EMSGSIZE; -} - -static void devlink_rate_notify(struct devlink_rate *devlink_rate, - enum devlink_command cmd) -{ - struct devlink *devlink = devlink_rate->devlink; - struct sk_buff *msg; - int err; - - WARN_ON(cmd != DEVLINK_CMD_RATE_NEW && cmd != DEVLINK_CMD_RATE_DEL); - - if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)) - return; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return; - - err = devlink_nl_rate_fill(msg, devlink_rate, cmd, 0, 0, 0, NULL); - if (err) { - nlmsg_free(msg); - return; - } - - genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), msg, - 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); -} - -static void devlink_rates_notify_register(struct devlink *devlink) -{ - struct devlink_rate *rate_node; - - list_for_each_entry(rate_node, &devlink->rate_list, list) - devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_NEW); -} - -static void devlink_rates_notify_unregister(struct devlink *devlink) -{ - struct 
devlink_rate *rate_node; - - list_for_each_entry_reverse(rate_node, &devlink->rate_list, list) - devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_DEL); -} - -static int -devlink_nl_rate_get_dump_one(struct sk_buff *msg, struct devlink *devlink, - struct netlink_callback *cb, int flags) -{ - struct devlink_nl_dump_state *state = devlink_dump_state(cb); - struct devlink_rate *devlink_rate; - int idx = 0; - int err = 0; - - list_for_each_entry(devlink_rate, &devlink->rate_list, list) { - enum devlink_command cmd = DEVLINK_CMD_RATE_NEW; - u32 id = NETLINK_CB(cb->skb).portid; - - if (idx < state->idx) { - idx++; - continue; - } - err = devlink_nl_rate_fill(msg, devlink_rate, cmd, id, - cb->nlh->nlmsg_seq, flags, NULL); - if (err) { - state->idx = idx; - break; - } - idx++; - } - - return err; -} - -int devlink_nl_rate_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) -{ - return devlink_nl_dumpit(skb, cb, devlink_nl_rate_get_dump_one); -} - -int devlink_nl_rate_get_doit(struct sk_buff *skb, struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - struct devlink_rate *devlink_rate; - struct sk_buff *msg; - int err; - - devlink_rate = devlink_rate_get_from_info(devlink, info); - if (IS_ERR(devlink_rate)) - return PTR_ERR(devlink_rate); - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - err = devlink_nl_rate_fill(msg, devlink_rate, DEVLINK_CMD_RATE_NEW, - info->snd_portid, info->snd_seq, 0, - info->extack); - if (err) { - nlmsg_free(msg); - return err; - } - - return genlmsg_reply(msg, info); -} - -static bool -devlink_rate_is_parent_node(struct devlink_rate *devlink_rate, - struct devlink_rate *parent) -{ - while (parent) { - if (parent == devlink_rate) - return true; - parent = parent->parent; - } - return false; -} - -static int -devlink_nl_rate_parent_node_set(struct devlink_rate *devlink_rate, - struct genl_info *info, - struct nlattr *nla_parent) -{ - struct devlink *devlink = devlink_rate->devlink; - const char *parent_name = nla_data(nla_parent); - const struct devlink_ops *ops = devlink->ops; - size_t len = strlen(parent_name); - struct devlink_rate *parent; - int err = -EOPNOTSUPP; - - parent = devlink_rate->parent; - - if (parent && !len) { - if (devlink_rate_is_leaf(devlink_rate)) - err = ops->rate_leaf_parent_set(devlink_rate, NULL, - devlink_rate->priv, NULL, - info->extack); - else if (devlink_rate_is_node(devlink_rate)) - err = ops->rate_node_parent_set(devlink_rate, NULL, - devlink_rate->priv, NULL, - info->extack); - if (err) - return err; - - refcount_dec(&parent->refcnt); - devlink_rate->parent = NULL; - } else if (len) { - parent = devlink_rate_node_get_by_name(devlink, parent_name); - if (IS_ERR(parent)) - return -ENODEV; - - if (parent == devlink_rate) { - NL_SET_ERR_MSG(info->extack, "Parent to self is not allowed"); - return -EINVAL; - } - - if (devlink_rate_is_node(devlink_rate) && - devlink_rate_is_parent_node(devlink_rate, parent->parent)) { - NL_SET_ERR_MSG(info->extack, "Node is already a parent of parent node."); - return -EEXIST; - } - - if (devlink_rate_is_leaf(devlink_rate)) - err = ops->rate_leaf_parent_set(devlink_rate, parent, - devlink_rate->priv, parent->priv, - info->extack); - else if (devlink_rate_is_node(devlink_rate)) - err = ops->rate_node_parent_set(devlink_rate, parent, - devlink_rate->priv, parent->priv, - info->extack); - if (err) - return err; - - if (devlink_rate->parent) - /* we're reassigning to other parent in this case */ - refcount_dec(&devlink_rate->parent->refcnt); - - 
refcount_inc(&parent->refcnt); - devlink_rate->parent = parent; - } - - return 0; -} - -static int devlink_nl_rate_set(struct devlink_rate *devlink_rate, - const struct devlink_ops *ops, - struct genl_info *info) -{ - struct nlattr *nla_parent, **attrs = info->attrs; - int err = -EOPNOTSUPP; - u32 priority; - u32 weight; - u64 rate; - - if (attrs[DEVLINK_ATTR_RATE_TX_SHARE]) { - rate = nla_get_u64(attrs[DEVLINK_ATTR_RATE_TX_SHARE]); - if (devlink_rate_is_leaf(devlink_rate)) - err = ops->rate_leaf_tx_share_set(devlink_rate, devlink_rate->priv, - rate, info->extack); - else if (devlink_rate_is_node(devlink_rate)) - err = ops->rate_node_tx_share_set(devlink_rate, devlink_rate->priv, - rate, info->extack); - if (err) - return err; - devlink_rate->tx_share = rate; - } - - if (attrs[DEVLINK_ATTR_RATE_TX_MAX]) { - rate = nla_get_u64(attrs[DEVLINK_ATTR_RATE_TX_MAX]); - if (devlink_rate_is_leaf(devlink_rate)) - err = ops->rate_leaf_tx_max_set(devlink_rate, devlink_rate->priv, - rate, info->extack); - else if (devlink_rate_is_node(devlink_rate)) - err = ops->rate_node_tx_max_set(devlink_rate, devlink_rate->priv, - rate, info->extack); - if (err) - return err; - devlink_rate->tx_max = rate; - } - - if (attrs[DEVLINK_ATTR_RATE_TX_PRIORITY]) { - priority = nla_get_u32(attrs[DEVLINK_ATTR_RATE_TX_PRIORITY]); - if (devlink_rate_is_leaf(devlink_rate)) - err = ops->rate_leaf_tx_priority_set(devlink_rate, devlink_rate->priv, - priority, info->extack); - else if (devlink_rate_is_node(devlink_rate)) - err = ops->rate_node_tx_priority_set(devlink_rate, devlink_rate->priv, - priority, info->extack); - - if (err) - return err; - devlink_rate->tx_priority = priority; - } - - if (attrs[DEVLINK_ATTR_RATE_TX_WEIGHT]) { - weight = nla_get_u32(attrs[DEVLINK_ATTR_RATE_TX_WEIGHT]); - if (devlink_rate_is_leaf(devlink_rate)) - err = ops->rate_leaf_tx_weight_set(devlink_rate, devlink_rate->priv, - weight, info->extack); - else if (devlink_rate_is_node(devlink_rate)) - err = ops->rate_node_tx_weight_set(devlink_rate, devlink_rate->priv, - weight, info->extack); - - if (err) - return err; - devlink_rate->tx_weight = weight; - } - - nla_parent = attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME]; - if (nla_parent) { - err = devlink_nl_rate_parent_node_set(devlink_rate, info, - nla_parent); - if (err) - return err; - } - - return 0; -} - -static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops, - struct genl_info *info, - enum devlink_rate_type type) -{ - struct nlattr **attrs = info->attrs; - - if (type == DEVLINK_RATE_TYPE_LEAF) { - if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_leaf_tx_share_set) { - NL_SET_ERR_MSG(info->extack, "TX share set isn't supported for the leafs"); - return false; - } - if (attrs[DEVLINK_ATTR_RATE_TX_MAX] && !ops->rate_leaf_tx_max_set) { - NL_SET_ERR_MSG(info->extack, "TX max set isn't supported for the leafs"); - return false; - } - if (attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME] && - !ops->rate_leaf_parent_set) { - NL_SET_ERR_MSG(info->extack, "Parent set isn't supported for the leafs"); - return false; - } - if (attrs[DEVLINK_ATTR_RATE_TX_PRIORITY] && !ops->rate_leaf_tx_priority_set) { - NL_SET_ERR_MSG_ATTR(info->extack, - attrs[DEVLINK_ATTR_RATE_TX_PRIORITY], - "TX priority set isn't supported for the leafs"); - return false; - } - if (attrs[DEVLINK_ATTR_RATE_TX_WEIGHT] && !ops->rate_leaf_tx_weight_set) { - NL_SET_ERR_MSG_ATTR(info->extack, - attrs[DEVLINK_ATTR_RATE_TX_WEIGHT], - "TX weight set isn't supported for the leafs"); - return false; - } - } else if (type == 
DEVLINK_RATE_TYPE_NODE) { - if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_node_tx_share_set) { - NL_SET_ERR_MSG(info->extack, "TX share set isn't supported for the nodes"); - return false; - } - if (attrs[DEVLINK_ATTR_RATE_TX_MAX] && !ops->rate_node_tx_max_set) { - NL_SET_ERR_MSG(info->extack, "TX max set isn't supported for the nodes"); - return false; - } - if (attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME] && - !ops->rate_node_parent_set) { - NL_SET_ERR_MSG(info->extack, "Parent set isn't supported for the nodes"); - return false; - } - if (attrs[DEVLINK_ATTR_RATE_TX_PRIORITY] && !ops->rate_node_tx_priority_set) { - NL_SET_ERR_MSG_ATTR(info->extack, - attrs[DEVLINK_ATTR_RATE_TX_PRIORITY], - "TX priority set isn't supported for the nodes"); - return false; - } - if (attrs[DEVLINK_ATTR_RATE_TX_WEIGHT] && !ops->rate_node_tx_weight_set) { - NL_SET_ERR_MSG_ATTR(info->extack, - attrs[DEVLINK_ATTR_RATE_TX_WEIGHT], - "TX weight set isn't supported for the nodes"); - return false; - } - } else { - WARN(1, "Unknown type of rate object"); - return false; - } - - return true; -} - -static int devlink_nl_cmd_rate_set_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - struct devlink_rate *devlink_rate; - const struct devlink_ops *ops; - int err; - - devlink_rate = devlink_rate_get_from_info(devlink, info); - if (IS_ERR(devlink_rate)) - return PTR_ERR(devlink_rate); - - ops = devlink->ops; - if (!ops || !devlink_rate_set_ops_supported(ops, info, devlink_rate->type)) - return -EOPNOTSUPP; - - err = devlink_nl_rate_set(devlink_rate, ops, info); - - if (!err) - devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_NEW); - return err; -} - -static int devlink_nl_cmd_rate_new_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - struct devlink_rate *rate_node; - const struct devlink_ops *ops; - int err; - - ops = devlink->ops; - if (!ops || !ops->rate_node_new || !ops->rate_node_del) { - NL_SET_ERR_MSG(info->extack, "Rate nodes aren't supported"); - return -EOPNOTSUPP; - } - - if (!devlink_rate_set_ops_supported(ops, info, DEVLINK_RATE_TYPE_NODE)) - return -EOPNOTSUPP; - - rate_node = devlink_rate_node_get_from_attrs(devlink, info->attrs); - if (!IS_ERR(rate_node)) - return -EEXIST; - else if (rate_node == ERR_PTR(-EINVAL)) - return -EINVAL; - - rate_node = kzalloc(sizeof(*rate_node), GFP_KERNEL); - if (!rate_node) - return -ENOMEM; - - rate_node->devlink = devlink; - rate_node->type = DEVLINK_RATE_TYPE_NODE; - rate_node->name = nla_strdup(info->attrs[DEVLINK_ATTR_RATE_NODE_NAME], GFP_KERNEL); - if (!rate_node->name) { - err = -ENOMEM; - goto err_strdup; - } - - err = ops->rate_node_new(rate_node, &rate_node->priv, info->extack); - if (err) - goto err_node_new; - - err = devlink_nl_rate_set(rate_node, ops, info); - if (err) - goto err_rate_set; - - refcount_set(&rate_node->refcnt, 1); - list_add(&rate_node->list, &devlink->rate_list); - devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_NEW); - return 0; - -err_rate_set: - ops->rate_node_del(rate_node, rate_node->priv, info->extack); -err_node_new: - kfree(rate_node->name); -err_strdup: - kfree(rate_node); - return err; -} - -static int devlink_nl_cmd_rate_del_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - struct devlink_rate *rate_node; - int err; - - rate_node = devlink_rate_node_get_from_info(devlink, info); - if (IS_ERR(rate_node)) - return PTR_ERR(rate_node); - - if 
(refcount_read(&rate_node->refcnt) > 1) { - NL_SET_ERR_MSG(info->extack, "Node has children. Cannot delete node."); - return -EBUSY; - } - - devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_DEL); - err = devlink->ops->rate_node_del(rate_node, rate_node->priv, - info->extack); - if (rate_node->parent) - refcount_dec(&rate_node->parent->refcnt); - list_del(&rate_node->list); - kfree(rate_node->name); - kfree(rate_node); - return err; -} - struct devlink_linecard_type { const char *type; const void *priv; @@ -986,19 +427,6 @@ static int devlink_nl_cmd_linecard_set_doit(struct sk_buff *skb, return 0; } -int devlink_rate_nodes_check(struct devlink *devlink, u16 mode, - struct netlink_ext_ack *extack) -{ - struct devlink_rate *devlink_rate; - - list_for_each_entry(devlink_rate, &devlink->rate_list, list) - if (devlink_rate_is_node(devlink_rate)) { - NL_SET_ERR_MSG(extack, "Rate node(s) exists."); - return -EBUSY; - } - return 0; -} - const struct genl_small_ops devlink_nl_small_ops[40] = { { .cmd = DEVLINK_CMD_PORT_SET, @@ -1276,152 +704,6 @@ void devlink_notify_unregister(struct devlink *devlink) devlink_notify(devlink, DEVLINK_CMD_DEL); } -/** - * devl_rate_node_create - create devlink rate node - * @devlink: devlink instance - * @priv: driver private data - * @node_name: name of the resulting node - * @parent: parent devlink_rate struct - * - * Create devlink rate object of type node - */ -struct devlink_rate * -devl_rate_node_create(struct devlink *devlink, void *priv, char *node_name, - struct devlink_rate *parent) -{ - struct devlink_rate *rate_node; - - rate_node = devlink_rate_node_get_by_name(devlink, node_name); - if (!IS_ERR(rate_node)) - return ERR_PTR(-EEXIST); - - rate_node = kzalloc(sizeof(*rate_node), GFP_KERNEL); - if (!rate_node) - return ERR_PTR(-ENOMEM); - - if (parent) { - rate_node->parent = parent; - refcount_inc(&rate_node->parent->refcnt); - } - - rate_node->type = DEVLINK_RATE_TYPE_NODE; - rate_node->devlink = devlink; - rate_node->priv = priv; - - rate_node->name = kstrdup(node_name, GFP_KERNEL); - if (!rate_node->name) { - kfree(rate_node); - return ERR_PTR(-ENOMEM); - } - - refcount_set(&rate_node->refcnt, 1); - list_add(&rate_node->list, &devlink->rate_list); - devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_NEW); - return rate_node; -} -EXPORT_SYMBOL_GPL(devl_rate_node_create); - -/** - * devl_rate_leaf_create - create devlink rate leaf - * @devlink_port: devlink port object to create rate object on - * @priv: driver private data - * @parent: parent devlink_rate struct - * - * Create devlink rate object of type leaf on provided @devlink_port. 
- */ -int devl_rate_leaf_create(struct devlink_port *devlink_port, void *priv, - struct devlink_rate *parent) -{ - struct devlink *devlink = devlink_port->devlink; - struct devlink_rate *devlink_rate; - - devl_assert_locked(devlink_port->devlink); - - if (WARN_ON(devlink_port->devlink_rate)) - return -EBUSY; - - devlink_rate = kzalloc(sizeof(*devlink_rate), GFP_KERNEL); - if (!devlink_rate) - return -ENOMEM; - - if (parent) { - devlink_rate->parent = parent; - refcount_inc(&devlink_rate->parent->refcnt); - } - - devlink_rate->type = DEVLINK_RATE_TYPE_LEAF; - devlink_rate->devlink = devlink; - devlink_rate->devlink_port = devlink_port; - devlink_rate->priv = priv; - list_add_tail(&devlink_rate->list, &devlink->rate_list); - devlink_port->devlink_rate = devlink_rate; - devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_NEW); - - return 0; -} -EXPORT_SYMBOL_GPL(devl_rate_leaf_create); - -/** - * devl_rate_leaf_destroy - destroy devlink rate leaf - * - * @devlink_port: devlink port linked to the rate object - * - * Destroy the devlink rate object of type leaf on provided @devlink_port. - */ -void devl_rate_leaf_destroy(struct devlink_port *devlink_port) -{ - struct devlink_rate *devlink_rate = devlink_port->devlink_rate; - - devl_assert_locked(devlink_port->devlink); - if (!devlink_rate) - return; - - devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_DEL); - if (devlink_rate->parent) - refcount_dec(&devlink_rate->parent->refcnt); - list_del(&devlink_rate->list); - devlink_port->devlink_rate = NULL; - kfree(devlink_rate); -} -EXPORT_SYMBOL_GPL(devl_rate_leaf_destroy); - -/** - * devl_rate_nodes_destroy - destroy all devlink rate nodes on device - * @devlink: devlink instance - * - * Unset parent for all rate objects and destroy all rate nodes - * on specified device. - */ -void devl_rate_nodes_destroy(struct devlink *devlink) -{ - static struct devlink_rate *devlink_rate, *tmp; - const struct devlink_ops *ops = devlink->ops; - - devl_assert_locked(devlink); - - list_for_each_entry(devlink_rate, &devlink->rate_list, list) { - if (!devlink_rate->parent) - continue; - - refcount_dec(&devlink_rate->parent->refcnt); - if (devlink_rate_is_leaf(devlink_rate)) - ops->rate_leaf_parent_set(devlink_rate, NULL, devlink_rate->priv, - NULL, NULL); - else if (devlink_rate_is_node(devlink_rate)) - ops->rate_node_parent_set(devlink_rate, NULL, devlink_rate->priv, - NULL, NULL); - } - list_for_each_entry_safe(devlink_rate, tmp, &devlink->rate_list, list) { - if (devlink_rate_is_node(devlink_rate)) { - ops->rate_node_del(devlink_rate, devlink_rate->priv, NULL); - list_del(&devlink_rate->list); - kfree(devlink_rate->name); - kfree(devlink_rate); - } - } -} -EXPORT_SYMBOL_GPL(devl_rate_nodes_destroy); - static int devlink_linecard_types_init(struct devlink_linecard *linecard) { struct devlink_linecard_type *linecard_type; diff --git a/net/devlink/rate.c b/net/devlink/rate.c new file mode 100644 index 000000000000..dff1593b8406 --- /dev/null +++ b/net/devlink/rate.c @@ -0,0 +1,722 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. 
+ * Copyright (c) 2016 Jiri Pirko + */ + +#include "devl_internal.h" + +static inline bool +devlink_rate_is_leaf(struct devlink_rate *devlink_rate) +{ + return devlink_rate->type == DEVLINK_RATE_TYPE_LEAF; +} + +static inline bool +devlink_rate_is_node(struct devlink_rate *devlink_rate) +{ + return devlink_rate->type == DEVLINK_RATE_TYPE_NODE; +} + +static struct devlink_rate * +devlink_rate_leaf_get_from_info(struct devlink *devlink, struct genl_info *info) +{ + struct devlink_rate *devlink_rate; + struct devlink_port *devlink_port; + + devlink_port = devlink_port_get_from_attrs(devlink, info->attrs); + if (IS_ERR(devlink_port)) + return ERR_CAST(devlink_port); + devlink_rate = devlink_port->devlink_rate; + return devlink_rate ?: ERR_PTR(-ENODEV); +} + +static struct devlink_rate * +devlink_rate_node_get_by_name(struct devlink *devlink, const char *node_name) +{ + static struct devlink_rate *devlink_rate; + + list_for_each_entry(devlink_rate, &devlink->rate_list, list) { + if (devlink_rate_is_node(devlink_rate) && + !strcmp(node_name, devlink_rate->name)) + return devlink_rate; + } + return ERR_PTR(-ENODEV); +} + +static struct devlink_rate * +devlink_rate_node_get_from_attrs(struct devlink *devlink, struct nlattr **attrs) +{ + const char *rate_node_name; + size_t len; + + if (!attrs[DEVLINK_ATTR_RATE_NODE_NAME]) + return ERR_PTR(-EINVAL); + rate_node_name = nla_data(attrs[DEVLINK_ATTR_RATE_NODE_NAME]); + len = strlen(rate_node_name); + /* Name cannot be empty or decimal number */ + if (!len || strspn(rate_node_name, "0123456789") == len) + return ERR_PTR(-EINVAL); + + return devlink_rate_node_get_by_name(devlink, rate_node_name); +} + +static struct devlink_rate * +devlink_rate_node_get_from_info(struct devlink *devlink, struct genl_info *info) +{ + return devlink_rate_node_get_from_attrs(devlink, info->attrs); +} + +static struct devlink_rate * +devlink_rate_get_from_info(struct devlink *devlink, struct genl_info *info) +{ + struct nlattr **attrs = info->attrs; + + if (attrs[DEVLINK_ATTR_PORT_INDEX]) + return devlink_rate_leaf_get_from_info(devlink, info); + else if (attrs[DEVLINK_ATTR_RATE_NODE_NAME]) + return devlink_rate_node_get_from_info(devlink, info); + else + return ERR_PTR(-EINVAL); +} + +static int devlink_nl_rate_fill(struct sk_buff *msg, + struct devlink_rate *devlink_rate, + enum devlink_command cmd, u32 portid, u32 seq, + int flags, struct netlink_ext_ack *extack) +{ + struct devlink *devlink = devlink_rate->devlink; + void *hdr; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + + if (nla_put_u16(msg, DEVLINK_ATTR_RATE_TYPE, devlink_rate->type)) + goto nla_put_failure; + + if (devlink_rate_is_leaf(devlink_rate)) { + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, + devlink_rate->devlink_port->index)) + goto nla_put_failure; + } else if (devlink_rate_is_node(devlink_rate)) { + if (nla_put_string(msg, DEVLINK_ATTR_RATE_NODE_NAME, + devlink_rate->name)) + goto nla_put_failure; + } + + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_RATE_TX_SHARE, + devlink_rate->tx_share, DEVLINK_ATTR_PAD)) + goto nla_put_failure; + + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_RATE_TX_MAX, + devlink_rate->tx_max, DEVLINK_ATTR_PAD)) + goto nla_put_failure; + + if (nla_put_u32(msg, DEVLINK_ATTR_RATE_TX_PRIORITY, + devlink_rate->tx_priority)) + goto nla_put_failure; + + if (nla_put_u32(msg, DEVLINK_ATTR_RATE_TX_WEIGHT, + devlink_rate->tx_weight)) + goto nla_put_failure; + + if 
(devlink_rate->parent) + if (nla_put_string(msg, DEVLINK_ATTR_RATE_PARENT_NODE_NAME, + devlink_rate->parent->name)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static void devlink_rate_notify(struct devlink_rate *devlink_rate, + enum devlink_command cmd) +{ + struct devlink *devlink = devlink_rate->devlink; + struct sk_buff *msg; + int err; + + WARN_ON(cmd != DEVLINK_CMD_RATE_NEW && cmd != DEVLINK_CMD_RATE_DEL); + + if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)) + return; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + + err = devlink_nl_rate_fill(msg, devlink_rate, cmd, 0, 0, 0, NULL); + if (err) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), msg, + 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); +} + +void devlink_rates_notify_register(struct devlink *devlink) +{ + struct devlink_rate *rate_node; + + list_for_each_entry(rate_node, &devlink->rate_list, list) + devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_NEW); +} + +void devlink_rates_notify_unregister(struct devlink *devlink) +{ + struct devlink_rate *rate_node; + + list_for_each_entry_reverse(rate_node, &devlink->rate_list, list) + devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_DEL); +} + +static int +devlink_nl_rate_get_dump_one(struct sk_buff *msg, struct devlink *devlink, + struct netlink_callback *cb, int flags) +{ + struct devlink_nl_dump_state *state = devlink_dump_state(cb); + struct devlink_rate *devlink_rate; + int idx = 0; + int err = 0; + + list_for_each_entry(devlink_rate, &devlink->rate_list, list) { + enum devlink_command cmd = DEVLINK_CMD_RATE_NEW; + u32 id = NETLINK_CB(cb->skb).portid; + + if (idx < state->idx) { + idx++; + continue; + } + err = devlink_nl_rate_fill(msg, devlink_rate, cmd, id, + cb->nlh->nlmsg_seq, flags, NULL); + if (err) { + state->idx = idx; + break; + } + idx++; + } + + return err; +} + +int devlink_nl_rate_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) +{ + return devlink_nl_dumpit(skb, cb, devlink_nl_rate_get_dump_one); +} + +int devlink_nl_rate_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_rate *devlink_rate; + struct sk_buff *msg; + int err; + + devlink_rate = devlink_rate_get_from_info(devlink, info); + if (IS_ERR(devlink_rate)) + return PTR_ERR(devlink_rate); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_rate_fill(msg, devlink_rate, DEVLINK_CMD_RATE_NEW, + info->snd_portid, info->snd_seq, 0, + info->extack); + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + +static bool +devlink_rate_is_parent_node(struct devlink_rate *devlink_rate, + struct devlink_rate *parent) +{ + while (parent) { + if (parent == devlink_rate) + return true; + parent = parent->parent; + } + return false; +} + +static int +devlink_nl_rate_parent_node_set(struct devlink_rate *devlink_rate, + struct genl_info *info, + struct nlattr *nla_parent) +{ + struct devlink *devlink = devlink_rate->devlink; + const char *parent_name = nla_data(nla_parent); + const struct devlink_ops *ops = devlink->ops; + size_t len = strlen(parent_name); + struct devlink_rate *parent; + int err = -EOPNOTSUPP; + + parent = devlink_rate->parent; + + if (parent && !len) { + if (devlink_rate_is_leaf(devlink_rate)) + err = ops->rate_leaf_parent_set(devlink_rate, NULL, + 
devlink_rate->priv, NULL, + info->extack); + else if (devlink_rate_is_node(devlink_rate)) + err = ops->rate_node_parent_set(devlink_rate, NULL, + devlink_rate->priv, NULL, + info->extack); + if (err) + return err; + + refcount_dec(&parent->refcnt); + devlink_rate->parent = NULL; + } else if (len) { + parent = devlink_rate_node_get_by_name(devlink, parent_name); + if (IS_ERR(parent)) + return -ENODEV; + + if (parent == devlink_rate) { + NL_SET_ERR_MSG(info->extack, "Parent to self is not allowed"); + return -EINVAL; + } + + if (devlink_rate_is_node(devlink_rate) && + devlink_rate_is_parent_node(devlink_rate, parent->parent)) { + NL_SET_ERR_MSG(info->extack, "Node is already a parent of parent node."); + return -EEXIST; + } + + if (devlink_rate_is_leaf(devlink_rate)) + err = ops->rate_leaf_parent_set(devlink_rate, parent, + devlink_rate->priv, parent->priv, + info->extack); + else if (devlink_rate_is_node(devlink_rate)) + err = ops->rate_node_parent_set(devlink_rate, parent, + devlink_rate->priv, parent->priv, + info->extack); + if (err) + return err; + + if (devlink_rate->parent) + /* we're reassigning to other parent in this case */ + refcount_dec(&devlink_rate->parent->refcnt); + + refcount_inc(&parent->refcnt); + devlink_rate->parent = parent; + } + + return 0; +} + +static int devlink_nl_rate_set(struct devlink_rate *devlink_rate, + const struct devlink_ops *ops, + struct genl_info *info) +{ + struct nlattr *nla_parent, **attrs = info->attrs; + int err = -EOPNOTSUPP; + u32 priority; + u32 weight; + u64 rate; + + if (attrs[DEVLINK_ATTR_RATE_TX_SHARE]) { + rate = nla_get_u64(attrs[DEVLINK_ATTR_RATE_TX_SHARE]); + if (devlink_rate_is_leaf(devlink_rate)) + err = ops->rate_leaf_tx_share_set(devlink_rate, devlink_rate->priv, + rate, info->extack); + else if (devlink_rate_is_node(devlink_rate)) + err = ops->rate_node_tx_share_set(devlink_rate, devlink_rate->priv, + rate, info->extack); + if (err) + return err; + devlink_rate->tx_share = rate; + } + + if (attrs[DEVLINK_ATTR_RATE_TX_MAX]) { + rate = nla_get_u64(attrs[DEVLINK_ATTR_RATE_TX_MAX]); + if (devlink_rate_is_leaf(devlink_rate)) + err = ops->rate_leaf_tx_max_set(devlink_rate, devlink_rate->priv, + rate, info->extack); + else if (devlink_rate_is_node(devlink_rate)) + err = ops->rate_node_tx_max_set(devlink_rate, devlink_rate->priv, + rate, info->extack); + if (err) + return err; + devlink_rate->tx_max = rate; + } + + if (attrs[DEVLINK_ATTR_RATE_TX_PRIORITY]) { + priority = nla_get_u32(attrs[DEVLINK_ATTR_RATE_TX_PRIORITY]); + if (devlink_rate_is_leaf(devlink_rate)) + err = ops->rate_leaf_tx_priority_set(devlink_rate, devlink_rate->priv, + priority, info->extack); + else if (devlink_rate_is_node(devlink_rate)) + err = ops->rate_node_tx_priority_set(devlink_rate, devlink_rate->priv, + priority, info->extack); + + if (err) + return err; + devlink_rate->tx_priority = priority; + } + + if (attrs[DEVLINK_ATTR_RATE_TX_WEIGHT]) { + weight = nla_get_u32(attrs[DEVLINK_ATTR_RATE_TX_WEIGHT]); + if (devlink_rate_is_leaf(devlink_rate)) + err = ops->rate_leaf_tx_weight_set(devlink_rate, devlink_rate->priv, + weight, info->extack); + else if (devlink_rate_is_node(devlink_rate)) + err = ops->rate_node_tx_weight_set(devlink_rate, devlink_rate->priv, + weight, info->extack); + + if (err) + return err; + devlink_rate->tx_weight = weight; + } + + nla_parent = attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME]; + if (nla_parent) { + err = devlink_nl_rate_parent_node_set(devlink_rate, info, + nla_parent); + if (err) + return err; + } + + return 0; +} + +static bool 
devlink_rate_set_ops_supported(const struct devlink_ops *ops, + struct genl_info *info, + enum devlink_rate_type type) +{ + struct nlattr **attrs = info->attrs; + + if (type == DEVLINK_RATE_TYPE_LEAF) { + if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_leaf_tx_share_set) { + NL_SET_ERR_MSG(info->extack, "TX share set isn't supported for the leafs"); + return false; + } + if (attrs[DEVLINK_ATTR_RATE_TX_MAX] && !ops->rate_leaf_tx_max_set) { + NL_SET_ERR_MSG(info->extack, "TX max set isn't supported for the leafs"); + return false; + } + if (attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME] && + !ops->rate_leaf_parent_set) { + NL_SET_ERR_MSG(info->extack, "Parent set isn't supported for the leafs"); + return false; + } + if (attrs[DEVLINK_ATTR_RATE_TX_PRIORITY] && !ops->rate_leaf_tx_priority_set) { + NL_SET_ERR_MSG_ATTR(info->extack, + attrs[DEVLINK_ATTR_RATE_TX_PRIORITY], + "TX priority set isn't supported for the leafs"); + return false; + } + if (attrs[DEVLINK_ATTR_RATE_TX_WEIGHT] && !ops->rate_leaf_tx_weight_set) { + NL_SET_ERR_MSG_ATTR(info->extack, + attrs[DEVLINK_ATTR_RATE_TX_WEIGHT], + "TX weight set isn't supported for the leafs"); + return false; + } + } else if (type == DEVLINK_RATE_TYPE_NODE) { + if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_node_tx_share_set) { + NL_SET_ERR_MSG(info->extack, "TX share set isn't supported for the nodes"); + return false; + } + if (attrs[DEVLINK_ATTR_RATE_TX_MAX] && !ops->rate_node_tx_max_set) { + NL_SET_ERR_MSG(info->extack, "TX max set isn't supported for the nodes"); + return false; + } + if (attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME] && + !ops->rate_node_parent_set) { + NL_SET_ERR_MSG(info->extack, "Parent set isn't supported for the nodes"); + return false; + } + if (attrs[DEVLINK_ATTR_RATE_TX_PRIORITY] && !ops->rate_node_tx_priority_set) { + NL_SET_ERR_MSG_ATTR(info->extack, + attrs[DEVLINK_ATTR_RATE_TX_PRIORITY], + "TX priority set isn't supported for the nodes"); + return false; + } + if (attrs[DEVLINK_ATTR_RATE_TX_WEIGHT] && !ops->rate_node_tx_weight_set) { + NL_SET_ERR_MSG_ATTR(info->extack, + attrs[DEVLINK_ATTR_RATE_TX_WEIGHT], + "TX weight set isn't supported for the nodes"); + return false; + } + } else { + WARN(1, "Unknown type of rate object"); + return false; + } + + return true; +} + +int devlink_nl_cmd_rate_set_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_rate *devlink_rate; + const struct devlink_ops *ops; + int err; + + devlink_rate = devlink_rate_get_from_info(devlink, info); + if (IS_ERR(devlink_rate)) + return PTR_ERR(devlink_rate); + + ops = devlink->ops; + if (!ops || !devlink_rate_set_ops_supported(ops, info, devlink_rate->type)) + return -EOPNOTSUPP; + + err = devlink_nl_rate_set(devlink_rate, ops, info); + + if (!err) + devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_NEW); + return err; +} + +int devlink_nl_cmd_rate_new_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_rate *rate_node; + const struct devlink_ops *ops; + int err; + + ops = devlink->ops; + if (!ops || !ops->rate_node_new || !ops->rate_node_del) { + NL_SET_ERR_MSG(info->extack, "Rate nodes aren't supported"); + return -EOPNOTSUPP; + } + + if (!devlink_rate_set_ops_supported(ops, info, DEVLINK_RATE_TYPE_NODE)) + return -EOPNOTSUPP; + + rate_node = devlink_rate_node_get_from_attrs(devlink, info->attrs); + if (!IS_ERR(rate_node)) + return -EEXIST; + else if (rate_node == ERR_PTR(-EINVAL)) + return -EINVAL; + + rate_node 
= kzalloc(sizeof(*rate_node), GFP_KERNEL); + if (!rate_node) + return -ENOMEM; + + rate_node->devlink = devlink; + rate_node->type = DEVLINK_RATE_TYPE_NODE; + rate_node->name = nla_strdup(info->attrs[DEVLINK_ATTR_RATE_NODE_NAME], GFP_KERNEL); + if (!rate_node->name) { + err = -ENOMEM; + goto err_strdup; + } + + err = ops->rate_node_new(rate_node, &rate_node->priv, info->extack); + if (err) + goto err_node_new; + + err = devlink_nl_rate_set(rate_node, ops, info); + if (err) + goto err_rate_set; + + refcount_set(&rate_node->refcnt, 1); + list_add(&rate_node->list, &devlink->rate_list); + devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_NEW); + return 0; + +err_rate_set: + ops->rate_node_del(rate_node, rate_node->priv, info->extack); +err_node_new: + kfree(rate_node->name); +err_strdup: + kfree(rate_node); + return err; +} + +int devlink_nl_cmd_rate_del_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_rate *rate_node; + int err; + + rate_node = devlink_rate_node_get_from_info(devlink, info); + if (IS_ERR(rate_node)) + return PTR_ERR(rate_node); + + if (refcount_read(&rate_node->refcnt) > 1) { + NL_SET_ERR_MSG(info->extack, "Node has children. Cannot delete node."); + return -EBUSY; + } + + devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_DEL); + err = devlink->ops->rate_node_del(rate_node, rate_node->priv, + info->extack); + if (rate_node->parent) + refcount_dec(&rate_node->parent->refcnt); + list_del(&rate_node->list); + kfree(rate_node->name); + kfree(rate_node); + return err; +} + +int devlink_rate_nodes_check(struct devlink *devlink, u16 mode, + struct netlink_ext_ack *extack) +{ + struct devlink_rate *devlink_rate; + + list_for_each_entry(devlink_rate, &devlink->rate_list, list) + if (devlink_rate_is_node(devlink_rate)) { + NL_SET_ERR_MSG(extack, "Rate node(s) exists."); + return -EBUSY; + } + return 0; +} + +/** + * devl_rate_node_create - create devlink rate node + * @devlink: devlink instance + * @priv: driver private data + * @node_name: name of the resulting node + * @parent: parent devlink_rate struct + * + * Create devlink rate object of type node + */ +struct devlink_rate * +devl_rate_node_create(struct devlink *devlink, void *priv, char *node_name, + struct devlink_rate *parent) +{ + struct devlink_rate *rate_node; + + rate_node = devlink_rate_node_get_by_name(devlink, node_name); + if (!IS_ERR(rate_node)) + return ERR_PTR(-EEXIST); + + rate_node = kzalloc(sizeof(*rate_node), GFP_KERNEL); + if (!rate_node) + return ERR_PTR(-ENOMEM); + + if (parent) { + rate_node->parent = parent; + refcount_inc(&rate_node->parent->refcnt); + } + + rate_node->type = DEVLINK_RATE_TYPE_NODE; + rate_node->devlink = devlink; + rate_node->priv = priv; + + rate_node->name = kstrdup(node_name, GFP_KERNEL); + if (!rate_node->name) { + kfree(rate_node); + return ERR_PTR(-ENOMEM); + } + + refcount_set(&rate_node->refcnt, 1); + list_add(&rate_node->list, &devlink->rate_list); + devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_NEW); + return rate_node; +} +EXPORT_SYMBOL_GPL(devl_rate_node_create); + +/** + * devl_rate_leaf_create - create devlink rate leaf + * @devlink_port: devlink port object to create rate object on + * @priv: driver private data + * @parent: parent devlink_rate struct + * + * Create devlink rate object of type leaf on provided @devlink_port. 
+ */ +int devl_rate_leaf_create(struct devlink_port *devlink_port, void *priv, + struct devlink_rate *parent) +{ + struct devlink *devlink = devlink_port->devlink; + struct devlink_rate *devlink_rate; + + devl_assert_locked(devlink_port->devlink); + + if (WARN_ON(devlink_port->devlink_rate)) + return -EBUSY; + + devlink_rate = kzalloc(sizeof(*devlink_rate), GFP_KERNEL); + if (!devlink_rate) + return -ENOMEM; + + if (parent) { + devlink_rate->parent = parent; + refcount_inc(&devlink_rate->parent->refcnt); + } + + devlink_rate->type = DEVLINK_RATE_TYPE_LEAF; + devlink_rate->devlink = devlink; + devlink_rate->devlink_port = devlink_port; + devlink_rate->priv = priv; + list_add_tail(&devlink_rate->list, &devlink->rate_list); + devlink_port->devlink_rate = devlink_rate; + devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_NEW); + + return 0; +} +EXPORT_SYMBOL_GPL(devl_rate_leaf_create); + +/** + * devl_rate_leaf_destroy - destroy devlink rate leaf + * + * @devlink_port: devlink port linked to the rate object + * + * Destroy the devlink rate object of type leaf on provided @devlink_port. + */ +void devl_rate_leaf_destroy(struct devlink_port *devlink_port) +{ + struct devlink_rate *devlink_rate = devlink_port->devlink_rate; + + devl_assert_locked(devlink_port->devlink); + if (!devlink_rate) + return; + + devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_DEL); + if (devlink_rate->parent) + refcount_dec(&devlink_rate->parent->refcnt); + list_del(&devlink_rate->list); + devlink_port->devlink_rate = NULL; + kfree(devlink_rate); +} +EXPORT_SYMBOL_GPL(devl_rate_leaf_destroy); + +/** + * devl_rate_nodes_destroy - destroy all devlink rate nodes on device + * @devlink: devlink instance + * + * Unset parent for all rate objects and destroy all rate nodes + * on specified device. + */ +void devl_rate_nodes_destroy(struct devlink *devlink) +{ + static struct devlink_rate *devlink_rate, *tmp; + const struct devlink_ops *ops = devlink->ops; + + devl_assert_locked(devlink); + + list_for_each_entry(devlink_rate, &devlink->rate_list, list) { + if (!devlink_rate->parent) + continue; + + refcount_dec(&devlink_rate->parent->refcnt); + if (devlink_rate_is_leaf(devlink_rate)) + ops->rate_leaf_parent_set(devlink_rate, NULL, devlink_rate->priv, + NULL, NULL); + else if (devlink_rate_is_node(devlink_rate)) + ops->rate_node_parent_set(devlink_rate, NULL, devlink_rate->priv, + NULL, NULL); + } + list_for_each_entry_safe(devlink_rate, tmp, &devlink->rate_list, list) { + if (devlink_rate_is_node(devlink_rate)) { + ops->rate_node_del(devlink_rate, devlink_rate->priv, NULL); + list_del(&devlink_rate->list); + kfree(devlink_rate->name); + kfree(devlink_rate); + } + } +} +EXPORT_SYMBOL_GPL(devl_rate_nodes_destroy); -- cgit v1.2.3 From 9edbe6f36c5f86776cc1c6ba0f546a4aefe2767f Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 28 Aug 2023 08:16:54 +0200 Subject: devlink: push linecard related code into separate file Cut out another chunk from leftover.c and put linecard related code into a separate file. 
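For readers unfamiliar with the API being relocated, the driver-facing flow is unchanged by this patch. Below is a minimal, illustrative sketch only (not part of this patch): the "foo" driver name, the type strings and the callback bodies are invented, while the devlink_linecard_ops callback signatures and the devl_linecard_create() / devlink_linecard_provision_set() calls follow the call sites visible in linecard.c in this diff.

        /* Illustrative only, not part of this patch: a hypothetical "foo"
         * driver using the line card API whose implementation now lives in
         * net/devlink/linecard.c. Callback signatures follow the call sites
         * in devlink_linecard_types_init() and devlink_linecard_type_set().
         */
        #include <linux/err.h>
        #include <linux/kernel.h>
        #include <net/devlink.h>

        static const char * const foo_lc_types[] = { "16x100G", "8x200G" };

        static unsigned int foo_lc_types_count(struct devlink_linecard *lc, void *priv)
        {
                return ARRAY_SIZE(foo_lc_types);
        }

        static void foo_lc_types_get(struct devlink_linecard *lc, void *priv,
                                     unsigned int index, const char **type,
                                     const void **type_priv)
        {
                *type = foo_lc_types[index];
                *type_priv = NULL;
        }

        static int foo_lc_provision(struct devlink_linecard *lc, void *priv,
                                    const char *type, const void *type_priv,
                                    struct netlink_ext_ack *extack)
        {
                /* Program the hardware for @type, then report the result. */
                devlink_linecard_provision_set(lc, type);
                return 0;
        }

        static int foo_lc_unprovision(struct devlink_linecard *lc, void *priv,
                                      struct netlink_ext_ack *extack)
        {
                devlink_linecard_provision_clear(lc);
                return 0;
        }

        static const struct devlink_linecard_ops foo_lc_ops = {
                .provision   = foo_lc_provision,
                .unprovision = foo_lc_unprovision,
                .types_count = foo_lc_types_count,
                .types_get   = foo_lc_types_get,
        };

        static int foo_linecard_register(struct devlink *devlink, unsigned int index)
        {
                struct devlink_linecard *lc;

                lc = devl_linecard_create(devlink, index, &foo_lc_ops, NULL);
                return PTR_ERR_OR_ZERO(lc);
        }

A provisioning request arriving over netlink ends up in devlink_linecard_type_set(), which invokes ops->provision(); the driver then acknowledges completion, possibly asynchronously, via devlink_linecard_provision_set() or devlink_linecard_provision_fail().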
Signed-off-by: Jiri Pirko Link: https://lore.kernel.org/r/20230828061657.300667-13-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- net/devlink/Makefile | 2 +- net/devlink/devl_internal.h | 34 +-- net/devlink/leftover.c | 599 ------------------------------------------- net/devlink/linecard.c | 606 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 626 insertions(+), 615 deletions(-) create mode 100644 net/devlink/linecard.c (limited to 'net') diff --git a/net/devlink/Makefile b/net/devlink/Makefile index f61aa97688f5..71f490d301d7 100644 --- a/net/devlink/Makefile +++ b/net/devlink/Makefile @@ -1,4 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 obj-y := leftover.o core.o netlink.o netlink_gen.o dev.o port.o sb.o dpipe.o \ - resource.o param.o region.o health.o trap.o rate.o + resource.o param.o region.o health.o trap.o rate.o linecard.o diff --git a/net/devlink/devl_internal.h b/net/devlink/devl_internal.h index 8b530f15c439..8c81f731a4c7 100644 --- a/net/devlink/devl_internal.h +++ b/net/devlink/devl_internal.h @@ -168,6 +168,8 @@ void devlink_traps_notify_register(struct devlink *devlink); void devlink_traps_notify_unregister(struct devlink *devlink); void devlink_rates_notify_register(struct devlink *devlink); void devlink_rates_notify_unregister(struct devlink *devlink); +void devlink_linecards_notify_register(struct devlink *devlink); +void devlink_linecards_notify_unregister(struct devlink *devlink); /* Ports */ #define ASSERT_DEVLINK_PORT_INITIALIZED(devlink_port) \ @@ -182,21 +184,6 @@ devlink_port_get_from_info(struct devlink *devlink, struct genl_info *info); struct devlink_port *devlink_port_get_from_attrs(struct devlink *devlink, struct nlattr **attrs); -/* Linecards */ -struct devlink_linecard { - struct list_head list; - struct devlink *devlink; - unsigned int index; - const struct devlink_linecard_ops *ops; - void *priv; - enum devlink_linecard_state state; - struct mutex state_lock; /* Protects state */ - const char *type; - struct devlink_linecard_type *types; - unsigned int types_count; - struct devlink *nested_devlink; -}; - /* Reload */ bool devlink_reload_actions_valid(const struct devlink_ops *ops); int devlink_reload(struct devlink *devlink, struct net *dest_net, @@ -222,6 +209,21 @@ int devlink_resources_validate(struct devlink *devlink, int devlink_rate_nodes_check(struct devlink *devlink, u16 mode, struct netlink_ext_ack *extack); +/* Linecards */ +struct devlink_linecard { + struct list_head list; + struct devlink *devlink; + unsigned int index; + const struct devlink_linecard_ops *ops; + void *priv; + enum devlink_linecard_state state; + struct mutex state_lock; /* Protects state */ + const char *type; + struct devlink_linecard_type *types; + unsigned int types_count; + struct devlink *nested_devlink; +}; + /* Devlink nl cmds */ int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_eswitch_get_doit(struct sk_buff *skb, struct genl_info *info); @@ -283,3 +285,5 @@ int devlink_nl_cmd_trap_policer_set_doit(struct sk_buff *skb, int devlink_nl_cmd_rate_set_doit(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_rate_new_doit(struct sk_buff *skb, struct genl_info *info); int devlink_nl_cmd_rate_del_doit(struct sk_buff *skb, struct genl_info *info); +int devlink_nl_cmd_linecard_set_doit(struct sk_buff *skb, + struct genl_info *info); diff --git a/net/devlink/leftover.c b/net/devlink/leftover.c index 13958c3da59a..98ccb3a8393d 100644 --- a/net/devlink/leftover.c +++ b/net/devlink/leftover.c @@ -37,396 +37,6 @@ 
EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwmsg); EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwerr); EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_trap_report); -static struct devlink_linecard * -devlink_linecard_get_by_index(struct devlink *devlink, - unsigned int linecard_index) -{ - struct devlink_linecard *devlink_linecard; - - list_for_each_entry(devlink_linecard, &devlink->linecard_list, list) { - if (devlink_linecard->index == linecard_index) - return devlink_linecard; - } - return NULL; -} - -static bool devlink_linecard_index_exists(struct devlink *devlink, - unsigned int linecard_index) -{ - return devlink_linecard_get_by_index(devlink, linecard_index); -} - -static struct devlink_linecard * -devlink_linecard_get_from_attrs(struct devlink *devlink, struct nlattr **attrs) -{ - if (attrs[DEVLINK_ATTR_LINECARD_INDEX]) { - u32 linecard_index = nla_get_u32(attrs[DEVLINK_ATTR_LINECARD_INDEX]); - struct devlink_linecard *linecard; - - linecard = devlink_linecard_get_by_index(devlink, linecard_index); - if (!linecard) - return ERR_PTR(-ENODEV); - return linecard; - } - return ERR_PTR(-EINVAL); -} - -static struct devlink_linecard * -devlink_linecard_get_from_info(struct devlink *devlink, struct genl_info *info) -{ - return devlink_linecard_get_from_attrs(devlink, info->attrs); -} - -static int devlink_nl_put_nested_handle(struct sk_buff *msg, struct devlink *devlink) -{ - struct nlattr *nested_attr; - - nested_attr = nla_nest_start(msg, DEVLINK_ATTR_NESTED_DEVLINK); - if (!nested_attr) - return -EMSGSIZE; - if (devlink_nl_put_handle(msg, devlink)) - goto nla_put_failure; - - nla_nest_end(msg, nested_attr); - return 0; - -nla_put_failure: - nla_nest_cancel(msg, nested_attr); - return -EMSGSIZE; -} - -struct devlink_linecard_type { - const char *type; - const void *priv; -}; - -static int devlink_nl_linecard_fill(struct sk_buff *msg, - struct devlink *devlink, - struct devlink_linecard *linecard, - enum devlink_command cmd, u32 portid, - u32 seq, int flags, - struct netlink_ext_ack *extack) -{ - struct devlink_linecard_type *linecard_type; - struct nlattr *attr; - void *hdr; - int i; - - hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); - if (!hdr) - return -EMSGSIZE; - - if (devlink_nl_put_handle(msg, devlink)) - goto nla_put_failure; - if (nla_put_u32(msg, DEVLINK_ATTR_LINECARD_INDEX, linecard->index)) - goto nla_put_failure; - if (nla_put_u8(msg, DEVLINK_ATTR_LINECARD_STATE, linecard->state)) - goto nla_put_failure; - if (linecard->type && - nla_put_string(msg, DEVLINK_ATTR_LINECARD_TYPE, linecard->type)) - goto nla_put_failure; - - if (linecard->types_count) { - attr = nla_nest_start(msg, - DEVLINK_ATTR_LINECARD_SUPPORTED_TYPES); - if (!attr) - goto nla_put_failure; - for (i = 0; i < linecard->types_count; i++) { - linecard_type = &linecard->types[i]; - if (nla_put_string(msg, DEVLINK_ATTR_LINECARD_TYPE, - linecard_type->type)) { - nla_nest_cancel(msg, attr); - goto nla_put_failure; - } - } - nla_nest_end(msg, attr); - } - - if (linecard->nested_devlink && - devlink_nl_put_nested_handle(msg, linecard->nested_devlink)) - goto nla_put_failure; - - genlmsg_end(msg, hdr); - return 0; - -nla_put_failure: - genlmsg_cancel(msg, hdr); - return -EMSGSIZE; -} - -static void devlink_linecard_notify(struct devlink_linecard *linecard, - enum devlink_command cmd) -{ - struct devlink *devlink = linecard->devlink; - struct sk_buff *msg; - int err; - - WARN_ON(cmd != DEVLINK_CMD_LINECARD_NEW && - cmd != DEVLINK_CMD_LINECARD_DEL); - - if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)) - 
return; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return; - - err = devlink_nl_linecard_fill(msg, devlink, linecard, cmd, 0, 0, 0, - NULL); - if (err) { - nlmsg_free(msg); - return; - } - - genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), - msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); -} - -static void devlink_linecards_notify_register(struct devlink *devlink) -{ - struct devlink_linecard *linecard; - - list_for_each_entry(linecard, &devlink->linecard_list, list) - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); -} - -static void devlink_linecards_notify_unregister(struct devlink *devlink) -{ - struct devlink_linecard *linecard; - - list_for_each_entry_reverse(linecard, &devlink->linecard_list, list) - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_DEL); -} - -int devlink_nl_linecard_get_doit(struct sk_buff *skb, struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - struct devlink_linecard *linecard; - struct sk_buff *msg; - int err; - - linecard = devlink_linecard_get_from_info(devlink, info); - if (IS_ERR(linecard)) - return PTR_ERR(linecard); - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - mutex_lock(&linecard->state_lock); - err = devlink_nl_linecard_fill(msg, devlink, linecard, - DEVLINK_CMD_LINECARD_NEW, - info->snd_portid, info->snd_seq, 0, - info->extack); - mutex_unlock(&linecard->state_lock); - if (err) { - nlmsg_free(msg); - return err; - } - - return genlmsg_reply(msg, info); -} - -static int devlink_nl_linecard_get_dump_one(struct sk_buff *msg, - struct devlink *devlink, - struct netlink_callback *cb, - int flags) -{ - struct devlink_nl_dump_state *state = devlink_dump_state(cb); - struct devlink_linecard *linecard; - int idx = 0; - int err = 0; - - list_for_each_entry(linecard, &devlink->linecard_list, list) { - if (idx < state->idx) { - idx++; - continue; - } - mutex_lock(&linecard->state_lock); - err = devlink_nl_linecard_fill(msg, devlink, linecard, - DEVLINK_CMD_LINECARD_NEW, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, flags, - cb->extack); - mutex_unlock(&linecard->state_lock); - if (err) { - state->idx = idx; - break; - } - idx++; - } - - return err; -} - -int devlink_nl_linecard_get_dumpit(struct sk_buff *skb, - struct netlink_callback *cb) -{ - return devlink_nl_dumpit(skb, cb, devlink_nl_linecard_get_dump_one); -} - -static struct devlink_linecard_type * -devlink_linecard_type_lookup(struct devlink_linecard *linecard, - const char *type) -{ - struct devlink_linecard_type *linecard_type; - int i; - - for (i = 0; i < linecard->types_count; i++) { - linecard_type = &linecard->types[i]; - if (!strcmp(type, linecard_type->type)) - return linecard_type; - } - return NULL; -} - -static int devlink_linecard_type_set(struct devlink_linecard *linecard, - const char *type, - struct netlink_ext_ack *extack) -{ - const struct devlink_linecard_ops *ops = linecard->ops; - struct devlink_linecard_type *linecard_type; - int err; - - mutex_lock(&linecard->state_lock); - if (linecard->state == DEVLINK_LINECARD_STATE_PROVISIONING) { - NL_SET_ERR_MSG(extack, "Line card is currently being provisioned"); - err = -EBUSY; - goto out; - } - if (linecard->state == DEVLINK_LINECARD_STATE_UNPROVISIONING) { - NL_SET_ERR_MSG(extack, "Line card is currently being unprovisioned"); - err = -EBUSY; - goto out; - } - - linecard_type = devlink_linecard_type_lookup(linecard, type); - if (!linecard_type) { - NL_SET_ERR_MSG(extack, "Unsupported line card type provided"); - err = 
-EINVAL; - goto out; - } - - if (linecard->state != DEVLINK_LINECARD_STATE_UNPROVISIONED && - linecard->state != DEVLINK_LINECARD_STATE_PROVISIONING_FAILED) { - NL_SET_ERR_MSG(extack, "Line card already provisioned"); - err = -EBUSY; - /* Check if the line card is provisioned in the same - * way the user asks. In case it is, make the operation - * to return success. - */ - if (ops->same_provision && - ops->same_provision(linecard, linecard->priv, - linecard_type->type, - linecard_type->priv)) - err = 0; - goto out; - } - - linecard->state = DEVLINK_LINECARD_STATE_PROVISIONING; - linecard->type = linecard_type->type; - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); - mutex_unlock(&linecard->state_lock); - err = ops->provision(linecard, linecard->priv, linecard_type->type, - linecard_type->priv, extack); - if (err) { - /* Provisioning failed. Assume the linecard is unprovisioned - * for future operations. - */ - mutex_lock(&linecard->state_lock); - linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; - linecard->type = NULL; - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); - mutex_unlock(&linecard->state_lock); - } - return err; - -out: - mutex_unlock(&linecard->state_lock); - return err; -} - -static int devlink_linecard_type_unset(struct devlink_linecard *linecard, - struct netlink_ext_ack *extack) -{ - int err; - - mutex_lock(&linecard->state_lock); - if (linecard->state == DEVLINK_LINECARD_STATE_PROVISIONING) { - NL_SET_ERR_MSG(extack, "Line card is currently being provisioned"); - err = -EBUSY; - goto out; - } - if (linecard->state == DEVLINK_LINECARD_STATE_UNPROVISIONING) { - NL_SET_ERR_MSG(extack, "Line card is currently being unprovisioned"); - err = -EBUSY; - goto out; - } - if (linecard->state == DEVLINK_LINECARD_STATE_PROVISIONING_FAILED) { - linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; - linecard->type = NULL; - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); - err = 0; - goto out; - } - - if (linecard->state == DEVLINK_LINECARD_STATE_UNPROVISIONED) { - NL_SET_ERR_MSG(extack, "Line card is not provisioned"); - err = 0; - goto out; - } - linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONING; - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); - mutex_unlock(&linecard->state_lock); - err = linecard->ops->unprovision(linecard, linecard->priv, - extack); - if (err) { - /* Unprovisioning failed. Assume the linecard is unprovisioned - * for future operations. 
- */ - mutex_lock(&linecard->state_lock); - linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; - linecard->type = NULL; - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); - mutex_unlock(&linecard->state_lock); - } - return err; - -out: - mutex_unlock(&linecard->state_lock); - return err; -} - -static int devlink_nl_cmd_linecard_set_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct netlink_ext_ack *extack = info->extack; - struct devlink *devlink = info->user_ptr[0]; - struct devlink_linecard *linecard; - int err; - - linecard = devlink_linecard_get_from_info(devlink, info); - if (IS_ERR(linecard)) - return PTR_ERR(linecard); - - if (info->attrs[DEVLINK_ATTR_LINECARD_TYPE]) { - const char *type; - - type = nla_data(info->attrs[DEVLINK_ATTR_LINECARD_TYPE]); - if (strcmp(type, "")) { - err = devlink_linecard_type_set(linecard, type, extack); - if (err) - return err; - } else { - err = devlink_linecard_type_unset(linecard, extack); - if (err) - return err; - } - } - - return 0; -} - const struct genl_small_ops devlink_nl_small_ops[40] = { { .cmd = DEVLINK_CMD_PORT_SET, @@ -703,212 +313,3 @@ void devlink_notify_unregister(struct devlink *devlink) devlink_linecards_notify_unregister(devlink); devlink_notify(devlink, DEVLINK_CMD_DEL); } - -static int devlink_linecard_types_init(struct devlink_linecard *linecard) -{ - struct devlink_linecard_type *linecard_type; - unsigned int count; - int i; - - count = linecard->ops->types_count(linecard, linecard->priv); - linecard->types = kmalloc_array(count, sizeof(*linecard_type), - GFP_KERNEL); - if (!linecard->types) - return -ENOMEM; - linecard->types_count = count; - - for (i = 0; i < count; i++) { - linecard_type = &linecard->types[i]; - linecard->ops->types_get(linecard, linecard->priv, i, - &linecard_type->type, - &linecard_type->priv); - } - return 0; -} - -static void devlink_linecard_types_fini(struct devlink_linecard *linecard) -{ - kfree(linecard->types); -} - -/** - * devl_linecard_create - Create devlink linecard - * - * @devlink: devlink - * @linecard_index: driver-specific numerical identifier of the linecard - * @ops: linecards ops - * @priv: user priv pointer - * - * Create devlink linecard instance with provided linecard index. - * Caller can use any indexing, even hw-related one. - * - * Return: Line card structure or an ERR_PTR() encoded error code. 
- */ -struct devlink_linecard * -devl_linecard_create(struct devlink *devlink, unsigned int linecard_index, - const struct devlink_linecard_ops *ops, void *priv) -{ - struct devlink_linecard *linecard; - int err; - - if (WARN_ON(!ops || !ops->provision || !ops->unprovision || - !ops->types_count || !ops->types_get)) - return ERR_PTR(-EINVAL); - - if (devlink_linecard_index_exists(devlink, linecard_index)) - return ERR_PTR(-EEXIST); - - linecard = kzalloc(sizeof(*linecard), GFP_KERNEL); - if (!linecard) - return ERR_PTR(-ENOMEM); - - linecard->devlink = devlink; - linecard->index = linecard_index; - linecard->ops = ops; - linecard->priv = priv; - linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; - mutex_init(&linecard->state_lock); - - err = devlink_linecard_types_init(linecard); - if (err) { - mutex_destroy(&linecard->state_lock); - kfree(linecard); - return ERR_PTR(err); - } - - list_add_tail(&linecard->list, &devlink->linecard_list); - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); - return linecard; -} -EXPORT_SYMBOL_GPL(devl_linecard_create); - -/** - * devl_linecard_destroy - Destroy devlink linecard - * - * @linecard: devlink linecard - */ -void devl_linecard_destroy(struct devlink_linecard *linecard) -{ - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_DEL); - list_del(&linecard->list); - devlink_linecard_types_fini(linecard); - mutex_destroy(&linecard->state_lock); - kfree(linecard); -} -EXPORT_SYMBOL_GPL(devl_linecard_destroy); - -/** - * devlink_linecard_provision_set - Set provisioning on linecard - * - * @linecard: devlink linecard - * @type: linecard type - * - * This is either called directly from the provision() op call or - * as a result of the provision() op call asynchronously. - */ -void devlink_linecard_provision_set(struct devlink_linecard *linecard, - const char *type) -{ - mutex_lock(&linecard->state_lock); - WARN_ON(linecard->type && strcmp(linecard->type, type)); - linecard->state = DEVLINK_LINECARD_STATE_PROVISIONED; - linecard->type = type; - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); - mutex_unlock(&linecard->state_lock); -} -EXPORT_SYMBOL_GPL(devlink_linecard_provision_set); - -/** - * devlink_linecard_provision_clear - Clear provisioning on linecard - * - * @linecard: devlink linecard - * - * This is either called directly from the unprovision() op call or - * as a result of the unprovision() op call asynchronously. - */ -void devlink_linecard_provision_clear(struct devlink_linecard *linecard) -{ - mutex_lock(&linecard->state_lock); - WARN_ON(linecard->nested_devlink); - linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; - linecard->type = NULL; - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); - mutex_unlock(&linecard->state_lock); -} -EXPORT_SYMBOL_GPL(devlink_linecard_provision_clear); - -/** - * devlink_linecard_provision_fail - Fail provisioning on linecard - * - * @linecard: devlink linecard - * - * This is either called directly from the provision() op call or - * as a result of the provision() op call asynchronously. 
- */ -void devlink_linecard_provision_fail(struct devlink_linecard *linecard) -{ - mutex_lock(&linecard->state_lock); - WARN_ON(linecard->nested_devlink); - linecard->state = DEVLINK_LINECARD_STATE_PROVISIONING_FAILED; - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); - mutex_unlock(&linecard->state_lock); -} -EXPORT_SYMBOL_GPL(devlink_linecard_provision_fail); - -/** - * devlink_linecard_activate - Set linecard active - * - * @linecard: devlink linecard - */ -void devlink_linecard_activate(struct devlink_linecard *linecard) -{ - mutex_lock(&linecard->state_lock); - WARN_ON(linecard->state != DEVLINK_LINECARD_STATE_PROVISIONED); - linecard->state = DEVLINK_LINECARD_STATE_ACTIVE; - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); - mutex_unlock(&linecard->state_lock); -} -EXPORT_SYMBOL_GPL(devlink_linecard_activate); - -/** - * devlink_linecard_deactivate - Set linecard inactive - * - * @linecard: devlink linecard - */ -void devlink_linecard_deactivate(struct devlink_linecard *linecard) -{ - mutex_lock(&linecard->state_lock); - switch (linecard->state) { - case DEVLINK_LINECARD_STATE_ACTIVE: - linecard->state = DEVLINK_LINECARD_STATE_PROVISIONED; - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); - break; - case DEVLINK_LINECARD_STATE_UNPROVISIONING: - /* Line card is being deactivated as part - * of unprovisioning flow. - */ - break; - default: - WARN_ON(1); - break; - } - mutex_unlock(&linecard->state_lock); -} -EXPORT_SYMBOL_GPL(devlink_linecard_deactivate); - -/** - * devlink_linecard_nested_dl_set - Attach/detach nested devlink - * instance to linecard. - * - * @linecard: devlink linecard - * @nested_devlink: devlink instance to attach or NULL to detach - */ -void devlink_linecard_nested_dl_set(struct devlink_linecard *linecard, - struct devlink *nested_devlink) -{ - mutex_lock(&linecard->state_lock); - linecard->nested_devlink = nested_devlink; - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); - mutex_unlock(&linecard->state_lock); -} -EXPORT_SYMBOL_GPL(devlink_linecard_nested_dl_set); diff --git a/net/devlink/linecard.c b/net/devlink/linecard.c new file mode 100644 index 000000000000..85c32c314b0f --- /dev/null +++ b/net/devlink/linecard.c @@ -0,0 +1,606 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. 
+ * Copyright (c) 2016 Jiri Pirko + */ + +#include "devl_internal.h" + +static struct devlink_linecard * +devlink_linecard_get_by_index(struct devlink *devlink, + unsigned int linecard_index) +{ + struct devlink_linecard *devlink_linecard; + + list_for_each_entry(devlink_linecard, &devlink->linecard_list, list) { + if (devlink_linecard->index == linecard_index) + return devlink_linecard; + } + return NULL; +} + +static bool devlink_linecard_index_exists(struct devlink *devlink, + unsigned int linecard_index) +{ + return devlink_linecard_get_by_index(devlink, linecard_index); +} + +static struct devlink_linecard * +devlink_linecard_get_from_attrs(struct devlink *devlink, struct nlattr **attrs) +{ + if (attrs[DEVLINK_ATTR_LINECARD_INDEX]) { + u32 linecard_index = nla_get_u32(attrs[DEVLINK_ATTR_LINECARD_INDEX]); + struct devlink_linecard *linecard; + + linecard = devlink_linecard_get_by_index(devlink, linecard_index); + if (!linecard) + return ERR_PTR(-ENODEV); + return linecard; + } + return ERR_PTR(-EINVAL); +} + +static struct devlink_linecard * +devlink_linecard_get_from_info(struct devlink *devlink, struct genl_info *info) +{ + return devlink_linecard_get_from_attrs(devlink, info->attrs); +} + +static int devlink_nl_put_nested_handle(struct sk_buff *msg, struct devlink *devlink) +{ + struct nlattr *nested_attr; + + nested_attr = nla_nest_start(msg, DEVLINK_ATTR_NESTED_DEVLINK); + if (!nested_attr) + return -EMSGSIZE; + if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + + nla_nest_end(msg, nested_attr); + return 0; + +nla_put_failure: + nla_nest_cancel(msg, nested_attr); + return -EMSGSIZE; +} + +struct devlink_linecard_type { + const char *type; + const void *priv; +}; + +static int devlink_nl_linecard_fill(struct sk_buff *msg, + struct devlink *devlink, + struct devlink_linecard *linecard, + enum devlink_command cmd, u32 portid, + u32 seq, int flags, + struct netlink_ext_ack *extack) +{ + struct devlink_linecard_type *linecard_type; + struct nlattr *attr; + void *hdr; + int i; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + if (nla_put_u32(msg, DEVLINK_ATTR_LINECARD_INDEX, linecard->index)) + goto nla_put_failure; + if (nla_put_u8(msg, DEVLINK_ATTR_LINECARD_STATE, linecard->state)) + goto nla_put_failure; + if (linecard->type && + nla_put_string(msg, DEVLINK_ATTR_LINECARD_TYPE, linecard->type)) + goto nla_put_failure; + + if (linecard->types_count) { + attr = nla_nest_start(msg, + DEVLINK_ATTR_LINECARD_SUPPORTED_TYPES); + if (!attr) + goto nla_put_failure; + for (i = 0; i < linecard->types_count; i++) { + linecard_type = &linecard->types[i]; + if (nla_put_string(msg, DEVLINK_ATTR_LINECARD_TYPE, + linecard_type->type)) { + nla_nest_cancel(msg, attr); + goto nla_put_failure; + } + } + nla_nest_end(msg, attr); + } + + if (linecard->nested_devlink && + devlink_nl_put_nested_handle(msg, linecard->nested_devlink)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static void devlink_linecard_notify(struct devlink_linecard *linecard, + enum devlink_command cmd) +{ + struct devlink *devlink = linecard->devlink; + struct sk_buff *msg; + int err; + + WARN_ON(cmd != DEVLINK_CMD_LINECARD_NEW && + cmd != DEVLINK_CMD_LINECARD_DEL); + + if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)) + return; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) 
+ return; + + err = devlink_nl_linecard_fill(msg, devlink, linecard, cmd, 0, 0, 0, + NULL); + if (err) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), + msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); +} + +void devlink_linecards_notify_register(struct devlink *devlink) +{ + struct devlink_linecard *linecard; + + list_for_each_entry(linecard, &devlink->linecard_list, list) + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); +} + +void devlink_linecards_notify_unregister(struct devlink *devlink) +{ + struct devlink_linecard *linecard; + + list_for_each_entry_reverse(linecard, &devlink->linecard_list, list) + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_DEL); +} + +int devlink_nl_linecard_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_linecard *linecard; + struct sk_buff *msg; + int err; + + linecard = devlink_linecard_get_from_info(devlink, info); + if (IS_ERR(linecard)) + return PTR_ERR(linecard); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + mutex_lock(&linecard->state_lock); + err = devlink_nl_linecard_fill(msg, devlink, linecard, + DEVLINK_CMD_LINECARD_NEW, + info->snd_portid, info->snd_seq, 0, + info->extack); + mutex_unlock(&linecard->state_lock); + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + +static int devlink_nl_linecard_get_dump_one(struct sk_buff *msg, + struct devlink *devlink, + struct netlink_callback *cb, + int flags) +{ + struct devlink_nl_dump_state *state = devlink_dump_state(cb); + struct devlink_linecard *linecard; + int idx = 0; + int err = 0; + + list_for_each_entry(linecard, &devlink->linecard_list, list) { + if (idx < state->idx) { + idx++; + continue; + } + mutex_lock(&linecard->state_lock); + err = devlink_nl_linecard_fill(msg, devlink, linecard, + DEVLINK_CMD_LINECARD_NEW, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, flags, + cb->extack); + mutex_unlock(&linecard->state_lock); + if (err) { + state->idx = idx; + break; + } + idx++; + } + + return err; +} + +int devlink_nl_linecard_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return devlink_nl_dumpit(skb, cb, devlink_nl_linecard_get_dump_one); +} + +static struct devlink_linecard_type * +devlink_linecard_type_lookup(struct devlink_linecard *linecard, + const char *type) +{ + struct devlink_linecard_type *linecard_type; + int i; + + for (i = 0; i < linecard->types_count; i++) { + linecard_type = &linecard->types[i]; + if (!strcmp(type, linecard_type->type)) + return linecard_type; + } + return NULL; +} + +static int devlink_linecard_type_set(struct devlink_linecard *linecard, + const char *type, + struct netlink_ext_ack *extack) +{ + const struct devlink_linecard_ops *ops = linecard->ops; + struct devlink_linecard_type *linecard_type; + int err; + + mutex_lock(&linecard->state_lock); + if (linecard->state == DEVLINK_LINECARD_STATE_PROVISIONING) { + NL_SET_ERR_MSG(extack, "Line card is currently being provisioned"); + err = -EBUSY; + goto out; + } + if (linecard->state == DEVLINK_LINECARD_STATE_UNPROVISIONING) { + NL_SET_ERR_MSG(extack, "Line card is currently being unprovisioned"); + err = -EBUSY; + goto out; + } + + linecard_type = devlink_linecard_type_lookup(linecard, type); + if (!linecard_type) { + NL_SET_ERR_MSG(extack, "Unsupported line card type provided"); + err = -EINVAL; + goto out; + } + + if (linecard->state != 
DEVLINK_LINECARD_STATE_UNPROVISIONED && + linecard->state != DEVLINK_LINECARD_STATE_PROVISIONING_FAILED) { + NL_SET_ERR_MSG(extack, "Line card already provisioned"); + err = -EBUSY; + /* Check if the line card is provisioned in the same + * way the user asks. In case it is, make the operation + * to return success. + */ + if (ops->same_provision && + ops->same_provision(linecard, linecard->priv, + linecard_type->type, + linecard_type->priv)) + err = 0; + goto out; + } + + linecard->state = DEVLINK_LINECARD_STATE_PROVISIONING; + linecard->type = linecard_type->type; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + mutex_unlock(&linecard->state_lock); + err = ops->provision(linecard, linecard->priv, linecard_type->type, + linecard_type->priv, extack); + if (err) { + /* Provisioning failed. Assume the linecard is unprovisioned + * for future operations. + */ + mutex_lock(&linecard->state_lock); + linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; + linecard->type = NULL; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + mutex_unlock(&linecard->state_lock); + } + return err; + +out: + mutex_unlock(&linecard->state_lock); + return err; +} + +static int devlink_linecard_type_unset(struct devlink_linecard *linecard, + struct netlink_ext_ack *extack) +{ + int err; + + mutex_lock(&linecard->state_lock); + if (linecard->state == DEVLINK_LINECARD_STATE_PROVISIONING) { + NL_SET_ERR_MSG(extack, "Line card is currently being provisioned"); + err = -EBUSY; + goto out; + } + if (linecard->state == DEVLINK_LINECARD_STATE_UNPROVISIONING) { + NL_SET_ERR_MSG(extack, "Line card is currently being unprovisioned"); + err = -EBUSY; + goto out; + } + if (linecard->state == DEVLINK_LINECARD_STATE_PROVISIONING_FAILED) { + linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; + linecard->type = NULL; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + err = 0; + goto out; + } + + if (linecard->state == DEVLINK_LINECARD_STATE_UNPROVISIONED) { + NL_SET_ERR_MSG(extack, "Line card is not provisioned"); + err = 0; + goto out; + } + linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONING; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + mutex_unlock(&linecard->state_lock); + err = linecard->ops->unprovision(linecard, linecard->priv, + extack); + if (err) { + /* Unprovisioning failed. Assume the linecard is unprovisioned + * for future operations. 
+ */ + mutex_lock(&linecard->state_lock); + linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; + linecard->type = NULL; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + mutex_unlock(&linecard->state_lock); + } + return err; + +out: + mutex_unlock(&linecard->state_lock); + return err; +} + +int devlink_nl_cmd_linecard_set_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct netlink_ext_ack *extack = info->extack; + struct devlink *devlink = info->user_ptr[0]; + struct devlink_linecard *linecard; + int err; + + linecard = devlink_linecard_get_from_info(devlink, info); + if (IS_ERR(linecard)) + return PTR_ERR(linecard); + + if (info->attrs[DEVLINK_ATTR_LINECARD_TYPE]) { + const char *type; + + type = nla_data(info->attrs[DEVLINK_ATTR_LINECARD_TYPE]); + if (strcmp(type, "")) { + err = devlink_linecard_type_set(linecard, type, extack); + if (err) + return err; + } else { + err = devlink_linecard_type_unset(linecard, extack); + if (err) + return err; + } + } + + return 0; +} + +static int devlink_linecard_types_init(struct devlink_linecard *linecard) +{ + struct devlink_linecard_type *linecard_type; + unsigned int count; + int i; + + count = linecard->ops->types_count(linecard, linecard->priv); + linecard->types = kmalloc_array(count, sizeof(*linecard_type), + GFP_KERNEL); + if (!linecard->types) + return -ENOMEM; + linecard->types_count = count; + + for (i = 0; i < count; i++) { + linecard_type = &linecard->types[i]; + linecard->ops->types_get(linecard, linecard->priv, i, + &linecard_type->type, + &linecard_type->priv); + } + return 0; +} + +static void devlink_linecard_types_fini(struct devlink_linecard *linecard) +{ + kfree(linecard->types); +} + +/** + * devl_linecard_create - Create devlink linecard + * + * @devlink: devlink + * @linecard_index: driver-specific numerical identifier of the linecard + * @ops: linecards ops + * @priv: user priv pointer + * + * Create devlink linecard instance with provided linecard index. + * Caller can use any indexing, even hw-related one. + * + * Return: Line card structure or an ERR_PTR() encoded error code. 
+ */ +struct devlink_linecard * +devl_linecard_create(struct devlink *devlink, unsigned int linecard_index, + const struct devlink_linecard_ops *ops, void *priv) +{ + struct devlink_linecard *linecard; + int err; + + if (WARN_ON(!ops || !ops->provision || !ops->unprovision || + !ops->types_count || !ops->types_get)) + return ERR_PTR(-EINVAL); + + if (devlink_linecard_index_exists(devlink, linecard_index)) + return ERR_PTR(-EEXIST); + + linecard = kzalloc(sizeof(*linecard), GFP_KERNEL); + if (!linecard) + return ERR_PTR(-ENOMEM); + + linecard->devlink = devlink; + linecard->index = linecard_index; + linecard->ops = ops; + linecard->priv = priv; + linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; + mutex_init(&linecard->state_lock); + + err = devlink_linecard_types_init(linecard); + if (err) { + mutex_destroy(&linecard->state_lock); + kfree(linecard); + return ERR_PTR(err); + } + + list_add_tail(&linecard->list, &devlink->linecard_list); + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + return linecard; +} +EXPORT_SYMBOL_GPL(devl_linecard_create); + +/** + * devl_linecard_destroy - Destroy devlink linecard + * + * @linecard: devlink linecard + */ +void devl_linecard_destroy(struct devlink_linecard *linecard) +{ + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_DEL); + list_del(&linecard->list); + devlink_linecard_types_fini(linecard); + mutex_destroy(&linecard->state_lock); + kfree(linecard); +} +EXPORT_SYMBOL_GPL(devl_linecard_destroy); + +/** + * devlink_linecard_provision_set - Set provisioning on linecard + * + * @linecard: devlink linecard + * @type: linecard type + * + * This is either called directly from the provision() op call or + * as a result of the provision() op call asynchronously. + */ +void devlink_linecard_provision_set(struct devlink_linecard *linecard, + const char *type) +{ + mutex_lock(&linecard->state_lock); + WARN_ON(linecard->type && strcmp(linecard->type, type)); + linecard->state = DEVLINK_LINECARD_STATE_PROVISIONED; + linecard->type = type; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + mutex_unlock(&linecard->state_lock); +} +EXPORT_SYMBOL_GPL(devlink_linecard_provision_set); + +/** + * devlink_linecard_provision_clear - Clear provisioning on linecard + * + * @linecard: devlink linecard + * + * This is either called directly from the unprovision() op call or + * as a result of the unprovision() op call asynchronously. + */ +void devlink_linecard_provision_clear(struct devlink_linecard *linecard) +{ + mutex_lock(&linecard->state_lock); + WARN_ON(linecard->nested_devlink); + linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; + linecard->type = NULL; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + mutex_unlock(&linecard->state_lock); +} +EXPORT_SYMBOL_GPL(devlink_linecard_provision_clear); + +/** + * devlink_linecard_provision_fail - Fail provisioning on linecard + * + * @linecard: devlink linecard + * + * This is either called directly from the provision() op call or + * as a result of the provision() op call asynchronously. 
+ */ +void devlink_linecard_provision_fail(struct devlink_linecard *linecard) +{ + mutex_lock(&linecard->state_lock); + WARN_ON(linecard->nested_devlink); + linecard->state = DEVLINK_LINECARD_STATE_PROVISIONING_FAILED; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + mutex_unlock(&linecard->state_lock); +} +EXPORT_SYMBOL_GPL(devlink_linecard_provision_fail); + +/** + * devlink_linecard_activate - Set linecard active + * + * @linecard: devlink linecard + */ +void devlink_linecard_activate(struct devlink_linecard *linecard) +{ + mutex_lock(&linecard->state_lock); + WARN_ON(linecard->state != DEVLINK_LINECARD_STATE_PROVISIONED); + linecard->state = DEVLINK_LINECARD_STATE_ACTIVE; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + mutex_unlock(&linecard->state_lock); +} +EXPORT_SYMBOL_GPL(devlink_linecard_activate); + +/** + * devlink_linecard_deactivate - Set linecard inactive + * + * @linecard: devlink linecard + */ +void devlink_linecard_deactivate(struct devlink_linecard *linecard) +{ + mutex_lock(&linecard->state_lock); + switch (linecard->state) { + case DEVLINK_LINECARD_STATE_ACTIVE: + linecard->state = DEVLINK_LINECARD_STATE_PROVISIONED; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + break; + case DEVLINK_LINECARD_STATE_UNPROVISIONING: + /* Line card is being deactivated as part + * of unprovisioning flow. + */ + break; + default: + WARN_ON(1); + break; + } + mutex_unlock(&linecard->state_lock); +} +EXPORT_SYMBOL_GPL(devlink_linecard_deactivate); + +/** + * devlink_linecard_nested_dl_set - Attach/detach nested devlink + * instance to linecard. + * + * @linecard: devlink linecard + * @nested_devlink: devlink instance to attach or NULL to detach + */ +void devlink_linecard_nested_dl_set(struct devlink_linecard *linecard, + struct devlink *nested_devlink) +{ + mutex_lock(&linecard->state_lock); + linecard->nested_devlink = nested_devlink; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + mutex_unlock(&linecard->state_lock); +} +EXPORT_SYMBOL_GPL(devlink_linecard_nested_dl_set); -- cgit v1.2.3 From 890c556674377c0abba4ab91ff6f1962175d578c Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 28 Aug 2023 08:16:55 +0200 Subject: devlink: move tracepoint definitions into core.c Move remaining tracepoint definitions to most suitable file core.c. 
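For context (not part of the patch): the tracepoint definitions can live in any single compilation unit, because the devlink trace header (trace/events/devlink.h) only expands TRACE_EVENT() into definitions in the one file that sets CREATE_TRACE_POINTS before including it; every other includer gets declarations only. The block that moves into core.c is essentially:

        /* In exactly one .c file per trace header, now net/devlink/core.c: */
        #define CREATE_TRACE_POINTS
        #include <trace/events/devlink.h>   /* expands TRACE_EVENT() into definitions */

        EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwmsg);
        EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwerr);
        EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_trap_report);

Callers elsewhere keep including the same header without the define and keep calling trace_devlink_hwmsg() and friends; only the home of the definitions changes, so there is no functional change.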
Signed-off-by: Jiri Pirko Link: https://lore.kernel.org/r/20230828061657.300667-14-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- net/devlink/core.c | 6 ++++++ net/devlink/leftover.c | 6 ------ 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/devlink/core.c b/net/devlink/core.c index c23ebabadc52..6cec4afb01fb 100644 --- a/net/devlink/core.c +++ b/net/devlink/core.c @@ -5,9 +5,15 @@ */ #include +#define CREATE_TRACE_POINTS +#include #include "devl_internal.h" +EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwmsg); +EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwerr); +EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_trap_report); + DEFINE_XARRAY_FLAGS(devlinks, XA_FLAGS_ALLOC); void *devlink_priv(struct devlink *devlink) diff --git a/net/devlink/leftover.c b/net/devlink/leftover.c index 98ccb3a8393d..a477cdbab940 100644 --- a/net/devlink/leftover.c +++ b/net/devlink/leftover.c @@ -28,15 +28,9 @@ #include #include #include -#define CREATE_TRACE_POINTS -#include #include "devl_internal.h" -EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwmsg); -EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwerr); -EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_trap_report); - const struct genl_small_ops devlink_nl_small_ops[40] = { { .cmd = DEVLINK_CMD_PORT_SET, -- cgit v1.2.3 From 29a390d17748d93f9e6bc6fb0e09d89571aa25f6 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 28 Aug 2023 08:16:56 +0200 Subject: devlink: move small_ops definition into netlink.c Move the generic netlink small_ops definition where they are consumed, into netlink.c Signed-off-by: Jiri Pirko Link: https://lore.kernel.org/r/20230828061657.300667-15-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- net/devlink/devl_internal.h | 2 - net/devlink/leftover.c | 251 -------------------------------------------- net/devlink/netlink.c | 251 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 251 insertions(+), 253 deletions(-) (limited to 'net') diff --git a/net/devlink/devl_internal.h b/net/devlink/devl_internal.h index 8c81f731a4c7..efca6abf7af7 100644 --- a/net/devlink/devl_internal.h +++ b/net/devlink/devl_internal.h @@ -121,8 +121,6 @@ typedef int devlink_nl_dump_one_func_t(struct sk_buff *msg, struct netlink_callback *cb, int flags); -extern const struct genl_small_ops devlink_nl_small_ops[40]; - struct devlink * devlink_get_from_attrs_lock(struct net *net, struct nlattr **attrs); diff --git a/net/devlink/leftover.c b/net/devlink/leftover.c index a477cdbab940..05e056d6d5ea 100644 --- a/net/devlink/leftover.c +++ b/net/devlink/leftover.c @@ -31,257 +31,6 @@ #include "devl_internal.h" -const struct genl_small_ops devlink_nl_small_ops[40] = { - { - .cmd = DEVLINK_CMD_PORT_SET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_port_set_doit, - .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, - }, - { - .cmd = DEVLINK_CMD_RATE_SET, - .doit = devlink_nl_cmd_rate_set_doit, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_RATE_NEW, - .doit = devlink_nl_cmd_rate_new_doit, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_RATE_DEL, - .doit = devlink_nl_cmd_rate_del_doit, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_PORT_SPLIT, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_port_split_doit, - .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, - }, - { - .cmd = DEVLINK_CMD_PORT_UNSPLIT, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_port_unsplit_doit, 
- .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, - }, - { - .cmd = DEVLINK_CMD_PORT_NEW, - .doit = devlink_nl_cmd_port_new_doit, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_PORT_DEL, - .doit = devlink_nl_cmd_port_del_doit, - .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, - }, - - { - .cmd = DEVLINK_CMD_LINECARD_SET, - .doit = devlink_nl_cmd_linecard_set_doit, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_SB_POOL_SET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_sb_pool_set_doit, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_SB_PORT_POOL_SET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_sb_port_pool_set_doit, - .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, - }, - { - .cmd = DEVLINK_CMD_SB_TC_POOL_BIND_SET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_sb_tc_pool_bind_set_doit, - .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, - }, - { - .cmd = DEVLINK_CMD_SB_OCC_SNAPSHOT, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_sb_occ_snapshot_doit, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_SB_OCC_MAX_CLEAR, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_sb_occ_max_clear_doit, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_ESWITCH_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_eswitch_get_doit, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_ESWITCH_SET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_eswitch_set_doit, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_DPIPE_TABLE_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_dpipe_table_get, - /* can be retrieved by unprivileged users */ - }, - { - .cmd = DEVLINK_CMD_DPIPE_ENTRIES_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_dpipe_entries_get, - /* can be retrieved by unprivileged users */ - }, - { - .cmd = DEVLINK_CMD_DPIPE_HEADERS_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_dpipe_headers_get, - /* can be retrieved by unprivileged users */ - }, - { - .cmd = DEVLINK_CMD_DPIPE_TABLE_COUNTERS_SET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_dpipe_table_counters_set, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_RESOURCE_SET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_resource_set, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_RESOURCE_DUMP, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_resource_dump, - /* can be retrieved by unprivileged users */ - }, - { - .cmd = DEVLINK_CMD_RELOAD, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_reload, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_PARAM_SET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_param_set_doit, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_PORT_PARAM_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = 
devlink_nl_cmd_port_param_get_doit, - .dumpit = devlink_nl_cmd_port_param_get_dumpit, - .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, - /* can be retrieved by unprivileged users */ - }, - { - .cmd = DEVLINK_CMD_PORT_PARAM_SET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_port_param_set_doit, - .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, - }, - { - .cmd = DEVLINK_CMD_REGION_NEW, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_region_new, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_REGION_DEL, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_region_del, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_REGION_READ, - .validate = GENL_DONT_VALIDATE_STRICT | - GENL_DONT_VALIDATE_DUMP_STRICT, - .dumpit = devlink_nl_cmd_region_read_dumpit, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_HEALTH_REPORTER_SET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_health_reporter_set_doit, - .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT, - }, - { - .cmd = DEVLINK_CMD_HEALTH_REPORTER_RECOVER, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_health_reporter_recover_doit, - .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT, - }, - { - .cmd = DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_health_reporter_diagnose_doit, - .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT, - }, - { - .cmd = DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET, - .validate = GENL_DONT_VALIDATE_STRICT | - GENL_DONT_VALIDATE_DUMP_STRICT, - .dumpit = devlink_nl_cmd_health_reporter_dump_get_dumpit, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_HEALTH_REPORTER_DUMP_CLEAR, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_health_reporter_dump_clear_doit, - .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT, - }, - { - .cmd = DEVLINK_CMD_HEALTH_REPORTER_TEST, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_health_reporter_test_doit, - .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT, - }, - { - .cmd = DEVLINK_CMD_FLASH_UPDATE, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = devlink_nl_cmd_flash_update, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_TRAP_SET, - .doit = devlink_nl_cmd_trap_set_doit, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_TRAP_GROUP_SET, - .doit = devlink_nl_cmd_trap_group_set_doit, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_TRAP_POLICER_SET, - .doit = devlink_nl_cmd_trap_policer_set_doit, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = DEVLINK_CMD_SELFTESTS_RUN, - .doit = devlink_nl_cmd_selftests_run, - .flags = GENL_ADMIN_PERM, - }, - /* -- No new ops here! Use split ops going forward! 
-};
-
 void devlink_notify_register(struct devlink *devlink)
 {
 	devlink_notify(devlink, DEVLINK_CMD_NEW);
diff --git a/net/devlink/netlink.c b/net/devlink/netlink.c
index 5f57afd31dea..fc3e7c029a3b 100644
--- a/net/devlink/netlink.c
+++ b/net/devlink/netlink.c
@@ -255,6 +255,257 @@ int devlink_nl_dumpit(struct sk_buff *msg, struct netlink_callback *cb,
 	return devlink_nl_inst_iter_dumpit(msg, cb, flags, dump_one);
 }
 
+static const struct genl_small_ops devlink_nl_small_ops[40] = {
+	{
+		.cmd = DEVLINK_CMD_PORT_SET,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit = devlink_nl_cmd_port_set_doit,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_PORT,
+	},
+	{
+		.cmd = DEVLINK_CMD_RATE_SET,
+		.doit = devlink_nl_cmd_rate_set_doit,
+		.flags = GENL_ADMIN_PERM,
+	},
+	{
+		.cmd = DEVLINK_CMD_RATE_NEW,
+		.doit = devlink_nl_cmd_rate_new_doit,
+		.flags = GENL_ADMIN_PERM,
+	},
+	{
+		.cmd = DEVLINK_CMD_RATE_DEL,
+		.doit = devlink_nl_cmd_rate_del_doit,
+		.flags = GENL_ADMIN_PERM,
+	},
+	{
+		.cmd = DEVLINK_CMD_PORT_SPLIT,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit = devlink_nl_cmd_port_split_doit,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_PORT,
+	},
+	{
+		.cmd = DEVLINK_CMD_PORT_UNSPLIT,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit = devlink_nl_cmd_port_unsplit_doit,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_PORT,
+	},
+	{
+		.cmd = DEVLINK_CMD_PORT_NEW,
+		.doit = devlink_nl_cmd_port_new_doit,
+		.flags = GENL_ADMIN_PERM,
+	},
+	{
+		.cmd = DEVLINK_CMD_PORT_DEL,
+		.doit = devlink_nl_cmd_port_del_doit,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_PORT,
+	},
+
+	{
+		.cmd = DEVLINK_CMD_LINECARD_SET,
+		.doit = devlink_nl_cmd_linecard_set_doit,
+		.flags = GENL_ADMIN_PERM,
+	},
+	{
+		.cmd = DEVLINK_CMD_SB_POOL_SET,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit = devlink_nl_cmd_sb_pool_set_doit,
+		.flags = GENL_ADMIN_PERM,
+	},
+	{
+		.cmd = DEVLINK_CMD_SB_PORT_POOL_SET,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit = devlink_nl_cmd_sb_port_pool_set_doit,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_PORT,
+	},
+	{
+		.cmd = DEVLINK_CMD_SB_TC_POOL_BIND_SET,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit = devlink_nl_cmd_sb_tc_pool_bind_set_doit,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_PORT,
+	},
+	{
+		.cmd = DEVLINK_CMD_SB_OCC_SNAPSHOT,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit = devlink_nl_cmd_sb_occ_snapshot_doit,
+		.flags = GENL_ADMIN_PERM,
+	},
+	{
+		.cmd = DEVLINK_CMD_SB_OCC_MAX_CLEAR,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit = devlink_nl_cmd_sb_occ_max_clear_doit,
+		.flags = GENL_ADMIN_PERM,
+	},
+	{
+		.cmd = DEVLINK_CMD_ESWITCH_GET,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit = devlink_nl_cmd_eswitch_get_doit,
+		.flags = GENL_ADMIN_PERM,
+	},
+	{
+		.cmd = DEVLINK_CMD_ESWITCH_SET,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit = devlink_nl_cmd_eswitch_set_doit,
+		.flags = GENL_ADMIN_PERM,
+	},
+	{
+		.cmd = DEVLINK_CMD_DPIPE_TABLE_GET,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit = devlink_nl_cmd_dpipe_table_get,
+		/* can be retrieved by unprivileged users */
+	},
+	{
+		.cmd = DEVLINK_CMD_DPIPE_ENTRIES_GET,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit = devlink_nl_cmd_dpipe_entries_get,
+		/* can be retrieved by unprivileged users */
+	},
+	{
+		.cmd = DEVLINK_CMD_DPIPE_HEADERS_GET,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit = devlink_nl_cmd_dpipe_headers_get,
+		/* can be retrieved by unprivileged users */
+	},
+	{
+		.cmd = DEVLINK_CMD_DPIPE_TABLE_COUNTERS_SET,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit = devlink_nl_cmd_dpipe_table_counters_set,
+		.flags = GENL_ADMIN_PERM,
+	},
+	{
+		.cmd = DEVLINK_CMD_RESOURCE_SET,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit = devlink_nl_cmd_resource_set,
+		.flags = GENL_ADMIN_PERM,
+	},
+	{
+		.cmd = DEVLINK_CMD_RESOURCE_DUMP,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit = devlink_nl_cmd_resource_dump,
+		/* can be retrieved by unprivileged users */
+	},
+	{
+		.cmd = DEVLINK_CMD_RELOAD,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit = devlink_nl_cmd_reload,
+		.flags = GENL_ADMIN_PERM,
+	},
+	{
+		.cmd = DEVLINK_CMD_PARAM_SET,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit = devlink_nl_cmd_param_set_doit,
+		.flags = GENL_ADMIN_PERM,
+	},
+	{
+		.cmd = DEVLINK_CMD_PORT_PARAM_GET,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit = devlink_nl_cmd_port_param_get_doit,
+		.dumpit = devlink_nl_cmd_port_param_get_dumpit,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_PORT,
+		/* can be retrieved by unprivileged users */
+	},
+	{
+		.cmd = DEVLINK_CMD_PORT_PARAM_SET,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit = devlink_nl_cmd_port_param_set_doit,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_PORT,
+	},
+	{
+		.cmd = DEVLINK_CMD_REGION_NEW,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit = devlink_nl_cmd_region_new,
+		.flags = GENL_ADMIN_PERM,
+	},
+	{
+		.cmd = DEVLINK_CMD_REGION_DEL,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit = devlink_nl_cmd_region_del,
+		.flags = GENL_ADMIN_PERM,
+	},
+	{
+		.cmd = DEVLINK_CMD_REGION_READ,
+		.validate = GENL_DONT_VALIDATE_STRICT |
+			    GENL_DONT_VALIDATE_DUMP_STRICT,
+		.dumpit = devlink_nl_cmd_region_read_dumpit,
+		.flags = GENL_ADMIN_PERM,
+	},
+	{
+		.cmd = DEVLINK_CMD_HEALTH_REPORTER_SET,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit = devlink_nl_cmd_health_reporter_set_doit,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT,
+	},
+	{
+		.cmd = DEVLINK_CMD_HEALTH_REPORTER_RECOVER,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit = devlink_nl_cmd_health_reporter_recover_doit,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT,
+	},
+	{
+		.cmd = DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit = devlink_nl_cmd_health_reporter_diagnose_doit,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT,
+	},
+	{
+		.cmd = DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET,
+		.validate = GENL_DONT_VALIDATE_STRICT |
+			    GENL_DONT_VALIDATE_DUMP_STRICT,
+		.dumpit = devlink_nl_cmd_health_reporter_dump_get_dumpit,
+		.flags = GENL_ADMIN_PERM,
+	},
+	{
+		.cmd = DEVLINK_CMD_HEALTH_REPORTER_DUMP_CLEAR,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit = devlink_nl_cmd_health_reporter_dump_clear_doit,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT,
+	},
+	{
+		.cmd = DEVLINK_CMD_HEALTH_REPORTER_TEST,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit = devlink_nl_cmd_health_reporter_test_doit,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT,
+	},
+	{
+		.cmd = DEVLINK_CMD_FLASH_UPDATE,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit = devlink_nl_cmd_flash_update,
+		.flags = GENL_ADMIN_PERM,
+	},
+	{
+		.cmd = DEVLINK_CMD_TRAP_SET,
+		.doit = devlink_nl_cmd_trap_set_doit,
+		.flags = GENL_ADMIN_PERM,
+	},
+	{
+		.cmd = DEVLINK_CMD_TRAP_GROUP_SET,
+		.doit = devlink_nl_cmd_trap_group_set_doit,
+		.flags = GENL_ADMIN_PERM,
+	},
+	{
+		.cmd = DEVLINK_CMD_TRAP_POLICER_SET,
+		.doit = devlink_nl_cmd_trap_policer_set_doit,
+		.flags = GENL_ADMIN_PERM,
+	},
+	{
+		.cmd = DEVLINK_CMD_SELFTESTS_RUN,
+		.doit = devlink_nl_cmd_selftests_run,
+		.flags = GENL_ADMIN_PERM,
+	},
+	/* -- No new ops here! Use split ops going forward! -- */
+};
+
 struct genl_family devlink_nl_family __ro_after_init = {
 	.name = DEVLINK_GENL_NAME,
 	.version = DEVLINK_GENL_VERSION,
-- 
cgit v1.2.3
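For readers who have not worked with generic netlink before: each devlink_nl_small_ops[] entry above binds one DEVLINK_CMD_* command number to a doit() handler (one call per request) or a dumpit() handler (iterated multi-part replies), and GENL_ADMIN_PERM is what makes the *_SET/*_NEW/*_DEL commands require CAP_NET_ADMIN, while the entries commented "can be retrieved by unprivileged users" simply leave that flag out. The table is presumably referenced from devlink_nl_family through its small_ops/n_small_ops members, although that wiring is not visible in this hunk. Below is a minimal, self-contained sketch of the same pattern; the family name, command enum and handler are hypothetical, and only the genetlink types, GENL_ADMIN_PERM and genl_register_family()/genl_unregister_family() are real kernel API.

/* Hypothetical example family, not part of devlink; shown only to
 * illustrate how genl_small_ops entries plug into a genl_family.
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <net/genetlink.h>

enum {
	FOO_CMD_UNSPEC,	/* command 0 is conventionally left unused */
	FOO_CMD_SET,	/* hypothetical command */
};

/* doit() is invoked once per request message sent to the family */
static int foo_cmd_set_doit(struct sk_buff *skb, struct genl_info *info)
{
	return 0;
}

/* Each entry maps a command number to handlers and permission flags,
 * just like the devlink_nl_small_ops[] entries above.
 */
static const struct genl_small_ops foo_small_ops[] = {
	{
		.cmd = FOO_CMD_SET,
		.doit = foo_cmd_set_doit,
		.flags = GENL_ADMIN_PERM,	/* require CAP_NET_ADMIN */
	},
};

static struct genl_family foo_family __ro_after_init = {
	.name = "foo_example",
	.version = 1,
	.module = THIS_MODULE,
	.small_ops = foo_small_ops,
	.n_small_ops = ARRAY_SIZE(foo_small_ops),
};

static int __init foo_init(void)
{
	return genl_register_family(&foo_family);
}
module_init(foo_init);

static void __exit foo_exit(void)
{
	genl_unregister_family(&foo_family);
}
module_exit(foo_exit);

MODULE_LICENSE("GPL");

The closing comment in the moved array makes the policy explicit: this legacy small-ops table is frozen, and any new devlink command is expected to be added as a split op instead.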
From 71179ac5c21185171556bc438d5f22d566948d7f Mon Sep 17 00:00:00 2001
From: Jiri Pirko
Date: Mon, 28 Aug 2023 08:16:57 +0200
Subject: devlink: move devlink_notify_register/unregister() to dev.c

At last, move the last bits out of leftover.c, the
devlink_notify_register/unregister() functions to dev.c

Signed-off-by: Jiri Pirko
Link: https://lore.kernel.org/r/20230828061657.300667-16-jiri@resnulli.us
Signed-off-by: Jakub Kicinski
---
 net/devlink/Makefile        |  2 +-
 net/devlink/dev.c           | 28 +++++++++++++++++++++++++-
 net/devlink/devl_internal.h |  6 ++---
 net/devlink/leftover.c      | 58 ---------------------------------------------
 4 files changed, 30 insertions(+), 64 deletions(-)
 delete mode 100644 net/devlink/leftover.c

(limited to 'net')

diff --git a/net/devlink/Makefile b/net/devlink/Makefile
index 71f490d301d7..000da622116a 100644
--- a/net/devlink/Makefile
+++ b/net/devlink/Makefile
@@ -1,4 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0
 
-obj-y := leftover.o core.o netlink.o netlink_gen.o dev.o port.o sb.o dpipe.o \
-	 resource.o param.o region.o health.o trap.o rate.o linecard.o
+obj-y := core.o netlink.o netlink_gen.o dev.o port.o sb.o dpipe.o \
+	 resource.o param.o region.o health.o trap.o rate.o linecard.o
diff --git a/net/devlink/dev.c b/net/devlink/dev.c
index abf3393a7a17..bba4ace7d22b 100644
--- a/net/devlink/dev.c
+++ b/net/devlink/dev.c
@@ -174,7 +174,7 @@ nla_put_failure:
 	return -EMSGSIZE;
 }
 
-void devlink_notify(struct devlink *devlink, enum devlink_command cmd)
+static void devlink_notify(struct devlink *devlink, enum devlink_command cmd)
 {
 	struct sk_buff *msg;
 	int err;
@@ -230,6 +230,32 @@ int devlink_nl_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb)
 	return devlink_nl_dumpit(msg, cb, devlink_nl_get_dump_one);
 }
 
+void devlink_notify_register(struct devlink *devlink)
+{
+	devlink_notify(devlink, DEVLINK_CMD_NEW);
+	devlink_linecards_notify_register(devlink);
+	devlink_ports_notify_register(devlink);
+	devlink_trap_policers_notify_register(devlink);
+	devlink_trap_groups_notify_register(devlink);
+	devlink_traps_notify_register(devlink);
+	devlink_rates_notify_register(devlink);
+	devlink_regions_notify_register(devlink);
+	devlink_params_notify_register(devlink);
+}
+
+void devlink_notify_unregister(struct devlink *devlink)
+{
+	devlink_params_notify_unregister(devlink);
+	devlink_regions_notify_unregister(devlink);
+	devlink_rates_notify_unregister(devlink);
+	devlink_traps_notify_unregister(devlink);
+	devlink_trap_groups_notify_unregister(devlink);
+	devlink_trap_policers_notify_unregister(devlink);
+	devlink_ports_notify_unregister(devlink);
+	devlink_linecards_notify_unregister(devlink);
+	devlink_notify(devlink, DEVLINK_CMD_DEL);
+}
+
 static void devlink_reload_failed_set(struct devlink *devlink,
 				      bool reload_failed)
 {
diff --git a/net/devlink/devl_internal.h b/net/devlink/devl_internal.h
index efca6abf7af7..f6b5fea2e13c 100644
--- a/net/devlink/devl_internal.h
+++ b/net/devlink/devl_internal.h
@@ -124,9 +124,6 @@ typedef int devlink_nl_dump_one_func_t(struct sk_buff *msg,
 struct devlink *
 devlink_get_from_attrs_lock(struct net *net, struct nlattr **attrs);
 
-void devlink_notify_unregister(struct devlink *devlink);
-void devlink_notify_register(struct devlink *devlink);
-
 int devlink_nl_dumpit(struct sk_buff *msg, struct netlink_callback *cb,
 		      devlink_nl_dump_one_func_t *dump_one);
 
@@ -151,7 +148,8 @@ devlink_nl_put_handle(struct sk_buff *msg, struct devlink *devlink)
 int devlink_nl_msg_reply_and_new(struct sk_buff **msg, struct genl_info *info);
 
 /* Notify */
-void devlink_notify(struct devlink *devlink, enum devlink_command cmd);
+void devlink_notify_register(struct devlink *devlink);
+void devlink_notify_unregister(struct devlink *devlink);
 void devlink_ports_notify_register(struct devlink *devlink);
 void devlink_ports_notify_unregister(struct devlink *devlink);
 void devlink_params_notify_register(struct devlink *devlink);
diff --git a/net/devlink/leftover.c b/net/devlink/leftover.c
deleted file mode 100644
index 05e056d6d5ea..000000000000
--- a/net/devlink/leftover.c
+++ /dev/null
@@ -1,58 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * net/core/devlink.c - Network physical/parent device Netlink interface
- *
- * Heavily inspired by net/wireless/
- * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
- * Copyright (c) 2016 Jiri Pirko
- */
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-#include "devl_internal.h"
-
-void devlink_notify_register(struct devlink *devlink)
-{
-	devlink_notify(devlink, DEVLINK_CMD_NEW);
-	devlink_linecards_notify_register(devlink);
-	devlink_ports_notify_register(devlink);
-	devlink_trap_policers_notify_register(devlink);
-	devlink_trap_groups_notify_register(devlink);
-	devlink_traps_notify_register(devlink);
-	devlink_rates_notify_register(devlink);
-	devlink_regions_notify_register(devlink);
-	devlink_params_notify_register(devlink);
-}
-
-void devlink_notify_unregister(struct devlink *devlink)
-{
-	devlink_params_notify_unregister(devlink);
-	devlink_regions_notify_unregister(devlink);
-	devlink_rates_notify_unregister(devlink);
-	devlink_traps_notify_unregister(devlink);
-	devlink_trap_groups_notify_unregister(devlink);
-	devlink_trap_policers_notify_unregister(devlink);
-	devlink_ports_notify_unregister(devlink);
-	devlink_linecards_notify_unregister(devlink);
-	devlink_notify(devlink, DEVLINK_CMD_DEL);
-}
-- 
cgit v1.2.3
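Taken together, the two helpers that ended up in dev.c are devlink's "replay everything" hooks: devlink_notify_register() emits DEVLINK_CMD_NEW for the instance and then NEW notifications for every object it already owns (line cards, ports, trap policers, trap groups, traps, rates, regions, params), and devlink_notify_unregister() replays the same set in reverse, finishing with DEVLINK_CMD_DEL. The caller is not part of this patch; the sketch below is only an assumption about how the instance registration path in net/devlink/core.c is expected to drive them, with the surrounding bookkeeping reduced to comments.

/* Hedged sketch only; the real devlink_register()/devlink_unregister()
 * live in net/devlink/core.c and are not shown in this series.
 */
#include <net/devlink.h>
#include "devl_internal.h"	/* declares devlink_notify_register/unregister */

void sketch_devlink_register(struct devlink *devlink)
{
	/* ... assumed bookkeeping: mark the instance as registered ... */

	/* Announce the instance, then replay NEW notifications for all
	 * objects it already carries, so netlink listeners get a full
	 * snapshot without dumping each object type themselves.
	 */
	devlink_notify_register(devlink);
}

void sketch_devlink_unregister(struct devlink *devlink)
{
	/* Mirror image of registration: per-object DEL notifications go
	 * out first, DEVLINK_CMD_DEL for the instance itself goes last.
	 */
	devlink_notify_unregister(devlink);

	/* ... assumed bookkeeping: clear the registered mark ... */
}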