From 8033c6c2fed235b3d571b5a5ede302b752bc5c7d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 26 Jan 2022 10:54:12 -0800 Subject: bpf: remove unused static inlines Remove two dead stubs, sk_msg_clear_meta() was never used, use of xskq_cons_is_full() got replaced by xsk_tx_writeable() in v5.10. Signed-off-by: Jakub Kicinski Link: https://lore.kernel.org/r/20220126185412.2776254-1-kuba@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/skmsg.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux/skmsg.h') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 18a717fe62eb..ddde5f620901 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -171,11 +171,6 @@ static inline u32 sk_msg_iter_dist(u32 start, u32 end) #define sk_msg_iter_next(msg, which) \ sk_msg_iter_var_next(msg->sg.which) -static inline void sk_msg_clear_meta(struct sk_msg *msg) -{ - memset(&msg->sg, 0, offsetofend(struct sk_msg_sg, copy)); -} - static inline void sk_msg_init(struct sk_msg *msg) { BUILD_BUG_ON(ARRAY_SIZE(msg->sg.data) - 1 != NR_MSG_FRAG_IDS); -- cgit v1.2.3 From 5a8fb33e530512ee67a11b30f3451a4f030f4b01 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 4 Feb 2022 20:56:14 -0800 Subject: skmsg: convert struct sk_msg_sg::copy to a bitmap We have plans for increasing MAX_SKB_FRAGS, but sk_msg_sg::copy is currently an unsigned long, limiting MAX_SKB_FRAGS to 30 on 32bit arches. Convert it to a bitmap, as Jakub suggested. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skmsg.h | 11 +++++------ net/core/filter.c | 4 ++-- 2 files changed, 7 insertions(+), 8 deletions(-) (limited to 'include/linux/skmsg.h') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 18a717fe62eb..1ff68a88c58d 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -29,7 +29,7 @@ struct sk_msg_sg { u32 end; u32 size; u32 copybreak; - unsigned long copy; + DECLARE_BITMAP(copy, MAX_MSG_FRAGS + 2); /* The extra two elements: * 1) used for chaining the front and sections when the list becomes * partitioned (e.g. end < start). The crypto APIs require the @@ -38,7 +38,6 @@ struct sk_msg_sg { */ struct scatterlist data[MAX_MSG_FRAGS + 2]; }; -static_assert(BITS_PER_LONG >= NR_MSG_FRAG_IDS); /* UAPI in filter.c depends on struct sk_msg_sg being first element. */ struct sk_msg { @@ -234,7 +233,7 @@ static inline void sk_msg_compute_data_pointers(struct sk_msg *msg) { struct scatterlist *sge = sk_msg_elem(msg, msg->sg.start); - if (test_bit(msg->sg.start, &msg->sg.copy)) { + if (test_bit(msg->sg.start, msg->sg.copy)) { msg->data = NULL; msg->data_end = NULL; } else { @@ -253,7 +252,7 @@ static inline void sk_msg_page_add(struct sk_msg *msg, struct page *page, sg_set_page(sge, page, len, offset); sg_unmark_end(sge); - __set_bit(msg->sg.end, &msg->sg.copy); + __set_bit(msg->sg.end, msg->sg.copy); msg->sg.size += len; sk_msg_iter_next(msg, end); } @@ -262,9 +261,9 @@ static inline void sk_msg_sg_copy(struct sk_msg *msg, u32 i, bool copy_state) { do { if (copy_state) - __set_bit(i, &msg->sg.copy); + __set_bit(i, msg->sg.copy); else - __clear_bit(i, &msg->sg.copy); + __clear_bit(i, msg->sg.copy); sk_msg_iter_var_next(i); if (i == msg->sg.end) break; diff --git a/net/core/filter.c b/net/core/filter.c index 9615ae1ab530..f497ca7a16d2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2603,7 +2603,7 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start, * account for the headroom. */ bytes_sg_total = start - offset + bytes; - if (!test_bit(i, &msg->sg.copy) && bytes_sg_total <= len) + if (!test_bit(i, msg->sg.copy) && bytes_sg_total <= len) goto out; /* At this point we need to linearize multiple scatterlist @@ -2809,7 +2809,7 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, /* Place newly allocated data buffer */ sk_mem_charge(msg->sk, len); msg->sg.size += len; - __clear_bit(new, &msg->sg.copy); + __clear_bit(new, msg->sg.copy); sg_set_page(&msg->sg.data[new], page, len + copy, 0); if (rsge.length) { get_page(sg_page(&rsge)); -- cgit v1.2.3 From 938d3480b92fa5e454b7734294f12a7b75126f09 Mon Sep 17 00:00:00 2001 From: Wang Yufen Date: Fri, 4 Mar 2022 16:11:42 +0800 Subject: bpf, sockmap: Fix memleak in sk_psock_queue_msg If tcp_bpf_sendmsg is running during a tear down operation we may enqueue data on the ingress msg queue while tear down is trying to free it. sk1 (redirect sk2) sk2 ------------------- --------------- tcp_bpf_sendmsg() tcp_bpf_send_verdict() tcp_bpf_sendmsg_redir() bpf_tcp_ingress() sock_map_close() lock_sock() lock_sock() ... blocking sk_psock_stop sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); release_sock(sk); lock_sock() sk_mem_charge() get_page() sk_psock_queue_msg() sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED); drop_sk_msg() release_sock() While drop_sk_msg(), the msg has charged memory form sk by sk_mem_charge and has sg pages need to put. To fix we use sk_msg_free() and then kfee() msg. This issue can cause the following info: WARNING: CPU: 0 PID: 9202 at net/core/stream.c:205 sk_stream_kill_queues+0xc8/0xe0 Call Trace: inet_csk_destroy_sock+0x55/0x110 tcp_rcv_state_process+0xe5f/0xe90 ? sk_filter_trim_cap+0x10d/0x230 ? tcp_v4_do_rcv+0x161/0x250 tcp_v4_do_rcv+0x161/0x250 tcp_v4_rcv+0xc3a/0xce0 ip_protocol_deliver_rcu+0x3d/0x230 ip_local_deliver_finish+0x54/0x60 ip_local_deliver+0xfd/0x110 ? ip_protocol_deliver_rcu+0x230/0x230 ip_rcv+0xd6/0x100 ? ip_local_deliver+0x110/0x110 __netif_receive_skb_one_core+0x85/0xa0 process_backlog+0xa4/0x160 __napi_poll+0x29/0x1b0 net_rx_action+0x287/0x300 __do_softirq+0xff/0x2fc do_softirq+0x79/0x90 WARNING: CPU: 0 PID: 531 at net/ipv4/af_inet.c:154 inet_sock_destruct+0x175/0x1b0 Call Trace: __sk_destruct+0x24/0x1f0 sk_psock_destroy+0x19b/0x1c0 process_one_work+0x1b3/0x3c0 ? process_one_work+0x3c0/0x3c0 worker_thread+0x30/0x350 ? process_one_work+0x3c0/0x3c0 kthread+0xe6/0x110 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork+0x22/0x30 Fixes: 9635720b7c88 ("bpf, sockmap: Fix memleak on ingress msg enqueue") Signed-off-by: Wang Yufen Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20220304081145.2037182-2-wangyufen@huawei.com --- include/linux/skmsg.h | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'include/linux/skmsg.h') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index fdb5375f0562..c5a2d6f50f25 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -304,21 +304,16 @@ static inline void sock_drop(struct sock *sk, struct sk_buff *skb) kfree_skb(skb); } -static inline void drop_sk_msg(struct sk_psock *psock, struct sk_msg *msg) -{ - if (msg->skb) - sock_drop(psock->sk, msg->skb); - kfree(msg); -} - static inline void sk_psock_queue_msg(struct sk_psock *psock, struct sk_msg *msg) { spin_lock_bh(&psock->ingress_lock); if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) list_add_tail(&msg->list, &psock->ingress_msg); - else - drop_sk_msg(psock, msg); + else { + sk_msg_free(psock->sk, msg); + kfree(msg); + } spin_unlock_bh(&psock->ingress_lock); } -- cgit v1.2.3 From d8616ee2affcff37c5d315310da557a694a3303d Mon Sep 17 00:00:00 2001 From: Wang Yufen Date: Tue, 24 May 2022 15:53:11 +0800 Subject: bpf, sockmap: Fix sk->sk_forward_alloc warn_on in sk_stream_kill_queues During TCP sockmap redirect pressure test, the following warning is triggered: WARNING: CPU: 3 PID: 2145 at net/core/stream.c:205 sk_stream_kill_queues+0xbc/0xd0 CPU: 3 PID: 2145 Comm: iperf Kdump: loaded Tainted: G W 5.10.0+ #9 Call Trace: inet_csk_destroy_sock+0x55/0x110 inet_csk_listen_stop+0xbb/0x380 tcp_close+0x41b/0x480 inet_release+0x42/0x80 __sock_release+0x3d/0xa0 sock_close+0x11/0x20 __fput+0x9d/0x240 task_work_run+0x62/0x90 exit_to_user_mode_prepare+0x110/0x120 syscall_exit_to_user_mode+0x27/0x190 entry_SYSCALL_64_after_hwframe+0x44/0xa9 The reason we observed is that: When the listener is closing, a connection may have completed the three-way handshake but not accepted, and the client has sent some packets. The child sks in accept queue release by inet_child_forget()->inet_csk_destroy_sock(), but psocks of child sks have not released. To fix, add sock_map_destroy to release psocks. Signed-off-by: Wang Yufen Signed-off-by: Daniel Borkmann Signed-off-by: Andrii Nakryiko Acked-by: Jakub Sitnicki Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20220524075311.649153-1-wangyufen@huawei.com --- include/linux/bpf.h | 1 + include/linux/skmsg.h | 1 + net/core/skmsg.c | 1 + net/core/sock_map.c | 23 +++++++++++++++++++++++ net/ipv4/tcp_bpf.c | 1 + 5 files changed, 27 insertions(+) (limited to 'include/linux/skmsg.h') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2b914a56a2c5..8e6092d0ea95 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2104,6 +2104,7 @@ int sock_map_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr); void sock_map_unhash(struct sock *sk); +void sock_map_destroy(struct sock *sk); void sock_map_close(struct sock *sk, long timeout); #else static inline int bpf_prog_offload_init(struct bpf_prog *prog, diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index c5a2d6f50f25..153b6dec9b6a 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -95,6 +95,7 @@ struct sk_psock { spinlock_t link_lock; refcount_t refcnt; void (*saved_unhash)(struct sock *sk); + void (*saved_destroy)(struct sock *sk); void (*saved_close)(struct sock *sk, long timeout); void (*saved_write_space)(struct sock *sk); void (*saved_data_ready)(struct sock *sk); diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 22b983ade0e7..7e03f96e441b 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -715,6 +715,7 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node) psock->eval = __SK_NONE; psock->sk_proto = prot; psock->saved_unhash = prot->unhash; + psock->saved_destroy = prot->destroy; psock->saved_close = prot->close; psock->saved_write_space = sk->sk_write_space; diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 81d4b4756a02..9f08ccfaf6da 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -1561,6 +1561,29 @@ void sock_map_unhash(struct sock *sk) } EXPORT_SYMBOL_GPL(sock_map_unhash); +void sock_map_destroy(struct sock *sk) +{ + void (*saved_destroy)(struct sock *sk); + struct sk_psock *psock; + + rcu_read_lock(); + psock = sk_psock_get(sk); + if (unlikely(!psock)) { + rcu_read_unlock(); + if (sk->sk_prot->destroy) + sk->sk_prot->destroy(sk); + return; + } + + saved_destroy = psock->saved_destroy; + sock_map_remove_links(sk, psock); + rcu_read_unlock(); + sk_psock_stop(psock, true); + sk_psock_put(sk, psock); + saved_destroy(sk); +} +EXPORT_SYMBOL_GPL(sock_map_destroy); + void sock_map_close(struct sock *sk, long timeout) { void (*saved_close)(struct sock *sk, long timeout); diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index be3947e70fec..38550bb1b90b 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -540,6 +540,7 @@ static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS], struct proto *base) { prot[TCP_BPF_BASE] = *base; + prot[TCP_BPF_BASE].destroy = sock_map_destroy; prot[TCP_BPF_BASE].close = sock_map_close; prot[TCP_BPF_BASE].recvmsg = tcp_bpf_recvmsg; prot[TCP_BPF_BASE].sock_is_readable = sk_msg_is_readable; -- cgit v1.2.3 From 2a0133723f9ebeb751cfce19f74ec07e108bef1f Mon Sep 17 00:00:00 2001 From: Hawkins Jiawei Date: Fri, 5 Aug 2022 15:48:34 +0800 Subject: net: fix refcount bug in sk_psock_get (2) Syzkaller reports refcount bug as follows: ------------[ cut here ]------------ refcount_t: saturated; leaking memory. WARNING: CPU: 1 PID: 3605 at lib/refcount.c:19 refcount_warn_saturate+0xf4/0x1e0 lib/refcount.c:19 Modules linked in: CPU: 1 PID: 3605 Comm: syz-executor208 Not tainted 5.18.0-syzkaller-03023-g7e062cda7d90 #0 __refcount_add_not_zero include/linux/refcount.h:163 [inline] __refcount_inc_not_zero include/linux/refcount.h:227 [inline] refcount_inc_not_zero include/linux/refcount.h:245 [inline] sk_psock_get+0x3bc/0x410 include/linux/skmsg.h:439 tls_data_ready+0x6d/0x1b0 net/tls/tls_sw.c:2091 tcp_data_ready+0x106/0x520 net/ipv4/tcp_input.c:4983 tcp_data_queue+0x25f2/0x4c90 net/ipv4/tcp_input.c:5057 tcp_rcv_state_process+0x1774/0x4e80 net/ipv4/tcp_input.c:6659 tcp_v4_do_rcv+0x339/0x980 net/ipv4/tcp_ipv4.c:1682 sk_backlog_rcv include/net/sock.h:1061 [inline] __release_sock+0x134/0x3b0 net/core/sock.c:2849 release_sock+0x54/0x1b0 net/core/sock.c:3404 inet_shutdown+0x1e0/0x430 net/ipv4/af_inet.c:909 __sys_shutdown_sock net/socket.c:2331 [inline] __sys_shutdown_sock net/socket.c:2325 [inline] __sys_shutdown+0xf1/0x1b0 net/socket.c:2343 __do_sys_shutdown net/socket.c:2351 [inline] __se_sys_shutdown net/socket.c:2349 [inline] __x64_sys_shutdown+0x50/0x70 net/socket.c:2349 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x46/0xb0 During SMC fallback process in connect syscall, kernel will replaces TCP with SMC. In order to forward wakeup smc socket waitqueue after fallback, kernel will sets clcsk->sk_user_data to origin smc socket in smc_fback_replace_callbacks(). Later, in shutdown syscall, kernel will calls sk_psock_get(), which treats the clcsk->sk_user_data as psock type, triggering the refcnt warning. So, the root cause is that smc and psock, both will use sk_user_data field. So they will mismatch this field easily. This patch solves it by using another bit(defined as SK_USER_DATA_PSOCK) in PTRMASK, to mark whether sk_user_data points to a psock object or not. This patch depends on a PTRMASK introduced in commit f1ff5ce2cd5e ("net, sk_msg: Clear sk_user_data pointer on clone if tagged"). For there will possibly be more flags in the sk_user_data field, this patch also refactor sk_user_data flags code to be more generic to improve its maintainability. Reported-and-tested-by: syzbot+5f26f85569bd179c18ce@syzkaller.appspotmail.com Suggested-by: Jakub Kicinski Acked-by: Wen Gu Signed-off-by: Hawkins Jiawei Reviewed-by: Jakub Sitnicki Signed-off-by: Jakub Kicinski --- include/linux/skmsg.h | 3 ++- include/net/sock.h | 68 ++++++++++++++++++++++++++++++++++++--------------- net/core/skmsg.c | 4 ++- 3 files changed, 53 insertions(+), 22 deletions(-) (limited to 'include/linux/skmsg.h') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 153b6dec9b6a..48f4b645193b 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -278,7 +278,8 @@ static inline void sk_msg_sg_copy_clear(struct sk_msg *msg, u32 start) static inline struct sk_psock *sk_psock(const struct sock *sk) { - return rcu_dereference_sk_user_data(sk); + return __rcu_dereference_sk_user_data_with_flags(sk, + SK_USER_DATA_PSOCK); } static inline void sk_psock_set_state(struct sk_psock *psock, diff --git a/include/net/sock.h b/include/net/sock.h index a7273b289188..05a1bbdf5805 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -545,14 +545,26 @@ enum sk_pacing { SK_PACING_FQ = 2, }; -/* Pointer stored in sk_user_data might not be suitable for copying - * when cloning the socket. For instance, it can point to a reference - * counted object. sk_user_data bottom bit is set if pointer must not - * be copied. +/* flag bits in sk_user_data + * + * - SK_USER_DATA_NOCOPY: Pointer stored in sk_user_data might + * not be suitable for copying when cloning the socket. For instance, + * it can point to a reference counted object. sk_user_data bottom + * bit is set if pointer must not be copied. + * + * - SK_USER_DATA_BPF: Mark whether sk_user_data field is + * managed/owned by a BPF reuseport array. This bit should be set + * when sk_user_data's sk is added to the bpf's reuseport_array. + * + * - SK_USER_DATA_PSOCK: Mark whether pointer stored in + * sk_user_data points to psock type. This bit should be set + * when sk_user_data is assigned to a psock object. */ #define SK_USER_DATA_NOCOPY 1UL -#define SK_USER_DATA_BPF 2UL /* Managed by BPF */ -#define SK_USER_DATA_PTRMASK ~(SK_USER_DATA_NOCOPY | SK_USER_DATA_BPF) +#define SK_USER_DATA_BPF 2UL +#define SK_USER_DATA_PSOCK 4UL +#define SK_USER_DATA_PTRMASK ~(SK_USER_DATA_NOCOPY | SK_USER_DATA_BPF |\ + SK_USER_DATA_PSOCK) /** * sk_user_data_is_nocopy - Test if sk_user_data pointer must not be copied @@ -565,24 +577,40 @@ static inline bool sk_user_data_is_nocopy(const struct sock *sk) #define __sk_user_data(sk) ((*((void __rcu **)&(sk)->sk_user_data))) +/** + * __rcu_dereference_sk_user_data_with_flags - return the pointer + * only if argument flags all has been set in sk_user_data. Otherwise + * return NULL + * + * @sk: socket + * @flags: flag bits + */ +static inline void * +__rcu_dereference_sk_user_data_with_flags(const struct sock *sk, + uintptr_t flags) +{ + uintptr_t sk_user_data = (uintptr_t)rcu_dereference(__sk_user_data(sk)); + + WARN_ON_ONCE(flags & SK_USER_DATA_PTRMASK); + + if ((sk_user_data & flags) == flags) + return (void *)(sk_user_data & SK_USER_DATA_PTRMASK); + return NULL; +} + #define rcu_dereference_sk_user_data(sk) \ + __rcu_dereference_sk_user_data_with_flags(sk, 0) +#define __rcu_assign_sk_user_data_with_flags(sk, ptr, flags) \ ({ \ - void *__tmp = rcu_dereference(__sk_user_data((sk))); \ - (void *)((uintptr_t)__tmp & SK_USER_DATA_PTRMASK); \ -}) -#define rcu_assign_sk_user_data(sk, ptr) \ -({ \ - uintptr_t __tmp = (uintptr_t)(ptr); \ - WARN_ON_ONCE(__tmp & ~SK_USER_DATA_PTRMASK); \ - rcu_assign_pointer(__sk_user_data((sk)), __tmp); \ -}) -#define rcu_assign_sk_user_data_nocopy(sk, ptr) \ -({ \ - uintptr_t __tmp = (uintptr_t)(ptr); \ - WARN_ON_ONCE(__tmp & ~SK_USER_DATA_PTRMASK); \ + uintptr_t __tmp1 = (uintptr_t)(ptr), \ + __tmp2 = (uintptr_t)(flags); \ + WARN_ON_ONCE(__tmp1 & ~SK_USER_DATA_PTRMASK); \ + WARN_ON_ONCE(__tmp2 & SK_USER_DATA_PTRMASK); \ rcu_assign_pointer(__sk_user_data((sk)), \ - __tmp | SK_USER_DATA_NOCOPY); \ + __tmp1 | __tmp2); \ }) +#define rcu_assign_sk_user_data(sk, ptr) \ + __rcu_assign_sk_user_data_with_flags(sk, ptr, 0) static inline struct net *sock_net(const struct sock *sk) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 81627892bdd4..57e942a6431a 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -739,7 +739,9 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node) sk_psock_set_state(psock, SK_PSOCK_TX_ENABLED); refcount_set(&psock->refcnt, 1); - rcu_assign_sk_user_data_nocopy(sk, psock); + __rcu_assign_sk_user_data_with_flags(sk, psock, + SK_USER_DATA_NOCOPY | + SK_USER_DATA_PSOCK); sock_hold(sk); out: -- cgit v1.2.3 From 8bbabb3fddcd0f858be69ed5abc9b470a239d6f2 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 1 Nov 2022 21:34:17 -0700 Subject: bpf, sock_map: Move cancel_work_sync() out of sock lock Stanislav reported a lockdep warning, which is caused by the cancel_work_sync() called inside sock_map_close(), as analyzed below by Jakub: psock->work.func = sk_psock_backlog() ACQUIRE psock->work_mutex sk_psock_handle_skb() skb_send_sock() __skb_send_sock() sendpage_unlocked() kernel_sendpage() sock->ops->sendpage = inet_sendpage() sk->sk_prot->sendpage = tcp_sendpage() ACQUIRE sk->sk_lock tcp_sendpage_locked() RELEASE sk->sk_lock RELEASE psock->work_mutex sock_map_close() ACQUIRE sk->sk_lock sk_psock_stop() sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED) cancel_work_sync() __cancel_work_timer() __flush_work() // wait for psock->work to finish RELEASE sk->sk_lock We can move the cancel_work_sync() out of the sock lock protection, but still before saved_close() was called. Fixes: 799aa7f98d53 ("skmsg: Avoid lock_sock() in sk_psock_backlog()") Reported-by: Stanislav Fomichev Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Tested-by: Jakub Sitnicki Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20221102043417.279409-1-xiyou.wangcong@gmail.com --- include/linux/skmsg.h | 2 +- net/core/skmsg.c | 7 ++----- net/core/sock_map.c | 7 ++++--- 3 files changed, 7 insertions(+), 9 deletions(-) (limited to 'include/linux/skmsg.h') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 48f4b645193b..70d6cb94e580 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -376,7 +376,7 @@ static inline void sk_psock_report_error(struct sk_psock *psock, int err) } struct sk_psock *sk_psock_init(struct sock *sk, int node); -void sk_psock_stop(struct sk_psock *psock, bool wait); +void sk_psock_stop(struct sk_psock *psock); #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock); diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 1efdc47a999b..e6b9ced3eda8 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -803,16 +803,13 @@ static void sk_psock_link_destroy(struct sk_psock *psock) } } -void sk_psock_stop(struct sk_psock *psock, bool wait) +void sk_psock_stop(struct sk_psock *psock) { spin_lock_bh(&psock->ingress_lock); sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); sk_psock_cork_free(psock); __sk_psock_zap_ingress(psock); spin_unlock_bh(&psock->ingress_lock); - - if (wait) - cancel_work_sync(&psock->work); } static void sk_psock_done_strp(struct sk_psock *psock); @@ -850,7 +847,7 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock) sk_psock_stop_verdict(sk, psock); write_unlock_bh(&sk->sk_callback_lock); - sk_psock_stop(psock, false); + sk_psock_stop(psock); INIT_RCU_WORK(&psock->rwork, sk_psock_destroy); queue_rcu_work(system_wq, &psock->rwork); diff --git a/net/core/sock_map.c b/net/core/sock_map.c index a660baedd9e7..81beb16ab1eb 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -1596,7 +1596,7 @@ void sock_map_destroy(struct sock *sk) saved_destroy = psock->saved_destroy; sock_map_remove_links(sk, psock); rcu_read_unlock(); - sk_psock_stop(psock, false); + sk_psock_stop(psock); sk_psock_put(sk, psock); saved_destroy(sk); } @@ -1619,9 +1619,10 @@ void sock_map_close(struct sock *sk, long timeout) saved_close = psock->saved_close; sock_map_remove_links(sk, psock); rcu_read_unlock(); - sk_psock_stop(psock, true); - sk_psock_put(sk, psock); + sk_psock_stop(psock); release_sock(sk); + cancel_work_sync(&psock->work); + sk_psock_put(sk, psock); saved_close(sk, timeout); } EXPORT_SYMBOL_GPL(sock_map_close); -- cgit v1.2.3