From 3e4bc23926b83c3c67e5f61ae8571602754131a6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 8 Sep 2023 18:13:59 +0000 Subject: xfrm: fix a data-race in xfrm_gen_index() xfrm_gen_index() mutual exclusion uses net->xfrm.xfrm_policy_lock. This means we must use a per-netns idx_generator variable, instead of a static one. Alternative would be to use an atomic variable. syzbot reported: BUG: KCSAN: data-race in xfrm_sk_policy_insert / xfrm_sk_policy_insert write to 0xffffffff87005938 of 4 bytes by task 29466 on cpu 0: xfrm_gen_index net/xfrm/xfrm_policy.c:1385 [inline] xfrm_sk_policy_insert+0x262/0x640 net/xfrm/xfrm_policy.c:2347 xfrm_user_policy+0x413/0x540 net/xfrm/xfrm_state.c:2639 do_ipv6_setsockopt+0x1317/0x2ce0 net/ipv6/ipv6_sockglue.c:943 ipv6_setsockopt+0x57/0x130 net/ipv6/ipv6_sockglue.c:1012 rawv6_setsockopt+0x21e/0x410 net/ipv6/raw.c:1054 sock_common_setsockopt+0x61/0x70 net/core/sock.c:3697 __sys_setsockopt+0x1c9/0x230 net/socket.c:2263 __do_sys_setsockopt net/socket.c:2274 [inline] __se_sys_setsockopt net/socket.c:2271 [inline] __x64_sys_setsockopt+0x66/0x80 net/socket.c:2271 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd read to 0xffffffff87005938 of 4 bytes by task 29460 on cpu 1: xfrm_sk_policy_insert+0x13e/0x640 xfrm_user_policy+0x413/0x540 net/xfrm/xfrm_state.c:2639 do_ipv6_setsockopt+0x1317/0x2ce0 net/ipv6/ipv6_sockglue.c:943 ipv6_setsockopt+0x57/0x130 net/ipv6/ipv6_sockglue.c:1012 rawv6_setsockopt+0x21e/0x410 net/ipv6/raw.c:1054 sock_common_setsockopt+0x61/0x70 net/core/sock.c:3697 __sys_setsockopt+0x1c9/0x230 net/socket.c:2263 __do_sys_setsockopt net/socket.c:2274 [inline] __se_sys_setsockopt net/socket.c:2271 [inline] __x64_sys_setsockopt+0x66/0x80 net/socket.c:2271 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd value changed: 0x00006ad8 -> 0x00006b18 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 29460 Comm: syz-executor.1 Not tainted 6.5.0-rc5-syzkaller-00243-g9106536c1aa3 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 07/26/2023 Fixes: 1121994c803f ("netns xfrm: policy insertion in netns") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: Steffen Klassert Cc: Herbert Xu Acked-by: Herbert Xu Signed-off-by: Steffen Klassert --- include/net/netns/xfrm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h index bd7c3be4af5d..423b52eca908 100644 --- a/include/net/netns/xfrm.h +++ b/include/net/netns/xfrm.h @@ -50,6 +50,7 @@ struct netns_xfrm { struct list_head policy_all; struct hlist_head *policy_byidx; unsigned int policy_idx_hmask; + unsigned int idx_generator; struct hlist_head policy_inexact[XFRM_POLICY_MAX]; struct xfrm_policy_hash policy_bydst[XFRM_POLICY_MAX]; unsigned int policy_count[XFRM_POLICY_MAX * 2]; -- cgit v1.2.3 From 419ce133ab928ab5efd7b50b2ef36ddfd4eadbd2 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 11 Oct 2023 09:20:55 +0200 Subject: tcp: allow again tcp_disconnect() when threads are waiting As reported by Tom, .NET and applications build on top of it rely on connect(AF_UNSPEC) to async cancel pending I/O operations on TCP socket. The blamed commit below caused a regression, as such cancellation can now fail. As suggested by Eric, this change addresses the problem explicitly causing blocking I/O operation to terminate immediately (with an error) when a concurrent disconnect() is executed. Instead of tracking the number of threads blocked on a given socket, track the number of disconnect() issued on such socket. If such counter changes after a blocking operation releasing and re-acquiring the socket lock, error out the current operation. Fixes: 4faeee0cf8a5 ("tcp: deny tcp_disconnect() when threads are waiting") Reported-by: Tom Deseyn Closes: https://bugzilla.redhat.com/show_bug.cgi?id=1886305 Suggested-by: Eric Dumazet Signed-off-by: Paolo Abeni Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/f3b95e47e3dbed840960548aebaa8d954372db41.1697008693.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- .../chelsio/inline_crypto/chtls/chtls_io.c | 36 +++++++++++++++++----- include/net/sock.h | 10 +++--- net/core/stream.c | 12 +++++--- net/ipv4/af_inet.c | 10 ++++-- net/ipv4/inet_connection_sock.c | 1 - net/ipv4/tcp.c | 16 +++++----- net/ipv4/tcp_bpf.c | 4 +++ net/mptcp/protocol.c | 7 ----- net/tls/tls_main.c | 10 ++++-- net/tls/tls_sw.c | 19 ++++++++---- 10 files changed, 80 insertions(+), 45 deletions(-) (limited to 'include/net') diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c index 5fc64e47568a..d567e42e1760 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c @@ -911,7 +911,7 @@ static int csk_wait_memory(struct chtls_dev *cdev, struct sock *sk, long *timeo_p) { DEFINE_WAIT_FUNC(wait, woken_wake_function); - int err = 0; + int ret, err = 0; long current_timeo; long vm_wait = 0; bool noblock; @@ -942,10 +942,13 @@ static int csk_wait_memory(struct chtls_dev *cdev, set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); sk->sk_write_pending++; - sk_wait_event(sk, ¤t_timeo, sk->sk_err || - (sk->sk_shutdown & SEND_SHUTDOWN) || - (csk_mem_free(cdev, sk) && !vm_wait), &wait); + ret = sk_wait_event(sk, ¤t_timeo, sk->sk_err || + (sk->sk_shutdown & SEND_SHUTDOWN) || + (csk_mem_free(cdev, sk) && !vm_wait), + &wait); sk->sk_write_pending--; + if (ret < 0) + goto do_error; if (vm_wait) { vm_wait -= current_timeo; @@ -1348,6 +1351,7 @@ static int chtls_pt_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int copied = 0; int target; long timeo; + int ret; buffers_freed = 0; @@ -1423,7 +1427,11 @@ static int chtls_pt_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, if (copied >= target) break; chtls_cleanup_rbuf(sk, copied); - sk_wait_data(sk, &timeo, NULL); + ret = sk_wait_data(sk, &timeo, NULL); + if (ret < 0) { + copied = copied ? : ret; + goto unlock; + } continue; found_ok_skb: if (!skb->len) { @@ -1518,6 +1526,8 @@ skip_copy: if (buffers_freed) chtls_cleanup_rbuf(sk, copied); + +unlock: release_sock(sk); return copied; } @@ -1534,6 +1544,7 @@ static int peekmsg(struct sock *sk, struct msghdr *msg, int copied = 0; size_t avail; /* amount of available data in current skb */ long timeo; + int ret; lock_sock(sk); timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); @@ -1585,7 +1596,12 @@ static int peekmsg(struct sock *sk, struct msghdr *msg, release_sock(sk); lock_sock(sk); } else { - sk_wait_data(sk, &timeo, NULL); + ret = sk_wait_data(sk, &timeo, NULL); + if (ret < 0) { + /* here 'copied' is 0 due to previous checks */ + copied = ret; + break; + } } if (unlikely(peek_seq != tp->copied_seq)) { @@ -1656,6 +1672,7 @@ int chtls_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int copied = 0; long timeo; int target; /* Read at least this many bytes */ + int ret; buffers_freed = 0; @@ -1747,7 +1764,11 @@ int chtls_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, if (copied >= target) break; chtls_cleanup_rbuf(sk, copied); - sk_wait_data(sk, &timeo, NULL); + ret = sk_wait_data(sk, &timeo, NULL); + if (ret < 0) { + copied = copied ? : ret; + goto unlock; + } continue; found_ok_skb: @@ -1816,6 +1837,7 @@ skip_copy: if (buffers_freed) chtls_cleanup_rbuf(sk, copied); +unlock: release_sock(sk); return copied; } diff --git a/include/net/sock.h b/include/net/sock.h index b770261fbdaf..92f7ea62a915 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -336,7 +336,7 @@ struct sk_filter; * @sk_cgrp_data: cgroup data for this cgroup * @sk_memcg: this socket's memory cgroup association * @sk_write_pending: a write to stream socket waits to start - * @sk_wait_pending: number of threads blocked on this socket + * @sk_disconnects: number of disconnect operations performed on this sock * @sk_state_change: callback to indicate change in the state of the sock * @sk_data_ready: callback to indicate there is data to be processed * @sk_write_space: callback to indicate there is bf sending space available @@ -429,7 +429,7 @@ struct sock { unsigned int sk_napi_id; #endif int sk_rcvbuf; - int sk_wait_pending; + int sk_disconnects; struct sk_filter __rcu *sk_filter; union { @@ -1189,8 +1189,7 @@ static inline void sock_rps_reset_rxhash(struct sock *sk) } #define sk_wait_event(__sk, __timeo, __condition, __wait) \ - ({ int __rc; \ - __sk->sk_wait_pending++; \ + ({ int __rc, __dis = __sk->sk_disconnects; \ release_sock(__sk); \ __rc = __condition; \ if (!__rc) { \ @@ -1200,8 +1199,7 @@ static inline void sock_rps_reset_rxhash(struct sock *sk) } \ sched_annotate_sleep(); \ lock_sock(__sk); \ - __sk->sk_wait_pending--; \ - __rc = __condition; \ + __rc = __dis == __sk->sk_disconnects ? __condition : -EPIPE; \ __rc; \ }) diff --git a/net/core/stream.c b/net/core/stream.c index f5c4e47df165..96fbcb9bbb30 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -117,7 +117,7 @@ EXPORT_SYMBOL(sk_stream_wait_close); */ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) { - int err = 0; + int ret, err = 0; long vm_wait = 0; long current_timeo = *timeo_p; DEFINE_WAIT_FUNC(wait, woken_wake_function); @@ -142,11 +142,13 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); sk->sk_write_pending++; - sk_wait_event(sk, ¤t_timeo, READ_ONCE(sk->sk_err) || - (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) || - (sk_stream_memory_free(sk) && - !vm_wait), &wait); + ret = sk_wait_event(sk, ¤t_timeo, READ_ONCE(sk->sk_err) || + (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) || + (sk_stream_memory_free(sk) && !vm_wait), + &wait); sk->sk_write_pending--; + if (ret < 0) + goto do_error; if (vm_wait) { vm_wait -= current_timeo; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 3d2e30e20473..2713c9b06c4c 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -597,7 +597,6 @@ static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias) add_wait_queue(sk_sleep(sk), &wait); sk->sk_write_pending += writebias; - sk->sk_wait_pending++; /* Basic assumption: if someone sets sk->sk_err, he _must_ * change state of the socket from TCP_SYN_*. @@ -613,7 +612,6 @@ static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias) } remove_wait_queue(sk_sleep(sk), &wait); sk->sk_write_pending -= writebias; - sk->sk_wait_pending--; return timeo; } @@ -642,6 +640,7 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, return -EINVAL; if (uaddr->sa_family == AF_UNSPEC) { + sk->sk_disconnects++; err = sk->sk_prot->disconnect(sk, flags); sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; goto out; @@ -696,6 +695,7 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, int writebias = (sk->sk_protocol == IPPROTO_TCP) && tcp_sk(sk)->fastopen_req && tcp_sk(sk)->fastopen_req->data ? 1 : 0; + int dis = sk->sk_disconnects; /* Error code is set above */ if (!timeo || !inet_wait_for_connect(sk, timeo, writebias)) @@ -704,6 +704,11 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, err = sock_intr_errno(timeo); if (signal_pending(current)) goto out; + + if (dis != sk->sk_disconnects) { + err = -EPIPE; + goto out; + } } /* Connection was closed by RST, timeout, ICMP error @@ -725,6 +730,7 @@ out: sock_error: err = sock_error(sk) ? : -ECONNABORTED; sock->state = SS_UNCONNECTED; + sk->sk_disconnects++; if (sk->sk_prot->disconnect(sk, flags)) sock->state = SS_DISCONNECTING; goto out; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index aeebe8816689..394a498c2823 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -1145,7 +1145,6 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, if (newsk) { struct inet_connection_sock *newicsk = inet_csk(newsk); - newsk->sk_wait_pending = 0; inet_sk_set_state(newsk, TCP_SYN_RECV); newicsk->icsk_bind_hash = NULL; newicsk->icsk_bind2_hash = NULL; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 3f66cdeef7de..d3456cf840de 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -831,7 +831,9 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, */ if (!skb_queue_empty(&sk->sk_receive_queue)) break; - sk_wait_data(sk, &timeo, NULL); + ret = sk_wait_data(sk, &timeo, NULL); + if (ret < 0) + break; if (signal_pending(current)) { ret = sock_intr_errno(timeo); break; @@ -2442,7 +2444,11 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len, __sk_flush_backlog(sk); } else { tcp_cleanup_rbuf(sk, copied); - sk_wait_data(sk, &timeo, last); + err = sk_wait_data(sk, &timeo, last); + if (err < 0) { + err = copied ? : err; + goto out; + } } if ((flags & MSG_PEEK) && @@ -2966,12 +2972,6 @@ int tcp_disconnect(struct sock *sk, int flags) int old_state = sk->sk_state; u32 seq; - /* Deny disconnect if other threads are blocked in sk_wait_event() - * or inet_wait_for_connect(). - */ - if (sk->sk_wait_pending) - return -EBUSY; - if (old_state != TCP_CLOSE) tcp_set_state(sk, TCP_CLOSE); diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 327268203001..ba2e92188124 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -307,6 +307,8 @@ msg_bytes_ready: } data = tcp_msg_wait_data(sk, psock, timeo); + if (data < 0) + return data; if (data && !sk_psock_queue_empty(psock)) goto msg_bytes_ready; copied = -EAGAIN; @@ -351,6 +353,8 @@ msg_bytes_ready: timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); data = tcp_msg_wait_data(sk, psock, timeo); + if (data < 0) + return data; if (data) { if (!sk_psock_queue_empty(psock)) goto msg_bytes_ready; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index c3b83cb390d9..d1902373c974 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3098,12 +3098,6 @@ static int mptcp_disconnect(struct sock *sk, int flags) { struct mptcp_sock *msk = mptcp_sk(sk); - /* Deny disconnect if other threads are blocked in sk_wait_event() - * or inet_wait_for_connect(). - */ - if (sk->sk_wait_pending) - return -EBUSY; - /* We are on the fastopen error path. We can't call straight into the * subflows cleanup code due to lock nesting (we are already under * msk->firstsocket lock). @@ -3173,7 +3167,6 @@ struct sock *mptcp_sk_clone_init(const struct sock *sk, inet_sk(nsk)->pinet6 = mptcp_inet6_sk(nsk); #endif - nsk->sk_wait_pending = 0; __mptcp_init_sock(nsk); msk = mptcp_sk(nsk); diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 02f583ff9239..002483e60c19 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -139,8 +139,8 @@ void update_sk_prot(struct sock *sk, struct tls_context *ctx) int wait_on_pending_writer(struct sock *sk, long *timeo) { - int rc = 0; DEFINE_WAIT_FUNC(wait, woken_wake_function); + int ret, rc = 0; add_wait_queue(sk_sleep(sk), &wait); while (1) { @@ -154,9 +154,13 @@ int wait_on_pending_writer(struct sock *sk, long *timeo) break; } - if (sk_wait_event(sk, timeo, - !READ_ONCE(sk->sk_write_pending), &wait)) + ret = sk_wait_event(sk, timeo, + !READ_ONCE(sk->sk_write_pending), &wait); + if (ret) { + if (ret < 0) + rc = ret; break; + } } remove_wait_queue(sk_sleep(sk), &wait); return rc; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index d1fc295b83b5..e9d1e83a859d 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1291,6 +1291,7 @@ tls_rx_rec_wait(struct sock *sk, struct sk_psock *psock, bool nonblock, struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); DEFINE_WAIT_FUNC(wait, woken_wake_function); + int ret = 0; long timeo; timeo = sock_rcvtimeo(sk, nonblock); @@ -1302,6 +1303,9 @@ tls_rx_rec_wait(struct sock *sk, struct sk_psock *psock, bool nonblock, if (sk->sk_err) return sock_error(sk); + if (ret < 0) + return ret; + if (!skb_queue_empty(&sk->sk_receive_queue)) { tls_strp_check_rcv(&ctx->strp); if (tls_strp_msg_ready(ctx)) @@ -1320,10 +1324,10 @@ tls_rx_rec_wait(struct sock *sk, struct sk_psock *psock, bool nonblock, released = true; add_wait_queue(sk_sleep(sk), &wait); sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); - sk_wait_event(sk, &timeo, - tls_strp_msg_ready(ctx) || - !sk_psock_queue_empty(psock), - &wait); + ret = sk_wait_event(sk, &timeo, + tls_strp_msg_ready(ctx) || + !sk_psock_queue_empty(psock), + &wait); sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); remove_wait_queue(sk_sleep(sk), &wait); @@ -1852,6 +1856,7 @@ static int tls_rx_reader_acquire(struct sock *sk, struct tls_sw_context_rx *ctx, bool nonblock) { long timeo; + int ret; timeo = sock_rcvtimeo(sk, nonblock); @@ -1861,14 +1866,16 @@ static int tls_rx_reader_acquire(struct sock *sk, struct tls_sw_context_rx *ctx, ctx->reader_contended = 1; add_wait_queue(&ctx->wq, &wait); - sk_wait_event(sk, &timeo, - !READ_ONCE(ctx->reader_present), &wait); + ret = sk_wait_event(sk, &timeo, + !READ_ONCE(ctx->reader_present), &wait); remove_wait_queue(&ctx->wq, &wait); if (timeo <= 0) return -EAGAIN; if (signal_pending(current)) return sock_intr_errno(timeo); + if (ret < 0) + return ret; } WRITE_ONCE(ctx->reader_present, 1); -- cgit v1.2.3 From cb3871b1cd135a6662b732fbc6b3db4afcdb4a64 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 11 Oct 2023 09:31:44 -0700 Subject: Bluetooth: hci_sock: Correctly bounds check and pad HCI_MON_NEW_INDEX name The code pattern of memcpy(dst, src, strlen(src)) is almost always wrong. In this case it is wrong because it leaves memory uninitialized if it is less than sizeof(ni->name), and overflows ni->name when longer. Normally strtomem_pad() could be used here, but since ni->name is a trailing array in struct hci_mon_new_index, compilers that don't support -fstrict-flex-arrays=3 can't tell how large this array is via __builtin_object_size(). Instead, open-code the helper and use sizeof() since it will work correctly. Additionally mark ni->name as __nonstring since it appears to not be a %NUL terminated C string. Cc: Luiz Augusto von Dentz Cc: Edward AD Cc: Marcel Holtmann Cc: Johan Hedberg Cc: "David S. Miller" Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: linux-bluetooth@vger.kernel.org Cc: netdev@vger.kernel.org Fixes: 18f547f3fc07 ("Bluetooth: hci_sock: fix slab oob read in create_monitor_event") Link: https://lore.kernel.org/lkml/202310110908.F2639D3276@keescook/ Signed-off-by: Kees Cook Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_mon.h | 2 +- net/bluetooth/hci_sock.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci_mon.h b/include/net/bluetooth/hci_mon.h index 2d5fcda1bcd0..082f89531b88 100644 --- a/include/net/bluetooth/hci_mon.h +++ b/include/net/bluetooth/hci_mon.h @@ -56,7 +56,7 @@ struct hci_mon_new_index { __u8 type; __u8 bus; bdaddr_t bdaddr; - char name[8]; + char name[8] __nonstring; } __packed; #define HCI_MON_NEW_INDEX_SIZE 16 diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 72abe54c45dd..3e7cd330d731 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -488,7 +488,8 @@ static struct sk_buff *create_monitor_event(struct hci_dev *hdev, int event) ni->type = hdev->dev_type; ni->bus = hdev->bus; bacpy(&ni->bdaddr, &hdev->bdaddr); - memcpy(ni->name, hdev->name, strlen(hdev->name)); + memcpy_and_pad(ni->name, sizeof(ni->name), hdev->name, + strnlen(hdev->name, sizeof(ni->name)), '\0'); opcode = cpu_to_le16(HCI_MON_NEW_INDEX); break; -- cgit v1.2.3 From 1c2709cfff1dedbb9591e989e2f001484208d914 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Sun, 15 Oct 2023 13:47:00 -0400 Subject: tcp: fix excessive TLP and RACK timeouts from HZ rounding We discovered from packet traces of slow loss recovery on kernels with the default HZ=250 setting (and min_rtt < 1ms) that after reordering, when receiving a SACKed sequence range, the RACK reordering timer was firing after about 16ms rather than the desired value of roughly min_rtt/4 + 2ms. The problem is largely due to the RACK reorder timer calculation adding in TCP_TIMEOUT_MIN, which is 2 jiffies. On kernels with HZ=250, this is 2*4ms = 8ms. The TLP timer calculation has the exact same issue. This commit fixes the TLP transmit timer and RACK reordering timer floor calculation to more closely match the intended 2ms floor even on kernels with HZ=250. It does this by adding in a new TCP_TIMEOUT_MIN_US floor of 2000 us and then converting to jiffies, instead of the current approach of converting to jiffies and then adding th TCP_TIMEOUT_MIN value of 2 jiffies. Our testing has verified that on kernels with HZ=1000, as expected, this does not produce significant changes in behavior, but on kernels with the default HZ=250 the latency improvement can be large. For example, our tests show that for HZ=250 kernels at low RTTs this fix roughly halves the latency for the RACK reorder timer: instead of mostly firing at 16ms it mostly fires at 8ms. Suggested-by: Eric Dumazet Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Fixes: bb4d991a28cc ("tcp: adjust tail loss probe timeout") Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20231015174700.2206872-1-ncardwell.sw@gmail.com Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 3 +++ net/ipv4/tcp_output.c | 9 +++++---- net/ipv4/tcp_recovery.c | 2 +- 3 files changed, 9 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 7b1a720691ae..4b03ca7cb8a5 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -141,6 +141,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); #define TCP_RTO_MAX ((unsigned)(120*HZ)) #define TCP_RTO_MIN ((unsigned)(HZ/5)) #define TCP_TIMEOUT_MIN (2U) /* Min timeout for TCP timers in jiffies */ + +#define TCP_TIMEOUT_MIN_US (2*USEC_PER_MSEC) /* Min TCP timeout in microsecs */ + #define TCP_TIMEOUT_INIT ((unsigned)(1*HZ)) /* RFC6298 2.1 initial RTO value */ #define TCP_TIMEOUT_FALLBACK ((unsigned)(3*HZ)) /* RFC 1122 initial RTO value, now * used as a fallback RTO for the diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 9c8c42c280b7..bbd85672fda7 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2788,7 +2788,7 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - u32 timeout, rto_delta_us; + u32 timeout, timeout_us, rto_delta_us; int early_retrans; /* Don't do any loss probe on a Fast Open connection before 3WHS @@ -2812,11 +2812,12 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto) * sample is available then probe after TCP_TIMEOUT_INIT. */ if (tp->srtt_us) { - timeout = usecs_to_jiffies(tp->srtt_us >> 2); + timeout_us = tp->srtt_us >> 2; if (tp->packets_out == 1) - timeout += TCP_RTO_MIN; + timeout_us += tcp_rto_min_us(sk); else - timeout += TCP_TIMEOUT_MIN; + timeout_us += TCP_TIMEOUT_MIN_US; + timeout = usecs_to_jiffies(timeout_us); } else { timeout = TCP_TIMEOUT_INIT; } diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index acf4869c5d3b..bba10110fbbc 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -104,7 +104,7 @@ bool tcp_rack_mark_lost(struct sock *sk) tp->rack.advanced = 0; tcp_rack_detect_loss(sk, &timeout); if (timeout) { - timeout = usecs_to_jiffies(timeout) + TCP_TIMEOUT_MIN; + timeout = usecs_to_jiffies(timeout + TCP_TIMEOUT_MIN_US); inet_csk_reset_xmit_timer(sk, ICSK_TIME_REO_TIMEOUT, timeout, inet_csk(sk)->icsk_rto); } -- cgit v1.2.3