From d983ea6f16b835dcde2ee9a58a1e764ce68bfccc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 Oct 2019 20:17:38 -0700 Subject: tcp: add rcu protection around tp->fastopen_rsk Both tcp_v4_err() and tcp_v6_err() do the following operations while they do not own the socket lock : fastopen = tp->fastopen_rsk; snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; The problem is that without appropriate barrier, the compiler might reload tp->fastopen_rsk and trigger a NULL deref. request sockets are protected by RCU, we can simply add the missing annotations and barriers to solve the issue. Fixes: 168a8f58059a ("tcp: TCP Fast Open Server - main code path") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index fec6d67bfd14..84ae4d1449ea 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2482,7 +2482,7 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto) /* Don't do any loss probe on a Fast Open connection before 3WHS * finishes. */ - if (tp->fastopen_rsk) + if (rcu_access_pointer(tp->fastopen_rsk)) return false; early_retrans = sock_net(sk)->ipv4.sysctl_tcp_early_retrans; -- cgit v1.2.3 From 7db48e983930285b765743ebd665aecf9850582b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 Oct 2019 20:17:40 -0700 Subject: tcp: annotate tp->copied_seq lockless reads There are few places where we fetch tp->copied_seq while this field can change from IRQ or other cpu. We need to add READ_ONCE() annotations, and also make sure write sides use corresponding WRITE_ONCE() to avoid store-tearing. Note that tcp_inq_hint() was already using READ_ONCE(tp->copied_seq) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 20 ++++++++++---------- net/ipv4/tcp_diag.c | 3 ++- net/ipv4/tcp_input.c | 6 +++--- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/tcp_minisocks.c | 2 +- net/ipv4/tcp_output.c | 2 +- net/ipv6/tcp_ipv6.c | 2 +- 7 files changed, 19 insertions(+), 18 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 883ee863db43..c322ad071e17 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -477,7 +477,7 @@ static void tcp_tx_timestamp(struct sock *sk, u16 tsflags) static inline bool tcp_stream_is_readable(const struct tcp_sock *tp, int target, struct sock *sk) { - return (READ_ONCE(tp->rcv_nxt) - tp->copied_seq >= target) || + return (READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->copied_seq) >= target) || (sk->sk_prot->stream_memory_read ? sk->sk_prot->stream_memory_read(sk) : false); } @@ -546,7 +546,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) (state != TCP_SYN_RECV || rcu_access_pointer(tp->fastopen_rsk))) { int target = sock_rcvlowat(sk, 0, INT_MAX); - if (tp->urg_seq == tp->copied_seq && + if (tp->urg_seq == READ_ONCE(tp->copied_seq) && !sock_flag(sk, SOCK_URGINLINE) && tp->urg_data) target++; @@ -607,7 +607,7 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) unlock_sock_fast(sk, slow); break; case SIOCATMARK: - answ = tp->urg_data && tp->urg_seq == tp->copied_seq; + answ = tp->urg_data && tp->urg_seq == READ_ONCE(tp->copied_seq); break; case SIOCOUTQ: if (sk->sk_state == TCP_LISTEN) @@ -1668,9 +1668,9 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, sk_eat_skb(sk, skb); if (!desc->count) break; - tp->copied_seq = seq; + WRITE_ONCE(tp->copied_seq, seq); } - tp->copied_seq = seq; + WRITE_ONCE(tp->copied_seq, seq); tcp_rcv_space_adjust(sk); @@ -1819,7 +1819,7 @@ static int tcp_zerocopy_receive(struct sock *sk, out: up_read(¤t->mm->mmap_sem); if (length) { - tp->copied_seq = seq; + WRITE_ONCE(tp->copied_seq, seq); tcp_rcv_space_adjust(sk); /* Clean up data we have read: This will do ACK frames. */ @@ -2117,7 +2117,7 @@ found_ok_skb: if (urg_offset < used) { if (!urg_offset) { if (!sock_flag(sk, SOCK_URGINLINE)) { - ++*seq; + WRITE_ONCE(*seq, *seq + 1); urg_hole++; offset++; used--; @@ -2139,7 +2139,7 @@ found_ok_skb: } } - *seq += used; + WRITE_ONCE(*seq, *seq + used); copied += used; len -= used; @@ -2166,7 +2166,7 @@ skip_copy: found_fin_ok: /* Process the FIN. */ - ++*seq; + WRITE_ONCE(*seq, *seq + 1); if (!(flags & MSG_PEEK)) sk_eat_skb(sk, skb); break; @@ -2588,7 +2588,7 @@ int tcp_disconnect(struct sock *sk, int flags) __kfree_skb(sk->sk_rx_skb_cache); sk->sk_rx_skb_cache = NULL; } - tp->copied_seq = tp->rcv_nxt; + WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); tp->urg_data = 0; tcp_write_queue_purge(sk); tcp_fastopen_active_disable_ofo_check(sk); diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index cd219161f106..66273c8a55c2 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -26,7 +26,8 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, } else if (sk->sk_type == SOCK_STREAM) { const struct tcp_sock *tp = tcp_sk(sk); - r->idiag_rqueue = max_t(int, READ_ONCE(tp->rcv_nxt) - tp->copied_seq, 0); + r->idiag_rqueue = max_t(int, READ_ONCE(tp->rcv_nxt) - + READ_ONCE(tp->copied_seq), 0); r->idiag_wqueue = tp->write_seq - tp->snd_una; } if (info) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5b7c8768ed5f..a30aae3a6a18 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5961,7 +5961,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, /* Remember, tcp_poll() does not lock socket! * Change state from SYN-SENT only after copied_seq * is initialized. */ - tp->copied_seq = tp->rcv_nxt; + WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); smc_check_reset_syn(tp); @@ -6036,7 +6036,7 @@ discard: } WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1); - tp->copied_seq = tp->rcv_nxt; + WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; /* RFC1323: The window in SYN & SYN/ACK segments is @@ -6216,7 +6216,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) tcp_try_undo_spurious_syn(sk); tp->retrans_stamp = 0; tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); - tp->copied_seq = tp->rcv_nxt; + WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); } smp_mb(); tcp_set_state(sk, TCP_ESTABLISHED); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 5089dd6bee0f..39560f482e0b 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2456,7 +2456,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) * we might find a transient negative value. */ rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - - tp->copied_seq, 0); + READ_ONCE(tp->copied_seq), 0); seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index adc6ce486a38..c4731d26ab4a 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -478,7 +478,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, seq = treq->rcv_isn + 1; newtp->rcv_wup = seq; - newtp->copied_seq = seq; + WRITE_ONCE(newtp->copied_seq, seq); WRITE_ONCE(newtp->rcv_nxt, seq); newtp->segs_in = 1; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 84ae4d1449ea..7dda12720169 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3433,7 +3433,7 @@ static void tcp_connect_init(struct sock *sk) else tp->rcv_tstamp = tcp_jiffies32; tp->rcv_wup = tp->rcv_nxt; - tp->copied_seq = tp->rcv_nxt; + WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); inet_csk(sk)->icsk_rto = tcp_timeout_init(sk); inet_csk(sk)->icsk_retransmits = 0; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 89ea0a7018b5..a62c7042fc4a 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1896,7 +1896,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) * we might find a transient negative value. */ rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - - tp->copied_seq, 0); + READ_ONCE(tp->copied_seq), 0); seq_printf(seq, "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " -- cgit v1.2.3 From 0f31746452e6793ad6271337438af8f4defb8940 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 Oct 2019 20:17:41 -0700 Subject: tcp: annotate tp->write_seq lockless reads There are few places where we fetch tp->write_seq while this field can change from IRQ or other cpu. We need to add READ_ONCE() annotations, and also make sure write sides use corresponding WRITE_ONCE() to avoid store-tearing. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 2 +- net/ipv4/tcp.c | 20 ++++++++++++-------- net/ipv4/tcp_diag.c | 2 +- net/ipv4/tcp_ipv4.c | 21 ++++++++++++--------- net/ipv4/tcp_minisocks.c | 2 +- net/ipv4/tcp_output.c | 4 ++-- net/ipv6/tcp_ipv6.c | 13 +++++++------ 7 files changed, 36 insertions(+), 28 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/include/net/tcp.h b/include/net/tcp.h index 35f6f7e0fdc2..8e7c3f6801a9 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1917,7 +1917,7 @@ static inline u32 tcp_notsent_lowat(const struct tcp_sock *tp) static inline bool tcp_stream_memory_free(const struct sock *sk, int wake) { const struct tcp_sock *tp = tcp_sk(sk); - u32 notsent_bytes = tp->write_seq - tp->snd_nxt; + u32 notsent_bytes = READ_ONCE(tp->write_seq) - tp->snd_nxt; return (notsent_bytes << wake) < tcp_notsent_lowat(tp); } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c322ad071e17..96dd65cbeb85 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -616,7 +616,7 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) answ = 0; else - answ = tp->write_seq - tp->snd_una; + answ = READ_ONCE(tp->write_seq) - tp->snd_una; break; case SIOCOUTQNSD: if (sk->sk_state == TCP_LISTEN) @@ -625,7 +625,7 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) answ = 0; else - answ = tp->write_seq - tp->snd_nxt; + answ = READ_ONCE(tp->write_seq) - tp->snd_nxt; break; default: return -ENOIOCTLCMD; @@ -1035,7 +1035,7 @@ new_segment: sk->sk_wmem_queued += copy; sk_mem_charge(sk, copy); skb->ip_summed = CHECKSUM_PARTIAL; - tp->write_seq += copy; + WRITE_ONCE(tp->write_seq, tp->write_seq + copy); TCP_SKB_CB(skb)->end_seq += copy; tcp_skb_pcount_set(skb, 0); @@ -1362,7 +1362,7 @@ new_segment: if (!copied) TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; - tp->write_seq += copy; + WRITE_ONCE(tp->write_seq, tp->write_seq + copy); TCP_SKB_CB(skb)->end_seq += copy; tcp_skb_pcount_set(skb, 0); @@ -2562,6 +2562,7 @@ int tcp_disconnect(struct sock *sk, int flags) struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int old_state = sk->sk_state; + u32 seq; if (old_state != TCP_CLOSE) tcp_set_state(sk, TCP_CLOSE); @@ -2604,9 +2605,12 @@ int tcp_disconnect(struct sock *sk, int flags) tp->srtt_us = 0; tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); tp->rcv_rtt_last_tsecr = 0; - tp->write_seq += tp->max_window + 2; - if (tp->write_seq == 0) - tp->write_seq = 1; + + seq = tp->write_seq + tp->max_window + 2; + if (!seq) + seq = 1; + WRITE_ONCE(tp->write_seq, seq); + icsk->icsk_backoff = 0; tp->snd_cwnd = 2; icsk->icsk_probes_out = 0; @@ -2933,7 +2937,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, if (sk->sk_state != TCP_CLOSE) err = -EPERM; else if (tp->repair_queue == TCP_SEND_QUEUE) - tp->write_seq = val; + WRITE_ONCE(tp->write_seq, val); else if (tp->repair_queue == TCP_RECV_QUEUE) WRITE_ONCE(tp->rcv_nxt, val); else diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 66273c8a55c2..549506162dde 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -28,7 +28,7 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, r->idiag_rqueue = max_t(int, READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->copied_seq), 0); - r->idiag_wqueue = tp->write_seq - tp->snd_una; + r->idiag_wqueue = READ_ONCE(tp->write_seq) - tp->snd_una; } if (info) tcp_get_info(sk, info); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 39560f482e0b..6be568334848 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -164,9 +164,11 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) * without appearing to create any others. */ if (likely(!tp->repair)) { - tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; - if (tp->write_seq == 0) - tp->write_seq = 1; + u32 seq = tcptw->tw_snd_nxt + 65535 + 2; + + if (!seq) + seq = 1; + WRITE_ONCE(tp->write_seq, seq); tp->rx_opt.ts_recent = tcptw->tw_ts_recent; tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; } @@ -253,7 +255,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) tp->rx_opt.ts_recent = 0; tp->rx_opt.ts_recent_stamp = 0; if (likely(!tp->repair)) - tp->write_seq = 0; + WRITE_ONCE(tp->write_seq, 0); } inet->inet_dport = usin->sin_port; @@ -291,10 +293,11 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (likely(!tp->repair)) { if (!tp->write_seq) - tp->write_seq = secure_tcp_seq(inet->inet_saddr, - inet->inet_daddr, - inet->inet_sport, - usin->sin_port); + WRITE_ONCE(tp->write_seq, + secure_tcp_seq(inet->inet_saddr, + inet->inet_daddr, + inet->inet_sport, + usin->sin_port)); tp->tsoffset = secure_tcp_ts_off(sock_net(sk), inet->inet_saddr, inet->inet_daddr); @@ -2461,7 +2464,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", i, src, srcp, dest, destp, state, - tp->write_seq - tp->snd_una, + READ_ONCE(tp->write_seq) - tp->snd_una, rx_queue, timer_active, jiffies_delta_to_clock_t(timer_expires - jiffies), diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index c4731d26ab4a..339944690329 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -498,7 +498,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->total_retrans = req->num_retrans; tcp_init_xmit_timers(newsk); - newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1; + WRITE_ONCE(newtp->write_seq, newtp->pushed_seq = treq->snt_isn + 1); if (sock_flag(newsk, SOCK_KEEPOPEN)) inet_csk_reset_keepalive_timer(newsk, diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 7dda12720169..c17c2a78809d 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1196,7 +1196,7 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) struct tcp_sock *tp = tcp_sk(sk); /* Advance write_seq and place onto the write_queue. */ - tp->write_seq = TCP_SKB_CB(skb)->end_seq; + WRITE_ONCE(tp->write_seq, TCP_SKB_CB(skb)->end_seq); __skb_header_release(skb); tcp_add_write_queue_tail(sk, skb); sk->sk_wmem_queued += skb->truesize; @@ -3449,7 +3449,7 @@ static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb) __skb_header_release(skb); sk->sk_wmem_queued += skb->truesize; sk_mem_charge(sk, skb->truesize); - tp->write_seq = tcb->end_seq; + WRITE_ONCE(tp->write_seq, tcb->end_seq); tp->packets_out += tcp_skb_pcount(skb); } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index a62c7042fc4a..4804b6dc5e65 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -215,7 +215,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, !ipv6_addr_equal(&sk->sk_v6_daddr, &usin->sin6_addr)) { tp->rx_opt.ts_recent = 0; tp->rx_opt.ts_recent_stamp = 0; - tp->write_seq = 0; + WRITE_ONCE(tp->write_seq, 0); } sk->sk_v6_daddr = usin->sin6_addr; @@ -311,10 +311,11 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, if (likely(!tp->repair)) { if (!tp->write_seq) - tp->write_seq = secure_tcpv6_seq(np->saddr.s6_addr32, - sk->sk_v6_daddr.s6_addr32, - inet->inet_sport, - inet->inet_dport); + WRITE_ONCE(tp->write_seq, + secure_tcpv6_seq(np->saddr.s6_addr32, + sk->sk_v6_daddr.s6_addr32, + inet->inet_sport, + inet->inet_dport)); tp->tsoffset = secure_tcpv6_ts_off(sock_net(sk), np->saddr.s6_addr32, sk->sk_v6_daddr.s6_addr32); @@ -1907,7 +1908,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) dest->s6_addr32[0], dest->s6_addr32[1], dest->s6_addr32[2], dest->s6_addr32[3], destp, state, - tp->write_seq - tp->snd_una, + READ_ONCE(tp->write_seq) - tp->snd_una, rx_queue, timer_active, jiffies_delta_to_clock_t(timer_expires - jiffies), -- cgit v1.2.3 From e0d694d638dba768b47be31c22e1a9b4f862f561 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 Oct 2019 20:17:42 -0700 Subject: tcp: annotate tp->snd_nxt lockless reads There are few places where we fetch tp->snd_nxt while this field can change from IRQ or other cpu. We need to add READ_ONCE() annotations, and also make sure write sides use corresponding WRITE_ONCE() to avoid store-tearing. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 3 ++- net/ipv4/tcp.c | 3 ++- net/ipv4/tcp_minisocks.c | 6 ++++-- net/ipv4/tcp_output.c | 10 +++++----- 4 files changed, 13 insertions(+), 9 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/include/net/tcp.h b/include/net/tcp.h index 8e7c3f6801a9..e1d08f69fd39 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1917,7 +1917,8 @@ static inline u32 tcp_notsent_lowat(const struct tcp_sock *tp) static inline bool tcp_stream_memory_free(const struct sock *sk, int wake) { const struct tcp_sock *tp = tcp_sk(sk); - u32 notsent_bytes = READ_ONCE(tp->write_seq) - tp->snd_nxt; + u32 notsent_bytes = READ_ONCE(tp->write_seq) - + READ_ONCE(tp->snd_nxt); return (notsent_bytes << wake) < tcp_notsent_lowat(tp); } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 96dd65cbeb85..652568750cb1 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -625,7 +625,8 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) answ = 0; else - answ = READ_ONCE(tp->write_seq) - tp->snd_nxt; + answ = READ_ONCE(tp->write_seq) - + READ_ONCE(tp->snd_nxt); break; default: return -ENOIOCTLCMD; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 339944690329..c802bc80c400 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -482,8 +482,10 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, WRITE_ONCE(newtp->rcv_nxt, seq); newtp->segs_in = 1; - newtp->snd_sml = newtp->snd_una = - newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1; + seq = treq->snt_isn + 1; + newtp->snd_sml = newtp->snd_una = seq; + WRITE_ONCE(newtp->snd_nxt, seq); + newtp->snd_up = seq; INIT_LIST_HEAD(&newtp->tsq_node); INIT_LIST_HEAD(&newtp->tsorted_sent_queue); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index c17c2a78809d..a115a991dfb5 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -67,7 +67,7 @@ static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb) struct tcp_sock *tp = tcp_sk(sk); unsigned int prior_packets = tp->packets_out; - tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; + WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(skb)->end_seq); __skb_unlink(skb, &sk->sk_write_queue); tcp_rbtree_insert(&sk->tcp_rtx_queue, skb); @@ -3142,7 +3142,7 @@ void tcp_send_fin(struct sock *sk) * if FIN had been sent. This is because retransmit path * does not change tp->snd_nxt. */ - tp->snd_nxt++; + WRITE_ONCE(tp->snd_nxt, tp->snd_nxt + 1); return; } } else { @@ -3426,7 +3426,7 @@ static void tcp_connect_init(struct sock *sk) tp->snd_una = tp->write_seq; tp->snd_sml = tp->write_seq; tp->snd_up = tp->write_seq; - tp->snd_nxt = tp->write_seq; + WRITE_ONCE(tp->snd_nxt, tp->write_seq); if (likely(!tp->repair)) tp->rcv_nxt = 0; @@ -3586,11 +3586,11 @@ int tcp_connect(struct sock *sk) /* We change tp->snd_nxt after the tcp_transmit_skb() call * in order to make this packet get counted in tcpOutSegs. */ - tp->snd_nxt = tp->write_seq; + WRITE_ONCE(tp->snd_nxt, tp->write_seq); tp->pushed_seq = tp->write_seq; buff = tcp_send_head(sk); if (unlikely(buff)) { - tp->snd_nxt = TCP_SKB_CB(buff)->seq; + WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(buff)->seq); tp->pushed_seq = TCP_SKB_CB(buff)->seq; } TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS); -- cgit v1.2.3 From ab4e846a82d0ae00176de19f2db3c5c64f8eb5f2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 Oct 2019 20:17:46 -0700 Subject: tcp: annotate sk->sk_wmem_queued lockless reads For the sake of tcp_poll(), there are few places where we fetch sk->sk_wmem_queued while this field can change from IRQ or other cpu. We need to add READ_ONCE() annotations, and also make sure write sides use corresponding WRITE_ONCE() to avoid store-tearing. sk_wmem_queued_add() helper is added so that we can in the future convert to ADD_ONCE() or equivalent if/when available. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 15 ++++++++++----- include/trace/events/sock.h | 2 +- net/core/datagram.c | 2 +- net/core/sock.c | 2 +- net/ipv4/inet_diag.c | 2 +- net/ipv4/tcp.c | 4 ++-- net/ipv4/tcp_output.c | 14 +++++++------- net/sched/em_meta.c | 2 +- 8 files changed, 24 insertions(+), 19 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/include/net/sock.h b/include/net/sock.h index 3d1e7502333e..f69b58bff7e5 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -878,12 +878,17 @@ static inline bool sk_acceptq_is_full(const struct sock *sk) */ static inline int sk_stream_min_wspace(const struct sock *sk) { - return sk->sk_wmem_queued >> 1; + return READ_ONCE(sk->sk_wmem_queued) >> 1; } static inline int sk_stream_wspace(const struct sock *sk) { - return READ_ONCE(sk->sk_sndbuf) - sk->sk_wmem_queued; + return READ_ONCE(sk->sk_sndbuf) - READ_ONCE(sk->sk_wmem_queued); +} + +static inline void sk_wmem_queued_add(struct sock *sk, int val) +{ + WRITE_ONCE(sk->sk_wmem_queued, sk->sk_wmem_queued + val); } void sk_stream_write_space(struct sock *sk); @@ -1207,7 +1212,7 @@ static inline void sk_refcnt_debug_release(const struct sock *sk) static inline bool __sk_stream_memory_free(const struct sock *sk, int wake) { - if (sk->sk_wmem_queued >= READ_ONCE(sk->sk_sndbuf)) + if (READ_ONCE(sk->sk_wmem_queued) >= READ_ONCE(sk->sk_sndbuf)) return false; return sk->sk_prot->stream_memory_free ? @@ -1467,7 +1472,7 @@ DECLARE_STATIC_KEY_FALSE(tcp_tx_skb_cache_key); static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb) { sock_set_flag(sk, SOCK_QUEUE_SHRUNK); - sk->sk_wmem_queued -= skb->truesize; + sk_wmem_queued_add(sk, -skb->truesize); sk_mem_uncharge(sk, skb->truesize); if (static_branch_unlikely(&tcp_tx_skb_cache_key) && !sk->sk_tx_skb_cache && !skb_cloned(skb)) { @@ -2014,7 +2019,7 @@ static inline int skb_copy_to_page_nocache(struct sock *sk, struct iov_iter *fro skb->len += copy; skb->data_len += copy; skb->truesize += copy; - sk->sk_wmem_queued += copy; + sk_wmem_queued_add(sk, copy); sk_mem_charge(sk, copy); return 0; } diff --git a/include/trace/events/sock.h b/include/trace/events/sock.h index f720c32e7dfd..51fe9f6719eb 100644 --- a/include/trace/events/sock.h +++ b/include/trace/events/sock.h @@ -115,7 +115,7 @@ TRACE_EVENT(sock_exceed_buf_limit, __entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc); __entry->sysctl_wmem = sk_get_wmem0(sk, prot); __entry->wmem_alloc = refcount_read(&sk->sk_wmem_alloc); - __entry->wmem_queued = sk->sk_wmem_queued; + __entry->wmem_queued = READ_ONCE(sk->sk_wmem_queued); __entry->kind = kind; ), diff --git a/net/core/datagram.c b/net/core/datagram.c index 4cc8dc5db2b7..c210fc116103 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -640,7 +640,7 @@ int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb, skb->len += copied; skb->truesize += truesize; if (sk && sk->sk_type == SOCK_STREAM) { - sk->sk_wmem_queued += truesize; + sk_wmem_queued_add(sk, truesize); sk_mem_charge(sk, truesize); } else { refcount_add(truesize, &skb->sk->sk_wmem_alloc); diff --git a/net/core/sock.c b/net/core/sock.c index cd075bc86407..a515392ba84b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3212,7 +3212,7 @@ void sk_get_meminfo(const struct sock *sk, u32 *mem) mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk); mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf); mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc; - mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued; + mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued); mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index bbb005eb5218..7dc79b973e6e 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -193,7 +193,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, if (ext & (1 << (INET_DIAG_MEMINFO - 1))) { struct inet_diag_meminfo minfo = { .idiag_rmem = sk_rmem_alloc_get(sk), - .idiag_wmem = sk->sk_wmem_queued, + .idiag_wmem = READ_ONCE(sk->sk_wmem_queued), .idiag_fmem = sk->sk_forward_alloc, .idiag_tmem = sk_wmem_alloc_get(sk), }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 111853262972..b2ac4f074e2d 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -659,7 +659,7 @@ static void skb_entail(struct sock *sk, struct sk_buff *skb) tcb->sacked = 0; __skb_header_release(skb); tcp_add_write_queue_tail(sk, skb); - sk->sk_wmem_queued += skb->truesize; + sk_wmem_queued_add(sk, skb->truesize); sk_mem_charge(sk, skb->truesize); if (tp->nonagle & TCP_NAGLE_PUSH) tp->nonagle &= ~TCP_NAGLE_PUSH; @@ -1034,7 +1034,7 @@ new_segment: skb->len += copy; skb->data_len += copy; skb->truesize += copy; - sk->sk_wmem_queued += copy; + sk_wmem_queued_add(sk, copy); sk_mem_charge(sk, copy); skb->ip_summed = CHECKSUM_PARTIAL; WRITE_ONCE(tp->write_seq, tp->write_seq + copy); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a115a991dfb5..0488607c5cd3 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1199,7 +1199,7 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) WRITE_ONCE(tp->write_seq, TCP_SKB_CB(skb)->end_seq); __skb_header_release(skb); tcp_add_write_queue_tail(sk, skb); - sk->sk_wmem_queued += skb->truesize; + sk_wmem_queued_add(sk, skb->truesize); sk_mem_charge(sk, skb->truesize); } @@ -1333,7 +1333,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, return -ENOMEM; /* We'll just try again later. */ skb_copy_decrypted(buff, skb); - sk->sk_wmem_queued += buff->truesize; + sk_wmem_queued_add(sk, buff->truesize); sk_mem_charge(sk, buff->truesize); nlen = skb->len - len - nsize; buff->truesize += nlen; @@ -1443,7 +1443,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) if (delta_truesize) { skb->truesize -= delta_truesize; - sk->sk_wmem_queued -= delta_truesize; + sk_wmem_queued_add(sk, -delta_truesize); sk_mem_uncharge(sk, delta_truesize); sock_set_flag(sk, SOCK_QUEUE_SHRUNK); } @@ -1888,7 +1888,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, return -ENOMEM; skb_copy_decrypted(buff, skb); - sk->sk_wmem_queued += buff->truesize; + sk_wmem_queued_add(sk, buff->truesize); sk_mem_charge(sk, buff->truesize); buff->truesize += nlen; skb->truesize -= nlen; @@ -2152,7 +2152,7 @@ static int tcp_mtu_probe(struct sock *sk) nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC, false); if (!nskb) return -1; - sk->sk_wmem_queued += nskb->truesize; + sk_wmem_queued_add(sk, nskb->truesize); sk_mem_charge(sk, nskb->truesize); skb = tcp_send_head(sk); @@ -3222,7 +3222,7 @@ int tcp_send_synack(struct sock *sk) tcp_rtx_queue_unlink_and_free(skb, sk); __skb_header_release(nskb); tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb); - sk->sk_wmem_queued += nskb->truesize; + sk_wmem_queued_add(sk, nskb->truesize); sk_mem_charge(sk, nskb->truesize); skb = nskb; } @@ -3447,7 +3447,7 @@ static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb) tcb->end_seq += skb->len; __skb_header_release(skb); - sk->sk_wmem_queued += skb->truesize; + sk_wmem_queued_add(sk, skb->truesize); sk_mem_charge(sk, skb->truesize); WRITE_ONCE(tp->write_seq, tcb->end_seq); tp->packets_out += tcp_skb_pcount(skb); diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index 4c9122fc35c9..3177dcb17316 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -446,7 +446,7 @@ META_COLLECTOR(int_sk_wmem_queued) *err = -1; return; } - dst->value = sk->sk_wmem_queued; + dst->value = READ_ONCE(sk->sk_wmem_queued); } META_COLLECTOR(int_sk_fwd_alloc) -- cgit v1.2.3