Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--	net/ipv4/tcp_input.c	271
1 file changed, 184 insertions, 87 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index e51c644484dc..355d3dffd021 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -111,6 +111,25 @@ int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
 #define REXMIT_LOST	1 /* retransmit packets marked lost */
 #define REXMIT_NEW	2 /* FRTO-style transmit of unsent/new packets */
 
+#if IS_ENABLED(CONFIG_TLS_DEVICE)
+static DEFINE_STATIC_KEY_FALSE(clean_acked_data_enabled);
+
+void clean_acked_data_enable(struct inet_connection_sock *icsk,
+			     void (*cad)(struct sock *sk, u32 ack_seq))
+{
+	icsk->icsk_clean_acked = cad;
+	static_branch_inc(&clean_acked_data_enabled);
+}
+EXPORT_SYMBOL_GPL(clean_acked_data_enable);
+
+void clean_acked_data_disable(struct inet_connection_sock *icsk)
+{
+	static_branch_dec(&clean_acked_data_enabled);
+	icsk->icsk_clean_acked = NULL;
+}
+EXPORT_SYMBOL_GPL(clean_acked_data_disable);
+#endif
+
 static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb,
 			     unsigned int len)
 {
@@ -184,21 +203,23 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
 	}
 }
 
-static void tcp_incr_quickack(struct sock *sk)
+static void tcp_incr_quickack(struct sock *sk, unsigned int max_quickacks)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	unsigned int quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);
 
 	if (quickacks == 0)
 		quickacks = 2;
+	quickacks = min(quickacks, max_quickacks);
 	if (quickacks > icsk->icsk_ack.quick)
-		icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
+		icsk->icsk_ack.quick = quickacks;
 }
 
-static void tcp_enter_quickack_mode(struct sock *sk)
+static void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
-	tcp_incr_quickack(sk);
+
+	tcp_incr_quickack(sk, max_quickacks);
 	icsk->icsk_ack.pingpong = 0;
 	icsk->icsk_ack.ato = TCP_ATO_MIN;
 }
@@ -233,8 +254,10 @@ static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
 	tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
 }
 
-static void __tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
+static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
+
 	switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) {
 	case INET_ECN_NOT_ECT:
 		/* Funny extension: if ECT is not set on a segment,
@@ -242,31 +265,31 @@ static void __tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
 		 * it is probably a retransmit.
 		 */
 		if (tp->ecn_flags & TCP_ECN_SEEN)
-			tcp_enter_quickack_mode((struct sock *)tp);
+			tcp_enter_quickack_mode(sk, 1);
 		break;
 	case INET_ECN_CE:
-		if (tcp_ca_needs_ecn((struct sock *)tp))
-			tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_IS_CE);
+		if (tcp_ca_needs_ecn(sk))
+			tcp_ca_event(sk, CA_EVENT_ECN_IS_CE);
 
 		if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
 			/* Better not delay acks, sender can have a very low cwnd */
-			tcp_enter_quickack_mode((struct sock *)tp);
+			tcp_enter_quickack_mode(sk, 1);
 			tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
 		}
 		tp->ecn_flags |= TCP_ECN_SEEN;
 		break;
 	default:
-		if (tcp_ca_needs_ecn((struct sock *)tp))
-			tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_NO_CE);
+		if (tcp_ca_needs_ecn(sk))
+			tcp_ca_event(sk, CA_EVENT_ECN_NO_CE);
 		tp->ecn_flags |= TCP_ECN_SEEN;
 		break;
 	}
 }
 
-static void tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
+static void tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
 {
-	if (tp->ecn_flags & TCP_ECN_OK)
-		__tcp_ecn_check_ce(tp, skb);
+	if (tcp_sk(sk)->ecn_flags & TCP_ECN_OK)
+		__tcp_ecn_check_ce(sk, skb);
 }
 
 static void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th)
@@ -582,6 +605,8 @@ void tcp_rcv_space_adjust(struct sock *sk)
 	u32 copied;
 	int time;
 
+	trace_tcp_rcv_space_adjust(sk);
+
 	tcp_mstamp_refresh(tp);
 	time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time);
 	if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0)
@@ -665,7 +690,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
 		/* The _first_ data packet received, initialize
 		 * delayed ACK engine.
 		 */
-		tcp_incr_quickack(sk);
+		tcp_incr_quickack(sk, TCP_MAX_QUICKACKS);
 		icsk->icsk_ack.ato = TCP_ATO_MIN;
 	} else {
 		int m = now - icsk->icsk_ack.lrcvtime;
@@ -681,13 +706,13 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
 			/* Too long gap. Apparently sender failed to
 			 * restart window, so that we send ACKs quickly.
 			 */
-			tcp_incr_quickack(sk);
+			tcp_incr_quickack(sk, TCP_MAX_QUICKACKS);
 			sk_mem_reclaim(sk);
 		}
 	}
 	icsk->icsk_ack.lrcvtime = now;
 
-	tcp_ecn_check_ce(tp, skb);
+	tcp_ecn_check_ce(sk, skb);
 
 	if (skb->len >= 128)
 		tcp_grow_window(sk, skb);
@@ -1896,19 +1921,54 @@ static inline void tcp_init_undo(struct tcp_sock *tp)
 	tp->undo_retrans = tp->retrans_out ? : -1;
 }
 
-/* Enter Loss state. If we detect SACK reneging, forget all SACK information
+static bool tcp_is_rack(const struct sock *sk)
+{
+	return sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION;
+}
+
+/* If we detect SACK reneging, forget all SACK information
  * and reset tags completely, otherwise preserve SACKs. If receiver
  * dropped its ofo queue, we will know this due to reneging detection.
  */
+static void tcp_timeout_mark_lost(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb, *head;
+	bool is_reneg;			/* is receiver reneging on SACKs? */
+
+	head = tcp_rtx_queue_head(sk);
+	is_reneg = head && (TCP_SKB_CB(head)->sacked & TCPCB_SACKED_ACKED);
+	if (is_reneg) {
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
+		tp->sacked_out = 0;
+		/* Mark SACK reneging until we recover from this loss event. */
+		tp->is_sack_reneg = 1;
+	} else if (tcp_is_reno(tp)) {
+		tcp_reset_reno_sack(tp);
+	}
+
+	skb = head;
+	skb_rbtree_walk_from(skb) {
+		if (is_reneg)
+			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
+		else if (tcp_is_rack(sk) && skb != head &&
+			 tcp_rack_skb_timeout(tp, skb, 0) > 0)
+			continue; /* Don't mark recently sent ones lost yet */
+		tcp_mark_skb_lost(sk, skb);
+	}
+	tcp_verify_left_out(tp);
+	tcp_clear_all_retrans_hints(tp);
+}
+
+/* Enter Loss state. */
 void tcp_enter_loss(struct sock *sk)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct net *net = sock_net(sk);
-	struct sk_buff *skb;
 	bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
-	bool is_reneg; /* is receiver reneging on SACKs? */
-	bool mark_lost;
+
+	tcp_timeout_mark_lost(sk);
 
 	/* Reduce ssthresh if it has not yet been made inside this window. */
 	if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
@@ -1920,40 +1980,10 @@ void tcp_enter_loss(struct sock *sk)
 		tcp_ca_event(sk, CA_EVENT_LOSS);
 		tcp_init_undo(tp);
 	}
-	tp->snd_cwnd	   = 1;
+	tp->snd_cwnd	   = tcp_packets_in_flight(tp) + 1;
 	tp->snd_cwnd_cnt   = 0;
 	tp->snd_cwnd_stamp = tcp_jiffies32;
 
-	tp->retrans_out = 0;
-	tp->lost_out = 0;
-
-	if (tcp_is_reno(tp))
-		tcp_reset_reno_sack(tp);
-
-	skb = tcp_rtx_queue_head(sk);
-	is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED);
-	if (is_reneg) {
-		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
-		tp->sacked_out = 0;
-		/* Mark SACK reneging until we recover from this loss event. */
-		tp->is_sack_reneg = 1;
-	}
-	tcp_clear_all_retrans_hints(tp);
-
-	skb_rbtree_walk_from(skb) {
-		mark_lost = (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
-			     is_reneg);
-		if (mark_lost)
-			tcp_sum_lost(tp, skb);
-		TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
-		if (mark_lost) {
-			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
-			TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
-			tp->lost_out += tcp_skb_pcount(skb);
-		}
-	}
-	tcp_verify_left_out(tp);
-
 	/* Timeout in disordered state after receiving substantial DUPACKs
 	 * suggests that the degree of reordering is over-estimated.
 	 */
@@ -2120,7 +2150,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
 		return true;
 
 	/* Not-A-Trick#2 : Classic rule... */
-	if (tcp_dupack_heuristics(tp) > tp->reordering)
+	if (!tcp_is_rack(sk) && tcp_dupack_heuristics(tp) > tp->reordering)
 		return true;
 
 	return false;
@@ -2197,9 +2227,7 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	if (tcp_is_reno(tp)) {
-		tcp_mark_head_lost(sk, 1, 1);
-	} else {
+	if (tcp_is_sack(tp)) {
 		int sacked_upto = tp->sacked_out - tp->reordering;
 		if (sacked_upto >= 0)
 			tcp_mark_head_lost(sk, sacked_upto, 0);
@@ -2697,12 +2725,16 @@ static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una)
 	return false;
 }
 
-static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag)
+static void tcp_identify_packet_loss(struct sock *sk, int *ack_flag)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	/* Use RACK to detect loss */
-	if (sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) {
+	if (tcp_rtx_queue_empty(sk))
+		return;
+
+	if (unlikely(tcp_is_reno(tp))) {
+		tcp_newreno_mark_lost(sk, *ack_flag & FLAG_SND_UNA_ADVANCED);
+	} else if (tcp_is_rack(sk)) {
 		u32 prior_retrans = tp->retrans_out;
 
 		tcp_rack_mark_lost(sk);
@@ -2798,11 +2830,11 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
 			tcp_try_keep_open(sk);
 			return;
 		}
-		tcp_rack_identify_loss(sk, ack_flag);
+		tcp_identify_packet_loss(sk, ack_flag);
 		break;
 	case TCP_CA_Loss:
 		tcp_process_loss(sk, flag, is_dupack, rexmit);
-		tcp_rack_identify_loss(sk, ack_flag);
+		tcp_identify_packet_loss(sk, ack_flag);
 		if (!(icsk->icsk_ca_state == TCP_CA_Open ||
 		      (*ack_flag & FLAG_LOST_RETRANS)))
 			return;
@@ -2819,7 +2851,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
 		if (icsk->icsk_ca_state <= TCP_CA_Disorder)
 			tcp_try_undo_dsack(sk);
 
-		tcp_rack_identify_loss(sk, ack_flag);
+		tcp_identify_packet_loss(sk, ack_flag);
 		if (!tcp_time_to_recover(sk, flag)) {
 			tcp_try_to_open(sk, flag);
 			return;
@@ -2841,7 +2873,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
 		fast_rexmit = 1;
 	}
 
-	if (do_lost)
+	if (!tcp_is_rack(sk) && do_lost)
 		tcp_update_scoreboard(sk, fast_rexmit);
 	*rexmit = REXMIT_LOST;
 }
@@ -3496,6 +3528,22 @@ static void tcp_xmit_recovery(struct sock *sk, int rexmit)
 	tcp_xmit_retransmit_queue(sk);
 }
 
+/* Returns the number of packets newly acked or sacked by the current ACK */
+static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag)
+{
+	const struct net *net = sock_net(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 delivered;
+
+	delivered = tp->delivered - prior_delivered;
+	NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, delivered);
+	if (flag & FLAG_ECE) {
+		tp->delivered_ce += delivered;
+		NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, delivered);
+	}
+	return delivered;
+}
+
 /* This routine deals with incoming acks, but not outgoing ones. */
 static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 {
@@ -3542,6 +3590,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	if (after(ack, prior_snd_una)) {
 		flag |= FLAG_SND_UNA_ADVANCED;
 		icsk->icsk_retransmits = 0;
+
+#if IS_ENABLED(CONFIG_TLS_DEVICE)
+		if (static_branch_unlikely(&clean_acked_data_enabled))
+			if (icsk->icsk_clean_acked)
+				icsk->icsk_clean_acked(sk, ack);
+#endif
 	}
 
 	prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una;
@@ -3619,7 +3673,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
 		sk_dst_confirm(sk);
 
-	delivered = tp->delivered - delivered;	/* freshly ACKed or SACKed */
+	delivered = tcp_newly_delivered(sk, delivered, flag);
 	lost = tp->lost - lost;			/* freshly marked lost */
 	rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED);
 	tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate);
@@ -3629,9 +3683,11 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 
 no_queue:
 	/* If data was DSACKed, see if we can undo a cwnd reduction. */
-	if (flag & FLAG_DSACKING_ACK)
+	if (flag & FLAG_DSACKING_ACK) {
 		tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
 				      &rexmit);
+		tcp_newly_delivered(sk, delivered, flag);
+	}
 	/* If this ack opens up a zero window, clear backoff.  It was
 	 * being used to time the probes, and is probably far higher than
 	 * it needs to be for normal retransmission.
@@ -3655,6 +3711,7 @@ old_ack:
 						&sack_state);
 		tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
 				      &rexmit);
+		tcp_newly_delivered(sk, delivered, flag);
 		tcp_xmit_recovery(sk, rexmit);
 	}
 
@@ -4126,7 +4183,7 @@ static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
 	if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
 	    before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
-		tcp_enter_quickack_mode(sk);
+		tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
 
 		if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) {
 			u32 end_seq = TCP_SKB_CB(skb)->end_seq;
@@ -4196,6 +4253,8 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
 	 * If the sack array is full, forget about the last one.
 	 */
 	if (this_sack >= TCP_NUM_SACKS) {
+		if (tp->compressed_ack)
+			tcp_send_ack(sk);
 		this_sack--;
 		tp->rx_opt.num_sacks--;
 		sp--;
@@ -4377,7 +4436,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 	u32 seq, end_seq;
 	bool fragstolen;
 
-	tcp_ecn_check_ce(tp, skb);
+	tcp_ecn_check_ce(sk, skb);
 
 	if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP);
@@ -4573,6 +4632,17 @@ err:
 
 }
 
+void tcp_data_ready(struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	int avail = tp->rcv_nxt - tp->copied_seq;
+
+	if (avail < sk->sk_rcvlowat && !sock_flag(sk, SOCK_DONE))
+		return;
+
+	sk->sk_data_ready(sk);
+}
+
 static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -4630,7 +4700,7 @@ queue_and_out:
 		if (eaten > 0)
 			kfree_skb_partial(skb, fragstolen);
 		if (!sock_flag(sk, SOCK_DEAD))
-			sk->sk_data_ready(sk);
+			tcp_data_ready(sk);
 		return;
 	}
 
@@ -4640,7 +4710,7 @@ queue_and_out:
 		tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
 
 out_of_window:
-		tcp_enter_quickack_mode(sk);
+		tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
 		inet_csk_schedule_ack(sk);
 drop:
 		tcp_drop(sk, skb);
@@ -4651,8 +4721,6 @@ drop:
 	if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp)))
 		goto out_of_window;
 
-	tcp_enter_quickack_mode(sk);
-
 	if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
 		/* Partial packet, seq < rcv_next < end_seq */
 		SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
@@ -5019,23 +5087,48 @@ static inline void tcp_data_snd_check(struct sock *sk)
 static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	unsigned long rtt, delay;
 
 	    /* More than one full frame received... */
 	if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
 	     /* ... and right edge of window advances far enough.
-	      * (tcp_recvmsg() will send ACK otherwise). Or...
+	      * (tcp_recvmsg() will send ACK otherwise).
+	      * If application uses SO_RCVLOWAT, we want send ack now if
+	      * we have not received enough bytes to satisfy the condition.
 	      */
-	     __tcp_select_window(sk) >= tp->rcv_wnd) ||
+	    (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat ||
+	     __tcp_select_window(sk) >= tp->rcv_wnd)) ||
 	    /* We ACK each frame or... */
-	    tcp_in_quickack_mode(sk) ||
-	    /* We have out of order data. */
-	    (ofo_possible && !RB_EMPTY_ROOT(&tp->out_of_order_queue))) {
-		/* Then ack it now */
+	    tcp_in_quickack_mode(sk)) {
+send_now:
 		tcp_send_ack(sk);
-	} else {
-		/* Else, send delayed ack. */
+		return;
+	}
+
+	if (!ofo_possible || RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
 		tcp_send_delayed_ack(sk);
+		return;
 	}
+
+	if (!tcp_is_sack(tp) ||
+	    tp->compressed_ack >= sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr)
+		goto send_now;
+	tp->compressed_ack++;
+
+	if (hrtimer_is_queued(&tp->compressed_ack_timer))
+		return;
+
+	/* compress ack timer : 5 % of rtt, but no more than tcp_comp_sack_delay_ns */
+
+	rtt = tp->rcv_rtt_est.rtt_us;
+	if (tp->srtt_us && tp->srtt_us < rtt)
+		rtt = tp->srtt_us;
+
+	delay = min_t(unsigned long, sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns,
+		      rtt * (NSEC_PER_USEC >> 3)/20);
+	sock_hold(sk);
+	hrtimer_start(&tp->compressed_ack_timer, ns_to_ktime(delay),
+		      HRTIMER_MODE_REL_PINNED_SOFT);
 }
 
 static inline void tcp_ack_snd_check(struct sock *sk)
@@ -5299,11 +5392,11 @@ discard:
  *	the rest is checked inline. Fast processing is turned on in
  *	tcp_data_queue when everything is OK.
  */
-void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
-			 const struct tcphdr *th)
+void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
 {
-	unsigned int len = skb->len;
+	const struct tcphdr *th = (const struct tcphdr *)skb->data;
 	struct tcp_sock *tp = tcp_sk(sk);
+	unsigned int len = skb->len;
 
 	/* TCP congestion window tracking */
 	trace_tcp_probe(sk, skb);
@@ -5428,7 +5521,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 no_ack:
 			if (eaten)
 				kfree_skb_partial(skb, fragstolen);
-			sk->sk_data_ready(sk);
+			tcp_data_ready(sk);
 			return;
 		}
 	}
@@ -5550,9 +5643,12 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
 			return true;
 		}
 	tp->syn_data_acked = tp->syn_data;
-	if (tp->syn_data_acked)
-		NET_INC_STATS(sock_net(sk),
-			      LINUX_MIB_TCPFASTOPENACTIVE);
+	if (tp->syn_data_acked) {
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
+		/* SYN-data is counted as two separate packets in tcp_ack() */
+		if (tp->delivered > 1)
+			--tp->delivered;
+	}
 
 	tcp_fastopen_add_skb(sk, synack);
 
@@ -5698,7 +5794,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 		 * to stand against the temptation 8)     --ANK
 		 */
 		inet_csk_schedule_ack(sk);
-		tcp_enter_quickack_mode(sk);
+		tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
 					  TCP_DELACK_MAX, TCP_RTO_MAX);
 
@@ -5884,6 +5980,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 	}
 	switch (sk->sk_state) {
 	case TCP_SYN_RECV:
+		tp->delivered++; /* SYN-ACK delivery isn't tracked in tcp_ack */
 		if (!tp->srtt_us)
 			tcp_synack_rtt_meas(sk, req);
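The clean_acked_data_enable()/clean_acked_data_disable() hooks added at the top of the file are exported for TLS device offload (CONFIG_TLS_DEVICE). The sketch below is not part of this commit; it only illustrates, under the assumption of an in-kernel ULP that already owns an established TCP socket sk, how such a user might register the callback. The names my_clean_acked(), my_ulp_attach() and my_ulp_detach() are hypothetical, and the include is assumed to pull in the hook prototypes.

	#include <net/tcp.h>	/* assumed to provide inet_csk() and the clean_acked_data_* prototypes */

	/* Hypothetical callback: invoked from tcp_ack() under the socket lock
	 * whenever snd_una advances, with the newly acknowledged sequence number.
	 */
	static void my_clean_acked(struct sock *sk, u32 acked_seq)
	{
		/* Release per-record transmit state covering bytes up to acked_seq. */
	}

	static void my_ulp_attach(struct sock *sk)
	{
		clean_acked_data_enable(inet_csk(sk), my_clean_acked);
	}

	static void my_ulp_detach(struct sock *sk)
	{
		clean_acked_data_disable(inet_csk(sk));
	}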
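The new tcp_data_ready() helper and the reworked __tcp_ack_snd_check() make the receive path honor SO_RCVLOWAT: the sk_data_ready() wakeup is suppressed while fewer than sk_rcvlowat bytes are queued, and the ACK is then sent directly instead of relying on the reader in tcp_recvmsg(). A userspace sketch of the receiver-side setting these paths react to follows; error handling is omitted and the 64 KB threshold is only an example.

	#include <sys/types.h>
	#include <sys/socket.h>
	#include <unistd.h>

	/* Ask the kernel not to wake the reader until roughly 64 KB are queued. */
	static void set_rcvlowat(int fd)
	{
		int lowat = 64 * 1024;

		setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT, &lowat, sizeof(lowat));
	}

	static ssize_t read_big_chunk(int fd, char *buf, size_t len)
	{
		/* With the low-water mark set, a blocking read() tends to return
		 * only once that many bytes are available (or on EOF/error),
		 * saving wakeups and, with this patch set, unnecessary ACKs.
		 */
		return read(fd, buf, len);
	}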