From fc16dcd8c2e1e9bc91ed765957e1f2bbf334253e Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Wed, 18 Jan 2012 17:47:58 +0000 Subject: tcp: fix undo after RTO for BIC This patch fixes BIC so that cwnd reductions made during RTOs can be undone (just as they already can be undone when using the default/Reno behavior). When undoing cwnd reductions, BIC-derived congestion control modules were restoring the cwnd from last_max_cwnd. There were two problems with using last_max_cwnd to restore a cwnd during undo: (a) last_max_cwnd was set to 0 on state transitions into TCP_CA_Loss (by calling the module's reset() functions), so cwnd reductions from RTOs could not be undone. (b) when fast_covergence is enabled (which it is by default) last_max_cwnd does not actually hold the value of snd_cwnd before the loss; instead, it holds a scaled-down version of snd_cwnd. This patch makes the following changes: (1) upon undo, revert snd_cwnd to ca->loss_cwnd, which is already, as the existing comment notes, the "congestion window at last loss" (2) stop forgetting ca->loss_cwnd on TCP_CA_Loss events (3) use ca->last_max_cwnd to check if we're in slow start Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_bic.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index 6187eb4d1dcf..f45e1c242440 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -63,7 +63,6 @@ static inline void bictcp_reset(struct bictcp *ca) { ca->cnt = 0; ca->last_max_cwnd = 0; - ca->loss_cwnd = 0; ca->last_cwnd = 0; ca->last_time = 0; ca->epoch_start = 0; @@ -72,7 +71,11 @@ static inline void bictcp_reset(struct bictcp *ca) static void bictcp_init(struct sock *sk) { - bictcp_reset(inet_csk_ca(sk)); + struct bictcp *ca = inet_csk_ca(sk); + + bictcp_reset(ca); + ca->loss_cwnd = 0; + if (initial_ssthresh) tcp_sk(sk)->snd_ssthresh = initial_ssthresh; } @@ -127,7 +130,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) } /* if in slow start or link utilization is very low */ - if (ca->loss_cwnd == 0) { + if (ca->last_max_cwnd == 0) { if (ca->cnt > 20) /* increase cwnd 5% per RTT */ ca->cnt = 20; } @@ -185,7 +188,7 @@ static u32 bictcp_undo_cwnd(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); const struct bictcp *ca = inet_csk_ca(sk); - return max(tp->snd_cwnd, ca->last_max_cwnd); + return max(tp->snd_cwnd, ca->loss_cwnd); } static void bictcp_state(struct sock *sk, u8 new_state) -- cgit v1.2.3 From 5a45f0086a2dcf50db7e6a0bf5be933880f85127 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Wed, 18 Jan 2012 17:47:59 +0000 Subject: tcp: fix undo after RTO for CUBIC This patch fixes CUBIC so that cwnd reductions made during RTOs can be undone (just as they already can be undone when using the default/Reno behavior). When undoing cwnd reductions, BIC-derived congestion control modules were restoring the cwnd from last_max_cwnd. There were two problems with using last_max_cwnd to restore a cwnd during undo: (a) last_max_cwnd was set to 0 on state transitions into TCP_CA_Loss (by calling the module's reset() functions), so cwnd reductions from RTOs could not be undone. (b) when fast_covergence is enabled (which it is by default) last_max_cwnd does not actually hold the value of snd_cwnd before the loss; instead, it holds a scaled-down version of snd_cwnd. This patch makes the following changes: (1) upon undo, revert snd_cwnd to ca->loss_cwnd, which is already, as the existing comment notes, the "congestion window at last loss" (2) stop forgetting ca->loss_cwnd on TCP_CA_Loss events (3) use ca->last_max_cwnd to check if we're in slow start Signed-off-by: Neal Cardwell Acked-by: Stephen Hemminger Acked-by: Sangtae Ha Signed-off-by: David S. Miller --- net/ipv4/tcp_cubic.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index f376b05cca81..a9077f441cb2 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -107,7 +107,6 @@ static inline void bictcp_reset(struct bictcp *ca) { ca->cnt = 0; ca->last_max_cwnd = 0; - ca->loss_cwnd = 0; ca->last_cwnd = 0; ca->last_time = 0; ca->bic_origin_point = 0; @@ -142,7 +141,10 @@ static inline void bictcp_hystart_reset(struct sock *sk) static void bictcp_init(struct sock *sk) { - bictcp_reset(inet_csk_ca(sk)); + struct bictcp *ca = inet_csk_ca(sk); + + bictcp_reset(ca); + ca->loss_cwnd = 0; if (hystart) bictcp_hystart_reset(sk); @@ -275,7 +277,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) * The initial growth of cubic function may be too conservative * when the available bandwidth is still unknown. */ - if (ca->loss_cwnd == 0 && ca->cnt > 20) + if (ca->last_max_cwnd == 0 && ca->cnt > 20) ca->cnt = 20; /* increase cwnd 5% per RTT */ /* TCP Friendly */ @@ -342,7 +344,7 @@ static u32 bictcp_undo_cwnd(struct sock *sk) { struct bictcp *ca = inet_csk_ca(sk); - return max(tcp_sk(sk)->snd_cwnd, ca->last_max_cwnd); + return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd); } static void bictcp_state(struct sock *sk, u8 new_state) -- cgit v1.2.3 From 974c12360dfe6ab01201fe9e708e7755c413f8b6 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 19 Jan 2012 14:42:21 +0000 Subject: tcp: detect loss above high_seq in recovery Correctly implement a loss detection heuristic: New sequences (above high_seq) sent during the fast recovery are deemed lost when higher sequences are SACKed. Current code does not catch these losses, because tcp_mark_head_lost() does not check packets beyond high_seq. The fix is straight-forward by checking packets until the highest sacked packet. In addition, all the FLAG_DATA_LOST logic are in-effective and redundant and can be removed. Update the loss heuristic comments. The algorithm above is documented as heuristic B, but it is redundant too because heuristic A already covers B. Note that this change only marks some forward-retransmitted packets LOST. It does NOT forbid TCP performing further CWR on new losses. A potential follow-up patch under preparation is to perform another CWR on "new" losses such as 1) sequence above high_seq is lost (by resetting high_seq to snd_nxt) 2) retransmission is lost. Signed-off-by: Yuchung Cheng Signed-off-by: David S. Miller --- include/linux/snmp.h | 1 - net/ipv4/proc.c | 1 - net/ipv4/tcp_input.c | 41 +++++++++++++++-------------------------- 3 files changed, 15 insertions(+), 28 deletions(-) (limited to 'net/ipv4') diff --git a/include/linux/snmp.h b/include/linux/snmp.h index e16557a357e5..c1241c428179 100644 --- a/include/linux/snmp.h +++ b/include/linux/snmp.h @@ -192,7 +192,6 @@ enum LINUX_MIB_TCPPARTIALUNDO, /* TCPPartialUndo */ LINUX_MIB_TCPDSACKUNDO, /* TCPDSACKUndo */ LINUX_MIB_TCPLOSSUNDO, /* TCPLossUndo */ - LINUX_MIB_TCPLOSS, /* TCPLoss */ LINUX_MIB_TCPLOSTRETRANSMIT, /* TCPLostRetransmit */ LINUX_MIB_TCPRENOFAILURES, /* TCPRenoFailures */ LINUX_MIB_TCPSACKFAILURES, /* TCPSackFailures */ diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 3569d8ecaeac..6afc807ee2ad 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -216,7 +216,6 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPPartialUndo", LINUX_MIB_TCPPARTIALUNDO), SNMP_MIB_ITEM("TCPDSACKUndo", LINUX_MIB_TCPDSACKUNDO), SNMP_MIB_ITEM("TCPLossUndo", LINUX_MIB_TCPLOSSUNDO), - SNMP_MIB_ITEM("TCPLoss", LINUX_MIB_TCPLOSS), SNMP_MIB_ITEM("TCPLostRetransmit", LINUX_MIB_TCPLOSTRETRANSMIT), SNMP_MIB_ITEM("TCPRenoFailures", LINUX_MIB_TCPRENOFAILURES), SNMP_MIB_ITEM("TCPSackFailures", LINUX_MIB_TCPSACKFAILURES), diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2877c3e09587..976034f82320 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -105,7 +105,6 @@ int sysctl_tcp_abc __read_mostly; #define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */ #define FLAG_DATA_SACKED 0x20 /* New SACK. */ #define FLAG_ECE 0x40 /* ECE in this ACK */ -#define FLAG_DATA_LOST 0x80 /* SACK detected data lossage. */ #define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ #define FLAG_ONLY_ORIG_SACKED 0x200 /* SACKs only non-rexmit sent before RTO */ #define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ @@ -1040,13 +1039,11 @@ static void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, * These 6 states form finite state machine, controlled by the following events: * 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue()) * 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue()) - * 3. Loss detection event of one of three flavors: + * 3. Loss detection event of two flavors: * A. Scoreboard estimator decided the packet is lost. * A'. Reno "three dupacks" marks head of queue lost. - * A''. Its FACK modfication, head until snd.fack is lost. - * B. SACK arrives sacking data transmitted after never retransmitted - * hole was sent out. - * C. SACK arrives sacking SND.NXT at the moment, when the + * A''. Its FACK modification, head until snd.fack is lost. + * B. SACK arrives sacking SND.NXT at the moment, when the * segment was retransmitted. * 4. D-SACK added new rule: D-SACK changes any tag to S. * @@ -1153,7 +1150,7 @@ static int tcp_is_sackblock_valid(struct tcp_sock *tp, int is_dsack, } /* Check for lost retransmit. This superb idea is borrowed from "ratehalving". - * Event "C". Later note: FACK people cheated me again 8), we have to account + * Event "B". Later note: FACK people cheated me again 8), we have to account * for reordering! Ugly, but should help. * * Search retransmitted skbs from write_queue that were sent when snd_nxt was @@ -1844,10 +1841,6 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, if (found_dup_sack && ((i + 1) == first_sack_index)) next_dup = &sp[i + 1]; - /* Event "B" in the comment above. */ - if (after(end_seq, tp->high_seq)) - state.flag |= FLAG_DATA_LOST; - /* Skip too early cached blocks */ while (tcp_sack_cache_ok(tp, cache) && !before(start_seq, cache->end_seq)) @@ -2515,8 +2508,11 @@ static void tcp_timeout_skbs(struct sock *sk) tcp_verify_left_out(tp); } -/* Mark head of queue up as lost. With RFC3517 SACK, the packets is - * is against sacked "cnt", otherwise it's against facked "cnt" +/* Detect loss in event "A" above by marking head of queue up as lost. + * For FACK or non-SACK(Reno) senders, the first "packets" number of segments + * are considered lost. For RFC3517 SACK, a segment is considered lost if it + * has at least tp->reordering SACKed seqments above it; "packets" refers to + * the maximum SACKed segments to pass before reaching this limit. */ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) { @@ -2525,6 +2521,8 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) int cnt, oldcnt; int err; unsigned int mss; + /* Use SACK to deduce losses of new sequences sent during recovery */ + const u32 loss_high = tcp_is_sack(tp) ? tp->snd_nxt : tp->high_seq; WARN_ON(packets > tp->packets_out); if (tp->lost_skb_hint) { @@ -2546,7 +2544,7 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) tp->lost_skb_hint = skb; tp->lost_cnt_hint = cnt; - if (after(TCP_SKB_CB(skb)->end_seq, tp->high_seq)) + if (after(TCP_SKB_CB(skb)->end_seq, loss_high)) break; oldcnt = cnt; @@ -3033,19 +3031,10 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, if (tcp_check_sack_reneging(sk, flag)) return; - /* C. Process data loss notification, provided it is valid. */ - if (tcp_is_fack(tp) && (flag & FLAG_DATA_LOST) && - before(tp->snd_una, tp->high_seq) && - icsk->icsk_ca_state != TCP_CA_Open && - tp->fackets_out > tp->reordering) { - tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering, 0); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSS); - } - - /* D. Check consistency of the current state. */ + /* C. Check consistency of the current state. */ tcp_verify_left_out(tp); - /* E. Check state exit conditions. State can be terminated + /* D. Check state exit conditions. State can be terminated * when high_seq is ACKed. */ if (icsk->icsk_ca_state == TCP_CA_Open) { WARN_ON(tp->retrans_out != 0); @@ -3077,7 +3066,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, } } - /* F. Process state. */ + /* E. Process state. */ switch (icsk->icsk_ca_state) { case TCP_CA_Recovery: if (!(flag & FLAG_SND_UNA_ADVANCED)) { -- cgit v1.2.3 From 8a622e71f58ec9f092fc99eacae0e6cf14f6e742 Mon Sep 17 00:00:00 2001 From: shawnlu Date: Fri, 20 Jan 2012 12:22:04 +0000 Subject: tcp: md5: using remote adress for md5 lookup in rst packet md5 key is added in socket through remote address. remote address should be used in finding md5 key when sending out reset packet. Signed-off-by: shawnlu Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 2 +- net/ipv6/tcp_ipv6.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 1eb4ad57670e..337ba4cca052 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -631,7 +631,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) arg.iov[0].iov_len = sizeof(rep.th); #ifdef CONFIG_TCP_MD5SIG - key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL; + key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->saddr) : NULL; if (key) { rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 906c7ca43542..3edd05ae4388 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1083,7 +1083,7 @@ static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb) #ifdef CONFIG_TCP_MD5SIG if (sk) - key = tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr); + key = tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr); #endif if (th->ack) -- cgit v1.2.3