tcp: introduce tcp_try_coalesce

commit c8628155ece3 (tcp: reduce out_of_order memory use) took care of coalescing tcp segments provided by legacy devices (linear skbs) We extend this idea to fragged skbs, as their truesize can be heavy. ixgbe for example uses 256+1024+PAGE_SIZE/2 = 3328 bytes per segment. Use this coalescing strategy for receive queue too. This contributes to reduce number of tcp collapses, at minimal cost, and reduces memory overhead and packets drops. Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: Neal Cardwell <ncardwell@google.com> Cc: Tom Herbert <therbert@google.com> Cc: Maciej Żenczykowski <maze@google.com> Cc: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi> Acked-by: Neal Cardwell <ncardwell@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
author: Eric Dumazet <edumazet@google.com> 2012-04-23 07:11:42 +0000
committer: David S. Miller <davem@davemloft.net> 2012-04-23 22:42:49 -0400
commit: 1402d366019fedaa2b024f2bac06b7cc9a8782e1 (patch)
tree: f74c00a2308ad13d6970cdc7e5cc29d2f0f809fa /net/ipv4/tcp_input.c
parent: e29ecd51de1683e6aeb88d76251f194b0311f749 (diff)
download: lwn-1402d366019fedaa2b024f2bac06b7cc9a8782e1.tar.gz
lwn-1402d366019fedaa2b024f2bac06b7cc9a8782e1.zip
1 files changed, 62 insertions, 17 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 37e1c5cd2c01..bd7aef59c385 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4449,6 +4449,58 @@ static inline int tcp_try_rmem_schedule(struct sock *sk, unsigned int size)
 	return 0;
 }
 
+/**
+ * tcp_try_coalesce - try to merge skb to prior one
+ * @sk: socket
+ * @to: prior buffer
+ * @from: buffer to add in queue
+ *
+ * Before queueing skb @from after @to, try to merge them
+ * to reduce overall memory use and queue lengths, if cost is small.
+ * Packets in ofo or receive queues can stay a long time.
+ * Better try to coalesce them right now to avoid future collapses.
+ * Returns > 0 value if caller should free @from instead of queueing it
+ */
+static int tcp_try_coalesce(struct sock *sk,
+			    struct sk_buff *to,
+			    struct sk_buff *from)
+{
+	int len = from->len;
+
+	if (tcp_hdr(from)->fin)
+		return 0;
+	if (len <= skb_tailroom(to)) {
+		BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len));
+merge:
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
+		TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
+		TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
+		return 1;
+	}
+	if (skb_headlen(from) == 0 &&
+	    !skb_has_frag_list(to) &&
+	    !skb_has_frag_list(from) &&
+	    (skb_shinfo(to)->nr_frags +
+	     skb_shinfo(from)->nr_frags <= MAX_SKB_FRAGS)) {
+		int delta = from->truesize - ksize(from->head) -
+			    SKB_DATA_ALIGN(sizeof(struct sk_buff));
+
+		WARN_ON_ONCE(delta < len);
+		memcpy(skb_shinfo(to)->frags + skb_shinfo(to)->nr_frags,
+		       skb_shinfo(from)->frags,
+		       skb_shinfo(from)->nr_frags * sizeof(skb_frag_t));
+		skb_shinfo(to)->nr_frags += skb_shinfo(from)->nr_frags;
+		skb_shinfo(from)->nr_frags = 0;
+		to->truesize += delta;
+		atomic_add(delta, &sk->sk_rmem_alloc);
+		sk_mem_charge(sk, delta);
+		to->len += len;
+		to->data_len += len;
+		goto merge;
+	}
+	return 0;
+}
+
 static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -4487,23 +4539,11 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 	end_seq = TCP_SKB_CB(skb)->end_seq;
 
 	if (seq == TCP_SKB_CB(skb1)->end_seq) {
-		/* Packets in ofo can stay in queue a long time.
-		 * Better try to coalesce them right now
-		 * to avoid future tcp_collapse_ofo_queue(),
-		 * probably the most expensive function in tcp stack.
-		 */
-		if (skb->len <= skb_tailroom(skb1) && !tcp_hdr(skb)->fin) {
-			NET_INC_STATS_BH(sock_net(sk),
-					 LINUX_MIB_TCPRCVCOALESCE);
-			BUG_ON(skb_copy_bits(skb, 0,
-					     skb_put(skb1, skb->len),
-					     skb->len));
-			TCP_SKB_CB(skb1)->end_seq = end_seq;
-			TCP_SKB_CB(skb1)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
+		if (tcp_try_coalesce(sk, skb1, skb) <= 0) {
+			__skb_queue_after(&tp->out_of_order_queue, skb1, skb);
+		} else {
 			__kfree_skb(skb);
 			skb = NULL;
-		} else {
-			__skb_queue_after(&tp->out_of_order_queue, skb1, skb);
 		}
 
 		if (!tp->rx_opt.num_sacks ||
@@ -4624,13 +4664,18 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 		}
 
 		if (eaten <= 0) {
+			struct sk_buff *tail;
 queue_and_out:
 			if (eaten < 0 &&
 			    tcp_try_rmem_schedule(sk, skb->truesize))
 				goto drop;
 
-			skb_set_owner_r(skb, sk);
-			__skb_queue_tail(&sk->sk_receive_queue, skb);
+			tail = skb_peek_tail(&sk->sk_receive_queue);
+			eaten = tail ? tcp_try_coalesce(sk, tail, skb) : -1;
+			if (eaten <= 0) {
+				skb_set_owner_r(skb, sk);
+				__skb_queue_tail(&sk->sk_receive_queue, skb);
+			}
 		}
 		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
 		if (skb->len)
author	Eric Dumazet <edumazet@google.com>	2012-04-23 07:11:42 +0000
committer	David S. Miller <davem@davemloft.net>	2012-04-23 22:42:49 -0400
commit	1402d366019fedaa2b024f2bac06b7cc9a8782e1 (patch)
tree	f74c00a2308ad13d6970cdc7e5cc29d2f0f809fa /net/ipv4/tcp_input.c
parent	e29ecd51de1683e6aeb88d76251f194b0311f749 (diff)
download	lwn-1402d366019fedaa2b024f2bac06b7cc9a8782e1.tar.gz lwn-1402d366019fedaa2b024f2bac06b7cc9a8782e1.zip