summaryrefslogtreecommitdiff
path: root/include/net
diff options
context:
space:
mode:
authorEric Dumazet <edumazet@google.com>2012-09-23 23:04:42 +0000
committerDavid S. Miller <davem@davemloft.net>2012-09-24 16:31:37 -0400
commit5640f7685831e088fe6c2e1f863a6805962f8e81 (patch)
treefb7660173338a45c27d610eb59ba20cf5c2b91b8 /include/net
parentb98b8babd6e3370fadb7c6eaacb00eb2f6344a6c (diff)
downloadlwn-5640f7685831e088fe6c2e1f863a6805962f8e81.tar.gz
lwn-5640f7685831e088fe6c2e1f863a6805962f8e81.zip
net: use a per task frag allocator
We currently use a per socket order-0 page cache for tcp_sendmsg() operations. This page is used to build fragments for skbs. Its done to increase probability of coalescing small write() into single segments in skbs still in write queue (not yet sent) But it wastes a lot of memory for applications handling many mostly idle sockets, since each socket holds one page in sk->sk_sndmsg_page Its also quite inefficient to build TSO 64KB packets, because we need about 16 pages per skb on arches where PAGE_SIZE = 4096, so we hit page allocator more than wanted. This patch adds a per task frag allocator and uses bigger pages, if available. An automatic fallback is done in case of memory pressure. (up to 32768 bytes per frag, thats order-3 pages on x86) This increases TCP stream performance by 20% on loopback device, but also benefits on other network devices, since 8x less frags are mapped on transmit and unmapped on tx completion. Alexander Duyck mentioned a probable performance win on systems with IOMMU enabled. Its possible some SG enabled hardware cant cope with bigger fragments, but their ndo_start_xmit() should already handle this, splitting a fragment in sub fragments, since some arches have PAGE_SIZE=65536 Successfully tested on various ethernet devices. (ixgbe, igb, bnx2x, tg3, mellanox mlx4) Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: Ben Hutchings <bhutchings@solarflare.com> Cc: Vijay Subramanian <subramanian.vijay@gmail.com> Cc: Alexander Duyck <alexander.h.duyck@intel.com> Tested-by: Vijay Subramanian <subramanian.vijay@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'include/net')
-rw-r--r--include/net/inet_sock.h4
-rw-r--r--include/net/sock.h27
2 files changed, 16 insertions, 15 deletions
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 613cfa401672..256c1ed2d69a 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -101,10 +101,8 @@ struct inet_cork {
__be32 addr;
struct ip_options *opt;
unsigned int fragsize;
- struct dst_entry *dst;
int length; /* Total length of all frames */
- struct page *page;
- u32 off;
+ struct dst_entry *dst;
u8 tx_flags;
};
diff --git a/include/net/sock.h b/include/net/sock.h
index 84bdaeca1314..f036493b9a61 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -247,8 +247,7 @@ struct cg_proto;
* @sk_stamp: time stamp of last packet received
* @sk_socket: Identd and reporting IO signals
* @sk_user_data: RPC layer private data
- * @sk_sndmsg_page: cached page for sendmsg
- * @sk_sndmsg_off: cached offset for sendmsg
+ * @sk_frag: cached page frag
* @sk_peek_off: current peek_offset value
* @sk_send_head: front of stuff to transmit
* @sk_security: used by security modules
@@ -362,9 +361,8 @@ struct sock {
ktime_t sk_stamp;
struct socket *sk_socket;
void *sk_user_data;
- struct page *sk_sndmsg_page;
+ struct page_frag sk_frag;
struct sk_buff *sk_send_head;
- __u32 sk_sndmsg_off;
__s32 sk_peek_off;
int sk_write_pending;
#ifdef CONFIG_SECURITY
@@ -2034,18 +2032,23 @@ static inline void sk_stream_moderate_sndbuf(struct sock *sk)
struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp);
-static inline struct page *sk_stream_alloc_page(struct sock *sk)
+/**
+ * sk_page_frag - return an appropriate page_frag
+ * @sk: socket
+ *
+ * If socket allocation mode allows current thread to sleep, it means its
+ * safe to use the per task page_frag instead of the per socket one.
+ */
+static inline struct page_frag *sk_page_frag(struct sock *sk)
{
- struct page *page = NULL;
+ if (sk->sk_allocation & __GFP_WAIT)
+ return &current->task_frag;
- page = alloc_pages(sk->sk_allocation, 0);
- if (!page) {
- sk_enter_memory_pressure(sk);
- sk_stream_moderate_sndbuf(sk);
- }
- return page;
+ return &sk->sk_frag;
}
+extern bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag);
+
/*
* Default write policy as shown to user space via poll/select/SIGIO
*/