summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2019-03-23 21:57:38 -0400
committerDavid S. Miller <davem@davemloft.net>2019-03-23 21:57:38 -0400
commitbdaba8959e9248524f3d148d1aa47f13944ba8e8 (patch)
tree0e6f2cfd66715d2234acda3ae48d1543facc5303
parent7c1508e5f64a784988be4659dd4d6b791c008bbf (diff)
parent8b27dae5a2e89a61c46c6dbc76c040c0e6d0ed4c (diff)
downloadlwn-bdaba8959e9248524f3d148d1aa47f13944ba8e8.tar.gz
lwn-bdaba8959e9248524f3d148d1aa47f13944ba8e8.zip
Merge branch 'tcp-rx-tx-cache'
Eric Dumazet says: ==================== tcp: add rx/tx cache to reduce lock contention On hosts with many cpus we can observe a very serious contention on spinlocks used in mm slab layer. The following can happen quite often : 1) TX path sendmsg() allocates one (fclone) skb on CPU A, sends a clone. ACK is received on CPU B, and consumes the skb that was in the retransmit queue. 2) RX path network driver allocates skb on CPU C recvmsg() happens on CPU D, freeing the skb after it has been delivered to user space. In both cases, we are hitting the asymetric alloc/free pattern for which slab has to drain alien caches. At 8 Mpps per second, this represents 16 Mpps alloc/free per second and has a huge penalty. In an interesting experiment, I tried to use a single kmem_cache for all the skbs (in skb_init() : skbuff_fclone_cache = skbuff_head_cache = kmem_cache_create("skbuff_fclone_cache", sizeof(struct sk_buff_fclones),); qnd most of the contention disappeared, since cpus could better use their local slab per-cpu cache. But we can do actually better, in the following patches. TX : at ACK time, no longer free the skb but put it back in a tcp socket cache, so that next sendmsg() can reuse it immediately. RX : at recvmsg() time, do not free the skb but put it in a tcp socket cache so that it can be freed by the cpu feeding the incoming packets in BH. This increased the performance of small RPC benchmark by about 10 % on a host with 112 hyperthreads. v2 : - Solved a race condition : sk_stream_alloc_skb() to make sure the prior clone has been freed. - Really test rps_needed in sk_eat_skb() as claimed. - Fixed rps_needed use in drivers/net/tun.c v3: Added a #ifdef CONFIG_RPS, to avoid compile error (kbuild robot) ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--drivers/net/tun.c2
-rw-r--r--include/linux/netdevice.h4
-rw-r--r--include/net/sock.h17
-rw-r--r--net/core/dev.c10
-rw-r--r--net/core/net-sysfs.c4
-rw-r--r--net/core/sysctl_net_core.c8
-rw-r--r--net/ipv4/af_inet.c4
-rw-r--r--net/ipv4/tcp.c54
-rw-r--r--net/ipv4/tcp_ipv4.c11
-rw-r--r--net/ipv6/tcp_ipv6.c12
10 files changed, 79 insertions, 47 deletions
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 27798aacb671..24d0220b9ba0 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1042,7 +1042,7 @@ static int tun_net_close(struct net_device *dev)
static void tun_automq_xmit(struct tun_struct *tun, struct sk_buff *skb)
{
#ifdef CONFIG_RPS
- if (tun->numqueues == 1 && static_key_false(&rps_needed)) {
+ if (tun->numqueues == 1 && static_branch_unlikely(&rps_needed)) {
/* Select queue was not called for the skbuff, so we extract the
* RPS hash and save it into the flow_table here.
*/
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 823762291ebf..166fdc0a78b4 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -194,8 +194,8 @@ struct net_device_stats {
#ifdef CONFIG_RPS
#include <linux/static_key.h>
-extern struct static_key rps_needed;
-extern struct static_key rfs_needed;
+extern struct static_key_false rps_needed;
+extern struct static_key_false rfs_needed;
#endif
struct neighbour;
diff --git a/include/net/sock.h b/include/net/sock.h
index 8de5ee258b93..577d91fb5626 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -368,6 +368,7 @@ struct sock {
atomic_t sk_drops;
int sk_rcvlowat;
struct sk_buff_head sk_error_queue;
+ struct sk_buff *sk_rx_skb_cache;
struct sk_buff_head sk_receive_queue;
/*
* The backlog queue is special, it is always used with
@@ -414,6 +415,7 @@ struct sock {
struct sk_buff *sk_send_head;
struct rb_root tcp_rtx_queue;
};
+ struct sk_buff *sk_tx_skb_cache;
struct sk_buff_head sk_write_queue;
__s32 sk_peek_off;
int sk_write_pending;
@@ -966,7 +968,7 @@ static inline void sock_rps_record_flow_hash(__u32 hash)
static inline void sock_rps_record_flow(const struct sock *sk)
{
#ifdef CONFIG_RPS
- if (static_key_false(&rfs_needed)) {
+ if (static_branch_unlikely(&rfs_needed)) {
/* Reading sk->sk_rxhash might incur an expensive cache line
* miss.
*
@@ -1463,6 +1465,10 @@ static inline void sk_mem_uncharge(struct sock *sk, int size)
static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
{
+ if (!sk->sk_tx_skb_cache) {
+ sk->sk_tx_skb_cache = skb;
+ return;
+ }
sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
sk->sk_wmem_queued -= skb->truesize;
sk_mem_uncharge(sk, skb->truesize);
@@ -2433,6 +2439,15 @@ static inline void skb_setup_tx_timestamp(struct sk_buff *skb, __u16 tsflags)
static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb)
{
__skb_unlink(skb, &sk->sk_receive_queue);
+ if (
+#ifdef CONFIG_RPS
+ !static_branch_unlikely(&rps_needed) &&
+#endif
+ !sk->sk_rx_skb_cache) {
+ sk->sk_rx_skb_cache = skb;
+ skb_orphan(skb);
+ return;
+ }
__kfree_skb(skb);
}
diff --git a/net/core/dev.c b/net/core/dev.c
index 676c9418f8e4..9ca2d3abfd1a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3982,9 +3982,9 @@ EXPORT_SYMBOL(rps_sock_flow_table);
u32 rps_cpu_mask __read_mostly;
EXPORT_SYMBOL(rps_cpu_mask);
-struct static_key rps_needed __read_mostly;
+struct static_key_false rps_needed __read_mostly;
EXPORT_SYMBOL(rps_needed);
-struct static_key rfs_needed __read_mostly;
+struct static_key_false rfs_needed __read_mostly;
EXPORT_SYMBOL(rfs_needed);
static struct rps_dev_flow *
@@ -4510,7 +4510,7 @@ static int netif_rx_internal(struct sk_buff *skb)
}
#ifdef CONFIG_RPS
- if (static_key_false(&rps_needed)) {
+ if (static_branch_unlikely(&rps_needed)) {
struct rps_dev_flow voidflow, *rflow = &voidflow;
int cpu;
@@ -5179,7 +5179,7 @@ static int netif_receive_skb_internal(struct sk_buff *skb)
rcu_read_lock();
#ifdef CONFIG_RPS
- if (static_key_false(&rps_needed)) {
+ if (static_branch_unlikely(&rps_needed)) {
struct rps_dev_flow voidflow, *rflow = &voidflow;
int cpu = get_rps_cpu(skb->dev, skb, &rflow);
@@ -5227,7 +5227,7 @@ static void netif_receive_skb_list_internal(struct list_head *head)
rcu_read_lock();
#ifdef CONFIG_RPS
- if (static_key_false(&rps_needed)) {
+ if (static_branch_unlikely(&rps_needed)) {
list_for_each_entry_safe(skb, next, head, list) {
struct rps_dev_flow voidflow, *rflow = &voidflow;
int cpu = get_rps_cpu(skb->dev, skb, &rflow);
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 4ff661f6f989..851cabb90bce 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -754,9 +754,9 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue,
rcu_assign_pointer(queue->rps_map, map);
if (map)
- static_key_slow_inc(&rps_needed);
+ static_branch_inc(&rps_needed);
if (old_map)
- static_key_slow_dec(&rps_needed);
+ static_branch_dec(&rps_needed);
mutex_unlock(&rps_map_mutex);
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 84bf2861f45f..1a2685694abd 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -95,12 +95,12 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
if (sock_table != orig_sock_table) {
rcu_assign_pointer(rps_sock_flow_table, sock_table);
if (sock_table) {
- static_key_slow_inc(&rps_needed);
- static_key_slow_inc(&rfs_needed);
+ static_branch_inc(&rps_needed);
+ static_branch_inc(&rfs_needed);
}
if (orig_sock_table) {
- static_key_slow_dec(&rps_needed);
- static_key_slow_dec(&rfs_needed);
+ static_branch_dec(&rps_needed);
+ static_branch_dec(&rfs_needed);
synchronize_rcu();
vfree(orig_sock_table);
}
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index eab3ebde981e..7f3a984ad618 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -136,6 +136,10 @@ void inet_sock_destruct(struct sock *sk)
struct inet_sock *inet = inet_sk(sk);
__skb_queue_purge(&sk->sk_receive_queue);
+ if (sk->sk_rx_skb_cache) {
+ __kfree_skb(sk->sk_rx_skb_cache);
+ sk->sk_rx_skb_cache = NULL;
+ }
__skb_queue_purge(&sk->sk_error_queue);
sk_mem_reclaim(sk);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 6baa6dc1b13b..29b94edf05f9 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -865,6 +865,21 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
{
struct sk_buff *skb;
+ skb = sk->sk_tx_skb_cache;
+ if (skb && !size) {
+ const struct sk_buff_fclones *fclones;
+
+ fclones = container_of(skb, struct sk_buff_fclones, skb1);
+ if (refcount_read(&fclones->fclone_ref) == 1) {
+ sk->sk_wmem_queued -= skb->truesize;
+ sk_mem_uncharge(sk, skb->truesize);
+ skb->truesize -= skb->data_len;
+ sk->sk_tx_skb_cache = NULL;
+ pskb_trim(skb, 0);
+ INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
+ return skb;
+ }
+ }
/* The TCP header must be at least 32-bit aligned. */
size = ALIGN(size, 4);
@@ -1098,30 +1113,6 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset,
}
EXPORT_SYMBOL(tcp_sendpage);
-/* Do not bother using a page frag for very small frames.
- * But use this heuristic only for the first skb in write queue.
- *
- * Having no payload in skb->head allows better SACK shifting
- * in tcp_shift_skb_data(), reducing sack/rack overhead, because
- * write queue has less skbs.
- * Each skb can hold up to MAX_SKB_FRAGS * 32Kbytes, or ~0.5 MB.
- * This also speeds up tso_fragment(), since it wont fallback
- * to tcp_fragment().
- */
-static int linear_payload_sz(bool first_skb)
-{
- if (first_skb)
- return SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER);
- return 0;
-}
-
-static int select_size(bool first_skb, bool zc)
-{
- if (zc)
- return 0;
- return linear_payload_sz(first_skb);
-}
-
void tcp_free_fastopen_req(struct tcp_sock *tp)
{
if (tp->fastopen_req) {
@@ -1272,7 +1263,6 @@ restart:
if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
bool first_skb;
- int linear;
new_segment:
if (!sk_stream_memory_free(sk))
@@ -1283,8 +1273,7 @@ new_segment:
goto restart;
}
first_skb = tcp_rtx_and_write_queues_empty(sk);
- linear = select_size(first_skb, zc);
- skb = sk_stream_alloc_skb(sk, linear, sk->sk_allocation,
+ skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
first_skb);
if (!skb)
goto wait_for_memory;
@@ -2552,6 +2541,13 @@ void tcp_write_queue_purge(struct sock *sk)
sk_wmem_free_skb(sk, skb);
}
tcp_rtx_queue_purge(sk);
+ skb = sk->sk_tx_skb_cache;
+ if (skb) {
+ sk->sk_wmem_queued -= skb->truesize;
+ sk_mem_uncharge(sk, skb->truesize);
+ __kfree_skb(skb);
+ sk->sk_tx_skb_cache = NULL;
+ }
INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
sk_mem_reclaim(sk);
tcp_clear_all_retrans_hints(tcp_sk(sk));
@@ -2587,6 +2583,10 @@ int tcp_disconnect(struct sock *sk, int flags)
tcp_clear_xmit_timers(sk);
__skb_queue_purge(&sk->sk_receive_queue);
+ if (sk->sk_rx_skb_cache) {
+ __kfree_skb(sk->sk_rx_skb_cache);
+ sk->sk_rx_skb_cache = NULL;
+ }
tp->copied_seq = tp->rcv_nxt;
tp->urg_data = 0;
tcp_write_queue_purge(sk);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 277d71239d75..3979939804b7 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1774,6 +1774,7 @@ static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
int tcp_v4_rcv(struct sk_buff *skb)
{
struct net *net = dev_net(skb->dev);
+ struct sk_buff *skb_to_free;
int sdif = inet_sdif(skb);
const struct iphdr *iph;
const struct tcphdr *th;
@@ -1905,11 +1906,17 @@ process:
tcp_segs_in(tcp_sk(sk), skb);
ret = 0;
if (!sock_owned_by_user(sk)) {
+ skb_to_free = sk->sk_rx_skb_cache;
+ sk->sk_rx_skb_cache = NULL;
ret = tcp_v4_do_rcv(sk, skb);
- } else if (tcp_add_backlog(sk, skb)) {
- goto discard_and_relse;
+ } else {
+ if (tcp_add_backlog(sk, skb))
+ goto discard_and_relse;
+ skb_to_free = NULL;
}
bh_unlock_sock(sk);
+ if (skb_to_free)
+ __kfree_skb(skb_to_free);
put_and_return:
if (refcounted)
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 983ad7a75102..77d723bbe050 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1436,6 +1436,7 @@ static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr,
static int tcp_v6_rcv(struct sk_buff *skb)
{
+ struct sk_buff *skb_to_free;
int sdif = inet6_sdif(skb);
const struct tcphdr *th;
const struct ipv6hdr *hdr;
@@ -1562,12 +1563,17 @@ process:
tcp_segs_in(tcp_sk(sk), skb);
ret = 0;
if (!sock_owned_by_user(sk)) {
+ skb_to_free = sk->sk_rx_skb_cache;
+ sk->sk_rx_skb_cache = NULL;
ret = tcp_v6_do_rcv(sk, skb);
- } else if (tcp_add_backlog(sk, skb)) {
- goto discard_and_relse;
+ } else {
+ if (tcp_add_backlog(sk, skb))
+ goto discard_and_relse;
+ skb_to_free = NULL;
}
bh_unlock_sock(sk);
-
+ if (skb_to_free)
+ __kfree_skb(skb_to_free);
put_and_return:
if (refcounted)
sock_put(sk);