From 1c5f2ced136a7196c41ca2c16b8c2646e9b042ce Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 13 Nov 2020 07:08:09 -0800 Subject: tcp: avoid indirect call to tcp_stream_memory_free() Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/net/sock.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index a5c6ae78df77..1d29aeae74fd 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -60,7 +60,7 @@ #include #include #include - +#include #include #include #include @@ -1264,13 +1264,17 @@ static inline void sk_refcnt_debug_release(const struct sock *sk) #define sk_refcnt_debug_release(sk) do { } while (0) #endif /* SOCK_REFCNT_DEBUG */ +INDIRECT_CALLABLE_DECLARE(bool tcp_stream_memory_free(const struct sock *sk, int wake)); + static inline bool __sk_stream_memory_free(const struct sock *sk, int wake) { if (READ_ONCE(sk->sk_wmem_queued) >= READ_ONCE(sk->sk_sndbuf)) return false; return sk->sk_prot->stream_memory_free ? - sk->sk_prot->stream_memory_free(sk, wake) : true; + INDIRECT_CALL_1(sk->sk_prot->stream_memory_free, + tcp_stream_memory_free, + sk, wake) : true; } static inline bool sk_stream_memory_free(const struct sock *sk) -- cgit v1.2.3 From 12f4bd86225e348ef3a3c8d2bb42dc23ee0f0a4c Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 17 Nov 2020 19:43:49 +0100 Subject: net: add annotation for sock_{lock,unlock}_fast The static checker is fooled by the non-static locking scheme implemented by the mentioned helpers. Let's make its life easier adding some unconditional annotation so that the helpers are now interpreted as a plain spinlock from sparse. v1 -> v2: - add __releases() annotation to unlock_sock_fast() Signed-off-by: Paolo Abeni Link: https://lore.kernel.org/r/6ed7ae627d8271fb7f20e0a9c6750fbba1ac2635.1605634911.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 10 +++++++--- net/core/sock.c | 3 ++- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index 1d29aeae74fd..093b51719c69 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1595,7 +1595,8 @@ void release_sock(struct sock *sk); SINGLE_DEPTH_NESTING) #define bh_unlock_sock(__sk) spin_unlock(&((__sk)->sk_lock.slock)) -bool lock_sock_fast(struct sock *sk); +bool lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock); + /** * unlock_sock_fast - complement of lock_sock_fast * @sk: socket @@ -1605,11 +1606,14 @@ bool lock_sock_fast(struct sock *sk); * If slow mode is on, we call regular release_sock() */ static inline void unlock_sock_fast(struct sock *sk, bool slow) + __releases(&sk->sk_lock.slock) { - if (slow) + if (slow) { release_sock(sk); - else + __release(&sk->sk_lock.slock); + } else { spin_unlock_bh(&sk->sk_lock.slock); + } } /* Used by processes to "lock" a socket state, so that diff --git a/net/core/sock.c b/net/core/sock.c index 727ea1cc633c..9badbe7bb4e4 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3078,7 +3078,7 @@ EXPORT_SYMBOL(release_sock); * * sk_lock.slock unlocked, owned = 1, BH enabled */ -bool lock_sock_fast(struct sock *sk) +bool lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock) { might_sleep(); spin_lock_bh(&sk->sk_lock.slock); @@ -3096,6 +3096,7 @@ bool lock_sock_fast(struct sock *sk) * The sk_lock has mutex_lock() semantics here: */ mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_); + __acquire(&sk->sk_lock.slock); local_bh_enable(); return true; } -- cgit v1.2.3 From fc9840fbef0c3740e0fc3640eb8628d70fcb2215 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 18 Nov 2020 11:44:38 -0800 Subject: net: stream: fix TCP references when INET is not enabled Fix build of net/core/stream.o when CONFIG_INET is not enabled. Fixes these build errors (sample): ld: net/core/stream.o: in function `sk_stream_write_space': (.text+0x27e): undefined reference to `tcp_stream_memory_free' ld: (.text+0x29c): undefined reference to `tcp_stream_memory_free' ld: (.text+0x2ab): undefined reference to `tcp_stream_memory_free' ld: net/core/stream.o: in function `sk_stream_wait_memory': (.text+0x5a1): undefined reference to `tcp_stream_memory_free' ld: (.text+0x5bf): undefined reference to `tcp_stream_memory_free' Fixes: 1c5f2ced136a ("tcp: avoid indirect call to tcp_stream_memory_free()") Signed-off-by: Randy Dunlap Reported-by: Randy Dunlap Link: https://lore.kernel.org/r/20201118194438.674-1-rdunlap@infradead.org Signed-off-by: Jakub Kicinski --- include/net/sock.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index 093b51719c69..80469c2c448d 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1271,10 +1271,15 @@ static inline bool __sk_stream_memory_free(const struct sock *sk, int wake) if (READ_ONCE(sk->sk_wmem_queued) >= READ_ONCE(sk->sk_sndbuf)) return false; +#ifdef CONFIG_INET return sk->sk_prot->stream_memory_free ? INDIRECT_CALL_1(sk->sk_prot->stream_memory_free, tcp_stream_memory_free, sk, wake) : true; +#else + return sk->sk_prot->stream_memory_free ? + sk->sk_prot->stream_memory_free(sk, wake) : true; +#endif } static inline bool sk_stream_memory_free(const struct sock *sk) -- cgit v1.2.3 From 7fd3253a7de6a317a0683f83739479fb880bffc8 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Mon, 30 Nov 2020 19:51:56 +0100 Subject: net: Introduce preferred busy-polling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The existing busy-polling mode, enabled by the SO_BUSY_POLL socket option or system-wide using the /proc/sys/net/core/busy_read knob, is an opportunistic. That means that if the NAPI context is not scheduled, it will poll it. If, after busy-polling, the budget is exceeded the busy-polling logic will schedule the NAPI onto the regular softirq handling. One implication of the behavior above is that a busy/heavy loaded NAPI context will never enter/allow for busy-polling. Some applications prefer that most NAPI processing would be done by busy-polling. This series adds a new socket option, SO_PREFER_BUSY_POLL, that works in concert with the napi_defer_hard_irqs and gro_flush_timeout knobs. The napi_defer_hard_irqs and gro_flush_timeout knobs were introduced in commit 6f8b12d661d0 ("net: napi: add hard irqs deferral feature"), and allows for a user to defer interrupts to be enabled and instead schedule the NAPI context from a watchdog timer. When a user enables the SO_PREFER_BUSY_POLL, again with the other knobs enabled, and the NAPI context is being processed by a softirq, the softirq NAPI processing will exit early to allow the busy-polling to be performed. If the application stops performing busy-polling via a system call, the watchdog timer defined by gro_flush_timeout will timeout, and regular softirq handling will resume. In summary; Heavy traffic applications that prefer busy-polling over softirq processing should use this option. Example usage: $ echo 2 | sudo tee /sys/class/net/ens785f1/napi_defer_hard_irqs $ echo 200000 | sudo tee /sys/class/net/ens785f1/gro_flush_timeout Note that the timeout should be larger than the userspace processing window, otherwise the watchdog will timeout and fall back to regular softirq processing. Enable the SO_BUSY_POLL/SO_PREFER_BUSY_POLL options on your socket. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Kicinski Link: https://lore.kernel.org/bpf/20201130185205.196029-2-bjorn.topel@gmail.com --- arch/alpha/include/uapi/asm/socket.h | 2 + arch/mips/include/uapi/asm/socket.h | 2 + arch/parisc/include/uapi/asm/socket.h | 2 + arch/sparc/include/uapi/asm/socket.h | 2 + fs/eventpoll.c | 2 +- include/linux/netdevice.h | 35 +++++++++------- include/net/busy_poll.h | 5 ++- include/net/sock.h | 4 ++ include/uapi/asm-generic/socket.h | 2 + net/core/dev.c | 78 ++++++++++++++++++++++++++++------- net/core/sock.c | 9 ++++ 11 files changed, 111 insertions(+), 32 deletions(-) (limited to 'include/net/sock.h') diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h index de6c4df61082..538359642554 100644 --- a/arch/alpha/include/uapi/asm/socket.h +++ b/arch/alpha/include/uapi/asm/socket.h @@ -124,6 +124,8 @@ #define SO_DETACH_REUSEPORT_BPF 68 +#define SO_PREFER_BUSY_POLL 69 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h index d0a9ed2ca2d6..e406e73b5e6e 100644 --- a/arch/mips/include/uapi/asm/socket.h +++ b/arch/mips/include/uapi/asm/socket.h @@ -135,6 +135,8 @@ #define SO_DETACH_REUSEPORT_BPF 68 +#define SO_PREFER_BUSY_POLL 69 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h index 10173c32195e..1bc46200889d 100644 --- a/arch/parisc/include/uapi/asm/socket.h +++ b/arch/parisc/include/uapi/asm/socket.h @@ -116,6 +116,8 @@ #define SO_DETACH_REUSEPORT_BPF 0x4042 +#define SO_PREFER_BUSY_POLL 0x4043 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h index 8029b681fc7c..99688cf673a4 100644 --- a/arch/sparc/include/uapi/asm/socket.h +++ b/arch/sparc/include/uapi/asm/socket.h @@ -117,6 +117,8 @@ #define SO_DETACH_REUSEPORT_BPF 0x0047 +#define SO_PREFER_BUSY_POLL 0x0048 + #if !defined(__KERNEL__) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 4df61129566d..e11fab3a0b9e 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -397,7 +397,7 @@ static void ep_busy_loop(struct eventpoll *ep, int nonblock) unsigned int napi_id = READ_ONCE(ep->napi_id); if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on()) - napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep); + napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep, false); } static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7ce648a564f7..52d1cc2bd8a7 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -350,23 +350,25 @@ struct napi_struct { }; enum { - NAPI_STATE_SCHED, /* Poll is scheduled */ - NAPI_STATE_MISSED, /* reschedule a napi */ - NAPI_STATE_DISABLE, /* Disable pending */ - NAPI_STATE_NPSVC, /* Netpoll - don't dequeue from poll_list */ - NAPI_STATE_LISTED, /* NAPI added to system lists */ - NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */ - NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */ + NAPI_STATE_SCHED, /* Poll is scheduled */ + NAPI_STATE_MISSED, /* reschedule a napi */ + NAPI_STATE_DISABLE, /* Disable pending */ + NAPI_STATE_NPSVC, /* Netpoll - don't dequeue from poll_list */ + NAPI_STATE_LISTED, /* NAPI added to system lists */ + NAPI_STATE_NO_BUSY_POLL, /* Do not add in napi_hash, no busy polling */ + NAPI_STATE_IN_BUSY_POLL, /* sk_busy_loop() owns this NAPI */ + NAPI_STATE_PREFER_BUSY_POLL, /* prefer busy-polling over softirq processing*/ }; enum { - NAPIF_STATE_SCHED = BIT(NAPI_STATE_SCHED), - NAPIF_STATE_MISSED = BIT(NAPI_STATE_MISSED), - NAPIF_STATE_DISABLE = BIT(NAPI_STATE_DISABLE), - NAPIF_STATE_NPSVC = BIT(NAPI_STATE_NPSVC), - NAPIF_STATE_LISTED = BIT(NAPI_STATE_LISTED), - NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL), - NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL), + NAPIF_STATE_SCHED = BIT(NAPI_STATE_SCHED), + NAPIF_STATE_MISSED = BIT(NAPI_STATE_MISSED), + NAPIF_STATE_DISABLE = BIT(NAPI_STATE_DISABLE), + NAPIF_STATE_NPSVC = BIT(NAPI_STATE_NPSVC), + NAPIF_STATE_LISTED = BIT(NAPI_STATE_LISTED), + NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL), + NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL), + NAPIF_STATE_PREFER_BUSY_POLL = BIT(NAPI_STATE_PREFER_BUSY_POLL), }; enum gro_result { @@ -437,6 +439,11 @@ static inline bool napi_disable_pending(struct napi_struct *n) return test_bit(NAPI_STATE_DISABLE, &n->state); } +static inline bool napi_prefer_busy_poll(struct napi_struct *n) +{ + return test_bit(NAPI_STATE_PREFER_BUSY_POLL, &n->state); +} + bool napi_schedule_prep(struct napi_struct *n); /** diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index b001fa91c14e..0292b8353d7e 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -43,7 +43,7 @@ bool sk_busy_loop_end(void *p, unsigned long start_time); void napi_busy_loop(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), - void *loop_end_arg); + void *loop_end_arg, bool prefer_busy_poll); #else /* CONFIG_NET_RX_BUSY_POLL */ static inline unsigned long net_busy_loop_on(void) @@ -105,7 +105,8 @@ static inline void sk_busy_loop(struct sock *sk, int nonblock) unsigned int napi_id = READ_ONCE(sk->sk_napi_id); if (napi_id >= MIN_NAPI_ID) - napi_busy_loop(napi_id, nonblock ? NULL : sk_busy_loop_end, sk); + napi_busy_loop(napi_id, nonblock ? NULL : sk_busy_loop_end, sk, + READ_ONCE(sk->sk_prefer_busy_poll)); #endif } diff --git a/include/net/sock.h b/include/net/sock.h index a5c6ae78df77..d49b89b071b6 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -301,6 +301,7 @@ struct bpf_local_storage; * @sk_ack_backlog: current listen backlog * @sk_max_ack_backlog: listen backlog set in listen() * @sk_uid: user id of owner + * @sk_prefer_busy_poll: prefer busypolling over softirq processing * @sk_priority: %SO_PRIORITY setting * @sk_type: socket type (%SOCK_STREAM, etc) * @sk_protocol: which protocol this socket belongs in this network family @@ -479,6 +480,9 @@ struct sock { u32 sk_ack_backlog; u32 sk_max_ack_backlog; kuid_t sk_uid; +#ifdef CONFIG_NET_RX_BUSY_POLL + u8 sk_prefer_busy_poll; +#endif struct pid *sk_peer_pid; const struct cred *sk_peer_cred; long sk_rcvtimeo; diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h index 77f7c1638eb1..7dd02408b7ce 100644 --- a/include/uapi/asm-generic/socket.h +++ b/include/uapi/asm-generic/socket.h @@ -119,6 +119,8 @@ #define SO_DETACH_REUSEPORT_BPF 68 +#define SO_PREFER_BUSY_POLL 69 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__)) diff --git a/net/core/dev.c b/net/core/dev.c index 60d325bda0d7..6f8d2cffb7c5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6458,7 +6458,8 @@ bool napi_complete_done(struct napi_struct *n, int work_done) WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED)); - new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED); + new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED | + NAPIF_STATE_PREFER_BUSY_POLL); /* If STATE_MISSED was set, leave STATE_SCHED set, * because we will call napi->poll() one more time. @@ -6497,8 +6498,29 @@ static struct napi_struct *napi_by_id(unsigned int napi_id) #define BUSY_POLL_BUDGET 8 -static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock) +static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) { + if (!skip_schedule) { + gro_normal_list(napi); + __napi_schedule(napi); + return; + } + + if (napi->gro_bitmask) { + /* flush too old packets + * If HZ < 1000, flush all packets. + */ + napi_gro_flush(napi, HZ >= 1000); + } + + gro_normal_list(napi); + clear_bit(NAPI_STATE_SCHED, &napi->state); +} + +static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool prefer_busy_poll) +{ + bool skip_schedule = false; + unsigned long timeout; int rc; /* Busy polling means there is a high chance device driver hard irq @@ -6515,6 +6537,15 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock) local_bh_disable(); + if (prefer_busy_poll) { + napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs); + timeout = READ_ONCE(napi->dev->gro_flush_timeout); + if (napi->defer_hard_irqs_count && timeout) { + hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED); + skip_schedule = true; + } + } + /* All we really want here is to re-enable device interrupts. * Ideally, a new ndo_busy_poll_stop() could avoid another round. */ @@ -6525,19 +6556,14 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock) */ trace_napi_poll(napi, rc, BUSY_POLL_BUDGET); netpoll_poll_unlock(have_poll_lock); - if (rc == BUSY_POLL_BUDGET) { - /* As the whole budget was spent, we still own the napi so can - * safely handle the rx_list. - */ - gro_normal_list(napi); - __napi_schedule(napi); - } + if (rc == BUSY_POLL_BUDGET) + __busy_poll_stop(napi, skip_schedule); local_bh_enable(); } void napi_busy_loop(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), - void *loop_end_arg) + void *loop_end_arg, bool prefer_busy_poll) { unsigned long start_time = loop_end ? busy_loop_current_time() : 0; int (*napi_poll)(struct napi_struct *napi, int budget); @@ -6565,12 +6591,18 @@ restart: * we avoid dirtying napi->state as much as we can. */ if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED | - NAPIF_STATE_IN_BUSY_POLL)) + NAPIF_STATE_IN_BUSY_POLL)) { + if (prefer_busy_poll) + set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); goto count; + } if (cmpxchg(&napi->state, val, val | NAPIF_STATE_IN_BUSY_POLL | - NAPIF_STATE_SCHED) != val) + NAPIF_STATE_SCHED) != val) { + if (prefer_busy_poll) + set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); goto count; + } have_poll_lock = netpoll_poll_lock(napi); napi_poll = napi->poll; } @@ -6588,7 +6620,7 @@ count: if (unlikely(need_resched())) { if (napi_poll) - busy_poll_stop(napi, have_poll_lock); + busy_poll_stop(napi, have_poll_lock, prefer_busy_poll); preempt_enable(); rcu_read_unlock(); cond_resched(); @@ -6599,7 +6631,7 @@ count: cpu_relax(); } if (napi_poll) - busy_poll_stop(napi, have_poll_lock); + busy_poll_stop(napi, have_poll_lock, prefer_busy_poll); preempt_enable(); out: rcu_read_unlock(); @@ -6650,8 +6682,10 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) * NAPI_STATE_MISSED, since we do not react to a device IRQ. */ if (!napi_disable_pending(napi) && - !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) + !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) { + clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); __napi_schedule_irqoff(napi); + } return HRTIMER_NORESTART; } @@ -6709,6 +6743,7 @@ void napi_disable(struct napi_struct *n) hrtimer_cancel(&n->timer); + clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &n->state); clear_bit(NAPI_STATE_DISABLE, &n->state); } EXPORT_SYMBOL(napi_disable); @@ -6781,6 +6816,19 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) goto out_unlock; } + /* The NAPI context has more processing work, but busy-polling + * is preferred. Exit early. + */ + if (napi_prefer_busy_poll(n)) { + if (napi_complete_done(n, work)) { + /* If timeout is not set, we need to make sure + * that the NAPI is re-scheduled. + */ + napi_schedule(n); + } + goto out_unlock; + } + if (n->gro_bitmask) { /* flush too old packets * If HZ < 1000, flush all packets. diff --git a/net/core/sock.c b/net/core/sock.c index 727ea1cc633c..e05f2e52b5a8 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1159,6 +1159,12 @@ set_sndbuf: sk->sk_ll_usec = val; } break; + case SO_PREFER_BUSY_POLL: + if (valbool && !capable(CAP_NET_ADMIN)) + ret = -EPERM; + else + WRITE_ONCE(sk->sk_prefer_busy_poll, valbool); + break; #endif case SO_MAX_PACING_RATE: @@ -1523,6 +1529,9 @@ int sock_getsockopt(struct socket *sock, int level, int optname, case SO_BUSY_POLL: v.val = sk->sk_ll_usec; break; + case SO_PREFER_BUSY_POLL: + v.val = READ_ONCE(sk->sk_prefer_busy_poll); + break; #endif case SO_MAX_PACING_RATE: -- cgit v1.2.3 From 7c951cafc0cb2e575f1d58677b95ac387ac0a5bd Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Mon, 30 Nov 2020 19:51:57 +0100 Subject: net: Add SO_BUSY_POLL_BUDGET socket option MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This option lets a user set a per socket NAPI budget for busy-polling. If the options is not set, it will use the default of 8. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Kicinski Link: https://lore.kernel.org/bpf/20201130185205.196029-3-bjorn.topel@gmail.com --- arch/alpha/include/uapi/asm/socket.h | 1 + arch/mips/include/uapi/asm/socket.h | 1 + arch/parisc/include/uapi/asm/socket.h | 1 + arch/sparc/include/uapi/asm/socket.h | 1 + fs/eventpoll.c | 3 ++- include/net/busy_poll.h | 7 +++++-- include/net/sock.h | 2 ++ include/uapi/asm-generic/socket.h | 1 + net/core/dev.c | 21 ++++++++++----------- net/core/sock.c | 10 ++++++++++ 10 files changed, 34 insertions(+), 14 deletions(-) (limited to 'include/net/sock.h') diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h index 538359642554..57420356ce4c 100644 --- a/arch/alpha/include/uapi/asm/socket.h +++ b/arch/alpha/include/uapi/asm/socket.h @@ -125,6 +125,7 @@ #define SO_DETACH_REUSEPORT_BPF 68 #define SO_PREFER_BUSY_POLL 69 +#define SO_BUSY_POLL_BUDGET 70 #if !defined(__KERNEL__) diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h index e406e73b5e6e..2d949969313b 100644 --- a/arch/mips/include/uapi/asm/socket.h +++ b/arch/mips/include/uapi/asm/socket.h @@ -136,6 +136,7 @@ #define SO_DETACH_REUSEPORT_BPF 68 #define SO_PREFER_BUSY_POLL 69 +#define SO_BUSY_POLL_BUDGET 70 #if !defined(__KERNEL__) diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h index 1bc46200889d..f60904329bbc 100644 --- a/arch/parisc/include/uapi/asm/socket.h +++ b/arch/parisc/include/uapi/asm/socket.h @@ -117,6 +117,7 @@ #define SO_DETACH_REUSEPORT_BPF 0x4042 #define SO_PREFER_BUSY_POLL 0x4043 +#define SO_BUSY_POLL_BUDGET 0x4044 #if !defined(__KERNEL__) diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h index 99688cf673a4..848a22fbac20 100644 --- a/arch/sparc/include/uapi/asm/socket.h +++ b/arch/sparc/include/uapi/asm/socket.h @@ -118,6 +118,7 @@ #define SO_DETACH_REUSEPORT_BPF 0x0047 #define SO_PREFER_BUSY_POLL 0x0048 +#define SO_BUSY_POLL_BUDGET 0x0049 #if !defined(__KERNEL__) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index e11fab3a0b9e..73c346e503d7 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -397,7 +397,8 @@ static void ep_busy_loop(struct eventpoll *ep, int nonblock) unsigned int napi_id = READ_ONCE(ep->napi_id); if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on()) - napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep, false); + napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep, false, + BUSY_POLL_BUDGET); } static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep) diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index 0292b8353d7e..2f8f51807b83 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -23,6 +23,8 @@ */ #define MIN_NAPI_ID ((unsigned int)(NR_CPUS + 1)) +#define BUSY_POLL_BUDGET 8 + #ifdef CONFIG_NET_RX_BUSY_POLL struct napi_struct; @@ -43,7 +45,7 @@ bool sk_busy_loop_end(void *p, unsigned long start_time); void napi_busy_loop(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), - void *loop_end_arg, bool prefer_busy_poll); + void *loop_end_arg, bool prefer_busy_poll, u16 budget); #else /* CONFIG_NET_RX_BUSY_POLL */ static inline unsigned long net_busy_loop_on(void) @@ -106,7 +108,8 @@ static inline void sk_busy_loop(struct sock *sk, int nonblock) if (napi_id >= MIN_NAPI_ID) napi_busy_loop(napi_id, nonblock ? NULL : sk_busy_loop_end, sk, - READ_ONCE(sk->sk_prefer_busy_poll)); + READ_ONCE(sk->sk_prefer_busy_poll), + READ_ONCE(sk->sk_busy_poll_budget) ?: BUSY_POLL_BUDGET); #endif } diff --git a/include/net/sock.h b/include/net/sock.h index d49b89b071b6..77ba2c2737db 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -302,6 +302,7 @@ struct bpf_local_storage; * @sk_max_ack_backlog: listen backlog set in listen() * @sk_uid: user id of owner * @sk_prefer_busy_poll: prefer busypolling over softirq processing + * @sk_busy_poll_budget: napi processing budget when busypolling * @sk_priority: %SO_PRIORITY setting * @sk_type: socket type (%SOCK_STREAM, etc) * @sk_protocol: which protocol this socket belongs in this network family @@ -482,6 +483,7 @@ struct sock { kuid_t sk_uid; #ifdef CONFIG_NET_RX_BUSY_POLL u8 sk_prefer_busy_poll; + u16 sk_busy_poll_budget; #endif struct pid *sk_peer_pid; const struct cred *sk_peer_cred; diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h index 7dd02408b7ce..4dcd13d097a9 100644 --- a/include/uapi/asm-generic/socket.h +++ b/include/uapi/asm-generic/socket.h @@ -120,6 +120,7 @@ #define SO_DETACH_REUSEPORT_BPF 68 #define SO_PREFER_BUSY_POLL 69 +#define SO_BUSY_POLL_BUDGET 70 #if !defined(__KERNEL__) diff --git a/net/core/dev.c b/net/core/dev.c index 6f8d2cffb7c5..7a1e5936c67f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6496,8 +6496,6 @@ static struct napi_struct *napi_by_id(unsigned int napi_id) #if defined(CONFIG_NET_RX_BUSY_POLL) -#define BUSY_POLL_BUDGET 8 - static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) { if (!skip_schedule) { @@ -6517,7 +6515,8 @@ static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) clear_bit(NAPI_STATE_SCHED, &napi->state); } -static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool prefer_busy_poll) +static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool prefer_busy_poll, + u16 budget) { bool skip_schedule = false; unsigned long timeout; @@ -6549,21 +6548,21 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool /* All we really want here is to re-enable device interrupts. * Ideally, a new ndo_busy_poll_stop() could avoid another round. */ - rc = napi->poll(napi, BUSY_POLL_BUDGET); + rc = napi->poll(napi, budget); /* We can't gro_normal_list() here, because napi->poll() might have * rearmed the napi (napi_complete_done()) in which case it could * already be running on another CPU. */ - trace_napi_poll(napi, rc, BUSY_POLL_BUDGET); + trace_napi_poll(napi, rc, budget); netpoll_poll_unlock(have_poll_lock); - if (rc == BUSY_POLL_BUDGET) + if (rc == budget) __busy_poll_stop(napi, skip_schedule); local_bh_enable(); } void napi_busy_loop(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), - void *loop_end_arg, bool prefer_busy_poll) + void *loop_end_arg, bool prefer_busy_poll, u16 budget) { unsigned long start_time = loop_end ? busy_loop_current_time() : 0; int (*napi_poll)(struct napi_struct *napi, int budget); @@ -6606,8 +6605,8 @@ restart: have_poll_lock = netpoll_poll_lock(napi); napi_poll = napi->poll; } - work = napi_poll(napi, BUSY_POLL_BUDGET); - trace_napi_poll(napi, work, BUSY_POLL_BUDGET); + work = napi_poll(napi, budget); + trace_napi_poll(napi, work, budget); gro_normal_list(napi); count: if (work > 0) @@ -6620,7 +6619,7 @@ count: if (unlikely(need_resched())) { if (napi_poll) - busy_poll_stop(napi, have_poll_lock, prefer_busy_poll); + busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget); preempt_enable(); rcu_read_unlock(); cond_resched(); @@ -6631,7 +6630,7 @@ count: cpu_relax(); } if (napi_poll) - busy_poll_stop(napi, have_poll_lock, prefer_busy_poll); + busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget); preempt_enable(); out: rcu_read_unlock(); diff --git a/net/core/sock.c b/net/core/sock.c index e05f2e52b5a8..d422a6808405 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1165,6 +1165,16 @@ set_sndbuf: else WRITE_ONCE(sk->sk_prefer_busy_poll, valbool); break; + case SO_BUSY_POLL_BUDGET: + if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) { + ret = -EPERM; + } else { + if (val < 0 || val > U16_MAX) + ret = -EINVAL; + else + WRITE_ONCE(sk->sk_busy_poll_budget, val); + } + break; #endif case SO_MAX_PACING_RATE: -- cgit v1.2.3 From ad80b0fc6e7f56bb1b09af86749ff3014477cfe6 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 27 Nov 2020 11:10:22 +0100 Subject: mptcp: open code mptcp variant for lock_sock This allows invoking an additional callback under the socket spin lock. Will be used by the next patches to avoid additional spin lock contention. Acked-by: Florian Westphal Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Jakub Kicinski --- include/net/sock.h | 1 + net/core/sock.c | 2 +- net/mptcp/protocol.h | 13 +++++++++++++ 3 files changed, 15 insertions(+), 1 deletion(-) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index 80469c2c448d..f59764614e30 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1590,6 +1590,7 @@ static inline void lock_sock(struct sock *sk) lock_sock_nested(sk, 0); } +void __lock_sock(struct sock *sk); void __release_sock(struct sock *sk); void release_sock(struct sock *sk); diff --git a/net/core/sock.c b/net/core/sock.c index 9badbe7bb4e4..f0f096852876 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2486,7 +2486,7 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) } EXPORT_SYMBOL(sk_page_frag_refill); -static void __lock_sock(struct sock *sk) +void __lock_sock(struct sock *sk) __releases(&sk->sk_lock.slock) __acquires(&sk->sk_lock.slock) { diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 82d5626323b1..6abac8238de3 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -253,6 +253,19 @@ struct mptcp_sock { } rcvq_space; }; +#define mptcp_lock_sock(___sk, cb) do { \ + struct sock *__sk = (___sk); /* silence macro reuse warning */ \ + might_sleep(); \ + spin_lock_bh(&__sk->sk_lock.slock); \ + if (__sk->sk_lock.owned) \ + __lock_sock(__sk); \ + cb; \ + __sk->sk_lock.owned = 1; \ + spin_unlock(&__sk->sk_lock.slock); \ + mutex_acquire(&__sk->sk_lock.dep_map, 0, 0, _RET_IP_); \ + local_bh_enable(); \ +} while (0) + #define mptcp_for_each_subflow(__msk, __subflow) \ list_for_each_entry(__subflow, &((__msk)->conn_list), node) -- cgit v1.2.3