From 3aefd7d6ea05a1a573a26c20e474d36c218016cd Mon Sep 17 00:00:00 2001 From: Yi Li Date: Tue, 27 Oct 2020 13:59:04 +0800 Subject: net: core: Use skb_is_gso() in skb_checksum_help() No functional changes, just minor refactoring. Signed-off-by: Yi Li Link: https://lore.kernel.org/r/20201027055904.2683444-1-yili@winhong.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 9499a414d67e..55f66e108059 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3205,7 +3205,7 @@ int skb_checksum_help(struct sk_buff *skb) if (skb->ip_summed == CHECKSUM_COMPLETE) goto out_set_summed; - if (unlikely(skb_shinfo(skb)->gso_size)) { + if (unlikely(skb_is_gso(skb))) { skb_warn_bad_offload(skb); return -EINVAL; } -- cgit v1.2.3 From 5d867245c46a7c4e52acdc8d2625c11c1d3e72b1 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Sun, 1 Nov 2020 07:36:47 -0800 Subject: net: core: remove unneeded semicolon A semicolon is not needed after a switch statement. Signed-off-by: Tom Rix Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20201101153647.2292322-1-trix@redhat.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 9e7f071b846c..bd6100da66f4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8902,7 +8902,7 @@ static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode) return dev->netdev_ops->ndo_bpf; default: return NULL; - }; + } } static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev, -- cgit v1.2.3 From a18394269fc87276963e8d965c730900178d7e4b Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 7 Nov 2020 21:49:07 +0100 Subject: net: core: add dev_get_tstats64 as a ndo_get_stats64 implementation It's a frequent pattern to use netdev->stats for the less frequently accessed counters and per-cpu counters for the frequently accessed counters (rx/tx bytes/packets). Add a default ndo_get_stats64() implementation for this use case. Reviewed-by: Florian Fainelli Signed-off-by: Heiner Kallweit Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 + net/core/dev.c | 15 +++++++++++++++ 2 files changed, 16 insertions(+) (limited to 'net/core/dev.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a53ed2d1ed1d..7ce648a564f7 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4527,6 +4527,7 @@ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, const struct net_device_stats *netdev_stats); void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s, const struct pcpu_sw_netstats __percpu *netstats); +void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s); extern int netdev_max_backlog; extern int netdev_tstamp_prequeue; diff --git a/net/core/dev.c b/net/core/dev.c index bd6100da66f4..60d325bda0d7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10366,6 +10366,21 @@ void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s, } EXPORT_SYMBOL_GPL(dev_fetch_sw_netstats); +/** + * dev_get_tstats64 - ndo_get_stats64 implementation + * @dev: device to get statistics from + * @s: place to store stats + * + * Populate @s from dev->stats and dev->tstats. Can be used as + * ndo_get_stats64() callback. + */ +void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s) +{ + netdev_stats_to_stats64(s, &dev->stats); + dev_fetch_sw_netstats(s, dev->tstats); +} +EXPORT_SYMBOL_GPL(dev_get_tstats64); + struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) { struct netdev_queue *queue = dev_ingress_queue(dev); -- cgit v1.2.3 From c1639be98b4281ac537f2ed77b0afaa1d336ce6c Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 16 Nov 2020 11:17:58 +0100 Subject: net: datagram: fix some kernel-doc markups Some identifiers have different names between their prototypes and the kernel-doc markup. Signed-off-by: Mauro Carvalho Chehab Reviewed-by: Kirill Tkhai Signed-off-by: Jakub Kicinski --- net/core/datagram.c | 2 +- net/core/dev.c | 4 ++-- net/core/skbuff.c | 2 +- net/ethernet/eth.c | 6 +++--- net/sunrpc/rpc_pipe.c | 3 ++- 5 files changed, 9 insertions(+), 8 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/datagram.c b/net/core/datagram.c index 9fcaa544f11a..81809fa735a7 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -709,7 +709,7 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from) EXPORT_SYMBOL(zerocopy_sg_from_iter); /** - * skb_copy_and_csum_datagram_iter - Copy datagram to an iovec iterator + * skb_copy_and_csum_datagram - Copy datagram to an iovec iterator * and update a checksum. * @skb: buffer to copy * @offset: offset in the buffer to start copying from diff --git a/net/core/dev.c b/net/core/dev.c index 60d325bda0d7..4bfdcd6b20e8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6919,7 +6919,7 @@ bool netdev_has_upper_dev(struct net_device *dev, EXPORT_SYMBOL(netdev_has_upper_dev); /** - * netdev_has_upper_dev_all - Check if device is linked to an upper device + * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device * @dev: device * @upper_dev: upper device to check * @@ -8157,7 +8157,7 @@ EXPORT_SYMBOL(netdev_lower_dev_get_private); /** - * netdev_lower_change - Dispatch event about lower device state change + * netdev_lower_state_changed - Dispatch event about lower device state change * @lower_dev: device * @lower_state_info: state to dispatch * diff --git a/net/core/skbuff.c b/net/core/skbuff.c index c9a5a3c262c8..ffe3dcc0ebea 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -842,7 +842,7 @@ EXPORT_SYMBOL(consume_skb); #endif /** - * consume_stateless_skb - free an skbuff, assuming it is stateless + * __consume_stateless_skb - free an skbuff, assuming it is stateless * @skb: buffer to free * * Alike consume_skb(), but this variant assumes that this is the last diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index dac65180c4ef..4106373180c6 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -272,7 +272,7 @@ void eth_header_cache_update(struct hh_cache *hh, EXPORT_SYMBOL(eth_header_cache_update); /** - * eth_header_parser_protocol - extract protocol from L2 header + * eth_header_parse_protocol - extract protocol from L2 header * @skb: packet to extract protocol from */ __be16 eth_header_parse_protocol(const struct sk_buff *skb) @@ -523,8 +523,8 @@ int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr) EXPORT_SYMBOL(eth_platform_get_mac_address); /** - * Obtain the MAC address from an nvmem cell named 'mac-address' associated - * with given device. + * nvmem_get_mac_address - Obtain the MAC address from an nvmem cell named + * 'mac-address' associated with given device. * * @dev: Device with which the mac-address cell is associated. * @addrbuf: Buffer to which the MAC address will be copied on success. diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index eadc0ede928c..8241f5a4a01c 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -781,7 +781,8 @@ static int rpc_rmdir_depopulate(struct dentry *dentry, } /** - * rpc_mkpipe - make an rpc_pipefs file for kernel<->userspace communication + * rpc_mkpipe_dentry - make an rpc_pipefs file for kernel<->userspace + * communication * @parent: dentry of directory to create new "pipe" in * @name: name of pipe * @private: private data to associate with the pipe, for the caller's use -- cgit v1.2.3 From 1d155dfdf50efc2b0793bce93c06d1a5b23d0877 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 21 Nov 2020 00:22:20 +0100 Subject: net: warn if gso_type isn't set for a GSO SKB In bug report [0] a warning in r8169 driver was reported that was caused by an invalid GSO SKB (gso_type was 0). See [1] for a discussion about this issue. Still the origin of the invalid GSO SKB isn't clear. It shouldn't be a network drivers task to check for invalid GSO SKB's. Also, even if issue [0] can be fixed, we can't be sure that a similar issue doesn't pop up again at another place. Therefore let gso_features_check() check for such invalid GSO SKB's. [0] https://bugzilla.kernel.org/show_bug.cgi?id=209423 [1] https://www.spinics.net/lists/netdev/msg690794.html Signed-off-by: Heiner Kallweit Link: https://lore.kernel.org/r/97c78d21-7f0b-d843-df17-3589f224d2cf@gmail.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 4bfdcd6b20e8..3c3070d9d26f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3495,6 +3495,11 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb, if (gso_segs > dev->gso_max_segs) return features & ~NETIF_F_GSO_MASK; + if (!skb_shinfo(skb)->gso_type) { + skb_warn_bad_offload(skb); + return features & ~NETIF_F_GSO_MASK; + } + /* Support for GSO partial features requires software * intervention before we can actually process the packets * so we need to strip support for any partial features now -- cgit v1.2.3 From aadaca9e7c392dbf877af8cefb156199f1a67bbe Mon Sep 17 00:00:00 2001 From: wenxu Date: Wed, 25 Nov 2020 12:01:21 +0800 Subject: net/sched: fix miss init the mru in qdisc_skb_cb The mru in the qdisc_skb_cb should be init as 0. Only defrag packets in the act_ct will set the value. Fixes: 038ebb1a713d ("net/sched: act_ct: fix miss set mru for ovs after defrag in act_ct") Signed-off-by: wenxu Signed-off-by: Jakub Kicinski --- net/core/dev.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 3c3070d9d26f..51b263076124 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3872,6 +3872,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) return skb; /* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */ + qdisc_skb_cb(skb)->mru = 0; mini_qdisc_bstats_cpu_update(miniq, skb); switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) { @@ -4959,6 +4960,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, } qdisc_skb_cb(skb)->pkt_len = skb->len; + qdisc_skb_cb(skb)->mru = 0; skb->tc_at_ingress = 1; mini_qdisc_bstats_cpu_update(miniq, skb); -- cgit v1.2.3 From 7fd3253a7de6a317a0683f83739479fb880bffc8 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Mon, 30 Nov 2020 19:51:56 +0100 Subject: net: Introduce preferred busy-polling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The existing busy-polling mode, enabled by the SO_BUSY_POLL socket option or system-wide using the /proc/sys/net/core/busy_read knob, is an opportunistic. That means that if the NAPI context is not scheduled, it will poll it. If, after busy-polling, the budget is exceeded the busy-polling logic will schedule the NAPI onto the regular softirq handling. One implication of the behavior above is that a busy/heavy loaded NAPI context will never enter/allow for busy-polling. Some applications prefer that most NAPI processing would be done by busy-polling. This series adds a new socket option, SO_PREFER_BUSY_POLL, that works in concert with the napi_defer_hard_irqs and gro_flush_timeout knobs. The napi_defer_hard_irqs and gro_flush_timeout knobs were introduced in commit 6f8b12d661d0 ("net: napi: add hard irqs deferral feature"), and allows for a user to defer interrupts to be enabled and instead schedule the NAPI context from a watchdog timer. When a user enables the SO_PREFER_BUSY_POLL, again with the other knobs enabled, and the NAPI context is being processed by a softirq, the softirq NAPI processing will exit early to allow the busy-polling to be performed. If the application stops performing busy-polling via a system call, the watchdog timer defined by gro_flush_timeout will timeout, and regular softirq handling will resume. In summary; Heavy traffic applications that prefer busy-polling over softirq processing should use this option. Example usage: $ echo 2 | sudo tee /sys/class/net/ens785f1/napi_defer_hard_irqs $ echo 200000 | sudo tee /sys/class/net/ens785f1/gro_flush_timeout Note that the timeout should be larger than the userspace processing window, otherwise the watchdog will timeout and fall back to regular softirq processing. Enable the SO_BUSY_POLL/SO_PREFER_BUSY_POLL options on your socket. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Kicinski Link: https://lore.kernel.org/bpf/20201130185205.196029-2-bjorn.topel@gmail.com --- arch/alpha/include/uapi/asm/socket.h | 2 + arch/mips/include/uapi/asm/socket.h | 2 + arch/parisc/include/uapi/asm/socket.h | 2 + arch/sparc/include/uapi/asm/socket.h | 2 + fs/eventpoll.c | 2 +- include/linux/netdevice.h | 35 +++++++++------- include/net/busy_poll.h | 5 ++- include/net/sock.h | 4 ++ include/uapi/asm-generic/socket.h | 2 + net/core/dev.c | 78 ++++++++++++++++++++++++++++------- net/core/sock.c | 9 ++++ 11 files changed, 111 insertions(+), 32 deletions(-) (limited to 'net/core/dev.c') diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h index de6c4df61082..538359642554 100644 --- a/arch/alpha/include/uapi/asm/socket.h +++ b/arch/alpha/include/uapi/asm/socket.h @@ -124,6 +124,8 @@ #define SO_DETACH_REUSEPORT_BPF 68 +#define SO_PREFER_BUSY_POLL 69 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h index d0a9ed2ca2d6..e406e73b5e6e 100644 --- a/arch/mips/include/uapi/asm/socket.h +++ b/arch/mips/include/uapi/asm/socket.h @@ -135,6 +135,8 @@ #define SO_DETACH_REUSEPORT_BPF 68 +#define SO_PREFER_BUSY_POLL 69 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h index 10173c32195e..1bc46200889d 100644 --- a/arch/parisc/include/uapi/asm/socket.h +++ b/arch/parisc/include/uapi/asm/socket.h @@ -116,6 +116,8 @@ #define SO_DETACH_REUSEPORT_BPF 0x4042 +#define SO_PREFER_BUSY_POLL 0x4043 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h index 8029b681fc7c..99688cf673a4 100644 --- a/arch/sparc/include/uapi/asm/socket.h +++ b/arch/sparc/include/uapi/asm/socket.h @@ -117,6 +117,8 @@ #define SO_DETACH_REUSEPORT_BPF 0x0047 +#define SO_PREFER_BUSY_POLL 0x0048 + #if !defined(__KERNEL__) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 4df61129566d..e11fab3a0b9e 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -397,7 +397,7 @@ static void ep_busy_loop(struct eventpoll *ep, int nonblock) unsigned int napi_id = READ_ONCE(ep->napi_id); if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on()) - napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep); + napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep, false); } static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7ce648a564f7..52d1cc2bd8a7 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -350,23 +350,25 @@ struct napi_struct { }; enum { - NAPI_STATE_SCHED, /* Poll is scheduled */ - NAPI_STATE_MISSED, /* reschedule a napi */ - NAPI_STATE_DISABLE, /* Disable pending */ - NAPI_STATE_NPSVC, /* Netpoll - don't dequeue from poll_list */ - NAPI_STATE_LISTED, /* NAPI added to system lists */ - NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */ - NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */ + NAPI_STATE_SCHED, /* Poll is scheduled */ + NAPI_STATE_MISSED, /* reschedule a napi */ + NAPI_STATE_DISABLE, /* Disable pending */ + NAPI_STATE_NPSVC, /* Netpoll - don't dequeue from poll_list */ + NAPI_STATE_LISTED, /* NAPI added to system lists */ + NAPI_STATE_NO_BUSY_POLL, /* Do not add in napi_hash, no busy polling */ + NAPI_STATE_IN_BUSY_POLL, /* sk_busy_loop() owns this NAPI */ + NAPI_STATE_PREFER_BUSY_POLL, /* prefer busy-polling over softirq processing*/ }; enum { - NAPIF_STATE_SCHED = BIT(NAPI_STATE_SCHED), - NAPIF_STATE_MISSED = BIT(NAPI_STATE_MISSED), - NAPIF_STATE_DISABLE = BIT(NAPI_STATE_DISABLE), - NAPIF_STATE_NPSVC = BIT(NAPI_STATE_NPSVC), - NAPIF_STATE_LISTED = BIT(NAPI_STATE_LISTED), - NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL), - NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL), + NAPIF_STATE_SCHED = BIT(NAPI_STATE_SCHED), + NAPIF_STATE_MISSED = BIT(NAPI_STATE_MISSED), + NAPIF_STATE_DISABLE = BIT(NAPI_STATE_DISABLE), + NAPIF_STATE_NPSVC = BIT(NAPI_STATE_NPSVC), + NAPIF_STATE_LISTED = BIT(NAPI_STATE_LISTED), + NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL), + NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL), + NAPIF_STATE_PREFER_BUSY_POLL = BIT(NAPI_STATE_PREFER_BUSY_POLL), }; enum gro_result { @@ -437,6 +439,11 @@ static inline bool napi_disable_pending(struct napi_struct *n) return test_bit(NAPI_STATE_DISABLE, &n->state); } +static inline bool napi_prefer_busy_poll(struct napi_struct *n) +{ + return test_bit(NAPI_STATE_PREFER_BUSY_POLL, &n->state); +} + bool napi_schedule_prep(struct napi_struct *n); /** diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index b001fa91c14e..0292b8353d7e 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -43,7 +43,7 @@ bool sk_busy_loop_end(void *p, unsigned long start_time); void napi_busy_loop(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), - void *loop_end_arg); + void *loop_end_arg, bool prefer_busy_poll); #else /* CONFIG_NET_RX_BUSY_POLL */ static inline unsigned long net_busy_loop_on(void) @@ -105,7 +105,8 @@ static inline void sk_busy_loop(struct sock *sk, int nonblock) unsigned int napi_id = READ_ONCE(sk->sk_napi_id); if (napi_id >= MIN_NAPI_ID) - napi_busy_loop(napi_id, nonblock ? NULL : sk_busy_loop_end, sk); + napi_busy_loop(napi_id, nonblock ? NULL : sk_busy_loop_end, sk, + READ_ONCE(sk->sk_prefer_busy_poll)); #endif } diff --git a/include/net/sock.h b/include/net/sock.h index a5c6ae78df77..d49b89b071b6 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -301,6 +301,7 @@ struct bpf_local_storage; * @sk_ack_backlog: current listen backlog * @sk_max_ack_backlog: listen backlog set in listen() * @sk_uid: user id of owner + * @sk_prefer_busy_poll: prefer busypolling over softirq processing * @sk_priority: %SO_PRIORITY setting * @sk_type: socket type (%SOCK_STREAM, etc) * @sk_protocol: which protocol this socket belongs in this network family @@ -479,6 +480,9 @@ struct sock { u32 sk_ack_backlog; u32 sk_max_ack_backlog; kuid_t sk_uid; +#ifdef CONFIG_NET_RX_BUSY_POLL + u8 sk_prefer_busy_poll; +#endif struct pid *sk_peer_pid; const struct cred *sk_peer_cred; long sk_rcvtimeo; diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h index 77f7c1638eb1..7dd02408b7ce 100644 --- a/include/uapi/asm-generic/socket.h +++ b/include/uapi/asm-generic/socket.h @@ -119,6 +119,8 @@ #define SO_DETACH_REUSEPORT_BPF 68 +#define SO_PREFER_BUSY_POLL 69 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__)) diff --git a/net/core/dev.c b/net/core/dev.c index 60d325bda0d7..6f8d2cffb7c5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6458,7 +6458,8 @@ bool napi_complete_done(struct napi_struct *n, int work_done) WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED)); - new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED); + new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED | + NAPIF_STATE_PREFER_BUSY_POLL); /* If STATE_MISSED was set, leave STATE_SCHED set, * because we will call napi->poll() one more time. @@ -6497,8 +6498,29 @@ static struct napi_struct *napi_by_id(unsigned int napi_id) #define BUSY_POLL_BUDGET 8 -static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock) +static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) { + if (!skip_schedule) { + gro_normal_list(napi); + __napi_schedule(napi); + return; + } + + if (napi->gro_bitmask) { + /* flush too old packets + * If HZ < 1000, flush all packets. + */ + napi_gro_flush(napi, HZ >= 1000); + } + + gro_normal_list(napi); + clear_bit(NAPI_STATE_SCHED, &napi->state); +} + +static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool prefer_busy_poll) +{ + bool skip_schedule = false; + unsigned long timeout; int rc; /* Busy polling means there is a high chance device driver hard irq @@ -6515,6 +6537,15 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock) local_bh_disable(); + if (prefer_busy_poll) { + napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs); + timeout = READ_ONCE(napi->dev->gro_flush_timeout); + if (napi->defer_hard_irqs_count && timeout) { + hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED); + skip_schedule = true; + } + } + /* All we really want here is to re-enable device interrupts. * Ideally, a new ndo_busy_poll_stop() could avoid another round. */ @@ -6525,19 +6556,14 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock) */ trace_napi_poll(napi, rc, BUSY_POLL_BUDGET); netpoll_poll_unlock(have_poll_lock); - if (rc == BUSY_POLL_BUDGET) { - /* As the whole budget was spent, we still own the napi so can - * safely handle the rx_list. - */ - gro_normal_list(napi); - __napi_schedule(napi); - } + if (rc == BUSY_POLL_BUDGET) + __busy_poll_stop(napi, skip_schedule); local_bh_enable(); } void napi_busy_loop(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), - void *loop_end_arg) + void *loop_end_arg, bool prefer_busy_poll) { unsigned long start_time = loop_end ? busy_loop_current_time() : 0; int (*napi_poll)(struct napi_struct *napi, int budget); @@ -6565,12 +6591,18 @@ restart: * we avoid dirtying napi->state as much as we can. */ if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED | - NAPIF_STATE_IN_BUSY_POLL)) + NAPIF_STATE_IN_BUSY_POLL)) { + if (prefer_busy_poll) + set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); goto count; + } if (cmpxchg(&napi->state, val, val | NAPIF_STATE_IN_BUSY_POLL | - NAPIF_STATE_SCHED) != val) + NAPIF_STATE_SCHED) != val) { + if (prefer_busy_poll) + set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); goto count; + } have_poll_lock = netpoll_poll_lock(napi); napi_poll = napi->poll; } @@ -6588,7 +6620,7 @@ count: if (unlikely(need_resched())) { if (napi_poll) - busy_poll_stop(napi, have_poll_lock); + busy_poll_stop(napi, have_poll_lock, prefer_busy_poll); preempt_enable(); rcu_read_unlock(); cond_resched(); @@ -6599,7 +6631,7 @@ count: cpu_relax(); } if (napi_poll) - busy_poll_stop(napi, have_poll_lock); + busy_poll_stop(napi, have_poll_lock, prefer_busy_poll); preempt_enable(); out: rcu_read_unlock(); @@ -6650,8 +6682,10 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) * NAPI_STATE_MISSED, since we do not react to a device IRQ. */ if (!napi_disable_pending(napi) && - !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) + !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) { + clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); __napi_schedule_irqoff(napi); + } return HRTIMER_NORESTART; } @@ -6709,6 +6743,7 @@ void napi_disable(struct napi_struct *n) hrtimer_cancel(&n->timer); + clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &n->state); clear_bit(NAPI_STATE_DISABLE, &n->state); } EXPORT_SYMBOL(napi_disable); @@ -6781,6 +6816,19 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) goto out_unlock; } + /* The NAPI context has more processing work, but busy-polling + * is preferred. Exit early. + */ + if (napi_prefer_busy_poll(n)) { + if (napi_complete_done(n, work)) { + /* If timeout is not set, we need to make sure + * that the NAPI is re-scheduled. + */ + napi_schedule(n); + } + goto out_unlock; + } + if (n->gro_bitmask) { /* flush too old packets * If HZ < 1000, flush all packets. diff --git a/net/core/sock.c b/net/core/sock.c index 727ea1cc633c..e05f2e52b5a8 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1159,6 +1159,12 @@ set_sndbuf: sk->sk_ll_usec = val; } break; + case SO_PREFER_BUSY_POLL: + if (valbool && !capable(CAP_NET_ADMIN)) + ret = -EPERM; + else + WRITE_ONCE(sk->sk_prefer_busy_poll, valbool); + break; #endif case SO_MAX_PACING_RATE: @@ -1523,6 +1529,9 @@ int sock_getsockopt(struct socket *sock, int level, int optname, case SO_BUSY_POLL: v.val = sk->sk_ll_usec; break; + case SO_PREFER_BUSY_POLL: + v.val = READ_ONCE(sk->sk_prefer_busy_poll); + break; #endif case SO_MAX_PACING_RATE: -- cgit v1.2.3 From 7c951cafc0cb2e575f1d58677b95ac387ac0a5bd Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Mon, 30 Nov 2020 19:51:57 +0100 Subject: net: Add SO_BUSY_POLL_BUDGET socket option MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This option lets a user set a per socket NAPI budget for busy-polling. If the options is not set, it will use the default of 8. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Kicinski Link: https://lore.kernel.org/bpf/20201130185205.196029-3-bjorn.topel@gmail.com --- arch/alpha/include/uapi/asm/socket.h | 1 + arch/mips/include/uapi/asm/socket.h | 1 + arch/parisc/include/uapi/asm/socket.h | 1 + arch/sparc/include/uapi/asm/socket.h | 1 + fs/eventpoll.c | 3 ++- include/net/busy_poll.h | 7 +++++-- include/net/sock.h | 2 ++ include/uapi/asm-generic/socket.h | 1 + net/core/dev.c | 21 ++++++++++----------- net/core/sock.c | 10 ++++++++++ 10 files changed, 34 insertions(+), 14 deletions(-) (limited to 'net/core/dev.c') diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h index 538359642554..57420356ce4c 100644 --- a/arch/alpha/include/uapi/asm/socket.h +++ b/arch/alpha/include/uapi/asm/socket.h @@ -125,6 +125,7 @@ #define SO_DETACH_REUSEPORT_BPF 68 #define SO_PREFER_BUSY_POLL 69 +#define SO_BUSY_POLL_BUDGET 70 #if !defined(__KERNEL__) diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h index e406e73b5e6e..2d949969313b 100644 --- a/arch/mips/include/uapi/asm/socket.h +++ b/arch/mips/include/uapi/asm/socket.h @@ -136,6 +136,7 @@ #define SO_DETACH_REUSEPORT_BPF 68 #define SO_PREFER_BUSY_POLL 69 +#define SO_BUSY_POLL_BUDGET 70 #if !defined(__KERNEL__) diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h index 1bc46200889d..f60904329bbc 100644 --- a/arch/parisc/include/uapi/asm/socket.h +++ b/arch/parisc/include/uapi/asm/socket.h @@ -117,6 +117,7 @@ #define SO_DETACH_REUSEPORT_BPF 0x4042 #define SO_PREFER_BUSY_POLL 0x4043 +#define SO_BUSY_POLL_BUDGET 0x4044 #if !defined(__KERNEL__) diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h index 99688cf673a4..848a22fbac20 100644 --- a/arch/sparc/include/uapi/asm/socket.h +++ b/arch/sparc/include/uapi/asm/socket.h @@ -118,6 +118,7 @@ #define SO_DETACH_REUSEPORT_BPF 0x0047 #define SO_PREFER_BUSY_POLL 0x0048 +#define SO_BUSY_POLL_BUDGET 0x0049 #if !defined(__KERNEL__) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index e11fab3a0b9e..73c346e503d7 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -397,7 +397,8 @@ static void ep_busy_loop(struct eventpoll *ep, int nonblock) unsigned int napi_id = READ_ONCE(ep->napi_id); if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on()) - napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep, false); + napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep, false, + BUSY_POLL_BUDGET); } static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep) diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index 0292b8353d7e..2f8f51807b83 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -23,6 +23,8 @@ */ #define MIN_NAPI_ID ((unsigned int)(NR_CPUS + 1)) +#define BUSY_POLL_BUDGET 8 + #ifdef CONFIG_NET_RX_BUSY_POLL struct napi_struct; @@ -43,7 +45,7 @@ bool sk_busy_loop_end(void *p, unsigned long start_time); void napi_busy_loop(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), - void *loop_end_arg, bool prefer_busy_poll); + void *loop_end_arg, bool prefer_busy_poll, u16 budget); #else /* CONFIG_NET_RX_BUSY_POLL */ static inline unsigned long net_busy_loop_on(void) @@ -106,7 +108,8 @@ static inline void sk_busy_loop(struct sock *sk, int nonblock) if (napi_id >= MIN_NAPI_ID) napi_busy_loop(napi_id, nonblock ? NULL : sk_busy_loop_end, sk, - READ_ONCE(sk->sk_prefer_busy_poll)); + READ_ONCE(sk->sk_prefer_busy_poll), + READ_ONCE(sk->sk_busy_poll_budget) ?: BUSY_POLL_BUDGET); #endif } diff --git a/include/net/sock.h b/include/net/sock.h index d49b89b071b6..77ba2c2737db 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -302,6 +302,7 @@ struct bpf_local_storage; * @sk_max_ack_backlog: listen backlog set in listen() * @sk_uid: user id of owner * @sk_prefer_busy_poll: prefer busypolling over softirq processing + * @sk_busy_poll_budget: napi processing budget when busypolling * @sk_priority: %SO_PRIORITY setting * @sk_type: socket type (%SOCK_STREAM, etc) * @sk_protocol: which protocol this socket belongs in this network family @@ -482,6 +483,7 @@ struct sock { kuid_t sk_uid; #ifdef CONFIG_NET_RX_BUSY_POLL u8 sk_prefer_busy_poll; + u16 sk_busy_poll_budget; #endif struct pid *sk_peer_pid; const struct cred *sk_peer_cred; diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h index 7dd02408b7ce..4dcd13d097a9 100644 --- a/include/uapi/asm-generic/socket.h +++ b/include/uapi/asm-generic/socket.h @@ -120,6 +120,7 @@ #define SO_DETACH_REUSEPORT_BPF 68 #define SO_PREFER_BUSY_POLL 69 +#define SO_BUSY_POLL_BUDGET 70 #if !defined(__KERNEL__) diff --git a/net/core/dev.c b/net/core/dev.c index 6f8d2cffb7c5..7a1e5936c67f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6496,8 +6496,6 @@ static struct napi_struct *napi_by_id(unsigned int napi_id) #if defined(CONFIG_NET_RX_BUSY_POLL) -#define BUSY_POLL_BUDGET 8 - static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) { if (!skip_schedule) { @@ -6517,7 +6515,8 @@ static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) clear_bit(NAPI_STATE_SCHED, &napi->state); } -static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool prefer_busy_poll) +static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool prefer_busy_poll, + u16 budget) { bool skip_schedule = false; unsigned long timeout; @@ -6549,21 +6548,21 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool /* All we really want here is to re-enable device interrupts. * Ideally, a new ndo_busy_poll_stop() could avoid another round. */ - rc = napi->poll(napi, BUSY_POLL_BUDGET); + rc = napi->poll(napi, budget); /* We can't gro_normal_list() here, because napi->poll() might have * rearmed the napi (napi_complete_done()) in which case it could * already be running on another CPU. */ - trace_napi_poll(napi, rc, BUSY_POLL_BUDGET); + trace_napi_poll(napi, rc, budget); netpoll_poll_unlock(have_poll_lock); - if (rc == BUSY_POLL_BUDGET) + if (rc == budget) __busy_poll_stop(napi, skip_schedule); local_bh_enable(); } void napi_busy_loop(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), - void *loop_end_arg, bool prefer_busy_poll) + void *loop_end_arg, bool prefer_busy_poll, u16 budget) { unsigned long start_time = loop_end ? busy_loop_current_time() : 0; int (*napi_poll)(struct napi_struct *napi, int budget); @@ -6606,8 +6605,8 @@ restart: have_poll_lock = netpoll_poll_lock(napi); napi_poll = napi->poll; } - work = napi_poll(napi, BUSY_POLL_BUDGET); - trace_napi_poll(napi, work, BUSY_POLL_BUDGET); + work = napi_poll(napi, budget); + trace_napi_poll(napi, work, budget); gro_normal_list(napi); count: if (work > 0) @@ -6620,7 +6619,7 @@ count: if (unlikely(need_resched())) { if (napi_poll) - busy_poll_stop(napi, have_poll_lock, prefer_busy_poll); + busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget); preempt_enable(); rcu_read_unlock(); cond_resched(); @@ -6631,7 +6630,7 @@ count: cpu_relax(); } if (napi_poll) - busy_poll_stop(napi, have_poll_lock, prefer_busy_poll); + busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget); preempt_enable(); out: rcu_read_unlock(); diff --git a/net/core/sock.c b/net/core/sock.c index e05f2e52b5a8..d422a6808405 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1165,6 +1165,16 @@ set_sndbuf: else WRITE_ONCE(sk->sk_prefer_busy_poll, valbool); break; + case SO_BUSY_POLL_BUDGET: + if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) { + ret = -EPERM; + } else { + if (val < 0 || val > U16_MAX) + ret = -EINVAL; + else + WRITE_ONCE(sk->sk_busy_poll_budget, val); + } + break; #endif case SO_MAX_PACING_RATE: -- cgit v1.2.3 From b02e5a0ebb172c8276cea3151942aac681f7a4a6 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Mon, 30 Nov 2020 19:52:01 +0100 Subject: xsk: Propagate napi_id to XDP socket Rx path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add napi_id to the xdp_rxq_info structure, and make sure the XDP socket pick up the napi_id in the Rx path. The napi_id is used to find the corresponding NAPI structure for socket busy polling. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Acked-by: Ilias Apalodimas Acked-by: Michael S. Tsirkin Acked-by: Tariq Toukan Link: https://lore.kernel.org/bpf/20201130185205.196029-7-bjorn.topel@gmail.com --- drivers/net/ethernet/amazon/ena/ena_netdev.c | 2 +- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 +- drivers/net/ethernet/cavium/thunder/nicvf_queues.c | 2 +- drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 2 +- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 2 +- drivers/net/ethernet/intel/ice/ice_base.c | 4 ++-- drivers/net/ethernet/intel/ice/ice_txrx.c | 2 +- drivers/net/ethernet/intel/igb/igb_main.c | 2 +- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 2 +- drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 2 +- drivers/net/ethernet/marvell/mvneta.c | 2 +- drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 4 ++-- drivers/net/ethernet/mellanox/mlx4/en_rx.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 2 +- drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 2 +- drivers/net/ethernet/qlogic/qede/qede_main.c | 2 +- drivers/net/ethernet/sfc/rx_common.c | 2 +- drivers/net/ethernet/socionext/netsec.c | 2 +- drivers/net/ethernet/ti/cpsw_priv.c | 2 +- drivers/net/hyperv/netvsc.c | 2 +- drivers/net/tun.c | 2 +- drivers/net/veth.c | 12 ++++++++---- drivers/net/virtio_net.c | 2 +- drivers/net/xen-netfront.c | 2 +- include/net/busy_poll.h | 19 +++++++++++++++---- include/net/xdp.h | 3 ++- net/core/dev.c | 2 +- net/core/xdp.c | 3 ++- net/xdp/xsk.c | 1 + 29 files changed, 54 insertions(+), 36 deletions(-) (limited to 'net/core/dev.c') diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c index e8131dadc22c..6ad59f0068f6 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.c +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c @@ -416,7 +416,7 @@ static int ena_xdp_register_rxq_info(struct ena_ring *rx_ring) { int rc; - rc = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, rx_ring->qid); + rc = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, rx_ring->qid, 0); if (rc) { netif_err(rx_ring->adapter, ifup, rx_ring->netdev, diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 7975f59735d6..725d929eddb1 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -2884,7 +2884,7 @@ static int bnxt_alloc_rx_rings(struct bnxt *bp) if (rc) return rc; - rc = xdp_rxq_info_reg(&rxr->xdp_rxq, bp->dev, i); + rc = xdp_rxq_info_reg(&rxr->xdp_rxq, bp->dev, i, 0); if (rc < 0) return rc; diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c index 7a141ce32e86..f782e6af45e9 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c @@ -770,7 +770,7 @@ static void nicvf_rcv_queue_config(struct nicvf *nic, struct queue_set *qs, rq->caching = 1; /* Driver have no proper error path for failed XDP RX-queue info reg */ - WARN_ON(xdp_rxq_info_reg(&rq->xdp_rxq, nic->netdev, qidx) < 0); + WARN_ON(xdp_rxq_info_reg(&rq->xdp_rxq, nic->netdev, qidx, 0) < 0); /* Send a mailbox msg to PF to config RQ */ mbx.rq.msg = NIC_MBOX_MSG_RQ_CFG; diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index cf9400a9886d..40953980e846 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@ -3334,7 +3334,7 @@ static int dpaa2_eth_setup_rx_flow(struct dpaa2_eth_priv *priv, return 0; err = xdp_rxq_info_reg(&fq->channel->xdp_rxq, priv->net_dev, - fq->flowid); + fq->flowid, 0); if (err) { dev_err(dev, "xdp_rxq_info_reg failed\n"); return err; diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index c21548c71bb1..9f73cd7aee09 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -1447,7 +1447,7 @@ int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring) /* XDP RX-queue info only needed for RX rings exposed to XDP */ if (rx_ring->vsi->type == I40E_VSI_MAIN) { err = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, - rx_ring->queue_index); + rx_ring->queue_index, rx_ring->q_vector->napi.napi_id); if (err < 0) return err; } diff --git a/drivers/net/ethernet/intel/ice/ice_base.c b/drivers/net/ethernet/intel/ice/ice_base.c index fe4320e2d1f2..3124a3bf519a 100644 --- a/drivers/net/ethernet/intel/ice/ice_base.c +++ b/drivers/net/ethernet/intel/ice/ice_base.c @@ -306,7 +306,7 @@ int ice_setup_rx_ctx(struct ice_ring *ring) if (!xdp_rxq_info_is_reg(&ring->xdp_rxq)) /* coverity[check_return] */ xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev, - ring->q_index); + ring->q_index, ring->q_vector->napi.napi_id); ring->xsk_pool = ice_xsk_pool(ring); if (ring->xsk_pool) { @@ -333,7 +333,7 @@ int ice_setup_rx_ctx(struct ice_ring *ring) /* coverity[check_return] */ xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev, - ring->q_index); + ring->q_index, ring->q_vector->napi.napi_id); err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, MEM_TYPE_PAGE_SHARED, diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index eae75260fe20..77d5eae6b4c2 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -483,7 +483,7 @@ int ice_setup_rx_ring(struct ice_ring *rx_ring) if (rx_ring->vsi->type == ICE_VSI_PF && !xdp_rxq_info_is_reg(&rx_ring->xdp_rxq)) if (xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, - rx_ring->q_index)) + rx_ring->q_index, rx_ring->q_vector->napi.napi_id)) goto err; return 0; diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 5fc2c381da55..6a4ef4934fcf 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -4352,7 +4352,7 @@ int igb_setup_rx_resources(struct igb_ring *rx_ring) /* XDP RX-queue info */ if (xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, - rx_ring->queue_index) < 0) + rx_ring->queue_index, 0) < 0) goto err; return 0; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 45ae33e15303..50e6b8b6ba7b 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -6577,7 +6577,7 @@ int ixgbe_setup_rx_resources(struct ixgbe_adapter *adapter, /* XDP RX-queue info */ if (xdp_rxq_info_reg(&rx_ring->xdp_rxq, adapter->netdev, - rx_ring->queue_index) < 0) + rx_ring->queue_index, rx_ring->q_vector->napi.napi_id) < 0) goto err; rx_ring->xdp_prog = adapter->xdp_prog; diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index 82fce27f682b..4061cd7db5dd 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -3493,7 +3493,7 @@ int ixgbevf_setup_rx_resources(struct ixgbevf_adapter *adapter, /* XDP RX-queue info */ if (xdp_rxq_info_reg(&rx_ring->xdp_rxq, adapter->netdev, - rx_ring->queue_index) < 0) + rx_ring->queue_index, 0) < 0) goto err; rx_ring->xdp_prog = adapter->xdp_prog; diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index 183530ed4d1d..ba6dcb19bb1d 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -3227,7 +3227,7 @@ static int mvneta_create_page_pool(struct mvneta_port *pp, return err; } - err = xdp_rxq_info_reg(&rxq->xdp_rxq, pp->dev, rxq->id); + err = xdp_rxq_info_reg(&rxq->xdp_rxq, pp->dev, rxq->id, 0); if (err < 0) goto err_free_pp; diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index 3069e192d773..5504cbc24970 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -2614,11 +2614,11 @@ static int mvpp2_rxq_init(struct mvpp2_port *port, mvpp2_rxq_status_update(port, rxq->id, 0, rxq->size); if (priv->percpu_pools) { - err = xdp_rxq_info_reg(&rxq->xdp_rxq_short, port->dev, rxq->id); + err = xdp_rxq_info_reg(&rxq->xdp_rxq_short, port->dev, rxq->id, 0); if (err < 0) goto err_free_dma; - err = xdp_rxq_info_reg(&rxq->xdp_rxq_long, port->dev, rxq->id); + err = xdp_rxq_info_reg(&rxq->xdp_rxq_long, port->dev, rxq->id, 0); if (err < 0) goto err_unregister_rxq_short; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index b0f79a5151cf..40775cb8fb2a 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -283,7 +283,7 @@ int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv, ring->log_stride = ffs(ring->stride) - 1; ring->buf_size = ring->size * ring->stride + TXBB_SIZE; - if (xdp_rxq_info_reg(&ring->xdp_rxq, priv->dev, queue_index) < 0) + if (xdp_rxq_info_reg(&ring->xdp_rxq, priv->dev, queue_index, 0) < 0) goto err_ring; tmp = size * roundup_pow_of_two(MLX4_EN_MAX_RX_FRAGS * diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 527c5f12c5af..427fc376fe1a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -434,7 +434,7 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, rq_xdp_ix = rq->ix; if (xsk) rq_xdp_ix += params->num_channels * MLX5E_RQ_GROUP_XSK; - err = xdp_rxq_info_reg(&rq->xdp_rxq, rq->netdev, rq_xdp_ix); + err = xdp_rxq_info_reg(&rq->xdp_rxq, rq->netdev, rq_xdp_ix, 0); if (err < 0) goto err_rq_xdp_prog; diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index b150da43adb2..b4acf2f41e84 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -2533,7 +2533,7 @@ nfp_net_rx_ring_alloc(struct nfp_net_dp *dp, struct nfp_net_rx_ring *rx_ring) if (dp->netdev) { err = xdp_rxq_info_reg(&rx_ring->xdp_rxq, dp->netdev, - rx_ring->idx); + rx_ring->idx, rx_ring->r_vec->napi.napi_id); if (err < 0) return err; } diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c index 05e3a3b60269..9cf960a6d007 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_main.c +++ b/drivers/net/ethernet/qlogic/qede/qede_main.c @@ -1762,7 +1762,7 @@ static void qede_init_fp(struct qede_dev *edev) /* Driver have no error path from here */ WARN_ON(xdp_rxq_info_reg(&fp->rxq->xdp_rxq, edev->ndev, - fp->rxq->rxq_id) < 0); + fp->rxq->rxq_id, 0) < 0); if (xdp_rxq_info_reg_mem_model(&fp->rxq->xdp_rxq, MEM_TYPE_PAGE_ORDER0, diff --git a/drivers/net/ethernet/sfc/rx_common.c b/drivers/net/ethernet/sfc/rx_common.c index 19cf7cac1e6e..68fc7d317693 100644 --- a/drivers/net/ethernet/sfc/rx_common.c +++ b/drivers/net/ethernet/sfc/rx_common.c @@ -262,7 +262,7 @@ void efx_init_rx_queue(struct efx_rx_queue *rx_queue) /* Initialise XDP queue information */ rc = xdp_rxq_info_reg(&rx_queue->xdp_rxq_info, efx->net_dev, - rx_queue->core_index); + rx_queue->core_index, 0); if (rc) { netif_err(efx, rx_err, efx->net_dev, diff --git a/drivers/net/ethernet/socionext/netsec.c b/drivers/net/ethernet/socionext/netsec.c index 1503cc9ec6e2..27d3c9d9210e 100644 --- a/drivers/net/ethernet/socionext/netsec.c +++ b/drivers/net/ethernet/socionext/netsec.c @@ -1304,7 +1304,7 @@ static int netsec_setup_rx_dring(struct netsec_priv *priv) goto err_out; } - err = xdp_rxq_info_reg(&dring->xdp_rxq, priv->ndev, 0); + err = xdp_rxq_info_reg(&dring->xdp_rxq, priv->ndev, 0, priv->napi.napi_id); if (err) goto err_out; diff --git a/drivers/net/ethernet/ti/cpsw_priv.c b/drivers/net/ethernet/ti/cpsw_priv.c index 31c5e36ff706..6dd73bd0f458 100644 --- a/drivers/net/ethernet/ti/cpsw_priv.c +++ b/drivers/net/ethernet/ti/cpsw_priv.c @@ -1186,7 +1186,7 @@ static int cpsw_ndev_create_xdp_rxq(struct cpsw_priv *priv, int ch) pool = cpsw->page_pool[ch]; rxq = &priv->xdp_rxq[ch]; - ret = xdp_rxq_info_reg(rxq, priv->ndev, ch); + ret = xdp_rxq_info_reg(rxq, priv->ndev, ch, 0); if (ret) return ret; diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index 0c3de94b5178..fa8341f8359a 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -1499,7 +1499,7 @@ struct netvsc_device *netvsc_device_add(struct hv_device *device, u64_stats_init(&nvchan->tx_stats.syncp); u64_stats_init(&nvchan->rx_stats.syncp); - ret = xdp_rxq_info_reg(&nvchan->xdp_rxq, ndev, i); + ret = xdp_rxq_info_reg(&nvchan->xdp_rxq, ndev, i, 0); if (ret) { netdev_err(ndev, "xdp_rxq_info_reg fail: %d\n", ret); diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 3d45d56172cb..8867d39db6ac 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -780,7 +780,7 @@ static int tun_attach(struct tun_struct *tun, struct file *file, } else { /* Setup XDP RX-queue info, for new tfile getting attached */ err = xdp_rxq_info_reg(&tfile->xdp_rxq, - tun->dev, tfile->queue_index); + tun->dev, tfile->queue_index, 0); if (err < 0) goto out; err = xdp_rxq_info_reg_mem_model(&tfile->xdp_rxq, diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 8c737668008a..9bd37c7151f8 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -884,7 +884,6 @@ static int veth_napi_add(struct net_device *dev) for (i = 0; i < dev->real_num_rx_queues; i++) { struct veth_rq *rq = &priv->rq[i]; - netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT); napi_enable(&rq->xdp_napi); } @@ -926,7 +925,8 @@ static int veth_enable_xdp(struct net_device *dev) for (i = 0; i < dev->real_num_rx_queues; i++) { struct veth_rq *rq = &priv->rq[i]; - err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i); + netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT); + err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i, rq->xdp_napi.napi_id); if (err < 0) goto err_rxq_reg; @@ -952,8 +952,12 @@ static int veth_enable_xdp(struct net_device *dev) err_reg_mem: xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq); err_rxq_reg: - for (i--; i >= 0; i--) - xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq); + for (i--; i >= 0; i--) { + struct veth_rq *rq = &priv->rq[i]; + + xdp_rxq_info_unreg(&rq->xdp_rxq); + netif_napi_del(&rq->xdp_napi); + } return err; } diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 21b71148c532..052975ea0af4 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -1485,7 +1485,7 @@ static int virtnet_open(struct net_device *dev) if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL)) schedule_delayed_work(&vi->refill, 0); - err = xdp_rxq_info_reg(&vi->rq[i].xdp_rxq, dev, i); + err = xdp_rxq_info_reg(&vi->rq[i].xdp_rxq, dev, i, vi->rq[i].napi.napi_id); if (err < 0) return err; diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index 920cac4385bf..b01848ef4649 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -2014,7 +2014,7 @@ static int xennet_create_page_pool(struct netfront_queue *queue) } err = xdp_rxq_info_reg(&queue->xdp_rxq, queue->info->netdev, - queue->id); + queue->id, 0); if (err) { netdev_err(queue->info->netdev, "xdp_rxq_info_reg failed\n"); goto err_free_pp; diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index 2f8f51807b83..45b3e04b99d3 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -135,14 +135,25 @@ static inline void sk_mark_napi_id(struct sock *sk, const struct sk_buff *skb) sk_rx_queue_set(sk, skb); } -/* variant used for unconnected sockets */ -static inline void sk_mark_napi_id_once(struct sock *sk, - const struct sk_buff *skb) +static inline void __sk_mark_napi_id_once_xdp(struct sock *sk, unsigned int napi_id) { #ifdef CONFIG_NET_RX_BUSY_POLL if (!READ_ONCE(sk->sk_napi_id)) - WRITE_ONCE(sk->sk_napi_id, skb->napi_id); + WRITE_ONCE(sk->sk_napi_id, napi_id); #endif } +/* variant used for unconnected sockets */ +static inline void sk_mark_napi_id_once(struct sock *sk, + const struct sk_buff *skb) +{ + __sk_mark_napi_id_once_xdp(sk, skb->napi_id); +} + +static inline void sk_mark_napi_id_once_xdp(struct sock *sk, + const struct xdp_buff *xdp) +{ + __sk_mark_napi_id_once_xdp(sk, xdp->rxq->napi_id); +} + #endif /* _LINUX_NET_BUSY_POLL_H */ diff --git a/include/net/xdp.h b/include/net/xdp.h index 7d48b2ae217a..700ad5db7f5d 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -59,6 +59,7 @@ struct xdp_rxq_info { u32 queue_index; u32 reg_state; struct xdp_mem_info mem; + unsigned int napi_id; } ____cacheline_aligned; /* perf critical, avoid false-sharing */ struct xdp_txq_info { @@ -226,7 +227,7 @@ static inline void xdp_release_frame(struct xdp_frame *xdpf) } int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, - struct net_device *dev, u32 queue_index); + struct net_device *dev, u32 queue_index, unsigned int napi_id); void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq); void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq); bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq); diff --git a/net/core/dev.c b/net/core/dev.c index 7a1e5936c67f..3b6b0e175fe7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -9810,7 +9810,7 @@ static int netif_alloc_rx_queues(struct net_device *dev) rx[i].dev = dev; /* XDP RX-queue setup */ - err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i); + err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i, 0); if (err < 0) goto err_rxq_info; } diff --git a/net/core/xdp.c b/net/core/xdp.c index 3d330ebda893..17ffd33c6b18 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -158,7 +158,7 @@ static void xdp_rxq_info_init(struct xdp_rxq_info *xdp_rxq) /* Returns 0 on success, negative on failure */ int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, - struct net_device *dev, u32 queue_index) + struct net_device *dev, u32 queue_index, unsigned int napi_id) { if (xdp_rxq->reg_state == REG_STATE_UNUSED) { WARN(1, "Driver promised not to register this"); @@ -179,6 +179,7 @@ int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, xdp_rxq_info_init(xdp_rxq); xdp_rxq->dev = dev; xdp_rxq->queue_index = queue_index; + xdp_rxq->napi_id = napi_id; xdp_rxq->reg_state = REG_STATE_REGISTERED; return 0; diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index a8501a5477cf..7588e599a048 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -233,6 +233,7 @@ static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) return -EINVAL; + sk_mark_napi_id_once_xdp(&xs->sk, xdp); len = xdp->data_end - xdp->data; return xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL ? -- cgit v1.2.3 From c214550ff8eae9623605149fa4e3da3ba43df145 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 29 Nov 2020 22:05:50 +0200 Subject: net: delete __dev_getfirstbyhwtype The last user of the RTNL brother of dev_getfirstbyhwtype (the latter being synchronized under RCU) has been deleted in commit b4db2b35fc44 ("afs: Use core kernel UUID generation"). Cc: Arnd Bergmann Cc: David Howells Cc: Eric Dumazet Signed-off-by: Vladimir Oltean Link: https://lore.kernel.org/r/20201129200550.2433401-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 - net/core/dev.c | 13 ------------- 2 files changed, 14 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 8eeb73ac58bd..2e077e289e75 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2815,7 +2815,6 @@ unsigned long netdev_boot_base(const char *prefix, int unit); struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, const char *hwaddr); struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type); -struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type); void dev_add_pack(struct packet_type *pt); void dev_remove_pack(struct packet_type *pt); void __dev_remove_pack(struct packet_type *pt); diff --git a/net/core/dev.c b/net/core/dev.c index 51b263076124..b5130fd1cdaa 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1069,19 +1069,6 @@ struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, } EXPORT_SYMBOL(dev_getbyhwaddr_rcu); -struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type) -{ - struct net_device *dev; - - ASSERT_RTNL(); - for_each_netdev(net, dev) - if (dev->type == type) - return dev; - - return NULL; -} -EXPORT_SYMBOL(__dev_getfirstbyhwtype); - struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) { struct net_device *dev, *ret = NULL; -- cgit v1.2.3 From ae0b04b238e283cafd906cdc3489cf5dc9a825cf Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Sun, 13 Dec 2020 16:39:29 +0200 Subject: net: Disable NETIF_F_HW_TLS_TX when HW_CSUM is disabled With NETIF_F_HW_TLS_TX packets are encrypted in HW. This cannot be logically done when HW_CSUM offload is off. Fixes: 2342a8512a1e ("net: Add TLS TX offload features") Signed-off-by: Tariq Toukan Reviewed-by: Boris Pismenny Link: https://lore.kernel.org/r/20201213143929.26253-1-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- Documentation/networking/tls-offload.rst | 8 +++++++- net/core/dev.c | 5 +++++ 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/Documentation/networking/tls-offload.rst b/Documentation/networking/tls-offload.rst index 37773da2bee5..0f55c6d540f9 100644 --- a/Documentation/networking/tls-offload.rst +++ b/Documentation/networking/tls-offload.rst @@ -524,7 +524,13 @@ on TCP retransmissions to handle corner cases is not acceptable. TLS device features ------------------- -Drivers should ignore the changes to TLS the device feature flags. +Drivers should ignore the changes to the TLS device feature flags. These flags will be acted upon accordingly by the core ``ktls`` code. TLS device feature flags only control adding of new TLS connection offloads, old connections will remain active after flags are cleared. + +TLS encryption cannot be offloaded to devices without checksum calculation +offload. Hence, TLS TX device feature flag requires NETIF_F_HW_CSUM being set. +Disabling the latter implies clearing the former. Disabling TX checksum offload +should not affect old connections, and drivers should make sure checksum +calculation does not break for them. diff --git a/net/core/dev.c b/net/core/dev.c index bde98cfd166f..0fd0d4eb678c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -9643,6 +9643,11 @@ static netdev_features_t netdev_fix_features(struct net_device *dev, } } + if ((features & NETIF_F_HW_TLS_TX) && !(features & NETIF_F_HW_CSUM)) { + netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n"); + features &= ~NETIF_F_HW_TLS_TX; + } + return features; } -- cgit v1.2.3