From b253a0680ceadc5d7b4acca7aa2d870326cad8ad Mon Sep 17 00:00:00 2001 From: Pengcheng Yang Date: Wed, 20 Apr 2022 10:34:41 +0800 Subject: tcp: ensure to use the most recently sent skb when filling the rate sample If an ACK (s)acks multiple skbs, we favor the information from the most recently sent skb by choosing the skb with the highest prior_delivered count. But in the interval between receiving ACKs, we send multiple skbs with the same prior_delivered, because the tp->delivered only changes when we receive an ACK. We used RACK's solution, copying tcp_rack_sent_after() as tcp_skb_sent_after() helper to determine "which packet was sent last?". Later, we will use tcp_skb_sent_after() instead in RACK. Fixes: b9f64820fb22 ("tcp: track data delivery rate for a TCP connection") Signed-off-by: Pengcheng Yang Cc: Paolo Abeni Acked-by: Neal Cardwell Tested-by: Neal Cardwell Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/1650422081-22153-1-git-send-email-yangpc@wangsu.com Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 70ca4a5e330a..be712fb9ddd7 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1042,6 +1042,7 @@ struct rate_sample { int losses; /* number of packets marked lost upon ACK */ u32 acked_sacked; /* number of packets newly (S)ACKed upon ACK */ u32 prior_in_flight; /* in flight before this ACK */ + u32 last_end_seq; /* end_seq of most recently ACKed packet */ bool is_app_limited; /* is sample from packet with bubble in pipe? */ bool is_retrans; /* is sample from retransmission? */ bool is_ack_delayed; /* is this (likely) a delayed ACK? */ @@ -1164,6 +1165,11 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, bool is_sack_reneg, struct rate_sample *rs); void tcp_rate_check_app_limited(struct sock *sk); +static inline bool tcp_skb_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) +{ + return t1 > t2 || (t1 == t2 && after(seq1, seq2)); +} + /* These functions determine how the current flow behaves in respect of SACK * handling. SACK is negotiated with the peer, and therefore it can vary * between different flows. -- cgit v1.2.3 From 31c417c948d7f6909cb63f0ac3298f3c38f8ce20 Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Thu, 21 Apr 2022 15:09:02 -0700 Subject: ip_gre, ip6_gre: Fix race condition on o_seqno in collect_md mode As pointed out by Jakub Kicinski, currently using TUNNEL_SEQ in collect_md mode is racy for [IP6]GRE[TAP] devices. Consider the following sequence of events: 1. An [IP6]GRE[TAP] device is created in collect_md mode using "ip link add ... external". "ip" ignores "[o]seq" if "external" is specified, so TUNNEL_SEQ is off, and the device is marked as NETIF_F_LLTX (i.e. it uses lockless TX); 2. Someone sets TUNNEL_SEQ on outgoing skb's, using e.g. bpf_skb_set_tunnel_key() in an eBPF program attached to this device; 3. gre_fb_xmit() or __gre6_xmit() processes these skb's: gre_build_header(skb, tun_hlen, flags, protocol, tunnel_id_to_key32(tun_info->key.tun_id), (flags & TUNNEL_SEQ) ? htonl(tunnel->o_seqno++) : 0); ^^^^^^^^^^^^^^^^^ Since we are not using the TX lock (&txq->_xmit_lock), multiple CPUs may try to do this tunnel->o_seqno++ in parallel, which is racy. Fix it by making o_seqno atomic_t. As mentioned by Eric Dumazet in commit b790e01aee74 ("ip_gre: lockless xmit"), making o_seqno atomic_t increases "chance for packets being out of order at receiver" when NETIF_F_LLTX is on. Maybe a better fix would be: 1. Do not ignore "oseq" in external mode. Users MUST specify "oseq" if they want the kernel to allow sequencing of outgoing packets; 2. Reject all outgoing TUNNEL_SEQ packets if the device was not created with "oseq". Unfortunately, that would break userspace. We could now make [IP6]GRE[TAP] devices always NETIF_F_LLTX, but let us do it in separate patches to keep this fix minimal. Suggested-by: Jakub Kicinski Fixes: 77a5196a804e ("gre: add sequence number for collect md mode.") Signed-off-by: Peilin Ye Acked-by: William Tu Signed-off-by: David S. Miller --- include/net/ip6_tunnel.h | 2 +- include/net/ip_tunnels.h | 2 +- net/ipv4/ip_gre.c | 6 +++--- net/ipv6/ip6_gre.c | 7 ++++--- 4 files changed, 9 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h index a38c4f1e4e5c..74b369bddf49 100644 --- a/include/net/ip6_tunnel.h +++ b/include/net/ip6_tunnel.h @@ -58,7 +58,7 @@ struct ip6_tnl { /* These fields used only by GRE */ __u32 i_seqno; /* The last seen seqno */ - __u32 o_seqno; /* The last output seqno */ + atomic_t o_seqno; /* The last output seqno */ int hlen; /* tun_hlen + encap_hlen */ int tun_hlen; /* Precalculated header length */ int encap_hlen; /* Encap header length (FOU,GUE) */ diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 88dee57eac8a..c24fa934221d 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -116,7 +116,7 @@ struct ip_tunnel { /* These four fields used only by GRE */ u32 i_seqno; /* The last seen seqno */ - u32 o_seqno; /* The last output seqno */ + atomic_t o_seqno; /* The last output seqno */ int tun_hlen; /* Precalculated header length */ /* These four fields used only by ERSPAN */ diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 21a8943f6fa4..aacee9dd771b 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -464,7 +464,7 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev, /* Push GRE header. */ gre_build_header(skb, tunnel->tun_hlen, flags, proto, tunnel->parms.o_key, - (flags & TUNNEL_SEQ) ? htonl(tunnel->o_seqno++) : 0); + (flags & TUNNEL_SEQ) ? htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0); ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol); } @@ -502,7 +502,7 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev, (TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ); gre_build_header(skb, tunnel_hlen, flags, proto, tunnel_id_to_key32(tun_info->key.tun_id), - (flags & TUNNEL_SEQ) ? htonl(tunnel->o_seqno++) : 0); + (flags & TUNNEL_SEQ) ? htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0); ip_md_tunnel_xmit(skb, dev, IPPROTO_GRE, tunnel_hlen); @@ -579,7 +579,7 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev) } gre_build_header(skb, 8, TUNNEL_SEQ, - proto, 0, htonl(tunnel->o_seqno++)); + proto, 0, htonl(atomic_fetch_inc(&tunnel->o_seqno))); ip_md_tunnel_xmit(skb, dev, IPPROTO_GRE, tunnel_hlen); diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index d9e4ac94eab4..5136959b3dc5 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -766,7 +766,7 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, gre_build_header(skb, tun_hlen, flags, protocol, tunnel_id_to_key32(tun_info->key.tun_id), - (flags & TUNNEL_SEQ) ? htonl(tunnel->o_seqno++) + (flags & TUNNEL_SEQ) ? htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0); } else { @@ -777,7 +777,8 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, gre_build_header(skb, tunnel->tun_hlen, flags, protocol, tunnel->parms.o_key, - (flags & TUNNEL_SEQ) ? htonl(tunnel->o_seqno++) : 0); + (flags & TUNNEL_SEQ) ? htonl(atomic_fetch_inc(&tunnel->o_seqno)) + : 0); } return ip6_tnl_xmit(skb, dev, dsfield, fl6, encap_limit, pmtu, @@ -1055,7 +1056,7 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, /* Push GRE header. */ proto = (t->parms.erspan_ver == 1) ? htons(ETH_P_ERSPAN) : htons(ETH_P_ERSPAN2); - gre_build_header(skb, 8, TUNNEL_SEQ, proto, 0, htonl(t->o_seqno++)); + gre_build_header(skb, 8, TUNNEL_SEQ, proto, 0, htonl(atomic_fetch_inc(&t->o_seqno))); /* TooBig packet may have updated dst->dev's mtu */ if (!t->parms.collect_md && dst && dst_mtu(dst) > dst->dev->mtu) -- cgit v1.2.3 From 4bfe744ff1644fbc0a991a2677dc874475dd6776 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 24 Apr 2022 17:34:07 -0700 Subject: tcp: fix potential xmit stalls caused by TCP_NOTSENT_LOWAT I had this bug sitting for too long in my pile, it is time to fix it. Thanks to Doug Porter for reminding me of it! We had various attempts in the past, including commit 0cbe6a8f089e ("tcp: remove SOCK_QUEUE_SHRUNK"), but the issue is that TCP stack currently only generates EPOLLOUT from input path, when tp->snd_una has advanced and skb(s) cleaned from rtx queue. If a flow has a big RTT, and/or receives SACKs, it is possible that the notsent part (tp->write_seq - tp->snd_nxt) reaches 0 and no more data can be sent until tp->snd_una finally advances. What is needed is to also check if POLLOUT needs to be generated whenever tp->snd_nxt is advanced, from output path. This bug triggers more often after an idle period, as we do not receive ACK for at least one RTT. tcp_notsent_lowat could be a fraction of what CWND and pacing rate would allow to send during this RTT. In a followup patch, I will remove the bogus call to tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED) from tcp_check_space(). Fact that we have decided to generate an EPOLLOUT does not mean the application has immediately refilled the transmit queue. This optimistic call might have been the reason the bug seemed not too serious. Tested: 200 ms rtt, 1% packet loss, 32 MB tcp_rmem[2] and tcp_wmem[2] $ echo 500000 >/proc/sys/net/ipv4/tcp_notsent_lowat $ cat bench_rr.sh SUM=0 for i in {1..10} do V=`netperf -H remote_host -l30 -t TCP_RR -- -r 10000000,10000 -o LOCAL_BYTES_SENT | egrep -v "MIGRATED|Bytes"` echo $V SUM=$(($SUM + $V)) done echo SUM=$SUM Before patch: $ bench_rr.sh 130000000 80000000 140000000 140000000 140000000 140000000 130000000 40000000 90000000 110000000 SUM=1140000000 After patch: $ bench_rr.sh 430000000 590000000 530000000 450000000 450000000 350000000 450000000 490000000 480000000 460000000 SUM=4680000000 # This is 410 % of the value before patch. Fixes: c9bee3b7fdec ("tcp: TCP_NOTSENT_LOWAT socket option") Signed-off-by: Eric Dumazet Reported-by: Doug Porter Cc: Soheil Hassas Yeganeh Cc: Neal Cardwell Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/net/tcp.h | 1 + net/ipv4/tcp_input.c | 12 +++++++++++- net/ipv4/tcp_output.c | 1 + 3 files changed, 13 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index be712fb9ddd7..b99d9d9cbd99 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -620,6 +620,7 @@ void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req); void tcp_reset(struct sock *sk, struct sk_buff *skb); void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb); void tcp_fin(struct sock *sk); +void tcp_check_space(struct sock *sk); /* tcp_timer.c */ void tcp_init_xmit_timers(struct sock *); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2088f93fa37b..48f607522860 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5454,7 +5454,17 @@ static void tcp_new_space(struct sock *sk) INDIRECT_CALL_1(sk->sk_write_space, sk_stream_write_space, sk); } -static void tcp_check_space(struct sock *sk) +/* Caller made space either from: + * 1) Freeing skbs in rtx queues (after tp->snd_una has advanced) + * 2) Sent skbs from output queue (and thus advancing tp->snd_nxt) + * + * We might be able to generate EPOLLOUT to the application if: + * 1) Space consumed in output/rtx queues is below sk->sk_sndbuf/2 + * 2) notsent amount (tp->write_seq - tp->snd_nxt) became + * small enough that tcp_stream_memory_free() decides it + * is time to generate EPOLLOUT. + */ +void tcp_check_space(struct sock *sk) { /* pairs with tcp_poll() */ smp_mb(); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 9ede847f4199..1ca2f28c9981 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -82,6 +82,7 @@ static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb) NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT, tcp_skb_pcount(skb)); + tcp_check_space(sk); } /* SND.NXT, if window was not shrunk or the amount of shrunk was less than one -- cgit v1.2.3 From ba5a4fdd63ae0c575707030db0b634b160baddd7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 24 Apr 2022 13:35:09 -0700 Subject: tcp: make sure treq->af_specific is initialized syzbot complained about a recent change in TCP stack, hitting a NULL pointer [1] tcp request sockets have an af_specific pointer, which was used before the blamed change only for SYNACK generation in non SYNCOOKIE mode. tcp requests sockets momentarily created when third packet coming from client in SYNCOOKIE mode were not using treq->af_specific. Make sure this field is populated, in the same way normal TCP requests sockets do in tcp_conn_request(). [1] TCP: request_sock_TCPv6: Possible SYN flooding on port 20002. Sending cookies. Check SNMP counters. general protection fault, probably for non-canonical address 0xdffffc0000000001: 0000 [#1] PREEMPT SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000008-0x000000000000000f] CPU: 1 PID: 3695 Comm: syz-executor864 Not tainted 5.18.0-rc3-syzkaller-00224-g5fd1fe4807f9 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:tcp_create_openreq_child+0xe16/0x16b0 net/ipv4/tcp_minisocks.c:534 Code: 48 c1 ea 03 80 3c 02 00 0f 85 e5 07 00 00 4c 8b b3 28 01 00 00 48 b8 00 00 00 00 00 fc ff df 49 8d 7e 08 48 89 fa 48 c1 ea 03 <80> 3c 02 00 0f 85 c9 07 00 00 48 8b 3c 24 48 89 de 41 ff 56 08 48 RSP: 0018:ffffc90000de0588 EFLAGS: 00010202 RAX: dffffc0000000000 RBX: ffff888076490330 RCX: 0000000000000100 RDX: 0000000000000001 RSI: ffffffff87d67ff0 RDI: 0000000000000008 RBP: ffff88806ee1c7f8 R08: 0000000000000000 R09: 0000000000000000 R10: ffffffff87d67f00 R11: 0000000000000000 R12: ffff88806ee1bfc0 R13: ffff88801b0e0368 R14: 0000000000000000 R15: 0000000000000000 FS: 00007f517fe58700(0000) GS:ffff8880b9d00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007ffcead76960 CR3: 000000006f97b000 CR4: 00000000003506e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: tcp_v6_syn_recv_sock+0x199/0x23b0 net/ipv6/tcp_ipv6.c:1267 tcp_get_cookie_sock+0xc9/0x850 net/ipv4/syncookies.c:207 cookie_v6_check+0x15c3/0x2340 net/ipv6/syncookies.c:258 tcp_v6_cookie_check net/ipv6/tcp_ipv6.c:1131 [inline] tcp_v6_do_rcv+0x1148/0x13b0 net/ipv6/tcp_ipv6.c:1486 tcp_v6_rcv+0x3305/0x3840 net/ipv6/tcp_ipv6.c:1725 ip6_protocol_deliver_rcu+0x2e9/0x1900 net/ipv6/ip6_input.c:422 ip6_input_finish+0x14c/0x2c0 net/ipv6/ip6_input.c:464 NF_HOOK include/linux/netfilter.h:307 [inline] NF_HOOK include/linux/netfilter.h:301 [inline] ip6_input+0x9c/0xd0 net/ipv6/ip6_input.c:473 dst_input include/net/dst.h:461 [inline] ip6_rcv_finish net/ipv6/ip6_input.c:76 [inline] NF_HOOK include/linux/netfilter.h:307 [inline] NF_HOOK include/linux/netfilter.h:301 [inline] ipv6_rcv+0x27f/0x3b0 net/ipv6/ip6_input.c:297 __netif_receive_skb_one_core+0x114/0x180 net/core/dev.c:5405 __netif_receive_skb+0x24/0x1b0 net/core/dev.c:5519 process_backlog+0x3a0/0x7c0 net/core/dev.c:5847 __napi_poll+0xb3/0x6e0 net/core/dev.c:6413 napi_poll net/core/dev.c:6480 [inline] net_rx_action+0x8ec/0xc60 net/core/dev.c:6567 __do_softirq+0x29b/0x9c2 kernel/softirq.c:558 invoke_softirq kernel/softirq.c:432 [inline] __irq_exit_rcu+0x123/0x180 kernel/softirq.c:637 irq_exit_rcu+0x5/0x20 kernel/softirq.c:649 sysvec_apic_timer_interrupt+0x93/0xc0 arch/x86/kernel/apic/apic.c:1097 Fixes: 5b0b9e4c2c89 ("tcp: md5: incorrect tcp_header_len for incoming connections") Signed-off-by: Eric Dumazet Cc: Francesco Ruggeri Signed-off-by: David S. Miller --- include/net/tcp.h | 1 + net/ipv4/syncookies.c | 8 +++++++- net/ipv6/syncookies.c | 3 ++- 3 files changed, 10 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index b99d9d9cbd99..cc1295037533 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -480,6 +480,7 @@ int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th, u32 cookie); struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb); struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops, + const struct tcp_request_sock_ops *af_ops, struct sock *sk, struct sk_buff *skb); #ifdef CONFIG_SYN_COOKIES diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 2cb3b852d148..f33c31dd7366 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -281,6 +281,7 @@ bool cookie_ecn_ok(const struct tcp_options_received *tcp_opt, EXPORT_SYMBOL(cookie_ecn_ok); struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops, + const struct tcp_request_sock_ops *af_ops, struct sock *sk, struct sk_buff *skb) { @@ -297,6 +298,10 @@ struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops, return NULL; treq = tcp_rsk(req); + + /* treq->af_specific might be used to perform TCP_MD5 lookup */ + treq->af_specific = af_ops; + treq->syn_tos = TCP_SKB_CB(skb)->ip_dsfield; #if IS_ENABLED(CONFIG_MPTCP) treq->is_mptcp = sk_is_mptcp(sk); @@ -364,7 +369,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) goto out; ret = NULL; - req = cookie_tcp_reqsk_alloc(&tcp_request_sock_ops, sk, skb); + req = cookie_tcp_reqsk_alloc(&tcp_request_sock_ops, + &tcp_request_sock_ipv4_ops, sk, skb); if (!req) goto out; diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index d1b61d00368e..9cc123f000fb 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -170,7 +170,8 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) goto out; ret = NULL; - req = cookie_tcp_reqsk_alloc(&tcp6_request_sock_ops, sk, skb); + req = cookie_tcp_reqsk_alloc(&tcp6_request_sock_ops, + &tcp_request_sock_ipv6_ops, sk, skb); if (!req) goto out; -- cgit v1.2.3 From ba3beec2ec1d3b4fd8672ca6e781dac4b3267f6e Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Mon, 25 Apr 2022 17:37:45 +0200 Subject: xsk: Fix possible crash when multiple sockets are created Fix a crash that happens if an Rx only socket is created first, then a second socket is created that is Tx only and bound to the same umem as the first socket and also the same netdev and queue_id together with the XDP_SHARED_UMEM flag. In this specific case, the tx_descs array page pool was not created by the first socket as it was an Rx only socket. When the second socket is bound it needs this tx_descs array of this shared page pool as it has a Tx component, but unfortunately it was never allocated, leading to a crash. Note that this array is only used for zero-copy drivers using the batched Tx APIs, currently only ice and i40e. [ 5511.150360] BUG: kernel NULL pointer dereference, address: 0000000000000008 [ 5511.158419] #PF: supervisor write access in kernel mode [ 5511.164472] #PF: error_code(0x0002) - not-present page [ 5511.170416] PGD 0 P4D 0 [ 5511.173347] Oops: 0002 [#1] PREEMPT SMP PTI [ 5511.178186] CPU: 0 PID: 0 Comm: swapper/0 Tainted: G E 5.18.0-rc1+ #97 [ 5511.187245] Hardware name: Intel Corp. GRANTLEY/GRANTLEY, BIOS GRRFCRB1.86B.0276.D07.1605190235 05/19/2016 [ 5511.198418] RIP: 0010:xsk_tx_peek_release_desc_batch+0x198/0x310 [ 5511.205375] Code: c0 83 c6 01 84 c2 74 6d 8d 46 ff 23 07 44 89 e1 48 83 c0 14 48 c1 e1 04 48 c1 e0 04 48 03 47 10 4c 01 c1 48 8b 50 08 48 8b 00 <48> 89 51 08 48 89 01 41 80 bd d7 00 00 00 00 75 82 48 8b 19 49 8b [ 5511.227091] RSP: 0018:ffffc90000003dd0 EFLAGS: 00010246 [ 5511.233135] RAX: 0000000000000000 RBX: ffff88810c8da600 RCX: 0000000000000000 [ 5511.241384] RDX: 000000000000003c RSI: 0000000000000001 RDI: ffff888115f555c0 [ 5511.249634] RBP: ffffc90000003e08 R08: 0000000000000000 R09: ffff889092296b48 [ 5511.257886] R10: 0000ffffffffffff R11: ffff889092296800 R12: 0000000000000000 [ 5511.266138] R13: ffff88810c8db500 R14: 0000000000000040 R15: 0000000000000100 [ 5511.274387] FS: 0000000000000000(0000) GS:ffff88903f800000(0000) knlGS:0000000000000000 [ 5511.283746] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 5511.290389] CR2: 0000000000000008 CR3: 00000001046e2001 CR4: 00000000003706f0 [ 5511.298640] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 5511.306892] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 5511.315142] Call Trace: [ 5511.317972] [ 5511.320301] ice_xmit_zc+0x68/0x2f0 [ice] [ 5511.324977] ? ktime_get+0x38/0xa0 [ 5511.328913] ice_napi_poll+0x7a/0x6a0 [ice] [ 5511.333784] __napi_poll+0x2c/0x160 [ 5511.337821] net_rx_action+0xdd/0x200 [ 5511.342058] __do_softirq+0xe6/0x2dd [ 5511.346198] irq_exit_rcu+0xb5/0x100 [ 5511.350339] common_interrupt+0xa4/0xc0 [ 5511.354777] [ 5511.357201] [ 5511.359625] asm_common_interrupt+0x1e/0x40 [ 5511.364466] RIP: 0010:cpuidle_enter_state+0xd2/0x360 [ 5511.370211] Code: 49 89 c5 0f 1f 44 00 00 31 ff e8 e9 00 7b ff 45 84 ff 74 12 9c 58 f6 c4 02 0f 85 72 02 00 00 31 ff e8 02 0c 80 ff fb 45 85 f6 <0f> 88 11 01 00 00 49 63 c6 4c 2b 2c 24 48 8d 14 40 48 8d 14 90 49 [ 5511.391921] RSP: 0018:ffffffff82a03e60 EFLAGS: 00000202 [ 5511.397962] RAX: ffff88903f800000 RBX: 0000000000000001 RCX: 000000000000001f [ 5511.406214] RDX: 0000000000000000 RSI: ffffffff823400b9 RDI: ffffffff8234c046 [ 5511.424646] RBP: ffff88810a384800 R08: 000005032a28c046 R09: 0000000000000008 [ 5511.443233] R10: 000000000000000b R11: 0000000000000006 R12: ffffffff82bcf700 [ 5511.461922] R13: 000005032a28c046 R14: 0000000000000001 R15: 0000000000000000 [ 5511.480300] cpuidle_enter+0x29/0x40 [ 5511.494329] do_idle+0x1c7/0x250 [ 5511.507610] cpu_startup_entry+0x19/0x20 [ 5511.521394] start_kernel+0x649/0x66e [ 5511.534626] secondary_startup_64_no_verify+0xc3/0xcb [ 5511.549230] Detect such case during bind() and allocate this memory region via newly introduced xp_alloc_tx_descs(). Also, use kvcalloc instead of kcalloc as for other buffer pool allocations, so that it matches the kvfree() from xp_destroy(). Fixes: d1bc532e99be ("i40e: xsk: Move tmp desc array from driver to pool") Signed-off-by: Maciej Fijalkowski Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20220425153745.481322-1-maciej.fijalkowski@intel.com --- include/net/xsk_buff_pool.h | 1 + net/xdp/xsk.c | 13 +++++++++++++ net/xdp/xsk_buff_pool.c | 16 ++++++++++++---- 3 files changed, 26 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index 5554ee75e7da..647722e847b4 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -97,6 +97,7 @@ int xp_assign_dev(struct xsk_buff_pool *pool, struct net_device *dev, u16 queue_id, u16 flags); int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_umem *umem, struct net_device *dev, u16 queue_id); +int xp_alloc_tx_descs(struct xsk_buff_pool *pool, struct xdp_sock *xs); void xp_destroy(struct xsk_buff_pool *pool); void xp_get_pool(struct xsk_buff_pool *pool); bool xp_put_pool(struct xsk_buff_pool *pool); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 7d3a00cb24ec..3a9348030e20 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -967,6 +967,19 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) xp_get_pool(umem_xs->pool); xs->pool = umem_xs->pool; + + /* If underlying shared umem was created without Tx + * ring, allocate Tx descs array that Tx batching API + * utilizes + */ + if (xs->tx && !xs->pool->tx_descs) { + err = xp_alloc_tx_descs(xs->pool, xs); + if (err) { + xp_put_pool(xs->pool); + sockfd_put(sock); + goto out_unlock; + } + } } xdp_get_umem(umem_xs->umem); diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index af040ffa14ff..87bdd71c7bb6 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -42,6 +42,16 @@ void xp_destroy(struct xsk_buff_pool *pool) kvfree(pool); } +int xp_alloc_tx_descs(struct xsk_buff_pool *pool, struct xdp_sock *xs) +{ + pool->tx_descs = kvcalloc(xs->tx->nentries, sizeof(*pool->tx_descs), + GFP_KERNEL); + if (!pool->tx_descs) + return -ENOMEM; + + return 0; +} + struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, struct xdp_umem *umem) { @@ -59,11 +69,9 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, if (!pool->heads) goto out; - if (xs->tx) { - pool->tx_descs = kcalloc(xs->tx->nentries, sizeof(*pool->tx_descs), GFP_KERNEL); - if (!pool->tx_descs) + if (xs->tx) + if (xp_alloc_tx_descs(pool, xs)) goto out; - } pool->chunk_mask = ~((u64)umem->chunk_size - 1); pool->addrs_cnt = umem->size; -- cgit v1.2.3 From c86cc5a3ec70f5644f1fa21610b943d0441bc1f7 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 22 Apr 2022 12:58:16 -0700 Subject: Bluetooth: hci_event: Fix checking for invalid handle on error status Commit d5ebaa7c5f6f6 introduces checks for handle range (e.g HCI_CONN_HANDLE_MAX) but controllers like Intel AX200 don't seem to respect the valid range int case of error status: > HCI Event: Connect Complete (0x03) plen 11 Status: Page Timeout (0x04) Handle: 65535 Address: 94:DB:56:XX:XX:XX (Sony Home Entertainment& Sound Products Inc) Link type: ACL (0x01) Encryption: Disabled (0x00) [1644965.827560] Bluetooth: hci0: Ignoring HCI_Connection_Complete for invalid handle Because of it is impossible to cleanup the connections properly since the stack would attempt to cancel the connection which is no longer in progress causing the following trace: < HCI Command: Create Connection Cancel (0x01|0x0008) plen 6 Address: 94:DB:56:XX:XX:XX (Sony Home Entertainment& Sound Products Inc) = bluetoothd: src/profile.c:record_cb() Unable to get Hands-Free Voice gateway SDP record: Connection timed out > HCI Event: Command Complete (0x0e) plen 10 Create Connection Cancel (0x01|0x0008) ncmd 1 Status: Unknown Connection Identifier (0x02) Address: 94:DB:56:XX:XX:XX (Sony Home Entertainment& Sound Products Inc) < HCI Command: Create Connection Cancel (0x01|0x0008) plen 6 Address: 94:DB:56:XX:XX:XX (Sony Home Entertainment& Sound Products Inc) Fixes: d5ebaa7c5f6f6 ("Bluetooth: hci_event: Ignore multiple conn complete events") Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 1 + net/bluetooth/hci_event.c | 65 +++++++++++++++++++++++++-------------------- 2 files changed, 37 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 5cb095b09a94..69ef31cea582 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -578,6 +578,7 @@ enum { #define HCI_ERROR_CONNECTION_TIMEOUT 0x08 #define HCI_ERROR_REJ_LIMITED_RESOURCES 0x0d #define HCI_ERROR_REJ_BAD_ADDR 0x0f +#define HCI_ERROR_INVALID_PARAMETERS 0x12 #define HCI_ERROR_REMOTE_USER_TERM 0x13 #define HCI_ERROR_REMOTE_LOW_RESOURCES 0x14 #define HCI_ERROR_REMOTE_POWER_OFF 0x15 diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index abaabfae19cc..3a9071b987f4 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -3067,13 +3067,9 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, void *data, { struct hci_ev_conn_complete *ev = data; struct hci_conn *conn; + u8 status = ev->status; - if (__le16_to_cpu(ev->handle) > HCI_CONN_HANDLE_MAX) { - bt_dev_err(hdev, "Ignoring HCI_Connection_Complete for invalid handle"); - return; - } - - bt_dev_dbg(hdev, "status 0x%2.2x", ev->status); + bt_dev_dbg(hdev, "status 0x%2.2x", status); hci_dev_lock(hdev); @@ -3122,8 +3118,14 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, void *data, goto unlock; } - if (!ev->status) { + if (!status) { conn->handle = __le16_to_cpu(ev->handle); + if (conn->handle > HCI_CONN_HANDLE_MAX) { + bt_dev_err(hdev, "Invalid handle: 0x%4.4x > 0x%4.4x", + conn->handle, HCI_CONN_HANDLE_MAX); + status = HCI_ERROR_INVALID_PARAMETERS; + goto done; + } if (conn->type == ACL_LINK) { conn->state = BT_CONFIG; @@ -3164,18 +3166,18 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, void *data, hci_send_cmd(hdev, HCI_OP_CHANGE_CONN_PTYPE, sizeof(cp), &cp); } - } else { - conn->state = BT_CLOSED; - if (conn->type == ACL_LINK) - mgmt_connect_failed(hdev, &conn->dst, conn->type, - conn->dst_type, ev->status); } if (conn->type == ACL_LINK) hci_sco_setup(conn, ev->status); - if (ev->status) { - hci_connect_cfm(conn, ev->status); +done: + if (status) { + conn->state = BT_CLOSED; + if (conn->type == ACL_LINK) + mgmt_connect_failed(hdev, &conn->dst, conn->type, + conn->dst_type, status); + hci_connect_cfm(conn, status); hci_conn_del(conn); } else if (ev->link_type == SCO_LINK) { switch (conn->setting & SCO_AIRMODE_MASK) { @@ -3185,7 +3187,7 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, void *data, break; } - hci_connect_cfm(conn, ev->status); + hci_connect_cfm(conn, status); } unlock: @@ -4676,6 +4678,7 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev, void *data, { struct hci_ev_sync_conn_complete *ev = data; struct hci_conn *conn; + u8 status = ev->status; switch (ev->link_type) { case SCO_LINK: @@ -4690,12 +4693,7 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev, void *data, return; } - if (__le16_to_cpu(ev->handle) > HCI_CONN_HANDLE_MAX) { - bt_dev_err(hdev, "Ignoring HCI_Sync_Conn_Complete for invalid handle"); - return; - } - - bt_dev_dbg(hdev, "status 0x%2.2x", ev->status); + bt_dev_dbg(hdev, "status 0x%2.2x", status); hci_dev_lock(hdev); @@ -4729,9 +4727,17 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev, void *data, goto unlock; } - switch (ev->status) { + switch (status) { case 0x00: conn->handle = __le16_to_cpu(ev->handle); + if (conn->handle > HCI_CONN_HANDLE_MAX) { + bt_dev_err(hdev, "Invalid handle: 0x%4.4x > 0x%4.4x", + conn->handle, HCI_CONN_HANDLE_MAX); + status = HCI_ERROR_INVALID_PARAMETERS; + conn->state = BT_CLOSED; + break; + } + conn->state = BT_CONNECTED; conn->type = ev->link_type; @@ -4775,8 +4781,8 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev, void *data, } } - hci_connect_cfm(conn, ev->status); - if (ev->status) + hci_connect_cfm(conn, status); + if (status) hci_conn_del(conn); unlock: @@ -5527,11 +5533,6 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status, struct smp_irk *irk; u8 addr_type; - if (handle > HCI_CONN_HANDLE_MAX) { - bt_dev_err(hdev, "Ignoring HCI_LE_Connection_Complete for invalid handle"); - return; - } - hci_dev_lock(hdev); /* All controllers implicitly stop advertising in the event of a @@ -5603,6 +5604,12 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status, conn->dst_type = ev_bdaddr_type(hdev, conn->dst_type, NULL); + if (handle > HCI_CONN_HANDLE_MAX) { + bt_dev_err(hdev, "Invalid handle: 0x%4.4x > 0x%4.4x", handle, + HCI_CONN_HANDLE_MAX); + status = HCI_ERROR_INVALID_PARAMETERS; + } + if (status) { hci_le_conn_failed(conn, status); goto unlock; -- cgit v1.2.3 From 9b3628d79b46f06157affc56fdb218fdd4988321 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 22 Apr 2022 12:58:18 -0700 Subject: Bluetooth: hci_sync: Cleanup hci_conn if it cannot be aborted This attempts to cleanup the hci_conn if it cannot be aborted as otherwise it would likely result in having the controller and host stack out of sync with respect to connection handle. Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 2 +- net/bluetooth/hci_conn.c | 32 ++++++++++++++++++++++++-------- net/bluetooth/hci_event.c | 13 ++++--------- net/bluetooth/hci_sync.c | 11 ++++++++++- 4 files changed, 39 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index d5377740e99c..8abd08245326 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1156,7 +1156,7 @@ int hci_conn_switch_role(struct hci_conn *conn, __u8 role); void hci_conn_enter_active_mode(struct hci_conn *conn, __u8 force_active); -void hci_le_conn_failed(struct hci_conn *conn, u8 status); +void hci_conn_failed(struct hci_conn *conn, u8 status); /* * hci_conn_get() and hci_conn_put() are used to control the life-time of an diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 84312c836549..fe803bee419a 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -670,7 +670,7 @@ static void le_conn_timeout(struct work_struct *work) /* Disable LE Advertising */ le_disable_advertising(hdev); hci_dev_lock(hdev); - hci_le_conn_failed(conn, HCI_ERROR_ADVERTISING_TIMEOUT); + hci_conn_failed(conn, HCI_ERROR_ADVERTISING_TIMEOUT); hci_dev_unlock(hdev); return; } @@ -873,7 +873,7 @@ struct hci_dev *hci_get_route(bdaddr_t *dst, bdaddr_t *src, uint8_t src_type) EXPORT_SYMBOL(hci_get_route); /* This function requires the caller holds hdev->lock */ -void hci_le_conn_failed(struct hci_conn *conn, u8 status) +static void hci_le_conn_failed(struct hci_conn *conn, u8 status) { struct hci_dev *hdev = conn->hdev; struct hci_conn_params *params; @@ -886,8 +886,6 @@ void hci_le_conn_failed(struct hci_conn *conn, u8 status) params->conn = NULL; } - conn->state = BT_CLOSED; - /* If the status indicates successful cancellation of * the attempt (i.e. Unknown Connection Id) there's no point of * notifying failure since we'll go back to keep trying to @@ -899,10 +897,6 @@ void hci_le_conn_failed(struct hci_conn *conn, u8 status) mgmt_connect_failed(hdev, &conn->dst, conn->type, conn->dst_type, status); - hci_connect_cfm(conn, status); - - hci_conn_del(conn); - /* Since we may have temporarily stopped the background scanning in * favor of connection establishment, we should restart it. */ @@ -914,6 +908,28 @@ void hci_le_conn_failed(struct hci_conn *conn, u8 status) hci_enable_advertising(hdev); } +/* This function requires the caller holds hdev->lock */ +void hci_conn_failed(struct hci_conn *conn, u8 status) +{ + struct hci_dev *hdev = conn->hdev; + + bt_dev_dbg(hdev, "status 0x%2.2x", status); + + switch (conn->type) { + case LE_LINK: + hci_le_conn_failed(conn, status); + break; + case ACL_LINK: + mgmt_connect_failed(hdev, &conn->dst, conn->type, + conn->dst_type, status); + break; + } + + conn->state = BT_CLOSED; + hci_connect_cfm(conn, status); + hci_conn_del(conn); +} + static void create_le_conn_complete(struct hci_dev *hdev, void *data, int err) { struct hci_conn *conn = data; diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 5a6c8afc51a0..66451661283c 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -2834,7 +2834,7 @@ static void hci_cs_le_create_conn(struct hci_dev *hdev, u8 status) bt_dev_dbg(hdev, "status 0x%2.2x", status); /* All connection failure handling is taken care of by the - * hci_le_conn_failed function which is triggered by the HCI + * hci_conn_failed function which is triggered by the HCI * request completion callbacks used for connecting. */ if (status) @@ -2859,7 +2859,7 @@ static void hci_cs_le_ext_create_conn(struct hci_dev *hdev, u8 status) bt_dev_dbg(hdev, "status 0x%2.2x", status); /* All connection failure handling is taken care of by the - * hci_le_conn_failed function which is triggered by the HCI + * hci_conn_failed function which is triggered by the HCI * request completion callbacks used for connecting. */ if (status) @@ -3179,12 +3179,7 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, void *data, done: if (status) { - conn->state = BT_CLOSED; - if (conn->type == ACL_LINK) - mgmt_connect_failed(hdev, &conn->dst, conn->type, - conn->dst_type, status); - hci_connect_cfm(conn, status); - hci_conn_del(conn); + hci_conn_failed(conn, status); } else if (ev->link_type == SCO_LINK) { switch (conn->setting & SCO_AIRMODE_MASK) { case SCO_AIRMODE_CVSD: @@ -5623,7 +5618,7 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status, } if (status) { - hci_le_conn_failed(conn, status); + hci_conn_failed(conn, status); goto unlock; } diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 8f4c5698913d..13600bf120b0 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -4408,12 +4408,21 @@ static int hci_reject_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, static int hci_abort_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason) { + int err; + switch (conn->state) { case BT_CONNECTED: case BT_CONFIG: return hci_disconnect_sync(hdev, conn, reason); case BT_CONNECT: - return hci_connect_cancel_sync(hdev, conn); + err = hci_connect_cancel_sync(hdev, conn); + /* Cleanup hci_conn object if it cannot be cancelled as it + * likelly means the controller and host stack are out of sync. + */ + if (err) + hci_conn_failed(conn, err); + + return err; case BT_CONNECT2: return hci_reject_conn_sync(hdev, conn, reason); default: -- cgit v1.2.3 From 6510ea973d8d9d4a0cb2fb557b36bd1ab3eb49f6 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 25 Apr 2022 18:39:46 +0200 Subject: net: Use this_cpu_inc() to increment net->core_stats The macro dev_core_stats_##FIELD##_inc() disables preemption and invokes netdev_core_stats_alloc() to return a per-CPU pointer. netdev_core_stats_alloc() will allocate memory on its first invocation which breaks on PREEMPT_RT because it requires non-atomic context for memory allocation. This can be avoided by enabling preemption in netdev_core_stats_alloc() assuming the caller always disables preemption. It might be better to replace local_inc() with this_cpu_inc() now that dev_core_stats_##FIELD##_inc() gained a preempt-disable section and does not rely on already disabled preemption. This results in less instructions on x86-64: local_inc: | incl %gs:__preempt_count(%rip) # __preempt_count | movq 488(%rdi), %rax # _1->core_stats, _22 | testq %rax, %rax # _22 | je .L585 #, | add %gs:this_cpu_off(%rip), %rax # this_cpu_off, tcp_ptr__ | .L586: | testq %rax, %rax # _27 | je .L587 #, | incq (%rax) # _6->a.counter | .L587: | decl %gs:__preempt_count(%rip) # __preempt_count this_cpu_inc(), this patch: | movq 488(%rdi), %rax # _1->core_stats, _5 | testq %rax, %rax # _5 | je .L591 #, | .L585: | incq %gs:(%rax) # _18->rx_dropped Use unsigned long as type for the counter. Use this_cpu_inc() to increment the counter. Use a plain read of the counter. Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/YmbO0pxgtKpCw4SY@linutronix.de Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 21 +++++++++------------ net/core/dev.c | 14 +++++--------- 2 files changed, 14 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 59e27a2b7bf0..b1fbe21650bb 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -199,10 +199,10 @@ struct net_device_stats { * Try to fit them in a single cache line, for dev_get_stats() sake. */ struct net_device_core_stats { - local_t rx_dropped; - local_t tx_dropped; - local_t rx_nohandler; -} __aligned(4 * sizeof(local_t)); + unsigned long rx_dropped; + unsigned long tx_dropped; + unsigned long rx_nohandler; +} __aligned(4 * sizeof(unsigned long)); #include #include @@ -3843,15 +3843,15 @@ static __always_inline bool __is_skb_forwardable(const struct net_device *dev, return false; } -struct net_device_core_stats *netdev_core_stats_alloc(struct net_device *dev); +struct net_device_core_stats __percpu *netdev_core_stats_alloc(struct net_device *dev); -static inline struct net_device_core_stats *dev_core_stats(struct net_device *dev) +static inline struct net_device_core_stats __percpu *dev_core_stats(struct net_device *dev) { /* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */ struct net_device_core_stats __percpu *p = READ_ONCE(dev->core_stats); if (likely(p)) - return this_cpu_ptr(p); + return p; return netdev_core_stats_alloc(dev); } @@ -3859,14 +3859,11 @@ static inline struct net_device_core_stats *dev_core_stats(struct net_device *de #define DEV_CORE_STATS_INC(FIELD) \ static inline void dev_core_stats_##FIELD##_inc(struct net_device *dev) \ { \ - struct net_device_core_stats *p; \ + struct net_device_core_stats __percpu *p; \ \ - preempt_disable(); \ p = dev_core_stats(dev); \ - \ if (p) \ - local_inc(&p->FIELD); \ - preempt_enable(); \ + this_cpu_inc(p->FIELD); \ } DEV_CORE_STATS_INC(rx_dropped) DEV_CORE_STATS_INC(tx_dropped) diff --git a/net/core/dev.c b/net/core/dev.c index 8c6c08446556..1461c2d9dec8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10304,7 +10304,7 @@ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, } EXPORT_SYMBOL(netdev_stats_to_stats64); -struct net_device_core_stats *netdev_core_stats_alloc(struct net_device *dev) +struct net_device_core_stats __percpu *netdev_core_stats_alloc(struct net_device *dev) { struct net_device_core_stats __percpu *p; @@ -10315,11 +10315,7 @@ struct net_device_core_stats *netdev_core_stats_alloc(struct net_device *dev) free_percpu(p); /* This READ_ONCE() pairs with the cmpxchg() above */ - p = READ_ONCE(dev->core_stats); - if (!p) - return NULL; - - return this_cpu_ptr(p); + return READ_ONCE(dev->core_stats); } EXPORT_SYMBOL(netdev_core_stats_alloc); @@ -10356,9 +10352,9 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, for_each_possible_cpu(i) { core_stats = per_cpu_ptr(p, i); - storage->rx_dropped += local_read(&core_stats->rx_dropped); - storage->tx_dropped += local_read(&core_stats->tx_dropped); - storage->rx_nohandler += local_read(&core_stats->rx_nohandler); + storage->rx_dropped += READ_ONCE(core_stats->rx_dropped); + storage->tx_dropped += READ_ONCE(core_stats->tx_dropped); + storage->rx_nohandler += READ_ONCE(core_stats->rx_nohandler); } } return storage; -- cgit v1.2.3