From 5a42f112d367bb4700a8a41f5c12724fde6bfbb9 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Thu, 11 Aug 2022 20:21:48 +0200 Subject: ice: xsk: prohibit usage of non-balanced queue id Fix the following scenario: 1. ethtool -L $IFACE rx 8 tx 96 2. xdpsock -q 10 -t -z Above refers to a case where user would like to attach XSK socket in txonly mode at a queue id that does not have a corresponding Rx queue. At this moment ice's XSK logic is tightly bound to act on a "queue pair", e.g. both Tx and Rx queues at a given queue id are disabled/enabled and both of them will get XSK pool assigned, which is broken for the presented queue configuration. This results in the splat included at the bottom, which is basically an OOB access to Rx ring array. To fix this, allow using the ids only in scope of "combined" queues reported by ethtool. However, logic should be rewritten to allow such configurations later on, which would end up as a complete rewrite of the control path, so let us go with this temporary fix. [420160.558008] BUG: kernel NULL pointer dereference, address: 0000000000000082 [420160.566359] #PF: supervisor read access in kernel mode [420160.572657] #PF: error_code(0x0000) - not-present page [420160.579002] PGD 0 P4D 0 [420160.582756] Oops: 0000 [#1] PREEMPT SMP NOPTI [420160.588396] CPU: 10 PID: 21232 Comm: xdpsock Tainted: G OE 5.19.0-rc7+ #10 [420160.597893] Hardware name: Intel Corporation S2600WFT/S2600WFT, BIOS SE5C620.86B.02.01.0008.031920191559 03/19/2019 [420160.609894] RIP: 0010:ice_xsk_pool_setup+0x44/0x7d0 [ice] [420160.616968] Code: f3 48 83 ec 40 48 8b 4f 20 48 8b 3f 65 48 8b 04 25 28 00 00 00 48 89 44 24 38 31 c0 48 8d 04 ed 00 00 00 00 48 01 c1 48 8b 11 <0f> b7 92 82 00 00 00 48 85 d2 0f 84 2d 75 00 00 48 8d 72 ff 48 85 [420160.639421] RSP: 0018:ffffc9002d2afd48 EFLAGS: 00010282 [420160.646650] RAX: 0000000000000050 RBX: ffff88811d8bdd00 RCX: ffff888112c14ff8 [420160.655893] RDX: 0000000000000000 RSI: ffff88811d8bdd00 RDI: ffff888109861000 [420160.665166] RBP: 000000000000000a R08: 000000000000000a R09: 0000000000000000 [420160.674493] R10: 000000000000889f R11: 0000000000000000 R12: 000000000000000a [420160.683833] R13: 000000000000000a R14: 0000000000000000 R15: ffff888117611828 [420160.693211] FS: 00007fa869fc1f80(0000) GS:ffff8897e0880000(0000) knlGS:0000000000000000 [420160.703645] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [420160.711783] CR2: 0000000000000082 CR3: 00000001d076c001 CR4: 00000000007706e0 [420160.721399] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [420160.731045] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [420160.740707] PKRU: 55555554 [420160.745960] Call Trace: [420160.750962] [420160.755597] ? kmalloc_large_node+0x79/0x90 [420160.762703] ? __kmalloc_node+0x3f5/0x4b0 [420160.769341] xp_assign_dev+0xfd/0x210 [420160.775661] ? shmem_file_read_iter+0x29a/0x420 [420160.782896] xsk_bind+0x152/0x490 [420160.788943] __sys_bind+0xd0/0x100 [420160.795097] ? exit_to_user_mode_prepare+0x20/0x120 [420160.802801] __x64_sys_bind+0x16/0x20 [420160.809298] do_syscall_64+0x38/0x90 [420160.815741] entry_SYSCALL_64_after_hwframe+0x63/0xcd [420160.823731] RIP: 0033:0x7fa86a0dd2fb [420160.830264] Code: c3 66 0f 1f 44 00 00 48 8b 15 69 8b 0c 00 f7 d8 64 89 02 b8 ff ff ff ff eb bc 0f 1f 44 00 00 f3 0f 1e fa b8 31 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 3d 8b 0c 00 f7 d8 64 89 01 48 [420160.855410] RSP: 002b:00007ffc1146f618 EFLAGS: 00000246 ORIG_RAX: 0000000000000031 [420160.866366] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fa86a0dd2fb [420160.876957] RDX: 0000000000000010 RSI: 00007ffc1146f680 RDI: 0000000000000003 [420160.887604] RBP: 000055d7113a0520 R08: 00007fa868fb8000 R09: 0000000080000000 [420160.898293] R10: 0000000000008001 R11: 0000000000000246 R12: 000055d7113a04e0 [420160.909038] R13: 000055d7113a0320 R14: 000000000000000a R15: 0000000000000000 [420160.919817] [420160.925659] Modules linked in: ice(OE) af_packet binfmt_misc nls_iso8859_1 ipmi_ssif intel_rapl_msr intel_rapl_common x86_pkg_temp_thermal intel_powerclamp mei_me coretemp ioatdma mei ipmi_si wmi ipmi_msghandler acpi_pad acpi_power_meter ip_tables x_tables autofs4 ixgbe i40e crct10dif_pclmul crc32_pclmul ghash_clmulni_intel aesni_intel crypto_simd cryptd ahci mdio dca libahci lpc_ich [last unloaded: ice] [420160.977576] CR2: 0000000000000082 [420160.985037] ---[ end trace 0000000000000000 ]--- [420161.097724] RIP: 0010:ice_xsk_pool_setup+0x44/0x7d0 [ice] [420161.107341] Code: f3 48 83 ec 40 48 8b 4f 20 48 8b 3f 65 48 8b 04 25 28 00 00 00 48 89 44 24 38 31 c0 48 8d 04 ed 00 00 00 00 48 01 c1 48 8b 11 <0f> b7 92 82 00 00 00 48 85 d2 0f 84 2d 75 00 00 48 8d 72 ff 48 85 [420161.134741] RSP: 0018:ffffc9002d2afd48 EFLAGS: 00010282 [420161.144274] RAX: 0000000000000050 RBX: ffff88811d8bdd00 RCX: ffff888112c14ff8 [420161.155690] RDX: 0000000000000000 RSI: ffff88811d8bdd00 RDI: ffff888109861000 [420161.168088] RBP: 000000000000000a R08: 000000000000000a R09: 0000000000000000 [420161.179295] R10: 000000000000889f R11: 0000000000000000 R12: 000000000000000a [420161.190420] R13: 000000000000000a R14: 0000000000000000 R15: ffff888117611828 [420161.201505] FS: 00007fa869fc1f80(0000) GS:ffff8897e0880000(0000) knlGS:0000000000000000 [420161.213628] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [420161.223413] CR2: 0000000000000082 CR3: 00000001d076c001 CR4: 00000000007706e0 [420161.234653] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [420161.245893] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [420161.257052] PKRU: 55555554 Fixes: 2d4238f55697 ("ice: Add support for AF_XDP") Signed-off-by: Maciej Fijalkowski Tested-by: George Kuruvinakunnel Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_xsk.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'drivers/net/ethernet') diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index 49ba8bfdbf04..45f88e6ec25e 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -329,6 +329,12 @@ int ice_xsk_pool_setup(struct ice_vsi *vsi, struct xsk_buff_pool *pool, u16 qid) bool if_running, pool_present = !!pool; int ret = 0, pool_failure = 0; + if (qid >= vsi->num_rxq || qid >= vsi->num_txq) { + netdev_err(vsi->netdev, "Please use queue id in scope of combined queues count\n"); + pool_failure = -EINVAL; + goto failure; + } + if (!is_power_of_2(vsi->rx_rings[qid]->count) || !is_power_of_2(vsi->tx_rings[qid]->count)) { netdev_err(vsi->netdev, "Please align ring sizes to power of 2\n"); -- cgit v1.2.3 From 9ead7e74bfd6dd54db12ef133b8604add72511de Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Thu, 11 Aug 2022 20:21:49 +0200 Subject: ice: xsk: use Rx ring's XDP ring when picking NAPI context Ice driver allocates per cpu XDP queues so that redirect path can safely use smp_processor_id() as an index to the array. At the same time though, XDP rings are used to pick NAPI context to call napi_schedule() or set NAPIF_STATE_MISSED. When user reduces queue count, say to 8, and num_possible_cpus() of underlying platform is 44, then this means queue vectors with correlated NAPI contexts will carry several XDP queues. This in turn can result in a broken behavior where NAPI context of interest will never be scheduled and AF_XDP socket will not process any traffic. To fix this, let us change the way how XDP rings are assigned to Rx rings and use this information later on when setting ice_tx_ring::xsk_pool pointer. For each Rx ring, grab the associated queue vector and walk through Tx ring's linked list. Once we stumble upon XDP ring in it, assign this ring to ice_rx_ring::xdp_ring. Previous [0] approach of fixing this issue was for txonly scenario because of the described grouping of XDP rings across queue vectors. So, relying on Rx ring meant that NAPI context could be scheduled with a queue vector without XDP ring with associated XSK pool. [0]: https://lore.kernel.org/netdev/20220707161128.54215-1-maciej.fijalkowski@intel.com/ Fixes: 2d4238f55697 ("ice: Add support for AF_XDP") Fixes: 22bf877e528f ("ice: introduce XDP_TX fallback path") Signed-off-by: Maciej Fijalkowski Tested-by: George Kuruvinakunnel Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice.h | 36 ++++++++++++++++++++----------- drivers/net/ethernet/intel/ice/ice_lib.c | 4 ++-- drivers/net/ethernet/intel/ice/ice_main.c | 25 ++++++++++++++------- drivers/net/ethernet/intel/ice/ice_xsk.c | 12 +++++------ 4 files changed, 48 insertions(+), 29 deletions(-) (limited to 'drivers/net/ethernet') diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h index cc5b85afd437..841fa149c407 100644 --- a/drivers/net/ethernet/intel/ice/ice.h +++ b/drivers/net/ethernet/intel/ice/ice.h @@ -684,8 +684,8 @@ static inline void ice_set_ring_xdp(struct ice_tx_ring *ring) * ice_xsk_pool - get XSK buffer pool bound to a ring * @ring: Rx ring to use * - * Returns a pointer to xdp_umem structure if there is a buffer pool present, - * NULL otherwise. + * Returns a pointer to xsk_buff_pool structure if there is a buffer pool + * present, NULL otherwise. */ static inline struct xsk_buff_pool *ice_xsk_pool(struct ice_rx_ring *ring) { @@ -699,23 +699,33 @@ static inline struct xsk_buff_pool *ice_xsk_pool(struct ice_rx_ring *ring) } /** - * ice_tx_xsk_pool - get XSK buffer pool bound to a ring - * @ring: Tx ring to use + * ice_tx_xsk_pool - assign XSK buff pool to XDP ring + * @vsi: pointer to VSI + * @qid: index of a queue to look at XSK buff pool presence * - * Returns a pointer to xdp_umem structure if there is a buffer pool present, - * NULL otherwise. Tx equivalent of ice_xsk_pool. + * Sets XSK buff pool pointer on XDP ring. + * + * XDP ring is picked from Rx ring, whereas Rx ring is picked based on provided + * queue id. Reason for doing so is that queue vectors might have assigned more + * than one XDP ring, e.g. when user reduced the queue count on netdev; Rx ring + * carries a pointer to one of these XDP rings for its own purposes, such as + * handling XDP_TX action, therefore we can piggyback here on the + * rx_ring->xdp_ring assignment that was done during XDP rings initialization. */ -static inline struct xsk_buff_pool *ice_tx_xsk_pool(struct ice_tx_ring *ring) +static inline void ice_tx_xsk_pool(struct ice_vsi *vsi, u16 qid) { - struct ice_vsi *vsi = ring->vsi; - u16 qid; + struct ice_tx_ring *ring; - qid = ring->q_index - vsi->alloc_txq; + ring = vsi->rx_rings[qid]->xdp_ring; + if (!ring) + return; - if (!ice_is_xdp_ena_vsi(vsi) || !test_bit(qid, vsi->af_xdp_zc_qps)) - return NULL; + if (!ice_is_xdp_ena_vsi(vsi) || !test_bit(qid, vsi->af_xdp_zc_qps)) { + ring->xsk_pool = NULL; + return; + } - return xsk_get_pool_from_qid(vsi->netdev, qid); + ring->xsk_pool = xsk_get_pool_from_qid(vsi->netdev, qid); } /** diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c index 733c455f6574..0c4ec9264071 100644 --- a/drivers/net/ethernet/intel/ice/ice_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_lib.c @@ -1986,8 +1986,8 @@ int ice_vsi_cfg_xdp_txqs(struct ice_vsi *vsi) if (ret) return ret; - ice_for_each_xdp_txq(vsi, i) - vsi->xdp_rings[i]->xsk_pool = ice_tx_xsk_pool(vsi->xdp_rings[i]); + ice_for_each_rxq(vsi, i) + ice_tx_xsk_pool(vsi, i); return ret; } diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 4ecaf40cf946..173fe6c31341 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -2581,7 +2581,6 @@ static int ice_xdp_alloc_setup_rings(struct ice_vsi *vsi) if (ice_setup_tx_ring(xdp_ring)) goto free_xdp_rings; ice_set_ring_xdp(xdp_ring); - xdp_ring->xsk_pool = ice_tx_xsk_pool(xdp_ring); spin_lock_init(&xdp_ring->tx_lock); for (j = 0; j < xdp_ring->count; j++) { tx_desc = ICE_TX_DESC(xdp_ring, j); @@ -2589,13 +2588,6 @@ static int ice_xdp_alloc_setup_rings(struct ice_vsi *vsi) } } - ice_for_each_rxq(vsi, i) { - if (static_key_enabled(&ice_xdp_locking_key)) - vsi->rx_rings[i]->xdp_ring = vsi->xdp_rings[i % vsi->num_xdp_txq]; - else - vsi->rx_rings[i]->xdp_ring = vsi->xdp_rings[i]; - } - return 0; free_xdp_rings: @@ -2685,6 +2677,23 @@ int ice_prepare_xdp_rings(struct ice_vsi *vsi, struct bpf_prog *prog) xdp_rings_rem -= xdp_rings_per_v; } + ice_for_each_rxq(vsi, i) { + if (static_key_enabled(&ice_xdp_locking_key)) { + vsi->rx_rings[i]->xdp_ring = vsi->xdp_rings[i % vsi->num_xdp_txq]; + } else { + struct ice_q_vector *q_vector = vsi->rx_rings[i]->q_vector; + struct ice_tx_ring *ring; + + ice_for_each_tx_ring(ring, q_vector->tx) { + if (ice_ring_is_xdp(ring)) { + vsi->rx_rings[i]->xdp_ring = ring; + break; + } + } + } + ice_tx_xsk_pool(vsi, i); + } + /* omit the scheduler update if in reset path; XDP queues will be * taken into account at the end of ice_vsi_rebuild, where * ice_cfg_vsi_lan is being called diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index 45f88e6ec25e..e48e29258450 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -243,7 +243,7 @@ static int ice_qp_ena(struct ice_vsi *vsi, u16 q_idx) if (err) goto free_buf; ice_set_ring_xdp(xdp_ring); - xdp_ring->xsk_pool = ice_tx_xsk_pool(xdp_ring); + ice_tx_xsk_pool(vsi, q_idx); } err = ice_vsi_cfg_rxq(rx_ring); @@ -359,7 +359,7 @@ xsk_pool_if_up: if (if_running) { ret = ice_qp_ena(vsi, qid); if (!ret && pool_present) - napi_schedule(&vsi->xdp_rings[qid]->q_vector->napi); + napi_schedule(&vsi->rx_rings[qid]->xdp_ring->q_vector->napi); else if (ret) netdev_err(vsi->netdev, "ice_qp_ena error = %d\n", ret); } @@ -950,13 +950,13 @@ ice_xsk_wakeup(struct net_device *netdev, u32 queue_id, if (!ice_is_xdp_ena_vsi(vsi)) return -EINVAL; - if (queue_id >= vsi->num_txq) + if (queue_id >= vsi->num_txq || queue_id >= vsi->num_rxq) return -EINVAL; - if (!vsi->xdp_rings[queue_id]->xsk_pool) - return -EINVAL; + ring = vsi->rx_rings[queue_id]->xdp_ring; - ring = vsi->xdp_rings[queue_id]; + if (!ring->xsk_pool) + return -EINVAL; /* The idea here is that if NAPI is running, mark a miss, so * it will run again. If not, trigger an interrupt and -- cgit v1.2.3