diff options
Diffstat (limited to 'drivers/net/ethernet/intel/igb/igb_xsk.c')
-rw-r--r-- | drivers/net/ethernet/intel/igb/igb_xsk.c | 562 |
1 files changed, 562 insertions, 0 deletions
diff --git a/drivers/net/ethernet/intel/igb/igb_xsk.c b/drivers/net/ethernet/intel/igb/igb_xsk.c new file mode 100644 index 000000000000..157d43787fa0 --- /dev/null +++ b/drivers/net/ethernet/intel/igb/igb_xsk.c @@ -0,0 +1,562 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2018 Intel Corporation. */ + +#include <linux/bpf_trace.h> +#include <net/xdp_sock_drv.h> +#include <net/xdp.h> + +#include "e1000_hw.h" +#include "igb.h" + +static int igb_realloc_rx_buffer_info(struct igb_ring *ring, bool pool_present) +{ + int size = pool_present ? + sizeof(*ring->rx_buffer_info_zc) * ring->count : + sizeof(*ring->rx_buffer_info) * ring->count; + void *buff_info = vmalloc(size); + + if (!buff_info) + return -ENOMEM; + + if (pool_present) { + vfree(ring->rx_buffer_info); + ring->rx_buffer_info = NULL; + ring->rx_buffer_info_zc = buff_info; + } else { + vfree(ring->rx_buffer_info_zc); + ring->rx_buffer_info_zc = NULL; + ring->rx_buffer_info = buff_info; + } + + return 0; +} + +static void igb_txrx_ring_disable(struct igb_adapter *adapter, u16 qid) +{ + struct igb_ring *tx_ring = adapter->tx_ring[qid]; + struct igb_ring *rx_ring = adapter->rx_ring[qid]; + struct e1000_hw *hw = &adapter->hw; + + set_bit(IGB_RING_FLAG_TX_DISABLED, &tx_ring->flags); + + wr32(E1000_TXDCTL(tx_ring->reg_idx), 0); + wr32(E1000_RXDCTL(rx_ring->reg_idx), 0); + + synchronize_net(); + + /* Rx/Tx share the same napi context. */ + napi_disable(&rx_ring->q_vector->napi); + + igb_clean_tx_ring(tx_ring); + igb_clean_rx_ring(rx_ring); + + memset(&rx_ring->rx_stats, 0, sizeof(rx_ring->rx_stats)); + memset(&tx_ring->tx_stats, 0, sizeof(tx_ring->tx_stats)); +} + +static void igb_txrx_ring_enable(struct igb_adapter *adapter, u16 qid) +{ + struct igb_ring *tx_ring = adapter->tx_ring[qid]; + struct igb_ring *rx_ring = adapter->rx_ring[qid]; + + igb_configure_tx_ring(adapter, tx_ring); + igb_configure_rx_ring(adapter, rx_ring); + + synchronize_net(); + + clear_bit(IGB_RING_FLAG_TX_DISABLED, &tx_ring->flags); + + /* call igb_desc_unused which always leaves + * at least 1 descriptor unused to make sure + * next_to_use != next_to_clean + */ + if (rx_ring->xsk_pool) + igb_alloc_rx_buffers_zc(rx_ring, rx_ring->xsk_pool, + igb_desc_unused(rx_ring)); + else + igb_alloc_rx_buffers(rx_ring, igb_desc_unused(rx_ring)); + + /* Rx/Tx share the same napi context. */ + napi_enable(&rx_ring->q_vector->napi); +} + +struct xsk_buff_pool *igb_xsk_pool(struct igb_adapter *adapter, + struct igb_ring *ring) +{ + int qid = ring->queue_index; + struct xsk_buff_pool *pool; + + pool = xsk_get_pool_from_qid(adapter->netdev, qid); + + if (!igb_xdp_is_enabled(adapter)) + return NULL; + + return (pool && pool->dev) ? pool : NULL; +} + +static int igb_xsk_pool_enable(struct igb_adapter *adapter, + struct xsk_buff_pool *pool, + u16 qid) +{ + struct net_device *netdev = adapter->netdev; + struct igb_ring *rx_ring; + bool if_running; + int err; + + if (qid >= adapter->num_rx_queues) + return -EINVAL; + + if (qid >= netdev->real_num_rx_queues || + qid >= netdev->real_num_tx_queues) + return -EINVAL; + + err = xsk_pool_dma_map(pool, &adapter->pdev->dev, IGB_RX_DMA_ATTR); + if (err) + return err; + + rx_ring = adapter->rx_ring[qid]; + if_running = netif_running(adapter->netdev) && igb_xdp_is_enabled(adapter); + if (if_running) + igb_txrx_ring_disable(adapter, qid); + + if (if_running) { + err = igb_realloc_rx_buffer_info(rx_ring, true); + if (!err) { + igb_txrx_ring_enable(adapter, qid); + /* Kick start the NAPI context so that receiving will start */ + err = igb_xsk_wakeup(adapter->netdev, qid, XDP_WAKEUP_RX); + } + + if (err) { + xsk_pool_dma_unmap(pool, IGB_RX_DMA_ATTR); + return err; + } + } + + return 0; +} + +static int igb_xsk_pool_disable(struct igb_adapter *adapter, u16 qid) +{ + struct xsk_buff_pool *pool; + struct igb_ring *rx_ring; + bool if_running; + int err; + + pool = xsk_get_pool_from_qid(adapter->netdev, qid); + if (!pool) + return -EINVAL; + + rx_ring = adapter->rx_ring[qid]; + if_running = netif_running(adapter->netdev) && igb_xdp_is_enabled(adapter); + if (if_running) + igb_txrx_ring_disable(adapter, qid); + + xsk_pool_dma_unmap(pool, IGB_RX_DMA_ATTR); + + if (if_running) { + err = igb_realloc_rx_buffer_info(rx_ring, false); + if (err) + return err; + + igb_txrx_ring_enable(adapter, qid); + } + + return 0; +} + +int igb_xsk_pool_setup(struct igb_adapter *adapter, + struct xsk_buff_pool *pool, + u16 qid) +{ + return pool ? igb_xsk_pool_enable(adapter, pool, qid) : + igb_xsk_pool_disable(adapter, qid); +} + +static u16 igb_fill_rx_descs(struct xsk_buff_pool *pool, struct xdp_buff **xdp, + union e1000_adv_rx_desc *rx_desc, u16 count) +{ + dma_addr_t dma; + u16 buffs; + int i; + + /* nothing to do */ + if (!count) + return 0; + + buffs = xsk_buff_alloc_batch(pool, xdp, count); + for (i = 0; i < buffs; i++) { + dma = xsk_buff_xdp_get_dma(*xdp); + rx_desc->read.pkt_addr = cpu_to_le64(dma); + rx_desc->wb.upper.length = 0; + + rx_desc++; + xdp++; + } + + return buffs; +} + +bool igb_alloc_rx_buffers_zc(struct igb_ring *rx_ring, + struct xsk_buff_pool *xsk_pool, u16 count) +{ + u32 nb_buffs_extra = 0, nb_buffs = 0; + union e1000_adv_rx_desc *rx_desc; + u16 ntu = rx_ring->next_to_use; + u16 total_count = count; + struct xdp_buff **xdp; + + rx_desc = IGB_RX_DESC(rx_ring, ntu); + xdp = &rx_ring->rx_buffer_info_zc[ntu]; + + if (ntu + count >= rx_ring->count) { + nb_buffs_extra = igb_fill_rx_descs(xsk_pool, xdp, rx_desc, + rx_ring->count - ntu); + if (nb_buffs_extra != rx_ring->count - ntu) { + ntu += nb_buffs_extra; + goto exit; + } + rx_desc = IGB_RX_DESC(rx_ring, 0); + xdp = rx_ring->rx_buffer_info_zc; + ntu = 0; + count -= nb_buffs_extra; + } + + nb_buffs = igb_fill_rx_descs(xsk_pool, xdp, rx_desc, count); + ntu += nb_buffs; + if (ntu == rx_ring->count) + ntu = 0; + + /* clear the length for the next_to_use descriptor */ + rx_desc = IGB_RX_DESC(rx_ring, ntu); + rx_desc->wb.upper.length = 0; + +exit: + if (rx_ring->next_to_use != ntu) { + rx_ring->next_to_use = ntu; + + /* Force memory writes to complete before letting h/w + * know there are new descriptors to fetch. (Only + * applicable for weak-ordered memory model archs, + * such as IA-64). + */ + wmb(); + writel(ntu, rx_ring->tail); + } + + return total_count == (nb_buffs + nb_buffs_extra); +} + +void igb_clean_rx_ring_zc(struct igb_ring *rx_ring) +{ + u16 ntc = rx_ring->next_to_clean; + u16 ntu = rx_ring->next_to_use; + + while (ntc != ntu) { + struct xdp_buff *xdp = rx_ring->rx_buffer_info_zc[ntc]; + + xsk_buff_free(xdp); + ntc++; + if (ntc >= rx_ring->count) + ntc = 0; + } +} + +static struct sk_buff *igb_construct_skb_zc(struct igb_ring *rx_ring, + struct xdp_buff *xdp, + ktime_t timestamp) +{ + unsigned int totalsize = xdp->data_end - xdp->data_meta; + unsigned int metasize = xdp->data - xdp->data_meta; + struct sk_buff *skb; + + net_prefetch(xdp->data_meta); + + /* allocate a skb to store the frags */ + skb = napi_alloc_skb(&rx_ring->q_vector->napi, totalsize); + if (unlikely(!skb)) + return NULL; + + if (timestamp) + skb_hwtstamps(skb)->hwtstamp = timestamp; + + memcpy(__skb_put(skb, totalsize), xdp->data_meta, + ALIGN(totalsize, sizeof(long))); + + if (metasize) { + skb_metadata_set(skb, metasize); + __skb_pull(skb, metasize); + } + + return skb; +} + +static int igb_run_xdp_zc(struct igb_adapter *adapter, struct igb_ring *rx_ring, + struct xdp_buff *xdp, struct xsk_buff_pool *xsk_pool, + struct bpf_prog *xdp_prog) +{ + int err, result = IGB_XDP_PASS; + u32 act; + + prefetchw(xdp->data_hard_start); /* xdp_frame write */ + + act = bpf_prog_run_xdp(xdp_prog, xdp); + + if (likely(act == XDP_REDIRECT)) { + err = xdp_do_redirect(adapter->netdev, xdp, xdp_prog); + if (!err) + return IGB_XDP_REDIR; + + if (xsk_uses_need_wakeup(xsk_pool) && + err == -ENOBUFS) + result = IGB_XDP_EXIT; + else + result = IGB_XDP_CONSUMED; + goto out_failure; + } + + switch (act) { + case XDP_PASS: + break; + case XDP_TX: + result = igb_xdp_xmit_back(adapter, xdp); + if (result == IGB_XDP_CONSUMED) + goto out_failure; + break; + default: + bpf_warn_invalid_xdp_action(adapter->netdev, xdp_prog, act); + fallthrough; + case XDP_ABORTED: +out_failure: + trace_xdp_exception(rx_ring->netdev, xdp_prog, act); + fallthrough; + case XDP_DROP: + result = IGB_XDP_CONSUMED; + break; + } + + return result; +} + +int igb_clean_rx_irq_zc(struct igb_q_vector *q_vector, + struct xsk_buff_pool *xsk_pool, const int budget) +{ + struct igb_adapter *adapter = q_vector->adapter; + unsigned int total_bytes = 0, total_packets = 0; + struct igb_ring *rx_ring = q_vector->rx.ring; + u32 ntc = rx_ring->next_to_clean; + struct bpf_prog *xdp_prog; + unsigned int xdp_xmit = 0; + bool failure = false; + u16 entries_to_alloc; + struct sk_buff *skb; + + /* xdp_prog cannot be NULL in the ZC path */ + xdp_prog = READ_ONCE(rx_ring->xdp_prog); + + while (likely(total_packets < budget)) { + union e1000_adv_rx_desc *rx_desc; + ktime_t timestamp = 0; + struct xdp_buff *xdp; + unsigned int size; + int xdp_res = 0; + + rx_desc = IGB_RX_DESC(rx_ring, ntc); + size = le16_to_cpu(rx_desc->wb.upper.length); + if (!size) + break; + + /* This memory barrier is needed to keep us from reading + * any other fields out of the rx_desc until we know the + * descriptor has been written back + */ + dma_rmb(); + + xdp = rx_ring->rx_buffer_info_zc[ntc]; + xsk_buff_set_size(xdp, size); + xsk_buff_dma_sync_for_cpu(xdp); + + /* pull rx packet timestamp if available and valid */ + if (igb_test_staterr(rx_desc, E1000_RXDADV_STAT_TSIP)) { + int ts_hdr_len; + + ts_hdr_len = igb_ptp_rx_pktstamp(rx_ring->q_vector, + xdp->data, + ×tamp); + + xdp->data += ts_hdr_len; + xdp->data_meta += ts_hdr_len; + size -= ts_hdr_len; + } + + xdp_res = igb_run_xdp_zc(adapter, rx_ring, xdp, xsk_pool, + xdp_prog); + + if (xdp_res) { + if (likely(xdp_res & (IGB_XDP_TX | IGB_XDP_REDIR))) { + xdp_xmit |= xdp_res; + } else if (xdp_res == IGB_XDP_EXIT) { + failure = true; + break; + } else if (xdp_res == IGB_XDP_CONSUMED) { + xsk_buff_free(xdp); + } + + total_packets++; + total_bytes += size; + ntc++; + if (ntc == rx_ring->count) + ntc = 0; + continue; + } + + skb = igb_construct_skb_zc(rx_ring, xdp, timestamp); + + /* exit if we failed to retrieve a buffer */ + if (!skb) { + rx_ring->rx_stats.alloc_failed++; + break; + } + + xsk_buff_free(xdp); + ntc++; + if (ntc == rx_ring->count) + ntc = 0; + + if (eth_skb_pad(skb)) + continue; + + /* probably a little skewed due to removing CRC */ + total_bytes += skb->len; + + /* populate checksum, timestamp, VLAN, and protocol */ + igb_process_skb_fields(rx_ring, rx_desc, skb); + + napi_gro_receive(&q_vector->napi, skb); + + /* update budget accounting */ + total_packets++; + } + + rx_ring->next_to_clean = ntc; + + if (xdp_xmit) + igb_finalize_xdp(adapter, xdp_xmit); + + igb_update_rx_stats(q_vector, total_packets, total_bytes); + + entries_to_alloc = igb_desc_unused(rx_ring); + if (entries_to_alloc >= IGB_RX_BUFFER_WRITE) + failure |= !igb_alloc_rx_buffers_zc(rx_ring, xsk_pool, + entries_to_alloc); + + if (xsk_uses_need_wakeup(xsk_pool)) { + if (failure || rx_ring->next_to_clean == rx_ring->next_to_use) + xsk_set_rx_need_wakeup(xsk_pool); + else + xsk_clear_rx_need_wakeup(xsk_pool); + + return (int)total_packets; + } + return failure ? budget : (int)total_packets; +} + +bool igb_xmit_zc(struct igb_ring *tx_ring, struct xsk_buff_pool *xsk_pool) +{ + unsigned int budget = igb_desc_unused(tx_ring); + u32 cmd_type, olinfo_status, nb_pkts, i = 0; + struct xdp_desc *descs = xsk_pool->tx_descs; + union e1000_adv_tx_desc *tx_desc = NULL; + struct igb_tx_buffer *tx_buffer_info; + unsigned int total_bytes = 0; + dma_addr_t dma; + + if (!netif_carrier_ok(tx_ring->netdev)) + return true; + + if (test_bit(IGB_RING_FLAG_TX_DISABLED, &tx_ring->flags)) + return true; + + nb_pkts = xsk_tx_peek_release_desc_batch(xsk_pool, budget); + if (!nb_pkts) + return true; + + while (nb_pkts-- > 0) { + dma = xsk_buff_raw_get_dma(xsk_pool, descs[i].addr); + xsk_buff_raw_dma_sync_for_device(xsk_pool, dma, descs[i].len); + + tx_buffer_info = &tx_ring->tx_buffer_info[tx_ring->next_to_use]; + tx_buffer_info->bytecount = descs[i].len; + tx_buffer_info->type = IGB_TYPE_XSK; + tx_buffer_info->xdpf = NULL; + tx_buffer_info->gso_segs = 1; + tx_buffer_info->time_stamp = jiffies; + + tx_desc = IGB_TX_DESC(tx_ring, tx_ring->next_to_use); + tx_desc->read.buffer_addr = cpu_to_le64(dma); + + /* put descriptor type bits */ + cmd_type = E1000_ADVTXD_DTYP_DATA | E1000_ADVTXD_DCMD_DEXT | + E1000_ADVTXD_DCMD_IFCS; + olinfo_status = descs[i].len << E1000_ADVTXD_PAYLEN_SHIFT; + + /* FIXME: This sets the Report Status (RS) bit for every + * descriptor. One nice to have optimization would be to set it + * only for the last descriptor in the whole batch. See Intel + * ice driver for an example on how to do it. + */ + cmd_type |= descs[i].len | IGB_TXD_DCMD; + tx_desc->read.cmd_type_len = cpu_to_le32(cmd_type); + tx_desc->read.olinfo_status = cpu_to_le32(olinfo_status); + + total_bytes += descs[i].len; + + i++; + tx_ring->next_to_use++; + tx_buffer_info->next_to_watch = tx_desc; + if (tx_ring->next_to_use == tx_ring->count) + tx_ring->next_to_use = 0; + } + + netdev_tx_sent_queue(txring_txq(tx_ring), total_bytes); + igb_xdp_ring_update_tail(tx_ring); + + return nb_pkts < budget; +} + +int igb_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags) +{ + struct igb_adapter *adapter = netdev_priv(dev); + struct e1000_hw *hw = &adapter->hw; + struct igb_ring *ring; + u32 eics = 0; + + if (test_bit(__IGB_DOWN, &adapter->state)) + return -ENETDOWN; + + if (!igb_xdp_is_enabled(adapter)) + return -EINVAL; + + if (qid >= adapter->num_tx_queues) + return -EINVAL; + + ring = adapter->tx_ring[qid]; + + if (test_bit(IGB_RING_FLAG_TX_DISABLED, &ring->flags)) + return -ENETDOWN; + + if (!READ_ONCE(ring->xsk_pool)) + return -EINVAL; + + if (!napi_if_scheduled_mark_missed(&ring->q_vector->napi)) { + /* Cause software interrupt */ + if (adapter->flags & IGB_FLAG_HAS_MSIX) { + eics |= ring->q_vector->eims_value; + wr32(E1000_EICS, eics); + } else { + wr32(E1000_ICS, E1000_ICS_RXDMT0); + } + } + + return 0; +} |