diff options
75 files changed, 3471 insertions, 331 deletions
diff --git a/Documentation/devicetree/bindings/net/stmmac.txt b/Documentation/devicetree/bindings/net/stmmac.txt index 41b49e6075f5..128da752fec9 100644 --- a/Documentation/devicetree/bindings/net/stmmac.txt +++ b/Documentation/devicetree/bindings/net/stmmac.txt @@ -1,7 +1,7 @@ * STMicroelectronics 10/100/1000 Ethernet driver (GMAC) Required properties: -- compatible: Should be "snps,dwmac-<ip_version>" "snps,dwmac" +- compatible: Should be "snps,dwmac-<ip_version>", "snps,dwmac" For backwards compatibility: "st,spear600-gmac" is also supported. - reg: Address and length of the register set for the device - interrupt-parent: Should be the phandle for the interrupt controller @@ -34,7 +34,13 @@ Optional properties: platforms. - tx-fifo-depth: See ethernet.txt file in the same directory - rx-fifo-depth: See ethernet.txt file in the same directory -- snps,pbl Programmable Burst Length +- snps,pbl Programmable Burst Length (tx and rx) +- snps,txpbl Tx Programmable Burst Length. Only for GMAC and newer. + If set, DMA tx will use this value rather than snps,pbl. +- snps,rxpbl Rx Programmable Burst Length. Only for GMAC and newer. + If set, DMA rx will use this value rather than snps,pbl. +- snps,no-pbl-x8 Don't multiply the pbl/txpbl/rxpbl values by 8. + For core rev < 3.50, don't multiply the values by 4. - snps,aal Address-Aligned Beats - snps,fixed-burst Program the DMA to use the fixed burst mode - snps,mixed-burst Program the DMA to use the mixed burst mode @@ -50,6 +56,8 @@ Optional properties: - snps,ps-speed: port selection speed that can be passed to the core when PCS is supported. For example, this is used in case of SGMII and MAC2MAC connection. +- snps,tso: this enables the TSO feature otherwise it will be managed by + MAC HW capability register. Only for GMAC4 and newer. 
- AXI BUS Mode parameters: below the list of all the parameters to program the AXI register inside the DMA module: - snps,lpi_en: enable Low Power Interface @@ -62,8 +70,6 @@ Optional properties: - snps,fb: fixed-burst - snps,mb: mixed-burst - snps,rb: rebuild INCRx Burst - - snps,tso: this enables the TSO feature otherwise it will be managed by - MAC HW capability register. - mdio: with compatible = "snps,dwmac-mdio", create and register mdio bus. Examples: diff --git a/Documentation/networking/phy.txt b/Documentation/networking/phy.txt index e017d933d530..16f90d817224 100644 --- a/Documentation/networking/phy.txt +++ b/Documentation/networking/phy.txt @@ -407,6 +407,15 @@ Board Fixups The stubs set one of the two matching criteria, and set the other one to match anything. + When phy_register_fixup() or *_for_uid()/*_for_id() is called from a module, + the fixup must be unregistered and its allocated memory freed. + + Call one of the following functions before unloading the module. + + int phy_unregister_fixup(const char *phy_id, u32 phy_uid, u32 phy_uid_mask); + int phy_unregister_fixup_for_uid(u32 phy_uid, u32 phy_uid_mask); + int phy_unregister_fixup_for_id(const char *phy_id); + Standards IEEE Standard 802.3: CSMA/CD Access Method and Physical Layer Specifications, Section Two: diff --git a/Documentation/networking/stmmac.txt b/Documentation/networking/stmmac.txt index 014f4f756cb7..2bb07078f535 100644 --- a/Documentation/networking/stmmac.txt +++ b/Documentation/networking/stmmac.txt @@ -152,8 +152,10 @@ Where: o dma_cfg: internal DMA parameters o pbl: the Programmable Burst Length is maximum number of beats to be transferred in one DMA transaction. - GMAC also enables the 4xPBL by default. - o fixed_burst/mixed_burst/burst_len + GMAC also enables the 4xPBL by default. (8xPBL for GMAC 3.50 and newer) + o txpbl/rxpbl: GMAC and newer support independent DMA pbl for tx/rx. + o pblx8: Enable 8xPBL (4xPBL for core rev < 3.50). Enabled by default. 
+ o fixed_burst/mixed_burst/aal o clk_csr: fixed CSR Clock range selection. o has_gmac: uses the GMAC core. o enh_desc: if sets the MAC will use the enhanced descriptor structure. @@ -205,16 +207,24 @@ tuned according to the HW capabilities. struct stmmac_dma_cfg { int pbl; + int txpbl; + int rxpbl; + bool pblx8; int fixed_burst; - int burst_len_supported; + int mixed_burst; + bool aal; }; Where: - o pbl: Programmable Burst Length + o pbl: Programmable Burst Length (tx and rx) + o txpbl: Transmit Programmable Burst Length. Only for GMAC and newer. + If set, DMA tx will use this value rather than pbl. + o rxpbl: Receive Programmable Burst Length. Only for GMAC and newer. + If set, DMA rx will use this value rather than pbl. + o pblx8: Enable 8xPBL (4xPBL for core rev < 3.50). Enabled by default. o fixed_burst: program the DMA to use the fixed burst mode - o burst_len: this is the value we put in the register - supported values are provided as macros in - linux/stmmac.h header file. + o mixed_burst: program the DMA to use the mixed burst mode + o aal: Address-Aligned Beats --- diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 0fe98a567125..73a5cf18fd84 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -766,7 +766,7 @@ emit_clear: func = (u8 *) __bpf_call_base + imm; /* Save skb pointer if we need to re-cache skb data */ - if (bpf_helper_changes_skb_data(func)) + if (bpf_helper_changes_pkt_data(func)) PPC_BPF_STL(3, 1, bpf_jit_stack_local(ctx)); bpf_jit_emit_func_call(image, ctx, (u64)func); @@ -775,7 +775,7 @@ emit_clear: PPC_MR(b2p[BPF_REG_0], 3); /* refresh skb cache */ - if (bpf_helper_changes_skb_data(func)) { + if (bpf_helper_changes_pkt_data(func)) { /* reload skb pointer to r3 */ PPC_BPF_LL(3, 1, bpf_jit_stack_local(ctx)); bpf_jit_emit_skb_loads(image, ctx); diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index bee281f3163d..167b31b186c1 100644 --- 
a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -981,7 +981,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i EMIT2(0x0d00, REG_14, REG_W1); /* lgr %b0,%r2: load return value into %b0 */ EMIT4(0xb9040000, BPF_REG_0, REG_2); - if (bpf_helper_changes_skb_data((void *)func)) { + if (bpf_helper_changes_pkt_data((void *)func)) { jit->seen |= SEEN_SKB_CHANGE; /* lg %b1,ST_OFF_SKBP(%r15) */ EMIT6_DISP_LH(0xe3000000, 0x0004, BPF_REG_1, REG_0, diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index fe04a04dab8e..e76d1af60f7a 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -853,7 +853,7 @@ xadd: if (is_imm8(insn->off)) func = (u8 *) __bpf_call_base + imm32; jmp_offset = func - (image + addrs[i]); if (seen_ld_abs) { - reload_skb_data = bpf_helper_changes_skb_data(func); + reload_skb_data = bpf_helper_changes_pkt_data(func); if (reload_skb_data) { EMIT1(0x57); /* push %rdi */ jmp_offset += 22; /* pop, mov, sub, mov */ diff --git a/drivers/net/ethernet/alacritech/slicoss.c b/drivers/net/ethernet/alacritech/slicoss.c index e77ecd5b307c..b9fbd0107008 100644 --- a/drivers/net/ethernet/alacritech/slicoss.c +++ b/drivers/net/ethernet/alacritech/slicoss.c @@ -1863,18 +1863,7 @@ static struct pci_driver slic_driver = { .remove = slic_remove, }; -static int __init slic_init_module(void) -{ - return pci_register_driver(&slic_driver); -} - -static void __exit slic_cleanup_module(void) -{ - pci_unregister_driver(&slic_driver); -} - -module_init(slic_init_module); -module_exit(slic_cleanup_module); +module_pci_driver(slic_driver); MODULE_DESCRIPTION("Alacritech non-accelerated SLIC driver"); MODULE_AUTHOR("Lino Sanfilippo <LinoSanfilippo@gmx.de>"); diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c index 6c7eea8b36af..884a334e82d0 100644 --- a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c +++ 
b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c @@ -635,6 +635,7 @@ static void xgene_enet_free_pagepool(struct xgene_enet_desc_ring *buf_pool, return; dev = ndev_to_dev(buf_pool->ndev); + slots = buf_pool->slots - 1; head = buf_pool->head; for (i = 0; i < 4; i++) { diff --git a/drivers/net/ethernet/cavium/liquidio/cn23xx_vf_device.c b/drivers/net/ethernet/cavium/liquidio/cn23xx_vf_device.c index 108e4878e608..b6117b6a1de2 100644 --- a/drivers/net/ethernet/cavium/liquidio/cn23xx_vf_device.c +++ b/drivers/net/ethernet/cavium/liquidio/cn23xx_vf_device.c @@ -529,6 +529,26 @@ static u64 cn23xx_vf_msix_interrupt_handler(void *dev) return ret; } +static u32 cn23xx_update_read_index(struct octeon_instr_queue *iq) +{ + u32 pkt_in_done = readl(iq->inst_cnt_reg); + u32 last_done; + u32 new_idx; + + last_done = pkt_in_done - iq->pkt_in_done; + iq->pkt_in_done = pkt_in_done; + + /* Modulo of the new index with the IQ size will give us + * the new index. The iq->reset_instr_cnt is always zero for + * cn23xx, so no extra adjustments are needed. 
+ */ + new_idx = (iq->octeon_read_index + + (u32)(last_done & CN23XX_PKT_IN_DONE_CNT_MASK)) % + iq->max_count; + + return new_idx; +} + static void cn23xx_enable_vf_interrupt(struct octeon_device *oct, u8 intr_flag) { struct octeon_cn23xx_vf *cn23xx = (struct octeon_cn23xx_vf *)oct->chip; @@ -660,6 +680,7 @@ int cn23xx_setup_octeon_vf_device(struct octeon_device *oct) oct->fn_list.msix_interrupt_handler = cn23xx_vf_msix_interrupt_handler; oct->fn_list.setup_device_regs = cn23xx_setup_vf_device_regs; + oct->fn_list.update_iq_read_idx = cn23xx_update_read_index; oct->fn_list.enable_interrupt = cn23xx_enable_vf_interrupt; oct->fn_list.disable_interrupt = cn23xx_disable_vf_interrupt; diff --git a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c index e6321f35399c..9989ac393e94 100644 --- a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c +++ b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c @@ -22,7 +22,9 @@ #include "octeon_iq.h" #include "response_manager.h" #include "octeon_device.h" +#include "octeon_nic.h" #include "octeon_main.h" +#include "octeon_network.h" #include "cn23xx_vf_device.h" MODULE_AUTHOR("Cavium Networks, <support@cavium.com>"); @@ -30,6 +32,76 @@ MODULE_DESCRIPTION("Cavium LiquidIO Intelligent Server Adapter Virtual Function MODULE_LICENSE("GPL"); MODULE_VERSION(LIQUIDIO_VERSION); +static int debug = -1; +module_param(debug, int, 0644); +MODULE_PARM_DESC(debug, "NETIF_MSG debug bits"); + +#define DEFAULT_MSG_ENABLE (NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_LINK) + +/* Bit mask values for lio->ifstate */ +#define LIO_IFSTATE_DROQ_OPS 0x01 +#define LIO_IFSTATE_REGISTERED 0x02 +#define LIO_IFSTATE_RUNNING 0x04 + +struct liquidio_if_cfg_context { + int octeon_id; + + wait_queue_head_t wc; + + int cond; +}; + +struct liquidio_if_cfg_resp { + u64 rh; + struct liquidio_if_cfg_info cfg_info; + u64 status; +}; + +struct liquidio_rx_ctl_context { + int octeon_id; + + wait_queue_head_t wc; + + int 
cond; +}; + +union tx_info { + u64 u64; + struct { +#ifdef __BIG_ENDIAN_BITFIELD + u16 gso_size; + u16 gso_segs; + u32 reserved; +#else + u32 reserved; + u16 gso_segs; + u16 gso_size; +#endif + } s; +}; + +#define OCTNIC_MAX_SG (MAX_SKB_FRAGS) + +#define OCTNIC_GSO_MAX_HEADER_SIZE 128 +#define OCTNIC_GSO_MAX_SIZE \ + (CN23XX_DEFAULT_INPUT_JABBER - OCTNIC_GSO_MAX_HEADER_SIZE) + +struct octnic_gather { + /* List manipulation. Next and prev pointers. */ + struct list_head list; + + /* Size of the gather component at sg in bytes. */ + int sg_size; + + /* Number of bytes that sg was adjusted to make it 8B-aligned. */ + int adjust; + + /* Gather component that can accommodate max sized fragment list + * received from the IP layer. + */ + struct octeon_sg_entry *sg; +}; + struct octeon_device_priv { /* Tasklet structures for this device. */ struct tasklet_struct droq_tasklet; @@ -40,6 +112,7 @@ static int liquidio_vf_probe(struct pci_dev *pdev, const struct pci_device_id *ent); static void liquidio_vf_remove(struct pci_dev *pdev); static int octeon_device_init(struct octeon_device *oct); +static int liquidio_stop(struct net_device *netdev); static int lio_wait_for_oq_pkts(struct octeon_device *oct) { @@ -113,6 +186,375 @@ static struct pci_driver liquidio_vf_pci_driver = { .remove = liquidio_vf_remove, }; +/** + * \brief check interface state + * @param lio per-network private data + * @param state_flag flag state to check + */ +static int ifstate_check(struct lio *lio, int state_flag) +{ + return atomic_read(&lio->ifstate) & state_flag; +} + +/** + * \brief set interface state + * @param lio per-network private data + * @param state_flag flag state to set + */ +static void ifstate_set(struct lio *lio, int state_flag) +{ + atomic_set(&lio->ifstate, (atomic_read(&lio->ifstate) | state_flag)); +} + +/** + * \brief clear interface state + * @param lio per-network private data + * @param state_flag flag state to clear + */ +static void ifstate_reset(struct lio *lio, int 
state_flag) +{ + atomic_set(&lio->ifstate, (atomic_read(&lio->ifstate) & ~(state_flag))); +} + +/** + * \brief Stop Tx queues + * @param netdev network device + */ +static void txqs_stop(struct net_device *netdev) +{ + if (netif_is_multiqueue(netdev)) { + int i; + + for (i = 0; i < netdev->num_tx_queues; i++) + netif_stop_subqueue(netdev, i); + } else { + netif_stop_queue(netdev); + } +} + +/** + * \brief Start Tx queues + * @param netdev network device + */ +static void txqs_start(struct net_device *netdev) +{ + if (netif_is_multiqueue(netdev)) { + int i; + + for (i = 0; i < netdev->num_tx_queues; i++) + netif_start_subqueue(netdev, i); + } else { + netif_start_queue(netdev); + } +} + +/** + * \brief Wake Tx queues + * @param netdev network device + */ +static void txqs_wake(struct net_device *netdev) +{ + struct lio *lio = GET_LIO(netdev); + + if (netif_is_multiqueue(netdev)) { + int i; + + for (i = 0; i < netdev->num_tx_queues; i++) { + int qno = lio->linfo.txpciq[i % (lio->linfo.num_txpciq)] + .s.q_no; + if (__netif_subqueue_stopped(netdev, i)) { + INCR_INSTRQUEUE_PKT_COUNT(lio->oct_dev, qno, + tx_restart, 1); + netif_wake_subqueue(netdev, i); + } + } + } else { + INCR_INSTRQUEUE_PKT_COUNT(lio->oct_dev, lio->txq, + tx_restart, 1); + netif_wake_queue(netdev); + } +} + +/** + * \brief Start Tx queue + * @param netdev network device + */ +static void start_txq(struct net_device *netdev) +{ + struct lio *lio = GET_LIO(netdev); + + if (lio->linfo.link.s.link_up) { + txqs_start(netdev); + return; + } +} + +/** + * \brief Wake a queue + * @param netdev network device + * @param q which queue to wake + */ +static void wake_q(struct net_device *netdev, int q) +{ + if (netif_is_multiqueue(netdev)) + netif_wake_subqueue(netdev, q); + else + netif_wake_queue(netdev); +} + +/** + * \brief Stop a queue + * @param netdev network device + * @param q which queue to stop + */ +static void stop_q(struct net_device *netdev, int q) +{ + if (netif_is_multiqueue(netdev)) + 
netif_stop_subqueue(netdev, q); + else + netif_stop_queue(netdev); +} + +/** + * Remove the node at the head of the list. The list would be empty at + * the end of this call if there are no more nodes in the list. + */ +static struct list_head *list_delete_head(struct list_head *root) +{ + struct list_head *node; + + if ((root->prev == root) && (root->next == root)) + node = NULL; + else + node = root->next; + + if (node) + list_del(node); + + return node; +} + +/** + * \brief Delete gather lists + * @param lio per-network private data + * + * Frees every gather entry, then the list and lock arrays. The array + * pointers are cleared afterwards so that a repeated call (for example + * device teardown after a failed setup_glists()) returns early instead + * of freeing them a second time. + */ +static void delete_glists(struct lio *lio) +{ + struct octnic_gather *g; + int i; + + if (!lio->glist) + return; + + for (i = 0; i < lio->linfo.num_txpciq; i++) { + do { + g = (struct octnic_gather *) + list_delete_head(&lio->glist[i]); + if (g) { + if (g->sg) + kfree((void *)((unsigned long)g->sg - + g->adjust)); + kfree(g); + } + } while (g); + } + + kfree(lio->glist); + lio->glist = NULL; + kfree(lio->glist_lock); + lio->glist_lock = NULL; +} + +/** + * \brief Setup gather lists + * @param lio per-network private data + */ +static int setup_glists(struct lio *lio, int num_iqs) +{ + struct octnic_gather *g; + int i, j; + + lio->glist_lock = + kzalloc(sizeof(*lio->glist_lock) * num_iqs, GFP_KERNEL); + if (!lio->glist_lock) + return 1; + + lio->glist = + kzalloc(sizeof(*lio->glist) * num_iqs, GFP_KERNEL); + if (!lio->glist) { + kfree(lio->glist_lock); + return 1; + } + + for (i = 0; i < num_iqs; i++) { + spin_lock_init(&lio->glist_lock[i]); + + INIT_LIST_HEAD(&lio->glist[i]); + + for (j = 0; j < lio->tx_qsize; j++) { + g = kzalloc(sizeof(*g), GFP_KERNEL); + if (!g) + break; + + g->sg_size = ((ROUNDUP4(OCTNIC_MAX_SG) >> 2) * + OCT_SG_ENTRY_SIZE); + + g->sg = kmalloc(g->sg_size + 8, GFP_KERNEL); + if (!g->sg) { + kfree(g); + break; + } + + /* The gather component should be aligned on 64-bit + * boundary + */ + if (((unsigned long)g->sg) & 7) { + g->adjust = 8 - (((unsigned long)g->sg) & 7); + g->sg = (struct octeon_sg_entry *) + ((unsigned long)g->sg + g->adjust); + } + 
list_add_tail(&g->list, &lio->glist[i]); + } + + if (j != lio->tx_qsize) { + delete_glists(lio); + return 1; + } + } + + return 0; +} + +/** + * \brief Print link information + * @param netdev network device + */ +static void print_link_info(struct net_device *netdev) +{ + struct lio *lio = GET_LIO(netdev); + + if (atomic_read(&lio->ifstate) & LIO_IFSTATE_REGISTERED) { + struct oct_link_info *linfo = &lio->linfo; + + if (linfo->link.s.link_up) { + netif_info(lio, link, lio->netdev, "%d Mbps %s Duplex UP\n", + linfo->link.s.speed, + (linfo->link.s.duplex) ? "Full" : "Half"); + } else { + netif_info(lio, link, lio->netdev, "Link Down\n"); + } + } +} + +/** + * \brief Routine to notify MTU change + * @param work work_struct data structure + */ +static void octnet_link_status_change(struct work_struct *work) +{ + struct cavium_wk *wk = (struct cavium_wk *)work; + struct lio *lio = (struct lio *)wk->ctxptr; + + rtnl_lock(); + call_netdevice_notifiers(NETDEV_CHANGEMTU, lio->netdev); + rtnl_unlock(); +} + +/** + * \brief Sets up the mtu status change work + * @param netdev network device + */ +static int setup_link_status_change_wq(struct net_device *netdev) +{ + struct lio *lio = GET_LIO(netdev); + struct octeon_device *oct = lio->oct_dev; + + lio->link_status_wq.wq = alloc_workqueue("link-status", + WQ_MEM_RECLAIM, 0); + if (!lio->link_status_wq.wq) { + dev_err(&oct->pci_dev->dev, "unable to create cavium link status wq\n"); + return -1; + } + INIT_DELAYED_WORK(&lio->link_status_wq.wk.work, + octnet_link_status_change); + lio->link_status_wq.wk.ctxptr = lio; + + return 0; +} + +static void cleanup_link_status_change_wq(struct net_device *netdev) +{ + struct lio *lio = GET_LIO(netdev); + + if (lio->link_status_wq.wq) { + cancel_delayed_work_sync(&lio->link_status_wq.wk.work); + destroy_workqueue(lio->link_status_wq.wq); + } +} + +/** + * \brief Update link status + * @param netdev network device + * @param ls link status structure + * + * Called on receipt of a link 
status response from the core application to + * update each interface's link status. + */ +static void update_link_status(struct net_device *netdev, + union oct_link_status *ls) +{ + struct lio *lio = GET_LIO(netdev); + struct octeon_device *oct = lio->oct_dev; + + if ((lio->intf_open) && (lio->linfo.link.u64 != ls->u64)) { + lio->linfo.link.u64 = ls->u64; + + print_link_info(netdev); + lio->link_changes++; + + if (lio->linfo.link.s.link_up) { + netif_carrier_on(netdev); + txqs_wake(netdev); + } else { + netif_carrier_off(netdev); + txqs_stop(netdev); + } + + if (lio->linfo.link.s.mtu < netdev->mtu) { + dev_warn(&oct->pci_dev->dev, + "PF has changed the MTU for gmx port. Reducing the mtu from %d to %d\n", + netdev->mtu, lio->linfo.link.s.mtu); + lio->mtu = lio->linfo.link.s.mtu; + netdev->mtu = lio->linfo.link.s.mtu; + queue_delayed_work(lio->link_status_wq.wq, + &lio->link_status_wq.wk.work, 0); + } + } +} + +/** + * \brief Wake the Tx queue served by an instruction queue + * @param oct octeon device + * @param iq_num instruction queue number + * + * A stopped queue is woken only while the link is up and the instruction + * queue has room. Multiqueue devices wake the sub-queue mapped to iq_num; + * single-queue devices wake the queue backed by lio->txq. + */ +static void update_txq_status(struct octeon_device *oct, int iq_num) +{ + struct octeon_instr_queue *iq = oct->instr_queue[iq_num]; + struct net_device *netdev; + struct lio *lio; + + netdev = oct->props[iq->ifidx].netdev; + lio = GET_LIO(netdev); + if (netif_is_multiqueue(netdev)) { + if (__netif_subqueue_stopped(netdev, iq->q_index) && + lio->linfo.link.s.link_up && + (!octnet_iq_is_full(oct, iq_num))) { + netif_wake_subqueue(netdev, iq->q_index); + INCR_INSTRQUEUE_PKT_COUNT(lio->oct_dev, iq_num, + tx_restart, 1); + } + } else if (netif_queue_stopped(netdev) && + lio->linfo.link.s.link_up && + (!octnet_iq_is_full(oct, lio->txq))) { + INCR_INSTRQUEUE_PKT_COUNT( + lio->oct_dev, lio->txq, tx_restart, 1); + wake_q(netdev, lio->txq); + } +} + static int liquidio_schedule_msix_droq_pkt_handler(struct octeon_droq *droq, u64 ret) { @@ -316,6 +758,7 @@ static void octeon_destroy_resources(struct octeon_device *oct) /* No more instructions will be forwarded. 
*/ atomic_set(&oct->status, OCT_DEV_IN_RESET); + oct->app_mode = CVM_DRV_INVALID_APP; dev_dbg(&oct->pci_dev->dev, "Device state is now %s\n", lio_get_state_string(&oct->status)); @@ -420,6 +863,174 @@ static void octeon_destroy_resources(struct octeon_device *oct) } /** + * \brief Callback for rx ctrl + * @param status status of request + * @param buf pointer to resp structure + */ +static void rx_ctl_callback(struct octeon_device *oct, + u32 status, void *buf) +{ + struct octeon_soft_command *sc = (struct octeon_soft_command *)buf; + struct liquidio_rx_ctl_context *ctx; + + ctx = (struct liquidio_rx_ctl_context *)sc->ctxptr; + + oct = lio_get_device(ctx->octeon_id); + if (status) + dev_err(&oct->pci_dev->dev, "rx ctl instruction failed. Status: %llx\n", + CVM_CAST64(status)); + WRITE_ONCE(ctx->cond, 1); + + /* This barrier is required to be sure that the response has been + * written fully before waking up the handler + */ + wmb(); + + wake_up_interruptible(&ctx->wc); +} + +/** + * \brief Send Rx control command + * @param lio per-network private data + * @param start_stop whether to start or stop + * + * Does nothing if rx is already in the requested state. Logs and returns + * without sending if the soft command cannot be allocated. + */ +static void send_rx_ctrl_cmd(struct lio *lio, int start_stop) +{ + struct octeon_device *oct = (struct octeon_device *)lio->oct_dev; + int ctx_size = sizeof(struct liquidio_rx_ctl_context); + struct liquidio_rx_ctl_context *ctx; + struct octeon_soft_command *sc; + union octnet_cmd *ncmd; + int retval; + + if (oct->props[lio->ifidx].rx_on == start_stop) + return; + + sc = (struct octeon_soft_command *) + octeon_alloc_soft_command(oct, OCTNET_CMD_SIZE, + 16, ctx_size); + if (!sc) { + netif_info(lio, rx_err, lio->netdev, + "Failed to allocate soft command for RX Control message\n"); + return; + } + + ncmd = (union octnet_cmd *)sc->virtdptr; + ctx = (struct liquidio_rx_ctl_context *)sc->ctxptr; + + WRITE_ONCE(ctx->cond, 0); + ctx->octeon_id = lio_get_device_id(oct); + init_waitqueue_head(&ctx->wc); + + ncmd->u64 = 0; + ncmd->s.cmd = OCTNET_CMD_RX_CTL; + ncmd->s.param1 = start_stop; + + octeon_swap_8B_data((u64 *)ncmd, (OCTNET_CMD_SIZE >> 3)); + + sc->iq_no = lio->linfo.txpciq[0].s.q_no; + + 
octeon_prepare_soft_command(oct, sc, OPCODE_NIC, + OPCODE_NIC_CMD, 0, 0, 0); + + sc->callback = rx_ctl_callback; + sc->callback_arg = sc; + sc->wait_time = 5000; + + retval = octeon_send_soft_command(oct, sc); + if (retval == IQ_SEND_FAILED) { + netif_info(lio, rx_err, lio->netdev, "Failed to send RX Control message\n"); + } else { + /* Sleep on a wait queue till the cond flag indicates that the + * response arrived or timed-out. + */ + if (sleep_cond(&ctx->wc, &ctx->cond) == -EINTR) + return; + oct->props[lio->ifidx].rx_on = start_stop; + } + + octeon_free_soft_command(oct, sc); +} + +/** + * \brief Destroy NIC device interface + * @param oct octeon device + * @param ifidx which interface to destroy + * + * Cleanup associated with each interface for an Octeon device when NIC + * module is being unloaded or if initialization fails during load. + */ +static void liquidio_destroy_nic_device(struct octeon_device *oct, int ifidx) +{ + struct net_device *netdev = oct->props[ifidx].netdev; + struct napi_struct *napi, *n; + struct lio *lio; + + if (!netdev) { + dev_err(&oct->pci_dev->dev, "%s No netdevice ptr for index %d\n", + __func__, ifidx); + return; + } + + lio = GET_LIO(netdev); + + dev_dbg(&oct->pci_dev->dev, "NIC device cleanup\n"); + + if (atomic_read(&lio->ifstate) & LIO_IFSTATE_RUNNING) + liquidio_stop(netdev); + + if (oct->props[lio->ifidx].napi_enabled == 1) { + list_for_each_entry_safe(napi, n, &netdev->napi_list, dev_list) + napi_disable(napi); + + oct->props[lio->ifidx].napi_enabled = 0; + + oct->droq[0]->ops.poll_mode = 0; + } + + if (atomic_read(&lio->ifstate) & LIO_IFSTATE_REGISTERED) + unregister_netdev(netdev); + + cleanup_link_status_change_wq(netdev); + + delete_glists(lio); + + free_netdev(netdev); + + oct->props[ifidx].gmxport = -1; + + oct->props[ifidx].netdev = NULL; +} + +/** + * \brief Stop complete NIC functionality + * @param oct octeon device + */ +static int liquidio_stop_nic_module(struct octeon_device *oct) +{ + struct lio *lio; + int 
i, j; + + dev_dbg(&oct->pci_dev->dev, "Stopping network interfaces\n"); + if (!oct->ifcount) { + dev_err(&oct->pci_dev->dev, "Init for Octeon was not completed\n"); + return 1; + } + + spin_lock_bh(&oct->cmd_resp_wqlock); + oct->cmd_resp_state = OCT_DRV_OFFLINE; + spin_unlock_bh(&oct->cmd_resp_wqlock); + + for (i = 0; i < oct->ifcount; i++) { + lio = GET_LIO(oct->props[i].netdev); + for (j = 0; j < lio->linfo.num_rxpciq; j++) + octeon_unregister_droq_ops(oct, + lio->linfo.rxpciq[j].s.q_no); + } + + for (i = 0; i < oct->ifcount; i++) + liquidio_destroy_nic_device(oct, i); + + dev_dbg(&oct->pci_dev->dev, "Network interfaces stopped\n"); + return 0; +} + +/** * \brief Cleans up resources at unload time * @param pdev PCI device structure */ @@ -429,6 +1040,9 @@ static void liquidio_vf_remove(struct pci_dev *pdev) dev_dbg(&oct_dev->pci_dev->dev, "Stopping device\n"); + if (oct_dev->app_mode == CVM_DRV_NIC_APP) + liquidio_stop_nic_module(oct_dev); + /* Reset the octeon device and cleanup all memory allocated for * the octeon device by driver. 
*/ @@ -471,6 +1085,1457 @@ static int octeon_pci_os_setup(struct octeon_device *oct) return 0; } +static int skb_iq(struct lio *lio, struct sk_buff *skb) +{ + int q = 0; + + if (netif_is_multiqueue(lio->netdev)) + q = skb->queue_mapping % lio->linfo.num_txpciq; + + return q; +} + +/** + * \brief Check Tx queue state for a given network buffer + * @param lio per-network private data + * @param skb network buffer + */ +static int check_txq_state(struct lio *lio, struct sk_buff *skb) +{ + int q = 0, iq = 0; + + if (netif_is_multiqueue(lio->netdev)) { + q = skb->queue_mapping; + iq = lio->linfo.txpciq[(q % (lio->linfo.num_txpciq))].s.q_no; + } else { + iq = lio->txq; + q = iq; + } + + if (octnet_iq_is_full(lio->oct_dev, iq)) + return 0; + + if (__netif_subqueue_stopped(lio->netdev, q)) { + INCR_INSTRQUEUE_PKT_COUNT(lio->oct_dev, iq, tx_restart, 1); + wake_q(lio->netdev, q); + } + + return 1; +} + +/** + * \brief Unmap and free network buffer + * @param buf buffer + */ +static void free_netbuf(void *buf) +{ + struct octnet_buf_free_info *finfo; + struct sk_buff *skb; + struct lio *lio; + + finfo = (struct octnet_buf_free_info *)buf; + skb = finfo->skb; + lio = finfo->lio; + + dma_unmap_single(&lio->oct_dev->pci_dev->dev, finfo->dptr, skb->len, + DMA_TO_DEVICE); + + check_txq_state(lio, skb); + + tx_buffer_free(skb); +} + +/** + * \brief Unmap and free gather buffer + * @param buf buffer + */ +static void free_netsgbuf(void *buf) +{ + struct octnet_buf_free_info *finfo; + struct octnic_gather *g; + struct sk_buff *skb; + int i, frags, iq; + struct lio *lio; + + finfo = (struct octnet_buf_free_info *)buf; + skb = finfo->skb; + lio = finfo->lio; + g = finfo->g; + frags = skb_shinfo(skb)->nr_frags; + + dma_unmap_single(&lio->oct_dev->pci_dev->dev, + g->sg[0].ptr[0], (skb->len - skb->data_len), + DMA_TO_DEVICE); + + i = 1; + while (frags--) { + struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[i - 1]; + + pci_unmap_page((lio->oct_dev)->pci_dev, + g->sg[(i >> 2)].ptr[(i 
& 3)], + frag->size, DMA_TO_DEVICE); + i++; + } + + dma_unmap_single(&lio->oct_dev->pci_dev->dev, + finfo->dptr, g->sg_size, + DMA_TO_DEVICE); + + iq = skb_iq(lio, skb); + + spin_lock(&lio->glist_lock[iq]); + list_add_tail(&g->list, &lio->glist[iq]); + spin_unlock(&lio->glist_lock[iq]); + + check_txq_state(lio, skb); /* mq support: sub-queue state check */ + + tx_buffer_free(skb); +} + +/** + * \brief Unmap and free gather buffer with response + * @param buf buffer + */ +static void free_netsgbuf_with_resp(void *buf) +{ + struct octnet_buf_free_info *finfo; + struct octeon_soft_command *sc; + struct octnic_gather *g; + struct sk_buff *skb; + int i, frags, iq; + struct lio *lio; + + sc = (struct octeon_soft_command *)buf; + skb = (struct sk_buff *)sc->callback_arg; + finfo = (struct octnet_buf_free_info *)&skb->cb; + + lio = finfo->lio; + g = finfo->g; + frags = skb_shinfo(skb)->nr_frags; + + dma_unmap_single(&lio->oct_dev->pci_dev->dev, + g->sg[0].ptr[0], (skb->len - skb->data_len), + DMA_TO_DEVICE); + + i = 1; + while (frags--) { + struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[i - 1]; + + pci_unmap_page((lio->oct_dev)->pci_dev, + g->sg[(i >> 2)].ptr[(i & 3)], + frag->size, DMA_TO_DEVICE); + i++; + } + + dma_unmap_single(&lio->oct_dev->pci_dev->dev, + finfo->dptr, g->sg_size, + DMA_TO_DEVICE); + + iq = skb_iq(lio, skb); + + spin_lock(&lio->glist_lock[iq]); + list_add_tail(&g->list, &lio->glist[iq]); + spin_unlock(&lio->glist_lock[iq]); + + /* Don't free the skb yet */ + + check_txq_state(lio, skb); +} + +/** + * \brief Setup output queue + * @param oct octeon device + * @param q_no which queue + * @param num_descs how many descriptors + * @param desc_size size of each descriptor + * @param app_ctx application context + */ +static int octeon_setup_droq(struct octeon_device *oct, int q_no, int num_descs, + int desc_size, void *app_ctx) +{ + int ret_val; + + dev_dbg(&oct->pci_dev->dev, "Creating Droq: %d\n", q_no); + /* droq creation and local register 
settings. */ + ret_val = octeon_create_droq(oct, q_no, num_descs, desc_size, app_ctx); + if (ret_val < 0) + return ret_val; + + if (ret_val == 1) { + dev_dbg(&oct->pci_dev->dev, "Using default droq %d\n", q_no); + return 0; + } + + /* Enable the droq queues */ + octeon_set_droq_pkt_op(oct, q_no, 1); + + /* Send Credit for Octeon Output queues. Credits are always + * sent after the output queue is enabled. + */ + writel(oct->droq[q_no]->max_count, oct->droq[q_no]->pkts_credit_reg); + + return ret_val; +} + +/** + * \brief Callback for getting interface configuration + * @param status status of request + * @param buf pointer to resp structure + */ +static void if_cfg_callback(struct octeon_device *oct, + u32 status __attribute__((unused)), void *buf) +{ + struct octeon_soft_command *sc = (struct octeon_soft_command *)buf; + struct liquidio_if_cfg_context *ctx; + struct liquidio_if_cfg_resp *resp; + + resp = (struct liquidio_if_cfg_resp *)sc->virtrptr; + ctx = (struct liquidio_if_cfg_context *)sc->ctxptr; + + oct = lio_get_device(ctx->octeon_id); + if (resp->status) + dev_err(&oct->pci_dev->dev, "nic if cfg instruction failed. 
Status: %llx\n", + CVM_CAST64(resp->status)); + WRITE_ONCE(ctx->cond, 1); + + snprintf(oct->fw_info.liquidio_firmware_version, 32, "%s", + resp->cfg_info.liquidio_firmware_version); + + /* This barrier is required to be sure that the response has been + * written fully before waking up the handler + */ + wmb(); + + wake_up_interruptible(&ctx->wc); +} + +/** + * \brief Select queue based on hash + * @param dev Net device + * @param skb sk_buff structure + * @returns selected queue number + */ +static u16 select_q(struct net_device *dev, struct sk_buff *skb, + void *accel_priv __attribute__((unused)), + select_queue_fallback_t fallback __attribute__((unused))) +{ + struct lio *lio; + u32 qindex; + + lio = GET_LIO(dev); + + qindex = skb_tx_hash(dev, skb); + + return (u16)(qindex % (lio->linfo.num_txpciq)); +} + +/** Routine to push packets arriving on Octeon interface upto network layer. + * @param oct_id - octeon device id. + * @param skbuff - skbuff struct to be passed to network layer. + * @param len - size of total data received. + * @param rh - Control header associated with the packet + * @param param - additional control data with the packet + * @param arg - farg registered in droq_ops + */ +static void +liquidio_push_packet(u32 octeon_id __attribute__((unused)), + void *skbuff, + u32 len, + union octeon_rh *rh, + void *param, + void *arg) +{ + struct napi_struct *napi = param; + struct octeon_droq *droq = + container_of(param, struct octeon_droq, napi); + struct net_device *netdev = (struct net_device *)arg; + struct sk_buff *skb = (struct sk_buff *)skbuff; + + if (netdev) { + struct lio *lio = GET_LIO(netdev); + int packet_was_received; + + /* Do not proceed if the interface is not in RUNNING state. 
*/ + if (!ifstate_check(lio, LIO_IFSTATE_RUNNING)) { + recv_buffer_free(skb); + droq->stats.rx_dropped++; + return; + } + + skb->dev = netdev; + + skb_record_rx_queue(skb, droq->q_no); + if (likely(len > MIN_SKB_SIZE)) { + struct octeon_skb_page_info *pg_info; + unsigned char *va; + + pg_info = ((struct octeon_skb_page_info *)(skb->cb)); + if (pg_info->page) { + /* For Paged allocation use the frags */ + va = page_address(pg_info->page) + + pg_info->page_offset; + memcpy(skb->data, va, MIN_SKB_SIZE); + skb_put(skb, MIN_SKB_SIZE); + skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, + pg_info->page, + pg_info->page_offset + + MIN_SKB_SIZE, + len - MIN_SKB_SIZE, + LIO_RXBUFFER_SZ); + } + } else { + struct octeon_skb_page_info *pg_info = + ((struct octeon_skb_page_info *)(skb->cb)); + skb_copy_to_linear_data(skb, + page_address(pg_info->page) + + pg_info->page_offset, len); + skb_put(skb, len); + put_page(pg_info->page); + } + + skb_pull(skb, rh->r_dh.len * 8); + skb->protocol = eth_type_trans(skb, skb->dev); + + if ((netdev->features & NETIF_F_RXCSUM) && + (rh->r_dh.csum_verified & CNNIC_CSUM_VERIFIED)) + /* checksum has already been verified */ + skb->ip_summed = CHECKSUM_UNNECESSARY; + else + skb->ip_summed = CHECKSUM_NONE; + + packet_was_received = (napi_gro_receive(napi, skb) != GRO_DROP); + + if (packet_was_received) { + droq->stats.rx_bytes_received += len; + droq->stats.rx_pkts_received++; + netdev->last_rx = jiffies; + } else { + droq->stats.rx_dropped++; + netif_info(lio, rx_err, lio->netdev, + "droq:%d error rx_dropped:%llu\n", + droq->q_no, droq->stats.rx_dropped); + } + + } else { + recv_buffer_free(skb); + } +} + +/** + * \brief callback when receive interrupt occurs and we are in NAPI mode + * @param arg pointer to octeon output queue + */ +static void liquidio_vf_napi_drv_callback(void *arg) +{ + struct octeon_droq *droq = arg; + + napi_schedule_irqoff(&droq->napi); +} + +/** + * \brief Entry point for NAPI polling + * @param napi NAPI structure + * 
@param budget maximum number of items to process + */ +static int liquidio_napi_poll(struct napi_struct *napi, int budget) +{ + struct octeon_instr_queue *iq; + struct octeon_device *oct; + struct octeon_droq *droq; + int tx_done = 0, iq_no; + int work_done; + + droq = container_of(napi, struct octeon_droq, napi); + oct = droq->oct_dev; + iq_no = droq->q_no; + + /* Handle Droq descriptors */ + work_done = octeon_process_droq_poll_cmd(oct, droq->q_no, + POLL_EVENT_PROCESS_PKTS, + budget); + + /* Flush the instruction queue */ + iq = oct->instr_queue[iq_no]; + if (iq) { + /* Process iq buffers with in the budget limits */ + tx_done = octeon_flush_iq(oct, iq, 1, budget); + /* Update iq read-index rather than waiting for next interrupt. + * Return back if tx_done is false. + */ + update_txq_status(oct, iq_no); + } else { + dev_err(&oct->pci_dev->dev, "%s: iq (%d) num invalid\n", + __func__, iq_no); + } + + if ((work_done < budget) && (tx_done)) { + napi_complete(napi); + octeon_process_droq_poll_cmd(droq->oct_dev, droq->q_no, + POLL_EVENT_ENABLE_INTR, 0); + return 0; + } + + return (!tx_done) ? (budget) : (work_done); +} + +/** + * \brief Setup input and output queues + * @param octeon_dev octeon device + * @param ifidx Interface index + * + * Note: Queues are with respect to the octeon device. Thus + * an input queue is for egress packets, and output queues + * are for ingress packets. 
+ */ +static int setup_io_queues(struct octeon_device *octeon_dev, int ifidx) +{ + struct octeon_droq_ops droq_ops; + struct net_device *netdev; + static int cpu_id_modulus; + struct octeon_droq *droq; + struct napi_struct *napi; + static int cpu_id; + int num_tx_descs; + struct lio *lio; + int retval = 0; + int q, q_no; + + netdev = octeon_dev->props[ifidx].netdev; + + lio = GET_LIO(netdev); + + memset(&droq_ops, 0, sizeof(struct octeon_droq_ops)); + + droq_ops.fptr = liquidio_push_packet; + droq_ops.farg = netdev; + + droq_ops.poll_mode = 1; + droq_ops.napi_fn = liquidio_vf_napi_drv_callback; + cpu_id = 0; + cpu_id_modulus = num_present_cpus(); + + /* set up DROQs. */ + for (q = 0; q < lio->linfo.num_rxpciq; q++) { + q_no = lio->linfo.rxpciq[q].s.q_no; + + retval = octeon_setup_droq( + octeon_dev, q_no, + CFG_GET_NUM_RX_DESCS_NIC_IF(octeon_get_conf(octeon_dev), + lio->ifidx), + CFG_GET_NUM_RX_BUF_SIZE_NIC_IF(octeon_get_conf(octeon_dev), + lio->ifidx), + NULL); + if (retval) { + dev_err(&octeon_dev->pci_dev->dev, + "%s : Runtime DROQ(RxQ) creation failed.\n", + __func__); + return 1; + } + + droq = octeon_dev->droq[q_no]; + napi = &droq->napi; + netif_napi_add(netdev, napi, liquidio_napi_poll, 64); + + /* designate a CPU for this droq */ + droq->cpu_id = cpu_id; + cpu_id++; + if (cpu_id >= cpu_id_modulus) + cpu_id = 0; + + octeon_register_droq_ops(octeon_dev, q_no, &droq_ops); + } + + /* 23XX VF can send/recv control messages (via the first VF-owned + * droq) from the firmware even if the ethX interface is down, + * so that's why poll_mode must be off for the first droq. + */ + octeon_dev->droq[0]->ops.poll_mode = 0; + + /* set up IQs. 
*/ + for (q = 0; q < lio->linfo.num_txpciq; q++) { + num_tx_descs = CFG_GET_NUM_TX_DESCS_NIC_IF( + octeon_get_conf(octeon_dev), lio->ifidx); + retval = octeon_setup_iq(octeon_dev, ifidx, q, + lio->linfo.txpciq[q], num_tx_descs, + netdev_get_tx_queue(netdev, q)); + if (retval) { + dev_err(&octeon_dev->pci_dev->dev, + " %s : Runtime IQ(TxQ) creation failed.\n", + __func__); + return 1; + } + } + + return 0; +} + +/** + * \brief Net device open for LiquidIO + * @param netdev network device + */ +static int liquidio_open(struct net_device *netdev) +{ + struct lio *lio = GET_LIO(netdev); + struct octeon_device *oct = lio->oct_dev; + struct napi_struct *napi, *n; + + if (!oct->props[lio->ifidx].napi_enabled) { + list_for_each_entry_safe(napi, n, &netdev->napi_list, dev_list) + napi_enable(napi); + + oct->props[lio->ifidx].napi_enabled = 1; + + oct->droq[0]->ops.poll_mode = 1; + } + + ifstate_set(lio, LIO_IFSTATE_RUNNING); + + /* Ready for link status updates */ + lio->intf_open = 1; + + netif_info(lio, ifup, lio->netdev, "Interface Open, ready for traffic\n"); + start_txq(netdev); + + /* tell Octeon to start forwarding packets to host */ + send_rx_ctrl_cmd(lio, 1); + + dev_info(&oct->pci_dev->dev, "%s interface is opened\n", netdev->name); + + return 0; +} + +/** + * \brief Net device stop for LiquidIO + * @param netdev network device + */ +static int liquidio_stop(struct net_device *netdev) +{ + struct lio *lio = GET_LIO(netdev); + struct octeon_device *oct = lio->oct_dev; + + netif_info(lio, ifdown, lio->netdev, "Stopping interface!\n"); + /* Inform that netif carrier is down */ + lio->intf_open = 0; + lio->linfo.link.s.link_up = 0; + + netif_carrier_off(netdev); + lio->link_changes++; + + /* tell Octeon to stop forwarding packets to host */ + send_rx_ctrl_cmd(lio, 0); + + ifstate_reset(lio, LIO_IFSTATE_RUNNING); + + txqs_stop(netdev); + + dev_info(&oct->pci_dev->dev, "%s interface is stopped\n", netdev->name); + + return 0; +} + +/** + * \brief Converts a mask based 
on net device flags + * @param netdev network device + * + * This routine generates a octnet_ifflags mask from the net device flags + * received from the OS. + */ +static enum octnet_ifflags get_new_flags(struct net_device *netdev) +{ + enum octnet_ifflags f = OCTNET_IFFLAG_UNICAST; + + if (netdev->flags & IFF_PROMISC) + f |= OCTNET_IFFLAG_PROMISC; + + if (netdev->flags & IFF_ALLMULTI) + f |= OCTNET_IFFLAG_ALLMULTI; + + if (netdev->flags & IFF_MULTICAST) { + f |= OCTNET_IFFLAG_MULTICAST; + + /* Accept all multicast addresses if there are more than we + * can handle + */ + if (netdev_mc_count(netdev) > MAX_OCTEON_MULTICAST_ADDR) + f |= OCTNET_IFFLAG_ALLMULTI; + } + + if (netdev->flags & IFF_BROADCAST) + f |= OCTNET_IFFLAG_BROADCAST; + + return f; +} + +static void liquidio_set_uc_list(struct net_device *netdev) +{ + struct lio *lio = GET_LIO(netdev); + struct octeon_device *oct = lio->oct_dev; + struct octnic_ctrl_pkt nctrl; + struct netdev_hw_addr *ha; + u64 *mac; + + if (lio->netdev_uc_count == netdev_uc_count(netdev)) + return; + + if (netdev_uc_count(netdev) > MAX_NCTRL_UDD) { + dev_err(&oct->pci_dev->dev, "too many MAC addresses in netdev uc list\n"); + return; + } + + lio->netdev_uc_count = netdev_uc_count(netdev); + + memset(&nctrl, 0, sizeof(struct octnic_ctrl_pkt)); + nctrl.ncmd.s.cmd = OCTNET_CMD_SET_UC_LIST; + nctrl.ncmd.s.more = lio->netdev_uc_count; + nctrl.ncmd.s.param1 = oct->vf_num; + nctrl.iq_no = lio->linfo.txpciq[0].s.q_no; + nctrl.netpndev = (u64)netdev; + nctrl.cb_fn = liquidio_link_ctrl_cmd_completion; + + /* copy all the addresses into the udd */ + mac = &nctrl.udd[0]; + netdev_for_each_uc_addr(ha, netdev) { + ether_addr_copy(((u8 *)mac) + 2, ha->addr); + mac++; + } + + octnet_send_nic_ctrl_pkt(lio->oct_dev, &nctrl); +} + +/** + * \brief Net device set_multicast_list + * @param netdev network device + */ +static void liquidio_set_mcast_list(struct net_device *netdev) +{ + int mc_count = min(netdev_mc_count(netdev), MAX_OCTEON_MULTICAST_ADDR); 
+ struct lio *lio = GET_LIO(netdev); + struct octeon_device *oct = lio->oct_dev; + struct octnic_ctrl_pkt nctrl; + struct netdev_hw_addr *ha; + u64 *mc; + int ret; + + memset(&nctrl, 0, sizeof(struct octnic_ctrl_pkt)); + + /* Create a ctrl pkt command to be sent to core app. */ + nctrl.ncmd.u64 = 0; + nctrl.ncmd.s.cmd = OCTNET_CMD_SET_MULTI_LIST; + nctrl.ncmd.s.param1 = get_new_flags(netdev); + nctrl.ncmd.s.param2 = mc_count; + nctrl.ncmd.s.more = mc_count; + nctrl.netpndev = (u64)netdev; + nctrl.cb_fn = liquidio_link_ctrl_cmd_completion; + + /* copy all the addresses into the udd */ + mc = &nctrl.udd[0]; + netdev_for_each_mc_addr(ha, netdev) { + *mc = 0; + ether_addr_copy(((u8 *)mc) + 2, ha->addr); + /* no need to swap bytes */ + if (++mc > &nctrl.udd[mc_count]) + break; + } + + nctrl.iq_no = lio->linfo.txpciq[0].s.q_no; + + /* Apparently, any activity in this call from the kernel has to + * be atomic. So we won't wait for response. + */ + nctrl.wait_time = 0; + + ret = octnet_send_nic_ctrl_pkt(lio->oct_dev, &nctrl); + if (ret < 0) { + dev_err(&oct->pci_dev->dev, "DEVFLAGS change failed in core (ret: 0x%x)\n", + ret); + } + + liquidio_set_uc_list(netdev); +} + +/** + * \brief Net device set_mac_address + * @param netdev network device + */ +static int liquidio_set_mac(struct net_device *netdev, void *p) +{ + struct sockaddr *addr = (struct sockaddr *)p; + struct lio *lio = GET_LIO(netdev); + struct octeon_device *oct = lio->oct_dev; + struct octnic_ctrl_pkt nctrl; + int ret = 0; + + if (!is_valid_ether_addr(addr->sa_data)) + return -EADDRNOTAVAIL; + + if (ether_addr_equal(addr->sa_data, netdev->dev_addr)) + return 0; + + if (lio->linfo.macaddr_is_admin_asgnd) + return -EPERM; + + memset(&nctrl, 0, sizeof(struct octnic_ctrl_pkt)); + + nctrl.ncmd.u64 = 0; + nctrl.ncmd.s.cmd = OCTNET_CMD_CHANGE_MACADDR; + nctrl.ncmd.s.param1 = 0; + nctrl.ncmd.s.more = 1; + nctrl.iq_no = lio->linfo.txpciq[0].s.q_no; + nctrl.netpndev = (u64)netdev; + nctrl.cb_fn = 
liquidio_link_ctrl_cmd_completion; + nctrl.wait_time = 100; + + nctrl.udd[0] = 0; + /* The MAC Address is presented in network byte order. */ + ether_addr_copy((u8 *)&nctrl.udd[0] + 2, addr->sa_data); + + ret = octnet_send_nic_ctrl_pkt(lio->oct_dev, &nctrl); + if (ret < 0) { + dev_err(&oct->pci_dev->dev, "MAC Address change failed\n"); + return -ENOMEM; + } + memcpy(netdev->dev_addr, addr->sa_data, netdev->addr_len); + ether_addr_copy(((u8 *)&lio->linfo.hw_addr) + 2, addr->sa_data); + + return 0; +} + +/** + * \brief Net device change_mtu + * @param netdev network device + */ +static int liquidio_change_mtu(struct net_device *netdev, int new_mtu) +{ + struct lio *lio = GET_LIO(netdev); + struct octeon_device *oct = lio->oct_dev; + + lio->mtu = new_mtu; + + netif_info(lio, probe, lio->netdev, "MTU Changed from %d to %d\n", + netdev->mtu, new_mtu); + dev_info(&oct->pci_dev->dev, "%s MTU Changed from %d to %d\n", + netdev->name, netdev->mtu, new_mtu); + + netdev->mtu = new_mtu; + + return 0; +} + +/** \brief Transmit networks packets to the Octeon interface + * @param skbuff skbuff struct to be passed to network layer. 
+ * @param netdev pointer to network device + * @returns whether the packet was transmitted to the device okay or not + * (NETDEV_TX_OK or NETDEV_TX_BUSY) + */ +static int liquidio_xmit(struct sk_buff *skb, struct net_device *netdev) +{ + struct octnet_buf_free_info *finfo; + union octnic_cmd_setup cmdsetup; + struct octnic_data_pkt ndata; + struct octeon_instr_irh *irh; + struct oct_iq_stats *stats; + struct octeon_device *oct; + int q_idx = 0, iq_no = 0; + union tx_info *tx_info; + struct lio *lio; + int status = 0; + u64 dptr = 0; + u32 tag = 0; + int j; + + lio = GET_LIO(netdev); + oct = lio->oct_dev; + + if (netif_is_multiqueue(netdev)) { + q_idx = skb->queue_mapping; + q_idx = (q_idx % (lio->linfo.num_txpciq)); + tag = q_idx; + iq_no = lio->linfo.txpciq[q_idx].s.q_no; + } else { + iq_no = lio->txq; + } + + stats = &oct->instr_queue[iq_no]->stats; + + /* Check for all conditions in which the current packet cannot be + * transmitted. + */ + if (!(atomic_read(&lio->ifstate) & LIO_IFSTATE_RUNNING) || + (!lio->linfo.link.s.link_up) || (skb->len <= 0)) { + netif_info(lio, tx_err, lio->netdev, "Transmit failed link_status : %d\n", + lio->linfo.link.s.link_up); + goto lio_xmit_failed; + } + + /* Use space in skb->cb to store info used to unmap and + * free the buffers. + */ + finfo = (struct octnet_buf_free_info *)skb->cb; + finfo->lio = lio; + finfo->skb = skb; + finfo->sc = NULL; + + /* Prepare the attributes for the data to be passed to OSI. 
*/ + memset(&ndata, 0, sizeof(struct octnic_data_pkt)); + + ndata.buf = finfo; + + ndata.q_no = iq_no; + + if (netif_is_multiqueue(netdev)) { + if (octnet_iq_is_full(oct, ndata.q_no)) { + /* defer sending if queue is full */ + netif_info(lio, tx_err, lio->netdev, "Transmit failed iq:%d full\n", + ndata.q_no); + stats->tx_iq_busy++; + return NETDEV_TX_BUSY; + } + } else { + if (octnet_iq_is_full(oct, lio->txq)) { + /* defer sending if queue is full */ + stats->tx_iq_busy++; + netif_info(lio, tx_err, lio->netdev, "Transmit failed iq:%d full\n", + ndata.q_no); + return NETDEV_TX_BUSY; + } + } + + ndata.datasize = skb->len; + + cmdsetup.u64 = 0; + cmdsetup.s.iq_no = iq_no; + + if (skb->ip_summed == CHECKSUM_PARTIAL) + cmdsetup.s.transport_csum = 1; + + if (!skb_shinfo(skb)->nr_frags) { + cmdsetup.s.u.datasize = skb->len; + octnet_prepare_pci_cmd(oct, &ndata.cmd, &cmdsetup, tag); + /* Offload checksum calculation for TCP/UDP packets */ + dptr = dma_map_single(&oct->pci_dev->dev, + skb->data, + skb->len, + DMA_TO_DEVICE); + if (dma_mapping_error(&oct->pci_dev->dev, dptr)) { + dev_err(&oct->pci_dev->dev, "%s DMA mapping error 1\n", + __func__); + return NETDEV_TX_BUSY; + } + + ndata.cmd.cmd3.dptr = dptr; + finfo->dptr = dptr; + ndata.reqtype = REQTYPE_NORESP_NET; + + } else { + struct skb_frag_struct *frag; + struct octnic_gather *g; + int i, frags; + + spin_lock(&lio->glist_lock[q_idx]); + g = (struct octnic_gather *)list_delete_head( + &lio->glist[q_idx]); + spin_unlock(&lio->glist_lock[q_idx]); + + if (!g) { + netif_info(lio, tx_err, lio->netdev, + "Transmit scatter gather: glist null!\n"); + goto lio_xmit_failed; + } + + cmdsetup.s.gather = 1; + cmdsetup.s.u.gatherptrs = (skb_shinfo(skb)->nr_frags + 1); + octnet_prepare_pci_cmd(oct, &ndata.cmd, &cmdsetup, tag); + + memset(g->sg, 0, g->sg_size); + + g->sg[0].ptr[0] = dma_map_single(&oct->pci_dev->dev, + skb->data, + (skb->len - skb->data_len), + DMA_TO_DEVICE); + if (dma_mapping_error(&oct->pci_dev->dev, 
g->sg[0].ptr[0])) { + dev_err(&oct->pci_dev->dev, "%s DMA mapping error 2\n", + __func__); + return NETDEV_TX_BUSY; + } + add_sg_size(&g->sg[0], (skb->len - skb->data_len), 0); + + frags = skb_shinfo(skb)->nr_frags; + i = 1; + while (frags--) { + frag = &skb_shinfo(skb)->frags[i - 1]; + + g->sg[(i >> 2)].ptr[(i & 3)] = + dma_map_page(&oct->pci_dev->dev, + frag->page.p, + frag->page_offset, + frag->size, + DMA_TO_DEVICE); + if (dma_mapping_error(&oct->pci_dev->dev, + g->sg[i >> 2].ptr[i & 3])) { + dma_unmap_single(&oct->pci_dev->dev, + g->sg[0].ptr[0], + skb->len - skb->data_len, + DMA_TO_DEVICE); + for (j = 1; j < i; j++) { + frag = &skb_shinfo(skb)->frags[j - 1]; + dma_unmap_page(&oct->pci_dev->dev, + g->sg[j >> 2].ptr[j & 3], + frag->size, + DMA_TO_DEVICE); + } + dev_err(&oct->pci_dev->dev, "%s DMA mapping error 3\n", + __func__); + return NETDEV_TX_BUSY; + } + + add_sg_size(&g->sg[(i >> 2)], frag->size, (i & 3)); + i++; + } + + dptr = dma_map_single(&oct->pci_dev->dev, + g->sg, g->sg_size, + DMA_TO_DEVICE); + if (dma_mapping_error(&oct->pci_dev->dev, dptr)) { + dev_err(&oct->pci_dev->dev, "%s DMA mapping error 4\n", + __func__); + dma_unmap_single(&oct->pci_dev->dev, g->sg[0].ptr[0], + skb->len - skb->data_len, + DMA_TO_DEVICE); + for (j = 1; j <= frags; j++) { + frag = &skb_shinfo(skb)->frags[j - 1]; + dma_unmap_page(&oct->pci_dev->dev, + g->sg[j >> 2].ptr[j & 3], + frag->size, DMA_TO_DEVICE); + } + return NETDEV_TX_BUSY; + } + + ndata.cmd.cmd3.dptr = dptr; + finfo->dptr = dptr; + finfo->g = g; + + ndata.reqtype = REQTYPE_NORESP_NET_SG; + } + + irh = (struct octeon_instr_irh *)&ndata.cmd.cmd3.irh; + tx_info = (union tx_info *)&ndata.cmd.cmd3.ossp[0]; + + if (skb_shinfo(skb)->gso_size) { + tx_info->s.gso_size = skb_shinfo(skb)->gso_size; + tx_info->s.gso_segs = skb_shinfo(skb)->gso_segs; + } + + status = octnet_send_nic_data_pkt(oct, &ndata); + if (status == IQ_SEND_FAILED) + goto lio_xmit_failed; + + netif_info(lio, tx_queued, lio->netdev, "Transmit queued 
successfully\n"); + + if (status == IQ_SEND_STOP) { + dev_err(&oct->pci_dev->dev, "Rcvd IQ_SEND_STOP signal; stopping IQ-%d\n", + iq_no); + stop_q(lio->netdev, q_idx); + } + + netif_trans_update(netdev); + + if (skb_shinfo(skb)->gso_size) + stats->tx_done += skb_shinfo(skb)->gso_segs; + else + stats->tx_done++; + stats->tx_tot_bytes += skb->len; + + return NETDEV_TX_OK; + +lio_xmit_failed: + stats->tx_dropped++; + netif_info(lio, tx_err, lio->netdev, "IQ%d Transmit dropped:%llu\n", + iq_no, stats->tx_dropped); + if (dptr) + dma_unmap_single(&oct->pci_dev->dev, dptr, + ndata.datasize, DMA_TO_DEVICE); + tx_buffer_free(skb); + return NETDEV_TX_OK; +} + +/** \brief Network device Tx timeout + * @param netdev pointer to network device + */ +static void liquidio_tx_timeout(struct net_device *netdev) +{ + struct lio *lio; + + lio = GET_LIO(netdev); + + netif_info(lio, tx_err, lio->netdev, + "Transmit timeout tx_dropped:%ld, waking up queues now!!\n", + netdev->stats.tx_dropped); + netif_trans_update(netdev); + txqs_wake(netdev); +} + +/** Sending command to enable/disable RX checksum offload + * @param netdev pointer to network device + * @param command OCTNET_CMD_TNL_RX_CSUM_CTL + * @param rx_cmd_bit OCTNET_CMD_RXCSUM_ENABLE/ + * OCTNET_CMD_RXCSUM_DISABLE + * @returns SUCCESS or FAILURE + */ +static int liquidio_set_rxcsum_command(struct net_device *netdev, int command, + u8 rx_cmd) +{ + struct lio *lio = GET_LIO(netdev); + struct octeon_device *oct = lio->oct_dev; + struct octnic_ctrl_pkt nctrl; + int ret = 0; + + nctrl.ncmd.u64 = 0; + nctrl.ncmd.s.cmd = command; + nctrl.ncmd.s.param1 = rx_cmd; + nctrl.iq_no = lio->linfo.txpciq[0].s.q_no; + nctrl.wait_time = 100; + nctrl.netpndev = (u64)netdev; + nctrl.cb_fn = liquidio_link_ctrl_cmd_completion; + + ret = octnet_send_nic_ctrl_pkt(lio->oct_dev, &nctrl); + if (ret < 0) { + dev_err(&oct->pci_dev->dev, "DEVFLAGS RXCSUM change failed in core (ret:0x%x)\n", + ret); + } + return ret; +} + +/** \brief Net device fix features + * 
@param netdev pointer to network device + * @param request features requested + * @returns updated features list + */ +static netdev_features_t liquidio_fix_features(struct net_device *netdev, + netdev_features_t request) +{ + struct lio *lio = netdev_priv(netdev); + + if ((request & NETIF_F_RXCSUM) && + !(lio->dev_capability & NETIF_F_RXCSUM)) + request &= ~NETIF_F_RXCSUM; + + if ((request & NETIF_F_HW_CSUM) && + !(lio->dev_capability & NETIF_F_HW_CSUM)) + request &= ~NETIF_F_HW_CSUM; + + if ((request & NETIF_F_TSO) && !(lio->dev_capability & NETIF_F_TSO)) + request &= ~NETIF_F_TSO; + + if ((request & NETIF_F_TSO6) && !(lio->dev_capability & NETIF_F_TSO6)) + request &= ~NETIF_F_TSO6; + + if ((request & NETIF_F_LRO) && !(lio->dev_capability & NETIF_F_LRO)) + request &= ~NETIF_F_LRO; + + /* Disable LRO if RXCSUM is off */ + if (!(request & NETIF_F_RXCSUM) && (netdev->features & NETIF_F_LRO) && + (lio->dev_capability & NETIF_F_LRO)) + request &= ~NETIF_F_LRO; + + return request; +} + +/** \brief Net device set features + * @param netdev pointer to network device + * @param features features to enable/disable + */ +static int liquidio_set_features(struct net_device *netdev, + netdev_features_t features) +{ + struct lio *lio = netdev_priv(netdev); + + if (!((netdev->features ^ features) & NETIF_F_LRO)) + return 0; + + if ((features & NETIF_F_LRO) && (lio->dev_capability & NETIF_F_LRO)) + liquidio_set_feature(netdev, OCTNET_CMD_LRO_ENABLE, + OCTNIC_LROIPV4 | OCTNIC_LROIPV6); + else if (!(features & NETIF_F_LRO) && + (lio->dev_capability & NETIF_F_LRO)) + liquidio_set_feature(netdev, OCTNET_CMD_LRO_DISABLE, + OCTNIC_LROIPV4 | OCTNIC_LROIPV6); + if (!(netdev->features & NETIF_F_RXCSUM) && + (lio->enc_dev_capability & NETIF_F_RXCSUM) && + (features & NETIF_F_RXCSUM)) + liquidio_set_rxcsum_command(netdev, OCTNET_CMD_TNL_RX_CSUM_CTL, + OCTNET_CMD_RXCSUM_ENABLE); + else if ((netdev->features & NETIF_F_RXCSUM) && + (lio->enc_dev_capability & NETIF_F_RXCSUM) && + !(features & 
NETIF_F_RXCSUM)) + liquidio_set_rxcsum_command(netdev, OCTNET_CMD_TNL_RX_CSUM_CTL, + OCTNET_CMD_RXCSUM_DISABLE); + + return 0; +} + +static const struct net_device_ops lionetdevops = { + .ndo_open = liquidio_open, + .ndo_stop = liquidio_stop, + .ndo_start_xmit = liquidio_xmit, + .ndo_set_mac_address = liquidio_set_mac, + .ndo_set_rx_mode = liquidio_set_mcast_list, + .ndo_tx_timeout = liquidio_tx_timeout, + .ndo_change_mtu = liquidio_change_mtu, + .ndo_fix_features = liquidio_fix_features, + .ndo_set_features = liquidio_set_features, + .ndo_select_queue = select_q, +}; + +static int lio_nic_info(struct octeon_recv_info *recv_info, void *buf) +{ + struct octeon_device *oct = (struct octeon_device *)buf; + struct octeon_recv_pkt *recv_pkt = recv_info->recv_pkt; + union oct_link_status *ls; + int gmxport = 0; + int i; + + if (recv_pkt->buffer_size[0] != sizeof(*ls)) { + dev_err(&oct->pci_dev->dev, "Malformed NIC_INFO, len=%d, ifidx=%d\n", + recv_pkt->buffer_size[0], + recv_pkt->rh.r_nic_info.gmxport); + goto nic_info_err; + } + + gmxport = recv_pkt->rh.r_nic_info.gmxport; + ls = (union oct_link_status *)get_rbd(recv_pkt->buffer_ptr[0]); + + octeon_swap_8B_data((u64 *)ls, (sizeof(union oct_link_status)) >> 3); + + for (i = 0; i < oct->ifcount; i++) { + if (oct->props[i].gmxport == gmxport) { + update_link_status(oct->props[i].netdev, ls); + break; + } + } + +nic_info_err: + for (i = 0; i < recv_pkt->buffer_count; i++) + recv_buffer_free(recv_pkt->buffer_ptr[i]); + octeon_free_recv_info(recv_info); + return 0; +} + +/** + * \brief Setup network interfaces + * @param octeon_dev octeon device + * + * Called during init time for each device. It assumes the NIC + * is already up and running. The link information for each + * interface is passed in link_info. 
+ */ +static int setup_nic_devices(struct octeon_device *octeon_dev) +{ + int retval, num_iqueues, num_oqueues; + struct liquidio_if_cfg_context *ctx; + u32 resp_size, ctx_size, data_size; + struct liquidio_if_cfg_resp *resp; + struct octeon_soft_command *sc; + union oct_nic_if_cfg if_cfg; + struct octdev_props *props; + struct net_device *netdev; + struct lio_version *vdata; + struct lio *lio = NULL; + u8 mac[ETH_ALEN], i, j; + u32 ifidx_or_pfnum; + + ifidx_or_pfnum = octeon_dev->pf_num; + + /* This is to handle link status changes */ + octeon_register_dispatch_fn(octeon_dev, OPCODE_NIC, OPCODE_NIC_INFO, + lio_nic_info, octeon_dev); + + /* REQTYPE_RESP_NET and REQTYPE_SOFT_COMMAND do not have free functions. + * They are handled directly. + */ + octeon_register_reqtype_free_fn(octeon_dev, REQTYPE_NORESP_NET, + free_netbuf); + + octeon_register_reqtype_free_fn(octeon_dev, REQTYPE_NORESP_NET_SG, + free_netsgbuf); + + octeon_register_reqtype_free_fn(octeon_dev, REQTYPE_RESP_NET_SG, + free_netsgbuf_with_resp); + + for (i = 0; i < octeon_dev->ifcount; i++) { + resp_size = sizeof(struct liquidio_if_cfg_resp); + ctx_size = sizeof(struct liquidio_if_cfg_context); + data_size = sizeof(struct lio_version); + sc = (struct octeon_soft_command *) + octeon_alloc_soft_command(octeon_dev, data_size, + resp_size, ctx_size); + resp = (struct liquidio_if_cfg_resp *)sc->virtrptr; + ctx = (struct liquidio_if_cfg_context *)sc->ctxptr; + vdata = (struct lio_version *)sc->virtdptr; + + *((u64 *)vdata) = 0; + vdata->major = cpu_to_be16(LIQUIDIO_BASE_MAJOR_VERSION); + vdata->minor = cpu_to_be16(LIQUIDIO_BASE_MINOR_VERSION); + vdata->micro = cpu_to_be16(LIQUIDIO_BASE_MICRO_VERSION); + + WRITE_ONCE(ctx->cond, 0); + ctx->octeon_id = lio_get_device_id(octeon_dev); + init_waitqueue_head(&ctx->wc); + + if_cfg.u64 = 0; + + if_cfg.s.num_iqueues = octeon_dev->sriov_info.rings_per_vf; + if_cfg.s.num_oqueues = octeon_dev->sriov_info.rings_per_vf; + if_cfg.s.base_queue = 0; + + sc->iq_no = 0; + + 
octeon_prepare_soft_command(octeon_dev, sc, OPCODE_NIC, + OPCODE_NIC_IF_CFG, 0, if_cfg.u64, + 0); + + sc->callback = if_cfg_callback; + sc->callback_arg = sc; + sc->wait_time = 5000; + + retval = octeon_send_soft_command(octeon_dev, sc); + if (retval == IQ_SEND_FAILED) { + dev_err(&octeon_dev->pci_dev->dev, + "iq/oq config failed status: %x\n", retval); + /* Soft instr is freed by driver in case of failure. */ + goto setup_nic_dev_fail; + } + + /* Sleep on a wait queue till the cond flag indicates that the + * response arrived or timed-out. + */ + if (sleep_cond(&ctx->wc, &ctx->cond) == -EINTR) { + dev_err(&octeon_dev->pci_dev->dev, "Wait interrupted\n"); + goto setup_nic_wait_intr; + } + + retval = resp->status; + if (retval) { + dev_err(&octeon_dev->pci_dev->dev, "iq/oq config failed\n"); + goto setup_nic_dev_fail; + } + + octeon_swap_8B_data((u64 *)(&resp->cfg_info), + (sizeof(struct liquidio_if_cfg_info)) >> 3); + + num_iqueues = hweight64(resp->cfg_info.iqmask); + num_oqueues = hweight64(resp->cfg_info.oqmask); + + if (!(num_iqueues) || !(num_oqueues)) { + dev_err(&octeon_dev->pci_dev->dev, + "Got bad iqueues (%016llx) or oqueues (%016llx) from firmware.\n", + resp->cfg_info.iqmask, resp->cfg_info.oqmask); + goto setup_nic_dev_fail; + } + dev_dbg(&octeon_dev->pci_dev->dev, + "interface %d, iqmask %016llx, oqmask %016llx, numiqueues %d, numoqueues %d\n", + i, resp->cfg_info.iqmask, resp->cfg_info.oqmask, + num_iqueues, num_oqueues); + + netdev = alloc_etherdev_mq(LIO_SIZE, num_iqueues); + + if (!netdev) { + dev_err(&octeon_dev->pci_dev->dev, "Device allocation failed\n"); + goto setup_nic_dev_fail; + } + + SET_NETDEV_DEV(netdev, &octeon_dev->pci_dev->dev); + + /* Associate the routines that will handle different + * netdev tasks. 
+ */ + netdev->netdev_ops = &lionetdevops; + + lio = GET_LIO(netdev); + + memset(lio, 0, sizeof(struct lio)); + + lio->ifidx = ifidx_or_pfnum; + + props = &octeon_dev->props[i]; + props->gmxport = resp->cfg_info.linfo.gmxport; + props->netdev = netdev; + + lio->linfo.num_rxpciq = num_oqueues; + lio->linfo.num_txpciq = num_iqueues; + + for (j = 0; j < num_oqueues; j++) { + lio->linfo.rxpciq[j].u64 = + resp->cfg_info.linfo.rxpciq[j].u64; + } + for (j = 0; j < num_iqueues; j++) { + lio->linfo.txpciq[j].u64 = + resp->cfg_info.linfo.txpciq[j].u64; + } + + lio->linfo.hw_addr = resp->cfg_info.linfo.hw_addr; + lio->linfo.gmxport = resp->cfg_info.linfo.gmxport; + lio->linfo.link.u64 = resp->cfg_info.linfo.link.u64; + lio->linfo.macaddr_is_admin_asgnd = + resp->cfg_info.linfo.macaddr_is_admin_asgnd; + + lio->msg_enable = netif_msg_init(debug, DEFAULT_MSG_ENABLE); + + lio->dev_capability = NETIF_F_HIGHDMA + | NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM + | NETIF_F_SG | NETIF_F_RXCSUM + | NETIF_F_TSO | NETIF_F_TSO6 + | NETIF_F_GRO + | NETIF_F_LRO; + netif_set_gso_max_size(netdev, OCTNIC_GSO_MAX_SIZE); + + netdev->features = (lio->dev_capability & ~NETIF_F_LRO); + + netdev->hw_features = lio->dev_capability; + + /* MTU range: 68 - 16000 */ + netdev->min_mtu = LIO_MIN_MTU_SIZE; + netdev->max_mtu = LIO_MAX_MTU_SIZE; + + /* Point to the properties for octeon device to which this + * interface belongs. 
+ */ + lio->oct_dev = octeon_dev; + lio->octprops = props; + lio->netdev = netdev; + + dev_dbg(&octeon_dev->pci_dev->dev, + "if%d gmx: %d hw_addr: 0x%llx\n", i, + lio->linfo.gmxport, CVM_CAST64(lio->linfo.hw_addr)); + + /* 64-bit swap required on LE machines */ + octeon_swap_8B_data(&lio->linfo.hw_addr, 1); + for (j = 0; j < ETH_ALEN; j++) + mac[j] = *((u8 *)(((u8 *)&lio->linfo.hw_addr) + 2 + j)); + + /* Copy MAC Address to OS network device structure */ + ether_addr_copy(netdev->dev_addr, mac); + + if (setup_io_queues(octeon_dev, i)) { + dev_err(&octeon_dev->pci_dev->dev, "I/O queues creation failed\n"); + goto setup_nic_dev_fail; + } + + ifstate_set(lio, LIO_IFSTATE_DROQ_OPS); + + /* For VFs, enable Octeon device interrupts here, + * as this is contingent upon IO queue setup + */ + octeon_dev->fn_list.enable_interrupt(octeon_dev, + OCTEON_ALL_INTR); + + /* By default all interfaces on a single Octeon uses the same + * tx and rx queues + */ + lio->txq = lio->linfo.txpciq[0].s.q_no; + lio->rxq = lio->linfo.rxpciq[0].s.q_no; + + lio->tx_qsize = octeon_get_tx_qsize(octeon_dev, lio->txq); + lio->rx_qsize = octeon_get_rx_qsize(octeon_dev, lio->rxq); + + if (setup_glists(lio, num_iqueues)) { + dev_err(&octeon_dev->pci_dev->dev, + "Gather list allocation failed\n"); + goto setup_nic_dev_fail; + } + + if (netdev->features & NETIF_F_LRO) + liquidio_set_feature(netdev, OCTNET_CMD_LRO_ENABLE, + OCTNIC_LROIPV4 | OCTNIC_LROIPV6); + + if ((debug != -1) && (debug & NETIF_MSG_HW)) + liquidio_set_feature(netdev, OCTNET_CMD_VERBOSE_ENABLE, + 0); + + if (setup_link_status_change_wq(netdev)) + goto setup_nic_dev_fail; + + /* Register the network device with the OS */ + if (register_netdev(netdev)) { + dev_err(&octeon_dev->pci_dev->dev, "Device registration failed\n"); + goto setup_nic_dev_fail; + } + + dev_dbg(&octeon_dev->pci_dev->dev, + "Setup NIC ifidx:%d mac:%02x%02x%02x%02x%02x%02x\n", + i, mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]); + netif_carrier_off(netdev); + 
lio->link_changes++; + + ifstate_set(lio, LIO_IFSTATE_REGISTERED); + + /* Sending command to firmware to enable Rx checksum offload + * by default at the time of setup of Liquidio driver for + * this device + */ + liquidio_set_rxcsum_command(netdev, OCTNET_CMD_TNL_RX_CSUM_CTL, + OCTNET_CMD_RXCSUM_ENABLE); + liquidio_set_feature(netdev, OCTNET_CMD_TNL_TX_CSUM_CTL, + OCTNET_CMD_TXCSUM_ENABLE); + + dev_dbg(&octeon_dev->pci_dev->dev, + "NIC ifidx:%d Setup successful\n", i); + + octeon_free_soft_command(octeon_dev, sc); + } + + return 0; + +setup_nic_dev_fail: + + octeon_free_soft_command(octeon_dev, sc); + +setup_nic_wait_intr: + + while (i--) { + dev_err(&octeon_dev->pci_dev->dev, + "NIC ifidx:%d Setup failed\n", i); + liquidio_destroy_nic_device(octeon_dev, i); + } + return -ENODEV; +} + +/** + * \brief initialize the NIC + * @param oct octeon device + * + * This initialization routine is called once the Octeon device application is + * up and running + */ +static int liquidio_init_nic_module(struct octeon_device *oct) +{ + int num_nic_ports = 1; + int i, retval = 0; + + dev_dbg(&oct->pci_dev->dev, "Initializing network interfaces\n"); + + /* only default iq and oq were initialized + * initialize the rest as well run port_config command for each port + */ + oct->ifcount = num_nic_ports; + memset(oct->props, 0, + sizeof(struct octdev_props) * num_nic_ports); + + for (i = 0; i < MAX_OCTEON_LINKS; i++) + oct->props[i].gmxport = -1; + + retval = setup_nic_devices(oct); + if (retval) { + dev_err(&oct->pci_dev->dev, "Setup NIC devices failed\n"); + goto octnet_init_failure; + } + +octnet_init_failure: + + oct->ifcount = 0; + + return retval; +} + /** * \brief Device initialization for each Octeon device that is probed * @param octeon_dev octeon device @@ -498,6 +2563,8 @@ static int octeon_device_init(struct octeon_device *oct) atomic_set(&oct->status, OCT_DEV_PCI_MAP_DONE); + oct->app_mode = CVM_DRV_NIC_APP; + /* Initialize the dispatch mechanism used to push packets 
arriving on * Octeon Output queues. */ @@ -594,6 +2661,9 @@ static int octeon_device_init(struct octeon_device *oct) atomic_set(&oct->status, OCT_DEV_RUNNING); + if (liquidio_init_nic_module(oct)) + return 1; + return 0; } diff --git a/drivers/net/ethernet/cavium/liquidio/liquidio_common.h b/drivers/net/ethernet/cavium/liquidio/liquidio_common.h index f308ee49a754..ba329f6ca779 100644 --- a/drivers/net/ethernet/cavium/liquidio/liquidio_common.h +++ b/drivers/net/ethernet/cavium/liquidio/liquidio_common.h @@ -212,6 +212,7 @@ static inline void add_sg_size(struct octeon_sg_entry *sg_entry, #define OCTNET_CMD_ID_ACTIVE 0x1a +#define OCTNET_CMD_SET_UC_LIST 0x1b #define OCTNET_CMD_SET_VF_LINKSTATE 0x1c #define OCTNET_CMD_VXLAN_PORT_ADD 0x0 #define OCTNET_CMD_VXLAN_PORT_DEL 0x1 diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_device.c b/drivers/net/ethernet/cavium/liquidio/octeon_device.c index 6d54032b10ab..a8df493a5012 100644 --- a/drivers/net/ethernet/cavium/liquidio/octeon_device.c +++ b/drivers/net/ethernet/cavium/liquidio/octeon_device.c @@ -1221,6 +1221,9 @@ struct octeon_config *octeon_get_conf(struct octeon_device *oct) } else if (OCTEON_CN23XX_PF(oct)) { default_oct_conf = (struct octeon_config *) (CHIP_CONF(oct, cn23xx_pf)); + } else if (OCTEON_CN23XX_VF(oct)) { + default_oct_conf = (struct octeon_config *) + (CHIP_CONF(oct, cn23xx_vf)); } return default_oct_conf; } @@ -1371,7 +1374,7 @@ void lio_enable_irq(struct octeon_droq *droq, struct octeon_instr_queue *iq) /*write resend. Writing RESEND in SLI_PKTX_CNTS should be enough *to trigger tx interrupts as well, if they are pending. */ - if (oct && OCTEON_CN23XX_PF(oct)) { + if (oct && (OCTEON_CN23XX_PF(oct) || OCTEON_CN23XX_VF(oct))) { if (droq) writeq(CN23XX_INTR_RESEND, droq->pkts_sent_reg); /*we race with firmrware here. 
read and write the IN_DONE_CNTS*/ diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_droq.c b/drivers/net/ethernet/cavium/liquidio/octeon_droq.c index 8bf1ac76bcdc..0be87d119a97 100644 --- a/drivers/net/ethernet/cavium/liquidio/octeon_droq.c +++ b/drivers/net/ethernet/cavium/liquidio/octeon_droq.c @@ -28,6 +28,7 @@ #include "cn66xx_regs.h" #include "cn66xx_device.h" #include "cn23xx_pf_device.h" +#include "cn23xx_vf_device.h" struct niclist { struct list_head list; @@ -261,6 +262,11 @@ int octeon_init_droq(struct octeon_device *oct, c_pkts_per_intr = (u32)CFG_GET_OQ_PKTS_PER_INTR(conf23); c_refill_threshold = (u32)CFG_GET_OQ_REFILL_THRESHOLD(conf23); + } else if (OCTEON_CN23XX_VF(oct)) { + struct octeon_config *conf23 = CHIP_CONF(oct, cn23xx_vf); + + c_pkts_per_intr = (u32)CFG_GET_OQ_PKTS_PER_INTR(conf23); + c_refill_threshold = (u32)CFG_GET_OQ_REFILL_THRESHOLD(conf23); } else { return 1; } @@ -889,6 +895,10 @@ octeon_process_droq_poll_cmd(struct octeon_device *oct, u32 q_no, int cmd, lio_enable_irq(oct->droq[q_no], oct->instr_queue[q_no]); } break; + + case OCTEON_CN23XX_VF_VID: + lio_enable_irq(oct->droq[q_no], oct->instr_queue[q_no]); + break; } return 0; } diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_network.h b/drivers/net/ethernet/cavium/liquidio/octeon_network.h index e94edc841cad..6bb89419006e 100644 --- a/drivers/net/ethernet/cavium/liquidio/octeon_network.h +++ b/drivers/net/ethernet/cavium/liquidio/octeon_network.h @@ -123,6 +123,7 @@ struct lio { /* work queue for link status */ struct cavium_wq link_status_wq; + int netdev_uc_count; }; #define LIO_SIZE (sizeof(struct lio)) diff --git a/drivers/net/ethernet/cavium/liquidio/request_manager.c b/drivers/net/ethernet/cavium/liquidio/request_manager.c index ea2b7e46631d..3ce66759e80a 100644 --- a/drivers/net/ethernet/cavium/liquidio/request_manager.c +++ b/drivers/net/ethernet/cavium/liquidio/request_manager.c @@ -394,7 +394,7 @@ lio_process_iq_request_list(struct octeon_device *oct, case 
REQTYPE_SOFT_COMMAND: sc = buf; - if (OCTEON_CN23XX_PF(oct)) + if (OCTEON_CN23XX_PF(oct) || OCTEON_CN23XX_VF(oct)) irh = (struct octeon_instr_irh *) &sc->cmd.cmd3.irh; else @@ -607,7 +607,7 @@ octeon_prepare_soft_command(struct octeon_device *oct, oct_cfg = octeon_get_conf(oct); - if (OCTEON_CN23XX_PF(oct)) { + if (OCTEON_CN23XX_PF(oct) || OCTEON_CN23XX_VF(oct)) { ih3 = (struct octeon_instr_ih3 *)&sc->cmd.cmd3.ih3; ih3->pkind = oct->instr_queue[sc->iq_no]->txpciq.s.pkind; @@ -700,7 +700,7 @@ int octeon_send_soft_command(struct octeon_device *oct, struct octeon_instr_irh *irh; u32 len; - if (OCTEON_CN23XX_PF(oct)) { + if (OCTEON_CN23XX_PF(oct) || OCTEON_CN23XX_VF(oct)) { ih3 = (struct octeon_instr_ih3 *)&sc->cmd.cmd3.ih3; if (ih3->dlengsz) { WARN_ON(!sc->dmadptr); diff --git a/drivers/net/ethernet/cavium/liquidio/response_manager.c b/drivers/net/ethernet/cavium/liquidio/response_manager.c index fdaf742a59cb..2fbaae96b505 100644 --- a/drivers/net/ethernet/cavium/liquidio/response_manager.c +++ b/drivers/net/ethernet/cavium/liquidio/response_manager.c @@ -84,7 +84,8 @@ int lio_process_ordered_list(struct octeon_device *octeon_dev, sc = (struct octeon_soft_command *)ordered_sc_list-> head.next; - if (OCTEON_CN23XX_PF(octeon_dev)) { + if (OCTEON_CN23XX_PF(octeon_dev) || + OCTEON_CN23XX_VF(octeon_dev)) { rdp = (struct octeon_instr_rdp *)&sc->cmd.cmd3.rdp; rptr = sc->cmd.cmd3.rptr; } else { diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index 449884f8dd67..48113c6609db 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -134,24 +134,6 @@ MODULE_FIRMWARE(FW5_FNAME); MODULE_FIRMWARE(FW6_FNAME); /* - * Normally we're willing to become the firmware's Master PF but will be happy - * if another PF has already become the Master and initialized the adapter. 
- * Setting "force_init" will cause this driver to forcibly establish itself as - * the Master PF and initialize the adapter. - */ -static uint force_init; - -module_param(force_init, uint, 0644); -MODULE_PARM_DESC(force_init, "Forcibly become Master PF and initialize adapter," - "deprecated parameter"); - -static int dflt_msg_enable = DFLT_MSG_ENABLE; - -module_param(dflt_msg_enable, int, 0644); -MODULE_PARM_DESC(dflt_msg_enable, "Chelsio T4 default message enable bitmap, " - "deprecated parameter"); - -/* * The driver uses the best interrupt scheme available on a platform in the * order MSI-X, MSI, legacy INTx interrupts. This parameter determines which * of these schemes the driver may consider as follows: @@ -179,16 +161,6 @@ MODULE_PARM_DESC(msi, "whether to use INTx (0), MSI (1) or MSI-X (2)"); */ static int rx_dma_offset = 2; -#ifdef CONFIG_PCI_IOV -/* Configure the number of PCI-E Virtual Function which are to be instantiated - * on SR-IOV Capable Physical Functions. - */ -static unsigned int num_vf[NUM_OF_PF_WITH_SRIOV]; - -module_param_array(num_vf, uint, NULL, 0644); -MODULE_PARM_DESC(num_vf, "number of VFs for each of PFs 0-3, deprecated parameter - please use the pci sysfs interface."); -#endif - /* TX Queue select used to determine what algorithm to use for selecting TX * queue. 
Select between the kernel provided function (select_queue=0) or user * cxgb_select_queue function (select_queue=1) @@ -4729,7 +4701,7 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) adapter->name = pci_name(pdev); adapter->mbox = func; adapter->pf = func; - adapter->msg_enable = dflt_msg_enable; + adapter->msg_enable = DFLT_MSG_ENABLE; memset(adapter->chan_map, 0xff, sizeof(adapter->chan_map)); spin_lock_init(&adapter->stats_lock); @@ -4988,17 +4960,6 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) sriov: #ifdef CONFIG_PCI_IOV - if (func < ARRAY_SIZE(num_vf) && num_vf[func] > 0) { - dev_warn(&pdev->dev, - "Enabling SR-IOV VFs using the num_vf module " - "parameter is deprecated - please use the pci sysfs " - "interface instead.\n"); - if (pci_enable_sriov(pdev, num_vf[func]) == 0) - dev_info(&pdev->dev, - "instantiated %u virtual functions\n", - num_vf[func]); - } - adapter = kzalloc(sizeof(*adapter), GFP_KERNEL); if (!adapter) { err = -ENOMEM; diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c b/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c index 5d4da0e8acaa..fa43e06d3a29 100644 --- a/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c @@ -70,13 +70,6 @@ NETIF_MSG_TIMER | NETIF_MSG_IFDOWN | NETIF_MSG_IFUP |\ NETIF_MSG_RX_ERR | NETIF_MSG_TX_ERR) -static int dflt_msg_enable = DFLT_MSG_ENABLE; - -module_param(dflt_msg_enable, int, 0644); -MODULE_PARM_DESC(dflt_msg_enable, - "default adapter ethtool message level bitmap, " - "deprecated parameter"); - /* * The driver uses the best interrupt scheme available on a platform in the * order MSI-X then MSI. This parameter determines which of these schemes the @@ -2891,7 +2884,7 @@ static int cxgb4vf_pci_probe(struct pci_dev *pdev, * Initialize adapter level features. 
*/ adapter->name = pci_name(pdev); - adapter->msg_enable = dflt_msg_enable; + adapter->msg_enable = DFLT_MSG_ENABLE; err = adap_init0(adapter); if (err) goto err_unmap_bar; diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index 5e5b259dd2cc..e05e22705cf7 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -3578,9 +3578,10 @@ static int mvneta_stop(struct net_device *dev) mvneta_stop_dev(pp); mvneta_mdio_remove(pp); - cpuhp_state_remove_instance_nocalls(online_hpstate, &pp->node_online); - cpuhp_state_remove_instance_nocalls(CPUHP_NET_MVNETA_DEAD, - &pp->node_dead); + cpuhp_state_remove_instance_nocalls(online_hpstate, + &pp->node_online); + cpuhp_state_remove_instance_nocalls(CPUHP_NET_MVNETA_DEAD, + &pp->node_dead); on_each_cpu(mvneta_percpu_disable, pp, true); free_percpu_irq(dev->irq, pp->ports); } else { diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index 49a81f1fc1d6..bcd955339058 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -51,6 +51,9 @@ #include "mlx4_en.h" #include "en_port.h" +#define MLX4_EN_MAX_XDP_MTU ((int)(PAGE_SIZE - ETH_HLEN - (2 * VLAN_HLEN) - \ + XDP_PACKET_HEADROOM)) + int mlx4_en_setup_tc(struct net_device *dev, u8 up) { struct mlx4_en_priv *priv = netdev_priv(dev); @@ -2249,6 +2252,19 @@ void mlx4_en_destroy_netdev(struct net_device *dev) free_netdev(dev); } +static bool mlx4_en_check_xdp_mtu(struct net_device *dev, int mtu) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + + if (mtu > MLX4_EN_MAX_XDP_MTU) { + en_err(priv, "mtu:%d > max:%d when XDP prog is attached\n", + mtu, MLX4_EN_MAX_XDP_MTU); + return false; + } + + return true; +} + static int mlx4_en_change_mtu(struct net_device *dev, int new_mtu) { struct mlx4_en_priv *priv = netdev_priv(dev); @@ -2258,11 +2274,10 @@ static int mlx4_en_change_mtu(struct net_device 
*dev, int new_mtu) en_dbg(DRV, priv, "Change MTU called - current:%d new:%d\n", dev->mtu, new_mtu); - if (priv->tx_ring_num[TX_XDP] && MLX4_EN_EFF_MTU(new_mtu) > FRAG_SZ0) { - en_err(priv, "MTU size:%d requires frags but XDP running\n", - new_mtu); - return -EOPNOTSUPP; - } + if (priv->tx_ring_num[TX_XDP] && + !mlx4_en_check_xdp_mtu(dev, new_mtu)) + return -EOPNOTSUPP; + dev->mtu = new_mtu; if (netif_running(dev)) { @@ -2710,10 +2725,8 @@ static int mlx4_xdp_set(struct net_device *dev, struct bpf_prog *prog) return 0; } - if (priv->num_frags > 1) { - en_err(priv, "Cannot set XDP if MTU requires multiple frags\n"); + if (!mlx4_en_check_xdp_mtu(dev, dev->mtu)) return -EOPNOTSUPP; - } tmp = kzalloc(sizeof(*tmp), GFP_KERNEL); if (!tmp) diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index 6562f78b07f4..3c37e216bbf3 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -96,7 +96,6 @@ static int mlx4_en_alloc_frags(struct mlx4_en_priv *priv, struct mlx4_en_rx_alloc page_alloc[MLX4_EN_MAX_RX_FRAGS]; const struct mlx4_en_frag_info *frag_info; struct page *page; - dma_addr_t dma; int i; for (i = 0; i < priv->num_frags; i++) { @@ -115,9 +114,10 @@ static int mlx4_en_alloc_frags(struct mlx4_en_priv *priv, for (i = 0; i < priv->num_frags; i++) { frags[i] = ring_alloc[i]; - dma = ring_alloc[i].dma + ring_alloc[i].page_offset; + frags[i].page_offset += priv->frag_info[i].rx_headroom; + rx_desc->data[i].addr = cpu_to_be64(frags[i].dma + + frags[i].page_offset); ring_alloc[i] = page_alloc[i]; - rx_desc->data[i].addr = cpu_to_be64(dma); } return 0; @@ -250,7 +250,8 @@ static int mlx4_en_prepare_rx_desc(struct mlx4_en_priv *priv, if (ring->page_cache.index > 0) { frags[0] = ring->page_cache.buf[--ring->page_cache.index]; - rx_desc->data[0].addr = cpu_to_be64(frags[0].dma); + rx_desc->data[0].addr = cpu_to_be64(frags[0].dma + + frags[0].page_offset); return 0; } @@ -889,6 +890,7
@@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud if (xdp_prog) { struct xdp_buff xdp; dma_addr_t dma; + void *orig_data; u32 act; dma = be64_to_cpu(rx_desc->data[0].addr); @@ -896,11 +898,19 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud priv->frag_info[0].frag_size, DMA_FROM_DEVICE); - xdp.data = page_address(frags[0].page) + - frags[0].page_offset; + xdp.data_hard_start = page_address(frags[0].page); + xdp.data = xdp.data_hard_start + frags[0].page_offset; xdp.data_end = xdp.data + length; + orig_data = xdp.data; act = bpf_prog_run_xdp(xdp_prog, &xdp); + + if (xdp.data != orig_data) { + length = xdp.data_end - xdp.data; + frags[0].page_offset = xdp.data - + xdp.data_hard_start; + } + switch (act) { case XDP_PASS: break; @@ -1164,37 +1174,41 @@ static const int frag_sizes[] = { void mlx4_en_calc_rx_buf(struct net_device *dev) { - enum dma_data_direction dma_dir = PCI_DMA_FROMDEVICE; struct mlx4_en_priv *priv = netdev_priv(dev); int eff_mtu = MLX4_EN_EFF_MTU(dev->mtu); - int order = MLX4_EN_ALLOC_PREFER_ORDER; - u32 align = SMP_CACHE_BYTES; - int buf_size = 0; int i = 0; /* bpf requires buffers to be set up as 1 packet per page. * This only works when num_frags == 1. */ if (priv->tx_ring_num[TX_XDP]) { - dma_dir = PCI_DMA_BIDIRECTIONAL; - /* This will gain efficient xdp frame recycling at the expense - * of more costly truesize accounting + priv->frag_info[0].order = 0; + priv->frag_info[0].frag_size = eff_mtu; + priv->frag_info[0].frag_prefix_size = 0; + /* This will gain efficient xdp frame recycling at the + * expense of more costly truesize accounting */ - align = PAGE_SIZE; - order = 0; - } - - while (buf_size < eff_mtu) { - priv->frag_info[i].order = order; - priv->frag_info[i].frag_size = - (eff_mtu > buf_size + frag_sizes[i]) ? 
- frag_sizes[i] : eff_mtu - buf_size; - priv->frag_info[i].frag_prefix_size = buf_size; - priv->frag_info[i].frag_stride = - ALIGN(priv->frag_info[i].frag_size, align); - priv->frag_info[i].dma_dir = dma_dir; - buf_size += priv->frag_info[i].frag_size; - i++; + priv->frag_info[0].frag_stride = PAGE_SIZE; + priv->frag_info[0].dma_dir = PCI_DMA_BIDIRECTIONAL; + priv->frag_info[0].rx_headroom = XDP_PACKET_HEADROOM; + i = 1; + } else { + int buf_size = 0; + + while (buf_size < eff_mtu) { + priv->frag_info[i].order = MLX4_EN_ALLOC_PREFER_ORDER; + priv->frag_info[i].frag_size = + (eff_mtu > buf_size + frag_sizes[i]) ? + frag_sizes[i] : eff_mtu - buf_size; + priv->frag_info[i].frag_prefix_size = buf_size; + priv->frag_info[i].frag_stride = + ALIGN(priv->frag_info[i].frag_size, + SMP_CACHE_BYTES); + priv->frag_info[i].dma_dir = PCI_DMA_FROMDEVICE; + priv->frag_info[i].rx_headroom = 0; + buf_size += priv->frag_info[i].frag_size; + i++; + } } priv->num_frags = i; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c index 4b597dca5c52..5886ad78058f 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c @@ -354,7 +354,7 @@ u32 mlx4_en_recycle_tx_desc(struct mlx4_en_priv *priv, struct mlx4_en_rx_alloc frame = { .page = tx_info->page, .dma = tx_info->map0_dma, - .page_offset = 0, + .page_offset = XDP_PACKET_HEADROOM, .page_size = PAGE_SIZE, }; @@ -1132,7 +1132,7 @@ netdev_tx_t mlx4_en_xmit_frame(struct mlx4_en_rx_ring *rx_ring, tx_info->page = frame->page; frame->page = NULL; tx_info->map0_dma = dma; - tx_info->map0_byte_count = length; + tx_info->map0_byte_count = PAGE_SIZE; tx_info->nr_txbb = nr_txbb; tx_info->nr_bytes = max_t(unsigned int, length, ETH_ZLEN); tx_info->data_offset = (void *)data - (void *)tx_desc; @@ -1141,9 +1141,10 @@ netdev_tx_t mlx4_en_xmit_frame(struct mlx4_en_rx_ring *rx_ring, tx_info->linear = 1; tx_info->inl = 0; - dma_sync_single_for_device(priv->ddev, 
dma, length, PCI_DMA_TODEVICE); + dma_sync_single_range_for_device(priv->ddev, dma, frame->page_offset, + length, PCI_DMA_TODEVICE); - data->addr = cpu_to_be64(dma); + data->addr = cpu_to_be64(dma + frame->page_offset); data->lkey = ring->mr_key; dma_wmb(); data->byte_count = cpu_to_be32(length); diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h index 20a936428f4a..ba1c6cd0cc79 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h @@ -475,7 +475,8 @@ struct mlx4_en_frag_info { u16 frag_prefix_size; u32 frag_stride; enum dma_data_direction dma_dir; - int order; + u16 order; + u16 rx_headroom; }; #ifdef CONFIG_MLX4_EN_DCB diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 07020276fe73..cbfa38fc72c0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -3183,6 +3183,11 @@ static int mlx5e_xdp_set(struct net_device *netdev, struct bpf_prog *prog) bool reset, was_opened; int i; + if (prog && prog->xdp_adjust_head) { + netdev_err(netdev, "Does not support bpf_xdp_adjust_head()\n"); + return -EOPNOTSUPP; + } + mutex_lock(&priv->state_lock); if ((netdev->features & NETIF_F_LRO) && prog) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index f07ef8c7da55..f8829b517156 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -31,6 +31,7 @@ */ #include <net/flow_dissector.h> +#include <net/sch_generic.h> #include <net/pkt_cls.h> #include <net/tc_act/tc_gact.h> #include <net/tc_act/tc_skbedit.h> @@ -363,7 +364,18 @@ static int __parse_cls_flower(struct mlx5e_priv *priv, skb_flow_dissector_target(f->dissector, FLOW_DISSECTOR_KEY_CONTROL, f->key); + + struct flow_dissector_key_control *mask = + 
skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_CONTROL, + f->mask); addr_type = key->addr_type; + + if (mask->flags & FLOW_DIS_IS_FRAGMENT) { + MLX5_SET(fte_match_set_lyr_2_4, headers_c, frag, 1); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, frag, + key->flags & FLOW_DIS_IS_FRAGMENT); + } } if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_BASIC)) { diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 00d9a03be31d..e8d448109e03 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -2946,6 +2946,10 @@ static int nfp_net_xdp_setup(struct nfp_net *nn, struct bpf_prog *prog) }; int err; + if (prog && prog->xdp_adjust_head) { + nn_err(nn, "Does not support bpf_xdp_adjust_head()\n"); + return -EOPNOTSUPP; + } if (!prog && !nn->xdp_prog) return 0; if (prog && nn->xdp_prog) { diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c index cf1dd1436d93..aecdd1c5c0ea 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_main.c +++ b/drivers/net/ethernet/qlogic/qede/qede_main.c @@ -2507,6 +2507,11 @@ static int qede_xdp_set(struct qede_dev *edev, struct bpf_prog *prog) { struct qede_reload_args args; + if (prog && prog->xdp_adjust_head) { + DP_ERR(edev, "Does not support bpf_xdp_adjust_head()\n"); + return -EOPNOTSUPP; + } + /* If we're called, there was already a bpf reference increment */ args.func = &qede_xdp_reload_func; args.u.new_prog = prog; diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h index 3ced2e1703c1..b13a144f72ad 100644 --- a/drivers/net/ethernet/stmicro/stmmac/common.h +++ b/drivers/net/ethernet/stmicro/stmmac/common.h @@ -412,8 +412,8 @@ extern const struct stmmac_desc_ops ndesc_ops; struct stmmac_dma_ops { /* DMA core initialization */ int (*reset)(void __iomem *ioaddr); - void (*init)(void __iomem 
*ioaddr, int pbl, int fb, int mb, - int aal, u32 dma_tx, u32 dma_rx, int atds); + void (*init)(void __iomem *ioaddr, struct stmmac_dma_cfg *dma_cfg, + u32 dma_tx, u32 dma_rx, int atds); /* Configure the AXI Bus Mode Register */ void (*axi)(void __iomem *ioaddr, struct stmmac_axi *axi); /* Dump DMA registers */ diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-generic.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-generic.c index e6e6c2fcc4b7..3304095c934c 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-generic.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-generic.c @@ -71,9 +71,12 @@ err_remove_config_dt: static const struct of_device_id dwmac_generic_match[] = { { .compatible = "st,spear600-gmac"}, + { .compatible = "snps,dwmac-3.50a"}, { .compatible = "snps,dwmac-3.610"}, { .compatible = "snps,dwmac-3.70a"}, { .compatible = "snps,dwmac-3.710"}, + { .compatible = "snps,dwmac-4.00"}, + { .compatible = "snps,dwmac-4.10a"}, { .compatible = "snps,dwmac"}, { } }; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000.h b/drivers/net/ethernet/stmicro/stmmac/dwmac1000.h index ff3e5ab39bd0..52b9407a8a39 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000.h +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000.h @@ -225,7 +225,7 @@ enum rx_tx_priority_ratio { #define DMA_BUS_MODE_FB 0x00010000 /* Fixed burst */ #define DMA_BUS_MODE_MB 0x04000000 /* Mixed burst */ -#define DMA_BUS_MODE_RPBL_MASK 0x003e0000 /* Rx-Programmable Burst Len */ +#define DMA_BUS_MODE_RPBL_MASK 0x007e0000 /* Rx-Programmable Burst Len */ #define DMA_BUS_MODE_RPBL_SHIFT 17 #define DMA_BUS_MODE_USP 0x00800000 #define DMA_BUS_MODE_MAXPBL 0x01000000 diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c index f35385266fbf..612d3aaac9a4 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c @@ -84,37 +84,39 @@ static void dwmac1000_dma_axi(void __iomem 
*ioaddr, struct stmmac_axi *axi) writel(value, ioaddr + DMA_AXI_BUS_MODE); } -static void dwmac1000_dma_init(void __iomem *ioaddr, int pbl, int fb, int mb, - int aal, u32 dma_tx, u32 dma_rx, int atds) +static void dwmac1000_dma_init(void __iomem *ioaddr, + struct stmmac_dma_cfg *dma_cfg, + u32 dma_tx, u32 dma_rx, int atds) { u32 value = readl(ioaddr + DMA_BUS_MODE); + int txpbl = dma_cfg->txpbl ?: dma_cfg->pbl; + int rxpbl = dma_cfg->rxpbl ?: dma_cfg->pbl; /* * Set the DMA PBL (Programmable Burst Length) mode. * * Note: before stmmac core 3.50 this mode bit was 4xPBL, and * post 3.5 mode bit acts as 8*PBL. - * - * This configuration doesn't take care about the Separate PBL - * so only the bits: 13-8 are programmed with the PBL passed from the - * platform. */ - value |= DMA_BUS_MODE_MAXPBL; - value &= ~DMA_BUS_MODE_PBL_MASK; - value |= (pbl << DMA_BUS_MODE_PBL_SHIFT); + if (dma_cfg->pblx8) + value |= DMA_BUS_MODE_MAXPBL; + value |= DMA_BUS_MODE_USP; + value &= ~(DMA_BUS_MODE_PBL_MASK | DMA_BUS_MODE_RPBL_MASK); + value |= (txpbl << DMA_BUS_MODE_PBL_SHIFT); + value |= (rxpbl << DMA_BUS_MODE_RPBL_SHIFT); /* Set the Fixed burst mode */ - if (fb) + if (dma_cfg->fixed_burst) value |= DMA_BUS_MODE_FB; /* Mixed Burst has no effect when fb is set */ - if (mb) + if (dma_cfg->mixed_burst) value |= DMA_BUS_MODE_MB; if (atds) value |= DMA_BUS_MODE_ATDS; - if (aal) + if (dma_cfg->aal) value |= DMA_BUS_MODE_AAL; writel(value, ioaddr + DMA_BUS_MODE); diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac100_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwmac100_dma.c index 61f54c99a7de..e5664da382f3 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac100_dma.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac100_dma.c @@ -32,11 +32,12 @@ #include "dwmac100.h" #include "dwmac_dma.h" -static void dwmac100_dma_init(void __iomem *ioaddr, int pbl, int fb, int mb, - int aal, u32 dma_tx, u32 dma_rx, int atds) +static void dwmac100_dma_init(void __iomem *ioaddr, + struct stmmac_dma_cfg 
*dma_cfg, + u32 dma_tx, u32 dma_rx, int atds) { /* Enable Application Access by writing to DMA CSR0 */ - writel(DMA_BUS_MODE_DEFAULT | (pbl << DMA_BUS_MODE_PBL_SHIFT), + writel(DMA_BUS_MODE_DEFAULT | (dma_cfg->pbl << DMA_BUS_MODE_PBL_SHIFT), ioaddr + DMA_BUS_MODE); /* Mask interrupts by writing to CSR7 */ diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c index e81b6e565c29..8196ab5fc33c 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c @@ -71,25 +71,29 @@ static void dwmac4_dma_axi(void __iomem *ioaddr, struct stmmac_axi *axi) writel(value, ioaddr + DMA_SYS_BUS_MODE); } -static void dwmac4_dma_init_channel(void __iomem *ioaddr, int pbl, +static void dwmac4_dma_init_channel(void __iomem *ioaddr, + struct stmmac_dma_cfg *dma_cfg, u32 dma_tx_phy, u32 dma_rx_phy, u32 channel) { u32 value; + int txpbl = dma_cfg->txpbl ?: dma_cfg->pbl; + int rxpbl = dma_cfg->rxpbl ?: dma_cfg->pbl; /* set PBL for each channels. 
Currently we affect same configuration * on each channel */ value = readl(ioaddr + DMA_CHAN_CONTROL(channel)); - value = value | DMA_BUS_MODE_PBL; + if (dma_cfg->pblx8) + value = value | DMA_BUS_MODE_PBL; writel(value, ioaddr + DMA_CHAN_CONTROL(channel)); value = readl(ioaddr + DMA_CHAN_TX_CONTROL(channel)); - value = value | (pbl << DMA_BUS_MODE_PBL_SHIFT); + value = value | (txpbl << DMA_BUS_MODE_PBL_SHIFT); writel(value, ioaddr + DMA_CHAN_TX_CONTROL(channel)); value = readl(ioaddr + DMA_CHAN_RX_CONTROL(channel)); - value = value | (pbl << DMA_BUS_MODE_RPBL_SHIFT); + value = value | (rxpbl << DMA_BUS_MODE_RPBL_SHIFT); writel(value, ioaddr + DMA_CHAN_RX_CONTROL(channel)); /* Mask interrupts by writing to CSR7 */ @@ -99,27 +103,28 @@ static void dwmac4_dma_init_channel(void __iomem *ioaddr, int pbl, writel(dma_rx_phy, ioaddr + DMA_CHAN_RX_BASE_ADDR(channel)); } -static void dwmac4_dma_init(void __iomem *ioaddr, int pbl, int fb, int mb, - int aal, u32 dma_tx, u32 dma_rx, int atds) +static void dwmac4_dma_init(void __iomem *ioaddr, + struct stmmac_dma_cfg *dma_cfg, + u32 dma_tx, u32 dma_rx, int atds) { u32 value = readl(ioaddr + DMA_SYS_BUS_MODE); int i; /* Set the Fixed burst mode */ - if (fb) + if (dma_cfg->fixed_burst) value |= DMA_SYS_BUS_FB; /* Mixed Burst has no effect when fb is set */ - if (mb) + if (dma_cfg->mixed_burst) value |= DMA_SYS_BUS_MB; - if (aal) + if (dma_cfg->aal) value |= DMA_SYS_BUS_AAL; writel(value, ioaddr + DMA_SYS_BUS_MODE); for (i = 0; i < DMA_CHANNEL_NB_MAX; i++) - dwmac4_dma_init_channel(ioaddr, pbl, dma_tx, dma_rx, i); + dwmac4_dma_init_channel(ioaddr, dma_cfg, dma_tx, dma_rx, i); } static void _dwmac4_dump_dma_regs(void __iomem *ioaddr, u32 channel) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c index 6ab7e2bdcadd..699ee1d30426 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c @@ -263,7 
+263,7 @@ static void stmmac_ethtool_getdrvinfo(struct net_device *dev, { struct stmmac_priv *priv = netdev_priv(dev); - if (priv->plat->has_gmac) + if (priv->plat->has_gmac || priv->plat->has_gmac4) strlcpy(info->driver, GMAC_ETHTOOL_NAME, sizeof(info->driver)); else strlcpy(info->driver, MAC100_ETHTOOL_NAME, @@ -446,7 +446,7 @@ static void stmmac_ethtool_gregs(struct net_device *dev, memset(reg_space, 0x0, REG_SPACE_SIZE); - if (!priv->plat->has_gmac) { + if (!(priv->plat->has_gmac || priv->plat->has_gmac4)) { /* MAC registers */ for (i = 0; i < 12; i++) reg_space[i] = readl(priv->ioaddr + (i * 4)); diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 982c95213da4..b5188122bc15 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -1578,16 +1578,12 @@ static void stmmac_check_ether_addr(struct stmmac_priv *priv) */ static int stmmac_init_dma_engine(struct stmmac_priv *priv) { - int pbl = DEFAULT_DMA_PBL, fixed_burst = 0, aal = 0; - int mixed_burst = 0; int atds = 0; int ret = 0; - if (priv->plat->dma_cfg) { - pbl = priv->plat->dma_cfg->pbl; - fixed_burst = priv->plat->dma_cfg->fixed_burst; - mixed_burst = priv->plat->dma_cfg->mixed_burst; - aal = priv->plat->dma_cfg->aal; + if (!priv->plat->dma_cfg || !priv->plat->dma_cfg->pbl) { + dev_err(priv->device, "Invalid DMA configuration\n"); + return -EINVAL; } if (priv->extend_desc && (priv->mode == STMMAC_RING_MODE)) @@ -1599,8 +1595,8 @@ static int stmmac_init_dma_engine(struct stmmac_priv *priv) return ret; } - priv->hw->dma->init(priv->ioaddr, pbl, fixed_burst, mixed_burst, - aal, priv->dma_tx_phy, priv->dma_rx_phy, atds); + priv->hw->dma->init(priv->ioaddr, priv->plat->dma_cfg, + priv->dma_tx_phy, priv->dma_rx_phy, atds); if (priv->synopsys_id >= DWMAC_CORE_4_00) { priv->rx_tail_addr = priv->dma_rx_phy + diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c 
b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c index 56c8a2342c14..a2831773431a 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c @@ -81,6 +81,7 @@ static void stmmac_default_data(struct plat_stmmacenet_data *plat) plat->mdio_bus_data->phy_mask = 0; plat->dma_cfg->pbl = 32; + plat->dma_cfg->pblx8 = true; /* TODO: AXI */ /* Set default value for multicast hash bins */ @@ -115,6 +116,7 @@ static int quark_default_data(struct plat_stmmacenet_data *plat, plat->mdio_bus_data->phy_mask = 0; plat->dma_cfg->pbl = 16; + plat->dma_cfg->pblx8 = true; plat->dma_cfg->fixed_burst = 1; /* AXI (TODO) */ diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c index e528e7126b65..082cd48db6a7 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c @@ -292,6 +292,7 @@ stmmac_probe_config_dt(struct platform_device *pdev, const char **mac) if (of_device_is_compatible(np, "snps,dwmac-4.00") || of_device_is_compatible(np, "snps,dwmac-4.10a")) { plat->has_gmac4 = 1; + plat->has_gmac = 0; plat->pmt = 1; plat->tso_en = of_property_read_bool(np, "snps,tso"); } @@ -303,21 +304,25 @@ stmmac_probe_config_dt(struct platform_device *pdev, const char **mac) plat->force_sf_dma_mode = 1; } - if (of_find_property(np, "snps,pbl", NULL)) { - dma_cfg = devm_kzalloc(&pdev->dev, sizeof(*dma_cfg), - GFP_KERNEL); - if (!dma_cfg) { - stmmac_remove_config_dt(pdev, plat); - return ERR_PTR(-ENOMEM); - } - plat->dma_cfg = dma_cfg; - of_property_read_u32(np, "snps,pbl", &dma_cfg->pbl); - dma_cfg->aal = of_property_read_bool(np, "snps,aal"); - dma_cfg->fixed_burst = - of_property_read_bool(np, "snps,fixed-burst"); - dma_cfg->mixed_burst = - of_property_read_bool(np, "snps,mixed-burst"); + dma_cfg = devm_kzalloc(&pdev->dev, sizeof(*dma_cfg), + GFP_KERNEL); + if (!dma_cfg) { + stmmac_remove_config_dt(pdev, 
plat); + return ERR_PTR(-ENOMEM); } + plat->dma_cfg = dma_cfg; + + of_property_read_u32(np, "snps,pbl", &dma_cfg->pbl); + if (!dma_cfg->pbl) + dma_cfg->pbl = DEFAULT_DMA_PBL; + of_property_read_u32(np, "snps,txpbl", &dma_cfg->txpbl); + of_property_read_u32(np, "snps,rxpbl", &dma_cfg->rxpbl); + dma_cfg->pblx8 = !of_property_read_bool(np, "snps,no-pbl-x8"); + + dma_cfg->aal = of_property_read_bool(np, "snps,aal"); + dma_cfg->fixed_burst = of_property_read_bool(np, "snps,fixed-burst"); + dma_cfg->mixed_burst = of_property_read_bool(np, "snps,mixed-burst"); + plat->force_thresh_dma_mode = of_property_read_bool(np, "snps,force_thresh_dma_mode"); if (plat->force_thresh_dma_mode) { plat->force_sf_dma_mode = 0; diff --git a/drivers/net/ethernet/xilinx/ll_temac_main.c b/drivers/net/ethernet/xilinx/ll_temac_main.c index bcd7b76dde9f..d73da8afe08e 100644 --- a/drivers/net/ethernet/xilinx/ll_temac_main.c +++ b/drivers/net/ethernet/xilinx/ll_temac_main.c @@ -37,6 +37,7 @@ #include <linux/of_device.h> #include <linux/of_irq.h> #include <linux/of_mdio.h> +#include <linux/of_net.h> #include <linux/of_platform.h> #include <linux/of_address.h> #include <linux/skbuff.h> @@ -332,7 +333,7 @@ static void temac_do_set_mac_address(struct net_device *ndev) mutex_unlock(&lp->indirect_mutex); } -static int temac_init_mac_address(struct net_device *ndev, void *address) +static int temac_init_mac_address(struct net_device *ndev, const void *address) { memcpy(ndev->dev_addr, address, ETH_ALEN); if (!is_valid_ether_addr(ndev->dev_addr)) @@ -982,7 +983,7 @@ static int temac_of_probe(struct platform_device *op) struct net_device *ndev; const void *addr; __be32 *p; - int size, rc = 0; + int rc = 0; /* Init network device structure */ ndev = alloc_etherdev(sizeof(*lp)); @@ -1074,13 +1075,13 @@ static int temac_of_probe(struct platform_device *op) /* Retrieve the MAC address */ - addr = of_get_property(op->dev.of_node, "local-mac-address", &size); - if ((!addr) || (size != 6)) { + addr = 
of_get_mac_address(op->dev.of_node); + if (!addr) { dev_err(&op->dev, "could not find MAC address\n"); rc = -ENODEV; goto err_iounmap_2; } - temac_init_mac_address(ndev, (void *)addr); + temac_init_mac_address(ndev, addr); rc = temac_mdio_setup(lp, op->dev.of_node); if (rc) diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c index c9c8a3be9f1b..b96e96919e31 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c @@ -25,6 +25,7 @@ #include <linux/module.h> #include <linux/netdevice.h> #include <linux/of_mdio.h> +#include <linux/of_net.h> #include <linux/of_platform.h> #include <linux/of_irq.h> #include <linux/of_address.h> @@ -292,7 +293,8 @@ out: * This function is called to initialize the MAC address of the Axi Ethernet * core. It writes to the UAW0 and UAW1 registers of the core. */ -static void axienet_set_mac_address(struct net_device *ndev, void *address) +static void axienet_set_mac_address(struct net_device *ndev, + const void *address) { struct axienet_local *lp = netdev_priv(ndev); @@ -1456,7 +1458,7 @@ static int axienet_probe(struct platform_device *pdev) struct device_node *np; struct axienet_local *lp; struct net_device *ndev; - u8 mac_addr[6]; + const void *mac_addr; struct resource *ethres, dmares; u32 value; @@ -1567,13 +1569,13 @@ static int axienet_probe(struct platform_device *pdev) } /* Retrieve the MAC address */ - ret = of_property_read_u8_array(pdev->dev.of_node, - "local-mac-address", mac_addr, 6); - if (ret) { + mac_addr = of_get_mac_address(pdev->dev.of_node); + if (!mac_addr) { dev_err(&pdev->dev, "could not find MAC address\n"); + ret = -ENODEV; goto free_netdev; } - axienet_set_mac_address(ndev, (void *)mac_addr); + axienet_set_mac_address(ndev, mac_addr); lp->coalesce_count_rx = XAXIDMA_DFT_RX_THRESHOLD; lp->coalesce_count_tx = XAXIDMA_DFT_TX_THRESHOLD; diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index
cc00eb0db5d2..f83cf6696820 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -3346,19 +3346,18 @@ static struct net *macsec_get_link_net(const struct net_device *dev) static size_t macsec_get_size(const struct net_device *dev) { - return 0 + - nla_total_size_64bit(8) + /* SCI */ - nla_total_size(1) + /* ICV_LEN */ - nla_total_size_64bit(8) + /* CIPHER_SUITE */ - nla_total_size(4) + /* WINDOW */ - nla_total_size(1) + /* ENCODING_SA */ - nla_total_size(1) + /* ENCRYPT */ - nla_total_size(1) + /* PROTECT */ - nla_total_size(1) + /* INC_SCI */ - nla_total_size(1) + /* ES */ - nla_total_size(1) + /* SCB */ - nla_total_size(1) + /* REPLAY_PROTECT */ - nla_total_size(1) + /* VALIDATION */ + return nla_total_size_64bit(8) + /* IFLA_MACSEC_SCI */ + nla_total_size(1) + /* IFLA_MACSEC_ICV_LEN */ + nla_total_size_64bit(8) + /* IFLA_MACSEC_CIPHER_SUITE */ + nla_total_size(4) + /* IFLA_MACSEC_WINDOW */ + nla_total_size(1) + /* IFLA_MACSEC_ENCODING_SA */ + nla_total_size(1) + /* IFLA_MACSEC_ENCRYPT */ + nla_total_size(1) + /* IFLA_MACSEC_PROTECT */ + nla_total_size(1) + /* IFLA_MACSEC_INC_SCI */ + nla_total_size(1) + /* IFLA_MACSEC_ES */ + nla_total_size(1) + /* IFLA_MACSEC_SCB */ + nla_total_size(1) + /* IFLA_MACSEC_REPLAY_PROTECT */ + nla_total_size(1) + /* IFLA_MACSEC_VALIDATION */ 0; } diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index aeaf1bcb12d0..32fa7c76f29c 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -235,6 +235,53 @@ int phy_register_fixup_for_id(const char *bus_id, } EXPORT_SYMBOL(phy_register_fixup_for_id); +/** + * phy_unregister_fixup - remove a phy_fixup from the list + * @bus_id: A string matches fixup->bus_id (or PHY_ANY_ID) in phy_fixup_list + * @phy_uid: A phy id matches fixup->phy_id (or PHY_ANY_UID) in phy_fixup_list + * @phy_uid_mask: Applied to phy_uid and fixup->phy_uid before comparison + */ +int phy_unregister_fixup(const char *bus_id, u32 phy_uid, u32 phy_uid_mask) +{ + struct 
list_head *pos, *n; + struct phy_fixup *fixup; + int ret; + + ret = -ENODEV; + + mutex_lock(&phy_fixup_lock); + list_for_each_safe(pos, n, &phy_fixup_list) { + fixup = list_entry(pos, struct phy_fixup, list); + + if ((!strcmp(fixup->bus_id, bus_id)) && + ((fixup->phy_uid & phy_uid_mask) == + (phy_uid & phy_uid_mask))) { + list_del(&fixup->list); + kfree(fixup); + ret = 0; + break; + } + } + mutex_unlock(&phy_fixup_lock); + + return ret; +} +EXPORT_SYMBOL(phy_unregister_fixup); + +/* Unregisters a fixup of any PHY with the UID in phy_uid */ +int phy_unregister_fixup_for_uid(u32 phy_uid, u32 phy_uid_mask) +{ + return phy_unregister_fixup(PHY_ANY_ID, phy_uid, phy_uid_mask); +} +EXPORT_SYMBOL(phy_unregister_fixup_for_uid); + +/* Unregisters a fixup of the PHY with id string bus_id */ +int phy_unregister_fixup_for_id(const char *bus_id) +{ + return phy_unregister_fixup(bus_id, PHY_ANY_UID, 0xffffffff); +} +EXPORT_SYMBOL(phy_unregister_fixup_for_id); + /* Returns 1 if fixup matches phydev in bus_id and phy_uid. * Fixups can be set to match any in one or more fields. */ diff --git a/drivers/net/usb/Kconfig b/drivers/net/usb/Kconfig index cdde59089f72..3dd490f53e48 100644 --- a/drivers/net/usb/Kconfig +++ b/drivers/net/usb/Kconfig @@ -114,6 +114,11 @@ config USB_LAN78XX help This option adds support for Microchip LAN78XX based USB 2 & USB 3 10/100/1000 Ethernet adapters. + LAN7800 : USB 3 to 10/100/1000 Ethernet adapter + LAN7850 : USB 2 to 10/100/1000 Ethernet adapter + LAN7801 : USB 3 to 10/100/1000 Ethernet adapter (MAC only) + + Proper PHY driver is required for LAN7801. To compile this driver as a module, choose M here: the module will be called lan78xx. 
diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index 019f758953fc..08f8703e4d54 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -40,7 +40,7 @@ #define DRIVER_AUTHOR "WOOJUNG HUH <woojung.huh@microchip.com>" #define DRIVER_DESC "LAN78XX USB 3.0 Gigabit Ethernet Devices" #define DRIVER_NAME "lan78xx" -#define DRIVER_VERSION "1.0.5" +#define DRIVER_VERSION "1.0.6" #define TX_TIMEOUT_JIFFIES (5 * HZ) #define THROTTLE_JIFFIES (HZ / 8) @@ -67,6 +67,7 @@ #define LAN78XX_USB_VENDOR_ID (0x0424) #define LAN7800_USB_PRODUCT_ID (0x7800) #define LAN7850_USB_PRODUCT_ID (0x7850) +#define LAN7801_USB_PRODUCT_ID (0x7801) #define LAN78XX_EEPROM_MAGIC (0x78A5) #define LAN78XX_OTP_MAGIC (0x78F3) @@ -390,6 +391,7 @@ struct lan78xx_net { u32 chipid; u32 chiprev; struct mii_bus *mdiobus; + phy_interface_t interface; int fc_autoneg; u8 fc_request_control; @@ -400,6 +402,10 @@ struct lan78xx_net { struct irq_domain_data domain_data; }; +/* define external phy id */ +#define PHY_LAN8835 (0x0007C130) +#define PHY_KSZ9031RNX (0x00221620) + /* use ethtool to change the level for any given device */ static int msg_level = -1; module_param(msg_level, int, 0); @@ -1697,6 +1703,7 @@ static int lan78xx_mdiobus_read(struct mii_bus *bus, int phy_id, int idx) done: mutex_unlock(&dev->phy_mutex); usb_autopm_put_interface(dev->intf); + return ret; } @@ -1759,6 +1766,10 @@ static int lan78xx_mdio_init(struct lan78xx_net *dev) /* set to internal PHY id */ dev->mdiobus->phy_mask = ~(1 << 1); break; + case ID_REV_CHIP_ID_7801_: + /* scan thru PHYAD[2..0] */ + dev->mdiobus->phy_mask = ~(0xFF); + break; } ret = mdiobus_register(dev->mdiobus); @@ -1933,6 +1944,47 @@ static void lan78xx_remove_irq_domain(struct lan78xx_net *dev) dev->domain_data.irqdomain = NULL; } +static int lan8835_fixup(struct phy_device *phydev) +{ + int buf; + int ret; + struct lan78xx_net *dev = netdev_priv(phydev->attached_dev); + + /* LED2/PME_N/IRQ_N/RGMII_ID pin to IRQ_N mode */ + buf = 
phy_read_mmd_indirect(phydev, 0x8010, 3); + buf &= ~0x1800; + buf |= 0x0800; + phy_write_mmd_indirect(phydev, 0x8010, 3, buf); + + /* RGMII MAC TXC Delay Enable */ + ret = lan78xx_write_reg(dev, MAC_RGMII_ID, + MAC_RGMII_ID_TXC_DELAY_EN_); + + /* RGMII TX DLL Tune Adjust */ + ret = lan78xx_write_reg(dev, RGMII_TX_BYP_DLL, 0x3D00); + + dev->interface = PHY_INTERFACE_MODE_RGMII_TXID; + + return 1; +} + +static int ksz9031rnx_fixup(struct phy_device *phydev) +{ + struct lan78xx_net *dev = netdev_priv(phydev->attached_dev); + + /* Micrel KSZ9031RNX PHY configuration */ + /* RGMII Control Signal Pad Skew */ + phy_write_mmd_indirect(phydev, 4, 2, 0x0077); + /* RGMII RX Data Pad Skew */ + phy_write_mmd_indirect(phydev, 5, 2, 0x7777); + /* RGMII RX Clock Pad Skew */ + phy_write_mmd_indirect(phydev, 8, 2, 0x1FF); + + dev->interface = PHY_INTERFACE_MODE_RGMII_RXID; + + return 1; +} + static int lan78xx_phy_init(struct lan78xx_net *dev) { int ret; @@ -1945,6 +1997,42 @@ static int lan78xx_phy_init(struct lan78xx_net *dev) return -EIO; } + if ((dev->chipid == ID_REV_CHIP_ID_7800_) || + (dev->chipid == ID_REV_CHIP_ID_7850_)) { + phydev->is_internal = true; + dev->interface = PHY_INTERFACE_MODE_GMII; + + } else if (dev->chipid == ID_REV_CHIP_ID_7801_) { + if (!phydev->drv) { + netdev_err(dev->net, "no PHY driver found\n"); + return -EIO; + } + + dev->interface = PHY_INTERFACE_MODE_RGMII; + + /* external PHY fixup for KSZ9031RNX */ + ret = phy_register_fixup_for_uid(PHY_KSZ9031RNX, 0xfffffff0, + ksz9031rnx_fixup); + if (ret < 0) { + netdev_err(dev->net, "fail to register fixup\n"); + return ret; + } + /* external PHY fixup for LAN8835 */ + ret = phy_register_fixup_for_uid(PHY_LAN8835, 0xfffffff0, + lan8835_fixup); + if (ret < 0) { + netdev_err(dev->net, "fail to register fixup\n"); + return ret; + } + /* add more external PHY fixup here if needed */ + + phydev->is_internal = false; + } else { + netdev_err(dev->net, "unknown ID found\n"); + ret = -EIO; + goto error; + } + /* if phyirq
is not set, use polling mode in phylib */ if (dev->domain_data.phyirq > 0) phydev->irq = dev->domain_data.phyirq; @@ -1957,7 +2045,7 @@ static int lan78xx_phy_init(struct lan78xx_net *dev) ret = phy_connect_direct(dev->net, phydev, lan78xx_link_status_change, - PHY_INTERFACE_MODE_GMII); + dev->interface); if (ret) { netdev_err(dev->net, "can't attach PHY to %s\n", dev->mdiobus->id); @@ -1982,6 +2070,12 @@ static int lan78xx_phy_init(struct lan78xx_net *dev) netif_dbg(dev, ifup, dev->net, "phy initialised successfully"); return 0; + +error: + phy_unregister_fixup_for_uid(PHY_KSZ9031RNX, 0xfffffff0); + phy_unregister_fixup_for_uid(PHY_LAN8835, 0xfffffff0); + + return ret; } static int lan78xx_set_rx_max_frame_length(struct lan78xx_net *dev, int size) @@ -2338,6 +2432,9 @@ static int lan78xx_reset(struct lan78xx_net *dev) } while ((buf & PMT_CTL_PHY_RST_) || !(buf & PMT_CTL_READY_)); ret = lan78xx_read_reg(dev, MAC_CR, &buf); + /* LAN7801 only has RGMII mode */ + if (dev->chipid == ID_REV_CHIP_ID_7801_) + buf &= ~MAC_CR_GMII_EN_; buf |= MAC_CR_AUTO_DUPLEX_ | MAC_CR_AUTO_SPEED_; ret = lan78xx_write_reg(dev, MAC_CR, buf); @@ -2464,8 +2561,12 @@ static int lan78xx_stop(struct net_device *net) if (timer_pending(&dev->stat_monitor)) del_timer_sync(&dev->stat_monitor); + phy_unregister_fixup_for_uid(PHY_KSZ9031RNX, 0xfffffff0); + phy_unregister_fixup_for_uid(PHY_LAN8835, 0xfffffff0); + phy_stop(net->phydev); phy_disconnect(net->phydev); + net->phydev = NULL; clear_bit(EVENT_DEV_OPEN, &dev->flags); @@ -3888,6 +3989,10 @@ static const struct usb_device_id products[] = { /* LAN7850 USB Gigabit Ethernet Device */ USB_DEVICE(LAN78XX_USB_VENDOR_ID, LAN7850_USB_PRODUCT_ID), }, + { + /* LAN7801 USB Gigabit Ethernet Device */ + USB_DEVICE(LAN78XX_USB_VENDOR_ID, LAN7801_USB_PRODUCT_ID), + }, {}, }; MODULE_DEVICE_TABLE(usb, products); diff --git a/drivers/net/usb/lan78xx.h b/drivers/net/usb/lan78xx.h index 40927906109a..25aa54611774 100644 --- a/drivers/net/usb/lan78xx.h +++ 
b/drivers/net/usb/lan78xx.h @@ -108,6 +108,7 @@ #define ID_REV_CHIP_REV_MASK_ (0x0000FFFF) #define ID_REV_CHIP_ID_7800_ (0x7800) #define ID_REV_CHIP_ID_7850_ (0x7850) +#define ID_REV_CHIP_ID_7801_ (0x7801) #define FPGA_REV (0x04) #define FPGA_REV_MINOR_MASK_ (0x0000FF00) @@ -550,6 +551,7 @@ #define LTM_INACTIVE1_TIMER10_ (0x0000FFFF) #define MAC_CR (0x100) +#define MAC_CR_GMII_EN_ (0x00080000) #define MAC_CR_EEE_TX_CLK_STOP_EN_ (0x00040000) #define MAC_CR_EEE_EN_ (0x00020000) #define MAC_CR_EEE_TLAR_EN_ (0x00010000) @@ -787,6 +789,18 @@ #define PHY_DEV_ID_MODEL_MASK_ (0x0FC00000) #define PHY_DEV_ID_OUI_MASK_ (0x003FFFFF) +#define RGMII_TX_BYP_DLL (0x708) +#define RGMII_TX_BYP_DLL_TX_TUNE_ADJ_MASK_ (0x000FC00) +#define RGMII_TX_BYP_DLL_TX_TUNE_SEL_MASK_ (0x00003F0) +#define RGMII_TX_BYP_DLL_TX_DLL_RESET_ (0x0000002) +#define RGMII_TX_BYP_DLL_TX_DLL_BYPASS_ (0x0000001) + +#define RGMII_RX_BYP_DLL (0x70C) +#define RGMII_RX_BYP_DLL_RX_TUNE_ADJ_MASK_ (0x000FC00) +#define RGMII_RX_BYP_DLL_RX_TUNE_SEL_MASK_ (0x00003F0) +#define RGMII_RX_BYP_DLL_RX_DLL_RESET_ (0x0000002) +#define RGMII_RX_BYP_DLL_RX_DLL_BYPASS_ (0x0000001) + #define OTP_BASE_ADDR (0x00001000) #define OTP_ADDR_RANGE_ (0x1FF) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index c6f2d89c0e97..266354390c8f 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -261,8 +261,8 @@ void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work) if (!test_and_set_bit(VHOST_WORK_QUEUED, &work->flags)) { /* We can only add the work to the list after we're * sure it was not in the list. + * test_and_set_bit() implies a memory barrier. 
*/ - smp_mb(); llist_add(&work->node, &dev->work_list); wake_up_process(dev->worker); } diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index e3b30ea9ece5..9c3c68b9a49e 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -195,7 +195,6 @@ static int vhost_transport_send_pkt(struct virtio_vsock_pkt *pkt) { struct vhost_vsock *vsock; - struct vhost_virtqueue *vq; int len = pkt->len; /* Find the vhost_vsock according to guest context id */ @@ -205,8 +204,6 @@ vhost_transport_send_pkt(struct virtio_vsock_pkt *pkt) return -ENODEV; } - vq = &vsock->vqs[VSOCK_VQ_RX]; - if (pkt->reply) atomic_inc(&vsock->queued_replies); diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 7453c1281531..a13b031dc6b8 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -18,13 +18,6 @@ struct bpf_reg_state { enum bpf_reg_type type; - /* - * Used to determine if any memory access using this register will - * result in a bad access. - */ - s64 min_value; - u64 max_value; - u32 id; union { /* valid when type == CONST_IMM | PTR_TO_STACK | UNKNOWN_VALUE */ s64 imm; @@ -40,6 +33,13 @@ struct bpf_reg_state { */ struct bpf_map *map_ptr; }; + u32 id; + /* Used to determine if any memory access using this register will + * result in a bad access. These two fields must be last. + * See states_equal() + */ + s64 min_value; + u64 max_value; }; enum bpf_stack_slot_type { diff --git a/include/linux/filter.h b/include/linux/filter.h index f078d2b1cff6..6a1658308612 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -406,7 +406,8 @@ struct bpf_prog { u16 jited:1, /* Is our filter JIT'ed? */ gpl_compatible:1, /* Is filter GPL compatible? */ cb_access:1, /* Is control block accessed? */ - dst_needed:1; /* Do we need dst entry? */ + dst_needed:1, /* Do we need dst entry? */ + xdp_adjust_head:1; /* Adjusting pkt head? 
*/ kmemcheck_bitfield_end(meta); enum bpf_prog_type type; /* Type of BPF program */ u32 len; /* Number of filter blocks */ @@ -440,6 +441,7 @@ struct bpf_skb_data_end { struct xdp_buff { void *data; void *data_end; + void *data_hard_start; }; /* compute the linear packet data range [data, data_end) which @@ -595,7 +597,7 @@ void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp); u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog); -bool bpf_helper_changes_skb_data(void *func); +bool bpf_helper_changes_pkt_data(void *func); struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1ff5ea6e1221..994f7423a74b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -192,6 +192,7 @@ struct net_device_stats { #ifdef CONFIG_RPS #include <linux/static_key.h> extern struct static_key rps_needed; +extern struct static_key rfs_needed; #endif struct neighbour; diff --git a/include/linux/phy.h b/include/linux/phy.h index feb8a98e8dd3..f7d95f644eed 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -860,6 +860,10 @@ int phy_register_fixup_for_id(const char *bus_id, int phy_register_fixup_for_uid(u32 phy_uid, u32 phy_uid_mask, int (*run)(struct phy_device *)); +int phy_unregister_fixup(const char *bus_id, u32 phy_uid, u32 phy_uid_mask); +int phy_unregister_fixup_for_id(const char *bus_id); +int phy_unregister_fixup_for_uid(u32 phy_uid, u32 phy_uid_mask); + int phy_init_eee(struct phy_device *phydev, bool clk_stop_enable); int phy_get_eee_err(struct phy_device *phydev); int phy_ethtool_set_eee(struct phy_device *phydev, struct ethtool_eee *data); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 9c535fbccf2c..0cd92b0f2af5 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1966,6 +1966,8 @@ static inline int 
pskb_may_pull(struct sk_buff *skb, unsigned int len) return __pskb_pull_tail(skb, len - skb_headlen(skb)) != NULL; } +void skb_condense(struct sk_buff *skb); + /** * skb_headroom - bytes at buffer head * @skb: buffer to check diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 3537fb33cc90..266dab9ad782 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -88,6 +88,9 @@ struct stmmac_mdio_bus_data { struct stmmac_dma_cfg { int pbl; + int txpbl; + int rxpbl; + bool pblx8; int fixed_burst; int mixed_burst; bool aal; diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index c4f31666afd2..d896a33e00d4 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -104,6 +104,22 @@ struct flow_dissector_key_ports { }; }; +/** + * flow_dissector_key_icmp: + * @ports: type and code of ICMP header + * icmp: ICMP type (high) and code (low) + * type: ICMP type + * code: ICMP code + */ +struct flow_dissector_key_icmp { + union { + __be16 icmp; + struct { + u8 type; + u8 code; + }; + }; +}; /** * struct flow_dissector_key_eth_addrs: @@ -122,6 +138,7 @@ enum flow_dissector_key_id { FLOW_DISSECTOR_KEY_IPV4_ADDRS, /* struct flow_dissector_key_ipv4_addrs */ FLOW_DISSECTOR_KEY_IPV6_ADDRS, /* struct flow_dissector_key_ipv6_addrs */ FLOW_DISSECTOR_KEY_PORTS, /* struct flow_dissector_key_ports */ + FLOW_DISSECTOR_KEY_ICMP, /* struct flow_dissector_key_icmp */ FLOW_DISSECTOR_KEY_ETH_ADDRS, /* struct flow_dissector_key_eth_addrs */ FLOW_DISSECTOR_KEY_TIPC_ADDRS, /* struct flow_dissector_key_tipc_addrs */ FLOW_DISSECTOR_KEY_VLAN, /* struct flow_dissector_key_flow_vlan */ diff --git a/include/net/sock.h b/include/net/sock.h index 1749e38d0301..e17aa3de2b4d 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -913,17 +913,20 @@ static inline void sock_rps_record_flow_hash(__u32 hash) static inline void sock_rps_record_flow(const struct sock *sk) { #ifdef CONFIG_RPS - /* Reading sk->sk_rxhash might incur an expensive 
cache line miss. - * - * TCP_ESTABLISHED does cover almost all states where RFS - * might be useful, and is cheaper [1] than testing : - * IPv4: inet_sk(sk)->inet_daddr - * IPv6: ipv6_addr_any(&sk->sk_v6_daddr) - * OR an additional socket flag - * [1] : sk_state and sk_prot are in the same cache line. - */ - if (sk->sk_state == TCP_ESTABLISHED) - sock_rps_record_flow_hash(sk->sk_rxhash); + if (static_key_false(&rfs_needed)) { + /* Reading sk->sk_rxhash might incur an expensive cache line + * miss. + * + * TCP_ESTABLISHED does cover almost all states where RFS + * might be useful, and is cheaper [1] than testing : + * IPv4: inet_sk(sk)->inet_daddr + * IPv6: ipv6_addr_any(&sk->sk_v6_daddr) + * OR an additional socket flag + * [1] : sk_state and sk_prot are in the same cache line. + */ + if (sk->sk_state == TCP_ESTABLISHED) + sock_rps_record_flow_hash(sk->sk_rxhash); + } #endif } @@ -2160,7 +2163,8 @@ struct sock_skb_cb { static inline void sock_skb_set_dropcount(const struct sock *sk, struct sk_buff *skb) { - SOCK_SKB_CB(skb)->dropcount = atomic_read(&sk->sk_drops); + SOCK_SKB_CB(skb)->dropcount = sock_flag(sk, SOCK_RXQ_OVFL) ? 
+ atomic_read(&sk->sk_drops) : 0; } static inline void sk_drops_add(struct sock *sk, const struct sk_buff *skb) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6123d9b8e828..0eb0e87dbe9f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -424,6 +424,12 @@ union bpf_attr { * @len: length of header to be pushed in front * @flags: Flags (unused for now) * Return: 0 on success or negative error + * + * int bpf_xdp_adjust_head(xdp_md, delta) + * Adjust the xdp_md.data by delta + * @xdp_md: pointer to xdp_md + * @delta: A positive/negative integer to be added to xdp_md.data + * Return: 0 on success or negative on error */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -469,7 +475,8 @@ union bpf_attr { FN(csum_update), \ FN(set_hash_invalid), \ FN(get_numa_node_id), \ - FN(skb_change_head), + FN(skb_change_head), \ + FN(xdp_adjust_head), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -576,6 +583,8 @@ struct bpf_sock { __u32 protocol; }; +#define XDP_PACKET_HEADROOM 256 + /* User return codes for XDP prog type. * A valid XDP program must return one of these defined values. All other * return codes are reserved for future use.
Unknown return codes will result diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 1adc0b654996..cb4bcdc58543 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -458,11 +458,28 @@ enum { TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK, /* be16 */ TCA_FLOWER_KEY_ENC_UDP_DST_PORT, /* be16 */ TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK, /* be16 */ + + TCA_FLOWER_KEY_FLAGS, /* be32 */ + TCA_FLOWER_KEY_FLAGS_MASK, /* be32 */ + + TCA_FLOWER_KEY_ICMPV4_CODE, /* u8 */ + TCA_FLOWER_KEY_ICMPV4_CODE_MASK,/* u8 */ + TCA_FLOWER_KEY_ICMPV4_TYPE, /* u8 */ + TCA_FLOWER_KEY_ICMPV4_TYPE_MASK,/* u8 */ + TCA_FLOWER_KEY_ICMPV6_CODE, /* u8 */ + TCA_FLOWER_KEY_ICMPV6_CODE_MASK,/* u8 */ + TCA_FLOWER_KEY_ICMPV6_TYPE, /* u8 */ + TCA_FLOWER_KEY_ICMPV6_TYPE_MASK,/* u8 */ + __TCA_FLOWER_MAX, }; #define TCA_FLOWER_MAX (__TCA_FLOWER_MAX - 1) +enum { + TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT = (1 << 0), +}; + /* Match-all classifier */ enum { diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index bdcc9f4ba767..83e0d153b0b4 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1143,7 +1143,7 @@ struct bpf_prog * __weak bpf_int_jit_compile(struct bpf_prog *prog) return prog; } -bool __weak bpf_helper_changes_skb_data(void *func) +bool __weak bpf_helper_changes_pkt_data(void *func) { return false; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 88f609f1c0c3..4819ec9d95f6 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -579,6 +579,8 @@ static void fixup_bpf_calls(struct bpf_prog *prog) prog->dst_needed = 1; if (insn->imm == BPF_FUNC_get_prandom_u32) bpf_user_rnd_init_once(); + if (insn->imm == BPF_FUNC_xdp_adjust_head) + prog->xdp_adjust_head = 1; if (insn->imm == BPF_FUNC_tail_call) { /* mark bpf_tail_call as different opcode * to avoid conditional branch in diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index da9fb2a9b7eb..d28f9a3380a9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1216,7 
+1216,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id) return -EINVAL; } - changes_data = bpf_helper_changes_skb_data(fn->func); + changes_data = bpf_helper_changes_pkt_data(fn->func); memset(&meta, 0, sizeof(meta)); meta.pkt_access = fn->pkt_access; @@ -2528,7 +2528,7 @@ static bool states_equal(struct bpf_verifier_env *env, * we didn't do a variable access into a map then we are a-ok. */ if (!varlen_map_access && - rold->type == rcur->type && rold->imm == rcur->imm) + memcmp(rold, rcur, offsetofend(struct bpf_reg_state, id)) == 0) continue; /* If we didn't map access then again we don't care about the diff --git a/net/core/dev.c b/net/core/dev.c index bffb5253e778..1d33ce03365f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3447,6 +3447,8 @@ EXPORT_SYMBOL(rps_cpu_mask); struct static_key rps_needed __read_mostly; EXPORT_SYMBOL(rps_needed); +struct static_key rfs_needed __read_mostly; +EXPORT_SYMBOL(rfs_needed); static struct rps_dev_flow * set_rps_cpu(struct net_device *dev, struct sk_buff *skb, diff --git a/net/core/filter.c b/net/core/filter.c index b751202e12f8..b1461708a977 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2234,7 +2234,28 @@ static const struct bpf_func_proto bpf_skb_change_head_proto = { .arg3_type = ARG_ANYTHING, }; -bool bpf_helper_changes_skb_data(void *func) +BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset) +{ + void *data = xdp->data + offset; + + if (unlikely(data < xdp->data_hard_start || + data > xdp->data_end - ETH_HLEN)) + return -EINVAL; + + xdp->data = data; + + return 0; +} + +static const struct bpf_func_proto bpf_xdp_adjust_head_proto = { + .func = bpf_xdp_adjust_head, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + +bool bpf_helper_changes_pkt_data(void *func) { if (func == bpf_skb_vlan_push || func == bpf_skb_vlan_pop || @@ -2244,7 +2265,8 @@ bool bpf_helper_changes_skb_data(void *func) func == 
bpf_skb_change_tail || func == bpf_skb_pull_data || func == bpf_l3_csum_replace || - func == bpf_l4_csum_replace) + func == bpf_l4_csum_replace || + func == bpf_xdp_adjust_head) return true; return false; @@ -2670,6 +2692,8 @@ xdp_func_proto(enum bpf_func_id func_id) return &bpf_xdp_event_output_proto; case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; + case BPF_FUNC_xdp_adjust_head: + return &bpf_xdp_adjust_head_proto; default: return sk_filter_func_proto(func_id); } diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 1eb6f949e5b2..d6447dc10371 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -58,6 +58,28 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, EXPORT_SYMBOL(skb_flow_dissector_init); /** + * skb_flow_get_be16 - extract be16 entity + * @skb: sk_buff to extract from + * @poff: offset to extract at + * @data: raw buffer pointer to the packet + * @hlen: packet header length + * + * The function will try to retrieve a be16 entity at + * offset poff + */ +__be16 skb_flow_get_be16(const struct sk_buff *skb, int poff, void *data, + int hlen) +{ + __be16 *u, _u; + + u = __skb_header_pointer(skb, poff, sizeof(_u), data, hlen, &_u); + if (u) + return *u; + + return 0; +} + +/** * __skb_flow_get_ports - extract the upper layer ports and return them * @skb: sk_buff to extract the ports from * @thoff: transport header offset @@ -117,6 +139,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector_key_basic *key_basic; struct flow_dissector_key_addrs *key_addrs; struct flow_dissector_key_ports *key_ports; + struct flow_dissector_key_icmp *key_icmp; struct flow_dissector_key_tags *key_tags; struct flow_dissector_key_vlan *key_vlan; struct flow_dissector_key_keyid *key_keyid; @@ -546,6 +569,14 @@ ip_proto_again: data, hlen); } + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ICMP)) { + key_icmp = skb_flow_dissector_target(flow_dissector, +
FLOW_DISSECTOR_KEY_ICMP, + target_container); + key_icmp->icmp = skb_flow_get_be16(skb, nhoff, data, hlen); + } + out_good: ret = true; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index b45cd1494243..84151cf40aeb 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4931,3 +4931,31 @@ struct sk_buff *pskb_extract(struct sk_buff *skb, int off, return clone; } EXPORT_SYMBOL(pskb_extract); + +/** + * skb_condense - try to get rid of fragments/frag_list if possible + * @skb: buffer + * + * Can be used to save memory before skb is added to a busy queue. + * If packet has bytes in frags and enough tail room in skb->head, + * pull all of them, so that we can free the frags right now and adjust + * truesize. + * Notes: + * We do not reallocate skb->head thus can not fail. + * Caller must re-evaluate skb->truesize if needed. + */ +void skb_condense(struct sk_buff *skb) +{ + if (!skb->data_len || + skb->data_len > skb->end - skb->tail || + skb_cloned(skb)) + return; + + /* Nice, we can free page frag(s) right now */ + __pskb_pull_tail(skb, skb->data_len); + + /* Now adjust skb->truesize, since __pskb_pull_tail() does + * not do this. 
+ */ + skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); +} diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 0df2aa652530..2a46e4009f62 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -79,10 +79,13 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write, if (sock_table != orig_sock_table) { rcu_assign_pointer(rps_sock_flow_table, sock_table); - if (sock_table) + if (sock_table) { static_key_slow_inc(&rps_needed); + static_key_slow_inc(&rfs_needed); + } if (orig_sock_table) { static_key_slow_dec(&rps_needed); + static_key_slow_dec(&rfs_needed); synchronize_rcu(); vfree(orig_sock_table); } diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 691146abde2d..f79d7a8ab1c6 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -1047,12 +1047,12 @@ int icmp_rcv(struct sk_buff *skb) if (success) { consume_skb(skb); - return 0; + return NET_RX_SUCCESS; } drop: kfree_skb(skb); - return 0; + return NET_RX_DROP; csum_error: __ICMP_INC_STATS(net, ICMP_MIB_CSUMERRORS); error: diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 16d88ba9ff1c..f5628ada47b5 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1199,7 +1199,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) { struct sk_buff_head *list = &sk->sk_receive_queue; int rmem, delta, amt, err = -ENOMEM; - int size = skb->truesize; + int size; /* try to avoid the costly atomic add/sub pair when the receive * queue is full; always allow at least a packet @@ -1208,6 +1208,16 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) if (rmem > sk->sk_rcvbuf) goto drop; + /* Under mem pressure, it might be helpful to help udp_recvmsg() + * having linear skbs : + * - Reduce memory overhead and thus increase receive queue capacity + * - Less cache line misses at copyout() time + * - Less work at consume_skb() (less alien page frag freeing) + */ + if (rmem > (sk->sk_rcvbuf >> 1)) + skb_condense(skb); + size = skb->truesize; + /* we 
drop only if the receive buf is full and the receive * queue contains some other skb */ diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 29a9e6d9f274..e040c5140f61 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -39,6 +39,7 @@ struct fl_flow_key { struct flow_dissector_key_ipv6_addrs ipv6; }; struct flow_dissector_key_ports tp; + struct flow_dissector_key_icmp icmp; struct flow_dissector_key_keyid enc_key_id; union { struct flow_dissector_key_ipv4_addrs enc_ipv4; @@ -386,6 +387,16 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_ENC_UDP_DST_PORT] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_FLAGS] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_FLAGS_MASK] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_ICMPV4_TYPE] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_ICMPV4_TYPE_MASK] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_ICMPV4_CODE] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_ICMPV4_CODE_MASK] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_ICMPV6_TYPE] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_ICMPV6_TYPE_MASK] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_ICMPV6_CODE] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_ICMPV6_CODE_MASK] = { .type = NLA_U8 }, }; static void fl_set_key_val(struct nlattr **tb, @@ -420,6 +431,39 @@ static void fl_set_key_vlan(struct nlattr **tb, } } +static void fl_set_key_flag(u32 flower_key, u32 flower_mask, + u32 *dissector_key, u32 *dissector_mask, + u32 flower_flag_bit, u32 dissector_flag_bit) +{ + if (flower_mask & flower_flag_bit) { + *dissector_mask |= dissector_flag_bit; + if (flower_key & flower_flag_bit) + *dissector_key |= dissector_flag_bit; + } +} + +static void fl_set_key_flags(struct nlattr **tb, + u32 *flags_key, u32 *flags_mask) +{ + u32 key, mask; + + if (!tb[TCA_FLOWER_KEY_FLAGS]) + return; + + key = be32_to_cpu(nla_get_u32(tb[TCA_FLOWER_KEY_FLAGS])); + + if 
(!tb[TCA_FLOWER_KEY_FLAGS_MASK]) + mask = ~0; + else + mask = be32_to_cpu(nla_get_u32(tb[TCA_FLOWER_KEY_FLAGS_MASK])); + + *flags_key = 0; + *flags_mask = 0; + + fl_set_key_flag(key, mask, flags_key, flags_mask, + TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT, FLOW_DIS_IS_FRAGMENT); +} + static int fl_set_key(struct net *net, struct nlattr **tb, struct fl_flow_key *key, struct fl_flow_key *mask) { @@ -502,6 +546,26 @@ static int fl_set_key(struct net *net, struct nlattr **tb, fl_set_key_val(tb, &key->tp.dst, TCA_FLOWER_KEY_SCTP_DST, &mask->tp.dst, TCA_FLOWER_KEY_SCTP_DST_MASK, sizeof(key->tp.dst)); + } else if (key->basic.n_proto == htons(ETH_P_IP) && + key->basic.ip_proto == IPPROTO_ICMP) { + fl_set_key_val(tb, &key->icmp.type, TCA_FLOWER_KEY_ICMPV4_TYPE, + &mask->icmp.type, + TCA_FLOWER_KEY_ICMPV4_TYPE_MASK, + sizeof(key->icmp.type)); + fl_set_key_val(tb, &key->icmp.code, TCA_FLOWER_KEY_ICMPV4_CODE, + &mask->icmp.code, + TCA_FLOWER_KEY_ICMPV4_CODE_MASK, + sizeof(key->icmp.code)); + } else if (key->basic.n_proto == htons(ETH_P_IPV6) && + key->basic.ip_proto == IPPROTO_ICMPV6) { + fl_set_key_val(tb, &key->icmp.type, TCA_FLOWER_KEY_ICMPV6_TYPE, + &mask->icmp.type, + TCA_FLOWER_KEY_ICMPV6_TYPE_MASK, + sizeof(key->icmp.type)); + fl_set_key_val(tb, &key->icmp.code, TCA_FLOWER_KEY_ICMPV4_CODE, + &mask->icmp.code, + TCA_FLOWER_KEY_ICMPV4_CODE_MASK, + sizeof(key->icmp.code)); } if (tb[TCA_FLOWER_KEY_ENC_IPV4_SRC] || @@ -546,6 +610,8 @@ static int fl_set_key(struct net *net, struct nlattr **tb, &mask->enc_tp.dst, TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK, sizeof(key->enc_tp.dst)); + fl_set_key_flags(tb, &key->control.flags, &mask->control.flags); + return 0; } @@ -612,6 +678,8 @@ static void fl_init_dissector(struct cls_fl_head *head, FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_PORTS, tp); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FLOW_DISSECTOR_KEY_ICMP, icmp); + FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_VLAN, vlan); 
FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_ENC_KEYID, enc_key_id); @@ -880,6 +948,42 @@ static int fl_dump_key_vlan(struct sk_buff *skb, return 0; } +static void fl_get_key_flag(u32 dissector_key, u32 dissector_mask, + u32 *flower_key, u32 *flower_mask, + u32 flower_flag_bit, u32 dissector_flag_bit) +{ + if (dissector_mask & dissector_flag_bit) { + *flower_mask |= flower_flag_bit; + if (dissector_key & dissector_flag_bit) + *flower_key |= flower_flag_bit; + } +} + +static int fl_dump_key_flags(struct sk_buff *skb, u32 flags_key, u32 flags_mask) +{ + u32 key, mask; + __be32 _key, _mask; + int err; + + if (!memchr_inv(&flags_mask, 0, sizeof(flags_mask))) + return 0; + + key = 0; + mask = 0; + + fl_get_key_flag(flags_key, flags_mask, &key, &mask, + TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT, FLOW_DIS_IS_FRAGMENT); + + _key = cpu_to_be32(key); + _mask = cpu_to_be32(mask); + + err = nla_put(skb, TCA_FLOWER_KEY_FLAGS, 4, &_key); + if (err) + return err; + + return nla_put(skb, TCA_FLOWER_KEY_FLAGS_MASK, 4, &_mask); +} + static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *t) { @@ -977,6 +1081,28 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, &mask->tp.dst, TCA_FLOWER_KEY_SCTP_DST_MASK, sizeof(key->tp.dst)))) goto nla_put_failure; + else if (key->basic.n_proto == htons(ETH_P_IP) && + key->basic.ip_proto == IPPROTO_ICMP && + (fl_dump_key_val(skb, &key->icmp.type, + TCA_FLOWER_KEY_ICMPV4_TYPE, &mask->icmp.type, + TCA_FLOWER_KEY_ICMPV4_TYPE_MASK, + sizeof(key->icmp.type)) || + fl_dump_key_val(skb, &key->icmp.code, + TCA_FLOWER_KEY_ICMPV4_CODE, &mask->icmp.code, + TCA_FLOWER_KEY_ICMPV4_CODE_MASK, + sizeof(key->icmp.code)))) + goto nla_put_failure; + else if (key->basic.n_proto == htons(ETH_P_IPV6) && + key->basic.ip_proto == IPPROTO_ICMPV6 && + (fl_dump_key_val(skb, &key->icmp.type, + TCA_FLOWER_KEY_ICMPV6_TYPE, &mask->icmp.type, + TCA_FLOWER_KEY_ICMPV6_TYPE_MASK, + 
sizeof(key->icmp.type)) || + fl_dump_key_val(skb, &key->icmp.code, + TCA_FLOWER_KEY_ICMPV6_CODE, &mask->icmp.code, + TCA_FLOWER_KEY_ICMPV6_CODE_MASK, + sizeof(key->icmp.code)))) + goto nla_put_failure; if (key->enc_control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS && (fl_dump_key_val(skb, &key->enc_ipv4.src, @@ -1015,6 +1141,9 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, sizeof(key->enc_tp.dst))) goto nla_put_failure; + if (fl_dump_key_flags(skb, key->control.flags, mask->control.flags)) + goto nla_put_failure; + nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags); if (tcf_exts_dump(skb, &f->exts)) diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 00cd3081c038..f2219c1489e5 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -33,6 +33,7 @@ hostprogs-y += trace_event hostprogs-y += sampleip hostprogs-y += tc_l2_redirect hostprogs-y += lwt_len_hist +hostprogs-y += xdp_tx_iptunnel test_lru_dist-objs := test_lru_dist.o libbpf.o sock_example-objs := sock_example.o libbpf.o @@ -67,6 +68,7 @@ trace_event-objs := bpf_load.o libbpf.o trace_event_user.o sampleip-objs := bpf_load.o libbpf.o sampleip_user.o tc_l2_redirect-objs := bpf_load.o libbpf.o tc_l2_redirect_user.o lwt_len_hist-objs := bpf_load.o libbpf.o lwt_len_hist_user.o +xdp_tx_iptunnel-objs := bpf_load.o libbpf.o xdp_tx_iptunnel_user.o # Tell kbuild to always build the programs always := $(hostprogs-y) @@ -99,6 +101,7 @@ always += test_current_task_under_cgroup_kern.o always += trace_event_kern.o always += sampleip_kern.o always += lwt_len_hist_kern.o +always += xdp_tx_iptunnel_kern.o HOSTCFLAGS += -I$(objtree)/usr/include HOSTCFLAGS += -I$(srctree)/tools/testing/selftests/bpf/ @@ -129,6 +132,7 @@ HOSTLOADLIBES_trace_event += -lelf HOSTLOADLIBES_sampleip += -lelf HOSTLOADLIBES_tc_l2_redirect += -l elf HOSTLOADLIBES_lwt_len_hist += -l elf +HOSTLOADLIBES_xdp_tx_iptunnel += -lelf # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline: # 
make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h index 8370a6e3839d..faaffe2e139a 100644 --- a/samples/bpf/bpf_helpers.h +++ b/samples/bpf/bpf_helpers.h @@ -57,6 +57,8 @@ static int (*bpf_skb_set_tunnel_opt)(void *ctx, void *md, int size) = (void *) BPF_FUNC_skb_set_tunnel_opt; static unsigned long long (*bpf_get_prandom_u32)(void) = (void *) BPF_FUNC_get_prandom_u32; +static int (*bpf_xdp_adjust_head)(void *ctx, int offset) = + (void *) BPF_FUNC_xdp_adjust_head; /* llvm builtin functions that eBPF C program may use to * emit BPF_LD_ABS and BPF_LD_IND instructions diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c index 49b45ccbe153..e30b6de94f2e 100644 --- a/samples/bpf/bpf_load.c +++ b/samples/bpf/bpf_load.c @@ -12,6 +12,10 @@ #include <linux/bpf.h> #include <linux/filter.h> #include <linux/perf_event.h> +#include <linux/netlink.h> +#include <linux/rtnetlink.h> +#include <sys/types.h> +#include <sys/socket.h> #include <sys/syscall.h> #include <sys/ioctl.h> #include <sys/mman.h> @@ -450,3 +454,93 @@ struct ksym *ksym_search(long key) /* out of range. 
return _stext */ return &syms[0]; } + +int set_link_xdp_fd(int ifindex, int fd) +{ + struct sockaddr_nl sa; + int sock, seq = 0, len, ret = -1; + char buf[4096]; + struct nlattr *nla, *nla_xdp; + struct { + struct nlmsghdr nh; + struct ifinfomsg ifinfo; + char attrbuf[64]; + } req; + struct nlmsghdr *nh; + struct nlmsgerr *err; + + memset(&sa, 0, sizeof(sa)); + sa.nl_family = AF_NETLINK; + + sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + if (sock < 0) { + printf("open netlink socket: %s\n", strerror(errno)); + return -1; + } + + if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) { + printf("bind to netlink: %s\n", strerror(errno)); + goto cleanup; + } + + memset(&req, 0, sizeof(req)); + req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)); + req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; + req.nh.nlmsg_type = RTM_SETLINK; + req.nh.nlmsg_pid = 0; + req.nh.nlmsg_seq = ++seq; + req.ifinfo.ifi_family = AF_UNSPEC; + req.ifinfo.ifi_index = ifindex; + nla = (struct nlattr *)(((char *)&req) + + NLMSG_ALIGN(req.nh.nlmsg_len)); + nla->nla_type = NLA_F_NESTED | 43/*IFLA_XDP*/; + + nla_xdp = (struct nlattr *)((char *)nla + NLA_HDRLEN); + nla_xdp->nla_type = 1/*IFLA_XDP_FD*/; + nla_xdp->nla_len = NLA_HDRLEN + sizeof(int); + memcpy((char *)nla_xdp + NLA_HDRLEN, &fd, sizeof(fd)); + nla->nla_len = NLA_HDRLEN + nla_xdp->nla_len; + + req.nh.nlmsg_len += NLA_ALIGN(nla->nla_len); + + if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) { + printf("send to netlink: %s\n", strerror(errno)); + goto cleanup; + } + + len = recv(sock, buf, sizeof(buf), 0); + if (len < 0) { + printf("recv from netlink: %s\n", strerror(errno)); + goto cleanup; + } + + for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len); + nh = NLMSG_NEXT(nh, len)) { + if (nh->nlmsg_pid != getpid()) { + printf("Wrong pid %d, expected %d\n", + nh->nlmsg_pid, getpid()); + goto cleanup; + } + if (nh->nlmsg_seq != seq) { + printf("Wrong seq %d, expected %d\n", + nh->nlmsg_seq, seq); + goto cleanup; + } + switch 
(nh->nlmsg_type) { + case NLMSG_ERROR: + err = (struct nlmsgerr *)NLMSG_DATA(nh); + if (!err->error) + continue; + printf("nlmsg error %s\n", strerror(-err->error)); + goto cleanup; + case NLMSG_DONE: + break; + } + } + + ret = 0; + +cleanup: + close(sock); + return ret; +} diff --git a/samples/bpf/bpf_load.h b/samples/bpf/bpf_load.h index 4adeeef53ad6..fb46a421ab41 100644 --- a/samples/bpf/bpf_load.h +++ b/samples/bpf/bpf_load.h @@ -31,4 +31,5 @@ struct ksym { int load_kallsyms(void); struct ksym *ksym_search(long key); +int set_link_xdp_fd(int ifindex, int fd); #endif diff --git a/samples/bpf/xdp1_user.c b/samples/bpf/xdp1_user.c index 2b2150d6d6f7..5f040a0d7712 100644 --- a/samples/bpf/xdp1_user.c +++ b/samples/bpf/xdp1_user.c @@ -5,111 +5,18 @@ * License as published by the Free Software Foundation. */ #include <linux/bpf.h> -#include <linux/netlink.h> -#include <linux/rtnetlink.h> #include <assert.h> #include <errno.h> #include <signal.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <sys/socket.h> #include <unistd.h> #include "bpf_load.h" #include "bpf_util.h" #include "libbpf.h" -static int set_link_xdp_fd(int ifindex, int fd) -{ - struct sockaddr_nl sa; - int sock, seq = 0, len, ret = -1; - char buf[4096]; - struct nlattr *nla, *nla_xdp; - struct { - struct nlmsghdr nh; - struct ifinfomsg ifinfo; - char attrbuf[64]; - } req; - struct nlmsghdr *nh; - struct nlmsgerr *err; - - memset(&sa, 0, sizeof(sa)); - sa.nl_family = AF_NETLINK; - - sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); - if (sock < 0) { - printf("open netlink socket: %s\n", strerror(errno)); - return -1; - } - - if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) { - printf("bind to netlink: %s\n", strerror(errno)); - goto cleanup; - } - - memset(&req, 0, sizeof(req)); - req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)); - req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; - req.nh.nlmsg_type = RTM_SETLINK; - req.nh.nlmsg_pid = 0; - req.nh.nlmsg_seq = ++seq; 
- req.ifinfo.ifi_family = AF_UNSPEC; - req.ifinfo.ifi_index = ifindex; - nla = (struct nlattr *)(((char *)&req) - + NLMSG_ALIGN(req.nh.nlmsg_len)); - nla->nla_type = NLA_F_NESTED | 43/*IFLA_XDP*/; - - nla_xdp = (struct nlattr *)((char *)nla + NLA_HDRLEN); - nla_xdp->nla_type = 1/*IFLA_XDP_FD*/; - nla_xdp->nla_len = NLA_HDRLEN + sizeof(int); - memcpy((char *)nla_xdp + NLA_HDRLEN, &fd, sizeof(fd)); - nla->nla_len = NLA_HDRLEN + nla_xdp->nla_len; - - req.nh.nlmsg_len += NLA_ALIGN(nla->nla_len); - - if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) { - printf("send to netlink: %s\n", strerror(errno)); - goto cleanup; - } - - len = recv(sock, buf, sizeof(buf), 0); - if (len < 0) { - printf("recv from netlink: %s\n", strerror(errno)); - goto cleanup; - } - - for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len); - nh = NLMSG_NEXT(nh, len)) { - if (nh->nlmsg_pid != getpid()) { - printf("Wrong pid %d, expected %d\n", - nh->nlmsg_pid, getpid()); - goto cleanup; - } - if (nh->nlmsg_seq != seq) { - printf("Wrong seq %d, expected %d\n", - nh->nlmsg_seq, seq); - goto cleanup; - } - switch (nh->nlmsg_type) { - case NLMSG_ERROR: - err = (struct nlmsgerr *)NLMSG_DATA(nh); - if (!err->error) - continue; - printf("nlmsg error %s\n", strerror(-err->error)); - goto cleanup; - case NLMSG_DONE: - break; - } - } - - ret = 0; - -cleanup: - close(sock); - return ret; -} - static int ifindex; static void int_exit(int sig) diff --git a/samples/bpf/xdp_tx_iptunnel_common.h b/samples/bpf/xdp_tx_iptunnel_common.h new file mode 100644 index 000000000000..dd12cc35110f --- /dev/null +++ b/samples/bpf/xdp_tx_iptunnel_common.h @@ -0,0 +1,37 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ */ +#ifndef _SAMPLES_BPF_XDP_TX_IPTNL_COMMON_H +#define _SAMPLES_BPF_XDP_TX_IPTNL_COMMON_H + +#include <linux/types.h> + +#define MAX_IPTNL_ENTRIES 256U + +struct vip { + union { + __u32 v6[4]; + __u32 v4; + } daddr; + __u16 dport; + __u16 family; + __u8 protocol; +}; + +struct iptnl_info { + union { + __u32 v6[4]; + __u32 v4; + } saddr; + union { + __u32 v6[4]; + __u32 v4; + } daddr; + __u16 family; + __u8 dmac[6]; +}; + +#endif diff --git a/samples/bpf/xdp_tx_iptunnel_kern.c b/samples/bpf/xdp_tx_iptunnel_kern.c new file mode 100644 index 000000000000..85c38ecd3a2d --- /dev/null +++ b/samples/bpf/xdp_tx_iptunnel_kern.c @@ -0,0 +1,236 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program shows how to use bpf_xdp_adjust_head() by + * encapsulating the incoming packet in an IPv4/v6 header + * and then XDP_TX it out. 
+ */ +#include <uapi/linux/bpf.h> +#include <linux/in.h> +#include <linux/if_ether.h> +#include <linux/if_packet.h> +#include <linux/if_vlan.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include "bpf_helpers.h" +#include "xdp_tx_iptunnel_common.h" + +struct bpf_map_def SEC("maps") rxcnt = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(__u32), + .value_size = sizeof(__u64), + .max_entries = 256, +}; + +struct bpf_map_def SEC("maps") vip2tnl = { + .type = BPF_MAP_TYPE_HASH, + .key_size = sizeof(struct vip), + .value_size = sizeof(struct iptnl_info), + .max_entries = MAX_IPTNL_ENTRIES, +}; + +static __always_inline void count_tx(u32 protocol) +{ + u64 *rxcnt_count; + + rxcnt_count = bpf_map_lookup_elem(&rxcnt, &protocol); + if (rxcnt_count) + *rxcnt_count += 1; +} + +static __always_inline int get_dport(void *trans_data, void *data_end, + u8 protocol) +{ + struct tcphdr *th; + struct udphdr *uh; + + switch (protocol) { + case IPPROTO_TCP: + th = (struct tcphdr *)trans_data; + if (th + 1 > data_end) + return -1; + return th->dest; + case IPPROTO_UDP: + uh = (struct udphdr *)trans_data; + if (uh + 1 > data_end) + return -1; + return uh->dest; + default: + return 0; + } +} + +static __always_inline void set_ethhdr(struct ethhdr *new_eth, + const struct ethhdr *old_eth, + const struct iptnl_info *tnl, + __be16 h_proto) +{ + memcpy(new_eth->h_source, old_eth->h_dest, sizeof(new_eth->h_source)); + memcpy(new_eth->h_dest, tnl->dmac, sizeof(new_eth->h_dest)); + new_eth->h_proto = h_proto; +} + +static __always_inline int handle_ipv4(struct xdp_md *xdp) +{ + void *data_end = (void *)(long)xdp->data_end; + void *data = (void *)(long)xdp->data; + struct iptnl_info *tnl; + struct ethhdr *new_eth; + struct ethhdr *old_eth; + struct iphdr *iph = data + sizeof(struct ethhdr); + u16 *next_iph_u16; + u16 payload_len; + struct vip vip = {}; + int dport; + u32 csum = 0; + int i; + + if (iph + 1 > data_end) + return XDP_DROP; + + dport = get_dport(iph + 1, data_end, 
iph->protocol); + if (dport == -1) + return XDP_DROP; + + vip.protocol = iph->protocol; + vip.family = AF_INET; + vip.daddr.v4 = iph->daddr; + vip.dport = dport; + payload_len = ntohs(iph->tot_len); + + tnl = bpf_map_lookup_elem(&vip2tnl, &vip); + /* It only does v4-in-v4 */ + if (!tnl || tnl->family != AF_INET) + return XDP_PASS; + + /* The vip key is found. Add an IP header and send it out */ + + if (bpf_xdp_adjust_head(xdp, 0 - (int)sizeof(struct iphdr))) + return XDP_DROP; + + data = (void *)(long)xdp->data; + data_end = (void *)(long)xdp->data_end; + + new_eth = data; + iph = data + sizeof(*new_eth); + old_eth = data + sizeof(*iph); + + if (new_eth + 1 > data_end || + old_eth + 1 > data_end || + iph + 1 > data_end) + return XDP_DROP; + + set_ethhdr(new_eth, old_eth, tnl, htons(ETH_P_IP)); + + iph->version = 4; + iph->ihl = sizeof(*iph) >> 2; + iph->frag_off = 0; + iph->protocol = IPPROTO_IPIP; + iph->check = 0; + iph->tos = 0; + iph->tot_len = htons(payload_len + sizeof(*iph)); + iph->daddr = tnl->daddr.v4; + iph->saddr = tnl->saddr.v4; + iph->ttl = 8; + + next_iph_u16 = (u16 *)iph; +#pragma clang loop unroll(full) + for (i = 0; i < sizeof(*iph) >> 1; i++) + csum += *next_iph_u16++; + + iph->check = ~((csum & 0xffff) + (csum >> 16)); + + count_tx(vip.protocol); + + return XDP_TX; +} + +static __always_inline int handle_ipv6(struct xdp_md *xdp) +{ + void *data_end = (void *)(long)xdp->data_end; + void *data = (void *)(long)xdp->data; + struct iptnl_info *tnl; + struct ethhdr *new_eth; + struct ethhdr *old_eth; + struct ipv6hdr *ip6h = data + sizeof(struct ethhdr); + __u16 payload_len; + struct vip vip = {}; + int dport; + + if (ip6h + 1 > data_end) + return XDP_DROP; + + dport = get_dport(ip6h + 1, data_end, ip6h->nexthdr); + if (dport == -1) + return XDP_DROP; + + vip.protocol = ip6h->nexthdr; + vip.family = AF_INET6; + memcpy(vip.daddr.v6, ip6h->daddr.s6_addr32, sizeof(vip.daddr)); + vip.dport = dport; + payload_len = ip6h->payload_len; + + tnl = 
bpf_map_lookup_elem(&vip2tnl, &vip); + /* It only does v6-in-v6 */ + if (!tnl || tnl->family != AF_INET6) + return XDP_PASS; + + /* The vip key is found. Add an IP header and send it out */ + + if (bpf_xdp_adjust_head(xdp, 0 - (int)sizeof(struct ipv6hdr))) + return XDP_DROP; + + data = (void *)(long)xdp->data; + data_end = (void *)(long)xdp->data_end; + + new_eth = data; + ip6h = data + sizeof(*new_eth); + old_eth = data + sizeof(*ip6h); + + if (new_eth + 1 > data_end || + old_eth + 1 > data_end || + ip6h + 1 > data_end) + return XDP_DROP; + + set_ethhdr(new_eth, old_eth, tnl, htons(ETH_P_IPV6)); + + ip6h->version = 6; + ip6h->priority = 0; + memset(ip6h->flow_lbl, 0, sizeof(ip6h->flow_lbl)); + ip6h->payload_len = htons(ntohs(payload_len) + sizeof(*ip6h)); + ip6h->nexthdr = IPPROTO_IPV6; + ip6h->hop_limit = 8; + memcpy(ip6h->saddr.s6_addr32, tnl->saddr.v6, sizeof(tnl->saddr.v6)); + memcpy(ip6h->daddr.s6_addr32, tnl->daddr.v6, sizeof(tnl->daddr.v6)); + + count_tx(vip.protocol); + + return XDP_TX; +} + +SEC("xdp_tx_iptunnel") +int _xdp_tx_iptunnel(struct xdp_md *xdp) +{ + void *data_end = (void *)(long)xdp->data_end; + void *data = (void *)(long)xdp->data; + struct ethhdr *eth = data; + __u16 h_proto; + + if (eth + 1 > data_end) + return XDP_DROP; + + h_proto = eth->h_proto; + + if (h_proto == htons(ETH_P_IP)) + return handle_ipv4(xdp); + else if (h_proto == htons(ETH_P_IPV6)) + + return handle_ipv6(xdp); + else + return XDP_PASS; +} + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/xdp_tx_iptunnel_user.c b/samples/bpf/xdp_tx_iptunnel_user.c new file mode 100644 index 000000000000..7a71f5c74684 --- /dev/null +++ b/samples/bpf/xdp_tx_iptunnel_user.c @@ -0,0 +1,256 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ */
+#include <linux/bpf.h>
+#include <assert.h>
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/resource.h>
+#include <arpa/inet.h>
+#include <netinet/ether.h>
+#include <unistd.h>
+#include <time.h>
+#include "bpf_load.h"
+#include "libbpf.h"
+#include "bpf_util.h"
+#include "xdp_tx_iptunnel_common.h"
+
+#define STATS_INTERVAL_S 2U
+
+static int ifindex = -1;
+
+static void int_exit(int sig)
+{
+	if (ifindex > -1)
+		set_link_xdp_fd(ifindex, -1);	/* fd -1: ask the kernel to detach the XDP prog */
+	exit(0);
+}
+
+/* Every STATS_INTERVAL_S seconds, print per-protocol packet count and rate summed over all CPUs.
+ * NOTE(review): reads map_fd[0], presumably the rxcnt map, which counts XDP_TX'ed (not dropped) packets - confirm. */
+static void poll_stats(unsigned int kill_after_s)
+{
+	const unsigned int nr_protos = 256;
+	unsigned int nr_cpus = bpf_num_possible_cpus();
+	time_t started_at = time(NULL);
+	__u64 values[nr_cpus], prev[nr_protos][nr_cpus];
+	__u32 proto;
+	int i;
+
+	memset(prev, 0, sizeof(prev));
+
+	while (!kill_after_s || time(NULL) - started_at <= kill_after_s) {	/* kill_after_s == 0: loop forever */
+		sleep(STATS_INTERVAL_S);
+
+		for (proto = 0; proto < nr_protos; proto++) {
+			__u64 sum = 0;
+
+			assert(bpf_lookup_elem(map_fd[0], &proto, values) == 0);	/* NOTE(review): lookup vanishes if built with NDEBUG */
+			for (i = 0; i < nr_cpus; i++)
+				sum += (values[i] - prev[proto][i]);	/* per-CPU delta since the previous poll */
+
+			if (sum)
+				printf("proto %u: sum:%10llu pkts, rate:%10llu pkts/s\n",
+				       proto, sum, sum / STATS_INTERVAL_S);
+			memcpy(prev[proto], values, sizeof(values));
+		}
+	}
+}
+
+static void usage(const char *cmd)
+{
+	printf("Start a XDP prog which encapsulates incoming packets\n"
+	       "in an IPv4/v6 header and XDP_TX it out. The dst <VIP:PORT>\n"
+	       "is used to select packets to encapsulate\n\n");
+	printf("Usage: %s [...]\n", cmd);
+	printf(" -i <ifindex> Interface Index\n");
+	printf(" -a <vip-service-address> IPv4 or IPv6\n");
+	printf(" -p <vip-service-port> A port range (e.g.
433-444) is also allowed\n");
+	printf(" -s <source-ip> Used in the IPTunnel header\n");
+	printf(" -d <dest-ip> Used in the IPTunnel header\n");
+	printf(" -m <dest-MAC> Used in sending the IP Tunneled pkt\n");
+	printf(" -T <stop-after-X-seconds> Default: 0 (forever)\n");
+	printf(" -P <IP-Protocol> Default is TCP\n");
+	printf(" -h Display this help\n");
+}
+
+static int parse_ipstr(const char *ipstr, unsigned int *addr)
+{
+	if (inet_pton(AF_INET6, ipstr, addr) == 1) {
+		return AF_INET6;
+	} else if (inet_pton(AF_INET, ipstr, addr) == 1) {
+		addr[1] = addr[2] = addr[3] = 0;	/* v4 fills addr[0] only; zero the rest of the 4-word buffer */
+		return AF_INET;
+	}
+
+	fprintf(stderr, "%s is an invalid IP\n", ipstr);
+	return AF_UNSPEC;
+}
+/* Parse "N" or "N-M" (each 1..65535, at most MAX_IPTNL_ENTRIES ports) from port_str; returns 0 on success, 1 on error. */
+static int parse_ports(const char *port_str, int *min_port, int *max_port)
+{
+	char *end;
+	long tmp_min_port;
+	long tmp_max_port;
+
+	tmp_min_port = strtol(port_str, &end, 10);	/* parse the argument itself, not the getopt global */
+	if (tmp_min_port < 1 || tmp_min_port > 65535) {
+		fprintf(stderr, "Invalid port(s):%s\n", port_str);
+		return 1;
+	}
+
+	if (*end == '-') {
+		end++;
+		tmp_max_port = strtol(end, NULL, 10);
+		if (tmp_max_port < 1 || tmp_max_port > 65535) {
+			fprintf(stderr, "Invalid port(s):%s\n", port_str);
+			return 1;
+		}
+	} else {
+		tmp_max_port = tmp_min_port;	/* single port: range of one */
+	}
+
+	if (tmp_min_port > tmp_max_port) {
+		fprintf(stderr, "Invalid port(s):%s\n", port_str);
+		return 1;
+	}
+
+	if (tmp_max_port - tmp_min_port + 1 > MAX_IPTNL_ENTRIES) {
+		fprintf(stderr, "Port range (%s) is larger than %u\n",
+			port_str, MAX_IPTNL_ENTRIES);
+		return 1;
+	}
+	*min_port = tmp_min_port;
+	*max_port = tmp_max_port;
+
+	return 0;
+}
+
+int main(int argc, char **argv)
+{
+	unsigned char opt_flags[256] = {};
+	unsigned int kill_after_s = 0;
+	const char *optstr = "i:a:p:s:d:m:T:P:h";
+	int min_port = 0, max_port = 0;
+	struct iptnl_info tnl = {};
+	struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+	struct vip vip = {};
+	char filename[256];
+	int opt;
+	int i;
+
+	tnl.family = AF_UNSPEC;
+	vip.protocol = IPPROTO_TCP;
+
+	for (i = 0; i < strlen(optstr); i++)
+		if
(optstr[i] != 'h' && 'a' <= optstr[i] && optstr[i] <= 'z') + opt_flags[(unsigned char)optstr[i]] = 1; + + while ((opt = getopt(argc, argv, optstr)) != -1) { + unsigned short family; + unsigned int *v6; + + switch (opt) { + case 'i': + ifindex = atoi(optarg); + break; + case 'a': + vip.family = parse_ipstr(optarg, vip.daddr.v6); + if (vip.family == AF_UNSPEC) + return 1; + break; + case 'p': + if (parse_ports(optarg, &min_port, &max_port)) + return 1; + break; + case 'P': + vip.protocol = atoi(optarg); + break; + case 's': + case 'd': + if (opt == 's') + v6 = tnl.saddr.v6; + else + v6 = tnl.daddr.v6; + + family = parse_ipstr(optarg, v6); + if (family == AF_UNSPEC) + return 1; + if (tnl.family == AF_UNSPEC) { + tnl.family = family; + } else if (tnl.family != family) { + fprintf(stderr, + "The IP version of the src and dst addresses used in the IP encapsulation does not match\n"); + return 1; + } + break; + case 'm': + if (!ether_aton_r(optarg, + (struct ether_addr *)tnl.dmac)) { + fprintf(stderr, "Invalid mac address:%s\n", + optarg); + return 1; + } + break; + case 'T': + kill_after_s = atoi(optarg); + break; + default: + usage(argv[0]); + return 1; + } + opt_flags[opt] = 0; + } + + for (i = 0; i < strlen(optstr); i++) { + if (opt_flags[(unsigned int)optstr[i]]) { + fprintf(stderr, "Missing argument -%c\n", optstr[i]); + usage(argv[0]); + return 1; + } + } + + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + perror("setrlimit(RLIMIT_MEMLOCK, RLIM_INFINITY)"); + return 1; + } + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + + if (load_bpf_file(filename)) { + printf("%s", bpf_log_buf); + return 1; + } + + if (!prog_fd[0]) { + printf("load_bpf_file: %s\n", strerror(errno)); + return 1; + } + + signal(SIGINT, int_exit); + + while (min_port <= max_port) { + vip.dport = htons(min_port++); + if (bpf_update_elem(map_fd[1], &vip, &tnl, BPF_NOEXIST)) { + perror("bpf_update_elem(&vip2tnl)"); + return 1; + } + } + + if (set_link_xdp_fd(ifindex, prog_fd[0]) < 0) { + 
printf("link set xdp fd failed\n"); + return 1; + } + + poll_stats(kill_after_s); + + set_link_xdp_fd(ifindex, -1); + + return 0; +} |