Diffstat (limited to 'net/core/dev.c')
-rw-r--r-- | net/core/dev.c | 645
1 file changed, 369 insertions, 276 deletions
diff --git a/net/core/dev.c b/net/core/dev.c index 2f7f5fd9ffec..be17e0660144 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -156,9 +156,11 @@ #include <linux/pm_runtime.h> #include <linux/prandom.h> #include <linux/once_lite.h> +#include <net/netdev_lock.h> #include <net/netdev_rx_queue.h> #include <net/page_pool/types.h> #include <net/page_pool/helpers.h> +#include <net/page_pool/memory_provider.h> #include <net/rps.h> #include <linux/phy_link_topology.h> @@ -570,10 +572,18 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev) static inline struct list_head *ptype_head(const struct packet_type *pt) { - if (pt->type == htons(ETH_P_ALL)) - return pt->dev ? &pt->dev->ptype_all : &net_hotdata.ptype_all; - else - return pt->dev ? &pt->dev->ptype_specific : + if (pt->type == htons(ETH_P_ALL)) { + if (!pt->af_packet_net && !pt->dev) + return NULL; + + return pt->dev ? &pt->dev->ptype_all : + &pt->af_packet_net->ptype_all; + } + + if (pt->dev) + return &pt->dev->ptype_specific; + + return pt->af_packet_net ? &pt->af_packet_net->ptype_specific : &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; } @@ -594,6 +604,9 @@ void dev_add_pack(struct packet_type *pt) { struct list_head *head = ptype_head(pt); + if (WARN_ON_ONCE(!head)) + return; + spin_lock(&ptype_lock); list_add_rcu(&pt->list, head); spin_unlock(&ptype_lock); @@ -618,6 +631,9 @@ void __dev_remove_pack(struct packet_type *pt) struct list_head *head = ptype_head(pt); struct packet_type *pt1; + if (!head) + return; + spin_lock(&ptype_lock); list_for_each_entry(pt1, head, list) { @@ -1007,7 +1023,7 @@ struct net_device *dev_get_by_napi_id(unsigned int napi_id) WARN_ON_ONCE(!rcu_read_lock_held()); - if (napi_id < MIN_NAPI_ID) + if (!napi_id_valid(napi_id)) return NULL; napi = napi_by_id(napi_id); @@ -1370,15 +1386,7 @@ static int dev_get_valid_name(struct net *net, struct net_device *dev, return ret < 0 ? ret : 0; } -/** - * dev_change_name - change name of a device - * @dev: device - * @newname: name (or format string) must be at least IFNAMSIZ - * - * Change name of a device, can pass format strings "eth%d". - * for wildcarding. - */ -int dev_change_name(struct net_device *dev, const char *newname) +int netif_change_name(struct net_device *dev, const char *newname) { struct net *net = dev_net(dev); unsigned char old_assign_type; @@ -1448,15 +1456,7 @@ rollback: return err; } -/** - * dev_set_alias - change ifalias of a device - * @dev: device - * @alias: name up to IFALIASZ - * @len: limit of bytes to copy from info - * - * Set ifalias for a device, - */ -int dev_set_alias(struct net_device *dev, const char *alias, size_t len) +int netif_set_alias(struct net_device *dev, const char *alias, size_t len) { struct dev_ifalias *new_alias = NULL; @@ -1482,7 +1482,6 @@ int dev_set_alias(struct net_device *dev, const char *alias, size_t len) return len; } -EXPORT_SYMBOL(dev_set_alias); /** * dev_get_alias - get ifalias of a device @@ -1628,6 +1627,8 @@ static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack) set_bit(__LINK_STATE_START, &dev->state); + netdev_ops_assert_locked(dev); + if (ops->ndo_validate_addr) ret = ops->ndo_validate_addr(dev); @@ -1648,20 +1649,7 @@ static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack) return ret; } -/** - * dev_open - prepare an interface for use. - * @dev: device to open - * @extack: netlink extended ack - * - * Takes a device from down to up state. The device's private open - * function is invoked and then the multicast lists are loaded. 
Finally - * the device is moved into the up state and a %NETDEV_UP message is - * sent to the netdev notifier chain. - * - * Calling this function on an active interface is a nop. On a failure - * a negative errno code is returned. - */ -int dev_open(struct net_device *dev, struct netlink_ext_ack *extack) +int netif_open(struct net_device *dev, struct netlink_ext_ack *extack) { int ret; @@ -1677,7 +1665,6 @@ int dev_open(struct net_device *dev, struct netlink_ext_ack *extack) return ret; } -EXPORT_SYMBOL(dev_open); static void __dev_close_many(struct list_head *head) { @@ -1715,6 +1702,9 @@ static void __dev_close_many(struct list_head *head) * We allow it to be called even after a DETACH hot-plug * event. */ + + netdev_ops_assert_locked(dev); + if (ops->ndo_stop) ops->ndo_stop(dev); @@ -1752,16 +1742,7 @@ void dev_close_many(struct list_head *head, bool unlink) } EXPORT_SYMBOL(dev_close_many); -/** - * dev_close - shutdown an interface. - * @dev: device to shutdown - * - * This function moves an active device into down state. A - * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device - * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier - * chain. - */ -void dev_close(struct net_device *dev) +void netif_close(struct net_device *dev) { if (dev->flags & IFF_UP) { LIST_HEAD(single); @@ -1771,18 +1752,9 @@ void dev_close(struct net_device *dev) list_del(&single); } } -EXPORT_SYMBOL(dev_close); +EXPORT_SYMBOL(netif_close); - -/** - * dev_disable_lro - disable Large Receive Offload on a device - * @dev: device - * - * Disable Large Receive Offload (LRO) on a net device. Must be - * called under RTNL. This is needed if received packets may be - * forwarded to another interface. - */ -void dev_disable_lro(struct net_device *dev) +void netif_disable_lro(struct net_device *dev) { struct net_device *lower_dev; struct list_head *iter; @@ -1793,10 +1765,12 @@ void dev_disable_lro(struct net_device *dev) if (unlikely(dev->features & NETIF_F_LRO)) netdev_WARN(dev, "failed to disable LRO!\n"); - netdev_for_each_lower_dev(dev, lower_dev, iter) - dev_disable_lro(lower_dev); + netdev_for_each_lower_dev(dev, lower_dev, iter) { + netdev_lock_ops(lower_dev); + netif_disable_lro(lower_dev); + netdev_unlock_ops(lower_dev); + } } -EXPORT_SYMBOL(dev_disable_lro); /** * dev_disable_gro_hw - disable HW Generic Receive Offload on a device @@ -2481,16 +2455,21 @@ static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb) } /** - * dev_nit_active - return true if any network interface taps are in use + * dev_nit_active_rcu - return true if any network interface taps are in use + * + * The caller must hold the RCU lock * * @dev: network device to check for the presence of taps */ -bool dev_nit_active(struct net_device *dev) +bool dev_nit_active_rcu(const struct net_device *dev) { - return !list_empty(&net_hotdata.ptype_all) || + /* Callers may hold either RCU or RCU BH lock */ + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); + + return !list_empty(&dev_net(dev)->ptype_all) || !list_empty(&dev->ptype_all); } -EXPORT_SYMBOL_GPL(dev_nit_active); +EXPORT_SYMBOL_GPL(dev_nit_active_rcu); /* * Support routine. 
Sends outgoing frames to any network @@ -2499,11 +2478,12 @@ EXPORT_SYMBOL_GPL(dev_nit_active); void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) { - struct list_head *ptype_list = &net_hotdata.ptype_all; struct packet_type *ptype, *pt_prev = NULL; + struct list_head *ptype_list; struct sk_buff *skb2 = NULL; rcu_read_lock(); + ptype_list = &dev_net_rcu(dev)->ptype_all; again: list_for_each_entry_rcu(ptype, ptype_list, list) { if (READ_ONCE(ptype->ignore_outgoing)) @@ -2547,7 +2527,7 @@ again: pt_prev = ptype; } - if (ptype_list == &net_hotdata.ptype_all) { + if (ptype_list != &dev->ptype_all) { ptype_list = &dev->ptype_all; goto again; } @@ -3150,6 +3130,7 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) if (dev->reg_state == NETREG_REGISTERED || dev->reg_state == NETREG_UNREGISTERING) { ASSERT_RTNL(); + netdev_ops_assert_locked(dev); rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues, txq); @@ -3180,7 +3161,6 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) } EXPORT_SYMBOL(netif_set_real_num_tx_queues); -#ifdef CONFIG_SYSFS /** * netif_set_real_num_rx_queues - set actual number of RX queues used * @dev: Network device @@ -3200,6 +3180,7 @@ int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) if (dev->reg_state == NETREG_REGISTERED) { ASSERT_RTNL(); + netdev_ops_assert_locked(dev); rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues, rxq); @@ -3211,7 +3192,6 @@ int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) return 0; } EXPORT_SYMBOL(netif_set_real_num_rx_queues); -#endif /** * netif_set_real_num_queues - set actual number of RX and TX queues used @@ -3792,7 +3772,7 @@ static int xmit_one(struct sk_buff *skb, struct net_device *dev, unsigned int len; int rc; - if (dev_nit_active(dev)) + if (dev_nit_active_rcu(dev)) dev_queue_xmit_nit(skb, dev); len = skb->len; @@ -4568,7 +4548,8 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) skb_reset_mac_header(skb); skb_assert_len(skb); - if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP)) + if (unlikely(skb_shinfo(skb)->tx_flags & + (SKBTX_SCHED_TSTAMP | SKBTX_BPF))) __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED); /* Disable soft irqs for various locks below. 
Also @@ -4770,6 +4751,11 @@ EXPORT_SYMBOL(rps_needed); struct static_key_false rfs_needed __read_mostly; EXPORT_SYMBOL(rfs_needed); +static u32 rfs_slot(u32 hash, const struct rps_dev_flow_table *flow_table) +{ + return hash_32(hash, flow_table->log); +} + static struct rps_dev_flow * set_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow *rflow, u16 next_cpu) @@ -4796,7 +4782,7 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb, flow_table = rcu_dereference(rxqueue->rps_flow_table); if (!flow_table) goto out; - flow_id = skb_get_hash(skb) & flow_table->mask; + flow_id = rfs_slot(skb_get_hash(skb), flow_table); rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, rxq_index, flow_id); if (rc < 0) @@ -4875,7 +4861,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, /* OK, now we know there is a match, * we can look at the local (per receive queue) flow table */ - rflow = &flow_table->flows[hash & flow_table->mask]; + rflow = &flow_table->flows[rfs_slot(hash, flow_table)]; tcpu = rflow->cpu; /* @@ -4942,13 +4928,13 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, rcu_read_lock(); flow_table = rcu_dereference(rxqueue->rps_flow_table); - if (flow_table && flow_id <= flow_table->mask) { + if (flow_table && flow_id < (1UL << flow_table->log)) { rflow = &flow_table->flows[flow_id]; cpu = READ_ONCE(rflow->cpu); if (READ_ONCE(rflow->filter) == filter_id && cpu < nr_cpu_ids && ((int)(READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head) - READ_ONCE(rflow->last_qtail)) < - (int)(10 * flow_table->mask))) + (int)(10 << flow_table->log))) expire = false; } rcu_read_unlock(); @@ -5735,7 +5721,8 @@ another_round: if (pfmemalloc) goto skip_taps; - list_for_each_entry_rcu(ptype, &net_hotdata.ptype_all, list) { + list_for_each_entry_rcu(ptype, &dev_net_rcu(skb->dev)->ptype_all, + list) { if (pt_prev) ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; @@ -5847,6 +5834,14 @@ check_vlan_id: deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, &ptype_base[ntohs(type) & PTYPE_HASH_MASK]); + + /* orig_dev and skb->dev could belong to different netns; + * Even in such case we need to traverse only the list + * coming from skb->dev, as the ptype owner (packet socket) + * will use dev_net(skb->dev) to do namespace filtering. 
+ */ + deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, + &dev_net_rcu(skb->dev)->ptype_specific); } deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, @@ -6057,7 +6052,7 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) static_branch_dec(&generic_xdp_needed_key); } else if (new && !old) { static_branch_inc(&generic_xdp_needed_key); - dev_disable_lro(dev); + netif_disable_lro(dev); dev_disable_gro_hw(dev); } break; @@ -6187,16 +6182,18 @@ EXPORT_SYMBOL(netif_receive_skb_list); static void flush_backlog(struct work_struct *work) { struct sk_buff *skb, *tmp; + struct sk_buff_head list; struct softnet_data *sd; + __skb_queue_head_init(&list); local_bh_disable(); sd = this_cpu_ptr(&softnet_data); backlog_lock_irq_disable(sd); skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { - if (skb->dev->reg_state == NETREG_UNREGISTERING) { + if (READ_ONCE(skb->dev->reg_state) == NETREG_UNREGISTERING) { __skb_unlink(skb, &sd->input_pkt_queue); - dev_kfree_skb_irq(skb); + __skb_queue_tail(&list, skb); rps_input_queue_head_incr(sd); } } @@ -6204,14 +6201,16 @@ static void flush_backlog(struct work_struct *work) local_lock_nested_bh(&softnet_data.process_queue_bh_lock); skb_queue_walk_safe(&sd->process_queue, skb, tmp) { - if (skb->dev->reg_state == NETREG_UNREGISTERING) { + if (READ_ONCE(skb->dev->reg_state) == NETREG_UNREGISTERING) { __skb_unlink(skb, &sd->process_queue); - kfree_skb(skb); + __skb_queue_tail(&list, skb); rps_input_queue_head_incr(sd); } } local_unlock_nested_bh(&softnet_data.process_queue_bh_lock); local_bh_enable(); + + __skb_queue_purge_reason(&list, SKB_DROP_REASON_DEV_READY); } static bool flush_required(int cpu) @@ -6475,7 +6474,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done) return false; if (work_done) { - if (n->gro_bitmask) + if (n->gro.bitmask) timeout = napi_get_gro_flush_timeout(n); n->defer_hard_irqs_count = napi_get_defer_hard_irqs(n); } @@ -6485,15 +6484,14 @@ bool napi_complete_done(struct napi_struct *n, int work_done) if (timeout) ret = false; } - if (n->gro_bitmask) { - /* When the NAPI instance uses a timeout and keeps postponing - * it, we need to bound somehow the time packets are kept in - * the GRO layer - */ - napi_gro_flush(n, !!timeout); - } - gro_normal_list(n); + /* + * When the NAPI instance uses a timeout and keeps postponing + * it, we need to bound somehow the time packets are kept in + * the GRO layer. + */ + gro_flush(&n->gro, !!timeout); + gro_normal_list(&n->gro); if (unlikely(!list_empty(&n->poll_list))) { /* If n->poll_list is not empty, we need to mask irqs */ @@ -6557,19 +6555,15 @@ static void skb_defer_free_flush(struct softnet_data *sd) static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) { if (!skip_schedule) { - gro_normal_list(napi); + gro_normal_list(&napi->gro); __napi_schedule(napi); return; } - if (napi->gro_bitmask) { - /* flush too old packets - * If HZ < 1000, flush all packets. - */ - napi_gro_flush(napi, HZ >= 1000); - } + /* Flush too old packets. 
If HZ < 1000, flush all packets */ + gro_flush(&napi->gro, HZ >= 1000); + gro_normal_list(&napi->gro); - gro_normal_list(napi); clear_bit(NAPI_STATE_SCHED, &napi->state); } @@ -6676,7 +6670,7 @@ restart: } work = napi_poll(napi, budget); trace_napi_poll(napi, work, budget); - gro_normal_list(napi); + gro_normal_list(&napi->gro); count: if (work > 0) __NET_ADD_STATS(dev_net(napi->dev), @@ -6776,6 +6770,8 @@ void napi_resume_irqs(unsigned int napi_id) static void __napi_hash_add_with_id(struct napi_struct *napi, unsigned int napi_id) { + napi->gro.cached_napi_id = napi_id; + WRITE_ONCE(napi->napi_id, napi_id); hlist_add_head_rcu(&napi->napi_hash_node, &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); @@ -6803,7 +6799,7 @@ static void napi_hash_add(struct napi_struct *napi) /* 0..NR_CPUS range is reserved for sender_cpu use */ do { - if (unlikely(++napi_gen_id < MIN_NAPI_ID)) + if (unlikely(!napi_id_valid(++napi_gen_id))) napi_gen_id = MIN_NAPI_ID; } while (napi_by_id(napi_gen_id)); @@ -6844,17 +6840,6 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) return HRTIMER_NORESTART; } -static void init_gro_hash(struct napi_struct *napi) -{ - int i; - - for (i = 0; i < GRO_HASH_BUCKETS; i++) { - INIT_LIST_HEAD(&napi->gro_hash[i].list); - napi->gro_hash[i].count = 0; - } - napi->gro_bitmask = 0; -} - int dev_set_threaded(struct net_device *dev, bool threaded) { struct napi_struct *napi; @@ -6916,8 +6901,7 @@ void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index, if (WARN_ON_ONCE(napi && !napi->dev)) return; - if (dev->reg_state >= NETREG_REGISTERED) - ASSERT_RTNL(); + netdev_ops_assert_locked_or_invisible(dev); switch (type) { case NETDEV_QUEUE_TYPE_RX: @@ -6934,11 +6918,175 @@ void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index, } EXPORT_SYMBOL(netif_queue_set_napi); +static void +netif_napi_irq_notify(struct irq_affinity_notify *notify, + const cpumask_t *mask) +{ + struct napi_struct *napi = + container_of(notify, struct napi_struct, notify); +#ifdef CONFIG_RFS_ACCEL + struct cpu_rmap *rmap = napi->dev->rx_cpu_rmap; + int err; +#endif + + if (napi->config && napi->dev->irq_affinity_auto) + cpumask_copy(&napi->config->affinity_mask, mask); + +#ifdef CONFIG_RFS_ACCEL + if (napi->dev->rx_cpu_rmap_auto) { + err = cpu_rmap_update(rmap, napi->napi_rmap_idx, mask); + if (err) + netdev_warn(napi->dev, "RMAP update failed (%d)\n", + err); + } +#endif +} + +#ifdef CONFIG_RFS_ACCEL +static void netif_napi_affinity_release(struct kref *ref) +{ + struct napi_struct *napi = + container_of(ref, struct napi_struct, notify.kref); + struct cpu_rmap *rmap = napi->dev->rx_cpu_rmap; + + netdev_assert_locked(napi->dev); + WARN_ON(test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER, + &napi->state)); + + if (!napi->dev->rx_cpu_rmap_auto) + return; + rmap->obj[napi->napi_rmap_idx] = NULL; + napi->napi_rmap_idx = -1; + cpu_rmap_put(rmap); +} + +int netif_enable_cpu_rmap(struct net_device *dev, unsigned int num_irqs) +{ + if (dev->rx_cpu_rmap_auto) + return 0; + + dev->rx_cpu_rmap = alloc_irq_cpu_rmap(num_irqs); + if (!dev->rx_cpu_rmap) + return -ENOMEM; + + dev->rx_cpu_rmap_auto = true; + return 0; +} +EXPORT_SYMBOL(netif_enable_cpu_rmap); + +static void netif_del_cpu_rmap(struct net_device *dev) +{ + struct cpu_rmap *rmap = dev->rx_cpu_rmap; + + if (!dev->rx_cpu_rmap_auto) + return; + + /* Free the rmap */ + cpu_rmap_put(rmap); + dev->rx_cpu_rmap = NULL; + dev->rx_cpu_rmap_auto = false; +} + +#else +static void netif_napi_affinity_release(struct kref *ref) +{ +} + 
+int netif_enable_cpu_rmap(struct net_device *dev, unsigned int num_irqs) +{ + return 0; +} +EXPORT_SYMBOL(netif_enable_cpu_rmap); + +static void netif_del_cpu_rmap(struct net_device *dev) +{ +} +#endif + +void netif_set_affinity_auto(struct net_device *dev) +{ + unsigned int i, maxqs, numa; + + maxqs = max(dev->num_tx_queues, dev->num_rx_queues); + numa = dev_to_node(&dev->dev); + + for (i = 0; i < maxqs; i++) + cpumask_set_cpu(cpumask_local_spread(i, numa), + &dev->napi_config[i].affinity_mask); + + dev->irq_affinity_auto = true; +} +EXPORT_SYMBOL(netif_set_affinity_auto); + +void netif_napi_set_irq_locked(struct napi_struct *napi, int irq) +{ + int rc; + + netdev_assert_locked_or_invisible(napi->dev); + + if (napi->irq == irq) + return; + + /* Remove existing resources */ + if (test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state)) + irq_set_affinity_notifier(napi->irq, NULL); + + napi->irq = irq; + if (irq < 0 || + (!napi->dev->rx_cpu_rmap_auto && !napi->dev->irq_affinity_auto)) + return; + + /* Abort for buggy drivers */ + if (napi->dev->irq_affinity_auto && WARN_ON_ONCE(!napi->config)) + return; + +#ifdef CONFIG_RFS_ACCEL + if (napi->dev->rx_cpu_rmap_auto) { + rc = cpu_rmap_add(napi->dev->rx_cpu_rmap, napi); + if (rc < 0) + return; + + cpu_rmap_get(napi->dev->rx_cpu_rmap); + napi->napi_rmap_idx = rc; + } +#endif + + /* Use core IRQ notifier */ + napi->notify.notify = netif_napi_irq_notify; + napi->notify.release = netif_napi_affinity_release; + rc = irq_set_affinity_notifier(irq, &napi->notify); + if (rc) { + netdev_warn(napi->dev, "Unable to set IRQ notifier (%d)\n", + rc); + goto put_rmap; + } + + set_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state); + return; + +put_rmap: +#ifdef CONFIG_RFS_ACCEL + if (napi->dev->rx_cpu_rmap_auto) { + napi->dev->rx_cpu_rmap->obj[napi->napi_rmap_idx] = NULL; + cpu_rmap_put(napi->dev->rx_cpu_rmap); + napi->napi_rmap_idx = -1; + } +#endif + napi->notify.notify = NULL; + napi->notify.release = NULL; +} +EXPORT_SYMBOL(netif_napi_set_irq_locked); + static void napi_restore_config(struct napi_struct *n) { n->defer_hard_irqs = n->config->defer_hard_irqs; n->gro_flush_timeout = n->config->gro_flush_timeout; n->irq_suspend_timeout = n->config->irq_suspend_timeout; + + if (n->dev->irq_affinity_auto && + test_bit(NAPI_STATE_HAS_NOTIFIER, &n->state)) + irq_set_affinity(n->irq, &n->config->affinity_mask); + /* a NAPI ID might be stored in the config, if so use it. if not, use * napi_hash_add to generate one for us. 
*/ @@ -6974,7 +7122,7 @@ netif_napi_dev_list_add(struct net_device *dev, struct napi_struct *napi) higher = &dev->napi_list; list_for_each_entry(pos, &dev->napi_list, dev_list) { - if (pos->napi_id >= MIN_NAPI_ID) + if (napi_id_valid(pos->napi_id)) pos_id = pos->napi_id; else if (pos->config) pos_id = pos->config->napi_id; @@ -7016,12 +7164,9 @@ void netif_napi_add_weight_locked(struct net_device *dev, INIT_LIST_HEAD(&napi->poll_list); INIT_HLIST_NODE(&napi->napi_hash_node); - hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); - napi->timer.function = napi_watchdog; - init_gro_hash(napi); + hrtimer_setup(&napi->timer, napi_watchdog, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); + gro_init(&napi->gro); napi->skb = NULL; - INIT_LIST_HEAD(&napi->rx_list); - napi->rx_count = 0; napi->poll = poll; if (weight > NAPI_POLL_WEIGHT) netdev_err_once(dev, "%s() called with weight %d\n", __func__, @@ -7135,19 +7280,6 @@ void napi_enable(struct napi_struct *n) } EXPORT_SYMBOL(napi_enable); -static void flush_gro_hash(struct napi_struct *napi) -{ - int i; - - for (i = 0; i < GRO_HASH_BUCKETS; i++) { - struct sk_buff *skb, *n; - - list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list) - kfree_skb(skb); - napi->gro_hash[i].count = 0; - } -} - /* Must be called in process context */ void __netif_napi_del_locked(struct napi_struct *napi) { @@ -7156,6 +7288,12 @@ void __netif_napi_del_locked(struct napi_struct *napi) if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state)) return; + /* Make sure NAPI is disabled (or was never enabled). */ + WARN_ON(!test_bit(NAPI_STATE_SCHED, &napi->state)); + + if (test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state)) + irq_set_affinity_notifier(napi->irq, NULL); + if (napi->config) { napi->index = -1; napi->config = NULL; @@ -7164,8 +7302,7 @@ void __netif_napi_del_locked(struct napi_struct *napi) list_del_rcu(&napi->dev_list); napi_free_frags(napi); - flush_gro_hash(napi); - napi->gro_bitmask = 0; + gro_cleanup(&napi->gro); if (napi->thread) { kthread_stop(napi->thread); @@ -7224,14 +7361,9 @@ static int __napi_poll(struct napi_struct *n, bool *repoll) return work; } - if (n->gro_bitmask) { - /* flush too old packets - * If HZ < 1000, flush all packets. - */ - napi_gro_flush(n, HZ >= 1000); - } - - gro_normal_list(n); + /* Flush too old packets. If HZ < 1000, flush all packets */ + gro_flush(&n->gro, HZ >= 1000); + gro_normal_list(&n->gro); /* Some drivers may have called napi_schedule * prior to exhausting their budget. @@ -9090,7 +9222,7 @@ int dev_set_promiscuity(struct net_device *dev, int inc) } EXPORT_SYMBOL(dev_set_promiscuity); -static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify) +int netif_set_allmulti(struct net_device *dev, int inc, bool notify) { unsigned int old_flags = dev->flags, old_gflags = dev->gflags; unsigned int allmulti, flags; @@ -9125,25 +9257,6 @@ static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify) return 0; } -/** - * dev_set_allmulti - update allmulti count on a device - * @dev: device - * @inc: modifier - * - * Add or remove reception of all multicast frames to a device. While the - * count in the device remains above zero the interface remains listening - * to all interfaces. Once it hits zero the device reverts back to normal - * filtering operation. A negative @inc value is used to drop the counter - * when releasing a resource needing all multicasts. - * Return 0 if successful or a negative errno code on error. 
- */ - -int dev_set_allmulti(struct net_device *dev, int inc) -{ - return __dev_set_allmulti(dev, inc, true); -} -EXPORT_SYMBOL(dev_set_allmulti); - /* * Upload unicast and multicast address lists to device and * configure RX filtering. When the device doesn't support unicast @@ -9259,7 +9372,7 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags, if ((flags ^ dev->gflags) & IFF_PROMISC) { int inc = (flags & IFF_PROMISC) ? 1 : -1; - unsigned int old_flags = dev->flags; + old_flags = dev->flags; dev->gflags ^= IFF_PROMISC; @@ -9276,7 +9389,7 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags, int inc = (flags & IFF_ALLMULTI) ? 1 : -1; dev->gflags ^= IFF_ALLMULTI; - __dev_set_allmulti(dev, inc, false); + netif_set_allmulti(dev, inc, false); } return ret; @@ -9311,17 +9424,8 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, } } -/** - * dev_change_flags - change device settings - * @dev: device - * @flags: device state flags - * @extack: netlink extended ack - * - * Change settings on device based state flags. The flags are - * in the userspace exported format. - */ -int dev_change_flags(struct net_device *dev, unsigned int flags, - struct netlink_ext_ack *extack) +int netif_change_flags(struct net_device *dev, unsigned int flags, + struct netlink_ext_ack *extack) { int ret; unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags; @@ -9334,7 +9438,6 @@ int dev_change_flags(struct net_device *dev, unsigned int flags, __dev_notify_flags(dev, old_flags, changes, 0, NULL); return ret; } -EXPORT_SYMBOL(dev_change_flags); int __dev_set_mtu(struct net_device *dev, int new_mtu) { @@ -9366,15 +9469,15 @@ int dev_validate_mtu(struct net_device *dev, int new_mtu, } /** - * dev_set_mtu_ext - Change maximum transfer unit + * netif_set_mtu_ext - Change maximum transfer unit * @dev: device * @new_mtu: new transfer unit * @extack: netlink extended ack * * Change the maximum transfer size of the network device. 
*/ -int dev_set_mtu_ext(struct net_device *dev, int new_mtu, - struct netlink_ext_ack *extack) +int netif_set_mtu_ext(struct net_device *dev, int new_mtu, + struct netlink_ext_ack *extack) { int err, orig_mtu; @@ -9412,25 +9515,20 @@ int dev_set_mtu_ext(struct net_device *dev, int new_mtu, return err; } -int dev_set_mtu(struct net_device *dev, int new_mtu) +int netif_set_mtu(struct net_device *dev, int new_mtu) { struct netlink_ext_ack extack; int err; memset(&extack, 0, sizeof(extack)); - err = dev_set_mtu_ext(dev, new_mtu, &extack); + err = netif_set_mtu_ext(dev, new_mtu, &extack); if (err && extack._msg) net_err_ratelimited("%s: %s\n", dev->name, extack._msg); return err; } -EXPORT_SYMBOL(dev_set_mtu); +EXPORT_SYMBOL(netif_set_mtu); -/** - * dev_change_tx_queue_len - Change TX queue length of a netdevice - * @dev: device - * @new_len: new tx queue length - */ -int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len) +int netif_change_tx_queue_len(struct net_device *dev, unsigned long new_len) { unsigned int orig_len = dev->tx_queue_len; int res; @@ -9457,12 +9555,7 @@ err_rollback: return res; } -/** - * dev_set_group - Change group this device belongs to - * @dev: device - * @new_group: group this device should belong to - */ -void dev_set_group(struct net_device *dev, int new_group) +void netif_set_group(struct net_device *dev, int new_group) { dev->group = new_group; } @@ -9488,16 +9581,8 @@ int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr, } EXPORT_SYMBOL(dev_pre_changeaddr_notify); -/** - * dev_set_mac_address - Change Media Access Control Address - * @dev: device - * @sa: new address - * @extack: netlink extended ack - * - * Change the hardware (MAC) address of the device - */ -int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, - struct netlink_ext_ack *extack) +int netif_set_mac_address(struct net_device *dev, struct sockaddr *sa, + struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; int err; @@ -9521,22 +9606,9 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, add_device_randomness(dev->dev_addr, dev->addr_len); return 0; } -EXPORT_SYMBOL(dev_set_mac_address); DECLARE_RWSEM(dev_addr_sem); -int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa, - struct netlink_ext_ack *extack) -{ - int ret; - - down_write(&dev_addr_sem); - ret = dev_set_mac_address(dev, sa, extack); - up_write(&dev_addr_sem); - return ret; -} -EXPORT_SYMBOL(dev_set_mac_address_user); - int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name) { size_t size = sizeof(sa->sa_data_min); @@ -9565,14 +9637,7 @@ unlock: } EXPORT_SYMBOL(dev_get_mac_address); -/** - * dev_change_carrier - Change device carrier - * @dev: device - * @new_carrier: new value - * - * Change device carrier - */ -int dev_change_carrier(struct net_device *dev, bool new_carrier) +int netif_change_carrier(struct net_device *dev, bool new_carrier) { const struct net_device_ops *ops = dev->netdev_ops; @@ -9683,13 +9748,7 @@ bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b) } EXPORT_SYMBOL(netdev_port_same_parent_id); -/** - * dev_change_proto_down - set carrier according to proto_down. 
- * - * @dev: device - * @proto_down: new value - */ -int dev_change_proto_down(struct net_device *dev, bool proto_down) +int netif_change_proto_down(struct net_device *dev, bool proto_down) { if (!dev->change_proto_down) return -EOPNOTSUPP; @@ -9704,14 +9763,14 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down) } /** - * dev_change_proto_down_reason - proto down reason + * netdev_change_proto_down_reason_locked - proto down reason * * @dev: device * @mask: proto down mask * @value: proto down value */ -void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask, - u32 value) +void netdev_change_proto_down_reason_locked(struct net_device *dev, + unsigned long mask, u32 value) { u32 proto_down_reason; int b; @@ -9800,7 +9859,7 @@ u8 dev_xdp_sb_prog_count(struct net_device *dev) return count; } -int dev_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf) +int netif_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf) { if (!dev->netdev_ops->ndo_bpf) return -EOPNOTSUPP; @@ -9820,7 +9879,6 @@ int dev_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf) return dev->netdev_ops->ndo_bpf(dev, bpf); } -EXPORT_SYMBOL_GPL(dev_xdp_propagate); u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode) { @@ -9850,6 +9908,8 @@ static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode, struct netdev_bpf xdp; int err; + netdev_ops_assert_locked(dev); + if (dev->cfg->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED && prog && !prog->aux->xdp_has_frags) { NL_SET_ERR_MSG(extack, "unable to install XDP to device using tcp-data-split"); @@ -10082,7 +10142,9 @@ static void bpf_xdp_link_release(struct bpf_link *link) * already NULL, in which case link was already auto-detached */ if (xdp_link->dev) { + netdev_lock_ops(xdp_link->dev); WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link)); + netdev_unlock_ops(xdp_link->dev); xdp_link->dev = NULL; } @@ -10164,10 +10226,12 @@ static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog, goto out_unlock; } + netdev_lock_ops(xdp_link->dev); mode = dev_xdp_mode(xdp_link->dev, xdp_link->flags); bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode); err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL, xdp_link->flags, new_prog); + netdev_unlock_ops(xdp_link->dev); if (err) goto out_unlock; @@ -10293,7 +10357,7 @@ u32 dev_get_min_mp_channel_count(const struct net_device *dev) { int i; - ASSERT_RTNL(); + netdev_ops_assert_locked(dev); for (i = dev->real_num_rx_queues - 1; i >= 0; i--) if (dev->_rx[i].mp_params.mp_priv) @@ -10519,6 +10583,7 @@ int __netdev_update_features(struct net_device *dev) int err = -1; ASSERT_RTNL(); + netdev_ops_assert_locked(dev); features = netdev_get_wanted_features(dev); @@ -10952,7 +11017,9 @@ int register_netdevice(struct net_device *dev) if (ret) goto err_uninit_notify; + netdev_lock_ops(dev); __netdev_update_features(dev); + netdev_unlock_ops(dev); /* * Default initial state at registry is that the @@ -11197,9 +11264,8 @@ void netdev_run_todo(void) list_replace_init(&net_unlink_list, &unlink_list); while (!list_empty(&unlink_list)) { - struct net_device *dev = list_first_entry(&unlink_list, - struct net_device, - unlink_list); + dev = list_first_entry(&unlink_list, struct net_device, + unlink_list); list_del_init(&dev->unlink_list); dev->nested_level = dev->lower_level - 1; } @@ -11709,6 +11775,8 @@ void free_netdev(struct net_device *dev) netdev_napi_exit(dev); + netif_del_cpu_rmap(dev); + ref_tracker_dir_exit(&dev->refcnt_tracker); 
#ifdef CONFIG_PCPU_DEV_REFCNT free_percpu(dev->pcpu_refcnt); @@ -11823,6 +11891,19 @@ void unregister_netdevice_queue(struct net_device *dev, struct list_head *head) } EXPORT_SYMBOL(unregister_netdevice_queue); +static void dev_memory_provider_uninstall(struct net_device *dev) +{ + unsigned int i; + + for (i = 0; i < dev->real_num_rx_queues; i++) { + struct netdev_rx_queue *rxq = &dev->_rx[i]; + struct pp_memory_provider_params *p = &rxq->mp_params; + + if (p->mp_ops && p->mp_ops->uninstall) + p->mp_ops->uninstall(rxq->mp_params.mp_priv, rxq); + } +} + void unregister_netdevice_many_notify(struct list_head *head, u32 portid, const struct nlmsghdr *nlh) { @@ -11854,11 +11935,14 @@ void unregister_netdevice_many_notify(struct list_head *head, } /* If device is running, close it first. */ - list_for_each_entry(dev, head, unreg_list) + list_for_each_entry(dev, head, unreg_list) { list_add_tail(&dev->close_list, &close_head); + netdev_lock_ops(dev); + } dev_close_many(&close_head, true); list_for_each_entry(dev, head, unreg_list) { + netdev_unlock_ops(dev); /* And unlink it from device chain. */ unlist_netdevice(dev); netdev_lock(dev); @@ -11875,9 +11959,11 @@ void unregister_netdevice_many_notify(struct list_head *head, /* Shutdown queueing discipline. */ dev_shutdown(dev); dev_tcx_uninstall(dev); + netdev_lock_ops(dev); dev_xdp_uninstall(dev); + dev_memory_provider_uninstall(dev); + netdev_unlock_ops(dev); bpf_dev_bound_netdev_unregister(dev); - dev_dmabuf_uninstall(dev); netdev_offload_xstats_disable_all(dev); @@ -11971,24 +12057,9 @@ void unregister_netdev(struct net_device *dev) } EXPORT_SYMBOL(unregister_netdev); -/** - * __dev_change_net_namespace - move device to different nethost namespace - * @dev: device - * @net: network namespace - * @pat: If not NULL name pattern to try if the current device name - * is already taken in the destination network namespace. - * @new_ifindex: If not zero, specifies device index in the target - * namespace. - * - * This function shuts down a device interface and moves it - * to a new network namespace. On success 0 is returned, on - * a failure a netagive errno code is returned. - * - * Callers must hold the rtnl semaphore. - */ - -int __dev_change_net_namespace(struct net_device *dev, struct net *net, - const char *pat, int new_ifindex) +int netif_change_net_namespace(struct net_device *dev, struct net *net, + const char *pat, int new_ifindex, + struct netlink_ext_ack *extack) { struct netdev_name_node *name_node; struct net *net_old = dev_net(dev); @@ -11999,12 +12070,16 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, /* Don't allow namespace local devices to be moved. 
*/ err = -EINVAL; - if (dev->netns_local) + if (dev->netns_immutable) { + NL_SET_ERR_MSG(extack, "The interface netns is immutable"); goto out; + } /* Ensure the device has been registered */ - if (dev->reg_state != NETREG_REGISTERED) + if (dev->reg_state != NETREG_REGISTERED) { + NL_SET_ERR_MSG(extack, "The interface isn't registered"); goto out; + } /* Get out if there is nothing todo */ err = 0; @@ -12017,30 +12092,49 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, err = -EEXIST; if (netdev_name_in_use(net, dev->name)) { /* We get here if we can't use the current device name */ - if (!pat) + if (!pat) { + NL_SET_ERR_MSG(extack, + "An interface with the same name exists in the target netns"); goto out; + } err = dev_prep_valid_name(net, dev, pat, new_name, EEXIST); - if (err < 0) + if (err < 0) { + NL_SET_ERR_MSG_FMT(extack, + "Unable to use '%s' for the new interface name in the target netns", + pat); goto out; + } } /* Check that none of the altnames conflicts. */ err = -EEXIST; - netdev_for_each_altname(dev, name_node) - if (netdev_name_in_use(net, name_node->name)) + netdev_for_each_altname(dev, name_node) { + if (netdev_name_in_use(net, name_node->name)) { + NL_SET_ERR_MSG_FMT(extack, + "An interface with the altname %s exists in the target netns", + name_node->name); goto out; + } + } /* Check that new_ifindex isn't used yet. */ if (new_ifindex) { err = dev_index_reserve(net, new_ifindex); - if (err < 0) + if (err < 0) { + NL_SET_ERR_MSG_FMT(extack, + "The ifindex %d is not available in the target netns", + new_ifindex); goto out; + } } else { /* If there is an ifindex conflict assign a new one */ err = dev_index_reserve(net, dev->ifindex); if (err == -EBUSY) err = dev_index_reserve(net, 0); - if (err < 0) + if (err < 0) { + NL_SET_ERR_MSG(extack, + "Unable to allocate a new ifindex in the target netns"); goto out; + } new_ifindex = err; } @@ -12049,7 +12143,7 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, */ /* If device is running close it first. */ - dev_close(dev); + netif_close(dev); /* And unlink it from device chain */ unlist_netdevice(dev); @@ -12131,7 +12225,6 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, out: return err; } -EXPORT_SYMBOL_GPL(__dev_change_net_namespace); static int dev_cpu_dead(unsigned int oldcpu) { @@ -12246,7 +12339,7 @@ static struct hlist_head * __net_init netdev_create_hash(void) static int __net_init netdev_init(struct net *net) { BUILD_BUG_ON(GRO_HASH_BUCKETS > - 8 * sizeof_field(struct napi_struct, gro_bitmask)); + BITS_PER_BYTE * sizeof_field(struct gro_node, bitmask)); INIT_LIST_HEAD(&net->dev_base_head); @@ -12381,7 +12474,7 @@ static void __net_exit default_device_exit_net(struct net *net) char fb_name[IFNAMSIZ]; /* Ignore unmoveable devices (i.e. loopback) */ - if (dev->netns_local) + if (dev->netns_immutable) continue; /* Leave virtual devices for the generic cleanup */ @@ -12611,7 +12704,7 @@ static int __init net_dev_init(void) INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd); spin_lock_init(&sd->defer_lock); - init_gro_hash(&sd->backlog); + gro_init(&sd->backlog.gro); sd->backlog.poll = process_backlog; sd->backlog.weight = weight_p; INIT_LIST_HEAD(&sd->backlog.poll_list); |
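
Illustrative note (not part of the patch above): several hunks replace the mask-based RFS flow-table indexing ("hash & flow_table->mask") with a new rfs_slot() helper that selects a slot via hash_32() on a table described by its log2 size ("flow_table->log"). The stand-alone C sketch below mimics that indexing under the assumption that hash_32() behaves as in include/linux/hash.h; the example hash value and table size are arbitrary.

#include <stdint.h>
#include <stdio.h>

#define GOLDEN_RATIO_32 0x61C88647u

/* Kernel-style hash_32(): multiplicative hash, keep the top 'bits' bits. */
static inline uint32_t hash_32(uint32_t val, unsigned int bits)
{
	return (val * GOLDEN_RATIO_32) >> (32 - bits);
}

/* Sketch of the new rfs_slot(): the flow table holds 1 << log entries. */
static uint32_t rfs_slot(uint32_t hash, unsigned int log)
{
	return hash_32(hash, log);
}

int main(void)
{
	unsigned int log = 8;           /* 256-entry flow table */
	uint32_t hash = 0xdeadbeef;     /* stand-in for a skb_get_hash() value */

	printf("mask-based slot: %u\n", hash & ((1u << log) - 1)); /* old: low bits only */
	printf("hash_32 slot:    %u\n", rfs_slot(hash, log));      /* new: whole hash mixed in */
	return 0;
}

With hash_32(), every bit of the flow hash influences the chosen slot, and carrying the log2 size also lets the expiry path (rps_may_expire_flow() in the hunks above) bound flow_id by "1UL << log" rather than by the old mask.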