Diffstat (limited to 'net/core/dev.c')
-rw-r--r--  net/core/dev.c | 1256
 1 file changed, 850 insertions(+), 406 deletions(-)
diff --git a/net/core/dev.c b/net/core/dev.c
index a9f62f5aeb84..be17e0660144 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -92,6 +92,7 @@
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
+#include <linux/ethtool_netlink.h>
#include <linux/skbuff.h>
#include <linux/kthread.h>
#include <linux/bpf.h>
@@ -105,6 +106,7 @@
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/gro.h>
+#include <net/netdev_queues.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/checksum.h>
@@ -154,9 +156,11 @@
#include <linux/pm_runtime.h>
#include <linux/prandom.h>
#include <linux/once_lite.h>
+#include <net/netdev_lock.h>
#include <net/netdev_rx_queue.h>
#include <net/page_pool/types.h>
#include <net/page_pool/helpers.h>
+#include <net/page_pool/memory_provider.h>
#include <net/rps.h>
#include <linux/phy_link_topology.h>
@@ -180,8 +184,6 @@ static DEFINE_SPINLOCK(napi_hash_lock);
static unsigned int napi_gen_id = NR_CPUS;
static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
-static DECLARE_RWSEM(devnet_rename_sem);
-
static inline void dev_base_seq_inc(struct net *net)
{
unsigned int val = net->dev_base_seq + 1;
@@ -460,7 +462,7 @@ EXPORT_PER_CPU_SYMBOL(softnet_data);
 * PP consumers must take care to run APIs in the appropriate context
* (e.g. NAPI context).
*/
-static DEFINE_PER_CPU(struct page_pool *, system_page_pool);
+DEFINE_PER_CPU(struct page_pool *, system_page_pool);
#ifdef CONFIG_LOCKDEP
/*
@@ -570,10 +572,18 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
static inline struct list_head *ptype_head(const struct packet_type *pt)
{
- if (pt->type == htons(ETH_P_ALL))
- return pt->dev ? &pt->dev->ptype_all : &net_hotdata.ptype_all;
- else
- return pt->dev ? &pt->dev->ptype_specific :
+ if (pt->type == htons(ETH_P_ALL)) {
+ if (!pt->af_packet_net && !pt->dev)
+ return NULL;
+
+ return pt->dev ? &pt->dev->ptype_all :
+ &pt->af_packet_net->ptype_all;
+ }
+
+ if (pt->dev)
+ return &pt->dev->ptype_specific;
+
+ return pt->af_packet_net ? &pt->af_packet_net->ptype_specific :
&ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
}
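
This hunk moves ETH_P_ALL and protocol-specific taps off the global
net_hotdata lists and scopes them to a netns via pt->af_packet_net;
dev_add_pack() below now WARNs and bails out if an ETH_P_ALL tap sets
neither @dev nor @af_packet_net. A minimal sketch of a namespaced tap,
assuming it mirrors how AF_PACKET fills struct packet_type (tap_rcv is
an illustrative handler, not part of this patch):

	static struct packet_type tap __read_mostly = {
		.type = cpu_to_be16(ETH_P_ALL),
		.func = tap_rcv,
	};

	tap.af_packet_net = net;	/* scope delivery to this netns */
	dev_add_pack(&tap);		/* lands on net->ptype_all */
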
@@ -594,6 +604,9 @@ void dev_add_pack(struct packet_type *pt)
{
struct list_head *head = ptype_head(pt);
+ if (WARN_ON_ONCE(!head))
+ return;
+
spin_lock(&ptype_lock);
list_add_rcu(&pt->list, head);
spin_unlock(&ptype_lock);
@@ -618,6 +631,9 @@ void __dev_remove_pack(struct packet_type *pt)
struct list_head *head = ptype_head(pt);
struct packet_type *pt1;
+ if (!head)
+ return;
+
spin_lock(&ptype_lock);
list_for_each_entry(pt1, head, list) {
@@ -767,7 +783,8 @@ static struct napi_struct *napi_by_id(unsigned int napi_id)
}
/* must be called under rcu_read_lock(), as we don't take a reference */
-struct napi_struct *netdev_napi_by_id(struct net *net, unsigned int napi_id)
+static struct napi_struct *
+netdev_napi_by_id(struct net *net, unsigned int napi_id)
{
struct napi_struct *napi;
@@ -784,6 +801,49 @@ struct napi_struct *netdev_napi_by_id(struct net *net, unsigned int napi_id)
}
/**
+ * netdev_napi_by_id_lock() - find a device by NAPI ID and lock it
+ * @net: the applicable net namespace
+ * @napi_id: ID of a NAPI of a target device
+ *
+ * Find a NAPI instance with @napi_id. Lock its device.
+ * The device must be in %NETREG_REGISTERED state for lookup to succeed.
+ * netdev_unlock() must be called to release it.
+ *
+ * Return: pointer to the NAPI instance, with its device's lock held; NULL if not found.
+ */
+struct napi_struct *
+netdev_napi_by_id_lock(struct net *net, unsigned int napi_id)
+{
+ struct napi_struct *napi;
+ struct net_device *dev;
+
+ rcu_read_lock();
+ napi = netdev_napi_by_id(net, napi_id);
+ if (!napi || READ_ONCE(napi->dev->reg_state) != NETREG_REGISTERED) {
+ rcu_read_unlock();
+ return NULL;
+ }
+
+ dev = napi->dev;
+ dev_hold(dev);
+ rcu_read_unlock();
+
+ dev = __netdev_put_lock(dev);
+ if (!dev)
+ return NULL;
+
+ rcu_read_lock();
+ napi = netdev_napi_by_id(net, napi_id);
+ if (napi && napi->dev != dev)
+ napi = NULL;
+ rcu_read_unlock();
+
+ if (!napi)
+ netdev_unlock(dev);
+ return napi;
+}
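
The helper above is a lookup/lock/re-verify dance: find the NAPI under
RCU, pin its device, take the instance lock, then repeat the lookup to
confirm the ID still maps to the same device (it may have been freed
and the ID reused while we slept on the lock). A hedged caller sketch
(configure_napi() is illustrative, not part of this patch):

	struct napi_struct *napi;

	napi = netdev_napi_by_id_lock(net, napi_id);
	if (!napi)
		return -ENOENT;
	/* napi and napi->dev are stable: the instance lock is held */
	err = configure_napi(napi);
	netdev_unlock(napi->dev);
	return err;
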
+
+/**
* __dev_get_by_name - find a device by its name
* @net: the applicable net namespace
* @name: name to find
@@ -957,21 +1017,86 @@ EXPORT_SYMBOL(netdev_get_by_index);
* its reference counter increased so the caller must be careful
 * about locking. The caller must hold the RCU lock.
*/
-
struct net_device *dev_get_by_napi_id(unsigned int napi_id)
{
struct napi_struct *napi;
WARN_ON_ONCE(!rcu_read_lock_held());
- if (napi_id < MIN_NAPI_ID)
+ if (!napi_id_valid(napi_id))
return NULL;
napi = napi_by_id(napi_id);
return napi ? napi->dev : NULL;
}
-EXPORT_SYMBOL(dev_get_by_napi_id);
+
+/* Release the held reference on the net_device and, if the net_device
+ * is still registered, take the instance lock. If the device is being
+ * unregistered, NULL is returned (but the reference has been released
+ * either way!).
+ *
+ * This helper is intended for locking a net_device after it has been
+ * looked up via a lockless lookup helper. The lock prevents the
+ * instance from going away.
+ */
+struct net_device *__netdev_put_lock(struct net_device *dev)
+{
+ netdev_lock(dev);
+ if (dev->reg_state > NETREG_REGISTERED) {
+ netdev_unlock(dev);
+ dev_put(dev);
+ return NULL;
+ }
+ dev_put(dev);
+ return dev;
+}
+
+/**
+ * netdev_get_by_index_lock() - find a device by its ifindex
+ * @net: the applicable net namespace
+ * @ifindex: index of device
+ *
+ * Search for an interface by index. If a valid device
+ * with @ifindex is found it will be returned with netdev->lock held.
+ * netdev_unlock() must be called to release it.
+ *
+ * Return: pointer to a device with lock held, NULL if not found.
+ */
+struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex)
+{
+ struct net_device *dev;
+
+ dev = dev_get_by_index(net, ifindex);
+ if (!dev)
+ return NULL;
+
+ return __netdev_put_lock(dev);
+}
+
+struct net_device *
+netdev_xa_find_lock(struct net *net, struct net_device *dev,
+ unsigned long *index)
+{
+ if (dev)
+ netdev_unlock(dev);
+
+ do {
+ rcu_read_lock();
+ dev = xa_find(&net->dev_by_index, index, ULONG_MAX, XA_PRESENT);
+ if (!dev) {
+ rcu_read_unlock();
+ return NULL;
+ }
+ dev_hold(dev);
+ rcu_read_unlock();
+
+ dev = __netdev_put_lock(dev);
+ if (dev)
+ return dev;
+
+ (*index)++;
+ } while (true);
+}
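
netdev_xa_find_lock() unlocks the device passed in and returns the next
registered device at or after *index, instance-locked. The caller owns
@index and must advance it between iterations. A sketch, assuming it
matches how netlink dump loops drive this helper:

	unsigned long index = 0;
	struct net_device *dev = NULL;

	while ((dev = netdev_xa_find_lock(net, dev, &index))) {
		/* dev is NETREG_REGISTERED and instance-locked here */
		index++;	/* advance past this ifindex */
	}
	/* normal exit holds no lock; break early => netdev_unlock(dev) */
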
static DEFINE_SEQLOCK(netdev_rename_lock);
@@ -1012,6 +1137,12 @@ out:
return ret;
}
+static bool dev_addr_cmp(struct net_device *dev, unsigned short type,
+ const char *ha)
+{
+ return dev->type == type && !memcmp(dev->dev_addr, ha, dev->addr_len);
+}
+
/**
* dev_getbyhwaddr_rcu - find a device by its hardware address
* @net: the applicable net namespace
@@ -1020,7 +1151,7 @@ out:
*
* Search for an interface by MAC address. Returns NULL if the device
* is not found or a pointer to the device.
- * The caller must hold RCU or RTNL.
+ * The caller must hold RCU.
* The returned device has not had its ref count increased
* and the caller must therefore be careful about locking
*
@@ -1032,14 +1163,39 @@ struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
struct net_device *dev;
for_each_netdev_rcu(net, dev)
- if (dev->type == type &&
- !memcmp(dev->dev_addr, ha, dev->addr_len))
+ if (dev_addr_cmp(dev, type, ha))
return dev;
return NULL;
}
EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
+/**
+ * dev_getbyhwaddr() - find a device by its hardware address
+ * @net: the applicable net namespace
+ * @type: media type of device
+ * @ha: hardware address
+ *
+ * Similar to dev_getbyhwaddr_rcu(), but the owner needs to hold
+ * rtnl_lock.
+ *
+ * Context: rtnl_lock() must be held.
+ * Return: pointer to the net_device, or NULL if not found
+ */
+struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type,
+ const char *ha)
+{
+ struct net_device *dev;
+
+ ASSERT_RTNL();
+ for_each_netdev(net, dev)
+ if (dev_addr_cmp(dev, type, ha))
+ return dev;
+
+ return NULL;
+}
+EXPORT_SYMBOL(dev_getbyhwaddr);
+
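
The RCU variant returns an unreferenced pointer that is only valid
inside the read-side section, while the new dev_getbyhwaddr() relies on
RTNL to pin the device. A usage sketch contrasting the two
(ARPHRD_ETHER chosen purely for illustration):

	rcu_read_lock();
	dev = dev_getbyhwaddr_rcu(net, ARPHRD_ETHER, ha);
	/* dev, if found, is valid only until rcu_read_unlock() */
	rcu_read_unlock();

	rtnl_lock();
	dev = dev_getbyhwaddr(net, ARPHRD_ETHER, ha);
	/* dev stays pinned for as long as RTNL is held */
	rtnl_unlock();
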
struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
struct net_device *dev, *ret = NULL;
@@ -1230,33 +1386,18 @@ static int dev_get_valid_name(struct net *net, struct net_device *dev,
return ret < 0 ? ret : 0;
}
-/**
- * dev_change_name - change name of a device
- * @dev: device
- * @newname: name (or format string) must be at least IFNAMSIZ
- *
- * Change name of a device, can pass format strings "eth%d".
- * for wildcarding.
- */
-int dev_change_name(struct net_device *dev, const char *newname)
+int netif_change_name(struct net_device *dev, const char *newname)
{
+ struct net *net = dev_net(dev);
unsigned char old_assign_type;
char oldname[IFNAMSIZ];
int err = 0;
int ret;
- struct net *net;
-
- ASSERT_RTNL();
- BUG_ON(!dev_net(dev));
-
- net = dev_net(dev);
- down_write(&devnet_rename_sem);
+ ASSERT_RTNL_NET(net);
- if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
- up_write(&devnet_rename_sem);
+ if (!strncmp(newname, dev->name, IFNAMSIZ))
return 0;
- }
memcpy(oldname, dev->name, IFNAMSIZ);
@@ -1264,10 +1405,8 @@ int dev_change_name(struct net_device *dev, const char *newname)
err = dev_get_valid_name(net, dev, newname);
write_sequnlock_bh(&netdev_rename_lock);
- if (err < 0) {
- up_write(&devnet_rename_sem);
+ if (err < 0)
return err;
- }
if (oldname[0] && !strchr(oldname, '%'))
netdev_info(dev, "renamed from %s%s\n", oldname,
@@ -1279,14 +1418,13 @@ int dev_change_name(struct net_device *dev, const char *newname)
rollback:
ret = device_rename(&dev->dev, dev->name);
if (ret) {
+ write_seqlock_bh(&netdev_rename_lock);
memcpy(dev->name, oldname, IFNAMSIZ);
+ write_sequnlock_bh(&netdev_rename_lock);
WRITE_ONCE(dev->name_assign_type, old_assign_type);
- up_write(&devnet_rename_sem);
return ret;
}
- up_write(&devnet_rename_sem);
-
netdev_adjacent_rename_links(dev, oldname);
netdev_name_node_del(dev->name_node);
@@ -1302,7 +1440,6 @@ rollback:
/* err >= 0 after dev_alloc_name() or stores the first errno */
if (err >= 0) {
err = ret;
- down_write(&devnet_rename_sem);
write_seqlock_bh(&netdev_rename_lock);
memcpy(dev->name, oldname, IFNAMSIZ);
write_sequnlock_bh(&netdev_rename_lock);
@@ -1319,15 +1456,7 @@ rollback:
return err;
}
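
With devnet_rename_sem gone, writers serialize on the per-netns RTNL
and publish name updates under netdev_rename_lock, so lockless readers
snapshot dev->name with the usual seqlock retry loop. A reader-side
sketch, assuming it matches netdev_copy_name() elsewhere in this file:

	unsigned int seq;

	do {
		seq = read_seqbegin(&netdev_rename_lock);
		strscpy(name, dev->name, IFNAMSIZ);
	} while (read_seqretry(&netdev_rename_lock, seq));
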
-/**
- * dev_set_alias - change ifalias of a device
- * @dev: device
- * @alias: name up to IFALIASZ
- * @len: limit of bytes to copy from info
- *
- * Set ifalias for a device,
- */
-int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
+int netif_set_alias(struct net_device *dev, const char *alias, size_t len)
{
struct dev_ifalias *new_alias = NULL;
@@ -1353,7 +1482,6 @@ int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
return len;
}
-EXPORT_SYMBOL(dev_set_alias);
/**
* dev_get_alias - get ifalias of a device
@@ -1499,6 +1627,8 @@ static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
set_bit(__LINK_STATE_START, &dev->state);
+ netdev_ops_assert_locked(dev);
+
if (ops->ndo_validate_addr)
ret = ops->ndo_validate_addr(dev);
@@ -1510,7 +1640,7 @@ static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
if (ret)
clear_bit(__LINK_STATE_START, &dev->state);
else {
- dev->flags |= IFF_UP;
+ netif_set_up(dev, true);
dev_set_rx_mode(dev);
dev_activate(dev);
add_device_randomness(dev->dev_addr, dev->addr_len);
@@ -1519,20 +1649,7 @@ static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
return ret;
}
-/**
- * dev_open - prepare an interface for use.
- * @dev: device to open
- * @extack: netlink extended ack
- *
- * Takes a device from down to up state. The device's private open
- * function is invoked and then the multicast lists are loaded. Finally
- * the device is moved into the up state and a %NETDEV_UP message is
- * sent to the netdev notifier chain.
- *
- * Calling this function on an active interface is a nop. On a failure
- * a negative errno code is returned.
- */
-int dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
+int netif_open(struct net_device *dev, struct netlink_ext_ack *extack)
{
int ret;
@@ -1548,7 +1665,6 @@ int dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
return ret;
}
-EXPORT_SYMBOL(dev_open);
static void __dev_close_many(struct list_head *head)
{
@@ -1586,10 +1702,13 @@ static void __dev_close_many(struct list_head *head)
* We allow it to be called even after a DETACH hot-plug
* event.
*/
+
+ netdev_ops_assert_locked(dev);
+
if (ops->ndo_stop)
ops->ndo_stop(dev);
- dev->flags &= ~IFF_UP;
+ netif_set_up(dev, false);
netpoll_poll_enable(dev);
}
}
@@ -1623,16 +1742,7 @@ void dev_close_many(struct list_head *head, bool unlink)
}
EXPORT_SYMBOL(dev_close_many);
-/**
- * dev_close - shutdown an interface.
- * @dev: device to shutdown
- *
- * This function moves an active device into down state. A
- * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
- * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
- * chain.
- */
-void dev_close(struct net_device *dev)
+void netif_close(struct net_device *dev)
{
if (dev->flags & IFF_UP) {
LIST_HEAD(single);
@@ -1642,18 +1752,9 @@ void dev_close(struct net_device *dev)
list_del(&single);
}
}
-EXPORT_SYMBOL(dev_close);
-
+EXPORT_SYMBOL(netif_close);
-/**
- * dev_disable_lro - disable Large Receive Offload on a device
- * @dev: device
- *
- * Disable Large Receive Offload (LRO) on a net device. Must be
- * called under RTNL. This is needed if received packets may be
- * forwarded to another interface.
- */
-void dev_disable_lro(struct net_device *dev)
+void netif_disable_lro(struct net_device *dev)
{
struct net_device *lower_dev;
struct list_head *iter;
@@ -1664,10 +1765,12 @@ void dev_disable_lro(struct net_device *dev)
if (unlikely(dev->features & NETIF_F_LRO))
netdev_WARN(dev, "failed to disable LRO!\n");
- netdev_for_each_lower_dev(dev, lower_dev, iter)
- dev_disable_lro(lower_dev);
+ netdev_for_each_lower_dev(dev, lower_dev, iter) {
+ netdev_lock_ops(lower_dev);
+ netif_disable_lro(lower_dev);
+ netdev_unlock_ops(lower_dev);
+ }
}
-EXPORT_SYMBOL(dev_disable_lro);
/**
* dev_disable_gro_hw - disable HW Generic Receive Offload on a device
@@ -1799,14 +1902,19 @@ int register_netdevice_notifier(struct notifier_block *nb)
/* Close race with setup_net() and cleanup_net() */
down_write(&pernet_ops_rwsem);
+
+ /* When RTNL is removed, we need protection for netdev_chain. */
rtnl_lock();
+
err = raw_notifier_chain_register(&netdev_chain, nb);
if (err)
goto unlock;
if (dev_boot_phase)
goto unlock;
for_each_net(net) {
+ __rtnl_net_lock(net);
err = call_netdevice_register_net_notifiers(nb, net);
+ __rtnl_net_unlock(net);
if (err)
goto rollback;
}
@@ -1817,8 +1925,11 @@ unlock:
return err;
rollback:
- for_each_net_continue_reverse(net)
+ for_each_net_continue_reverse(net) {
+ __rtnl_net_lock(net);
call_netdevice_unregister_net_notifiers(nb, net);
+ __rtnl_net_unlock(net);
+ }
raw_notifier_chain_unregister(&netdev_chain, nb);
goto unlock;
@@ -1851,8 +1962,11 @@ int unregister_netdevice_notifier(struct notifier_block *nb)
if (err)
goto unlock;
- for_each_net(net)
+ for_each_net(net) {
+ __rtnl_net_lock(net);
call_netdevice_unregister_net_notifiers(nb, net);
+ __rtnl_net_unlock(net);
+ }
unlock:
rtnl_unlock();
@@ -1916,9 +2030,10 @@ int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb)
{
int err;
- rtnl_lock();
+ rtnl_net_lock(net);
err = __register_netdevice_notifier_net(net, nb, false);
- rtnl_unlock();
+ rtnl_net_unlock(net);
+
return err;
}
EXPORT_SYMBOL(register_netdevice_notifier_net);
@@ -1944,9 +2059,10 @@ int unregister_netdevice_notifier_net(struct net *net,
{
int err;
- rtnl_lock();
+ rtnl_net_lock(net);
err = __unregister_netdevice_notifier_net(net, nb);
- rtnl_unlock();
+ rtnl_net_unlock(net);
+
return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier_net);
@@ -1959,19 +2075,56 @@ static void __move_netdevice_notifier_net(struct net *src_net,
__register_netdevice_notifier_net(dst_net, nb, true);
}
+static void rtnl_net_dev_lock(struct net_device *dev)
+{
+ bool again;
+
+ do {
+ struct net *net;
+
+ again = false;
+
+ /* netns might be being dismantled. */
+ rcu_read_lock();
+ net = dev_net_rcu(dev);
+ net_passive_inc(net);
+ rcu_read_unlock();
+
+ rtnl_net_lock(net);
+
+#ifdef CONFIG_NET_NS
+ /* dev might have been moved to another netns. */
+ if (!net_eq(net, rcu_access_pointer(dev->nd_net.net))) {
+ rtnl_net_unlock(net);
+ net_passive_dec(net);
+ again = true;
+ }
+#endif
+ } while (again);
+}
+
+static void rtnl_net_dev_unlock(struct net_device *dev)
+{
+ struct net *net = dev_net(dev);
+
+ rtnl_net_unlock(net);
+ net_passive_dec(net);
+}
+
int register_netdevice_notifier_dev_net(struct net_device *dev,
struct notifier_block *nb,
struct netdev_net_notifier *nn)
{
int err;
- rtnl_lock();
+ rtnl_net_dev_lock(dev);
err = __register_netdevice_notifier_net(dev_net(dev), nb, false);
if (!err) {
nn->nb = nb;
list_add(&nn->list, &dev->net_notifier_list);
}
- rtnl_unlock();
+ rtnl_net_dev_unlock(dev);
+
return err;
}
EXPORT_SYMBOL(register_netdevice_notifier_dev_net);
@@ -1982,10 +2135,11 @@ int unregister_netdevice_notifier_dev_net(struct net_device *dev,
{
int err;
- rtnl_lock();
+ rtnl_net_dev_lock(dev);
list_del(&nn->list);
err = __unregister_netdevice_notifier_net(dev_net(dev), nb);
- rtnl_unlock();
+ rtnl_net_dev_unlock(dev);
+
return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net);
@@ -2134,8 +2288,8 @@ EXPORT_SYMBOL_GPL(net_dec_egress_queue);
#endif
#ifdef CONFIG_NET_CLS_ACT
-DEFINE_STATIC_KEY_FALSE(tcf_bypass_check_needed_key);
-EXPORT_SYMBOL(tcf_bypass_check_needed_key);
+DEFINE_STATIC_KEY_FALSE(tcf_sw_enabled_key);
+EXPORT_SYMBOL(tcf_sw_enabled_key);
#endif
DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
@@ -2301,16 +2455,21 @@ static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
}
/**
- * dev_nit_active - return true if any network interface taps are in use
+ * dev_nit_active_rcu - return true if any network interface taps are in use
+ *
+ * The caller must hold the RCU lock.
*
* @dev: network device to check for the presence of taps
*/
-bool dev_nit_active(struct net_device *dev)
+bool dev_nit_active_rcu(const struct net_device *dev)
{
- return !list_empty(&net_hotdata.ptype_all) ||
+ /* Callers may hold either RCU or RCU BH lock */
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
+
+ return !list_empty(&dev_net(dev)->ptype_all) ||
!list_empty(&dev->ptype_all);
}
-EXPORT_SYMBOL_GPL(dev_nit_active);
+EXPORT_SYMBOL_GPL(dev_nit_active_rcu);
/*
* Support routine. Sends outgoing frames to any network
@@ -2319,11 +2478,12 @@ EXPORT_SYMBOL_GPL(dev_nit_active);
void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
- struct list_head *ptype_list = &net_hotdata.ptype_all;
struct packet_type *ptype, *pt_prev = NULL;
+ struct list_head *ptype_list;
struct sk_buff *skb2 = NULL;
rcu_read_lock();
+ ptype_list = &dev_net_rcu(dev)->ptype_all;
again:
list_for_each_entry_rcu(ptype, ptype_list, list) {
if (READ_ONCE(ptype->ignore_outgoing))
@@ -2367,7 +2527,7 @@ again:
pt_prev = ptype;
}
- if (ptype_list == &net_hotdata.ptype_all) {
+ if (ptype_list != &dev->ptype_all) {
ptype_list = &dev->ptype_all;
goto again;
}
@@ -2970,6 +3130,7 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
if (dev->reg_state == NETREG_REGISTERED ||
dev->reg_state == NETREG_UNREGISTERING) {
ASSERT_RTNL();
+ netdev_ops_assert_locked(dev);
rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
txq);
@@ -3000,7 +3161,6 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
}
EXPORT_SYMBOL(netif_set_real_num_tx_queues);
-#ifdef CONFIG_SYSFS
/**
* netif_set_real_num_rx_queues - set actual number of RX queues used
* @dev: Network device
@@ -3020,6 +3180,7 @@ int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
if (dev->reg_state == NETREG_REGISTERED) {
ASSERT_RTNL();
+ netdev_ops_assert_locked(dev);
rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
rxq);
@@ -3031,7 +3192,6 @@ int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
return 0;
}
EXPORT_SYMBOL(netif_set_real_num_rx_queues);
-#endif
/**
* netif_set_real_num_queues - set actual number of RX and TX queues used
@@ -3263,7 +3423,7 @@ void netif_device_attach(struct net_device *dev)
if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
netif_running(dev)) {
netif_tx_wake_all_queues(dev);
- __netdev_watchdog_up(dev);
+ netdev_watchdog_up(dev);
}
}
EXPORT_SYMBOL(netif_device_attach);
@@ -3612,7 +3772,7 @@ static int xmit_one(struct sk_buff *skb, struct net_device *dev,
unsigned int len;
int rc;
- if (dev_nit_active(dev))
+ if (dev_nit_active_rcu(dev))
dev_queue_xmit_nit(skb, dev);
len = skb->len;
@@ -3692,6 +3852,9 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device
{
netdev_features_t features;
+ if (!skb_frags_readable(skb))
+ goto out_kfree_skb;
+
features = netif_skb_features(skb);
skb = validate_xmit_vlan(skb, features);
if (unlikely(!skb))
@@ -4030,10 +4193,13 @@ static int tc_run(struct tcx_entry *entry, struct sk_buff *skb,
if (!miniq)
return ret;
- if (static_branch_unlikely(&tcf_bypass_check_needed_key)) {
- if (tcf_block_bypass_sw(miniq->block))
- return ret;
- }
+ /* Global bypass */
+ if (!static_branch_likely(&tcf_sw_enabled_key))
+ return ret;
+
+ /* Block-wise bypass */
+ if (tcf_block_bypass_sw(miniq->block))
+ return ret;
tc_skb_cb(skb)->mru = 0;
tc_skb_cb(skb)->post_ct = false;
@@ -4382,7 +4548,8 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
skb_reset_mac_header(skb);
skb_assert_len(skb);
- if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
+ if (unlikely(skb_shinfo(skb)->tx_flags &
+ (SKBTX_SCHED_TSTAMP | SKBTX_BPF)))
__skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED);
/* Disable soft irqs for various locks below. Also
@@ -4574,7 +4741,7 @@ use_local_napi:
* we have to raise NET_RX_SOFTIRQ.
*/
if (!sd->in_net_rx_action)
- __raise_softirq_irqoff(NET_RX_SOFTIRQ);
+ raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
#ifdef CONFIG_RPS
@@ -4584,6 +4751,11 @@ EXPORT_SYMBOL(rps_needed);
struct static_key_false rfs_needed __read_mostly;
EXPORT_SYMBOL(rfs_needed);
+static u32 rfs_slot(u32 hash, const struct rps_dev_flow_table *flow_table)
+{
+ return hash_32(hash, flow_table->log);
+}
+
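
rfs_slot() replaces mask-based indexing: hash_32() multiplies by
GOLDEN_RATIO_32 and keeps the top @log bits, so slot selection no
longer depends only on the low-order hash bits. A worked comparison for
a 512-entry table (log == 9):

	u32 hash = skb_get_hash(skb);

	/* before: slot = hash & 511, i.e. the low 9 bits verbatim */
	/* after: the top 9 bits of (hash * GOLDEN_RATIO_32) */
	u32 slot = rfs_slot(hash, flow_table);
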
static struct rps_dev_flow *
set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
struct rps_dev_flow *rflow, u16 next_cpu)
@@ -4610,7 +4782,7 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
flow_table = rcu_dereference(rxqueue->rps_flow_table);
if (!flow_table)
goto out;
- flow_id = skb_get_hash(skb) & flow_table->mask;
+ flow_id = rfs_slot(skb_get_hash(skb), flow_table);
rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
rxq_index, flow_id);
if (rc < 0)
@@ -4689,7 +4861,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
/* OK, now we know there is a match,
* we can look at the local (per receive queue) flow table
*/
- rflow = &flow_table->flows[hash & flow_table->mask];
+ rflow = &flow_table->flows[rfs_slot(hash, flow_table)];
tcpu = rflow->cpu;
/*
@@ -4756,13 +4928,13 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
rcu_read_lock();
flow_table = rcu_dereference(rxqueue->rps_flow_table);
- if (flow_table && flow_id <= flow_table->mask) {
+ if (flow_table && flow_id < (1UL << flow_table->log)) {
rflow = &flow_table->flows[flow_id];
cpu = READ_ONCE(rflow->cpu);
if (READ_ONCE(rflow->filter) == filter_id && cpu < nr_cpu_ids &&
((int)(READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head) -
READ_ONCE(rflow->last_qtail)) <
- (int)(10 * flow_table->mask)))
+ (int)(10 << flow_table->log)))
expire = false;
}
rcu_read_unlock();
@@ -4963,7 +5135,7 @@ static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
}
u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
- struct bpf_prog *xdp_prog)
+ const struct bpf_prog *xdp_prog)
{
void *orig_data, *orig_data_end, *hard_start;
struct netdev_rx_queue *rxqueue;
@@ -5065,7 +5237,7 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
}
static int
-netif_skb_check_for_xdp(struct sk_buff **pskb, struct bpf_prog *prog)
+netif_skb_check_for_xdp(struct sk_buff **pskb, const struct bpf_prog *prog)
{
struct sk_buff *skb = *pskb;
int err, hroom, troom;
@@ -5089,7 +5261,7 @@ netif_skb_check_for_xdp(struct sk_buff **pskb, struct bpf_prog *prog)
static u32 netif_receive_generic_xdp(struct sk_buff **pskb,
struct xdp_buff *xdp,
- struct bpf_prog *xdp_prog)
+ const struct bpf_prog *xdp_prog)
{
struct sk_buff *skb = *pskb;
u32 mac_len, act = XDP_DROP;
@@ -5142,7 +5314,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff **pskb,
* and DDOS attacks will be more effective. In-driver-XDP use dedicated TX
* queues, so they do not have this starvation issue.
*/
-void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
+void generic_xdp_tx(struct sk_buff *skb, const struct bpf_prog *xdp_prog)
{
struct net_device *dev = skb->dev;
struct netdev_queue *txq;
@@ -5167,7 +5339,7 @@ void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
-int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff **pskb)
+int do_xdp_generic(const struct bpf_prog *xdp_prog, struct sk_buff **pskb)
{
struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
@@ -5506,8 +5678,14 @@ static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
orig_dev = skb->dev;
skb_reset_network_header(skb);
+#if !defined(CONFIG_DEBUG_NET)
+ /* We plan to no longer reset the transport header here.
+	 * Give some time to fuzzers and dev builds to catch bugs
+ * in network stacks.
+ */
if (!skb_transport_header_was_set(skb))
skb_reset_transport_header(skb);
+#endif
skb_reset_mac_len(skb);
pt_prev = NULL;
@@ -5543,7 +5721,8 @@ another_round:
if (pfmemalloc)
goto skip_taps;
- list_for_each_entry_rcu(ptype, &net_hotdata.ptype_all, list) {
+ list_for_each_entry_rcu(ptype, &dev_net_rcu(skb->dev)->ptype_all,
+ list) {
if (pt_prev)
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
@@ -5655,6 +5834,14 @@ check_vlan_id:
deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
&ptype_base[ntohs(type) &
PTYPE_HASH_MASK]);
+
+ /* orig_dev and skb->dev could belong to different netns;
+	 * even in such a case we need to traverse only the list
+ * coming from skb->dev, as the ptype owner (packet socket)
+ * will use dev_net(skb->dev) to do namespace filtering.
+ */
+ deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
+ &dev_net_rcu(skb->dev)->ptype_specific);
}
deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
@@ -5865,7 +6052,7 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
static_branch_dec(&generic_xdp_needed_key);
} else if (new && !old) {
static_branch_inc(&generic_xdp_needed_key);
- dev_disable_lro(dev);
+ netif_disable_lro(dev);
dev_disable_gro_hw(dev);
}
break;
@@ -5991,22 +6178,22 @@ void netif_receive_skb_list(struct list_head *head)
}
EXPORT_SYMBOL(netif_receive_skb_list);
-static DEFINE_PER_CPU(struct work_struct, flush_works);
-
/* Network device is going away, flush any packets still pending */
static void flush_backlog(struct work_struct *work)
{
struct sk_buff *skb, *tmp;
+ struct sk_buff_head list;
struct softnet_data *sd;
+ __skb_queue_head_init(&list);
local_bh_disable();
sd = this_cpu_ptr(&softnet_data);
backlog_lock_irq_disable(sd);
skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
- if (skb->dev->reg_state == NETREG_UNREGISTERING) {
+ if (READ_ONCE(skb->dev->reg_state) == NETREG_UNREGISTERING) {
__skb_unlink(skb, &sd->input_pkt_queue);
- dev_kfree_skb_irq(skb);
+ __skb_queue_tail(&list, skb);
rps_input_queue_head_incr(sd);
}
}
@@ -6014,14 +6201,16 @@ static void flush_backlog(struct work_struct *work)
local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
- if (skb->dev->reg_state == NETREG_UNREGISTERING) {
+ if (READ_ONCE(skb->dev->reg_state) == NETREG_UNREGISTERING) {
__skb_unlink(skb, &sd->process_queue);
- kfree_skb(skb);
+ __skb_queue_tail(&list, skb);
rps_input_queue_head_incr(sd);
}
}
local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
local_bh_enable();
+
+ __skb_queue_purge_reason(&list, SKB_DROP_REASON_DEV_READY);
}
static bool flush_required(int cpu)
@@ -6049,36 +6238,54 @@ static bool flush_required(int cpu)
return true;
}
+struct flush_backlogs {
+ cpumask_t flush_cpus;
+ struct work_struct w[];
+};
+
+static struct flush_backlogs *flush_backlogs_alloc(void)
+{
+ return kmalloc(struct_size_t(struct flush_backlogs, w, nr_cpu_ids),
+ GFP_KERNEL);
+}
+
+static struct flush_backlogs *flush_backlogs_fallback;
+static DEFINE_MUTEX(flush_backlogs_mutex);
+
static void flush_all_backlogs(void)
{
- static cpumask_t flush_cpus;
+ struct flush_backlogs *ptr = flush_backlogs_alloc();
unsigned int cpu;
- /* since we are under rtnl lock protection we can use static data
- * for the cpumask and avoid allocating on stack the possibly
- * large mask
- */
- ASSERT_RTNL();
+ if (!ptr) {
+ mutex_lock(&flush_backlogs_mutex);
+ ptr = flush_backlogs_fallback;
+ }
+ cpumask_clear(&ptr->flush_cpus);
cpus_read_lock();
- cpumask_clear(&flush_cpus);
for_each_online_cpu(cpu) {
if (flush_required(cpu)) {
- queue_work_on(cpu, system_highpri_wq,
- per_cpu_ptr(&flush_works, cpu));
- cpumask_set_cpu(cpu, &flush_cpus);
+ INIT_WORK(&ptr->w[cpu], flush_backlog);
+ queue_work_on(cpu, system_highpri_wq, &ptr->w[cpu]);
+ __cpumask_set_cpu(cpu, &ptr->flush_cpus);
}
}
/* we can have in flight packet[s] on the cpus we are not flushing,
* synchronize_net() in unregister_netdevice_many() will take care of
- * them
+ * them.
*/
- for_each_cpu(cpu, &flush_cpus)
- flush_work(per_cpu_ptr(&flush_works, cpu));
+ for_each_cpu(cpu, &ptr->flush_cpus)
+ flush_work(&ptr->w[cpu]);
cpus_read_unlock();
+
+ if (ptr != flush_backlogs_fallback)
+ kfree(ptr);
+ else
+ mutex_unlock(&flush_backlogs_mutex);
}
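
flush_all_backlogs() now prefers a heap-allocated work array and falls
back to a preallocated one, serialized by flush_backlogs_mutex, when
kmalloc() fails. The fallback must exist before the first device
unregister; a boot-time sketch, assuming it mirrors net_dev_init():

	flush_backlogs_fallback = flush_backlogs_alloc();
	if (!flush_backlogs_fallback)
		panic("%s: Failed to allocate flush_backlogs_fallback\n",
		      __func__);
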
static void net_rps_send_ipi(struct softnet_data *remsd)
@@ -6267,7 +6474,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
return false;
if (work_done) {
- if (n->gro_bitmask)
+ if (n->gro.bitmask)
timeout = napi_get_gro_flush_timeout(n);
n->defer_hard_irqs_count = napi_get_defer_hard_irqs(n);
}
@@ -6277,15 +6484,14 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
if (timeout)
ret = false;
}
- if (n->gro_bitmask) {
- /* When the NAPI instance uses a timeout and keeps postponing
- * it, we need to bound somehow the time packets are kept in
- * the GRO layer
- */
- napi_gro_flush(n, !!timeout);
- }
- gro_normal_list(n);
+ /*
+ * When the NAPI instance uses a timeout and keeps postponing
+ * it, we need to bound somehow the time packets are kept in
+ * the GRO layer.
+ */
+ gro_flush(&n->gro, !!timeout);
+ gro_normal_list(&n->gro);
if (unlikely(!list_empty(&n->poll_list))) {
/* If n->poll_list is not empty, we need to mask irqs */
@@ -6349,19 +6555,15 @@ static void skb_defer_free_flush(struct softnet_data *sd)
static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule)
{
if (!skip_schedule) {
- gro_normal_list(napi);
+ gro_normal_list(&napi->gro);
__napi_schedule(napi);
return;
}
- if (napi->gro_bitmask) {
- /* flush too old packets
- * If HZ < 1000, flush all packets.
- */
- napi_gro_flush(napi, HZ >= 1000);
- }
+ /* Flush too old packets. If HZ < 1000, flush all packets */
+ gro_flush(&napi->gro, HZ >= 1000);
+ gro_normal_list(&napi->gro);
- gro_normal_list(napi);
clear_bit(NAPI_STATE_SCHED, &napi->state);
}
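
The gro_bitmask/gro_hash/rx_list state formerly embedded in napi_struct
now sits behind napi->gro, and the helpers take the gro_node rather
than the NAPI. The shape implied by this diff is roughly the following
(a sketch inferred from the fields used here, not the authoritative
definition):

	struct gro_node {
		unsigned long	bitmask;	/* was napi->gro_bitmask */
		struct gro_list	hash[GRO_HASH_BUCKETS];
		struct list_head rx_list;	/* was napi->rx_list */
		u32		rx_count;	/* was napi->rx_count */
		u32		cached_napi_id;
	};
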
@@ -6468,7 +6670,7 @@ restart:
}
work = napi_poll(napi, budget);
trace_napi_poll(napi, work, budget);
- gro_normal_list(napi);
+ gro_normal_list(&napi->gro);
count:
if (work > 0)
__NET_ADD_STATS(dev_net(napi->dev),
@@ -6568,7 +6770,9 @@ void napi_resume_irqs(unsigned int napi_id)
static void __napi_hash_add_with_id(struct napi_struct *napi,
unsigned int napi_id)
{
- napi->napi_id = napi_id;
+ napi->gro.cached_napi_id = napi_id;
+
+ WRITE_ONCE(napi->napi_id, napi_id);
hlist_add_head_rcu(&napi->napi_hash_node,
&napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
}
@@ -6595,7 +6799,7 @@ static void napi_hash_add(struct napi_struct *napi)
/* 0..NR_CPUS range is reserved for sender_cpu use */
do {
- if (unlikely(++napi_gen_id < MIN_NAPI_ID))
+ if (unlikely(!napi_id_valid(++napi_gen_id)))
napi_gen_id = MIN_NAPI_ID;
} while (napi_by_id(napi_gen_id));
@@ -6636,22 +6840,13 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
-static void init_gro_hash(struct napi_struct *napi)
-{
- int i;
-
- for (i = 0; i < GRO_HASH_BUCKETS; i++) {
- INIT_LIST_HEAD(&napi->gro_hash[i].list);
- napi->gro_hash[i].count = 0;
- }
- napi->gro_bitmask = 0;
-}
-
int dev_set_threaded(struct net_device *dev, bool threaded)
{
struct napi_struct *napi;
int err = 0;
+ netdev_assert_locked_or_invisible(dev);
+
if (dev->threaded == threaded)
return 0;
@@ -6706,8 +6901,7 @@ void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index,
if (WARN_ON_ONCE(napi && !napi->dev))
return;
- if (dev->reg_state >= NETREG_REGISTERED)
- ASSERT_RTNL();
+ netdev_ops_assert_locked_or_invisible(dev);
switch (type) {
case NETDEV_QUEUE_TYPE_RX:
@@ -6724,19 +6918,184 @@ void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index,
}
EXPORT_SYMBOL(netif_queue_set_napi);
+static void
+netif_napi_irq_notify(struct irq_affinity_notify *notify,
+ const cpumask_t *mask)
+{
+ struct napi_struct *napi =
+ container_of(notify, struct napi_struct, notify);
+#ifdef CONFIG_RFS_ACCEL
+ struct cpu_rmap *rmap = napi->dev->rx_cpu_rmap;
+ int err;
+#endif
+
+ if (napi->config && napi->dev->irq_affinity_auto)
+ cpumask_copy(&napi->config->affinity_mask, mask);
+
+#ifdef CONFIG_RFS_ACCEL
+ if (napi->dev->rx_cpu_rmap_auto) {
+ err = cpu_rmap_update(rmap, napi->napi_rmap_idx, mask);
+ if (err)
+ netdev_warn(napi->dev, "RMAP update failed (%d)\n",
+ err);
+ }
+#endif
+}
+
+#ifdef CONFIG_RFS_ACCEL
+static void netif_napi_affinity_release(struct kref *ref)
+{
+ struct napi_struct *napi =
+ container_of(ref, struct napi_struct, notify.kref);
+ struct cpu_rmap *rmap = napi->dev->rx_cpu_rmap;
+
+ netdev_assert_locked(napi->dev);
+ WARN_ON(test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER,
+ &napi->state));
+
+ if (!napi->dev->rx_cpu_rmap_auto)
+ return;
+ rmap->obj[napi->napi_rmap_idx] = NULL;
+ napi->napi_rmap_idx = -1;
+ cpu_rmap_put(rmap);
+}
+
+int netif_enable_cpu_rmap(struct net_device *dev, unsigned int num_irqs)
+{
+ if (dev->rx_cpu_rmap_auto)
+ return 0;
+
+ dev->rx_cpu_rmap = alloc_irq_cpu_rmap(num_irqs);
+ if (!dev->rx_cpu_rmap)
+ return -ENOMEM;
+
+ dev->rx_cpu_rmap_auto = true;
+ return 0;
+}
+EXPORT_SYMBOL(netif_enable_cpu_rmap);
+
+static void netif_del_cpu_rmap(struct net_device *dev)
+{
+ struct cpu_rmap *rmap = dev->rx_cpu_rmap;
+
+ if (!dev->rx_cpu_rmap_auto)
+ return;
+
+ /* Free the rmap */
+ cpu_rmap_put(rmap);
+ dev->rx_cpu_rmap = NULL;
+ dev->rx_cpu_rmap_auto = false;
+}
+
+#else
+static void netif_napi_affinity_release(struct kref *ref)
+{
+}
+
+int netif_enable_cpu_rmap(struct net_device *dev, unsigned int num_irqs)
+{
+ return 0;
+}
+EXPORT_SYMBOL(netif_enable_cpu_rmap);
+
+static void netif_del_cpu_rmap(struct net_device *dev)
+{
+}
+#endif
+
+void netif_set_affinity_auto(struct net_device *dev)
+{
+ unsigned int i, maxqs, numa;
+
+ maxqs = max(dev->num_tx_queues, dev->num_rx_queues);
+ numa = dev_to_node(&dev->dev);
+
+ for (i = 0; i < maxqs; i++)
+ cpumask_set_cpu(cpumask_local_spread(i, numa),
+ &dev->napi_config[i].affinity_mask);
+
+ dev->irq_affinity_auto = true;
+}
+EXPORT_SYMBOL(netif_set_affinity_auto);
+
+void netif_napi_set_irq_locked(struct napi_struct *napi, int irq)
+{
+ int rc;
+
+ netdev_assert_locked_or_invisible(napi->dev);
+
+ if (napi->irq == irq)
+ return;
+
+ /* Remove existing resources */
+ if (test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state))
+ irq_set_affinity_notifier(napi->irq, NULL);
+
+ napi->irq = irq;
+ if (irq < 0 ||
+ (!napi->dev->rx_cpu_rmap_auto && !napi->dev->irq_affinity_auto))
+ return;
+
+ /* Abort for buggy drivers */
+ if (napi->dev->irq_affinity_auto && WARN_ON_ONCE(!napi->config))
+ return;
+
+#ifdef CONFIG_RFS_ACCEL
+ if (napi->dev->rx_cpu_rmap_auto) {
+ rc = cpu_rmap_add(napi->dev->rx_cpu_rmap, napi);
+ if (rc < 0)
+ return;
+
+ cpu_rmap_get(napi->dev->rx_cpu_rmap);
+ napi->napi_rmap_idx = rc;
+ }
+#endif
+
+ /* Use core IRQ notifier */
+ napi->notify.notify = netif_napi_irq_notify;
+ napi->notify.release = netif_napi_affinity_release;
+ rc = irq_set_affinity_notifier(irq, &napi->notify);
+ if (rc) {
+ netdev_warn(napi->dev, "Unable to set IRQ notifier (%d)\n",
+ rc);
+ goto put_rmap;
+ }
+
+ set_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state);
+ return;
+
+put_rmap:
+#ifdef CONFIG_RFS_ACCEL
+ if (napi->dev->rx_cpu_rmap_auto) {
+ napi->dev->rx_cpu_rmap->obj[napi->napi_rmap_idx] = NULL;
+ cpu_rmap_put(napi->dev->rx_cpu_rmap);
+ napi->napi_rmap_idx = -1;
+ }
+#endif
+ napi->notify.notify = NULL;
+ napi->notify.release = NULL;
+}
+EXPORT_SYMBOL(netif_napi_set_irq_locked);
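
Taken together, netif_enable_cpu_rmap(), netif_set_affinity_auto() and
netif_napi_set_irq_locked() let the core own the ARFS rmap and IRQ
affinity bookkeeping that drivers used to hand-roll. A hedged
probe-path sketch (priv/pdev are illustrative, and netif_napi_set_irq()
is assumed to be the self-locking wrapper around the function above):

	err = netif_enable_cpu_rmap(dev, num_irqs);
	if (err)
		return err;
	netif_set_affinity_auto(dev);

	for (i = 0; i < num_irqs; i++)
		netif_napi_set_irq(&priv->napi[i],
				   pci_irq_vector(pdev, i));
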
+
static void napi_restore_config(struct napi_struct *n)
{
n->defer_hard_irqs = n->config->defer_hard_irqs;
n->gro_flush_timeout = n->config->gro_flush_timeout;
n->irq_suspend_timeout = n->config->irq_suspend_timeout;
+
+ if (n->dev->irq_affinity_auto &&
+ test_bit(NAPI_STATE_HAS_NOTIFIER, &n->state))
+ irq_set_affinity(n->irq, &n->config->affinity_mask);
+
/* a NAPI ID might be stored in the config, if so use it. if not, use
- * napi_hash_add to generate one for us. It will be saved to the config
- * in napi_disable.
+ * napi_hash_add to generate one for us.
*/
- if (n->config->napi_id)
+ if (n->config->napi_id) {
napi_hash_add_with_id(n, n->config->napi_id);
- else
+ } else {
napi_hash_add(n);
+ n->config->napi_id = n->napi_id;
+ }
}
static void napi_save_config(struct napi_struct *n)
@@ -6744,24 +7103,70 @@ static void napi_save_config(struct napi_struct *n)
n->config->defer_hard_irqs = n->defer_hard_irqs;
n->config->gro_flush_timeout = n->gro_flush_timeout;
n->config->irq_suspend_timeout = n->irq_suspend_timeout;
- n->config->napi_id = n->napi_id;
napi_hash_del(n);
}
-void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
- int (*poll)(struct napi_struct *, int), int weight)
+/* Netlink wants the NAPI list to be sorted by ID. If adding a NAPI which
+ * will inherit an existing ID, try to insert it at the right position.
+ */
+static void
+netif_napi_dev_list_add(struct net_device *dev, struct napi_struct *napi)
+{
+ unsigned int new_id, pos_id;
+ struct list_head *higher;
+ struct napi_struct *pos;
+
+ new_id = UINT_MAX;
+ if (napi->config && napi->config->napi_id)
+ new_id = napi->config->napi_id;
+
+ higher = &dev->napi_list;
+ list_for_each_entry(pos, &dev->napi_list, dev_list) {
+ if (napi_id_valid(pos->napi_id))
+ pos_id = pos->napi_id;
+ else if (pos->config)
+ pos_id = pos->config->napi_id;
+ else
+ pos_id = UINT_MAX;
+
+ if (pos_id <= new_id)
+ break;
+ higher = &pos->dev_list;
+ }
+ list_add_rcu(&napi->dev_list, higher); /* adds after higher */
+}
+
+/* Double check that napi_get_frags() allocates skbs with
+ * skb->head being backed by slab, not a page fragment.
+ * This is to make sure the bug fixed in commit 3226b158e67c
+ * ("net: avoid 32 x truesize under-estimation for tiny skbs")
+ * does not accidentally come back.
+ */
+static void napi_get_frags_check(struct napi_struct *napi)
{
+ struct sk_buff *skb;
+
+ local_bh_disable();
+ skb = napi_get_frags(napi);
+ WARN_ON_ONCE(skb && skb->head_frag);
+ napi_free_frags(napi);
+ local_bh_enable();
+}
+
+void netif_napi_add_weight_locked(struct net_device *dev,
+ struct napi_struct *napi,
+ int (*poll)(struct napi_struct *, int),
+ int weight)
+{
+ netdev_assert_locked(dev);
if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state)))
return;
INIT_LIST_HEAD(&napi->poll_list);
INIT_HLIST_NODE(&napi->napi_hash_node);
- hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
- napi->timer.function = napi_watchdog;
- init_gro_hash(napi);
+ hrtimer_setup(&napi->timer, napi_watchdog, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
+ gro_init(&napi->gro);
napi->skb = NULL;
- INIT_LIST_HEAD(&napi->rx_list);
- napi->rx_count = 0;
napi->poll = poll;
if (weight > NAPI_POLL_WEIGHT)
netdev_err_once(dev, "%s() called with weight %d\n", __func__,
@@ -6774,7 +7179,7 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
napi->list_owner = -1;
set_bit(NAPI_STATE_SCHED, &napi->state);
set_bit(NAPI_STATE_NPSVC, &napi->state);
- list_add_rcu(&napi->dev_list, &dev->napi_list);
+ netif_napi_dev_list_add(dev, napi);
/* default settings from sysfs are applied to all NAPIs. any per-NAPI
* configuration will be loaded in napi_enable
@@ -6789,15 +7194,17 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
*/
if (dev->threaded && napi_kthread_create(napi))
dev->threaded = false;
- netif_napi_set_irq(napi, -1);
+ netif_napi_set_irq_locked(napi, -1);
}
-EXPORT_SYMBOL(netif_napi_add_weight);
+EXPORT_SYMBOL(netif_napi_add_weight_locked);
-void napi_disable(struct napi_struct *n)
+void napi_disable_locked(struct napi_struct *n)
{
unsigned long val, new;
might_sleep();
+ netdev_assert_locked(n->dev);
+
set_bit(NAPI_STATE_DISABLE, &n->state);
val = READ_ONCE(n->state);
@@ -6820,16 +7227,25 @@ void napi_disable(struct napi_struct *n)
clear_bit(NAPI_STATE_DISABLE, &n->state);
}
-EXPORT_SYMBOL(napi_disable);
+EXPORT_SYMBOL(napi_disable_locked);
/**
- * napi_enable - enable NAPI scheduling
- * @n: NAPI context
+ * napi_disable() - prevent NAPI from scheduling
+ * @n: NAPI context
*
- * Resume NAPI from being scheduled on this context.
- * Must be paired with napi_disable.
+ * Stop NAPI from being scheduled on this context.
+ * Waits until any outstanding processing completes.
+ * Takes netdev_lock() for the associated net_device.
*/
-void napi_enable(struct napi_struct *n)
+void napi_disable(struct napi_struct *n)
+{
+ netdev_lock(n->dev);
+ napi_disable_locked(n);
+ netdev_unlock(n->dev);
+}
+EXPORT_SYMBOL(napi_disable);
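
Because napi_disable()/napi_enable() now take the instance lock
themselves, paths already running under it, e.g. ndo_stop() on an
ops-locked device (see the assertion added in __dev_close_many()
above), must call the _locked variants to avoid self-deadlock. An
illustrative sketch (the example_* names are hypothetical):

	static int example_stop(struct net_device *dev)	/* .ndo_stop */
	{
		struct example_priv *priv = netdev_priv(dev);

		/* instance lock already held by the core */
		napi_disable_locked(&priv->napi);
		return 0;
	}
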
+
+void napi_enable_locked(struct napi_struct *n)
{
unsigned long new, val = READ_ONCE(n->state);
@@ -6846,27 +7262,38 @@ void napi_enable(struct napi_struct *n)
new |= NAPIF_STATE_THREADED;
} while (!try_cmpxchg(&n->state, &val, new));
}
-EXPORT_SYMBOL(napi_enable);
+EXPORT_SYMBOL(napi_enable_locked);
-static void flush_gro_hash(struct napi_struct *napi)
+/**
+ * napi_enable() - enable NAPI scheduling
+ * @n: NAPI context
+ *
+ * Enable scheduling of a NAPI instance.
+ * Must be paired with napi_disable().
+ * Takes netdev_lock() for associated net_device.
+ */
+void napi_enable(struct napi_struct *n)
{
- int i;
-
- for (i = 0; i < GRO_HASH_BUCKETS; i++) {
- struct sk_buff *skb, *n;
-
- list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list)
- kfree_skb(skb);
- napi->gro_hash[i].count = 0;
- }
+ netdev_lock(n->dev);
+ napi_enable_locked(n);
+ netdev_unlock(n->dev);
}
+EXPORT_SYMBOL(napi_enable);
/* Must be called in process context */
-void __netif_napi_del(struct napi_struct *napi)
+void __netif_napi_del_locked(struct napi_struct *napi)
{
+ netdev_assert_locked(napi->dev);
+
if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state))
return;
+ /* Make sure NAPI is disabled (or was never enabled). */
+ WARN_ON(!test_bit(NAPI_STATE_SCHED, &napi->state));
+
+ if (test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state))
+ irq_set_affinity_notifier(napi->irq, NULL);
+
if (napi->config) {
napi->index = -1;
napi->config = NULL;
@@ -6875,15 +7302,14 @@ void __netif_napi_del(struct napi_struct *napi)
list_del_rcu(&napi->dev_list);
napi_free_frags(napi);
- flush_gro_hash(napi);
- napi->gro_bitmask = 0;
+ gro_cleanup(&napi->gro);
if (napi->thread) {
kthread_stop(napi->thread);
napi->thread = NULL;
}
}
-EXPORT_SYMBOL(__netif_napi_del);
+EXPORT_SYMBOL(__netif_napi_del_locked);
static int __napi_poll(struct napi_struct *n, bool *repoll)
{
@@ -6935,14 +7361,9 @@ static int __napi_poll(struct napi_struct *n, bool *repoll)
return work;
}
- if (n->gro_bitmask) {
- /* flush too old packets
- * If HZ < 1000, flush all packets.
- */
- napi_gro_flush(n, HZ >= 1000);
- }
-
- gro_normal_list(n);
+ /* Flush too old packets. If HZ < 1000, flush all packets */
+ gro_flush(&n->gro, HZ >= 1000);
+ gro_normal_list(&n->gro);
/* Some drivers may have called napi_schedule
* prior to exhausting their budget.
@@ -8801,7 +9222,7 @@ int dev_set_promiscuity(struct net_device *dev, int inc)
}
EXPORT_SYMBOL(dev_set_promiscuity);
-static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
+int netif_set_allmulti(struct net_device *dev, int inc, bool notify)
{
unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
unsigned int allmulti, flags;
@@ -8836,25 +9257,6 @@ static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
return 0;
}
-/**
- * dev_set_allmulti - update allmulti count on a device
- * @dev: device
- * @inc: modifier
- *
- * Add or remove reception of all multicast frames to a device. While the
- * count in the device remains above zero the interface remains listening
- * to all interfaces. Once it hits zero the device reverts back to normal
- * filtering operation. A negative @inc value is used to drop the counter
- * when releasing a resource needing all multicasts.
- * Return 0 if successful or a negative errno code on error.
- */
-
-int dev_set_allmulti(struct net_device *dev, int inc)
-{
- return __dev_set_allmulti(dev, inc, true);
-}
-EXPORT_SYMBOL(dev_set_allmulti);
-
/*
* Upload unicast and multicast address lists to device and
* configure RX filtering. When the device doesn't support unicast
@@ -8970,7 +9372,7 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags,
if ((flags ^ dev->gflags) & IFF_PROMISC) {
int inc = (flags & IFF_PROMISC) ? 1 : -1;
- unsigned int old_flags = dev->flags;
+ old_flags = dev->flags;
dev->gflags ^= IFF_PROMISC;
@@ -8987,7 +9389,7 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags,
int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
dev->gflags ^= IFF_ALLMULTI;
- __dev_set_allmulti(dev, inc, false);
+ netif_set_allmulti(dev, inc, false);
}
return ret;
@@ -9022,17 +9424,8 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
}
}
-/**
- * dev_change_flags - change device settings
- * @dev: device
- * @flags: device state flags
- * @extack: netlink extended ack
- *
- * Change settings on device based state flags. The flags are
- * in the userspace exported format.
- */
-int dev_change_flags(struct net_device *dev, unsigned int flags,
- struct netlink_ext_ack *extack)
+int netif_change_flags(struct net_device *dev, unsigned int flags,
+ struct netlink_ext_ack *extack)
{
int ret;
unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
@@ -9045,7 +9438,6 @@ int dev_change_flags(struct net_device *dev, unsigned int flags,
__dev_notify_flags(dev, old_flags, changes, 0, NULL);
return ret;
}
-EXPORT_SYMBOL(dev_change_flags);
int __dev_set_mtu(struct net_device *dev, int new_mtu)
{
@@ -9077,15 +9469,15 @@ int dev_validate_mtu(struct net_device *dev, int new_mtu,
}
/**
- * dev_set_mtu_ext - Change maximum transfer unit
+ * netif_set_mtu_ext - Change maximum transfer unit
* @dev: device
* @new_mtu: new transfer unit
* @extack: netlink extended ack
*
* Change the maximum transfer size of the network device.
*/
-int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
- struct netlink_ext_ack *extack)
+int netif_set_mtu_ext(struct net_device *dev, int new_mtu,
+ struct netlink_ext_ack *extack)
{
int err, orig_mtu;
@@ -9123,25 +9515,20 @@ int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
return err;
}
-int dev_set_mtu(struct net_device *dev, int new_mtu)
+int netif_set_mtu(struct net_device *dev, int new_mtu)
{
struct netlink_ext_ack extack;
int err;
memset(&extack, 0, sizeof(extack));
- err = dev_set_mtu_ext(dev, new_mtu, &extack);
+ err = netif_set_mtu_ext(dev, new_mtu, &extack);
if (err && extack._msg)
net_err_ratelimited("%s: %s\n", dev->name, extack._msg);
return err;
}
-EXPORT_SYMBOL(dev_set_mtu);
+EXPORT_SYMBOL(netif_set_mtu);
-/**
- * dev_change_tx_queue_len - Change TX queue length of a netdevice
- * @dev: device
- * @new_len: new tx queue length
- */
-int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
+int netif_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
{
unsigned int orig_len = dev->tx_queue_len;
int res;
@@ -9168,12 +9555,7 @@ err_rollback:
return res;
}
-/**
- * dev_set_group - Change group this device belongs to
- * @dev: device
- * @new_group: group this device should belong to
- */
-void dev_set_group(struct net_device *dev, int new_group)
+void netif_set_group(struct net_device *dev, int new_group)
{
dev->group = new_group;
}
@@ -9199,16 +9581,8 @@ int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
}
EXPORT_SYMBOL(dev_pre_changeaddr_notify);
-/**
- * dev_set_mac_address - Change Media Access Control Address
- * @dev: device
- * @sa: new address
- * @extack: netlink extended ack
- *
- * Change the hardware (MAC) address of the device
- */
-int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
- struct netlink_ext_ack *extack)
+int netif_set_mac_address(struct net_device *dev, struct sockaddr *sa,
+ struct netlink_ext_ack *extack)
{
const struct net_device_ops *ops = dev->netdev_ops;
int err;
@@ -9232,22 +9606,9 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
add_device_randomness(dev->dev_addr, dev->addr_len);
return 0;
}
-EXPORT_SYMBOL(dev_set_mac_address);
DECLARE_RWSEM(dev_addr_sem);
-int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa,
- struct netlink_ext_ack *extack)
-{
- int ret;
-
- down_write(&dev_addr_sem);
- ret = dev_set_mac_address(dev, sa, extack);
- up_write(&dev_addr_sem);
- return ret;
-}
-EXPORT_SYMBOL(dev_set_mac_address_user);
-
int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name)
{
size_t size = sizeof(sa->sa_data_min);
@@ -9276,14 +9637,7 @@ unlock:
}
EXPORT_SYMBOL(dev_get_mac_address);
-/**
- * dev_change_carrier - Change device carrier
- * @dev: device
- * @new_carrier: new value
- *
- * Change device carrier
- */
-int dev_change_carrier(struct net_device *dev, bool new_carrier)
+int netif_change_carrier(struct net_device *dev, bool new_carrier)
{
const struct net_device_ops *ops = dev->netdev_ops;
@@ -9394,13 +9748,7 @@ bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b)
}
EXPORT_SYMBOL(netdev_port_same_parent_id);
-/**
- * dev_change_proto_down - set carrier according to proto_down.
- *
- * @dev: device
- * @proto_down: new value
- */
-int dev_change_proto_down(struct net_device *dev, bool proto_down)
+int netif_change_proto_down(struct net_device *dev, bool proto_down)
{
if (!dev->change_proto_down)
return -EOPNOTSUPP;
@@ -9415,14 +9763,14 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down)
}
/**
- * dev_change_proto_down_reason - proto down reason
+ * netdev_change_proto_down_reason_locked - proto down reason
*
* @dev: device
* @mask: proto down mask
* @value: proto down value
*/
-void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask,
- u32 value)
+void netdev_change_proto_down_reason_locked(struct net_device *dev,
+ unsigned long mask, u32 value)
{
u32 proto_down_reason;
int b;
@@ -9499,11 +9847,31 @@ u8 dev_xdp_prog_count(struct net_device *dev)
}
EXPORT_SYMBOL_GPL(dev_xdp_prog_count);
-int dev_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf)
+u8 dev_xdp_sb_prog_count(struct net_device *dev)
+{
+ u8 count = 0;
+ int i;
+
+ for (i = 0; i < __MAX_XDP_MODE; i++)
+ if (dev->xdp_state[i].prog &&
+ !dev->xdp_state[i].prog->aux->xdp_has_frags)
+ count++;
+ return count;
+}
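
Single-buffer XDP programs (those without xdp_has_frags) cannot parse
header/data-split packets, so netif_xdp_propagate() and
dev_xdp_install() below reject them while tcp-data-split is enabled.
dev_xdp_sb_prog_count() exists for the mirror-image check; a sketch of
how an ethtool rings handler might use it (an assumption, not part of
this hunk):

	if (dev_xdp_sb_prog_count(dev) &&
	    ringparam.tcp_data_split == ETHTOOL_TCP_DATA_SPLIT_ENABLED) {
		NL_SET_ERR_MSG(extack,
			       "tcp-data-split incompatible with single-buffer XDP");
		return -EINVAL;
	}
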
+
+int netif_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf)
{
if (!dev->netdev_ops->ndo_bpf)
return -EOPNOTSUPP;
+ if (dev->cfg->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED &&
+ bpf->command == XDP_SETUP_PROG &&
+ bpf->prog && !bpf->prog->aux->xdp_has_frags) {
+ NL_SET_ERR_MSG(bpf->extack,
+ "unable to propagate XDP to device using tcp-data-split");
+ return -EBUSY;
+ }
+
if (dev_get_min_mp_channel_count(dev)) {
NL_SET_ERR_MSG(bpf->extack, "unable to propagate XDP to device using memory provider");
return -EBUSY;
@@ -9511,7 +9879,6 @@ int dev_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf)
return dev->netdev_ops->ndo_bpf(dev, bpf);
}
-EXPORT_SYMBOL_GPL(dev_xdp_propagate);
u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode)
{
@@ -9541,6 +9908,14 @@ static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode,
struct netdev_bpf xdp;
int err;
+ netdev_ops_assert_locked(dev);
+
+ if (dev->cfg->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED &&
+ prog && !prog->aux->xdp_has_frags) {
+ NL_SET_ERR_MSG(extack, "unable to install XDP to device using tcp-data-split");
+ return -EBUSY;
+ }
+
if (dev_get_min_mp_channel_count(dev)) {
NL_SET_ERR_MSG(extack, "unable to install XDP to device using memory provider");
return -EBUSY;
@@ -9694,6 +10069,10 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack
NL_SET_ERR_MSG(extack, "Program bound to different device");
return -EINVAL;
}
+ if (bpf_prog_is_dev_bound(new_prog->aux) && mode == XDP_MODE_SKB) {
+ NL_SET_ERR_MSG(extack, "Can't attach device-bound programs in generic mode");
+ return -EINVAL;
+ }
if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) {
NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device");
return -EINVAL;
@@ -9763,7 +10142,9 @@ static void bpf_xdp_link_release(struct bpf_link *link)
* already NULL, in which case link was already auto-detached
*/
if (xdp_link->dev) {
+ netdev_lock_ops(xdp_link->dev);
WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link));
+ netdev_unlock_ops(xdp_link->dev);
xdp_link->dev = NULL;
}
@@ -9845,10 +10226,12 @@ static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
goto out_unlock;
}
+ netdev_lock_ops(xdp_link->dev);
mode = dev_xdp_mode(xdp_link->dev, xdp_link->flags);
bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode);
err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL,
xdp_link->flags, new_prog);
+ netdev_unlock_ops(xdp_link->dev);
if (err)
goto out_unlock;
@@ -9974,7 +10357,7 @@ u32 dev_get_min_mp_channel_count(const struct net_device *dev)
{
int i;
- ASSERT_RTNL();
+ netdev_ops_assert_locked(dev);
for (i = dev->real_num_rx_queues - 1; i >= 0; i--)
if (dev->_rx[i].mp_params.mp_priv)
@@ -10021,6 +10404,15 @@ static void dev_index_release(struct net *net, int ifindex)
WARN_ON(xa_erase(&net->dev_by_index, ifindex));
}
+static bool from_cleanup_net(void)
+{
+#ifdef CONFIG_NET_NS
+ return current == cleanup_net_task;
+#else
+ return false;
+#endif
+}
+
/* Delayed registration/unregisteration */
LIST_HEAD(net_todo_list);
DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
@@ -10191,6 +10583,7 @@ int __netdev_update_features(struct net_device *dev)
int err = -1;
ASSERT_RTNL();
+ netdev_ops_assert_locked(dev);
features = netdev_get_wanted_features(dev);
@@ -10617,12 +11010,16 @@ int register_netdevice(struct net_device *dev)
ret = netdev_register_kobject(dev);
+ netdev_lock(dev);
WRITE_ONCE(dev->reg_state, ret ? NETREG_UNREGISTERED : NETREG_REGISTERED);
+ netdev_unlock(dev);
if (ret)
goto err_uninit_notify;
+ netdev_lock_ops(dev);
__netdev_update_features(dev);
+ netdev_unlock_ops(dev);
/*
* Default initial state at registry is that the
@@ -10685,26 +11082,20 @@ err_free_name:
EXPORT_SYMBOL(register_netdevice);
/* Initialize the core of a dummy net device.
- * This is useful if you are calling this function after alloc_netdev(),
- * since it does not memset the net_device fields.
+ * Apply the setup steps that dummy netdevs need, which normal netdevs
+ * get by going through register_netdevice().
*/
-static void init_dummy_netdev_core(struct net_device *dev)
+static void init_dummy_netdev(struct net_device *dev)
{
/* make sure we BUG if trying to hit standard
* register/unregister code path
*/
dev->reg_state = NETREG_DUMMY;
- /* NAPI wants this */
- INIT_LIST_HEAD(&dev->napi_list);
-
/* a dummy interface is started by default */
set_bit(__LINK_STATE_PRESENT, &dev->state);
set_bit(__LINK_STATE_START, &dev->state);
- /* napi_busy_loop stats accounting wants this */
- dev_net_set(dev, &init_net);
-
	/* Note: we don't allocate pcpu_refcnt for dummy devices,
	 * because users of this 'device' don't need to change
	 * its refcount.
@@ -10712,28 +11103,6 @@ static void init_dummy_netdev_core(struct net_device *dev)
}
/**
- * init_dummy_netdev - init a dummy network device for NAPI
- * @dev: device to init
- *
- * This takes a network device structure and initializes the minimum
- * amount of fields so it can be used to schedule NAPI polls without
- * registering a full blown interface. This is to be used by drivers
- * that need to tie several hardware interfaces to a single NAPI
- * poll scheduler due to HW limitations.
- */
-void init_dummy_netdev(struct net_device *dev)
-{
- /* Clear everything. Note we don't initialize spinlocks
- * as they aren't supposed to be taken by any of the
- * NAPI code and this dummy netdev is supposed to be
- * only ever used for NAPI polls
- */
- memset(dev, 0, sizeof(struct net_device));
- init_dummy_netdev_core(dev);
-}
-EXPORT_SYMBOL_GPL(init_dummy_netdev);
-
-/**
* register_netdev - register a network device
* @dev: device to register
*
@@ -10748,12 +11117,16 @@ EXPORT_SYMBOL_GPL(init_dummy_netdev);
*/
int register_netdev(struct net_device *dev)
{
+ struct net *net = dev_net(dev);
int err;
- if (rtnl_lock_killable())
+ if (rtnl_net_lock_killable(net))
return -EINTR;
+
err = register_netdevice(dev);
- rtnl_unlock();
+
+ rtnl_net_unlock(net);
+
return err;
}
EXPORT_SYMBOL(register_netdev);
@@ -10891,9 +11264,8 @@ void netdev_run_todo(void)
list_replace_init(&net_unlink_list, &unlink_list);
while (!list_empty(&unlink_list)) {
- struct net_device *dev = list_first_entry(&unlink_list,
- struct net_device,
- unlink_list);
+ dev = list_first_entry(&unlink_list, struct net_device,
+ unlink_list);
list_del_init(&dev->unlink_list);
dev->nested_level = dev->lower_level - 1;
}
@@ -10915,7 +11287,9 @@ void netdev_run_todo(void)
continue;
}
+ netdev_lock(dev);
WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERED);
+ netdev_unlock(dev);
linkwatch_sync_dev(dev);
}
@@ -11063,6 +11437,20 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
const struct net_device_ops *ops = dev->netdev_ops;
const struct net_device_core_stats __percpu *p;
+ /*
+ * IPv{4,6} and UDP tunnels share common stat helpers but use
+ * different stat types (NETDEV_PCPU_STAT_TSTATS vs
+ * NETDEV_PCPU_STAT_DSTATS). Ensure the accounting is consistent.
+ */
+ BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, rx_bytes) !=
+ offsetof(struct pcpu_dstats, rx_bytes));
+ BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, rx_packets) !=
+ offsetof(struct pcpu_dstats, rx_packets));
+ BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, tx_bytes) !=
+ offsetof(struct pcpu_dstats, tx_bytes));
+ BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, tx_packets) !=
+ offsetof(struct pcpu_dstats, tx_packets));
+
if (ops->ndo_get_stats64) {
memset(storage, 0, sizeof(*storage));
ops->ndo_get_stats64(dev, storage);
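The four BUILD_BUG_ONs are what let a helper read either per-cpu stats flavor through one code shape. A hedged illustration (not the in-tree helper; the 32-bit seqcount retry loop is elided for brevity):

/* Hedged illustration: matching field offsets mean TSTATS and DSTATS
 * devices can share rx/tx byte and packet accounting paths. */
static u64 example_cpu_rx_bytes(struct net_device *dev, int cpu)
{
	if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_DSTATS) {
		struct pcpu_dstats *d = per_cpu_ptr(dev->dstats, cpu);

		return u64_stats_read(&d->rx_bytes);
	} else {	/* NETDEV_PCPU_STAT_TSTATS */
		struct pcpu_sw_netstats *t = per_cpu_ptr(dev->tstats, cpu);

		return u64_stats_read(&t->rx_bytes);
	}
}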
@@ -11301,6 +11689,11 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
if (!dev->ethtool)
goto free_all;
+ dev->cfg = kzalloc(sizeof(*dev->cfg), GFP_KERNEL_ACCOUNT);
+ if (!dev->cfg)
+ goto free_all;
+ dev->cfg_pending = dev->cfg;
+
napi_config_sz = array_size(maxqs, sizeof(*dev->napi_config));
dev->napi_config = kvzalloc(napi_config_sz, GFP_KERNEL_ACCOUNT);
if (!dev->napi_config)
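dev->cfg and dev->cfg_pending start out pointing at the same object; the split is what allows configuration changes to be staged and rolled back before they become the device's committed state (free_netdev() below warns if the two still differ at teardown). A hedged sketch of the pattern, with an illustrative field and driver hook:

/* Hedged sketch: stage a value on cfg_pending, apply it via an
 * illustrative driver call, roll back on failure. 'example_field'
 * and example_apply_to_hw() are not real names. */
static int example_stage_cfg(struct net_device *dev, u32 val)
{
	u32 old = dev->cfg_pending->example_field;
	int err;

	dev->cfg_pending->example_field = val;
	err = example_apply_to_hw(dev, val);
	if (err) {
		dev->cfg_pending->example_field = old;	/* restore */
		return err;
	}

	/* success: pending and committed state agree again */
	WARN_ON(dev->cfg != dev->cfg_pending);
	return 0;
}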
@@ -11330,6 +11723,22 @@ free_dev:
}
EXPORT_SYMBOL(alloc_netdev_mqs);
+static void netdev_napi_exit(struct net_device *dev)
+{
+ if (!list_empty(&dev->napi_list)) {
+ struct napi_struct *p, *n;
+
+ netdev_lock(dev);
+ list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
+ __netif_napi_del_locked(p);
+ netdev_unlock(dev);
+
+ synchronize_net();
+ }
+
+ kvfree(dev->napi_config);
+}
+
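netdev_napi_exit() pairs the locked list teardown with a synchronize_net() because NAPI instances remain reachable under RCU (via the ID hash used by busy polling). A hedged sketch of the kind of reader the grace period waits out:

/* Hedged sketch: an RCU-side lookup that must not observe a NAPI
 * whose memory is about to be reused; no reference is taken. */
static bool example_napi_id_active(unsigned int napi_id)
{
	struct napi_struct *napi;
	bool active;

	rcu_read_lock();
	napi = napi_by_id(napi_id);
	active = napi != NULL;
	rcu_read_unlock();

	return active;
}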
/**
* free_netdev - free network device
* @dev: device
@@ -11341,8 +11750,6 @@ EXPORT_SYMBOL(alloc_netdev_mqs);
*/
void free_netdev(struct net_device *dev)
{
- struct napi_struct *p, *n;
-
might_sleep();
/* When called immediately after register_netdevice() failed the unwind
@@ -11355,8 +11762,8 @@ void free_netdev(struct net_device *dev)
return;
}
- mutex_destroy(&dev->lock);
-
+ WARN_ON(dev->cfg != dev->cfg_pending);
+ kfree(dev->cfg);
kfree(dev->ethtool);
netif_free_tx_queues(dev);
netif_free_rx_queues(dev);
@@ -11366,10 +11773,9 @@ void free_netdev(struct net_device *dev)
/* Flush device addresses */
dev_addr_flush(dev);
- list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
- netif_napi_del(p);
+ netdev_napi_exit(dev);
- kvfree(dev->napi_config);
+ netif_del_cpu_rmap(dev);
ref_tracker_dir_exit(&dev->refcnt_tracker);
#ifdef CONFIG_PCPU_DEV_REFCNT
@@ -11383,6 +11789,8 @@ void free_netdev(struct net_device *dev)
netdev_free_phy_link_topology(dev);
+ mutex_destroy(&dev->lock);
+
/* Compatibility with error handling in drivers */
if (dev->reg_state == NETREG_UNINITIALIZED ||
dev->reg_state == NETREG_DUMMY) {
@@ -11407,7 +11815,7 @@ EXPORT_SYMBOL(free_netdev);
struct net_device *alloc_netdev_dummy(int sizeof_priv)
{
return alloc_netdev(sizeof_priv, "dummy#", NET_NAME_UNKNOWN,
- init_dummy_netdev_core);
+ init_dummy_netdev);
}
EXPORT_SYMBOL_GPL(alloc_netdev_dummy);
@@ -11420,7 +11828,7 @@ EXPORT_SYMBOL_GPL(alloc_netdev_dummy);
void synchronize_net(void)
{
might_sleep();
- if (rtnl_is_locked())
+ if (from_cleanup_net() || rtnl_is_locked())
synchronize_rcu_expedited();
else
synchronize_rcu();
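For context, a hedged sketch of the producer side that from_cleanup_net() tests against, simplified from cleanup_net() in net/core/net_namespace.c; the cleanup worker runs without RTNL, which is exactly why the task check is needed:

/* Hedged sketch: while the cleanup worker runs, every
 * synchronize_net() it reaches takes the expedited path. */
static void example_cleanup_net(struct work_struct *work)
{
	cleanup_net_task = current;	/* matched by from_cleanup_net() */
	/* ... unregister devices and tear down the namespaces ... */
	cleanup_net_task = NULL;
}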
@@ -11483,6 +11891,19 @@ void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
}
EXPORT_SYMBOL(unregister_netdevice_queue);
+static void dev_memory_provider_uninstall(struct net_device *dev)
+{
+ unsigned int i;
+
+ for (i = 0; i < dev->real_num_rx_queues; i++) {
+ struct netdev_rx_queue *rxq = &dev->_rx[i];
+ struct pp_memory_provider_params *p = &rxq->mp_params;
+
+ if (p->mp_ops && p->mp_ops->uninstall)
+ p->mp_ops->uninstall(p->mp_priv, rxq);
+ }
+}
+
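On the provider side, the uninstall hook invoked above is expected to sever its queue linkage so nothing dereferences freed provider state afterwards. A hedged sketch of an implementation, with illustrative names:

/* Hedged sketch of a memory provider's uninstall hook; the provider
 * type and put helper are illustrative. */
static void example_mp_uninstall(void *mp_priv, struct netdev_rx_queue *rxq)
{
	struct example_provider *priv = mp_priv;

	/* unhook from the queue first so no further use is possible */
	rxq->mp_params.mp_priv = NULL;
	rxq->mp_params.mp_ops = NULL;

	example_provider_put(priv);	/* drop the queue's reference */
}

static const struct memory_provider_ops example_mp_ops = {
	.uninstall	= example_mp_uninstall,
	/* .alloc_netmems, .release_netmem, ... elided */
};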
void unregister_netdevice_many_notify(struct list_head *head,
u32 portid, const struct nlmsghdr *nlh)
{
@@ -11514,14 +11935,19 @@ void unregister_netdevice_many_notify(struct list_head *head,
}
/* If device is running, close it first. */
- list_for_each_entry(dev, head, unreg_list)
+ list_for_each_entry(dev, head, unreg_list) {
list_add_tail(&dev->close_list, &close_head);
+ netdev_lock_ops(dev);
+ }
dev_close_many(&close_head, true);
list_for_each_entry(dev, head, unreg_list) {
+ netdev_unlock_ops(dev);
/* And unlink it from device chain. */
unlist_netdevice(dev);
+ netdev_lock(dev);
WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERING);
+ netdev_unlock(dev);
}
flush_all_backlogs();
@@ -11533,9 +11959,11 @@ void unregister_netdevice_many_notify(struct list_head *head,
/* Shutdown queueing discipline. */
dev_shutdown(dev);
dev_tcx_uninstall(dev);
+ netdev_lock_ops(dev);
dev_xdp_uninstall(dev);
+ dev_memory_provider_uninstall(dev);
+ netdev_unlock_ops(dev);
bpf_dev_bound_netdev_unregister(dev);
- dev_dmabuf_uninstall(dev);
netdev_offload_xstats_disable_all(dev);
@@ -11623,30 +12051,15 @@ EXPORT_SYMBOL(unregister_netdevice_many);
*/
void unregister_netdev(struct net_device *dev)
{
- rtnl_lock();
+ rtnl_net_dev_lock(dev);
unregister_netdevice(dev);
- rtnl_unlock();
+ rtnl_net_dev_unlock(dev);
}
EXPORT_SYMBOL(unregister_netdev);
-/**
- * __dev_change_net_namespace - move device to different nethost namespace
- * @dev: device
- * @net: network namespace
- * @pat: If not NULL name pattern to try if the current device name
- * is already taken in the destination network namespace.
- * @new_ifindex: If not zero, specifies device index in the target
- * namespace.
- *
- * This function shuts down a device interface and moves it
- * to a new network namespace. On success 0 is returned, on
- * a failure a netagive errno code is returned.
- *
- * Callers must hold the rtnl semaphore.
- */
-
-int __dev_change_net_namespace(struct net_device *dev, struct net *net,
- const char *pat, int new_ifindex)
+int netif_change_net_namespace(struct net_device *dev, struct net *net,
+ const char *pat, int new_ifindex,
+ struct netlink_ext_ack *extack)
{
struct netdev_name_node *name_node;
struct net *net_old = dev_net(dev);
@@ -11657,12 +12070,16 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net,
/* Don't allow namespace-local devices to be moved. */
err = -EINVAL;
- if (dev->netns_local)
+ if (dev->netns_immutable) {
+ NL_SET_ERR_MSG(extack, "The interface netns is immutable");
goto out;
+ }
/* Ensure the device has been registered */
- if (dev->reg_state != NETREG_REGISTERED)
+ if (dev->reg_state != NETREG_REGISTERED) {
+ NL_SET_ERR_MSG(extack, "The interface isn't registered");
goto out;
+ }
/* Get out if there is nothing to do */
err = 0;
@@ -11675,30 +12092,49 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net,
err = -EEXIST;
if (netdev_name_in_use(net, dev->name)) {
/* We get here if we can't use the current device name */
- if (!pat)
+ if (!pat) {
+ NL_SET_ERR_MSG(extack,
+ "An interface with the same name exists in the target netns");
goto out;
+ }
err = dev_prep_valid_name(net, dev, pat, new_name, EEXIST);
- if (err < 0)
+ if (err < 0) {
+ NL_SET_ERR_MSG_FMT(extack,
+ "Unable to use '%s' for the new interface name in the target netns",
+ pat);
goto out;
+ }
}
/* Check that none of the altnames conflicts. */
err = -EEXIST;
- netdev_for_each_altname(dev, name_node)
- if (netdev_name_in_use(net, name_node->name))
+ netdev_for_each_altname(dev, name_node) {
+ if (netdev_name_in_use(net, name_node->name)) {
+ NL_SET_ERR_MSG_FMT(extack,
+ "An interface with the altname %s exists in the target netns",
+ name_node->name);
goto out;
+ }
+ }
/* Check that new_ifindex isn't used yet. */
if (new_ifindex) {
err = dev_index_reserve(net, new_ifindex);
- if (err < 0)
+ if (err < 0) {
+ NL_SET_ERR_MSG_FMT(extack,
+ "The ifindex %d is not available in the target netns",
+ new_ifindex);
goto out;
+ }
} else {
/* If there is an ifindex conflict, assign a new one */
err = dev_index_reserve(net, dev->ifindex);
if (err == -EBUSY)
err = dev_index_reserve(net, 0);
- if (err < 0)
+ if (err < 0) {
+ NL_SET_ERR_MSG(extack,
+ "Unable to allocate a new ifindex in the target netns");
goto out;
+ }
new_ifindex = err;
}
@@ -11707,7 +12143,7 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net,
*/
/* If device is running, close it first. */
- dev_close(dev);
+ netif_close(dev);
/* And unlink it from device chain */
unlist_netdevice(dev);
@@ -11789,7 +12225,6 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net,
out:
return err;
}
-EXPORT_SYMBOL_GPL(__dev_change_net_namespace);
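With extack threaded through, callers can forward a precise failure reason to userspace. A hedged sketch of a minimal caller (RTNL and the relevant per-netns locks are assumed held, as before):

/* Hedged sketch: keep the current name (pat == NULL) and let the
 * kernel pick a free ifindex (new_ifindex == 0). */
static int example_move_dev(struct net_device *dev, struct net *net,
			    struct netlink_ext_ack *extack)
{
	return netif_change_net_namespace(dev, net, NULL, 0, extack);
}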
static int dev_cpu_dead(unsigned int oldcpu)
{
@@ -11904,7 +12339,7 @@ static struct hlist_head * __net_init netdev_create_hash(void)
static int __net_init netdev_init(struct net *net)
{
BUILD_BUG_ON(GRO_HASH_BUCKETS >
- 8 * sizeof_field(struct napi_struct, gro_bitmask));
+ BITS_PER_BYTE * sizeof_field(struct gro_node, bitmask));
INIT_LIST_HEAD(&net->dev_base_head);
@@ -12039,7 +12474,7 @@ static void __net_exit default_device_exit_net(struct net *net)
char fb_name[IFNAMSIZ];
/* Ignore unmovable devices (e.g. loopback) */
- if (dev->netns_local)
+ if (dev->netns_immutable)
continue;
/* Leave virtual devices for the generic cleanup */
@@ -12171,11 +12606,18 @@ static int net_page_pool_create(int cpuid)
.nid = cpu_to_mem(cpuid),
};
struct page_pool *pp_ptr;
+ int err;
pp_ptr = page_pool_create_percpu(&page_pool_params, cpuid);
if (IS_ERR(pp_ptr))
return -ENOMEM;
+ err = xdp_reg_page_pool(pp_ptr);
+ if (err) {
+ page_pool_destroy(pp_ptr);
+ return err;
+ }
+
per_cpu(system_page_pool, cpuid) = pp_ptr;
#endif
return 0;
@@ -12241,12 +12683,13 @@ static int __init net_dev_init(void)
* Initialise the packet receive queues.
*/
+ flush_backlogs_fallback = flush_backlogs_alloc();
+ if (!flush_backlogs_fallback)
+ goto out;
+
for_each_possible_cpu(i) {
- struct work_struct *flush = per_cpu_ptr(&flush_works, i);
struct softnet_data *sd = &per_cpu(softnet_data, i);
- INIT_WORK(flush, flush_backlog);
-
skb_queue_head_init(&sd->input_pkt_queue);
skb_queue_head_init(&sd->process_queue);
#ifdef CONFIG_XFRM_OFFLOAD
@@ -12261,7 +12704,7 @@ static int __init net_dev_init(void)
INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd);
spin_lock_init(&sd->defer_lock);
- init_gro_hash(&sd->backlog);
+ gro_init(&sd->backlog.gro);
sd->backlog.poll = process_backlog;
sd->backlog.weight = weight_p;
INIT_LIST_HEAD(&sd->backlog.poll_list);
@@ -12309,6 +12752,7 @@ out:
if (!pp_ptr)
continue;
+ xdp_unreg_page_pool(pp_ptr);
page_pool_destroy(pp_ptr);
per_cpu(system_page_pool, i) = NULL;
}