Diffstat (limited to 'net')
156 files changed, 2107 insertions, 1831 deletions
diff --git a/net/6lowpan/Kconfig b/net/6lowpan/Kconfig index 4c1f4c0aa58a..d8fc459492b0 100644 --- a/net/6lowpan/Kconfig +++ b/net/6lowpan/Kconfig @@ -2,7 +2,7 @@ menuconfig 6LOWPAN tristate "6LoWPAN Support" depends on IPV6 - ---help--- + help This enables IPv6 over Low power Wireless Personal Area Network - "6LoWPAN" which is supported by IEEE 802.15.4 or Bluetooth stacks. @@ -10,7 +10,7 @@ config 6LOWPAN_DEBUGFS bool "6LoWPAN debugfs support" depends on 6LOWPAN depends on DEBUG_FS - ---help--- + help This enables 6LoWPAN debugfs support. For example to manipulate IPHC context information at runtime. @@ -18,7 +18,7 @@ menuconfig 6LOWPAN_NHC tristate "Next Header and Generic Header Compression Support" depends on 6LOWPAN default y - ---help--- + help Support for next header and generic header compression defined in RFC6282 and RFC7400. @@ -27,78 +27,78 @@ if 6LOWPAN_NHC config 6LOWPAN_NHC_DEST tristate "Destination Options Header Support" default y - ---help--- + help 6LoWPAN IPv6 Destination Options Header compression according to RFC6282. config 6LOWPAN_NHC_FRAGMENT tristate "Fragment Header Support" default y - ---help--- + help 6LoWPAN IPv6 Fragment Header compression according to RFC6282. config 6LOWPAN_NHC_HOP tristate "Hop-by-Hop Options Header Support" default y - ---help--- + help 6LoWPAN IPv6 Hop-by-Hop Options Header compression according to RFC6282. config 6LOWPAN_NHC_IPV6 tristate "IPv6 Header Support" default y - ---help--- + help 6LoWPAN IPv6 Header compression according to RFC6282. config 6LOWPAN_NHC_MOBILITY tristate "Mobility Header Support" default y - ---help--- + help 6LoWPAN IPv6 Mobility Header compression according to RFC6282. config 6LOWPAN_NHC_ROUTING tristate "Routing Header Support" default y - ---help--- + help 6LoWPAN IPv6 Routing Header compression according to RFC6282. config 6LOWPAN_NHC_UDP tristate "UDP Header Support" default y - ---help--- + help 6LoWPAN IPv6 UDP Header compression according to RFC6282. config 6LOWPAN_GHC_EXT_HDR_HOP tristate "GHC Hop-by-Hop Options Header Support" - ---help--- + help 6LoWPAN IPv6 Hop-by-Hop option generic header compression according to RFC7400. config 6LOWPAN_GHC_UDP tristate "GHC UDP Support" - ---help--- + help 6LoWPAN IPv6 UDP generic header compression according to RFC7400. config 6LOWPAN_GHC_ICMPV6 tristate "GHC ICMPv6 Support" - ---help--- + help 6LoWPAN IPv6 ICMPv6 generic header compression according to RFC7400. config 6LOWPAN_GHC_EXT_HDR_DEST tristate "GHC Destination Options Header Support" - ---help--- + help 6LoWPAN IPv6 destination option generic header compression according to RFC7400. config 6LOWPAN_GHC_EXT_HDR_FRAG tristate "GHC Fragmentation Options Header Support" - ---help--- + help 6LoWPAN IPv6 fragmentation option generic header compression according to RFC7400. config 6LOWPAN_GHC_EXT_HDR_ROUTE tristate "GHC Routing Options Header Support" - ---help--- + help 6LoWPAN IPv6 routing option generic header compression according to RFC7400. diff --git a/net/8021q/Kconfig b/net/8021q/Kconfig index 5510b4b90ff0..8bf7a1765b78 100644 --- a/net/8021q/Kconfig +++ b/net/8021q/Kconfig @@ -5,7 +5,7 @@ config VLAN_8021Q tristate "802.1Q/802.1ad VLAN Support" - ---help--- + help Select this and you will be able to create 802.1Q VLAN interfaces on your Ethernet interfaces. 
802.1Q VLAN supports almost everything a regular Ethernet interface does, including diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index f00bb57f0f60..c8d6a07e23c5 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -494,6 +494,7 @@ static void vlan_dev_set_rx_mode(struct net_device *vlan_dev) * separate class since they always nest. */ static struct lock_class_key vlan_netdev_xmit_lock_key; +static struct lock_class_key vlan_netdev_addr_lock_key; static void vlan_dev_set_lockdep_one(struct net_device *dev, struct netdev_queue *txq, @@ -502,8 +503,11 @@ static void vlan_dev_set_lockdep_one(struct net_device *dev, lockdep_set_class(&txq->_xmit_lock, &vlan_netdev_xmit_lock_key); } -static void vlan_dev_set_lockdep_class(struct net_device *dev) +static void vlan_dev_set_lockdep_class(struct net_device *dev, int subclass) { + lockdep_set_class_and_subclass(&dev->addr_list_lock, + &vlan_netdev_addr_lock_key, + subclass); netdev_for_each_tx_queue(dev, vlan_dev_set_lockdep_one, NULL); } @@ -597,7 +601,7 @@ static int vlan_dev_init(struct net_device *dev) SET_NETDEV_DEVTYPE(dev, &vlan_type); - vlan_dev_set_lockdep_class(dev); + vlan_dev_set_lockdep_class(dev, dev->lower_level); vlan->vlan_pcpu_stats = netdev_alloc_pcpu_stats(struct vlan_pcpu_stats); if (!vlan->vlan_pcpu_stats) diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c index 3963eb11c3fb..3debad93be1a 100644 --- a/net/9p/trans_xen.c +++ b/net/9p/trans_xen.c @@ -43,8 +43,8 @@ #include <net/9p/transport.h> #define XEN_9PFS_NUM_RINGS 2 -#define XEN_9PFS_RING_ORDER 6 -#define XEN_9PFS_RING_SIZE XEN_FLEX_RING_SIZE(XEN_9PFS_RING_ORDER) +#define XEN_9PFS_RING_ORDER 9 +#define XEN_9PFS_RING_SIZE(ring) XEN_FLEX_RING_SIZE(ring->intf->ring_order) struct xen_9pfs_header { uint32_t size; @@ -132,8 +132,8 @@ static bool p9_xen_write_todo(struct xen_9pfs_dataring *ring, RING_IDX size) prod = ring->intf->out_prod; virt_mb(); - return XEN_9PFS_RING_SIZE - - xen_9pfs_queued(prod, cons, XEN_9PFS_RING_SIZE) >= size; + return XEN_9PFS_RING_SIZE(ring) - + xen_9pfs_queued(prod, cons, XEN_9PFS_RING_SIZE(ring)) >= size; } static int p9_xen_request(struct p9_client *client, struct p9_req_t *p9_req) @@ -167,17 +167,18 @@ again: prod = ring->intf->out_prod; virt_mb(); - if (XEN_9PFS_RING_SIZE - xen_9pfs_queued(prod, cons, - XEN_9PFS_RING_SIZE) < size) { + if (XEN_9PFS_RING_SIZE(ring) - + xen_9pfs_queued(prod, cons, XEN_9PFS_RING_SIZE(ring)) < size) { spin_unlock_irqrestore(&ring->lock, flags); goto again; } - masked_prod = xen_9pfs_mask(prod, XEN_9PFS_RING_SIZE); - masked_cons = xen_9pfs_mask(cons, XEN_9PFS_RING_SIZE); + masked_prod = xen_9pfs_mask(prod, XEN_9PFS_RING_SIZE(ring)); + masked_cons = xen_9pfs_mask(cons, XEN_9PFS_RING_SIZE(ring)); xen_9pfs_write_packet(ring->data.out, p9_req->tc.sdata, size, - &masked_prod, masked_cons, XEN_9PFS_RING_SIZE); + &masked_prod, masked_cons, + XEN_9PFS_RING_SIZE(ring)); p9_req->status = REQ_STATUS_SENT; virt_wmb(); /* write ring before updating pointer */ @@ -207,19 +208,19 @@ static void p9_xen_response(struct work_struct *work) prod = ring->intf->in_prod; virt_rmb(); - if (xen_9pfs_queued(prod, cons, XEN_9PFS_RING_SIZE) < + if (xen_9pfs_queued(prod, cons, XEN_9PFS_RING_SIZE(ring)) < sizeof(h)) { notify_remote_via_irq(ring->irq); return; } - masked_prod = xen_9pfs_mask(prod, XEN_9PFS_RING_SIZE); - masked_cons = xen_9pfs_mask(cons, XEN_9PFS_RING_SIZE); + masked_prod = xen_9pfs_mask(prod, XEN_9PFS_RING_SIZE(ring)); + masked_cons = xen_9pfs_mask(cons, XEN_9PFS_RING_SIZE(ring)); /* First, read just the 
header */ xen_9pfs_read_packet(&h, ring->data.in, sizeof(h), masked_prod, &masked_cons, - XEN_9PFS_RING_SIZE); + XEN_9PFS_RING_SIZE(ring)); req = p9_tag_lookup(priv->client, h.tag); if (!req || req->status != REQ_STATUS_SENT) { @@ -233,11 +234,11 @@ static void p9_xen_response(struct work_struct *work) memcpy(&req->rc, &h, sizeof(h)); req->rc.offset = 0; - masked_cons = xen_9pfs_mask(cons, XEN_9PFS_RING_SIZE); + masked_cons = xen_9pfs_mask(cons, XEN_9PFS_RING_SIZE(ring)); /* Then, read the whole packet (including the header) */ xen_9pfs_read_packet(req->rc.sdata, ring->data.in, h.size, masked_prod, &masked_cons, - XEN_9PFS_RING_SIZE); + XEN_9PFS_RING_SIZE(ring)); virt_mb(); cons += h.size; @@ -267,7 +268,7 @@ static irqreturn_t xen_9pfs_front_event_handler(int irq, void *r) static struct p9_trans_module p9_xen_trans = { .name = "xen", - .maxsize = 1 << (XEN_9PFS_RING_ORDER + XEN_PAGE_SHIFT), + .maxsize = 1 << (XEN_9PFS_RING_ORDER + XEN_PAGE_SHIFT - 2), .def = 1, .create = p9_xen_create, .close = p9_xen_close, @@ -295,14 +296,16 @@ static void xen_9pfs_front_free(struct xen_9pfs_front_priv *priv) if (priv->rings[i].irq > 0) unbind_from_irqhandler(priv->rings[i].irq, priv->dev); if (priv->rings[i].data.in) { - for (j = 0; j < (1 << XEN_9PFS_RING_ORDER); j++) { + for (j = 0; + j < (1 << priv->rings[i].intf->ring_order); + j++) { grant_ref_t ref; ref = priv->rings[i].intf->ref[j]; gnttab_end_foreign_access(ref, 0, 0); } free_pages((unsigned long)priv->rings[i].data.in, - XEN_9PFS_RING_ORDER - + priv->rings[i].intf->ring_order - (PAGE_SHIFT - XEN_PAGE_SHIFT)); } gnttab_end_foreign_access(priv->rings[i].ref, 0, 0); @@ -323,7 +326,8 @@ static int xen_9pfs_front_remove(struct xenbus_device *dev) } static int xen_9pfs_front_alloc_dataring(struct xenbus_device *dev, - struct xen_9pfs_dataring *ring) + struct xen_9pfs_dataring *ring, + unsigned int order) { int i = 0; int ret = -ENOMEM; @@ -342,21 +346,21 @@ static int xen_9pfs_front_alloc_dataring(struct xenbus_device *dev, goto out; ring->ref = ret; bytes = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, - XEN_9PFS_RING_ORDER - (PAGE_SHIFT - XEN_PAGE_SHIFT)); + order - (PAGE_SHIFT - XEN_PAGE_SHIFT)); if (!bytes) { ret = -ENOMEM; goto out; } - for (; i < (1 << XEN_9PFS_RING_ORDER); i++) { + for (; i < (1 << order); i++) { ret = gnttab_grant_foreign_access( dev->otherend_id, virt_to_gfn(bytes) + i, 0); if (ret < 0) goto out; ring->intf->ref[i] = ret; } - ring->intf->ring_order = XEN_9PFS_RING_ORDER; + ring->intf->ring_order = order; ring->data.in = bytes; - ring->data.out = bytes + XEN_9PFS_RING_SIZE; + ring->data.out = bytes + XEN_FLEX_RING_SIZE(order); ret = xenbus_alloc_evtchn(dev, &ring->evtchn); if (ret) @@ -374,7 +378,7 @@ out: for (i--; i >= 0; i--) gnttab_end_foreign_access(ring->intf->ref[i], 0, 0); free_pages((unsigned long)bytes, - XEN_9PFS_RING_ORDER - + ring->intf->ring_order - (PAGE_SHIFT - XEN_PAGE_SHIFT)); } gnttab_end_foreign_access(ring->ref, 0, 0); @@ -404,8 +408,10 @@ static int xen_9pfs_front_probe(struct xenbus_device *dev, return -EINVAL; max_ring_order = xenbus_read_unsigned(dev->otherend, "max-ring-page-order", 0); - if (max_ring_order < XEN_9PFS_RING_ORDER) - return -EINVAL; + if (max_ring_order > XEN_9PFS_RING_ORDER) + max_ring_order = XEN_9PFS_RING_ORDER; + if (p9_xen_trans.maxsize > XEN_FLEX_RING_SIZE(max_ring_order)) + p9_xen_trans.maxsize = XEN_FLEX_RING_SIZE(max_ring_order) / 2; priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (!priv) @@ -422,7 +428,8 @@ static int xen_9pfs_front_probe(struct xenbus_device *dev, for (i 
= 0; i < priv->num_rings; i++) { priv->rings[i].priv = priv; - ret = xen_9pfs_front_alloc_dataring(dev, &priv->rings[i]); + ret = xen_9pfs_front_alloc_dataring(dev, &priv->rings[i], + max_ring_order); if (ret < 0) goto error; } diff --git a/net/Kconfig b/net/Kconfig index 5c524c6ee75d..d1672280d6a4 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -8,7 +8,7 @@ menuconfig NET select NLATTR select GENERIC_NET_UTILS select BPF - ---help--- + help Unless you really know what you are doing, you should say Y here. The reason is that some programs need kernel networking support even when running on a stand-alone machine that isn't connected to any @@ -70,7 +70,7 @@ source "net/xdp/Kconfig" config INET bool "TCP/IP networking" - ---help--- + help These are the protocols used on the Internet and on most local Ethernets. It is highly recommended to say Y here (this will enlarge your kernel by about 400 KB), since some programs (e.g. the X window @@ -121,7 +121,7 @@ config NETWORK_PHY_TIMESTAMPING menuconfig NETFILTER bool "Network packet filtering framework (Netfilter)" - ---help--- + help Netfilter is a framework for filtering and mangling network packets that pass through your Linux box. @@ -192,7 +192,7 @@ config BRIDGE_NETFILTER depends on NETFILTER_ADVANCED select NETFILTER_FAMILY_BRIDGE select SKB_EXTENSIONS - ---help--- + help Enabling this option will let arptables resp. iptables see bridged ARP resp. IP traffic. If you want a bridging firewall, you probably want this option enabled. @@ -268,7 +268,7 @@ config CGROUP_NET_PRIO bool "Network priority cgroup" depends on CGROUPS select SOCK_CGROUP_DATA - ---help--- + help Cgroup subsystem for use in assigning processes to network priorities on a per-interface basis. @@ -276,7 +276,7 @@ config CGROUP_NET_CLASSID bool "Network classid cgroup" depends on CGROUPS select SOCK_CGROUP_DATA - ---help--- + help Cgroup subsystem for use as general purpose socket classid marker that is being used in cls_cgroup and for netfilter matching. @@ -294,7 +294,7 @@ config BPF_JIT bool "enable BPF Just In Time compiler" depends on HAVE_CBPF_JIT || HAVE_EBPF_JIT depends on MODULES - ---help--- + help Berkeley Packet Filter filtering capabilities are normally handled by an interpreter. This option allows kernel to generate a native code when filter is loaded in memory. This should speedup @@ -312,7 +312,7 @@ config BPF_STREAM_PARSER depends on CGROUP_BPF select STREAM_PARSER select NET_SOCK_MSG - ---help--- + help Enabling this allows a stream parser to be used with BPF_MAP_TYPE_SOCKMAP. @@ -324,7 +324,7 @@ config NET_FLOW_LIMIT bool depends on RPS default y - ---help--- + help The network stack has to drop packets when a receive processing CPU's backlog reaches netdev_max_backlog. If a few out of many active flows generate the vast majority of load, drop their traffic earlier to @@ -337,7 +337,7 @@ menu "Network testing" config NET_PKTGEN tristate "Packet Generator (USE WITH CAUTION)" depends on INET && PROC_FS - ---help--- + help This module will inject preconfigured packets, at a configurable rate, out of a given interface. It is used for network interface stress testing and performance analysis. If you don't understand @@ -352,7 +352,7 @@ config NET_PKTGEN config NET_DROP_MONITOR tristate "Network packet drop alerting service" depends on INET && TRACEPOINTS - ---help--- + help This feature provides an alerting service to userspace in the event that packets are discarded in the network stack. 
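(Returning to the net/9p/trans_xen.c hunks above.) The Xen 9p transport stops hard-coding its ring geometry: each ring's order now lives in the shared intf page, XEN_9PFS_RING_SIZE() takes the ring as an argument, and the frontend clamps its preferred order (raised from 6 to 9) to whatever max-ring-page-order the backend advertises, instead of failing with -EINVAL when the backend's order is smaller. maxsize is likewise capped at half the negotiated ring so a request can always fit. A hedged sketch of that negotiation; demo_pick_ring_order() is illustrative, not a function from the patch:

static unsigned int demo_pick_ring_order(unsigned int backend_max_order)
{
        unsigned int order = XEN_9PFS_RING_ORDER;       /* 9 after this patch */

        if (backend_max_order < order)
                order = backend_max_order;      /* accept the backend's smaller ring */
        if (p9_xen_trans.maxsize > XEN_FLEX_RING_SIZE(order))
                p9_xen_trans.maxsize = XEN_FLEX_RING_SIZE(order) / 2;
        return order;
}
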
Alerts are broadcast via netlink socket to any listening user space @@ -398,7 +398,7 @@ source "net/ife/Kconfig" config LWTUNNEL bool "Network light weight tunnels" - ---help--- + help This feature provides an infrastructure to support light weight tunnels like mpls. There is no netdevice associated with a light weight tunnel endpoint. Tunnel encapsulation parameters are stored @@ -408,7 +408,7 @@ config LWTUNNEL_BPF bool "Execute BPF program as route nexthop action" depends on LWTUNNEL && INET default y if LWTUNNEL=y - ---help--- + help Allows to run BPF programs as a nexthop action following a route lookup for incoming and outgoing packets. diff --git a/net/atm/Kconfig b/net/atm/Kconfig index e61dcc9f85b2..77343d57ff2a 100644 --- a/net/atm/Kconfig +++ b/net/atm/Kconfig @@ -5,7 +5,7 @@ config ATM tristate "Asynchronous Transfer Mode (ATM)" - ---help--- + help ATM is a high-speed networking technology for Local Area Networks and Wide Area Networks. It uses a fixed packet size and is connection oriented, allowing for the negotiation of minimum diff --git a/net/atm/lec.c b/net/atm/lec.c index ca37f5a71f5e..875fc0bc1780 100644 --- a/net/atm/lec.c +++ b/net/atm/lec.c @@ -1536,10 +1536,8 @@ static struct lec_arp_table *make_entry(struct lec_priv *priv, struct lec_arp_table *to_return; to_return = kzalloc(sizeof(struct lec_arp_table), GFP_ATOMIC); - if (!to_return) { - pr_info("LEC: Arp entry kmalloc failed\n"); + if (!to_return) return NULL; - } ether_addr_copy(to_return->mac_addr, mac_addr); INIT_HLIST_NODE(&to_return->next); timer_setup(&to_return->timer, lec_arp_expire_arp, 0); diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 0ddd80130ea3..f1f1c86f3419 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -745,6 +745,7 @@ static int batadv_interface_kill_vid(struct net_device *dev, __be16 proto, * separate class since they always nest. */ static struct lock_class_key batadv_netdev_xmit_lock_key; +static struct lock_class_key batadv_netdev_addr_lock_key; /** * batadv_set_lockdep_class_one() - Set lockdep class for a single tx queue @@ -765,6 +766,7 @@ static void batadv_set_lockdep_class_one(struct net_device *dev, */ static void batadv_set_lockdep_class(struct net_device *dev) { + lockdep_set_class(&dev->addr_list_lock, &batadv_netdev_addr_lock_key); netdev_for_each_tx_queue(dev, batadv_set_lockdep_class_one, NULL); } diff --git a/net/bpfilter/Kconfig b/net/bpfilter/Kconfig index fed9290e3b41..84015ef3ee27 100644 --- a/net/bpfilter/Kconfig +++ b/net/bpfilter/Kconfig @@ -9,8 +9,12 @@ menuconfig BPFILTER if BPFILTER config BPFILTER_UMH tristate "bpfilter kernel module with user mode helper" - depends on CC_CAN_LINK + depends on CC_CAN_LINK_STATIC default m help This builds bpfilter kernel module with embedded user mode helper + + Note: your toolchain must support building static binaries, since + rootfs isn't mounted at the time when __init functions are called + and do_execv won't be able to find the elf interpreter. endif diff --git a/net/bpfilter/Makefile b/net/bpfilter/Makefile index 36580301da70..f23b53294fba 100644 --- a/net/bpfilter/Makefile +++ b/net/bpfilter/Makefile @@ -3,17 +3,14 @@ # Makefile for the Linux BPFILTER layer. 
# -hostprogs := bpfilter_umh +userprogs := bpfilter_umh bpfilter_umh-objs := main.o -KBUILD_HOSTCFLAGS += -I $(srctree)/tools/include/ -I $(srctree)/tools/include/uapi -HOSTCC := $(CC) +userccflags += -I $(srctree)/tools/include/ -I $(srctree)/tools/include/uapi -ifeq ($(CONFIG_BPFILTER_UMH), y) -# builtin bpfilter_umh should be compiled with -static +# builtin bpfilter_umh should be linked with -static # since rootfs isn't mounted at the time of __init # function is called and do_execv won't find elf interpreter -KBUILD_HOSTLDFLAGS += -static -endif +userldflags += -static $(obj)/bpfilter_umh_blob.o: $(obj)/bpfilter_umh diff --git a/net/bridge/Kconfig b/net/bridge/Kconfig index 51a6414145d2..80879196560c 100644 --- a/net/bridge/Kconfig +++ b/net/bridge/Kconfig @@ -8,7 +8,7 @@ config BRIDGE select LLC select STP depends on IPV6 || IPV6=n - ---help--- + help If you say Y here, then your Linux box will be able to act as an Ethernet bridge, which means that the different Ethernet segments it is connected to will appear as one Ethernet to the participants. @@ -39,7 +39,7 @@ config BRIDGE_IGMP_SNOOPING depends on BRIDGE depends on INET default y - ---help--- + help If you say Y here, then the Ethernet bridge will be able selectively forward multicast traffic based on IGMP/MLD traffic received from each port. @@ -53,7 +53,7 @@ config BRIDGE_VLAN_FILTERING depends on BRIDGE depends on VLAN_8021Q default n - ---help--- + help If you say Y here, then the Ethernet bridge will be able selectively receive and forward traffic based on VLAN information in the packet any VLAN information configured on the bridge port or bridge device. diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 8ec1362588af..8c7b78f8bc23 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -105,6 +105,13 @@ out: return NETDEV_TX_OK; } +static struct lock_class_key bridge_netdev_addr_lock_key; + +static void br_set_lockdep_class(struct net_device *dev) +{ + lockdep_set_class(&dev->addr_list_lock, &bridge_netdev_addr_lock_key); +} + static int br_dev_init(struct net_device *dev) { struct net_bridge *br = netdev_priv(dev); @@ -143,6 +150,7 @@ static int br_dev_init(struct net_device *dev) br_fdb_hash_fini(br); } + br_set_lockdep_class(dev); return err; } diff --git a/net/caif/Kconfig b/net/caif/Kconfig index b7532a79ca7a..87205251cc25 100644 --- a/net/caif/Kconfig +++ b/net/caif/Kconfig @@ -7,7 +7,7 @@ menuconfig CAIF tristate "CAIF support" select CRC_CCITT default n - ---help--- + help The "Communication CPU to Application CPU Interface" (CAIF) is a packet based connection-oriented MUX protocol developed by ST-Ericsson for use with its modems. It is accessed from user space as sockets (PF_CAIF). @@ -26,7 +26,7 @@ config CAIF_DEBUG bool "Enable Debug" depends on CAIF default n - ---help--- + help Enable the inclusion of debug code in the CAIF stack. Be aware that doing this will impact performance. If unsure say N. @@ -35,7 +35,7 @@ config CAIF_NETDEV tristate "CAIF GPRS Network device" depends on CAIF default CAIF - ---help--- + help Say Y if you will be using a CAIF based GPRS network device. This can be either built-in or a loadable module. If you select to build it as a built-in then the main CAIF device must @@ -46,7 +46,7 @@ config CAIF_USB tristate "CAIF USB support" depends on CAIF default n - ---help--- + help Say Y if you are using CAIF over USB CDC NCM. This can be either built-in or a loadable module. 
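(An aside on the locking hunks above.) The vlan_dev.c, batman-adv soft-interface.c and br_device.c changes all follow one pattern: each stacked device type gives its addr_list_lock a static lockdep class of its own, and vlan additionally passes dev->lower_level as the subclass so the address-list locks of devices stacked on one another nest without false lockdep reports. (The per-netdev dynamic key they replace is removed in the net/core/dev.c hunks further down.) A minimal sketch of the pattern, with illustrative "demo" names:

static struct lock_class_key demo_netdev_addr_lock_key;

static void demo_set_addr_lockdep_class(struct net_device *dev, int subclass)
{
        /* One static key per device type; the subclass encodes stacking
         * depth, so holding an upper and a lower device's addr_list_lock
         * at the same time is not mistaken for recursion. */
        lockdep_set_class_and_subclass(&dev->addr_list_lock,
                                       &demo_netdev_addr_lock_key,
                                       subclass);
}
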
If you select to build it as a built-in then the main CAIF device must diff --git a/net/can/Kconfig b/net/can/Kconfig index d77042752457..25436a715db3 100644 --- a/net/can/Kconfig +++ b/net/can/Kconfig @@ -6,7 +6,7 @@ menuconfig CAN depends on NET tristate "CAN bus subsystem support" - ---help--- + help Controller Area Network (CAN) is a slow (up to 1Mbit/s) serial communications protocol. Development of the CAN bus started in 1983 at Robert Bosch GmbH, and the protocol was officially @@ -23,7 +23,7 @@ if CAN config CAN_RAW tristate "Raw CAN Protocol (raw access with CAN-ID filtering)" default y - ---help--- + help The raw CAN protocol option offers access to the CAN bus via the BSD socket API. You probably want to use the raw socket in most cases where no higher level protocol is being used. The raw @@ -33,7 +33,7 @@ config CAN_RAW config CAN_BCM tristate "Broadcast Manager CAN Protocol (with content filtering)" default y - ---help--- + help The Broadcast Manager offers content filtering, timeout monitoring, sending of RTR frames, and cyclic CAN messages without permanent user interaction. The BCM can be 'programmed' via the BSD socket API and @@ -45,7 +45,7 @@ config CAN_BCM config CAN_GW tristate "CAN Gateway/Router (with netlink configuration)" default y - ---help--- + help The CAN Gateway/Router is used to route (and modify) CAN frames. It is based on the PF_CAN core infrastructure for msg filtering and msg sending and can optionally modify routed CAN frames on the fly. diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 66f22e8aa529..afe0e8184c23 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -176,6 +176,10 @@ int ceph_compare_options(struct ceph_options *new_opt, } } + ret = ceph_compare_crush_locs(&opt1->crush_locs, &opt2->crush_locs); + if (ret) + return ret; + /* any matching mon ip implies a match */ for (i = 0; i < opt1->num_mon; i++) { if (ceph_monmap_contains(client->monc.monmap, @@ -259,6 +263,8 @@ enum { Opt_secret, Opt_key, Opt_ip, + Opt_crush_location, + Opt_read_from_replica, /* string args above */ Opt_share, Opt_crc, @@ -268,11 +274,25 @@ enum { Opt_abort_on_full, }; +enum { + Opt_read_from_replica_no, + Opt_read_from_replica_balance, + Opt_read_from_replica_localize, +}; + +static const struct constant_table ceph_param_read_from_replica[] = { + {"no", Opt_read_from_replica_no}, + {"balance", Opt_read_from_replica_balance}, + {"localize", Opt_read_from_replica_localize}, + {} +}; + static const struct fs_parameter_spec ceph_parameters[] = { fsparam_flag ("abort_on_full", Opt_abort_on_full), fsparam_flag_no ("cephx_require_signatures", Opt_cephx_require_signatures), fsparam_flag_no ("cephx_sign_messages", Opt_cephx_sign_messages), fsparam_flag_no ("crc", Opt_crc), + fsparam_string ("crush_location", Opt_crush_location), fsparam_string ("fsid", Opt_fsid), fsparam_string ("ip", Opt_ip), fsparam_string ("key", Opt_key), @@ -283,6 +303,8 @@ static const struct fs_parameter_spec ceph_parameters[] = { fsparam_u32 ("osdkeepalive", Opt_osdkeepalivetimeout), __fsparam (fs_param_is_s32, "osdtimeout", Opt_osdtimeout, fs_param_deprecated, NULL), + fsparam_enum ("read_from_replica", Opt_read_from_replica, + ceph_param_read_from_replica), fsparam_string ("secret", Opt_secret), fsparam_flag_no ("share", Opt_share), fsparam_flag_no ("tcp_nodelay", Opt_tcp_nodelay), @@ -297,6 +319,7 @@ struct ceph_options *ceph_alloc_options(void) if (!opt) return NULL; + opt->crush_locs = RB_ROOT; opt->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*opt->mon_addr), 
GFP_KERNEL); if (!opt->mon_addr) { @@ -319,6 +342,7 @@ void ceph_destroy_options(struct ceph_options *opt) if (!opt) return; + ceph_clear_crush_locs(&opt->crush_locs); kfree(opt->name); if (opt->key) { ceph_crypto_key_destroy(opt->key); @@ -453,6 +477,34 @@ int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt, if (!opt->key) return -ENOMEM; return get_secret(opt->key, param->string, &log); + case Opt_crush_location: + ceph_clear_crush_locs(&opt->crush_locs); + err = ceph_parse_crush_location(param->string, + &opt->crush_locs); + if (err) { + error_plog(&log, "Failed to parse CRUSH location: %d", + err); + return err; + } + break; + case Opt_read_from_replica: + switch (result.uint_32) { + case Opt_read_from_replica_no: + opt->osd_req_flags &= ~(CEPH_OSD_FLAG_BALANCE_READS | + CEPH_OSD_FLAG_LOCALIZE_READS); + break; + case Opt_read_from_replica_balance: + opt->osd_req_flags |= CEPH_OSD_FLAG_BALANCE_READS; + opt->osd_req_flags &= ~CEPH_OSD_FLAG_LOCALIZE_READS; + break; + case Opt_read_from_replica_localize: + opt->osd_req_flags |= CEPH_OSD_FLAG_LOCALIZE_READS; + opt->osd_req_flags &= ~CEPH_OSD_FLAG_BALANCE_READS; + break; + default: + BUG(); + } + break; case Opt_osdtimeout: warn_plog(&log, "Ignoring osdtimeout"); @@ -535,6 +587,7 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client, { struct ceph_options *opt = client->options; size_t pos = m->count; + struct rb_node *n; if (opt->name) { seq_puts(m, "name="); @@ -544,6 +597,28 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client, if (opt->key) seq_puts(m, "secret=<hidden>,"); + if (!RB_EMPTY_ROOT(&opt->crush_locs)) { + seq_puts(m, "crush_location="); + for (n = rb_first(&opt->crush_locs); ; ) { + struct crush_loc_node *loc = + rb_entry(n, struct crush_loc_node, cl_node); + + seq_printf(m, "%s:%s", loc->cl_loc.cl_type_name, + loc->cl_loc.cl_name); + n = rb_next(n); + if (!n) + break; + + seq_putc(m, '|'); + } + seq_putc(m, ','); + } + if (opt->osd_req_flags & CEPH_OSD_FLAG_BALANCE_READS) { + seq_puts(m, "read_from_replica=balance,"); + } else if (opt->osd_req_flags & CEPH_OSD_FLAG_LOCALIZE_READS) { + seq_puts(m, "read_from_replica=localize,"); + } + if (opt->flags & CEPH_OPT_FSID) seq_printf(m, "fsid=%pU,", &opt->fsid); if (opt->flags & CEPH_OPT_NOSHARE) diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c index 3d70244bc1b6..254ded0b05f6 100644 --- a/net/ceph/crush/crush.c +++ b/net/ceph/crush/crush.c @@ -2,7 +2,6 @@ #ifdef __KERNEL__ # include <linux/slab.h> # include <linux/crush/crush.h> -void clear_choose_args(struct crush_map *c); #else # include "crush_compat.h" # include "crush.h" @@ -130,6 +129,8 @@ void crush_destroy(struct crush_map *map) #ifndef __KERNEL__ kfree(map->choose_tries); #else + clear_crush_names(&map->type_names); + clear_crush_names(&map->names); clear_choose_args(map); #endif kfree(map); diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 1344f232ecc5..409d505ff320 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -81,11 +81,13 @@ static int osdmap_show(struct seq_file *s, void *p) u32 state = map->osd_state[i]; char sb[64]; - seq_printf(s, "osd%d\t%s\t%3d%%\t(%s)\t%3d%%\n", + seq_printf(s, "osd%d\t%s\t%3d%%\t(%s)\t%3d%%\t%2d\n", i, ceph_pr_addr(addr), ((map->osd_weight[i]*100) >> 16), ceph_osdmap_state_str(sb, sizeof(sb), state), - ((ceph_get_primary_affinity(map, i)*100) >> 16)); + ((ceph_get_primary_affinity(map, i)*100) >> 16), + ceph_get_crush_locality(map, i, + &client->options->crush_locs)); } for (n = 
rb_first(&map->pg_temp); n; n = rb_next(n)) { struct ceph_pg_mapping *pg = diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 1d4973f8cd7a..4fea3c33af2a 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -932,10 +932,14 @@ static void osd_req_op_watch_init(struct ceph_osd_request *req, int which, op->watch.gen = 0; } +/* + * @flags: CEPH_OSD_OP_ALLOC_HINT_FLAG_* + */ void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req, unsigned int which, u64 expected_object_size, - u64 expected_write_size) + u64 expected_write_size, + u32 flags) { struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, CEPH_OSD_OP_SETALLOCHINT, @@ -943,6 +947,7 @@ void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req, op->alloc_hint.expected_object_size = expected_object_size; op->alloc_hint.expected_write_size = expected_write_size; + op->alloc_hint.flags = flags; /* * CEPH_OSD_OP_SETALLOCHINT op is advisory and therefore deemed @@ -1018,6 +1023,7 @@ static u32 osd_req_encode_op(struct ceph_osd_op *dst, cpu_to_le64(src->alloc_hint.expected_object_size); dst->alloc_hint.expected_write_size = cpu_to_le64(src->alloc_hint.expected_write_size); + dst->alloc_hint.flags = cpu_to_le32(src->alloc_hint.flags); break; case CEPH_OSD_OP_SETXATTR: case CEPH_OSD_OP_CMPXATTR: @@ -1497,6 +1503,45 @@ static bool target_should_be_paused(struct ceph_osd_client *osdc, (osdc->osdmap->epoch < osdc->epoch_barrier); } +static int pick_random_replica(const struct ceph_osds *acting) +{ + int i = prandom_u32() % acting->size; + + dout("%s picked osd%d, primary osd%d\n", __func__, + acting->osds[i], acting->primary); + return i; +} + +/* + * Picks the closest replica based on client's location given by + * crush_location option. Prefers the primary if the locality is + * the same. 
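(A note on the replica-read machinery added here and in the ceph_common.c hunks earlier.) The client's own position in the CRUSH hierarchy comes from crush_location, a set of <type>:<name> pairs joined by '|', and the read policy is one of the no/balance/localize keywords from the patch's constant table. Sample option strings (values illustrative, keywords from the patch):

        crush_location=host:gw1|rack:r1
        read_from_replica=no        -- default: always read from the primary
        read_from_replica=balance   -- CEPH_OSD_FLAG_BALANCE_READS, random acting OSD
        read_from_replica=localize  -- CEPH_OSD_FLAG_LOCALIZE_READS, CRUSH-closest acting OSD

account_request() ORs the chosen osd_req_flags into every request, and calc_target() honours them only for reads to replicated pools whose acting set has more than one OSD.
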
+ */ +static int pick_closest_replica(struct ceph_osd_client *osdc, + const struct ceph_osds *acting) +{ + struct ceph_options *opt = osdc->client->options; + int best_i, best_locality; + int i = 0, locality; + + do { + locality = ceph_get_crush_locality(osdc->osdmap, + acting->osds[i], + &opt->crush_locs); + if (i == 0 || + (locality >= 0 && best_locality < 0) || + (locality >= 0 && best_locality >= 0 && + locality < best_locality)) { + best_i = i; + best_locality = locality; + } + } while (++i < acting->size); + + dout("%s picked osd%d with locality %d, primary osd%d\n", __func__, + acting->osds[best_i], best_locality, acting->primary); + return best_i; +} + enum calc_target_result { CALC_TARGET_NO_ACTION = 0, CALC_TARGET_NEED_RESEND, @@ -1510,6 +1555,8 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc, struct ceph_pg_pool_info *pi; struct ceph_pg pgid, last_pgid; struct ceph_osds up, acting; + bool is_read = t->flags & CEPH_OSD_FLAG_READ; + bool is_write = t->flags & CEPH_OSD_FLAG_WRITE; bool force_resend = false; bool unpaused = false; bool legacy_change = false; @@ -1540,9 +1587,9 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc, ceph_oid_copy(&t->target_oid, &t->base_oid); ceph_oloc_copy(&t->target_oloc, &t->base_oloc); if ((t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) { - if (t->flags & CEPH_OSD_FLAG_READ && pi->read_tier >= 0) + if (is_read && pi->read_tier >= 0) t->target_oloc.pool = pi->read_tier; - if (t->flags & CEPH_OSD_FLAG_WRITE && pi->write_tier >= 0) + if (is_write && pi->write_tier >= 0) t->target_oloc.pool = pi->write_tier; pi = ceph_pg_pool_by_id(osdc->osdmap, t->target_oloc.pool); @@ -1581,7 +1628,8 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc, unpaused = true; } legacy_change = ceph_pg_compare(&t->pgid, &pgid) || - ceph_osds_changed(&t->acting, &acting, any_change); + ceph_osds_changed(&t->acting, &acting, + t->used_replica || any_change); if (t->pg_num) split = ceph_pg_is_split(&last_pgid, t->pg_num, pi->pg_num); @@ -1597,7 +1645,24 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc, t->sort_bitwise = sort_bitwise; t->recovery_deletes = recovery_deletes; - t->osd = acting.primary; + if ((t->flags & (CEPH_OSD_FLAG_BALANCE_READS | + CEPH_OSD_FLAG_LOCALIZE_READS)) && + !is_write && pi->type == CEPH_POOL_TYPE_REP && + acting.size > 1) { + int pos; + + WARN_ON(!is_read || acting.osds[0] != acting.primary); + if (t->flags & CEPH_OSD_FLAG_BALANCE_READS) { + pos = pick_random_replica(&acting); + } else { + pos = pick_closest_replica(osdc, &acting); + } + t->osd = acting.osds[pos]; + t->used_replica = pos > 0; + } else { + t->osd = acting.primary; + t->used_replica = false; + } } if (unpaused || legacy_change || force_resend || split) @@ -2366,13 +2431,17 @@ promote: static void account_request(struct ceph_osd_request *req) { + struct ceph_osd_client *osdc = req->r_osdc; + WARN_ON(req->r_flags & (CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK)); WARN_ON(!(req->r_flags & (CEPH_OSD_FLAG_READ | CEPH_OSD_FLAG_WRITE))); req->r_flags |= CEPH_OSD_FLAG_ONDISK; - atomic_inc(&req->r_osdc->num_requests); + req->r_flags |= osdc->client->options->osd_req_flags; + atomic_inc(&osdc->num_requests); req->r_start_stamp = jiffies; + req->r_start_latency = ktime_get(); } static void submit_request(struct ceph_osd_request *req, bool wrlocked) @@ -2389,6 +2458,8 @@ static void finish_request(struct ceph_osd_request *req) WARN_ON(lookup_request_mc(&osdc->map_checks, req->r_tid)); dout("%s req %p tid 
%llu\n", __func__, req, req->r_tid); + req->r_end_latency = ktime_get(); + if (req->r_osd) unlink_request(req->r_osd, req); atomic_dec(&osdc->num_requests); @@ -3657,6 +3728,26 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg) goto out_unlock_osdc; } + if (m.result == -EAGAIN) { + dout("req %p tid %llu EAGAIN\n", req, req->r_tid); + unlink_request(osd, req); + mutex_unlock(&osd->lock); + + /* + * The object is missing on the replica or not (yet) + * readable. Clear pgid to force a resend to the primary + * via legacy_change. + */ + req->r_t.pgid.pool = 0; + req->r_t.pgid.seed = 0; + WARN_ON(!req->r_t.used_replica); + req->r_flags &= ~(CEPH_OSD_FLAG_BALANCE_READS | + CEPH_OSD_FLAG_LOCALIZE_READS); + req->r_tid = 0; + __submit_request(req, false); + goto out_unlock_osdc; + } + if (m.num_ops != req->r_num_ops) { pr_err("num_ops %d != %d for tid %llu\n", m.num_ops, req->r_num_ops, req->r_tid); diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 2a6e63a8edbe..96c25f5e064a 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -138,6 +138,79 @@ bad: return -EINVAL; } +struct crush_name_node { + struct rb_node cn_node; + int cn_id; + char cn_name[]; +}; + +static struct crush_name_node *alloc_crush_name(size_t name_len) +{ + struct crush_name_node *cn; + + cn = kmalloc(sizeof(*cn) + name_len + 1, GFP_NOIO); + if (!cn) + return NULL; + + RB_CLEAR_NODE(&cn->cn_node); + return cn; +} + +static void free_crush_name(struct crush_name_node *cn) +{ + WARN_ON(!RB_EMPTY_NODE(&cn->cn_node)); + + kfree(cn); +} + +DEFINE_RB_FUNCS(crush_name, struct crush_name_node, cn_id, cn_node) + +static int decode_crush_names(void **p, void *end, struct rb_root *root) +{ + u32 n; + + ceph_decode_32_safe(p, end, n, e_inval); + while (n--) { + struct crush_name_node *cn; + int id; + u32 name_len; + + ceph_decode_32_safe(p, end, id, e_inval); + ceph_decode_32_safe(p, end, name_len, e_inval); + ceph_decode_need(p, end, name_len, e_inval); + + cn = alloc_crush_name(name_len); + if (!cn) + return -ENOMEM; + + cn->cn_id = id; + memcpy(cn->cn_name, *p, name_len); + cn->cn_name[name_len] = '\0'; + *p += name_len; + + if (!__insert_crush_name(root, cn)) { + free_crush_name(cn); + return -EEXIST; + } + } + + return 0; + +e_inval: + return -EINVAL; +} + +void clear_crush_names(struct rb_root *root) +{ + while (!RB_EMPTY_ROOT(root)) { + struct crush_name_node *cn = + rb_entry(rb_first(root), struct crush_name_node, cn_node); + + erase_crush_name(root, cn); + free_crush_name(cn); + } +} + static struct crush_choose_arg_map *alloc_choose_arg_map(void) { struct crush_choose_arg_map *arg_map; @@ -354,6 +427,8 @@ static struct crush_map *crush_decode(void *pbyval, void *end) if (c == NULL) return ERR_PTR(-ENOMEM); + c->type_names = RB_ROOT; + c->names = RB_ROOT; c->choose_args = RB_ROOT; /* set tunables to default values */ @@ -510,8 +585,14 @@ static struct crush_map *crush_decode(void *pbyval, void *end) } } - ceph_decode_skip_map(p, end, 32, string, bad); /* type_map */ - ceph_decode_skip_map(p, end, 32, string, bad); /* name_map */ + err = decode_crush_names(p, end, &c->type_names); + if (err) + goto fail; + + err = decode_crush_names(p, end, &c->names); + if (err) + goto fail; + ceph_decode_skip_map(p, end, 32, string, bad); /* rule_name_map */ /* tunables */ @@ -636,48 +717,11 @@ DEFINE_RB_FUNCS2(pg_mapping, struct ceph_pg_mapping, pgid, ceph_pg_compare, /* * rbtree of pg pool info */ -static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new) -{ - struct rb_node **p = 
&root->rb_node; - struct rb_node *parent = NULL; - struct ceph_pg_pool_info *pi = NULL; - - while (*p) { - parent = *p; - pi = rb_entry(parent, struct ceph_pg_pool_info, node); - if (new->id < pi->id) - p = &(*p)->rb_left; - else if (new->id > pi->id) - p = &(*p)->rb_right; - else - return -EEXIST; - } - - rb_link_node(&new->node, parent, p); - rb_insert_color(&new->node, root); - return 0; -} - -static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, u64 id) -{ - struct ceph_pg_pool_info *pi; - struct rb_node *n = root->rb_node; - - while (n) { - pi = rb_entry(n, struct ceph_pg_pool_info, node); - if (id < pi->id) - n = n->rb_left; - else if (id > pi->id) - n = n->rb_right; - else - return pi; - } - return NULL; -} +DEFINE_RB_FUNCS(pg_pool, struct ceph_pg_pool_info, id, node) struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, u64 id) { - return __lookup_pg_pool(&map->pg_pools, id); + return lookup_pg_pool(&map->pg_pools, id); } const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id) @@ -690,8 +734,7 @@ const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id) if (WARN_ON_ONCE(id > (u64) INT_MAX)) return NULL; - pi = __lookup_pg_pool(&map->pg_pools, (int) id); - + pi = lookup_pg_pool(&map->pg_pools, id); return pi ? pi->name : NULL; } EXPORT_SYMBOL(ceph_pg_pool_name_by_id); @@ -714,14 +757,14 @@ u64 ceph_pg_pool_flags(struct ceph_osdmap *map, u64 id) { struct ceph_pg_pool_info *pi; - pi = __lookup_pg_pool(&map->pg_pools, id); + pi = lookup_pg_pool(&map->pg_pools, id); return pi ? pi->flags : 0; } EXPORT_SYMBOL(ceph_pg_pool_flags); static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi) { - rb_erase(&pi->node, root); + erase_pg_pool(root, pi); kfree(pi->name); kfree(pi); } @@ -903,7 +946,7 @@ static int decode_pool_names(void **p, void *end, struct ceph_osdmap *map) ceph_decode_32_safe(p, end, len, bad); dout(" pool %llu len %d\n", pool, len); ceph_decode_need(p, end, len, bad); - pi = __lookup_pg_pool(&map->pg_pools, pool); + pi = lookup_pg_pool(&map->pg_pools, pool); if (pi) { char *name = kstrndup(*p, len, GFP_NOFS); @@ -1154,18 +1197,18 @@ static int __decode_pools(void **p, void *end, struct ceph_osdmap *map, ceph_decode_64_safe(p, end, pool, e_inval); - pi = __lookup_pg_pool(&map->pg_pools, pool); + pi = lookup_pg_pool(&map->pg_pools, pool); if (!incremental || !pi) { pi = kzalloc(sizeof(*pi), GFP_NOFS); if (!pi) return -ENOMEM; + RB_CLEAR_NODE(&pi->node); pi->id = pool; - ret = __insert_pg_pool(&map->pg_pools, pi); - if (ret) { + if (!__insert_pg_pool(&map->pg_pools, pi)) { kfree(pi); - return ret; + return -EEXIST; } } @@ -1829,7 +1872,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, struct ceph_pg_pool_info *pi; ceph_decode_64_safe(p, end, pool, e_inval); - pi = __lookup_pg_pool(&map->pg_pools, pool); + pi = lookup_pg_pool(&map->pg_pools, pool); if (pi) __remove_pg_pool(&map->pg_pools, pi); } @@ -2672,3 +2715,221 @@ int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap, return acting.primary; } EXPORT_SYMBOL(ceph_pg_to_acting_primary); + +static struct crush_loc_node *alloc_crush_loc(size_t type_name_len, + size_t name_len) +{ + struct crush_loc_node *loc; + + loc = kmalloc(sizeof(*loc) + type_name_len + name_len + 2, GFP_NOIO); + if (!loc) + return NULL; + + RB_CLEAR_NODE(&loc->cl_node); + return loc; +} + +static void free_crush_loc(struct crush_loc_node *loc) +{ + WARN_ON(!RB_EMPTY_NODE(&loc->cl_node)); + + kfree(loc); +} + +static int crush_loc_compare(const struct 
crush_loc *loc1, + const struct crush_loc *loc2) +{ + return strcmp(loc1->cl_type_name, loc2->cl_type_name) ?: + strcmp(loc1->cl_name, loc2->cl_name); +} + +DEFINE_RB_FUNCS2(crush_loc, struct crush_loc_node, cl_loc, crush_loc_compare, + RB_BYPTR, const struct crush_loc *, cl_node) + +/* + * Parses a set of <bucket type name>':'<bucket name> pairs separated + * by '|', e.g. "rack:foo1|rack:foo2|datacenter:bar". + * + * Note that @crush_location is modified by strsep(). + */ +int ceph_parse_crush_location(char *crush_location, struct rb_root *locs) +{ + struct crush_loc_node *loc; + const char *type_name, *name, *colon; + size_t type_name_len, name_len; + + dout("%s '%s'\n", __func__, crush_location); + while ((type_name = strsep(&crush_location, "|"))) { + colon = strchr(type_name, ':'); + if (!colon) + return -EINVAL; + + type_name_len = colon - type_name; + if (type_name_len == 0) + return -EINVAL; + + name = colon + 1; + name_len = strlen(name); + if (name_len == 0) + return -EINVAL; + + loc = alloc_crush_loc(type_name_len, name_len); + if (!loc) + return -ENOMEM; + + loc->cl_loc.cl_type_name = loc->cl_data; + memcpy(loc->cl_loc.cl_type_name, type_name, type_name_len); + loc->cl_loc.cl_type_name[type_name_len] = '\0'; + + loc->cl_loc.cl_name = loc->cl_data + type_name_len + 1; + memcpy(loc->cl_loc.cl_name, name, name_len); + loc->cl_loc.cl_name[name_len] = '\0'; + + if (!__insert_crush_loc(locs, loc)) { + free_crush_loc(loc); + return -EEXIST; + } + + dout("%s type_name '%s' name '%s'\n", __func__, + loc->cl_loc.cl_type_name, loc->cl_loc.cl_name); + } + + return 0; +} + +int ceph_compare_crush_locs(struct rb_root *locs1, struct rb_root *locs2) +{ + struct rb_node *n1 = rb_first(locs1); + struct rb_node *n2 = rb_first(locs2); + int ret; + + for ( ; n1 && n2; n1 = rb_next(n1), n2 = rb_next(n2)) { + struct crush_loc_node *loc1 = + rb_entry(n1, struct crush_loc_node, cl_node); + struct crush_loc_node *loc2 = + rb_entry(n2, struct crush_loc_node, cl_node); + + ret = crush_loc_compare(&loc1->cl_loc, &loc2->cl_loc); + if (ret) + return ret; + } + + if (!n1 && n2) + return -1; + if (n1 && !n2) + return 1; + return 0; +} + +void ceph_clear_crush_locs(struct rb_root *locs) +{ + while (!RB_EMPTY_ROOT(locs)) { + struct crush_loc_node *loc = + rb_entry(rb_first(locs), struct crush_loc_node, cl_node); + + erase_crush_loc(locs, loc); + free_crush_loc(loc); + } +} + +/* + * [a-zA-Z0-9-_.]+ + */ +static bool is_valid_crush_name(const char *name) +{ + do { + if (!('a' <= *name && *name <= 'z') && + !('A' <= *name && *name <= 'Z') && + !('0' <= *name && *name <= '9') && + *name != '-' && *name != '_' && *name != '.') + return false; + } while (*++name != '\0'); + + return true; +} + +/* + * Gets the parent of an item. Returns its id (<0 because the + * parent is always a bucket), type id (>0 for the same reason, + * via @parent_type_id) and location (via @parent_loc). If no + * parent, returns 0. + * + * Does a linear search, as there are no parent pointers of any + * kind. Note that the result is ambigous for items that occur + * multiple times in the map. 
+ */ +static int get_immediate_parent(struct crush_map *c, int id, + u16 *parent_type_id, + struct crush_loc *parent_loc) +{ + struct crush_bucket *b; + struct crush_name_node *type_cn, *cn; + int i, j; + + for (i = 0; i < c->max_buckets; i++) { + b = c->buckets[i]; + if (!b) + continue; + + /* ignore per-class shadow hierarchy */ + cn = lookup_crush_name(&c->names, b->id); + if (!cn || !is_valid_crush_name(cn->cn_name)) + continue; + + for (j = 0; j < b->size; j++) { + if (b->items[j] != id) + continue; + + *parent_type_id = b->type; + type_cn = lookup_crush_name(&c->type_names, b->type); + parent_loc->cl_type_name = type_cn->cn_name; + parent_loc->cl_name = cn->cn_name; + return b->id; + } + } + + return 0; /* no parent */ +} + +/* + * Calculates the locality/distance from an item to a client + * location expressed in terms of CRUSH hierarchy as a set of + * (bucket type name, bucket name) pairs. Specifically, looks + * for the lowest-valued bucket type for which the location of + * @id matches one of the locations in @locs, so for standard + * bucket types (host = 1, rack = 3, datacenter = 8, zone = 9) + * a matching host is closer than a matching rack and a matching + * data center is closer than a matching zone. + * + * Specifying multiple locations (a "multipath" location) such + * as "rack=foo1 rack=foo2 datacenter=bar" is allowed -- @locs + * is a multimap. The locality will be: + * + * - 3 for OSDs in racks foo1 and foo2 + * - 8 for OSDs in data center bar + * - -1 for all other OSDs + * + * The lowest possible bucket type is 1, so the best locality + * for an OSD is 1 (i.e. a matching host). Locality 0 would be + * the OSD itself. + */ +int ceph_get_crush_locality(struct ceph_osdmap *osdmap, int id, + struct rb_root *locs) +{ + struct crush_loc loc; + u16 type_id; + + /* + * Instead of repeated get_immediate_parent() calls, + * the location of @id could be obtained with a single + * depth-first traversal. 
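(Usage note for the CRUSH-location helpers above.) ceph_parse_crush_location() consumes its input destructively via strsep(), so callers must hand it a writable buffer. A hedged, self-contained sketch -- demo_set_crush_location() is not from the patch, only the helpers it calls are:

static int demo_set_crush_location(struct rb_root *locs)
{
        /* the example string comes from the parser's own comment */
        char loc[] = "rack:foo1|rack:foo2|datacenter:bar";
        int err;

        err = ceph_parse_crush_location(loc, locs);
        if (err)        /* -EINVAL bad syntax, -ENOMEM, -EEXIST duplicate pair */
                return err;

        /* ... consult the tree, e.g. through ceph_get_crush_locality() ... */

        ceph_clear_crush_locs(locs);    /* frees every crush_loc_node */
        return 0;
}
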
+ */ + for (;;) { + id = get_immediate_parent(osdmap->crush, id, &type_id, &loc); + if (id >= 0) + return -1; /* not local */ + + if (lookup_crush_loc(locs, &loc)) + return type_id; + } +} diff --git a/net/core/dev.c b/net/core/dev.c index 10684833f864..90b59fc50dc9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -79,6 +79,7 @@ #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/mutex.h> +#include <linux/rwsem.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/socket.h> @@ -194,7 +195,7 @@ static DEFINE_SPINLOCK(napi_hash_lock); static unsigned int napi_gen_id = NR_CPUS; static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8); -static seqcount_t devnet_rename_seq; +static DECLARE_RWSEM(devnet_rename_sem); static inline void dev_base_seq_inc(struct net *net) { @@ -438,6 +439,7 @@ static const char *const netdev_lock_name[] = { "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"}; static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)]; +static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)]; static inline unsigned short netdev_lock_pos(unsigned short dev_type) { @@ -459,11 +461,25 @@ static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i], netdev_lock_name[i]); } + +static inline void netdev_set_addr_lockdep_class(struct net_device *dev) +{ + int i; + + i = netdev_lock_pos(dev->type); + lockdep_set_class_and_name(&dev->addr_list_lock, + &netdev_addr_lock_key[i], + netdev_lock_name[i]); +} #else static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, unsigned short dev_type) { } + +static inline void netdev_set_addr_lockdep_class(struct net_device *dev) +{ +} #endif /******************************************************************************* @@ -998,33 +1014,28 @@ EXPORT_SYMBOL(dev_get_by_napi_id); * @net: network namespace * @name: a pointer to the buffer where the name will be stored. * @ifindex: the ifindex of the interface to get the name from. - * - * The use of raw_seqcount_begin() and cond_resched() before - * retrying is required as we want to give the writers a chance - * to complete when CONFIG_PREEMPTION is not set. 
*/ int netdev_get_name(struct net *net, char *name, int ifindex) { struct net_device *dev; - unsigned int seq; + int ret; -retry: - seq = raw_seqcount_begin(&devnet_rename_seq); + down_read(&devnet_rename_sem); rcu_read_lock(); + dev = dev_get_by_index_rcu(net, ifindex); if (!dev) { - rcu_read_unlock(); - return -ENODEV; + ret = -ENODEV; + goto out; } strcpy(name, dev->name); - rcu_read_unlock(); - if (read_seqcount_retry(&devnet_rename_seq, seq)) { - cond_resched(); - goto retry; - } - return 0; + ret = 0; +out: + rcu_read_unlock(); + up_read(&devnet_rename_sem); + return ret; } /** @@ -1296,10 +1307,10 @@ int dev_change_name(struct net_device *dev, const char *newname) likely(!(dev->priv_flags & IFF_LIVE_RENAME_OK))) return -EBUSY; - write_seqcount_begin(&devnet_rename_seq); + down_write(&devnet_rename_sem); if (strncmp(newname, dev->name, IFNAMSIZ) == 0) { - write_seqcount_end(&devnet_rename_seq); + up_write(&devnet_rename_sem); return 0; } @@ -1307,7 +1318,7 @@ int dev_change_name(struct net_device *dev, const char *newname) err = dev_get_valid_name(net, dev, newname); if (err < 0) { - write_seqcount_end(&devnet_rename_seq); + up_write(&devnet_rename_sem); return err; } @@ -1322,11 +1333,11 @@ rollback: if (ret) { memcpy(dev->name, oldname, IFNAMSIZ); dev->name_assign_type = old_assign_type; - write_seqcount_end(&devnet_rename_seq); + up_write(&devnet_rename_sem); return ret; } - write_seqcount_end(&devnet_rename_seq); + up_write(&devnet_rename_sem); netdev_adjacent_rename_links(dev, oldname); @@ -1347,7 +1358,7 @@ rollback: /* err >= 0 after dev_alloc_name() or stores the first errno */ if (err >= 0) { err = ret; - write_seqcount_begin(&devnet_rename_seq); + down_write(&devnet_rename_sem); memcpy(dev->name, oldname, IFNAMSIZ); memcpy(oldname, newname, IFNAMSIZ); dev->name_assign_type = old_assign_type; @@ -4181,10 +4192,12 @@ int dev_direct_xmit(struct sk_buff *skb, u16 queue_id) local_bh_disable(); + dev_xmit_recursion_inc(); HARD_TX_LOCK(dev, txq, smp_processor_id()); if (!netif_xmit_frozen_or_drv_stopped(txq)) ret = netdev_start_xmit(skb, dev, txq, false); HARD_TX_UNLOCK(dev, txq); + dev_xmit_recursion_dec(); local_bh_enable(); @@ -9377,15 +9390,6 @@ void netif_tx_stop_all_queues(struct net_device *dev) } EXPORT_SYMBOL(netif_tx_stop_all_queues); -void netdev_update_lockdep_key(struct net_device *dev) -{ - lockdep_unregister_key(&dev->addr_list_lock_key); - lockdep_register_key(&dev->addr_list_lock_key); - - lockdep_set_class(&dev->addr_list_lock, &dev->addr_list_lock_key); -} -EXPORT_SYMBOL(netdev_update_lockdep_key); - /** * register_netdevice - register a network device * @dev: device to register @@ -9424,7 +9428,7 @@ int register_netdevice(struct net_device *dev) return ret; spin_lock_init(&dev->addr_list_lock); - lockdep_set_class(&dev->addr_list_lock, &dev->addr_list_lock_key); + netdev_set_addr_lockdep_class(dev); ret = dev_get_valid_name(net, dev, dev->name); if (ret < 0) @@ -9545,6 +9549,13 @@ int register_netdevice(struct net_device *dev) rcu_barrier(); dev->reg_state = NETREG_UNREGISTERED; + /* We should put the kobject that hold in + * netdev_unregister_kobject(), otherwise + * the net device cannot be freed when + * driver calls free_netdev(), because the + * kobject is being hold. 
+ */ + kobject_put(&dev->dev.kobj); } /* * Prevent userspace races by waiting until the network @@ -9943,8 +9954,6 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev_net_set(dev, &init_net); - lockdep_register_key(&dev->addr_list_lock_key); - dev->gso_max_size = GSO_MAX_SIZE; dev->gso_max_segs = GSO_MAX_SEGS; dev->upper_level = 1; @@ -10032,8 +10041,6 @@ void free_netdev(struct net_device *dev) free_percpu(dev->xdp_bulkq); dev->xdp_bulkq = NULL; - lockdep_unregister_key(&dev->addr_list_lock_key); - /* Compatibility with error handling in drivers */ if (dev->reg_state == NETREG_UNINITIALIZED) { netdev_freemem(dev); diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index 2f949b5a1eb9..6393ba930097 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -637,7 +637,7 @@ int dev_uc_sync(struct net_device *to, struct net_device *from) if (to->addr_len != from->addr_len) return -EINVAL; - netif_addr_lock(to); + netif_addr_lock_nested(to); err = __hw_addr_sync(&to->uc, &from->uc, to->addr_len); if (!err) __dev_set_rx_mode(to); @@ -667,7 +667,7 @@ int dev_uc_sync_multiple(struct net_device *to, struct net_device *from) if (to->addr_len != from->addr_len) return -EINVAL; - netif_addr_lock(to); + netif_addr_lock_nested(to); err = __hw_addr_sync_multiple(&to->uc, &from->uc, to->addr_len); if (!err) __dev_set_rx_mode(to); @@ -691,7 +691,7 @@ void dev_uc_unsync(struct net_device *to, struct net_device *from) return; netif_addr_lock_bh(from); - netif_addr_lock(to); + netif_addr_lock_nested(to); __hw_addr_unsync(&to->uc, &from->uc, to->addr_len); __dev_set_rx_mode(to); netif_addr_unlock(to); @@ -858,7 +858,7 @@ int dev_mc_sync(struct net_device *to, struct net_device *from) if (to->addr_len != from->addr_len) return -EINVAL; - netif_addr_lock(to); + netif_addr_lock_nested(to); err = __hw_addr_sync(&to->mc, &from->mc, to->addr_len); if (!err) __dev_set_rx_mode(to); @@ -888,7 +888,7 @@ int dev_mc_sync_multiple(struct net_device *to, struct net_device *from) if (to->addr_len != from->addr_len) return -EINVAL; - netif_addr_lock(to); + netif_addr_lock_nested(to); err = __hw_addr_sync_multiple(&to->mc, &from->mc, to->addr_len); if (!err) __dev_set_rx_mode(to); @@ -912,7 +912,7 @@ void dev_mc_unsync(struct net_device *to, struct net_device *from) return; netif_addr_lock_bh(from); - netif_addr_lock(to); + netif_addr_lock_nested(to); __hw_addr_unsync(&to->mc, &from->mc, to->addr_len); __dev_set_rx_mode(to); netif_addr_unlock(to); diff --git a/net/core/filter.c b/net/core/filter.c index d01a244b5087..73395384afe2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1755,25 +1755,27 @@ BPF_CALL_5(bpf_skb_load_bytes_relative, const struct sk_buff *, skb, u32, offset, void *, to, u32, len, u32, start_header) { u8 *end = skb_tail_pointer(skb); - u8 *net = skb_network_header(skb); - u8 *mac = skb_mac_header(skb); - u8 *ptr; + u8 *start, *ptr; - if (unlikely(offset > 0xffff || len > (end - mac))) + if (unlikely(offset > 0xffff)) goto err_clear; switch (start_header) { case BPF_HDR_START_MAC: - ptr = mac + offset; + if (unlikely(!skb_mac_header_was_set(skb))) + goto err_clear; + start = skb_mac_header(skb); break; case BPF_HDR_START_NET: - ptr = net + offset; + start = skb_network_header(skb); break; default: goto err_clear; } - if (likely(ptr >= mac && ptr + len <= end)) { + ptr = start + offset; + + if (likely(ptr + len <= end)) { memcpy(to, ptr, len); return 0; } @@ -4340,8 +4342,6 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, 
} break; case SO_BINDTODEVICE: - ret = -ENOPROTOOPT; -#ifdef CONFIG_NETDEVICES optlen = min_t(long, optlen, IFNAMSIZ - 1); strncpy(devname, optval, optlen); devname[optlen] = 0; @@ -4360,7 +4360,6 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, dev_put(dev); } ret = sock_bindtoindex(sk, ifindex, false); -#endif break; default: ret = -EINVAL; @@ -5050,7 +5049,7 @@ static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len int err; struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)hdr; - if (!seg6_validate_srh(srh, len)) + if (!seg6_validate_srh(srh, len, false)) return -EINVAL; switch (type) { diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 2269199c5891..9aedc15736ad 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2462,7 +2462,6 @@ static int do_set_master(struct net_device *dev, int ifindex, err = ops->ndo_del_slave(upper_dev, dev); if (err) return err; - netdev_update_lockdep_key(dev); } else { return -EOPNOTSUPP; } diff --git a/net/core/sock.c b/net/core/sock.c index 6c4acf1f0220..94391da27754 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -718,7 +718,7 @@ bool sk_mc_loop(struct sock *sk) return inet6_sk(sk)->mc_loop; #endif } - WARN_ON(1); + WARN_ON_ONCE(1); return true; } EXPORT_SYMBOL(sk_mc_loop); diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 00a26cf2cfe9..4059f94e9bb5 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -424,10 +424,7 @@ static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next) return 0; } -static bool sock_map_redirect_allowed(const struct sock *sk) -{ - return sk->sk_state != TCP_LISTEN; -} +static bool sock_map_redirect_allowed(const struct sock *sk); static int sock_map_update_common(struct bpf_map *map, u32 idx, struct sock *sk, u64 flags) @@ -508,6 +505,11 @@ static bool sk_is_udp(const struct sock *sk) sk->sk_protocol == IPPROTO_UDP; } +static bool sock_map_redirect_allowed(const struct sock *sk) +{ + return sk_is_tcp(sk) && sk->sk_state != TCP_LISTEN; +} + static bool sock_map_sk_is_suitable(const struct sock *sk) { return sk_is_tcp(sk) || sk_is_udp(sk); @@ -989,11 +991,15 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) err = -EINVAL; goto free_htab; } + err = bpf_map_charge_init(&htab->map.memory, cost); + if (err) + goto free_htab; htab->buckets = bpf_map_area_alloc(htab->buckets_num * sizeof(struct bpf_htab_bucket), htab->map.numa_node); if (!htab->buckets) { + bpf_map_charge_finish(&htab->map.memory); err = -ENOMEM; goto free_htab; } @@ -1013,6 +1019,7 @@ static void sock_hash_free(struct bpf_map *map) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct bpf_htab_bucket *bucket; + struct hlist_head unlink_list; struct bpf_htab_elem *elem; struct hlist_node *node; int i; @@ -1024,13 +1031,32 @@ static void sock_hash_free(struct bpf_map *map) synchronize_rcu(); for (i = 0; i < htab->buckets_num; i++) { bucket = sock_hash_select_bucket(htab, i); - hlist_for_each_entry_safe(elem, node, &bucket->head, node) { - hlist_del_rcu(&elem->node); + + /* We are racing with sock_hash_delete_from_link to + * enter the spin-lock critical section. Every socket on + * the list is still linked to sockhash. Since link + * exists, psock exists and holds a ref to socket. That + * lets us to grab a socket ref too. 
+ */ + raw_spin_lock_bh(&bucket->lock); + hlist_for_each_entry(elem, &bucket->head, node) + sock_hold(elem->sk); + hlist_move_list(&bucket->head, &unlink_list); + raw_spin_unlock_bh(&bucket->lock); + + /* Process removed entries out of atomic context to + * block for socket lock before deleting the psock's + * link to sockhash. + */ + hlist_for_each_entry_safe(elem, node, &unlink_list, node) { + hlist_del(&elem->node); lock_sock(elem->sk); rcu_read_lock(); sock_map_unref(elem->sk, elem); rcu_read_unlock(); release_sock(elem->sk); + sock_put(elem->sk); + sock_hash_free_elem(htab, elem); } } diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index b109cc8a6dd8..f93f8ace6c56 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -128,7 +128,7 @@ static int flow_limit_cpu_sysctl(struct ctl_table *table, int write, return -ENOMEM; if (write) { - ret = cpumask_parse_user(buffer, *lenp, mask); + ret = cpumask_parse(buffer, mask); if (ret) goto done; diff --git a/net/core/xdp.c b/net/core/xdp.c index 90f44f382115..3c45f99e26d5 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -462,6 +462,7 @@ struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp) xdpf->len = totsize - metasize; xdpf->headroom = 0; xdpf->metasize = metasize; + xdpf->frame_sz = PAGE_SIZE; xdpf->mem.type = MEM_TYPE_PAGE_ORDER0; xsk_buff_free(xdp); diff --git a/net/dcb/Kconfig b/net/dcb/Kconfig index 917e6e7b1cac..efee8b9fe1d4 100644 --- a/net/dcb/Kconfig +++ b/net/dcb/Kconfig @@ -2,7 +2,7 @@ config DCB bool "Data Center Bridging support" default n - ---help--- + help This enables support for configuring Data Center Bridging (DCB) features on DCB capable Ethernet adapters via rtnetlink. Say 'Y' if you have a DCB capable Ethernet adapter which supports this diff --git a/net/dccp/Kconfig b/net/dccp/Kconfig index f7c7495677b0..51ac2631fb48 100644 --- a/net/dccp/Kconfig +++ b/net/dccp/Kconfig @@ -2,7 +2,7 @@ menuconfig IP_DCCP tristate "The DCCP Protocol" depends on INET - ---help--- + help Datagram Congestion Control Protocol (RFC 4340) From http://www.ietf.org/rfc/rfc4340.txt: @@ -32,7 +32,7 @@ menu "DCCP Kernel Hacking" config IP_DCCP_DEBUG bool "DCCP debug messages" - ---help--- + help Only use this if you're hacking DCCP. When compiling DCCP as a module, this debugging output can be toggled diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig index 4a358e6847a8..4d7771f36eff 100644 --- a/net/dccp/ccids/Kconfig +++ b/net/dccp/ccids/Kconfig @@ -3,7 +3,7 @@ menu "DCCP CCIDs Configuration" config IP_DCCP_CCID2_DEBUG bool "CCID-2 debugging messages" - ---help--- + help Enable CCID-2 specific debugging messages. The debugging output can additionally be toggled by setting the @@ -14,7 +14,7 @@ config IP_DCCP_CCID2_DEBUG config IP_DCCP_CCID3 bool "CCID-3 (TCP-Friendly)" def_bool y if (IP_DCCP = y || IP_DCCP = m) - ---help--- + help CCID-3 denotes TCP-Friendly Rate Control (TFRC), an equation-based rate-controlled congestion control mechanism. TFRC is designed to be reasonably fair when competing for bandwidth with TCP-like flows, @@ -39,7 +39,7 @@ config IP_DCCP_CCID3 config IP_DCCP_CCID3_DEBUG bool "CCID-3 debugging messages" depends on IP_DCCP_CCID3 - ---help--- + help Enable CCID-3 specific debugging messages. 
The debugging output can additionally be toggled by setting the diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 4af8a98fe784..c13b6609474b 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -1139,14 +1139,14 @@ static int __init dccp_init(void) inet_hashinfo_init(&dccp_hashinfo); rc = inet_hashinfo2_init_mod(&dccp_hashinfo); if (rc) - goto out_fail; + goto out_free_percpu; rc = -ENOBUFS; dccp_hashinfo.bind_bucket_cachep = kmem_cache_create("dccp_bind_bucket", sizeof(struct inet_bind_bucket), 0, SLAB_HWCACHE_ALIGN, NULL); if (!dccp_hashinfo.bind_bucket_cachep) - goto out_free_percpu; + goto out_free_hashinfo2; /* * Size and allocate the main established and bind bucket @@ -1242,6 +1242,8 @@ out_free_dccp_ehash: free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order); out_free_bind_bucket_cachep: kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep); +out_free_hashinfo2: + inet_hashinfo2_free_mod(&dccp_hashinfo); out_free_percpu: percpu_counter_destroy(&dccp_orphan_count); out_fail: @@ -1265,6 +1267,7 @@ static void __exit dccp_fini(void) kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep); dccp_ackvec_exit(); dccp_sysctl_exit(); + inet_hashinfo2_free_mod(&dccp_hashinfo); percpu_counter_destroy(&dccp_orphan_count); } diff --git a/net/decnet/Kconfig b/net/decnet/Kconfig index 8f98fb2f2ec9..24336bdb1054 100644 --- a/net/decnet/Kconfig +++ b/net/decnet/Kconfig @@ -4,7 +4,7 @@ # config DECNET tristate "DECnet Support" - ---help--- + help The DECnet networking protocol was used in many products made by Digital (now Compaq). It provides reliable stream and sequenced packet communications over which run a variety of services similar @@ -29,7 +29,7 @@ config DECNET_ROUTER bool "DECnet: router support" depends on DECNET select FIB_RULES - ---help--- + help Add support for turning your DECnet Endnode into a level 1 or 2 router. This is an experimental, but functional option. If you do say Y here, then make sure that you also say Y to "Kernel/User diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index 739613070d07..d5bc6ac599ef 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -13,7 +13,7 @@ menuconfig NET_DSA select NET_SWITCHDEV select PHYLINK select NET_DEVLINK - ---help--- + help Say Y if you want to enable support for the hardware switches supported by the Distributed Switch Architecture. 
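The dccp_init() relabeling above restores the rule that goto-based cleanup must unwind in exact reverse order of setup: once inet_hashinfo2_init_mod() is a setup step, its failure path and every later failure path need a matching inet_hashinfo2_free_mod() label slotted between the neighbours. A minimal userspace sketch of the idiom, with illustrative setup_*/teardown_* names standing in for the kernel calls:

#include <stdio.h>

/* Illustrative resource pairs; each setup_*() returns 0 on success. */
static int setup_a(void) { return 0; }
static void teardown_a(void) { puts("undo a"); }
static int setup_b(void) { return 0; }
static void teardown_b(void) { puts("undo b"); }
static int setup_c(void) { return -1; /* simulated failure */ }

static int init_example(void)
{
	int rc;

	rc = setup_a();
	if (rc)
		goto out_fail;
	rc = setup_b();
	if (rc)
		goto out_undo_a;	/* undo only what already succeeded */
	rc = setup_c();
	if (rc)
		goto out_undo_b;
	return 0;

out_undo_b:	/* labels fire in exact reverse order of setup */
	teardown_b();
out_undo_a:
	teardown_a();
out_fail:
	return rc;
}

int main(void)
{
	printf("init_example() = %d\n", init_example());
	return 0;
}

The dccp bug was exactly a skipped rung on this ladder: a failure after hashinfo2 setup jumped to a label below the one that frees it.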
diff --git a/net/dsa/master.c b/net/dsa/master.c index a621367c6e8c..480a61460c23 100644 --- a/net/dsa/master.c +++ b/net/dsa/master.c @@ -327,6 +327,8 @@ static void dsa_master_reset_mtu(struct net_device *dev) rtnl_unlock(); } +static struct lock_class_key dsa_master_addr_list_lock_key; + int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp) { int ret; @@ -345,6 +347,8 @@ int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp) wmb(); dev->dsa_ptr = cpu_dp; + lockdep_set_class(&dev->addr_list_lock, + &dsa_master_addr_list_lock_key); ret = dsa_master_ethtool_setup(dev); if (ret) return ret; diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 423e640e3876..47f63526818e 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -43,6 +43,7 @@ const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = { [NETIF_F_GSO_SCTP_BIT] = "tx-sctp-segmentation", [NETIF_F_GSO_ESP_BIT] = "tx-esp-segmentation", [NETIF_F_GSO_UDP_L4_BIT] = "tx-udp-segmentation", + [NETIF_F_GSO_FRAGLIST_BIT] = "tx-gso-list", [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc", [NETIF_F_SCTP_CRC_BIT] = "tx-checksum-sctp", diff --git a/net/ethtool/linkinfo.c b/net/ethtool/linkinfo.c index 677068deb68c..5eaf173eaaca 100644 --- a/net/ethtool/linkinfo.c +++ b/net/ethtool/linkinfo.c @@ -140,8 +140,7 @@ int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info) ret = __ethtool_get_link_ksettings(dev, &ksettings); if (ret < 0) { - if (info) - GENL_SET_ERR_MSG(info, "failed to retrieve link settings"); + GENL_SET_ERR_MSG(info, "failed to retrieve link settings"); goto out_ops; } lsettings = &ksettings.base; diff --git a/net/hsr/Kconfig b/net/hsr/Kconfig index 9c58f8763997..8095b034e76e 100644 --- a/net/hsr/Kconfig +++ b/net/hsr/Kconfig @@ -5,7 +5,7 @@ config HSR tristate "High-availability Seamless Redundancy (HSR)" - ---help--- + help If you say Y here, then your Linux box will be able to act as a DANH ("Doubly attached node implementing HSR"). For this to work, your Linux box needs (at least) two physical Ethernet interfaces, diff --git a/net/ieee802154/6lowpan/Kconfig b/net/ieee802154/6lowpan/Kconfig index d1b4655a6d43..e808e4db2678 100644 --- a/net/ieee802154/6lowpan/Kconfig +++ b/net/ieee802154/6lowpan/Kconfig @@ -2,5 +2,5 @@ config IEEE802154_6LOWPAN tristate "6lowpan support over IEEE 802.15.4" depends on 6LOWPAN - ---help--- + help IPv6 compression over IEEE 802.15.4. diff --git a/net/ieee802154/Kconfig b/net/ieee802154/Kconfig index 5dbbc2ca95b4..bcb05ba97686 100644 --- a/net/ieee802154/Kconfig +++ b/net/ieee802154/Kconfig @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only menuconfig IEEE802154 tristate "IEEE Std 802.15.4 Low-Rate Wireless Personal Area Networks support" - ---help--- + help IEEE Std 802.15.4 defines a low data rate, low power and low complexity short range wireless personal area networks. It was designed to organise networks of sensors, switches, etc automation @@ -15,13 +15,13 @@ if IEEE802154 config IEEE802154_NL802154_EXPERIMENTAL bool "IEEE 802.15.4 experimental netlink support" - ---help--- + help Adds experimental netlink support for nl802154. config IEEE802154_SOCKET tristate "IEEE 802.15.4 socket interface" default y - ---help--- + help Socket interface for IEEE 802.15.4. Contains DGRAM sockets interface for 802.15.4 dataframes. Also RAW socket interface to build MAC header from userspace. 
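The dsa_master_setup() hunk above uses the stock recipe for breaking a false lockdep cycle on addr_list_lock: declare one file-scope struct lock_class_key per logical nesting level and reassign the lock's class while the device is being set up, so the master's lock no longer shares a class with the slaves' locks. Condensed to its skeleton (kernel-internal API, so this is a non-buildable sketch; my_master_setup() and my_addr_list_lock_key are illustrative names, not in-tree symbols):

#include <linux/netdevice.h>
#include <linux/lockdep.h>

/* A distinct key gives this device's addr_list_lock its own lockdep
 * class, so taking it while a lower device's lock is held no longer
 * looks like recursive locking of a single class.
 */
static struct lock_class_key my_addr_list_lock_key;

static void my_master_setup(struct net_device *dev)
{
	lockdep_set_class(&dev->addr_list_lock,
			  &my_addr_list_lock_key);
}

The vlan_dev.c change earlier in the series is the subclass variant of the same idea: lockdep_set_class_and_subclass() keyed by dev->lower_level, for stacks of the same device type.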
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index dc9dfaef77e5..e64e59b536d3 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -14,7 +14,7 @@ config IP_MULTICAST config IP_ADVANCED_ROUTER bool "IP: advanced router" - ---help--- + help If you intend to run your Linux box mostly as a router, i.e. as a computer that forwards and redistributes network packets, say Y; you will then be presented with several options that allow more precise @@ -56,7 +56,7 @@ config IP_ADVANCED_ROUTER config IP_FIB_TRIE_STATS bool "FIB TRIE statistics" depends on IP_ADVANCED_ROUTER - ---help--- + help Keep track of statistics on structure of FIB TRIE table. Useful for testing and measuring TRIE performance. @@ -64,7 +64,7 @@ config IP_MULTIPLE_TABLES bool "IP: policy routing" depends on IP_ADVANCED_ROUTER select FIB_RULES - ---help--- + help Normally, a router decides what to do with a received packet based solely on the packet's final destination address. If you say Y here, the Linux router will also be able to take the packet's source @@ -117,7 +117,7 @@ config IP_PNP config IP_PNP_DHCP bool "IP: DHCP support" depends on IP_PNP - ---help--- + help If you want your Linux box to mount its whole root file system (the one containing the directory /) from some other computer over the net via NFS and you want the IP address of your computer to be @@ -134,7 +134,7 @@ config IP_PNP_DHCP config IP_PNP_BOOTP bool "IP: BOOTP support" depends on IP_PNP - ---help--- + help If you want your Linux box to mount its whole root file system (the one containing the directory /) from some other computer over the net via NFS and you want the IP address of your computer to be @@ -163,7 +163,7 @@ config NET_IPIP tristate "IP: tunneling" select INET_TUNNEL select NET_IP_TUNNEL - ---help--- + help Tunneling means encapsulating data of one protocol type within another protocol and sending it over a channel that understands the encapsulating protocol. This particular tunneling driver implements @@ -267,7 +267,7 @@ config IP_PIMSM_V2 config SYN_COOKIES bool "IP: TCP syncookie support" - ---help--- + help Normal TCP/IP networking is open to an attack known as "SYN flooding". This denial-of-service attack prevents legitimate remote users from being able to connect to your computer during an ongoing @@ -307,7 +307,7 @@ config NET_IPVTI select INET_TUNNEL select NET_IP_TUNNEL select XFRM - ---help--- + help Tunneling means encapsulating data of one protocol type within another protocol and sending it over a channel that understands the encapsulating protocol. This can be used with xfrm mode tunnel to give @@ -323,7 +323,7 @@ config NET_FOU tristate "IP: Foo (IP protocols) over UDP" select XFRM select NET_UDP_TUNNEL - ---help--- + help Foo over UDP allows any IP protocol to be directly encapsulated over UDP include tunnels (IPIP, GRE, SIT). By encapsulating in UDP network mechanisms and optimizations for UDP (such as ECMP @@ -333,7 +333,7 @@ config NET_FOU_IP_TUNNELS bool "IP: FOU encapsulation of IP tunnels" depends on NET_IPIP || NET_IPGRE || IPV6_SIT select NET_FOU - ---help--- + help Allow configuration of FOU or GUE encapsulation for IP tunnels. When this option is enabled IP tunnels can be configured to use FOU or GUE encapsulation. @@ -341,7 +341,7 @@ config NET_FOU_IP_TUNNELS config INET_AH tristate "IP: AH transformation" select XFRM_AH - ---help--- + help Support for IPsec AH (Authentication Header). AH can be used with various authentication algorithms. 
Besides @@ -356,7 +356,7 @@ config INET_AH config INET_ESP tristate "IP: ESP transformation" select XFRM_ESP - ---help--- + help Support for IPsec ESP (Encapsulating Security Payload). ESP can be used with various encryption and authentication algorithms. @@ -373,7 +373,7 @@ config INET_ESP_OFFLOAD depends on INET_ESP select XFRM_OFFLOAD default n - ---help--- + help Support for ESP transformation offload. This makes sense only if this system really does IPsec and want to do it with high throughput. A typical desktop system does not @@ -397,7 +397,7 @@ config INET_IPCOMP tristate "IP: IPComp transformation" select INET_XFRM_TUNNEL select XFRM_IPCOMP - ---help--- + help Support for IP Payload Compression Protocol (IPComp) (RFC3173), typically needed for IPsec. @@ -415,7 +415,7 @@ config INET_TUNNEL config INET_DIAG tristate "INET: socket monitoring interface" default y - ---help--- + help Support for INET (TCP, DCCP, etc) socket monitoring interface used by native Linux tools such as ss. ss is included in iproute2, currently downloadable at: @@ -432,7 +432,7 @@ config INET_UDP_DIAG tristate "UDP: socket monitoring interface" depends on INET_DIAG && (IPV6 || IPV6=n) default n - ---help--- + help Support for UDP socket monitoring interface used by the ss tool. If unsure, say Y. @@ -440,7 +440,7 @@ config INET_RAW_DIAG tristate "RAW: socket monitoring interface" depends on INET_DIAG && (IPV6 || IPV6=n) default n - ---help--- + help Support for RAW socket monitoring interface used by the ss tool. If unsure, say Y. @@ -448,7 +448,7 @@ config INET_DIAG_DESTROY bool "INET: allow privileged process to administratively close sockets" depends on INET_DIAG default n - ---help--- + help Provides a SOCK_DESTROY operation that allows privileged processes (e.g., a connection manager or a network administration tool such as ss) to close sockets opened by other processes. Closing a socket in @@ -459,7 +459,7 @@ config INET_DIAG_DESTROY menuconfig TCP_CONG_ADVANCED bool "TCP: advanced congestion control" - ---help--- + help Support for selection of various TCP congestion control modules. @@ -473,7 +473,7 @@ if TCP_CONG_ADVANCED config TCP_CONG_BIC tristate "Binary Increase Congestion (BIC) control" default m - ---help--- + help BIC-TCP is a sender-side only change that ensures a linear RTT fairness under large windows while offering both scalability and bounded TCP-friendliness. The protocol combines two schemes @@ -487,7 +487,7 @@ config TCP_CONG_BIC config TCP_CONG_CUBIC tristate "CUBIC TCP" default y - ---help--- + help This is version 2.0 of BIC-TCP which uses a cubic growth function among other techniques. See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf @@ -495,7 +495,7 @@ config TCP_CONG_CUBIC config TCP_CONG_WESTWOOD tristate "TCP Westwood+" default m - ---help--- + help TCP Westwood+ is a sender-side only modification of the TCP Reno protocol stack that optimizes the performance of TCP congestion control. It is based on end-to-end bandwidth estimation to set @@ -509,7 +509,7 @@ config TCP_CONG_WESTWOOD config TCP_CONG_HTCP tristate "H-TCP" default m - ---help--- + help H-TCP is a send-side only modifications of the TCP Reno protocol stack that optimizes the performance of TCP congestion control for high speed network links. It uses a @@ -520,7 +520,7 @@ config TCP_CONG_HTCP config TCP_CONG_HSTCP tristate "High Speed TCP" default n - ---help--- + help Sally Floyd's High Speed TCP (RFC 3649) congestion control. 
A modification to TCP's congestion control mechanism for use with large congestion windows. A table indicates how much to @@ -530,7 +530,7 @@ config TCP_CONG_HSTCP config TCP_CONG_HYBLA tristate "TCP-Hybla congestion control algorithm" default n - ---help--- + help TCP-Hybla is a sender-side only change that eliminates penalization of long-RTT, large-bandwidth connections, like when satellite legs are involved, especially when sharing a common bottleneck with normal @@ -539,7 +539,7 @@ config TCP_CONG_HYBLA config TCP_CONG_VEGAS tristate "TCP Vegas" default n - ---help--- + help TCP Vegas is a sender-side only change to TCP that anticipates the onset of congestion by estimating the bandwidth. TCP Vegas adjusts the sending rate by modifying the congestion @@ -549,7 +549,7 @@ config TCP_CONG_VEGAS config TCP_CONG_NV tristate "TCP NV" default n - ---help--- + help TCP NV is a follow up to TCP Vegas. It has been modified to deal with 10G networks, measurement noise introduced by LRO, GRO and interrupt coalescence. In addition, it will decrease its cwnd multiplicatively @@ -565,7 +565,7 @@ config TCP_CONG_NV config TCP_CONG_SCALABLE tristate "Scalable TCP" default n - ---help--- + help Scalable TCP is a sender-side only change to TCP which uses a MIMD congestion control algorithm which has some nice scaling properties, though is known to have fairness issues. @@ -574,7 +574,7 @@ config TCP_CONG_SCALABLE config TCP_CONG_LP tristate "TCP Low Priority" default n - ---help--- + help TCP Low Priority (TCP-LP), a distributed algorithm whose goal is to utilize only the excess network bandwidth as compared to the ``fair share`` of bandwidth as targeted by TCP. @@ -583,7 +583,7 @@ config TCP_CONG_LP config TCP_CONG_VENO tristate "TCP Veno" default n - ---help--- + help TCP Veno is a sender-side only enhancement of TCP to obtain better throughput over wireless networks. TCP Veno makes use of state distinguishing to circumvent the difficult judgment of the packet loss @@ -595,7 +595,7 @@ config TCP_CONG_YEAH tristate "YeAH TCP" select TCP_CONG_VEGAS default n - ---help--- + help YeAH-TCP is a sender-side high-speed enabled TCP congestion control algorithm, which uses a mixed loss/delay approach to compute the congestion window. It's design goals target high efficiency, @@ -608,7 +608,7 @@ config TCP_CONG_YEAH config TCP_CONG_ILLINOIS tristate "TCP Illinois" default n - ---help--- + help TCP-Illinois is a sender-side modification of TCP Reno for high speed long delay links. It uses round-trip-time to adjust the alpha and beta parameters to achieve a higher average @@ -620,7 +620,7 @@ config TCP_CONG_ILLINOIS config TCP_CONG_DCTCP tristate "DataCenter TCP (DCTCP)" default n - ---help--- + help DCTCP leverages Explicit Congestion Notification (ECN) in the network to provide multi-bit feedback to the end hosts. It is designed to provide: @@ -641,7 +641,7 @@ config TCP_CONG_DCTCP config TCP_CONG_CDG tristate "CAIA Delay-Gradient (CDG)" default n - ---help--- + help CAIA Delay-Gradient (CDG) is a TCP congestion control that modifies the TCP sender in order to: @@ -657,7 +657,7 @@ config TCP_CONG_CDG config TCP_CONG_BBR tristate "BBR TCP" default n - ---help--- + help BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to maximize network utilization and minimize queues. It builds an explicit @@ -736,7 +736,7 @@ config TCP_MD5SIG bool "TCP: MD5 Signature Option support (RFC2385)" select CRYPTO select CRYPTO_MD5 - ---help--- + help RFC2385 specifies a method of giving MD5 protection to TCP sessions. 
Its main (only?) use is to protect BGP sessions between core routers on the Internet. diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index e53871e4a097..1f75dc686b6b 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1109,7 +1109,7 @@ static int fib_check_nh_v4_gw(struct net *net, struct fib_nh *nh, u32 table, if (fl4.flowi4_scope < RT_SCOPE_LINK) fl4.flowi4_scope = RT_SCOPE_LINK; - if (table) + if (table && table != RT_TABLE_MAIN) tbl = fib_get_table(net, table); if (tbl) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index f40b1b72f979..afaf582a5aa9 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -902,6 +902,7 @@ void inet_csk_prepare_forced_close(struct sock *sk) bh_unlock_sock(sk); sock_put(sk); inet_csk_prepare_for_destroy_sock(sk); + inet_sk(sk)->inet_num = 0; } EXPORT_SYMBOL(inet_csk_prepare_forced_close); diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index f4f1d11eab50..0c1f36404471 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -85,9 +85,10 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, __be32 remote, __be32 local, __be32 key) { - unsigned int hash; struct ip_tunnel *t, *cand = NULL; struct hlist_head *head; + struct net_device *ndev; + unsigned int hash; hash = ip_tunnel_hash(key, remote); head = &itn->tunnels[hash]; @@ -162,8 +163,9 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, if (t && t->dev->flags & IFF_UP) return t; - if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP) - return netdev_priv(itn->fb_tunnel_dev); + ndev = READ_ONCE(itn->fb_tunnel_dev); + if (ndev && ndev->flags & IFF_UP) + return netdev_priv(ndev); return NULL; } @@ -1259,9 +1261,9 @@ void ip_tunnel_uninit(struct net_device *dev) struct ip_tunnel_net *itn; itn = net_generic(net, tunnel->ip_tnl_net_id); - /* fb_tunnel_dev will be unregisted in net-exit call. */ - if (itn->fb_tunnel_dev != dev) - ip_tunnel_del(itn, netdev_priv(dev)); + ip_tunnel_del(itn, netdev_priv(dev)); + if (itn->fb_tunnel_dev == dev) + WRITE_ONCE(itn->fb_tunnel_dev, NULL); dst_cache_reset(&tunnel->dst_cache); } diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index f17b402111ce..a2f4f894be2b 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -94,7 +94,7 @@ config NF_NAT_SNMP_BASIC depends on NETFILTER_ADVANCED default NF_NAT && NF_CONNTRACK_SNMP select ASN1 - ---help--- + help This module implements an Application Layer Gateway (ALG) for SNMP payloads. In conjunction with NAT, it allows a network @@ -146,7 +146,7 @@ config IP_NF_MATCH_ECN tristate '"ecn" match support' depends on NETFILTER_ADVANCED select NETFILTER_XT_MATCH_ECN - ---help--- + help This is a backwards-compat option for the user's convenience (e.g. when running oldconfig). It selects CONFIG_NETFILTER_XT_MATCH_ECN. @@ -155,7 +155,7 @@ config IP_NF_MATCH_RPFILTER tristate '"rpfilter" reverse path filter match support' depends on NETFILTER_ADVANCED depends on IP_NF_MANGLE || IP_NF_RAW - ---help--- + help This option allows you to match packets whose replies would go out via the interface the packet came in. @@ -166,7 +166,7 @@ config IP_NF_MATCH_TTL tristate '"ttl" match support' depends on NETFILTER_ADVANCED select NETFILTER_XT_MATCH_HL - ---help--- + help This is a backwards-compat option for the user's convenience (e.g. when running oldconfig). It selects CONFIG_NETFILTER_XT_MATCH_HL. 
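The paired READ_ONCE()/WRITE_ONCE() in the ip_tunnel.c hunks above is the part worth internalizing: ip_tunnel_uninit() now retires the fallback device by publishing NULL, and the lockless lookup loads the pointer exactly once before testing and dereferencing it, so the compiler cannot reload a pointer that a concurrent teardown just cleared. A userspace analogue of that publish/retire discipline, with C11 relaxed atomics standing in for the kernel macros (the tunnel struct and names are illustrative):

#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

struct tunnel { int up; };

/* Shared fallback pointer: written by the teardown path, read
 * locklessly by lookups. Relaxed atomic accesses model the kernel's
 * READ_ONCE()/WRITE_ONCE() single-copy-atomicity guarantee.
 */
static _Atomic(struct tunnel *) fb_tunnel;

static struct tunnel *lookup_fallback(void)
{
	/* Load the pointer exactly once ... */
	struct tunnel *t = atomic_load_explicit(&fb_tunnel,
						memory_order_relaxed);

	/* ... so the NULL check and the dereference see the same value. */
	if (t && t->up)
		return t;
	return NULL;
}

static void retire_fallback(void)
{
	atomic_store_explicit(&fb_tunnel, NULL, memory_order_relaxed);
}

int main(void)
{
	static struct tunnel t = { .up = 1 };

	atomic_store(&fb_tunnel, &t);
	printf("lookup: %p\n", (void *)lookup_fallback());
	retire_fallback();
	printf("lookup after retire: %p\n", (void *)lookup_fallback());
	return 0;
}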
@@ -234,7 +234,7 @@ config IP_NF_TARGET_NETMAP tristate "NETMAP target support" depends on NETFILTER_ADVANCED select NETFILTER_XT_TARGET_NETMAP - ---help--- + help This is a backwards-compat option for the user's convenience (e.g. when running oldconfig). It selects CONFIG_NETFILTER_XT_TARGET_NETMAP. @@ -243,7 +243,7 @@ config IP_NF_TARGET_REDIRECT tristate "REDIRECT target support" depends on NETFILTER_ADVANCED select NETFILTER_XT_TARGET_REDIRECT - ---help--- + help This is a backwards-compat option for the user's convenience (e.g. when running oldconfig). It selects CONFIG_NETFILTER_XT_TARGET_REDIRECT. @@ -279,7 +279,7 @@ config IP_NF_TARGET_ECN tristate "ECN target support" depends on IP_NF_MANGLE depends on NETFILTER_ADVANCED - ---help--- + help This option adds a `ECN' target, which can be used in the iptables mangle table. @@ -294,7 +294,7 @@ config IP_NF_TARGET_TTL tristate '"TTL" target support' depends on NETFILTER_ADVANCED && IP_NF_MANGLE select NETFILTER_XT_TARGET_HL - ---help--- + help This is a backwards-compatible option for the user's convenience (e.g. when running oldconfig). It selects CONFIG_NETFILTER_XT_TARGET_HL. diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 400a9f89ebdb..cc8049b100b2 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -247,12 +247,11 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh, if (nla_put_u32(skb, NHA_ID, nh->id)) goto nla_put_failure; - if (nh->is_fdb_nh && nla_put_flag(skb, NHA_FDB)) - goto nla_put_failure; - if (nh->is_group) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); + if (nhg->fdb_nh && nla_put_flag(skb, NHA_FDB)) + goto nla_put_failure; if (nla_put_nh_group(skb, nhg)) goto nla_put_failure; goto out; @@ -264,7 +263,10 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh, if (nla_put_flag(skb, NHA_BLACKHOLE)) goto nla_put_failure; goto out; - } else if (!nh->is_fdb_nh) { + } else if (nhi->fdb_nh) { + if (nla_put_flag(skb, NHA_FDB)) + goto nla_put_failure; + } else { const struct net_device *dev; dev = nhi->fib_nhc.nhc_dev; @@ -385,7 +387,7 @@ errout: } static bool valid_group_nh(struct nexthop *nh, unsigned int npaths, - struct netlink_ext_ack *extack) + bool *is_fdb, struct netlink_ext_ack *extack) { if (nh->is_group) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); @@ -398,6 +400,7 @@ static bool valid_group_nh(struct nexthop *nh, unsigned int npaths, "Multipath group can not be a nexthop within a group"); return false; } + *is_fdb = nhg->fdb_nh; } else { struct nh_info *nhi = rtnl_dereference(nh->nh_info); @@ -406,6 +409,7 @@ static bool valid_group_nh(struct nexthop *nh, unsigned int npaths, "Blackhole nexthop can not be used in a group with more than 1 path"); return false; } + *is_fdb = nhi->fdb_nh; } return true; @@ -416,12 +420,13 @@ static int nh_check_attr_fdb_group(struct nexthop *nh, u8 *nh_family, { struct nh_info *nhi; - if (!nh->is_fdb_nh) { + nhi = rtnl_dereference(nh->nh_info); + + if (!nhi->fdb_nh) { NL_SET_ERR_MSG(extack, "FDB nexthop group can only have fdb nexthops"); return -EINVAL; } - nhi = rtnl_dereference(nh->nh_info); if (*nh_family == AF_UNSPEC) { *nh_family = nhi->family; } else if (*nh_family != nhi->family) { @@ -473,19 +478,20 @@ static int nh_check_attr_group(struct net *net, struct nlattr *tb[], nhg = nla_data(tb[NHA_GROUP]); for (i = 0; i < len; ++i) { struct nexthop *nh; + bool is_fdb_nh; nh = nexthop_find_by_id(net, nhg[i].id); if (!nh) { NL_SET_ERR_MSG(extack, "Invalid nexthop id"); return -EINVAL; } - if (!valid_group_nh(nh, len, 
extack)) + if (!valid_group_nh(nh, len, &is_fdb_nh, extack)) return -EINVAL; if (nhg_fdb && nh_check_attr_fdb_group(nh, &nh_family, extack)) return -EINVAL; - if (!nhg_fdb && nh->is_fdb_nh) { + if (!nhg_fdb && is_fdb_nh) { NL_SET_ERR_MSG(extack, "Non FDB nexthop group cannot have fdb nexthops"); return -EINVAL; } @@ -553,13 +559,13 @@ struct nexthop *nexthop_select_path(struct nexthop *nh, int hash) if (hash > atomic_read(&nhge->upper_bound)) continue; - if (nhge->nh->is_fdb_nh) + nhi = rcu_dereference(nhge->nh->nh_info); + if (nhi->fdb_nh) return nhge->nh; /* nexthops always check if it is good and does * not rely on a sysctl for this behavior */ - nhi = rcu_dereference(nhge->nh->nh_info); switch (nhi->family) { case AF_INET: if (ipv4_good_nh(&nhi->fib_nh)) @@ -624,11 +630,7 @@ int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg, struct netlink_ext_ack *extack) { struct nh_info *nhi; - - if (nh->is_fdb_nh) { - NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop"); - return -EINVAL; - } + bool is_fdb_nh; /* fib6_src is unique to a fib6_info and limits the ability to cache * routes in fib6_nh within a nexthop that is potentially shared @@ -645,10 +647,17 @@ int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg, nhg = rtnl_dereference(nh->nh_grp); if (nhg->has_v4) goto no_v4_nh; + is_fdb_nh = nhg->fdb_nh; } else { nhi = rtnl_dereference(nh->nh_info); if (nhi->family == AF_INET) goto no_v4_nh; + is_fdb_nh = nhi->fdb_nh; + } + + if (is_fdb_nh) { + NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop"); + return -EINVAL; } return 0; @@ -677,12 +686,9 @@ static int fib6_check_nh_list(struct nexthop *old, struct nexthop *new, return fib6_check_nexthop(new, NULL, extack); } -static int nexthop_check_scope(struct nexthop *nh, u8 scope, +static int nexthop_check_scope(struct nh_info *nhi, u8 scope, struct netlink_ext_ack *extack) { - struct nh_info *nhi; - - nhi = rtnl_dereference(nh->nh_info); if (scope == RT_SCOPE_HOST && nhi->fib_nhc.nhc_gw_family) { NL_SET_ERR_MSG(extack, "Route with host scope can not have a gateway"); @@ -704,29 +710,38 @@ static int nexthop_check_scope(struct nexthop *nh, u8 scope, int fib_check_nexthop(struct nexthop *nh, u8 scope, struct netlink_ext_ack *extack) { + struct nh_info *nhi; int err = 0; - if (nh->is_fdb_nh) { - NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop"); - err = -EINVAL; - goto out; - } - if (nh->is_group) { struct nh_group *nhg; + nhg = rtnl_dereference(nh->nh_grp); + if (nhg->fdb_nh) { + NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop"); + err = -EINVAL; + goto out; + } + if (scope == RT_SCOPE_HOST) { NL_SET_ERR_MSG(extack, "Route with host scope can not have multiple nexthops"); err = -EINVAL; goto out; } - nhg = rtnl_dereference(nh->nh_grp); /* all nexthops in a group have the same scope */ - err = nexthop_check_scope(nhg->nh_entries[0].nh, scope, extack); + nhi = rtnl_dereference(nhg->nh_entries[0].nh->nh_info); + err = nexthop_check_scope(nhi, scope, extack); } else { - err = nexthop_check_scope(nh, scope, extack); + nhi = rtnl_dereference(nh->nh_info); + if (nhi->fdb_nh) { + NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop"); + err = -EINVAL; + goto out; + } + err = nexthop_check_scope(nhi, scope, extack); } + out: return err; } @@ -787,6 +802,7 @@ static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge, newg->has_v4 = nhg->has_v4; newg->mpath = nhg->mpath; + newg->fdb_nh = nhg->fdb_nh; newg->num_nh = nhg->num_nh; /* copy old entries to new except the 
one getting removed */ @@ -1216,7 +1232,7 @@ static struct nexthop *nexthop_create_group(struct net *net, } if (cfg->nh_fdb) - nh->is_fdb_nh = 1; + nhg->fdb_nh = 1; rcu_assign_pointer(nh->nh_grp, nhg); @@ -1255,7 +1271,7 @@ static int nh_create_ipv4(struct net *net, struct nexthop *nh, goto out; } - if (nh->is_fdb_nh) + if (nhi->fdb_nh) goto out; /* sets nh_dev if successful */ @@ -1326,7 +1342,7 @@ static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg, nhi->fib_nhc.nhc_scope = RT_SCOPE_LINK; if (cfg->nh_fdb) - nh->is_fdb_nh = 1; + nhi->fdb_nh = 1; if (cfg->nh_blackhole) { nhi->reject_nh = 1; @@ -1349,7 +1365,7 @@ static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg, } /* add the entry to the device based hash */ - if (!nh->is_fdb_nh) + if (!nhi->fdb_nh) nexthop_devhash_add(net, nhi); rcu_assign_pointer(nh->nh_info, nhi); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 15d47d5e7951..810cc164f795 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1734,7 +1734,7 @@ int tcp_mmap(struct file *file, struct socket *sock, return -EPERM; vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC); - /* Instruct vm_insert_page() to not down_read(mmap_sem) */ + /* Instruct vm_insert_page() to not mmap_read_lock(mm) */ vma->vm_flags |= VM_MIXEDMAP; vma->vm_ops = &tcp_vm_ops; @@ -1742,14 +1742,48 @@ int tcp_mmap(struct file *file, struct socket *sock, } EXPORT_SYMBOL(tcp_mmap); +static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma, + struct page **pages, + unsigned long pages_to_map, + unsigned long *insert_addr, + u32 *length_with_pending, + u32 *seq, + struct tcp_zerocopy_receive *zc) +{ + unsigned long pages_remaining = pages_to_map; + int bytes_mapped; + int ret; + + ret = vm_insert_pages(vma, *insert_addr, pages, &pages_remaining); + bytes_mapped = PAGE_SIZE * (pages_to_map - pages_remaining); + /* Even if vm_insert_pages fails, it may have partially succeeded in + * mapping (some but not all of the pages). + */ + *seq += bytes_mapped; + *insert_addr += bytes_mapped; + if (ret) { + /* But if vm_insert_pages did fail, we have to unroll some state + * we speculatively touched before. 
+ */ + const int bytes_not_mapped = PAGE_SIZE * pages_remaining; + *length_with_pending -= bytes_not_mapped; + zc->recv_skip_hint += bytes_not_mapped; + } + return ret; +} + static int tcp_zerocopy_receive(struct sock *sk, struct tcp_zerocopy_receive *zc) { unsigned long address = (unsigned long)zc->address; u32 length = 0, seq, offset, zap_len; + #define PAGE_BATCH_SIZE 8 + struct page *pages[PAGE_BATCH_SIZE]; const skb_frag_t *frags = NULL; struct vm_area_struct *vma; struct sk_buff *skb = NULL; + unsigned long pg_idx = 0; + unsigned long curr_addr; struct tcp_sock *tp; int inq; int ret; @@ -1762,16 +1796,17 @@ static int tcp_zerocopy_receive(struct sock *sk, sock_rps_record_flow(sk); - down_read(¤t->mm->mmap_sem); + tp = tcp_sk(sk); + + mmap_read_lock(current->mm); vma = find_vma(current->mm, address); if (!vma || vma->vm_start > address || vma->vm_ops != &tcp_vm_ops) { - up_read(¤t->mm->mmap_sem); + mmap_read_unlock(current->mm); return -EINVAL; } zc->length = min_t(unsigned long, zc->length, vma->vm_end - address); - tp = tcp_sk(sk); seq = tp->copied_seq; inq = tcp_inq(sk); zc->length = min_t(u32, zc->length, inq); @@ -1783,8 +1818,20 @@ static int tcp_zerocopy_receive(struct sock *sk, zc->recv_skip_hint = zc->length; } ret = 0; + curr_addr = address; while (length + PAGE_SIZE <= zc->length) { if (zc->recv_skip_hint < PAGE_SIZE) { + /* If we're here, finish the current batch. */ + if (pg_idx) { + ret = tcp_zerocopy_vm_insert_batch(vma, pages, + pg_idx, + &curr_addr, + &length, + &seq, zc); + if (ret) + goto out; + pg_idx = 0; + } if (skb) { if (zc->recv_skip_hint > 0) break; @@ -1793,7 +1840,6 @@ static int tcp_zerocopy_receive(struct sock *sk, } else { skb = tcp_recv_skb(sk, seq, &offset); } - zc->recv_skip_hint = skb->len - offset; offset -= skb_headlen(skb); if ((int)offset < 0 || skb_has_frag_list(skb)) @@ -1817,17 +1863,27 @@ static int tcp_zerocopy_receive(struct sock *sk, zc->recv_skip_hint -= remaining; break; } - ret = vm_insert_page(vma, address + length, - skb_frag_page(frags)); - if (ret) - break; + pages[pg_idx] = skb_frag_page(frags); + pg_idx++; length += PAGE_SIZE; - seq += PAGE_SIZE; zc->recv_skip_hint -= PAGE_SIZE; frags++; + if (pg_idx == PAGE_BATCH_SIZE) { + ret = tcp_zerocopy_vm_insert_batch(vma, pages, pg_idx, + &curr_addr, &length, + &seq, zc); + if (ret) + goto out; + pg_idx = 0; + } + } + if (pg_idx) { + ret = tcp_zerocopy_vm_insert_batch(vma, pages, pg_idx, + &curr_addr, &length, &seq, + zc); } out: - up_read(¤t->mm->mmap_sem); + mmap_read_unlock(current->mm); if (length) { WRITE_ONCE(tp->copied_seq, seq); tcp_rcv_space_adjust(sk); diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 629aaa9a1eb9..7aa68f4aae6c 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -64,6 +64,9 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, } while (i != msg_rx->sg.end); if (unlikely(peek)) { + if (msg_rx == list_last_entry(&psock->ingress_msg, + struct sk_msg, list)) + break; msg_rx = list_next_entry(msg_rx, list); continue; } @@ -242,6 +245,9 @@ static int tcp_bpf_wait_data(struct sock *sk, struct sk_psock *psock, DEFINE_WAIT_FUNC(wait, woken_wake_function); int ret = 0; + if (sk->sk_shutdown & RCV_SHUTDOWN) + return 1; + if (!timeo) return ret; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 83330a6cb242..12fda8f27b08 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4605,7 +4605,11 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) if (tcp_ooo_try_coalesce(sk, tp->ooo_last_skb, skb, 
&fragstolen)) { coalesce_done: - tcp_grow_window(sk, skb); + /* For non sack flows, do not grow window to force DUPACK + * and trigger fast retransmit. + */ + if (tcp_is_sack(tp)) + tcp_grow_window(sk, skb); kfree_skb_partial(skb, fragstolen); skb = NULL; goto add_sack; @@ -4689,7 +4693,11 @@ add_sack: tcp_sack_new_ofo_skb(sk, seq, end_seq); end: if (skb) { - tcp_grow_window(sk, skb); + /* For non sack flows, do not grow window to force DUPACK + * and trigger fast retransmit. + */ + if (tcp_is_sack(tp)) + tcp_grow_window(sk, skb); skb_condense(skb); skb_set_owner_r(skb, sk); } diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index 414a68b16869..f4f19e89af5e 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -7,7 +7,7 @@ menuconfig IPV6 tristate "The IPv6 protocol" default y - ---help--- + help Support for IP version 6 (IPv6). For general information about IPv6, see @@ -23,7 +23,7 @@ if IPV6 config IPV6_ROUTER_PREF bool "IPv6: Router Preference (RFC 4191) support" - ---help--- + help Router Preference is an optional extension to the Router Advertisement message which improves the ability of hosts to pick an appropriate router, especially when the hosts @@ -34,14 +34,14 @@ config IPV6_ROUTER_PREF config IPV6_ROUTE_INFO bool "IPv6: Route Information (RFC 4191) support" depends on IPV6_ROUTER_PREF - ---help--- + help Support of Route Information. If unsure, say N. config IPV6_OPTIMISTIC_DAD bool "IPv6: Enable RFC 4429 Optimistic DAD" - ---help--- + help Support for optimistic Duplicate Address Detection. It allows for autoconfigured addresses to be used more quickly. @@ -50,7 +50,7 @@ config IPV6_OPTIMISTIC_DAD config INET6_AH tristate "IPv6: AH transformation" select XFRM_AH - ---help--- + help Support for IPsec AH (Authentication Header). AH can be used with various authentication algorithms. Besides @@ -65,7 +65,7 @@ config INET6_AH config INET6_ESP tristate "IPv6: ESP transformation" select XFRM_ESP - ---help--- + help Support for IPsec ESP (Encapsulating Security Payload). ESP can be used with various encryption and authentication algorithms. @@ -82,7 +82,7 @@ config INET6_ESP_OFFLOAD depends on INET6_ESP select XFRM_OFFLOAD default n - ---help--- + help Support for ESP transformation offload. This makes sense only if this system really does IPsec and want to do it with high throughput. A typical desktop system does not @@ -106,7 +106,7 @@ config INET6_IPCOMP tristate "IPv6: IPComp transformation" select INET6_XFRM_TUNNEL select XFRM_IPCOMP - ---help--- + help Support for IP Payload Compression Protocol (IPComp) (RFC3173), typically needed for IPsec. @@ -115,7 +115,7 @@ config INET6_IPCOMP config IPV6_MIP6 tristate "IPv6: Mobility" select XFRM - ---help--- + help Support for IPv6 Mobility described in RFC 3775. If unsure, say N. @@ -125,7 +125,7 @@ config IPV6_ILA depends on NETFILTER select DST_CACHE select LWTUNNEL - ---help--- + help Support for IPv6 Identifier Locator Addressing (ILA). ILA is a mechanism to do network virtualization without @@ -155,7 +155,7 @@ tristate "Virtual (secure) IPv6: tunneling" select IPV6_TUNNEL select NET_IP_TUNNEL select XFRM - ---help--- + help Tunneling means encapsulating data of one protocol type within another protocol and sending it over a channel that understands the encapsulating protocol. 
This can be used with xfrm mode tunnel to give @@ -168,7 +168,7 @@ config IPV6_SIT select NET_IP_TUNNEL select IPV6_NDISC_NODETYPE default y - ---help--- + help Tunneling means encapsulating data of one protocol type within another protocol and sending it over a channel that understands the encapsulating protocol. This driver implements encapsulation of IPv6 @@ -181,7 +181,7 @@ config IPV6_SIT_6RD bool "IPv6: IPv6 Rapid Deployment (6RD)" depends on IPV6_SIT default n - ---help--- + help IPv6 Rapid Deployment (6rd; draft-ietf-softwire-ipv6-6rd) builds upon mechanisms of 6to4 (RFC3056) to enable a service provider to rapidly deploy IPv6 unicast service to IPv4 sites to which it provides @@ -204,7 +204,7 @@ config IPV6_TUNNEL select INET6_TUNNEL select DST_CACHE select GRO_CELLS - ---help--- + help Support for IPv6-in-IPv6 and IPv4-in-IPv6 tunnels described in RFC 2473. @@ -215,7 +215,7 @@ config IPV6_GRE select IPV6_TUNNEL select NET_IP_TUNNEL depends on NET_IPGRE_DEMUX - ---help--- + help Tunneling means encapsulating data of one protocol type within another protocol and sending it over a channel that understands the encapsulating protocol. This particular tunneling driver implements @@ -240,13 +240,13 @@ config IPV6_FOU_TUNNEL config IPV6_MULTIPLE_TABLES bool "IPv6: Multiple Routing Tables" select FIB_RULES - ---help--- + help Support multiple routing tables. config IPV6_SUBTREES bool "IPv6: source address based routing" depends on IPV6_MULTIPLE_TABLES - ---help--- + help Enable routing by source address or prefix. The destination address is still the primary routing key, so mixing @@ -261,7 +261,7 @@ config IPV6_MROUTE bool "IPv6: multicast routing" depends on IPV6 select IP_MROUTE_COMMON - ---help--- + help Support for IPv6 multicast forwarding. If unsure, say N. @@ -282,7 +282,7 @@ config IPV6_MROUTE_MULTIPLE_TABLES config IPV6_PIMSM_V2 bool "IPv6: PIM-SM version 2 support" depends on IPV6_MROUTE - ---help--- + help Support for IPv6 PIM multicast routing protocol PIM-SMv2. If unsure, say N. @@ -292,7 +292,7 @@ config IPV6_SEG6_LWTUNNEL select LWTUNNEL select DST_CACHE select IPV6_MULTIPLE_TABLES - ---help--- + help Support for encapsulation of packets within an outer IPv6 header and a Segment Routing Header using the lightweight tunnels mechanism. Also enable support for advanced local @@ -306,7 +306,7 @@ config IPV6_SEG6_HMAC select CRYPTO_HMAC select CRYPTO_SHA1 select CRYPTO_SHA256 - ---help--- + help Support for HMAC signature generation and verification of SR-enabled packets. @@ -321,7 +321,7 @@ config IPV6_RPL_LWTUNNEL bool "IPv6: RPL Source Routing Header support" depends on IPV6 select LWTUNNEL - ---help--- + help Support for RFC6554 RPL Source Routing Header using the lightweight tunnels mechanism. diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index d64b83e85642..ce4fbba4acce 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -779,7 +779,7 @@ static void *ip6fl_seq_start(struct seq_file *seq, loff_t *pos) { struct ip6fl_iter_state *state = ip6fl_seq_private(seq); - state->pid_ns = proc_pid_ns(file_inode(seq->file)); + state->pid_ns = proc_pid_ns(file_inode(seq->file)->i_sb); rcu_read_lock_bh(); return *pos ? ip6fl_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 781ca8c07a0d..6532bde82b40 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -127,6 +127,7 @@ static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev, gre_proto == htons(ETH_P_ERSPAN2)) ? 
ARPHRD_ETHER : ARPHRD_IP6GRE; int score, cand_score = 4; + struct net_device *ndev; for_each_ip_tunnel_rcu(t, ign->tunnels_r_l[h0 ^ h1]) { if (!ipv6_addr_equal(local, &t->parms.laddr) || @@ -238,9 +239,9 @@ static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev, if (t && t->dev->flags & IFF_UP) return t; - dev = ign->fb_tunnel_dev; - if (dev && dev->flags & IFF_UP) - return netdev_priv(dev); + ndev = READ_ONCE(ign->fb_tunnel_dev); + if (ndev && ndev->flags & IFF_UP) + return netdev_priv(ndev); return NULL; } @@ -413,6 +414,8 @@ static void ip6gre_tunnel_uninit(struct net_device *dev) ip6gre_tunnel_unlink_md(ign, t); ip6gre_tunnel_unlink(ign, t); + if (ign->fb_tunnel_dev == dev) + WRITE_ONCE(ign->fb_tunnel_dev, NULL); dst_cache_reset(&t->dst_cache); dev_put(dev); } diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 2c843ff5e3a9..20576e87a5f7 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -493,7 +493,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *) opt->srcrt; - if (!seg6_validate_srh(srh, optlen)) + if (!seg6_validate_srh(srh, optlen, false)) goto sticky_done; break; } diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 7e12d2114158..8cd2782a31e4 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -2615,6 +2615,7 @@ void ipv6_mc_destroy_dev(struct inet6_dev *idev) idev->mc_list = i->next; write_unlock_bh(&idev->lock); + ip6_mc_clear_src(i); ma_put(i); write_lock_bh(&idev->lock); } diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 0594131fa46d..262bb51a2d99 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -127,7 +127,7 @@ config IP6_NF_MATCH_HL tristate '"hl" hoplimit match support' depends on NETFILTER_ADVANCED select NETFILTER_XT_MATCH_HL - ---help--- + help This is a backwards-compat option for the user's convenience (e.g. when running oldconfig). It selects CONFIG_NETFILTER_XT_MATCH_HL. @@ -153,7 +153,7 @@ config IP6_NF_MATCH_RPFILTER tristate '"rpfilter" reverse path filter match support' depends on NETFILTER_ADVANCED depends on IP6_NF_MANGLE || IP6_NF_RAW - ---help--- + help This option allows you to match packets whose replies would go out via the interface the packet came in. @@ -183,7 +183,7 @@ config IP6_NF_TARGET_HL tristate '"HL" hoplimit target support' depends on NETFILTER_ADVANCED && IP6_NF_MANGLE select NETFILTER_XT_TARGET_HL - ---help--- + help This is a backwards-compatible option for the user's convenience (e.g. when running oldconfig). It selects CONFIG_NETFILTER_XT_TARGET_HL. 
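The one-line mcast.c change above plugs a leak with a simple ownership rule behind it: a multicast group owns its source-filter list, so ipv6_mc_destroy_dev() must release that list with ip6_mc_clear_src() before ma_put() drops what may be the last reference to the group. A userspace sketch of the same rule, with illustrative names in place of the kernel structures:

#include <stdlib.h>

struct src_filter {
	struct src_filter *next;
};

struct mc_group {
	int refcount;
	struct src_filter *sources;	/* owned singly linked list */
};

/* Illustrative analogue of ip6_mc_clear_src(): free the owned list. */
static void group_clear_sources(struct mc_group *g)
{
	while (g->sources) {
		struct src_filter *f = g->sources;

		g->sources = f->next;
		free(f);
	}
}

/* Illustrative analogue of ma_put(): the final reference drop frees
 * the container only. Anything still chained off ->sources would be
 * orphaned, which is why the clear must happen first.
 */
static void group_put(struct mc_group *g)
{
	if (--g->refcount == 0)
		free(g);
}

int main(void)
{
	struct mc_group *g = calloc(1, sizeof(*g));

	g->refcount = 1;
	g->sources = calloc(1, sizeof(*g->sources));

	group_clear_sources(g);	/* order matters: clear, then put */
	group_put(g);
	return 0;
}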
diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c index 37b434293bda..d2f8138e5a73 100644 --- a/net/ipv6/seg6.c +++ b/net/ipv6/seg6.c @@ -25,7 +25,7 @@ #include <net/seg6_hmac.h> #endif -bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len) +bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len, bool reduced) { unsigned int tlv_offset; int max_last_entry; @@ -37,13 +37,17 @@ bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len) if (((srh->hdrlen + 1) << 3) != len) return false; - max_last_entry = (srh->hdrlen / 2) - 1; - - if (srh->first_segment > max_last_entry) + if (!reduced && srh->segments_left > srh->first_segment) { return false; + } else { + max_last_entry = (srh->hdrlen / 2) - 1; - if (srh->segments_left > srh->first_segment + 1) - return false; + if (srh->first_segment > max_last_entry) + return false; + + if (srh->segments_left > srh->first_segment + 1) + return false; + } tlv_offset = sizeof(*srh) + ((srh->first_segment + 1) << 4); diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index c7cbfeae94f5..e0e9f48ab14f 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -426,7 +426,7 @@ static int seg6_build_state(struct net *net, struct nlattr *nla, } /* verify that SRH is consistent */ - if (!seg6_validate_srh(tuninfo->srh, tuninfo_len - sizeof(*tuninfo))) + if (!seg6_validate_srh(tuninfo->srh, tuninfo_len - sizeof(*tuninfo), false)) return -EINVAL; newts = lwtunnel_state_alloc(tuninfo_len + sizeof(*slwt)); diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index 52493423f329..eba23279912d 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -87,7 +87,7 @@ static struct ipv6_sr_hdr *get_srh(struct sk_buff *skb) */ srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); - if (!seg6_validate_srh(srh, len)) + if (!seg6_validate_srh(srh, len, true)) return NULL; return srh; @@ -495,7 +495,7 @@ bool seg6_bpf_has_valid_srh(struct sk_buff *skb) return false; srh->hdrlen = (u8)(srh_state->hdrlen >> 3); - if (!seg6_validate_srh(srh, (srh->hdrlen + 1) << 3)) + if (!seg6_validate_srh(srh, (srh->hdrlen + 1) << 3, true)) return false; srh_state->valid = true; @@ -670,7 +670,7 @@ static int parse_nla_srh(struct nlattr **attrs, struct seg6_local_lwt *slwt) if (len < sizeof(*srh) + sizeof(struct in6_addr)) return -EINVAL; - if (!seg6_validate_srh(srh, len)) + if (!seg6_validate_srh(srh, len, false)) return -EINVAL; slwt->srh = kmemdup(srh, len, GFP_KERNEL); diff --git a/net/kcm/Kconfig b/net/kcm/Kconfig index bf7e970fad65..16f39f2565d9 100644 --- a/net/kcm/Kconfig +++ b/net/kcm/Kconfig @@ -5,7 +5,7 @@ config AF_KCM depends on INET select BPF_SYSCALL select STREAM_PARSER - ---help--- + help KCM (Kernel Connection Multiplexor) sockets provide a method for multiplexing messages of a message based application protocol over kernel connectons (e.g. TCP connections). diff --git a/net/l2tp/Kconfig b/net/l2tp/Kconfig index 655e0646895b..b7856748e960 100644 --- a/net/l2tp/Kconfig +++ b/net/l2tp/Kconfig @@ -8,7 +8,7 @@ menuconfig L2TP depends on (IPV6 || IPV6=n) depends on INET select NET_UDP_TUNNEL - ---help--- + help Layer Two Tunneling Protocol From RFC 2661 <http://www.ietf.org/rfc/rfc2661.txt>. 
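The new 'reduced' flag in seg6_validate_srh() above separates on-the-wire parsing from local configuration: seg6_local and the BPF path pass true, because a reduced SRH encapsulation omits the first segment from the segment list and segments_left may then validly be first_segment + 1, while setsockopt and the iptunnel path keep the strict check by passing false. The bounds arithmetic from the hunk, restated as a self-contained userspace checker (the struct is a simplified stand-in for ipv6_sr_hdr, holding only the fields these checks touch):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* hdrlen counts 8-octet units past the first 8 bytes; each segment is
 * a 16-byte IPv6 address, so the list holds hdrlen / 2 entries.
 */
struct srh {
	uint8_t hdrlen;
	uint8_t segments_left;
	uint8_t first_segment;
};

static bool srh_bounds_ok(const struct srh *srh, bool reduced)
{
	/* Full encapsulation: segments_left may not exceed the index of
	 * the first segment. Reduced encapsulation skips this check.
	 */
	if (!reduced && srh->segments_left > srh->first_segment)
		return false;

	int max_last_entry = (srh->hdrlen / 2) - 1;

	if (srh->first_segment > max_last_entry)
		return false;
	if (srh->segments_left > srh->first_segment + 1)
		return false;
	return true;
}

int main(void)
{
	/* One 16-byte segment; rejected as full SRH, valid as reduced. */
	struct srh h = { .hdrlen = 2, .segments_left = 1,
			 .first_segment = 0 };

	printf("full:    %d\n", srh_bounds_ok(&h, false));
	printf("reduced: %d\n", srh_bounds_ok(&h, true));
	return 0;
}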
diff --git a/net/l3mdev/Kconfig b/net/l3mdev/Kconfig index de186dff8f63..2b2861e1fb7d 100644 --- a/net/l3mdev/Kconfig +++ b/net/l3mdev/Kconfig @@ -6,6 +6,6 @@ config NET_L3_MASTER_DEV bool "L3 Master device support" depends on INET || IPV6 - ---help--- + help This module provides glue between core networking code and device drivers to support L3 master devices like VRF. diff --git a/net/lapb/Kconfig b/net/lapb/Kconfig index 5b50e8d64f26..da87b47f0dff 100644 --- a/net/lapb/Kconfig +++ b/net/lapb/Kconfig @@ -5,7 +5,7 @@ config LAPB tristate "LAPB Data Link Driver" - ---help--- + help Link Access Procedure, Balanced (LAPB) is the data link layer (i.e. the lower) part of the X.25 protocol. It offers a reliable connection service to exchange data frames with one other host, and diff --git a/net/mac80211/Kconfig b/net/mac80211/Kconfig index 0c93b1b7a826..cd9a9bd242ba 100644 --- a/net/mac80211/Kconfig +++ b/net/mac80211/Kconfig @@ -9,7 +9,7 @@ config MAC80211 select CRYPTO_GCM select CRYPTO_CMAC select CRC32 - ---help--- + help This option enables the hardware independent IEEE 802.11 networking stack. @@ -25,14 +25,14 @@ config MAC80211_RC_MINSTREL bool "Minstrel" if EXPERT select MAC80211_HAS_RC default y - ---help--- + help This option enables the 'minstrel' TX rate control algorithm choice prompt "Default rate control algorithm" depends on MAC80211_HAS_RC default MAC80211_RC_DEFAULT_MINSTREL - ---help--- + help This option selects the default rate control algorithm mac80211 will use. Note that this default can still be overridden through the ieee80211_default_rc_algo module @@ -41,7 +41,7 @@ choice config MAC80211_RC_DEFAULT_MINSTREL bool "Minstrel" depends on MAC80211_RC_MINSTREL - ---help--- + help Select Minstrel as the default rate control algorithm. @@ -60,7 +60,7 @@ comment "Some wireless drivers require a rate control algorithm" config MAC80211_MESH bool "Enable mac80211 mesh networking support" depends on MAC80211 - ---help--- + help Select this option to enable 802.11 mesh operation in mac80211 drivers that support it. 802.11 mesh connects multiple stations over (possibly multi-hop) wireless links to form a single logical @@ -71,14 +71,14 @@ config MAC80211_LEDS depends on MAC80211 depends on LEDS_CLASS select LEDS_TRIGGERS - ---help--- + help This option enables a few LED triggers for different packet receive/transmit events. config MAC80211_DEBUGFS bool "Export mac80211 internals in DebugFS" depends on MAC80211 && DEBUG_FS - ---help--- + help Select this to see extensive information about the internal state of mac80211 in debugfs. @@ -87,7 +87,7 @@ config MAC80211_DEBUGFS config MAC80211_MESSAGE_TRACING bool "Trace all mac80211 debug messages" depends on MAC80211 - ---help--- + help Select this option to have mac80211 register the mac80211_msg trace subsystem with tracepoints to collect all debugging messages, independent of @@ -100,13 +100,13 @@ config MAC80211_MESSAGE_TRACING menuconfig MAC80211_DEBUG_MENU bool "Select mac80211 debugging features" depends on MAC80211 - ---help--- + help This option collects various mac80211 debug settings. 
config MAC80211_NOINLINE bool "Do not inline TX/RX handlers" depends on MAC80211_DEBUG_MENU - ---help--- + help This option affects code generation in mac80211, when selected some functions are marked "noinline" to allow easier debugging of problems in the transmit and receive @@ -122,7 +122,7 @@ config MAC80211_NOINLINE config MAC80211_VERBOSE_DEBUG bool "Verbose debugging output" depends on MAC80211_DEBUG_MENU - ---help--- + help Selecting this option causes mac80211 to print out many debugging messages. It should not be selected on production systems as some of the messages are @@ -133,7 +133,7 @@ config MAC80211_VERBOSE_DEBUG config MAC80211_MLME_DEBUG bool "Verbose managed MLME output" depends on MAC80211_DEBUG_MENU - ---help--- + help Selecting this option causes mac80211 to print out debugging messages for the managed-mode MLME. It should not be selected on production systems as some @@ -144,7 +144,7 @@ config MAC80211_MLME_DEBUG config MAC80211_STA_DEBUG bool "Verbose station debugging" depends on MAC80211_DEBUG_MENU - ---help--- + help Selecting this option causes mac80211 to print out debugging messages for station addition/removal. @@ -153,7 +153,7 @@ config MAC80211_STA_DEBUG config MAC80211_HT_DEBUG bool "Verbose HT debugging" depends on MAC80211_DEBUG_MENU - ---help--- + help This option enables 802.11n High Throughput features debug tracing output. @@ -165,7 +165,7 @@ config MAC80211_HT_DEBUG config MAC80211_OCB_DEBUG bool "Verbose OCB debugging" depends on MAC80211_DEBUG_MENU - ---help--- + help Selecting this option causes mac80211 to print out very verbose OCB debugging messages. It should not be selected on production systems as those messages @@ -176,7 +176,7 @@ config MAC80211_OCB_DEBUG config MAC80211_IBSS_DEBUG bool "Verbose IBSS debugging" depends on MAC80211_DEBUG_MENU - ---help--- + help Selecting this option causes mac80211 to print out very verbose IBSS debugging messages. It should not be selected on production systems as those messages @@ -187,7 +187,7 @@ config MAC80211_IBSS_DEBUG config MAC80211_PS_DEBUG bool "Verbose powersave mode debugging" depends on MAC80211_DEBUG_MENU - ---help--- + help Selecting this option causes mac80211 to print out very verbose power save mode debugging messages (when mac80211 is an AP and has power saving stations.) @@ -200,7 +200,7 @@ config MAC80211_MPL_DEBUG bool "Verbose mesh peer link debugging" depends on MAC80211_DEBUG_MENU depends on MAC80211_MESH - ---help--- + help Selecting this option causes mac80211 to print out very verbose mesh peer link debugging messages (when mac80211 is taking part in a mesh network). @@ -213,7 +213,7 @@ config MAC80211_MPATH_DEBUG bool "Verbose mesh path debugging" depends on MAC80211_DEBUG_MENU depends on MAC80211_MESH - ---help--- + help Selecting this option causes mac80211 to print out very verbose mesh path selection debugging messages (when mac80211 is taking part in a mesh network). @@ -226,7 +226,7 @@ config MAC80211_MHWMP_DEBUG bool "Verbose mesh HWMP routing debugging" depends on MAC80211_DEBUG_MENU depends on MAC80211_MESH - ---help--- + help Selecting this option causes mac80211 to print out very verbose mesh routing (HWMP) debugging messages (when mac80211 is taking part in a mesh network). 
@@ -239,7 +239,7 @@ config MAC80211_MESH_SYNC_DEBUG bool "Verbose mesh synchronization debugging" depends on MAC80211_DEBUG_MENU depends on MAC80211_MESH - ---help--- + help Selecting this option causes mac80211 to print out very verbose mesh synchronization debugging messages (when mac80211 is taking part in a mesh network). @@ -250,7 +250,7 @@ config MAC80211_MESH_CSA_DEBUG bool "Verbose mesh channel switch debugging" depends on MAC80211_DEBUG_MENU depends on MAC80211_MESH - ---help--- + help Selecting this option causes mac80211 to print out very verbose mesh channel switch debugging messages (when mac80211 is taking part in a mesh network). @@ -261,7 +261,7 @@ config MAC80211_MESH_PS_DEBUG bool "Verbose mesh powersave debugging" depends on MAC80211_DEBUG_MENU depends on MAC80211_MESH - ---help--- + help Selecting this option causes mac80211 to print out very verbose mesh powersave debugging messages (when mac80211 is taking part in a mesh network). @@ -271,7 +271,7 @@ config MAC80211_MESH_PS_DEBUG config MAC80211_TDLS_DEBUG bool "Verbose TDLS debugging" depends on MAC80211_DEBUG_MENU - ---help--- + help Selecting this option causes mac80211 to print out very verbose TDLS selection debugging messages (when mac80211 is a TDLS STA). @@ -284,7 +284,7 @@ config MAC80211_DEBUG_COUNTERS bool "Extra statistics for TX/RX debugging" depends on MAC80211_DEBUG_MENU depends on MAC80211_DEBUGFS - ---help--- + help Selecting this option causes mac80211 to keep additional and very verbose statistics about TX and RX handler use as well as a few selected dot11 counters. These will be @@ -298,7 +298,7 @@ config MAC80211_DEBUG_COUNTERS config MAC80211_STA_HASH_MAX_SIZE int "Station hash table maximum size" if MAC80211_DEBUG_MENU default 0 - ---help--- + help Setting this option to a low value (e.g. 4) allows testing the hash table with collisions relatively deterministically (just connect more stations than the number selected here.) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 5820ef02a587..b2a9d47cf86d 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -167,6 +167,8 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, ret = IEEE80211_STA_DISABLE_HT | IEEE80211_STA_DISABLE_VHT | IEEE80211_STA_DISABLE_HE; + else + ret = 0; vht_chandef = *chandef; goto out; } diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 21854a61a2b7..a88ab6fb16f2 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -4694,7 +4694,7 @@ void ieee80211_rx_napi(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, * rate_idx is MCS index, which can be [0-76] * as documented on: * - * http://wireless.kernel.org/en/developers/Documentation/ieee80211/802.11n + * https://wireless.wiki.kernel.org/en/developers/Documentation/ieee80211/802.11n * * Anything else would be some sort of driver or * hardware error. The driver should catch hardware diff --git a/net/mac802154/Kconfig b/net/mac802154/Kconfig index 742624e4f7bb..901167b1e6f5 100644 --- a/net/mac802154/Kconfig +++ b/net/mac802154/Kconfig @@ -8,7 +8,7 @@ config MAC802154 select CRYPTO_CCM select CRYPTO_CTR select CRYPTO_AES - ---help--- + help This option enables the hardware independent IEEE 802.15.4 networking stack for SoftMAC devices (the ones implementing only PHY level of IEEE 802.15.4 standard). 
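The two-line mlme.c fix above is the classic missing-else bug: ieee80211_determine_chantype() assigned its disable-flags result only on the failure branch, so the success path could hand back whatever happened to be in ret. A minimal userspace reduction of the pattern and its fix (flag names illustrative):

#include <stdio.h>

#define FLAG_DISABLE_HT  0x1
#define FLAG_DISABLE_VHT 0x2

/* Every path that reaches the common exit must assign ret; without
 * the else branch the caller would read an indeterminate value as a
 * flags word.
 */
static int determine_flags(int bad_channel)
{
	int ret;

	if (bad_channel)
		ret = FLAG_DISABLE_HT | FLAG_DISABLE_VHT;
	else
		ret = 0;	/* the added branch: nothing to disable */

	return ret;
}

int main(void)
{
	printf("good channel -> flags 0x%x\n", determine_flags(0));
	printf("bad channel  -> flags 0x%x\n", determine_flags(1));
	return 0;
}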
diff --git a/net/mpls/Kconfig b/net/mpls/Kconfig index d1ad69b7942a..d672ab72ab12 100644 --- a/net/mpls/Kconfig +++ b/net/mpls/Kconfig @@ -6,7 +6,7 @@ menuconfig MPLS bool "MultiProtocol Label Switching" default n - ---help--- + help MultiProtocol Label Switching routes packets through logical circuits. Originally conceived as a way of routing packets at hardware speeds (before hardware was capable of routing ipv4 packets), @@ -27,13 +27,13 @@ config MPLS_ROUTING tristate "MPLS: routing support" depends on NET_IP_TUNNEL || NET_IP_TUNNEL=n depends on PROC_SYSCTL - ---help--- + help Add support for forwarding of mpls packets. config MPLS_IPTUNNEL tristate "MPLS: IP over MPLS tunnel support" depends on LWTUNNEL && MPLS_ROUTING - ---help--- + help mpls ip tunnel support. endif # MPLS diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 01f1f4cf4902..490b92534afc 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -273,6 +273,8 @@ static void mptcp_parse_option(const struct sk_buff *skb, if (opsize != TCPOLEN_MPTCP_RM_ADDR_BASE) break; + ptr++; + mp_opt->rm_addr = 1; mp_opt->rm_id = *ptr++; pr_debug("RM_ADDR: id=%d", mp_opt->rm_id); diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 14b253d10ccf..3980fbb6f31e 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -374,6 +374,27 @@ void mptcp_subflow_eof(struct sock *sk) sock_hold(sk); } +static void mptcp_check_for_eof(struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; + int receivers = 0; + + mptcp_for_each_subflow(msk, subflow) + receivers += !subflow->rx_eof; + + if (!receivers && !(sk->sk_shutdown & RCV_SHUTDOWN)) { + /* hopefully temporary hack: propagate shutdown status + * to msk, when all subflows agree on it + */ + sk->sk_shutdown |= RCV_SHUTDOWN; + + smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ + set_bit(MPTCP_DATA_READY, &msk->flags); + sk->sk_data_ready(sk); + } +} + static void mptcp_stop_timer(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -1011,6 +1032,9 @@ fallback: break; } + if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags)) + mptcp_check_for_eof(msk); + if (sk->sk_shutdown & RCV_SHUTDOWN) break; @@ -1148,27 +1172,6 @@ static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu) return 0; } -static void mptcp_check_for_eof(struct mptcp_sock *msk) -{ - struct mptcp_subflow_context *subflow; - struct sock *sk = (struct sock *)msk; - int receivers = 0; - - mptcp_for_each_subflow(msk, subflow) - receivers += !subflow->rx_eof; - - if (!receivers && !(sk->sk_shutdown & RCV_SHUTDOWN)) { - /* hopefully temporary hack: propagate shutdown status - * to msk, when all subflows agree on it - */ - sk->sk_shutdown |= RCV_SHUTDOWN; - - smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ - set_bit(MPTCP_DATA_READY, &msk->flags); - sk->sk_data_ready(sk); - } -} - static void mptcp_worker(struct work_struct *work) { struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 809687d3f410..c6eeaf3e8dcb 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -135,8 +135,6 @@ static inline __be32 mptcp_option(u8 subopt, u8 len, u8 nib, u8 field) ((nib & 0xF) << 8) | field); } -#define MPTCP_PM_MAX_ADDR 4 - struct mptcp_addr_info { sa_family_t family; __be16 port; @@ -234,10 +232,7 @@ static inline struct mptcp_data_frag *mptcp_rtx_head(const struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); - if 
(list_empty(&msk->rtx_queue)) - return NULL; - - return list_first_entry(&msk->rtx_queue, struct mptcp_data_frag, list); + return list_first_entry_or_null(&msk->rtx_queue, struct mptcp_data_frag, list); } struct mptcp_subflow_request_sock { @@ -254,6 +249,7 @@ struct mptcp_subflow_request_sock { u64 thmac; u32 local_nonce; u32 remote_nonce; + struct mptcp_sock *msk; }; static inline struct mptcp_subflow_request_sock * diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 493b98a0825c..3838a0b3a21f 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -69,6 +69,9 @@ static void subflow_req_destructor(struct request_sock *req) pr_debug("subflow_req=%p", subflow_req); + if (subflow_req->msk) + sock_put((struct sock *)subflow_req->msk); + if (subflow_req->mp_capable) mptcp_token_destroy_request(subflow_req->token); tcp_request_sock_ops.destructor(req); @@ -86,8 +89,8 @@ static void subflow_generate_hmac(u64 key1, u64 key2, u32 nonce1, u32 nonce2, } /* validate received token and create truncated hmac and nonce for SYN-ACK */ -static bool subflow_token_join_request(struct request_sock *req, - const struct sk_buff *skb) +static struct mptcp_sock *subflow_token_join_request(struct request_sock *req, + const struct sk_buff *skb) { struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); u8 hmac[SHA256_DIGEST_SIZE]; @@ -97,13 +100,13 @@ static bool subflow_token_join_request(struct request_sock *req, msk = mptcp_token_get_sock(subflow_req->token); if (!msk) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINNOTOKEN); - return false; + return NULL; } local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)req); if (local_id < 0) { sock_put((struct sock *)msk); - return false; + return NULL; } subflow_req->local_id = local_id; @@ -114,9 +117,7 @@ static bool subflow_token_join_request(struct request_sock *req, subflow_req->remote_nonce, hmac); subflow_req->thmac = get_unaligned_be64(hmac); - - sock_put((struct sock *)msk); - return true; + return msk; } static void subflow_init_req(struct request_sock *req, @@ -133,6 +134,7 @@ static void subflow_init_req(struct request_sock *req, subflow_req->mp_capable = 0; subflow_req->mp_join = 0; + subflow_req->msk = NULL; #ifdef CONFIG_TCP_MD5SIG /* no MPTCP if MD5SIG is enabled on this socket or we may run out of @@ -166,12 +168,9 @@ static void subflow_init_req(struct request_sock *req, subflow_req->remote_id = mp_opt.join_id; subflow_req->token = mp_opt.token; subflow_req->remote_nonce = mp_opt.nonce; - pr_debug("token=%u, remote_nonce=%u", subflow_req->token, - subflow_req->remote_nonce); - if (!subflow_token_join_request(req, skb)) { - subflow_req->mp_join = 0; - // @@ need to trigger RST - } + subflow_req->msk = subflow_token_join_request(req, skb); + pr_debug("token=%u, remote_nonce=%u msk=%p", subflow_req->token, + subflow_req->remote_nonce, subflow_req->msk); } } @@ -354,10 +353,9 @@ static bool subflow_hmac_valid(const struct request_sock *req, const struct mptcp_subflow_request_sock *subflow_req; u8 hmac[SHA256_DIGEST_SIZE]; struct mptcp_sock *msk; - bool ret; subflow_req = mptcp_subflow_rsk(req); - msk = mptcp_token_get_sock(subflow_req->token); + msk = subflow_req->msk; if (!msk) return false; @@ -365,12 +363,7 @@ static bool subflow_hmac_valid(const struct request_sock *req, subflow_req->remote_nonce, subflow_req->local_nonce, hmac); - ret = true; - if (crypto_memneq(hmac, mp_opt->hmac, MPTCPOPT_HMAC_LEN)) - ret = false; - - sock_put((struct sock *)msk); - return ret; + return !crypto_memneq(hmac, mp_opt->hmac, 
MPTCPOPT_HMAC_LEN); } static void mptcp_sock_destruct(struct sock *sk) @@ -393,6 +386,7 @@ static void mptcp_sock_destruct(struct sock *sk) sock_orphan(sk); } + mptcp_token_destroy(mptcp_sk(sk)->token); inet_sock_destruct(sk); } @@ -437,22 +431,25 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk); struct mptcp_subflow_request_sock *subflow_req; struct mptcp_options_received mp_opt; - bool fallback_is_fatal = false; + bool fallback, fallback_is_fatal; struct sock *new_msk = NULL; - bool fallback = false; struct sock *child; pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn); - /* we need later a valid 'mp_capable' value even when options are not - * parsed + /* After child creation we must look for 'mp_capable' even when options + * are not parsed */ mp_opt.mp_capable = 0; - if (tcp_rsk(req)->is_mptcp == 0) + + /* hopefully temporary handling for MP_JOIN+syncookie */ + subflow_req = mptcp_subflow_rsk(req); + fallback_is_fatal = subflow_req->mp_join; + fallback = !tcp_rsk(req)->is_mptcp; + if (fallback) goto create_child; /* if the sk is MP_CAPABLE, we try to fetch the client key */ - subflow_req = mptcp_subflow_rsk(req); if (subflow_req->mp_capable) { if (TCP_SKB_CB(skb)->seq != subflow_req->ssn_offset + 1) { /* here we can receive and accept an in-window, @@ -473,12 +470,11 @@ create_msk: if (!new_msk) fallback = true; } else if (subflow_req->mp_join) { - fallback_is_fatal = true; mptcp_get_options(skb, &mp_opt); if (!mp_opt.mp_join || !subflow_hmac_valid(req, &mp_opt)) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC); - return NULL; + fallback = true; } } @@ -521,10 +517,12 @@ create_child: } else if (ctx->mp_join) { struct mptcp_sock *owner; - owner = mptcp_token_get_sock(ctx->token); + owner = subflow_req->msk; if (!owner) goto dispose_child; + /* move the msk reference ownership to the subflow */ + subflow_req->msk = NULL; ctx->conn = (struct sock *)owner; if (!mptcp_finish_join(child)) goto dispose_child; @@ -1052,8 +1050,10 @@ int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock) err = tcp_set_ulp(sf->sk, "mptcp"); release_sock(sf->sk); - if (err) + if (err) { + sock_release(sf); return err; + } /* the newly created socket really belongs to the owning MPTCP master * socket, even if for additional subflows the allocation is performed diff --git a/net/ncsi/Kconfig b/net/ncsi/Kconfig index 2f1e5756c03a..93309081f5a4 100644 --- a/net/ncsi/Kconfig +++ b/net/ncsi/Kconfig @@ -6,7 +6,7 @@ config NET_NCSI bool "NCSI interface support" depends on INET - ---help--- + help This module provides NCSI (Network Controller Sideband Interface) support. Enable this only if your system connects to a network device via NCSI and the ethernet driver you're using supports @@ -14,6 +14,6 @@ config NET_NCSI config NCSI_OEM_CMD_GET_MAC bool "Get NCSI OEM MAC Address" depends on NET_NCSI - ---help--- + help This allows to get MAC address from NCSI firmware and set them back to controller. diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 3a3915d2e1ea..0ffe2b8723c4 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -120,7 +120,7 @@ config NF_CONNTRACK_PROCFS bool "Supply CT list in procfs (OBSOLETE)" default y depends on PROC_FS - ---help--- + help This option enables for the list of known conntrack entries to be shown in procfs under net/netfilter/nf_conntrack. 
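Among the subflow.c changes above, the mptcp_subflow_create_socket() hunk closes a resource leak: once the subflow socket exists, a failure in tcp_set_ulp() must release it via sock_release() before returning. A compilable userspace model of that cleanup-on-error pattern (struct ksock and setup_ulp are stand-ins, not kernel APIs):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct ksock { int fd; };

static int setup_ulp(struct ksock *s)
{
	(void)s;
	return -EINVAL;		/* simulate tcp_set_ulp() failing */
}

static int create_subflow(struct ksock **out)
{
	struct ksock *sf = calloc(1, sizeof(*sf));
	int err;

	if (!sf)
		return -ENOMEM;

	err = setup_ulp(sf);
	if (err) {
		free(sf);	/* the fix: release the socket on this error path */
		return err;
	}

	*out = sf;
	return 0;
}

int main(void)
{
	struct ksock *sf = NULL;
	printf("create_subflow: %d\n", create_subflow(&sf));
	return 0;
}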
This is considered obsolete in favor of using the conntrack(8) @@ -717,7 +717,7 @@ comment "Xtables combined modules" config NETFILTER_XT_MARK tristate 'nfmark target and match support' default m if NETFILTER_ADVANCED=n - ---help--- + help This option adds the "MARK" target and "mark" match. Netfilter mark matching allows you to match packets based on the @@ -733,7 +733,7 @@ config NETFILTER_XT_CONNMARK depends on NF_CONNTRACK depends on NETFILTER_ADVANCED select NF_CONNTRACK_MARK - ---help--- + help This option adds the "CONNMARK" target and "connmark" match. Netfilter allows you to store a mark value per connection (a.k.a. @@ -760,7 +760,7 @@ config NETFILTER_XT_TARGET_AUDIT tristate "AUDIT target support" depends on AUDIT depends on NETFILTER_ADVANCED - ---help--- + help This option adds a 'AUDIT' target, which can be used to create audit records for packets dropped/accepted. @@ -770,7 +770,7 @@ config NETFILTER_XT_TARGET_CHECKSUM tristate "CHECKSUM target support" depends on IP_NF_MANGLE || IP6_NF_MANGLE depends on NETFILTER_ADVANCED - ---help--- + help This option adds a `CHECKSUM' target, which can be used in the iptables mangle table to work around buggy DHCP clients in virtualized environments. @@ -799,7 +799,7 @@ config NETFILTER_XT_TARGET_CONNMARK depends on NF_CONNTRACK depends on NETFILTER_ADVANCED select NETFILTER_XT_CONNMARK - ---help--- + help This is a backwards-compat option for the user's convenience (e.g. when running oldconfig). It selects CONFIG_NETFILTER_XT_CONNMARK (combined connmark/CONNMARK module). @@ -848,7 +848,7 @@ config NETFILTER_XT_TARGET_HL tristate '"HL" hoplimit target support' depends on IP_NF_MANGLE || IP6_NF_MANGLE depends on NETFILTER_ADVANCED - ---help--- + help This option adds the "HL" (for IPv6) and "TTL" (for IPv4) targets, which enable the user to change the hoplimit/time-to-live value of the IP header. @@ -863,7 +863,7 @@ config NETFILTER_XT_TARGET_HMARK tristate '"HMARK" target support' depends on IP6_NF_IPTABLES || IP6_NF_IPTABLES=n depends on NETFILTER_ADVANCED - ---help--- + help This option adds the "HMARK" target. The target allows you to create rules in the "raw" and "mangle" tables @@ -925,7 +925,7 @@ config NETFILTER_XT_TARGET_MARK tristate '"MARK" target support' depends on NETFILTER_ADVANCED select NETFILTER_XT_MARK - ---help--- + help This is a backwards-compat option for the user's convenience (e.g. when running oldconfig). It selects CONFIG_NETFILTER_XT_MARK (combined mark/MARK module). @@ -933,7 +933,7 @@ config NETFILTER_XT_TARGET_MARK config NETFILTER_XT_NAT tristate '"SNAT and DNAT" targets support' depends on NF_NAT - ---help--- + help This option enables the SNAT and DNAT targets. To compile it as a module, choose M here. If unsure, say N. @@ -941,7 +941,7 @@ config NETFILTER_XT_NAT config NETFILTER_XT_TARGET_NETMAP tristate '"NETMAP" target support' depends on NF_NAT - ---help--- + help NETMAP is an implementation of static 1:1 NAT mapping of network addresses. It maps the network address part, while keeping the host address part intact. @@ -991,7 +991,7 @@ config NETFILTER_XT_TARGET_REDIRECT tristate "REDIRECT target support" depends on NF_NAT select NF_NAT_REDIRECT - ---help--- + help REDIRECT is a special case of NAT: all incoming connections are mapped onto the incoming interface's address, causing the packets to come to the local machine instead of passing through. 
This is @@ -1021,7 +1021,7 @@ config NETFILTER_XT_TARGET_TEE depends on IP6_NF_IPTABLES || !IP6_NF_IPTABLES select NF_DUP_IPV4 select NF_DUP_IPV6 if IP6_NF_IPTABLES - ---help--- + help This option adds a "TEE" target with which a packet can be cloned and this clone be rerouted to another nexthop. @@ -1073,7 +1073,7 @@ config NETFILTER_XT_TARGET_TCPMSS tristate '"TCPMSS" target support' depends on IPV6 || IPV6=n default m if NETFILTER_ADVANCED=n - ---help--- + help This option adds a `TCPMSS' target, which allows you to alter the MSS value of TCP SYN packets, to control the maximum size for that connection (usually limiting it to your outgoing interface's MTU @@ -1111,7 +1111,7 @@ comment "Xtables matches" config NETFILTER_XT_MATCH_ADDRTYPE tristate '"addrtype" address type match support' default m if NETFILTER_ADVANCED=n - ---help--- + help This option allows you to match what routing thinks of an address, eg. UNICAST, LOCAL, BROADCAST, ... @@ -1132,7 +1132,7 @@ config NETFILTER_XT_MATCH_CGROUP depends on NETFILTER_ADVANCED depends on CGROUPS select CGROUP_NET_CLASSID - ---help--- + help Socket/process control group matching allows you to match locally generated packets based on which net_cls control group processes belong to. @@ -1141,7 +1141,7 @@ config NETFILTER_XT_MATCH_CLUSTER tristate '"cluster" match support' depends on NF_CONNTRACK depends on NETFILTER_ADVANCED - ---help--- + help This option allows you to build work-load-sharing clusters of network servers/stateful firewalls without having a dedicated load-balancing router/server/switch. Basically, this match returns @@ -1179,7 +1179,7 @@ config NETFILTER_XT_MATCH_CONNLABEL select NF_CONNTRACK_LABELS depends on NF_CONNTRACK depends on NETFILTER_ADVANCED - ---help--- + help This match allows you to test and assign userspace-defined labels names to a connection. The kernel only stores bit values - mapping names to bits is done by userspace. @@ -1192,7 +1192,7 @@ config NETFILTER_XT_MATCH_CONNLIMIT depends on NF_CONNTRACK depends on NETFILTER_ADVANCED select NETFILTER_CONNCOUNT - ---help--- + help This match allows you to match against the number of parallel connections to a server per client IP address (or address block). @@ -1201,7 +1201,7 @@ config NETFILTER_XT_MATCH_CONNMARK depends on NF_CONNTRACK depends on NETFILTER_ADVANCED select NETFILTER_XT_CONNMARK - ---help--- + help This is a backwards-compat option for the user's convenience (e.g. when running oldconfig). It selects CONFIG_NETFILTER_XT_CONNMARK (combined connmark/CONNMARK module). @@ -1267,7 +1267,7 @@ config NETFILTER_XT_MATCH_DSCP config NETFILTER_XT_MATCH_ECN tristate '"ecn" match support' depends on NETFILTER_ADVANCED - ---help--- + help This option adds an "ECN" match, which allows you to match against the IPv4 and TCP header ECN fields. @@ -1310,7 +1310,7 @@ config NETFILTER_XT_MATCH_HELPER config NETFILTER_XT_MATCH_HL tristate '"hl" hoplimit/TTL match support' depends on NETFILTER_ADVANCED - ---help--- + help HL matching allows you to match packets based on the hoplimit in the IPv6 header, or the time-to-live field in the IPv4 header of the packet. @@ -1327,7 +1327,7 @@ config NETFILTER_XT_MATCH_IPCOMP config NETFILTER_XT_MATCH_IPRANGE tristate '"iprange" address range match support' depends on NETFILTER_ADVANCED - ---help--- + help This option adds a "iprange" match, which allows you to match based on an IP address range. (Normal iptables only matches on single addresses with an optional mask.) 
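To make the "iprange" help text above concrete: the match reduces to an inclusive comparison of a host-byte-order IPv4 address against a lower and upper bound. A userspace sketch of that comparison only, not the kernel's xt_iprange implementation:

#include <arpa/inet.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool ipv4_in_range(const char *addr, const char *lo, const char *hi)
{
	/* ntohl() so numerically adjacent addresses compare correctly */
	uint32_t a = ntohl(inet_addr(addr));
	uint32_t l = ntohl(inet_addr(lo));
	uint32_t h = ntohl(inet_addr(hi));

	return a >= l && a <= h;
}

int main(void)
{
	printf("%d\n", ipv4_in_range("10.0.0.5", "10.0.0.1", "10.0.0.9"));	/* 1 */
	printf("%d\n", ipv4_in_range("10.0.1.5", "10.0.0.1", "10.0.0.9"));	/* 0 */
	return 0;
}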
@@ -1348,7 +1348,7 @@ config NETFILTER_XT_MATCH_L2TP tristate '"l2tp" match support' depends on NETFILTER_ADVANCED default L2TP - ---help--- + help This option adds an "L2TP" match, which allows you to match against L2TP protocol header fields. @@ -1386,7 +1386,7 @@ config NETFILTER_XT_MATCH_MARK tristate '"mark" match support' depends on NETFILTER_ADVANCED select NETFILTER_XT_MARK - ---help--- + help This is a backwards-compat option for the user's convenience (e.g. when running oldconfig). It selects CONFIG_NETFILTER_XT_MARK (combined mark/MARK module). @@ -1428,7 +1428,7 @@ config NETFILTER_XT_MATCH_OSF config NETFILTER_XT_MATCH_OWNER tristate '"owner" match support' depends on NETFILTER_ADVANCED - ---help--- + help Socket owner matching allows you to match locally-generated packets based on who created the socket: the user or group. It is also possible to check whether a socket actually exists. @@ -1503,7 +1503,7 @@ config NETFILTER_XT_MATCH_REALM config NETFILTER_XT_MATCH_RECENT tristate '"recent" match support' depends on NETFILTER_ADVANCED - ---help--- + help This match is used for creating one or many lists of recently used addresses and then matching against that/those list(s). @@ -1586,7 +1586,7 @@ config NETFILTER_XT_MATCH_TCPMSS config NETFILTER_XT_MATCH_TIME tristate '"time" match support' depends on NETFILTER_ADVANCED - ---help--- + help This option adds a "time" match, which allows you to match based on the packet arrival time (at the machine which netfilter is running) or departure time/date (for locally generated packets). @@ -1600,7 +1600,7 @@ config NETFILTER_XT_MATCH_TIME config NETFILTER_XT_MATCH_U32 tristate '"u32" match support' depends on NETFILTER_ADVANCED - ---help--- + help u32 allows you to extract quantities of up to 4 bytes from a packet, AND them with specified masks, shift them by specified amounts and test whether the results are in any of a set of specified ranges. diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 78f046ec506f..3ac7c8c1548d 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -376,7 +376,7 @@ static bool nf_remove_net_hook(struct nf_hook_entries *old, if (orig_ops[i] != unreg) continue; WRITE_ONCE(old->hooks[i].hook, accept_all); - WRITE_ONCE(orig_ops[i], &dummy_ops); + WRITE_ONCE(orig_ops[i], (void *)&dummy_ops); return true; } diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig index 5b672e05d758..2c1593089ede 100644 --- a/net/netfilter/ipvs/Kconfig +++ b/net/netfilter/ipvs/Kconfig @@ -6,7 +6,7 @@ menuconfig IP_VS tristate "IP virtual server support" depends on NET && INET && NETFILTER depends on (NF_CONNTRACK || NF_CONNTRACK=n) - ---help--- + help IP Virtual Server support will let you build a high-performance virtual server based on cluster of two or more real servers. This option must be enabled for at least one of the clustered computers @@ -31,14 +31,14 @@ config IP_VS_IPV6 depends on IPV6 = y || IP_VS = IPV6 select IP6_NF_IPTABLES select NF_DEFRAG_IPV6 - ---help--- + help Add IPv6 support to IPVS. Say Y if unsure. config IP_VS_DEBUG bool "IP virtual server debugging" - ---help--- + help Say Y here if you want to get additional messages useful in debugging the IP virtual server code. You can change the debug level in /proc/sys/net/ipv4/vs/debug_level @@ -47,7 +47,7 @@ config IP_VS_TAB_BITS int "IPVS connection table size (the Nth power of 2)" range 8 20 default 12 - ---help--- + help The IPVS connection hash table uses the chaining scheme to handle hash collisions.
Using a big IPVS connection hash table will greatly reduce conflicts when there are hundreds of thousands of connections @@ -78,13 +78,13 @@ comment "IPVS transport protocol load balancing support" config IP_VS_PROTO_TCP bool "TCP load balancing support" - ---help--- + help This option enables support for load balancing TCP transport protocol. Say Y if unsure. config IP_VS_PROTO_UDP bool "UDP load balancing support" - ---help--- + help This option enables support for load balancing UDP transport protocol. Say Y if unsure. @@ -93,20 +93,20 @@ config IP_VS_PROTO_AH_ESP config IP_VS_PROTO_ESP bool "ESP load balancing support" - ---help--- + help This option enables support for load balancing ESP (Encapsulation Security Payload) transport protocol. Say Y if unsure. config IP_VS_PROTO_AH bool "AH load balancing support" - ---help--- + help This option enables support for load balancing AH (Authentication Header) transport protocol. Say Y if unsure. config IP_VS_PROTO_SCTP bool "SCTP load balancing support" select LIBCRC32C - ---help--- + help This option enables support for load balancing SCTP transport protocol. Say Y if unsure. @@ -114,7 +114,7 @@ comment "IPVS scheduler" config IP_VS_RR tristate "round-robin scheduling" - ---help--- + help The robin-robin scheduling algorithm simply directs network connections to different real servers in a round-robin manner. @@ -123,7 +123,7 @@ config IP_VS_RR config IP_VS_WRR tristate "weighted round-robin scheduling" - ---help--- + help The weighted robin-robin scheduling algorithm directs network connections to different real servers based on server weights in a round-robin manner. Servers with higher weights receive @@ -136,7 +136,7 @@ config IP_VS_WRR config IP_VS_LC tristate "least-connection scheduling" - ---help--- + help The least-connection scheduling algorithm directs network connections to the server with the least number of active connections. @@ -146,7 +146,7 @@ config IP_VS_LC config IP_VS_WLC tristate "weighted least-connection scheduling" - ---help--- + help The weighted least-connection scheduling algorithm directs network connections to the server with the least active connections normalized by the server weight. @@ -156,7 +156,7 @@ config IP_VS_WLC config IP_VS_FO tristate "weighted failover scheduling" - ---help--- + help The weighted failover scheduling algorithm directs network connections to the server with the highest weight that is currently available. @@ -166,7 +166,7 @@ config IP_VS_FO config IP_VS_OVF tristate "weighted overflow scheduling" - ---help--- + help The weighted overflow scheduling algorithm directs network connections to the server with the highest weight that is currently available and overflows to the next when active @@ -177,7 +177,7 @@ config IP_VS_OVF config IP_VS_LBLC tristate "locality-based least-connection scheduling" - ---help--- + help The locality-based least-connection scheduling algorithm is for destination IP load balancing. It is usually used in cache cluster. This algorithm usually directs packet destined for an IP address to @@ -191,7 +191,7 @@ config IP_VS_LBLC config IP_VS_LBLCR tristate "locality-based least-connection with replication scheduling" - ---help--- + help The locality-based least-connection with replication scheduling algorithm is also for destination IP load balancing. It is usually used in cache cluster. 
It differs from the LBLC scheduling @@ -209,7 +209,7 @@ config IP_VS_LBLCR config IP_VS_DH tristate "destination hashing scheduling" - ---help--- + help The destination hashing scheduling algorithm assigns network connections to the servers through looking up a statically assigned hash table by their destination IP addresses. @@ -219,7 +219,7 @@ config IP_VS_DH config IP_VS_SH tristate "source hashing scheduling" - ---help--- + help The source hashing scheduling algorithm assigns network connections to the servers through looking up a statically assigned hash table by their source IP addresses. @@ -229,7 +229,7 @@ config IP_VS_SH config IP_VS_MH tristate "maglev hashing scheduling" - ---help--- + help The maglev consistent hashing scheduling algorithm provides the Google's Maglev hashing algorithm as a IPVS scheduler. It assigns network connections to the servers through looking up a statically @@ -248,7 +248,7 @@ config IP_VS_MH config IP_VS_SED tristate "shortest expected delay scheduling" - ---help--- + help The shortest expected delay scheduling algorithm assigns network connections to the server with the shortest expected delay. The expected delay that the job will experience is (Ci + 1) / Ui if @@ -261,7 +261,7 @@ config IP_VS_SED config IP_VS_NQ tristate "never queue scheduling" - ---help--- + help The never queue scheduling algorithm adopts a two-speed model. When there is an idle server available, the job will be sent to the idle server, instead of waiting for a fast one. When there @@ -278,7 +278,7 @@ config IP_VS_SH_TAB_BITS int "IPVS source hashing table size (the Nth power of 2)" range 4 20 default 8 - ---help--- + help The source hashing scheduler maps source IPs to destinations stored in a hash table. This table is tiled by each destination until all slots in the table are filled. When using weights to @@ -293,7 +293,7 @@ config IP_VS_MH_TAB_INDEX int "IPVS maglev hashing table index of size (the prime numbers)" range 8 17 default 12 - ---help--- + help The maglev hashing scheduler maps source IPs to destinations stored in a hash table. This table is assigned by a preference list of the positions to each destination until all slots in @@ -312,7 +312,7 @@ config IP_VS_FTP depends on IP_VS_PROTO_TCP && NF_CONNTRACK && NF_NAT && \ NF_CONNTRACK_FTP select IP_VS_NFCT - ---help--- + help FTP is a protocol that transfers IP address and/or port number in the payload. In the virtual server via Network Address Translation, the IP address and port number of real servers cannot be sent to @@ -326,7 +326,7 @@ config IP_VS_FTP config IP_VS_NFCT bool "Netfilter connection tracking" depends on NF_CONNTRACK - ---help--- + help The Netfilter connection tracking support allows the IPVS connection state to be exported to the Netfilter framework for filtering purposes. 
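The IP_VS_SH entry above describes the whole mechanism: hash the client's source address and use it to index a statically assigned table of real servers, sized as a power of two per IP_VS_SH_TAB_BITS. A toy sketch of that lookup (the hash function and table handling are illustrative, not the kernel's ip_vs_sh code):

#include <stdint.h>
#include <stdio.h>

#define SH_TAB_BITS 8			/* mirrors the IP_VS_SH_TAB_BITS default */
#define SH_TAB_SIZE (1u << SH_TAB_BITS)

static const char *server_tab[SH_TAB_SIZE];	/* assigned at configuration time */

static unsigned int sh_hash(uint32_t saddr)
{
	/* multiplicative hash folded down to SH_TAB_BITS bits */
	return (uint32_t)(saddr * 2654435761u) >> (32 - SH_TAB_BITS);
}

static const char *sh_schedule(uint32_t saddr)
{
	/* a given source IP always maps to the same slot, hence the same server */
	return server_tab[sh_hash(saddr)];
}

int main(void)
{
	for (unsigned int i = 0; i < SH_TAB_SIZE; i++)
		server_tab[i] = (i & 1) ? "rs-b" : "rs-a";
	printf("%s\n", sh_schedule(0x0a000005));	/* 10.0.0.5 */
	return 0;
}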
@@ -335,7 +335,7 @@ config IP_VS_PE_SIP tristate "SIP persistence engine" depends on IP_VS_PROTO_UDP depends on NF_CONNTRACK_SIP - ---help--- + help Allow persistence based on the SIP Call-ID endif # IP_VS diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index d7bd8b1f27d5..832eabecfbdd 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -939,7 +939,8 @@ ctnetlink_alloc_filter(const struct nlattr * const cda[], u8 family) filter->mark.mask = 0xffffffff; } } else if (cda[CTA_MARK_MASK]) { - return ERR_PTR(-EINVAL); + err = -EINVAL; + goto err_filter; } #endif if (!cda[CTA_FILTER]) @@ -947,15 +948,17 @@ ctnetlink_alloc_filter(const struct nlattr * const cda[], u8 family) err = ctnetlink_parse_zone(cda[CTA_ZONE], &filter->zone); if (err < 0) - return ERR_PTR(err); + goto err_filter; err = ctnetlink_parse_filter(cda[CTA_FILTER], filter); if (err < 0) - return ERR_PTR(err); + goto err_filter; if (filter->orig_flags) { - if (!cda[CTA_TUPLE_ORIG]) - return ERR_PTR(-EINVAL); + if (!cda[CTA_TUPLE_ORIG]) { + err = -EINVAL; + goto err_filter; + } err = ctnetlink_parse_tuple_filter(cda, &filter->orig, CTA_TUPLE_ORIG, @@ -963,23 +966,32 @@ ctnetlink_alloc_filter(const struct nlattr * const cda[], u8 family) &filter->zone, filter->orig_flags); if (err < 0) - return ERR_PTR(err); + goto err_filter; } if (filter->reply_flags) { - if (!cda[CTA_TUPLE_REPLY]) - return ERR_PTR(-EINVAL); + if (!cda[CTA_TUPLE_REPLY]) { + err = -EINVAL; + goto err_filter; + } err = ctnetlink_parse_tuple_filter(cda, &filter->reply, CTA_TUPLE_REPLY, filter->family, &filter->zone, filter->orig_flags); - if (err < 0) - return ERR_PTR(err); + if (err < 0) { + err = -EINVAL; + goto err_filter; + } } return filter; + +err_filter: + kfree(filter); + + return ERR_PTR(err); } static bool ctnetlink_needs_filter(u8 family, const struct nlattr * const *cda) diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 6a3034f84ab6..afa85171df38 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -387,51 +387,6 @@ static void nf_flow_offload_work_gc(struct work_struct *work) queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ); } -int nf_flow_table_offload_add_cb(struct nf_flowtable *flow_table, - flow_setup_cb_t *cb, void *cb_priv) -{ - struct flow_block *block = &flow_table->flow_block; - struct flow_block_cb *block_cb; - int err = 0; - - down_write(&flow_table->flow_block_lock); - block_cb = flow_block_cb_lookup(block, cb, cb_priv); - if (block_cb) { - err = -EEXIST; - goto unlock; - } - - block_cb = flow_block_cb_alloc(cb, cb_priv, cb_priv, NULL); - if (IS_ERR(block_cb)) { - err = PTR_ERR(block_cb); - goto unlock; - } - - list_add_tail(&block_cb->list, &block->cb_list); - -unlock: - up_write(&flow_table->flow_block_lock); - return err; -} -EXPORT_SYMBOL_GPL(nf_flow_table_offload_add_cb); - -void nf_flow_table_offload_del_cb(struct nf_flowtable *flow_table, - flow_setup_cb_t *cb, void *cb_priv) -{ - struct flow_block *block = &flow_table->flow_block; - struct flow_block_cb *block_cb; - - down_write(&flow_table->flow_block_lock); - block_cb = flow_block_cb_lookup(block, cb, cb_priv); - if (block_cb) { - list_del(&block_cb->list); - flow_block_cb_free(block_cb); - } else { - WARN_ON(true); - } - up_write(&flow_table->flow_block_lock); -} -EXPORT_SYMBOL_GPL(nf_flow_table_offload_del_cb); static int nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff, __be16 port, 
__be16 new_port) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 073aa1051d43..7647ecfa0d40 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6550,12 +6550,22 @@ err1: return err; } +static void nft_flowtable_hook_release(struct nft_flowtable_hook *flowtable_hook) +{ + struct nft_hook *this, *next; + + list_for_each_entry_safe(this, next, &flowtable_hook->list, list) { + list_del(&this->list); + kfree(this); + } +} + static int nft_delflowtable_hook(struct nft_ctx *ctx, struct nft_flowtable *flowtable) { const struct nlattr * const *nla = ctx->nla; struct nft_flowtable_hook flowtable_hook; - struct nft_hook *this, *next, *hook; + struct nft_hook *this, *hook; struct nft_trans *trans; int err; @@ -6564,33 +6574,40 @@ static int nft_delflowtable_hook(struct nft_ctx *ctx, if (err < 0) return err; - list_for_each_entry_safe(this, next, &flowtable_hook.list, list) { + list_for_each_entry(this, &flowtable_hook.list, list) { hook = nft_hook_list_find(&flowtable->hook_list, this); if (!hook) { err = -ENOENT; goto err_flowtable_del_hook; } hook->inactive = true; - list_del(&this->list); - kfree(this); } trans = nft_trans_alloc(ctx, NFT_MSG_DELFLOWTABLE, sizeof(struct nft_trans_flowtable)); - if (!trans) - return -ENOMEM; + if (!trans) { + err = -ENOMEM; + goto err_flowtable_del_hook; + } nft_trans_flowtable(trans) = flowtable; nft_trans_flowtable_update(trans) = true; INIT_LIST_HEAD(&nft_trans_flowtable_hooks(trans)); + nft_flowtable_hook_release(&flowtable_hook); list_add_tail(&trans->list, &ctx->net->nft.commit_list); return 0; err_flowtable_del_hook: - list_for_each_entry(hook, &flowtable_hook.list, list) + list_for_each_entry(this, &flowtable_hook.list, list) { + hook = nft_hook_list_find(&flowtable->hook_list, this); + if (!hook) + break; + hook->inactive = false; + } + nft_flowtable_hook_release(&flowtable_hook); return err; } diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 8b5acc6910fd..8c04388296b0 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -1242,7 +1242,9 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); } - if (!*this_cpu_ptr(m->scratch) || bsize_max > m->bsize_max) { + if (!*get_cpu_ptr(m->scratch) || bsize_max > m->bsize_max) { + put_cpu_ptr(m->scratch); + err = pipapo_realloc_scratch(m, bsize_max); if (err) return err; @@ -1250,6 +1252,8 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, this_cpu_write(nft_pipapo_scratch_index, false); m->bsize_max = bsize_max; + } else { + put_cpu_ptr(m->scratch); } *ext2 = &e->ext; diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 62f416bc0579..b6aad3fc46c3 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -271,12 +271,14 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, if (nft_rbtree_interval_start(new)) { if (nft_rbtree_interval_end(rbe) && - nft_set_elem_active(&rbe->ext, genmask)) + nft_set_elem_active(&rbe->ext, genmask) && + !nft_set_elem_expired(&rbe->ext)) overlap = false; } else { overlap = nft_rbtree_interval_end(rbe) && nft_set_elem_active(&rbe->ext, - genmask); + genmask) && + !nft_set_elem_expired(&rbe->ext); } } else if (d > 0) { p = &parent->rb_right; @@ -284,9 +286,11 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, if (nft_rbtree_interval_end(new)) { overlap 
= nft_rbtree_interval_end(rbe) && nft_set_elem_active(&rbe->ext, - genmask); + genmask) && + !nft_set_elem_expired(&rbe->ext); } else if (nft_rbtree_interval_end(rbe) && - nft_set_elem_active(&rbe->ext, genmask)) { + nft_set_elem_active(&rbe->ext, genmask) && + !nft_set_elem_expired(&rbe->ext)) { overlap = true; } } else { @@ -294,15 +298,18 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, nft_rbtree_interval_start(new)) { p = &parent->rb_left; - if (nft_set_elem_active(&rbe->ext, genmask)) + if (nft_set_elem_active(&rbe->ext, genmask) && + !nft_set_elem_expired(&rbe->ext)) overlap = false; } else if (nft_rbtree_interval_start(rbe) && nft_rbtree_interval_end(new)) { p = &parent->rb_right; - if (nft_set_elem_active(&rbe->ext, genmask)) + if (nft_set_elem_active(&rbe->ext, genmask) && + !nft_set_elem_expired(&rbe->ext)) overlap = false; - } else if (nft_set_elem_active(&rbe->ext, genmask)) { + } else if (nft_set_elem_active(&rbe->ext, genmask) && + !nft_set_elem_expired(&rbe->ext)) { *ext = &rbe->ext; return -EEXIST; } else { diff --git a/net/netlabel/Kconfig b/net/netlabel/Kconfig index 07b03c306f28..4383ac29693e 100644 --- a/net/netlabel/Kconfig +++ b/net/netlabel/Kconfig @@ -8,7 +8,7 @@ config NETLABEL depends on SECURITY select CRC_CCITT if IPV6 default n - ---help--- + help NetLabel provides support for explicit network packet labeling protocols such as CIPSO and RIPSO. For more information see Documentation/netlabel as well as the NetLabel SourceForge project diff --git a/net/netlink/Kconfig b/net/netlink/Kconfig index 20f967974da0..1039d4f2ce11 100644 --- a/net/netlink/Kconfig +++ b/net/netlink/Kconfig @@ -6,6 +6,6 @@ config NETLINK_DIAG tristate "NETLINK: socket monitoring interface" default n - ---help--- + help Support for NETLINK socket monitoring interface used by the ss tool. If unsure, say Y. diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 2f049692e012..55ee680e9db1 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -474,8 +474,7 @@ genl_family_rcv_msg_attrs_parse(const struct genl_family *family, struct netlink_ext_ack *extack, const struct genl_ops *ops, int hdrlen, - enum genl_validate_flags no_strict_flag, - bool parallel) + enum genl_validate_flags no_strict_flag) { enum netlink_validation validate = ops->validate & no_strict_flag ? 
NL_VALIDATE_LIBERAL : @@ -486,7 +485,7 @@ genl_family_rcv_msg_attrs_parse(const struct genl_family *family, if (!family->maxattr) return NULL; - if (parallel) { + if (family->parallel_ops) { attrbuf = kmalloc_array(family->maxattr + 1, sizeof(struct nlattr *), GFP_KERNEL); if (!attrbuf) @@ -498,7 +497,7 @@ genl_family_rcv_msg_attrs_parse(const struct genl_family *family, err = __nlmsg_parse(nlh, hdrlen, attrbuf, family->maxattr, family->policy, validate, extack); if (err) { - if (parallel) + if (family->parallel_ops) kfree(attrbuf); return ERR_PTR(err); } @@ -506,22 +505,63 @@ genl_family_rcv_msg_attrs_parse(const struct genl_family *family, } static void genl_family_rcv_msg_attrs_free(const struct genl_family *family, - struct nlattr **attrbuf, - bool parallel) + struct nlattr **attrbuf) { - if (parallel) + if (family->parallel_ops) kfree(attrbuf); } -static int genl_lock_start(struct netlink_callback *cb) +struct genl_start_context { + const struct genl_family *family; + struct nlmsghdr *nlh; + struct netlink_ext_ack *extack; + const struct genl_ops *ops; + int hdrlen; +}; + +static int genl_start(struct netlink_callback *cb) { - const struct genl_ops *ops = genl_dumpit_info(cb)->ops; + struct genl_start_context *ctx = cb->data; + const struct genl_ops *ops = ctx->ops; + struct genl_dumpit_info *info; + struct nlattr **attrs = NULL; int rc = 0; + if (ops->validate & GENL_DONT_VALIDATE_DUMP) + goto no_attrs; + + if (ctx->nlh->nlmsg_len < nlmsg_msg_size(ctx->hdrlen)) + return -EINVAL; + + attrs = genl_family_rcv_msg_attrs_parse(ctx->family, ctx->nlh, ctx->extack, + ops, ctx->hdrlen, + GENL_DONT_VALIDATE_DUMP_STRICT); + if (IS_ERR(attrs)) + return PTR_ERR(attrs); + +no_attrs: + info = genl_dumpit_info_alloc(); + if (!info) { + genl_family_rcv_msg_attrs_free(ctx->family, attrs); + return -ENOMEM; + } + info->family = ctx->family; + info->ops = ops; + info->attrs = attrs; + + cb->data = info; if (ops->start) { - genl_lock(); + if (!ctx->family->parallel_ops) + genl_lock(); rc = ops->start(cb); - genl_unlock(); + if (!ctx->family->parallel_ops) + genl_unlock(); + } + + if (rc) { + genl_family_rcv_msg_attrs_free(info->family, info->attrs); + genl_dumpit_info_free(info); + cb->data = NULL; } return rc; } @@ -548,7 +588,7 @@ static int genl_lock_done(struct netlink_callback *cb) rc = ops->done(cb); genl_unlock(); } - genl_family_rcv_msg_attrs_free(info->family, info->attrs, true); + genl_family_rcv_msg_attrs_free(info->family, info->attrs); genl_dumpit_info_free(info); return rc; } @@ -561,7 +601,7 @@ static int genl_parallel_done(struct netlink_callback *cb) if (ops->done) rc = ops->done(cb); - genl_family_rcv_msg_attrs_free(info->family, info->attrs, true); + genl_family_rcv_msg_attrs_free(info->family, info->attrs); genl_dumpit_info_free(info); return rc; } @@ -573,43 +613,23 @@ static int genl_family_rcv_msg_dumpit(const struct genl_family *family, const struct genl_ops *ops, int hdrlen, struct net *net) { - struct genl_dumpit_info *info; - struct nlattr **attrs = NULL; + struct genl_start_context ctx; int err; if (!ops->dumpit) return -EOPNOTSUPP; - if (ops->validate & GENL_DONT_VALIDATE_DUMP) - goto no_attrs; - - if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) - return -EINVAL; - - attrs = genl_family_rcv_msg_attrs_parse(family, nlh, extack, - ops, hdrlen, - GENL_DONT_VALIDATE_DUMP_STRICT, - true); - if (IS_ERR(attrs)) - return PTR_ERR(attrs); - -no_attrs: - /* Allocate dumpit info. It is going to be freed by done() callback. 
*/ - info = genl_dumpit_info_alloc(); - if (!info) { - genl_family_rcv_msg_attrs_free(family, attrs, true); - return -ENOMEM; - } - - info->family = family; - info->ops = ops; - info->attrs = attrs; + ctx.family = family; + ctx.nlh = nlh; + ctx.extack = extack; + ctx.ops = ops; + ctx.hdrlen = hdrlen; if (!family->parallel_ops) { struct netlink_dump_control c = { .module = family->module, - .data = info, - .start = genl_lock_start, + .data = &ctx, + .start = genl_start, .dump = genl_lock_dumpit, .done = genl_lock_done, }; @@ -617,12 +637,11 @@ no_attrs: genl_unlock(); err = __netlink_dump_start(net->genl_sock, skb, nlh, &c); genl_lock(); - } else { struct netlink_dump_control c = { .module = family->module, - .data = info, - .start = ops->start, + .data = &ctx, + .start = genl_start, .dump = ops->dumpit, .done = genl_parallel_done, }; @@ -649,8 +668,7 @@ static int genl_family_rcv_msg_doit(const struct genl_family *family, attrbuf = genl_family_rcv_msg_attrs_parse(family, nlh, extack, ops, hdrlen, - GENL_DONT_VALIDATE_STRICT, - family->parallel_ops); + GENL_DONT_VALIDATE_STRICT); if (IS_ERR(attrbuf)) return PTR_ERR(attrbuf); @@ -676,7 +694,7 @@ static int genl_family_rcv_msg_doit(const struct genl_family *family, family->post_doit(ops, skb, &info); out: - genl_family_rcv_msg_attrs_free(family, attrbuf, family->parallel_ops); + genl_family_rcv_msg_attrs_free(family, attrbuf); return err; } diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index eccc7d366e17..f90ef6934b8f 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -70,6 +70,7 @@ static const struct proto_ops nr_proto_ops; * separate class since they always nest. */ static struct lock_class_key nr_netdev_xmit_lock_key; +static struct lock_class_key nr_netdev_addr_lock_key; static void nr_set_lockdep_one(struct net_device *dev, struct netdev_queue *txq, @@ -80,6 +81,7 @@ static void nr_set_lockdep_one(struct net_device *dev, static void nr_set_lockdep_key(struct net_device *dev) { + lockdep_set_class(&dev->addr_list_lock, &nr_netdev_addr_lock_key); netdev_for_each_tx_queue(dev, nr_set_lockdep_one, NULL); } diff --git a/net/nfc/hci/Kconfig b/net/nfc/hci/Kconfig index 4822d6f46947..9500b8a27475 100644 --- a/net/nfc/hci/Kconfig +++ b/net/nfc/hci/Kconfig @@ -13,6 +13,6 @@ config NFC_SHDLC select CRC_CCITT bool "SHDLC link layer for HCI based NFC drivers" default n - ---help--- + help Say yes if you use an NFC HCI driver that requires SHDLC link layer. If unsure, say N here. diff --git a/net/nsh/Kconfig b/net/nsh/Kconfig index 19af948ab6f0..84f2a2823417 100644 --- a/net/nsh/Kconfig +++ b/net/nsh/Kconfig @@ -2,7 +2,7 @@ menuconfig NET_NSH tristate "Network Service Header (NSH) protocol" default n - ---help--- + help Network Service Header is an implementation of Service Function Chaining (RFC 7665). The current implementation in Linux supports only MD type 1 and only with the openvswitch module. diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index 22d7d5604b4c..15bd287f5cbd 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -15,7 +15,7 @@ config OPENVSWITCH select NET_MPLS_GSO select DST_CACHE select NET_NSH - ---help--- + help Open vSwitch is a multilayer Ethernet switch targeted at virtualized environments. 
In addition to supporting a variety of features expected in a traditional hardware switch, it enables fine-grained @@ -43,7 +43,7 @@ config OPENVSWITCH_GRE depends on OPENVSWITCH depends on NET_IPGRE default OPENVSWITCH - ---help--- + help If you say Y here, then the Open vSwitch will be able create GRE vport. @@ -56,7 +56,7 @@ config OPENVSWITCH_VXLAN depends on OPENVSWITCH depends on VXLAN default OPENVSWITCH - ---help--- + help If you say Y here, then the Open vSwitch will be able create vxlan vport. Say N to exclude this support and reduce the binary size. @@ -68,7 +68,7 @@ config OPENVSWITCH_GENEVE depends on OPENVSWITCH depends on GENEVE default OPENVSWITCH - ---help--- + help If you say Y here, then the Open vSwitch will be able create geneve vport. Say N to exclude this support and reduce the binary size. diff --git a/net/packet/Kconfig b/net/packet/Kconfig index b4abad135294..2997382d597c 100644 --- a/net/packet/Kconfig +++ b/net/packet/Kconfig @@ -5,7 +5,7 @@ config PACKET tristate "Packet socket" - ---help--- + help The Packet protocol is used by applications which communicate directly with network devices without an intermediate network protocol implemented in the kernel, e.g. tcpdump. If you want them @@ -20,6 +20,6 @@ config PACKET_DIAG tristate "Packet: sockets monitoring interface" depends on PACKET default n - ---help--- + help Support for PF_PACKET sockets monitoring interface used by the ss tool. If unsure, say Y. diff --git a/net/qrtr/Kconfig b/net/qrtr/Kconfig index f362ca316015..b4020b84760f 100644 --- a/net/qrtr/Kconfig +++ b/net/qrtr/Kconfig @@ -4,7 +4,7 @@ config QRTR tristate "Qualcomm IPC Router support" - ---help--- + help Say Y if you intend to use Qualcomm IPC router protocol. The protocol is used to communicate with services provided by other hardware blocks in the system. @@ -17,13 +17,13 @@ if QRTR config QRTR_SMD tristate "SMD IPC Router channels" depends on RPMSG || (COMPILE_TEST && RPMSG=n) - ---help--- + help Say Y here to support SMD based ipcrouter channels. SMD is the most common transport for IPC Router. config QRTR_TUN tristate "TUN device for Qualcomm IPC Router" - ---help--- + help Say Y here to expose a character device that allows user space to implement endpoints of QRTR, for purpose of tunneling data to other hosts or testing purposes. diff --git a/net/rds/Kconfig b/net/rds/Kconfig index c64e154bc18f..75cd696963b2 100644 --- a/net/rds/Kconfig +++ b/net/rds/Kconfig @@ -3,14 +3,14 @@ config RDS tristate "The Reliable Datagram Sockets Protocol" depends on INET - ---help--- + help The RDS (Reliable Datagram Sockets) protocol provides reliable, sequenced delivery of datagrams over Infiniband or TCP. config RDS_RDMA tristate "RDS over Infiniband" depends on RDS && INFINIBAND && INFINIBAND_ADDR_TRANS - ---help--- + help Allow RDS to use Infiniband as a transport. This transport supports RDMA operations. @@ -18,7 +18,7 @@ config RDS_TCP tristate "RDS over TCP" depends on RDS depends on IPV6 || !IPV6 - ---help--- + help Allow RDS to use TCP as a transport. This transport does not support RDMA operations. 
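Since the RDS entries above describe a socket protocol, a quick way to check whether the running kernel provides it is simply to open an RDS socket: the family is AF_RDS with SOCK_SEQPACKET semantics, and socket() fails with EAFNOSUPPORT when CONFIG_RDS is off. A minimal probe; the AF_RDS fallback value of 21 is taken from linux/socket.h and the define is only a safety net for libc headers that lack it:

#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef AF_RDS
#define AF_RDS 21	/* value from linux/socket.h */
#endif

int main(void)
{
	int fd = socket(AF_RDS, SOCK_SEQPACKET, 0);

	if (fd < 0) {
		perror("socket(AF_RDS)");	/* EAFNOSUPPORT => RDS not built */
		return 1;
	}
	close(fd);
	puts("RDS socket family available");
	return 0;
}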
diff --git a/net/rds/Makefile b/net/rds/Makefile index e647f9de104a..8fdc118e2927 100644 --- a/net/rds/Makefile +++ b/net/rds/Makefile @@ -7,7 +7,7 @@ rds-y := af_rds.o bind.o cong.o connection.o info.o message.o \ obj-$(CONFIG_RDS_RDMA) += rds_rdma.o rds_rdma-y := rdma_transport.o \ ib.o ib_cm.o ib_recv.o ib_ring.o ib_send.o ib_stats.o \ - ib_sysctl.o ib_rdma.o ib_fmr.o ib_frmr.o + ib_sysctl.o ib_rdma.o ib_frmr.o obj-$(CONFIG_RDS_TCP) += rds_tcp.o diff --git a/net/rds/ib.c b/net/rds/ib.c index a792d8a3872a..deecbdcdae84 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -127,19 +127,23 @@ void rds_ib_dev_put(struct rds_ib_device *rds_ibdev) queue_work(rds_wq, &rds_ibdev->free_work); } -static void rds_ib_add_one(struct ib_device *device) +static int rds_ib_add_one(struct ib_device *device) { struct rds_ib_device *rds_ibdev; - bool has_fr, has_fmr; + int ret; /* Only handle IB (no iWARP) devices */ if (device->node_type != RDMA_NODE_IB_CA) - return; + return -EOPNOTSUPP; + + /* Device must support FRWR */ + if (!(device->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) + return -EOPNOTSUPP; rds_ibdev = kzalloc_node(sizeof(struct rds_ib_device), GFP_KERNEL, ibdev_to_node(device)); if (!rds_ibdev) - return; + return -ENOMEM; spin_lock_init(&rds_ibdev->spinlock); refcount_set(&rds_ibdev->refcount, 1); @@ -151,11 +155,6 @@ static void rds_ib_add_one(struct ib_device *device) rds_ibdev->max_wrs = device->attrs.max_qp_wr; rds_ibdev->max_sge = min(device->attrs.max_send_sge, RDS_IB_MAX_SGE); - has_fr = (device->attrs.device_cap_flags & - IB_DEVICE_MEM_MGT_EXTENSIONS); - has_fmr = (device->ops.alloc_fmr && device->ops.dealloc_fmr && - device->ops.map_phys_fmr && device->ops.unmap_fmr); - rds_ibdev->use_fastreg = (has_fr && !has_fmr); rds_ibdev->odp_capable = !!(device->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING) && @@ -164,7 +163,6 @@ static void rds_ib_add_one(struct ib_device *device) !!(device->attrs.odp_caps.per_transport_caps.rc_odp_caps & IB_ODP_SUPPORT_READ); - rds_ibdev->fmr_max_remaps = device->attrs.max_map_per_fmr?: 32; rds_ibdev->max_1m_mrs = device->attrs.max_mr ? 
min_t(unsigned int, (device->attrs.max_mr / 2), rds_ib_mr_1m_pool_size) : rds_ib_mr_1m_pool_size; @@ -182,12 +180,14 @@ static void rds_ib_add_one(struct ib_device *device) if (!rds_ibdev->vector_load) { pr_err("RDS/IB: %s failed to allocate vector memory\n", __func__); + ret = -ENOMEM; goto put_dev; } rds_ibdev->dev = device; rds_ibdev->pd = ib_alloc_pd(device, 0); if (IS_ERR(rds_ibdev->pd)) { + ret = PTR_ERR(rds_ibdev->pd); rds_ibdev->pd = NULL; goto put_dev; } @@ -195,12 +195,15 @@ static void rds_ib_add_one(struct ib_device *device) device->dma_device, sizeof(struct rds_header), L1_CACHE_BYTES, 0); - if (!rds_ibdev->rid_hdrs_pool) + if (!rds_ibdev->rid_hdrs_pool) { + ret = -ENOMEM; goto put_dev; + } rds_ibdev->mr_1m_pool = rds_ib_create_mr_pool(rds_ibdev, RDS_IB_MR_1M_POOL); if (IS_ERR(rds_ibdev->mr_1m_pool)) { + ret = PTR_ERR(rds_ibdev->mr_1m_pool); rds_ibdev->mr_1m_pool = NULL; goto put_dev; } @@ -208,18 +211,16 @@ static void rds_ib_add_one(struct ib_device *device) rds_ibdev->mr_8k_pool = rds_ib_create_mr_pool(rds_ibdev, RDS_IB_MR_8K_POOL); if (IS_ERR(rds_ibdev->mr_8k_pool)) { + ret = PTR_ERR(rds_ibdev->mr_8k_pool); rds_ibdev->mr_8k_pool = NULL; goto put_dev; } - rdsdebug("RDS/IB: max_mr = %d, max_wrs = %d, max_sge = %d, fmr_max_remaps = %d, max_1m_mrs = %d, max_8k_mrs = %d\n", - device->attrs.max_fmr, rds_ibdev->max_wrs, rds_ibdev->max_sge, - rds_ibdev->fmr_max_remaps, rds_ibdev->max_1m_mrs, - rds_ibdev->max_8k_mrs); + rdsdebug("RDS/IB: max_mr = %d, max_wrs = %d, max_sge = %d, max_1m_mrs = %d, max_8k_mrs = %d\n", + device->attrs.max_mr, rds_ibdev->max_wrs, rds_ibdev->max_sge, + rds_ibdev->max_1m_mrs, rds_ibdev->max_8k_mrs); - pr_info("RDS/IB: %s: %s supported and preferred\n", - device->name, - rds_ibdev->use_fastreg ? "FRMR" : "FMR"); + pr_info("RDS/IB: %s: added\n", device->name); down_write(&rds_ib_devices_lock); list_add_tail_rcu(&rds_ibdev->list, &rds_ib_devices); @@ -227,12 +228,13 @@ static void rds_ib_add_one(struct ib_device *device) refcount_inc(&rds_ibdev->refcount); ib_set_client_data(device, &rds_ib_client, rds_ibdev); - refcount_inc(&rds_ibdev->refcount); rds_ib_nodev_connect(); + return 0; put_dev: rds_ib_dev_put(rds_ibdev); + return ret; } /* @@ -274,9 +276,6 @@ static void rds_ib_remove_one(struct ib_device *device, void *client_data) { struct rds_ib_device *rds_ibdev = client_data; - if (!rds_ibdev) - return; - rds_ib_dev_shutdown(rds_ibdev); /* stop connection attempts from getting a reference to this device. */ diff --git a/net/rds/ib.h b/net/rds/ib.h index 0296f1f7acda..8dfff43cf07f 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -247,13 +247,11 @@ struct rds_ib_device { struct ib_device *dev; struct ib_pd *pd; struct dma_pool *rid_hdrs_pool; /* RDS headers DMA pool */ - u8 use_fastreg:1; u8 odp_capable:1; unsigned int max_mrs; struct rds_ib_mr_pool *mr_1m_pool; struct rds_ib_mr_pool *mr_8k_pool; - unsigned int fmr_max_remaps; unsigned int max_8k_mrs; unsigned int max_1m_mrs; int max_sge; @@ -266,7 +264,13 @@ struct rds_ib_device { int *vector_load; }; -#define ibdev_to_node(ibdev) dev_to_node((ibdev)->dev.parent) +static inline int ibdev_to_node(struct ib_device *ibdev) +{ + struct device *parent; + + parent = ibdev->dev.parent; + return parent ? 
dev_to_node(parent) : NUMA_NO_NODE; +} #define rdsibdev_to_node(rdsibdev) ibdev_to_node(rdsibdev->dev) /* bits for i_ack_flags */ diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index c71f4328d138..c3319ff3ee11 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -37,6 +37,7 @@ #include <linux/vmalloc.h> #include <linux/ratelimit.h> #include <net/addrconf.h> +#include <rdma/ib_cm.h> #include "rds_single_path.h" #include "rds.h" @@ -526,10 +527,10 @@ static int rds_ib_setup_qp(struct rds_connection *conn) return -EOPNOTSUPP; /* The fr_queue_space is currently set to 512, to add extra space on - * completion queue and send queue. This extra space is used for FRMR + * completion queue and send queue. This extra space is used for FRWR * registration and invalidation work requests */ - fr_queue_space = (rds_ibdev->use_fastreg ? RDS_IB_DEFAULT_FR_WR : 0); + fr_queue_space = RDS_IB_DEFAULT_FR_WR; /* add the conn now so that connection establishment has the dev */ rds_ib_add_conn(rds_ibdev, conn); @@ -927,7 +928,8 @@ out: if (conn) mutex_unlock(&conn->c_cm_lock); if (err) - rdma_reject(cm_id, &err, sizeof(int)); + rdma_reject(cm_id, &err, sizeof(int), + IB_CM_REJ_CONSUMER_DEFINED); return destroy; } diff --git a/net/rds/ib_fmr.c b/net/rds/ib_fmr.c deleted file mode 100644 index 93c0437e6a5f..000000000000 --- a/net/rds/ib_fmr.c +++ /dev/null @@ -1,269 +0,0 @@ -/* - * Copyright (c) 2016 Oracle. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "ib_mr.h" - -struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev, int npages) -{ - struct rds_ib_mr_pool *pool; - struct rds_ib_mr *ibmr = NULL; - struct rds_ib_fmr *fmr; - int err = 0; - - if (npages <= RDS_MR_8K_MSG_SIZE) - pool = rds_ibdev->mr_8k_pool; - else - pool = rds_ibdev->mr_1m_pool; - - if (atomic_read(&pool->dirty_count) >= pool->max_items / 10) - queue_delayed_work(rds_ib_mr_wq, &pool->flush_worker, 10); - - /* Switch pools if one of the pool is reaching upper limit */ - if (atomic_read(&pool->dirty_count) >= pool->max_items * 9 / 10) { - if (pool->pool_type == RDS_IB_MR_8K_POOL) - pool = rds_ibdev->mr_1m_pool; - else - pool = rds_ibdev->mr_8k_pool; - } - - ibmr = rds_ib_try_reuse_ibmr(pool); - if (ibmr) - return ibmr; - - ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL, - rdsibdev_to_node(rds_ibdev)); - if (!ibmr) { - err = -ENOMEM; - goto out_no_cigar; - } - - fmr = &ibmr->u.fmr; - fmr->fmr = ib_alloc_fmr(rds_ibdev->pd, - (IB_ACCESS_LOCAL_WRITE | - IB_ACCESS_REMOTE_READ | - IB_ACCESS_REMOTE_WRITE | - IB_ACCESS_REMOTE_ATOMIC), - &pool->fmr_attr); - if (IS_ERR(fmr->fmr)) { - err = PTR_ERR(fmr->fmr); - fmr->fmr = NULL; - pr_warn("RDS/IB: %s failed (err=%d)\n", __func__, err); - goto out_no_cigar; - } - - ibmr->pool = pool; - if (pool->pool_type == RDS_IB_MR_8K_POOL) - rds_ib_stats_inc(s_ib_rdma_mr_8k_alloc); - else - rds_ib_stats_inc(s_ib_rdma_mr_1m_alloc); - - return ibmr; - -out_no_cigar: - kfree(ibmr); - atomic_dec(&pool->item_count); - - return ERR_PTR(err); -} - -static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, - struct rds_ib_mr *ibmr, struct scatterlist *sg, - unsigned int nents) -{ - struct ib_device *dev = rds_ibdev->dev; - struct rds_ib_fmr *fmr = &ibmr->u.fmr; - struct scatterlist *scat = sg; - u64 io_addr = 0; - u64 *dma_pages; - u32 len; - int page_cnt, sg_dma_len; - int i, j; - int ret; - - sg_dma_len = ib_dma_map_sg(dev, sg, nents, DMA_BIDIRECTIONAL); - if (unlikely(!sg_dma_len)) { - pr_warn("RDS/IB: %s failed!\n", __func__); - return -EBUSY; - } - - len = 0; - page_cnt = 0; - - for (i = 0; i < sg_dma_len; ++i) { - unsigned int dma_len = sg_dma_len(&scat[i]); - u64 dma_addr = sg_dma_address(&scat[i]); - - if (dma_addr & ~PAGE_MASK) { - if (i > 0) { - ib_dma_unmap_sg(dev, sg, nents, - DMA_BIDIRECTIONAL); - return -EINVAL; - } else { - ++page_cnt; - } - } - if ((dma_addr + dma_len) & ~PAGE_MASK) { - if (i < sg_dma_len - 1) { - ib_dma_unmap_sg(dev, sg, nents, - DMA_BIDIRECTIONAL); - return -EINVAL; - } else { - ++page_cnt; - } - } - - len += dma_len; - } - - page_cnt += len >> PAGE_SHIFT; - if (page_cnt > ibmr->pool->fmr_attr.max_pages) { - ib_dma_unmap_sg(dev, sg, nents, DMA_BIDIRECTIONAL); - return -EINVAL; - } - - dma_pages = kmalloc_array_node(sizeof(u64), page_cnt, GFP_ATOMIC, - rdsibdev_to_node(rds_ibdev)); - if (!dma_pages) { - ib_dma_unmap_sg(dev, sg, nents, DMA_BIDIRECTIONAL); - return -ENOMEM; - } - - page_cnt = 0; - for (i = 0; i < sg_dma_len; ++i) { - unsigned int dma_len = sg_dma_len(&scat[i]); - u64 dma_addr = sg_dma_address(&scat[i]); - - for (j = 0; j < dma_len; j += PAGE_SIZE) - dma_pages[page_cnt++] = - (dma_addr & PAGE_MASK) + j; - } - - ret = ib_map_phys_fmr(fmr->fmr, dma_pages, page_cnt, io_addr); - if (ret) { - ib_dma_unmap_sg(dev, sg, nents, DMA_BIDIRECTIONAL); - goto out; - } - - /* Success - we successfully remapped the MR, so we can - * safely tear down the old mapping. 
- */ - rds_ib_teardown_mr(ibmr); - - ibmr->sg = scat; - ibmr->sg_len = nents; - ibmr->sg_dma_len = sg_dma_len; - ibmr->remap_count++; - - if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL) - rds_ib_stats_inc(s_ib_rdma_mr_8k_used); - else - rds_ib_stats_inc(s_ib_rdma_mr_1m_used); - ret = 0; - -out: - kfree(dma_pages); - - return ret; -} - -struct rds_ib_mr *rds_ib_reg_fmr(struct rds_ib_device *rds_ibdev, - struct scatterlist *sg, - unsigned long nents, - u32 *key) -{ - struct rds_ib_mr *ibmr = NULL; - struct rds_ib_fmr *fmr; - int ret; - - ibmr = rds_ib_alloc_fmr(rds_ibdev, nents); - if (IS_ERR(ibmr)) - return ibmr; - - ibmr->device = rds_ibdev; - fmr = &ibmr->u.fmr; - ret = rds_ib_map_fmr(rds_ibdev, ibmr, sg, nents); - if (ret == 0) - *key = fmr->fmr->rkey; - else - rds_ib_free_mr(ibmr, 0); - - return ibmr; -} - -void rds_ib_unreg_fmr(struct list_head *list, unsigned int *nfreed, - unsigned long *unpinned, unsigned int goal) -{ - struct rds_ib_mr *ibmr, *next; - struct rds_ib_fmr *fmr; - LIST_HEAD(fmr_list); - int ret = 0; - unsigned int freed = *nfreed; - - /* String all ib_mr's onto one list and hand them to ib_unmap_fmr */ - list_for_each_entry(ibmr, list, unmap_list) { - fmr = &ibmr->u.fmr; - list_add(&fmr->fmr->list, &fmr_list); - } - - ret = ib_unmap_fmr(&fmr_list); - if (ret) - pr_warn("RDS/IB: FMR invalidation failed (err=%d)\n", ret); - - /* Now we can destroy the DMA mapping and unpin any pages */ - list_for_each_entry_safe(ibmr, next, list, unmap_list) { - fmr = &ibmr->u.fmr; - *unpinned += ibmr->sg_len; - __rds_ib_teardown_mr(ibmr); - if (freed < goal || - ibmr->remap_count >= ibmr->pool->fmr_attr.max_maps) { - if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL) - rds_ib_stats_inc(s_ib_rdma_mr_8k_free); - else - rds_ib_stats_inc(s_ib_rdma_mr_1m_free); - list_del(&ibmr->unmap_list); - ib_dealloc_fmr(fmr->fmr); - kfree(ibmr); - freed++; - } - } - *nfreed = freed; -} - -void rds_ib_free_fmr_list(struct rds_ib_mr *ibmr) -{ - struct rds_ib_mr_pool *pool = ibmr->pool; - - if (ibmr->remap_count >= pool->fmr_attr.max_maps) - llist_add(&ibmr->llnode, &pool->drop_list); - else - llist_add(&ibmr->llnode, &pool->free_list); -} diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c index 06ecf9d2d4bf..9b6ffff72f2d 100644 --- a/net/rds/ib_frmr.c +++ b/net/rds/ib_frmr.c @@ -76,7 +76,7 @@ static struct rds_ib_mr *rds_ib_alloc_frmr(struct rds_ib_device *rds_ibdev, frmr = &ibmr->u.frmr; frmr->mr = ib_alloc_mr(rds_ibdev->pd, IB_MR_TYPE_MEM_REG, - pool->fmr_attr.max_pages); + pool->max_pages); if (IS_ERR(frmr->mr)) { pr_warn("RDS/IB: %s failed to allocate MR", __func__); err = PTR_ERR(frmr->mr); @@ -240,7 +240,7 @@ static int rds_ib_map_frmr(struct rds_ib_device *rds_ibdev, } frmr->dma_npages += len >> PAGE_SHIFT; - if (frmr->dma_npages > ibmr->pool->fmr_attr.max_pages) { + if (frmr->dma_npages > ibmr->pool->max_pages) { ret = -EMSGSIZE; goto out_unmap; } diff --git a/net/rds/ib_mr.h b/net/rds/ib_mr.h index 0c8252d7fe2b..ea5e9aee4959 100644 --- a/net/rds/ib_mr.h +++ b/net/rds/ib_mr.h @@ -43,10 +43,6 @@ #define RDS_MR_8K_SCALE (256 / (RDS_MR_8K_MSG_SIZE + 1)) #define RDS_MR_8K_POOL_SIZE (RDS_MR_8K_SCALE * (8192 / 2)) -struct rds_ib_fmr { - struct ib_fmr *fmr; -}; - enum rds_ib_fr_state { FRMR_IS_FREE, /* mr invalidated & ready for use */ FRMR_IS_INUSE, /* mr is in use or used & can be invalidated */ @@ -84,7 +80,6 @@ struct rds_ib_mr { u8 odp:1; union { - struct rds_ib_fmr fmr; struct rds_ib_frmr frmr; struct ib_mr *mr; } u; @@ -109,8 +104,7 @@ struct rds_ib_mr_pool { unsigned long max_items; unsigned long 
max_items_soft; unsigned long max_free_pinned; - struct ib_fmr_attr fmr_attr; - bool use_fastreg; + unsigned int max_pages; }; extern struct workqueue_struct *rds_ib_mr_wq; @@ -136,15 +130,9 @@ u32 rds_ib_get_lkey(void *trans_private); void __rds_ib_teardown_mr(struct rds_ib_mr *); void rds_ib_teardown_mr(struct rds_ib_mr *); -struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *, int); struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *); int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *, int, struct rds_ib_mr **); -struct rds_ib_mr *rds_ib_reg_fmr(struct rds_ib_device *, struct scatterlist *, - unsigned long, u32 *); struct rds_ib_mr *rds_ib_try_reuse_ibmr(struct rds_ib_mr_pool *); -void rds_ib_unreg_fmr(struct list_head *, unsigned int *, - unsigned long *, unsigned int); -void rds_ib_free_fmr_list(struct rds_ib_mr *); struct rds_ib_mr *rds_ib_reg_frmr(struct rds_ib_device *rds_ibdev, struct rds_ib_connection *ic, struct scatterlist *sg, diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index b34b24e237f8..8f070ee7e742 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -181,7 +181,7 @@ void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_co struct rds_ib_mr_pool *pool_1m = rds_ibdev->mr_1m_pool; iinfo->rdma_mr_max = pool_1m->max_items; - iinfo->rdma_mr_size = pool_1m->fmr_attr.max_pages; + iinfo->rdma_mr_size = pool_1m->max_pages; } #if IS_ENABLED(CONFIG_IPV6) @@ -191,7 +191,7 @@ void rds6_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_ib_mr_pool *pool_1m = rds_ibdev->mr_1m_pool; iinfo6->rdma_mr_max = pool_1m->max_items; - iinfo6->rdma_mr_size = pool_1m->fmr_attr.max_pages; + iinfo6->rdma_mr_size = pool_1m->max_pages; } #endif @@ -406,10 +406,7 @@ int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, if (list_empty(&unmap_list)) goto out; - if (pool->use_fastreg) - rds_ib_unreg_frmr(&unmap_list, &nfreed, &unpinned, free_goal); - else - rds_ib_unreg_fmr(&unmap_list, &nfreed, &unpinned, free_goal); + rds_ib_unreg_frmr(&unmap_list, &nfreed, &unpinned, free_goal); if (!list_empty(&unmap_list)) { unsigned long flags; @@ -503,10 +500,7 @@ void rds_ib_free_mr(void *trans_private, int invalidate) } /* Return it to the pool's free list */ - if (rds_ibdev->use_fastreg) - rds_ib_free_frmr_list(ibmr); - else - rds_ib_free_fmr_list(ibmr); + rds_ib_free_frmr_list(ibmr); atomic_add(ibmr->sg_len, &pool->free_pinned); atomic_inc(&pool->dirty_count); @@ -622,10 +616,7 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, goto out; } - if (rds_ibdev->use_fastreg) - ibmr = rds_ib_reg_frmr(rds_ibdev, ic, sg, nents, key_ret); - else - ibmr = rds_ib_reg_fmr(rds_ibdev, sg, nents, key_ret); + ibmr = rds_ib_reg_frmr(rds_ibdev, ic, sg, nents, key_ret); if (IS_ERR(ibmr)) { ret = PTR_ERR(ibmr); pr_warn("RDS/IB: rds_ib_get_mr failed (errno=%d)\n", ret); @@ -669,19 +660,16 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev, if (pool_type == RDS_IB_MR_1M_POOL) { /* +1 allows for unaligned MRs */ - pool->fmr_attr.max_pages = RDS_MR_1M_MSG_SIZE + 1; + pool->max_pages = RDS_MR_1M_MSG_SIZE + 1; pool->max_items = rds_ibdev->max_1m_mrs; } else { /* pool_type == RDS_IB_MR_8K_POOL */ - pool->fmr_attr.max_pages = RDS_MR_8K_MSG_SIZE + 1; + pool->max_pages = RDS_MR_8K_MSG_SIZE + 1; pool->max_items = rds_ibdev->max_8k_mrs; } - pool->max_free_pinned = pool->max_items * pool->fmr_attr.max_pages / 4; - pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps; - pool->fmr_attr.page_shift = PAGE_SHIFT; + pool->max_free_pinned = pool->max_items 
* pool->max_pages / 4; pool->max_items_soft = rds_ibdev->max_mrs * 3 / 4; - pool->use_fastreg = rds_ibdev->use_fastreg; return pool; } diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index e7a872207b46..ce85656ac9c1 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -71,6 +71,7 @@ ax25_address rose_callsign; * separate class since they always nest. */ static struct lock_class_key rose_netdev_xmit_lock_key; +static struct lock_class_key rose_netdev_addr_lock_key; static void rose_set_lockdep_one(struct net_device *dev, struct netdev_queue *txq, @@ -81,6 +82,7 @@ static void rose_set_lockdep_one(struct net_device *dev, static void rose_set_lockdep_key(struct net_device *dev) { + lockdep_set_class(&dev->addr_list_lock, &rose_netdev_addr_lock_key); netdev_for_each_tx_queue(dev, rose_set_lockdep_one, NULL); } diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 9fe264bec70c..9a2139ebd67d 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -810,100 +810,6 @@ static inline bool rxrpc_is_client_call(const struct rxrpc_call *call) } /* - * Transition a call to the complete state. - */ -static inline bool __rxrpc_set_call_completion(struct rxrpc_call *call, - enum rxrpc_call_completion compl, - u32 abort_code, - int error) -{ - if (call->state < RXRPC_CALL_COMPLETE) { - call->abort_code = abort_code; - call->error = error; - call->completion = compl, - call->state = RXRPC_CALL_COMPLETE; - trace_rxrpc_call_complete(call); - wake_up(&call->waitq); - return true; - } - return false; -} - -static inline bool rxrpc_set_call_completion(struct rxrpc_call *call, - enum rxrpc_call_completion compl, - u32 abort_code, - int error) -{ - bool ret; - - write_lock_bh(&call->state_lock); - ret = __rxrpc_set_call_completion(call, compl, abort_code, error); - write_unlock_bh(&call->state_lock); - return ret; -} - -/* - * Record that a call successfully completed. - */ -static inline bool __rxrpc_call_completed(struct rxrpc_call *call) -{ - return __rxrpc_set_call_completion(call, RXRPC_CALL_SUCCEEDED, 0, 0); -} - -static inline bool rxrpc_call_completed(struct rxrpc_call *call) -{ - bool ret; - - write_lock_bh(&call->state_lock); - ret = __rxrpc_call_completed(call); - write_unlock_bh(&call->state_lock); - return ret; -} - -/* - * Record that a call is locally aborted. - */ -static inline bool __rxrpc_abort_call(const char *why, struct rxrpc_call *call, - rxrpc_seq_t seq, - u32 abort_code, int error) -{ - trace_rxrpc_abort(call->debug_id, why, call->cid, call->call_id, seq, - abort_code, error); - return __rxrpc_set_call_completion(call, RXRPC_CALL_LOCALLY_ABORTED, - abort_code, error); -} - -static inline bool rxrpc_abort_call(const char *why, struct rxrpc_call *call, - rxrpc_seq_t seq, u32 abort_code, int error) -{ - bool ret; - - write_lock_bh(&call->state_lock); - ret = __rxrpc_abort_call(why, call, seq, abort_code, error); - write_unlock_bh(&call->state_lock); - return ret; -} - -/* - * Abort a call due to a protocol error. 
- */ -static inline bool __rxrpc_abort_eproto(struct rxrpc_call *call, - struct sk_buff *skb, - const char *eproto_why, - const char *why, - u32 abort_code) -{ - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - - trace_rxrpc_rx_eproto(call, sp->hdr.serial, eproto_why); - return rxrpc_abort_call(why, call, sp->hdr.seq, abort_code, -EPROTO); -} - -#define rxrpc_abort_eproto(call, skb, eproto_why, abort_why, abort_code) \ - __rxrpc_abort_eproto((call), (skb), tracepoint_string(eproto_why), \ - (abort_why), (abort_code)) - -/* * conn_client.c */ extern unsigned int rxrpc_max_client_connections; @@ -1101,9 +1007,34 @@ extern const struct seq_operations rxrpc_peer_seq_ops; * recvmsg.c */ void rxrpc_notify_socket(struct rxrpc_call *); +bool __rxrpc_set_call_completion(struct rxrpc_call *, enum rxrpc_call_completion, u32, int); +bool rxrpc_set_call_completion(struct rxrpc_call *, enum rxrpc_call_completion, u32, int); +bool __rxrpc_call_completed(struct rxrpc_call *); +bool rxrpc_call_completed(struct rxrpc_call *); +bool __rxrpc_abort_call(const char *, struct rxrpc_call *, rxrpc_seq_t, u32, int); +bool rxrpc_abort_call(const char *, struct rxrpc_call *, rxrpc_seq_t, u32, int); int rxrpc_recvmsg(struct socket *, struct msghdr *, size_t, int); /* + * Abort a call due to a protocol error. + */ +static inline bool __rxrpc_abort_eproto(struct rxrpc_call *call, + struct sk_buff *skb, + const char *eproto_why, + const char *why, + u32 abort_code) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + + trace_rxrpc_rx_eproto(call, sp->hdr.serial, eproto_why); + return rxrpc_abort_call(why, call, sp->hdr.seq, abort_code, -EPROTO); +} + +#define rxrpc_abort_eproto(call, skb, eproto_why, abort_why, abort_code) \ + __rxrpc_abort_eproto((call), (skb), tracepoint_string(eproto_why), \ + (abort_why), (abort_code)) + +/* * rtt.c */ void rxrpc_peer_add_rtt(struct rxrpc_call *, enum rxrpc_rtt_rx_trace, diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 2a65ac41055f..aa1c8eee6557 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -248,7 +248,18 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j) if (anno_type != RXRPC_TX_ANNO_RETRANS) continue; + /* We need to reset the retransmission state, but we need to do + * so before we drop the lock as a new ACK/NAK may come in and + * confuse things + */ + annotation &= ~RXRPC_TX_ANNO_MASK; + annotation |= RXRPC_TX_ANNO_RESENT; + call->rxtx_annotations[ix] = annotation; + skb = call->rxtx_buffer[ix]; + if (!skb) + continue; + rxrpc_get_skb(skb, rxrpc_skb_got); spin_unlock_bh(&call->lock); @@ -262,24 +273,6 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j) rxrpc_free_skb(skb, rxrpc_skb_freed); spin_lock_bh(&call->lock); - - /* We need to clear the retransmit state, but there are two - * things we need to be aware of: A new ACK/NAK might have been - * received and the packet might have been hard-ACK'd (in which - * case it will no longer be in the buffer). 
- */ - if (after(seq, call->tx_hard_ack)) { - annotation = call->rxtx_annotations[ix]; - anno_type = annotation & RXRPC_TX_ANNO_MASK; - if (anno_type == RXRPC_TX_ANNO_RETRANS || - anno_type == RXRPC_TX_ANNO_NAK) { - annotation &= ~RXRPC_TX_ANNO_MASK; - annotation |= RXRPC_TX_ANNO_UNACK; - } - annotation |= RXRPC_TX_ANNO_RESENT; - call->rxtx_annotations[ix] = annotation; - } - if (after(call->tx_hard_ack, seq)) seq = call->tx_hard_ack; } @@ -320,7 +313,6 @@ recheck_state: if (call->state == RXRPC_CALL_COMPLETE) { del_timer_sync(&call->timer); - rxrpc_notify_socket(call); goto out_put; } diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 06fcff2ebbba..447f55ca6886 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -173,10 +173,9 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn, else trace_rxrpc_rx_abort(call, serial, conn->abort_code); - if (rxrpc_set_call_completion(call, compl, - conn->abort_code, - conn->error)) - rxrpc_notify_socket(call); + rxrpc_set_call_completion(call, compl, + conn->abort_code, + conn->error); } } diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 3be4177baf70..299ac98e9754 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -275,7 +275,6 @@ static bool rxrpc_end_tx_phase(struct rxrpc_call *call, bool reply_begun, case RXRPC_CALL_SERVER_AWAIT_ACK: __rxrpc_call_completed(call); - rxrpc_notify_socket(call); state = call->state; break; @@ -1013,9 +1012,8 @@ static void rxrpc_input_abort(struct rxrpc_call *call, struct sk_buff *skb) _proto("Rx ABORT %%%u { %x }", sp->hdr.serial, abort_code); - if (rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED, - abort_code, -ECONNABORTED)) - rxrpc_notify_socket(call); + rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED, + abort_code, -ECONNABORTED); } /* @@ -1102,7 +1100,6 @@ static void rxrpc_input_implicit_end_call(struct rxrpc_sock *rx, spin_lock(&rx->incoming_lock); __rxrpc_disconnect_call(conn, call); spin_unlock(&rx->incoming_lock); - rxrpc_notify_socket(call); } /* diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index b1449d971883..a852f46d5234 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -271,6 +271,9 @@ static void rxrpc_store_error(struct rxrpc_peer *peer, break; case SO_EE_ORIGIN_ICMP6: + if (err == EACCES) + err = EHOSTUNREACH; + /* Fall through */ default: _proto("Rx Received error report { orig=%u }", ee->ee_origin); break; @@ -289,9 +292,7 @@ static void rxrpc_distribute_error(struct rxrpc_peer *peer, int error, hlist_for_each_entry_rcu(call, &peer->error_targets, error_link) { rxrpc_see_call(call); - if (call->state < RXRPC_CALL_COMPLETE && - rxrpc_set_call_completion(call, compl, 0, -error)) - rxrpc_notify_socket(call); + rxrpc_set_call_completion(call, compl, 0, -error); } } diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index 8b179e3c802a..543afd9bd664 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -68,7 +68,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) "Proto Local " " Remote " " SvID ConnID CallID End Use State Abort " - " UserID TxSeq TW RxSeq RW RxSerial RxTimo\n"); + " DebugId TxSeq TW RxSeq RW RxSerial RxTimo\n"); return 0; } @@ -100,7 +100,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) rx_hard_ack = READ_ONCE(call->rx_hard_ack); seq_printf(seq, "UDP %-47.47s %-47.47s %4x %08x %08x %s %3u" - " %-8.8s %08x %lx %08x %02x %08x %02x %08x %06lx\n", + " %-8.8s %08x %08x %08x %02x %08x %02x %08x %06lx\n", lbuff, rbuff, call->service_id, @@ -110,7 
+110,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) atomic_read(&call->usage), rxrpc_call_states[call->state], call->abort_code, - call->user_call_ID, + call->debug_id, tx_hard_ack, READ_ONCE(call->tx_top) - tx_hard_ack, rx_hard_ack, READ_ONCE(call->rx_top) - rx_hard_ack, call->rx_serial, diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 8578c39ec839..2989742a4aa1 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -59,6 +59,85 @@ void rxrpc_notify_socket(struct rxrpc_call *call) } /* + * Transition a call to the complete state. + */ +bool __rxrpc_set_call_completion(struct rxrpc_call *call, + enum rxrpc_call_completion compl, + u32 abort_code, + int error) +{ + if (call->state < RXRPC_CALL_COMPLETE) { + call->abort_code = abort_code; + call->error = error; + call->completion = compl, + call->state = RXRPC_CALL_COMPLETE; + trace_rxrpc_call_complete(call); + wake_up(&call->waitq); + rxrpc_notify_socket(call); + return true; + } + return false; +} + +bool rxrpc_set_call_completion(struct rxrpc_call *call, + enum rxrpc_call_completion compl, + u32 abort_code, + int error) +{ + bool ret = false; + + if (call->state < RXRPC_CALL_COMPLETE) { + write_lock_bh(&call->state_lock); + ret = __rxrpc_set_call_completion(call, compl, abort_code, error); + write_unlock_bh(&call->state_lock); + } + return ret; +} + +/* + * Record that a call successfully completed. + */ +bool __rxrpc_call_completed(struct rxrpc_call *call) +{ + return __rxrpc_set_call_completion(call, RXRPC_CALL_SUCCEEDED, 0, 0); +} + +bool rxrpc_call_completed(struct rxrpc_call *call) +{ + bool ret = false; + + if (call->state < RXRPC_CALL_COMPLETE) { + write_lock_bh(&call->state_lock); + ret = __rxrpc_call_completed(call); + write_unlock_bh(&call->state_lock); + } + return ret; +} + +/* + * Record that a call is locally aborted. + */ +bool __rxrpc_abort_call(const char *why, struct rxrpc_call *call, + rxrpc_seq_t seq, u32 abort_code, int error) +{ + trace_rxrpc_abort(call->debug_id, why, call->cid, call->call_id, seq, + abort_code, error); + return __rxrpc_set_call_completion(call, RXRPC_CALL_LOCALLY_ABORTED, + abort_code, error); +} + +bool rxrpc_abort_call(const char *why, struct rxrpc_call *call, + rxrpc_seq_t seq, u32 abort_code, int error) +{ + bool ret; + + write_lock_bh(&call->state_lock); + ret = __rxrpc_abort_call(why, call, seq, abort_code, error); + write_unlock_bh(&call->state_lock); + return ret; +} + +/* * Pass a call terminating message to userspace. */ static int rxrpc_recvmsg_term(struct rxrpc_call *call, struct msghdr *msg) diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 5e9c43d4a314..1304b8608f56 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -261,10 +261,8 @@ static int rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, case -ENETUNREACH: case -EHOSTUNREACH: case -ECONNREFUSED: - rxrpc_set_call_completion(call, - RXRPC_CALL_LOCAL_ERROR, + rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, 0, ret); - rxrpc_notify_socket(call); goto out; } _debug("need instant resend %d", ret); diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 2f20073f4f84..84badf00647e 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -6,7 +6,7 @@ menuconfig NET_SCHED bool "QoS and/or fair queueing" select NET_SCH_FIFO - ---help--- + help When the kernel has several packets to send out over a network device, it has to decide which ones to send first, which ones to delay, and which ones to drop. 
This is the job of the queueing @@ -47,7 +47,7 @@ comment "Queueing/Scheduling" config NET_SCH_CBQ tristate "Class Based Queueing (CBQ)" - ---help--- + help Say Y here if you want to use the Class-Based Queueing (CBQ) packet scheduling algorithm. This algorithm classifies the waiting packets into a tree-like hierarchy of classes; the leaves of this tree are @@ -64,7 +64,7 @@ config NET_SCH_CBQ config NET_SCH_HTB tristate "Hierarchical Token Bucket (HTB)" - ---help--- + help Say Y here if you want to use the Hierarchical Token Buckets (HTB) packet scheduling algorithm. See <http://luxik.cdi.cz/~devik/qos/htb/> for complete manual and @@ -78,7 +78,7 @@ config NET_SCH_HTB config NET_SCH_HFSC tristate "Hierarchical Fair Service Curve (HFSC)" - ---help--- + help Say Y here if you want to use the Hierarchical Fair Service Curve (HFSC) packet scheduling algorithm. @@ -88,7 +88,7 @@ config NET_SCH_HFSC config NET_SCH_ATM tristate "ATM Virtual Circuits (ATM)" depends on ATM - ---help--- + help Say Y here if you want to use the ATM pseudo-scheduler. This provides a framework for invoking classifiers, which in turn select classes of this queuing discipline. Each class maps @@ -101,7 +101,7 @@ config NET_SCH_ATM config NET_SCH_PRIO tristate "Multi Band Priority Queueing (PRIO)" - ---help--- + help Say Y here if you want to use an n-band priority queue packet scheduler. @@ -110,7 +110,7 @@ config NET_SCH_PRIO config NET_SCH_MULTIQ tristate "Hardware Multiqueue-aware Multi Band Queuing (MULTIQ)" - ---help--- + help Say Y here if you want to use an n-band queue packet scheduler to support devices that have multiple hardware transmit queues. @@ -119,7 +119,7 @@ config NET_SCH_MULTIQ config NET_SCH_RED tristate "Random Early Detection (RED)" - ---help--- + help Say Y here if you want to use the Random Early Detection (RED) packet scheduling algorithm. @@ -130,7 +130,7 @@ config NET_SCH_RED config NET_SCH_SFB tristate "Stochastic Fair Blue (SFB)" - ---help--- + help Say Y here if you want to use the Stochastic Fair Blue (SFB) packet scheduling algorithm. @@ -141,7 +141,7 @@ config NET_SCH_SFB config NET_SCH_SFQ tristate "Stochastic Fairness Queueing (SFQ)" - ---help--- + help Say Y here if you want to use the Stochastic Fairness Queueing (SFQ) packet scheduling algorithm. @@ -152,7 +152,7 @@ config NET_SCH_SFQ config NET_SCH_TEQL tristate "True Link Equalizer (TEQL)" - ---help--- + help Say Y here if you want to use the True Link Equalizer (TLE) packet scheduling algorithm. This queueing discipline allows the combination of several physical devices into one virtual device. @@ -164,7 +164,7 @@ config NET_SCH_TEQL config NET_SCH_TBF tristate "Token Bucket Filter (TBF)" - ---help--- + help Say Y here if you want to use the Token Bucket Filter (TBF) packet scheduling algorithm. @@ -175,7 +175,7 @@ config NET_SCH_TBF config NET_SCH_CBS tristate "Credit Based Shaper (CBS)" - ---help--- + help Say Y here if you want to use the Credit Based Shaper (CBS) packet scheduling algorithm. 
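The TBF and CBS entries above both describe shaping in terms of a token bucket. As a quick illustration of that algorithm, here is a minimal user-space sketch of the admission test; it is not the kernel's sch_tbf code, all names are invented, and refill overflow handling is simplified:

#include <stdbool.h>
#include <stdint.h>

struct tbucket {
        uint64_t rate_bps;      /* token fill rate, bytes per second */
        uint64_t burst;         /* bucket depth, bytes */
        uint64_t tokens;        /* tokens currently available, bytes */
        uint64_t last_ns;       /* time of the previous refill */
};

/* Refill tokens for the elapsed time, then admit the packet only if
 * enough tokens remain; multiplication overflow is ignored for brevity. */
static bool tbucket_admit(struct tbucket *tb, uint64_t now_ns, uint64_t len)
{
        tb->tokens += (now_ns - tb->last_ns) * tb->rate_bps / 1000000000ULL;
        if (tb->tokens > tb->burst)
                tb->tokens = tb->burst;
        tb->last_ns = now_ns;

        if (tb->tokens < len)
                return false;
        tb->tokens -= len;
        return true;
}

A real qdisc does not drop on failure here; it holds the packet and arms a timer for when enough tokens will have accrued.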
@@ -208,7 +208,7 @@ config NET_SCH_TAPRIO config NET_SCH_GRED tristate "Generic Random Early Detection (GRED)" - ---help--- + help Say Y here if you want to use the Generic Random Early Detection (GRED) packet scheduling algorithm for some of your network devices (see the top of <file:net/sched/sch_red.c> for details and @@ -219,7 +219,7 @@ config NET_SCH_GRED config NET_SCH_DSMARK tristate "Differentiated Services marker (DSMARK)" - ---help--- + help Say Y if you want to schedule packets according to the Differentiated Services architecture proposed in RFC 2475. Technical information on this method, with pointers to associated @@ -230,7 +230,7 @@ config NET_SCH_DSMARK config NET_SCH_NETEM tristate "Network emulator (NETEM)" - ---help--- + help Say Y if you want to emulate network delay, loss, and packet re-ordering. This is often useful to simulate networks when testing applications or protocols. @@ -384,7 +384,7 @@ config NET_SCH_INGRESS depends on NET_CLS_ACT select NET_INGRESS select NET_EGRESS - ---help--- + help Say Y here if you want to use classifiers for incoming and/or outgoing packets. This qdisc doesn't do anything else besides running classifiers, which can also have actions attached to them. In case of outgoing packets, @@ -398,7 +398,7 @@ config NET_SCH_INGRESS config NET_SCH_PLUG tristate "Plug network traffic until release (PLUG)" - ---help--- + help This queuing discipline allows userspace to plug/unplug a network output queue, using the netlink interface. When it receives an @@ -441,7 +441,7 @@ config NET_SCH_ETS menuconfig NET_SCH_DEFAULT bool "Allow override default queue discipline" - ---help--- + help Support for selection of default queuing discipline. Nearly all users can safely say no here, and the default @@ -492,7 +492,7 @@ config NET_CLS config NET_CLS_BASIC tristate "Elementary classification (BASIC)" select NET_CLS - ---help--- + help Say Y here if you want to be able to classify packets using only extended matches and actions. @@ -502,7 +502,7 @@ config NET_CLS_BASIC config NET_CLS_TCINDEX tristate "Traffic-Control Index (TCINDEX)" select NET_CLS - ---help--- + help Say Y here if you want to be able to classify packets based on traffic control indices. You will want this feature if you want to implement Differentiated Services together with DSMARK. @@ -515,7 +515,7 @@ config NET_CLS_ROUTE4 depends on INET select IP_ROUTE_CLASSID select NET_CLS - ---help--- + help If you say Y here, you will be able to classify packets according to the route table entry they matched. @@ -525,7 +525,7 @@ config NET_CLS_ROUTE4 config NET_CLS_FW tristate "Netfilter mark (FW)" select NET_CLS - ---help--- + help If you say Y here, you will be able to classify packets according to netfilter/firewall marks. @@ -535,7 +535,7 @@ config NET_CLS_FW config NET_CLS_U32 tristate "Universal 32bit comparisons w/ hashing (U32)" select NET_CLS - ---help--- + help Say Y here to be able to classify packets using a universal 32bit pieces based comparison scheme. @@ -545,20 +545,20 @@ config NET_CLS_U32 config CLS_U32_PERF bool "Performance counters support" depends on NET_CLS_U32 - ---help--- + help Say Y here to make u32 gather additional statistics useful for fine tuning u32 classifiers. config CLS_U32_MARK bool "Netfilter marks support" depends on NET_CLS_U32 - ---help--- + help Say Y here to be able to use netfilter marks as u32 key. 
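The u32 options just above all build on a single primitive: a 32-bit word is read from the packet at a fixed offset and compared against a value under a mask. A self-contained sketch of that primitive follows; the struct and function names are invented, not the kernel's cls_u32 types:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

struct u32_sel {
        uint32_t val;           /* expected value, already masked */
        uint32_t mask;          /* bits that take part in the match */
        size_t off;             /* byte offset of the word in the packet */
};

static bool u32_sel_match(const struct u32_sel *s, const uint8_t *pkt,
                          size_t len)
{
        uint32_t word;

        if (s->off + sizeof(word) > len)
                return false;
        memcpy(&word, pkt + s->off, sizeof(word));      /* alignment-safe */
        return (word & s->mask) == s->val;
}

Chaining several such selectors, optionally through hash tables, is what makes the scheme "universal".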
config NET_CLS_RSVP tristate "IPv4 Resource Reservation Protocol (RSVP)" select NET_CLS - ---help--- + help The Resource Reservation Protocol (RSVP) permits end systems to request a minimum and maximum data flow rate for a connection; this is important for real time data such as streaming sound or video. @@ -572,7 +572,7 @@ config NET_CLS_RSVP config NET_CLS_RSVP6 tristate "IPv6 Resource Reservation Protocol (RSVP6)" select NET_CLS - ---help--- + help The Resource Reservation Protocol (RSVP) permits end systems to request a minimum and maximum data flow rate for a connection; this is important for real time data such as streaming sound or video. @@ -586,7 +586,7 @@ config NET_CLS_RSVP6 config NET_CLS_FLOW tristate "Flow classifier" select NET_CLS - ---help--- + help If you say Y here, you will be able to classify packets based on a configurable combination of packet keys. This is mostly useful in combination with SFQ. @@ -599,7 +599,7 @@ config NET_CLS_CGROUP select NET_CLS select CGROUP_NET_CLASSID depends on CGROUPS - ---help--- + help Say Y here if you want to classify packets based on the control cgroup of their process. @@ -609,7 +609,7 @@ config NET_CLS_CGROUP config NET_CLS_BPF tristate "BPF-based classifier" select NET_CLS - ---help--- + help If you say Y here, you will be able to classify packets based on programmable BPF (JIT'ed) filters as an alternative to ematches. @@ -619,7 +619,7 @@ config NET_CLS_BPF config NET_CLS_FLOWER tristate "Flower classifier" select NET_CLS - ---help--- + help If you say Y here, you will be able to classify packets based on a configurable combination of packet keys and masks. @@ -629,7 +629,7 @@ config NET_CLS_FLOWER config NET_CLS_MATCHALL tristate "Match-all classifier" select NET_CLS - ---help--- + help If you say Y here, you will be able to classify packets based on nothing. Every packet will match. @@ -639,7 +639,7 @@ config NET_CLS_MATCHALL config NET_EMATCH bool "Extended Matches" select NET_CLS - ---help--- + help Say Y here if you want to use extended matches on top of classifiers and select the extended matches below. @@ -653,7 +653,7 @@ config NET_EMATCH_STACK int "Stack size" depends on NET_EMATCH default "32" - ---help--- + help Size of the local stack variable used while evaluating the tree of ematches. Limits the depth of the tree, i.e. the number of encapsulated precedences. Every level requires 4 bytes of additional @@ -662,7 +662,7 @@ config NET_EMATCH_STACK config NET_EMATCH_CMP tristate "Simple packet data comparison" depends on NET_EMATCH - ---help--- + help Say Y here if you want to be able to classify packets based on simple packet data comparisons for 8, 16, and 32bit values. @@ -672,7 +672,7 @@ config NET_EMATCH_CMP config NET_EMATCH_NBYTE tristate "Multi byte comparison" depends on NET_EMATCH - ---help--- + help Say Y here if you want to be able to classify packets based on multiple byte comparisons mainly useful for IPv6 address comparisons. @@ -682,7 +682,7 @@ config NET_EMATCH_NBYTE config NET_EMATCH_U32 tristate "U32 key" depends on NET_EMATCH - ---help--- + help Say Y here if you want to be able to classify packets using the famous u32 key in combination with logic relations. @@ -692,7 +692,7 @@ config NET_EMATCH_U32 config NET_EMATCH_META tristate "Metadata" depends on NET_EMATCH - ---help--- + help Say Y here if you want to be able to classify packets based on metadata such as load average, netfilter attributes, socket attributes and routing decisions. 
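The ematch entries above (CMP, NBYTE, U32, META) are leaf predicates that the framework combines through logical relations, and NET_EMATCH_STACK bounds how deeply those relations may nest. The sketch below evaluates such a match tree recursively over invented types; the kernel itself walks a flattened list with an explicit stack, which is what the stack-size option limits:

#include <stdbool.h>

enum em_rel { EM_LEAF, EM_AND, EM_OR, EM_NOT };

struct ematch {
        enum em_rel rel;
        bool (*match)(const void *pkt);         /* leaf predicate, e.g. a cmp */
        const struct ematch *l, *r;             /* subtrees, unused for leaves */
};

static bool em_eval(const struct ematch *em, const void *pkt)
{
        switch (em->rel) {
        case EM_LEAF:
                return em->match(pkt);
        case EM_AND:
                return em_eval(em->l, pkt) && em_eval(em->r, pkt);
        case EM_OR:
                return em_eval(em->l, pkt) || em_eval(em->r, pkt);
        case EM_NOT:
                return !em_eval(em->l, pkt);
        }
        return false;
}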
@@ -707,7 +707,7 @@ config NET_EMATCH_TEXT select TEXTSEARCH_KMP select TEXTSEARCH_BM select TEXTSEARCH_FSM - ---help--- + help Say Y here if you want to be able to classify packets based on textsearch comparisons. @@ -717,7 +717,7 @@ config NET_EMATCH_TEXT config NET_EMATCH_CANID tristate "CAN Identifier" depends on NET_EMATCH && (CAN=y || CAN=m) - ---help--- + help Say Y here if you want to be able to classify CAN frames based on CAN Identifier. @@ -727,7 +727,7 @@ config NET_EMATCH_CANID config NET_EMATCH_IPSET tristate "IPset" depends on NET_EMATCH && IP_SET - ---help--- + help Say Y here if you want to be able to classify packets based on ipset membership. @@ -737,7 +737,7 @@ config NET_EMATCH_IPSET config NET_EMATCH_IPT tristate "IPtables Matches" depends on NET_EMATCH && NETFILTER && NETFILTER_XTABLES - ---help--- + help Say Y here to be able to classify packets based on iptables matches. Current supported match is "policy" which allows packet classification @@ -749,7 +749,7 @@ config NET_EMATCH_IPT config NET_CLS_ACT bool "Actions" select NET_CLS - ---help--- + help Say Y here if you want to use traffic control actions. Actions get attached to classifiers and are invoked after a successful classification. They are used to overwrite the classification @@ -761,7 +761,7 @@ config NET_CLS_ACT config NET_ACT_POLICE tristate "Traffic Policing" depends on NET_CLS_ACT - ---help--- + help Say Y here if you want to do traffic policing, i.e. strict bandwidth limiting. This action replaces the existing policing module. @@ -772,7 +772,7 @@ config NET_ACT_POLICE config NET_ACT_GACT tristate "Generic actions" depends on NET_CLS_ACT - ---help--- + help Say Y here to take generic actions such as dropping and accepting packets. @@ -782,13 +782,13 @@ config NET_ACT_GACT config GACT_PROB bool "Probability support" depends on NET_ACT_GACT - ---help--- + help Say Y here to use the generic action randomly or deterministically. config NET_ACT_MIRRED tristate "Redirecting and Mirroring" depends on NET_CLS_ACT - ---help--- + help Say Y here to allow packets to be mirrored or redirected to other devices. @@ -799,7 +799,7 @@ config NET_ACT_SAMPLE tristate "Traffic Sampling" depends on NET_CLS_ACT select PSAMPLE - ---help--- + help Say Y here to allow packet sampling tc action. The packet sample action consists of statistically choosing packets and sampling them using the psample module. @@ -810,7 +810,7 @@ config NET_ACT_SAMPLE config NET_ACT_IPT tristate "IPtables targets" depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES - ---help--- + help Say Y here to be able to invoke iptables targets after successful classification. @@ -820,7 +820,7 @@ config NET_ACT_IPT config NET_ACT_NAT tristate "Stateless NAT" depends on NET_CLS_ACT - ---help--- + help Say Y here to do stateless NAT on IPv4 packets. You should use netfilter for NAT unless you know what you are doing. @@ -830,7 +830,7 @@ config NET_ACT_NAT config NET_ACT_PEDIT tristate "Packet Editing" depends on NET_CLS_ACT - ---help--- + help Say Y here if you want to mangle the content of packets. To compile this code as a module, choose M here: the @@ -839,7 +839,7 @@ config NET_ACT_PEDIT config NET_ACT_SIMP tristate "Simple Example (Debug)" depends on NET_CLS_ACT - ---help--- + help Say Y here to add a simple action for demonstration purposes. It is meant as an example and for debugging purposes. 
It will print a configured policy string followed by the packet count @@ -853,7 +853,7 @@ config NET_ACT_SIMP config NET_ACT_SKBEDIT tristate "SKB Editing" depends on NET_CLS_ACT - ---help--- + help Say Y here to change skb priority or queue_mapping settings. If unsure, say N. @@ -865,7 +865,7 @@ config NET_ACT_CSUM tristate "Checksum Updating" depends on NET_CLS_ACT && INET select LIBCRC32C - ---help--- + help Say Y here to update some common checksum after some direct packet alterations. @@ -886,7 +886,7 @@ config NET_ACT_MPLS config NET_ACT_VLAN tristate "Vlan manipulation" depends on NET_CLS_ACT - ---help--- + help Say Y here to push or pop vlan headers. If unsure, say N. @@ -897,7 +897,7 @@ config NET_ACT_VLAN config NET_ACT_BPF tristate "BPF based action" depends on NET_CLS_ACT - ---help--- + help Say Y here to execute BPF code on packets. The BPF code will decide if the packet should be dropped or not. @@ -910,7 +910,7 @@ config NET_ACT_CONNMARK tristate "Netfilter Connection Mark Retriever" depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES depends on NF_CONNTRACK && NF_CONNTRACK_MARK - ---help--- + help Say Y here to allow retrieving of conn mark If unsure, say N. @@ -938,7 +938,7 @@ config NET_ACT_CTINFO config NET_ACT_SKBMOD tristate "skb data modification action" depends on NET_CLS_ACT - ---help--- + help Say Y here to allow modification of skb data If unsure, say N. @@ -950,7 +950,7 @@ config NET_ACT_IFE tristate "Inter-FE action based on IETF ForCES InterFE LFB" depends on NET_CLS_ACT select NET_IFE - ---help--- + help Say Y here to allow for sourcing and terminating metadata For details refer to netdev01 paper: "Distributing Linux Traffic Control Classifier-Action Subsystem" @@ -962,7 +962,7 @@ config NET_ACT_IFE config NET_ACT_TUNNEL_KEY tristate "IP tunnel metadata manipulation" depends on NET_CLS_ACT - ---help--- + help Say Y here to set/release ip tunnel metadata. If unsure, say N. 
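Taken together, these NET_ACT_* entries fill in the model described under NET_CLS_ACT: after a classifier matches, its attached actions run in order, each able to edit the packet or decide its fate. A compact sketch of that control flow, using invented types rather than the kernel's tcf_action API:

#include <stddef.h>

enum tc_verdict { VERDICT_OK, VERDICT_SHOT /* drop */ };

struct act {
        enum tc_verdict (*exec)(void *pkt);     /* e.g. pedit, police, gact */
};

/* Run a classifier's action chain; a drop verdict ends the chain,
 * otherwise a later action may overwrite an earlier decision. */
static enum tc_verdict run_action_chain(const struct act *acts, size_t n,
                                        void *pkt)
{
        enum tc_verdict v = VERDICT_OK;
        size_t i;

        for (i = 0; i < n; i++) {
                v = acts[i].exec(pkt);
                if (v == VERDICT_SHOT)
                        break;
        }
        return v;
}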
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index e29f0f45d688..e9f3576cbf71 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -1543,17 +1543,6 @@ static void __exit ct_cleanup_module(void) destroy_workqueue(act_ct_wq); } -void tcf_ct_flow_table_restore_skb(struct sk_buff *skb, unsigned long cookie) -{ - enum ip_conntrack_info ctinfo = cookie & NFCT_INFOMASK; - struct nf_conn *ct; - - ct = (struct nf_conn *)(cookie & NFCT_PTRMASK); - nf_conntrack_get(&ct->ct_general); - nf_ct_set(skb, ct, ctinfo); -} -EXPORT_SYMBOL_GPL(tcf_ct_flow_table_restore_skb); - module_init(ct_init_module); module_exit(ct_cleanup_module); MODULE_AUTHOR("Paul Blakey <paulb@mellanox.com>"); diff --git a/net/sched/act_gate.c b/net/sched/act_gate.c index 9c628591f452..323ae7f6315d 100644 --- a/net/sched/act_gate.c +++ b/net/sched/act_gate.c @@ -32,7 +32,7 @@ static ktime_t gate_get_time(struct tcf_gate *gact) return KTIME_MAX; } -static int gate_get_start_time(struct tcf_gate *gact, ktime_t *start) +static void gate_get_start_time(struct tcf_gate *gact, ktime_t *start) { struct tcf_gate_params *param = &gact->param; ktime_t now, base, cycle; @@ -43,18 +43,13 @@ static int gate_get_start_time(struct tcf_gate *gact, ktime_t *start) if (ktime_after(base, now)) { *start = base; - return 0; + return; } cycle = param->tcfg_cycletime; - /* cycle time should not be zero */ - if (!cycle) - return -EFAULT; - n = div64_u64(ktime_sub_ns(now, base), cycle); *start = ktime_add_ns(base, (n + 1) * cycle); - return 0; } static void gate_start_timer(struct tcf_gate *gact, ktime_t start) @@ -277,6 +272,27 @@ release_list: return err; } +static void gate_setup_timer(struct tcf_gate *gact, u64 basetime, + enum tk_offsets tko, s32 clockid, + bool do_init) +{ + if (!do_init) { + if (basetime == gact->param.tcfg_basetime && + tko == gact->tk_offset && + clockid == gact->param.tcfg_clockid) + return; + + spin_unlock_bh(&gact->tcf_lock); + hrtimer_cancel(&gact->hitimer); + spin_lock_bh(&gact->tcf_lock); + } + gact->param.tcfg_basetime = basetime; + gact->param.tcfg_clockid = clockid; + gact->tk_offset = tko; + hrtimer_init(&gact->hitimer, clockid, HRTIMER_MODE_ABS_SOFT); + gact->hitimer.function = gate_timer_func; +} + static int tcf_gate_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, @@ -287,12 +303,12 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla, enum tk_offsets tk_offset = TK_OFFS_TAI; struct nlattr *tb[TCA_GATE_MAX + 1]; struct tcf_chain *goto_ch = NULL; + u64 cycletime = 0, basetime = 0; struct tcf_gate_params *p; s32 clockid = CLOCK_TAI; struct tcf_gate *gact; struct tc_gate *parm; int ret = 0, err; - u64 basetime = 0; u32 gflags = 0; s32 prio = -1; ktime_t start; @@ -308,6 +324,27 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla, if (!tb[TCA_GATE_PARMS]) return -EINVAL; + if (tb[TCA_GATE_CLOCKID]) { + clockid = nla_get_s32(tb[TCA_GATE_CLOCKID]); + switch (clockid) { + case CLOCK_REALTIME: + tk_offset = TK_OFFS_REAL; + break; + case CLOCK_MONOTONIC: + tk_offset = TK_OFFS_MAX; + break; + case CLOCK_BOOTTIME: + tk_offset = TK_OFFS_BOOT; + break; + case CLOCK_TAI: + tk_offset = TK_OFFS_TAI; + break; + default: + NL_SET_ERR_MSG(extack, "Invalid 'clockid'"); + return -EINVAL; + } + } + parm = nla_data(tb[TCA_GATE_PARMS]); index = parm->index; @@ -331,10 +368,6 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla, tcf_idr_release(*a, bind); return -EEXIST; } - if (ret == ACT_P_CREATED) { - 
to_gate(*a)->param.tcfg_clockid = -1; - INIT_LIST_HEAD(&(to_gate(*a)->param.entries)); - } if (tb[TCA_GATE_PRIORITY]) prio = nla_get_s32(tb[TCA_GATE_PRIORITY]); @@ -345,41 +378,19 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla, if (tb[TCA_GATE_FLAGS]) gflags = nla_get_u32(tb[TCA_GATE_FLAGS]); - if (tb[TCA_GATE_CLOCKID]) { - clockid = nla_get_s32(tb[TCA_GATE_CLOCKID]); - switch (clockid) { - case CLOCK_REALTIME: - tk_offset = TK_OFFS_REAL; - break; - case CLOCK_MONOTONIC: - tk_offset = TK_OFFS_MAX; - break; - case CLOCK_BOOTTIME: - tk_offset = TK_OFFS_BOOT; - break; - case CLOCK_TAI: - tk_offset = TK_OFFS_TAI; - break; - default: - NL_SET_ERR_MSG(extack, "Invalid 'clockid'"); - goto release_idr; - } - } + gact = to_gate(*a); + if (ret == ACT_P_CREATED) + INIT_LIST_HEAD(&gact->param.entries); err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); if (err < 0) goto release_idr; - gact = to_gate(*a); - spin_lock_bh(&gact->tcf_lock); p = &gact->param; - if (tb[TCA_GATE_CYCLE_TIME]) { - p->tcfg_cycletime = nla_get_u64(tb[TCA_GATE_CYCLE_TIME]); - if (!p->tcfg_cycletime_ext) - goto chain_put; - } + if (tb[TCA_GATE_CYCLE_TIME]) + cycletime = nla_get_u64(tb[TCA_GATE_CYCLE_TIME]); if (tb[TCA_GATE_ENTRY_LIST]) { err = parse_gate_list(tb[TCA_GATE_ENTRY_LIST], p, extack); @@ -387,35 +398,29 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla, goto chain_put; } - if (!p->tcfg_cycletime) { + if (!cycletime) { struct tcfg_gate_entry *entry; ktime_t cycle = 0; list_for_each_entry(entry, &p->entries, list) cycle = ktime_add_ns(cycle, entry->interval); - p->tcfg_cycletime = cycle; + cycletime = cycle; + if (!cycletime) { + err = -EINVAL; + goto chain_put; + } } + p->tcfg_cycletime = cycletime; if (tb[TCA_GATE_CYCLE_TIME_EXT]) p->tcfg_cycletime_ext = nla_get_u64(tb[TCA_GATE_CYCLE_TIME_EXT]); + gate_setup_timer(gact, basetime, tk_offset, clockid, + ret == ACT_P_CREATED); p->tcfg_priority = prio; - p->tcfg_basetime = basetime; - p->tcfg_clockid = clockid; p->tcfg_flags = gflags; - - gact->tk_offset = tk_offset; - hrtimer_init(&gact->hitimer, clockid, HRTIMER_MODE_ABS_SOFT); - gact->hitimer.function = gate_timer_func; - - err = gate_get_start_time(gact, &start); - if (err < 0) { - NL_SET_ERR_MSG(extack, - "Internal error: failed get start time"); - release_entry_list(&p->entries); - goto chain_put; - } + gate_get_start_time(gact, &start); gact->current_close_time = start; gact->current_gate_status = GATE_ACT_GATE_OPEN | GATE_ACT_PENDING; @@ -443,6 +448,13 @@ chain_put: if (goto_ch) tcf_chain_put_by_act(goto_ch); release_idr: + /* action is not inserted in any list: it's safe to init hitimer + * without taking tcf_lock. 
+ */ + if (ret == ACT_P_CREATED) + gate_setup_timer(gact, gact->param.tcfg_basetime, + gact->tk_offset, gact->param.tcfg_clockid, + true); tcf_idr_release(*a, bind); return err; } @@ -453,9 +465,7 @@ static void tcf_gate_cleanup(struct tc_action *a) struct tcf_gate_params *p; p = &gact->param; - if (p->tcfg_clockid != -1) - hrtimer_cancel(&gact->hitimer); - + hrtimer_cancel(&gact->hitimer); release_entry_list(&p->entries); } diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index b19a0021a0bd..265a61d011df 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -464,6 +464,7 @@ void __netdev_watchdog_up(struct net_device *dev) dev_hold(dev); } } +EXPORT_SYMBOL_GPL(__netdev_watchdog_up); static void dev_watchdog_up(struct net_device *dev) { diff --git a/net/sctp/Kconfig b/net/sctp/Kconfig index 68934438ee19..39d7fa9569f8 100644 --- a/net/sctp/Kconfig +++ b/net/sctp/Kconfig @@ -11,7 +11,7 @@ menuconfig IP_SCTP select CRYPTO_HMAC select CRYPTO_SHA1 select LIBCRC32C - ---help--- + help Stream Control Transmission Protocol From RFC 2960 <http://www.ietf.org/rfc/rfc2960.txt>. diff --git a/net/smc/Kconfig b/net/smc/Kconfig index f54a70b8da82..1ab3c5a2c5ad 100644 --- a/net/smc/Kconfig +++ b/net/smc/Kconfig @@ -2,7 +2,7 @@ config SMC tristate "SMC socket protocol family" depends on INET && INFINIBAND - ---help--- + help SMC-R provides a "sockets over RDMA" solution making use of RDMA over Converged Ethernet (RoCE) technology to upgrade AF_INET TCP connections transparently. @@ -14,7 +14,7 @@ config SMC config SMC_DIAG tristate "SMC: socket monitoring interface" depends on SMC - ---help--- + help Support for SMC socket monitoring interface used by tools such as smcss. diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index f0a5064bf9bd..562a52d01ad1 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -548,18 +548,18 @@ static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev) static struct ib_client smc_ib_client; /* callback function for ib_register_client() */ -static void smc_ib_add_dev(struct ib_device *ibdev) +static int smc_ib_add_dev(struct ib_device *ibdev) { struct smc_ib_device *smcibdev; u8 port_cnt; int i; if (ibdev->node_type != RDMA_NODE_IB_CA) - return; + return -EOPNOTSUPP; smcibdev = kzalloc(sizeof(*smcibdev), GFP_KERNEL); if (!smcibdev) - return; + return -ENOMEM; smcibdev->ibdev = ibdev; INIT_WORK(&smcibdev->port_event_work, smc_ib_port_event_work); @@ -594,17 +594,14 @@ static void smc_ib_add_dev(struct ib_device *ibdev) ""); } schedule_work(&smcibdev->port_event_work); + return 0; } /* callback function for ib_unregister_client() */ static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data) { - struct smc_ib_device *smcibdev; + struct smc_ib_device *smcibdev = client_data; - smcibdev = ib_get_client_data(ibdev, &smc_ib_client); - if (!smcibdev || smcibdev->ibdev != ibdev) - return; - ib_set_client_data(ibdev, &smc_ib_client, NULL); spin_lock(&smc_ib_devices.lock); list_del_init(&smcibdev->list); /* remove from smc_ib_devices */ spin_unlock(&smc_ib_devices.lock); diff --git a/net/sunrpc/addr.c b/net/sunrpc/addr.c index 8b4d72b1a066..010dcb876f9d 100644 --- a/net/sunrpc/addr.c +++ b/net/sunrpc/addr.c @@ -82,11 +82,11 @@ static size_t rpc_ntop6(const struct sockaddr *sap, rc = snprintf(scopebuf, sizeof(scopebuf), "%c%u", IPV6_SCOPE_DELIMITER, sin6->sin6_scope_id); - if (unlikely((size_t)rc > sizeof(scopebuf))) + if (unlikely((size_t)rc >= sizeof(scopebuf))) return 0; len += rc; - if (unlikely(len > buflen)) + if (unlikely(len 
>= buflen)) return 0; strcat(buf, scopebuf); diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 5748ad0ba1bd..a9f0d17fdb0d 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -81,7 +81,7 @@ static int param_get_hashtbl_sz(char *buffer, const struct kernel_param *kp) unsigned int nbits; nbits = *(unsigned int *)kp->arg; - return sprintf(buffer, "%u", 1U << nbits); + return sprintf(buffer, "%u\n", 1U << nbits); } #define param_check_hashtbl_sz(name, p) __param_check(name, p, unsigned int); diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index ac5cac0dd24b..4ecc2a959567 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -254,7 +254,7 @@ gss_fill_context(const void *p, const void *end, struct gss_cl_ctx *ctx, struct if (IS_ERR(p)) goto err; done: - trace_rpcgss_context(ctx->gc_expiry, now, timeout, + trace_rpcgss_context(window_size, ctx->gc_expiry, now, timeout, ctx->gc_acceptor.len, ctx->gc_acceptor.data); err: return p; @@ -697,10 +697,12 @@ retry: } schedule(); } - if (gss_msg->ctx) + if (gss_msg->ctx) { + trace_rpcgss_ctx_init(gss_cred); gss_cred_set_ctx(cred, gss_msg->ctx); - else + } else { err = gss_msg->msg.errno; + } spin_unlock(&pipe->lock); out_intr: finish_wait(&gss_msg->waitqueue, &wait); @@ -1054,11 +1056,11 @@ gss_create_new(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt) auth->au_rslack = GSS_KRB5_MAX_SLACK_NEEDED >> 2; auth->au_verfsize = GSS_VERF_SLACK >> 2; auth->au_ralign = GSS_VERF_SLACK >> 2; - auth->au_flags = 0; + __set_bit(RPCAUTH_AUTH_UPDATE_SLACK, &auth->au_flags); auth->au_ops = &authgss_ops; auth->au_flavor = flavor; if (gss_pseudoflavor_to_datatouch(gss_auth->mech, flavor)) - auth->au_flags |= RPCAUTH_AUTH_DATATOUCH; + __set_bit(RPCAUTH_AUTH_DATATOUCH, &auth->au_flags); refcount_set(&auth->au_count, 1); kref_init(&gss_auth->kref); @@ -1284,8 +1286,9 @@ gss_send_destroy_context(struct rpc_cred *cred) if (new) { ctx->gc_proc = RPC_GSS_PROC_DESTROY; + trace_rpcgss_ctx_destroy(gss_cred); task = rpc_call_null(gss_auth->client, &new->gc_base, - RPC_TASK_ASYNC|RPC_TASK_SOFT); + RPC_TASK_ASYNC); if (!IS_ERR(task)) rpc_put_task(task); @@ -1349,7 +1352,6 @@ gss_destroy_nullcred(struct rpc_cred *cred) static void gss_destroy_cred(struct rpc_cred *cred) { - if (test_and_clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) != 0) gss_send_destroy_context(cred); gss_destroy_nullcred(cred); @@ -1613,6 +1615,7 @@ static int gss_renew_cred(struct rpc_task *task) new = gss_lookup_cred(auth, &acred, RPCAUTH_LOOKUP_NEW); if (IS_ERR(new)) return PTR_ERR(new); + task->tk_rqstp->rq_cred = new; put_rpccred(oldcred); return 0; @@ -1709,7 +1712,8 @@ gss_validate(struct rpc_task *task, struct xdr_stream *xdr) /* We leave it to unwrap to calculate au_rslack. 
For now we just * calculate the length of the verifier: */ - cred->cr_auth->au_verfsize = XDR_QUADLEN(len) + 2; + if (test_bit(RPCAUTH_AUTH_UPDATE_SLACK, &cred->cr_auth->au_flags)) + cred->cr_auth->au_verfsize = XDR_QUADLEN(len) + 2; status = 0; out: gss_put_ctx(ctx); @@ -1927,13 +1931,30 @@ out: return status; } -static int -gss_unwrap_resp_auth(struct rpc_cred *cred) +/** + * gss_update_rslack - Possibly update RPC receive buffer size estimates + * @task: rpc_task for incoming RPC Reply being unwrapped + * @cred: controlling rpc_cred for @task + * @before: XDR words needed before each RPC Reply message + * @after: XDR words needed following each RPC Reply message + * + */ +static void gss_update_rslack(struct rpc_task *task, struct rpc_cred *cred, + unsigned int before, unsigned int after) { struct rpc_auth *auth = cred->cr_auth; - auth->au_rslack = auth->au_verfsize; - auth->au_ralign = auth->au_verfsize; + if (test_and_clear_bit(RPCAUTH_AUTH_UPDATE_SLACK, &auth->au_flags)) { + auth->au_ralign = auth->au_verfsize + before; + auth->au_rslack = auth->au_verfsize + after; + trace_rpcgss_update_slack(task, auth); + } +} + +static int +gss_unwrap_resp_auth(struct rpc_task *task, struct rpc_cred *cred) +{ + gss_update_rslack(task, cred, 0, 0); return 0; } @@ -1956,7 +1977,6 @@ gss_unwrap_resp_integ(struct rpc_task *task, struct rpc_cred *cred, struct xdr_stream *xdr) { struct xdr_buf gss_data, *rcv_buf = &rqstp->rq_rcv_buf; - struct rpc_auth *auth = cred->cr_auth; u32 len, offset, seqno, maj_stat; struct xdr_netobj mic; int ret; @@ -2005,8 +2025,7 @@ gss_unwrap_resp_integ(struct rpc_task *task, struct rpc_cred *cred, if (maj_stat != GSS_S_COMPLETE) goto bad_mic; - auth->au_rslack = auth->au_verfsize + 2 + 1 + XDR_QUADLEN(mic.len); - auth->au_ralign = auth->au_verfsize + 2; + gss_update_rslack(task, cred, 2, 2 + 1 + XDR_QUADLEN(mic.len)); ret = 0; out: @@ -2031,7 +2050,6 @@ gss_unwrap_resp_priv(struct rpc_task *task, struct rpc_cred *cred, { struct xdr_buf *rcv_buf = &rqstp->rq_rcv_buf; struct kvec *head = rqstp->rq_rcv_buf.head; - struct rpc_auth *auth = cred->cr_auth; u32 offset, opaque_len, maj_stat; __be32 *p; @@ -2058,8 +2076,8 @@ gss_unwrap_resp_priv(struct rpc_task *task, struct rpc_cred *cred, */ xdr_init_decode(xdr, rcv_buf, p, rqstp); - auth->au_rslack = auth->au_verfsize + 2 + ctx->gc_gss_ctx->slack; - auth->au_ralign = auth->au_verfsize + 2 + ctx->gc_gss_ctx->align; + gss_update_rslack(task, cred, 2 + ctx->gc_gss_ctx->align, + 2 + ctx->gc_gss_ctx->slack); return 0; unwrap_failed: @@ -2130,7 +2148,7 @@ gss_unwrap_resp(struct rpc_task *task, struct xdr_stream *xdr) goto out_decode; switch (gss_cred->gc_service) { case RPC_GSS_SVC_NONE: - status = gss_unwrap_resp_auth(cred); + status = gss_unwrap_resp_auth(task, cred); break; case RPC_GSS_SVC_INTEGRITY: status = gss_unwrap_resp_integ(task, cred, ctx, rqstp, xdr); diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c index 69316ab1b9fa..fae632da1058 100644 --- a/net/sunrpc/auth_gss/gss_mech_switch.c +++ b/net/sunrpc/auth_gss/gss_mech_switch.c @@ -37,6 +37,8 @@ gss_mech_free(struct gss_api_mech *gm) for (i = 0; i < gm->gm_pf_num; i++) { pf = &gm->gm_pfs[i]; + if (pf->domain) + auth_domain_put(pf->domain); kfree(pf->auth_domain_name); pf->auth_domain_name = NULL; } @@ -59,6 +61,7 @@ make_auth_domain_name(char *name) static int gss_mech_svc_setup(struct gss_api_mech *gm) { + struct auth_domain *dom; struct pf_desc *pf; int i, status; @@ -68,10 +71,13 @@ gss_mech_svc_setup(struct gss_api_mech *gm) 
status = -ENOMEM; if (pf->auth_domain_name == NULL) goto out; - status = svcauth_gss_register_pseudoflavor(pf->pseudoflavor, - pf->auth_domain_name); - if (status) + dom = svcauth_gss_register_pseudoflavor( + pf->pseudoflavor, pf->auth_domain_name); + if (IS_ERR(dom)) { + status = PTR_ERR(dom); goto out; + } + pf->domain = dom; } return 0; out: diff --git a/net/sunrpc/auth_gss/gss_rpc_upcall.c b/net/sunrpc/auth_gss/gss_rpc_upcall.c index 0349f455a862..af9c7f43859c 100644 --- a/net/sunrpc/auth_gss/gss_rpc_upcall.c +++ b/net/sunrpc/auth_gss/gss_rpc_upcall.c @@ -223,7 +223,7 @@ static int gssp_alloc_receive_pages(struct gssx_arg_accept_sec_context *arg) static char *gssp_stringify(struct xdr_netobj *netobj) { - return kstrndup(netobj->data, netobj->len, GFP_KERNEL); + return kmemdup_nul(netobj->data, netobj->len, GFP_KERNEL); } static void gssp_hostbased_service(char **principal) diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 50d93c49ef1a..46027d0c903f 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -809,7 +809,7 @@ u32 svcauth_gss_flavor(struct auth_domain *dom) EXPORT_SYMBOL_GPL(svcauth_gss_flavor); -int +struct auth_domain * svcauth_gss_register_pseudoflavor(u32 pseudoflavor, char * name) { struct gss_domain *new; @@ -826,21 +826,23 @@ svcauth_gss_register_pseudoflavor(u32 pseudoflavor, char * name) new->h.flavour = &svcauthops_gss; new->pseudoflavor = pseudoflavor; - stat = 0; test = auth_domain_lookup(name, &new->h); - if (test != &new->h) { /* Duplicate registration */ + if (test != &new->h) { + pr_warn("svc: duplicate registration of gss pseudo flavour %s.\n", + name); + stat = -EADDRINUSE; auth_domain_put(test); - kfree(new->h.name); - goto out_free_dom; + goto out_free_name; } - return 0; + return test; +out_free_name: + kfree(new->h.name); out_free_dom: kfree(new); out: - return stat; + return ERR_PTR(stat); } - EXPORT_SYMBOL_GPL(svcauth_gss_register_pseudoflavor); static inline int diff --git a/net/sunrpc/auth_gss/trace.c b/net/sunrpc/auth_gss/trace.c index 5576f1e66de9..49fa583d7f91 100644 --- a/net/sunrpc/auth_gss/trace.c +++ b/net/sunrpc/auth_gss/trace.c @@ -6,6 +6,7 @@ #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/sched.h> #include <linux/sunrpc/gss_err.h> +#include <linux/sunrpc/auth_gss.h> #define CREATE_TRACE_POINTS #include <trace/events/rpcgss.h> diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 61b21dafd7c0..a91d1cdad9d7 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -370,10 +370,6 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, const char *nodename = args->nodename; int err; - /* sanity check the name before trying to print it */ - dprintk("RPC: creating %s client for %s (xprt %p)\n", - program->name, args->servername, xprt); - err = rpciod_up(); if (err) goto out_no_rpciod; @@ -436,6 +432,8 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, goto out_no_path; if (parent) atomic_inc(&parent->cl_count); + + trace_rpc_clnt_new(clnt, xprt, program->name, args->servername); return clnt; out_no_path: @@ -450,6 +448,7 @@ out_err: out_no_rpciod: xprt_switch_put(xps); xprt_put(xprt); + trace_rpc_clnt_new_err(program->name, args->servername, err); return ERR_PTR(err); } @@ -634,10 +633,8 @@ static struct rpc_clnt *__rpc_clone_client(struct rpc_create_args *args, args->nodename = clnt->cl_nodename; new = rpc_new_client(args, xps, xprt, clnt); - if (IS_ERR(new)) { - err = PTR_ERR(new); - goto out_err; - } + if 
(IS_ERR(new)) + return new; /* Turn off autobind on clones */ new->cl_autobind = 0; @@ -650,7 +647,7 @@ static struct rpc_clnt *__rpc_clone_client(struct rpc_create_args *args, return new; out_err: - dprintk("RPC: %s: returned error %d\n", __func__, err); + trace_rpc_clnt_clone_err(clnt, err); return ERR_PTR(err); } @@ -723,11 +720,8 @@ int rpc_switch_client_transport(struct rpc_clnt *clnt, int err; xprt = xprt_create_transport(args); - if (IS_ERR(xprt)) { - dprintk("RPC: failed to create new xprt for clnt %p\n", - clnt); + if (IS_ERR(xprt)) return PTR_ERR(xprt); - } xps = xprt_switch_alloc(xprt, GFP_KERNEL); if (xps == NULL) { @@ -767,7 +761,7 @@ int rpc_switch_client_transport(struct rpc_clnt *clnt, rpc_release_client(parent); xprt_switch_put(oldxps); xprt_put(old); - dprintk("RPC: replaced xprt for clnt %p\n", clnt); + trace_rpc_clnt_replace_xprt(clnt); return 0; out_revert: @@ -777,7 +771,7 @@ out_revert: rpc_client_register(clnt, pseudoflavor, NULL); xprt_switch_put(xps); xprt_put(xprt); - dprintk("RPC: failed to switch xprt for clnt %p\n", clnt); + trace_rpc_clnt_replace_xprt_err(clnt); return err; } EXPORT_SYMBOL_GPL(rpc_switch_client_transport); @@ -844,10 +838,11 @@ void rpc_killall_tasks(struct rpc_clnt *clnt) if (list_empty(&clnt->cl_tasks)) return; - dprintk("RPC: killing all tasks for client %p\n", clnt); + /* * Spin lock all_tasks to prevent changes... */ + trace_rpc_clnt_killall(clnt); spin_lock(&clnt->cl_lock); list_for_each_entry(rovr, &clnt->cl_tasks, tk_task) rpc_signal_task(rovr); @@ -863,9 +858,7 @@ void rpc_shutdown_client(struct rpc_clnt *clnt) { might_sleep(); - dprintk_rcu("RPC: shutting down %s client for %s\n", - clnt->cl_program->name, - rcu_dereference(clnt->cl_xprt)->servername); + trace_rpc_clnt_shutdown(clnt); while (!list_empty(&clnt->cl_tasks)) { rpc_killall_tasks(clnt); @@ -884,6 +877,8 @@ static void rpc_free_client_work(struct work_struct *work) { struct rpc_clnt *clnt = container_of(work, struct rpc_clnt, cl_work); + trace_rpc_clnt_free(clnt); + /* These might block on processes that might allocate memory, * so they cannot be called in rpciod, so they are handled separately * here. 
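The comment in rpc_free_client_work() above reflects a common kernel pattern: teardown that may block or allocate is punted from a context that must not sleep (rpciod here) to a workqueue. A minimal sketch of that pattern with a hypothetical object type, showing only the deferral and none of the rpc_clnt specifics:

#include <linux/workqueue.h>
#include <linux/slab.h>

struct obj {
        struct work_struct free_work;
        /* ... payload whose teardown may sleep ... */
};

static void obj_free_work(struct work_struct *work)
{
        struct obj *o = container_of(work, struct obj, free_work);

        /* Process context: blocking and allocation are allowed here. */
        kfree(o);
}

/* Final-put path, callable from contexts that must not sleep. */
static void obj_release(struct obj *o)
{
        INIT_WORK(&o->free_work, obj_free_work);
        schedule_work(&o->free_work);
}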
@@ -901,9 +896,7 @@ rpc_free_client(struct rpc_clnt *clnt) { struct rpc_clnt *parent = NULL; - dprintk_rcu("RPC: destroying %s client for %s\n", - clnt->cl_program->name, - rcu_dereference(clnt->cl_xprt)->servername); + trace_rpc_clnt_release(clnt); if (clnt->cl_parent != clnt) parent = clnt->cl_parent; rpc_unregister_client(clnt); @@ -945,8 +938,6 @@ rpc_free_auth(struct rpc_clnt *clnt) void rpc_release_client(struct rpc_clnt *clnt) { - dprintk("RPC: rpc_release_client(%p)\n", clnt); - do { if (list_empty(&clnt->cl_tasks)) wake_up(&destroy_wait); @@ -1270,7 +1261,7 @@ void rpc_prepare_reply_pages(struct rpc_rqst *req, struct page **pages, hdrsize += RPC_REPHDRSIZE + req->rq_cred->cr_auth->au_ralign - 1; xdr_inline_pages(&req->rq_rcv_buf, hdrsize << 2, pages, base, len); - trace_rpc_reply_pages(req); + trace_rpc_xdr_reply_pages(req->rq_task, &req->rq_rcv_buf); } EXPORT_SYMBOL_GPL(rpc_prepare_reply_pages); @@ -1624,6 +1615,7 @@ const char static void __rpc_call_rpcerror(struct rpc_task *task, int tk_status, int rpc_status) { + trace_rpc_call_rpcerror(task, tk_status, rpc_status); task->tk_rpc_status = rpc_status; rpc_exit(task, tk_status); } @@ -2531,7 +2523,7 @@ call_decode(struct rpc_task *task) goto out; req->rq_rcv_buf.len = req->rq_private_buf.len; - trace_xprt_recvfrom(&req->rq_rcv_buf); + trace_rpc_xdr_recvfrom(task, &req->rq_rcv_buf); /* Check that the softirq receive buffer is valid */ WARN_ON(memcmp(&req->rq_rcv_buf, &req->rq_private_buf, @@ -2760,7 +2752,8 @@ struct rpc_task *rpc_call_null_helper(struct rpc_clnt *clnt, .rpc_op_cred = cred, .callback_ops = (ops != NULL) ? ops : &rpc_default_ops, .callback_data = data, - .flags = flags | RPC_TASK_NULLCREDS, + .flags = flags | RPC_TASK_SOFT | RPC_TASK_SOFTCONN | + RPC_TASK_NULLCREDS, }; return rpc_run_task(&task_setup_data); @@ -2823,8 +2816,7 @@ int rpc_clnt_test_and_add_xprt(struct rpc_clnt *clnt, goto success; } - task = rpc_call_null_helper(clnt, xprt, NULL, - RPC_TASK_SOFT|RPC_TASK_SOFTCONN|RPC_TASK_ASYNC|RPC_TASK_NULLCREDS, + task = rpc_call_null_helper(clnt, xprt, NULL, RPC_TASK_ASYNC, &rpc_cb_add_xprt_call_ops, data); rpc_put_task(task); @@ -2867,9 +2859,7 @@ int rpc_clnt_setup_test_and_add_xprt(struct rpc_clnt *clnt, goto out_err; /* Test the connection */ - task = rpc_call_null_helper(clnt, xprt, NULL, - RPC_TASK_SOFT | RPC_TASK_SOFTCONN | RPC_TASK_NULLCREDS, - NULL, NULL); + task = rpc_call_null_helper(clnt, xprt, NULL, 0, NULL, NULL); if (IS_ERR(task)) { status = PTR_ERR(task); goto out_err; diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index 4a020b688860..c27123e6ba80 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -795,12 +795,6 @@ void rpcb_getport_async(struct rpc_task *task) child = rpcb_call_async(rpcb_clnt, map, proc); rpc_release_client(rpcb_clnt); - if (IS_ERR(child)) { - /* rpcb_map_release() has freed the arguments */ - dprintk("RPC: %5u %s: rpc_run_task failed\n", - task->tk_pid, __func__); - return; - } xprt->stat.bind_count++; rpc_put_task(child); diff --git a/net/sunrpc/sunrpc.h b/net/sunrpc/sunrpc.h index 47a756503d11..f6fe2e6cd65a 100644 --- a/net/sunrpc/sunrpc.h +++ b/net/sunrpc/sunrpc.h @@ -52,4 +52,5 @@ static inline int sock_is_loopback(struct sock *sk) int rpc_clients_notifier_register(void); void rpc_clients_notifier_unregister(void); +void auth_domain_cleanup(void); #endif /* _NET_SUNRPC_SUNRPC_H */ diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index f9edaa9174a4..236fadc4a439 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c 
@@ -23,6 +23,7 @@ #include <linux/sunrpc/rpc_pipe_fs.h> #include <linux/sunrpc/xprtsock.h> +#include "sunrpc.h" #include "netns.h" unsigned int sunrpc_net_id; @@ -131,6 +132,7 @@ cleanup_sunrpc(void) unregister_rpc_pipefs(); rpc_destroy_mempool(); unregister_pernet_subsys(&sunrpc_net_ops); + auth_domain_cleanup(); #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) rpc_unregister_sysctl(); #endif diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 9ed3126600ce..c211b607239e 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -88,15 +88,15 @@ param_get_pool_mode(char *buf, const struct kernel_param *kp) switch (*ip) { case SVC_POOL_AUTO: - return strlcpy(buf, "auto", 20); + return strlcpy(buf, "auto\n", 20); case SVC_POOL_GLOBAL: - return strlcpy(buf, "global", 20); + return strlcpy(buf, "global\n", 20); case SVC_POOL_PERCPU: - return strlcpy(buf, "percpu", 20); + return strlcpy(buf, "percpu\n", 20); case SVC_POOL_PERNODE: - return strlcpy(buf, "pernode", 20); + return strlcpy(buf, "pernode\n", 20); default: - return sprintf(buf, "%d", *ip); + return sprintf(buf, "%d\n", *ip); } } @@ -991,6 +991,7 @@ static int __svc_register(struct net *net, const char *progname, #endif } + trace_svc_register(progname, version, protocol, port, family, error); return error; } @@ -1000,11 +1001,6 @@ int svc_rpcbind_set_version(struct net *net, unsigned short proto, unsigned short port) { - dprintk("svc: svc_register(%sv%d, %s, %u, %u)\n", - progp->pg_name, version, - proto == IPPROTO_UDP? "udp" : "tcp", - port, family); - return __svc_register(net, progp->pg_name, progp->pg_prog, version, family, proto, port); @@ -1024,11 +1020,8 @@ int svc_generic_rpcbind_set(struct net *net, return 0; if (vers->vs_hidden) { - dprintk("svc: svc_register(%sv%d, %s, %u, %u)" - " (but not telling portmap)\n", - progp->pg_name, version, - proto == IPPROTO_UDP? 
"udp" : "tcp", - port, family); + trace_svc_noregister(progp->pg_name, version, proto, + port, family, 0); return 0; } @@ -1106,8 +1099,7 @@ static void __svc_unregister(struct net *net, const u32 program, const u32 versi if (error == -EPROTONOSUPPORT) error = rpcb_register(net, program, version, 0, 0); - dprintk("svc: %s(%sv%u), error %d\n", - __func__, progname, version, error); + trace_svc_unregister(progname, version, error); } /* @@ -1132,9 +1124,6 @@ static void svc_unregister(const struct svc_serv *serv, struct net *net) continue; if (progp->pg_vers[i]->vs_hidden) continue; - - dprintk("svc: attempting to unregister %sv%u\n", - progp->pg_name, i); __svc_unregister(net, progp->pg_prog, i, progp->pg_name); } } diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 2284ff038dad..43cf8dbde898 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -153,6 +153,7 @@ static void svc_xprt_free(struct kref *kref) xprt_put(xprt->xpt_bc_xprt); if (xprt->xpt_bc_xps) xprt_switch_put(xprt->xpt_bc_xps); + trace_svc_xprt_free(xprt); xprt->xpt_ops->xpo_free(xprt); module_put(owner); } @@ -206,6 +207,7 @@ static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl, .sin6_port = htons(port), }; #endif + struct svc_xprt *xprt; struct sockaddr *sap; size_t len; @@ -224,7 +226,11 @@ static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl, return ERR_PTR(-EAFNOSUPPORT); } - return xcl->xcl_ops->xpo_create(serv, net, sap, len, flags); + xprt = xcl->xcl_ops->xpo_create(serv, net, sap, len, flags); + if (IS_ERR(xprt)) + trace_svc_xprt_create_err(serv->sv_program->pg_name, + xcl->xcl_name, sap, xprt); + return xprt; } /* @@ -304,15 +310,11 @@ int svc_create_xprt(struct svc_serv *serv, const char *xprt_name, { int err; - dprintk("svc: creating transport %s[%d]\n", xprt_name, port); err = _svc_create_xprt(serv, xprt_name, net, family, port, flags, cred); if (err == -EPROTONOSUPPORT) { request_module("svc%s", xprt_name); err = _svc_create_xprt(serv, xprt_name, net, family, port, flags, cred); } - if (err < 0) - dprintk("svc: transport %s not found, err %d\n", - xprt_name, -err); return err; } EXPORT_SYMBOL_GPL(svc_create_xprt); @@ -780,7 +782,6 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt) int len = 0; if (test_bit(XPT_CLOSE, &xprt->xpt_flags)) { - dprintk("svc_recv: found XPT_CLOSE\n"); if (test_and_clear_bit(XPT_KILL_TEMP, &xprt->xpt_flags)) xprt->xpt_ops->xpo_kill_temp_xprt(xprt); svc_delete_xprt(xprt); @@ -799,6 +800,7 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt) if (newxpt) { newxpt->xpt_cred = get_cred(xprt->xpt_cred); svc_add_new_temp_xprt(serv, newxpt); + trace_svc_xprt_accept(newxpt, serv->sv_name); } else module_put(xprt->xpt_class->xcl_owner); } else if (svc_xprt_reserve_slot(rqstp, xprt)) { @@ -812,7 +814,7 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt) else len = xprt->xpt_ops->xpo_recvfrom(rqstp); if (len > 0) - trace_svc_recvfrom(&rqstp->rq_arg); + trace_svc_xdr_recvfrom(rqstp, &rqstp->rq_arg); rqstp->rq_stime = ktime_get(); rqstp->rq_reserved = serv->sv_max_mesg; atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved); @@ -835,14 +837,6 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) struct svc_serv *serv = rqstp->rq_server; int len, err; - dprintk("svc: server %p waiting for data (to = %ld)\n", - rqstp, timeout); - - if (rqstp->rq_xprt) - printk(KERN_ERR - "svc_recv: service %p, transport not NULL!\n", - rqstp); - err = svc_alloc_arg(rqstp); if (err) goto out; @@ 
-890,7 +884,6 @@ EXPORT_SYMBOL_GPL(svc_recv); void svc_drop(struct svc_rqst *rqstp) { trace_svc_drop(rqstp); - dprintk("svc: xprt %p dropped request\n", rqstp->rq_xprt); svc_xprt_release(rqstp); } EXPORT_SYMBOL_GPL(svc_drop); @@ -913,17 +906,11 @@ int svc_send(struct svc_rqst *rqstp) xb->len = xb->head[0].iov_len + xb->page_len + xb->tail[0].iov_len; - trace_svc_sendto(xb); - - /* Grab mutex to serialize outgoing data. */ - mutex_lock(&xprt->xpt_mutex); + trace_svc_xdr_sendto(rqstp, xb); trace_svc_stats_latency(rqstp); - if (test_bit(XPT_DEAD, &xprt->xpt_flags) - || test_bit(XPT_CLOSE, &xprt->xpt_flags)) - len = -ENOTCONN; - else - len = xprt->xpt_ops->xpo_sendto(rqstp); - mutex_unlock(&xprt->xpt_mutex); + + len = xprt->xpt_ops->xpo_sendto(rqstp); + trace_svc_send(rqstp, len); svc_xprt_release(rqstp); @@ -1031,11 +1018,10 @@ static void svc_delete_xprt(struct svc_xprt *xprt) struct svc_serv *serv = xprt->xpt_server; struct svc_deferred_req *dr; - /* Only do this once */ if (test_and_set_bit(XPT_DEAD, &xprt->xpt_flags)) - BUG(); + return; - dprintk("svc: svc_delete_xprt(%p)\n", xprt); + trace_svc_xprt_detach(xprt); xprt->xpt_ops->xpo_detach(xprt); if (xprt->xpt_bc_xprt) xprt->xpt_bc_xprt->ops->close(xprt->xpt_bc_xprt); @@ -1056,6 +1042,7 @@ static void svc_delete_xprt(struct svc_xprt *xprt) void svc_close_xprt(struct svc_xprt *xprt) { + trace_svc_xprt_close(xprt); set_bit(XPT_CLOSE, &xprt->xpt_flags); if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags)) /* someone else will have to effect the close */ @@ -1158,16 +1145,15 @@ static void svc_revisit(struct cache_deferred_req *dreq, int too_many) set_bit(XPT_DEFERRED, &xprt->xpt_flags); if (too_many || test_bit(XPT_DEAD, &xprt->xpt_flags)) { spin_unlock(&xprt->xpt_lock); - dprintk("revisit canceled\n"); + trace_svc_defer_drop(dr); svc_xprt_put(xprt); - trace_svc_drop_deferred(dr); kfree(dr); return; } - dprintk("revisit queued\n"); dr->xprt = NULL; list_add(&dr->handle.recent, &xprt->xpt_deferred); spin_unlock(&xprt->xpt_lock); + trace_svc_defer_queue(dr); svc_xprt_enqueue(xprt); svc_xprt_put(xprt); } @@ -1213,22 +1199,24 @@ static struct cache_deferred_req *svc_defer(struct cache_req *req) memcpy(dr->args, rqstp->rq_arg.head[0].iov_base - skip, dr->argslen << 2); } + trace_svc_defer(rqstp); svc_xprt_get(rqstp->rq_xprt); dr->xprt = rqstp->rq_xprt; set_bit(RQ_DROPME, &rqstp->rq_flags); dr->handle.revisit = svc_revisit; - trace_svc_defer(rqstp); return &dr->handle; } /* * recv data from a deferred request into an active one */ -static int svc_deferred_recv(struct svc_rqst *rqstp) +static noinline int svc_deferred_recv(struct svc_rqst *rqstp) { struct svc_deferred_req *dr = rqstp->rq_deferred; + trace_svc_defer_recv(dr); + /* setup iov_base past transport header */ rqstp->rq_arg.head[0].iov_base = dr->args + (dr->xprt_hlen>>2); /* The iov_len does not include the transport header bytes */ @@ -1259,7 +1247,6 @@ static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt) struct svc_deferred_req, handle.recent); list_del_init(&dr->handle.recent); - trace_svc_revisit_deferred(dr); } else clear_bit(XPT_DEFERRED, &xprt->xpt_flags); spin_unlock(&xprt->xpt_lock); diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c index 552617e3467b..998b196b6176 100644 --- a/net/sunrpc/svcauth.c +++ b/net/sunrpc/svcauth.c @@ -21,6 +21,8 @@ #include <trace/events/sunrpc.h> +#include "sunrpc.h" + #define RPCDBG_FACILITY RPCDBG_AUTH @@ -205,3 +207,26 @@ struct auth_domain *auth_domain_find(char *name) return NULL; } EXPORT_SYMBOL_GPL(auth_domain_find); + 
+/** + * auth_domain_cleanup - check that the auth_domain table is empty + * + * On module unload the auth_domain_table must be empty. To make it + * easier to catch bugs which don't clean up domains properly, we + * warn if anything remains in the table at cleanup time. + * + * Note that we cannot proactively remove the domains at this stage. + * The ->release() function might be in a module that has already been + * unloaded. + */ + +void auth_domain_cleanup(void) +{ + int h; + struct auth_domain *hp; + + for (h = 0; h < DN_HASHMAX; h++) + hlist_for_each_entry(hp, &auth_domain_table[h], hash) + pr_warn("svc: domain %s still present at module unload.\n", + hp->name); +} diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index 6c8f802c4261..97c0bddba7a3 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -332,15 +332,6 @@ static int __ip_map_update(struct cache_detail *cd, struct ip_map *ipm, return 0; } -static inline int ip_map_update(struct net *net, struct ip_map *ipm, - struct unix_domain *udom, time64_t expiry) -{ - struct sunrpc_net *sn; - - sn = net_generic(net, sunrpc_net_id); - return __ip_map_update(sn->ip_map_cache, ipm, udom, expiry); -} - void svcauth_unix_purge(struct net *net) { struct sunrpc_net *sn; diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index e7a0037d9b56..5c4ec9386f81 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -45,7 +45,6 @@ #include <net/tcp_states.h> #include <linux/uaccess.h> #include <asm/ioctls.h> -#include <trace/events/skb.h> #include <linux/sunrpc/types.h> #include <linux/sunrpc/clnt.h> @@ -55,6 +54,8 @@ #include <linux/sunrpc/stats.h> #include <linux/sunrpc/xprt.h> +#include <trace/events/sunrpc.h> + #include "socklib.h" #include "sunrpc.h" @@ -108,31 +109,35 @@ static void svc_reclassify_socket(struct socket *sock) } #endif -/* - * Release an skbuff after use +/** + * svc_tcp_release_rqst - Release transport-related resources + * @rqstp: request structure with resources to be released + * */ -static void svc_release_skb(struct svc_rqst *rqstp) +static void svc_tcp_release_rqst(struct svc_rqst *rqstp) { struct sk_buff *skb = rqstp->rq_xprt_ctxt; if (skb) { struct svc_sock *svsk = container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); - rqstp->rq_xprt_ctxt = NULL; - dprintk("svc: service %p, releasing skb %p\n", rqstp, skb); + rqstp->rq_xprt_ctxt = NULL; skb_free_datagram_locked(svsk->sk_sk, skb); } } -static void svc_release_udp_skb(struct svc_rqst *rqstp) +/** + * svc_udp_release_rqst - Release transport-related resources + * @rqstp: request structure with resources to be released + * + */ +static void svc_udp_release_rqst(struct svc_rqst *rqstp) { struct sk_buff *skb = rqstp->rq_xprt_ctxt; if (skb) { rqstp->rq_xprt_ctxt = NULL; - - dprintk("svc: service %p, releasing skb %p\n", rqstp, skb); consume_skb(skb); } } @@ -218,34 +223,68 @@ static int svc_one_sock_name(struct svc_sock *svsk, char *buf, int remaining) return len; } +#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE +static void svc_flush_bvec(const struct bio_vec *bvec, size_t size, size_t seek) +{ + struct bvec_iter bi = { + .bi_size = size, + }; + struct bio_vec bv; + + bvec_iter_advance(bvec, &bi, seek & PAGE_MASK); + for_each_bvec(bv, bvec, bi, bi) + flush_dcache_page(bv.bv_page); +} +#else +static inline void svc_flush_bvec(const struct bio_vec *bvec, size_t size, + size_t seek) +{ +} +#endif + /* - * Generic recvfrom routine. + * Read from @rqstp's transport socket. 
The incoming message fills whole + * pages in @rqstp's rq_pages array until the last page of the message + * has been received into a partial page. */ -static ssize_t svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, - unsigned int nr, size_t buflen, unsigned int base) +static ssize_t svc_tcp_read_msg(struct svc_rqst *rqstp, size_t buflen, + size_t seek) { struct svc_sock *svsk = container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); + struct bio_vec *bvec = rqstp->rq_bvec; struct msghdr msg = { NULL }; + unsigned int i; ssize_t len; + size_t t; rqstp->rq_xprt_hlen = 0; clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); - iov_iter_kvec(&msg.msg_iter, READ, iov, nr, buflen); - if (base != 0) { - iov_iter_advance(&msg.msg_iter, base); - buflen -= base; + + for (i = 0, t = 0; t < buflen; i++, t += PAGE_SIZE) { + bvec[i].bv_page = rqstp->rq_pages[i]; + bvec[i].bv_len = PAGE_SIZE; + bvec[i].bv_offset = 0; + } + rqstp->rq_respages = &rqstp->rq_pages[i]; + rqstp->rq_next_page = rqstp->rq_respages + 1; + + iov_iter_bvec(&msg.msg_iter, READ, bvec, i, buflen); + if (seek) { + iov_iter_advance(&msg.msg_iter, seek); + buflen -= seek; } len = sock_recvmsg(svsk->sk_sock, &msg, MSG_DONTWAIT); + if (len > 0) + svc_flush_bvec(bvec, len, seek); + /* If we read a full record, then assume there may be more * data to read (stream based sockets only!) */ if (len == buflen) set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); - dprintk("svc: socket %p recvfrom(%p, %zu) = %zd\n", - svsk, iov[0].iov_base, iov[0].iov_len, len); return len; } @@ -282,13 +321,10 @@ static void svc_data_ready(struct sock *sk) struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; if (svsk) { - dprintk("svc: socket %p(inet %p), busy=%d\n", - svsk, sk, - test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags)); - /* Refer to svc_setup_socket() for details. */ rmb(); svsk->sk_odata(sk); + trace_svcsock_data_ready(&svsk->sk_xprt, 0); if (!test_and_set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags)) svc_xprt_enqueue(&svsk->sk_xprt); } @@ -302,11 +338,9 @@ static void svc_write_space(struct sock *sk) struct svc_sock *svsk = (struct svc_sock *)(sk->sk_user_data); if (svsk) { - dprintk("svc: socket %p(inet %p), write_space busy=%d\n", - svsk, sk, test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags)); - /* Refer to svc_setup_socket() for details. */ rmb(); + trace_svcsock_write_space(&svsk->sk_xprt, 0); svsk->sk_owspace(sk); svc_xprt_enqueue(&svsk->sk_xprt); } @@ -383,8 +417,15 @@ static int svc_udp_get_dest_address(struct svc_rqst *rqstp, return 0; } -/* - * Receive a datagram from a UDP socket. +/** + * svc_udp_recvfrom - Receive a datagram from a UDP socket. + * @rqstp: request structure into which to receive an RPC Call + * + * Called in a loop when XPT_DATA has been set. 
+ * + * Returns: + * On success, the number of bytes in a received RPC Call, or + * %0 if a complete RPC Call message was not ready to return */ static int svc_udp_recvfrom(struct svc_rqst *rqstp) { @@ -418,20 +459,14 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp) svc_sock_setbufsize(svsk, serv->sv_nrthreads + 3); clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); - skb = NULL; err = kernel_recvmsg(svsk->sk_sock, &msg, NULL, 0, 0, MSG_PEEK | MSG_DONTWAIT); - if (err >= 0) - skb = skb_recv_udp(svsk->sk_sk, 0, 1, &err); - - if (skb == NULL) { - if (err != -EAGAIN) { - /* possibly an icmp error */ - dprintk("svc: recvfrom returned error %d\n", -err); - set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); - } - return 0; - } + if (err < 0) + goto out_recv_err; + skb = skb_recv_udp(svsk->sk_sk, 0, 1, &err); + if (!skb) + goto out_recv_err; + len = svc_addr_len(svc_addr(rqstp)); rqstp->rq_addrlen = len; if (skb->tstamp == 0) { @@ -442,26 +477,21 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp) sock_write_timestamp(svsk->sk_sk, skb->tstamp); set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); /* there may be more data... */ - len = skb->len; + len = skb->len; rqstp->rq_arg.len = len; + trace_svcsock_udp_recv(&svsk->sk_xprt, len); rqstp->rq_prot = IPPROTO_UDP; - if (!svc_udp_get_dest_address(rqstp, cmh)) { - net_warn_ratelimited("svc: received unknown control message %d/%d; dropping RPC reply datagram\n", - cmh->cmsg_level, cmh->cmsg_type); - goto out_free; - } + if (!svc_udp_get_dest_address(rqstp, cmh)) + goto out_cmsg_err; rqstp->rq_daddrlen = svc_addr_len(svc_daddr(rqstp)); if (skb_is_nonlinear(skb)) { /* we have to copy */ local_bh_disable(); - if (csum_partial_copy_to_xdr(&rqstp->rq_arg, skb)) { - local_bh_enable(); - /* checksum error */ - goto out_free; - } + if (csum_partial_copy_to_xdr(&rqstp->rq_arg, skb)) + goto out_bh_enable; local_bh_enable(); consume_skb(skb); } else { @@ -489,6 +519,20 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp) serv->sv_stats->netudpcnt++; return len; + +out_recv_err: + if (err != -EAGAIN) { + /* possibly an icmp error */ + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); + } + trace_svcsock_udp_recv_err(&svsk->sk_xprt, err); + return 0; +out_cmsg_err: + net_warn_ratelimited("svc: received unknown control message %d/%d; dropping RPC reply datagram\n", + cmh->cmsg_level, cmh->cmsg_type); + goto out_free; +out_bh_enable: + local_bh_enable(); out_free: kfree_skb(skb); return 0; @@ -498,6 +542,9 @@ out_free: * svc_udp_sendto - Send out a reply on a UDP socket * @rqstp: completed svc_rqst * + * xpt_mutex ensures @rqstp's whole message is written to the socket + * without interruption. + * * Returns the number of bytes sent, or a negative errno. 
*/ static int svc_udp_sendto(struct svc_rqst *rqstp) @@ -519,10 +566,15 @@ static int svc_udp_sendto(struct svc_rqst *rqstp) unsigned int uninitialized_var(sent); int err; - svc_release_udp_skb(rqstp); + svc_udp_release_rqst(rqstp); svc_set_cmsg_data(rqstp, cmh); + mutex_lock(&xprt->xpt_mutex); + + if (svc_xprt_is_dead(xprt)) + goto out_notconn; + err = xprt_sock_sendmsg(svsk->sk_sock, &msg, xdr, 0, 0, &sent); xdr_free_bvec(xdr); if (err == -ECONNREFUSED) { @@ -530,9 +582,16 @@ static int svc_udp_sendto(struct svc_rqst *rqstp) err = xprt_sock_sendmsg(svsk->sk_sock, &msg, xdr, 0, 0, &sent); xdr_free_bvec(xdr); } + trace_svcsock_udp_send(xprt, err); + + mutex_unlock(&xprt->xpt_mutex); if (err < 0) return err; return sent; + +out_notconn: + mutex_unlock(&xprt->xpt_mutex); + return -ENOTCONN; } static int svc_udp_has_wspace(struct svc_xprt *xprt) @@ -576,7 +635,7 @@ static const struct svc_xprt_ops svc_udp_ops = { .xpo_recvfrom = svc_udp_recvfrom, .xpo_sendto = svc_udp_sendto, .xpo_read_payload = svc_sock_read_payload, - .xpo_release_rqst = svc_release_udp_skb, + .xpo_release_rqst = svc_udp_release_rqst, .xpo_detach = svc_sock_detach, .xpo_free = svc_sock_free, .xpo_has_wspace = svc_udp_has_wspace, @@ -632,9 +691,6 @@ static void svc_tcp_listen_data_ready(struct sock *sk) { struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; - dprintk("svc: socket %p TCP (listen) state change %d\n", - sk, sk->sk_state); - if (svsk) { /* Refer to svc_setup_socket() for details. */ rmb(); @@ -655,8 +711,7 @@ static void svc_tcp_listen_data_ready(struct sock *sk) if (svsk) { set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); svc_xprt_enqueue(&svsk->sk_xprt); - } else - printk("svc: socket %p: no user data\n", sk); + } } } @@ -667,15 +722,11 @@ static void svc_tcp_state_change(struct sock *sk) { struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; - dprintk("svc: socket %p TCP (connected) state change %d (svsk %p)\n", - sk, sk->sk_state, sk->sk_user_data); - - if (!svsk) - printk("svc: socket %p: no user data\n", sk); - else { + if (svsk) { /* Refer to svc_setup_socket() for details. */ rmb(); svsk->sk_ostate(sk); + trace_svcsock_tcp_state(&svsk->sk_xprt, svsk->sk_sock); if (sk->sk_state != TCP_ESTABLISHED) { set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); svc_xprt_enqueue(&svsk->sk_xprt); @@ -696,9 +747,7 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt) struct socket *newsock; struct svc_sock *newsvsk; int err, slen; - RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); - dprintk("svc: tcp_accept %p sock %p\n", svsk, sock); if (!sock) return NULL; @@ -711,30 +760,18 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt) else if (err != -EAGAIN) net_warn_ratelimited("%s: accept failed (err %d)!\n", serv->sv_name, -err); + trace_svcsock_accept_err(xprt, serv->sv_name, err); return NULL; } set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); err = kernel_getpeername(newsock, sin); if (err < 0) { - net_warn_ratelimited("%s: peername failed (err %d)!\n", - serv->sv_name, -err); + trace_svcsock_getpeername_err(xprt, serv->sv_name, err); goto failed; /* aborted connection or whatever */ } slen = err; - /* Ideally, we would want to reject connections from unauthorized - * hosts here, but when we get encryption, the IP of the host won't - * tell us anything. For now just warn about unpriv connections. 
- */ - if (!svc_port_is_privileged(sin)) { - dprintk("%s: connect from unprivileged port: %s\n", - serv->sv_name, - __svc_print_addr(sin, buf, sizeof(buf))); - } - dprintk("%s: connect from %s\n", serv->sv_name, - __svc_print_addr(sin, buf, sizeof(buf))); - /* Reset the inherited callbacks before calling svc_setup_socket */ newsock->sk->sk_state_change = svsk->sk_ostate; newsock->sk->sk_data_ready = svsk->sk_odata; @@ -752,10 +789,8 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt) svc_xprt_set_remote(&newsvsk->sk_xprt, sin, slen); err = kernel_getsockname(newsock, sin); slen = err; - if (unlikely(err < 0)) { - dprintk("svc_tcp_accept: kernel_getsockname error %d\n", -err); + if (unlikely(err < 0)) slen = offsetof(struct sockaddr, sa_data); - } svc_xprt_set_local(&newsvsk->sk_xprt, sin, slen); if (sock_is_loopback(newsock->sk)) @@ -772,13 +807,14 @@ failed: return NULL; } -static unsigned int svc_tcp_restore_pages(struct svc_sock *svsk, struct svc_rqst *rqstp) +static size_t svc_tcp_restore_pages(struct svc_sock *svsk, + struct svc_rqst *rqstp) { - unsigned int i, len, npages; + size_t len = svsk->sk_datalen; + unsigned int i, npages; - if (svsk->sk_datalen == 0) + if (!len) return 0; - len = svsk->sk_datalen; npages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; for (i = 0; i < npages; i++) { if (rqstp->rq_pages[i] != NULL) @@ -827,47 +863,45 @@ out: } /* - * Receive fragment record header. - * If we haven't gotten the record length yet, get the next four bytes. + * Receive fragment record header into sk_marker. */ -static int svc_tcp_recv_record(struct svc_sock *svsk, struct svc_rqst *rqstp) +static ssize_t svc_tcp_read_marker(struct svc_sock *svsk, + struct svc_rqst *rqstp) { - struct svc_serv *serv = svsk->sk_xprt.xpt_server; - unsigned int want; - int len; + ssize_t want, len; + /* If we haven't gotten the record length yet, + * get the next four bytes. 
+ */ if (svsk->sk_tcplen < sizeof(rpc_fraghdr)) { + struct msghdr msg = { NULL }; struct kvec iov; want = sizeof(rpc_fraghdr) - svsk->sk_tcplen; - iov.iov_base = ((char *) &svsk->sk_reclen) + svsk->sk_tcplen; + iov.iov_base = ((char *)&svsk->sk_marker) + svsk->sk_tcplen; iov.iov_len = want; - len = svc_recvfrom(rqstp, &iov, 1, want, 0); + iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, want); + len = sock_recvmsg(svsk->sk_sock, &msg, MSG_DONTWAIT); if (len < 0) - goto error; + return len; svsk->sk_tcplen += len; - if (len < want) { - dprintk("svc: short recvfrom while reading record " - "length (%d of %d)\n", len, want); - return -EAGAIN; + /* call again to read the remaining bytes */ + goto err_short; } - - dprintk("svc: TCP record, %d bytes\n", svc_sock_reclen(svsk)); + trace_svcsock_marker(&svsk->sk_xprt, svsk->sk_marker); if (svc_sock_reclen(svsk) + svsk->sk_datalen > - serv->sv_max_mesg) { - net_notice_ratelimited("RPC: fragment too large: %d\n", - svc_sock_reclen(svsk)); - goto err_delete; - } + svsk->sk_xprt.xpt_server->sv_max_mesg) + goto err_too_large; } - return svc_sock_reclen(svsk); -error: - dprintk("RPC: TCP recv_record got %d\n", len); - return len; -err_delete: + +err_too_large: + net_notice_ratelimited("svc: %s %s RPC fragment too large: %d\n", + __func__, svsk->sk_xprt.xpt_server->sv_name, + svc_sock_reclen(svsk)); set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); +err_short: return -EAGAIN; } @@ -916,87 +950,58 @@ unlock_eagain: return -EAGAIN; } -static int copy_pages_to_kvecs(struct kvec *vec, struct page **pages, int len) -{ - int i = 0; - int t = 0; - - while (t < len) { - vec[i].iov_base = page_address(pages[i]); - vec[i].iov_len = PAGE_SIZE; - i++; - t += PAGE_SIZE; - } - return i; -} - static void svc_tcp_fragment_received(struct svc_sock *svsk) { /* If we have more data, signal svc_xprt_enqueue() to try again */ - dprintk("svc: TCP %s record (%d bytes)\n", - svc_sock_final_rec(svsk) ? "final" : "nonfinal", - svc_sock_reclen(svsk)); svsk->sk_tcplen = 0; - svsk->sk_reclen = 0; + svsk->sk_marker = xdr_zero; } -/* - * Receive data from a TCP socket. +/** + * svc_tcp_recvfrom - Receive data from a TCP socket + * @rqstp: request structure into which to receive an RPC Call + * + * Called in a loop when XPT_DATA has been set. + * + * Read the 4-byte stream record marker, then use the record length + * in that marker to set up exactly the resources needed to receive + * the next RPC message into @rqstp. + * + * Returns: + * On success, the number of bytes in a received RPC Call, or + * %0 if a complete RPC Call message was not ready to return + * + * The zero return case handles partial receives and callback Replies. + * The state of a partial receive is preserved in the svc_sock for + * the next call to svc_tcp_recvfrom. 
*/ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) { struct svc_sock *svsk = container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); struct svc_serv *serv = svsk->sk_xprt.xpt_server; - int len; - struct kvec *vec; - unsigned int want, base; + size_t want, base; + ssize_t len; __be32 *p; __be32 calldir; - int pnum; - - dprintk("svc: tcp_recv %p data %d conn %d close %d\n", - svsk, test_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags), - test_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags), - test_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags)); - len = svc_tcp_recv_record(svsk, rqstp); + clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); + len = svc_tcp_read_marker(svsk, rqstp); if (len < 0) goto error; base = svc_tcp_restore_pages(svsk, rqstp); - want = svc_sock_reclen(svsk) - (svsk->sk_tcplen - sizeof(rpc_fraghdr)); - - vec = rqstp->rq_vec; - - pnum = copy_pages_to_kvecs(&vec[0], &rqstp->rq_pages[0], base + want); - - rqstp->rq_respages = &rqstp->rq_pages[pnum]; - rqstp->rq_next_page = rqstp->rq_respages + 1; - - /* Now receive data */ - len = svc_recvfrom(rqstp, vec, pnum, base + want, base); + want = len - (svsk->sk_tcplen - sizeof(rpc_fraghdr)); + len = svc_tcp_read_msg(rqstp, base + want, base); if (len >= 0) { + trace_svcsock_tcp_recv(&svsk->sk_xprt, len); svsk->sk_tcplen += len; svsk->sk_datalen += len; } - if (len != want || !svc_sock_final_rec(svsk)) { - svc_tcp_save_pages(svsk, rqstp); - if (len < 0 && len != -EAGAIN) - goto err_delete; - if (len == want) - svc_tcp_fragment_received(svsk); - else - dprintk("svc: incomplete TCP record (%d of %d)\n", - (int)(svsk->sk_tcplen - sizeof(rpc_fraghdr)), - svc_sock_reclen(svsk)); - goto err_noclose; - } - - if (svsk->sk_datalen < 8) { - svsk->sk_datalen = 0; - goto err_delete; /* client is nuts. */ - } + if (len != want || !svc_sock_final_rec(svsk)) + goto err_incomplete; + if (svsk->sk_datalen < 8) + goto err_nuts; rqstp->rq_arg.len = svsk->sk_datalen; rqstp->rq_arg.page_base = 0; @@ -1031,14 +1036,26 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) return rqstp->rq_arg.len; +err_incomplete: + svc_tcp_save_pages(svsk, rqstp); + if (len < 0 && len != -EAGAIN) + goto err_delete; + if (len == want) + svc_tcp_fragment_received(svsk); + else + trace_svcsock_tcp_recv_short(&svsk->sk_xprt, + svc_sock_reclen(svsk), + svsk->sk_tcplen - sizeof(rpc_fraghdr)); + goto err_noclose; error: if (len != -EAGAIN) goto err_delete; - dprintk("RPC: TCP recvfrom got EAGAIN\n"); + trace_svcsock_tcp_recv_eagain(&svsk->sk_xprt, 0); return 0; +err_nuts: + svsk->sk_datalen = 0; err_delete: - printk(KERN_NOTICE "%s: recvfrom returned errno %d\n", - svsk->sk_xprt.xpt_server->sv_name, -len); + trace_svcsock_tcp_recv_err(&svsk->sk_xprt, len); set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); err_noclose: return 0; /* record not complete */ @@ -1048,6 +1065,9 @@ err_noclose: * svc_tcp_sendto - Send out a reply on a TCP socket * @rqstp: completed svc_rqst * + * xpt_mutex ensures @rqstp's whole message is written to the socket + * without interruption. + * * Returns the number of bytes sent, or a negative errno. */ static int svc_tcp_sendto(struct svc_rqst *rqstp) @@ -1063,14 +1083,22 @@ static int svc_tcp_sendto(struct svc_rqst *rqstp) unsigned int uninitialized_var(sent); int err; - svc_release_skb(rqstp); + svc_tcp_release_rqst(rqstp); + mutex_lock(&xprt->xpt_mutex); + if (svc_xprt_is_dead(xprt)) + goto out_notconn; err = xprt_sock_sendmsg(svsk->sk_sock, &msg, xdr, 0, marker, &sent); xdr_free_bvec(xdr); + trace_svcsock_tcp_send(xprt, err < 0 ? 
err : sent); if (err < 0 || sent != (xdr->len + sizeof(marker))) goto out_close; + mutex_unlock(&xprt->xpt_mutex); return sent; +out_notconn: + mutex_unlock(&xprt->xpt_mutex); + return -ENOTCONN; out_close: pr_notice("rpc-srv/tcp: %s: %s %d when sending %d bytes - shutting down socket\n", xprt->xpt_server->sv_name, @@ -1078,6 +1106,7 @@ out_close: (err < 0) ? err : sent, xdr->len); set_bit(XPT_CLOSE, &xprt->xpt_flags); svc_xprt_enqueue(xprt); + mutex_unlock(&xprt->xpt_mutex); return -EAGAIN; } @@ -1094,7 +1123,7 @@ static const struct svc_xprt_ops svc_tcp_ops = { .xpo_recvfrom = svc_tcp_recvfrom, .xpo_sendto = svc_tcp_sendto, .xpo_read_payload = svc_sock_read_payload, - .xpo_release_rqst = svc_release_skb, + .xpo_release_rqst = svc_tcp_release_rqst, .xpo_detach = svc_tcp_sock_detach, .xpo_free = svc_sock_free, .xpo_has_wspace = svc_tcp_has_wspace, @@ -1132,18 +1161,16 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv) set_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags); set_bit(XPT_CONG_CTRL, &svsk->sk_xprt.xpt_flags); if (sk->sk_state == TCP_LISTEN) { - dprintk("setting up TCP socket for listening\n"); strcpy(svsk->sk_xprt.xpt_remotebuf, "listener"); set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags); sk->sk_data_ready = svc_tcp_listen_data_ready; set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); } else { - dprintk("setting up TCP socket for reading\n"); sk->sk_state_change = svc_tcp_state_change; sk->sk_data_ready = svc_data_ready; sk->sk_write_space = svc_write_space; - svsk->sk_reclen = 0; + svsk->sk_marker = xdr_zero; svsk->sk_tcplen = 0; svsk->sk_datalen = 0; memset(&svsk->sk_pages[0], 0, sizeof(svsk->sk_pages)); @@ -1188,7 +1215,6 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, int pmap_register = !(flags & SVC_SOCK_ANONYMOUS); int err = 0; - dprintk("svc: svc_setup_socket %p\n", sock); svsk = kzalloc(sizeof(*svsk), GFP_KERNEL); if (!svsk) return ERR_PTR(-ENOMEM); @@ -1225,12 +1251,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, else svc_tcp_init(svsk, serv); - dprintk("svc: svc_setup_socket created %p (inet %p), " - "listen %d close %d\n", - svsk, svsk->sk_sk, - test_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags), - test_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags)); - + trace_svcsock_new_socket(sock); return svsk; } @@ -1322,11 +1343,6 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv, struct sockaddr *newsin = (struct sockaddr *)&addr; int newlen; int family; - RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); - - dprintk("svc: svc_create_socket(%s, %d, %s)\n", - serv->sv_program->pg_name, protocol, - __svc_print_addr(sin, buf, sizeof(buf))); if (protocol != IPPROTO_UDP && protocol != IPPROTO_TCP) { printk(KERN_WARNING "svc: only UDP and TCP " @@ -1383,7 +1399,6 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv, svc_xprt_set_local(&svsk->sk_xprt, newsin, newlen); return (struct svc_xprt *)svsk; bummer: - dprintk("svc: svc_create_socket error = %d\n", -error); sock_release(sock); return ERR_PTR(error); } @@ -1397,8 +1412,6 @@ static void svc_sock_detach(struct svc_xprt *xprt) struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); struct sock *sk = svsk->sk_sk; - dprintk("svc: svc_sock_detach(%p)\n", svsk); - /* put back the old socket callbacks */ lock_sock(sk); sk->sk_state_change = svsk->sk_ostate; @@ -1415,8 +1428,6 @@ static void svc_tcp_sock_detach(struct svc_xprt *xprt) { struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); - dprintk("svc: svc_tcp_sock_detach(%p)\n", 
svsk); - svc_sock_detach(xprt); if (!test_bit(XPT_LISTENER, &xprt->xpt_flags)) { @@ -1431,7 +1442,6 @@ static void svc_tcp_sock_detach(struct svc_xprt *xprt) static void svc_sock_free(struct svc_xprt *xprt) { struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); - dprintk("svc: svc_sock_free(%p)\n", svsk); if (svsk->sk_sock->file) sockfd_put(svsk->sk_sock); diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 493a30a296fc..d5cc5db9dbf3 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -663,6 +663,7 @@ static void xprt_autoclose(struct work_struct *work) container_of(work, struct rpc_xprt, task_cleanup); unsigned int pflags = memalloc_nofs_save(); + trace_xprt_disconnect_auto(xprt); clear_bit(XPRT_CLOSE_WAIT, &xprt->state); xprt->ops->close(xprt); xprt_release_write(xprt, NULL); @@ -677,7 +678,7 @@ static void xprt_autoclose(struct work_struct *work) */ void xprt_disconnect_done(struct rpc_xprt *xprt) { - dprintk("RPC: disconnected transport %p\n", xprt); + trace_xprt_disconnect_done(xprt); spin_lock(&xprt->transport_lock); xprt_clear_connected(xprt); xprt_clear_write_space_locked(xprt); @@ -694,6 +695,8 @@ EXPORT_SYMBOL_GPL(xprt_disconnect_done); */ void xprt_force_disconnect(struct rpc_xprt *xprt) { + trace_xprt_disconnect_force(xprt); + /* Don't race with the test_bit() in xprt_clear_locked() */ spin_lock(&xprt->transport_lock); set_bit(XPRT_CLOSE_WAIT, &xprt->state); @@ -832,8 +835,10 @@ void xprt_connect(struct rpc_task *task) if (!xprt_lock_write(xprt, task)) return; - if (test_and_clear_bit(XPRT_CLOSE_WAIT, &xprt->state)) + if (test_and_clear_bit(XPRT_CLOSE_WAIT, &xprt->state)) { + trace_xprt_disconnect_cleanup(xprt); xprt->ops->close(xprt); + } if (!xprt_connected(xprt)) { task->tk_rqstp->rq_connect_cookie = xprt->connect_cookie; @@ -1460,7 +1465,7 @@ xprt_request_transmit(struct rpc_rqst *req, struct rpc_task *snd_task) */ req->rq_ntrans++; - trace_xprt_sendto(&req->rq_snd_buf); + trace_rpc_xdr_sendto(task, &req->rq_snd_buf); connect_cookie = xprt->connect_cookie; status = xprt->ops->send_request(req); if (status != 0) { @@ -1903,11 +1908,8 @@ struct rpc_xprt *xprt_create_transport(struct xprt_create *args) found: xprt = t->setup(args); - if (IS_ERR(xprt)) { - dprintk("RPC: xprt_create_transport: failed, %ld\n", - -PTR_ERR(xprt)); + if (IS_ERR(xprt)) goto out; - } if (args->flags & XPRT_CREATE_NO_IDLE_TIMEOUT) xprt->idle_timeout = 0; INIT_WORK(&xprt->task_cleanup, xprt_autoclose); @@ -1928,8 +1930,7 @@ found: rpc_xprt_debugfs_register(xprt); - dprintk("RPC: created transport %p with %u slots\n", xprt, - xprt->max_reqs); + trace_xprt_create(xprt); out: return xprt; } @@ -1939,6 +1940,8 @@ static void xprt_destroy_cb(struct work_struct *work) struct rpc_xprt *xprt = container_of(work, struct rpc_xprt, task_cleanup); + trace_xprt_destroy(xprt); + rpc_xprt_debugfs_unregister(xprt); rpc_destroy_wait_queue(&xprt->binding); rpc_destroy_wait_queue(&xprt->pending); @@ -1963,8 +1966,6 @@ static void xprt_destroy_cb(struct work_struct *work) */ static void xprt_destroy(struct rpc_xprt *xprt) { - dprintk("RPC: destroying transport %p\n", xprt); - /* * Exclude transport connect/disconnect handlers and autoclose */ diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 3c627dc685cc..2081c8fbfa48 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -892,8 +892,8 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) * or privacy, direct data placement of individual data items * is not 
allowed. */ - ddp_allowed = !(rqst->rq_cred->cr_auth->au_flags & - RPCAUTH_AUTH_DATATOUCH); + ddp_allowed = !test_bit(RPCAUTH_AUTH_DATATOUCH, + &rqst->rq_cred->cr_auth->au_flags); /* * Chunks needed for results? diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index af7eb8d202ae..1ee73f7cf931 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -10,59 +10,34 @@ #include "xprt_rdma.h" #include <trace/events/rpcrdma.h> -#define RPCDBG_FACILITY RPCDBG_SVCXPRT - -#undef SVCRDMA_BACKCHANNEL_DEBUG - /** - * svc_rdma_handle_bc_reply - Process incoming backchannel reply - * @xprt: controlling backchannel transport - * @rdma_resp: pointer to incoming transport header - * @rcvbuf: XDR buffer into which to decode the reply + * svc_rdma_handle_bc_reply - Process incoming backchannel Reply + * @rqstp: resources for handling the Reply + * @rctxt: Received message * - * Returns: - * %0 if @rcvbuf is filled in, xprt_complete_rqst called, - * %-EAGAIN if server should call ->recvfrom again. */ -int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp, - struct xdr_buf *rcvbuf) +void svc_rdma_handle_bc_reply(struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *rctxt) { + struct svc_xprt *sxprt = rqstp->rq_xprt; + struct rpc_xprt *xprt = sxprt->xpt_bc_xprt; struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct xdr_buf *rcvbuf = &rqstp->rq_arg; struct kvec *dst, *src = &rcvbuf->head[0]; + __be32 *rdma_resp = rctxt->rc_recv_buf; struct rpc_rqst *req; u32 credits; - size_t len; - __be32 xid; - __be32 *p; - int ret; - - p = (__be32 *)src->iov_base; - len = src->iov_len; - xid = *rdma_resp; - -#ifdef SVCRDMA_BACKCHANNEL_DEBUG - pr_info("%s: xid=%08x, length=%zu\n", - __func__, be32_to_cpu(xid), len); - pr_info("%s: RPC/RDMA: %*ph\n", - __func__, (int)RPCRDMA_HDRLEN_MIN, rdma_resp); - pr_info("%s: RPC: %*ph\n", - __func__, (int)len, p); -#endif - - ret = -EAGAIN; - if (src->iov_len < 24) - goto out_shortreply; spin_lock(&xprt->queue_lock); - req = xprt_lookup_rqst(xprt, xid); + req = xprt_lookup_rqst(xprt, *rdma_resp); if (!req) - goto out_notfound; + goto out_unlock; dst = &req->rq_private_buf.head[0]; memcpy(&req->rq_private_buf, &req->rq_rcv_buf, sizeof(struct xdr_buf)); - if (dst->iov_len < len) + if (dst->iov_len < src->iov_len) goto out_unlock; - memcpy(dst->iov_base, p, len); + memcpy(dst->iov_base, src->iov_base, src->iov_len); xprt_pin_rqst(req); spin_unlock(&xprt->queue_lock); @@ -71,31 +46,17 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp, credits = 1; /* don't deadlock */ else if (credits > r_xprt->rx_buf.rb_bc_max_requests) credits = r_xprt->rx_buf.rb_bc_max_requests; - spin_lock(&xprt->transport_lock); xprt->cwnd = credits << RPC_CWNDSHIFT; spin_unlock(&xprt->transport_lock); spin_lock(&xprt->queue_lock); - ret = 0; xprt_complete_rqst(req->rq_task, rcvbuf->len); xprt_unpin_rqst(req); rcvbuf->len = 0; out_unlock: spin_unlock(&xprt->queue_lock); -out: - return ret; - -out_shortreply: - dprintk("svcrdma: short bc reply: xprt=%p, len=%zu\n", - xprt, src->iov_len); - goto out; - -out_notfound: - dprintk("svcrdma: unrecognized bc reply: xprt=%p, xid=%08x\n", - xprt, be32_to_cpu(xid)); - goto out_unlock; } /* Send a backwards direction RPC call. 
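The rewritten svc_rdma_handle_bc_reply() above follows the stock reverse-direction completion idiom: match the incoming Reply to a waiting rpc_rqst by XID under xprt->queue_lock, pin the request while the lock is dropped for the copy, then complete it. Stripped of the RDMA credit update and the buffer-length check, the skeleton is roughly this (an orientation sketch, not a verbatim excerpt; the helper name is invented):

/*
 * Sketch of the XID-matching completion pattern used above.
 */
static void bc_complete_reply(struct rpc_xprt *xprt, __be32 xid,
			      struct xdr_buf *rcvbuf)
{
	struct rpc_rqst *req;

	spin_lock(&xprt->queue_lock);
	req = xprt_lookup_rqst(xprt, xid);	/* match Reply to Call by XID */
	if (!req)
		goto out_unlock;
	xprt_pin_rqst(req);	/* keep req stable while the lock is dropped */
	spin_unlock(&xprt->queue_lock);

	/* ... copy rcvbuf's head kvec into req->rq_private_buf here ... */

	spin_lock(&xprt->queue_lock);
	xprt_complete_rqst(req->rq_task, rcvbuf->len);
	xprt_unpin_rqst(req);
out_unlock:
	spin_unlock(&xprt->queue_lock);
}

Because the function now returns void, a lookup miss is simply dropped; the old -EAGAIN/short-reply plumbing back into the recvfrom path goes away, as the svc_rdma_recvfrom() hunk further down shows.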
@@ -192,10 +153,6 @@ rpcrdma_bc_send_request(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst) *p++ = xdr_zero; *p = xdr_zero; -#ifdef SVCRDMA_BACKCHANNEL_DEBUG - pr_info("%s: %*ph\n", __func__, 64, rqst->rq_buffer); -#endif - rqst->rq_xtime = ktime_get(); rc = svc_rdma_bc_sendto(rdma, rqst, ctxt); if (rc) @@ -206,45 +163,36 @@ put_ctxt: svc_rdma_send_ctxt_put(rdma, ctxt); drop_connection: - dprintk("svcrdma: failed to send bc call\n"); return -ENOTCONN; } -/* Send an RPC call on the passive end of a transport - * connection. +/** + * xprt_rdma_bc_send_request - Send a reverse-direction Call + * @rqst: rpc_rqst containing Call message to be sent + * + * Return values: + * %0 if the message was sent successfully + * %ENOTCONN if the message was not sent */ -static int -xprt_rdma_bc_send_request(struct rpc_rqst *rqst) +static int xprt_rdma_bc_send_request(struct rpc_rqst *rqst) { struct svc_xprt *sxprt = rqst->rq_xprt->bc_xprt; - struct svcxprt_rdma *rdma; + struct svcxprt_rdma *rdma = + container_of(sxprt, struct svcxprt_rdma, sc_xprt); int ret; - dprintk("svcrdma: sending bc call with xid: %08x\n", - be32_to_cpu(rqst->rq_xid)); + if (test_bit(XPT_DEAD, &sxprt->xpt_flags)) + return -ENOTCONN; - mutex_lock(&sxprt->xpt_mutex); - - ret = -ENOTCONN; - rdma = container_of(sxprt, struct svcxprt_rdma, sc_xprt); - if (!test_bit(XPT_DEAD, &sxprt->xpt_flags)) { - ret = rpcrdma_bc_send_request(rdma, rqst); - if (ret == -ENOTCONN) - svc_close_xprt(sxprt); - } - - mutex_unlock(&sxprt->xpt_mutex); - - if (ret < 0) - return ret; - return 0; + ret = rpcrdma_bc_send_request(rdma, rqst); + if (ret == -ENOTCONN) + svc_close_xprt(sxprt); + return ret; } static void xprt_rdma_bc_close(struct rpc_xprt *xprt) { - dprintk("svcrdma: %s: xprt %p\n", __func__, xprt); - xprt_disconnect_done(xprt); xprt->cwnd = RPC_CWNDSHIFT; } @@ -252,8 +200,6 @@ xprt_rdma_bc_close(struct rpc_xprt *xprt) static void xprt_rdma_bc_put(struct rpc_xprt *xprt) { - dprintk("svcrdma: %s: xprt %p\n", __func__, xprt); - xprt_rdma_free_addresses(xprt); xprt_free(xprt); } @@ -288,19 +234,14 @@ xprt_setup_rdma_bc(struct xprt_create *args) struct rpc_xprt *xprt; struct rpcrdma_xprt *new_xprt; - if (args->addrlen > sizeof(xprt->addr)) { - dprintk("RPC: %s: address too large\n", __func__); + if (args->addrlen > sizeof(xprt->addr)) return ERR_PTR(-EBADF); - } xprt = xprt_alloc(args->net, sizeof(*new_xprt), RPCRDMA_MAX_BC_REQUESTS, RPCRDMA_MAX_BC_REQUESTS); - if (!xprt) { - dprintk("RPC: %s: couldn't allocate rpc_xprt\n", - __func__); + if (!xprt) return ERR_PTR(-ENOMEM); - } xprt->timeout = &xprt_rdma_bc_timeout; xprt_set_bound(xprt); diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index efa5fcb5793f..e426fedb9524 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -665,23 +665,23 @@ static int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg, return hdr_len; out_short: - trace_svcrdma_decode_short(rq_arg->len); + trace_svcrdma_decode_short_err(rq_arg->len); return -EINVAL; out_version: - trace_svcrdma_decode_badvers(rdma_argp); + trace_svcrdma_decode_badvers_err(rdma_argp); return -EPROTONOSUPPORT; out_drop: - trace_svcrdma_decode_drop(rdma_argp); + trace_svcrdma_decode_drop_err(rdma_argp); return 0; out_proc: - trace_svcrdma_decode_badproc(rdma_argp); + trace_svcrdma_decode_badproc_err(rdma_argp); return -EINVAL; out_inval: - trace_svcrdma_decode_parse(rdma_argp); + trace_svcrdma_decode_parse_err(rdma_argp); return -EINVAL; } @@ -878,12 +878,9 @@ int 
svc_rdma_recvfrom(struct svc_rqst *rqstp) goto out_drop; rqstp->rq_xprt_hlen = ret; - if (svc_rdma_is_backchannel_reply(xprt, p)) { - ret = svc_rdma_handle_bc_reply(xprt->xpt_bc_xprt, p, - &rqstp->rq_arg); - svc_rdma_recv_ctxt_put(rdma_xprt, ctxt); - return ret; - } + if (svc_rdma_is_backchannel_reply(xprt, p)) + goto out_backchannel; + svc_rdma_get_inv_rkey(rdma_xprt, ctxt); p += rpcrdma_fixed_maxsz; @@ -913,6 +910,8 @@ out_postfail: svc_rdma_recv_ctxt_put(rdma_xprt, ctxt); return ret; +out_backchannel: + svc_rdma_handle_bc_reply(rqstp, ctxt); out_drop: svc_rdma_recv_ctxt_put(rdma_xprt, ctxt); return 0; diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index 23c2d3ce0dc9..5eb35309ecef 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -9,13 +9,10 @@ #include <linux/sunrpc/rpc_rdma.h> #include <linux/sunrpc/svc_rdma.h> -#include <linux/sunrpc/debug.h> #include "xprt_rdma.h" #include <trace/events/rpcrdma.h> -#define RPCDBG_FACILITY RPCDBG_SVCXPRT - static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc); static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc); @@ -39,7 +36,7 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc); struct svc_rdma_rw_ctxt { struct list_head rw_list; struct rdma_rw_ctx rw_ctx; - int rw_nents; + unsigned int rw_nents; struct sg_table rw_sg_table; struct scatterlist rw_first_sgl[]; }; @@ -67,19 +64,22 @@ svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges) ctxt = kmalloc(struct_size(ctxt, rw_first_sgl, SG_CHUNK_SIZE), GFP_KERNEL); if (!ctxt) - goto out; + goto out_noctx; INIT_LIST_HEAD(&ctxt->rw_list); } ctxt->rw_sg_table.sgl = ctxt->rw_first_sgl; if (sg_alloc_table_chained(&ctxt->rw_sg_table, sges, ctxt->rw_sg_table.sgl, - SG_CHUNK_SIZE)) { - kfree(ctxt); - ctxt = NULL; - } -out: + SG_CHUNK_SIZE)) + goto out_free; return ctxt; + +out_free: + kfree(ctxt); +out_noctx: + trace_svcrdma_no_rwctx_err(rdma, sges); + return NULL; } static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma, @@ -107,6 +107,34 @@ void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma) } } +/** + * svc_rdma_rw_ctx_init - Prepare an R/W context for I/O + * @rdma: controlling transport instance + * @ctxt: R/W context to prepare + * @offset: RDMA offset + * @handle: RDMA tag/handle + * @direction: I/O direction + * + * On success, returns the number of WQEs that will be needed + * on the workqueue, or a negative errno. + */ +static int svc_rdma_rw_ctx_init(struct svcxprt_rdma *rdma, + struct svc_rdma_rw_ctxt *ctxt, + u64 offset, u32 handle, + enum dma_data_direction direction) +{ + int ret; + + ret = rdma_rw_ctx_init(&ctxt->rw_ctx, rdma->sc_qp, rdma->sc_port_num, + ctxt->rw_sg_table.sgl, ctxt->rw_nents, + 0, offset, handle, direction); + if (unlikely(ret < 0)) { + svc_rdma_put_rw_ctxt(rdma, ctxt); + trace_svcrdma_dma_map_rw_err(rdma, ctxt->rw_nents, ret); + } + return ret; +} + /* A chunk context tracks all I/O for moving one Read or Write * chunk. This is a set of rdma_rw's that handle data movement * for all segments of one chunk.
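The new svc_rdma_rw_ctx_init() wrapper centralizes the rdma_rw_ctx_init() call together with its failure handling (releasing the context and firing trace_svcrdma_dma_map_rw_err()), so every caller collapses to the same shape. Condensed from the read-segment hunk below; the function name here is invented for illustration:

/*
 * Sketch: caller-side shape after the refactor. On failure the
 * wrapper has already released @ctxt and emitted a trace event.
 */
static int sketch_map_read_segment(struct svc_rdma_chunk_ctxt *cc,
				   struct svc_rdma_rw_ctxt *ctxt,
				   u64 offset, u32 rkey)
{
	int ret;

	ret = svc_rdma_rw_ctx_init(cc->cc_rdma, ctxt, offset, rkey,
				   DMA_FROM_DEVICE);
	if (ret < 0)
		return -EIO;

	list_add(&ctxt->rw_list, &cc->cc_rwctxts);
	cc->cc_sqecount += ret;	/* WQEs this context will post */
	return 0;
}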
@@ -428,15 +456,13 @@ svc_rdma_build_writes(struct svc_rdma_write_info *info, ctxt = svc_rdma_get_rw_ctxt(rdma, (write_len >> PAGE_SHIFT) + 2); if (!ctxt) - goto out_noctx; + return -ENOMEM; constructor(info, write_len, ctxt); - ret = rdma_rw_ctx_init(&ctxt->rw_ctx, rdma->sc_qp, - rdma->sc_port_num, ctxt->rw_sg_table.sgl, - ctxt->rw_nents, 0, seg_offset, - seg_handle, DMA_TO_DEVICE); + ret = svc_rdma_rw_ctx_init(rdma, ctxt, seg_offset, seg_handle, + DMA_TO_DEVICE); if (ret < 0) - goto out_initerr; + return -EIO; trace_svcrdma_send_wseg(seg_handle, write_len, seg_offset); @@ -455,18 +481,9 @@ svc_rdma_build_writes(struct svc_rdma_write_info *info, return 0; out_overflow: - dprintk("svcrdma: inadequate space in Write chunk (%u)\n", - info->wi_nsegs); + trace_svcrdma_small_wrch_err(rdma, remaining, info->wi_seg_no, + info->wi_nsegs); return -E2BIG; - -out_noctx: - dprintk("svcrdma: no R/W ctxs available\n"); - return -ENOMEM; - -out_initerr: - svc_rdma_put_rw_ctxt(rdma, ctxt); - trace_svcrdma_dma_map_rwctx(rdma, ret); - return -EIO; } /* Send one of an xdr_buf's kvecs by itself. To send a Reply @@ -616,7 +633,7 @@ static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info, sge_no = PAGE_ALIGN(info->ri_pageoff + len) >> PAGE_SHIFT; ctxt = svc_rdma_get_rw_ctxt(cc->cc_rdma, sge_no); if (!ctxt) - goto out_noctx; + return -ENOMEM; ctxt->rw_nents = sge_no; sg = ctxt->rw_sg_table.sgl; @@ -646,29 +663,18 @@ static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info, goto out_overrun; } - ret = rdma_rw_ctx_init(&ctxt->rw_ctx, cc->cc_rdma->sc_qp, - cc->cc_rdma->sc_port_num, - ctxt->rw_sg_table.sgl, ctxt->rw_nents, - 0, offset, rkey, DMA_FROM_DEVICE); + ret = svc_rdma_rw_ctx_init(cc->cc_rdma, ctxt, offset, rkey, + DMA_FROM_DEVICE); if (ret < 0) - goto out_initerr; + return -EIO; list_add(&ctxt->rw_list, &cc->cc_rwctxts); cc->cc_sqecount += ret; return 0; -out_noctx: - dprintk("svcrdma: no R/W ctxs available\n"); - return -ENOMEM; - out_overrun: - dprintk("svcrdma: request overruns rq_pages\n"); + trace_svcrdma_page_overrun_err(cc->cc_rdma, rqstp, info->ri_pageno); return -EINVAL; - -out_initerr: - trace_svcrdma_dma_map_rwctx(cc->cc_rdma, ret); - svc_rdma_put_rw_ctxt(cc->cc_rdma, ctxt); - return -EIO; } /* Walk the segments in the Read chunk starting at @p and construct diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index b6c8643867f2..38e7c3c8c4a9 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -868,12 +868,10 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) __be32 *p; int ret; - /* Create the RDMA response header. xprt->xpt_mutex, - * acquired in svc_send(), serializes RPC replies. The - * code path below that inserts the credit grant value - * into each transport header runs only inside this - * critical section. 
- */
+	ret = -ENOTCONN;
+	if (svc_xprt_is_dead(xprt))
+		goto err0;
+
 	ret = -ENOMEM;
 	sctxt = svc_rdma_send_ctxt_get(rdma);
 	if (!sctxt)
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index ea54785db4f8..d38be57b00ed 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -211,7 +211,12 @@ static void handle_connect_req(struct rdma_cm_id *new_cma_id,
 	newxprt->sc_ord = param->initiator_depth;
 
 	sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr;
-	svc_xprt_set_remote(&newxprt->sc_xprt, sa, svc_addr_len(sa));
+	newxprt->sc_xprt.xpt_remotelen = svc_addr_len(sa);
+	memcpy(&newxprt->sc_xprt.xpt_remote, sa,
+	       newxprt->sc_xprt.xpt_remotelen);
+	snprintf(newxprt->sc_xprt.xpt_remotebuf,
+		 sizeof(newxprt->sc_xprt.xpt_remotebuf) - 1, "%pISc", sa);
+
 	/* The remote port is arbitrary and not under the control of the
 	 * client ULP. Set it to a fixed value so that the DRC continues
 	 * to be effective after a reconnect.
@@ -309,11 +314,8 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
 	struct svcxprt_rdma *cma_xprt;
 	int ret;
 
-	dprintk("svcrdma: Creating RDMA listener\n");
-	if ((sa->sa_family != AF_INET) && (sa->sa_family != AF_INET6)) {
-		dprintk("svcrdma: Address family %d is not supported.\n", sa->sa_family);
+	if (sa->sa_family != AF_INET && sa->sa_family != AF_INET6)
 		return ERR_PTR(-EAFNOSUPPORT);
-	}
 	cma_xprt = svc_rdma_create_xprt(serv, net);
 	if (!cma_xprt)
 		return ERR_PTR(-ENOMEM);
@@ -324,7 +326,6 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
 				   RDMA_PS_TCP, IB_QPT_RC);
 	if (IS_ERR(listen_id)) {
 		ret = PTR_ERR(listen_id);
-		dprintk("svcrdma: rdma_create_id failed = %d\n", ret);
 		goto err0;
 	}
 
@@ -333,23 +334,17 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
 	 */
 #if IS_ENABLED(CONFIG_IPV6)
 	ret = rdma_set_afonly(listen_id, 1);
-	if (ret) {
-		dprintk("svcrdma: rdma_set_afonly failed = %d\n", ret);
+	if (ret)
 		goto err1;
-	}
 #endif
 	ret = rdma_bind_addr(listen_id, sa);
-	if (ret) {
-		dprintk("svcrdma: rdma_bind_addr failed = %d\n", ret);
+	if (ret)
 		goto err1;
-	}
 	cma_xprt->sc_cm_id = listen_id;
 
 	ret = rdma_listen(listen_id, RPCRDMA_LISTEN_BACKLOG);
-	if (ret) {
-		dprintk("svcrdma: rdma_listen failed = %d\n", ret);
+	if (ret)
 		goto err1;
-	}
 
 	/*
 	 * We need to use the address from the cm_id in case the
@@ -405,9 +400,6 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
 	if (!newxprt)
 		return NULL;
 
-	dprintk("svcrdma: newxprt from accept queue = %p, cm_id=%p\n",
-		newxprt, newxprt->sc_cm_id);
-
 	dev = newxprt->sc_cm_id->device;
 	newxprt->sc_port_num = newxprt->sc_cm_id->port_num;
@@ -443,21 +435,17 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
 
 	newxprt->sc_pd = ib_alloc_pd(dev, 0);
 	if (IS_ERR(newxprt->sc_pd)) {
-		dprintk("svcrdma: error creating PD for connect request\n");
+		trace_svcrdma_pd_err(newxprt, PTR_ERR(newxprt->sc_pd));
 		goto errout;
 	}
 	newxprt->sc_sq_cq = ib_alloc_cq_any(dev, newxprt, newxprt->sc_sq_depth,
 					    IB_POLL_WORKQUEUE);
-	if (IS_ERR(newxprt->sc_sq_cq)) {
-		dprintk("svcrdma: error creating SQ CQ for connect request\n");
+	if (IS_ERR(newxprt->sc_sq_cq))
 		goto errout;
-	}
 	newxprt->sc_rq_cq = ib_alloc_cq_any(dev, newxprt, rq_depth,
 					    IB_POLL_WORKQUEUE);
-	if (IS_ERR(newxprt->sc_rq_cq)) {
-		dprintk("svcrdma: error creating RQ CQ for connect request\n");
+	if (IS_ERR(newxprt->sc_rq_cq))
 		goto errout;
-	}
 
 	memset(&qp_attr, 0, sizeof qp_attr);
 	qp_attr.event_handler = qp_event_handler;
@@ -481,7 +469,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
 
 	ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd, &qp_attr);
 	if (ret) {
-		dprintk("svcrdma: failed to create QP, ret=%d\n", ret);
+		trace_svcrdma_qp_err(newxprt, ret);
 		goto errout;
 	}
 	newxprt->sc_qp = newxprt->sc_cm_id->qp;
@@ -489,8 +477,10 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
 	if (!(dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS))
 		newxprt->sc_snd_w_inv = false;
 	if (!rdma_protocol_iwarp(dev, newxprt->sc_port_num) &&
-	    !rdma_ib_or_roce(dev, newxprt->sc_port_num))
+	    !rdma_ib_or_roce(dev, newxprt->sc_port_num)) {
+		trace_svcrdma_fabric_err(newxprt, -EINVAL);
 		goto errout;
+	}
 
 	if (!svc_rdma_post_recvs(newxprt))
 		goto errout;
@@ -512,15 +502,17 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
 	conn_param.initiator_depth = min_t(int, newxprt->sc_ord,
 					   dev->attrs.max_qp_init_rd_atom);
 	if (!conn_param.initiator_depth) {
-		dprintk("svcrdma: invalid ORD setting\n");
 		ret = -EINVAL;
+		trace_svcrdma_initdepth_err(newxprt, ret);
 		goto errout;
 	}
 	conn_param.private_data = &pmsg;
 	conn_param.private_data_len = sizeof(pmsg);
 	ret = rdma_accept(newxprt->sc_cm_id, &conn_param);
-	if (ret)
+	if (ret) {
+		trace_svcrdma_accept_err(newxprt, ret);
 		goto errout;
+	}
 
 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 	dprintk("svcrdma: new connection %p accepted:\n", newxprt);
@@ -535,12 +527,9 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
 	dprintk("    ord             : %d\n", conn_param.initiator_depth);
 #endif
 
-	trace_svcrdma_xprt_accept(&newxprt->sc_xprt);
 	return &newxprt->sc_xprt;
 
  errout:
-	dprintk("svcrdma: failure accepting new connection rc=%d.\n", ret);
-	trace_svcrdma_xprt_fail(&newxprt->sc_xprt);
 	/* Take a reference in case the DTO handler runs */
 	svc_xprt_get(&newxprt->sc_xprt);
 	if (newxprt->sc_qp && !IS_ERR(newxprt->sc_qp))
@@ -578,8 +567,6 @@ static void __svc_rdma_free(struct work_struct *work)
 		container_of(work, struct svcxprt_rdma, sc_work);
 	struct svc_xprt *xprt = &rdma->sc_xprt;
 
-	trace_svcrdma_xprt_free(xprt);
-
 	if (rdma->sc_qp && !IS_ERR(rdma->sc_qp))
 		ib_drain_qp(rdma->sc_qp);
 
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 659da37020a4..0c4af7f5e241 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -68,7 +68,7 @@
  * tunables
  */
 
-unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE;
+static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE;
 unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE;
 unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE;
 unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRWR;
@@ -281,8 +281,6 @@ xprt_rdma_destroy(struct rpc_xprt *xprt)
 {
 	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
 
-	trace_xprtrdma_op_destroy(r_xprt);
-
 	cancel_delayed_work_sync(&r_xprt->rx_connect_worker);
 
 	rpcrdma_xprt_disconnect(r_xprt);
@@ -365,10 +363,6 @@ xprt_setup_rdma(struct xprt_create *args)
 
 	xprt->max_payload = RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT;
 
-	dprintk("RPC:       %s: %s:%s\n", __func__,
-		xprt->address_strings[RPC_DISPLAY_ADDR],
-		xprt->address_strings[RPC_DISPLAY_PORT]);
-
 	trace_xprtrdma_create(new_xprt);
 	return xprt;
 }
@@ -385,8 +379,6 @@ void xprt_rdma_close(struct rpc_xprt *xprt)
 {
 	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
 
-	trace_xprtrdma_op_close(r_xprt);
-
 	rpcrdma_xprt_disconnect(r_xprt);
 
 	xprt->reestablish_timeout = 0;
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 05c4d3a9cda2..2ae348377806 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -141,7 +141,6 @@ void rpcrdma_flush_disconnect(struct ib_cq *cq, struct ib_wc *wc)
 	if (wc->status != IB_WC_SUCCESS &&
 	    r_xprt->rx_ep->re_connect_status == 1) {
 		r_xprt->rx_ep->re_connect_status = -ECONNABORTED;
-		trace_xprtrdma_flush_dct(r_xprt, wc->status);
 		xprt_force_disconnect(xprt);
 	}
 }
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 3a143e250b9a..914508ea9b84 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -2528,8 +2528,16 @@ static int bc_sendto(struct rpc_rqst *req)
 	return sent;
 }
 
-/*
- * The send routine. Borrows from svc_send
+/**
+ * bc_send_request - Send a backchannel Call on a TCP socket
+ * @req: rpc_rqst containing Call message to be sent
+ *
+ * xpt_mutex ensures @rqstp's whole message is written to the socket
+ * without interruption.
+ *
+ * Return values:
+ *   %0 if the message was sent successfully
+ *   %ENOTCONN if the message was not sent
  */
 static int bc_send_request(struct rpc_rqst *req)
 {
diff --git a/net/switchdev/Kconfig b/net/switchdev/Kconfig
index 50f21a657007..18a2d980e11d 100644
--- a/net/switchdev/Kconfig
+++ b/net/switchdev/Kconfig
@@ -6,7 +6,7 @@
 config NET_SWITCHDEV
 	bool "Switch (and switch-ish) device support"
 	depends on INET
-	---help---
+	help
 	  This module provides glue between core networking code and device
 	  drivers in order to support hardware switch chips in very generic
 	  meaning of the word "switch". This include devices supporting L2/L3 but
diff --git a/net/tipc/Kconfig b/net/tipc/Kconfig
index 716b61a701a8..9dd780215eef 100644
--- a/net/tipc/Kconfig
+++ b/net/tipc/Kconfig
@@ -6,7 +6,7 @@
 menuconfig TIPC
 	tristate "The TIPC Protocol"
 	depends on INET
-	---help---
+	help
 	  The Transparent Inter Process Communication (TIPC) protocol is
 	  specially designed for intra cluster communication. This protocol
 	  originates from Ericsson where it has been used in carrier grade
@@ -55,6 +55,6 @@ config TIPC_DIAG
 	tristate "TIPC: socket monitoring interface"
 	depends on TIPC
 	default y
-	---help---
+	help
 	  Support for TIPC socket monitoring interface used by ss tool.
 	  If unsure, say Y.
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index 34ca7b789eba..e366ec9a7e4d 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -316,7 +316,6 @@ static int tipc_enable_bearer(struct net *net, const char *name,
 	b->domain = disc_domain;
 	b->net_plane = bearer_id + 'A';
 	b->priority = prio;
-	test_and_set_bit_lock(0, &b->up);
 	refcount_set(&b->refcnt, 1);
 
 	res = tipc_disc_create(net, b, &b->bcast_addr, &skb);
@@ -326,6 +325,7 @@ static int tipc_enable_bearer(struct net *net, const char *name,
 		goto rejected;
 	}
 
+	test_and_set_bit_lock(0, &b->up);
 	rcu_assign_pointer(tn->bearer_list[bearer_id], b);
 	if (skb)
 		tipc_bearer_xmit_skb(net, bearer_id, skb, &b->bcast_addr);
diff --git a/net/tipc/msg.c b/net/tipc/msg.c
index c0afcd627c5e..01b64869a173 100644
--- a/net/tipc/msg.c
+++ b/net/tipc/msg.c
@@ -221,7 +221,7 @@ int tipc_msg_append(struct tipc_msg *_hdr, struct msghdr *m, int dlen,
 	accounted = skb ? msg_blocks(buf_msg(skb)) : 0;
 	total = accounted;
 
-	while (rem) {
+	do {
 		if (!skb || skb->len >= mss) {
 			skb = tipc_buf_acquire(mss, GFP_KERNEL);
 			if (unlikely(!skb))
@@ -238,14 +238,14 @@ int tipc_msg_append(struct tipc_msg *_hdr, struct msghdr *m, int dlen,
 		hdr = buf_msg(skb);
 		curr = msg_blocks(hdr);
 		mlen = msg_size(hdr);
-		cpy = min_t(int, rem, mss - mlen);
+		cpy = min_t(size_t, rem, mss - mlen);
 		if (cpy != copy_from_iter(skb->data + mlen, cpy, &m->msg_iter))
 			return -EFAULT;
 		msg_set_size(hdr, mlen + cpy);
 		skb_put(skb, cpy);
 		rem -= cpy;
 		total += msg_blocks(hdr) - curr;
-	}
+	} while (rem > 0);
 	return total - accounted;
 }
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 26123f4177fd..a94f38333698 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -1574,7 +1574,8 @@ static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dlen)
 			break;
 		send = min_t(size_t, dlen - sent, TIPC_MAX_USER_MSG_SIZE);
 		blocks = tsk->snd_backlog;
-		if (tsk->oneway++ >= tsk->nagle_start && send <= maxnagle) {
+		if (tsk->oneway++ >= tsk->nagle_start && maxnagle &&
+		    send <= maxnagle) {
 			rc = tipc_msg_append(hdr, m, send, maxnagle, txq);
 			if (unlikely(rc < 0))
 				break;
diff --git a/net/tls/Kconfig b/net/tls/Kconfig
index 61ec78521a60..fa0724fd84b4 100644
--- a/net/tls/Kconfig
+++ b/net/tls/Kconfig
@@ -11,7 +11,7 @@ config TLS
 	select STREAM_PARSER
 	select NET_SOCK_MSG
 	default n
-	---help---
+	help
 	  Enable kernel support for TLS protocol. This allows symmetric
 	  encryption handling of the TLS protocol to be done in-kernel.
 
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index 0e989005bdc2..ec10041c6b7d 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -629,7 +629,7 @@ struct tls_context *tls_ctx_create(struct sock *sk)
 static void tls_build_proto(struct sock *sk)
 {
 	int ip_ver = sk->sk_family == AF_INET6 ? TLSV6 : TLSV4;
-	const struct proto *prot = READ_ONCE(sk->sk_prot);
+	struct proto *prot = READ_ONCE(sk->sk_prot);
 
 	/* Build IPv6 TLS whenever the address of tcpv6 _prot changes */
 	if (ip_ver == TLSV6 &&
diff --git a/net/unix/Kconfig b/net/unix/Kconfig
index a23a5cca9753..b6c4282899ec 100644
--- a/net/unix/Kconfig
+++ b/net/unix/Kconfig
@@ -5,7 +5,7 @@
 
 config UNIX
 	tristate "Unix domain sockets"
-	---help---
+	help
 	  If you say Y here, you will include support for Unix domain sockets;
 	  sockets are the standard Unix mechanism for establishing and
 	  accessing network connections. Many commonly used programs such as
@@ -29,6 +29,6 @@ config UNIX_DIAG
 	tristate "UNIX: socket monitoring interface"
 	depends on UNIX
 	default n
-	---help---
+	help
 	  Support for UNIX socket monitoring interface used by the ss tool.
 	  If unsure, say Y.
diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c
index 4b8b1150a738..8b65323207db 100644
--- a/net/vmw_vsock/vmci_transport.c
+++ b/net/vmw_vsock/vmci_transport.c
@@ -2055,7 +2055,7 @@ static bool vmci_check_transport(struct vsock_sock *vsk)
 	return vsk->transport == &vmci_transport;
 }
 
-void vmci_vsock_transport_cb(bool is_host)
+static void vmci_vsock_transport_cb(bool is_host)
 {
 	int features;
 
diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig
index 813e93644ae7..faf74850a1b5 100644
--- a/net/wireless/Kconfig
+++ b/net/wireless/Kconfig
@@ -25,13 +25,13 @@ config CFG80211
 	# using a different algorithm, though right now they shouldn't
 	# (this is here rather than below to allow it to be a module)
 	select CRYPTO_SHA256 if CFG80211_USE_KERNEL_REGDB_KEYS
-	---help---
+	help
 	  cfg80211 is the Linux wireless LAN (802.11) configuration API.
 	  Enable this if you have a wireless device.
 
 	  For more information refer to documentation on the wireless wiki:
 
-	  http://wireless.kernel.org/en/developers/Documentation/cfg80211
+	  https://wireless.wiki.kernel.org/en/developers/Documentation/cfg80211
 
 	  When built as a module it will be called cfg80211.
@@ -71,7 +71,7 @@ config CFG80211_CERTIFICATION_ONUS
 	bool "cfg80211 certification onus"
 	depends on EXPERT
 	default n
-	---help---
+	help
 	  You should disable this option unless you are both capable
 	  and willing to ensure your system will remain regulatory
 	  compliant with the features available under this option.
@@ -124,7 +124,7 @@ config CFG80211_EXTRA_REGDB_KEYDIR
 config CFG80211_REG_CELLULAR_HINTS
 	bool "cfg80211 regulatory support for cellular base station hints"
 	depends on CFG80211_CERTIFICATION_ONUS
-	---help---
+	help
 	  This option enables support for parsing regulatory hints
 	  from cellular base stations. If enabled and at least one driver
 	  claims support for parsing cellular base station hints the
@@ -137,7 +137,7 @@ config CFG80211_REG_CELLULAR_HINTS
 config CFG80211_REG_RELAX_NO_IR
 	bool "cfg80211 support for NO_IR relaxation"
 	depends on CFG80211_CERTIFICATION_ONUS
-	---help---
+	help
 	  This option enables support for relaxation of the NO_IR flag for
 	  situations that certain regulatory bodies have provided clarifications
 	  on how relaxation can occur. This feature has an inherent dependency on
@@ -171,7 +171,7 @@ config CFG80211_DEFAULT_PS
 config CFG80211_DEBUGFS
 	bool "cfg80211 DebugFS entries"
 	depends on DEBUG_FS
-	---help---
+	help
 	  You can enable this if you want debugfs entries for cfg80211.
 
 	  If unsure, say N.
@@ -228,7 +228,7 @@ config LIB80211_DEBUG
 	bool "lib80211 debugging messages"
 	depends on LIB80211
 	default n
-	---help---
+	help
 	  You can enable this if you want verbose debugging messages
 	  from lib80211.
 
diff --git a/net/wireless/core.c b/net/wireless/core.c
index f0226ae9561c..c623d9bf5096 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -497,6 +497,8 @@ use_default_name:
 	INIT_WORK(&rdev->propagate_radar_detect_wk,
 		  cfg80211_propagate_radar_detect_wk);
 	INIT_WORK(&rdev->propagate_cac_done_wk, cfg80211_propagate_cac_done_wk);
+	INIT_WORK(&rdev->mgmt_registrations_update_wk,
+		  cfg80211_mgmt_registrations_update_wk);
 
 #ifdef CONFIG_CFG80211_DEFAULT_PS
 	rdev->wiphy.flags |= WIPHY_FLAG_PS_ON_BY_DEFAULT;
@@ -1047,6 +1049,7 @@ void wiphy_unregister(struct wiphy *wiphy)
 	flush_work(&rdev->sched_scan_stop_wk);
 	flush_work(&rdev->propagate_radar_detect_wk);
 	flush_work(&rdev->propagate_cac_done_wk);
+	flush_work(&rdev->mgmt_registrations_update_wk);
 
 #ifdef CONFIG_PM
 	if (rdev->wiphy.wowlan_config && rdev->ops->set_wakeup)
@@ -1108,7 +1111,6 @@ static void __cfg80211_unregister_wdev(struct wireless_dev *wdev, bool sync)
 	rdev->devlist_generation++;
 
 	cfg80211_mlme_purge_registrations(wdev);
-	flush_work(&wdev->mgmt_registrations_update_wk);
 
 	switch (wdev->iftype) {
 	case NL80211_IFTYPE_P2P_DEVICE:
@@ -1253,8 +1255,6 @@ void cfg80211_init_wdev(struct cfg80211_registered_device *rdev,
 	spin_lock_init(&wdev->event_lock);
 	INIT_LIST_HEAD(&wdev->mgmt_registrations);
 	spin_lock_init(&wdev->mgmt_registrations_lock);
-	INIT_WORK(&wdev->mgmt_registrations_update_wk,
-		  cfg80211_mgmt_registrations_update_wk);
 	INIT_LIST_HEAD(&wdev->pmsr_list);
 	spin_lock_init(&wdev->pmsr_lock);
 	INIT_WORK(&wdev->pmsr_free_wk, cfg80211_pmsr_free_wk);
diff --git a/net/wireless/core.h b/net/wireless/core.h
index e0e5b3ee9699..67b0389fca4d 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -99,6 +99,8 @@ struct cfg80211_registered_device {
 	struct cfg80211_chan_def cac_done_chandef;
 	struct work_struct propagate_cac_done_wk;
 
+	struct work_struct mgmt_registrations_update_wk;
+
 	/* must be last because of the way we do wiphy_priv(),
 	 * and it should at least be aligned to NETDEV_ALIGN */
 	struct wiphy wiphy __aligned(NETDEV_ALIGN);
diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c
index 189334314cba..a6c61a2e6569 100644
--- a/net/wireless/mlme.c
+++ b/net/wireless/mlme.c
@@ -440,9 +440,15 @@ static void cfg80211_mgmt_registrations_update(struct wireless_dev *wdev)
 
 	ASSERT_RTNL();
 
+	spin_lock_bh(&wdev->mgmt_registrations_lock);
+	if (!wdev->mgmt_registrations_need_update) {
+		spin_unlock_bh(&wdev->mgmt_registrations_lock);
+		return;
+	}
+
 	rcu_read_lock();
 	list_for_each_entry_rcu(tmp, &rdev->wiphy.wdev_list, list) {
-		list_for_each_entry_rcu(reg, &tmp->mgmt_registrations, list) {
+		list_for_each_entry(reg, &tmp->mgmt_registrations, list) {
 			u32 mask = BIT(le16_to_cpu(reg->frame_type) >> 4);
 			u32 mcast_mask = 0;
 
@@ -460,16 +466,23 @@ static void cfg80211_mgmt_registrations_update(struct wireless_dev *wdev)
 	}
 	rcu_read_unlock();
 
+	wdev->mgmt_registrations_need_update = 0;
+	spin_unlock_bh(&wdev->mgmt_registrations_lock);
+
 	rdev_update_mgmt_frame_registrations(rdev, wdev, &upd);
 }
 
 void cfg80211_mgmt_registrations_update_wk(struct work_struct *wk)
 {
-	struct wireless_dev *wdev = container_of(wk, struct wireless_dev,
-						 mgmt_registrations_update_wk);
+	struct cfg80211_registered_device *rdev;
+	struct wireless_dev *wdev;
+
+	rdev = container_of(wk, struct cfg80211_registered_device,
+			    mgmt_registrations_update_wk);
 
 	rtnl_lock();
-	cfg80211_mgmt_registrations_update(wdev);
+	list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list)
+		cfg80211_mgmt_registrations_update(wdev);
 	rtnl_unlock();
 }
 
@@ -557,6 +570,7 @@ int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid,
 		nreg->multicast_rx = multicast_rx;
 		list_add(&nreg->list, &wdev->mgmt_registrations);
 	}
+	wdev->mgmt_registrations_need_update = 1;
 	spin_unlock_bh(&wdev->mgmt_registrations_lock);
 
 	cfg80211_mgmt_registrations_update(wdev);
@@ -585,7 +599,8 @@ void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlportid)
 		list_del(&reg->list);
 		kfree(reg);
 
-		schedule_work(&wdev->mgmt_registrations_update_wk);
+		wdev->mgmt_registrations_need_update = 1;
+		schedule_work(&rdev->mgmt_registrations_update_wk);
 	}
 	spin_unlock_bh(&wdev->mgmt_registrations_lock);
 
@@ -608,6 +623,7 @@ void cfg80211_mlme_purge_registrations(struct wireless_dev *wdev)
 		list_del(&reg->list);
 		kfree(reg);
 	}
+	wdev->mgmt_registrations_need_update = 1;
 	spin_unlock_bh(&wdev->mgmt_registrations_lock);
 
 	cfg80211_mgmt_registrations_update(wdev);
diff --git a/net/x25/Kconfig b/net/x25/Kconfig
index 9f0d58b0b90b..e3ed23245a82 100644
--- a/net/x25/Kconfig
+++ b/net/x25/Kconfig
@@ -5,7 +5,7 @@
 
 config X25
 	tristate "CCITT X.25 Packet Layer"
-	---help---
+	help
 	  X.25 is a set of standardized network protocols, similar in scope to
 	  frame relay; the one physical line from your box to the X.25 network
 	  entry point can carry several logical point-to-point connections
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index 1bbaf1747e4f..e97db37354e4 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -254,10 +254,10 @@ static int xdp_umem_pin_pages(struct xdp_umem *umem, unsigned long address)
 	if (!umem->pgs)
 		return -ENOMEM;
 
-	down_read(&current->mm->mmap_sem);
+	mmap_read_lock(current->mm);
 	npgs = pin_user_pages(address, umem->npgs,
 			      gup_flags | FOLL_LONGTERM, &umem->pgs[0], NULL);
-	up_read(&current->mm->mmap_sem);
+	mmap_read_unlock(current->mm);
 
 	if (npgs != umem->npgs) {
 		if (npgs >= 0) {
@@ -336,7 +336,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
 	if ((addr + size) < addr)
 		return -EINVAL;
 
-	npgs = div_u64(size, PAGE_SIZE);
+	npgs = size >> PAGE_SHIFT;
 	if (npgs > U32_MAX)
 		return -EINVAL;
 
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index b6c0f08bd80d..3700266229f6 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -352,10 +352,8 @@ static int xsk_generic_xmit(struct sock *sk)
 
 		len = desc.len;
 		skb = sock_alloc_send_skb(sk, len, 1, &err);
-		if (unlikely(!skb)) {
-			err = -EAGAIN;
+		if (unlikely(!skb))
 			goto out;
-		}
 
 		skb_put(skb, len);
 		addr = desc.addr;
diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig
index e77ba529229c..5b9a5ab48111 100644
--- a/net/xfrm/Kconfig
+++ b/net/xfrm/Kconfig
@@ -22,7 +22,7 @@ if INET
 config XFRM_USER
 	tristate "Transformation user configuration interface"
 	select XFRM_ALGO
-	---help---
+	help
 	  Support for Transformation(XFRM) user configuration interface
 	  like IPsec used by native Linux tools.
 
@@ -31,7 +31,7 @@ config XFRM_USER
 config XFRM_INTERFACE
 	tristate "Transformation virtual interface"
 	depends on XFRM && IPV6
-	---help---
+	help
 	  This provides a virtual interface to route IPsec traffic.
 
 	  If unsure, say N.
@@ -39,7 +39,7 @@ config XFRM_INTERFACE
 config XFRM_SUB_POLICY
 	bool "Transformation sub policy support"
 	depends on XFRM
-	---help---
+	help
 	  Support sub policy for developers. By using sub policy with main
 	  one, two policies can be applied to the same packet at once.
 	  Policy which lives shorter time in kernel should be a sub.
@@ -49,7 +49,7 @@ config XFRM_SUB_POLICY
 config XFRM_MIGRATE
 	bool "Transformation migrate database"
 	depends on XFRM
-	---help---
+	help
 	  A feature to update locator(s) of a given IPsec security
 	  association dynamically. This feature is required, for
 	  instance, in a Mobile IPv6 environment with IPsec configuration
@@ -60,7 +60,7 @@ config XFRM_MIGRATE
 config XFRM_STATISTICS
 	bool "Transformation statistics"
 	depends on XFRM && PROC_FS
-	---help---
+	help
 	  This statistics is not a SNMP/MIB specification but shows
 	  statistics about transformation error (or almost error) factor
 	  at packet processing for developer.
@@ -100,7 +100,7 @@ config XFRM_IPCOMP
 config NET_KEY
 	tristate "PF_KEY sockets"
 	select XFRM_ALGO
-	---help---
+	help
 	  PF_KEYv2 socket family, compatible to KAME ones.
 	  They are required if you are going to use IPsec tools ported
 	  from KAME.
@@ -111,7 +111,7 @@ config NET_KEY_MIGRATE
 	bool "PF_KEY MIGRATE"
 	depends on NET_KEY
 	select XFRM_MIGRATE
-	---help---
+	help
 	  Add a PF_KEY MIGRATE message to PF_KEYv2 socket family.
 	  The PF_KEY MIGRATE message is used to dynamically update
 	  locator(s) of a given IPsec security association.