From a61675294735570daca3779bd1dbb3715f7232bd Mon Sep 17 00:00:00 2001 From: Chen Aotian Date: Sun, 9 Apr 2023 10:20:48 +0800 Subject: ieee802154: hwsim: Fix possible memory leaks After replacing e->info, it is necessary to free the old einfo. Fixes: f25da51fdc38 ("ieee802154: hwsim: add replacement for fakelb") Reviewed-by: Miquel Raynal Reviewed-by: Alexander Aring Signed-off-by: Chen Aotian Link: https://lore.kernel.org/r/20230409022048.61223-1-chenaotian2@163.com Signed-off-by: Stefan Schmidt --- drivers/net/ieee802154/mac802154_hwsim.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ieee802154/mac802154_hwsim.c b/drivers/net/ieee802154/mac802154_hwsim.c index 8445c2189d11..31cba9aa7636 100644 --- a/drivers/net/ieee802154/mac802154_hwsim.c +++ b/drivers/net/ieee802154/mac802154_hwsim.c @@ -685,7 +685,7 @@ static int hwsim_del_edge_nl(struct sk_buff *msg, struct genl_info *info) static int hwsim_set_edge_lqi(struct sk_buff *msg, struct genl_info *info) { struct nlattr *edge_attrs[MAC802154_HWSIM_EDGE_ATTR_MAX + 1]; - struct hwsim_edge_info *einfo; + struct hwsim_edge_info *einfo, *einfo_old; struct hwsim_phy *phy_v0; struct hwsim_edge *e; u32 v0, v1; @@ -723,8 +723,10 @@ static int hwsim_set_edge_lqi(struct sk_buff *msg, struct genl_info *info) list_for_each_entry_rcu(e, &phy_v0->edges, list) { if (e->endpoint->idx == v1) { einfo->lqi = lqi; - rcu_assign_pointer(e->info, einfo); + einfo_old = rcu_replace_pointer(e->info, einfo, + lockdep_is_held(&hwsim_phys_lock)); rcu_read_unlock(); + kfree_rcu(einfo_old, rcu); mutex_unlock(&hwsim_phys_lock); return 0; } -- cgit v1.2.3 From e3a0877e7e66b465588f6262680437024edccb9f Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Tue, 11 Apr 2023 11:01:21 +0200 Subject: MAINTAINERS: Update wpan tree The wpan maintainers group is switching from Stefan's tree to a group tree called 'wpan'. We will now maintain: * wpan/wpan.git master: Fixes targeting the 'net' tree * wpan/wpan-next.git master: Features targeting the 'net-next' tree * wpan/wpan-next.git staging: Same as the wpan-next master branch, but we will push there first, expecting robots to parse the tree and report mistakes we would have not catch. This branch can be rebased and force pushed, unlike the others. Signed-off-by: Miquel Raynal Acked-by: Alexander Aring [Fixed two small typos stefan@datenfreihafen.org] Link: https://lore.kernel.org/r/20230411090122.419761-1-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 3bc692d65328..5087b97d8c77 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9890,8 +9890,8 @@ M: Miquel Raynal L: linux-wpan@vger.kernel.org S: Maintained W: https://linux-wpan.org/ -T: git git://git.kernel.org/pub/scm/linux/kernel/git/sschmidt/wpan.git -T: git git://git.kernel.org/pub/scm/linux/kernel/git/sschmidt/wpan-next.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/wpan/wpan.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/wpan/wpan-next.git F: Documentation/networking/ieee802154.rst F: drivers/net/ieee802154/ F: include/linux/ieee802154.h -- cgit v1.2.3 From bd4e3d82f4ccb422672029b099e402e5b3acd5ee Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Tue, 11 Apr 2023 11:01:22 +0200 Subject: MAINTAINERS: Add wpan patchwork This patchwork instance is hosted on kernel.org and has been used for a long time already, it was just not mentioned in MAINTAINERS. 
Signed-off-by: Miquel Raynal Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20230411090122.419761-2-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 5087b97d8c77..ac67099d0755 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9890,6 +9890,7 @@ M: Miquel Raynal L: linux-wpan@vger.kernel.org S: Maintained W: https://linux-wpan.org/ +Q: https://patchwork.kernel.org/project/linux-wpan/list/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/wpan/wpan.git T: git git://git.kernel.org/pub/scm/linux/kernel/git/wpan/wpan-next.git F: Documentation/networking/ieee802154.rst -- cgit v1.2.3 From b3dad076a058916c443c93074dc3ee80baaff4ea Mon Sep 17 00:00:00 2001 From: Aaro Koskinen Date: Fri, 28 Apr 2023 00:16:06 +0300 Subject: staging: octeon: delete my name from TODO contact I gave up using MIPS OCTEON hardware after the drivers were deleted from staging in 2019. Afterwards, the driver seems to be added back but the TODO contact name was not updated accordingly. Delete my name, as I still get mails from people asking help with the driver and their systems. Fixes: 422d97b8b05e ("Revert "staging: octeon: delete driver"") Signed-off-by: Aaro Koskinen Link: https://lore.kernel.org/r/20230427211606.GD881984@darkstar.musicnaut.iki.fi Signed-off-by: Greg Kroah-Hartman --- drivers/staging/octeon/TODO | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/staging/octeon/TODO b/drivers/staging/octeon/TODO index 67a0a1f6b922..044e48e3d65f 100644 --- a/drivers/staging/octeon/TODO +++ b/drivers/staging/octeon/TODO @@ -6,4 +6,3 @@ TODO: - make driver self-contained instead of being split between staging and arch/mips/cavium-octeon. -Contact: Aaro Koskinen -- cgit v1.2.3 From cb6aeeb69af06b09e687fbef4da36a81c36b4994 Mon Sep 17 00:00:00 2001 From: Saurabh Sengar Date: Mon, 24 Apr 2023 03:10:16 -0700 Subject: x86/hyperv/vtl: Add noop for realmode pointers Assign the realmode pointers to noop, instead of NULL to fix kernel panic. Signed-off-by: Saurabh Sengar Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/1682331016-22561-1-git-send-email-ssengar@linux.microsoft.com Signed-off-by: Wei Liu --- arch/x86/hyperv/hv_vtl.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/hyperv/hv_vtl.c b/arch/x86/hyperv/hv_vtl.c index 1ba5d3b99b16..85d38b9f3586 100644 --- a/arch/x86/hyperv/hv_vtl.c +++ b/arch/x86/hyperv/hv_vtl.c @@ -20,6 +20,8 @@ void __init hv_vtl_init_platform(void) { pr_info("Linux runs in Hyper-V Virtual Trust Level\n"); + x86_platform.realmode_reserve = x86_init_noop; + x86_platform.realmode_init = x86_init_noop; x86_init.irqs.pre_vector_init = x86_init_noop; x86_init.timers.timer_init = x86_init_noop; -- cgit v1.2.3 From ec97e112985c2581ee61854a4b74f080f6cdfc2c Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Thu, 4 May 2023 15:41:55 -0700 Subject: Drivers: hv: vmbus: Call hv_synic_free() if hv_synic_alloc() fails Commit 572086325ce9 ("Drivers: hv: vmbus: Cleanup synic memory free path") says "Any memory allocations that succeeded will be freed when the caller cleans up by calling hv_synic_free()", but if the get_zeroed_page() in hv_synic_alloc() fails, currently hv_synic_free() is not really called in vmbus_bus_init(), consequently there will be a memory leak, e.g. hv_context.hv_numa_map is not freed in the error path. Fix this by updating the goto labels. 
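As an aside, the idiom being repaired here is the kernel's reverse-order goto unwinding, where each error label frees everything allocated before the failing step, so a jump to the wrong label skips a free and leaks. A minimal userspace C sketch of the idiom (hypothetical names, not the vmbus code):

#include <stdio.h>
#include <stdlib.h>

/* Minimal sketch of reverse-order "goto" unwinding: a failure at any
 * step jumps to the label that frees everything allocated so far.
 * Jumping to the wrong label -- as the pre-fix code effectively did --
 * skips a free and leaks.
 */
static int init_all(void)
{
	char *a, *b;

	a = malloc(16);
	if (!a)
		goto err_out;	/* nothing allocated yet */

	b = malloc(16);
	if (!b)
		goto err_a;	/* must unwind a, not skip it */

	if (1)			/* pretend a later step fails */
		goto err_b;

	return 0;

err_b:
	free(b);
err_a:
	free(a);
err_out:
	return -1;
}

int main(void)
{
	printf("init_all() = %d\n", init_all());
	return 0;
}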
Cc: stable@kernel.org
Signed-off-by: Dexuan Cui
Fixes: 4df4cb9e99f8 ("x86/hyperv: Initialize clockevents earlier in CPU onlining")
Reviewed-by: Michael Kelley
Link: https://lore.kernel.org/r/20230504224155.10484-1-decui@microsoft.com
Signed-off-by: Wei Liu
---
 drivers/hv/vmbus_drv.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index 1c65a6dfb9fa..67f95a29aeca 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -1372,7 +1372,7 @@ static int vmbus_bus_init(void)
 	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "hyperv/vmbus:online",
 				hv_synic_init, hv_synic_cleanup);
 	if (ret < 0)
-		goto err_cpuhp;
+		goto err_alloc;
 	hyperv_cpuhp_online = ret;

 	ret = vmbus_connect();
@@ -1392,9 +1392,8 @@ static int vmbus_bus_init(void)

 err_connect:
 	cpuhp_remove_state(hyperv_cpuhp_online);
-err_cpuhp:
-	hv_synic_free();
 err_alloc:
+	hv_synic_free();
 	if (vmbus_irq == -1) {
 		hv_remove_vmbus_handler();
 	} else {
-- cgit v1.2.3

From 1f8b6df6a997a430b0c48b504638154b520781ad Mon Sep 17 00:00:00 2001
From: Benedict Wong
Date: Wed, 10 May 2023 01:30:21 +0000
Subject: xfrm: Treat already-verified secpath entries as optional

This change allows inbound traffic through nested IPsec tunnels to
successfully match policies and templates, while retaining the secpath
stack trace as necessary for netfilter policies.

Specifically, this patch marks secpath entries that have already matched
against a relevant policy as having been verified, allowing them to be
treated as optional and skipped after a tunnel decapsulation (during
which the src/dst/proto/etc may have changed, and the correct policy
chain may no longer be resolvable).

This approach is taken as opposed to the one in b0355dbbf13c, where the
secpath was cleared, since that breaks subsequent validations that rely
on the existence of the secpath entries (netfilter policies, or
transport-in-tunnel mode, where policies remain resolvable).
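For illustration, the verified_cnt watermark can be modeled in plain C: entries below the watermark are treated as optional when a later template match runs. This is a deliberately simplified model with hypothetical names, not the kernel's xfrm_policy_ok() logic:

#include <stdio.h>

#define MAX_DEPTH 4

struct sec_path_model {
	int len;
	int verified_cnt;          /* entries below this index already matched */
	int xvec_mode[MAX_DEPTH];  /* 0 = transport, 1 = tunnel (simplified) */
};

/* Simplified version of the rule: a non-transport entry normally ends
 * the search unmatched, but an entry verified before a decapsulation
 * is treated as optional and skipped.
 */
static int match_templates(const struct sec_path_model *sp, int start)
{
	for (int idx = start; idx < sp->len; idx++) {
		if (sp->xvec_mode[idx] != 0) {	/* not transport */
			if (idx < sp->verified_cnt)
				continue;	/* previously verified: optional */
			return -1;		/* unmatched mandatory entry */
		}
	}
	return 0;
}

int main(void)
{
	struct sec_path_model sp = { .len = 2, .verified_cnt = 1,
				     .xvec_mode = { 1, 0 } };

	printf("match: %d\n", match_templates(&sp, 0)); /* 0: tunnel entry skipped */
	sp.verified_cnt = 0;
	printf("match: %d\n", match_templates(&sp, 0)); /* -1: now mandatory */
	return 0;
}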
Fixes: b0355dbbf13c ("Fix XFRM-I support for nested ESP tunnels") Test: Tested against Android Kernel Unit Tests Test: Tested against Android CTS Signed-off-by: Benedict Wong Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 1 + net/xfrm/xfrm_input.c | 1 + net/xfrm/xfrm_policy.c | 12 ++++++++++++ 3 files changed, 14 insertions(+) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 33ee3f5936e6..151ca95dd08d 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1054,6 +1054,7 @@ struct xfrm_offload { struct sec_path { int len; int olen; + int verified_cnt; struct xfrm_state *xvec[XFRM_MAX_DEPTH]; struct xfrm_offload ovec[XFRM_MAX_OFFLOAD_DEPTH]; diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 39fb91ff23d9..b731687b5f3e 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -131,6 +131,7 @@ struct sec_path *secpath_set(struct sk_buff *skb) memset(sp->ovec, 0, sizeof(sp->ovec)); sp->olen = 0; sp->len = 0; + sp->verified_cnt = 0; return sp; } diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 6d15788b5123..ff58ce6c030c 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -3349,6 +3349,13 @@ xfrm_policy_ok(const struct xfrm_tmpl *tmpl, const struct sec_path *sp, int star if (xfrm_state_ok(tmpl, sp->xvec[idx], family, if_id)) return ++idx; if (sp->xvec[idx]->props.mode != XFRM_MODE_TRANSPORT) { + if (idx < sp->verified_cnt) { + /* Secpath entry previously verified, consider optional and + * continue searching + */ + continue; + } + if (start == -1) start = -2-idx; break; @@ -3723,6 +3730,9 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, * Order is _important_. Later we will implement * some barriers, but at the moment barriers * are implied between each two transformations. + * Upon success, marks secpath entries as having been + * verified to allow them to be skipped in future policy + * checks (e.g. nested tunnels). */ for (i = xfrm_nr-1, k = 0; i >= 0; i--) { k = xfrm_policy_ok(tpp[i], sp, k, family, if_id); @@ -3741,6 +3751,8 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, } xfrm_pols_put(pols, npols); + sp->verified_cnt = k; + return 1; } XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLBLOCK); -- cgit v1.2.3 From a287f5b0cfc6804c5b12a4be13c7c9fe27869e90 Mon Sep 17 00:00:00 2001 From: Benedict Wong Date: Wed, 10 May 2023 01:30:22 +0000 Subject: xfrm: Ensure policies always checked on XFRM-I input path This change adds methods in the XFRM-I input path that ensures that policies are checked prior to processing of the subsequent decapsulated packet, after which the relevant policies may no longer be resolvable (due to changing src/dst/proto/etc). Notably, raw ESP/AH packets did not perform policy checks inherently, whereas all other encapsulated packets (UDP, TCP encapsulated) do policy checks after calling xfrm_input handling in the respective encapsulation layer. 
Fixes: b0355dbbf13c ("Fix XFRM-I support for nested ESP tunnels") Test: Verified with additional Android Kernel Unit tests Test: Verified against Android CTS Signed-off-by: Benedict Wong Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_interface_core.c | 54 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 50 insertions(+), 4 deletions(-) diff --git a/net/xfrm/xfrm_interface_core.c b/net/xfrm/xfrm_interface_core.c index 1f99dc469027..35279c220bd7 100644 --- a/net/xfrm/xfrm_interface_core.c +++ b/net/xfrm/xfrm_interface_core.c @@ -310,6 +310,52 @@ static void xfrmi_scrub_packet(struct sk_buff *skb, bool xnet) skb->mark = 0; } +static int xfrmi_input(struct sk_buff *skb, int nexthdr, __be32 spi, + int encap_type, unsigned short family) +{ + struct sec_path *sp; + + sp = skb_sec_path(skb); + if (sp && (sp->len || sp->olen) && + !xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family)) + goto discard; + + XFRM_SPI_SKB_CB(skb)->family = family; + if (family == AF_INET) { + XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; + } else { + XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr); + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL; + } + + return xfrm_input(skb, nexthdr, spi, encap_type); +discard: + kfree_skb(skb); + return 0; +} + +static int xfrmi4_rcv(struct sk_buff *skb) +{ + return xfrmi_input(skb, ip_hdr(skb)->protocol, 0, 0, AF_INET); +} + +static int xfrmi6_rcv(struct sk_buff *skb) +{ + return xfrmi_input(skb, skb_network_header(skb)[IP6CB(skb)->nhoff], + 0, 0, AF_INET6); +} + +static int xfrmi4_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) +{ + return xfrmi_input(skb, nexthdr, spi, encap_type, AF_INET); +} + +static int xfrmi6_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) +{ + return xfrmi_input(skb, nexthdr, spi, encap_type, AF_INET6); +} + static int xfrmi_rcv_cb(struct sk_buff *skb, int err) { const struct xfrm_mode *inner_mode; @@ -945,8 +991,8 @@ static struct pernet_operations xfrmi_net_ops = { }; static struct xfrm6_protocol xfrmi_esp6_protocol __read_mostly = { - .handler = xfrm6_rcv, - .input_handler = xfrm_input, + .handler = xfrmi6_rcv, + .input_handler = xfrmi6_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 10, @@ -996,8 +1042,8 @@ static struct xfrm6_tunnel xfrmi_ip6ip_handler __read_mostly = { #endif static struct xfrm4_protocol xfrmi_esp4_protocol __read_mostly = { - .handler = xfrm4_rcv, - .input_handler = xfrm_input, + .handler = xfrmi4_rcv, + .input_handler = xfrmi4_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi4_err, .priority = 10, -- cgit v1.2.3 From 70c2e03e9aaf17496c63f6e42333c012f5ae5307 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Wed, 29 Mar 2023 13:23:04 +0300 Subject: thunderbolt: dma_test: Use correct value for absent rings when creating paths Both tb_xdomain_enable_paths() and tb_xdomain_disable_paths() expect -1, not 0, if the corresponding ring is not needed. For this reason change the driver to use correct value for the rings that are not needed. 
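The underlying bug class is a classic one: hop 0 names a valid ring, so it cannot double as "no ring", and a distinct sentinel is required. A tiny illustrative sketch with hypothetical names (not the dma_test code):

#include <stdio.h>

/* Illustrative only: hop 0 is a legitimate ring number, so "absent"
 * needs a distinct sentinel (-1). Passing 0 for a missing ring would
 * silently configure ring 0 instead of configuring nothing.
 */
struct ring { int hop; };

static void enable_path(int tx_hop, int rx_hop)
{
	printf("tx: %s, rx: %s\n",
	       tx_hop < 0 ? "absent" : "present",
	       rx_hop < 0 ? "absent" : "present");
}

int main(void)
{
	struct ring rx = { .hop = 0 };	/* ring 0 is valid */
	struct ring *tx = NULL;		/* no TX ring allocated */

	enable_path(tx ? tx->hop : -1, rx.hop);
	return 0;
}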
Fixes: 180b0689425c ("thunderbolt: Allow multiple DMA tunnels over a single XDomain connection") Cc: stable@vger.kernel.org Reported-by: Pengfei Xu Tested-by: Pengfei Xu Signed-off-by: Mika Westerberg --- drivers/thunderbolt/dma_test.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/thunderbolt/dma_test.c b/drivers/thunderbolt/dma_test.c index 3bedecb236e0..14bb6dec6c4b 100644 --- a/drivers/thunderbolt/dma_test.c +++ b/drivers/thunderbolt/dma_test.c @@ -192,9 +192,9 @@ static int dma_test_start_rings(struct dma_test *dt) } ret = tb_xdomain_enable_paths(dt->xd, dt->tx_hopid, - dt->tx_ring ? dt->tx_ring->hop : 0, + dt->tx_ring ? dt->tx_ring->hop : -1, dt->rx_hopid, - dt->rx_ring ? dt->rx_ring->hop : 0); + dt->rx_ring ? dt->rx_ring->hop : -1); if (ret) { dma_test_free_rings(dt); return ret; @@ -218,9 +218,9 @@ static void dma_test_stop_rings(struct dma_test *dt) tb_ring_stop(dt->tx_ring); ret = tb_xdomain_disable_paths(dt->xd, dt->tx_hopid, - dt->tx_ring ? dt->tx_ring->hop : 0, + dt->tx_ring ? dt->tx_ring->hop : -1, dt->rx_hopid, - dt->rx_ring ? dt->rx_ring->hop : 0); + dt->rx_ring ? dt->rx_ring->hop : -1); if (ret) dev_warn(&dt->svc->dev, "failed to disable DMA paths\n"); -- cgit v1.2.3 From 320805ab61e5f1e2a5729ae266e16bec2904050c Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Thu, 18 May 2023 08:13:52 -0700 Subject: Drivers: hv: vmbus: Fix vmbus_wait_for_unload() to scan present CPUs vmbus_wait_for_unload() may be called in the panic path after other CPUs are stopped. vmbus_wait_for_unload() currently loops through online CPUs looking for the UNLOAD response message. But the values of CONFIG_KEXEC_CORE and crash_kexec_post_notifiers affect the path used to stop the other CPUs, and in one of the paths the stopped CPUs are removed from cpu_online_mask. This removal happens in both x86/x64 and arm64 architectures. In such a case, vmbus_wait_for_unload() only checks the panic'ing CPU, and misses the UNLOAD response message except when the panic'ing CPU is CPU 0. vmbus_wait_for_unload() eventually times out, but only after waiting 100 seconds. Fix this by looping through *present* CPUs in vmbus_wait_for_unload(). The cpu_present_mask is not modified by stopping the other CPUs in the panic path, nor should it be. Also, in a CoCo VM the synic_message_page is not allocated in hv_synic_alloc(), but is set and cleared in hv_synic_enable_regs() and hv_synic_disable_regs() such that it is set only when the CPU is online. If not all present CPUs are online when vmbus_wait_for_unload() is called, the synic_message_page might be NULL. Add a check for this. Fixes: cd95aad55793 ("Drivers: hv: vmbus: handle various crash scenarios") Cc: stable@vger.kernel.org Reported-by: John Starks Signed-off-by: Michael Kelley Reviewed-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/1684422832-38476-1-git-send-email-mikelley@microsoft.com Signed-off-by: Wei Liu --- drivers/hv/channel_mgmt.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index 007f26d5f1a4..2f4d09ce027a 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -829,11 +829,22 @@ static void vmbus_wait_for_unload(void) if (completion_done(&vmbus_connection.unload_event)) goto completed; - for_each_online_cpu(cpu) { + for_each_present_cpu(cpu) { struct hv_per_cpu_context *hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu); + /* + * In a CoCo VM the synic_message_page is not allocated + * in hv_synic_alloc(). 
Instead it is set/cleared in + * hv_synic_enable_regs() and hv_synic_disable_regs() + * such that it is set only when the CPU is online. If + * not all present CPUs are online, the message page + * might be NULL, so skip such CPUs. + */ page_addr = hv_cpu->synic_message_page; + if (!page_addr) + continue; + msg = (struct hv_message *)page_addr + VMBUS_MESSAGE_SINT; @@ -867,11 +878,14 @@ completed: * maybe-pending messages on all CPUs to be able to receive new * messages after we reconnect. */ - for_each_online_cpu(cpu) { + for_each_present_cpu(cpu) { struct hv_per_cpu_context *hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu); page_addr = hv_cpu->synic_message_page; + if (!page_addr) + continue; + msg = (struct hv_message *)page_addr + VMBUS_MESSAGE_SINT; msg->header.message_type = HVMSG_NONE; } -- cgit v1.2.3 From b6d572aeb58a5e0be86bd51ea514c4feba996cc4 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Wed, 24 May 2023 10:41:57 +0300 Subject: thunderbolt: Increase DisplayPort Connection Manager handshake timeout It turns out that when plugging in VGA cable through USB-C to VGA/DVI dongle the Connection Manager handshake can take longer time, at least on Intel Titan Ridge based docks such as Dell WD91TB. This leads to following error in the dmesg: thunderbolt 0000:00:0d.3: 3:10: DP tunnel activation failed, aborting and the display stays blank (because we failed to establish the tunnel). For this reason increase the timeout to 3s. Reported-by: Koba Ko Cc: stable@vger.kernel.org Acked-By: Yehezkel Bernat Signed-off-by: Mika Westerberg --- drivers/thunderbolt/tunnel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/thunderbolt/tunnel.c b/drivers/thunderbolt/tunnel.c index 9099ae73e78f..4f222673d651 100644 --- a/drivers/thunderbolt/tunnel.c +++ b/drivers/thunderbolt/tunnel.c @@ -526,7 +526,7 @@ static int tb_dp_xchg_caps(struct tb_tunnel *tunnel) * Perform connection manager handshake between IN and OUT ports * before capabilities exchange can take place. */ - ret = tb_dp_cm_handshake(in, out, 1500); + ret = tb_dp_cm_handshake(in, out, 3000); if (ret) return ret; -- cgit v1.2.3 From 3fe95742af29b8b4eccab2ba94bc521805c6e10c Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Wed, 24 May 2023 13:47:04 +0300 Subject: thunderbolt: Do not touch CL state configuration during discovery If the boot firmware has already established tunnels, especially ones that have special requirements from the link such as DisplayPort, we should not blindly enable CL states (nor change the TMU configuration). Otherwise the existing tunnels may not work as expected. For this reason, skip the CL state enabling when we go over the existing topology. This will also keep the TMU settings untouched because we do not change the TMU configuration when CL states are not enabled. 
Reported-by: Koba Ko Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/7831 Cc: stable@vger.kernel.org # v6.0+ Acked-By: Yehezkel Bernat Signed-off-by: Mika Westerberg --- drivers/thunderbolt/tb.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/thunderbolt/tb.c b/drivers/thunderbolt/tb.c index 7bfbc9ca9ba4..c1af712ca728 100644 --- a/drivers/thunderbolt/tb.c +++ b/drivers/thunderbolt/tb.c @@ -737,6 +737,7 @@ static void tb_scan_port(struct tb_port *port) { struct tb_cm *tcm = tb_priv(port->sw->tb); struct tb_port *upstream_port; + bool discovery = false; struct tb_switch *sw; int ret; @@ -804,8 +805,10 @@ static void tb_scan_port(struct tb_port *port) * tunnels and know which switches were authorized already by * the boot firmware. */ - if (!tcm->hotplug_active) + if (!tcm->hotplug_active) { dev_set_uevent_suppress(&sw->dev, true); + discovery = true; + } /* * At the moment Thunderbolt 2 and beyond (devices with LC) we @@ -835,10 +838,14 @@ static void tb_scan_port(struct tb_port *port) * CL0s and CL1 are enabled and supported together. * Silently ignore CLx enabling in case CLx is not supported. */ - ret = tb_switch_enable_clx(sw, TB_CL1); - if (ret && ret != -EOPNOTSUPP) - tb_sw_warn(sw, "failed to enable %s on upstream port\n", - tb_switch_clx_name(TB_CL1)); + if (discovery) { + tb_sw_dbg(sw, "discovery, not touching CL states\n"); + } else { + ret = tb_switch_enable_clx(sw, TB_CL1); + if (ret && ret != -EOPNOTSUPP) + tb_sw_warn(sw, "failed to enable %s on upstream port\n", + tb_switch_clx_name(TB_CL1)); + } if (tb_switch_is_clx_enabled(sw, TB_CL1)) /* -- cgit v1.2.3 From aafbb1eeabdc4e9241b400146f77369f041ec11b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Mon, 29 May 2023 04:40:02 -0400 Subject: tracing/rv/rtla: Update MAINTAINERS file to point to proper mailing list The mailing list that goes to linux-trace-devel is for the tracing libraries, and the patchwork associated to the tracing libraries keys off of that mailing list. For anything that lives in the Linux kernel proper (including the tools directory) must go through linux-trace-kernel, as the patchwork to that list keys off of the Linux kernel proper. Update the MAINTAINERS file to reflect the proper mailing lists. 
Link: https://lore.kernel.org/linux-trace-kernel/20230529044002.0481452b@rorschach.local.home Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) Acked-by: Daniel Bristot de Oliveira --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 27ef11624748..725d35b5d328 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17806,7 +17806,7 @@ F: tools/testing/selftests/rtc/ Real-time Linux Analysis (RTLA) tools M: Daniel Bristot de Oliveira M: Steven Rostedt -L: linux-trace-devel@vger.kernel.org +L: linux-trace-kernel@vger.kernel.org S: Maintained F: Documentation/tools/rtla/ F: tools/tracing/rtla/ @@ -18368,7 +18368,7 @@ F: drivers/infiniband/ulp/rtrs/ RUNTIME VERIFICATION (RV) M: Daniel Bristot de Oliveira M: Steven Rostedt -L: linux-trace-devel@vger.kernel.org +L: linux-trace-kernel@vger.kernel.org S: Maintained F: Documentation/trace/rv/ F: include/linux/rv.h -- cgit v1.2.3 From 9f9666e65359d5047089aef97ac87c50f624ecb0 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Tue, 30 May 2023 08:48:29 +0300 Subject: thunderbolt: Mask ring interrupt on Intel hardware as well When resuming from system sleep states the driver issues following warning on Intel hardware: thunderbolt 0000:07:00.0: interrupt for TX ring 0 is already enabled The reason for this is that the commit in question did not mask the ring interrupt on Intel hardware leaving the interrupt active. Fix this by masking it also in Intel hardware. Reported-by: beld zhang Tested-by: beld zhang Closes: https://lore.kernel.org/linux-usb/ZHKW5NeabmfhgLbY@debian.me/ Fixes: c4af8e3fecd0 ("thunderbolt: Clear registers properly when auto clear isn't in use") Cc: stable@vger.kernel.org Reviewed-by: Mario Limonciello Signed-off-by: Mika Westerberg --- drivers/thunderbolt/nhi.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/thunderbolt/nhi.c b/drivers/thunderbolt/nhi.c index c0aee5dc5237..e58beac44295 100644 --- a/drivers/thunderbolt/nhi.c +++ b/drivers/thunderbolt/nhi.c @@ -56,9 +56,14 @@ static int ring_interrupt_index(const struct tb_ring *ring) static void nhi_mask_interrupt(struct tb_nhi *nhi, int mask, int ring) { - if (nhi->quirks & QUIRK_AUTO_CLEAR_INT) - return; - iowrite32(mask, nhi->iobase + REG_RING_INTERRUPT_MASK_CLEAR_BASE + ring); + if (nhi->quirks & QUIRK_AUTO_CLEAR_INT) { + u32 val; + + val = ioread32(nhi->iobase + REG_RING_INTERRUPT_BASE + ring); + iowrite32(val & ~mask, nhi->iobase + REG_RING_INTERRUPT_BASE + ring); + } else { + iowrite32(mask, nhi->iobase + REG_RING_INTERRUPT_MASK_CLEAR_BASE + ring); + } } static void nhi_clear_interrupt(struct tb_nhi *nhi, int ring) -- cgit v1.2.3 From 40994ce0ea010b1843e620bacc26d29ddebfc08d Mon Sep 17 00:00:00 2001 From: Zhu YiXin Date: Fri, 19 May 2023 12:45:55 +0800 Subject: MAINTAINERS: Add Chuanhua Lei as Intel LGM GW PCIe maintainer Rahul Tanwar is no longer at Maxlinear, so update the MAINTAINERS entry for the PCIe driver for Intel LGM GW SoC. 
Link: https://lore.kernel.org/r/20230519044555.3750-2-yzhu@maxlinear.com Signed-off-by: Zhu YiXin Signed-off-by: Bjorn Helgaas Acked-by: Rahul Tanwar Acked-by: Lei Chuanhua --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 7e0b87d5aa2e..31c4e5e9c411 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16349,7 +16349,7 @@ F: Documentation/devicetree/bindings/pci/intel,keembay-pcie* F: drivers/pci/controller/dwc/pcie-keembay.c PCIE DRIVER FOR INTEL LGM GW SOC -M: Rahul Tanwar +M: Chuanhua Lei L: linux-pci@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/pci/intel-gw-pcie.yaml -- cgit v1.2.3 From f1832e2b5e498e258b090af3b065b85cf8cc5161 Mon Sep 17 00:00:00 2001 From: Jerry Meng Date: Wed, 31 May 2023 11:51:16 +0800 Subject: USB: serial: option: add Quectel EM061KGL series Add support for Quectel EM061KGL series which are based on Qualcomm SDX12 chip: EM061KGL_LTA(0x2c7c / 0x0123): MBIM + GNSS + DIAG + NMEA + AT + QDSS + DPL EM061KGL_LMS(0x2c7c / 0x0124): MBIM + GNSS + DIAG + NMEA + AT + QDSS + DPL EM061KGL_LWW(0x2c7c / 0x6008): MBIM + GNSS + DIAG + NMEA + AT + QDSS + DPL EM061KGL_LCN(0x2c7c / 0x6009): MBIM + GNSS + DIAG + NMEA + AT + QDSS + DPL Above products use the exact same interface layout and option driver is for interfaces DIAG, NMEA and AT. T: Bus=03 Lev=01 Prnt=01 Port=01 Cnt=02 Dev#= 5 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=2c7c ProdID=6008 Rev= 5.04 S: Manufacturer=Quectel S: Product=Quectel EM061K-GL S: SerialNumber=f6fa08b6 C:* #Ifs= 8 Cfg#= 1 Atr=a0 MxPwr=500mA A: FirstIf#= 0 IfCount= 2 Cls=02(comm.) Sub=0e Prot=00 I:* If#= 0 Alt= 0 #EPs= 1 Cls=02(comm.) Sub=0e Prot=00 Driver=cdc_mbim E: Ad=81(I) Atr=03(Int.) MxPS= 64 Ivl=32ms I: If#= 1 Alt= 0 #EPs= 0 Cls=0a(data ) Sub=00 Prot=02 Driver=cdc_mbim I:* If#= 1 Alt= 1 #EPs= 2 Cls=0a(data ) Sub=00 Prot=02 Driver=cdc_mbim E: Ad=8e(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=0f(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 2 Alt= 0 #EPs= 1 Cls=ff(vend.) Sub=ff Prot=ff Driver=(none) E: Ad=82(I) Atr=03(Int.) MxPS= 64 Ivl=32ms I:* If#= 3 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=40 Driver=option E: Ad=85(I) Atr=03(Int.) MxPS= 10 Ivl=32ms E: Ad=84(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 5 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=87(I) Atr=03(Int.) MxPS= 10 Ivl=32ms E: Ad=86(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 6 Alt= 0 #EPs= 1 Cls=ff(vend.) Sub=ff Prot=70 Driver=(none) E: Ad=88(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 7 Alt= 0 #EPs= 1 Cls=ff(vend.) 
Sub=ff Prot=80 Driver=(none) E: Ad=8f(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms Signed-off-by: Jerry Meng Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 644a55447fd7..fd42e3a0bd18 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -248,6 +248,8 @@ static void option_instat_callback(struct urb *urb); #define QUECTEL_VENDOR_ID 0x2c7c /* These Quectel products use Quectel's vendor ID */ #define QUECTEL_PRODUCT_EC21 0x0121 +#define QUECTEL_PRODUCT_EM061K_LTA 0x0123 +#define QUECTEL_PRODUCT_EM061K_LMS 0x0124 #define QUECTEL_PRODUCT_EC25 0x0125 #define QUECTEL_PRODUCT_EG91 0x0191 #define QUECTEL_PRODUCT_EG95 0x0195 @@ -266,6 +268,8 @@ static void option_instat_callback(struct urb *urb); #define QUECTEL_PRODUCT_RM520N 0x0801 #define QUECTEL_PRODUCT_EC200U 0x0901 #define QUECTEL_PRODUCT_EC200S_CN 0x6002 +#define QUECTEL_PRODUCT_EM061K_LWW 0x6008 +#define QUECTEL_PRODUCT_EM061K_LCN 0x6009 #define QUECTEL_PRODUCT_EC200T 0x6026 #define QUECTEL_PRODUCT_RM500K 0x7001 @@ -1189,6 +1193,18 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM060K, 0xff, 0x00, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM060K, 0xff, 0xff, 0x30) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM060K, 0xff, 0xff, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM061K_LCN, 0xff, 0xff, 0x30) }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM061K_LCN, 0xff, 0x00, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM061K_LCN, 0xff, 0xff, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM061K_LMS, 0xff, 0xff, 0x30) }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM061K_LMS, 0xff, 0x00, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM061K_LMS, 0xff, 0xff, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM061K_LTA, 0xff, 0xff, 0x30) }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM061K_LTA, 0xff, 0x00, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM061K_LTA, 0xff, 0xff, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM061K_LWW, 0xff, 0xff, 0x30) }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM061K_LWW, 0xff, 0x00, 0x40) }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM061K_LWW, 0xff, 0xff, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM12, 0xff, 0xff, 0xff), .driver_info = RSVD(1) | RSVD(2) | RSVD(3) | RSVD(4) | NUMEP2 }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM12, 0xff, 0, 0) }, -- cgit v1.2.3 From 8681f71759010503892f9e3ddb05f65c0f21b690 Mon Sep 17 00:00:00 2001 From: Reiji Watanabe Date: Fri, 2 Jun 2023 19:50:34 -0700 Subject: KVM: arm64: PMU: Restore the host's PMUSERENR_EL0 Restore the host's PMUSERENR_EL0 value instead of clearing it, before returning back to userspace, as the host's EL0 might have a direct access to PMU registers (some bits of PMUSERENR_EL0 for might not be zero for the host EL0). 
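A compact model of the save/restore pattern the patch introduces, with a plain variable standing in for the system register; this is illustrative only, not the KVM code:

#include <stdio.h>

/* Illustrative model: the traps-activate path saves the host's value
 * of a register into the host context and installs the guest view; the
 * deactivate path restores the saved value instead of writing 0, since
 * the host EL0 may legitimately run with non-zero bits set.
 */
static unsigned long pmuserenr;		/* stand-in for the sysreg */

struct host_ctxt { unsigned long pmuserenr_el0; };

static void activate_traps(struct host_ctxt *h, unsigned long guest_mask)
{
	h->pmuserenr_el0 = pmuserenr;	/* save the host value */
	pmuserenr = guest_mask;
}

static void deactivate_traps(const struct host_ctxt *h)
{
	pmuserenr = h->pmuserenr_el0;	/* restore, don't zero */
}

int main(void)
{
	struct host_ctxt h;

	pmuserenr = 0xd;		/* host EL0 has direct PMU access */
	activate_traps(&h, 0xf);
	deactivate_traps(&h);
	printf("host view restored: %#lx\n", pmuserenr);
	return 0;
}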
Fixes: 83a7a4d643d3 ("arm64: perf: Enable PMU counter userspace access for perf event") Signed-off-by: Reiji Watanabe Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230603025035.3781797-2-reijiw@google.com --- arch/arm64/kvm/hyp/include/hyp/switch.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 5c15c58f90cc..0dcb8f7620d1 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -82,7 +82,12 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu) * EL1 instead of being trapped to EL2. */ if (kvm_arm_support_pmu_v3()) { + struct kvm_cpu_context *hctxt; + write_sysreg(0, pmselr_el0); + + hctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; + ctxt_sys_reg(hctxt, PMUSERENR_EL0) = read_sysreg(pmuserenr_el0); write_sysreg(ARMV8_PMU_USERENR_MASK, pmuserenr_el0); } @@ -106,8 +111,12 @@ static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu) write_sysreg(vcpu->arch.mdcr_el2_host, mdcr_el2); write_sysreg(0, hstr_el2); - if (kvm_arm_support_pmu_v3()) - write_sysreg(0, pmuserenr_el0); + if (kvm_arm_support_pmu_v3()) { + struct kvm_cpu_context *hctxt; + + hctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; + write_sysreg(ctxt_sys_reg(hctxt, PMUSERENR_EL0), pmuserenr_el0); + } if (cpus_have_final_cap(ARM64_SME)) { sysreg_clear_set_s(SYS_HFGRTR_EL2, 0, -- cgit v1.2.3 From 0c2f9acf6ae74118385f7a7d48f4b2d93637b628 Mon Sep 17 00:00:00 2001 From: Reiji Watanabe Date: Fri, 2 Jun 2023 19:50:35 -0700 Subject: KVM: arm64: PMU: Don't overwrite PMUSERENR with vcpu loaded Currently, with VHE, KVM sets ER, CR, SW and EN bits of PMUSERENR_EL0 to 1 on vcpu_load(), and saves and restores the register value for the host on vcpu_load() and vcpu_put(). If the value of those bits are cleared on a pCPU with a vCPU loaded (armv8pmu_start() would do that when PMU counters are programmed for the guest), PMU access from the guest EL0 might be trapped to the guest EL1 directly regardless of the current PMUSERENR_EL0 value of the vCPU. Fix this by not letting armv8pmu_start() overwrite PMUSERENR_EL0 on the pCPU where PMUSERENR_EL0 for the guest is loaded, and instead updating the saved shadow register value for the host so that the value can be restored on vcpu_put() later. While vcpu_{put,load}() are manipulating PMUSERENR_EL0, disable IRQs to prevent a race condition between these processes and IPIs that attempt to update PMUSERENR_EL0 for the host EL0. 
Suggested-by: Mark Rutland Suggested-by: Marc Zyngier Fixes: 83a7a4d643d3 ("arm64: perf: Enable PMU counter userspace access for perf event") Signed-off-by: Reiji Watanabe Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230603025035.3781797-3-reijiw@google.com --- arch/arm/include/asm/arm_pmuv3.h | 5 +++++ arch/arm64/include/asm/kvm_host.h | 7 +++++++ arch/arm64/kvm/hyp/include/hyp/switch.h | 2 ++ arch/arm64/kvm/hyp/vhe/switch.c | 14 ++++++++++++++ arch/arm64/kvm/pmu.c | 27 +++++++++++++++++++++++++++ drivers/perf/arm_pmuv3.c | 21 ++++++++++++++++++--- 6 files changed, 73 insertions(+), 3 deletions(-) diff --git a/arch/arm/include/asm/arm_pmuv3.h b/arch/arm/include/asm/arm_pmuv3.h index 78d3d4b82c6c..92ddd950478e 100644 --- a/arch/arm/include/asm/arm_pmuv3.h +++ b/arch/arm/include/asm/arm_pmuv3.h @@ -222,6 +222,11 @@ static inline bool kvm_pmu_counter_deferred(struct perf_event_attr *attr) return false; } +static inline bool kvm_set_pmuserenr(u64 val) +{ + return false; +} + /* PMU Version in DFR Register */ #define ARMV8_PMU_DFR_VER_NI 0 #define ARMV8_PMU_DFR_VER_V3P4 0x5 diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 7e7e19ef6993..9787503ff43f 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -699,6 +699,8 @@ struct kvm_vcpu_arch { #define SYSREGS_ON_CPU __vcpu_single_flag(sflags, BIT(4)) /* Software step state is Active-pending */ #define DBG_SS_ACTIVE_PENDING __vcpu_single_flag(sflags, BIT(5)) +/* PMUSERENR for the guest EL0 is on physical CPU */ +#define PMUSERENR_ON_CPU __vcpu_single_flag(sflags, BIT(6)) /* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */ @@ -1065,9 +1067,14 @@ void kvm_arch_vcpu_put_debug_state_flags(struct kvm_vcpu *vcpu); #ifdef CONFIG_KVM void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr); void kvm_clr_pmu_events(u32 clr); +bool kvm_set_pmuserenr(u64 val); #else static inline void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr) {} static inline void kvm_clr_pmu_events(u32 clr) {} +static inline bool kvm_set_pmuserenr(u64 val) +{ + return false; +} #endif void kvm_vcpu_load_sysregs_vhe(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 0dcb8f7620d1..4fe217efa218 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -89,6 +89,7 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu) hctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; ctxt_sys_reg(hctxt, PMUSERENR_EL0) = read_sysreg(pmuserenr_el0); write_sysreg(ARMV8_PMU_USERENR_MASK, pmuserenr_el0); + vcpu_set_flag(vcpu, PMUSERENR_ON_CPU); } vcpu->arch.mdcr_el2_host = read_sysreg(mdcr_el2); @@ -116,6 +117,7 @@ static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu) hctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; write_sysreg(ctxt_sys_reg(hctxt, PMUSERENR_EL0), pmuserenr_el0); + vcpu_clear_flag(vcpu, PMUSERENR_ON_CPU); } if (cpus_have_final_cap(ARM64_SME)) { diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index 7a1aa511e7da..b37e7c96efea 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -92,14 +92,28 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu) } NOKPROBE_SYMBOL(__deactivate_traps); +/* + * Disable IRQs in {activate,deactivate}_traps_vhe_{load,put}() to + * prevent a race condition between context switching of PMUSERENR_EL0 + * in 
__{activate,deactivate}_traps_common() and IPIs that attempts to + * update PMUSERENR_EL0. See also kvm_set_pmuserenr(). + */ void activate_traps_vhe_load(struct kvm_vcpu *vcpu) { + unsigned long flags; + + local_irq_save(flags); __activate_traps_common(vcpu); + local_irq_restore(flags); } void deactivate_traps_vhe_put(struct kvm_vcpu *vcpu) { + unsigned long flags; + + local_irq_save(flags); __deactivate_traps_common(vcpu); + local_irq_restore(flags); } static const exit_handler_fn hyp_exit_handlers[] = { diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c index 7887133d15f0..121f1a14c829 100644 --- a/arch/arm64/kvm/pmu.c +++ b/arch/arm64/kvm/pmu.c @@ -209,3 +209,30 @@ void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu) kvm_vcpu_pmu_enable_el0(events_host); kvm_vcpu_pmu_disable_el0(events_guest); } + +/* + * With VHE, keep track of the PMUSERENR_EL0 value for the host EL0 on the pCPU + * where PMUSERENR_EL0 for the guest is loaded, since PMUSERENR_EL0 is switched + * to the value for the guest on vcpu_load(). The value for the host EL0 + * will be restored on vcpu_put(), before returning to userspace. + * This isn't necessary for nVHE, as the register is context switched for + * every guest enter/exit. + * + * Return true if KVM takes care of the register. Otherwise return false. + */ +bool kvm_set_pmuserenr(u64 val) +{ + struct kvm_cpu_context *hctxt; + struct kvm_vcpu *vcpu; + + if (!kvm_arm_support_pmu_v3() || !has_vhe()) + return false; + + vcpu = kvm_get_running_vcpu(); + if (!vcpu || !vcpu_get_flag(vcpu, PMUSERENR_ON_CPU)) + return false; + + hctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; + ctxt_sys_reg(hctxt, PMUSERENR_EL0) = val; + return true; +} diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c index c98e4039386d..93b7edb5f1e7 100644 --- a/drivers/perf/arm_pmuv3.c +++ b/drivers/perf/arm_pmuv3.c @@ -677,9 +677,25 @@ static inline u32 armv8pmu_getreset_flags(void) return value; } +static void update_pmuserenr(u64 val) +{ + lockdep_assert_irqs_disabled(); + + /* + * The current PMUSERENR_EL0 value might be the value for the guest. + * If that's the case, have KVM keep tracking of the register value + * for the host EL0 so that KVM can restore it before returning to + * the host EL0. Otherwise, update the register now. + */ + if (kvm_set_pmuserenr(val)) + return; + + write_pmuserenr(val); +} + static void armv8pmu_disable_user_access(void) { - write_pmuserenr(0); + update_pmuserenr(0); } static void armv8pmu_enable_user_access(struct arm_pmu *cpu_pmu) @@ -695,8 +711,7 @@ static void armv8pmu_enable_user_access(struct arm_pmu *cpu_pmu) armv8pmu_write_evcntr(i, 0); } - write_pmuserenr(0); - write_pmuserenr(ARMV8_PMU_USERENR_ER | ARMV8_PMU_USERENR_CR); + update_pmuserenr(ARMV8_PMU_USERENR_ER | ARMV8_PMU_USERENR_CR); } static void armv8pmu_enable_event(struct perf_event *event) -- cgit v1.2.3 From ad24919540fb4df83981d469b5253cc1aecca939 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Mon, 5 Jun 2023 15:32:38 +0100 Subject: firmware: cs_dsp: Log correct region name in bin error messages In cs_dsp_load_coeff() region_name should be set in the XM/YM/ZM cases otherwise any errors will log the region as "Unknown". While doing this also change one error message that logged the region type ID to log the region_name instead. This makes it consistent with other messages in the same function. 
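The shape of the fix is to hoist the name lookup above every branch that can fail, so all diagnostics in the function agree. A small illustrative sketch with hypothetical region IDs and names (not the cs_dsp code):

#include <stdio.h>

/* Illustrative: resolve the human-readable region name once, before
 * any branch that can fail, so every error message reports the same
 * name instead of a raw type ID or "Unknown".
 */
static const char *region_name(int type)
{
	switch (type) {
	case 2: return "XM";
	case 3: return "YM";
	default: return "Unknown";
	}
}

static int load_block(int type, int found)
{
	const char *name = region_name(type);	/* hoisted lookup */

	if (!found) {
		fprintf(stderr, "No %s for algorithm\n", name);
		return -1;
	}
	printf("writing %s block\n", name);
	return 0;
}

int main(void)
{
	load_block(2, 1);
	load_block(3, 0);
	return 0;
}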
Signed-off-by: Richard Fitzgerald Link: https://lore.kernel.org/r/20230605143238.4001982-1-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- drivers/firmware/cirrus/cs_dsp.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/firmware/cirrus/cs_dsp.c b/drivers/firmware/cirrus/cs_dsp.c index e4ccfb6a8fa5..ec056f6f40ce 100644 --- a/drivers/firmware/cirrus/cs_dsp.c +++ b/drivers/firmware/cirrus/cs_dsp.c @@ -2124,6 +2124,7 @@ static int cs_dsp_load_coeff(struct cs_dsp *dsp, const struct firmware *firmware file, blocks, le32_to_cpu(blk->len), type, le32_to_cpu(blk->id)); + region_name = cs_dsp_mem_region_name(type); mem = cs_dsp_find_region(dsp, type); if (!mem) { cs_dsp_err(dsp, "No base for region %x\n", type); @@ -2147,8 +2148,8 @@ static int cs_dsp_load_coeff(struct cs_dsp *dsp, const struct firmware *firmware reg = dsp->ops->region_to_reg(mem, reg); reg += offset; } else { - cs_dsp_err(dsp, "No %x for algorithm %x\n", - type, le32_to_cpu(blk->id)); + cs_dsp_err(dsp, "No %s for algorithm %x\n", + region_name, le32_to_cpu(blk->id)); } break; -- cgit v1.2.3 From 306320034e8fbe7ee1cc4f5269c55658b4612048 Mon Sep 17 00:00:00 2001 From: Bernhard Seibold Date: Fri, 2 Jun 2023 15:30:29 +0200 Subject: serial: lantiq: add missing interrupt ack MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, the error interrupt is never acknowledged, so once active it will stay active indefinitely, causing the handler to be called in an infinite loop. Fixes: 2f0fc4159a6a ("SERIAL: Lantiq: Add driver for MIPS Lantiq SOCs.") Cc: Signed-off-by: Bernhard Seibold Reviewed-by: Ilpo Järvinen Message-ID: <20230602133029.546-1-mail@bernhard-seibold.de> Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/lantiq.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/tty/serial/lantiq.c b/drivers/tty/serial/lantiq.c index a58e9277dfad..f1387f1024db 100644 --- a/drivers/tty/serial/lantiq.c +++ b/drivers/tty/serial/lantiq.c @@ -250,6 +250,7 @@ lqasc_err_int(int irq, void *_port) struct ltq_uart_port *ltq_port = to_ltq_uart_port(port); spin_lock_irqsave(<q_port->lock, flags); + __raw_writel(ASC_IRNCR_EIR, port->membase + LTQ_ASC_IRNCR); /* clear any pending interrupts */ asc_update_bits(0, ASCWHBSTATE_CLRPE | ASCWHBSTATE_CLRFE | ASCWHBSTATE_CLRROE, port->membase + LTQ_ASC_WHBSTATE); -- cgit v1.2.3 From bf06fcf4be0feefebd27deb8b60ad262f4230489 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 5 Jun 2023 10:36:15 +0300 Subject: xfrm: add missed call to delete offloaded policies Offloaded policies are deleted through two flows: netdev is going down and policy flush. In both cases, the code lacks relevant call to delete offloaded policy. 
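The pattern of the fix: every teardown path that unlinks a policy must also invoke the device-offload delete hook, or the hardware keeps a stale entry. A simplified userspace model with illustrative names (not the xfrm code):

#include <stdio.h>

struct policy {
	int id;
	int offloaded;
	struct policy *next;
};

static void dev_policy_delete(struct policy *p)
{
	if (p->offloaded)
		printf("hw: removed policy %d\n", p->id);
}

/* Flush mirrors the fix: unlink, then offload-delete, then free. */
static void flush_all(struct policy **head)
{
	struct policy *p;

	while ((p = *head)) {
		*head = p->next;	/* __xfrm_policy_unlink() analogue */
		dev_policy_delete(p);	/* the previously missed call */
		printf("sw: freed policy %d\n", p->id);
	}
}

int main(void)
{
	struct policy b = { .id = 2, .offloaded = 1, .next = NULL };
	struct policy a = { .id = 1, .offloaded = 0, .next = &b };
	struct policy *head = &a;

	flush_all(&head);
	return 0;
}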
Fixes: 919e43fad516 ("xfrm: add an interface to offload policy") Signed-off-by: Leon Romanovsky Reviewed-by: Simon Horman Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index ff58ce6c030c..e7617c9959c3 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1831,6 +1831,7 @@ again: __xfrm_policy_unlink(pol, dir); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); + xfrm_dev_policy_delete(pol); cnt++; xfrm_audit_policy_delete(pol, 1, task_valid); xfrm_policy_kill(pol); @@ -1869,6 +1870,7 @@ again: __xfrm_policy_unlink(pol, dir); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); + xfrm_dev_policy_delete(pol); cnt++; xfrm_audit_policy_delete(pol, 1, task_valid); xfrm_policy_kill(pol); -- cgit v1.2.3 From 1caa71a7a600f7781ce05ef1e84701c459653663 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 7 Jun 2023 15:38:44 +0100 Subject: KVM: arm64: Restore GICv2-on-GICv3 functionality When reworking the vgic locking, the vgic distributor registration got simplified, which was a very good cleanup. But just a tad too radical, as we now register the *native* vgic only, ignoring the GICv2-on-GICv3 that allows pre-historic VMs (or so I thought) to run. As it turns out, QEMU still defaults to GICv2 in some cases, and this breaks Nathan's setup! Fix it by propagating the *requested* vgic type rather than the host's version. Fixes: 59112e9c390b ("KVM: arm64: vgic: Fix a circular locking issue") Reported-by: Nathan Chancellor Tested-by: Nathan Chancellor Signed-off-by: Marc Zyngier link: https://lore.kernel.org/r/20230606221525.GA2269598@dev-arch.thelio-3990X --- arch/arm64/kvm/vgic/vgic-init.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index 6eafc2c45cfc..c8c3cb812783 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -446,6 +446,7 @@ int vgic_lazy_init(struct kvm *kvm) int kvm_vgic_map_resources(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; + enum vgic_type type; gpa_t dist_base; int ret = 0; @@ -460,10 +461,13 @@ int kvm_vgic_map_resources(struct kvm *kvm) if (!irqchip_in_kernel(kvm)) goto out; - if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2) + if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2) { ret = vgic_v2_map_resources(kvm); - else + type = VGIC_V2; + } else { ret = vgic_v3_map_resources(kvm); + type = VGIC_V3; + } if (ret) { __kvm_vgic_destroy(kvm); @@ -473,8 +477,7 @@ int kvm_vgic_map_resources(struct kvm *kvm) dist_base = dist->vgic_dist_base; mutex_unlock(&kvm->arch.config_lock); - ret = vgic_register_dist_iodev(kvm, dist_base, - kvm_vgic_global_state.type); + ret = vgic_register_dist_iodev(kvm, dist_base, type); if (ret) { kvm_err("Unable to register VGIC dist MMIO regions\n"); kvm_vgic_destroy(kvm); -- cgit v1.2.3 From 30c60dda219ddda0bc6ff6ac55d493d9db8be4fa Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Tue, 6 Jun 2023 18:48:14 +0000 Subject: KVM: arm64: Use raw_smp_processor_id() in kvm_pmu_probe_armpmu() Sebastian reports that commit 1c913a1c35aa ("KVM: arm64: Iterate arm_pmus list to probe for default PMU") introduced the following splat with CONFIG_DEBUG_PREEMPT enabled: [70506.110187] BUG: using smp_processor_id() in preemptible [00000000] code: qemu-system-aar/3078242 [70506.119077] caller is debug_smp_processor_id+0x20/0x30 [70506.124229] CPU: 129 PID: 3078242 Comm: qemu-system-aar Tainted: G W 6.4.0-rc5 #25 
[70506.133176] Hardware name: GIGABYTE R181-T92-00/MT91-FS4-00, BIOS F34 08/13/2020 [70506.140559] Call trace: [70506.142993] dump_backtrace+0xa4/0x130 [70506.146737] show_stack+0x20/0x38 [70506.150040] dump_stack_lvl+0x48/0x60 [70506.153704] dump_stack+0x18/0x28 [70506.157007] check_preemption_disabled+0xe4/0x108 [70506.161701] debug_smp_processor_id+0x20/0x30 [70506.166046] kvm_arm_pmu_v3_set_attr+0x460/0x628 [70506.170662] kvm_arm_vcpu_arch_set_attr+0x88/0xd8 [70506.175363] kvm_arch_vcpu_ioctl+0x258/0x4a8 [70506.179632] kvm_vcpu_ioctl+0x32c/0x6b8 [70506.183465] __arm64_sys_ioctl+0xb4/0x100 [70506.187467] invoke_syscall+0x78/0x108 [70506.191205] el0_svc_common.constprop.0+0x4c/0x100 [70506.195984] do_el0_svc+0x34/0x50 [70506.199287] el0_svc+0x34/0x108 [70506.202416] el0t_64_sync_handler+0xf4/0x120 [70506.206674] el0t_64_sync+0x194/0x198 Fix the issue by using the raw variant that bypasses the debug assertion. While at it, stick all of the nuance and UAPI baggage into a comment for posterity. Fixes: 1c913a1c35aa ("KVM: arm64: Iterate arm_pmus list to probe for default PMU") Reported-by: Sebastian Ott Signed-off-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230606184814.456743-1-oliver.upton@linux.dev --- arch/arm64/kvm/pmu-emul.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 491ca7eb2a4c..560650972478 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -700,7 +700,25 @@ static struct arm_pmu *kvm_pmu_probe_armpmu(void) mutex_lock(&arm_pmus_lock); - cpu = smp_processor_id(); + /* + * It is safe to use a stale cpu to iterate the list of PMUs so long as + * the same value is used for the entirety of the loop. Given this, and + * the fact that no percpu data is used for the lookup there is no need + * to disable preemption. + * + * It is still necessary to get a valid cpu, though, to probe for the + * default PMU instance as userspace is not required to specify a PMU + * type. In order to uphold the preexisting behavior KVM selects the + * PMU instance for the core where the first call to the + * KVM_ARM_VCPU_PMU_V3_CTRL attribute group occurs. A dependent use case + * would be a user with disdain of all things big.LITTLE that affines + * the VMM to a particular cluster of cores. + * + * In any case, userspace should just do the sane thing and use the UAPI + * to select a PMU type directly. But, be wary of the baggage being + * carried here. + */ + cpu = raw_smp_processor_id(); list_for_each_entry(entry, &arm_pmus, entry) { tmp = entry->arm_pmu; -- cgit v1.2.3 From de29a96acceae732c68a4094d08dc49079eefa02 Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Fri, 26 May 2023 15:35:37 +0800 Subject: notifier: Initialize new struct srcu_usage field MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In commit 95433f726301 ("srcu: Begin offloading srcu_struct fields to srcu_update"), a new struct srcu_usage field was added, but was not properly initialized. This led to a "spinlock bad magic" BUG when the SRCU notifier was ever used. This was observed in the MediaTek CCI devfreq driver on next-20230525. 
The trimmed stack trace is as follows: BUG: spinlock bad magic on CPU#4, swapper/0/1 lock: 0xffffff80ff529ac0, .magic: 00000000, .owner: /-1, .owner_cpu: 0 Call trace: spin_bug+0xa4/0xe8 do_raw_spin_lock+0xec/0x120 _raw_spin_lock_irqsave+0x78/0xb8 synchronize_srcu+0x3c/0x168 srcu_notifier_chain_unregister+0x5c/0xa0 cpufreq_unregister_notifier+0x94/0xe0 devfreq_passive_event_handler+0x7c/0x3e0 devfreq_remove_device+0x48/0xe8 Add __SRCU_USAGE_INIT() to SRCU_NOTIFIER_INIT() so that srcu_usage gets initialized properly. Reported-by: Jon Hunter Fixes: 95433f726301 ("srcu: Begin offloading srcu_struct fields to srcu_update") Signed-off-by: Chen-Yu Tsai Tested-by: AngeloGioacchino Del Regno Cc: Matthias Brugger Cc: "Rafael J. Wysocki" Cc: "Michał Mirosław" Cc: Dmitry Osipenko Cc: Sachin Sant Cc: Joel Fernandes (Google) Acked-by: Zqiang Signed-off-by: Paul E. McKenney --- include/linux/notifier.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/include/linux/notifier.h b/include/linux/notifier.h index 2aba75145144..86544707236a 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h @@ -106,12 +106,22 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh); #define RAW_NOTIFIER_INIT(name) { \ .head = NULL } +#ifdef CONFIG_TREE_SRCU #define SRCU_NOTIFIER_INIT(name, pcpu) \ { \ .mutex = __MUTEX_INITIALIZER(name.mutex), \ .head = NULL, \ + .srcuu = __SRCU_USAGE_INIT(name.srcuu), \ .srcu = __SRCU_STRUCT_INIT(name.srcu, name.srcuu, pcpu), \ } +#else +#define SRCU_NOTIFIER_INIT(name, pcpu) \ + { \ + .mutex = __MUTEX_INITIALIZER(name.mutex), \ + .head = NULL, \ + .srcu = __SRCU_STRUCT_INIT(name.srcu, name.srcuu, pcpu), \ + } +#endif #define ATOMIC_NOTIFIER_HEAD(name) \ struct atomic_notifier_head name = \ -- cgit v1.2.3 From 713274f1f2c896d37017efee333fd44149710119 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Wed, 7 Jun 2023 15:39:50 +0300 Subject: bpf: Fix verifier id tracking of scalars on spill The following scenario describes a bug in the verifier where it incorrectly concludes about equivalent scalar IDs which could lead to verifier bypass in privileged mode: 1. Prepare a 32-bit rogue number. 2. Put the rogue number into the upper half of a 64-bit register, and roll a random (unknown to the verifier) bit in the lower half. The rest of the bits should be zero (although variations are possible). 3. Assign an ID to the register by MOVing it to another arbitrary register. 4. Perform a 32-bit spill of the register, then perform a 32-bit fill to another register. Due to a bug in the verifier, the ID will be preserved, although the new register will contain only the lower 32 bits, i.e. all zeros except one random bit. At this point there are two registers with different values but the same ID, which means the integrity of the verifier state has been corrupted. 5. Compare the new 32-bit register with 0. In the branch where it's equal to 0, the verifier will believe that the original 64-bit register is also 0, because it has the same ID, but its actual value still contains the rogue number in the upper half. Some optimizations of the verifier prevent the actual bypass, so extra care is needed: the comparison must be between two registers, and both branches must be reachable (this is why one random bit is needed). Both branches are still suitable for the bypass. 6. Right shift the original register by 32 bits to pop the rogue number. 7. Use the rogue number as an offset with any pointer. 
The verifier will believe that the offset is 0, while in reality it's the given number. The fix is similar to the 32-bit BPF_MOV handling in check_alu_op for SCALAR_VALUE. If the spill is narrowing the actual register value, don't keep the ID, make sure it's reset to 0. Fixes: 354e8f1970f8 ("bpf: Support <8-byte scalar spill and refill") Signed-off-by: Maxim Mikityanskiy Signed-off-by: Daniel Borkmann Tested-by: Andrii Nakryiko # Checked veristat delta Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20230607123951.558971-2-maxtram95@gmail.com --- kernel/bpf/verifier.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5871aa78d01a..0dd8adc7a159 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3868,6 +3868,9 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, return err; } save_register_state(state, spi, reg, size); + /* Break the relation on a narrowing spill. */ + if (fls64(reg->umax_value) > BITS_PER_BYTE * size) + state->stack[spi].spilled_ptr.id = 0; } else if (!reg && !(off % BPF_REG_SIZE) && is_bpf_st_mem(insn) && insn->imm != 0 && env->bpf_capable) { struct bpf_reg_state fake_reg = {}; -- cgit v1.2.3 From f57ade27fc3eae16eab0f1edba1248ced429b2f0 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Wed, 7 Jun 2023 15:39:51 +0300 Subject: selftests/bpf: Add test cases to assert proper ID tracking on spill The previous commit fixed a verifier bypass by ensuring that ID is not preserved on narrowing spills. Add the test cases to check the problematic patterns. Signed-off-by: Maxim Mikityanskiy Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20230607123951.558971-3-maxtram95@gmail.com --- .../selftests/bpf/progs/verifier_spill_fill.c | 79 ++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/verifier_spill_fill.c b/tools/testing/selftests/bpf/progs/verifier_spill_fill.c index 136e5530b72c..6115520154e3 100644 --- a/tools/testing/selftests/bpf/progs/verifier_spill_fill.c +++ b/tools/testing/selftests/bpf/progs/verifier_spill_fill.c @@ -371,4 +371,83 @@ __naked void and_then_at_fp_8(void) " ::: __clobber_all); } +SEC("xdp") +__description("32-bit spill of 64-bit reg should clear ID") +__failure __msg("math between ctx pointer and 4294967295 is not allowed") +__naked void spill_32bit_of_64bit_fail(void) +{ + asm volatile (" \ + r6 = r1; \ + /* Roll one bit to force the verifier to track both branches. */\ + call %[bpf_get_prandom_u32]; \ + r0 &= 0x8; \ + /* Put a large number into r1. */ \ + r1 = 0xffffffff; \ + r1 <<= 32; \ + r1 += r0; \ + /* Assign an ID to r1. */ \ + r2 = r1; \ + /* 32-bit spill r1 to stack - should clear the ID! */\ + *(u32*)(r10 - 8) = r1; \ + /* 32-bit fill r2 from stack. */ \ + r2 = *(u32*)(r10 - 8); \ + /* Compare r2 with another register to trigger find_equal_scalars.\ + * Having one random bit is important here, otherwise the verifier cuts\ + * the corners. 
If the ID was mistakenly preserved on spill, this would\ + * cause the verifier to think that r1 is also equal to zero in one of\ + * the branches, and equal to eight on the other branch.\ + */ \ + r3 = 0; \ + if r2 != r3 goto l0_%=; \ +l0_%=: r1 >>= 32; \ + /* At this point, if the verifier thinks that r1 is 0, an out-of-bounds\ + * read will happen, because it actually contains 0xffffffff.\ + */ \ + r6 += r1; \ + r0 = *(u32*)(r6 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("xdp") +__description("16-bit spill of 32-bit reg should clear ID") +__failure __msg("dereference of modified ctx ptr R6 off=65535 disallowed") +__naked void spill_16bit_of_32bit_fail(void) +{ + asm volatile (" \ + r6 = r1; \ + /* Roll one bit to force the verifier to track both branches. */\ + call %[bpf_get_prandom_u32]; \ + r0 &= 0x8; \ + /* Put a large number into r1. */ \ + w1 = 0xffff0000; \ + r1 += r0; \ + /* Assign an ID to r1. */ \ + r2 = r1; \ + /* 16-bit spill r1 to stack - should clear the ID! */\ + *(u16*)(r10 - 8) = r1; \ + /* 16-bit fill r2 from stack. */ \ + r2 = *(u16*)(r10 - 8); \ + /* Compare r2 with another register to trigger find_equal_scalars.\ + * Having one random bit is important here, otherwise the verifier cuts\ + * the corners. If the ID was mistakenly preserved on spill, this would\ + * cause the verifier to think that r1 is also equal to zero in one of\ + * the branches, and equal to eight on the other branch.\ + */ \ + r3 = 0; \ + if r2 != r3 goto l0_%=; \ +l0_%=: r1 >>= 16; \ + /* At this point, if the verifier thinks that r1 is 0, an out-of-bounds\ + * read will happen, because it actually contains 0xffff.\ + */ \ + r6 += r1; \ + r0 = *(u32*)(r6 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 1166a530a84758bb9e6b448fc8c195ed413f5ded Mon Sep 17 00:00:00 2001 From: Maciej Żenczykowski Date: Mon, 5 Jun 2023 04:06:54 -0700 Subject: xfrm: fix inbound ipv4/udp/esp packets to UDPv6 dualstack sockets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before Linux v5.8 an AF_INET6 SOCK_DGRAM (udp/udplite) socket with SOL_UDP, UDP_ENCAP, UDP_ENCAP_ESPINUDP{,_NON_IKE} enabled would just unconditionally use xfrm4_udp_encap_rcv(), afterwards such a socket would use the newly added xfrm6_udp_encap_rcv() which only handles IPv6 packets. 
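For illustration, a hedged sketch of the userspace setup that reaches this path (error handling trimmed; UDP_ENCAP and UDP_ENCAP_ESPINUDP come from <linux/udp.h>): a dual-stack AF_INET6 socket with ESP-in-UDP decapsulation enabled. IPv4 peers reach such a socket through mapped addresses, so IPv4 ESP-in-UDP packets end up in the IPv6 encap handler.

    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <linux/udp.h>

    static int open_espinudp_socket(void)
    {
        int fd = socket(AF_INET6, SOCK_DGRAM, 0);
        int v6only = 0;
        int encap = UDP_ENCAP_ESPINUDP;

        /* dual stack: IPv4 peers appear as ::ffff:a.b.c.d */
        setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &v6only, sizeof(v6only));
        /* ask the kernel to decapsulate ESP-in-UDP on this socket */
        setsockopt(fd, IPPROTO_UDP, UDP_ENCAP, &encap, sizeof(encap));
        return fd;
    }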
Cc: Sabrina Dubroca Cc: Steffen Klassert Cc: Jakub Kicinski Cc: Benedict Wong Cc: Yan Yan Fixes: 0146dca70b87 ("xfrm: add support for UDPv6 encapsulation of ESP") Signed-off-by: Maciej Żenczykowski Reviewed-by: Simon Horman Reviewed-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- net/ipv4/xfrm4_input.c | 1 + net/ipv6/xfrm6_input.c | 3 +++ 2 files changed, 4 insertions(+) diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index ad2afeef4f10..eac206a290d0 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -164,6 +164,7 @@ drop: kfree_skb(skb); return 0; } +EXPORT_SYMBOL(xfrm4_udp_encap_rcv); int xfrm4_rcv(struct sk_buff *skb) { diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index 04cbeefd8982..4907ab241d6b 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -86,6 +86,9 @@ int xfrm6_udp_encap_rcv(struct sock *sk, struct sk_buff *skb) __be32 *udpdata32; __u16 encap_type = up->encap_type; + if (skb->protocol == htons(ETH_P_IP)) + return xfrm4_udp_encap_rcv(sk, skb); + /* if this is not encapsulated socket, then just return now */ if (!encap_type) return 1; -- cgit v1.2.3 From dfaed3e1fa7099de8de4e89cbe7eb9c1bca27dfe Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 7 Jun 2023 10:56:00 +1000 Subject: powerpc/64s/radix: Fix exit lazy tlb mm switch with irqs enabled Switching mm and tinkering with current->active_mm should be done with irqs disabled. There is a path where exit_lazy_flush_tlb can be called with irqs enabled: exit_lazy_flush_tlb flush_type_needed __flush_all_mm tlb_finish_mmu exit_mmap Which results in the switching being done with irqs enabled, which is incorrect. Fixes: a665eec0a22e ("powerpc/64s/radix: Fix mm_cpumask trimming race vs kthread_use_mm") Cc: stable@vger.kernel.org # v5.10+ Reported-by: Sachin Sant Link: https://lore.kernel.org/linuxppc-dev/A9A5D83D-BA70-47A4-BCB4-30C1AE19BC22@linux.ibm.com/ Tested-by: Sachin Sant Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://msgid.link/20230607005601.583293-1-npiggin@gmail.com --- arch/powerpc/mm/book3s64/radix_tlb.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c index ce804b7bf84e..0bd4866d9824 100644 --- a/arch/powerpc/mm/book3s64/radix_tlb.c +++ b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -795,12 +795,20 @@ void exit_lazy_flush_tlb(struct mm_struct *mm, bool always_flush) goto out; if (current->active_mm == mm) { + unsigned long flags; + WARN_ON_ONCE(current->mm != NULL); - /* Is a kernel thread and is using mm as the lazy tlb */ + /* + * It is a kernel thread and is using mm as the lazy tlb, so + * switch it to init_mm. This is not always called from IPI + * (e.g., flush_type_needed), so must disable irqs. + */ + local_irq_save(flags); mmgrab_lazy_tlb(&init_mm); current->active_mm = &init_mm; switch_mm_irqs_off(mm, &init_mm, current); mmdrop_lazy_tlb(mm); + local_irq_restore(flags); } /* -- cgit v1.2.3 From 3e1b9b2d81901b3ceeb5ec1f1b4c41cd1cff53ba Mon Sep 17 00:00:00 2001 From: Carl Vanderlip Date: Fri, 2 Jun 2023 15:04:39 -0600 Subject: accel/qaic: Free user handle on interrupted mutex After user handle is allocated, if mutex is interrupted, we do not free the user handle and return an error. Kref had been initialized, but not added to users list, so device teardown would also not call free_usr. 
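A hedged, generic sketch of the unwinding pattern at stake (the example_* names are stand-ins, not the actual qaic code): every exit path taken after ida_alloc() succeeds must also pass through ida_free().

    #include <linux/idr.h>
    #include <linux/list.h>
    #include <linux/mutex.h>
    #include <linux/slab.h>

    static DEFINE_IDA(example_usrs);

    static int example_open(struct example_dev *edev, struct example_user **out)
    {
        struct example_user *usr;
        int ret;

        usr = kzalloc(sizeof(*usr), GFP_KERNEL);
        if (!usr)
            return -ENOMEM;

        usr->handle = ida_alloc(&example_usrs, GFP_KERNEL);
        if (usr->handle < 0) {
            ret = usr->handle;
            goto free_usr;
        }

        ret = mutex_lock_interruptible(&edev->lock);
        if (ret)
            goto free_handle;   /* the unwind step the fix adds */

        list_add(&usr->node, &edev->users);
        mutex_unlock(&edev->lock);
        *out = usr;
        return 0;

    free_handle:
        ida_free(&example_usrs, usr->handle);
    free_usr:
        kfree(usr);
        return ret;
    }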
Fixes: c501ca23a6a3 ("accel/qaic: Add uapi and core driver file") Signed-off-by: Carl Vanderlip Reviewed-by: Pranjal Ramajor Asha Kanojiya Reviewed-by: Jeffrey Hugo Signed-off-by: Jeffrey Hugo Link: https://patchwork.freedesktop.org/patch/msgid/20230602210440.8411-2-quic_jhugo@quicinc.com --- drivers/accel/qaic/qaic_drv.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/accel/qaic/qaic_drv.c b/drivers/accel/qaic/qaic_drv.c index 2d0828db28d8..961cd341b414 100644 --- a/drivers/accel/qaic/qaic_drv.c +++ b/drivers/accel/qaic/qaic_drv.c @@ -97,6 +97,7 @@ static int qaic_open(struct drm_device *dev, struct drm_file *file) cleanup_usr: cleanup_srcu_struct(&usr->qddev_lock); + ida_free(&qaic_usrs, usr->handle); free_usr: kfree(usr); dev_unlock: -- cgit v1.2.3 From 61d8cdb7872c82d8a4d5e5251b0010332c316a67 Mon Sep 17 00:00:00 2001 From: Jeffrey Hugo Date: Fri, 2 Jun 2023 15:04:40 -0600 Subject: accel/qaic: Fix NULL pointer deref in qaic_destroy_drm_device() If qaic_destroy_drm_device() is called before the device has fully initialized it will cause a NULL pointer dereference as the drm device has not yet been created. Fix this with a NULL check. Fixes: c501ca23a6a3 ("accel/qaic: Add uapi and core driver file") Signed-off-by: Jeffrey Hugo Reviewed-by: Carl Vanderlip Reviewed-by: Pranjal Ramajor Asha Kanojiya Link: https://patchwork.freedesktop.org/patch/msgid/20230602210440.8411-3-quic_jhugo@quicinc.com --- drivers/accel/qaic/qaic_drv.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/accel/qaic/qaic_drv.c b/drivers/accel/qaic/qaic_drv.c index 961cd341b414..b5ba550a0c04 100644 --- a/drivers/accel/qaic/qaic_drv.c +++ b/drivers/accel/qaic/qaic_drv.c @@ -225,6 +225,9 @@ static void qaic_destroy_drm_device(struct qaic_device *qdev, s32 partition_id) struct qaic_user *usr; qddev = qdev->qddev; + qdev->qddev = NULL; + if (!qddev) + return; /* * Existing users get unresolvable errors till they close FDs. -- cgit v1.2.3 From a2a871483161014f1bcc4e9a04354b01aa77cedb Mon Sep 17 00:00:00 2001 From: Edson Juliano Drosdeck Date: Fri, 9 Jun 2023 17:10:58 -0300 Subject: ALSA: hda/realtek: Add a quirk for Compaq N14JP6 Add a quirk for Compaq N14JP6 to fixup ALC897 headset MIC no sound. Signed-off-by: Edson Juliano Drosdeck Cc: Link: https://lore.kernel.org/r/20230609201058.523499-1-edson.drosdeck@gmail.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index a5d55a7063d3..308ec7034cc9 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -11740,6 +11740,7 @@ static const struct snd_pci_quirk alc662_fixup_tbl[] = { SND_PCI_QUIRK(0x1b0a, 0x01b8, "ACER Veriton", ALC662_FIXUP_ACER_VERITON), SND_PCI_QUIRK(0x1b35, 0x1234, "CZC ET26", ALC662_FIXUP_CZC_ET26), SND_PCI_QUIRK(0x1b35, 0x2206, "CZC P10T", ALC662_FIXUP_CZC_P10T), + SND_PCI_QUIRK(0x1c6c, 0x1239, "Compaq N14JP6-V2", ALC897_FIXUP_HP_HSMIC_VERB), #if 0 /* Below is a quirk table taken from the old code. -- cgit v1.2.3 From 20cb1c2fb7568a6054c55defe044311397e01ddb Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 10 Jun 2023 07:42:49 +0800 Subject: blk-cgroup: Flush stats before releasing blkcg_gq As noted by Michal, the blkg_iostat_set's in the lockless list hold reference to blkg's to protect against their removal. Those blkg's hold reference to blkcg. 
When a cgroup is being destroyed, cgroup_rstat_flush() is only called at css_release_work_fn() which is called when the blkcg reference count reaches 0. This circular dependency will prevent blkcg and some blkgs from being freed after they are made offline. It is less a problem if the cgroup to be destroyed also has other controllers like memory that will call cgroup_rstat_flush() which will clean up the reference count. If block is the only controller that uses rstat, these offline blkcg and blkgs may never be freed leaking more and more memory over time. To prevent this potential memory leak: - flush blkcg per-cpu stats list in __blkg_release(), when no new stat can be added - add global blkg_stat_lock for covering concurrent parent blkg stat update - don't grab bio->bi_blkg reference when adding the stats into blkcg's per-cpu stat list since all stats are guaranteed to be consumed before releasing blkg instance, and grabbing blkg reference for stats was the most fragile part of original patch Based on Waiman's patch: https://lore.kernel.org/linux-block/20221215033132.230023-3-longman@redhat.com/ Fixes: 3b8cc6298724 ("blk-cgroup: Optimize blkcg_rstat_flush()") Cc: stable@vger.kernel.org Reported-by: Jay Shin Acked-by: Tejun Heo Cc: Waiman Long Cc: mkoutny@suse.com Cc: Yosry Ahmed Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20230609234249.1412858-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 40 +++++++++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 0ce64dd73cfe..f0b5c9c41cde 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -34,6 +34,8 @@ #include "blk-ioprio.h" #include "blk-throttle.h" +static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu); + /* * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation. * blkcg_pol_register_mutex nests outside of it and synchronizes entire @@ -56,6 +58,8 @@ static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */ bool blkcg_debug_stats = false; +static DEFINE_RAW_SPINLOCK(blkg_stat_lock); + #define BLKG_DESTROY_BATCH_SIZE 64 /* @@ -163,10 +167,20 @@ static void blkg_free(struct blkcg_gq *blkg) static void __blkg_release(struct rcu_head *rcu) { struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head); + struct blkcg *blkcg = blkg->blkcg; + int cpu; #ifdef CONFIG_BLK_CGROUP_PUNT_BIO WARN_ON(!bio_list_empty(&blkg->async_bios)); #endif + /* + * Flush all the non-empty percpu lockless lists before releasing + * us, given these stat belongs to us. + * + * blkg_stat_lock is for serializing blkg stat update + */ + for_each_possible_cpu(cpu) + __blkcg_rstat_flush(blkcg, cpu); /* release the blkcg and parent blkg refs this blkg has been holding */ css_put(&blkg->blkcg->css); @@ -951,23 +965,26 @@ static void blkcg_iostat_update(struct blkcg_gq *blkg, struct blkg_iostat *cur, u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags); } -static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) +static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu) { - struct blkcg *blkcg = css_to_blkcg(css); struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu); struct llist_node *lnode; struct blkg_iostat_set *bisc, *next_bisc; - /* Root-level stats are sourced from system-wide IO stats */ - if (!cgroup_parent(css->cgroup)) - return; - rcu_read_lock(); lnode = llist_del_all(lhead); if (!lnode) goto out; + /* + * For covering concurrent parent blkg update from blkg_release(). 
+ * + * When flushing from cgroup, cgroup_rstat_lock is always held, so + * this lock won't cause contention most of time. + */ + raw_spin_lock(&blkg_stat_lock); + /* * Iterate only the iostat_cpu's queued in the lockless list. */ @@ -991,13 +1008,19 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) if (parent && parent->parent) blkcg_iostat_update(parent, &blkg->iostat.cur, &blkg->iostat.last); - percpu_ref_put(&blkg->refcnt); } - + raw_spin_unlock(&blkg_stat_lock); out: rcu_read_unlock(); } +static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) +{ + /* Root-level stats are sourced from system-wide IO stats */ + if (cgroup_parent(css->cgroup)) + __blkcg_rstat_flush(css_to_blkcg(css), cpu); +} + /* * We source root cgroup stats from the system-wide stats to avoid * tracking the same information twice and incurring overhead when no @@ -2075,7 +2098,6 @@ void blk_cgroup_bio_start(struct bio *bio) llist_add(&bis->lnode, lhead); WRITE_ONCE(bis->lqueued, true); - percpu_ref_get(&bis->blkg->refcnt); } u64_stats_update_end_irqrestore(&bis->sync, flags); -- cgit v1.2.3 From 842665a9008a53ff13ac22a4e4b8ae2f10e92aca Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 7 Jun 2023 16:38:47 +0800 Subject: xfrm: Use xfrm_state selector for BEET input For BEET the inner address and therefore family is stored in the xfrm_state selector. Use that when decapsulating an input packet instead of incorrectly relying on a non-existent tunnel protocol. Fixes: 5f24f41e8ea6 ("xfrm: Remove inner/outer modes from input path") Reported-by: Steffen Klassert Signed-off-by: Herbert Xu Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_input.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index b731687b5f3e..815b38080401 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -331,11 +331,10 @@ xfrm_inner_mode_encap_remove(struct xfrm_state *x, { switch (x->props.mode) { case XFRM_MODE_BEET: - switch (XFRM_MODE_SKB_CB(skb)->protocol) { - case IPPROTO_IPIP: - case IPPROTO_BEETPH: + switch (x->sel.family) { + case AF_INET: return xfrm4_remove_beet_encap(x, skb); - case IPPROTO_IPV6: + case AF_INET6: return xfrm6_remove_beet_encap(x, skb); } break; -- cgit v1.2.3 From e6f9e590b72e12bbb86b1b8be7e1981f357392ad Mon Sep 17 00:00:00 2001 From: Stephan Gerhold Date: Thu, 18 May 2023 11:39:36 +0200 Subject: mmc: sdhci-msm: Disable broken 64-bit DMA on MSM8916 While SDHCI claims to support 64-bit DMA on MSM8916 it does not seem to be properly functional. It is not immediately obvious because SDHCI is usually used with IOMMU bypassed on this SoC, and all physical memory has 32-bit addresses. 
But when trying to enable the IOMMU it quickly fails with an error such as the following: arm-smmu 1e00000.iommu: Unhandled context fault: fsr=0x402, iova=0xfffff200, fsynr=0xe0000, cbfrsynra=0x140, cb=3 mmc1: ADMA error: 0x02000000 mmc1: sdhci: ============ SDHCI REGISTER DUMP =========== mmc1: sdhci: Sys addr: 0x00000000 | Version: 0x00002e02 mmc1: sdhci: Blk size: 0x00000008 | Blk cnt: 0x00000000 mmc1: sdhci: Argument: 0x00000000 | Trn mode: 0x00000013 mmc1: sdhci: Present: 0x03f80206 | Host ctl: 0x00000019 mmc1: sdhci: Power: 0x0000000f | Blk gap: 0x00000000 mmc1: sdhci: Wake-up: 0x00000000 | Clock: 0x00000007 mmc1: sdhci: Timeout: 0x0000000a | Int stat: 0x00000001 mmc1: sdhci: Int enab: 0x03ff900b | Sig enab: 0x03ff100b mmc1: sdhci: ACmd stat: 0x00000000 | Slot int: 0x00000000 mmc1: sdhci: Caps: 0x322dc8b2 | Caps_1: 0x00008007 mmc1: sdhci: Cmd: 0x0000333a | Max curr: 0x00000000 mmc1: sdhci: Resp[0]: 0x00000920 | Resp[1]: 0x5b590000 mmc1: sdhci: Resp[2]: 0xe6487f80 | Resp[3]: 0x0a404094 mmc1: sdhci: Host ctl2: 0x00000008 mmc1: sdhci: ADMA Err: 0x00000001 | ADMA Ptr: 0x0000000ffffff224 mmc1: sdhci_msm: ----------- VENDOR REGISTER DUMP ----------- mmc1: sdhci_msm: DLL sts: 0x00000000 | DLL cfg: 0x60006400 | DLL cfg2: 0x00000000 mmc1: sdhci_msm: DLL cfg3: 0x00000000 | DLL usr ctl: 0x00000000 | DDR cfg: 0x00000000 mmc1: sdhci_msm: Vndr func: 0x00018a9c | Vndr func2 : 0xf88018a8 Vndr func3: 0x00000000 mmc1: sdhci: ============================================ mmc1: sdhci: fffffffff200: DMA 0x0000ffffffffe100, LEN 0x0008, Attr=0x21 mmc1: sdhci: fffffffff20c: DMA 0x0000000000000000, LEN 0x0000, Attr=0x03 Looking closely it's obvious that only the 32-bit part of the address (0xfffff200) arrives at the SMMU, the higher 16-bit (0xffff...) get lost somewhere. This might not be a limitation of the SDHCI itself but perhaps the bus/interconnect it is connected to, or even the connection to the SMMU. Work around this by setting SDHCI_QUIRK2_BROKEN_64_BIT_DMA to avoid using 64-bit addresses. Signed-off-by: Stephan Gerhold Acked-by: Adrian Hunter Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20230518-msm8916-64bit-v1-1-5694b0f35211@gerhold.net Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-msm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/mmc/host/sdhci-msm.c b/drivers/mmc/host/sdhci-msm.c index 8ac81d57a3df..1877d583fe8c 100644 --- a/drivers/mmc/host/sdhci-msm.c +++ b/drivers/mmc/host/sdhci-msm.c @@ -2479,6 +2479,9 @@ static inline void sdhci_msm_get_of_property(struct platform_device *pdev, msm_host->ddr_config = DDR_CONFIG_POR_VAL; of_property_read_u32(node, "qcom,dll-config", &msm_host->dll_config); + + if (of_device_is_compatible(node, "qcom,msm8916-sdhci")) + host->quirks2 |= SDHCI_QUIRK2_BROKEN_64_BIT_DMA; } static int sdhci_msm_gcc_reset(struct device *dev, struct sdhci_host *host) -- cgit v1.2.3 From ad96f1c9138e0897bee7f7c5e54b3e24f8b62f57 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 8 Jun 2023 17:54:39 -0700 Subject: bpf: Fix a bpf_jit_dump issue for x86_64 with sysctl bpf_jit_enable. The sysctl net/core/bpf_jit_enable does not work now due to commit 1022a5498f6f ("bpf, x86_64: Use bpf_jit_binary_pack_alloc"). The commit saved the jitted insns into 'rw_image' instead of 'image' which caused bpf_jit_dump not dumping proper content. With 'echo 2 > /proc/sys/net/core/bpf_jit_enable', run './test_progs -t fentry_test'. 
Without this patch, one of the jitted images for a particular prog is: flen=17 proglen=92 pass=4 image=0000000014c64883 from=test_progs pid=1807 00000000: cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc 00000010: cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc 00000020: cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc 00000030: cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc 00000040: cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc 00000050: cc cc cc cc cc cc cc cc cc cc cc cc With this patch, the jitted image for the same prog is: flen=17 proglen=92 pass=4 image=00000000b90254b7 from=test_progs pid=1809 00000000: f3 0f 1e fa 0f 1f 44 00 00 66 90 55 48 89 e5 f3 00000010: 0f 1e fa 31 f6 48 8b 57 00 48 83 fa 07 75 2b 48 00000020: 8b 57 10 83 fa 09 75 22 48 8b 57 08 48 81 e2 ff 00000030: 00 00 00 48 83 fa 08 75 11 48 8b 7f 18 be 01 00 00000040: 00 00 48 83 ff 0a 74 02 31 f6 48 bf 18 d0 14 00 00000050: 00 c9 ff ff 48 89 77 00 31 c0 c9 c3 Fixes: 1022a5498f6f ("bpf, x86_64: Use bpf_jit_binary_pack_alloc") Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20230609005439.3173569-1-yhs@fb.com --- arch/x86/net/bpf_jit_comp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 1056bbf55b17..438adb695daa 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -2570,7 +2570,7 @@ out_image: } if (bpf_jit_enable > 1) - bpf_jit_dump(prog->len, proglen, pass + 1, image); + bpf_jit_dump(prog->len, proglen, pass + 1, rw_image); if (image) { if (!prog->is_func || extra_pass) { -- cgit v1.2.3 From f0cc749254d12c78e93dae3b27b21dc9546843d0 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Sun, 11 Jun 2023 22:48:12 +0900 Subject: cgroup,freezer: hold cpu_hotplug_lock before freezer_mutex in freezer_css_{online,offline}() syzbot is again reporting a circular locking dependency between cpu_hotplug_lock and freezer_mutex. Do the same as commit 57dcd64c7e036299 ("cgroup,freezer: hold cpu_hotplug_lock before freezer_mutex").
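The underlying rule, as a hedged generic sketch (the example_* names are stand-ins, not the freezer code): static_branch_inc()/static_branch_dec() take cpu_hotplug_lock internally, so a mutex held around them must nest inside cpu_hotplug_lock on every path, and the _cpuslocked variants must be used once the lock is already held.

    #include <linux/cpu.h>
    #include <linux/jump_label.h>
    #include <linux/mutex.h>

    static DEFINE_STATIC_KEY_FALSE(example_key);
    static DEFINE_MUTEX(example_mutex);

    /* deadlock-prone: cpu_hotplug_lock ends up nested inside example_mutex */
    static void fragile(void)
    {
        mutex_lock(&example_mutex);
        static_branch_inc(&example_key);    /* takes cpu_hotplug_lock */
        mutex_unlock(&example_mutex);
    }

    /* safe: cpu_hotplug_lock is taken first, consistent with other paths */
    static void fixed(void)
    {
        cpus_read_lock();
        mutex_lock(&example_mutex);
        static_branch_inc_cpuslocked(&example_key);
        mutex_unlock(&example_mutex);
        cpus_read_unlock();
    }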
Reported-by: syzbot Closes: https://syzkaller.appspot.com/bug?extid=2ab700fe1829880a2ec6 Signed-off-by: Tetsuo Handa Tested-by: syzbot Fixes: f5d39b020809 ("freezer,sched: Rewrite core freezer logic") Cc: stable@vger.kernel.org # v6.1+ Signed-off-by: Tejun Heo --- kernel/cgroup/legacy_freezer.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c index 936473203a6b..122dacb3a443 100644 --- a/kernel/cgroup/legacy_freezer.c +++ b/kernel/cgroup/legacy_freezer.c @@ -108,16 +108,18 @@ static int freezer_css_online(struct cgroup_subsys_state *css) struct freezer *freezer = css_freezer(css); struct freezer *parent = parent_freezer(freezer); + cpus_read_lock(); mutex_lock(&freezer_mutex); freezer->state |= CGROUP_FREEZER_ONLINE; if (parent && (parent->state & CGROUP_FREEZING)) { freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN; - static_branch_inc(&freezer_active); + static_branch_inc_cpuslocked(&freezer_active); } mutex_unlock(&freezer_mutex); + cpus_read_unlock(); return 0; } @@ -132,14 +134,16 @@ static void freezer_css_offline(struct cgroup_subsys_state *css) { struct freezer *freezer = css_freezer(css); + cpus_read_lock(); mutex_lock(&freezer_mutex); if (freezer->state & CGROUP_FREEZING) - static_branch_dec(&freezer_active); + static_branch_dec_cpuslocked(&freezer_active); freezer->state = 0; mutex_unlock(&freezer_mutex); + cpus_read_unlock(); } static void freezer_css_free(struct cgroup_subsys_state *css) -- cgit v1.2.3 From 6f363f5aa845561f7ea496d8b1175e3204470486 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Sat, 10 Jun 2023 17:26:43 +0800 Subject: cgroup: Do not corrupt task iteration when rebinding subsystem MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We found a refcount UAF bug as follows: refcount_t: addition on 0; use-after-free. 
WARNING: CPU: 1 PID: 342 at lib/refcount.c:25 refcount_warn_saturate+0xa0/0x148 Workqueue: events cpuset_hotplug_workfn Call trace: refcount_warn_saturate+0xa0/0x148 __refcount_add.constprop.0+0x5c/0x80 css_task_iter_advance_css_set+0xd8/0x210 css_task_iter_advance+0xa8/0x120 css_task_iter_next+0x94/0x158 update_tasks_root_domain+0x58/0x98 rebuild_root_domains+0xa0/0x1b0 rebuild_sched_domains_locked+0x144/0x188 cpuset_hotplug_workfn+0x138/0x5a0 process_one_work+0x1e8/0x448 worker_thread+0x228/0x3e0 kthread+0xe0/0xf0 ret_from_fork+0x10/0x20 then a kernel panic will be triggered as below: Unable to handle kernel paging request at virtual address 00000000c0000010 Call trace: cgroup_apply_control_disable+0xa4/0x16c rebind_subsystems+0x224/0x590 cgroup_destroy_root+0x64/0x2e0 css_free_rwork_fn+0x198/0x2a0 process_one_work+0x1d4/0x4bc worker_thread+0x158/0x410 kthread+0x108/0x13c ret_from_fork+0x10/0x18 The race that cause this bug can be shown as below: (hotplug cpu) | (umount cpuset) mutex_lock(&cpuset_mutex) | mutex_lock(&cgroup_mutex) cpuset_hotplug_workfn | rebuild_root_domains | rebind_subsystems update_tasks_root_domain | spin_lock_irq(&css_set_lock) css_task_iter_start | list_move_tail(&cset->e_cset_node[ss->id] while(css_task_iter_next) | &dcgrp->e_csets[ss->id]); css_task_iter_end | spin_unlock_irq(&css_set_lock) mutex_unlock(&cpuset_mutex) | mutex_unlock(&cgroup_mutex) Inside css_task_iter_start/next/end, css_set_lock is hold and then released, so when iterating task(left side), the css_set may be moved to another list(right side), then it->cset_head points to the old list head and it->cset_pos->next points to the head node of new list, which can't be used as struct css_set. To fix this issue, switch from all css_sets to only scgrp's css_sets to patch in-flight iterators to preserve correct iteration, and then update it->cset_head as well. 
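As a hedged toy model of the fix (the toy_* types are stand-ins for the cgroup structures): an iterator that detects the end of iteration by comparing against a saved head pointer must be repointed when the entries move to another list, otherwise it walks past the end of the new list and dereferences a bare list head as if it were an entry.

    #include <linux/list.h>

    struct toy_iter {
        struct list_head *pos;   /* current position */
        struct list_head *head;  /* saved head, marks the end of iteration */
    };

    static void toy_move_all(struct list_head *src, struct list_head *dst,
                             struct toy_iter *it)
    {
        struct list_head *n, *tmp;

        list_for_each_safe(n, tmp, src)
            list_move_tail(n, dst);

        /* in-flight iterators still compare against the old head */
        if (it && it->head == src)
            it->head = dst;
    }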
Reported-by: Gaosheng Cui Link: https://www.spinics.net/lists/cgroups/msg37935.html Suggested-by: Michal Koutný Link: https://lore.kernel.org/all/20230526114139.70274-1-xiujianfeng@huaweicloud.com/ Signed-off-by: Xiu Jianfeng Fixes: 2d8f243a5e6e ("cgroup: implement cgroup->e_csets[]") Cc: stable@vger.kernel.org # v3.16+ Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 245cf62ce85a..4d42f0cbc11e 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1798,7 +1798,7 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) { struct cgroup *dcgrp = &dst_root->cgrp; struct cgroup_subsys *ss; - int ssid, i, ret; + int ssid, ret; u16 dfl_disable_ss_mask = 0; lockdep_assert_held(&cgroup_mutex); @@ -1842,7 +1842,8 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) struct cgroup_root *src_root = ss->root; struct cgroup *scgrp = &src_root->cgrp; struct cgroup_subsys_state *css = cgroup_css(scgrp, ss); - struct css_set *cset; + struct css_set *cset, *cset_pos; + struct css_task_iter *it; WARN_ON(!css || cgroup_css(dcgrp, ss)); @@ -1860,9 +1861,22 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) css->cgroup = dcgrp; spin_lock_irq(&css_set_lock); - hash_for_each(css_set_table, i, cset, hlist) + WARN_ON(!list_empty(&dcgrp->e_csets[ss->id])); + list_for_each_entry_safe(cset, cset_pos, &scgrp->e_csets[ss->id], + e_cset_node[ss->id]) { list_move_tail(&cset->e_cset_node[ss->id], &dcgrp->e_csets[ss->id]); + /* + * all css_sets of scgrp together in same order to dcgrp, + * patch in-flight iterators to preserve correct iteration. + * since the iterator is always advanced right away and + * finished when it->cset_pos meets it->cset_head, so only + * update it->cset_head is enough here. + */ + list_for_each_entry(it, &cset->task_iters, iters_node) + if (it->cset_head == &scgrp->e_csets[ss->id]) + it->cset_head = &dcgrp->e_csets[ss->id]; + } spin_unlock_irq(&css_set_lock); if (ss->css_rstat_flush) { -- cgit v1.2.3 From 3d6f6d2b584e7a629158be8a3f44f5de00b337ee Mon Sep 17 00:00:00 2001 From: Alexandre Mergnat Date: Fri, 26 May 2023 15:10:43 +0200 Subject: clk: mediatek: mt8365: Fix index issue Before the patch [1], the clock probe was done directly in the clk-mt8365 driver. In this probe function, the array which stores the clock data is sized using the highest defined number (*_NR_CLOCK) in the clock lists [2]. Currently, with the patch [1], the specific clk-mt8365 probe function is replaced by the mtk generic one [3], which sizes the clock data array by adding up the sizes of all the clock descriptor arrays provided by the clk-mt8365 driver. All clock indexes come from the header file [2]. That means that if there are more clocks (and thus more indexes) in the header file [2] than clocks declared in the clock descriptor arrays (which is currently the case), the clock data array will be undersized and the generic probe function will overflow when it tries to write to "clk_data[CLK_INDEX]". In practice, instead of crashing at boot, the probe function logs an error such as "of_clk_hw_onecell_get: invalid index 135", and the corresponding clock is not enabled. Solve this issue by adding to the driver the missing clocks declared in the clock header file [2].
[1]: Commit ffe91cb28f6a ("clk: mediatek: mt8365: Convert to mtk_clk_simple_{probe,remove}()") [2]: include/dt-bindings/clock/mediatek,mt8365-clk.h [3]: drivers/clk/mediatek/clk-mtk.c Fixes: ffe91cb28f6a ("clk: mediatek: mt8365: Convert to mtk_clk_simple_{probe,remove}()") Signed-off-by: Alexandre Mergnat Link: https://lore.kernel.org/r/20230517-fix-clk-index-v3-1-be4df46065c4@baylibre.com Tested-by: Markus Schneider-Pargmann Reviewed-by: Markus Schneider-Pargmann Signed-off-by: Stephen Boyd --- drivers/clk/mediatek/clk-mt8365.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/clk/mediatek/clk-mt8365.c b/drivers/clk/mediatek/clk-mt8365.c index 6b4e193f648d..c366ac1611e6 100644 --- a/drivers/clk/mediatek/clk-mt8365.c +++ b/drivers/clk/mediatek/clk-mt8365.c @@ -23,6 +23,7 @@ static DEFINE_SPINLOCK(mt8365_clk_lock); static const struct mtk_fixed_clk top_fixed_clks[] = { + FIXED_CLK(CLK_TOP_CLK_NULL, "clk_null", NULL, 0), FIXED_CLK(CLK_TOP_I2S0_BCK, "i2s0_bck", NULL, 26000000), FIXED_CLK(CLK_TOP_DSI0_LNTC_DSICK, "dsi0_lntc_dsick", "clk26m", 75000000), @@ -559,6 +560,14 @@ static const struct mtk_clk_divider top_adj_divs[] = { 0x324, 16, 8, CLK_DIVIDER_ROUND_CLOSEST), DIV_ADJ_F(CLK_TOP_APLL12_CK_DIV3, "apll12_ck_div3", "apll_i2s3_sel", 0x324, 24, 8, CLK_DIVIDER_ROUND_CLOSEST), + DIV_ADJ_F(CLK_TOP_APLL12_CK_DIV4, "apll12_ck_div4", "apll_tdmout_sel", + 0x328, 0, 8, CLK_DIVIDER_ROUND_CLOSEST), + DIV_ADJ_F(CLK_TOP_APLL12_CK_DIV4B, "apll12_ck_div4b", "apll_tdmout_sel", + 0x328, 8, 8, CLK_DIVIDER_ROUND_CLOSEST), + DIV_ADJ_F(CLK_TOP_APLL12_CK_DIV5, "apll12_ck_div5", "apll_tdmin_sel", + 0x328, 16, 8, CLK_DIVIDER_ROUND_CLOSEST), + DIV_ADJ_F(CLK_TOP_APLL12_CK_DIV5B, "apll12_ck_div5b", "apll_tdmin_sel", + 0x328, 24, 8, CLK_DIVIDER_ROUND_CLOSEST), DIV_ADJ_F(CLK_TOP_APLL12_CK_DIV6, "apll12_ck_div6", "apll_spdif_sel", 0x32c, 0, 8, CLK_DIVIDER_ROUND_CLOSEST), }; @@ -696,6 +705,7 @@ static const struct mtk_gate ifr_clks[] = { GATE_IFR3(CLK_IFR_GCPU, "ifr_gcpu", "axi_sel", 8), GATE_IFR3(CLK_IFR_TRNG, "ifr_trng", "axi_sel", 9), GATE_IFR3(CLK_IFR_AUXADC, "ifr_auxadc", "clk26m", 10), + GATE_IFR3(CLK_IFR_CPUM, "ifr_cpum", "clk26m", 11), GATE_IFR3(CLK_IFR_AUXADC_MD, "ifr_auxadc_md", "clk26m", 14), GATE_IFR3(CLK_IFR_AP_DMA, "ifr_ap_dma", "axi_sel", 18), GATE_IFR3(CLK_IFR_DEBUGSYS, "ifr_debugsys", "axi_sel", 24), @@ -717,6 +727,8 @@ static const struct mtk_gate ifr_clks[] = { GATE_IFR5(CLK_IFR_PWRAP_TMR, "ifr_pwrap_tmr", "clk26m", 12), GATE_IFR5(CLK_IFR_PWRAP_SPI, "ifr_pwrap_spi", "clk26m", 13), GATE_IFR5(CLK_IFR_PWRAP_SYS, "ifr_pwrap_sys", "clk26m", 14), + GATE_MTK_FLAGS(CLK_IFR_MCU_PM_BK, "ifr_mcu_pm_bk", NULL, &ifr5_cg_regs, + 17, &mtk_clk_gate_ops_setclr, CLK_IGNORE_UNUSED), GATE_IFR5(CLK_IFR_IRRX_26M, "ifr_irrx_26m", "clk26m", 22), GATE_IFR5(CLK_IFR_IRRX_32K, "ifr_irrx_32k", "clk32k", 23), GATE_IFR5(CLK_IFR_I2C0_AXI, "ifr_i2c0_axi", "i2c_sel", 24), -- cgit v1.2.3 From 7833b865953c8e62abc76a3261c04132b2fb69de Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 8 Jun 2023 11:10:25 +0200 Subject: btrfs: fix iomap_begin length for nocow writes can_nocow_extent can reduce the len passed in, which needs to be propagated to btrfs_dio_iomap_begin so that iomap does not submit more data then is mapped. 
This problem has existed since the btrfs_get_blocks_direct helper was added in commit c5794e51784a ("btrfs: Factor out write portion of btrfs_get_blocks_direct"), but the ordered_extent splitting added in commit b73a6fd1b1ef ("btrfs: split partial dio bios before submit") added a WARN_ON that made a syzkaller test fail. Reported-by: syzbot+ee90502d5c8fd1d0dd93@syzkaller.appspotmail.com Fixes: c5794e51784a ("btrfs: Factor out write portion of btrfs_get_blocks_direct") CC: stable@vger.kernel.org # 6.1+ Tested-by: syzbot+ee90502d5c8fd1d0dd93@syzkaller.appspotmail.com Reviewed-by: Filipe Manana Signed-off-by: Christoph Hellwig Signed-off-by: David Sterba --- fs/btrfs/inode.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 19c707bc8801..3f99f02dc1fe 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7264,7 +7264,7 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, static int btrfs_get_blocks_direct_write(struct extent_map **map, struct inode *inode, struct btrfs_dio_data *dio_data, - u64 start, u64 len, + u64 start, u64 *lenp, unsigned int iomap_flags) { const bool nowait = (iomap_flags & IOMAP_NOWAIT); @@ -7275,6 +7275,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, struct btrfs_block_group *bg; bool can_nocow = false; bool space_reserved = false; + u64 len = *lenp; u64 prev_len; int ret = 0; @@ -7345,15 +7346,19 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, free_extent_map(em); *map = NULL; - if (nowait) - return -EAGAIN; + if (nowait) { + ret = -EAGAIN; + goto out; + } /* * If we could not allocate data space before locking the file * range and we can't do a NOCOW write, then we have to fail. */ - if (!dio_data->data_space_reserved) - return -ENOSPC; + if (!dio_data->data_space_reserved) { + ret = -ENOSPC; + goto out; + } /* * We have to COW and we have already reserved data space before, @@ -7394,6 +7399,7 @@ out: btrfs_delalloc_release_extents(BTRFS_I(inode), len); btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true); } + *lenp = len; return ret; } @@ -7570,7 +7576,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, if (write) { ret = btrfs_get_blocks_direct_write(&em, inode, dio_data, - start, len, flags); + start, &len, flags); if (ret < 0) goto unlock_err; unlock_extents = true; -- cgit v1.2.3 From deccae40e4b30f98837e44225194d80c8baf2233 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 9 Jun 2023 10:53:41 -0700 Subject: btrfs: can_nocow_file_extent should pass down args->strict from callers Commit 619104ba453ad0 ("btrfs: move common NOCOW checks against a file extent into a helper") changed our call to btrfs_cross_ref_exist() to always pass false for the 'strict' parameter. We're passing this down through the stack so that we can do a full check for cross references during swapfile activation. With strict always false, this test fails: btrfs subvol create swappy chattr +C swappy fallocate -l1G swappy/swapfile chmod 600 swappy/swapfile mkswap swappy/swapfile btrfs subvol snap swappy swapsnap btrfs subvol del -C swapsnap btrfs fi sync / sync;sync;sync swapon swappy/swapfile The fix is to just use args->strict, and everyone except swapfile activation is passing false.
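For context, a hedged reading of what the flag means at this call site (the call is quoted from the diff below; the comments paraphrase the semantics and are not the commit's wording):

    /*
     * strict == false: ordinary NOCOW write checks; missing a shared
     * reference is tolerable because the write falls back to COW.
     * strict == true: swapfile activation; swap IO bypasses COW, so a
     * shared extent must be detected and must fail the activation.
     */
    ret = btrfs_cross_ref_exist(root, btrfs_ino(inode),
                                key->offset - args->extent_offset,
                                args->disk_bytenr, args->strict, path);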
Fixes: 619104ba453ad0 ("btrfs: move common NOCOW checks against a file extent into a helper") CC: stable@vger.kernel.org # 6.1+ Reviewed-by: Filipe Manana Signed-off-by: Chris Mason Signed-off-by: David Sterba --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3f99f02dc1fe..7fcafcc5292c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1864,7 +1864,7 @@ static int can_nocow_file_extent(struct btrfs_path *path, ret = btrfs_cross_ref_exist(root, btrfs_ino(inode), key->offset - args->extent_offset, - args->disk_bytenr, false, path); + args->disk_bytenr, args->strict, path); WARN_ON_ONCE(ret > 0 && is_freespace_inode); if (ret != 0) goto out; -- cgit v1.2.3 From 745806fb4554f334e6406fa82b328562aa48f08f Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Sun, 11 Jun 2023 08:09:13 +0800 Subject: btrfs: do not ASSERT() on duplicated global roots [BUG] Syzbot reports a reproducible ASSERT() when using rescue=usebackuproot mount option on a corrupted fs. The full report can be found here: https://syzkaller.appspot.com/bug?extid=c4614eae20a166c25bf0 BTRFS error (device loop0: state C): failed to load root csum assertion failed: !tmp, in fs/btrfs/disk-io.c:1103 ------------[ cut here ]------------ kernel BUG at fs/btrfs/ctree.h:3664! invalid opcode: 0000 [#1] PREEMPT SMP KASAN CPU: 1 PID: 3608 Comm: syz-executor356 Not tainted 6.0.0-rc7-syzkaller-00029-g3800a713b607 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 08/26/2022 RIP: 0010:assertfail+0x1a/0x1c fs/btrfs/ctree.h:3663 RSP: 0018:ffffc90003aaf250 EFLAGS: 00010246 RAX: 0000000000000032 RBX: 0000000000000000 RCX: f21c13f886638400 RDX: 0000000000000000 RSI: 0000000080000000 RDI: 0000000000000000 RBP: ffff888021c640a0 R08: ffffffff816bd38d R09: ffffed10173667f1 R10: ffffed10173667f1 R11: 1ffff110173667f0 R12: dffffc0000000000 R13: ffff8880229c21f7 R14: ffff888021c64060 R15: ffff8880226c0000 FS: 0000555556a73300(0000) GS:ffff8880b9b00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055a2637d7a00 CR3: 00000000709c4000 CR4: 00000000003506e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: btrfs_global_root_insert+0x1a7/0x1b0 fs/btrfs/disk-io.c:1103 load_global_roots_objectid+0x482/0x8c0 fs/btrfs/disk-io.c:2467 load_global_roots fs/btrfs/disk-io.c:2501 [inline] btrfs_read_roots fs/btrfs/disk-io.c:2528 [inline] init_tree_roots+0xccb/0x203c fs/btrfs/disk-io.c:2939 open_ctree+0x1e53/0x33df fs/btrfs/disk-io.c:3574 btrfs_fill_super+0x1c6/0x2d0 fs/btrfs/super.c:1456 btrfs_mount_root+0x885/0x9a0 fs/btrfs/super.c:1824 legacy_get_tree+0xea/0x180 fs/fs_context.c:610 vfs_get_tree+0x88/0x270 fs/super.c:1530 fc_mount fs/namespace.c:1043 [inline] vfs_kern_mount+0xc9/0x160 fs/namespace.c:1073 btrfs_mount+0x3d3/0xbb0 fs/btrfs/super.c:1884 [CAUSE] Since the introduction of global roots, we handle csum/extent/free-space-tree roots as global roots, even if no extent-tree-v2 feature is enabled. So for regular csum/extent/fst roots, we load them into fs_info::global_root_tree rb tree. And we should not expect any conflicts in that rb tree, thus we have an ASSERT() inside btrfs_global_root_insert(). But rescue=usebackuproot can break the assumption, as we will try to load those trees again and again as long as we have bad roots and have backup roots slot remaining. 
So in that case we can have conflicting roots in the rb tree, and triggering the ASSERT() crash. [FIX] We can safely remove that ASSERT(), as the caller will properly put the offending root. To make further debugging easier, also add two explicit error messages: - Error message for conflicting global roots - Error message when using backup roots slot Reported-by: syzbot+a694851c6ab28cbcfb9c@syzkaller.appspotmail.com Fixes: abed4aaae4f7 ("btrfs: track the csum, extent, and free space trees in a rb tree") CC: stable@vger.kernel.org # 6.1+ Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 88e6d1072a35..dabc79c1af1b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -996,13 +996,18 @@ int btrfs_global_root_insert(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; struct rb_node *tmp; + int ret = 0; write_lock(&fs_info->global_root_lock); tmp = rb_find_add(&root->rb_node, &fs_info->global_root_tree, global_root_cmp); write_unlock(&fs_info->global_root_lock); - ASSERT(!tmp); - return tmp ? -EEXIST : 0; + if (tmp) { + ret = -EEXIST; + btrfs_warn(fs_info, "global root %llu %llu already exists", + root->root_key.objectid, root->root_key.offset); + } + return ret; } void btrfs_global_root_delete(struct btrfs_root *root) @@ -2842,6 +2847,7 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) /* We can't trust the free space cache either */ btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE); + btrfs_warn(fs_info, "try to load backup roots slot %d", i); ret = read_backup_root(fs_info, i); backup_index = ret; if (ret < 0) -- cgit v1.2.3 From d54fb4b25a0261bf2f2bb7093fdf11a36718bf25 Mon Sep 17 00:00:00 2001 From: Sebastian Reichel Date: Fri, 26 May 2023 19:10:56 +0200 Subject: clk: composite: Fix handling of high clock rates ULONG_MAX is used by a few drivers to figure out the highest available clock rate via clk_round_rate(clk, ULONG_MAX). Since abs() takes a signed value as input, the current logic effectively calculates with ULONG_MAX = -1, which results in the worst parent clock being chosen instead of the best one. For example on Rockchip RK3588 the eMMC driver tries to figure out the highest available clock rate. There are three parent clocks available resulting in the following rate diffs with the existing logic: GPLL: abs(18446744073709551615 - 1188000000) = 1188000001 CPLL: abs(18446744073709551615 - 1500000000) = 1500000001 XIN24M: abs(18446744073709551615 - 24000000) = 24000001 As a result the clock framework will promote a maximum supported clock rate of 24 MHz, even though 1.5GHz are possible. With the updated logic any casting between signed and unsigned is avoided and the numbers look like this instead: GPLL: 18446744073709551615 - 1188000000 = 18446744072521551615 CPLL: 18446744073709551615 - 1500000000 = 18446744072209551615 XIN24M: 18446744073709551615 - 24000000 = 18446744073685551615 As a result the parent with the highest acceptable rate is chosen instead of the parent clock with the lowest one. 
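The arithmetic is easy to check in a few lines of userspace C (hedged: the kernel's type-generic abs() is modelled here with an explicit cast to long):

    #include <limits.h>
    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
        unsigned long req = ULONG_MAX;      /* clk_round_rate(clk, ULONG_MAX) */
        unsigned long xin24m = 24000000UL;
        unsigned long gpll = 1188000000UL;

        /* signed view: ULONG_MAX reads as -1, so the slowest parent "wins" */
        printf("%ld %ld\n", labs((long)req - (long)xin24m),   /* 24000001 */
               labs((long)req - (long)gpll));                 /* 1188000001 */

        /* unsigned view: the fastest parent has the smallest difference */
        printf("%lu %lu\n", req - xin24m, req - gpll);
        return 0;
    }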
Cc: stable@vger.kernel.org Fixes: 49502408007b ("mmc: sdhci-of-dwcmshc: properly determine max clock on Rockchip") Tested-by: Christopher Obbard Signed-off-by: Sebastian Reichel Link: https://lore.kernel.org/r/20230526171057.66876-2-sebastian.reichel@collabora.com Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Stephen Boyd --- drivers/clk/clk-composite.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/clk/clk-composite.c b/drivers/clk/clk-composite.c index edfa94641bbf..66759fe28fad 100644 --- a/drivers/clk/clk-composite.c +++ b/drivers/clk/clk-composite.c @@ -119,7 +119,10 @@ static int clk_composite_determine_rate(struct clk_hw *hw, if (ret) continue; - rate_diff = abs(req->rate - tmp_req.rate); + if (req->rate >= tmp_req.rate) + rate_diff = req->rate - tmp_req.rate; + else + rate_diff = tmp_req.rate - req->rate; if (!rate_diff || !req->best_parent_hw || best_rate_diff > rate_diff) { -- cgit v1.2.3 From a1043fbc8f99c7df2d70993eecad3baee07dc180 Mon Sep 17 00:00:00 2001 From: Markus Schneider-Pargmann Date: Thu, 11 May 2023 15:32:26 +0200 Subject: clk: mediatek: mt8365: Fix inverted topclk operations The given operations are inverted for the wrong registers which makes multiple of the mt8365 hardware units unusable. In my setup at least usb did not work. Fixed by swapping the operations with the inverted ones. Reported-by: Alexandre Mergnat Fixes: 905b7430d3cc ("clk: mediatek: mt8365: Convert simple_gate to mtk_gate clocks") Signed-off-by: Markus Schneider-Pargmann Link: https://lore.kernel.org/r/20230511133226.913600-1-msp@baylibre.com Tested-by: Alexandre Mergnat Reviewed-by: Alexandre Mergnat Reviewed-by: Matthias Brugger Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Stephen Boyd --- drivers/clk/mediatek/clk-mt8365.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/clk/mediatek/clk-mt8365.c b/drivers/clk/mediatek/clk-mt8365.c index c366ac1611e6..c87a6c4a7967 100644 --- a/drivers/clk/mediatek/clk-mt8365.c +++ b/drivers/clk/mediatek/clk-mt8365.c @@ -592,15 +592,15 @@ static const struct mtk_gate_regs top2_cg_regs = { #define GATE_TOP0(_id, _name, _parent, _shift) \ GATE_MTK(_id, _name, _parent, &top0_cg_regs, \ - _shift, &mtk_clk_gate_ops_no_setclr_inv) + _shift, &mtk_clk_gate_ops_no_setclr) #define GATE_TOP1(_id, _name, _parent, _shift) \ GATE_MTK(_id, _name, _parent, &top1_cg_regs, \ - _shift, &mtk_clk_gate_ops_no_setclr) + _shift, &mtk_clk_gate_ops_no_setclr_inv) #define GATE_TOP2(_id, _name, _parent, _shift) \ GATE_MTK(_id, _name, _parent, &top2_cg_regs, \ - _shift, &mtk_clk_gate_ops_no_setclr) + _shift, &mtk_clk_gate_ops_no_setclr_inv) static const struct mtk_gate top_clk_gates[] = { GATE_TOP0(CLK_TOP_CONN_32K, "conn_32k", "clk32k", 10), -- cgit v1.2.3 From 2a809ddca08d0d1b9ac961815ce04305814e179b Mon Sep 17 00:00:00 2001 From: Binbin Zhou Date: Wed, 24 May 2023 09:49:24 +0800 Subject: clk: clk-loongson2: Zero init clk_init_data As clk_core_populate_parent_map() checks clk_init_data.num_parents first, and checks clk_init_data.parent_names[] before clk_init_data.parent_data[] and clk_init_data.parent_hws[]. Therefore the clk_init_data structure needs to be explicitly initialised to prevent an unexpected crash if clk_init_data.parent_names[] is a random value. 
CPU 0 Unable to handle kernel paging request at virtual address 0000000000000dc0, era == 9000000002986290, ra == 900000000298624c Oops[#1]: CPU: 0 PID: 1 Comm: swapper/0 Not tainted 6.4.0-rc2+ #4582 pc 9000000002986290 ra 900000000298624c tp 9000000100094000 sp 9000000100097a60 a0 9000000104541e00 a1 0000000000000000 a2 0000000000000dc0 a3 0000000000000001 a4 90000001000979f0 a5 90000001800977d7 a6 0000000000000000 a7 900000000362a000 t0 90000000034f3548 t1 6f8c2a9cb5ab5f64 t2 0000000000011340 t3 90000000031cf5b0 t4 0000000000000dc0 t5 0000000000000004 t6 0000000000011300 t7 9000000104541e40 t8 000000000005a4f8 u0 9000000104541e00 s9 9000000104541e00 s0 9000000104bc4700 s1 9000000104541da8 s2 0000000000000001 s3 900000000356f9d8 s4 ffffffffffffffff s5 0000000000000000 s6 0000000000000dc0 s7 90000000030d0a88 s8 0000000000000000 ra: 900000000298624c __clk_register+0x228/0x84c ERA: 9000000002986290 __clk_register+0x26c/0x84c CRMD: 000000b0 (PLV0 -IE -DA +PG DACF=CC DACM=CC -WE) PRMD: 00000004 (PPLV0 +PIE -PWE) EUEN: 00000000 (-FPE -SXE -ASXE -BTE) ECFG: 00071c1c (LIE=2-4,10-12 VS=7) ESTAT: 00010000 [PIL] (IS= ECode=1 EsubCode=0) BADV: 0000000000000dc0 PRID: 0014a000 (Loongson-64bit, ) Modules linked in: Process swapper/0 (pid: 1, threadinfo=(____ptrval____), task=(____ptrval____)) Stack : 90000000031c1810 90000000030d0a88 900000000325bac0 90000000034f3548 90000001002ab410 9000000104541e00 0000000000000dc0 9000000003150098 90000000031c1810 90000000031a0460 900000000362a000 90000001002ab410 900000000362a000 9000000104541da8 9000000104541de8 90000001002ab410 900000000362a000 9000000002986a68 90000000034f3ed8 90000000030d0aa8 9000000104541da8 900000000298d3b8 90000000031c1810 0000000000000000 90000000034f3ed8 90000000030d0aa8 0000000000000dc0 90000000030d0a88 90000001002ab410 900000000298d401 0000000000000000 6f8c2a9cb5ab5f64 90000000034f4000 90000000030d0a88 9000000003a48a58 90000001002ab410 9000000104bd81a8 900000000298d484 9000000100020260 0000000000000000 ... 
Call Trace: [<9000000002986290>] __clk_register+0x26c/0x84c [<9000000002986a68>] devm_clk_hw_register+0x5c/0xe0 [<900000000298d3b8>] loongson2_clk_register.constprop.0+0xdc/0x10c [<900000000298d484>] loongson2_clk_probe+0x9c/0x4ac [<9000000002a4eba4>] platform_probe+0x68/0xc8 [<9000000002a4bf80>] really_probe+0xbc/0x2f0 [<9000000002a4c23c>] __driver_probe_device+0x88/0x128 [<9000000002a4c318>] driver_probe_device+0x3c/0x11c [<9000000002a4c5dc>] __driver_attach+0x98/0x18c [<9000000002a49ca0>] bus_for_each_dev+0x80/0xe0 [<9000000002a4b0dc>] bus_add_driver+0xfc/0x1ec [<9000000002a4d4a8>] driver_register+0x68/0x134 [<90000000020f0110>] do_one_initcall+0x50/0x188 [<9000000003150f00>] kernel_init_freeable+0x224/0x294 [<90000000030240fc>] kernel_init+0x20/0x110 [<90000000020f1568>] ret_from_kernel_thread+0xc/0xa4 Fixes: acc0ccffec50 ("clk: clk-loongson2: add clock controller driver support") Cc: stable@vger.kernel.org Cc: Yinbo Zhu Signed-off-by: Binbin Zhou Link: https://lore.kernel.org/r/20230524014924.2869051-1-zhoubinbin@loongson.cn Signed-off-by: Stephen Boyd --- drivers/clk/clk-loongson2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/clk/clk-loongson2.c b/drivers/clk/clk-loongson2.c index 70ae1dd2e474..bacdcbb287ac 100644 --- a/drivers/clk/clk-loongson2.c +++ b/drivers/clk/clk-loongson2.c @@ -40,7 +40,7 @@ static struct clk_hw *loongson2_clk_register(struct device *dev, { int ret; struct clk_hw *hw; - struct clk_init_data init; + struct clk_init_data init = { }; hw = devm_kzalloc(dev, sizeof(*hw), GFP_KERNEL); if (!hw) -- cgit v1.2.3 From 3e6ac852fbc71a234de24b5455086f6b98d3d958 Mon Sep 17 00:00:00 2001 From: Biju Das Date: Tue, 30 May 2023 17:17:20 +0100 Subject: usb: gadget: udc: renesas_usb3: Fix RZ/V2M {modprobe,bind} error Currently {modprobe, bind} after {rmmod, unbind} results in probe failure. genirq: Flags mismatch irq 22. 00000004 (85070400.usb3drd) vs. 00000004 (85070400.usb3drd) renesas_usb3: probe of 85070000.usb3peri failed with error -16 The reason is, it is trying to register an interrupt handler for the same IRQ twice. The devm_request_irq() was called with the parent device. So the interrupt handler won't be unregistered when the usb3-peri device is unbound. Fix this issue by replacing "parent dev"->"dev" as the irq resource is managed by this driver. 
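A hedged sketch of the devm rule at work (hypothetical child driver; parent_ddata and child_irq_handler are stand-ins): the device passed to devm_request_irq() decides when the handler is torn down.

    static int child_probe(struct platform_device *pdev)
    {
        struct parent_ddata *ddata = dev_get_drvdata(pdev->dev.parent);

        /*
         * Bad: bound to the parent device, the handler survives this
         * driver's unbind, so a later re-probe requests the same IRQ
         * again and fails with a flags mismatch:
         *
         *   devm_request_irq(ddata->dev, ddata->irq, child_irq_handler,
         *                    0, dev_name(ddata->dev), pdev);
         *
         * Good: bound to this device, released automatically on unbind.
         */
        return devm_request_irq(&pdev->dev, ddata->irq, child_irq_handler,
                                0, dev_name(&pdev->dev), pdev);
    }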
Fixes: 9cad72dfc556 ("usb: gadget: Add support for RZ/V2M USB3DRD driver") Cc: stable Signed-off-by: Biju Das Reviewed-by: Geert Uytterhoeven Message-ID: <20230530161720.179927-1-biju.das.jz@bp.renesas.com> Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/udc/renesas_usb3.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/usb/gadget/udc/renesas_usb3.c b/drivers/usb/gadget/udc/renesas_usb3.c index aac8bc185afa..eb008e873361 100644 --- a/drivers/usb/gadget/udc/renesas_usb3.c +++ b/drivers/usb/gadget/udc/renesas_usb3.c @@ -2877,9 +2877,9 @@ static int renesas_usb3_probe(struct platform_device *pdev) struct rzv2m_usb3drd *ddata = dev_get_drvdata(pdev->dev.parent); usb3->drd_reg = ddata->reg; - ret = devm_request_irq(ddata->dev, ddata->drd_irq, + ret = devm_request_irq(&pdev->dev, ddata->drd_irq, renesas_usb3_otg_irq, 0, - dev_name(ddata->dev), usb3); + dev_name(&pdev->dev), usb3); if (ret < 0) return ret; } -- cgit v1.2.3 From 00f8205ffcf112dcef14f8151d78075d38d22c08 Mon Sep 17 00:00:00 2001 From: Elson Roy Serrao Date: Thu, 1 Jun 2023 14:27:30 -0700 Subject: usb: dwc3: gadget: Reset num TRBs before giving back the request Consider a scenario where cable disconnect happens when there is an active USB request queued to the UDC. As part of the disconnect we would issue an end transfer with no interrupt-on-completion before giving back this request. Since we are giving back the request without skipping TRBs, the num_trbs field of dwc3_request still holds the stale value previously used. Function drivers re-use the same request for a given bind-unbind session and hence their dwc3_request context gets preserved across cable disconnect/connect. When such a request gets re-queued after cable connect, we would increase the num_trbs field on top of the previous stale value, thus incorrectly representing the number of TRBs used. Fix this by resetting the num_trbs field before giving back the request. Fixes: 09fe1f8d7e2f ("usb: dwc3: gadget: track number of TRBs per request") Cc: stable Signed-off-by: Elson Roy Serrao Acked-by: Thinh Nguyen Message-ID: <1685654850-8468-1-git-send-email-quic_eserrao@quicinc.com> Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/gadget.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c index d831f5acf7b5..b78599dd705c 100644 --- a/drivers/usb/dwc3/gadget.c +++ b/drivers/usb/dwc3/gadget.c @@ -198,6 +198,7 @@ static void dwc3_gadget_del_and_unmap_request(struct dwc3_ep *dep, list_del(&req->list); req->remaining = 0; req->needs_extra_trb = false; + req->num_trbs = 0; if (req->request.status == -EINPROGRESS) req->request.status = status; -- cgit v1.2.3 From d2d69354226de0b333d4405981f3d9c41ba8430a Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 7 Jun 2023 12:05:39 +0200 Subject: USB: dwc3: qcom: fix NULL-deref on suspend The Qualcomm dwc3 glue driver is currently accessing the driver data of the child core device during suspend and on wakeup interrupts. This is clearly a bad idea as the child may not have probed yet or could have been unbound from its driver. The first such layering violation was part of the initial version of the driver, but this was later made worse when the hack that accesses the driver data of the grandchild xhci device to configure the wakeup interrupts was added. Fixing this properly is not easily done, so add a sanity check to make sure that the child driver data is non-NULL before dereferencing it for now.
Note that this relies on subtleties like the fact that driver core is making sure that the parent is not suspended while the child is probing. Reported-by: Manivannan Sadhasivam Link: https://lore.kernel.org/all/20230325165217.31069-4-manivannan.sadhasivam@linaro.org/ Fixes: d9152161b4bf ("usb: dwc3: Add Qualcomm DWC3 glue layer driver") Fixes: 6895ea55c385 ("usb: dwc3: qcom: Configure wakeup interrupts during suspend") Cc: stable@vger.kernel.org # 3.18: a872ab303d5d: "usb: dwc3: qcom: fix use-after-free on runtime-PM wakeup" Cc: Sandeep Maheswaram Cc: Krishna Kurapati Signed-off-by: Johan Hovold Acked-by: Thinh Nguyen Reviewed-by: Manivannan Sadhasivam Message-ID: <20230607100540.31045-2-johan+linaro@kernel.org> Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/dwc3-qcom.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/usb/dwc3/dwc3-qcom.c b/drivers/usb/dwc3/dwc3-qcom.c index 959fc925ca7c..79b22abf9727 100644 --- a/drivers/usb/dwc3/dwc3-qcom.c +++ b/drivers/usb/dwc3/dwc3-qcom.c @@ -308,7 +308,16 @@ static void dwc3_qcom_interconnect_exit(struct dwc3_qcom *qcom) /* Only usable in contexts where the role can not change. */ static bool dwc3_qcom_is_host(struct dwc3_qcom *qcom) { - struct dwc3 *dwc = platform_get_drvdata(qcom->dwc3); + struct dwc3 *dwc; + + /* + * FIXME: Fix this layering violation. + */ + dwc = platform_get_drvdata(qcom->dwc3); + + /* Core driver may not have probed yet. */ + if (!dwc) + return false; return dwc->xhci; } -- cgit v1.2.3 From e3dbb657571509044be15184a13134fa7c1fdca1 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 7 Jun 2023 12:05:40 +0200 Subject: USB: dwc3: fix use-after-free on core driver unbind Some dwc3 glue drivers are currently accessing the driver data of the child core device directly, which is clearly a bad idea as the child may not have probed yet or may have been unbound from its driver. As a workaround until the glue drivers have been fixed, clear the driver data pointer before allowing the glue parent device to runtime suspend to prevent its driver from accessing data that has been freed during unbind. Fixes: 6dd2565989b4 ("usb: dwc3: add imx8mp dwc3 glue layer driver") Fixes: 6895ea55c385 ("usb: dwc3: qcom: Configure wakeup interrupts during suspend") Cc: stable@vger.kernel.org # 5.12 Cc: Li Jun Cc: Sandeep Maheswaram Cc: Krishna Kurapati Signed-off-by: Johan Hovold Acked-by: Thinh Nguyen Reviewed-by: Manivannan Sadhasivam Message-ID: <20230607100540.31045-3-johan+linaro@kernel.org> Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/core.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/usb/dwc3/core.c b/drivers/usb/dwc3/core.c index 7b2ce013cc5b..d68958e151a7 100644 --- a/drivers/usb/dwc3/core.c +++ b/drivers/usb/dwc3/core.c @@ -1929,6 +1929,11 @@ static int dwc3_remove(struct platform_device *pdev) pm_runtime_disable(&pdev->dev); pm_runtime_dont_use_autosuspend(&pdev->dev); pm_runtime_put_noidle(&pdev->dev); + /* + * HACK: Clear the driver data, which is currently accessed by parent + * glue drivers, before allowing the parent to suspend. + */ + platform_set_drvdata(pdev, NULL); pm_runtime_set_suspended(&pdev->dev); dwc3_free_event_buffers(dwc); -- cgit v1.2.3 From c4a8bfabefed706bb9150867db528ceefd5cb5fe Mon Sep 17 00:00:00 2001 From: Heikki Krogerus Date: Tue, 6 Jun 2023 14:58:02 +0300 Subject: usb: typec: ucsi: Fix command cancellation The Cancel command was passed to the write callback as the offset instead of as the actual command which caused NULL pointer dereference. 
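A hedged sketch of the difference (the ->async_write() callback takes (ucsi, offset, val, val_len), so the second argument is a register offset, not a command):

    /* before: UCSI_CANCEL passed as the offset, no command data written */
    ucsi->ops->async_write(ucsi, UCSI_CANCEL, NULL, 0);

    /* after: CANCEL is issued like any other command, through the control
     * register path, and its completion is acknowledged */
    ret = ucsi_exec_command(ucsi, UCSI_CANCEL);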
Reported-by: Stephan Bolten Closes: https://bugzilla.kernel.org/show_bug.cgi?id=217517 Fixes: 094902bc6a3c ("usb: typec: ucsi: Always cancel the command if PPM reports BUSY condition") Cc: stable@vger.kernel.org Signed-off-by: Heikki Krogerus Message-ID: <20230606115802.79339-1-heikki.krogerus@linux.intel.com> Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/ucsi/ucsi.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/usb/typec/ucsi/ucsi.c b/drivers/usb/typec/ucsi/ucsi.c index 2b472ec01dc4..b664ecbb798b 100644 --- a/drivers/usb/typec/ucsi/ucsi.c +++ b/drivers/usb/typec/ucsi/ucsi.c @@ -132,10 +132,8 @@ static int ucsi_exec_command(struct ucsi *ucsi, u64 cmd) if (ret) return ret; - if (cci & UCSI_CCI_BUSY) { - ucsi->ops->async_write(ucsi, UCSI_CANCEL, NULL, 0); - return -EBUSY; - } + if (cmd != UCSI_CANCEL && cci & UCSI_CCI_BUSY) + return ucsi_exec_command(ucsi, UCSI_CANCEL); if (!(cci & UCSI_CCI_COMMAND_COMPLETE)) return -EIO; @@ -149,6 +147,11 @@ static int ucsi_exec_command(struct ucsi *ucsi, u64 cmd) return ucsi_read_error(ucsi); } + if (cmd == UCSI_CANCEL && cci & UCSI_CCI_CANCEL_COMPLETE) { + ret = ucsi_acknowledge_command(ucsi); + return ret ? ret : -EBUSY; + } + return UCSI_CCI_LENGTH(cci); } -- cgit v1.2.3 From 92c9c3baad6b1fd584fbabeaa4756f9b77926cb5 Mon Sep 17 00:00:00 2001 From: Pavan Holla Date: Wed, 7 Jun 2023 19:33:26 +0000 Subject: usb: typec: Fix fast_role_swap_current show function The current implementation mistakenly performs a & operation on the output of sysfs_emit. This patch performs the & operation before calling sysfs_emit. Fixes: 662a60102c12 ("usb: typec: Separate USB Power Delivery from USB Type-C") Cc: stable Reported-by: Benson Leung Signed-off-by: Pavan Holla Reviewed-by: Heikki Krogerus Reviewed-by: Benson Leung Message-ID: <20230607193328.3359487-1-pholla@chromium.org> Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/pd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/typec/pd.c b/drivers/usb/typec/pd.c index 0bcde1ff4d39..8cc66e4467c4 100644 --- a/drivers/usb/typec/pd.c +++ b/drivers/usb/typec/pd.c @@ -95,7 +95,7 @@ peak_current_show(struct device *dev, struct device_attribute *attr, char *buf) static ssize_t fast_role_swap_current_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sysfs_emit(buf, "%u\n", to_pdo(dev)->pdo >> PDO_FIXED_FRS_CURR_SHIFT) & 3; + return sysfs_emit(buf, "%u\n", (to_pdo(dev)->pdo >> PDO_FIXED_FRS_CURR_SHIFT) & 3); } static DEVICE_ATTR_RO(fast_role_swap_current); -- cgit v1.2.3 From 50966da807c81c5eb3bdfd392990fe0bba94d1ee Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Fri, 9 Jun 2023 01:02:26 +0000 Subject: usb: gadget: udc: core: Offload usb_udc_vbus_handler processing usb_udc_vbus_handler() can be invoked from interrupt context by irq handlers of the gadget drivers, however, usb_udc_connect_control() has to run in non-atomic context due to the following: a. Some of the gadget driver implementations expect the ->pullup callback to be invoked in non-atomic context. b. usb_gadget_disconnect() acquires udc_lock which is a mutex. Hence offload invocation of usb_udc_connect_control() to workqueue. UDC should not be pulled up unless gadget driver is bound. The new flag "allow_connect" is now set by gadget_bind_driver() and cleared by gadget_unbind_driver(). This prevents work item to pull up the gadget even if queued when the gadget driver is already unbound. 
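A hedged sketch of the calling side this enables (hypothetical UDC driver; the example_* names are stand-ins): the vbus interrupt handler stays atomic and only reports the state, while the mutex-taking connect/disconnect work runs later from the workqueue.

    static irqreturn_t example_vbus_irq(int irq, void *dev_id)
    {
        struct example_udc *udc = dev_id;

        /* cheap register read, safe in hard irq context */
        udc->vbus_active = example_read_vbus(udc);
        /* records the state and schedules vbus_work; no mutex taken here */
        usb_udc_vbus_handler(&udc->gadget, udc->vbus_active);
        return IRQ_HANDLED;
    }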
Cc: stable@vger.kernel.org Fixes: 1016fc0c096c ("USB: gadget: Fix obscure lockdep violation for udc_mutex") Signed-off-by: Badhri Jagan Sridharan Reviewed-by: Alan Stern Message-ID: <20230609010227.978661-1-badhri@google.com> Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/udc/core.c | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/drivers/usb/gadget/udc/core.c b/drivers/usb/gadget/udc/core.c index 52e6d2e84e35..d2e4f78c53e3 100644 --- a/drivers/usb/gadget/udc/core.c +++ b/drivers/usb/gadget/udc/core.c @@ -37,6 +37,9 @@ static const struct bus_type gadget_bus_type; * @vbus: for udcs who care about vbus status, this value is real vbus status; * for udcs who do not care about vbus status, this value is always true * @started: the UDC's started state. True if the UDC had started. + * @allow_connect: Indicates whether UDC is allowed to be pulled up. + * Set/cleared by gadget_(un)bind_driver() after gadget driver is bound or + * unbound. * * This represents the internal data structure which is used by the UDC-class * to hold information about udc driver and gadget together. @@ -48,6 +51,8 @@ struct usb_udc { struct list_head list; bool vbus; bool started; + bool allow_connect; + struct work_struct vbus_work; }; static struct class *udc_class; @@ -706,7 +711,7 @@ int usb_gadget_connect(struct usb_gadget *gadget) goto out; } - if (gadget->deactivated) { + if (gadget->deactivated || !gadget->udc->allow_connect) { /* * If gadget is deactivated we only save new state. * Gadget will be connected automatically after activation. @@ -1086,6 +1091,13 @@ static void usb_udc_connect_control(struct usb_udc *udc) usb_gadget_disconnect(udc->gadget); } +static void vbus_event_work(struct work_struct *work) +{ + struct usb_udc *udc = container_of(work, struct usb_udc, vbus_work); + + usb_udc_connect_control(udc); +} + /** * usb_udc_vbus_handler - updates the udc core vbus status, and try to * connect or disconnect gadget @@ -1094,6 +1106,14 @@ static void usb_udc_connect_control(struct usb_udc *udc) * * The udc driver calls it when it wants to connect or disconnect gadget * according to vbus status. + * + * This function can be invoked from interrupt context by irq handlers of + * the gadget drivers, however, usb_udc_connect_control() has to run in + * non-atomic context due to the following: + * a. Some of the gadget driver implementations expect the ->pullup + * callback to be invoked in non-atomic context. + * b. usb_gadget_disconnect() acquires udc_lock which is a mutex. + * Hence offload invocation of usb_udc_connect_control() to workqueue. 
*/ void usb_udc_vbus_handler(struct usb_gadget *gadget, bool status) { @@ -1101,7 +1121,7 @@ void usb_udc_vbus_handler(struct usb_gadget *gadget, bool status) if (udc) { udc->vbus = status; - usb_udc_connect_control(udc); + schedule_work(&udc->vbus_work); } } EXPORT_SYMBOL_GPL(usb_udc_vbus_handler); @@ -1328,6 +1348,7 @@ int usb_add_gadget(struct usb_gadget *gadget) mutex_lock(&udc_lock); list_add_tail(&udc->list, &udc_list); mutex_unlock(&udc_lock); + INIT_WORK(&udc->vbus_work, vbus_event_work); ret = device_add(&udc->dev); if (ret) @@ -1459,6 +1480,7 @@ void usb_del_gadget(struct usb_gadget *gadget) flush_work(&gadget->work); device_del(&gadget->dev); ida_free(&gadget_id_numbers, gadget->id_number); + cancel_work_sync(&udc->vbus_work); device_unregister(&udc->dev); } EXPORT_SYMBOL_GPL(usb_del_gadget); @@ -1527,6 +1549,7 @@ static int gadget_bind_driver(struct device *dev) if (ret) goto err_start; usb_gadget_enable_async_callbacks(udc); + udc->allow_connect = true; usb_udc_connect_control(udc); kobject_uevent(&udc->dev.kobj, KOBJ_CHANGE); @@ -1558,6 +1581,8 @@ static void gadget_unbind_driver(struct device *dev) kobject_uevent(&udc->dev.kobj, KOBJ_CHANGE); + udc->allow_connect = false; + cancel_work_sync(&udc->vbus_work); usb_gadget_disconnect(gadget); usb_gadget_disable_async_callbacks(udc); if (gadget->irq) -- cgit v1.2.3 From 286d9975a838d0a54da049765fa1d1fb96b89682 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Fri, 9 Jun 2023 01:02:27 +0000 Subject: usb: gadget: udc: core: Prevent soft_connect_store() race usb_udc_connect_control(), soft_connect_store() and usb_gadget_deactivate() can potentially race against each other to invoke usb_gadget_connect()/usb_gadget_disconnect(). To prevent this, guard udc->started, gadget->allow_connect, gadget->deactivate and gadget->connect with connect_lock so that ->pullup() is only invoked when the gadget is bound, started and not deactivated. The routines usb_gadget_connect_locked(), usb_gadget_disconnect_locked(), usb_udc_connect_control_locked(), usb_gadget_udc_start_locked(), usb_gadget_udc_stop_locked() are called with this lock held. An earlier version of this commit was reverted due to the crash reported in https://lore.kernel.org/all/ZF4BvgsOyoKxdPFF@francesco-nb.int.toradex.com/. commit 16737e78d190 ("usb: gadget: udc: core: Offload usb_udc_vbus_handler processing") addresses the crash reported. Cc: stable@vger.kernel.org Fixes: 628ef0d273a6 ("usb: udc: add usb_udc_vbus_handler") Signed-off-by: Badhri Jagan Sridharan Reviewed-by: Alan Stern Message-ID: <20230609010227.978661-2-badhri@google.com> Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/udc/core.c | 155 +++++++++++++++++++++++++++++------------- 1 file changed, 106 insertions(+), 49 deletions(-) diff --git a/drivers/usb/gadget/udc/core.c b/drivers/usb/gadget/udc/core.c index d2e4f78c53e3..83fd1de14784 100644 --- a/drivers/usb/gadget/udc/core.c +++ b/drivers/usb/gadget/udc/core.c @@ -40,6 +40,11 @@ static const struct bus_type gadget_bus_type; * @allow_connect: Indicates whether UDC is allowed to be pulled up. * Set/cleared by gadget_(un)bind_driver() after gadget driver is bound or * unbound. + * @connect_lock: protects udc->started, gadget->connect, + * gadget->allow_connect and gadget->deactivate. The routines + * usb_gadget_connect_locked(), usb_gadget_disconnect_locked(), + * usb_udc_connect_control_locked(), usb_gadget_udc_start_locked() and + * usb_gadget_udc_stop_locked() are called with this lock held. 
* * This represents the internal data structure which is used by the UDC-class * to hold information about udc driver and gadget together. @@ -53,6 +58,7 @@ struct usb_udc { bool started; bool allow_connect; struct work_struct vbus_work; + struct mutex connect_lock; }; static struct class *udc_class; @@ -692,17 +698,8 @@ out: } EXPORT_SYMBOL_GPL(usb_gadget_vbus_disconnect); -/** - * usb_gadget_connect - software-controlled connect to USB host - * @gadget:the peripheral being connected - * - * Enables the D+ (or potentially D-) pullup. The host will start - * enumerating this gadget when the pullup is active and a VBUS session - * is active (the link is powered). - * - * Returns zero on success, else negative errno. - */ -int usb_gadget_connect(struct usb_gadget *gadget) +static int usb_gadget_connect_locked(struct usb_gadget *gadget) + __must_hold(&gadget->udc->connect_lock) { int ret = 0; @@ -711,10 +708,12 @@ int usb_gadget_connect(struct usb_gadget *gadget) goto out; } - if (gadget->deactivated || !gadget->udc->allow_connect) { + if (gadget->deactivated || !gadget->udc->allow_connect || !gadget->udc->started) { /* - * If gadget is deactivated we only save new state. - * Gadget will be connected automatically after activation. + * If the gadget isn't usable (because it is deactivated, + * unbound, or not yet started), we only save the new state. + * The gadget will be connected automatically when it is + * activated/bound/started. */ gadget->connected = true; goto out; @@ -729,22 +728,31 @@ out: return ret; } -EXPORT_SYMBOL_GPL(usb_gadget_connect); /** - * usb_gadget_disconnect - software-controlled disconnect from USB host - * @gadget:the peripheral being disconnected - * - * Disables the D+ (or potentially D-) pullup, which the host may see - * as a disconnect (when a VBUS session is active). Not all systems - * support software pullup controls. + * usb_gadget_connect - software-controlled connect to USB host + * @gadget:the peripheral being connected * - * Following a successful disconnect, invoke the ->disconnect() callback - * for the current gadget driver so that UDC drivers don't need to. + * Enables the D+ (or potentially D-) pullup. The host will start + * enumerating this gadget when the pullup is active and a VBUS session + * is active (the link is powered). * * Returns zero on success, else negative errno. */ -int usb_gadget_disconnect(struct usb_gadget *gadget) +int usb_gadget_connect(struct usb_gadget *gadget) +{ + int ret; + + mutex_lock(&gadget->udc->connect_lock); + ret = usb_gadget_connect_locked(gadget); + mutex_unlock(&gadget->udc->connect_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(usb_gadget_connect); + +static int usb_gadget_disconnect_locked(struct usb_gadget *gadget) + __must_hold(&gadget->udc->connect_lock) { int ret = 0; @@ -756,7 +764,7 @@ int usb_gadget_disconnect(struct usb_gadget *gadget) if (!gadget->connected) goto out; - if (gadget->deactivated) { + if (gadget->deactivated || !gadget->udc->started) { /* * If gadget is deactivated we only save new state. * Gadget will stay disconnected after activation. @@ -779,6 +787,30 @@ out: return ret; } + +/** + * usb_gadget_disconnect - software-controlled disconnect from USB host + * @gadget:the peripheral being disconnected + * + * Disables the D+ (or potentially D-) pullup, which the host may see + * as a disconnect (when a VBUS session is active). Not all systems + * support software pullup controls. 
+ * + * Following a successful disconnect, invoke the ->disconnect() callback + * for the current gadget driver so that UDC drivers don't need to. + * + * Returns zero on success, else negative errno. + */ +int usb_gadget_disconnect(struct usb_gadget *gadget) +{ + int ret; + + mutex_lock(&gadget->udc->connect_lock); + ret = usb_gadget_disconnect_locked(gadget); + mutex_unlock(&gadget->udc->connect_lock); + + return ret; +} EXPORT_SYMBOL_GPL(usb_gadget_disconnect); /** @@ -796,13 +828,14 @@ int usb_gadget_deactivate(struct usb_gadget *gadget) { int ret = 0; + mutex_lock(&gadget->udc->connect_lock); if (gadget->deactivated) - goto out; + goto unlock; if (gadget->connected) { - ret = usb_gadget_disconnect(gadget); + ret = usb_gadget_disconnect_locked(gadget); if (ret) - goto out; + goto unlock; /* * If gadget was being connected before deactivation, we want @@ -812,7 +845,8 @@ int usb_gadget_deactivate(struct usb_gadget *gadget) } gadget->deactivated = true; -out: +unlock: + mutex_unlock(&gadget->udc->connect_lock); trace_usb_gadget_deactivate(gadget, ret); return ret; @@ -832,8 +866,9 @@ int usb_gadget_activate(struct usb_gadget *gadget) { int ret = 0; + mutex_lock(&gadget->udc->connect_lock); if (!gadget->deactivated) - goto out; + goto unlock; gadget->deactivated = false; @@ -842,9 +877,11 @@ int usb_gadget_activate(struct usb_gadget *gadget) * while it was being deactivated, we call usb_gadget_connect(). */ if (gadget->connected) - ret = usb_gadget_connect(gadget); + ret = usb_gadget_connect_locked(gadget); + mutex_unlock(&gadget->udc->connect_lock); -out: +unlock: + mutex_unlock(&gadget->udc->connect_lock); trace_usb_gadget_activate(gadget, ret); return ret; @@ -1083,19 +1120,22 @@ EXPORT_SYMBOL_GPL(usb_gadget_set_state); /* ------------------------------------------------------------------------- */ -static void usb_udc_connect_control(struct usb_udc *udc) +/* Acquire connect_lock before calling this function. */ +static void usb_udc_connect_control_locked(struct usb_udc *udc) __must_hold(&udc->connect_lock) { if (udc->vbus) - usb_gadget_connect(udc->gadget); + usb_gadget_connect_locked(udc->gadget); else - usb_gadget_disconnect(udc->gadget); + usb_gadget_disconnect_locked(udc->gadget); } static void vbus_event_work(struct work_struct *work) { struct usb_udc *udc = container_of(work, struct usb_udc, vbus_work); - usb_udc_connect_control(udc); + mutex_lock(&udc->connect_lock); + usb_udc_connect_control_locked(udc); + mutex_unlock(&udc->connect_lock); } /** @@ -1144,7 +1184,7 @@ void usb_gadget_udc_reset(struct usb_gadget *gadget, EXPORT_SYMBOL_GPL(usb_gadget_udc_reset); /** - * usb_gadget_udc_start - tells usb device controller to start up + * usb_gadget_udc_start_locked - tells usb device controller to start up * @udc: The UDC to be started * * This call is issued by the UDC Class driver when it's about @@ -1155,8 +1195,11 @@ EXPORT_SYMBOL_GPL(usb_gadget_udc_reset); * necessary to have it powered on. * * Returns zero on success, else negative errno. + * + * Caller should acquire connect_lock before invoking this function. 
*/ -static inline int usb_gadget_udc_start(struct usb_udc *udc) +static inline int usb_gadget_udc_start_locked(struct usb_udc *udc) + __must_hold(&udc->connect_lock) { int ret; @@ -1173,7 +1216,7 @@ static inline int usb_gadget_udc_start(struct usb_udc *udc) } /** - * usb_gadget_udc_stop - tells usb device controller we don't need it anymore + * usb_gadget_udc_stop_locked - tells usb device controller we don't need it anymore * @udc: The UDC to be stopped * * This call is issued by the UDC Class driver after calling @@ -1182,8 +1225,11 @@ static inline int usb_gadget_udc_start(struct usb_udc *udc) * The details are implementation specific, but it can go as * far as powering off UDC completely and disable its data * line pullups. + * + * Caller should acquire connect lock before invoking this function. */ -static inline void usb_gadget_udc_stop(struct usb_udc *udc) +static inline void usb_gadget_udc_stop_locked(struct usb_udc *udc) + __must_hold(&udc->connect_lock) { if (!udc->started) { dev_err(&udc->dev, "UDC had already stopped\n"); @@ -1342,6 +1388,7 @@ int usb_add_gadget(struct usb_gadget *gadget) udc->gadget = gadget; gadget->udc = udc; + mutex_init(&udc->connect_lock); udc->started = false; @@ -1545,12 +1592,16 @@ static int gadget_bind_driver(struct device *dev) if (ret) goto err_bind; - ret = usb_gadget_udc_start(udc); - if (ret) + mutex_lock(&udc->connect_lock); + ret = usb_gadget_udc_start_locked(udc); + if (ret) { + mutex_unlock(&udc->connect_lock); goto err_start; + } usb_gadget_enable_async_callbacks(udc); udc->allow_connect = true; - usb_udc_connect_control(udc); + usb_udc_connect_control_locked(udc); + mutex_unlock(&udc->connect_lock); kobject_uevent(&udc->dev.kobj, KOBJ_CHANGE); return 0; @@ -1583,12 +1634,14 @@ static void gadget_unbind_driver(struct device *dev) udc->allow_connect = false; cancel_work_sync(&udc->vbus_work); - usb_gadget_disconnect(gadget); + mutex_lock(&udc->connect_lock); + usb_gadget_disconnect_locked(gadget); usb_gadget_disable_async_callbacks(udc); if (gadget->irq) synchronize_irq(gadget->irq); udc->driver->unbind(gadget); - usb_gadget_udc_stop(udc); + usb_gadget_udc_stop_locked(udc); + mutex_unlock(&udc->connect_lock); mutex_lock(&udc_lock); driver->is_bound = false; @@ -1674,11 +1727,15 @@ static ssize_t soft_connect_store(struct device *dev, } if (sysfs_streq(buf, "connect")) { - usb_gadget_udc_start(udc); - usb_gadget_connect(udc->gadget); + mutex_lock(&udc->connect_lock); + usb_gadget_udc_start_locked(udc); + usb_gadget_connect_locked(udc->gadget); + mutex_unlock(&udc->connect_lock); } else if (sysfs_streq(buf, "disconnect")) { - usb_gadget_disconnect(udc->gadget); - usb_gadget_udc_stop(udc); + mutex_lock(&udc->connect_lock); + usb_gadget_disconnect_locked(udc->gadget); + usb_gadget_udc_stop_locked(udc); + mutex_unlock(&udc->connect_lock); } else { dev_err(dev, "unsupported command '%s'\n", buf); ret = -EINVAL; -- cgit v1.2.3 From 11d24327c2d7ad7f24fcc44fb00e1fa91ebf6525 Mon Sep 17 00:00:00 2001 From: Ratchanan Srirattanamet Date: Wed, 24 May 2023 04:11:56 +0700 Subject: drm/nouveau: don't detect DSM for non-NVIDIA device The call site of nouveau_dsm_pci_probe() uses single set of output variables for all invocations. So, we must not write anything to them unless it's an NVIDIA device. Otherwise, if we are called with another device after the NVIDIA device, we'll clober the result of the NVIDIA device. 
For example, if the other device doesn't have _PR3 resources, the detection later would miss the presence of power resource support, and the rest of the code will keep using Optimus DSM, breaking power management for that machine. Also, because we're detecting NVIDIA's DSM, it doesn't make sense to run this detection on a non-NVIDIA device anyway. Thus, check at the beginning of the detection code if this is an NVIDIA card, and just return if it isn't. This, together with commit d22915d22ded ("drm/nouveau/devinit/tu102-: wait for GFW_BOOT_PROGRESS == COMPLETED") developed independently and landed earlier, fixes runtime power management of the NVIDIA card in Lenovo Legion 5-15ARH05. Without this patch, the GPU resumption code will "timeout", sometimes hanging userspace. As a bonus, we'll also stop preventing _PR3 usage from the bridge for unrelated devices, which is always nice, I guess. Fixes: ccfc2d5cdb02 ("drm/nouveau: Use generic helper to check _PR3 presence") Signed-off-by: Ratchanan Srirattanamet Closes: https://gitlab.freedesktop.org/drm/nouveau/-/issues/79 Reviewed-by: Karol Herbst Signed-off-by: Karol Herbst Link: https://patchwork.freedesktop.org/patch/msgid/DM6PR19MB2780805D4BE1E3F9B3AC96D0BC409@DM6PR19MB2780.namprd19.prod.outlook.com --- drivers/gpu/drm/nouveau/nouveau_acpi.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/nouveau/nouveau_acpi.c b/drivers/gpu/drm/nouveau/nouveau_acpi.c index 8cf096f841a9..a2ae8c21e4dc 100644 --- a/drivers/gpu/drm/nouveau/nouveau_acpi.c +++ b/drivers/gpu/drm/nouveau/nouveau_acpi.c @@ -220,6 +220,9 @@ static void nouveau_dsm_pci_probe(struct pci_dev *pdev, acpi_handle *dhandle_out int optimus_funcs; struct pci_dev *parent_pdev; + if (pdev->vendor != PCI_VENDOR_ID_NVIDIA) + return; + *has_pr3 = false; parent_pdev = pci_upstream_bridge(pdev); if (parent_pdev) { -- cgit v1.2.3 From a82c3df955f8c1c726e4976527aa6ae924a67dd9 Mon Sep 17 00:00:00 2001 From: Robert Hodaszi Date: Fri, 9 Jun 2023 14:13:34 +0200 Subject: tty: serial: fsl_lpuart: reduce RX watermark to 0 on LS1028A LS1028A is using DMA with LPUART. Having the RX watermark set to 1 means DMA transactions are started only after receiving the second character. On other platforms with newer LPUART IP, the Receiver Idle Empty function initiates the DMA request after the receiver has been idle for 4 characters. But this feature is missing on LS1028A, which causes a 1-character delay in the RX direction on this platform. Set the RX watermark to 0 to initiate RX DMA after each character.
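For context on why the one-line change below is scoped to LS1028A only: each compatible string selects its own lpuart_soc_data, roughly as sketched here (paraphrased from the driver's match table; entries abbreviated):

static const struct of_device_id lpuart_dt_ids[] = {
	{ .compatible = "fsl,ls1021a-lpuart", .data = &ls1021a_data, },
	{ .compatible = "fsl,ls1028a-lpuart", .data = &ls1028a_data, },
	/* ... i.MX entries ... */
	{ /* sentinel */ }
};
/* probe() keeps a pointer to the matched soc data, so rx_watermark = 0
 * below takes effect only for "fsl,ls1028a-lpuart". */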
Link: https://lore.kernel.org/linux-serial/20230607103459.1222426-1-robert.hodaszi@digi.com/ Fixes: 9ad9df844754 ("tty: serial: fsl_lpuart: Fix the wrong RXWATER setting for rx dma case") Cc: stable Signed-off-by: Robert Hodaszi Message-ID: <20230609121334.1878626-1-robert.hodaszi@digi.com> Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/fsl_lpuart.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tty/serial/fsl_lpuart.c b/drivers/tty/serial/fsl_lpuart.c index 7486a2b8556c..7fd30fcc10c6 100644 --- a/drivers/tty/serial/fsl_lpuart.c +++ b/drivers/tty/serial/fsl_lpuart.c @@ -310,7 +310,7 @@ static const struct lpuart_soc_data ls1021a_data = { static const struct lpuart_soc_data ls1028a_data = { .devtype = LS1028A_LPUART, .iotype = UPIO_MEM32, - .rx_watermark = 1, + .rx_watermark = 0, }; static struct lpuart_soc_data imx7ulp_data = { -- cgit v1.2.3 From f9fd804aa0a36f15a35ca070ec4c52650876cc29 Mon Sep 17 00:00:00 2001 From: Jon Hunter Date: Tue, 13 Jun 2023 10:34:53 +0100 Subject: ASoC: tegra: Fix Master Volume Control Commit 3ed2b549b39f ("ALSA: pcm: fix wait_time calculations") corrected the PCM wait_time calculations and in doing so reduced the calculated wait_time. This exposed an issue with the Tegra Master Volume Control (MVC) device where the reduced wait_time caused the MVC to fail. For now, fix this by setting the default wait_time for Tegra to 500ms. Fixes: 3ed2b549b39f ("ALSA: pcm: fix wait_time calculations") Signed-off-by: Jon Hunter Link: https://lore.kernel.org/r/20230613093453.13927-1-jonathanh@nvidia.com Signed-off-by: Mark Brown --- sound/soc/tegra/tegra_pcm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sound/soc/tegra/tegra_pcm.c b/sound/soc/tegra/tegra_pcm.c index 468c8e77de21..0b69cebc9a33 100644 --- a/sound/soc/tegra/tegra_pcm.c +++ b/sound/soc/tegra/tegra_pcm.c @@ -117,6 +117,9 @@ int tegra_pcm_open(struct snd_soc_component *component, return ret; } + /* Set wait time to 500ms by default */ + substream->wait_time = 500; + return 0; } EXPORT_SYMBOL_GPL(tegra_pcm_open); -- cgit v1.2.3 From 47b3ad6b7842f49d374a01b054a4b1461a621bdc Mon Sep 17 00:00:00 2001 From: Christophe Kerello Date: Tue, 13 Jun 2023 15:41:46 +0200 Subject: mmc: mmci: stm32: fix max busy timeout calculation The way that the timeout is currently calculated could lead to a u64 timeout value in mmci_start_command(). This value is then cast into a u32 register, which leads to mmc erase failures with some SD cards.
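A worked example of the overflow, with illustrative numbers (assuming a 64-bit build and actual_clock = 52 MHz):

/*
 * Before:  max_busy_timeout = ~0UL / (52000000 / 1000)
 *                           = 18446744073709551615 / 52000
 *                           ~= 3.5e14 ms
 *          Converting that many milliseconds back into clock cycles in
 *          mmci_start_command() cannot fit the 32-bit timer register.
 *
 * After:   max_busy_timeout = U32_MAX / DIV_ROUND_UP(52000000, 1000)
 *                           = 4294967295 / 52000
 *                           = 82595 ms (~82 s)
 *          so timeout_ms * cycles_per_ms always fits in a u32, and
 *          rounding the divisor up avoids overestimating the limit
 *          when the clock is not a whole multiple of 1 kHz.
 */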
Fixes: 8266c585f489 ("mmc: mmci: add hardware busy timeout feature") Signed-off-by: Yann Gautier Signed-off-by: Christophe Kerello Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20230613134146.418016-1-yann.gautier@foss.st.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/mmci.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c index f2b2e8b0574e..696cbef3ff7d 100644 --- a/drivers/mmc/host/mmci.c +++ b/drivers/mmc/host/mmci.c @@ -1735,7 +1735,8 @@ static void mmci_set_max_busy_timeout(struct mmc_host *mmc) return; if (host->variant->busy_timeout && mmc->actual_clock) - max_busy_timeout = ~0UL / (mmc->actual_clock / MSEC_PER_SEC); + max_busy_timeout = U32_MAX / DIV_ROUND_UP(mmc->actual_clock, + MSEC_PER_SEC); mmc->max_busy_timeout = max_busy_timeout; } -- cgit v1.2.3 From 95011f267c44a4d1f9ca1769e8a29ab2c559e004 Mon Sep 17 00:00:00 2001 From: Su Hui Date: Thu, 8 Jun 2023 09:24:43 +0800 Subject: drm/bridge: ti-sn65dsi86: Avoid possible buffer overflow Smatch error:buffer overflow 'ti_sn_bridge_refclk_lut' 5 <= 5. Fixes: cea86c5bb442 ("drm/bridge: ti-sn65dsi86: Implement the pwm_chip") Signed-off-by: Su Hui Reviewed-by: Douglas Anderson Signed-off-by: Douglas Anderson Link: https://patchwork.freedesktop.org/patch/msgid/20230608012443.839372-1-suhui@nfschina.com --- drivers/gpu/drm/bridge/ti-sn65dsi86.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/bridge/ti-sn65dsi86.c b/drivers/gpu/drm/bridge/ti-sn65dsi86.c index 7a748785c545..4676cf2900df 100644 --- a/drivers/gpu/drm/bridge/ti-sn65dsi86.c +++ b/drivers/gpu/drm/bridge/ti-sn65dsi86.c @@ -298,6 +298,10 @@ static void ti_sn_bridge_set_refclk_freq(struct ti_sn65dsi86 *pdata) if (refclk_lut[i] == refclk_rate) break; + /* avoid buffer overflow and "1" is the default rate in the datasheet. */ + if (i >= refclk_lut_size) + i = 1; + regmap_update_bits(pdata->regmap, SN_DPPLL_SRC_REG, REFCLK_FREQ_MASK, REFCLK_FREQ(i)); -- cgit v1.2.3 From 20a2ce87fbaf81e4c3dcb631d738e423959eb320 Mon Sep 17 00:00:00 2001 From: Natalia Petrova Date: Fri, 12 May 2023 14:15:26 +0300 Subject: drm/nouveau/dp: check for NULL nv_connector->native_mode Add checking for NULL before calling nouveau_connector_detect_depth() in nouveau_connector_get_modes() function because nv_connector->native_mode could be dereferenced there since connector pointer passed to nouveau_connector_detect_depth() and the same value of nv_connector->native_mode is used there. Found by Linux Verification Center (linuxtesting.org) with SVACE. Fixes: d4c2c99bdc83 ("drm/nouveau/dp: remove broken display depth function, use the improved one") Signed-off-by: Natalia Petrova Reviewed-by: Lyude Paul Signed-off-by: Lyude Paul Link: https://patchwork.freedesktop.org/patch/msgid/20230512111526.82408-1-n.petrova@fintech.ru --- drivers/gpu/drm/nouveau/nouveau_connector.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_connector.c b/drivers/gpu/drm/nouveau/nouveau_connector.c index 086b66b60d91..5dbf025e6873 100644 --- a/drivers/gpu/drm/nouveau/nouveau_connector.c +++ b/drivers/gpu/drm/nouveau/nouveau_connector.c @@ -966,7 +966,7 @@ nouveau_connector_get_modes(struct drm_connector *connector) /* Determine display colour depth for everything except LVDS now, * DP requires this before mode_valid() is called. 
*/ - if (connector->connector_type != DRM_MODE_CONNECTOR_LVDS) + if (connector->connector_type != DRM_MODE_CONNECTOR_LVDS && nv_connector->native_mode) nouveau_connector_detect_depth(connector); /* Find the native mode if this is a digital panel, if we didn't @@ -987,7 +987,7 @@ nouveau_connector_get_modes(struct drm_connector *connector) * "native" mode as some VBIOS tables require us to use the * pixel clock as part of the lookup... */ - if (connector->connector_type == DRM_MODE_CONNECTOR_LVDS) + if (connector->connector_type == DRM_MODE_CONNECTOR_LVDS && nv_connector->native_mode) nouveau_connector_detect_depth(connector); if (nv_encoder->dcb->type == DCB_OUTPUT_TV) -- cgit v1.2.3 From 1dbcf770cc2d15baf8a1e8174d6fd014a68b45ca Mon Sep 17 00:00:00 2001 From: Jiadong Zhu Date: Wed, 24 May 2023 11:42:19 +0800 Subject: drm/amdgpu: Reset CP_VMID_PREEMPT after trailing fence signaled When MEC executes unmap_queue for mid command buffer preemption, it will kick the write pointer of the gfx ring, set CP_VMID_PREEMPT to trigger the preemption, and wait for CP_VMID_PREEMPT to become zero after the preemption is done. There is a race condition that PFP may execute the resetting command before MEC sets CP_VMID_PREEMPT. As a result, a hang happens, as CP_VMID_PREEMPT is always 0xffff. To avoid this, we send the CP_VMID_PREEMPT reset command after the trailing fence is signaled and update the gfx write pointer explicitly. Signed-off-by: Jiadong Zhu Acked-by: Alex Deucher Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org # 6.3.x Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2535 --- drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index e7f2b7bf0ff5..6c1e923bc7ce 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -5369,10 +5369,6 @@ static int gfx_v9_0_ring_preempt_ib(struct amdgpu_ring *ring) amdgpu_ring_alloc(ring, 13); gfx_v9_0_ring_emit_fence(ring, ring->trail_fence_gpu_addr, ring->trail_seq, AMDGPU_FENCE_FLAG_EXEC | AMDGPU_FENCE_FLAG_INT); - /*reset the CP_VMID_PREEMPT after trailing fence*/ - amdgpu_ring_emit_wreg(ring, - SOC15_REG_OFFSET(GC, 0, mmCP_VMID_PREEMPT), - 0x0); /* assert IB preemption, emit the trailing fence */ kiq->pmf->kiq_unmap_queues(kiq_ring, ring, PREEMPT_QUEUES_NO_UNMAP, @@ -5395,6 +5391,10 @@ static int gfx_v9_0_ring_preempt_ib(struct amdgpu_ring *ring) DRM_WARN("ring %d timeout to preempt ib\n", ring->idx); } + /*reset the CP_VMID_PREEMPT after trailing fence*/ + amdgpu_ring_emit_wreg(ring, + SOC15_REG_OFFSET(GC, 0, mmCP_VMID_PREEMPT), + 0x0); amdgpu_ring_commit(ring); /* deassert preemption condition */ -- cgit v1.2.3 From 55b94bb8c42464bad3d2217f6874aa1a85664eac Mon Sep 17 00:00:00 2001 From: Natalia Petrova Date: Fri, 12 May 2023 13:33:20 +0300 Subject: drm/nouveau: add nv_encoder pointer check for NULL The pointer nv_encoder could be dereferenced in nouveau_connector.c while it is NULL, after a jump to the goto label. This patch adds a NULL check to avoid that. Found by Linux Verification Center (linuxtesting.org) with SVACE.
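The shape of the hazard being fixed, as a simplified sketch with placeholder names rather than the real nouveau code:

int detect(struct connector *conn)
{
	struct encoder *enc = NULL;	/* assigned only on some paths */
	int status = connector_status_disconnected;

	if (!edid_found(conn))
		goto out;		/* enc is still NULL here */

	enc = find_encoder(conn);
	status = connector_status_connected;
out:
	set_edid(conn);
	if (enc)			/* the added guard */
		set_encoder(conn, enc);
	return status;
}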
Fixes: 3195c5f9784a ("drm/nouveau: set encoder for lvds") Signed-off-by: Natalia Petrova Reviewed-by: Lyude Paul [Fixed patch title] Signed-off-by: Lyude Paul Link: https://patchwork.freedesktop.org/patch/msgid/20230512103320.82234-1-n.petrova@fintech.ru --- drivers/gpu/drm/nouveau/nouveau_connector.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_connector.c b/drivers/gpu/drm/nouveau/nouveau_connector.c index 5dbf025e6873..f75c6f09dd2a 100644 --- a/drivers/gpu/drm/nouveau/nouveau_connector.c +++ b/drivers/gpu/drm/nouveau/nouveau_connector.c @@ -730,7 +730,8 @@ out: #endif nouveau_connector_set_edid(nv_connector, edid); - nouveau_connector_set_encoder(connector, nv_encoder); + if (nv_encoder) + nouveau_connector_set_encoder(connector, nv_encoder); return status; } -- cgit v1.2.3 From 94034b306ddde4a4a9c1a597ae7f61f04b710dc7 Mon Sep 17 00:00:00 2001 From: Jiadong Zhu Date: Wed, 24 May 2023 16:51:32 +0800 Subject: drm/amdgpu: Program gds backup address as zero if no gds allocated It is firmware requirement to set gds_backup_addrlo and gds_backup_addrhi of DE meta both zero if no gds partition is allocated for the frame. Signed-off-by: Jiadong Zhu Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org # 6.3.x --- drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index 6c1e923bc7ce..bb7dd39b6d1a 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -755,7 +755,7 @@ static void gfx_v9_0_set_rlc_funcs(struct amdgpu_device *adev); static int gfx_v9_0_get_cu_info(struct amdgpu_device *adev, struct amdgpu_cu_info *cu_info); static uint64_t gfx_v9_0_get_gpu_clock_counter(struct amdgpu_device *adev); -static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring, bool resume); +static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring, bool resume, bool usegds); static u64 gfx_v9_0_ring_get_rptr_compute(struct amdgpu_ring *ring); static void gfx_v9_0_query_ras_error_count(struct amdgpu_device *adev, void *ras_error_status); @@ -5127,7 +5127,8 @@ static void gfx_v9_0_ring_emit_ib_gfx(struct amdgpu_ring *ring, gfx_v9_0_ring_emit_de_meta(ring, (!amdgpu_sriov_vf(ring->adev) && flags & AMDGPU_IB_PREEMPTED) ? 
- true : false); + true : false, + job->gds_size > 0 && job->gds_base != 0); } amdgpu_ring_write(ring, header); @@ -5402,7 +5403,7 @@ static int gfx_v9_0_ring_preempt_ib(struct amdgpu_ring *ring) return r; } -static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring, bool resume) +static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring, bool resume, bool usegds) { struct amdgpu_device *adev = ring->adev; struct v9_de_ib_state de_payload = {0}; @@ -5433,8 +5434,10 @@ static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring, bool resume) PAGE_SIZE); } - de_payload.gds_backup_addrlo = lower_32_bits(gds_addr); - de_payload.gds_backup_addrhi = upper_32_bits(gds_addr); + if (usegds) { + de_payload.gds_backup_addrlo = lower_32_bits(gds_addr); + de_payload.gds_backup_addrhi = upper_32_bits(gds_addr); + } cnt = (sizeof(de_payload) >> 2) + 4 - 2; amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, cnt)); -- cgit v1.2.3 From 87af86ae89963c227a3beb4d914f3dc7959a690e Mon Sep 17 00:00:00 2001 From: Jiadong Zhu Date: Thu, 25 May 2023 16:52:55 +0800 Subject: drm/amdgpu: Modify indirect buffer packages for resubmission When the preempted IB frame resubmitted to cp, we need to modify the frame data including: 1. set PRE_RESUME 1 in CONTEXT_CONTROL. 2. use meta data(DE and CE) read from CSA in WRITE_DATA. Add functions to save the location the first time IBs emitted and callback to patch the package when resubmission happens. Signed-off-by: Jiadong Zhu Acked-by: Alex Deucher Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org # 6.3.x --- drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 18 +++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 9 +++++ drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c | 60 ++++++++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h | 15 +++++++ 4 files changed, 102 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c index dc474b809604..49de3a3eebc7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c @@ -581,3 +581,21 @@ void amdgpu_ring_ib_end(struct amdgpu_ring *ring) if (ring->is_sw_ring) amdgpu_sw_ring_ib_end(ring); } + +void amdgpu_ring_ib_on_emit_cntl(struct amdgpu_ring *ring) +{ + if (ring->is_sw_ring) + amdgpu_sw_ring_ib_mark_offset(ring, AMDGPU_MUX_OFFSET_TYPE_CONTROL); +} + +void amdgpu_ring_ib_on_emit_ce(struct amdgpu_ring *ring) +{ + if (ring->is_sw_ring) + amdgpu_sw_ring_ib_mark_offset(ring, AMDGPU_MUX_OFFSET_TYPE_CE); +} + +void amdgpu_ring_ib_on_emit_de(struct amdgpu_ring *ring) +{ + if (ring->is_sw_ring) + amdgpu_sw_ring_ib_mark_offset(ring, AMDGPU_MUX_OFFSET_TYPE_DE); +} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h index d8749444b689..2474cb71e476 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h @@ -227,6 +227,9 @@ struct amdgpu_ring_funcs { int (*preempt_ib)(struct amdgpu_ring *ring); void (*emit_mem_sync)(struct amdgpu_ring *ring); void (*emit_wave_limit)(struct amdgpu_ring *ring, bool enable); + void (*patch_cntl)(struct amdgpu_ring *ring, unsigned offset); + void (*patch_ce)(struct amdgpu_ring *ring, unsigned offset); + void (*patch_de)(struct amdgpu_ring *ring, unsigned offset); }; struct amdgpu_ring { @@ -318,10 +321,16 @@ struct amdgpu_ring { #define amdgpu_ring_init_cond_exec(r) (r)->funcs->init_cond_exec((r)) #define amdgpu_ring_patch_cond_exec(r,o) (r)->funcs->patch_cond_exec((r),(o)) #define amdgpu_ring_preempt_ib(r) 
(r)->funcs->preempt_ib(r) +#define amdgpu_ring_patch_cntl(r, o) ((r)->funcs->patch_cntl((r), (o))) +#define amdgpu_ring_patch_ce(r, o) ((r)->funcs->patch_ce((r), (o))) +#define amdgpu_ring_patch_de(r, o) ((r)->funcs->patch_de((r), (o))) int amdgpu_ring_alloc(struct amdgpu_ring *ring, unsigned ndw); void amdgpu_ring_ib_begin(struct amdgpu_ring *ring); void amdgpu_ring_ib_end(struct amdgpu_ring *ring); +void amdgpu_ring_ib_on_emit_cntl(struct amdgpu_ring *ring); +void amdgpu_ring_ib_on_emit_ce(struct amdgpu_ring *ring); +void amdgpu_ring_ib_on_emit_de(struct amdgpu_ring *ring); void amdgpu_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count); void amdgpu_ring_generic_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c index 62079f0e3ee8..73516abef662 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c @@ -105,6 +105,16 @@ static void amdgpu_mux_resubmit_chunks(struct amdgpu_ring_mux *mux) amdgpu_fence_update_start_timestamp(e->ring, chunk->sync_seq, ktime_get()); + if (chunk->sync_seq == + le32_to_cpu(*(e->ring->fence_drv.cpu_addr + 2))) { + if (chunk->cntl_offset <= e->ring->buf_mask) + amdgpu_ring_patch_cntl(e->ring, + chunk->cntl_offset); + if (chunk->ce_offset <= e->ring->buf_mask) + amdgpu_ring_patch_ce(e->ring, chunk->ce_offset); + if (chunk->de_offset <= e->ring->buf_mask) + amdgpu_ring_patch_de(e->ring, chunk->de_offset); + } amdgpu_ring_mux_copy_pkt_from_sw_ring(mux, e->ring, chunk->start, chunk->end); @@ -407,6 +417,17 @@ void amdgpu_sw_ring_ib_end(struct amdgpu_ring *ring) amdgpu_ring_mux_end_ib(mux, ring); } +void amdgpu_sw_ring_ib_mark_offset(struct amdgpu_ring *ring, enum amdgpu_ring_mux_offset_type type) +{ + struct amdgpu_device *adev = ring->adev; + struct amdgpu_ring_mux *mux = &adev->gfx.muxer; + unsigned offset; + + offset = ring->wptr & ring->buf_mask; + + amdgpu_ring_mux_ib_mark_offset(mux, ring, offset, type); +} + void amdgpu_ring_mux_start_ib(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring) { struct amdgpu_mux_entry *e; @@ -429,6 +450,10 @@ void amdgpu_ring_mux_start_ib(struct amdgpu_ring_mux *mux, struct amdgpu_ring *r } chunk->start = ring->wptr; + /* the initialized value used to check if they are set by the ib submission*/ + chunk->cntl_offset = ring->buf_mask + 1; + chunk->de_offset = ring->buf_mask + 1; + chunk->ce_offset = ring->buf_mask + 1; list_add_tail(&chunk->entry, &e->list); } @@ -454,6 +479,41 @@ static void scan_and_remove_signaled_chunk(struct amdgpu_ring_mux *mux, struct a } } +void amdgpu_ring_mux_ib_mark_offset(struct amdgpu_ring_mux *mux, + struct amdgpu_ring *ring, u64 offset, + enum amdgpu_ring_mux_offset_type type) +{ + struct amdgpu_mux_entry *e; + struct amdgpu_mux_chunk *chunk; + + e = amdgpu_ring_mux_sw_entry(mux, ring); + if (!e) { + DRM_ERROR("cannot find entry!\n"); + return; + } + + chunk = list_last_entry(&e->list, struct amdgpu_mux_chunk, entry); + if (!chunk) { + DRM_ERROR("cannot find chunk!\n"); + return; + } + + switch (type) { + case AMDGPU_MUX_OFFSET_TYPE_CONTROL: + chunk->cntl_offset = offset; + break; + case AMDGPU_MUX_OFFSET_TYPE_DE: + chunk->de_offset = offset; + break; + case AMDGPU_MUX_OFFSET_TYPE_CE: + chunk->ce_offset = offset; + break; + default: + DRM_ERROR("invalid type (%d)\n", type); + break; + } +} + void amdgpu_ring_mux_end_ib(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring) { struct amdgpu_mux_entry *e; diff --git 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h index 4be45fc14954..b22d4fb2a847 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h @@ -50,6 +50,12 @@ struct amdgpu_mux_entry { struct list_head list; }; +enum amdgpu_ring_mux_offset_type { + AMDGPU_MUX_OFFSET_TYPE_CONTROL, + AMDGPU_MUX_OFFSET_TYPE_DE, + AMDGPU_MUX_OFFSET_TYPE_CE, +}; + struct amdgpu_ring_mux { struct amdgpu_ring *real_ring; @@ -72,12 +78,18 @@ struct amdgpu_ring_mux { * @sync_seq: the fence seqno related with the saved IB. * @start:- start location on the software ring. * @end:- end location on the software ring. + * @control_offset:- the PRE_RESUME bit position used for resubmission. + * @de_offset:- the anchor in write_data for de meta of resubmission. + * @ce_offset:- the anchor in write_data for ce meta of resubmission. */ struct amdgpu_mux_chunk { struct list_head entry; uint32_t sync_seq; u64 start; u64 end; + u64 cntl_offset; + u64 de_offset; + u64 ce_offset; }; int amdgpu_ring_mux_init(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring, @@ -89,6 +101,8 @@ u64 amdgpu_ring_mux_get_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ri u64 amdgpu_ring_mux_get_rptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring); void amdgpu_ring_mux_start_ib(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring); void amdgpu_ring_mux_end_ib(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring); +void amdgpu_ring_mux_ib_mark_offset(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring, + u64 offset, enum amdgpu_ring_mux_offset_type type); bool amdgpu_mcbp_handle_trailing_fence_irq(struct amdgpu_ring_mux *mux); u64 amdgpu_sw_ring_get_rptr_gfx(struct amdgpu_ring *ring); @@ -97,6 +111,7 @@ void amdgpu_sw_ring_set_wptr_gfx(struct amdgpu_ring *ring); void amdgpu_sw_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count); void amdgpu_sw_ring_ib_begin(struct amdgpu_ring *ring); void amdgpu_sw_ring_ib_end(struct amdgpu_ring *ring); +void amdgpu_sw_ring_ib_mark_offset(struct amdgpu_ring *ring, enum amdgpu_ring_mux_offset_type type); const char *amdgpu_sw_ring_name(int idx); unsigned int amdgpu_sw_ring_priority(int idx); -- cgit v1.2.3 From 5b711e7f9c73e5ff44d6ac865711d9a05c2a0360 Mon Sep 17 00:00:00 2001 From: Jiadong Zhu Date: Thu, 25 May 2023 18:42:15 +0800 Subject: drm/amdgpu: Implement gfx9 patch functions for resubmission Patch the packages including CONTEXT_CONTROL and WRITE_DATA for gfx9 during the resubmission scenario. 
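At the heart of those patch functions is a wrap-aware copy into the ring buffer; a generic sketch of the pattern (buf_mask is the ring size in dwords minus one, so the shift by 2 converts dwords to bytes; the cast mirrors the driver's volatile ring pointer):

static void ring_patch_payload(struct amdgpu_ring *ring, unsigned int offset,
			       const u8 *payload, u64 size)
{
	/* bytes between 'offset' and the end of the ring */
	u64 to_end = (u64)(ring->buf_mask + 1 - offset) << 2;

	if (size <= to_end) {
		memcpy((void *)&ring->ring[offset], payload, size);
	} else {
		/* the payload straddles the wrap point: copy in two pieces */
		memcpy((void *)&ring->ring[offset], payload, to_end);
		memcpy((void *)&ring->ring[0], payload + to_end, size - to_end);
	}
}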
Signed-off-by: Jiadong Zhu Acked-by: Alex Deucher Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org # 6.3.x --- drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 80 +++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index bb7dd39b6d1a..a674c8a58dc2 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -5139,9 +5139,83 @@ static void gfx_v9_0_ring_emit_ib_gfx(struct amdgpu_ring *ring, #endif lower_32_bits(ib->gpu_addr)); amdgpu_ring_write(ring, upper_32_bits(ib->gpu_addr)); + amdgpu_ring_ib_on_emit_cntl(ring); amdgpu_ring_write(ring, control); } +static void gfx_v9_0_ring_patch_cntl(struct amdgpu_ring *ring, + unsigned offset) +{ + u32 control = ring->ring[offset]; + + control |= INDIRECT_BUFFER_PRE_RESUME(1); + ring->ring[offset] = control; +} + +static void gfx_v9_0_ring_patch_ce_meta(struct amdgpu_ring *ring, + unsigned offset) +{ + struct amdgpu_device *adev = ring->adev; + void *ce_payload_cpu_addr; + uint64_t payload_offset, payload_size; + + payload_size = sizeof(struct v9_ce_ib_state); + + if (ring->is_mes_queue) { + payload_offset = offsetof(struct amdgpu_mes_ctx_meta_data, + gfx[0].gfx_meta_data) + + offsetof(struct v9_gfx_meta_data, ce_payload); + ce_payload_cpu_addr = + amdgpu_mes_ctx_get_offs_cpu_addr(ring, payload_offset); + } else { + payload_offset = offsetof(struct v9_gfx_meta_data, ce_payload); + ce_payload_cpu_addr = adev->virt.csa_cpu_addr + payload_offset; + } + + if (offset + (payload_size >> 2) <= ring->buf_mask + 1) { + memcpy((void *)&ring->ring[offset], ce_payload_cpu_addr, payload_size); + } else { + memcpy((void *)&ring->ring[offset], ce_payload_cpu_addr, + (ring->buf_mask + 1 - offset) << 2); + payload_size -= (ring->buf_mask + 1 - offset) << 2; + memcpy((void *)&ring->ring[0], + ce_payload_cpu_addr + ((ring->buf_mask + 1 - offset) << 2), + payload_size); + } +} + +static void gfx_v9_0_ring_patch_de_meta(struct amdgpu_ring *ring, + unsigned offset) +{ + struct amdgpu_device *adev = ring->adev; + void *de_payload_cpu_addr; + uint64_t payload_offset, payload_size; + + payload_size = sizeof(struct v9_de_ib_state); + + if (ring->is_mes_queue) { + payload_offset = offsetof(struct amdgpu_mes_ctx_meta_data, + gfx[0].gfx_meta_data) + + offsetof(struct v9_gfx_meta_data, de_payload); + de_payload_cpu_addr = + amdgpu_mes_ctx_get_offs_cpu_addr(ring, payload_offset); + } else { + payload_offset = offsetof(struct v9_gfx_meta_data, de_payload); + de_payload_cpu_addr = adev->virt.csa_cpu_addr + payload_offset; + } + + if (offset + (payload_size >> 2) <= ring->buf_mask + 1) { + memcpy((void *)&ring->ring[offset], de_payload_cpu_addr, payload_size); + } else { + memcpy((void *)&ring->ring[offset], de_payload_cpu_addr, + (ring->buf_mask + 1 - offset) << 2); + payload_size -= (ring->buf_mask + 1 - offset) << 2; + memcpy((void *)&ring->ring[0], + de_payload_cpu_addr + ((ring->buf_mask + 1 - offset) << 2), + payload_size); + } +} + static void gfx_v9_0_ring_emit_ib_compute(struct amdgpu_ring *ring, struct amdgpu_job *job, struct amdgpu_ib *ib, @@ -5337,6 +5411,8 @@ static void gfx_v9_0_ring_emit_ce_meta(struct amdgpu_ring *ring, bool resume) amdgpu_ring_write(ring, lower_32_bits(ce_payload_gpu_addr)); amdgpu_ring_write(ring, upper_32_bits(ce_payload_gpu_addr)); + amdgpu_ring_ib_on_emit_ce(ring); + if (resume) amdgpu_ring_write_multiple(ring, ce_payload_cpu_addr, sizeof(ce_payload) >> 2); @@ -5448,6 +5524,7 @@ static void 
gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring, bool resume, bo amdgpu_ring_write(ring, lower_32_bits(de_payload_gpu_addr)); amdgpu_ring_write(ring, upper_32_bits(de_payload_gpu_addr)); + amdgpu_ring_ib_on_emit_de(ring); if (resume) amdgpu_ring_write_multiple(ring, de_payload_cpu_addr, sizeof(de_payload) >> 2); @@ -6858,6 +6935,9 @@ static const struct amdgpu_ring_funcs gfx_v9_0_sw_ring_funcs_gfx = { .emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait, .soft_recovery = gfx_v9_0_ring_soft_recovery, .emit_mem_sync = gfx_v9_0_emit_mem_sync, + .patch_cntl = gfx_v9_0_ring_patch_cntl, + .patch_de = gfx_v9_0_ring_patch_de_meta, + .patch_ce = gfx_v9_0_ring_patch_ce_meta, }; static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = { -- cgit v1.2.3 From e61f67749b351c19455ce3085af2ae9af80023bc Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 6 Jun 2023 11:14:04 -0400 Subject: drm/amdgpu: add missing radeon secondary PCI ID MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 0x5b70 is a missing RV370 secondary id. Add it so we don't try and probe it with amdgpu. Cc: michel@daenzer.net Reviewed-by: Michel Dänzer Tested-by: Michel Dänzer Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index b1ca1ab6d6ad..393b6fb7a71d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -1615,6 +1615,7 @@ static const u16 amdgpu_unsupported_pciidlist[] = { 0x5874, 0x5940, 0x5941, + 0x5b70, 0x5b72, 0x5b73, 0x5b74, -- cgit v1.2.3 From 3eb1a3a04056ba3df3205e169b8acc9da0c65a94 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 7 Jun 2023 01:41:22 -0500 Subject: drm/amd: Make sure image is written to trigger VBIOS image update flow The VBIOS image update flow requires userspace to: 1) Write the image to `psp_vbflash` 2) Read `psp_vbflash` 3) Poll `psp_vbflash_status` to check for completion If userspace reads `psp_vbflash` before writing an image, it's possible that it causes problems that can put the dGPU into an invalid state. Explicitly check that an image has been written before letting a read succeed. Cc: stable@vger.kernel.org Fixes: 8424f2ccb3c0 ("drm/amdgpu/psp: Add vbflash sysfs interface support") Reviewed-by: Alex Deucher Signed-off-by: Mario Limonciello Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index 9d7e6e0e73ed..cf2ffe6fe4a2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -3548,6 +3548,9 @@ static ssize_t amdgpu_psp_vbflash_read(struct file *filp, struct kobject *kobj, void *fw_pri_cpu_addr; int ret; + if (adev->psp.vbflash_image_size == 0) + return -EINVAL; + dev_info(adev->dev, "VBIOS flash to PSP started"); ret = amdgpu_bo_create_kernel(adev, adev->psp.vbflash_image_size, -- cgit v1.2.3 From 7ab1a4913d0051cf5196ef7987b5fa42c25e13b6 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 7 Jun 2023 01:45:20 -0500 Subject: drm/amd: Tighten permissions on VBIOS flashing attributes Non-root users shouldn't be able to try to trigger a VBIOS flash or query the flashing status. This should be reserved for users with the appropriate permissions. 
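Decoding the two mode changes in the diff below (standard octal permission triplets; recall from the previous patch that reading psp_vbflash is what triggers the flash):

/*
 * psp_vbflash:        0664 (rw-rw-r--)  ->  0660 (rw-rw----)
 *     the world-readable bit would have let any user start a flash
 *
 * psp_vbflash_status: 0444 (r--r--r--)  ->  0440 (r--r-----)
 *     status polling likewise restricted to owner/group (root)
 */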
Cc: stable@vger.kernel.org Fixes: 8424f2ccb3c0 ("drm/amdgpu/psp: Add vbflash sysfs interface support") Reviewed-by: Alex Deucher Signed-off-by: Mario Limonciello Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index cf2ffe6fe4a2..a150b7a4b4aa 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -3602,13 +3602,13 @@ static ssize_t amdgpu_psp_vbflash_status(struct device *dev, } static const struct bin_attribute psp_vbflash_bin_attr = { - .attr = {.name = "psp_vbflash", .mode = 0664}, + .attr = {.name = "psp_vbflash", .mode = 0660}, .size = 0, .write = amdgpu_psp_vbflash_write, .read = amdgpu_psp_vbflash_read, }; -static DEVICE_ATTR(psp_vbflash_status, 0444, amdgpu_psp_vbflash_status, NULL); +static DEVICE_ATTR(psp_vbflash_status, 0440, amdgpu_psp_vbflash_status, NULL); int amdgpu_psp_sysfs_init(struct amdgpu_device *adev) { -- cgit v1.2.3 From 7ca302d488f80cf4529620acc1c545f9022d8bb8 Mon Sep 17 00:00:00 2001 From: Kenneth Feng Date: Thu, 8 Jun 2023 22:07:11 +0800 Subject: drm/amd/pm: workaround for compute workload type on some skus On smu 13.0.0, the compute workload type cannot be set on all the skus due to some other problems. This workaround is to make sure compute workload type can also run on some specific skus. v2: keep the variable consistent Signed-off-by: Kenneth Feng Acked-by: Lijo Lazar Reviewed-by: Feifei Xu Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org # 6.1.x --- .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c | 33 ++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c index 09405ef1e3c8..08577d1b84ec 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c @@ -1696,10 +1696,39 @@ static int smu_v13_0_0_set_power_profile_mode(struct smu_context *smu, } } - /* conv PP_SMC_POWER_PROFILE* to WORKLOAD_PPLIB_*_BIT */ - workload_type = smu_cmn_to_asic_specific_index(smu, + if (smu->power_profile_mode == PP_SMC_POWER_PROFILE_COMPUTE && + (((smu->adev->pdev->device == 0x744C) && (smu->adev->pdev->revision == 0xC8)) || + ((smu->adev->pdev->device == 0x744C) && (smu->adev->pdev->revision == 0xCC)))) { + ret = smu_cmn_update_table(smu, + SMU_TABLE_ACTIVITY_MONITOR_COEFF, + WORKLOAD_PPLIB_COMPUTE_BIT, + (void *)(&activity_monitor_external), + false); + if (ret) { + dev_err(smu->adev->dev, "[%s] Failed to get activity monitor!", __func__); + return ret; + } + + ret = smu_cmn_update_table(smu, + SMU_TABLE_ACTIVITY_MONITOR_COEFF, + WORKLOAD_PPLIB_CUSTOM_BIT, + (void *)(&activity_monitor_external), + true); + if (ret) { + dev_err(smu->adev->dev, "[%s] Failed to set activity monitor!", __func__); + return ret; + } + + workload_type = smu_cmn_to_asic_specific_index(smu, + CMN2ASIC_MAPPING_WORKLOAD, + PP_SMC_POWER_PROFILE_CUSTOM); + } else { + /* conv PP_SMC_POWER_PROFILE* to WORKLOAD_PPLIB_*_BIT */ + workload_type = smu_cmn_to_asic_specific_index(smu, CMN2ASIC_MAPPING_WORKLOAD, smu->power_profile_mode); + } + if (workload_type < 0) return -EINVAL; -- cgit v1.2.3 From 7ac9be96b0113a34c33110b32912642bdc8ff33d Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Fri, 9 Jun 2023 16:03:56 +0200 Subject: drm/radeon: Disable outputs when releasing fbdev client 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Disable the modesetting pipeline before release the radeon's fbdev client. Fixes the following error: [ 17.217408] WARNING: CPU: 5 PID: 1464 at drivers/gpu/drm/ttm/ttm_bo.c:326 ttm_bo_release+0x27e/0x2d0 [ttm] [ 17.217418] Modules linked in: edac_mce_amd radeon(+) drm_ttm_helper ttm video drm_suballoc_helper drm_display_helper kvm irqbypass drm_kms_helper syscopyarea crc32_pclmul sysfillrect sha512_ssse3 sysimgblt sha512_generic cfbfillrect cfbimgblt wmi_bmof aesni_intel cfbcopyarea crypto_simd cryptd k10temp acpi_cpufreq wmi dm_mod [ 17.217432] CPU: 5 PID: 1464 Comm: systemd-udevd Not tainted 6.4.0-rc4+ #1 [ 17.217436] Hardware name: Micro-Star International Co., Ltd. MS-7A38/B450M PRO-VDH MAX (MS-7A38), BIOS B.G0 07/26/2022 [ 17.217438] RIP: 0010:ttm_bo_release+0x27e/0x2d0 [ttm] [ 17.217444] Code: 48 89 43 38 48 89 43 40 48 8b 5c 24 30 48 8b b5 40 08 00 00 48 8b 6c 24 38 48 83 c4 58 e9 7a 49 f7 e0 48 89 ef e9 6c fe ff ff <0f> 0b 48 83 7b 20 00 0f 84 b7 fd ff ff 0f 0b 0f 1f 00 e9 ad fd ff [ 17.217448] RSP: 0018:ffffc9000095fbb0 EFLAGS: 00010202 [ 17.217451] RAX: 0000000000000001 RBX: ffff8881052c8de0 RCX: 0000000000000000 [ 17.217453] RDX: 0000000000000001 RSI: 0000000000000000 RDI: ffff8881052c8de0 [ 17.217455] RBP: ffff888104a66e00 R08: ffff8881052c8de0 R09: ffff888104a7cf08 [ 17.217457] R10: ffffc9000095fbe0 R11: ffffc9000095fbe8 R12: ffff8881052c8c78 [ 17.217458] R13: ffff8881052c8c78 R14: dead000000000100 R15: ffff88810528b108 [ 17.217460] FS: 00007f319fcbb8c0(0000) GS:ffff88881a540000(0000) knlGS:0000000000000000 [ 17.217463] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 17.217464] CR2: 000055dc8b0224a0 CR3: 000000010373d000 CR4: 0000000000750ee0 [ 17.217466] PKRU: 55555554 [ 17.217468] Call Trace: [ 17.217470] [ 17.217472] ? __warn+0x97/0x160 [ 17.217476] ? ttm_bo_release+0x27e/0x2d0 [ttm] [ 17.217481] ? report_bug+0x1ec/0x200 [ 17.217487] ? handle_bug+0x3c/0x70 [ 17.217490] ? exc_invalid_op+0x1f/0x90 [ 17.217493] ? preempt_count_sub+0xb5/0x100 [ 17.217496] ? asm_exc_invalid_op+0x16/0x20 [ 17.217500] ? ttm_bo_release+0x27e/0x2d0 [ttm] [ 17.217505] ? ttm_resource_move_to_lru_tail+0x1ab/0x1d0 [ttm] [ 17.217511] radeon_bo_unref+0x1a/0x30 [radeon] [ 17.217547] radeon_gem_object_free+0x20/0x30 [radeon] [ 17.217579] radeon_fbdev_fb_destroy+0x57/0x90 [radeon] [ 17.217616] unregister_framebuffer+0x72/0x110 [ 17.217620] drm_client_dev_unregister+0x6d/0xe0 [ 17.217623] drm_dev_unregister+0x2e/0x90 [ 17.217626] drm_put_dev+0x26/0x90 [ 17.217628] pci_device_remove+0x44/0xc0 [ 17.217631] really_probe+0x257/0x340 [ 17.217635] __driver_probe_device+0x73/0x120 [ 17.217638] driver_probe_device+0x2c/0xb0 [ 17.217641] __driver_attach+0xa0/0x150 [ 17.217643] ? __pfx___driver_attach+0x10/0x10 [ 17.217646] bus_for_each_dev+0x67/0xa0 [ 17.217649] bus_add_driver+0x10e/0x210 [ 17.217651] driver_register+0x5c/0x120 [ 17.217653] ? __pfx_radeon_module_init+0x10/0x10 [radeon] [ 17.217681] do_one_initcall+0x44/0x220 [ 17.217684] ? 
kmalloc_trace+0x37/0xc0 [ 17.217688] do_init_module+0x64/0x240 [ 17.217691] __do_sys_finit_module+0xb2/0x100 [ 17.217694] do_syscall_64+0x3b/0x90 [ 17.217697] entry_SYSCALL_64_after_hwframe+0x72/0xdc [ 17.217700] RIP: 0033:0x7f319feaa5a9 [ 17.217702] Code: 08 89 e8 5b 5d c3 66 2e 0f 1f 84 00 00 00 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 27 08 0d 00 f7 d8 64 89 01 48 [ 17.217706] RSP: 002b:00007ffc6bf3e7f8 EFLAGS: 00000246 ORIG_RAX: 0000000000000139 [ 17.217709] RAX: ffffffffffffffda RBX: 00005607204f3170 RCX: 00007f319feaa5a9 [ 17.217710] RDX: 0000000000000000 RSI: 00007f31a002eefd RDI: 0000000000000018 [ 17.217712] RBP: 00007f31a002eefd R08: 0000000000000000 R09: 00005607204f1860 [ 17.217714] R10: 0000000000000018 R11: 0000000000000246 R12: 0000000000020000 [ 17.217716] R13: 0000000000000000 R14: 0000560720522450 R15: 0000560720255899 [ 17.217718] [ 17.217719] ---[ end trace 0000000000000000 ]--- The buffer object backing the fbdev emulation got pinned twice: by the fb_probe helper radeon_fbdev_create_pinned_object() and the modesetting code when the framebuffer got displayed. It only got unpinned once by the fbdev helper radeon_fbdev_destroy_pinned_object(). Hence TTM's BO- release function complains about the pin counter. Forcing the outputs off also undoes the modesettings pin increment. Tested-by: Borislav Petkov (AMD) Reported-by: Borislav Petkov Closes: https://lore.kernel.org/dri-devel/20230603174814.GCZHt83pN+wNjf63sC@fat_crate.local/ Signed-off-by: Thomas Zimmermann Fixes: e317a69fe891 ("drm/radeon: Implement client-based fbdev emulation") Cc: Alex Deucher Cc: Thomas Zimmermann Cc: "Christian König" Cc: "Pan, Xinhui" Cc: amd-gfx@lists.freedesktop.org Signed-off-by: Alex Deucher --- drivers/gpu/drm/radeon/radeon_fbdev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/radeon/radeon_fbdev.c b/drivers/gpu/drm/radeon/radeon_fbdev.c index fe76e29910ef..8f6c3aef0962 100644 --- a/drivers/gpu/drm/radeon/radeon_fbdev.c +++ b/drivers/gpu/drm/radeon/radeon_fbdev.c @@ -307,6 +307,7 @@ static void radeon_fbdev_client_unregister(struct drm_client_dev *client) if (fb_helper->info) { vga_switcheroo_client_fb_set(rdev->pdev, NULL); + drm_helper_force_disable_all(dev); drm_fb_helper_unregister_info(fb_helper); } else { drm_client_release(&fb_helper->client); -- cgit v1.2.3 From 9db5ec1ceb5303398ec4f899d691073d531257c3 Mon Sep 17 00:00:00 2001 From: Sonny Jiang Date: Tue, 6 Jun 2023 17:18:52 -0400 Subject: drm/amdgpu: vcn_4_0 set instance 0 init sched score to 1 Only vcn0 can process AV1 codecx. In order to use both vcn0 and vcn1 in h264/265 transcode to AV1 cases, set vcn0 sched score to 1 at initialization time. 
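A simplified sketch of why the initial score matters (hypothetical helper; the real selection happens in the drm scheduler): jobs that can run on any instance land on the lowest-score ring, and the score rises with queued work, so seeding instance 0 with 1 steers non-AV1 streams to instance 1 first.

static struct amdgpu_ring *pick_instance(struct amdgpu_ring **rings, int n)
{
	struct amdgpu_ring *best = rings[0];
	int i;

	for (i = 1; i < n; i++)
		if (atomic_read(rings[i]->sched_score) <
		    atomic_read(best->sched_score))
			best = rings[i];
	return best;	/* ties keep the earlier ring, so instance 0 yields */
}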
Signed-off-by: Sonny Jiang Reviewed-by: Leo Liu Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org # 6.1.x --- drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c index e5fd1e00914d..da126ff8bcbc 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c @@ -129,7 +129,11 @@ static int vcn_v4_0_sw_init(void *handle) if (adev->vcn.harvest_config & (1 << i)) continue; - atomic_set(&adev->vcn.inst[i].sched_score, 0); + /* Init instance 0 sched_score to 1, so it's scheduled after other instances */ + if (i == 0) + atomic_set(&adev->vcn.inst[i].sched_score, 1); + else + atomic_set(&adev->vcn.inst[i].sched_score, 0); /* VCN UNIFIED TRAP */ r = amdgpu_irq_add_id(adev, amdgpu_ih_clientid_vcns[i], -- cgit v1.2.3 From 34e5a54327dce5033582f3609eb54812a8c61b90 Mon Sep 17 00:00:00 2001 From: Arunpravin Paneer Selvam Date: Fri, 9 Jun 2023 06:18:41 -0700 Subject: Revert "drm/amdgpu: remove TOPDOWN flags when allocating VRAM in large bar system" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit c105518679b6e87232874ffc989ec403bee59664. This patch disables the TOPDOWN flag for APU and few dGPU cards which has the VRAM size equal to the BAR size. When we enable the TOPDOWN flag, we get the free blocks at the highest available memory region and we don't split the lower order blocks. This change is required to keep off the fragmentation related issues particularly in ASIC which has VRAM space <= 500MiB Hence, we are reverting this patch. Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2270 Signed-off-by: Arunpravin Paneer Selvam Reviewed-by: Christian König Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index 3b225be89cb7..a70103ac0026 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -140,7 +140,7 @@ void amdgpu_bo_placement_from_domain(struct amdgpu_bo *abo, u32 domain) if (flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED) places[c].lpfn = visible_pfn; - else if (adev->gmc.real_vram_size != adev->gmc.visible_vram_size) + else places[c].flags |= TTM_PL_FLAG_TOPDOWN; if (flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) -- cgit v1.2.3 From e749dd10e5f292061ad63d2b030194bf7d7d452c Mon Sep 17 00:00:00 2001 From: Hersen Wu Date: Thu, 25 May 2023 08:37:40 -0400 Subject: drm/amd/display: edp do not add non-edid timings [Why] most edp support only timings from edid. applying non-edid timings, especially those timings out of edp bandwidth, may damage edp. [How] do not add non-edid timings for edp. 
Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Acked-by: Stylon Wang Signed-off-by: Hersen Wu Reviewed-by: Roman Li Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index d5cec03eaa8d..2b8d42de1f73 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -7196,7 +7196,13 @@ static int amdgpu_dm_connector_get_modes(struct drm_connector *connector) drm_add_modes_noedid(connector, 1920, 1080); } else { amdgpu_dm_connector_ddc_get_modes(connector, edid); - amdgpu_dm_connector_add_common_modes(encoder, connector); + /* most eDP supports only timings from its edid, + * usually only detailed timings are available + * from eDP edid. timings which are not from edid + * may damage eDP + */ + if (connector->connector_type != DRM_MODE_CONNECTOR_eDP) + amdgpu_dm_connector_add_common_modes(encoder, connector); amdgpu_dm_connector_add_freesync_modes(connector, edid); } amdgpu_dm_fbc_init(connector); -- cgit v1.2.3 From ea2062dd1f0384ae1b136d333ee4ced15bedae38 Mon Sep 17 00:00:00 2001 From: Tom Chung Date: Mon, 29 May 2023 18:00:09 +0800 Subject: drm/amd/display: fix the system hang while disable PSR [Why] When PSR is enabled, adjusting the timing parameters may cause a system hang, because the timing then mismatches the DMCUB settings. [How] Disable the PSR before adjusting timing parameters. Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Acked-by: Stylon Wang Signed-off-by: Tom Chung Reviewed-by: Wayne Lin Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 2b8d42de1f73..7acd73e5004f 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -8204,6 +8204,12 @@ static void amdgpu_dm_commit_planes(struct drm_atomic_state *state, if (acrtc_state->abm_level != dm_old_crtc_state->abm_level) bundle->stream_update.abm_level = &acrtc_state->abm_level; + mutex_lock(&dm->dc_lock); + if ((acrtc_state->update_type > UPDATE_TYPE_FAST) && + acrtc_state->stream->link->psr_settings.psr_allow_active) + amdgpu_dm_psr_disable(acrtc_state->stream); + mutex_unlock(&dm->dc_lock); + /* * If FreeSync state on the stream has changed then we need to * re-adjust the min/max bounds now that DC doesn't handle this @@ -8217,10 +8223,6 @@ static void amdgpu_dm_commit_planes(struct drm_atomic_state *state, spin_unlock_irqrestore(&pcrtc->dev->event_lock, flags); } mutex_lock(&dm->dc_lock); - if ((acrtc_state->update_type > UPDATE_TYPE_FAST) && - acrtc_state->stream->link->psr_settings.psr_allow_active) - amdgpu_dm_psr_disable(acrtc_state->stream); - update_planes_and_stream_adapter(dm->dc, acrtc_state->update_type, planes_count, -- cgit v1.2.3 From 7c5835bcb9176df94683396f1c0e5df6bf5094b3 Mon Sep 17 00:00:00 2001 From: Peichen Huang Date: Wed, 31 May 2023 13:36:14 +0800 Subject: drm/amd/display: limit DPIA link rate to HBR3 [Why] DPIA doesn't support UHBR; the driver should not enable UHBR for DP tunneling. [How] Limit the DPIA link rate to HBR3. Cc: Mario Limonciello Cc: Alex Deucher Cc:
stable@vger.kernel.org Acked-by: Stylon Wang Signed-off-by: Peichen Huang Reviewed-by: Mustapha Ghaddar Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/link/link_detection.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/link/link_detection.c b/drivers/gpu/drm/amd/display/dc/link/link_detection.c index a131e30fd7d6..d471d58aba92 100644 --- a/drivers/gpu/drm/amd/display/dc/link/link_detection.c +++ b/drivers/gpu/drm/amd/display/dc/link/link_detection.c @@ -980,6 +980,11 @@ static bool detect_link_and_local_sink(struct dc_link *link, (link->dpcd_caps.dongle_type != DISPLAY_DONGLE_DP_HDMI_CONVERTER)) converter_disable_audio = true; + + /* limited link rate to HBR3 for DPIA until we implement USB4 V2 */ + if (link->ep_type == DISPLAY_ENDPOINT_USB4_DPIA && + link->reported_link_cap.link_rate > LINK_RATE_HIGH3) + link->reported_link_cap.link_rate = LINK_RATE_HIGH3; break; } -- cgit v1.2.3 From 0108a4e9f3584a7a2c026d1601b0682ff7335d95 Mon Sep 17 00:00:00 2001 From: Krister Johansen Date: Mon, 12 Jun 2023 17:44:40 -0700 Subject: bpf: ensure main program has an extable When subprograms are in use, the main program is not jit'd after the subprograms because jit_subprogs sets a value for prog->bpf_func upon success. Subsequent calls to the JIT are bypassed when this value is non-NULL. This leads to a situation where the main program and its func[0] counterpart are both in the bpf kallsyms tree, but only func[0] has an extable. Extables are only created during JIT. Now there are two nearly identical program ksym entries in the tree, but only one has an extable. Depending upon how the entries are placed, there's a chance that a fault will call search_extable on the aux with the NULL entry. Since jit_subprogs already copies state from func[0] to the main program, include the extable pointer in this state duplication. Additionally, ensure that the copy of the main program in func[0] is not added to the bpf_prog_kallsyms table. Instead, let the main program get added later in bpf_prog_load(). This ensures there is only a single copy of the main program in the kallsyms table, and that its tag matches the tag observed by tooling like bpftool. Cc: stable@vger.kernel.org Fixes: 1c2a088a6626 ("bpf: x64: add JIT support for multi-function programs") Signed-off-by: Krister Johansen Acked-by: Yonghong Song Acked-by: Ilya Leoshkevich Tested-by: Ilya Leoshkevich Link: https://lore.kernel.org/r/6de9b2f4b4724ef56efbb0339daaa66c8b68b1e7.1686616663.git.kjlx@templeofstupid.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0dd8adc7a159..cf5f230360f5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -17217,9 +17217,10 @@ static int jit_subprogs(struct bpf_verifier_env *env) } /* finally lock prog and jit images for all functions and - * populate kallsysm + * populate kallsysm. Begin at the first subprogram, since + * bpf_prog_load will add the kallsyms for the main program. 
*/ - for (i = 0; i < env->subprog_cnt; i++) { + for (i = 1; i < env->subprog_cnt; i++) { bpf_prog_lock_ro(func[i]); bpf_prog_kallsyms_add(func[i]); } @@ -17245,6 +17246,8 @@ static int jit_subprogs(struct bpf_verifier_env *env) prog->jited = 1; prog->bpf_func = func[0]->bpf_func; prog->jited_len = func[0]->jited_len; + prog->aux->extable = func[0]->aux->extable; + prog->aux->num_exentries = func[0]->aux->num_exentries; prog->aux->func = func; prog->aux->func_cnt = env->subprog_cnt; bpf_prog_jit_attempt_done(prog); -- cgit v1.2.3 From 84a62b445c86d0f2c9831290ffecee7269f18254 Mon Sep 17 00:00:00 2001 From: Krister Johansen Date: Mon, 12 Jun 2023 17:44:54 -0700 Subject: selftests/bpf: add a test for subprogram extables In certain situations a program with subprograms may have a NULL extable entry. This should not happen, and when it does, it turns a single trap into multiple. Add a test case for further debugging and to prevent regressions. The test-case contains three essentially identical versions of the same test because just one program may not be sufficient to trigger the oops. This is due to the fact that the items are stored in a binary tree and have identical values so it's possible to sometimes find the ksym with the extable. With 3 copies, this has been reliable on this author's test systems. When triggered out of this test case, the oops looks like this: BUG: kernel NULL pointer dereference, address: 000000000000000c #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP NOPTI CPU: 0 PID: 1132 Comm: test_progs Tainted: G OE 6.4.0-rc3+ #2 RIP: 0010:cmp_ex_search+0xb/0x30 Code: cc cc cc cc e8 36 cb 03 00 66 0f 1f 44 00 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 f3 0f 1e fa 55 48 89 e5 48 8b 07 <48> 63 0e 48 01 f1 31 d2 48 39 c8 19 d2 48 39 c8 b8 01 00 00 00 0f RSP: 0018:ffffb30c4291f998 EFLAGS: 00010006 RAX: ffffffffc00b49da RBX: 0000000000000002 RCX: 000000000000000c RDX: 0000000000000002 RSI: 000000000000000c RDI: ffffb30c4291f9e8 RBP: ffffb30c4291f998 R08: ffffffffab1a42d0 R09: 0000000000000001 R10: 0000000000000000 R11: ffffffffab1a42d0 R12: ffffb30c4291f9e8 R13: 000000000000000c R14: 000000000000000c R15: 0000000000000000 FS: 00007fb5d9e044c0(0000) GS:ffff92e95ee00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000000000000c CR3: 000000010c3a2005 CR4: 00000000007706f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: bsearch+0x41/0x90 ? __pfx_cmp_ex_search+0x10/0x10 ? bpf_prog_45a7907e7114d0ff_handle_fexit_ret_subprogs3+0x2a/0x6c search_extable+0x3b/0x60 ? bpf_prog_45a7907e7114d0ff_handle_fexit_ret_subprogs3+0x2a/0x6c search_bpf_extables+0x10d/0x190 ? bpf_prog_45a7907e7114d0ff_handle_fexit_ret_subprogs3+0x2a/0x6c search_exception_tables+0x5d/0x70 fixup_exception+0x3f/0x5b0 ? look_up_lock_class+0x61/0x110 ? __lock_acquire+0x6b8/0x3560 ? __lock_acquire+0x6b8/0x3560 ? __lock_acquire+0x6b8/0x3560 kernelmode_fixup_or_oops+0x46/0x110 __bad_area_nosemaphore+0x68/0x2b0 ? 
__lock_acquire+0x6b8/0x3560 bad_area_nosemaphore+0x16/0x20 do_kern_addr_fault+0x81/0xa0 exc_page_fault+0xd6/0x210 asm_exc_page_fault+0x2b/0x30 RIP: 0010:bpf_prog_45a7907e7114d0ff_handle_fexit_ret_subprogs3+0x2a/0x6c Code: f3 0f 1e fa 0f 1f 44 00 00 66 90 55 48 89 e5 f3 0f 1e fa 48 8b 7f 08 49 bb 00 00 00 00 00 80 00 00 4c 39 df 73 04 31 f6 eb 04 <48> 8b 77 00 49 bb 00 00 00 00 00 80 00 00 48 81 c7 7c 00 00 00 4c RSP: 0018:ffffb30c4291fcb8 EFLAGS: 00010282 RAX: 0000000000000001 RBX: 0000000000000001 RCX: 0000000000000000 RDX: 00000000cddf1af1 RSI: 000000005315a00d RDI: ffffffffffffffea RBP: ffffb30c4291fcb8 R08: ffff92e644bf38a8 R09: 0000000000000000 R10: 0000000000000000 R11: 0000800000000000 R12: ffff92e663652690 R13: 00000000000001c8 R14: 00000000000001c8 R15: 0000000000000003 bpf_trampoline_251255721842_2+0x63/0x1000 bpf_testmod_return_ptr+0x9/0xb0 [bpf_testmod] ? bpf_testmod_test_read+0x43/0x2d0 [bpf_testmod] sysfs_kf_bin_read+0x60/0x90 kernfs_fop_read_iter+0x143/0x250 vfs_read+0x240/0x2a0 ksys_read+0x70/0xe0 __x64_sys_read+0x1f/0x30 do_syscall_64+0x68/0xa0 ? syscall_exit_to_user_mode+0x77/0x1f0 ? do_syscall_64+0x77/0xa0 ? irqentry_exit+0x35/0xa0 ? sysvec_apic_timer_interrupt+0x4d/0x90 entry_SYSCALL_64_after_hwframe+0x72/0xdc RIP: 0033:0x7fb5da00a392 Code: ac 00 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb be 0f 1f 80 00 00 00 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 0f 05 <48> 3d 00 f0 ff ff 77 56 c3 0f 1f 44 00 00 48 83 ec 28 48 89 54 24 RSP: 002b:00007ffc5b3cab68 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 RAX: ffffffffffffffda RBX: 000055bee7b8b100 RCX: 00007fb5da00a392 RDX: 00000000000001c8 RSI: 0000000000000000 RDI: 0000000000000009 RBP: 00007ffc5b3caba0 R08: 0000000000000000 R09: 0000000000000037 R10: 000055bee7b8c2a7 R11: 0000000000000246 R12: 000055bee78f1f60 R13: 00007ffc5b3cae90 R14: 0000000000000000 R15: 0000000000000000 Modules linked in: bpf_testmod(OE) nls_iso8859_1 dm_multipath scsi_dh_rdac scsi_dh_emc scsi_dh_alua intel_rapl_msr intel_rapl_common intel_uncore_frequency_common ppdev nfit crct10dif_pclmul crc32_pclmul psmouse ghash_clmulni_intel sha512_ssse3 aesni_intel parport_pc crypto_simd cryptd input_leds parport rapl ena i2c_piix4 mac_hid serio_raw ramoops reed_solomon pstore_blk drm pstore_zone efi_pstore autofs4 [last unloaded: bpf_testmod(OE)] CR2: 000000000000000c Though there may be some variation, depending on which subprogram triggers the bug.
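To make the failure mode concrete, here is a simplified, self-contained sketch of an exception-table lookup (the struct and names are illustrative; the kernel uses bsearch() with cmp_ex_search()). The search dereferences the table unconditionally, so a ksym whose extable pointer is NULL faults on a near-NULL address:

    /* Hypothetical, simplified extable entry and lookup */
    struct ex_entry { int insn_off; int fixup_off; };

    static const struct ex_entry *find_fixup(const struct ex_entry *table,
                                             unsigned int num,
                                             unsigned int off)
    {
        unsigned int lo = 0, hi = num;

        while (lo < hi) {
            unsigned int mid = lo + (hi - lo) / 2;

            /* With table == NULL this read is the oops shown above */
            if ((unsigned int)table[mid].insn_off < off)
                lo = mid + 1;
            else
                hi = mid;
        }

        if (lo < num && (unsigned int)table[lo].insn_off == off)
            return &table[lo];

        return NULL; /* no fixup found */
    }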
Signed-off-by: Krister Johansen Acked-by: Yonghong Song Link: https://lore.kernel.org/r/4ebf95ec857cd785b81db69f3e408c039ad8408b.1686616663.git.kjlx@templeofstupid.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/subprogs_extable.c | 29 ++++++++++++ .../selftests/bpf/progs/test_subprogs_extable.c | 51 ++++++++++++++++++++++ 2 files changed, 80 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/subprogs_extable.c create mode 100644 tools/testing/selftests/bpf/progs/test_subprogs_extable.c diff --git a/tools/testing/selftests/bpf/prog_tests/subprogs_extable.c b/tools/testing/selftests/bpf/prog_tests/subprogs_extable.c new file mode 100644 index 000000000000..3afd9f775f68 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/subprogs_extable.c @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <test_progs.h> +#include "test_subprogs_extable.skel.h" + +void test_subprogs_extable(void) +{ + const int read_sz = 456; + struct test_subprogs_extable *skel; + int err; + + skel = test_subprogs_extable__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open_and_load")) + return; + + err = test_subprogs_extable__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto cleanup; + + /* trigger tracepoint */ + ASSERT_OK(trigger_module_test_read(read_sz), "trigger_read"); + + ASSERT_NEQ(skel->bss->triggered, 0, "verify at least one program ran"); + + test_subprogs_extable__detach(skel); + +cleanup: + test_subprogs_extable__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/test_subprogs_extable.c b/tools/testing/selftests/bpf/progs/test_subprogs_extable.c new file mode 100644 index 000000000000..e2a21fbd4e44 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_subprogs_extable.c @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 8); + __type(key, __u32); + __type(value, __u64); +} test_array SEC(".maps"); + +unsigned int triggered; + +static __u64 test_cb(struct bpf_map *map, __u32 *key, __u64 *val, void *data) +{ + return 1; +} + +SEC("fexit/bpf_testmod_return_ptr") +int BPF_PROG(handle_fexit_ret_subprogs, int arg, struct file *ret) +{ + *(volatile long *)ret; + *(volatile int *)&ret->f_mode; + bpf_for_each_map_elem(&test_array, test_cb, NULL, 0); + triggered++; + return 0; +} + +SEC("fexit/bpf_testmod_return_ptr") +int BPF_PROG(handle_fexit_ret_subprogs2, int arg, struct file *ret) +{ + *(volatile long *)ret; + *(volatile int *)&ret->f_mode; + bpf_for_each_map_elem(&test_array, test_cb, NULL, 0); + triggered++; + return 0; +} + +SEC("fexit/bpf_testmod_return_ptr") +int BPF_PROG(handle_fexit_ret_subprogs3, int arg, struct file *ret) +{ + *(volatile long *)ret; + *(volatile int *)&ret->f_mode; + bpf_for_each_map_elem(&test_array, test_cb, NULL, 0); + triggered++; + return 0; +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From cac9e4418f4cbd548ccb065b3adcafe073f7f7d2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 12 Jun 2023 13:51:36 -0600 Subject: io_uring/net: save msghdr->msg_control for retries If the application sets ->msg_control and we have to later retry this command, or if it got queued with IOSQE_ASYNC to begin with, then we need to retain the original msg_control value. This is due to the net stack overwriting this field with an in-kernel pointer, to copy it in. Hitting that path for the second time will now fail the copy from user, as it's attempting to copy from a non-user address.
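A minimal liburing sketch of the affected pattern (illustrative; send_with_cmsg() is a made-up helper): a sendmsg SQE whose msghdr carries ancillary data and is forced down the async path is prepared once but may be issued twice, which is exactly when the saved msg_control matters:

    #include <liburing.h>

    static int send_with_cmsg(struct io_uring *ring, int fd,
                              struct msghdr *msg)
    {
        struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

        if (!sqe)
            return -1;

        io_uring_prep_sendmsg(sqe, fd, msg, 0);
        sqe->flags |= IOSQE_ASYNC; /* force the queued/retry path */

        return io_uring_submit(ring);
    }

With the fix, the saved sr->msg_control is restored into the msghdr before the second issue, so the copy again runs on the user pointer.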
Cc: stable@vger.kernel.org # 5.10+ Link: https://github.com/axboe/liburing/issues/880 Reported-and-tested-by: Marek Majkowski Signed-off-by: Jens Axboe --- io_uring/net.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/io_uring/net.c b/io_uring/net.c index 89e839013837..51b0f7fbb4f5 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -65,6 +65,7 @@ struct io_sr_msg { u16 addr_len; u16 buf_group; void __user *addr; + void __user *msg_control; /* used only for send zerocopy */ struct io_kiocb *notif; }; @@ -195,11 +196,15 @@ static int io_sendmsg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); + int ret; iomsg->msg.msg_name = &iomsg->addr; iomsg->free_iov = iomsg->fast_iov; - return sendmsg_copy_msghdr(&iomsg->msg, sr->umsg, sr->msg_flags, + ret = sendmsg_copy_msghdr(&iomsg->msg, sr->umsg, sr->msg_flags, &iomsg->free_iov); + /* save msg_control as sys_sendmsg() overwrites it */ + sr->msg_control = iomsg->msg.msg_control; + return ret; } int io_send_prep_async(struct io_kiocb *req) @@ -297,6 +302,7 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) if (req_has_async_data(req)) { kmsg = req->async_data; + kmsg->msg.msg_control = sr->msg_control; } else { ret = io_sendmsg_copy_hdr(req, &iomsg); if (ret) -- cgit v1.2.3 From 3c40eb8145325b0f5b93b8a169146078cb2c49d6 Mon Sep 17 00:00:00 2001 From: Martin Hundebøll Date: Wed, 7 Jun 2023 10:27:12 +0200 Subject: mmc: meson-gx: remove redundant mmc_request_done() call from irq context MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The call to mmc_request_done() can schedule, so it must not be called from irq context. Wake the irq thread if it needs to be called, and let its existing logic do its work. 
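The resulting shape is the standard split handler (sketch below; the foo_* names are hypothetical, with both halves registered via request_threaded_irq()): the hard IRQ half only touches registers and defers, and the threaded half, where sleeping is allowed, completes the request:

    /* Hard IRQ context: must not sleep */
    static irqreturn_t foo_irq(int irq, void *dev_id)
    {
        struct foo_host *host = dev_id;

        foo_ack_irq(host);        /* non-sleeping register access only */
        return IRQ_WAKE_THREAD;   /* defer completion to the thread   */
    }

    /* Threaded context: may sleep, so completion is safe here */
    static irqreturn_t foo_irq_thread(int irq, void *dev_id)
    {
        struct foo_host *host = dev_id;

        mmc_request_done(host->mmc, host->mrq);
        return IRQ_HANDLED;
    }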
Fixes the following kernel bug, which appears when running an RT patched kernel on the AmLogic Meson AXG A113X SoC: [ 11.111407] BUG: scheduling while atomic: kworker/0:1H/75/0x00010001 [ 11.111438] Modules linked in: [ 11.111451] CPU: 0 PID: 75 Comm: kworker/0:1H Not tainted 6.4.0-rc3-rt2-rtx-00081-gfd07f41ed6b4-dirty #1 [ 11.111461] Hardware name: RTX AXG A113X Linux Platform Board (DT) [ 11.111469] Workqueue: kblockd blk_mq_run_work_fn [ 11.111492] Call trace: [ 11.111497] dump_backtrace+0xac/0xe8 [ 11.111510] show_stack+0x18/0x28 [ 11.111518] dump_stack_lvl+0x48/0x60 [ 11.111530] dump_stack+0x18/0x24 [ 11.111537] __schedule_bug+0x4c/0x68 [ 11.111548] __schedule+0x80/0x574 [ 11.111558] schedule_loop+0x2c/0x50 [ 11.111567] schedule_rtlock+0x14/0x20 [ 11.111576] rtlock_slowlock_locked+0x468/0x730 [ 11.111587] rt_spin_lock+0x40/0x64 [ 11.111596] __wake_up_common_lock+0x5c/0xc4 [ 11.111610] __wake_up+0x18/0x24 [ 11.111620] mmc_blk_mq_req_done+0x68/0x138 [ 11.111633] mmc_request_done+0x104/0x118 [ 11.111644] meson_mmc_request_done+0x38/0x48 [ 11.111654] meson_mmc_irq+0x128/0x1f0 [ 11.111663] __handle_irq_event_percpu+0x70/0x114 [ 11.111674] handle_irq_event_percpu+0x18/0x4c [ 11.111683] handle_irq_event+0x80/0xb8 [ 11.111691] handle_fasteoi_irq+0xa4/0x120 [ 11.111704] handle_irq_desc+0x20/0x38 [ 11.111712] generic_handle_domain_irq+0x1c/0x28 [ 11.111721] gic_handle_irq+0x8c/0xa8 [ 11.111735] call_on_irq_stack+0x24/0x4c [ 11.111746] do_interrupt_handler+0x88/0x94 [ 11.111757] el1_interrupt+0x34/0x64 [ 11.111769] el1h_64_irq_handler+0x18/0x24 [ 11.111779] el1h_64_irq+0x64/0x68 [ 11.111786] __add_wait_queue+0x0/0x4c [ 11.111795] mmc_blk_rw_wait+0x84/0x118 [ 11.111804] mmc_blk_mq_issue_rq+0x5c4/0x654 [ 11.111814] mmc_mq_queue_rq+0x194/0x214 [ 11.111822] blk_mq_dispatch_rq_list+0x3ac/0x528 [ 11.111834] __blk_mq_sched_dispatch_requests+0x340/0x4d0 [ 11.111847] blk_mq_sched_dispatch_requests+0x38/0x70 [ 11.111858] blk_mq_run_work_fn+0x3c/0x70 [ 11.111865] process_one_work+0x17c/0x1f0 [ 11.111876] worker_thread+0x1d4/0x26c [ 11.111885] kthread+0xe4/0xf4 [ 11.111894] ret_from_fork+0x10/0x20 Fixes: 51c5d8447bd7 ("MMC: meson: initial support for GX platforms") Cc: stable@vger.kernel.org Signed-off-by: Martin Hundebøll Link: https://lore.kernel.org/r/20230607082713.517157-1-martin@geanix.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/meson-gx-mmc.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/drivers/mmc/host/meson-gx-mmc.c b/drivers/mmc/host/meson-gx-mmc.c index b8514d9d5e73..f90b0fd8d8b0 100644 --- a/drivers/mmc/host/meson-gx-mmc.c +++ b/drivers/mmc/host/meson-gx-mmc.c @@ -991,11 +991,8 @@ static irqreturn_t meson_mmc_irq(int irq, void *dev_id) if (data && !cmd->error) data->bytes_xfered = data->blksz * data->blocks; - if (meson_mmc_bounce_buf_read(data) || - meson_mmc_get_next_command(cmd)) - ret = IRQ_WAKE_THREAD; - else - ret = IRQ_HANDLED; + + return IRQ_WAKE_THREAD; } out: @@ -1007,9 +1004,6 @@ out: writel(start, host->regs + SD_EMMC_START); } - if (ret == IRQ_HANDLED) - meson_mmc_request_done(host->mmc, cmd->mrq); - return ret; } -- cgit v1.2.3 From b104dbedbe61d89a933479f8effce6409037ef73 Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Tue, 6 Jun 2023 07:59:19 +0100 Subject: Documentation: RISC-V: patch-acceptance: mention patchwork's role MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Palmer suggested at some point, not sure if it was in one of the weekly linux-riscv syncs, or a conversation at FOSDEM, that we 
should document the role the automation running on our patchwork instance plays in patch acceptance. Add a short note to the patch-acceptance document to that end. Signed-off-by: Conor Dooley Reviewed-by: Björn Töpel Link: https://lore.kernel.org/r/20230606-rehab-monsoon-12c17bbe08e3@wendy Signed-off-by: Palmer Dabbelt --- Documentation/riscv/patch-acceptance.rst | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/Documentation/riscv/patch-acceptance.rst b/Documentation/riscv/patch-acceptance.rst index 07d5a5623e2a..634aa222b410 100644 --- a/Documentation/riscv/patch-acceptance.rst +++ b/Documentation/riscv/patch-acceptance.rst @@ -16,6 +16,24 @@ tested code over experimental code. We wish to extend these same principles to the RISC-V-related code that will be accepted for inclusion in the kernel. +Patchwork +--------- + +RISC-V has a patchwork instance, where the status of patches can be checked: + + https://patchwork.kernel.org/project/linux-riscv/list/ + +If your patch does not appear in the default view, the RISC-V maintainers have +likely either requested changes, or expect it to be applied to another tree. + +Automation runs against this patchwork instance, building/testing patches as +they arrive. The automation applies patches against the current HEAD of the +RISC-V `for-next` and `fixes` branches, depending on whether the patch has been +detected as a fix. Failing those, it will use the RISC-V `master` branch. +The exact commit to which a series has been applied will be noted on patchwork. +Patches for which any of the checks fail are unlikely to be applied and in most +cases will need to be resubmitted. + Submit Checklist Addendum ------------------------- We'll only accept patches for new modules or extensions if the -- cgit v1.2.3 From b50f2d048ecf1512ff85128ea4153bceb0e60590 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 14 Jun 2023 14:49:35 +0800 Subject: btrfs: scrub: fix a return value overwrite in scrub_stripe() [RETURN VALUE OVERWRITE] Inside scrub_stripe(), we would submit all the remaining stripes after iterating all extents. But since flush_scrub_stripes() can return an error, we need to avoid overwriting the existing @ret if there is any error. However the existing check checks the wrong variable: ret2 = flush_scrub_stripes(); if (!ret2) ret = ret2; This would overwrite the existing @ret to 0 as long as the final flush detects no critical errors. [FIX] We should check @ret rather than @ret2 in that case. Fixes: 8eb3dd17eadd ("btrfs: dev-replace: error out if we have unrepaired metadata error during") Reviewed-by: Christoph Hellwig Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 50c241aba1a1..bceaa8c2007e 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2266,7 +2266,7 @@ next: } out: ret2 = flush_scrub_stripes(sctx); - if (!ret2) + if (!ret) ret = ret2; if (sctx->raid56_data_stripes) { for (int i = 0; i < nr_data_stripes(map); i++) -- cgit v1.2.3 From ba470eebc2f6c2f704872955a715b9555328e7d0 Mon Sep 17 00:00:00 2001 From: sunliming Date: Mon, 29 May 2023 11:21:00 +0800 Subject: tracing/user_events: Prevent same name but different args event User processes register name_args for events. If an event with the same name but different args is registered, the trace output of the second event is printed as if it were the first event. This is incorrect.
Return EADDRINUSE back to the user process if an event with the same name but different args has been registered. Link: https://lore.kernel.org/linux-trace-kernel/20230529032100.286534-1-sunliming@kylinos.cn Signed-off-by: sunliming Reviewed-by: Masami Hiramatsu (Google) Acked-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_user.c | 36 +++++++++++++++++++---- tools/testing/selftests/user_events/ftrace_test.c | 6 ++++ 2 files changed, 36 insertions(+), 6 deletions(-) diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index dbb14705d0d3..37a38496a6be 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -1786,6 +1786,8 @@ static int user_event_parse(struct user_event_group *group, char *name, int ret; u32 key; struct user_event *user; + int argc = 0; + char **argv; /* Prevent dyn_event from racing */ mutex_lock(&event_mutex); @@ -1793,13 +1795,35 @@ static int user_event_parse(struct user_event_group *group, char *name, mutex_unlock(&event_mutex); if (user) { - *newuser = user; - /* - * Name is allocated by caller, free it since it already exists. - * Caller only worries about failure cases for freeing. - */ - kfree(name); + if (args) { + argv = argv_split(GFP_KERNEL, args, &argc); + if (!argv) { + ret = -ENOMEM; + goto error; + } + + ret = user_fields_match(user, argc, (const char **)argv); + argv_free(argv); + + } else + ret = list_empty(&user->fields); + + if (ret) { + *newuser = user; + /* + * Name is allocated by caller, free it since it already exists. + * Caller only worries about failure cases for freeing. + */ + kfree(name); + } else { + ret = -EADDRINUSE; + goto error; + } + return 0; +error: + refcount_dec(&user->refcnt); + return ret; } user = kzalloc(sizeof(*user), GFP_KERNEL_ACCOUNT); diff --git a/tools/testing/selftests/user_events/ftrace_test.c b/tools/testing/selftests/user_events/ftrace_test.c index 7c99cef94a65..6e8c4b47281c 100644 --- a/tools/testing/selftests/user_events/ftrace_test.c +++ b/tools/testing/selftests/user_events/ftrace_test.c @@ -228,6 +228,12 @@ TEST_F(user, register_events) { ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, &reg)); ASSERT_EQ(0, reg.write_index); + /* Multiple registers to same name but different args should fail */ + reg.enable_bit = 29; + reg.name_args = (__u64)"__test_event u32 field1;"; + ASSERT_EQ(-1, ioctl(self->data_fd, DIAG_IOCSREG, &reg)); + ASSERT_EQ(EADDRINUSE, errno); + /* Ensure disabled */ self->enable_fd = open(enable_file, O_RDWR); ASSERT_NE(-1, self->enable_fd); -- cgit v1.2.3 From cfac4ed7279d056df6167bd665e460787dc9e0c4 Mon Sep 17 00:00:00 2001 From: sunliming Date: Mon, 29 May 2023 14:51:10 +0800 Subject: tracing/user_events: Handle matching arguments that is null from dyn_events When a user event registered from dyn_events has no arguments, it will pass the matching check, regardless of whether there is a user event with the same name and arguments. Add the matching check for when the arguments of the registering user event are null.
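For illustration, a userspace sketch of the case this closes (modeled on the selftests in this series; the helper name is made up): a second register of the same event name with no arguments must not silently match an event that has fields:

    #include <errno.h>
    #include <sys/ioctl.h>
    #include <linux/user_events.h>

    /* Returns 0 if the empty-args re-register is correctly rejected */
    static int check_empty_args_mismatch(int data_fd, int *check)
    {
        struct user_reg reg = {0};

        reg.size = sizeof(reg);
        reg.enable_bit = 31;
        reg.enable_addr = (__u64)check;
        reg.enable_size = sizeof(*check);
        reg.name_args = (__u64)"__test_event u32 field1";

        if (ioctl(data_fd, DIAG_IOCSREG, &reg) == -1)
            return -1; /* first register should succeed */

        reg.enable_bit = 30;
        reg.name_args = (__u64)"__test_event"; /* same name, no args */

        if (ioctl(data_fd, DIAG_IOCSREG, &reg) == 0)
            return -1; /* silently matched: the bug this patch fixes */

        return errno == EADDRINUSE ? 0 : -1;
    }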
Link: https://lore.kernel.org/linux-trace-kernel/20230529065110.303440-1-sunliming@kylinos.cn Signed-off-by: sunliming Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_user.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 37a38496a6be..afe61dc86543 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -1745,6 +1745,8 @@ static bool user_event_match(const char *system, const char *event, if (match && argc > 0) match = user_fields_match(user, argc, argv); + else if (match && argc == 0) + match = list_empty(&user->fields); return match; } -- cgit v1.2.3 From e70bb54d7a51299e7feca9169c07d79f9a3fc01d Mon Sep 17 00:00:00 2001 From: sunliming Date: Thu, 25 May 2023 16:52:32 +0800 Subject: tracing: Modify print_fields() for fields output order Currently print_fields() prints trace event fields in reverse order. Modify it to print them in declaration order. Example outputs for a user event: test0 u32 count1; u32 count2 Output before: example-2547 [000] ..... 325.666387: test0: count2=0x2 (2) count1=0x1 (1) Output after: example-2742 [002] ..... 429.769370: test0: count1=0x1 (1) count2=0x2 (2) Link: https://lore.kernel.org/linux-trace-kernel/20230525085232.5096-1-sunliming@kylinos.cn Fixes: 80a76994b2d88 ("tracing: Add "fields" option to show raw trace event fields") Signed-off-by: sunliming Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 15f05faaae44..1e33f367783e 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -847,7 +847,7 @@ static void print_fields(struct trace_iterator *iter, struct trace_event_call *c int ret; void *pos; - list_for_each_entry(field, head, link) { + list_for_each_entry_reverse(field, head, link) { trace_seq_printf(&iter->seq, " %s=", field->name); if (field->offset + field->size > iter->ent_size) { trace_seq_puts(&iter->seq, "<OVERFLOW>"); -- cgit v1.2.3 From 6f05dcabe5c241d066ec472cf38ac8b84f8c9c6f Mon Sep 17 00:00:00 2001 From: sunliming Date: Tue, 6 Jun 2023 14:20:24 +0800 Subject: tracing/user_events: Fix the incorrect trace record for empty arguments events user_events supports events that have empty arguments, but the trace event is discarded and not really committed when the arguments are empty. Fix this by not attempting to copy in zero-length data.
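Concretely, an "empty" write from userspace carries only the write_index header, so the payload the kernel would copy is zero-length. A sketch modeled on the selftests added later in this series (reg and data_fd are assumed to come from a prior DIAG_IOCSREG registration):

    #include <sys/uio.h>
    #include <linux/user_events.h>

    static int write_empty(int data_fd, struct user_reg *reg)
    {
        struct iovec io[1];

        io[0].iov_base = &reg->write_index;
        io[0].iov_len = sizeof(reg->write_index);

        /* No payload follows the header: i->count reaches 0 in the kernel */
        return writev(data_fd, io, 1);
    }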
Link: https://lkml.kernel.org/r/20230606062027.1008398-2-sunliming@kylinos.cn Acked-by: Beau Belgrave Acked-by: Masami Hiramatsu (Google) Signed-off-by: sunliming Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_user.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index afe61dc86543..49914b6cb651 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -1432,7 +1432,7 @@ static void user_event_ftrace(struct user_event *user, struct iov_iter *i, if (unlikely(!entry)) return; - if (unlikely(!copy_nofault(entry + 1, i->count, i))) + if (unlikely(i->count != 0 && !copy_nofault(entry + 1, i->count, i))) goto discard; if (!list_empty(&user->validators) && @@ -1473,7 +1473,7 @@ static void user_event_perf(struct user_event *user, struct iov_iter *i, perf_fetch_caller_regs(regs); - if (unlikely(!copy_nofault(perf_entry + 1, i->count, i))) + if (unlikely(i->count != 0 && !copy_nofault(perf_entry + 1, i->count, i))) goto discard; if (!list_empty(&user->validators) && -- cgit v1.2.3 From 3e7269dd5fd5e77c215829e80fc1f29c989c9c42 Mon Sep 17 00:00:00 2001 From: sunliming Date: Tue, 6 Jun 2023 14:20:25 +0800 Subject: selftests/user_events: Add ftrace self-test for empty arguments events Add tests to ensure events that have empty arguments can be recorded correctly when using ftrace. Link: https://lkml.kernel.org/r/20230606062027.1008398-3-sunliming@kylinos.cn Acked-by: Beau Belgrave Acked-by: Masami Hiramatsu (Google) Signed-off-by: sunliming Signed-off-by: Steven Rostedt (Google) --- tools/testing/selftests/user_events/ftrace_test.c | 33 +++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/tools/testing/selftests/user_events/ftrace_test.c b/tools/testing/selftests/user_events/ftrace_test.c index 6e8c4b47281c..abfb49558a26 100644 --- a/tools/testing/selftests/user_events/ftrace_test.c +++ b/tools/testing/selftests/user_events/ftrace_test.c @@ -316,6 +316,39 @@ TEST_F(user, write_events) { ASSERT_EQ(EINVAL, errno); } +TEST_F(user, write_empty_events) { + struct user_reg reg = {0}; + struct iovec io[1]; + int before = 0, after = 0; + + reg.size = sizeof(reg); + reg.name_args = (__u64)"__test_event"; + reg.enable_bit = 31; + reg.enable_addr = (__u64)&self->check; + reg.enable_size = sizeof(self->check); + + io[0].iov_base = &reg.write_index; + io[0].iov_len = sizeof(reg.write_index); + + /* Register should work */ + ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, &reg)); + ASSERT_EQ(0, reg.write_index); + ASSERT_EQ(0, self->check); + + /* Enable event */ + self->enable_fd = open(enable_file, O_RDWR); + ASSERT_NE(-1, write(self->enable_fd, "1", sizeof("1"))) + + /* Event should now be enabled */ + ASSERT_EQ(1 << reg.enable_bit, self->check); + + /* Write should make it out to ftrace buffers */ + before = trace_bytes(); + ASSERT_NE(-1, writev(self->data_fd, (const struct iovec *)io, 1)); + after = trace_bytes(); + ASSERT_GT(after, before); +} + TEST_F(user, write_fault) { struct user_reg reg = {0}; struct iovec io[2]; -- cgit v1.2.3 From 4b56c21b11dcd2c6e4c6ee81bde77bea05e64db0 Mon Sep 17 00:00:00 2001 From: sunliming Date: Tue, 6 Jun 2023 14:20:26 +0800 Subject: selftests/user_events: Clear the events after perf self-test When the self-test completes, the perf self-test leaves the user events uncleared. Clear the events by unregistering and deleting them.
Link: https://lkml.kernel.org/r/20230606062027.1008398-4-sunliming@kylinos.cn Acked-by: Beau Belgrave Acked-by: Masami Hiramatsu (Google) Signed-off-by: sunliming Signed-off-by: Steven Rostedt (Google) --- tools/testing/selftests/user_events/perf_test.c | 29 +++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tools/testing/selftests/user_events/perf_test.c b/tools/testing/selftests/user_events/perf_test.c index a070258d4449..e97f24ab6e2f 100644 --- a/tools/testing/selftests/user_events/perf_test.c +++ b/tools/testing/selftests/user_events/perf_test.c @@ -81,6 +81,32 @@ static int get_offset(void) return offset; } +static int clear(int *check) +{ + struct user_unreg unreg = {0}; + + unreg.size = sizeof(unreg); + unreg.disable_bit = 31; + unreg.disable_addr = (__u64)check; + + int fd = open(data_file, O_RDWR); + + if (fd == -1) + return -1; + + if (ioctl(fd, DIAG_IOCSUNREG, &unreg) == -1) + if (errno != ENOENT) + return -1; + + if (ioctl(fd, DIAG_IOCSDEL, "__test_event") == -1) + if (errno != ENOENT) + return -1; + + close(fd); + + return 0; +} + FIXTURE(user) { int data_fd; int check; @@ -93,6 +119,9 @@ FIXTURE_SETUP(user) { FIXTURE_TEARDOWN(user) { close(self->data_fd); + + if (clear(&self->check) != 0) + printf("WARNING: Clear didn't work!\n"); } TEST_F(user, perf_write) { -- cgit v1.2.3 From 42187bdc3ca45435bec96c66a90276315b2db1cb Mon Sep 17 00:00:00 2001 From: sunliming Date: Tue, 6 Jun 2023 14:20:27 +0800 Subject: selftests/user_events: Add perf self-test for empty arguments events Add tests to ensure events that have empty arguments can be recorded correctly when using perf. Link: https://lkml.kernel.org/r/20230606062027.1008398-5-sunliming@kylinos.cn Acked-by: Beau Belgrave Acked-by: Masami Hiramatsu (Google) Signed-off-by: sunliming Signed-off-by: Steven Rostedt (Google) --- tools/testing/selftests/user_events/perf_test.c | 53 +++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/tools/testing/selftests/user_events/perf_test.c b/tools/testing/selftests/user_events/perf_test.c index e97f24ab6e2f..8b09be566fa2 100644 --- a/tools/testing/selftests/user_events/perf_test.c +++ b/tools/testing/selftests/user_events/perf_test.c @@ -189,6 +189,59 @@ TEST_F(user, perf_write) { ASSERT_EQ(0, self->check); } +TEST_F(user, perf_empty_events) { + struct perf_event_attr pe = {0}; + struct user_reg reg = {0}; + struct perf_event_mmap_page *perf_page; + int page_size = sysconf(_SC_PAGESIZE); + int id, fd; + __u32 *val; + + reg.size = sizeof(reg); + reg.name_args = (__u64)"__test_event"; + reg.enable_bit = 31; + reg.enable_addr = (__u64)&self->check; + reg.enable_size = sizeof(self->check); + + /* Register should work */ + ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, &reg)); + ASSERT_EQ(0, reg.write_index); + ASSERT_EQ(0, self->check); + + /* Id should be there */ + id = get_id(); + ASSERT_NE(-1, id); + + pe.type = PERF_TYPE_TRACEPOINT; + pe.size = sizeof(pe); + pe.config = id; + pe.sample_type = PERF_SAMPLE_RAW; + pe.sample_period = 1; + pe.wakeup_events = 1; + + /* Tracepoint attach should work */ + fd = perf_event_open(&pe, 0, -1, -1, 0); + ASSERT_NE(-1, fd); + + perf_page = mmap(NULL, page_size * 2, PROT_READ, MAP_SHARED, fd, 0); + ASSERT_NE(MAP_FAILED, perf_page); + + /* Status should be updated */ + ASSERT_EQ(1 << reg.enable_bit, self->check); + + /* Ensure write shows up at correct offset */ + ASSERT_NE(-1, write(self->data_fd, &reg.write_index, + sizeof(reg.write_index))); + val = (void *)(((char *)perf_page) + perf_page->data_offset); +
ASSERT_EQ(PERF_RECORD_SAMPLE, *val); + + munmap(perf_page, page_size * 2); + close(fd); + + /* Status should be updated */ + ASSERT_EQ(0, self->check); +} + int main(int argc, char **argv) { return test_harness_run(argc, argv); -- cgit v1.2.3 From ed0e0ae0c932188654422d01f4e0ea14ff97c063 Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Thu, 1 Jun 2023 15:49:28 -0700 Subject: tracing/user_events: Remove user_ns walk for groups During discussions it was suggested that user_ns is not a good place to try to attach a tracing namespace. The current code has stubs to enable that work that are very likely to change and incur a performance cost. Remove the user_ns walk when creating a group and determining the system name to use, since it's unlikely user_ns will be used in the future. Link: https://lore.kernel.org/all/20230601-urenkel-holzofen-cd9403b9cadd@brauner/ Link: https://lore.kernel.org/linux-trace-kernel/20230601224928.301-1-beaub@linux.microsoft.com Suggested-by: Christian Brauner Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_user.c | 42 +++++----------------------------------- 1 file changed, 5 insertions(+), 37 deletions(-) diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 49914b6cb651..cf6d4c02c363 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -182,21 +182,11 @@ static void user_event_group_destroy(struct user_event_group *group) kfree(group); } -static char *user_event_group_system_name(struct user_namespace *user_ns) +static char *user_event_group_system_name(void) { char *system_name; int len = sizeof(USER_EVENTS_SYSTEM) + 1; - if (user_ns != &init_user_ns) { - /* - * Unexpected at this point: - * We only currently support init_user_ns. - * When we enable more, this will trigger a failure so log. 
- */ - pr_warn("user_events: Namespace other than init_user_ns!\n"); - return NULL; - } - system_name = kmalloc(len, GFP_KERNEL); if (!system_name) @@ -207,34 +197,12 @@ static char *user_event_group_system_name(struct user_namespace *user_ns) return system_name; } -static inline struct user_event_group -*user_event_group_from_user_ns(struct user_namespace *user_ns) -{ - if (user_ns == &init_user_ns) - return init_group; - - return NULL; -} - static struct user_event_group *current_user_event_group(void) { - struct user_namespace *user_ns = current_user_ns(); - struct user_event_group *group = NULL; - - while (user_ns) { - group = user_event_group_from_user_ns(user_ns); - - if (group) - break; - - user_ns = user_ns->parent; - } - - return group; + return init_group; } -static struct user_event_group -*user_event_group_create(struct user_namespace *user_ns) +static struct user_event_group *user_event_group_create(void) { struct user_event_group *group; @@ -243,7 +211,7 @@ static struct user_event_group if (!group) return NULL; - group->system_name = user_event_group_system_name(user_ns); + group->system_name = user_event_group_system_name(); if (!group->system_name) goto error; @@ -2603,7 +2571,7 @@ static int __init trace_events_user_init(void) if (!fault_cache) return -ENOMEM; - init_group = user_event_group_create(&init_user_ns); + init_group = user_event_group_create(); if (!init_group) { kmem_cache_destroy(fault_cache); -- cgit v1.2.3 From b08d72580584ab89c41d6fc0f15cd1cf4ce2ed93 Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Wed, 14 Jun 2023 09:33:31 -0700 Subject: tracing/user_events: Store register flags on events Currently we don't have any available flags for user processes to use to indicate options for user_events. We will soon have a flag to indicate the event should or should not auto-delete once it's not being used by anyone. Add a reg_flags field to user_events and parameters to existing functions to allow for this in future patches. Link: https://lkml.kernel.org/r/20230614163336.5797-2-beaub@linux.microsoft.com Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_user.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index cf6d4c02c363..629823e21447 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -87,6 +87,7 @@ struct user_event { struct list_head validators; refcount_t refcnt; int min_size; + int reg_flags; char status; }; @@ -165,7 +166,7 @@ typedef void (*user_event_func_t) (struct user_event *user, struct iov_iter *i, static int user_event_parse(struct user_event_group *group, char *name, char *args, char *flags, - struct user_event **newuser); + struct user_event **newuser, int reg_flags); static struct user_event_mm *user_event_mm_get(struct user_event_mm *mm); static struct user_event_mm *user_event_mm_get_all(struct user_event *user); @@ -810,7 +811,8 @@ static struct list_head *user_event_get_fields(struct trace_event_call *call) * Upon success user_event has its ref count increased by 1. 
*/ static int user_event_parse_cmd(struct user_event_group *group, - char *raw_command, struct user_event **newuser) + char *raw_command, struct user_event **newuser, + int reg_flags) { char *name = raw_command; char *args = strpbrk(name, " "); @@ -824,7 +826,7 @@ static int user_event_parse_cmd(struct user_event_group *group, if (flags) *flags++ = '\0'; - return user_event_parse(group, name, args, flags, newuser); + return user_event_parse(group, name, args, flags, newuser, reg_flags); } static int user_field_array_size(const char *type) @@ -1588,7 +1590,7 @@ static int user_event_create(const char *raw_command) mutex_lock(&group->reg_mutex); - ret = user_event_parse_cmd(group, name, &user); + ret = user_event_parse_cmd(group, name, &user, 0); if (!ret) refcount_dec(&user->refcnt); @@ -1751,7 +1753,7 @@ static int user_event_trace_register(struct user_event *user) */ static int user_event_parse(struct user_event_group *group, char *name, char *args, char *flags, - struct user_event **newuser) + struct user_event **newuser, int reg_flags) { int ret; u32 key; @@ -1846,6 +1848,8 @@ error: if (ret) goto put_user_lock; + user->reg_flags = reg_flags; + /* Ensure we track self ref and caller ref (2) */ refcount_set(&user->refcnt, 2); @@ -2144,7 +2148,7 @@ static long user_events_ioctl_reg(struct user_event_file_info *info, return ret; } - ret = user_event_parse_cmd(info->group, name, &user); + ret = user_event_parse_cmd(info->group, name, &user, reg.flags); if (ret) { kfree(name); -- cgit v1.2.3 From f0dbf6fd0bddb9290c89fdd7afc53dad5051030e Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Wed, 14 Jun 2023 09:33:32 -0700 Subject: tracing/user_events: Track refcount consistently via put/get Various parts of the code today track user_event's refcnt field directly via a refcount_add/dec. This makes it hard to modify the behavior of the last reference decrement in all code paths consistently. For example, in the future we will auto-delete events upon the last reference going away. This last reference could happen in many places, but we want it to be consistently handled. Add user_event_get() and user_event_put() for the add/dec. Update all places where direct refcounts are being used to utilize these new functions. In each location pass if event_mutex is locked or not. This allows us to drop events automatically in future patches clearly. Ensure when caller states the lock is held, it really is (or is not) held. 
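The shape this enables is the classic "last put takes the lock" idiom, sketched here with generic names (obj_* is hypothetical; this patch only adds the get/put helpers and assertions, while the drop logic lands in the next patch):

    static void obj_put(struct obj *o, bool locked)
    {
        bool last;

        if (!locked) {
            lockdep_assert_not_held(&obj_mutex);
            /* Takes obj_mutex only when this was the final reference */
            last = refcount_dec_and_mutex_lock(&o->refcnt, &obj_mutex);
        } else {
            lockdep_assert_held(&obj_mutex);
            last = refcount_dec_and_test(&o->refcnt);
        }

        if (!last)
            return;

        obj_destroy(o); /* safe: no new references can be taken */

        if (!locked)
            mutex_unlock(&obj_mutex);
    }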
Link: https://lkml.kernel.org/r/20230614163336.5797-3-beaub@linux.microsoft.com Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_user.c | 69 ++++++++++++++++++++++++---------------- 1 file changed, 41 insertions(+), 28 deletions(-) diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 629823e21447..c064458eea5c 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -177,6 +177,28 @@ static u32 user_event_key(char *name) return jhash(name, strlen(name), 0); } +static struct user_event *user_event_get(struct user_event *user) +{ + refcount_inc(&user->refcnt); + + return user; +} + +static void user_event_put(struct user_event *user, bool locked) +{ +#ifdef CONFIG_LOCKDEP + if (locked) + lockdep_assert_held(&event_mutex); + else + lockdep_assert_not_held(&event_mutex); +#endif + + if (unlikely(!user)) + return; + + refcount_dec(&user->refcnt); +} + static void user_event_group_destroy(struct user_event_group *group) { kfree(group->system_name); @@ -228,12 +250,13 @@ error: return NULL; }; -static void user_event_enabler_destroy(struct user_event_enabler *enabler) +static void user_event_enabler_destroy(struct user_event_enabler *enabler, + bool locked) { list_del_rcu(&enabler->mm_enablers_link); /* No longer tracking the event via the enabler */ - refcount_dec(&enabler->event->refcnt); + user_event_put(enabler->event, locked); kfree(enabler); } @@ -295,7 +318,7 @@ static void user_event_enabler_fault_fixup(struct work_struct *work) /* User asked for enabler to be removed during fault */ if (test_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler))) { - user_event_enabler_destroy(enabler); + user_event_enabler_destroy(enabler, true); goto out; } @@ -470,14 +493,12 @@ static bool user_event_enabler_dup(struct user_event_enabler *orig, if (!enabler) return false; - enabler->event = orig->event; + enabler->event = user_event_get(orig->event); enabler->addr = orig->addr; /* Only dup part of value (ignore future flags, etc) */ enabler->values = orig->values & ENABLE_VAL_DUP_MASK; - refcount_inc(&enabler->event->refcnt); - /* Enablers not exposed yet, RCU not required */ list_add(&enabler->mm_enablers_link, &mm->enablers); @@ -594,7 +615,7 @@ static void user_event_mm_destroy(struct user_event_mm *mm) struct user_event_enabler *enabler, *next; list_for_each_entry_safe(enabler, next, &mm->enablers, mm_enablers_link) - user_event_enabler_destroy(enabler); + user_event_enabler_destroy(enabler, false); mmdrop(mm->mm); kfree(mm); @@ -749,7 +770,7 @@ retry: * exit or run exec(), which includes forks and clones. 
*/ if (!*write_result) { - refcount_inc(&enabler->event->refcnt); + user_event_get(user); list_add_rcu(&enabler->mm_enablers_link, &user_mm->enablers); } @@ -1337,10 +1358,8 @@ static struct user_event *find_user_event(struct user_event_group *group, *outkey = key; hash_for_each_possible(group->register_table, user, node, key) - if (!strcmp(EVENT_NAME(user), name)) { - refcount_inc(&user->refcnt); - return user; - } + if (!strcmp(EVENT_NAME(user), name)) + return user_event_get(user); return NULL; } @@ -1554,12 +1573,12 @@ static int user_event_reg(struct trace_event_call *call, return ret; inc: - refcount_inc(&user->refcnt); + user_event_get(user); update_enable_bit_for(user); return 0; dec: update_enable_bit_for(user); - refcount_dec(&user->refcnt); + user_event_put(user, true); return 0; } @@ -1593,7 +1612,7 @@ static int user_event_create(const char *raw_command) ret = user_event_parse_cmd(group, name, &user, 0); if (!ret) - refcount_dec(&user->refcnt); + user_event_put(user, false); mutex_unlock(&group->reg_mutex); @@ -1794,7 +1813,7 @@ static int user_event_parse(struct user_event_group *group, char *name, return 0; error: - refcount_dec(&user->refcnt); + user_event_put(user, false); return ret; } @@ -1883,7 +1902,7 @@ static int delete_user_event(struct user_event_group *group, char *name) if (!user) return -ENOENT; - refcount_dec(&user->refcnt); + user_event_put(user, true); if (!user_event_last_ref(user)) return -EBUSY; @@ -2042,9 +2061,7 @@ static int user_events_ref_add(struct user_event_file_info *info, for (i = 0; i < count; ++i) new_refs->events[i] = refs->events[i]; - new_refs->events[i] = user; - - refcount_inc(&user->refcnt); + new_refs->events[i] = user_event_get(user); rcu_assign_pointer(info->refs, new_refs); @@ -2158,7 +2175,7 @@ static long user_events_ioctl_reg(struct user_event_file_info *info, ret = user_events_ref_add(info, user); /* No longer need parse ref, ref_add either worked or not */ - refcount_dec(&user->refcnt); + user_event_put(user, false); /* Positive number is index and valid */ if (ret < 0) @@ -2307,7 +2324,7 @@ static long user_events_ioctl_unreg(unsigned long uarg) set_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler)); if (!test_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler))) - user_event_enabler_destroy(enabler); + user_event_enabler_destroy(enabler, true); /* Removed at least one */ ret = 0; @@ -2365,7 +2382,6 @@ static int user_events_release(struct inode *node, struct file *file) struct user_event_file_info *info = file->private_data; struct user_event_group *group; struct user_event_refs *refs; - struct user_event *user; int i; if (!info) @@ -2389,12 +2405,9 @@ static int user_events_release(struct inode *node, struct file *file) * The underlying user_events are ref counted, and cannot be freed. * After this decrement, the user_events may be freed elsewhere. */ - for (i = 0; i < refs->count; ++i) { - user = refs->events[i]; + for (i = 0; i < refs->count; ++i) + user_event_put(refs->events[i], false); - if (user) - refcount_dec(&user->refcnt); - } out: file->private_data = NULL; -- cgit v1.2.3 From a65442edb47a6d9c974b50049296ac9ddb378fee Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Wed, 14 Jun 2023 09:33:33 -0700 Subject: tracing/user_events: Add auto cleanup and future persist flag Currently user events need to be manually deleted via the delete IOCTL call or via the dynamic_events file. 
Most operators and processes wish to have these events auto cleanup when they are no longer used by anything to prevent them piling without manual maintenance. However, some operators may not want this, such as pre-registering events via the dynamic_events tracefs file. Update user_event_put() to attempt an auto delete of the event if it's the last reference. The auto delete must run in a work queue to ensure proper behavior of class->reg() invocations that don't expect the call to go away from underneath them during the unregister. Add work_struct to user_event struct to ensure we can do this reliably. Add a persist flag, that is not yet exposed, to ensure we can toggle between auto-cleanup and leaving the events existing in the future. When a non-zero flag is seen during register, return -EINVAL to ensure ABI is clear for the user processes while we work out the best approach for persistent events. Link: https://lkml.kernel.org/r/20230614163336.5797-4-beaub@linux.microsoft.com Link: https://lore.kernel.org/linux-trace-kernel/20230518093600.3f119d68@rorschach.local.home/ Suggested-by: Steven Rostedt Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_user.c | 139 +++++++++++++++++++++++++++++++++++---- 1 file changed, 126 insertions(+), 13 deletions(-) diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index c064458eea5c..8df0550415e7 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -49,6 +49,18 @@ #define EVENT_STATUS_PERF BIT(1) #define EVENT_STATUS_OTHER BIT(7) +/* + * User register flags are not allowed yet, keep them here until we are + * ready to expose them out to the user ABI. + */ +enum user_reg_flag { + /* Event will not delete upon last reference closing */ + USER_EVENT_REG_PERSIST = 1U << 0, + + /* This value or above is currently non-ABI */ + USER_EVENT_REG_MAX = 1U << 1, +}; + /* * Stores the system name, tables, and locks for a group of events. This * allows isolation for events by various means. @@ -85,6 +97,7 @@ struct user_event { struct hlist_node node; struct list_head fields; struct list_head validators; + struct work_struct put_work; refcount_t refcnt; int min_size; int reg_flags; @@ -171,6 +184,7 @@ static int user_event_parse(struct user_event_group *group, char *name, static struct user_event_mm *user_event_mm_get(struct user_event_mm *mm); static struct user_event_mm *user_event_mm_get_all(struct user_event *user); static void user_event_mm_put(struct user_event_mm *mm); +static int destroy_user_event(struct user_event *user); static u32 user_event_key(char *name) { @@ -184,19 +198,103 @@ static struct user_event *user_event_get(struct user_event *user) return user; } +static void delayed_destroy_user_event(struct work_struct *work) +{ + struct user_event *user = container_of( + work, struct user_event, put_work); + + mutex_lock(&event_mutex); + + if (!refcount_dec_and_test(&user->refcnt)) + goto out; + + if (destroy_user_event(user)) { + /* + * The only reason this would fail here is if we cannot + * update the visibility of the event. In this case the + * event stays in the hashtable, waiting for someone to + * attempt to delete it later. 
+ */ + pr_warn("user_events: Unable to delete event\n"); + refcount_set(&user->refcnt, 1); + } +out: + mutex_unlock(&event_mutex); +} + static void user_event_put(struct user_event *user, bool locked) { -#ifdef CONFIG_LOCKDEP - if (locked) - lockdep_assert_held(&event_mutex); - else - lockdep_assert_not_held(&event_mutex); -#endif + bool delete; if (unlikely(!user)) return; - refcount_dec(&user->refcnt); + /* + * When the event is not enabled for auto-delete there will always + * be at least 1 reference to the event. During the event creation + * we initially set the refcnt to 2 to achieve this. In those cases + * the caller must acquire event_mutex and after decrement check if + * the refcnt is 1, meaning this is the last reference. When auto + * delete is enabled, there will only be 1 ref, IE: refcnt will be + * only set to 1 during creation to allow the below checks to go + * through upon the last put. The last put must always be done with + * the event mutex held. + */ + if (!locked) { + lockdep_assert_not_held(&event_mutex); + delete = refcount_dec_and_mutex_lock(&user->refcnt, &event_mutex); + } else { + lockdep_assert_held(&event_mutex); + delete = refcount_dec_and_test(&user->refcnt); + } + + if (!delete) + return; + + /* + * We now have the event_mutex in all cases, which ensures that + * no new references will be taken until event_mutex is released. + * New references come through find_user_event(), which requires + * the event_mutex to be held. + */ + + if (user->reg_flags & USER_EVENT_REG_PERSIST) { + /* We should not get here when persist flag is set */ + pr_alert("BUG: Auto-delete engaged on persistent event\n"); + goto out; + } + + /* + * Unfortunately we have to attempt the actual destroy in a work + * queue. This is because not all cases handle a trace_event_call + * being removed within the class->reg() operation for unregister. + */ + INIT_WORK(&user->put_work, delayed_destroy_user_event); + + /* + * Since the event is still in the hashtable, we have to re-inc + * the ref count to 1. This count will be decremented and checked + * in the work queue to ensure it's still the last ref. This is + * needed because a user-process could register the same event in + * between the time of event_mutex release and the work queue + * running the delayed destroy. If we removed the item now from + * the hashtable, this would result in a timing window where a + * user process would fail a register because the trace_event_call + * register would fail in the tracing layers. + */ + refcount_set(&user->refcnt, 1); + + if (WARN_ON_ONCE(!schedule_work(&user->put_work))) { + /* + * If we fail we must wait for an admin to attempt delete or + * another register/close of the event, whichever is first. 
+ */ + pr_warn("user_events: Unable to queue delayed destroy\n"); + } +out: + /* Ensure if we didn't have event_mutex before we unlock it */ + if (!locked) + mutex_unlock(&event_mutex); } static void user_event_group_destroy(struct user_event_group *group) @@ -793,7 +891,12 @@ out: static __always_inline __must_check bool user_event_last_ref(struct user_event *user) { - return refcount_read(&user->refcnt) == 1; + int last = 0; + + if (user->reg_flags & USER_EVENT_REG_PERSIST) + last = 1; + + return refcount_read(&user->refcnt) == last; } static __always_inline __must_check @@ -1609,7 +1712,8 @@ static int user_event_create(const char *raw_command) mutex_lock(&group->reg_mutex); - ret = user_event_parse_cmd(group, name, &user, 0); + /* Dyn events persist, otherwise they would cleanup immediately */ + ret = user_event_parse_cmd(group, name, &user, USER_EVENT_REG_PERSIST); if (!ret) user_event_put(user, false); @@ -1780,6 +1884,10 @@ static int user_event_parse(struct user_event_group *group, char *name, int argc = 0; char **argv; + /* User register flags are not ready yet */ + if (reg_flags != 0 || flags != NULL) + return -EINVAL; + /* Prevent dyn_event from racing */ mutex_lock(&event_mutex); user = find_user_event(group, name, &key); @@ -1869,8 +1977,13 @@ error: user->reg_flags = reg_flags; - /* Ensure we track self ref and caller ref (2) */ - refcount_set(&user->refcnt, 2); + if (user->reg_flags & USER_EVENT_REG_PERSIST) { + /* Ensure we track self ref and caller ref (2) */ + refcount_set(&user->refcnt, 2); + } else { + /* Ensure we track only caller ref (1) */ + refcount_set(&user->refcnt, 1); + } dyn_event_init(&user->devent, &user_event_dops); dyn_event_add(&user->devent, &user->call); @@ -2092,8 +2205,8 @@ static long user_reg_get(struct user_reg __user *ureg, struct user_reg *kreg) if (ret) return ret; - /* Ensure no flags, since we don't support any yet */ - if (kreg->flags != 0) + /* Ensure only valid flags */ + if (kreg->flags & ~(USER_EVENT_REG_MAX-1)) return -EINVAL; /* Ensure supported size */ -- cgit v1.2.3 From 216a137e3eadac888cece141bd2c940785e235a2 Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Wed, 14 Jun 2023 09:33:34 -0700 Subject: selftests/user_events: Ensure auto cleanup works as expected User events now auto cleanup upon the last reference put. Update ftrace_test to ensure this works as expected. Ensure EBUSY delays while event is being deleted do not cause transient failures by waiting and re-attempting. 
Link: https://lkml.kernel.org/r/20230614163336.5797-5-beaub@linux.microsoft.com Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- tools/testing/selftests/user_events/ftrace_test.c | 49 ++++++++++++++++++----- 1 file changed, 38 insertions(+), 11 deletions(-) diff --git a/tools/testing/selftests/user_events/ftrace_test.c b/tools/testing/selftests/user_events/ftrace_test.c index abfb49558a26..eb6904d89f14 100644 --- a/tools/testing/selftests/user_events/ftrace_test.c +++ b/tools/testing/selftests/user_events/ftrace_test.c @@ -102,30 +102,56 @@ err: return -1; } +static bool wait_for_delete(void) +{ + int i; + + for (i = 0; i < 1000; ++i) { + int fd = open(enable_file, O_RDONLY); + + if (fd == -1) + return true; + + close(fd); + usleep(1000); + } + + return false; +} + static int clear(int *check) { struct user_unreg unreg = {0}; + int fd; unreg.size = sizeof(unreg); unreg.disable_bit = 31; unreg.disable_addr = (__u64)check; - int fd = open(data_file, O_RDWR); + fd = open(data_file, O_RDWR); if (fd == -1) return -1; if (ioctl(fd, DIAG_IOCSUNREG, &unreg) == -1) if (errno != ENOENT) - return -1; - - if (ioctl(fd, DIAG_IOCSDEL, "__test_event") == -1) - if (errno != ENOENT) - return -1; + goto fail; + + if (ioctl(fd, DIAG_IOCSDEL, "__test_event") == -1) { + if (errno == EBUSY) { + if (!wait_for_delete()) + goto fail; + } else if (errno != ENOENT) + goto fail; + } close(fd); return 0; +fail: + close(fd); + + return -1; } static int check_print_fmt(const char *event, const char *expected, int *check) @@ -155,9 +181,8 @@ static int check_print_fmt(const char *event, const char *expected, int *check) /* Register should work */ ret = ioctl(fd, DIAG_IOCSREG, ®); - close(fd); - if (ret != 0) { + close(fd); printf("Reg failed in fmt\n"); return ret; } @@ -165,6 +190,8 @@ static int check_print_fmt(const char *event, const char *expected, int *check) /* Ensure correct print_fmt */ ret = get_print_fmt(print_fmt, sizeof(print_fmt)); + close(fd); + if (ret != 0) return ret; @@ -256,10 +283,10 @@ TEST_F(user, register_events) { unreg.disable_bit = 30; ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSUNREG, &unreg)); - /* Delete should work only after close and unregister */ + /* Delete should have been auto-done after close and unregister */ close(self->data_fd); - self->data_fd = open(data_file, O_RDWR); - ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSDEL, "__test_event")); + + ASSERT_EQ(true, wait_for_delete()); } TEST_F(user, write_events) { -- cgit v1.2.3 From 61701242e86f0b509a0f5f9deb226bedfc8dfab7 Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Wed, 14 Jun 2023 09:33:35 -0700 Subject: selftests/user_events: Adapt dyn_test to non-persist events Now that user_events does not honor persist events the dynamic_events file cannot be easily used to test parsing and matching cases. Update dyn_test to use the direct ABI file instead of dynamic_events so that we still have testing coverage until persist events and dynamic_events file integration has been decided. 
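A compressed user-space sketch of the ABI path the reworked test drives (the event name, field and enable bit are arbitrary; error handling trimmed):

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/user_events.h>

static int enabled;	/* the kernel flips bit 31 here when a tracer attaches */

int main(void)
{
	struct user_reg reg = {0};
	int fd = open("/sys/kernel/tracing/user_events_data", O_RDWR);

	if (fd == -1)
		return 1;

	reg.size = sizeof(reg);
	reg.name_args = (__u64)"__test_event u32 a";
	reg.enable_bit = 31;
	reg.enable_addr = (__u64)&enabled;
	reg.enable_size = sizeof(enabled);

	if (ioctl(fd, DIAG_IOCSREG, &reg) == -1)
		return 1;

	/* ... write events through fd while bit 31 of 'enabled' is set ... */

	close(fd);	/* last reference gone: the event auto-deletes */
	return 0;
}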
Link: https://lkml.kernel.org/r/20230614163336.5797-6-beaub@linux.microsoft.com Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- tools/testing/selftests/user_events/dyn_test.c | 177 +++++++++++++++++++------ 1 file changed, 136 insertions(+), 41 deletions(-) diff --git a/tools/testing/selftests/user_events/dyn_test.c b/tools/testing/selftests/user_events/dyn_test.c index 8879a7b04c6a..d6979a48478f 100644 --- a/tools/testing/selftests/user_events/dyn_test.c +++ b/tools/testing/selftests/user_events/dyn_test.c @@ -16,42 +16,140 @@ #include "../kselftest_harness.h" -const char *dyn_file = "/sys/kernel/tracing/dynamic_events"; -const char *clear = "!u:__test_event"; +const char *abi_file = "/sys/kernel/tracing/user_events_data"; +const char *enable_file = "/sys/kernel/tracing/events/user_events/__test_event/enable"; -static int Append(const char *value) +static bool wait_for_delete(void) { - int fd = open(dyn_file, O_RDWR | O_APPEND); - int ret = write(fd, value, strlen(value)); + int i; + + for (i = 0; i < 1000; ++i) { + int fd = open(enable_file, O_RDONLY); + + if (fd == -1) + return true; + + close(fd); + usleep(1000); + } + + return false; +} + +static int reg_event(int fd, int *check, int bit, const char *value) +{ + struct user_reg reg = {0}; + + reg.size = sizeof(reg); + reg.name_args = (__u64)value; + reg.enable_bit = bit; + reg.enable_addr = (__u64)check; + reg.enable_size = sizeof(*check); + + if (ioctl(fd, DIAG_IOCSREG, ®) == -1) + return -1; + + return 0; +} + +static int unreg_event(int fd, int *check, int bit) +{ + struct user_unreg unreg = {0}; + + unreg.size = sizeof(unreg); + unreg.disable_bit = bit; + unreg.disable_addr = (__u64)check; + + return ioctl(fd, DIAG_IOCSUNREG, &unreg); +} + +static int parse(int *check, const char *value) +{ + int fd = open(abi_file, O_RDWR); + int ret; + + if (fd == -1) + return -1; + + /* Until we have persist flags via dynamic events, use the base name */ + if (value[0] != 'u' || value[1] != ':') { + close(fd); + return -1; + } + + ret = reg_event(fd, check, 31, value + 2); + + if (ret != -1) { + if (unreg_event(fd, check, 31) == -1) + printf("WARN: Couldn't unreg event\n"); + } close(fd); + return ret; } -#define CLEAR() \ +static int check_match(int *check, const char *first, const char *second, bool *match) +{ + int fd = open(abi_file, O_RDWR); + int ret = -1; + + if (fd == -1) + return -1; + + if (reg_event(fd, check, 31, first) == -1) + goto cleanup; + + if (reg_event(fd, check, 30, second) == -1) { + if (errno == EADDRINUSE) { + /* Name is in use, with different fields */ + *match = false; + ret = 0; + } + + goto cleanup; + } + + *match = true; + ret = 0; +cleanup: + unreg_event(fd, check, 31); + unreg_event(fd, check, 30); + + close(fd); + + wait_for_delete(); + + return ret; +} + +#define TEST_MATCH(x, y) \ do { \ - int ret = Append(clear); \ - if (ret == -1) \ - ASSERT_EQ(ENOENT, errno); \ + bool match; \ + ASSERT_NE(-1, check_match(&self->check, x, y, &match)); \ + ASSERT_EQ(true, match); \ } while (0) -#define TEST_PARSE(x) \ +#define TEST_NMATCH(x, y) \ do { \ - ASSERT_NE(-1, Append(x)); \ - CLEAR(); \ + bool match; \ + ASSERT_NE(-1, check_match(&self->check, x, y, &match)); \ + ASSERT_EQ(false, match); \ } while (0) -#define TEST_NPARSE(x) ASSERT_EQ(-1, Append(x)) +#define TEST_PARSE(x) ASSERT_NE(-1, parse(&self->check, x)) + +#define TEST_NPARSE(x) ASSERT_EQ(-1, parse(&self->check, x)) FIXTURE(user) { + int check; }; FIXTURE_SETUP(user) { - CLEAR(); } FIXTURE_TEARDOWN(user) { - CLEAR(); + wait_for_delete(); 
} TEST_F(user, basic_types) { @@ -95,33 +193,30 @@ TEST_F(user, size_types) { TEST_NPARSE("u:__test_event char a 20"); } -TEST_F(user, flags) { - /* Should work */ - TEST_PARSE("u:__test_event:BPF_ITER u32 a"); - /* Forward compat */ - TEST_PARSE("u:__test_event:BPF_ITER,FLAG_FUTURE u32 a"); -} - TEST_F(user, matching) { - /* Register */ - ASSERT_NE(-1, Append("u:__test_event struct custom a 20")); - /* Should not match */ - TEST_NPARSE("!u:__test_event struct custom b"); - /* Should match */ - TEST_PARSE("!u:__test_event struct custom a"); - /* Multi field reg */ - ASSERT_NE(-1, Append("u:__test_event u32 a; u32 b")); - /* Non matching cases */ - TEST_NPARSE("!u:__test_event u32 a"); - TEST_NPARSE("!u:__test_event u32 b"); - TEST_NPARSE("!u:__test_event u32 a; u32 "); - TEST_NPARSE("!u:__test_event u32 a; u32 a"); - /* Matching case */ - TEST_PARSE("!u:__test_event u32 a; u32 b"); - /* Register */ - ASSERT_NE(-1, Append("u:__test_event u32 a; u32 b")); - /* Ensure trailing semi-colon case */ - TEST_PARSE("!u:__test_event u32 a; u32 b;"); + /* Single name matches */ + TEST_MATCH("__test_event u32 a", + "__test_event u32 a"); + + /* Multiple names match */ + TEST_MATCH("__test_event u32 a; u32 b", + "__test_event u32 a; u32 b"); + + /* Multiple names match with dangling ; */ + TEST_MATCH("__test_event u32 a; u32 b", + "__test_event u32 a; u32 b;"); + + /* Single name doesn't match */ + TEST_NMATCH("__test_event u32 a", + "__test_event u32 b"); + + /* Multiple names don't match */ + TEST_NMATCH("__test_event u32 a; u32 b", + "__test_event u32 b; u32 a"); + + /* Types don't match */ + TEST_NMATCH("__test_event u64 a; u64 b", + "__test_event u32 a; u32 b"); } int main(int argc, char **argv) -- cgit v1.2.3 From 0113d4615dbf053ae9a7a1e0acbc6652713af01f Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Wed, 14 Jun 2023 09:33:36 -0700 Subject: tracing/user_events: Document auto-cleanup and remove dyn_event refs Now user_events auto-delete upon the last reference by default. This makes it no longer possible to use the dynamic_events file via tracefs. Document that auto-cleanup is enabled by default and remove the reference to the /sys/kernel/tracing/dynamic_events file to make this clear. Link: https://lkml.kernel.org/r/20230614163336.5797-7-beaub@linux.microsoft.com Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- Documentation/trace/user_events.rst | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/Documentation/trace/user_events.rst b/Documentation/trace/user_events.rst index f79987e16cf4..e7b07313550a 100644 --- a/Documentation/trace/user_events.rst +++ b/Documentation/trace/user_events.rst @@ -14,10 +14,6 @@ Programs can view status of the events via /sys/kernel/tracing/user_events_status and can both register and write data out via /sys/kernel/tracing/user_events_data. -Programs can also use /sys/kernel/tracing/dynamic_events to register and -delete user based events via the u: prefix. The format of the command to -dynamic_events is the same as the ioctl with the u: prefix applied. - Typically programs will register a set of events that they wish to expose to tools that can read trace_events (such as ftrace and perf). The registration process tells the kernel which address and bit to reflect if any tool has @@ -144,6 +140,9 @@ its name. Delete will only succeed if there are no references left to the event (in both user and kernel space). User programs should use a separate file to request deletes than the one used for registration due to this.
+**NOTE:** By default events will auto-delete when there are no references left +to the event. Flags in the future may change this logic. + Unregistering ------------- If after registering an event it is no longer wanted to be updated then it can -- cgit v1.2.3 From adeaa3f290ecf7f6a6a5c53219a4686cbdff5fbd Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 13 Jun 2023 19:26:55 -0600 Subject: io_uring/io-wq: clear current->worker_private on exit A recent fix stopped clearing PF_IO_WORKER from current->flags on exit, which means that we can now call inc/dec running on the worker after it has been removed, if it ends up scheduling in/out as part of exit. If this happens after an RCU grace period has passed, then the struct pointed to by current->worker_private may have been freed, and we may then be accessing memory that has been freed. Ensure this doesn't happen by clearing the task's worker_private field. Both io_wq_worker_running() and io_wq_worker_sleeping() check this field before going any further, and we don't need any accounting etc done after this worker has exited. Fixes: fd37b884003c ("io_uring/io-wq: don't clear PF_IO_WORKER on exit") Reported-by: Zorro Lang Signed-off-by: Jens Axboe --- io_uring/io-wq.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index fe38eb0cbc82..399e9a15c38d 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -220,7 +220,12 @@ static void io_worker_exit(struct io_worker *worker) list_del_rcu(&worker->all_list); raw_spin_unlock(&wq->lock); io_wq_dec_running(worker); - worker->flags = 0; + /* + * this worker is a goner, clear ->worker_private to avoid any + * inc/dec running calls that could happen as part of exit from + * touching 'worker'. + */ + current->worker_private = NULL; kfree_rcu(worker, rcu); io_worker_ref_put(wq); -- cgit v1.2.3 From 23200a4c8ac284f8b4263d7cecaefecaa3ad6732 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 11 May 2023 12:58:33 +0200 Subject: clk: pxa: fix NULL pointer dereference in pxa3xx_clk_update_accr sparse points out an embarrassing bug in an older patch of mine, which uses the register offset instead of an __iomem pointer: drivers/clk/pxa/clk-pxa3xx.c:167:9: sparse: sparse: Using plain integer as NULL pointer Unlike sparse, gcc and clang ignore this bug and fail to warn because a literal '0' is considered a valid representation of a NULL pointer.
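The trap is easy to reproduce: ACCR presumably expands to offset 0 (otherwise the int-to-pointer conversion would have drawn a compiler warning), and a literal 0 is a valid null pointer constant, so only sparse objects. A contrived sketch of the bug and the fix, with hypothetical names:

#include <linux/io.h>
#include <linux/types.h>

#define ACCR	0x0		/* register offset, not an address */

static void __iomem *clk_regs;	/* ioremap()ed controller base */

static void update_accr_buggy(u32 val)
{
	writel(val, ACCR);	/* literal 0 silently becomes a NULL pointer */
}

static void update_accr_fixed(u32 val)
{
	writel(val, clk_regs + ACCR);	/* base + offset, as in the patch */
}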
Fixes: 3c816d950a49 ("ARM: pxa: move clk register definitions to driver") Cc: stable@vger.kernel.org Reported-by: kernel test robot Link: https://lore.kernel.org/oe-kbuild-all/202305111301.RAHohdob-lkp@intel.com/ Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20230511105845.299859-1-arnd@kernel.org Signed-off-by: Stephen Boyd --- drivers/clk/pxa/clk-pxa3xx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/clk/pxa/clk-pxa3xx.c b/drivers/clk/pxa/clk-pxa3xx.c index 42958a542662..621e298f101a 100644 --- a/drivers/clk/pxa/clk-pxa3xx.c +++ b/drivers/clk/pxa/clk-pxa3xx.c @@ -164,7 +164,7 @@ void pxa3xx_clk_update_accr(u32 disable, u32 enable, u32 xclkcfg, u32 mask) accr &= ~disable; accr |= enable; - writel(accr, ACCR); + writel(accr, clk_regs + ACCR); if (xclkcfg) __asm__("mcr p14, 0, %0, c6, c0, 0\n" : : "r"(xclkcfg)); -- cgit v1.2.3 From 9dc704dcc09eae7d21b5da0615eb2ed79278f63e Mon Sep 17 00:00:00 2001 From: Sagar Biradar Date: Fri, 19 May 2023 16:08:34 -0700 Subject: scsi: aacraid: Reply queue mapping to CPUs based on IRQ affinity Fix the I/O hang that arises because of the MSIx vector not having a mapped online CPU upon receiving completion. SCSI cmds take the blk_mq route, which is setup during init. Reserved cmds fetch the vector_no from mq_map after init is complete. Before init, they have to use 0 - as per the norm. Reviewed-by: Gilbert Wu Signed-off-by: Sagar Biradar Reviewed-by: John Garry Link: https://lore.kernel.org/r/20230519230834.27436-1-sagar.biradar@microchip.com Signed-off-by: Martin K. Petersen --- drivers/scsi/aacraid/aacraid.h | 1 + drivers/scsi/aacraid/commsup.c | 6 +++++- drivers/scsi/aacraid/linit.c | 14 ++++++++++++++ drivers/scsi/aacraid/src.c | 25 +++++++++++++++++++++++-- 4 files changed, 43 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/aacraid/aacraid.h b/drivers/scsi/aacraid/aacraid.h index 5e115e8b2ba4..7c6efde75da6 100644 --- a/drivers/scsi/aacraid/aacraid.h +++ b/drivers/scsi/aacraid/aacraid.h @@ -1678,6 +1678,7 @@ struct aac_dev u32 handle_pci_error; bool init_reset; u8 soft_reset_support; + u8 use_map_queue; }; #define aac_adapter_interrupt(dev) \ diff --git a/drivers/scsi/aacraid/commsup.c b/drivers/scsi/aacraid/commsup.c index deb32c9f4b3e..3f062e4013ab 100644 --- a/drivers/scsi/aacraid/commsup.c +++ b/drivers/scsi/aacraid/commsup.c @@ -223,8 +223,12 @@ int aac_fib_setup(struct aac_dev * dev) struct fib *aac_fib_alloc_tag(struct aac_dev *dev, struct scsi_cmnd *scmd) { struct fib *fibptr; + u32 blk_tag; + int i; - fibptr = &dev->fibs[scsi_cmd_to_rq(scmd)->tag]; + blk_tag = blk_mq_unique_tag(scsi_cmd_to_rq(scmd)); + i = blk_mq_unique_tag_to_tag(blk_tag); + fibptr = &dev->fibs[i]; /* * Null out fields that depend on being zero at the start of * each I/O diff --git a/drivers/scsi/aacraid/linit.c b/drivers/scsi/aacraid/linit.c index 68f4dbcfff49..c4a36c0be527 100644 --- a/drivers/scsi/aacraid/linit.c +++ b/drivers/scsi/aacraid/linit.c @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -504,6 +505,15 @@ common_config: return 0; } +static void aac_map_queues(struct Scsi_Host *shost) +{ + struct aac_dev *aac = (struct aac_dev *)shost->hostdata; + + blk_mq_pci_map_queues(&shost->tag_set.map[HCTX_TYPE_DEFAULT], + aac->pdev, 0); + aac->use_map_queue = true; +} + /** * aac_change_queue_depth - alter queue depths * @sdev: SCSI device we are considering @@ -1488,6 +1498,7 @@ static const struct scsi_host_template aac_driver_template = { .bios_param = aac_biosparm, .shost_groups = aac_host_groups, 
.slave_configure = aac_slave_configure, + .map_queues = aac_map_queues, .change_queue_depth = aac_change_queue_depth, .sdev_groups = aac_dev_groups, .eh_abort_handler = aac_eh_abort, @@ -1775,6 +1786,8 @@ static int aac_probe_one(struct pci_dev *pdev, const struct pci_device_id *id) shost->max_lun = AAC_MAX_LUN; pci_set_drvdata(pdev, shost); + shost->nr_hw_queues = aac->max_msix; + shost->host_tagset = 1; error = scsi_add_host(shost, &pdev->dev); if (error) @@ -1906,6 +1919,7 @@ static void aac_remove_one(struct pci_dev *pdev) struct aac_dev *aac = (struct aac_dev *)shost->hostdata; aac_cancel_rescan_worker(aac); + aac->use_map_queue = false; scsi_remove_host(shost); __aac_shutdown(aac); diff --git a/drivers/scsi/aacraid/src.c b/drivers/scsi/aacraid/src.c index 11ef58204e96..61949f374188 100644 --- a/drivers/scsi/aacraid/src.c +++ b/drivers/scsi/aacraid/src.c @@ -493,6 +493,10 @@ static int aac_src_deliver_message(struct fib *fib) #endif u16 vector_no; + struct scsi_cmnd *scmd; + u32 blk_tag; + struct Scsi_Host *shost = dev->scsi_host_ptr; + struct blk_mq_queue_map *qmap; atomic_inc(&q->numpending); @@ -505,8 +509,25 @@ static int aac_src_deliver_message(struct fib *fib) if ((dev->comm_interface == AAC_COMM_MESSAGE_TYPE3) && dev->sa_firmware) vector_no = aac_get_vector(dev); - else - vector_no = fib->vector_no; + else { + if (!fib->vector_no || !fib->callback_data) { + if (shost && dev->use_map_queue) { + qmap = &shost->tag_set.map[HCTX_TYPE_DEFAULT]; + vector_no = qmap->mq_map[raw_smp_processor_id()]; + } + /* + * We hardcode the vector_no for + * reserved commands as a valid shost is + * absent during the init + */ + else + vector_no = 0; + } else { + scmd = (struct scsi_cmnd *)fib->callback_data; + blk_tag = blk_mq_unique_tag(scsi_cmd_to_rq(scmd)); + vector_no = blk_mq_unique_tag_to_hwq(blk_tag); + } + } if (native_hba) { if (fib->flags & FIB_CONTEXT_FLAG_NATIVE_HBA_TMF) { -- cgit v1.2.3 From 31d16e712bdcaee769de4780f72ff8d6cd3f0589 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Fri, 9 Jun 2023 13:38:21 -0700 Subject: scsi: storvsc: Always set no_report_opcodes Hyper-V synthetic SCSI devices do not support the MAINTENANCE_IN SCSI command, so scsi_report_opcode() always fails, resulting in messages like this: hv_storvsc : tag#205 cmd 0xa3 status: scsi 0x2 srb 0x86 hv 0xc0000001 The recently added support for command duration limits calls scsi_report_opcode() four times as each device comes online, which significantly increases the number of messages logged in a system with many disks. Fix the problem by always marking Hyper-V synthetic SCSI devices as not supporting scsi_report_opcode(). With this setting, the MAINTENANCE_IN SCSI command is not issued and no messages are logged. Signed-off-by: Michael Kelley Link: https://lore.kernel.org/r/1686343101-18930-1-git-send-email-mikelley@microsoft.com Signed-off-by: Martin K. 
Petersen --- drivers/scsi/storvsc_drv.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c index e6bc622954cf..659196a2f63a 100644 --- a/drivers/scsi/storvsc_drv.c +++ b/drivers/scsi/storvsc_drv.c @@ -1567,6 +1567,8 @@ static int storvsc_device_configure(struct scsi_device *sdevice) { blk_queue_rq_timeout(sdevice->request_queue, (storvsc_timeout * HZ)); + /* storvsc devices don't support MAINTENANCE_IN SCSI cmd */ + sdevice->no_report_opcodes = 1; sdevice->no_write_same = 1; /* -- cgit v1.2.3 From 91271699228bfc66f1bc8abc0327169dc156d854 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Tue, 13 Jun 2023 09:43:00 -0500 Subject: scsi: target: core: Fix error path in target_setup_session() In the error exits in target_setup_session(), if a branch is taken to free_sess: transport_free_session() may call to target_free_cmd_counter() and then fall through to call target_free_cmd_counter() a second time. This can, and does, sometimes cause seg faults since the data field in cmd_cnt->refcnt has been freed in the first call. Fix this problem by simply returning after the call to transport_free_session(). The second call is redundant for those cases. Fixes: 4edba7e4a8f3 ("scsi: target: Move cmd counter allocation") Signed-off-by: Bob Pearson Link: https://lore.kernel.org/r/20230613144259.12890-1-rpearsonhpe@gmail.com Reviewed-by: Mike Christie Signed-off-by: Martin K. Petersen --- drivers/target/target_core_transport.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index 86adff2a86ed..687adc9e086c 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c @@ -504,6 +504,8 @@ target_setup_session(struct se_portal_group *tpg, free_sess: transport_free_session(sess); + return ERR_PTR(rc); + free_cnt: target_free_cmd_counter(cmd_cnt); return ERR_PTR(rc); -- cgit v1.2.3 From 9cefd6e7e0a77b0fbca5c793f6fb6821b0962775 Mon Sep 17 00:00:00 2001 From: Justin Tee Date: Wed, 14 Jun 2023 10:59:44 -0700 Subject: scsi: lpfc: Fix incorrect big endian type assignment in bsg loopback path The kernel test robot reported sparse warnings regarding incorrect type assignment for __be16 variables in bsg loopback path. Change the flagged lines to use the be16_to_cpu() and cpu_to_be16() macros appropriately. Signed-off-by: Justin Tee Link: https://lore.kernel.org/r/20230614175944.3577-1-justintee8345@gmail.com Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202306110819.sDIKiGgg-lkp@intel.com/ Signed-off-by: Martin K. 
Petersen --- drivers/scsi/lpfc/lpfc_bsg.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_bsg.c b/drivers/scsi/lpfc/lpfc_bsg.c index 9a322a3a2150..595dca92e8db 100644 --- a/drivers/scsi/lpfc/lpfc_bsg.c +++ b/drivers/scsi/lpfc/lpfc_bsg.c @@ -889,7 +889,7 @@ lpfc_bsg_ct_unsol_event(struct lpfc_hba *phba, struct lpfc_sli_ring *pring, struct lpfc_iocbq *piocbq) { uint32_t evt_req_id = 0; - uint32_t cmd; + u16 cmd; struct lpfc_dmabuf *dmabuf = NULL; struct lpfc_bsg_event *evt; struct event_data *evt_dat = NULL; @@ -915,7 +915,7 @@ lpfc_bsg_ct_unsol_event(struct lpfc_hba *phba, struct lpfc_sli_ring *pring, ct_req = (struct lpfc_sli_ct_request *)bdeBuf1->virt; evt_req_id = ct_req->FsType; - cmd = ct_req->CommandResponse.bits.CmdRsp; + cmd = be16_to_cpu(ct_req->CommandResponse.bits.CmdRsp); spin_lock_irqsave(&phba->ct_ev_lock, flags); list_for_each_entry(evt, &phba->ct_ev_waiters, node) { @@ -3186,8 +3186,8 @@ lpfc_bsg_diag_loopback_run(struct bsg_job *job) ctreq->RevisionId.bits.InId = 0; ctreq->FsType = SLI_CT_ELX_LOOPBACK; ctreq->FsSubType = 0; - ctreq->CommandResponse.bits.CmdRsp = ELX_LOOPBACK_DATA; - ctreq->CommandResponse.bits.Size = size; + ctreq->CommandResponse.bits.CmdRsp = cpu_to_be16(ELX_LOOPBACK_DATA); + ctreq->CommandResponse.bits.Size = cpu_to_be16(size); segment_offset = ELX_LOOPBACK_HEADER_SZ; } else segment_offset = 0; -- cgit v1.2.3 From 9d7054fb3ac2e8d252aae1268f20623f244e644f Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Thu, 15 Jun 2023 14:51:45 +0200 Subject: spi: spi-geni-qcom: correctly handle -EPROBE_DEFER from dma_request_chan() Now spi_geni_grab_gpi_chan() errors are correctly reported, the -EPROBE_DEFER error should be returned from probe in case the GPI dma driver is built as module and/or not probed yet. Fixes: b59c122484ec ("spi: spi-geni-qcom: Add support for GPI dma") Fixes: 6532582c353f ("spi: spi-geni-qcom: fix error handling in spi_geni_grab_gpi_chan()") Signed-off-by: Neil Armstrong Link: https://lore.kernel.org/r/20230615-topic-sm8550-upstream-fix-spi-geni-qcom-probe-v2-1-670c3d9e8c9c@linaro.org Signed-off-by: Mark Brown --- drivers/spi/spi-geni-qcom.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/spi/spi-geni-qcom.c b/drivers/spi/spi-geni-qcom.c index a98b781b103a..b293428760bc 100644 --- a/drivers/spi/spi-geni-qcom.c +++ b/drivers/spi/spi-geni-qcom.c @@ -646,6 +646,8 @@ static int spi_geni_init(struct spi_geni_master *mas) geni_se_select_mode(se, GENI_GPI_DMA); dev_dbg(mas->dev, "Using GPI DMA mode for SPI\n"); break; + } else if (ret == -EPROBE_DEFER) { + goto out_pm; } /* * in case of failure to get gpi dma channel, we can still do the -- cgit v1.2.3 From 22db06337f590d01d79f60f181d8dfe5a9ef9085 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 14 Jun 2023 17:29:21 +0200 Subject: ACPI: sleep: Avoid breaking S3 wakeup due to might_sleep() The addition of might_sleep() to down_timeout() caused the latter to enable interrupts unconditionally in some cases, which in turn broke the ACPI S3 wakeup path in acpi_suspend_enter(), where down_timeout() is called by acpi_disable_all_gpes() via acpi_ut_acquire_mutex(). Namely, if CONFIG_DEBUG_ATOMIC_SLEEP is set, might_sleep() causes might_resched() to be used and if CONFIG_PREEMPT_VOLUNTARY is set, this triggers __cond_resched() which may call preempt_schedule_common(), so __schedule() gets invoked and it ends up with enabled interrupts (in the prev == next case). 
Now, enabling interrupts early in the S3 wakeup path causes the kernel to crash. Address this by modifying acpi_suspend_enter() to disable GPEs without attempting to acquire the sleeping lock which is not needed in that code path anyway. Fixes: 99409b935c9a ("locking/semaphore: Add might_sleep() to down_*() family") Reported-by: Srinivas Pandruvada Signed-off-by: Rafael J. Wysocki Acked-by: Peter Zijlstra (Intel) Cc: 5.15+ # 5.15+ --- drivers/acpi/acpica/achware.h | 2 -- drivers/acpi/sleep.c | 16 ++++++++++++---- include/acpi/acpixf.h | 1 + 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/drivers/acpi/acpica/achware.h b/drivers/acpi/acpica/achware.h index ebf8fd373cf7..79bbfe00d241 100644 --- a/drivers/acpi/acpica/achware.h +++ b/drivers/acpi/acpica/achware.h @@ -101,8 +101,6 @@ acpi_status acpi_hw_get_gpe_status(struct acpi_gpe_event_info *gpe_event_info, acpi_event_status *event_status); -acpi_status acpi_hw_disable_all_gpes(void); - acpi_status acpi_hw_enable_all_runtime_gpes(void); acpi_status acpi_hw_enable_all_wakeup_gpes(void); diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index 72470b9f16c4..f32570f72b90 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -636,11 +636,19 @@ static int acpi_suspend_enter(suspend_state_t pm_state) } /* - * Disable and clear GPE status before interrupt is enabled. Some GPEs - * (like wakeup GPE) haven't handler, this can avoid such GPE misfire. - * acpi_leave_sleep_state will reenable specific GPEs later + * Disable all GPE and clear their status bits before interrupts are + * enabled. Some GPEs (like wakeup GPEs) have no handlers and this can + * prevent them from producing spurious interrups. + * + * acpi_leave_sleep_state() will reenable specific GPEs later. + * + * Because this code runs on one CPU with disabled interrupts (all of + * the other CPUs are offline at this time), it need not acquire any + * sleeping locks which may trigger an implicit preemption point even + * if there is no contention, so avoid doing that by using a low-level + * library routine here. */ - acpi_disable_all_gpes(); + acpi_hw_disable_all_gpes(); /* Allow EC transactions to happen. */ acpi_ec_unblock_transactions(); diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h index e6098a08c914..9ffdc0425bc2 100644 --- a/include/acpi/acpixf.h +++ b/include/acpi/acpixf.h @@ -761,6 +761,7 @@ ACPI_HW_DEPENDENT_RETURN_STATUS(acpi_status acpi_event_status *event_status)) ACPI_HW_DEPENDENT_RETURN_UINT32(u32 acpi_dispatch_gpe(acpi_handle gpe_device, u32 gpe_number)) +ACPI_HW_DEPENDENT_RETURN_STATUS(acpi_status acpi_hw_disable_all_gpes(void)) ACPI_HW_DEPENDENT_RETURN_STATUS(acpi_status acpi_disable_all_gpes(void)) ACPI_HW_DEPENDENT_RETURN_STATUS(acpi_status acpi_enable_all_runtime_gpes(void)) ACPI_HW_DEPENDENT_RETURN_STATUS(acpi_status acpi_enable_all_wakeup_gpes(void)) -- cgit v1.2.3 From 0bb619f9227aa370330d2b309733d74750705053 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 14 Jun 2023 12:07:56 +0200 Subject: thermal/intel/intel_soc_dts_iosf: Fix reporting wrong temperatures Since commit 955fb8719efb ("thermal/intel/intel_soc_dts_iosf: Use Intel TCC library") intel_soc_dts_iosf is reporting the wrong temperature. The driver expects tj_max to be in milli-degrees-celcius but after the switch to the TCC library this is now in degrees celcius so instead of e.g. 90000 it is set to 90 causing a temperature 45 degrees below tj_max to be reported as -44910 milli-degrees instead of as 45000 milli-degrees. 
Fix this by adding back the lost factor of 1000. Fixes: 955fb8719efb ("thermal/intel/intel_soc_dts_iosf: Use Intel TCC library") Reported-by: Bernhard Krug Signed-off-by: Hans de Goede Acked-by: Zhang Rui Cc: 6.3+ # 6.3+ Signed-off-by: Rafael J. Wysocki --- drivers/thermal/intel/intel_soc_dts_iosf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/thermal/intel/intel_soc_dts_iosf.c b/drivers/thermal/intel/intel_soc_dts_iosf.c index f99dc7e4ae89..db97499f4f0a 100644 --- a/drivers/thermal/intel/intel_soc_dts_iosf.c +++ b/drivers/thermal/intel/intel_soc_dts_iosf.c @@ -398,7 +398,7 @@ struct intel_soc_dts_sensors *intel_soc_dts_iosf_init( spin_lock_init(&sensors->intr_notify_lock); mutex_init(&sensors->dts_update_lock); sensors->intr_type = intr_type; - sensors->tj_max = tj_max; + sensors->tj_max = tj_max * 1000; if (intr_type == INTEL_SOC_DTS_INTERRUPT_NONE) notification = false; else -- cgit v1.2.3 From c8a5d5ea3ba6a18958f8d76430e4cd68eea33943 Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Thu, 15 Jun 2023 12:22:11 +1000 Subject: nouveau: fix client work fence deletion race This seems to have existed forever but is now more apparent after commit 9bff18d13473 ("drm/ttm: use per BO cleanup workers"). My analysis: two threads are running. One, in the IRQ signalling the fence, is inside dma_fence_signal_timestamp_locked(); it has set DMA_FENCE_FLAG_SIGNALLED_BIT but hasn't yet reached the callbacks. The second thread, in nouveau_cli_work_ready(), sees the fence is signalled, so it puts the fence, cleans up the object and frees the work item, which contains the callback. Thread one then proceeds, tries to call the callback and causes the use-after-free. Proposed fix: perform the signalled check in nouveau_cli_work_ready() under the fence lock, so that the callbacks are guaranteed to have run before the memory is freed. Reviewed-by: Karol Herbst Fixes: 11e451e74050 ("drm/nouveau: remove fence wait code from deferred client work handler") Cc: stable@vger.kernel.org Signed-off-by: Dave Airlie Link: https://lore.kernel.org/dri-devel/20230615024008.1600281-1-airlied@gmail.com/ --- drivers/gpu/drm/nouveau/nouveau_drm.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_drm.c b/drivers/gpu/drm/nouveau/nouveau_drm.c index cc7c5b4a05fd..7aac9384600e 100644 --- a/drivers/gpu/drm/nouveau/nouveau_drm.c +++ b/drivers/gpu/drm/nouveau/nouveau_drm.c @@ -137,10 +137,16 @@ nouveau_name(struct drm_device *dev) static inline bool nouveau_cli_work_ready(struct dma_fence *fence) { - if (!dma_fence_is_signaled(fence)) - return false; - dma_fence_put(fence); - return true; + bool ret = true; + + spin_lock_irq(fence->lock); + if (!dma_fence_is_signaled_locked(fence)) + ret = false; + spin_unlock_irq(fence->lock); + + if (ret == true) + dma_fence_put(fence); + return ret; } static void -- cgit v1.2.3 From 372b304c1e517c4d3e5b6ca6c9cfb20f027c3b03 Mon Sep 17 00:00:00 2001 From: Magali Lemes Date: Tue, 13 Jun 2023 09:32:19 -0300 Subject: selftests/harness: allow tests to be skipped during setup Before executing each test from a fixture, FIXTURE_SETUP is run once. When SKIP is used in FIXTURE_SETUP, the setup function returns early but the test still proceeds to run, unless another SKIP macro is used within the test definition, leading to some code repetition. Therefore, allow tests to be skipped directly from the setup function.
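With this in place a fixture can probe a prerequisite once and have every dependent test report as skipped, the pattern the TLS patch below uses for FIPS. A rough sketch with a made-up device node:

#include <fcntl.h>
#include <unistd.h>

#include "../kselftest_harness.h"

FIXTURE(feature) {
	int fd;
};

FIXTURE_SETUP(feature)
{
	self->fd = open("/dev/feature0", O_RDWR);	/* hypothetical node */

	/* Now terminates the whole test as a skip, not just the setup. */
	if (self->fd == -1)
		SKIP(return, "feature device not present");
}

FIXTURE_TEARDOWN(feature)
{
	close(self->fd);	/* only runs when setup completed */
}

TEST_F(feature, smoke)
{
	ASSERT_NE(-1, self->fd);
}

TEST_HARNESS_MAIN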
Suggested-by: Jakub Kicinski Signed-off-by: Magali Lemes Signed-off-by: Jakub Kicinski --- tools/testing/selftests/kselftest_harness.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index d8bff2005dfc..5fd49ad0c696 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -249,7 +249,7 @@ /** * FIXTURE_SETUP() - Prepares the setup function for the fixture. - * *_metadata* is included so that EXPECT_* and ASSERT_* work correctly. + * *_metadata* is included so that EXPECT_*, ASSERT_* etc. work correctly. * * @fixture_name: fixture name * @@ -275,7 +275,7 @@ /** * FIXTURE_TEARDOWN() - * *_metadata* is included so that EXPECT_* and ASSERT_* work correctly. + * *_metadata* is included so that EXPECT_*, ASSERT_* etc. work correctly. * * @fixture_name: fixture name * @@ -388,7 +388,7 @@ if (setjmp(_metadata->env) == 0) { \ fixture_name##_setup(_metadata, &self, variant->data); \ /* Let setup failure terminate early. */ \ - if (!_metadata->passed) \ + if (!_metadata->passed || _metadata->skip) \ return; \ _metadata->setup_completed = true; \ fixture_name##_##test_name(_metadata, &self, variant->data); \ -- cgit v1.2.3 From d113c395c67b62fc0d3f2004c0afc406aca0a2b7 Mon Sep 17 00:00:00 2001 From: Magali Lemes Date: Tue, 13 Jun 2023 09:32:20 -0300 Subject: selftests: net: tls: check if FIPS mode is enabled TLS selftests use the ChaCha20-Poly1305 and SM4 algorithms, which are not FIPS compliant. When fips=1, this set of tests fails. Add a check and only run these tests if not in FIPS mode. Fixes: 4f336e88a870 ("selftests/tls: add CHACHA20-POLY1305 to tls selftests") Fixes: e506342a03c7 ("selftests/tls: add SM4 GCM/CCM to tls selftests") Reviewed-by: Jakub Kicinski Signed-off-by: Magali Lemes Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/tls.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c index e699548d4247..ff36844d14b4 100644 --- a/tools/testing/selftests/net/tls.c +++ b/tools/testing/selftests/net/tls.c @@ -25,6 +25,8 @@ #define TLS_PAYLOAD_MAX_LEN 16384 #define SOL_TLS 282 +static int fips_enabled; + struct tls_crypto_info_keys { union { struct tls12_crypto_info_aes_gcm_128 aes128; @@ -235,7 +237,7 @@ FIXTURE_VARIANT(tls) { uint16_t tls_version; uint16_t cipher_type; - bool nopad; + bool nopad, fips_non_compliant; }; FIXTURE_VARIANT_ADD(tls, 12_aes_gcm) @@ -254,24 +256,28 @@ FIXTURE_VARIANT_ADD(tls, 12_chacha) { .tls_version = TLS_1_2_VERSION, .cipher_type = TLS_CIPHER_CHACHA20_POLY1305, + .fips_non_compliant = true, }; FIXTURE_VARIANT_ADD(tls, 13_chacha) { .tls_version = TLS_1_3_VERSION, .cipher_type = TLS_CIPHER_CHACHA20_POLY1305, + .fips_non_compliant = true, }; FIXTURE_VARIANT_ADD(tls, 13_sm4_gcm) { .tls_version = TLS_1_3_VERSION, .cipher_type = TLS_CIPHER_SM4_GCM, + .fips_non_compliant = true, }; FIXTURE_VARIANT_ADD(tls, 13_sm4_ccm) { .tls_version = TLS_1_3_VERSION, .cipher_type = TLS_CIPHER_SM4_CCM, + .fips_non_compliant = true, }; FIXTURE_VARIANT_ADD(tls, 12_aes_ccm) @@ -311,6 +317,9 @@ FIXTURE_SETUP(tls) int one = 1; int ret; + if (fips_enabled && variant->fips_non_compliant) + SKIP(return, "Unsupported cipher in FIPS mode"); + tls_crypto_info_init(variant->tls_version, variant->cipher_type, &tls12); @@ -1865,4 +1874,17 @@ TEST(prequeue) { close(cfd); } +static void __attribute__((constructor)) 
fips_check(void) { + int res; + FILE *f; + + f = fopen("/proc/sys/crypto/fips_enabled", "r"); + if (f) { + res = fscanf(f, "%d", &fips_enabled); + if (res != 1) + ksft_print_msg("ERROR: Couldn't read /proc/sys/crypto/fips_enabled\n"); + fclose(f); + } +} + TEST_HARNESS_MAIN -- cgit v1.2.3 From cb43c60e64ca67fcc9d23bd08f51d2ab8209d9d7 Mon Sep 17 00:00:00 2001 From: Magali Lemes Date: Tue, 13 Jun 2023 09:32:21 -0300 Subject: selftests: net: vrf-xfrm-tests: change authentication and encryption algos The vrf-xfrm-tests tests use the hmac(md5) and cbc(des3_ede) algorithms for performing authentication and encryption, respectively. This causes the tests to fail when fips=1 is set, since these algorithms are not allowed in FIPS mode. Therefore, switch from hmac(md5) and cbc(des3_ede) to hmac(sha1) and cbc(aes), which are FIPS compliant. Fixes: 3f251d741150 ("selftests: Add tests for vrf and xfrms") Reviewed-by: David Ahern Signed-off-by: Magali Lemes Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/vrf-xfrm-tests.sh | 32 +++++++++++++-------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/tools/testing/selftests/net/vrf-xfrm-tests.sh b/tools/testing/selftests/net/vrf-xfrm-tests.sh index 184da81f554f..452638ae8aed 100755 --- a/tools/testing/selftests/net/vrf-xfrm-tests.sh +++ b/tools/testing/selftests/net/vrf-xfrm-tests.sh @@ -264,60 +264,60 @@ setup_xfrm() ip -netns host1 xfrm state add src ${HOST1_4} dst ${HOST2_4} \ proto esp spi ${SPI_1} reqid 0 mode tunnel \ replay-window 4 replay-oseq 0x4 \ - auth-trunc 'hmac(md5)' ${AUTH_1} 96 \ - enc 'cbc(des3_ede)' ${ENC_1} \ + auth-trunc 'hmac(sha1)' ${AUTH_1} 96 \ + enc 'cbc(aes)' ${ENC_1} \ sel src ${h1_4} dst ${h2_4} ${devarg} ip -netns host2 xfrm state add src ${HOST1_4} dst ${HOST2_4} \ proto esp spi ${SPI_1} reqid 0 mode tunnel \ replay-window 4 replay-oseq 0x4 \ - auth-trunc 'hmac(md5)' ${AUTH_1} 96 \ - enc 'cbc(des3_ede)' ${ENC_1} \ + auth-trunc 'hmac(sha1)' ${AUTH_1} 96 \ + enc 'cbc(aes)' ${ENC_1} \ sel src ${h1_4} dst ${h2_4} ip -netns host1 xfrm state add src ${HOST2_4} dst ${HOST1_4} \ proto esp spi ${SPI_2} reqid 0 mode tunnel \ replay-window 4 replay-oseq 0x4 \ - auth-trunc 'hmac(md5)' ${AUTH_2} 96 \ - enc 'cbc(des3_ede)' ${ENC_2} \ + auth-trunc 'hmac(sha1)' ${AUTH_2} 96 \ + enc 'cbc(aes)' ${ENC_2} \ sel src ${h2_4} dst ${h1_4} ${devarg} ip -netns host2 xfrm state add src ${HOST2_4} dst ${HOST1_4} \ proto esp spi ${SPI_2} reqid 0 mode tunnel \ replay-window 4 replay-oseq 0x4 \ - auth-trunc 'hmac(md5)' ${AUTH_2} 96 \ - enc 'cbc(des3_ede)' ${ENC_2} \ + auth-trunc 'hmac(sha1)' ${AUTH_2} 96 \ + enc 'cbc(aes)' ${ENC_2} \ sel src ${h2_4} dst ${h1_4} ip -6 -netns host1 xfrm state add src ${HOST1_6} dst ${HOST2_6} \ proto esp spi ${SPI_1} reqid 0 mode tunnel \ replay-window 4 replay-oseq 0x4 \ - auth-trunc 'hmac(md5)' ${AUTH_1} 96 \ - enc 'cbc(des3_ede)' ${ENC_1} \ + auth-trunc 'hmac(sha1)' ${AUTH_1} 96 \ + enc 'cbc(aes)' ${ENC_1} \ sel src ${h1_6} dst ${h2_6} ${devarg} ip -6 -netns host2 xfrm state add src ${HOST1_6} dst ${HOST2_6} \ proto esp spi ${SPI_1} reqid 0 mode tunnel \ replay-window 4 replay-oseq 0x4 \ - auth-trunc 'hmac(md5)' ${AUTH_1} 96 \ - enc 'cbc(des3_ede)' ${ENC_1} \ + auth-trunc 'hmac(sha1)' ${AUTH_1} 96 \ + enc 'cbc(aes)' ${ENC_1} \ sel src ${h1_6} dst ${h2_6} ip -6 -netns host1 xfrm state add src ${HOST2_6} dst ${HOST1_6} \ proto esp spi ${SPI_2} reqid 0 mode tunnel \ replay-window 4 replay-oseq 0x4 \ - auth-trunc 'hmac(md5)' ${AUTH_2} 96 \ - enc 'cbc(des3_ede)' ${ENC_2} \ + auth-trunc 
'hmac(sha1)' ${AUTH_2} 96 \ + enc 'cbc(aes)' ${ENC_2} \ sel src ${h2_6} dst ${h1_6} ${devarg} ip -6 -netns host2 xfrm state add src ${HOST2_6} dst ${HOST1_6} \ proto esp spi ${SPI_2} reqid 0 mode tunnel \ replay-window 4 replay-oseq 0x4 \ - auth-trunc 'hmac(md5)' ${AUTH_2} 96 \ - enc 'cbc(des3_ede)' ${ENC_2} \ + auth-trunc 'hmac(sha1)' ${AUTH_2} 96 \ + enc 'cbc(aes)' ${ENC_2} \ sel src ${h2_6} dst ${h1_6} } -- cgit v1.2.3 From d7a2fc1437f71cb058c7b11bc33dfc19e4bf277a Mon Sep 17 00:00:00 2001 From: Magali Lemes Date: Tue, 13 Jun 2023 09:32:22 -0300 Subject: selftests: net: fcnal-test: check if FIPS mode is enabled There are some MD5 tests which fail when the kernel is in FIPS mode, since MD5 is not FIPS compliant. Add a check and only run those tests if FIPS mode is not enabled. Fixes: f0bee1ebb5594 ("fcnal-test: Add TCP MD5 tests") Fixes: 5cad8bce26e01 ("fcnal-test: Add TCP MD5 tests for VRF") Reviewed-by: David Ahern Signed-off-by: Magali Lemes Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/fcnal-test.sh | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/net/fcnal-test.sh b/tools/testing/selftests/net/fcnal-test.sh index 21ca91473c09..ee6880ac3e5e 100755 --- a/tools/testing/selftests/net/fcnal-test.sh +++ b/tools/testing/selftests/net/fcnal-test.sh @@ -92,6 +92,13 @@ NSC_CMD="ip netns exec ${NSC}" which ping6 > /dev/null 2>&1 && ping6=$(which ping6) || ping6=$(which ping) +# Check if FIPS mode is enabled +if [ -f /proc/sys/crypto/fips_enabled ]; then + fips_enabled=`cat /proc/sys/crypto/fips_enabled` +else + fips_enabled=0 +fi + ################################################################################ # utilities @@ -1216,7 +1223,7 @@ ipv4_tcp_novrf() run_cmd nettest -d ${NSA_DEV} -r ${a} log_test_addr ${a} $? 1 "No server, device client, local conn" - ipv4_tcp_md5_novrf + [ "$fips_enabled" = "1" ] || ipv4_tcp_md5_novrf } ipv4_tcp_vrf() @@ -1270,9 +1277,11 @@ ipv4_tcp_vrf() log_test_addr ${a} $? 1 "Global server, local connection" # run MD5 tests - setup_vrf_dup - ipv4_tcp_md5 - cleanup_vrf_dup + if [ "$fips_enabled" = "0" ]; then + setup_vrf_dup + ipv4_tcp_md5 + cleanup_vrf_dup + fi # # enable VRF global server @@ -2772,7 +2781,7 @@ ipv6_tcp_novrf() log_test_addr ${a} $? 1 "No server, device client, local conn" done - ipv6_tcp_md5_novrf + [ "$fips_enabled" = "1" ] || ipv6_tcp_md5_novrf } ipv6_tcp_vrf() @@ -2842,9 +2851,11 @@ ipv6_tcp_vrf() log_test_addr ${a} $? 1 "Global server, local connection" # run MD5 tests - setup_vrf_dup - ipv6_tcp_md5 - cleanup_vrf_dup + if [ "$fips_enabled" = "0" ]; then + setup_vrf_dup + ipv6_tcp_md5 + cleanup_vrf_dup + fi # # enable VRF global server -- cgit v1.2.3 From 297224fc0922e7385573a30c29ffdabb67f27b7d Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 12 Jun 2023 14:55:33 +0200 Subject: ALSA: seq: oss: Fix racy open/close of MIDI devices Although snd_seq_oss_midi_open() and snd_seq_oss_midi_close() can be called concurrently from different code paths, we have no proper data protection against races. Introduce open_mutex to each seq_oss_midi object for avoiding the races. 
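Stripped of the MIDI specifics, the fix turns an unlocked check-then-act on ->opened into a critical section. A condensed sketch of the shape (struct midi_port and the helpers are hypothetical):

#include <linux/errno.h>
#include <linux/mutex.h>

struct midi_port {
	struct mutex open_mutex;
	int opened;
};

static int port_open(struct midi_port *p, int mode)
{
	int err = 0;

	mutex_lock(&p->open_mutex);
	if (p->opened) {		/* checked and set under one lock */
		err = -EBUSY;
		goto unlock;
	}
	p->opened = mode;
unlock:
	mutex_unlock(&p->open_mutex);
	return err;
}

static void port_close(struct midi_port *p)
{
	mutex_lock(&p->open_mutex);
	p->opened = 0;
	mutex_unlock(&p->open_mutex);
}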
Reported-by: "Gong, Sishuai" Closes: https://lore.kernel.org/r/7DC9AF71-F481-4ABA-955F-76C535661E33@purdue.edu Link: https://lore.kernel.org/r/20230612125533.27461-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/core/seq/oss/seq_oss_midi.c | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/sound/core/seq/oss/seq_oss_midi.c b/sound/core/seq/oss/seq_oss_midi.c index 07efb38f58ac..f2940b29595f 100644 --- a/sound/core/seq/oss/seq_oss_midi.c +++ b/sound/core/seq/oss/seq_oss_midi.c @@ -37,6 +37,7 @@ struct seq_oss_midi { struct snd_midi_event *coder; /* MIDI event coder */ struct seq_oss_devinfo *devinfo; /* assigned OSSseq device */ snd_use_lock_t use_lock; + struct mutex open_mutex; }; @@ -172,6 +173,7 @@ snd_seq_oss_midi_check_new_port(struct snd_seq_port_info *pinfo) mdev->flags = pinfo->capability; mdev->opened = 0; snd_use_lock_init(&mdev->use_lock); + mutex_init(&mdev->open_mutex); /* copy and truncate the name of synth device */ strscpy(mdev->name, pinfo->name, sizeof(mdev->name)); @@ -322,15 +324,17 @@ snd_seq_oss_midi_open(struct seq_oss_devinfo *dp, int dev, int fmode) int perm; struct seq_oss_midi *mdev; struct snd_seq_port_subscribe subs; + int err; mdev = get_mididev(dp, dev); if (!mdev) return -ENODEV; + mutex_lock(&mdev->open_mutex); /* already used? */ if (mdev->opened && mdev->devinfo != dp) { - snd_use_lock_free(&mdev->use_lock); - return -EBUSY; + err = -EBUSY; + goto unlock; } perm = 0; @@ -340,14 +344,14 @@ snd_seq_oss_midi_open(struct seq_oss_devinfo *dp, int dev, int fmode) perm |= PERM_READ; perm &= mdev->flags; if (perm == 0) { - snd_use_lock_free(&mdev->use_lock); - return -ENXIO; + err = -ENXIO; + goto unlock; } /* already opened? */ if ((mdev->opened & perm) == perm) { - snd_use_lock_free(&mdev->use_lock); - return 0; + err = 0; + goto unlock; } perm &= ~mdev->opened; @@ -372,13 +376,17 @@ snd_seq_oss_midi_open(struct seq_oss_devinfo *dp, int dev, int fmode) } if (! mdev->opened) { - snd_use_lock_free(&mdev->use_lock); - return -ENXIO; + err = -ENXIO; + goto unlock; } mdev->devinfo = dp; + err = 0; + + unlock: + mutex_unlock(&mdev->open_mutex); snd_use_lock_free(&mdev->use_lock); - return 0; + return err; } /* @@ -393,10 +401,9 @@ snd_seq_oss_midi_close(struct seq_oss_devinfo *dp, int dev) mdev = get_mididev(dp, dev); if (!mdev) return -ENODEV; - if (! mdev->opened || mdev->devinfo != dp) { - snd_use_lock_free(&mdev->use_lock); - return 0; - } + mutex_lock(&mdev->open_mutex); + if (!mdev->opened || mdev->devinfo != dp) + goto unlock; memset(&subs, 0, sizeof(subs)); if (mdev->opened & PERM_WRITE) { @@ -415,6 +422,8 @@ snd_seq_oss_midi_close(struct seq_oss_devinfo *dp, int dev) mdev->opened = 0; mdev->devinfo = NULL; + unlock: + mutex_unlock(&mdev->open_mutex); snd_use_lock_free(&mdev->use_lock); return 0; } -- cgit v1.2.3 From 8ba61c9f6c9bdfbf9d197b0282641d24ae909778 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 12 Jun 2023 15:28:18 +0200 Subject: ALSA: usb-audio: Fix broken resume due to UAC3 power state As reported in the bugzilla below, the PM resume of a UAC3 device may fail due to the incomplete power state change, stuck at D1. The reason is that the driver expects the full D0 power state change only at hw_params, while the normal PCM resume procedure doesn't call hw_params. For fixing the bug, we add the same power state update to D0 at the prepare callback, which is certainly called by the resume procedure. 
Note that, with this change, the power state change in the hw_params becomes almost redundant, since snd_usb_hw_params() doesn't touch the parameters (at least it tries to). But dropping it is still a bit risky (e.g. we have the media-driver binding), so I leave the D0 power state change in snd_usb_hw_params() as is for now. Fixes: a0a4959eb4e9 ("ALSA: usb-audio: Operate UAC3 Power Domains in PCM callbacks") Cc: Link: https://bugzilla.kernel.org/show_bug.cgi?id=217539 Link: https://lore.kernel.org/r/20230612132818.29486-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/usb/pcm.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sound/usb/pcm.c b/sound/usb/pcm.c index eec5232f9fb2..08bf535ed163 100644 --- a/sound/usb/pcm.c +++ b/sound/usb/pcm.c @@ -650,6 +650,10 @@ static int snd_usb_pcm_prepare(struct snd_pcm_substream *substream) goto unlock; } + ret = snd_usb_pcm_change_state(subs, UAC3_PD_STATE_D0); + if (ret < 0) + goto unlock; + again: if (subs->sync_endpoint) { ret = snd_usb_endpoint_prepare(chip, subs->sync_endpoint); -- cgit v1.2.3 From 122e2cb7e1a30438cc0e8bf70d4279db245d7d5b Mon Sep 17 00:00:00 2001 From: Lukasz Tyl Date: Wed, 14 Jun 2023 14:25:24 +0200 Subject: ALSA: usb-audio: Add quirk flag for HEM devices to enable native DSD playback This commit adds a new VENDOR_FLG entry with QUIRK_FLAG_DSD_RAW and the Vendor Id for HEM devices, which support native DSD. Prior to this change the Linux kernel was not enabling native DSD playback for HEM devices, and as a result, DSD audio was being converted to PCM "on the fly". HEM devices, when connected to the system, would only play audio in PCM format, even if the source material was in DSD format. With the addition of the new VENDOR_FLG in the quirks.c file, the devices are now correctly recognized, and raw DSD data is transmitted to the device, allowing for native DSD playback. Signed-off-by: Lukasz Tyl Cc: Link: https://lore.kernel.org/r/20230614122524.30271-1-ltyl@hem-e.com Signed-off-by: Takashi Iwai --- sound/usb/quirks.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index 3ecd1ba7fd4b..6cf55b7f7a04 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -2191,6 +2191,8 @@ static const struct usb_audio_quirk_flags_table quirk_flags_table[] = { QUIRK_FLAG_DSD_RAW), VENDOR_FLG(0x2ab6, /* T+A devices */ QUIRK_FLAG_DSD_RAW), + VENDOR_FLG(0x3336, /* HEM devices */ + QUIRK_FLAG_DSD_RAW), VENDOR_FLG(0x3353, /* Khadas devices */ QUIRK_FLAG_DSD_RAW), VENDOR_FLG(0x3842, /* EVGA */ -- cgit v1.2.3 From f015b900bc3285322029b4a7d132d6aeb0e51857 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 14 Jun 2023 12:02:02 +0200 Subject: xfrm: Linearize the skb after offloading if needed. With offloading enabled, esp_xmit() gets invoked very late, from within validate_xmit_xfrm(), which runs after validate_xmit_skb() has already validated and linearized the skb if the underlying device does not support fragments. esp_output_tail() may add a fragment to the skb while adding the auth tag/IV. Devices without the proper support will then send what skb->data points to with the correct length, so the packet will have garbage at the end. A pcap sniffer will claim that the proper data has been sent since it parses the skb properly. The problem does not occur with INET_ESP_OFFLOAD disabled. Linearize the skb after offloading if the sending hardware requires it. This was tested on v4; the v6 path has been adapted the same way.
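The guard, identical in the v4 and v6 hunks below, fits in a helper; a sketch only, with an assumed function name:

#include <linux/netdevice.h>
#include <linux/skbuff.h>

/* Flatten the skb when the egress device cannot take the fragment
 * that esp_output_tail() may have appended after validation ran.
 */
static int esp_flatten_if_needed(struct sk_buff *skb)
{
	if (skb_needs_linearize(skb, skb->dev->features) &&
	    __skb_linearize(skb))
		return -ENOMEM;

	return 0;
}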
Fixes: 7785bba299a8d ("esp: Add a software GRO codepath") Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Steffen Klassert --- net/ipv4/esp4_offload.c | 3 +++ net/ipv6/esp6_offload.c | 3 +++ 2 files changed, 6 insertions(+) diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index 3969fa805679..ee848be59e65 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -340,6 +340,9 @@ static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_ secpath_reset(skb); + if (skb_needs_linearize(skb, skb->dev->features) && + __skb_linearize(skb)) + return -ENOMEM; return 0; } diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index 75c02992c520..772340268997 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -374,6 +374,9 @@ static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features secpath_reset(skb); + if (skb_needs_linearize(skb, skb->dev->features) && + __skb_linearize(skb)) + return -ENOMEM; return 0; } -- cgit v1.2.3 From 13bb06f8dd42071cb9a49f6e21099eea05d4b856 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Jun 2023 11:18:30 +0200 Subject: tick/common: Align tick period during sched_timer setup The tick period is aligned very early while the first clock_event_device is registered. At that point the system runs in periodic mode and switches later to one-shot mode if possible. The next wake-up event is programmed based on the aligned value (tick_next_period) but the delta value, that is used to program the clock_event_device, is computed based on ktime_get(). With the subtracted offset, the device fires earlier than the exact time frame. With a large enough offset the system programs the timer for the next wake-up and the remaining time left is too small to make any boot progress. The system hangs. Move the alignment later to the setup of tick_sched timer. At this point the system switches to oneshot mode and a high resolution clocksource is available. At this point it is safe to align tick_next_period because ktime_get() will now return accurate (not jiffies based) time. [bigeasy: Patch description + testing]. Fixes: e9523a0d81899 ("tick/common: Align tick period with the HZ tick.") Reported-by: Mathias Krause Reported-by: "Bhatnagar, Rishabh" Suggested-by: Mathias Krause Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Tested-by: Richard W.M. 
Jones Tested-by: Mathias Krause Acked-by: SeongJae Park Cc: stable@vger.kernel.org Link: https://lore.kernel.org/5a56290d-806e-b9a5-f37c-f21958b5a8c0@grsecurity.net Link: https://lore.kernel.org/12c6f9a3-d087-b824-0d05-0d18c9bc1bf3@amazon.com Link: https://lore.kernel.org/r/20230615091830.RxMV2xf_@linutronix.de --- kernel/time/tick-common.c | 13 +------------ kernel/time/tick-sched.c | 13 ++++++++++++- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 65b8658da829..e9138cd7a0f5 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -218,19 +218,8 @@ static void tick_setup_device(struct tick_device *td, * this cpu: */ if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) { - ktime_t next_p; - u32 rem; - tick_do_timer_cpu = cpu; - - next_p = ktime_get(); - div_u64_rem(next_p, TICK_NSEC, &rem); - if (rem) { - next_p -= rem; - next_p += TICK_NSEC; - } - - tick_next_period = next_p; + tick_next_period = ktime_get(); #ifdef CONFIG_NO_HZ_FULL /* * The boot CPU may be nohz_full, in which case set diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 52254679ec48..42c0be3080bd 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -161,8 +161,19 @@ static ktime_t tick_init_jiffy_update(void) raw_spin_lock(&jiffies_lock); write_seqcount_begin(&jiffies_seq); /* Did we start the jiffies update yet ? */ - if (last_jiffies_update == 0) + if (last_jiffies_update == 0) { + u32 rem; + + /* + * Ensure that the tick is aligned to a multiple of + * TICK_NSEC. + */ + div_u64_rem(tick_next_period, TICK_NSEC, &rem); + if (rem) + tick_next_period += TICK_NSEC - rem; + last_jiffies_update = tick_next_period; + } period = last_jiffies_update; write_seqcount_end(&jiffies_seq); raw_spin_unlock(&jiffies_lock); -- cgit v1.2.3 From 4e7401fc8c8d048a813e0221a706be84182a76c1 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Wed, 14 Jun 2023 12:00:05 +0300 Subject: net/mlx5e: XDP, Allow growing tail for XDP multi buffer The cited commits missed passing frag_size to __xdp_rxq_info_reg, which is required by bpf_xdp_adjust_tail to support growing the tail pointer in fragmented packets. Pass the missing parameter when the current RQ mode allows XDP multi buffer. 
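xdp_rxq_info_reg() is a thin inline that forwards to __xdp_rxq_info_reg() with frag_size = 0, which is why multi-buffer capable queues must call the latter directly. A driver-side sketch of the decision (the helper name and flag are illustrative):

#include <net/xdp.h>

static int rq_reg_xdp_info(struct xdp_rxq_info *xdp_rxq,
			   struct net_device *dev, u32 ix, u32 napi_id,
			   bool xdp_multi_buf)
{
	/* A non-zero frag_size lets bpf_xdp_adjust_tail() grow a frag. */
	u32 frag_size = xdp_multi_buf ? PAGE_SIZE : 0;

	return __xdp_rxq_info_reg(xdp_rxq, dev, ix, napi_id, frag_size);
}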
Fixes: ea5d49bdae8b ("net/mlx5e: Add XDP multi buffer support to the non-linear legacy RQ") Fixes: 9cb9482ef10e ("net/mlx5e: Use fragments of the same size in non-linear legacy RQ with XDP") Signed-off-by: Maxim Mikityanskiy Cc: Tariq Toukan Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/params.c | 8 ++++++-- drivers/net/ethernet/mellanox/mlx5/core/en/params.h | 1 + drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 7 ++++--- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c index 9c94807097cb..5ce28ff7685f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c @@ -732,7 +732,8 @@ static void mlx5e_rx_compute_wqe_bulk_params(struct mlx5e_params *params, static int mlx5e_build_rq_frags_info(struct mlx5_core_dev *mdev, struct mlx5e_params *params, struct mlx5e_xsk_param *xsk, - struct mlx5e_rq_frags_info *info) + struct mlx5e_rq_frags_info *info, + u32 *xdp_frag_size) { u32 byte_count = MLX5E_SW2HW_MTU(params, params->sw_mtu); int frag_size_max = DEFAULT_FRAG_SIZE; @@ -845,6 +846,8 @@ out: info->log_num_frags = order_base_2(info->num_frags); + *xdp_frag_size = info->num_frags > 1 && params->xdp_prog ? PAGE_SIZE : 0; + return 0; } @@ -989,7 +992,8 @@ int mlx5e_build_rq_param(struct mlx5_core_dev *mdev, } default: /* MLX5_WQ_TYPE_CYCLIC */ MLX5_SET(wq, wq, log_wq_sz, params->log_rq_mtu_frames); - err = mlx5e_build_rq_frags_info(mdev, params, xsk, ¶m->frags_info); + err = mlx5e_build_rq_frags_info(mdev, params, xsk, ¶m->frags_info, + ¶m->xdp_frag_size); if (err) return err; ndsegs = param->frags_info.num_frags; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/params.h b/drivers/net/ethernet/mellanox/mlx5/core/en/params.h index a5d20f6d6d9c..6800949dafbc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.h @@ -24,6 +24,7 @@ struct mlx5e_rq_param { u32 rqc[MLX5_ST_SZ_DW(rqc)]; struct mlx5_wq_param wq; struct mlx5e_rq_frags_info frags_info; + u32 xdp_frag_size; }; struct mlx5e_sq_param { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index a7c526ee5024..a5bdf78955d7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -641,7 +641,7 @@ static void mlx5e_free_mpwqe_rq_drop_page(struct mlx5e_rq *rq) } static int mlx5e_init_rxq_rq(struct mlx5e_channel *c, struct mlx5e_params *params, - struct mlx5e_rq *rq) + u32 xdp_frag_size, struct mlx5e_rq *rq) { struct mlx5_core_dev *mdev = c->mdev; int err; @@ -665,7 +665,8 @@ static int mlx5e_init_rxq_rq(struct mlx5e_channel *c, struct mlx5e_params *param if (err) return err; - return xdp_rxq_info_reg(&rq->xdp_rxq, rq->netdev, rq->ix, c->napi.napi_id); + return __xdp_rxq_info_reg(&rq->xdp_rxq, rq->netdev, rq->ix, c->napi.napi_id, + xdp_frag_size); } static int mlx5_rq_shampo_alloc(struct mlx5_core_dev *mdev, @@ -2240,7 +2241,7 @@ static int mlx5e_open_rxq_rq(struct mlx5e_channel *c, struct mlx5e_params *param { int err; - err = mlx5e_init_rxq_rq(c, params, &c->rq); + err = mlx5e_init_rxq_rq(c, params, rq_params->xdp_frag_size, &c->rq); if (err) return err; -- cgit v1.2.3 From 62a522d3354d81a86dd56feeb40e5ced36d72737 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Wed, 14 Jun 2023 12:00:06 +0300 Subject: 
net/mlx5e: xsk: Set napi_id to support busy polling on XSK RQ The cited commit missed setting napi_id on XSK RQs; it only affected regular RQs. Add the missing part to support socket busy polling on XSK RQs. Fixes: a2740f529da2 ("net/mlx5e: xsk: Set napi_id to support busy polling") Signed-off-by: Maxim Mikityanskiy Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c index ed279f450976..36826b582484 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c @@ -86,7 +86,7 @@ static int mlx5e_init_xsk_rq(struct mlx5e_channel *c, if (err) return err; - return xdp_rxq_info_reg(&rq->xdp_rxq, rq->netdev, rq_xdp_ix, 0); + return xdp_rxq_info_reg(&rq->xdp_rxq, rq->netdev, rq_xdp_ix, c->napi.napi_id); } static int mlx5e_open_xsk_rq(struct mlx5e_channel *c, struct mlx5e_params *params, -- cgit v1.2.3 From 0ab999d4a1bfe8251538be1e7e495c50433475a8 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Wed, 3 May 2023 15:10:05 +0300 Subject: net/mlx5: Fix driver load with single msix vector When a PCI device has just one msix vector available, we want to share this vector between async and completion events. Current code fails to do that, assuming it will always have at least one dedicated vector for completion events. Fix this by detecting when the pool contains just a single vector. Fixes: 3354822cde5a ("net/mlx5: Use dynamic msix vectors allocation") Signed-off-by: Eli Cohen Reviewed-by: Shay Drory Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c index 843da89a9035..33b9359de53d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c @@ -565,15 +565,21 @@ void mlx5_irqs_release_vectors(struct mlx5_irq **irqs, int nirqs) int mlx5_irqs_request_vectors(struct mlx5_core_dev *dev, u16 *cpus, int nirqs, struct mlx5_irq **irqs, struct cpu_rmap **rmap) { + struct mlx5_irq_table *table = mlx5_irq_table_get(dev); + struct mlx5_irq_pool *pool = table->pcif_pool; struct irq_affinity_desc af_desc; struct mlx5_irq *irq; + int offset = 1; int i; + if (!pool->xa_num_irqs.max) + offset = 0; + af_desc.is_managed = false; for (i = 0; i < nirqs; i++) { cpumask_clear(&af_desc.mask); cpumask_set_cpu(cpus[i], &af_desc.mask); - irq = mlx5_irq_request(dev, i + 1, &af_desc, rmap); + irq = mlx5_irq_request(dev, i + offset, &af_desc, rmap); if (IS_ERR(irq)) break; irqs[i] = irq; -- cgit v1.2.3 From b100573ab76ee5b0cb8f9129b568d9e7da795f76 Mon Sep 17 00:00:00 2001 From: Chris Mi Date: Tue, 11 Apr 2023 11:17:35 +0300 Subject: net/mlx5e: TC, Add null pointer check for hardware miss support The cited commits add hardware miss support to tc action. But if the rules can't be offloaded, the pointers are null and the system will panic when accessing them. Fix it by checking for null pointers.
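A minimal sketch of the guard this fix adds (types simplified and hypothetical; the real check is in the diff below): when the rules were never offloaded, the ct tables are never populated, so teardown must return early instead of dereferencing them.

/* Simplified illustration of the early-return guard. */
struct ct_attr {
	void *ft;	/* NULL when the ct action was not offloaded */
	void *nf_ft;
};

static void ct_delete_flow(struct ct_attr *attr)
{
	if (!attr->ft)	/* no ct action, nothing to tear down */
		return;
	/* ... release the ct tables and related resources ... */
}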
Fixes: 08fe94ec5f77 ("net/mlx5e: TC, Remove special handling of CT action") Fixes: 6702782845a5 ("net/mlx5e: TC, Set CT miss to the specific ct action instance") Signed-off-by: Chris Mi Reviewed-by: Paul Blakey Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c index ead38ef69483..a254e728ac95 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c @@ -2021,6 +2021,8 @@ void mlx5_tc_ct_delete_flow(struct mlx5_tc_ct_priv *priv, struct mlx5_flow_attr *attr) { + if (!attr->ct_attr.ft) /* no ct action, return */ + return; if (!attr->ct_attr.nf_ft) /* means only ct clear action, and not ct_clear,ct() */ return; -- cgit v1.2.3 From fb7be476ab7e797b18c6f0047041635c41b3b5a4 Mon Sep 17 00:00:00 2001 From: Chris Mi Date: Thu, 6 Apr 2023 09:38:09 +0300 Subject: net/mlx5e: TC, Cleanup ct resources for nic flow The cited commit removes special handling of CT action. But it removes too much. Pre ct/ct_nat tables and some other resources are not destroyed due to the cited commit. Fix it by adding it back. Fixes: 08fe94ec5f77 ("net/mlx5e: TC, Remove special handling of CT action") Signed-off-by: Chris Mi Reviewed-by: Paul Blakey Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 8a5a8703f0a3..b9b1da751a3b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -1439,6 +1439,7 @@ static void mlx5e_tc_del_nic_flow(struct mlx5e_priv *priv, mlx5e_hairpin_flow_del(priv, flow); free_flow_post_acts(flow); + mlx5_tc_ct_delete_flow(get_ct_priv(flow->priv), attr); kvfree(attr->parse_attr); kfree(flow->attr); -- cgit v1.2.3 From 87cd0649176c0588daf2cad53058143f808b0905 Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Mon, 12 Jun 2023 01:05:48 +0300 Subject: net/mlx5: DR, Support SW created encap actions for FW table MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In some cases, steering might need to use a SW-created action in a FW table, which results in the wrong packet reformat being used: mlx5_core 0000:81:00.1: mlx5_cmd_check:756:(pid 1154): SET_FLOW_TABLE_ENTRY(0x936) op_mod(0x0) failed, status bad resource(0x5), syndrome (0xf2ff71) This patch adds support for usage of SW-created packet reformat (encap) actions in FW tables, and adds a clear error flow for attempts to use a SW-created modify header on FW tables.
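Distilled, the patch tags every reformat object with its creator and branches on that tag in the FW command path; a hedged, simplified sketch (names condensed from the diff that follows):

#include <errno.h>

enum owner { OWNER_FW, OWNER_SW };

struct pkt_reformat {
	enum owner owner;
	int id;		/* valid only for FW-owned objects */
};

/* FW-owned objects expose ->id directly; SW-owned ones must be
 * translated by the SW steering layer, or rejected for types it
 * can't translate. */
static int reformat_id_for_fw_table(const struct pkt_reformat *pr)
{
	if (pr->owner == OWNER_SW)
		return -EOPNOTSUPP;	/* real code translates supported types */
	return pr->id;
}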
Fixes: 6a48faeeca10 ("net/mlx5: Add direct rule fs_cmd implementation") Signed-off-by: Yevgeny Kliteynik Reviewed-by: Erez Shitrit Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c | 50 ++++++++++++++++------ drivers/net/ethernet/mellanox/mlx5/core/fs_core.h | 7 +++ .../mellanox/mlx5/core/steering/dr_action.c | 5 +++ .../ethernet/mellanox/mlx5/core/steering/fs_dr.c | 27 +++++++++++- .../ethernet/mellanox/mlx5/core/steering/fs_dr.h | 7 +++ .../ethernet/mellanox/mlx5/core/steering/mlx5dr.h | 2 + 6 files changed, 83 insertions(+), 15 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c index 144e59480686..ec83e6483d1a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c @@ -511,10 +511,11 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev, struct mlx5_flow_rule *dst; void *in_flow_context, *vlan; void *in_match_value; + int reformat_id = 0; unsigned int inlen; int dst_cnt_size; + u32 *in, action; void *in_dests; - u32 *in; int err; if (mlx5_set_extended_dest(dev, fte, &extended_dest)) @@ -553,22 +554,42 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev, MLX5_SET(flow_context, in_flow_context, extended_destination, extended_dest); - if (extended_dest) { - u32 action; - action = fte->action.action & - ~MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT; - MLX5_SET(flow_context, in_flow_context, action, action); - } else { - MLX5_SET(flow_context, in_flow_context, action, - fte->action.action); - if (fte->action.pkt_reformat) - MLX5_SET(flow_context, in_flow_context, packet_reformat_id, - fte->action.pkt_reformat->id); + action = fte->action.action; + if (extended_dest) + action &= ~MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT; + + MLX5_SET(flow_context, in_flow_context, action, action); + + if (!extended_dest && fte->action.pkt_reformat) { + struct mlx5_pkt_reformat *pkt_reformat = fte->action.pkt_reformat; + + if (pkt_reformat->owner == MLX5_FLOW_RESOURCE_OWNER_SW) { + reformat_id = mlx5_fs_dr_action_get_pkt_reformat_id(pkt_reformat); + if (reformat_id < 0) { + mlx5_core_err(dev, + "Unsupported SW-owned pkt_reformat type (%d) in FW-owned table\n", + pkt_reformat->reformat_type); + err = reformat_id; + goto err_out; + } + } else { + reformat_id = fte->action.pkt_reformat->id; + } } - if (fte->action.modify_hdr) + + MLX5_SET(flow_context, in_flow_context, packet_reformat_id, (u32)reformat_id); + + if (fte->action.modify_hdr) { + if (fte->action.modify_hdr->owner == MLX5_FLOW_RESOURCE_OWNER_SW) { + mlx5_core_err(dev, "Can't use SW-owned modify_hdr in FW-owned table\n"); + err = -EOPNOTSUPP; + goto err_out; + } + MLX5_SET(flow_context, in_flow_context, modify_header_id, fte->action.modify_hdr->id); + } MLX5_SET(flow_context, in_flow_context, encrypt_decrypt_type, fte->action.crypto.type); @@ -885,6 +906,8 @@ static int mlx5_cmd_packet_reformat_alloc(struct mlx5_flow_root_namespace *ns, pkt_reformat->id = MLX5_GET(alloc_packet_reformat_context_out, out, packet_reformat_id); + pkt_reformat->owner = MLX5_FLOW_RESOURCE_OWNER_FW; + kfree(in); return err; } @@ -969,6 +992,7 @@ static int mlx5_cmd_modify_header_alloc(struct mlx5_flow_root_namespace *ns, err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); modify_hdr->id = MLX5_GET(alloc_modify_header_context_out, out, modify_header_id); + modify_hdr->owner = MLX5_FLOW_RESOURCE_OWNER_FW; kfree(in); return err; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h 
b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h index f137a0611b77..b043190e50a8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h @@ -54,8 +54,14 @@ struct mlx5_flow_definer { u32 id; }; +enum mlx5_flow_resource_owner { + MLX5_FLOW_RESOURCE_OWNER_FW, + MLX5_FLOW_RESOURCE_OWNER_SW, +}; + struct mlx5_modify_hdr { enum mlx5_flow_namespace_type ns_type; + enum mlx5_flow_resource_owner owner; union { struct mlx5_fs_dr_action action; u32 id; @@ -65,6 +71,7 @@ struct mlx5_modify_hdr { struct mlx5_pkt_reformat { enum mlx5_flow_namespace_type ns_type; int reformat_type; /* from mlx5_ifc */ + enum mlx5_flow_resource_owner owner; union { struct mlx5_fs_dr_action action; u32 id; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c index 0eb9a8d7f282..57e22c5170df 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c @@ -2129,6 +2129,11 @@ mlx5dr_action_create_aso(struct mlx5dr_domain *dmn, u32 obj_id, return action; } +u32 mlx5dr_action_get_pkt_reformat_id(struct mlx5dr_action *action) +{ + return action->reformat->id; +} + int mlx5dr_action_destroy(struct mlx5dr_action *action) { if (WARN_ON_ONCE(refcount_read(&action->refcount) > 1)) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c index 984653756779..cc215beb7436 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c @@ -331,8 +331,16 @@ static int mlx5_cmd_dr_create_fte(struct mlx5_flow_root_namespace *ns, } if (fte->action.action & MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT) { - bool is_decap = fte->action.pkt_reformat->reformat_type == - MLX5_REFORMAT_TYPE_L3_TUNNEL_TO_L2; + bool is_decap; + + if (fte->action.pkt_reformat->owner == MLX5_FLOW_RESOURCE_OWNER_FW) { + err = -EINVAL; + mlx5dr_err(domain, "FW-owned reformat can't be used in SW rule\n"); + goto free_actions; + } + + is_decap = fte->action.pkt_reformat->reformat_type == + MLX5_REFORMAT_TYPE_L3_TUNNEL_TO_L2; if (is_decap) actions[num_actions++] = @@ -661,6 +669,7 @@ static int mlx5_cmd_dr_packet_reformat_alloc(struct mlx5_flow_root_namespace *ns return -EINVAL; } + pkt_reformat->owner = MLX5_FLOW_RESOURCE_OWNER_SW; pkt_reformat->action.dr_action = action; return 0; @@ -691,6 +700,7 @@ static int mlx5_cmd_dr_modify_header_alloc(struct mlx5_flow_root_namespace *ns, return -EINVAL; } + modify_hdr->owner = MLX5_FLOW_RESOURCE_OWNER_SW; modify_hdr->action.dr_action = action; return 0; @@ -816,6 +826,19 @@ static u32 mlx5_cmd_dr_get_capabilities(struct mlx5_flow_root_namespace *ns, return steering_caps; } +int mlx5_fs_dr_action_get_pkt_reformat_id(struct mlx5_pkt_reformat *pkt_reformat) +{ + switch (pkt_reformat->reformat_type) { + case MLX5_REFORMAT_TYPE_L2_TO_VXLAN: + case MLX5_REFORMAT_TYPE_L2_TO_NVGRE: + case MLX5_REFORMAT_TYPE_L2_TO_L2_TUNNEL: + case MLX5_REFORMAT_TYPE_L2_TO_L3_TUNNEL: + case MLX5_REFORMAT_TYPE_INSERT_HDR: + return mlx5dr_action_get_pkt_reformat_id(pkt_reformat->action.dr_action); + } + return -EOPNOTSUPP; +} + bool mlx5_fs_dr_is_supported(struct mlx5_core_dev *dev) { return mlx5dr_is_supported(dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.h index d168622063d5..99a3b2eff6b8 100644 --- 
a/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.h @@ -38,6 +38,8 @@ struct mlx5_fs_dr_table { bool mlx5_fs_dr_is_supported(struct mlx5_core_dev *dev); +int mlx5_fs_dr_action_get_pkt_reformat_id(struct mlx5_pkt_reformat *pkt_reformat); + const struct mlx5_flow_cmds *mlx5_fs_cmd_get_dr_cmds(void); #else @@ -47,6 +49,11 @@ static inline const struct mlx5_flow_cmds *mlx5_fs_cmd_get_dr_cmds(void) { return NULL; } +static inline u32 mlx5_fs_dr_action_get_pkt_reformat_id(struct mlx5_pkt_reformat *pkt_reformat) +{ + return 0; +} + static inline bool mlx5_fs_dr_is_supported(struct mlx5_core_dev *dev) { return false; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h index 9afd268a2573..d1c04f43d86d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h @@ -150,6 +150,8 @@ mlx5dr_action_create_dest_match_range(struct mlx5dr_domain *dmn, int mlx5dr_action_destroy(struct mlx5dr_action *action); +u32 mlx5dr_action_get_pkt_reformat_id(struct mlx5dr_action *action); + int mlx5dr_definer_get(struct mlx5dr_domain *dmn, u16 format_id, u8 *dw_selectors, u8 *byte_selectors, u8 *match_mask, u32 *definer_id); -- cgit v1.2.3 From ef4c5afc783dc3d47640270a9b94713229c697e8 Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Sun, 4 Jun 2023 21:07:04 +0300 Subject: net/mlx5: DR, Fix wrong action data allocation in decap action When a TUNNEL_L3_TO_L2 decap action was created, a pointer to a local variable was passed as its HW action data, resulting in an attempt to free an invalid address: BUG: KASAN: invalid-free in mlx5dr_action_destroy+0x318/0x410 [mlx5_core] Fixes: 4781df92f4da ("net/mlx5: DR, Move STEv0 modify header logic") Signed-off-by: Yevgeny Kliteynik Reviewed-by: Alex Vesker Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c index 57e22c5170df..0f783e7906cb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c @@ -1421,9 +1421,13 @@ dr_action_create_reformat_action(struct mlx5dr_domain *dmn, } case DR_ACTION_TYP_TNL_L3_TO_L2: { - u8 hw_actions[DR_ACTION_CACHE_LINE_SIZE] = {}; + u8 *hw_actions; int ret; + hw_actions = kzalloc(DR_ACTION_CACHE_LINE_SIZE, GFP_KERNEL); + if (!hw_actions) + return -ENOMEM; + ret = mlx5dr_ste_set_action_decap_l3_list(dmn->ste_ctx, data, data_sz, hw_actions, @@ -1431,6 +1435,7 @@ dr_action_create_reformat_action(struct mlx5dr_domain *dmn, &action->rewrite->num_of_actions); if (ret) { mlx5dr_dbg(dmn, "Failed creating decap l3 action list\n"); + kfree(hw_actions); return ret; } @@ -1440,6 +1445,7 @@ dr_action_create_reformat_action(struct mlx5dr_domain *dmn, ret = mlx5dr_ste_alloc_modify_hdr(action); if (ret) { mlx5dr_dbg(dmn, "Failed preparing reformat data\n"); + kfree(hw_actions); return ret; } return 0; -- cgit v1.2.3 From 314ded538e5f22e7610b1bf621402024a180ec80 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Thu, 8 Jun 2023 12:00:54 -0700 Subject: net/mlx5: Free IRQ rmap and notifier on kernel shutdown The kernel IRQ system needs the irq affinity notifier to be cleared before attempting to free the irq; see the WARN_ON log below.
On a normal driver unload we don't have this issue since we do the complete cleanup of the irq resources. To fix this, put the important resources cleanup in a helper function and use it in both normal driver unload and shutdown flows. [ 4497.498434] ------------[ cut here ]------------ [ 4497.498726] WARNING: CPU: 0 PID: 9 at kernel/irq/manage.c:2034 free_irq+0x295/0x340 [ 4497.499193] Modules linked in: [ 4497.499386] CPU: 0 PID: 9 Comm: kworker/0:1 Tainted: G W 6.4.0-rc4+ #10 [ 4497.499876] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-1.fc38 04/01/2014 [ 4497.500518] Workqueue: events do_poweroff [ 4497.500849] RIP: 0010:free_irq+0x295/0x340 [ 4497.501132] Code: 85 c0 0f 84 1d ff ff ff 48 89 ef ff d0 0f 1f 00 e9 10 ff ff ff 0f 0b e9 72 ff ff ff 49 8d 7f 28 ff d0 0f 1f 00 e9 df fd ff ff <0f> 0b 48 c7 80 c0 008 [ 4497.502269] RSP: 0018:ffffc90000053da0 EFLAGS: 00010282 [ 4497.502589] RAX: ffff888100949600 RBX: ffff88810330b948 RCX: 0000000000000000 [ 4497.503035] RDX: ffff888100949600 RSI: ffff888100400490 RDI: 0000000000000023 [ 4497.503472] RBP: ffff88810330c7e0 R08: ffff8881004005d0 R09: ffffffff8273a260 [ 4497.503923] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8881009ae000 [ 4497.504359] R13: ffff8881009ae148 R14: 0000000000000000 R15: ffff888100949600 [ 4497.504804] FS: 0000000000000000(0000) GS:ffff88813bc00000(0000) knlGS:0000000000000000 [ 4497.505302] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 4497.505671] CR2: 00007fce98806298 CR3: 000000000262e005 CR4: 0000000000370ef0 [ 4497.506104] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 4497.506540] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 4497.507002] Call Trace: [ 4497.507158] [ 4497.507299] ? free_irq+0x295/0x340 [ 4497.507522] ? __warn+0x7c/0x130 [ 4497.507740] ? free_irq+0x295/0x340 [ 4497.507963] ? report_bug+0x171/0x1a0 [ 4497.508197] ? handle_bug+0x3c/0x70 [ 4497.508417] ? exc_invalid_op+0x17/0x70 [ 4497.508662] ? asm_exc_invalid_op+0x1a/0x20 [ 4497.508926] ? free_irq+0x295/0x340 [ 4497.509146] mlx5_irq_pool_free_irqs+0x48/0x90 [ 4497.509421] mlx5_irq_table_free_irqs+0x38/0x50 [ 4497.509714] mlx5_core_eq_free_irqs+0x27/0x40 [ 4497.509984] shutdown+0x7b/0x100 [ 4497.510184] pci_device_shutdown+0x30/0x60 [ 4497.510440] device_shutdown+0x14d/0x240 [ 4497.510698] kernel_power_off+0x30/0x70 [ 4497.510938] process_one_work+0x1e6/0x3e0 [ 4497.511183] worker_thread+0x49/0x3b0 [ 4497.511407] ? __pfx_worker_thread+0x10/0x10 [ 4497.511679] kthread+0xe0/0x110 [ 4497.511879] ? __pfx_kthread+0x10/0x10 [ 4497.512114] ret_from_fork+0x29/0x50 [ 4497.512342] Fixes: 9c2d08010963 ("net/mlx5: Free irqs only on shutdown callback") Signed-off-by: Saeed Mahameed Reviewed-by: Shay Drory --- drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c | 25 +++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c index 33b9359de53d..98412bd5a696 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c @@ -126,14 +126,22 @@ out: return ret; } -static void irq_release(struct mlx5_irq *irq) +/* mlx5_system_free_irq - Free an IRQ + * @irq: IRQ to free + * + * Free the IRQ and other resources such as rmap from the system. + * BUT doesn't free or remove reference from mlx5. 
+ * This function is very important for the shutdown flow, where we need to + * clean up system resources but keep mlx5 objects alive, + * see mlx5_irq_table_free_irqs(). + */ +static void mlx5_system_free_irq(struct mlx5_irq *irq) { struct mlx5_irq_pool *pool = irq->pool; #ifdef CONFIG_RFS_ACCEL struct cpu_rmap *rmap; #endif - xa_erase(&pool->irqs, irq->pool_index); /* free_irq requires that affinity_hint and rmap will be cleared before * calling it. To satisfy this requirement, we call * irq_cpu_rmap_remove() to remove the notifier @@ -145,10 +153,18 @@ static void irq_release(struct mlx5_irq *irq) irq_cpu_rmap_remove(rmap, irq->map.virq); #endif - free_cpumask_var(irq->mask); free_irq(irq->map.virq, &irq->nh); if (irq->map.index && pci_msix_can_alloc_dyn(pool->dev->pdev)) pci_msix_free_irq(pool->dev->pdev, irq->map); +} + +static void irq_release(struct mlx5_irq *irq) +{ + struct mlx5_irq_pool *pool = irq->pool; + + xa_erase(&pool->irqs, irq->pool_index); + mlx5_system_free_irq(irq); + free_cpumask_var(irq->mask); kfree(irq); } @@ -705,7 +721,8 @@ static void mlx5_irq_pool_free_irqs(struct mlx5_irq_pool *pool) unsigned long index; xa_for_each(&pool->irqs, index, irq) - free_irq(irq->map.virq, &irq->nh); + mlx5_system_free_irq(irq); + } static void mlx5_irq_pools_free_irqs(struct mlx5_irq_table *table) -- cgit v1.2.3 From cf5bb02320d4c4cf4e827efc0314b6ca4082799c Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 5 Jun 2023 11:09:49 +0300 Subject: net/mlx5e: Don't delay release of hardware objects XFRM core provides two callbacks to release resources, one is .xdo_dev_policy_delete() and another is .xdo_dev_policy_free(). This separation allows delayed release so "ip xfrm policy free" commands won't starve. Unfortunately, the mlx5 command interface can't run in the .xdo_dev_policy_free() callback, as the latter runs in ATOMIC context. BUG: scheduling while atomic: swapper/7/0/0x00000100 Modules linked in: act_mirred act_tunnel_key cls_flower sch_ingress vxlan mlx5_vdpa vringh vhost_iotlb vdpa rpcrdma rdma_ucm ib_iser libiscsi ib_umad scsi_transport_iscsi rdma_cm ib_ipoib iw_cm ib_cm mlx5_ib ib_uverbs ib_core xt_conntrack xt_MASQUERADE nf_conntrack_netlink nfnetlink xt_addrtype iptable_nat nf_nat br_netfilter rpcsec_gss_krb5 auth_rpcgss oid_registry overlay mlx5_core zram zsmalloc fuse CPU: 7 PID: 0 Comm: swapper/7 Not tainted 6.3.0+ #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack_lvl+0x33/0x50 __schedule_bug+0x4e/0x60 __schedule+0x5d5/0x780 ? __mod_timer+0x286/0x3d0 schedule+0x50/0x90 schedule_timeout+0x7c/0xf0 ? __bpf_trace_tick_stop+0x10/0x10 __wait_for_common+0x88/0x190 ? usleep_range_state+0x90/0x90 cmd_exec+0x42e/0xb40 [mlx5_core] mlx5_cmd_do+0x1e/0x40 [mlx5_core] mlx5_cmd_exec+0x18/0x30 [mlx5_core] mlx5_cmd_delete_fte+0xa8/0xd0 [mlx5_core] del_hw_fte+0x60/0x120 [mlx5_core] mlx5_del_flow_rules+0xec/0x270 [mlx5_core] ?
default_send_IPI_single_phys+0x26/0x30 mlx5e_accel_ipsec_fs_del_pol+0x1a/0x60 [mlx5_core] mlx5e_xfrm_free_policy+0x15/0x20 [mlx5_core] xfrm_policy_destroy+0x5a/0xb0 xfrm4_dst_destroy+0x7b/0x100 dst_destroy+0x37/0x120 rcu_core+0x2d6/0x540 __do_softirq+0xcd/0x273 irq_exit_rcu+0x82/0xb0 sysvec_apic_timer_interrupt+0x72/0x90 asm_sysvec_apic_timer_interrupt+0x16/0x20 RIP: 0010:default_idle+0x13/0x20 Code: c0 08 00 00 00 4d 29 c8 4c 01 c7 4c 29 c2 e9 72 ff ff ff cc cc cc cc 8b 05 7a 4d ee 00 85 c0 7e 07 0f 00 2d 2f 98 2e 00 fb f4 c3 66 66 2e 0f 1f 84 00 00 00 00 00 65 48 8b 04 25 40 b4 02 00 RSP: 0018:ffff888100843ee0 EFLAGS: 00000242 RAX: 0000000000000001 RBX: ffff888100812b00 RCX: 4000000000000000 RDX: 0000000000000001 RSI: 0000000000000083 RDI: 000000000002d2ec RBP: 0000000000000007 R08: 00000021daeded59 R09: 0000000000000001 R10: 0000000000000000 R11: 000000000000000f R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 default_idle_call+0x30/0xb0 do_idle+0x1c1/0x1d0 cpu_startup_entry+0x19/0x20 start_secondary+0xfe/0x120 secondary_startup_64_no_verify+0xf3/0xfb bad: scheduling from the idle thread! Fixes: a5b8ca9471d3 ("net/mlx5e: Add XFRM policy offload logic") Signed-off-by: Leon Romanovsky Reviewed-by: Simon Horman Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c index 55b38544422f..d1c801723d35 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c @@ -1040,11 +1040,17 @@ err_fs: return err; } -static void mlx5e_xfrm_free_policy(struct xfrm_policy *x) +static void mlx5e_xfrm_del_policy(struct xfrm_policy *x) { struct mlx5e_ipsec_pol_entry *pol_entry = to_ipsec_pol_entry(x); mlx5e_accel_ipsec_fs_del_pol(pol_entry); +} + +static void mlx5e_xfrm_free_policy(struct xfrm_policy *x) +{ + struct mlx5e_ipsec_pol_entry *pol_entry = to_ipsec_pol_entry(x); + kfree(pol_entry); } @@ -1065,6 +1071,7 @@ static const struct xfrmdev_ops mlx5e_ipsec_packet_xfrmdev_ops = { .xdo_dev_state_update_curlft = mlx5e_xfrm_update_curlft, .xdo_dev_policy_add = mlx5e_xfrm_add_policy, + .xdo_dev_policy_delete = mlx5e_xfrm_del_policy, .xdo_dev_policy_free = mlx5e_xfrm_free_policy, }; -- cgit v1.2.3 From fef06678931ff67b158d337b581e5cf5ca40a3a3 Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Mon, 5 Jun 2023 11:09:50 +0300 Subject: net/mlx5e: Fix ESN update kernel panic Previously during mlx5e_ipsec_handle_event the driver tried to execute an operation that could sleep, while holding a spinlock, which caused the kernel panic mentioned below. Move the function call that can sleep outside of the spinlock context. Call Trace: dump_stack_lvl+0x49/0x6c __schedule_bug.cold+0x42/0x4e schedule_debug.constprop.0+0xe0/0x118 __schedule+0x59/0x58a ? __mod_timer+0x2a1/0x3ef schedule+0x5e/0xd4 schedule_timeout+0x99/0x164 ? __pfx_process_timeout+0x10/0x10 __wait_for_common+0x90/0x1da ? __pfx_schedule_timeout+0x10/0x10 wait_func+0x34/0x142 [mlx5_core] mlx5_cmd_invoke+0x1f3/0x313 [mlx5_core] cmd_exec+0x1fe/0x325 [mlx5_core] mlx5_cmd_do+0x22/0x50 [mlx5_core] mlx5_cmd_exec+0x1c/0x40 [mlx5_core] mlx5_modify_ipsec_obj+0xb2/0x17f [mlx5_core] mlx5e_ipsec_update_esn_state+0x69/0xf0 [mlx5_core] ? wake_affine+0x62/0x1f8 mlx5e_ipsec_handle_event+0xb1/0xc0 [mlx5_core] process_one_work+0x1e2/0x3e6 ? 
__pfx_worker_thread+0x10/0x10 worker_thread+0x54/0x3ad ? __pfx_worker_thread+0x10/0x10 kthread+0xda/0x101 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x29/0x37 BUG: workqueue leaked lock or atomic: kworker/u256:4/0x7fffffff/189754#012 last function: mlx5e_ipsec_handle_event [mlx5_core] CPU: 66 PID: 189754 Comm: kworker/u256:4 Kdump: loaded Tainted: G W 6.2.0-2596.20230309201517_5.el8uek.rc1.x86_64 #2 Hardware name: Oracle Corporation ORACLE SERVER X9-2/ASMMBX9-2, BIOS 61070300 08/17/2022 Workqueue: mlx5e_ipsec: eth%d mlx5e_ipsec_handle_event [mlx5_core] Call Trace: dump_stack_lvl+0x49/0x6c process_one_work.cold+0x2b/0x3c ? __pfx_worker_thread+0x10/0x10 worker_thread+0x54/0x3ad ? __pfx_worker_thread+0x10/0x10 kthread+0xda/0x101 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x29/0x37 BUG: scheduling while atomic: kworker/u256:4/189754/0x00000000 Fixes: cee137a63431 ("net/mlx5e: Handle ESN update events") Signed-off-by: Patrisious Haddad Signed-off-by: Leon Romanovsky Reviewed-by: Simon Horman Signed-off-by: Saeed Mahameed --- .../ethernet/mellanox/mlx5/core/en_accel/ipsec_offload.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_offload.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_offload.c index df90e19066bc..ca16cb9807ea 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_offload.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_offload.c @@ -305,7 +305,17 @@ static void mlx5e_ipsec_update_esn_state(struct mlx5e_ipsec_sa_entry *sa_entry, } mlx5e_ipsec_build_accel_xfrm_attrs(sa_entry, &attrs); + + /* It is safe to execute the modify below unlocked since the only flows + * that could affect this HW object are create, destroy and this work. + * + * Creation flow can't co-exist with this modify work, the destruction + * flow would cancel this work, and this work is a single entity that + * can't conflict with itself. + */ + spin_unlock_bh(&sa_entry->x->lock); mlx5_accel_esp_modify_xfrm(sa_entry, &attrs); + spin_lock_bh(&sa_entry->x->lock); data.data_offset_condition_operand = MLX5_IPSEC_ASO_REMOVE_FLOW_PKT_CNT_OFFSET; @@ -431,7 +441,7 @@ static void mlx5e_ipsec_handle_event(struct work_struct *_work) aso = sa_entry->ipsec->aso; attrs = &sa_entry->attrs; - spin_lock(&sa_entry->x->lock); + spin_lock_bh(&sa_entry->x->lock); ret = mlx5e_ipsec_aso_query(sa_entry, NULL); if (ret) goto unlock; @@ -447,7 +457,7 @@ static void mlx5e_ipsec_handle_event(struct work_struct *_work) mlx5e_ipsec_handle_limits(sa_entry); unlock: - spin_unlock(&sa_entry->x->lock); + spin_unlock_bh(&sa_entry->x->lock); kfree(work); } -- cgit v1.2.3 From c75b94255aaa45d9e531df2763baa67020bb6fa9 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 5 Jun 2023 11:09:51 +0300 Subject: net/mlx5e: Drop XFRM state lock when modifying flow steering An XFRM state which is changed to XFRM_STATE_EXPIRED doesn't really need to hold the lock while modifying flow steering rules to drop traffic. Such a state can only be deleted, and as such the mlx5e_ipsec_handle_tx_limit() work will be canceled anyway and won't run in parallel.
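Both this fix and the previous one also switch the state lock to the _bh variants; a hedged sketch of the underlying rule (hypothetical type, not the xfrm code): a lock that is also taken from softirq context, as x->lock is on the packet path, must be taken with spin_lock_bh() in process context, otherwise a softirq preempting the holder on the same CPU can deadlock on it.

#include <linux/spinlock.h>

struct my_state {
	spinlock_t lock;	/* assumed to also be taken from softirq */
	bool drop;
};

static void mark_expired(struct my_state *st)
{
	spin_lock_bh(&st->lock);	/* blocks softirqs while held */
	st->drop = true;
	spin_unlock_bh(&st->lock);
}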
Fixes: b2f7b01d36a9 ("net/mlx5e: Simulate missing IPsec TX limits hardware functionality") Signed-off-by: Leon Romanovsky Reviewed-by: Simon Horman Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c index d1c801723d35..891d39b4bfd4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c @@ -61,16 +61,19 @@ static void mlx5e_ipsec_handle_tx_limit(struct work_struct *_work) struct mlx5e_ipsec_sa_entry *sa_entry = dwork->sa_entry; struct xfrm_state *x = sa_entry->x; - spin_lock(&x->lock); + if (sa_entry->attrs.drop) + return; + + spin_lock_bh(&x->lock); xfrm_state_check_expire(x); if (x->km.state == XFRM_STATE_EXPIRED) { sa_entry->attrs.drop = true; - mlx5e_accel_ipsec_fs_modify(sa_entry); - } - spin_unlock(&x->lock); + spin_unlock_bh(&x->lock); - if (sa_entry->attrs.drop) + mlx5e_accel_ipsec_fs_modify(sa_entry); return; + } + spin_unlock_bh(&x->lock); queue_delayed_work(sa_entry->ipsec->wq, &dwork->dwork, MLX5_IPSEC_RESCHED); -- cgit v1.2.3 From a128f9d4c1227dfcf7f2328070760cb7ed1ec08d Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 5 Jun 2023 11:09:52 +0300 Subject: net/mlx5e: Fix scheduling of IPsec ASO query while in atomic The ASO query can be scheduled in atomic context; as such, it can't use usleep. Use udelay as recommended in Documentation/timers/timers-howto.rst. Fixes: 76e463f6508b ("net/mlx5e: Overcome slow response for first IPsec ASO WQE") Signed-off-by: Leon Romanovsky Reviewed-by: Simon Horman Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_offload.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_offload.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_offload.c index ca16cb9807ea..a3554bde3e07 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_offload.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_offload.c @@ -606,7 +606,8 @@ int mlx5e_ipsec_aso_query(struct mlx5e_ipsec_sa_entry *sa_entry, do { ret = mlx5_aso_poll_cq(aso->aso, false); if (ret) - usleep_range(2, 10); + /* We are in atomic context */ + udelay(10); } while (ret && time_is_after_jiffies(expires)); spin_unlock_bh(&aso->lock); return ret; -- cgit v1.2.3 From cd9125030689dda69f73f6c2843d63135cb383f0 Mon Sep 17 00:00:00 2001 From: Azeem Shaikh Date: Tue, 13 Jun 2023 00:33:25 +0000 Subject: ieee802154: Replace strlcpy with strscpy strlcpy() reads the entire source buffer first. This read may exceed the destination size limit. This is both inefficient and can lead to linear read overflows if a source string is not NUL-terminated [1]. In an effort to remove strlcpy() completely [2], replace strlcpy() here with strscpy(). Direct replacement is safe here since the return values from the helper macros are ignored by the callers.
[1] https://www.kernel.org/doc/html/latest/process/deprecated.html#strlcpy [2] https://github.com/KSPP/linux/issues/89 Signed-off-by: Azeem Shaikh Reviewed-by: Kees Cook Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230613003326.3538391-1-azeemshaikh38@gmail.com Signed-off-by: Stefan Schmidt --- net/ieee802154/trace.h | 2 +- net/mac802154/trace.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ieee802154/trace.h b/net/ieee802154/trace.h index e5d8439b9e45..c16db0b326fa 100644 --- a/net/ieee802154/trace.h +++ b/net/ieee802154/trace.h @@ -13,7 +13,7 @@ #define MAXNAME 32 #define WPAN_PHY_ENTRY __array(char, wpan_phy_name, MAXNAME) -#define WPAN_PHY_ASSIGN strlcpy(__entry->wpan_phy_name, \ +#define WPAN_PHY_ASSIGN strscpy(__entry->wpan_phy_name, \ wpan_phy_name(wpan_phy), \ MAXNAME) #define WPAN_PHY_PR_FMT "%s" diff --git a/net/mac802154/trace.h b/net/mac802154/trace.h index 689396d6c76a..1574ecc48075 100644 --- a/net/mac802154/trace.h +++ b/net/mac802154/trace.h @@ -14,7 +14,7 @@ #define MAXNAME 32 #define LOCAL_ENTRY __array(char, wpan_phy_name, MAXNAME) -#define LOCAL_ASSIGN strlcpy(__entry->wpan_phy_name, \ +#define LOCAL_ASSIGN strscpy(__entry->wpan_phy_name, \ wpan_phy_name(local->hw.phy), MAXNAME) #define LOCAL_PR_FMT "%s" #define LOCAL_PR_ARG __entry->wpan_phy_name -- cgit v1.2.3 From ba00b190670809c1a89326d80de96d714f6004f2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 16 Jun 2023 22:39:39 +0100 Subject: afs: Fix vlserver probe RTT handling In the same spirit as commit ca57f02295f1 ("afs: Fix fileserver probe RTT handling"), don't rule out using a vlserver just because there haven't been enough packets yet to calculate a real rtt. Always set the server's probe rtt from the estimate provided by rxrpc_kernel_get_srtt, which is capped at 1 second. This could lead to EDESTADDRREQ errors when accessing a cell for the first time, even though the vl servers are known and have responded to a probe. Fixes: 1d4adfaf6574 ("rxrpc: Make rxrpc_kernel_get_srtt() indicate validity") Signed-off-by: Marc Dionne Signed-off-by: David Howells cc: linux-afs@lists.infradead.org Link: http://lists.infradead.org/pipermail/linux-afs/2023-June/006746.html Signed-off-by: Linus Torvalds --- fs/afs/vl_probe.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/afs/vl_probe.c b/fs/afs/vl_probe.c index d1c7068b4346..58452b86e672 100644 --- a/fs/afs/vl_probe.c +++ b/fs/afs/vl_probe.c @@ -115,8 +115,8 @@ responded: } } - if (rxrpc_kernel_get_srtt(call->net->socket, call->rxcall, &rtt_us) && - rtt_us < server->probe.rtt) { + rxrpc_kernel_get_srtt(call->net->socket, call->rxcall, &rtt_us); + if (rtt_us < server->probe.rtt) { server->probe.rtt = rtt_us; server->rtt = rtt_us; alist->preferred = index; -- cgit v1.2.3 From 2b9b8f3b68edb3d67d79962f02e26dbb5ae3808d Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Mon, 5 Jun 2023 01:57:34 +0900 Subject: ksmbd: validate command payload size ->StructureSize2 indicates command payload size. ksmbd should validate this size with rfc1002 length before accessing it. This patch removes the unneeded check and adds the validation for this. [ 8.912583] BUG: KASAN: slab-out-of-bounds in ksmbd_smb2_check_message+0x12a/0xc50 [ 8.913051] Read of size 2 at addr ffff88800ac7d92c by task kworker/0:0/7 ...
[ 8.914967] Call Trace: [ 8.915126] [ 8.915267] dump_stack_lvl+0x33/0x50 [ 8.915506] print_report+0xcc/0x620 [ 8.916558] kasan_report+0xae/0xe0 [ 8.917080] kasan_check_range+0x35/0x1b0 [ 8.917334] ksmbd_smb2_check_message+0x12a/0xc50 [ 8.917935] ksmbd_verify_smb_message+0xae/0xd0 [ 8.918223] handle_ksmbd_work+0x192/0x820 [ 8.918478] process_one_work+0x419/0x760 [ 8.918727] worker_thread+0x2a2/0x6f0 [ 8.919222] kthread+0x187/0x1d0 [ 8.919723] ret_from_fork+0x1f/0x30 [ 8.919954] Cc: stable@vger.kernel.org Reported-by: Chih-Yen Chang Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2misc.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/fs/smb/server/smb2misc.c b/fs/smb/server/smb2misc.c index 0ffe663b7590..57749f41b991 100644 --- a/fs/smb/server/smb2misc.c +++ b/fs/smb/server/smb2misc.c @@ -351,6 +351,7 @@ int ksmbd_smb2_check_message(struct ksmbd_work *work) int command; __u32 clc_len; /* calculated length */ __u32 len = get_rfc1002_len(work->request_buf); + __u32 req_struct_size; if (le32_to_cpu(hdr->NextCommand) > 0) len = le32_to_cpu(hdr->NextCommand); @@ -373,17 +374,9 @@ int ksmbd_smb2_check_message(struct ksmbd_work *work) } if (smb2_req_struct_sizes[command] != pdu->StructureSize2) { - if (command != SMB2_OPLOCK_BREAK_HE && - (hdr->Status == 0 || pdu->StructureSize2 != SMB2_ERROR_STRUCTURE_SIZE2_LE)) { - /* error packets have 9 byte structure size */ - ksmbd_debug(SMB, - "Illegal request size %u for command %d\n", - le16_to_cpu(pdu->StructureSize2), command); - return 1; - } else if (command == SMB2_OPLOCK_BREAK_HE && - hdr->Status == 0 && - le16_to_cpu(pdu->StructureSize2) != OP_BREAK_STRUCT_SIZE_20 && - le16_to_cpu(pdu->StructureSize2) != OP_BREAK_STRUCT_SIZE_21) { + if (command == SMB2_OPLOCK_BREAK_HE && + le16_to_cpu(pdu->StructureSize2) != OP_BREAK_STRUCT_SIZE_20 && + le16_to_cpu(pdu->StructureSize2) != OP_BREAK_STRUCT_SIZE_21) { /* special case for SMB2.1 lease break message */ ksmbd_debug(SMB, "Illegal request size %d for oplock break\n", @@ -392,6 +385,14 @@ int ksmbd_smb2_check_message(struct ksmbd_work *work) } } + req_struct_size = le16_to_cpu(pdu->StructureSize2) + + __SMB2_HEADER_STRUCTURE_SIZE; + if (command == SMB2_LOCK_HE) + req_struct_size -= sizeof(struct smb2_lock_element); + + if (req_struct_size > len + 1) + return 1; + if (smb2_calc_size(hdr, &clc_len)) return 1; -- cgit v1.2.3 From 40b268d384a22276dca1450549f53eed60e21deb Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Thu, 15 Jun 2023 15:56:32 +0900 Subject: ksmbd: add mnt_want_write to ksmbd vfs functions ksmbd is doing write access using vfs helpers. There are cases where mnt_want_write() is not called in the vfs helper. This patch adds the missing mnt_want_write() calls to the ksmbd vfs functions.
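A minimal sketch of the pattern the patch applies throughout (wrapper name hypothetical): take write access on the mount before any modifying vfs_* call and drop it afterwards, on error paths included.

#include <linux/mount.h>
#include <linux/xattr.h>

static int set_xattr_checked(struct mnt_idmap *idmap, const struct path *path,
			     const char *name, const void *value, size_t size)
{
	int err = mnt_want_write(path->mnt);	/* pairs with mnt_drop_write() */

	if (err)
		return err;
	err = vfs_setxattr(idmap, path->dentry, name, value, size, 0);
	mnt_drop_write(path->mnt);
	return err;
}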
Cc: stable@vger.kernel.org Cc: Amir Goldstein Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2pdu.c | 26 +++++------ fs/smb/server/smbacl.c | 10 ++-- fs/smb/server/vfs.c | 117 ++++++++++++++++++++++++++++++++++++---------- fs/smb/server/vfs.h | 17 ++++--- fs/smb/server/vfs_cache.c | 2 +- 5 files changed, 117 insertions(+), 55 deletions(-) diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 25c0ba04c59d..ccecdb71d2bc 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -2249,7 +2249,7 @@ static int smb2_set_ea(struct smb2_ea_info *eabuf, unsigned int buf_len, /* delete the EA only when it exits */ if (rc > 0) { rc = ksmbd_vfs_remove_xattr(idmap, - path->dentry, + path, attr_name); if (rc < 0) { @@ -2263,8 +2263,7 @@ static int smb2_set_ea(struct smb2_ea_info *eabuf, unsigned int buf_len, /* if the EA doesn't exist, just do nothing. */ rc = 0; } else { - rc = ksmbd_vfs_setxattr(idmap, - path->dentry, attr_name, value, + rc = ksmbd_vfs_setxattr(idmap, path, attr_name, value, le16_to_cpu(eabuf->EaValueLength), 0); if (rc < 0) { ksmbd_debug(SMB, @@ -2321,8 +2320,7 @@ static noinline int smb2_set_stream_name_xattr(const struct path *path, return -EBADF; } - rc = ksmbd_vfs_setxattr(idmap, path->dentry, - xattr_stream_name, NULL, 0, 0); + rc = ksmbd_vfs_setxattr(idmap, path, xattr_stream_name, NULL, 0, 0); if (rc < 0) pr_err("Failed to store XATTR stream name :%d\n", rc); return 0; @@ -2350,7 +2348,7 @@ static int smb2_remove_smb_xattrs(const struct path *path) if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) && !strncmp(&name[XATTR_USER_PREFIX_LEN], STREAM_PREFIX, STREAM_PREFIX_LEN)) { - err = ksmbd_vfs_remove_xattr(idmap, path->dentry, + err = ksmbd_vfs_remove_xattr(idmap, path, name); if (err) ksmbd_debug(SMB, "remove xattr failed : %s\n", @@ -2397,8 +2395,7 @@ static void smb2_new_xattrs(struct ksmbd_tree_connect *tcon, const struct path * da.flags = XATTR_DOSINFO_ATTRIB | XATTR_DOSINFO_CREATE_TIME | XATTR_DOSINFO_ITIME; - rc = ksmbd_vfs_set_dos_attrib_xattr(mnt_idmap(path->mnt), - path->dentry, &da); + rc = ksmbd_vfs_set_dos_attrib_xattr(mnt_idmap(path->mnt), path, &da); if (rc) ksmbd_debug(SMB, "failed to store file attribute into xattr\n"); } @@ -2972,7 +2969,7 @@ int smb2_open(struct ksmbd_work *work) struct inode *inode = d_inode(path.dentry); posix_acl_rc = ksmbd_vfs_inherit_posix_acl(idmap, - path.dentry, + &path, d_inode(path.dentry->d_parent)); if (posix_acl_rc) ksmbd_debug(SMB, "inherit posix acl failed : %d\n", posix_acl_rc); @@ -2988,7 +2985,7 @@ int smb2_open(struct ksmbd_work *work) if (rc) { if (posix_acl_rc) ksmbd_vfs_set_init_posix_acl(idmap, - path.dentry); + &path); if (test_share_config_flag(work->tcon->share_conf, KSMBD_SHARE_FLAG_ACL_XATTR)) { @@ -3028,7 +3025,7 @@ int smb2_open(struct ksmbd_work *work) rc = ksmbd_vfs_set_sd_xattr(conn, idmap, - path.dentry, + &path, pntsd, pntsd_size); kfree(pntsd); @@ -5464,7 +5461,7 @@ static int smb2_rename(struct ksmbd_work *work, goto out; rc = ksmbd_vfs_setxattr(file_mnt_idmap(fp->filp), - fp->filp->f_path.dentry, + &fp->filp->f_path, xattr_stream_name, NULL, 0, 0); if (rc < 0) { @@ -5629,8 +5626,7 @@ static int set_file_basic_info(struct ksmbd_file *fp, da.flags = XATTR_DOSINFO_ATTRIB | XATTR_DOSINFO_CREATE_TIME | XATTR_DOSINFO_ITIME; - rc = ksmbd_vfs_set_dos_attrib_xattr(idmap, - filp->f_path.dentry, &da); + rc = ksmbd_vfs_set_dos_attrib_xattr(idmap, &filp->f_path, &da); if (rc) ksmbd_debug(SMB, "failed to restore file attribute in EA\n"); @@ -7485,7 
+7481,7 @@ static inline int fsctl_set_sparse(struct ksmbd_work *work, u64 id, da.attr = le32_to_cpu(fp->f_ci->m_fattr); ret = ksmbd_vfs_set_dos_attrib_xattr(idmap, - fp->filp->f_path.dentry, &da); + &fp->filp->f_path, &da); if (ret) fp->f_ci->m_fattr = old_fattr; } diff --git a/fs/smb/server/smbacl.c b/fs/smb/server/smbacl.c index 0a5862a61c77..ad919a4239d0 100644 --- a/fs/smb/server/smbacl.c +++ b/fs/smb/server/smbacl.c @@ -1162,8 +1162,7 @@ pass: pntsd_size += sizeof(struct smb_acl) + nt_size; } - ksmbd_vfs_set_sd_xattr(conn, idmap, - path->dentry, pntsd, pntsd_size); + ksmbd_vfs_set_sd_xattr(conn, idmap, path, pntsd, pntsd_size); kfree(pntsd); } @@ -1383,7 +1382,7 @@ int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon, newattrs.ia_valid |= ATTR_MODE; newattrs.ia_mode = (inode->i_mode & ~0777) | (fattr.cf_mode & 0777); - ksmbd_vfs_remove_acl_xattrs(idmap, path->dentry); + ksmbd_vfs_remove_acl_xattrs(idmap, path); /* Update posix acls */ if (IS_ENABLED(CONFIG_FS_POSIX_ACL) && fattr.cf_dacls) { rc = set_posix_acl(idmap, path->dentry, @@ -1414,9 +1413,8 @@ int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon, if (test_share_config_flag(tcon->share_conf, KSMBD_SHARE_FLAG_ACL_XATTR)) { /* Update WinACL in xattr */ - ksmbd_vfs_remove_sd_xattrs(idmap, path->dentry); - ksmbd_vfs_set_sd_xattr(conn, idmap, - path->dentry, pntsd, ntsd_len); + ksmbd_vfs_remove_sd_xattrs(idmap, path); + ksmbd_vfs_set_sd_xattr(conn, idmap, path, pntsd, ntsd_len); } out: diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index f9fb778247e7..81489fdedd8e 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -170,6 +170,10 @@ int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode) return err; } + err = mnt_want_write(path.mnt); + if (err) + goto out_err; + mode |= S_IFREG; err = vfs_create(mnt_idmap(path.mnt), d_inode(path.dentry), dentry, mode, true); @@ -179,6 +183,9 @@ int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode) } else { pr_err("File(%s): creation failed (err:%d)\n", name, err); } + mnt_drop_write(path.mnt); + +out_err: done_path_create(&path, dentry); return err; } @@ -209,30 +216,35 @@ int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode) return err; } + err = mnt_want_write(path.mnt); + if (err) + goto out_err2; + idmap = mnt_idmap(path.mnt); mode |= S_IFDIR; err = vfs_mkdir(idmap, d_inode(path.dentry), dentry, mode); - if (err) { - goto out; - } else if (d_unhashed(dentry)) { + if (!err && d_unhashed(dentry)) { struct dentry *d; d = lookup_one(idmap, dentry->d_name.name, dentry->d_parent, dentry->d_name.len); if (IS_ERR(d)) { err = PTR_ERR(d); - goto out; + goto out_err1; } if (unlikely(d_is_negative(d))) { dput(d); err = -ENOENT; - goto out; + goto out_err1; } ksmbd_vfs_inherit_owner(work, d_inode(path.dentry), d_inode(d)); dput(d); } -out: + +out_err1: + mnt_drop_write(path.mnt); +out_err2: done_path_create(&path, dentry); if (err) pr_err("mkdir(%s): creation failed (err:%d)\n", name, err); @@ -443,7 +455,7 @@ static int ksmbd_vfs_stream_write(struct ksmbd_file *fp, char *buf, loff_t *pos, memcpy(&stream_buf[*pos], buf, count); err = ksmbd_vfs_setxattr(idmap, - fp->filp->f_path.dentry, + &fp->filp->f_path, fp->stream.name, (void *)stream_buf, size, @@ -589,6 +601,10 @@ int ksmbd_vfs_remove_file(struct ksmbd_work *work, const struct path *path) goto out_err; } + err = mnt_want_write(path->mnt); + if (err) + goto out_err; + idmap = mnt_idmap(path->mnt); if 
(S_ISDIR(d_inode(path->dentry)->i_mode)) { err = vfs_rmdir(idmap, d_inode(parent), path->dentry); @@ -599,6 +615,7 @@ int ksmbd_vfs_remove_file(struct ksmbd_work *work, const struct path *path) if (err) ksmbd_debug(VFS, "unlink failed, err %d\n", err); } + mnt_drop_write(path->mnt); out_err: ksmbd_revert_fsids(work); @@ -644,11 +661,16 @@ int ksmbd_vfs_link(struct ksmbd_work *work, const char *oldname, goto out3; } + err = mnt_want_write(newpath.mnt); + if (err) + goto out3; + err = vfs_link(oldpath.dentry, mnt_idmap(newpath.mnt), d_inode(newpath.dentry), dentry, NULL); if (err) ksmbd_debug(VFS, "vfs_link failed err %d\n", err); + mnt_drop_write(newpath.mnt); out3: done_path_create(&newpath, dentry); @@ -694,6 +716,10 @@ retry: goto out2; } + err = mnt_want_write(old_path->mnt); + if (err) + goto out2; + trap = lock_rename_child(old_child, new_path.dentry); old_parent = dget(old_child->d_parent); @@ -757,6 +783,7 @@ out4: out3: dput(old_parent); unlock_rename(old_parent, new_path.dentry); + mnt_drop_write(old_path->mnt); out2: path_put(&new_path); @@ -897,19 +924,24 @@ ssize_t ksmbd_vfs_getxattr(struct mnt_idmap *idmap, * Return: 0 on success, otherwise error */ int ksmbd_vfs_setxattr(struct mnt_idmap *idmap, - struct dentry *dentry, const char *attr_name, + const struct path *path, const char *attr_name, void *attr_value, size_t attr_size, int flags) { int err; + err = mnt_want_write(path->mnt); + if (err) + return err; + err = vfs_setxattr(idmap, - dentry, + path->dentry, attr_name, attr_value, attr_size, flags); if (err) ksmbd_debug(VFS, "setxattr failed, err %d\n", err); + mnt_drop_write(path->mnt); return err; } @@ -1013,9 +1045,18 @@ int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length, } int ksmbd_vfs_remove_xattr(struct mnt_idmap *idmap, - struct dentry *dentry, char *attr_name) + const struct path *path, char *attr_name) { - return vfs_removexattr(idmap, dentry, attr_name); + int err; + + err = mnt_want_write(path->mnt); + if (err) + return err; + + err = vfs_removexattr(idmap, path->dentry, attr_name); + mnt_drop_write(path->mnt); + + return err; } int ksmbd_vfs_unlink(struct file *filp) @@ -1024,6 +1065,10 @@ int ksmbd_vfs_unlink(struct file *filp) struct dentry *dir, *dentry = filp->f_path.dentry; struct mnt_idmap *idmap = file_mnt_idmap(filp); + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; + dir = dget_parent(dentry); err = ksmbd_vfs_lock_parent(dir, dentry); if (err) @@ -1041,6 +1086,7 @@ int ksmbd_vfs_unlink(struct file *filp) ksmbd_debug(VFS, "failed to delete, err %d\n", err); out: dput(dir); + mnt_drop_write(filp->f_path.mnt); return err; } @@ -1244,13 +1290,13 @@ struct dentry *ksmbd_vfs_kern_path_create(struct ksmbd_work *work, } int ksmbd_vfs_remove_acl_xattrs(struct mnt_idmap *idmap, - struct dentry *dentry) + const struct path *path) { char *name, *xattr_list = NULL; ssize_t xattr_list_len; int err = 0; - xattr_list_len = ksmbd_vfs_listxattr(dentry, &xattr_list); + xattr_list_len = ksmbd_vfs_listxattr(path->dentry, &xattr_list); if (xattr_list_len < 0) { goto out; } else if (!xattr_list_len) { @@ -1258,6 +1304,10 @@ int ksmbd_vfs_remove_acl_xattrs(struct mnt_idmap *idmap, goto out; } + err = mnt_want_write(path->mnt); + if (err) + goto out; + for (name = xattr_list; name - xattr_list < xattr_list_len; name += strlen(name) + 1) { ksmbd_debug(SMB, "%s, len %zd\n", name, strlen(name)); @@ -1266,25 +1316,26 @@ int ksmbd_vfs_remove_acl_xattrs(struct mnt_idmap *idmap, sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1) || !strncmp(name, 
XATTR_NAME_POSIX_ACL_DEFAULT, sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - 1)) { - err = vfs_remove_acl(idmap, dentry, name); + err = vfs_remove_acl(idmap, path->dentry, name); if (err) ksmbd_debug(SMB, "remove acl xattr failed : %s\n", name); } } + mnt_drop_write(path->mnt); + out: kvfree(xattr_list); return err; } -int ksmbd_vfs_remove_sd_xattrs(struct mnt_idmap *idmap, - struct dentry *dentry) +int ksmbd_vfs_remove_sd_xattrs(struct mnt_idmap *idmap, const struct path *path) { char *name, *xattr_list = NULL; ssize_t xattr_list_len; int err = 0; - xattr_list_len = ksmbd_vfs_listxattr(dentry, &xattr_list); + xattr_list_len = ksmbd_vfs_listxattr(path->dentry, &xattr_list); if (xattr_list_len < 0) { goto out; } else if (!xattr_list_len) { @@ -1297,7 +1348,7 @@ int ksmbd_vfs_remove_sd_xattrs(struct mnt_idmap *idmap, ksmbd_debug(SMB, "%s, len %zd\n", name, strlen(name)); if (!strncmp(name, XATTR_NAME_SD, XATTR_NAME_SD_LEN)) { - err = ksmbd_vfs_remove_xattr(idmap, dentry, name); + err = ksmbd_vfs_remove_xattr(idmap, path, name); if (err) ksmbd_debug(SMB, "remove xattr failed : %s\n", name); } @@ -1374,13 +1425,14 @@ out: int ksmbd_vfs_set_sd_xattr(struct ksmbd_conn *conn, struct mnt_idmap *idmap, - struct dentry *dentry, + const struct path *path, struct smb_ntsd *pntsd, int len) { int rc; struct ndr sd_ndr = {0}, acl_ndr = {0}; struct xattr_ntacl acl = {0}; struct xattr_smb_acl *smb_acl, *def_smb_acl = NULL; + struct dentry *dentry = path->dentry; struct inode *inode = d_inode(dentry); acl.version = 4; @@ -1432,7 +1484,7 @@ int ksmbd_vfs_set_sd_xattr(struct ksmbd_conn *conn, goto out; } - rc = ksmbd_vfs_setxattr(idmap, dentry, + rc = ksmbd_vfs_setxattr(idmap, path, XATTR_NAME_SD, sd_ndr.data, sd_ndr.offset, 0); if (rc < 0) @@ -1522,7 +1574,7 @@ free_n_data: } int ksmbd_vfs_set_dos_attrib_xattr(struct mnt_idmap *idmap, - struct dentry *dentry, + const struct path *path, struct xattr_dos_attrib *da) { struct ndr n; @@ -1532,7 +1584,7 @@ int ksmbd_vfs_set_dos_attrib_xattr(struct mnt_idmap *idmap, if (err) return err; - err = ksmbd_vfs_setxattr(idmap, dentry, XATTR_NAME_DOS_ATTRIBUTE, + err = ksmbd_vfs_setxattr(idmap, path, XATTR_NAME_DOS_ATTRIBUTE, (void *)n.data, n.offset, 0); if (err) ksmbd_debug(SMB, "failed to store dos attribute in xattr\n"); @@ -1769,10 +1821,11 @@ void ksmbd_vfs_posix_lock_unblock(struct file_lock *flock) } int ksmbd_vfs_set_init_posix_acl(struct mnt_idmap *idmap, - struct dentry *dentry) + struct path *path) { struct posix_acl_state acl_state; struct posix_acl *acls; + struct dentry *dentry = path->dentry; struct inode *inode = d_inode(dentry); int rc; @@ -1802,6 +1855,11 @@ int ksmbd_vfs_set_init_posix_acl(struct mnt_idmap *idmap, return -ENOMEM; } posix_state_to_acl(&acl_state, acls->a_entries); + + rc = mnt_want_write(path->mnt); + if (rc) + goto out_err; + rc = set_posix_acl(idmap, dentry, ACL_TYPE_ACCESS, acls); if (rc < 0) ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_ACCESS) failed, rc : %d\n", @@ -1813,16 +1871,20 @@ int ksmbd_vfs_set_init_posix_acl(struct mnt_idmap *idmap, ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_DEFAULT) failed, rc : %d\n", rc); } + mnt_drop_write(path->mnt); + +out_err: free_acl_state(&acl_state); posix_acl_release(acls); return rc; } int ksmbd_vfs_inherit_posix_acl(struct mnt_idmap *idmap, - struct dentry *dentry, struct inode *parent_inode) + struct path *path, struct inode *parent_inode) { struct posix_acl *acls; struct posix_acl_entry *pace; + struct dentry *dentry = path->dentry; struct inode *inode = d_inode(dentry); int rc, i; @@ -1841,6 +1903,10 
@@ int ksmbd_vfs_inherit_posix_acl(struct mnt_idmap *idmap, } } + rc = mnt_want_write(path->mnt); + if (rc) + goto out_err; + rc = set_posix_acl(idmap, dentry, ACL_TYPE_ACCESS, acls); if (rc < 0) ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_ACCESS) failed, rc : %d\n", @@ -1852,6 +1918,9 @@ int ksmbd_vfs_inherit_posix_acl(struct mnt_idmap *idmap, ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_DEFAULT) failed, rc : %d\n", rc); } + mnt_drop_write(path->mnt); + +out_err: posix_acl_release(acls); return rc; } diff --git a/fs/smb/server/vfs.h b/fs/smb/server/vfs.h index a4ae89f3230d..8c0931d4d531 100644 --- a/fs/smb/server/vfs.h +++ b/fs/smb/server/vfs.h @@ -108,12 +108,12 @@ ssize_t ksmbd_vfs_casexattr_len(struct mnt_idmap *idmap, struct dentry *dentry, char *attr_name, int attr_name_len); int ksmbd_vfs_setxattr(struct mnt_idmap *idmap, - struct dentry *dentry, const char *attr_name, + const struct path *path, const char *attr_name, void *attr_value, size_t attr_size, int flags); int ksmbd_vfs_xattr_stream_name(char *stream_name, char **xattr_stream_name, size_t *xattr_stream_name_size, int s_type); int ksmbd_vfs_remove_xattr(struct mnt_idmap *idmap, - struct dentry *dentry, char *attr_name); + const struct path *path, char *attr_name); int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name, unsigned int flags, struct path *path, bool caseless); @@ -139,26 +139,25 @@ void ksmbd_vfs_posix_lock_wait(struct file_lock *flock); int ksmbd_vfs_posix_lock_wait_timeout(struct file_lock *flock, long timeout); void ksmbd_vfs_posix_lock_unblock(struct file_lock *flock); int ksmbd_vfs_remove_acl_xattrs(struct mnt_idmap *idmap, - struct dentry *dentry); -int ksmbd_vfs_remove_sd_xattrs(struct mnt_idmap *idmap, - struct dentry *dentry); + const struct path *path); +int ksmbd_vfs_remove_sd_xattrs(struct mnt_idmap *idmap, const struct path *path); int ksmbd_vfs_set_sd_xattr(struct ksmbd_conn *conn, struct mnt_idmap *idmap, - struct dentry *dentry, + const struct path *path, struct smb_ntsd *pntsd, int len); int ksmbd_vfs_get_sd_xattr(struct ksmbd_conn *conn, struct mnt_idmap *idmap, struct dentry *dentry, struct smb_ntsd **pntsd); int ksmbd_vfs_set_dos_attrib_xattr(struct mnt_idmap *idmap, - struct dentry *dentry, + const struct path *path, struct xattr_dos_attrib *da); int ksmbd_vfs_get_dos_attrib_xattr(struct mnt_idmap *idmap, struct dentry *dentry, struct xattr_dos_attrib *da); int ksmbd_vfs_set_init_posix_acl(struct mnt_idmap *idmap, - struct dentry *dentry); + struct path *path); int ksmbd_vfs_inherit_posix_acl(struct mnt_idmap *idmap, - struct dentry *dentry, + struct path *path, struct inode *parent_inode); #endif /* __KSMBD_VFS_H__ */ diff --git a/fs/smb/server/vfs_cache.c b/fs/smb/server/vfs_cache.c index 2d0138e72d78..f41f8d6108ce 100644 --- a/fs/smb/server/vfs_cache.c +++ b/fs/smb/server/vfs_cache.c @@ -252,7 +252,7 @@ static void __ksmbd_inode_close(struct ksmbd_file *fp) if (ksmbd_stream_fd(fp) && (ci->m_flags & S_DEL_ON_CLS_STREAM)) { ci->m_flags &= ~S_DEL_ON_CLS_STREAM; err = ksmbd_vfs_remove_xattr(file_mnt_idmap(filp), - filp->f_path.dentry, + &filp->f_path, fp->stream.name); if (err) pr_err("remove xattr failed : %s\n", -- cgit v1.2.3 From 5fe7f7b78290638806211046a99f031ff26164e1 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Thu, 15 Jun 2023 22:04:40 +0900 Subject: ksmbd: fix out-of-bound read in smb2_write ksmbd_smb2_check_message doesn't validate hdr->NextCommand. If ->NextCommand is bigger than Offset + Length of smb2 write, it will allow an oversized smb2 write length.
It will cause an OOB read in smb2_write. Cc: stable@vger.kernel.org Reported-by: zdi-disclosures@trendmicro.com # ZDI-CAN-21164 Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2misc.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/fs/smb/server/smb2misc.c b/fs/smb/server/smb2misc.c index 57749f41b991..33b7e6c4ceff 100644 --- a/fs/smb/server/smb2misc.c +++ b/fs/smb/server/smb2misc.c @@ -351,10 +351,16 @@ int ksmbd_smb2_check_message(struct ksmbd_work *work) int command; __u32 clc_len; /* calculated length */ __u32 len = get_rfc1002_len(work->request_buf); - __u32 req_struct_size; + __u32 req_struct_size, next_cmd = le32_to_cpu(hdr->NextCommand); - if (le32_to_cpu(hdr->NextCommand) > 0) - len = le32_to_cpu(hdr->NextCommand); + if ((u64)work->next_smb2_rcv_hdr_off + next_cmd > len) { + pr_err("next command(%u) offset exceeds smb msg size\n", + next_cmd); + return 1; + } + + if (next_cmd > 0) + len = next_cmd; else if (work->next_smb2_rcv_hdr_off) len -= work->next_smb2_rcv_hdr_off; -- cgit v1.2.3 From 5005bcb4219156f1bf7587b185080ec1da08518e Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Thu, 15 Jun 2023 22:05:29 +0900 Subject: ksmbd: validate session id and tree id in the compound request This patch validates the session id and tree id in a compound request. If the first operation in the compound is an SMB2 ECHO request, ksmbd bypasses session and tree validation, so work->sess and work->tcon could be NULL. If the second request in the compound accesses work->sess or tcon, it causes a NULL pointer dereferencing error. Cc: stable@vger.kernel.org Reported-by: zdi-disclosures@trendmicro.com # ZDI-CAN-21165 Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/server.c | 33 ++++++++++++++++++------------- fs/smb/server/smb2pdu.c | 44 +++++++++++++++++++++++++++++++++++++++----- 2 files changed, 59 insertions(+), 18 deletions(-) diff --git a/fs/smb/server/server.c b/fs/smb/server/server.c index f9b2e0f19b03..ced7a9e916f0 100644 --- a/fs/smb/server/server.c +++ b/fs/smb/server/server.c @@ -185,24 +185,31 @@ static void __handle_ksmbd_work(struct ksmbd_work *work, goto send; } - if (conn->ops->check_user_session) { - rc = conn->ops->check_user_session(work); - if (rc < 0) { - command = conn->ops->get_cmd_val(work); - conn->ops->set_rsp_status(work, - STATUS_USER_SESSION_DELETED); - goto send; - } else if (rc > 0) { - rc = conn->ops->get_ksmbd_tcon(work); + do { + if (conn->ops->check_user_session) { + rc = conn->ops->check_user_session(work); if (rc < 0) { - conn->ops->set_rsp_status(work, - STATUS_NETWORK_NAME_DELETED); + if (rc == -EINVAL) + conn->ops->set_rsp_status(work, + STATUS_INVALID_PARAMETER); + else + conn->ops->set_rsp_status(work, + STATUS_USER_SESSION_DELETED); goto send; + } else if (rc > 0) { + rc = conn->ops->get_ksmbd_tcon(work); + if (rc < 0) { + if (rc == -EINVAL) + conn->ops->set_rsp_status(work, + STATUS_INVALID_PARAMETER); + else + conn->ops->set_rsp_status(work, + STATUS_NETWORK_NAME_DELETED); + goto send; + } } } - } - do { rc = __process_request(work, conn, &command); if (rc == SERVER_HANDLER_ABORT) break; diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index ccecdb71d2bc..da1787c68ba0 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -91,7 +91,6 @@ int smb2_get_ksmbd_tcon(struct ksmbd_work *work) unsigned int cmd = le16_to_cpu(req_hdr->Command); int tree_id; - work->tcon = NULL; if (cmd == SMB2_TREE_CONNECT_HE || cmd == SMB2_CANCEL_HE || cmd == SMB2_LOGOFF_HE) { @@ -105,10 +104,28 @@
int smb2_get_ksmbd_tcon(struct ksmbd_work *work) } tree_id = le32_to_cpu(req_hdr->Id.SyncId.TreeId); + + /* + * If request is not the first in Compound request, + * Just validate tree id in header with work->tcon->id. + */ + if (work->next_smb2_rcv_hdr_off) { + if (!work->tcon) { + pr_err("The first operation in the compound does not have tcon\n"); + return -EINVAL; + } + if (work->tcon->id != tree_id) { + pr_err("tree id(%u) is different with id(%u) in first operation\n", + tree_id, work->tcon->id); + return -EINVAL; + } + return 1; + } + work->tcon = ksmbd_tree_conn_lookup(work->sess, tree_id); if (!work->tcon) { pr_err("Invalid tid %d\n", tree_id); - return -EINVAL; + return -ENOENT; } return 1; @@ -547,7 +564,6 @@ int smb2_check_user_session(struct ksmbd_work *work) unsigned int cmd = conn->ops->get_cmd_val(work); unsigned long long sess_id; - work->sess = NULL; /* * SMB2_ECHO, SMB2_NEGOTIATE, SMB2_SESSION_SETUP command do not * require a session id, so no need to validate user session's for @@ -558,15 +574,33 @@ int smb2_check_user_session(struct ksmbd_work *work) return 0; if (!ksmbd_conn_good(conn)) - return -EINVAL; + return -EIO; sess_id = le64_to_cpu(req_hdr->SessionId); + + /* + * If request is not the first in Compound request, + * Just validate session id in header with work->sess->id. + */ + if (work->next_smb2_rcv_hdr_off) { + if (!work->sess) { + pr_err("The first operation in the compound does not have sess\n"); + return -EINVAL; + } + if (work->sess->id != sess_id) { + pr_err("session id(%llu) is different with the first operation(%lld)\n", + sess_id, work->sess->id); + return -EINVAL; + } + return 1; + } + /* Check for validity of user session */ work->sess = ksmbd_session_lookup_all(conn, sess_id); if (work->sess) return 1; ksmbd_debug(SMB, "Invalid user session, Uid %llu\n", sess_id); - return -EINVAL; + return -ENOENT; } static void destroy_previous_session(struct ksmbd_conn *conn, -- cgit v1.2.3 From b5b2a02bcaac7c287694aa0db4837a07bf178626 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Thu, 15 Jun 2023 00:00:02 +0200 Subject: parisc: Delete redundant register definitions in <asm/assembly.h> We define sp and ipsw in <asm/asmregs.h> using ".reg", and when using current binutils (snapshot 2.40.50.20230611) the definitions in <asm/assembly.h> using "=" conflict with those: arch/parisc/include/asm/assembly.h: Assembler messages: arch/parisc/include/asm/assembly.h:93: Error: symbol `sp' is already defined arch/parisc/include/asm/assembly.h:95: Error: symbol `ipsw' is already defined Delete the duplicate definitions in <asm/assembly.h>. Also delete the definition of gp, which isn't used anywhere. Signed-off-by: Ben Hutchings Cc: stable@vger.kernel.org # v6.0+ Signed-off-by: Helge Deller --- arch/parisc/include/asm/assembly.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/parisc/include/asm/assembly.h b/arch/parisc/include/asm/assembly.h index 0f0d4a496fef..75677b526b2b 100644 --- a/arch/parisc/include/asm/assembly.h +++ b/arch/parisc/include/asm/assembly.h @@ -90,10 +90,6 @@ #include #include - sp = 30 - gp = 27 - ipsw = 22 - /* * We provide two versions of each macro to convert from physical * to virtual and vice versa.
The "_r1" versions take one argument -- cgit v1.2.3 From 4aaf2c52834b7f95acdf9fb0211a1b60adbf421b Mon Sep 17 00:00:00 2001 From: Íñigo Huguet Date: Thu, 15 Jun 2023 10:49:29 +0200 Subject: sfc: use budget for TX completions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When running workloads heavy unbalanced towards TX (high TX, low RX traffic), sfc driver can retain the CPU during too long times. Although in many cases this is not enough to be visible, it can affect performance and system responsiveness. A way to reproduce it is to use a debug kernel and run some parallel netperf TX tests. In some systems, this will lead to this message being logged: kernel:watchdog: BUG: soft lockup - CPU#12 stuck for 22s! The reason is that sfc driver doesn't account any NAPI budget for the TX completion events work. With high-TX/low-RX traffic, this makes that the CPU is held for long time for NAPI poll. Documentations says "drivers can process completions for any number of Tx packets but should only process up to budget number of Rx packets". However, many drivers do limit the amount of TX completions that they process in a single NAPI poll. In the same way, this patch adds a limit for the TX work in sfc. With the patch applied, the watchdog warning never appears. Tested with netperf in different combinations: single process / parallel processes, TCP / UDP and different sizes of UDP messages. Repeated the tests before and after the patch, without any noticeable difference in network or CPU performance. Test hardware: Intel(R) Xeon(R) CPU E5-1620 v4 @ 3.50GHz (4 cores, 2 threads/core) Solarflare Communications XtremeScale X2522-25G Network Adapter Fixes: 5227ecccea2d ("sfc: remove tx and MCDI handling from NAPI budget consideration") Fixes: d19a53721863 ("sfc_ef100: TX path for EF100 NICs") Reported-by: Fei Liu Signed-off-by: Íñigo Huguet Acked-by: Martin Habets Link: https://lore.kernel.org/r/20230615084929.10506-1-ihuguet@redhat.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/sfc/ef10.c | 25 ++++++++++++++++++------- drivers/net/ethernet/sfc/ef100_nic.c | 7 ++++++- drivers/net/ethernet/sfc/ef100_tx.c | 4 ++-- drivers/net/ethernet/sfc/ef100_tx.h | 2 +- drivers/net/ethernet/sfc/tx_common.c | 4 +++- drivers/net/ethernet/sfc/tx_common.h | 2 +- 6 files changed, 31 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/sfc/ef10.c b/drivers/net/ethernet/sfc/ef10.c index d30459dbfe8f..b63e47af6365 100644 --- a/drivers/net/ethernet/sfc/ef10.c +++ b/drivers/net/ethernet/sfc/ef10.c @@ -2950,7 +2950,7 @@ static u32 efx_ef10_extract_event_ts(efx_qword_t *event) return tstamp; } -static void +static int efx_ef10_handle_tx_event(struct efx_channel *channel, efx_qword_t *event) { struct efx_nic *efx = channel->efx; @@ -2958,13 +2958,14 @@ efx_ef10_handle_tx_event(struct efx_channel *channel, efx_qword_t *event) unsigned int tx_ev_desc_ptr; unsigned int tx_ev_q_label; unsigned int tx_ev_type; + int work_done; u64 ts_part; if (unlikely(READ_ONCE(efx->reset_pending))) - return; + return 0; if (unlikely(EFX_QWORD_FIELD(*event, ESF_DZ_TX_DROP_EVENT))) - return; + return 0; /* Get the transmit queue */ tx_ev_q_label = EFX_QWORD_FIELD(*event, ESF_DZ_TX_QLABEL); @@ -2973,8 +2974,7 @@ efx_ef10_handle_tx_event(struct efx_channel *channel, efx_qword_t *event) if (!tx_queue->timestamping) { /* Transmit completion */ tx_ev_desc_ptr = EFX_QWORD_FIELD(*event, ESF_DZ_TX_DESCR_INDX); - efx_xmit_done(tx_queue, tx_ev_desc_ptr & tx_queue->ptr_mask); - return; + return 
efx_xmit_done(tx_queue, tx_ev_desc_ptr & tx_queue->ptr_mask); } /* Transmit timestamps are only available for 8XXX series. They result @@ -3000,6 +3000,7 @@ efx_ef10_handle_tx_event(struct efx_channel *channel, efx_qword_t *event) * fields in the event. */ tx_ev_type = EFX_QWORD_FIELD(*event, ESF_EZ_TX_SOFT1); + work_done = 0; switch (tx_ev_type) { case TX_TIMESTAMP_EVENT_TX_EV_COMPLETION: @@ -3016,6 +3017,7 @@ efx_ef10_handle_tx_event(struct efx_channel *channel, efx_qword_t *event) tx_queue->completed_timestamp_major = ts_part; efx_xmit_done_single(tx_queue); + work_done = 1; break; default: @@ -3026,6 +3028,8 @@ efx_ef10_handle_tx_event(struct efx_channel *channel, efx_qword_t *event) EFX_QWORD_VAL(*event)); break; } + + return work_done; } static void @@ -3081,13 +3085,16 @@ static void efx_ef10_handle_driver_generated_event(struct efx_channel *channel, } } +#define EFX_NAPI_MAX_TX 512 + static int efx_ef10_ev_process(struct efx_channel *channel, int quota) { struct efx_nic *efx = channel->efx; efx_qword_t event, *p_event; unsigned int read_ptr; - int ev_code; + int spent_tx = 0; int spent = 0; + int ev_code; if (quota <= 0) return spent; @@ -3126,7 +3133,11 @@ static int efx_ef10_ev_process(struct efx_channel *channel, int quota) } break; case ESE_DZ_EV_CODE_TX_EV: - efx_ef10_handle_tx_event(channel, &event); + spent_tx += efx_ef10_handle_tx_event(channel, &event); + if (spent_tx >= EFX_NAPI_MAX_TX) { + spent = quota; + goto out; + } break; case ESE_DZ_EV_CODE_DRIVER_EV: efx_ef10_handle_driver_event(channel, &event); diff --git a/drivers/net/ethernet/sfc/ef100_nic.c b/drivers/net/ethernet/sfc/ef100_nic.c index 4dc643b0d2db..7adde9639c8a 100644 --- a/drivers/net/ethernet/sfc/ef100_nic.c +++ b/drivers/net/ethernet/sfc/ef100_nic.c @@ -253,6 +253,8 @@ static void ef100_ev_read_ack(struct efx_channel *channel) efx_reg(channel->efx, ER_GZ_EVQ_INT_PRIME)); } +#define EFX_NAPI_MAX_TX 512 + static int ef100_ev_process(struct efx_channel *channel, int quota) { struct efx_nic *efx = channel->efx; @@ -260,6 +262,7 @@ static int ef100_ev_process(struct efx_channel *channel, int quota) bool evq_phase, old_evq_phase; unsigned int read_ptr; efx_qword_t *p_event; + int spent_tx = 0; int spent = 0; bool ev_phase; int ev_type; @@ -295,7 +298,9 @@ static int ef100_ev_process(struct efx_channel *channel, int quota) efx_mcdi_process_event(channel, p_event); break; case ESE_GZ_EF100_EV_TX_COMPLETION: - ef100_ev_tx(channel, p_event); + spent_tx += ef100_ev_tx(channel, p_event); + if (spent_tx >= EFX_NAPI_MAX_TX) + spent = quota; break; case ESE_GZ_EF100_EV_DRIVER: netif_info(efx, drv, efx->net_dev, diff --git a/drivers/net/ethernet/sfc/ef100_tx.c b/drivers/net/ethernet/sfc/ef100_tx.c index 29ffaf35559d..849e5555bd12 100644 --- a/drivers/net/ethernet/sfc/ef100_tx.c +++ b/drivers/net/ethernet/sfc/ef100_tx.c @@ -346,7 +346,7 @@ void ef100_tx_write(struct efx_tx_queue *tx_queue) ef100_tx_push_buffers(tx_queue); } -void ef100_ev_tx(struct efx_channel *channel, const efx_qword_t *p_event) +int ef100_ev_tx(struct efx_channel *channel, const efx_qword_t *p_event) { unsigned int tx_done = EFX_QWORD_FIELD(*p_event, ESF_GZ_EV_TXCMPL_NUM_DESC); @@ -357,7 +357,7 @@ void ef100_ev_tx(struct efx_channel *channel, const efx_qword_t *p_event) unsigned int tx_index = (tx_queue->read_count + tx_done - 1) & tx_queue->ptr_mask; - efx_xmit_done(tx_queue, tx_index); + return efx_xmit_done(tx_queue, tx_index); } /* Add a socket buffer to a TX queue diff --git a/drivers/net/ethernet/sfc/ef100_tx.h 
b/drivers/net/ethernet/sfc/ef100_tx.h index e9e11540fcde..d9a0819c5a72 100644 --- a/drivers/net/ethernet/sfc/ef100_tx.h +++ b/drivers/net/ethernet/sfc/ef100_tx.h @@ -20,7 +20,7 @@ void ef100_tx_init(struct efx_tx_queue *tx_queue); void ef100_tx_write(struct efx_tx_queue *tx_queue); unsigned int ef100_tx_max_skb_descs(struct efx_nic *efx); -void ef100_ev_tx(struct efx_channel *channel, const efx_qword_t *p_event); +int ef100_ev_tx(struct efx_channel *channel, const efx_qword_t *p_event); netdev_tx_t ef100_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb); int __ef100_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb, diff --git a/drivers/net/ethernet/sfc/tx_common.c b/drivers/net/ethernet/sfc/tx_common.c index 67e789b96c43..755aa92bf823 100644 --- a/drivers/net/ethernet/sfc/tx_common.c +++ b/drivers/net/ethernet/sfc/tx_common.c @@ -249,7 +249,7 @@ void efx_xmit_done_check_empty(struct efx_tx_queue *tx_queue) } } -void efx_xmit_done(struct efx_tx_queue *tx_queue, unsigned int index) +int efx_xmit_done(struct efx_tx_queue *tx_queue, unsigned int index) { unsigned int fill_level, pkts_compl = 0, bytes_compl = 0; unsigned int efv_pkts_compl = 0; @@ -279,6 +279,8 @@ void efx_xmit_done(struct efx_tx_queue *tx_queue, unsigned int index) } efx_xmit_done_check_empty(tx_queue); + + return pkts_compl + efv_pkts_compl; } /* Remove buffers put into a tx_queue for the current packet. diff --git a/drivers/net/ethernet/sfc/tx_common.h b/drivers/net/ethernet/sfc/tx_common.h index d87aecbc7bf1..1e9f42938aac 100644 --- a/drivers/net/ethernet/sfc/tx_common.h +++ b/drivers/net/ethernet/sfc/tx_common.h @@ -28,7 +28,7 @@ static inline bool efx_tx_buffer_in_use(struct efx_tx_buffer *buffer) } void efx_xmit_done_check_empty(struct efx_tx_queue *tx_queue); -void efx_xmit_done(struct efx_tx_queue *tx_queue, unsigned int index); +int efx_xmit_done(struct efx_tx_queue *tx_queue, unsigned int index); void efx_enqueue_unwind(struct efx_tx_queue *tx_queue, unsigned int insert_count); -- cgit v1.2.3 From 9636be85cc5bdd8b7a7f6a53405cbcc52161c93c Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Tue, 23 May 2023 10:14:21 -0700 Subject: x86/hyperv: Fix hyperv_pcpu_input_arg handling when CPUs go online/offline These commits a494aef23dfc ("PCI: hv: Replace retarget_msi_interrupt_params with hyperv_pcpu_input_arg") 2c6ba4216844 ("PCI: hv: Enable PCI pass-thru devices in Confidential VMs") update the Hyper-V virtual PCI driver to use the hyperv_pcpu_input_arg because that memory will be correctly marked as decrypted or encrypted for all VM types (CoCo or normal). But problems ensue when CPUs in the VM go online or offline after virtual PCI devices have been configured. When a CPU is brought online, the hyperv_pcpu_input_arg for that CPU is initialized by hv_cpu_init() running under state CPUHP_AP_ONLINE_DYN. But this state occurs after state CPUHP_AP_IRQ_AFFINITY_ONLINE, which may call the virtual PCI driver and fault trying to use the as yet uninitialized hyperv_pcpu_input_arg. A similar problem occurs in a CoCo VM if the MMIO read and write hypercalls are used from state CPUHP_AP_IRQ_AFFINITY_ONLINE. When a CPU is taken offline, IRQs may be reassigned in state CPUHP_TEARDOWN_CPU. Again, the virtual PCI driver may fault trying to use the hyperv_pcpu_input_arg that has already been freed by a higher state. Fix the onlining problem by adding state CPUHP_AP_HYPERV_ONLINE immediately after CPUHP_AP_ONLINE_IDLE (similar to CPUHP_AP_KVM_ONLINE) and before CPUHP_AP_IRQ_AFFINITY_ONLINE. 
Use this new state for Hyper-V initialization so that hyperv_pcpu_input_arg is allocated early enough. Fix the offlining problem by not freeing hyperv_pcpu_input_arg when a CPU goes offline. Retain the allocated memory, and reuse it if the CPU comes back online later. Signed-off-by: Michael Kelley Reviewed-by: Vitaly Kuznetsov Acked-by: Borislav Petkov (AMD) Reviewed-by: Dexuan Cui Link: https://lore.kernel.org/r/1684862062-51576-1-git-send-email-mikelley@microsoft.com Signed-off-by: Wei Liu --- arch/x86/hyperv/hv_init.c | 2 +- drivers/hv/hv_common.c | 48 +++++++++++++++++++++++----------------------- include/linux/cpuhotplug.h | 1 + 3 files changed, 26 insertions(+), 25 deletions(-) diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index a5f9474f08e1..6c04b52f139b 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -416,7 +416,7 @@ void __init hyperv_init(void) goto free_vp_assist_page; } - cpuhp = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/hyperv_init:online", + cpuhp = cpuhp_setup_state(CPUHP_AP_HYPERV_ONLINE, "x86/hyperv_init:online", hv_cpu_init, hv_cpu_die); if (cpuhp < 0) goto free_ghcb_page; diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c index 64f9ceca887b..542a1d53b303 100644 --- a/drivers/hv/hv_common.c +++ b/drivers/hv/hv_common.c @@ -364,13 +364,20 @@ int hv_common_cpu_init(unsigned int cpu) flags = irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL; inputarg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg); - *inputarg = kmalloc(pgcount * HV_HYP_PAGE_SIZE, flags); - if (!(*inputarg)) - return -ENOMEM; - if (hv_root_partition) { - outputarg = (void **)this_cpu_ptr(hyperv_pcpu_output_arg); - *outputarg = (char *)(*inputarg) + HV_HYP_PAGE_SIZE; + /* + * hyperv_pcpu_input_arg and hyperv_pcpu_output_arg memory is already + * allocated if this CPU was previously online and then taken offline + */ + if (!*inputarg) { + *inputarg = kmalloc(pgcount * HV_HYP_PAGE_SIZE, flags); + if (!(*inputarg)) + return -ENOMEM; + + if (hv_root_partition) { + outputarg = (void **)this_cpu_ptr(hyperv_pcpu_output_arg); + *outputarg = (char *)(*inputarg) + HV_HYP_PAGE_SIZE; + } } msr_vp_index = hv_get_register(HV_REGISTER_VP_INDEX); @@ -385,24 +392,17 @@ int hv_common_cpu_init(unsigned int cpu) int hv_common_cpu_die(unsigned int cpu) { - unsigned long flags; - void **inputarg, **outputarg; - void *mem; - - local_irq_save(flags); - - inputarg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg); - mem = *inputarg; - *inputarg = NULL; - - if (hv_root_partition) { - outputarg = (void **)this_cpu_ptr(hyperv_pcpu_output_arg); - *outputarg = NULL; - } - - local_irq_restore(flags); - - kfree(mem); + /* + * The hyperv_pcpu_input_arg and hyperv_pcpu_output_arg memory + * is not freed when the CPU goes offline as the hyperv_pcpu_input_arg + * may be used by the Hyper-V vPCI driver in reassigning interrupts + * as part of the offlining process. The interrupt reassignment + * happens *after* the CPUHP_AP_HYPERV_ONLINE state has run and + * called this function. + * + * If a previously offlined CPU is brought back online again, the + * originally allocated memory is reused in hv_common_cpu_init(). 
+ */ return 0; } diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 0f1001dca0e0..3ceb9dfa0993 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -200,6 +200,7 @@ enum cpuhp_state { /* Online section invoked on the hotplugged CPU from the hotplug thread */ CPUHP_AP_ONLINE_IDLE, + CPUHP_AP_HYPERV_ONLINE, CPUHP_AP_KVM_ONLINE, CPUHP_AP_SCHED_WAIT_EMPTY, CPUHP_AP_SMPBOOT_THREADS, -- cgit v1.2.3 From 52ae076c3a9b366b6fa9f7c7e67aed8b28716ed9 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Tue, 23 May 2023 10:14:22 -0700 Subject: arm64/hyperv: Use CPUHP_AP_HYPERV_ONLINE state to fix CPU online sequencing State CPUHP_AP_HYPERV_ONLINE has been introduced to correctly sequence the initialization of hyperv_pcpu_input_arg. Use this new state for Hyper-V initialization so that hyperv_pcpu_input_arg is allocated early enough. Signed-off-by: Michael Kelley Reviewed-by: Dexuan Cui Link: https://lore.kernel.org/r/1684862062-51576-2-git-send-email-mikelley@microsoft.com Signed-off-by: Wei Liu --- arch/arm64/hyperv/mshyperv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/hyperv/mshyperv.c b/arch/arm64/hyperv/mshyperv.c index a406454578f0..f1b8a04ee9f2 100644 --- a/arch/arm64/hyperv/mshyperv.c +++ b/arch/arm64/hyperv/mshyperv.c @@ -67,7 +67,7 @@ static int __init hyperv_init(void) if (ret) return ret; - ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "arm64/hyperv_init:online", + ret = cpuhp_setup_state(CPUHP_AP_HYPERV_ONLINE, "arm64/hyperv_init:online", hv_common_cpu_init, hv_common_cpu_die); if (ret < 0) { hv_common_free(); -- cgit v1.2.3 From ef7dfac51d8ed961b742218f526bd589f3900a59 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 17 Jun 2023 19:50:24 -0600 Subject: io_uring/poll: serialize poll linked timer start with poll removal We selectively grab the ctx->uring_lock for poll update/removal, but we really should grab it from the start to fully synchronize with linked timeouts. Normally this is indeed the case, but if requests are forced async by the application, we don't fully cover removal and timer disarm within the uring_lock. Make this simpler by having consistent locking state for poll removal. 
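Schematically, the removal path now takes the ring lock once up front and drops it once at the end (a simplified sketch of the resulting shape, not the literal io_uring code; the real change is in the diff below):

	io_ring_submit_lock(ctx, issue_flags);	/* taken from the start */
	preq = io_poll_find(ctx, true, &cd, &ctx->cancel_table, &bucket);
	ret2 = io_poll_disarm(preq);
	/* ... lookup in cancel_table_locked, update or complete preq ... */
out:
	io_ring_submit_unlock(ctx, issue_flags);	/* dropped once, at the end */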
Cc: stable@vger.kernel.org # 6.1+ Reported-by: Querijn Voet Signed-off-by: Jens Axboe --- io_uring/poll.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index c90e47dc1e29..a78b8af7d9ab 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -977,8 +977,9 @@ int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) struct io_hash_bucket *bucket; struct io_kiocb *preq; int ret2, ret = 0; - struct io_tw_state ts = {}; + struct io_tw_state ts = { .locked = true }; + io_ring_submit_lock(ctx, issue_flags); preq = io_poll_find(ctx, true, &cd, &ctx->cancel_table, &bucket); ret2 = io_poll_disarm(preq); if (bucket) @@ -990,12 +991,10 @@ int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) goto out; } - io_ring_submit_lock(ctx, issue_flags); preq = io_poll_find(ctx, true, &cd, &ctx->cancel_table_locked, &bucket); ret2 = io_poll_disarm(preq); if (bucket) spin_unlock(&bucket->lock); - io_ring_submit_unlock(ctx, issue_flags); if (ret2) { ret = ret2; goto out; } @@ -1019,7 +1018,7 @@ found: if (poll_update->update_user_data) preq->cqe.user_data = poll_update->new_user_data; - ret2 = io_poll_add(preq, issue_flags); + ret2 = io_poll_add(preq, issue_flags & ~IO_URING_F_UNLOCKED); /* successfully updated, don't complete poll request */ if (!ret2 || ret2 == -EIOCBQUEUED) goto out; @@ -1027,9 +1026,9 @@ found: req_set_fail(preq); io_req_set_res(preq, -ECANCELED, 0); - ts.locked = !(issue_flags & IO_URING_F_UNLOCKED); io_req_task_complete(preq, &ts); out: + io_ring_submit_unlock(ctx, issue_flags); if (ret < 0) { req_set_fail(req); return ret; -- cgit v1.2.3 From 6aa0365a3c8512587fffd42fe438768709ddef8e Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 15 Jun 2023 17:18:53 +0900 Subject: ata: libata-scsi: Avoid deadlock on rescan after device resume When an ATA port is resumed from sleep, the port is reset and a power management request is issued to libata EH to reset the port and rescan the device(s) attached to it. Device rescanning is done by scheduling an ata_scsi_dev_rescan() work, which will execute scsi_rescan_device(). However, scsi_rescan_device() takes the generic device lock, which is also taken by dpm_resume() when the SCSI device is resumed as well. If a device rescan execution starts before the completion of the SCSI device resume, the rcu locking used to refresh the cached VPD pages of the device, combined with the generic device locking from scsi_rescan_device() and from dpm_resume(), can cause a deadlock. Avoid this situation by changing struct ata_port's scsi_rescan_task to be a delayed work instead of a simple work_struct. ata_scsi_dev_rescan() is modified to check that the SCSI device associated with the ATA device that must be rescanned is not suspended. If the SCSI device is still suspended, ata_scsi_dev_rescan() returns early and reschedules itself for execution after an arbitrary delay of 5ms.
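Condensed, the rescheduling logic added below looks like this (a simplified sketch with locking, iteration and error handling omitted):

	static void ata_scsi_dev_rescan(struct work_struct *work)
	{
		struct ata_port *ap = container_of(work, struct ata_port,
						   scsi_rescan_task.work);
		/* ... */
		if (sdev->sdev_gendev.power.is_suspended) {
			/* SCSI device not fully resumed yet: retry in 5 ms */
			schedule_delayed_work(&ap->scsi_rescan_task,
					      msecs_to_jiffies(5));
			return;
		}
		scsi_rescan_device(&sdev->sdev_gendev);
	}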
Reported-by: Kai-Heng Feng Reported-by: Joe Breuer Closes: https://bugzilla.kernel.org/show_bug.cgi?id=217530 Fixes: a19a93e4c6a9 ("scsi: core: pm: Rely on the device driver core for async power management") Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke Tested-by: Kai-Heng Feng Tested-by: Joe Breuer --- drivers/ata/libata-core.c | 3 ++- drivers/ata/libata-eh.c | 2 +- drivers/ata/libata-scsi.c | 22 +++++++++++++++++++++- include/linux/libata.h | 2 +- 4 files changed, 25 insertions(+), 4 deletions(-) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 8bf612bdd61a..b4f246f0cac7 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -5348,7 +5348,7 @@ struct ata_port *ata_port_alloc(struct ata_host *host) mutex_init(&ap->scsi_scan_mutex); INIT_DELAYED_WORK(&ap->hotplug_task, ata_scsi_hotplug); - INIT_WORK(&ap->scsi_rescan_task, ata_scsi_dev_rescan); + INIT_DELAYED_WORK(&ap->scsi_rescan_task, ata_scsi_dev_rescan); INIT_LIST_HEAD(&ap->eh_done_q); init_waitqueue_head(&ap->eh_wait_q); init_completion(&ap->park_req_pending); @@ -5954,6 +5954,7 @@ static void ata_port_detach(struct ata_port *ap) WARN_ON(!(ap->pflags & ATA_PFLAG_UNLOADED)); cancel_delayed_work_sync(&ap->hotplug_task); + cancel_delayed_work_sync(&ap->scsi_rescan_task); skip_eh: /* clean up zpodd on port removal */ diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c index a6c901811802..6f8d14191593 100644 --- a/drivers/ata/libata-eh.c +++ b/drivers/ata/libata-eh.c @@ -2984,7 +2984,7 @@ static int ata_eh_revalidate_and_attach(struct ata_link *link, ehc->i.flags |= ATA_EHI_SETMODE; /* schedule the scsi_rescan_device() here */ - schedule_work(&(ap->scsi_rescan_task)); + schedule_delayed_work(&ap->scsi_rescan_task, 0); } else if (dev->class == ATA_DEV_UNKNOWN && ehc->tries[dev->devno] && ata_class_enabled(ehc->classes[dev->devno])) { diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 8ce90284eb34..551077cea4e4 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -4597,10 +4597,11 @@ int ata_scsi_user_scan(struct Scsi_Host *shost, unsigned int channel, void ata_scsi_dev_rescan(struct work_struct *work) { struct ata_port *ap = - container_of(work, struct ata_port, scsi_rescan_task); + container_of(work, struct ata_port, scsi_rescan_task.work); struct ata_link *link; struct ata_device *dev; unsigned long flags; + bool delay_rescan = false; mutex_lock(&ap->scsi_scan_mutex); spin_lock_irqsave(ap->lock, flags); @@ -4614,6 +4615,21 @@ void ata_scsi_dev_rescan(struct work_struct *work) if (scsi_device_get(sdev)) continue; + /* + * If the rescan work was scheduled because of a resume + * event, the port is already fully resumed, but the + * SCSI device may not yet be fully resumed. In such + * case, executing scsi_rescan_device() may cause a + * deadlock with the PM code on device_lock(). Prevent + * this by giving up and retrying rescan after a short + * delay. 
+ */ + delay_rescan = sdev->sdev_gendev.power.is_suspended; + if (delay_rescan) { + scsi_device_put(sdev); + break; + } + spin_unlock_irqrestore(ap->lock, flags); scsi_rescan_device(&(sdev->sdev_gendev)); scsi_device_put(sdev); @@ -4623,4 +4639,8 @@ void ata_scsi_dev_rescan(struct work_struct *work) spin_unlock_irqrestore(ap->lock, flags); mutex_unlock(&ap->scsi_scan_mutex); + + if (delay_rescan) + schedule_delayed_work(&ap->scsi_rescan_task, + msecs_to_jiffies(5)); } diff --git a/include/linux/libata.h b/include/linux/libata.h index 311cd93377c7..dd5797fb6305 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -836,7 +836,7 @@ struct ata_port { struct mutex scsi_scan_mutex; struct delayed_work hotplug_task; - struct work_struct scsi_rescan_task; + struct delayed_work scsi_rescan_task; unsigned int hsm_task_state; -- cgit v1.2.3 From 440b5e3663271b0ffbd4908115044a6a51fb938b Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Wed, 14 Jun 2023 21:44:47 -0700 Subject: PCI: hv: Fix a race condition bug in hv_pci_query_relations() Since day 1 of the driver, there has been a race between hv_pci_query_relations() and survey_child_resources(): during fast device hotplug, hv_pci_query_relations() may error out due to device-remove and the stack variable 'comp' is no longer valid; however, pci_devices_present_work() -> survey_child_resources() -> complete() may be running on another CPU and accessing the no-longer-valid 'comp'. Fix the race by flushing the workqueue before we exit from hv_pci_query_relations(). Fixes: 4daace0d8ce8 ("PCI: hv: Add paravirtual PCI front-end for Microsoft Hyper-V VMs") Signed-off-by: Dexuan Cui Reviewed-by: Michael Kelley Acked-by: Lorenzo Pieralisi Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20230615044451.5580-2-decui@microsoft.com Signed-off-by: Wei Liu --- drivers/pci/controller/pci-hyperv.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index bc32662c6bb7..ea8862e656b6 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -3401,6 +3401,24 @@ static int hv_pci_query_relations(struct hv_device *hdev) if (!ret) ret = wait_for_response(hdev, &comp); + /* + * In the case of fast device addition/removal, it's possible that + * vmbus_sendpacket() or wait_for_response() returns -ENODEV but we + * already got a PCI_BUS_RELATIONS* message from the host and the + * channel callback already scheduled a work to hbus->wq, which can be + * running pci_devices_present_work() -> survey_child_resources() -> + * complete(&hbus->survey_event), even after hv_pci_query_relations() + * exits and the stack variable 'comp' is no longer valid; as a result, + * a hang or a page fault may happen when the complete() calls + * raw_spin_lock_irqsave(). Flush hbus->wq before we exit from + * hv_pci_query_relations() to avoid the issues. Note: if 'ret' is + * -ENODEV, there can't be any more work item scheduled to hbus->wq + * after the flush_workqueue(): see vmbus_onoffer_rescind() -> + * vmbus_reset_channel_cb(), vmbus_rescind_cleanup() -> + * channel->rescind = true. 
+ */ + flush_workqueue(hbus->wq); + return ret; } -- cgit v1.2.3 From 2738d5ab7929a845b654cd171a1e275c37eb428e Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Wed, 14 Jun 2023 21:44:48 -0700 Subject: PCI: hv: Fix a race condition in hv_irq_unmask() that can cause panic When the host tries to remove a PCI device, the host first sends a PCI_EJECT message to the guest, and the guest is supposed to gracefully remove the PCI device and send a PCI_EJECTION_COMPLETE message to the host; the host then sends a VMBus message CHANNELMSG_RESCIND_CHANNELOFFER to the guest (when the guest receives this message, the device is already unassigned from the guest) and the guest can do some final cleanup work; if the guest fails to respond to the PCI_EJECT message within one minute, the host sends the VMBus message CHANNELMSG_RESCIND_CHANNELOFFER and removes the PCI device forcibly. In the case of fast device addition/removal, it's possible that the PCI device driver is still configuring MSI-X interrupts when the guest receives the PCI_EJECT message; the channel callback calls hv_pci_eject_device(), which sets hpdev->state to hv_pcichild_ejecting, and schedules a work hv_eject_device_work(); if the PCI device driver is calling pci_alloc_irq_vectors() -> ... -> hv_compose_msi_msg(), we can break the while loop in hv_compose_msi_msg() due to the updated hpdev->state, and leave data->chip_data with its default value of NULL; later, when the PCI device driver calls request_irq() -> ... -> hv_irq_unmask(), the guest crashes in hv_arch_irq_unmask() due to data->chip_data being NULL. Fix the issue by not testing hpdev->state in the while loop: when the guest receives PCI_EJECT, the device is still assigned to the guest, and the guest has one minute to finish the device removal gracefully. We don't really need to (and we should not) test hpdev->state in the loop. Fixes: de0aa7b2f97d ("PCI: hv: Fix 2 hang issues in hv_compose_msi_msg()") Signed-off-by: Dexuan Cui Reviewed-by: Michael Kelley Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20230615044451.5580-3-decui@microsoft.com Signed-off-by: Wei Liu --- drivers/pci/controller/pci-hyperv.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index ea8862e656b6..733637d96765 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -635,6 +635,11 @@ static void hv_arch_irq_unmask(struct irq_data *data) pbus = pdev->bus; hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata); int_desc = data->chip_data; + if (!int_desc) { + dev_warn(&hbus->hdev->device, "%s() can not unmask irq %u\n", + __func__, data->irq); + return; + } local_irq_save(flags); @@ -2004,12 +2009,6 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) hv_pci_onchannelcallback(hbus); spin_unlock_irqrestore(&channel->sched_lock, flags); - if (hpdev->state == hv_pcichild_ejecting) { - dev_err_once(&hbus->hdev->device, - "the device is being ejected\n"); - goto enable_tasklet; - } - udelay(100); } -- cgit v1.2.3 From add9195e69c94b32e96f78c2f9cea68f0e850b3f Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Wed, 14 Jun 2023 21:44:49 -0700 Subject: PCI: hv: Remove the useless hv_pcichild_state from struct hv_pci_dev The hpdev->state is never really useful. The only use in hv_pci_eject_device() and hv_eject_device_work() is not really necessary. 
Signed-off-by: Dexuan Cui Reviewed-by: Michael Kelley Acked-by: Lorenzo Pieralisi Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20230615044451.5580-4-decui@microsoft.com Signed-off-by: Wei Liu --- drivers/pci/controller/pci-hyperv.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index 733637d96765..a826b41c949a 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -545,19 +545,10 @@ struct hv_dr_state { struct hv_pcidev_description func[]; }; -enum hv_pcichild_state { - hv_pcichild_init = 0, - hv_pcichild_requirements, - hv_pcichild_resourced, - hv_pcichild_ejecting, - hv_pcichild_maximum -}; - struct hv_pci_dev { /* List protected by pci_rescan_remove_lock */ struct list_head list_entry; refcount_t refs; - enum hv_pcichild_state state; struct pci_slot *pci_slot; struct hv_pcidev_description desc; bool reported_missing; @@ -2843,8 +2834,6 @@ static void hv_eject_device_work(struct work_struct *work) hpdev = container_of(work, struct hv_pci_dev, wrk); hbus = hpdev->hbus; - WARN_ON(hpdev->state != hv_pcichild_ejecting); - /* * Ejection can come before or after the PCI bus has been set up, so * attempt to find it and tear down the bus state, if it exists. This @@ -2901,7 +2890,6 @@ static void hv_pci_eject_device(struct hv_pci_dev *hpdev) return; } - hpdev->state = hv_pcichild_ejecting; get_pcichild(hpdev); INIT_WORK(&hpdev->wrk, hv_eject_device_work); queue_work(hbus->wq, &hpdev->wrk); -- cgit v1.2.3 From a847234e24d03d01a9566d1d9dcce018cc018d67 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Wed, 14 Jun 2023 21:44:50 -0700 Subject: Revert "PCI: hv: Fix a timing issue which causes kdump to fail occasionally" This reverts commit d6af2ed29c7c1c311b96dac989dcb991e90ee195. The statement "the hv_pci_bus_exit() call releases structures of all its child devices" in commit d6af2ed29c7c is not true: in the path hv_pci_probe() -> hv_pci_enter_d0() -> hv_pci_bus_exit(hdev, true): the parameter "keep_devs" is true, so hv_pci_bus_exit() does *not* release the child "struct hv_pci_dev *hpdev" that is created earlier in pci_devices_present_work() -> new_pcichild_device(). The commit d6af2ed29c7c was originally made in July 2020 for RHEL 7.7, where the old version of hv_pci_bus_exit() was used; when the commit was rebased and merged into the upstream, people didn't notice that it's not really necessary. The commit itself doesn't cause any issue, but it makes hv_pci_probe() more complicated. Revert it to facilitate some upcoming changes to hv_pci_probe(). Signed-off-by: Dexuan Cui Reviewed-by: Michael Kelley Acked-by: Wei Hu Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20230615044451.5580-5-decui@microsoft.com Signed-off-by: Wei Liu --- drivers/pci/controller/pci-hyperv.c | 71 ++++++++++++++++++------------------- 1 file changed, 34 insertions(+), 37 deletions(-) diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index a826b41c949a..1a5296fad1c4 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -3318,8 +3318,10 @@ static int hv_pci_enter_d0(struct hv_device *hdev) struct pci_bus_d0_entry *d0_entry; struct hv_pci_compl comp_pkt; struct pci_packet *pkt; + bool retry = true; int ret; +enter_d0_retry: /* * Tell the host that the bus is ready to use, and moved into the * powered-on state. 
This includes telling the host which region @@ -3346,6 +3348,38 @@ static int hv_pci_enter_d0(struct hv_device *hdev) if (ret) goto exit; + /* + * In certain case (Kdump) the pci device of interest was + * not cleanly shut down and resource is still held on host + * side, the host could return invalid device status. + * We need to explicitly request host to release the resource + * and try to enter D0 again. + */ + if (comp_pkt.completion_status < 0 && retry) { + retry = false; + + dev_err(&hdev->device, "Retrying D0 Entry\n"); + + /* + * Hv_pci_bus_exit() calls hv_send_resource_released() + * to free up resources of its child devices. + * In the kdump kernel we need to set the + * wslot_res_allocated to 255 so it scans all child + * devices to release resources allocated in the + * normal kernel before panic happened. + */ + hbus->wslot_res_allocated = 255; + + ret = hv_pci_bus_exit(hdev, true); + + if (ret == 0) { + kfree(pkt); + goto enter_d0_retry; + } + dev_err(&hdev->device, + "Retrying D0 failed with ret %d\n", ret); + } + if (comp_pkt.completion_status < 0) { dev_err(&hdev->device, "PCI Pass-through VSP failed D0 Entry with status %x\n", @@ -3591,7 +3625,6 @@ static int hv_pci_probe(struct hv_device *hdev, struct hv_pcibus_device *hbus; u16 dom_req, dom; char *name; - bool enter_d0_retry = true; int ret; bridge = devm_pci_alloc_host_bridge(&hdev->device, 0); @@ -3708,47 +3741,11 @@ static int hv_pci_probe(struct hv_device *hdev, if (ret) goto free_fwnode; -retry: ret = hv_pci_query_relations(hdev); if (ret) goto free_irq_domain; ret = hv_pci_enter_d0(hdev); - /* - * In certain case (Kdump) the pci device of interest was - * not cleanly shut down and resource is still held on host - * side, the host could return invalid device status. - * We need to explicitly request host to release the resource - * and try to enter D0 again. - * Since the hv_pci_bus_exit() call releases structures - * of all its child devices, we need to start the retry from - * hv_pci_query_relations() call, requesting host to send - * the synchronous child device relations message before this - * information is needed in hv_send_resources_allocated() - * call later. - */ - if (ret == -EPROTO && enter_d0_retry) { - enter_d0_retry = false; - - dev_err(&hdev->device, "Retrying D0 Entry\n"); - - /* - * Hv_pci_bus_exit() calls hv_send_resources_released() - * to free up resources of its child devices. - * In the kdump kernel we need to set the - * wslot_res_allocated to 255 so it scans all child - * devices to release resources allocated in the - * normal kernel before panic happened. 
- */ - hbus->wslot_res_allocated = 255; - ret = hv_pci_bus_exit(hdev, true); - - if (ret == 0) - goto retry; - - dev_err(&hdev->device, - "Retrying D0 failed with ret %d\n", ret); - } if (ret) goto free_irq_domain; -- cgit v1.2.3 From 067d6ec7ed5b49380688e06c1e5f883a71bef4fe Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Wed, 14 Jun 2023 21:44:51 -0700 Subject: PCI: hv: Add a per-bus mutex state_lock In the case of fast device addition/removal, it's possible that hv_eject_device_work() can start to run before create_root_hv_pci_bus() starts to run; as a result, the pci_get_domain_bus_and_slot() in hv_eject_device_work() can return a 'pdev' of NULL, and hv_eject_device_work() can remove the 'hpdev', and immediately send a message PCI_EJECTION_COMPLETE to the host, and the host immediately unassigns the PCI device from the guest; meanwhile, create_root_hv_pci_bus() and the PCI device driver can be probing the dead PCI device and reporting timeout errors. Fix the issue by adding a per-bus mutex 'state_lock' and grabbing the mutex before powering on the PCI bus in hv_pci_enter_d0(): when hv_eject_device_work() starts to run, it's able to find the 'pdev' and call pci_stop_and_remove_bus_device(pdev): if the PCI device driver has loaded, the PCI device driver's probe() function is already called in create_root_hv_pci_bus() -> pci_bus_add_devices(), and now hv_eject_device_work() -> pci_stop_and_remove_bus_device() is able to call the PCI device driver's remove() function and remove the device reliably; if the PCI device driver hasn't loaded yet, the function call hv_eject_device_work() -> pci_stop_and_remove_bus_device() is able to remove the PCI device reliably and the PCI device driver's probe() function won't be called; if the PCI device driver's probe() is already running (e.g., systemd-udev is loading the PCI device driver), it must be holding the per-device lock, and after the probe() finishes and releases the lock, hv_eject_device_work() -> pci_stop_and_remove_bus_device() is able to proceed to remove the device reliably. Fixes: 4daace0d8ce8 ("PCI: hv: Add paravirtual PCI front-end for Microsoft Hyper-V VMs") Signed-off-by: Dexuan Cui Reviewed-by: Michael Kelley Acked-by: Lorenzo Pieralisi Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20230615044451.5580-6-decui@microsoft.com Signed-off-by: Wei Liu --- drivers/pci/controller/pci-hyperv.c | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index 1a5296fad1c4..2d93d0c4f10d 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -489,7 +489,10 @@ struct hv_pcibus_device { struct fwnode_handle *fwnode; /* Protocol version negotiated with the host */ enum pci_protocol_version_t protocol_version; + + struct mutex state_lock; enum hv_pcibus_state state; + struct hv_device *hdev; resource_size_t low_mmio_space; resource_size_t high_mmio_space; @@ -2605,6 +2608,8 @@ static void pci_devices_present_work(struct work_struct *work) if (!dr) return; + mutex_lock(&hbus->state_lock); + /* First, mark all existing children as reported missing. 
*/ spin_lock_irqsave(&hbus->device_list_lock, flags); list_for_each_entry(hpdev, &hbus->children, list_entry) { @@ -2686,6 +2691,8 @@ static void pci_devices_present_work(struct work_struct *work) break; } + mutex_unlock(&hbus->state_lock); + kfree(dr); } @@ -2834,6 +2841,8 @@ static void hv_eject_device_work(struct work_struct *work) hpdev = container_of(work, struct hv_pci_dev, wrk); hbus = hpdev->hbus; + mutex_lock(&hbus->state_lock); + /* * Ejection can come before or after the PCI bus has been set up, so * attempt to find it and tear down the bus state, if it exists. This @@ -2870,6 +2879,8 @@ static void hv_eject_device_work(struct work_struct *work) put_pcichild(hpdev); put_pcichild(hpdev); /* hpdev has been freed. Do not use it any more. */ + + mutex_unlock(&hbus->state_lock); } /** @@ -3636,6 +3647,7 @@ static int hv_pci_probe(struct hv_device *hdev, return -ENOMEM; hbus->bridge = bridge; + mutex_init(&hbus->state_lock); hbus->state = hv_pcibus_init; hbus->wslot_res_allocated = -1; @@ -3745,9 +3757,11 @@ static int hv_pci_probe(struct hv_device *hdev, if (ret) goto free_irq_domain; + mutex_lock(&hbus->state_lock); + ret = hv_pci_enter_d0(hdev); if (ret) - goto free_irq_domain; + goto release_state_lock; ret = hv_pci_allocate_bridge_windows(hbus); if (ret) @@ -3765,12 +3779,15 @@ static int hv_pci_probe(struct hv_device *hdev, if (ret) goto free_windows; + mutex_unlock(&hbus->state_lock); return 0; free_windows: hv_pci_free_bridge_windows(hbus); exit_d0: (void) hv_pci_bus_exit(hdev, true); +release_state_lock: + mutex_unlock(&hbus->state_lock); free_irq_domain: irq_domain_remove(hbus->irq_domain); free_fwnode: @@ -4020,20 +4037,26 @@ static int hv_pci_resume(struct hv_device *hdev) if (ret) goto out; + mutex_lock(&hbus->state_lock); + ret = hv_pci_enter_d0(hdev); if (ret) - goto out; + goto release_state_lock; ret = hv_send_resources_allocated(hdev); if (ret) - goto out; + goto release_state_lock; prepopulate_bars(hbus); hv_pci_restore_msi_state(hbus); hbus->state = hv_pcibus_installed; + mutex_unlock(&hbus->state_lock); return 0; + +release_state_lock: + mutex_unlock(&hbus->state_lock); out: vmbus_close(hdev->channel); return ret; -- cgit v1.2.3 From f593a94b530aee4c7f2511c9e48eb495dff03991 Mon Sep 17 00:00:00 2001 From: Juerg Haefliger Date: Fri, 16 Jun 2023 14:18:07 +0200 Subject: ieee802154/adf7242: Add MODULE_FIRMWARE macro The module loads firmware so add a MODULE_FIRMWARE macro to provide that information via modinfo. Signed-off-by: Juerg Haefliger Signed-off-by: David S. Miller --- drivers/net/ieee802154/adf7242.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ieee802154/adf7242.c b/drivers/net/ieee802154/adf7242.c index f9972b8140f9..a03490ba2e5b 100644 --- a/drivers/net/ieee802154/adf7242.c +++ b/drivers/net/ieee802154/adf7242.c @@ -1348,3 +1348,5 @@ module_spi_driver(adf7242_driver); MODULE_AUTHOR("Michael Hennerich "); MODULE_DESCRIPTION("ADF7242 IEEE802.15.4 Transceiver Driver"); MODULE_LICENSE("GPL"); + +MODULE_FIRMWARE(FIRMWARE); -- cgit v1.2.3 From eb09fc2d14163c0c217846cfabec3d0cce7c8f8c Mon Sep 17 00:00:00 2001 From: Juerg Haefliger Date: Fri, 16 Jun 2023 14:22:18 +0200 Subject: nfc: fdp: Add MODULE_FIRMWARE macros The module loads firmware so add MODULE_FIRMWARE macros to provide that information via modinfo. Signed-off-by: Juerg Haefliger Reviewed-by: Krzysztof Kozlowski Signed-off-by: David S. 
Miller --- drivers/nfc/fdp/fdp.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/nfc/fdp/fdp.c b/drivers/nfc/fdp/fdp.c index f12f903a9dd1..da3e2dce8e70 100644 --- a/drivers/nfc/fdp/fdp.c +++ b/drivers/nfc/fdp/fdp.c @@ -762,3 +762,6 @@ EXPORT_SYMBOL(fdp_nci_remove); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("NFC NCI driver for Intel Fields Peak NFC controller"); MODULE_AUTHOR("Robert Dolca "); + +MODULE_FIRMWARE(FDP_OTP_PATCH_NAME); +MODULE_FIRMWARE(FDP_RAM_PATCH_NAME); -- cgit v1.2.3 From 606c812eb1d5b5fb0dd9e330ca94b52d7c227830 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Sat, 17 Jun 2023 20:47:08 -0400 Subject: mm/mmap: Fix error path in do_vmi_align_munmap() The error unrolling was leaving the VMAs detached in many cases and leaving the locked_vm statistic altered, and skipping the unrolling entirely in the case of the vma tree write failing. Fix the error path by re-attaching the detached VMAs and adding the necessary goto for the failed vma tree write, and fix the locked_vm statistic by only updating after the vma tree write succeeds. Fixes: 763ecb035029 ("mm: remove the vma linked list") Reported-by: Vegard Nossum Signed-off-by: Liam R. Howlett Signed-off-by: Linus Torvalds --- mm/mmap.c | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 13678edaa22c..d600404580b2 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2318,21 +2318,6 @@ int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, return __split_vma(vmi, vma, addr, new_below); } -static inline int munmap_sidetree(struct vm_area_struct *vma, - struct ma_state *mas_detach) -{ - vma_start_write(vma); - mas_set_range(mas_detach, vma->vm_start, vma->vm_end - 1); - if (mas_store_gfp(mas_detach, vma, GFP_KERNEL)) - return -ENOMEM; - - vma_mark_detached(vma, true); - if (vma->vm_flags & VM_LOCKED) - vma->vm_mm->locked_vm -= vma_pages(vma); - - return 0; -} - /* * do_vmi_align_munmap() - munmap the aligned region from @start to @end. * @vmi: The vma iterator @@ -2354,6 +2339,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, struct maple_tree mt_detach; int count = 0; int error = -ENOMEM; + unsigned long locked_vm = 0; MA_STATE(mas_detach, &mt_detach, 0, 0); mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK); mt_set_external_lock(&mt_detach, &mm->mmap_lock); @@ -2399,9 +2385,13 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, if (error) goto end_split_failed; } - error = munmap_sidetree(next, &mas_detach); - if (error) - goto munmap_sidetree_failed; + vma_start_write(next); + mas_set_range(&mas_detach, next->vm_start, next->vm_end - 1); + if (mas_store_gfp(&mas_detach, next, GFP_KERNEL)) + goto munmap_gather_failed; + vma_mark_detached(next, true); + if (next->vm_flags & VM_LOCKED) + locked_vm += vma_pages(next); count++; #ifdef CONFIG_DEBUG_VM_MAPLE_TREE @@ -2447,10 +2437,12 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, } #endif /* Point of no return */ + error = -ENOMEM; vma_iter_set(vmi, start); if (vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL)) - return -ENOMEM; + goto clear_tree_failed; + mm->locked_vm -= locked_vm; mm->map_count -= count; /* * Do not downgrade mmap_lock if we are next to VM_GROWSDOWN or @@ -2480,9 +2472,14 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, validate_mm(mm); return downgrade ? 
1 : 0; +clear_tree_failed: userfaultfd_error: -munmap_sidetree_failed: +munmap_gather_failed: end_split_failed: + mas_set(&mas_detach, 0); + mas_for_each(&mas_detach, next, end) + vma_mark_detached(next, false); + __mt_destroy(&mt_detach); start_split_failed: map_count_exceeded: -- cgit v1.2.3 From c938ab4da0eb1620ae3243b0b24c572ddfc318fc Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sat, 17 Jun 2023 17:55:00 +0200 Subject: net: phy: Manual remove LEDs to ensure correct ordering If the core is left to remove the LEDs via devm_, it is performed too late, after the PHY driver is removed from the PHY. This results in dereferencing a NULL pointer when the LED core tries to turn the LED off before destroying the LED. Manually unregister the LEDs at a safe point in phy_remove. Cc: stable@vger.kernel.org Reported-by: Florian Fainelli Suggested-by: Florian Fainelli Fixes: 01e5b728e9e4 ("net: phy: Add a binding for PHY LEDs") Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/phy_device.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 17d0d0555a79..53598210be6c 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -3021,6 +3021,15 @@ static int phy_led_blink_set(struct led_classdev *led_cdev, return err; } +static void phy_leds_unregister(struct phy_device *phydev) +{ + struct phy_led *phyled; + + list_for_each_entry(phyled, &phydev->leds, list) { + led_classdev_unregister(&phyled->led_cdev); + } +} + static int of_phy_led(struct phy_device *phydev, struct device_node *led) { @@ -3054,7 +3063,7 @@ static int of_phy_led(struct phy_device *phydev, init_data.fwnode = of_fwnode_handle(led); init_data.devname_mandatory = true; - err = devm_led_classdev_register_ext(dev, cdev, &init_data); + err = led_classdev_register_ext(dev, cdev, &init_data); if (err) return err; @@ -3083,6 +3092,7 @@ static int of_phy_leds(struct phy_device *phydev) err = of_phy_led(phydev, led); if (err) { of_node_put(led); + phy_leds_unregister(phydev); return err; } } @@ -3305,6 +3315,9 @@ static int phy_remove(struct device *dev) cancel_delayed_work_sync(&phydev->state_queue); + if (IS_ENABLED(CONFIG_PHYLIB_LEDS)) + phy_leds_unregister(phydev); + phydev->state = PHY_DOWN; sfp_bus_del_upstream(phydev->sfp_bus); -- cgit v1.2.3 From 45a3e24f65e90a047bef86f927ebdc4c710edaa1 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 18 Jun 2023 14:06:27 -0700 Subject: Linux 6.4-rc7 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 0d3a9d3e73c1..b68b43c19072 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 4 SUBLEVEL = 0 -EXTRAVERSION = -rc6 +EXTRAVERSION = -rc7 NAME = Hurr durr I'ma ninja sloth # *DOCUMENTATION* -- cgit v1.2.3 From 92717c2356cb62c89e8a3dc37cbbab2502562524 Mon Sep 17 00:00:00 2001 From: Stefan Wahren Date: Wed, 14 Jun 2023 23:06:56 +0200 Subject: net: qca_spi: Avoid high load if QCA7000 is not available In case the QCA7000 is not available via SPI (e.g. in reset), the driver will cause a high load. The reason for this is that the synchronization is never finished and schedule() is never called. Since the synchronization is not timing critical, it's safe to drop this from the scheduling condition. Signed-off-by: Stefan Wahren Fixes: 291ab06ecf67 ("net: qualcomm: new Ethernet over SPI driver for QCA7000") Signed-off-by: David S. 
Miller --- drivers/net/ethernet/qualcomm/qca_spi.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/qualcomm/qca_spi.c b/drivers/net/ethernet/qualcomm/qca_spi.c index c865a4be05ee..4a1b94e5a8ea 100644 --- a/drivers/net/ethernet/qualcomm/qca_spi.c +++ b/drivers/net/ethernet/qualcomm/qca_spi.c @@ -582,8 +582,7 @@ qcaspi_spi_thread(void *data) while (!kthread_should_stop()) { set_current_state(TASK_INTERRUPTIBLE); if ((qca->intr_req == qca->intr_svc) && - (qca->txr.skb[qca->txr.head] == NULL) && - (qca->sync == QCASPI_SYNC_READY)) + !qca->txr.skb[qca->txr.head]) schedule(); set_current_state(TASK_RUNNING); -- cgit v1.2.3 From f334ad47683606b682b4166b800d8b372d315436 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Sat, 17 Jun 2023 16:53:19 +0800 Subject: mmc: litex_mmc: set PROBE_PREFER_ASYNCHRONOUS mmc host drivers should have enabled the asynchronous probe option, but it seems like we didn't set it for litex_mmc when introducing litex mmc support, so let's set it now. Tested with linux-on-litex-vexriscv on sipeed tang nano 20K fpga. Signed-off-by: Jisheng Zhang Acked-by: Gabriel Somlo Fixes: 92e099104729 ("mmc: Add driver for LiteX's LiteSDCard interface") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20230617085319.2139-1-jszhang@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/litex_mmc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/mmc/host/litex_mmc.c b/drivers/mmc/host/litex_mmc.c index 39c6707fdfdb..9af6b0902efe 100644 --- a/drivers/mmc/host/litex_mmc.c +++ b/drivers/mmc/host/litex_mmc.c @@ -649,6 +649,7 @@ static struct platform_driver litex_mmc_driver = { .driver = { .name = "litex-mmc", .of_match_table = litex_match, + .probe_type = PROBE_PREFER_ASYNCHRONOUS, }, }; module_platform_driver(litex_mmc_driver); -- cgit v1.2.3 From 71150ac12558bcd9d75e6e24cf7c872c2efd80f3 Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Sat, 17 Jun 2023 23:36:11 +0300 Subject: mmc: bcm2835: fix deferred probing The driver overrides the error codes and IRQ0 returned by platform_get_irq() to -EINVAL, so if it returns -EPROBE_DEFER, the driver will fail the probe permanently instead of the deferred probing. Switch to propagating the error codes upstream. Since commit ce753ad1549c ("platform: finally disallow IRQ0 in platform_get_irq() and its ilk") IRQ0 is no longer returned by those APIs, so we now can safely ignore it... 
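The canonical deferred-probe-friendly pattern, sketched here for a generic probe function (the change below has the same shape, just with the driver's goto-based cleanup):

	irq = platform_get_irq(pdev, 0);
	if (irq < 0)
		return irq;	/* may be -EPROBE_DEFER; pass it up the probe path */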
Fixes: 660fc733bd74 ("mmc: bcm2835: Add new driver for the sdhost controller.") Cc: stable@vger.kernel.org # v5.19+ Signed-off-by: Sergey Shtylyov Link: https://lore.kernel.org/r/20230617203622.6812-2-s.shtylyov@omp.ru Signed-off-by: Ulf Hansson --- drivers/mmc/host/bcm2835.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/mmc/host/bcm2835.c b/drivers/mmc/host/bcm2835.c index 8648f7e63ca1..eea208856ce0 100644 --- a/drivers/mmc/host/bcm2835.c +++ b/drivers/mmc/host/bcm2835.c @@ -1403,8 +1403,8 @@ static int bcm2835_probe(struct platform_device *pdev) host->max_clk = clk_get_rate(clk); host->irq = platform_get_irq(pdev, 0); - if (host->irq <= 0) { - ret = -EINVAL; + if (host->irq < 0) { + ret = host->irq; goto err; } -- cgit v1.2.3 From b8ada54fa1b83f3b6480d4cced71354301750153 Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Sat, 17 Jun 2023 23:36:12 +0300 Subject: mmc: meson-gx: fix deferred probing The driver overrides the error codes and IRQ0 returned by platform_get_irq() to -EINVAL, so if it returns -EPROBE_DEFER, the driver will fail the probe permanently instead of the deferred probing. Switch to propagating the error codes upstream. Since commit ce753ad1549c ("platform: finally disallow IRQ0 in platform_get_irq() and its ilk") IRQ0 is no longer returned by those APIs, so we now can safely ignore it... Fixes: cbcaac6d7dd2 ("mmc: meson-gx-mmc: Fix platform_get_irq's error checking") Cc: stable@vger.kernel.org # v5.19+ Signed-off-by: Sergey Shtylyov Reviewed-by: Neil Armstrong Link: https://lore.kernel.org/r/20230617203622.6812-3-s.shtylyov@omp.ru Signed-off-by: Ulf Hansson --- drivers/mmc/host/meson-gx-mmc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/mmc/host/meson-gx-mmc.c b/drivers/mmc/host/meson-gx-mmc.c index f90b0fd8d8b0..ee9a25b900ae 100644 --- a/drivers/mmc/host/meson-gx-mmc.c +++ b/drivers/mmc/host/meson-gx-mmc.c @@ -1186,8 +1186,8 @@ static int meson_mmc_probe(struct platform_device *pdev) return PTR_ERR(host->regs); host->irq = platform_get_irq(pdev, 0); - if (host->irq <= 0) - return -EINVAL; + if (host->irq < 0) + return host->irq; cd_irq = platform_get_irq_optional(pdev, 1); mmc_gpio_set_cd_irq(mmc, cd_irq); -- cgit v1.2.3 From 0c4dc0f054891a2cbde0426b0c0fdf232d89f47f Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Sat, 17 Jun 2023 23:36:13 +0300 Subject: mmc: mtk-sd: fix deferred probing The driver overrides the error codes returned by platform_get_irq() to -EINVAL, so if it returns -EPROBE_DEFER, the driver will fail the probe permanently instead of the deferred probing. Switch to propagating the error codes upstream. 
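A related idiom, not used by this patch but common in newer code, is dev_err_probe(), which returns the error unchanged while staying quiet for -EPROBE_DEFER (recording it as the deferral reason) and logging everything else:

	host->irq = platform_get_irq(pdev, 0);
	if (host->irq < 0)
		return dev_err_probe(&pdev->dev, host->irq, "failed to get IRQ\n");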
Fixes: 208489032bdd ("mmc: mediatek: Add Mediatek MMC driver") Signed-off-by: Sergey Shtylyov Link: https://lore.kernel.org/r/20230617203622.6812-4-s.shtylyov@omp.ru Signed-off-by: Ulf Hansson --- drivers/mmc/host/mtk-sd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/mtk-sd.c b/drivers/mmc/host/mtk-sd.c index edade0e54a0c..9785ec91654f 100644 --- a/drivers/mmc/host/mtk-sd.c +++ b/drivers/mmc/host/mtk-sd.c @@ -2680,7 +2680,7 @@ static int msdc_drv_probe(struct platform_device *pdev) host->irq = platform_get_irq(pdev, 0); if (host->irq < 0) { - ret = -EINVAL; + ret = host->irq; goto host_free; } -- cgit v1.2.3 From 8d84064da0d4672e74f984e8710f27881137472c Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Sat, 17 Jun 2023 23:36:14 +0300 Subject: mmc: mvsdio: fix deferred probing The driver overrides the error codes returned by platform_get_irq() to -ENXIO, so if it returns -EPROBE_DEFER, the driver will fail the probe permanently instead of the deferred probing. Switch to propagating the error codes upstream. Fixes: 9ec36cafe43b ("of/irq: do irq resolution in platform_get_irq") Signed-off-by: Sergey Shtylyov Link: https://lore.kernel.org/r/20230617203622.6812-5-s.shtylyov@omp.ru Signed-off-by: Ulf Hansson --- drivers/mmc/host/mvsdio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/mvsdio.c b/drivers/mmc/host/mvsdio.c index 629efbe639c4..b4f6a0a2fcb5 100644 --- a/drivers/mmc/host/mvsdio.c +++ b/drivers/mmc/host/mvsdio.c @@ -704,7 +704,7 @@ static int mvsd_probe(struct platform_device *pdev) } irq = platform_get_irq(pdev, 0); if (irq < 0) - return -ENXIO; + return irq; mmc = mmc_alloc_host(sizeof(struct mvsd_host), &pdev->dev); if (!mmc) { -- cgit v1.2.3 From aedf4ba1ad00aaa94c1b66c73ecaae95e2564b95 Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Sat, 17 Jun 2023 23:36:15 +0300 Subject: mmc: omap: fix deferred probing The driver overrides the error codes returned by platform_get_irq() to -ENXIO, so if it returns -EPROBE_DEFER, the driver will fail the probe permanently instead of the deferred probing. Switch to propagating the error codes upstream. Fixes: 9ec36cafe43b ("of/irq: do irq resolution in platform_get_irq") Signed-off-by: Sergey Shtylyov Link: https://lore.kernel.org/r/20230617203622.6812-6-s.shtylyov@omp.ru Signed-off-by: Ulf Hansson --- drivers/mmc/host/omap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/omap.c b/drivers/mmc/host/omap.c index ce78edfb402b..86454f1182bb 100644 --- a/drivers/mmc/host/omap.c +++ b/drivers/mmc/host/omap.c @@ -1343,7 +1343,7 @@ static int mmc_omap_probe(struct platform_device *pdev) irq = platform_get_irq(pdev, 0); if (irq < 0) - return -ENXIO; + return irq; host->virt_base = devm_platform_get_and_ioremap_resource(pdev, 0, &res); if (IS_ERR(host->virt_base)) -- cgit v1.2.3 From fb51b74a57859b707c3e8055ed0c25a7ca4f6a29 Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Sat, 17 Jun 2023 23:36:16 +0300 Subject: mmc: omap_hsmmc: fix deferred probing The driver overrides the error codes returned by platform_get_irq() to -ENXIO, so if it returns -EPROBE_DEFER, the driver will fail the probe permanently instead of the deferred probing. Switch to propagating the error codes upstream. 
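Here the culprit was a combined check, if (res == NULL || irq < 0) return -ENXIO;, which collapsed both failure modes into -ENXIO. Splitting it, as the patch below does, lets the IRQ error code (including -EPROBE_DEFER) propagate:

	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
	if (!res)
		return -ENXIO;
	irq = platform_get_irq(pdev, 0);
	if (irq < 0)
		return irq;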
Fixes: 9ec36cafe43b ("of/irq: do irq resolution in platform_get_irq") Signed-off-by: Sergey Shtylyov Link: https://lore.kernel.org/r/20230617203622.6812-7-s.shtylyov@omp.ru Signed-off-by: Ulf Hansson --- drivers/mmc/host/omap_hsmmc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/mmc/host/omap_hsmmc.c b/drivers/mmc/host/omap_hsmmc.c index 517dde777413..1e0f2d7774bd 100644 --- a/drivers/mmc/host/omap_hsmmc.c +++ b/drivers/mmc/host/omap_hsmmc.c @@ -1791,9 +1791,11 @@ static int omap_hsmmc_probe(struct platform_device *pdev) } res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - irq = platform_get_irq(pdev, 0); - if (res == NULL || irq < 0) + if (!res) return -ENXIO; + irq = platform_get_irq(pdev, 0); + if (irq < 0) + return irq; base = devm_ioremap_resource(&pdev->dev, res); if (IS_ERR(base)) -- cgit v1.2.3 From 3c482e1e830d79b9be8afb900a965135c01f7893 Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Sat, 17 Jun 2023 23:36:17 +0300 Subject: mmc: owl: fix deferred probing The driver overrides the error codes returned by platform_get_irq() to -EINVAL, so if it returns -EPROBE_DEFER, the driver will fail the probe permanently instead of the deferred probing. Switch to propagating the error codes upstream. Fixes: ff65ffe46d28 ("mmc: Add Actions Semi Owl SoCs SD/MMC driver") Signed-off-by: Sergey Shtylyov Link: https://lore.kernel.org/r/20230617203622.6812-8-s.shtylyov@omp.ru Signed-off-by: Ulf Hansson --- drivers/mmc/host/owl-mmc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/owl-mmc.c b/drivers/mmc/host/owl-mmc.c index 6f9d31a886ba..1bf22b08b373 100644 --- a/drivers/mmc/host/owl-mmc.c +++ b/drivers/mmc/host/owl-mmc.c @@ -637,7 +637,7 @@ static int owl_mmc_probe(struct platform_device *pdev) owl_host->irq = platform_get_irq(pdev, 0); if (owl_host->irq < 0) { - ret = -EINVAL; + ret = owl_host->irq; goto err_release_channel; } -- cgit v1.2.3 From b465dea5e1540c7d7b5211adaf94926980d3014b Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Sat, 17 Jun 2023 23:36:18 +0300 Subject: mmc: sdhci-acpi: fix deferred probing The driver overrides the error codes returned by platform_get_irq() to -EINVAL, so if it returns -EPROBE_DEFER, the driver will fail the probe permanently instead of the deferred probing. Switch to propagating the error codes upstream. Fixes: 1b7ba57ecc86 ("mmc: sdhci-acpi: Handle return value of platform_get_irq") Signed-off-by: Sergey Shtylyov Acked-by: Adrian Hunter Link: https://lore.kernel.org/r/20230617203622.6812-9-s.shtylyov@omp.ru Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-acpi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/sdhci-acpi.c b/drivers/mmc/host/sdhci-acpi.c index 8f0e639236b1..edf2e6c14dc6 100644 --- a/drivers/mmc/host/sdhci-acpi.c +++ b/drivers/mmc/host/sdhci-acpi.c @@ -829,7 +829,7 @@ static int sdhci_acpi_probe(struct platform_device *pdev) host->ops = &sdhci_acpi_ops_dflt; host->irq = platform_get_irq(pdev, 0); if (host->irq < 0) { - err = -EINVAL; + err = host->irq; goto err_free; } -- cgit v1.2.3 From 8d0caeedcd05a721f3cc2537b0ea212ec4027307 Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Sat, 17 Jun 2023 23:36:19 +0300 Subject: mmc: sdhci-spear: fix deferred probing The driver overrides the error codes and IRQ0 returned by platform_get_irq() to -EINVAL, so if it returns -EPROBE_DEFER, the driver will fail the probe permanently instead of the deferred probing. Switch to propagating the error codes upstream. 
Since commit ce753ad1549c ("platform: finally disallow IRQ0 in platform_get_irq() and its ilk") IRQ0 is no longer returned by those APIs, so we now can safely ignore it... Fixes: 682798a596a6 ("mmc: sdhci-spear: Handle return value of platform_get_irq") Cc: stable@vger.kernel.org # v5.19+ Signed-off-by: Sergey Shtylyov Acked-by: Viresh Kumar Acked-by: Adrian Hunter Link: https://lore.kernel.org/r/20230617203622.6812-10-s.shtylyov@omp.ru Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-spear.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/mmc/host/sdhci-spear.c b/drivers/mmc/host/sdhci-spear.c index d463e2fd5b1a..c79035727b20 100644 --- a/drivers/mmc/host/sdhci-spear.c +++ b/drivers/mmc/host/sdhci-spear.c @@ -65,8 +65,8 @@ static int sdhci_probe(struct platform_device *pdev) host->hw_name = "sdhci"; host->ops = &sdhci_pltfm_ops; host->irq = platform_get_irq(pdev, 0); - if (host->irq <= 0) { - ret = -EINVAL; + if (host->irq < 0) { + ret = host->irq; goto err_host; } host->quirks = SDHCI_QUIRK_BROKEN_ADMA; -- cgit v1.2.3 From 5b067d7f855c61df7f8e2e8ccbcee133c282415e Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Sat, 17 Jun 2023 23:36:20 +0300 Subject: mmc: sh_mmcif: fix deferred probing The driver overrides the error codes returned by platform_get_irq() to -ENXIO, so if it returns -EPROBE_DEFER, the driver will fail the probe permanently instead of the deferred probing. Switch to propagating the error codes upstream. Fixes: 9ec36cafe43b ("of/irq: do irq resolution in platform_get_irq") Signed-off-by: Sergey Shtylyov Link: https://lore.kernel.org/r/20230617203622.6812-11-s.shtylyov@omp.ru Signed-off-by: Ulf Hansson --- drivers/mmc/host/sh_mmcif.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/sh_mmcif.c b/drivers/mmc/host/sh_mmcif.c index 0fd4c9d644dd..5cf53348372a 100644 --- a/drivers/mmc/host/sh_mmcif.c +++ b/drivers/mmc/host/sh_mmcif.c @@ -1400,7 +1400,7 @@ static int sh_mmcif_probe(struct platform_device *pdev) irq[0] = platform_get_irq(pdev, 0); irq[1] = platform_get_irq_optional(pdev, 1); if (irq[0] < 0) - return -ENXIO; + return irq[0]; reg = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(reg)) -- cgit v1.2.3 From c2df53c5806cfd746dae08e07bc8c4ad247c3b70 Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Sat, 17 Jun 2023 23:36:21 +0300 Subject: mmc: sunxi: fix deferred probing The driver overrides the error codes and IRQ0 returned by platform_get_irq() to -EINVAL, so if it returns -EPROBE_DEFER, the driver will fail the probe permanently instead of the deferred probing. Switch to propagating the error codes upstream. Since commit ce753ad1549c ("platform: finally disallow IRQ0 in platform_get_irq() and its ilk") IRQ0 is no longer returned by those APIs, so we now can safely ignore it... 
Fixes: 2408a08583d2 ("mmc: sunxi-mmc: Handle return value of platform_get_irq") Cc: stable@vger.kernel.org # v5.19+ Signed-off-by: Sergey Shtylyov Reviewed-by: Jernej Skrabec Link: https://lore.kernel.org/r/20230617203622.6812-12-s.shtylyov@omp.ru Signed-off-by: Ulf Hansson --- drivers/mmc/host/sunxi-mmc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/mmc/host/sunxi-mmc.c b/drivers/mmc/host/sunxi-mmc.c index 3db9f32d6a7b..69dcb8805e05 100644 --- a/drivers/mmc/host/sunxi-mmc.c +++ b/drivers/mmc/host/sunxi-mmc.c @@ -1350,8 +1350,8 @@ static int sunxi_mmc_resource_request(struct sunxi_mmc_host *host, return ret; host->irq = platform_get_irq(pdev, 0); - if (host->irq <= 0) { - ret = -EINVAL; + if (host->irq < 0) { + ret = host->irq; goto error_disable_mmc; } -- cgit v1.2.3 From 413db499730248431c1005b392e8ed82c4fa19bf Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Sat, 17 Jun 2023 23:36:22 +0300 Subject: mmc: usdhi60rol0: fix deferred probing The driver overrides the error codes returned by platform_get_irq_byname() to -ENODEV, so if it returns -EPROBE_DEFER, the driver will fail the probe permanently instead of the deferred probing. Switch to propagating error codes upstream. Fixes: 9ec36cafe43b ("of/irq: do irq resolution in platform_get_irq") Signed-off-by: Sergey Shtylyov Link: https://lore.kernel.org/r/20230617203622.6812-13-s.shtylyov@omp.ru Signed-off-by: Ulf Hansson --- drivers/mmc/host/usdhi6rol0.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/mmc/host/usdhi6rol0.c b/drivers/mmc/host/usdhi6rol0.c index 2f59917b105e..2e17903658fc 100644 --- a/drivers/mmc/host/usdhi6rol0.c +++ b/drivers/mmc/host/usdhi6rol0.c @@ -1757,8 +1757,10 @@ static int usdhi6_probe(struct platform_device *pdev) irq_cd = platform_get_irq_byname(pdev, "card detect"); irq_sd = platform_get_irq_byname(pdev, "data"); irq_sdio = platform_get_irq_byname(pdev, "SDIO"); - if (irq_sd < 0 || irq_sdio < 0) - return -ENODEV; + if (irq_sd < 0) + return irq_sd; + if (irq_sdio < 0) + return irq_sdio; mmc = mmc_alloc_host(sizeof(struct usdhi6_host), dev); if (!mmc) -- cgit v1.2.3 From a2b6f2ab3e144f8e23666aafeba0e4d9ea4b7975 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 7 Jun 2023 13:41:19 -0700 Subject: afs: Fix dangling folio ref counts in writeback Commit acc8d8588cb7 converted afs_writepages_region() to write back a folio batch. If writeback needs rescheduling, the function exits without dropping the references to the folios in fbatch. This patch fixes that. 
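The invariant restored here is that every return path out of a filemap_get_folios_tag() loop must drop the references the batch holds. Condensed to its shape (a simplified sketch, not the verbatim afs code):

	folio_batch_init(&fbatch);
	while (filemap_get_folios_tag(mapping, &index, end,
				      PAGECACHE_TAG_DIRTY, &fbatch)) {
		/* ... lock, write back and unlock each folio in the batch ... */
		if (wbc->sync_mode == WB_SYNC_NONE &&
		    (skips >= 5 || need_resched())) {
			folio_batch_release(&fbatch);	/* the missing release */
			return 0;
		}
		folio_batch_release(&fbatch);
		cond_resched();
	}

Without the release on the early return, every folio in the unfinished batch keeps an elevated refcount forever.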
[DH: Moved the added line before the _leave()] Fixes: acc8d8588cb7 ("afs: convert afs_writepages_region() to use filemap_get_folios_tag()") Signed-off-by: Vishal Moola (Oracle) Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://lore.kernel.org/r/20230607204120.89416-1-vishal.moola@gmail.com/ --- fs/afs/write.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/afs/write.c b/fs/afs/write.c index c822d6006033..fd433024070e 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -763,6 +763,7 @@ static int afs_writepages_region(struct address_space *mapping, if (wbc->sync_mode == WB_SYNC_NONE) { if (skips >= 5 || need_resched()) { *_next = start; + folio_batch_release(&fbatch); _leave(" = 0 [%llx]", *_next); return 0; } -- cgit v1.2.3 From 819da022dd007398d0c42ebcd8dbb1b681acea53 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 7 Jun 2023 13:41:20 -0700 Subject: afs: Fix waiting for writeback then skipping folio Commit acc8d8588cb7 converted afs_writepages_region() to write back a folio batch. The function waits for writeback to a folio, but then proceeds to the rest of the batch without trying to write that folio again. This patch fixes that by having it attempt to write the folio again. [DH: Also remove an 'else' that adding a goto makes redundant] Fixes: acc8d8588cb7 ("afs: convert afs_writepages_region() to use filemap_get_folios_tag()") Signed-off-by: Vishal Moola (Oracle) Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://lore.kernel.org/r/20230607204120.89416-2-vishal.moola@gmail.com/ --- fs/afs/write.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/afs/write.c b/fs/afs/write.c index fd433024070e..8750b99c3f56 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -731,6 +731,7 @@ static int afs_writepages_region(struct address_space *mapping, * (changing page->mapping to NULL), or even swizzled * back from swapper_space to tmpfs file mapping */ +try_again: if (wbc->sync_mode != WB_SYNC_NONE) { ret = folio_lock_killable(folio); if (ret < 0) { @@ -757,9 +758,10 @@ static int afs_writepages_region(struct address_space *mapping, #ifdef CONFIG_AFS_FSCACHE folio_wait_fscache(folio); #endif - } else { - start += folio_size(folio); + goto try_again; } + + start += folio_size(folio); if (wbc->sync_mode == WB_SYNC_NONE) { if (skips >= 5 || need_resched()) { *_next = start; -- cgit v1.2.3 From d7fce52fdf96663ddc2eb21afecff3775588612a Mon Sep 17 00:00:00 2001 From: Terin Stock Date: Fri, 9 Jun 2023 22:58:42 +0200 Subject: ipvs: align inner_mac_header for encapsulation When using encapsulation the original packet's headers are copied to the inner headers. This preserves the space for an inner mac header, which is not used by the inner payloads for the encapsulation types supported by IPVS. If a packet is using GUE or GRE encapsulation and needs to be segmented, the flow can be passed to __skb_udp_tunnel_segment() which calculates a negative tunnel header length. A negative tunnel header length causes pskb_may_pull() to fail, dropping the packet. 
This can be observed by attaching probes to ip_vs_in_hook(), __dev_queue_xmit(), and __skb_udp_tunnel_segment(): perf probe --add '__dev_queue_xmit skb->inner_mac_header \ skb->inner_network_header skb->mac_header skb->network_header' perf probe --add '__skb_udp_tunnel_segment:7 tnl_hlen' perf probe -m ip_vs --add 'ip_vs_in_hook skb->inner_mac_header \ skb->inner_network_header skb->mac_header skb->network_header' These probes show the headers and tunnel header length for packets which traverse the IPVS encapsulation path. A TCP packet can be forced into the segmentation path by being smaller than a calculated clamped MSS, but larger than the advertised MSS. probe:ip_vs_in_hook: inner_mac_header=0x0 inner_network_header=0x0 mac_header=0x44 network_header=0x52 probe:ip_vs_in_hook: inner_mac_header=0x44 inner_network_header=0x52 mac_header=0x44 network_header=0x32 probe:dev_queue_xmit: inner_mac_header=0x44 inner_network_header=0x52 mac_header=0x44 network_header=0x32 probe:__skb_udp_tunnel_segment_L7: tnl_hlen=-2 When using veth-based encapsulation, the interfaces are set to be mac-less, which does not preserve space for an inner mac header. This prevents this issue from occurring. In our real-world testing of sending a 32KB file we observed operation time increasing from ~75ms for veth-based encapsulation to over 1.5s using IPVS encapsulation due to retries from dropped packets. This changeset modifies the packet on the encapsulation path in ip_vs_tunnel_xmit() and ip_vs_tunnel_xmit_v6() to remove the inner mac header offset. This fixes UDP segmentation for both encapsulation types, and corrects the inner headers for any IPIP flows that may use it. Fixes: 84c0d5e96f3a ("ipvs: allow tunneling with gue encapsulation") Signed-off-by: Terin Stock Acked-by: Julian Anastasov Acked-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_xmit.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index feb1d7fcb09f..a80b960223e1 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -1207,6 +1207,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, skb->transport_header = skb->network_header; skb_set_inner_ipproto(skb, next_protocol); + skb_set_inner_mac_header(skb, skb_inner_network_offset(skb)); if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { bool check = false; @@ -1349,6 +1350,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, skb->transport_header = skb->network_header; skb_set_inner_ipproto(skb, next_protocol); + skb_set_inner_mac_header(skb, skb_inner_network_offset(skb)); if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { bool check = false; -- cgit v1.2.3 From 7257d930aadcd62d1c7971ab14f3b1126356abdc Mon Sep 17 00:00:00 2001 From: Teresa Remmet Date: Wed, 14 Jun 2023 14:52:40 +0200 Subject: regulator: pca9450: Fix LDO3OUT and LDO4OUT MASK The L3_OUT and L4_OUT bit fields range from bit 0 to bit 4, and thus the mask should be 0x1F instead of 0x0F. 
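As a quick sanity check on the arithmetic: a field covering bits 0..4 is five bits wide, so its mask is (1 << 5) - 1 = 0x1F, while the old 0x0F = (1 << 4) - 1 only reaches bit 3 and silently drops the top selection bit. In terms of the kernel's GENMASK() helper (used here only for illustration; the header spells the masks out as hex):

	GENMASK(4, 0) == 0x1F	/* bits 4..0: matches the L3_OUT/L4_OUT field */
	GENMASK(3, 0) == 0x0F	/* bits 3..0: one bit short */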
Fixes: 0935ff5f1f0a ("regulator: pca9450: add pca9450 pmic driver") Signed-off-by: Teresa Remmet Reviewed-by: Frieder Schrempf Link: https://lore.kernel.org/r/20230614125240.3946519-1-t.remmet@phytec.de Signed-off-by: Mark Brown --- include/linux/regulator/pca9450.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/regulator/pca9450.h b/include/linux/regulator/pca9450.h index 3c01c2bf84f5..505c908dbb81 100644 --- a/include/linux/regulator/pca9450.h +++ b/include/linux/regulator/pca9450.h @@ -196,11 +196,11 @@ enum { /* PCA9450_REG_LDO3_VOLT bits */ #define LDO3_EN_MASK 0xC0 -#define LDO3OUT_MASK 0x0F +#define LDO3OUT_MASK 0x1F /* PCA9450_REG_LDO4_VOLT bits */ #define LDO4_EN_MASK 0xC0 -#define LDO4OUT_MASK 0x0F +#define LDO4OUT_MASK 0x1F /* PCA9450_REG_LDO5_VOLT bits */ #define LDO5L_EN_MASK 0xC0 -- cgit v1.2.3 From 54abe19e00cfcc5a72773d15cd00ed19ab763439 Mon Sep 17 00:00:00 2001 From: Rafael Aquini Date: Tue, 6 Jun 2023 19:36:13 -0400 Subject: writeback: fix dereferencing NULL mapping->host on writeback_page_template When commit 19343b5bdd16 ("mm/page-writeback: introduce tracepoint for wait_on_page_writeback()") repurposed the writeback_dirty_page trace event as a template to create its new wait_on_page_writeback trace event, it ended up opening a window to NULL pointer dereference crashes due to the (infrequent) occurrence of a race where an access to a page in the swap-cache happens concurrently with the moment this page is being written to disk and the tracepoint is enabled: BUG: kernel NULL pointer dereference, address: 0000000000000040 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 800000010ec0a067 P4D 800000010ec0a067 PUD 102353067 PMD 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 1 PID: 1320 Comm: shmem-worker Kdump: loaded Not tainted 6.4.0-rc5+ #13 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS edk2-20230301gitf80f052277c8-1.fc37 03/01/2023 RIP: 0010:trace_event_raw_event_writeback_folio_template+0x76/0xf0 Code: 4d 85 e4 74 5c 49 8b 3c 24 e8 06 98 ee ff 48 89 c7 e8 9e 8b ee ff ba 20 00 00 00 48 89 ef 48 89 c6 e8 fe d4 1a 00 49 8b 04 24 <48> 8b 40 40 48 89 43 28 49 8b 45 20 48 89 e7 48 89 43 30 e8 a2 4d RSP: 0000:ffffaad580b6fb60 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff90e38035c01c RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff90e38035c044 RBP: ffff90e38035c024 R08: 0000000000000002 R09: 0000000000000006 R10: ffff90e38035c02e R11: 0000000000000020 R12: ffff90e380bac000 R13: ffffe3a7456d9200 R14: 0000000000001b81 R15: ffffe3a7456d9200 FS: 00007f2e4e8a15c0(0000) GS:ffff90e3fbc80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000040 CR3: 00000001150c6003 CR4: 0000000000170ee0 Call Trace: ? __die+0x20/0x70 ? page_fault_oops+0x76/0x170 ? kernelmode_fixup_or_oops+0x84/0x110 ? exc_page_fault+0x65/0x150 ? asm_exc_page_fault+0x22/0x30 ? trace_event_raw_event_writeback_folio_template+0x76/0xf0 folio_wait_writeback+0x6b/0x80 shmem_swapin_folio+0x24a/0x500 ? filemap_get_entry+0xe3/0x140 shmem_get_folio_gfp+0x36e/0x7c0 ? find_busiest_group+0x43/0x1a0 shmem_fault+0x76/0x2a0 ? 
__update_load_avg_cfs_rq+0x281/0x2f0 __do_fault+0x33/0x130 do_read_fault+0x118/0x160 do_pte_missing+0x1ed/0x2a0 __handle_mm_fault+0x566/0x630 handle_mm_fault+0x91/0x210 do_user_addr_fault+0x22c/0x740 exc_page_fault+0x65/0x150 asm_exc_page_fault+0x22/0x30 This problem arises from the fact that the repurposed writeback_dirty_page trace event code was written assuming that every pointer to mapping (struct address_space) would come from a file-mapped page-cache object, thus mapping->host would always be populated, and that was a valid case before commit 19343b5bdd16. The swap-cache address space (swapper_spaces), however, doesn't populate its ->host (struct inode) pointer, thus leading to the crashes in the corner-case aforementioned. commit 19343b5bdd16 ended up breaking the assignment of __entry->name and __entry->ino for the wait_on_page_writeback tracepoint -- both dependent on mapping->host carrying a pointer to a valid inode. The assignment of __entry->name was fixed by commit 68f23b89067f ("memcg: fix a crash in wb_workfn when a device disappears"), and this commit fixes the remaining case, for __entry->ino. Link: https://lkml.kernel.org/r/20230606233613.1290819-1-aquini@redhat.com Fixes: 19343b5bdd16 ("mm/page-writeback: introduce tracepoint for wait_on_page_writeback()") Signed-off-by: Rafael Aquini Reviewed-by: Yafang Shao Cc: Aristeu Rozanski Cc: Signed-off-by: Andrew Morton --- include/trace/events/writeback.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 86b2a82da546..54e353c9f919 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -68,7 +68,7 @@ DECLARE_EVENT_CLASS(writeback_folio_template, strscpy_pad(__entry->name, bdi_dev_name(mapping ? inode_to_bdi(mapping->host) : NULL), 32); - __entry->ino = mapping ? mapping->host->i_ino : 0; + __entry->ino = (mapping && mapping->host) ? mapping->host->i_ino : 0; __entry->index = folio->index; ), -- cgit v1.2.3 From 77795f900e2a07c1cbedc375789aefb43843b6c2 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Jun 2023 14:29:12 -0400 Subject: mm/mprotect: fix do_mprotect_pkey() limit check The return of do_mprotect_pkey() can still be incorrectly returned as success if there is a gap that spans to or beyond the end address passed in. Update the check to ensure that the end address has indeed been seen. Link: https://lore.kernel.org/all/CABi2SkXjN+5iFoBhxk71t3cmunTk-s=rB4T7qo0UQRh17s49PQ@mail.gmail.com/ Link: https://lkml.kernel.org/r/20230606182912.586576-1-Liam.Howlett@oracle.com Fixes: 82f951340f25 ("mm/mprotect: fix do_mprotect_pkey() return on error") Signed-off-by: Liam R. 
Howlett Reported-by: Jeff Xu Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand Acked-by: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- mm/mprotect.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mprotect.c b/mm/mprotect.c index 92d3d3ca390a..c59e7561698c 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -867,7 +867,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, } tlb_finish_mmu(&tlb); - if (!error && vma_iter_end(&vmi) < end) + if (!error && tmp < end) error = -ENOMEM; out: -- cgit v1.2.3 From 95a301eefa82057571207edd06ea36218985a75e Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 5 Jun 2023 21:11:07 +0100 Subject: mm/vmalloc: do not output a spurious warning when huge vmalloc() fails In __vmalloc_area_node() we always warn_alloc() when an allocation performed by vm_area_alloc_pages() fails unless it was due to a pending fatal signal. However, huge page allocations instigated either by vmalloc_huge() or __vmalloc_node_range() (or a caller that invokes this like kvmalloc() or kvmalloc_node()) always fall back to order-0 allocations if the huge page allocation fails. This renders the warning useless and noisy, especially as all callers appear to be aware that this may fall back. This has already resulted in at least one bug report from a user who was confused by this (see link). Therefore, simply update the code to only output this warning for order-0 pages when no fatal signal is pending. Link: https://bugzilla.suse.com/show_bug.cgi?id=1211410 Link: https://lkml.kernel.org/r/20230605201107.83298-1-lstoakes@gmail.com Fixes: 80b1d8fdfad1 ("mm: vmalloc: correct use of __GFP_NOWARN mask in __vmalloc_area_node()") Signed-off-by: Lorenzo Stoakes Acked-by: Vlastimil Babka Reviewed-by: Baoquan He Acked-by: Michal Hocko Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: David Hildenbrand Cc: Christoph Hellwig Cc: Signed-off-by: Andrew Morton --- mm/vmalloc.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 9683573f1225..1d13d71687d7 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3098,11 +3098,20 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, * allocation request, free them via vfree() if any. */ if (area->nr_pages != nr_small_pages) { - /* vm_area_alloc_pages() can also fail due to a fatal signal */ - if (!fatal_signal_pending(current)) + /* + * vm_area_alloc_pages() can fail due to insufficient memory but + * also:- + * + * - a pending fatal signal + * - insufficient huge page-order pages + * + * Since we always retry allocations at order-0 in the huge page + * case a warning for either is spurious. + */ + if (!fatal_signal_pending(current) && page_order == 0) warn_alloc(gfp_mask, NULL, - "vmalloc error: size %lu, page order %u, failed to allocate pages", - area->nr_pages * PAGE_SIZE, page_order); + "vmalloc error: size %lu, failed to allocate pages", + area->nr_pages * PAGE_SIZE); goto fail; } -- cgit v1.2.3 From 935d44acf621aa0688fef8312dec3e5940f38f4e Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Wed, 7 Jun 2023 15:24:27 +0200 Subject: memfd: check for non-NULL file_seals in memfd_create() syscall Ensure that file_seals is non-NULL before using it in the memfd_create() syscall. In particular, memfd_file_seals_ptr() can return a NULL pointer when CONFIG_SHMEM=n, and dereferencing it oopses the kernel. 
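The NULL comes from the helper that maps a memfd's backing file to its seals; paraphrased from mm/memfd.c (a simplified sketch from memory, not the verbatim source):

	static unsigned int *memfd_file_seals_ptr(struct file *file)
	{
		if (shmem_file(file))
			return &SHMEM_I(file_inode(file))->seals;
	#ifdef CONFIG_HUGETLBFS
		if (is_file_hugepages(file))
			return &HUGETLBFS_I(file_inode(file))->seals;
	#endif
		return NULL;	/* e.g. CONFIG_SHMEM=n, where memfd is ramfs-backed */
	}

With CONFIG_SHMEM=n neither branch matches a plain memfd, so the unguarded *file_seals dereference hits a NULL pointer.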
Link: https://lkml.kernel.org/r/20230607132427.2867435-1-roberto.sassu@huaweicloud.com Fixes: 47b9012ecdc7 ("shmem: add sealing support to hugetlb-backed memfd") Signed-off-by: Roberto Sassu Cc: Marc-André Lureau Cc: Mike Kravetz Cc: Signed-off-by: Andrew Morton --- mm/memfd.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/mm/memfd.c b/mm/memfd.c index 69b90c31d38c..e763e76f1106 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -371,12 +371,15 @@ SYSCALL_DEFINE2(memfd_create, inode->i_mode &= ~0111; file_seals = memfd_file_seals_ptr(file); - *file_seals &= ~F_SEAL_SEAL; - *file_seals |= F_SEAL_EXEC; + if (file_seals) { + *file_seals &= ~F_SEAL_SEAL; + *file_seals |= F_SEAL_EXEC; + } } else if (flags & MFD_ALLOW_SEALING) { /* MFD_EXEC and MFD_ALLOW_SEALING are set */ file_seals = memfd_file_seals_ptr(file); - *file_seals &= ~F_SEAL_SEAL; + if (file_seals) + *file_seals &= ~F_SEAL_SEAL; } fd_install(fd, file); -- cgit v1.2.3 From c8a8f3b4a95ace7683b615ad9c9aa0eac59013ae Mon Sep 17 00:00:00 2001 From: David Stevens Date: Wed, 7 Jun 2023 14:31:35 +0900 Subject: mm/khugepaged: fix iteration in collapse_file Remove an unnecessary call to xas_set(index) when iterating over the target range in collapse_file. The extra call to xas_set reset the xas cursor to the top of the tree, causing the xas_next call on the next iteration to walk the tree to index instead of advancing to index+1. This returned the same page again, which would cause collapse_file to fail because the page is already locked. This bug was hidden when CONFIG_DEBUG_VM was set. When that config was used, the xas_load in a subsequent VM_BUG_ON assert would walk xas from the top of the tree to index, causing the xas_next call on the next loop iteration to advance the cursor as expected. Link: https://lkml.kernel.org/r/20230607053135.2087354-1-stevensd@google.com Fixes: a2e17cc2efc7 ("mm/khugepaged: maintain page cache uptodate flag") Signed-off-by: David Stevens Reviewed-by: Peter Xu Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jiaqi Yan Cc: Kirill A. Shutemov Cc: Matthew Wilcox Cc: Yang Shi Cc: Mike Kravetz Signed-off-by: Andrew Morton --- mm/khugepaged.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 6b9d39d65b73..2d0d58fb4e7f 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2070,7 +2070,6 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH); xas_lock_irq(&xas); - xas_set(&xas, index); VM_BUG_ON_PAGE(page != xas_load(&xas), page); -- cgit v1.2.3 From b7cb3821905b79b6ed474fd5ba34d1e187649139 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Thu, 8 Jun 2023 13:49:27 -0700 Subject: udmabuf: revert 'Add support for mapping hugepages (v4)' This effectively reverts commit 16c243e99d33 ("udmabuf: Add support for mapping hugepages (v4)"). Recently, Junxiao Chang found a BUG with page map counting as described here [1]. This issue pointed out that the udmabuf driver was making direct use of subpages of hugetlb pages. This is not a good idea, and no other mm code attempts such use. In addition to the mapcount issue, this also causes issues with hugetlb vmemmap optimization and page poisoning. For now, remove hugetlb support. If udmabuf wants to be used on hugetlb mappings, it should be changed to only use complete hugetlb pages. This will require different alignment and size requirements on the UDMABUF_CREATE API. 
[1] https://lore.kernel.org/linux-mm/20230512072036.1027784-1-junxiao.chang@intel.com/ Link: https://lkml.kernel.org/r/20230608204927.88711-1-mike.kravetz@oracle.com Fixes: 16c243e99d33 ("udmabuf: Add support for mapping hugepages (v4)") Signed-off-by: Mike Kravetz Acked-by: Greg Kroah-Hartman Acked-by: Vivek Kasireddy Acked-by: Gerd Hoffmann Cc: David Hildenbrand Cc: Dongwon Kim Cc: James Houghton Cc: Jerome Marchand Cc: Junxiao Chang Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Muchun Song Cc: Signed-off-by: Andrew Morton --- drivers/dma-buf/udmabuf.c | 47 ++++++----------------------------------------- 1 file changed, 6 insertions(+), 41 deletions(-) diff --git a/drivers/dma-buf/udmabuf.c b/drivers/dma-buf/udmabuf.c index 01f2e86f3f7c..12cf6bb2e3ce 100644 --- a/drivers/dma-buf/udmabuf.c +++ b/drivers/dma-buf/udmabuf.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include @@ -207,9 +206,7 @@ static long udmabuf_create(struct miscdevice *device, struct udmabuf *ubuf; struct dma_buf *buf; pgoff_t pgoff, pgcnt, pgidx, pgbuf = 0, pglimit; - struct page *page, *hpage = NULL; - pgoff_t subpgoff, maxsubpgs; - struct hstate *hpstate; + struct page *page; int seals, ret = -EINVAL; u32 i, flags; @@ -245,7 +242,7 @@ static long udmabuf_create(struct miscdevice *device, if (!memfd) goto err; mapping = memfd->f_mapping; - if (!shmem_mapping(mapping) && !is_file_hugepages(memfd)) + if (!shmem_mapping(mapping)) goto err; seals = memfd_fcntl(memfd, F_GET_SEALS, 0); if (seals == -EINVAL) @@ -256,48 +253,16 @@ static long udmabuf_create(struct miscdevice *device, goto err; pgoff = list[i].offset >> PAGE_SHIFT; pgcnt = list[i].size >> PAGE_SHIFT; - if (is_file_hugepages(memfd)) { - hpstate = hstate_file(memfd); - pgoff = list[i].offset >> huge_page_shift(hpstate); - subpgoff = (list[i].offset & - ~huge_page_mask(hpstate)) >> PAGE_SHIFT; - maxsubpgs = huge_page_size(hpstate) >> PAGE_SHIFT; - } for (pgidx = 0; pgidx < pgcnt; pgidx++) { - if (is_file_hugepages(memfd)) { - if (!hpage) { - hpage = find_get_page_flags(mapping, pgoff, - FGP_ACCESSED); - if (!hpage) { - ret = -EINVAL; - goto err; - } - } - page = hpage + subpgoff; - get_page(page); - subpgoff++; - if (subpgoff == maxsubpgs) { - put_page(hpage); - hpage = NULL; - subpgoff = 0; - pgoff++; - } - } else { - page = shmem_read_mapping_page(mapping, - pgoff + pgidx); - if (IS_ERR(page)) { - ret = PTR_ERR(page); - goto err; - } + page = shmem_read_mapping_page(mapping, pgoff + pgidx); + if (IS_ERR(page)) { + ret = PTR_ERR(page); + goto err; } ubuf->pages[pgbuf++] = page; } fput(memfd); memfd = NULL; - if (hpage) { - put_page(hpage); - hpage = NULL; - } } exp_info.ops = &udmabuf_ops; -- cgit v1.2.3 From 2049a7d0cbc6ac8e370e836ed68597be04a7dc49 Mon Sep 17 00:00:00 2001 From: Prathu Baronia Date: Thu, 8 Jun 2023 21:14:49 +0530 Subject: scripts: fix the gfp flags header path in gfp-translate Since gfp flags have been shifted to gfp_types.h so update the path in the gfp-translate script. 
Link: https://lkml.kernel.org/r/20230608154450.21758-1-prathubaronia2011@gmail.com Fixes: cb5a065b4ea9c ("headers/deps: mm: Split <linux/gfp_types.h> out of <linux/gfp.h>") Signed-off-by: Prathu Baronia Reviewed-by: David Hildenbrand Cc: Masahiro Yamada Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Nicolas Schier Cc: Ingo Molnar Cc: Yury Norov Cc: Signed-off-by: Andrew Morton --- scripts/gfp-translate | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/gfp-translate b/scripts/gfp-translate index b2ce416d944b..6c9aed17cf56 100755 --- a/scripts/gfp-translate +++ b/scripts/gfp-translate @@ -63,11 +63,11 @@ fi # Extract GFP flags from the kernel source TMPFILE=`mktemp -t gfptranslate-XXXXXX` || exit 1 -grep -q ___GFP $SOURCE/include/linux/gfp.h +grep -q ___GFP $SOURCE/include/linux/gfp_types.h if [ $? -eq 0 ]; then - grep "^#define ___GFP" $SOURCE/include/linux/gfp.h | sed -e 's/u$//' | grep -v GFP_BITS > $TMPFILE + grep "^#define ___GFP" $SOURCE/include/linux/gfp_types.h | sed -e 's/u$//' | grep -v GFP_BITS > $TMPFILE else - grep "^#define __GFP" $SOURCE/include/linux/gfp.h | sed -e 's/(__force gfp_t)//' | sed -e 's/u)/)/' | grep -v GFP_BITS | sed -e 's/)\//) \//' > $TMPFILE + grep "^#define __GFP" $SOURCE/include/linux/gfp_types.h | sed -e 's/(__force gfp_t)//' | sed -e 's/u)/)/' | grep -v GFP_BITS | sed -e 's/)\//) \//' > $TMPFILE fi # Parse the flags -- cgit v1.2.3 From 6a59cb5158bff13b80f116305155fbe4967a5010 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 7 Jun 2023 15:13:35 -0700 Subject: scripts/gdb: fix SB_* constants parsing After f15afbd34d8f ("fs: fix undefined behavior in bit shift for SB_NOUSER") the constants were changed from plain integers which LX_VALUE() can parse to constants using the BIT() macro which causes the following: Reading symbols from build/linux-custom/vmlinux...done. Traceback (most recent call last): File "/home/fainelli/work/buildroot/output/arm64/build/linux-custom/vmlinux-gdb.py", line 25, in <module> import linux.constants File "/home/fainelli/work/buildroot/output/arm64/build/linux-custom/scripts/gdb/linux/constants.py", line 5 LX_SB_RDONLY = ((((1UL))) << (0)) Use LX_GDBPARSED() which does not suffer from that issue. 
f15afbd34d8f ("fs: fix undefined behavior in bit shift for SB_NOUSER") Link: https://lkml.kernel.org/r/20230607221337.2781730-1-florian.fainelli@broadcom.com Signed-off-by: Florian Fainelli Acked-by: Christian Brauner Cc: Hao Ge Cc: Jan Kiszka Cc: Kieran Bingham Cc: Luis Chamberlain Cc: Pankaj Raghav Signed-off-by: Andrew Morton --- scripts/gdb/linux/constants.py.in | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/gdb/linux/constants.py.in b/scripts/gdb/linux/constants.py.in index 471300ba176c..50a92c4e9984 100644 --- a/scripts/gdb/linux/constants.py.in +++ b/scripts/gdb/linux/constants.py.in @@ -48,12 +48,12 @@ if IS_BUILTIN(CONFIG_COMMON_CLK): LX_GDBPARSED(CLK_GET_RATE_NOCACHE) /* linux/fs.h */ -LX_VALUE(SB_RDONLY) -LX_VALUE(SB_SYNCHRONOUS) -LX_VALUE(SB_MANDLOCK) -LX_VALUE(SB_DIRSYNC) -LX_VALUE(SB_NOATIME) -LX_VALUE(SB_NODIRATIME) +LX_GDBPARSED(SB_RDONLY) +LX_GDBPARSED(SB_SYNCHRONOUS) +LX_GDBPARSED(SB_MANDLOCK) +LX_GDBPARSED(SB_DIRSYNC) +LX_GDBPARSED(SB_NOATIME) +LX_GDBPARSED(SB_NODIRATIME) /* linux/htimer.h */ LX_GDBPARSED(hrtimer_resolution) -- cgit v1.2.3 From 679bd7ebdd315bf457a4740b306ae99f1d0a403d Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Fri, 9 Jun 2023 12:57:32 +0900 Subject: nilfs2: fix buffer corruption due to concurrent device reads As a result of analysis of a syzbot report, it turned out that in three cases where nilfs2 allocates block device buffers directly via sb_getblk, concurrent reads to the device can corrupt the allocated buffers. Nilfs2 uses sb_getblk for segment summary blocks, that make up a log header, and the super root block, that is the trailer, and when moving and writing the second super block after fs resize. In any of these, since the uptodate flag is not set when storing metadata to be written in the allocated buffers, the stored metadata will be overwritten if a device read of the same block occurs concurrently before the write. This causes metadata corruption and misbehavior in the log write itself, causing warnings in nilfs_btree_assign() as reported. Fix these issues by setting an uptodate flag on the buffer head on the first or before modifying each buffer obtained with sb_getblk, and clearing the flag on failure. When setting the uptodate flag, the lock_buffer/unlock_buffer pair is used to perform necessary exclusive control, and the buffer is filled to ensure that uninitialized bytes are not mixed into the data read from others. As for buffers for segment summary blocks, they are filled incrementally, so if the uptodate flag was unset on their allocation, set the flag and zero fill the buffer once at that point. Also, regarding the superblock move routine, the starting point of the memset call to zerofill the block is incorrectly specified, which can cause a buffer overflow on file systems with block sizes greater than 4KiB. In addition, if the superblock is moved within a large block, it is necessary to assume the possibility that the data in the superblock will be destroyed by zero-filling before copying. So fix these potential issues as well. 
Link: https://lkml.kernel.org/r/20230609035732.20426-1-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Reported-by: syzbot+31837fe952932efc8fb9@syzkaller.appspotmail.com Closes: https://lkml.kernel.org/r/00000000000030000a05e981f475@google.com Tested-by: Ryusuke Konishi Cc: Signed-off-by: Andrew Morton --- fs/nilfs2/segbuf.c | 6 ++++++ fs/nilfs2/segment.c | 7 +++++++ fs/nilfs2/super.c | 23 ++++++++++++++++++++++- 3 files changed, 35 insertions(+), 1 deletion(-) diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 1362ccb64ec7..6e59dc19a732 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -101,6 +101,12 @@ int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *segbuf) if (unlikely(!bh)) return -ENOMEM; + lock_buffer(bh); + if (!buffer_uptodate(bh)) { + memset(bh->b_data, 0, bh->b_size); + set_buffer_uptodate(bh); + } + unlock_buffer(bh); nilfs_segbuf_add_segsum_buffer(segbuf, bh); return 0; } diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index ac949fd7603f..c2553024bd25 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -981,10 +981,13 @@ static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci, unsigned int isz, srsz; bh_sr = NILFS_LAST_SEGBUF(&sci->sc_segbufs)->sb_super_root; + + lock_buffer(bh_sr); raw_sr = (struct nilfs_super_root *)bh_sr->b_data; isz = nilfs->ns_inode_size; srsz = NILFS_SR_BYTES(isz); + raw_sr->sr_sum = 0; /* Ensure initialization within this update */ raw_sr->sr_bytes = cpu_to_le16(srsz); raw_sr->sr_nongc_ctime = cpu_to_le64(nilfs_doing_gc() ? @@ -998,6 +1001,8 @@ static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci, nilfs_write_inode_common(nilfs->ns_sufile, (void *)raw_sr + NILFS_SR_SUFILE_OFFSET(isz), 1); memset((void *)raw_sr + srsz, 0, nilfs->ns_blocksize - srsz); + set_buffer_uptodate(bh_sr); + unlock_buffer(bh_sr); } static void nilfs_redirty_inodes(struct list_head *head) @@ -1780,6 +1785,7 @@ static void nilfs_abort_logs(struct list_head *logs, int err) list_for_each_entry(segbuf, logs, sb_list) { list_for_each_entry(bh, &segbuf->sb_segsum_buffers, b_assoc_buffers) { + clear_buffer_uptodate(bh); if (bh->b_page != bd_page) { if (bd_page) end_page_writeback(bd_page); @@ -1791,6 +1797,7 @@ static void nilfs_abort_logs(struct list_head *logs, int err) b_assoc_buffers) { clear_buffer_async_write(bh); if (bh == segbuf->sb_super_root) { + clear_buffer_uptodate(bh); if (bh->b_page != bd_page) { end_page_writeback(bd_page); bd_page = bh->b_page; diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 77f1e5778d1c..9ba4933087af 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -372,10 +372,31 @@ static int nilfs_move_2nd_super(struct super_block *sb, loff_t sb2off) goto out; } nsbp = (void *)nsbh->b_data + offset; - memset(nsbp, 0, nilfs->ns_blocksize); + lock_buffer(nsbh); if (sb2i >= 0) { + /* + * The position of the second superblock only changes by 4KiB, + * which is larger than the maximum superblock data size + * (= 1KiB), so there is no need to use memmove() to allow + * overlap between source and destination. + */ memcpy(nsbp, nilfs->ns_sbp[sb2i], nilfs->ns_sbsize); + + /* + * Zero fill after copy to avoid overwriting in case of move + * within the same block. 
+ */ + memset(nsbh->b_data, 0, offset); + memset((void *)nsbp + nilfs->ns_sbsize, 0, + nsbh->b_size - offset - nilfs->ns_sbsize); + } else { + memset(nsbh->b_data, 0, nsbh->b_size); + } + set_buffer_uptodate(nsbh); + unlock_buffer(nsbh); + + if (sb2i >= 0) { brelse(nilfs->ns_sbh[sb2i]); nilfs->ns_sbh[sb2i] = nsbh; nilfs->ns_sbp[sb2i] = nsbp; -- cgit v1.2.3 From 47a7c01c3efc6581f5dcca40928baeb38e1e40c2 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Fri, 9 Jun 2023 08:15:12 +0000 Subject: Revert "mm: shrinkers: convert shrinker_rwsem to mutex" Patch series "revert shrinker_srcu related changes". This patch (of 7): This reverts commit cf2e309ebca7bb0916771839f9b580b06c778530. Kernel test robot reports -88.8% regression in stress-ng.ramfs.ops_per_sec test case [1], which is caused by commit f95bdb700bc6 ("mm: vmscan: make global slab shrink lockless"). The root cause is that SRCU has to be careful to not frequently check for SRCU read-side critical section exits. Therefore, even if no one is currently in the SRCU read-side critical section, synchronize_srcu() cannot return quickly. That's why unregister_shrinker() has become slower. After discussion, we will try to use the refcount+RCU method [2] proposed by Dave Chinner to continue to re-implement the lockless slab shrink. So revert the shrinker_mutex back to shrinker_rwsem first. [1]. https://lore.kernel.org/lkml/202305230837.db2c233f-yujie.liu@intel.com/ [2]. https://lore.kernel.org/lkml/ZIJhou1d55d4H1s0@dread.disaster.area/ Link: https://lkml.kernel.org/r/20230609081518.3039120-1-qi.zheng@linux.dev Link: https://lkml.kernel.org/r/20230609081518.3039120-2-qi.zheng@linux.dev Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202305230837.db2c233f-yujie.liu@intel.com Signed-off-by: Qi Zheng Cc: Dave Chinner Cc: Kirill Tkhai Cc: Muchun Song Cc: Roman Gushchin Cc: Vlastimil Babka Cc: Yujie Liu Signed-off-by: Andrew Morton --- drivers/md/dm-cache-metadata.c | 2 +- drivers/md/dm-thin-metadata.c | 2 +- fs/super.c | 2 +- mm/shrinker_debug.c | 14 +++++++------- mm/vmscan.c | 34 +++++++++++++++++----------------- 5 files changed, 27 insertions(+), 27 deletions(-) diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index 9e0c69958587..acffed750e3e 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -1828,7 +1828,7 @@ int dm_cache_metadata_abort(struct dm_cache_metadata *cmd) * Replacement block manager (new_bm) is created and old_bm destroyed outside of * cmd root_lock to avoid ABBA deadlock that would result (due to life-cycle of * shrinker associated with the block manager's bufio client vs cmd root_lock). - * - must take shrinker_mutex without holding cmd->root_lock + * - must take shrinker_rwsem without holding cmd->root_lock */ new_bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE << SECTOR_SHIFT, CACHE_MAX_CONCURRENT_LOCKS); diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 9f5cb52c5763..fd464fb024c3 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -1887,7 +1887,7 @@ int dm_pool_abort_metadata(struct dm_pool_metadata *pmd) * Replacement block manager (new_bm) is created and old_bm destroyed outside of * pmd root_lock to avoid ABBA deadlock that would result (due to life-cycle of * shrinker associated with the block manager's bufio client vs pmd root_lock). 
- * - must take shrinker_mutex without holding pmd->root_lock + * - must take shrinker_rwsem without holding pmd->root_lock */ new_bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT, THIN_MAX_CONCURRENT_LOCKS); diff --git a/fs/super.c b/fs/super.c index 34afe411cf2b..04bc62ab7dfe 100644 --- a/fs/super.c +++ b/fs/super.c @@ -54,7 +54,7 @@ static char *sb_writers_name[SB_FREEZE_LEVELS] = { * One thing we have to be careful of with a per-sb shrinker is that we don't * drop the last active reference to the superblock from within the shrinker. * If that happens we could trigger unregistering the shrinker from within the - * shrinker path and that leads to deadlock on the shrinker_mutex. Hence we + * shrinker path and that leads to deadlock on the shrinker_rwsem. Hence we * take a passive reference to the superblock to avoid this from occurring. */ static unsigned long super_cache_scan(struct shrinker *shrink, diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c index fe10436d9911..2be15b8a6d0b 100644 --- a/mm/shrinker_debug.c +++ b/mm/shrinker_debug.c @@ -8,7 +8,7 @@ #include /* defined in vmscan.c */ -extern struct mutex shrinker_mutex; +extern struct rw_semaphore shrinker_rwsem; extern struct list_head shrinker_list; extern struct srcu_struct shrinker_srcu; @@ -168,7 +168,7 @@ int shrinker_debugfs_add(struct shrinker *shrinker) char buf[128]; int id; - lockdep_assert_held(&shrinker_mutex); + lockdep_assert_held(&shrinker_rwsem); /* debugfs isn't initialized yet, add debugfs entries later. */ if (!shrinker_debugfs_root) @@ -211,7 +211,7 @@ int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...) if (!new) return -ENOMEM; - mutex_lock(&shrinker_mutex); + down_write(&shrinker_rwsem); old = shrinker->name; shrinker->name = new; @@ -229,7 +229,7 @@ int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...) 
shrinker->debugfs_entry = entry; } - mutex_unlock(&shrinker_mutex); + up_write(&shrinker_rwsem); kfree_const(old); @@ -242,7 +242,7 @@ struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker, { struct dentry *entry = shrinker->debugfs_entry; - lockdep_assert_held(&shrinker_mutex); + lockdep_assert_held(&shrinker_rwsem); kfree_const(shrinker->name); shrinker->name = NULL; @@ -271,14 +271,14 @@ static int __init shrinker_debugfs_init(void) shrinker_debugfs_root = dentry; /* Create debugfs entries for shrinkers registered at boot */ - mutex_lock(&shrinker_mutex); + down_write(&shrinker_rwsem); list_for_each_entry(shrinker, &shrinker_list, list) if (!shrinker->debugfs_entry) { ret = shrinker_debugfs_add(shrinker); if (ret) break; } - mutex_unlock(&shrinker_mutex); + up_write(&shrinker_rwsem); return ret; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 6d0cd2840cf0..4730dba253c8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -35,7 +35,7 @@ #include #include #include -#include +#include #include #include #include @@ -190,7 +190,7 @@ struct scan_control { int vm_swappiness = 60; LIST_HEAD(shrinker_list); -DEFINE_MUTEX(shrinker_mutex); +DECLARE_RWSEM(shrinker_rwsem); DEFINE_SRCU(shrinker_srcu); static atomic_t shrinker_srcu_generation = ATOMIC_INIT(0); @@ -213,7 +213,7 @@ static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg, { return srcu_dereference_check(memcg->nodeinfo[nid]->shrinker_info, &shrinker_srcu, - lockdep_is_held(&shrinker_mutex)); + lockdep_is_held(&shrinker_rwsem)); } static struct shrinker_info *shrinker_info_srcu(struct mem_cgroup *memcg, @@ -292,7 +292,7 @@ int alloc_shrinker_info(struct mem_cgroup *memcg) int nid, size, ret = 0; int map_size, defer_size = 0; - mutex_lock(&shrinker_mutex); + down_write(&shrinker_rwsem); map_size = shrinker_map_size(shrinker_nr_max); defer_size = shrinker_defer_size(shrinker_nr_max); size = map_size + defer_size; @@ -308,7 +308,7 @@ int alloc_shrinker_info(struct mem_cgroup *memcg) info->map_nr_max = shrinker_nr_max; rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info); } - mutex_unlock(&shrinker_mutex); + up_write(&shrinker_rwsem); return ret; } @@ -324,7 +324,7 @@ static int expand_shrinker_info(int new_id) if (!root_mem_cgroup) goto out; - lockdep_assert_held(&shrinker_mutex); + lockdep_assert_held(&shrinker_rwsem); map_size = shrinker_map_size(new_nr_max); defer_size = shrinker_defer_size(new_nr_max); @@ -374,7 +374,7 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker) if (mem_cgroup_disabled()) return -ENOSYS; - mutex_lock(&shrinker_mutex); + down_write(&shrinker_rwsem); id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL); if (id < 0) goto unlock; @@ -388,7 +388,7 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker) shrinker->id = id; ret = 0; unlock: - mutex_unlock(&shrinker_mutex); + up_write(&shrinker_rwsem); return ret; } @@ -398,7 +398,7 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker) BUG_ON(id < 0); - lockdep_assert_held(&shrinker_mutex); + lockdep_assert_held(&shrinker_rwsem); idr_remove(&shrinker_idr, id); } @@ -433,7 +433,7 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg) parent = root_mem_cgroup; /* Prevent from concurrent shrinker_info expand */ - mutex_lock(&shrinker_mutex); + down_write(&shrinker_rwsem); for_each_node(nid) { child_info = shrinker_info_protected(memcg, nid); parent_info = shrinker_info_protected(parent, nid); @@ -442,7 +442,7 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg) atomic_long_add(nr, 
&parent_info->nr_deferred[i]); } } - mutex_unlock(&shrinker_mutex); + up_write(&shrinker_rwsem); } static bool cgroup_reclaim(struct scan_control *sc) @@ -743,9 +743,9 @@ void free_prealloced_shrinker(struct shrinker *shrinker) shrinker->name = NULL; #endif if (shrinker->flags & SHRINKER_MEMCG_AWARE) { - mutex_lock(&shrinker_mutex); + down_write(&shrinker_rwsem); unregister_memcg_shrinker(shrinker); - mutex_unlock(&shrinker_mutex); + up_write(&shrinker_rwsem); return; } @@ -755,11 +755,11 @@ void free_prealloced_shrinker(struct shrinker *shrinker) void register_shrinker_prepared(struct shrinker *shrinker) { - mutex_lock(&shrinker_mutex); + down_write(&shrinker_rwsem); list_add_tail_rcu(&shrinker->list, &shrinker_list); shrinker->flags |= SHRINKER_REGISTERED; shrinker_debugfs_add(shrinker); - mutex_unlock(&shrinker_mutex); + up_write(&shrinker_rwsem); } static int __register_shrinker(struct shrinker *shrinker) @@ -810,13 +810,13 @@ void unregister_shrinker(struct shrinker *shrinker) if (!(shrinker->flags & SHRINKER_REGISTERED)) return; - mutex_lock(&shrinker_mutex); + down_write(&shrinker_rwsem); list_del_rcu(&shrinker->list); shrinker->flags &= ~SHRINKER_REGISTERED; if (shrinker->flags & SHRINKER_MEMCG_AWARE) unregister_memcg_shrinker(shrinker); debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id); - mutex_unlock(&shrinker_mutex); + up_write(&shrinker_rwsem); atomic_inc(&shrinker_srcu_generation); synchronize_srcu(&shrinker_srcu); -- cgit v1.2.3 From 07252b0f97150b03050f74f42250f275566d70b9 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Fri, 9 Jun 2023 08:15:13 +0000 Subject: Revert "mm: vmscan: remove shrinker_rwsem from synchronize_shrinkers()" This reverts commit 1643db98d9b314e0a592d152603094fbf7ab906e. Kernel test robot reports -88.8% regression in stress-ng.ramfs.ops_per_sec test case [1], which is caused by commit f95bdb700bc6 ("mm: vmscan: make global slab shrink lockless"). The root cause is that SRCU has to be careful to not frequently check for SRCU read-side critical section exits. Therefore, even if no one is currently in the SRCU read-side critical section, synchronize_srcu() cannot return quickly. That's why unregister_shrinker() has become slower. We will try to use the refcount+RCU method [2] proposed by Dave Chinner to continue to re-implement the lockless slab shrink. So we still need shrinker_rwsem in synchronize_shrinkers() after reverting the shrinker_srcu related changes. [1]. https://lore.kernel.org/lkml/202305230837.db2c233f-yujie.liu@intel.com/ [2]. https://lore.kernel.org/lkml/ZIJhou1d55d4H1s0@dread.disaster.area/ Link: https://lkml.kernel.org/r/20230609081518.3039120-3-qi.zheng@linux.dev Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202305230837.db2c233f-yujie.liu@intel.com Signed-off-by: Qi Zheng Cc: Dave Chinner Cc: Kirill Tkhai Cc: Muchun Song Cc: Roman Gushchin Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/vmscan.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 4730dba253c8..0ba0e1180f3f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -831,11 +831,15 @@ EXPORT_SYMBOL(unregister_shrinker); /** * synchronize_shrinkers - Wait for all running shrinkers to complete. * - * This is useful to guarantee that all shrinker invocations have seen an - * update, before freeing memory. + * This is equivalent to calling unregister_shrink() and register_shrinker(), + * but atomically and with less overhead. 
This is useful to guarantee that all + * shrinker invocations have seen an update, before freeing memory, similar to + * rcu. */ void synchronize_shrinkers(void) { + down_write(&shrinker_rwsem); + up_write(&shrinker_rwsem); atomic_inc(&shrinker_srcu_generation); synchronize_srcu(&shrinker_srcu); } -- cgit v1.2.3 From c534f7cca6b9b1c0dc97d6e9c5587858d4330cd9 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Fri, 9 Jun 2023 08:15:14 +0000 Subject: Revert "mm: vmscan: hold write lock to reparent shrinker nr_deferred" This reverts commit b3cabea3c9153fd42fe5cb851ac58b51ea2b32b8. Kernel test robot reports -88.8% regression in stress-ng.ramfs.ops_per_sec test case [1], which is caused by commit f95bdb700bc6 ("mm: vmscan: make global slab shrink lockless"). The root cause is that SRCU has to be careful to not frequently check for SRCU read-side critical section exits. Therefore, even if no one is currently in the SRCU read-side critical section, synchronize_srcu() cannot return quickly. That's why unregister_shrinker() has become slower. We will try to use the refcount+RCU method [2] proposed by Dave Chinner to continue to re-implement the lockless slab shrink. Because there will be other readers after reverting the shrinker_srcu related changes, so it is better to restore to hold read lock to reparent shrinker nr_deferred. [1]. https://lore.kernel.org/lkml/202305230837.db2c233f-yujie.liu@intel.com/ [2]. https://lore.kernel.org/lkml/ZIJhou1d55d4H1s0@dread.disaster.area/ Link: https://lkml.kernel.org/r/20230609081518.3039120-4-qi.zheng@linux.dev Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202305230837.db2c233f-yujie.liu@intel.com Signed-off-by: Qi Zheng Cc: Dave Chinner Cc: Kirill Tkhai Cc: Muchun Song Cc: Roman Gushchin Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/vmscan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 0ba0e1180f3f..d1d309fc3212 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -433,7 +433,7 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg) parent = root_mem_cgroup; /* Prevent from concurrent shrinker_info expand */ - down_write(&shrinker_rwsem); + down_read(&shrinker_rwsem); for_each_node(nid) { child_info = shrinker_info_protected(memcg, nid); parent_info = shrinker_info_protected(parent, nid); @@ -442,7 +442,7 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg) atomic_long_add(nr, &parent_info->nr_deferred[i]); } } - up_write(&shrinker_rwsem); + up_read(&shrinker_rwsem); } static bool cgroup_reclaim(struct scan_control *sc) -- cgit v1.2.3 From 1a554ecc971406e291cea867112f7f2e377e810e Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Fri, 9 Jun 2023 08:15:15 +0000 Subject: Revert "mm: shrinkers: make count and scan in shrinker debugfs lockless" This reverts commit 20cd1892fcc3efc10a7ac327cc3790494bec46b5. Kernel test robot reports -88.8% regression in stress-ng.ramfs.ops_per_sec test case [1], which is caused by commit f95bdb700bc6 ("mm: vmscan: make global slab shrink lockless"). The root cause is that SRCU has to be careful to not frequently check for SRCU read-side critical section exits. Therefore, even if no one is currently in the SRCU read-side critical section, synchronize_srcu() cannot return quickly. That's why unregister_shrinker() has become slower. We will try to use the refcount+RCU method [2] proposed by Dave Chinner to continue to re-implement the lockless slab shrink. So revert the shrinker_srcu related changes first. [1]. 
https://lore.kernel.org/lkml/202305230837.db2c233f-yujie.liu@intel.com/ [2]. https://lore.kernel.org/lkml/ZIJhou1d55d4H1s0@dread.disaster.area/ Link: https://lkml.kernel.org/r/20230609081518.3039120-5-qi.zheng@linux.dev Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202305230837.db2c233f-yujie.liu@intel.com Signed-off-by: Qi Zheng Cc: Dave Chinner Cc: Kirill Tkhai Cc: Muchun Song Cc: Roman Gushchin Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/shrinker_debug.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c index 2be15b8a6d0b..3ab53fad8876 100644 --- a/mm/shrinker_debug.c +++ b/mm/shrinker_debug.c @@ -5,12 +5,10 @@ #include #include #include -#include /* defined in vmscan.c */ extern struct rw_semaphore shrinker_rwsem; extern struct list_head shrinker_list; -extern struct srcu_struct shrinker_srcu; static DEFINE_IDA(shrinker_debugfs_ida); static struct dentry *shrinker_debugfs_root; @@ -51,13 +49,18 @@ static int shrinker_debugfs_count_show(struct seq_file *m, void *v) struct mem_cgroup *memcg; unsigned long total; bool memcg_aware; - int ret = 0, nid, srcu_idx; + int ret, nid; count_per_node = kcalloc(nr_node_ids, sizeof(unsigned long), GFP_KERNEL); if (!count_per_node) return -ENOMEM; - srcu_idx = srcu_read_lock(&shrinker_srcu); + ret = down_read_killable(&shrinker_rwsem); + if (ret) { + kfree(count_per_node); + return ret; + } + rcu_read_lock(); memcg_aware = shrinker->flags & SHRINKER_MEMCG_AWARE; @@ -88,7 +91,8 @@ static int shrinker_debugfs_count_show(struct seq_file *m, void *v) } } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); - srcu_read_unlock(&shrinker_srcu, srcu_idx); + rcu_read_unlock(); + up_read(&shrinker_rwsem); kfree(count_per_node); return ret; @@ -111,8 +115,9 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file, .gfp_mask = GFP_KERNEL, }; struct mem_cgroup *memcg = NULL; - int nid, srcu_idx; + int nid; char kbuf[72]; + ssize_t ret; read_len = size < (sizeof(kbuf) - 1) ? size : (sizeof(kbuf) - 1); if (copy_from_user(kbuf, buf, read_len)) @@ -141,7 +146,11 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file, return -EINVAL; } - srcu_idx = srcu_read_lock(&shrinker_srcu); + ret = down_read_killable(&shrinker_rwsem); + if (ret) { + mem_cgroup_put(memcg); + return ret; + } sc.nid = nid; sc.memcg = memcg; @@ -150,7 +159,7 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file, shrinker->scan_objects(shrinker, &sc); - srcu_read_unlock(&shrinker_srcu, srcu_idx); + up_read(&shrinker_rwsem); mem_cgroup_put(memcg); return size; -- cgit v1.2.3 From d6ecbcd70fffcffe79d45f3be7b8fcf5d8e04a3d Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Fri, 9 Jun 2023 08:15:16 +0000 Subject: Revert "mm: vmscan: add shrinker_srcu_generation" This reverts commit 475733dda5aedba9e086379aafe6b5ffd53e8f5e. Kernel test robot reports -88.8% regression in stress-ng.ramfs.ops_per_sec test case [1], which is caused by commit f95bdb700bc6 ("mm: vmscan: make global slab shrink lockless"). The root cause is that SRCU has to be careful to not frequently check for SRCU read-side critical section exits. Therefore, even if no one is currently in the SRCU read-side critical section, synchronize_srcu() cannot return quickly. That's why unregister_shrinker() has become slower. We will try to use the refcount+RCU method [2] proposed by Dave Chinner to continue to re-implement the lockless slab shrink. So revert the shrinker_srcu related changes first. [1]. 
https://lore.kernel.org/lkml/202305230837.db2c233f-yujie.liu@intel.com/ [2]. https://lore.kernel.org/lkml/ZIJhou1d55d4H1s0@dread.disaster.area/ Link: https://lkml.kernel.org/r/20230609081518.3039120-6-qi.zheng@linux.dev Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202305230837.db2c233f-yujie.liu@intel.com Signed-off-by: Qi Zheng Cc: Dave Chinner Cc: Kirill Tkhai Cc: Muchun Song Cc: Roman Gushchin Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/vmscan.c | 24 ++++-------------------- 1 file changed, 4 insertions(+), 20 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index d1d309fc3212..50775b73d0c7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -192,7 +192,6 @@ int vm_swappiness = 60; LIST_HEAD(shrinker_list); DECLARE_RWSEM(shrinker_rwsem); DEFINE_SRCU(shrinker_srcu); -static atomic_t shrinker_srcu_generation = ATOMIC_INIT(0); #ifdef CONFIG_MEMCG static int shrinker_nr_max; @@ -818,7 +817,6 @@ void unregister_shrinker(struct shrinker *shrinker) debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id); up_write(&shrinker_rwsem); - atomic_inc(&shrinker_srcu_generation); synchronize_srcu(&shrinker_srcu); shrinker_debugfs_remove(debugfs_entry, debugfs_id); @@ -840,7 +838,6 @@ void synchronize_shrinkers(void) { down_write(&shrinker_rwsem); up_write(&shrinker_rwsem); - atomic_inc(&shrinker_srcu_generation); synchronize_srcu(&shrinker_srcu); } EXPORT_SYMBOL(synchronize_shrinkers); @@ -950,20 +947,18 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, { struct shrinker_info *info; unsigned long ret, freed = 0; - int srcu_idx, generation; - int i = 0; + int srcu_idx; + int i; if (!mem_cgroup_online(memcg)) return 0; -again: srcu_idx = srcu_read_lock(&shrinker_srcu); info = shrinker_info_srcu(memcg, nid); if (unlikely(!info)) goto unlock; - generation = atomic_read(&shrinker_srcu_generation); - for_each_set_bit_from(i, info->map, info->map_nr_max) { + for_each_set_bit(i, info->map, info->map_nr_max) { struct shrink_control sc = { .gfp_mask = gfp_mask, .nid = nid, @@ -1009,11 +1004,6 @@ again: set_shrinker_bit(memcg, nid, i); } freed += ret; - if (atomic_read(&shrinker_srcu_generation) != generation) { - srcu_read_unlock(&shrinker_srcu, srcu_idx); - i++; - goto again; - } } unlock: srcu_read_unlock(&shrinker_srcu, srcu_idx); @@ -1053,7 +1043,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, { unsigned long ret, freed = 0; struct shrinker *shrinker; - int srcu_idx, generation; + int srcu_idx; /* * The root memcg might be allocated even though memcg is disabled @@ -1067,7 +1057,6 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, srcu_idx = srcu_read_lock(&shrinker_srcu); - generation = atomic_read(&shrinker_srcu_generation); list_for_each_entry_srcu(shrinker, &shrinker_list, list, srcu_read_lock_held(&shrinker_srcu)) { struct shrink_control sc = { @@ -1080,11 +1069,6 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, if (ret == SHRINK_EMPTY) ret = 0; freed += ret; - - if (atomic_read(&shrinker_srcu_generation) != generation) { - freed = freed ? : 1; - break; - } } srcu_read_unlock(&shrinker_srcu, srcu_idx); -- cgit v1.2.3 From 7cee3603192a198b23c034a7372b599081cbee88 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Fri, 9 Jun 2023 08:15:17 +0000 Subject: Revert "mm: vmscan: make memcg slab shrink lockless" This reverts commit caa05325c9126c77ebf114edce51536a0d0a9a08. 
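As a rough sketch only, not verbatim kernel code, the reader-side patterns this revert series swaps between look like the following (shrinker_srcu and shrinker_rwsem are the real symbols from the patches; the loop bodies are elided):

	/* SRCU reader being reverted: cheap on the read side, but
	 * unregister_shrinker() must wait in synchronize_srcu(),
	 * which is the reported slowdown.
	 */
	idx = srcu_read_lock(&shrinker_srcu);
	/* ... walk shrinker_list and invoke shrinkers ... */
	srcu_read_unlock(&shrinker_srcu, idx);

	/* rwsem reader being restored: a shrink pass is simply skipped
	 * when the lock is contended, so registration never stalls.
	 */
	if (!down_read_trylock(&shrinker_rwsem))
		return 0;
	/* ... walk shrinker_list and invoke shrinkers ... */
	up_read(&shrinker_rwsem);

The trade-off, as the rationale below repeats, is reader-side scalability against unregister_shrinker() latency.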
Kernel test robot reports -88.8% regression in stress-ng.ramfs.ops_per_sec test case [1], which is caused by commit f95bdb700bc6 ("mm: vmscan: make global slab shrink lockless"). The root cause is that SRCU has to be careful to not frequently check for SRCU read-side critical section exits. Therefore, even if no one is currently in the SRCU read-side critical section, synchronize_srcu() cannot return quickly. That's why unregister_shrinker() has become slower. After discussion, we will try to use the refcount+RCU method [2] proposed by Dave Chinner to continue to re-implement the lockless slab shrink. So revert the shrinker_srcu related changes first. [1]. https://lore.kernel.org/lkml/202305230837.db2c233f-yujie.liu@intel.com/ [2]. https://lore.kernel.org/lkml/ZIJhou1d55d4H1s0@dread.disaster.area/ Link: https://lkml.kernel.org/r/20230609081518.3039120-7-qi.zheng@linux.dev Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202305230837.db2c233f-yujie.liu@intel.com Signed-off-by: Qi Zheng Cc: Dave Chinner Cc: Kirill Tkhai Cc: Muchun Song Cc: Roman Gushchin Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/vmscan.c | 45 +++++++++++++++++++-------------------------- 1 file changed, 19 insertions(+), 26 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 50775b73d0c7..a008d7f2d0fc 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -210,21 +210,8 @@ static inline int shrinker_defer_size(int nr_items) static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg, int nid) { - return srcu_dereference_check(memcg->nodeinfo[nid]->shrinker_info, - &shrinker_srcu, - lockdep_is_held(&shrinker_rwsem)); -} - -static struct shrinker_info *shrinker_info_srcu(struct mem_cgroup *memcg, - int nid) -{ - return srcu_dereference(memcg->nodeinfo[nid]->shrinker_info, - &shrinker_srcu); -} - -static void free_shrinker_info_rcu(struct rcu_head *head) -{ - kvfree(container_of(head, struct shrinker_info, rcu)); + return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info, + lockdep_is_held(&shrinker_rwsem)); } static int expand_one_shrinker_info(struct mem_cgroup *memcg, @@ -265,7 +252,7 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg, defer_size - old_defer_size); rcu_assign_pointer(pn->shrinker_info, new); - call_srcu(&shrinker_srcu, &old->rcu, free_shrinker_info_rcu); + kvfree_rcu(old, rcu); } return 0; @@ -351,16 +338,15 @@ void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) { if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) { struct shrinker_info *info; - int srcu_idx; - srcu_idx = srcu_read_lock(&shrinker_srcu); - info = shrinker_info_srcu(memcg, nid); + rcu_read_lock(); + info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info); if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) { /* Pairs with smp mb in shrink_slab() */ smp_mb__before_atomic(); set_bit(shrinker_id, info->map); } - srcu_read_unlock(&shrinker_srcu, srcu_idx); + rcu_read_unlock(); } } @@ -374,6 +360,7 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker) return -ENOSYS; down_write(&shrinker_rwsem); + /* This may call shrinker, so it must use down_read_trylock() */ id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL); if (id < 0) goto unlock; @@ -407,7 +394,7 @@ static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, { struct shrinker_info *info; - info = shrinker_info_srcu(memcg, nid); + info = shrinker_info_protected(memcg, nid); return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0); } @@ -416,7 +403,7 
@@ static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, { struct shrinker_info *info; - info = shrinker_info_srcu(memcg, nid); + info = shrinker_info_protected(memcg, nid); return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]); } @@ -947,14 +934,15 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, { struct shrinker_info *info; unsigned long ret, freed = 0; - int srcu_idx; int i; if (!mem_cgroup_online(memcg)) return 0; - srcu_idx = srcu_read_lock(&shrinker_srcu); - info = shrinker_info_srcu(memcg, nid); + if (!down_read_trylock(&shrinker_rwsem)) + return 0; + + info = shrinker_info_protected(memcg, nid); if (unlikely(!info)) goto unlock; @@ -1004,9 +992,14 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, set_shrinker_bit(memcg, nid, i); } freed += ret; + + if (rwsem_is_contended(&shrinker_rwsem)) { + freed = freed ? : 1; + break; + } } unlock: - srcu_read_unlock(&shrinker_srcu, srcu_idx); + up_read(&shrinker_rwsem); return freed; } #else /* CONFIG_MEMCG */ -- cgit v1.2.3 From 71c3ad65fabec9620d3f548b2da948c79c7ad9d5 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Fri, 9 Jun 2023 08:15:18 +0000 Subject: Revert "mm: vmscan: make global slab shrink lockless" This reverts commit f95bdb700bc6bb74e1199b1f5f90c613e152cfa7. Kernel test robot reports -88.8% regression in stress-ng.ramfs.ops_per_sec test case [1], which is caused by commit f95bdb700bc6 ("mm: vmscan: make global slab shrink lockless"). The root cause is that SRCU has to be careful to not frequently check for SRCU read-side critical section exits. Therefore, even if no one is currently in the SRCU read-side critical section, synchronize_srcu() cannot return quickly. That's why unregister_shrinker() has become slower. After discussion, we will try to use the refcount+RCU method [2] proposed by Dave Chinner to continue to re-implement the lockless slab shrink. So revert the shrinker_srcu related changes first. [1]. https://lore.kernel.org/lkml/202305230837.db2c233f-yujie.liu@intel.com/ [2]. 
https://lore.kernel.org/lkml/ZIJhou1d55d4H1s0@dread.disaster.area/ Link: https://lkml.kernel.org/r/20230609081518.3039120-8-qi.zheng@linux.dev Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202305230837.db2c233f-yujie.liu@intel.com Signed-off-by: Qi Zheng Cc: Dave Chinner Cc: Kirill Tkhai Cc: Muchun Song Cc: Roman Gushchin Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/vmscan.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index a008d7f2d0fc..5bf98d0a22c9 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -57,7 +57,6 @@ #include #include #include -#include #include #include @@ -191,7 +190,6 @@ int vm_swappiness = 60; LIST_HEAD(shrinker_list); DECLARE_RWSEM(shrinker_rwsem); -DEFINE_SRCU(shrinker_srcu); #ifdef CONFIG_MEMCG static int shrinker_nr_max; @@ -742,7 +740,7 @@ void free_prealloced_shrinker(struct shrinker *shrinker) void register_shrinker_prepared(struct shrinker *shrinker) { down_write(&shrinker_rwsem); - list_add_tail_rcu(&shrinker->list, &shrinker_list); + list_add_tail(&shrinker->list, &shrinker_list); shrinker->flags |= SHRINKER_REGISTERED; shrinker_debugfs_add(shrinker); up_write(&shrinker_rwsem); @@ -797,15 +795,13 @@ void unregister_shrinker(struct shrinker *shrinker) return; down_write(&shrinker_rwsem); - list_del_rcu(&shrinker->list); + list_del(&shrinker->list); shrinker->flags &= ~SHRINKER_REGISTERED; if (shrinker->flags & SHRINKER_MEMCG_AWARE) unregister_memcg_shrinker(shrinker); debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id); up_write(&shrinker_rwsem); - synchronize_srcu(&shrinker_srcu); - shrinker_debugfs_remove(debugfs_entry, debugfs_id); kfree(shrinker->nr_deferred); @@ -825,7 +821,6 @@ void synchronize_shrinkers(void) { down_write(&shrinker_rwsem); up_write(&shrinker_rwsem); - synchronize_srcu(&shrinker_srcu); } EXPORT_SYMBOL(synchronize_shrinkers); @@ -1036,7 +1031,6 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, { unsigned long ret, freed = 0; struct shrinker *shrinker; - int srcu_idx; /* * The root memcg might be allocated even though memcg is disabled @@ -1048,10 +1042,10 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg)) return shrink_slab_memcg(gfp_mask, nid, memcg, priority); - srcu_idx = srcu_read_lock(&shrinker_srcu); + if (!down_read_trylock(&shrinker_rwsem)) + goto out; - list_for_each_entry_srcu(shrinker, &shrinker_list, list, - srcu_read_lock_held(&shrinker_srcu)) { + list_for_each_entry(shrinker, &shrinker_list, list) { struct shrink_control sc = { .gfp_mask = gfp_mask, .nid = nid, @@ -1062,9 +1056,19 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, if (ret == SHRINK_EMPTY) ret = 0; freed += ret; + /* + * Bail out if someone want to register a new shrinker to + * prevent the registration from being stalled for long periods + * by parallel ongoing shrinking. + */ + if (rwsem_is_contended(&shrinker_rwsem)) { + freed = freed ? 
: 1; + break; + } } - srcu_read_unlock(&shrinker_srcu, srcu_idx); + up_read(&shrinker_rwsem); +out: cond_resched(); return freed; } -- cgit v1.2.3 From 782e53d0c14420858dbf0f8f797973c150d3b6d7 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 12 Jun 2023 11:14:56 +0900 Subject: nilfs2: prevent general protection fault in nilfs_clear_dirty_page() In a syzbot stress test that deliberately causes file system errors on nilfs2 with a corrupted disk image, it has been reported that nilfs_clear_dirty_page() called from nilfs_clear_dirty_pages() can cause a general protection fault. In nilfs_clear_dirty_pages(), when looking up dirty pages from the page cache and calling nilfs_clear_dirty_page() for each dirty page/folio retrieved, the back reference from the argument page to "mapping" may have been changed to NULL (and possibly others). It is necessary to check this after locking the page/folio. So, fix this issue by not calling nilfs_clear_dirty_page() on a page/folio after locking it in nilfs_clear_dirty_pages() if the back reference "mapping" from the page/folio is different from the "mapping" that held the page/folio just before. Link: https://lkml.kernel.org/r/20230612021456.3682-1-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Reported-by: syzbot+53369d11851d8f26735c@syzkaller.appspotmail.com Closes: https://lkml.kernel.org/r/000000000000da4f6b05eb9bf593@google.com Tested-by: Ryusuke Konishi Cc: Signed-off-by: Andrew Morton --- fs/nilfs2/page.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 5cf30827f244..b4e54d079b7d 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -370,7 +370,15 @@ void nilfs_clear_dirty_pages(struct address_space *mapping, bool silent) struct folio *folio = fbatch.folios[i]; folio_lock(folio); - nilfs_clear_dirty_page(&folio->page, silent); + + /* + * This folio may have been removed from the address + * space by truncation or invalidation when the lock + * was acquired. Skip processing in that case. + */ + if (likely(folio->mapping == mapping)) + nilfs_clear_dirty_page(&folio->page, silent); + folio_unlock(folio); } folio_batch_release(&fbatch); -- cgit v1.2.3 From 823b37e8a7104ca2bcf1945be77f33d4d5740f55 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Thu, 15 Jun 2023 09:18:20 +0100 Subject: mailmap: add entries for Ben Dooks I am going to be losing my sifive.com address soon, and I realised my old Simtec address (from >10 years ago) has also not been updated, so update .mailmap for both. Link: https://lkml.kernel.org/r/20230615081820.79485-1-ben.dooks@codethink.co.uk Signed-off-by: Ben Dooks Signed-off-by: Ben Dooks Signed-off-by: Andrew Morton --- .mailmap | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.mailmap b/.mailmap index 650689d00930..c94da2a63d0f 100644 --- a/.mailmap +++ b/.mailmap @@ -70,6 +70,8 @@ Baolin Wang Baolin Wang Bart Van Assche Bart Van Assche +Ben Dooks +Ben Dooks Ben Gardner Ben M Cahill Ben Widawsky -- cgit v1.2.3 From 0518dbe97fe629fea255318841cf3ef1b4532d66 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 14 Jun 2023 22:18:58 +0100 Subject: selftests/mm: fix cross compilation with LLVM Currently the MM selftests attempt to work out the target architecture by using CROSS_COMPILE or otherwise querying the host machine, storing the target architecture in a variable called MACHINE rather than the usual ARCH, though as far as I can tell (including for x86_64) the value is the same as we would use for architecture.
When cross compiling with LLVM we don't need a CROSS_COMPILE as LLVM can support many target architectures in a single build, so this logic does not work: CROSS_COMPILE is not set and we end up selecting tests for the host rather than the target architecture. Fix this by using the more standard ARCH to describe the architecture, taking it from the environment if specified. Link: https://lkml.kernel.org/r/20230614-kselftest-mm-llvm-v1-1-180523f277d3@kernel.org Signed-off-by: Mark Brown Cc: Nick Desaulniers Cc: Albert Ou Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Shuah Khan Cc: Tom Rix Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/Makefile | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index 23af4633f0f4..4f0c50c33ba7 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -5,12 +5,15 @@ LOCAL_HDRS += $(selfdir)/mm/local_config.h $(top_srcdir)/mm/gup_test.h include local_config.mk +ifeq ($(ARCH),) + ifeq ($(CROSS_COMPILE),) uname_M := $(shell uname -m 2>/dev/null || echo not) else uname_M := $(shell echo $(CROSS_COMPILE) | grep -o '^[a-z0-9]\+') endif -MACHINE ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/' -e 's/ppc64.*/ppc64/') +ARCH ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/' -e 's/ppc64.*/ppc64/') +endif # Without this, failed build products remain, with up-to-date timestamps, # thus tricking Make (and you!) into believing that All Is Well, in subsequent @@ -65,7 +68,7 @@ TEST_GEN_PROGS += ksm_tests TEST_GEN_PROGS += ksm_functional_tests TEST_GEN_PROGS += mdwe_test -ifeq ($(MACHINE),x86_64) +ifeq ($(ARCH),x86_64) CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_32bit_program.c -m32) CAN_BUILD_X86_64 := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_64bit_program.c) CAN_BUILD_WITH_NOPIE := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_program.c -no-pie) @@ -87,13 +90,13 @@ TEST_GEN_PROGS += $(BINARIES_64) endif else -ifneq (,$(findstring $(MACHINE),ppc64)) +ifneq (,$(findstring $(ARCH),ppc64)) TEST_GEN_PROGS += protection_keys endif endif -ifneq (,$(filter $(MACHINE),arm64 ia64 mips64 parisc64 ppc64 riscv64 s390x sparc64 x86_64)) +ifneq (,$(filter $(ARCH),arm64 ia64 mips64 parisc64 ppc64 riscv64 s390x sparc64 x86_64)) TEST_GEN_PROGS += va_high_addr_switch TEST_GEN_PROGS += virtual_address_range TEST_GEN_PROGS += write_to_hugetlbfs @@ -112,7 +115,7 @@ $(TEST_GEN_PROGS): vm_util.c $(OUTPUT)/uffd-stress: uffd-common.c $(OUTPUT)/uffd-unit-tests: uffd-common.c -ifeq ($(MACHINE),x86_64) +ifeq ($(ARCH),x86_64) BINARIES_32 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_32)) BINARIES_64 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_64)) -- cgit v1.2.3 From 9a43827e876c9a071826cc81783aa2222b020f1d Mon Sep 17 00:00:00 2001 From: Josua Mayer Date: Fri, 16 Jun 2023 14:14:14 +0300 Subject: net: dpaa2-mac: add 25gbase-r support Layerscape MACs support 25Gbps network speed with dpmac "CAUI" mode. Add the mappings between DPMAC_ETH_IF_* and PHY_INTERFACE_MODE_*, as well as the 25000 mac capability. Tested on SolidRun LX2162a Clearfog, serdes 1 protocol 18. Signed-off-by: Josua Mayer Reviewed-by: Russell King (Oracle) Signed-off-by: David S.
Miller --- drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c index b1871e6c4006..00e50bd30189 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c @@ -54,6 +54,9 @@ static int phy_mode(enum dpmac_eth_if eth_if, phy_interface_t *if_mode) case DPMAC_ETH_IF_XFI: *if_mode = PHY_INTERFACE_MODE_10GBASER; break; + case DPMAC_ETH_IF_CAUI: + *if_mode = PHY_INTERFACE_MODE_25GBASER; + break; default: return -EINVAL; } @@ -79,6 +82,8 @@ static enum dpmac_eth_if dpmac_eth_if_mode(phy_interface_t if_mode) return DPMAC_ETH_IF_XFI; case PHY_INTERFACE_MODE_1000BASEX: return DPMAC_ETH_IF_1000BASEX; + case PHY_INTERFACE_MODE_25GBASER: + return DPMAC_ETH_IF_CAUI; default: return DPMAC_ETH_IF_MII; } @@ -418,7 +423,7 @@ int dpaa2_mac_connect(struct dpaa2_mac *mac) mac->phylink_config.mac_capabilities = MAC_SYM_PAUSE | MAC_ASYM_PAUSE | MAC_10FD | MAC_100FD | MAC_1000FD | MAC_2500FD | MAC_5000FD | - MAC_10000FD; + MAC_10000FD | MAC_25000FD; dpaa2_mac_set_supported_interfaces(mac); -- cgit v1.2.3 From ff221029a51fd54cacac66e193e0c75e4de940e7 Mon Sep 17 00:00:00 2001 From: Arınç ÜNAL Date: Sat, 17 Jun 2023 09:26:44 +0300 Subject: net: dsa: mt7530: set all CPU ports in MT7531_CPU_PMAP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MT7531_CPU_PMAP represents the destination port mask for trapped-to-CPU frames (further restricted by PCR_MATRIX). Currently the driver sets the first CPU port as the single port in this bit mask, which works fine regardless of whether the device tree defines port 5, 6 or 5+6 as CPU ports. This is because the logic coincides with DSA's logic of picking the first CPU port as the CPU port that all user ports are affine to, by default. An upcoming change would like to influence DSA's selection of the default CPU port to no longer be the first one, and in that case, this logic needs adaptation. Since there is no observed leakage or duplication of frames if all CPU ports are defined in this bit mask, simply include them all. Suggested-by: Russell King (Oracle) Suggested-by: Vladimir Oltean Signed-off-by: Arınç ÜNAL Reviewed-by: Vladimir Oltean Reviewed-by: Russell King (Oracle) Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/mt7530.c | 15 ++++++++------- drivers/net/dsa/mt7530.h | 1 + 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index 9bc54e1348cb..f8503155f179 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -1010,6 +1010,13 @@ mt753x_cpu_port_enable(struct dsa_switch *ds, int port) if (priv->id == ID_MT7621) mt7530_rmw(priv, MT7530_MFC, CPU_MASK, CPU_EN | CPU_PORT(port)); + /* Add the CPU port to the CPU port bitmap for MT7531 and the switch on + * the MT7988 SoC. Trapped frames will be forwarded to the CPU port that + * is affine to the inbound user port. + */ + if (priv->id == ID_MT7531 || priv->id == ID_MT7988) + mt7530_set(priv, MT7531_CFC, MT7531_CPU_PMAP(BIT(port))); + /* CPU port gets connected to all user ports of * the switch. 
*/ @@ -2352,15 +2359,9 @@ static int mt7531_setup_common(struct dsa_switch *ds) { struct mt7530_priv *priv = ds->priv; - struct dsa_port *cpu_dp; int ret, i; - /* BPDU to CPU port */ - dsa_switch_for_each_cpu_port(cpu_dp, ds) { - mt7530_rmw(priv, MT7531_CFC, MT7531_CPU_PMAP_MASK, - BIT(cpu_dp->index)); - break; - } + /* Trap BPDUs to the CPU port(s) */ mt7530_rmw(priv, MT753X_BPC, MT753X_BPDU_PORT_FW_MASK, MT753X_BPDU_CPU_ONLY); diff --git a/drivers/net/dsa/mt7530.h b/drivers/net/dsa/mt7530.h index 5084f48a8869..e590cf43f3ae 100644 --- a/drivers/net/dsa/mt7530.h +++ b/drivers/net/dsa/mt7530.h @@ -54,6 +54,7 @@ enum mt753x_id { #define MT7531_MIRROR_PORT_GET(x) (((x) >> 16) & MIRROR_MASK) #define MT7531_MIRROR_PORT_SET(x) (((x) & MIRROR_MASK) << 16) #define MT7531_CPU_PMAP_MASK GENMASK(7, 0) +#define MT7531_CPU_PMAP(x) FIELD_PREP(MT7531_CPU_PMAP_MASK, x) #define MT753X_MIRROR_REG(id) ((((id) == ID_MT7531) || ((id) == ID_MT7988)) ? \ MT7531_CFC : MT7530_MFC) -- cgit v1.2.3 From 4ae90f90e4909e3014e2dc6a0627964617a7b824 Mon Sep 17 00:00:00 2001 From: Arınç ÜNAL Date: Sat, 17 Jun 2023 09:26:45 +0300 Subject: net: dsa: mt7530: fix trapping frames on non-MT7621 SoC MT7530 switch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All MT7530 switch IP variants share the MT7530_MFC register, but the current driver only writes it for the switch variant that is integrated in the MT7621 SoC. Modify the code to include all MT7530 derivatives. Fixes: b8f126a8d543 ("net-next: dsa: add dsa support for Mediatek MT7530 switch") Suggested-by: Vladimir Oltean Signed-off-by: Arınç ÜNAL Reviewed-by: Vladimir Oltean Reviewed-by: Russell King (Oracle) Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/mt7530.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index f8503155f179..8c9acf109a4e 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -1007,7 +1007,7 @@ mt753x_cpu_port_enable(struct dsa_switch *ds, int port) UNU_FFP(BIT(port))); /* Set CPU port number */ - if (priv->id == ID_MT7621) + if (priv->id == ID_MT7530 || priv->id == ID_MT7621) mt7530_rmw(priv, MT7530_MFC, CPU_MASK, CPU_EN | CPU_PORT(port)); /* Add the CPU port to the CPU port bitmap for MT7531 and the switch on -- cgit v1.2.3 From d7c66073559386b836bded7cdc8b66ee5c049129 Mon Sep 17 00:00:00 2001 From: Arınç ÜNAL Date: Sat, 17 Jun 2023 09:26:46 +0300 Subject: net: dsa: mt7530: fix handling of BPDUs on MT7530 switch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BPDUs are link-local frames, therefore they must be trapped to the CPU port. Currently, the MT7530 switch treats BPDUs as regular multicast frames, therefore flooding them to user ports. To fix this, set BPDUs to be trapped to the CPU port. Group this on mt7530_setup() and mt7531_setup_common() into mt753x_trap_frames() and call that. Fixes: b8f126a8d543 ("net-next: dsa: add dsa support for Mediatek MT7530 switch") Signed-off-by: Arınç ÜNAL Reviewed-by: Vladimir Oltean Reviewed-by: Russell King (Oracle) Reviewed-by: Florian Fainelli Signed-off-by: David S. 
Miller --- drivers/net/dsa/mt7530.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index 8c9acf109a4e..5e4f6965cebd 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -985,6 +985,14 @@ unlock_exit: mutex_unlock(&priv->reg_mutex); } +static void +mt753x_trap_frames(struct mt7530_priv *priv) +{ + /* Trap BPDUs to the CPU port(s) */ + mt7530_rmw(priv, MT753X_BPC, MT753X_BPDU_PORT_FW_MASK, + MT753X_BPDU_CPU_ONLY); +} + static int mt753x_cpu_port_enable(struct dsa_switch *ds, int port) { @@ -2262,6 +2270,8 @@ mt7530_setup(struct dsa_switch *ds) priv->p6_interface = PHY_INTERFACE_MODE_NA; + mt753x_trap_frames(priv); + /* Enable and reset MIB counters */ mt7530_mib_reset(ds); @@ -2361,9 +2371,7 @@ mt7531_setup_common(struct dsa_switch *ds) struct mt7530_priv *priv = ds->priv; int ret, i; - /* Trap BPDUs to the CPU port(s) */ - mt7530_rmw(priv, MT753X_BPC, MT753X_BPDU_PORT_FW_MASK, - MT753X_BPDU_CPU_ONLY); + mt753x_trap_frames(priv); /* Enable and reset MIB counters */ mt7530_mib_reset(ds); -- cgit v1.2.3 From 8332cf6fd7c7087dbc2067115b33979c9851bbc4 Mon Sep 17 00:00:00 2001 From: Arınç ÜNAL Date: Sat, 17 Jun 2023 09:26:47 +0300 Subject: net: dsa: mt7530: fix handling of LLDP frames MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit LLDP frames are link-local frames, therefore they must be trapped to the CPU port. Currently, the MT753X switches treat LLDP frames as regular multicast frames, therefore flooding them to user ports. To fix this, set LLDP frames to be trapped to the CPU port(s). Fixes: b8f126a8d543 ("net-next: dsa: add dsa support for Mediatek MT7530 switch") Signed-off-by: Arınç ÜNAL Reviewed-by: Vladimir Oltean Reviewed-by: Russell King (Oracle) Reviewed-by: Florian Fainelli Signed-off-by: David S. 
Miller --- drivers/net/dsa/mt7530.c | 4 ++++ drivers/net/dsa/mt7530.h | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index 5e4f6965cebd..6d6ff293900c 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -991,6 +991,10 @@ mt753x_trap_frames(struct mt7530_priv *priv) /* Trap BPDUs to the CPU port(s) */ mt7530_rmw(priv, MT753X_BPC, MT753X_BPDU_PORT_FW_MASK, MT753X_BPDU_CPU_ONLY); + + /* Trap LLDP frames with :0E MAC DA to the CPU port(s) */ + mt7530_rmw(priv, MT753X_RGAC2, MT753X_R0E_PORT_FW_MASK, + MT753X_R0E_PORT_FW(MT753X_BPDU_CPU_ONLY)); } static int diff --git a/drivers/net/dsa/mt7530.h b/drivers/net/dsa/mt7530.h index e590cf43f3ae..08045b035e6a 100644 --- a/drivers/net/dsa/mt7530.h +++ b/drivers/net/dsa/mt7530.h @@ -67,6 +67,11 @@ enum mt753x_id { #define MT753X_BPC 0x24 #define MT753X_BPDU_PORT_FW_MASK GENMASK(2, 0) +/* Register for :03 and :0E MAC DA frame control */ +#define MT753X_RGAC2 0x2c +#define MT753X_R0E_PORT_FW_MASK GENMASK(18, 16) +#define MT753X_R0E_PORT_FW(x) FIELD_PREP(MT753X_R0E_PORT_FW_MASK, x) + enum mt753x_bpdu_port_fw { MT753X_BPDU_FOLLOW_MFC, MT753X_BPDU_CPU_EXCLUDE = 4, -- cgit v1.2.3 From b79d7c14f48083abb3fb061370c0c64a569edf4c Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 17 Jun 2023 09:26:48 +0300 Subject: net: dsa: introduce preferred_default_local_cpu_port and use on MT7530 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since the introduction of the OF bindings, DSA has always had a policy that in case multiple CPU ports are present in the device tree, the numerically smallest one is always chosen. The MT7530 switch family, except the switch on the MT7988 SoC, has 2 CPU ports, 5 and 6, where port 6 is preferable on the MT7531BE switch because it has higher bandwidth. The MT7530 driver developers had 3 options: - to modify DSA when the MT7531 switch support was introduced, such as to prefer the better port - to declare both CPU ports in device trees as CPU ports, and live with the sub-optimal performance resulting from not preferring the better port - to declare just port 6 in the device tree as a CPU port Of course they chose the path of least resistance (3rd option), kicking the can down the road. The hardware description in the device tree is supposed to be stable - developers are not supposed to adopt the strategy of piecemeal hardware description, where the device tree is updated in lockstep with the features that the kernel currently supports. Now, as a result of the fact that they did that, any attempts to modify the device tree and describe both CPU ports as CPU ports would make DSA change its default selection from port 6 to 5, effectively resulting in a performance degradation visible to users with the MT7531BE switch as can be seen below. 
Without preferring port 6: [ ID][Role] Interval Transfer Bitrate Retr [ 5][TX-C] 0.00-20.00 sec 374 MBytes 157 Mbits/sec 734 sender [ 5][TX-C] 0.00-20.00 sec 373 MBytes 156 Mbits/sec receiver [ 7][RX-C] 0.00-20.00 sec 1.81 GBytes 778 Mbits/sec 0 sender [ 7][RX-C] 0.00-20.00 sec 1.81 GBytes 777 Mbits/sec receiver With preferring port 6: [ ID][Role] Interval Transfer Bitrate Retr [ 5][TX-C] 0.00-20.00 sec 1.99 GBytes 856 Mbits/sec 273 sender [ 5][TX-C] 0.00-20.00 sec 1.99 GBytes 855 Mbits/sec receiver [ 7][RX-C] 0.00-20.00 sec 1.72 GBytes 737 Mbits/sec 15 sender [ 7][RX-C] 0.00-20.00 sec 1.71 GBytes 736 Mbits/sec receiver Using one port for WAN and the other ports for LAN is a very popular use case which is what this test emulates. As such, this change proposes that we retroactively modify stable kernels (which don't support the modification of the CPU port assignments, so as to let user space fix the problem and restore the throughput) to keep the mt7530 driver preferring port 6 even with device trees where the hardware is more fully described. Fixes: c288575f7810 ("net: dsa: mt7530: Add the support of MT7531 switch") Signed-off-by: Vladimir Oltean Signed-off-by: Arınç ÜNAL Reviewed-by: Russell King (Oracle) Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/mt7530.c | 15 +++++++++++++++ include/net/dsa.h | 8 ++++++++ net/dsa/dsa.c | 24 +++++++++++++++++++++++- 3 files changed, 46 insertions(+), 1 deletion(-) diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index 6d6ff293900c..7e773c4ba046 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -399,6 +399,20 @@ static void mt7530_pll_setup(struct mt7530_priv *priv) core_set(priv, CORE_TRGMII_GSW_CLK_CG, REG_GSWCK_EN); } +/* If port 6 is available as a CPU port, always prefer that as the default, + * otherwise don't care. + */ +static struct dsa_port * +mt753x_preferred_default_local_cpu_port(struct dsa_switch *ds) +{ + struct dsa_port *cpu_dp = dsa_to_port(ds, 6); + + if (dsa_port_is_cpu(cpu_dp)) + return cpu_dp; + + return NULL; +} + /* Setup port 6 interface mode and TRGMII TX circuit */ static int mt7530_pad_clk_setup(struct dsa_switch *ds, phy_interface_t interface) @@ -3098,6 +3112,7 @@ static int mt7988_setup(struct dsa_switch *ds) const struct dsa_switch_ops mt7530_switch_ops = { .get_tag_protocol = mtk_get_tag_protocol, .setup = mt753x_setup, + .preferred_default_local_cpu_port = mt753x_preferred_default_local_cpu_port, .get_strings = mt7530_get_strings, .get_ethtool_stats = mt7530_get_ethtool_stats, .get_sset_count = mt7530_get_sset_count, diff --git a/include/net/dsa.h b/include/net/dsa.h index 8903053fa5aa..ab0f0a5b0860 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -958,6 +958,14 @@ struct dsa_switch_ops { struct phy_device *phy); void (*port_disable)(struct dsa_switch *ds, int port); + /* + * Compatibility between device trees defining multiple CPU ports and + * drivers which are not OK to use by default the numerically smallest + * CPU port of a switch for its local ports. This can return NULL, + * meaning "don't know/don't care". 
+ */ + struct dsa_port *(*preferred_default_local_cpu_port)(struct dsa_switch *ds); + /* * Port's MAC EEE settings */ diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index ab1afe67fd18..1afed89e03c0 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -403,6 +403,24 @@ static int dsa_tree_setup_default_cpu(struct dsa_switch_tree *dst) return 0; } +static struct dsa_port * +dsa_switch_preferred_default_local_cpu_port(struct dsa_switch *ds) +{ + struct dsa_port *cpu_dp; + + if (!ds->ops->preferred_default_local_cpu_port) + return NULL; + + cpu_dp = ds->ops->preferred_default_local_cpu_port(ds); + if (!cpu_dp) + return NULL; + + if (WARN_ON(!dsa_port_is_cpu(cpu_dp) || cpu_dp->ds != ds)) + return NULL; + + return cpu_dp; +} + /* Perform initial assignment of CPU ports to user ports and DSA links in the * fabric, giving preference to CPU ports local to each switch. Default to * using the first CPU port in the switch tree if the port does not have a CPU @@ -410,12 +428,16 @@ static int dsa_tree_setup_default_cpu(struct dsa_switch_tree *dst) */ static int dsa_tree_setup_cpu_ports(struct dsa_switch_tree *dst) { - struct dsa_port *cpu_dp, *dp; + struct dsa_port *preferred_cpu_dp, *cpu_dp, *dp; list_for_each_entry(cpu_dp, &dst->ports, list) { if (!dsa_port_is_cpu(cpu_dp)) continue; + preferred_cpu_dp = dsa_switch_preferred_default_local_cpu_port(cpu_dp->ds); + if (preferred_cpu_dp && preferred_cpu_dp != cpu_dp) + continue; + /* Prefer a local CPU port */ dsa_switch_for_each_port(dp, cpu_dp->ds) { /* Prefer the first local CPU port found */ -- cgit v1.2.3 From 94d12d88b4a849765bf7185c876c6c672ad4714e Mon Sep 17 00:00:00 2001 From: Arınç ÜNAL Date: Sat, 17 Jun 2023 09:26:49 +0300 Subject: MAINTAINERS: add me as maintainer of MEDIATEK SWITCH DRIVER MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add me as a maintainer of the MediaTek MT7530 DSA subdriver. List maintainers in alphabetical order by first name. Signed-off-by: Arınç ÜNAL Reviewed-by: Vladimir Oltean Reviewed-by: Matthias Brugger Reviewed-by: Matthias Brugger Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- MAINTAINERS | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 82203351ea38..5a7f13e6c6c1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13270,10 +13270,11 @@ F: drivers/memory/mtk-smi.c F: include/soc/mediatek/smi.h MEDIATEK SWITCH DRIVER -M: Sean Wang +M: Arınç ÜNAL +M: Daniel Golle M: Landen Chao M: DENG Qingfang -M: Daniel Golle +M: Sean Wang L: netdev@vger.kernel.org S: Maintained F: drivers/net/dsa/mt7530-mdio.c -- cgit v1.2.3 From a7299a18a179a9713651fce9ad00972a633c14a9 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 20 Jun 2023 17:57:31 +0800 Subject: btrfs: fix u32 overflows when left shifting stripe_nr [BUG] David reported an ASSERT() get triggered during fio load on 8 devices with data/raid6 and metadata/raid1c3: fio --rw=randrw --randrepeat=1 --size=3000m \ --bsrange=512b-64k --bs_unaligned \ --ioengine=libaio --fsync=1024 \ --name=job0 --name=job1 \ The ASSERT() is from rbio_add_bio() of raid56.c: ASSERT(orig_logical >= full_stripe_start && orig_logical + orig_len <= full_stripe_start + rbio->nr_data * BTRFS_STRIPE_LEN); Which is checking if the target rbio is crossing the full stripe boundary. 
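Before the kernel log below, the arithmetic failure mode analyzed in [CAUSE] can be reproduced in isolation. This is a standalone user-space sketch, not btrfs code; it only assumes BTRFS_STRIPE_LEN is 64K, i.e. a shift of 16:

	#include <stdio.h>
	#include <stdint.h>
	#include <inttypes.h>

	int main(void)
	{
		uint32_t stripe_nr = 0x14000;	/* an arbitrary large stripe number */

		/* The shift is evaluated in 32 bits, so the high bits are
		 * lost before the assignment widens the result to 64 bits.
		 */
		uint64_t wrong = stripe_nr << 16;

		/* Casting first makes the shift happen in 64 bits. */
		uint64_t right = (uint64_t)stripe_nr << 16;

		/* Prints wrong=0x40000000 right=0x140000000 */
		printf("wrong=0x%" PRIx64 " right=0x%" PRIx64 "\n", wrong, right);
		return 0;
	}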
[100.789] assertion failed: orig_logical >= full_stripe_start && orig_logical + orig_len <= full_stripe_start + rbio->nr_data * BTRFS_STRIPE_LEN, in fs/btrfs/raid56.c:1622 [100.795] ------------[ cut here ]------------ [100.796] kernel BUG at fs/btrfs/raid56.c:1622! [100.797] invalid opcode: 0000 [#1] PREEMPT SMP KASAN [100.798] CPU: 1 PID: 100 Comm: kworker/u8:4 Not tainted 6.4.0-rc6-default+ #124 [100.799] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552-rebuilt.opensuse.org 04/01/2014 [100.802] Workqueue: writeback wb_workfn (flush-btrfs-1) [100.803] RIP: 0010:rbio_add_bio+0x204/0x210 [btrfs] [100.806] RSP: 0018:ffff888104a8f300 EFLAGS: 00010246 [100.808] RAX: 00000000000000a1 RBX: ffff8881075907e0 RCX: ffffed1020951e01 [100.809] RDX: 0000000000000000 RSI: 0000000000000008 RDI: 0000000000000001 [100.811] RBP: 0000000141d20000 R08: 0000000000000001 R09: ffff888104a8f04f [100.813] R10: ffffed1020951e09 R11: 0000000000000003 R12: ffff88810e87f400 [100.815] R13: 0000000041d20000 R14: 0000000144529000 R15: ffff888101524000 [100.817] FS: 0000000000000000(0000) GS:ffff88811ac00000(0000) knlGS:0000000000000000 [100.821] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [100.822] CR2: 000055d54e44c270 CR3: 000000010a9a1006 CR4: 00000000003706a0 [100.824] Call Trace: [100.825] [100.825] ? die+0x32/0x80 [100.826] ? do_trap+0x12d/0x160 [100.827] ? rbio_add_bio+0x204/0x210 [btrfs] [100.827] ? rbio_add_bio+0x204/0x210 [btrfs] [100.829] ? do_error_trap+0x90/0x130 [100.830] ? rbio_add_bio+0x204/0x210 [btrfs] [100.831] ? handle_invalid_op+0x2c/0x30 [100.833] ? rbio_add_bio+0x204/0x210 [btrfs] [100.835] ? exc_invalid_op+0x29/0x40 [100.836] ? asm_exc_invalid_op+0x16/0x20 [100.837] ? rbio_add_bio+0x204/0x210 [btrfs] [100.837] raid56_parity_write+0x64/0x270 [btrfs] [100.838] btrfs_submit_chunk+0x26e/0x800 [btrfs] [100.840] ? btrfs_bio_init+0x80/0x80 [btrfs] [100.841] ? release_pages+0x503/0x6d0 [100.842] ? folio_unlock+0x2f/0x60 [100.844] ? __folio_put+0x60/0x60 [100.845] ? btrfs_do_readpage+0xae0/0xae0 [btrfs] [100.847] btrfs_submit_bio+0x21/0x60 [btrfs] [100.847] submit_one_bio+0x6a/0xb0 [btrfs] [100.849] extent_write_cache_pages+0x395/0x680 [btrfs] [100.850] ? __extent_writepage+0x520/0x520 [btrfs] [100.851] ? mark_usage+0x190/0x190 [100.852] extent_writepages+0xdb/0x130 [btrfs] [100.853] ? extent_write_locked_range+0x480/0x480 [btrfs] [100.854] ? mark_usage+0x190/0x190 [100.854] ? attach_extent_buffer_page+0x220/0x220 [btrfs] [100.855] ? reacquire_held_locks+0x178/0x280 [100.856] ? writeback_sb_inodes+0x245/0x7f0 [100.857] do_writepages+0x102/0x2e0 [100.858] ? page_writeback_cpu_online+0x10/0x10 [100.859] ? __lock_release.isra.0+0x14a/0x4d0 [100.860] ? reacquire_held_locks+0x280/0x280 [100.861] ? __lock_acquired+0x1e9/0x3d0 [100.862] ? do_raw_spin_lock+0x1b0/0x1b0 [100.863] __writeback_single_inode+0x94/0x450 [100.864] writeback_sb_inodes+0x372/0x7f0 [100.864] ? lock_sync+0xd0/0xd0 [100.865] ? do_raw_spin_unlock+0x93/0xf0 [100.866] ? sync_inode_metadata+0xc0/0xc0 [100.867] ? rwsem_optimistic_spin+0x340/0x340 [100.868] __writeback_inodes_wb+0x70/0x130 [100.869] wb_writeback+0x2d1/0x530 [100.869] ? __writeback_inodes_wb+0x130/0x130 [100.870] ? lockdep_hardirqs_on_prepare.part.0+0xf1/0x1c0 [100.870] wb_do_writeback+0x3eb/0x480 [100.871] ? wb_writeback+0x530/0x530 [100.871] ? 
mark_lock_irq+0xcd0/0xcd0 [100.872] wb_workfn+0xe0/0x3f0 [CAUSE] Commit a97699d1d610 ("btrfs: replace map_lookup->stripe_len by BTRFS_STRIPE_LEN") changes how we calculate the map length, to reduce u64 division. Function btrfs_max_io_len() computes the length to the stripe boundary. It calculates the full stripe start offset (inside the chunk) by the following code: *full_stripe_start = rounddown(*stripe_nr, nr_data_stripes(map)) << BTRFS_STRIPE_LEN_SHIFT; The calculation itself is fine, but the value returned by rounddown() is dependent on both @stripe_nr (which is u32) and nr_data_stripes() (which returns int). Thus the result is also u32, and the following left shift can overflow u32. If such overflow happens, @full_stripe_start will be a value way smaller than @offset, causing the later "full_stripe_len - (offset - *full_stripe_start)" to underflow, thus making the later length calculation lose its stripe boundary limit, resulting in a write bio that exceeds the stripe boundary. There are some other locations like this, where a u32 @stripe_nr gets left shifted, which can lead to a similar overflow. [FIX] Fix all @stripe_nr left shifts with a type cast to u64 before the shift. The involved @stripe_nr or similar variables record the stripe number inside the chunk, which is small enough to be contained by u32, but their offset inside the chunk cannot fit into u32. Thus for those specific left shifts a type cast to u64 is necessary; this patch only adds the casts without changing the variable types, keeping the fix minimal, and the code will be cleaned up in the future. Reported-by: David Sterba Fixes: a97699d1d610 ("btrfs: replace map_lookup->stripe_len by BTRFS_STRIPE_LEN") Tested-by: David Sterba Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 841e799dece5..e60beb14852a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5975,12 +5975,12 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT; /* stripe_offset is the offset of this block in its stripe */ - stripe_offset = offset - (stripe_nr << BTRFS_STRIPE_LEN_SHIFT); + stripe_offset = offset - ((u64)stripe_nr << BTRFS_STRIPE_LEN_SHIFT); stripe_nr_end = round_up(offset + length, BTRFS_STRIPE_LEN) >> BTRFS_STRIPE_LEN_SHIFT; stripe_cnt = stripe_nr_end - stripe_nr; - stripe_end_offset = (stripe_nr_end << BTRFS_STRIPE_LEN_SHIFT) - + stripe_end_offset = ((u64)stripe_nr_end << BTRFS_STRIPE_LEN_SHIFT) - (offset + length); /* * after this, stripe_nr is the number of stripes on this @@ -6023,7 +6023,7 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, for (i = 0; i < *num_stripes; i++) { stripes[i].physical = map->stripes[stripe_index].physical + - stripe_offset + (stripe_nr << BTRFS_STRIPE_LEN_SHIFT); + stripe_offset + ((u64)stripe_nr << BTRFS_STRIPE_LEN_SHIFT); stripes[i].dev = map->stripes[stripe_index].dev; if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | @@ -6196,9 +6196,11 @@ static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op, * not ensured to be power of 2. */ *full_stripe_start = - rounddown(*stripe_nr, nr_data_stripes(map)) << + (u64)rounddown(*stripe_nr, nr_data_stripes(map)) << BTRFS_STRIPE_LEN_SHIFT; + ASSERT(*full_stripe_start + full_stripe_len > offset); + ASSERT(*full_stripe_start <= offset); /* * For writes to RAID56, allow to write a full stripe set, but * no straddling of stripe sets.
@@ -6221,7 +6223,7 @@ static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup * { dst->dev = map->stripes[stripe_index].dev; dst->physical = map->stripes[stripe_index].physical + - stripe_offset + (stripe_nr << BTRFS_STRIPE_LEN_SHIFT); + stripe_offset + ((u64)stripe_nr << BTRFS_STRIPE_LEN_SHIFT); } int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, -- cgit v1.2.3 From 7580e0a78eb29e7bb1a772eba4088250bbb70d41 Mon Sep 17 00:00:00 2001 From: Ross Lagerwall Date: Fri, 16 Jun 2023 17:45:49 +0100 Subject: be2net: Extend xmit workaround to BE3 chip We have seen a bug where the NIC incorrectly changes the length in the IP header of a padded packet to include the padding bytes. The driver already has a workaround for this so do the workaround for this NIC too. This resolves the issue. The NIC in question identifies itself as follows: [ 8.828494] be2net 0000:02:00.0: FW version is 10.7.110.31 [ 8.834759] be2net 0000:02:00.0: Emulex OneConnect(be3): PF FLEX10 port 1 02:00.0 Ethernet controller: Emulex Corporation OneConnect 10Gb NIC (be3) (rev 01) Fixes: ca34fe38f06d ("be2net: fix wrong usage of adapter->generation") Signed-off-by: Ross Lagerwall Link: https://lore.kernel.org/r/20230616164549.2863037-1-ross.lagerwall@citrix.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/emulex/benet/be_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index 7e408bcc88de..0defd519ba62 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -1135,8 +1135,8 @@ static struct sk_buff *be_lancer_xmit_workarounds(struct be_adapter *adapter, eth_hdr_len = ntohs(skb->protocol) == ETH_P_8021Q ? VLAN_ETH_HLEN : ETH_HLEN; if (skb->len <= 60 && - (lancer_chip(adapter) || skb_vlan_tag_present(skb)) && - is_ipv4_pkt(skb)) { + (lancer_chip(adapter) || BE3_chip(adapter) || + skb_vlan_tag_present(skb)) && is_ipv4_pkt(skb)) { ip = (struct iphdr *)ip_hdr(skb); pskb_trim(skb, eth_hdr_len + ntohs(ip->tot_len)); } -- cgit v1.2.3 From 4e9f0ec38852c18faa9689322e758575af33e5d4 Mon Sep 17 00:00:00 2001 From: Mukesh Sisodiya Date: Mon, 19 Jun 2023 17:02:34 +0200 Subject: wifi: iwlwifi: pcie: Handle SO-F device for PCI id 0x7AF0 Add support for AX1690i and AX1690s devices with PCIE id 0x7AF0. 
Cc: stable@vger.kernel.org # 6.1+ Signed-off-by: Mukesh Sisodiya Signed-off-by: Gregory Greenman Signed-off-by: Johannes Berg Link: https://lore.kernel.org/r/20230619150233.461290-2-johannes@sipsolutions.net Signed-off-by: Jakub Kicinski --- drivers/net/wireless/intel/iwlwifi/pcie/drv.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c index dba112394838..79115eb1c285 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c @@ -548,6 +548,8 @@ static const struct iwl_dev_info iwl_dev_info_table[] = { IWL_DEV_INFO(0x54F0, 0x1692, iwlax411_2ax_cfg_so_gf4_a0, iwl_ax411_killer_1690i_name), IWL_DEV_INFO(0x7A70, 0x1691, iwlax411_2ax_cfg_so_gf4_a0, iwl_ax411_killer_1690s_name), IWL_DEV_INFO(0x7A70, 0x1692, iwlax411_2ax_cfg_so_gf4_a0, iwl_ax411_killer_1690i_name), + IWL_DEV_INFO(0x7AF0, 0x1691, iwlax411_2ax_cfg_so_gf4_a0, iwl_ax411_killer_1690s_name), + IWL_DEV_INFO(0x7AF0, 0x1692, iwlax411_2ax_cfg_so_gf4_a0, iwl_ax411_killer_1690i_name), IWL_DEV_INFO(0x271C, 0x0214, iwl9260_2ac_cfg, iwl9260_1_name), IWL_DEV_INFO(0x7E40, 0x1691, iwl_cfg_ma_a0_gf4_a0, iwl_ax411_killer_1690s_name), -- cgit v1.2.3 From c8e796895e2310b6130e7577248da1d771431a77 Mon Sep 17 00:00:00 2001 From: Russ Weight Date: Tue, 20 Jun 2023 13:28:24 -0700 Subject: regmap: spi-avmm: Fix regmap_bus max_raw_write The max_raw_write member of the regmap_spi_avmm_bus structure is defined as: .max_raw_write = SPI_AVMM_VAL_SIZE * MAX_WRITE_CNT SPI_AVMM_VAL_SIZE == 4 and MAX_WRITE_CNT == 1 so this results in a maximum write transfer size of 4 bytes which provides only enough space to transfer the address of the target register. It provides no space for the value to be transferred. This bug became an issue (divide-by-zero in _regmap_raw_write()) after the following was accepted into mainline: commit 3981514180c9 ("regmap: Account for register length when chunking") Change max_raw_write to include space (4 additional bytes) for both the register address and value: .max_raw_write = SPI_AVMM_REG_SIZE + SPI_AVMM_VAL_SIZE * MAX_WRITE_CNT Fixes: 7f9fb67358a2 ("regmap: add Intel SPI Slave to AVMM Bus Bridge support") Reviewed-by: Matthew Gerlach Signed-off-by: Russ Weight Link: https://lore.kernel.org/r/20230620202824.380313-1-russell.h.weight@intel.com Signed-off-by: Mark Brown --- drivers/base/regmap/regmap-spi-avmm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/base/regmap/regmap-spi-avmm.c b/drivers/base/regmap/regmap-spi-avmm.c index 4c2b94b3e30b..6af692844c19 100644 --- a/drivers/base/regmap/regmap-spi-avmm.c +++ b/drivers/base/regmap/regmap-spi-avmm.c @@ -660,7 +660,7 @@ static const struct regmap_bus regmap_spi_avmm_bus = { .reg_format_endian_default = REGMAP_ENDIAN_NATIVE, .val_format_endian_default = REGMAP_ENDIAN_NATIVE, .max_raw_read = SPI_AVMM_VAL_SIZE * MAX_READ_CNT, - .max_raw_write = SPI_AVMM_VAL_SIZE * MAX_WRITE_CNT, + .max_raw_write = SPI_AVMM_REG_SIZE + SPI_AVMM_VAL_SIZE * MAX_WRITE_CNT, .free_context = spi_avmm_bridge_ctx_free, }; -- cgit v1.2.3 From 4bedf9eee016286c835e3d8fa981ddece5338795 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 16 Jun 2023 14:45:22 +0200 Subject: netfilter: nf_tables: fix chain binding transaction logic Add bound flag to rule and chain transactions as in 6a0a8d10a366 ("netfilter: nf_tables: use-after-free in failing rule with bound set") to skip them in case that the chain is already bound from the abort path. 
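Concretely, binding walks the pending transaction list and flags every record that belongs to the chain, which is what lets the abort path skip releasing them twice; condensed from the nft_chain_trans_bind() hunk in the diff below (trans, nft_net and chain as in the patch):

	list_for_each_entry_reverse(trans, &nft_net->commit_list, list) {
		switch (trans->msg_type) {
		case NFT_MSG_NEWCHAIN:
			if (nft_trans_chain(trans) == chain)
				nft_trans_chain_bound(trans) = true;
			break;
		case NFT_MSG_NEWRULE:
			if (trans->ctx.chain == chain)
				nft_trans_rule_bound(trans) = true;
			break;
		}
	}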
This patch fixes an imbalance in the chain use refcnt that triggers a WARN_ON on the table and chain destroy path. This patch also disallows nested chain bindings, which is not supported from userspace. The logic to deal with chain binding in nft_data_hold() and nft_data_release() is not correct. The NFT_TRANS_PREPARE state needs a special handling in case a chain is bound but next expressions in the same rule fail to initialize as described by 1240eb93f061 ("netfilter: nf_tables: incorrect error path handling with NFT_MSG_NEWRULE"). The chain is left bound if rule construction fails, so the objects stored in this chain (and the chain itself) are released by the transaction records from the abort path, follow up patch ("netfilter: nf_tables: add NFT_TRANS_PREPARE_ERROR to deal with bound set/chain") completes this error handling. When deleting an existing rule, chain bound flag is set off so the rule expression .destroy path releases the objects. Fixes: d0e2c7de92c7 ("netfilter: nf_tables: add NFT_CHAIN_BINDING") Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 21 +++++++++- net/netfilter/nf_tables_api.c | 86 ++++++++++++++++++++++++-------------- net/netfilter/nft_immediate.c | 87 +++++++++++++++++++++++++++++++++++---- 3 files changed, 153 insertions(+), 41 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 83db182decc8..8ba7f81e81a6 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1009,7 +1009,10 @@ static inline struct nft_userdata *nft_userdata(const struct nft_rule *rule) return (void *)&rule->data[rule->dlen]; } -void nf_tables_rule_release(const struct nft_ctx *ctx, struct nft_rule *rule); +void nft_rule_expr_activate(const struct nft_ctx *ctx, struct nft_rule *rule); +void nft_rule_expr_deactivate(const struct nft_ctx *ctx, struct nft_rule *rule, + enum nft_trans_phase phase); +void nf_tables_rule_destroy(const struct nft_ctx *ctx, struct nft_rule *rule); static inline void nft_set_elem_update_expr(const struct nft_set_ext *ext, struct nft_regs *regs, @@ -1104,6 +1107,7 @@ int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_set_iter *iter, struct nft_set_elem *elem); int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set); +int nf_tables_bind_chain(const struct nft_ctx *ctx, struct nft_chain *chain); enum nft_chain_types { NFT_CHAIN_T_DEFAULT = 0, @@ -1140,11 +1144,17 @@ int nft_chain_validate_dependency(const struct nft_chain *chain, int nft_chain_validate_hooks(const struct nft_chain *chain, unsigned int hook_flags); +static inline bool nft_chain_binding(const struct nft_chain *chain) +{ + return chain->flags & NFT_CHAIN_BINDING; +} + static inline bool nft_chain_is_bound(struct nft_chain *chain) { return (chain->flags & NFT_CHAIN_BINDING) && chain->bound; } +int nft_chain_add(struct nft_table *table, struct nft_chain *chain); void nft_chain_del(struct nft_chain *chain); void nf_tables_chain_destroy(struct nft_ctx *ctx); @@ -1575,6 +1585,7 @@ struct nft_trans_rule { struct nft_rule *rule; struct nft_flow_rule *flow; u32 rule_id; + bool bound; }; #define nft_trans_rule(trans) \ @@ -1583,6 +1594,8 @@ struct nft_trans_rule { (((struct nft_trans_rule *)trans->data)->flow) #define nft_trans_rule_id(trans) \ (((struct nft_trans_rule *)trans->data)->rule_id) +#define nft_trans_rule_bound(trans) \ + (((struct nft_trans_rule *)trans->data)->bound) struct nft_trans_set { struct nft_set *set; @@ -1607,15 +1620,19 @@ 
struct nft_trans_set { (((struct nft_trans_set *)trans->data)->gc_int) struct nft_trans_chain { + struct nft_chain *chain; bool update; char *name; struct nft_stats __percpu *stats; u8 policy; + bool bound; u32 chain_id; struct nft_base_chain *basechain; struct list_head hook_list; }; +#define nft_trans_chain(trans) \ + (((struct nft_trans_chain *)trans->data)->chain) #define nft_trans_chain_update(trans) \ (((struct nft_trans_chain *)trans->data)->update) #define nft_trans_chain_name(trans) \ @@ -1624,6 +1641,8 @@ struct nft_trans_chain { (((struct nft_trans_chain *)trans->data)->stats) #define nft_trans_chain_policy(trans) \ (((struct nft_trans_chain *)trans->data)->policy) +#define nft_trans_chain_bound(trans) \ + (((struct nft_trans_chain *)trans->data)->bound) #define nft_trans_chain_id(trans) \ (((struct nft_trans_chain *)trans->data)->chain_id) #define nft_trans_basechain(trans) \ diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 69bceefaa5c8..5e4965b98528 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -193,6 +193,48 @@ static void nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set) } } +static void nft_chain_trans_bind(const struct nft_ctx *ctx, struct nft_chain *chain) +{ + struct nftables_pernet *nft_net; + struct net *net = ctx->net; + struct nft_trans *trans; + + if (!nft_chain_binding(chain)) + return; + + nft_net = nft_pernet(net); + list_for_each_entry_reverse(trans, &nft_net->commit_list, list) { + switch (trans->msg_type) { + case NFT_MSG_NEWCHAIN: + if (nft_trans_chain(trans) == chain) + nft_trans_chain_bound(trans) = true; + break; + case NFT_MSG_NEWRULE: + if (trans->ctx.chain == chain) + nft_trans_rule_bound(trans) = true; + break; + } + } +} + +int nf_tables_bind_chain(const struct nft_ctx *ctx, struct nft_chain *chain) +{ + if (!nft_chain_binding(chain)) + return 0; + + if (nft_chain_binding(ctx->chain)) + return -EOPNOTSUPP; + + if (chain->bound) + return -EBUSY; + + chain->bound = true; + chain->use++; + nft_chain_trans_bind(ctx, chain); + + return 0; +} + static int nft_netdev_register_hooks(struct net *net, struct list_head *hook_list) { @@ -338,8 +380,9 @@ static struct nft_trans *nft_trans_chain_add(struct nft_ctx *ctx, int msg_type) ntohl(nla_get_be32(ctx->nla[NFTA_CHAIN_ID])); } } - + nft_trans_chain(trans) = ctx->chain; nft_trans_commit_list_add_tail(ctx->net, trans); + return trans; } @@ -357,8 +400,7 @@ static int nft_delchain(struct nft_ctx *ctx) return 0; } -static void nft_rule_expr_activate(const struct nft_ctx *ctx, - struct nft_rule *rule) +void nft_rule_expr_activate(const struct nft_ctx *ctx, struct nft_rule *rule) { struct nft_expr *expr; @@ -371,9 +413,8 @@ static void nft_rule_expr_activate(const struct nft_ctx *ctx, } } -static void nft_rule_expr_deactivate(const struct nft_ctx *ctx, - struct nft_rule *rule, - enum nft_trans_phase phase) +void nft_rule_expr_deactivate(const struct nft_ctx *ctx, struct nft_rule *rule, + enum nft_trans_phase phase) { struct nft_expr *expr; @@ -2226,7 +2267,7 @@ static int nft_basechain_init(struct nft_base_chain *basechain, u8 family, return 0; } -static int nft_chain_add(struct nft_table *table, struct nft_chain *chain) +int nft_chain_add(struct nft_table *table, struct nft_chain *chain) { int err; @@ -3490,8 +3531,7 @@ err_fill_rule_info: return err; } -static void nf_tables_rule_destroy(const struct nft_ctx *ctx, - struct nft_rule *rule) +void nf_tables_rule_destroy(const struct nft_ctx *ctx, struct nft_rule *rule) { struct 
nft_expr *expr, *next; @@ -3508,7 +3548,7 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx, kfree(rule); } -void nf_tables_rule_release(const struct nft_ctx *ctx, struct nft_rule *rule) +static void nf_tables_rule_release(const struct nft_ctx *ctx, struct nft_rule *rule) { nft_rule_expr_deactivate(ctx, rule, NFT_TRANS_RELEASE); nf_tables_rule_destroy(ctx, rule); @@ -6638,7 +6678,6 @@ static int nf_tables_newsetelem(struct sk_buff *skb, void nft_data_hold(const struct nft_data *data, enum nft_data_types type) { struct nft_chain *chain; - struct nft_rule *rule; if (type == NFT_DATA_VERDICT) { switch (data->verdict.code) { @@ -6646,15 +6685,6 @@ void nft_data_hold(const struct nft_data *data, enum nft_data_types type) case NFT_GOTO: chain = data->verdict.chain; chain->use++; - - if (!nft_chain_is_bound(chain)) - break; - - chain->table->use++; - list_for_each_entry(rule, &chain->rules, list) - chain->use++; - - nft_chain_add(chain->table, chain); break; } } @@ -9677,7 +9707,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) kfree(nft_trans_chain_name(trans)); nft_trans_destroy(trans); } else { - if (nft_chain_is_bound(trans->ctx.chain)) { + if (nft_trans_chain_bound(trans)) { nft_trans_destroy(trans); break; } @@ -9700,6 +9730,10 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) nft_trans_destroy(trans); break; case NFT_MSG_NEWRULE: + if (nft_trans_rule_bound(trans)) { + nft_trans_destroy(trans); + break; + } trans->ctx.chain->use--; list_del_rcu(&nft_trans_rule(trans)->list); nft_rule_expr_deactivate(&trans->ctx, @@ -10263,22 +10297,12 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, static void nft_verdict_uninit(const struct nft_data *data) { struct nft_chain *chain; - struct nft_rule *rule; switch (data->verdict.code) { case NFT_JUMP: case NFT_GOTO: chain = data->verdict.chain; chain->use--; - - if (!nft_chain_is_bound(chain)) - break; - - chain->table->use--; - list_for_each_entry(rule, &chain->rules, list) - chain->use--; - - nft_chain_del(chain); break; } } diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c index c9d2f7c29f53..0492a04064f1 100644 --- a/net/netfilter/nft_immediate.c +++ b/net/netfilter/nft_immediate.c @@ -76,11 +76,9 @@ static int nft_immediate_init(const struct nft_ctx *ctx, switch (priv->data.verdict.code) { case NFT_JUMP: case NFT_GOTO: - if (nft_chain_is_bound(chain)) { - err = -EBUSY; - goto err1; - } - chain->bound = true; + err = nf_tables_bind_chain(ctx, chain); + if (err < 0) + return err; break; default: break; @@ -98,6 +96,31 @@ static void nft_immediate_activate(const struct nft_ctx *ctx, const struct nft_expr *expr) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); + const struct nft_data *data = &priv->data; + struct nft_ctx chain_ctx; + struct nft_chain *chain; + struct nft_rule *rule; + + if (priv->dreg == NFT_REG_VERDICT) { + switch (data->verdict.code) { + case NFT_JUMP: + case NFT_GOTO: + chain = data->verdict.chain; + if (!nft_chain_binding(chain)) + break; + + chain_ctx = *ctx; + chain_ctx.chain = chain; + + list_for_each_entry(rule, &chain->rules, list) + nft_rule_expr_activate(&chain_ctx, rule); + + nft_clear(ctx->net, chain); + break; + default: + break; + } + } return nft_data_hold(&priv->data, nft_dreg_to_type(priv->dreg)); } @@ -107,6 +130,40 @@ static void nft_immediate_deactivate(const struct nft_ctx *ctx, enum nft_trans_phase phase) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); + 
const struct nft_data *data = &priv->data; + struct nft_ctx chain_ctx; + struct nft_chain *chain; + struct nft_rule *rule; + + if (priv->dreg == NFT_REG_VERDICT) { + switch (data->verdict.code) { + case NFT_JUMP: + case NFT_GOTO: + chain = data->verdict.chain; + if (!nft_chain_binding(chain)) + break; + + chain_ctx = *ctx; + chain_ctx.chain = chain; + + list_for_each_entry(rule, &chain->rules, list) + nft_rule_expr_deactivate(&chain_ctx, rule, phase); + + switch (phase) { + case NFT_TRANS_PREPARE: + nft_deactivate_next(ctx->net, chain); + break; + default: + nft_chain_del(chain); + chain->bound = false; + chain->table->use--; + break; + } + break; + default: + break; + } + } if (phase == NFT_TRANS_COMMIT) return; @@ -131,15 +188,27 @@ static void nft_immediate_destroy(const struct nft_ctx *ctx, case NFT_GOTO: chain = data->verdict.chain; - if (!nft_chain_is_bound(chain)) + if (!nft_chain_binding(chain)) + break; + + /* Rule construction failed, but chain is already bound: + * let the transaction records release this chain and its rules. + */ + if (chain->bound) { + chain->use--; break; + } + /* Rule has been deleted, release chain and its rules. */ chain_ctx = *ctx; chain_ctx.chain = chain; - list_for_each_entry_safe(rule, n, &chain->rules, list) - nf_tables_rule_release(&chain_ctx, rule); - + chain->use--; + list_for_each_entry_safe(rule, n, &chain->rules, list) { + chain->use--; + list_del(&rule->list); + nf_tables_rule_destroy(&chain_ctx, rule); + } nf_tables_chain_destroy(&chain_ctx); break; default: -- cgit v1.2.3
From 26b5a5712eb85e253724e56a54c17f8519bd8e4e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 16 Jun 2023 14:45:26 +0200 Subject: netfilter: nf_tables: add NFT_TRANS_PREPARE_ERROR to deal with bound set/chain Add a new state to deal with rule expression deactivation from the newrule error path; otherwise the anonymous set remains in the list in inactive state for the next generation. Mark the set/chain transaction as unbound so the abort path releases this object, and set it as inactive in the next generation so that it is no longer reachable from this transaction and its reference counter is dropped.
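To make the role of the new phase concrete, here is a small stand-alone C model of the idea (illustrative only; all names are invented and this is not the kernel code): a dedicated PREPARE_ERROR phase both unbinds the object and hides it from the next generation in one step.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical names throughout; this only models the phase logic. */
enum trans_phase {
	TRANS_PREPARE,
	TRANS_PREPARE_ERROR,	/* new: taken on the newrule error path */
	TRANS_ABORT,
	TRANS_COMMIT,
	TRANS_RELEASE,
};

struct obj {
	bool bound;		/* still tracked by a transaction binding */
	bool active_next_gen;	/* visible in the next generation */
	int use;
};

static void deactivate(struct obj *o, enum trans_phase phase)
{
	switch (phase) {
	case TRANS_PREPARE_ERROR:
		o->bound = false;		/* abort path may now release it */
		o->active_next_gen = false;	/* unreachable from this transaction */
		o->use--;
		break;
	case TRANS_PREPARE:
		o->active_next_gen = false;
		o->use--;
		break;
	default:
		o->use--;
		break;
	}
}

int main(void)
{
	struct obj anon_set = { .bound = true, .active_next_gen = true, .use = 1 };

	deactivate(&anon_set, TRANS_PREPARE_ERROR);
	printf("bound=%d active=%d use=%d\n",
	       anon_set.bound, anon_set.active_next_gen, anon_set.use);
	return 0;
}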
Fixes: 1240eb93f061 ("netfilter: nf_tables: incorrect error path handling with NFT_MSG_NEWRULE") Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 ++ net/netfilter/nf_tables_api.c | 45 +++++++++++++++++++++++++++++++++------ net/netfilter/nft_immediate.c | 3 +++ 3 files changed, 43 insertions(+), 7 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 8ba7f81e81a6..de2b0130c151 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -901,6 +901,7 @@ struct nft_expr_type { enum nft_trans_phase { NFT_TRANS_PREPARE, + NFT_TRANS_PREPARE_ERROR, NFT_TRANS_ABORT, NFT_TRANS_COMMIT, NFT_TRANS_RELEASE @@ -1108,6 +1109,7 @@ int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_elem *elem); int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set); int nf_tables_bind_chain(const struct nft_ctx *ctx, struct nft_chain *chain); +void nf_tables_unbind_chain(const struct nft_ctx *ctx, struct nft_chain *chain); enum nft_chain_types { NFT_CHAIN_T_DEFAULT = 0, diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 5e4965b98528..f8afefc8294f 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -169,7 +169,8 @@ static void nft_trans_destroy(struct nft_trans *trans) kfree(trans); } -static void nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set) +static void __nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set, + bool bind) { struct nftables_pernet *nft_net; struct net *net = ctx->net; @@ -183,17 +184,28 @@ static void nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set) switch (trans->msg_type) { case NFT_MSG_NEWSET: if (nft_trans_set(trans) == set) - nft_trans_set_bound(trans) = true; + nft_trans_set_bound(trans) = bind; break; case NFT_MSG_NEWSETELEM: if (nft_trans_elem_set(trans) == set) - nft_trans_elem_set_bound(trans) = true; + nft_trans_elem_set_bound(trans) = bind; break; } } } -static void nft_chain_trans_bind(const struct nft_ctx *ctx, struct nft_chain *chain) +static void nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set) +{ + return __nft_set_trans_bind(ctx, set, true); +} + +static void nft_set_trans_unbind(const struct nft_ctx *ctx, struct nft_set *set) +{ + return __nft_set_trans_bind(ctx, set, false); +} + +static void __nft_chain_trans_bind(const struct nft_ctx *ctx, + struct nft_chain *chain, bool bind) { struct nftables_pernet *nft_net; struct net *net = ctx->net; @@ -207,16 +219,22 @@ static void nft_chain_trans_bind(const struct nft_ctx *ctx, struct nft_chain *ch switch (trans->msg_type) { case NFT_MSG_NEWCHAIN: if (nft_trans_chain(trans) == chain) - nft_trans_chain_bound(trans) = true; + nft_trans_chain_bound(trans) = bind; break; case NFT_MSG_NEWRULE: if (trans->ctx.chain == chain) - nft_trans_rule_bound(trans) = true; + nft_trans_rule_bound(trans) = bind; break; } } } +static void nft_chain_trans_bind(const struct nft_ctx *ctx, + struct nft_chain *chain) +{ + __nft_chain_trans_bind(ctx, chain, true); +} + int nf_tables_bind_chain(const struct nft_ctx *ctx, struct nft_chain *chain) { if (!nft_chain_binding(chain)) @@ -235,6 +253,11 @@ int nf_tables_bind_chain(const struct nft_ctx *ctx, struct nft_chain *chain) return 0; } +void nf_tables_unbind_chain(const struct nft_ctx *ctx, struct nft_chain *chain) +{ + __nft_chain_trans_bind(ctx, chain, false); +} + static int nft_netdev_register_hooks(struct net *net, struct 
list_head *hook_list) { @@ -3884,7 +3907,7 @@ err_destroy_flow_rule: if (flow) nft_flow_rule_destroy(flow); err_release_rule: - nft_rule_expr_deactivate(&ctx, rule, NFT_TRANS_PREPARE); + nft_rule_expr_deactivate(&ctx, rule, NFT_TRANS_PREPARE_ERROR); nf_tables_rule_destroy(&ctx, rule); err_release_expr: for (i = 0; i < n; i++) { @@ -5183,6 +5206,13 @@ void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set, enum nft_trans_phase phase) { switch (phase) { + case NFT_TRANS_PREPARE_ERROR: + nft_set_trans_unbind(ctx, set); + if (nft_set_is_anonymous(set)) + nft_deactivate_next(ctx->net, set); + + set->use--; + break; case NFT_TRANS_PREPARE: if (nft_set_is_anonymous(set)) nft_deactivate_next(ctx->net, set); @@ -7701,6 +7731,7 @@ void nf_tables_deactivate_flowtable(const struct nft_ctx *ctx, enum nft_trans_phase phase) { switch (phase) { + case NFT_TRANS_PREPARE_ERROR: case NFT_TRANS_PREPARE: case NFT_TRANS_ABORT: case NFT_TRANS_RELEASE: diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c index 0492a04064f1..3d76ebfe8939 100644 --- a/net/netfilter/nft_immediate.c +++ b/net/netfilter/nft_immediate.c @@ -150,6 +150,9 @@ static void nft_immediate_deactivate(const struct nft_ctx *ctx, nft_rule_expr_deactivate(&chain_ctx, rule, phase); switch (phase) { + case NFT_TRANS_PREPARE_ERROR: + nf_tables_unbind_chain(ctx, chain); + fallthrough; case NFT_TRANS_PREPARE: nft_deactivate_next(ctx->net, chain); break; -- cgit v1.2.3 From 628bd3e49cba1c066228e23d71a852c23e26da73 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 16 Jun 2023 14:51:49 +0200 Subject: netfilter: nf_tables: drop map element references from preparation phase set .destroy callback releases the references to other objects in maps. This is very late and it results in spurious EBUSY errors. Drop refcount from the preparation phase instead, update set backend not to drop reference counter from set .destroy path. Exceptions: NFT_TRANS_PREPARE_ERROR does not require to drop the reference counter because the transaction abort path releases the map references for each element since the set is unbound. The abort path also deals with releasing reference counter for new elements added to unbound sets. 
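The ordering can be pictured with a toy refcount model (hypothetical names, not the kernel API): references held by map elements are dropped when the transaction is prepared, so the objects they point to become deletable in the same batch, while the later destroy step only frees memory.

#include <assert.h>
#include <stdio.h>

/* Invented types: a map element holding a reference on a chain. */
struct chain { int use; };
struct elem { struct chain *target; };

/* Preparation phase: walk the map and drop element references. */
static void elem_data_deactivate(struct elem *e)
{
	e->target->use--;
}

/* Commit phase: only free memory, the references are already gone. */
static void elem_destroy(struct elem *e)
{
	(void)e;	/* free(e) in a real implementation */
}

int main(void)
{
	struct chain c = { .use = 1 };
	struct elem e = { .target = &c };

	elem_data_deactivate(&e);	/* prepare: c.use drops to 0 ... */
	assert(c.use == 0);		/* ... so deleting c cannot hit EBUSY */
	elem_destroy(&e);		/* destroy: no double refcount drop */
	printf("chain use=%d\n", c.use);
	return 0;
}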
Fixes: 591054469b3e ("netfilter: nf_tables: revisit chain/object refcounting from elements") Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 5 +- net/netfilter/nf_tables_api.c | 147 +++++++++++++++++++++++++++++++++----- net/netfilter/nft_set_bitmap.c | 5 +- net/netfilter/nft_set_hash.c | 23 ++++-- net/netfilter/nft_set_pipapo.c | 14 ++-- net/netfilter/nft_set_rbtree.c | 5 +- 6 files changed, 167 insertions(+), 32 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index de2b0130c151..f84b6daea5c4 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -472,7 +472,8 @@ struct nft_set_ops { int (*init)(const struct nft_set *set, const struct nft_set_desc *desc, const struct nlattr * const nla[]); - void (*destroy)(const struct nft_set *set); + void (*destroy)(const struct nft_ctx *ctx, + const struct nft_set *set); void (*gc_init)(const struct nft_set *set); unsigned int elemsize; @@ -809,6 +810,8 @@ int nft_set_elem_expr_clone(const struct nft_ctx *ctx, struct nft_set *set, struct nft_expr *expr_array[]); void nft_set_elem_destroy(const struct nft_set *set, void *elem, bool destroy_expr); +void nf_tables_set_elem_destroy(const struct nft_ctx *ctx, + const struct nft_set *set, void *elem); /** * struct nft_set_gc_batch_head - nf_tables set garbage collection batch diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index f8afefc8294f..e2493f83c48e 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -559,6 +559,58 @@ static int nft_trans_set_add(const struct nft_ctx *ctx, int msg_type, return __nft_trans_set_add(ctx, msg_type, set, NULL); } +static void nft_setelem_data_deactivate(const struct net *net, + const struct nft_set *set, + struct nft_set_elem *elem); + +static int nft_mapelem_deactivate(const struct nft_ctx *ctx, + struct nft_set *set, + const struct nft_set_iter *iter, + struct nft_set_elem *elem) +{ + nft_setelem_data_deactivate(ctx->net, set, elem); + + return 0; +} + +struct nft_set_elem_catchall { + struct list_head list; + struct rcu_head rcu; + void *elem; +}; + +static void nft_map_catchall_deactivate(const struct nft_ctx *ctx, + struct nft_set *set) +{ + u8 genmask = nft_genmask_next(ctx->net); + struct nft_set_elem_catchall *catchall; + struct nft_set_elem elem; + struct nft_set_ext *ext; + + list_for_each_entry(catchall, &set->catchall_list, list) { + ext = nft_set_elem_ext(set, catchall->elem); + if (!nft_set_elem_active(ext, genmask)) + continue; + + elem.priv = catchall->elem; + nft_setelem_data_deactivate(ctx->net, set, &elem); + break; + } +} + +static void nft_map_deactivate(const struct nft_ctx *ctx, struct nft_set *set) +{ + struct nft_set_iter iter = { + .genmask = nft_genmask_next(ctx->net), + .fn = nft_mapelem_deactivate, + }; + + set->ops->walk(ctx, set, &iter); + WARN_ON_ONCE(iter.err); + + nft_map_catchall_deactivate(ctx, set); +} + static int nft_delset(const struct nft_ctx *ctx, struct nft_set *set) { int err; @@ -567,6 +619,9 @@ static int nft_delset(const struct nft_ctx *ctx, struct nft_set *set) if (err < 0) return err; + if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) + nft_map_deactivate(ctx, set); + nft_deactivate_next(ctx->net, set); ctx->table->use--; @@ -3659,12 +3714,6 @@ int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set, return 0; } -struct nft_set_elem_catchall { - struct list_head list; - struct rcu_head rcu; - void *elem; -}; - int nft_set_catchall_validate(const 
struct nft_ctx *ctx, struct nft_set *set) { u8 genmask = nft_genmask_next(ctx->net); @@ -4997,7 +5046,7 @@ err_set_expr_alloc: for (i = 0; i < set->num_exprs; i++) nft_expr_destroy(&ctx, set->exprs[i]); err_set_destroy: - ops->destroy(set); + ops->destroy(&ctx, set); err_set_init: kfree(set->name); err_set_name: @@ -5012,7 +5061,7 @@ static void nft_set_catchall_destroy(const struct nft_ctx *ctx, list_for_each_entry_safe(catchall, next, &set->catchall_list, list) { list_del_rcu(&catchall->list); - nft_set_elem_destroy(set, catchall->elem, true); + nf_tables_set_elem_destroy(ctx, set, catchall->elem); kfree_rcu(catchall, rcu); } } @@ -5027,7 +5076,7 @@ static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set) for (i = 0; i < set->num_exprs; i++) nft_expr_destroy(ctx, set->exprs[i]); - set->ops->destroy(set); + set->ops->destroy(ctx, set); nft_set_catchall_destroy(ctx, set); kfree(set->name); kvfree(set); @@ -5192,10 +5241,60 @@ static void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, } } +static void nft_setelem_data_activate(const struct net *net, + const struct nft_set *set, + struct nft_set_elem *elem); + +static int nft_mapelem_activate(const struct nft_ctx *ctx, + struct nft_set *set, + const struct nft_set_iter *iter, + struct nft_set_elem *elem) +{ + nft_setelem_data_activate(ctx->net, set, elem); + + return 0; +} + +static void nft_map_catchall_activate(const struct nft_ctx *ctx, + struct nft_set *set) +{ + u8 genmask = nft_genmask_next(ctx->net); + struct nft_set_elem_catchall *catchall; + struct nft_set_elem elem; + struct nft_set_ext *ext; + + list_for_each_entry(catchall, &set->catchall_list, list) { + ext = nft_set_elem_ext(set, catchall->elem); + if (!nft_set_elem_active(ext, genmask)) + continue; + + elem.priv = catchall->elem; + nft_setelem_data_activate(ctx->net, set, &elem); + break; + } +} + +static void nft_map_activate(const struct nft_ctx *ctx, struct nft_set *set) +{ + struct nft_set_iter iter = { + .genmask = nft_genmask_next(ctx->net), + .fn = nft_mapelem_activate, + }; + + set->ops->walk(ctx, set, &iter); + WARN_ON_ONCE(iter.err); + + nft_map_catchall_activate(ctx, set); +} + void nf_tables_activate_set(const struct nft_ctx *ctx, struct nft_set *set) { - if (nft_set_is_anonymous(set)) + if (nft_set_is_anonymous(set)) { + if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) + nft_map_activate(ctx, set); + nft_clear(ctx->net, set); + } set->use++; } @@ -5214,13 +5313,20 @@ void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set, set->use--; break; case NFT_TRANS_PREPARE: - if (nft_set_is_anonymous(set)) - nft_deactivate_next(ctx->net, set); + if (nft_set_is_anonymous(set)) { + if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) + nft_map_deactivate(ctx, set); + nft_deactivate_next(ctx->net, set); + } set->use--; return; case NFT_TRANS_ABORT: case NFT_TRANS_RELEASE: + if (nft_set_is_anonymous(set) && + set->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) + nft_map_deactivate(ctx, set); + set->use--; fallthrough; default: @@ -5973,6 +6079,7 @@ static void nft_set_elem_expr_destroy(const struct nft_ctx *ctx, __nft_set_elem_expr_destroy(ctx, expr); } +/* Drop references and destroy. Called from gc, dynset and abort path. 
*/ void nft_set_elem_destroy(const struct nft_set *set, void *elem, bool destroy_expr) { @@ -5994,11 +6101,11 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem, } EXPORT_SYMBOL_GPL(nft_set_elem_destroy); -/* Only called from commit path, nft_setelem_data_deactivate() already deals - * with the refcounting from the preparation phase. +/* Destroy element. References have been already dropped in the preparation + * path via nft_setelem_data_deactivate(). */ -static void nf_tables_set_elem_destroy(const struct nft_ctx *ctx, - const struct nft_set *set, void *elem) +void nf_tables_set_elem_destroy(const struct nft_ctx *ctx, + const struct nft_set *set, void *elem) { struct nft_set_ext *ext = nft_set_elem_ext(set, elem); @@ -6631,7 +6738,7 @@ err_elem_free: if (obj) obj->use--; err_elem_userdata: - nf_tables_set_elem_destroy(ctx, set, elem.priv); + nft_set_elem_destroy(set, elem.priv, true); err_parse_data: if (nla[NFTA_SET_ELEM_DATA] != NULL) nft_data_release(&elem.data.val, desc.type); @@ -9799,6 +9906,9 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) case NFT_MSG_DESTROYSET: trans->ctx.table->use++; nft_clear(trans->ctx.net, nft_trans_set(trans)); + if (nft_trans_set(trans)->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) + nft_map_activate(&trans->ctx, nft_trans_set(trans)); + nft_trans_destroy(trans); break; case NFT_MSG_NEWSETELEM: @@ -10568,6 +10678,9 @@ static void __nft_release_table(struct net *net, struct nft_table *table) list_for_each_entry_safe(set, ns, &table->sets, list) { list_del(&set->list); table->use--; + if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) + nft_map_deactivate(&ctx, set); + nft_set_destroy(&ctx, set); } list_for_each_entry_safe(obj, ne, &table->objects, list) { diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index 96081ac8d2b4..1e5e7a181e0b 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -271,13 +271,14 @@ static int nft_bitmap_init(const struct nft_set *set, return 0; } -static void nft_bitmap_destroy(const struct nft_set *set) +static void nft_bitmap_destroy(const struct nft_ctx *ctx, + const struct nft_set *set) { struct nft_bitmap *priv = nft_set_priv(set); struct nft_bitmap_elem *be, *n; list_for_each_entry_safe(be, n, &priv->list, head) - nft_set_elem_destroy(set, be, true); + nf_tables_set_elem_destroy(ctx, set, be); } static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features, diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 76de6c8d9865..0b73cb0e752f 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -400,19 +400,31 @@ static int nft_rhash_init(const struct nft_set *set, return 0; } +struct nft_rhash_ctx { + const struct nft_ctx ctx; + const struct nft_set *set; +}; + static void nft_rhash_elem_destroy(void *ptr, void *arg) { - nft_set_elem_destroy(arg, ptr, true); + struct nft_rhash_ctx *rhash_ctx = arg; + + nf_tables_set_elem_destroy(&rhash_ctx->ctx, rhash_ctx->set, ptr); } -static void nft_rhash_destroy(const struct nft_set *set) +static void nft_rhash_destroy(const struct nft_ctx *ctx, + const struct nft_set *set) { struct nft_rhash *priv = nft_set_priv(set); + struct nft_rhash_ctx rhash_ctx = { + .ctx = *ctx, + .set = set, + }; cancel_delayed_work_sync(&priv->gc_work); rcu_barrier(); rhashtable_free_and_destroy(&priv->ht, nft_rhash_elem_destroy, - (void *)set); + (void *)&rhash_ctx); } /* Number of buckets is stored in u32, so cap our result to 1U<<31 */ @@ -643,7 
+655,8 @@ static int nft_hash_init(const struct nft_set *set, return 0; } -static void nft_hash_destroy(const struct nft_set *set) +static void nft_hash_destroy(const struct nft_ctx *ctx, + const struct nft_set *set) { struct nft_hash *priv = nft_set_priv(set); struct nft_hash_elem *he; @@ -653,7 +666,7 @@ static void nft_hash_destroy(const struct nft_set *set) for (i = 0; i < priv->buckets; i++) { hlist_for_each_entry_safe(he, next, &priv->table[i], node) { hlist_del_rcu(&he->node); - nft_set_elem_destroy(set, he, true); + nf_tables_set_elem_destroy(ctx, set, he); } } } diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 15e451dc3fc4..c867b5b772e8 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -2148,10 +2148,12 @@ out_scratch: /** * nft_set_pipapo_match_destroy() - Destroy elements from key mapping array + * @ctx: context * @set: nftables API set representation * @m: matching data pointing to key mapping array */ -static void nft_set_pipapo_match_destroy(const struct nft_set *set, +static void nft_set_pipapo_match_destroy(const struct nft_ctx *ctx, + const struct nft_set *set, struct nft_pipapo_match *m) { struct nft_pipapo_field *f; @@ -2168,15 +2170,17 @@ static void nft_set_pipapo_match_destroy(const struct nft_set *set, e = f->mt[r].e; - nft_set_elem_destroy(set, e, true); + nf_tables_set_elem_destroy(ctx, set, e); } } /** * nft_pipapo_destroy() - Free private data for set and all committed elements + * @ctx: context * @set: nftables API set representation */ -static void nft_pipapo_destroy(const struct nft_set *set) +static void nft_pipapo_destroy(const struct nft_ctx *ctx, + const struct nft_set *set) { struct nft_pipapo *priv = nft_set_priv(set); struct nft_pipapo_match *m; @@ -2186,7 +2190,7 @@ static void nft_pipapo_destroy(const struct nft_set *set) if (m) { rcu_barrier(); - nft_set_pipapo_match_destroy(set, m); + nft_set_pipapo_match_destroy(ctx, set, m); #ifdef NFT_PIPAPO_ALIGN free_percpu(m->scratch_aligned); @@ -2203,7 +2207,7 @@ static void nft_pipapo_destroy(const struct nft_set *set) m = priv->clone; if (priv->dirty) - nft_set_pipapo_match_destroy(set, m); + nft_set_pipapo_match_destroy(ctx, set, m); #ifdef NFT_PIPAPO_ALIGN free_percpu(priv->clone->scratch_aligned); diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 2f114aa10f1a..5c05c9b990fb 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -664,7 +664,8 @@ static int nft_rbtree_init(const struct nft_set *set, return 0; } -static void nft_rbtree_destroy(const struct nft_set *set) +static void nft_rbtree_destroy(const struct nft_ctx *ctx, + const struct nft_set *set) { struct nft_rbtree *priv = nft_set_priv(set); struct nft_rbtree_elem *rbe; @@ -675,7 +676,7 @@ static void nft_rbtree_destroy(const struct nft_set *set) while ((node = priv->root.rb_node) != NULL) { rb_erase(node, &priv->root); rbe = rb_entry(node, struct nft_rbtree_elem, node); - nft_set_elem_destroy(set, rbe, true); + nf_tables_set_elem_destroy(ctx, set, rbe); } } -- cgit v1.2.3
From 2b84e215f87443c74ac0aa7f76bb172d43a87033 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 16 Jun 2023 15:20:04 +0200 Subject: netfilter: nft_set_pipapo: .walk does not deal with generations The .walk callback iterates over the current active set, but it might be useful to iterate over the next generation set. Use the generation mask to determine which set view (either current or next generation) is used for the walk iteration.
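Reduced to its essence, the change makes the walk view depend on the caller's generation mask; a minimal sketch, assuming invented names and plain pointers in place of RCU:

#include <stdio.h>

struct view { const char *name; };

/* Invented stand-in for the pipapo private data. */
struct set_priv {
	struct view *match;	/* current generation (RCU in the kernel) */
	struct view *clone;	/* next generation, private to the transaction */
	unsigned int genmask_cur;
};

static struct view *walk_view(struct set_priv *p, unsigned int iter_genmask)
{
	/* Walkers asking for the current generation get the live view;
	 * transaction code asking for the next generation gets the clone.
	 */
	if (iter_genmask == p->genmask_cur)
		return p->match;
	return p->clone;
}

int main(void)
{
	struct view cur = { "match" }, next = { "clone" };
	struct set_priv p = { &cur, &next, 1 };

	printf("%s\n", walk_view(&p, 1)->name);	/* match */
	printf("%s\n", walk_view(&p, 2)->name);	/* clone */
	return 0;
}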
Fixes: 3c4287f62044 ("nf_tables: Add set type for arbitrary concatenation of ranges") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_pipapo.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index c867b5b772e8..0452ee586c1c 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -1974,12 +1974,16 @@ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_iter *iter) { struct nft_pipapo *priv = nft_set_priv(set); + struct net *net = read_pnet(&set->net); struct nft_pipapo_match *m; struct nft_pipapo_field *f; int i, r; rcu_read_lock(); - m = rcu_dereference(priv->match); + if (iter->genmask == nft_genmask_cur(net)) + m = rcu_dereference(priv->match); + else + m = priv->clone; if (unlikely(!m)) goto out; -- cgit v1.2.3 From d6b478666ffa6d2c25386d78bf1c4640d4da305e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 16 Jun 2023 15:20:08 +0200 Subject: netfilter: nf_tables: fix underflow in object reference counter Since ("netfilter: nf_tables: drop map element references from preparation phase"), integration with commit protocol is better, therefore drop the workaround that b91d90368837 ("netfilter: nf_tables: fix leaking object reference count") provides. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index e2493f83c48e..cf10b3bd5c0f 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6668,19 +6668,19 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, if (flags) *nft_set_ext_flags(ext) = flags; + if (obj) { + *nft_set_ext_obj(ext) = obj; + obj->use++; + } if (ulen > 0) { if (nft_set_ext_check(&tmpl, NFT_SET_EXT_USERDATA, ulen) < 0) { err = -EINVAL; - goto err_elem_userdata; + goto err_elem_free; } udata = nft_set_ext_userdata(ext); udata->len = ulen - 1; nla_memcpy(&udata->data, nla[NFTA_SET_ELEM_USERDATA], ulen); } - if (obj) { - *nft_set_ext_obj(ext) = obj; - obj->use++; - } err = nft_set_elem_expr_setup(ctx, &tmpl, ext, expr_array, num_exprs); if (err < 0) goto err_elem_free; @@ -6735,9 +6735,6 @@ err_set_full: err_element_clash: kfree(trans); err_elem_free: - if (obj) - obj->use--; -err_elem_userdata: nft_set_elem_destroy(set, elem.priv, true); err_parse_data: if (nla[NFTA_SET_ELEM_DATA] != NULL) -- cgit v1.2.3 From c88c535b592d3baeee74009f3eceeeaf0fdd5e1b Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 16 Jun 2023 15:20:16 +0200 Subject: netfilter: nf_tables: disallow element updates of bound anonymous sets Anonymous sets come with NFT_SET_CONSTANT from userspace. Although API allows to create anonymous sets without NFT_SET_CONSTANT, it makes no sense to allow to add and to delete elements for bound anonymous sets. 
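The resulting policy boils down to one extra flag in the existing bindings check; roughly, as a stand-alone sketch (invented names, -1 standing in for -EBUSY):

#include <stdbool.h>
#include <stdio.h>

#define SET_ANONYMOUS	0x1	/* invented flag values */
#define SET_CONSTANT	0x2

/* Once a set has bindings, element add/delete is refused both for
 * constant and for anonymous sets.
 */
static int can_mutate_elements(bool bound, unsigned int flags)
{
	if (bound && (flags & (SET_CONSTANT | SET_ANONYMOUS)))
		return -1;	/* -EBUSY in the kernel */
	return 0;
}

int main(void)
{
	printf("%d\n", can_mutate_elements(true, SET_ANONYMOUS));	/* refused */
	printf("%d\n", can_mutate_elements(false, SET_ANONYMOUS));	/* allowed */
	return 0;
}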
Fixes: 96518518cc41 ("netfilter: add nftables") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index cf10b3bd5c0f..6f26520bd865 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6779,7 +6779,8 @@ static int nf_tables_newsetelem(struct sk_buff *skb, if (IS_ERR(set)) return PTR_ERR(set); - if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT) + if (!list_empty(&set->bindings) && + (set->flags & (NFT_SET_CONSTANT | NFT_SET_ANONYMOUS))) return -EBUSY; nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); @@ -7053,7 +7054,9 @@ static int nf_tables_delsetelem(struct sk_buff *skb, set = nft_set_lookup(table, nla[NFTA_SET_ELEM_LIST_SET], genmask); if (IS_ERR(set)) return PTR_ERR(set); - if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT) + + if (!list_empty(&set->bindings) && + (set->flags & (NFT_SET_CONSTANT | NFT_SET_ANONYMOUS))) return -EBUSY; nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); -- cgit v1.2.3 From 938154b93be8cd611ddfd7bafc1849f3c4355201 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 16 Jun 2023 15:21:33 +0200 Subject: netfilter: nf_tables: reject unbound anonymous set before commit phase Add a new list to track set transaction and to check for unbound anonymous sets before entering the commit phase. Bail out at the end of the transaction handling if an anonymous set remains unbound. Fixes: 96518518cc41 ("netfilter: add nftables") Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 3 +++ net/netfilter/nf_tables_api.c | 35 ++++++++++++++++++++++++++++++++--- 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index f84b6daea5c4..ee47d7143d99 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1573,6 +1573,7 @@ static inline void nft_set_elem_clear_busy(struct nft_set_ext *ext) * struct nft_trans - nf_tables object update in transaction * * @list: used internally + * @binding_list: list of objects with possible bindings * @msg_type: message type * @put_net: ctx->net needs to be put * @ctx: transaction context @@ -1580,6 +1581,7 @@ static inline void nft_set_elem_clear_busy(struct nft_set_ext *ext) */ struct nft_trans { struct list_head list; + struct list_head binding_list; int msg_type; bool put_net; struct nft_ctx ctx; @@ -1724,6 +1726,7 @@ static inline int nft_request_module(struct net *net, const char *fmt, ...) 
{ re struct nftables_pernet { struct list_head tables; struct list_head commit_list; + struct list_head binding_list; struct list_head module_list; struct list_head notify_list; struct mutex commit_mutex; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 6f26520bd865..66da44bff5e4 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -151,6 +151,7 @@ static struct nft_trans *nft_trans_alloc_gfp(const struct nft_ctx *ctx, return NULL; INIT_LIST_HEAD(&trans->list); + INIT_LIST_HEAD(&trans->binding_list); trans->msg_type = msg_type; trans->ctx = *ctx; @@ -163,9 +164,15 @@ static struct nft_trans *nft_trans_alloc(const struct nft_ctx *ctx, return nft_trans_alloc_gfp(ctx, msg_type, size, GFP_KERNEL); } -static void nft_trans_destroy(struct nft_trans *trans) +static void nft_trans_list_del(struct nft_trans *trans) { list_del(&trans->list); + list_del(&trans->binding_list); +} + +static void nft_trans_destroy(struct nft_trans *trans) +{ + nft_trans_list_del(trans); kfree(trans); } @@ -357,6 +364,14 @@ static void nft_trans_commit_list_add_tail(struct net *net, struct nft_trans *tr { struct nftables_pernet *nft_net = nft_pernet(net); + switch (trans->msg_type) { + case NFT_MSG_NEWSET: + if (!nft_trans_set_update(trans) && + nft_set_is_anonymous(nft_trans_set(trans))) + list_add_tail(&trans->binding_list, &nft_net->binding_list); + break; + } + list_add_tail(&trans->list, &nft_net->commit_list); } @@ -9111,7 +9126,7 @@ static void nf_tables_trans_destroy_work(struct work_struct *w) synchronize_rcu(); list_for_each_entry_safe(trans, next, &head, list) { - list_del(&trans->list); + nft_trans_list_del(trans); nft_commit_release(trans); } } @@ -9476,6 +9491,19 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) return 0; } + list_for_each_entry(trans, &nft_net->binding_list, binding_list) { + switch (trans->msg_type) { + case NFT_MSG_NEWSET: + if (!nft_trans_set_update(trans) && + nft_set_is_anonymous(nft_trans_set(trans)) && + !nft_trans_set_bound(trans)) { + pr_warn_once("nftables ruleset with unbound set\n"); + return -EINVAL; + } + break; + } + } + /* 0. Validate ruleset, otherwise roll back for error reporting. */ if (nf_tables_validate(net) < 0) return -EAGAIN; @@ -9989,7 +10017,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) list_for_each_entry_safe_reverse(trans, next, &nft_net->commit_list, list) { - list_del(&trans->list); + nft_trans_list_del(trans); nf_tables_abort_release(trans); } @@ -10765,6 +10793,7 @@ static int __net_init nf_tables_init_net(struct net *net) INIT_LIST_HEAD(&nft_net->tables); INIT_LIST_HEAD(&nft_net->commit_list); + INIT_LIST_HEAD(&nft_net->binding_list); INIT_LIST_HEAD(&nft_net->module_list); INIT_LIST_HEAD(&nft_net->notify_list); mutex_init(&nft_net->commit_mutex); -- cgit v1.2.3
From 62e1e94b246e685d89c3163aaef4b160e42ceb02 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 16 Jun 2023 15:21:39 +0200 Subject: netfilter: nf_tables: reject unbound chain set before commit phase Use the binding list to track chain transactions and to check for unbound chains before entering the commit phase. Bail out if a chain binding remains unused when entering the commit step.
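The two-list bookkeeping can be sketched in userspace (made-up structures, not the kernel's nft_trans): every transaction sits on the commit list, while set and chain bindings additionally sit on a binding list that is scanned once before the commit phase.

#include <stdbool.h>
#include <stdio.h>

/* Invented miniature of a transaction: membership on the commit list is
 * implicit here; next_binding links the separate binding list.
 */
struct trans {
	const char *what;
	bool needs_binding;	/* anonymous set or binding-flagged chain */
	bool bound;
	struct trans *next_binding;
};

/* Scan only the binding list before entering the commit phase; a single
 * unbound entry fails the whole batch (-EINVAL in the kernel).
 */
static int validate_bindings(const struct trans *binding_list)
{
	for (const struct trans *t = binding_list; t; t = t->next_binding) {
		if (t->needs_binding && !t->bound) {
			fprintf(stderr, "ruleset with unbound %s\n", t->what);
			return -1;
		}
	}
	return 0;
}

int main(void)
{
	struct trans chain = { "chain", true, false, NULL };
	struct trans set = { "set", true, true, &chain };

	return validate_bindings(&set) ? 1 : 0;	/* fails: chain unbound */
}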
Fixes: d0e2c7de92c7 ("netfilter: nf_tables: add NFT_CHAIN_BINDING") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 66da44bff5e4..bab792434a8d 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -370,6 +370,11 @@ static void nft_trans_commit_list_add_tail(struct net *net, struct nft_trans *tr nft_set_is_anonymous(nft_trans_set(trans))) list_add_tail(&trans->binding_list, &nft_net->binding_list); break; + case NFT_MSG_NEWCHAIN: + if (!nft_trans_chain_update(trans) && + nft_chain_binding(nft_trans_chain(trans))) + list_add_tail(&trans->binding_list, &nft_net->binding_list); + break; } list_add_tail(&trans->list, &nft_net->commit_list); @@ -9501,6 +9506,14 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) return -EINVAL; } break; + case NFT_MSG_NEWCHAIN: + if (!nft_trans_chain_update(trans) && + nft_chain_binding(nft_trans_chain(trans)) && + !nft_trans_chain_bound(trans)) { + pr_warn_once("nftables ruleset with unbound chain\n"); + return -EINVAL; + } + break; } } -- cgit v1.2.3 From b770283c98e0eee9133c47bc03b6cc625dc94723 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 16 Jun 2023 15:22:01 +0200 Subject: netfilter: nf_tables: disallow updates of anonymous sets Disallow updates of set timeout and garbage collection parameters for anonymous sets. Fixes: 123b99619cca ("netfilter: nf_tables: honor set timeout and garbage collection updates") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index bab792434a8d..16995b88da2f 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4963,6 +4963,9 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, if (info->nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; + if (nft_set_is_anonymous(set)) + return -EOPNOTSUPP; + err = nft_set_expr_alloc(&ctx, set, nla, exprs, &num_exprs, flags); if (err < 0) return err; -- cgit v1.2.3 From e26d3009efda338f19016df4175f354a9bd0a4ab Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 16 Jun 2023 15:22:18 +0200 Subject: netfilter: nf_tables: disallow timeout for anonymous sets Never used from userspace, disallow these parameters. 
Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 16995b88da2f..0d293eab310b 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4909,6 +4909,9 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, if (!(flags & NFT_SET_TIMEOUT)) return -EINVAL; + if (flags & NFT_SET_ANONYMOUS) + return -EOPNOTSUPP; + err = nf_msecs_to_jiffies64(nla[NFTA_SET_TIMEOUT], &desc.timeout); if (err) return err; @@ -4917,6 +4920,10 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, if (nla[NFTA_SET_GC_INTERVAL] != NULL) { if (!(flags & NFT_SET_TIMEOUT)) return -EINVAL; + + if (flags & NFT_SET_ANONYMOUS) + return -EOPNOTSUPP; + desc.gc_int = ntohl(nla_get_be32(nla[NFTA_SET_GC_INTERVAL])); } -- cgit v1.2.3
From 043d2acf57227db1fdaaa620b2a420acfaa56d6e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 14 Jun 2023 23:20:18 +0200 Subject: netfilter: nf_tables: drop module reference after updating chain Otherwise the module reference counter is leaked. Fixes: b9703ed44ffb ("netfilter: nf_tables: support for adding new devices to an existing netdev chain") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 0d293eab310b..c1db2f4b2aa4 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2667,6 +2667,8 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, nft_trans_basechain(trans) = basechain; INIT_LIST_HEAD(&nft_trans_chain_hooks(trans)); list_splice(&hook.list, &nft_trans_chain_hooks(trans)); + if (nla[NFTA_CHAIN_HOOK]) + module_put(hook.type->owner); nft_trans_commit_list_add_tail(ctx->net, trans); -- cgit v1.2.3
From 62f9a68a36d4441a6c412b81faed102594bc6670 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 15 Jun 2023 10:14:25 +0200 Subject: netfilter: nfnetlink_osf: fix module autoload Move the alias from xt_osf to nfnetlink_osf. Fixes: f9324952088f ("netfilter: nfnetlink_osf: extract nfnetlink_subsystem code from xt_osf.c") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink_osf.c | 1 + net/netfilter/xt_osf.c | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c index ee6840bd5933..8f1bfa6ccc2d 100644 --- a/net/netfilter/nfnetlink_osf.c +++ b/net/netfilter/nfnetlink_osf.c @@ -439,3 +439,4 @@ module_init(nfnl_osf_init); module_exit(nfnl_osf_fini); MODULE_LICENSE("GPL"); +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_OSF); diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c index e1990baf3a3b..dc9485854002 100644 --- a/net/netfilter/xt_osf.c +++ b/net/netfilter/xt_osf.c @@ -71,4 +71,3 @@ MODULE_AUTHOR("Evgeniy Polyakov "); MODULE_DESCRIPTION("Passive OS fingerprint matching."); MODULE_ALIAS("ipt_osf"); MODULE_ALIAS("ip6t_osf"); -MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_OSF); -- cgit v1.2.3
From 42e344f01688490cdac4bed8f5ba21817cad26ee Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Fri, 16 Jun 2023 17:56:11 +0200 Subject: netfilter: nf_tables: Fix for deleting base chains with payload When deleting a base chain, iptables-nft simply submits the whole chain to the kernel, including the NFTA_CHAIN_HOOK attribute.
The new code added by the fixed commit then turned this into a chain update, destroying the hook but not the chain itself. Detect the situation by checking if the chain type is either netdev or inet/ingress. Fixes: 7d937b107108f ("netfilter: nf_tables: support for deleting devices in an existing netdev chain") Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index c1db2f4b2aa4..4c7937fd803f 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2811,21 +2811,18 @@ static int nf_tables_newchain(struct sk_buff *skb, const struct nfnl_info *info, return nf_tables_addchain(&ctx, family, genmask, policy, flags, extack); } -static int nft_delchain_hook(struct nft_ctx *ctx, struct nft_chain *chain, +static int nft_delchain_hook(struct nft_ctx *ctx, + struct nft_base_chain *basechain, struct netlink_ext_ack *extack) { + const struct nft_chain *chain = &basechain->chain; const struct nlattr * const *nla = ctx->nla; struct nft_chain_hook chain_hook = {}; - struct nft_base_chain *basechain; struct nft_hook *this, *hook; LIST_HEAD(chain_del_list); struct nft_trans *trans; int err; - if (!nft_is_base_chain(chain)) - return -EOPNOTSUPP; - - basechain = nft_base_chain(chain); err = nft_chain_parse_hook(ctx->net, basechain, nla, &chain_hook, ctx->family, chain->flags, extack); if (err < 0) @@ -2910,7 +2907,12 @@ static int nf_tables_delchain(struct sk_buff *skb, const struct nfnl_info *info, if (chain->flags & NFT_CHAIN_HW_OFFLOAD) return -EOPNOTSUPP; - return nft_delchain_hook(&ctx, chain, extack); + if (nft_is_base_chain(chain)) { + struct nft_base_chain *basechain = nft_base_chain(chain); + + if (nft_base_chain_netdev(table->family, basechain->ops.hooknum)) + return nft_delchain_hook(&ctx, basechain, extack); + } } if (info->nlh->nlmsg_flags & NLM_F_NONREC && -- cgit v1.2.3
From 408c090002c8ca5da3da1417d1d675583379fae6 Mon Sep 17 00:00:00 2001 From: Jiawen Wu Date: Mon, 19 Jun 2023 17:49:48 +0800 Subject: net: mdio: fix the wrong parameters PHY address and device address are passed in the wrong order. Cc: stable@vger.kernel.org Fixes: 4e4aafcddbbf ("net: mdio: Add dedicated C45 API to MDIO bus drivers") Signed-off-by: Jiawen Wu Reviewed-by: Andrew Lunn Reviewed-by: Russell King (Oracle) Link: https://lore.kernel.org/r/20230619094948.84452-1-jiawenwu@trustnetic.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/mdio_bus.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c index 389f33a12534..8b3618d3da4a 100644 --- a/drivers/net/phy/mdio_bus.c +++ b/drivers/net/phy/mdio_bus.c @@ -1287,7 +1287,7 @@ EXPORT_SYMBOL_GPL(mdiobus_modify_changed); * @mask: bit mask of bits to clear * @set: bit mask of bits to set */ -int mdiobus_c45_modify_changed(struct mii_bus *bus, int devad, int addr, +int mdiobus_c45_modify_changed(struct mii_bus *bus, int addr, int devad, u32 regnum, u16 mask, u16 set) { int err; -- cgit v1.2.3
From a129b41fe0a8b4da828c46b10f5244ca07a3fec3 Mon Sep 17 00:00:00 2001 From: Francesco Dolcini Date: Mon, 19 Jun 2023 17:44:35 +0200 Subject: Revert "net: phy: dp83867: perform soft reset and retain established link" This reverts commit da9ef50f545f86ffe6ff786174d26500c4db737a. This fixes a regression in which the link would come up, but no communication was possible.
The reverted commit was also removing a comment about DP83867_PHYCR_FORCE_LINK_GOOD, this is not added back in this commits since it seems that this is unrelated to the original code change. Closes: https://lore.kernel.org/all/ZGuDJos8D7N0J6Z2@francesco-nb.int.toradex.com/ Fixes: da9ef50f545f ("net: phy: dp83867: perform soft reset and retain established link") Signed-off-by: Francesco Dolcini Reviewed-by: Andrew Lunn Reviewed-by: Praneeth Bajjuri Link: https://lore.kernel.org/r/20230619154435.355485-1-francesco@dolcini.it Signed-off-by: Jakub Kicinski --- drivers/net/phy/dp83867.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/phy/dp83867.c b/drivers/net/phy/dp83867.c index 76f5a2402fb0..e397e7d642d9 100644 --- a/drivers/net/phy/dp83867.c +++ b/drivers/net/phy/dp83867.c @@ -936,7 +936,7 @@ static int dp83867_phy_reset(struct phy_device *phydev) { int err; - err = phy_write(phydev, DP83867_CTRL, DP83867_SW_RESTART); + err = phy_write(phydev, DP83867_CTRL, DP83867_SW_RESET); if (err < 0) return err; -- cgit v1.2.3 From afd384f0dbea2229fd11159efb86a5b41051c4a9 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 8 Jun 2023 17:42:53 -0400 Subject: Revert "virtio-blk: support completion batching for the IRQ path" This reverts commit 07b679f70d73483930e8d3c293942416d9cd5c13. This change appears to have broken things... We now see applications hanging during disk accesses. e.g. multi-port virtio-blk device running in h/w (FPGA) Host running a simple 'fio' test. [global] thread=1 direct=1 ioengine=libaio norandommap=1 group_reporting=1 bs=4K rw=read iodepth=128 runtime=1 numjobs=4 time_based [job0] filename=/dev/vda [job1] filename=/dev/vdb [job2] filename=/dev/vdc ... [job15] filename=/dev/vdp i.e. 16 disks; 4 queues per disk; simple burst of 4KB reads This is repeatedly run in a loop. After a few, normally <10 seconds, fio hangs. With 64 queues (16 disks), failure occurs within a few seconds; with 8 queues (2 disks) it may take ~hour before hanging. Last message: fio-3.19 Starting 8 threads Jobs: 1 (f=1): [_(7),R(1)][68.3%][eta 03h:11m:06s] I think this means at the end of the run 1 queue was left incomplete. 'diskstats' (run while fio is hung) shows no outstanding transactions. e.g. $ cat /proc/diskstats ... 252 0 vda 1843140071 0 14745120568 712568645 0 0 0 0 0 3117947 712568645 0 0 0 0 0 0 252 16 vdb 1816291511 0 14530332088 704905623 0 0 0 0 0 3117711 704905623 0 0 0 0 0 0 ... Other stats (in the h/w, and added to the virtio-blk driver ([a]virtio_queue_rq(), [b]virtblk_handle_req(), [c]virtblk_request_done()) all agree, and show every request had a completion, and that virtblk_request_done() never gets called. e.g. PF= 0 vq=0 1 2 3 [a]request_count - 839416590 813148916 105586179 84988123 [b]completion1_count - 839416590 813148916 105586179 84988123 [c]completion2_count - 0 0 0 0 PF= 1 vq=0 1 2 3 [a]request_count - 823335887 812516140 104582672 75856549 [b]completion1_count - 823335887 812516140 104582672 75856549 [c]completion2_count - 0 0 0 0 i.e. the issue is after the virtio-blk driver. This change was introduced in kernel 6.3.0. I am seeing this using 6.3.3. If I run with an earlier kernel (5.15), it does not occur. If I make a simple patch to the 6.3.3 virtio-blk driver, to skip the blk_mq_add_to_batch()call, it does not fail. e.g. 
kernel 5.15 - this is OK virtio_blk.c,virtblk_done() [irq handler] if (likely(!blk_should_fake_timeout(req->q))) { blk_mq_complete_request(req); } kernel 6.3.3 - this fails virtio_blk.c,virtblk_handle_req() [irq handler] if (likely(!blk_should_fake_timeout(req->q))) { if (!blk_mq_complete_request_remote(req)) { if (!blk_mq_add_to_batch(req, iob, virtblk_vbr_status(vbr), virtblk_complete_batch)) { virtblk_request_done(req); //this never gets called... so blk_mq_add_to_batch() must always succeed } } } If I do, kernel 6.3.3 - this is OK virtio_blk.c,virtblk_handle_req() [irq handler] if (likely(!blk_should_fake_timeout(req->q))) { if (!blk_mq_complete_request_remote(req)) { virtblk_request_done(req); //force this here... if (!blk_mq_add_to_batch(req, iob, virtblk_vbr_status(vbr), virtblk_complete_batch)) { virtblk_request_done(req); //this never gets called... so blk_mq_add_to_batch() must always succeed } } } Perhaps you might like to fix/test/revert this change... Martin Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202306090826.C1fZmdMe-lkp@intel.com/ Cc: Suwan Kim Tested-by: edliaw@google.com Reported-by: "Roberts, Martin" Message-Id: <336455b4f630f329380a8f53ee8cad3868764d5c.1686295549.git.mst@redhat.com> Signed-off-by: Michael S. Tsirkin --- drivers/block/virtio_blk.c | 82 +++++++++++++++++++++------------------------- 1 file changed, 37 insertions(+), 45 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 2b918e28acaa..b47358da92a2 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -348,63 +348,33 @@ static inline void virtblk_request_done(struct request *req) blk_mq_end_request(req, status); } -static void virtblk_complete_batch(struct io_comp_batch *iob) -{ - struct request *req; - - rq_list_for_each(&iob->req_list, req) { - virtblk_unmap_data(req, blk_mq_rq_to_pdu(req)); - virtblk_cleanup_cmd(req); - } - blk_mq_end_request_batch(iob); -} - -static int virtblk_handle_req(struct virtio_blk_vq *vq, - struct io_comp_batch *iob) -{ - struct virtblk_req *vbr; - int req_done = 0; - unsigned int len; - - while ((vbr = virtqueue_get_buf(vq->vq, &len)) != NULL) { - struct request *req = blk_mq_rq_from_pdu(vbr); - - if (likely(!blk_should_fake_timeout(req->q)) && - !blk_mq_complete_request_remote(req) && - !blk_mq_add_to_batch(req, iob, virtblk_vbr_status(vbr), - virtblk_complete_batch)) - virtblk_request_done(req); - req_done++; - } - - return req_done; -} - static void virtblk_done(struct virtqueue *vq) { struct virtio_blk *vblk = vq->vdev->priv; - struct virtio_blk_vq *vblk_vq = &vblk->vqs[vq->index]; - int req_done = 0; + bool req_done = false; + int qid = vq->index; + struct virtblk_req *vbr; unsigned long flags; - DEFINE_IO_COMP_BATCH(iob); + unsigned int len; - spin_lock_irqsave(&vblk_vq->lock, flags); + spin_lock_irqsave(&vblk->vqs[qid].lock, flags); do { virtqueue_disable_cb(vq); - req_done += virtblk_handle_req(vblk_vq, &iob); + while ((vbr = virtqueue_get_buf(vblk->vqs[qid].vq, &len)) != NULL) { + struct request *req = blk_mq_rq_from_pdu(vbr); + if (likely(!blk_should_fake_timeout(req->q))) + blk_mq_complete_request(req); + req_done = true; + } if (unlikely(virtqueue_is_broken(vq))) break; } while (!virtqueue_enable_cb(vq)); - if (req_done) { - if (!rq_list_empty(iob.req_list)) - iob.complete(&iob); - - /* In case queue is stopped waiting for more buffers. */ + /* In case queue is stopped waiting for more buffers. 
*/ + if (req_done) blk_mq_start_stopped_hw_queues(vblk->disk->queue, true); - } - spin_unlock_irqrestore(&vblk_vq->lock, flags); + spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags); } static void virtio_commit_rqs(struct blk_mq_hw_ctx *hctx) @@ -1283,15 +1253,37 @@ static void virtblk_map_queues(struct blk_mq_tag_set *set) } } +static void virtblk_complete_batch(struct io_comp_batch *iob) +{ + struct request *req; + + rq_list_for_each(&iob->req_list, req) { + virtblk_unmap_data(req, blk_mq_rq_to_pdu(req)); + virtblk_cleanup_cmd(req); + } + blk_mq_end_request_batch(iob); +} + static int virtblk_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) { struct virtio_blk *vblk = hctx->queue->queuedata; struct virtio_blk_vq *vq = get_virtio_blk_vq(hctx); + struct virtblk_req *vbr; unsigned long flags; + unsigned int len; int found = 0; spin_lock_irqsave(&vq->lock, flags); + + while ((vbr = virtqueue_get_buf(vq->vq, &len)) != NULL) { + struct request *req = blk_mq_rq_from_pdu(vbr); + + found++; + if (!blk_mq_complete_request_remote(req) && + !blk_mq_add_to_batch(req, iob, virtblk_vbr_status(vbr), + virtblk_complete_batch)) + virtblk_request_done(req); + } if (found) blk_mq_start_stopped_hw_queues(vblk->disk->queue, true); -- cgit v1.2.3
From 9724160b3942b0a967b91a59f81da5593f28b8ba Mon Sep 17 00:00:00 2001 From: Florent Revest Date: Thu, 15 Jun 2023 16:56:07 +0200 Subject: bpf/btf: Accept function names that contain dots When building a kernel with LLVM=1, LLVM_IAS=0 and CONFIG_KASAN=y, LLVM leaves DWARF tags for the "asan.module_ctor" & co symbols. In turn, pahole creates BTF_KIND_FUNC entries for these and this makes the BTF metadata validation fail because they contain a dot. In a dramatic turn of events, this BTF verification failure can cause the netfilter_bpf initialization to fail, causing netfilter_core to free the netfilter_helper hashmap and netfilter_ftp to trigger a use-after-free. The risk of u-a-f in netfilter will be addressed separately but the existence of "asan.module_ctor" debug info under some build conditions sounds like a good enough reason to accept functions that contain dots in BTF. Although using only LLVM=1 is the recommended way to compile clang-based kernels, users can certainly do LLVM=1, LLVM_IAS=0 as well and we still try to support that combination according to Nick. To clarify: - > v5.10 kernel, LLVM=1 (LLVM_IAS=0 is not the default) is recommended, but users can still have LLVM=1, LLVM_IAS=0 to trigger the issue - <= 5.10 kernel, LLVM=1 (LLVM_IAS=0 is the default) is recommended in which case GNU as will be used Fixes: 1dc92851849c ("bpf: kernel side support for BTF Var and DataSec") Signed-off-by: Florent Revest Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Cc: Yonghong Song Cc: Nick Desaulniers Link: https://lore.kernel.org/bpf/20230615145607.3469985-1-revest@chromium.org --- kernel/bpf/btf.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 6b682b8e4b50..72b32b7cd9cd 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -744,13 +744,12 @@ static bool btf_name_offset_valid(const struct btf *btf, u32 offset) return offset < btf->hdr.str_len; } -static bool __btf_name_char_ok(char c, bool first, bool dot_ok) +static bool __btf_name_char_ok(char c, bool first) { if ((first ? !isalpha(c) : !isalnum(c)) && c != '_' && - ((c == '.'
&& !dot_ok) || - c != '.')) + c != '.') return false; return true; } @@ -767,20 +766,20 @@ static const char *btf_str_by_offset(const struct btf *btf, u32 offset) return NULL; } -static bool __btf_name_valid(const struct btf *btf, u32 offset, bool dot_ok) +static bool __btf_name_valid(const struct btf *btf, u32 offset) { /* offset must be valid */ const char *src = btf_str_by_offset(btf, offset); const char *src_limit; - if (!__btf_name_char_ok(*src, true, dot_ok)) + if (!__btf_name_char_ok(*src, true)) return false; /* set a limit on identifier length */ src_limit = src + KSYM_NAME_LEN; src++; while (*src && src < src_limit) { - if (!__btf_name_char_ok(*src, false, dot_ok)) + if (!__btf_name_char_ok(*src, false)) return false; src++; } @@ -788,17 +787,14 @@ static bool __btf_name_valid(const struct btf *btf, u32 offset, bool dot_ok) return !*src; } -/* Only C-style identifier is permitted. This can be relaxed if - * necessary. - */ static bool btf_name_valid_identifier(const struct btf *btf, u32 offset) { - return __btf_name_valid(btf, offset, false); + return __btf_name_valid(btf, offset); } static bool btf_name_valid_section(const struct btf *btf, u32 offset) { - return __btf_name_valid(btf, offset, true); + return __btf_name_valid(btf, offset); } static const char *__btf_name_by_offset(const struct btf *btf, u32 offset) @@ -4422,7 +4418,7 @@ static s32 btf_var_check_meta(struct btf_verifier_env *env, } if (!t->name_off || - !__btf_name_valid(env->btf, t->name_off, true)) { + !__btf_name_valid(env->btf, t->name_off)) { btf_verifier_log_type(env, t, "Invalid name"); return -EINVAL; } -- cgit v1.2.3 From db8eae6bc5c702d8e3ab2d0c6bb5976c131576eb Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 18 Jun 2023 15:14:14 +0200 Subject: bpf: Force kprobe multi expected_attach_type for kprobe_multi link We currently allow to create perf link for program with expected_attach_type == BPF_TRACE_KPROBE_MULTI. This will cause crash when we call helpers like get_attach_cookie or get_func_ip in such program, because it will call the kprobe_multi's version (current->bpf_ctx context setup) of those helpers while it expects perf_link's current->bpf_ctx context setup. Making sure that we use BPF_TRACE_KPROBE_MULTI expected_attach_type only for programs attaching through kprobe_multi link. Fixes: ca74823c6e16 ("bpf: Add cookie support to programs attached with kprobe multi link") Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20230618131414.75649-1-jolsa@kernel.org --- kernel/bpf/syscall.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0c21d0d8efe4..f1c8733f76b8 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3440,6 +3440,11 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, return prog->enforce_expected_attach_type && prog->expected_attach_type != attach_type ? -EINVAL : 0; + case BPF_PROG_TYPE_KPROBE: + if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI && + attach_type != BPF_TRACE_KPROBE_MULTI) + return -EINVAL; + return 0; default: return 0; } -- cgit v1.2.3 From b1dc492087db0f2e5a45f1072a743d04618dd6be Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 19 Jun 2023 09:35:34 -0600 Subject: io_uring/net: clear msg_controllen on partial sendmsg retry If we have cmsg attached AND we transferred partial data at least, clear msg_controllen on retry so we don't attempt to send that again. 
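The same pitfall applies to any plain sendmsg() retry loop in userspace: once part of the payload has been transferred, the ancillary data has already been delivered and must not be queued again. A hedged sketch (single iovec, blocking socket, error handling trimmed):

#include <sys/socket.h>
#include <sys/uio.h>

/* Retry a partial sendmsg() without re-sending the ancillary data.
 * Sketch only, not the io_uring code itself.
 */
static ssize_t send_all(int fd, struct msghdr *msg)
{
	ssize_t total = 0;

	for (;;) {
		ssize_t n = sendmsg(fd, msg, 0);

		if (n < 0)
			return total ? total : -1;
		total += n;

		struct iovec *iov = msg->msg_iov;

		if ((size_t)n >= iov->iov_len)
			return total;	/* everything went out */

		/* Partial transfer: advance the buffer and clear the
		 * control fields so the cmsg is not delivered twice.
		 */
		iov->iov_base = (char *)iov->iov_base + n;
		iov->iov_len -= n;
		msg->msg_control = NULL;
		msg->msg_controllen = 0;
	}
}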
Cc: stable@vger.kernel.org # 5.10+ Fixes: cac9e4418f4c ("io_uring/net: save msghdr->msg_control for retries") Reported-by: Stefan Metzmacher Reviewed-by: Stefan Metzmacher Signed-off-by: Jens Axboe --- io_uring/net.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/io_uring/net.c b/io_uring/net.c index 51b0f7fbb4f5..c0924ab1ea11 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -326,6 +326,8 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) return io_setup_async_msg(req, kmsg, issue_flags); if (ret > 0 && io_net_retry(sock, flags)) { + kmsg->msg.msg_controllen = 0; + kmsg->msg.msg_control = NULL; sr->done_io += ret; req->flags |= REQ_F_PARTIAL_IO; return io_setup_async_msg(req, kmsg, issue_flags); -- cgit v1.2.3 From 78d0d2063bab954d19a1696feae4c7706a626d48 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 19 Jun 2023 09:41:05 -0600 Subject: io_uring/net: disable partial retries for recvmsg with cmsg We cannot sanely handle partial retries for recvmsg if we have cmsg attached. If we don't, then we'd just be overwriting the initial cmsg header on retries. Alternatively we could increment and handle this appropriately, but it doesn't seem worth the complication. Move the MSG_WAITALL check into the non-multishot case while at it, since MSG_WAITALL is explicitly disabled for multishot anyway. Link: https://lore.kernel.org/io-uring/0b0d4411-c8fd-4272-770b-e030af6919a0@kernel.dk/ Cc: stable@vger.kernel.org # 5.10+ Reported-by: Stefan Metzmacher Reviewed-by: Stefan Metzmacher Signed-off-by: Jens Axboe --- io_uring/net.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index c0924ab1ea11..2bc2cb2f4d6c 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -789,16 +789,19 @@ retry_multishot: flags = sr->msg_flags; if (force_nonblock) flags |= MSG_DONTWAIT; - if (flags & MSG_WAITALL) - min_ret = iov_iter_count(&kmsg->msg.msg_iter); kmsg->msg.msg_get_inq = 1; - if (req->flags & REQ_F_APOLL_MULTISHOT) + if (req->flags & REQ_F_APOLL_MULTISHOT) { ret = io_recvmsg_multishot(sock, sr, kmsg, flags, &mshot_finished); - else + } else { + /* disable partial retry for recvmsg with cmsg attached */ + if (flags & MSG_WAITALL && !kmsg->msg.msg_controllen) + min_ret = iov_iter_count(&kmsg->msg.msg_iter); + ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg, kmsg->uaddr, flags); + } if (ret < min_ret) { if (ret == -EAGAIN && force_nonblock) { -- cgit v1.2.3 From 26fed83653d0154704cadb7afc418f315c7ac1f0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 20 Jun 2023 16:11:51 -0600 Subject: io_uring/net: use the correct msghdr union member in io_sendmsg_copy_hdr Rather than assign the user pointer to msghdr->msg_control, assign it to msghdr->msg_control_user to make sparse happy. They are in a union so the end result is the same, but let's avoid new sparse warnings and squash this one. 
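For background, sparse tells kernel and user pointers apart by address-space annotation, and the msghdr union exists so both views share one slot; a stripped-down illustration (with __user reduced to a no-op so it builds outside the kernel):

#include <stdio.h>

#define __user	/* kernel: an address-space attribute checked by sparse */

/* Reduced model of the msghdr control-pointer union. */
struct msghdr_model {
	union {
		void		*msg_control;		/* kernel pointer */
		void __user	*msg_control_user;	/* userspace pointer */
	};
};

int main(void)
{
	struct msghdr_model m;
	char __user *ubuf = 0;	/* stand-in for a user-supplied buffer */

	/* Same storage either way; picking the member whose address space
	 * matches the pointer keeps sparse quiet without a cast.
	 */
	m.msg_control_user = ubuf;
	printf("%p\n", m.msg_control);
	return 0;
}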
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202306210654.mDMcyMuB-lkp@intel.com/ Fixes: cac9e4418f4c ("io_uring/net: save msghdr->msg_control for retries") Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- io_uring/net.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index 2bc2cb2f4d6c..c8a4b2ac00f7 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -203,7 +203,7 @@ static int io_sendmsg_copy_hdr(struct io_kiocb *req, ret = sendmsg_copy_msghdr(&iomsg->msg, sr->umsg, sr->msg_flags, &iomsg->free_iov); /* save msg_control as sys_sendmsg() overwrites it */ - sr->msg_control = iomsg->msg.msg_control; + sr->msg_control = iomsg->msg.msg_control_user; return ret; } @@ -302,7 +302,7 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) if (req_has_async_data(req)) { kmsg = req->async_data; - kmsg->msg.msg_control = sr->msg_control; + kmsg->msg.msg_control_user = sr->msg_control; } else { ret = io_sendmsg_copy_hdr(req, &iomsg); if (ret) -- cgit v1.2.3 From 69cbeb61ff9093a9155cb19a36d633033f71093a Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 21 Jun 2023 10:58:46 -0700 Subject: Revert "efi: random: refresh non-volatile random seed when RNG is initialized" This reverts commit e7b813b32a42a3a6281a4fd9ae7700a0257c1d50 (and the subsequent fix for it: 41a15855c1ee "efi: random: fix NULL-deref when refreshing seed"). It turns otu to cause non-deterministic boot stalls on at least a HP 6730b laptop. Reported-and-bisected-by: Sami Korkalainen Link: https://lore.kernel.org/all/GQUnKz2al3yke5mB2i1kp3SzNHjK8vi6KJEh7rnLrOQ24OrlljeCyeWveLW9pICEmB9Qc8PKdNt3w1t_g3-Uvxq1l8Wj67PpoMeWDoH8PKk=@proton.me/ Cc: Jason A. Donenfeld Cc: Bagas Sanjaya Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- drivers/firmware/efi/efi.c | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index abeff7dc0b58..34b9e7876538 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -361,24 +361,6 @@ static void __init efi_debugfs_init(void) static inline void efi_debugfs_init(void) {} #endif -static void refresh_nv_rng_seed(struct work_struct *work) -{ - u8 seed[EFI_RANDOM_SEED_SIZE]; - - get_random_bytes(seed, sizeof(seed)); - efi.set_variable(L"RandomSeed", &LINUX_EFI_RANDOM_SEED_TABLE_GUID, - EFI_VARIABLE_NON_VOLATILE | EFI_VARIABLE_BOOTSERVICE_ACCESS | - EFI_VARIABLE_RUNTIME_ACCESS, sizeof(seed), seed); - memzero_explicit(seed, sizeof(seed)); -} -static int refresh_nv_rng_seed_notification(struct notifier_block *nb, unsigned long action, void *data) -{ - static DECLARE_WORK(work, refresh_nv_rng_seed); - schedule_work(&work); - return NOTIFY_DONE; -} -static struct notifier_block refresh_nv_rng_seed_nb = { .notifier_call = refresh_nv_rng_seed_notification }; - /* * We register the efi subsystem with the firmware subsystem and the * efivars subsystem with the efi subsystem, if the system was booted with @@ -451,9 +433,6 @@ static int __init efisubsys_init(void) platform_device_register_simple("efi_secret", 0, NULL, 0); #endif - if (efi_rt_services_supported(EFI_RT_SUPPORTED_SET_VARIABLE)) - execute_with_initialized_rng(&refresh_nv_rng_seed_nb); - return 0; err_remove_group: -- cgit v1.2.3 From c2b2ae3925b65070adb27d5a31a31c376f26dec7 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 20 Jun 2023 18:24:18 +0200 Subject: mptcp: handle correctly disconnect() failures Currently the mptcp code has assumes that 
disconnect() can fail only at mptcp_sendmsg_fastopen() time - to avoid a deadlock scenario - and doesn't even bother returning an error code. Soon mptcp_disconnect() will handle more error conditions: let's track them explicitly. As a bonus, explicitly annotate TCP-level disconnect as not failing: the mptcp code never blocks waiting for events on the subflows. Fixes: 7d803344fdc3 ("mptcp: fix deadlock in fastopen error path") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Tested-by: Christoph Paasch Reviewed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 67311e7d5b21..86f8a7621aff 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1727,7 +1727,13 @@ static int mptcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, if (ret && ret != -EINPROGRESS && ret != -ERESTARTSYS && ret != -EINTR) *copied_syn = 0; } else if (ret && ret != -EINPROGRESS) { - mptcp_disconnect(sk, 0); + /* The disconnect() op called by tcp_sendmsg_fastopen()/ + * __inet_stream_connect() can fail, due to looking check, + * see mptcp_disconnect(). + * Attempt it again outside the problematic scope. + */ + if (!mptcp_disconnect(sk, 0)) + sk->sk_socket->state = SS_UNCONNECTED; } inet_sk(sk)->defer_connect = 0; @@ -2389,7 +2395,10 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, need_push = (flags & MPTCP_CF_PUSH) && __mptcp_retransmit_pending_data(sk); if (!dispose_it) { - tcp_disconnect(ssk, 0); + /* The MPTCP code never wait on the subflow sockets, TCP-level + * disconnect should never fail + */ + WARN_ON_ONCE(tcp_disconnect(ssk, 0)); msk->subflow->state = SS_UNCONNECTED; mptcp_subflow_ctx_reset(subflow); release_sock(ssk); @@ -2812,7 +2821,7 @@ void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how) break; fallthrough; case TCP_SYN_SENT: - tcp_disconnect(ssk, O_NONBLOCK); + WARN_ON_ONCE(tcp_disconnect(ssk, O_NONBLOCK)); break; default: if (__mptcp_check_fallback(mptcp_sk(sk))) { @@ -3075,11 +3084,10 @@ static int mptcp_disconnect(struct sock *sk, int flags) /* We are on the fastopen error path. We can't call straight into the * subflows cleanup code due to lock nesting (we are already under - * msk->firstsocket lock). Do nothing and leave the cleanup to the - * caller. + * msk->firstsocket lock).
*/ if (msk->fastopening) - return 0; + return -EBUSY; mptcp_listen_inuse_dec(sk); inet_sk_state_store(sk, TCP_CLOSE); -- cgit v1.2.3 From 0ad529d9fd2bfa3fc619552a8d2fb2f2ef0bce2e Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 20 Jun 2023 18:24:19 +0200 Subject: mptcp: fix possible divide by zero in recvmsg() Christoph reported a divide by zero bug in mptcp_recvmsg(): divide error: 0000 [#1] PREEMPT SMP CPU: 1 PID: 19978 Comm: syz-executor.6 Not tainted 6.4.0-rc2-gffcc7899081b #20 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-2.el7 04/01/2014 RIP: 0010:__tcp_select_window+0x30e/0x420 net/ipv4/tcp_output.c:3018 Code: 11 ff 0f b7 cd c1 e9 0c b8 ff ff ff ff d3 e0 89 c1 f7 d1 01 cb 21 c3 eb 17 e8 2e 83 11 ff 31 db eb 0e e8 25 83 11 ff 89 d8 99 7c 24 04 29 d3 65 48 8b 04 25 28 00 00 00 48 3b 44 24 10 75 60 RSP: 0018:ffffc90000a07a18 EFLAGS: 00010246 RAX: 000000000000ffd7 RBX: 000000000000ffd7 RCX: 0000000000040000 RDX: 0000000000000000 RSI: 000000000003ffff RDI: 0000000000040000 RBP: 000000000000ffd7 R08: ffffffff820cf297 R09: 0000000000000001 R10: 0000000000000000 R11: ffffffff8103d1a0 R12: 0000000000003f00 R13: 0000000000300000 R14: ffff888101cf3540 R15: 0000000000180000 FS: 00007f9af4c09640(0000) GS:ffff88813bd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000001b33824000 CR3: 000000012f241001 CR4: 0000000000170ee0 Call Trace: __tcp_cleanup_rbuf+0x138/0x1d0 net/ipv4/tcp.c:1611 mptcp_recvmsg+0xcb8/0xdd0 net/mptcp/protocol.c:2034 inet_recvmsg+0x127/0x1f0 net/ipv4/af_inet.c:861 ____sys_recvmsg+0x269/0x2b0 net/socket.c:1019 ___sys_recvmsg+0xe6/0x260 net/socket.c:2764 do_recvmmsg+0x1a5/0x470 net/socket.c:2858 __do_sys_recvmmsg net/socket.c:2937 [inline] __se_sys_recvmmsg net/socket.c:2953 [inline] __x64_sys_recvmmsg+0xa6/0x130 net/socket.c:2953 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x47/0xa0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x72/0xdc RIP: 0033:0x7f9af58fc6a9 Code: 5c c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 4f 37 0d 00 f7 d8 64 89 01 48 RSP: 002b:00007f9af4c08cd8 EFLAGS: 00000246 ORIG_RAX: 000000000000012b RAX: ffffffffffffffda RBX: 00000000006bc050 RCX: 00007f9af58fc6a9 RDX: 0000000000000001 RSI: 0000000020000140 RDI: 0000000000000004 RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000f00 R11: 0000000000000246 R12: 00000000006bc05c R13: fffffffffffffea8 R14: 00000000006bc050 R15: 000000000001fe40 mptcp_recvmsg is allowed to release the msk socket lock when blocking, and before re-acquiring it another thread could have switched the sock to TCP_LISTEN status - with a prior connect(AF_UNSPEC) - also clearing icsk_ack.rcv_mss. Address the issue by preventing the disconnect if some other process is concurrently performing a blocking syscall on the same socket, as in commit 4faeee0cf8a5 ("tcp: deny tcp_disconnect() when threads are waiting").
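Schematically, the faulting step rounds the advertised window down to a multiple of the cached receive MSS. The snippet below is a simplified sketch of that step, not the actual __tcp_select_window() code, showing why a concurrently zeroed rcv_mss faults:

	/* sketch only: free_space rounded down to a multiple of rcv_mss */
	static unsigned int window_round_sketch(unsigned int free_space,
						unsigned int rcv_mss)
	{
		/* divide error when a racing disconnect has zeroed rcv_mss */
		return (free_space / rcv_mss) * rcv_mss;
	}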
Fixes: a6b118febbab ("mptcp: add receive buffer auto-tuning") Cc: stable@vger.kernel.org Reported-by: Christoph Paasch Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/404 Signed-off-by: Paolo Abeni Tested-by: Christoph Paasch Reviewed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 86f8a7621aff..ee357700b27b 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3082,6 +3082,12 @@ static int mptcp_disconnect(struct sock *sk, int flags) { struct mptcp_sock *msk = mptcp_sk(sk); + /* Deny disconnect if other threads are blocked in sk_wait_event() + * or inet_wait_for_connect(). + */ + if (sk->sk_wait_pending) + return -EBUSY; + /* We are on the fastopen error path. We can't call straight into the * subflows cleanup code due to lock nesting (we are already under * msk->firstsocket lock). @@ -3148,6 +3154,7 @@ struct sock *mptcp_sk_clone_init(const struct sock *sk, inet_sk(nsk)->pinet6 = mptcp_inet6_sk(nsk); #endif + nsk->sk_wait_pending = 0; __mptcp_init_sock(nsk); msk = mptcp_sk(nsk); -- cgit v1.2.3 From 56a666c48b038e91b76471289e2cf60c79d326b9 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 20 Jun 2023 18:24:20 +0200 Subject: mptcp: fix possible list corruption on passive MPJ At passive MPJ time, if the msk socket lock is held by the user, the new subflow is appended to the msk->join_list under the msk data lock. In mptcp_release_cb()/__mptcp_flush_join_list(), the subflows in that list are moved from the join_list into the conn_list under the msk socket lock. Appends and removals could race, possibly corrupting the list. Address the issue by splicing the join list into a temporary one while still under the msk data lock. Found by code inspection, the race itself should be almost impossible to trigger in practice.
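The fix is an instance of the usual detach-then-process pattern: steal the whole list under the lock that writers take, then walk the private copy at leisure. A minimal sketch of the pattern, with hypothetical names (data_lock, struct joined) standing in for the mptcp specifics:

	#include <linux/list.h>
	#include <linux/spinlock.h>
	#include <linux/slab.h>

	struct joined { struct list_head node; };
	static LIST_HEAD(shared_list);
	static DEFINE_SPINLOCK(data_lock);

	static void flush_shared(void)
	{
		struct joined *item, *tmp;
		LIST_HEAD(local);

		spin_lock_bh(&data_lock);		/* the lock writers take to append */
		list_splice_init(&shared_list, &local);	/* shared_list is now empty */
		spin_unlock_bh(&data_lock);

		/* 'local' is private here: concurrent appenders only see shared_list */
		list_for_each_entry_safe(item, tmp, &local, node)
			kfree(item);	/* stand-in for the real per-entry work */
	}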
Fixes: 3e5014909b56 ("mptcp: cleanup MPJ subflow list handling") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index ee357700b27b..9a40dae31cec 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -850,12 +850,12 @@ static bool __mptcp_finish_join(struct mptcp_sock *msk, struct sock *ssk) return true; } -static void __mptcp_flush_join_list(struct sock *sk) +static void __mptcp_flush_join_list(struct sock *sk, struct list_head *join_list) { struct mptcp_subflow_context *tmp, *subflow; struct mptcp_sock *msk = mptcp_sk(sk); - list_for_each_entry_safe(subflow, tmp, &msk->join_list, node) { + list_for_each_entry_safe(subflow, tmp, join_list, node) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); bool slow = lock_sock_fast(ssk); @@ -3342,9 +3342,14 @@ static void mptcp_release_cb(struct sock *sk) for (;;) { unsigned long flags = (msk->cb_flags & MPTCP_FLAGS_PROCESS_CTX_NEED) | msk->push_pending; + struct list_head join_list; + if (!flags) break; + INIT_LIST_HEAD(&join_list); + list_splice_init(&msk->join_list, &join_list); + /* the following actions acquire the subflow socket lock * * 1) can't be invoked in atomic scope @@ -3355,8 +3360,9 @@ static void mptcp_release_cb(struct sock *sk) msk->push_pending = 0; msk->cb_flags &= ~flags; spin_unlock_bh(&sk->sk_lock.slock); + if (flags & BIT(MPTCP_FLUSH_JOIN_LIST)) - __mptcp_flush_join_list(sk); + __mptcp_flush_join_list(sk, &join_list); if (flags & BIT(MPTCP_PUSH_PENDING)) __mptcp_push_pending(sk, 0); if (flags & BIT(MPTCP_RETRANSMIT)) -- cgit v1.2.3 From 81c1d029016001f994ce1c46849c5e9900d8eab8 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 20 Jun 2023 18:24:21 +0200 Subject: mptcp: consolidate fallback and non fallback state machine An orphaned msk releases the used resources via the worker, when the latter first sees the msk in CLOSED status. If the msk status transitions to TCP_CLOSE in the release callback invoked by the worker's final release_sock(), such an instance of the workqueue will not take any action. Additionally the MPTCP code prevents scheduling the worker once the socket reaches the CLOSE status: such an msk's resources will be leaked. The only code path that can trigger the above scenario is the __mptcp_check_send_data_fin() in fallback mode. Address the issue by removing the special handling of fallback sockets in __mptcp_check_send_data_fin(), consolidating the state machine for fallback and non-fallback sockets. Since fallback sockets do not send and do not receive data_fin, the mptcp code can update the msk internal status to match the next step in the SM every time a data_fin (ack) should be generated or received. As a consequence we can remove a bunch of checks for fallback from the fastpath.
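Distilled from the diff below: a fallback subflow never exchanges a real DATA_FIN, so on shutdown the code now fakes the peer's ack locally and lets the shared state machine advance. A sketch of the fallback branch in mptcp_subflow_shutdown() after this patch:

	ssk->sk_shutdown |= how;
	tcp_shutdown(ssk, how);

	/* simulate the DATA_FIN ack so the msk state machine moves forward */
	WRITE_ONCE(mptcp_sk(sk)->snd_una, mptcp_sk(sk)->snd_nxt);
	mptcp_schedule_work(sk);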
Fixes: 6e628cd3a8f7 ("mptcp: use mptcp release_cb for delayed tasks") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 41 +++++++++++++++-------------------------- net/mptcp/subflow.c | 17 ++++++++++------- 2 files changed, 25 insertions(+), 33 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 9a40dae31cec..27d206f7af62 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -44,7 +44,7 @@ enum { static struct percpu_counter mptcp_sockets_allocated ____cacheline_aligned_in_smp; static void __mptcp_destroy_sock(struct sock *sk); -static void __mptcp_check_send_data_fin(struct sock *sk); +static void mptcp_check_send_data_fin(struct sock *sk); DEFINE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions); static struct net_device mptcp_napi_dev; @@ -424,8 +424,7 @@ static bool mptcp_pending_data_fin_ack(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); - return !__mptcp_check_fallback(msk) && - ((1 << sk->sk_state) & + return ((1 << sk->sk_state) & (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK)) && msk->write_seq == READ_ONCE(msk->snd_una); } @@ -583,9 +582,6 @@ static bool mptcp_check_data_fin(struct sock *sk) u64 rcv_data_fin_seq; bool ret = false; - if (__mptcp_check_fallback(msk)) - return ret; - /* Need to ack a DATA_FIN received from a peer while this side * of the connection is in ESTABLISHED, FIN_WAIT1, or FIN_WAIT2. * msk->rcv_data_fin was set when parsing the incoming options @@ -623,7 +619,8 @@ static bool mptcp_check_data_fin(struct sock *sk) } ret = true; - mptcp_send_ack(msk); + if (!__mptcp_check_fallback(msk)) + mptcp_send_ack(msk); mptcp_close_wake_up(sk); } return ret; @@ -1609,7 +1606,7 @@ out: if (!mptcp_timer_pending(sk)) mptcp_reset_timer(sk); if (do_check_data_fin) - __mptcp_check_send_data_fin(sk); + mptcp_check_send_data_fin(sk); } static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk, bool first) @@ -2680,8 +2677,6 @@ static void mptcp_worker(struct work_struct *work) if (unlikely((1 << state) & (TCPF_CLOSE | TCPF_LISTEN))) goto unlock; - mptcp_check_data_fin_ack(sk); - mptcp_check_fastclose(msk); mptcp_pm_nl_work(msk); @@ -2689,7 +2684,8 @@ static void mptcp_worker(struct work_struct *work) if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags)) mptcp_check_for_eof(msk); - __mptcp_check_send_data_fin(sk); + mptcp_check_send_data_fin(sk); + mptcp_check_data_fin_ack(sk); mptcp_check_data_fin(sk); if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) @@ -2828,6 +2824,12 @@ void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how) pr_debug("Fallback"); ssk->sk_shutdown |= how; tcp_shutdown(ssk, how); + + /* simulate the data_fin ack reception to let the state + * machine move forward + */ + WRITE_ONCE(mptcp_sk(sk)->snd_una, mptcp_sk(sk)->snd_nxt); + mptcp_schedule_work(sk); } else { pr_debug("Sending DATA_FIN on subflow %p", ssk); tcp_send_ack(ssk); @@ -2867,7 +2869,7 @@ static int mptcp_close_state(struct sock *sk) return next & TCP_ACTION_FIN; } -static void __mptcp_check_send_data_fin(struct sock *sk) +static void mptcp_check_send_data_fin(struct sock *sk) { struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); @@ -2885,19 +2887,6 @@ static void __mptcp_check_send_data_fin(struct sock *sk) WRITE_ONCE(msk->snd_nxt, msk->write_seq); - /* fallback socket will not get data_fin/ack, can move to the next - * state now - */ - if 
(__mptcp_check_fallback(msk)) { - WRITE_ONCE(msk->snd_una, msk->write_seq); - if ((1 << sk->sk_state) & (TCPF_CLOSING | TCPF_LAST_ACK)) { - inet_sk_state_store(sk, TCP_CLOSE); - mptcp_close_wake_up(sk); - } else if (sk->sk_state == TCP_FIN_WAIT1) { - inet_sk_state_store(sk, TCP_FIN_WAIT2); - } - } - mptcp_for_each_subflow(msk, subflow) { struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); @@ -2917,7 +2906,7 @@ static void __mptcp_wr_shutdown(struct sock *sk) WRITE_ONCE(msk->write_seq, msk->write_seq + 1); WRITE_ONCE(msk->snd_data_fin_enable, 1); - __mptcp_check_send_data_fin(sk); + mptcp_check_send_data_fin(sk); } static void __mptcp_destroy_sock(struct sock *sk) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 4688daa6b38b..d9c8b21c6076 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1749,14 +1749,16 @@ static void subflow_state_change(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct sock *parent = subflow->conn; + struct mptcp_sock *msk; __subflow_state_change(sk); + msk = mptcp_sk(parent); if (subflow_simultaneous_connect(sk)) { mptcp_propagate_sndbuf(parent, sk); mptcp_do_fallback(sk); - mptcp_rcv_space_init(mptcp_sk(parent), sk); - pr_fallback(mptcp_sk(parent)); + mptcp_rcv_space_init(msk, sk); + pr_fallback(msk); subflow->conn_finished = 1; mptcp_set_connected(parent); } @@ -1772,11 +1774,12 @@ static void subflow_state_change(struct sock *sk) subflow_sched_work_if_closed(mptcp_sk(parent), sk); - if (__mptcp_check_fallback(mptcp_sk(parent)) && - !subflow->rx_eof && subflow_is_done(sk)) { - subflow->rx_eof = 1; - mptcp_subflow_eof(parent); - } + /* when the fallback subflow closes the rx side, trigger a 'dummy' + * ingress data fin, so that the msk state will follow along + */ + if (__mptcp_check_fallback(msk) && subflow_is_done(sk) && msk->first == sk && + mptcp_update_rcv_data_fin(msk, READ_ONCE(msk->ack_seq), true)) + mptcp_schedule_work(parent); } void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_ssk) -- cgit v1.2.3 From b7535cfed223a9f02f9530853616f197b386d775 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 20 Jun 2023 18:24:22 +0200 Subject: mptcp: drop legacy code around RX EOF Thanks to the previous patch -- "mptcp: consolidate fallback and non fallback state machine" -- we can finally drop the "temporary hack" used to detect rx eof. 
Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 49 ------------------------------------------------- net/mptcp/protocol.h | 5 +---- 2 files changed, 1 insertion(+), 53 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 27d206f7af62..a66ec341485e 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -894,49 +894,6 @@ bool mptcp_schedule_work(struct sock *sk) return false; } -void mptcp_subflow_eof(struct sock *sk) -{ - if (!test_and_set_bit(MPTCP_WORK_EOF, &mptcp_sk(sk)->flags)) - mptcp_schedule_work(sk); -} - -static void mptcp_check_for_eof(struct mptcp_sock *msk) -{ - struct mptcp_subflow_context *subflow; - struct sock *sk = (struct sock *)msk; - int receivers = 0; - - mptcp_for_each_subflow(msk, subflow) - receivers += !subflow->rx_eof; - if (receivers) - return; - - if (!(sk->sk_shutdown & RCV_SHUTDOWN)) { - /* hopefully temporary hack: propagate shutdown status - * to msk, when all subflows agree on it - */ - WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | RCV_SHUTDOWN); - - smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ - sk->sk_data_ready(sk); - } - - switch (sk->sk_state) { - case TCP_ESTABLISHED: - inet_sk_state_store(sk, TCP_CLOSE_WAIT); - break; - case TCP_FIN_WAIT1: - inet_sk_state_store(sk, TCP_CLOSING); - break; - case TCP_FIN_WAIT2: - inet_sk_state_store(sk, TCP_CLOSE); - break; - default: - return; - } - mptcp_close_wake_up(sk); -} - static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; @@ -2161,9 +2118,6 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, break; } - if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags)) - mptcp_check_for_eof(msk); - if (sk->sk_shutdown & RCV_SHUTDOWN) { /* race breaker: the shutdown could be after the * previous receive queue check @@ -2681,9 +2635,6 @@ static void mptcp_worker(struct work_struct *work) mptcp_pm_nl_work(msk); - if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags)) - mptcp_check_for_eof(msk); - mptcp_check_send_data_fin(sk); mptcp_check_data_fin_ack(sk); mptcp_check_data_fin(sk); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 70c957bc56a8..d3783a7056e1 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -113,7 +113,6 @@ /* MPTCP socket atomic flags */ #define MPTCP_NOSPACE 1 #define MPTCP_WORK_RTX 2 -#define MPTCP_WORK_EOF 3 #define MPTCP_FALLBACK_DONE 4 #define MPTCP_WORK_CLOSE_SUBFLOW 5 @@ -476,14 +475,13 @@ struct mptcp_subflow_context { send_mp_fail : 1, send_fastclose : 1, send_infinite_map : 1, - rx_eof : 1, remote_key_valid : 1, /* received the peer key from */ disposable : 1, /* ctx can be free at ulp release time */ stale : 1, /* unable to snd/rcv data, do not use for xmit */ local_id_valid : 1, /* local_id is correctly initialized */ valid_csum_seen : 1, /* at least one csum validated */ is_mptfo : 1, /* subflow is doing TFO */ - __unused : 8; + __unused : 9; enum mptcp_data_avail data_avail; u32 remote_nonce; u64 thmac; @@ -720,7 +718,6 @@ static inline u64 mptcp_expand_seq(u64 old_seq, u64 cur_seq, bool use_64bit) void __mptcp_check_push(struct sock *sk, struct sock *ssk); void __mptcp_data_acked(struct sock *sk); void __mptcp_error_report(struct sock *sk); -void mptcp_subflow_eof(struct sock *sk); bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit); static inline bool mptcp_data_fin_enabled(const struct mptcp_sock *msk) { 
-- cgit v1.2.3 From 57fc0f1ceaa4016354cf6f88533e20b56190e41a Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 20 Jun 2023 18:24:23 +0200 Subject: mptcp: ensure listener is unhashed before updating the sk status The MPTCP protocol accesses the listener subflow in a lockless manner in a couple of places (poll, diag). That works only if the msk itself leaves the listener status only after the subflow itself has been closed/disconnected. Otherwise we risk a deadlock in diag, as reported by Christoph. Address the issue by ensuring that the first subflow (the listener one) is always disconnected before updating the msk socket status. Reported-by: Christoph Paasch Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/407 Fixes: b29fcfb54cd7 ("mptcp: full disconnect implementation") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 1 + net/mptcp/protocol.c | 31 +++++++++++++++++++------------ 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 59f8f3124855..1224dfca5bf3 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -1047,6 +1047,7 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk, if (err) return err; + inet_sk_state_store(newsk, TCP_LISTEN); err = kernel_listen(ssock, backlog); if (err) return err; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index a66ec341485e..a6c7f2d24909 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2368,13 +2368,6 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, kfree_rcu(subflow, rcu); } else { /* otherwise tcp will dispose of the ssk and subflow ctx */ - if (ssk->sk_state == TCP_LISTEN) { - tcp_set_state(ssk, TCP_CLOSE); - mptcp_subflow_queue_clean(sk, ssk); - inet_csk_listen_stop(ssk); - mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CLOSED); - } - __tcp_close(ssk, 0); /* close acquired an extra ref */ @@ -2902,10 +2895,24 @@ static __poll_t mptcp_check_readable(struct mptcp_sock *msk) return EPOLLIN | EPOLLRDNORM; } -static void mptcp_listen_inuse_dec(struct sock *sk) +static void mptcp_check_listen_stop(struct sock *sk) { - if (inet_sk_state_load(sk) == TCP_LISTEN) - sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); + struct sock *ssk; + + if (inet_sk_state_load(sk) != TCP_LISTEN) + return; + + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); + ssk = mptcp_sk(sk)->first; + if (WARN_ON_ONCE(!ssk || inet_sk_state_load(ssk) != TCP_LISTEN)) + return; + + lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); + mptcp_subflow_queue_clean(sk, ssk); + inet_csk_listen_stop(ssk); + mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CLOSED); + tcp_set_state(ssk, TCP_CLOSE); + release_sock(ssk); } bool __mptcp_close(struct sock *sk, long timeout) @@ -2918,7 +2925,7 @@ bool __mptcp_close(struct sock *sk, long timeout) WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) { - mptcp_listen_inuse_dec(sk); + mptcp_check_listen_stop(sk); inet_sk_state_store(sk, TCP_CLOSE); goto cleanup; } @@ -3035,7 +3042,7 @@ static int mptcp_disconnect(struct sock *sk, int flags) if (msk->fastopening) return -EBUSY; - mptcp_listen_inuse_dec(sk); + mptcp_check_listen_stop(sk); inet_sk_state_store(sk, TCP_CLOSE); mptcp_stop_timer(sk); -- cgit v1.2.3 From 7f4e09700bdc13ce9aafa279bc999051e9bcda35 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Wed, 21 Jun 2023 14:05:44 +0200
Subject: wifi: mac80211: report all unusable beacon frames Properly check for RX_DROP_UNUSABLE now that the new drop reason infrastructure is used. Without this change, the comparison will always be false as a more specific reason is given in the lower bits of result. Fixes: baa951a1c177 ("mac80211: use the new drop reasons infrastructure") Signed-off-by: Benjamin Berg Signed-off-by: Johannes Berg Link: https://lore.kernel.org/r/20230621120543.412920-2-johannes@sipsolutions.net Signed-off-by: Jakub Kicinski --- net/mac80211/rx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index d996aa2579df..fc6e130364da 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2110,7 +2110,7 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) /* either the frame has been decrypted or will be dropped */ status->flag |= RX_FLAG_DECRYPTED; - if (unlikely(ieee80211_is_beacon(fc) && result == RX_DROP_UNUSABLE && + if (unlikely(ieee80211_is_beacon(fc) && (result & RX_DROP_UNUSABLE) && rx->sdata->dev)) cfg80211_rx_unprot_mlme_mgmt(rx->sdata->dev, skb->data, skb->len); -- cgit v1.2.3 From c7c059fba6fb19c3bc924925c984772e733cb594 Mon Sep 17 00:00:00 2001 From: Danielle Ratson Date: Tue, 20 Jun 2023 14:45:15 +0200 Subject: selftests: forwarding: Fix race condition in mirror installation When mirroring to a gretap in hardware the device expects to be programmed with the egress port and all the encapsulating headers. This requires the driver to resolve the path the packet will take in the software data path and program the device accordingly. If the path cannot be resolved (in this case because of an unresolved neighbor), then mirror installation fails until the path is resolved. This results in a race that causes the test to sometimes fail. Fix this by setting the neighbor's state to permanent in a couple of tests, so that it is always valid. 
Fixes: 35c31d5c323f ("selftests: forwarding: Test mirror-to-gretap w/ UL 802.1d") Fixes: 239e754af854 ("selftests: forwarding: Test mirror-to-gretap w/ UL 802.1q") Signed-off-by: Danielle Ratson Reviewed-by: Petr Machata Signed-off-by: Petr Machata Link: https://lore.kernel.org/r/268816ac729cb6028c7a34d4dda6f4ec7af55333.1687264607.git.petrm@nvidia.com Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/forwarding/mirror_gre_bridge_1d.sh | 4 ++++ tools/testing/selftests/net/forwarding/mirror_gre_bridge_1q.sh | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1d.sh b/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1d.sh index c5095da7f6bf..aec752a22e9e 100755 --- a/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1d.sh +++ b/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1d.sh @@ -93,12 +93,16 @@ cleanup() test_gretap() { + ip neigh replace 192.0.2.130 lladdr $(mac_get $h3) \ + nud permanent dev br2 full_test_span_gre_dir gt4 ingress 8 0 "mirror to gretap" full_test_span_gre_dir gt4 egress 0 8 "mirror to gretap" } test_ip6gretap() { + ip neigh replace 2001:db8:2::2 lladdr $(mac_get $h3) \ + nud permanent dev br2 full_test_span_gre_dir gt6 ingress 8 0 "mirror to ip6gretap" full_test_span_gre_dir gt6 egress 0 8 "mirror to ip6gretap" } diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1q.sh b/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1q.sh index 9ff22f28032d..0cf4c47a46f9 100755 --- a/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1q.sh +++ b/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1q.sh @@ -90,12 +90,16 @@ cleanup() test_gretap() { + ip neigh replace 192.0.2.130 lladdr $(mac_get $h3) \ + nud permanent dev br1 full_test_span_gre_dir gt4 ingress 8 0 "mirror to gretap" full_test_span_gre_dir gt4 egress 0 8 "mirror to gretap" } test_ip6gretap() { + ip neigh replace 2001:db8:2::2 lladdr $(mac_get $h3) \ + nud permanent dev br1 full_test_span_gre_dir gt6 ingress 8 0 "mirror to ip6gretap" full_test_span_gre_dir gt6 egress 0 8 "mirror to ip6gretap" } -- cgit v1.2.3 From 146b6f6855e7656e8329910606595220c761daac Mon Sep 17 00:00:00 2001 From: Shyam Sundar S K Date: Thu, 22 Jun 2023 11:33:09 +0530 Subject: platform/x86/amd/pmf: Register notify handler only if SPS is enabled The power source notify handler is getting registered even when none of the PMF features is enabled, leading to a crash. ... [ 22.592162] Call Trace: [ 22.592164] [ 22.592164] ? rcu_note_context_switch+0x5e0/0x660 [ 22.592166] ? __warn+0x81/0x130 [ 22.592171] ? rcu_note_context_switch+0x5e0/0x660 [ 22.592172] ? report_bug+0x171/0x1a0 [ 22.592175] ? prb_read_valid+0x1b/0x30 [ 22.592177] ? handle_bug+0x3c/0x80 [ 22.592178] ? exc_invalid_op+0x17/0x70 [ 22.592179] ? asm_exc_invalid_op+0x1a/0x20 [ 22.592182] ? rcu_note_context_switch+0x5e0/0x660 [ 22.592183] ? acpi_ut_delete_object_desc+0x86/0xb0 [ 22.592186] ? acpi_ut_update_ref_count.part.0+0x22d/0x930 [ 22.592187] __schedule+0xc0/0x1410 [ 22.592189] ? ktime_get+0x3c/0xa0 [ 22.592191] ? lapic_next_event+0x1d/0x30 [ 22.592193] ? hrtimer_start_range_ns+0x25b/0x350 [ 22.592196] schedule+0x5e/0xd0 [ 22.592197] schedule_hrtimeout_range_clock+0xbe/0x140 [ 22.592199] ?
__pfx_hrtimer_wakeup+0x10/0x10 [ 22.592200] usleep_range_state+0x64/0x90 [ 22.592203] amd_pmf_send_cmd+0x106/0x2a0 [amd_pmf bddfe0fe3712aaa99acce3d5487405c5213c6616] [ 22.592207] amd_pmf_update_slider+0x56/0x1b0 [amd_pmf bddfe0fe3712aaa99acce3d5487405c5213c6616] [ 22.592210] amd_pmf_set_sps_power_limits+0x72/0x80 [amd_pmf bddfe0fe3712aaa99acce3d5487405c5213c6616] [ 22.592213] amd_pmf_pwr_src_notify_call+0x49/0x90 [amd_pmf bddfe0fe3712aaa99acce3d5487405c5213c6616] [ 22.592216] notifier_call_chain+0x5a/0xd0 [ 22.592218] atomic_notifier_call_chain+0x32/0x50 ... Fix this by registering the power source change notify handler only when SPS (Static Slider) is advertised as supported. Reported-by: Allen Zhong Closes: https://bugzilla.kernel.org/show_bug.cgi?id=217571 Fixes: 4c71ae414474 ("platform/x86/amd/pmf: Add support SPS PMF feature") Tested-by: Patil Rajesh Reddy Reviewed-by: Mario Limonciello Signed-off-by: Shyam Sundar S K Link: https://lore.kernel.org/r/20230622060309.310001-1-Shyam-sundar.S-k@amd.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/amd/pmf/core.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/platform/x86/amd/pmf/core.c b/drivers/platform/x86/amd/pmf/core.c index ee5f124f78b6..7780705917b7 100644 --- a/drivers/platform/x86/amd/pmf/core.c +++ b/drivers/platform/x86/amd/pmf/core.c @@ -297,6 +297,8 @@ static void amd_pmf_init_features(struct amd_pmf_dev *dev) /* Enable Static Slider */ if (is_apmf_func_supported(dev, APMF_FUNC_STATIC_SLIDER_GRANULAR)) { amd_pmf_init_sps(dev); + dev->pwr_src_notifier.notifier_call = amd_pmf_pwr_src_notify_call; + power_supply_reg_notifier(&dev->pwr_src_notifier); dev_dbg(dev->dev, "SPS enabled and Platform Profiles registered\n"); } @@ -315,8 +317,10 @@ static void amd_pmf_init_features(struct amd_pmf_dev *dev) static void amd_pmf_deinit_features(struct amd_pmf_dev *dev) { - if (is_apmf_func_supported(dev, APMF_FUNC_STATIC_SLIDER_GRANULAR)) + if (is_apmf_func_supported(dev, APMF_FUNC_STATIC_SLIDER_GRANULAR)) { + power_supply_unreg_notifier(&dev->pwr_src_notifier); amd_pmf_deinit_sps(dev); + } if (is_apmf_func_supported(dev, APMF_FUNC_AUTO_MODE)) { amd_pmf_deinit_auto_mode(dev); @@ -399,9 +403,6 @@ static int amd_pmf_probe(struct platform_device *pdev) apmf_install_handler(dev); amd_pmf_dbgfs_register(dev); - dev->pwr_src_notifier.notifier_call = amd_pmf_pwr_src_notify_call; - power_supply_reg_notifier(&dev->pwr_src_notifier); - dev_info(dev->dev, "registered PMF device successfully\n"); return 0; @@ -411,7 +412,6 @@ static void amd_pmf_remove(struct platform_device *pdev) { struct amd_pmf_dev *dev = platform_get_drvdata(pdev); - power_supply_unreg_notifier(&dev->pwr_src_notifier); amd_pmf_deinit_features(dev); apmf_acpi_deinit(dev); amd_pmf_dbgfs_unregister(dev); -- cgit v1.2.3 From 2174a08db80d1efeea382e25ac41c4e7511eb6d6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 20 Jun 2023 18:44:25 +0000 Subject: sch_netem: acquire qdisc lock in netem_change() syzbot managed to trigger a divide error [1] in netem. It could happen if q->rate changes while netem_enqueue() is running, since q->rate is read twice. It turns out netem_change() always lacked proper synchronization.
[1] divide error: 0000 [#1] SMP KASAN CPU: 1 PID: 7867 Comm: syz-executor.1 Not tainted 6.1.30-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 05/25/2023 RIP: 0010:div64_u64 include/linux/math64.h:69 [inline] RIP: 0010:packet_time_ns net/sched/sch_netem.c:357 [inline] RIP: 0010:netem_enqueue+0x2067/0x36d0 net/sched/sch_netem.c:576 Code: 89 e2 48 69 da 00 ca 9a 3b 42 80 3c 28 00 4c 8b a4 24 88 00 00 00 74 0d 4c 89 e7 e8 c3 4f 3b fd 48 8b 4c 24 18 48 89 d8 31 d2 <49> f7 34 24 49 01 c7 4c 8b 64 24 48 4d 01 f7 4c 89 e3 48 c1 eb 03 RSP: 0018:ffffc9000dccea60 EFLAGS: 00010246 RAX: 000001a442624200 RBX: 000001a442624200 RCX: ffff888108a4f000 RDX: 0000000000000000 RSI: 000000000000070d RDI: 000000000000070d RBP: ffffc9000dcceb90 R08: ffffffff849c5e26 R09: fffffbfff10e1297 R10: 0000000000000000 R11: dffffc0000000001 R12: ffff888108a4f358 R13: dffffc0000000000 R14: 0000001a8cd9a7ec R15: 0000000000000000 FS: 00007fa73fe18700(0000) GS:ffff8881f6b00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fa73fdf7718 CR3: 000000011d36e000 CR4: 0000000000350ee0 Call Trace: [] __dev_xmit_skb net/core/dev.c:3931 [inline] [] __dev_queue_xmit+0xcf5/0x3370 net/core/dev.c:4290 [] dev_queue_xmit include/linux/netdevice.h:3030 [inline] [] neigh_hh_output include/net/neighbour.h:531 [inline] [] neigh_output include/net/neighbour.h:545 [inline] [] ip_finish_output2+0xb92/0x10d0 net/ipv4/ip_output.c:235 [] __ip_finish_output+0xc3/0x2b0 [] ip_finish_output+0x31/0x2a0 net/ipv4/ip_output.c:323 [] NF_HOOK_COND include/linux/netfilter.h:298 [inline] [] ip_output+0x224/0x2a0 net/ipv4/ip_output.c:437 [] dst_output include/net/dst.h:444 [inline] [] ip_local_out net/ipv4/ip_output.c:127 [inline] [] __ip_queue_xmit+0x1425/0x2000 net/ipv4/ip_output.c:542 [] ip_queue_xmit+0x4c/0x70 net/ipv4/ip_output.c:556 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: Stephen Hemminger Cc: Jamal Hadi Salim Cc: Cong Wang Cc: Jiri Pirko Reviewed-by: Jamal Hadi Salim Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230620184425.1179809-1-edumazet@google.com Signed-off-by: Paolo Abeni --- net/sched/sch_netem.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 6ef3021e1169..e79be1b3e74d 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -966,6 +966,7 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt, if (ret < 0) return ret; + sch_tree_lock(sch); /* backup q->clg and q->loss_model */ old_clg = q->clg; old_loss_model = q->loss_model; @@ -974,7 +975,7 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt, ret = get_loss_clg(q, tb[TCA_NETEM_LOSS]); if (ret) { q->loss_model = old_loss_model; - return ret; + goto unlock; } } else { q->loss_model = CLG_RANDOM; @@ -1041,6 +1042,8 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt, /* capping jitter to the range acceptable by tabledist() */ q->jitter = min_t(s64, abs(q->jitter), INT_MAX); +unlock: + sch_tree_unlock(sch); return ret; get_table_failure: @@ -1050,7 +1053,8 @@ get_table_failure: */ q->clg = old_clg; q->loss_model = old_loss_model; - return ret; + + goto unlock; } static int netem_init(struct Qdisc *sch, struct nlattr *opt, -- cgit v1.2.3 From dec24b3b339487e58ce2da2875e9ee0316cc7e70 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 20 Jun 2023 12:42:38 -0700 Subject: net: wwan: iosm: Convert single instance struct member to 
flexible array struct mux_adth actually ends with multiple struct mux_adth_dg members. This is seen both in the comments about the member: /** * struct mux_adth - Structure of the Aggregated Datagram Table Header. ... * @dg: datagramm table with variable length */ and in the preparation for populating it: adth_dg_size = offsetof(struct mux_adth, dg) + ul_adb->dg_count[i] * sizeof(*dg); ... adth_dg_size -= offsetof(struct mux_adth, dg); memcpy(&adth->dg, ul_adb->dg[i], adth_dg_size); This was reported as a run-time false positive warning: memcpy: detected field-spanning write (size 16) of single field "&adth->dg" at drivers/net/wwan/iosm/iosm_ipc_mux_codec.c:852 (size 8) Adjust the struct mux_adth definition and associated sizeof() math; no binary output differences are observed in the resulting object file. Reported-by: Florian Klink Closes: https://lore.kernel.org/lkml/dbfa25f5-64c8-5574-4f5d-0151ba95d232@gmail.com/ Fixes: 1f52d7b62285 ("net: wwan: iosm: Enable M.2 7360 WWAN card support") Cc: M Chetan Kumar Cc: Bagas Sanjaya Cc: Intel Corporation Cc: Loic Poulain Cc: Sergey Ryazanov Cc: Johannes Berg Cc: "David S. Miller" Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: "Gustavo A. R. Silva" Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Reviewed-by: Gustavo A. R. Silva Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230620194234.never.023-kees@kernel.org Signed-off-by: Paolo Abeni --- drivers/net/wwan/iosm/iosm_ipc_mux_codec.c | 15 ++++++--------- drivers/net/wwan/iosm/iosm_ipc_mux_codec.h | 2 +- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/drivers/net/wwan/iosm/iosm_ipc_mux_codec.c b/drivers/net/wwan/iosm/iosm_ipc_mux_codec.c index d6b166fc5c0e..bff46f7ca59f 100644 --- a/drivers/net/wwan/iosm/iosm_ipc_mux_codec.c +++ b/drivers/net/wwan/iosm/iosm_ipc_mux_codec.c @@ -626,14 +626,12 @@ static void mux_dl_adb_decode(struct iosm_mux *ipc_mux, if (adth->signature != cpu_to_le32(IOSM_AGGR_MUX_SIG_ADTH)) goto adb_decode_err; - if (le16_to_cpu(adth->table_length) < (sizeof(struct mux_adth) - - sizeof(struct mux_adth_dg))) + if (le16_to_cpu(adth->table_length) < sizeof(struct mux_adth)) goto adb_decode_err; /* Calculate the number of datagrams. */ nr_of_dg = (le16_to_cpu(adth->table_length) - - sizeof(struct mux_adth) + - sizeof(struct mux_adth_dg)) / + sizeof(struct mux_adth)) / sizeof(struct mux_adth_dg); /* Is the datagram table empty ? */ @@ -649,7 +647,7 @@ static void mux_dl_adb_decode(struct iosm_mux *ipc_mux, } /* New aggregated datagram table. 
*/ - dg = &adth->dg; + dg = adth->dg; if (mux_dl_process_dg(ipc_mux, adbh, dg, skb, if_id, nr_of_dg) < 0) goto adb_decode_err; @@ -849,7 +847,7 @@ static void ipc_mux_ul_encode_adth(struct iosm_mux *ipc_mux, adth->if_id = i; adth->table_length = cpu_to_le16(adth_dg_size); adth_dg_size -= offsetof(struct mux_adth, dg); - memcpy(&adth->dg, ul_adb->dg[i], adth_dg_size); + memcpy(adth->dg, ul_adb->dg[i], adth_dg_size); ul_adb->if_cnt++; } @@ -1426,14 +1424,13 @@ static int ipc_mux_get_payload_from_adb(struct iosm_mux *ipc_mux, if (adth->signature == cpu_to_le32(IOSM_AGGR_MUX_SIG_ADTH)) { nr_of_dg = (le16_to_cpu(adth->table_length) - - sizeof(struct mux_adth) + - sizeof(struct mux_adth_dg)) / + sizeof(struct mux_adth)) / sizeof(struct mux_adth_dg); if (nr_of_dg <= 0) return payload_size; - dg = &adth->dg; + dg = adth->dg; for (i = 0; i < nr_of_dg; i++, dg++) { if (le32_to_cpu(dg->datagram_index) < diff --git a/drivers/net/wwan/iosm/iosm_ipc_mux_codec.h b/drivers/net/wwan/iosm/iosm_ipc_mux_codec.h index 5d4e3b89542c..f8df88f816c4 100644 --- a/drivers/net/wwan/iosm/iosm_ipc_mux_codec.h +++ b/drivers/net/wwan/iosm/iosm_ipc_mux_codec.h @@ -161,7 +161,7 @@ struct mux_adth { u8 opt_ipv4v6; __le32 next_table_index; __le32 reserved2; - struct mux_adth_dg dg; + struct mux_adth_dg dg[]; }; /** -- cgit v1.2.3 From a9628e88776eb7d045cf46467f1afdd0f7fe72ea Mon Sep 17 00:00:00 2001 From: Maciej Żenczykowski Date: Sun, 18 Jun 2023 03:31:30 -0700 Subject: revert "net: align SO_RCVMARK required privileges with SO_MARK" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 1f86123b9749 ("net: align SO_RCVMARK required privileges with SO_MARK") because the reasoning in the commit message is not really correct: SO_RCVMARK is used for 'reading' incoming skb mark (via cmsg), as such it is more equivalent to 'getsockopt(SO_MARK)' which has no priv check and retrieves the socket mark, rather than 'setsockopt(SO_MARK)' which sets the socket mark and does require privs. Additionally incoming skb->mark may already be visible if sysctl_fwmark_reflect and/or sysctl_tcp_fwmark_accept are enabled. Furthermore, it is easier to block the getsockopt via bpf (either cgroup setsockopt hook, or via syscall filters) than to unblock it if it requires CAP_NET_RAW/ADMIN. On Android the socket mark is (among other things) used to store the network identifier a socket is bound to. Setting it is privileged, but retrieving it is not. We'd like unprivileged userspace to be able to read the network id of incoming packets (where mark is set via iptables [to be moved to bpf])... An alternative would be to add another sysctl to control whether setting SO_RCVMARK is privileged or not. (or even a MASK of which bits in the mark can be exposed) But this seems like over-engineering... Note: This is a non-trivial revert, due to later merged commit e42c7beee71d ("bpf: net: Consider has_current_bpf_ctx() when testing capable() in sk_setsockopt()") which changed both 'ns_capable' into 'sockopt_ns_capable' calls.
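For context, consuming the mark is a pure receive-side operation. Below is a hedged userspace sketch (error handling trimmed, buffer sizes illustrative, requires libc headers that define SO_RCVMARK) of how an unprivileged process reads it from the SO_MARK ancillary message:

	#include <stdio.h>
	#include <sys/socket.h>

	/* sketch: enable SO_RCVMARK, then read the mark from the SO_MARK cmsg
	 * attached to each received packet */
	static void print_mark(int fd)
	{
		int one = 1;
		char data[2048], ctrl[CMSG_SPACE(sizeof(unsigned int))];
		struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
		struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1,
				      .msg_control = ctrl, .msg_controllen = sizeof(ctrl) };
		struct cmsghdr *cmsg;

		setsockopt(fd, SOL_SOCKET, SO_RCVMARK, &one, sizeof(one));
		if (recvmsg(fd, &msg, 0) < 0)
			return;
		for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg))
			if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SO_MARK)
				printf("mark=%u\n", *(unsigned int *)CMSG_DATA(cmsg));
	}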
Fixes: 1f86123b9749 ("net: align SO_RCVMARK required privileges with SO_MARK") Cc: Larysa Zaremba Cc: Simon Horman Cc: Paolo Abeni Cc: Eyal Birger Cc: Jakub Kicinski Cc: Eric Dumazet Cc: Patrick Rohr Signed-off-by: Maciej Żenczykowski Reviewed-by: Simon Horman Reviewed-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20230618103130.51628-1-maze@google.com Signed-off-by: Paolo Abeni --- net/core/sock.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/net/core/sock.c b/net/core/sock.c index 24f2761bdb1d..6e5662ca00fe 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1362,12 +1362,6 @@ set_sndbuf: __sock_set_mark(sk, val); break; case SO_RCVMARK: - if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && - !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { - ret = -EPERM; - break; - } - sock_valbool_flag(sk, SOCK_RCVMARK, valbool); break; -- cgit v1.2.3 From 2230f9e1171a2e9731422a14d1bbc313c0b719d1 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 15 Jun 2023 15:42:59 +1000 Subject: KVM: Avoid illegal stage2 mapping on invalid memory slot We ran into a guest hang in the edk2 firmware when KSM is kept running on the host. The edk2 firmware is waiting for status 0x80 from QEMU's pflash device (TYPE_PFLASH_CFI01) during the operation of sector erasing or buffered write. The status is returned by reading the memory region of the pflash device and the read request should have been forwarded to QEMU and emulated by it. Unfortunately, the read request is covered by an illegal stage2 mapping when the guest hang issue occurs. The read request is completed with QEMU bypassed and a wrong status is fetched. The edk2 firmware runs into an infinite loop with the wrong status. The illegal stage2 mapping is populated due to same-page sharing by KSM at (C) even though the associated memory slot has been marked as invalid at (B) when the memory slot is requested to be deleted. It's notable that the active and inactive memory slots can't be swapped when we're in the middle of kvm_mmu_notifier_change_pte() because kvm->mn_active_invalidate_count is elevated, and kvm_swap_active_memslots() will busy loop until it reaches zero again. Besides, the swapping from the active to the inactive memory slots is also avoided by holding &kvm->srcu in __kvm_handle_hva_range(), corresponding to synchronize_srcu_expedited() in kvm_swap_active_memslots(). CPU-A CPU-B ----- ----- ioctl(kvm_fd, KVM_SET_USER_MEMORY_REGION) kvm_vm_ioctl_set_memory_region kvm_set_memory_region __kvm_set_memory_region kvm_set_memslot(kvm, old, NULL, KVM_MR_DELETE) kvm_invalidate_memslot kvm_copy_memslot kvm_replace_memslot kvm_swap_active_memslots (A) kvm_arch_flush_shadow_memslot (B) same page sharing by KSM kvm_mmu_notifier_invalidate_range_start : kvm_mmu_notifier_change_pte kvm_handle_hva_range __kvm_handle_hva_range kvm_set_spte_gfn (C) : kvm_mmu_notifier_invalidate_range_end Fix the issue by skipping the invalid memory slot at (C) to avoid the illegal stage2 mapping so that the read request for the pflash's status is forwarded to QEMU and emulated by it. In this way, the correct pflash status can be returned from QEMU to break the infinite loop in the edk2 firmware. We tried a git-bisect and the first problematic commit is cd4c71835228 ("KVM: arm64: Convert to the gfn-based MMU notifier callbacks"). With this, clean_dcache_guest_page() is called after the memory slots are iterated in kvm_mmu_notifier_change_pte(); before that commit, clean_dcache_guest_page() was called before the iteration on the memory slots.
This change effectively enlarges the race window between kvm_mmu_notifier_change_pte() and memory slot removal, so that we're able to reproduce the issue in a practical test case. However, the issue has existed since commit d5d8184d35c9 ("KVM: ARM: Memory virtualization setup"). Cc: stable@vger.kernel.org # v3.9+ Fixes: d5d8184d35c9 ("KVM: ARM: Memory virtualization setup") Reported-by: Shuai Hu Reported-by: Zhenyu Zhang Signed-off-by: Gavin Shan Reviewed-by: David Hildenbrand Reviewed-by: Oliver Upton Reviewed-by: Peter Xu Reviewed-by: Sean Christopherson Reviewed-by: Shaoqin Huang Message-Id: <20230615054259.14911-1-gshan@redhat.com> Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 479802a892d4..65f94f592ff8 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -686,6 +686,24 @@ static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn return __kvm_handle_hva_range(kvm, &range); } + +static bool kvm_change_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) +{ + /* + * Skipping invalid memslots is correct if and only change_pte() is + * surrounded by invalidate_range_{start,end}(), which is currently + * guaranteed by the primary MMU. If that ever changes, KVM needs to + * unmap the memslot instead of skipping the memslot to ensure that KVM + * doesn't hold references to the old PFN. + */ + WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count)); + + if (range->slot->flags & KVM_MEMSLOT_INVALID) + return false; + + return kvm_set_spte_gfn(kvm, range); +} + static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long address, @@ -707,7 +725,7 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, if (!READ_ONCE(kvm->mmu_invalidate_in_progress)) return; - kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn); + kvm_handle_hva_range(mn, address, address + 1, pte, kvm_change_spte_gfn); } void kvm_mmu_invalidate_begin(struct kvm *kvm, unsigned long start, -- cgit v1.2.3
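Restated outside the diff, the guard at the heart of this fix is small; the sketch below reuses the names from the patch above:

	/* Refuse to install a mapping for a slot that is being deleted: the guest
	 * access then faults and is forwarded to userspace (e.g. QEMU) instead of
	 * hitting a stale stage2 translation. Returning false means no flush is
	 * needed, since nothing was mapped. */
	static bool change_spte_guard_sketch(struct kvm *kvm, struct kvm_gfn_range *range)
	{
		if (range->slot->flags & KVM_MEMSLOT_INVALID)
			return false;

		return kvm_set_spte_gfn(kvm, range);
	}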