From 19873eec7e13fda140a0ebc75d6664e57c00bfb1 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Fri, 4 Sep 2020 19:55:55 -0700 Subject: Drivers: hv: vmbus: hibernation: do not hang forever in vmbus_bus_resume() After we Stop and later Start a VM that uses Accelerated Networking (NIC SR-IOV), currently the VF vmbus device's Instance GUID can change, so after vmbus_bus_resume() -> vmbus_request_offers(), vmbus_onoffer() can not find the original vmbus channel of the VF, and hence we can't complete() vmbus_connection.ready_for_resume_event in check_ready_for_resume_event(), and the VM hangs in vmbus_bus_resume() forever. Fix the issue by adding a timeout, so the resuming can still succeed, and the saved state is not lost, and according to my test, the user can disable Accelerated Networking and then will be able to SSH into the VM for further recovery. Also prevent the VM in question from suspending again. The host will be fixed so in future the Instance GUID will stay the same across hibernation. Fixes: d8bd2d442bb2 ("Drivers: hv: vmbus: Resume after fixing up old primary channels") Signed-off-by: Dexuan Cui Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20200905025555.45614-1-decui@microsoft.com Signed-off-by: Wei Liu --- drivers/hv/vmbus_drv.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 45c954d75e70..0f2d6a60f769 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -2387,7 +2387,10 @@ static int vmbus_bus_suspend(struct device *dev) if (atomic_read(&vmbus_connection.nr_chan_close_on_suspend) > 0) wait_for_completion(&vmbus_connection.ready_for_suspend_event); - WARN_ON(atomic_read(&vmbus_connection.nr_chan_fixup_on_resume) != 0); + if (atomic_read(&vmbus_connection.nr_chan_fixup_on_resume) != 0) { + pr_err("Can not suspend due to a previous failed resuming\n"); + return -EBUSY; + } mutex_lock(&vmbus_connection.channel_mutex); @@ -2463,7 +2466,9 @@ static int vmbus_bus_resume(struct device *dev) vmbus_request_offers(); - wait_for_completion(&vmbus_connection.ready_for_resume_event); + if (wait_for_completion_timeout( + &vmbus_connection.ready_for_resume_event, 10 * HZ) == 0) + pr_err("Some vmbus device is missing after suspending?\n"); /* Reset the event for the next suspend. */ reinit_completion(&vmbus_connection.ready_for_suspend_event); -- cgit v1.2.3 From 911e1987efc8f3e6445955fbae7f54b428b92bd3 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Sun, 13 Sep 2020 12:47:29 -0700 Subject: Drivers: hv: vmbus: Add timeout to vmbus_wait_for_unload vmbus_wait_for_unload() looks for a CHANNELMSG_UNLOAD_RESPONSE message coming from Hyper-V. But if the message isn't found for some reason, the panic path gets hung forever. Add a timeout of 10 seconds to prevent this. Fixes: 415719160de3 ("Drivers: hv: vmbus: avoid scheduling in interrupt context in vmbus_initiate_unload()") Signed-off-by: Michael Kelley Reviewed-by: Dexuan Cui Reviewed-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/1600026449-23651-1-git-send-email-mikelley@microsoft.com Signed-off-by: Wei Liu --- drivers/hv/channel_mgmt.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index 417a95e5094d..af7832e13167 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -750,7 +750,7 @@ static void vmbus_wait_for_unload(void) void *page_addr; struct hv_message *msg; struct vmbus_channel_message_header *hdr; - u32 message_type; + u32 message_type, i; /* * CHANNELMSG_UNLOAD_RESPONSE is always delivered to the CPU which was @@ -760,8 +760,11 @@ static void vmbus_wait_for_unload(void) * functional and vmbus_unload_response() will complete * vmbus_connection.unload_event. If not, the last thing we can do is * read message pages for all CPUs directly. + * + * Wait no more than 10 seconds so that the panic path can't get + * hung forever in case the response message isn't seen. */ - while (1) { + for (i = 0; i < 1000; i++) { if (completion_done(&vmbus_connection.unload_event)) break; -- cgit v1.2.3