summaryrefslogtreecommitdiff
path: root/drivers/hv
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/hv')
-rw-r--r--drivers/hv/Kconfig66
-rw-r--r--drivers/hv/Makefile15
-rw-r--r--drivers/hv/channel.c143
-rw-r--r--drivers/hv/channel_mgmt.c46
-rw-r--r--drivers/hv/connection.c39
-rw-r--r--drivers/hv/hv.c488
-rw-r--r--drivers/hv/hv_balloon.c6
-rw-r--r--drivers/hv/hv_common.c320
-rw-r--r--drivers/hv/hv_kvp.c4
-rw-r--r--drivers/hv/hv_proc.c288
-rw-r--r--drivers/hv/hv_snapshot.c4
-rw-r--r--drivers/hv/hv_util.c2
-rw-r--r--drivers/hv/hv_utils_transport.c12
-rw-r--r--drivers/hv/hyperv_vmbus.h85
-rw-r--r--drivers/hv/mshv.h28
-rw-r--r--drivers/hv/mshv_common.c241
-rw-r--r--drivers/hv/mshv_debugfs.c724
-rw-r--r--drivers/hv/mshv_debugfs_counters.c490
-rw-r--r--drivers/hv/mshv_eventfd.c853
-rw-r--r--drivers/hv/mshv_eventfd.h70
-rw-r--r--drivers/hv/mshv_irq.c132
-rw-r--r--drivers/hv/mshv_portid_table.c83
-rw-r--r--drivers/hv/mshv_regions.c590
-rw-r--r--drivers/hv/mshv_root.h381
-rw-r--r--drivers/hv/mshv_root_hv_call.c1074
-rw-r--r--drivers/hv/mshv_root_main.c2417
-rw-r--r--drivers/hv/mshv_synic.c820
-rw-r--r--drivers/hv/mshv_trace.c9
-rw-r--r--drivers/hv/mshv_trace.h544
-rw-r--r--drivers/hv/mshv_vtl.h25
-rw-r--r--drivers/hv/mshv_vtl_main.c1399
-rw-r--r--drivers/hv/ring_buffer.c10
-rw-r--r--drivers/hv/vmbus_drv.c603
33 files changed, 11501 insertions, 510 deletions
diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig
index 862c47b191af..2d0b3fcb0ff8 100644
--- a/drivers/hv/Kconfig
+++ b/drivers/hv/Kconfig
@@ -3,19 +3,21 @@
menu "Microsoft Hyper-V guest support"
config HYPERV
- tristate "Microsoft Hyper-V client drivers"
+ bool "Microsoft Hyper-V core hypervisor support"
depends on (X86 && X86_LOCAL_APIC && HYPERVISOR_GUEST) \
- || (ACPI && ARM64 && !CPU_BIG_ENDIAN)
+ || (ARM64 && !CPU_BIG_ENDIAN)
select PARAVIRT
select X86_HV_CALLBACK_VECTOR if X86
select OF_EARLY_FLATTREE if OF
+ select IRQ_MSI_LIB if X86
help
Select this option to run Linux as a Hyper-V client operating
system.
config HYPERV_VTL_MODE
bool "Enable Linux to boot in VTL context"
- depends on X86_64 && HYPERV
+ depends on (X86_64 && HAVE_STATIC_CALL) || ARM64
+ depends on HYPERV
depends on SMP
default n
help
@@ -31,7 +33,7 @@ config HYPERV_VTL_MODE
Select this option to build a Linux kernel to run at a VTL other than
the normal VTL0, which currently is only VTL2. This option
- initializes the x86 platform for VTL2, and adds the ability to boot
+ initializes the kernel to run in VTL2, and adds the ability to boot
secondary CPUs directly into 64-bit context as required for VTLs other
than 0. A kernel built with this option must run at VTL2, and will
not run as a normal guest.
@@ -43,16 +45,68 @@ config HYPERV_TIMER
config HYPERV_UTILS
tristate "Microsoft Hyper-V Utilities driver"
- depends on HYPERV && CONNECTOR && NLS
+ depends on HYPERV_VMBUS && CONNECTOR && NLS
depends on PTP_1588_CLOCK_OPTIONAL
help
Select this option to enable the Hyper-V Utilities.
config HYPERV_BALLOON
tristate "Microsoft Hyper-V Balloon driver"
- depends on HYPERV
+ depends on HYPERV_VMBUS
select PAGE_REPORTING
help
Select this option to enable Hyper-V Balloon driver.
+config HYPERV_VMBUS
+ tristate "Microsoft Hyper-V VMBus driver"
+ depends on HYPERV
+ default HYPERV
+ select SYSFB if EFI && !HYPERV_VTL_MODE
+ help
+ Select this option to enable Hyper-V Vmbus driver.
+
+config MSHV_ROOT
+ tristate "Microsoft Hyper-V root partition support"
+ depends on HYPERV && (X86_64 || ARM64)
+ depends on !HYPERV_VTL_MODE
+ # The hypervisor interface operates on 4k pages. Enforcing it here
+ # simplifies many assumptions in the root partition code.
+ # e.g. When withdrawing memory, the hypervisor gives back 4k pages in
+ # no particular order, making it impossible to reassemble larger pages
+ depends on PAGE_SIZE_4KB
+ select EVENTFD
+ select VIRT_XFER_TO_GUEST_WORK
+ select HMM_MIRROR
+ select MMU_NOTIFIER
+ default n
+ help
+ Select this option to enable support for booting and running as root
+ partition on Microsoft Hyper-V.
+
+ If unsure, say N.
+
+config MSHV_VTL
+ tristate "Microsoft Hyper-V VTL driver"
+ depends on X86_64 && HYPERV_VTL_MODE
+ depends on HYPERV_VMBUS
+ # Mapping VTL0 memory to a userspace process in VTL2 is supported in OpenHCL.
+ # VTL2 for OpenHCL makes use of Huge Pages to improve performance on VMs,
+ # specially with large memory requirements.
+ depends on TRANSPARENT_HUGEPAGE
+ # MTRRs are controlled by VTL0, and are not specific to individual VTLs.
+ # Therefore, do not attempt to access or modify MTRRs here.
+ depends on !MTRR
+ select CPUMASK_OFFSTACK
+ select VIRT_XFER_TO_GUEST_WORK
+ default n
+ help
+ Select this option to enable Hyper-V VTL driver support.
+ This driver provides interfaces for Virtual Machine Manager (VMM) running in VTL2
+ userspace to create VTLs and partitions, setup and manage VTL0 memory and
+ allow userspace to make direct hypercalls. This also allows to map VTL0's address
+ space to a usermode process in VTL2 and supports getting new VMBus messages and channel
+ events in VTL2.
+
+ If unsure, say N.
+
endmenu
diff --git a/drivers/hv/Makefile b/drivers/hv/Makefile
index b992c0ed182b..888a748cc7cb 100644
--- a/drivers/hv/Makefile
+++ b/drivers/hv/Makefile
@@ -1,7 +1,9 @@
# SPDX-License-Identifier: GPL-2.0
-obj-$(CONFIG_HYPERV) += hv_vmbus.o
+obj-$(CONFIG_HYPERV_VMBUS) += hv_vmbus.o
obj-$(CONFIG_HYPERV_UTILS) += hv_utils.o
obj-$(CONFIG_HYPERV_BALLOON) += hv_balloon.o
+obj-$(CONFIG_MSHV_ROOT) += mshv_root.o
+obj-$(CONFIG_MSHV_VTL) += mshv_vtl.o
CFLAGS_hv_trace.o = -I$(src)
CFLAGS_hv_balloon.o = -I$(src)
@@ -11,6 +13,15 @@ hv_vmbus-y := vmbus_drv.o \
channel_mgmt.o ring_buffer.o hv_trace.o
hv_vmbus-$(CONFIG_HYPERV_TESTING) += hv_debugfs.o
hv_utils-y := hv_util.o hv_kvp.o hv_snapshot.o hv_utils_transport.o
+mshv_root-y := mshv_root_main.o mshv_synic.o mshv_eventfd.o mshv_irq.o \
+ mshv_root_hv_call.o mshv_portid_table.o mshv_regions.o
+mshv_root-$(CONFIG_DEBUG_FS) += mshv_debugfs.o
+mshv_root-$(CONFIG_TRACEPOINTS) += mshv_trace.o
+mshv_vtl-y := mshv_vtl_main.o
# Code that must be built-in
-obj-$(subst m,y,$(CONFIG_HYPERV)) += hv_common.o
+obj-$(CONFIG_HYPERV) += hv_common.o
+obj-$(subst m,y,$(CONFIG_MSHV_ROOT)) += hv_proc.o
+ifneq ($(CONFIG_MSHV_ROOT)$(CONFIG_MSHV_VTL),)
+ obj-y += mshv_common.o
+endif
diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c
index fb8cd8469328..6821f225248b 100644
--- a/drivers/hv/channel.c
+++ b/drivers/hv/channel.c
@@ -18,6 +18,7 @@
#include <linux/uio.h>
#include <linux/interrupt.h>
#include <linux/set_memory.h>
+#include <linux/export.h>
#include <asm/page.h>
#include <asm/mshyperv.h>
@@ -409,6 +410,21 @@ static int create_gpadl_header(enum hv_gpadl_type type, void *kbuffer,
return 0;
}
+static void vmbus_free_channel_msginfo(struct vmbus_channel_msginfo *msginfo)
+{
+ struct vmbus_channel_msginfo *submsginfo, *tmp;
+
+ if (!msginfo)
+ return;
+
+ list_for_each_entry_safe(submsginfo, tmp, &msginfo->submsglist,
+ msglistentry) {
+ kfree(submsginfo);
+ }
+
+ kfree(msginfo);
+}
+
/*
* __vmbus_establish_gpadl - Establish a GPADL for a buffer or ringbuffer
*
@@ -428,7 +444,7 @@ static int __vmbus_establish_gpadl(struct vmbus_channel *channel,
struct vmbus_channel_gpadl_header *gpadlmsg;
struct vmbus_channel_gpadl_body *gpadl_body;
struct vmbus_channel_msginfo *msginfo = NULL;
- struct vmbus_channel_msginfo *submsginfo, *tmp;
+ struct vmbus_channel_msginfo *submsginfo;
struct list_head *curr;
u32 next_gpadl_handle;
unsigned long flags;
@@ -443,20 +459,24 @@ static int __vmbus_establish_gpadl(struct vmbus_channel *channel,
return ret;
}
- /*
- * Set the "decrypted" flag to true for the set_memory_decrypted()
- * success case. In the failure case, the encryption state of the
- * memory is unknown. Leave "decrypted" as true to ensure the
- * memory will be leaked instead of going back on the free list.
- */
- gpadl->decrypted = true;
- ret = set_memory_decrypted((unsigned long)kbuffer,
- PFN_UP(size));
- if (ret) {
- dev_warn(&channel->device_obj->device,
- "Failed to set host visibility for new GPADL %d.\n",
- ret);
- return ret;
+ gpadl->decrypted = !((channel->co_external_memory && type == HV_GPADL_BUFFER) ||
+ (channel->co_ring_buffer && type == HV_GPADL_RING));
+ if (gpadl->decrypted) {
+ /*
+ * The "decrypted" flag being true assumes that set_memory_decrypted() succeeds.
+ * But if it fails, the encryption state of the memory is unknown. In that case,
+ * leave "decrypted" as true to ensure the memory is leaked instead of going back
+ * on the free list.
+ */
+ ret = set_memory_decrypted((unsigned long)kbuffer,
+ PFN_UP(size));
+ if (ret) {
+ dev_warn(&channel->device_obj->device,
+ "Failed to set host visibility for new GPADL %d.\n",
+ ret);
+ vmbus_free_channel_msginfo(msginfo);
+ return ret;
+ }
}
init_completion(&msginfo->waitevent);
@@ -531,12 +551,8 @@ cleanup:
spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags);
list_del(&msginfo->msglistentry);
spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
- list_for_each_entry_safe(submsginfo, tmp, &msginfo->submsglist,
- msglistentry) {
- kfree(submsginfo);
- }
- kfree(msginfo);
+ vmbus_free_channel_msginfo(msginfo);
if (ret) {
/*
@@ -544,8 +560,10 @@ cleanup:
* left as true so the memory is leaked instead of being
* put back on the free list.
*/
- if (!set_memory_encrypted((unsigned long)kbuffer, PFN_UP(size)))
- gpadl->decrypted = false;
+ if (gpadl->decrypted) {
+ if (!set_memory_encrypted((unsigned long)kbuffer, PFN_UP(size)))
+ gpadl->decrypted = false;
+ }
}
return ret;
@@ -572,7 +590,7 @@ EXPORT_SYMBOL_GPL(vmbus_establish_gpadl);
* keeps track of the next available slot in the array. Initially, each
* slot points to the next one (as in a Linked List). The last slot
* does not point to anything, so its value is U64_MAX by default.
- * @size The size of the array
+ * @size: The size of the array
*/
static u64 *request_arr_init(u32 size)
{
@@ -676,12 +694,13 @@ static int __vmbus_open(struct vmbus_channel *newchannel,
goto error_clean_ring;
err = hv_ringbuffer_init(&newchannel->outbound,
- page, send_pages, 0);
+ page, send_pages, 0, newchannel->co_ring_buffer);
if (err)
goto error_free_gpadl;
err = hv_ringbuffer_init(&newchannel->inbound, &page[send_pages],
- recv_pages, newchannel->max_pkt_size);
+ recv_pages, newchannel->max_pkt_size,
+ newchannel->co_ring_buffer);
if (err)
goto error_free_gpadl;
@@ -862,8 +881,11 @@ post_msg_err:
kfree(info);
- ret = set_memory_encrypted((unsigned long)gpadl->buffer,
- PFN_UP(gpadl->size));
+ if (gpadl->decrypted)
+ ret = set_memory_encrypted((unsigned long)gpadl->buffer,
+ PFN_UP(gpadl->size));
+ else
+ ret = 0;
if (ret)
pr_warn("Fail to set mem host visibility in GPADL teardown %d.\n", ret);
@@ -924,7 +946,7 @@ static int vmbus_close_internal(struct vmbus_channel *channel)
/* Send a closing message */
- msg = &channel->close_msg.msg;
+ msg = &channel->close_msg;
msg->header.msgtype = CHANNELMSG_CLOSECHANNEL;
msg->child_relid = channel->offermsg.child_relid;
@@ -1077,68 +1099,10 @@ int vmbus_sendpacket(struct vmbus_channel *channel, void *buffer,
EXPORT_SYMBOL(vmbus_sendpacket);
/*
- * vmbus_sendpacket_pagebuffer - Send a range of single-page buffer
- * packets using a GPADL Direct packet type. This interface allows you
- * to control notifying the host. This will be useful for sending
- * batched data. Also the sender can control the send flags
- * explicitly.
- */
-int vmbus_sendpacket_pagebuffer(struct vmbus_channel *channel,
- struct hv_page_buffer pagebuffers[],
- u32 pagecount, void *buffer, u32 bufferlen,
- u64 requestid)
-{
- int i;
- struct vmbus_channel_packet_page_buffer desc;
- u32 descsize;
- u32 packetlen;
- u32 packetlen_aligned;
- struct kvec bufferlist[3];
- u64 aligned_data = 0;
-
- if (pagecount > MAX_PAGE_BUFFER_COUNT)
- return -EINVAL;
-
- /*
- * Adjust the size down since vmbus_channel_packet_page_buffer is the
- * largest size we support
- */
- descsize = sizeof(struct vmbus_channel_packet_page_buffer) -
- ((MAX_PAGE_BUFFER_COUNT - pagecount) *
- sizeof(struct hv_page_buffer));
- packetlen = descsize + bufferlen;
- packetlen_aligned = ALIGN(packetlen, sizeof(u64));
-
- /* Setup the descriptor */
- desc.type = VM_PKT_DATA_USING_GPA_DIRECT;
- desc.flags = VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED;
- desc.dataoffset8 = descsize >> 3; /* in 8-bytes granularity */
- desc.length8 = (u16)(packetlen_aligned >> 3);
- desc.transactionid = VMBUS_RQST_ERROR; /* will be updated in hv_ringbuffer_write() */
- desc.reserved = 0;
- desc.rangecount = pagecount;
-
- for (i = 0; i < pagecount; i++) {
- desc.range[i].len = pagebuffers[i].len;
- desc.range[i].offset = pagebuffers[i].offset;
- desc.range[i].pfn = pagebuffers[i].pfn;
- }
-
- bufferlist[0].iov_base = &desc;
- bufferlist[0].iov_len = descsize;
- bufferlist[1].iov_base = buffer;
- bufferlist[1].iov_len = bufferlen;
- bufferlist[2].iov_base = &aligned_data;
- bufferlist[2].iov_len = (packetlen_aligned - packetlen);
-
- return hv_ringbuffer_write(channel, bufferlist, 3, requestid, NULL);
-}
-EXPORT_SYMBOL_GPL(vmbus_sendpacket_pagebuffer);
-
-/*
- * vmbus_sendpacket_multipagebuffer - Send a multi-page buffer packet
+ * vmbus_sendpacket_mpb_desc - Send one or more multi-page buffer packets
* using a GPADL Direct packet type.
- * The buffer includes the vmbus descriptor.
+ * The desc argument must include space for the VMBus descriptor. The
+ * rangecount field must already be set.
*/
int vmbus_sendpacket_mpb_desc(struct vmbus_channel *channel,
struct vmbus_packet_mpb_array *desc,
@@ -1160,7 +1124,6 @@ int vmbus_sendpacket_mpb_desc(struct vmbus_channel *channel,
desc->length8 = (u16)(packetlen_aligned >> 3);
desc->transactionid = VMBUS_RQST_ERROR; /* will be updated in hv_ringbuffer_write() */
desc->reserved = 0;
- desc->rangecount = 1;
bufferlist[0].iov_base = desc;
bufferlist[0].iov_len = desc_size;
diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index 6e084c207414..84eb0a6a0b54 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -20,6 +20,7 @@
#include <linux/delay.h>
#include <linux/cpu.h>
#include <linux/hyperv.h>
+#include <linux/export.h>
#include <asm/mshyperv.h>
#include <linux/sched/isolation.h>
@@ -353,7 +354,7 @@ static struct vmbus_channel *alloc_channel(void)
{
struct vmbus_channel *channel;
- channel = kzalloc(sizeof(*channel), GFP_ATOMIC);
+ channel = kzalloc_obj(*channel, GFP_ATOMIC);
if (!channel)
return NULL;
@@ -383,8 +384,18 @@ static void free_channel(struct vmbus_channel *channel)
void vmbus_channel_map_relid(struct vmbus_channel *channel)
{
- if (WARN_ON(channel->offermsg.child_relid >= MAX_CHANNEL_RELIDS))
+ u32 new_relid = channel->offermsg.child_relid;
+
+ if (WARN_ON(new_relid >= MAX_CHANNEL_RELIDS))
return;
+
+ /*
+ * This function is always called in the tasklet for the connect CPU.
+ * So updating the relid hiwater mark does not need to be atomic.
+ */
+ if (new_relid > READ_ONCE(vmbus_connection.relid_hiwater))
+ WRITE_ONCE(vmbus_connection.relid_hiwater, new_relid);
+
/*
* The mapping of the channel's relid is visible from the CPUs that
* execute vmbus_chan_sched() by the time that vmbus_chan_sched() will
@@ -410,9 +421,7 @@ void vmbus_channel_map_relid(struct vmbus_channel *channel)
* of the VMBus driver and vmbus_chan_sched() can not run before
* vmbus_bus_resume() has completed execution (cf. resume_noirq).
*/
- virt_store_mb(
- vmbus_connection.channels[channel->offermsg.child_relid],
- channel);
+ virt_store_mb(vmbus_connection.channels[new_relid], channel);
}
void vmbus_channel_unmap_relid(struct vmbus_channel *channel)
@@ -843,14 +852,14 @@ static void vmbus_wait_for_unload(void)
= per_cpu_ptr(hv_context.cpu_context, cpu);
/*
- * In a CoCo VM the synic_message_page is not allocated
+ * In a CoCo VM the hyp_synic_message_page is not allocated
* in hv_synic_alloc(). Instead it is set/cleared in
- * hv_synic_enable_regs() and hv_synic_disable_regs()
+ * hv_hyp_synic_enable_regs() and hv_hyp_synic_disable_regs()
* such that it is set only when the CPU is online. If
* not all present CPUs are online, the message page
* might be NULL, so skip such CPUs.
*/
- page_addr = hv_cpu->synic_message_page;
+ page_addr = hv_cpu->hyp_synic_message_page;
if (!page_addr)
continue;
@@ -891,7 +900,7 @@ completed:
struct hv_per_cpu_context *hv_cpu
= per_cpu_ptr(hv_context.cpu_context, cpu);
- page_addr = hv_cpu->synic_message_page;
+ page_addr = hv_cpu->hyp_synic_message_page;
if (!page_addr)
continue;
@@ -1021,6 +1030,7 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr)
struct vmbus_channel_offer_channel *offer;
struct vmbus_channel *oldchannel, *newchannel;
size_t offer_sz;
+ bool co_ring_buffer, co_external_memory;
offer = (struct vmbus_channel_offer_channel *)hdr;
@@ -1033,6 +1043,22 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr)
return;
}
+ co_ring_buffer = is_co_ring_buffer(offer);
+ co_external_memory = is_co_external_memory(offer);
+ if (!co_ring_buffer && co_external_memory) {
+ pr_err("Invalid offer relid=%d: the ring buffer isn't encrypted\n",
+ offer->child_relid);
+ return;
+ }
+ if (co_ring_buffer || co_external_memory) {
+ if (vmbus_proto_version < VERSION_WIN10_V6_0 || !vmbus_is_confidential()) {
+ pr_err("Invalid offer relid=%d: no support for confidential VMBus\n",
+ offer->child_relid);
+ atomic_dec(&vmbus_connection.offer_in_progress);
+ return;
+ }
+ }
+
oldchannel = find_primary_channel_by_offer(offer);
if (oldchannel != NULL) {
@@ -1111,6 +1137,8 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr)
pr_err("Unable to allocate channel object\n");
return;
}
+ newchannel->co_ring_buffer = co_ring_buffer;
+ newchannel->co_external_memory = co_external_memory;
vmbus_setup_channel_state(newchannel, offer);
diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c
index 8351360bba16..b5b322ce16df 100644
--- a/drivers/hv/connection.c
+++ b/drivers/hv/connection.c
@@ -51,6 +51,7 @@ EXPORT_SYMBOL_GPL(vmbus_proto_version);
* Linux guests and are not listed.
*/
static __u32 vmbus_versions[] = {
+ VERSION_WIN10_V6_0,
VERSION_WIN10_V5_3,
VERSION_WIN10_V5_2,
VERSION_WIN10_V5_1,
@@ -65,7 +66,7 @@ static __u32 vmbus_versions[] = {
* Maximal VMBus protocol version guests can negotiate. Useful to cap the
* VMBus version for testing and debugging purpose.
*/
-static uint max_version = VERSION_WIN10_V5_3;
+static uint max_version = VERSION_WIN10_V6_0;
module_param(max_version, uint, S_IRUGO);
MODULE_PARM_DESC(max_version,
@@ -105,6 +106,9 @@ int vmbus_negotiate_version(struct vmbus_channel_msginfo *msginfo, u32 version)
vmbus_connection.msg_conn_id = VMBUS_MESSAGE_CONNECTION_ID;
}
+ if (vmbus_is_confidential() && version >= VERSION_WIN10_V6_0)
+ msg->feature_flags = VMBUS_FEATURE_FLAG_CONFIDENTIAL_CHANNELS;
+
/*
* shared_gpa_boundary is zero in non-SNP VMs, so it's safe to always
* bitwise OR it
@@ -207,10 +211,19 @@ int vmbus_connect(void)
mutex_init(&vmbus_connection.channel_mutex);
/*
+ * The following Hyper-V interrupt and monitor pages can be used by
+ * UIO for mapping to user-space, so they should always be allocated on
+ * system page boundaries. The system page size must be >= the Hyper-V
+ * page size.
+ */
+ BUILD_BUG_ON(PAGE_SIZE < HV_HYP_PAGE_SIZE);
+
+ /*
* Setup the vmbus event connection for channel interrupt
* abstraction stuff
*/
- vmbus_connection.int_page = hv_alloc_hyperv_zeroed_page();
+ vmbus_connection.int_page =
+ (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
if (vmbus_connection.int_page == NULL) {
ret = -ENOMEM;
goto cleanup;
@@ -225,8 +238,8 @@ int vmbus_connect(void)
* Setup the monitor notification facility. The 1st page for
* parent->child and the 2nd page for child->parent
*/
- vmbus_connection.monitor_pages[0] = hv_alloc_hyperv_page();
- vmbus_connection.monitor_pages[1] = hv_alloc_hyperv_page();
+ vmbus_connection.monitor_pages[0] = (void *)__get_free_page(GFP_KERNEL);
+ vmbus_connection.monitor_pages[1] = (void *)__get_free_page(GFP_KERNEL);
if ((vmbus_connection.monitor_pages[0] == NULL) ||
(vmbus_connection.monitor_pages[1] == NULL)) {
ret = -ENOMEM;
@@ -300,9 +313,8 @@ int vmbus_connect(void)
pr_info("Vmbus version:%d.%d\n",
version >> 16, version & 0xFFFF);
- vmbus_connection.channels = kcalloc(MAX_CHANNEL_RELIDS,
- sizeof(struct vmbus_channel *),
- GFP_KERNEL);
+ vmbus_connection.channels = kzalloc_objs(struct vmbus_channel *,
+ MAX_CHANNEL_RELIDS);
if (vmbus_connection.channels == NULL) {
ret = -ENOMEM;
goto cleanup;
@@ -342,21 +354,23 @@ void vmbus_disconnect(void)
destroy_workqueue(vmbus_connection.work_queue);
if (vmbus_connection.int_page) {
- hv_free_hyperv_page(vmbus_connection.int_page);
+ free_page((unsigned long)vmbus_connection.int_page);
vmbus_connection.int_page = NULL;
}
if (vmbus_connection.monitor_pages[0]) {
if (!set_memory_encrypted(
(unsigned long)vmbus_connection.monitor_pages[0], 1))
- hv_free_hyperv_page(vmbus_connection.monitor_pages[0]);
+ free_page((unsigned long)
+ vmbus_connection.monitor_pages[0]);
vmbus_connection.monitor_pages[0] = NULL;
}
if (vmbus_connection.monitor_pages[1]) {
if (!set_memory_encrypted(
(unsigned long)vmbus_connection.monitor_pages[1], 1))
- hv_free_hyperv_page(vmbus_connection.monitor_pages[1]);
+ free_page((unsigned long)
+ vmbus_connection.monitor_pages[1]);
vmbus_connection.monitor_pages[1] = NULL;
}
}
@@ -508,7 +522,10 @@ void vmbus_set_event(struct vmbus_channel *channel)
else
WARN_ON_ONCE(1);
} else {
- hv_do_fast_hypercall8(HVCALL_SIGNAL_EVENT, channel->sig_event);
+ u64 control = HVCALL_SIGNAL_EVENT;
+
+ control |= hv_nested ? HV_HYPERCALL_NESTED : 0;
+ hv_do_fast_hypercall8(control, channel->sig_event);
}
}
EXPORT_SYMBOL_GPL(vmbus_set_event);
diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c
index 36d9ba097ff5..ae60fd542292 100644
--- a/drivers/hv/hv.c
+++ b/drivers/hv/hv.c
@@ -18,6 +18,7 @@
#include <linux/clockchips.h>
#include <linux/delay.h>
#include <linux/interrupt.h>
+#include <linux/export.h>
#include <clocksource/hyperv_timer.h>
#include <asm/mshyperv.h>
#include <linux/set_memory.h>
@@ -25,6 +26,7 @@
/* The one and only */
struct hv_context hv_context;
+EXPORT_SYMBOL_FOR_MODULES(hv_context, "mshv_vtl");
/*
* hv_init - Main initialization routine.
@@ -74,7 +76,11 @@ int hv_post_message(union hv_connection_id connection_id,
aligned_msg->payload_size = payload_size;
memcpy((void *)aligned_msg->payload, payload, payload_size);
- if (ms_hyperv.paravisor_present) {
+ if (ms_hyperv.paravisor_present && !vmbus_is_confidential()) {
+ /*
+ * If the VMBus isn't confidential, use the CoCo-specific
+ * mechanism to communicate with the hypervisor.
+ */
if (hv_isolation_type_tdx())
status = hv_tdx_hypercall(HVCALL_POST_MESSAGE,
virt_to_phys(aligned_msg), 0);
@@ -85,19 +91,87 @@ int hv_post_message(union hv_connection_id connection_id,
else
status = HV_STATUS_INVALID_PARAMETER;
} else {
- status = hv_do_hypercall(HVCALL_POST_MESSAGE,
- aligned_msg, NULL);
+ u64 control = HVCALL_POST_MESSAGE;
+
+ control |= hv_nested ? HV_HYPERCALL_NESTED : 0;
+ /*
+ * If there is no paravisor, this will go to the hypervisor.
+ * In the Confidential VMBus case, there is the paravisor
+ * to which this will trap.
+ */
+ status = hv_do_hypercall(control, aligned_msg, NULL);
}
local_irq_restore(flags);
return hv_result(status);
}
+EXPORT_SYMBOL_FOR_MODULES(hv_post_message, "mshv_vtl");
+
+static int hv_alloc_page(void **page, bool decrypt, const char *note)
+{
+ int ret = 0;
+
+ /*
+ * After the page changes its encryption status, its contents might
+ * appear scrambled on some hardware. Thus `get_zeroed_page` would
+ * zero the page out in vain, so do that explicitly exactly once.
+ *
+ * By default, the page is allocated encrypted in a CoCo VM.
+ */
+ *page = (void *)__get_free_page(GFP_KERNEL);
+ if (!*page)
+ return -ENOMEM;
+
+ if (decrypt)
+ ret = set_memory_decrypted((unsigned long)*page, 1);
+ if (ret)
+ goto failed;
+
+ memset(*page, 0, PAGE_SIZE);
+ return 0;
+
+failed:
+ /*
+ * Report the failure but don't put the page back on the free list as
+ * its encryption status is unknown.
+ */
+ pr_err("allocation failed for %s page, error %d, decrypted %d\n",
+ note, ret, decrypt);
+ *page = NULL;
+ return ret;
+}
+
+static int hv_free_page(void **page, bool encrypt, const char *note)
+{
+ int ret = 0;
+
+ if (!*page)
+ return 0;
+
+ if (encrypt)
+ ret = set_memory_encrypted((unsigned long)*page, 1);
+
+ /*
+ * In the case of the failure, the page is leaked. Something is wrong,
+ * prefer to lose the page with the unknown encryption status and stay afloat.
+ */
+ if (ret)
+ pr_err("deallocation failed for %s page, error %d, encrypt %d\n",
+ note, ret, encrypt);
+ else
+ free_page((unsigned long)*page);
+
+ *page = NULL;
+
+ return ret;
+}
int hv_synic_alloc(void)
{
int cpu, ret = -ENOMEM;
struct hv_per_cpu_context *hv_cpu;
+ const bool decrypt = !vmbus_is_confidential();
/*
* First, zero all per-cpu memory areas so hv_synic_free() can
@@ -109,8 +183,7 @@ int hv_synic_alloc(void)
memset(hv_cpu, 0, sizeof(*hv_cpu));
}
- hv_context.hv_numa_map = kcalloc(nr_node_ids, sizeof(struct cpumask),
- GFP_KERNEL);
+ hv_context.hv_numa_map = kzalloc_objs(struct cpumask, nr_node_ids);
if (!hv_context.hv_numa_map) {
pr_err("Unable to allocate NUMA map\n");
goto err;
@@ -123,73 +196,37 @@ int hv_synic_alloc(void)
vmbus_on_msg_dpc, (unsigned long)hv_cpu);
if (ms_hyperv.paravisor_present && hv_isolation_type_tdx()) {
- hv_cpu->post_msg_page = (void *)get_zeroed_page(GFP_ATOMIC);
- if (!hv_cpu->post_msg_page) {
- pr_err("Unable to allocate post msg page\n");
- goto err;
- }
-
- ret = set_memory_decrypted((unsigned long)hv_cpu->post_msg_page, 1);
- if (ret) {
- pr_err("Failed to decrypt post msg page: %d\n", ret);
- /* Just leak the page, as it's unsafe to free the page. */
- hv_cpu->post_msg_page = NULL;
+ ret = hv_alloc_page(&hv_cpu->post_msg_page,
+ decrypt, "post msg");
+ if (ret)
goto err;
- }
-
- memset(hv_cpu->post_msg_page, 0, PAGE_SIZE);
}
/*
- * Synic message and event pages are allocated by paravisor.
- * Skip these pages allocation here.
+ * If these SynIC pages are not allocated, SIEF and SIM pages
+ * are configured using what the root partition or the paravisor
+ * provides upon reading the SIEFP and SIMP registers.
*/
- if (!ms_hyperv.paravisor_present && !hv_root_partition) {
- hv_cpu->synic_message_page =
- (void *)get_zeroed_page(GFP_ATOMIC);
- if (!hv_cpu->synic_message_page) {
- pr_err("Unable to allocate SYNIC message page\n");
+ if (!ms_hyperv.paravisor_present && !hv_root_partition()) {
+ ret = hv_alloc_page(&hv_cpu->hyp_synic_message_page,
+ decrypt, "hypervisor SynIC msg");
+ if (ret)
goto err;
- }
-
- hv_cpu->synic_event_page =
- (void *)get_zeroed_page(GFP_ATOMIC);
- if (!hv_cpu->synic_event_page) {
- pr_err("Unable to allocate SYNIC event page\n");
-
- free_page((unsigned long)hv_cpu->synic_message_page);
- hv_cpu->synic_message_page = NULL;
+ ret = hv_alloc_page(&hv_cpu->hyp_synic_event_page,
+ decrypt, "hypervisor SynIC event");
+ if (ret)
goto err;
- }
}
- if (!ms_hyperv.paravisor_present &&
- (hv_isolation_type_snp() || hv_isolation_type_tdx())) {
- ret = set_memory_decrypted((unsigned long)
- hv_cpu->synic_message_page, 1);
- if (ret) {
- pr_err("Failed to decrypt SYNIC msg page: %d\n", ret);
- hv_cpu->synic_message_page = NULL;
-
- /*
- * Free the event page here so that hv_synic_free()
- * won't later try to re-encrypt it.
- */
- free_page((unsigned long)hv_cpu->synic_event_page);
- hv_cpu->synic_event_page = NULL;
+ if (vmbus_is_confidential()) {
+ ret = hv_alloc_page(&hv_cpu->para_synic_message_page,
+ false, "paravisor SynIC msg");
+ if (ret)
goto err;
- }
-
- ret = set_memory_decrypted((unsigned long)
- hv_cpu->synic_event_page, 1);
- if (ret) {
- pr_err("Failed to decrypt SYNIC event page: %d\n", ret);
- hv_cpu->synic_event_page = NULL;
+ ret = hv_alloc_page(&hv_cpu->para_synic_event_page,
+ false, "paravisor SynIC event");
+ if (ret)
goto err;
- }
-
- memset(hv_cpu->synic_message_page, 0, PAGE_SIZE);
- memset(hv_cpu->synic_event_page, 0, PAGE_SIZE);
}
}
@@ -205,106 +242,83 @@ err:
void hv_synic_free(void)
{
- int cpu, ret;
+ int cpu;
+ const bool encrypt = !vmbus_is_confidential();
for_each_present_cpu(cpu) {
struct hv_per_cpu_context *hv_cpu =
per_cpu_ptr(hv_context.cpu_context, cpu);
- /* It's better to leak the page if the encryption fails. */
- if (ms_hyperv.paravisor_present && hv_isolation_type_tdx()) {
- if (hv_cpu->post_msg_page) {
- ret = set_memory_encrypted((unsigned long)
- hv_cpu->post_msg_page, 1);
- if (ret) {
- pr_err("Failed to encrypt post msg page: %d\n", ret);
- hv_cpu->post_msg_page = NULL;
- }
- }
+ if (ms_hyperv.paravisor_present && hv_isolation_type_tdx())
+ hv_free_page(&hv_cpu->post_msg_page,
+ encrypt, "post msg");
+ if (!ms_hyperv.paravisor_present && !hv_root_partition()) {
+ hv_free_page(&hv_cpu->hyp_synic_event_page,
+ encrypt, "hypervisor SynIC event");
+ hv_free_page(&hv_cpu->hyp_synic_message_page,
+ encrypt, "hypervisor SynIC msg");
}
-
- if (!ms_hyperv.paravisor_present &&
- (hv_isolation_type_snp() || hv_isolation_type_tdx())) {
- if (hv_cpu->synic_message_page) {
- ret = set_memory_encrypted((unsigned long)
- hv_cpu->synic_message_page, 1);
- if (ret) {
- pr_err("Failed to encrypt SYNIC msg page: %d\n", ret);
- hv_cpu->synic_message_page = NULL;
- }
- }
-
- if (hv_cpu->synic_event_page) {
- ret = set_memory_encrypted((unsigned long)
- hv_cpu->synic_event_page, 1);
- if (ret) {
- pr_err("Failed to encrypt SYNIC event page: %d\n", ret);
- hv_cpu->synic_event_page = NULL;
- }
- }
+ if (vmbus_is_confidential()) {
+ hv_free_page(&hv_cpu->para_synic_event_page,
+ false, "paravisor SynIC event");
+ hv_free_page(&hv_cpu->para_synic_message_page,
+ false, "paravisor SynIC msg");
}
-
- free_page((unsigned long)hv_cpu->post_msg_page);
- free_page((unsigned long)hv_cpu->synic_event_page);
- free_page((unsigned long)hv_cpu->synic_message_page);
}
kfree(hv_context.hv_numa_map);
}
/*
- * hv_synic_init - Initialize the Synthetic Interrupt Controller.
- *
- * If it is already initialized by another entity (ie x2v shim), we need to
- * retrieve the initialized message and event pages. Otherwise, we create and
- * initialize the message and event pages.
+ * hv_hyp_synic_enable_regs - Initialize the Synthetic Interrupt Controller
+ * with the hypervisor.
*/
-void hv_synic_enable_regs(unsigned int cpu)
+void hv_hyp_synic_enable_regs(unsigned int cpu)
{
struct hv_per_cpu_context *hv_cpu =
per_cpu_ptr(hv_context.cpu_context, cpu);
union hv_synic_simp simp;
union hv_synic_siefp siefp;
union hv_synic_sint shared_sint;
- union hv_synic_scontrol sctrl;
- /* Setup the Synic's message page */
+ /* Setup the Synic's message page with the hypervisor. */
simp.as_uint64 = hv_get_msr(HV_MSR_SIMP);
simp.simp_enabled = 1;
- if (ms_hyperv.paravisor_present || hv_root_partition) {
- /* Mask out vTOM bit. ioremap_cache() maps decrypted */
+ if (ms_hyperv.paravisor_present || hv_root_partition()) {
+ /* Mask out vTOM bit and map as decrypted */
u64 base = (simp.base_simp_gpa << HV_HYP_PAGE_SHIFT) &
~ms_hyperv.shared_gpa_boundary;
- hv_cpu->synic_message_page =
- (void *)ioremap_cache(base, HV_HYP_PAGE_SIZE);
- if (!hv_cpu->synic_message_page)
+ hv_cpu->hyp_synic_message_page =
+ memremap(base, HV_HYP_PAGE_SIZE, MEMREMAP_WB | MEMREMAP_DEC);
+ if (!hv_cpu->hyp_synic_message_page)
pr_err("Fail to map synic message page.\n");
} else {
- simp.base_simp_gpa = virt_to_phys(hv_cpu->synic_message_page)
+ simp.base_simp_gpa = virt_to_phys(hv_cpu->hyp_synic_message_page)
>> HV_HYP_PAGE_SHIFT;
}
hv_set_msr(HV_MSR_SIMP, simp.as_uint64);
- /* Setup the Synic's event page */
+ /* Setup the Synic's event page with the hypervisor. */
siefp.as_uint64 = hv_get_msr(HV_MSR_SIEFP);
siefp.siefp_enabled = 1;
- if (ms_hyperv.paravisor_present || hv_root_partition) {
- /* Mask out vTOM bit. ioremap_cache() maps decrypted */
+ if (ms_hyperv.paravisor_present || hv_root_partition()) {
+ /* Mask out vTOM bit and map as decrypted */
u64 base = (siefp.base_siefp_gpa << HV_HYP_PAGE_SHIFT) &
~ms_hyperv.shared_gpa_boundary;
- hv_cpu->synic_event_page =
- (void *)ioremap_cache(base, HV_HYP_PAGE_SIZE);
- if (!hv_cpu->synic_event_page)
+ hv_cpu->hyp_synic_event_page =
+ memremap(base, HV_HYP_PAGE_SIZE, MEMREMAP_WB | MEMREMAP_DEC);
+ if (!hv_cpu->hyp_synic_event_page)
pr_err("Fail to map synic event page.\n");
} else {
- siefp.base_siefp_gpa = virt_to_phys(hv_cpu->synic_event_page)
+ siefp.base_siefp_gpa = virt_to_phys(hv_cpu->hyp_synic_event_page)
>> HV_HYP_PAGE_SHIFT;
}
hv_set_msr(HV_MSR_SIEFP, siefp.as_uint64);
+ hv_enable_coco_interrupt(cpu, vmbus_interrupt, true);
/* Setup the shared SINT. */
if (vmbus_irq != -1)
@@ -313,18 +327,13 @@ void hv_synic_enable_regs(unsigned int cpu)
shared_sint.vector = vmbus_interrupt;
shared_sint.masked = false;
-
- /*
- * On architectures where Hyper-V doesn't support AEOI (e.g., ARM64),
- * it doesn't provide a recommendation flag and AEOI must be disabled.
- */
-#ifdef HV_DEPRECATING_AEOI_RECOMMENDED
- shared_sint.auto_eoi =
- !(ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED);
-#else
- shared_sint.auto_eoi = 0;
-#endif
+ shared_sint.auto_eoi = hv_recommend_using_aeoi();
hv_set_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT, shared_sint.as_uint64);
+}
+
+static void hv_hyp_synic_enable_interrupts(void)
+{
+ union hv_synic_scontrol sctrl;
/* Enable the global synic bit */
sctrl.as_uint64 = hv_get_msr(HV_MSR_SCONTROL);
@@ -333,23 +342,72 @@ void hv_synic_enable_regs(unsigned int cpu)
hv_set_msr(HV_MSR_SCONTROL, sctrl.as_uint64);
}
+static void hv_para_synic_enable_regs(unsigned int cpu)
+{
+ union hv_synic_simp simp;
+ union hv_synic_siefp siefp;
+ struct hv_per_cpu_context *hv_cpu
+ = per_cpu_ptr(hv_context.cpu_context, cpu);
+
+ /* Setup the Synic's message page with the paravisor. */
+ simp.as_uint64 = hv_para_get_synic_register(HV_MSR_SIMP);
+ simp.simp_enabled = 1;
+ simp.base_simp_gpa = virt_to_phys(hv_cpu->para_synic_message_page)
+ >> HV_HYP_PAGE_SHIFT;
+ hv_para_set_synic_register(HV_MSR_SIMP, simp.as_uint64);
+
+ /* Setup the Synic's event page with the paravisor. */
+ siefp.as_uint64 = hv_para_get_synic_register(HV_MSR_SIEFP);
+ siefp.siefp_enabled = 1;
+ siefp.base_siefp_gpa = virt_to_phys(hv_cpu->para_synic_event_page)
+ >> HV_HYP_PAGE_SHIFT;
+ hv_para_set_synic_register(HV_MSR_SIEFP, siefp.as_uint64);
+}
+
+static void hv_para_synic_enable_interrupts(void)
+{
+ union hv_synic_scontrol sctrl;
+
+ /* Enable the global synic bit */
+ sctrl.as_uint64 = hv_para_get_synic_register(HV_MSR_SCONTROL);
+ sctrl.enable = 1;
+ hv_para_set_synic_register(HV_MSR_SCONTROL, sctrl.as_uint64);
+}
+
int hv_synic_init(unsigned int cpu)
{
- hv_synic_enable_regs(cpu);
+ if (vmbus_is_confidential())
+ hv_para_synic_enable_regs(cpu);
+
+ /*
+ * The SINT is set in hv_hyp_synic_enable_regs() by calling
+ * hv_set_msr(). hv_set_msr() in turn has special case code for the
+ * SINT MSRs that write to the hypervisor version of the MSR *and*
+ * the paravisor version of the MSR (but *without* the proxy bit when
+ * VMBus is confidential).
+ *
+ * Then enable interrupts via the paravisor if VMBus is confidential,
+ * and otherwise via the hypervisor.
+ */
+
+ hv_hyp_synic_enable_regs(cpu);
+ if (vmbus_is_confidential())
+ hv_para_synic_enable_interrupts();
+ else
+ hv_hyp_synic_enable_interrupts();
hv_stimer_legacy_init(cpu, VMBUS_MESSAGE_SINT);
return 0;
}
-void hv_synic_disable_regs(unsigned int cpu)
+void hv_hyp_synic_disable_regs(unsigned int cpu)
{
struct hv_per_cpu_context *hv_cpu =
per_cpu_ptr(hv_context.cpu_context, cpu);
union hv_synic_sint shared_sint;
union hv_synic_simp simp;
union hv_synic_siefp siefp;
- union hv_synic_scontrol sctrl;
shared_sint.as_uint64 = hv_get_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT);
@@ -358,18 +416,21 @@ void hv_synic_disable_regs(unsigned int cpu)
/* Need to correctly cleanup in the case of SMP!!! */
/* Disable the interrupt */
hv_set_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT, shared_sint.as_uint64);
+ hv_enable_coco_interrupt(cpu, vmbus_interrupt, false);
simp.as_uint64 = hv_get_msr(HV_MSR_SIMP);
/*
- * In Isolation VM, sim and sief pages are allocated by
+ * In Isolation VM, simp and sief pages are allocated by
* paravisor. These pages also will be used by kdump
* kernel. So just reset enable bit here and keep page
* addresses.
*/
simp.simp_enabled = 0;
- if (ms_hyperv.paravisor_present || hv_root_partition) {
- iounmap(hv_cpu->synic_message_page);
- hv_cpu->synic_message_page = NULL;
+ if (ms_hyperv.paravisor_present || hv_root_partition()) {
+ if (hv_cpu->hyp_synic_message_page) {
+ memunmap(hv_cpu->hyp_synic_message_page);
+ hv_cpu->hyp_synic_message_page = NULL;
+ }
} else {
simp.base_simp_gpa = 0;
}
@@ -379,22 +440,52 @@ void hv_synic_disable_regs(unsigned int cpu)
siefp.as_uint64 = hv_get_msr(HV_MSR_SIEFP);
siefp.siefp_enabled = 0;
- if (ms_hyperv.paravisor_present || hv_root_partition) {
- iounmap(hv_cpu->synic_event_page);
- hv_cpu->synic_event_page = NULL;
+ if (ms_hyperv.paravisor_present || hv_root_partition()) {
+ if (hv_cpu->hyp_synic_event_page) {
+ memunmap(hv_cpu->hyp_synic_event_page);
+ hv_cpu->hyp_synic_event_page = NULL;
+ }
} else {
siefp.base_siefp_gpa = 0;
}
hv_set_msr(HV_MSR_SIEFP, siefp.as_uint64);
+}
+
+static void hv_hyp_synic_disable_interrupts(void)
+{
+ union hv_synic_scontrol sctrl;
/* Disable the global synic bit */
sctrl.as_uint64 = hv_get_msr(HV_MSR_SCONTROL);
sctrl.enable = 0;
hv_set_msr(HV_MSR_SCONTROL, sctrl.as_uint64);
+}
- if (vmbus_irq != -1)
- disable_percpu_irq(vmbus_irq);
+static void hv_para_synic_disable_regs(unsigned int cpu)
+{
+ union hv_synic_simp simp;
+ union hv_synic_siefp siefp;
+
+ /* Disable SynIC's message page in the paravisor. */
+ simp.as_uint64 = hv_para_get_synic_register(HV_MSR_SIMP);
+ simp.simp_enabled = 0;
+ hv_para_set_synic_register(HV_MSR_SIMP, simp.as_uint64);
+
+ /* Disable SynIC's event page in the paravisor. */
+ siefp.as_uint64 = hv_para_get_synic_register(HV_MSR_SIEFP);
+ siefp.siefp_enabled = 0;
+ hv_para_set_synic_register(HV_MSR_SIEFP, siefp.as_uint64);
+}
+
+static void hv_para_synic_disable_interrupts(void)
+{
+ union hv_synic_scontrol sctrl;
+
+ /* Disable the global synic bit */
+ sctrl.as_uint64 = hv_para_get_synic_register(HV_MSR_SCONTROL);
+ sctrl.enable = 0;
+ hv_para_set_synic_register(HV_MSR_SCONTROL, sctrl.as_uint64);
}
#define HV_MAX_TRIES 3
@@ -407,16 +498,18 @@ void hv_synic_disable_regs(unsigned int cpu)
* that the normal interrupt handling mechanism will find and process the channel interrupt
* "very soon", and in the process clear the bit.
*/
-static bool hv_synic_event_pending(void)
+static bool __hv_synic_event_pending(union hv_synic_event_flags *event, int sint)
{
- struct hv_per_cpu_context *hv_cpu = this_cpu_ptr(hv_context.cpu_context);
- union hv_synic_event_flags *event =
- (union hv_synic_event_flags *)hv_cpu->synic_event_page + VMBUS_MESSAGE_SINT;
- unsigned long *recv_int_page = event->flags; /* assumes VMBus version >= VERSION_WIN8 */
+ unsigned long *recv_int_page;
bool pending;
u32 relid;
int tries = 0;
+ if (!event)
+ return false;
+
+ event += sint;
+ recv_int_page = event->flags; /* assumes VMBus version >= VERSION_WIN8 */
retry:
pending = false;
for_each_set_bit(relid, recv_int_page, HV_EVENT_FLAGS_COUNT) {
@@ -433,13 +526,58 @@ retry:
return pending;
}
+static bool hv_synic_event_pending(void)
+{
+ struct hv_per_cpu_context *hv_cpu = this_cpu_ptr(hv_context.cpu_context);
+ union hv_synic_event_flags *hyp_synic_event_page = hv_cpu->hyp_synic_event_page;
+ union hv_synic_event_flags *para_synic_event_page = hv_cpu->para_synic_event_page;
+
+ return
+ __hv_synic_event_pending(hyp_synic_event_page, VMBUS_MESSAGE_SINT) ||
+ __hv_synic_event_pending(para_synic_event_page, VMBUS_MESSAGE_SINT);
+}
+
+static int hv_pick_new_cpu(struct vmbus_channel *channel)
+{
+ int ret = -EBUSY;
+ int start;
+ int cpu;
+
+ lockdep_assert_cpus_held();
+ lockdep_assert_held(&vmbus_connection.channel_mutex);
+
+ /*
+ * We can't assume that the relevant interrupts will be sent before
+ * the cpu is offlined on older versions of hyperv.
+ */
+ if (vmbus_proto_version < VERSION_WIN10_V5_3)
+ return -EBUSY;
+
+ start = get_random_u32_below(nr_cpu_ids);
+
+ for_each_cpu_wrap(cpu, cpu_online_mask, start) {
+ if (channel->target_cpu == cpu ||
+ channel->target_cpu == VMBUS_CONNECT_CPU)
+ continue;
+
+ ret = vmbus_channel_set_cpu(channel, cpu);
+ if (!ret)
+ break;
+ }
+
+ if (ret)
+ ret = vmbus_channel_set_cpu(channel, VMBUS_CONNECT_CPU);
+
+ return ret;
+}
+
/*
* hv_synic_cleanup - Cleanup routine for hv_synic_init().
*/
int hv_synic_cleanup(unsigned int cpu)
{
struct vmbus_channel *channel, *sc;
- bool channel_found = false;
+ int ret = 0;
if (vmbus_connection.conn_state != CONNECTED)
goto always_cleanup;
@@ -456,38 +594,34 @@ int hv_synic_cleanup(unsigned int cpu)
/*
* Search for channels which are bound to the CPU we're about to
- * cleanup. In case we find one and vmbus is still connected, we
- * fail; this will effectively prevent CPU offlining.
- *
- * TODO: Re-bind the channels to different CPUs.
+ * cleanup.
*/
mutex_lock(&vmbus_connection.channel_mutex);
list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
if (channel->target_cpu == cpu) {
- channel_found = true;
- break;
+ ret = hv_pick_new_cpu(channel);
+ if (ret) {
+ mutex_unlock(&vmbus_connection.channel_mutex);
+ return ret;
+ }
}
list_for_each_entry(sc, &channel->sc_list, sc_list) {
if (sc->target_cpu == cpu) {
- channel_found = true;
- break;
+ ret = hv_pick_new_cpu(sc);
+ if (ret) {
+ mutex_unlock(&vmbus_connection.channel_mutex);
+ return ret;
+ }
}
}
- if (channel_found)
- break;
}
mutex_unlock(&vmbus_connection.channel_mutex);
- if (channel_found)
- return -EBUSY;
-
/*
- * channel_found == false means that any channels that were previously
- * assigned to the CPU have been reassigned elsewhere with a call of
- * vmbus_send_modifychannel(). Scan the event flags page looking for
- * bits that are set and waiting with a timeout for vmbus_chan_sched()
- * to process such bits. If bits are still set after this operation
- * and VMBus is connected, fail the CPU offlining operation.
+ * Scan the event flags page looking for bits that are set and waiting
+ * with a timeout for vmbus_chan_sched() to process such bits. If bits
+ * are still set after this operation and VMBus is connected, fail the
+ * CPU offlining operation.
*/
if (vmbus_proto_version >= VERSION_WIN10_V4_1 && hv_synic_event_pending())
return -EBUSY;
@@ -495,7 +629,27 @@ int hv_synic_cleanup(unsigned int cpu)
always_cleanup:
hv_stimer_legacy_cleanup(cpu);
- hv_synic_disable_regs(cpu);
+ /*
+ * First, disable the event and message pages
+ * used for communicating with the host, and then
+ * disable the host interrupts if VMBus is not
+ * confidential.
+ */
+ hv_hyp_synic_disable_regs(cpu);
+ if (!vmbus_is_confidential())
+ hv_hyp_synic_disable_interrupts();
- return 0;
+ /*
+ * Perform the same steps for the Confidential VMBus.
+ * The sequencing provides the guarantee that no data
+ * may be posted for processing before disabling interrupts.
+ */
+ if (vmbus_is_confidential()) {
+ hv_para_synic_disable_regs(cpu);
+ hv_para_synic_disable_interrupts();
+ }
+ if (vmbus_irq != -1)
+ disable_percpu_irq(vmbus_irq);
+
+ return ret;
}
diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c
index fec2f18679e3..9a55f5c43307 100644
--- a/drivers/hv/hv_balloon.c
+++ b/drivers/hv/hv_balloon.c
@@ -801,7 +801,7 @@ static int pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt)
* is, create a gap and update covered_end_pfn.
*/
if (has->covered_end_pfn != start_pfn) {
- gap = kzalloc(sizeof(struct hv_hotadd_gap), GFP_ATOMIC);
+ gap = kzalloc_obj(struct hv_hotadd_gap, GFP_ATOMIC);
if (!gap) {
ret = -ENOMEM;
break;
@@ -1192,6 +1192,7 @@ static void free_balloon_pages(struct hv_dynmem_device *dm,
__ClearPageOffline(pg);
__free_page(pg);
dm->num_pages_ballooned--;
+ mod_node_page_state(page_pgdat(pg), NR_BALLOON_PAGES, -1);
adjust_managed_page_count(pg, 1);
}
}
@@ -1221,6 +1222,7 @@ static unsigned int alloc_balloon_pages(struct hv_dynmem_device *dm,
return i * alloc_unit;
dm->num_pages_ballooned += alloc_unit;
+ mod_node_page_state(page_pgdat(pg), NR_BALLOON_PAGES, alloc_unit);
/*
* If we allocatted 2M pages; split them so we
@@ -1661,7 +1663,7 @@ static void enable_page_reporting(void)
* We let the page_reporting_order parameter decide the order
* in the page_reporting code
*/
- dm_device.pr_dev_info.order = 0;
+ dm_device.pr_dev_info.order = PAGE_REPORTING_ORDER_UNSPECIFIED;
ret = page_reporting_register(&dm_device.pr_dev_info);
if (ret < 0) {
dm_device.pr_dev_info.report = NULL;
diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c
index f2e6f55d6ca6..6b67ac616789 100644
--- a/drivers/hv/hv_common.c
+++ b/drivers/hv/hv_common.c
@@ -31,8 +31,14 @@
#include <hyperv/hvhdk.h>
#include <asm/mshyperv.h>
+u64 hv_current_partition_id = HV_PARTITION_ID_SELF;
+EXPORT_SYMBOL_GPL(hv_current_partition_id);
+
+enum hv_partition_type hv_curr_partition_type;
+EXPORT_SYMBOL_GPL(hv_curr_partition_type);
+
/*
- * hv_root_partition, ms_hyperv and hv_nested are defined here with other
+ * ms_hyperv and hv_nested are defined here with other
* Hyper-V specific globals so they are shared across all architectures and are
* built only when CONFIG_HYPERV is defined. But on x86,
* ms_hyperv_init_platform() is built even when CONFIG_HYPERV is not
@@ -40,9 +46,6 @@
* here, allowing for an overriding definition in the module containing
* ms_hyperv_init_platform().
*/
-bool __weak hv_root_partition;
-EXPORT_SYMBOL_GPL(hv_root_partition);
-
bool __weak hv_nested;
EXPORT_SYMBOL_GPL(hv_nested);
@@ -66,6 +69,16 @@ static void hv_kmsg_dump_unregister(void);
static struct ctl_table_header *hv_ctl_table_hdr;
/*
+ * Per-cpu array holding the tail pointer for the SynIC event ring buffer
+ * for each SINT.
+ *
+ * We cannot maintain this in mshv driver because the tail pointer should
+ * persist even if the mshv driver is unloaded.
+ */
+u8 * __percpu *hv_synic_eventring_tail;
+EXPORT_SYMBOL_GPL(hv_synic_eventring_tail);
+
+/*
* Hyper-V specific initialization and shutdown code that is
* common across all architectures. Called from architecture
* specific initialization functions.
@@ -87,46 +100,10 @@ void __init hv_common_free(void)
free_percpu(hyperv_pcpu_input_arg);
hyperv_pcpu_input_arg = NULL;
-}
-
-/*
- * Functions for allocating and freeing memory with size and
- * alignment HV_HYP_PAGE_SIZE. These functions are needed because
- * the guest page size may not be the same as the Hyper-V page
- * size. We depend upon kmalloc() aligning power-of-two size
- * allocations to the allocation size boundary, so that the
- * allocated memory appears to Hyper-V as a page of the size
- * it expects.
- */
-void *hv_alloc_hyperv_page(void)
-{
- BUILD_BUG_ON(PAGE_SIZE < HV_HYP_PAGE_SIZE);
-
- if (PAGE_SIZE == HV_HYP_PAGE_SIZE)
- return (void *)__get_free_page(GFP_KERNEL);
- else
- return kmalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL);
-}
-EXPORT_SYMBOL_GPL(hv_alloc_hyperv_page);
-
-void *hv_alloc_hyperv_zeroed_page(void)
-{
- if (PAGE_SIZE == HV_HYP_PAGE_SIZE)
- return (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
- else
- return kzalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL);
+ free_percpu(hv_synic_eventring_tail);
+ hv_synic_eventring_tail = NULL;
}
-EXPORT_SYMBOL_GPL(hv_alloc_hyperv_zeroed_page);
-
-void hv_free_hyperv_page(void *addr)
-{
- if (PAGE_SIZE == HV_HYP_PAGE_SIZE)
- free_page((unsigned long)addr);
- else
- kfree(addr);
-}
-EXPORT_SYMBOL_GPL(hv_free_hyperv_page);
static void *hv_panic_page;
@@ -218,13 +195,15 @@ static void hv_kmsg_dump(struct kmsg_dumper *dumper,
/*
* Write dump contents to the page. No need to synchronize; panic should
- * be single-threaded.
+ * be single-threaded. Ignore failures from kmsg_dump_get_buffer() since
+ * panic notification should be done even if there is no message data.
+ * Don't assume bytes_written is set in case of failure, so initialize it.
*/
kmsg_dump_rewind(&iter);
- kmsg_dump_get_buffer(&iter, false, hv_panic_page, HV_HYP_PAGE_SIZE,
+ bytes_written = 0;
+ (void)kmsg_dump_get_buffer(&iter, false, hv_panic_page, HV_HYP_PAGE_SIZE,
&bytes_written);
- if (!bytes_written)
- return;
+
/*
* P3 to contain the physical address of the panic page & P4 to
* contain the size of the panic data in that page. Rest of the
@@ -233,7 +212,7 @@ static void hv_kmsg_dump(struct kmsg_dumper *dumper,
hv_set_msr(HV_MSR_CRASH_P0, 0);
hv_set_msr(HV_MSR_CRASH_P1, 0);
hv_set_msr(HV_MSR_CRASH_P2, 0);
- hv_set_msr(HV_MSR_CRASH_P3, virt_to_phys(hv_panic_page));
+ hv_set_msr(HV_MSR_CRASH_P3, bytes_written ? virt_to_phys(hv_panic_page) : 0);
hv_set_msr(HV_MSR_CRASH_P4, bytes_written);
/*
@@ -256,7 +235,7 @@ static void hv_kmsg_dump_unregister(void)
atomic_notifier_chain_unregister(&panic_notifier_list,
&hyperv_panic_report_block);
- hv_free_hyperv_page(hv_panic_page);
+ kfree(hv_panic_page);
hv_panic_page = NULL;
}
@@ -264,7 +243,7 @@ static void hv_kmsg_dump_register(void)
{
int ret;
- hv_panic_page = hv_alloc_hyperv_zeroed_page();
+ hv_panic_page = kzalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL);
if (!hv_panic_page) {
pr_err("Hyper-V: panic message page memory allocation failed\n");
return;
@@ -273,24 +252,74 @@ static void hv_kmsg_dump_register(void)
ret = kmsg_dump_register(&hv_kmsg_dumper);
if (ret) {
pr_err("Hyper-V: kmsg dump register error 0x%x\n", ret);
- hv_free_hyperv_page(hv_panic_page);
+ kfree(hv_panic_page);
hv_panic_page = NULL;
}
}
static inline bool hv_output_page_exists(void)
{
- return hv_root_partition || IS_ENABLED(CONFIG_HYPERV_VTL_MODE);
+ return hv_parent_partition() || IS_ENABLED(CONFIG_HYPERV_VTL_MODE);
}
+void __init hv_get_partition_id(void)
+{
+ struct hv_output_get_partition_id *output;
+ unsigned long flags;
+ u64 status, pt_id;
+
+ local_irq_save(flags);
+ output = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ status = hv_do_hypercall(HVCALL_GET_PARTITION_ID, NULL, output);
+ pt_id = output->partition_id;
+ local_irq_restore(flags);
+
+ if (hv_result_success(status))
+ hv_current_partition_id = pt_id;
+ else
+ pr_err("Hyper-V: failed to get partition ID: %#x\n",
+ hv_result(status));
+}
+#if IS_ENABLED(CONFIG_HYPERV_VTL_MODE)
+u8 __init get_vtl(void)
+{
+ u64 control = HV_HYPERCALL_REP_COMP_1 | HVCALL_GET_VP_REGISTERS;
+ struct hv_input_get_vp_registers *input;
+ struct hv_output_get_vp_registers *output;
+ unsigned long flags;
+ u64 ret;
+
+ local_irq_save(flags);
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ output = *this_cpu_ptr(hyperv_pcpu_output_arg);
+
+ memset(input, 0, struct_size(input, names, 1));
+ input->partition_id = HV_PARTITION_ID_SELF;
+ input->vp_index = HV_VP_INDEX_SELF;
+ input->input_vtl.as_uint8 = 0;
+ input->names[0] = HV_REGISTER_VSM_VP_STATUS;
+
+ ret = hv_do_hypercall(control, input, output);
+ if (hv_result_success(ret)) {
+ ret = output->values[0].reg8 & HV_VTL_MASK;
+ } else {
+ pr_err("Failed to get VTL(error: %lld) exiting...\n", ret);
+ BUG();
+ }
+
+ local_irq_restore(flags);
+ return ret;
+}
+#endif
+
int __init hv_common_init(void)
{
int i;
union hv_hypervisor_version_info version;
- /* Get information about the Hyper-V host version */
+ /* Get information about the Microsoft Hypervisor version */
if (!hv_get_hypervisor_version(&version))
- pr_info("Hyper-V: Host Build %d.%d.%d.%d-%d-%d\n",
+ pr_info("Hyper-V: Hypervisor Build %d.%d.%d.%d-%d-%d\n",
version.major_version, version.minor_version,
version.build_number, version.service_number,
version.service_pack, version.service_branch);
@@ -350,6 +379,11 @@ int __init hv_common_init(void)
BUG_ON(!hyperv_pcpu_output_arg);
}
+ if (hv_parent_partition()) {
+ hv_synic_eventring_tail = alloc_percpu(u8 *);
+ BUG_ON(!hv_synic_eventring_tail);
+ }
+
hv_vp_index = kmalloc_array(nr_cpu_ids, sizeof(*hv_vp_index),
GFP_KERNEL);
if (!hv_vp_index) {
@@ -438,11 +472,12 @@ error:
int hv_common_cpu_init(unsigned int cpu)
{
void **inputarg, **outputarg;
+ u8 **synic_eventring_tail;
u64 msr_vp_index;
gfp_t flags;
const int pgcount = hv_output_page_exists() ? 2 : 1;
void *mem;
- int ret;
+ int ret = 0;
/* hv_cpu_init() can be called with IRQs disabled from hv_resume() */
flags = irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL;
@@ -450,11 +485,11 @@ int hv_common_cpu_init(unsigned int cpu)
inputarg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg);
/*
- * hyperv_pcpu_input_arg and hyperv_pcpu_output_arg memory is already
- * allocated if this CPU was previously online and then taken offline
+ * The per-cpu memory is already allocated if this CPU was previously
+ * online and then taken offline
*/
if (!*inputarg) {
- mem = kmalloc(pgcount * HV_HYP_PAGE_SIZE, flags);
+ mem = kmalloc_array(pgcount, HV_HYP_PAGE_SIZE, flags);
if (!mem)
return -ENOMEM;
@@ -498,11 +533,21 @@ int hv_common_cpu_init(unsigned int cpu)
if (msr_vp_index > hv_max_vp_index)
hv_max_vp_index = msr_vp_index;
- return 0;
+ if (hv_parent_partition()) {
+ synic_eventring_tail = (u8 **)this_cpu_ptr(hv_synic_eventring_tail);
+ *synic_eventring_tail = kcalloc(HV_SYNIC_SINT_COUNT,
+ sizeof(u8), flags);
+ /* No need to unwind any of the above on failure here */
+ if (unlikely(!*synic_eventring_tail))
+ ret = -ENOMEM;
+ }
+
+ return ret;
}
int hv_common_cpu_die(unsigned int cpu)
{
+ u8 **synic_eventring_tail;
/*
* The hyperv_pcpu_input_arg and hyperv_pcpu_output_arg memory
* is not freed when the CPU goes offline as the hyperv_pcpu_input_arg
@@ -515,6 +560,12 @@ int hv_common_cpu_die(unsigned int cpu)
* originally allocated memory is reused in hv_common_cpu_init().
*/
+ if (hv_parent_partition()) {
+ synic_eventring_tail = this_cpu_ptr(hv_synic_eventring_tail);
+ kfree(*synic_eventring_tail);
+ *synic_eventring_tail = NULL;
+ }
+
return 0;
}
@@ -572,7 +623,7 @@ EXPORT_SYMBOL_GPL(hv_setup_dma_ops);
bool hv_is_hibernation_supported(void)
{
- return !hv_root_partition && acpi_sleep_state_supported(ACPI_STATE_S4);
+ return !hv_root_partition() && acpi_sleep_state_supported(ACPI_STATE_S4);
}
EXPORT_SYMBOL_GPL(hv_is_hibernation_supported);
@@ -625,6 +676,11 @@ void __weak hv_remove_vmbus_handler(void)
}
EXPORT_SYMBOL_GPL(hv_remove_vmbus_handler);
+void __weak hv_setup_mshv_handler(void (*handler)(void))
+{
+}
+EXPORT_SYMBOL_GPL(hv_setup_mshv_handler);
+
void __weak hv_setup_kexec_handler(void (*handler)(void))
{
}
@@ -661,3 +717,149 @@ u64 __weak hv_tdx_hypercall(u64 control, u64 param1, u64 param2)
return HV_STATUS_INVALID_PARAMETER;
}
EXPORT_SYMBOL_GPL(hv_tdx_hypercall);
+
+void __weak hv_enable_coco_interrupt(unsigned int cpu, unsigned int vector, bool set)
+{
+}
+EXPORT_SYMBOL_GPL(hv_enable_coco_interrupt);
+
+void __weak hv_para_set_sint_proxy(bool enable)
+{
+}
+EXPORT_SYMBOL_GPL(hv_para_set_sint_proxy);
+
+u64 __weak hv_para_get_synic_register(unsigned int reg)
+{
+ return ~0ULL;
+}
+EXPORT_SYMBOL_GPL(hv_para_get_synic_register);
+
+void __weak hv_para_set_synic_register(unsigned int reg, u64 val)
+{
+}
+EXPORT_SYMBOL_GPL(hv_para_set_synic_register);
+
+void hv_identify_partition_type(void)
+{
+ /* Assume guest role */
+ hv_curr_partition_type = HV_PARTITION_TYPE_GUEST;
+ /*
+ * Check partition creation and cpu management privileges
+ *
+ * Hyper-V should never specify running as root and as a Confidential
+ * VM. But to protect against a compromised/malicious Hyper-V trying
+ * to exploit root behavior to expose Confidential VM memory, ignore
+ * the root partition setting if also a Confidential VM.
+ */
+ if ((ms_hyperv.priv_high & HV_CREATE_PARTITIONS) &&
+ !(ms_hyperv.priv_high & HV_ISOLATION)) {
+
+ if (!IS_ENABLED(CONFIG_MSHV_ROOT)) {
+ pr_crit("Hyper-V: CONFIG_MSHV_ROOT not enabled!\n");
+ } else if (ms_hyperv.priv_high & HV_CPU_MANAGEMENT) {
+ pr_info("Hyper-V: running as root partition\n");
+ hv_curr_partition_type = HV_PARTITION_TYPE_ROOT;
+ } else {
+ pr_info("Hyper-V: running as L1VH partition\n");
+ hv_curr_partition_type = HV_PARTITION_TYPE_L1VH;
+ }
+ }
+}
+
+struct hv_status_info {
+ char *string;
+ int errno;
+ u16 code;
+};
+
+/*
+ * Note on the errno mappings:
+ * A failed hypercall is usually only recoverable (or loggable) near
+ * the call site where the HV_STATUS_* code is known. So the errno
+ * it gets converted to is not too useful further up the stack.
+ * Provide a few mappings that could be useful, and revert to -EIO
+ * as a fallback.
+ */
+static const struct hv_status_info hv_status_infos[] = {
+#define _STATUS_INFO(status, errno) { #status, (errno), (status) }
+ _STATUS_INFO(HV_STATUS_SUCCESS, 0),
+ _STATUS_INFO(HV_STATUS_INVALID_HYPERCALL_CODE, -EINVAL),
+ _STATUS_INFO(HV_STATUS_INVALID_HYPERCALL_INPUT, -EINVAL),
+ _STATUS_INFO(HV_STATUS_INVALID_ALIGNMENT, -EIO),
+ _STATUS_INFO(HV_STATUS_INVALID_PARAMETER, -EINVAL),
+ _STATUS_INFO(HV_STATUS_ACCESS_DENIED, -EIO),
+ _STATUS_INFO(HV_STATUS_INVALID_PARTITION_STATE, -EIO),
+ _STATUS_INFO(HV_STATUS_OPERATION_DENIED, -EIO),
+ _STATUS_INFO(HV_STATUS_UNKNOWN_PROPERTY, -EIO),
+ _STATUS_INFO(HV_STATUS_PROPERTY_VALUE_OUT_OF_RANGE, -EIO),
+ _STATUS_INFO(HV_STATUS_INSUFFICIENT_MEMORY, -ENOMEM),
+ _STATUS_INFO(HV_STATUS_INSUFFICIENT_CONTIGUOUS_MEMORY, -ENOMEM),
+ _STATUS_INFO(HV_STATUS_INSUFFICIENT_ROOT_MEMORY, -ENOMEM),
+ _STATUS_INFO(HV_STATUS_INSUFFICIENT_CONTIGUOUS_ROOT_MEMORY, -ENOMEM),
+ _STATUS_INFO(HV_STATUS_INVALID_PARTITION_ID, -EINVAL),
+ _STATUS_INFO(HV_STATUS_INVALID_VP_INDEX, -EINVAL),
+ _STATUS_INFO(HV_STATUS_NOT_FOUND, -EIO),
+ _STATUS_INFO(HV_STATUS_INVALID_PORT_ID, -EINVAL),
+ _STATUS_INFO(HV_STATUS_INVALID_CONNECTION_ID, -EINVAL),
+ _STATUS_INFO(HV_STATUS_INSUFFICIENT_BUFFERS, -EIO),
+ _STATUS_INFO(HV_STATUS_NOT_ACKNOWLEDGED, -EIO),
+ _STATUS_INFO(HV_STATUS_INVALID_VP_STATE, -EIO),
+ _STATUS_INFO(HV_STATUS_NO_RESOURCES, -EIO),
+ _STATUS_INFO(HV_STATUS_PROCESSOR_FEATURE_NOT_SUPPORTED, -EIO),
+ _STATUS_INFO(HV_STATUS_INVALID_LP_INDEX, -EINVAL),
+ _STATUS_INFO(HV_STATUS_INVALID_REGISTER_VALUE, -EINVAL),
+ _STATUS_INFO(HV_STATUS_INVALID_LP_INDEX, -EIO),
+ _STATUS_INFO(HV_STATUS_INVALID_REGISTER_VALUE, -EIO),
+ _STATUS_INFO(HV_STATUS_OPERATION_FAILED, -EIO),
+ _STATUS_INFO(HV_STATUS_TIME_OUT, -EIO),
+ _STATUS_INFO(HV_STATUS_CALL_PENDING, -EIO),
+ _STATUS_INFO(HV_STATUS_VTL_ALREADY_ENABLED, -EIO),
+#undef _STATUS_INFO
+};
+
+static inline const struct hv_status_info *find_hv_status_info(u64 hv_status)
+{
+ int i;
+ u16 code = hv_result(hv_status);
+
+ for (i = 0; i < ARRAY_SIZE(hv_status_infos); ++i) {
+ const struct hv_status_info *info = &hv_status_infos[i];
+
+ if (info->code == code)
+ return info;
+ }
+
+ return NULL;
+}
+
+/* Convert a hypercall result into a linux-friendly error code. */
+int hv_result_to_errno(u64 status)
+{
+ const struct hv_status_info *info;
+
+ /* hv_do_hypercall() may return U64_MAX, hypercalls aren't possible */
+ if (unlikely(status == U64_MAX))
+ return -EOPNOTSUPP;
+
+ info = find_hv_status_info(status);
+ if (info)
+ return info->errno;
+
+ return -EIO;
+}
+EXPORT_SYMBOL_GPL(hv_result_to_errno);
+
+const char *hv_result_to_string(u64 status)
+{
+ const struct hv_status_info *info;
+
+ if (unlikely(status == U64_MAX))
+ return "Hypercall page missing!";
+
+ info = find_hv_status_info(status);
+ if (info)
+ return info->string;
+
+ return "Unknown";
+}
+EXPORT_SYMBOL_GPL(hv_result_to_string);
diff --git a/drivers/hv/hv_kvp.c b/drivers/hv/hv_kvp.c
index 62795f6cbb00..0d73daf745a7 100644
--- a/drivers/hv/hv_kvp.c
+++ b/drivers/hv/hv_kvp.c
@@ -134,7 +134,7 @@ kvp_register(int reg_value)
struct hv_kvp_msg *kvp_msg;
char *version;
- kvp_msg = kzalloc(sizeof(*kvp_msg), GFP_KERNEL);
+ kvp_msg = kzalloc_obj(*kvp_msg);
if (kvp_msg) {
version = kvp_msg->body.kvp_register.version;
@@ -385,7 +385,7 @@ kvp_send_key(struct work_struct *dummy)
if (kvp_transaction.state != HVUTIL_HOSTMSG_RECEIVED)
return;
- message = kzalloc(sizeof(*message), GFP_KERNEL);
+ message = kzalloc_obj(*message);
if (!message)
return;
diff --git a/drivers/hv/hv_proc.c b/drivers/hv/hv_proc.c
new file mode 100644
index 000000000000..57b2c64197cb
--- /dev/null
+++ b/drivers/hv/hv_proc.c
@@ -0,0 +1,288 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/types.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/clockchips.h>
+#include <linux/slab.h>
+#include <linux/cpuhotplug.h>
+#include <linux/minmax.h>
+#include <linux/export.h>
+#include <asm/mshyperv.h>
+
+/*
+ * See struct hv_deposit_memory. The first u64 is partition ID, the rest
+ * are GPAs.
+ */
+#define HV_DEPOSIT_MAX (HV_HYP_PAGE_SIZE / sizeof(u64) - 1)
+
+/* Deposits exact number of pages. Must be called with interrupts enabled. */
+int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages)
+{
+ struct page **pages, *page;
+ int *counts;
+ int num_allocations;
+ int i, j, page_count;
+ int order;
+ u64 status;
+ int ret;
+ u64 base_pfn;
+ struct hv_deposit_memory *input_page;
+ unsigned long flags;
+
+ if (num_pages > HV_DEPOSIT_MAX)
+ return -E2BIG;
+ if (!num_pages)
+ return 0;
+
+ /* One buffer for page pointers and counts */
+ page = alloc_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+ pages = page_address(page);
+
+ counts = kzalloc_objs(int, HV_DEPOSIT_MAX);
+ if (!counts) {
+ free_page((unsigned long)pages);
+ return -ENOMEM;
+ }
+
+ /* Allocate all the pages before disabling interrupts */
+ i = 0;
+
+ while (num_pages) {
+ /* Find highest order we can actually allocate */
+ order = 31 - __builtin_clz(num_pages);
+
+ while (1) {
+ pages[i] = alloc_pages_node(node, GFP_KERNEL, order);
+ if (pages[i])
+ break;
+ if (!order) {
+ ret = -ENOMEM;
+ num_allocations = i;
+ goto err_free_allocations;
+ }
+ --order;
+ }
+
+ split_page(pages[i], order);
+ counts[i] = 1 << order;
+ num_pages -= counts[i];
+ i++;
+ }
+ num_allocations = i;
+
+ local_irq_save(flags);
+
+ input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
+
+ input_page->partition_id = partition_id;
+
+ /* Populate gpa_page_list - these will fit on the input page */
+ for (i = 0, page_count = 0; i < num_allocations; ++i) {
+ base_pfn = page_to_pfn(pages[i]);
+ for (j = 0; j < counts[i]; ++j, ++page_count)
+ input_page->gpa_page_list[page_count] = base_pfn + j;
+ }
+ status = hv_do_rep_hypercall(HVCALL_DEPOSIT_MEMORY,
+ page_count, 0, input_page, NULL);
+ local_irq_restore(flags);
+ if (!hv_result_success(status)) {
+ hv_status_err(status, "\n");
+ ret = hv_result_to_errno(status);
+ goto err_free_allocations;
+ }
+
+ ret = 0;
+ goto free_buf;
+
+err_free_allocations:
+ for (i = 0; i < num_allocations; ++i) {
+ base_pfn = page_to_pfn(pages[i]);
+ for (j = 0; j < counts[i]; ++j)
+ __free_page(pfn_to_page(base_pfn + j));
+ }
+
+free_buf:
+ free_page((unsigned long)pages);
+ kfree(counts);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(hv_call_deposit_pages);
+
+int hv_deposit_memory_node(int node, u64 partition_id,
+ u64 hv_status)
+{
+ u32 num_pages = 1;
+
+ switch (hv_result(hv_status)) {
+ case HV_STATUS_INSUFFICIENT_MEMORY:
+ break;
+ case HV_STATUS_INSUFFICIENT_CONTIGUOUS_MEMORY:
+ num_pages = HV_MAX_CONTIGUOUS_ALLOCATION_PAGES;
+ break;
+
+ case HV_STATUS_INSUFFICIENT_CONTIGUOUS_ROOT_MEMORY:
+ num_pages = HV_MAX_CONTIGUOUS_ALLOCATION_PAGES;
+ fallthrough;
+ case HV_STATUS_INSUFFICIENT_ROOT_MEMORY:
+ if (!hv_root_partition()) {
+ hv_status_err(hv_status, "Unexpected root memory deposit\n");
+ return -ENOMEM;
+ }
+ partition_id = HV_PARTITION_ID_SELF;
+ break;
+
+ default:
+ hv_status_err(hv_status, "Unexpected!\n");
+ return -ENOMEM;
+ }
+ return hv_call_deposit_pages(node, partition_id, num_pages);
+}
+EXPORT_SYMBOL_GPL(hv_deposit_memory_node);
+
+bool hv_result_needs_memory(u64 status)
+{
+ switch (hv_result(status)) {
+ case HV_STATUS_INSUFFICIENT_MEMORY:
+ case HV_STATUS_INSUFFICIENT_CONTIGUOUS_MEMORY:
+ case HV_STATUS_INSUFFICIENT_ROOT_MEMORY:
+ case HV_STATUS_INSUFFICIENT_CONTIGUOUS_ROOT_MEMORY:
+ return true;
+ }
+ return false;
+}
+EXPORT_SYMBOL_GPL(hv_result_needs_memory);
+
+int hv_call_add_logical_proc(int node, u32 lp_index, u32 apic_id)
+{
+ struct hv_input_add_logical_processor *input;
+ struct hv_output_add_logical_processor *output;
+ u64 status;
+ unsigned long flags;
+ int ret = 0;
+
+ /*
+ * When adding a logical processor, the hypervisor may return
+ * HV_STATUS_INSUFFICIENT_MEMORY. When that happens, we deposit more
+ * pages and retry.
+ */
+ do {
+ local_irq_save(flags);
+
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ /* We don't do anything with the output right now */
+ output = *this_cpu_ptr(hyperv_pcpu_output_arg);
+
+ input->lp_index = lp_index;
+ input->apic_id = apic_id;
+ input->proximity_domain_info = hv_numa_node_to_pxm_info(node);
+ status = hv_do_hypercall(HVCALL_ADD_LOGICAL_PROCESSOR,
+ input, output);
+ local_irq_restore(flags);
+
+ if (!hv_result_needs_memory(status)) {
+ if (!hv_result_success(status)) {
+ hv_status_err(status, "cpu %u apic ID: %u\n",
+ lp_index, apic_id);
+ ret = hv_result_to_errno(status);
+ }
+ break;
+ }
+ ret = hv_deposit_memory_node(node, hv_current_partition_id,
+ status);
+ } while (!ret);
+
+ return ret;
+}
+
+int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags)
+{
+ struct hv_create_vp *input;
+ u64 status;
+ unsigned long irq_flags;
+ int ret = 0;
+
+ /* Root VPs don't seem to need pages deposited */
+ if (partition_id != hv_current_partition_id) {
+ /* The value 90 is empirically determined. It may change. */
+ ret = hv_call_deposit_pages(node, partition_id, 90);
+ if (ret)
+ return ret;
+ }
+
+ do {
+ local_irq_save(irq_flags);
+
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+
+ input->partition_id = partition_id;
+ input->vp_index = vp_index;
+ input->flags = flags;
+ input->subnode_type = HV_SUBNODE_ANY;
+ input->proximity_domain_info = hv_numa_node_to_pxm_info(node);
+ status = hv_do_hypercall(HVCALL_CREATE_VP, input, NULL);
+ local_irq_restore(irq_flags);
+
+ if (!hv_result_needs_memory(status)) {
+ if (!hv_result_success(status)) {
+ hv_status_err(status, "vcpu: %u, lp: %u\n",
+ vp_index, flags);
+ ret = hv_result_to_errno(status);
+ }
+ break;
+ }
+ ret = hv_deposit_memory_node(node, partition_id, status);
+
+ } while (!ret);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(hv_call_create_vp);
+
+int hv_call_notify_all_processors_started(void)
+{
+ struct hv_input_notify_partition_event *input;
+ u64 status;
+ unsigned long irq_flags;
+ int ret = 0;
+
+ local_irq_save(irq_flags);
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ memset(input, 0, sizeof(*input));
+ input->event = HV_PARTITION_ALL_LOGICAL_PROCESSORS_STARTED;
+ status = hv_do_hypercall(HVCALL_NOTIFY_PARTITION_EVENT,
+ input, NULL);
+ local_irq_restore(irq_flags);
+
+ if (!hv_result_success(status)) {
+ hv_status_err(status, "\n");
+ ret = hv_result_to_errno(status);
+ }
+ return ret;
+}
+
+bool hv_lp_exists(u32 lp_index)
+{
+ struct hv_input_get_logical_processor_run_time *input;
+ struct hv_output_get_logical_processor_run_time *output;
+ unsigned long flags;
+ u64 status;
+
+ local_irq_save(flags);
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ output = *this_cpu_ptr(hyperv_pcpu_output_arg);
+
+ input->lp_index = lp_index;
+ status = hv_do_hypercall(HVCALL_GET_LOGICAL_PROCESSOR_RUN_TIME,
+ input, output);
+ local_irq_restore(flags);
+
+ if (!hv_result_success(status) &&
+ hv_result(status) != HV_STATUS_INVALID_LP_INDEX) {
+ hv_status_err(status, "\n");
+ BUG();
+ }
+
+ return hv_result_success(status);
+}
diff --git a/drivers/hv/hv_snapshot.c b/drivers/hv/hv_snapshot.c
index 2e7f537d53cf..506871aeacf0 100644
--- a/drivers/hv/hv_snapshot.c
+++ b/drivers/hv/hv_snapshot.c
@@ -184,7 +184,7 @@ static void vss_send_op(void)
return;
}
- vss_msg = kzalloc(sizeof(*vss_msg), GFP_KERNEL);
+ vss_msg = kzalloc_obj(*vss_msg);
if (!vss_msg)
return;
@@ -424,7 +424,7 @@ int hv_vss_pre_suspend(void)
* write() will fail with EINVAL (see vss_on_msg()), and the daemon
* will reset the device by closing and re-opening it.
*/
- vss_msg = kzalloc(sizeof(*vss_msg), GFP_KERNEL);
+ vss_msg = kzalloc_obj(*vss_msg);
if (!vss_msg)
return -ENOMEM;
diff --git a/drivers/hv/hv_util.c b/drivers/hv/hv_util.c
index 36ee89c0358b..7e9c8e169c66 100644
--- a/drivers/hv/hv_util.c
+++ b/drivers/hv/hv_util.c
@@ -586,7 +586,7 @@ static int util_probe(struct hv_device *dev,
(struct hv_util_service *)dev_id->driver_data;
int ret;
- srv->recv_buffer = kmalloc(HV_HYP_PAGE_SIZE * 4, GFP_KERNEL);
+ srv->recv_buffer = kmalloc_array(4, HV_HYP_PAGE_SIZE, GFP_KERNEL);
if (!srv->recv_buffer)
return -ENOMEM;
srv->channel = dev->channel;
diff --git a/drivers/hv/hv_utils_transport.c b/drivers/hv/hv_utils_transport.c
index 832885198643..c3be5c570263 100644
--- a/drivers/hv/hv_utils_transport.c
+++ b/drivers/hv/hv_utils_transport.c
@@ -129,8 +129,7 @@ static int hvt_op_open(struct inode *inode, struct file *file)
* device gets released.
*/
hvt->mode = HVUTIL_TRANSPORT_CHARDEV;
- }
- else if (hvt->mode == HVUTIL_TRANSPORT_NETLINK) {
+ } else if (hvt->mode == HVUTIL_TRANSPORT_NETLINK) {
/*
* We're switching from netlink communication to using char
* device. Issue the reset first.
@@ -195,7 +194,7 @@ static void hvt_cn_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp)
}
spin_unlock(&hvt_list_lock);
if (!hvt_found) {
- pr_warn("hvt_cn_callback: spurious message received!\n");
+ pr_warn("%s: spurious message received!\n", __func__);
return;
}
@@ -210,7 +209,7 @@ static void hvt_cn_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp)
if (hvt->mode == HVUTIL_TRANSPORT_NETLINK)
hvt_found->on_msg(msg->data, msg->len);
else
- pr_warn("hvt_cn_callback: unexpected netlink message!\n");
+ pr_warn("%s: unexpected netlink message!\n", __func__);
mutex_unlock(&hvt->lock);
}
@@ -260,8 +259,9 @@ int hvutil_transport_send(struct hvutil_transport *hvt, void *msg, int len,
hvt->outmsg_len = len;
hvt->on_read = on_read_cb;
wake_up_interruptible(&hvt->outmsg_q);
- } else
+ } else {
ret = -ENOMEM;
+ }
out_unlock:
mutex_unlock(&hvt->lock);
return ret;
@@ -274,7 +274,7 @@ struct hvutil_transport *hvutil_transport_init(const char *name,
{
struct hvutil_transport *hvt;
- hvt = kzalloc(sizeof(*hvt), GFP_KERNEL);
+ hvt = kzalloc_obj(*hvt);
if (!hvt)
return NULL;
diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
index 29780f3a7478..05a36854389a 100644
--- a/drivers/hv/hyperv_vmbus.h
+++ b/drivers/hv/hyperv_vmbus.h
@@ -15,6 +15,7 @@
#include <linux/list.h>
#include <linux/bitops.h>
#include <asm/sync_bitops.h>
+#include <asm/mshyperv.h>
#include <linux/atomic.h>
#include <linux/hyperv.h>
#include <linux/interrupt.h>
@@ -32,6 +33,7 @@
*/
#define HV_UTIL_NEGO_TIMEOUT 55
+void vmbus_isr(void);
/* Definitions for the monitored notification facility */
union hv_monitor_trigger_group {
@@ -120,8 +122,26 @@ enum {
* Per cpu state for channel handling
*/
struct hv_per_cpu_context {
- void *synic_message_page;
- void *synic_event_page;
+ /*
+ * SynIC pages for communicating with the host.
+ *
+ * These pages are accessible to the host partition and the hypervisor.
+ * They may be used for exchanging data with the host partition and the
+ * hypervisor even when they aren't trusted yet the guest partition
+ * must be prepared to handle the malicious behavior.
+ */
+ void *hyp_synic_message_page;
+ void *hyp_synic_event_page;
+ /*
+ * SynIC pages for communicating with the paravisor.
+ *
+ * These pages may be accessed from within the guest partition only in
+ * CoCo VMs. Neither the host partition nor the hypervisor can access
+ * these pages in that case; they are used for exchanging data with the
+ * paravisor.
+ */
+ void *para_synic_message_page;
+ void *para_synic_event_page;
/*
* The page is only used in hv_post_message() for a TDX VM (with the
@@ -171,10 +191,10 @@ extern int hv_synic_alloc(void);
extern void hv_synic_free(void);
-extern void hv_synic_enable_regs(unsigned int cpu);
+extern void hv_hyp_synic_enable_regs(unsigned int cpu);
extern int hv_synic_init(unsigned int cpu);
-extern void hv_synic_disable_regs(unsigned int cpu);
+extern void hv_hyp_synic_disable_regs(unsigned int cpu);
extern int hv_synic_cleanup(unsigned int cpu);
/* Interface */
@@ -182,7 +202,8 @@ extern int hv_synic_cleanup(unsigned int cpu);
void hv_ringbuffer_pre_init(struct vmbus_channel *channel);
int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info,
- struct page *pages, u32 pagecnt, u32 max_pkt_size);
+ struct page *pages, u32 pagecnt, u32 max_pkt_size,
+ bool confidential);
void hv_ringbuffer_cleanup(struct hv_ring_buffer_info *ring_info);
@@ -255,8 +276,9 @@ struct vmbus_connection {
struct list_head chn_list;
struct mutex channel_mutex;
- /* Array of channels */
+ /* Array of channel pointers, indexed by relid */
struct vmbus_channel **channels;
+ u32 relid_hiwater;
/*
* An offer message is handled first on the work_queue, and then
@@ -333,6 +355,51 @@ extern const struct vmbus_channel_message_table_entry
/* General vmbus interface */
+bool vmbus_is_confidential(void);
+
+#if IS_ENABLED(CONFIG_HYPERV_VMBUS)
+/* Free the message slot and signal end-of-message if required */
+static inline void vmbus_signal_eom(struct hv_message *msg, u32 old_msg_type)
+{
+ /*
+ * On crash we're reading some other CPU's message page and we need
+ * to be careful: this other CPU may already had cleared the header
+ * and the host may already had delivered some other message there.
+ * In case we blindly write msg->header.message_type we're going
+ * to lose it. We can still lose a message of the same type but
+ * we count on the fact that there can only be one
+ * CHANNELMSG_UNLOAD_RESPONSE and we don't care about other messages
+ * on crash.
+ */
+ if (!try_cmpxchg(&msg->header.message_type,
+ &old_msg_type, HVMSG_NONE))
+ return;
+
+ /*
+ * The cmpxchg() above does an implicit memory barrier to
+ * ensure the write to MessageType (ie set to
+ * HVMSG_NONE) happens before we read the
+ * MessagePending and EOMing. Otherwise, the EOMing
+ * will not deliver any more messages since there is
+ * no empty slot
+ */
+ if (msg->header.message_flags.msg_pending) {
+ /*
+ * This will cause message queue rescan to
+ * possibly deliver another msg from the
+ * hypervisor
+ */
+ if (vmbus_is_confidential())
+ hv_para_set_synic_register(HV_MSR_EOM, 0);
+ else
+ hv_set_msr(HV_MSR_EOM, 0);
+ }
+}
+
+extern int vmbus_interrupt;
+extern int vmbus_irq;
+#endif /* CONFIG_HYPERV_VMBUS */
+
struct hv_device *vmbus_device_create(const guid_t *type,
const guid_t *instance,
struct vmbus_channel *channel);
@@ -477,4 +544,10 @@ static inline int hv_debug_add_dev_dir(struct hv_device *dev)
#endif /* CONFIG_HYPERV_TESTING */
+/* Create and remove sysfs entry for memory mapped ring buffers for a channel */
+int hv_create_ring_sysfs(struct vmbus_channel *channel,
+ int (*hv_mmap_prepare_ring_buffer)(struct vmbus_channel *channel,
+ struct vm_area_desc *desc));
+int hv_remove_ring_sysfs(struct vmbus_channel *channel);
+
#endif /* _HYPERV_VMBUS_H */
diff --git a/drivers/hv/mshv.h b/drivers/hv/mshv.h
new file mode 100644
index 000000000000..d4813df92b9c
--- /dev/null
+++ b/drivers/hv/mshv.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2023, Microsoft Corporation.
+ */
+
+#ifndef _MSHV_H_
+#define _MSHV_H_
+
+#include <linux/stddef.h>
+#include <linux/string.h>
+#include <hyperv/hvhdk.h>
+
+#define mshv_field_nonzero(STRUCT, MEMBER) \
+ memchr_inv(&((STRUCT).MEMBER), \
+ 0, sizeof_field(typeof(STRUCT), MEMBER))
+
+int hv_call_get_vp_registers(u32 vp_index, u64 partition_id, u16 count,
+ union hv_input_vtl input_vtl,
+ struct hv_register_assoc *registers);
+
+int hv_call_set_vp_registers(u32 vp_index, u64 partition_id, u16 count,
+ union hv_input_vtl input_vtl,
+ struct hv_register_assoc *registers);
+
+int hv_call_get_partition_property(u64 partition_id, u64 property_code,
+ u64 *property_value);
+
+#endif /* _MSHV_H */
diff --git a/drivers/hv/mshv_common.c b/drivers/hv/mshv_common.c
new file mode 100644
index 000000000000..63f09bb5136e
--- /dev/null
+++ b/drivers/hv/mshv_common.c
@@ -0,0 +1,241 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2024, Microsoft Corporation.
+ *
+ * This file contains functions that will be called from one or more modules.
+ * If any of these modules are configured to build, this file is built and just
+ * statically linked in.
+ *
+ * Authors: Microsoft Linux virtualization team
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <asm/mshyperv.h>
+#include <linux/resume_user_mode.h>
+#include <linux/export.h>
+#include <linux/acpi.h>
+#include <linux/notifier.h>
+#include <linux/reboot.h>
+
+#include "mshv.h"
+
+#define HV_GET_REGISTER_BATCH_SIZE \
+ (HV_HYP_PAGE_SIZE / sizeof(union hv_register_value))
+#define HV_SET_REGISTER_BATCH_SIZE \
+ ((HV_HYP_PAGE_SIZE - sizeof(struct hv_input_set_vp_registers)) \
+ / sizeof(struct hv_register_assoc))
+
+int hv_call_get_vp_registers(u32 vp_index, u64 partition_id, u16 count,
+ union hv_input_vtl input_vtl,
+ struct hv_register_assoc *registers)
+{
+ struct hv_input_get_vp_registers *input_page;
+ union hv_register_value *output_page;
+ u16 completed = 0;
+ unsigned long remaining = count;
+ int rep_count, i;
+ u64 status = HV_STATUS_SUCCESS;
+ unsigned long flags;
+
+ local_irq_save(flags);
+
+ input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ output_page = *this_cpu_ptr(hyperv_pcpu_output_arg);
+
+ input_page->partition_id = partition_id;
+ input_page->vp_index = vp_index;
+ input_page->input_vtl.as_uint8 = input_vtl.as_uint8;
+ input_page->rsvd_z8 = 0;
+ input_page->rsvd_z16 = 0;
+
+ while (remaining) {
+ rep_count = min(remaining, HV_GET_REGISTER_BATCH_SIZE);
+ for (i = 0; i < rep_count; ++i)
+ input_page->names[i] = registers[i].name;
+
+ status = hv_do_rep_hypercall(HVCALL_GET_VP_REGISTERS, rep_count,
+ 0, input_page, output_page);
+ if (!hv_result_success(status))
+ break;
+
+ completed = hv_repcomp(status);
+ for (i = 0; i < completed; ++i)
+ registers[i].value = output_page[i];
+
+ registers += completed;
+ remaining -= completed;
+ }
+ local_irq_restore(flags);
+
+ return hv_result_to_errno(status);
+}
+EXPORT_SYMBOL_GPL(hv_call_get_vp_registers);
+
+int hv_call_set_vp_registers(u32 vp_index, u64 partition_id, u16 count,
+ union hv_input_vtl input_vtl,
+ struct hv_register_assoc *registers)
+{
+ struct hv_input_set_vp_registers *input_page;
+ u16 completed = 0;
+ unsigned long remaining = count;
+ int rep_count;
+ u64 status = HV_STATUS_SUCCESS;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
+
+ input_page->partition_id = partition_id;
+ input_page->vp_index = vp_index;
+ input_page->input_vtl.as_uint8 = input_vtl.as_uint8;
+ input_page->rsvd_z8 = 0;
+ input_page->rsvd_z16 = 0;
+
+ while (remaining) {
+ rep_count = min(remaining, HV_SET_REGISTER_BATCH_SIZE);
+ memcpy(input_page->elements, registers,
+ sizeof(struct hv_register_assoc) * rep_count);
+
+ status = hv_do_rep_hypercall(HVCALL_SET_VP_REGISTERS, rep_count,
+ 0, input_page, NULL);
+ if (!hv_result_success(status))
+ break;
+
+ completed = hv_repcomp(status);
+ registers += completed;
+ remaining -= completed;
+ }
+
+ local_irq_restore(flags);
+
+ return hv_result_to_errno(status);
+}
+EXPORT_SYMBOL_GPL(hv_call_set_vp_registers);
+
+int hv_call_get_partition_property(u64 partition_id,
+ u64 property_code,
+ u64 *property_value)
+{
+ u64 status;
+ unsigned long flags;
+ struct hv_input_get_partition_property *input;
+ struct hv_output_get_partition_property *output;
+
+ local_irq_save(flags);
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ output = *this_cpu_ptr(hyperv_pcpu_output_arg);
+ memset(input, 0, sizeof(*input));
+ input->partition_id = partition_id;
+ input->property_code = property_code;
+ status = hv_do_hypercall(HVCALL_GET_PARTITION_PROPERTY, input, output);
+
+ if (!hv_result_success(status)) {
+ local_irq_restore(flags);
+ return hv_result_to_errno(status);
+ }
+ *property_value = output->property_value;
+
+ local_irq_restore(flags);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(hv_call_get_partition_property);
+
+#ifdef CONFIG_X86
+/*
+ * Corresponding sleep states have to be initialized in order for a subsequent
+ * HVCALL_ENTER_SLEEP_STATE call to succeed. Currently only S5 state as per
+ * ACPI 6.4 chapter 7.4.2 is relevant, while S1, S2 and S3 can be supported.
+ *
+ * In order to pass proper PM values to mshv, ACPI should be initialized and
+ * should support S5 sleep state when this method is invoked.
+ */
+static int hv_initialize_sleep_states(void)
+{
+ u64 status;
+ unsigned long flags;
+ struct hv_input_set_system_property *in;
+ acpi_status acpi_status;
+ u8 sleep_type_a, sleep_type_b;
+
+ if (!acpi_sleep_state_supported(ACPI_STATE_S5)) {
+ pr_err("%s: S5 sleep state not supported.\n", __func__);
+ return -ENODEV;
+ }
+
+ acpi_status = acpi_get_sleep_type_data(ACPI_STATE_S5, &sleep_type_a,
+ &sleep_type_b);
+ if (ACPI_FAILURE(acpi_status))
+ return -ENODEV;
+
+ local_irq_save(flags);
+ in = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ memset(in, 0, sizeof(*in));
+
+ in->property_id = HV_SYSTEM_PROPERTY_SLEEP_STATE;
+ in->set_sleep_state_info.sleep_state = HV_SLEEP_STATE_S5;
+ in->set_sleep_state_info.pm1a_slp_typ = sleep_type_a;
+ in->set_sleep_state_info.pm1b_slp_typ = sleep_type_b;
+
+ status = hv_do_hypercall(HVCALL_SET_SYSTEM_PROPERTY, in, NULL);
+ local_irq_restore(flags);
+
+ if (!hv_result_success(status)) {
+ hv_status_err(status, "\n");
+ return hv_result_to_errno(status);
+ }
+
+ return 0;
+}
+
+/*
+ * This notifier initializes sleep states in mshv hypervisor which will be
+ * used during power off.
+ */
+static int hv_reboot_notifier_handler(struct notifier_block *this,
+ unsigned long code, void *another)
+{
+ int ret = 0;
+
+ if (code == SYS_HALT || code == SYS_POWER_OFF)
+ ret = hv_initialize_sleep_states();
+
+ return ret ? NOTIFY_DONE : NOTIFY_OK;
+}
+
+static struct notifier_block hv_reboot_notifier = {
+ .notifier_call = hv_reboot_notifier_handler,
+};
+
+void hv_sleep_notifiers_register(void)
+{
+ int ret;
+
+ ret = register_reboot_notifier(&hv_reboot_notifier);
+ if (ret)
+ pr_err("%s: cannot register reboot notifier %d\n", __func__,
+ ret);
+}
+
+/*
+ * Power off the machine by entering S5 sleep state via Hyper-V hypercall.
+ * This call does not return if successful.
+ */
+void hv_machine_power_off(void)
+{
+ unsigned long flags;
+ struct hv_input_enter_sleep_state *in;
+
+ local_irq_save(flags);
+ in = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ in->sleep_state = HV_SLEEP_STATE_S5;
+
+ (void)hv_do_hypercall(HVCALL_ENTER_SLEEP_STATE, in, NULL);
+ local_irq_restore(flags);
+
+ /* should never reach here */
+ BUG();
+
+}
+#endif
diff --git a/drivers/hv/mshv_debugfs.c b/drivers/hv/mshv_debugfs.c
new file mode 100644
index 000000000000..418b6dc8f3c2
--- /dev/null
+++ b/drivers/hv/mshv_debugfs.c
@@ -0,0 +1,724 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2026, Microsoft Corporation.
+ *
+ * The /sys/kernel/debug/mshv directory contents.
+ * Contains various statistics data, provided by the hypervisor.
+ *
+ * Authors: Microsoft Linux virtualization team
+ */
+
+#include <linux/debugfs.h>
+#include <linux/stringify.h>
+#include <asm/mshyperv.h>
+#include <linux/slab.h>
+
+#include "mshv.h"
+#include "mshv_root.h"
+
+/* Ensure this file is not used elsewhere by accident */
+#define MSHV_DEBUGFS_C
+#include "mshv_debugfs_counters.c"
+
+#define U32_BUF_SZ 11
+#define U64_BUF_SZ 21
+/* Only support SELF and PARENT areas */
+#define NUM_STATS_AREAS 2
+static_assert(HV_STATS_AREA_SELF == 0 && HV_STATS_AREA_PARENT == 1,
+ "SELF and PARENT areas must be usable as indices into an array of size NUM_STATS_AREAS");
+/* HV_HYPERVISOR_COUNTER */
+#define HV_HYPERVISOR_COUNTER_LOGICAL_PROCESSORS 1
+
+static struct dentry *mshv_debugfs;
+static struct dentry *mshv_debugfs_partition;
+static struct dentry *mshv_debugfs_lp;
+static struct dentry **parent_vp_stats;
+static struct dentry *parent_partition_stats;
+
+static u64 mshv_lps_count;
+static struct hv_stats_page **mshv_lps_stats;
+
+static int lp_stats_show(struct seq_file *m, void *v)
+{
+ const struct hv_stats_page *stats = m->private;
+ int idx;
+
+ for (idx = 0; idx < ARRAY_SIZE(hv_lp_counters); idx++) {
+ char *name = hv_lp_counters[idx];
+
+ if (!name)
+ continue;
+ seq_printf(m, "%-32s: %llu\n", name, stats->data[idx]);
+ }
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(lp_stats);
+
+static void mshv_lp_stats_unmap(u32 lp_index)
+{
+ union hv_stats_object_identity identity = {
+ .lp.lp_index = lp_index,
+ .lp.stats_area_type = HV_STATS_AREA_SELF,
+ };
+ int err;
+
+ err = hv_unmap_stats_page(HV_STATS_OBJECT_LOGICAL_PROCESSOR,
+ mshv_lps_stats[lp_index], &identity);
+ if (err)
+ pr_err("%s: failed to unmap logical processor %u stats, err: %d\n",
+ __func__, lp_index, err);
+
+ mshv_lps_stats[lp_index] = NULL;
+}
+
+static struct hv_stats_page * __init mshv_lp_stats_map(u32 lp_index)
+{
+ union hv_stats_object_identity identity = {
+ .lp.lp_index = lp_index,
+ .lp.stats_area_type = HV_STATS_AREA_SELF,
+ };
+ struct hv_stats_page *stats;
+ int err;
+
+ err = hv_map_stats_page(HV_STATS_OBJECT_LOGICAL_PROCESSOR, &identity,
+ &stats);
+ if (err) {
+ pr_err("%s: failed to map logical processor %u stats, err: %d\n",
+ __func__, lp_index, err);
+ return ERR_PTR(err);
+ }
+ mshv_lps_stats[lp_index] = stats;
+
+ return stats;
+}
+
+static struct hv_stats_page * __init lp_debugfs_stats_create(u32 lp_index,
+ struct dentry *parent)
+{
+ struct dentry *dentry;
+ struct hv_stats_page *stats;
+
+ stats = mshv_lp_stats_map(lp_index);
+ if (IS_ERR(stats))
+ return stats;
+
+ dentry = debugfs_create_file("stats", 0400, parent,
+ stats, &lp_stats_fops);
+ if (IS_ERR(dentry)) {
+ mshv_lp_stats_unmap(lp_index);
+ return ERR_CAST(dentry);
+ }
+ return stats;
+}
+
+static int __init lp_debugfs_create(u32 lp_index, struct dentry *parent)
+{
+ struct dentry *idx;
+ char lp_idx_str[U32_BUF_SZ];
+ struct hv_stats_page *stats;
+ int err;
+
+ sprintf(lp_idx_str, "%u", lp_index);
+
+ idx = debugfs_create_dir(lp_idx_str, parent);
+ if (IS_ERR(idx))
+ return PTR_ERR(idx);
+
+ stats = lp_debugfs_stats_create(lp_index, idx);
+ if (IS_ERR(stats)) {
+ err = PTR_ERR(stats);
+ goto remove_debugfs_lp_idx;
+ }
+
+ return 0;
+
+remove_debugfs_lp_idx:
+ debugfs_remove_recursive(idx);
+ return err;
+}
+
+static void mshv_debugfs_lp_remove(void)
+{
+ int lp_index;
+
+ debugfs_remove_recursive(mshv_debugfs_lp);
+
+ for (lp_index = 0; lp_index < mshv_lps_count; lp_index++)
+ mshv_lp_stats_unmap(lp_index);
+
+ kfree(mshv_lps_stats);
+ mshv_lps_stats = NULL;
+}
+
+static int __init mshv_debugfs_lp_create(struct dentry *parent)
+{
+ struct dentry *lp_dir;
+ int err, lp_index;
+
+ mshv_lps_stats = kzalloc_objs(*mshv_lps_stats, mshv_lps_count,
+ GFP_KERNEL_ACCOUNT);
+
+ if (!mshv_lps_stats)
+ return -ENOMEM;
+
+ lp_dir = debugfs_create_dir("lp", parent);
+ if (IS_ERR(lp_dir)) {
+ err = PTR_ERR(lp_dir);
+ goto free_lp_stats;
+ }
+
+ for (lp_index = 0; lp_index < mshv_lps_count; lp_index++) {
+ err = lp_debugfs_create(lp_index, lp_dir);
+ if (err)
+ goto remove_debugfs_lps;
+ }
+
+ mshv_debugfs_lp = lp_dir;
+
+ return 0;
+
+remove_debugfs_lps:
+ for (lp_index -= 1; lp_index >= 0; lp_index--)
+ mshv_lp_stats_unmap(lp_index);
+ debugfs_remove_recursive(lp_dir);
+free_lp_stats:
+ kfree(mshv_lps_stats);
+ mshv_lps_stats = NULL;
+
+ return err;
+}
+
+static int vp_stats_show(struct seq_file *m, void *v)
+{
+ const struct hv_stats_page **pstats = m->private;
+ u64 parent_val, self_val;
+ int idx;
+
+ /*
+ * For VP and partition stats, there may be two stats areas mapped,
+ * SELF and PARENT. These refer to the privilege level of the data in
+ * each page. Some fields may be 0 in SELF and nonzero in PARENT, or
+ * vice versa.
+ *
+ * Hence, prioritize printing from the PARENT page (more privileged
+ * data), but use the value from the SELF page if the PARENT value is
+ * 0.
+ */
+
+ for (idx = 0; idx < ARRAY_SIZE(hv_vp_counters); idx++) {
+ char *name = hv_vp_counters[idx];
+
+ if (!name)
+ continue;
+
+ parent_val = pstats[HV_STATS_AREA_PARENT]->data[idx];
+ self_val = pstats[HV_STATS_AREA_SELF]->data[idx];
+ seq_printf(m, "%-43s: %llu\n", name,
+ parent_val ? parent_val : self_val);
+ }
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(vp_stats);
+
+static void vp_debugfs_remove(struct dentry *vp_stats)
+{
+ debugfs_remove_recursive(vp_stats->d_parent);
+}
+
+static int vp_debugfs_create(u64 partition_id, u32 vp_index,
+ struct hv_stats_page **pstats,
+ struct dentry **vp_stats_ptr,
+ struct dentry *parent)
+{
+ struct dentry *vp_idx_dir, *d;
+ char vp_idx_str[U32_BUF_SZ];
+ int err;
+
+ sprintf(vp_idx_str, "%u", vp_index);
+
+ vp_idx_dir = debugfs_create_dir(vp_idx_str, parent);
+ if (IS_ERR(vp_idx_dir))
+ return PTR_ERR(vp_idx_dir);
+
+ d = debugfs_create_file("stats", 0400, vp_idx_dir,
+ pstats, &vp_stats_fops);
+ if (IS_ERR(d)) {
+ err = PTR_ERR(d);
+ goto remove_debugfs_vp_idx;
+ }
+
+ *vp_stats_ptr = d;
+
+ return 0;
+
+remove_debugfs_vp_idx:
+ debugfs_remove_recursive(vp_idx_dir);
+ return err;
+}
+
+static int partition_stats_show(struct seq_file *m, void *v)
+{
+ const struct hv_stats_page **pstats = m->private;
+ u64 parent_val, self_val;
+ int idx;
+
+ for (idx = 0; idx < ARRAY_SIZE(hv_partition_counters); idx++) {
+ char *name = hv_partition_counters[idx];
+
+ if (!name)
+ continue;
+
+ parent_val = pstats[HV_STATS_AREA_PARENT]->data[idx];
+ self_val = pstats[HV_STATS_AREA_SELF]->data[idx];
+ seq_printf(m, "%-37s: %llu\n", name,
+ parent_val ? parent_val : self_val);
+ }
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(partition_stats);
+
+static void mshv_partition_stats_unmap(u64 partition_id,
+ struct hv_stats_page *stats_page,
+ enum hv_stats_area_type stats_area_type)
+{
+ union hv_stats_object_identity identity = {
+ .partition.partition_id = partition_id,
+ .partition.stats_area_type = stats_area_type,
+ };
+ int err;
+
+ err = hv_unmap_stats_page(HV_STATS_OBJECT_PARTITION, stats_page,
+ &identity);
+ if (err)
+ pr_err("%s: failed to unmap partition %lld %s stats, err: %d\n",
+ __func__, partition_id,
+ (stats_area_type == HV_STATS_AREA_SELF) ? "self" : "parent",
+ err);
+}
+
+static struct hv_stats_page *mshv_partition_stats_map(u64 partition_id,
+ enum hv_stats_area_type stats_area_type)
+{
+ union hv_stats_object_identity identity = {
+ .partition.partition_id = partition_id,
+ .partition.stats_area_type = stats_area_type,
+ };
+ struct hv_stats_page *stats;
+ int err;
+
+ err = hv_map_stats_page(HV_STATS_OBJECT_PARTITION, &identity, &stats);
+ if (err) {
+ pr_err("%s: failed to map partition %lld %s stats, err: %d\n",
+ __func__, partition_id,
+ (stats_area_type == HV_STATS_AREA_SELF) ? "self" : "parent",
+ err);
+ return ERR_PTR(err);
+ }
+ return stats;
+}
+
+static int mshv_debugfs_partition_stats_create(u64 partition_id,
+ struct dentry **partition_stats_ptr,
+ struct dentry *parent)
+{
+ struct dentry *dentry;
+ struct hv_stats_page **pstats;
+ int err;
+
+ pstats = kzalloc_objs(struct hv_stats_page *, NUM_STATS_AREAS,
+ GFP_KERNEL_ACCOUNT);
+ if (!pstats)
+ return -ENOMEM;
+
+ pstats[HV_STATS_AREA_SELF] = mshv_partition_stats_map(partition_id,
+ HV_STATS_AREA_SELF);
+ if (IS_ERR(pstats[HV_STATS_AREA_SELF])) {
+ err = PTR_ERR(pstats[HV_STATS_AREA_SELF]);
+ goto cleanup;
+ }
+
+ /*
+ * L1VH partition cannot access its partition stats in parent area.
+ */
+ if (is_l1vh_parent(partition_id)) {
+ pstats[HV_STATS_AREA_PARENT] = pstats[HV_STATS_AREA_SELF];
+ } else {
+ pstats[HV_STATS_AREA_PARENT] = mshv_partition_stats_map(partition_id,
+ HV_STATS_AREA_PARENT);
+ if (IS_ERR(pstats[HV_STATS_AREA_PARENT])) {
+ err = PTR_ERR(pstats[HV_STATS_AREA_PARENT]);
+ goto unmap_self;
+ }
+ if (!pstats[HV_STATS_AREA_PARENT])
+ pstats[HV_STATS_AREA_PARENT] = pstats[HV_STATS_AREA_SELF];
+ }
+
+ dentry = debugfs_create_file("stats", 0400, parent,
+ pstats, &partition_stats_fops);
+ if (IS_ERR(dentry)) {
+ err = PTR_ERR(dentry);
+ goto unmap_partition_stats;
+ }
+
+ *partition_stats_ptr = dentry;
+ return 0;
+
+unmap_partition_stats:
+ if (pstats[HV_STATS_AREA_PARENT] != pstats[HV_STATS_AREA_SELF])
+ mshv_partition_stats_unmap(partition_id, pstats[HV_STATS_AREA_PARENT],
+ HV_STATS_AREA_PARENT);
+unmap_self:
+ mshv_partition_stats_unmap(partition_id, pstats[HV_STATS_AREA_SELF],
+ HV_STATS_AREA_SELF);
+cleanup:
+ kfree(pstats);
+ return err;
+}
+
+static void partition_debugfs_remove(u64 partition_id, struct dentry *dentry)
+{
+ struct hv_stats_page **pstats = NULL;
+
+ pstats = dentry->d_inode->i_private;
+
+ debugfs_remove_recursive(dentry->d_parent);
+
+ if (pstats[HV_STATS_AREA_PARENT] != pstats[HV_STATS_AREA_SELF]) {
+ mshv_partition_stats_unmap(partition_id,
+ pstats[HV_STATS_AREA_PARENT],
+ HV_STATS_AREA_PARENT);
+ }
+
+ mshv_partition_stats_unmap(partition_id,
+ pstats[HV_STATS_AREA_SELF],
+ HV_STATS_AREA_SELF);
+
+ kfree(pstats);
+}
+
+static int partition_debugfs_create(u64 partition_id,
+ struct dentry **vp_dir_ptr,
+ struct dentry **partition_stats_ptr,
+ struct dentry *parent)
+{
+ char part_id_str[U64_BUF_SZ];
+ struct dentry *part_id_dir, *vp_dir;
+ int err;
+
+ if (is_l1vh_parent(partition_id))
+ sprintf(part_id_str, "self");
+ else
+ sprintf(part_id_str, "%llu", partition_id);
+
+ part_id_dir = debugfs_create_dir(part_id_str, parent);
+ if (IS_ERR(part_id_dir))
+ return PTR_ERR(part_id_dir);
+
+ vp_dir = debugfs_create_dir("vp", part_id_dir);
+ if (IS_ERR(vp_dir)) {
+ err = PTR_ERR(vp_dir);
+ goto remove_debugfs_partition_id;
+ }
+
+ err = mshv_debugfs_partition_stats_create(partition_id,
+ partition_stats_ptr,
+ part_id_dir);
+ if (err)
+ goto remove_debugfs_partition_id;
+
+ *vp_dir_ptr = vp_dir;
+
+ return 0;
+
+remove_debugfs_partition_id:
+ debugfs_remove_recursive(part_id_dir);
+ return err;
+}
+
+static void parent_vp_debugfs_remove(u32 vp_index,
+ struct dentry *vp_stats_ptr)
+{
+ struct hv_stats_page **pstats;
+
+ pstats = vp_stats_ptr->d_inode->i_private;
+ vp_debugfs_remove(vp_stats_ptr);
+ mshv_vp_stats_unmap(hv_current_partition_id, vp_index, pstats);
+ kfree(pstats);
+}
+
+static void mshv_debugfs_parent_partition_remove(void)
+{
+ int idx;
+
+ for_each_online_cpu(idx)
+ parent_vp_debugfs_remove(hv_vp_index[idx],
+ parent_vp_stats[idx]);
+
+ partition_debugfs_remove(hv_current_partition_id,
+ parent_partition_stats);
+ kfree(parent_vp_stats);
+ parent_vp_stats = NULL;
+ parent_partition_stats = NULL;
+}
+
+static int __init parent_vp_debugfs_create(u32 vp_index,
+ struct dentry **vp_stats_ptr,
+ struct dentry *parent)
+{
+ struct hv_stats_page **pstats;
+ int err;
+
+ pstats = kzalloc_objs(struct hv_stats_page *, NUM_STATS_AREAS,
+ GFP_KERNEL_ACCOUNT);
+ if (!pstats)
+ return -ENOMEM;
+
+ err = mshv_vp_stats_map(hv_current_partition_id, vp_index, pstats);
+ if (err)
+ goto cleanup;
+
+ err = vp_debugfs_create(hv_current_partition_id, vp_index, pstats,
+ vp_stats_ptr, parent);
+ if (err)
+ goto unmap_vp_stats;
+
+ return 0;
+
+unmap_vp_stats:
+ mshv_vp_stats_unmap(hv_current_partition_id, vp_index, pstats);
+cleanup:
+ kfree(pstats);
+ return err;
+}
+
+static int __init mshv_debugfs_parent_partition_create(void)
+{
+ struct dentry *vp_dir;
+ int err, idx, i;
+
+ mshv_debugfs_partition = debugfs_create_dir("partition",
+ mshv_debugfs);
+ if (IS_ERR(mshv_debugfs_partition))
+ return PTR_ERR(mshv_debugfs_partition);
+
+ err = partition_debugfs_create(hv_current_partition_id,
+ &vp_dir,
+ &parent_partition_stats,
+ mshv_debugfs_partition);
+ if (err)
+ goto remove_debugfs_partition;
+
+ parent_vp_stats = kzalloc_objs(*parent_vp_stats, nr_cpu_ids);
+ if (!parent_vp_stats) {
+ err = -ENOMEM;
+ goto remove_debugfs_partition;
+ }
+
+ for_each_online_cpu(idx) {
+ err = parent_vp_debugfs_create(hv_vp_index[idx],
+ &parent_vp_stats[idx],
+ vp_dir);
+ if (err)
+ goto remove_debugfs_partition_vp;
+ }
+
+ return 0;
+
+remove_debugfs_partition_vp:
+ for_each_online_cpu(i) {
+ if (i >= idx)
+ break;
+ parent_vp_debugfs_remove(i, parent_vp_stats[i]);
+ }
+ partition_debugfs_remove(hv_current_partition_id,
+ parent_partition_stats);
+
+ kfree(parent_vp_stats);
+ parent_vp_stats = NULL;
+ parent_partition_stats = NULL;
+
+remove_debugfs_partition:
+ debugfs_remove_recursive(mshv_debugfs_partition);
+ mshv_debugfs_partition = NULL;
+ return err;
+}
+
+static int hv_stats_show(struct seq_file *m, void *v)
+{
+ const struct hv_stats_page *stats = m->private;
+ int idx;
+
+ for (idx = 0; idx < ARRAY_SIZE(hv_hypervisor_counters); idx++) {
+ char *name = hv_hypervisor_counters[idx];
+
+ if (!name)
+ continue;
+ seq_printf(m, "%-27s: %llu\n", name, stats->data[idx]);
+ }
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(hv_stats);
+
+static void mshv_hv_stats_unmap(void)
+{
+ union hv_stats_object_identity identity = {
+ .hv.stats_area_type = HV_STATS_AREA_SELF,
+ };
+ int err;
+
+ err = hv_unmap_stats_page(HV_STATS_OBJECT_HYPERVISOR, NULL, &identity);
+ if (err)
+ pr_err("%s: failed to unmap hypervisor stats: %d\n",
+ __func__, err);
+}
+
+static void * __init mshv_hv_stats_map(void)
+{
+ union hv_stats_object_identity identity = {
+ .hv.stats_area_type = HV_STATS_AREA_SELF,
+ };
+ struct hv_stats_page *stats;
+ int err;
+
+ err = hv_map_stats_page(HV_STATS_OBJECT_HYPERVISOR, &identity, &stats);
+ if (err) {
+ pr_err("%s: failed to map hypervisor stats: %d\n",
+ __func__, err);
+ return ERR_PTR(err);
+ }
+ return stats;
+}
+
+static int __init mshv_debugfs_hv_stats_create(struct dentry *parent)
+{
+ struct dentry *dentry;
+ u64 *stats;
+ int err;
+
+ stats = mshv_hv_stats_map();
+ if (IS_ERR(stats))
+ return PTR_ERR(stats);
+
+ dentry = debugfs_create_file("stats", 0400, parent,
+ stats, &hv_stats_fops);
+ if (IS_ERR(dentry)) {
+ err = PTR_ERR(dentry);
+ pr_err("%s: failed to create hypervisor stats dentry: %d\n",
+ __func__, err);
+ goto unmap_hv_stats;
+ }
+
+ mshv_lps_count = stats[HV_HYPERVISOR_COUNTER_LOGICAL_PROCESSORS];
+
+ return 0;
+
+unmap_hv_stats:
+ mshv_hv_stats_unmap();
+ return err;
+}
+
+int mshv_debugfs_vp_create(struct mshv_vp *vp)
+{
+ struct mshv_partition *p = vp->vp_partition;
+
+ if (!mshv_debugfs)
+ return 0;
+
+ return vp_debugfs_create(p->pt_id, vp->vp_index,
+ vp->vp_stats_pages,
+ &vp->vp_stats_dentry,
+ p->pt_vp_dentry);
+}
+
+void mshv_debugfs_vp_remove(struct mshv_vp *vp)
+{
+ if (!mshv_debugfs)
+ return;
+
+ vp_debugfs_remove(vp->vp_stats_dentry);
+}
+
+int mshv_debugfs_partition_create(struct mshv_partition *partition)
+{
+ int err;
+
+ if (!mshv_debugfs)
+ return 0;
+
+ err = partition_debugfs_create(partition->pt_id,
+ &partition->pt_vp_dentry,
+ &partition->pt_stats_dentry,
+ mshv_debugfs_partition);
+ if (err)
+ return err;
+
+ return 0;
+}
+
+void mshv_debugfs_partition_remove(struct mshv_partition *partition)
+{
+ if (!mshv_debugfs)
+ return;
+
+ partition_debugfs_remove(partition->pt_id,
+ partition->pt_stats_dentry);
+}
+
+int __init mshv_debugfs_init(void)
+{
+ int err;
+
+ mshv_debugfs = debugfs_create_dir("mshv", NULL);
+ if (IS_ERR(mshv_debugfs)) {
+ pr_err("%s: failed to create debugfs directory\n", __func__);
+ return PTR_ERR(mshv_debugfs);
+ }
+
+ if (hv_root_partition()) {
+ err = mshv_debugfs_hv_stats_create(mshv_debugfs);
+ if (err)
+ goto remove_mshv_dir;
+
+ err = mshv_debugfs_lp_create(mshv_debugfs);
+ if (err)
+ goto unmap_hv_stats;
+ }
+
+ err = mshv_debugfs_parent_partition_create();
+ if (err)
+ goto unmap_lp_stats;
+
+ return 0;
+
+unmap_lp_stats:
+ if (hv_root_partition()) {
+ mshv_debugfs_lp_remove();
+ mshv_debugfs_lp = NULL;
+ }
+unmap_hv_stats:
+ if (hv_root_partition())
+ mshv_hv_stats_unmap();
+remove_mshv_dir:
+ debugfs_remove_recursive(mshv_debugfs);
+ mshv_debugfs = NULL;
+ return err;
+}
+
+void mshv_debugfs_exit(void)
+{
+ mshv_debugfs_parent_partition_remove();
+
+ if (hv_root_partition()) {
+ mshv_debugfs_lp_remove();
+ mshv_debugfs_lp = NULL;
+ mshv_hv_stats_unmap();
+ }
+
+ debugfs_remove_recursive(mshv_debugfs);
+ mshv_debugfs = NULL;
+ mshv_debugfs_partition = NULL;
+}
diff --git a/drivers/hv/mshv_debugfs_counters.c b/drivers/hv/mshv_debugfs_counters.c
new file mode 100644
index 000000000000..978536ba691f
--- /dev/null
+++ b/drivers/hv/mshv_debugfs_counters.c
@@ -0,0 +1,490 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2026, Microsoft Corporation.
+ *
+ * Data for printing stats page counters via debugfs.
+ *
+ * Authors: Microsoft Linux virtualization team
+ */
+
+/*
+ * For simplicity, this file is included directly in mshv_debugfs.c.
+ * If these are ever needed elsewhere they should be compiled separately.
+ * Ensure this file is not used twice by accident.
+ */
+#ifndef MSHV_DEBUGFS_C
+#error "This file should only be included in mshv_debugfs.c"
+#endif
+
+/* HV_HYPERVISOR_COUNTER */
+static char *hv_hypervisor_counters[] = {
+ [1] = "HvLogicalProcessors",
+ [2] = "HvPartitions",
+ [3] = "HvTotalPages",
+ [4] = "HvVirtualProcessors",
+ [5] = "HvMonitoredNotifications",
+ [6] = "HvModernStandbyEntries",
+ [7] = "HvPlatformIdleTransitions",
+ [8] = "HvHypervisorStartupCost",
+
+ [10] = "HvIOSpacePages",
+ [11] = "HvNonEssentialPagesForDump",
+ [12] = "HvSubsumedPages",
+};
+
+/* HV_CPU_COUNTER */
+static char *hv_lp_counters[] = {
+ [1] = "LpGlobalTime",
+ [2] = "LpTotalRunTime",
+ [3] = "LpHypervisorRunTime",
+ [4] = "LpHardwareInterrupts",
+ [5] = "LpContextSwitches",
+ [6] = "LpInterProcessorInterrupts",
+ [7] = "LpSchedulerInterrupts",
+ [8] = "LpTimerInterrupts",
+ [9] = "LpInterProcessorInterruptsSent",
+ [10] = "LpProcessorHalts",
+ [11] = "LpMonitorTransitionCost",
+ [12] = "LpContextSwitchTime",
+ [13] = "LpC1TransitionsCount",
+ [14] = "LpC1RunTime",
+ [15] = "LpC2TransitionsCount",
+ [16] = "LpC2RunTime",
+ [17] = "LpC3TransitionsCount",
+ [18] = "LpC3RunTime",
+ [19] = "LpRootVpIndex",
+ [20] = "LpIdleSequenceNumber",
+ [21] = "LpGlobalTscCount",
+ [22] = "LpActiveTscCount",
+ [23] = "LpIdleAccumulation",
+ [24] = "LpReferenceCycleCount0",
+ [25] = "LpActualCycleCount0",
+ [26] = "LpReferenceCycleCount1",
+ [27] = "LpActualCycleCount1",
+ [28] = "LpProximityDomainId",
+ [29] = "LpPostedInterruptNotifications",
+ [30] = "LpBranchPredictorFlushes",
+#if IS_ENABLED(CONFIG_X86_64)
+ [31] = "LpL1DataCacheFlushes",
+ [32] = "LpImmediateL1DataCacheFlushes",
+ [33] = "LpMbFlushes",
+ [34] = "LpCounterRefreshSequenceNumber",
+ [35] = "LpCounterRefreshReferenceTime",
+ [36] = "LpIdleAccumulationSnapshot",
+ [37] = "LpActiveTscCountSnapshot",
+ [38] = "LpHwpRequestContextSwitches",
+ [39] = "LpPlaceholder1",
+ [40] = "LpPlaceholder2",
+ [41] = "LpPlaceholder3",
+ [42] = "LpPlaceholder4",
+ [43] = "LpPlaceholder5",
+ [44] = "LpPlaceholder6",
+ [45] = "LpPlaceholder7",
+ [46] = "LpPlaceholder8",
+ [47] = "LpPlaceholder9",
+ [48] = "LpSchLocalRunListSize",
+ [49] = "LpReserveGroupId",
+ [50] = "LpRunningPriority",
+ [51] = "LpPerfmonInterruptCount",
+#elif IS_ENABLED(CONFIG_ARM64)
+ [31] = "LpCounterRefreshSequenceNumber",
+ [32] = "LpCounterRefreshReferenceTime",
+ [33] = "LpIdleAccumulationSnapshot",
+ [34] = "LpActiveTscCountSnapshot",
+ [35] = "LpHwpRequestContextSwitches",
+ [36] = "LpPlaceholder2",
+ [37] = "LpPlaceholder3",
+ [38] = "LpPlaceholder4",
+ [39] = "LpPlaceholder5",
+ [40] = "LpPlaceholder6",
+ [41] = "LpPlaceholder7",
+ [42] = "LpPlaceholder8",
+ [43] = "LpPlaceholder9",
+ [44] = "LpSchLocalRunListSize",
+ [45] = "LpReserveGroupId",
+ [46] = "LpRunningPriority",
+#endif
+};
+
+/* HV_PROCESS_COUNTER */
+static char *hv_partition_counters[] = {
+ [1] = "PtVirtualProcessors",
+
+ [3] = "PtTlbSize",
+ [4] = "PtAddressSpaces",
+ [5] = "PtDepositedPages",
+ [6] = "PtGpaPages",
+ [7] = "PtGpaSpaceModifications",
+ [8] = "PtVirtualTlbFlushEntires",
+ [9] = "PtRecommendedTlbSize",
+ [10] = "PtGpaPages4K",
+ [11] = "PtGpaPages2M",
+ [12] = "PtGpaPages1G",
+ [13] = "PtGpaPages512G",
+ [14] = "PtDevicePages4K",
+ [15] = "PtDevicePages2M",
+ [16] = "PtDevicePages1G",
+ [17] = "PtDevicePages512G",
+ [18] = "PtAttachedDevices",
+ [19] = "PtDeviceInterruptMappings",
+ [20] = "PtIoTlbFlushes",
+ [21] = "PtIoTlbFlushCost",
+ [22] = "PtDeviceInterruptErrors",
+ [23] = "PtDeviceDmaErrors",
+ [24] = "PtDeviceInterruptThrottleEvents",
+ [25] = "PtSkippedTimerTicks",
+ [26] = "PtPartitionId",
+#if IS_ENABLED(CONFIG_X86_64)
+ [27] = "PtNestedTlbSize",
+ [28] = "PtRecommendedNestedTlbSize",
+ [29] = "PtNestedTlbFreeListSize",
+ [30] = "PtNestedTlbTrimmedPages",
+ [31] = "PtPagesShattered",
+ [32] = "PtPagesRecombined",
+ [33] = "PtHwpRequestValue",
+ [34] = "PtAutoSuspendEnableTime",
+ [35] = "PtAutoSuspendTriggerTime",
+ [36] = "PtAutoSuspendDisableTime",
+ [37] = "PtPlaceholder1",
+ [38] = "PtPlaceholder2",
+ [39] = "PtPlaceholder3",
+ [40] = "PtPlaceholder4",
+ [41] = "PtPlaceholder5",
+ [42] = "PtPlaceholder6",
+ [43] = "PtPlaceholder7",
+ [44] = "PtPlaceholder8",
+ [45] = "PtHypervisorStateTransferGeneration",
+ [46] = "PtNumberofActiveChildPartitions",
+#elif IS_ENABLED(CONFIG_ARM64)
+ [27] = "PtHwpRequestValue",
+ [28] = "PtAutoSuspendEnableTime",
+ [29] = "PtAutoSuspendTriggerTime",
+ [30] = "PtAutoSuspendDisableTime",
+ [31] = "PtPlaceholder1",
+ [32] = "PtPlaceholder2",
+ [33] = "PtPlaceholder3",
+ [34] = "PtPlaceholder4",
+ [35] = "PtPlaceholder5",
+ [36] = "PtPlaceholder6",
+ [37] = "PtPlaceholder7",
+ [38] = "PtPlaceholder8",
+ [39] = "PtHypervisorStateTransferGeneration",
+ [40] = "PtNumberofActiveChildPartitions",
+#endif
+};
+
+/* HV_THREAD_COUNTER */
+static char *hv_vp_counters[] = {
+ [1] = "VpTotalRunTime",
+ [2] = "VpHypervisorRunTime",
+ [3] = "VpRemoteNodeRunTime",
+ [4] = "VpNormalizedRunTime",
+ [5] = "VpIdealCpu",
+
+ [7] = "VpHypercallsCount",
+ [8] = "VpHypercallsTime",
+#if IS_ENABLED(CONFIG_X86_64)
+ [9] = "VpPageInvalidationsCount",
+ [10] = "VpPageInvalidationsTime",
+ [11] = "VpControlRegisterAccessesCount",
+ [12] = "VpControlRegisterAccessesTime",
+ [13] = "VpIoInstructionsCount",
+ [14] = "VpIoInstructionsTime",
+ [15] = "VpHltInstructionsCount",
+ [16] = "VpHltInstructionsTime",
+ [17] = "VpMwaitInstructionsCount",
+ [18] = "VpMwaitInstructionsTime",
+ [19] = "VpCpuidInstructionsCount",
+ [20] = "VpCpuidInstructionsTime",
+ [21] = "VpMsrAccessesCount",
+ [22] = "VpMsrAccessesTime",
+ [23] = "VpOtherInterceptsCount",
+ [24] = "VpOtherInterceptsTime",
+ [25] = "VpExternalInterruptsCount",
+ [26] = "VpExternalInterruptsTime",
+ [27] = "VpPendingInterruptsCount",
+ [28] = "VpPendingInterruptsTime",
+ [29] = "VpEmulatedInstructionsCount",
+ [30] = "VpEmulatedInstructionsTime",
+ [31] = "VpDebugRegisterAccessesCount",
+ [32] = "VpDebugRegisterAccessesTime",
+ [33] = "VpPageFaultInterceptsCount",
+ [34] = "VpPageFaultInterceptsTime",
+ [35] = "VpGuestPageTableMaps",
+ [36] = "VpLargePageTlbFills",
+ [37] = "VpSmallPageTlbFills",
+ [38] = "VpReflectedGuestPageFaults",
+ [39] = "VpApicMmioAccesses",
+ [40] = "VpIoInterceptMessages",
+ [41] = "VpMemoryInterceptMessages",
+ [42] = "VpApicEoiAccesses",
+ [43] = "VpOtherMessages",
+ [44] = "VpPageTableAllocations",
+ [45] = "VpLogicalProcessorMigrations",
+ [46] = "VpAddressSpaceEvictions",
+ [47] = "VpAddressSpaceSwitches",
+ [48] = "VpAddressDomainFlushes",
+ [49] = "VpAddressSpaceFlushes",
+ [50] = "VpGlobalGvaRangeFlushes",
+ [51] = "VpLocalGvaRangeFlushes",
+ [52] = "VpPageTableEvictions",
+ [53] = "VpPageTableReclamations",
+ [54] = "VpPageTableResets",
+ [55] = "VpPageTableValidations",
+ [56] = "VpApicTprAccesses",
+ [57] = "VpPageTableWriteIntercepts",
+ [58] = "VpSyntheticInterrupts",
+ [59] = "VpVirtualInterrupts",
+ [60] = "VpApicIpisSent",
+ [61] = "VpApicSelfIpisSent",
+ [62] = "VpGpaSpaceHypercalls",
+ [63] = "VpLogicalProcessorHypercalls",
+ [64] = "VpLongSpinWaitHypercalls",
+ [65] = "VpOtherHypercalls",
+ [66] = "VpSyntheticInterruptHypercalls",
+ [67] = "VpVirtualInterruptHypercalls",
+ [68] = "VpVirtualMmuHypercalls",
+ [69] = "VpVirtualProcessorHypercalls",
+ [70] = "VpHardwareInterrupts",
+ [71] = "VpNestedPageFaultInterceptsCount",
+ [72] = "VpNestedPageFaultInterceptsTime",
+ [73] = "VpPageScans",
+ [74] = "VpLogicalProcessorDispatches",
+ [75] = "VpWaitingForCpuTime",
+ [76] = "VpExtendedHypercalls",
+ [77] = "VpExtendedHypercallInterceptMessages",
+ [78] = "VpMbecNestedPageTableSwitches",
+ [79] = "VpOtherReflectedGuestExceptions",
+ [80] = "VpGlobalIoTlbFlushes",
+ [81] = "VpGlobalIoTlbFlushCost",
+ [82] = "VpLocalIoTlbFlushes",
+ [83] = "VpLocalIoTlbFlushCost",
+ [84] = "VpHypercallsForwardedCount",
+ [85] = "VpHypercallsForwardingTime",
+ [86] = "VpPageInvalidationsForwardedCount",
+ [87] = "VpPageInvalidationsForwardingTime",
+ [88] = "VpControlRegisterAccessesForwardedCount",
+ [89] = "VpControlRegisterAccessesForwardingTime",
+ [90] = "VpIoInstructionsForwardedCount",
+ [91] = "VpIoInstructionsForwardingTime",
+ [92] = "VpHltInstructionsForwardedCount",
+ [93] = "VpHltInstructionsForwardingTime",
+ [94] = "VpMwaitInstructionsForwardedCount",
+ [95] = "VpMwaitInstructionsForwardingTime",
+ [96] = "VpCpuidInstructionsForwardedCount",
+ [97] = "VpCpuidInstructionsForwardingTime",
+ [98] = "VpMsrAccessesForwardedCount",
+ [99] = "VpMsrAccessesForwardingTime",
+ [100] = "VpOtherInterceptsForwardedCount",
+ [101] = "VpOtherInterceptsForwardingTime",
+ [102] = "VpExternalInterruptsForwardedCount",
+ [103] = "VpExternalInterruptsForwardingTime",
+ [104] = "VpPendingInterruptsForwardedCount",
+ [105] = "VpPendingInterruptsForwardingTime",
+ [106] = "VpEmulatedInstructionsForwardedCount",
+ [107] = "VpEmulatedInstructionsForwardingTime",
+ [108] = "VpDebugRegisterAccessesForwardedCount",
+ [109] = "VpDebugRegisterAccessesForwardingTime",
+ [110] = "VpPageFaultInterceptsForwardedCount",
+ [111] = "VpPageFaultInterceptsForwardingTime",
+ [112] = "VpVmclearEmulationCount",
+ [113] = "VpVmclearEmulationTime",
+ [114] = "VpVmptrldEmulationCount",
+ [115] = "VpVmptrldEmulationTime",
+ [116] = "VpVmptrstEmulationCount",
+ [117] = "VpVmptrstEmulationTime",
+ [118] = "VpVmreadEmulationCount",
+ [119] = "VpVmreadEmulationTime",
+ [120] = "VpVmwriteEmulationCount",
+ [121] = "VpVmwriteEmulationTime",
+ [122] = "VpVmxoffEmulationCount",
+ [123] = "VpVmxoffEmulationTime",
+ [124] = "VpVmxonEmulationCount",
+ [125] = "VpVmxonEmulationTime",
+ [126] = "VpNestedVMEntriesCount",
+ [127] = "VpNestedVMEntriesTime",
+ [128] = "VpNestedSLATSoftPageFaultsCount",
+ [129] = "VpNestedSLATSoftPageFaultsTime",
+ [130] = "VpNestedSLATHardPageFaultsCount",
+ [131] = "VpNestedSLATHardPageFaultsTime",
+ [132] = "VpInvEptAllContextEmulationCount",
+ [133] = "VpInvEptAllContextEmulationTime",
+ [134] = "VpInvEptSingleContextEmulationCount",
+ [135] = "VpInvEptSingleContextEmulationTime",
+ [136] = "VpInvVpidAllContextEmulationCount",
+ [137] = "VpInvVpidAllContextEmulationTime",
+ [138] = "VpInvVpidSingleContextEmulationCount",
+ [139] = "VpInvVpidSingleContextEmulationTime",
+ [140] = "VpInvVpidSingleAddressEmulationCount",
+ [141] = "VpInvVpidSingleAddressEmulationTime",
+ [142] = "VpNestedTlbPageTableReclamations",
+ [143] = "VpNestedTlbPageTableEvictions",
+ [144] = "VpFlushGuestPhysicalAddressSpaceHypercalls",
+ [145] = "VpFlushGuestPhysicalAddressListHypercalls",
+ [146] = "VpPostedInterruptNotifications",
+ [147] = "VpPostedInterruptScans",
+ [148] = "VpTotalCoreRunTime",
+ [149] = "VpMaximumRunTime",
+ [150] = "VpHwpRequestContextSwitches",
+ [151] = "VpWaitingForCpuTimeBucket0",
+ [152] = "VpWaitingForCpuTimeBucket1",
+ [153] = "VpWaitingForCpuTimeBucket2",
+ [154] = "VpWaitingForCpuTimeBucket3",
+ [155] = "VpWaitingForCpuTimeBucket4",
+ [156] = "VpWaitingForCpuTimeBucket5",
+ [157] = "VpWaitingForCpuTimeBucket6",
+ [158] = "VpVmloadEmulationCount",
+ [159] = "VpVmloadEmulationTime",
+ [160] = "VpVmsaveEmulationCount",
+ [161] = "VpVmsaveEmulationTime",
+ [162] = "VpGifInstructionEmulationCount",
+ [163] = "VpGifInstructionEmulationTime",
+ [164] = "VpEmulatedErrataSvmInstructions",
+ [165] = "VpPlaceholder1",
+ [166] = "VpPlaceholder2",
+ [167] = "VpPlaceholder3",
+ [168] = "VpPlaceholder4",
+ [169] = "VpPlaceholder5",
+ [170] = "VpPlaceholder6",
+ [171] = "VpPlaceholder7",
+ [172] = "VpPlaceholder8",
+ [173] = "VpContentionTime",
+ [174] = "VpWakeUpTime",
+ [175] = "VpSchedulingPriority",
+ [176] = "VpRdpmcInstructionsCount",
+ [177] = "VpRdpmcInstructionsTime",
+ [178] = "VpPerfmonPmuMsrAccessesCount",
+ [179] = "VpPerfmonLbrMsrAccessesCount",
+ [180] = "VpPerfmonIptMsrAccessesCount",
+ [181] = "VpPerfmonInterruptCount",
+ [182] = "VpVtl1DispatchCount",
+ [183] = "VpVtl2DispatchCount",
+ [184] = "VpVtl2DispatchBucket0",
+ [185] = "VpVtl2DispatchBucket1",
+ [186] = "VpVtl2DispatchBucket2",
+ [187] = "VpVtl2DispatchBucket3",
+ [188] = "VpVtl2DispatchBucket4",
+ [189] = "VpVtl2DispatchBucket5",
+ [190] = "VpVtl2DispatchBucket6",
+ [191] = "VpVtl1RunTime",
+ [192] = "VpVtl2RunTime",
+ [193] = "VpIommuHypercalls",
+ [194] = "VpCpuGroupHypercalls",
+ [195] = "VpVsmHypercalls",
+ [196] = "VpEventLogHypercalls",
+ [197] = "VpDeviceDomainHypercalls",
+ [198] = "VpDepositHypercalls",
+ [199] = "VpSvmHypercalls",
+ [200] = "VpBusLockAcquisitionCount",
+ [201] = "VpLoadAvg",
+ [202] = "VpRootDispatchThreadBlocked",
+ [203] = "VpIdleCpuTime",
+ [204] = "VpWaitingForCpuTimeBucket7",
+ [205] = "VpWaitingForCpuTimeBucket8",
+ [206] = "VpWaitingForCpuTimeBucket9",
+ [207] = "VpWaitingForCpuTimeBucket10",
+ [208] = "VpWaitingForCpuTimeBucket11",
+ [209] = "VpWaitingForCpuTimeBucket12",
+ [210] = "VpHierarchicalSuspendTime",
+ [211] = "VpExpressSchedulingAttempts",
+ [212] = "VpExpressSchedulingCount",
+#elif IS_ENABLED(CONFIG_ARM64)
+ [9] = "VpSysRegAccessesCount",
+ [10] = "VpSysRegAccessesTime",
+ [11] = "VpSmcInstructionsCount",
+ [12] = "VpSmcInstructionsTime",
+ [13] = "VpOtherInterceptsCount",
+ [14] = "VpOtherInterceptsTime",
+ [15] = "VpExternalInterruptsCount",
+ [16] = "VpExternalInterruptsTime",
+ [17] = "VpPendingInterruptsCount",
+ [18] = "VpPendingInterruptsTime",
+ [19] = "VpGuestPageTableMaps",
+ [20] = "VpLargePageTlbFills",
+ [21] = "VpSmallPageTlbFills",
+ [22] = "VpReflectedGuestPageFaults",
+ [23] = "VpMemoryInterceptMessages",
+ [24] = "VpOtherMessages",
+ [25] = "VpLogicalProcessorMigrations",
+ [26] = "VpAddressDomainFlushes",
+ [27] = "VpAddressSpaceFlushes",
+ [28] = "VpSyntheticInterrupts",
+ [29] = "VpVirtualInterrupts",
+ [30] = "VpApicSelfIpisSent",
+ [31] = "VpGpaSpaceHypercalls",
+ [32] = "VpLogicalProcessorHypercalls",
+ [33] = "VpLongSpinWaitHypercalls",
+ [34] = "VpOtherHypercalls",
+ [35] = "VpSyntheticInterruptHypercalls",
+ [36] = "VpVirtualInterruptHypercalls",
+ [37] = "VpVirtualMmuHypercalls",
+ [38] = "VpVirtualProcessorHypercalls",
+ [39] = "VpHardwareInterrupts",
+ [40] = "VpNestedPageFaultInterceptsCount",
+ [41] = "VpNestedPageFaultInterceptsTime",
+ [42] = "VpLogicalProcessorDispatches",
+ [43] = "VpWaitingForCpuTime",
+ [44] = "VpExtendedHypercalls",
+ [45] = "VpExtendedHypercallInterceptMessages",
+ [46] = "VpMbecNestedPageTableSwitches",
+ [47] = "VpOtherReflectedGuestExceptions",
+ [48] = "VpGlobalIoTlbFlushes",
+ [49] = "VpGlobalIoTlbFlushCost",
+ [50] = "VpLocalIoTlbFlushes",
+ [51] = "VpLocalIoTlbFlushCost",
+ [52] = "VpFlushGuestPhysicalAddressSpaceHypercalls",
+ [53] = "VpFlushGuestPhysicalAddressListHypercalls",
+ [54] = "VpPostedInterruptNotifications",
+ [55] = "VpPostedInterruptScans",
+ [56] = "VpTotalCoreRunTime",
+ [57] = "VpMaximumRunTime",
+ [58] = "VpWaitingForCpuTimeBucket0",
+ [59] = "VpWaitingForCpuTimeBucket1",
+ [60] = "VpWaitingForCpuTimeBucket2",
+ [61] = "VpWaitingForCpuTimeBucket3",
+ [62] = "VpWaitingForCpuTimeBucket4",
+ [63] = "VpWaitingForCpuTimeBucket5",
+ [64] = "VpWaitingForCpuTimeBucket6",
+ [65] = "VpHwpRequestContextSwitches",
+ [66] = "VpPlaceholder2",
+ [67] = "VpPlaceholder3",
+ [68] = "VpPlaceholder4",
+ [69] = "VpPlaceholder5",
+ [70] = "VpPlaceholder6",
+ [71] = "VpPlaceholder7",
+ [72] = "VpPlaceholder8",
+ [73] = "VpContentionTime",
+ [74] = "VpWakeUpTime",
+ [75] = "VpSchedulingPriority",
+ [76] = "VpVtl1DispatchCount",
+ [77] = "VpVtl2DispatchCount",
+ [78] = "VpVtl2DispatchBucket0",
+ [79] = "VpVtl2DispatchBucket1",
+ [80] = "VpVtl2DispatchBucket2",
+ [81] = "VpVtl2DispatchBucket3",
+ [82] = "VpVtl2DispatchBucket4",
+ [83] = "VpVtl2DispatchBucket5",
+ [84] = "VpVtl2DispatchBucket6",
+ [85] = "VpVtl1RunTime",
+ [86] = "VpVtl2RunTime",
+ [87] = "VpIommuHypercalls",
+ [88] = "VpCpuGroupHypercalls",
+ [89] = "VpVsmHypercalls",
+ [90] = "VpEventLogHypercalls",
+ [91] = "VpDeviceDomainHypercalls",
+ [92] = "VpDepositHypercalls",
+ [93] = "VpSvmHypercalls",
+ [94] = "VpLoadAvg",
+ [95] = "VpRootDispatchThreadBlocked",
+ [96] = "VpIdleCpuTime",
+ [97] = "VpWaitingForCpuTimeBucket7",
+ [98] = "VpWaitingForCpuTimeBucket8",
+ [99] = "VpWaitingForCpuTimeBucket9",
+ [100] = "VpWaitingForCpuTimeBucket10",
+ [101] = "VpWaitingForCpuTimeBucket11",
+ [102] = "VpWaitingForCpuTimeBucket12",
+ [103] = "VpHierarchicalSuspendTime",
+ [104] = "VpExpressSchedulingAttempts",
+ [105] = "VpExpressSchedulingCount",
+#endif
+};
diff --git a/drivers/hv/mshv_eventfd.c b/drivers/hv/mshv_eventfd.c
new file mode 100644
index 000000000000..90959f639dc3
--- /dev/null
+++ b/drivers/hv/mshv_eventfd.c
@@ -0,0 +1,853 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * eventfd support for mshv
+ *
+ * Heavily inspired from KVM implementation of irqfd/ioeventfd. The basic
+ * framework code is taken from the kvm implementation.
+ *
+ * All credits to kvm developers.
+ */
+
+#include <linux/syscalls.h>
+#include <linux/wait.h>
+#include <linux/poll.h>
+#include <linux/file.h>
+#include <linux/list.h>
+#include <linux/workqueue.h>
+#include <linux/eventfd.h>
+
+#if IS_ENABLED(CONFIG_X86_64)
+#include <asm/apic.h>
+#endif
+#include <asm/mshyperv.h>
+
+#include "mshv_eventfd.h"
+#include "mshv.h"
+#include "mshv_root.h"
+
+static struct workqueue_struct *irqfd_cleanup_wq;
+
+void mshv_register_irq_ack_notifier(struct mshv_partition *partition,
+ struct mshv_irq_ack_notifier *mian)
+{
+ mutex_lock(&partition->pt_irq_lock);
+ hlist_add_head_rcu(&mian->link, &partition->irq_ack_notifier_list);
+ mutex_unlock(&partition->pt_irq_lock);
+}
+
+void mshv_unregister_irq_ack_notifier(struct mshv_partition *partition,
+ struct mshv_irq_ack_notifier *mian)
+{
+ mutex_lock(&partition->pt_irq_lock);
+ hlist_del_init_rcu(&mian->link);
+ mutex_unlock(&partition->pt_irq_lock);
+ synchronize_rcu();
+}
+
+bool mshv_notify_acked_gsi(struct mshv_partition *partition, int gsi)
+{
+ struct mshv_irq_ack_notifier *mian;
+ bool acked = false;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(mian, &partition->irq_ack_notifier_list,
+ link) {
+ if (mian->irq_ack_gsi == gsi) {
+ mian->irq_acked(mian);
+ acked = true;
+ }
+ }
+ rcu_read_unlock();
+
+ return acked;
+}
+
+#if IS_ENABLED(CONFIG_ARM64)
+static inline bool hv_should_clear_interrupt(enum hv_interrupt_type type)
+{
+ return false;
+}
+#elif IS_ENABLED(CONFIG_X86_64)
+static inline bool hv_should_clear_interrupt(enum hv_interrupt_type type)
+{
+ return type == HV_X64_INTERRUPT_TYPE_EXTINT;
+}
+#endif
+
+static void mshv_irqfd_resampler_ack(struct mshv_irq_ack_notifier *mian)
+{
+ struct mshv_irqfd_resampler *resampler;
+ struct mshv_partition *partition;
+ struct mshv_irqfd *irqfd;
+ int idx;
+
+ resampler = container_of(mian, struct mshv_irqfd_resampler,
+ rsmplr_notifier);
+ partition = resampler->rsmplr_partn;
+
+ idx = srcu_read_lock(&partition->pt_irq_srcu);
+
+ hlist_for_each_entry_srcu(irqfd, &resampler->rsmplr_irqfd_list,
+ irqfd_resampler_hnode,
+ srcu_read_lock_held(&partition->pt_irq_srcu)) {
+ if (hv_should_clear_interrupt(irqfd->irqfd_lapic_irq.lapic_control.interrupt_type))
+ hv_call_clear_virtual_interrupt(partition->pt_id);
+
+ eventfd_signal(irqfd->irqfd_resamplefd);
+ }
+
+ srcu_read_unlock(&partition->pt_irq_srcu, idx);
+}
+
+#if IS_ENABLED(CONFIG_X86_64)
+static bool
+mshv_vp_irq_vector_injected(union hv_vp_register_page_interrupt_vectors iv,
+ u32 vector)
+{
+ int i;
+
+ for (i = 0; i < iv.vector_count; i++) {
+ if (iv.vector[i] == vector)
+ return true;
+ }
+
+ return false;
+}
+
+static int mshv_vp_irq_try_set_vector(struct mshv_vp *vp, u32 vector)
+{
+ union hv_vp_register_page_interrupt_vectors iv, new_iv;
+
+ iv = vp->vp_register_page->interrupt_vectors;
+ new_iv = iv;
+
+ if (mshv_vp_irq_vector_injected(iv, vector))
+ return 0;
+
+ if (iv.vector_count >= HV_VP_REGISTER_PAGE_MAX_VECTOR_COUNT)
+ return -ENOSPC;
+
+ new_iv.vector[new_iv.vector_count++] = vector;
+
+ if (!try_cmpxchg(&vp->vp_register_page->interrupt_vectors.as_uint64,
+ &iv.as_uint64, new_iv.as_uint64))
+ return -EAGAIN;
+
+ return 0;
+}
+
+static int mshv_vp_irq_set_vector(struct mshv_vp *vp, u32 vector)
+{
+ int ret;
+
+ do {
+ ret = mshv_vp_irq_try_set_vector(vp, vector);
+ } while (ret == -EAGAIN && !need_resched());
+
+ return ret;
+}
+
+/*
+ * Try to raise irq for guest via shared vector array. hyp does the actual
+ * inject of the interrupt.
+ */
+static int mshv_try_assert_irq_fast(struct mshv_irqfd *irqfd)
+{
+ struct mshv_partition *partition = irqfd->irqfd_partn;
+ struct mshv_lapic_irq *irq = &irqfd->irqfd_lapic_irq;
+ struct mshv_vp *vp;
+
+ if (!(ms_hyperv.ext_features &
+ HV_VP_DISPATCH_INTERRUPT_INJECTION_AVAILABLE))
+ return -EOPNOTSUPP;
+
+ if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT)
+ return -EOPNOTSUPP;
+
+#if IS_ENABLED(CONFIG_X86)
+ if (irq->lapic_control.logical_dest_mode)
+ return -EOPNOTSUPP;
+#endif
+
+ vp = partition->pt_vp_array[irq->lapic_apic_id];
+
+ if (!vp->vp_register_page)
+ return -EOPNOTSUPP;
+
+ if (mshv_vp_irq_set_vector(vp, irq->lapic_vector))
+ return -EINVAL;
+
+ if (vp->run.flags.root_sched_dispatched &&
+ vp->vp_register_page->interrupt_vectors.as_uint64)
+ return -EBUSY;
+
+ wake_up(&vp->run.vp_suspend_queue);
+
+ return 0;
+}
+#else /* CONFIG_X86_64 */
+static int mshv_try_assert_irq_fast(struct mshv_irqfd *irqfd)
+{
+ return -EOPNOTSUPP;
+}
+#endif
+
+static void mshv_assert_irq_slow(struct mshv_irqfd *irqfd)
+{
+ struct mshv_partition *partition = irqfd->irqfd_partn;
+ struct mshv_lapic_irq *irq = &irqfd->irqfd_lapic_irq;
+ unsigned int seq;
+ int idx;
+
+#if IS_ENABLED(CONFIG_X86)
+ WARN_ON(irqfd->irqfd_resampler &&
+ !irq->lapic_control.level_triggered);
+#endif
+
+ idx = srcu_read_lock(&partition->pt_irq_srcu);
+ if (irqfd->irqfd_girq_ent.guest_irq_num) {
+ if (!irqfd->irqfd_girq_ent.girq_entry_valid) {
+ srcu_read_unlock(&partition->pt_irq_srcu, idx);
+ return;
+ }
+
+ do {
+ seq = read_seqcount_begin(&irqfd->irqfd_irqe_sc);
+ } while (read_seqcount_retry(&irqfd->irqfd_irqe_sc, seq));
+ }
+
+ hv_call_assert_virtual_interrupt(irqfd->irqfd_partn->pt_id,
+ irq->lapic_vector, irq->lapic_apic_id,
+ irq->lapic_control);
+ srcu_read_unlock(&partition->pt_irq_srcu, idx);
+}
+
+static void mshv_irqfd_resampler_shutdown(struct mshv_irqfd *irqfd)
+{
+ struct mshv_irqfd_resampler *rp = irqfd->irqfd_resampler;
+ struct mshv_partition *pt = rp->rsmplr_partn;
+
+ mutex_lock(&pt->irqfds_resampler_lock);
+
+ hlist_del_rcu(&irqfd->irqfd_resampler_hnode);
+ synchronize_srcu(&pt->pt_irq_srcu);
+
+ if (hlist_empty(&rp->rsmplr_irqfd_list)) {
+ hlist_del(&rp->rsmplr_hnode);
+ mshv_unregister_irq_ack_notifier(pt, &rp->rsmplr_notifier);
+ kfree(rp);
+ }
+
+ mutex_unlock(&pt->irqfds_resampler_lock);
+}
+
+/*
+ * Race-free decouple logic (ordering is critical)
+ */
+static void mshv_irqfd_shutdown(struct work_struct *work)
+{
+ struct mshv_irqfd *irqfd =
+ container_of(work, struct mshv_irqfd, irqfd_shutdown);
+ u64 cnt;
+
+ /*
+ * Synchronize with the wait-queue and unhook ourselves to prevent
+ * further events.
+ */
+ eventfd_ctx_remove_wait_queue(irqfd->irqfd_eventfd_ctx, &irqfd->irqfd_wait, &cnt);
+
+ if (irqfd->irqfd_resampler) {
+ mshv_irqfd_resampler_shutdown(irqfd);
+ eventfd_ctx_put(irqfd->irqfd_resamplefd);
+ }
+
+ /*
+ * It is now safe to release the object's resources
+ */
+ eventfd_ctx_put(irqfd->irqfd_eventfd_ctx);
+ kfree(irqfd);
+}
+
+/* assumes partition->pt_irqfds_lock is held */
+static bool mshv_irqfd_is_active(struct mshv_irqfd *irqfd)
+{
+ return !hlist_unhashed(&irqfd->irqfd_hnode);
+}
+
+/*
+ * Mark the irqfd as inactive and schedule it for removal
+ *
+ * assumes partition->pt_irqfds_lock is held
+ */
+static void mshv_irqfd_deactivate(struct mshv_irqfd *irqfd)
+{
+ if (!mshv_irqfd_is_active(irqfd))
+ return;
+
+ hlist_del(&irqfd->irqfd_hnode);
+
+ queue_work(irqfd_cleanup_wq, &irqfd->irqfd_shutdown);
+}
+
+/*
+ * Called with wqh->lock held and interrupts disabled
+ */
+static int mshv_irqfd_wakeup(wait_queue_entry_t *wait, unsigned int mode,
+ int sync, void *key)
+{
+ struct mshv_irqfd *irqfd = container_of(wait, struct mshv_irqfd,
+ irqfd_wait);
+ __poll_t flags = key_to_poll(key);
+ int idx;
+ unsigned int seq;
+ struct mshv_partition *pt = irqfd->irqfd_partn;
+ int ret = 0;
+
+ if (flags & EPOLLIN) {
+ u64 cnt;
+
+ eventfd_ctx_do_read(irqfd->irqfd_eventfd_ctx, &cnt);
+ idx = srcu_read_lock(&pt->pt_irq_srcu);
+ do {
+ seq = read_seqcount_begin(&irqfd->irqfd_irqe_sc);
+ } while (read_seqcount_retry(&irqfd->irqfd_irqe_sc, seq));
+
+ /* An event has been signaled, raise an interrupt */
+ ret = mshv_try_assert_irq_fast(irqfd);
+ if (ret)
+ mshv_assert_irq_slow(irqfd);
+
+ srcu_read_unlock(&pt->pt_irq_srcu, idx);
+
+ ret = 1;
+ }
+
+ if (flags & EPOLLHUP) {
+ /* The eventfd is closing, detach from the partition */
+ unsigned long flags;
+
+ spin_lock_irqsave(&pt->pt_irqfds_lock, flags);
+
+ /*
+ * We must check if someone deactivated the irqfd before
+ * we could acquire the pt_irqfds_lock since the item is
+ * deactivated from the mshv side before it is unhooked from
+ * the wait-queue. If it is already deactivated, we can
+ * simply return knowing the other side will cleanup for us.
+ * We cannot race against the irqfd going away since the
+ * other side is required to acquire wqh->lock, which we hold
+ */
+ if (mshv_irqfd_is_active(irqfd))
+ mshv_irqfd_deactivate(irqfd);
+
+ spin_unlock_irqrestore(&pt->pt_irqfds_lock, flags);
+ }
+
+ return ret;
+}
+
+/* Must be called under pt_irqfds_lock */
+static void mshv_irqfd_update(struct mshv_partition *pt,
+ struct mshv_irqfd *irqfd)
+{
+ write_seqcount_begin(&irqfd->irqfd_irqe_sc);
+ irqfd->irqfd_girq_ent = mshv_ret_girq_entry(pt,
+ irqfd->irqfd_irqnum);
+ mshv_copy_girq_info(&irqfd->irqfd_girq_ent, &irqfd->irqfd_lapic_irq);
+ write_seqcount_end(&irqfd->irqfd_irqe_sc);
+}
+
+void mshv_irqfd_routing_update(struct mshv_partition *pt)
+{
+ struct mshv_irqfd *irqfd;
+
+ spin_lock_irq(&pt->pt_irqfds_lock);
+ hlist_for_each_entry(irqfd, &pt->pt_irqfds_list, irqfd_hnode)
+ mshv_irqfd_update(pt, irqfd);
+ spin_unlock_irq(&pt->pt_irqfds_lock);
+}
+
+static void mshv_irqfd_queue_proc(struct file *file, wait_queue_head_t *wqh,
+ poll_table *polltbl)
+{
+ struct mshv_irqfd *irqfd =
+ container_of(polltbl, struct mshv_irqfd, irqfd_polltbl);
+
+ /*
+ * TODO: Ensure there isn't already an exclusive, priority waiter, e.g.
+ * that the irqfd isn't already bound to another partition. Only the
+ * first exclusive waiter encountered will be notified, and
+ * add_wait_queue_priority() doesn't enforce exclusivity.
+ */
+ irqfd->irqfd_wait.flags |= WQ_FLAG_EXCLUSIVE;
+ add_wait_queue_priority(wqh, &irqfd->irqfd_wait);
+}
+
+static int mshv_irqfd_assign(struct mshv_partition *pt,
+ struct mshv_user_irqfd *args)
+{
+ struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL;
+ struct mshv_irqfd *irqfd, *tmp;
+ __poll_t events;
+ int ret;
+ int idx;
+
+ CLASS(fd, f)(args->fd);
+
+ irqfd = kzalloc_obj(*irqfd);
+ if (!irqfd)
+ return -ENOMEM;
+
+ irqfd->irqfd_partn = pt;
+ irqfd->irqfd_irqnum = args->gsi;
+ INIT_WORK(&irqfd->irqfd_shutdown, mshv_irqfd_shutdown);
+ seqcount_spinlock_init(&irqfd->irqfd_irqe_sc, &pt->pt_irqfds_lock);
+
+ if (fd_empty(f)) {
+ ret = -EBADF;
+ goto out;
+ }
+
+ eventfd = eventfd_ctx_fileget(fd_file(f));
+ if (IS_ERR(eventfd)) {
+ ret = PTR_ERR(eventfd);
+ goto fail;
+ }
+
+ irqfd->irqfd_eventfd_ctx = eventfd;
+
+ if (args->flags & BIT(MSHV_IRQFD_BIT_RESAMPLE)) {
+ struct mshv_irqfd_resampler *rp;
+
+ resamplefd = eventfd_ctx_fdget(args->resamplefd);
+ if (IS_ERR(resamplefd)) {
+ ret = PTR_ERR(resamplefd);
+ goto fail;
+ }
+
+ irqfd->irqfd_resamplefd = resamplefd;
+
+ mutex_lock(&pt->irqfds_resampler_lock);
+
+ hlist_for_each_entry(rp, &pt->irqfds_resampler_list,
+ rsmplr_hnode) {
+ if (rp->rsmplr_notifier.irq_ack_gsi ==
+ irqfd->irqfd_irqnum) {
+ irqfd->irqfd_resampler = rp;
+ break;
+ }
+ }
+
+ if (!irqfd->irqfd_resampler) {
+ rp = kzalloc_obj(*rp, GFP_KERNEL_ACCOUNT);
+ if (!rp) {
+ ret = -ENOMEM;
+ mutex_unlock(&pt->irqfds_resampler_lock);
+ goto fail;
+ }
+
+ rp->rsmplr_partn = pt;
+ INIT_HLIST_HEAD(&rp->rsmplr_irqfd_list);
+ rp->rsmplr_notifier.irq_ack_gsi = irqfd->irqfd_irqnum;
+ rp->rsmplr_notifier.irq_acked =
+ mshv_irqfd_resampler_ack;
+
+ hlist_add_head(&rp->rsmplr_hnode,
+ &pt->irqfds_resampler_list);
+ mshv_register_irq_ack_notifier(pt,
+ &rp->rsmplr_notifier);
+ irqfd->irqfd_resampler = rp;
+ }
+
+ hlist_add_head_rcu(&irqfd->irqfd_resampler_hnode,
+ &irqfd->irqfd_resampler->rsmplr_irqfd_list);
+
+ mutex_unlock(&pt->irqfds_resampler_lock);
+ }
+
+ /*
+ * Install our own custom wake-up handling so we are notified via
+ * a callback whenever someone signals the underlying eventfd
+ */
+ init_waitqueue_func_entry(&irqfd->irqfd_wait, mshv_irqfd_wakeup);
+ init_poll_funcptr(&irqfd->irqfd_polltbl, mshv_irqfd_queue_proc);
+
+ spin_lock_irq(&pt->pt_irqfds_lock);
+#if IS_ENABLED(CONFIG_X86)
+ if (args->flags & BIT(MSHV_IRQFD_BIT_RESAMPLE) &&
+ !irqfd->irqfd_lapic_irq.lapic_control.level_triggered) {
+ /*
+ * Resample Fd must be for level triggered interrupt
+ * Otherwise return with failure
+ */
+ spin_unlock_irq(&pt->pt_irqfds_lock);
+ ret = -EINVAL;
+ goto fail;
+ }
+#endif
+ ret = 0;
+ hlist_for_each_entry(tmp, &pt->pt_irqfds_list, irqfd_hnode) {
+ if (irqfd->irqfd_eventfd_ctx != tmp->irqfd_eventfd_ctx)
+ continue;
+ /* This fd is used for another irq already. */
+ ret = -EBUSY;
+ spin_unlock_irq(&pt->pt_irqfds_lock);
+ goto fail;
+ }
+
+ idx = srcu_read_lock(&pt->pt_irq_srcu);
+ mshv_irqfd_update(pt, irqfd);
+ hlist_add_head(&irqfd->irqfd_hnode, &pt->pt_irqfds_list);
+ spin_unlock_irq(&pt->pt_irqfds_lock);
+
+ /*
+ * Check if there was an event already pending on the eventfd
+ * before we registered, and trigger it as if we didn't miss it.
+ */
+ events = vfs_poll(fd_file(f), &irqfd->irqfd_polltbl);
+
+ if (events & EPOLLIN)
+ mshv_assert_irq_slow(irqfd);
+
+ srcu_read_unlock(&pt->pt_irq_srcu, idx);
+ return 0;
+
+fail:
+ if (irqfd->irqfd_resampler)
+ mshv_irqfd_resampler_shutdown(irqfd);
+
+ if (resamplefd && !IS_ERR(resamplefd))
+ eventfd_ctx_put(resamplefd);
+
+ if (eventfd && !IS_ERR(eventfd))
+ eventfd_ctx_put(eventfd);
+
+out:
+ kfree(irqfd);
+ return ret;
+}
+
+/*
+ * shutdown any irqfd's that match fd+gsi
+ */
+static int mshv_irqfd_deassign(struct mshv_partition *pt,
+ struct mshv_user_irqfd *args)
+{
+ struct mshv_irqfd *irqfd;
+ struct hlist_node *n;
+ struct eventfd_ctx *eventfd;
+
+ eventfd = eventfd_ctx_fdget(args->fd);
+ if (IS_ERR(eventfd))
+ return PTR_ERR(eventfd);
+
+ hlist_for_each_entry_safe(irqfd, n, &pt->pt_irqfds_list,
+ irqfd_hnode) {
+ if (irqfd->irqfd_eventfd_ctx == eventfd &&
+ irqfd->irqfd_irqnum == args->gsi)
+
+ mshv_irqfd_deactivate(irqfd);
+ }
+
+ eventfd_ctx_put(eventfd);
+
+ /*
+ * Block until we know all outstanding shutdown jobs have completed
+ * so that we guarantee there will not be any more interrupts on this
+ * gsi once this deassign function returns.
+ */
+ flush_workqueue(irqfd_cleanup_wq);
+
+ return 0;
+}
+
+int mshv_set_unset_irqfd(struct mshv_partition *pt,
+ struct mshv_user_irqfd *args)
+{
+ if (args->flags & ~MSHV_IRQFD_FLAGS_MASK)
+ return -EINVAL;
+
+ if (args->flags & BIT(MSHV_IRQFD_BIT_DEASSIGN))
+ return mshv_irqfd_deassign(pt, args);
+
+ return mshv_irqfd_assign(pt, args);
+}
+
+/*
+ * This function is called as the mshv VM fd is being released.
+ * Shutdown all irqfds that still remain open
+ */
+static void mshv_irqfd_release(struct mshv_partition *pt)
+{
+ struct mshv_irqfd *irqfd;
+ struct hlist_node *n;
+
+ spin_lock_irq(&pt->pt_irqfds_lock);
+
+ hlist_for_each_entry_safe(irqfd, n, &pt->pt_irqfds_list, irqfd_hnode)
+ mshv_irqfd_deactivate(irqfd);
+
+ spin_unlock_irq(&pt->pt_irqfds_lock);
+
+ /*
+ * Block until we know all outstanding shutdown jobs have completed
+ * since we do not take a mshv_partition* reference.
+ */
+ flush_workqueue(irqfd_cleanup_wq);
+}
+
+int mshv_irqfd_wq_init(void)
+{
+ irqfd_cleanup_wq = alloc_workqueue("mshv-irqfd-cleanup", WQ_PERCPU, 0);
+ if (!irqfd_cleanup_wq)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void mshv_irqfd_wq_cleanup(void)
+{
+ destroy_workqueue(irqfd_cleanup_wq);
+}
+
+/*
+ * --------------------------------------------------------------------
+ * ioeventfd: translate a MMIO memory write to an eventfd signal.
+ *
+ * userspace can register a MMIO address with an eventfd for receiving
+ * notification when the memory has been touched.
+ * --------------------------------------------------------------------
+ */
+
+static void ioeventfd_release(struct mshv_ioeventfd *p, u64 partition_id)
+{
+ if (p->iovntfd_doorbell_id > 0)
+ mshv_unregister_doorbell(partition_id, p->iovntfd_doorbell_id);
+ eventfd_ctx_put(p->iovntfd_eventfd);
+ kfree(p);
+}
+
+/* MMIO writes trigger an event if the addr/val match */
+static void ioeventfd_mmio_write(int doorbell_id, void *data)
+{
+ struct mshv_partition *partition = (struct mshv_partition *)data;
+ struct mshv_ioeventfd *p;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(p, &partition->ioeventfds_list, iovntfd_hnode)
+ if (p->iovntfd_doorbell_id == doorbell_id) {
+ eventfd_signal(p->iovntfd_eventfd);
+ break;
+ }
+
+ rcu_read_unlock();
+}
+
+static bool ioeventfd_check_collision(struct mshv_partition *pt,
+ struct mshv_ioeventfd *p)
+ __must_hold(&pt->mutex)
+{
+ struct mshv_ioeventfd *_p;
+
+ hlist_for_each_entry(_p, &pt->ioeventfds_list, iovntfd_hnode)
+ if (_p->iovntfd_addr == p->iovntfd_addr &&
+ _p->iovntfd_length == p->iovntfd_length &&
+ (_p->iovntfd_wildcard || p->iovntfd_wildcard ||
+ _p->iovntfd_datamatch == p->iovntfd_datamatch))
+ return true;
+
+ return false;
+}
+
+static int mshv_assign_ioeventfd(struct mshv_partition *pt,
+ struct mshv_user_ioeventfd *args)
+ __must_hold(&pt->mutex)
+{
+ struct mshv_ioeventfd *p;
+ struct eventfd_ctx *eventfd;
+ u64 doorbell_flags = 0;
+ int ret;
+
+ /* This mutex is currently protecting ioeventfd.items list */
+ WARN_ON_ONCE(!mutex_is_locked(&pt->pt_mutex));
+
+ if (args->flags & BIT(MSHV_IOEVENTFD_BIT_PIO))
+ return -EOPNOTSUPP;
+
+ /* must be natural-word sized */
+ switch (args->len) {
+ case 0:
+ doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_ANY;
+ break;
+ case 1:
+ doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_BYTE;
+ break;
+ case 2:
+ doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_WORD;
+ break;
+ case 4:
+ doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_DWORD;
+ break;
+ case 8:
+ doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_QWORD;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ /* check for range overflow */
+ if (args->addr + args->len < args->addr)
+ return -EINVAL;
+
+ /* check for extra flags that we don't understand */
+ if (args->flags & ~MSHV_IOEVENTFD_FLAGS_MASK)
+ return -EINVAL;
+
+ eventfd = eventfd_ctx_fdget(args->fd);
+ if (IS_ERR(eventfd))
+ return PTR_ERR(eventfd);
+
+ p = kzalloc_obj(*p);
+ if (!p) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ p->iovntfd_addr = args->addr;
+ p->iovntfd_length = args->len;
+ p->iovntfd_eventfd = eventfd;
+
+ /* The datamatch feature is optional, otherwise this is a wildcard */
+ if (args->flags & BIT(MSHV_IOEVENTFD_BIT_DATAMATCH)) {
+ p->iovntfd_datamatch = args->datamatch;
+ } else {
+ p->iovntfd_wildcard = true;
+ doorbell_flags |= HV_DOORBELL_FLAG_TRIGGER_ANY_VALUE;
+ }
+
+ if (ioeventfd_check_collision(pt, p)) {
+ ret = -EEXIST;
+ goto unlock_fail;
+ }
+
+ ret = mshv_register_doorbell(pt->pt_id, ioeventfd_mmio_write,
+ (void *)pt, p->iovntfd_addr,
+ p->iovntfd_datamatch, doorbell_flags);
+
+ trace_mshv_assign_ioeventfd(pt->pt_id, p->iovntfd_addr,
+ p->iovntfd_length,
+ p->iovntfd_datamatch,
+ p->iovntfd_wildcard,
+ p->iovntfd_eventfd,
+ ret);
+
+ if (ret < 0)
+ goto unlock_fail;
+
+ p->iovntfd_doorbell_id = ret;
+
+ hlist_add_head_rcu(&p->iovntfd_hnode, &pt->ioeventfds_list);
+
+ return 0;
+
+unlock_fail:
+ kfree(p);
+
+fail:
+ eventfd_ctx_put(eventfd);
+
+ return ret;
+}
+
+static int mshv_deassign_ioeventfd(struct mshv_partition *pt,
+ struct mshv_user_ioeventfd *args)
+ __must_hold(&pt->mutex)
+{
+ struct mshv_ioeventfd *p;
+ struct eventfd_ctx *eventfd;
+ struct hlist_node *n;
+ int ret = -ENOENT;
+
+ /* This mutex is currently protecting ioeventfd.items list */
+ WARN_ON_ONCE(!mutex_is_locked(&pt->pt_mutex));
+
+ eventfd = eventfd_ctx_fdget(args->fd);
+ if (IS_ERR(eventfd))
+ return PTR_ERR(eventfd);
+
+ hlist_for_each_entry_safe(p, n, &pt->ioeventfds_list, iovntfd_hnode) {
+ bool wildcard = !(args->flags & BIT(MSHV_IOEVENTFD_BIT_DATAMATCH));
+
+ if (p->iovntfd_eventfd != eventfd ||
+ p->iovntfd_addr != args->addr ||
+ p->iovntfd_length != args->len ||
+ p->iovntfd_wildcard != wildcard)
+ continue;
+
+ if (!p->iovntfd_wildcard &&
+ p->iovntfd_datamatch != args->datamatch)
+ continue;
+
+ trace_mshv_deassign_ioeventfd(pt->pt_id, p->iovntfd_addr,
+ p->iovntfd_length,
+ p->iovntfd_datamatch,
+ p->iovntfd_wildcard,
+ p->iovntfd_eventfd);
+
+ hlist_del_rcu(&p->iovntfd_hnode);
+ synchronize_rcu();
+ ioeventfd_release(p, pt->pt_id);
+ ret = 0;
+ break;
+ }
+
+ eventfd_ctx_put(eventfd);
+
+ return ret;
+}
+
+int mshv_set_unset_ioeventfd(struct mshv_partition *pt,
+ struct mshv_user_ioeventfd *args)
+ __must_hold(&pt->mutex)
+{
+ if ((args->flags & ~MSHV_IOEVENTFD_FLAGS_MASK) ||
+ mshv_field_nonzero(*args, rsvd))
+ return -EINVAL;
+
+ /* PIO not yet implemented */
+ if (args->flags & BIT(MSHV_IOEVENTFD_BIT_PIO))
+ return -EOPNOTSUPP;
+
+ if (args->flags & BIT(MSHV_IOEVENTFD_BIT_DEASSIGN))
+ return mshv_deassign_ioeventfd(pt, args);
+
+ return mshv_assign_ioeventfd(pt, args);
+}
+
+void mshv_eventfd_init(struct mshv_partition *pt)
+{
+ spin_lock_init(&pt->pt_irqfds_lock);
+ INIT_HLIST_HEAD(&pt->pt_irqfds_list);
+
+ INIT_HLIST_HEAD(&pt->irqfds_resampler_list);
+ mutex_init(&pt->irqfds_resampler_lock);
+
+ INIT_HLIST_HEAD(&pt->ioeventfds_list);
+}
+
+void mshv_eventfd_release(struct mshv_partition *pt)
+{
+ struct hlist_head items;
+ struct hlist_node *n;
+ struct mshv_ioeventfd *p;
+
+ hlist_move_list(&pt->ioeventfds_list, &items);
+ synchronize_rcu();
+
+ hlist_for_each_entry_safe(p, n, &items, iovntfd_hnode) {
+ hlist_del(&p->iovntfd_hnode);
+ ioeventfd_release(p, pt->pt_id);
+ }
+
+ mshv_irqfd_release(pt);
+}
diff --git a/drivers/hv/mshv_eventfd.h b/drivers/hv/mshv_eventfd.h
new file mode 100644
index 000000000000..464c6b81ab33
--- /dev/null
+++ b/drivers/hv/mshv_eventfd.h
@@ -0,0 +1,70 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * irqfd: Allows an fd to be used to inject an interrupt to the guest.
+ * ioeventfd: Allow an fd to be used to receive a signal from the guest.
+ * All credit goes to kvm developers.
+ */
+
+#ifndef __LINUX_MSHV_EVENTFD_H
+#define __LINUX_MSHV_EVENTFD_H
+
+#include <linux/poll.h>
+
+#include "mshv.h"
+#include "mshv_root.h"
+
+/* struct to contain list of irqfds sharing an irq. Updates are protected by
+ * partition.irqfds.resampler_lock
+ */
+struct mshv_irqfd_resampler {
+ struct mshv_partition *rsmplr_partn;
+ struct hlist_head rsmplr_irqfd_list;
+ struct mshv_irq_ack_notifier rsmplr_notifier;
+ struct hlist_node rsmplr_hnode;
+};
+
+struct mshv_irqfd {
+ struct mshv_partition *irqfd_partn;
+ struct eventfd_ctx *irqfd_eventfd_ctx;
+ struct mshv_guest_irq_ent irqfd_girq_ent;
+ seqcount_spinlock_t irqfd_irqe_sc;
+ u32 irqfd_irqnum;
+ struct mshv_lapic_irq irqfd_lapic_irq;
+ struct hlist_node irqfd_hnode;
+ poll_table irqfd_polltbl;
+ wait_queue_entry_t irqfd_wait;
+ struct work_struct irqfd_shutdown;
+ struct mshv_irqfd_resampler *irqfd_resampler;
+ struct eventfd_ctx *irqfd_resamplefd;
+ struct hlist_node irqfd_resampler_hnode;
+};
+
+void mshv_eventfd_init(struct mshv_partition *partition);
+void mshv_eventfd_release(struct mshv_partition *partition);
+
+void mshv_register_irq_ack_notifier(struct mshv_partition *partition,
+ struct mshv_irq_ack_notifier *mian);
+void mshv_unregister_irq_ack_notifier(struct mshv_partition *partition,
+ struct mshv_irq_ack_notifier *mian);
+bool mshv_notify_acked_gsi(struct mshv_partition *partition, int gsi);
+
+int mshv_set_unset_irqfd(struct mshv_partition *partition,
+ struct mshv_user_irqfd *args);
+
+int mshv_irqfd_wq_init(void);
+void mshv_irqfd_wq_cleanup(void);
+
+struct mshv_ioeventfd {
+ struct hlist_node iovntfd_hnode;
+ u64 iovntfd_addr;
+ int iovntfd_length;
+ struct eventfd_ctx *iovntfd_eventfd;
+ u64 iovntfd_datamatch;
+ int iovntfd_doorbell_id;
+ bool iovntfd_wildcard;
+};
+
+int mshv_set_unset_ioeventfd(struct mshv_partition *pt,
+ struct mshv_user_ioeventfd *args);
+
+#endif /* __LINUX_MSHV_EVENTFD_H */
diff --git a/drivers/hv/mshv_irq.c b/drivers/hv/mshv_irq.c
new file mode 100644
index 000000000000..b3142c84dcbc
--- /dev/null
+++ b/drivers/hv/mshv_irq.c
@@ -0,0 +1,132 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2023, Microsoft Corporation.
+ *
+ * Authors: Microsoft Linux virtualization team
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <asm/mshyperv.h>
+
+#include "mshv_eventfd.h"
+#include "mshv.h"
+#include "mshv_root.h"
+
+/* called from the ioctl code, user wants to update the guest irq table */
+int mshv_update_routing_table(struct mshv_partition *partition,
+ const struct mshv_user_irq_entry *ue,
+ unsigned int numents)
+{
+ struct mshv_girq_routing_table *new = NULL, *old;
+ u32 i, nr_rt_entries = 0;
+ int r = 0;
+
+ if (numents == 0)
+ goto swap_routes;
+
+ for (i = 0; i < numents; i++) {
+ if (ue[i].gsi >= MSHV_MAX_GUEST_IRQS)
+ return -EINVAL;
+
+ if (ue[i].address_hi)
+ return -EINVAL;
+
+ nr_rt_entries = max(nr_rt_entries, ue[i].gsi);
+ }
+ nr_rt_entries += 1;
+
+ new = kzalloc_flex(*new, mshv_girq_info_tbl, nr_rt_entries,
+ GFP_KERNEL_ACCOUNT);
+ if (!new)
+ return -ENOMEM;
+
+ new->num_rt_entries = nr_rt_entries;
+ for (i = 0; i < numents; i++) {
+ struct mshv_guest_irq_ent *girq;
+
+ girq = &new->mshv_girq_info_tbl[ue[i].gsi];
+
+ /*
+ * Allow only one to one mapping between GSI and MSI routing.
+ */
+ if (girq->guest_irq_num != 0) {
+ r = -EINVAL;
+ goto out;
+ }
+
+ girq->guest_irq_num = ue[i].gsi;
+ girq->girq_addr_lo = ue[i].address_lo;
+ girq->girq_addr_hi = ue[i].address_hi;
+ girq->girq_irq_data = ue[i].data;
+ girq->girq_entry_valid = true;
+ }
+
+swap_routes:
+ mutex_lock(&partition->pt_irq_lock);
+ old = rcu_dereference_protected(partition->pt_girq_tbl, 1);
+ rcu_assign_pointer(partition->pt_girq_tbl, new);
+ mshv_irqfd_routing_update(partition);
+ mutex_unlock(&partition->pt_irq_lock);
+
+ synchronize_srcu_expedited(&partition->pt_irq_srcu);
+
+ trace_mshv_update_routing_table(partition->pt_id,
+ old, new, numents);
+
+ new = old;
+
+out:
+ kfree(new);
+
+ return r;
+}
+
+/* vm is going away, kfree the irq routing table */
+void mshv_free_routing_table(struct mshv_partition *partition)
+{
+ struct mshv_girq_routing_table *rt =
+ rcu_access_pointer(partition->pt_girq_tbl);
+
+ kfree(rt);
+}
+
+struct mshv_guest_irq_ent
+mshv_ret_girq_entry(struct mshv_partition *partition, u32 irqnum)
+{
+ struct mshv_guest_irq_ent entry = { 0 };
+ struct mshv_girq_routing_table *girq_tbl;
+
+ girq_tbl = srcu_dereference_check(partition->pt_girq_tbl,
+ &partition->pt_irq_srcu,
+ lockdep_is_held(&partition->pt_irq_lock));
+ if (!girq_tbl || irqnum >= girq_tbl->num_rt_entries) {
+ /*
+ * Premature register_irqfd, setting valid_entry = 0
+ * would ignore this entry anyway
+ */
+ entry.guest_irq_num = irqnum;
+ return entry;
+ }
+
+ return girq_tbl->mshv_girq_info_tbl[irqnum];
+}
+
+void mshv_copy_girq_info(struct mshv_guest_irq_ent *ent,
+ struct mshv_lapic_irq *lirq)
+{
+ memset(lirq, 0, sizeof(*lirq));
+ if (!ent || !ent->girq_entry_valid)
+ return;
+
+ lirq->lapic_vector = ent->girq_irq_data & 0xFF;
+ lirq->lapic_apic_id = (ent->girq_addr_lo >> 12) & 0xFF;
+ lirq->lapic_control.interrupt_type = (ent->girq_irq_data & 0x700) >> 8;
+#if IS_ENABLED(CONFIG_X86)
+ lirq->lapic_control.level_triggered = (ent->girq_irq_data >> 15) & 0x1;
+ lirq->lapic_control.logical_dest_mode = (ent->girq_addr_lo >> 2) & 0x1;
+#elif IS_ENABLED(CONFIG_ARM64)
+ lirq->lapic_control.asserted = 1;
+#endif
+}
diff --git a/drivers/hv/mshv_portid_table.c b/drivers/hv/mshv_portid_table.c
new file mode 100644
index 000000000000..c349af1f0aaa
--- /dev/null
+++ b/drivers/hv/mshv_portid_table.c
@@ -0,0 +1,83 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/idr.h>
+#include <asm/mshyperv.h>
+
+#include "mshv.h"
+#include "mshv_root.h"
+
+/*
+ * Ports and connections are hypervisor struct used for inter-partition
+ * communication. Port represents the source and connection represents
+ * the destination. Partitions are responsible for managing the port and
+ * connection ids.
+ *
+ */
+
+#define PORTID_MIN 1
+#define PORTID_MAX INT_MAX
+
+static DEFINE_IDR(port_table_idr);
+
+void
+mshv_port_table_fini(void)
+{
+ struct port_table_info *port_info;
+ unsigned long i, tmp;
+
+ idr_lock(&port_table_idr);
+ if (!idr_is_empty(&port_table_idr)) {
+ idr_for_each_entry_ul(&port_table_idr, port_info, tmp, i) {
+ port_info = idr_remove(&port_table_idr, i);
+ kfree_rcu(port_info, portbl_rcu);
+ }
+ }
+ idr_unlock(&port_table_idr);
+}
+
+int
+mshv_portid_alloc(struct port_table_info *info)
+{
+ int ret = 0;
+
+ idr_lock(&port_table_idr);
+ ret = idr_alloc(&port_table_idr, info, PORTID_MIN,
+ PORTID_MAX, GFP_KERNEL);
+ idr_unlock(&port_table_idr);
+
+ return ret;
+}
+
+void
+mshv_portid_free(int port_id)
+{
+ struct port_table_info *info;
+
+ idr_lock(&port_table_idr);
+ info = idr_remove(&port_table_idr, port_id);
+ WARN_ON(!info);
+ idr_unlock(&port_table_idr);
+
+ synchronize_rcu();
+ kfree(info);
+}
+
+int
+mshv_portid_lookup(int port_id, struct port_table_info *info)
+{
+ struct port_table_info *_info;
+ int ret = -ENOENT;
+
+ rcu_read_lock();
+ _info = idr_find(&port_table_idr, port_id);
+ rcu_read_unlock();
+
+ if (_info) {
+ *info = *_info;
+ ret = 0;
+ }
+
+ return ret;
+}
diff --git a/drivers/hv/mshv_regions.c b/drivers/hv/mshv_regions.c
new file mode 100644
index 000000000000..fdffd4f002f6
--- /dev/null
+++ b/drivers/hv/mshv_regions.c
@@ -0,0 +1,590 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2025, Microsoft Corporation.
+ *
+ * Memory region management for mshv_root module.
+ *
+ * Authors: Microsoft Linux virtualization team
+ */
+
+#include <linux/hmm.h>
+#include <linux/hyperv.h>
+#include <linux/kref.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+
+#include <asm/mshyperv.h>
+
+#include "mshv_root.h"
+
+#define MSHV_MAP_FAULT_IN_PAGES PTRS_PER_PMD
+
+/**
+ * mshv_chunk_stride - Compute stride for mapping guest memory
+ * @page : The page to check for huge page backing
+ * @gfn : Guest frame number for the mapping
+ * @page_count: Total number of pages in the mapping
+ *
+ * Determines the appropriate stride (in pages) for mapping guest memory.
+ * Uses huge page stride if the backing page is huge and the guest mapping
+ * is properly aligned; otherwise falls back to single page stride.
+ *
+ * Return: Stride in pages, or -EINVAL if page order is unsupported.
+ */
+static int mshv_chunk_stride(struct page *page,
+ u64 gfn, u64 page_count)
+{
+ unsigned int page_order;
+
+ /*
+ * Use single page stride by default. For huge page stride, the
+ * page must be compound and point to the head of the compound
+ * page, and both gfn and page_count must be huge-page aligned.
+ */
+ if (!PageCompound(page) || !PageHead(page) ||
+ !IS_ALIGNED(gfn, PTRS_PER_PMD) ||
+ !IS_ALIGNED(page_count, PTRS_PER_PMD))
+ return 1;
+
+ page_order = folio_order(page_folio(page));
+ /* The hypervisor only supports 2M huge page */
+ if (page_order != PMD_ORDER)
+ return -EINVAL;
+
+ return 1 << page_order;
+}
+
+/**
+ * mshv_region_process_chunk - Processes a contiguous chunk of memory pages
+ * in a region.
+ * @region : Pointer to the memory region structure.
+ * @flags : Flags to pass to the handler.
+ * @page_offset: Offset into the region's pages array to start processing.
+ * @page_count : Number of pages to process.
+ * @handler : Callback function to handle the chunk.
+ *
+ * This function scans the region's pages starting from @page_offset,
+ * checking for contiguous present pages of the same size (normal or huge).
+ * It invokes @handler for the chunk of contiguous pages found. Returns the
+ * number of pages handled, or a negative error code if the first page is
+ * not present or the handler fails.
+ *
+ * Note: The @handler callback must be able to handle both normal and huge
+ * pages.
+ *
+ * Return: Number of pages handled, or negative error code.
+ */
+static long mshv_region_process_chunk(struct mshv_mem_region *region,
+ u32 flags,
+ u64 page_offset, u64 page_count,
+ int (*handler)(struct mshv_mem_region *region,
+ u32 flags,
+ u64 page_offset,
+ u64 page_count,
+ bool huge_page))
+{
+ u64 gfn = region->start_gfn + page_offset;
+ u64 count;
+ struct page *page;
+ int stride, ret;
+
+ page = region->mreg_pages[page_offset];
+ if (!page)
+ return -EINVAL;
+
+ stride = mshv_chunk_stride(page, gfn, page_count);
+ if (stride < 0)
+ return stride;
+
+ /* Start at stride since the first stride is validated */
+ for (count = stride; count < page_count; count += stride) {
+ page = region->mreg_pages[page_offset + count];
+
+ /* Break if current page is not present */
+ if (!page)
+ break;
+
+ /* Break if stride size changes */
+ if (stride != mshv_chunk_stride(page, gfn + count,
+ page_count - count))
+ break;
+ }
+
+ ret = handler(region, flags, page_offset, count, stride > 1);
+ if (ret)
+ return ret;
+
+ return count;
+}
+
+/**
+ * mshv_region_process_range - Processes a range of memory pages in a
+ * region.
+ * @region : Pointer to the memory region structure.
+ * @flags : Flags to pass to the handler.
+ * @page_offset: Offset into the region's pages array to start processing.
+ * @page_count : Number of pages to process.
+ * @handler : Callback function to handle each chunk of contiguous
+ * pages.
+ *
+ * Iterates over the specified range of pages in @region, skipping
+ * non-present pages. For each contiguous chunk of present pages, invokes
+ * @handler via mshv_region_process_chunk.
+ *
+ * Note: The @handler callback must be able to handle both normal and huge
+ * pages.
+ *
+ * Returns 0 on success, or a negative error code on failure.
+ */
+static int mshv_region_process_range(struct mshv_mem_region *region,
+ u32 flags,
+ u64 page_offset, u64 page_count,
+ int (*handler)(struct mshv_mem_region *region,
+ u32 flags,
+ u64 page_offset,
+ u64 page_count,
+ bool huge_page))
+{
+ long ret;
+
+ if (page_offset + page_count > region->nr_pages)
+ return -EINVAL;
+
+ while (page_count) {
+ /* Skip non-present pages */
+ if (!region->mreg_pages[page_offset]) {
+ page_offset++;
+ page_count--;
+ continue;
+ }
+
+ ret = mshv_region_process_chunk(region, flags,
+ page_offset,
+ page_count,
+ handler);
+ if (ret < 0)
+ return ret;
+
+ page_offset += ret;
+ page_count -= ret;
+ }
+
+ return 0;
+}
+
+struct mshv_mem_region *mshv_region_create(u64 guest_pfn, u64 nr_pages,
+ u64 uaddr, u32 flags)
+{
+ struct mshv_mem_region *region;
+
+ region = vzalloc(sizeof(*region) + sizeof(struct page *) * nr_pages);
+ if (!region)
+ return ERR_PTR(-ENOMEM);
+
+ region->nr_pages = nr_pages;
+ region->start_gfn = guest_pfn;
+ region->start_uaddr = uaddr;
+ region->hv_map_flags = HV_MAP_GPA_READABLE | HV_MAP_GPA_ADJUSTABLE;
+ if (flags & BIT(MSHV_SET_MEM_BIT_WRITABLE))
+ region->hv_map_flags |= HV_MAP_GPA_WRITABLE;
+ if (flags & BIT(MSHV_SET_MEM_BIT_EXECUTABLE))
+ region->hv_map_flags |= HV_MAP_GPA_EXECUTABLE;
+
+ kref_init(&region->mreg_refcount);
+
+ return region;
+}
+
+static int mshv_region_chunk_share(struct mshv_mem_region *region,
+ u32 flags,
+ u64 page_offset, u64 page_count,
+ bool huge_page)
+{
+ if (huge_page)
+ flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE;
+
+ return hv_call_modify_spa_host_access(region->partition->pt_id,
+ region->mreg_pages + page_offset,
+ page_count,
+ HV_MAP_GPA_READABLE |
+ HV_MAP_GPA_WRITABLE,
+ flags, true);
+}
+
+int mshv_region_share(struct mshv_mem_region *region)
+{
+ u32 flags = HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_SHARED;
+
+ return mshv_region_process_range(region, flags,
+ 0, region->nr_pages,
+ mshv_region_chunk_share);
+}
+
+static int mshv_region_chunk_unshare(struct mshv_mem_region *region,
+ u32 flags,
+ u64 page_offset, u64 page_count,
+ bool huge_page)
+{
+ if (huge_page)
+ flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE;
+
+ return hv_call_modify_spa_host_access(region->partition->pt_id,
+ region->mreg_pages + page_offset,
+ page_count, 0,
+ flags, false);
+}
+
+int mshv_region_unshare(struct mshv_mem_region *region)
+{
+ u32 flags = HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_EXCLUSIVE;
+
+ return mshv_region_process_range(region, flags,
+ 0, region->nr_pages,
+ mshv_region_chunk_unshare);
+}
+
+static int mshv_region_chunk_remap(struct mshv_mem_region *region,
+ u32 flags,
+ u64 page_offset, u64 page_count,
+ bool huge_page)
+{
+ if (huge_page)
+ flags |= HV_MAP_GPA_LARGE_PAGE;
+
+ return hv_call_map_gpa_pages(region->partition->pt_id,
+ region->start_gfn + page_offset,
+ page_count, flags,
+ region->mreg_pages + page_offset);
+}
+
+static int mshv_region_remap_pages(struct mshv_mem_region *region,
+ u32 map_flags,
+ u64 page_offset, u64 page_count)
+{
+ return mshv_region_process_range(region, map_flags,
+ page_offset, page_count,
+ mshv_region_chunk_remap);
+}
+
+int mshv_region_map(struct mshv_mem_region *region)
+{
+ u32 map_flags = region->hv_map_flags;
+
+ return mshv_region_remap_pages(region, map_flags,
+ 0, region->nr_pages);
+}
+
+static void mshv_region_invalidate_pages(struct mshv_mem_region *region,
+ u64 page_offset, u64 page_count)
+{
+ if (region->mreg_type == MSHV_REGION_TYPE_MEM_PINNED)
+ unpin_user_pages(region->mreg_pages + page_offset, page_count);
+
+ memset(region->mreg_pages + page_offset, 0,
+ page_count * sizeof(struct page *));
+}
+
+void mshv_region_invalidate(struct mshv_mem_region *region)
+{
+ mshv_region_invalidate_pages(region, 0, region->nr_pages);
+}
+
+int mshv_region_pin(struct mshv_mem_region *region)
+{
+ u64 done_count, nr_pages;
+ struct page **pages;
+ __u64 userspace_addr;
+ int ret;
+
+ for (done_count = 0; done_count < region->nr_pages; done_count += ret) {
+ pages = region->mreg_pages + done_count;
+ userspace_addr = region->start_uaddr +
+ done_count * HV_HYP_PAGE_SIZE;
+ nr_pages = min(region->nr_pages - done_count,
+ MSHV_PIN_PAGES_BATCH_SIZE);
+
+ /*
+ * Pinning assuming 4k pages works for large pages too.
+ * All page structs within the large page are returned.
+ *
+ * Pin requests are batched because pin_user_pages_fast
+ * with the FOLL_LONGTERM flag does a large temporary
+ * allocation of contiguous memory.
+ */
+ ret = pin_user_pages_fast(userspace_addr, nr_pages,
+ FOLL_WRITE | FOLL_LONGTERM,
+ pages);
+ if (ret != nr_pages)
+ goto release_pages;
+ }
+
+ return 0;
+
+release_pages:
+ if (ret > 0)
+ done_count += ret;
+ mshv_region_invalidate_pages(region, 0, done_count);
+ return ret < 0 ? ret : -ENOMEM;
+}
+
+static int mshv_region_chunk_unmap(struct mshv_mem_region *region,
+ u32 flags,
+ u64 page_offset, u64 page_count,
+ bool huge_page)
+{
+ if (huge_page)
+ flags |= HV_UNMAP_GPA_LARGE_PAGE;
+
+ return hv_call_unmap_gpa_pages(region->partition->pt_id,
+ region->start_gfn + page_offset,
+ page_count, flags);
+}
+
+static int mshv_region_unmap(struct mshv_mem_region *region)
+{
+ return mshv_region_process_range(region, 0,
+ 0, region->nr_pages,
+ mshv_region_chunk_unmap);
+}
+
+static void mshv_region_destroy(struct kref *ref)
+{
+ struct mshv_mem_region *region =
+ container_of(ref, struct mshv_mem_region, mreg_refcount);
+ struct mshv_partition *partition = region->partition;
+ int ret;
+
+ if (region->mreg_type == MSHV_REGION_TYPE_MEM_MOVABLE)
+ mshv_region_movable_fini(region);
+
+ if (mshv_partition_encrypted(partition)) {
+ ret = mshv_region_share(region);
+ if (ret) {
+ pt_err(partition,
+ "Failed to regain access to memory, unpinning user pages will fail and crash the host error: %d\n",
+ ret);
+ return;
+ }
+ }
+
+ mshv_region_unmap(region);
+
+ mshv_region_invalidate(region);
+
+ vfree(region);
+}
+
+void mshv_region_put(struct mshv_mem_region *region)
+{
+ kref_put(&region->mreg_refcount, mshv_region_destroy);
+}
+
+int mshv_region_get(struct mshv_mem_region *region)
+{
+ return kref_get_unless_zero(&region->mreg_refcount);
+}
+
+/**
+ * mshv_region_hmm_fault_and_lock - Handle HMM faults and lock the memory region
+ * @region: Pointer to the memory region structure
+ * @range: Pointer to the HMM range structure
+ *
+ * This function performs the following steps:
+ * 1. Reads the notifier sequence for the HMM range.
+ * 2. Acquires a read lock on the memory map.
+ * 3. Handles HMM faults for the specified range.
+ * 4. Releases the read lock on the memory map.
+ * 5. If successful, locks the memory region mutex.
+ * 6. Verifies if the notifier sequence has changed during the operation.
+ * If it has, releases the mutex and returns -EBUSY to match with
+ * hmm_range_fault() return code for repeating.
+ *
+ * Return: 0 on success, a negative error code otherwise.
+ */
+static int mshv_region_hmm_fault_and_lock(struct mshv_mem_region *region,
+ struct hmm_range *range)
+{
+ int ret;
+
+ range->notifier_seq = mmu_interval_read_begin(range->notifier);
+ mmap_read_lock(region->mreg_mni.mm);
+ ret = hmm_range_fault(range);
+ mmap_read_unlock(region->mreg_mni.mm);
+ if (ret)
+ return ret;
+
+ mutex_lock(&region->mreg_mutex);
+
+ if (mmu_interval_read_retry(range->notifier, range->notifier_seq)) {
+ mutex_unlock(&region->mreg_mutex);
+ cond_resched();
+ return -EBUSY;
+ }
+
+ return 0;
+}
+
+/**
+ * mshv_region_range_fault - Handle memory range faults for a given region.
+ * @region: Pointer to the memory region structure.
+ * @page_offset: Offset of the page within the region.
+ * @page_count: Number of pages to handle.
+ *
+ * This function resolves memory faults for a specified range of pages
+ * within a memory region. It uses HMM (Heterogeneous Memory Management)
+ * to fault in the required pages and updates the region's page array.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+static int mshv_region_range_fault(struct mshv_mem_region *region,
+ u64 page_offset, u64 page_count)
+{
+ struct hmm_range range = {
+ .notifier = &region->mreg_mni,
+ .default_flags = HMM_PFN_REQ_FAULT | HMM_PFN_REQ_WRITE,
+ };
+ unsigned long *pfns;
+ int ret;
+ u64 i;
+
+ pfns = kmalloc_array(page_count, sizeof(*pfns), GFP_KERNEL);
+ if (!pfns)
+ return -ENOMEM;
+
+ range.hmm_pfns = pfns;
+ range.start = region->start_uaddr + page_offset * HV_HYP_PAGE_SIZE;
+ range.end = range.start + page_count * HV_HYP_PAGE_SIZE;
+
+ do {
+ ret = mshv_region_hmm_fault_and_lock(region, &range);
+ } while (ret == -EBUSY);
+
+ if (ret)
+ goto out;
+
+ for (i = 0; i < page_count; i++)
+ region->mreg_pages[page_offset + i] = hmm_pfn_to_page(pfns[i]);
+
+ ret = mshv_region_remap_pages(region, region->hv_map_flags,
+ page_offset, page_count);
+
+ mutex_unlock(&region->mreg_mutex);
+out:
+ kfree(pfns);
+ return ret;
+}
+
+bool mshv_region_handle_gfn_fault(struct mshv_mem_region *region, u64 gfn)
+{
+ u64 page_offset, page_count;
+ int ret;
+
+ /* Align the page offset to the nearest MSHV_MAP_FAULT_IN_PAGES. */
+ page_offset = ALIGN_DOWN(gfn - region->start_gfn,
+ MSHV_MAP_FAULT_IN_PAGES);
+
+ /* Map more pages than requested to reduce the number of faults. */
+ page_count = min(region->nr_pages - page_offset,
+ MSHV_MAP_FAULT_IN_PAGES);
+
+ ret = mshv_region_range_fault(region, page_offset, page_count);
+
+ WARN_ONCE(ret,
+ "p%llu: GPA intercept failed: region %#llx-%#llx, gfn %#llx, page_offset %llu, page_count %llu\n",
+ region->partition->pt_id, region->start_uaddr,
+ region->start_uaddr + (region->nr_pages << HV_HYP_PAGE_SHIFT),
+ gfn, page_offset, page_count);
+
+ return !ret;
+}
+
+/**
+ * mshv_region_interval_invalidate - Invalidate a range of memory region
+ * @mni: Pointer to the mmu_interval_notifier structure
+ * @range: Pointer to the mmu_notifier_range structure
+ * @cur_seq: Current sequence number for the interval notifier
+ *
+ * This function invalidates a memory region by remapping its pages with
+ * no access permissions. It locks the region's mutex to ensure thread safety
+ * and updates the sequence number for the interval notifier. If the range
+ * is blockable, it uses a blocking lock; otherwise, it attempts a non-blocking
+ * lock and returns false if unsuccessful.
+ *
+ * NOTE: Failure to invalidate a region is a serious error, as the pages will
+ * be considered freed while they are still mapped by the hypervisor.
+ * Any attempt to access such pages will likely crash the system.
+ *
+ * Return: true if the region was successfully invalidated, false otherwise.
+ */
+static bool mshv_region_interval_invalidate(struct mmu_interval_notifier *mni,
+ const struct mmu_notifier_range *range,
+ unsigned long cur_seq)
+{
+ struct mshv_mem_region *region = container_of(mni,
+ struct mshv_mem_region,
+ mreg_mni);
+ u64 page_offset, page_count;
+ unsigned long mstart, mend;
+ int ret = -EPERM;
+
+ mstart = max(range->start, region->start_uaddr);
+ mend = min(range->end, region->start_uaddr +
+ (region->nr_pages << HV_HYP_PAGE_SHIFT));
+
+ page_offset = HVPFN_DOWN(mstart - region->start_uaddr);
+ page_count = HVPFN_DOWN(mend - mstart);
+
+ if (mmu_notifier_range_blockable(range))
+ mutex_lock(&region->mreg_mutex);
+ else if (!mutex_trylock(&region->mreg_mutex))
+ goto out_fail;
+
+ mmu_interval_set_seq(mni, cur_seq);
+
+ ret = mshv_region_remap_pages(region, HV_MAP_GPA_NO_ACCESS,
+ page_offset, page_count);
+ if (ret)
+ goto out_unlock;
+
+ mshv_region_invalidate_pages(region, page_offset, page_count);
+
+ mutex_unlock(&region->mreg_mutex);
+
+ return true;
+
+out_unlock:
+ mutex_unlock(&region->mreg_mutex);
+out_fail:
+ WARN_ONCE(ret,
+ "Failed to invalidate region %#llx-%#llx (range %#lx-%#lx, event: %u, pages %#llx-%#llx, mm: %#llx): %d\n",
+ region->start_uaddr,
+ region->start_uaddr + (region->nr_pages << HV_HYP_PAGE_SHIFT),
+ range->start, range->end, range->event,
+ page_offset, page_offset + page_count - 1, (u64)range->mm, ret);
+ return false;
+}
+
+static const struct mmu_interval_notifier_ops mshv_region_mni_ops = {
+ .invalidate = mshv_region_interval_invalidate,
+};
+
+void mshv_region_movable_fini(struct mshv_mem_region *region)
+{
+ mmu_interval_notifier_remove(&region->mreg_mni);
+}
+
+bool mshv_region_movable_init(struct mshv_mem_region *region)
+{
+ int ret;
+
+ ret = mmu_interval_notifier_insert(&region->mreg_mni, current->mm,
+ region->start_uaddr,
+ region->nr_pages << HV_HYP_PAGE_SHIFT,
+ &mshv_region_mni_ops);
+ if (ret)
+ return false;
+
+ mutex_init(&region->mreg_mutex);
+
+ return true;
+}
diff --git a/drivers/hv/mshv_root.h b/drivers/hv/mshv_root.h
new file mode 100644
index 000000000000..1f086dcb7aa1
--- /dev/null
+++ b/drivers/hv/mshv_root.h
@@ -0,0 +1,381 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2023, Microsoft Corporation.
+ */
+
+#ifndef _MSHV_ROOT_H_
+#define _MSHV_ROOT_H_
+
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/semaphore.h>
+#include <linux/sched.h>
+#include <linux/srcu.h>
+#include <linux/wait.h>
+#include <linux/hashtable.h>
+#include <linux/dev_printk.h>
+#include <linux/build_bug.h>
+#include <linux/mmu_notifier.h>
+#include <uapi/linux/mshv.h>
+#include "mshv_trace.h"
+
+/*
+ * Hypervisor must be between these version numbers (inclusive)
+ * to guarantee compatibility
+ */
+#define MSHV_HV_MIN_VERSION (27744)
+#define MSHV_HV_MAX_VERSION (27751)
+
+static_assert(HV_HYP_PAGE_SIZE == MSHV_HV_PAGE_SIZE);
+
+#define MSHV_MAX_VPS 256
+
+#define MSHV_PARTITIONS_HASH_BITS 9
+
+#define MSHV_PIN_PAGES_BATCH_SIZE (0x10000000ULL / HV_HYP_PAGE_SIZE)
+
+struct mshv_vp {
+ u32 vp_index;
+ struct mshv_partition *vp_partition;
+ struct mutex vp_mutex;
+ struct hv_vp_register_page *vp_register_page;
+ struct hv_message *vp_intercept_msg_page;
+ void *vp_ghcb_page;
+ struct hv_stats_page *vp_stats_pages[2];
+ struct {
+ atomic64_t vp_signaled_count;
+ struct {
+ u64 intercept_suspend: 1;
+ u64 root_sched_blocked: 1; /* root scheduler only */
+ u64 root_sched_dispatched: 1; /* root scheduler only */
+ u64 reserved: 61;
+ } flags;
+ unsigned int kicked_by_hv;
+ wait_queue_head_t vp_suspend_queue;
+ } run;
+#if IS_ENABLED(CONFIG_DEBUG_FS)
+ struct dentry *vp_stats_dentry;
+#endif
+};
+
+#define vp_fmt(fmt) "p%lluvp%u: " fmt
+#define vp_devprintk(level, v, fmt, ...) \
+do { \
+ const struct mshv_vp *__vp = (v); \
+ const struct mshv_partition *__pt = __vp->vp_partition; \
+ dev_##level(__pt->pt_module_dev, vp_fmt(fmt), __pt->pt_id, \
+ __vp->vp_index, ##__VA_ARGS__); \
+} while (0)
+#define vp_emerg(v, fmt, ...) vp_devprintk(emerg, v, fmt, ##__VA_ARGS__)
+#define vp_crit(v, fmt, ...) vp_devprintk(crit, v, fmt, ##__VA_ARGS__)
+#define vp_alert(v, fmt, ...) vp_devprintk(alert, v, fmt, ##__VA_ARGS__)
+#define vp_err(v, fmt, ...) vp_devprintk(err, v, fmt, ##__VA_ARGS__)
+#define vp_warn(v, fmt, ...) vp_devprintk(warn, v, fmt, ##__VA_ARGS__)
+#define vp_notice(v, fmt, ...) vp_devprintk(notice, v, fmt, ##__VA_ARGS__)
+#define vp_info(v, fmt, ...) vp_devprintk(info, v, fmt, ##__VA_ARGS__)
+#define vp_dbg(v, fmt, ...) vp_devprintk(dbg, v, fmt, ##__VA_ARGS__)
+
+enum mshv_region_type {
+ MSHV_REGION_TYPE_MEM_PINNED,
+ MSHV_REGION_TYPE_MEM_MOVABLE,
+ MSHV_REGION_TYPE_MMIO
+};
+
+struct mshv_mem_region {
+ struct hlist_node hnode;
+ struct kref mreg_refcount;
+ u64 nr_pages;
+ u64 start_gfn;
+ u64 start_uaddr;
+ u32 hv_map_flags;
+ struct mshv_partition *partition;
+ enum mshv_region_type mreg_type;
+ struct mmu_interval_notifier mreg_mni;
+ struct mutex mreg_mutex; /* protects region pages remapping */
+ struct page *mreg_pages[];
+};
+
+struct mshv_irq_ack_notifier {
+ struct hlist_node link;
+ unsigned int irq_ack_gsi;
+ void (*irq_acked)(struct mshv_irq_ack_notifier *mian);
+};
+
+struct mshv_partition {
+ struct device *pt_module_dev;
+
+ struct hlist_node pt_hnode;
+ u64 pt_id;
+ refcount_t pt_ref_count;
+ struct mutex pt_mutex;
+
+ spinlock_t pt_mem_regions_lock;
+ struct hlist_head pt_mem_regions; // not ordered
+
+ u32 pt_vp_count;
+ struct mshv_vp *pt_vp_array[MSHV_MAX_VPS];
+
+ struct mutex pt_irq_lock;
+ struct srcu_struct pt_irq_srcu;
+ struct hlist_head irq_ack_notifier_list;
+
+ struct hlist_head pt_devices;
+
+ /*
+ * MSHV does not support more than one async hypercall in flight
+ * for a single partition. Thus, it is okay to define per partition
+ * async hypercall status.
+ */
+ struct completion async_hypercall;
+ u64 async_hypercall_status;
+
+ spinlock_t pt_irqfds_lock;
+ struct hlist_head pt_irqfds_list;
+ struct mutex irqfds_resampler_lock;
+ struct hlist_head irqfds_resampler_list;
+
+ struct hlist_head ioeventfds_list;
+
+ struct mshv_girq_routing_table __rcu *pt_girq_tbl;
+ u64 isolation_type;
+ bool import_completed;
+ bool pt_initialized;
+#if IS_ENABLED(CONFIG_DEBUG_FS)
+ struct dentry *pt_stats_dentry;
+ struct dentry *pt_vp_dentry;
+#endif
+};
+
+#define pt_fmt(fmt) "p%llu: " fmt
+#define pt_devprintk(level, p, fmt, ...) \
+do { \
+ const struct mshv_partition *__pt = (p); \
+ dev_##level(__pt->pt_module_dev, pt_fmt(fmt), __pt->pt_id, \
+ ##__VA_ARGS__); \
+} while (0)
+#define pt_emerg(p, fmt, ...) pt_devprintk(emerg, p, fmt, ##__VA_ARGS__)
+#define pt_crit(p, fmt, ...) pt_devprintk(crit, p, fmt, ##__VA_ARGS__)
+#define pt_alert(p, fmt, ...) pt_devprintk(alert, p, fmt, ##__VA_ARGS__)
+#define pt_err(p, fmt, ...) pt_devprintk(err, p, fmt, ##__VA_ARGS__)
+#define pt_warn(p, fmt, ...) pt_devprintk(warn, p, fmt, ##__VA_ARGS__)
+#define pt_notice(p, fmt, ...) pt_devprintk(notice, p, fmt, ##__VA_ARGS__)
+#define pt_info(p, fmt, ...) pt_devprintk(info, p, fmt, ##__VA_ARGS__)
+#define pt_dbg(p, fmt, ...) pt_devprintk(dbg, p, fmt, ##__VA_ARGS__)
+
+struct mshv_lapic_irq {
+ u32 lapic_vector;
+ u64 lapic_apic_id;
+ union hv_interrupt_control lapic_control;
+};
+
+#define MSHV_MAX_GUEST_IRQS 4096
+
+/* representation of one guest irq entry, either msi or legacy */
+struct mshv_guest_irq_ent {
+ u32 girq_entry_valid; /* vfio looks at this */
+ u32 guest_irq_num; /* a unique number for each irq */
+ u32 girq_addr_lo; /* guest irq msi address info */
+ u32 girq_addr_hi;
+ u32 girq_irq_data; /* idt vector in some cases */
+};
+
+struct mshv_girq_routing_table {
+ u32 num_rt_entries;
+ struct mshv_guest_irq_ent mshv_girq_info_tbl[];
+};
+
+struct hv_synic_pages {
+ struct hv_message_page *hyp_synic_message_page;
+ struct hv_synic_event_flags_page *synic_event_flags_page;
+ struct hv_synic_event_ring_page *synic_event_ring_page;
+};
+
+struct mshv_root {
+ spinlock_t pt_ht_lock;
+ DECLARE_HASHTABLE(pt_htable, MSHV_PARTITIONS_HASH_BITS);
+ struct hv_partition_property_vmm_capabilities vmm_caps;
+};
+
+/*
+ * Callback for doorbell events.
+ * NOTE: This is called in interrupt context. Callback
+ * should defer slow and sleeping logic to later.
+ */
+typedef void (*doorbell_cb_t) (int doorbell_id, void *);
+
+/*
+ * port table information
+ */
+struct port_table_info {
+ struct rcu_head portbl_rcu;
+ enum hv_port_type hv_port_type;
+ union {
+ struct {
+ u64 reserved[2];
+ } hv_port_message;
+ struct {
+ u64 reserved[2];
+ } hv_port_event;
+ struct {
+ u64 reserved[2];
+ } hv_port_monitor;
+ struct {
+ doorbell_cb_t doorbell_cb;
+ void *data;
+ } hv_port_doorbell;
+ };
+};
+
+int mshv_update_routing_table(struct mshv_partition *partition,
+ const struct mshv_user_irq_entry *entries,
+ unsigned int numents);
+void mshv_free_routing_table(struct mshv_partition *partition);
+
+struct mshv_guest_irq_ent mshv_ret_girq_entry(struct mshv_partition *partition,
+ u32 irq_num);
+
+void mshv_copy_girq_info(struct mshv_guest_irq_ent *src_irq,
+ struct mshv_lapic_irq *dest_irq);
+
+void mshv_irqfd_routing_update(struct mshv_partition *partition);
+
+void mshv_port_table_fini(void);
+int mshv_portid_alloc(struct port_table_info *info);
+int mshv_portid_lookup(int port_id, struct port_table_info *info);
+void mshv_portid_free(int port_id);
+
+int mshv_register_doorbell(u64 partition_id, doorbell_cb_t doorbell_cb,
+ void *data, u64 gpa, u64 val, u64 flags);
+void mshv_unregister_doorbell(u64 partition_id, int doorbell_portid);
+
+void mshv_isr(void);
+int mshv_synic_init(struct device *dev);
+void mshv_synic_exit(void);
+
+static inline bool mshv_partition_encrypted(struct mshv_partition *partition)
+{
+ return partition->isolation_type == HV_PARTITION_ISOLATION_TYPE_SNP;
+}
+
+struct mshv_partition *mshv_partition_get(struct mshv_partition *partition);
+void mshv_partition_put(struct mshv_partition *partition);
+struct mshv_partition *mshv_partition_find(u64 partition_id) __must_hold(RCU);
+
+static inline bool is_l1vh_parent(u64 partition_id)
+{
+ return hv_l1vh_partition() && (partition_id == HV_PARTITION_ID_SELF);
+}
+
+int mshv_vp_stats_map(u64 partition_id, u32 vp_index,
+ struct hv_stats_page **stats_pages);
+void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index,
+ struct hv_stats_page **stats_pages);
+
+/* hypercalls */
+
+int hv_call_withdraw_memory(u64 count, int node, u64 partition_id);
+int hv_call_create_partition(u64 flags,
+ struct hv_partition_creation_properties creation_properties,
+ union hv_partition_isolation_properties isolation_properties,
+ u64 *partition_id);
+int hv_call_initialize_partition(u64 partition_id);
+int hv_call_finalize_partition(u64 partition_id);
+int hv_call_delete_partition(u64 partition_id);
+int hv_call_map_mmio_pages(u64 partition_id, u64 gfn, u64 mmio_spa, u64 numpgs);
+int hv_call_map_gpa_pages(u64 partition_id, u64 gpa_target, u64 page_count,
+ u32 flags, struct page **pages);
+int hv_call_unmap_gpa_pages(u64 partition_id, u64 gpa_target, u64 page_count,
+ u32 flags);
+int hv_call_delete_vp(u64 partition_id, u32 vp_index);
+int hv_call_assert_virtual_interrupt(u64 partition_id, u32 vector,
+ u64 dest_addr,
+ union hv_interrupt_control control);
+int hv_call_clear_virtual_interrupt(u64 partition_id);
+int hv_call_get_gpa_access_states(u64 partition_id, u32 count, u64 gpa_base_pfn,
+ union hv_gpa_page_access_state_flags state_flags,
+ int *written_total,
+ union hv_gpa_page_access_state *states);
+int hv_call_get_vp_state(u32 vp_index, u64 partition_id,
+ struct hv_vp_state_data state_data,
+ /* Choose between pages and ret_output */
+ u64 page_count, struct page **pages,
+ union hv_output_get_vp_state *ret_output);
+int hv_call_set_vp_state(u32 vp_index, u64 partition_id,
+ /* Choose between pages and bytes */
+ struct hv_vp_state_data state_data, u64 page_count,
+ struct page **pages, u32 num_bytes, u8 *bytes);
+int hv_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
+ union hv_input_vtl input_vtl,
+ struct page **state_page);
+int hv_unmap_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
+ struct page *state_page,
+ union hv_input_vtl input_vtl);
+int hv_call_create_port(u64 port_partition_id, union hv_port_id port_id,
+ u64 connection_partition_id, struct hv_port_info *port_info,
+ u8 port_vtl, u8 min_connection_vtl, int node);
+int hv_call_delete_port(u64 port_partition_id, union hv_port_id port_id);
+int hv_call_connect_port(u64 port_partition_id, union hv_port_id port_id,
+ u64 connection_partition_id,
+ union hv_connection_id connection_id,
+ struct hv_connection_info *connection_info,
+ u8 connection_vtl, int node);
+int hv_call_disconnect_port(u64 connection_partition_id,
+ union hv_connection_id connection_id);
+int hv_call_notify_port_ring_empty(u32 sint_index);
+int hv_map_stats_page(enum hv_stats_object_type type,
+ const union hv_stats_object_identity *identity,
+ struct hv_stats_page **addr);
+int hv_unmap_stats_page(enum hv_stats_object_type type,
+ struct hv_stats_page *page_addr,
+ const union hv_stats_object_identity *identity);
+int hv_call_modify_spa_host_access(u64 partition_id, struct page **pages,
+ u64 page_struct_count, u32 host_access,
+ u32 flags, u8 acquire);
+int hv_call_get_partition_property_ex(u64 partition_id, u64 property_code, u64 arg,
+ void *property_value, size_t property_value_sz);
+
+#if IS_ENABLED(CONFIG_DEBUG_FS)
+int __init mshv_debugfs_init(void);
+void mshv_debugfs_exit(void);
+
+int mshv_debugfs_partition_create(struct mshv_partition *partition);
+void mshv_debugfs_partition_remove(struct mshv_partition *partition);
+int mshv_debugfs_vp_create(struct mshv_vp *vp);
+void mshv_debugfs_vp_remove(struct mshv_vp *vp);
+#else
+static inline int __init mshv_debugfs_init(void)
+{
+ return 0;
+}
+static inline void mshv_debugfs_exit(void) { }
+
+static inline int mshv_debugfs_partition_create(struct mshv_partition *partition)
+{
+ return 0;
+}
+static inline void mshv_debugfs_partition_remove(struct mshv_partition *partition) { }
+static inline int mshv_debugfs_vp_create(struct mshv_vp *vp)
+{
+ return 0;
+}
+static inline void mshv_debugfs_vp_remove(struct mshv_vp *vp) { }
+#endif
+
+extern struct mshv_root mshv_root;
+extern enum hv_scheduler_type hv_scheduler_type;
+extern u8 * __percpu *hv_synic_eventring_tail;
+
+struct mshv_mem_region *mshv_region_create(u64 guest_pfn, u64 nr_pages,
+ u64 uaddr, u32 flags);
+int mshv_region_share(struct mshv_mem_region *region);
+int mshv_region_unshare(struct mshv_mem_region *region);
+int mshv_region_map(struct mshv_mem_region *region);
+void mshv_region_invalidate(struct mshv_mem_region *region);
+int mshv_region_pin(struct mshv_mem_region *region);
+void mshv_region_put(struct mshv_mem_region *region);
+int mshv_region_get(struct mshv_mem_region *region);
+bool mshv_region_handle_gfn_fault(struct mshv_mem_region *region, u64 gfn);
+void mshv_region_movable_fini(struct mshv_mem_region *region);
+bool mshv_region_movable_init(struct mshv_mem_region *region);
+
+#endif /* _MSHV_ROOT_H_ */
diff --git a/drivers/hv/mshv_root_hv_call.c b/drivers/hv/mshv_root_hv_call.c
new file mode 100644
index 000000000000..cb55d4d4be2e
--- /dev/null
+++ b/drivers/hv/mshv_root_hv_call.c
@@ -0,0 +1,1074 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2023, Microsoft Corporation.
+ *
+ * Hypercall helper functions used by the mshv_root module.
+ *
+ * Authors: Microsoft Linux virtualization team
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/export.h>
+#include <asm/mshyperv.h>
+
+#include "mshv_root.h"
+
+/* Determined empirically */
+#define HV_INIT_PARTITION_DEPOSIT_PAGES 208
+#define HV_MAP_GPA_DEPOSIT_PAGES 256
+#define HV_UMAP_GPA_PAGES 512
+
+#define HV_PAGE_COUNT_2M_ALIGNED(pg_count) (!((pg_count) & (0x200 - 1)))
+
+#define HV_WITHDRAW_BATCH_SIZE (HV_HYP_PAGE_SIZE / sizeof(u64))
+#define HV_MAP_GPA_BATCH_SIZE \
+ ((HV_HYP_PAGE_SIZE - sizeof(struct hv_input_map_gpa_pages)) \
+ / sizeof(u64))
+#define HV_GET_VP_STATE_BATCH_SIZE \
+ ((HV_HYP_PAGE_SIZE - sizeof(struct hv_input_get_vp_state)) \
+ / sizeof(u64))
+#define HV_SET_VP_STATE_BATCH_SIZE \
+ ((HV_HYP_PAGE_SIZE - sizeof(struct hv_input_set_vp_state)) \
+ / sizeof(u64))
+#define HV_GET_GPA_ACCESS_STATES_BATCH_SIZE \
+ ((HV_HYP_PAGE_SIZE - sizeof(union hv_gpa_page_access_state)) \
+ / sizeof(union hv_gpa_page_access_state))
+#define HV_MODIFY_SPARSE_SPA_PAGE_HOST_ACCESS_MAX_PAGE_COUNT \
+ ((HV_HYP_PAGE_SIZE - \
+ sizeof(struct hv_input_modify_sparse_spa_page_host_access)) / \
+ sizeof(u64))
+
+int hv_call_withdraw_memory(u64 count, int node, u64 partition_id)
+{
+ struct hv_input_withdraw_memory *input_page;
+ struct hv_output_withdraw_memory *output_page;
+ struct page *page;
+ u16 completed;
+ u64 status, withdrawn = 0;
+ int i;
+ unsigned long flags;
+
+ page = alloc_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+ output_page = page_address(page);
+
+ while (withdrawn < count) {
+ local_irq_save(flags);
+
+ input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
+
+ memset(input_page, 0, sizeof(*input_page));
+ input_page->partition_id = partition_id;
+ status = hv_do_rep_hypercall(HVCALL_WITHDRAW_MEMORY,
+ min(count - withdrawn, HV_WITHDRAW_BATCH_SIZE),
+ 0, input_page, output_page);
+
+ local_irq_restore(flags);
+
+ completed = hv_repcomp(status);
+
+ for (i = 0; i < completed; i++)
+ __free_page(pfn_to_page(output_page->gpa_page_list[i]));
+
+ if (!hv_result_success(status)) {
+ if (hv_result(status) == HV_STATUS_NO_RESOURCES)
+ status = HV_STATUS_SUCCESS;
+ break;
+ }
+
+ withdrawn += completed;
+ }
+ free_page((unsigned long)output_page);
+
+ trace_mshv_hvcall_withdraw_memory(partition_id, withdrawn, status);
+
+ return hv_result_to_errno(status);
+}
+
+int hv_call_create_partition(u64 flags,
+ struct hv_partition_creation_properties creation_properties,
+ union hv_partition_isolation_properties isolation_properties,
+ u64 *partition_id)
+{
+ struct hv_input_create_partition *input;
+ struct hv_output_create_partition *output;
+ u64 status;
+ int ret;
+ unsigned long irq_flags;
+
+ do {
+ local_irq_save(irq_flags);
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ output = *this_cpu_ptr(hyperv_pcpu_output_arg);
+
+ memset(input, 0, sizeof(*input));
+ input->flags = flags;
+ input->compatibility_version = HV_COMPATIBILITY_21_H2;
+
+ memcpy(&input->partition_creation_properties, &creation_properties,
+ sizeof(creation_properties));
+
+ memcpy(&input->isolation_properties, &isolation_properties,
+ sizeof(isolation_properties));
+
+ status = hv_do_hypercall(HVCALL_CREATE_PARTITION,
+ input, output);
+
+ if (!hv_result_needs_memory(status)) {
+ if (hv_result_success(status))
+ *partition_id = output->partition_id;
+ local_irq_restore(irq_flags);
+ ret = hv_result_to_errno(status);
+ break;
+ }
+ local_irq_restore(irq_flags);
+ ret = hv_deposit_memory(hv_current_partition_id, status);
+ } while (!ret);
+
+ trace_mshv_hvcall_create_partition(flags, ret ? ret : *partition_id);
+
+ return ret;
+}
+
+int hv_call_initialize_partition(u64 partition_id)
+{
+ struct hv_input_initialize_partition input;
+ u64 status;
+ int ret;
+
+ input.partition_id = partition_id;
+
+ ret = hv_call_deposit_pages(NUMA_NO_NODE, partition_id,
+ HV_INIT_PARTITION_DEPOSIT_PAGES);
+ if (ret)
+ return ret;
+
+ do {
+ status = hv_do_fast_hypercall8(HVCALL_INITIALIZE_PARTITION,
+ *(u64 *)&input);
+
+ if (!hv_result_needs_memory(status)) {
+ ret = hv_result_to_errno(status);
+ break;
+ }
+ ret = hv_deposit_memory(partition_id, status);
+ } while (!ret);
+
+ trace_mshv_hvcall_initialize_partition(partition_id, status);
+
+ return ret;
+}
+
+int hv_call_finalize_partition(u64 partition_id)
+{
+ struct hv_input_finalize_partition input;
+ u64 status;
+
+ input.partition_id = partition_id;
+ status = hv_do_fast_hypercall8(HVCALL_FINALIZE_PARTITION,
+ *(u64 *)&input);
+
+ trace_mshv_hvcall_finalize_partition(partition_id, status);
+
+ return hv_result_to_errno(status);
+}
+
+int hv_call_delete_partition(u64 partition_id)
+{
+ struct hv_input_delete_partition input;
+ u64 status;
+
+ input.partition_id = partition_id;
+ status = hv_do_fast_hypercall8(HVCALL_DELETE_PARTITION, *(u64 *)&input);
+
+ trace_mshv_hvcall_delete_partition(partition_id, status);
+
+ return hv_result_to_errno(status);
+}
+
+/* Ask the hypervisor to map guest ram pages or the guest mmio space */
+static int hv_do_map_gpa_hcall(u64 partition_id, u64 gfn, u64 page_struct_count,
+ u32 flags, struct page **pages, u64 mmio_spa)
+{
+ struct hv_input_map_gpa_pages *input_page;
+ u64 status, *pfnlist;
+ unsigned long irq_flags, large_shift = 0;
+ int ret = 0, done = 0;
+ u64 page_count = page_struct_count;
+
+ if (page_count == 0 || (pages && mmio_spa))
+ return -EINVAL;
+
+ if (flags & HV_MAP_GPA_LARGE_PAGE) {
+ if (mmio_spa)
+ return -EINVAL;
+
+ if (!HV_PAGE_COUNT_2M_ALIGNED(page_count))
+ return -EINVAL;
+
+ large_shift = HV_HYP_LARGE_PAGE_SHIFT - HV_HYP_PAGE_SHIFT;
+ page_count >>= large_shift;
+ }
+
+ while (done < page_count) {
+ ulong i, completed, remain = page_count - done;
+ int rep_count = min(remain, HV_MAP_GPA_BATCH_SIZE);
+
+ local_irq_save(irq_flags);
+ input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
+
+ input_page->target_partition_id = partition_id;
+ input_page->target_gpa_base = gfn + (done << large_shift);
+ input_page->map_flags = flags;
+ pfnlist = input_page->source_gpa_page_list;
+
+ for (i = 0; i < rep_count; i++)
+ if (flags & HV_MAP_GPA_NO_ACCESS) {
+ pfnlist[i] = 0;
+ } else if (pages) {
+ u64 index = (done + i) << large_shift;
+
+ if (index >= page_struct_count) {
+ ret = -EINVAL;
+ break;
+ }
+ pfnlist[i] = page_to_pfn(pages[index]);
+ } else {
+ pfnlist[i] = mmio_spa + done + i;
+ }
+ if (ret)
+ break;
+
+ status = hv_do_rep_hypercall(HVCALL_MAP_GPA_PAGES, rep_count, 0,
+ input_page, NULL);
+ local_irq_restore(irq_flags);
+
+ completed = hv_repcomp(status);
+
+ if (hv_result_needs_memory(status)) {
+ ret = hv_call_deposit_pages(NUMA_NO_NODE, partition_id,
+ HV_MAP_GPA_DEPOSIT_PAGES);
+ if (ret)
+ break;
+
+ } else if (!hv_result_success(status)) {
+ ret = hv_result_to_errno(status);
+ break;
+ }
+
+ done += completed;
+ }
+
+ if (ret && done) {
+ u32 unmap_flags = 0;
+
+ if (flags & HV_MAP_GPA_LARGE_PAGE)
+ unmap_flags |= HV_UNMAP_GPA_LARGE_PAGE;
+ hv_call_unmap_gpa_pages(partition_id, gfn, done, unmap_flags);
+ }
+
+ return ret;
+}
+
+/* Ask the hypervisor to map guest ram pages */
+int hv_call_map_gpa_pages(u64 partition_id, u64 gpa_target, u64 page_count,
+ u32 flags, struct page **pages)
+{
+ return hv_do_map_gpa_hcall(partition_id, gpa_target, page_count,
+ flags, pages, 0);
+}
+
+/* Ask the hypervisor to map guest mmio space */
+int hv_call_map_mmio_pages(u64 partition_id, u64 gfn, u64 mmio_spa, u64 numpgs)
+{
+ int i;
+ u32 flags = HV_MAP_GPA_READABLE | HV_MAP_GPA_WRITABLE |
+ HV_MAP_GPA_NOT_CACHED;
+
+ for (i = 0; i < numpgs; i++)
+ if (page_is_ram(mmio_spa + i))
+ return -EINVAL;
+
+ return hv_do_map_gpa_hcall(partition_id, gfn, numpgs, flags, NULL,
+ mmio_spa);
+}
+
+int hv_call_unmap_gpa_pages(u64 partition_id, u64 gfn, u64 page_count_4k,
+ u32 flags)
+{
+ struct hv_input_unmap_gpa_pages *input_page;
+ u64 status, page_count = page_count_4k;
+ unsigned long irq_flags, large_shift = 0;
+ int ret = 0, done = 0;
+
+ if (page_count == 0)
+ return -EINVAL;
+
+ if (flags & HV_UNMAP_GPA_LARGE_PAGE) {
+ if (!HV_PAGE_COUNT_2M_ALIGNED(page_count))
+ return -EINVAL;
+
+ large_shift = HV_HYP_LARGE_PAGE_SHIFT - HV_HYP_PAGE_SHIFT;
+ page_count >>= large_shift;
+ }
+
+ while (done < page_count) {
+ ulong completed, remain = page_count - done;
+ int rep_count = min(remain, HV_UMAP_GPA_PAGES);
+
+ local_irq_save(irq_flags);
+ input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
+
+ input_page->target_partition_id = partition_id;
+ input_page->target_gpa_base = gfn + (done << large_shift);
+ input_page->unmap_flags = flags;
+ status = hv_do_rep_hypercall(HVCALL_UNMAP_GPA_PAGES, rep_count,
+ 0, input_page, NULL);
+ local_irq_restore(irq_flags);
+
+ completed = hv_repcomp(status);
+ if (!hv_result_success(status)) {
+ ret = hv_result_to_errno(status);
+ break;
+ }
+
+ done += completed;
+ }
+
+ return ret;
+}
+
+int hv_call_get_gpa_access_states(u64 partition_id, u32 count, u64 gpa_base_pfn,
+ union hv_gpa_page_access_state_flags state_flags,
+ int *written_total,
+ union hv_gpa_page_access_state *states)
+{
+ struct hv_input_get_gpa_pages_access_state *input_page;
+ union hv_gpa_page_access_state *output_page;
+ int completed = 0;
+ unsigned long remaining = count;
+ int rep_count, i;
+ u64 status = 0;
+ unsigned long flags;
+
+ *written_total = 0;
+ while (remaining) {
+ local_irq_save(flags);
+ input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ output_page = *this_cpu_ptr(hyperv_pcpu_output_arg);
+
+ input_page->partition_id = partition_id;
+ input_page->hv_gpa_page_number = gpa_base_pfn + *written_total;
+ input_page->flags = state_flags;
+ rep_count = min(remaining, HV_GET_GPA_ACCESS_STATES_BATCH_SIZE);
+
+ status = hv_do_rep_hypercall(HVCALL_GET_GPA_PAGES_ACCESS_STATES, rep_count,
+ 0, input_page, output_page);
+ if (!hv_result_success(status)) {
+ local_irq_restore(flags);
+ break;
+ }
+ completed = hv_repcomp(status);
+ for (i = 0; i < completed; ++i)
+ states[i].as_uint8 = output_page[i].as_uint8;
+
+ local_irq_restore(flags);
+ states += completed;
+ *written_total += completed;
+ remaining -= completed;
+ }
+
+ return hv_result_to_errno(status);
+}
+
+int hv_call_assert_virtual_interrupt(u64 partition_id, u32 vector,
+ u64 dest_addr,
+ union hv_interrupt_control control)
+{
+ struct hv_input_assert_virtual_interrupt *input;
+ unsigned long flags;
+ u64 status;
+
+ local_irq_save(flags);
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ memset(input, 0, sizeof(*input));
+ input->partition_id = partition_id;
+ input->vector = vector;
+ /*
+ * NOTE: dest_addr only needs to be provided while asserting an
+ * interrupt on x86 platform
+ */
+#if IS_ENABLED(CONFIG_X86)
+ input->dest_addr = dest_addr;
+#endif
+ input->control = control;
+ status = hv_do_hypercall(HVCALL_ASSERT_VIRTUAL_INTERRUPT, input, NULL);
+ local_irq_restore(flags);
+
+ return hv_result_to_errno(status);
+}
+
+int hv_call_delete_vp(u64 partition_id, u32 vp_index)
+{
+ union hv_input_delete_vp input = {};
+ u64 status;
+
+ input.partition_id = partition_id;
+ input.vp_index = vp_index;
+
+ status = hv_do_fast_hypercall16(HVCALL_DELETE_VP,
+ input.as_uint64[0], input.as_uint64[1]);
+
+ return hv_result_to_errno(status);
+}
+EXPORT_SYMBOL_GPL(hv_call_delete_vp);
+
+int hv_call_get_vp_state(u32 vp_index, u64 partition_id,
+ struct hv_vp_state_data state_data,
+ /* Choose between pages and ret_output */
+ u64 page_count, struct page **pages,
+ union hv_output_get_vp_state *ret_output)
+{
+ struct hv_input_get_vp_state *input;
+ union hv_output_get_vp_state *output;
+ u64 status;
+ int i;
+ u64 control;
+ unsigned long flags;
+ int ret = 0;
+
+ if (page_count > HV_GET_VP_STATE_BATCH_SIZE)
+ return -EINVAL;
+
+ if (!page_count && !ret_output)
+ return -EINVAL;
+
+ do {
+ local_irq_save(flags);
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ output = *this_cpu_ptr(hyperv_pcpu_output_arg);
+ memset(input, 0, sizeof(*input));
+ memset(output, 0, sizeof(*output));
+
+ input->partition_id = partition_id;
+ input->vp_index = vp_index;
+ input->state_data = state_data;
+ for (i = 0; i < page_count; i++)
+ input->output_data_pfns[i] = page_to_pfn(pages[i]);
+
+ control = (HVCALL_GET_VP_STATE) |
+ (page_count << HV_HYPERCALL_VARHEAD_OFFSET);
+
+ status = hv_do_hypercall(control, input, output);
+
+ if (!hv_result_needs_memory(status)) {
+ if (hv_result_success(status) && ret_output)
+ memcpy(ret_output, output, sizeof(*output));
+
+ local_irq_restore(flags);
+ ret = hv_result_to_errno(status);
+ break;
+ }
+ local_irq_restore(flags);
+
+ ret = hv_deposit_memory(partition_id, status);
+ } while (!ret);
+
+ return ret;
+}
+
+int hv_call_set_vp_state(u32 vp_index, u64 partition_id,
+ /* Choose between pages and bytes */
+ struct hv_vp_state_data state_data, u64 page_count,
+ struct page **pages, u32 num_bytes, u8 *bytes)
+{
+ struct hv_input_set_vp_state *input;
+ u64 status;
+ int i;
+ u64 control;
+ unsigned long flags;
+ int ret = 0;
+ u16 varhead_sz;
+
+ if (page_count > HV_SET_VP_STATE_BATCH_SIZE)
+ return -EINVAL;
+ if (sizeof(*input) + num_bytes > HV_HYP_PAGE_SIZE)
+ return -EINVAL;
+
+ if (num_bytes)
+ /* round up to 8 and divide by 8 */
+ varhead_sz = (num_bytes + 7) >> 3;
+ else if (page_count)
+ varhead_sz = page_count;
+ else
+ return -EINVAL;
+
+ do {
+ local_irq_save(flags);
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ memset(input, 0, sizeof(*input));
+
+ input->partition_id = partition_id;
+ input->vp_index = vp_index;
+ input->state_data = state_data;
+ if (num_bytes) {
+ memcpy((u8 *)input->data, bytes, num_bytes);
+ } else {
+ for (i = 0; i < page_count; i++)
+ input->data[i].pfns = page_to_pfn(pages[i]);
+ }
+
+ control = (HVCALL_SET_VP_STATE) |
+ (varhead_sz << HV_HYPERCALL_VARHEAD_OFFSET);
+
+ status = hv_do_hypercall(control, input, NULL);
+
+ if (!hv_result_needs_memory(status)) {
+ local_irq_restore(flags);
+ ret = hv_result_to_errno(status);
+ break;
+ }
+ local_irq_restore(flags);
+
+ ret = hv_deposit_memory(partition_id, status);
+ } while (!ret);
+
+ return ret;
+}
+
+static int hv_call_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
+ union hv_input_vtl input_vtl,
+ struct page **state_page)
+{
+ struct hv_input_map_vp_state_page *input;
+ struct hv_output_map_vp_state_page *output;
+ u64 status;
+ int ret;
+ unsigned long flags;
+
+ do {
+ local_irq_save(flags);
+
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ output = *this_cpu_ptr(hyperv_pcpu_output_arg);
+
+ memset(input, 0, sizeof(*input));
+ input->partition_id = partition_id;
+ input->vp_index = vp_index;
+ input->type = type;
+ input->input_vtl = input_vtl;
+
+ if (*state_page) {
+ input->flags.map_location_provided = 1;
+ input->requested_map_location =
+ page_to_pfn(*state_page);
+ }
+
+ status = hv_do_hypercall(HVCALL_MAP_VP_STATE_PAGE, input,
+ output);
+
+ if (!hv_result_needs_memory(status)) {
+ if (hv_result_success(status))
+ *state_page = pfn_to_page(output->map_location);
+ local_irq_restore(flags);
+ ret = hv_result_to_errno(status);
+ break;
+ }
+
+ local_irq_restore(flags);
+
+ ret = hv_deposit_memory(partition_id, status);
+ } while (!ret);
+
+ trace_mshv_hvcall_map_vp_state_page(partition_id, vp_index,
+ type, status);
+
+ return ret;
+}
+
+static bool mshv_use_overlay_gpfn(void)
+{
+ return hv_l1vh_partition() &&
+ mshv_root.vmm_caps.vmm_can_provide_overlay_gpfn;
+}
+
+int hv_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
+ union hv_input_vtl input_vtl,
+ struct page **state_page)
+{
+ int ret = 0;
+ struct page *allocated_page = NULL;
+
+ if (mshv_use_overlay_gpfn()) {
+ allocated_page = alloc_page(GFP_KERNEL);
+ if (!allocated_page)
+ return -ENOMEM;
+ *state_page = allocated_page;
+ } else {
+ *state_page = NULL;
+ }
+
+ ret = hv_call_map_vp_state_page(partition_id, vp_index, type, input_vtl,
+ state_page);
+
+ if (ret && allocated_page) {
+ __free_page(allocated_page);
+ *state_page = NULL;
+ }
+
+ return ret;
+}
+
+static int hv_call_unmap_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
+ union hv_input_vtl input_vtl)
+{
+ unsigned long flags;
+ u64 status;
+ struct hv_input_unmap_vp_state_page *input;
+
+ local_irq_save(flags);
+
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+
+ memset(input, 0, sizeof(*input));
+
+ input->partition_id = partition_id;
+ input->vp_index = vp_index;
+ input->type = type;
+ input->input_vtl = input_vtl;
+
+ status = hv_do_hypercall(HVCALL_UNMAP_VP_STATE_PAGE, input, NULL);
+
+ local_irq_restore(flags);
+
+ return hv_result_to_errno(status);
+}
+
+int hv_unmap_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
+ struct page *state_page, union hv_input_vtl input_vtl)
+{
+ int ret = hv_call_unmap_vp_state_page(partition_id, vp_index, type, input_vtl);
+
+ if (mshv_use_overlay_gpfn() && state_page)
+ __free_page(state_page);
+
+ return ret;
+}
+
+int hv_call_get_partition_property_ex(u64 partition_id, u64 property_code,
+ u64 arg, void *property_value,
+ size_t property_value_sz)
+{
+ u64 status;
+ unsigned long flags;
+ struct hv_input_get_partition_property_ex *input;
+ struct hv_output_get_partition_property_ex *output;
+
+ local_irq_save(flags);
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ output = *this_cpu_ptr(hyperv_pcpu_output_arg);
+
+ memset(input, 0, sizeof(*input));
+ input->partition_id = partition_id;
+ input->property_code = property_code;
+ input->arg = arg;
+ status = hv_do_hypercall(HVCALL_GET_PARTITION_PROPERTY_EX, input, output);
+
+ if (!hv_result_success(status)) {
+ local_irq_restore(flags);
+ hv_status_debug(status, "\n");
+ return hv_result_to_errno(status);
+ }
+ memcpy(property_value, &output->property_value, property_value_sz);
+
+ local_irq_restore(flags);
+
+ return 0;
+}
+
+int
+hv_call_clear_virtual_interrupt(u64 partition_id)
+{
+ int status;
+
+ status = hv_do_fast_hypercall8(HVCALL_CLEAR_VIRTUAL_INTERRUPT,
+ partition_id);
+
+ return hv_result_to_errno(status);
+}
+
+int
+hv_call_create_port(u64 port_partition_id, union hv_port_id port_id,
+ u64 connection_partition_id,
+ struct hv_port_info *port_info,
+ u8 port_vtl, u8 min_connection_vtl, int node)
+{
+ struct hv_input_create_port *input;
+ unsigned long flags;
+ int ret = 0;
+ int status;
+
+ do {
+ local_irq_save(flags);
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ memset(input, 0, sizeof(*input));
+
+ input->port_partition_id = port_partition_id;
+ input->port_id = port_id;
+ input->connection_partition_id = connection_partition_id;
+ input->port_info = *port_info;
+ input->port_vtl = port_vtl;
+ input->min_connection_vtl = min_connection_vtl;
+ input->proximity_domain_info = hv_numa_node_to_pxm_info(node);
+ status = hv_do_hypercall(HVCALL_CREATE_PORT, input, NULL);
+ local_irq_restore(flags);
+ if (hv_result_success(status))
+ break;
+
+ if (!hv_result_needs_memory(status)) {
+ ret = hv_result_to_errno(status);
+ break;
+ }
+ ret = hv_deposit_memory(port_partition_id, status);
+ } while (!ret);
+
+ return ret;
+}
+
+int
+hv_call_delete_port(u64 port_partition_id, union hv_port_id port_id)
+{
+ union hv_input_delete_port input = { 0 };
+ int status;
+
+ input.port_partition_id = port_partition_id;
+ input.port_id = port_id;
+ status = hv_do_fast_hypercall16(HVCALL_DELETE_PORT,
+ input.as_uint64[0],
+ input.as_uint64[1]);
+
+ return hv_result_to_errno(status);
+}
+
+int
+hv_call_connect_port(u64 port_partition_id, union hv_port_id port_id,
+ u64 connection_partition_id,
+ union hv_connection_id connection_id,
+ struct hv_connection_info *connection_info,
+ u8 connection_vtl, int node)
+{
+ struct hv_input_connect_port *input;
+ unsigned long flags;
+ int ret = 0, status;
+
+ do {
+ local_irq_save(flags);
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ memset(input, 0, sizeof(*input));
+ input->port_partition_id = port_partition_id;
+ input->port_id = port_id;
+ input->connection_partition_id = connection_partition_id;
+ input->connection_id = connection_id;
+ input->connection_info = *connection_info;
+ input->connection_vtl = connection_vtl;
+ input->proximity_domain_info = hv_numa_node_to_pxm_info(node);
+ status = hv_do_hypercall(HVCALL_CONNECT_PORT, input, NULL);
+
+ local_irq_restore(flags);
+ if (hv_result_success(status))
+ break;
+
+ if (!hv_result_needs_memory(status)) {
+ ret = hv_result_to_errno(status);
+ break;
+ }
+ ret = hv_deposit_memory(connection_partition_id, status);
+ } while (!ret);
+
+ return ret;
+}
+
+int
+hv_call_disconnect_port(u64 connection_partition_id,
+ union hv_connection_id connection_id)
+{
+ union hv_input_disconnect_port input = { 0 };
+ int status;
+
+ input.connection_partition_id = connection_partition_id;
+ input.connection_id = connection_id;
+ input.is_doorbell = 1;
+ status = hv_do_fast_hypercall16(HVCALL_DISCONNECT_PORT,
+ input.as_uint64[0],
+ input.as_uint64[1]);
+
+ return hv_result_to_errno(status);
+}
+
+int
+hv_call_notify_port_ring_empty(u32 sint_index)
+{
+ union hv_input_notify_port_ring_empty input = { 0 };
+ int status;
+
+ input.sint_index = sint_index;
+ status = hv_do_fast_hypercall8(HVCALL_NOTIFY_PORT_RING_EMPTY,
+ input.as_uint64);
+
+ return hv_result_to_errno(status);
+}
+
+/*
+ * Equivalent of hv_call_map_stats_page() for cases when the caller provides
+ * the map location.
+ *
+ * NOTE: This is a newer hypercall that always supports SELF and PARENT stats
+ * areas, unlike hv_call_map_stats_page().
+ */
+static int hv_call_map_stats_page2(enum hv_stats_object_type type,
+ const union hv_stats_object_identity *identity,
+ u64 map_location)
+{
+ unsigned long flags;
+ struct hv_input_map_stats_page2 *input;
+ u64 status;
+ int ret;
+
+ if (!map_location || !mshv_use_overlay_gpfn())
+ return -EINVAL;
+
+ do {
+ local_irq_save(flags);
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+
+ memset(input, 0, sizeof(*input));
+ input->type = type;
+ input->identity = *identity;
+ input->map_location = map_location;
+
+ status = hv_do_hypercall(HVCALL_MAP_STATS_PAGE2, input, NULL);
+
+ local_irq_restore(flags);
+
+ ret = hv_result_to_errno(status);
+
+ if (!ret)
+ break;
+
+ if (!hv_result_needs_memory(status)) {
+ hv_status_debug(status, "\n");
+ break;
+ }
+
+ ret = hv_deposit_memory(hv_current_partition_id, status);
+ } while (!ret);
+
+ return ret;
+}
+
+static int
+hv_stats_get_area_type(enum hv_stats_object_type type,
+ const union hv_stats_object_identity *identity)
+{
+ switch (type) {
+ case HV_STATS_OBJECT_HYPERVISOR:
+ return identity->hv.stats_area_type;
+ case HV_STATS_OBJECT_LOGICAL_PROCESSOR:
+ return identity->lp.stats_area_type;
+ case HV_STATS_OBJECT_PARTITION:
+ return identity->partition.stats_area_type;
+ case HV_STATS_OBJECT_VP:
+ return identity->vp.stats_area_type;
+ }
+
+ return -EINVAL;
+}
+
+/*
+ * Map a stats page, where the page location is provided by the hypervisor.
+ *
+ * NOTE: The concept of separate SELF and PARENT stats areas does not exist on
+ * older hypervisor versions. All the available stats information can be found
+ * on the SELF page. When attempting to map the PARENT area on a hypervisor
+ * that doesn't support it, return "success" but with a NULL address. The
+ * caller should check for this case and instead fallback to the SELF area
+ * alone.
+ */
+static int
+hv_call_map_stats_page(enum hv_stats_object_type type,
+ const union hv_stats_object_identity *identity,
+ struct hv_stats_page **addr)
+{
+ unsigned long flags;
+ struct hv_input_map_stats_page *input;
+ struct hv_output_map_stats_page *output;
+ u64 status, pfn;
+ int ret = 0;
+
+ do {
+ local_irq_save(flags);
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ output = *this_cpu_ptr(hyperv_pcpu_output_arg);
+
+ memset(input, 0, sizeof(*input));
+ input->type = type;
+ input->identity = *identity;
+
+ status = hv_do_hypercall(HVCALL_MAP_STATS_PAGE, input, output);
+ pfn = output->map_location;
+
+ local_irq_restore(flags);
+
+ if (!hv_result_needs_memory(status)) {
+ if (hv_result_success(status))
+ break;
+
+ if (hv_stats_get_area_type(type, identity) == HV_STATS_AREA_PARENT &&
+ hv_result(status) == HV_STATUS_INVALID_PARAMETER) {
+ *addr = NULL;
+ return 0;
+ }
+
+ hv_status_debug(status, "\n");
+ return hv_result_to_errno(status);
+ }
+
+ ret = hv_deposit_memory(hv_current_partition_id, status);
+ if (ret)
+ return ret;
+ } while (!ret);
+
+ *addr = page_address(pfn_to_page(pfn));
+
+ return ret;
+}
+
+int hv_map_stats_page(enum hv_stats_object_type type,
+ const union hv_stats_object_identity *identity,
+ struct hv_stats_page **addr)
+{
+ int ret;
+ struct page *allocated_page = NULL;
+
+ if (!addr)
+ return -EINVAL;
+
+ if (mshv_use_overlay_gpfn()) {
+ allocated_page = alloc_page(GFP_KERNEL);
+ if (!allocated_page)
+ return -ENOMEM;
+
+ ret = hv_call_map_stats_page2(type, identity,
+ page_to_pfn(allocated_page));
+ *addr = page_address(allocated_page);
+ } else {
+ ret = hv_call_map_stats_page(type, identity, addr);
+ }
+
+ if (ret && allocated_page) {
+ __free_page(allocated_page);
+ *addr = NULL;
+ }
+
+ return ret;
+}
+
+static int hv_call_unmap_stats_page(enum hv_stats_object_type type,
+ const union hv_stats_object_identity *identity)
+{
+ unsigned long flags;
+ struct hv_input_unmap_stats_page *input;
+ u64 status;
+
+ local_irq_save(flags);
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+
+ memset(input, 0, sizeof(*input));
+ input->type = type;
+ input->identity = *identity;
+
+ status = hv_do_hypercall(HVCALL_UNMAP_STATS_PAGE, input, NULL);
+ local_irq_restore(flags);
+
+ return hv_result_to_errno(status);
+}
+
+int hv_unmap_stats_page(enum hv_stats_object_type type,
+ struct hv_stats_page *page_addr,
+ const union hv_stats_object_identity *identity)
+{
+ int ret;
+
+ ret = hv_call_unmap_stats_page(type, identity);
+
+ if (mshv_use_overlay_gpfn() && page_addr)
+ __free_page(virt_to_page(page_addr));
+
+ return ret;
+}
+
+int hv_call_modify_spa_host_access(u64 partition_id, struct page **pages,
+ u64 page_struct_count, u32 host_access,
+ u32 flags, u8 acquire)
+{
+ struct hv_input_modify_sparse_spa_page_host_access *input_page;
+ u64 status;
+ int done = 0;
+ unsigned long irq_flags, large_shift = 0;
+ u64 page_count = page_struct_count;
+ u16 code = acquire ? HVCALL_ACQUIRE_SPARSE_SPA_PAGE_HOST_ACCESS :
+ HVCALL_RELEASE_SPARSE_SPA_PAGE_HOST_ACCESS;
+
+ if (page_count == 0)
+ return -EINVAL;
+
+ if (flags & HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE) {
+ if (!HV_PAGE_COUNT_2M_ALIGNED(page_count))
+ return -EINVAL;
+ large_shift = HV_HYP_LARGE_PAGE_SHIFT - HV_HYP_PAGE_SHIFT;
+ page_count >>= large_shift;
+ }
+
+ while (done < page_count) {
+ ulong i, completed, remain = page_count - done;
+ int rep_count = min(remain,
+ HV_MODIFY_SPARSE_SPA_PAGE_HOST_ACCESS_MAX_PAGE_COUNT);
+
+ local_irq_save(irq_flags);
+ input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
+
+ memset(input_page, 0, sizeof(*input_page));
+ /* Only set the partition id if you are making the pages
+ * exclusive
+ */
+ if (flags & HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_EXCLUSIVE)
+ input_page->partition_id = partition_id;
+ input_page->flags = flags;
+ input_page->host_access = host_access;
+
+ for (i = 0; i < rep_count; i++) {
+ u64 index = (done + i) << large_shift;
+
+ if (index >= page_struct_count)
+ return -EINVAL;
+
+ input_page->spa_page_list[i] =
+ page_to_pfn(pages[index]);
+ }
+
+ status = hv_do_rep_hypercall(code, rep_count, 0, input_page,
+ NULL);
+ local_irq_restore(irq_flags);
+
+ completed = hv_repcomp(status);
+
+ if (!hv_result_success(status))
+ return hv_result_to_errno(status);
+
+ done += completed;
+ }
+
+ return 0;
+}
diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
new file mode 100644
index 000000000000..bd1359eb58dd
--- /dev/null
+++ b/drivers/hv/mshv_root_main.c
@@ -0,0 +1,2417 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2024, Microsoft Corporation.
+ *
+ * The main part of the mshv_root module, providing APIs to create
+ * and manage guest partitions.
+ *
+ * Authors: Microsoft Linux virtualization team
+ */
+
+#include <linux/entry-virt.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/miscdevice.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/anon_inodes.h>
+#include <linux/mm.h>
+#include <linux/io.h>
+#include <linux/cpuhotplug.h>
+#include <linux/random.h>
+#include <asm/mshyperv.h>
+#include <linux/hyperv.h>
+#include <linux/notifier.h>
+#include <linux/reboot.h>
+#include <linux/kexec.h>
+#include <linux/page-flags.h>
+#include <linux/crash_dump.h>
+#include <linux/panic_notifier.h>
+#include <linux/vmalloc.h>
+#include <linux/rseq.h>
+
+#include "mshv_eventfd.h"
+#include "mshv.h"
+#include "mshv_root.h"
+
+MODULE_AUTHOR("Microsoft");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Microsoft Hyper-V root partition VMM interface /dev/mshv");
+
+/* HV_THREAD_COUNTER */
+#if defined(CONFIG_X86_64)
+#define HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED 202
+#elif defined(CONFIG_ARM64)
+#define HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED 95
+#endif
+
+struct mshv_root mshv_root;
+
+enum hv_scheduler_type hv_scheduler_type;
+
+/* Once we implement the fast extended hypercall ABI they can go away. */
+static void * __percpu *root_scheduler_input;
+static void * __percpu *root_scheduler_output;
+
+static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);
+static int mshv_dev_open(struct inode *inode, struct file *filp);
+static int mshv_dev_release(struct inode *inode, struct file *filp);
+static int mshv_vp_release(struct inode *inode, struct file *filp);
+static long mshv_vp_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);
+static int mshv_partition_release(struct inode *inode, struct file *filp);
+static long mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);
+static int mshv_vp_mmap(struct file *file, struct vm_area_struct *vma);
+static vm_fault_t mshv_vp_fault(struct vm_fault *vmf);
+static int mshv_init_async_handler(struct mshv_partition *partition);
+static void mshv_async_hvcall_handler(void *data, u64 *status);
+
+static const union hv_input_vtl input_vtl_zero;
+static const union hv_input_vtl input_vtl_normal = {
+ .target_vtl = HV_NORMAL_VTL,
+ .use_target_vtl = 1,
+};
+
+static const struct vm_operations_struct mshv_vp_vm_ops = {
+ .fault = mshv_vp_fault,
+};
+
+static const struct file_operations mshv_vp_fops = {
+ .owner = THIS_MODULE,
+ .release = mshv_vp_release,
+ .unlocked_ioctl = mshv_vp_ioctl,
+ .llseek = noop_llseek,
+ .mmap = mshv_vp_mmap,
+};
+
+static const struct file_operations mshv_partition_fops = {
+ .owner = THIS_MODULE,
+ .release = mshv_partition_release,
+ .unlocked_ioctl = mshv_partition_ioctl,
+ .llseek = noop_llseek,
+};
+
+static const struct file_operations mshv_dev_fops = {
+ .owner = THIS_MODULE,
+ .open = mshv_dev_open,
+ .release = mshv_dev_release,
+ .unlocked_ioctl = mshv_dev_ioctl,
+ .llseek = noop_llseek,
+};
+
+static struct miscdevice mshv_dev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "mshv",
+ .fops = &mshv_dev_fops,
+ .mode = 0600,
+};
+
+/*
+ * Only allow hypercalls that have a u64 partition id as the first member of
+ * the input structure.
+ * These are sorted by value.
+ */
+static u16 mshv_passthru_hvcalls[] = {
+ HVCALL_GET_PARTITION_PROPERTY,
+ HVCALL_GET_PARTITION_PROPERTY_EX,
+ HVCALL_SET_PARTITION_PROPERTY,
+ HVCALL_INSTALL_INTERCEPT,
+ HVCALL_GET_VP_REGISTERS,
+ HVCALL_SET_VP_REGISTERS,
+ HVCALL_TRANSLATE_VIRTUAL_ADDRESS,
+ HVCALL_CLEAR_VIRTUAL_INTERRUPT,
+ HVCALL_REGISTER_INTERCEPT_RESULT,
+ HVCALL_ASSERT_VIRTUAL_INTERRUPT,
+ HVCALL_GET_GPA_PAGES_ACCESS_STATES,
+ HVCALL_SIGNAL_EVENT_DIRECT,
+ HVCALL_POST_MESSAGE_DIRECT,
+ HVCALL_GET_VP_CPUID_VALUES,
+};
+
+/*
+ * Only allow hypercalls that are safe to be called by the VMM with the host
+ * partition as target (i.e. HV_PARTITION_ID_SELF). Carefully audit that a
+ * hypercall cannot be misused by the VMM before adding it to this list.
+ */
+static u16 mshv_self_passthru_hvcalls[] = {
+ HVCALL_GET_PARTITION_PROPERTY,
+ HVCALL_GET_PARTITION_PROPERTY_EX,
+};
+
+static bool mshv_hvcall_is_async(u16 code)
+{
+ switch (code) {
+ case HVCALL_SET_PARTITION_PROPERTY:
+ return true;
+ default:
+ break;
+ }
+ return false;
+}
+
+static bool mshv_passthru_hvcall_allowed(u16 code, u64 pt_id)
+{
+ int i;
+ int n = ARRAY_SIZE(mshv_passthru_hvcalls);
+ u16 *allowed_hvcalls = mshv_passthru_hvcalls;
+
+ if (pt_id == HV_PARTITION_ID_SELF) {
+ n = ARRAY_SIZE(mshv_self_passthru_hvcalls);
+ allowed_hvcalls = mshv_self_passthru_hvcalls;
+ }
+
+ for (i = 0; i < n; ++i)
+ if (allowed_hvcalls[i] == code)
+ return true;
+
+ return false;
+}
+
+static int mshv_ioctl_passthru_hvcall(struct mshv_partition *partition,
+ bool partition_locked,
+ void __user *user_args)
+{
+ u64 status;
+ int ret = 0;
+ bool is_async;
+ struct mshv_root_hvcall args;
+ struct page *page;
+ unsigned int pages_order;
+ void *input_pg = NULL;
+ void *output_pg = NULL;
+ u16 reps_completed;
+ u64 pt_id = partition ? partition->pt_id : HV_PARTITION_ID_SELF;
+
+ if (copy_from_user(&args, user_args, sizeof(args)))
+ return -EFAULT;
+
+ if (args.status || !args.in_ptr || args.in_sz < sizeof(u64) ||
+ mshv_field_nonzero(args, rsvd) || args.in_sz > HV_HYP_PAGE_SIZE)
+ return -EINVAL;
+
+ if (args.out_ptr && (!args.out_sz || args.out_sz > HV_HYP_PAGE_SIZE))
+ return -EINVAL;
+
+ if (!mshv_passthru_hvcall_allowed(args.code, pt_id))
+ return -EINVAL;
+
+ is_async = mshv_hvcall_is_async(args.code);
+ if (is_async) {
+ /* async hypercalls can only be called from partition fd */
+ if (!partition || !partition_locked)
+ return -EINVAL;
+ ret = mshv_init_async_handler(partition);
+ if (ret)
+ return ret;
+ }
+
+ pages_order = args.out_ptr ? 1 : 0;
+ page = alloc_pages(GFP_KERNEL, pages_order);
+ if (!page)
+ return -ENOMEM;
+ input_pg = page_address(page);
+
+ if (args.out_ptr)
+ output_pg = (char *)input_pg + PAGE_SIZE;
+ else
+ output_pg = NULL;
+
+ if (copy_from_user(input_pg, (void __user *)args.in_ptr,
+ args.in_sz)) {
+ ret = -EFAULT;
+ goto free_pages_out;
+ }
+
+ /*
+ * NOTE: This only works because all the allowed hypercalls' input
+ * structs begin with a u64 partition_id field.
+ */
+ *(u64 *)input_pg = pt_id;
+
+ reps_completed = 0;
+ do {
+ if (args.reps) {
+ status = hv_do_rep_hypercall_ex(args.code, args.reps,
+ 0, reps_completed,
+ input_pg, output_pg);
+ reps_completed = hv_repcomp(status);
+ } else {
+ status = hv_do_hypercall(args.code, input_pg, output_pg);
+ }
+
+ if (hv_result(status) == HV_STATUS_CALL_PENDING) {
+ if (is_async) {
+ mshv_async_hvcall_handler(partition, &status);
+ } else { /* Paranoia check. This shouldn't happen! */
+ ret = -EBADFD;
+ goto free_pages_out;
+ }
+ }
+
+ if (hv_result_success(status))
+ break;
+
+ if (!hv_result_needs_memory(status))
+ ret = hv_result_to_errno(status);
+ else
+ ret = hv_deposit_memory(pt_id, status);
+ } while (!ret);
+
+ args.status = hv_result(status);
+ args.reps = reps_completed;
+ if (copy_to_user(user_args, &args, sizeof(args)))
+ ret = -EFAULT;
+
+ if (!ret && output_pg &&
+ copy_to_user((void __user *)args.out_ptr, output_pg, args.out_sz))
+ ret = -EFAULT;
+
+free_pages_out:
+ free_pages((unsigned long)input_pg, pages_order);
+
+ return ret;
+}
+
+static inline bool is_ghcb_mapping_available(void)
+{
+#if IS_ENABLED(CONFIG_X86_64)
+ return ms_hyperv.ext_features & HV_VP_GHCB_ROOT_MAPPING_AVAILABLE;
+#else
+ return 0;
+#endif
+}
+
+static int mshv_get_vp_registers(u32 vp_index, u64 partition_id, u16 count,
+ struct hv_register_assoc *registers)
+{
+ return hv_call_get_vp_registers(vp_index, partition_id,
+ count, input_vtl_zero, registers);
+}
+
+static int mshv_set_vp_registers(u32 vp_index, u64 partition_id, u16 count,
+ struct hv_register_assoc *registers)
+{
+ return hv_call_set_vp_registers(vp_index, partition_id,
+ count, input_vtl_zero, registers);
+}
+
+/*
+ * Explicit guest vCPU suspend is asynchronous by nature (as it is requested by
+ * dom0 vCPU for guest vCPU) and thus it can race with "intercept" suspend,
+ * done by the hypervisor.
+ * "Intercept" suspend leads to asynchronous message delivery to dom0 which
+ * should be awaited to keep the VP loop consistent (i.e. no message pending
+ * upon VP resume).
+ * VP intercept suspend can't be done when the VP is explicitly suspended
+ * already, and thus can be only two possible race scenarios:
+ * 1. implicit suspend bit set -> explicit suspend bit set -> message sent
+ * 2. implicit suspend bit set -> message sent -> explicit suspend bit set
+ * Checking for implicit suspend bit set after explicit suspend request has
+ * succeeded in either case allows us to reliably identify, if there is a
+ * message to receive and deliver to VMM.
+ */
+static int
+mshv_suspend_vp(const struct mshv_vp *vp, bool *message_in_flight)
+{
+ struct hv_register_assoc explicit_suspend = {
+ .name = HV_REGISTER_EXPLICIT_SUSPEND
+ };
+ struct hv_register_assoc intercept_suspend = {
+ .name = HV_REGISTER_INTERCEPT_SUSPEND
+ };
+ union hv_explicit_suspend_register *es =
+ &explicit_suspend.value.explicit_suspend;
+ union hv_intercept_suspend_register *is =
+ &intercept_suspend.value.intercept_suspend;
+ int ret;
+
+ es->suspended = 1;
+
+ ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
+ 1, &explicit_suspend);
+ if (ret) {
+ vp_err(vp, "Failed to explicitly suspend vCPU\n");
+ return ret;
+ }
+
+ ret = mshv_get_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
+ 1, &intercept_suspend);
+ if (ret) {
+ vp_err(vp, "Failed to get intercept suspend state\n");
+ return ret;
+ }
+
+ *message_in_flight = is->suspended;
+
+ return 0;
+}
+
+/*
+ * This function is used when VPs are scheduled by the hypervisor's
+ * scheduler.
+ *
+ * Caller has to make sure the registers contain cleared
+ * HV_REGISTER_INTERCEPT_SUSPEND and HV_REGISTER_EXPLICIT_SUSPEND registers
+ * exactly in this order (the hypervisor clears them sequentially) to avoid
+ * potential invalid clearing a newly arrived HV_REGISTER_INTERCEPT_SUSPEND
+ * after VP is released from HV_REGISTER_EXPLICIT_SUSPEND in case of the
+ * opposite order.
+ */
+static long mshv_run_vp_with_hyp_scheduler(struct mshv_vp *vp)
+{
+ long ret;
+ struct hv_register_assoc suspend_regs[2] = {
+ { .name = HV_REGISTER_INTERCEPT_SUSPEND },
+ { .name = HV_REGISTER_EXPLICIT_SUSPEND }
+ };
+ size_t count = ARRAY_SIZE(suspend_regs);
+
+ /* Resume VP execution */
+ ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
+ count, suspend_regs);
+ if (ret) {
+ vp_err(vp, "Failed to resume vp execution. %lx\n", ret);
+ return ret;
+ }
+
+ ret = wait_event_interruptible(vp->run.vp_suspend_queue,
+ vp->run.kicked_by_hv == 1);
+ if (ret) {
+ bool message_in_flight;
+
+ /*
+ * Otherwise the waiting was interrupted by a signal: suspend
+ * the vCPU explicitly and copy message in flight (if any).
+ */
+ ret = mshv_suspend_vp(vp, &message_in_flight);
+ if (ret)
+ return ret;
+
+ /* Return if no message in flight */
+ if (!message_in_flight)
+ return -EINTR;
+
+ /* Wait for the message in flight. */
+ wait_event(vp->run.vp_suspend_queue, vp->run.kicked_by_hv == 1);
+ }
+
+ /*
+ * Reset the flag to make the wait_event call above work
+ * next time.
+ */
+ vp->run.kicked_by_hv = 0;
+
+ return 0;
+}
+
+static int
+mshv_vp_dispatch(struct mshv_vp *vp, u32 flags,
+ struct hv_output_dispatch_vp *res)
+{
+ struct hv_input_dispatch_vp *input;
+ struct hv_output_dispatch_vp *output;
+ u64 status;
+
+ preempt_disable();
+ input = *this_cpu_ptr(root_scheduler_input);
+ output = *this_cpu_ptr(root_scheduler_output);
+
+ memset(input, 0, sizeof(*input));
+ memset(output, 0, sizeof(*output));
+
+ input->partition_id = vp->vp_partition->pt_id;
+ input->vp_index = vp->vp_index;
+ input->time_slice = 0; /* Run forever until something happens */
+ input->spec_ctrl = 0; /* TODO: set sensible flags */
+ input->flags = flags;
+
+ vp->run.flags.root_sched_dispatched = 1;
+ status = hv_do_hypercall(HVCALL_DISPATCH_VP, input, output);
+ vp->run.flags.root_sched_dispatched = 0;
+
+ trace_mshv_hvcall_dispatch_vp(vp->vp_partition->pt_id,
+ vp->vp_index, flags,
+ output->dispatch_state,
+ output->dispatch_event,
+#if defined(CONFIG_X86_64)
+ vp->vp_register_page->interrupt_vectors.as_uint64,
+#else
+ 0,
+#endif
+ status);
+
+ *res = *output;
+ preempt_enable();
+
+ if (!hv_result_success(status))
+ vp_err(vp, "%s: status %s\n", __func__,
+ hv_result_to_string(status));
+
+ return hv_result_to_errno(status);
+}
+
+static int
+mshv_vp_clear_explicit_suspend(struct mshv_vp *vp)
+{
+ struct hv_register_assoc explicit_suspend = {
+ .name = HV_REGISTER_EXPLICIT_SUSPEND,
+ .value.explicit_suspend.suspended = 0,
+ };
+ int ret;
+
+ ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
+ 1, &explicit_suspend);
+
+ trace_mshv_vp_clear_explicit_suspend(vp->vp_partition->pt_id,
+ vp->vp_index, ret);
+
+ if (ret)
+ vp_err(vp, "Failed to unsuspend\n");
+
+ return ret;
+}
+
+#if IS_ENABLED(CONFIG_X86_64)
+static u64 mshv_vp_interrupt_pending(struct mshv_vp *vp)
+{
+ if (!vp->vp_register_page)
+ return 0;
+ return vp->vp_register_page->interrupt_vectors.as_uint64;
+}
+#else
+static u64 mshv_vp_interrupt_pending(struct mshv_vp *vp)
+{
+ return 0;
+}
+#endif
+
+static bool mshv_vp_dispatch_thread_blocked(struct mshv_vp *vp)
+{
+ struct hv_stats_page **stats = vp->vp_stats_pages;
+ u64 *self_vp_cntrs = stats[HV_STATS_AREA_SELF]->data;
+ u64 *parent_vp_cntrs = stats[HV_STATS_AREA_PARENT]->data;
+
+ return parent_vp_cntrs[HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED] ||
+ self_vp_cntrs[HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED];
+}
+
+static int
+mshv_vp_wait_for_hv_kick(struct mshv_vp *vp)
+{
+ int ret;
+
+ ret = wait_event_interruptible(vp->run.vp_suspend_queue,
+ (vp->run.kicked_by_hv == 1 &&
+ !mshv_vp_dispatch_thread_blocked(vp)) ||
+ mshv_vp_interrupt_pending(vp));
+ if (ret)
+ return -EINTR;
+
+ trace_mshv_vp_wait_for_hv_kick(vp->vp_partition->pt_id,
+ vp->vp_index,
+ vp->run.kicked_by_hv,
+ mshv_vp_dispatch_thread_blocked(vp),
+ mshv_vp_interrupt_pending(vp));
+
+ vp->run.flags.root_sched_blocked = 0;
+ vp->run.kicked_by_hv = 0;
+
+ return 0;
+}
+
+/* Must be called with interrupts enabled */
+static long mshv_run_vp_with_root_scheduler(struct mshv_vp *vp)
+{
+ long ret;
+
+ if (vp->run.flags.root_sched_blocked) {
+ /*
+ * Dispatch state of this VP is blocked. Need to wait
+ * for the hypervisor to clear the blocked state before
+ * dispatching it.
+ */
+ ret = mshv_vp_wait_for_hv_kick(vp);
+ if (ret)
+ return ret;
+ }
+
+ do {
+ u32 flags = 0;
+ struct hv_output_dispatch_vp output;
+
+ if (__xfer_to_guest_mode_work_pending()) {
+ ret = xfer_to_guest_mode_handle_work();
+
+ trace_mshv_xfer_to_guest_mode_work(vp->vp_partition->pt_id,
+ vp->vp_index,
+ read_thread_flags(),
+ ret);
+
+ if (ret)
+ break;
+ }
+
+ if (vp->run.flags.intercept_suspend)
+ flags |= HV_DISPATCH_VP_FLAG_CLEAR_INTERCEPT_SUSPEND;
+
+ if (mshv_vp_interrupt_pending(vp))
+ flags |= HV_DISPATCH_VP_FLAG_SCAN_INTERRUPT_INJECTION;
+
+ ret = mshv_vp_dispatch(vp, flags, &output);
+ if (ret)
+ break;
+
+ vp->run.flags.intercept_suspend = 0;
+
+ if (output.dispatch_state == HV_VP_DISPATCH_STATE_BLOCKED) {
+ if (output.dispatch_event ==
+ HV_VP_DISPATCH_EVENT_SUSPEND) {
+ /*
+ * TODO: remove the warning once VP canceling
+ * is supported
+ */
+ WARN_ONCE(atomic64_read(&vp->run.vp_signaled_count),
+ "%s: vp#%d: unexpected explicit suspend\n",
+ __func__, vp->vp_index);
+ /*
+ * Need to clear explicit suspend before
+ * dispatching.
+ * Explicit suspend is either:
+ * - set right after the first VP dispatch or
+ * - set explicitly via hypercall
+ * Since the latter case is not yet supported,
+ * simply clear it here.
+ */
+ ret = mshv_vp_clear_explicit_suspend(vp);
+ if (ret)
+ break;
+
+ ret = mshv_vp_wait_for_hv_kick(vp);
+ if (ret)
+ break;
+ } else {
+ vp->run.flags.root_sched_blocked = 1;
+ ret = mshv_vp_wait_for_hv_kick(vp);
+ if (ret)
+ break;
+ }
+ } else {
+ /* HV_VP_DISPATCH_STATE_READY */
+ if (output.dispatch_event ==
+ HV_VP_DISPATCH_EVENT_INTERCEPT)
+ vp->run.flags.intercept_suspend = 1;
+ }
+ } while (!vp->run.flags.intercept_suspend);
+
+ rseq_virt_userspace_exit();
+
+ return ret;
+}
+
+static_assert(sizeof(struct hv_message) <= MSHV_RUN_VP_BUF_SZ,
+ "sizeof(struct hv_message) must not exceed MSHV_RUN_VP_BUF_SZ");
+
+static struct mshv_mem_region *
+mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn)
+{
+ struct mshv_mem_region *region;
+
+ hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) {
+ if (gfn >= region->start_gfn &&
+ gfn < region->start_gfn + region->nr_pages)
+ return region;
+ }
+
+ return NULL;
+}
+
+static struct mshv_mem_region *
+mshv_partition_region_by_gfn_get(struct mshv_partition *p, u64 gfn)
+{
+ struct mshv_mem_region *region;
+
+ spin_lock(&p->pt_mem_regions_lock);
+ region = mshv_partition_region_by_gfn(p, gfn);
+ if (!region || !mshv_region_get(region)) {
+ spin_unlock(&p->pt_mem_regions_lock);
+ return NULL;
+ }
+ spin_unlock(&p->pt_mem_regions_lock);
+
+ return region;
+}
+
+/**
+ * mshv_handle_gpa_intercept - Handle GPA (Guest Physical Address) intercepts.
+ * @vp: Pointer to the virtual processor structure.
+ *
+ * This function processes GPA intercepts by identifying the memory region
+ * corresponding to the intercepted GPA, aligning the page offset, and
+ * mapping the required pages. It ensures that the region is valid and
+ * handles faults efficiently by mapping multiple pages at once.
+ *
+ * Return: true if the intercept was handled successfully, false otherwise.
+ */
+static bool mshv_handle_gpa_intercept(struct mshv_vp *vp)
+{
+ struct mshv_partition *p = vp->vp_partition;
+ struct mshv_mem_region *region;
+ bool ret = false;
+ u64 gfn;
+#if defined(CONFIG_X86_64)
+ struct hv_x64_memory_intercept_message *msg =
+ (struct hv_x64_memory_intercept_message *)
+ vp->vp_intercept_msg_page->u.payload;
+#elif defined(CONFIG_ARM64)
+ struct hv_arm64_memory_intercept_message *msg =
+ (struct hv_arm64_memory_intercept_message *)
+ vp->vp_intercept_msg_page->u.payload;
+#endif
+ enum hv_intercept_access_type access_type =
+ msg->header.intercept_access_type;
+
+ gfn = HVPFN_DOWN(msg->guest_physical_address);
+
+ region = mshv_partition_region_by_gfn_get(p, gfn);
+ if (!region)
+ goto out;
+
+ if (access_type == HV_INTERCEPT_ACCESS_WRITE &&
+ !(region->hv_map_flags & HV_MAP_GPA_WRITABLE))
+ goto put_region;
+
+ if (access_type == HV_INTERCEPT_ACCESS_EXECUTE &&
+ !(region->hv_map_flags & HV_MAP_GPA_EXECUTABLE))
+ goto put_region;
+
+ /* Only movable memory ranges are supported for GPA intercepts */
+ if (region->mreg_type == MSHV_REGION_TYPE_MEM_MOVABLE)
+ ret = mshv_region_handle_gfn_fault(region, gfn);
+
+put_region:
+ mshv_region_put(region);
+out:
+ trace_mshv_handle_gpa_intercept(p->pt_id, vp->vp_index, gfn,
+ access_type, ret);
+ return ret;
+}
+
+static bool mshv_vp_handle_intercept(struct mshv_vp *vp)
+{
+ switch (vp->vp_intercept_msg_page->header.message_type) {
+ case HVMSG_GPA_INTERCEPT:
+ return mshv_handle_gpa_intercept(vp);
+ }
+ return false;
+}
+
+static long mshv_vp_ioctl_run_vp(struct mshv_vp *vp, void __user *ret_msg)
+{
+ long rc;
+
+ trace_mshv_run_vp_entry(vp->vp_partition->pt_id, vp->vp_index);
+
+ do {
+ if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
+ rc = mshv_run_vp_with_root_scheduler(vp);
+ else
+ rc = mshv_run_vp_with_hyp_scheduler(vp);
+ } while (rc == 0 && mshv_vp_handle_intercept(vp));
+
+ trace_mshv_run_vp_exit(vp->vp_partition->pt_id, vp->vp_index,
+ vp->vp_intercept_msg_page->header.message_type,
+ rc);
+
+ if (rc)
+ return rc;
+
+ if (copy_to_user(ret_msg, vp->vp_intercept_msg_page,
+ sizeof(struct hv_message)))
+ rc = -EFAULT;
+
+ return rc;
+}
+
+static int
+mshv_vp_ioctl_get_set_state_pfn(struct mshv_vp *vp,
+ struct hv_vp_state_data state_data,
+ unsigned long user_pfn, size_t page_count,
+ bool is_set)
+{
+ int completed, ret = 0;
+ unsigned long check;
+ struct page **pages;
+
+ if (page_count > INT_MAX)
+ return -EINVAL;
+ /*
+ * Check the arithmetic for wraparound/overflow.
+ * The last page address in the buffer is:
+ * (user_pfn + (page_count - 1)) * PAGE_SIZE
+ */
+ if (check_add_overflow(user_pfn, (page_count - 1), &check))
+ return -EOVERFLOW;
+ if (check_mul_overflow(check, PAGE_SIZE, &check))
+ return -EOVERFLOW;
+
+ /* Pin user pages so hypervisor can copy directly to them */
+ pages = kzalloc_objs(struct page *, page_count);
+ if (!pages)
+ return -ENOMEM;
+
+ for (completed = 0; completed < page_count; completed += ret) {
+ unsigned long user_addr = (user_pfn + completed) * PAGE_SIZE;
+ int remaining = page_count - completed;
+
+ ret = pin_user_pages_fast(user_addr, remaining, FOLL_WRITE,
+ &pages[completed]);
+ if (ret < 0) {
+ vp_err(vp, "%s: Failed to pin user pages error %i\n",
+ __func__, ret);
+ goto unpin_pages;
+ }
+ }
+
+ if (is_set)
+ ret = hv_call_set_vp_state(vp->vp_index,
+ vp->vp_partition->pt_id,
+ state_data, page_count, pages,
+ 0, NULL);
+ else
+ ret = hv_call_get_vp_state(vp->vp_index,
+ vp->vp_partition->pt_id,
+ state_data, page_count, pages,
+ NULL);
+
+unpin_pages:
+ unpin_user_pages(pages, completed);
+ kfree(pages);
+ return ret;
+}
+
+static long
+mshv_vp_ioctl_get_set_state(struct mshv_vp *vp,
+ struct mshv_get_set_vp_state __user *user_args,
+ bool is_set)
+{
+ struct mshv_get_set_vp_state args;
+ long ret = 0;
+ union hv_output_get_vp_state vp_state;
+ u32 data_sz;
+ struct hv_vp_state_data state_data = {};
+
+ if (copy_from_user(&args, user_args, sizeof(args)))
+ return -EFAULT;
+
+ if (args.type >= MSHV_VP_STATE_COUNT || mshv_field_nonzero(args, rsvd) ||
+ !args.buf_sz || !PAGE_ALIGNED(args.buf_sz) ||
+ !PAGE_ALIGNED(args.buf_ptr))
+ return -EINVAL;
+
+ if (!access_ok((void __user *)args.buf_ptr, args.buf_sz))
+ return -EFAULT;
+
+ switch (args.type) {
+ case MSHV_VP_STATE_LAPIC:
+ state_data.type = HV_GET_SET_VP_STATE_LAPIC_STATE;
+ data_sz = HV_HYP_PAGE_SIZE;
+ break;
+ case MSHV_VP_STATE_XSAVE:
+ {
+ u64 data_sz_64;
+
+ ret = hv_call_get_partition_property(vp->vp_partition->pt_id,
+ HV_PARTITION_PROPERTY_XSAVE_STATES,
+ &state_data.xsave.states.as_uint64);
+ if (ret)
+ return ret;
+
+ ret = hv_call_get_partition_property(vp->vp_partition->pt_id,
+ HV_PARTITION_PROPERTY_MAX_XSAVE_DATA_SIZE,
+ &data_sz_64);
+ if (ret)
+ return ret;
+
+ data_sz = (u32)data_sz_64;
+ state_data.xsave.flags = 0;
+ /* Always request legacy states */
+ state_data.xsave.states.legacy_x87 = 1;
+ state_data.xsave.states.legacy_sse = 1;
+ state_data.type = HV_GET_SET_VP_STATE_XSAVE;
+ break;
+ }
+ case MSHV_VP_STATE_SIMP:
+ state_data.type = HV_GET_SET_VP_STATE_SIM_PAGE;
+ data_sz = HV_HYP_PAGE_SIZE;
+ break;
+ case MSHV_VP_STATE_SIEFP:
+ state_data.type = HV_GET_SET_VP_STATE_SIEF_PAGE;
+ data_sz = HV_HYP_PAGE_SIZE;
+ break;
+ case MSHV_VP_STATE_SYNTHETIC_TIMERS:
+ state_data.type = HV_GET_SET_VP_STATE_SYNTHETIC_TIMERS;
+ data_sz = sizeof(vp_state.synthetic_timers_state);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (copy_to_user(&user_args->buf_sz, &data_sz, sizeof(user_args->buf_sz)))
+ return -EFAULT;
+
+ if (data_sz > args.buf_sz)
+ return -EINVAL;
+
+ /* If the data is transmitted via pfns, delegate to helper */
+ if (state_data.type & HV_GET_SET_VP_STATE_TYPE_PFN) {
+ unsigned long user_pfn = PFN_DOWN(args.buf_ptr);
+ size_t page_count = PFN_DOWN(args.buf_sz);
+
+ return mshv_vp_ioctl_get_set_state_pfn(vp, state_data, user_pfn,
+ page_count, is_set);
+ }
+
+ /* Paranoia check - this shouldn't happen! */
+ if (data_sz > sizeof(vp_state)) {
+ vp_err(vp, "Invalid vp state data size!\n");
+ return -EINVAL;
+ }
+
+ if (is_set) {
+ if (copy_from_user(&vp_state, (__user void *)args.buf_ptr, data_sz))
+ return -EFAULT;
+
+ return hv_call_set_vp_state(vp->vp_index,
+ vp->vp_partition->pt_id,
+ state_data, 0, NULL,
+ sizeof(vp_state), (u8 *)&vp_state);
+ }
+
+ ret = hv_call_get_vp_state(vp->vp_index, vp->vp_partition->pt_id,
+ state_data, 0, NULL, &vp_state);
+ if (ret)
+ return ret;
+
+ if (copy_to_user((void __user *)args.buf_ptr, &vp_state, data_sz))
+ return -EFAULT;
+
+ return 0;
+}
+
+static long
+mshv_vp_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
+{
+ struct mshv_vp *vp = filp->private_data;
+ long r = -ENOTTY;
+
+ if (mutex_lock_killable(&vp->vp_mutex))
+ return -EINTR;
+
+ switch (ioctl) {
+ case MSHV_RUN_VP:
+ r = mshv_vp_ioctl_run_vp(vp, (void __user *)arg);
+ break;
+ case MSHV_GET_VP_STATE:
+ r = mshv_vp_ioctl_get_set_state(vp, (void __user *)arg, false);
+ break;
+ case MSHV_SET_VP_STATE:
+ r = mshv_vp_ioctl_get_set_state(vp, (void __user *)arg, true);
+ break;
+ case MSHV_ROOT_HVCALL:
+ r = mshv_ioctl_passthru_hvcall(vp->vp_partition, false,
+ (void __user *)arg);
+ break;
+ default:
+ vp_warn(vp, "Invalid ioctl: %#x\n", ioctl);
+ break;
+ }
+ mutex_unlock(&vp->vp_mutex);
+
+ return r;
+}
+
+static vm_fault_t mshv_vp_fault(struct vm_fault *vmf)
+{
+ struct mshv_vp *vp = vmf->vma->vm_file->private_data;
+
+ switch (vmf->vma->vm_pgoff) {
+ case MSHV_VP_MMAP_OFFSET_REGISTERS:
+ vmf->page = virt_to_page(vp->vp_register_page);
+ break;
+ case MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE:
+ vmf->page = virt_to_page(vp->vp_intercept_msg_page);
+ break;
+ case MSHV_VP_MMAP_OFFSET_GHCB:
+ vmf->page = virt_to_page(vp->vp_ghcb_page);
+ break;
+ default:
+ return VM_FAULT_SIGBUS;
+ }
+
+ get_page(vmf->page);
+
+ return 0;
+}
+
+static int mshv_vp_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct mshv_vp *vp = file->private_data;
+
+ switch (vma->vm_pgoff) {
+ case MSHV_VP_MMAP_OFFSET_REGISTERS:
+ if (!vp->vp_register_page)
+ return -ENODEV;
+ break;
+ case MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE:
+ if (!vp->vp_intercept_msg_page)
+ return -ENODEV;
+ break;
+ case MSHV_VP_MMAP_OFFSET_GHCB:
+ if (!vp->vp_ghcb_page)
+ return -ENODEV;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ vma->vm_ops = &mshv_vp_vm_ops;
+ return 0;
+}
+
+static int
+mshv_vp_release(struct inode *inode, struct file *filp)
+{
+ struct mshv_vp *vp = filp->private_data;
+
+ trace_mshv_vp_release(vp->vp_partition->pt_id, vp->vp_index);
+
+ /* Rest of VP cleanup happens in destroy_partition() */
+ mshv_partition_put(vp->vp_partition);
+ return 0;
+}
+
+void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index,
+ struct hv_stats_page *stats_pages[])
+{
+ union hv_stats_object_identity identity = {
+ .vp.partition_id = partition_id,
+ .vp.vp_index = vp_index,
+ };
+ int err;
+
+ identity.vp.stats_area_type = HV_STATS_AREA_SELF;
+ err = hv_unmap_stats_page(HV_STATS_OBJECT_VP,
+ stats_pages[HV_STATS_AREA_SELF],
+ &identity);
+ if (err)
+ pr_err("%s: failed to unmap partition %llu vp %u self stats, err: %d\n",
+ __func__, partition_id, vp_index, err);
+
+ if (stats_pages[HV_STATS_AREA_PARENT] != stats_pages[HV_STATS_AREA_SELF]) {
+ identity.vp.stats_area_type = HV_STATS_AREA_PARENT;
+ err = hv_unmap_stats_page(HV_STATS_OBJECT_VP,
+ stats_pages[HV_STATS_AREA_PARENT],
+ &identity);
+ if (err)
+ pr_err("%s: failed to unmap partition %llu vp %u parent stats, err: %d\n",
+ __func__, partition_id, vp_index, err);
+ }
+}
+
+int mshv_vp_stats_map(u64 partition_id, u32 vp_index,
+ struct hv_stats_page *stats_pages[])
+{
+ union hv_stats_object_identity identity = {
+ .vp.partition_id = partition_id,
+ .vp.vp_index = vp_index,
+ };
+ int err;
+
+ identity.vp.stats_area_type = HV_STATS_AREA_SELF;
+ err = hv_map_stats_page(HV_STATS_OBJECT_VP, &identity,
+ &stats_pages[HV_STATS_AREA_SELF]);
+ if (err) {
+ pr_err("%s: failed to map partition %llu vp %u self stats, err: %d\n",
+ __func__, partition_id, vp_index, err);
+ return err;
+ }
+
+ /*
+ * L1VH partition cannot access its vp stats in parent area.
+ */
+ if (is_l1vh_parent(partition_id)) {
+ stats_pages[HV_STATS_AREA_PARENT] = stats_pages[HV_STATS_AREA_SELF];
+ } else {
+ identity.vp.stats_area_type = HV_STATS_AREA_PARENT;
+ err = hv_map_stats_page(HV_STATS_OBJECT_VP, &identity,
+ &stats_pages[HV_STATS_AREA_PARENT]);
+ if (err) {
+ pr_err("%s: failed to map partition %llu vp %u parent stats, err: %d\n",
+ __func__, partition_id, vp_index, err);
+ goto unmap_self;
+ }
+ if (!stats_pages[HV_STATS_AREA_PARENT])
+ stats_pages[HV_STATS_AREA_PARENT] = stats_pages[HV_STATS_AREA_SELF];
+ }
+
+ return 0;
+
+unmap_self:
+ identity.vp.stats_area_type = HV_STATS_AREA_SELF;
+ hv_unmap_stats_page(HV_STATS_OBJECT_VP,
+ stats_pages[HV_STATS_AREA_SELF],
+ &identity);
+ return err;
+}
+
+static long
+mshv_partition_ioctl_create_vp(struct mshv_partition *partition,
+ void __user *arg)
+{
+ struct mshv_create_vp args;
+ struct mshv_vp *vp;
+ struct page *intercept_msg_page, *register_page, *ghcb_page;
+ struct hv_stats_page *stats_pages[2];
+ long ret;
+
+ if (copy_from_user(&args, arg, sizeof(args)))
+ return -EFAULT;
+
+ if (args.vp_index >= MSHV_MAX_VPS)
+ return -EINVAL;
+
+ if (partition->pt_vp_array[args.vp_index])
+ return -EEXIST;
+
+ ret = hv_call_create_vp(NUMA_NO_NODE, partition->pt_id, args.vp_index,
+ 0 /* Only valid for root partition VPs */);
+ if (ret)
+ return ret;
+
+ ret = hv_map_vp_state_page(partition->pt_id, args.vp_index,
+ HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
+ input_vtl_zero, &intercept_msg_page);
+ if (ret)
+ goto destroy_vp;
+
+ if (!mshv_partition_encrypted(partition)) {
+ ret = hv_map_vp_state_page(partition->pt_id, args.vp_index,
+ HV_VP_STATE_PAGE_REGISTERS,
+ input_vtl_zero, &register_page);
+ if (ret)
+ goto unmap_intercept_message_page;
+ }
+
+ if (mshv_partition_encrypted(partition) &&
+ is_ghcb_mapping_available()) {
+ ret = hv_map_vp_state_page(partition->pt_id, args.vp_index,
+ HV_VP_STATE_PAGE_GHCB,
+ input_vtl_normal, &ghcb_page);
+ if (ret)
+ goto unmap_register_page;
+ }
+
+ ret = mshv_vp_stats_map(partition->pt_id, args.vp_index,
+ stats_pages);
+ if (ret)
+ goto unmap_ghcb_page;
+
+ vp = kzalloc_obj(*vp);
+ if (!vp)
+ goto unmap_stats_pages;
+
+ vp->vp_partition = mshv_partition_get(partition);
+ if (!vp->vp_partition) {
+ ret = -EBADF;
+ goto free_vp;
+ }
+
+ mutex_init(&vp->vp_mutex);
+ init_waitqueue_head(&vp->run.vp_suspend_queue);
+ atomic64_set(&vp->run.vp_signaled_count, 0);
+
+ vp->vp_index = args.vp_index;
+ vp->vp_intercept_msg_page = page_to_virt(intercept_msg_page);
+ if (!mshv_partition_encrypted(partition))
+ vp->vp_register_page = page_to_virt(register_page);
+
+ if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available())
+ vp->vp_ghcb_page = page_to_virt(ghcb_page);
+
+ memcpy(vp->vp_stats_pages, stats_pages, sizeof(stats_pages));
+
+ ret = mshv_debugfs_vp_create(vp);
+ if (ret)
+ goto put_partition;
+
+ /*
+ * Keep anon_inode_getfd last: it installs fd in the file struct and
+ * thus makes the state accessible in user space.
+ */
+ ret = anon_inode_getfd("mshv_vp", &mshv_vp_fops, vp,
+ O_RDWR | O_CLOEXEC);
+ if (ret < 0)
+ goto remove_debugfs_vp;
+
+ /* already exclusive with the partition mutex for all ioctls */
+ partition->pt_vp_count++;
+ partition->pt_vp_array[args.vp_index] = vp;
+
+ goto out;
+
+remove_debugfs_vp:
+ mshv_debugfs_vp_remove(vp);
+put_partition:
+ mshv_partition_put(partition);
+free_vp:
+ kfree(vp);
+unmap_stats_pages:
+ mshv_vp_stats_unmap(partition->pt_id, args.vp_index, stats_pages);
+unmap_ghcb_page:
+ if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available())
+ hv_unmap_vp_state_page(partition->pt_id, args.vp_index,
+ HV_VP_STATE_PAGE_GHCB, ghcb_page,
+ input_vtl_normal);
+unmap_register_page:
+ if (!mshv_partition_encrypted(partition))
+ hv_unmap_vp_state_page(partition->pt_id, args.vp_index,
+ HV_VP_STATE_PAGE_REGISTERS,
+ register_page, input_vtl_zero);
+unmap_intercept_message_page:
+ hv_unmap_vp_state_page(partition->pt_id, args.vp_index,
+ HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
+ intercept_msg_page, input_vtl_zero);
+destroy_vp:
+ hv_call_delete_vp(partition->pt_id, args.vp_index);
+out:
+ trace_mshv_create_vp(partition->pt_id, args.vp_index, ret);
+ return ret;
+}
+
+static int mshv_init_async_handler(struct mshv_partition *partition)
+{
+ if (completion_done(&partition->async_hypercall)) {
+ pt_err(partition,
+ "Cannot issue async hypercall while another one in progress!\n");
+ return -EPERM;
+ }
+
+ reinit_completion(&partition->async_hypercall);
+ return 0;
+}
+
+static void mshv_async_hvcall_handler(void *data, u64 *status)
+{
+ struct mshv_partition *partition = data;
+
+ wait_for_completion(&partition->async_hypercall);
+ pt_dbg(partition, "Async hypercall completed!\n");
+
+ *status = partition->async_hypercall_status;
+}
+
+/*
+ * NB: caller checks and makes sure mem->size is page aligned
+ * Returns: 0 with regionpp updated on success, or -errno
+ */
+static int mshv_partition_create_region(struct mshv_partition *partition,
+ struct mshv_user_mem_region *mem,
+ struct mshv_mem_region **regionpp,
+ bool is_mmio)
+{
+ struct mshv_mem_region *rg;
+ u64 nr_pages = HVPFN_DOWN(mem->size);
+
+ /* Reject overlapping regions */
+ spin_lock(&partition->pt_mem_regions_lock);
+ hlist_for_each_entry(rg, &partition->pt_mem_regions, hnode) {
+ if (mem->guest_pfn + nr_pages <= rg->start_gfn ||
+ rg->start_gfn + rg->nr_pages <= mem->guest_pfn)
+ continue;
+ spin_unlock(&partition->pt_mem_regions_lock);
+ return -EEXIST;
+ }
+ spin_unlock(&partition->pt_mem_regions_lock);
+
+ rg = mshv_region_create(mem->guest_pfn, nr_pages,
+ mem->userspace_addr, mem->flags);
+ if (IS_ERR(rg))
+ return PTR_ERR(rg);
+
+ if (is_mmio)
+ rg->mreg_type = MSHV_REGION_TYPE_MMIO;
+ else if (mshv_partition_encrypted(partition) ||
+ !mshv_region_movable_init(rg))
+ rg->mreg_type = MSHV_REGION_TYPE_MEM_PINNED;
+ else
+ rg->mreg_type = MSHV_REGION_TYPE_MEM_MOVABLE;
+
+ rg->partition = partition;
+
+ *regionpp = rg;
+
+ return 0;
+}
+
+/**
+ * mshv_prepare_pinned_region - Pin and map memory regions
+ * @region: Pointer to the memory region structure
+ *
+ * This function processes memory regions that are explicitly marked as pinned.
+ * Pinned regions are preallocated, mapped upfront, and do not rely on fault-based
+ * population. The function ensures the region is properly populated, handles
+ * encryption requirements for SNP partitions if applicable, maps the region,
+ * and performs necessary sharing or eviction operations based on the mapping
+ * result.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+static int mshv_prepare_pinned_region(struct mshv_mem_region *region)
+{
+ struct mshv_partition *partition = region->partition;
+ int ret;
+
+ ret = mshv_region_pin(region);
+ if (ret) {
+ pt_err(partition, "Failed to pin memory region: %d\n",
+ ret);
+ goto err_out;
+ }
+
+ /*
+ * For an SNP partition it is a requirement that for every memory region
+ * that we are going to map for this partition we should make sure that
+ * host access to that region is released. This is ensured by doing an
+ * additional hypercall which will update the SLAT to release host
+ * access to guest memory regions.
+ */
+ if (mshv_partition_encrypted(partition)) {
+ ret = mshv_region_unshare(region);
+ if (ret) {
+ pt_err(partition,
+ "Failed to unshare memory region (guest_pfn: %llu): %d\n",
+ region->start_gfn, ret);
+ goto invalidate_region;
+ }
+ }
+
+ ret = mshv_region_map(region);
+ if (ret && mshv_partition_encrypted(partition)) {
+ int shrc;
+
+ shrc = mshv_region_share(region);
+ if (!shrc)
+ goto invalidate_region;
+
+ pt_err(partition,
+ "Failed to share memory region (guest_pfn: %llu): %d\n",
+ region->start_gfn, shrc);
+ /*
+ * Don't unpin if marking shared failed because pages are no
+ * longer mapped in the host, ie root, anymore.
+ */
+ goto err_out;
+ }
+
+ return 0;
+
+invalidate_region:
+ mshv_region_invalidate(region);
+err_out:
+ return ret;
+}
+
+/*
+ * This maps two things: guest RAM and for pci passthru mmio space.
+ *
+ * mmio:
+ * - vfio overloads vm_pgoff to store the mmio start pfn/spa.
+ * - Two things need to happen for mapping mmio range:
+ * 1. mapped in the uaddr so VMM can access it.
+ * 2. mapped in the hwpt (gfn <-> mmio phys addr) so guest can access it.
+ *
+ * This function takes care of the second. The first one is managed by vfio,
+ * and hence is taken care of via vfio_pci_mmap_fault().
+ */
+static long
+mshv_map_user_memory(struct mshv_partition *partition,
+ struct mshv_user_mem_region *mem)
+{
+ struct mshv_mem_region *region;
+ struct vm_area_struct *vma;
+ bool is_mmio;
+ ulong mmio_pfn;
+ long ret;
+
+ if (mem->flags & BIT(MSHV_SET_MEM_BIT_UNMAP) ||
+ !access_ok((const void __user *)mem->userspace_addr, mem->size))
+ return -EINVAL;
+
+ mmap_read_lock(current->mm);
+ vma = vma_lookup(current->mm, mem->userspace_addr);
+ is_mmio = vma ? !!(vma->vm_flags & (VM_IO | VM_PFNMAP)) : 0;
+ mmio_pfn = is_mmio ? vma->vm_pgoff : 0;
+ mmap_read_unlock(current->mm);
+
+ if (!vma)
+ return -EINVAL;
+
+ ret = mshv_partition_create_region(partition, mem, &region,
+ is_mmio);
+ if (ret)
+ return ret;
+
+ switch (region->mreg_type) {
+ case MSHV_REGION_TYPE_MEM_PINNED:
+ ret = mshv_prepare_pinned_region(region);
+ break;
+ case MSHV_REGION_TYPE_MEM_MOVABLE:
+ /*
+ * For movable memory regions, remap with no access to let
+ * the hypervisor track dirty pages, enabling pre-copy live
+ * migration.
+ */
+ ret = hv_call_map_gpa_pages(partition->pt_id,
+ region->start_gfn,
+ region->nr_pages,
+ HV_MAP_GPA_NO_ACCESS, NULL);
+ break;
+ case MSHV_REGION_TYPE_MMIO:
+ ret = hv_call_map_mmio_pages(partition->pt_id,
+ region->start_gfn,
+ mmio_pfn,
+ region->nr_pages);
+ break;
+ }
+
+ trace_mshv_map_user_memory(partition->pt_id, region->start_uaddr,
+ region->start_gfn, region->nr_pages,
+ region->hv_map_flags, ret);
+
+ if (ret)
+ goto errout;
+
+ spin_lock(&partition->pt_mem_regions_lock);
+ hlist_add_head(&region->hnode, &partition->pt_mem_regions);
+ spin_unlock(&partition->pt_mem_regions_lock);
+
+ return 0;
+
+errout:
+ mshv_region_put(region);
+ return ret;
+}
+
+/* Called for unmapping both the guest ram and the mmio space */
+static long
+mshv_unmap_user_memory(struct mshv_partition *partition,
+ struct mshv_user_mem_region *mem)
+{
+ struct mshv_mem_region *region;
+
+ if (!(mem->flags & BIT(MSHV_SET_MEM_BIT_UNMAP)))
+ return -EINVAL;
+
+ spin_lock(&partition->pt_mem_regions_lock);
+
+ region = mshv_partition_region_by_gfn(partition, mem->guest_pfn);
+ if (!region) {
+ spin_unlock(&partition->pt_mem_regions_lock);
+ return -ENOENT;
+ }
+
+ /* Paranoia check */
+ if (region->start_uaddr != mem->userspace_addr ||
+ region->start_gfn != mem->guest_pfn ||
+ region->nr_pages != HVPFN_DOWN(mem->size)) {
+ spin_unlock(&partition->pt_mem_regions_lock);
+ return -EINVAL;
+ }
+
+ hlist_del(&region->hnode);
+
+ spin_unlock(&partition->pt_mem_regions_lock);
+
+ mshv_region_put(region);
+
+ return 0;
+}
+
+static long
+mshv_partition_ioctl_set_memory(struct mshv_partition *partition,
+ struct mshv_user_mem_region __user *user_mem)
+{
+ struct mshv_user_mem_region mem;
+
+ if (copy_from_user(&mem, user_mem, sizeof(mem)))
+ return -EFAULT;
+
+ if (!mem.size ||
+ !PAGE_ALIGNED(mem.size) ||
+ !PAGE_ALIGNED(mem.userspace_addr) ||
+ (mem.flags & ~MSHV_SET_MEM_FLAGS_MASK) ||
+ mshv_field_nonzero(mem, rsvd))
+ return -EINVAL;
+
+ if (mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP))
+ return mshv_unmap_user_memory(partition, &mem);
+
+ return mshv_map_user_memory(partition, &mem);
+}
+
+static long
+mshv_partition_ioctl_ioeventfd(struct mshv_partition *partition,
+ void __user *user_args)
+{
+ struct mshv_user_ioeventfd args;
+
+ if (copy_from_user(&args, user_args, sizeof(args)))
+ return -EFAULT;
+
+ return mshv_set_unset_ioeventfd(partition, &args);
+}
+
+static long
+mshv_partition_ioctl_irqfd(struct mshv_partition *partition,
+ void __user *user_args)
+{
+ struct mshv_user_irqfd args;
+
+ if (copy_from_user(&args, user_args, sizeof(args)))
+ return -EFAULT;
+
+ return mshv_set_unset_irqfd(partition, &args);
+}
+
+static long
+mshv_partition_ioctl_get_gpap_access_bitmap(struct mshv_partition *partition,
+ void __user *user_args)
+{
+ struct mshv_gpap_access_bitmap args;
+ union hv_gpa_page_access_state *states;
+ long ret, i;
+ union hv_gpa_page_access_state_flags hv_flags = {};
+ u8 hv_type_mask;
+ ulong bitmap_buf_sz, states_buf_sz;
+ int written = 0;
+
+ if (copy_from_user(&args, user_args, sizeof(args)))
+ return -EFAULT;
+
+ if (args.access_type >= MSHV_GPAP_ACCESS_TYPE_COUNT ||
+ args.access_op >= MSHV_GPAP_ACCESS_OP_COUNT ||
+ mshv_field_nonzero(args, rsvd) || !args.page_count ||
+ !args.bitmap_ptr)
+ return -EINVAL;
+
+ if (check_mul_overflow(args.page_count, sizeof(*states), &states_buf_sz))
+ return -E2BIG;
+
+ /* Num bytes needed to store bitmap; one bit per page rounded up */
+ bitmap_buf_sz = DIV_ROUND_UP(args.page_count, 8);
+
+ /* Sanity check */
+ if (bitmap_buf_sz > states_buf_sz)
+ return -EBADFD;
+
+ switch (args.access_type) {
+ case MSHV_GPAP_ACCESS_TYPE_ACCESSED:
+ hv_type_mask = 1;
+ if (args.access_op == MSHV_GPAP_ACCESS_OP_CLEAR) {
+ hv_flags.clear_accessed = 1;
+ /* not accessed implies not dirty */
+ hv_flags.clear_dirty = 1;
+ } else { /* MSHV_GPAP_ACCESS_OP_SET */
+ hv_flags.set_accessed = 1;
+ }
+ break;
+ case MSHV_GPAP_ACCESS_TYPE_DIRTY:
+ hv_type_mask = 2;
+ if (args.access_op == MSHV_GPAP_ACCESS_OP_CLEAR) {
+ hv_flags.clear_dirty = 1;
+ } else { /* MSHV_GPAP_ACCESS_OP_SET */
+ hv_flags.set_dirty = 1;
+ /* dirty implies accessed */
+ hv_flags.set_accessed = 1;
+ }
+ break;
+ }
+
+ states = vzalloc(states_buf_sz);
+ if (!states)
+ return -ENOMEM;
+
+ ret = hv_call_get_gpa_access_states(partition->pt_id, args.page_count,
+ args.gpap_base, hv_flags, &written,
+ states);
+ if (ret)
+ goto free_return;
+
+ /*
+ * Overwrite states buffer with bitmap - the bits in hv_type_mask
+ * correspond to bitfields in hv_gpa_page_access_state
+ */
+ for (i = 0; i < written; ++i)
+ __assign_bit(i, (ulong *)states,
+ states[i].as_uint8 & hv_type_mask);
+
+ /* zero the unused bits in the last byte(s) of the returned bitmap */
+ for (i = written; i < bitmap_buf_sz * 8; ++i)
+ __clear_bit(i, (ulong *)states);
+
+ if (copy_to_user((void __user *)args.bitmap_ptr, states, bitmap_buf_sz))
+ ret = -EFAULT;
+
+free_return:
+ vfree(states);
+ return ret;
+}
+
+static long
+mshv_partition_ioctl_set_msi_routing(struct mshv_partition *partition,
+ void __user *user_args)
+{
+ struct mshv_user_irq_entry *entries = NULL;
+ struct mshv_user_irq_table args;
+ long ret;
+
+ if (copy_from_user(&args, user_args, sizeof(args)))
+ return -EFAULT;
+
+ if (args.nr > MSHV_MAX_GUEST_IRQS ||
+ mshv_field_nonzero(args, rsvd))
+ return -EINVAL;
+
+ if (args.nr) {
+ struct mshv_user_irq_table __user *urouting = user_args;
+
+ entries = vmemdup_user(urouting->entries,
+ array_size(sizeof(*entries),
+ args.nr));
+ if (IS_ERR(entries))
+ return PTR_ERR(entries);
+ }
+ ret = mshv_update_routing_table(partition, entries, args.nr);
+ kvfree(entries);
+
+ return ret;
+}
+
+static long
+mshv_partition_ioctl_initialize(struct mshv_partition *partition)
+{
+ long ret;
+
+ if (partition->pt_initialized)
+ return 0;
+
+ ret = hv_call_initialize_partition(partition->pt_id);
+ if (ret)
+ goto withdraw_mem;
+
+ ret = mshv_debugfs_partition_create(partition);
+ if (ret)
+ goto finalize_partition;
+
+ partition->pt_initialized = true;
+
+ return 0;
+
+finalize_partition:
+ hv_call_finalize_partition(partition->pt_id);
+withdraw_mem:
+ hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id);
+
+ return ret;
+}
+
+static long
+mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
+{
+ struct mshv_partition *partition = filp->private_data;
+ long ret;
+ void __user *uarg = (void __user *)arg;
+
+ if (mutex_lock_killable(&partition->pt_mutex))
+ return -EINTR;
+
+ switch (ioctl) {
+ case MSHV_INITIALIZE_PARTITION:
+ ret = mshv_partition_ioctl_initialize(partition);
+ break;
+ case MSHV_SET_GUEST_MEMORY:
+ ret = mshv_partition_ioctl_set_memory(partition, uarg);
+ break;
+ case MSHV_CREATE_VP:
+ ret = mshv_partition_ioctl_create_vp(partition, uarg);
+ break;
+ case MSHV_IRQFD:
+ ret = mshv_partition_ioctl_irqfd(partition, uarg);
+ break;
+ case MSHV_IOEVENTFD:
+ ret = mshv_partition_ioctl_ioeventfd(partition, uarg);
+ break;
+ case MSHV_SET_MSI_ROUTING:
+ ret = mshv_partition_ioctl_set_msi_routing(partition, uarg);
+ break;
+ case MSHV_GET_GPAP_ACCESS_BITMAP:
+ ret = mshv_partition_ioctl_get_gpap_access_bitmap(partition,
+ uarg);
+ break;
+ case MSHV_ROOT_HVCALL:
+ ret = mshv_ioctl_passthru_hvcall(partition, true, uarg);
+ break;
+ default:
+ ret = -ENOTTY;
+ }
+
+ mutex_unlock(&partition->pt_mutex);
+ return ret;
+}
+
+static int
+disable_vp_dispatch(struct mshv_vp *vp)
+{
+ int ret;
+ struct hv_register_assoc dispatch_suspend = {
+ .name = HV_REGISTER_DISPATCH_SUSPEND,
+ .value.dispatch_suspend.suspended = 1,
+ };
+
+ ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
+ 1, &dispatch_suspend);
+ if (ret)
+ vp_err(vp, "failed to suspend\n");
+
+ trace_mshv_disable_vp_dispatch(vp->vp_partition->pt_id,
+ vp->vp_index, ret);
+
+ return ret;
+}
+
+static int
+get_vp_signaled_count(struct mshv_vp *vp, u64 *count)
+{
+ int ret;
+ struct hv_register_assoc root_signal_count = {
+ .name = HV_REGISTER_VP_ROOT_SIGNAL_COUNT,
+ };
+
+ ret = mshv_get_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
+ 1, &root_signal_count);
+
+ if (ret) {
+ vp_err(vp, "Failed to get root signal count");
+ *count = 0;
+ return ret;
+ }
+
+ *count = root_signal_count.value.reg64;
+
+ return ret;
+}
+
+static void
+drain_vp_signals(struct mshv_vp *vp)
+{
+ u64 hv_signal_count;
+ u64 vp_signal_count;
+
+ get_vp_signaled_count(vp, &hv_signal_count);
+
+ vp_signal_count = atomic64_read(&vp->run.vp_signaled_count);
+
+ /*
+ * There should be at most 1 outstanding notification, but be extra
+ * careful anyway.
+ */
+ while (hv_signal_count != vp_signal_count) {
+ WARN_ON(hv_signal_count - vp_signal_count != 1);
+
+ if (wait_event_interruptible(vp->run.vp_suspend_queue,
+ vp->run.kicked_by_hv == 1))
+ break;
+ vp->run.kicked_by_hv = 0;
+ vp_signal_count = atomic64_read(&vp->run.vp_signaled_count);
+ }
+
+ trace_mshv_drain_vp_signals(vp->vp_partition->pt_id, vp->vp_index);
+}
+
+static void drain_all_vps(const struct mshv_partition *partition)
+{
+ int i;
+ struct mshv_vp *vp;
+
+ /*
+ * VPs are reachable from ISR. It is safe to not take the partition
+ * lock because nobody else can enter this function and drop the
+ * partition from the list.
+ */
+ for (i = 0; i < MSHV_MAX_VPS; i++) {
+ vp = partition->pt_vp_array[i];
+ if (!vp)
+ continue;
+ /*
+ * Disable dispatching of the VP in the hypervisor. After this
+ * the hypervisor guarantees it won't generate any signals for
+ * the VP and the hypervisor's VP signal count won't change.
+ */
+ disable_vp_dispatch(vp);
+ drain_vp_signals(vp);
+ }
+}
+
+static void
+remove_partition(struct mshv_partition *partition)
+{
+ spin_lock(&mshv_root.pt_ht_lock);
+ hlist_del_rcu(&partition->pt_hnode);
+ spin_unlock(&mshv_root.pt_ht_lock);
+
+ synchronize_rcu();
+}
+
+/*
+ * Tear down a partition and remove it from the list.
+ * Partition's refcount must be 0
+ */
+static void destroy_partition(struct mshv_partition *partition)
+{
+ struct mshv_vp *vp;
+ struct mshv_mem_region *region;
+ struct hlist_node *n;
+ int i;
+
+ if (refcount_read(&partition->pt_ref_count)) {
+ pt_err(partition,
+ "Attempt to destroy partition but refcount > 0\n");
+ return;
+ }
+
+ trace_mshv_destroy_partition(partition->pt_id);
+
+ if (partition->pt_initialized) {
+ /*
+ * We only need to drain signals for root scheduler. This should be
+ * done before removing the partition from the partition list.
+ */
+ if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
+ drain_all_vps(partition);
+
+ /* Remove vps */
+ for (i = 0; i < MSHV_MAX_VPS; ++i) {
+ vp = partition->pt_vp_array[i];
+ if (!vp)
+ continue;
+
+ mshv_debugfs_vp_remove(vp);
+ mshv_vp_stats_unmap(partition->pt_id, vp->vp_index,
+ vp->vp_stats_pages);
+
+ if (vp->vp_register_page) {
+ (void)hv_unmap_vp_state_page(partition->pt_id,
+ vp->vp_index,
+ HV_VP_STATE_PAGE_REGISTERS,
+ virt_to_page(vp->vp_register_page),
+ input_vtl_zero);
+ vp->vp_register_page = NULL;
+ }
+
+ (void)hv_unmap_vp_state_page(partition->pt_id,
+ vp->vp_index,
+ HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
+ virt_to_page(vp->vp_intercept_msg_page),
+ input_vtl_zero);
+ vp->vp_intercept_msg_page = NULL;
+
+ if (vp->vp_ghcb_page) {
+ (void)hv_unmap_vp_state_page(partition->pt_id,
+ vp->vp_index,
+ HV_VP_STATE_PAGE_GHCB,
+ virt_to_page(vp->vp_ghcb_page),
+ input_vtl_normal);
+ vp->vp_ghcb_page = NULL;
+ }
+
+ kfree(vp);
+
+ partition->pt_vp_array[i] = NULL;
+ }
+
+ mshv_debugfs_partition_remove(partition);
+
+ /* Deallocates and unmaps everything including vcpus, GPA mappings etc */
+ hv_call_finalize_partition(partition->pt_id);
+
+ partition->pt_initialized = false;
+ }
+
+ remove_partition(partition);
+
+ hlist_for_each_entry_safe(region, n, &partition->pt_mem_regions,
+ hnode) {
+ hlist_del(&region->hnode);
+ mshv_region_put(region);
+ }
+
+ /* Withdraw and free all pages we deposited */
+ hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id);
+ hv_call_delete_partition(partition->pt_id);
+
+ mshv_free_routing_table(partition);
+ kfree(partition);
+}
+
+struct
+mshv_partition *mshv_partition_get(struct mshv_partition *partition)
+{
+ if (refcount_inc_not_zero(&partition->pt_ref_count))
+ return partition;
+ return NULL;
+}
+
+struct
+mshv_partition *mshv_partition_find(u64 partition_id)
+ __must_hold(RCU)
+{
+ struct mshv_partition *p;
+
+ hash_for_each_possible_rcu(mshv_root.pt_htable, p, pt_hnode,
+ partition_id)
+ if (p->pt_id == partition_id)
+ return p;
+
+ return NULL;
+}
+
+void
+mshv_partition_put(struct mshv_partition *partition)
+{
+ if (refcount_dec_and_test(&partition->pt_ref_count))
+ destroy_partition(partition);
+}
+
+static int
+mshv_partition_release(struct inode *inode, struct file *filp)
+{
+ struct mshv_partition *partition = filp->private_data;
+
+ trace_mshv_partition_release(partition->pt_id);
+
+ mshv_eventfd_release(partition);
+
+ cleanup_srcu_struct(&partition->pt_irq_srcu);
+
+ mshv_partition_put(partition);
+
+ return 0;
+}
+
+static int
+add_partition(struct mshv_partition *partition)
+{
+ spin_lock(&mshv_root.pt_ht_lock);
+
+ hash_add_rcu(mshv_root.pt_htable, &partition->pt_hnode,
+ partition->pt_id);
+
+ spin_unlock(&mshv_root.pt_ht_lock);
+
+ return 0;
+}
+
+static_assert(MSHV_NUM_CPU_FEATURES_BANKS ==
+ HV_PARTITION_PROCESSOR_FEATURES_BANKS);
+
+static long mshv_ioctl_process_pt_flags(void __user *user_arg, u64 *pt_flags,
+ struct hv_partition_creation_properties *cr_props,
+ union hv_partition_isolation_properties *isol_props)
+{
+ int i;
+ struct mshv_create_partition_v2 args;
+ union hv_partition_processor_features *disabled_procs;
+ union hv_partition_processor_xsave_features *disabled_xsave;
+
+ /* First, copy v1 struct in case user is on previous versions */
+ if (copy_from_user(&args, user_arg,
+ sizeof(struct mshv_create_partition)))
+ return -EFAULT;
+
+ if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) ||
+ args.pt_isolation >= MSHV_PT_ISOLATION_COUNT)
+ return -EINVAL;
+
+ disabled_procs = &cr_props->disabled_processor_features;
+ disabled_xsave = &cr_props->disabled_processor_xsave_features;
+
+ /* Check if user provided newer struct with feature fields */
+ if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_CPU_AND_XSAVE_FEATURES)) {
+ if (copy_from_user(&args, user_arg, sizeof(args)))
+ return -EFAULT;
+
+ /* Re-validate v1 fields after second copy_from_user() */
+ if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) ||
+ args.pt_isolation >= MSHV_PT_ISOLATION_COUNT)
+ return -EINVAL;
+
+ if (args.pt_num_cpu_fbanks != MSHV_NUM_CPU_FEATURES_BANKS ||
+ mshv_field_nonzero(args, pt_rsvd) ||
+ mshv_field_nonzero(args, pt_rsvd1))
+ return -EINVAL;
+
+ /*
+ * Note this assumes MSHV_NUM_CPU_FEATURES_BANKS will never
+ * change and equals HV_PARTITION_PROCESSOR_FEATURES_BANKS
+ * (i.e. 2).
+ *
+ * Further banks (index >= 2) will be modifiable as 'early'
+ * properties via the set partition property hypercall.
+ */
+ for (i = 0; i < HV_PARTITION_PROCESSOR_FEATURES_BANKS; i++)
+ disabled_procs->as_uint64[i] = args.pt_cpu_fbanks[i];
+
+#if IS_ENABLED(CONFIG_X86_64)
+ disabled_xsave->as_uint64 = args.pt_disabled_xsave;
+#else
+ /*
+ * In practice this field is ignored on arm64, but safer to
+ * zero it in case it is ever used.
+ */
+ disabled_xsave->as_uint64 = 0;
+
+ if (mshv_field_nonzero(args, pt_rsvd2))
+ return -EINVAL;
+#endif
+ } else {
+ /*
+ * v1 behavior: try to enable everything. The hypervisor will
+ * disable features that are not supported. The banks can be
+ * queried via the get partition property hypercall.
+ */
+ for (i = 0; i < HV_PARTITION_PROCESSOR_FEATURES_BANKS; i++)
+ disabled_procs->as_uint64[i] = 0;
+
+ disabled_xsave->as_uint64 = 0;
+ }
+
+ /* Only support EXO partitions */
+ *pt_flags = HV_PARTITION_CREATION_FLAG_EXO_PARTITION |
+ HV_PARTITION_CREATION_FLAG_INTERCEPT_MESSAGE_PAGE_ENABLED;
+
+ if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_LAPIC))
+ *pt_flags |= HV_PARTITION_CREATION_FLAG_LAPIC_ENABLED;
+ if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_X2APIC))
+ *pt_flags |= HV_PARTITION_CREATION_FLAG_X2APIC_CAPABLE;
+ if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_GPA_SUPER_PAGES))
+ *pt_flags |= HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED;
+ if (args.pt_flags & BIT(MSHV_PT_BIT_NESTED_VIRTUALIZATION))
+ *pt_flags |= HV_PARTITION_CREATION_FLAG_NESTED_VIRTUALIZATION_CAPABLE;
+ if (args.pt_flags & BIT(MSHV_PT_BIT_SMT_ENABLED_GUEST))
+ *pt_flags |= HV_PARTITION_CREATION_FLAG_SMT_ENABLED_GUEST;
+
+ isol_props->as_uint64 = 0;
+
+ switch (args.pt_isolation) {
+ case MSHV_PT_ISOLATION_NONE:
+ isol_props->isolation_type = HV_PARTITION_ISOLATION_TYPE_NONE;
+ break;
+ }
+
+ return 0;
+}
+
+static long
+mshv_ioctl_create_partition(void __user *user_arg, struct device *module_dev)
+{
+ u64 creation_flags;
+ struct hv_partition_creation_properties creation_properties;
+ union hv_partition_isolation_properties isolation_properties;
+ struct mshv_partition *partition;
+ u64 pt_id = -1;
+ long ret;
+
+ ret = mshv_ioctl_process_pt_flags(user_arg, &creation_flags,
+ &creation_properties,
+ &isolation_properties);
+ if (ret)
+ return ret;
+
+ partition = kzalloc_obj(*partition);
+ if (!partition)
+ return -ENOMEM;
+
+ partition->pt_module_dev = module_dev;
+ partition->isolation_type = isolation_properties.isolation_type;
+
+ refcount_set(&partition->pt_ref_count, 1);
+
+ mutex_init(&partition->pt_mutex);
+
+ mutex_init(&partition->pt_irq_lock);
+
+ init_completion(&partition->async_hypercall);
+
+ INIT_HLIST_HEAD(&partition->irq_ack_notifier_list);
+
+ INIT_HLIST_HEAD(&partition->pt_devices);
+
+ spin_lock_init(&partition->pt_mem_regions_lock);
+ INIT_HLIST_HEAD(&partition->pt_mem_regions);
+
+ mshv_eventfd_init(partition);
+
+ ret = init_srcu_struct(&partition->pt_irq_srcu);
+ if (ret)
+ goto free_partition;
+
+ ret = hv_call_create_partition(creation_flags,
+ creation_properties,
+ isolation_properties,
+ &pt_id);
+ if (ret)
+ goto cleanup_irq_srcu;
+
+ partition->pt_id = pt_id;
+
+ ret = add_partition(partition);
+ if (ret)
+ goto delete_partition;
+
+ ret = mshv_init_async_handler(partition);
+ if (ret)
+ goto remove_partition;
+
+ ret = FD_ADD(O_CLOEXEC, anon_inode_getfile("mshv_partition",
+ &mshv_partition_fops,
+ partition, O_RDWR));
+ if (ret < 0)
+ goto remove_partition;
+
+ goto out;
+
+remove_partition:
+ remove_partition(partition);
+delete_partition:
+ hv_call_delete_partition(partition->pt_id);
+cleanup_irq_srcu:
+ cleanup_srcu_struct(&partition->pt_irq_srcu);
+free_partition:
+ kfree(partition);
+out:
+ trace_mshv_create_partition(pt_id, ret);
+ return ret;
+}
+
+static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl,
+ unsigned long arg)
+{
+ struct miscdevice *misc = filp->private_data;
+
+ switch (ioctl) {
+ case MSHV_CREATE_PARTITION:
+ return mshv_ioctl_create_partition((void __user *)arg,
+ misc->this_device);
+ case MSHV_ROOT_HVCALL:
+ return mshv_ioctl_passthru_hvcall(NULL, false,
+ (void __user *)arg);
+ }
+
+ return -ENOTTY;
+}
+
+static int
+mshv_dev_open(struct inode *inode, struct file *filp)
+{
+ return 0;
+}
+
+static int
+mshv_dev_release(struct inode *inode, struct file *filp)
+{
+ return 0;
+}
+
+static int mshv_root_sched_online;
+
+static const char *scheduler_type_to_string(enum hv_scheduler_type type)
+{
+ switch (type) {
+ case HV_SCHEDULER_TYPE_LP:
+ return "classic scheduler without SMT";
+ case HV_SCHEDULER_TYPE_LP_SMT:
+ return "classic scheduler with SMT";
+ case HV_SCHEDULER_TYPE_CORE_SMT:
+ return "core scheduler";
+ case HV_SCHEDULER_TYPE_ROOT:
+ return "root scheduler";
+ default:
+ return "unknown scheduler";
+ };
+}
+
+static int __init l1vh_retrieve_scheduler_type(enum hv_scheduler_type *out)
+{
+ u64 integrated_sched_enabled;
+ int ret;
+
+ *out = HV_SCHEDULER_TYPE_CORE_SMT;
+
+ if (!mshv_root.vmm_caps.vmm_enable_integrated_scheduler)
+ return 0;
+
+ ret = hv_call_get_partition_property_ex(HV_PARTITION_ID_SELF,
+ HV_PARTITION_PROPERTY_INTEGRATED_SCHEDULER_ENABLED,
+ 0, &integrated_sched_enabled,
+ sizeof(integrated_sched_enabled));
+ if (ret)
+ return ret;
+
+ if (integrated_sched_enabled)
+ *out = HV_SCHEDULER_TYPE_ROOT;
+
+ return 0;
+}
+
+/* TODO move this to hv_common.c when needed outside */
+static int __init hv_retrieve_scheduler_type(enum hv_scheduler_type *out)
+{
+ struct hv_input_get_system_property *input;
+ struct hv_output_get_system_property *output;
+ unsigned long flags;
+ u64 status;
+
+ local_irq_save(flags);
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ output = *this_cpu_ptr(hyperv_pcpu_output_arg);
+
+ memset(input, 0, sizeof(*input));
+ memset(output, 0, sizeof(*output));
+ input->property_id = HV_SYSTEM_PROPERTY_SCHEDULER_TYPE;
+
+ status = hv_do_hypercall(HVCALL_GET_SYSTEM_PROPERTY, input, output);
+ if (!hv_result_success(status)) {
+ local_irq_restore(flags);
+ pr_err("%s: %s\n", __func__, hv_result_to_string(status));
+ return hv_result_to_errno(status);
+ }
+
+ *out = output->scheduler_type;
+ local_irq_restore(flags);
+
+ return 0;
+}
+
+/* Retrieve and stash the supported scheduler type */
+static int __init mshv_retrieve_scheduler_type(struct device *dev)
+{
+ int ret;
+
+ if (hv_l1vh_partition())
+ ret = l1vh_retrieve_scheduler_type(&hv_scheduler_type);
+ else
+ ret = hv_retrieve_scheduler_type(&hv_scheduler_type);
+ if (ret)
+ return ret;
+
+ dev_info(dev, "Hypervisor using %s\n",
+ scheduler_type_to_string(hv_scheduler_type));
+
+ switch (hv_scheduler_type) {
+ case HV_SCHEDULER_TYPE_CORE_SMT:
+ case HV_SCHEDULER_TYPE_LP_SMT:
+ case HV_SCHEDULER_TYPE_ROOT:
+ case HV_SCHEDULER_TYPE_LP:
+ /* Supported scheduler, nothing to do */
+ break;
+ default:
+ dev_err(dev, "unsupported scheduler 0x%x, bailing.\n",
+ hv_scheduler_type);
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+static int mshv_root_scheduler_init(unsigned int cpu)
+{
+ void **inputarg, **outputarg, *p;
+
+ inputarg = (void **)this_cpu_ptr(root_scheduler_input);
+ outputarg = (void **)this_cpu_ptr(root_scheduler_output);
+
+ /* Allocate two consecutive pages. One for input, one for output. */
+ p = kmalloc(2 * HV_HYP_PAGE_SIZE, GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+
+ *inputarg = p;
+ *outputarg = (char *)p + HV_HYP_PAGE_SIZE;
+
+ return 0;
+}
+
+static int mshv_root_scheduler_cleanup(unsigned int cpu)
+{
+ void *p, **inputarg, **outputarg;
+
+ inputarg = (void **)this_cpu_ptr(root_scheduler_input);
+ outputarg = (void **)this_cpu_ptr(root_scheduler_output);
+
+ p = *inputarg;
+
+ *inputarg = NULL;
+ *outputarg = NULL;
+
+ kfree(p);
+
+ return 0;
+}
+
+/* Must be called after retrieving the scheduler type */
+static int
+root_scheduler_init(struct device *dev)
+{
+ int ret;
+
+ if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT)
+ return 0;
+
+ root_scheduler_input = alloc_percpu(void *);
+ root_scheduler_output = alloc_percpu(void *);
+
+ if (!root_scheduler_input || !root_scheduler_output) {
+ dev_err(dev, "Failed to allocate root scheduler buffers\n");
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_root_sched",
+ mshv_root_scheduler_init,
+ mshv_root_scheduler_cleanup);
+
+ if (ret < 0) {
+ dev_err(dev, "Failed to setup root scheduler state: %i\n", ret);
+ goto out;
+ }
+
+ mshv_root_sched_online = ret;
+
+ return 0;
+
+out:
+ free_percpu(root_scheduler_input);
+ free_percpu(root_scheduler_output);
+ return ret;
+}
+
+static void
+root_scheduler_deinit(void)
+{
+ if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT)
+ return;
+
+ cpuhp_remove_state(mshv_root_sched_online);
+ free_percpu(root_scheduler_input);
+ free_percpu(root_scheduler_output);
+}
+
+static int __init mshv_init_vmm_caps(struct device *dev)
+{
+ int ret;
+
+ ret = hv_call_get_partition_property_ex(HV_PARTITION_ID_SELF,
+ HV_PARTITION_PROPERTY_VMM_CAPABILITIES,
+ 0, &mshv_root.vmm_caps,
+ sizeof(mshv_root.vmm_caps));
+ if (ret && hv_l1vh_partition()) {
+ dev_err(dev, "Failed to get VMM capabilities: %d\n", ret);
+ return ret;
+ }
+
+ dev_dbg(dev, "vmm_caps = %#llx\n", mshv_root.vmm_caps.as_uint64[0]);
+
+ return 0;
+}
+
+static int __init mshv_parent_partition_init(void)
+{
+ int ret;
+ struct device *dev;
+ union hv_hypervisor_version_info version_info;
+
+ if (!hv_parent_partition() || is_kdump_kernel())
+ return -ENODEV;
+
+ if (hv_get_hypervisor_version(&version_info))
+ return -ENODEV;
+
+ ret = misc_register(&mshv_dev);
+ if (ret)
+ return ret;
+
+ dev = mshv_dev.this_device;
+
+ if (version_info.build_number < MSHV_HV_MIN_VERSION ||
+ version_info.build_number > MSHV_HV_MAX_VERSION) {
+ dev_err(dev, "Running on unvalidated Hyper-V version\n");
+ dev_err(dev, "Versions: current: %u min: %u max: %u\n",
+ version_info.build_number, MSHV_HV_MIN_VERSION,
+ MSHV_HV_MAX_VERSION);
+ }
+
+ ret = mshv_synic_init(dev);
+ if (ret)
+ goto device_deregister;
+
+ ret = mshv_init_vmm_caps(dev);
+ if (ret)
+ goto synic_cleanup;
+
+ ret = mshv_retrieve_scheduler_type(dev);
+ if (ret)
+ goto synic_cleanup;
+
+ ret = root_scheduler_init(dev);
+ if (ret)
+ goto synic_cleanup;
+
+ ret = mshv_debugfs_init();
+ if (ret)
+ goto deinit_root_scheduler;
+
+ ret = mshv_irqfd_wq_init();
+ if (ret)
+ goto exit_debugfs;
+
+ spin_lock_init(&mshv_root.pt_ht_lock);
+ hash_init(mshv_root.pt_htable);
+
+ hv_setup_mshv_handler(mshv_isr);
+
+ return 0;
+
+exit_debugfs:
+ mshv_debugfs_exit();
+deinit_root_scheduler:
+ root_scheduler_deinit();
+synic_cleanup:
+ mshv_synic_exit();
+device_deregister:
+ misc_deregister(&mshv_dev);
+ return ret;
+}
+
+static void __exit mshv_parent_partition_exit(void)
+{
+ hv_setup_mshv_handler(NULL);
+ mshv_port_table_fini();
+ mshv_debugfs_exit();
+ misc_deregister(&mshv_dev);
+ mshv_irqfd_wq_cleanup();
+ root_scheduler_deinit();
+ mshv_synic_exit();
+}
+
+module_init(mshv_parent_partition_init);
+module_exit(mshv_parent_partition_exit);
diff --git a/drivers/hv/mshv_synic.c b/drivers/hv/mshv_synic.c
new file mode 100644
index 000000000000..e2288a726fec
--- /dev/null
+++ b/drivers/hv/mshv_synic.c
@@ -0,0 +1,820 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2023, Microsoft Corporation.
+ *
+ * mshv_root module's main interrupt handler and associated functionality.
+ *
+ * Authors: Microsoft Linux virtualization team
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/cpuhotplug.h>
+#include <linux/reboot.h>
+#include <asm/mshyperv.h>
+#include <linux/acpi.h>
+
+#include "mshv_eventfd.h"
+#include "mshv.h"
+
+static int synic_cpuhp_online;
+static struct hv_synic_pages __percpu *synic_pages;
+static int mshv_sint_vector = -1; /* hwirq for the SynIC SINTs */
+static int mshv_sint_irq = -1; /* Linux IRQ for mshv_sint_vector */
+
+static u32 synic_event_ring_get_queued_port(u32 sint_index)
+{
+ struct hv_synic_event_ring_page **event_ring_page;
+ volatile struct hv_synic_event_ring *ring;
+ struct hv_synic_pages *spages;
+ u8 **synic_eventring_tail;
+ u32 message;
+ u8 tail;
+
+ spages = this_cpu_ptr(synic_pages);
+ event_ring_page = &spages->synic_event_ring_page;
+ synic_eventring_tail = (u8 **)this_cpu_ptr(hv_synic_eventring_tail);
+
+ if (unlikely(!*synic_eventring_tail)) {
+ pr_debug("Missing synic event ring tail!\n");
+ return 0;
+ }
+ tail = (*synic_eventring_tail)[sint_index];
+
+ if (unlikely(!*event_ring_page)) {
+ pr_debug("Missing synic event ring page!\n");
+ return 0;
+ }
+
+ ring = &(*event_ring_page)->sint_event_ring[sint_index];
+
+ /*
+ * Get the message.
+ */
+ message = ring->data[tail];
+
+ if (!message) {
+ if (ring->ring_full) {
+ /*
+ * Ring is marked full, but we would have consumed all
+ * the messages. Notify the hypervisor that ring is now
+ * empty and check again.
+ */
+ ring->ring_full = 0;
+ hv_call_notify_port_ring_empty(sint_index);
+ message = ring->data[tail];
+ }
+
+ if (!message) {
+ ring->signal_masked = 0;
+ /*
+ * Unmask the signal and sync with hypervisor
+ * before one last check for any message.
+ */
+ mb();
+ message = ring->data[tail];
+
+ /*
+ * Ok, lets bail out.
+ */
+ if (!message)
+ return 0;
+ }
+
+ ring->signal_masked = 1;
+ }
+
+ /*
+ * Clear the message in the ring buffer.
+ */
+ ring->data[tail] = 0;
+
+ if (++tail == HV_SYNIC_EVENT_RING_MESSAGE_COUNT)
+ tail = 0;
+
+ (*synic_eventring_tail)[sint_index] = tail;
+
+ return message;
+}
+
+static bool
+mshv_doorbell_isr(struct hv_message *msg)
+{
+ struct hv_notification_message_payload *notification;
+ u32 port;
+
+ if (msg->header.message_type != HVMSG_SYNIC_SINT_INTERCEPT)
+ return false;
+
+ notification = (struct hv_notification_message_payload *)msg->u.payload;
+ if (notification->sint_index != HV_SYNIC_DOORBELL_SINT_INDEX)
+ return false;
+
+ while ((port = synic_event_ring_get_queued_port(HV_SYNIC_DOORBELL_SINT_INDEX))) {
+ struct port_table_info ptinfo = { 0 };
+
+ if (mshv_portid_lookup(port, &ptinfo)) {
+ pr_debug("Failed to get port info from port_table!\n");
+ continue;
+ }
+
+ if (ptinfo.hv_port_type != HV_PORT_TYPE_DOORBELL) {
+ pr_debug("Not a doorbell port!, port: %d, port_type: %d\n",
+ port, ptinfo.hv_port_type);
+ continue;
+ }
+
+ /* Invoke the callback */
+ ptinfo.hv_port_doorbell.doorbell_cb(port,
+ ptinfo.hv_port_doorbell.data);
+ }
+
+ return true;
+}
+
+static bool mshv_async_call_completion_isr(struct hv_message *msg)
+{
+ bool handled = false;
+ struct hv_async_completion_message_payload *async_msg;
+ struct mshv_partition *partition;
+ u64 partition_id;
+
+ if (msg->header.message_type != HVMSG_ASYNC_CALL_COMPLETION)
+ goto out;
+
+ async_msg =
+ (struct hv_async_completion_message_payload *)msg->u.payload;
+
+ partition_id = async_msg->partition_id;
+
+ /*
+ * Hold this lock for the rest of the isr, because the partition could
+ * be released anytime.
+ * e.g. the MSHV_RUN_VP thread could wake on another cpu; it could
+ * release the partition unless we hold this!
+ */
+ rcu_read_lock();
+
+ partition = mshv_partition_find(partition_id);
+
+ if (unlikely(!partition)) {
+ pr_debug("failed to find partition %llu\n", partition_id);
+ goto unlock_out;
+ }
+
+ partition->async_hypercall_status = async_msg->status;
+ complete(&partition->async_hypercall);
+
+ handled = true;
+
+unlock_out:
+ rcu_read_unlock();
+out:
+ return handled;
+}
+
+static void kick_vp(struct mshv_vp *vp)
+{
+ atomic64_inc(&vp->run.vp_signaled_count);
+ vp->run.kicked_by_hv = 1;
+ wake_up(&vp->run.vp_suspend_queue);
+}
+
+static void
+handle_bitset_message(const struct hv_vp_signal_bitset_scheduler_message *msg)
+{
+ int bank_idx, vps_signaled = 0, bank_mask_size;
+ struct mshv_partition *partition;
+ const struct hv_vpset *vpset;
+ const u64 *bank_contents;
+ u64 partition_id = msg->partition_id;
+
+ if (msg->vp_bitset.bitset.format != HV_GENERIC_SET_SPARSE_4K) {
+ pr_debug("scheduler message format is not HV_GENERIC_SET_SPARSE_4K");
+ return;
+ }
+
+ if (msg->vp_count == 0) {
+ pr_debug("scheduler message with no VP specified");
+ return;
+ }
+
+ rcu_read_lock();
+
+ partition = mshv_partition_find(partition_id);
+ if (unlikely(!partition)) {
+ pr_debug("failed to find partition %llu\n", partition_id);
+ goto unlock_out;
+ }
+
+ vpset = &msg->vp_bitset.bitset;
+
+ bank_idx = -1;
+ bank_contents = vpset->bank_contents;
+ bank_mask_size = sizeof(vpset->valid_bank_mask) * BITS_PER_BYTE;
+
+ while (true) {
+ int vp_bank_idx = -1;
+ int vp_bank_size = sizeof(*bank_contents) * BITS_PER_BYTE;
+ int vp_index;
+
+ bank_idx = find_next_bit((unsigned long *)&vpset->valid_bank_mask,
+ bank_mask_size, bank_idx + 1);
+ if (bank_idx == bank_mask_size)
+ break;
+
+ while (true) {
+ struct mshv_vp *vp;
+
+ vp_bank_idx = find_next_bit((unsigned long *)bank_contents,
+ vp_bank_size, vp_bank_idx + 1);
+ if (vp_bank_idx == vp_bank_size)
+ break;
+
+ vp_index = (bank_idx * vp_bank_size) + vp_bank_idx;
+
+ /* This shouldn't happen, but just in case. */
+ if (unlikely(vp_index >= MSHV_MAX_VPS)) {
+ pr_debug("VP index %u out of bounds\n",
+ vp_index);
+ goto unlock_out;
+ }
+
+ vp = partition->pt_vp_array[vp_index];
+ if (unlikely(!vp)) {
+ pr_debug("failed to find VP %u\n", vp_index);
+ goto unlock_out;
+ }
+
+ kick_vp(vp);
+ vps_signaled++;
+ }
+
+ bank_contents++;
+ }
+
+unlock_out:
+ rcu_read_unlock();
+
+ if (vps_signaled != msg->vp_count)
+ pr_debug("asked to signal %u VPs but only did %u\n",
+ msg->vp_count, vps_signaled);
+}
+
+static void
+handle_pair_message(const struct hv_vp_signal_pair_scheduler_message *msg)
+{
+ struct mshv_partition *partition = NULL;
+ struct mshv_vp *vp;
+ int idx;
+
+ rcu_read_lock();
+
+ for (idx = 0; idx < msg->vp_count; idx++) {
+ u64 partition_id = msg->partition_ids[idx];
+ u32 vp_index = msg->vp_indexes[idx];
+
+ if (idx == 0 || partition->pt_id != partition_id) {
+ partition = mshv_partition_find(partition_id);
+ if (unlikely(!partition)) {
+ pr_debug("failed to find partition %llu\n",
+ partition_id);
+ break;
+ }
+ }
+
+ /* This shouldn't happen, but just in case. */
+ if (unlikely(vp_index >= MSHV_MAX_VPS)) {
+ pr_debug("VP index %u out of bounds\n", vp_index);
+ break;
+ }
+
+ vp = partition->pt_vp_array[vp_index];
+ if (!vp) {
+ pr_debug("failed to find VP %u\n", vp_index);
+ break;
+ }
+
+ kick_vp(vp);
+ }
+
+ rcu_read_unlock();
+}
+
+static bool
+mshv_scheduler_isr(struct hv_message *msg)
+{
+ if (msg->header.message_type != HVMSG_SCHEDULER_VP_SIGNAL_BITSET &&
+ msg->header.message_type != HVMSG_SCHEDULER_VP_SIGNAL_PAIR)
+ return false;
+
+ if (msg->header.message_type == HVMSG_SCHEDULER_VP_SIGNAL_BITSET)
+ handle_bitset_message((struct hv_vp_signal_bitset_scheduler_message *)
+ msg->u.payload);
+ else
+ handle_pair_message((struct hv_vp_signal_pair_scheduler_message *)
+ msg->u.payload);
+
+ return true;
+}
+
+static bool
+mshv_intercept_isr(struct hv_message *msg)
+{
+ struct mshv_partition *partition;
+ bool handled = false;
+ struct mshv_vp *vp;
+ u64 partition_id;
+ u32 vp_index;
+
+ partition_id = msg->header.sender;
+
+ rcu_read_lock();
+
+ partition = mshv_partition_find(partition_id);
+ if (unlikely(!partition)) {
+ pr_debug("failed to find partition %llu\n",
+ partition_id);
+ goto unlock_out;
+ }
+
+ if (msg->header.message_type == HVMSG_X64_APIC_EOI) {
+ /*
+ * Check if this gsi is registered in the
+ * ack_notifier list and invoke the callback
+ * if registered.
+ */
+
+ /*
+ * If there is a notifier, the ack callback is supposed
+ * to handle the VMEXIT. So we need not pass this message
+ * to vcpu thread.
+ */
+ struct hv_x64_apic_eoi_message *eoi_msg =
+ (struct hv_x64_apic_eoi_message *)&msg->u.payload[0];
+
+ if (mshv_notify_acked_gsi(partition, eoi_msg->interrupt_vector)) {
+ handled = true;
+ goto unlock_out;
+ }
+ }
+
+ /*
+ * We should get an opaque intercept message here for all intercept
+ * messages, since we're using the mapped VP intercept message page.
+ *
+ * The intercept message will have been placed in intercept message
+ * page at this point.
+ *
+ * Make sure the message type matches our expectation.
+ */
+ if (msg->header.message_type != HVMSG_OPAQUE_INTERCEPT) {
+ pr_debug("wrong message type %d", msg->header.message_type);
+ goto unlock_out;
+ }
+
+ /*
+ * Since we directly index the vp, and it has to exist for us to be here
+ * (because the vp is only deleted when the partition is), no additional
+ * locking is needed here
+ */
+ vp_index =
+ ((struct hv_opaque_intercept_message *)msg->u.payload)->vp_index;
+ vp = partition->pt_vp_array[vp_index];
+ if (unlikely(!vp)) {
+ pr_debug("failed to find VP %u\n", vp_index);
+ goto unlock_out;
+ }
+
+ kick_vp(vp);
+
+ handled = true;
+
+unlock_out:
+ rcu_read_unlock();
+
+ return handled;
+}
+
+void mshv_isr(void)
+{
+ struct hv_synic_pages *spages = this_cpu_ptr(synic_pages);
+ struct hv_message_page **msg_page = &spages->hyp_synic_message_page;
+ struct hv_message *msg;
+ bool handled;
+
+ if (unlikely(!(*msg_page))) {
+ pr_debug("Missing synic page!\n");
+ return;
+ }
+
+ msg = &((*msg_page)->sint_message[HV_SYNIC_INTERCEPTION_SINT_INDEX]);
+
+ /*
+ * If the type isn't set, there isn't really a message;
+ * it may be some other hyperv interrupt
+ */
+ if (msg->header.message_type == HVMSG_NONE)
+ return;
+
+ handled = mshv_doorbell_isr(msg);
+
+ if (!handled)
+ handled = mshv_scheduler_isr(msg);
+
+ if (!handled)
+ handled = mshv_async_call_completion_isr(msg);
+
+ if (!handled)
+ handled = mshv_intercept_isr(msg);
+
+ if (handled) {
+ /*
+ * Acknowledge message with hypervisor if another message is
+ * pending.
+ */
+ msg->header.message_type = HVMSG_NONE;
+ /*
+ * Ensure the write is complete so the hypervisor will deliver
+ * the next message if available.
+ */
+ mb();
+ if (msg->header.message_flags.msg_pending)
+ hv_set_non_nested_msr(HV_MSR_EOM, 0);
+ } else {
+ pr_warn_once("%s: unknown message type 0x%x\n", __func__,
+ msg->header.message_type);
+ }
+}
+
+static int mshv_synic_cpu_init(unsigned int cpu)
+{
+ union hv_synic_simp simp;
+ union hv_synic_siefp siefp;
+ union hv_synic_sirbp sirbp;
+ union hv_synic_sint sint;
+ union hv_synic_scontrol sctrl;
+ struct hv_synic_pages *spages = this_cpu_ptr(synic_pages);
+ struct hv_message_page **msg_page = &spages->hyp_synic_message_page;
+ struct hv_synic_event_flags_page **event_flags_page =
+ &spages->synic_event_flags_page;
+ struct hv_synic_event_ring_page **event_ring_page =
+ &spages->synic_event_ring_page;
+
+ /* Setup the Synic's message page */
+ simp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIMP);
+ simp.simp_enabled = true;
+ *msg_page = memremap(simp.base_simp_gpa << HV_HYP_PAGE_SHIFT,
+ HV_HYP_PAGE_SIZE,
+ MEMREMAP_WB);
+
+ if (!(*msg_page))
+ return -EFAULT;
+
+ hv_set_non_nested_msr(HV_MSR_SIMP, simp.as_uint64);
+
+ /* Setup the Synic's event flags page */
+ siefp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIEFP);
+ siefp.siefp_enabled = true;
+ *event_flags_page = memremap(siefp.base_siefp_gpa << PAGE_SHIFT,
+ PAGE_SIZE, MEMREMAP_WB);
+
+ if (!(*event_flags_page))
+ goto cleanup;
+
+ hv_set_non_nested_msr(HV_MSR_SIEFP, siefp.as_uint64);
+
+ /* Setup the Synic's event ring page */
+ sirbp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIRBP);
+ sirbp.sirbp_enabled = true;
+ *event_ring_page = memremap(sirbp.base_sirbp_gpa << PAGE_SHIFT,
+ PAGE_SIZE, MEMREMAP_WB);
+
+ if (!(*event_ring_page))
+ goto cleanup;
+
+ hv_set_non_nested_msr(HV_MSR_SIRBP, sirbp.as_uint64);
+
+ if (mshv_sint_irq != -1)
+ enable_percpu_irq(mshv_sint_irq, 0);
+
+ /* Enable intercepts */
+ sint.as_uint64 = 0;
+ sint.vector = mshv_sint_vector;
+ sint.masked = false;
+ sint.auto_eoi = hv_recommend_using_aeoi();
+ hv_set_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_INTERCEPTION_SINT_INDEX,
+ sint.as_uint64);
+
+ /* Doorbell SINT */
+ sint.as_uint64 = 0;
+ sint.vector = mshv_sint_vector;
+ sint.masked = false;
+ sint.as_intercept = 1;
+ sint.auto_eoi = hv_recommend_using_aeoi();
+ hv_set_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_DOORBELL_SINT_INDEX,
+ sint.as_uint64);
+
+ /* Enable global synic bit */
+ sctrl.as_uint64 = hv_get_non_nested_msr(HV_MSR_SCONTROL);
+ sctrl.enable = 1;
+ hv_set_non_nested_msr(HV_MSR_SCONTROL, sctrl.as_uint64);
+
+ return 0;
+
+cleanup:
+ if (*event_ring_page) {
+ sirbp.sirbp_enabled = false;
+ hv_set_non_nested_msr(HV_MSR_SIRBP, sirbp.as_uint64);
+ memunmap(*event_ring_page);
+ }
+ if (*event_flags_page) {
+ siefp.siefp_enabled = false;
+ hv_set_non_nested_msr(HV_MSR_SIEFP, siefp.as_uint64);
+ memunmap(*event_flags_page);
+ }
+ if (*msg_page) {
+ simp.simp_enabled = false;
+ hv_set_non_nested_msr(HV_MSR_SIMP, simp.as_uint64);
+ memunmap(*msg_page);
+ }
+
+ return -EFAULT;
+}
+
+static int mshv_synic_cpu_exit(unsigned int cpu)
+{
+ union hv_synic_sint sint;
+ union hv_synic_simp simp;
+ union hv_synic_siefp siefp;
+ union hv_synic_sirbp sirbp;
+ union hv_synic_scontrol sctrl;
+ struct hv_synic_pages *spages = this_cpu_ptr(synic_pages);
+ struct hv_message_page **msg_page = &spages->hyp_synic_message_page;
+ struct hv_synic_event_flags_page **event_flags_page =
+ &spages->synic_event_flags_page;
+ struct hv_synic_event_ring_page **event_ring_page =
+ &spages->synic_event_ring_page;
+
+ /* Disable the interrupt */
+ sint.as_uint64 = hv_get_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_INTERCEPTION_SINT_INDEX);
+ sint.masked = true;
+ hv_set_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_INTERCEPTION_SINT_INDEX,
+ sint.as_uint64);
+
+ /* Disable Doorbell SINT */
+ sint.as_uint64 = hv_get_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_DOORBELL_SINT_INDEX);
+ sint.masked = true;
+ hv_set_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_DOORBELL_SINT_INDEX,
+ sint.as_uint64);
+
+ if (mshv_sint_irq != -1)
+ disable_percpu_irq(mshv_sint_irq);
+
+ /* Disable Synic's event ring page */
+ sirbp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIRBP);
+ sirbp.sirbp_enabled = false;
+ hv_set_non_nested_msr(HV_MSR_SIRBP, sirbp.as_uint64);
+ memunmap(*event_ring_page);
+
+ /* Disable Synic's event flags page */
+ siefp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIEFP);
+ siefp.siefp_enabled = false;
+ hv_set_non_nested_msr(HV_MSR_SIEFP, siefp.as_uint64);
+ memunmap(*event_flags_page);
+
+ /* Disable Synic's message page */
+ simp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIMP);
+ simp.simp_enabled = false;
+ hv_set_non_nested_msr(HV_MSR_SIMP, simp.as_uint64);
+ memunmap(*msg_page);
+
+ /* Disable global synic bit */
+ sctrl.as_uint64 = hv_get_non_nested_msr(HV_MSR_SCONTROL);
+ sctrl.enable = 0;
+ hv_set_non_nested_msr(HV_MSR_SCONTROL, sctrl.as_uint64);
+
+ return 0;
+}
+
+int
+mshv_register_doorbell(u64 partition_id, doorbell_cb_t doorbell_cb, void *data,
+ u64 gpa, u64 val, u64 flags)
+{
+ struct hv_connection_info connection_info = { 0 };
+ union hv_connection_id connection_id = { 0 };
+ struct port_table_info *port_table_info;
+ struct hv_port_info port_info = { 0 };
+ union hv_port_id port_id = { 0 };
+ int ret;
+
+ port_table_info = kmalloc_obj(*port_table_info);
+ if (!port_table_info)
+ return -ENOMEM;
+
+ port_table_info->hv_port_type = HV_PORT_TYPE_DOORBELL;
+ port_table_info->hv_port_doorbell.doorbell_cb = doorbell_cb;
+ port_table_info->hv_port_doorbell.data = data;
+ ret = mshv_portid_alloc(port_table_info);
+ if (ret < 0) {
+ kfree(port_table_info);
+ return ret;
+ }
+
+ port_id.u.id = ret;
+ port_info.port_type = HV_PORT_TYPE_DOORBELL;
+ port_info.doorbell_port_info.target_sint = HV_SYNIC_DOORBELL_SINT_INDEX;
+ port_info.doorbell_port_info.target_vp = HV_ANY_VP;
+ ret = hv_call_create_port(hv_current_partition_id, port_id, partition_id,
+ &port_info,
+ 0, 0, NUMA_NO_NODE);
+
+ if (ret < 0) {
+ mshv_portid_free(port_id.u.id);
+ return ret;
+ }
+
+ connection_id.u.id = port_id.u.id;
+ connection_info.port_type = HV_PORT_TYPE_DOORBELL;
+ connection_info.doorbell_connection_info.gpa = gpa;
+ connection_info.doorbell_connection_info.trigger_value = val;
+ connection_info.doorbell_connection_info.flags = flags;
+
+ ret = hv_call_connect_port(hv_current_partition_id, port_id, partition_id,
+ connection_id, &connection_info, 0, NUMA_NO_NODE);
+ if (ret < 0) {
+ hv_call_delete_port(hv_current_partition_id, port_id);
+ mshv_portid_free(port_id.u.id);
+ return ret;
+ }
+
+ // lets use the port_id as the doorbell_id
+ return port_id.u.id;
+}
+
+void
+mshv_unregister_doorbell(u64 partition_id, int doorbell_portid)
+{
+ union hv_port_id port_id = { 0 };
+ union hv_connection_id connection_id = { 0 };
+
+ connection_id.u.id = doorbell_portid;
+ hv_call_disconnect_port(partition_id, connection_id);
+
+ port_id.u.id = doorbell_portid;
+ hv_call_delete_port(hv_current_partition_id, port_id);
+
+ mshv_portid_free(doorbell_portid);
+}
+
+static int mshv_synic_reboot_notify(struct notifier_block *nb,
+ unsigned long code, void *unused)
+{
+ if (!hv_root_partition())
+ return 0;
+
+ cpuhp_remove_state(synic_cpuhp_online);
+ return 0;
+}
+
+static struct notifier_block mshv_synic_reboot_nb = {
+ .notifier_call = mshv_synic_reboot_notify,
+};
+
+#ifndef HYPERVISOR_CALLBACK_VECTOR
+static DEFINE_PER_CPU(long, mshv_evt);
+
+static irqreturn_t mshv_percpu_isr(int irq, void *dev_id)
+{
+ mshv_isr();
+ return IRQ_HANDLED;
+}
+
+#ifdef CONFIG_ACPI
+static int __init mshv_acpi_setup_sint_irq(void)
+{
+ return acpi_register_gsi(NULL, mshv_sint_vector, ACPI_EDGE_SENSITIVE,
+ ACPI_ACTIVE_HIGH);
+}
+
+static void mshv_acpi_cleanup_sint_irq(void)
+{
+ acpi_unregister_gsi(mshv_sint_vector);
+}
+#else
+static int __init mshv_acpi_setup_sint_irq(void)
+{
+ return -ENODEV;
+}
+
+static void mshv_acpi_cleanup_sint_irq(void)
+{
+}
+#endif
+
+static int __init mshv_sint_vector_setup(void)
+{
+ int ret;
+ struct hv_register_assoc reg = {
+ .name = HV_ARM64_REGISTER_SINT_RESERVED_INTERRUPT_ID,
+ };
+ union hv_input_vtl input_vtl = { 0 };
+
+ if (acpi_disabled)
+ return -ENODEV;
+
+ ret = hv_call_get_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF,
+ 1, input_vtl, &reg);
+ if (ret || !reg.value.reg64)
+ return -ENODEV;
+
+ mshv_sint_vector = reg.value.reg64;
+ ret = mshv_acpi_setup_sint_irq();
+ if (ret < 0) {
+ pr_err("Failed to setup IRQ for MSHV SINT vector %d: %d\n",
+ mshv_sint_vector, ret);
+ goto out_fail;
+ }
+
+ mshv_sint_irq = ret;
+
+ ret = request_percpu_irq(mshv_sint_irq, mshv_percpu_isr, "MSHV",
+ &mshv_evt);
+ if (ret)
+ goto out_unregister;
+
+ return 0;
+
+out_unregister:
+ mshv_acpi_cleanup_sint_irq();
+out_fail:
+ return ret;
+}
+
+static void mshv_sint_vector_cleanup(void)
+{
+ free_percpu_irq(mshv_sint_irq, &mshv_evt);
+ mshv_acpi_cleanup_sint_irq();
+}
+#else /* !HYPERVISOR_CALLBACK_VECTOR */
+static int __init mshv_sint_vector_setup(void)
+{
+ mshv_sint_vector = HYPERVISOR_CALLBACK_VECTOR;
+ return 0;
+}
+
+static void mshv_sint_vector_cleanup(void)
+{
+}
+#endif /* HYPERVISOR_CALLBACK_VECTOR */
+
+int __init mshv_synic_init(struct device *dev)
+{
+ int ret = 0;
+
+ ret = mshv_sint_vector_setup();
+ if (ret)
+ return ret;
+
+ synic_pages = alloc_percpu(struct hv_synic_pages);
+ if (!synic_pages) {
+ dev_err(dev, "Failed to allocate percpu synic page\n");
+ ret = -ENOMEM;
+ goto sint_vector_cleanup;
+ }
+
+ ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_synic",
+ mshv_synic_cpu_init,
+ mshv_synic_cpu_exit);
+ if (ret < 0) {
+ dev_err(dev, "Failed to setup cpu hotplug state: %i\n", ret);
+ goto free_synic_pages;
+ }
+
+ synic_cpuhp_online = ret;
+
+ ret = register_reboot_notifier(&mshv_synic_reboot_nb);
+ if (ret)
+ goto remove_cpuhp_state;
+
+ return 0;
+
+remove_cpuhp_state:
+ cpuhp_remove_state(synic_cpuhp_online);
+free_synic_pages:
+ free_percpu(synic_pages);
+sint_vector_cleanup:
+ mshv_sint_vector_cleanup();
+ return ret;
+}
+
+void mshv_synic_exit(void)
+{
+ unregister_reboot_notifier(&mshv_synic_reboot_nb);
+ cpuhp_remove_state(synic_cpuhp_online);
+ free_percpu(synic_pages);
+ mshv_sint_vector_cleanup();
+}
diff --git a/drivers/hv/mshv_trace.c b/drivers/hv/mshv_trace.c
new file mode 100644
index 000000000000..0936b2f95edd
--- /dev/null
+++ b/drivers/hv/mshv_trace.c
@@ -0,0 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2026, Microsoft Corporation.
+ *
+ * Tracepoint definitions for mshv driver.
+ */
+
+#define CREATE_TRACE_POINTS
+#include "mshv_trace.h"
diff --git a/drivers/hv/mshv_trace.h b/drivers/hv/mshv_trace.h
new file mode 100644
index 000000000000..e7280c47e579
--- /dev/null
+++ b/drivers/hv/mshv_trace.h
@@ -0,0 +1,544 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2026, Microsoft Corporation.
+ *
+ * Tracepoint declarations for mshv driver.
+ */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM mshv
+
+#if !defined(__MSHV_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _MSHV_TRACE_H_
+
+#include <linux/tracepoint.h>
+#include <hyperv/hvhdk.h>
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH ../../drivers/hv
+
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE mshv_trace
+
+TRACE_EVENT(mshv_create_partition,
+ TP_PROTO(u64 partition_id, int vm_fd),
+ TP_ARGS(partition_id, vm_fd),
+ TP_STRUCT__entry(
+ __field(u64, partition_id)
+ __field(int, vm_fd)
+ ),
+ TP_fast_assign(
+ __entry->partition_id = partition_id;
+ __entry->vm_fd = vm_fd;
+ ),
+ TP_printk("partition_id=%llu vm_fd=%d",
+ __entry->partition_id,
+ __entry->vm_fd
+ )
+);
+
+TRACE_EVENT(mshv_hvcall_create_partition,
+ TP_PROTO(u64 flags, s64 partition_id),
+ TP_ARGS(flags, partition_id),
+ TP_STRUCT__entry(
+ __field(u64, flags)
+ __field(s64, partition_id)
+ ),
+ TP_fast_assign(
+ __entry->flags = flags;
+ __entry->partition_id = partition_id;
+ ),
+ TP_printk("flags=%#llx partition_id=%lld",
+ __entry->flags,
+ __entry->partition_id
+ )
+);
+
+TRACE_EVENT(mshv_hvcall_initialize_partition,
+ TP_PROTO(u64 partition_id, u64 status),
+ TP_ARGS(partition_id, status),
+ TP_STRUCT__entry(
+ __field(u64, partition_id)
+ __field(u64, status)
+ ),
+ TP_fast_assign(
+ __entry->partition_id = partition_id;
+ __entry->status = status;
+ ),
+ TP_printk("partition_id=%llu status=%#llx",
+ __entry->partition_id,
+ __entry->status
+ )
+);
+
+TRACE_EVENT(mshv_partition_release,
+ TP_PROTO(u64 partition_id),
+ TP_ARGS(partition_id),
+ TP_STRUCT__entry(
+ __field(u64, partition_id)
+ ),
+ TP_fast_assign(
+ __entry->partition_id = partition_id;
+ ),
+ TP_printk("partition_id=%llu",
+ __entry->partition_id
+ )
+);
+
+TRACE_EVENT(mshv_destroy_partition,
+ TP_PROTO(u64 partition_id),
+ TP_ARGS(partition_id),
+ TP_STRUCT__entry(
+ __field(u64, partition_id)
+ ),
+ TP_fast_assign(
+ __entry->partition_id = partition_id;
+ ),
+ TP_printk("partition_id=%llu",
+ __entry->partition_id
+ )
+);
+
+TRACE_EVENT(mshv_hvcall_finalize_partition,
+ TP_PROTO(u64 partition_id, u64 status),
+ TP_ARGS(partition_id, status),
+ TP_STRUCT__entry(
+ __field(u64, partition_id)
+ __field(u64, status)
+ ),
+ TP_fast_assign(
+ __entry->partition_id = partition_id;
+ __entry->status = status;
+ ),
+ TP_printk("partition_id=%llu status=%#llx ",
+ __entry->partition_id,
+ __entry->status
+ )
+);
+
+TRACE_EVENT(mshv_hvcall_withdraw_memory,
+ TP_PROTO(u64 partition_id, u64 withdrawn, u64 status),
+ TP_ARGS(partition_id, withdrawn, status),
+ TP_STRUCT__entry(
+ __field(u64, partition_id)
+ __field(u64, withdrawn)
+ __field(u64, status)
+ ),
+ TP_fast_assign(
+ __entry->partition_id = partition_id;
+ __entry->withdrawn = withdrawn;
+ __entry->status = status;
+ ),
+ TP_printk("partition_id=%llu withdrawn=%llu status=%#llx",
+ __entry->partition_id,
+ __entry->withdrawn,
+ __entry->status
+ )
+);
+
+TRACE_EVENT(mshv_hvcall_delete_partition,
+ TP_PROTO(u64 partition_id, u64 status),
+ TP_ARGS(partition_id, status),
+ TP_STRUCT__entry(
+ __field(u64, partition_id)
+ __field(u64, status)
+ ),
+ TP_fast_assign(
+ __entry->partition_id = partition_id;
+ __entry->status = status;
+ ),
+ TP_printk("partition_id=%llu status=%#llx",
+ __entry->partition_id,
+ __entry->status
+ )
+);
+
+TRACE_EVENT(mshv_create_vp,
+ TP_PROTO(u64 partition_id, u32 vp_index, long vp_fd),
+ TP_ARGS(partition_id, vp_index, vp_fd),
+ TP_STRUCT__entry(
+ __field(u64, partition_id)
+ __field(u32, vp_index)
+ __field(long, vp_fd)
+ ),
+ TP_fast_assign(
+ __entry->partition_id = partition_id;
+ __entry->vp_index = vp_index;
+ __entry->vp_fd = vp_fd;
+ ),
+ TP_printk("partition_id=%llu vp_index=%u vp_fd=%ld",
+ __entry->partition_id,
+ __entry->vp_index,
+ __entry->vp_fd
+ )
+);
+
+TRACE_EVENT(mshv_hvcall_map_vp_state_page,
+ TP_PROTO(u64 partition_id, u32 vp_index, u32 page_type, u64 status),
+ TP_ARGS(partition_id, vp_index, page_type, status),
+ TP_STRUCT__entry(
+ __field(u64, partition_id)
+ __field(u32, vp_index)
+ __field(u32, page_type)
+ __field(u64, status)
+ ),
+ TP_fast_assign(
+ __entry->partition_id = partition_id;
+ __entry->vp_index = vp_index;
+ __entry->page_type = page_type;
+ __entry->status = status;
+ ),
+ TP_printk("partition_id=%llu vp_index=%u page_type=%u status=%#llx",
+ __entry->partition_id,
+ __entry->vp_index,
+ __entry->page_type,
+ __entry->status
+ )
+);
+
+TRACE_EVENT(mshv_drain_vp_signals,
+ TP_PROTO(u64 partition_id, u32 vp_index),
+ TP_ARGS(partition_id, vp_index),
+ TP_STRUCT__entry(
+ __field(u64, partition_id)
+ __field(u32, vp_index)
+ ),
+ TP_fast_assign(
+ __entry->partition_id = partition_id;
+ __entry->vp_index = vp_index;
+ ),
+ TP_printk("partition_id=%llu vp_index=%u",
+ __entry->partition_id,
+ __entry->vp_index
+ )
+);
+
+TRACE_EVENT(mshv_disable_vp_dispatch,
+ TP_PROTO(u64 partition_id, u32 vp_index, int ret),
+ TP_ARGS(partition_id, vp_index, ret),
+ TP_STRUCT__entry(
+ __field(u64, partition_id)
+ __field(u32, vp_index)
+ __field(int, ret)
+ ),
+ TP_fast_assign(
+ __entry->partition_id = partition_id;
+ __entry->vp_index = vp_index;
+ __entry->ret = ret;
+ ),
+ TP_printk("partition_id=%llu vp_index=%u ret=%d",
+ __entry->partition_id,
+ __entry->vp_index,
+ __entry->ret
+ )
+);
+
+TRACE_EVENT(mshv_vp_release,
+ TP_PROTO(u64 partition_id, u32 vp_index),
+ TP_ARGS(partition_id, vp_index),
+ TP_STRUCT__entry(
+ __field(u64, partition_id)
+ __field(u32, vp_index)
+ ),
+ TP_fast_assign(
+ __entry->partition_id = partition_id;
+ __entry->vp_index = vp_index;
+ ),
+ TP_printk("partition_id=%llu vp_index=%u",
+ __entry->partition_id,
+ __entry->vp_index
+ )
+);
+
+TRACE_EVENT(mshv_run_vp_entry,
+ TP_PROTO(u64 partition_id, u32 vp_index),
+ TP_ARGS(partition_id, vp_index),
+ TP_STRUCT__entry(
+ __field(u64, partition_id)
+ __field(u32, vp_index)
+ ),
+ TP_fast_assign(
+ __entry->partition_id = partition_id;
+ __entry->vp_index = vp_index;
+ ),
+ TP_printk("partition_id=%llu vp_index=%u",
+ __entry->partition_id,
+ __entry->vp_index
+ )
+);
+
+TRACE_EVENT(mshv_run_vp_exit,
+ TP_PROTO(u64 partition_id, u32 vp_index, u64 hv_message_type, long ret),
+ TP_ARGS(partition_id, vp_index, hv_message_type, ret),
+ TP_STRUCT__entry(
+ __field(u64, partition_id)
+ __field(u32, vp_index)
+ __field(u64, hv_message_type)
+ __field(long, ret)
+ ),
+ TP_fast_assign(
+ __entry->partition_id = partition_id;
+ __entry->vp_index = vp_index;
+ __entry->hv_message_type = hv_message_type;
+ __entry->ret = ret;
+ ),
+ TP_printk("partition_id=%llu vp_index=%u hv_message_type=%#llx ret=%ld",
+ __entry->partition_id,
+ __entry->vp_index,
+ __entry->hv_message_type,
+ __entry->ret
+ )
+);
+
+TRACE_EVENT(mshv_vp_clear_explicit_suspend,
+ TP_PROTO(u64 partition_id, u32 vp_index, int ret),
+ TP_ARGS(partition_id, vp_index, ret),
+ TP_STRUCT__entry(
+ __field(u64, partition_id)
+ __field(u32, vp_index)
+ __field(int, ret)
+ ),
+ TP_fast_assign(
+ __entry->partition_id = partition_id;
+ __entry->vp_index = vp_index;
+ __entry->ret = ret;
+ ),
+ TP_printk("partition_id=%llu vp_index=%u ret=%d",
+ __entry->partition_id,
+ __entry->vp_index,
+ __entry->ret
+ )
+);
+
+TRACE_EVENT(mshv_xfer_to_guest_mode_work,
+ TP_PROTO(u64 partition_id, u32 vp_index, unsigned long thread_info_flag, long ret),
+ TP_ARGS(partition_id, vp_index, thread_info_flag, ret),
+ TP_STRUCT__entry(
+ __field(u64, partition_id)
+ __field(u32, vp_index)
+ __field(unsigned long, thread_info_flag)
+ __field(long, ret)
+ ),
+ TP_fast_assign(
+ __entry->partition_id = partition_id;
+ __entry->vp_index = vp_index;
+ __entry->thread_info_flag = thread_info_flag;
+ __entry->ret = ret;
+ ),
+ TP_printk("partition_id=%llu vp_index=%u thread_info_flag=%#lx ret=%ld",
+ __entry->partition_id,
+ __entry->vp_index,
+ __entry->thread_info_flag,
+ __entry->ret
+ )
+);
+
+TRACE_EVENT(mshv_hvcall_dispatch_vp,
+ TP_PROTO(u64 partition_id, u32 vp_index, u32 flags,
+ u32 dispatch_state, u32 dispatch_event, u64 irq_vectors, u64 status),
+ TP_ARGS(partition_id, vp_index, flags, dispatch_state, dispatch_event, irq_vectors,
+ status),
+ TP_STRUCT__entry(
+ __field(u64, partition_id)
+ __field(u32, vp_index)
+ __field(u32, flags)
+ __field(u32, dispatch_state)
+ __field(u32, dispatch_event)
+ __field(u64, irq_vectors)
+ __field(u64, status)
+ ),
+ TP_fast_assign(
+ __entry->partition_id = partition_id;
+ __entry->vp_index = vp_index;
+ __entry->flags = flags;
+ __entry->dispatch_state = dispatch_state;
+ __entry->dispatch_event = dispatch_event;
+ __entry->irq_vectors = irq_vectors;
+ __entry->status = status;
+ ),
+ TP_printk("partition_id=%llu vp_index=%u flags=%#x dispatch_state=%#x dispatch_event=%#x irq_vectors=%#016llx status=%#llx",
+ __entry->partition_id,
+ __entry->vp_index,
+ __entry->flags,
+ __entry->dispatch_state,
+ __entry->dispatch_event,
+ __entry->irq_vectors,
+ __entry->status
+ )
+);
+
+TRACE_EVENT(mshv_update_routing_table,
+ TP_PROTO(u64 partition_id, void *old, void *new, u32 numents),
+ TP_ARGS(partition_id, old, new, numents),
+ TP_STRUCT__entry(
+ __field(u64, partition_id)
+ __field(struct mshv_girq_routing_table *, old)
+ __field(struct mshv_girq_routing_table *, new)
+ __field(u32, numents)
+ ),
+ TP_fast_assign(
+ __entry->partition_id = partition_id;
+ __entry->old = old;
+ __entry->new = new;
+ __entry->numents = numents;
+ ),
+ TP_printk("partition_id=%llu old=%p new=%p numents=%u",
+ __entry->partition_id,
+ __entry->old,
+ __entry->new,
+ __entry->numents
+ )
+);
+
+TRACE_EVENT(mshv_map_user_memory,
+ TP_PROTO(u64 partition_id, u64 start_uaddr, u64 start_gfn, u64 nr_pages, u32 map_flags,
+ long ret),
+ TP_ARGS(partition_id, start_uaddr, start_gfn, nr_pages, map_flags, ret),
+ TP_STRUCT__entry(
+ __field(u64, partition_id)
+ __field(u64, start_uaddr)
+ __field(u64, start_gfn)
+ __field(u64, nr_pages)
+ __field(u32, map_flags)
+ __field(long, ret)
+ ),
+ TP_fast_assign(
+ __entry->partition_id = partition_id;
+ __entry->start_uaddr = start_uaddr;
+ __entry->start_gfn = start_gfn;
+ __entry->nr_pages = nr_pages;
+ __entry->map_flags = map_flags;
+ __entry->ret = ret;
+ ),
+ TP_printk("partition_id=%llu start_uaddr=%#llx start_gfn=%#llx nr_pages=%llu map_flags=%#x ret=%ld",
+ __entry->partition_id,
+ __entry->start_uaddr,
+ __entry->start_gfn,
+ __entry->nr_pages,
+ __entry->map_flags,
+ __entry->ret
+ )
+);
+
+TRACE_EVENT(mshv_assign_ioeventfd,
+ TP_PROTO(u64 partition_id, u64 addr, u64 length, u64 datamatch, bool wildcard,
+ void *eventfd, int ret),
+ TP_ARGS(partition_id, addr, length, datamatch, wildcard, eventfd, ret),
+ TP_STRUCT__entry(
+ __field(u64, partition_id)
+ __field(u64, addr)
+ __field(u64, length)
+ __field(u64, datamatch)
+ __field(bool, wildcard)
+ __field(struct eventfd_ctx *, eventfd)
+ __field(int, ret)
+ ),
+ TP_fast_assign(
+ __entry->partition_id = partition_id;
+ __entry->addr = addr;
+ __entry->length = length;
+ __entry->datamatch = datamatch;
+ __entry->wildcard = wildcard;
+ __entry->eventfd = eventfd;
+ __entry->ret = ret;
+ ),
+ TP_printk("partition_id=%llu addr=%#016llx length=%#llx datamatch=%#llx wildcard=%d eventfd=%p ret=%d",
+ __entry->partition_id,
+ __entry->addr,
+ __entry->length,
+ __entry->datamatch,
+ __entry->wildcard,
+ __entry->eventfd,
+ __entry->ret
+ )
+);
+
+TRACE_EVENT(mshv_deassign_ioeventfd,
+ TP_PROTO(u64 partition_id, u64 addr, u64 length, u64 datamatch, bool wildcard,
+ void *eventfd),
+ TP_ARGS(partition_id, addr, length, datamatch, wildcard, eventfd),
+ TP_STRUCT__entry(
+ __field(u64, partition_id)
+ __field(u64, addr)
+ __field(u64, length)
+ __field(u64, datamatch)
+ __field(bool, wildcard)
+ __field(struct eventfd_ctx *, eventfd)
+ ),
+ TP_fast_assign(
+ __entry->partition_id = partition_id;
+ __entry->addr = addr;
+ __entry->length = length;
+ __entry->datamatch = datamatch;
+ __entry->wildcard = wildcard;
+ __entry->eventfd = eventfd;
+ ),
+ TP_printk("partition_id=%llu addr=%#016llx length=%#llx datamatch=%#llx wildcard=%d eventfd=%p",
+ __entry->partition_id,
+ __entry->addr,
+ __entry->length,
+ __entry->datamatch,
+ __entry->wildcard,
+ __entry->eventfd
+ )
+);
+
+TRACE_EVENT(mshv_vp_wait_for_hv_kick,
+ TP_PROTO(u64 partition_id, u32 vp_index, bool kicked_by_hv, bool blocked,
+ bool irq_pending),
+ TP_ARGS(partition_id, vp_index, kicked_by_hv, blocked, irq_pending),
+ TP_STRUCT__entry(
+ __field(u64, partition_id)
+ __field(u32, vp_index)
+ __field(bool, kicked_by_hv)
+ __field(bool, blocked)
+ __field(bool, irq_pending)
+ ),
+ TP_fast_assign(
+ __entry->partition_id = partition_id;
+ __entry->vp_index = vp_index;
+ __entry->kicked_by_hv = kicked_by_hv;
+ __entry->blocked = blocked;
+ __entry->irq_pending = irq_pending;
+ ),
+ TP_printk("partition_id=%llu vp_index=%u kicked_by_hv=%d blocked=%d irq_pending=%d",
+ __entry->partition_id,
+ __entry->vp_index,
+ __entry->kicked_by_hv,
+ __entry->blocked,
+ __entry->irq_pending
+ )
+);
+
+TRACE_EVENT(mshv_handle_gpa_intercept,
+ TP_PROTO(u64 partition_id, u32 vp_index, u64 gfn, u8 access_type, bool handled),
+ TP_ARGS(partition_id, vp_index, gfn, access_type, handled),
+ TP_STRUCT__entry(
+ __field(u64, partition_id)
+ __field(u32, vp_index)
+ __field(u64, gfn)
+ __field(u8, access_type)
+ __field(bool, handled)
+ ),
+ TP_fast_assign(
+ __entry->partition_id = partition_id;
+ __entry->vp_index = vp_index;
+ __entry->gfn = gfn;
+ __entry->access_type = access_type == HV_INTERCEPT_ACCESS_READ ? 'R' :
+ (access_type == HV_INTERCEPT_ACCESS_WRITE ? 'W' :
+ (access_type == HV_INTERCEPT_ACCESS_EXECUTE ? 'X' : '?'));
+ __entry->handled = handled;
+ ),
+ TP_printk("partition_id=%llu vp_index=%u gfn=0x%llx access_type=%c handled=%d",
+ __entry->partition_id,
+ __entry->vp_index,
+ __entry->gfn,
+ __entry->access_type,
+ __entry->handled
+ )
+);
+
+#endif /* _MSHV_TRACE_H_ */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/drivers/hv/mshv_vtl.h b/drivers/hv/mshv_vtl.h
new file mode 100644
index 000000000000..a6eea52f7aa2
--- /dev/null
+++ b/drivers/hv/mshv_vtl.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _MSHV_VTL_H
+#define _MSHV_VTL_H
+
+#include <linux/mshv.h>
+#include <linux/types.h>
+
+struct mshv_vtl_run {
+ u32 cancel;
+ u32 vtl_ret_action_size;
+ u32 pad[2];
+ char exit_message[MSHV_MAX_RUN_MSG_SIZE];
+ union {
+ struct mshv_vtl_cpu_context cpu_context;
+
+ /*
+ * Reserving room for the cpu context to grow and to maintain compatibility
+ * with user mode.
+ */
+ char reserved[1024];
+ };
+ char vtl_ret_actions[MSHV_MAX_RUN_MSG_SIZE];
+};
+
+#endif /* _MSHV_VTL_H */
diff --git a/drivers/hv/mshv_vtl_main.c b/drivers/hv/mshv_vtl_main.c
new file mode 100644
index 000000000000..c19400701467
--- /dev/null
+++ b/drivers/hv/mshv_vtl_main.c
@@ -0,0 +1,1399 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2023, Microsoft Corporation.
+ *
+ * Author:
+ * Roman Kisel <romank@linux.microsoft.com>
+ * Saurabh Sengar <ssengar@linux.microsoft.com>
+ * Naman Jain <namjain@linux.microsoft.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/miscdevice.h>
+#include <linux/anon_inodes.h>
+#include <linux/cpuhotplug.h>
+#include <linux/count_zeros.h>
+#include <linux/entry-virt.h>
+#include <linux/eventfd.h>
+#include <linux/poll.h>
+#include <linux/file.h>
+#include <linux/vmalloc.h>
+#include <asm/debugreg.h>
+#include <asm/mshyperv.h>
+#include <trace/events/ipi.h>
+#include <uapi/asm/mtrr.h>
+#include <uapi/linux/mshv.h>
+#include <hyperv/hvhdk.h>
+
+#include "../../kernel/fpu/legacy.h"
+#include "mshv.h"
+#include "mshv_vtl.h"
+#include "hyperv_vmbus.h"
+
+MODULE_AUTHOR("Microsoft");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Microsoft Hyper-V VTL Driver");
+
+#define MSHV_ENTRY_REASON_LOWER_VTL_CALL 0x1
+#define MSHV_ENTRY_REASON_INTERRUPT 0x2
+#define MSHV_ENTRY_REASON_INTERCEPT 0x3
+
+#define MSHV_REAL_OFF_SHIFT 16
+#define MSHV_PG_OFF_CPU_MASK (BIT_ULL(MSHV_REAL_OFF_SHIFT) - 1)
+#define MSHV_RUN_PAGE_OFFSET 0
+#define MSHV_REG_PAGE_OFFSET 1
+#define VTL2_VMBUS_SINT_INDEX 7
+
+static struct device *mem_dev;
+
+static struct tasklet_struct msg_dpc;
+static wait_queue_head_t fd_wait_queue;
+static bool has_message;
+static struct eventfd_ctx *flag_eventfds[HV_EVENT_FLAGS_COUNT];
+static DEFINE_MUTEX(flag_lock);
+static bool __read_mostly mshv_has_reg_page;
+
+/* hvcall code is of type u16, allocate a bitmap of size (1 << 16) to accommodate it */
+#define MAX_BITMAP_SIZE ((U16_MAX + 1) / 8)
+
+struct mshv_vtl_hvcall_fd {
+ u8 allow_bitmap[MAX_BITMAP_SIZE];
+ bool allow_map_initialized;
+ /*
+ * Used to protect hvcall setup in IOCTLs
+ */
+ struct mutex init_mutex;
+ struct miscdevice *dev;
+};
+
+struct mshv_vtl_poll_file {
+ struct file *file;
+ wait_queue_entry_t wait;
+ wait_queue_head_t *wqh;
+ poll_table pt;
+ int cpu;
+};
+
+struct mshv_vtl {
+ struct device *module_dev;
+ u64 id;
+};
+
+struct mshv_vtl_per_cpu {
+ struct mshv_vtl_run *run;
+ struct page *reg_page;
+};
+
+/* SYNIC_OVERLAY_PAGE_MSR - internal, identical to hv_synic_simp */
+union hv_synic_overlay_page_msr {
+ u64 as_uint64;
+ struct {
+ u64 enabled: 1;
+ u64 reserved: 11;
+ u64 pfn: 52;
+ } __packed;
+};
+
+static struct mutex mshv_vtl_poll_file_lock;
+static union hv_register_vsm_page_offsets mshv_vsm_page_offsets;
+static union hv_register_vsm_capabilities mshv_vsm_capabilities;
+
+static DEFINE_PER_CPU(struct mshv_vtl_poll_file, mshv_vtl_poll_file);
+static DEFINE_PER_CPU(unsigned long long, num_vtl0_transitions);
+static DEFINE_PER_CPU(struct mshv_vtl_per_cpu, mshv_vtl_per_cpu);
+
+static const union hv_input_vtl input_vtl_zero;
+static const union hv_input_vtl input_vtl_normal = {
+ .use_target_vtl = 1,
+};
+
+static const struct file_operations mshv_vtl_fops;
+
+static long
+mshv_ioctl_create_vtl(void __user *user_arg, struct device *module_dev)
+{
+ struct mshv_vtl *vtl;
+ struct file *file;
+ int fd;
+
+ vtl = kzalloc_obj(*vtl);
+ if (!vtl)
+ return -ENOMEM;
+
+ fd = get_unused_fd_flags(O_CLOEXEC);
+ if (fd < 0) {
+ kfree(vtl);
+ return fd;
+ }
+ file = anon_inode_getfile("mshv_vtl", &mshv_vtl_fops,
+ vtl, O_RDWR);
+ if (IS_ERR(file)) {
+ kfree(vtl);
+ return PTR_ERR(file);
+ }
+ vtl->module_dev = module_dev;
+ fd_install(fd, file);
+
+ return fd;
+}
+
+static long
+mshv_ioctl_check_extension(void __user *user_arg)
+{
+ u32 arg;
+
+ if (copy_from_user(&arg, user_arg, sizeof(arg)))
+ return -EFAULT;
+
+ switch (arg) {
+ case MSHV_CAP_CORE_API_STABLE:
+ return 0;
+ case MSHV_CAP_REGISTER_PAGE:
+ return mshv_has_reg_page;
+ case MSHV_CAP_VTL_RETURN_ACTION:
+ return mshv_vsm_capabilities.return_action_available;
+ case MSHV_CAP_DR6_SHARED:
+ return mshv_vsm_capabilities.dr6_shared;
+ }
+
+ return -EOPNOTSUPP;
+}
+
+static long
+mshv_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
+{
+ struct miscdevice *misc = filp->private_data;
+
+ switch (ioctl) {
+ case MSHV_CHECK_EXTENSION:
+ return mshv_ioctl_check_extension((void __user *)arg);
+ case MSHV_CREATE_VTL:
+ return mshv_ioctl_create_vtl((void __user *)arg, misc->this_device);
+ }
+
+ return -ENOTTY;
+}
+
+static const struct file_operations mshv_dev_fops = {
+ .owner = THIS_MODULE,
+ .unlocked_ioctl = mshv_dev_ioctl,
+ .llseek = noop_llseek,
+};
+
+static struct miscdevice mshv_dev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "mshv",
+ .fops = &mshv_dev_fops,
+ .mode = 0600,
+};
+
+static struct mshv_vtl_run *mshv_vtl_this_run(void)
+{
+ return *this_cpu_ptr(&mshv_vtl_per_cpu.run);
+}
+
+static struct mshv_vtl_run *mshv_vtl_cpu_run(int cpu)
+{
+ return *per_cpu_ptr(&mshv_vtl_per_cpu.run, cpu);
+}
+
+static struct page *mshv_vtl_cpu_reg_page(int cpu)
+{
+ return *per_cpu_ptr(&mshv_vtl_per_cpu.reg_page, cpu);
+}
+
+static void mshv_vtl_configure_reg_page(struct mshv_vtl_per_cpu *per_cpu)
+{
+ struct hv_register_assoc reg_assoc = {};
+ union hv_synic_overlay_page_msr overlay = {};
+ struct page *reg_page;
+
+ reg_page = alloc_page(GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL);
+ if (!reg_page) {
+ WARN(1, "failed to allocate register page\n");
+ return;
+ }
+
+ overlay.enabled = 1;
+ overlay.pfn = page_to_hvpfn(reg_page);
+ reg_assoc.name = HV_X64_REGISTER_REG_PAGE;
+ reg_assoc.value.reg64 = overlay.as_uint64;
+
+ if (hv_call_set_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF,
+ 1, input_vtl_zero, &reg_assoc)) {
+ WARN(1, "failed to setup register page\n");
+ __free_page(reg_page);
+ return;
+ }
+
+ per_cpu->reg_page = reg_page;
+ mshv_has_reg_page = true;
+}
+
+static void mshv_vtl_synic_enable_regs(unsigned int cpu)
+{
+ union hv_synic_sint sint;
+
+ sint.as_uint64 = 0;
+ sint.vector = HYPERVISOR_CALLBACK_VECTOR;
+ sint.masked = false;
+ sint.auto_eoi = hv_recommend_using_aeoi();
+
+ /* Enable intercepts */
+ if (!mshv_vsm_capabilities.intercept_page_available)
+ hv_set_msr(HV_MSR_SINT0 + HV_SYNIC_INTERCEPTION_SINT_INDEX,
+ sint.as_uint64);
+
+ /* VTL2 Host VSP SINT is (un)masked when the user mode requests that */
+}
+
+static int mshv_vtl_get_vsm_regs(void)
+{
+ struct hv_register_assoc registers[2];
+ int ret, count = 2;
+
+ registers[0].name = HV_REGISTER_VSM_CODE_PAGE_OFFSETS;
+ registers[1].name = HV_REGISTER_VSM_CAPABILITIES;
+
+ ret = hv_call_get_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF,
+ count, input_vtl_zero, registers);
+ if (ret)
+ return ret;
+
+ mshv_vsm_page_offsets.as_uint64 = registers[0].value.reg64;
+ mshv_vsm_capabilities.as_uint64 = registers[1].value.reg64;
+
+ return ret;
+}
+
+static int mshv_vtl_configure_vsm_partition(struct device *dev)
+{
+ union hv_register_vsm_partition_config config;
+ struct hv_register_assoc reg_assoc;
+
+ config.as_uint64 = 0;
+ config.default_vtl_protection_mask = HV_MAP_GPA_PERMISSIONS_MASK;
+ config.enable_vtl_protection = 1;
+ config.zero_memory_on_reset = 1;
+ config.intercept_vp_startup = 1;
+ config.intercept_cpuid_unimplemented = 1;
+
+ if (mshv_vsm_capabilities.intercept_page_available) {
+ dev_dbg(dev, "using intercept page\n");
+ config.intercept_page = 1;
+ }
+
+ reg_assoc.name = HV_REGISTER_VSM_PARTITION_CONFIG;
+ reg_assoc.value.reg64 = config.as_uint64;
+
+ return hv_call_set_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF,
+ 1, input_vtl_zero, &reg_assoc);
+}
+
+static void mshv_vtl_vmbus_isr(void)
+{
+ struct hv_per_cpu_context *per_cpu;
+ struct hv_message *msg;
+ u32 message_type;
+ union hv_synic_event_flags *event_flags;
+ struct eventfd_ctx *eventfd;
+ u16 i;
+
+ per_cpu = this_cpu_ptr(hv_context.cpu_context);
+ if (smp_processor_id() == 0) {
+ msg = (struct hv_message *)per_cpu->hyp_synic_message_page + VTL2_VMBUS_SINT_INDEX;
+ message_type = READ_ONCE(msg->header.message_type);
+ if (message_type != HVMSG_NONE)
+ tasklet_schedule(&msg_dpc);
+ }
+
+ event_flags = (union hv_synic_event_flags *)per_cpu->hyp_synic_event_page +
+ VTL2_VMBUS_SINT_INDEX;
+ for_each_set_bit(i, event_flags->flags, HV_EVENT_FLAGS_COUNT) {
+ if (!sync_test_and_clear_bit(i, event_flags->flags))
+ continue;
+ rcu_read_lock();
+ eventfd = READ_ONCE(flag_eventfds[i]);
+ if (eventfd)
+ eventfd_signal(eventfd);
+ rcu_read_unlock();
+ }
+
+ vmbus_isr();
+}
+
+static int mshv_vtl_alloc_context(unsigned int cpu)
+{
+ struct mshv_vtl_per_cpu *per_cpu = this_cpu_ptr(&mshv_vtl_per_cpu);
+
+ per_cpu->run = (struct mshv_vtl_run *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+ if (!per_cpu->run)
+ return -ENOMEM;
+
+ if (mshv_vsm_capabilities.intercept_page_available)
+ mshv_vtl_configure_reg_page(per_cpu);
+
+ mshv_vtl_synic_enable_regs(cpu);
+
+ return 0;
+}
+
+static int mshv_vtl_cpuhp_online;
+
+static int hv_vtl_setup_synic(void)
+{
+ int ret;
+
+ /* Use our isr to first filter out packets destined for userspace */
+ hv_setup_vmbus_handler(mshv_vtl_vmbus_isr);
+
+ ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "hyperv/vtl:online",
+ mshv_vtl_alloc_context, NULL);
+ if (ret < 0) {
+ hv_setup_vmbus_handler(vmbus_isr);
+ return ret;
+ }
+
+ mshv_vtl_cpuhp_online = ret;
+
+ return 0;
+}
+
+static void hv_vtl_remove_synic(void)
+{
+ cpuhp_remove_state(mshv_vtl_cpuhp_online);
+ hv_setup_vmbus_handler(vmbus_isr);
+}
+
+static int vtl_get_vp_register(struct hv_register_assoc *reg)
+{
+ return hv_call_get_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF,
+ 1, input_vtl_normal, reg);
+}
+
+static int vtl_set_vp_register(struct hv_register_assoc *reg)
+{
+ return hv_call_set_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF,
+ 1, input_vtl_normal, reg);
+}
+
+static int mshv_vtl_ioctl_add_vtl0_mem(struct mshv_vtl *vtl, void __user *arg)
+{
+ struct mshv_vtl_ram_disposition vtl0_mem;
+ struct dev_pagemap *pgmap;
+ void *addr;
+
+ if (copy_from_user(&vtl0_mem, arg, sizeof(vtl0_mem)))
+ return -EFAULT;
+ if (vtl0_mem.last_pfn <= vtl0_mem.start_pfn) {
+ dev_err(vtl->module_dev, "range start pfn (%llx) > end pfn (%llx)\n",
+ vtl0_mem.start_pfn, vtl0_mem.last_pfn);
+ return -EFAULT;
+ }
+
+ pgmap = kzalloc_obj(*pgmap);
+ if (!pgmap)
+ return -ENOMEM;
+
+ /*
+ * vtl0_mem.last_pfn is excluded in the pagemap range for VTL0 as per design.
+ * last_pfn is not reserved or wasted, and reflects 'start_pfn + size' of pagemap range.
+ */
+ pgmap->ranges[0].start = PFN_PHYS(vtl0_mem.start_pfn);
+ pgmap->ranges[0].end = PFN_PHYS(vtl0_mem.last_pfn) - 1;
+ pgmap->nr_range = 1;
+ pgmap->type = MEMORY_DEVICE_GENERIC;
+
+ /*
+ * Determine the highest page order that can be used for the given memory range.
+ * This works best when the range is aligned; i.e. both the start and the length.
+ * Clamp to MAX_FOLIO_ORDER to avoid a WARN in memremap_pages() when the range
+ * alignment exceeds the maximum supported folio order for this kernel config.
+ */
+ pgmap->vmemmap_shift = min(count_trailing_zeros(vtl0_mem.start_pfn | vtl0_mem.last_pfn),
+ MAX_FOLIO_ORDER);
+ dev_dbg(vtl->module_dev,
+ "Add VTL0 memory: start: 0x%llx, end_pfn: 0x%llx, page order: %lu\n",
+ vtl0_mem.start_pfn, vtl0_mem.last_pfn, pgmap->vmemmap_shift);
+
+ addr = devm_memremap_pages(mem_dev, pgmap);
+ if (IS_ERR(addr)) {
+ dev_err(vtl->module_dev, "devm_memremap_pages error: %ld\n", PTR_ERR(addr));
+ kfree(pgmap);
+ return PTR_ERR(addr);
+ }
+
+ /* Don't free pgmap, since it has to stick around until the memory
+ * is unmapped, which will never happen as there is no scenario
+ * where VTL0 can be released/shutdown without bringing down VTL2.
+ */
+ return 0;
+}
+
+static void mshv_vtl_cancel(int cpu)
+{
+ int here = get_cpu();
+
+ if (here != cpu) {
+ if (!xchg_relaxed(&mshv_vtl_cpu_run(cpu)->cancel, 1))
+ smp_send_reschedule(cpu);
+ } else {
+ WRITE_ONCE(mshv_vtl_this_run()->cancel, 1);
+ }
+ put_cpu();
+}
+
+static int mshv_vtl_poll_file_wake(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key)
+{
+ struct mshv_vtl_poll_file *poll_file = container_of(wait, struct mshv_vtl_poll_file, wait);
+
+ mshv_vtl_cancel(poll_file->cpu);
+
+ return 0;
+}
+
+static void mshv_vtl_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh, poll_table *pt)
+{
+ struct mshv_vtl_poll_file *poll_file = container_of(pt, struct mshv_vtl_poll_file, pt);
+
+ WARN_ON(poll_file->wqh);
+ poll_file->wqh = wqh;
+ add_wait_queue(wqh, &poll_file->wait);
+}
+
+static int mshv_vtl_ioctl_set_poll_file(struct mshv_vtl_set_poll_file __user *user_input)
+{
+ struct file *file, *old_file;
+ struct mshv_vtl_poll_file *poll_file;
+ struct mshv_vtl_set_poll_file input;
+
+ if (copy_from_user(&input, user_input, sizeof(input)))
+ return -EFAULT;
+
+ if (input.cpu >= num_possible_cpus() || !cpu_online(input.cpu))
+ return -EINVAL;
+ /*
+ * CPU Hotplug is not supported in VTL2 in OpenHCL, where this kernel driver exists.
+ * CPU is expected to remain online after above cpu_online() check.
+ */
+
+ file = NULL;
+ file = fget(input.fd);
+ if (!file)
+ return -EBADFD;
+
+ poll_file = per_cpu_ptr(&mshv_vtl_poll_file, READ_ONCE(input.cpu));
+ if (!poll_file)
+ return -EINVAL;
+
+ mutex_lock(&mshv_vtl_poll_file_lock);
+
+ if (poll_file->wqh)
+ remove_wait_queue(poll_file->wqh, &poll_file->wait);
+ poll_file->wqh = NULL;
+
+ old_file = poll_file->file;
+ poll_file->file = file;
+ poll_file->cpu = input.cpu;
+
+ if (file) {
+ init_waitqueue_func_entry(&poll_file->wait, mshv_vtl_poll_file_wake);
+ init_poll_funcptr(&poll_file->pt, mshv_vtl_ptable_queue_proc);
+ vfs_poll(file, &poll_file->pt);
+ }
+
+ mutex_unlock(&mshv_vtl_poll_file_lock);
+
+ if (old_file)
+ fput(old_file);
+
+ return 0;
+}
+
+/* Static table mapping register names to their corresponding actions */
+static const struct {
+ enum hv_register_name reg_name;
+ int debug_reg_num; /* -1 if not a debug register */
+ u32 msr_addr; /* 0 if not an MSR */
+} reg_table[] = {
+ /* Debug registers */
+ {HV_X64_REGISTER_DR0, 0, 0},
+ {HV_X64_REGISTER_DR1, 1, 0},
+ {HV_X64_REGISTER_DR2, 2, 0},
+ {HV_X64_REGISTER_DR3, 3, 0},
+ {HV_X64_REGISTER_DR6, 6, 0},
+ /* MTRR MSRs */
+ {HV_X64_REGISTER_MSR_MTRR_CAP, -1, MSR_MTRRcap},
+ {HV_X64_REGISTER_MSR_MTRR_DEF_TYPE, -1, MSR_MTRRdefType},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE0, -1, MTRRphysBase_MSR(0)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE1, -1, MTRRphysBase_MSR(1)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE2, -1, MTRRphysBase_MSR(2)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE3, -1, MTRRphysBase_MSR(3)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE4, -1, MTRRphysBase_MSR(4)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE5, -1, MTRRphysBase_MSR(5)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE6, -1, MTRRphysBase_MSR(6)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE7, -1, MTRRphysBase_MSR(7)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE8, -1, MTRRphysBase_MSR(8)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE9, -1, MTRRphysBase_MSR(9)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_BASEA, -1, MTRRphysBase_MSR(0xa)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_BASEB, -1, MTRRphysBase_MSR(0xb)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_BASEC, -1, MTRRphysBase_MSR(0xc)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_BASED, -1, MTRRphysBase_MSR(0xd)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_BASEE, -1, MTRRphysBase_MSR(0xe)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_BASEF, -1, MTRRphysBase_MSR(0xf)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK0, -1, MTRRphysMask_MSR(0)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK1, -1, MTRRphysMask_MSR(1)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK2, -1, MTRRphysMask_MSR(2)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK3, -1, MTRRphysMask_MSR(3)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK4, -1, MTRRphysMask_MSR(4)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK5, -1, MTRRphysMask_MSR(5)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK6, -1, MTRRphysMask_MSR(6)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK7, -1, MTRRphysMask_MSR(7)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK8, -1, MTRRphysMask_MSR(8)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK9, -1, MTRRphysMask_MSR(9)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_MASKA, -1, MTRRphysMask_MSR(0xa)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_MASKB, -1, MTRRphysMask_MSR(0xb)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_MASKC, -1, MTRRphysMask_MSR(0xc)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_MASKD, -1, MTRRphysMask_MSR(0xd)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_MASKE, -1, MTRRphysMask_MSR(0xe)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_MASKF, -1, MTRRphysMask_MSR(0xf)},
+ {HV_X64_REGISTER_MSR_MTRR_FIX64K00000, -1, MSR_MTRRfix64K_00000},
+ {HV_X64_REGISTER_MSR_MTRR_FIX16K80000, -1, MSR_MTRRfix16K_80000},
+ {HV_X64_REGISTER_MSR_MTRR_FIX16KA0000, -1, MSR_MTRRfix16K_A0000},
+ {HV_X64_REGISTER_MSR_MTRR_FIX4KC0000, -1, MSR_MTRRfix4K_C0000},
+ {HV_X64_REGISTER_MSR_MTRR_FIX4KC8000, -1, MSR_MTRRfix4K_C8000},
+ {HV_X64_REGISTER_MSR_MTRR_FIX4KD0000, -1, MSR_MTRRfix4K_D0000},
+ {HV_X64_REGISTER_MSR_MTRR_FIX4KD8000, -1, MSR_MTRRfix4K_D8000},
+ {HV_X64_REGISTER_MSR_MTRR_FIX4KE0000, -1, MSR_MTRRfix4K_E0000},
+ {HV_X64_REGISTER_MSR_MTRR_FIX4KE8000, -1, MSR_MTRRfix4K_E8000},
+ {HV_X64_REGISTER_MSR_MTRR_FIX4KF0000, -1, MSR_MTRRfix4K_F0000},
+ {HV_X64_REGISTER_MSR_MTRR_FIX4KF8000, -1, MSR_MTRRfix4K_F8000},
+};
+
+static int mshv_vtl_get_set_reg(struct hv_register_assoc *regs, bool set)
+{
+ u64 *reg64;
+ enum hv_register_name gpr_name;
+ int i;
+
+ gpr_name = regs->name;
+ reg64 = &regs->value.reg64;
+
+ /* Search for the register in the table */
+ for (i = 0; i < ARRAY_SIZE(reg_table); i++) {
+ if (reg_table[i].reg_name != gpr_name)
+ continue;
+ if (reg_table[i].debug_reg_num != -1) {
+ /* Handle debug registers */
+ if (gpr_name == HV_X64_REGISTER_DR6 &&
+ !mshv_vsm_capabilities.dr6_shared)
+ goto hypercall;
+ if (set)
+ native_set_debugreg(reg_table[i].debug_reg_num, *reg64);
+ else
+ *reg64 = native_get_debugreg(reg_table[i].debug_reg_num);
+ } else {
+ /* Handle MSRs */
+ if (set)
+ wrmsrl(reg_table[i].msr_addr, *reg64);
+ else
+ rdmsrl(reg_table[i].msr_addr, *reg64);
+ }
+ return 0;
+ }
+
+hypercall:
+ return 1;
+}
+
+static void mshv_vtl_return(struct mshv_vtl_cpu_context *vtl0)
+{
+ struct hv_vp_assist_page *hvp;
+
+ hvp = hv_vp_assist_page[smp_processor_id()];
+
+ /*
+ * Process signal event direct set in the run page, if any.
+ */
+ if (mshv_vsm_capabilities.return_action_available) {
+ u32 offset = READ_ONCE(mshv_vtl_this_run()->vtl_ret_action_size);
+
+ WRITE_ONCE(mshv_vtl_this_run()->vtl_ret_action_size, 0);
+
+ /*
+ * Hypervisor will take care of clearing out the actions
+ * set in the assist page.
+ */
+ memcpy(hvp->vtl_ret_actions,
+ mshv_vtl_this_run()->vtl_ret_actions,
+ min_t(u32, offset, sizeof(hvp->vtl_ret_actions)));
+ }
+
+ mshv_vtl_return_call(vtl0);
+}
+
+static bool mshv_vtl_process_intercept(void)
+{
+ struct hv_per_cpu_context *mshv_cpu;
+ void *synic_message_page;
+ struct hv_message *msg;
+ u32 message_type;
+
+ mshv_cpu = this_cpu_ptr(hv_context.cpu_context);
+ synic_message_page = mshv_cpu->hyp_synic_message_page;
+ if (unlikely(!synic_message_page))
+ return true;
+
+ msg = (struct hv_message *)synic_message_page + HV_SYNIC_INTERCEPTION_SINT_INDEX;
+ message_type = READ_ONCE(msg->header.message_type);
+ if (message_type == HVMSG_NONE)
+ return true;
+
+ memcpy(mshv_vtl_this_run()->exit_message, msg, sizeof(*msg));
+ vmbus_signal_eom(msg, message_type);
+
+ return false;
+}
+
+static int mshv_vtl_ioctl_return_to_lower_vtl(void)
+{
+ preempt_disable();
+ for (;;) {
+ unsigned long irq_flags;
+ struct hv_vp_assist_page *hvp;
+ int ret;
+
+ if (__xfer_to_guest_mode_work_pending()) {
+ preempt_enable();
+ ret = xfer_to_guest_mode_handle_work();
+ if (ret)
+ return ret;
+ preempt_disable();
+ }
+
+ local_irq_save(irq_flags);
+ if (READ_ONCE(mshv_vtl_this_run()->cancel)) {
+ local_irq_restore(irq_flags);
+ preempt_enable();
+ return -EINTR;
+ }
+
+ mshv_vtl_return(&mshv_vtl_this_run()->cpu_context);
+ local_irq_restore(irq_flags);
+
+ hvp = hv_vp_assist_page[smp_processor_id()];
+ this_cpu_inc(num_vtl0_transitions);
+ switch (hvp->vtl_entry_reason) {
+ case MSHV_ENTRY_REASON_INTERRUPT:
+ if (!mshv_vsm_capabilities.intercept_page_available &&
+ likely(!mshv_vtl_process_intercept()))
+ goto done;
+ break;
+
+ case MSHV_ENTRY_REASON_INTERCEPT:
+ WARN_ON(!mshv_vsm_capabilities.intercept_page_available);
+ memcpy(mshv_vtl_this_run()->exit_message, hvp->intercept_message,
+ sizeof(hvp->intercept_message));
+ goto done;
+
+ default:
+ panic("unknown entry reason: %d", hvp->vtl_entry_reason);
+ }
+ }
+
+done:
+ preempt_enable();
+
+ return 0;
+}
+
+static long
+mshv_vtl_ioctl_get_regs(void __user *user_args)
+{
+ struct mshv_vp_registers args;
+ struct hv_register_assoc reg;
+ long ret;
+
+ if (copy_from_user(&args, user_args, sizeof(args)))
+ return -EFAULT;
+
+ /* This IOCTL supports processing only one register at a time. */
+ if (args.count != 1)
+ return -EINVAL;
+
+ if (copy_from_user(&reg, (void __user *)args.regs_ptr,
+ sizeof(reg)))
+ return -EFAULT;
+
+ ret = mshv_vtl_get_set_reg(&reg, false);
+ if (!ret)
+ goto copy_args; /* No need of hypercall */
+ ret = vtl_get_vp_register(&reg);
+ if (ret)
+ return ret;
+
+copy_args:
+ if (copy_to_user((void __user *)args.regs_ptr, &reg, sizeof(reg)))
+ ret = -EFAULT;
+
+ return ret;
+}
+
+static long
+mshv_vtl_ioctl_set_regs(void __user *user_args)
+{
+ struct mshv_vp_registers args;
+ struct hv_register_assoc reg;
+ long ret;
+
+ if (copy_from_user(&args, user_args, sizeof(args)))
+ return -EFAULT;
+
+ /* This IOCTL supports processing only one register at a time. */
+ if (args.count != 1)
+ return -EINVAL;
+
+ if (copy_from_user(&reg, (void __user *)args.regs_ptr, sizeof(reg)))
+ return -EFAULT;
+
+ ret = mshv_vtl_get_set_reg(&reg, true);
+ if (!ret)
+ return ret; /* No need of hypercall */
+ ret = vtl_set_vp_register(&reg);
+
+ return ret;
+}
+
+static long
+mshv_vtl_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
+{
+ long ret;
+ struct mshv_vtl *vtl = filp->private_data;
+
+ switch (ioctl) {
+ case MSHV_SET_POLL_FILE:
+ ret = mshv_vtl_ioctl_set_poll_file((struct mshv_vtl_set_poll_file __user *)arg);
+ break;
+ case MSHV_GET_VP_REGISTERS:
+ ret = mshv_vtl_ioctl_get_regs((void __user *)arg);
+ break;
+ case MSHV_SET_VP_REGISTERS:
+ ret = mshv_vtl_ioctl_set_regs((void __user *)arg);
+ break;
+ case MSHV_RETURN_TO_LOWER_VTL:
+ ret = mshv_vtl_ioctl_return_to_lower_vtl();
+ break;
+ case MSHV_ADD_VTL0_MEMORY:
+ ret = mshv_vtl_ioctl_add_vtl0_mem(vtl, (void __user *)arg);
+ break;
+ default:
+ dev_err(vtl->module_dev, "invalid vtl ioctl: %#x\n", ioctl);
+ ret = -ENOTTY;
+ }
+
+ return ret;
+}
+
+static vm_fault_t mshv_vtl_fault(struct vm_fault *vmf)
+{
+ struct page *page;
+ int cpu = vmf->pgoff & MSHV_PG_OFF_CPU_MASK;
+ int real_off = vmf->pgoff >> MSHV_REAL_OFF_SHIFT;
+
+ if (!cpu_online(cpu))
+ return VM_FAULT_SIGBUS;
+ /*
+ * CPU Hotplug is not supported in VTL2 in OpenHCL, where this kernel driver exists.
+ * CPU is expected to remain online after above cpu_online() check.
+ */
+
+ if (real_off == MSHV_RUN_PAGE_OFFSET) {
+ page = virt_to_page(mshv_vtl_cpu_run(cpu));
+ } else if (real_off == MSHV_REG_PAGE_OFFSET) {
+ if (!mshv_has_reg_page)
+ return VM_FAULT_SIGBUS;
+ page = mshv_vtl_cpu_reg_page(cpu);
+ } else {
+ return VM_FAULT_NOPAGE;
+ }
+
+ get_page(page);
+ vmf->page = page;
+
+ return 0;
+}
+
+static const struct vm_operations_struct mshv_vtl_vm_ops = {
+ .fault = mshv_vtl_fault,
+};
+
+static int mshv_vtl_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+ vma->vm_ops = &mshv_vtl_vm_ops;
+
+ return 0;
+}
+
+static int mshv_vtl_release(struct inode *inode, struct file *filp)
+{
+ struct mshv_vtl *vtl = filp->private_data;
+
+ kfree(vtl);
+
+ return 0;
+}
+
+static const struct file_operations mshv_vtl_fops = {
+ .owner = THIS_MODULE,
+ .unlocked_ioctl = mshv_vtl_ioctl,
+ .release = mshv_vtl_release,
+ .mmap = mshv_vtl_mmap,
+};
+
+static void mshv_vtl_synic_mask_vmbus_sint(void *info)
+{
+ union hv_synic_sint sint;
+ const u8 *mask = info;
+
+ sint.as_uint64 = 0;
+ sint.vector = HYPERVISOR_CALLBACK_VECTOR;
+ sint.masked = (*mask != 0);
+ sint.auto_eoi = hv_recommend_using_aeoi();
+
+ hv_set_msr(HV_MSR_SINT0 + VTL2_VMBUS_SINT_INDEX,
+ sint.as_uint64);
+
+ if (!sint.masked)
+ pr_debug("%s: Unmasking VTL2 VMBUS SINT on VP %d\n", __func__, smp_processor_id());
+ else
+ pr_debug("%s: Masking VTL2 VMBUS SINT on VP %d\n", __func__, smp_processor_id());
+}
+
+static void mshv_vtl_read_remote(void *buffer)
+{
+ struct hv_per_cpu_context *mshv_cpu = this_cpu_ptr(hv_context.cpu_context);
+ struct hv_message *msg = (struct hv_message *)mshv_cpu->hyp_synic_message_page +
+ VTL2_VMBUS_SINT_INDEX;
+ u32 message_type = READ_ONCE(msg->header.message_type);
+
+ WRITE_ONCE(has_message, false);
+ if (message_type == HVMSG_NONE)
+ return;
+
+ memcpy(buffer, msg, sizeof(*msg));
+ vmbus_signal_eom(msg, message_type);
+}
+
+static bool vtl_synic_mask_vmbus_sint_masked = true;
+
+static ssize_t mshv_vtl_sint_read(struct file *filp, char __user *arg, size_t size, loff_t *offset)
+{
+ struct hv_message msg = {};
+ int ret;
+
+ if (size < sizeof(msg))
+ return -EINVAL;
+
+ for (;;) {
+ smp_call_function_single(VMBUS_CONNECT_CPU, mshv_vtl_read_remote, &msg, true);
+ if (msg.header.message_type != HVMSG_NONE)
+ break;
+
+ if (READ_ONCE(vtl_synic_mask_vmbus_sint_masked))
+ return 0; /* EOF */
+
+ if (filp->f_flags & O_NONBLOCK)
+ return -EAGAIN;
+
+ ret = wait_event_interruptible(fd_wait_queue,
+ READ_ONCE(has_message) ||
+ READ_ONCE(vtl_synic_mask_vmbus_sint_masked));
+ if (ret)
+ return ret;
+ }
+
+ if (copy_to_user(arg, &msg, sizeof(msg)))
+ return -EFAULT;
+
+ return sizeof(msg);
+}
+
+static __poll_t mshv_vtl_sint_poll(struct file *filp, poll_table *wait)
+{
+ __poll_t mask = 0;
+
+ poll_wait(filp, &fd_wait_queue, wait);
+ if (READ_ONCE(has_message) || READ_ONCE(vtl_synic_mask_vmbus_sint_masked))
+ mask |= EPOLLIN | EPOLLRDNORM;
+
+ return mask;
+}
+
+static void mshv_vtl_sint_on_msg_dpc(unsigned long data)
+{
+ WRITE_ONCE(has_message, true);
+ wake_up_interruptible_poll(&fd_wait_queue, EPOLLIN);
+}
+
+static int mshv_vtl_sint_ioctl_post_msg(struct mshv_vtl_sint_post_msg __user *arg)
+{
+ struct mshv_vtl_sint_post_msg message;
+ u8 payload[HV_MESSAGE_PAYLOAD_BYTE_COUNT];
+
+ if (copy_from_user(&message, arg, sizeof(message)))
+ return -EFAULT;
+ if (message.payload_size > HV_MESSAGE_PAYLOAD_BYTE_COUNT)
+ return -EINVAL;
+ if (copy_from_user(payload, (void __user *)message.payload_ptr,
+ message.payload_size))
+ return -EFAULT;
+
+ return hv_post_message((union hv_connection_id)message.connection_id,
+ message.message_type, (void *)payload,
+ message.payload_size);
+}
+
+static int mshv_vtl_sint_ioctl_signal_event(struct mshv_vtl_signal_event __user *arg)
+{
+ u64 input, status;
+ struct mshv_vtl_signal_event signal_event;
+
+ if (copy_from_user(&signal_event, arg, sizeof(signal_event)))
+ return -EFAULT;
+
+ input = signal_event.connection_id | ((u64)signal_event.flag << 32);
+
+ status = hv_do_fast_hypercall8(HVCALL_SIGNAL_EVENT, input);
+
+ return hv_result_to_errno(status);
+}
+
+static int mshv_vtl_sint_ioctl_set_eventfd(struct mshv_vtl_set_eventfd __user *arg)
+{
+ struct mshv_vtl_set_eventfd set_eventfd;
+ struct eventfd_ctx *eventfd, *old_eventfd;
+
+ if (copy_from_user(&set_eventfd, arg, sizeof(set_eventfd)))
+ return -EFAULT;
+ if (set_eventfd.flag >= HV_EVENT_FLAGS_COUNT)
+ return -EINVAL;
+
+ eventfd = NULL;
+ if (set_eventfd.fd >= 0) {
+ eventfd = eventfd_ctx_fdget(set_eventfd.fd);
+ if (IS_ERR(eventfd))
+ return PTR_ERR(eventfd);
+ }
+
+ guard(mutex)(&flag_lock);
+ old_eventfd = READ_ONCE(flag_eventfds[set_eventfd.flag]);
+ WRITE_ONCE(flag_eventfds[set_eventfd.flag], eventfd);
+
+ if (old_eventfd) {
+ synchronize_rcu();
+ eventfd_ctx_put(old_eventfd);
+ }
+
+ return 0;
+}
+
+static int mshv_vtl_sint_ioctl_pause_msg_stream(struct mshv_sint_mask __user *arg)
+{
+ static DEFINE_MUTEX(vtl2_vmbus_sint_mask_mutex);
+ struct mshv_sint_mask mask;
+
+ if (copy_from_user(&mask, arg, sizeof(mask)))
+ return -EFAULT;
+ guard(mutex)(&vtl2_vmbus_sint_mask_mutex);
+ on_each_cpu(mshv_vtl_synic_mask_vmbus_sint, &mask.mask, 1);
+ WRITE_ONCE(vtl_synic_mask_vmbus_sint_masked, mask.mask != 0);
+ if (mask.mask)
+ wake_up_interruptible_poll(&fd_wait_queue, EPOLLIN);
+
+ return 0;
+}
+
+static long mshv_vtl_sint_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
+{
+ switch (cmd) {
+ case MSHV_SINT_POST_MESSAGE:
+ return mshv_vtl_sint_ioctl_post_msg((struct mshv_vtl_sint_post_msg __user *)arg);
+ case MSHV_SINT_SIGNAL_EVENT:
+ return mshv_vtl_sint_ioctl_signal_event((struct mshv_vtl_signal_event __user *)arg);
+ case MSHV_SINT_SET_EVENTFD:
+ return mshv_vtl_sint_ioctl_set_eventfd((struct mshv_vtl_set_eventfd __user *)arg);
+ case MSHV_SINT_PAUSE_MESSAGE_STREAM:
+ return mshv_vtl_sint_ioctl_pause_msg_stream((struct mshv_sint_mask __user *)arg);
+ default:
+ return -ENOIOCTLCMD;
+ }
+}
+
+static const struct file_operations mshv_vtl_sint_ops = {
+ .owner = THIS_MODULE,
+ .read = mshv_vtl_sint_read,
+ .poll = mshv_vtl_sint_poll,
+ .unlocked_ioctl = mshv_vtl_sint_ioctl,
+};
+
+static struct miscdevice mshv_vtl_sint_dev = {
+ .name = "mshv_sint",
+ .fops = &mshv_vtl_sint_ops,
+ .mode = 0600,
+ .minor = MISC_DYNAMIC_MINOR,
+};
+
+static int mshv_vtl_hvcall_dev_open(struct inode *node, struct file *f)
+{
+ struct miscdevice *dev = f->private_data;
+ struct mshv_vtl_hvcall_fd *fd;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ fd = vzalloc(sizeof(*fd));
+ if (!fd)
+ return -ENOMEM;
+ fd->dev = dev;
+ f->private_data = fd;
+ mutex_init(&fd->init_mutex);
+
+ return 0;
+}
+
+static int mshv_vtl_hvcall_dev_release(struct inode *node, struct file *f)
+{
+ struct mshv_vtl_hvcall_fd *fd;
+
+ fd = f->private_data;
+ if (fd) {
+ vfree(fd);
+ f->private_data = NULL;
+ }
+
+ return 0;
+}
+
+static int mshv_vtl_hvcall_do_setup(struct mshv_vtl_hvcall_fd *fd,
+ struct mshv_vtl_hvcall_setup __user *hvcall_setup_user)
+{
+ struct mshv_vtl_hvcall_setup hvcall_setup;
+
+ guard(mutex)(&fd->init_mutex);
+
+ if (fd->allow_map_initialized) {
+ dev_err(fd->dev->this_device,
+ "Hypercall allow map has already been set, pid %d\n",
+ current->pid);
+ return -EINVAL;
+ }
+
+ if (copy_from_user(&hvcall_setup, hvcall_setup_user,
+ sizeof(struct mshv_vtl_hvcall_setup))) {
+ return -EFAULT;
+ }
+ if (hvcall_setup.bitmap_array_size > ARRAY_SIZE(fd->allow_bitmap))
+ return -EINVAL;
+
+ if (copy_from_user(&fd->allow_bitmap,
+ (void __user *)hvcall_setup.allow_bitmap_ptr,
+ hvcall_setup.bitmap_array_size)) {
+ return -EFAULT;
+ }
+
+ dev_info(fd->dev->this_device, "Hypercall allow map has been set, pid %d\n",
+ current->pid);
+ fd->allow_map_initialized = true;
+ return 0;
+}
+
+static bool mshv_vtl_hvcall_is_allowed(struct mshv_vtl_hvcall_fd *fd, u16 call_code)
+{
+ return test_bit(call_code, (unsigned long *)fd->allow_bitmap);
+}
+
+static int mshv_vtl_hvcall_call(struct mshv_vtl_hvcall_fd *fd,
+ struct mshv_vtl_hvcall __user *hvcall_user)
+{
+ struct mshv_vtl_hvcall hvcall;
+ void *in, *out;
+ int ret;
+
+ if (copy_from_user(&hvcall, hvcall_user, sizeof(struct mshv_vtl_hvcall)))
+ return -EFAULT;
+ if (hvcall.input_size > HV_HYP_PAGE_SIZE)
+ return -EINVAL;
+ if (hvcall.output_size > HV_HYP_PAGE_SIZE)
+ return -EINVAL;
+
+ /*
+ * By default, all hypercalls are not allowed.
+ * The user mode code has to set up the allow bitmap once.
+ */
+
+ if (!mshv_vtl_hvcall_is_allowed(fd, hvcall.control & 0xFFFF)) {
+ dev_err(fd->dev->this_device,
+ "Hypercall with control data %#llx isn't allowed\n",
+ hvcall.control);
+ return -EPERM;
+ }
+
+ /*
+ * This may create a problem for Confidential VM (CVM) usecase where we need to use
+ * Hyper-V driver allocated per-cpu input and output pages (hyperv_pcpu_input_arg and
+ * hyperv_pcpu_output_arg) for making a hypervisor call.
+ *
+ * TODO: Take care of this when CVM support is added.
+ */
+ in = (void *)__get_free_page(GFP_KERNEL);
+ out = (void *)__get_free_page(GFP_KERNEL);
+
+ if (copy_from_user(in, (void __user *)hvcall.input_ptr, hvcall.input_size)) {
+ ret = -EFAULT;
+ goto free_pages;
+ }
+
+ hvcall.status = hv_do_hypercall(hvcall.control, in, out);
+
+ if (copy_to_user((void __user *)hvcall.output_ptr, out, hvcall.output_size)) {
+ ret = -EFAULT;
+ goto free_pages;
+ }
+ ret = put_user(hvcall.status, &hvcall_user->status);
+free_pages:
+ free_page((unsigned long)in);
+ free_page((unsigned long)out);
+
+ return ret;
+}
+
+static long mshv_vtl_hvcall_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
+{
+ struct mshv_vtl_hvcall_fd *fd = f->private_data;
+
+ switch (cmd) {
+ case MSHV_HVCALL_SETUP:
+ return mshv_vtl_hvcall_do_setup(fd, (struct mshv_vtl_hvcall_setup __user *)arg);
+ case MSHV_HVCALL:
+ return mshv_vtl_hvcall_call(fd, (struct mshv_vtl_hvcall __user *)arg);
+ default:
+ break;
+ }
+
+ return -ENOIOCTLCMD;
+}
+
+static const struct file_operations mshv_vtl_hvcall_dev_file_ops = {
+ .owner = THIS_MODULE,
+ .open = mshv_vtl_hvcall_dev_open,
+ .release = mshv_vtl_hvcall_dev_release,
+ .unlocked_ioctl = mshv_vtl_hvcall_dev_ioctl,
+};
+
+static struct miscdevice mshv_vtl_hvcall_dev = {
+ .name = "mshv_hvcall",
+ .nodename = "mshv_hvcall",
+ .fops = &mshv_vtl_hvcall_dev_file_ops,
+ .mode = 0600,
+ .minor = MISC_DYNAMIC_MINOR,
+};
+
+static int mshv_vtl_low_open(struct inode *inodep, struct file *filp)
+{
+ pid_t pid = task_pid_vnr(current);
+ uid_t uid = current_uid().val;
+ int ret = 0;
+
+ pr_debug("%s: Opening VTL low, task group %d, uid %d\n", __func__, pid, uid);
+
+ if (capable(CAP_SYS_ADMIN)) {
+ filp->private_data = inodep;
+ } else {
+ pr_err("%s: VTL low open failed: CAP_SYS_ADMIN required. task group %d, uid %d",
+ __func__, pid, uid);
+ ret = -EPERM;
+ }
+
+ return ret;
+}
+
+static bool can_fault(struct vm_fault *vmf, unsigned long size, unsigned long *pfn)
+{
+ unsigned long mask = size - 1;
+ unsigned long start = vmf->address & ~mask;
+ unsigned long end = start + size;
+ bool is_valid;
+
+ is_valid = (vmf->address & mask) == ((vmf->pgoff << PAGE_SHIFT) & mask) &&
+ start >= vmf->vma->vm_start &&
+ end <= vmf->vma->vm_end;
+
+ if (is_valid)
+ *pfn = vmf->pgoff & ~(mask >> PAGE_SHIFT);
+
+ return is_valid;
+}
+
+static vm_fault_t mshv_vtl_low_huge_fault(struct vm_fault *vmf, unsigned int order)
+{
+ unsigned long pfn = vmf->pgoff;
+ vm_fault_t ret = VM_FAULT_FALLBACK;
+
+ switch (order) {
+ case 0:
+ return vmf_insert_mixed(vmf->vma, vmf->address, pfn);
+
+ case PMD_ORDER:
+ if (can_fault(vmf, PMD_SIZE, &pfn))
+ ret = vmf_insert_pfn_pmd(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE);
+ return ret;
+
+ case PUD_ORDER:
+ if (can_fault(vmf, PUD_SIZE, &pfn))
+ ret = vmf_insert_pfn_pud(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE);
+ return ret;
+
+ default:
+ return VM_FAULT_SIGBUS;
+ }
+}
+
+static vm_fault_t mshv_vtl_low_fault(struct vm_fault *vmf)
+{
+ return mshv_vtl_low_huge_fault(vmf, 0);
+}
+
+static const struct vm_operations_struct mshv_vtl_low_vm_ops = {
+ .fault = mshv_vtl_low_fault,
+ .huge_fault = mshv_vtl_low_huge_fault,
+};
+
+static int mshv_vtl_low_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+ vma->vm_ops = &mshv_vtl_low_vm_ops;
+ vm_flags_set(vma, VM_HUGEPAGE | VM_MIXEDMAP);
+
+ return 0;
+}
+
+static const struct file_operations mshv_vtl_low_file_ops = {
+ .owner = THIS_MODULE,
+ .open = mshv_vtl_low_open,
+ .mmap = mshv_vtl_low_mmap,
+};
+
+static struct miscdevice mshv_vtl_low = {
+ .name = "mshv_vtl_low",
+ .nodename = "mshv_vtl_low",
+ .fops = &mshv_vtl_low_file_ops,
+ .mode = 0600,
+ .minor = MISC_DYNAMIC_MINOR,
+};
+
+static int __init mshv_vtl_init(void)
+{
+ int ret;
+ struct device *dev = mshv_dev.this_device;
+
+ /*
+ * This creates /dev/mshv which provides functionality to create VTLs and partitions.
+ */
+ ret = misc_register(&mshv_dev);
+ if (ret) {
+ dev_err(dev, "mshv device register failed: %d\n", ret);
+ goto free_dev;
+ }
+
+ tasklet_init(&msg_dpc, mshv_vtl_sint_on_msg_dpc, 0);
+ init_waitqueue_head(&fd_wait_queue);
+
+ if (mshv_vtl_get_vsm_regs()) {
+ dev_emerg(dev, "Unable to get VSM capabilities !!\n");
+ ret = -ENODEV;
+ goto free_dev;
+ }
+ if (mshv_vtl_configure_vsm_partition(dev)) {
+ dev_emerg(dev, "VSM configuration failed !!\n");
+ ret = -ENODEV;
+ goto free_dev;
+ }
+
+ mshv_vtl_return_call_init(mshv_vsm_page_offsets.vtl_return_offset);
+ ret = hv_vtl_setup_synic();
+ if (ret)
+ goto free_dev;
+
+ /*
+ * mshv_sint device adds VMBus relay ioctl support.
+ * This provides a channel for VTL0 to communicate with VTL2.
+ */
+ ret = misc_register(&mshv_vtl_sint_dev);
+ if (ret)
+ goto free_synic;
+
+ /*
+ * mshv_hvcall device adds interface to enable userspace for direct hypercalls support.
+ */
+ ret = misc_register(&mshv_vtl_hvcall_dev);
+ if (ret)
+ goto free_sint;
+
+ /*
+ * mshv_vtl_low device is used to map VTL0 address space to a user-mode process in VTL2.
+ * It implements mmap() to allow a user-mode process in VTL2 to map to the address of VTL0.
+ */
+ ret = misc_register(&mshv_vtl_low);
+ if (ret)
+ goto free_hvcall;
+
+ /*
+ * "mshv vtl mem dev" device is later used to setup VTL0 memory.
+ */
+ mem_dev = kzalloc_obj(*mem_dev);
+ if (!mem_dev) {
+ ret = -ENOMEM;
+ goto free_low;
+ }
+
+ mutex_init(&mshv_vtl_poll_file_lock);
+
+ device_initialize(mem_dev);
+ dev_set_name(mem_dev, "mshv vtl mem dev");
+ ret = device_add(mem_dev);
+ if (ret) {
+ dev_err(dev, "mshv vtl mem dev add: %d\n", ret);
+ goto free_mem;
+ }
+
+ return 0;
+
+free_mem:
+ kfree(mem_dev);
+free_low:
+ misc_deregister(&mshv_vtl_low);
+free_hvcall:
+ misc_deregister(&mshv_vtl_hvcall_dev);
+free_sint:
+ misc_deregister(&mshv_vtl_sint_dev);
+free_synic:
+ hv_vtl_remove_synic();
+free_dev:
+ misc_deregister(&mshv_dev);
+
+ return ret;
+}
+
+static void __exit mshv_vtl_exit(void)
+{
+ device_del(mem_dev);
+ kfree(mem_dev);
+ misc_deregister(&mshv_vtl_low);
+ misc_deregister(&mshv_vtl_hvcall_dev);
+ misc_deregister(&mshv_vtl_sint_dev);
+ hv_vtl_remove_synic();
+ misc_deregister(&mshv_dev);
+}
+
+module_init(mshv_vtl_init);
+module_exit(mshv_vtl_exit);
diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c
index 3c9b02471760..592a9601faaa 100644
--- a/drivers/hv/ring_buffer.c
+++ b/drivers/hv/ring_buffer.c
@@ -18,6 +18,7 @@
#include <linux/slab.h>
#include <linux/prefetch.h>
#include <linux/io.h>
+#include <linux/export.h>
#include <asm/mshyperv.h>
#include "hyperv_vmbus.h"
@@ -183,7 +184,8 @@ void hv_ringbuffer_pre_init(struct vmbus_channel *channel)
/* Initialize the ring buffer. */
int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info,
- struct page *pages, u32 page_cnt, u32 max_pkt_size)
+ struct page *pages, u32 page_cnt, u32 max_pkt_size,
+ bool confidential)
{
struct page **pages_wraparound;
int i;
@@ -194,9 +196,7 @@ int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info,
* First page holds struct hv_ring_buffer, do wraparound mapping for
* the rest.
*/
- pages_wraparound = kcalloc(page_cnt * 2 - 1,
- sizeof(struct page *),
- GFP_KERNEL);
+ pages_wraparound = kzalloc_objs(struct page *, page_cnt * 2 - 1);
if (!pages_wraparound)
return -ENOMEM;
@@ -207,7 +207,7 @@ int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info,
ring_info->ring_buffer = (struct hv_ring_buffer *)
vmap(pages_wraparound, page_cnt * 2 - 1, VM_MAP,
- pgprot_decrypted(PAGE_KERNEL));
+ confidential ? PAGE_KERNEL : pgprot_decrypted(PAGE_KERNEL));
kfree(pages_wraparound);
if (!ring_info->ring_buffer)
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index 0f6cd44fff29..d28ff45d4cfd 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -25,17 +25,18 @@
#include <linux/cpu.h>
#include <linux/sched/isolation.h>
#include <linux/sched/task_stack.h>
+#include <linux/smpboot.h>
#include <linux/delay.h>
#include <linux/panic_notifier.h>
#include <linux/ptrace.h>
-#include <linux/screen_info.h>
+#include <linux/sysfb.h>
#include <linux/efi.h>
-#include <linux/random.h>
#include <linux/kernel.h>
#include <linux/syscore_ops.h>
#include <linux/dma-map-ops.h>
#include <linux/pci.h>
+#include <linux/export.h>
#include <clocksource/hyperv_timer.h>
#include <asm/mshyperv.h>
#include "hyperv_vmbus.h"
@@ -45,17 +46,30 @@ struct vmbus_dynid {
struct hv_vmbus_device_id id;
};
-static struct device *hv_dev;
+/* VMBus Root Device */
+static struct device *vmbus_root_device;
static int hyperv_cpuhp_online;
-static long __percpu *vmbus_evt;
+static DEFINE_PER_CPU(long, vmbus_evt);
/* Values parsed from ACPI DSDT */
int vmbus_irq;
int vmbus_interrupt;
/*
+ * If the Confidential VMBus is used, the data on the "wire" is not
+ * visible to either the host or the hypervisor.
+ */
+static bool is_confidential;
+
+bool vmbus_is_confidential(void)
+{
+ return is_confidential;
+}
+EXPORT_SYMBOL_GPL(vmbus_is_confidential);
+
+/*
* The panic notifier below is responsible solely for unloading the
* vmbus connection, which is necessary in a panic event.
*
@@ -80,13 +94,17 @@ static struct resource *fb_mmio;
static struct resource *hyperv_mmio;
static DEFINE_MUTEX(hyperv_mmio_lock);
-static int vmbus_exists(void)
+struct device *hv_get_vmbus_root_device(void)
{
- if (hv_dev == NULL)
- return -ENODEV;
+ return vmbus_root_device;
+}
+EXPORT_SYMBOL_GPL(hv_get_vmbus_root_device);
- return 0;
+bool hv_vmbus_exists(void)
+{
+ return vmbus_root_device != NULL;
}
+EXPORT_SYMBOL_GPL(hv_vmbus_exists);
static u8 channel_monitor_group(const struct vmbus_channel *channel)
{
@@ -315,7 +333,7 @@ static ssize_t out_read_index_show(struct device *dev,
&outbound);
if (ret < 0)
return ret;
- return sysfs_emit(buf, "%d\n", outbound.current_read_index);
+ return sysfs_emit(buf, "%u\n", outbound.current_read_index);
}
static DEVICE_ATTR_RO(out_read_index);
@@ -334,7 +352,7 @@ static ssize_t out_write_index_show(struct device *dev,
&outbound);
if (ret < 0)
return ret;
- return sysfs_emit(buf, "%d\n", outbound.current_write_index);
+ return sysfs_emit(buf, "%u\n", outbound.current_write_index);
}
static DEVICE_ATTR_RO(out_write_index);
@@ -707,12 +725,35 @@ static const struct hv_vmbus_device_id *hv_vmbus_get_id(const struct hv_driver *
return id;
}
-/* vmbus_add_dynid - add a new device ID to this driver and re-probe devices */
+/* vmbus_add_dynid - add a new device ID to this driver and re-probe devices
+ *
+ * This function can race with vmbus_device_register(). This function is
+ * typically running on a user thread in response to writing to the "new_id"
+ * sysfs entry for a driver. vmbus_device_register() is running on a
+ * workqueue thread in response to the Hyper-V host offering a device to the
+ * guest. This function calls driver_attach(), which looks for an existing
+ * device matching the new id, and attaches the driver to which the new id
+ * has been assigned. vmbus_device_register() calls device_register(), which
+ * looks for a driver that matches the device being registered. If both
+ * operations are running simultaneously, the device driver probe function runs
+ * on whichever thread establishes the linkage between the driver and device.
+ *
+ * In most cases, it doesn't matter which thread runs the driver probe
+ * function. But if vmbus_device_register() does not find a matching driver,
+ * it proceeds to create the "channels" subdirectory and numbered per-channel
+ * subdirectory in sysfs. While that multi-step creation is in progress, this
+ * function could run the driver probe function. If the probe function checks
+ * for, or operates on, entries in the "channels" subdirectory, including by
+ * calling hv_create_ring_sysfs(), the operation may or may not succeed
+ * depending on the race. The race can't create a kernel failure in VMBus
+ * or device subsystem code, but probe functions in VMBus drivers doing such
+ * operations must be prepared for the failure case.
+ */
static int vmbus_add_dynid(struct hv_driver *drv, guid_t *guid)
{
struct vmbus_dynid *dynid;
- dynid = kzalloc(sizeof(*dynid), GFP_KERNEL);
+ dynid = kzalloc_obj(*dynid);
if (!dynid)
return -ENOMEM;
@@ -861,7 +902,7 @@ static int vmbus_dma_configure(struct device *child_device)
* On x86/x64 coherence is assumed and these calls have no effect.
*/
hv_setup_dma_ops(child_device,
- device_get_dma_attr(hv_dev) == DEV_DMA_COHERENT);
+ device_get_dma_attr(vmbus_root_device) == DEV_DMA_COHERENT);
return 0;
}
@@ -1015,12 +1056,9 @@ static void vmbus_onmessage_work(struct work_struct *work)
kfree(ctx);
}
-void vmbus_on_msg_dpc(unsigned long data)
+static void __vmbus_on_msg_dpc(void *message_page_addr)
{
- struct hv_per_cpu_context *hv_cpu = (void *)data;
- void *page_addr = hv_cpu->synic_message_page;
- struct hv_message msg_copy, *msg = (struct hv_message *)page_addr +
- VMBUS_MESSAGE_SINT;
+ struct hv_message msg_copy, *msg;
struct vmbus_channel_message_header *hdr;
enum vmbus_channel_message_type msgtype;
const struct vmbus_channel_message_table_entry *entry;
@@ -1028,6 +1066,10 @@ void vmbus_on_msg_dpc(unsigned long data)
__u8 payload_size;
u32 message_type;
+ if (!message_page_addr)
+ return;
+ msg = (struct hv_message *)message_page_addr + VMBUS_MESSAGE_SINT;
+
/*
* 'enum vmbus_channel_message_type' is supposed to always be 'u32' as
* it is being used in 'struct vmbus_channel_message_header' definition
@@ -1075,7 +1117,7 @@ void vmbus_on_msg_dpc(unsigned long data)
}
if (entry->handler_type == VMHT_BLOCKING) {
- ctx = kmalloc(struct_size(ctx, msg.payload, payload_size), GFP_ATOMIC);
+ ctx = kmalloc_flex(*ctx, msg.payload, payload_size, GFP_ATOMIC);
if (ctx == NULL)
return;
@@ -1153,6 +1195,14 @@ msg_handled:
vmbus_signal_eom(msg, message_type);
}
+void vmbus_on_msg_dpc(unsigned long data)
+{
+ struct hv_per_cpu_context *hv_cpu = (void *)data;
+
+ __vmbus_on_msg_dpc(hv_cpu->hyp_synic_message_page);
+ __vmbus_on_msg_dpc(hv_cpu->para_synic_message_page);
+}
+
#ifdef CONFIG_PM_SLEEP
/*
* Fake RESCIND_CHANNEL messages to clean up hv_sock channels by force for
@@ -1191,23 +1241,21 @@ static void vmbus_force_channel_rescinded(struct vmbus_channel *channel)
#endif /* CONFIG_PM_SLEEP */
/*
- * Schedule all channels with events pending
+ * Schedule all channels with events pending.
+ * The event page can be directly checked to get the id of
+ * the channel that has the interrupt pending.
*/
-static void vmbus_chan_sched(struct hv_per_cpu_context *hv_cpu)
+static void vmbus_chan_sched(void *event_page_addr)
{
unsigned long *recv_int_page;
u32 maxbits, relid;
+ union hv_synic_event_flags *event;
- /*
- * The event page can be directly checked to get the id of
- * the channel that has the interrupt pending.
- */
- void *page_addr = hv_cpu->synic_event_page;
- union hv_synic_event_flags *event
- = (union hv_synic_event_flags *)page_addr +
- VMBUS_MESSAGE_SINT;
+ if (!event_page_addr)
+ return;
+ event = (union hv_synic_event_flags *)event_page_addr + VMBUS_MESSAGE_SINT;
- maxbits = HV_EVENT_FLAGS_COUNT;
+ maxbits = READ_ONCE(vmbus_connection.relid_hiwater) + 1;
recv_int_page = event->flags;
if (unlikely(!recv_int_page))
@@ -1276,29 +1324,84 @@ sched_unlock_rcu:
}
}
-static void vmbus_isr(void)
+static void vmbus_message_sched(struct hv_per_cpu_context *hv_cpu, void *message_page_addr)
{
- struct hv_per_cpu_context *hv_cpu
- = this_cpu_ptr(hv_context.cpu_context);
- void *page_addr;
struct hv_message *msg;
- vmbus_chan_sched(hv_cpu);
-
- page_addr = hv_cpu->synic_message_page;
- msg = (struct hv_message *)page_addr + VMBUS_MESSAGE_SINT;
+ if (!message_page_addr)
+ return;
+ msg = (struct hv_message *)message_page_addr + VMBUS_MESSAGE_SINT;
/* Check if there are actual msgs to be processed */
if (msg->header.message_type != HVMSG_NONE) {
if (msg->header.message_type == HVMSG_TIMER_EXPIRED) {
hv_stimer0_isr();
vmbus_signal_eom(msg, HVMSG_TIMER_EXPIRED);
- } else
+ } else {
tasklet_schedule(&hv_cpu->msg_dpc);
+ }
}
+}
- add_interrupt_randomness(vmbus_interrupt);
+static void __vmbus_isr(void)
+{
+ struct hv_per_cpu_context *hv_cpu
+ = this_cpu_ptr(hv_context.cpu_context);
+
+ vmbus_chan_sched(hv_cpu->hyp_synic_event_page);
+ vmbus_chan_sched(hv_cpu->para_synic_event_page);
+
+ vmbus_message_sched(hv_cpu, hv_cpu->hyp_synic_message_page);
+ vmbus_message_sched(hv_cpu, hv_cpu->para_synic_message_page);
+}
+
+static DEFINE_PER_CPU(bool, vmbus_irq_pending);
+static DEFINE_PER_CPU(struct task_struct *, vmbus_irqd);
+
+static void vmbus_irqd_wake(void)
+{
+ struct task_struct *tsk = __this_cpu_read(vmbus_irqd);
+
+ __this_cpu_write(vmbus_irq_pending, true);
+ wake_up_process(tsk);
+}
+
+static void vmbus_irqd_setup(unsigned int cpu)
+{
+ sched_set_fifo(current);
+}
+
+static int vmbus_irqd_should_run(unsigned int cpu)
+{
+ return __this_cpu_read(vmbus_irq_pending);
+}
+
+static void run_vmbus_irqd(unsigned int cpu)
+{
+ __this_cpu_write(vmbus_irq_pending, false);
+ __vmbus_isr();
+}
+
+static bool vmbus_irq_initialized;
+
+static struct smp_hotplug_thread vmbus_irq_threads = {
+ .store = &vmbus_irqd,
+ .setup = vmbus_irqd_setup,
+ .thread_should_run = vmbus_irqd_should_run,
+ .thread_fn = run_vmbus_irqd,
+ .thread_comm = "vmbus_irq/%u",
+};
+
+void vmbus_isr(void)
+{
+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ vmbus_irqd_wake();
+ } else {
+ lockdep_hardirq_threaded();
+ __vmbus_isr();
+ }
}
+EXPORT_SYMBOL_FOR_MODULES(vmbus_isr, "mshv_vtl");
static irqreturn_t vmbus_percpu_isr(int irq, void *dev_id)
{
@@ -1313,6 +1416,58 @@ static void vmbus_percpu_work(struct work_struct *work)
hv_synic_init(cpu);
}
+static int vmbus_alloc_synic_and_connect(void)
+{
+ int ret, cpu;
+ struct work_struct __percpu *works;
+
+ ret = hv_synic_alloc();
+ if (ret < 0)
+ goto err_alloc;
+
+ works = alloc_percpu(struct work_struct);
+ if (!works) {
+ ret = -ENOMEM;
+ goto err_alloc;
+ }
+
+ /*
+ * Initialize the per-cpu interrupt state and stimer state.
+ * Then connect to the host.
+ */
+ cpus_read_lock();
+ for_each_online_cpu(cpu) {
+ struct work_struct *work = per_cpu_ptr(works, cpu);
+
+ INIT_WORK(work, vmbus_percpu_work);
+ schedule_work_on(cpu, work);
+ }
+
+ for_each_online_cpu(cpu)
+ flush_work(per_cpu_ptr(works, cpu));
+
+ /* Register the callbacks for possible CPU online/offline'ing */
+ ret = cpuhp_setup_state_nocalls_cpuslocked(CPUHP_AP_ONLINE_DYN, "hyperv/vmbus:online",
+ hv_synic_init, hv_synic_cleanup);
+ cpus_read_unlock();
+ free_percpu(works);
+ if (ret < 0)
+ goto err_alloc;
+ hyperv_cpuhp_online = ret;
+
+ ret = vmbus_connect();
+ if (ret)
+ goto err_connect;
+ return 0;
+
+err_connect:
+ cpuhp_remove_state(hyperv_cpuhp_online);
+ return -ENODEV;
+err_alloc:
+ hv_synic_free();
+ return -ENOMEM;
+}
+
/*
* vmbus_bus_init -Main vmbus driver initialization routine.
*
@@ -1323,8 +1478,7 @@ static void vmbus_percpu_work(struct work_struct *work)
*/
static int vmbus_bus_init(void)
{
- int ret, cpu;
- struct work_struct __percpu *works;
+ int ret;
ret = hv_init();
if (ret != 0) {
@@ -1345,55 +1499,34 @@ static int vmbus_bus_init(void)
* the VMbus interrupt handler.
*/
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && !vmbus_irq_initialized) {
+ ret = smpboot_register_percpu_thread(&vmbus_irq_threads);
+ if (ret)
+ goto err_kthread;
+ vmbus_irq_initialized = true;
+ }
+
if (vmbus_irq == -1) {
hv_setup_vmbus_handler(vmbus_isr);
} else {
- vmbus_evt = alloc_percpu(long);
ret = request_percpu_irq(vmbus_irq, vmbus_percpu_isr,
- "Hyper-V VMbus", vmbus_evt);
+ "Hyper-V VMbus", &vmbus_evt);
if (ret) {
pr_err("Can't request Hyper-V VMbus IRQ %d, Err %d",
vmbus_irq, ret);
- free_percpu(vmbus_evt);
goto err_setup;
}
}
- ret = hv_synic_alloc();
- if (ret)
- goto err_alloc;
-
- works = alloc_percpu(struct work_struct);
- if (!works) {
- ret = -ENOMEM;
- goto err_alloc;
- }
-
/*
- * Initialize the per-cpu interrupt state and stimer state.
- * Then connect to the host.
+ * Cache the value as getting it involves a VM exit on x86(_64), and
+ * doing that on each VP while initializing SynIC's wastes time.
*/
- cpus_read_lock();
- for_each_online_cpu(cpu) {
- struct work_struct *work = per_cpu_ptr(works, cpu);
-
- INIT_WORK(work, vmbus_percpu_work);
- schedule_work_on(cpu, work);
- }
-
- for_each_online_cpu(cpu)
- flush_work(per_cpu_ptr(works, cpu));
-
- /* Register the callbacks for possible CPU online/offline'ing */
- ret = cpuhp_setup_state_nocalls_cpuslocked(CPUHP_AP_ONLINE_DYN, "hyperv/vmbus:online",
- hv_synic_init, hv_synic_cleanup);
- cpus_read_unlock();
- free_percpu(works);
- if (ret < 0)
- goto err_alloc;
- hyperv_cpuhp_online = ret;
-
- ret = vmbus_connect();
+ is_confidential = ms_hyperv.confidential_vmbus_available;
+ if (is_confidential)
+ pr_info("Establishing connection to the confidential VMBus\n");
+ hv_para_set_sint_proxy(!is_confidential);
+ ret = vmbus_alloc_synic_and_connect();
if (ret)
goto err_connect;
@@ -1409,16 +1542,16 @@ static int vmbus_bus_init(void)
return 0;
err_connect:
- cpuhp_remove_state(hyperv_cpuhp_online);
-err_alloc:
- hv_synic_free();
- if (vmbus_irq == -1) {
+ if (vmbus_irq == -1)
hv_remove_vmbus_handler();
- } else {
- free_percpu_irq(vmbus_irq, vmbus_evt);
- free_percpu(vmbus_evt);
- }
+ else
+ free_percpu_irq(vmbus_irq, &vmbus_evt);
err_setup:
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && vmbus_irq_initialized) {
+ smpboot_unregister_percpu_thread(&vmbus_irq_threads);
+ vmbus_irq_initialized = false;
+ }
+err_kthread:
bus_unregister(&hv_bus);
return ret;
}
@@ -1438,11 +1571,10 @@ int __vmbus_driver_register(struct hv_driver *hv_driver, struct module *owner, c
{
int ret;
- pr_info("registering driver %s\n", hv_driver->name);
+ if (!hv_vmbus_exists())
+ return -ENODEV;
- ret = vmbus_exists();
- if (ret < 0)
- return ret;
+ pr_info("registering driver %s\n", hv_driver->name);
hv_driver->driver.name = hv_driver->name;
hv_driver->driver.owner = owner;
@@ -1468,9 +1600,8 @@ EXPORT_SYMBOL_GPL(__vmbus_driver_register);
*/
void vmbus_driver_unregister(struct hv_driver *hv_driver)
{
- pr_info("unregistering driver %s\n", hv_driver->name);
-
- if (!vmbus_exists()) {
+ if (hv_vmbus_exists()) {
+ pr_info("unregistering driver %s\n", hv_driver->name);
driver_unregister(&hv_driver->driver);
vmbus_free_dynids(hv_driver);
}
@@ -1611,16 +1742,16 @@ static ssize_t target_cpu_show(struct vmbus_channel *channel, char *buf)
{
return sprintf(buf, "%u\n", channel->target_cpu);
}
-static ssize_t target_cpu_store(struct vmbus_channel *channel,
- const char *buf, size_t count)
+
+int vmbus_channel_set_cpu(struct vmbus_channel *channel, u32 target_cpu)
{
- u32 target_cpu, origin_cpu;
- ssize_t ret = count;
+ u32 origin_cpu;
+ int ret = 0;
- if (vmbus_proto_version < VERSION_WIN10_V4_1)
- return -EIO;
+ lockdep_assert_cpus_held();
+ lockdep_assert_held(&vmbus_connection.channel_mutex);
- if (sscanf(buf, "%uu", &target_cpu) != 1)
+ if (vmbus_proto_version < VERSION_WIN10_V4_1)
return -EIO;
/* Validate target_cpu for the cpumask_test_cpu() operation below. */
@@ -1630,22 +1761,17 @@ static ssize_t target_cpu_store(struct vmbus_channel *channel,
if (!cpumask_test_cpu(target_cpu, housekeeping_cpumask(HK_TYPE_MANAGED_IRQ)))
return -EINVAL;
- /* No CPUs should come up or down during this. */
- cpus_read_lock();
-
- if (!cpu_online(target_cpu)) {
- cpus_read_unlock();
+ if (!cpu_online(target_cpu))
return -EINVAL;
- }
/*
- * Synchronizes target_cpu_store() and channel closure:
+ * Synchronizes vmbus_channel_set_cpu() and channel closure:
*
* { Initially: state = CHANNEL_OPENED }
*
* CPU1 CPU2
*
- * [target_cpu_store()] [vmbus_disconnect_ring()]
+ * [vmbus_channel_set_cpu()] [vmbus_disconnect_ring()]
*
* LOCK channel_mutex LOCK channel_mutex
* LOAD r1 = state LOAD r2 = state
@@ -1660,7 +1786,6 @@ static ssize_t target_cpu_store(struct vmbus_channel *channel,
* Note. The host processes the channel messages "sequentially", in
* the order in which they are received on a per-partition basis.
*/
- mutex_lock(&vmbus_connection.channel_mutex);
/*
* Hyper-V will ignore MODIFYCHANNEL messages for "non-open" channels;
@@ -1668,17 +1793,17 @@ static ssize_t target_cpu_store(struct vmbus_channel *channel,
*/
if (channel->state != CHANNEL_OPENED_STATE) {
ret = -EIO;
- goto cpu_store_unlock;
+ goto end;
}
origin_cpu = channel->target_cpu;
if (target_cpu == origin_cpu)
- goto cpu_store_unlock;
+ goto end;
if (vmbus_send_modifychannel(channel,
hv_cpu_number_to_vp_number(target_cpu))) {
ret = -EIO;
- goto cpu_store_unlock;
+ goto end;
}
/*
@@ -1708,10 +1833,26 @@ static ssize_t target_cpu_store(struct vmbus_channel *channel,
origin_cpu, target_cpu);
}
-cpu_store_unlock:
+end:
+ return ret;
+}
+
+static ssize_t target_cpu_store(struct vmbus_channel *channel,
+ const char *buf, size_t count)
+{
+ u32 target_cpu;
+ ssize_t ret;
+
+ if (sscanf(buf, "%u", &target_cpu) != 1)
+ return -EIO;
+
+ cpus_read_lock();
+ mutex_lock(&vmbus_connection.channel_mutex);
+ ret = vmbus_channel_set_cpu(channel, target_cpu);
mutex_unlock(&vmbus_connection.channel_mutex);
cpus_read_unlock();
- return ret;
+
+ return ret ?: count;
}
static VMBUS_CHAN_ATTR(cpu, 0644, target_cpu_show, target_cpu_store);
@@ -1792,6 +1933,33 @@ static ssize_t subchannel_id_show(struct vmbus_channel *channel,
}
static VMBUS_CHAN_ATTR_RO(subchannel_id);
+static int hv_mmap_ring_buffer_wrapper(struct file *filp, struct kobject *kobj,
+ const struct bin_attribute *attr,
+ struct vm_area_struct *vma)
+{
+ struct vmbus_channel *channel = container_of(kobj, struct vmbus_channel, kobj);
+ struct vm_area_desc desc;
+ int err;
+
+ /*
+ * hv_(create|remove)_ring_sysfs implementation ensures that
+ * mmap_prepare_ring_buffer is not NULL.
+ */
+ compat_set_desc_from_vma(&desc, filp, vma);
+ err = channel->mmap_prepare_ring_buffer(channel, &desc);
+ if (err)
+ return err;
+
+ return __compat_vma_mmap(&desc, vma);
+}
+
+static struct bin_attribute chan_attr_ring_buffer = {
+ .attr = {
+ .name = "ring",
+ .mode = 0600,
+ },
+ .mmap = hv_mmap_ring_buffer_wrapper,
+};
static struct attribute *vmbus_chan_attrs[] = {
&chan_attr_out_mask.attr,
&chan_attr_in_mask.attr,
@@ -1811,6 +1979,11 @@ static struct attribute *vmbus_chan_attrs[] = {
NULL
};
+static const struct bin_attribute *vmbus_chan_bin_attrs[] = {
+ &chan_attr_ring_buffer,
+ NULL
+};
+
/*
* Channel-level attribute_group callback function. Returns the permission for
* each attribute, and returns 0 if an attribute is not visible.
@@ -1831,9 +2004,34 @@ static umode_t vmbus_chan_attr_is_visible(struct kobject *kobj,
return attr->mode;
}
+static umode_t vmbus_chan_bin_attr_is_visible(struct kobject *kobj,
+ const struct bin_attribute *attr, int idx)
+{
+ const struct vmbus_channel *channel =
+ container_of(kobj, struct vmbus_channel, kobj);
+
+ /* Hide ring attribute if channel's ring_sysfs_visible is set to false */
+ if (attr == &chan_attr_ring_buffer && !channel->ring_sysfs_visible)
+ return 0;
+
+ return attr->attr.mode;
+}
+
+static size_t vmbus_chan_bin_size(struct kobject *kobj,
+ const struct bin_attribute *bin_attr, int a)
+{
+ const struct vmbus_channel *channel =
+ container_of(kobj, struct vmbus_channel, kobj);
+
+ return channel->ringbuffer_pagecount << PAGE_SHIFT;
+}
+
static const struct attribute_group vmbus_chan_group = {
.attrs = vmbus_chan_attrs,
- .is_visible = vmbus_chan_attr_is_visible
+ .bin_attrs = vmbus_chan_bin_attrs,
+ .is_visible = vmbus_chan_attr_is_visible,
+ .is_bin_visible = vmbus_chan_bin_attr_is_visible,
+ .bin_size = vmbus_chan_bin_size,
};
static const struct kobj_type vmbus_chan_ktype = {
@@ -1841,6 +2039,64 @@ static const struct kobj_type vmbus_chan_ktype = {
.release = vmbus_chan_release,
};
+/**
+ * hv_create_ring_sysfs() - create "ring" sysfs entry corresponding to ring buffers for a channel.
+ * @channel: Pointer to vmbus_channel structure
+ * @hv_mmap_prepare_ring_buffer: function pointer for initializing the function to be called on mmap
+ * channel's "ring" sysfs node, which is for the ring buffer of that channel.
+ * Function pointer is of below type:
+ * int (*hv_mmap_prepare_ring_buffer)(struct vmbus_channel *channel,
+ * struct vm_area_desc *desc))
+ * This has a pointer to the channel and a pointer to vm_area_desc,
+ * used for mmap_prepare, as arguments.
+ *
+ * Sysfs node for ring buffer of a channel is created along with other fields, however its
+ * visibility is disabled by default. Sysfs creation needs to be controlled when the use-case
+ * is running.
+ * For example, HV_NIC device is used either by uio_hv_generic or hv_netvsc at any given point of
+ * time, and "ring" sysfs is needed only when uio_hv_generic is bound to that device. To avoid
+ * exposing the ring buffer by default, this function is responsible to enable visibility of
+ * ring for userspace to use.
+ * Note: Race conditions can happen with userspace and it is not encouraged to create new
+ * use-cases for this. This was added to maintain backward compatibility, while solving
+ * one of the race conditions in uio_hv_generic while creating sysfs. See comments with
+ * vmbus_add_dynid() and vmbus_device_register().
+ *
+ * Returns 0 on success or error code on failure.
+ */
+int hv_create_ring_sysfs(struct vmbus_channel *channel,
+ int (*hv_mmap_prepare_ring_buffer)(struct vmbus_channel *channel,
+ struct vm_area_desc *desc))
+{
+ struct kobject *kobj = &channel->kobj;
+
+ channel->mmap_prepare_ring_buffer = hv_mmap_prepare_ring_buffer;
+ channel->ring_sysfs_visible = true;
+
+ return sysfs_update_group(kobj, &vmbus_chan_group);
+}
+EXPORT_SYMBOL_GPL(hv_create_ring_sysfs);
+
+/**
+ * hv_remove_ring_sysfs() - remove ring sysfs entry corresponding to ring buffers for a channel.
+ * @channel: Pointer to vmbus_channel structure
+ *
+ * Hide "ring" sysfs for a channel by changing its is_visible attribute and updating sysfs group.
+ *
+ * Returns 0 on success or error code on failure.
+ */
+int hv_remove_ring_sysfs(struct vmbus_channel *channel)
+{
+ struct kobject *kobj = &channel->kobj;
+ int ret;
+
+ channel->ring_sysfs_visible = false;
+ ret = sysfs_update_group(kobj, &vmbus_chan_group);
+ channel->mmap_prepare_ring_buffer = NULL;
+ return ret;
+}
+EXPORT_SYMBOL_GPL(hv_remove_ring_sysfs);
+
/*
* vmbus_add_channel_kobj - setup a sub-directory under device/channels
*/
@@ -1894,7 +2150,7 @@ struct hv_device *vmbus_device_create(const guid_t *type,
{
struct hv_device *child_device_obj;
- child_device_obj = kzalloc(sizeof(struct hv_device), GFP_KERNEL);
+ child_device_obj = kzalloc_obj(struct hv_device);
if (!child_device_obj) {
pr_err("Unable to allocate device object for child device\n");
return NULL;
@@ -1920,7 +2176,7 @@ int vmbus_device_register(struct hv_device *child_device_obj)
&child_device_obj->channel->offermsg.offer.if_instance);
child_device_obj->device.bus = &hv_bus;
- child_device_obj->device.parent = hv_dev;
+ child_device_obj->device.parent = vmbus_root_device;
child_device_obj->device.release = vmbus_device_release;
child_device_obj->device.dma_parms = &child_device_obj->dma_parms;
@@ -1938,6 +2194,20 @@ int vmbus_device_register(struct hv_device *child_device_obj)
return ret;
}
+ /*
+ * If device_register() found a driver to assign to the device, the
+ * driver's probe function has already run at this point. If that
+ * probe function accesses or operates on the "channels" subdirectory
+ * in sysfs, those operations will have failed because the "channels"
+ * subdirectory doesn't exist until the code below runs. Or if the
+ * probe function creates a /dev entry, a user space program could
+ * find and open the /dev entry, and then create a race by accessing
+ * the "channels" subdirectory while the creation steps are in progress
+ * here. The race can't result in a kernel failure, but the user space
+ * program may get an error in accessing "channels" or its
+ * subdirectories. See also comments with vmbus_add_dynid() about a
+ * related race condition.
+ */
child_device_obj->channels_kset = kset_create_and_add("channels",
NULL, kobj);
if (!child_device_obj->channels_kset) {
@@ -1948,7 +2218,7 @@ int vmbus_device_register(struct hv_device *child_device_obj)
ret = vmbus_add_channel_kobj(child_device_obj,
child_device_obj->channel);
if (ret) {
- pr_err("Unable to register primary channeln");
+ pr_err("Unable to register primary channel\n");
goto err_kset_unregister;
}
hv_debug_add_dev_dir(child_device_obj);
@@ -2042,7 +2312,7 @@ static acpi_status vmbus_walk_resources(struct acpi_resource *res, void *ctx)
if (end < 0x100000)
return AE_OK;
- new_res = kzalloc(sizeof(*new_res), GFP_ATOMIC);
+ new_res = kzalloc_obj(*new_res, GFP_ATOMIC);
if (!new_res)
return AE_NO_MEMORY;
@@ -2120,8 +2390,8 @@ static void __maybe_unused vmbus_reserve_fb(void)
if (efi_enabled(EFI_BOOT)) {
/* Gen2 VM: get FB base from EFI framebuffer */
if (IS_ENABLED(CONFIG_SYSFB)) {
- start = screen_info.lfb_base;
- size = max_t(__u32, screen_info.lfb_size, 0x800000);
+ start = sysfb_primary_display.screen.lfb_base;
+ size = max_t(__u32, sysfb_primary_display.screen.lfb_size, 0x800000);
}
} else {
/* Gen1 VM: get FB base from PCI */
@@ -2136,8 +2406,8 @@ static void __maybe_unused vmbus_reserve_fb(void)
}
/*
- * Release the PCI device so hyperv_drm or hyperv_fb driver can
- * grab it later.
+ * Release the PCI device so hyperv_drm driver can grab it
+ * later.
*/
pci_dev_put(pdev);
}
@@ -2262,12 +2532,25 @@ void vmbus_free_mmio(resource_size_t start, resource_size_t size)
struct resource *iter;
mutex_lock(&hyperv_mmio_lock);
+
+ /*
+ * If all bytes of the MMIO range to be released are within the
+ * special case fb_mmio shadow region, skip releasing the shadow
+ * region since no corresponding __request_region() was done
+ * in vmbus_allocate_mmio().
+ */
+ if (fb_mmio && start >= fb_mmio->start &&
+ (start + size - 1 <= fb_mmio->end))
+ goto skip_shadow_release;
+
for (iter = hyperv_mmio; iter; iter = iter->sibling) {
if ((iter->start >= start + size) || (iter->end <= start))
continue;
__release_region(iter, start, size);
}
+
+skip_shadow_release:
release_mem_region(start, size);
mutex_unlock(&hyperv_mmio_lock);
@@ -2282,7 +2565,7 @@ static int vmbus_acpi_add(struct platform_device *pdev)
struct acpi_device *ancestor;
struct acpi_device *device = ACPI_COMPANION(&pdev->dev);
- hv_dev = &device->dev;
+ vmbus_root_device = &device->dev;
/*
* Older versions of Hyper-V for ARM64 fail to include the _CCA
@@ -2334,6 +2617,32 @@ static int vmbus_acpi_add(struct platform_device *pdev)
return 0;
}
#endif
+#ifndef HYPERVISOR_CALLBACK_VECTOR
+static int vmbus_set_irq(struct platform_device *pdev)
+{
+ struct irq_data *data;
+ int irq;
+ irq_hw_number_t hwirq;
+
+ irq = platform_get_irq(pdev, 0);
+ /* platform_get_irq() may not return 0. */
+ if (irq < 0)
+ return irq;
+
+ data = irq_get_irq_data(irq);
+ if (!data) {
+ pr_err("No interrupt data for VMBus virq %d\n", irq);
+ return -ENODEV;
+ }
+ hwirq = irqd_to_hwirq(data);
+
+ vmbus_irq = irq;
+ vmbus_interrupt = hwirq;
+ pr_debug("VMBus virq %d, hwirq %d\n", vmbus_irq, vmbus_interrupt);
+
+ return 0;
+}
+#endif
static int vmbus_device_add(struct platform_device *pdev)
{
@@ -2343,16 +2652,21 @@ static int vmbus_device_add(struct platform_device *pdev)
struct device_node *np = pdev->dev.of_node;
int ret;
- hv_dev = &pdev->dev;
+ vmbus_root_device = &pdev->dev;
ret = of_range_parser_init(&parser, np);
if (ret)
return ret;
+#ifndef HYPERVISOR_CALLBACK_VECTOR
+ ret = vmbus_set_irq(pdev);
+ if (ret)
+ return ret;
+#endif
for_each_of_range(&parser, &range) {
struct resource *res;
- res = kzalloc(sizeof(*res), GFP_KERNEL);
+ res = kzalloc_obj(*res);
if (!res) {
vmbus_mmio_remove();
return -ENOMEM;
@@ -2573,7 +2887,6 @@ static struct platform_driver vmbus_platform_driver = {
static void hv_kexec_handler(void)
{
- hv_stimer_global_cleanup();
vmbus_initiate_unload(false);
/* Make sure conn_state is set as hv_synic_cleanup checks for it */
mb();
@@ -2592,10 +2905,10 @@ static void hv_crash_handler(struct pt_regs *regs)
*/
cpu = smp_processor_id();
hv_stimer_cleanup(cpu);
- hv_synic_disable_regs(cpu);
+ hv_hyp_synic_disable_regs(cpu);
};
-static int hv_synic_suspend(void)
+static int hv_synic_suspend(void *data)
{
/*
* When we reach here, all the non-boot CPUs have been offlined.
@@ -2617,14 +2930,14 @@ static int hv_synic_suspend(void)
* interrupts-disabled context.
*/
- hv_synic_disable_regs(0);
+ hv_hyp_synic_disable_regs(0);
return 0;
}
-static void hv_synic_resume(void)
+static void hv_synic_resume(void *data)
{
- hv_synic_enable_regs(0);
+ hv_hyp_synic_enable_regs(0);
/*
* Note: we don't need to call hv_stimer_init(0), because the timer
@@ -2634,11 +2947,15 @@ static void hv_synic_resume(void)
}
/* The callbacks run only on CPU0, with irqs_disabled. */
-static struct syscore_ops hv_synic_syscore_ops = {
+static const struct syscore_ops hv_synic_syscore_ops = {
.suspend = hv_synic_suspend,
.resume = hv_synic_resume,
};
+static struct syscore hv_synic_syscore = {
+ .ops = &hv_synic_syscore_ops,
+};
+
static int __init hv_acpi_init(void)
{
int ret;
@@ -2646,7 +2963,7 @@ static int __init hv_acpi_init(void)
if (!hv_is_hyperv_initialized())
return -ENODEV;
- if (hv_root_partition && !hv_nested)
+ if (hv_root_partition() && !hv_nested)
return 0;
/*
@@ -2656,7 +2973,7 @@ static int __init hv_acpi_init(void)
if (ret)
return ret;
- if (!hv_dev) {
+ if (!vmbus_root_device) {
ret = -ENODEV;
goto cleanup;
}
@@ -2681,13 +2998,13 @@ static int __init hv_acpi_init(void)
hv_setup_kexec_handler(hv_kexec_handler);
hv_setup_crash_handler(hv_crash_handler);
- register_syscore_ops(&hv_synic_syscore_ops);
+ register_syscore(&hv_synic_syscore);
return 0;
cleanup:
platform_driver_unregister(&vmbus_platform_driver);
- hv_dev = NULL;
+ vmbus_root_device = NULL;
return ret;
}
@@ -2695,18 +3012,20 @@ static void __exit vmbus_exit(void)
{
int cpu;
- unregister_syscore_ops(&hv_synic_syscore_ops);
+ unregister_syscore(&hv_synic_syscore);
hv_remove_kexec_handler();
hv_remove_crash_handler();
vmbus_connection.conn_state = DISCONNECTED;
hv_stimer_global_cleanup();
vmbus_disconnect();
- if (vmbus_irq == -1) {
+ if (vmbus_irq == -1)
hv_remove_vmbus_handler();
- } else {
- free_percpu_irq(vmbus_irq, vmbus_evt);
- free_percpu(vmbus_evt);
+ else
+ free_percpu_irq(vmbus_irq, &vmbus_evt);
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && vmbus_irq_initialized) {
+ smpboot_unregister_percpu_thread(&vmbus_irq_threads);
+ vmbus_irq_initialized = false;
}
for_each_online_cpu(cpu) {
struct hv_per_cpu_context *hv_cpu