From 4cc1aa469cd6b714adc958547a4866247bfd60a9 Mon Sep 17 00:00:00 2001 From: Nuno Das Neves Date: Fri, 17 Oct 2025 11:58:17 -0700 Subject: mshv: Fix deposit memory in MSHV_ROOT_HVCALL When the MSHV_ROOT_HVCALL ioctl is executing a hypercall, and gets HV_STATUS_INSUFFICIENT_MEMORY, it deposits memory and then returns -EAGAIN to userspace. The expectation is that the VMM will retry. However, some VMM code in the wild doesn't do this and simply fails. Rather than force the VMM to retry, change the ioctl to deposit memory on demand and immediately retry the hypercall as is done with all the other hypercall helper functions. In addition to making the ioctl easier to use, removing the need for multiple syscalls improves performance. There is a complication: unlike the other hypercall helper functions, in MSHV_ROOT_HVCALL the input is opaque to the kernel. This is problematic for rep hypercalls, because the next part of the input list can't be copied on each loop after depositing pages (this was the original reason for returning -EAGAIN in this case). Introduce hv_do_rep_hypercall_ex(), which adds a 'rep_start' parameter. This solves the issue, allowing the deposit loop in MSHV_ROOT_HVCALL to restart a rep hypercall after depositing pages partway through. Fixes: 621191d709b1 ("Drivers: hv: Introduce mshv_root module to expose /dev/mshv to VMMs") Signed-off-by: Nuno Das Neves Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- include/asm-generic/mshyperv.h | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h index 64ba6bc807d9..b89c7e3a2047 100644 --- a/include/asm-generic/mshyperv.h +++ b/include/asm-generic/mshyperv.h @@ -124,10 +124,12 @@ static inline unsigned int hv_repcomp(u64 status) /* * Rep hypercalls. Callers of this functions are supposed to ensure that - * rep_count and varhead_size comply with Hyper-V hypercall definition. + * rep_count, varhead_size, and rep_start comply with Hyper-V hypercall + * definition. */ -static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size, - void *input, void *output) +static inline u64 hv_do_rep_hypercall_ex(u16 code, u16 rep_count, + u16 varhead_size, u16 rep_start, + void *input, void *output) { u64 control = code; u64 status; @@ -135,6 +137,7 @@ static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size, control |= (u64)varhead_size << HV_HYPERCALL_VARHEAD_OFFSET; control |= (u64)rep_count << HV_HYPERCALL_REP_COMP_OFFSET; + control |= (u64)rep_start << HV_HYPERCALL_REP_START_OFFSET; do { status = hv_do_hypercall(control, input, output); @@ -152,6 +155,14 @@ static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size, return status; } +/* For the typical case where rep_start is 0 */ +static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size, + void *input, void *output) +{ + return hv_do_rep_hypercall_ex(code, rep_count, varhead_size, 0, + input, output); +} + /* Generate the guest OS identifier as described in the Hyper-V TLFS */ static inline u64 hv_generate_guest_id(u64 kernel_version) { -- cgit v1.2.3 From 3e1b611515d286c6725028e17170f7143e5e51fc Mon Sep 17 00:00:00 2001 From: Tianyu Lan Date: Thu, 18 Sep 2025 11:00:20 -0400 Subject: drivers: hv: Allow vmbus message synic interrupt injected from Hyper-V When Secure AVIC is enabled, VMBus driver should call x2apic Secure AVIC interface to allow Hyper-V to inject VMBus message interrupt. Reviewed-by: Michael Kelley Reviewed-by: Neeraj Upadhyay Signed-off-by: Tianyu Lan Signed-off-by: Wei Liu --- arch/x86/hyperv/hv_apic.c | 5 +++++ drivers/hv/hv.c | 2 ++ drivers/hv/hv_common.c | 5 +++++ include/asm-generic/mshyperv.h | 1 + 4 files changed, 13 insertions(+) (limited to 'include') diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c index e669053b637d..a8de503def37 100644 --- a/arch/x86/hyperv/hv_apic.c +++ b/arch/x86/hyperv/hv_apic.c @@ -53,6 +53,11 @@ static void hv_apic_icr_write(u32 low, u32 id) wrmsrq(HV_X64_MSR_ICR, reg_val); } +void hv_enable_coco_interrupt(unsigned int cpu, unsigned int vector, bool set) +{ + apic_update_vector(cpu, vector, set); +} + static u32 hv_apic_read(u32 reg) { u32 reg_val, hi; diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c index b14c5f9e0ef2..ec5d10839e0f 100644 --- a/drivers/hv/hv.c +++ b/drivers/hv/hv.c @@ -307,6 +307,7 @@ void hv_synic_enable_regs(unsigned int cpu) } hv_set_msr(HV_MSR_SIEFP, siefp.as_uint64); + hv_enable_coco_interrupt(cpu, vmbus_interrupt, true); /* Setup the shared SINT. */ if (vmbus_irq != -1) @@ -350,6 +351,7 @@ void hv_synic_disable_regs(unsigned int cpu) /* Need to correctly cleanup in the case of SMP!!! */ /* Disable the interrupt */ hv_set_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT, shared_sint.as_uint64); + hv_enable_coco_interrupt(cpu, vmbus_interrupt, false); simp.as_uint64 = hv_get_msr(HV_MSR_SIMP); /* diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c index e109a620c83f..3e41f686daa5 100644 --- a/drivers/hv/hv_common.c +++ b/drivers/hv/hv_common.c @@ -716,6 +716,11 @@ u64 __weak hv_tdx_hypercall(u64 control, u64 param1, u64 param2) } EXPORT_SYMBOL_GPL(hv_tdx_hypercall); +void __weak hv_enable_coco_interrupt(unsigned int cpu, unsigned int vector, bool set) +{ +} +EXPORT_SYMBOL_GPL(hv_enable_coco_interrupt); + void hv_identify_partition_type(void) { /* Assume guest role */ diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h index b89c7e3a2047..db84aced1658 100644 --- a/include/asm-generic/mshyperv.h +++ b/include/asm-generic/mshyperv.h @@ -347,6 +347,7 @@ bool hv_is_isolation_supported(void); bool hv_isolation_type_snp(void); u64 hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_size); u64 hv_tdx_hypercall(u64 control, u64 param1, u64 param2); +void hv_enable_coco_interrupt(unsigned int cpu, unsigned int vector, bool set); void hyperv_cleanup(void); bool hv_query_ext_cap(u64 cap_query); void hv_setup_dma_ops(struct device *dev, bool coherent); -- cgit v1.2.3 From 6802d8af47d1dccd9a74a1f708fb9129244ef843 Mon Sep 17 00:00:00 2001 From: Roman Kisel Date: Wed, 8 Oct 2025 16:34:04 -0700 Subject: Drivers: hv: VMBus protocol version 6.0 The confidential VMBus is supported starting from the protocol version 6.0 onwards. Provide the required definitions. No functional changes. Signed-off-by: Roman Kisel Reviewed-by: Alok Tiwari Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- drivers/hv/hyperv_vmbus.h | 2 ++ drivers/hv/vmbus_drv.c | 12 ++++++++ include/hyperv/hvgdk_mini.h | 1 + include/linux/hyperv.h | 69 ++++++++++++++++++++++++++++++++------------- 4 files changed, 65 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index 0b450e53161e..4a01797d4851 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -333,6 +333,8 @@ extern const struct vmbus_channel_message_table_entry /* General vmbus interface */ +bool vmbus_is_confidential(void); + struct hv_device *vmbus_device_create(const guid_t *type, const guid_t *instance, struct vmbus_channel *channel); diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 69591dc7bad2..3c414560fa5f 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -56,6 +56,18 @@ static long __percpu *vmbus_evt; int vmbus_irq; int vmbus_interrupt; +/* + * If the Confidential VMBus is used, the data on the "wire" is not + * visible to either the host or the hypervisor. + */ +static bool is_confidential; + +bool vmbus_is_confidential(void) +{ + return is_confidential; +} +EXPORT_SYMBOL_GPL(vmbus_is_confidential); + /* * The panic notifier below is responsible solely for unloading the * vmbus connection, which is necessary in a panic event. diff --git a/include/hyperv/hvgdk_mini.h b/include/hyperv/hvgdk_mini.h index 77abddfc750e..7f730a0e54e6 100644 --- a/include/hyperv/hvgdk_mini.h +++ b/include/hyperv/hvgdk_mini.h @@ -260,6 +260,7 @@ union hv_hypervisor_version_info { #define HYPERV_CPUID_VIRT_STACK_PROPERTIES 0x40000082 /* Support for the extended IOAPIC RTE format */ #define HYPERV_VS_PROPERTIES_EAX_EXTENDED_IOAPIC_RTE BIT(2) +#define HYPERV_VS_PROPERTIES_EAX_CONFIDENTIAL_VMBUS_AVAILABLE BIT(3) #define HYPERV_HYPERVISOR_PRESENT_BIT 0x80000000 #define HYPERV_CPUID_MIN 0x40000005 diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 59826c89171c..dfc516c1c719 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -265,16 +265,18 @@ static inline u32 hv_get_avail_to_write_percent( * Linux kernel. */ -#define VERSION_WS2008 ((0 << 16) | (13)) -#define VERSION_WIN7 ((1 << 16) | (1)) -#define VERSION_WIN8 ((2 << 16) | (4)) -#define VERSION_WIN8_1 ((3 << 16) | (0)) -#define VERSION_WIN10 ((4 << 16) | (0)) -#define VERSION_WIN10_V4_1 ((4 << 16) | (1)) -#define VERSION_WIN10_V5 ((5 << 16) | (0)) -#define VERSION_WIN10_V5_1 ((5 << 16) | (1)) -#define VERSION_WIN10_V5_2 ((5 << 16) | (2)) -#define VERSION_WIN10_V5_3 ((5 << 16) | (3)) +#define VMBUS_MAKE_VERSION(MAJ, MIN) ((((u32)MAJ) << 16) | (MIN)) +#define VERSION_WS2008 VMBUS_MAKE_VERSION(0, 13) +#define VERSION_WIN7 VMBUS_MAKE_VERSION(1, 1) +#define VERSION_WIN8 VMBUS_MAKE_VERSION(2, 4) +#define VERSION_WIN8_1 VMBUS_MAKE_VERSION(3, 0) +#define VERSION_WIN10 VMBUS_MAKE_VERSION(4, 0) +#define VERSION_WIN10_V4_1 VMBUS_MAKE_VERSION(4, 1) +#define VERSION_WIN10_V5 VMBUS_MAKE_VERSION(5, 0) +#define VERSION_WIN10_V5_1 VMBUS_MAKE_VERSION(5, 1) +#define VERSION_WIN10_V5_2 VMBUS_MAKE_VERSION(5, 2) +#define VERSION_WIN10_V5_3 VMBUS_MAKE_VERSION(5, 3) +#define VERSION_WIN10_V6_0 VMBUS_MAKE_VERSION(6, 0) /* Make maximum size of pipe payload of 16K */ #define MAX_PIPE_DATA_PAYLOAD (sizeof(u8) * 16384) @@ -335,14 +337,22 @@ struct vmbus_channel_offer { } __packed; /* Server Flags */ -#define VMBUS_CHANNEL_ENUMERATE_DEVICE_INTERFACE 1 -#define VMBUS_CHANNEL_SERVER_SUPPORTS_TRANSFER_PAGES 2 -#define VMBUS_CHANNEL_SERVER_SUPPORTS_GPADLS 4 -#define VMBUS_CHANNEL_NAMED_PIPE_MODE 0x10 -#define VMBUS_CHANNEL_LOOPBACK_OFFER 0x100 -#define VMBUS_CHANNEL_PARENT_OFFER 0x200 -#define VMBUS_CHANNEL_REQUEST_MONITORED_NOTIFICATION 0x400 -#define VMBUS_CHANNEL_TLNPI_PROVIDER_OFFER 0x2000 +#define VMBUS_CHANNEL_ENUMERATE_DEVICE_INTERFACE 0x0001 +/* + * This flag indicates that the channel is offered by the paravisor, and must + * use encrypted memory for the channel ring buffer. + */ +#define VMBUS_CHANNEL_CONFIDENTIAL_RING_BUFFER 0x0002 +/* + * This flag indicates that the channel is offered by the paravisor, and must + * use encrypted memory for GPA direct packets and additional GPADLs. + */ +#define VMBUS_CHANNEL_CONFIDENTIAL_EXTERNAL_MEMORY 0x0004 +#define VMBUS_CHANNEL_NAMED_PIPE_MODE 0x0010 +#define VMBUS_CHANNEL_LOOPBACK_OFFER 0x0100 +#define VMBUS_CHANNEL_PARENT_OFFER 0x0200 +#define VMBUS_CHANNEL_REQUEST_MONITORED_NOTIFICATION 0x0400 +#define VMBUS_CHANNEL_TLNPI_PROVIDER_OFFER 0x2000 struct vmpacket_descriptor { u16 type; @@ -621,6 +631,12 @@ struct vmbus_channel_relid_released { u32 child_relid; } __packed; +/* + * Used by the paravisor only, means that the encrypted ring buffers and + * the encrypted external memory are supported + */ +#define VMBUS_FEATURE_FLAG_CONFIDENTIAL_CHANNELS 0x10 + struct vmbus_channel_initiate_contact { struct vmbus_channel_message_header header; u32 vmbus_version_requested; @@ -630,7 +646,8 @@ struct vmbus_channel_initiate_contact { struct { u8 msg_sint; u8 msg_vtl; - u8 reserved[6]; + u8 reserved[2]; + u32 feature_flags; /* VMBus version 6.0 */ }; }; u64 monitor_page1; @@ -1003,6 +1020,10 @@ struct vmbus_channel { /* boolean to control visibility of sysfs for ring buffer */ bool ring_sysfs_visible; + /* The ring buffer is encrypted */ + bool co_ring_buffer; + /* The external memory is encrypted */ + bool co_external_memory; }; #define lock_requestor(channel, flags) \ @@ -1027,6 +1048,16 @@ u64 vmbus_request_addr_match(struct vmbus_channel *channel, u64 trans_id, u64 rqst_addr); u64 vmbus_request_addr(struct vmbus_channel *channel, u64 trans_id); +static inline bool is_co_ring_buffer(const struct vmbus_channel_offer_channel *o) +{ + return !!(o->offer.chn_flags & VMBUS_CHANNEL_CONFIDENTIAL_RING_BUFFER); +} + +static inline bool is_co_external_memory(const struct vmbus_channel_offer_channel *o) +{ + return !!(o->offer.chn_flags & VMBUS_CHANNEL_CONFIDENTIAL_EXTERNAL_MEMORY); +} + static inline bool is_hvsock_offer(const struct vmbus_channel_offer_channel *o) { return !!(o->offer.chn_flags & VMBUS_CHANNEL_TLNPI_PROVIDER_OFFER); -- cgit v1.2.3 From 7c8b6c326d830ca5c6b95f390c703966e14167e6 Mon Sep 17 00:00:00 2001 From: Roman Kisel Date: Wed, 8 Oct 2025 16:34:05 -0700 Subject: arch/x86: mshyperv: Discover Confidential VMBus availability Confidential VMBus requires enabling paravisor SynIC, and the x86_64 guest has to inspect the Virtualization Stack (VS) CPUID leaf to see if Confidential VMBus is available. If it is, the guest shall enable the paravisor SynIC. Read the relevant data from the VS CPUID leaf. Refactor the code to avoid repeating CPUID and add flags to the struct ms_hyperv_info. For ARM64, the flag for Confidential VMBus is not set which provides the desired behaviour for now as it is not available on ARM64 just yet. Once ARM64 CCA guests are supported, this flag will be set unconditionally when running such a guest. Signed-off-by: Roman Kisel Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- arch/x86/kernel/cpu/mshyperv.c | 28 +++++++++++++++------------- include/asm-generic/mshyperv.h | 2 ++ 2 files changed, 17 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 217a41b63df3..1369fae7d8a9 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -440,7 +440,7 @@ EXPORT_SYMBOL_GPL(hv_get_hypervisor_version); static void __init ms_hyperv_init_platform(void) { - int hv_max_functions_eax; + int hv_max_functions_eax, eax; #ifdef CONFIG_PARAVIRT pv_info.name = "Hyper-V"; @@ -478,6 +478,19 @@ static void __init ms_hyperv_init_platform(void) pr_info("Hyper-V: running on a nested hypervisor\n"); } + /* + * There is no check against the max function for HYPERV_CPUID_VIRT_STACK_* CPUID + * leaves as the hypervisor doesn't handle them. Even a nested root partition (L2 + * root) will not get them because the nested (L1) hypervisor filters them out. + * These are handled through intercept processing by the Windows Hyper-V stack + * or the paravisor. + */ + eax = cpuid_eax(HYPERV_CPUID_VIRT_STACK_PROPERTIES); + ms_hyperv.confidential_vmbus_available = + eax & HYPERV_VS_PROPERTIES_EAX_CONFIDENTIAL_VMBUS_AVAILABLE; + ms_hyperv.msi_ext_dest_id = + eax & HYPERV_VS_PROPERTIES_EAX_EXTENDED_IOAPIC_RTE; + if (ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS && ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) { x86_platform.calibrate_tsc = hv_get_tsc_khz; @@ -678,21 +691,10 @@ static bool __init ms_hyperv_x2apic_available(void) * pci-hyperv host bridge. * * Note: for a Hyper-V root partition, this will always return false. - * The hypervisor doesn't expose these HYPERV_CPUID_VIRT_STACK_* cpuids by - * default, they are implemented as intercepts by the Windows Hyper-V stack. - * Even a nested root partition (L2 root) will not get them because the - * nested (L1) hypervisor filters them out. */ static bool __init ms_hyperv_msi_ext_dest_id(void) { - u32 eax; - - eax = cpuid_eax(HYPERV_CPUID_VIRT_STACK_INTERFACE); - if (eax != HYPERV_VS_INTERFACE_EAX_SIGNATURE) - return false; - - eax = cpuid_eax(HYPERV_CPUID_VIRT_STACK_PROPERTIES); - return eax & HYPERV_VS_PROPERTIES_EAX_EXTENDED_IOAPIC_RTE; + return ms_hyperv.msi_ext_dest_id; } #ifdef CONFIG_AMD_MEM_ENCRYPT diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h index db84aced1658..8da1893365f0 100644 --- a/include/asm-generic/mshyperv.h +++ b/include/asm-generic/mshyperv.h @@ -62,6 +62,8 @@ struct ms_hyperv_info { }; }; u64 shared_gpa_boundary; + bool msi_ext_dest_id; + bool confidential_vmbus_available; }; extern struct ms_hyperv_info ms_hyperv; extern bool hv_nested; -- cgit v1.2.3 From e6eeb3c782739cd1613a8da856b878b99f741943 Mon Sep 17 00:00:00 2001 From: Roman Kisel Date: Wed, 8 Oct 2025 16:34:06 -0700 Subject: arch: hyperv: Get/set SynIC synth.registers via paravisor The existing Hyper-V wrappers for getting and setting MSRs are hv_get/set_msr(). Via hv_get/set_non_nested_msr(), they detect when running in a CoCo VM with a paravisor, and use the TDX or SNP guest-host communication protocol to bypass the paravisor and go directly to the host hypervisor for SynIC MSRs. The "set" function also implements the required special handling for the SINT MSRs. Provide functions that allow manipulating the SynIC registers through the paravisor. Move vmbus_signal_eom() to a more appropriate location (which also avoids breaking KVM). Signed-off-by: Roman Kisel Reviewed-by: Alok Tiwari Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- arch/x86/kernel/cpu/mshyperv.c | 20 +++++++++++++++++++ drivers/hv/hv_common.c | 11 +++++++++++ drivers/hv/hyperv_vmbus.h | 44 ++++++++++++++++++++++++++++++++++++++++++ include/asm-generic/mshyperv.h | 42 ++-------------------------------------- 4 files changed, 77 insertions(+), 40 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 1369fae7d8a9..eb9f1d7eec61 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -86,6 +86,26 @@ void hv_set_non_nested_msr(unsigned int reg, u64 value) } EXPORT_SYMBOL_GPL(hv_set_non_nested_msr); +/* + * Get the SynIC register value from the paravisor. + */ +u64 hv_para_get_synic_register(unsigned int reg) +{ + if (WARN_ON(!ms_hyperv.paravisor_present || !hv_is_synic_msr(reg))) + return ~0ULL; + return native_read_msr(reg); +} + +/* + * Set the SynIC register value with the paravisor. + */ +void hv_para_set_synic_register(unsigned int reg, u64 val) +{ + if (WARN_ON(!ms_hyperv.paravisor_present || !hv_is_synic_msr(reg))) + return; + native_write_msr(reg, val); +} + u64 hv_get_msr(unsigned int reg) { if (hv_nested) diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c index 3e41f686daa5..c39472ae8345 100644 --- a/drivers/hv/hv_common.c +++ b/drivers/hv/hv_common.c @@ -721,6 +721,17 @@ void __weak hv_enable_coco_interrupt(unsigned int cpu, unsigned int vector, bool } EXPORT_SYMBOL_GPL(hv_enable_coco_interrupt); +u64 __weak hv_para_get_synic_register(unsigned int reg) +{ + return ~0ULL; +} +EXPORT_SYMBOL_GPL(hv_para_get_synic_register); + +void __weak hv_para_set_synic_register(unsigned int reg, u64 val) +{ +} +EXPORT_SYMBOL_GPL(hv_para_set_synic_register); + void hv_identify_partition_type(void) { /* Assume guest role */ diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index 4a01797d4851..9ac6f5520287 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -335,6 +336,49 @@ extern const struct vmbus_channel_message_table_entry bool vmbus_is_confidential(void); +#if IS_ENABLED(CONFIG_HYPERV_VMBUS) +/* Free the message slot and signal end-of-message if required */ +static inline void vmbus_signal_eom(struct hv_message *msg, u32 old_msg_type) +{ + /* + * On crash we're reading some other CPU's message page and we need + * to be careful: this other CPU may already had cleared the header + * and the host may already had delivered some other message there. + * In case we blindly write msg->header.message_type we're going + * to lose it. We can still lose a message of the same type but + * we count on the fact that there can only be one + * CHANNELMSG_UNLOAD_RESPONSE and we don't care about other messages + * on crash. + */ + if (cmpxchg(&msg->header.message_type, old_msg_type, + HVMSG_NONE) != old_msg_type) + return; + + /* + * The cmxchg() above does an implicit memory barrier to + * ensure the write to MessageType (ie set to + * HVMSG_NONE) happens before we read the + * MessagePending and EOMing. Otherwise, the EOMing + * will not deliver any more messages since there is + * no empty slot + */ + if (msg->header.message_flags.msg_pending) { + /* + * This will cause message queue rescan to + * possibly deliver another msg from the + * hypervisor + */ + if (vmbus_is_confidential()) + hv_para_set_synic_register(HV_MSR_EOM, 0); + else + hv_set_msr(HV_MSR_EOM, 0); + } +} + +extern int vmbus_interrupt; +extern int vmbus_irq; +#endif /* CONFIG_HYPERV_VMBUS */ + struct hv_device *vmbus_device_create(const guid_t *type, const guid_t *instance, struct vmbus_channel *channel); diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h index 8da1893365f0..c328265de624 100644 --- a/include/asm-generic/mshyperv.h +++ b/include/asm-generic/mshyperv.h @@ -176,46 +176,6 @@ static inline u64 hv_generate_guest_id(u64 kernel_version) return guest_id; } -#if IS_ENABLED(CONFIG_HYPERV_VMBUS) -/* Free the message slot and signal end-of-message if required */ -static inline void vmbus_signal_eom(struct hv_message *msg, u32 old_msg_type) -{ - /* - * On crash we're reading some other CPU's message page and we need - * to be careful: this other CPU may already had cleared the header - * and the host may already had delivered some other message there. - * In case we blindly write msg->header.message_type we're going - * to lose it. We can still lose a message of the same type but - * we count on the fact that there can only be one - * CHANNELMSG_UNLOAD_RESPONSE and we don't care about other messages - * on crash. - */ - if (cmpxchg(&msg->header.message_type, old_msg_type, - HVMSG_NONE) != old_msg_type) - return; - - /* - * The cmxchg() above does an implicit memory barrier to - * ensure the write to MessageType (ie set to - * HVMSG_NONE) happens before we read the - * MessagePending and EOMing. Otherwise, the EOMing - * will not deliver any more messages since there is - * no empty slot - */ - if (msg->header.message_flags.msg_pending) { - /* - * This will cause message queue rescan to - * possibly deliver another msg from the - * hypervisor - */ - hv_set_msr(HV_MSR_EOM, 0); - } -} - -extern int vmbus_interrupt; -extern int vmbus_irq; -#endif /* CONFIG_HYPERV_VMBUS */ - int hv_get_hypervisor_version(union hv_hypervisor_version_info *info); void hv_setup_vmbus_handler(void (*handler)(void)); @@ -350,6 +310,8 @@ bool hv_isolation_type_snp(void); u64 hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_size); u64 hv_tdx_hypercall(u64 control, u64 param1, u64 param2); void hv_enable_coco_interrupt(unsigned int cpu, unsigned int vector, bool set); +u64 hv_para_get_synic_register(unsigned int reg); +void hv_para_set_synic_register(unsigned int reg, u64 val); void hyperv_cleanup(void); bool hv_query_ext_cap(u64 cap_query); void hv_setup_dma_ops(struct device *dev, bool coherent); -- cgit v1.2.3 From a156ad8c508209ce22f3213d25c3c2ae1774a57d Mon Sep 17 00:00:00 2001 From: Roman Kisel Date: Wed, 8 Oct 2025 16:34:07 -0700 Subject: arch/x86: mshyperv: Trap on access for some synthetic MSRs hv_set_non_nested_msr() has special handling for SINT MSRs when a paravisor is present. In addition to updating the MSR on the host, the mirror MSR in the paravisor is updated, including with the proxy bit. But with Confidential VMBus, the proxy bit must not be used, so add a special case to skip it. Signed-off-by: Roman Kisel Reviewed-by: Alok Tiwari Reviewed-by: Tianyu Lan Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- arch/x86/kernel/cpu/mshyperv.c | 29 +++++++++++++++++++++++++---- drivers/hv/hv_common.c | 5 +++++ include/asm-generic/mshyperv.h | 1 + 3 files changed, 31 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index eb9f1d7eec61..80a641a6ac48 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -39,6 +40,12 @@ bool hv_nested; struct ms_hyperv_info ms_hyperv; #if IS_ENABLED(CONFIG_HYPERV) +/* + * When running with the paravisor, controls proxying the synthetic interrupts + * from the host + */ +static bool hv_para_sint_proxy; + static inline unsigned int hv_get_nested_msr(unsigned int reg) { if (hv_is_sint_msr(reg)) @@ -75,17 +82,31 @@ EXPORT_SYMBOL_GPL(hv_get_non_nested_msr); void hv_set_non_nested_msr(unsigned int reg, u64 value) { if (hv_is_synic_msr(reg) && ms_hyperv.paravisor_present) { + /* The hypervisor will get the intercept. */ hv_ivm_msr_write(reg, value); - /* Write proxy bit via wrmsl instruction */ - if (hv_is_sint_msr(reg)) - wrmsrq(reg, value | 1 << 20); + /* Using wrmsrq so the following goes to the paravisor. */ + if (hv_is_sint_msr(reg)) { + union hv_synic_sint sint = { .as_uint64 = value }; + + sint.proxy = hv_para_sint_proxy; + native_wrmsrq(reg, sint.as_uint64); + } } else { - wrmsrq(reg, value); + native_wrmsrq(reg, value); } } EXPORT_SYMBOL_GPL(hv_set_non_nested_msr); +/* + * Enable or disable proxying synthetic interrupts + * to the paravisor. + */ +void hv_para_set_sint_proxy(bool enable) +{ + hv_para_sint_proxy = enable; +} + /* * Get the SynIC register value from the paravisor. */ diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c index c39472ae8345..4b44da7f23b2 100644 --- a/drivers/hv/hv_common.c +++ b/drivers/hv/hv_common.c @@ -721,6 +721,11 @@ void __weak hv_enable_coco_interrupt(unsigned int cpu, unsigned int vector, bool } EXPORT_SYMBOL_GPL(hv_enable_coco_interrupt); +void __weak hv_para_set_sint_proxy(bool enable) +{ +} +EXPORT_SYMBOL_GPL(hv_para_set_sint_proxy); + u64 __weak hv_para_get_synic_register(unsigned int reg) { return ~0ULL; diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h index c328265de624..ecedab554c80 100644 --- a/include/asm-generic/mshyperv.h +++ b/include/asm-generic/mshyperv.h @@ -310,6 +310,7 @@ bool hv_isolation_type_snp(void); u64 hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_size); u64 hv_tdx_hypercall(u64 control, u64 param1, u64 param2); void hv_enable_coco_interrupt(unsigned int cpu, unsigned int vector, bool set); +void hv_para_set_sint_proxy(bool enable); u64 hv_para_get_synic_register(unsigned int reg); void hv_para_set_synic_register(unsigned int reg, u64 val); void hyperv_cleanup(void); -- cgit v1.2.3 From 59aeea195948fd507cef2e439a5a964b8432750e Mon Sep 17 00:00:00 2001 From: Purna Pavan Chandra Aekkaladevi Date: Fri, 10 Oct 2025 14:55:48 -0700 Subject: mshv: Add the HVCALL_GET_PARTITION_PROPERTY_EX hypercall This hypercall can be used to fetch extended properties of a partition. Extended properties are properties with values larger than a u64. Some of these also need additional input arguments. Add helper function for using the hypercall in the mshv_root driver. Signed-off-by: Purna Pavan Chandra Aekkaladevi Signed-off-by: Nuno Das Neves Reviewed-by: Anirudh Rayabharam Reviewed-by: Praveen K Paladugu Reviewed-by: Easwar Hariharan Reviewed-by: Tianyu Lan Signed-off-by: Wei Liu --- drivers/hv/mshv_root.h | 2 ++ drivers/hv/mshv_root_hv_call.c | 31 +++++++++++++++++++++++++++++++ include/hyperv/hvgdk_mini.h | 1 + include/hyperv/hvhdk.h | 40 ++++++++++++++++++++++++++++++++++++++++ include/hyperv/hvhdk_mini.h | 26 ++++++++++++++++++++++++++ 5 files changed, 100 insertions(+) (limited to 'include') diff --git a/drivers/hv/mshv_root.h b/drivers/hv/mshv_root.h index db6b42db2fdc..0e62badfc9f1 100644 --- a/drivers/hv/mshv_root.h +++ b/drivers/hv/mshv_root.h @@ -303,6 +303,8 @@ int hv_call_unmap_stat_page(enum hv_stats_object_type type, int hv_call_modify_spa_host_access(u64 partition_id, struct page **pages, u64 page_struct_count, u32 host_access, u32 flags, u8 acquire); +int hv_call_get_partition_property_ex(u64 partition_id, u64 property_code, u64 arg, + void *property_value, size_t property_value_sz); extern struct mshv_root mshv_root; extern enum hv_scheduler_type hv_scheduler_type; diff --git a/drivers/hv/mshv_root_hv_call.c b/drivers/hv/mshv_root_hv_call.c index c9c274f29c3c..8049e51c45dc 100644 --- a/drivers/hv/mshv_root_hv_call.c +++ b/drivers/hv/mshv_root_hv_call.c @@ -590,6 +590,37 @@ int hv_call_unmap_vp_state_page(u64 partition_id, u32 vp_index, u32 type, return hv_result_to_errno(status); } +int hv_call_get_partition_property_ex(u64 partition_id, u64 property_code, + u64 arg, void *property_value, + size_t property_value_sz) +{ + u64 status; + unsigned long flags; + struct hv_input_get_partition_property_ex *input; + struct hv_output_get_partition_property_ex *output; + + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + output = *this_cpu_ptr(hyperv_pcpu_output_arg); + + memset(input, 0, sizeof(*input)); + input->partition_id = partition_id; + input->property_code = property_code; + input->arg = arg; + status = hv_do_hypercall(HVCALL_GET_PARTITION_PROPERTY_EX, input, output); + + if (!hv_result_success(status)) { + local_irq_restore(flags); + hv_status_debug(status, "\n"); + return hv_result_to_errno(status); + } + memcpy(property_value, &output->property_value, property_value_sz); + + local_irq_restore(flags); + + return 0; +} + int hv_call_clear_virtual_interrupt(u64 partition_id) { diff --git a/include/hyperv/hvgdk_mini.h b/include/hyperv/hvgdk_mini.h index 7f730a0e54e6..af85b1c36b6e 100644 --- a/include/hyperv/hvgdk_mini.h +++ b/include/hyperv/hvgdk_mini.h @@ -491,6 +491,7 @@ union hv_vp_assist_msr_contents { /* HV_REGISTER_VP_ASSIST_PAGE */ #define HVCALL_GET_VP_STATE 0x00e3 #define HVCALL_SET_VP_STATE 0x00e4 #define HVCALL_GET_VP_CPUID_VALUES 0x00f4 +#define HVCALL_GET_PARTITION_PROPERTY_EX 0x0101 #define HVCALL_MMIO_READ 0x0106 #define HVCALL_MMIO_WRITE 0x0107 diff --git a/include/hyperv/hvhdk.h b/include/hyperv/hvhdk.h index b4067ada02cf..416c0d45b793 100644 --- a/include/hyperv/hvhdk.h +++ b/include/hyperv/hvhdk.h @@ -376,6 +376,46 @@ struct hv_input_set_partition_property { u64 property_value; } __packed; +union hv_partition_property_arg { + u64 as_uint64; + struct { + union { + u32 arg; + u32 vp_index; + }; + u16 reserved0; + u8 reserved1; + u8 object_type; + } __packed; +}; + +struct hv_input_get_partition_property_ex { + u64 partition_id; + u32 property_code; /* enum hv_partition_property_code */ + u32 padding; + union { + union hv_partition_property_arg arg_data; + u64 arg; + }; +} __packed; + +/* + * NOTE: Should use hv_input_set_partition_property_ex_header to compute this + * size, but hv_input_get_partition_property_ex is identical so it suffices + */ +#define HV_PARTITION_PROPERTY_EX_MAX_VAR_SIZE \ + (HV_HYP_PAGE_SIZE - sizeof(struct hv_input_get_partition_property_ex)) + +union hv_partition_property_ex { + u8 buffer[HV_PARTITION_PROPERTY_EX_MAX_VAR_SIZE]; + struct hv_partition_property_vmm_capabilities vmm_capabilities; + /* More fields to be filled in when needed */ +}; + +struct hv_output_get_partition_property_ex { + union hv_partition_property_ex property_value; +} __packed; + enum hv_vp_state_page_type { HV_VP_STATE_PAGE_REGISTERS = 0, HV_VP_STATE_PAGE_INTERCEPT_MESSAGE = 1, diff --git a/include/hyperv/hvhdk_mini.h b/include/hyperv/hvhdk_mini.h index 858f6a3925b3..bf2ce27dfcc5 100644 --- a/include/hyperv/hvhdk_mini.h +++ b/include/hyperv/hvhdk_mini.h @@ -96,8 +96,34 @@ enum hv_partition_property_code { HV_PARTITION_PROPERTY_XSAVE_STATES = 0x00060007, HV_PARTITION_PROPERTY_MAX_XSAVE_DATA_SIZE = 0x00060008, HV_PARTITION_PROPERTY_PROCESSOR_CLOCK_FREQUENCY = 0x00060009, + + /* Extended properties with larger property values */ + HV_PARTITION_PROPERTY_VMM_CAPABILITIES = 0x00090007, }; +#define HV_PARTITION_VMM_CAPABILITIES_BANK_COUNT 1 +#define HV_PARTITION_VMM_CAPABILITIES_RESERVED_BITFIELD_COUNT 59 + +struct hv_partition_property_vmm_capabilities { + u16 bank_count; + u16 reserved[3]; + union { + u64 as_uint64[HV_PARTITION_VMM_CAPABILITIES_BANK_COUNT]; + struct { + u64 map_gpa_preserve_adjustable: 1; + u64 vmm_can_provide_overlay_gpfn: 1; + u64 vp_affinity_property: 1; +#if IS_ENABLED(CONFIG_ARM64) + u64 vmm_can_provide_gic_overlay_locations: 1; +#else + u64 reservedbit3: 1; +#endif + u64 assignable_synthetic_proc_features: 1; + u64 reserved0: HV_PARTITION_VMM_CAPABILITIES_RESERVED_BITFIELD_COUNT; + } __packed; + }; +} __packed; + enum hv_snp_status { HV_SNP_STATUS_NONE = 0, HV_SNP_STATUS_AVAILABLE = 1, -- cgit v1.2.3 From d62313bdf5961b5f815f0b212f029cf146a8a804 Mon Sep 17 00:00:00 2001 From: Jinank Jain Date: Fri, 10 Oct 2025 14:55:51 -0700 Subject: mshv: Introduce new hypercall to map stats page for L1VH partitions Introduce HVCALL_MAP_STATS_PAGE2 which provides a map location (GPFN) to map the stats to. This hypercall is required for L1VH partitions, depending on the hypervisor version. This uses the same check as the state page map location; mshv_use_overlay_gpfn(). Add mshv_map_vp_state_page() helpers to use this new hypercall or the old one depending on availability. For unmapping, the original HVCALL_UNMAP_STATS_PAGE works for both cases. Signed-off-by: Jinank Jain Signed-off-by: Nuno Das Neves Reviewed-by: Easwar Hariharan Signed-off-by: Wei Liu --- drivers/hv/mshv_root.h | 10 ++--- drivers/hv/mshv_root_hv_call.c | 95 +++++++++++++++++++++++++++++++++++++++--- drivers/hv/mshv_root_main.c | 22 +++++----- include/hyperv/hvgdk_mini.h | 1 + include/hyperv/hvhdk_mini.h | 7 ++++ 5 files changed, 115 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/drivers/hv/mshv_root.h b/drivers/hv/mshv_root.h index 5b57d894358a..3eb815011b46 100644 --- a/drivers/hv/mshv_root.h +++ b/drivers/hv/mshv_root.h @@ -297,11 +297,11 @@ int hv_call_connect_port(u64 port_partition_id, union hv_port_id port_id, int hv_call_disconnect_port(u64 connection_partition_id, union hv_connection_id connection_id); int hv_call_notify_port_ring_empty(u32 sint_index); -int hv_call_map_stat_page(enum hv_stats_object_type type, - const union hv_stats_object_identity *identity, - void **addr); -int hv_call_unmap_stat_page(enum hv_stats_object_type type, - const union hv_stats_object_identity *identity); +int hv_map_stats_page(enum hv_stats_object_type type, + const union hv_stats_object_identity *identity, + void **addr); +int hv_unmap_stats_page(enum hv_stats_object_type type, void *page_addr, + const union hv_stats_object_identity *identity); int hv_call_modify_spa_host_access(u64 partition_id, struct page **pages, u64 page_struct_count, u32 host_access, u32 flags, u8 acquire); diff --git a/drivers/hv/mshv_root_hv_call.c b/drivers/hv/mshv_root_hv_call.c index 6dac9fcc092c..caf02cfa49c9 100644 --- a/drivers/hv/mshv_root_hv_call.c +++ b/drivers/hv/mshv_root_hv_call.c @@ -807,9 +807,51 @@ hv_call_notify_port_ring_empty(u32 sint_index) return hv_result_to_errno(status); } -int hv_call_map_stat_page(enum hv_stats_object_type type, - const union hv_stats_object_identity *identity, - void **addr) +static int hv_call_map_stats_page2(enum hv_stats_object_type type, + const union hv_stats_object_identity *identity, + u64 map_location) +{ + unsigned long flags; + struct hv_input_map_stats_page2 *input; + u64 status; + int ret; + + if (!map_location || !mshv_use_overlay_gpfn()) + return -EINVAL; + + do { + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + + memset(input, 0, sizeof(*input)); + input->type = type; + input->identity = *identity; + input->map_location = map_location; + + status = hv_do_hypercall(HVCALL_MAP_STATS_PAGE2, input, NULL); + + local_irq_restore(flags); + + ret = hv_result_to_errno(status); + + if (!ret) + break; + + if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { + hv_status_debug(status, "\n"); + break; + } + + ret = hv_call_deposit_pages(NUMA_NO_NODE, + hv_current_partition_id, 1); + } while (!ret); + + return ret; +} + +static int hv_call_map_stats_page(enum hv_stats_object_type type, + const union hv_stats_object_identity *identity, + void **addr) { unsigned long flags; struct hv_input_map_stats_page *input; @@ -848,8 +890,38 @@ int hv_call_map_stat_page(enum hv_stats_object_type type, return ret; } -int hv_call_unmap_stat_page(enum hv_stats_object_type type, - const union hv_stats_object_identity *identity) +int hv_map_stats_page(enum hv_stats_object_type type, + const union hv_stats_object_identity *identity, + void **addr) +{ + int ret; + struct page *allocated_page = NULL; + + if (!addr) + return -EINVAL; + + if (mshv_use_overlay_gpfn()) { + allocated_page = alloc_page(GFP_KERNEL); + if (!allocated_page) + return -ENOMEM; + + ret = hv_call_map_stats_page2(type, identity, + page_to_pfn(allocated_page)); + *addr = page_address(allocated_page); + } else { + ret = hv_call_map_stats_page(type, identity, addr); + } + + if (ret && allocated_page) { + __free_page(allocated_page); + *addr = NULL; + } + + return ret; +} + +static int hv_call_unmap_stats_page(enum hv_stats_object_type type, + const union hv_stats_object_identity *identity) { unsigned long flags; struct hv_input_unmap_stats_page *input; @@ -868,6 +940,19 @@ int hv_call_unmap_stat_page(enum hv_stats_object_type type, return hv_result_to_errno(status); } +int hv_unmap_stats_page(enum hv_stats_object_type type, void *page_addr, + const union hv_stats_object_identity *identity) +{ + int ret; + + ret = hv_call_unmap_stats_page(type, identity); + + if (mshv_use_overlay_gpfn() && page_addr) + __free_page(virt_to_page(page_addr)); + + return ret; +} + int hv_call_modify_spa_host_access(u64 partition_id, struct page **pages, u64 page_struct_count, u32 host_access, u32 flags, u8 acquire) diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c index 288695a859f7..7684645ef00d 100644 --- a/drivers/hv/mshv_root_main.c +++ b/drivers/hv/mshv_root_main.c @@ -843,7 +843,8 @@ mshv_vp_release(struct inode *inode, struct file *filp) return 0; } -static void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index) +static void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index, + void *stats_pages[]) { union hv_stats_object_identity identity = { .vp.partition_id = partition_id, @@ -851,10 +852,10 @@ static void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index) }; identity.vp.stats_area_type = HV_STATS_AREA_SELF; - hv_call_unmap_stat_page(HV_STATS_OBJECT_VP, &identity); + hv_unmap_stats_page(HV_STATS_OBJECT_VP, NULL, &identity); identity.vp.stats_area_type = HV_STATS_AREA_PARENT; - hv_call_unmap_stat_page(HV_STATS_OBJECT_VP, &identity); + hv_unmap_stats_page(HV_STATS_OBJECT_VP, NULL, &identity); } static int mshv_vp_stats_map(u64 partition_id, u32 vp_index, @@ -867,14 +868,14 @@ static int mshv_vp_stats_map(u64 partition_id, u32 vp_index, int err; identity.vp.stats_area_type = HV_STATS_AREA_SELF; - err = hv_call_map_stat_page(HV_STATS_OBJECT_VP, &identity, - &stats_pages[HV_STATS_AREA_SELF]); + err = hv_map_stats_page(HV_STATS_OBJECT_VP, &identity, + &stats_pages[HV_STATS_AREA_SELF]); if (err) return err; identity.vp.stats_area_type = HV_STATS_AREA_PARENT; - err = hv_call_map_stat_page(HV_STATS_OBJECT_VP, &identity, - &stats_pages[HV_STATS_AREA_PARENT]); + err = hv_map_stats_page(HV_STATS_OBJECT_VP, &identity, + &stats_pages[HV_STATS_AREA_PARENT]); if (err) goto unmap_self; @@ -882,7 +883,7 @@ static int mshv_vp_stats_map(u64 partition_id, u32 vp_index, unmap_self: identity.vp.stats_area_type = HV_STATS_AREA_SELF; - hv_call_unmap_stat_page(HV_STATS_OBJECT_VP, &identity); + hv_unmap_stats_page(HV_STATS_OBJECT_VP, NULL, &identity); return err; } @@ -990,7 +991,7 @@ free_vp: kfree(vp); unmap_stats_pages: if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) - mshv_vp_stats_unmap(partition->pt_id, args.vp_index); + mshv_vp_stats_unmap(partition->pt_id, args.vp_index, stats_pages); unmap_ghcb_page: if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available()) hv_unmap_vp_state_page(partition->pt_id, args.vp_index, @@ -1742,7 +1743,8 @@ static void destroy_partition(struct mshv_partition *partition) continue; if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) - mshv_vp_stats_unmap(partition->pt_id, vp->vp_index); + mshv_vp_stats_unmap(partition->pt_id, vp->vp_index, + (void **)vp->vp_stats_pages); if (vp->vp_register_page) { (void)hv_unmap_vp_state_page(partition->pt_id, diff --git a/include/hyperv/hvgdk_mini.h b/include/hyperv/hvgdk_mini.h index af85b1c36b6e..f6e31d1c3267 100644 --- a/include/hyperv/hvgdk_mini.h +++ b/include/hyperv/hvgdk_mini.h @@ -494,6 +494,7 @@ union hv_vp_assist_msr_contents { /* HV_REGISTER_VP_ASSIST_PAGE */ #define HVCALL_GET_PARTITION_PROPERTY_EX 0x0101 #define HVCALL_MMIO_READ 0x0106 #define HVCALL_MMIO_WRITE 0x0107 +#define HVCALL_MAP_STATS_PAGE2 0x0131 /* HV_HYPERCALL_INPUT */ #define HV_HYPERCALL_RESULT_MASK GENMASK_ULL(15, 0) diff --git a/include/hyperv/hvhdk_mini.h b/include/hyperv/hvhdk_mini.h index bf2ce27dfcc5..064bf735cab6 100644 --- a/include/hyperv/hvhdk_mini.h +++ b/include/hyperv/hvhdk_mini.h @@ -177,6 +177,13 @@ struct hv_input_map_stats_page { union hv_stats_object_identity identity; } __packed; +struct hv_input_map_stats_page2 { + u32 type; /* enum hv_stats_object_type */ + u32 padding; + union hv_stats_object_identity identity; + u64 map_location; +} __packed; + struct hv_output_map_stats_page { u64 map_location; } __packed; -- cgit v1.2.3 From 56c3feb3cc17b764f51191fd3dc461ab55a7b803 Mon Sep 17 00:00:00 2001 From: Mukesh Rathor Date: Mon, 6 Oct 2025 15:42:04 -0700 Subject: hyperv: Add two new hypercall numbers to guest ABI public header In preparation for the subsequent crashdump patches, copy two hypercall numbers to the guest ABI header published by Hyper-V. One to notify hypervisor of an event that occurs in the root partition, other to ask hypervisor to disable the hypervisor. Signed-off-by: Mukesh Rathor Signed-off-by: Wei Liu --- include/hyperv/hvgdk_mini.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/hyperv/hvgdk_mini.h b/include/hyperv/hvgdk_mini.h index f6e31d1c3267..7499a679e60a 100644 --- a/include/hyperv/hvgdk_mini.h +++ b/include/hyperv/hvgdk_mini.h @@ -470,6 +470,7 @@ union hv_vp_assist_msr_contents { /* HV_REGISTER_VP_ASSIST_PAGE */ #define HVCALL_MAP_DEVICE_INTERRUPT 0x007c #define HVCALL_UNMAP_DEVICE_INTERRUPT 0x007d #define HVCALL_RETARGET_INTERRUPT 0x007e +#define HVCALL_NOTIFY_PARTITION_EVENT 0x0087 #define HVCALL_NOTIFY_PORT_RING_EMPTY 0x008b #define HVCALL_REGISTER_INTERCEPT_RESULT 0x0091 #define HVCALL_ASSERT_VIRTUAL_INTERRUPT 0x0094 @@ -494,6 +495,7 @@ union hv_vp_assist_msr_contents { /* HV_REGISTER_VP_ASSIST_PAGE */ #define HVCALL_GET_PARTITION_PROPERTY_EX 0x0101 #define HVCALL_MMIO_READ 0x0106 #define HVCALL_MMIO_WRITE 0x0107 +#define HVCALL_DISABLE_HYP_EX 0x010f #define HVCALL_MAP_STATS_PAGE2 0x0131 /* HV_HYPERCALL_INPUT */ -- cgit v1.2.3 From e0a975ecd2e671664d208723476eeabb3baf08be Mon Sep 17 00:00:00 2001 From: Mukesh Rathor Date: Mon, 6 Oct 2025 15:42:05 -0700 Subject: hyperv: Add definitions for hypervisor crash dump support Add data structures for hypervisor crash dump support to the hypervisor host ABI header file. Details of their usages are in subsequent commits. Signed-off-by: Mukesh Rathor Signed-off-by: Wei Liu --- include/hyperv/hvhdk_mini.h | 55 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) (limited to 'include') diff --git a/include/hyperv/hvhdk_mini.h b/include/hyperv/hvhdk_mini.h index 064bf735cab6..f2d7b50de7a4 100644 --- a/include/hyperv/hvhdk_mini.h +++ b/include/hyperv/hvhdk_mini.h @@ -142,6 +142,17 @@ enum hv_system_property { /* Add more values when needed */ HV_SYSTEM_PROPERTY_SCHEDULER_TYPE = 15, HV_DYNAMIC_PROCESSOR_FEATURE_PROPERTY = 21, + HV_SYSTEM_PROPERTY_CRASHDUMPAREA = 47, +}; + +#define HV_PFN_RANGE_PGBITS 24 /* HV_SPA_PAGE_RANGE_ADDITIONAL_PAGES_BITS */ +union hv_pfn_range { /* HV_SPA_PAGE_RANGE */ + u64 as_uint64; + struct { + /* 39:0: base pfn. 63:40: additional pages */ + u64 base_pfn : 64 - HV_PFN_RANGE_PGBITS; + u64 add_pfns : HV_PFN_RANGE_PGBITS; + } __packed; }; enum hv_dynamic_processor_feature_property { @@ -168,6 +179,8 @@ struct hv_output_get_system_property { #if IS_ENABLED(CONFIG_X86) u64 hv_processor_feature_value; #endif + union hv_pfn_range hv_cda_info; /* CrashdumpAreaAddress */ + u64 hv_tramp_pa; /* CrashdumpTrampolineAddress */ }; } __packed; @@ -267,6 +280,48 @@ union hv_gpa_page_access_state { u8 as_uint8; } __packed; +enum hv_crashdump_action { + HV_CRASHDUMP_NONE = 0, + HV_CRASHDUMP_SUSPEND_ALL_VPS, + HV_CRASHDUMP_PREPARE_FOR_STATE_SAVE, + HV_CRASHDUMP_STATE_SAVED, + HV_CRASHDUMP_ENTRY, +}; + +struct hv_partition_event_root_crashdump_input { + u32 crashdump_action; /* enum hv_crashdump_action */ +} __packed; + +struct hv_input_disable_hyp_ex { /* HV_X64_INPUT_DISABLE_HYPERVISOR_EX */ + u64 rip; + u64 arg; +} __packed; + +struct hv_crashdump_area { /* HV_CRASHDUMP_AREA */ + u32 version; + union { + u32 flags_as_uint32; + struct { + u32 cda_valid : 1; + u32 cda_unused : 31; + } __packed; + }; + /* more unused fields */ +} __packed; + +union hv_partition_event_input { + struct hv_partition_event_root_crashdump_input crashdump_input; +}; + +enum hv_partition_event { + HV_PARTITION_EVENT_ROOT_CRASHDUMP = 2, +}; + +struct hv_input_notify_partition_event { + u32 event; /* enum hv_partition_event */ + union hv_partition_event_input input; +} __packed; + struct hv_lp_startup_status { u64 hv_status; u64 substatus1; -- cgit v1.2.3 From f91bc8f61abf0e1d23108ae9871c60d7612a09b2 Mon Sep 17 00:00:00 2001 From: Magnus Kulke Date: Thu, 6 Nov 2025 14:13:31 -0800 Subject: mshv: Allow mappings that overlap in uaddr Currently the MSHV driver rejects mappings that would overlap in userspace. Some VMMs require the same memory to be mapped to different parts of the guest's address space, and so working around this restriction is difficult. The hypervisor itself doesn't prohibit mappings that overlap in uaddr, (really in SPA; system physical addresses), so supporting this in the driver doesn't require any extra work: only the checks need to be removed. Since no userspace code until now has been able to overlap regions in userspace, relaxing this constraint can't break any existing code. Signed-off-by: Magnus Kulke Signed-off-by: Nuno Das Neves Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- drivers/hv/mshv_root_main.c | 8 ++------ include/uapi/linux/mshv.h | 2 +- 2 files changed, 3 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c index a8f3c5f3ce34..3f73c468e975 100644 --- a/drivers/hv/mshv_root_main.c +++ b/drivers/hv/mshv_root_main.c @@ -1188,12 +1188,8 @@ static int mshv_partition_create_region(struct mshv_partition *partition, /* Reject overlapping regions */ hlist_for_each_entry(rg, &partition->pt_mem_regions, hnode) { - u64 rg_size = rg->nr_pages << HV_HYP_PAGE_SHIFT; - - if ((mem->guest_pfn + nr_pages <= rg->start_gfn || - rg->start_gfn + rg->nr_pages <= mem->guest_pfn) && - (mem->userspace_addr + mem->size <= rg->start_uaddr || - rg->start_uaddr + rg_size <= mem->userspace_addr)) + if (mem->guest_pfn + nr_pages <= rg->start_gfn || + rg->start_gfn + rg->nr_pages <= mem->guest_pfn) continue; return -EEXIST; diff --git a/include/uapi/linux/mshv.h b/include/uapi/linux/mshv.h index 876bfe4e4227..374f75e198bc 100644 --- a/include/uapi/linux/mshv.h +++ b/include/uapi/linux/mshv.h @@ -89,7 +89,7 @@ enum { * @rsvd: MBZ * * Map or unmap a region of userspace memory to Guest Physical Addresses (GPA). - * Mappings can't overlap in GPA space or userspace. + * Mappings can't overlap in GPA space. * To unmap, these fields must match an existing mapping. */ struct mshv_user_mem_region { -- cgit v1.2.3 From c91fe5f162f278d4aa960d06d2dbc42f9857593a Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Thu, 13 Nov 2025 11:45:33 -0800 Subject: mshv: Extend create partition ioctl to support cpu features The existing mshv create partition ioctl does not provide a way to specify which cpu features are enabled in the guest. Instead, it attempts to enable all features and those that are not supported are silently disabled by the hypervisor. This was done to reduce unnecessary complexity and is sufficient for many cases. However, new scenarios require fine-grained control over these features. Define a new mshv_create_partition_v2 structure which supports passing the disabled processor and xsave feature bits through to the create partition hypercall directly. Introduce a new flag MSHV_PT_BIT_CPU_AND_XSAVE_FEATURES which enables the new structure. If unset, the original mshv_create_partition struct is used, with the old behavior of enabling all features. Co-developed-by: Jinank Jain Signed-off-by: Jinank Jain Signed-off-by: Muminul Islam Signed-off-by: Nuno Das Neves Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- drivers/hv/mshv_root_main.c | 118 ++++++++++++++++++++++++++++++++++++-------- include/uapi/linux/mshv.h | 34 +++++++++++++ 2 files changed, 131 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c index 3f73c468e975..45c7a5fea1cf 100644 --- a/drivers/hv/mshv_root_main.c +++ b/drivers/hv/mshv_root_main.c @@ -1855,43 +1855,119 @@ add_partition(struct mshv_partition *partition) return 0; } -static long -mshv_ioctl_create_partition(void __user *user_arg, struct device *module_dev) +static_assert(MSHV_NUM_CPU_FEATURES_BANKS == + HV_PARTITION_PROCESSOR_FEATURES_BANKS); + +static long mshv_ioctl_process_pt_flags(void __user *user_arg, u64 *pt_flags, + struct hv_partition_creation_properties *cr_props, + union hv_partition_isolation_properties *isol_props) { - struct mshv_create_partition args; - u64 creation_flags; - struct hv_partition_creation_properties creation_properties = {}; - union hv_partition_isolation_properties isolation_properties = {}; - struct mshv_partition *partition; - struct file *file; - int fd; - long ret; + int i; + struct mshv_create_partition_v2 args; + union hv_partition_processor_features *disabled_procs; + union hv_partition_processor_xsave_features *disabled_xsave; - if (copy_from_user(&args, user_arg, sizeof(args))) + /* First, copy v1 struct in case user is on previous versions */ + if (copy_from_user(&args, user_arg, + sizeof(struct mshv_create_partition))) return -EFAULT; if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) || args.pt_isolation >= MSHV_PT_ISOLATION_COUNT) return -EINVAL; + disabled_procs = &cr_props->disabled_processor_features; + disabled_xsave = &cr_props->disabled_processor_xsave_features; + + /* Check if user provided newer struct with feature fields */ + if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_CPU_AND_XSAVE_FEATURES)) { + if (copy_from_user(&args, user_arg, sizeof(args))) + return -EFAULT; + + /* Re-validate v1 fields after second copy_from_user() */ + if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) || + args.pt_isolation >= MSHV_PT_ISOLATION_COUNT) + return -EINVAL; + + if (args.pt_num_cpu_fbanks != MSHV_NUM_CPU_FEATURES_BANKS || + mshv_field_nonzero(args, pt_rsvd) || + mshv_field_nonzero(args, pt_rsvd1)) + return -EINVAL; + + /* + * Note this assumes MSHV_NUM_CPU_FEATURES_BANKS will never + * change and equals HV_PARTITION_PROCESSOR_FEATURES_BANKS + * (i.e. 2). + * + * Further banks (index >= 2) will be modifiable as 'early' + * properties via the set partition property hypercall. + */ + for (i = 0; i < HV_PARTITION_PROCESSOR_FEATURES_BANKS; i++) + disabled_procs->as_uint64[i] = args.pt_cpu_fbanks[i]; + +#if IS_ENABLED(CONFIG_X86_64) + disabled_xsave->as_uint64 = args.pt_disabled_xsave; +#else + /* + * In practice this field is ignored on arm64, but safer to + * zero it in case it is ever used. + */ + disabled_xsave->as_uint64 = 0; + + if (mshv_field_nonzero(args, pt_rsvd2)) + return -EINVAL; +#endif + } else { + /* + * v1 behavior: try to enable everything. The hypervisor will + * disable features that are not supported. The banks can be + * queried via the get partition property hypercall. + */ + for (i = 0; i < HV_PARTITION_PROCESSOR_FEATURES_BANKS; i++) + disabled_procs->as_uint64[i] = 0; + + disabled_xsave->as_uint64 = 0; + } + /* Only support EXO partitions */ - creation_flags = HV_PARTITION_CREATION_FLAG_EXO_PARTITION | - HV_PARTITION_CREATION_FLAG_INTERCEPT_MESSAGE_PAGE_ENABLED; + *pt_flags = HV_PARTITION_CREATION_FLAG_EXO_PARTITION | + HV_PARTITION_CREATION_FLAG_INTERCEPT_MESSAGE_PAGE_ENABLED; + + if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_LAPIC)) + *pt_flags |= HV_PARTITION_CREATION_FLAG_LAPIC_ENABLED; + if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_X2APIC)) + *pt_flags |= HV_PARTITION_CREATION_FLAG_X2APIC_CAPABLE; + if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_GPA_SUPER_PAGES)) + *pt_flags |= HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED; - if (args.pt_flags & BIT(MSHV_PT_BIT_LAPIC)) - creation_flags |= HV_PARTITION_CREATION_FLAG_LAPIC_ENABLED; - if (args.pt_flags & BIT(MSHV_PT_BIT_X2APIC)) - creation_flags |= HV_PARTITION_CREATION_FLAG_X2APIC_CAPABLE; - if (args.pt_flags & BIT(MSHV_PT_BIT_GPA_SUPER_PAGES)) - creation_flags |= HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED; + isol_props->as_uint64 = 0; switch (args.pt_isolation) { case MSHV_PT_ISOLATION_NONE: - isolation_properties.isolation_type = - HV_PARTITION_ISOLATION_TYPE_NONE; + isol_props->isolation_type = HV_PARTITION_ISOLATION_TYPE_NONE; break; } + return 0; +} + +static long +mshv_ioctl_create_partition(void __user *user_arg, struct device *module_dev) +{ + u64 creation_flags; + struct hv_partition_creation_properties creation_properties; + union hv_partition_isolation_properties isolation_properties; + struct mshv_partition *partition; + struct file *file; + int fd; + long ret; + + ret = mshv_ioctl_process_pt_flags(user_arg, &creation_flags, + &creation_properties, + &isolation_properties); + if (ret) + return ret; + partition = kzalloc(sizeof(*partition), GFP_KERNEL); if (!partition) return -ENOMEM; diff --git a/include/uapi/linux/mshv.h b/include/uapi/linux/mshv.h index 374f75e198bc..b645d17cc531 100644 --- a/include/uapi/linux/mshv.h +++ b/include/uapi/linux/mshv.h @@ -26,6 +26,7 @@ enum { MSHV_PT_BIT_LAPIC, MSHV_PT_BIT_X2APIC, MSHV_PT_BIT_GPA_SUPER_PAGES, + MSHV_PT_BIT_CPU_AND_XSAVE_FEATURES, MSHV_PT_BIT_COUNT, }; @@ -41,6 +42,8 @@ enum { * @pt_flags: Bitmask of 1 << MSHV_PT_BIT_* * @pt_isolation: MSHV_PT_ISOLATION_* * + * This is the initial/v1 version for backward compatibility. + * * Returns a file descriptor to act as a handle to a guest partition. * At this point the partition is not yet initialized in the hypervisor. * Some operations must be done with the partition in this state, e.g. setting @@ -52,6 +55,37 @@ struct mshv_create_partition { __u64 pt_isolation; }; +#define MSHV_NUM_CPU_FEATURES_BANKS 2 + +/** + * struct mshv_create_partition_v2 + * + * This is extended version of the above initial MSHV_CREATE_PARTITION + * ioctl and allows for following additional parameters: + * + * @pt_num_cpu_fbanks: Must be set to MSHV_NUM_CPU_FEATURES_BANKS. + * @pt_cpu_fbanks: Disabled processor feature banks array. + * @pt_disabled_xsave: Disabled xsave feature bits. + * + * pt_cpu_fbanks and pt_disabled_xsave are passed through as-is to the create + * partition hypercall. + * + * Returns : same as above original mshv_create_partition + */ +struct mshv_create_partition_v2 { + __u64 pt_flags; + __u64 pt_isolation; + __u16 pt_num_cpu_fbanks; + __u8 pt_rsvd[6]; /* MBZ */ + __u64 pt_cpu_fbanks[MSHV_NUM_CPU_FEATURES_BANKS]; + __u64 pt_rsvd1[2]; /* MBZ */ +#if defined(__x86_64__) + __u64 pt_disabled_xsave; +#else + __u64 pt_rsvd2; /* MBZ */ +#endif +} __packed; + /* /dev/mshv */ #define MSHV_CREATE_PARTITION _IOW(MSHV_IOCTL, 0x00, struct mshv_create_partition) -- cgit v1.2.3 From 796ef5a7fe86a8605f2844471ed7baa8e80bace8 Mon Sep 17 00:00:00 2001 From: Naman Jain Date: Thu, 13 Nov 2025 04:41:47 +0000 Subject: static_call: allow using STATIC_CALL_TRAMP_STR() from assembly STATIC_CALL_TRAMP_STR() could not be used from .S files because static_call_types.h was not safe to include in assembly as it pulled in C types/constructs that are unavailable under __ASSEMBLY__. Make the header assembly-friendly by adding __ASSEMBLY__ checks and providing only the minimal definitions needed for assembly, so that it can be safely included by .S code. This enables emitting the static call trampoline symbol name via STATIC_CALL_TRAMP_STR() directly in assembly sources, to be used with 'call' instruction. Also, move a certain definitions out of __ASSEMBLY__ checks in compiler_types.h to meet the dependencies. No functional change for C compilation. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Naman Jain Signed-off-by: Wei Liu --- include/linux/compiler_types.h | 8 ++++---- include/linux/static_call_types.h | 4 ++++ tools/include/linux/static_call_types.h | 4 ++++ 3 files changed, 12 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 59288a2c1ad2..6897d4d5cb28 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -11,6 +11,10 @@ #define __has_builtin(x) (0) #endif +/* Indirect macros required for expanded argument pasting, eg. __LINE__. */ +#define ___PASTE(a, b) a##b +#define __PASTE(a, b) ___PASTE(a, b) + #ifndef __ASSEMBLY__ /* @@ -79,10 +83,6 @@ static inline void __chk_io_ptr(const volatile void __iomem *ptr) { } # define __builtin_warning(x, y...) (1) #endif /* __CHECKER__ */ -/* Indirect macros required for expanded argument pasting, eg. __LINE__. */ -#define ___PASTE(a,b) a##b -#define __PASTE(a,b) ___PASTE(a,b) - #ifdef __KERNEL__ /* Attributes */ diff --git a/include/linux/static_call_types.h b/include/linux/static_call_types.h index 5a00b8b2cf9f..cfb6ddeb292b 100644 --- a/include/linux/static_call_types.h +++ b/include/linux/static_call_types.h @@ -25,6 +25,8 @@ #define STATIC_CALL_SITE_INIT 2UL /* init section */ #define STATIC_CALL_SITE_FLAGS 3UL +#ifndef __ASSEMBLY__ + /* * The static call site table needs to be created by external tooling (objtool * or a compiler plugin). @@ -100,4 +102,6 @@ struct static_call_key { #endif /* CONFIG_HAVE_STATIC_CALL */ +#endif /* __ASSEMBLY__ */ + #endif /* _STATIC_CALL_TYPES_H */ diff --git a/tools/include/linux/static_call_types.h b/tools/include/linux/static_call_types.h index 5a00b8b2cf9f..cfb6ddeb292b 100644 --- a/tools/include/linux/static_call_types.h +++ b/tools/include/linux/static_call_types.h @@ -25,6 +25,8 @@ #define STATIC_CALL_SITE_INIT 2UL /* init section */ #define STATIC_CALL_SITE_FLAGS 3UL +#ifndef __ASSEMBLY__ + /* * The static call site table needs to be created by external tooling (objtool * or a compiler plugin). @@ -100,4 +102,6 @@ struct static_call_key { #endif /* CONFIG_HAVE_STATIC_CALL */ +#endif /* __ASSEMBLY__ */ + #endif /* _STATIC_CALL_TYPES_H */ -- cgit v1.2.3 From 7bfe3b8ea6e30437e01fcb8e4f56ef6e4d986d0f Mon Sep 17 00:00:00 2001 From: Naman Jain Date: Thu, 13 Nov 2025 04:41:49 +0000 Subject: Drivers: hv: Introduce mshv_vtl driver Provide an interface for Virtual Machine Monitor like OpenVMM and its use as OpenHCL paravisor to control VTL0 (Virtual trust Level). Expose devices and support IOCTLs for features like VTL creation, VTL0 memory management, context switch, making hypercalls, mapping VTL0 address space to VTL2 userspace, getting new VMBus messages and channel events in VTL2 etc. Co-developed-by: Roman Kisel Signed-off-by: Roman Kisel Co-developed-by: Saurabh Sengar Signed-off-by: Saurabh Sengar Reviewed-by: Michael Kelley Signed-off-by: Naman Jain Signed-off-by: Wei Liu --- arch/x86/hyperv/Makefile | 10 +- arch/x86/hyperv/hv_vtl.c | 30 + arch/x86/hyperv/mshv-asm-offsets.c | 37 + arch/x86/hyperv/mshv_vtl_asm.S | 116 +++ arch/x86/include/asm/mshyperv.h | 34 + drivers/hv/Kconfig | 27 +- drivers/hv/Makefile | 7 +- drivers/hv/mshv_vtl.h | 25 + drivers/hv/mshv_vtl_main.c | 1392 ++++++++++++++++++++++++++++++++++++ include/hyperv/hvgdk_mini.h | 106 +++ include/uapi/linux/mshv.h | 80 +++ 11 files changed, 1861 insertions(+), 3 deletions(-) create mode 100644 arch/x86/hyperv/mshv-asm-offsets.c create mode 100644 arch/x86/hyperv/mshv_vtl_asm.S create mode 100644 drivers/hv/mshv_vtl.h create mode 100644 drivers/hv/mshv_vtl_main.c (limited to 'include') diff --git a/arch/x86/hyperv/Makefile b/arch/x86/hyperv/Makefile index 6f5d97cddd80..56292102af62 100644 --- a/arch/x86/hyperv/Makefile +++ b/arch/x86/hyperv/Makefile @@ -1,7 +1,12 @@ # SPDX-License-Identifier: GPL-2.0-only obj-y := hv_init.o mmu.o nested.o irqdomain.o ivm.o obj-$(CONFIG_X86_64) += hv_apic.o -obj-$(CONFIG_HYPERV_VTL_MODE) += hv_vtl.o +obj-$(CONFIG_HYPERV_VTL_MODE) += hv_vtl.o mshv_vtl_asm.o + +$(obj)/mshv_vtl_asm.o: $(obj)/mshv-asm-offsets.h + +$(obj)/mshv-asm-offsets.h: $(obj)/mshv-asm-offsets.s FORCE + $(call filechk,offsets,__MSHV_ASM_OFFSETS_H__) ifdef CONFIG_X86_64 obj-$(CONFIG_PARAVIRT_SPINLOCKS) += hv_spinlock.o @@ -12,3 +17,6 @@ obj-$(CONFIG_PARAVIRT_SPINLOCKS) += hv_spinlock.o obj-$(CONFIG_CRASH_DUMP) += hv_crash.o hv_trampoline.o endif endif + +targets += mshv-asm-offsets.s +clean-files += mshv-asm-offsets.h diff --git a/arch/x86/hyperv/hv_vtl.c b/arch/x86/hyperv/hv_vtl.c index 042e8712d8de..c0edaed0efb3 100644 --- a/arch/x86/hyperv/hv_vtl.c +++ b/arch/x86/hyperv/hv_vtl.c @@ -9,12 +9,17 @@ #include #include #include +#include +#include #include #include #include #include #include +#include +#include #include <../kernel/smpboot.h> +#include "../../kernel/fpu/legacy.h" extern struct boot_params boot_params; static struct real_mode_header hv_vtl_real_mode_header; @@ -249,3 +254,28 @@ int __init hv_vtl_early_init(void) return 0; } + +DEFINE_STATIC_CALL_NULL(__mshv_vtl_return_hypercall, void (*)(void)); + +void mshv_vtl_return_call_init(u64 vtl_return_offset) +{ + static_call_update(__mshv_vtl_return_hypercall, + (void *)((u8 *)hv_hypercall_pg + vtl_return_offset)); +} +EXPORT_SYMBOL(mshv_vtl_return_call_init); + +void mshv_vtl_return_call(struct mshv_vtl_cpu_context *vtl0) +{ + struct hv_vp_assist_page *hvp; + + hvp = hv_vp_assist_page[smp_processor_id()]; + hvp->vtl_ret_x64rax = vtl0->rax; + hvp->vtl_ret_x64rcx = vtl0->rcx; + + kernel_fpu_begin_mask(0); + fxrstor(&vtl0->fx_state); + __mshv_vtl_return_call(vtl0); + fxsave(&vtl0->fx_state); + kernel_fpu_end(); +} +EXPORT_SYMBOL(mshv_vtl_return_call); diff --git a/arch/x86/hyperv/mshv-asm-offsets.c b/arch/x86/hyperv/mshv-asm-offsets.c new file mode 100644 index 000000000000..882c1db6df16 --- /dev/null +++ b/arch/x86/hyperv/mshv-asm-offsets.c @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Generate definitions needed by assembly language modules. + * This code generates raw asm output which is post-processed to extract + * and format the required data. + * + * Copyright (c) 2025, Microsoft Corporation. + * + * Author: + * Naman Jain + */ +#define COMPILE_OFFSETS + +#include +#include + +static void __used common(void) +{ + if (IS_ENABLED(CONFIG_HYPERV_VTL_MODE)) { + OFFSET(MSHV_VTL_CPU_CONTEXT_rax, mshv_vtl_cpu_context, rax); + OFFSET(MSHV_VTL_CPU_CONTEXT_rcx, mshv_vtl_cpu_context, rcx); + OFFSET(MSHV_VTL_CPU_CONTEXT_rdx, mshv_vtl_cpu_context, rdx); + OFFSET(MSHV_VTL_CPU_CONTEXT_rbx, mshv_vtl_cpu_context, rbx); + OFFSET(MSHV_VTL_CPU_CONTEXT_rbp, mshv_vtl_cpu_context, rbp); + OFFSET(MSHV_VTL_CPU_CONTEXT_rsi, mshv_vtl_cpu_context, rsi); + OFFSET(MSHV_VTL_CPU_CONTEXT_rdi, mshv_vtl_cpu_context, rdi); + OFFSET(MSHV_VTL_CPU_CONTEXT_r8, mshv_vtl_cpu_context, r8); + OFFSET(MSHV_VTL_CPU_CONTEXT_r9, mshv_vtl_cpu_context, r9); + OFFSET(MSHV_VTL_CPU_CONTEXT_r10, mshv_vtl_cpu_context, r10); + OFFSET(MSHV_VTL_CPU_CONTEXT_r11, mshv_vtl_cpu_context, r11); + OFFSET(MSHV_VTL_CPU_CONTEXT_r12, mshv_vtl_cpu_context, r12); + OFFSET(MSHV_VTL_CPU_CONTEXT_r13, mshv_vtl_cpu_context, r13); + OFFSET(MSHV_VTL_CPU_CONTEXT_r14, mshv_vtl_cpu_context, r14); + OFFSET(MSHV_VTL_CPU_CONTEXT_r15, mshv_vtl_cpu_context, r15); + OFFSET(MSHV_VTL_CPU_CONTEXT_cr2, mshv_vtl_cpu_context, cr2); + } +} diff --git a/arch/x86/hyperv/mshv_vtl_asm.S b/arch/x86/hyperv/mshv_vtl_asm.S new file mode 100644 index 000000000000..f595eefad9ab --- /dev/null +++ b/arch/x86/hyperv/mshv_vtl_asm.S @@ -0,0 +1,116 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Assembly level code for mshv_vtl VTL transition + * + * Copyright (c) 2025, Microsoft Corporation. + * + * Author: + * Naman Jain + */ + +#include +#include +#include +#include +#include +#include "mshv-asm-offsets.h" + + .text + .section .noinstr.text, "ax" +/* + * void __mshv_vtl_return_call(struct mshv_vtl_cpu_context *vtl0) + * + * This function is used to context switch between different Virtual Trust Levels. + * It is marked as 'noinstr' to prevent against instrumentation and debugging facilities. + * NMIs aren't a problem because the NMI handler saves/restores CR2 specifically to guard + * against #PFs in NMI context clobbering the guest state. + */ +SYM_FUNC_START(__mshv_vtl_return_call) + /* Push callee save registers */ + pushq %rbp + mov %rsp, %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushq %rbx + + /* register switch to VTL0 clobbers all registers except rax/rcx */ + mov %_ASM_ARG1, %rax + + /* grab rbx/rbp/rsi/rdi/r8-r15 */ + mov MSHV_VTL_CPU_CONTEXT_rbx(%rax), %rbx + mov MSHV_VTL_CPU_CONTEXT_rbp(%rax), %rbp + mov MSHV_VTL_CPU_CONTEXT_rsi(%rax), %rsi + mov MSHV_VTL_CPU_CONTEXT_rdi(%rax), %rdi + mov MSHV_VTL_CPU_CONTEXT_r8(%rax), %r8 + mov MSHV_VTL_CPU_CONTEXT_r9(%rax), %r9 + mov MSHV_VTL_CPU_CONTEXT_r10(%rax), %r10 + mov MSHV_VTL_CPU_CONTEXT_r11(%rax), %r11 + mov MSHV_VTL_CPU_CONTEXT_r12(%rax), %r12 + mov MSHV_VTL_CPU_CONTEXT_r13(%rax), %r13 + mov MSHV_VTL_CPU_CONTEXT_r14(%rax), %r14 + mov MSHV_VTL_CPU_CONTEXT_r15(%rax), %r15 + + mov MSHV_VTL_CPU_CONTEXT_cr2(%rax), %rdx + mov %rdx, %cr2 + mov MSHV_VTL_CPU_CONTEXT_rdx(%rax), %rdx + + /* stash host registers on stack */ + pushq %rax + pushq %rcx + + xor %ecx, %ecx + + /* make a hypercall to switch VTL */ + call STATIC_CALL_TRAMP_STR(__mshv_vtl_return_hypercall) + + /* stash guest registers on stack, restore saved host copies */ + pushq %rax + pushq %rcx + mov 16(%rsp), %rcx + mov 24(%rsp), %rax + + mov %rdx, MSHV_VTL_CPU_CONTEXT_rdx(%rax) + mov %cr2, %rdx + mov %rdx, MSHV_VTL_CPU_CONTEXT_cr2(%rax) + pop MSHV_VTL_CPU_CONTEXT_rcx(%rax) + pop MSHV_VTL_CPU_CONTEXT_rax(%rax) + add $16, %rsp + + /* save rbx/rbp/rsi/rdi/r8-r15 */ + mov %rbx, MSHV_VTL_CPU_CONTEXT_rbx(%rax) + mov %rbp, MSHV_VTL_CPU_CONTEXT_rbp(%rax) + mov %rsi, MSHV_VTL_CPU_CONTEXT_rsi(%rax) + mov %rdi, MSHV_VTL_CPU_CONTEXT_rdi(%rax) + mov %r8, MSHV_VTL_CPU_CONTEXT_r8(%rax) + mov %r9, MSHV_VTL_CPU_CONTEXT_r9(%rax) + mov %r10, MSHV_VTL_CPU_CONTEXT_r10(%rax) + mov %r11, MSHV_VTL_CPU_CONTEXT_r11(%rax) + mov %r12, MSHV_VTL_CPU_CONTEXT_r12(%rax) + mov %r13, MSHV_VTL_CPU_CONTEXT_r13(%rax) + mov %r14, MSHV_VTL_CPU_CONTEXT_r14(%rax) + mov %r15, MSHV_VTL_CPU_CONTEXT_r15(%rax) + + /* pop callee-save registers r12-r15, rbx */ + pop %rbx + pop %r15 + pop %r14 + pop %r13 + pop %r12 + + pop %rbp + RET +SYM_FUNC_END(__mshv_vtl_return_call) +/* + * Make sure that static_call_key symbol: __SCK____mshv_vtl_return_hypercall is accessible here. + * Below code is inspired from __ADDRESSABLE(sym) macro. Symbol name is kept simple, to avoid + * naming it something like "__UNIQUE_ID_addressable___SCK____mshv_vtl_return_hypercall_662.0" + * which would otherwise have been generated by the macro. + */ + .section .discard.addressable,"aw" + .align 8 + .type mshv_vtl_return_sym, @object + .size mshv_vtl_return_sym, 8 +mshv_vtl_return_sym: + .quad __SCK____mshv_vtl_return_hypercall diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 1342d55c2545..10037125099a 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -11,6 +11,7 @@ #include #include #include +#include /* * Hyper-V always provides a single IO-APIC at this MMIO address. @@ -269,13 +270,46 @@ static inline u64 hv_get_non_nested_msr(unsigned int reg) { return 0; } static inline int hv_apicid_to_vp_index(u32 apic_id) { return -EINVAL; } #endif /* CONFIG_HYPERV */ +struct mshv_vtl_cpu_context { + union { + struct { + u64 rax; + u64 rcx; + u64 rdx; + u64 rbx; + u64 cr2; + u64 rbp; + u64 rsi; + u64 rdi; + u64 r8; + u64 r9; + u64 r10; + u64 r11; + u64 r12; + u64 r13; + u64 r14; + u64 r15; + }; + u64 gp_regs[16]; + }; + + struct fxregs_state fx_state; +}; #ifdef CONFIG_HYPERV_VTL_MODE void __init hv_vtl_init_platform(void); int __init hv_vtl_early_init(void); +void mshv_vtl_return_call(struct mshv_vtl_cpu_context *vtl0); +void mshv_vtl_return_call_init(u64 vtl_return_offset); +void mshv_vtl_return_hypercall(void); +void __mshv_vtl_return_call(struct mshv_vtl_cpu_context *vtl0); #else static inline void __init hv_vtl_init_platform(void) {} static inline int __init hv_vtl_early_init(void) { return 0; } +static inline void mshv_vtl_return_call(struct mshv_vtl_cpu_context *vtl0) {} +static inline void mshv_vtl_return_call_init(u64 vtl_return_offset) {} +static inline void mshv_vtl_return_hypercall(void) {} +static inline void __mshv_vtl_return_call(struct mshv_vtl_cpu_context *vtl0) {} #endif #include diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig index 0b8c391a0342..d4a8d349200c 100644 --- a/drivers/hv/Kconfig +++ b/drivers/hv/Kconfig @@ -17,7 +17,8 @@ config HYPERV config HYPERV_VTL_MODE bool "Enable Linux to boot in VTL context" - depends on (X86_64 || ARM64) && HYPERV + depends on (X86_64 && HAVE_STATIC_CALL) || ARM64 + depends on HYPERV depends on SMP default n help @@ -82,4 +83,28 @@ config MSHV_ROOT If unsure, say N. +config MSHV_VTL + tristate "Microsoft Hyper-V VTL driver" + depends on X86_64 && HYPERV_VTL_MODE + depends on HYPERV_VMBUS + # Mapping VTL0 memory to a userspace process in VTL2 is supported in OpenHCL. + # VTL2 for OpenHCL makes use of Huge Pages to improve performance on VMs, + # specially with large memory requirements. + depends on TRANSPARENT_HUGEPAGE + # MTRRs are controlled by VTL0, and are not specific to individual VTLs. + # Therefore, do not attempt to access or modify MTRRs here. + depends on !MTRR + select CPUMASK_OFFSTACK + select VIRT_XFER_TO_GUEST_WORK + default n + help + Select this option to enable Hyper-V VTL driver support. + This driver provides interfaces for Virtual Machine Manager (VMM) running in VTL2 + userspace to create VTLs and partitions, setup and manage VTL0 memory and + allow userspace to make direct hypercalls. This also allows to map VTL0's address + space to a usermode process in VTL2 and supports getting new VMBus messages and channel + events in VTL2. + + If unsure, say N. + endmenu diff --git a/drivers/hv/Makefile b/drivers/hv/Makefile index 1a1677bf4dac..6d929fb0e13d 100644 --- a/drivers/hv/Makefile +++ b/drivers/hv/Makefile @@ -3,6 +3,7 @@ obj-$(CONFIG_HYPERV_VMBUS) += hv_vmbus.o obj-$(CONFIG_HYPERV_UTILS) += hv_utils.o obj-$(CONFIG_HYPERV_BALLOON) += hv_balloon.o obj-$(CONFIG_MSHV_ROOT) += mshv_root.o +obj-$(CONFIG_MSHV_VTL) += mshv_vtl.o CFLAGS_hv_trace.o = -I$(src) CFLAGS_hv_balloon.o = -I$(src) @@ -14,7 +15,11 @@ hv_vmbus-$(CONFIG_HYPERV_TESTING) += hv_debugfs.o hv_utils-y := hv_util.o hv_kvp.o hv_snapshot.o hv_utils_transport.o mshv_root-y := mshv_root_main.o mshv_synic.o mshv_eventfd.o mshv_irq.o \ mshv_root_hv_call.o mshv_portid_table.o +mshv_vtl-y := mshv_vtl_main.o # Code that must be built-in obj-$(CONFIG_HYPERV) += hv_common.o -obj-$(subst m,y,$(CONFIG_MSHV_ROOT)) += hv_proc.o mshv_common.o +obj-$(subst m,y,$(CONFIG_MSHV_ROOT)) += hv_proc.o +ifneq ($(CONFIG_MSHV_ROOT)$(CONFIG_MSHV_VTL),) + obj-y += mshv_common.o +endif diff --git a/drivers/hv/mshv_vtl.h b/drivers/hv/mshv_vtl.h new file mode 100644 index 000000000000..a6eea52f7aa2 --- /dev/null +++ b/drivers/hv/mshv_vtl.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _MSHV_VTL_H +#define _MSHV_VTL_H + +#include +#include + +struct mshv_vtl_run { + u32 cancel; + u32 vtl_ret_action_size; + u32 pad[2]; + char exit_message[MSHV_MAX_RUN_MSG_SIZE]; + union { + struct mshv_vtl_cpu_context cpu_context; + + /* + * Reserving room for the cpu context to grow and to maintain compatibility + * with user mode. + */ + char reserved[1024]; + }; + char vtl_ret_actions[MSHV_MAX_RUN_MSG_SIZE]; +}; + +#endif /* _MSHV_VTL_H */ diff --git a/drivers/hv/mshv_vtl_main.c b/drivers/hv/mshv_vtl_main.c new file mode 100644 index 000000000000..2cebe9de5a5a --- /dev/null +++ b/drivers/hv/mshv_vtl_main.c @@ -0,0 +1,1392 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2023, Microsoft Corporation. + * + * Author: + * Roman Kisel + * Saurabh Sengar + * Naman Jain + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../../kernel/fpu/legacy.h" +#include "mshv.h" +#include "mshv_vtl.h" +#include "hyperv_vmbus.h" + +MODULE_AUTHOR("Microsoft"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Microsoft Hyper-V VTL Driver"); + +#define MSHV_ENTRY_REASON_LOWER_VTL_CALL 0x1 +#define MSHV_ENTRY_REASON_INTERRUPT 0x2 +#define MSHV_ENTRY_REASON_INTERCEPT 0x3 + +#define MSHV_REAL_OFF_SHIFT 16 +#define MSHV_PG_OFF_CPU_MASK (BIT_ULL(MSHV_REAL_OFF_SHIFT) - 1) +#define MSHV_RUN_PAGE_OFFSET 0 +#define MSHV_REG_PAGE_OFFSET 1 +#define VTL2_VMBUS_SINT_INDEX 7 + +static struct device *mem_dev; + +static struct tasklet_struct msg_dpc; +static wait_queue_head_t fd_wait_queue; +static bool has_message; +static struct eventfd_ctx *flag_eventfds[HV_EVENT_FLAGS_COUNT]; +static DEFINE_MUTEX(flag_lock); +static bool __read_mostly mshv_has_reg_page; + +/* hvcall code is of type u16, allocate a bitmap of size (1 << 16) to accommodate it */ +#define MAX_BITMAP_SIZE ((U16_MAX + 1) / 8) + +struct mshv_vtl_hvcall_fd { + u8 allow_bitmap[MAX_BITMAP_SIZE]; + bool allow_map_initialized; + /* + * Used to protect hvcall setup in IOCTLs + */ + struct mutex init_mutex; + struct miscdevice *dev; +}; + +struct mshv_vtl_poll_file { + struct file *file; + wait_queue_entry_t wait; + wait_queue_head_t *wqh; + poll_table pt; + int cpu; +}; + +struct mshv_vtl { + struct device *module_dev; + u64 id; +}; + +struct mshv_vtl_per_cpu { + struct mshv_vtl_run *run; + struct page *reg_page; +}; + +/* SYNIC_OVERLAY_PAGE_MSR - internal, identical to hv_synic_simp */ +union hv_synic_overlay_page_msr { + u64 as_uint64; + struct { + u64 enabled: 1; + u64 reserved: 11; + u64 pfn: 52; + } __packed; +}; + +static struct mutex mshv_vtl_poll_file_lock; +static union hv_register_vsm_page_offsets mshv_vsm_page_offsets; +static union hv_register_vsm_capabilities mshv_vsm_capabilities; + +static DEFINE_PER_CPU(struct mshv_vtl_poll_file, mshv_vtl_poll_file); +static DEFINE_PER_CPU(unsigned long long, num_vtl0_transitions); +static DEFINE_PER_CPU(struct mshv_vtl_per_cpu, mshv_vtl_per_cpu); + +static const union hv_input_vtl input_vtl_zero; +static const union hv_input_vtl input_vtl_normal = { + .use_target_vtl = 1, +}; + +static const struct file_operations mshv_vtl_fops; + +static long +mshv_ioctl_create_vtl(void __user *user_arg, struct device *module_dev) +{ + struct mshv_vtl *vtl; + struct file *file; + int fd; + + vtl = kzalloc(sizeof(*vtl), GFP_KERNEL); + if (!vtl) + return -ENOMEM; + + fd = get_unused_fd_flags(O_CLOEXEC); + if (fd < 0) { + kfree(vtl); + return fd; + } + file = anon_inode_getfile("mshv_vtl", &mshv_vtl_fops, + vtl, O_RDWR); + if (IS_ERR(file)) { + kfree(vtl); + return PTR_ERR(file); + } + vtl->module_dev = module_dev; + fd_install(fd, file); + + return fd; +} + +static long +mshv_ioctl_check_extension(void __user *user_arg) +{ + u32 arg; + + if (copy_from_user(&arg, user_arg, sizeof(arg))) + return -EFAULT; + + switch (arg) { + case MSHV_CAP_CORE_API_STABLE: + return 0; + case MSHV_CAP_REGISTER_PAGE: + return mshv_has_reg_page; + case MSHV_CAP_VTL_RETURN_ACTION: + return mshv_vsm_capabilities.return_action_available; + case MSHV_CAP_DR6_SHARED: + return mshv_vsm_capabilities.dr6_shared; + } + + return -EOPNOTSUPP; +} + +static long +mshv_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) +{ + struct miscdevice *misc = filp->private_data; + + switch (ioctl) { + case MSHV_CHECK_EXTENSION: + return mshv_ioctl_check_extension((void __user *)arg); + case MSHV_CREATE_VTL: + return mshv_ioctl_create_vtl((void __user *)arg, misc->this_device); + } + + return -ENOTTY; +} + +static const struct file_operations mshv_dev_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = mshv_dev_ioctl, + .llseek = noop_llseek, +}; + +static struct miscdevice mshv_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "mshv", + .fops = &mshv_dev_fops, + .mode = 0600, +}; + +static struct mshv_vtl_run *mshv_vtl_this_run(void) +{ + return *this_cpu_ptr(&mshv_vtl_per_cpu.run); +} + +static struct mshv_vtl_run *mshv_vtl_cpu_run(int cpu) +{ + return *per_cpu_ptr(&mshv_vtl_per_cpu.run, cpu); +} + +static struct page *mshv_vtl_cpu_reg_page(int cpu) +{ + return *per_cpu_ptr(&mshv_vtl_per_cpu.reg_page, cpu); +} + +static void mshv_vtl_configure_reg_page(struct mshv_vtl_per_cpu *per_cpu) +{ + struct hv_register_assoc reg_assoc = {}; + union hv_synic_overlay_page_msr overlay = {}; + struct page *reg_page; + + reg_page = alloc_page(GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL); + if (!reg_page) { + WARN(1, "failed to allocate register page\n"); + return; + } + + overlay.enabled = 1; + overlay.pfn = page_to_hvpfn(reg_page); + reg_assoc.name = HV_X64_REGISTER_REG_PAGE; + reg_assoc.value.reg64 = overlay.as_uint64; + + if (hv_call_set_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF, + 1, input_vtl_zero, ®_assoc)) { + WARN(1, "failed to setup register page\n"); + __free_page(reg_page); + return; + } + + per_cpu->reg_page = reg_page; + mshv_has_reg_page = true; +} + +static void mshv_vtl_synic_enable_regs(unsigned int cpu) +{ + union hv_synic_sint sint; + + sint.as_uint64 = 0; + sint.vector = HYPERVISOR_CALLBACK_VECTOR; + sint.masked = false; + sint.auto_eoi = hv_recommend_using_aeoi(); + + /* Enable intercepts */ + if (!mshv_vsm_capabilities.intercept_page_available) + hv_set_msr(HV_MSR_SINT0 + HV_SYNIC_INTERCEPTION_SINT_INDEX, + sint.as_uint64); + + /* VTL2 Host VSP SINT is (un)masked when the user mode requests that */ +} + +static int mshv_vtl_get_vsm_regs(void) +{ + struct hv_register_assoc registers[2]; + int ret, count = 2; + + registers[0].name = HV_REGISTER_VSM_CODE_PAGE_OFFSETS; + registers[1].name = HV_REGISTER_VSM_CAPABILITIES; + + ret = hv_call_get_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF, + count, input_vtl_zero, registers); + if (ret) + return ret; + + mshv_vsm_page_offsets.as_uint64 = registers[0].value.reg64; + mshv_vsm_capabilities.as_uint64 = registers[1].value.reg64; + + return ret; +} + +static int mshv_vtl_configure_vsm_partition(struct device *dev) +{ + union hv_register_vsm_partition_config config; + struct hv_register_assoc reg_assoc; + + config.as_uint64 = 0; + config.default_vtl_protection_mask = HV_MAP_GPA_PERMISSIONS_MASK; + config.enable_vtl_protection = 1; + config.zero_memory_on_reset = 1; + config.intercept_vp_startup = 1; + config.intercept_cpuid_unimplemented = 1; + + if (mshv_vsm_capabilities.intercept_page_available) { + dev_dbg(dev, "using intercept page\n"); + config.intercept_page = 1; + } + + reg_assoc.name = HV_REGISTER_VSM_PARTITION_CONFIG; + reg_assoc.value.reg64 = config.as_uint64; + + return hv_call_set_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF, + 1, input_vtl_zero, ®_assoc); +} + +static void mshv_vtl_vmbus_isr(void) +{ + struct hv_per_cpu_context *per_cpu; + struct hv_message *msg; + u32 message_type; + union hv_synic_event_flags *event_flags; + struct eventfd_ctx *eventfd; + u16 i; + + per_cpu = this_cpu_ptr(hv_context.cpu_context); + if (smp_processor_id() == 0) { + msg = (struct hv_message *)per_cpu->hyp_synic_message_page + VTL2_VMBUS_SINT_INDEX; + message_type = READ_ONCE(msg->header.message_type); + if (message_type != HVMSG_NONE) + tasklet_schedule(&msg_dpc); + } + + event_flags = (union hv_synic_event_flags *)per_cpu->hyp_synic_event_page + + VTL2_VMBUS_SINT_INDEX; + for_each_set_bit(i, event_flags->flags, HV_EVENT_FLAGS_COUNT) { + if (!sync_test_and_clear_bit(i, event_flags->flags)) + continue; + rcu_read_lock(); + eventfd = READ_ONCE(flag_eventfds[i]); + if (eventfd) + eventfd_signal(eventfd); + rcu_read_unlock(); + } + + vmbus_isr(); +} + +static int mshv_vtl_alloc_context(unsigned int cpu) +{ + struct mshv_vtl_per_cpu *per_cpu = this_cpu_ptr(&mshv_vtl_per_cpu); + + per_cpu->run = (struct mshv_vtl_run *)__get_free_page(GFP_KERNEL | __GFP_ZERO); + if (!per_cpu->run) + return -ENOMEM; + + if (mshv_vsm_capabilities.intercept_page_available) + mshv_vtl_configure_reg_page(per_cpu); + + mshv_vtl_synic_enable_regs(cpu); + + return 0; +} + +static int mshv_vtl_cpuhp_online; + +static int hv_vtl_setup_synic(void) +{ + int ret; + + /* Use our isr to first filter out packets destined for userspace */ + hv_setup_vmbus_handler(mshv_vtl_vmbus_isr); + + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "hyperv/vtl:online", + mshv_vtl_alloc_context, NULL); + if (ret < 0) { + hv_setup_vmbus_handler(vmbus_isr); + return ret; + } + + mshv_vtl_cpuhp_online = ret; + + return 0; +} + +static void hv_vtl_remove_synic(void) +{ + cpuhp_remove_state(mshv_vtl_cpuhp_online); + hv_setup_vmbus_handler(vmbus_isr); +} + +static int vtl_get_vp_register(struct hv_register_assoc *reg) +{ + return hv_call_get_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF, + 1, input_vtl_normal, reg); +} + +static int vtl_set_vp_register(struct hv_register_assoc *reg) +{ + return hv_call_set_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF, + 1, input_vtl_normal, reg); +} + +static int mshv_vtl_ioctl_add_vtl0_mem(struct mshv_vtl *vtl, void __user *arg) +{ + struct mshv_vtl_ram_disposition vtl0_mem; + struct dev_pagemap *pgmap; + void *addr; + + if (copy_from_user(&vtl0_mem, arg, sizeof(vtl0_mem))) + return -EFAULT; + /* vtl0_mem.last_pfn is excluded in the pagemap range for VTL0 as per design */ + if (vtl0_mem.last_pfn <= vtl0_mem.start_pfn) { + dev_err(vtl->module_dev, "range start pfn (%llx) > end pfn (%llx)\n", + vtl0_mem.start_pfn, vtl0_mem.last_pfn); + return -EFAULT; + } + + pgmap = kzalloc(sizeof(*pgmap), GFP_KERNEL); + if (!pgmap) + return -ENOMEM; + + pgmap->ranges[0].start = PFN_PHYS(vtl0_mem.start_pfn); + pgmap->ranges[0].end = PFN_PHYS(vtl0_mem.last_pfn) - 1; + pgmap->nr_range = 1; + pgmap->type = MEMORY_DEVICE_GENERIC; + + /* + * Determine the highest page order that can be used for the given memory range. + * This works best when the range is aligned; i.e. both the start and the length. + */ + pgmap->vmemmap_shift = count_trailing_zeros(vtl0_mem.start_pfn | vtl0_mem.last_pfn); + dev_dbg(vtl->module_dev, + "Add VTL0 memory: start: 0x%llx, end_pfn: 0x%llx, page order: %lu\n", + vtl0_mem.start_pfn, vtl0_mem.last_pfn, pgmap->vmemmap_shift); + + addr = devm_memremap_pages(mem_dev, pgmap); + if (IS_ERR(addr)) { + dev_err(vtl->module_dev, "devm_memremap_pages error: %ld\n", PTR_ERR(addr)); + kfree(pgmap); + return -EFAULT; + } + + /* Don't free pgmap, since it has to stick around until the memory + * is unmapped, which will never happen as there is no scenario + * where VTL0 can be released/shutdown without bringing down VTL2. + */ + return 0; +} + +static void mshv_vtl_cancel(int cpu) +{ + int here = get_cpu(); + + if (here != cpu) { + if (!xchg_relaxed(&mshv_vtl_cpu_run(cpu)->cancel, 1)) + smp_send_reschedule(cpu); + } else { + WRITE_ONCE(mshv_vtl_this_run()->cancel, 1); + } + put_cpu(); +} + +static int mshv_vtl_poll_file_wake(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key) +{ + struct mshv_vtl_poll_file *poll_file = container_of(wait, struct mshv_vtl_poll_file, wait); + + mshv_vtl_cancel(poll_file->cpu); + + return 0; +} + +static void mshv_vtl_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh, poll_table *pt) +{ + struct mshv_vtl_poll_file *poll_file = container_of(pt, struct mshv_vtl_poll_file, pt); + + WARN_ON(poll_file->wqh); + poll_file->wqh = wqh; + add_wait_queue(wqh, &poll_file->wait); +} + +static int mshv_vtl_ioctl_set_poll_file(struct mshv_vtl_set_poll_file __user *user_input) +{ + struct file *file, *old_file; + struct mshv_vtl_poll_file *poll_file; + struct mshv_vtl_set_poll_file input; + + if (copy_from_user(&input, user_input, sizeof(input))) + return -EFAULT; + + if (input.cpu >= num_possible_cpus() || !cpu_online(input.cpu)) + return -EINVAL; + /* + * CPU Hotplug is not supported in VTL2 in OpenHCL, where this kernel driver exists. + * CPU is expected to remain online after above cpu_online() check. + */ + + file = NULL; + file = fget(input.fd); + if (!file) + return -EBADFD; + + poll_file = per_cpu_ptr(&mshv_vtl_poll_file, READ_ONCE(input.cpu)); + if (!poll_file) + return -EINVAL; + + mutex_lock(&mshv_vtl_poll_file_lock); + + if (poll_file->wqh) + remove_wait_queue(poll_file->wqh, &poll_file->wait); + poll_file->wqh = NULL; + + old_file = poll_file->file; + poll_file->file = file; + poll_file->cpu = input.cpu; + + if (file) { + init_waitqueue_func_entry(&poll_file->wait, mshv_vtl_poll_file_wake); + init_poll_funcptr(&poll_file->pt, mshv_vtl_ptable_queue_proc); + vfs_poll(file, &poll_file->pt); + } + + mutex_unlock(&mshv_vtl_poll_file_lock); + + if (old_file) + fput(old_file); + + return 0; +} + +/* Static table mapping register names to their corresponding actions */ +static const struct { + enum hv_register_name reg_name; + int debug_reg_num; /* -1 if not a debug register */ + u32 msr_addr; /* 0 if not an MSR */ +} reg_table[] = { + /* Debug registers */ + {HV_X64_REGISTER_DR0, 0, 0}, + {HV_X64_REGISTER_DR1, 1, 0}, + {HV_X64_REGISTER_DR2, 2, 0}, + {HV_X64_REGISTER_DR3, 3, 0}, + {HV_X64_REGISTER_DR6, 6, 0}, + /* MTRR MSRs */ + {HV_X64_REGISTER_MSR_MTRR_CAP, -1, MSR_MTRRcap}, + {HV_X64_REGISTER_MSR_MTRR_DEF_TYPE, -1, MSR_MTRRdefType}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE0, -1, MTRRphysBase_MSR(0)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE1, -1, MTRRphysBase_MSR(1)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE2, -1, MTRRphysBase_MSR(2)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE3, -1, MTRRphysBase_MSR(3)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE4, -1, MTRRphysBase_MSR(4)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE5, -1, MTRRphysBase_MSR(5)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE6, -1, MTRRphysBase_MSR(6)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE7, -1, MTRRphysBase_MSR(7)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE8, -1, MTRRphysBase_MSR(8)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE9, -1, MTRRphysBase_MSR(9)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASEA, -1, MTRRphysBase_MSR(0xa)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASEB, -1, MTRRphysBase_MSR(0xb)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASEC, -1, MTRRphysBase_MSR(0xc)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASED, -1, MTRRphysBase_MSR(0xd)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASEE, -1, MTRRphysBase_MSR(0xe)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASEF, -1, MTRRphysBase_MSR(0xf)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK0, -1, MTRRphysMask_MSR(0)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK1, -1, MTRRphysMask_MSR(1)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK2, -1, MTRRphysMask_MSR(2)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK3, -1, MTRRphysMask_MSR(3)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK4, -1, MTRRphysMask_MSR(4)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK5, -1, MTRRphysMask_MSR(5)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK6, -1, MTRRphysMask_MSR(6)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK7, -1, MTRRphysMask_MSR(7)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK8, -1, MTRRphysMask_MSR(8)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK9, -1, MTRRphysMask_MSR(9)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASKA, -1, MTRRphysMask_MSR(0xa)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASKB, -1, MTRRphysMask_MSR(0xb)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASKC, -1, MTRRphysMask_MSR(0xc)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASKD, -1, MTRRphysMask_MSR(0xd)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASKE, -1, MTRRphysMask_MSR(0xe)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASKF, -1, MTRRphysMask_MSR(0xf)}, + {HV_X64_REGISTER_MSR_MTRR_FIX64K00000, -1, MSR_MTRRfix64K_00000}, + {HV_X64_REGISTER_MSR_MTRR_FIX16K80000, -1, MSR_MTRRfix16K_80000}, + {HV_X64_REGISTER_MSR_MTRR_FIX16KA0000, -1, MSR_MTRRfix16K_A0000}, + {HV_X64_REGISTER_MSR_MTRR_FIX4KC0000, -1, MSR_MTRRfix4K_C0000}, + {HV_X64_REGISTER_MSR_MTRR_FIX4KC8000, -1, MSR_MTRRfix4K_C8000}, + {HV_X64_REGISTER_MSR_MTRR_FIX4KD0000, -1, MSR_MTRRfix4K_D0000}, + {HV_X64_REGISTER_MSR_MTRR_FIX4KD8000, -1, MSR_MTRRfix4K_D8000}, + {HV_X64_REGISTER_MSR_MTRR_FIX4KE0000, -1, MSR_MTRRfix4K_E0000}, + {HV_X64_REGISTER_MSR_MTRR_FIX4KE8000, -1, MSR_MTRRfix4K_E8000}, + {HV_X64_REGISTER_MSR_MTRR_FIX4KF0000, -1, MSR_MTRRfix4K_F0000}, + {HV_X64_REGISTER_MSR_MTRR_FIX4KF8000, -1, MSR_MTRRfix4K_F8000}, +}; + +static int mshv_vtl_get_set_reg(struct hv_register_assoc *regs, bool set) +{ + u64 *reg64; + enum hv_register_name gpr_name; + int i; + + gpr_name = regs->name; + reg64 = ®s->value.reg64; + + /* Search for the register in the table */ + for (i = 0; i < ARRAY_SIZE(reg_table); i++) { + if (reg_table[i].reg_name != gpr_name) + continue; + if (reg_table[i].debug_reg_num != -1) { + /* Handle debug registers */ + if (gpr_name == HV_X64_REGISTER_DR6 && + !mshv_vsm_capabilities.dr6_shared) + goto hypercall; + if (set) + native_set_debugreg(reg_table[i].debug_reg_num, *reg64); + else + *reg64 = native_get_debugreg(reg_table[i].debug_reg_num); + } else { + /* Handle MSRs */ + if (set) + wrmsrl(reg_table[i].msr_addr, *reg64); + else + rdmsrl(reg_table[i].msr_addr, *reg64); + } + return 0; + } + +hypercall: + return 1; +} + +static void mshv_vtl_return(struct mshv_vtl_cpu_context *vtl0) +{ + struct hv_vp_assist_page *hvp; + + hvp = hv_vp_assist_page[smp_processor_id()]; + + /* + * Process signal event direct set in the run page, if any. + */ + if (mshv_vsm_capabilities.return_action_available) { + u32 offset = READ_ONCE(mshv_vtl_this_run()->vtl_ret_action_size); + + WRITE_ONCE(mshv_vtl_this_run()->vtl_ret_action_size, 0); + + /* + * Hypervisor will take care of clearing out the actions + * set in the assist page. + */ + memcpy(hvp->vtl_ret_actions, + mshv_vtl_this_run()->vtl_ret_actions, + min_t(u32, offset, sizeof(hvp->vtl_ret_actions))); + } + + mshv_vtl_return_call(vtl0); +} + +static bool mshv_vtl_process_intercept(void) +{ + struct hv_per_cpu_context *mshv_cpu; + void *synic_message_page; + struct hv_message *msg; + u32 message_type; + + mshv_cpu = this_cpu_ptr(hv_context.cpu_context); + synic_message_page = mshv_cpu->hyp_synic_message_page; + if (unlikely(!synic_message_page)) + return true; + + msg = (struct hv_message *)synic_message_page + HV_SYNIC_INTERCEPTION_SINT_INDEX; + message_type = READ_ONCE(msg->header.message_type); + if (message_type == HVMSG_NONE) + return true; + + memcpy(mshv_vtl_this_run()->exit_message, msg, sizeof(*msg)); + vmbus_signal_eom(msg, message_type); + + return false; +} + +static int mshv_vtl_ioctl_return_to_lower_vtl(void) +{ + preempt_disable(); + for (;;) { + unsigned long irq_flags; + struct hv_vp_assist_page *hvp; + int ret; + + if (__xfer_to_guest_mode_work_pending()) { + preempt_enable(); + ret = xfer_to_guest_mode_handle_work(); + if (ret) + return ret; + preempt_disable(); + } + + local_irq_save(irq_flags); + if (READ_ONCE(mshv_vtl_this_run()->cancel)) { + local_irq_restore(irq_flags); + preempt_enable(); + return -EINTR; + } + + mshv_vtl_return(&mshv_vtl_this_run()->cpu_context); + local_irq_restore(irq_flags); + + hvp = hv_vp_assist_page[smp_processor_id()]; + this_cpu_inc(num_vtl0_transitions); + switch (hvp->vtl_entry_reason) { + case MSHV_ENTRY_REASON_INTERRUPT: + if (!mshv_vsm_capabilities.intercept_page_available && + likely(!mshv_vtl_process_intercept())) + goto done; + break; + + case MSHV_ENTRY_REASON_INTERCEPT: + WARN_ON(!mshv_vsm_capabilities.intercept_page_available); + memcpy(mshv_vtl_this_run()->exit_message, hvp->intercept_message, + sizeof(hvp->intercept_message)); + goto done; + + default: + panic("unknown entry reason: %d", hvp->vtl_entry_reason); + } + } + +done: + preempt_enable(); + + return 0; +} + +static long +mshv_vtl_ioctl_get_regs(void __user *user_args) +{ + struct mshv_vp_registers args; + struct hv_register_assoc reg; + long ret; + + if (copy_from_user(&args, user_args, sizeof(args))) + return -EFAULT; + + /* This IOCTL supports processing only one register at a time. */ + if (args.count != 1) + return -EINVAL; + + if (copy_from_user(®, (void __user *)args.regs_ptr, + sizeof(reg))) + return -EFAULT; + + ret = mshv_vtl_get_set_reg(®, false); + if (!ret) + goto copy_args; /* No need of hypercall */ + ret = vtl_get_vp_register(®); + if (ret) + return ret; + +copy_args: + if (copy_to_user((void __user *)args.regs_ptr, ®, sizeof(reg))) + ret = -EFAULT; + + return ret; +} + +static long +mshv_vtl_ioctl_set_regs(void __user *user_args) +{ + struct mshv_vp_registers args; + struct hv_register_assoc reg; + long ret; + + if (copy_from_user(&args, user_args, sizeof(args))) + return -EFAULT; + + /* This IOCTL supports processing only one register at a time. */ + if (args.count != 1) + return -EINVAL; + + if (copy_from_user(®, (void __user *)args.regs_ptr, sizeof(reg))) + return -EFAULT; + + ret = mshv_vtl_get_set_reg(®, true); + if (!ret) + return ret; /* No need of hypercall */ + ret = vtl_set_vp_register(®); + + return ret; +} + +static long +mshv_vtl_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) +{ + long ret; + struct mshv_vtl *vtl = filp->private_data; + + switch (ioctl) { + case MSHV_SET_POLL_FILE: + ret = mshv_vtl_ioctl_set_poll_file((struct mshv_vtl_set_poll_file __user *)arg); + break; + case MSHV_GET_VP_REGISTERS: + ret = mshv_vtl_ioctl_get_regs((void __user *)arg); + break; + case MSHV_SET_VP_REGISTERS: + ret = mshv_vtl_ioctl_set_regs((void __user *)arg); + break; + case MSHV_RETURN_TO_LOWER_VTL: + ret = mshv_vtl_ioctl_return_to_lower_vtl(); + break; + case MSHV_ADD_VTL0_MEMORY: + ret = mshv_vtl_ioctl_add_vtl0_mem(vtl, (void __user *)arg); + break; + default: + dev_err(vtl->module_dev, "invalid vtl ioctl: %#x\n", ioctl); + ret = -ENOTTY; + } + + return ret; +} + +static vm_fault_t mshv_vtl_fault(struct vm_fault *vmf) +{ + struct page *page; + int cpu = vmf->pgoff & MSHV_PG_OFF_CPU_MASK; + int real_off = vmf->pgoff >> MSHV_REAL_OFF_SHIFT; + + if (!cpu_online(cpu)) + return VM_FAULT_SIGBUS; + /* + * CPU Hotplug is not supported in VTL2 in OpenHCL, where this kernel driver exists. + * CPU is expected to remain online after above cpu_online() check. + */ + + if (real_off == MSHV_RUN_PAGE_OFFSET) { + page = virt_to_page(mshv_vtl_cpu_run(cpu)); + } else if (real_off == MSHV_REG_PAGE_OFFSET) { + if (!mshv_has_reg_page) + return VM_FAULT_SIGBUS; + page = mshv_vtl_cpu_reg_page(cpu); + } else { + return VM_FAULT_NOPAGE; + } + + get_page(page); + vmf->page = page; + + return 0; +} + +static const struct vm_operations_struct mshv_vtl_vm_ops = { + .fault = mshv_vtl_fault, +}; + +static int mshv_vtl_mmap(struct file *filp, struct vm_area_struct *vma) +{ + vma->vm_ops = &mshv_vtl_vm_ops; + + return 0; +} + +static int mshv_vtl_release(struct inode *inode, struct file *filp) +{ + struct mshv_vtl *vtl = filp->private_data; + + kfree(vtl); + + return 0; +} + +static const struct file_operations mshv_vtl_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = mshv_vtl_ioctl, + .release = mshv_vtl_release, + .mmap = mshv_vtl_mmap, +}; + +static void mshv_vtl_synic_mask_vmbus_sint(const u8 *mask) +{ + union hv_synic_sint sint; + + sint.as_uint64 = 0; + sint.vector = HYPERVISOR_CALLBACK_VECTOR; + sint.masked = (*mask != 0); + sint.auto_eoi = hv_recommend_using_aeoi(); + + hv_set_msr(HV_MSR_SINT0 + VTL2_VMBUS_SINT_INDEX, + sint.as_uint64); + + if (!sint.masked) + pr_debug("%s: Unmasking VTL2 VMBUS SINT on VP %d\n", __func__, smp_processor_id()); + else + pr_debug("%s: Masking VTL2 VMBUS SINT on VP %d\n", __func__, smp_processor_id()); +} + +static void mshv_vtl_read_remote(void *buffer) +{ + struct hv_per_cpu_context *mshv_cpu = this_cpu_ptr(hv_context.cpu_context); + struct hv_message *msg = (struct hv_message *)mshv_cpu->hyp_synic_message_page + + VTL2_VMBUS_SINT_INDEX; + u32 message_type = READ_ONCE(msg->header.message_type); + + WRITE_ONCE(has_message, false); + if (message_type == HVMSG_NONE) + return; + + memcpy(buffer, msg, sizeof(*msg)); + vmbus_signal_eom(msg, message_type); +} + +static bool vtl_synic_mask_vmbus_sint_masked = true; + +static ssize_t mshv_vtl_sint_read(struct file *filp, char __user *arg, size_t size, loff_t *offset) +{ + struct hv_message msg = {}; + int ret; + + if (size < sizeof(msg)) + return -EINVAL; + + for (;;) { + smp_call_function_single(VMBUS_CONNECT_CPU, mshv_vtl_read_remote, &msg, true); + if (msg.header.message_type != HVMSG_NONE) + break; + + if (READ_ONCE(vtl_synic_mask_vmbus_sint_masked)) + return 0; /* EOF */ + + if (filp->f_flags & O_NONBLOCK) + return -EAGAIN; + + ret = wait_event_interruptible(fd_wait_queue, + READ_ONCE(has_message) || + READ_ONCE(vtl_synic_mask_vmbus_sint_masked)); + if (ret) + return ret; + } + + if (copy_to_user(arg, &msg, sizeof(msg))) + return -EFAULT; + + return sizeof(msg); +} + +static __poll_t mshv_vtl_sint_poll(struct file *filp, poll_table *wait) +{ + __poll_t mask = 0; + + poll_wait(filp, &fd_wait_queue, wait); + if (READ_ONCE(has_message) || READ_ONCE(vtl_synic_mask_vmbus_sint_masked)) + mask |= EPOLLIN | EPOLLRDNORM; + + return mask; +} + +static void mshv_vtl_sint_on_msg_dpc(unsigned long data) +{ + WRITE_ONCE(has_message, true); + wake_up_interruptible_poll(&fd_wait_queue, EPOLLIN); +} + +static int mshv_vtl_sint_ioctl_post_msg(struct mshv_vtl_sint_post_msg __user *arg) +{ + struct mshv_vtl_sint_post_msg message; + u8 payload[HV_MESSAGE_PAYLOAD_BYTE_COUNT]; + + if (copy_from_user(&message, arg, sizeof(message))) + return -EFAULT; + if (message.payload_size > HV_MESSAGE_PAYLOAD_BYTE_COUNT) + return -EINVAL; + if (copy_from_user(payload, (void __user *)message.payload_ptr, + message.payload_size)) + return -EFAULT; + + return hv_post_message((union hv_connection_id)message.connection_id, + message.message_type, (void *)payload, + message.payload_size); +} + +static int mshv_vtl_sint_ioctl_signal_event(struct mshv_vtl_signal_event __user *arg) +{ + u64 input, status; + struct mshv_vtl_signal_event signal_event; + + if (copy_from_user(&signal_event, arg, sizeof(signal_event))) + return -EFAULT; + + input = signal_event.connection_id | ((u64)signal_event.flag << 32); + + status = hv_do_fast_hypercall8(HVCALL_SIGNAL_EVENT, input); + + return hv_result_to_errno(status); +} + +static int mshv_vtl_sint_ioctl_set_eventfd(struct mshv_vtl_set_eventfd __user *arg) +{ + struct mshv_vtl_set_eventfd set_eventfd; + struct eventfd_ctx *eventfd, *old_eventfd; + + if (copy_from_user(&set_eventfd, arg, sizeof(set_eventfd))) + return -EFAULT; + if (set_eventfd.flag >= HV_EVENT_FLAGS_COUNT) + return -EINVAL; + + eventfd = NULL; + if (set_eventfd.fd >= 0) { + eventfd = eventfd_ctx_fdget(set_eventfd.fd); + if (IS_ERR(eventfd)) + return PTR_ERR(eventfd); + } + + guard(mutex)(&flag_lock); + old_eventfd = READ_ONCE(flag_eventfds[set_eventfd.flag]); + WRITE_ONCE(flag_eventfds[set_eventfd.flag], eventfd); + + if (old_eventfd) { + synchronize_rcu(); + eventfd_ctx_put(old_eventfd); + } + + return 0; +} + +static int mshv_vtl_sint_ioctl_pause_msg_stream(struct mshv_sint_mask __user *arg) +{ + static DEFINE_MUTEX(vtl2_vmbus_sint_mask_mutex); + struct mshv_sint_mask mask; + + if (copy_from_user(&mask, arg, sizeof(mask))) + return -EFAULT; + guard(mutex)(&vtl2_vmbus_sint_mask_mutex); + on_each_cpu((smp_call_func_t)mshv_vtl_synic_mask_vmbus_sint, &mask.mask, 1); + WRITE_ONCE(vtl_synic_mask_vmbus_sint_masked, mask.mask != 0); + if (mask.mask) + wake_up_interruptible_poll(&fd_wait_queue, EPOLLIN); + + return 0; +} + +static long mshv_vtl_sint_ioctl(struct file *f, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case MSHV_SINT_POST_MESSAGE: + return mshv_vtl_sint_ioctl_post_msg((struct mshv_vtl_sint_post_msg __user *)arg); + case MSHV_SINT_SIGNAL_EVENT: + return mshv_vtl_sint_ioctl_signal_event((struct mshv_vtl_signal_event __user *)arg); + case MSHV_SINT_SET_EVENTFD: + return mshv_vtl_sint_ioctl_set_eventfd((struct mshv_vtl_set_eventfd __user *)arg); + case MSHV_SINT_PAUSE_MESSAGE_STREAM: + return mshv_vtl_sint_ioctl_pause_msg_stream((struct mshv_sint_mask __user *)arg); + default: + return -ENOIOCTLCMD; + } +} + +static const struct file_operations mshv_vtl_sint_ops = { + .owner = THIS_MODULE, + .read = mshv_vtl_sint_read, + .poll = mshv_vtl_sint_poll, + .unlocked_ioctl = mshv_vtl_sint_ioctl, +}; + +static struct miscdevice mshv_vtl_sint_dev = { + .name = "mshv_sint", + .fops = &mshv_vtl_sint_ops, + .mode = 0600, + .minor = MISC_DYNAMIC_MINOR, +}; + +static int mshv_vtl_hvcall_dev_open(struct inode *node, struct file *f) +{ + struct miscdevice *dev = f->private_data; + struct mshv_vtl_hvcall_fd *fd; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + fd = vzalloc(sizeof(*fd)); + if (!fd) + return -ENOMEM; + fd->dev = dev; + f->private_data = fd; + mutex_init(&fd->init_mutex); + + return 0; +} + +static int mshv_vtl_hvcall_dev_release(struct inode *node, struct file *f) +{ + struct mshv_vtl_hvcall_fd *fd; + + fd = f->private_data; + if (fd) { + vfree(fd); + f->private_data = NULL; + } + + return 0; +} + +static int mshv_vtl_hvcall_do_setup(struct mshv_vtl_hvcall_fd *fd, + struct mshv_vtl_hvcall_setup __user *hvcall_setup_user) +{ + struct mshv_vtl_hvcall_setup hvcall_setup; + + guard(mutex)(&fd->init_mutex); + + if (fd->allow_map_initialized) { + dev_err(fd->dev->this_device, + "Hypercall allow map has already been set, pid %d\n", + current->pid); + return -EINVAL; + } + + if (copy_from_user(&hvcall_setup, hvcall_setup_user, + sizeof(struct mshv_vtl_hvcall_setup))) { + return -EFAULT; + } + if (hvcall_setup.bitmap_array_size > ARRAY_SIZE(fd->allow_bitmap)) + return -EINVAL; + + if (copy_from_user(&fd->allow_bitmap, + (void __user *)hvcall_setup.allow_bitmap_ptr, + hvcall_setup.bitmap_array_size)) { + return -EFAULT; + } + + dev_info(fd->dev->this_device, "Hypercall allow map has been set, pid %d\n", + current->pid); + fd->allow_map_initialized = true; + return 0; +} + +static bool mshv_vtl_hvcall_is_allowed(struct mshv_vtl_hvcall_fd *fd, u16 call_code) +{ + return test_bit(call_code, (unsigned long *)fd->allow_bitmap); +} + +static int mshv_vtl_hvcall_call(struct mshv_vtl_hvcall_fd *fd, + struct mshv_vtl_hvcall __user *hvcall_user) +{ + struct mshv_vtl_hvcall hvcall; + void *in, *out; + int ret; + + if (copy_from_user(&hvcall, hvcall_user, sizeof(struct mshv_vtl_hvcall))) + return -EFAULT; + if (hvcall.input_size > HV_HYP_PAGE_SIZE) + return -EINVAL; + if (hvcall.output_size > HV_HYP_PAGE_SIZE) + return -EINVAL; + + /* + * By default, all hypercalls are not allowed. + * The user mode code has to set up the allow bitmap once. + */ + + if (!mshv_vtl_hvcall_is_allowed(fd, hvcall.control & 0xFFFF)) { + dev_err(fd->dev->this_device, + "Hypercall with control data %#llx isn't allowed\n", + hvcall.control); + return -EPERM; + } + + /* + * This may create a problem for Confidential VM (CVM) usecase where we need to use + * Hyper-V driver allocated per-cpu input and output pages (hyperv_pcpu_input_arg and + * hyperv_pcpu_output_arg) for making a hypervisor call. + * + * TODO: Take care of this when CVM support is added. + */ + in = (void *)__get_free_page(GFP_KERNEL); + out = (void *)__get_free_page(GFP_KERNEL); + + if (copy_from_user(in, (void __user *)hvcall.input_ptr, hvcall.input_size)) { + ret = -EFAULT; + goto free_pages; + } + + hvcall.status = hv_do_hypercall(hvcall.control, in, out); + + if (copy_to_user((void __user *)hvcall.output_ptr, out, hvcall.output_size)) { + ret = -EFAULT; + goto free_pages; + } + ret = put_user(hvcall.status, &hvcall_user->status); +free_pages: + free_page((unsigned long)in); + free_page((unsigned long)out); + + return ret; +} + +static long mshv_vtl_hvcall_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg) +{ + struct mshv_vtl_hvcall_fd *fd = f->private_data; + + switch (cmd) { + case MSHV_HVCALL_SETUP: + return mshv_vtl_hvcall_do_setup(fd, (struct mshv_vtl_hvcall_setup __user *)arg); + case MSHV_HVCALL: + return mshv_vtl_hvcall_call(fd, (struct mshv_vtl_hvcall __user *)arg); + default: + break; + } + + return -ENOIOCTLCMD; +} + +static const struct file_operations mshv_vtl_hvcall_dev_file_ops = { + .owner = THIS_MODULE, + .open = mshv_vtl_hvcall_dev_open, + .release = mshv_vtl_hvcall_dev_release, + .unlocked_ioctl = mshv_vtl_hvcall_dev_ioctl, +}; + +static struct miscdevice mshv_vtl_hvcall_dev = { + .name = "mshv_hvcall", + .nodename = "mshv_hvcall", + .fops = &mshv_vtl_hvcall_dev_file_ops, + .mode = 0600, + .minor = MISC_DYNAMIC_MINOR, +}; + +static int mshv_vtl_low_open(struct inode *inodep, struct file *filp) +{ + pid_t pid = task_pid_vnr(current); + uid_t uid = current_uid().val; + int ret = 0; + + pr_debug("%s: Opening VTL low, task group %d, uid %d\n", __func__, pid, uid); + + if (capable(CAP_SYS_ADMIN)) { + filp->private_data = inodep; + } else { + pr_err("%s: VTL low open failed: CAP_SYS_ADMIN required. task group %d, uid %d", + __func__, pid, uid); + ret = -EPERM; + } + + return ret; +} + +static bool can_fault(struct vm_fault *vmf, unsigned long size, unsigned long *pfn) +{ + unsigned long mask = size - 1; + unsigned long start = vmf->address & ~mask; + unsigned long end = start + size; + bool is_valid; + + is_valid = (vmf->address & mask) == ((vmf->pgoff << PAGE_SHIFT) & mask) && + start >= vmf->vma->vm_start && + end <= vmf->vma->vm_end; + + if (is_valid) + *pfn = vmf->pgoff & ~(mask >> PAGE_SHIFT); + + return is_valid; +} + +static vm_fault_t mshv_vtl_low_huge_fault(struct vm_fault *vmf, unsigned int order) +{ + unsigned long pfn = vmf->pgoff; + vm_fault_t ret = VM_FAULT_FALLBACK; + + switch (order) { + case 0: + return vmf_insert_mixed(vmf->vma, vmf->address, pfn); + + case PMD_ORDER: + if (can_fault(vmf, PMD_SIZE, &pfn)) + ret = vmf_insert_pfn_pmd(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE); + return ret; + + case PUD_ORDER: + if (can_fault(vmf, PUD_SIZE, &pfn)) + ret = vmf_insert_pfn_pud(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE); + return ret; + + default: + return VM_FAULT_SIGBUS; + } +} + +static vm_fault_t mshv_vtl_low_fault(struct vm_fault *vmf) +{ + return mshv_vtl_low_huge_fault(vmf, 0); +} + +static const struct vm_operations_struct mshv_vtl_low_vm_ops = { + .fault = mshv_vtl_low_fault, + .huge_fault = mshv_vtl_low_huge_fault, +}; + +static int mshv_vtl_low_mmap(struct file *filp, struct vm_area_struct *vma) +{ + vma->vm_ops = &mshv_vtl_low_vm_ops; + vm_flags_set(vma, VM_HUGEPAGE | VM_MIXEDMAP); + + return 0; +} + +static const struct file_operations mshv_vtl_low_file_ops = { + .owner = THIS_MODULE, + .open = mshv_vtl_low_open, + .mmap = mshv_vtl_low_mmap, +}; + +static struct miscdevice mshv_vtl_low = { + .name = "mshv_vtl_low", + .nodename = "mshv_vtl_low", + .fops = &mshv_vtl_low_file_ops, + .mode = 0600, + .minor = MISC_DYNAMIC_MINOR, +}; + +static int __init mshv_vtl_init(void) +{ + int ret; + struct device *dev = mshv_dev.this_device; + + /* + * This creates /dev/mshv which provides functionality to create VTLs and partitions. + */ + ret = misc_register(&mshv_dev); + if (ret) { + dev_err(dev, "mshv device register failed: %d\n", ret); + goto free_dev; + } + + tasklet_init(&msg_dpc, mshv_vtl_sint_on_msg_dpc, 0); + init_waitqueue_head(&fd_wait_queue); + + if (mshv_vtl_get_vsm_regs()) { + dev_emerg(dev, "Unable to get VSM capabilities !!\n"); + ret = -ENODEV; + goto free_dev; + } + if (mshv_vtl_configure_vsm_partition(dev)) { + dev_emerg(dev, "VSM configuration failed !!\n"); + ret = -ENODEV; + goto free_dev; + } + + mshv_vtl_return_call_init(mshv_vsm_page_offsets.vtl_return_offset); + ret = hv_vtl_setup_synic(); + if (ret) + goto free_dev; + + /* + * mshv_sint device adds VMBus relay ioctl support. + * This provides a channel for VTL0 to communicate with VTL2. + */ + ret = misc_register(&mshv_vtl_sint_dev); + if (ret) + goto free_synic; + + /* + * mshv_hvcall device adds interface to enable userspace for direct hypercalls support. + */ + ret = misc_register(&mshv_vtl_hvcall_dev); + if (ret) + goto free_sint; + + /* + * mshv_vtl_low device is used to map VTL0 address space to a user-mode process in VTL2. + * It implements mmap() to allow a user-mode process in VTL2 to map to the address of VTL0. + */ + ret = misc_register(&mshv_vtl_low); + if (ret) + goto free_hvcall; + + /* + * "mshv vtl mem dev" device is later used to setup VTL0 memory. + */ + mem_dev = kzalloc(sizeof(*mem_dev), GFP_KERNEL); + if (!mem_dev) { + ret = -ENOMEM; + goto free_low; + } + + mutex_init(&mshv_vtl_poll_file_lock); + + device_initialize(mem_dev); + dev_set_name(mem_dev, "mshv vtl mem dev"); + ret = device_add(mem_dev); + if (ret) { + dev_err(dev, "mshv vtl mem dev add: %d\n", ret); + goto free_mem; + } + + return 0; + +free_mem: + kfree(mem_dev); +free_low: + misc_deregister(&mshv_vtl_low); +free_hvcall: + misc_deregister(&mshv_vtl_hvcall_dev); +free_sint: + misc_deregister(&mshv_vtl_sint_dev); +free_synic: + hv_vtl_remove_synic(); +free_dev: + misc_deregister(&mshv_dev); + + return ret; +} + +static void __exit mshv_vtl_exit(void) +{ + device_del(mem_dev); + kfree(mem_dev); + misc_deregister(&mshv_vtl_low); + misc_deregister(&mshv_vtl_hvcall_dev); + misc_deregister(&mshv_vtl_sint_dev); + hv_vtl_remove_synic(); + misc_deregister(&mshv_dev); +} + +module_init(mshv_vtl_init); +module_exit(mshv_vtl_exit); diff --git a/include/hyperv/hvgdk_mini.h b/include/hyperv/hvgdk_mini.h index 7499a679e60a..1d5ce11be8b6 100644 --- a/include/hyperv/hvgdk_mini.h +++ b/include/hyperv/hvgdk_mini.h @@ -885,6 +885,48 @@ struct hv_get_vp_from_apic_id_in { u32 apic_ids[]; } __packed; +union hv_register_vsm_partition_config { + u64 as_uint64; + struct { + u64 enable_vtl_protection : 1; + u64 default_vtl_protection_mask : 4; + u64 zero_memory_on_reset : 1; + u64 deny_lower_vtl_startup : 1; + u64 intercept_acceptance : 1; + u64 intercept_enable_vtl_protection : 1; + u64 intercept_vp_startup : 1; + u64 intercept_cpuid_unimplemented : 1; + u64 intercept_unrecoverable_exception : 1; + u64 intercept_page : 1; + u64 mbz : 51; + } __packed; +}; + +union hv_register_vsm_capabilities { + u64 as_uint64; + struct { + u64 dr6_shared: 1; + u64 mbec_vtl_mask: 16; + u64 deny_lower_vtl_startup: 1; + u64 supervisor_shadow_stack: 1; + u64 hardware_hvpt_available: 1; + u64 software_hvpt_available: 1; + u64 hardware_hvpt_range_bits: 6; + u64 intercept_page_available: 1; + u64 return_action_available: 1; + u64 reserved: 35; + } __packed; +}; + +union hv_register_vsm_page_offsets { + struct { + u64 vtl_call_offset : 12; + u64 vtl_return_offset : 12; + u64 reserved_mbz : 40; + } __packed; + u64 as_uint64; +}; + struct hv_nested_enlightenments_control { struct { u32 directhypercall : 1; @@ -1007,6 +1049,70 @@ enum hv_register_name { /* VSM */ HV_REGISTER_VSM_VP_STATUS = 0x000D0003, + + /* Synthetic VSM registers */ + HV_REGISTER_VSM_CODE_PAGE_OFFSETS = 0x000D0002, + HV_REGISTER_VSM_CAPABILITIES = 0x000D0006, + HV_REGISTER_VSM_PARTITION_CONFIG = 0x000D0007, + +#if defined(CONFIG_X86) + /* X64 Debug Registers */ + HV_X64_REGISTER_DR0 = 0x00050000, + HV_X64_REGISTER_DR1 = 0x00050001, + HV_X64_REGISTER_DR2 = 0x00050002, + HV_X64_REGISTER_DR3 = 0x00050003, + HV_X64_REGISTER_DR6 = 0x00050004, + HV_X64_REGISTER_DR7 = 0x00050005, + + /* X64 Cache control MSRs */ + HV_X64_REGISTER_MSR_MTRR_CAP = 0x0008000D, + HV_X64_REGISTER_MSR_MTRR_DEF_TYPE = 0x0008000E, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASE0 = 0x00080010, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASE1 = 0x00080011, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASE2 = 0x00080012, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASE3 = 0x00080013, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASE4 = 0x00080014, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASE5 = 0x00080015, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASE6 = 0x00080016, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASE7 = 0x00080017, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASE8 = 0x00080018, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASE9 = 0x00080019, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASEA = 0x0008001A, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASEB = 0x0008001B, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASEC = 0x0008001C, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASED = 0x0008001D, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASEE = 0x0008001E, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASEF = 0x0008001F, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASK0 = 0x00080040, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASK1 = 0x00080041, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASK2 = 0x00080042, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASK3 = 0x00080043, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASK4 = 0x00080044, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASK5 = 0x00080045, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASK6 = 0x00080046, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASK7 = 0x00080047, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASK8 = 0x00080048, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASK9 = 0x00080049, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASKA = 0x0008004A, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASKB = 0x0008004B, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASKC = 0x0008004C, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASKD = 0x0008004D, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASKE = 0x0008004E, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASKF = 0x0008004F, + HV_X64_REGISTER_MSR_MTRR_FIX64K00000 = 0x00080070, + HV_X64_REGISTER_MSR_MTRR_FIX16K80000 = 0x00080071, + HV_X64_REGISTER_MSR_MTRR_FIX16KA0000 = 0x00080072, + HV_X64_REGISTER_MSR_MTRR_FIX4KC0000 = 0x00080073, + HV_X64_REGISTER_MSR_MTRR_FIX4KC8000 = 0x00080074, + HV_X64_REGISTER_MSR_MTRR_FIX4KD0000 = 0x00080075, + HV_X64_REGISTER_MSR_MTRR_FIX4KD8000 = 0x00080076, + HV_X64_REGISTER_MSR_MTRR_FIX4KE0000 = 0x00080077, + HV_X64_REGISTER_MSR_MTRR_FIX4KE8000 = 0x00080078, + HV_X64_REGISTER_MSR_MTRR_FIX4KF0000 = 0x00080079, + HV_X64_REGISTER_MSR_MTRR_FIX4KF8000 = 0x0008007A, + + HV_X64_REGISTER_REG_PAGE = 0x0009001C, +#endif }; /* diff --git a/include/uapi/linux/mshv.h b/include/uapi/linux/mshv.h index b645d17cc531..dee3ece28ce5 100644 --- a/include/uapi/linux/mshv.h +++ b/include/uapi/linux/mshv.h @@ -322,4 +322,84 @@ struct mshv_get_set_vp_state { * #define MSHV_ROOT_HVCALL _IOWR(MSHV_IOCTL, 0x07, struct mshv_root_hvcall) */ +/* Structure definitions, macros and IOCTLs for mshv_vtl */ + +#define MSHV_CAP_CORE_API_STABLE 0x0 +#define MSHV_CAP_REGISTER_PAGE 0x1 +#define MSHV_CAP_VTL_RETURN_ACTION 0x2 +#define MSHV_CAP_DR6_SHARED 0x3 +#define MSHV_MAX_RUN_MSG_SIZE 256 + +struct mshv_vp_registers { + __u32 count; /* supports only 1 register at a time */ + __u32 reserved; /* Reserved for alignment or future use */ + __u64 regs_ptr; /* pointer to struct hv_register_assoc */ +}; + +struct mshv_vtl_set_eventfd { + __s32 fd; + __u32 flag; +}; + +struct mshv_vtl_signal_event { + __u32 connection_id; + __u32 flag; +}; + +struct mshv_vtl_sint_post_msg { + __u64 message_type; + __u32 connection_id; + __u32 payload_size; /* Must not exceed HV_MESSAGE_PAYLOAD_BYTE_COUNT */ + __u64 payload_ptr; /* pointer to message payload (bytes) */ +}; + +struct mshv_vtl_ram_disposition { + __u64 start_pfn; + __u64 last_pfn; +}; + +struct mshv_vtl_set_poll_file { + __u32 cpu; + __u32 fd; +}; + +struct mshv_vtl_hvcall_setup { + __u64 bitmap_array_size; /* stores number of bytes */ + __u64 allow_bitmap_ptr; +}; + +struct mshv_vtl_hvcall { + __u64 control; /* Hypercall control code */ + __u64 input_size; /* Size of the input data */ + __u64 input_ptr; /* Pointer to the input struct */ + __u64 status; /* Status of the hypercall (output) */ + __u64 output_size; /* Size of the output data */ + __u64 output_ptr; /* Pointer to the output struct */ +}; + +struct mshv_sint_mask { + __u8 mask; + __u8 reserved[7]; +}; + +/* /dev/mshv device IOCTL */ +#define MSHV_CHECK_EXTENSION _IOW(MSHV_IOCTL, 0x00, __u32) + +/* vtl device */ +#define MSHV_CREATE_VTL _IOR(MSHV_IOCTL, 0x1D, char) +#define MSHV_ADD_VTL0_MEMORY _IOW(MSHV_IOCTL, 0x21, struct mshv_vtl_ram_disposition) +#define MSHV_SET_POLL_FILE _IOW(MSHV_IOCTL, 0x25, struct mshv_vtl_set_poll_file) +#define MSHV_RETURN_TO_LOWER_VTL _IO(MSHV_IOCTL, 0x27) +#define MSHV_GET_VP_REGISTERS _IOWR(MSHV_IOCTL, 0x05, struct mshv_vp_registers) +#define MSHV_SET_VP_REGISTERS _IOW(MSHV_IOCTL, 0x06, struct mshv_vp_registers) + +/* VMBus device IOCTLs */ +#define MSHV_SINT_SIGNAL_EVENT _IOW(MSHV_IOCTL, 0x22, struct mshv_vtl_signal_event) +#define MSHV_SINT_POST_MESSAGE _IOW(MSHV_IOCTL, 0x23, struct mshv_vtl_sint_post_msg) +#define MSHV_SINT_SET_EVENTFD _IOW(MSHV_IOCTL, 0x24, struct mshv_vtl_set_eventfd) +#define MSHV_SINT_PAUSE_MESSAGE_STREAM _IOW(MSHV_IOCTL, 0x25, struct mshv_sint_mask) + +/* hv_hvcall device */ +#define MSHV_HVCALL_SETUP _IOW(MSHV_IOCTL, 0x1E, struct mshv_vtl_hvcall_setup) +#define MSHV_HVCALL _IOWR(MSHV_IOCTL, 0x1F, struct mshv_vtl_hvcall) #endif -- cgit v1.2.3 From 9d70ef7a18e0ec1653ac63020a13a5d4dda7cc0d Mon Sep 17 00:00:00 2001 From: Jinank Jain Date: Mon, 24 Nov 2025 14:25:59 +0000 Subject: mshv: adjust interrupt control structure for ARM64 Interrupt control structure (union hv_interupt_control) has different fields when it comes to x86 vs ARM64. Bring in the correct structure from HyperV header files and adjust the existing interrupt routing code accordingly. Signed-off-by: Jinank Jain Signed-off-by: Anirudh Rayabharam (Microsoft) Signed-off-by: Wei Liu --- drivers/hv/mshv_eventfd.c | 6 ++++++ drivers/hv/mshv_irq.c | 4 ++++ drivers/hv/mshv_root_hv_call.c | 6 ++++++ include/hyperv/hvhdk.h | 6 ++++++ 4 files changed, 22 insertions(+) (limited to 'include') diff --git a/drivers/hv/mshv_eventfd.c b/drivers/hv/mshv_eventfd.c index 2a80af1d610a..d93a18f09c76 100644 --- a/drivers/hv/mshv_eventfd.c +++ b/drivers/hv/mshv_eventfd.c @@ -163,8 +163,10 @@ static int mshv_try_assert_irq_fast(struct mshv_irqfd *irqfd) if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT) return -EOPNOTSUPP; +#if IS_ENABLED(CONFIG_X86) if (irq->lapic_control.logical_dest_mode) return -EOPNOTSUPP; +#endif vp = partition->pt_vp_array[irq->lapic_apic_id]; @@ -196,8 +198,10 @@ static void mshv_assert_irq_slow(struct mshv_irqfd *irqfd) unsigned int seq; int idx; +#if IS_ENABLED(CONFIG_X86) WARN_ON(irqfd->irqfd_resampler && !irq->lapic_control.level_triggered); +#endif idx = srcu_read_lock(&partition->pt_irq_srcu); if (irqfd->irqfd_girq_ent.guest_irq_num) { @@ -469,6 +473,7 @@ static int mshv_irqfd_assign(struct mshv_partition *pt, init_poll_funcptr(&irqfd->irqfd_polltbl, mshv_irqfd_queue_proc); spin_lock_irq(&pt->pt_irqfds_lock); +#if IS_ENABLED(CONFIG_X86) if (args->flags & BIT(MSHV_IRQFD_BIT_RESAMPLE) && !irqfd->irqfd_lapic_irq.lapic_control.level_triggered) { /* @@ -479,6 +484,7 @@ static int mshv_irqfd_assign(struct mshv_partition *pt, ret = -EINVAL; goto fail; } +#endif ret = 0; hlist_for_each_entry(tmp, &pt->pt_irqfds_list, irqfd_hnode) { if (irqfd->irqfd_eventfd_ctx != tmp->irqfd_eventfd_ctx) diff --git a/drivers/hv/mshv_irq.c b/drivers/hv/mshv_irq.c index d0fb9ef734f4..798e7e1ab06e 100644 --- a/drivers/hv/mshv_irq.c +++ b/drivers/hv/mshv_irq.c @@ -119,6 +119,10 @@ void mshv_copy_girq_info(struct mshv_guest_irq_ent *ent, lirq->lapic_vector = ent->girq_irq_data & 0xFF; lirq->lapic_apic_id = (ent->girq_addr_lo >> 12) & 0xFF; lirq->lapic_control.interrupt_type = (ent->girq_irq_data & 0x700) >> 8; +#if IS_ENABLED(CONFIG_X86) lirq->lapic_control.level_triggered = (ent->girq_irq_data >> 15) & 0x1; lirq->lapic_control.logical_dest_mode = (ent->girq_addr_lo >> 2) & 0x1; +#elif IS_ENABLED(CONFIG_ARM64) + lirq->lapic_control.asserted = 1; +#endif } diff --git a/drivers/hv/mshv_root_hv_call.c b/drivers/hv/mshv_root_hv_call.c index caf02cfa49c9..598eaff4ff29 100644 --- a/drivers/hv/mshv_root_hv_call.c +++ b/drivers/hv/mshv_root_hv_call.c @@ -388,7 +388,13 @@ int hv_call_assert_virtual_interrupt(u64 partition_id, u32 vector, memset(input, 0, sizeof(*input)); input->partition_id = partition_id; input->vector = vector; + /* + * NOTE: dest_addr only needs to be provided while asserting an + * interrupt on x86 platform + */ +#if IS_ENABLED(CONFIG_X86) input->dest_addr = dest_addr; +#endif input->control = control; status = hv_do_hypercall(HVCALL_ASSERT_VIRTUAL_INTERRUPT, input, NULL); local_irq_restore(flags); diff --git a/include/hyperv/hvhdk.h b/include/hyperv/hvhdk.h index 416c0d45b793..469186df7826 100644 --- a/include/hyperv/hvhdk.h +++ b/include/hyperv/hvhdk.h @@ -579,9 +579,15 @@ union hv_interrupt_control { u64 as_uint64; struct { u32 interrupt_type; /* enum hv_interrupt_type */ +#if IS_ENABLED(CONFIG_X86) u32 level_triggered : 1; u32 logical_dest_mode : 1; u32 rsvd : 30; +#elif IS_ENABLED(CONFIG_ARM64) + u32 rsvd1 : 2; + u32 asserted : 1; + u32 rsvd2 : 29; +#endif } __packed; }; -- cgit v1.2.3 From 723c47a221ee407901055c9d9b4434e68c5d650e Mon Sep 17 00:00:00 2001 From: Praveen K Paladugu Date: Fri, 5 Dec 2025 14:17:06 -0600 Subject: mshv: Add definitions for MSHV sleep state configuration Add the definitions required to configure sleep states in mshv hypervsior. Signed-off-by: Praveen K Paladugu Co-developed-by: Anatol Belski Signed-off-by: Anatol Belski Reviewed-by: Easwar Hariharan Reviewed-by: Nuno Das Neves Acked-by: Stanislav Kinsburskii Signed-off-by: Wei Liu --- include/hyperv/hvgdk_mini.h | 4 +++- include/hyperv/hvhdk_mini.h | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/hyperv/hvgdk_mini.h b/include/hyperv/hvgdk_mini.h index 1d5ce11be8b6..04b18d0e37af 100644 --- a/include/hyperv/hvgdk_mini.h +++ b/include/hyperv/hvgdk_mini.h @@ -465,19 +465,21 @@ union hv_vp_assist_msr_contents { /* HV_REGISTER_VP_ASSIST_PAGE */ #define HVCALL_RESET_DEBUG_SESSION 0x006b #define HVCALL_MAP_STATS_PAGE 0x006c #define HVCALL_UNMAP_STATS_PAGE 0x006d +#define HVCALL_SET_SYSTEM_PROPERTY 0x006f #define HVCALL_ADD_LOGICAL_PROCESSOR 0x0076 #define HVCALL_GET_SYSTEM_PROPERTY 0x007b #define HVCALL_MAP_DEVICE_INTERRUPT 0x007c #define HVCALL_UNMAP_DEVICE_INTERRUPT 0x007d #define HVCALL_RETARGET_INTERRUPT 0x007e #define HVCALL_NOTIFY_PARTITION_EVENT 0x0087 +#define HVCALL_ENTER_SLEEP_STATE 0x0084 #define HVCALL_NOTIFY_PORT_RING_EMPTY 0x008b #define HVCALL_REGISTER_INTERCEPT_RESULT 0x0091 #define HVCALL_ASSERT_VIRTUAL_INTERRUPT 0x0094 #define HVCALL_CREATE_PORT 0x0095 #define HVCALL_CONNECT_PORT 0x0096 #define HVCALL_START_VP 0x0099 -#define HVCALL_GET_VP_INDEX_FROM_APIC_ID 0x009a +#define HVCALL_GET_VP_INDEX_FROM_APIC_ID 0x009a #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST 0x00b0 #define HVCALL_SIGNAL_EVENT_DIRECT 0x00c0 diff --git a/include/hyperv/hvhdk_mini.h b/include/hyperv/hvhdk_mini.h index f2d7b50de7a4..41a29bf8ec14 100644 --- a/include/hyperv/hvhdk_mini.h +++ b/include/hyperv/hvhdk_mini.h @@ -140,6 +140,7 @@ enum hv_snp_status { enum hv_system_property { /* Add more values when needed */ + HV_SYSTEM_PROPERTY_SLEEP_STATE = 3, HV_SYSTEM_PROPERTY_SCHEDULER_TYPE = 15, HV_DYNAMIC_PROCESSOR_FEATURE_PROPERTY = 21, HV_SYSTEM_PROPERTY_CRASHDUMPAREA = 47, @@ -155,6 +156,19 @@ union hv_pfn_range { /* HV_SPA_PAGE_RANGE */ } __packed; }; +enum hv_sleep_state { + HV_SLEEP_STATE_S1 = 1, + HV_SLEEP_STATE_S2 = 2, + HV_SLEEP_STATE_S3 = 3, + HV_SLEEP_STATE_S4 = 4, + HV_SLEEP_STATE_S5 = 5, + /* + * After hypervisor has received this, any follow up sleep + * state registration requests will be rejected. + */ + HV_SLEEP_STATE_LOCK = 6 +}; + enum hv_dynamic_processor_feature_property { /* Add more values when needed */ HV_X64_DYNAMIC_PROCESSOR_FEATURE_MAX_ENCRYPTED_PARTITIONS = 13, @@ -184,6 +198,32 @@ struct hv_output_get_system_property { }; } __packed; +struct hv_sleep_state_info { + u32 sleep_state; /* enum hv_sleep_state */ + u8 pm1a_slp_typ; + u8 pm1b_slp_typ; +} __packed; + +struct hv_input_set_system_property { + u32 property_id; /* enum hv_system_property */ + u32 reserved; + union { + /* More fields to be filled in when needed */ + struct hv_sleep_state_info set_sleep_state_info; + + /* + * Add a reserved field to ensure the union is 8-byte aligned as + * existing members may not be. This is a temporary measure + * until all remaining members are added. + */ + u64 reserved0[8]; + }; +} __packed; + +struct hv_input_enter_sleep_state { /* HV_INPUT_ENTER_SLEEP_STATE */ + u32 sleep_state; /* enum hv_sleep_state */ +} __packed; + struct hv_input_map_stats_page { u32 type; /* enum hv_stats_object_type */ u32 padding; -- cgit v1.2.3