diff options
Diffstat (limited to 'drivers/hv')
33 files changed, 11501 insertions, 510 deletions
diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig index 862c47b191af..2d0b3fcb0ff8 100644 --- a/drivers/hv/Kconfig +++ b/drivers/hv/Kconfig @@ -3,19 +3,21 @@ menu "Microsoft Hyper-V guest support" config HYPERV - tristate "Microsoft Hyper-V client drivers" + bool "Microsoft Hyper-V core hypervisor support" depends on (X86 && X86_LOCAL_APIC && HYPERVISOR_GUEST) \ - || (ACPI && ARM64 && !CPU_BIG_ENDIAN) + || (ARM64 && !CPU_BIG_ENDIAN) select PARAVIRT select X86_HV_CALLBACK_VECTOR if X86 select OF_EARLY_FLATTREE if OF + select IRQ_MSI_LIB if X86 help Select this option to run Linux as a Hyper-V client operating system. config HYPERV_VTL_MODE bool "Enable Linux to boot in VTL context" - depends on X86_64 && HYPERV + depends on (X86_64 && HAVE_STATIC_CALL) || ARM64 + depends on HYPERV depends on SMP default n help @@ -31,7 +33,7 @@ config HYPERV_VTL_MODE Select this option to build a Linux kernel to run at a VTL other than the normal VTL0, which currently is only VTL2. This option - initializes the x86 platform for VTL2, and adds the ability to boot + initializes the kernel to run in VTL2, and adds the ability to boot secondary CPUs directly into 64-bit context as required for VTLs other than 0. A kernel built with this option must run at VTL2, and will not run as a normal guest. @@ -43,16 +45,68 @@ config HYPERV_TIMER config HYPERV_UTILS tristate "Microsoft Hyper-V Utilities driver" - depends on HYPERV && CONNECTOR && NLS + depends on HYPERV_VMBUS && CONNECTOR && NLS depends on PTP_1588_CLOCK_OPTIONAL help Select this option to enable the Hyper-V Utilities. config HYPERV_BALLOON tristate "Microsoft Hyper-V Balloon driver" - depends on HYPERV + depends on HYPERV_VMBUS select PAGE_REPORTING help Select this option to enable Hyper-V Balloon driver. +config HYPERV_VMBUS + tristate "Microsoft Hyper-V VMBus driver" + depends on HYPERV + default HYPERV + select SYSFB if EFI && !HYPERV_VTL_MODE + help + Select this option to enable Hyper-V Vmbus driver. + +config MSHV_ROOT + tristate "Microsoft Hyper-V root partition support" + depends on HYPERV && (X86_64 || ARM64) + depends on !HYPERV_VTL_MODE + # The hypervisor interface operates on 4k pages. Enforcing it here + # simplifies many assumptions in the root partition code. + # e.g. When withdrawing memory, the hypervisor gives back 4k pages in + # no particular order, making it impossible to reassemble larger pages + depends on PAGE_SIZE_4KB + select EVENTFD + select VIRT_XFER_TO_GUEST_WORK + select HMM_MIRROR + select MMU_NOTIFIER + default n + help + Select this option to enable support for booting and running as root + partition on Microsoft Hyper-V. + + If unsure, say N. + +config MSHV_VTL + tristate "Microsoft Hyper-V VTL driver" + depends on X86_64 && HYPERV_VTL_MODE + depends on HYPERV_VMBUS + # Mapping VTL0 memory to a userspace process in VTL2 is supported in OpenHCL. + # VTL2 for OpenHCL makes use of Huge Pages to improve performance on VMs, + # specially with large memory requirements. + depends on TRANSPARENT_HUGEPAGE + # MTRRs are controlled by VTL0, and are not specific to individual VTLs. + # Therefore, do not attempt to access or modify MTRRs here. + depends on !MTRR + select CPUMASK_OFFSTACK + select VIRT_XFER_TO_GUEST_WORK + default n + help + Select this option to enable Hyper-V VTL driver support. + This driver provides interfaces for Virtual Machine Manager (VMM) running in VTL2 + userspace to create VTLs and partitions, setup and manage VTL0 memory and + allow userspace to make direct hypercalls. This also allows to map VTL0's address + space to a usermode process in VTL2 and supports getting new VMBus messages and channel + events in VTL2. + + If unsure, say N. + endmenu diff --git a/drivers/hv/Makefile b/drivers/hv/Makefile index b992c0ed182b..888a748cc7cb 100644 --- a/drivers/hv/Makefile +++ b/drivers/hv/Makefile @@ -1,7 +1,9 @@ # SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_HYPERV) += hv_vmbus.o +obj-$(CONFIG_HYPERV_VMBUS) += hv_vmbus.o obj-$(CONFIG_HYPERV_UTILS) += hv_utils.o obj-$(CONFIG_HYPERV_BALLOON) += hv_balloon.o +obj-$(CONFIG_MSHV_ROOT) += mshv_root.o +obj-$(CONFIG_MSHV_VTL) += mshv_vtl.o CFLAGS_hv_trace.o = -I$(src) CFLAGS_hv_balloon.o = -I$(src) @@ -11,6 +13,15 @@ hv_vmbus-y := vmbus_drv.o \ channel_mgmt.o ring_buffer.o hv_trace.o hv_vmbus-$(CONFIG_HYPERV_TESTING) += hv_debugfs.o hv_utils-y := hv_util.o hv_kvp.o hv_snapshot.o hv_utils_transport.o +mshv_root-y := mshv_root_main.o mshv_synic.o mshv_eventfd.o mshv_irq.o \ + mshv_root_hv_call.o mshv_portid_table.o mshv_regions.o +mshv_root-$(CONFIG_DEBUG_FS) += mshv_debugfs.o +mshv_root-$(CONFIG_TRACEPOINTS) += mshv_trace.o +mshv_vtl-y := mshv_vtl_main.o # Code that must be built-in -obj-$(subst m,y,$(CONFIG_HYPERV)) += hv_common.o +obj-$(CONFIG_HYPERV) += hv_common.o +obj-$(subst m,y,$(CONFIG_MSHV_ROOT)) += hv_proc.o +ifneq ($(CONFIG_MSHV_ROOT)$(CONFIG_MSHV_VTL),) + obj-y += mshv_common.o +endif diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index fb8cd8469328..6821f225248b 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -18,6 +18,7 @@ #include <linux/uio.h> #include <linux/interrupt.h> #include <linux/set_memory.h> +#include <linux/export.h> #include <asm/page.h> #include <asm/mshyperv.h> @@ -409,6 +410,21 @@ static int create_gpadl_header(enum hv_gpadl_type type, void *kbuffer, return 0; } +static void vmbus_free_channel_msginfo(struct vmbus_channel_msginfo *msginfo) +{ + struct vmbus_channel_msginfo *submsginfo, *tmp; + + if (!msginfo) + return; + + list_for_each_entry_safe(submsginfo, tmp, &msginfo->submsglist, + msglistentry) { + kfree(submsginfo); + } + + kfree(msginfo); +} + /* * __vmbus_establish_gpadl - Establish a GPADL for a buffer or ringbuffer * @@ -428,7 +444,7 @@ static int __vmbus_establish_gpadl(struct vmbus_channel *channel, struct vmbus_channel_gpadl_header *gpadlmsg; struct vmbus_channel_gpadl_body *gpadl_body; struct vmbus_channel_msginfo *msginfo = NULL; - struct vmbus_channel_msginfo *submsginfo, *tmp; + struct vmbus_channel_msginfo *submsginfo; struct list_head *curr; u32 next_gpadl_handle; unsigned long flags; @@ -443,20 +459,24 @@ static int __vmbus_establish_gpadl(struct vmbus_channel *channel, return ret; } - /* - * Set the "decrypted" flag to true for the set_memory_decrypted() - * success case. In the failure case, the encryption state of the - * memory is unknown. Leave "decrypted" as true to ensure the - * memory will be leaked instead of going back on the free list. - */ - gpadl->decrypted = true; - ret = set_memory_decrypted((unsigned long)kbuffer, - PFN_UP(size)); - if (ret) { - dev_warn(&channel->device_obj->device, - "Failed to set host visibility for new GPADL %d.\n", - ret); - return ret; + gpadl->decrypted = !((channel->co_external_memory && type == HV_GPADL_BUFFER) || + (channel->co_ring_buffer && type == HV_GPADL_RING)); + if (gpadl->decrypted) { + /* + * The "decrypted" flag being true assumes that set_memory_decrypted() succeeds. + * But if it fails, the encryption state of the memory is unknown. In that case, + * leave "decrypted" as true to ensure the memory is leaked instead of going back + * on the free list. + */ + ret = set_memory_decrypted((unsigned long)kbuffer, + PFN_UP(size)); + if (ret) { + dev_warn(&channel->device_obj->device, + "Failed to set host visibility for new GPADL %d.\n", + ret); + vmbus_free_channel_msginfo(msginfo); + return ret; + } } init_completion(&msginfo->waitevent); @@ -531,12 +551,8 @@ cleanup: spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags); list_del(&msginfo->msglistentry); spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags); - list_for_each_entry_safe(submsginfo, tmp, &msginfo->submsglist, - msglistentry) { - kfree(submsginfo); - } - kfree(msginfo); + vmbus_free_channel_msginfo(msginfo); if (ret) { /* @@ -544,8 +560,10 @@ cleanup: * left as true so the memory is leaked instead of being * put back on the free list. */ - if (!set_memory_encrypted((unsigned long)kbuffer, PFN_UP(size))) - gpadl->decrypted = false; + if (gpadl->decrypted) { + if (!set_memory_encrypted((unsigned long)kbuffer, PFN_UP(size))) + gpadl->decrypted = false; + } } return ret; @@ -572,7 +590,7 @@ EXPORT_SYMBOL_GPL(vmbus_establish_gpadl); * keeps track of the next available slot in the array. Initially, each * slot points to the next one (as in a Linked List). The last slot * does not point to anything, so its value is U64_MAX by default. - * @size The size of the array + * @size: The size of the array */ static u64 *request_arr_init(u32 size) { @@ -676,12 +694,13 @@ static int __vmbus_open(struct vmbus_channel *newchannel, goto error_clean_ring; err = hv_ringbuffer_init(&newchannel->outbound, - page, send_pages, 0); + page, send_pages, 0, newchannel->co_ring_buffer); if (err) goto error_free_gpadl; err = hv_ringbuffer_init(&newchannel->inbound, &page[send_pages], - recv_pages, newchannel->max_pkt_size); + recv_pages, newchannel->max_pkt_size, + newchannel->co_ring_buffer); if (err) goto error_free_gpadl; @@ -862,8 +881,11 @@ post_msg_err: kfree(info); - ret = set_memory_encrypted((unsigned long)gpadl->buffer, - PFN_UP(gpadl->size)); + if (gpadl->decrypted) + ret = set_memory_encrypted((unsigned long)gpadl->buffer, + PFN_UP(gpadl->size)); + else + ret = 0; if (ret) pr_warn("Fail to set mem host visibility in GPADL teardown %d.\n", ret); @@ -924,7 +946,7 @@ static int vmbus_close_internal(struct vmbus_channel *channel) /* Send a closing message */ - msg = &channel->close_msg.msg; + msg = &channel->close_msg; msg->header.msgtype = CHANNELMSG_CLOSECHANNEL; msg->child_relid = channel->offermsg.child_relid; @@ -1077,68 +1099,10 @@ int vmbus_sendpacket(struct vmbus_channel *channel, void *buffer, EXPORT_SYMBOL(vmbus_sendpacket); /* - * vmbus_sendpacket_pagebuffer - Send a range of single-page buffer - * packets using a GPADL Direct packet type. This interface allows you - * to control notifying the host. This will be useful for sending - * batched data. Also the sender can control the send flags - * explicitly. - */ -int vmbus_sendpacket_pagebuffer(struct vmbus_channel *channel, - struct hv_page_buffer pagebuffers[], - u32 pagecount, void *buffer, u32 bufferlen, - u64 requestid) -{ - int i; - struct vmbus_channel_packet_page_buffer desc; - u32 descsize; - u32 packetlen; - u32 packetlen_aligned; - struct kvec bufferlist[3]; - u64 aligned_data = 0; - - if (pagecount > MAX_PAGE_BUFFER_COUNT) - return -EINVAL; - - /* - * Adjust the size down since vmbus_channel_packet_page_buffer is the - * largest size we support - */ - descsize = sizeof(struct vmbus_channel_packet_page_buffer) - - ((MAX_PAGE_BUFFER_COUNT - pagecount) * - sizeof(struct hv_page_buffer)); - packetlen = descsize + bufferlen; - packetlen_aligned = ALIGN(packetlen, sizeof(u64)); - - /* Setup the descriptor */ - desc.type = VM_PKT_DATA_USING_GPA_DIRECT; - desc.flags = VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED; - desc.dataoffset8 = descsize >> 3; /* in 8-bytes granularity */ - desc.length8 = (u16)(packetlen_aligned >> 3); - desc.transactionid = VMBUS_RQST_ERROR; /* will be updated in hv_ringbuffer_write() */ - desc.reserved = 0; - desc.rangecount = pagecount; - - for (i = 0; i < pagecount; i++) { - desc.range[i].len = pagebuffers[i].len; - desc.range[i].offset = pagebuffers[i].offset; - desc.range[i].pfn = pagebuffers[i].pfn; - } - - bufferlist[0].iov_base = &desc; - bufferlist[0].iov_len = descsize; - bufferlist[1].iov_base = buffer; - bufferlist[1].iov_len = bufferlen; - bufferlist[2].iov_base = &aligned_data; - bufferlist[2].iov_len = (packetlen_aligned - packetlen); - - return hv_ringbuffer_write(channel, bufferlist, 3, requestid, NULL); -} -EXPORT_SYMBOL_GPL(vmbus_sendpacket_pagebuffer); - -/* - * vmbus_sendpacket_multipagebuffer - Send a multi-page buffer packet + * vmbus_sendpacket_mpb_desc - Send one or more multi-page buffer packets * using a GPADL Direct packet type. - * The buffer includes the vmbus descriptor. + * The desc argument must include space for the VMBus descriptor. The + * rangecount field must already be set. */ int vmbus_sendpacket_mpb_desc(struct vmbus_channel *channel, struct vmbus_packet_mpb_array *desc, @@ -1160,7 +1124,6 @@ int vmbus_sendpacket_mpb_desc(struct vmbus_channel *channel, desc->length8 = (u16)(packetlen_aligned >> 3); desc->transactionid = VMBUS_RQST_ERROR; /* will be updated in hv_ringbuffer_write() */ desc->reserved = 0; - desc->rangecount = 1; bufferlist[0].iov_base = desc; bufferlist[0].iov_len = desc_size; diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index 6e084c207414..84eb0a6a0b54 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -20,6 +20,7 @@ #include <linux/delay.h> #include <linux/cpu.h> #include <linux/hyperv.h> +#include <linux/export.h> #include <asm/mshyperv.h> #include <linux/sched/isolation.h> @@ -353,7 +354,7 @@ static struct vmbus_channel *alloc_channel(void) { struct vmbus_channel *channel; - channel = kzalloc(sizeof(*channel), GFP_ATOMIC); + channel = kzalloc_obj(*channel, GFP_ATOMIC); if (!channel) return NULL; @@ -383,8 +384,18 @@ static void free_channel(struct vmbus_channel *channel) void vmbus_channel_map_relid(struct vmbus_channel *channel) { - if (WARN_ON(channel->offermsg.child_relid >= MAX_CHANNEL_RELIDS)) + u32 new_relid = channel->offermsg.child_relid; + + if (WARN_ON(new_relid >= MAX_CHANNEL_RELIDS)) return; + + /* + * This function is always called in the tasklet for the connect CPU. + * So updating the relid hiwater mark does not need to be atomic. + */ + if (new_relid > READ_ONCE(vmbus_connection.relid_hiwater)) + WRITE_ONCE(vmbus_connection.relid_hiwater, new_relid); + /* * The mapping of the channel's relid is visible from the CPUs that * execute vmbus_chan_sched() by the time that vmbus_chan_sched() will @@ -410,9 +421,7 @@ void vmbus_channel_map_relid(struct vmbus_channel *channel) * of the VMBus driver and vmbus_chan_sched() can not run before * vmbus_bus_resume() has completed execution (cf. resume_noirq). */ - virt_store_mb( - vmbus_connection.channels[channel->offermsg.child_relid], - channel); + virt_store_mb(vmbus_connection.channels[new_relid], channel); } void vmbus_channel_unmap_relid(struct vmbus_channel *channel) @@ -843,14 +852,14 @@ static void vmbus_wait_for_unload(void) = per_cpu_ptr(hv_context.cpu_context, cpu); /* - * In a CoCo VM the synic_message_page is not allocated + * In a CoCo VM the hyp_synic_message_page is not allocated * in hv_synic_alloc(). Instead it is set/cleared in - * hv_synic_enable_regs() and hv_synic_disable_regs() + * hv_hyp_synic_enable_regs() and hv_hyp_synic_disable_regs() * such that it is set only when the CPU is online. If * not all present CPUs are online, the message page * might be NULL, so skip such CPUs. */ - page_addr = hv_cpu->synic_message_page; + page_addr = hv_cpu->hyp_synic_message_page; if (!page_addr) continue; @@ -891,7 +900,7 @@ completed: struct hv_per_cpu_context *hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu); - page_addr = hv_cpu->synic_message_page; + page_addr = hv_cpu->hyp_synic_message_page; if (!page_addr) continue; @@ -1021,6 +1030,7 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr) struct vmbus_channel_offer_channel *offer; struct vmbus_channel *oldchannel, *newchannel; size_t offer_sz; + bool co_ring_buffer, co_external_memory; offer = (struct vmbus_channel_offer_channel *)hdr; @@ -1033,6 +1043,22 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr) return; } + co_ring_buffer = is_co_ring_buffer(offer); + co_external_memory = is_co_external_memory(offer); + if (!co_ring_buffer && co_external_memory) { + pr_err("Invalid offer relid=%d: the ring buffer isn't encrypted\n", + offer->child_relid); + return; + } + if (co_ring_buffer || co_external_memory) { + if (vmbus_proto_version < VERSION_WIN10_V6_0 || !vmbus_is_confidential()) { + pr_err("Invalid offer relid=%d: no support for confidential VMBus\n", + offer->child_relid); + atomic_dec(&vmbus_connection.offer_in_progress); + return; + } + } + oldchannel = find_primary_channel_by_offer(offer); if (oldchannel != NULL) { @@ -1111,6 +1137,8 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr) pr_err("Unable to allocate channel object\n"); return; } + newchannel->co_ring_buffer = co_ring_buffer; + newchannel->co_external_memory = co_external_memory; vmbus_setup_channel_state(newchannel, offer); diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c index 8351360bba16..b5b322ce16df 100644 --- a/drivers/hv/connection.c +++ b/drivers/hv/connection.c @@ -51,6 +51,7 @@ EXPORT_SYMBOL_GPL(vmbus_proto_version); * Linux guests and are not listed. */ static __u32 vmbus_versions[] = { + VERSION_WIN10_V6_0, VERSION_WIN10_V5_3, VERSION_WIN10_V5_2, VERSION_WIN10_V5_1, @@ -65,7 +66,7 @@ static __u32 vmbus_versions[] = { * Maximal VMBus protocol version guests can negotiate. Useful to cap the * VMBus version for testing and debugging purpose. */ -static uint max_version = VERSION_WIN10_V5_3; +static uint max_version = VERSION_WIN10_V6_0; module_param(max_version, uint, S_IRUGO); MODULE_PARM_DESC(max_version, @@ -105,6 +106,9 @@ int vmbus_negotiate_version(struct vmbus_channel_msginfo *msginfo, u32 version) vmbus_connection.msg_conn_id = VMBUS_MESSAGE_CONNECTION_ID; } + if (vmbus_is_confidential() && version >= VERSION_WIN10_V6_0) + msg->feature_flags = VMBUS_FEATURE_FLAG_CONFIDENTIAL_CHANNELS; + /* * shared_gpa_boundary is zero in non-SNP VMs, so it's safe to always * bitwise OR it @@ -207,10 +211,19 @@ int vmbus_connect(void) mutex_init(&vmbus_connection.channel_mutex); /* + * The following Hyper-V interrupt and monitor pages can be used by + * UIO for mapping to user-space, so they should always be allocated on + * system page boundaries. The system page size must be >= the Hyper-V + * page size. + */ + BUILD_BUG_ON(PAGE_SIZE < HV_HYP_PAGE_SIZE); + + /* * Setup the vmbus event connection for channel interrupt * abstraction stuff */ - vmbus_connection.int_page = hv_alloc_hyperv_zeroed_page(); + vmbus_connection.int_page = + (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO); if (vmbus_connection.int_page == NULL) { ret = -ENOMEM; goto cleanup; @@ -225,8 +238,8 @@ int vmbus_connect(void) * Setup the monitor notification facility. The 1st page for * parent->child and the 2nd page for child->parent */ - vmbus_connection.monitor_pages[0] = hv_alloc_hyperv_page(); - vmbus_connection.monitor_pages[1] = hv_alloc_hyperv_page(); + vmbus_connection.monitor_pages[0] = (void *)__get_free_page(GFP_KERNEL); + vmbus_connection.monitor_pages[1] = (void *)__get_free_page(GFP_KERNEL); if ((vmbus_connection.monitor_pages[0] == NULL) || (vmbus_connection.monitor_pages[1] == NULL)) { ret = -ENOMEM; @@ -300,9 +313,8 @@ int vmbus_connect(void) pr_info("Vmbus version:%d.%d\n", version >> 16, version & 0xFFFF); - vmbus_connection.channels = kcalloc(MAX_CHANNEL_RELIDS, - sizeof(struct vmbus_channel *), - GFP_KERNEL); + vmbus_connection.channels = kzalloc_objs(struct vmbus_channel *, + MAX_CHANNEL_RELIDS); if (vmbus_connection.channels == NULL) { ret = -ENOMEM; goto cleanup; @@ -342,21 +354,23 @@ void vmbus_disconnect(void) destroy_workqueue(vmbus_connection.work_queue); if (vmbus_connection.int_page) { - hv_free_hyperv_page(vmbus_connection.int_page); + free_page((unsigned long)vmbus_connection.int_page); vmbus_connection.int_page = NULL; } if (vmbus_connection.monitor_pages[0]) { if (!set_memory_encrypted( (unsigned long)vmbus_connection.monitor_pages[0], 1)) - hv_free_hyperv_page(vmbus_connection.monitor_pages[0]); + free_page((unsigned long) + vmbus_connection.monitor_pages[0]); vmbus_connection.monitor_pages[0] = NULL; } if (vmbus_connection.monitor_pages[1]) { if (!set_memory_encrypted( (unsigned long)vmbus_connection.monitor_pages[1], 1)) - hv_free_hyperv_page(vmbus_connection.monitor_pages[1]); + free_page((unsigned long) + vmbus_connection.monitor_pages[1]); vmbus_connection.monitor_pages[1] = NULL; } } @@ -508,7 +522,10 @@ void vmbus_set_event(struct vmbus_channel *channel) else WARN_ON_ONCE(1); } else { - hv_do_fast_hypercall8(HVCALL_SIGNAL_EVENT, channel->sig_event); + u64 control = HVCALL_SIGNAL_EVENT; + + control |= hv_nested ? HV_HYPERCALL_NESTED : 0; + hv_do_fast_hypercall8(control, channel->sig_event); } } EXPORT_SYMBOL_GPL(vmbus_set_event); diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c index 36d9ba097ff5..ae60fd542292 100644 --- a/drivers/hv/hv.c +++ b/drivers/hv/hv.c @@ -18,6 +18,7 @@ #include <linux/clockchips.h> #include <linux/delay.h> #include <linux/interrupt.h> +#include <linux/export.h> #include <clocksource/hyperv_timer.h> #include <asm/mshyperv.h> #include <linux/set_memory.h> @@ -25,6 +26,7 @@ /* The one and only */ struct hv_context hv_context; +EXPORT_SYMBOL_FOR_MODULES(hv_context, "mshv_vtl"); /* * hv_init - Main initialization routine. @@ -74,7 +76,11 @@ int hv_post_message(union hv_connection_id connection_id, aligned_msg->payload_size = payload_size; memcpy((void *)aligned_msg->payload, payload, payload_size); - if (ms_hyperv.paravisor_present) { + if (ms_hyperv.paravisor_present && !vmbus_is_confidential()) { + /* + * If the VMBus isn't confidential, use the CoCo-specific + * mechanism to communicate with the hypervisor. + */ if (hv_isolation_type_tdx()) status = hv_tdx_hypercall(HVCALL_POST_MESSAGE, virt_to_phys(aligned_msg), 0); @@ -85,19 +91,87 @@ int hv_post_message(union hv_connection_id connection_id, else status = HV_STATUS_INVALID_PARAMETER; } else { - status = hv_do_hypercall(HVCALL_POST_MESSAGE, - aligned_msg, NULL); + u64 control = HVCALL_POST_MESSAGE; + + control |= hv_nested ? HV_HYPERCALL_NESTED : 0; + /* + * If there is no paravisor, this will go to the hypervisor. + * In the Confidential VMBus case, there is the paravisor + * to which this will trap. + */ + status = hv_do_hypercall(control, aligned_msg, NULL); } local_irq_restore(flags); return hv_result(status); } +EXPORT_SYMBOL_FOR_MODULES(hv_post_message, "mshv_vtl"); + +static int hv_alloc_page(void **page, bool decrypt, const char *note) +{ + int ret = 0; + + /* + * After the page changes its encryption status, its contents might + * appear scrambled on some hardware. Thus `get_zeroed_page` would + * zero the page out in vain, so do that explicitly exactly once. + * + * By default, the page is allocated encrypted in a CoCo VM. + */ + *page = (void *)__get_free_page(GFP_KERNEL); + if (!*page) + return -ENOMEM; + + if (decrypt) + ret = set_memory_decrypted((unsigned long)*page, 1); + if (ret) + goto failed; + + memset(*page, 0, PAGE_SIZE); + return 0; + +failed: + /* + * Report the failure but don't put the page back on the free list as + * its encryption status is unknown. + */ + pr_err("allocation failed for %s page, error %d, decrypted %d\n", + note, ret, decrypt); + *page = NULL; + return ret; +} + +static int hv_free_page(void **page, bool encrypt, const char *note) +{ + int ret = 0; + + if (!*page) + return 0; + + if (encrypt) + ret = set_memory_encrypted((unsigned long)*page, 1); + + /* + * In the case of the failure, the page is leaked. Something is wrong, + * prefer to lose the page with the unknown encryption status and stay afloat. + */ + if (ret) + pr_err("deallocation failed for %s page, error %d, encrypt %d\n", + note, ret, encrypt); + else + free_page((unsigned long)*page); + + *page = NULL; + + return ret; +} int hv_synic_alloc(void) { int cpu, ret = -ENOMEM; struct hv_per_cpu_context *hv_cpu; + const bool decrypt = !vmbus_is_confidential(); /* * First, zero all per-cpu memory areas so hv_synic_free() can @@ -109,8 +183,7 @@ int hv_synic_alloc(void) memset(hv_cpu, 0, sizeof(*hv_cpu)); } - hv_context.hv_numa_map = kcalloc(nr_node_ids, sizeof(struct cpumask), - GFP_KERNEL); + hv_context.hv_numa_map = kzalloc_objs(struct cpumask, nr_node_ids); if (!hv_context.hv_numa_map) { pr_err("Unable to allocate NUMA map\n"); goto err; @@ -123,73 +196,37 @@ int hv_synic_alloc(void) vmbus_on_msg_dpc, (unsigned long)hv_cpu); if (ms_hyperv.paravisor_present && hv_isolation_type_tdx()) { - hv_cpu->post_msg_page = (void *)get_zeroed_page(GFP_ATOMIC); - if (!hv_cpu->post_msg_page) { - pr_err("Unable to allocate post msg page\n"); - goto err; - } - - ret = set_memory_decrypted((unsigned long)hv_cpu->post_msg_page, 1); - if (ret) { - pr_err("Failed to decrypt post msg page: %d\n", ret); - /* Just leak the page, as it's unsafe to free the page. */ - hv_cpu->post_msg_page = NULL; + ret = hv_alloc_page(&hv_cpu->post_msg_page, + decrypt, "post msg"); + if (ret) goto err; - } - - memset(hv_cpu->post_msg_page, 0, PAGE_SIZE); } /* - * Synic message and event pages are allocated by paravisor. - * Skip these pages allocation here. + * If these SynIC pages are not allocated, SIEF and SIM pages + * are configured using what the root partition or the paravisor + * provides upon reading the SIEFP and SIMP registers. */ - if (!ms_hyperv.paravisor_present && !hv_root_partition) { - hv_cpu->synic_message_page = - (void *)get_zeroed_page(GFP_ATOMIC); - if (!hv_cpu->synic_message_page) { - pr_err("Unable to allocate SYNIC message page\n"); + if (!ms_hyperv.paravisor_present && !hv_root_partition()) { + ret = hv_alloc_page(&hv_cpu->hyp_synic_message_page, + decrypt, "hypervisor SynIC msg"); + if (ret) goto err; - } - - hv_cpu->synic_event_page = - (void *)get_zeroed_page(GFP_ATOMIC); - if (!hv_cpu->synic_event_page) { - pr_err("Unable to allocate SYNIC event page\n"); - - free_page((unsigned long)hv_cpu->synic_message_page); - hv_cpu->synic_message_page = NULL; + ret = hv_alloc_page(&hv_cpu->hyp_synic_event_page, + decrypt, "hypervisor SynIC event"); + if (ret) goto err; - } } - if (!ms_hyperv.paravisor_present && - (hv_isolation_type_snp() || hv_isolation_type_tdx())) { - ret = set_memory_decrypted((unsigned long) - hv_cpu->synic_message_page, 1); - if (ret) { - pr_err("Failed to decrypt SYNIC msg page: %d\n", ret); - hv_cpu->synic_message_page = NULL; - - /* - * Free the event page here so that hv_synic_free() - * won't later try to re-encrypt it. - */ - free_page((unsigned long)hv_cpu->synic_event_page); - hv_cpu->synic_event_page = NULL; + if (vmbus_is_confidential()) { + ret = hv_alloc_page(&hv_cpu->para_synic_message_page, + false, "paravisor SynIC msg"); + if (ret) goto err; - } - - ret = set_memory_decrypted((unsigned long) - hv_cpu->synic_event_page, 1); - if (ret) { - pr_err("Failed to decrypt SYNIC event page: %d\n", ret); - hv_cpu->synic_event_page = NULL; + ret = hv_alloc_page(&hv_cpu->para_synic_event_page, + false, "paravisor SynIC event"); + if (ret) goto err; - } - - memset(hv_cpu->synic_message_page, 0, PAGE_SIZE); - memset(hv_cpu->synic_event_page, 0, PAGE_SIZE); } } @@ -205,106 +242,83 @@ err: void hv_synic_free(void) { - int cpu, ret; + int cpu; + const bool encrypt = !vmbus_is_confidential(); for_each_present_cpu(cpu) { struct hv_per_cpu_context *hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu); - /* It's better to leak the page if the encryption fails. */ - if (ms_hyperv.paravisor_present && hv_isolation_type_tdx()) { - if (hv_cpu->post_msg_page) { - ret = set_memory_encrypted((unsigned long) - hv_cpu->post_msg_page, 1); - if (ret) { - pr_err("Failed to encrypt post msg page: %d\n", ret); - hv_cpu->post_msg_page = NULL; - } - } + if (ms_hyperv.paravisor_present && hv_isolation_type_tdx()) + hv_free_page(&hv_cpu->post_msg_page, + encrypt, "post msg"); + if (!ms_hyperv.paravisor_present && !hv_root_partition()) { + hv_free_page(&hv_cpu->hyp_synic_event_page, + encrypt, "hypervisor SynIC event"); + hv_free_page(&hv_cpu->hyp_synic_message_page, + encrypt, "hypervisor SynIC msg"); } - - if (!ms_hyperv.paravisor_present && - (hv_isolation_type_snp() || hv_isolation_type_tdx())) { - if (hv_cpu->synic_message_page) { - ret = set_memory_encrypted((unsigned long) - hv_cpu->synic_message_page, 1); - if (ret) { - pr_err("Failed to encrypt SYNIC msg page: %d\n", ret); - hv_cpu->synic_message_page = NULL; - } - } - - if (hv_cpu->synic_event_page) { - ret = set_memory_encrypted((unsigned long) - hv_cpu->synic_event_page, 1); - if (ret) { - pr_err("Failed to encrypt SYNIC event page: %d\n", ret); - hv_cpu->synic_event_page = NULL; - } - } + if (vmbus_is_confidential()) { + hv_free_page(&hv_cpu->para_synic_event_page, + false, "paravisor SynIC event"); + hv_free_page(&hv_cpu->para_synic_message_page, + false, "paravisor SynIC msg"); } - - free_page((unsigned long)hv_cpu->post_msg_page); - free_page((unsigned long)hv_cpu->synic_event_page); - free_page((unsigned long)hv_cpu->synic_message_page); } kfree(hv_context.hv_numa_map); } /* - * hv_synic_init - Initialize the Synthetic Interrupt Controller. - * - * If it is already initialized by another entity (ie x2v shim), we need to - * retrieve the initialized message and event pages. Otherwise, we create and - * initialize the message and event pages. + * hv_hyp_synic_enable_regs - Initialize the Synthetic Interrupt Controller + * with the hypervisor. */ -void hv_synic_enable_regs(unsigned int cpu) +void hv_hyp_synic_enable_regs(unsigned int cpu) { struct hv_per_cpu_context *hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu); union hv_synic_simp simp; union hv_synic_siefp siefp; union hv_synic_sint shared_sint; - union hv_synic_scontrol sctrl; - /* Setup the Synic's message page */ + /* Setup the Synic's message page with the hypervisor. */ simp.as_uint64 = hv_get_msr(HV_MSR_SIMP); simp.simp_enabled = 1; - if (ms_hyperv.paravisor_present || hv_root_partition) { - /* Mask out vTOM bit. ioremap_cache() maps decrypted */ + if (ms_hyperv.paravisor_present || hv_root_partition()) { + /* Mask out vTOM bit and map as decrypted */ u64 base = (simp.base_simp_gpa << HV_HYP_PAGE_SHIFT) & ~ms_hyperv.shared_gpa_boundary; - hv_cpu->synic_message_page = - (void *)ioremap_cache(base, HV_HYP_PAGE_SIZE); - if (!hv_cpu->synic_message_page) + hv_cpu->hyp_synic_message_page = + memremap(base, HV_HYP_PAGE_SIZE, MEMREMAP_WB | MEMREMAP_DEC); + if (!hv_cpu->hyp_synic_message_page) pr_err("Fail to map synic message page.\n"); } else { - simp.base_simp_gpa = virt_to_phys(hv_cpu->synic_message_page) + simp.base_simp_gpa = virt_to_phys(hv_cpu->hyp_synic_message_page) >> HV_HYP_PAGE_SHIFT; } hv_set_msr(HV_MSR_SIMP, simp.as_uint64); - /* Setup the Synic's event page */ + /* Setup the Synic's event page with the hypervisor. */ siefp.as_uint64 = hv_get_msr(HV_MSR_SIEFP); siefp.siefp_enabled = 1; - if (ms_hyperv.paravisor_present || hv_root_partition) { - /* Mask out vTOM bit. ioremap_cache() maps decrypted */ + if (ms_hyperv.paravisor_present || hv_root_partition()) { + /* Mask out vTOM bit and map as decrypted */ u64 base = (siefp.base_siefp_gpa << HV_HYP_PAGE_SHIFT) & ~ms_hyperv.shared_gpa_boundary; - hv_cpu->synic_event_page = - (void *)ioremap_cache(base, HV_HYP_PAGE_SIZE); - if (!hv_cpu->synic_event_page) + hv_cpu->hyp_synic_event_page = + memremap(base, HV_HYP_PAGE_SIZE, MEMREMAP_WB | MEMREMAP_DEC); + if (!hv_cpu->hyp_synic_event_page) pr_err("Fail to map synic event page.\n"); } else { - siefp.base_siefp_gpa = virt_to_phys(hv_cpu->synic_event_page) + siefp.base_siefp_gpa = virt_to_phys(hv_cpu->hyp_synic_event_page) >> HV_HYP_PAGE_SHIFT; } hv_set_msr(HV_MSR_SIEFP, siefp.as_uint64); + hv_enable_coco_interrupt(cpu, vmbus_interrupt, true); /* Setup the shared SINT. */ if (vmbus_irq != -1) @@ -313,18 +327,13 @@ void hv_synic_enable_regs(unsigned int cpu) shared_sint.vector = vmbus_interrupt; shared_sint.masked = false; - - /* - * On architectures where Hyper-V doesn't support AEOI (e.g., ARM64), - * it doesn't provide a recommendation flag and AEOI must be disabled. - */ -#ifdef HV_DEPRECATING_AEOI_RECOMMENDED - shared_sint.auto_eoi = - !(ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED); -#else - shared_sint.auto_eoi = 0; -#endif + shared_sint.auto_eoi = hv_recommend_using_aeoi(); hv_set_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT, shared_sint.as_uint64); +} + +static void hv_hyp_synic_enable_interrupts(void) +{ + union hv_synic_scontrol sctrl; /* Enable the global synic bit */ sctrl.as_uint64 = hv_get_msr(HV_MSR_SCONTROL); @@ -333,23 +342,72 @@ void hv_synic_enable_regs(unsigned int cpu) hv_set_msr(HV_MSR_SCONTROL, sctrl.as_uint64); } +static void hv_para_synic_enable_regs(unsigned int cpu) +{ + union hv_synic_simp simp; + union hv_synic_siefp siefp; + struct hv_per_cpu_context *hv_cpu + = per_cpu_ptr(hv_context.cpu_context, cpu); + + /* Setup the Synic's message page with the paravisor. */ + simp.as_uint64 = hv_para_get_synic_register(HV_MSR_SIMP); + simp.simp_enabled = 1; + simp.base_simp_gpa = virt_to_phys(hv_cpu->para_synic_message_page) + >> HV_HYP_PAGE_SHIFT; + hv_para_set_synic_register(HV_MSR_SIMP, simp.as_uint64); + + /* Setup the Synic's event page with the paravisor. */ + siefp.as_uint64 = hv_para_get_synic_register(HV_MSR_SIEFP); + siefp.siefp_enabled = 1; + siefp.base_siefp_gpa = virt_to_phys(hv_cpu->para_synic_event_page) + >> HV_HYP_PAGE_SHIFT; + hv_para_set_synic_register(HV_MSR_SIEFP, siefp.as_uint64); +} + +static void hv_para_synic_enable_interrupts(void) +{ + union hv_synic_scontrol sctrl; + + /* Enable the global synic bit */ + sctrl.as_uint64 = hv_para_get_synic_register(HV_MSR_SCONTROL); + sctrl.enable = 1; + hv_para_set_synic_register(HV_MSR_SCONTROL, sctrl.as_uint64); +} + int hv_synic_init(unsigned int cpu) { - hv_synic_enable_regs(cpu); + if (vmbus_is_confidential()) + hv_para_synic_enable_regs(cpu); + + /* + * The SINT is set in hv_hyp_synic_enable_regs() by calling + * hv_set_msr(). hv_set_msr() in turn has special case code for the + * SINT MSRs that write to the hypervisor version of the MSR *and* + * the paravisor version of the MSR (but *without* the proxy bit when + * VMBus is confidential). + * + * Then enable interrupts via the paravisor if VMBus is confidential, + * and otherwise via the hypervisor. + */ + + hv_hyp_synic_enable_regs(cpu); + if (vmbus_is_confidential()) + hv_para_synic_enable_interrupts(); + else + hv_hyp_synic_enable_interrupts(); hv_stimer_legacy_init(cpu, VMBUS_MESSAGE_SINT); return 0; } -void hv_synic_disable_regs(unsigned int cpu) +void hv_hyp_synic_disable_regs(unsigned int cpu) { struct hv_per_cpu_context *hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu); union hv_synic_sint shared_sint; union hv_synic_simp simp; union hv_synic_siefp siefp; - union hv_synic_scontrol sctrl; shared_sint.as_uint64 = hv_get_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT); @@ -358,18 +416,21 @@ void hv_synic_disable_regs(unsigned int cpu) /* Need to correctly cleanup in the case of SMP!!! */ /* Disable the interrupt */ hv_set_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT, shared_sint.as_uint64); + hv_enable_coco_interrupt(cpu, vmbus_interrupt, false); simp.as_uint64 = hv_get_msr(HV_MSR_SIMP); /* - * In Isolation VM, sim and sief pages are allocated by + * In Isolation VM, simp and sief pages are allocated by * paravisor. These pages also will be used by kdump * kernel. So just reset enable bit here and keep page * addresses. */ simp.simp_enabled = 0; - if (ms_hyperv.paravisor_present || hv_root_partition) { - iounmap(hv_cpu->synic_message_page); - hv_cpu->synic_message_page = NULL; + if (ms_hyperv.paravisor_present || hv_root_partition()) { + if (hv_cpu->hyp_synic_message_page) { + memunmap(hv_cpu->hyp_synic_message_page); + hv_cpu->hyp_synic_message_page = NULL; + } } else { simp.base_simp_gpa = 0; } @@ -379,22 +440,52 @@ void hv_synic_disable_regs(unsigned int cpu) siefp.as_uint64 = hv_get_msr(HV_MSR_SIEFP); siefp.siefp_enabled = 0; - if (ms_hyperv.paravisor_present || hv_root_partition) { - iounmap(hv_cpu->synic_event_page); - hv_cpu->synic_event_page = NULL; + if (ms_hyperv.paravisor_present || hv_root_partition()) { + if (hv_cpu->hyp_synic_event_page) { + memunmap(hv_cpu->hyp_synic_event_page); + hv_cpu->hyp_synic_event_page = NULL; + } } else { siefp.base_siefp_gpa = 0; } hv_set_msr(HV_MSR_SIEFP, siefp.as_uint64); +} + +static void hv_hyp_synic_disable_interrupts(void) +{ + union hv_synic_scontrol sctrl; /* Disable the global synic bit */ sctrl.as_uint64 = hv_get_msr(HV_MSR_SCONTROL); sctrl.enable = 0; hv_set_msr(HV_MSR_SCONTROL, sctrl.as_uint64); +} - if (vmbus_irq != -1) - disable_percpu_irq(vmbus_irq); +static void hv_para_synic_disable_regs(unsigned int cpu) +{ + union hv_synic_simp simp; + union hv_synic_siefp siefp; + + /* Disable SynIC's message page in the paravisor. */ + simp.as_uint64 = hv_para_get_synic_register(HV_MSR_SIMP); + simp.simp_enabled = 0; + hv_para_set_synic_register(HV_MSR_SIMP, simp.as_uint64); + + /* Disable SynIC's event page in the paravisor. */ + siefp.as_uint64 = hv_para_get_synic_register(HV_MSR_SIEFP); + siefp.siefp_enabled = 0; + hv_para_set_synic_register(HV_MSR_SIEFP, siefp.as_uint64); +} + +static void hv_para_synic_disable_interrupts(void) +{ + union hv_synic_scontrol sctrl; + + /* Disable the global synic bit */ + sctrl.as_uint64 = hv_para_get_synic_register(HV_MSR_SCONTROL); + sctrl.enable = 0; + hv_para_set_synic_register(HV_MSR_SCONTROL, sctrl.as_uint64); } #define HV_MAX_TRIES 3 @@ -407,16 +498,18 @@ void hv_synic_disable_regs(unsigned int cpu) * that the normal interrupt handling mechanism will find and process the channel interrupt * "very soon", and in the process clear the bit. */ -static bool hv_synic_event_pending(void) +static bool __hv_synic_event_pending(union hv_synic_event_flags *event, int sint) { - struct hv_per_cpu_context *hv_cpu = this_cpu_ptr(hv_context.cpu_context); - union hv_synic_event_flags *event = - (union hv_synic_event_flags *)hv_cpu->synic_event_page + VMBUS_MESSAGE_SINT; - unsigned long *recv_int_page = event->flags; /* assumes VMBus version >= VERSION_WIN8 */ + unsigned long *recv_int_page; bool pending; u32 relid; int tries = 0; + if (!event) + return false; + + event += sint; + recv_int_page = event->flags; /* assumes VMBus version >= VERSION_WIN8 */ retry: pending = false; for_each_set_bit(relid, recv_int_page, HV_EVENT_FLAGS_COUNT) { @@ -433,13 +526,58 @@ retry: return pending; } +static bool hv_synic_event_pending(void) +{ + struct hv_per_cpu_context *hv_cpu = this_cpu_ptr(hv_context.cpu_context); + union hv_synic_event_flags *hyp_synic_event_page = hv_cpu->hyp_synic_event_page; + union hv_synic_event_flags *para_synic_event_page = hv_cpu->para_synic_event_page; + + return + __hv_synic_event_pending(hyp_synic_event_page, VMBUS_MESSAGE_SINT) || + __hv_synic_event_pending(para_synic_event_page, VMBUS_MESSAGE_SINT); +} + +static int hv_pick_new_cpu(struct vmbus_channel *channel) +{ + int ret = -EBUSY; + int start; + int cpu; + + lockdep_assert_cpus_held(); + lockdep_assert_held(&vmbus_connection.channel_mutex); + + /* + * We can't assume that the relevant interrupts will be sent before + * the cpu is offlined on older versions of hyperv. + */ + if (vmbus_proto_version < VERSION_WIN10_V5_3) + return -EBUSY; + + start = get_random_u32_below(nr_cpu_ids); + + for_each_cpu_wrap(cpu, cpu_online_mask, start) { + if (channel->target_cpu == cpu || + channel->target_cpu == VMBUS_CONNECT_CPU) + continue; + + ret = vmbus_channel_set_cpu(channel, cpu); + if (!ret) + break; + } + + if (ret) + ret = vmbus_channel_set_cpu(channel, VMBUS_CONNECT_CPU); + + return ret; +} + /* * hv_synic_cleanup - Cleanup routine for hv_synic_init(). */ int hv_synic_cleanup(unsigned int cpu) { struct vmbus_channel *channel, *sc; - bool channel_found = false; + int ret = 0; if (vmbus_connection.conn_state != CONNECTED) goto always_cleanup; @@ -456,38 +594,34 @@ int hv_synic_cleanup(unsigned int cpu) /* * Search for channels which are bound to the CPU we're about to - * cleanup. In case we find one and vmbus is still connected, we - * fail; this will effectively prevent CPU offlining. - * - * TODO: Re-bind the channels to different CPUs. + * cleanup. */ mutex_lock(&vmbus_connection.channel_mutex); list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) { if (channel->target_cpu == cpu) { - channel_found = true; - break; + ret = hv_pick_new_cpu(channel); + if (ret) { + mutex_unlock(&vmbus_connection.channel_mutex); + return ret; + } } list_for_each_entry(sc, &channel->sc_list, sc_list) { if (sc->target_cpu == cpu) { - channel_found = true; - break; + ret = hv_pick_new_cpu(sc); + if (ret) { + mutex_unlock(&vmbus_connection.channel_mutex); + return ret; + } } } - if (channel_found) - break; } mutex_unlock(&vmbus_connection.channel_mutex); - if (channel_found) - return -EBUSY; - /* - * channel_found == false means that any channels that were previously - * assigned to the CPU have been reassigned elsewhere with a call of - * vmbus_send_modifychannel(). Scan the event flags page looking for - * bits that are set and waiting with a timeout for vmbus_chan_sched() - * to process such bits. If bits are still set after this operation - * and VMBus is connected, fail the CPU offlining operation. + * Scan the event flags page looking for bits that are set and waiting + * with a timeout for vmbus_chan_sched() to process such bits. If bits + * are still set after this operation and VMBus is connected, fail the + * CPU offlining operation. */ if (vmbus_proto_version >= VERSION_WIN10_V4_1 && hv_synic_event_pending()) return -EBUSY; @@ -495,7 +629,27 @@ int hv_synic_cleanup(unsigned int cpu) always_cleanup: hv_stimer_legacy_cleanup(cpu); - hv_synic_disable_regs(cpu); + /* + * First, disable the event and message pages + * used for communicating with the host, and then + * disable the host interrupts if VMBus is not + * confidential. + */ + hv_hyp_synic_disable_regs(cpu); + if (!vmbus_is_confidential()) + hv_hyp_synic_disable_interrupts(); - return 0; + /* + * Perform the same steps for the Confidential VMBus. + * The sequencing provides the guarantee that no data + * may be posted for processing before disabling interrupts. + */ + if (vmbus_is_confidential()) { + hv_para_synic_disable_regs(cpu); + hv_para_synic_disable_interrupts(); + } + if (vmbus_irq != -1) + disable_percpu_irq(vmbus_irq); + + return ret; } diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index fec2f18679e3..9a55f5c43307 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c @@ -801,7 +801,7 @@ static int pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt) * is, create a gap and update covered_end_pfn. */ if (has->covered_end_pfn != start_pfn) { - gap = kzalloc(sizeof(struct hv_hotadd_gap), GFP_ATOMIC); + gap = kzalloc_obj(struct hv_hotadd_gap, GFP_ATOMIC); if (!gap) { ret = -ENOMEM; break; @@ -1192,6 +1192,7 @@ static void free_balloon_pages(struct hv_dynmem_device *dm, __ClearPageOffline(pg); __free_page(pg); dm->num_pages_ballooned--; + mod_node_page_state(page_pgdat(pg), NR_BALLOON_PAGES, -1); adjust_managed_page_count(pg, 1); } } @@ -1221,6 +1222,7 @@ static unsigned int alloc_balloon_pages(struct hv_dynmem_device *dm, return i * alloc_unit; dm->num_pages_ballooned += alloc_unit; + mod_node_page_state(page_pgdat(pg), NR_BALLOON_PAGES, alloc_unit); /* * If we allocatted 2M pages; split them so we @@ -1661,7 +1663,7 @@ static void enable_page_reporting(void) * We let the page_reporting_order parameter decide the order * in the page_reporting code */ - dm_device.pr_dev_info.order = 0; + dm_device.pr_dev_info.order = PAGE_REPORTING_ORDER_UNSPECIFIED; ret = page_reporting_register(&dm_device.pr_dev_info); if (ret < 0) { dm_device.pr_dev_info.report = NULL; diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c index f2e6f55d6ca6..6b67ac616789 100644 --- a/drivers/hv/hv_common.c +++ b/drivers/hv/hv_common.c @@ -31,8 +31,14 @@ #include <hyperv/hvhdk.h> #include <asm/mshyperv.h> +u64 hv_current_partition_id = HV_PARTITION_ID_SELF; +EXPORT_SYMBOL_GPL(hv_current_partition_id); + +enum hv_partition_type hv_curr_partition_type; +EXPORT_SYMBOL_GPL(hv_curr_partition_type); + /* - * hv_root_partition, ms_hyperv and hv_nested are defined here with other + * ms_hyperv and hv_nested are defined here with other * Hyper-V specific globals so they are shared across all architectures and are * built only when CONFIG_HYPERV is defined. But on x86, * ms_hyperv_init_platform() is built even when CONFIG_HYPERV is not @@ -40,9 +46,6 @@ * here, allowing for an overriding definition in the module containing * ms_hyperv_init_platform(). */ -bool __weak hv_root_partition; -EXPORT_SYMBOL_GPL(hv_root_partition); - bool __weak hv_nested; EXPORT_SYMBOL_GPL(hv_nested); @@ -66,6 +69,16 @@ static void hv_kmsg_dump_unregister(void); static struct ctl_table_header *hv_ctl_table_hdr; /* + * Per-cpu array holding the tail pointer for the SynIC event ring buffer + * for each SINT. + * + * We cannot maintain this in mshv driver because the tail pointer should + * persist even if the mshv driver is unloaded. + */ +u8 * __percpu *hv_synic_eventring_tail; +EXPORT_SYMBOL_GPL(hv_synic_eventring_tail); + +/* * Hyper-V specific initialization and shutdown code that is * common across all architectures. Called from architecture * specific initialization functions. @@ -87,46 +100,10 @@ void __init hv_common_free(void) free_percpu(hyperv_pcpu_input_arg); hyperv_pcpu_input_arg = NULL; -} - -/* - * Functions for allocating and freeing memory with size and - * alignment HV_HYP_PAGE_SIZE. These functions are needed because - * the guest page size may not be the same as the Hyper-V page - * size. We depend upon kmalloc() aligning power-of-two size - * allocations to the allocation size boundary, so that the - * allocated memory appears to Hyper-V as a page of the size - * it expects. - */ -void *hv_alloc_hyperv_page(void) -{ - BUILD_BUG_ON(PAGE_SIZE < HV_HYP_PAGE_SIZE); - - if (PAGE_SIZE == HV_HYP_PAGE_SIZE) - return (void *)__get_free_page(GFP_KERNEL); - else - return kmalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL); -} -EXPORT_SYMBOL_GPL(hv_alloc_hyperv_page); - -void *hv_alloc_hyperv_zeroed_page(void) -{ - if (PAGE_SIZE == HV_HYP_PAGE_SIZE) - return (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO); - else - return kzalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL); + free_percpu(hv_synic_eventring_tail); + hv_synic_eventring_tail = NULL; } -EXPORT_SYMBOL_GPL(hv_alloc_hyperv_zeroed_page); - -void hv_free_hyperv_page(void *addr) -{ - if (PAGE_SIZE == HV_HYP_PAGE_SIZE) - free_page((unsigned long)addr); - else - kfree(addr); -} -EXPORT_SYMBOL_GPL(hv_free_hyperv_page); static void *hv_panic_page; @@ -218,13 +195,15 @@ static void hv_kmsg_dump(struct kmsg_dumper *dumper, /* * Write dump contents to the page. No need to synchronize; panic should - * be single-threaded. + * be single-threaded. Ignore failures from kmsg_dump_get_buffer() since + * panic notification should be done even if there is no message data. + * Don't assume bytes_written is set in case of failure, so initialize it. */ kmsg_dump_rewind(&iter); - kmsg_dump_get_buffer(&iter, false, hv_panic_page, HV_HYP_PAGE_SIZE, + bytes_written = 0; + (void)kmsg_dump_get_buffer(&iter, false, hv_panic_page, HV_HYP_PAGE_SIZE, &bytes_written); - if (!bytes_written) - return; + /* * P3 to contain the physical address of the panic page & P4 to * contain the size of the panic data in that page. Rest of the @@ -233,7 +212,7 @@ static void hv_kmsg_dump(struct kmsg_dumper *dumper, hv_set_msr(HV_MSR_CRASH_P0, 0); hv_set_msr(HV_MSR_CRASH_P1, 0); hv_set_msr(HV_MSR_CRASH_P2, 0); - hv_set_msr(HV_MSR_CRASH_P3, virt_to_phys(hv_panic_page)); + hv_set_msr(HV_MSR_CRASH_P3, bytes_written ? virt_to_phys(hv_panic_page) : 0); hv_set_msr(HV_MSR_CRASH_P4, bytes_written); /* @@ -256,7 +235,7 @@ static void hv_kmsg_dump_unregister(void) atomic_notifier_chain_unregister(&panic_notifier_list, &hyperv_panic_report_block); - hv_free_hyperv_page(hv_panic_page); + kfree(hv_panic_page); hv_panic_page = NULL; } @@ -264,7 +243,7 @@ static void hv_kmsg_dump_register(void) { int ret; - hv_panic_page = hv_alloc_hyperv_zeroed_page(); + hv_panic_page = kzalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL); if (!hv_panic_page) { pr_err("Hyper-V: panic message page memory allocation failed\n"); return; @@ -273,24 +252,74 @@ static void hv_kmsg_dump_register(void) ret = kmsg_dump_register(&hv_kmsg_dumper); if (ret) { pr_err("Hyper-V: kmsg dump register error 0x%x\n", ret); - hv_free_hyperv_page(hv_panic_page); + kfree(hv_panic_page); hv_panic_page = NULL; } } static inline bool hv_output_page_exists(void) { - return hv_root_partition || IS_ENABLED(CONFIG_HYPERV_VTL_MODE); + return hv_parent_partition() || IS_ENABLED(CONFIG_HYPERV_VTL_MODE); } +void __init hv_get_partition_id(void) +{ + struct hv_output_get_partition_id *output; + unsigned long flags; + u64 status, pt_id; + + local_irq_save(flags); + output = *this_cpu_ptr(hyperv_pcpu_input_arg); + status = hv_do_hypercall(HVCALL_GET_PARTITION_ID, NULL, output); + pt_id = output->partition_id; + local_irq_restore(flags); + + if (hv_result_success(status)) + hv_current_partition_id = pt_id; + else + pr_err("Hyper-V: failed to get partition ID: %#x\n", + hv_result(status)); +} +#if IS_ENABLED(CONFIG_HYPERV_VTL_MODE) +u8 __init get_vtl(void) +{ + u64 control = HV_HYPERCALL_REP_COMP_1 | HVCALL_GET_VP_REGISTERS; + struct hv_input_get_vp_registers *input; + struct hv_output_get_vp_registers *output; + unsigned long flags; + u64 ret; + + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + output = *this_cpu_ptr(hyperv_pcpu_output_arg); + + memset(input, 0, struct_size(input, names, 1)); + input->partition_id = HV_PARTITION_ID_SELF; + input->vp_index = HV_VP_INDEX_SELF; + input->input_vtl.as_uint8 = 0; + input->names[0] = HV_REGISTER_VSM_VP_STATUS; + + ret = hv_do_hypercall(control, input, output); + if (hv_result_success(ret)) { + ret = output->values[0].reg8 & HV_VTL_MASK; + } else { + pr_err("Failed to get VTL(error: %lld) exiting...\n", ret); + BUG(); + } + + local_irq_restore(flags); + return ret; +} +#endif + int __init hv_common_init(void) { int i; union hv_hypervisor_version_info version; - /* Get information about the Hyper-V host version */ + /* Get information about the Microsoft Hypervisor version */ if (!hv_get_hypervisor_version(&version)) - pr_info("Hyper-V: Host Build %d.%d.%d.%d-%d-%d\n", + pr_info("Hyper-V: Hypervisor Build %d.%d.%d.%d-%d-%d\n", version.major_version, version.minor_version, version.build_number, version.service_number, version.service_pack, version.service_branch); @@ -350,6 +379,11 @@ int __init hv_common_init(void) BUG_ON(!hyperv_pcpu_output_arg); } + if (hv_parent_partition()) { + hv_synic_eventring_tail = alloc_percpu(u8 *); + BUG_ON(!hv_synic_eventring_tail); + } + hv_vp_index = kmalloc_array(nr_cpu_ids, sizeof(*hv_vp_index), GFP_KERNEL); if (!hv_vp_index) { @@ -438,11 +472,12 @@ error: int hv_common_cpu_init(unsigned int cpu) { void **inputarg, **outputarg; + u8 **synic_eventring_tail; u64 msr_vp_index; gfp_t flags; const int pgcount = hv_output_page_exists() ? 2 : 1; void *mem; - int ret; + int ret = 0; /* hv_cpu_init() can be called with IRQs disabled from hv_resume() */ flags = irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL; @@ -450,11 +485,11 @@ int hv_common_cpu_init(unsigned int cpu) inputarg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg); /* - * hyperv_pcpu_input_arg and hyperv_pcpu_output_arg memory is already - * allocated if this CPU was previously online and then taken offline + * The per-cpu memory is already allocated if this CPU was previously + * online and then taken offline */ if (!*inputarg) { - mem = kmalloc(pgcount * HV_HYP_PAGE_SIZE, flags); + mem = kmalloc_array(pgcount, HV_HYP_PAGE_SIZE, flags); if (!mem) return -ENOMEM; @@ -498,11 +533,21 @@ int hv_common_cpu_init(unsigned int cpu) if (msr_vp_index > hv_max_vp_index) hv_max_vp_index = msr_vp_index; - return 0; + if (hv_parent_partition()) { + synic_eventring_tail = (u8 **)this_cpu_ptr(hv_synic_eventring_tail); + *synic_eventring_tail = kcalloc(HV_SYNIC_SINT_COUNT, + sizeof(u8), flags); + /* No need to unwind any of the above on failure here */ + if (unlikely(!*synic_eventring_tail)) + ret = -ENOMEM; + } + + return ret; } int hv_common_cpu_die(unsigned int cpu) { + u8 **synic_eventring_tail; /* * The hyperv_pcpu_input_arg and hyperv_pcpu_output_arg memory * is not freed when the CPU goes offline as the hyperv_pcpu_input_arg @@ -515,6 +560,12 @@ int hv_common_cpu_die(unsigned int cpu) * originally allocated memory is reused in hv_common_cpu_init(). */ + if (hv_parent_partition()) { + synic_eventring_tail = this_cpu_ptr(hv_synic_eventring_tail); + kfree(*synic_eventring_tail); + *synic_eventring_tail = NULL; + } + return 0; } @@ -572,7 +623,7 @@ EXPORT_SYMBOL_GPL(hv_setup_dma_ops); bool hv_is_hibernation_supported(void) { - return !hv_root_partition && acpi_sleep_state_supported(ACPI_STATE_S4); + return !hv_root_partition() && acpi_sleep_state_supported(ACPI_STATE_S4); } EXPORT_SYMBOL_GPL(hv_is_hibernation_supported); @@ -625,6 +676,11 @@ void __weak hv_remove_vmbus_handler(void) } EXPORT_SYMBOL_GPL(hv_remove_vmbus_handler); +void __weak hv_setup_mshv_handler(void (*handler)(void)) +{ +} +EXPORT_SYMBOL_GPL(hv_setup_mshv_handler); + void __weak hv_setup_kexec_handler(void (*handler)(void)) { } @@ -661,3 +717,149 @@ u64 __weak hv_tdx_hypercall(u64 control, u64 param1, u64 param2) return HV_STATUS_INVALID_PARAMETER; } EXPORT_SYMBOL_GPL(hv_tdx_hypercall); + +void __weak hv_enable_coco_interrupt(unsigned int cpu, unsigned int vector, bool set) +{ +} +EXPORT_SYMBOL_GPL(hv_enable_coco_interrupt); + +void __weak hv_para_set_sint_proxy(bool enable) +{ +} +EXPORT_SYMBOL_GPL(hv_para_set_sint_proxy); + +u64 __weak hv_para_get_synic_register(unsigned int reg) +{ + return ~0ULL; +} +EXPORT_SYMBOL_GPL(hv_para_get_synic_register); + +void __weak hv_para_set_synic_register(unsigned int reg, u64 val) +{ +} +EXPORT_SYMBOL_GPL(hv_para_set_synic_register); + +void hv_identify_partition_type(void) +{ + /* Assume guest role */ + hv_curr_partition_type = HV_PARTITION_TYPE_GUEST; + /* + * Check partition creation and cpu management privileges + * + * Hyper-V should never specify running as root and as a Confidential + * VM. But to protect against a compromised/malicious Hyper-V trying + * to exploit root behavior to expose Confidential VM memory, ignore + * the root partition setting if also a Confidential VM. + */ + if ((ms_hyperv.priv_high & HV_CREATE_PARTITIONS) && + !(ms_hyperv.priv_high & HV_ISOLATION)) { + + if (!IS_ENABLED(CONFIG_MSHV_ROOT)) { + pr_crit("Hyper-V: CONFIG_MSHV_ROOT not enabled!\n"); + } else if (ms_hyperv.priv_high & HV_CPU_MANAGEMENT) { + pr_info("Hyper-V: running as root partition\n"); + hv_curr_partition_type = HV_PARTITION_TYPE_ROOT; + } else { + pr_info("Hyper-V: running as L1VH partition\n"); + hv_curr_partition_type = HV_PARTITION_TYPE_L1VH; + } + } +} + +struct hv_status_info { + char *string; + int errno; + u16 code; +}; + +/* + * Note on the errno mappings: + * A failed hypercall is usually only recoverable (or loggable) near + * the call site where the HV_STATUS_* code is known. So the errno + * it gets converted to is not too useful further up the stack. + * Provide a few mappings that could be useful, and revert to -EIO + * as a fallback. + */ +static const struct hv_status_info hv_status_infos[] = { +#define _STATUS_INFO(status, errno) { #status, (errno), (status) } + _STATUS_INFO(HV_STATUS_SUCCESS, 0), + _STATUS_INFO(HV_STATUS_INVALID_HYPERCALL_CODE, -EINVAL), + _STATUS_INFO(HV_STATUS_INVALID_HYPERCALL_INPUT, -EINVAL), + _STATUS_INFO(HV_STATUS_INVALID_ALIGNMENT, -EIO), + _STATUS_INFO(HV_STATUS_INVALID_PARAMETER, -EINVAL), + _STATUS_INFO(HV_STATUS_ACCESS_DENIED, -EIO), + _STATUS_INFO(HV_STATUS_INVALID_PARTITION_STATE, -EIO), + _STATUS_INFO(HV_STATUS_OPERATION_DENIED, -EIO), + _STATUS_INFO(HV_STATUS_UNKNOWN_PROPERTY, -EIO), + _STATUS_INFO(HV_STATUS_PROPERTY_VALUE_OUT_OF_RANGE, -EIO), + _STATUS_INFO(HV_STATUS_INSUFFICIENT_MEMORY, -ENOMEM), + _STATUS_INFO(HV_STATUS_INSUFFICIENT_CONTIGUOUS_MEMORY, -ENOMEM), + _STATUS_INFO(HV_STATUS_INSUFFICIENT_ROOT_MEMORY, -ENOMEM), + _STATUS_INFO(HV_STATUS_INSUFFICIENT_CONTIGUOUS_ROOT_MEMORY, -ENOMEM), + _STATUS_INFO(HV_STATUS_INVALID_PARTITION_ID, -EINVAL), + _STATUS_INFO(HV_STATUS_INVALID_VP_INDEX, -EINVAL), + _STATUS_INFO(HV_STATUS_NOT_FOUND, -EIO), + _STATUS_INFO(HV_STATUS_INVALID_PORT_ID, -EINVAL), + _STATUS_INFO(HV_STATUS_INVALID_CONNECTION_ID, -EINVAL), + _STATUS_INFO(HV_STATUS_INSUFFICIENT_BUFFERS, -EIO), + _STATUS_INFO(HV_STATUS_NOT_ACKNOWLEDGED, -EIO), + _STATUS_INFO(HV_STATUS_INVALID_VP_STATE, -EIO), + _STATUS_INFO(HV_STATUS_NO_RESOURCES, -EIO), + _STATUS_INFO(HV_STATUS_PROCESSOR_FEATURE_NOT_SUPPORTED, -EIO), + _STATUS_INFO(HV_STATUS_INVALID_LP_INDEX, -EINVAL), + _STATUS_INFO(HV_STATUS_INVALID_REGISTER_VALUE, -EINVAL), + _STATUS_INFO(HV_STATUS_INVALID_LP_INDEX, -EIO), + _STATUS_INFO(HV_STATUS_INVALID_REGISTER_VALUE, -EIO), + _STATUS_INFO(HV_STATUS_OPERATION_FAILED, -EIO), + _STATUS_INFO(HV_STATUS_TIME_OUT, -EIO), + _STATUS_INFO(HV_STATUS_CALL_PENDING, -EIO), + _STATUS_INFO(HV_STATUS_VTL_ALREADY_ENABLED, -EIO), +#undef _STATUS_INFO +}; + +static inline const struct hv_status_info *find_hv_status_info(u64 hv_status) +{ + int i; + u16 code = hv_result(hv_status); + + for (i = 0; i < ARRAY_SIZE(hv_status_infos); ++i) { + const struct hv_status_info *info = &hv_status_infos[i]; + + if (info->code == code) + return info; + } + + return NULL; +} + +/* Convert a hypercall result into a linux-friendly error code. */ +int hv_result_to_errno(u64 status) +{ + const struct hv_status_info *info; + + /* hv_do_hypercall() may return U64_MAX, hypercalls aren't possible */ + if (unlikely(status == U64_MAX)) + return -EOPNOTSUPP; + + info = find_hv_status_info(status); + if (info) + return info->errno; + + return -EIO; +} +EXPORT_SYMBOL_GPL(hv_result_to_errno); + +const char *hv_result_to_string(u64 status) +{ + const struct hv_status_info *info; + + if (unlikely(status == U64_MAX)) + return "Hypercall page missing!"; + + info = find_hv_status_info(status); + if (info) + return info->string; + + return "Unknown"; +} +EXPORT_SYMBOL_GPL(hv_result_to_string); diff --git a/drivers/hv/hv_kvp.c b/drivers/hv/hv_kvp.c index 62795f6cbb00..0d73daf745a7 100644 --- a/drivers/hv/hv_kvp.c +++ b/drivers/hv/hv_kvp.c @@ -134,7 +134,7 @@ kvp_register(int reg_value) struct hv_kvp_msg *kvp_msg; char *version; - kvp_msg = kzalloc(sizeof(*kvp_msg), GFP_KERNEL); + kvp_msg = kzalloc_obj(*kvp_msg); if (kvp_msg) { version = kvp_msg->body.kvp_register.version; @@ -385,7 +385,7 @@ kvp_send_key(struct work_struct *dummy) if (kvp_transaction.state != HVUTIL_HOSTMSG_RECEIVED) return; - message = kzalloc(sizeof(*message), GFP_KERNEL); + message = kzalloc_obj(*message); if (!message) return; diff --git a/drivers/hv/hv_proc.c b/drivers/hv/hv_proc.c new file mode 100644 index 000000000000..57b2c64197cb --- /dev/null +++ b/drivers/hv/hv_proc.c @@ -0,0 +1,288 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/types.h> +#include <linux/vmalloc.h> +#include <linux/mm.h> +#include <linux/clockchips.h> +#include <linux/slab.h> +#include <linux/cpuhotplug.h> +#include <linux/minmax.h> +#include <linux/export.h> +#include <asm/mshyperv.h> + +/* + * See struct hv_deposit_memory. The first u64 is partition ID, the rest + * are GPAs. + */ +#define HV_DEPOSIT_MAX (HV_HYP_PAGE_SIZE / sizeof(u64) - 1) + +/* Deposits exact number of pages. Must be called with interrupts enabled. */ +int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages) +{ + struct page **pages, *page; + int *counts; + int num_allocations; + int i, j, page_count; + int order; + u64 status; + int ret; + u64 base_pfn; + struct hv_deposit_memory *input_page; + unsigned long flags; + + if (num_pages > HV_DEPOSIT_MAX) + return -E2BIG; + if (!num_pages) + return 0; + + /* One buffer for page pointers and counts */ + page = alloc_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + pages = page_address(page); + + counts = kzalloc_objs(int, HV_DEPOSIT_MAX); + if (!counts) { + free_page((unsigned long)pages); + return -ENOMEM; + } + + /* Allocate all the pages before disabling interrupts */ + i = 0; + + while (num_pages) { + /* Find highest order we can actually allocate */ + order = 31 - __builtin_clz(num_pages); + + while (1) { + pages[i] = alloc_pages_node(node, GFP_KERNEL, order); + if (pages[i]) + break; + if (!order) { + ret = -ENOMEM; + num_allocations = i; + goto err_free_allocations; + } + --order; + } + + split_page(pages[i], order); + counts[i] = 1 << order; + num_pages -= counts[i]; + i++; + } + num_allocations = i; + + local_irq_save(flags); + + input_page = *this_cpu_ptr(hyperv_pcpu_input_arg); + + input_page->partition_id = partition_id; + + /* Populate gpa_page_list - these will fit on the input page */ + for (i = 0, page_count = 0; i < num_allocations; ++i) { + base_pfn = page_to_pfn(pages[i]); + for (j = 0; j < counts[i]; ++j, ++page_count) + input_page->gpa_page_list[page_count] = base_pfn + j; + } + status = hv_do_rep_hypercall(HVCALL_DEPOSIT_MEMORY, + page_count, 0, input_page, NULL); + local_irq_restore(flags); + if (!hv_result_success(status)) { + hv_status_err(status, "\n"); + ret = hv_result_to_errno(status); + goto err_free_allocations; + } + + ret = 0; + goto free_buf; + +err_free_allocations: + for (i = 0; i < num_allocations; ++i) { + base_pfn = page_to_pfn(pages[i]); + for (j = 0; j < counts[i]; ++j) + __free_page(pfn_to_page(base_pfn + j)); + } + +free_buf: + free_page((unsigned long)pages); + kfree(counts); + return ret; +} +EXPORT_SYMBOL_GPL(hv_call_deposit_pages); + +int hv_deposit_memory_node(int node, u64 partition_id, + u64 hv_status) +{ + u32 num_pages = 1; + + switch (hv_result(hv_status)) { + case HV_STATUS_INSUFFICIENT_MEMORY: + break; + case HV_STATUS_INSUFFICIENT_CONTIGUOUS_MEMORY: + num_pages = HV_MAX_CONTIGUOUS_ALLOCATION_PAGES; + break; + + case HV_STATUS_INSUFFICIENT_CONTIGUOUS_ROOT_MEMORY: + num_pages = HV_MAX_CONTIGUOUS_ALLOCATION_PAGES; + fallthrough; + case HV_STATUS_INSUFFICIENT_ROOT_MEMORY: + if (!hv_root_partition()) { + hv_status_err(hv_status, "Unexpected root memory deposit\n"); + return -ENOMEM; + } + partition_id = HV_PARTITION_ID_SELF; + break; + + default: + hv_status_err(hv_status, "Unexpected!\n"); + return -ENOMEM; + } + return hv_call_deposit_pages(node, partition_id, num_pages); +} +EXPORT_SYMBOL_GPL(hv_deposit_memory_node); + +bool hv_result_needs_memory(u64 status) +{ + switch (hv_result(status)) { + case HV_STATUS_INSUFFICIENT_MEMORY: + case HV_STATUS_INSUFFICIENT_CONTIGUOUS_MEMORY: + case HV_STATUS_INSUFFICIENT_ROOT_MEMORY: + case HV_STATUS_INSUFFICIENT_CONTIGUOUS_ROOT_MEMORY: + return true; + } + return false; +} +EXPORT_SYMBOL_GPL(hv_result_needs_memory); + +int hv_call_add_logical_proc(int node, u32 lp_index, u32 apic_id) +{ + struct hv_input_add_logical_processor *input; + struct hv_output_add_logical_processor *output; + u64 status; + unsigned long flags; + int ret = 0; + + /* + * When adding a logical processor, the hypervisor may return + * HV_STATUS_INSUFFICIENT_MEMORY. When that happens, we deposit more + * pages and retry. + */ + do { + local_irq_save(flags); + + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + /* We don't do anything with the output right now */ + output = *this_cpu_ptr(hyperv_pcpu_output_arg); + + input->lp_index = lp_index; + input->apic_id = apic_id; + input->proximity_domain_info = hv_numa_node_to_pxm_info(node); + status = hv_do_hypercall(HVCALL_ADD_LOGICAL_PROCESSOR, + input, output); + local_irq_restore(flags); + + if (!hv_result_needs_memory(status)) { + if (!hv_result_success(status)) { + hv_status_err(status, "cpu %u apic ID: %u\n", + lp_index, apic_id); + ret = hv_result_to_errno(status); + } + break; + } + ret = hv_deposit_memory_node(node, hv_current_partition_id, + status); + } while (!ret); + + return ret; +} + +int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags) +{ + struct hv_create_vp *input; + u64 status; + unsigned long irq_flags; + int ret = 0; + + /* Root VPs don't seem to need pages deposited */ + if (partition_id != hv_current_partition_id) { + /* The value 90 is empirically determined. It may change. */ + ret = hv_call_deposit_pages(node, partition_id, 90); + if (ret) + return ret; + } + + do { + local_irq_save(irq_flags); + + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + + input->partition_id = partition_id; + input->vp_index = vp_index; + input->flags = flags; + input->subnode_type = HV_SUBNODE_ANY; + input->proximity_domain_info = hv_numa_node_to_pxm_info(node); + status = hv_do_hypercall(HVCALL_CREATE_VP, input, NULL); + local_irq_restore(irq_flags); + + if (!hv_result_needs_memory(status)) { + if (!hv_result_success(status)) { + hv_status_err(status, "vcpu: %u, lp: %u\n", + vp_index, flags); + ret = hv_result_to_errno(status); + } + break; + } + ret = hv_deposit_memory_node(node, partition_id, status); + + } while (!ret); + + return ret; +} +EXPORT_SYMBOL_GPL(hv_call_create_vp); + +int hv_call_notify_all_processors_started(void) +{ + struct hv_input_notify_partition_event *input; + u64 status; + unsigned long irq_flags; + int ret = 0; + + local_irq_save(irq_flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + memset(input, 0, sizeof(*input)); + input->event = HV_PARTITION_ALL_LOGICAL_PROCESSORS_STARTED; + status = hv_do_hypercall(HVCALL_NOTIFY_PARTITION_EVENT, + input, NULL); + local_irq_restore(irq_flags); + + if (!hv_result_success(status)) { + hv_status_err(status, "\n"); + ret = hv_result_to_errno(status); + } + return ret; +} + +bool hv_lp_exists(u32 lp_index) +{ + struct hv_input_get_logical_processor_run_time *input; + struct hv_output_get_logical_processor_run_time *output; + unsigned long flags; + u64 status; + + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + output = *this_cpu_ptr(hyperv_pcpu_output_arg); + + input->lp_index = lp_index; + status = hv_do_hypercall(HVCALL_GET_LOGICAL_PROCESSOR_RUN_TIME, + input, output); + local_irq_restore(flags); + + if (!hv_result_success(status) && + hv_result(status) != HV_STATUS_INVALID_LP_INDEX) { + hv_status_err(status, "\n"); + BUG(); + } + + return hv_result_success(status); +} diff --git a/drivers/hv/hv_snapshot.c b/drivers/hv/hv_snapshot.c index 2e7f537d53cf..506871aeacf0 100644 --- a/drivers/hv/hv_snapshot.c +++ b/drivers/hv/hv_snapshot.c @@ -184,7 +184,7 @@ static void vss_send_op(void) return; } - vss_msg = kzalloc(sizeof(*vss_msg), GFP_KERNEL); + vss_msg = kzalloc_obj(*vss_msg); if (!vss_msg) return; @@ -424,7 +424,7 @@ int hv_vss_pre_suspend(void) * write() will fail with EINVAL (see vss_on_msg()), and the daemon * will reset the device by closing and re-opening it. */ - vss_msg = kzalloc(sizeof(*vss_msg), GFP_KERNEL); + vss_msg = kzalloc_obj(*vss_msg); if (!vss_msg) return -ENOMEM; diff --git a/drivers/hv/hv_util.c b/drivers/hv/hv_util.c index 36ee89c0358b..7e9c8e169c66 100644 --- a/drivers/hv/hv_util.c +++ b/drivers/hv/hv_util.c @@ -586,7 +586,7 @@ static int util_probe(struct hv_device *dev, (struct hv_util_service *)dev_id->driver_data; int ret; - srv->recv_buffer = kmalloc(HV_HYP_PAGE_SIZE * 4, GFP_KERNEL); + srv->recv_buffer = kmalloc_array(4, HV_HYP_PAGE_SIZE, GFP_KERNEL); if (!srv->recv_buffer) return -ENOMEM; srv->channel = dev->channel; diff --git a/drivers/hv/hv_utils_transport.c b/drivers/hv/hv_utils_transport.c index 832885198643..c3be5c570263 100644 --- a/drivers/hv/hv_utils_transport.c +++ b/drivers/hv/hv_utils_transport.c @@ -129,8 +129,7 @@ static int hvt_op_open(struct inode *inode, struct file *file) * device gets released. */ hvt->mode = HVUTIL_TRANSPORT_CHARDEV; - } - else if (hvt->mode == HVUTIL_TRANSPORT_NETLINK) { + } else if (hvt->mode == HVUTIL_TRANSPORT_NETLINK) { /* * We're switching from netlink communication to using char * device. Issue the reset first. @@ -195,7 +194,7 @@ static void hvt_cn_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp) } spin_unlock(&hvt_list_lock); if (!hvt_found) { - pr_warn("hvt_cn_callback: spurious message received!\n"); + pr_warn("%s: spurious message received!\n", __func__); return; } @@ -210,7 +209,7 @@ static void hvt_cn_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp) if (hvt->mode == HVUTIL_TRANSPORT_NETLINK) hvt_found->on_msg(msg->data, msg->len); else - pr_warn("hvt_cn_callback: unexpected netlink message!\n"); + pr_warn("%s: unexpected netlink message!\n", __func__); mutex_unlock(&hvt->lock); } @@ -260,8 +259,9 @@ int hvutil_transport_send(struct hvutil_transport *hvt, void *msg, int len, hvt->outmsg_len = len; hvt->on_read = on_read_cb; wake_up_interruptible(&hvt->outmsg_q); - } else + } else { ret = -ENOMEM; + } out_unlock: mutex_unlock(&hvt->lock); return ret; @@ -274,7 +274,7 @@ struct hvutil_transport *hvutil_transport_init(const char *name, { struct hvutil_transport *hvt; - hvt = kzalloc(sizeof(*hvt), GFP_KERNEL); + hvt = kzalloc_obj(*hvt); if (!hvt) return NULL; diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index 29780f3a7478..05a36854389a 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -15,6 +15,7 @@ #include <linux/list.h> #include <linux/bitops.h> #include <asm/sync_bitops.h> +#include <asm/mshyperv.h> #include <linux/atomic.h> #include <linux/hyperv.h> #include <linux/interrupt.h> @@ -32,6 +33,7 @@ */ #define HV_UTIL_NEGO_TIMEOUT 55 +void vmbus_isr(void); /* Definitions for the monitored notification facility */ union hv_monitor_trigger_group { @@ -120,8 +122,26 @@ enum { * Per cpu state for channel handling */ struct hv_per_cpu_context { - void *synic_message_page; - void *synic_event_page; + /* + * SynIC pages for communicating with the host. + * + * These pages are accessible to the host partition and the hypervisor. + * They may be used for exchanging data with the host partition and the + * hypervisor even when they aren't trusted yet the guest partition + * must be prepared to handle the malicious behavior. + */ + void *hyp_synic_message_page; + void *hyp_synic_event_page; + /* + * SynIC pages for communicating with the paravisor. + * + * These pages may be accessed from within the guest partition only in + * CoCo VMs. Neither the host partition nor the hypervisor can access + * these pages in that case; they are used for exchanging data with the + * paravisor. + */ + void *para_synic_message_page; + void *para_synic_event_page; /* * The page is only used in hv_post_message() for a TDX VM (with the @@ -171,10 +191,10 @@ extern int hv_synic_alloc(void); extern void hv_synic_free(void); -extern void hv_synic_enable_regs(unsigned int cpu); +extern void hv_hyp_synic_enable_regs(unsigned int cpu); extern int hv_synic_init(unsigned int cpu); -extern void hv_synic_disable_regs(unsigned int cpu); +extern void hv_hyp_synic_disable_regs(unsigned int cpu); extern int hv_synic_cleanup(unsigned int cpu); /* Interface */ @@ -182,7 +202,8 @@ extern int hv_synic_cleanup(unsigned int cpu); void hv_ringbuffer_pre_init(struct vmbus_channel *channel); int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info, - struct page *pages, u32 pagecnt, u32 max_pkt_size); + struct page *pages, u32 pagecnt, u32 max_pkt_size, + bool confidential); void hv_ringbuffer_cleanup(struct hv_ring_buffer_info *ring_info); @@ -255,8 +276,9 @@ struct vmbus_connection { struct list_head chn_list; struct mutex channel_mutex; - /* Array of channels */ + /* Array of channel pointers, indexed by relid */ struct vmbus_channel **channels; + u32 relid_hiwater; /* * An offer message is handled first on the work_queue, and then @@ -333,6 +355,51 @@ extern const struct vmbus_channel_message_table_entry /* General vmbus interface */ +bool vmbus_is_confidential(void); + +#if IS_ENABLED(CONFIG_HYPERV_VMBUS) +/* Free the message slot and signal end-of-message if required */ +static inline void vmbus_signal_eom(struct hv_message *msg, u32 old_msg_type) +{ + /* + * On crash we're reading some other CPU's message page and we need + * to be careful: this other CPU may already had cleared the header + * and the host may already had delivered some other message there. + * In case we blindly write msg->header.message_type we're going + * to lose it. We can still lose a message of the same type but + * we count on the fact that there can only be one + * CHANNELMSG_UNLOAD_RESPONSE and we don't care about other messages + * on crash. + */ + if (!try_cmpxchg(&msg->header.message_type, + &old_msg_type, HVMSG_NONE)) + return; + + /* + * The cmpxchg() above does an implicit memory barrier to + * ensure the write to MessageType (ie set to + * HVMSG_NONE) happens before we read the + * MessagePending and EOMing. Otherwise, the EOMing + * will not deliver any more messages since there is + * no empty slot + */ + if (msg->header.message_flags.msg_pending) { + /* + * This will cause message queue rescan to + * possibly deliver another msg from the + * hypervisor + */ + if (vmbus_is_confidential()) + hv_para_set_synic_register(HV_MSR_EOM, 0); + else + hv_set_msr(HV_MSR_EOM, 0); + } +} + +extern int vmbus_interrupt; +extern int vmbus_irq; +#endif /* CONFIG_HYPERV_VMBUS */ + struct hv_device *vmbus_device_create(const guid_t *type, const guid_t *instance, struct vmbus_channel *channel); @@ -477,4 +544,10 @@ static inline int hv_debug_add_dev_dir(struct hv_device *dev) #endif /* CONFIG_HYPERV_TESTING */ +/* Create and remove sysfs entry for memory mapped ring buffers for a channel */ +int hv_create_ring_sysfs(struct vmbus_channel *channel, + int (*hv_mmap_prepare_ring_buffer)(struct vmbus_channel *channel, + struct vm_area_desc *desc)); +int hv_remove_ring_sysfs(struct vmbus_channel *channel); + #endif /* _HYPERV_VMBUS_H */ diff --git a/drivers/hv/mshv.h b/drivers/hv/mshv.h new file mode 100644 index 000000000000..d4813df92b9c --- /dev/null +++ b/drivers/hv/mshv.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2023, Microsoft Corporation. + */ + +#ifndef _MSHV_H_ +#define _MSHV_H_ + +#include <linux/stddef.h> +#include <linux/string.h> +#include <hyperv/hvhdk.h> + +#define mshv_field_nonzero(STRUCT, MEMBER) \ + memchr_inv(&((STRUCT).MEMBER), \ + 0, sizeof_field(typeof(STRUCT), MEMBER)) + +int hv_call_get_vp_registers(u32 vp_index, u64 partition_id, u16 count, + union hv_input_vtl input_vtl, + struct hv_register_assoc *registers); + +int hv_call_set_vp_registers(u32 vp_index, u64 partition_id, u16 count, + union hv_input_vtl input_vtl, + struct hv_register_assoc *registers); + +int hv_call_get_partition_property(u64 partition_id, u64 property_code, + u64 *property_value); + +#endif /* _MSHV_H */ diff --git a/drivers/hv/mshv_common.c b/drivers/hv/mshv_common.c new file mode 100644 index 000000000000..63f09bb5136e --- /dev/null +++ b/drivers/hv/mshv_common.c @@ -0,0 +1,241 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2024, Microsoft Corporation. + * + * This file contains functions that will be called from one or more modules. + * If any of these modules are configured to build, this file is built and just + * statically linked in. + * + * Authors: Microsoft Linux virtualization team + */ + +#include <linux/kernel.h> +#include <linux/mm.h> +#include <asm/mshyperv.h> +#include <linux/resume_user_mode.h> +#include <linux/export.h> +#include <linux/acpi.h> +#include <linux/notifier.h> +#include <linux/reboot.h> + +#include "mshv.h" + +#define HV_GET_REGISTER_BATCH_SIZE \ + (HV_HYP_PAGE_SIZE / sizeof(union hv_register_value)) +#define HV_SET_REGISTER_BATCH_SIZE \ + ((HV_HYP_PAGE_SIZE - sizeof(struct hv_input_set_vp_registers)) \ + / sizeof(struct hv_register_assoc)) + +int hv_call_get_vp_registers(u32 vp_index, u64 partition_id, u16 count, + union hv_input_vtl input_vtl, + struct hv_register_assoc *registers) +{ + struct hv_input_get_vp_registers *input_page; + union hv_register_value *output_page; + u16 completed = 0; + unsigned long remaining = count; + int rep_count, i; + u64 status = HV_STATUS_SUCCESS; + unsigned long flags; + + local_irq_save(flags); + + input_page = *this_cpu_ptr(hyperv_pcpu_input_arg); + output_page = *this_cpu_ptr(hyperv_pcpu_output_arg); + + input_page->partition_id = partition_id; + input_page->vp_index = vp_index; + input_page->input_vtl.as_uint8 = input_vtl.as_uint8; + input_page->rsvd_z8 = 0; + input_page->rsvd_z16 = 0; + + while (remaining) { + rep_count = min(remaining, HV_GET_REGISTER_BATCH_SIZE); + for (i = 0; i < rep_count; ++i) + input_page->names[i] = registers[i].name; + + status = hv_do_rep_hypercall(HVCALL_GET_VP_REGISTERS, rep_count, + 0, input_page, output_page); + if (!hv_result_success(status)) + break; + + completed = hv_repcomp(status); + for (i = 0; i < completed; ++i) + registers[i].value = output_page[i]; + + registers += completed; + remaining -= completed; + } + local_irq_restore(flags); + + return hv_result_to_errno(status); +} +EXPORT_SYMBOL_GPL(hv_call_get_vp_registers); + +int hv_call_set_vp_registers(u32 vp_index, u64 partition_id, u16 count, + union hv_input_vtl input_vtl, + struct hv_register_assoc *registers) +{ + struct hv_input_set_vp_registers *input_page; + u16 completed = 0; + unsigned long remaining = count; + int rep_count; + u64 status = HV_STATUS_SUCCESS; + unsigned long flags; + + local_irq_save(flags); + input_page = *this_cpu_ptr(hyperv_pcpu_input_arg); + + input_page->partition_id = partition_id; + input_page->vp_index = vp_index; + input_page->input_vtl.as_uint8 = input_vtl.as_uint8; + input_page->rsvd_z8 = 0; + input_page->rsvd_z16 = 0; + + while (remaining) { + rep_count = min(remaining, HV_SET_REGISTER_BATCH_SIZE); + memcpy(input_page->elements, registers, + sizeof(struct hv_register_assoc) * rep_count); + + status = hv_do_rep_hypercall(HVCALL_SET_VP_REGISTERS, rep_count, + 0, input_page, NULL); + if (!hv_result_success(status)) + break; + + completed = hv_repcomp(status); + registers += completed; + remaining -= completed; + } + + local_irq_restore(flags); + + return hv_result_to_errno(status); +} +EXPORT_SYMBOL_GPL(hv_call_set_vp_registers); + +int hv_call_get_partition_property(u64 partition_id, + u64 property_code, + u64 *property_value) +{ + u64 status; + unsigned long flags; + struct hv_input_get_partition_property *input; + struct hv_output_get_partition_property *output; + + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + output = *this_cpu_ptr(hyperv_pcpu_output_arg); + memset(input, 0, sizeof(*input)); + input->partition_id = partition_id; + input->property_code = property_code; + status = hv_do_hypercall(HVCALL_GET_PARTITION_PROPERTY, input, output); + + if (!hv_result_success(status)) { + local_irq_restore(flags); + return hv_result_to_errno(status); + } + *property_value = output->property_value; + + local_irq_restore(flags); + + return 0; +} +EXPORT_SYMBOL_GPL(hv_call_get_partition_property); + +#ifdef CONFIG_X86 +/* + * Corresponding sleep states have to be initialized in order for a subsequent + * HVCALL_ENTER_SLEEP_STATE call to succeed. Currently only S5 state as per + * ACPI 6.4 chapter 7.4.2 is relevant, while S1, S2 and S3 can be supported. + * + * In order to pass proper PM values to mshv, ACPI should be initialized and + * should support S5 sleep state when this method is invoked. + */ +static int hv_initialize_sleep_states(void) +{ + u64 status; + unsigned long flags; + struct hv_input_set_system_property *in; + acpi_status acpi_status; + u8 sleep_type_a, sleep_type_b; + + if (!acpi_sleep_state_supported(ACPI_STATE_S5)) { + pr_err("%s: S5 sleep state not supported.\n", __func__); + return -ENODEV; + } + + acpi_status = acpi_get_sleep_type_data(ACPI_STATE_S5, &sleep_type_a, + &sleep_type_b); + if (ACPI_FAILURE(acpi_status)) + return -ENODEV; + + local_irq_save(flags); + in = *this_cpu_ptr(hyperv_pcpu_input_arg); + memset(in, 0, sizeof(*in)); + + in->property_id = HV_SYSTEM_PROPERTY_SLEEP_STATE; + in->set_sleep_state_info.sleep_state = HV_SLEEP_STATE_S5; + in->set_sleep_state_info.pm1a_slp_typ = sleep_type_a; + in->set_sleep_state_info.pm1b_slp_typ = sleep_type_b; + + status = hv_do_hypercall(HVCALL_SET_SYSTEM_PROPERTY, in, NULL); + local_irq_restore(flags); + + if (!hv_result_success(status)) { + hv_status_err(status, "\n"); + return hv_result_to_errno(status); + } + + return 0; +} + +/* + * This notifier initializes sleep states in mshv hypervisor which will be + * used during power off. + */ +static int hv_reboot_notifier_handler(struct notifier_block *this, + unsigned long code, void *another) +{ + int ret = 0; + + if (code == SYS_HALT || code == SYS_POWER_OFF) + ret = hv_initialize_sleep_states(); + + return ret ? NOTIFY_DONE : NOTIFY_OK; +} + +static struct notifier_block hv_reboot_notifier = { + .notifier_call = hv_reboot_notifier_handler, +}; + +void hv_sleep_notifiers_register(void) +{ + int ret; + + ret = register_reboot_notifier(&hv_reboot_notifier); + if (ret) + pr_err("%s: cannot register reboot notifier %d\n", __func__, + ret); +} + +/* + * Power off the machine by entering S5 sleep state via Hyper-V hypercall. + * This call does not return if successful. + */ +void hv_machine_power_off(void) +{ + unsigned long flags; + struct hv_input_enter_sleep_state *in; + + local_irq_save(flags); + in = *this_cpu_ptr(hyperv_pcpu_input_arg); + in->sleep_state = HV_SLEEP_STATE_S5; + + (void)hv_do_hypercall(HVCALL_ENTER_SLEEP_STATE, in, NULL); + local_irq_restore(flags); + + /* should never reach here */ + BUG(); + +} +#endif diff --git a/drivers/hv/mshv_debugfs.c b/drivers/hv/mshv_debugfs.c new file mode 100644 index 000000000000..418b6dc8f3c2 --- /dev/null +++ b/drivers/hv/mshv_debugfs.c @@ -0,0 +1,724 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2026, Microsoft Corporation. + * + * The /sys/kernel/debug/mshv directory contents. + * Contains various statistics data, provided by the hypervisor. + * + * Authors: Microsoft Linux virtualization team + */ + +#include <linux/debugfs.h> +#include <linux/stringify.h> +#include <asm/mshyperv.h> +#include <linux/slab.h> + +#include "mshv.h" +#include "mshv_root.h" + +/* Ensure this file is not used elsewhere by accident */ +#define MSHV_DEBUGFS_C +#include "mshv_debugfs_counters.c" + +#define U32_BUF_SZ 11 +#define U64_BUF_SZ 21 +/* Only support SELF and PARENT areas */ +#define NUM_STATS_AREAS 2 +static_assert(HV_STATS_AREA_SELF == 0 && HV_STATS_AREA_PARENT == 1, + "SELF and PARENT areas must be usable as indices into an array of size NUM_STATS_AREAS"); +/* HV_HYPERVISOR_COUNTER */ +#define HV_HYPERVISOR_COUNTER_LOGICAL_PROCESSORS 1 + +static struct dentry *mshv_debugfs; +static struct dentry *mshv_debugfs_partition; +static struct dentry *mshv_debugfs_lp; +static struct dentry **parent_vp_stats; +static struct dentry *parent_partition_stats; + +static u64 mshv_lps_count; +static struct hv_stats_page **mshv_lps_stats; + +static int lp_stats_show(struct seq_file *m, void *v) +{ + const struct hv_stats_page *stats = m->private; + int idx; + + for (idx = 0; idx < ARRAY_SIZE(hv_lp_counters); idx++) { + char *name = hv_lp_counters[idx]; + + if (!name) + continue; + seq_printf(m, "%-32s: %llu\n", name, stats->data[idx]); + } + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(lp_stats); + +static void mshv_lp_stats_unmap(u32 lp_index) +{ + union hv_stats_object_identity identity = { + .lp.lp_index = lp_index, + .lp.stats_area_type = HV_STATS_AREA_SELF, + }; + int err; + + err = hv_unmap_stats_page(HV_STATS_OBJECT_LOGICAL_PROCESSOR, + mshv_lps_stats[lp_index], &identity); + if (err) + pr_err("%s: failed to unmap logical processor %u stats, err: %d\n", + __func__, lp_index, err); + + mshv_lps_stats[lp_index] = NULL; +} + +static struct hv_stats_page * __init mshv_lp_stats_map(u32 lp_index) +{ + union hv_stats_object_identity identity = { + .lp.lp_index = lp_index, + .lp.stats_area_type = HV_STATS_AREA_SELF, + }; + struct hv_stats_page *stats; + int err; + + err = hv_map_stats_page(HV_STATS_OBJECT_LOGICAL_PROCESSOR, &identity, + &stats); + if (err) { + pr_err("%s: failed to map logical processor %u stats, err: %d\n", + __func__, lp_index, err); + return ERR_PTR(err); + } + mshv_lps_stats[lp_index] = stats; + + return stats; +} + +static struct hv_stats_page * __init lp_debugfs_stats_create(u32 lp_index, + struct dentry *parent) +{ + struct dentry *dentry; + struct hv_stats_page *stats; + + stats = mshv_lp_stats_map(lp_index); + if (IS_ERR(stats)) + return stats; + + dentry = debugfs_create_file("stats", 0400, parent, + stats, &lp_stats_fops); + if (IS_ERR(dentry)) { + mshv_lp_stats_unmap(lp_index); + return ERR_CAST(dentry); + } + return stats; +} + +static int __init lp_debugfs_create(u32 lp_index, struct dentry *parent) +{ + struct dentry *idx; + char lp_idx_str[U32_BUF_SZ]; + struct hv_stats_page *stats; + int err; + + sprintf(lp_idx_str, "%u", lp_index); + + idx = debugfs_create_dir(lp_idx_str, parent); + if (IS_ERR(idx)) + return PTR_ERR(idx); + + stats = lp_debugfs_stats_create(lp_index, idx); + if (IS_ERR(stats)) { + err = PTR_ERR(stats); + goto remove_debugfs_lp_idx; + } + + return 0; + +remove_debugfs_lp_idx: + debugfs_remove_recursive(idx); + return err; +} + +static void mshv_debugfs_lp_remove(void) +{ + int lp_index; + + debugfs_remove_recursive(mshv_debugfs_lp); + + for (lp_index = 0; lp_index < mshv_lps_count; lp_index++) + mshv_lp_stats_unmap(lp_index); + + kfree(mshv_lps_stats); + mshv_lps_stats = NULL; +} + +static int __init mshv_debugfs_lp_create(struct dentry *parent) +{ + struct dentry *lp_dir; + int err, lp_index; + + mshv_lps_stats = kzalloc_objs(*mshv_lps_stats, mshv_lps_count, + GFP_KERNEL_ACCOUNT); + + if (!mshv_lps_stats) + return -ENOMEM; + + lp_dir = debugfs_create_dir("lp", parent); + if (IS_ERR(lp_dir)) { + err = PTR_ERR(lp_dir); + goto free_lp_stats; + } + + for (lp_index = 0; lp_index < mshv_lps_count; lp_index++) { + err = lp_debugfs_create(lp_index, lp_dir); + if (err) + goto remove_debugfs_lps; + } + + mshv_debugfs_lp = lp_dir; + + return 0; + +remove_debugfs_lps: + for (lp_index -= 1; lp_index >= 0; lp_index--) + mshv_lp_stats_unmap(lp_index); + debugfs_remove_recursive(lp_dir); +free_lp_stats: + kfree(mshv_lps_stats); + mshv_lps_stats = NULL; + + return err; +} + +static int vp_stats_show(struct seq_file *m, void *v) +{ + const struct hv_stats_page **pstats = m->private; + u64 parent_val, self_val; + int idx; + + /* + * For VP and partition stats, there may be two stats areas mapped, + * SELF and PARENT. These refer to the privilege level of the data in + * each page. Some fields may be 0 in SELF and nonzero in PARENT, or + * vice versa. + * + * Hence, prioritize printing from the PARENT page (more privileged + * data), but use the value from the SELF page if the PARENT value is + * 0. + */ + + for (idx = 0; idx < ARRAY_SIZE(hv_vp_counters); idx++) { + char *name = hv_vp_counters[idx]; + + if (!name) + continue; + + parent_val = pstats[HV_STATS_AREA_PARENT]->data[idx]; + self_val = pstats[HV_STATS_AREA_SELF]->data[idx]; + seq_printf(m, "%-43s: %llu\n", name, + parent_val ? parent_val : self_val); + } + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(vp_stats); + +static void vp_debugfs_remove(struct dentry *vp_stats) +{ + debugfs_remove_recursive(vp_stats->d_parent); +} + +static int vp_debugfs_create(u64 partition_id, u32 vp_index, + struct hv_stats_page **pstats, + struct dentry **vp_stats_ptr, + struct dentry *parent) +{ + struct dentry *vp_idx_dir, *d; + char vp_idx_str[U32_BUF_SZ]; + int err; + + sprintf(vp_idx_str, "%u", vp_index); + + vp_idx_dir = debugfs_create_dir(vp_idx_str, parent); + if (IS_ERR(vp_idx_dir)) + return PTR_ERR(vp_idx_dir); + + d = debugfs_create_file("stats", 0400, vp_idx_dir, + pstats, &vp_stats_fops); + if (IS_ERR(d)) { + err = PTR_ERR(d); + goto remove_debugfs_vp_idx; + } + + *vp_stats_ptr = d; + + return 0; + +remove_debugfs_vp_idx: + debugfs_remove_recursive(vp_idx_dir); + return err; +} + +static int partition_stats_show(struct seq_file *m, void *v) +{ + const struct hv_stats_page **pstats = m->private; + u64 parent_val, self_val; + int idx; + + for (idx = 0; idx < ARRAY_SIZE(hv_partition_counters); idx++) { + char *name = hv_partition_counters[idx]; + + if (!name) + continue; + + parent_val = pstats[HV_STATS_AREA_PARENT]->data[idx]; + self_val = pstats[HV_STATS_AREA_SELF]->data[idx]; + seq_printf(m, "%-37s: %llu\n", name, + parent_val ? parent_val : self_val); + } + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(partition_stats); + +static void mshv_partition_stats_unmap(u64 partition_id, + struct hv_stats_page *stats_page, + enum hv_stats_area_type stats_area_type) +{ + union hv_stats_object_identity identity = { + .partition.partition_id = partition_id, + .partition.stats_area_type = stats_area_type, + }; + int err; + + err = hv_unmap_stats_page(HV_STATS_OBJECT_PARTITION, stats_page, + &identity); + if (err) + pr_err("%s: failed to unmap partition %lld %s stats, err: %d\n", + __func__, partition_id, + (stats_area_type == HV_STATS_AREA_SELF) ? "self" : "parent", + err); +} + +static struct hv_stats_page *mshv_partition_stats_map(u64 partition_id, + enum hv_stats_area_type stats_area_type) +{ + union hv_stats_object_identity identity = { + .partition.partition_id = partition_id, + .partition.stats_area_type = stats_area_type, + }; + struct hv_stats_page *stats; + int err; + + err = hv_map_stats_page(HV_STATS_OBJECT_PARTITION, &identity, &stats); + if (err) { + pr_err("%s: failed to map partition %lld %s stats, err: %d\n", + __func__, partition_id, + (stats_area_type == HV_STATS_AREA_SELF) ? "self" : "parent", + err); + return ERR_PTR(err); + } + return stats; +} + +static int mshv_debugfs_partition_stats_create(u64 partition_id, + struct dentry **partition_stats_ptr, + struct dentry *parent) +{ + struct dentry *dentry; + struct hv_stats_page **pstats; + int err; + + pstats = kzalloc_objs(struct hv_stats_page *, NUM_STATS_AREAS, + GFP_KERNEL_ACCOUNT); + if (!pstats) + return -ENOMEM; + + pstats[HV_STATS_AREA_SELF] = mshv_partition_stats_map(partition_id, + HV_STATS_AREA_SELF); + if (IS_ERR(pstats[HV_STATS_AREA_SELF])) { + err = PTR_ERR(pstats[HV_STATS_AREA_SELF]); + goto cleanup; + } + + /* + * L1VH partition cannot access its partition stats in parent area. + */ + if (is_l1vh_parent(partition_id)) { + pstats[HV_STATS_AREA_PARENT] = pstats[HV_STATS_AREA_SELF]; + } else { + pstats[HV_STATS_AREA_PARENT] = mshv_partition_stats_map(partition_id, + HV_STATS_AREA_PARENT); + if (IS_ERR(pstats[HV_STATS_AREA_PARENT])) { + err = PTR_ERR(pstats[HV_STATS_AREA_PARENT]); + goto unmap_self; + } + if (!pstats[HV_STATS_AREA_PARENT]) + pstats[HV_STATS_AREA_PARENT] = pstats[HV_STATS_AREA_SELF]; + } + + dentry = debugfs_create_file("stats", 0400, parent, + pstats, &partition_stats_fops); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto unmap_partition_stats; + } + + *partition_stats_ptr = dentry; + return 0; + +unmap_partition_stats: + if (pstats[HV_STATS_AREA_PARENT] != pstats[HV_STATS_AREA_SELF]) + mshv_partition_stats_unmap(partition_id, pstats[HV_STATS_AREA_PARENT], + HV_STATS_AREA_PARENT); +unmap_self: + mshv_partition_stats_unmap(partition_id, pstats[HV_STATS_AREA_SELF], + HV_STATS_AREA_SELF); +cleanup: + kfree(pstats); + return err; +} + +static void partition_debugfs_remove(u64 partition_id, struct dentry *dentry) +{ + struct hv_stats_page **pstats = NULL; + + pstats = dentry->d_inode->i_private; + + debugfs_remove_recursive(dentry->d_parent); + + if (pstats[HV_STATS_AREA_PARENT] != pstats[HV_STATS_AREA_SELF]) { + mshv_partition_stats_unmap(partition_id, + pstats[HV_STATS_AREA_PARENT], + HV_STATS_AREA_PARENT); + } + + mshv_partition_stats_unmap(partition_id, + pstats[HV_STATS_AREA_SELF], + HV_STATS_AREA_SELF); + + kfree(pstats); +} + +static int partition_debugfs_create(u64 partition_id, + struct dentry **vp_dir_ptr, + struct dentry **partition_stats_ptr, + struct dentry *parent) +{ + char part_id_str[U64_BUF_SZ]; + struct dentry *part_id_dir, *vp_dir; + int err; + + if (is_l1vh_parent(partition_id)) + sprintf(part_id_str, "self"); + else + sprintf(part_id_str, "%llu", partition_id); + + part_id_dir = debugfs_create_dir(part_id_str, parent); + if (IS_ERR(part_id_dir)) + return PTR_ERR(part_id_dir); + + vp_dir = debugfs_create_dir("vp", part_id_dir); + if (IS_ERR(vp_dir)) { + err = PTR_ERR(vp_dir); + goto remove_debugfs_partition_id; + } + + err = mshv_debugfs_partition_stats_create(partition_id, + partition_stats_ptr, + part_id_dir); + if (err) + goto remove_debugfs_partition_id; + + *vp_dir_ptr = vp_dir; + + return 0; + +remove_debugfs_partition_id: + debugfs_remove_recursive(part_id_dir); + return err; +} + +static void parent_vp_debugfs_remove(u32 vp_index, + struct dentry *vp_stats_ptr) +{ + struct hv_stats_page **pstats; + + pstats = vp_stats_ptr->d_inode->i_private; + vp_debugfs_remove(vp_stats_ptr); + mshv_vp_stats_unmap(hv_current_partition_id, vp_index, pstats); + kfree(pstats); +} + +static void mshv_debugfs_parent_partition_remove(void) +{ + int idx; + + for_each_online_cpu(idx) + parent_vp_debugfs_remove(hv_vp_index[idx], + parent_vp_stats[idx]); + + partition_debugfs_remove(hv_current_partition_id, + parent_partition_stats); + kfree(parent_vp_stats); + parent_vp_stats = NULL; + parent_partition_stats = NULL; +} + +static int __init parent_vp_debugfs_create(u32 vp_index, + struct dentry **vp_stats_ptr, + struct dentry *parent) +{ + struct hv_stats_page **pstats; + int err; + + pstats = kzalloc_objs(struct hv_stats_page *, NUM_STATS_AREAS, + GFP_KERNEL_ACCOUNT); + if (!pstats) + return -ENOMEM; + + err = mshv_vp_stats_map(hv_current_partition_id, vp_index, pstats); + if (err) + goto cleanup; + + err = vp_debugfs_create(hv_current_partition_id, vp_index, pstats, + vp_stats_ptr, parent); + if (err) + goto unmap_vp_stats; + + return 0; + +unmap_vp_stats: + mshv_vp_stats_unmap(hv_current_partition_id, vp_index, pstats); +cleanup: + kfree(pstats); + return err; +} + +static int __init mshv_debugfs_parent_partition_create(void) +{ + struct dentry *vp_dir; + int err, idx, i; + + mshv_debugfs_partition = debugfs_create_dir("partition", + mshv_debugfs); + if (IS_ERR(mshv_debugfs_partition)) + return PTR_ERR(mshv_debugfs_partition); + + err = partition_debugfs_create(hv_current_partition_id, + &vp_dir, + &parent_partition_stats, + mshv_debugfs_partition); + if (err) + goto remove_debugfs_partition; + + parent_vp_stats = kzalloc_objs(*parent_vp_stats, nr_cpu_ids); + if (!parent_vp_stats) { + err = -ENOMEM; + goto remove_debugfs_partition; + } + + for_each_online_cpu(idx) { + err = parent_vp_debugfs_create(hv_vp_index[idx], + &parent_vp_stats[idx], + vp_dir); + if (err) + goto remove_debugfs_partition_vp; + } + + return 0; + +remove_debugfs_partition_vp: + for_each_online_cpu(i) { + if (i >= idx) + break; + parent_vp_debugfs_remove(i, parent_vp_stats[i]); + } + partition_debugfs_remove(hv_current_partition_id, + parent_partition_stats); + + kfree(parent_vp_stats); + parent_vp_stats = NULL; + parent_partition_stats = NULL; + +remove_debugfs_partition: + debugfs_remove_recursive(mshv_debugfs_partition); + mshv_debugfs_partition = NULL; + return err; +} + +static int hv_stats_show(struct seq_file *m, void *v) +{ + const struct hv_stats_page *stats = m->private; + int idx; + + for (idx = 0; idx < ARRAY_SIZE(hv_hypervisor_counters); idx++) { + char *name = hv_hypervisor_counters[idx]; + + if (!name) + continue; + seq_printf(m, "%-27s: %llu\n", name, stats->data[idx]); + } + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(hv_stats); + +static void mshv_hv_stats_unmap(void) +{ + union hv_stats_object_identity identity = { + .hv.stats_area_type = HV_STATS_AREA_SELF, + }; + int err; + + err = hv_unmap_stats_page(HV_STATS_OBJECT_HYPERVISOR, NULL, &identity); + if (err) + pr_err("%s: failed to unmap hypervisor stats: %d\n", + __func__, err); +} + +static void * __init mshv_hv_stats_map(void) +{ + union hv_stats_object_identity identity = { + .hv.stats_area_type = HV_STATS_AREA_SELF, + }; + struct hv_stats_page *stats; + int err; + + err = hv_map_stats_page(HV_STATS_OBJECT_HYPERVISOR, &identity, &stats); + if (err) { + pr_err("%s: failed to map hypervisor stats: %d\n", + __func__, err); + return ERR_PTR(err); + } + return stats; +} + +static int __init mshv_debugfs_hv_stats_create(struct dentry *parent) +{ + struct dentry *dentry; + u64 *stats; + int err; + + stats = mshv_hv_stats_map(); + if (IS_ERR(stats)) + return PTR_ERR(stats); + + dentry = debugfs_create_file("stats", 0400, parent, + stats, &hv_stats_fops); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + pr_err("%s: failed to create hypervisor stats dentry: %d\n", + __func__, err); + goto unmap_hv_stats; + } + + mshv_lps_count = stats[HV_HYPERVISOR_COUNTER_LOGICAL_PROCESSORS]; + + return 0; + +unmap_hv_stats: + mshv_hv_stats_unmap(); + return err; +} + +int mshv_debugfs_vp_create(struct mshv_vp *vp) +{ + struct mshv_partition *p = vp->vp_partition; + + if (!mshv_debugfs) + return 0; + + return vp_debugfs_create(p->pt_id, vp->vp_index, + vp->vp_stats_pages, + &vp->vp_stats_dentry, + p->pt_vp_dentry); +} + +void mshv_debugfs_vp_remove(struct mshv_vp *vp) +{ + if (!mshv_debugfs) + return; + + vp_debugfs_remove(vp->vp_stats_dentry); +} + +int mshv_debugfs_partition_create(struct mshv_partition *partition) +{ + int err; + + if (!mshv_debugfs) + return 0; + + err = partition_debugfs_create(partition->pt_id, + &partition->pt_vp_dentry, + &partition->pt_stats_dentry, + mshv_debugfs_partition); + if (err) + return err; + + return 0; +} + +void mshv_debugfs_partition_remove(struct mshv_partition *partition) +{ + if (!mshv_debugfs) + return; + + partition_debugfs_remove(partition->pt_id, + partition->pt_stats_dentry); +} + +int __init mshv_debugfs_init(void) +{ + int err; + + mshv_debugfs = debugfs_create_dir("mshv", NULL); + if (IS_ERR(mshv_debugfs)) { + pr_err("%s: failed to create debugfs directory\n", __func__); + return PTR_ERR(mshv_debugfs); + } + + if (hv_root_partition()) { + err = mshv_debugfs_hv_stats_create(mshv_debugfs); + if (err) + goto remove_mshv_dir; + + err = mshv_debugfs_lp_create(mshv_debugfs); + if (err) + goto unmap_hv_stats; + } + + err = mshv_debugfs_parent_partition_create(); + if (err) + goto unmap_lp_stats; + + return 0; + +unmap_lp_stats: + if (hv_root_partition()) { + mshv_debugfs_lp_remove(); + mshv_debugfs_lp = NULL; + } +unmap_hv_stats: + if (hv_root_partition()) + mshv_hv_stats_unmap(); +remove_mshv_dir: + debugfs_remove_recursive(mshv_debugfs); + mshv_debugfs = NULL; + return err; +} + +void mshv_debugfs_exit(void) +{ + mshv_debugfs_parent_partition_remove(); + + if (hv_root_partition()) { + mshv_debugfs_lp_remove(); + mshv_debugfs_lp = NULL; + mshv_hv_stats_unmap(); + } + + debugfs_remove_recursive(mshv_debugfs); + mshv_debugfs = NULL; + mshv_debugfs_partition = NULL; +} diff --git a/drivers/hv/mshv_debugfs_counters.c b/drivers/hv/mshv_debugfs_counters.c new file mode 100644 index 000000000000..978536ba691f --- /dev/null +++ b/drivers/hv/mshv_debugfs_counters.c @@ -0,0 +1,490 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2026, Microsoft Corporation. + * + * Data for printing stats page counters via debugfs. + * + * Authors: Microsoft Linux virtualization team + */ + +/* + * For simplicity, this file is included directly in mshv_debugfs.c. + * If these are ever needed elsewhere they should be compiled separately. + * Ensure this file is not used twice by accident. + */ +#ifndef MSHV_DEBUGFS_C +#error "This file should only be included in mshv_debugfs.c" +#endif + +/* HV_HYPERVISOR_COUNTER */ +static char *hv_hypervisor_counters[] = { + [1] = "HvLogicalProcessors", + [2] = "HvPartitions", + [3] = "HvTotalPages", + [4] = "HvVirtualProcessors", + [5] = "HvMonitoredNotifications", + [6] = "HvModernStandbyEntries", + [7] = "HvPlatformIdleTransitions", + [8] = "HvHypervisorStartupCost", + + [10] = "HvIOSpacePages", + [11] = "HvNonEssentialPagesForDump", + [12] = "HvSubsumedPages", +}; + +/* HV_CPU_COUNTER */ +static char *hv_lp_counters[] = { + [1] = "LpGlobalTime", + [2] = "LpTotalRunTime", + [3] = "LpHypervisorRunTime", + [4] = "LpHardwareInterrupts", + [5] = "LpContextSwitches", + [6] = "LpInterProcessorInterrupts", + [7] = "LpSchedulerInterrupts", + [8] = "LpTimerInterrupts", + [9] = "LpInterProcessorInterruptsSent", + [10] = "LpProcessorHalts", + [11] = "LpMonitorTransitionCost", + [12] = "LpContextSwitchTime", + [13] = "LpC1TransitionsCount", + [14] = "LpC1RunTime", + [15] = "LpC2TransitionsCount", + [16] = "LpC2RunTime", + [17] = "LpC3TransitionsCount", + [18] = "LpC3RunTime", + [19] = "LpRootVpIndex", + [20] = "LpIdleSequenceNumber", + [21] = "LpGlobalTscCount", + [22] = "LpActiveTscCount", + [23] = "LpIdleAccumulation", + [24] = "LpReferenceCycleCount0", + [25] = "LpActualCycleCount0", + [26] = "LpReferenceCycleCount1", + [27] = "LpActualCycleCount1", + [28] = "LpProximityDomainId", + [29] = "LpPostedInterruptNotifications", + [30] = "LpBranchPredictorFlushes", +#if IS_ENABLED(CONFIG_X86_64) + [31] = "LpL1DataCacheFlushes", + [32] = "LpImmediateL1DataCacheFlushes", + [33] = "LpMbFlushes", + [34] = "LpCounterRefreshSequenceNumber", + [35] = "LpCounterRefreshReferenceTime", + [36] = "LpIdleAccumulationSnapshot", + [37] = "LpActiveTscCountSnapshot", + [38] = "LpHwpRequestContextSwitches", + [39] = "LpPlaceholder1", + [40] = "LpPlaceholder2", + [41] = "LpPlaceholder3", + [42] = "LpPlaceholder4", + [43] = "LpPlaceholder5", + [44] = "LpPlaceholder6", + [45] = "LpPlaceholder7", + [46] = "LpPlaceholder8", + [47] = "LpPlaceholder9", + [48] = "LpSchLocalRunListSize", + [49] = "LpReserveGroupId", + [50] = "LpRunningPriority", + [51] = "LpPerfmonInterruptCount", +#elif IS_ENABLED(CONFIG_ARM64) + [31] = "LpCounterRefreshSequenceNumber", + [32] = "LpCounterRefreshReferenceTime", + [33] = "LpIdleAccumulationSnapshot", + [34] = "LpActiveTscCountSnapshot", + [35] = "LpHwpRequestContextSwitches", + [36] = "LpPlaceholder2", + [37] = "LpPlaceholder3", + [38] = "LpPlaceholder4", + [39] = "LpPlaceholder5", + [40] = "LpPlaceholder6", + [41] = "LpPlaceholder7", + [42] = "LpPlaceholder8", + [43] = "LpPlaceholder9", + [44] = "LpSchLocalRunListSize", + [45] = "LpReserveGroupId", + [46] = "LpRunningPriority", +#endif +}; + +/* HV_PROCESS_COUNTER */ +static char *hv_partition_counters[] = { + [1] = "PtVirtualProcessors", + + [3] = "PtTlbSize", + [4] = "PtAddressSpaces", + [5] = "PtDepositedPages", + [6] = "PtGpaPages", + [7] = "PtGpaSpaceModifications", + [8] = "PtVirtualTlbFlushEntires", + [9] = "PtRecommendedTlbSize", + [10] = "PtGpaPages4K", + [11] = "PtGpaPages2M", + [12] = "PtGpaPages1G", + [13] = "PtGpaPages512G", + [14] = "PtDevicePages4K", + [15] = "PtDevicePages2M", + [16] = "PtDevicePages1G", + [17] = "PtDevicePages512G", + [18] = "PtAttachedDevices", + [19] = "PtDeviceInterruptMappings", + [20] = "PtIoTlbFlushes", + [21] = "PtIoTlbFlushCost", + [22] = "PtDeviceInterruptErrors", + [23] = "PtDeviceDmaErrors", + [24] = "PtDeviceInterruptThrottleEvents", + [25] = "PtSkippedTimerTicks", + [26] = "PtPartitionId", +#if IS_ENABLED(CONFIG_X86_64) + [27] = "PtNestedTlbSize", + [28] = "PtRecommendedNestedTlbSize", + [29] = "PtNestedTlbFreeListSize", + [30] = "PtNestedTlbTrimmedPages", + [31] = "PtPagesShattered", + [32] = "PtPagesRecombined", + [33] = "PtHwpRequestValue", + [34] = "PtAutoSuspendEnableTime", + [35] = "PtAutoSuspendTriggerTime", + [36] = "PtAutoSuspendDisableTime", + [37] = "PtPlaceholder1", + [38] = "PtPlaceholder2", + [39] = "PtPlaceholder3", + [40] = "PtPlaceholder4", + [41] = "PtPlaceholder5", + [42] = "PtPlaceholder6", + [43] = "PtPlaceholder7", + [44] = "PtPlaceholder8", + [45] = "PtHypervisorStateTransferGeneration", + [46] = "PtNumberofActiveChildPartitions", +#elif IS_ENABLED(CONFIG_ARM64) + [27] = "PtHwpRequestValue", + [28] = "PtAutoSuspendEnableTime", + [29] = "PtAutoSuspendTriggerTime", + [30] = "PtAutoSuspendDisableTime", + [31] = "PtPlaceholder1", + [32] = "PtPlaceholder2", + [33] = "PtPlaceholder3", + [34] = "PtPlaceholder4", + [35] = "PtPlaceholder5", + [36] = "PtPlaceholder6", + [37] = "PtPlaceholder7", + [38] = "PtPlaceholder8", + [39] = "PtHypervisorStateTransferGeneration", + [40] = "PtNumberofActiveChildPartitions", +#endif +}; + +/* HV_THREAD_COUNTER */ +static char *hv_vp_counters[] = { + [1] = "VpTotalRunTime", + [2] = "VpHypervisorRunTime", + [3] = "VpRemoteNodeRunTime", + [4] = "VpNormalizedRunTime", + [5] = "VpIdealCpu", + + [7] = "VpHypercallsCount", + [8] = "VpHypercallsTime", +#if IS_ENABLED(CONFIG_X86_64) + [9] = "VpPageInvalidationsCount", + [10] = "VpPageInvalidationsTime", + [11] = "VpControlRegisterAccessesCount", + [12] = "VpControlRegisterAccessesTime", + [13] = "VpIoInstructionsCount", + [14] = "VpIoInstructionsTime", + [15] = "VpHltInstructionsCount", + [16] = "VpHltInstructionsTime", + [17] = "VpMwaitInstructionsCount", + [18] = "VpMwaitInstructionsTime", + [19] = "VpCpuidInstructionsCount", + [20] = "VpCpuidInstructionsTime", + [21] = "VpMsrAccessesCount", + [22] = "VpMsrAccessesTime", + [23] = "VpOtherInterceptsCount", + [24] = "VpOtherInterceptsTime", + [25] = "VpExternalInterruptsCount", + [26] = "VpExternalInterruptsTime", + [27] = "VpPendingInterruptsCount", + [28] = "VpPendingInterruptsTime", + [29] = "VpEmulatedInstructionsCount", + [30] = "VpEmulatedInstructionsTime", + [31] = "VpDebugRegisterAccessesCount", + [32] = "VpDebugRegisterAccessesTime", + [33] = "VpPageFaultInterceptsCount", + [34] = "VpPageFaultInterceptsTime", + [35] = "VpGuestPageTableMaps", + [36] = "VpLargePageTlbFills", + [37] = "VpSmallPageTlbFills", + [38] = "VpReflectedGuestPageFaults", + [39] = "VpApicMmioAccesses", + [40] = "VpIoInterceptMessages", + [41] = "VpMemoryInterceptMessages", + [42] = "VpApicEoiAccesses", + [43] = "VpOtherMessages", + [44] = "VpPageTableAllocations", + [45] = "VpLogicalProcessorMigrations", + [46] = "VpAddressSpaceEvictions", + [47] = "VpAddressSpaceSwitches", + [48] = "VpAddressDomainFlushes", + [49] = "VpAddressSpaceFlushes", + [50] = "VpGlobalGvaRangeFlushes", + [51] = "VpLocalGvaRangeFlushes", + [52] = "VpPageTableEvictions", + [53] = "VpPageTableReclamations", + [54] = "VpPageTableResets", + [55] = "VpPageTableValidations", + [56] = "VpApicTprAccesses", + [57] = "VpPageTableWriteIntercepts", + [58] = "VpSyntheticInterrupts", + [59] = "VpVirtualInterrupts", + [60] = "VpApicIpisSent", + [61] = "VpApicSelfIpisSent", + [62] = "VpGpaSpaceHypercalls", + [63] = "VpLogicalProcessorHypercalls", + [64] = "VpLongSpinWaitHypercalls", + [65] = "VpOtherHypercalls", + [66] = "VpSyntheticInterruptHypercalls", + [67] = "VpVirtualInterruptHypercalls", + [68] = "VpVirtualMmuHypercalls", + [69] = "VpVirtualProcessorHypercalls", + [70] = "VpHardwareInterrupts", + [71] = "VpNestedPageFaultInterceptsCount", + [72] = "VpNestedPageFaultInterceptsTime", + [73] = "VpPageScans", + [74] = "VpLogicalProcessorDispatches", + [75] = "VpWaitingForCpuTime", + [76] = "VpExtendedHypercalls", + [77] = "VpExtendedHypercallInterceptMessages", + [78] = "VpMbecNestedPageTableSwitches", + [79] = "VpOtherReflectedGuestExceptions", + [80] = "VpGlobalIoTlbFlushes", + [81] = "VpGlobalIoTlbFlushCost", + [82] = "VpLocalIoTlbFlushes", + [83] = "VpLocalIoTlbFlushCost", + [84] = "VpHypercallsForwardedCount", + [85] = "VpHypercallsForwardingTime", + [86] = "VpPageInvalidationsForwardedCount", + [87] = "VpPageInvalidationsForwardingTime", + [88] = "VpControlRegisterAccessesForwardedCount", + [89] = "VpControlRegisterAccessesForwardingTime", + [90] = "VpIoInstructionsForwardedCount", + [91] = "VpIoInstructionsForwardingTime", + [92] = "VpHltInstructionsForwardedCount", + [93] = "VpHltInstructionsForwardingTime", + [94] = "VpMwaitInstructionsForwardedCount", + [95] = "VpMwaitInstructionsForwardingTime", + [96] = "VpCpuidInstructionsForwardedCount", + [97] = "VpCpuidInstructionsForwardingTime", + [98] = "VpMsrAccessesForwardedCount", + [99] = "VpMsrAccessesForwardingTime", + [100] = "VpOtherInterceptsForwardedCount", + [101] = "VpOtherInterceptsForwardingTime", + [102] = "VpExternalInterruptsForwardedCount", + [103] = "VpExternalInterruptsForwardingTime", + [104] = "VpPendingInterruptsForwardedCount", + [105] = "VpPendingInterruptsForwardingTime", + [106] = "VpEmulatedInstructionsForwardedCount", + [107] = "VpEmulatedInstructionsForwardingTime", + [108] = "VpDebugRegisterAccessesForwardedCount", + [109] = "VpDebugRegisterAccessesForwardingTime", + [110] = "VpPageFaultInterceptsForwardedCount", + [111] = "VpPageFaultInterceptsForwardingTime", + [112] = "VpVmclearEmulationCount", + [113] = "VpVmclearEmulationTime", + [114] = "VpVmptrldEmulationCount", + [115] = "VpVmptrldEmulationTime", + [116] = "VpVmptrstEmulationCount", + [117] = "VpVmptrstEmulationTime", + [118] = "VpVmreadEmulationCount", + [119] = "VpVmreadEmulationTime", + [120] = "VpVmwriteEmulationCount", + [121] = "VpVmwriteEmulationTime", + [122] = "VpVmxoffEmulationCount", + [123] = "VpVmxoffEmulationTime", + [124] = "VpVmxonEmulationCount", + [125] = "VpVmxonEmulationTime", + [126] = "VpNestedVMEntriesCount", + [127] = "VpNestedVMEntriesTime", + [128] = "VpNestedSLATSoftPageFaultsCount", + [129] = "VpNestedSLATSoftPageFaultsTime", + [130] = "VpNestedSLATHardPageFaultsCount", + [131] = "VpNestedSLATHardPageFaultsTime", + [132] = "VpInvEptAllContextEmulationCount", + [133] = "VpInvEptAllContextEmulationTime", + [134] = "VpInvEptSingleContextEmulationCount", + [135] = "VpInvEptSingleContextEmulationTime", + [136] = "VpInvVpidAllContextEmulationCount", + [137] = "VpInvVpidAllContextEmulationTime", + [138] = "VpInvVpidSingleContextEmulationCount", + [139] = "VpInvVpidSingleContextEmulationTime", + [140] = "VpInvVpidSingleAddressEmulationCount", + [141] = "VpInvVpidSingleAddressEmulationTime", + [142] = "VpNestedTlbPageTableReclamations", + [143] = "VpNestedTlbPageTableEvictions", + [144] = "VpFlushGuestPhysicalAddressSpaceHypercalls", + [145] = "VpFlushGuestPhysicalAddressListHypercalls", + [146] = "VpPostedInterruptNotifications", + [147] = "VpPostedInterruptScans", + [148] = "VpTotalCoreRunTime", + [149] = "VpMaximumRunTime", + [150] = "VpHwpRequestContextSwitches", + [151] = "VpWaitingForCpuTimeBucket0", + [152] = "VpWaitingForCpuTimeBucket1", + [153] = "VpWaitingForCpuTimeBucket2", + [154] = "VpWaitingForCpuTimeBucket3", + [155] = "VpWaitingForCpuTimeBucket4", + [156] = "VpWaitingForCpuTimeBucket5", + [157] = "VpWaitingForCpuTimeBucket6", + [158] = "VpVmloadEmulationCount", + [159] = "VpVmloadEmulationTime", + [160] = "VpVmsaveEmulationCount", + [161] = "VpVmsaveEmulationTime", + [162] = "VpGifInstructionEmulationCount", + [163] = "VpGifInstructionEmulationTime", + [164] = "VpEmulatedErrataSvmInstructions", + [165] = "VpPlaceholder1", + [166] = "VpPlaceholder2", + [167] = "VpPlaceholder3", + [168] = "VpPlaceholder4", + [169] = "VpPlaceholder5", + [170] = "VpPlaceholder6", + [171] = "VpPlaceholder7", + [172] = "VpPlaceholder8", + [173] = "VpContentionTime", + [174] = "VpWakeUpTime", + [175] = "VpSchedulingPriority", + [176] = "VpRdpmcInstructionsCount", + [177] = "VpRdpmcInstructionsTime", + [178] = "VpPerfmonPmuMsrAccessesCount", + [179] = "VpPerfmonLbrMsrAccessesCount", + [180] = "VpPerfmonIptMsrAccessesCount", + [181] = "VpPerfmonInterruptCount", + [182] = "VpVtl1DispatchCount", + [183] = "VpVtl2DispatchCount", + [184] = "VpVtl2DispatchBucket0", + [185] = "VpVtl2DispatchBucket1", + [186] = "VpVtl2DispatchBucket2", + [187] = "VpVtl2DispatchBucket3", + [188] = "VpVtl2DispatchBucket4", + [189] = "VpVtl2DispatchBucket5", + [190] = "VpVtl2DispatchBucket6", + [191] = "VpVtl1RunTime", + [192] = "VpVtl2RunTime", + [193] = "VpIommuHypercalls", + [194] = "VpCpuGroupHypercalls", + [195] = "VpVsmHypercalls", + [196] = "VpEventLogHypercalls", + [197] = "VpDeviceDomainHypercalls", + [198] = "VpDepositHypercalls", + [199] = "VpSvmHypercalls", + [200] = "VpBusLockAcquisitionCount", + [201] = "VpLoadAvg", + [202] = "VpRootDispatchThreadBlocked", + [203] = "VpIdleCpuTime", + [204] = "VpWaitingForCpuTimeBucket7", + [205] = "VpWaitingForCpuTimeBucket8", + [206] = "VpWaitingForCpuTimeBucket9", + [207] = "VpWaitingForCpuTimeBucket10", + [208] = "VpWaitingForCpuTimeBucket11", + [209] = "VpWaitingForCpuTimeBucket12", + [210] = "VpHierarchicalSuspendTime", + [211] = "VpExpressSchedulingAttempts", + [212] = "VpExpressSchedulingCount", +#elif IS_ENABLED(CONFIG_ARM64) + [9] = "VpSysRegAccessesCount", + [10] = "VpSysRegAccessesTime", + [11] = "VpSmcInstructionsCount", + [12] = "VpSmcInstructionsTime", + [13] = "VpOtherInterceptsCount", + [14] = "VpOtherInterceptsTime", + [15] = "VpExternalInterruptsCount", + [16] = "VpExternalInterruptsTime", + [17] = "VpPendingInterruptsCount", + [18] = "VpPendingInterruptsTime", + [19] = "VpGuestPageTableMaps", + [20] = "VpLargePageTlbFills", + [21] = "VpSmallPageTlbFills", + [22] = "VpReflectedGuestPageFaults", + [23] = "VpMemoryInterceptMessages", + [24] = "VpOtherMessages", + [25] = "VpLogicalProcessorMigrations", + [26] = "VpAddressDomainFlushes", + [27] = "VpAddressSpaceFlushes", + [28] = "VpSyntheticInterrupts", + [29] = "VpVirtualInterrupts", + [30] = "VpApicSelfIpisSent", + [31] = "VpGpaSpaceHypercalls", + [32] = "VpLogicalProcessorHypercalls", + [33] = "VpLongSpinWaitHypercalls", + [34] = "VpOtherHypercalls", + [35] = "VpSyntheticInterruptHypercalls", + [36] = "VpVirtualInterruptHypercalls", + [37] = "VpVirtualMmuHypercalls", + [38] = "VpVirtualProcessorHypercalls", + [39] = "VpHardwareInterrupts", + [40] = "VpNestedPageFaultInterceptsCount", + [41] = "VpNestedPageFaultInterceptsTime", + [42] = "VpLogicalProcessorDispatches", + [43] = "VpWaitingForCpuTime", + [44] = "VpExtendedHypercalls", + [45] = "VpExtendedHypercallInterceptMessages", + [46] = "VpMbecNestedPageTableSwitches", + [47] = "VpOtherReflectedGuestExceptions", + [48] = "VpGlobalIoTlbFlushes", + [49] = "VpGlobalIoTlbFlushCost", + [50] = "VpLocalIoTlbFlushes", + [51] = "VpLocalIoTlbFlushCost", + [52] = "VpFlushGuestPhysicalAddressSpaceHypercalls", + [53] = "VpFlushGuestPhysicalAddressListHypercalls", + [54] = "VpPostedInterruptNotifications", + [55] = "VpPostedInterruptScans", + [56] = "VpTotalCoreRunTime", + [57] = "VpMaximumRunTime", + [58] = "VpWaitingForCpuTimeBucket0", + [59] = "VpWaitingForCpuTimeBucket1", + [60] = "VpWaitingForCpuTimeBucket2", + [61] = "VpWaitingForCpuTimeBucket3", + [62] = "VpWaitingForCpuTimeBucket4", + [63] = "VpWaitingForCpuTimeBucket5", + [64] = "VpWaitingForCpuTimeBucket6", + [65] = "VpHwpRequestContextSwitches", + [66] = "VpPlaceholder2", + [67] = "VpPlaceholder3", + [68] = "VpPlaceholder4", + [69] = "VpPlaceholder5", + [70] = "VpPlaceholder6", + [71] = "VpPlaceholder7", + [72] = "VpPlaceholder8", + [73] = "VpContentionTime", + [74] = "VpWakeUpTime", + [75] = "VpSchedulingPriority", + [76] = "VpVtl1DispatchCount", + [77] = "VpVtl2DispatchCount", + [78] = "VpVtl2DispatchBucket0", + [79] = "VpVtl2DispatchBucket1", + [80] = "VpVtl2DispatchBucket2", + [81] = "VpVtl2DispatchBucket3", + [82] = "VpVtl2DispatchBucket4", + [83] = "VpVtl2DispatchBucket5", + [84] = "VpVtl2DispatchBucket6", + [85] = "VpVtl1RunTime", + [86] = "VpVtl2RunTime", + [87] = "VpIommuHypercalls", + [88] = "VpCpuGroupHypercalls", + [89] = "VpVsmHypercalls", + [90] = "VpEventLogHypercalls", + [91] = "VpDeviceDomainHypercalls", + [92] = "VpDepositHypercalls", + [93] = "VpSvmHypercalls", + [94] = "VpLoadAvg", + [95] = "VpRootDispatchThreadBlocked", + [96] = "VpIdleCpuTime", + [97] = "VpWaitingForCpuTimeBucket7", + [98] = "VpWaitingForCpuTimeBucket8", + [99] = "VpWaitingForCpuTimeBucket9", + [100] = "VpWaitingForCpuTimeBucket10", + [101] = "VpWaitingForCpuTimeBucket11", + [102] = "VpWaitingForCpuTimeBucket12", + [103] = "VpHierarchicalSuspendTime", + [104] = "VpExpressSchedulingAttempts", + [105] = "VpExpressSchedulingCount", +#endif +}; diff --git a/drivers/hv/mshv_eventfd.c b/drivers/hv/mshv_eventfd.c new file mode 100644 index 000000000000..90959f639dc3 --- /dev/null +++ b/drivers/hv/mshv_eventfd.c @@ -0,0 +1,853 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * eventfd support for mshv + * + * Heavily inspired from KVM implementation of irqfd/ioeventfd. The basic + * framework code is taken from the kvm implementation. + * + * All credits to kvm developers. + */ + +#include <linux/syscalls.h> +#include <linux/wait.h> +#include <linux/poll.h> +#include <linux/file.h> +#include <linux/list.h> +#include <linux/workqueue.h> +#include <linux/eventfd.h> + +#if IS_ENABLED(CONFIG_X86_64) +#include <asm/apic.h> +#endif +#include <asm/mshyperv.h> + +#include "mshv_eventfd.h" +#include "mshv.h" +#include "mshv_root.h" + +static struct workqueue_struct *irqfd_cleanup_wq; + +void mshv_register_irq_ack_notifier(struct mshv_partition *partition, + struct mshv_irq_ack_notifier *mian) +{ + mutex_lock(&partition->pt_irq_lock); + hlist_add_head_rcu(&mian->link, &partition->irq_ack_notifier_list); + mutex_unlock(&partition->pt_irq_lock); +} + +void mshv_unregister_irq_ack_notifier(struct mshv_partition *partition, + struct mshv_irq_ack_notifier *mian) +{ + mutex_lock(&partition->pt_irq_lock); + hlist_del_init_rcu(&mian->link); + mutex_unlock(&partition->pt_irq_lock); + synchronize_rcu(); +} + +bool mshv_notify_acked_gsi(struct mshv_partition *partition, int gsi) +{ + struct mshv_irq_ack_notifier *mian; + bool acked = false; + + rcu_read_lock(); + hlist_for_each_entry_rcu(mian, &partition->irq_ack_notifier_list, + link) { + if (mian->irq_ack_gsi == gsi) { + mian->irq_acked(mian); + acked = true; + } + } + rcu_read_unlock(); + + return acked; +} + +#if IS_ENABLED(CONFIG_ARM64) +static inline bool hv_should_clear_interrupt(enum hv_interrupt_type type) +{ + return false; +} +#elif IS_ENABLED(CONFIG_X86_64) +static inline bool hv_should_clear_interrupt(enum hv_interrupt_type type) +{ + return type == HV_X64_INTERRUPT_TYPE_EXTINT; +} +#endif + +static void mshv_irqfd_resampler_ack(struct mshv_irq_ack_notifier *mian) +{ + struct mshv_irqfd_resampler *resampler; + struct mshv_partition *partition; + struct mshv_irqfd *irqfd; + int idx; + + resampler = container_of(mian, struct mshv_irqfd_resampler, + rsmplr_notifier); + partition = resampler->rsmplr_partn; + + idx = srcu_read_lock(&partition->pt_irq_srcu); + + hlist_for_each_entry_srcu(irqfd, &resampler->rsmplr_irqfd_list, + irqfd_resampler_hnode, + srcu_read_lock_held(&partition->pt_irq_srcu)) { + if (hv_should_clear_interrupt(irqfd->irqfd_lapic_irq.lapic_control.interrupt_type)) + hv_call_clear_virtual_interrupt(partition->pt_id); + + eventfd_signal(irqfd->irqfd_resamplefd); + } + + srcu_read_unlock(&partition->pt_irq_srcu, idx); +} + +#if IS_ENABLED(CONFIG_X86_64) +static bool +mshv_vp_irq_vector_injected(union hv_vp_register_page_interrupt_vectors iv, + u32 vector) +{ + int i; + + for (i = 0; i < iv.vector_count; i++) { + if (iv.vector[i] == vector) + return true; + } + + return false; +} + +static int mshv_vp_irq_try_set_vector(struct mshv_vp *vp, u32 vector) +{ + union hv_vp_register_page_interrupt_vectors iv, new_iv; + + iv = vp->vp_register_page->interrupt_vectors; + new_iv = iv; + + if (mshv_vp_irq_vector_injected(iv, vector)) + return 0; + + if (iv.vector_count >= HV_VP_REGISTER_PAGE_MAX_VECTOR_COUNT) + return -ENOSPC; + + new_iv.vector[new_iv.vector_count++] = vector; + + if (!try_cmpxchg(&vp->vp_register_page->interrupt_vectors.as_uint64, + &iv.as_uint64, new_iv.as_uint64)) + return -EAGAIN; + + return 0; +} + +static int mshv_vp_irq_set_vector(struct mshv_vp *vp, u32 vector) +{ + int ret; + + do { + ret = mshv_vp_irq_try_set_vector(vp, vector); + } while (ret == -EAGAIN && !need_resched()); + + return ret; +} + +/* + * Try to raise irq for guest via shared vector array. hyp does the actual + * inject of the interrupt. + */ +static int mshv_try_assert_irq_fast(struct mshv_irqfd *irqfd) +{ + struct mshv_partition *partition = irqfd->irqfd_partn; + struct mshv_lapic_irq *irq = &irqfd->irqfd_lapic_irq; + struct mshv_vp *vp; + + if (!(ms_hyperv.ext_features & + HV_VP_DISPATCH_INTERRUPT_INJECTION_AVAILABLE)) + return -EOPNOTSUPP; + + if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT) + return -EOPNOTSUPP; + +#if IS_ENABLED(CONFIG_X86) + if (irq->lapic_control.logical_dest_mode) + return -EOPNOTSUPP; +#endif + + vp = partition->pt_vp_array[irq->lapic_apic_id]; + + if (!vp->vp_register_page) + return -EOPNOTSUPP; + + if (mshv_vp_irq_set_vector(vp, irq->lapic_vector)) + return -EINVAL; + + if (vp->run.flags.root_sched_dispatched && + vp->vp_register_page->interrupt_vectors.as_uint64) + return -EBUSY; + + wake_up(&vp->run.vp_suspend_queue); + + return 0; +} +#else /* CONFIG_X86_64 */ +static int mshv_try_assert_irq_fast(struct mshv_irqfd *irqfd) +{ + return -EOPNOTSUPP; +} +#endif + +static void mshv_assert_irq_slow(struct mshv_irqfd *irqfd) +{ + struct mshv_partition *partition = irqfd->irqfd_partn; + struct mshv_lapic_irq *irq = &irqfd->irqfd_lapic_irq; + unsigned int seq; + int idx; + +#if IS_ENABLED(CONFIG_X86) + WARN_ON(irqfd->irqfd_resampler && + !irq->lapic_control.level_triggered); +#endif + + idx = srcu_read_lock(&partition->pt_irq_srcu); + if (irqfd->irqfd_girq_ent.guest_irq_num) { + if (!irqfd->irqfd_girq_ent.girq_entry_valid) { + srcu_read_unlock(&partition->pt_irq_srcu, idx); + return; + } + + do { + seq = read_seqcount_begin(&irqfd->irqfd_irqe_sc); + } while (read_seqcount_retry(&irqfd->irqfd_irqe_sc, seq)); + } + + hv_call_assert_virtual_interrupt(irqfd->irqfd_partn->pt_id, + irq->lapic_vector, irq->lapic_apic_id, + irq->lapic_control); + srcu_read_unlock(&partition->pt_irq_srcu, idx); +} + +static void mshv_irqfd_resampler_shutdown(struct mshv_irqfd *irqfd) +{ + struct mshv_irqfd_resampler *rp = irqfd->irqfd_resampler; + struct mshv_partition *pt = rp->rsmplr_partn; + + mutex_lock(&pt->irqfds_resampler_lock); + + hlist_del_rcu(&irqfd->irqfd_resampler_hnode); + synchronize_srcu(&pt->pt_irq_srcu); + + if (hlist_empty(&rp->rsmplr_irqfd_list)) { + hlist_del(&rp->rsmplr_hnode); + mshv_unregister_irq_ack_notifier(pt, &rp->rsmplr_notifier); + kfree(rp); + } + + mutex_unlock(&pt->irqfds_resampler_lock); +} + +/* + * Race-free decouple logic (ordering is critical) + */ +static void mshv_irqfd_shutdown(struct work_struct *work) +{ + struct mshv_irqfd *irqfd = + container_of(work, struct mshv_irqfd, irqfd_shutdown); + u64 cnt; + + /* + * Synchronize with the wait-queue and unhook ourselves to prevent + * further events. + */ + eventfd_ctx_remove_wait_queue(irqfd->irqfd_eventfd_ctx, &irqfd->irqfd_wait, &cnt); + + if (irqfd->irqfd_resampler) { + mshv_irqfd_resampler_shutdown(irqfd); + eventfd_ctx_put(irqfd->irqfd_resamplefd); + } + + /* + * It is now safe to release the object's resources + */ + eventfd_ctx_put(irqfd->irqfd_eventfd_ctx); + kfree(irqfd); +} + +/* assumes partition->pt_irqfds_lock is held */ +static bool mshv_irqfd_is_active(struct mshv_irqfd *irqfd) +{ + return !hlist_unhashed(&irqfd->irqfd_hnode); +} + +/* + * Mark the irqfd as inactive and schedule it for removal + * + * assumes partition->pt_irqfds_lock is held + */ +static void mshv_irqfd_deactivate(struct mshv_irqfd *irqfd) +{ + if (!mshv_irqfd_is_active(irqfd)) + return; + + hlist_del(&irqfd->irqfd_hnode); + + queue_work(irqfd_cleanup_wq, &irqfd->irqfd_shutdown); +} + +/* + * Called with wqh->lock held and interrupts disabled + */ +static int mshv_irqfd_wakeup(wait_queue_entry_t *wait, unsigned int mode, + int sync, void *key) +{ + struct mshv_irqfd *irqfd = container_of(wait, struct mshv_irqfd, + irqfd_wait); + __poll_t flags = key_to_poll(key); + int idx; + unsigned int seq; + struct mshv_partition *pt = irqfd->irqfd_partn; + int ret = 0; + + if (flags & EPOLLIN) { + u64 cnt; + + eventfd_ctx_do_read(irqfd->irqfd_eventfd_ctx, &cnt); + idx = srcu_read_lock(&pt->pt_irq_srcu); + do { + seq = read_seqcount_begin(&irqfd->irqfd_irqe_sc); + } while (read_seqcount_retry(&irqfd->irqfd_irqe_sc, seq)); + + /* An event has been signaled, raise an interrupt */ + ret = mshv_try_assert_irq_fast(irqfd); + if (ret) + mshv_assert_irq_slow(irqfd); + + srcu_read_unlock(&pt->pt_irq_srcu, idx); + + ret = 1; + } + + if (flags & EPOLLHUP) { + /* The eventfd is closing, detach from the partition */ + unsigned long flags; + + spin_lock_irqsave(&pt->pt_irqfds_lock, flags); + + /* + * We must check if someone deactivated the irqfd before + * we could acquire the pt_irqfds_lock since the item is + * deactivated from the mshv side before it is unhooked from + * the wait-queue. If it is already deactivated, we can + * simply return knowing the other side will cleanup for us. + * We cannot race against the irqfd going away since the + * other side is required to acquire wqh->lock, which we hold + */ + if (mshv_irqfd_is_active(irqfd)) + mshv_irqfd_deactivate(irqfd); + + spin_unlock_irqrestore(&pt->pt_irqfds_lock, flags); + } + + return ret; +} + +/* Must be called under pt_irqfds_lock */ +static void mshv_irqfd_update(struct mshv_partition *pt, + struct mshv_irqfd *irqfd) +{ + write_seqcount_begin(&irqfd->irqfd_irqe_sc); + irqfd->irqfd_girq_ent = mshv_ret_girq_entry(pt, + irqfd->irqfd_irqnum); + mshv_copy_girq_info(&irqfd->irqfd_girq_ent, &irqfd->irqfd_lapic_irq); + write_seqcount_end(&irqfd->irqfd_irqe_sc); +} + +void mshv_irqfd_routing_update(struct mshv_partition *pt) +{ + struct mshv_irqfd *irqfd; + + spin_lock_irq(&pt->pt_irqfds_lock); + hlist_for_each_entry(irqfd, &pt->pt_irqfds_list, irqfd_hnode) + mshv_irqfd_update(pt, irqfd); + spin_unlock_irq(&pt->pt_irqfds_lock); +} + +static void mshv_irqfd_queue_proc(struct file *file, wait_queue_head_t *wqh, + poll_table *polltbl) +{ + struct mshv_irqfd *irqfd = + container_of(polltbl, struct mshv_irqfd, irqfd_polltbl); + + /* + * TODO: Ensure there isn't already an exclusive, priority waiter, e.g. + * that the irqfd isn't already bound to another partition. Only the + * first exclusive waiter encountered will be notified, and + * add_wait_queue_priority() doesn't enforce exclusivity. + */ + irqfd->irqfd_wait.flags |= WQ_FLAG_EXCLUSIVE; + add_wait_queue_priority(wqh, &irqfd->irqfd_wait); +} + +static int mshv_irqfd_assign(struct mshv_partition *pt, + struct mshv_user_irqfd *args) +{ + struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL; + struct mshv_irqfd *irqfd, *tmp; + __poll_t events; + int ret; + int idx; + + CLASS(fd, f)(args->fd); + + irqfd = kzalloc_obj(*irqfd); + if (!irqfd) + return -ENOMEM; + + irqfd->irqfd_partn = pt; + irqfd->irqfd_irqnum = args->gsi; + INIT_WORK(&irqfd->irqfd_shutdown, mshv_irqfd_shutdown); + seqcount_spinlock_init(&irqfd->irqfd_irqe_sc, &pt->pt_irqfds_lock); + + if (fd_empty(f)) { + ret = -EBADF; + goto out; + } + + eventfd = eventfd_ctx_fileget(fd_file(f)); + if (IS_ERR(eventfd)) { + ret = PTR_ERR(eventfd); + goto fail; + } + + irqfd->irqfd_eventfd_ctx = eventfd; + + if (args->flags & BIT(MSHV_IRQFD_BIT_RESAMPLE)) { + struct mshv_irqfd_resampler *rp; + + resamplefd = eventfd_ctx_fdget(args->resamplefd); + if (IS_ERR(resamplefd)) { + ret = PTR_ERR(resamplefd); + goto fail; + } + + irqfd->irqfd_resamplefd = resamplefd; + + mutex_lock(&pt->irqfds_resampler_lock); + + hlist_for_each_entry(rp, &pt->irqfds_resampler_list, + rsmplr_hnode) { + if (rp->rsmplr_notifier.irq_ack_gsi == + irqfd->irqfd_irqnum) { + irqfd->irqfd_resampler = rp; + break; + } + } + + if (!irqfd->irqfd_resampler) { + rp = kzalloc_obj(*rp, GFP_KERNEL_ACCOUNT); + if (!rp) { + ret = -ENOMEM; + mutex_unlock(&pt->irqfds_resampler_lock); + goto fail; + } + + rp->rsmplr_partn = pt; + INIT_HLIST_HEAD(&rp->rsmplr_irqfd_list); + rp->rsmplr_notifier.irq_ack_gsi = irqfd->irqfd_irqnum; + rp->rsmplr_notifier.irq_acked = + mshv_irqfd_resampler_ack; + + hlist_add_head(&rp->rsmplr_hnode, + &pt->irqfds_resampler_list); + mshv_register_irq_ack_notifier(pt, + &rp->rsmplr_notifier); + irqfd->irqfd_resampler = rp; + } + + hlist_add_head_rcu(&irqfd->irqfd_resampler_hnode, + &irqfd->irqfd_resampler->rsmplr_irqfd_list); + + mutex_unlock(&pt->irqfds_resampler_lock); + } + + /* + * Install our own custom wake-up handling so we are notified via + * a callback whenever someone signals the underlying eventfd + */ + init_waitqueue_func_entry(&irqfd->irqfd_wait, mshv_irqfd_wakeup); + init_poll_funcptr(&irqfd->irqfd_polltbl, mshv_irqfd_queue_proc); + + spin_lock_irq(&pt->pt_irqfds_lock); +#if IS_ENABLED(CONFIG_X86) + if (args->flags & BIT(MSHV_IRQFD_BIT_RESAMPLE) && + !irqfd->irqfd_lapic_irq.lapic_control.level_triggered) { + /* + * Resample Fd must be for level triggered interrupt + * Otherwise return with failure + */ + spin_unlock_irq(&pt->pt_irqfds_lock); + ret = -EINVAL; + goto fail; + } +#endif + ret = 0; + hlist_for_each_entry(tmp, &pt->pt_irqfds_list, irqfd_hnode) { + if (irqfd->irqfd_eventfd_ctx != tmp->irqfd_eventfd_ctx) + continue; + /* This fd is used for another irq already. */ + ret = -EBUSY; + spin_unlock_irq(&pt->pt_irqfds_lock); + goto fail; + } + + idx = srcu_read_lock(&pt->pt_irq_srcu); + mshv_irqfd_update(pt, irqfd); + hlist_add_head(&irqfd->irqfd_hnode, &pt->pt_irqfds_list); + spin_unlock_irq(&pt->pt_irqfds_lock); + + /* + * Check if there was an event already pending on the eventfd + * before we registered, and trigger it as if we didn't miss it. + */ + events = vfs_poll(fd_file(f), &irqfd->irqfd_polltbl); + + if (events & EPOLLIN) + mshv_assert_irq_slow(irqfd); + + srcu_read_unlock(&pt->pt_irq_srcu, idx); + return 0; + +fail: + if (irqfd->irqfd_resampler) + mshv_irqfd_resampler_shutdown(irqfd); + + if (resamplefd && !IS_ERR(resamplefd)) + eventfd_ctx_put(resamplefd); + + if (eventfd && !IS_ERR(eventfd)) + eventfd_ctx_put(eventfd); + +out: + kfree(irqfd); + return ret; +} + +/* + * shutdown any irqfd's that match fd+gsi + */ +static int mshv_irqfd_deassign(struct mshv_partition *pt, + struct mshv_user_irqfd *args) +{ + struct mshv_irqfd *irqfd; + struct hlist_node *n; + struct eventfd_ctx *eventfd; + + eventfd = eventfd_ctx_fdget(args->fd); + if (IS_ERR(eventfd)) + return PTR_ERR(eventfd); + + hlist_for_each_entry_safe(irqfd, n, &pt->pt_irqfds_list, + irqfd_hnode) { + if (irqfd->irqfd_eventfd_ctx == eventfd && + irqfd->irqfd_irqnum == args->gsi) + + mshv_irqfd_deactivate(irqfd); + } + + eventfd_ctx_put(eventfd); + + /* + * Block until we know all outstanding shutdown jobs have completed + * so that we guarantee there will not be any more interrupts on this + * gsi once this deassign function returns. + */ + flush_workqueue(irqfd_cleanup_wq); + + return 0; +} + +int mshv_set_unset_irqfd(struct mshv_partition *pt, + struct mshv_user_irqfd *args) +{ + if (args->flags & ~MSHV_IRQFD_FLAGS_MASK) + return -EINVAL; + + if (args->flags & BIT(MSHV_IRQFD_BIT_DEASSIGN)) + return mshv_irqfd_deassign(pt, args); + + return mshv_irqfd_assign(pt, args); +} + +/* + * This function is called as the mshv VM fd is being released. + * Shutdown all irqfds that still remain open + */ +static void mshv_irqfd_release(struct mshv_partition *pt) +{ + struct mshv_irqfd *irqfd; + struct hlist_node *n; + + spin_lock_irq(&pt->pt_irqfds_lock); + + hlist_for_each_entry_safe(irqfd, n, &pt->pt_irqfds_list, irqfd_hnode) + mshv_irqfd_deactivate(irqfd); + + spin_unlock_irq(&pt->pt_irqfds_lock); + + /* + * Block until we know all outstanding shutdown jobs have completed + * since we do not take a mshv_partition* reference. + */ + flush_workqueue(irqfd_cleanup_wq); +} + +int mshv_irqfd_wq_init(void) +{ + irqfd_cleanup_wq = alloc_workqueue("mshv-irqfd-cleanup", WQ_PERCPU, 0); + if (!irqfd_cleanup_wq) + return -ENOMEM; + + return 0; +} + +void mshv_irqfd_wq_cleanup(void) +{ + destroy_workqueue(irqfd_cleanup_wq); +} + +/* + * -------------------------------------------------------------------- + * ioeventfd: translate a MMIO memory write to an eventfd signal. + * + * userspace can register a MMIO address with an eventfd for receiving + * notification when the memory has been touched. + * -------------------------------------------------------------------- + */ + +static void ioeventfd_release(struct mshv_ioeventfd *p, u64 partition_id) +{ + if (p->iovntfd_doorbell_id > 0) + mshv_unregister_doorbell(partition_id, p->iovntfd_doorbell_id); + eventfd_ctx_put(p->iovntfd_eventfd); + kfree(p); +} + +/* MMIO writes trigger an event if the addr/val match */ +static void ioeventfd_mmio_write(int doorbell_id, void *data) +{ + struct mshv_partition *partition = (struct mshv_partition *)data; + struct mshv_ioeventfd *p; + + rcu_read_lock(); + hlist_for_each_entry_rcu(p, &partition->ioeventfds_list, iovntfd_hnode) + if (p->iovntfd_doorbell_id == doorbell_id) { + eventfd_signal(p->iovntfd_eventfd); + break; + } + + rcu_read_unlock(); +} + +static bool ioeventfd_check_collision(struct mshv_partition *pt, + struct mshv_ioeventfd *p) + __must_hold(&pt->mutex) +{ + struct mshv_ioeventfd *_p; + + hlist_for_each_entry(_p, &pt->ioeventfds_list, iovntfd_hnode) + if (_p->iovntfd_addr == p->iovntfd_addr && + _p->iovntfd_length == p->iovntfd_length && + (_p->iovntfd_wildcard || p->iovntfd_wildcard || + _p->iovntfd_datamatch == p->iovntfd_datamatch)) + return true; + + return false; +} + +static int mshv_assign_ioeventfd(struct mshv_partition *pt, + struct mshv_user_ioeventfd *args) + __must_hold(&pt->mutex) +{ + struct mshv_ioeventfd *p; + struct eventfd_ctx *eventfd; + u64 doorbell_flags = 0; + int ret; + + /* This mutex is currently protecting ioeventfd.items list */ + WARN_ON_ONCE(!mutex_is_locked(&pt->pt_mutex)); + + if (args->flags & BIT(MSHV_IOEVENTFD_BIT_PIO)) + return -EOPNOTSUPP; + + /* must be natural-word sized */ + switch (args->len) { + case 0: + doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_ANY; + break; + case 1: + doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_BYTE; + break; + case 2: + doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_WORD; + break; + case 4: + doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_DWORD; + break; + case 8: + doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_QWORD; + break; + default: + return -EINVAL; + } + + /* check for range overflow */ + if (args->addr + args->len < args->addr) + return -EINVAL; + + /* check for extra flags that we don't understand */ + if (args->flags & ~MSHV_IOEVENTFD_FLAGS_MASK) + return -EINVAL; + + eventfd = eventfd_ctx_fdget(args->fd); + if (IS_ERR(eventfd)) + return PTR_ERR(eventfd); + + p = kzalloc_obj(*p); + if (!p) { + ret = -ENOMEM; + goto fail; + } + + p->iovntfd_addr = args->addr; + p->iovntfd_length = args->len; + p->iovntfd_eventfd = eventfd; + + /* The datamatch feature is optional, otherwise this is a wildcard */ + if (args->flags & BIT(MSHV_IOEVENTFD_BIT_DATAMATCH)) { + p->iovntfd_datamatch = args->datamatch; + } else { + p->iovntfd_wildcard = true; + doorbell_flags |= HV_DOORBELL_FLAG_TRIGGER_ANY_VALUE; + } + + if (ioeventfd_check_collision(pt, p)) { + ret = -EEXIST; + goto unlock_fail; + } + + ret = mshv_register_doorbell(pt->pt_id, ioeventfd_mmio_write, + (void *)pt, p->iovntfd_addr, + p->iovntfd_datamatch, doorbell_flags); + + trace_mshv_assign_ioeventfd(pt->pt_id, p->iovntfd_addr, + p->iovntfd_length, + p->iovntfd_datamatch, + p->iovntfd_wildcard, + p->iovntfd_eventfd, + ret); + + if (ret < 0) + goto unlock_fail; + + p->iovntfd_doorbell_id = ret; + + hlist_add_head_rcu(&p->iovntfd_hnode, &pt->ioeventfds_list); + + return 0; + +unlock_fail: + kfree(p); + +fail: + eventfd_ctx_put(eventfd); + + return ret; +} + +static int mshv_deassign_ioeventfd(struct mshv_partition *pt, + struct mshv_user_ioeventfd *args) + __must_hold(&pt->mutex) +{ + struct mshv_ioeventfd *p; + struct eventfd_ctx *eventfd; + struct hlist_node *n; + int ret = -ENOENT; + + /* This mutex is currently protecting ioeventfd.items list */ + WARN_ON_ONCE(!mutex_is_locked(&pt->pt_mutex)); + + eventfd = eventfd_ctx_fdget(args->fd); + if (IS_ERR(eventfd)) + return PTR_ERR(eventfd); + + hlist_for_each_entry_safe(p, n, &pt->ioeventfds_list, iovntfd_hnode) { + bool wildcard = !(args->flags & BIT(MSHV_IOEVENTFD_BIT_DATAMATCH)); + + if (p->iovntfd_eventfd != eventfd || + p->iovntfd_addr != args->addr || + p->iovntfd_length != args->len || + p->iovntfd_wildcard != wildcard) + continue; + + if (!p->iovntfd_wildcard && + p->iovntfd_datamatch != args->datamatch) + continue; + + trace_mshv_deassign_ioeventfd(pt->pt_id, p->iovntfd_addr, + p->iovntfd_length, + p->iovntfd_datamatch, + p->iovntfd_wildcard, + p->iovntfd_eventfd); + + hlist_del_rcu(&p->iovntfd_hnode); + synchronize_rcu(); + ioeventfd_release(p, pt->pt_id); + ret = 0; + break; + } + + eventfd_ctx_put(eventfd); + + return ret; +} + +int mshv_set_unset_ioeventfd(struct mshv_partition *pt, + struct mshv_user_ioeventfd *args) + __must_hold(&pt->mutex) +{ + if ((args->flags & ~MSHV_IOEVENTFD_FLAGS_MASK) || + mshv_field_nonzero(*args, rsvd)) + return -EINVAL; + + /* PIO not yet implemented */ + if (args->flags & BIT(MSHV_IOEVENTFD_BIT_PIO)) + return -EOPNOTSUPP; + + if (args->flags & BIT(MSHV_IOEVENTFD_BIT_DEASSIGN)) + return mshv_deassign_ioeventfd(pt, args); + + return mshv_assign_ioeventfd(pt, args); +} + +void mshv_eventfd_init(struct mshv_partition *pt) +{ + spin_lock_init(&pt->pt_irqfds_lock); + INIT_HLIST_HEAD(&pt->pt_irqfds_list); + + INIT_HLIST_HEAD(&pt->irqfds_resampler_list); + mutex_init(&pt->irqfds_resampler_lock); + + INIT_HLIST_HEAD(&pt->ioeventfds_list); +} + +void mshv_eventfd_release(struct mshv_partition *pt) +{ + struct hlist_head items; + struct hlist_node *n; + struct mshv_ioeventfd *p; + + hlist_move_list(&pt->ioeventfds_list, &items); + synchronize_rcu(); + + hlist_for_each_entry_safe(p, n, &items, iovntfd_hnode) { + hlist_del(&p->iovntfd_hnode); + ioeventfd_release(p, pt->pt_id); + } + + mshv_irqfd_release(pt); +} diff --git a/drivers/hv/mshv_eventfd.h b/drivers/hv/mshv_eventfd.h new file mode 100644 index 000000000000..464c6b81ab33 --- /dev/null +++ b/drivers/hv/mshv_eventfd.h @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * irqfd: Allows an fd to be used to inject an interrupt to the guest. + * ioeventfd: Allow an fd to be used to receive a signal from the guest. + * All credit goes to kvm developers. + */ + +#ifndef __LINUX_MSHV_EVENTFD_H +#define __LINUX_MSHV_EVENTFD_H + +#include <linux/poll.h> + +#include "mshv.h" +#include "mshv_root.h" + +/* struct to contain list of irqfds sharing an irq. Updates are protected by + * partition.irqfds.resampler_lock + */ +struct mshv_irqfd_resampler { + struct mshv_partition *rsmplr_partn; + struct hlist_head rsmplr_irqfd_list; + struct mshv_irq_ack_notifier rsmplr_notifier; + struct hlist_node rsmplr_hnode; +}; + +struct mshv_irqfd { + struct mshv_partition *irqfd_partn; + struct eventfd_ctx *irqfd_eventfd_ctx; + struct mshv_guest_irq_ent irqfd_girq_ent; + seqcount_spinlock_t irqfd_irqe_sc; + u32 irqfd_irqnum; + struct mshv_lapic_irq irqfd_lapic_irq; + struct hlist_node irqfd_hnode; + poll_table irqfd_polltbl; + wait_queue_entry_t irqfd_wait; + struct work_struct irqfd_shutdown; + struct mshv_irqfd_resampler *irqfd_resampler; + struct eventfd_ctx *irqfd_resamplefd; + struct hlist_node irqfd_resampler_hnode; +}; + +void mshv_eventfd_init(struct mshv_partition *partition); +void mshv_eventfd_release(struct mshv_partition *partition); + +void mshv_register_irq_ack_notifier(struct mshv_partition *partition, + struct mshv_irq_ack_notifier *mian); +void mshv_unregister_irq_ack_notifier(struct mshv_partition *partition, + struct mshv_irq_ack_notifier *mian); +bool mshv_notify_acked_gsi(struct mshv_partition *partition, int gsi); + +int mshv_set_unset_irqfd(struct mshv_partition *partition, + struct mshv_user_irqfd *args); + +int mshv_irqfd_wq_init(void); +void mshv_irqfd_wq_cleanup(void); + +struct mshv_ioeventfd { + struct hlist_node iovntfd_hnode; + u64 iovntfd_addr; + int iovntfd_length; + struct eventfd_ctx *iovntfd_eventfd; + u64 iovntfd_datamatch; + int iovntfd_doorbell_id; + bool iovntfd_wildcard; +}; + +int mshv_set_unset_ioeventfd(struct mshv_partition *pt, + struct mshv_user_ioeventfd *args); + +#endif /* __LINUX_MSHV_EVENTFD_H */ diff --git a/drivers/hv/mshv_irq.c b/drivers/hv/mshv_irq.c new file mode 100644 index 000000000000..b3142c84dcbc --- /dev/null +++ b/drivers/hv/mshv_irq.c @@ -0,0 +1,132 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2023, Microsoft Corporation. + * + * Authors: Microsoft Linux virtualization team + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <asm/mshyperv.h> + +#include "mshv_eventfd.h" +#include "mshv.h" +#include "mshv_root.h" + +/* called from the ioctl code, user wants to update the guest irq table */ +int mshv_update_routing_table(struct mshv_partition *partition, + const struct mshv_user_irq_entry *ue, + unsigned int numents) +{ + struct mshv_girq_routing_table *new = NULL, *old; + u32 i, nr_rt_entries = 0; + int r = 0; + + if (numents == 0) + goto swap_routes; + + for (i = 0; i < numents; i++) { + if (ue[i].gsi >= MSHV_MAX_GUEST_IRQS) + return -EINVAL; + + if (ue[i].address_hi) + return -EINVAL; + + nr_rt_entries = max(nr_rt_entries, ue[i].gsi); + } + nr_rt_entries += 1; + + new = kzalloc_flex(*new, mshv_girq_info_tbl, nr_rt_entries, + GFP_KERNEL_ACCOUNT); + if (!new) + return -ENOMEM; + + new->num_rt_entries = nr_rt_entries; + for (i = 0; i < numents; i++) { + struct mshv_guest_irq_ent *girq; + + girq = &new->mshv_girq_info_tbl[ue[i].gsi]; + + /* + * Allow only one to one mapping between GSI and MSI routing. + */ + if (girq->guest_irq_num != 0) { + r = -EINVAL; + goto out; + } + + girq->guest_irq_num = ue[i].gsi; + girq->girq_addr_lo = ue[i].address_lo; + girq->girq_addr_hi = ue[i].address_hi; + girq->girq_irq_data = ue[i].data; + girq->girq_entry_valid = true; + } + +swap_routes: + mutex_lock(&partition->pt_irq_lock); + old = rcu_dereference_protected(partition->pt_girq_tbl, 1); + rcu_assign_pointer(partition->pt_girq_tbl, new); + mshv_irqfd_routing_update(partition); + mutex_unlock(&partition->pt_irq_lock); + + synchronize_srcu_expedited(&partition->pt_irq_srcu); + + trace_mshv_update_routing_table(partition->pt_id, + old, new, numents); + + new = old; + +out: + kfree(new); + + return r; +} + +/* vm is going away, kfree the irq routing table */ +void mshv_free_routing_table(struct mshv_partition *partition) +{ + struct mshv_girq_routing_table *rt = + rcu_access_pointer(partition->pt_girq_tbl); + + kfree(rt); +} + +struct mshv_guest_irq_ent +mshv_ret_girq_entry(struct mshv_partition *partition, u32 irqnum) +{ + struct mshv_guest_irq_ent entry = { 0 }; + struct mshv_girq_routing_table *girq_tbl; + + girq_tbl = srcu_dereference_check(partition->pt_girq_tbl, + &partition->pt_irq_srcu, + lockdep_is_held(&partition->pt_irq_lock)); + if (!girq_tbl || irqnum >= girq_tbl->num_rt_entries) { + /* + * Premature register_irqfd, setting valid_entry = 0 + * would ignore this entry anyway + */ + entry.guest_irq_num = irqnum; + return entry; + } + + return girq_tbl->mshv_girq_info_tbl[irqnum]; +} + +void mshv_copy_girq_info(struct mshv_guest_irq_ent *ent, + struct mshv_lapic_irq *lirq) +{ + memset(lirq, 0, sizeof(*lirq)); + if (!ent || !ent->girq_entry_valid) + return; + + lirq->lapic_vector = ent->girq_irq_data & 0xFF; + lirq->lapic_apic_id = (ent->girq_addr_lo >> 12) & 0xFF; + lirq->lapic_control.interrupt_type = (ent->girq_irq_data & 0x700) >> 8; +#if IS_ENABLED(CONFIG_X86) + lirq->lapic_control.level_triggered = (ent->girq_irq_data >> 15) & 0x1; + lirq->lapic_control.logical_dest_mode = (ent->girq_addr_lo >> 2) & 0x1; +#elif IS_ENABLED(CONFIG_ARM64) + lirq->lapic_control.asserted = 1; +#endif +} diff --git a/drivers/hv/mshv_portid_table.c b/drivers/hv/mshv_portid_table.c new file mode 100644 index 000000000000..c349af1f0aaa --- /dev/null +++ b/drivers/hv/mshv_portid_table.c @@ -0,0 +1,83 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/idr.h> +#include <asm/mshyperv.h> + +#include "mshv.h" +#include "mshv_root.h" + +/* + * Ports and connections are hypervisor struct used for inter-partition + * communication. Port represents the source and connection represents + * the destination. Partitions are responsible for managing the port and + * connection ids. + * + */ + +#define PORTID_MIN 1 +#define PORTID_MAX INT_MAX + +static DEFINE_IDR(port_table_idr); + +void +mshv_port_table_fini(void) +{ + struct port_table_info *port_info; + unsigned long i, tmp; + + idr_lock(&port_table_idr); + if (!idr_is_empty(&port_table_idr)) { + idr_for_each_entry_ul(&port_table_idr, port_info, tmp, i) { + port_info = idr_remove(&port_table_idr, i); + kfree_rcu(port_info, portbl_rcu); + } + } + idr_unlock(&port_table_idr); +} + +int +mshv_portid_alloc(struct port_table_info *info) +{ + int ret = 0; + + idr_lock(&port_table_idr); + ret = idr_alloc(&port_table_idr, info, PORTID_MIN, + PORTID_MAX, GFP_KERNEL); + idr_unlock(&port_table_idr); + + return ret; +} + +void +mshv_portid_free(int port_id) +{ + struct port_table_info *info; + + idr_lock(&port_table_idr); + info = idr_remove(&port_table_idr, port_id); + WARN_ON(!info); + idr_unlock(&port_table_idr); + + synchronize_rcu(); + kfree(info); +} + +int +mshv_portid_lookup(int port_id, struct port_table_info *info) +{ + struct port_table_info *_info; + int ret = -ENOENT; + + rcu_read_lock(); + _info = idr_find(&port_table_idr, port_id); + rcu_read_unlock(); + + if (_info) { + *info = *_info; + ret = 0; + } + + return ret; +} diff --git a/drivers/hv/mshv_regions.c b/drivers/hv/mshv_regions.c new file mode 100644 index 000000000000..fdffd4f002f6 --- /dev/null +++ b/drivers/hv/mshv_regions.c @@ -0,0 +1,590 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2025, Microsoft Corporation. + * + * Memory region management for mshv_root module. + * + * Authors: Microsoft Linux virtualization team + */ + +#include <linux/hmm.h> +#include <linux/hyperv.h> +#include <linux/kref.h> +#include <linux/mm.h> +#include <linux/vmalloc.h> + +#include <asm/mshyperv.h> + +#include "mshv_root.h" + +#define MSHV_MAP_FAULT_IN_PAGES PTRS_PER_PMD + +/** + * mshv_chunk_stride - Compute stride for mapping guest memory + * @page : The page to check for huge page backing + * @gfn : Guest frame number for the mapping + * @page_count: Total number of pages in the mapping + * + * Determines the appropriate stride (in pages) for mapping guest memory. + * Uses huge page stride if the backing page is huge and the guest mapping + * is properly aligned; otherwise falls back to single page stride. + * + * Return: Stride in pages, or -EINVAL if page order is unsupported. + */ +static int mshv_chunk_stride(struct page *page, + u64 gfn, u64 page_count) +{ + unsigned int page_order; + + /* + * Use single page stride by default. For huge page stride, the + * page must be compound and point to the head of the compound + * page, and both gfn and page_count must be huge-page aligned. + */ + if (!PageCompound(page) || !PageHead(page) || + !IS_ALIGNED(gfn, PTRS_PER_PMD) || + !IS_ALIGNED(page_count, PTRS_PER_PMD)) + return 1; + + page_order = folio_order(page_folio(page)); + /* The hypervisor only supports 2M huge page */ + if (page_order != PMD_ORDER) + return -EINVAL; + + return 1 << page_order; +} + +/** + * mshv_region_process_chunk - Processes a contiguous chunk of memory pages + * in a region. + * @region : Pointer to the memory region structure. + * @flags : Flags to pass to the handler. + * @page_offset: Offset into the region's pages array to start processing. + * @page_count : Number of pages to process. + * @handler : Callback function to handle the chunk. + * + * This function scans the region's pages starting from @page_offset, + * checking for contiguous present pages of the same size (normal or huge). + * It invokes @handler for the chunk of contiguous pages found. Returns the + * number of pages handled, or a negative error code if the first page is + * not present or the handler fails. + * + * Note: The @handler callback must be able to handle both normal and huge + * pages. + * + * Return: Number of pages handled, or negative error code. + */ +static long mshv_region_process_chunk(struct mshv_mem_region *region, + u32 flags, + u64 page_offset, u64 page_count, + int (*handler)(struct mshv_mem_region *region, + u32 flags, + u64 page_offset, + u64 page_count, + bool huge_page)) +{ + u64 gfn = region->start_gfn + page_offset; + u64 count; + struct page *page; + int stride, ret; + + page = region->mreg_pages[page_offset]; + if (!page) + return -EINVAL; + + stride = mshv_chunk_stride(page, gfn, page_count); + if (stride < 0) + return stride; + + /* Start at stride since the first stride is validated */ + for (count = stride; count < page_count; count += stride) { + page = region->mreg_pages[page_offset + count]; + + /* Break if current page is not present */ + if (!page) + break; + + /* Break if stride size changes */ + if (stride != mshv_chunk_stride(page, gfn + count, + page_count - count)) + break; + } + + ret = handler(region, flags, page_offset, count, stride > 1); + if (ret) + return ret; + + return count; +} + +/** + * mshv_region_process_range - Processes a range of memory pages in a + * region. + * @region : Pointer to the memory region structure. + * @flags : Flags to pass to the handler. + * @page_offset: Offset into the region's pages array to start processing. + * @page_count : Number of pages to process. + * @handler : Callback function to handle each chunk of contiguous + * pages. + * + * Iterates over the specified range of pages in @region, skipping + * non-present pages. For each contiguous chunk of present pages, invokes + * @handler via mshv_region_process_chunk. + * + * Note: The @handler callback must be able to handle both normal and huge + * pages. + * + * Returns 0 on success, or a negative error code on failure. + */ +static int mshv_region_process_range(struct mshv_mem_region *region, + u32 flags, + u64 page_offset, u64 page_count, + int (*handler)(struct mshv_mem_region *region, + u32 flags, + u64 page_offset, + u64 page_count, + bool huge_page)) +{ + long ret; + + if (page_offset + page_count > region->nr_pages) + return -EINVAL; + + while (page_count) { + /* Skip non-present pages */ + if (!region->mreg_pages[page_offset]) { + page_offset++; + page_count--; + continue; + } + + ret = mshv_region_process_chunk(region, flags, + page_offset, + page_count, + handler); + if (ret < 0) + return ret; + + page_offset += ret; + page_count -= ret; + } + + return 0; +} + +struct mshv_mem_region *mshv_region_create(u64 guest_pfn, u64 nr_pages, + u64 uaddr, u32 flags) +{ + struct mshv_mem_region *region; + + region = vzalloc(sizeof(*region) + sizeof(struct page *) * nr_pages); + if (!region) + return ERR_PTR(-ENOMEM); + + region->nr_pages = nr_pages; + region->start_gfn = guest_pfn; + region->start_uaddr = uaddr; + region->hv_map_flags = HV_MAP_GPA_READABLE | HV_MAP_GPA_ADJUSTABLE; + if (flags & BIT(MSHV_SET_MEM_BIT_WRITABLE)) + region->hv_map_flags |= HV_MAP_GPA_WRITABLE; + if (flags & BIT(MSHV_SET_MEM_BIT_EXECUTABLE)) + region->hv_map_flags |= HV_MAP_GPA_EXECUTABLE; + + kref_init(®ion->mreg_refcount); + + return region; +} + +static int mshv_region_chunk_share(struct mshv_mem_region *region, + u32 flags, + u64 page_offset, u64 page_count, + bool huge_page) +{ + if (huge_page) + flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE; + + return hv_call_modify_spa_host_access(region->partition->pt_id, + region->mreg_pages + page_offset, + page_count, + HV_MAP_GPA_READABLE | + HV_MAP_GPA_WRITABLE, + flags, true); +} + +int mshv_region_share(struct mshv_mem_region *region) +{ + u32 flags = HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_SHARED; + + return mshv_region_process_range(region, flags, + 0, region->nr_pages, + mshv_region_chunk_share); +} + +static int mshv_region_chunk_unshare(struct mshv_mem_region *region, + u32 flags, + u64 page_offset, u64 page_count, + bool huge_page) +{ + if (huge_page) + flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE; + + return hv_call_modify_spa_host_access(region->partition->pt_id, + region->mreg_pages + page_offset, + page_count, 0, + flags, false); +} + +int mshv_region_unshare(struct mshv_mem_region *region) +{ + u32 flags = HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_EXCLUSIVE; + + return mshv_region_process_range(region, flags, + 0, region->nr_pages, + mshv_region_chunk_unshare); +} + +static int mshv_region_chunk_remap(struct mshv_mem_region *region, + u32 flags, + u64 page_offset, u64 page_count, + bool huge_page) +{ + if (huge_page) + flags |= HV_MAP_GPA_LARGE_PAGE; + + return hv_call_map_gpa_pages(region->partition->pt_id, + region->start_gfn + page_offset, + page_count, flags, + region->mreg_pages + page_offset); +} + +static int mshv_region_remap_pages(struct mshv_mem_region *region, + u32 map_flags, + u64 page_offset, u64 page_count) +{ + return mshv_region_process_range(region, map_flags, + page_offset, page_count, + mshv_region_chunk_remap); +} + +int mshv_region_map(struct mshv_mem_region *region) +{ + u32 map_flags = region->hv_map_flags; + + return mshv_region_remap_pages(region, map_flags, + 0, region->nr_pages); +} + +static void mshv_region_invalidate_pages(struct mshv_mem_region *region, + u64 page_offset, u64 page_count) +{ + if (region->mreg_type == MSHV_REGION_TYPE_MEM_PINNED) + unpin_user_pages(region->mreg_pages + page_offset, page_count); + + memset(region->mreg_pages + page_offset, 0, + page_count * sizeof(struct page *)); +} + +void mshv_region_invalidate(struct mshv_mem_region *region) +{ + mshv_region_invalidate_pages(region, 0, region->nr_pages); +} + +int mshv_region_pin(struct mshv_mem_region *region) +{ + u64 done_count, nr_pages; + struct page **pages; + __u64 userspace_addr; + int ret; + + for (done_count = 0; done_count < region->nr_pages; done_count += ret) { + pages = region->mreg_pages + done_count; + userspace_addr = region->start_uaddr + + done_count * HV_HYP_PAGE_SIZE; + nr_pages = min(region->nr_pages - done_count, + MSHV_PIN_PAGES_BATCH_SIZE); + + /* + * Pinning assuming 4k pages works for large pages too. + * All page structs within the large page are returned. + * + * Pin requests are batched because pin_user_pages_fast + * with the FOLL_LONGTERM flag does a large temporary + * allocation of contiguous memory. + */ + ret = pin_user_pages_fast(userspace_addr, nr_pages, + FOLL_WRITE | FOLL_LONGTERM, + pages); + if (ret != nr_pages) + goto release_pages; + } + + return 0; + +release_pages: + if (ret > 0) + done_count += ret; + mshv_region_invalidate_pages(region, 0, done_count); + return ret < 0 ? ret : -ENOMEM; +} + +static int mshv_region_chunk_unmap(struct mshv_mem_region *region, + u32 flags, + u64 page_offset, u64 page_count, + bool huge_page) +{ + if (huge_page) + flags |= HV_UNMAP_GPA_LARGE_PAGE; + + return hv_call_unmap_gpa_pages(region->partition->pt_id, + region->start_gfn + page_offset, + page_count, flags); +} + +static int mshv_region_unmap(struct mshv_mem_region *region) +{ + return mshv_region_process_range(region, 0, + 0, region->nr_pages, + mshv_region_chunk_unmap); +} + +static void mshv_region_destroy(struct kref *ref) +{ + struct mshv_mem_region *region = + container_of(ref, struct mshv_mem_region, mreg_refcount); + struct mshv_partition *partition = region->partition; + int ret; + + if (region->mreg_type == MSHV_REGION_TYPE_MEM_MOVABLE) + mshv_region_movable_fini(region); + + if (mshv_partition_encrypted(partition)) { + ret = mshv_region_share(region); + if (ret) { + pt_err(partition, + "Failed to regain access to memory, unpinning user pages will fail and crash the host error: %d\n", + ret); + return; + } + } + + mshv_region_unmap(region); + + mshv_region_invalidate(region); + + vfree(region); +} + +void mshv_region_put(struct mshv_mem_region *region) +{ + kref_put(®ion->mreg_refcount, mshv_region_destroy); +} + +int mshv_region_get(struct mshv_mem_region *region) +{ + return kref_get_unless_zero(®ion->mreg_refcount); +} + +/** + * mshv_region_hmm_fault_and_lock - Handle HMM faults and lock the memory region + * @region: Pointer to the memory region structure + * @range: Pointer to the HMM range structure + * + * This function performs the following steps: + * 1. Reads the notifier sequence for the HMM range. + * 2. Acquires a read lock on the memory map. + * 3. Handles HMM faults for the specified range. + * 4. Releases the read lock on the memory map. + * 5. If successful, locks the memory region mutex. + * 6. Verifies if the notifier sequence has changed during the operation. + * If it has, releases the mutex and returns -EBUSY to match with + * hmm_range_fault() return code for repeating. + * + * Return: 0 on success, a negative error code otherwise. + */ +static int mshv_region_hmm_fault_and_lock(struct mshv_mem_region *region, + struct hmm_range *range) +{ + int ret; + + range->notifier_seq = mmu_interval_read_begin(range->notifier); + mmap_read_lock(region->mreg_mni.mm); + ret = hmm_range_fault(range); + mmap_read_unlock(region->mreg_mni.mm); + if (ret) + return ret; + + mutex_lock(®ion->mreg_mutex); + + if (mmu_interval_read_retry(range->notifier, range->notifier_seq)) { + mutex_unlock(®ion->mreg_mutex); + cond_resched(); + return -EBUSY; + } + + return 0; +} + +/** + * mshv_region_range_fault - Handle memory range faults for a given region. + * @region: Pointer to the memory region structure. + * @page_offset: Offset of the page within the region. + * @page_count: Number of pages to handle. + * + * This function resolves memory faults for a specified range of pages + * within a memory region. It uses HMM (Heterogeneous Memory Management) + * to fault in the required pages and updates the region's page array. + * + * Return: 0 on success, negative error code on failure. + */ +static int mshv_region_range_fault(struct mshv_mem_region *region, + u64 page_offset, u64 page_count) +{ + struct hmm_range range = { + .notifier = ®ion->mreg_mni, + .default_flags = HMM_PFN_REQ_FAULT | HMM_PFN_REQ_WRITE, + }; + unsigned long *pfns; + int ret; + u64 i; + + pfns = kmalloc_array(page_count, sizeof(*pfns), GFP_KERNEL); + if (!pfns) + return -ENOMEM; + + range.hmm_pfns = pfns; + range.start = region->start_uaddr + page_offset * HV_HYP_PAGE_SIZE; + range.end = range.start + page_count * HV_HYP_PAGE_SIZE; + + do { + ret = mshv_region_hmm_fault_and_lock(region, &range); + } while (ret == -EBUSY); + + if (ret) + goto out; + + for (i = 0; i < page_count; i++) + region->mreg_pages[page_offset + i] = hmm_pfn_to_page(pfns[i]); + + ret = mshv_region_remap_pages(region, region->hv_map_flags, + page_offset, page_count); + + mutex_unlock(®ion->mreg_mutex); +out: + kfree(pfns); + return ret; +} + +bool mshv_region_handle_gfn_fault(struct mshv_mem_region *region, u64 gfn) +{ + u64 page_offset, page_count; + int ret; + + /* Align the page offset to the nearest MSHV_MAP_FAULT_IN_PAGES. */ + page_offset = ALIGN_DOWN(gfn - region->start_gfn, + MSHV_MAP_FAULT_IN_PAGES); + + /* Map more pages than requested to reduce the number of faults. */ + page_count = min(region->nr_pages - page_offset, + MSHV_MAP_FAULT_IN_PAGES); + + ret = mshv_region_range_fault(region, page_offset, page_count); + + WARN_ONCE(ret, + "p%llu: GPA intercept failed: region %#llx-%#llx, gfn %#llx, page_offset %llu, page_count %llu\n", + region->partition->pt_id, region->start_uaddr, + region->start_uaddr + (region->nr_pages << HV_HYP_PAGE_SHIFT), + gfn, page_offset, page_count); + + return !ret; +} + +/** + * mshv_region_interval_invalidate - Invalidate a range of memory region + * @mni: Pointer to the mmu_interval_notifier structure + * @range: Pointer to the mmu_notifier_range structure + * @cur_seq: Current sequence number for the interval notifier + * + * This function invalidates a memory region by remapping its pages with + * no access permissions. It locks the region's mutex to ensure thread safety + * and updates the sequence number for the interval notifier. If the range + * is blockable, it uses a blocking lock; otherwise, it attempts a non-blocking + * lock and returns false if unsuccessful. + * + * NOTE: Failure to invalidate a region is a serious error, as the pages will + * be considered freed while they are still mapped by the hypervisor. + * Any attempt to access such pages will likely crash the system. + * + * Return: true if the region was successfully invalidated, false otherwise. + */ +static bool mshv_region_interval_invalidate(struct mmu_interval_notifier *mni, + const struct mmu_notifier_range *range, + unsigned long cur_seq) +{ + struct mshv_mem_region *region = container_of(mni, + struct mshv_mem_region, + mreg_mni); + u64 page_offset, page_count; + unsigned long mstart, mend; + int ret = -EPERM; + + mstart = max(range->start, region->start_uaddr); + mend = min(range->end, region->start_uaddr + + (region->nr_pages << HV_HYP_PAGE_SHIFT)); + + page_offset = HVPFN_DOWN(mstart - region->start_uaddr); + page_count = HVPFN_DOWN(mend - mstart); + + if (mmu_notifier_range_blockable(range)) + mutex_lock(®ion->mreg_mutex); + else if (!mutex_trylock(®ion->mreg_mutex)) + goto out_fail; + + mmu_interval_set_seq(mni, cur_seq); + + ret = mshv_region_remap_pages(region, HV_MAP_GPA_NO_ACCESS, + page_offset, page_count); + if (ret) + goto out_unlock; + + mshv_region_invalidate_pages(region, page_offset, page_count); + + mutex_unlock(®ion->mreg_mutex); + + return true; + +out_unlock: + mutex_unlock(®ion->mreg_mutex); +out_fail: + WARN_ONCE(ret, + "Failed to invalidate region %#llx-%#llx (range %#lx-%#lx, event: %u, pages %#llx-%#llx, mm: %#llx): %d\n", + region->start_uaddr, + region->start_uaddr + (region->nr_pages << HV_HYP_PAGE_SHIFT), + range->start, range->end, range->event, + page_offset, page_offset + page_count - 1, (u64)range->mm, ret); + return false; +} + +static const struct mmu_interval_notifier_ops mshv_region_mni_ops = { + .invalidate = mshv_region_interval_invalidate, +}; + +void mshv_region_movable_fini(struct mshv_mem_region *region) +{ + mmu_interval_notifier_remove(®ion->mreg_mni); +} + +bool mshv_region_movable_init(struct mshv_mem_region *region) +{ + int ret; + + ret = mmu_interval_notifier_insert(®ion->mreg_mni, current->mm, + region->start_uaddr, + region->nr_pages << HV_HYP_PAGE_SHIFT, + &mshv_region_mni_ops); + if (ret) + return false; + + mutex_init(®ion->mreg_mutex); + + return true; +} diff --git a/drivers/hv/mshv_root.h b/drivers/hv/mshv_root.h new file mode 100644 index 000000000000..1f086dcb7aa1 --- /dev/null +++ b/drivers/hv/mshv_root.h @@ -0,0 +1,381 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2023, Microsoft Corporation. + */ + +#ifndef _MSHV_ROOT_H_ +#define _MSHV_ROOT_H_ + +#include <linux/spinlock.h> +#include <linux/mutex.h> +#include <linux/semaphore.h> +#include <linux/sched.h> +#include <linux/srcu.h> +#include <linux/wait.h> +#include <linux/hashtable.h> +#include <linux/dev_printk.h> +#include <linux/build_bug.h> +#include <linux/mmu_notifier.h> +#include <uapi/linux/mshv.h> +#include "mshv_trace.h" + +/* + * Hypervisor must be between these version numbers (inclusive) + * to guarantee compatibility + */ +#define MSHV_HV_MIN_VERSION (27744) +#define MSHV_HV_MAX_VERSION (27751) + +static_assert(HV_HYP_PAGE_SIZE == MSHV_HV_PAGE_SIZE); + +#define MSHV_MAX_VPS 256 + +#define MSHV_PARTITIONS_HASH_BITS 9 + +#define MSHV_PIN_PAGES_BATCH_SIZE (0x10000000ULL / HV_HYP_PAGE_SIZE) + +struct mshv_vp { + u32 vp_index; + struct mshv_partition *vp_partition; + struct mutex vp_mutex; + struct hv_vp_register_page *vp_register_page; + struct hv_message *vp_intercept_msg_page; + void *vp_ghcb_page; + struct hv_stats_page *vp_stats_pages[2]; + struct { + atomic64_t vp_signaled_count; + struct { + u64 intercept_suspend: 1; + u64 root_sched_blocked: 1; /* root scheduler only */ + u64 root_sched_dispatched: 1; /* root scheduler only */ + u64 reserved: 61; + } flags; + unsigned int kicked_by_hv; + wait_queue_head_t vp_suspend_queue; + } run; +#if IS_ENABLED(CONFIG_DEBUG_FS) + struct dentry *vp_stats_dentry; +#endif +}; + +#define vp_fmt(fmt) "p%lluvp%u: " fmt +#define vp_devprintk(level, v, fmt, ...) \ +do { \ + const struct mshv_vp *__vp = (v); \ + const struct mshv_partition *__pt = __vp->vp_partition; \ + dev_##level(__pt->pt_module_dev, vp_fmt(fmt), __pt->pt_id, \ + __vp->vp_index, ##__VA_ARGS__); \ +} while (0) +#define vp_emerg(v, fmt, ...) vp_devprintk(emerg, v, fmt, ##__VA_ARGS__) +#define vp_crit(v, fmt, ...) vp_devprintk(crit, v, fmt, ##__VA_ARGS__) +#define vp_alert(v, fmt, ...) vp_devprintk(alert, v, fmt, ##__VA_ARGS__) +#define vp_err(v, fmt, ...) vp_devprintk(err, v, fmt, ##__VA_ARGS__) +#define vp_warn(v, fmt, ...) vp_devprintk(warn, v, fmt, ##__VA_ARGS__) +#define vp_notice(v, fmt, ...) vp_devprintk(notice, v, fmt, ##__VA_ARGS__) +#define vp_info(v, fmt, ...) vp_devprintk(info, v, fmt, ##__VA_ARGS__) +#define vp_dbg(v, fmt, ...) vp_devprintk(dbg, v, fmt, ##__VA_ARGS__) + +enum mshv_region_type { + MSHV_REGION_TYPE_MEM_PINNED, + MSHV_REGION_TYPE_MEM_MOVABLE, + MSHV_REGION_TYPE_MMIO +}; + +struct mshv_mem_region { + struct hlist_node hnode; + struct kref mreg_refcount; + u64 nr_pages; + u64 start_gfn; + u64 start_uaddr; + u32 hv_map_flags; + struct mshv_partition *partition; + enum mshv_region_type mreg_type; + struct mmu_interval_notifier mreg_mni; + struct mutex mreg_mutex; /* protects region pages remapping */ + struct page *mreg_pages[]; +}; + +struct mshv_irq_ack_notifier { + struct hlist_node link; + unsigned int irq_ack_gsi; + void (*irq_acked)(struct mshv_irq_ack_notifier *mian); +}; + +struct mshv_partition { + struct device *pt_module_dev; + + struct hlist_node pt_hnode; + u64 pt_id; + refcount_t pt_ref_count; + struct mutex pt_mutex; + + spinlock_t pt_mem_regions_lock; + struct hlist_head pt_mem_regions; // not ordered + + u32 pt_vp_count; + struct mshv_vp *pt_vp_array[MSHV_MAX_VPS]; + + struct mutex pt_irq_lock; + struct srcu_struct pt_irq_srcu; + struct hlist_head irq_ack_notifier_list; + + struct hlist_head pt_devices; + + /* + * MSHV does not support more than one async hypercall in flight + * for a single partition. Thus, it is okay to define per partition + * async hypercall status. + */ + struct completion async_hypercall; + u64 async_hypercall_status; + + spinlock_t pt_irqfds_lock; + struct hlist_head pt_irqfds_list; + struct mutex irqfds_resampler_lock; + struct hlist_head irqfds_resampler_list; + + struct hlist_head ioeventfds_list; + + struct mshv_girq_routing_table __rcu *pt_girq_tbl; + u64 isolation_type; + bool import_completed; + bool pt_initialized; +#if IS_ENABLED(CONFIG_DEBUG_FS) + struct dentry *pt_stats_dentry; + struct dentry *pt_vp_dentry; +#endif +}; + +#define pt_fmt(fmt) "p%llu: " fmt +#define pt_devprintk(level, p, fmt, ...) \ +do { \ + const struct mshv_partition *__pt = (p); \ + dev_##level(__pt->pt_module_dev, pt_fmt(fmt), __pt->pt_id, \ + ##__VA_ARGS__); \ +} while (0) +#define pt_emerg(p, fmt, ...) pt_devprintk(emerg, p, fmt, ##__VA_ARGS__) +#define pt_crit(p, fmt, ...) pt_devprintk(crit, p, fmt, ##__VA_ARGS__) +#define pt_alert(p, fmt, ...) pt_devprintk(alert, p, fmt, ##__VA_ARGS__) +#define pt_err(p, fmt, ...) pt_devprintk(err, p, fmt, ##__VA_ARGS__) +#define pt_warn(p, fmt, ...) pt_devprintk(warn, p, fmt, ##__VA_ARGS__) +#define pt_notice(p, fmt, ...) pt_devprintk(notice, p, fmt, ##__VA_ARGS__) +#define pt_info(p, fmt, ...) pt_devprintk(info, p, fmt, ##__VA_ARGS__) +#define pt_dbg(p, fmt, ...) pt_devprintk(dbg, p, fmt, ##__VA_ARGS__) + +struct mshv_lapic_irq { + u32 lapic_vector; + u64 lapic_apic_id; + union hv_interrupt_control lapic_control; +}; + +#define MSHV_MAX_GUEST_IRQS 4096 + +/* representation of one guest irq entry, either msi or legacy */ +struct mshv_guest_irq_ent { + u32 girq_entry_valid; /* vfio looks at this */ + u32 guest_irq_num; /* a unique number for each irq */ + u32 girq_addr_lo; /* guest irq msi address info */ + u32 girq_addr_hi; + u32 girq_irq_data; /* idt vector in some cases */ +}; + +struct mshv_girq_routing_table { + u32 num_rt_entries; + struct mshv_guest_irq_ent mshv_girq_info_tbl[]; +}; + +struct hv_synic_pages { + struct hv_message_page *hyp_synic_message_page; + struct hv_synic_event_flags_page *synic_event_flags_page; + struct hv_synic_event_ring_page *synic_event_ring_page; +}; + +struct mshv_root { + spinlock_t pt_ht_lock; + DECLARE_HASHTABLE(pt_htable, MSHV_PARTITIONS_HASH_BITS); + struct hv_partition_property_vmm_capabilities vmm_caps; +}; + +/* + * Callback for doorbell events. + * NOTE: This is called in interrupt context. Callback + * should defer slow and sleeping logic to later. + */ +typedef void (*doorbell_cb_t) (int doorbell_id, void *); + +/* + * port table information + */ +struct port_table_info { + struct rcu_head portbl_rcu; + enum hv_port_type hv_port_type; + union { + struct { + u64 reserved[2]; + } hv_port_message; + struct { + u64 reserved[2]; + } hv_port_event; + struct { + u64 reserved[2]; + } hv_port_monitor; + struct { + doorbell_cb_t doorbell_cb; + void *data; + } hv_port_doorbell; + }; +}; + +int mshv_update_routing_table(struct mshv_partition *partition, + const struct mshv_user_irq_entry *entries, + unsigned int numents); +void mshv_free_routing_table(struct mshv_partition *partition); + +struct mshv_guest_irq_ent mshv_ret_girq_entry(struct mshv_partition *partition, + u32 irq_num); + +void mshv_copy_girq_info(struct mshv_guest_irq_ent *src_irq, + struct mshv_lapic_irq *dest_irq); + +void mshv_irqfd_routing_update(struct mshv_partition *partition); + +void mshv_port_table_fini(void); +int mshv_portid_alloc(struct port_table_info *info); +int mshv_portid_lookup(int port_id, struct port_table_info *info); +void mshv_portid_free(int port_id); + +int mshv_register_doorbell(u64 partition_id, doorbell_cb_t doorbell_cb, + void *data, u64 gpa, u64 val, u64 flags); +void mshv_unregister_doorbell(u64 partition_id, int doorbell_portid); + +void mshv_isr(void); +int mshv_synic_init(struct device *dev); +void mshv_synic_exit(void); + +static inline bool mshv_partition_encrypted(struct mshv_partition *partition) +{ + return partition->isolation_type == HV_PARTITION_ISOLATION_TYPE_SNP; +} + +struct mshv_partition *mshv_partition_get(struct mshv_partition *partition); +void mshv_partition_put(struct mshv_partition *partition); +struct mshv_partition *mshv_partition_find(u64 partition_id) __must_hold(RCU); + +static inline bool is_l1vh_parent(u64 partition_id) +{ + return hv_l1vh_partition() && (partition_id == HV_PARTITION_ID_SELF); +} + +int mshv_vp_stats_map(u64 partition_id, u32 vp_index, + struct hv_stats_page **stats_pages); +void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index, + struct hv_stats_page **stats_pages); + +/* hypercalls */ + +int hv_call_withdraw_memory(u64 count, int node, u64 partition_id); +int hv_call_create_partition(u64 flags, + struct hv_partition_creation_properties creation_properties, + union hv_partition_isolation_properties isolation_properties, + u64 *partition_id); +int hv_call_initialize_partition(u64 partition_id); +int hv_call_finalize_partition(u64 partition_id); +int hv_call_delete_partition(u64 partition_id); +int hv_call_map_mmio_pages(u64 partition_id, u64 gfn, u64 mmio_spa, u64 numpgs); +int hv_call_map_gpa_pages(u64 partition_id, u64 gpa_target, u64 page_count, + u32 flags, struct page **pages); +int hv_call_unmap_gpa_pages(u64 partition_id, u64 gpa_target, u64 page_count, + u32 flags); +int hv_call_delete_vp(u64 partition_id, u32 vp_index); +int hv_call_assert_virtual_interrupt(u64 partition_id, u32 vector, + u64 dest_addr, + union hv_interrupt_control control); +int hv_call_clear_virtual_interrupt(u64 partition_id); +int hv_call_get_gpa_access_states(u64 partition_id, u32 count, u64 gpa_base_pfn, + union hv_gpa_page_access_state_flags state_flags, + int *written_total, + union hv_gpa_page_access_state *states); +int hv_call_get_vp_state(u32 vp_index, u64 partition_id, + struct hv_vp_state_data state_data, + /* Choose between pages and ret_output */ + u64 page_count, struct page **pages, + union hv_output_get_vp_state *ret_output); +int hv_call_set_vp_state(u32 vp_index, u64 partition_id, + /* Choose between pages and bytes */ + struct hv_vp_state_data state_data, u64 page_count, + struct page **pages, u32 num_bytes, u8 *bytes); +int hv_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type, + union hv_input_vtl input_vtl, + struct page **state_page); +int hv_unmap_vp_state_page(u64 partition_id, u32 vp_index, u32 type, + struct page *state_page, + union hv_input_vtl input_vtl); +int hv_call_create_port(u64 port_partition_id, union hv_port_id port_id, + u64 connection_partition_id, struct hv_port_info *port_info, + u8 port_vtl, u8 min_connection_vtl, int node); +int hv_call_delete_port(u64 port_partition_id, union hv_port_id port_id); +int hv_call_connect_port(u64 port_partition_id, union hv_port_id port_id, + u64 connection_partition_id, + union hv_connection_id connection_id, + struct hv_connection_info *connection_info, + u8 connection_vtl, int node); +int hv_call_disconnect_port(u64 connection_partition_id, + union hv_connection_id connection_id); +int hv_call_notify_port_ring_empty(u32 sint_index); +int hv_map_stats_page(enum hv_stats_object_type type, + const union hv_stats_object_identity *identity, + struct hv_stats_page **addr); +int hv_unmap_stats_page(enum hv_stats_object_type type, + struct hv_stats_page *page_addr, + const union hv_stats_object_identity *identity); +int hv_call_modify_spa_host_access(u64 partition_id, struct page **pages, + u64 page_struct_count, u32 host_access, + u32 flags, u8 acquire); +int hv_call_get_partition_property_ex(u64 partition_id, u64 property_code, u64 arg, + void *property_value, size_t property_value_sz); + +#if IS_ENABLED(CONFIG_DEBUG_FS) +int __init mshv_debugfs_init(void); +void mshv_debugfs_exit(void); + +int mshv_debugfs_partition_create(struct mshv_partition *partition); +void mshv_debugfs_partition_remove(struct mshv_partition *partition); +int mshv_debugfs_vp_create(struct mshv_vp *vp); +void mshv_debugfs_vp_remove(struct mshv_vp *vp); +#else +static inline int __init mshv_debugfs_init(void) +{ + return 0; +} +static inline void mshv_debugfs_exit(void) { } + +static inline int mshv_debugfs_partition_create(struct mshv_partition *partition) +{ + return 0; +} +static inline void mshv_debugfs_partition_remove(struct mshv_partition *partition) { } +static inline int mshv_debugfs_vp_create(struct mshv_vp *vp) +{ + return 0; +} +static inline void mshv_debugfs_vp_remove(struct mshv_vp *vp) { } +#endif + +extern struct mshv_root mshv_root; +extern enum hv_scheduler_type hv_scheduler_type; +extern u8 * __percpu *hv_synic_eventring_tail; + +struct mshv_mem_region *mshv_region_create(u64 guest_pfn, u64 nr_pages, + u64 uaddr, u32 flags); +int mshv_region_share(struct mshv_mem_region *region); +int mshv_region_unshare(struct mshv_mem_region *region); +int mshv_region_map(struct mshv_mem_region *region); +void mshv_region_invalidate(struct mshv_mem_region *region); +int mshv_region_pin(struct mshv_mem_region *region); +void mshv_region_put(struct mshv_mem_region *region); +int mshv_region_get(struct mshv_mem_region *region); +bool mshv_region_handle_gfn_fault(struct mshv_mem_region *region, u64 gfn); +void mshv_region_movable_fini(struct mshv_mem_region *region); +bool mshv_region_movable_init(struct mshv_mem_region *region); + +#endif /* _MSHV_ROOT_H_ */ diff --git a/drivers/hv/mshv_root_hv_call.c b/drivers/hv/mshv_root_hv_call.c new file mode 100644 index 000000000000..cb55d4d4be2e --- /dev/null +++ b/drivers/hv/mshv_root_hv_call.c @@ -0,0 +1,1074 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2023, Microsoft Corporation. + * + * Hypercall helper functions used by the mshv_root module. + * + * Authors: Microsoft Linux virtualization team + */ + +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/export.h> +#include <asm/mshyperv.h> + +#include "mshv_root.h" + +/* Determined empirically */ +#define HV_INIT_PARTITION_DEPOSIT_PAGES 208 +#define HV_MAP_GPA_DEPOSIT_PAGES 256 +#define HV_UMAP_GPA_PAGES 512 + +#define HV_PAGE_COUNT_2M_ALIGNED(pg_count) (!((pg_count) & (0x200 - 1))) + +#define HV_WITHDRAW_BATCH_SIZE (HV_HYP_PAGE_SIZE / sizeof(u64)) +#define HV_MAP_GPA_BATCH_SIZE \ + ((HV_HYP_PAGE_SIZE - sizeof(struct hv_input_map_gpa_pages)) \ + / sizeof(u64)) +#define HV_GET_VP_STATE_BATCH_SIZE \ + ((HV_HYP_PAGE_SIZE - sizeof(struct hv_input_get_vp_state)) \ + / sizeof(u64)) +#define HV_SET_VP_STATE_BATCH_SIZE \ + ((HV_HYP_PAGE_SIZE - sizeof(struct hv_input_set_vp_state)) \ + / sizeof(u64)) +#define HV_GET_GPA_ACCESS_STATES_BATCH_SIZE \ + ((HV_HYP_PAGE_SIZE - sizeof(union hv_gpa_page_access_state)) \ + / sizeof(union hv_gpa_page_access_state)) +#define HV_MODIFY_SPARSE_SPA_PAGE_HOST_ACCESS_MAX_PAGE_COUNT \ + ((HV_HYP_PAGE_SIZE - \ + sizeof(struct hv_input_modify_sparse_spa_page_host_access)) / \ + sizeof(u64)) + +int hv_call_withdraw_memory(u64 count, int node, u64 partition_id) +{ + struct hv_input_withdraw_memory *input_page; + struct hv_output_withdraw_memory *output_page; + struct page *page; + u16 completed; + u64 status, withdrawn = 0; + int i; + unsigned long flags; + + page = alloc_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + output_page = page_address(page); + + while (withdrawn < count) { + local_irq_save(flags); + + input_page = *this_cpu_ptr(hyperv_pcpu_input_arg); + + memset(input_page, 0, sizeof(*input_page)); + input_page->partition_id = partition_id; + status = hv_do_rep_hypercall(HVCALL_WITHDRAW_MEMORY, + min(count - withdrawn, HV_WITHDRAW_BATCH_SIZE), + 0, input_page, output_page); + + local_irq_restore(flags); + + completed = hv_repcomp(status); + + for (i = 0; i < completed; i++) + __free_page(pfn_to_page(output_page->gpa_page_list[i])); + + if (!hv_result_success(status)) { + if (hv_result(status) == HV_STATUS_NO_RESOURCES) + status = HV_STATUS_SUCCESS; + break; + } + + withdrawn += completed; + } + free_page((unsigned long)output_page); + + trace_mshv_hvcall_withdraw_memory(partition_id, withdrawn, status); + + return hv_result_to_errno(status); +} + +int hv_call_create_partition(u64 flags, + struct hv_partition_creation_properties creation_properties, + union hv_partition_isolation_properties isolation_properties, + u64 *partition_id) +{ + struct hv_input_create_partition *input; + struct hv_output_create_partition *output; + u64 status; + int ret; + unsigned long irq_flags; + + do { + local_irq_save(irq_flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + output = *this_cpu_ptr(hyperv_pcpu_output_arg); + + memset(input, 0, sizeof(*input)); + input->flags = flags; + input->compatibility_version = HV_COMPATIBILITY_21_H2; + + memcpy(&input->partition_creation_properties, &creation_properties, + sizeof(creation_properties)); + + memcpy(&input->isolation_properties, &isolation_properties, + sizeof(isolation_properties)); + + status = hv_do_hypercall(HVCALL_CREATE_PARTITION, + input, output); + + if (!hv_result_needs_memory(status)) { + if (hv_result_success(status)) + *partition_id = output->partition_id; + local_irq_restore(irq_flags); + ret = hv_result_to_errno(status); + break; + } + local_irq_restore(irq_flags); + ret = hv_deposit_memory(hv_current_partition_id, status); + } while (!ret); + + trace_mshv_hvcall_create_partition(flags, ret ? ret : *partition_id); + + return ret; +} + +int hv_call_initialize_partition(u64 partition_id) +{ + struct hv_input_initialize_partition input; + u64 status; + int ret; + + input.partition_id = partition_id; + + ret = hv_call_deposit_pages(NUMA_NO_NODE, partition_id, + HV_INIT_PARTITION_DEPOSIT_PAGES); + if (ret) + return ret; + + do { + status = hv_do_fast_hypercall8(HVCALL_INITIALIZE_PARTITION, + *(u64 *)&input); + + if (!hv_result_needs_memory(status)) { + ret = hv_result_to_errno(status); + break; + } + ret = hv_deposit_memory(partition_id, status); + } while (!ret); + + trace_mshv_hvcall_initialize_partition(partition_id, status); + + return ret; +} + +int hv_call_finalize_partition(u64 partition_id) +{ + struct hv_input_finalize_partition input; + u64 status; + + input.partition_id = partition_id; + status = hv_do_fast_hypercall8(HVCALL_FINALIZE_PARTITION, + *(u64 *)&input); + + trace_mshv_hvcall_finalize_partition(partition_id, status); + + return hv_result_to_errno(status); +} + +int hv_call_delete_partition(u64 partition_id) +{ + struct hv_input_delete_partition input; + u64 status; + + input.partition_id = partition_id; + status = hv_do_fast_hypercall8(HVCALL_DELETE_PARTITION, *(u64 *)&input); + + trace_mshv_hvcall_delete_partition(partition_id, status); + + return hv_result_to_errno(status); +} + +/* Ask the hypervisor to map guest ram pages or the guest mmio space */ +static int hv_do_map_gpa_hcall(u64 partition_id, u64 gfn, u64 page_struct_count, + u32 flags, struct page **pages, u64 mmio_spa) +{ + struct hv_input_map_gpa_pages *input_page; + u64 status, *pfnlist; + unsigned long irq_flags, large_shift = 0; + int ret = 0, done = 0; + u64 page_count = page_struct_count; + + if (page_count == 0 || (pages && mmio_spa)) + return -EINVAL; + + if (flags & HV_MAP_GPA_LARGE_PAGE) { + if (mmio_spa) + return -EINVAL; + + if (!HV_PAGE_COUNT_2M_ALIGNED(page_count)) + return -EINVAL; + + large_shift = HV_HYP_LARGE_PAGE_SHIFT - HV_HYP_PAGE_SHIFT; + page_count >>= large_shift; + } + + while (done < page_count) { + ulong i, completed, remain = page_count - done; + int rep_count = min(remain, HV_MAP_GPA_BATCH_SIZE); + + local_irq_save(irq_flags); + input_page = *this_cpu_ptr(hyperv_pcpu_input_arg); + + input_page->target_partition_id = partition_id; + input_page->target_gpa_base = gfn + (done << large_shift); + input_page->map_flags = flags; + pfnlist = input_page->source_gpa_page_list; + + for (i = 0; i < rep_count; i++) + if (flags & HV_MAP_GPA_NO_ACCESS) { + pfnlist[i] = 0; + } else if (pages) { + u64 index = (done + i) << large_shift; + + if (index >= page_struct_count) { + ret = -EINVAL; + break; + } + pfnlist[i] = page_to_pfn(pages[index]); + } else { + pfnlist[i] = mmio_spa + done + i; + } + if (ret) + break; + + status = hv_do_rep_hypercall(HVCALL_MAP_GPA_PAGES, rep_count, 0, + input_page, NULL); + local_irq_restore(irq_flags); + + completed = hv_repcomp(status); + + if (hv_result_needs_memory(status)) { + ret = hv_call_deposit_pages(NUMA_NO_NODE, partition_id, + HV_MAP_GPA_DEPOSIT_PAGES); + if (ret) + break; + + } else if (!hv_result_success(status)) { + ret = hv_result_to_errno(status); + break; + } + + done += completed; + } + + if (ret && done) { + u32 unmap_flags = 0; + + if (flags & HV_MAP_GPA_LARGE_PAGE) + unmap_flags |= HV_UNMAP_GPA_LARGE_PAGE; + hv_call_unmap_gpa_pages(partition_id, gfn, done, unmap_flags); + } + + return ret; +} + +/* Ask the hypervisor to map guest ram pages */ +int hv_call_map_gpa_pages(u64 partition_id, u64 gpa_target, u64 page_count, + u32 flags, struct page **pages) +{ + return hv_do_map_gpa_hcall(partition_id, gpa_target, page_count, + flags, pages, 0); +} + +/* Ask the hypervisor to map guest mmio space */ +int hv_call_map_mmio_pages(u64 partition_id, u64 gfn, u64 mmio_spa, u64 numpgs) +{ + int i; + u32 flags = HV_MAP_GPA_READABLE | HV_MAP_GPA_WRITABLE | + HV_MAP_GPA_NOT_CACHED; + + for (i = 0; i < numpgs; i++) + if (page_is_ram(mmio_spa + i)) + return -EINVAL; + + return hv_do_map_gpa_hcall(partition_id, gfn, numpgs, flags, NULL, + mmio_spa); +} + +int hv_call_unmap_gpa_pages(u64 partition_id, u64 gfn, u64 page_count_4k, + u32 flags) +{ + struct hv_input_unmap_gpa_pages *input_page; + u64 status, page_count = page_count_4k; + unsigned long irq_flags, large_shift = 0; + int ret = 0, done = 0; + + if (page_count == 0) + return -EINVAL; + + if (flags & HV_UNMAP_GPA_LARGE_PAGE) { + if (!HV_PAGE_COUNT_2M_ALIGNED(page_count)) + return -EINVAL; + + large_shift = HV_HYP_LARGE_PAGE_SHIFT - HV_HYP_PAGE_SHIFT; + page_count >>= large_shift; + } + + while (done < page_count) { + ulong completed, remain = page_count - done; + int rep_count = min(remain, HV_UMAP_GPA_PAGES); + + local_irq_save(irq_flags); + input_page = *this_cpu_ptr(hyperv_pcpu_input_arg); + + input_page->target_partition_id = partition_id; + input_page->target_gpa_base = gfn + (done << large_shift); + input_page->unmap_flags = flags; + status = hv_do_rep_hypercall(HVCALL_UNMAP_GPA_PAGES, rep_count, + 0, input_page, NULL); + local_irq_restore(irq_flags); + + completed = hv_repcomp(status); + if (!hv_result_success(status)) { + ret = hv_result_to_errno(status); + break; + } + + done += completed; + } + + return ret; +} + +int hv_call_get_gpa_access_states(u64 partition_id, u32 count, u64 gpa_base_pfn, + union hv_gpa_page_access_state_flags state_flags, + int *written_total, + union hv_gpa_page_access_state *states) +{ + struct hv_input_get_gpa_pages_access_state *input_page; + union hv_gpa_page_access_state *output_page; + int completed = 0; + unsigned long remaining = count; + int rep_count, i; + u64 status = 0; + unsigned long flags; + + *written_total = 0; + while (remaining) { + local_irq_save(flags); + input_page = *this_cpu_ptr(hyperv_pcpu_input_arg); + output_page = *this_cpu_ptr(hyperv_pcpu_output_arg); + + input_page->partition_id = partition_id; + input_page->hv_gpa_page_number = gpa_base_pfn + *written_total; + input_page->flags = state_flags; + rep_count = min(remaining, HV_GET_GPA_ACCESS_STATES_BATCH_SIZE); + + status = hv_do_rep_hypercall(HVCALL_GET_GPA_PAGES_ACCESS_STATES, rep_count, + 0, input_page, output_page); + if (!hv_result_success(status)) { + local_irq_restore(flags); + break; + } + completed = hv_repcomp(status); + for (i = 0; i < completed; ++i) + states[i].as_uint8 = output_page[i].as_uint8; + + local_irq_restore(flags); + states += completed; + *written_total += completed; + remaining -= completed; + } + + return hv_result_to_errno(status); +} + +int hv_call_assert_virtual_interrupt(u64 partition_id, u32 vector, + u64 dest_addr, + union hv_interrupt_control control) +{ + struct hv_input_assert_virtual_interrupt *input; + unsigned long flags; + u64 status; + + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + memset(input, 0, sizeof(*input)); + input->partition_id = partition_id; + input->vector = vector; + /* + * NOTE: dest_addr only needs to be provided while asserting an + * interrupt on x86 platform + */ +#if IS_ENABLED(CONFIG_X86) + input->dest_addr = dest_addr; +#endif + input->control = control; + status = hv_do_hypercall(HVCALL_ASSERT_VIRTUAL_INTERRUPT, input, NULL); + local_irq_restore(flags); + + return hv_result_to_errno(status); +} + +int hv_call_delete_vp(u64 partition_id, u32 vp_index) +{ + union hv_input_delete_vp input = {}; + u64 status; + + input.partition_id = partition_id; + input.vp_index = vp_index; + + status = hv_do_fast_hypercall16(HVCALL_DELETE_VP, + input.as_uint64[0], input.as_uint64[1]); + + return hv_result_to_errno(status); +} +EXPORT_SYMBOL_GPL(hv_call_delete_vp); + +int hv_call_get_vp_state(u32 vp_index, u64 partition_id, + struct hv_vp_state_data state_data, + /* Choose between pages and ret_output */ + u64 page_count, struct page **pages, + union hv_output_get_vp_state *ret_output) +{ + struct hv_input_get_vp_state *input; + union hv_output_get_vp_state *output; + u64 status; + int i; + u64 control; + unsigned long flags; + int ret = 0; + + if (page_count > HV_GET_VP_STATE_BATCH_SIZE) + return -EINVAL; + + if (!page_count && !ret_output) + return -EINVAL; + + do { + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + output = *this_cpu_ptr(hyperv_pcpu_output_arg); + memset(input, 0, sizeof(*input)); + memset(output, 0, sizeof(*output)); + + input->partition_id = partition_id; + input->vp_index = vp_index; + input->state_data = state_data; + for (i = 0; i < page_count; i++) + input->output_data_pfns[i] = page_to_pfn(pages[i]); + + control = (HVCALL_GET_VP_STATE) | + (page_count << HV_HYPERCALL_VARHEAD_OFFSET); + + status = hv_do_hypercall(control, input, output); + + if (!hv_result_needs_memory(status)) { + if (hv_result_success(status) && ret_output) + memcpy(ret_output, output, sizeof(*output)); + + local_irq_restore(flags); + ret = hv_result_to_errno(status); + break; + } + local_irq_restore(flags); + + ret = hv_deposit_memory(partition_id, status); + } while (!ret); + + return ret; +} + +int hv_call_set_vp_state(u32 vp_index, u64 partition_id, + /* Choose between pages and bytes */ + struct hv_vp_state_data state_data, u64 page_count, + struct page **pages, u32 num_bytes, u8 *bytes) +{ + struct hv_input_set_vp_state *input; + u64 status; + int i; + u64 control; + unsigned long flags; + int ret = 0; + u16 varhead_sz; + + if (page_count > HV_SET_VP_STATE_BATCH_SIZE) + return -EINVAL; + if (sizeof(*input) + num_bytes > HV_HYP_PAGE_SIZE) + return -EINVAL; + + if (num_bytes) + /* round up to 8 and divide by 8 */ + varhead_sz = (num_bytes + 7) >> 3; + else if (page_count) + varhead_sz = page_count; + else + return -EINVAL; + + do { + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + memset(input, 0, sizeof(*input)); + + input->partition_id = partition_id; + input->vp_index = vp_index; + input->state_data = state_data; + if (num_bytes) { + memcpy((u8 *)input->data, bytes, num_bytes); + } else { + for (i = 0; i < page_count; i++) + input->data[i].pfns = page_to_pfn(pages[i]); + } + + control = (HVCALL_SET_VP_STATE) | + (varhead_sz << HV_HYPERCALL_VARHEAD_OFFSET); + + status = hv_do_hypercall(control, input, NULL); + + if (!hv_result_needs_memory(status)) { + local_irq_restore(flags); + ret = hv_result_to_errno(status); + break; + } + local_irq_restore(flags); + + ret = hv_deposit_memory(partition_id, status); + } while (!ret); + + return ret; +} + +static int hv_call_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type, + union hv_input_vtl input_vtl, + struct page **state_page) +{ + struct hv_input_map_vp_state_page *input; + struct hv_output_map_vp_state_page *output; + u64 status; + int ret; + unsigned long flags; + + do { + local_irq_save(flags); + + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + output = *this_cpu_ptr(hyperv_pcpu_output_arg); + + memset(input, 0, sizeof(*input)); + input->partition_id = partition_id; + input->vp_index = vp_index; + input->type = type; + input->input_vtl = input_vtl; + + if (*state_page) { + input->flags.map_location_provided = 1; + input->requested_map_location = + page_to_pfn(*state_page); + } + + status = hv_do_hypercall(HVCALL_MAP_VP_STATE_PAGE, input, + output); + + if (!hv_result_needs_memory(status)) { + if (hv_result_success(status)) + *state_page = pfn_to_page(output->map_location); + local_irq_restore(flags); + ret = hv_result_to_errno(status); + break; + } + + local_irq_restore(flags); + + ret = hv_deposit_memory(partition_id, status); + } while (!ret); + + trace_mshv_hvcall_map_vp_state_page(partition_id, vp_index, + type, status); + + return ret; +} + +static bool mshv_use_overlay_gpfn(void) +{ + return hv_l1vh_partition() && + mshv_root.vmm_caps.vmm_can_provide_overlay_gpfn; +} + +int hv_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type, + union hv_input_vtl input_vtl, + struct page **state_page) +{ + int ret = 0; + struct page *allocated_page = NULL; + + if (mshv_use_overlay_gpfn()) { + allocated_page = alloc_page(GFP_KERNEL); + if (!allocated_page) + return -ENOMEM; + *state_page = allocated_page; + } else { + *state_page = NULL; + } + + ret = hv_call_map_vp_state_page(partition_id, vp_index, type, input_vtl, + state_page); + + if (ret && allocated_page) { + __free_page(allocated_page); + *state_page = NULL; + } + + return ret; +} + +static int hv_call_unmap_vp_state_page(u64 partition_id, u32 vp_index, u32 type, + union hv_input_vtl input_vtl) +{ + unsigned long flags; + u64 status; + struct hv_input_unmap_vp_state_page *input; + + local_irq_save(flags); + + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + + memset(input, 0, sizeof(*input)); + + input->partition_id = partition_id; + input->vp_index = vp_index; + input->type = type; + input->input_vtl = input_vtl; + + status = hv_do_hypercall(HVCALL_UNMAP_VP_STATE_PAGE, input, NULL); + + local_irq_restore(flags); + + return hv_result_to_errno(status); +} + +int hv_unmap_vp_state_page(u64 partition_id, u32 vp_index, u32 type, + struct page *state_page, union hv_input_vtl input_vtl) +{ + int ret = hv_call_unmap_vp_state_page(partition_id, vp_index, type, input_vtl); + + if (mshv_use_overlay_gpfn() && state_page) + __free_page(state_page); + + return ret; +} + +int hv_call_get_partition_property_ex(u64 partition_id, u64 property_code, + u64 arg, void *property_value, + size_t property_value_sz) +{ + u64 status; + unsigned long flags; + struct hv_input_get_partition_property_ex *input; + struct hv_output_get_partition_property_ex *output; + + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + output = *this_cpu_ptr(hyperv_pcpu_output_arg); + + memset(input, 0, sizeof(*input)); + input->partition_id = partition_id; + input->property_code = property_code; + input->arg = arg; + status = hv_do_hypercall(HVCALL_GET_PARTITION_PROPERTY_EX, input, output); + + if (!hv_result_success(status)) { + local_irq_restore(flags); + hv_status_debug(status, "\n"); + return hv_result_to_errno(status); + } + memcpy(property_value, &output->property_value, property_value_sz); + + local_irq_restore(flags); + + return 0; +} + +int +hv_call_clear_virtual_interrupt(u64 partition_id) +{ + int status; + + status = hv_do_fast_hypercall8(HVCALL_CLEAR_VIRTUAL_INTERRUPT, + partition_id); + + return hv_result_to_errno(status); +} + +int +hv_call_create_port(u64 port_partition_id, union hv_port_id port_id, + u64 connection_partition_id, + struct hv_port_info *port_info, + u8 port_vtl, u8 min_connection_vtl, int node) +{ + struct hv_input_create_port *input; + unsigned long flags; + int ret = 0; + int status; + + do { + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + memset(input, 0, sizeof(*input)); + + input->port_partition_id = port_partition_id; + input->port_id = port_id; + input->connection_partition_id = connection_partition_id; + input->port_info = *port_info; + input->port_vtl = port_vtl; + input->min_connection_vtl = min_connection_vtl; + input->proximity_domain_info = hv_numa_node_to_pxm_info(node); + status = hv_do_hypercall(HVCALL_CREATE_PORT, input, NULL); + local_irq_restore(flags); + if (hv_result_success(status)) + break; + + if (!hv_result_needs_memory(status)) { + ret = hv_result_to_errno(status); + break; + } + ret = hv_deposit_memory(port_partition_id, status); + } while (!ret); + + return ret; +} + +int +hv_call_delete_port(u64 port_partition_id, union hv_port_id port_id) +{ + union hv_input_delete_port input = { 0 }; + int status; + + input.port_partition_id = port_partition_id; + input.port_id = port_id; + status = hv_do_fast_hypercall16(HVCALL_DELETE_PORT, + input.as_uint64[0], + input.as_uint64[1]); + + return hv_result_to_errno(status); +} + +int +hv_call_connect_port(u64 port_partition_id, union hv_port_id port_id, + u64 connection_partition_id, + union hv_connection_id connection_id, + struct hv_connection_info *connection_info, + u8 connection_vtl, int node) +{ + struct hv_input_connect_port *input; + unsigned long flags; + int ret = 0, status; + + do { + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + memset(input, 0, sizeof(*input)); + input->port_partition_id = port_partition_id; + input->port_id = port_id; + input->connection_partition_id = connection_partition_id; + input->connection_id = connection_id; + input->connection_info = *connection_info; + input->connection_vtl = connection_vtl; + input->proximity_domain_info = hv_numa_node_to_pxm_info(node); + status = hv_do_hypercall(HVCALL_CONNECT_PORT, input, NULL); + + local_irq_restore(flags); + if (hv_result_success(status)) + break; + + if (!hv_result_needs_memory(status)) { + ret = hv_result_to_errno(status); + break; + } + ret = hv_deposit_memory(connection_partition_id, status); + } while (!ret); + + return ret; +} + +int +hv_call_disconnect_port(u64 connection_partition_id, + union hv_connection_id connection_id) +{ + union hv_input_disconnect_port input = { 0 }; + int status; + + input.connection_partition_id = connection_partition_id; + input.connection_id = connection_id; + input.is_doorbell = 1; + status = hv_do_fast_hypercall16(HVCALL_DISCONNECT_PORT, + input.as_uint64[0], + input.as_uint64[1]); + + return hv_result_to_errno(status); +} + +int +hv_call_notify_port_ring_empty(u32 sint_index) +{ + union hv_input_notify_port_ring_empty input = { 0 }; + int status; + + input.sint_index = sint_index; + status = hv_do_fast_hypercall8(HVCALL_NOTIFY_PORT_RING_EMPTY, + input.as_uint64); + + return hv_result_to_errno(status); +} + +/* + * Equivalent of hv_call_map_stats_page() for cases when the caller provides + * the map location. + * + * NOTE: This is a newer hypercall that always supports SELF and PARENT stats + * areas, unlike hv_call_map_stats_page(). + */ +static int hv_call_map_stats_page2(enum hv_stats_object_type type, + const union hv_stats_object_identity *identity, + u64 map_location) +{ + unsigned long flags; + struct hv_input_map_stats_page2 *input; + u64 status; + int ret; + + if (!map_location || !mshv_use_overlay_gpfn()) + return -EINVAL; + + do { + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + + memset(input, 0, sizeof(*input)); + input->type = type; + input->identity = *identity; + input->map_location = map_location; + + status = hv_do_hypercall(HVCALL_MAP_STATS_PAGE2, input, NULL); + + local_irq_restore(flags); + + ret = hv_result_to_errno(status); + + if (!ret) + break; + + if (!hv_result_needs_memory(status)) { + hv_status_debug(status, "\n"); + break; + } + + ret = hv_deposit_memory(hv_current_partition_id, status); + } while (!ret); + + return ret; +} + +static int +hv_stats_get_area_type(enum hv_stats_object_type type, + const union hv_stats_object_identity *identity) +{ + switch (type) { + case HV_STATS_OBJECT_HYPERVISOR: + return identity->hv.stats_area_type; + case HV_STATS_OBJECT_LOGICAL_PROCESSOR: + return identity->lp.stats_area_type; + case HV_STATS_OBJECT_PARTITION: + return identity->partition.stats_area_type; + case HV_STATS_OBJECT_VP: + return identity->vp.stats_area_type; + } + + return -EINVAL; +} + +/* + * Map a stats page, where the page location is provided by the hypervisor. + * + * NOTE: The concept of separate SELF and PARENT stats areas does not exist on + * older hypervisor versions. All the available stats information can be found + * on the SELF page. When attempting to map the PARENT area on a hypervisor + * that doesn't support it, return "success" but with a NULL address. The + * caller should check for this case and instead fallback to the SELF area + * alone. + */ +static int +hv_call_map_stats_page(enum hv_stats_object_type type, + const union hv_stats_object_identity *identity, + struct hv_stats_page **addr) +{ + unsigned long flags; + struct hv_input_map_stats_page *input; + struct hv_output_map_stats_page *output; + u64 status, pfn; + int ret = 0; + + do { + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + output = *this_cpu_ptr(hyperv_pcpu_output_arg); + + memset(input, 0, sizeof(*input)); + input->type = type; + input->identity = *identity; + + status = hv_do_hypercall(HVCALL_MAP_STATS_PAGE, input, output); + pfn = output->map_location; + + local_irq_restore(flags); + + if (!hv_result_needs_memory(status)) { + if (hv_result_success(status)) + break; + + if (hv_stats_get_area_type(type, identity) == HV_STATS_AREA_PARENT && + hv_result(status) == HV_STATUS_INVALID_PARAMETER) { + *addr = NULL; + return 0; + } + + hv_status_debug(status, "\n"); + return hv_result_to_errno(status); + } + + ret = hv_deposit_memory(hv_current_partition_id, status); + if (ret) + return ret; + } while (!ret); + + *addr = page_address(pfn_to_page(pfn)); + + return ret; +} + +int hv_map_stats_page(enum hv_stats_object_type type, + const union hv_stats_object_identity *identity, + struct hv_stats_page **addr) +{ + int ret; + struct page *allocated_page = NULL; + + if (!addr) + return -EINVAL; + + if (mshv_use_overlay_gpfn()) { + allocated_page = alloc_page(GFP_KERNEL); + if (!allocated_page) + return -ENOMEM; + + ret = hv_call_map_stats_page2(type, identity, + page_to_pfn(allocated_page)); + *addr = page_address(allocated_page); + } else { + ret = hv_call_map_stats_page(type, identity, addr); + } + + if (ret && allocated_page) { + __free_page(allocated_page); + *addr = NULL; + } + + return ret; +} + +static int hv_call_unmap_stats_page(enum hv_stats_object_type type, + const union hv_stats_object_identity *identity) +{ + unsigned long flags; + struct hv_input_unmap_stats_page *input; + u64 status; + + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + + memset(input, 0, sizeof(*input)); + input->type = type; + input->identity = *identity; + + status = hv_do_hypercall(HVCALL_UNMAP_STATS_PAGE, input, NULL); + local_irq_restore(flags); + + return hv_result_to_errno(status); +} + +int hv_unmap_stats_page(enum hv_stats_object_type type, + struct hv_stats_page *page_addr, + const union hv_stats_object_identity *identity) +{ + int ret; + + ret = hv_call_unmap_stats_page(type, identity); + + if (mshv_use_overlay_gpfn() && page_addr) + __free_page(virt_to_page(page_addr)); + + return ret; +} + +int hv_call_modify_spa_host_access(u64 partition_id, struct page **pages, + u64 page_struct_count, u32 host_access, + u32 flags, u8 acquire) +{ + struct hv_input_modify_sparse_spa_page_host_access *input_page; + u64 status; + int done = 0; + unsigned long irq_flags, large_shift = 0; + u64 page_count = page_struct_count; + u16 code = acquire ? HVCALL_ACQUIRE_SPARSE_SPA_PAGE_HOST_ACCESS : + HVCALL_RELEASE_SPARSE_SPA_PAGE_HOST_ACCESS; + + if (page_count == 0) + return -EINVAL; + + if (flags & HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE) { + if (!HV_PAGE_COUNT_2M_ALIGNED(page_count)) + return -EINVAL; + large_shift = HV_HYP_LARGE_PAGE_SHIFT - HV_HYP_PAGE_SHIFT; + page_count >>= large_shift; + } + + while (done < page_count) { + ulong i, completed, remain = page_count - done; + int rep_count = min(remain, + HV_MODIFY_SPARSE_SPA_PAGE_HOST_ACCESS_MAX_PAGE_COUNT); + + local_irq_save(irq_flags); + input_page = *this_cpu_ptr(hyperv_pcpu_input_arg); + + memset(input_page, 0, sizeof(*input_page)); + /* Only set the partition id if you are making the pages + * exclusive + */ + if (flags & HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_EXCLUSIVE) + input_page->partition_id = partition_id; + input_page->flags = flags; + input_page->host_access = host_access; + + for (i = 0; i < rep_count; i++) { + u64 index = (done + i) << large_shift; + + if (index >= page_struct_count) + return -EINVAL; + + input_page->spa_page_list[i] = + page_to_pfn(pages[index]); + } + + status = hv_do_rep_hypercall(code, rep_count, 0, input_page, + NULL); + local_irq_restore(irq_flags); + + completed = hv_repcomp(status); + + if (!hv_result_success(status)) + return hv_result_to_errno(status); + + done += completed; + } + + return 0; +} diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c new file mode 100644 index 000000000000..bd1359eb58dd --- /dev/null +++ b/drivers/hv/mshv_root_main.c @@ -0,0 +1,2417 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2024, Microsoft Corporation. + * + * The main part of the mshv_root module, providing APIs to create + * and manage guest partitions. + * + * Authors: Microsoft Linux virtualization team + */ + +#include <linux/entry-virt.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/miscdevice.h> +#include <linux/slab.h> +#include <linux/file.h> +#include <linux/anon_inodes.h> +#include <linux/mm.h> +#include <linux/io.h> +#include <linux/cpuhotplug.h> +#include <linux/random.h> +#include <asm/mshyperv.h> +#include <linux/hyperv.h> +#include <linux/notifier.h> +#include <linux/reboot.h> +#include <linux/kexec.h> +#include <linux/page-flags.h> +#include <linux/crash_dump.h> +#include <linux/panic_notifier.h> +#include <linux/vmalloc.h> +#include <linux/rseq.h> + +#include "mshv_eventfd.h" +#include "mshv.h" +#include "mshv_root.h" + +MODULE_AUTHOR("Microsoft"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Microsoft Hyper-V root partition VMM interface /dev/mshv"); + +/* HV_THREAD_COUNTER */ +#if defined(CONFIG_X86_64) +#define HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED 202 +#elif defined(CONFIG_ARM64) +#define HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED 95 +#endif + +struct mshv_root mshv_root; + +enum hv_scheduler_type hv_scheduler_type; + +/* Once we implement the fast extended hypercall ABI they can go away. */ +static void * __percpu *root_scheduler_input; +static void * __percpu *root_scheduler_output; + +static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); +static int mshv_dev_open(struct inode *inode, struct file *filp); +static int mshv_dev_release(struct inode *inode, struct file *filp); +static int mshv_vp_release(struct inode *inode, struct file *filp); +static long mshv_vp_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); +static int mshv_partition_release(struct inode *inode, struct file *filp); +static long mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); +static int mshv_vp_mmap(struct file *file, struct vm_area_struct *vma); +static vm_fault_t mshv_vp_fault(struct vm_fault *vmf); +static int mshv_init_async_handler(struct mshv_partition *partition); +static void mshv_async_hvcall_handler(void *data, u64 *status); + +static const union hv_input_vtl input_vtl_zero; +static const union hv_input_vtl input_vtl_normal = { + .target_vtl = HV_NORMAL_VTL, + .use_target_vtl = 1, +}; + +static const struct vm_operations_struct mshv_vp_vm_ops = { + .fault = mshv_vp_fault, +}; + +static const struct file_operations mshv_vp_fops = { + .owner = THIS_MODULE, + .release = mshv_vp_release, + .unlocked_ioctl = mshv_vp_ioctl, + .llseek = noop_llseek, + .mmap = mshv_vp_mmap, +}; + +static const struct file_operations mshv_partition_fops = { + .owner = THIS_MODULE, + .release = mshv_partition_release, + .unlocked_ioctl = mshv_partition_ioctl, + .llseek = noop_llseek, +}; + +static const struct file_operations mshv_dev_fops = { + .owner = THIS_MODULE, + .open = mshv_dev_open, + .release = mshv_dev_release, + .unlocked_ioctl = mshv_dev_ioctl, + .llseek = noop_llseek, +}; + +static struct miscdevice mshv_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "mshv", + .fops = &mshv_dev_fops, + .mode = 0600, +}; + +/* + * Only allow hypercalls that have a u64 partition id as the first member of + * the input structure. + * These are sorted by value. + */ +static u16 mshv_passthru_hvcalls[] = { + HVCALL_GET_PARTITION_PROPERTY, + HVCALL_GET_PARTITION_PROPERTY_EX, + HVCALL_SET_PARTITION_PROPERTY, + HVCALL_INSTALL_INTERCEPT, + HVCALL_GET_VP_REGISTERS, + HVCALL_SET_VP_REGISTERS, + HVCALL_TRANSLATE_VIRTUAL_ADDRESS, + HVCALL_CLEAR_VIRTUAL_INTERRUPT, + HVCALL_REGISTER_INTERCEPT_RESULT, + HVCALL_ASSERT_VIRTUAL_INTERRUPT, + HVCALL_GET_GPA_PAGES_ACCESS_STATES, + HVCALL_SIGNAL_EVENT_DIRECT, + HVCALL_POST_MESSAGE_DIRECT, + HVCALL_GET_VP_CPUID_VALUES, +}; + +/* + * Only allow hypercalls that are safe to be called by the VMM with the host + * partition as target (i.e. HV_PARTITION_ID_SELF). Carefully audit that a + * hypercall cannot be misused by the VMM before adding it to this list. + */ +static u16 mshv_self_passthru_hvcalls[] = { + HVCALL_GET_PARTITION_PROPERTY, + HVCALL_GET_PARTITION_PROPERTY_EX, +}; + +static bool mshv_hvcall_is_async(u16 code) +{ + switch (code) { + case HVCALL_SET_PARTITION_PROPERTY: + return true; + default: + break; + } + return false; +} + +static bool mshv_passthru_hvcall_allowed(u16 code, u64 pt_id) +{ + int i; + int n = ARRAY_SIZE(mshv_passthru_hvcalls); + u16 *allowed_hvcalls = mshv_passthru_hvcalls; + + if (pt_id == HV_PARTITION_ID_SELF) { + n = ARRAY_SIZE(mshv_self_passthru_hvcalls); + allowed_hvcalls = mshv_self_passthru_hvcalls; + } + + for (i = 0; i < n; ++i) + if (allowed_hvcalls[i] == code) + return true; + + return false; +} + +static int mshv_ioctl_passthru_hvcall(struct mshv_partition *partition, + bool partition_locked, + void __user *user_args) +{ + u64 status; + int ret = 0; + bool is_async; + struct mshv_root_hvcall args; + struct page *page; + unsigned int pages_order; + void *input_pg = NULL; + void *output_pg = NULL; + u16 reps_completed; + u64 pt_id = partition ? partition->pt_id : HV_PARTITION_ID_SELF; + + if (copy_from_user(&args, user_args, sizeof(args))) + return -EFAULT; + + if (args.status || !args.in_ptr || args.in_sz < sizeof(u64) || + mshv_field_nonzero(args, rsvd) || args.in_sz > HV_HYP_PAGE_SIZE) + return -EINVAL; + + if (args.out_ptr && (!args.out_sz || args.out_sz > HV_HYP_PAGE_SIZE)) + return -EINVAL; + + if (!mshv_passthru_hvcall_allowed(args.code, pt_id)) + return -EINVAL; + + is_async = mshv_hvcall_is_async(args.code); + if (is_async) { + /* async hypercalls can only be called from partition fd */ + if (!partition || !partition_locked) + return -EINVAL; + ret = mshv_init_async_handler(partition); + if (ret) + return ret; + } + + pages_order = args.out_ptr ? 1 : 0; + page = alloc_pages(GFP_KERNEL, pages_order); + if (!page) + return -ENOMEM; + input_pg = page_address(page); + + if (args.out_ptr) + output_pg = (char *)input_pg + PAGE_SIZE; + else + output_pg = NULL; + + if (copy_from_user(input_pg, (void __user *)args.in_ptr, + args.in_sz)) { + ret = -EFAULT; + goto free_pages_out; + } + + /* + * NOTE: This only works because all the allowed hypercalls' input + * structs begin with a u64 partition_id field. + */ + *(u64 *)input_pg = pt_id; + + reps_completed = 0; + do { + if (args.reps) { + status = hv_do_rep_hypercall_ex(args.code, args.reps, + 0, reps_completed, + input_pg, output_pg); + reps_completed = hv_repcomp(status); + } else { + status = hv_do_hypercall(args.code, input_pg, output_pg); + } + + if (hv_result(status) == HV_STATUS_CALL_PENDING) { + if (is_async) { + mshv_async_hvcall_handler(partition, &status); + } else { /* Paranoia check. This shouldn't happen! */ + ret = -EBADFD; + goto free_pages_out; + } + } + + if (hv_result_success(status)) + break; + + if (!hv_result_needs_memory(status)) + ret = hv_result_to_errno(status); + else + ret = hv_deposit_memory(pt_id, status); + } while (!ret); + + args.status = hv_result(status); + args.reps = reps_completed; + if (copy_to_user(user_args, &args, sizeof(args))) + ret = -EFAULT; + + if (!ret && output_pg && + copy_to_user((void __user *)args.out_ptr, output_pg, args.out_sz)) + ret = -EFAULT; + +free_pages_out: + free_pages((unsigned long)input_pg, pages_order); + + return ret; +} + +static inline bool is_ghcb_mapping_available(void) +{ +#if IS_ENABLED(CONFIG_X86_64) + return ms_hyperv.ext_features & HV_VP_GHCB_ROOT_MAPPING_AVAILABLE; +#else + return 0; +#endif +} + +static int mshv_get_vp_registers(u32 vp_index, u64 partition_id, u16 count, + struct hv_register_assoc *registers) +{ + return hv_call_get_vp_registers(vp_index, partition_id, + count, input_vtl_zero, registers); +} + +static int mshv_set_vp_registers(u32 vp_index, u64 partition_id, u16 count, + struct hv_register_assoc *registers) +{ + return hv_call_set_vp_registers(vp_index, partition_id, + count, input_vtl_zero, registers); +} + +/* + * Explicit guest vCPU suspend is asynchronous by nature (as it is requested by + * dom0 vCPU for guest vCPU) and thus it can race with "intercept" suspend, + * done by the hypervisor. + * "Intercept" suspend leads to asynchronous message delivery to dom0 which + * should be awaited to keep the VP loop consistent (i.e. no message pending + * upon VP resume). + * VP intercept suspend can't be done when the VP is explicitly suspended + * already, and thus can be only two possible race scenarios: + * 1. implicit suspend bit set -> explicit suspend bit set -> message sent + * 2. implicit suspend bit set -> message sent -> explicit suspend bit set + * Checking for implicit suspend bit set after explicit suspend request has + * succeeded in either case allows us to reliably identify, if there is a + * message to receive and deliver to VMM. + */ +static int +mshv_suspend_vp(const struct mshv_vp *vp, bool *message_in_flight) +{ + struct hv_register_assoc explicit_suspend = { + .name = HV_REGISTER_EXPLICIT_SUSPEND + }; + struct hv_register_assoc intercept_suspend = { + .name = HV_REGISTER_INTERCEPT_SUSPEND + }; + union hv_explicit_suspend_register *es = + &explicit_suspend.value.explicit_suspend; + union hv_intercept_suspend_register *is = + &intercept_suspend.value.intercept_suspend; + int ret; + + es->suspended = 1; + + ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id, + 1, &explicit_suspend); + if (ret) { + vp_err(vp, "Failed to explicitly suspend vCPU\n"); + return ret; + } + + ret = mshv_get_vp_registers(vp->vp_index, vp->vp_partition->pt_id, + 1, &intercept_suspend); + if (ret) { + vp_err(vp, "Failed to get intercept suspend state\n"); + return ret; + } + + *message_in_flight = is->suspended; + + return 0; +} + +/* + * This function is used when VPs are scheduled by the hypervisor's + * scheduler. + * + * Caller has to make sure the registers contain cleared + * HV_REGISTER_INTERCEPT_SUSPEND and HV_REGISTER_EXPLICIT_SUSPEND registers + * exactly in this order (the hypervisor clears them sequentially) to avoid + * potential invalid clearing a newly arrived HV_REGISTER_INTERCEPT_SUSPEND + * after VP is released from HV_REGISTER_EXPLICIT_SUSPEND in case of the + * opposite order. + */ +static long mshv_run_vp_with_hyp_scheduler(struct mshv_vp *vp) +{ + long ret; + struct hv_register_assoc suspend_regs[2] = { + { .name = HV_REGISTER_INTERCEPT_SUSPEND }, + { .name = HV_REGISTER_EXPLICIT_SUSPEND } + }; + size_t count = ARRAY_SIZE(suspend_regs); + + /* Resume VP execution */ + ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id, + count, suspend_regs); + if (ret) { + vp_err(vp, "Failed to resume vp execution. %lx\n", ret); + return ret; + } + + ret = wait_event_interruptible(vp->run.vp_suspend_queue, + vp->run.kicked_by_hv == 1); + if (ret) { + bool message_in_flight; + + /* + * Otherwise the waiting was interrupted by a signal: suspend + * the vCPU explicitly and copy message in flight (if any). + */ + ret = mshv_suspend_vp(vp, &message_in_flight); + if (ret) + return ret; + + /* Return if no message in flight */ + if (!message_in_flight) + return -EINTR; + + /* Wait for the message in flight. */ + wait_event(vp->run.vp_suspend_queue, vp->run.kicked_by_hv == 1); + } + + /* + * Reset the flag to make the wait_event call above work + * next time. + */ + vp->run.kicked_by_hv = 0; + + return 0; +} + +static int +mshv_vp_dispatch(struct mshv_vp *vp, u32 flags, + struct hv_output_dispatch_vp *res) +{ + struct hv_input_dispatch_vp *input; + struct hv_output_dispatch_vp *output; + u64 status; + + preempt_disable(); + input = *this_cpu_ptr(root_scheduler_input); + output = *this_cpu_ptr(root_scheduler_output); + + memset(input, 0, sizeof(*input)); + memset(output, 0, sizeof(*output)); + + input->partition_id = vp->vp_partition->pt_id; + input->vp_index = vp->vp_index; + input->time_slice = 0; /* Run forever until something happens */ + input->spec_ctrl = 0; /* TODO: set sensible flags */ + input->flags = flags; + + vp->run.flags.root_sched_dispatched = 1; + status = hv_do_hypercall(HVCALL_DISPATCH_VP, input, output); + vp->run.flags.root_sched_dispatched = 0; + + trace_mshv_hvcall_dispatch_vp(vp->vp_partition->pt_id, + vp->vp_index, flags, + output->dispatch_state, + output->dispatch_event, +#if defined(CONFIG_X86_64) + vp->vp_register_page->interrupt_vectors.as_uint64, +#else + 0, +#endif + status); + + *res = *output; + preempt_enable(); + + if (!hv_result_success(status)) + vp_err(vp, "%s: status %s\n", __func__, + hv_result_to_string(status)); + + return hv_result_to_errno(status); +} + +static int +mshv_vp_clear_explicit_suspend(struct mshv_vp *vp) +{ + struct hv_register_assoc explicit_suspend = { + .name = HV_REGISTER_EXPLICIT_SUSPEND, + .value.explicit_suspend.suspended = 0, + }; + int ret; + + ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id, + 1, &explicit_suspend); + + trace_mshv_vp_clear_explicit_suspend(vp->vp_partition->pt_id, + vp->vp_index, ret); + + if (ret) + vp_err(vp, "Failed to unsuspend\n"); + + return ret; +} + +#if IS_ENABLED(CONFIG_X86_64) +static u64 mshv_vp_interrupt_pending(struct mshv_vp *vp) +{ + if (!vp->vp_register_page) + return 0; + return vp->vp_register_page->interrupt_vectors.as_uint64; +} +#else +static u64 mshv_vp_interrupt_pending(struct mshv_vp *vp) +{ + return 0; +} +#endif + +static bool mshv_vp_dispatch_thread_blocked(struct mshv_vp *vp) +{ + struct hv_stats_page **stats = vp->vp_stats_pages; + u64 *self_vp_cntrs = stats[HV_STATS_AREA_SELF]->data; + u64 *parent_vp_cntrs = stats[HV_STATS_AREA_PARENT]->data; + + return parent_vp_cntrs[HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED] || + self_vp_cntrs[HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED]; +} + +static int +mshv_vp_wait_for_hv_kick(struct mshv_vp *vp) +{ + int ret; + + ret = wait_event_interruptible(vp->run.vp_suspend_queue, + (vp->run.kicked_by_hv == 1 && + !mshv_vp_dispatch_thread_blocked(vp)) || + mshv_vp_interrupt_pending(vp)); + if (ret) + return -EINTR; + + trace_mshv_vp_wait_for_hv_kick(vp->vp_partition->pt_id, + vp->vp_index, + vp->run.kicked_by_hv, + mshv_vp_dispatch_thread_blocked(vp), + mshv_vp_interrupt_pending(vp)); + + vp->run.flags.root_sched_blocked = 0; + vp->run.kicked_by_hv = 0; + + return 0; +} + +/* Must be called with interrupts enabled */ +static long mshv_run_vp_with_root_scheduler(struct mshv_vp *vp) +{ + long ret; + + if (vp->run.flags.root_sched_blocked) { + /* + * Dispatch state of this VP is blocked. Need to wait + * for the hypervisor to clear the blocked state before + * dispatching it. + */ + ret = mshv_vp_wait_for_hv_kick(vp); + if (ret) + return ret; + } + + do { + u32 flags = 0; + struct hv_output_dispatch_vp output; + + if (__xfer_to_guest_mode_work_pending()) { + ret = xfer_to_guest_mode_handle_work(); + + trace_mshv_xfer_to_guest_mode_work(vp->vp_partition->pt_id, + vp->vp_index, + read_thread_flags(), + ret); + + if (ret) + break; + } + + if (vp->run.flags.intercept_suspend) + flags |= HV_DISPATCH_VP_FLAG_CLEAR_INTERCEPT_SUSPEND; + + if (mshv_vp_interrupt_pending(vp)) + flags |= HV_DISPATCH_VP_FLAG_SCAN_INTERRUPT_INJECTION; + + ret = mshv_vp_dispatch(vp, flags, &output); + if (ret) + break; + + vp->run.flags.intercept_suspend = 0; + + if (output.dispatch_state == HV_VP_DISPATCH_STATE_BLOCKED) { + if (output.dispatch_event == + HV_VP_DISPATCH_EVENT_SUSPEND) { + /* + * TODO: remove the warning once VP canceling + * is supported + */ + WARN_ONCE(atomic64_read(&vp->run.vp_signaled_count), + "%s: vp#%d: unexpected explicit suspend\n", + __func__, vp->vp_index); + /* + * Need to clear explicit suspend before + * dispatching. + * Explicit suspend is either: + * - set right after the first VP dispatch or + * - set explicitly via hypercall + * Since the latter case is not yet supported, + * simply clear it here. + */ + ret = mshv_vp_clear_explicit_suspend(vp); + if (ret) + break; + + ret = mshv_vp_wait_for_hv_kick(vp); + if (ret) + break; + } else { + vp->run.flags.root_sched_blocked = 1; + ret = mshv_vp_wait_for_hv_kick(vp); + if (ret) + break; + } + } else { + /* HV_VP_DISPATCH_STATE_READY */ + if (output.dispatch_event == + HV_VP_DISPATCH_EVENT_INTERCEPT) + vp->run.flags.intercept_suspend = 1; + } + } while (!vp->run.flags.intercept_suspend); + + rseq_virt_userspace_exit(); + + return ret; +} + +static_assert(sizeof(struct hv_message) <= MSHV_RUN_VP_BUF_SZ, + "sizeof(struct hv_message) must not exceed MSHV_RUN_VP_BUF_SZ"); + +static struct mshv_mem_region * +mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn) +{ + struct mshv_mem_region *region; + + hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) { + if (gfn >= region->start_gfn && + gfn < region->start_gfn + region->nr_pages) + return region; + } + + return NULL; +} + +static struct mshv_mem_region * +mshv_partition_region_by_gfn_get(struct mshv_partition *p, u64 gfn) +{ + struct mshv_mem_region *region; + + spin_lock(&p->pt_mem_regions_lock); + region = mshv_partition_region_by_gfn(p, gfn); + if (!region || !mshv_region_get(region)) { + spin_unlock(&p->pt_mem_regions_lock); + return NULL; + } + spin_unlock(&p->pt_mem_regions_lock); + + return region; +} + +/** + * mshv_handle_gpa_intercept - Handle GPA (Guest Physical Address) intercepts. + * @vp: Pointer to the virtual processor structure. + * + * This function processes GPA intercepts by identifying the memory region + * corresponding to the intercepted GPA, aligning the page offset, and + * mapping the required pages. It ensures that the region is valid and + * handles faults efficiently by mapping multiple pages at once. + * + * Return: true if the intercept was handled successfully, false otherwise. + */ +static bool mshv_handle_gpa_intercept(struct mshv_vp *vp) +{ + struct mshv_partition *p = vp->vp_partition; + struct mshv_mem_region *region; + bool ret = false; + u64 gfn; +#if defined(CONFIG_X86_64) + struct hv_x64_memory_intercept_message *msg = + (struct hv_x64_memory_intercept_message *) + vp->vp_intercept_msg_page->u.payload; +#elif defined(CONFIG_ARM64) + struct hv_arm64_memory_intercept_message *msg = + (struct hv_arm64_memory_intercept_message *) + vp->vp_intercept_msg_page->u.payload; +#endif + enum hv_intercept_access_type access_type = + msg->header.intercept_access_type; + + gfn = HVPFN_DOWN(msg->guest_physical_address); + + region = mshv_partition_region_by_gfn_get(p, gfn); + if (!region) + goto out; + + if (access_type == HV_INTERCEPT_ACCESS_WRITE && + !(region->hv_map_flags & HV_MAP_GPA_WRITABLE)) + goto put_region; + + if (access_type == HV_INTERCEPT_ACCESS_EXECUTE && + !(region->hv_map_flags & HV_MAP_GPA_EXECUTABLE)) + goto put_region; + + /* Only movable memory ranges are supported for GPA intercepts */ + if (region->mreg_type == MSHV_REGION_TYPE_MEM_MOVABLE) + ret = mshv_region_handle_gfn_fault(region, gfn); + +put_region: + mshv_region_put(region); +out: + trace_mshv_handle_gpa_intercept(p->pt_id, vp->vp_index, gfn, + access_type, ret); + return ret; +} + +static bool mshv_vp_handle_intercept(struct mshv_vp *vp) +{ + switch (vp->vp_intercept_msg_page->header.message_type) { + case HVMSG_GPA_INTERCEPT: + return mshv_handle_gpa_intercept(vp); + } + return false; +} + +static long mshv_vp_ioctl_run_vp(struct mshv_vp *vp, void __user *ret_msg) +{ + long rc; + + trace_mshv_run_vp_entry(vp->vp_partition->pt_id, vp->vp_index); + + do { + if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) + rc = mshv_run_vp_with_root_scheduler(vp); + else + rc = mshv_run_vp_with_hyp_scheduler(vp); + } while (rc == 0 && mshv_vp_handle_intercept(vp)); + + trace_mshv_run_vp_exit(vp->vp_partition->pt_id, vp->vp_index, + vp->vp_intercept_msg_page->header.message_type, + rc); + + if (rc) + return rc; + + if (copy_to_user(ret_msg, vp->vp_intercept_msg_page, + sizeof(struct hv_message))) + rc = -EFAULT; + + return rc; +} + +static int +mshv_vp_ioctl_get_set_state_pfn(struct mshv_vp *vp, + struct hv_vp_state_data state_data, + unsigned long user_pfn, size_t page_count, + bool is_set) +{ + int completed, ret = 0; + unsigned long check; + struct page **pages; + + if (page_count > INT_MAX) + return -EINVAL; + /* + * Check the arithmetic for wraparound/overflow. + * The last page address in the buffer is: + * (user_pfn + (page_count - 1)) * PAGE_SIZE + */ + if (check_add_overflow(user_pfn, (page_count - 1), &check)) + return -EOVERFLOW; + if (check_mul_overflow(check, PAGE_SIZE, &check)) + return -EOVERFLOW; + + /* Pin user pages so hypervisor can copy directly to them */ + pages = kzalloc_objs(struct page *, page_count); + if (!pages) + return -ENOMEM; + + for (completed = 0; completed < page_count; completed += ret) { + unsigned long user_addr = (user_pfn + completed) * PAGE_SIZE; + int remaining = page_count - completed; + + ret = pin_user_pages_fast(user_addr, remaining, FOLL_WRITE, + &pages[completed]); + if (ret < 0) { + vp_err(vp, "%s: Failed to pin user pages error %i\n", + __func__, ret); + goto unpin_pages; + } + } + + if (is_set) + ret = hv_call_set_vp_state(vp->vp_index, + vp->vp_partition->pt_id, + state_data, page_count, pages, + 0, NULL); + else + ret = hv_call_get_vp_state(vp->vp_index, + vp->vp_partition->pt_id, + state_data, page_count, pages, + NULL); + +unpin_pages: + unpin_user_pages(pages, completed); + kfree(pages); + return ret; +} + +static long +mshv_vp_ioctl_get_set_state(struct mshv_vp *vp, + struct mshv_get_set_vp_state __user *user_args, + bool is_set) +{ + struct mshv_get_set_vp_state args; + long ret = 0; + union hv_output_get_vp_state vp_state; + u32 data_sz; + struct hv_vp_state_data state_data = {}; + + if (copy_from_user(&args, user_args, sizeof(args))) + return -EFAULT; + + if (args.type >= MSHV_VP_STATE_COUNT || mshv_field_nonzero(args, rsvd) || + !args.buf_sz || !PAGE_ALIGNED(args.buf_sz) || + !PAGE_ALIGNED(args.buf_ptr)) + return -EINVAL; + + if (!access_ok((void __user *)args.buf_ptr, args.buf_sz)) + return -EFAULT; + + switch (args.type) { + case MSHV_VP_STATE_LAPIC: + state_data.type = HV_GET_SET_VP_STATE_LAPIC_STATE; + data_sz = HV_HYP_PAGE_SIZE; + break; + case MSHV_VP_STATE_XSAVE: + { + u64 data_sz_64; + + ret = hv_call_get_partition_property(vp->vp_partition->pt_id, + HV_PARTITION_PROPERTY_XSAVE_STATES, + &state_data.xsave.states.as_uint64); + if (ret) + return ret; + + ret = hv_call_get_partition_property(vp->vp_partition->pt_id, + HV_PARTITION_PROPERTY_MAX_XSAVE_DATA_SIZE, + &data_sz_64); + if (ret) + return ret; + + data_sz = (u32)data_sz_64; + state_data.xsave.flags = 0; + /* Always request legacy states */ + state_data.xsave.states.legacy_x87 = 1; + state_data.xsave.states.legacy_sse = 1; + state_data.type = HV_GET_SET_VP_STATE_XSAVE; + break; + } + case MSHV_VP_STATE_SIMP: + state_data.type = HV_GET_SET_VP_STATE_SIM_PAGE; + data_sz = HV_HYP_PAGE_SIZE; + break; + case MSHV_VP_STATE_SIEFP: + state_data.type = HV_GET_SET_VP_STATE_SIEF_PAGE; + data_sz = HV_HYP_PAGE_SIZE; + break; + case MSHV_VP_STATE_SYNTHETIC_TIMERS: + state_data.type = HV_GET_SET_VP_STATE_SYNTHETIC_TIMERS; + data_sz = sizeof(vp_state.synthetic_timers_state); + break; + default: + return -EINVAL; + } + + if (copy_to_user(&user_args->buf_sz, &data_sz, sizeof(user_args->buf_sz))) + return -EFAULT; + + if (data_sz > args.buf_sz) + return -EINVAL; + + /* If the data is transmitted via pfns, delegate to helper */ + if (state_data.type & HV_GET_SET_VP_STATE_TYPE_PFN) { + unsigned long user_pfn = PFN_DOWN(args.buf_ptr); + size_t page_count = PFN_DOWN(args.buf_sz); + + return mshv_vp_ioctl_get_set_state_pfn(vp, state_data, user_pfn, + page_count, is_set); + } + + /* Paranoia check - this shouldn't happen! */ + if (data_sz > sizeof(vp_state)) { + vp_err(vp, "Invalid vp state data size!\n"); + return -EINVAL; + } + + if (is_set) { + if (copy_from_user(&vp_state, (__user void *)args.buf_ptr, data_sz)) + return -EFAULT; + + return hv_call_set_vp_state(vp->vp_index, + vp->vp_partition->pt_id, + state_data, 0, NULL, + sizeof(vp_state), (u8 *)&vp_state); + } + + ret = hv_call_get_vp_state(vp->vp_index, vp->vp_partition->pt_id, + state_data, 0, NULL, &vp_state); + if (ret) + return ret; + + if (copy_to_user((void __user *)args.buf_ptr, &vp_state, data_sz)) + return -EFAULT; + + return 0; +} + +static long +mshv_vp_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) +{ + struct mshv_vp *vp = filp->private_data; + long r = -ENOTTY; + + if (mutex_lock_killable(&vp->vp_mutex)) + return -EINTR; + + switch (ioctl) { + case MSHV_RUN_VP: + r = mshv_vp_ioctl_run_vp(vp, (void __user *)arg); + break; + case MSHV_GET_VP_STATE: + r = mshv_vp_ioctl_get_set_state(vp, (void __user *)arg, false); + break; + case MSHV_SET_VP_STATE: + r = mshv_vp_ioctl_get_set_state(vp, (void __user *)arg, true); + break; + case MSHV_ROOT_HVCALL: + r = mshv_ioctl_passthru_hvcall(vp->vp_partition, false, + (void __user *)arg); + break; + default: + vp_warn(vp, "Invalid ioctl: %#x\n", ioctl); + break; + } + mutex_unlock(&vp->vp_mutex); + + return r; +} + +static vm_fault_t mshv_vp_fault(struct vm_fault *vmf) +{ + struct mshv_vp *vp = vmf->vma->vm_file->private_data; + + switch (vmf->vma->vm_pgoff) { + case MSHV_VP_MMAP_OFFSET_REGISTERS: + vmf->page = virt_to_page(vp->vp_register_page); + break; + case MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE: + vmf->page = virt_to_page(vp->vp_intercept_msg_page); + break; + case MSHV_VP_MMAP_OFFSET_GHCB: + vmf->page = virt_to_page(vp->vp_ghcb_page); + break; + default: + return VM_FAULT_SIGBUS; + } + + get_page(vmf->page); + + return 0; +} + +static int mshv_vp_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct mshv_vp *vp = file->private_data; + + switch (vma->vm_pgoff) { + case MSHV_VP_MMAP_OFFSET_REGISTERS: + if (!vp->vp_register_page) + return -ENODEV; + break; + case MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE: + if (!vp->vp_intercept_msg_page) + return -ENODEV; + break; + case MSHV_VP_MMAP_OFFSET_GHCB: + if (!vp->vp_ghcb_page) + return -ENODEV; + break; + default: + return -EINVAL; + } + + vma->vm_ops = &mshv_vp_vm_ops; + return 0; +} + +static int +mshv_vp_release(struct inode *inode, struct file *filp) +{ + struct mshv_vp *vp = filp->private_data; + + trace_mshv_vp_release(vp->vp_partition->pt_id, vp->vp_index); + + /* Rest of VP cleanup happens in destroy_partition() */ + mshv_partition_put(vp->vp_partition); + return 0; +} + +void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index, + struct hv_stats_page *stats_pages[]) +{ + union hv_stats_object_identity identity = { + .vp.partition_id = partition_id, + .vp.vp_index = vp_index, + }; + int err; + + identity.vp.stats_area_type = HV_STATS_AREA_SELF; + err = hv_unmap_stats_page(HV_STATS_OBJECT_VP, + stats_pages[HV_STATS_AREA_SELF], + &identity); + if (err) + pr_err("%s: failed to unmap partition %llu vp %u self stats, err: %d\n", + __func__, partition_id, vp_index, err); + + if (stats_pages[HV_STATS_AREA_PARENT] != stats_pages[HV_STATS_AREA_SELF]) { + identity.vp.stats_area_type = HV_STATS_AREA_PARENT; + err = hv_unmap_stats_page(HV_STATS_OBJECT_VP, + stats_pages[HV_STATS_AREA_PARENT], + &identity); + if (err) + pr_err("%s: failed to unmap partition %llu vp %u parent stats, err: %d\n", + __func__, partition_id, vp_index, err); + } +} + +int mshv_vp_stats_map(u64 partition_id, u32 vp_index, + struct hv_stats_page *stats_pages[]) +{ + union hv_stats_object_identity identity = { + .vp.partition_id = partition_id, + .vp.vp_index = vp_index, + }; + int err; + + identity.vp.stats_area_type = HV_STATS_AREA_SELF; + err = hv_map_stats_page(HV_STATS_OBJECT_VP, &identity, + &stats_pages[HV_STATS_AREA_SELF]); + if (err) { + pr_err("%s: failed to map partition %llu vp %u self stats, err: %d\n", + __func__, partition_id, vp_index, err); + return err; + } + + /* + * L1VH partition cannot access its vp stats in parent area. + */ + if (is_l1vh_parent(partition_id)) { + stats_pages[HV_STATS_AREA_PARENT] = stats_pages[HV_STATS_AREA_SELF]; + } else { + identity.vp.stats_area_type = HV_STATS_AREA_PARENT; + err = hv_map_stats_page(HV_STATS_OBJECT_VP, &identity, + &stats_pages[HV_STATS_AREA_PARENT]); + if (err) { + pr_err("%s: failed to map partition %llu vp %u parent stats, err: %d\n", + __func__, partition_id, vp_index, err); + goto unmap_self; + } + if (!stats_pages[HV_STATS_AREA_PARENT]) + stats_pages[HV_STATS_AREA_PARENT] = stats_pages[HV_STATS_AREA_SELF]; + } + + return 0; + +unmap_self: + identity.vp.stats_area_type = HV_STATS_AREA_SELF; + hv_unmap_stats_page(HV_STATS_OBJECT_VP, + stats_pages[HV_STATS_AREA_SELF], + &identity); + return err; +} + +static long +mshv_partition_ioctl_create_vp(struct mshv_partition *partition, + void __user *arg) +{ + struct mshv_create_vp args; + struct mshv_vp *vp; + struct page *intercept_msg_page, *register_page, *ghcb_page; + struct hv_stats_page *stats_pages[2]; + long ret; + + if (copy_from_user(&args, arg, sizeof(args))) + return -EFAULT; + + if (args.vp_index >= MSHV_MAX_VPS) + return -EINVAL; + + if (partition->pt_vp_array[args.vp_index]) + return -EEXIST; + + ret = hv_call_create_vp(NUMA_NO_NODE, partition->pt_id, args.vp_index, + 0 /* Only valid for root partition VPs */); + if (ret) + return ret; + + ret = hv_map_vp_state_page(partition->pt_id, args.vp_index, + HV_VP_STATE_PAGE_INTERCEPT_MESSAGE, + input_vtl_zero, &intercept_msg_page); + if (ret) + goto destroy_vp; + + if (!mshv_partition_encrypted(partition)) { + ret = hv_map_vp_state_page(partition->pt_id, args.vp_index, + HV_VP_STATE_PAGE_REGISTERS, + input_vtl_zero, ®ister_page); + if (ret) + goto unmap_intercept_message_page; + } + + if (mshv_partition_encrypted(partition) && + is_ghcb_mapping_available()) { + ret = hv_map_vp_state_page(partition->pt_id, args.vp_index, + HV_VP_STATE_PAGE_GHCB, + input_vtl_normal, &ghcb_page); + if (ret) + goto unmap_register_page; + } + + ret = mshv_vp_stats_map(partition->pt_id, args.vp_index, + stats_pages); + if (ret) + goto unmap_ghcb_page; + + vp = kzalloc_obj(*vp); + if (!vp) + goto unmap_stats_pages; + + vp->vp_partition = mshv_partition_get(partition); + if (!vp->vp_partition) { + ret = -EBADF; + goto free_vp; + } + + mutex_init(&vp->vp_mutex); + init_waitqueue_head(&vp->run.vp_suspend_queue); + atomic64_set(&vp->run.vp_signaled_count, 0); + + vp->vp_index = args.vp_index; + vp->vp_intercept_msg_page = page_to_virt(intercept_msg_page); + if (!mshv_partition_encrypted(partition)) + vp->vp_register_page = page_to_virt(register_page); + + if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available()) + vp->vp_ghcb_page = page_to_virt(ghcb_page); + + memcpy(vp->vp_stats_pages, stats_pages, sizeof(stats_pages)); + + ret = mshv_debugfs_vp_create(vp); + if (ret) + goto put_partition; + + /* + * Keep anon_inode_getfd last: it installs fd in the file struct and + * thus makes the state accessible in user space. + */ + ret = anon_inode_getfd("mshv_vp", &mshv_vp_fops, vp, + O_RDWR | O_CLOEXEC); + if (ret < 0) + goto remove_debugfs_vp; + + /* already exclusive with the partition mutex for all ioctls */ + partition->pt_vp_count++; + partition->pt_vp_array[args.vp_index] = vp; + + goto out; + +remove_debugfs_vp: + mshv_debugfs_vp_remove(vp); +put_partition: + mshv_partition_put(partition); +free_vp: + kfree(vp); +unmap_stats_pages: + mshv_vp_stats_unmap(partition->pt_id, args.vp_index, stats_pages); +unmap_ghcb_page: + if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available()) + hv_unmap_vp_state_page(partition->pt_id, args.vp_index, + HV_VP_STATE_PAGE_GHCB, ghcb_page, + input_vtl_normal); +unmap_register_page: + if (!mshv_partition_encrypted(partition)) + hv_unmap_vp_state_page(partition->pt_id, args.vp_index, + HV_VP_STATE_PAGE_REGISTERS, + register_page, input_vtl_zero); +unmap_intercept_message_page: + hv_unmap_vp_state_page(partition->pt_id, args.vp_index, + HV_VP_STATE_PAGE_INTERCEPT_MESSAGE, + intercept_msg_page, input_vtl_zero); +destroy_vp: + hv_call_delete_vp(partition->pt_id, args.vp_index); +out: + trace_mshv_create_vp(partition->pt_id, args.vp_index, ret); + return ret; +} + +static int mshv_init_async_handler(struct mshv_partition *partition) +{ + if (completion_done(&partition->async_hypercall)) { + pt_err(partition, + "Cannot issue async hypercall while another one in progress!\n"); + return -EPERM; + } + + reinit_completion(&partition->async_hypercall); + return 0; +} + +static void mshv_async_hvcall_handler(void *data, u64 *status) +{ + struct mshv_partition *partition = data; + + wait_for_completion(&partition->async_hypercall); + pt_dbg(partition, "Async hypercall completed!\n"); + + *status = partition->async_hypercall_status; +} + +/* + * NB: caller checks and makes sure mem->size is page aligned + * Returns: 0 with regionpp updated on success, or -errno + */ +static int mshv_partition_create_region(struct mshv_partition *partition, + struct mshv_user_mem_region *mem, + struct mshv_mem_region **regionpp, + bool is_mmio) +{ + struct mshv_mem_region *rg; + u64 nr_pages = HVPFN_DOWN(mem->size); + + /* Reject overlapping regions */ + spin_lock(&partition->pt_mem_regions_lock); + hlist_for_each_entry(rg, &partition->pt_mem_regions, hnode) { + if (mem->guest_pfn + nr_pages <= rg->start_gfn || + rg->start_gfn + rg->nr_pages <= mem->guest_pfn) + continue; + spin_unlock(&partition->pt_mem_regions_lock); + return -EEXIST; + } + spin_unlock(&partition->pt_mem_regions_lock); + + rg = mshv_region_create(mem->guest_pfn, nr_pages, + mem->userspace_addr, mem->flags); + if (IS_ERR(rg)) + return PTR_ERR(rg); + + if (is_mmio) + rg->mreg_type = MSHV_REGION_TYPE_MMIO; + else if (mshv_partition_encrypted(partition) || + !mshv_region_movable_init(rg)) + rg->mreg_type = MSHV_REGION_TYPE_MEM_PINNED; + else + rg->mreg_type = MSHV_REGION_TYPE_MEM_MOVABLE; + + rg->partition = partition; + + *regionpp = rg; + + return 0; +} + +/** + * mshv_prepare_pinned_region - Pin and map memory regions + * @region: Pointer to the memory region structure + * + * This function processes memory regions that are explicitly marked as pinned. + * Pinned regions are preallocated, mapped upfront, and do not rely on fault-based + * population. The function ensures the region is properly populated, handles + * encryption requirements for SNP partitions if applicable, maps the region, + * and performs necessary sharing or eviction operations based on the mapping + * result. + * + * Return: 0 on success, negative error code on failure. + */ +static int mshv_prepare_pinned_region(struct mshv_mem_region *region) +{ + struct mshv_partition *partition = region->partition; + int ret; + + ret = mshv_region_pin(region); + if (ret) { + pt_err(partition, "Failed to pin memory region: %d\n", + ret); + goto err_out; + } + + /* + * For an SNP partition it is a requirement that for every memory region + * that we are going to map for this partition we should make sure that + * host access to that region is released. This is ensured by doing an + * additional hypercall which will update the SLAT to release host + * access to guest memory regions. + */ + if (mshv_partition_encrypted(partition)) { + ret = mshv_region_unshare(region); + if (ret) { + pt_err(partition, + "Failed to unshare memory region (guest_pfn: %llu): %d\n", + region->start_gfn, ret); + goto invalidate_region; + } + } + + ret = mshv_region_map(region); + if (ret && mshv_partition_encrypted(partition)) { + int shrc; + + shrc = mshv_region_share(region); + if (!shrc) + goto invalidate_region; + + pt_err(partition, + "Failed to share memory region (guest_pfn: %llu): %d\n", + region->start_gfn, shrc); + /* + * Don't unpin if marking shared failed because pages are no + * longer mapped in the host, ie root, anymore. + */ + goto err_out; + } + + return 0; + +invalidate_region: + mshv_region_invalidate(region); +err_out: + return ret; +} + +/* + * This maps two things: guest RAM and for pci passthru mmio space. + * + * mmio: + * - vfio overloads vm_pgoff to store the mmio start pfn/spa. + * - Two things need to happen for mapping mmio range: + * 1. mapped in the uaddr so VMM can access it. + * 2. mapped in the hwpt (gfn <-> mmio phys addr) so guest can access it. + * + * This function takes care of the second. The first one is managed by vfio, + * and hence is taken care of via vfio_pci_mmap_fault(). + */ +static long +mshv_map_user_memory(struct mshv_partition *partition, + struct mshv_user_mem_region *mem) +{ + struct mshv_mem_region *region; + struct vm_area_struct *vma; + bool is_mmio; + ulong mmio_pfn; + long ret; + + if (mem->flags & BIT(MSHV_SET_MEM_BIT_UNMAP) || + !access_ok((const void __user *)mem->userspace_addr, mem->size)) + return -EINVAL; + + mmap_read_lock(current->mm); + vma = vma_lookup(current->mm, mem->userspace_addr); + is_mmio = vma ? !!(vma->vm_flags & (VM_IO | VM_PFNMAP)) : 0; + mmio_pfn = is_mmio ? vma->vm_pgoff : 0; + mmap_read_unlock(current->mm); + + if (!vma) + return -EINVAL; + + ret = mshv_partition_create_region(partition, mem, ®ion, + is_mmio); + if (ret) + return ret; + + switch (region->mreg_type) { + case MSHV_REGION_TYPE_MEM_PINNED: + ret = mshv_prepare_pinned_region(region); + break; + case MSHV_REGION_TYPE_MEM_MOVABLE: + /* + * For movable memory regions, remap with no access to let + * the hypervisor track dirty pages, enabling pre-copy live + * migration. + */ + ret = hv_call_map_gpa_pages(partition->pt_id, + region->start_gfn, + region->nr_pages, + HV_MAP_GPA_NO_ACCESS, NULL); + break; + case MSHV_REGION_TYPE_MMIO: + ret = hv_call_map_mmio_pages(partition->pt_id, + region->start_gfn, + mmio_pfn, + region->nr_pages); + break; + } + + trace_mshv_map_user_memory(partition->pt_id, region->start_uaddr, + region->start_gfn, region->nr_pages, + region->hv_map_flags, ret); + + if (ret) + goto errout; + + spin_lock(&partition->pt_mem_regions_lock); + hlist_add_head(®ion->hnode, &partition->pt_mem_regions); + spin_unlock(&partition->pt_mem_regions_lock); + + return 0; + +errout: + mshv_region_put(region); + return ret; +} + +/* Called for unmapping both the guest ram and the mmio space */ +static long +mshv_unmap_user_memory(struct mshv_partition *partition, + struct mshv_user_mem_region *mem) +{ + struct mshv_mem_region *region; + + if (!(mem->flags & BIT(MSHV_SET_MEM_BIT_UNMAP))) + return -EINVAL; + + spin_lock(&partition->pt_mem_regions_lock); + + region = mshv_partition_region_by_gfn(partition, mem->guest_pfn); + if (!region) { + spin_unlock(&partition->pt_mem_regions_lock); + return -ENOENT; + } + + /* Paranoia check */ + if (region->start_uaddr != mem->userspace_addr || + region->start_gfn != mem->guest_pfn || + region->nr_pages != HVPFN_DOWN(mem->size)) { + spin_unlock(&partition->pt_mem_regions_lock); + return -EINVAL; + } + + hlist_del(®ion->hnode); + + spin_unlock(&partition->pt_mem_regions_lock); + + mshv_region_put(region); + + return 0; +} + +static long +mshv_partition_ioctl_set_memory(struct mshv_partition *partition, + struct mshv_user_mem_region __user *user_mem) +{ + struct mshv_user_mem_region mem; + + if (copy_from_user(&mem, user_mem, sizeof(mem))) + return -EFAULT; + + if (!mem.size || + !PAGE_ALIGNED(mem.size) || + !PAGE_ALIGNED(mem.userspace_addr) || + (mem.flags & ~MSHV_SET_MEM_FLAGS_MASK) || + mshv_field_nonzero(mem, rsvd)) + return -EINVAL; + + if (mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP)) + return mshv_unmap_user_memory(partition, &mem); + + return mshv_map_user_memory(partition, &mem); +} + +static long +mshv_partition_ioctl_ioeventfd(struct mshv_partition *partition, + void __user *user_args) +{ + struct mshv_user_ioeventfd args; + + if (copy_from_user(&args, user_args, sizeof(args))) + return -EFAULT; + + return mshv_set_unset_ioeventfd(partition, &args); +} + +static long +mshv_partition_ioctl_irqfd(struct mshv_partition *partition, + void __user *user_args) +{ + struct mshv_user_irqfd args; + + if (copy_from_user(&args, user_args, sizeof(args))) + return -EFAULT; + + return mshv_set_unset_irqfd(partition, &args); +} + +static long +mshv_partition_ioctl_get_gpap_access_bitmap(struct mshv_partition *partition, + void __user *user_args) +{ + struct mshv_gpap_access_bitmap args; + union hv_gpa_page_access_state *states; + long ret, i; + union hv_gpa_page_access_state_flags hv_flags = {}; + u8 hv_type_mask; + ulong bitmap_buf_sz, states_buf_sz; + int written = 0; + + if (copy_from_user(&args, user_args, sizeof(args))) + return -EFAULT; + + if (args.access_type >= MSHV_GPAP_ACCESS_TYPE_COUNT || + args.access_op >= MSHV_GPAP_ACCESS_OP_COUNT || + mshv_field_nonzero(args, rsvd) || !args.page_count || + !args.bitmap_ptr) + return -EINVAL; + + if (check_mul_overflow(args.page_count, sizeof(*states), &states_buf_sz)) + return -E2BIG; + + /* Num bytes needed to store bitmap; one bit per page rounded up */ + bitmap_buf_sz = DIV_ROUND_UP(args.page_count, 8); + + /* Sanity check */ + if (bitmap_buf_sz > states_buf_sz) + return -EBADFD; + + switch (args.access_type) { + case MSHV_GPAP_ACCESS_TYPE_ACCESSED: + hv_type_mask = 1; + if (args.access_op == MSHV_GPAP_ACCESS_OP_CLEAR) { + hv_flags.clear_accessed = 1; + /* not accessed implies not dirty */ + hv_flags.clear_dirty = 1; + } else { /* MSHV_GPAP_ACCESS_OP_SET */ + hv_flags.set_accessed = 1; + } + break; + case MSHV_GPAP_ACCESS_TYPE_DIRTY: + hv_type_mask = 2; + if (args.access_op == MSHV_GPAP_ACCESS_OP_CLEAR) { + hv_flags.clear_dirty = 1; + } else { /* MSHV_GPAP_ACCESS_OP_SET */ + hv_flags.set_dirty = 1; + /* dirty implies accessed */ + hv_flags.set_accessed = 1; + } + break; + } + + states = vzalloc(states_buf_sz); + if (!states) + return -ENOMEM; + + ret = hv_call_get_gpa_access_states(partition->pt_id, args.page_count, + args.gpap_base, hv_flags, &written, + states); + if (ret) + goto free_return; + + /* + * Overwrite states buffer with bitmap - the bits in hv_type_mask + * correspond to bitfields in hv_gpa_page_access_state + */ + for (i = 0; i < written; ++i) + __assign_bit(i, (ulong *)states, + states[i].as_uint8 & hv_type_mask); + + /* zero the unused bits in the last byte(s) of the returned bitmap */ + for (i = written; i < bitmap_buf_sz * 8; ++i) + __clear_bit(i, (ulong *)states); + + if (copy_to_user((void __user *)args.bitmap_ptr, states, bitmap_buf_sz)) + ret = -EFAULT; + +free_return: + vfree(states); + return ret; +} + +static long +mshv_partition_ioctl_set_msi_routing(struct mshv_partition *partition, + void __user *user_args) +{ + struct mshv_user_irq_entry *entries = NULL; + struct mshv_user_irq_table args; + long ret; + + if (copy_from_user(&args, user_args, sizeof(args))) + return -EFAULT; + + if (args.nr > MSHV_MAX_GUEST_IRQS || + mshv_field_nonzero(args, rsvd)) + return -EINVAL; + + if (args.nr) { + struct mshv_user_irq_table __user *urouting = user_args; + + entries = vmemdup_user(urouting->entries, + array_size(sizeof(*entries), + args.nr)); + if (IS_ERR(entries)) + return PTR_ERR(entries); + } + ret = mshv_update_routing_table(partition, entries, args.nr); + kvfree(entries); + + return ret; +} + +static long +mshv_partition_ioctl_initialize(struct mshv_partition *partition) +{ + long ret; + + if (partition->pt_initialized) + return 0; + + ret = hv_call_initialize_partition(partition->pt_id); + if (ret) + goto withdraw_mem; + + ret = mshv_debugfs_partition_create(partition); + if (ret) + goto finalize_partition; + + partition->pt_initialized = true; + + return 0; + +finalize_partition: + hv_call_finalize_partition(partition->pt_id); +withdraw_mem: + hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id); + + return ret; +} + +static long +mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) +{ + struct mshv_partition *partition = filp->private_data; + long ret; + void __user *uarg = (void __user *)arg; + + if (mutex_lock_killable(&partition->pt_mutex)) + return -EINTR; + + switch (ioctl) { + case MSHV_INITIALIZE_PARTITION: + ret = mshv_partition_ioctl_initialize(partition); + break; + case MSHV_SET_GUEST_MEMORY: + ret = mshv_partition_ioctl_set_memory(partition, uarg); + break; + case MSHV_CREATE_VP: + ret = mshv_partition_ioctl_create_vp(partition, uarg); + break; + case MSHV_IRQFD: + ret = mshv_partition_ioctl_irqfd(partition, uarg); + break; + case MSHV_IOEVENTFD: + ret = mshv_partition_ioctl_ioeventfd(partition, uarg); + break; + case MSHV_SET_MSI_ROUTING: + ret = mshv_partition_ioctl_set_msi_routing(partition, uarg); + break; + case MSHV_GET_GPAP_ACCESS_BITMAP: + ret = mshv_partition_ioctl_get_gpap_access_bitmap(partition, + uarg); + break; + case MSHV_ROOT_HVCALL: + ret = mshv_ioctl_passthru_hvcall(partition, true, uarg); + break; + default: + ret = -ENOTTY; + } + + mutex_unlock(&partition->pt_mutex); + return ret; +} + +static int +disable_vp_dispatch(struct mshv_vp *vp) +{ + int ret; + struct hv_register_assoc dispatch_suspend = { + .name = HV_REGISTER_DISPATCH_SUSPEND, + .value.dispatch_suspend.suspended = 1, + }; + + ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id, + 1, &dispatch_suspend); + if (ret) + vp_err(vp, "failed to suspend\n"); + + trace_mshv_disable_vp_dispatch(vp->vp_partition->pt_id, + vp->vp_index, ret); + + return ret; +} + +static int +get_vp_signaled_count(struct mshv_vp *vp, u64 *count) +{ + int ret; + struct hv_register_assoc root_signal_count = { + .name = HV_REGISTER_VP_ROOT_SIGNAL_COUNT, + }; + + ret = mshv_get_vp_registers(vp->vp_index, vp->vp_partition->pt_id, + 1, &root_signal_count); + + if (ret) { + vp_err(vp, "Failed to get root signal count"); + *count = 0; + return ret; + } + + *count = root_signal_count.value.reg64; + + return ret; +} + +static void +drain_vp_signals(struct mshv_vp *vp) +{ + u64 hv_signal_count; + u64 vp_signal_count; + + get_vp_signaled_count(vp, &hv_signal_count); + + vp_signal_count = atomic64_read(&vp->run.vp_signaled_count); + + /* + * There should be at most 1 outstanding notification, but be extra + * careful anyway. + */ + while (hv_signal_count != vp_signal_count) { + WARN_ON(hv_signal_count - vp_signal_count != 1); + + if (wait_event_interruptible(vp->run.vp_suspend_queue, + vp->run.kicked_by_hv == 1)) + break; + vp->run.kicked_by_hv = 0; + vp_signal_count = atomic64_read(&vp->run.vp_signaled_count); + } + + trace_mshv_drain_vp_signals(vp->vp_partition->pt_id, vp->vp_index); +} + +static void drain_all_vps(const struct mshv_partition *partition) +{ + int i; + struct mshv_vp *vp; + + /* + * VPs are reachable from ISR. It is safe to not take the partition + * lock because nobody else can enter this function and drop the + * partition from the list. + */ + for (i = 0; i < MSHV_MAX_VPS; i++) { + vp = partition->pt_vp_array[i]; + if (!vp) + continue; + /* + * Disable dispatching of the VP in the hypervisor. After this + * the hypervisor guarantees it won't generate any signals for + * the VP and the hypervisor's VP signal count won't change. + */ + disable_vp_dispatch(vp); + drain_vp_signals(vp); + } +} + +static void +remove_partition(struct mshv_partition *partition) +{ + spin_lock(&mshv_root.pt_ht_lock); + hlist_del_rcu(&partition->pt_hnode); + spin_unlock(&mshv_root.pt_ht_lock); + + synchronize_rcu(); +} + +/* + * Tear down a partition and remove it from the list. + * Partition's refcount must be 0 + */ +static void destroy_partition(struct mshv_partition *partition) +{ + struct mshv_vp *vp; + struct mshv_mem_region *region; + struct hlist_node *n; + int i; + + if (refcount_read(&partition->pt_ref_count)) { + pt_err(partition, + "Attempt to destroy partition but refcount > 0\n"); + return; + } + + trace_mshv_destroy_partition(partition->pt_id); + + if (partition->pt_initialized) { + /* + * We only need to drain signals for root scheduler. This should be + * done before removing the partition from the partition list. + */ + if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) + drain_all_vps(partition); + + /* Remove vps */ + for (i = 0; i < MSHV_MAX_VPS; ++i) { + vp = partition->pt_vp_array[i]; + if (!vp) + continue; + + mshv_debugfs_vp_remove(vp); + mshv_vp_stats_unmap(partition->pt_id, vp->vp_index, + vp->vp_stats_pages); + + if (vp->vp_register_page) { + (void)hv_unmap_vp_state_page(partition->pt_id, + vp->vp_index, + HV_VP_STATE_PAGE_REGISTERS, + virt_to_page(vp->vp_register_page), + input_vtl_zero); + vp->vp_register_page = NULL; + } + + (void)hv_unmap_vp_state_page(partition->pt_id, + vp->vp_index, + HV_VP_STATE_PAGE_INTERCEPT_MESSAGE, + virt_to_page(vp->vp_intercept_msg_page), + input_vtl_zero); + vp->vp_intercept_msg_page = NULL; + + if (vp->vp_ghcb_page) { + (void)hv_unmap_vp_state_page(partition->pt_id, + vp->vp_index, + HV_VP_STATE_PAGE_GHCB, + virt_to_page(vp->vp_ghcb_page), + input_vtl_normal); + vp->vp_ghcb_page = NULL; + } + + kfree(vp); + + partition->pt_vp_array[i] = NULL; + } + + mshv_debugfs_partition_remove(partition); + + /* Deallocates and unmaps everything including vcpus, GPA mappings etc */ + hv_call_finalize_partition(partition->pt_id); + + partition->pt_initialized = false; + } + + remove_partition(partition); + + hlist_for_each_entry_safe(region, n, &partition->pt_mem_regions, + hnode) { + hlist_del(®ion->hnode); + mshv_region_put(region); + } + + /* Withdraw and free all pages we deposited */ + hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id); + hv_call_delete_partition(partition->pt_id); + + mshv_free_routing_table(partition); + kfree(partition); +} + +struct +mshv_partition *mshv_partition_get(struct mshv_partition *partition) +{ + if (refcount_inc_not_zero(&partition->pt_ref_count)) + return partition; + return NULL; +} + +struct +mshv_partition *mshv_partition_find(u64 partition_id) + __must_hold(RCU) +{ + struct mshv_partition *p; + + hash_for_each_possible_rcu(mshv_root.pt_htable, p, pt_hnode, + partition_id) + if (p->pt_id == partition_id) + return p; + + return NULL; +} + +void +mshv_partition_put(struct mshv_partition *partition) +{ + if (refcount_dec_and_test(&partition->pt_ref_count)) + destroy_partition(partition); +} + +static int +mshv_partition_release(struct inode *inode, struct file *filp) +{ + struct mshv_partition *partition = filp->private_data; + + trace_mshv_partition_release(partition->pt_id); + + mshv_eventfd_release(partition); + + cleanup_srcu_struct(&partition->pt_irq_srcu); + + mshv_partition_put(partition); + + return 0; +} + +static int +add_partition(struct mshv_partition *partition) +{ + spin_lock(&mshv_root.pt_ht_lock); + + hash_add_rcu(mshv_root.pt_htable, &partition->pt_hnode, + partition->pt_id); + + spin_unlock(&mshv_root.pt_ht_lock); + + return 0; +} + +static_assert(MSHV_NUM_CPU_FEATURES_BANKS == + HV_PARTITION_PROCESSOR_FEATURES_BANKS); + +static long mshv_ioctl_process_pt_flags(void __user *user_arg, u64 *pt_flags, + struct hv_partition_creation_properties *cr_props, + union hv_partition_isolation_properties *isol_props) +{ + int i; + struct mshv_create_partition_v2 args; + union hv_partition_processor_features *disabled_procs; + union hv_partition_processor_xsave_features *disabled_xsave; + + /* First, copy v1 struct in case user is on previous versions */ + if (copy_from_user(&args, user_arg, + sizeof(struct mshv_create_partition))) + return -EFAULT; + + if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) || + args.pt_isolation >= MSHV_PT_ISOLATION_COUNT) + return -EINVAL; + + disabled_procs = &cr_props->disabled_processor_features; + disabled_xsave = &cr_props->disabled_processor_xsave_features; + + /* Check if user provided newer struct with feature fields */ + if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_CPU_AND_XSAVE_FEATURES)) { + if (copy_from_user(&args, user_arg, sizeof(args))) + return -EFAULT; + + /* Re-validate v1 fields after second copy_from_user() */ + if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) || + args.pt_isolation >= MSHV_PT_ISOLATION_COUNT) + return -EINVAL; + + if (args.pt_num_cpu_fbanks != MSHV_NUM_CPU_FEATURES_BANKS || + mshv_field_nonzero(args, pt_rsvd) || + mshv_field_nonzero(args, pt_rsvd1)) + return -EINVAL; + + /* + * Note this assumes MSHV_NUM_CPU_FEATURES_BANKS will never + * change and equals HV_PARTITION_PROCESSOR_FEATURES_BANKS + * (i.e. 2). + * + * Further banks (index >= 2) will be modifiable as 'early' + * properties via the set partition property hypercall. + */ + for (i = 0; i < HV_PARTITION_PROCESSOR_FEATURES_BANKS; i++) + disabled_procs->as_uint64[i] = args.pt_cpu_fbanks[i]; + +#if IS_ENABLED(CONFIG_X86_64) + disabled_xsave->as_uint64 = args.pt_disabled_xsave; +#else + /* + * In practice this field is ignored on arm64, but safer to + * zero it in case it is ever used. + */ + disabled_xsave->as_uint64 = 0; + + if (mshv_field_nonzero(args, pt_rsvd2)) + return -EINVAL; +#endif + } else { + /* + * v1 behavior: try to enable everything. The hypervisor will + * disable features that are not supported. The banks can be + * queried via the get partition property hypercall. + */ + for (i = 0; i < HV_PARTITION_PROCESSOR_FEATURES_BANKS; i++) + disabled_procs->as_uint64[i] = 0; + + disabled_xsave->as_uint64 = 0; + } + + /* Only support EXO partitions */ + *pt_flags = HV_PARTITION_CREATION_FLAG_EXO_PARTITION | + HV_PARTITION_CREATION_FLAG_INTERCEPT_MESSAGE_PAGE_ENABLED; + + if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_LAPIC)) + *pt_flags |= HV_PARTITION_CREATION_FLAG_LAPIC_ENABLED; + if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_X2APIC)) + *pt_flags |= HV_PARTITION_CREATION_FLAG_X2APIC_CAPABLE; + if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_GPA_SUPER_PAGES)) + *pt_flags |= HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED; + if (args.pt_flags & BIT(MSHV_PT_BIT_NESTED_VIRTUALIZATION)) + *pt_flags |= HV_PARTITION_CREATION_FLAG_NESTED_VIRTUALIZATION_CAPABLE; + if (args.pt_flags & BIT(MSHV_PT_BIT_SMT_ENABLED_GUEST)) + *pt_flags |= HV_PARTITION_CREATION_FLAG_SMT_ENABLED_GUEST; + + isol_props->as_uint64 = 0; + + switch (args.pt_isolation) { + case MSHV_PT_ISOLATION_NONE: + isol_props->isolation_type = HV_PARTITION_ISOLATION_TYPE_NONE; + break; + } + + return 0; +} + +static long +mshv_ioctl_create_partition(void __user *user_arg, struct device *module_dev) +{ + u64 creation_flags; + struct hv_partition_creation_properties creation_properties; + union hv_partition_isolation_properties isolation_properties; + struct mshv_partition *partition; + u64 pt_id = -1; + long ret; + + ret = mshv_ioctl_process_pt_flags(user_arg, &creation_flags, + &creation_properties, + &isolation_properties); + if (ret) + return ret; + + partition = kzalloc_obj(*partition); + if (!partition) + return -ENOMEM; + + partition->pt_module_dev = module_dev; + partition->isolation_type = isolation_properties.isolation_type; + + refcount_set(&partition->pt_ref_count, 1); + + mutex_init(&partition->pt_mutex); + + mutex_init(&partition->pt_irq_lock); + + init_completion(&partition->async_hypercall); + + INIT_HLIST_HEAD(&partition->irq_ack_notifier_list); + + INIT_HLIST_HEAD(&partition->pt_devices); + + spin_lock_init(&partition->pt_mem_regions_lock); + INIT_HLIST_HEAD(&partition->pt_mem_regions); + + mshv_eventfd_init(partition); + + ret = init_srcu_struct(&partition->pt_irq_srcu); + if (ret) + goto free_partition; + + ret = hv_call_create_partition(creation_flags, + creation_properties, + isolation_properties, + &pt_id); + if (ret) + goto cleanup_irq_srcu; + + partition->pt_id = pt_id; + + ret = add_partition(partition); + if (ret) + goto delete_partition; + + ret = mshv_init_async_handler(partition); + if (ret) + goto remove_partition; + + ret = FD_ADD(O_CLOEXEC, anon_inode_getfile("mshv_partition", + &mshv_partition_fops, + partition, O_RDWR)); + if (ret < 0) + goto remove_partition; + + goto out; + +remove_partition: + remove_partition(partition); +delete_partition: + hv_call_delete_partition(partition->pt_id); +cleanup_irq_srcu: + cleanup_srcu_struct(&partition->pt_irq_srcu); +free_partition: + kfree(partition); +out: + trace_mshv_create_partition(pt_id, ret); + return ret; +} + +static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) +{ + struct miscdevice *misc = filp->private_data; + + switch (ioctl) { + case MSHV_CREATE_PARTITION: + return mshv_ioctl_create_partition((void __user *)arg, + misc->this_device); + case MSHV_ROOT_HVCALL: + return mshv_ioctl_passthru_hvcall(NULL, false, + (void __user *)arg); + } + + return -ENOTTY; +} + +static int +mshv_dev_open(struct inode *inode, struct file *filp) +{ + return 0; +} + +static int +mshv_dev_release(struct inode *inode, struct file *filp) +{ + return 0; +} + +static int mshv_root_sched_online; + +static const char *scheduler_type_to_string(enum hv_scheduler_type type) +{ + switch (type) { + case HV_SCHEDULER_TYPE_LP: + return "classic scheduler without SMT"; + case HV_SCHEDULER_TYPE_LP_SMT: + return "classic scheduler with SMT"; + case HV_SCHEDULER_TYPE_CORE_SMT: + return "core scheduler"; + case HV_SCHEDULER_TYPE_ROOT: + return "root scheduler"; + default: + return "unknown scheduler"; + }; +} + +static int __init l1vh_retrieve_scheduler_type(enum hv_scheduler_type *out) +{ + u64 integrated_sched_enabled; + int ret; + + *out = HV_SCHEDULER_TYPE_CORE_SMT; + + if (!mshv_root.vmm_caps.vmm_enable_integrated_scheduler) + return 0; + + ret = hv_call_get_partition_property_ex(HV_PARTITION_ID_SELF, + HV_PARTITION_PROPERTY_INTEGRATED_SCHEDULER_ENABLED, + 0, &integrated_sched_enabled, + sizeof(integrated_sched_enabled)); + if (ret) + return ret; + + if (integrated_sched_enabled) + *out = HV_SCHEDULER_TYPE_ROOT; + + return 0; +} + +/* TODO move this to hv_common.c when needed outside */ +static int __init hv_retrieve_scheduler_type(enum hv_scheduler_type *out) +{ + struct hv_input_get_system_property *input; + struct hv_output_get_system_property *output; + unsigned long flags; + u64 status; + + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + output = *this_cpu_ptr(hyperv_pcpu_output_arg); + + memset(input, 0, sizeof(*input)); + memset(output, 0, sizeof(*output)); + input->property_id = HV_SYSTEM_PROPERTY_SCHEDULER_TYPE; + + status = hv_do_hypercall(HVCALL_GET_SYSTEM_PROPERTY, input, output); + if (!hv_result_success(status)) { + local_irq_restore(flags); + pr_err("%s: %s\n", __func__, hv_result_to_string(status)); + return hv_result_to_errno(status); + } + + *out = output->scheduler_type; + local_irq_restore(flags); + + return 0; +} + +/* Retrieve and stash the supported scheduler type */ +static int __init mshv_retrieve_scheduler_type(struct device *dev) +{ + int ret; + + if (hv_l1vh_partition()) + ret = l1vh_retrieve_scheduler_type(&hv_scheduler_type); + else + ret = hv_retrieve_scheduler_type(&hv_scheduler_type); + if (ret) + return ret; + + dev_info(dev, "Hypervisor using %s\n", + scheduler_type_to_string(hv_scheduler_type)); + + switch (hv_scheduler_type) { + case HV_SCHEDULER_TYPE_CORE_SMT: + case HV_SCHEDULER_TYPE_LP_SMT: + case HV_SCHEDULER_TYPE_ROOT: + case HV_SCHEDULER_TYPE_LP: + /* Supported scheduler, nothing to do */ + break; + default: + dev_err(dev, "unsupported scheduler 0x%x, bailing.\n", + hv_scheduler_type); + return -EOPNOTSUPP; + } + + return 0; +} + +static int mshv_root_scheduler_init(unsigned int cpu) +{ + void **inputarg, **outputarg, *p; + + inputarg = (void **)this_cpu_ptr(root_scheduler_input); + outputarg = (void **)this_cpu_ptr(root_scheduler_output); + + /* Allocate two consecutive pages. One for input, one for output. */ + p = kmalloc(2 * HV_HYP_PAGE_SIZE, GFP_KERNEL); + if (!p) + return -ENOMEM; + + *inputarg = p; + *outputarg = (char *)p + HV_HYP_PAGE_SIZE; + + return 0; +} + +static int mshv_root_scheduler_cleanup(unsigned int cpu) +{ + void *p, **inputarg, **outputarg; + + inputarg = (void **)this_cpu_ptr(root_scheduler_input); + outputarg = (void **)this_cpu_ptr(root_scheduler_output); + + p = *inputarg; + + *inputarg = NULL; + *outputarg = NULL; + + kfree(p); + + return 0; +} + +/* Must be called after retrieving the scheduler type */ +static int +root_scheduler_init(struct device *dev) +{ + int ret; + + if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT) + return 0; + + root_scheduler_input = alloc_percpu(void *); + root_scheduler_output = alloc_percpu(void *); + + if (!root_scheduler_input || !root_scheduler_output) { + dev_err(dev, "Failed to allocate root scheduler buffers\n"); + ret = -ENOMEM; + goto out; + } + + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_root_sched", + mshv_root_scheduler_init, + mshv_root_scheduler_cleanup); + + if (ret < 0) { + dev_err(dev, "Failed to setup root scheduler state: %i\n", ret); + goto out; + } + + mshv_root_sched_online = ret; + + return 0; + +out: + free_percpu(root_scheduler_input); + free_percpu(root_scheduler_output); + return ret; +} + +static void +root_scheduler_deinit(void) +{ + if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT) + return; + + cpuhp_remove_state(mshv_root_sched_online); + free_percpu(root_scheduler_input); + free_percpu(root_scheduler_output); +} + +static int __init mshv_init_vmm_caps(struct device *dev) +{ + int ret; + + ret = hv_call_get_partition_property_ex(HV_PARTITION_ID_SELF, + HV_PARTITION_PROPERTY_VMM_CAPABILITIES, + 0, &mshv_root.vmm_caps, + sizeof(mshv_root.vmm_caps)); + if (ret && hv_l1vh_partition()) { + dev_err(dev, "Failed to get VMM capabilities: %d\n", ret); + return ret; + } + + dev_dbg(dev, "vmm_caps = %#llx\n", mshv_root.vmm_caps.as_uint64[0]); + + return 0; +} + +static int __init mshv_parent_partition_init(void) +{ + int ret; + struct device *dev; + union hv_hypervisor_version_info version_info; + + if (!hv_parent_partition() || is_kdump_kernel()) + return -ENODEV; + + if (hv_get_hypervisor_version(&version_info)) + return -ENODEV; + + ret = misc_register(&mshv_dev); + if (ret) + return ret; + + dev = mshv_dev.this_device; + + if (version_info.build_number < MSHV_HV_MIN_VERSION || + version_info.build_number > MSHV_HV_MAX_VERSION) { + dev_err(dev, "Running on unvalidated Hyper-V version\n"); + dev_err(dev, "Versions: current: %u min: %u max: %u\n", + version_info.build_number, MSHV_HV_MIN_VERSION, + MSHV_HV_MAX_VERSION); + } + + ret = mshv_synic_init(dev); + if (ret) + goto device_deregister; + + ret = mshv_init_vmm_caps(dev); + if (ret) + goto synic_cleanup; + + ret = mshv_retrieve_scheduler_type(dev); + if (ret) + goto synic_cleanup; + + ret = root_scheduler_init(dev); + if (ret) + goto synic_cleanup; + + ret = mshv_debugfs_init(); + if (ret) + goto deinit_root_scheduler; + + ret = mshv_irqfd_wq_init(); + if (ret) + goto exit_debugfs; + + spin_lock_init(&mshv_root.pt_ht_lock); + hash_init(mshv_root.pt_htable); + + hv_setup_mshv_handler(mshv_isr); + + return 0; + +exit_debugfs: + mshv_debugfs_exit(); +deinit_root_scheduler: + root_scheduler_deinit(); +synic_cleanup: + mshv_synic_exit(); +device_deregister: + misc_deregister(&mshv_dev); + return ret; +} + +static void __exit mshv_parent_partition_exit(void) +{ + hv_setup_mshv_handler(NULL); + mshv_port_table_fini(); + mshv_debugfs_exit(); + misc_deregister(&mshv_dev); + mshv_irqfd_wq_cleanup(); + root_scheduler_deinit(); + mshv_synic_exit(); +} + +module_init(mshv_parent_partition_init); +module_exit(mshv_parent_partition_exit); diff --git a/drivers/hv/mshv_synic.c b/drivers/hv/mshv_synic.c new file mode 100644 index 000000000000..e2288a726fec --- /dev/null +++ b/drivers/hv/mshv_synic.c @@ -0,0 +1,820 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2023, Microsoft Corporation. + * + * mshv_root module's main interrupt handler and associated functionality. + * + * Authors: Microsoft Linux virtualization team + */ + +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include <linux/interrupt.h> +#include <linux/io.h> +#include <linux/cpuhotplug.h> +#include <linux/reboot.h> +#include <asm/mshyperv.h> +#include <linux/acpi.h> + +#include "mshv_eventfd.h" +#include "mshv.h" + +static int synic_cpuhp_online; +static struct hv_synic_pages __percpu *synic_pages; +static int mshv_sint_vector = -1; /* hwirq for the SynIC SINTs */ +static int mshv_sint_irq = -1; /* Linux IRQ for mshv_sint_vector */ + +static u32 synic_event_ring_get_queued_port(u32 sint_index) +{ + struct hv_synic_event_ring_page **event_ring_page; + volatile struct hv_synic_event_ring *ring; + struct hv_synic_pages *spages; + u8 **synic_eventring_tail; + u32 message; + u8 tail; + + spages = this_cpu_ptr(synic_pages); + event_ring_page = &spages->synic_event_ring_page; + synic_eventring_tail = (u8 **)this_cpu_ptr(hv_synic_eventring_tail); + + if (unlikely(!*synic_eventring_tail)) { + pr_debug("Missing synic event ring tail!\n"); + return 0; + } + tail = (*synic_eventring_tail)[sint_index]; + + if (unlikely(!*event_ring_page)) { + pr_debug("Missing synic event ring page!\n"); + return 0; + } + + ring = &(*event_ring_page)->sint_event_ring[sint_index]; + + /* + * Get the message. + */ + message = ring->data[tail]; + + if (!message) { + if (ring->ring_full) { + /* + * Ring is marked full, but we would have consumed all + * the messages. Notify the hypervisor that ring is now + * empty and check again. + */ + ring->ring_full = 0; + hv_call_notify_port_ring_empty(sint_index); + message = ring->data[tail]; + } + + if (!message) { + ring->signal_masked = 0; + /* + * Unmask the signal and sync with hypervisor + * before one last check for any message. + */ + mb(); + message = ring->data[tail]; + + /* + * Ok, lets bail out. + */ + if (!message) + return 0; + } + + ring->signal_masked = 1; + } + + /* + * Clear the message in the ring buffer. + */ + ring->data[tail] = 0; + + if (++tail == HV_SYNIC_EVENT_RING_MESSAGE_COUNT) + tail = 0; + + (*synic_eventring_tail)[sint_index] = tail; + + return message; +} + +static bool +mshv_doorbell_isr(struct hv_message *msg) +{ + struct hv_notification_message_payload *notification; + u32 port; + + if (msg->header.message_type != HVMSG_SYNIC_SINT_INTERCEPT) + return false; + + notification = (struct hv_notification_message_payload *)msg->u.payload; + if (notification->sint_index != HV_SYNIC_DOORBELL_SINT_INDEX) + return false; + + while ((port = synic_event_ring_get_queued_port(HV_SYNIC_DOORBELL_SINT_INDEX))) { + struct port_table_info ptinfo = { 0 }; + + if (mshv_portid_lookup(port, &ptinfo)) { + pr_debug("Failed to get port info from port_table!\n"); + continue; + } + + if (ptinfo.hv_port_type != HV_PORT_TYPE_DOORBELL) { + pr_debug("Not a doorbell port!, port: %d, port_type: %d\n", + port, ptinfo.hv_port_type); + continue; + } + + /* Invoke the callback */ + ptinfo.hv_port_doorbell.doorbell_cb(port, + ptinfo.hv_port_doorbell.data); + } + + return true; +} + +static bool mshv_async_call_completion_isr(struct hv_message *msg) +{ + bool handled = false; + struct hv_async_completion_message_payload *async_msg; + struct mshv_partition *partition; + u64 partition_id; + + if (msg->header.message_type != HVMSG_ASYNC_CALL_COMPLETION) + goto out; + + async_msg = + (struct hv_async_completion_message_payload *)msg->u.payload; + + partition_id = async_msg->partition_id; + + /* + * Hold this lock for the rest of the isr, because the partition could + * be released anytime. + * e.g. the MSHV_RUN_VP thread could wake on another cpu; it could + * release the partition unless we hold this! + */ + rcu_read_lock(); + + partition = mshv_partition_find(partition_id); + + if (unlikely(!partition)) { + pr_debug("failed to find partition %llu\n", partition_id); + goto unlock_out; + } + + partition->async_hypercall_status = async_msg->status; + complete(&partition->async_hypercall); + + handled = true; + +unlock_out: + rcu_read_unlock(); +out: + return handled; +} + +static void kick_vp(struct mshv_vp *vp) +{ + atomic64_inc(&vp->run.vp_signaled_count); + vp->run.kicked_by_hv = 1; + wake_up(&vp->run.vp_suspend_queue); +} + +static void +handle_bitset_message(const struct hv_vp_signal_bitset_scheduler_message *msg) +{ + int bank_idx, vps_signaled = 0, bank_mask_size; + struct mshv_partition *partition; + const struct hv_vpset *vpset; + const u64 *bank_contents; + u64 partition_id = msg->partition_id; + + if (msg->vp_bitset.bitset.format != HV_GENERIC_SET_SPARSE_4K) { + pr_debug("scheduler message format is not HV_GENERIC_SET_SPARSE_4K"); + return; + } + + if (msg->vp_count == 0) { + pr_debug("scheduler message with no VP specified"); + return; + } + + rcu_read_lock(); + + partition = mshv_partition_find(partition_id); + if (unlikely(!partition)) { + pr_debug("failed to find partition %llu\n", partition_id); + goto unlock_out; + } + + vpset = &msg->vp_bitset.bitset; + + bank_idx = -1; + bank_contents = vpset->bank_contents; + bank_mask_size = sizeof(vpset->valid_bank_mask) * BITS_PER_BYTE; + + while (true) { + int vp_bank_idx = -1; + int vp_bank_size = sizeof(*bank_contents) * BITS_PER_BYTE; + int vp_index; + + bank_idx = find_next_bit((unsigned long *)&vpset->valid_bank_mask, + bank_mask_size, bank_idx + 1); + if (bank_idx == bank_mask_size) + break; + + while (true) { + struct mshv_vp *vp; + + vp_bank_idx = find_next_bit((unsigned long *)bank_contents, + vp_bank_size, vp_bank_idx + 1); + if (vp_bank_idx == vp_bank_size) + break; + + vp_index = (bank_idx * vp_bank_size) + vp_bank_idx; + + /* This shouldn't happen, but just in case. */ + if (unlikely(vp_index >= MSHV_MAX_VPS)) { + pr_debug("VP index %u out of bounds\n", + vp_index); + goto unlock_out; + } + + vp = partition->pt_vp_array[vp_index]; + if (unlikely(!vp)) { + pr_debug("failed to find VP %u\n", vp_index); + goto unlock_out; + } + + kick_vp(vp); + vps_signaled++; + } + + bank_contents++; + } + +unlock_out: + rcu_read_unlock(); + + if (vps_signaled != msg->vp_count) + pr_debug("asked to signal %u VPs but only did %u\n", + msg->vp_count, vps_signaled); +} + +static void +handle_pair_message(const struct hv_vp_signal_pair_scheduler_message *msg) +{ + struct mshv_partition *partition = NULL; + struct mshv_vp *vp; + int idx; + + rcu_read_lock(); + + for (idx = 0; idx < msg->vp_count; idx++) { + u64 partition_id = msg->partition_ids[idx]; + u32 vp_index = msg->vp_indexes[idx]; + + if (idx == 0 || partition->pt_id != partition_id) { + partition = mshv_partition_find(partition_id); + if (unlikely(!partition)) { + pr_debug("failed to find partition %llu\n", + partition_id); + break; + } + } + + /* This shouldn't happen, but just in case. */ + if (unlikely(vp_index >= MSHV_MAX_VPS)) { + pr_debug("VP index %u out of bounds\n", vp_index); + break; + } + + vp = partition->pt_vp_array[vp_index]; + if (!vp) { + pr_debug("failed to find VP %u\n", vp_index); + break; + } + + kick_vp(vp); + } + + rcu_read_unlock(); +} + +static bool +mshv_scheduler_isr(struct hv_message *msg) +{ + if (msg->header.message_type != HVMSG_SCHEDULER_VP_SIGNAL_BITSET && + msg->header.message_type != HVMSG_SCHEDULER_VP_SIGNAL_PAIR) + return false; + + if (msg->header.message_type == HVMSG_SCHEDULER_VP_SIGNAL_BITSET) + handle_bitset_message((struct hv_vp_signal_bitset_scheduler_message *) + msg->u.payload); + else + handle_pair_message((struct hv_vp_signal_pair_scheduler_message *) + msg->u.payload); + + return true; +} + +static bool +mshv_intercept_isr(struct hv_message *msg) +{ + struct mshv_partition *partition; + bool handled = false; + struct mshv_vp *vp; + u64 partition_id; + u32 vp_index; + + partition_id = msg->header.sender; + + rcu_read_lock(); + + partition = mshv_partition_find(partition_id); + if (unlikely(!partition)) { + pr_debug("failed to find partition %llu\n", + partition_id); + goto unlock_out; + } + + if (msg->header.message_type == HVMSG_X64_APIC_EOI) { + /* + * Check if this gsi is registered in the + * ack_notifier list and invoke the callback + * if registered. + */ + + /* + * If there is a notifier, the ack callback is supposed + * to handle the VMEXIT. So we need not pass this message + * to vcpu thread. + */ + struct hv_x64_apic_eoi_message *eoi_msg = + (struct hv_x64_apic_eoi_message *)&msg->u.payload[0]; + + if (mshv_notify_acked_gsi(partition, eoi_msg->interrupt_vector)) { + handled = true; + goto unlock_out; + } + } + + /* + * We should get an opaque intercept message here for all intercept + * messages, since we're using the mapped VP intercept message page. + * + * The intercept message will have been placed in intercept message + * page at this point. + * + * Make sure the message type matches our expectation. + */ + if (msg->header.message_type != HVMSG_OPAQUE_INTERCEPT) { + pr_debug("wrong message type %d", msg->header.message_type); + goto unlock_out; + } + + /* + * Since we directly index the vp, and it has to exist for us to be here + * (because the vp is only deleted when the partition is), no additional + * locking is needed here + */ + vp_index = + ((struct hv_opaque_intercept_message *)msg->u.payload)->vp_index; + vp = partition->pt_vp_array[vp_index]; + if (unlikely(!vp)) { + pr_debug("failed to find VP %u\n", vp_index); + goto unlock_out; + } + + kick_vp(vp); + + handled = true; + +unlock_out: + rcu_read_unlock(); + + return handled; +} + +void mshv_isr(void) +{ + struct hv_synic_pages *spages = this_cpu_ptr(synic_pages); + struct hv_message_page **msg_page = &spages->hyp_synic_message_page; + struct hv_message *msg; + bool handled; + + if (unlikely(!(*msg_page))) { + pr_debug("Missing synic page!\n"); + return; + } + + msg = &((*msg_page)->sint_message[HV_SYNIC_INTERCEPTION_SINT_INDEX]); + + /* + * If the type isn't set, there isn't really a message; + * it may be some other hyperv interrupt + */ + if (msg->header.message_type == HVMSG_NONE) + return; + + handled = mshv_doorbell_isr(msg); + + if (!handled) + handled = mshv_scheduler_isr(msg); + + if (!handled) + handled = mshv_async_call_completion_isr(msg); + + if (!handled) + handled = mshv_intercept_isr(msg); + + if (handled) { + /* + * Acknowledge message with hypervisor if another message is + * pending. + */ + msg->header.message_type = HVMSG_NONE; + /* + * Ensure the write is complete so the hypervisor will deliver + * the next message if available. + */ + mb(); + if (msg->header.message_flags.msg_pending) + hv_set_non_nested_msr(HV_MSR_EOM, 0); + } else { + pr_warn_once("%s: unknown message type 0x%x\n", __func__, + msg->header.message_type); + } +} + +static int mshv_synic_cpu_init(unsigned int cpu) +{ + union hv_synic_simp simp; + union hv_synic_siefp siefp; + union hv_synic_sirbp sirbp; + union hv_synic_sint sint; + union hv_synic_scontrol sctrl; + struct hv_synic_pages *spages = this_cpu_ptr(synic_pages); + struct hv_message_page **msg_page = &spages->hyp_synic_message_page; + struct hv_synic_event_flags_page **event_flags_page = + &spages->synic_event_flags_page; + struct hv_synic_event_ring_page **event_ring_page = + &spages->synic_event_ring_page; + + /* Setup the Synic's message page */ + simp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIMP); + simp.simp_enabled = true; + *msg_page = memremap(simp.base_simp_gpa << HV_HYP_PAGE_SHIFT, + HV_HYP_PAGE_SIZE, + MEMREMAP_WB); + + if (!(*msg_page)) + return -EFAULT; + + hv_set_non_nested_msr(HV_MSR_SIMP, simp.as_uint64); + + /* Setup the Synic's event flags page */ + siefp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIEFP); + siefp.siefp_enabled = true; + *event_flags_page = memremap(siefp.base_siefp_gpa << PAGE_SHIFT, + PAGE_SIZE, MEMREMAP_WB); + + if (!(*event_flags_page)) + goto cleanup; + + hv_set_non_nested_msr(HV_MSR_SIEFP, siefp.as_uint64); + + /* Setup the Synic's event ring page */ + sirbp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIRBP); + sirbp.sirbp_enabled = true; + *event_ring_page = memremap(sirbp.base_sirbp_gpa << PAGE_SHIFT, + PAGE_SIZE, MEMREMAP_WB); + + if (!(*event_ring_page)) + goto cleanup; + + hv_set_non_nested_msr(HV_MSR_SIRBP, sirbp.as_uint64); + + if (mshv_sint_irq != -1) + enable_percpu_irq(mshv_sint_irq, 0); + + /* Enable intercepts */ + sint.as_uint64 = 0; + sint.vector = mshv_sint_vector; + sint.masked = false; + sint.auto_eoi = hv_recommend_using_aeoi(); + hv_set_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_INTERCEPTION_SINT_INDEX, + sint.as_uint64); + + /* Doorbell SINT */ + sint.as_uint64 = 0; + sint.vector = mshv_sint_vector; + sint.masked = false; + sint.as_intercept = 1; + sint.auto_eoi = hv_recommend_using_aeoi(); + hv_set_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_DOORBELL_SINT_INDEX, + sint.as_uint64); + + /* Enable global synic bit */ + sctrl.as_uint64 = hv_get_non_nested_msr(HV_MSR_SCONTROL); + sctrl.enable = 1; + hv_set_non_nested_msr(HV_MSR_SCONTROL, sctrl.as_uint64); + + return 0; + +cleanup: + if (*event_ring_page) { + sirbp.sirbp_enabled = false; + hv_set_non_nested_msr(HV_MSR_SIRBP, sirbp.as_uint64); + memunmap(*event_ring_page); + } + if (*event_flags_page) { + siefp.siefp_enabled = false; + hv_set_non_nested_msr(HV_MSR_SIEFP, siefp.as_uint64); + memunmap(*event_flags_page); + } + if (*msg_page) { + simp.simp_enabled = false; + hv_set_non_nested_msr(HV_MSR_SIMP, simp.as_uint64); + memunmap(*msg_page); + } + + return -EFAULT; +} + +static int mshv_synic_cpu_exit(unsigned int cpu) +{ + union hv_synic_sint sint; + union hv_synic_simp simp; + union hv_synic_siefp siefp; + union hv_synic_sirbp sirbp; + union hv_synic_scontrol sctrl; + struct hv_synic_pages *spages = this_cpu_ptr(synic_pages); + struct hv_message_page **msg_page = &spages->hyp_synic_message_page; + struct hv_synic_event_flags_page **event_flags_page = + &spages->synic_event_flags_page; + struct hv_synic_event_ring_page **event_ring_page = + &spages->synic_event_ring_page; + + /* Disable the interrupt */ + sint.as_uint64 = hv_get_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_INTERCEPTION_SINT_INDEX); + sint.masked = true; + hv_set_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_INTERCEPTION_SINT_INDEX, + sint.as_uint64); + + /* Disable Doorbell SINT */ + sint.as_uint64 = hv_get_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_DOORBELL_SINT_INDEX); + sint.masked = true; + hv_set_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_DOORBELL_SINT_INDEX, + sint.as_uint64); + + if (mshv_sint_irq != -1) + disable_percpu_irq(mshv_sint_irq); + + /* Disable Synic's event ring page */ + sirbp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIRBP); + sirbp.sirbp_enabled = false; + hv_set_non_nested_msr(HV_MSR_SIRBP, sirbp.as_uint64); + memunmap(*event_ring_page); + + /* Disable Synic's event flags page */ + siefp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIEFP); + siefp.siefp_enabled = false; + hv_set_non_nested_msr(HV_MSR_SIEFP, siefp.as_uint64); + memunmap(*event_flags_page); + + /* Disable Synic's message page */ + simp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIMP); + simp.simp_enabled = false; + hv_set_non_nested_msr(HV_MSR_SIMP, simp.as_uint64); + memunmap(*msg_page); + + /* Disable global synic bit */ + sctrl.as_uint64 = hv_get_non_nested_msr(HV_MSR_SCONTROL); + sctrl.enable = 0; + hv_set_non_nested_msr(HV_MSR_SCONTROL, sctrl.as_uint64); + + return 0; +} + +int +mshv_register_doorbell(u64 partition_id, doorbell_cb_t doorbell_cb, void *data, + u64 gpa, u64 val, u64 flags) +{ + struct hv_connection_info connection_info = { 0 }; + union hv_connection_id connection_id = { 0 }; + struct port_table_info *port_table_info; + struct hv_port_info port_info = { 0 }; + union hv_port_id port_id = { 0 }; + int ret; + + port_table_info = kmalloc_obj(*port_table_info); + if (!port_table_info) + return -ENOMEM; + + port_table_info->hv_port_type = HV_PORT_TYPE_DOORBELL; + port_table_info->hv_port_doorbell.doorbell_cb = doorbell_cb; + port_table_info->hv_port_doorbell.data = data; + ret = mshv_portid_alloc(port_table_info); + if (ret < 0) { + kfree(port_table_info); + return ret; + } + + port_id.u.id = ret; + port_info.port_type = HV_PORT_TYPE_DOORBELL; + port_info.doorbell_port_info.target_sint = HV_SYNIC_DOORBELL_SINT_INDEX; + port_info.doorbell_port_info.target_vp = HV_ANY_VP; + ret = hv_call_create_port(hv_current_partition_id, port_id, partition_id, + &port_info, + 0, 0, NUMA_NO_NODE); + + if (ret < 0) { + mshv_portid_free(port_id.u.id); + return ret; + } + + connection_id.u.id = port_id.u.id; + connection_info.port_type = HV_PORT_TYPE_DOORBELL; + connection_info.doorbell_connection_info.gpa = gpa; + connection_info.doorbell_connection_info.trigger_value = val; + connection_info.doorbell_connection_info.flags = flags; + + ret = hv_call_connect_port(hv_current_partition_id, port_id, partition_id, + connection_id, &connection_info, 0, NUMA_NO_NODE); + if (ret < 0) { + hv_call_delete_port(hv_current_partition_id, port_id); + mshv_portid_free(port_id.u.id); + return ret; + } + + // lets use the port_id as the doorbell_id + return port_id.u.id; +} + +void +mshv_unregister_doorbell(u64 partition_id, int doorbell_portid) +{ + union hv_port_id port_id = { 0 }; + union hv_connection_id connection_id = { 0 }; + + connection_id.u.id = doorbell_portid; + hv_call_disconnect_port(partition_id, connection_id); + + port_id.u.id = doorbell_portid; + hv_call_delete_port(hv_current_partition_id, port_id); + + mshv_portid_free(doorbell_portid); +} + +static int mshv_synic_reboot_notify(struct notifier_block *nb, + unsigned long code, void *unused) +{ + if (!hv_root_partition()) + return 0; + + cpuhp_remove_state(synic_cpuhp_online); + return 0; +} + +static struct notifier_block mshv_synic_reboot_nb = { + .notifier_call = mshv_synic_reboot_notify, +}; + +#ifndef HYPERVISOR_CALLBACK_VECTOR +static DEFINE_PER_CPU(long, mshv_evt); + +static irqreturn_t mshv_percpu_isr(int irq, void *dev_id) +{ + mshv_isr(); + return IRQ_HANDLED; +} + +#ifdef CONFIG_ACPI +static int __init mshv_acpi_setup_sint_irq(void) +{ + return acpi_register_gsi(NULL, mshv_sint_vector, ACPI_EDGE_SENSITIVE, + ACPI_ACTIVE_HIGH); +} + +static void mshv_acpi_cleanup_sint_irq(void) +{ + acpi_unregister_gsi(mshv_sint_vector); +} +#else +static int __init mshv_acpi_setup_sint_irq(void) +{ + return -ENODEV; +} + +static void mshv_acpi_cleanup_sint_irq(void) +{ +} +#endif + +static int __init mshv_sint_vector_setup(void) +{ + int ret; + struct hv_register_assoc reg = { + .name = HV_ARM64_REGISTER_SINT_RESERVED_INTERRUPT_ID, + }; + union hv_input_vtl input_vtl = { 0 }; + + if (acpi_disabled) + return -ENODEV; + + ret = hv_call_get_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF, + 1, input_vtl, ®); + if (ret || !reg.value.reg64) + return -ENODEV; + + mshv_sint_vector = reg.value.reg64; + ret = mshv_acpi_setup_sint_irq(); + if (ret < 0) { + pr_err("Failed to setup IRQ for MSHV SINT vector %d: %d\n", + mshv_sint_vector, ret); + goto out_fail; + } + + mshv_sint_irq = ret; + + ret = request_percpu_irq(mshv_sint_irq, mshv_percpu_isr, "MSHV", + &mshv_evt); + if (ret) + goto out_unregister; + + return 0; + +out_unregister: + mshv_acpi_cleanup_sint_irq(); +out_fail: + return ret; +} + +static void mshv_sint_vector_cleanup(void) +{ + free_percpu_irq(mshv_sint_irq, &mshv_evt); + mshv_acpi_cleanup_sint_irq(); +} +#else /* !HYPERVISOR_CALLBACK_VECTOR */ +static int __init mshv_sint_vector_setup(void) +{ + mshv_sint_vector = HYPERVISOR_CALLBACK_VECTOR; + return 0; +} + +static void mshv_sint_vector_cleanup(void) +{ +} +#endif /* HYPERVISOR_CALLBACK_VECTOR */ + +int __init mshv_synic_init(struct device *dev) +{ + int ret = 0; + + ret = mshv_sint_vector_setup(); + if (ret) + return ret; + + synic_pages = alloc_percpu(struct hv_synic_pages); + if (!synic_pages) { + dev_err(dev, "Failed to allocate percpu synic page\n"); + ret = -ENOMEM; + goto sint_vector_cleanup; + } + + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_synic", + mshv_synic_cpu_init, + mshv_synic_cpu_exit); + if (ret < 0) { + dev_err(dev, "Failed to setup cpu hotplug state: %i\n", ret); + goto free_synic_pages; + } + + synic_cpuhp_online = ret; + + ret = register_reboot_notifier(&mshv_synic_reboot_nb); + if (ret) + goto remove_cpuhp_state; + + return 0; + +remove_cpuhp_state: + cpuhp_remove_state(synic_cpuhp_online); +free_synic_pages: + free_percpu(synic_pages); +sint_vector_cleanup: + mshv_sint_vector_cleanup(); + return ret; +} + +void mshv_synic_exit(void) +{ + unregister_reboot_notifier(&mshv_synic_reboot_nb); + cpuhp_remove_state(synic_cpuhp_online); + free_percpu(synic_pages); + mshv_sint_vector_cleanup(); +} diff --git a/drivers/hv/mshv_trace.c b/drivers/hv/mshv_trace.c new file mode 100644 index 000000000000..0936b2f95edd --- /dev/null +++ b/drivers/hv/mshv_trace.c @@ -0,0 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2026, Microsoft Corporation. + * + * Tracepoint definitions for mshv driver. + */ + +#define CREATE_TRACE_POINTS +#include "mshv_trace.h" diff --git a/drivers/hv/mshv_trace.h b/drivers/hv/mshv_trace.h new file mode 100644 index 000000000000..e7280c47e579 --- /dev/null +++ b/drivers/hv/mshv_trace.h @@ -0,0 +1,544 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2026, Microsoft Corporation. + * + * Tracepoint declarations for mshv driver. + */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM mshv + +#if !defined(__MSHV_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _MSHV_TRACE_H_ + +#include <linux/tracepoint.h> +#include <hyperv/hvhdk.h> + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH ../../drivers/hv + +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE mshv_trace + +TRACE_EVENT(mshv_create_partition, + TP_PROTO(u64 partition_id, int vm_fd), + TP_ARGS(partition_id, vm_fd), + TP_STRUCT__entry( + __field(u64, partition_id) + __field(int, vm_fd) + ), + TP_fast_assign( + __entry->partition_id = partition_id; + __entry->vm_fd = vm_fd; + ), + TP_printk("partition_id=%llu vm_fd=%d", + __entry->partition_id, + __entry->vm_fd + ) +); + +TRACE_EVENT(mshv_hvcall_create_partition, + TP_PROTO(u64 flags, s64 partition_id), + TP_ARGS(flags, partition_id), + TP_STRUCT__entry( + __field(u64, flags) + __field(s64, partition_id) + ), + TP_fast_assign( + __entry->flags = flags; + __entry->partition_id = partition_id; + ), + TP_printk("flags=%#llx partition_id=%lld", + __entry->flags, + __entry->partition_id + ) +); + +TRACE_EVENT(mshv_hvcall_initialize_partition, + TP_PROTO(u64 partition_id, u64 status), + TP_ARGS(partition_id, status), + TP_STRUCT__entry( + __field(u64, partition_id) + __field(u64, status) + ), + TP_fast_assign( + __entry->partition_id = partition_id; + __entry->status = status; + ), + TP_printk("partition_id=%llu status=%#llx", + __entry->partition_id, + __entry->status + ) +); + +TRACE_EVENT(mshv_partition_release, + TP_PROTO(u64 partition_id), + TP_ARGS(partition_id), + TP_STRUCT__entry( + __field(u64, partition_id) + ), + TP_fast_assign( + __entry->partition_id = partition_id; + ), + TP_printk("partition_id=%llu", + __entry->partition_id + ) +); + +TRACE_EVENT(mshv_destroy_partition, + TP_PROTO(u64 partition_id), + TP_ARGS(partition_id), + TP_STRUCT__entry( + __field(u64, partition_id) + ), + TP_fast_assign( + __entry->partition_id = partition_id; + ), + TP_printk("partition_id=%llu", + __entry->partition_id + ) +); + +TRACE_EVENT(mshv_hvcall_finalize_partition, + TP_PROTO(u64 partition_id, u64 status), + TP_ARGS(partition_id, status), + TP_STRUCT__entry( + __field(u64, partition_id) + __field(u64, status) + ), + TP_fast_assign( + __entry->partition_id = partition_id; + __entry->status = status; + ), + TP_printk("partition_id=%llu status=%#llx ", + __entry->partition_id, + __entry->status + ) +); + +TRACE_EVENT(mshv_hvcall_withdraw_memory, + TP_PROTO(u64 partition_id, u64 withdrawn, u64 status), + TP_ARGS(partition_id, withdrawn, status), + TP_STRUCT__entry( + __field(u64, partition_id) + __field(u64, withdrawn) + __field(u64, status) + ), + TP_fast_assign( + __entry->partition_id = partition_id; + __entry->withdrawn = withdrawn; + __entry->status = status; + ), + TP_printk("partition_id=%llu withdrawn=%llu status=%#llx", + __entry->partition_id, + __entry->withdrawn, + __entry->status + ) +); + +TRACE_EVENT(mshv_hvcall_delete_partition, + TP_PROTO(u64 partition_id, u64 status), + TP_ARGS(partition_id, status), + TP_STRUCT__entry( + __field(u64, partition_id) + __field(u64, status) + ), + TP_fast_assign( + __entry->partition_id = partition_id; + __entry->status = status; + ), + TP_printk("partition_id=%llu status=%#llx", + __entry->partition_id, + __entry->status + ) +); + +TRACE_EVENT(mshv_create_vp, + TP_PROTO(u64 partition_id, u32 vp_index, long vp_fd), + TP_ARGS(partition_id, vp_index, vp_fd), + TP_STRUCT__entry( + __field(u64, partition_id) + __field(u32, vp_index) + __field(long, vp_fd) + ), + TP_fast_assign( + __entry->partition_id = partition_id; + __entry->vp_index = vp_index; + __entry->vp_fd = vp_fd; + ), + TP_printk("partition_id=%llu vp_index=%u vp_fd=%ld", + __entry->partition_id, + __entry->vp_index, + __entry->vp_fd + ) +); + +TRACE_EVENT(mshv_hvcall_map_vp_state_page, + TP_PROTO(u64 partition_id, u32 vp_index, u32 page_type, u64 status), + TP_ARGS(partition_id, vp_index, page_type, status), + TP_STRUCT__entry( + __field(u64, partition_id) + __field(u32, vp_index) + __field(u32, page_type) + __field(u64, status) + ), + TP_fast_assign( + __entry->partition_id = partition_id; + __entry->vp_index = vp_index; + __entry->page_type = page_type; + __entry->status = status; + ), + TP_printk("partition_id=%llu vp_index=%u page_type=%u status=%#llx", + __entry->partition_id, + __entry->vp_index, + __entry->page_type, + __entry->status + ) +); + +TRACE_EVENT(mshv_drain_vp_signals, + TP_PROTO(u64 partition_id, u32 vp_index), + TP_ARGS(partition_id, vp_index), + TP_STRUCT__entry( + __field(u64, partition_id) + __field(u32, vp_index) + ), + TP_fast_assign( + __entry->partition_id = partition_id; + __entry->vp_index = vp_index; + ), + TP_printk("partition_id=%llu vp_index=%u", + __entry->partition_id, + __entry->vp_index + ) +); + +TRACE_EVENT(mshv_disable_vp_dispatch, + TP_PROTO(u64 partition_id, u32 vp_index, int ret), + TP_ARGS(partition_id, vp_index, ret), + TP_STRUCT__entry( + __field(u64, partition_id) + __field(u32, vp_index) + __field(int, ret) + ), + TP_fast_assign( + __entry->partition_id = partition_id; + __entry->vp_index = vp_index; + __entry->ret = ret; + ), + TP_printk("partition_id=%llu vp_index=%u ret=%d", + __entry->partition_id, + __entry->vp_index, + __entry->ret + ) +); + +TRACE_EVENT(mshv_vp_release, + TP_PROTO(u64 partition_id, u32 vp_index), + TP_ARGS(partition_id, vp_index), + TP_STRUCT__entry( + __field(u64, partition_id) + __field(u32, vp_index) + ), + TP_fast_assign( + __entry->partition_id = partition_id; + __entry->vp_index = vp_index; + ), + TP_printk("partition_id=%llu vp_index=%u", + __entry->partition_id, + __entry->vp_index + ) +); + +TRACE_EVENT(mshv_run_vp_entry, + TP_PROTO(u64 partition_id, u32 vp_index), + TP_ARGS(partition_id, vp_index), + TP_STRUCT__entry( + __field(u64, partition_id) + __field(u32, vp_index) + ), + TP_fast_assign( + __entry->partition_id = partition_id; + __entry->vp_index = vp_index; + ), + TP_printk("partition_id=%llu vp_index=%u", + __entry->partition_id, + __entry->vp_index + ) +); + +TRACE_EVENT(mshv_run_vp_exit, + TP_PROTO(u64 partition_id, u32 vp_index, u64 hv_message_type, long ret), + TP_ARGS(partition_id, vp_index, hv_message_type, ret), + TP_STRUCT__entry( + __field(u64, partition_id) + __field(u32, vp_index) + __field(u64, hv_message_type) + __field(long, ret) + ), + TP_fast_assign( + __entry->partition_id = partition_id; + __entry->vp_index = vp_index; + __entry->hv_message_type = hv_message_type; + __entry->ret = ret; + ), + TP_printk("partition_id=%llu vp_index=%u hv_message_type=%#llx ret=%ld", + __entry->partition_id, + __entry->vp_index, + __entry->hv_message_type, + __entry->ret + ) +); + +TRACE_EVENT(mshv_vp_clear_explicit_suspend, + TP_PROTO(u64 partition_id, u32 vp_index, int ret), + TP_ARGS(partition_id, vp_index, ret), + TP_STRUCT__entry( + __field(u64, partition_id) + __field(u32, vp_index) + __field(int, ret) + ), + TP_fast_assign( + __entry->partition_id = partition_id; + __entry->vp_index = vp_index; + __entry->ret = ret; + ), + TP_printk("partition_id=%llu vp_index=%u ret=%d", + __entry->partition_id, + __entry->vp_index, + __entry->ret + ) +); + +TRACE_EVENT(mshv_xfer_to_guest_mode_work, + TP_PROTO(u64 partition_id, u32 vp_index, unsigned long thread_info_flag, long ret), + TP_ARGS(partition_id, vp_index, thread_info_flag, ret), + TP_STRUCT__entry( + __field(u64, partition_id) + __field(u32, vp_index) + __field(unsigned long, thread_info_flag) + __field(long, ret) + ), + TP_fast_assign( + __entry->partition_id = partition_id; + __entry->vp_index = vp_index; + __entry->thread_info_flag = thread_info_flag; + __entry->ret = ret; + ), + TP_printk("partition_id=%llu vp_index=%u thread_info_flag=%#lx ret=%ld", + __entry->partition_id, + __entry->vp_index, + __entry->thread_info_flag, + __entry->ret + ) +); + +TRACE_EVENT(mshv_hvcall_dispatch_vp, + TP_PROTO(u64 partition_id, u32 vp_index, u32 flags, + u32 dispatch_state, u32 dispatch_event, u64 irq_vectors, u64 status), + TP_ARGS(partition_id, vp_index, flags, dispatch_state, dispatch_event, irq_vectors, + status), + TP_STRUCT__entry( + __field(u64, partition_id) + __field(u32, vp_index) + __field(u32, flags) + __field(u32, dispatch_state) + __field(u32, dispatch_event) + __field(u64, irq_vectors) + __field(u64, status) + ), + TP_fast_assign( + __entry->partition_id = partition_id; + __entry->vp_index = vp_index; + __entry->flags = flags; + __entry->dispatch_state = dispatch_state; + __entry->dispatch_event = dispatch_event; + __entry->irq_vectors = irq_vectors; + __entry->status = status; + ), + TP_printk("partition_id=%llu vp_index=%u flags=%#x dispatch_state=%#x dispatch_event=%#x irq_vectors=%#016llx status=%#llx", + __entry->partition_id, + __entry->vp_index, + __entry->flags, + __entry->dispatch_state, + __entry->dispatch_event, + __entry->irq_vectors, + __entry->status + ) +); + +TRACE_EVENT(mshv_update_routing_table, + TP_PROTO(u64 partition_id, void *old, void *new, u32 numents), + TP_ARGS(partition_id, old, new, numents), + TP_STRUCT__entry( + __field(u64, partition_id) + __field(struct mshv_girq_routing_table *, old) + __field(struct mshv_girq_routing_table *, new) + __field(u32, numents) + ), + TP_fast_assign( + __entry->partition_id = partition_id; + __entry->old = old; + __entry->new = new; + __entry->numents = numents; + ), + TP_printk("partition_id=%llu old=%p new=%p numents=%u", + __entry->partition_id, + __entry->old, + __entry->new, + __entry->numents + ) +); + +TRACE_EVENT(mshv_map_user_memory, + TP_PROTO(u64 partition_id, u64 start_uaddr, u64 start_gfn, u64 nr_pages, u32 map_flags, + long ret), + TP_ARGS(partition_id, start_uaddr, start_gfn, nr_pages, map_flags, ret), + TP_STRUCT__entry( + __field(u64, partition_id) + __field(u64, start_uaddr) + __field(u64, start_gfn) + __field(u64, nr_pages) + __field(u32, map_flags) + __field(long, ret) + ), + TP_fast_assign( + __entry->partition_id = partition_id; + __entry->start_uaddr = start_uaddr; + __entry->start_gfn = start_gfn; + __entry->nr_pages = nr_pages; + __entry->map_flags = map_flags; + __entry->ret = ret; + ), + TP_printk("partition_id=%llu start_uaddr=%#llx start_gfn=%#llx nr_pages=%llu map_flags=%#x ret=%ld", + __entry->partition_id, + __entry->start_uaddr, + __entry->start_gfn, + __entry->nr_pages, + __entry->map_flags, + __entry->ret + ) +); + +TRACE_EVENT(mshv_assign_ioeventfd, + TP_PROTO(u64 partition_id, u64 addr, u64 length, u64 datamatch, bool wildcard, + void *eventfd, int ret), + TP_ARGS(partition_id, addr, length, datamatch, wildcard, eventfd, ret), + TP_STRUCT__entry( + __field(u64, partition_id) + __field(u64, addr) + __field(u64, length) + __field(u64, datamatch) + __field(bool, wildcard) + __field(struct eventfd_ctx *, eventfd) + __field(int, ret) + ), + TP_fast_assign( + __entry->partition_id = partition_id; + __entry->addr = addr; + __entry->length = length; + __entry->datamatch = datamatch; + __entry->wildcard = wildcard; + __entry->eventfd = eventfd; + __entry->ret = ret; + ), + TP_printk("partition_id=%llu addr=%#016llx length=%#llx datamatch=%#llx wildcard=%d eventfd=%p ret=%d", + __entry->partition_id, + __entry->addr, + __entry->length, + __entry->datamatch, + __entry->wildcard, + __entry->eventfd, + __entry->ret + ) +); + +TRACE_EVENT(mshv_deassign_ioeventfd, + TP_PROTO(u64 partition_id, u64 addr, u64 length, u64 datamatch, bool wildcard, + void *eventfd), + TP_ARGS(partition_id, addr, length, datamatch, wildcard, eventfd), + TP_STRUCT__entry( + __field(u64, partition_id) + __field(u64, addr) + __field(u64, length) + __field(u64, datamatch) + __field(bool, wildcard) + __field(struct eventfd_ctx *, eventfd) + ), + TP_fast_assign( + __entry->partition_id = partition_id; + __entry->addr = addr; + __entry->length = length; + __entry->datamatch = datamatch; + __entry->wildcard = wildcard; + __entry->eventfd = eventfd; + ), + TP_printk("partition_id=%llu addr=%#016llx length=%#llx datamatch=%#llx wildcard=%d eventfd=%p", + __entry->partition_id, + __entry->addr, + __entry->length, + __entry->datamatch, + __entry->wildcard, + __entry->eventfd + ) +); + +TRACE_EVENT(mshv_vp_wait_for_hv_kick, + TP_PROTO(u64 partition_id, u32 vp_index, bool kicked_by_hv, bool blocked, + bool irq_pending), + TP_ARGS(partition_id, vp_index, kicked_by_hv, blocked, irq_pending), + TP_STRUCT__entry( + __field(u64, partition_id) + __field(u32, vp_index) + __field(bool, kicked_by_hv) + __field(bool, blocked) + __field(bool, irq_pending) + ), + TP_fast_assign( + __entry->partition_id = partition_id; + __entry->vp_index = vp_index; + __entry->kicked_by_hv = kicked_by_hv; + __entry->blocked = blocked; + __entry->irq_pending = irq_pending; + ), + TP_printk("partition_id=%llu vp_index=%u kicked_by_hv=%d blocked=%d irq_pending=%d", + __entry->partition_id, + __entry->vp_index, + __entry->kicked_by_hv, + __entry->blocked, + __entry->irq_pending + ) +); + +TRACE_EVENT(mshv_handle_gpa_intercept, + TP_PROTO(u64 partition_id, u32 vp_index, u64 gfn, u8 access_type, bool handled), + TP_ARGS(partition_id, vp_index, gfn, access_type, handled), + TP_STRUCT__entry( + __field(u64, partition_id) + __field(u32, vp_index) + __field(u64, gfn) + __field(u8, access_type) + __field(bool, handled) + ), + TP_fast_assign( + __entry->partition_id = partition_id; + __entry->vp_index = vp_index; + __entry->gfn = gfn; + __entry->access_type = access_type == HV_INTERCEPT_ACCESS_READ ? 'R' : + (access_type == HV_INTERCEPT_ACCESS_WRITE ? 'W' : + (access_type == HV_INTERCEPT_ACCESS_EXECUTE ? 'X' : '?')); + __entry->handled = handled; + ), + TP_printk("partition_id=%llu vp_index=%u gfn=0x%llx access_type=%c handled=%d", + __entry->partition_id, + __entry->vp_index, + __entry->gfn, + __entry->access_type, + __entry->handled + ) +); + +#endif /* _MSHV_TRACE_H_ */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/drivers/hv/mshv_vtl.h b/drivers/hv/mshv_vtl.h new file mode 100644 index 000000000000..a6eea52f7aa2 --- /dev/null +++ b/drivers/hv/mshv_vtl.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _MSHV_VTL_H +#define _MSHV_VTL_H + +#include <linux/mshv.h> +#include <linux/types.h> + +struct mshv_vtl_run { + u32 cancel; + u32 vtl_ret_action_size; + u32 pad[2]; + char exit_message[MSHV_MAX_RUN_MSG_SIZE]; + union { + struct mshv_vtl_cpu_context cpu_context; + + /* + * Reserving room for the cpu context to grow and to maintain compatibility + * with user mode. + */ + char reserved[1024]; + }; + char vtl_ret_actions[MSHV_MAX_RUN_MSG_SIZE]; +}; + +#endif /* _MSHV_VTL_H */ diff --git a/drivers/hv/mshv_vtl_main.c b/drivers/hv/mshv_vtl_main.c new file mode 100644 index 000000000000..c19400701467 --- /dev/null +++ b/drivers/hv/mshv_vtl_main.c @@ -0,0 +1,1399 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2023, Microsoft Corporation. + * + * Author: + * Roman Kisel <romank@linux.microsoft.com> + * Saurabh Sengar <ssengar@linux.microsoft.com> + * Naman Jain <namjain@linux.microsoft.com> + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/miscdevice.h> +#include <linux/anon_inodes.h> +#include <linux/cpuhotplug.h> +#include <linux/count_zeros.h> +#include <linux/entry-virt.h> +#include <linux/eventfd.h> +#include <linux/poll.h> +#include <linux/file.h> +#include <linux/vmalloc.h> +#include <asm/debugreg.h> +#include <asm/mshyperv.h> +#include <trace/events/ipi.h> +#include <uapi/asm/mtrr.h> +#include <uapi/linux/mshv.h> +#include <hyperv/hvhdk.h> + +#include "../../kernel/fpu/legacy.h" +#include "mshv.h" +#include "mshv_vtl.h" +#include "hyperv_vmbus.h" + +MODULE_AUTHOR("Microsoft"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Microsoft Hyper-V VTL Driver"); + +#define MSHV_ENTRY_REASON_LOWER_VTL_CALL 0x1 +#define MSHV_ENTRY_REASON_INTERRUPT 0x2 +#define MSHV_ENTRY_REASON_INTERCEPT 0x3 + +#define MSHV_REAL_OFF_SHIFT 16 +#define MSHV_PG_OFF_CPU_MASK (BIT_ULL(MSHV_REAL_OFF_SHIFT) - 1) +#define MSHV_RUN_PAGE_OFFSET 0 +#define MSHV_REG_PAGE_OFFSET 1 +#define VTL2_VMBUS_SINT_INDEX 7 + +static struct device *mem_dev; + +static struct tasklet_struct msg_dpc; +static wait_queue_head_t fd_wait_queue; +static bool has_message; +static struct eventfd_ctx *flag_eventfds[HV_EVENT_FLAGS_COUNT]; +static DEFINE_MUTEX(flag_lock); +static bool __read_mostly mshv_has_reg_page; + +/* hvcall code is of type u16, allocate a bitmap of size (1 << 16) to accommodate it */ +#define MAX_BITMAP_SIZE ((U16_MAX + 1) / 8) + +struct mshv_vtl_hvcall_fd { + u8 allow_bitmap[MAX_BITMAP_SIZE]; + bool allow_map_initialized; + /* + * Used to protect hvcall setup in IOCTLs + */ + struct mutex init_mutex; + struct miscdevice *dev; +}; + +struct mshv_vtl_poll_file { + struct file *file; + wait_queue_entry_t wait; + wait_queue_head_t *wqh; + poll_table pt; + int cpu; +}; + +struct mshv_vtl { + struct device *module_dev; + u64 id; +}; + +struct mshv_vtl_per_cpu { + struct mshv_vtl_run *run; + struct page *reg_page; +}; + +/* SYNIC_OVERLAY_PAGE_MSR - internal, identical to hv_synic_simp */ +union hv_synic_overlay_page_msr { + u64 as_uint64; + struct { + u64 enabled: 1; + u64 reserved: 11; + u64 pfn: 52; + } __packed; +}; + +static struct mutex mshv_vtl_poll_file_lock; +static union hv_register_vsm_page_offsets mshv_vsm_page_offsets; +static union hv_register_vsm_capabilities mshv_vsm_capabilities; + +static DEFINE_PER_CPU(struct mshv_vtl_poll_file, mshv_vtl_poll_file); +static DEFINE_PER_CPU(unsigned long long, num_vtl0_transitions); +static DEFINE_PER_CPU(struct mshv_vtl_per_cpu, mshv_vtl_per_cpu); + +static const union hv_input_vtl input_vtl_zero; +static const union hv_input_vtl input_vtl_normal = { + .use_target_vtl = 1, +}; + +static const struct file_operations mshv_vtl_fops; + +static long +mshv_ioctl_create_vtl(void __user *user_arg, struct device *module_dev) +{ + struct mshv_vtl *vtl; + struct file *file; + int fd; + + vtl = kzalloc_obj(*vtl); + if (!vtl) + return -ENOMEM; + + fd = get_unused_fd_flags(O_CLOEXEC); + if (fd < 0) { + kfree(vtl); + return fd; + } + file = anon_inode_getfile("mshv_vtl", &mshv_vtl_fops, + vtl, O_RDWR); + if (IS_ERR(file)) { + kfree(vtl); + return PTR_ERR(file); + } + vtl->module_dev = module_dev; + fd_install(fd, file); + + return fd; +} + +static long +mshv_ioctl_check_extension(void __user *user_arg) +{ + u32 arg; + + if (copy_from_user(&arg, user_arg, sizeof(arg))) + return -EFAULT; + + switch (arg) { + case MSHV_CAP_CORE_API_STABLE: + return 0; + case MSHV_CAP_REGISTER_PAGE: + return mshv_has_reg_page; + case MSHV_CAP_VTL_RETURN_ACTION: + return mshv_vsm_capabilities.return_action_available; + case MSHV_CAP_DR6_SHARED: + return mshv_vsm_capabilities.dr6_shared; + } + + return -EOPNOTSUPP; +} + +static long +mshv_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) +{ + struct miscdevice *misc = filp->private_data; + + switch (ioctl) { + case MSHV_CHECK_EXTENSION: + return mshv_ioctl_check_extension((void __user *)arg); + case MSHV_CREATE_VTL: + return mshv_ioctl_create_vtl((void __user *)arg, misc->this_device); + } + + return -ENOTTY; +} + +static const struct file_operations mshv_dev_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = mshv_dev_ioctl, + .llseek = noop_llseek, +}; + +static struct miscdevice mshv_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "mshv", + .fops = &mshv_dev_fops, + .mode = 0600, +}; + +static struct mshv_vtl_run *mshv_vtl_this_run(void) +{ + return *this_cpu_ptr(&mshv_vtl_per_cpu.run); +} + +static struct mshv_vtl_run *mshv_vtl_cpu_run(int cpu) +{ + return *per_cpu_ptr(&mshv_vtl_per_cpu.run, cpu); +} + +static struct page *mshv_vtl_cpu_reg_page(int cpu) +{ + return *per_cpu_ptr(&mshv_vtl_per_cpu.reg_page, cpu); +} + +static void mshv_vtl_configure_reg_page(struct mshv_vtl_per_cpu *per_cpu) +{ + struct hv_register_assoc reg_assoc = {}; + union hv_synic_overlay_page_msr overlay = {}; + struct page *reg_page; + + reg_page = alloc_page(GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL); + if (!reg_page) { + WARN(1, "failed to allocate register page\n"); + return; + } + + overlay.enabled = 1; + overlay.pfn = page_to_hvpfn(reg_page); + reg_assoc.name = HV_X64_REGISTER_REG_PAGE; + reg_assoc.value.reg64 = overlay.as_uint64; + + if (hv_call_set_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF, + 1, input_vtl_zero, ®_assoc)) { + WARN(1, "failed to setup register page\n"); + __free_page(reg_page); + return; + } + + per_cpu->reg_page = reg_page; + mshv_has_reg_page = true; +} + +static void mshv_vtl_synic_enable_regs(unsigned int cpu) +{ + union hv_synic_sint sint; + + sint.as_uint64 = 0; + sint.vector = HYPERVISOR_CALLBACK_VECTOR; + sint.masked = false; + sint.auto_eoi = hv_recommend_using_aeoi(); + + /* Enable intercepts */ + if (!mshv_vsm_capabilities.intercept_page_available) + hv_set_msr(HV_MSR_SINT0 + HV_SYNIC_INTERCEPTION_SINT_INDEX, + sint.as_uint64); + + /* VTL2 Host VSP SINT is (un)masked when the user mode requests that */ +} + +static int mshv_vtl_get_vsm_regs(void) +{ + struct hv_register_assoc registers[2]; + int ret, count = 2; + + registers[0].name = HV_REGISTER_VSM_CODE_PAGE_OFFSETS; + registers[1].name = HV_REGISTER_VSM_CAPABILITIES; + + ret = hv_call_get_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF, + count, input_vtl_zero, registers); + if (ret) + return ret; + + mshv_vsm_page_offsets.as_uint64 = registers[0].value.reg64; + mshv_vsm_capabilities.as_uint64 = registers[1].value.reg64; + + return ret; +} + +static int mshv_vtl_configure_vsm_partition(struct device *dev) +{ + union hv_register_vsm_partition_config config; + struct hv_register_assoc reg_assoc; + + config.as_uint64 = 0; + config.default_vtl_protection_mask = HV_MAP_GPA_PERMISSIONS_MASK; + config.enable_vtl_protection = 1; + config.zero_memory_on_reset = 1; + config.intercept_vp_startup = 1; + config.intercept_cpuid_unimplemented = 1; + + if (mshv_vsm_capabilities.intercept_page_available) { + dev_dbg(dev, "using intercept page\n"); + config.intercept_page = 1; + } + + reg_assoc.name = HV_REGISTER_VSM_PARTITION_CONFIG; + reg_assoc.value.reg64 = config.as_uint64; + + return hv_call_set_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF, + 1, input_vtl_zero, ®_assoc); +} + +static void mshv_vtl_vmbus_isr(void) +{ + struct hv_per_cpu_context *per_cpu; + struct hv_message *msg; + u32 message_type; + union hv_synic_event_flags *event_flags; + struct eventfd_ctx *eventfd; + u16 i; + + per_cpu = this_cpu_ptr(hv_context.cpu_context); + if (smp_processor_id() == 0) { + msg = (struct hv_message *)per_cpu->hyp_synic_message_page + VTL2_VMBUS_SINT_INDEX; + message_type = READ_ONCE(msg->header.message_type); + if (message_type != HVMSG_NONE) + tasklet_schedule(&msg_dpc); + } + + event_flags = (union hv_synic_event_flags *)per_cpu->hyp_synic_event_page + + VTL2_VMBUS_SINT_INDEX; + for_each_set_bit(i, event_flags->flags, HV_EVENT_FLAGS_COUNT) { + if (!sync_test_and_clear_bit(i, event_flags->flags)) + continue; + rcu_read_lock(); + eventfd = READ_ONCE(flag_eventfds[i]); + if (eventfd) + eventfd_signal(eventfd); + rcu_read_unlock(); + } + + vmbus_isr(); +} + +static int mshv_vtl_alloc_context(unsigned int cpu) +{ + struct mshv_vtl_per_cpu *per_cpu = this_cpu_ptr(&mshv_vtl_per_cpu); + + per_cpu->run = (struct mshv_vtl_run *)__get_free_page(GFP_KERNEL | __GFP_ZERO); + if (!per_cpu->run) + return -ENOMEM; + + if (mshv_vsm_capabilities.intercept_page_available) + mshv_vtl_configure_reg_page(per_cpu); + + mshv_vtl_synic_enable_regs(cpu); + + return 0; +} + +static int mshv_vtl_cpuhp_online; + +static int hv_vtl_setup_synic(void) +{ + int ret; + + /* Use our isr to first filter out packets destined for userspace */ + hv_setup_vmbus_handler(mshv_vtl_vmbus_isr); + + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "hyperv/vtl:online", + mshv_vtl_alloc_context, NULL); + if (ret < 0) { + hv_setup_vmbus_handler(vmbus_isr); + return ret; + } + + mshv_vtl_cpuhp_online = ret; + + return 0; +} + +static void hv_vtl_remove_synic(void) +{ + cpuhp_remove_state(mshv_vtl_cpuhp_online); + hv_setup_vmbus_handler(vmbus_isr); +} + +static int vtl_get_vp_register(struct hv_register_assoc *reg) +{ + return hv_call_get_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF, + 1, input_vtl_normal, reg); +} + +static int vtl_set_vp_register(struct hv_register_assoc *reg) +{ + return hv_call_set_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF, + 1, input_vtl_normal, reg); +} + +static int mshv_vtl_ioctl_add_vtl0_mem(struct mshv_vtl *vtl, void __user *arg) +{ + struct mshv_vtl_ram_disposition vtl0_mem; + struct dev_pagemap *pgmap; + void *addr; + + if (copy_from_user(&vtl0_mem, arg, sizeof(vtl0_mem))) + return -EFAULT; + if (vtl0_mem.last_pfn <= vtl0_mem.start_pfn) { + dev_err(vtl->module_dev, "range start pfn (%llx) > end pfn (%llx)\n", + vtl0_mem.start_pfn, vtl0_mem.last_pfn); + return -EFAULT; + } + + pgmap = kzalloc_obj(*pgmap); + if (!pgmap) + return -ENOMEM; + + /* + * vtl0_mem.last_pfn is excluded in the pagemap range for VTL0 as per design. + * last_pfn is not reserved or wasted, and reflects 'start_pfn + size' of pagemap range. + */ + pgmap->ranges[0].start = PFN_PHYS(vtl0_mem.start_pfn); + pgmap->ranges[0].end = PFN_PHYS(vtl0_mem.last_pfn) - 1; + pgmap->nr_range = 1; + pgmap->type = MEMORY_DEVICE_GENERIC; + + /* + * Determine the highest page order that can be used for the given memory range. + * This works best when the range is aligned; i.e. both the start and the length. + * Clamp to MAX_FOLIO_ORDER to avoid a WARN in memremap_pages() when the range + * alignment exceeds the maximum supported folio order for this kernel config. + */ + pgmap->vmemmap_shift = min(count_trailing_zeros(vtl0_mem.start_pfn | vtl0_mem.last_pfn), + MAX_FOLIO_ORDER); + dev_dbg(vtl->module_dev, + "Add VTL0 memory: start: 0x%llx, end_pfn: 0x%llx, page order: %lu\n", + vtl0_mem.start_pfn, vtl0_mem.last_pfn, pgmap->vmemmap_shift); + + addr = devm_memremap_pages(mem_dev, pgmap); + if (IS_ERR(addr)) { + dev_err(vtl->module_dev, "devm_memremap_pages error: %ld\n", PTR_ERR(addr)); + kfree(pgmap); + return PTR_ERR(addr); + } + + /* Don't free pgmap, since it has to stick around until the memory + * is unmapped, which will never happen as there is no scenario + * where VTL0 can be released/shutdown without bringing down VTL2. + */ + return 0; +} + +static void mshv_vtl_cancel(int cpu) +{ + int here = get_cpu(); + + if (here != cpu) { + if (!xchg_relaxed(&mshv_vtl_cpu_run(cpu)->cancel, 1)) + smp_send_reschedule(cpu); + } else { + WRITE_ONCE(mshv_vtl_this_run()->cancel, 1); + } + put_cpu(); +} + +static int mshv_vtl_poll_file_wake(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key) +{ + struct mshv_vtl_poll_file *poll_file = container_of(wait, struct mshv_vtl_poll_file, wait); + + mshv_vtl_cancel(poll_file->cpu); + + return 0; +} + +static void mshv_vtl_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh, poll_table *pt) +{ + struct mshv_vtl_poll_file *poll_file = container_of(pt, struct mshv_vtl_poll_file, pt); + + WARN_ON(poll_file->wqh); + poll_file->wqh = wqh; + add_wait_queue(wqh, &poll_file->wait); +} + +static int mshv_vtl_ioctl_set_poll_file(struct mshv_vtl_set_poll_file __user *user_input) +{ + struct file *file, *old_file; + struct mshv_vtl_poll_file *poll_file; + struct mshv_vtl_set_poll_file input; + + if (copy_from_user(&input, user_input, sizeof(input))) + return -EFAULT; + + if (input.cpu >= num_possible_cpus() || !cpu_online(input.cpu)) + return -EINVAL; + /* + * CPU Hotplug is not supported in VTL2 in OpenHCL, where this kernel driver exists. + * CPU is expected to remain online after above cpu_online() check. + */ + + file = NULL; + file = fget(input.fd); + if (!file) + return -EBADFD; + + poll_file = per_cpu_ptr(&mshv_vtl_poll_file, READ_ONCE(input.cpu)); + if (!poll_file) + return -EINVAL; + + mutex_lock(&mshv_vtl_poll_file_lock); + + if (poll_file->wqh) + remove_wait_queue(poll_file->wqh, &poll_file->wait); + poll_file->wqh = NULL; + + old_file = poll_file->file; + poll_file->file = file; + poll_file->cpu = input.cpu; + + if (file) { + init_waitqueue_func_entry(&poll_file->wait, mshv_vtl_poll_file_wake); + init_poll_funcptr(&poll_file->pt, mshv_vtl_ptable_queue_proc); + vfs_poll(file, &poll_file->pt); + } + + mutex_unlock(&mshv_vtl_poll_file_lock); + + if (old_file) + fput(old_file); + + return 0; +} + +/* Static table mapping register names to their corresponding actions */ +static const struct { + enum hv_register_name reg_name; + int debug_reg_num; /* -1 if not a debug register */ + u32 msr_addr; /* 0 if not an MSR */ +} reg_table[] = { + /* Debug registers */ + {HV_X64_REGISTER_DR0, 0, 0}, + {HV_X64_REGISTER_DR1, 1, 0}, + {HV_X64_REGISTER_DR2, 2, 0}, + {HV_X64_REGISTER_DR3, 3, 0}, + {HV_X64_REGISTER_DR6, 6, 0}, + /* MTRR MSRs */ + {HV_X64_REGISTER_MSR_MTRR_CAP, -1, MSR_MTRRcap}, + {HV_X64_REGISTER_MSR_MTRR_DEF_TYPE, -1, MSR_MTRRdefType}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE0, -1, MTRRphysBase_MSR(0)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE1, -1, MTRRphysBase_MSR(1)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE2, -1, MTRRphysBase_MSR(2)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE3, -1, MTRRphysBase_MSR(3)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE4, -1, MTRRphysBase_MSR(4)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE5, -1, MTRRphysBase_MSR(5)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE6, -1, MTRRphysBase_MSR(6)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE7, -1, MTRRphysBase_MSR(7)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE8, -1, MTRRphysBase_MSR(8)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE9, -1, MTRRphysBase_MSR(9)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASEA, -1, MTRRphysBase_MSR(0xa)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASEB, -1, MTRRphysBase_MSR(0xb)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASEC, -1, MTRRphysBase_MSR(0xc)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASED, -1, MTRRphysBase_MSR(0xd)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASEE, -1, MTRRphysBase_MSR(0xe)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASEF, -1, MTRRphysBase_MSR(0xf)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK0, -1, MTRRphysMask_MSR(0)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK1, -1, MTRRphysMask_MSR(1)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK2, -1, MTRRphysMask_MSR(2)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK3, -1, MTRRphysMask_MSR(3)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK4, -1, MTRRphysMask_MSR(4)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK5, -1, MTRRphysMask_MSR(5)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK6, -1, MTRRphysMask_MSR(6)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK7, -1, MTRRphysMask_MSR(7)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK8, -1, MTRRphysMask_MSR(8)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK9, -1, MTRRphysMask_MSR(9)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASKA, -1, MTRRphysMask_MSR(0xa)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASKB, -1, MTRRphysMask_MSR(0xb)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASKC, -1, MTRRphysMask_MSR(0xc)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASKD, -1, MTRRphysMask_MSR(0xd)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASKE, -1, MTRRphysMask_MSR(0xe)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASKF, -1, MTRRphysMask_MSR(0xf)}, + {HV_X64_REGISTER_MSR_MTRR_FIX64K00000, -1, MSR_MTRRfix64K_00000}, + {HV_X64_REGISTER_MSR_MTRR_FIX16K80000, -1, MSR_MTRRfix16K_80000}, + {HV_X64_REGISTER_MSR_MTRR_FIX16KA0000, -1, MSR_MTRRfix16K_A0000}, + {HV_X64_REGISTER_MSR_MTRR_FIX4KC0000, -1, MSR_MTRRfix4K_C0000}, + {HV_X64_REGISTER_MSR_MTRR_FIX4KC8000, -1, MSR_MTRRfix4K_C8000}, + {HV_X64_REGISTER_MSR_MTRR_FIX4KD0000, -1, MSR_MTRRfix4K_D0000}, + {HV_X64_REGISTER_MSR_MTRR_FIX4KD8000, -1, MSR_MTRRfix4K_D8000}, + {HV_X64_REGISTER_MSR_MTRR_FIX4KE0000, -1, MSR_MTRRfix4K_E0000}, + {HV_X64_REGISTER_MSR_MTRR_FIX4KE8000, -1, MSR_MTRRfix4K_E8000}, + {HV_X64_REGISTER_MSR_MTRR_FIX4KF0000, -1, MSR_MTRRfix4K_F0000}, + {HV_X64_REGISTER_MSR_MTRR_FIX4KF8000, -1, MSR_MTRRfix4K_F8000}, +}; + +static int mshv_vtl_get_set_reg(struct hv_register_assoc *regs, bool set) +{ + u64 *reg64; + enum hv_register_name gpr_name; + int i; + + gpr_name = regs->name; + reg64 = ®s->value.reg64; + + /* Search for the register in the table */ + for (i = 0; i < ARRAY_SIZE(reg_table); i++) { + if (reg_table[i].reg_name != gpr_name) + continue; + if (reg_table[i].debug_reg_num != -1) { + /* Handle debug registers */ + if (gpr_name == HV_X64_REGISTER_DR6 && + !mshv_vsm_capabilities.dr6_shared) + goto hypercall; + if (set) + native_set_debugreg(reg_table[i].debug_reg_num, *reg64); + else + *reg64 = native_get_debugreg(reg_table[i].debug_reg_num); + } else { + /* Handle MSRs */ + if (set) + wrmsrl(reg_table[i].msr_addr, *reg64); + else + rdmsrl(reg_table[i].msr_addr, *reg64); + } + return 0; + } + +hypercall: + return 1; +} + +static void mshv_vtl_return(struct mshv_vtl_cpu_context *vtl0) +{ + struct hv_vp_assist_page *hvp; + + hvp = hv_vp_assist_page[smp_processor_id()]; + + /* + * Process signal event direct set in the run page, if any. + */ + if (mshv_vsm_capabilities.return_action_available) { + u32 offset = READ_ONCE(mshv_vtl_this_run()->vtl_ret_action_size); + + WRITE_ONCE(mshv_vtl_this_run()->vtl_ret_action_size, 0); + + /* + * Hypervisor will take care of clearing out the actions + * set in the assist page. + */ + memcpy(hvp->vtl_ret_actions, + mshv_vtl_this_run()->vtl_ret_actions, + min_t(u32, offset, sizeof(hvp->vtl_ret_actions))); + } + + mshv_vtl_return_call(vtl0); +} + +static bool mshv_vtl_process_intercept(void) +{ + struct hv_per_cpu_context *mshv_cpu; + void *synic_message_page; + struct hv_message *msg; + u32 message_type; + + mshv_cpu = this_cpu_ptr(hv_context.cpu_context); + synic_message_page = mshv_cpu->hyp_synic_message_page; + if (unlikely(!synic_message_page)) + return true; + + msg = (struct hv_message *)synic_message_page + HV_SYNIC_INTERCEPTION_SINT_INDEX; + message_type = READ_ONCE(msg->header.message_type); + if (message_type == HVMSG_NONE) + return true; + + memcpy(mshv_vtl_this_run()->exit_message, msg, sizeof(*msg)); + vmbus_signal_eom(msg, message_type); + + return false; +} + +static int mshv_vtl_ioctl_return_to_lower_vtl(void) +{ + preempt_disable(); + for (;;) { + unsigned long irq_flags; + struct hv_vp_assist_page *hvp; + int ret; + + if (__xfer_to_guest_mode_work_pending()) { + preempt_enable(); + ret = xfer_to_guest_mode_handle_work(); + if (ret) + return ret; + preempt_disable(); + } + + local_irq_save(irq_flags); + if (READ_ONCE(mshv_vtl_this_run()->cancel)) { + local_irq_restore(irq_flags); + preempt_enable(); + return -EINTR; + } + + mshv_vtl_return(&mshv_vtl_this_run()->cpu_context); + local_irq_restore(irq_flags); + + hvp = hv_vp_assist_page[smp_processor_id()]; + this_cpu_inc(num_vtl0_transitions); + switch (hvp->vtl_entry_reason) { + case MSHV_ENTRY_REASON_INTERRUPT: + if (!mshv_vsm_capabilities.intercept_page_available && + likely(!mshv_vtl_process_intercept())) + goto done; + break; + + case MSHV_ENTRY_REASON_INTERCEPT: + WARN_ON(!mshv_vsm_capabilities.intercept_page_available); + memcpy(mshv_vtl_this_run()->exit_message, hvp->intercept_message, + sizeof(hvp->intercept_message)); + goto done; + + default: + panic("unknown entry reason: %d", hvp->vtl_entry_reason); + } + } + +done: + preempt_enable(); + + return 0; +} + +static long +mshv_vtl_ioctl_get_regs(void __user *user_args) +{ + struct mshv_vp_registers args; + struct hv_register_assoc reg; + long ret; + + if (copy_from_user(&args, user_args, sizeof(args))) + return -EFAULT; + + /* This IOCTL supports processing only one register at a time. */ + if (args.count != 1) + return -EINVAL; + + if (copy_from_user(®, (void __user *)args.regs_ptr, + sizeof(reg))) + return -EFAULT; + + ret = mshv_vtl_get_set_reg(®, false); + if (!ret) + goto copy_args; /* No need of hypercall */ + ret = vtl_get_vp_register(®); + if (ret) + return ret; + +copy_args: + if (copy_to_user((void __user *)args.regs_ptr, ®, sizeof(reg))) + ret = -EFAULT; + + return ret; +} + +static long +mshv_vtl_ioctl_set_regs(void __user *user_args) +{ + struct mshv_vp_registers args; + struct hv_register_assoc reg; + long ret; + + if (copy_from_user(&args, user_args, sizeof(args))) + return -EFAULT; + + /* This IOCTL supports processing only one register at a time. */ + if (args.count != 1) + return -EINVAL; + + if (copy_from_user(®, (void __user *)args.regs_ptr, sizeof(reg))) + return -EFAULT; + + ret = mshv_vtl_get_set_reg(®, true); + if (!ret) + return ret; /* No need of hypercall */ + ret = vtl_set_vp_register(®); + + return ret; +} + +static long +mshv_vtl_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) +{ + long ret; + struct mshv_vtl *vtl = filp->private_data; + + switch (ioctl) { + case MSHV_SET_POLL_FILE: + ret = mshv_vtl_ioctl_set_poll_file((struct mshv_vtl_set_poll_file __user *)arg); + break; + case MSHV_GET_VP_REGISTERS: + ret = mshv_vtl_ioctl_get_regs((void __user *)arg); + break; + case MSHV_SET_VP_REGISTERS: + ret = mshv_vtl_ioctl_set_regs((void __user *)arg); + break; + case MSHV_RETURN_TO_LOWER_VTL: + ret = mshv_vtl_ioctl_return_to_lower_vtl(); + break; + case MSHV_ADD_VTL0_MEMORY: + ret = mshv_vtl_ioctl_add_vtl0_mem(vtl, (void __user *)arg); + break; + default: + dev_err(vtl->module_dev, "invalid vtl ioctl: %#x\n", ioctl); + ret = -ENOTTY; + } + + return ret; +} + +static vm_fault_t mshv_vtl_fault(struct vm_fault *vmf) +{ + struct page *page; + int cpu = vmf->pgoff & MSHV_PG_OFF_CPU_MASK; + int real_off = vmf->pgoff >> MSHV_REAL_OFF_SHIFT; + + if (!cpu_online(cpu)) + return VM_FAULT_SIGBUS; + /* + * CPU Hotplug is not supported in VTL2 in OpenHCL, where this kernel driver exists. + * CPU is expected to remain online after above cpu_online() check. + */ + + if (real_off == MSHV_RUN_PAGE_OFFSET) { + page = virt_to_page(mshv_vtl_cpu_run(cpu)); + } else if (real_off == MSHV_REG_PAGE_OFFSET) { + if (!mshv_has_reg_page) + return VM_FAULT_SIGBUS; + page = mshv_vtl_cpu_reg_page(cpu); + } else { + return VM_FAULT_NOPAGE; + } + + get_page(page); + vmf->page = page; + + return 0; +} + +static const struct vm_operations_struct mshv_vtl_vm_ops = { + .fault = mshv_vtl_fault, +}; + +static int mshv_vtl_mmap(struct file *filp, struct vm_area_struct *vma) +{ + vma->vm_ops = &mshv_vtl_vm_ops; + + return 0; +} + +static int mshv_vtl_release(struct inode *inode, struct file *filp) +{ + struct mshv_vtl *vtl = filp->private_data; + + kfree(vtl); + + return 0; +} + +static const struct file_operations mshv_vtl_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = mshv_vtl_ioctl, + .release = mshv_vtl_release, + .mmap = mshv_vtl_mmap, +}; + +static void mshv_vtl_synic_mask_vmbus_sint(void *info) +{ + union hv_synic_sint sint; + const u8 *mask = info; + + sint.as_uint64 = 0; + sint.vector = HYPERVISOR_CALLBACK_VECTOR; + sint.masked = (*mask != 0); + sint.auto_eoi = hv_recommend_using_aeoi(); + + hv_set_msr(HV_MSR_SINT0 + VTL2_VMBUS_SINT_INDEX, + sint.as_uint64); + + if (!sint.masked) + pr_debug("%s: Unmasking VTL2 VMBUS SINT on VP %d\n", __func__, smp_processor_id()); + else + pr_debug("%s: Masking VTL2 VMBUS SINT on VP %d\n", __func__, smp_processor_id()); +} + +static void mshv_vtl_read_remote(void *buffer) +{ + struct hv_per_cpu_context *mshv_cpu = this_cpu_ptr(hv_context.cpu_context); + struct hv_message *msg = (struct hv_message *)mshv_cpu->hyp_synic_message_page + + VTL2_VMBUS_SINT_INDEX; + u32 message_type = READ_ONCE(msg->header.message_type); + + WRITE_ONCE(has_message, false); + if (message_type == HVMSG_NONE) + return; + + memcpy(buffer, msg, sizeof(*msg)); + vmbus_signal_eom(msg, message_type); +} + +static bool vtl_synic_mask_vmbus_sint_masked = true; + +static ssize_t mshv_vtl_sint_read(struct file *filp, char __user *arg, size_t size, loff_t *offset) +{ + struct hv_message msg = {}; + int ret; + + if (size < sizeof(msg)) + return -EINVAL; + + for (;;) { + smp_call_function_single(VMBUS_CONNECT_CPU, mshv_vtl_read_remote, &msg, true); + if (msg.header.message_type != HVMSG_NONE) + break; + + if (READ_ONCE(vtl_synic_mask_vmbus_sint_masked)) + return 0; /* EOF */ + + if (filp->f_flags & O_NONBLOCK) + return -EAGAIN; + + ret = wait_event_interruptible(fd_wait_queue, + READ_ONCE(has_message) || + READ_ONCE(vtl_synic_mask_vmbus_sint_masked)); + if (ret) + return ret; + } + + if (copy_to_user(arg, &msg, sizeof(msg))) + return -EFAULT; + + return sizeof(msg); +} + +static __poll_t mshv_vtl_sint_poll(struct file *filp, poll_table *wait) +{ + __poll_t mask = 0; + + poll_wait(filp, &fd_wait_queue, wait); + if (READ_ONCE(has_message) || READ_ONCE(vtl_synic_mask_vmbus_sint_masked)) + mask |= EPOLLIN | EPOLLRDNORM; + + return mask; +} + +static void mshv_vtl_sint_on_msg_dpc(unsigned long data) +{ + WRITE_ONCE(has_message, true); + wake_up_interruptible_poll(&fd_wait_queue, EPOLLIN); +} + +static int mshv_vtl_sint_ioctl_post_msg(struct mshv_vtl_sint_post_msg __user *arg) +{ + struct mshv_vtl_sint_post_msg message; + u8 payload[HV_MESSAGE_PAYLOAD_BYTE_COUNT]; + + if (copy_from_user(&message, arg, sizeof(message))) + return -EFAULT; + if (message.payload_size > HV_MESSAGE_PAYLOAD_BYTE_COUNT) + return -EINVAL; + if (copy_from_user(payload, (void __user *)message.payload_ptr, + message.payload_size)) + return -EFAULT; + + return hv_post_message((union hv_connection_id)message.connection_id, + message.message_type, (void *)payload, + message.payload_size); +} + +static int mshv_vtl_sint_ioctl_signal_event(struct mshv_vtl_signal_event __user *arg) +{ + u64 input, status; + struct mshv_vtl_signal_event signal_event; + + if (copy_from_user(&signal_event, arg, sizeof(signal_event))) + return -EFAULT; + + input = signal_event.connection_id | ((u64)signal_event.flag << 32); + + status = hv_do_fast_hypercall8(HVCALL_SIGNAL_EVENT, input); + + return hv_result_to_errno(status); +} + +static int mshv_vtl_sint_ioctl_set_eventfd(struct mshv_vtl_set_eventfd __user *arg) +{ + struct mshv_vtl_set_eventfd set_eventfd; + struct eventfd_ctx *eventfd, *old_eventfd; + + if (copy_from_user(&set_eventfd, arg, sizeof(set_eventfd))) + return -EFAULT; + if (set_eventfd.flag >= HV_EVENT_FLAGS_COUNT) + return -EINVAL; + + eventfd = NULL; + if (set_eventfd.fd >= 0) { + eventfd = eventfd_ctx_fdget(set_eventfd.fd); + if (IS_ERR(eventfd)) + return PTR_ERR(eventfd); + } + + guard(mutex)(&flag_lock); + old_eventfd = READ_ONCE(flag_eventfds[set_eventfd.flag]); + WRITE_ONCE(flag_eventfds[set_eventfd.flag], eventfd); + + if (old_eventfd) { + synchronize_rcu(); + eventfd_ctx_put(old_eventfd); + } + + return 0; +} + +static int mshv_vtl_sint_ioctl_pause_msg_stream(struct mshv_sint_mask __user *arg) +{ + static DEFINE_MUTEX(vtl2_vmbus_sint_mask_mutex); + struct mshv_sint_mask mask; + + if (copy_from_user(&mask, arg, sizeof(mask))) + return -EFAULT; + guard(mutex)(&vtl2_vmbus_sint_mask_mutex); + on_each_cpu(mshv_vtl_synic_mask_vmbus_sint, &mask.mask, 1); + WRITE_ONCE(vtl_synic_mask_vmbus_sint_masked, mask.mask != 0); + if (mask.mask) + wake_up_interruptible_poll(&fd_wait_queue, EPOLLIN); + + return 0; +} + +static long mshv_vtl_sint_ioctl(struct file *f, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case MSHV_SINT_POST_MESSAGE: + return mshv_vtl_sint_ioctl_post_msg((struct mshv_vtl_sint_post_msg __user *)arg); + case MSHV_SINT_SIGNAL_EVENT: + return mshv_vtl_sint_ioctl_signal_event((struct mshv_vtl_signal_event __user *)arg); + case MSHV_SINT_SET_EVENTFD: + return mshv_vtl_sint_ioctl_set_eventfd((struct mshv_vtl_set_eventfd __user *)arg); + case MSHV_SINT_PAUSE_MESSAGE_STREAM: + return mshv_vtl_sint_ioctl_pause_msg_stream((struct mshv_sint_mask __user *)arg); + default: + return -ENOIOCTLCMD; + } +} + +static const struct file_operations mshv_vtl_sint_ops = { + .owner = THIS_MODULE, + .read = mshv_vtl_sint_read, + .poll = mshv_vtl_sint_poll, + .unlocked_ioctl = mshv_vtl_sint_ioctl, +}; + +static struct miscdevice mshv_vtl_sint_dev = { + .name = "mshv_sint", + .fops = &mshv_vtl_sint_ops, + .mode = 0600, + .minor = MISC_DYNAMIC_MINOR, +}; + +static int mshv_vtl_hvcall_dev_open(struct inode *node, struct file *f) +{ + struct miscdevice *dev = f->private_data; + struct mshv_vtl_hvcall_fd *fd; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + fd = vzalloc(sizeof(*fd)); + if (!fd) + return -ENOMEM; + fd->dev = dev; + f->private_data = fd; + mutex_init(&fd->init_mutex); + + return 0; +} + +static int mshv_vtl_hvcall_dev_release(struct inode *node, struct file *f) +{ + struct mshv_vtl_hvcall_fd *fd; + + fd = f->private_data; + if (fd) { + vfree(fd); + f->private_data = NULL; + } + + return 0; +} + +static int mshv_vtl_hvcall_do_setup(struct mshv_vtl_hvcall_fd *fd, + struct mshv_vtl_hvcall_setup __user *hvcall_setup_user) +{ + struct mshv_vtl_hvcall_setup hvcall_setup; + + guard(mutex)(&fd->init_mutex); + + if (fd->allow_map_initialized) { + dev_err(fd->dev->this_device, + "Hypercall allow map has already been set, pid %d\n", + current->pid); + return -EINVAL; + } + + if (copy_from_user(&hvcall_setup, hvcall_setup_user, + sizeof(struct mshv_vtl_hvcall_setup))) { + return -EFAULT; + } + if (hvcall_setup.bitmap_array_size > ARRAY_SIZE(fd->allow_bitmap)) + return -EINVAL; + + if (copy_from_user(&fd->allow_bitmap, + (void __user *)hvcall_setup.allow_bitmap_ptr, + hvcall_setup.bitmap_array_size)) { + return -EFAULT; + } + + dev_info(fd->dev->this_device, "Hypercall allow map has been set, pid %d\n", + current->pid); + fd->allow_map_initialized = true; + return 0; +} + +static bool mshv_vtl_hvcall_is_allowed(struct mshv_vtl_hvcall_fd *fd, u16 call_code) +{ + return test_bit(call_code, (unsigned long *)fd->allow_bitmap); +} + +static int mshv_vtl_hvcall_call(struct mshv_vtl_hvcall_fd *fd, + struct mshv_vtl_hvcall __user *hvcall_user) +{ + struct mshv_vtl_hvcall hvcall; + void *in, *out; + int ret; + + if (copy_from_user(&hvcall, hvcall_user, sizeof(struct mshv_vtl_hvcall))) + return -EFAULT; + if (hvcall.input_size > HV_HYP_PAGE_SIZE) + return -EINVAL; + if (hvcall.output_size > HV_HYP_PAGE_SIZE) + return -EINVAL; + + /* + * By default, all hypercalls are not allowed. + * The user mode code has to set up the allow bitmap once. + */ + + if (!mshv_vtl_hvcall_is_allowed(fd, hvcall.control & 0xFFFF)) { + dev_err(fd->dev->this_device, + "Hypercall with control data %#llx isn't allowed\n", + hvcall.control); + return -EPERM; + } + + /* + * This may create a problem for Confidential VM (CVM) usecase where we need to use + * Hyper-V driver allocated per-cpu input and output pages (hyperv_pcpu_input_arg and + * hyperv_pcpu_output_arg) for making a hypervisor call. + * + * TODO: Take care of this when CVM support is added. + */ + in = (void *)__get_free_page(GFP_KERNEL); + out = (void *)__get_free_page(GFP_KERNEL); + + if (copy_from_user(in, (void __user *)hvcall.input_ptr, hvcall.input_size)) { + ret = -EFAULT; + goto free_pages; + } + + hvcall.status = hv_do_hypercall(hvcall.control, in, out); + + if (copy_to_user((void __user *)hvcall.output_ptr, out, hvcall.output_size)) { + ret = -EFAULT; + goto free_pages; + } + ret = put_user(hvcall.status, &hvcall_user->status); +free_pages: + free_page((unsigned long)in); + free_page((unsigned long)out); + + return ret; +} + +static long mshv_vtl_hvcall_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg) +{ + struct mshv_vtl_hvcall_fd *fd = f->private_data; + + switch (cmd) { + case MSHV_HVCALL_SETUP: + return mshv_vtl_hvcall_do_setup(fd, (struct mshv_vtl_hvcall_setup __user *)arg); + case MSHV_HVCALL: + return mshv_vtl_hvcall_call(fd, (struct mshv_vtl_hvcall __user *)arg); + default: + break; + } + + return -ENOIOCTLCMD; +} + +static const struct file_operations mshv_vtl_hvcall_dev_file_ops = { + .owner = THIS_MODULE, + .open = mshv_vtl_hvcall_dev_open, + .release = mshv_vtl_hvcall_dev_release, + .unlocked_ioctl = mshv_vtl_hvcall_dev_ioctl, +}; + +static struct miscdevice mshv_vtl_hvcall_dev = { + .name = "mshv_hvcall", + .nodename = "mshv_hvcall", + .fops = &mshv_vtl_hvcall_dev_file_ops, + .mode = 0600, + .minor = MISC_DYNAMIC_MINOR, +}; + +static int mshv_vtl_low_open(struct inode *inodep, struct file *filp) +{ + pid_t pid = task_pid_vnr(current); + uid_t uid = current_uid().val; + int ret = 0; + + pr_debug("%s: Opening VTL low, task group %d, uid %d\n", __func__, pid, uid); + + if (capable(CAP_SYS_ADMIN)) { + filp->private_data = inodep; + } else { + pr_err("%s: VTL low open failed: CAP_SYS_ADMIN required. task group %d, uid %d", + __func__, pid, uid); + ret = -EPERM; + } + + return ret; +} + +static bool can_fault(struct vm_fault *vmf, unsigned long size, unsigned long *pfn) +{ + unsigned long mask = size - 1; + unsigned long start = vmf->address & ~mask; + unsigned long end = start + size; + bool is_valid; + + is_valid = (vmf->address & mask) == ((vmf->pgoff << PAGE_SHIFT) & mask) && + start >= vmf->vma->vm_start && + end <= vmf->vma->vm_end; + + if (is_valid) + *pfn = vmf->pgoff & ~(mask >> PAGE_SHIFT); + + return is_valid; +} + +static vm_fault_t mshv_vtl_low_huge_fault(struct vm_fault *vmf, unsigned int order) +{ + unsigned long pfn = vmf->pgoff; + vm_fault_t ret = VM_FAULT_FALLBACK; + + switch (order) { + case 0: + return vmf_insert_mixed(vmf->vma, vmf->address, pfn); + + case PMD_ORDER: + if (can_fault(vmf, PMD_SIZE, &pfn)) + ret = vmf_insert_pfn_pmd(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE); + return ret; + + case PUD_ORDER: + if (can_fault(vmf, PUD_SIZE, &pfn)) + ret = vmf_insert_pfn_pud(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE); + return ret; + + default: + return VM_FAULT_SIGBUS; + } +} + +static vm_fault_t mshv_vtl_low_fault(struct vm_fault *vmf) +{ + return mshv_vtl_low_huge_fault(vmf, 0); +} + +static const struct vm_operations_struct mshv_vtl_low_vm_ops = { + .fault = mshv_vtl_low_fault, + .huge_fault = mshv_vtl_low_huge_fault, +}; + +static int mshv_vtl_low_mmap(struct file *filp, struct vm_area_struct *vma) +{ + vma->vm_ops = &mshv_vtl_low_vm_ops; + vm_flags_set(vma, VM_HUGEPAGE | VM_MIXEDMAP); + + return 0; +} + +static const struct file_operations mshv_vtl_low_file_ops = { + .owner = THIS_MODULE, + .open = mshv_vtl_low_open, + .mmap = mshv_vtl_low_mmap, +}; + +static struct miscdevice mshv_vtl_low = { + .name = "mshv_vtl_low", + .nodename = "mshv_vtl_low", + .fops = &mshv_vtl_low_file_ops, + .mode = 0600, + .minor = MISC_DYNAMIC_MINOR, +}; + +static int __init mshv_vtl_init(void) +{ + int ret; + struct device *dev = mshv_dev.this_device; + + /* + * This creates /dev/mshv which provides functionality to create VTLs and partitions. + */ + ret = misc_register(&mshv_dev); + if (ret) { + dev_err(dev, "mshv device register failed: %d\n", ret); + goto free_dev; + } + + tasklet_init(&msg_dpc, mshv_vtl_sint_on_msg_dpc, 0); + init_waitqueue_head(&fd_wait_queue); + + if (mshv_vtl_get_vsm_regs()) { + dev_emerg(dev, "Unable to get VSM capabilities !!\n"); + ret = -ENODEV; + goto free_dev; + } + if (mshv_vtl_configure_vsm_partition(dev)) { + dev_emerg(dev, "VSM configuration failed !!\n"); + ret = -ENODEV; + goto free_dev; + } + + mshv_vtl_return_call_init(mshv_vsm_page_offsets.vtl_return_offset); + ret = hv_vtl_setup_synic(); + if (ret) + goto free_dev; + + /* + * mshv_sint device adds VMBus relay ioctl support. + * This provides a channel for VTL0 to communicate with VTL2. + */ + ret = misc_register(&mshv_vtl_sint_dev); + if (ret) + goto free_synic; + + /* + * mshv_hvcall device adds interface to enable userspace for direct hypercalls support. + */ + ret = misc_register(&mshv_vtl_hvcall_dev); + if (ret) + goto free_sint; + + /* + * mshv_vtl_low device is used to map VTL0 address space to a user-mode process in VTL2. + * It implements mmap() to allow a user-mode process in VTL2 to map to the address of VTL0. + */ + ret = misc_register(&mshv_vtl_low); + if (ret) + goto free_hvcall; + + /* + * "mshv vtl mem dev" device is later used to setup VTL0 memory. + */ + mem_dev = kzalloc_obj(*mem_dev); + if (!mem_dev) { + ret = -ENOMEM; + goto free_low; + } + + mutex_init(&mshv_vtl_poll_file_lock); + + device_initialize(mem_dev); + dev_set_name(mem_dev, "mshv vtl mem dev"); + ret = device_add(mem_dev); + if (ret) { + dev_err(dev, "mshv vtl mem dev add: %d\n", ret); + goto free_mem; + } + + return 0; + +free_mem: + kfree(mem_dev); +free_low: + misc_deregister(&mshv_vtl_low); +free_hvcall: + misc_deregister(&mshv_vtl_hvcall_dev); +free_sint: + misc_deregister(&mshv_vtl_sint_dev); +free_synic: + hv_vtl_remove_synic(); +free_dev: + misc_deregister(&mshv_dev); + + return ret; +} + +static void __exit mshv_vtl_exit(void) +{ + device_del(mem_dev); + kfree(mem_dev); + misc_deregister(&mshv_vtl_low); + misc_deregister(&mshv_vtl_hvcall_dev); + misc_deregister(&mshv_vtl_sint_dev); + hv_vtl_remove_synic(); + misc_deregister(&mshv_dev); +} + +module_init(mshv_vtl_init); +module_exit(mshv_vtl_exit); diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c index 3c9b02471760..592a9601faaa 100644 --- a/drivers/hv/ring_buffer.c +++ b/drivers/hv/ring_buffer.c @@ -18,6 +18,7 @@ #include <linux/slab.h> #include <linux/prefetch.h> #include <linux/io.h> +#include <linux/export.h> #include <asm/mshyperv.h> #include "hyperv_vmbus.h" @@ -183,7 +184,8 @@ void hv_ringbuffer_pre_init(struct vmbus_channel *channel) /* Initialize the ring buffer. */ int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info, - struct page *pages, u32 page_cnt, u32 max_pkt_size) + struct page *pages, u32 page_cnt, u32 max_pkt_size, + bool confidential) { struct page **pages_wraparound; int i; @@ -194,9 +196,7 @@ int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info, * First page holds struct hv_ring_buffer, do wraparound mapping for * the rest. */ - pages_wraparound = kcalloc(page_cnt * 2 - 1, - sizeof(struct page *), - GFP_KERNEL); + pages_wraparound = kzalloc_objs(struct page *, page_cnt * 2 - 1); if (!pages_wraparound) return -ENOMEM; @@ -207,7 +207,7 @@ int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info, ring_info->ring_buffer = (struct hv_ring_buffer *) vmap(pages_wraparound, page_cnt * 2 - 1, VM_MAP, - pgprot_decrypted(PAGE_KERNEL)); + confidential ? PAGE_KERNEL : pgprot_decrypted(PAGE_KERNEL)); kfree(pages_wraparound); if (!ring_info->ring_buffer) diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 0f6cd44fff29..d28ff45d4cfd 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -25,17 +25,18 @@ #include <linux/cpu.h> #include <linux/sched/isolation.h> #include <linux/sched/task_stack.h> +#include <linux/smpboot.h> #include <linux/delay.h> #include <linux/panic_notifier.h> #include <linux/ptrace.h> -#include <linux/screen_info.h> +#include <linux/sysfb.h> #include <linux/efi.h> -#include <linux/random.h> #include <linux/kernel.h> #include <linux/syscore_ops.h> #include <linux/dma-map-ops.h> #include <linux/pci.h> +#include <linux/export.h> #include <clocksource/hyperv_timer.h> #include <asm/mshyperv.h> #include "hyperv_vmbus.h" @@ -45,17 +46,30 @@ struct vmbus_dynid { struct hv_vmbus_device_id id; }; -static struct device *hv_dev; +/* VMBus Root Device */ +static struct device *vmbus_root_device; static int hyperv_cpuhp_online; -static long __percpu *vmbus_evt; +static DEFINE_PER_CPU(long, vmbus_evt); /* Values parsed from ACPI DSDT */ int vmbus_irq; int vmbus_interrupt; /* + * If the Confidential VMBus is used, the data on the "wire" is not + * visible to either the host or the hypervisor. + */ +static bool is_confidential; + +bool vmbus_is_confidential(void) +{ + return is_confidential; +} +EXPORT_SYMBOL_GPL(vmbus_is_confidential); + +/* * The panic notifier below is responsible solely for unloading the * vmbus connection, which is necessary in a panic event. * @@ -80,13 +94,17 @@ static struct resource *fb_mmio; static struct resource *hyperv_mmio; static DEFINE_MUTEX(hyperv_mmio_lock); -static int vmbus_exists(void) +struct device *hv_get_vmbus_root_device(void) { - if (hv_dev == NULL) - return -ENODEV; + return vmbus_root_device; +} +EXPORT_SYMBOL_GPL(hv_get_vmbus_root_device); - return 0; +bool hv_vmbus_exists(void) +{ + return vmbus_root_device != NULL; } +EXPORT_SYMBOL_GPL(hv_vmbus_exists); static u8 channel_monitor_group(const struct vmbus_channel *channel) { @@ -315,7 +333,7 @@ static ssize_t out_read_index_show(struct device *dev, &outbound); if (ret < 0) return ret; - return sysfs_emit(buf, "%d\n", outbound.current_read_index); + return sysfs_emit(buf, "%u\n", outbound.current_read_index); } static DEVICE_ATTR_RO(out_read_index); @@ -334,7 +352,7 @@ static ssize_t out_write_index_show(struct device *dev, &outbound); if (ret < 0) return ret; - return sysfs_emit(buf, "%d\n", outbound.current_write_index); + return sysfs_emit(buf, "%u\n", outbound.current_write_index); } static DEVICE_ATTR_RO(out_write_index); @@ -707,12 +725,35 @@ static const struct hv_vmbus_device_id *hv_vmbus_get_id(const struct hv_driver * return id; } -/* vmbus_add_dynid - add a new device ID to this driver and re-probe devices */ +/* vmbus_add_dynid - add a new device ID to this driver and re-probe devices + * + * This function can race with vmbus_device_register(). This function is + * typically running on a user thread in response to writing to the "new_id" + * sysfs entry for a driver. vmbus_device_register() is running on a + * workqueue thread in response to the Hyper-V host offering a device to the + * guest. This function calls driver_attach(), which looks for an existing + * device matching the new id, and attaches the driver to which the new id + * has been assigned. vmbus_device_register() calls device_register(), which + * looks for a driver that matches the device being registered. If both + * operations are running simultaneously, the device driver probe function runs + * on whichever thread establishes the linkage between the driver and device. + * + * In most cases, it doesn't matter which thread runs the driver probe + * function. But if vmbus_device_register() does not find a matching driver, + * it proceeds to create the "channels" subdirectory and numbered per-channel + * subdirectory in sysfs. While that multi-step creation is in progress, this + * function could run the driver probe function. If the probe function checks + * for, or operates on, entries in the "channels" subdirectory, including by + * calling hv_create_ring_sysfs(), the operation may or may not succeed + * depending on the race. The race can't create a kernel failure in VMBus + * or device subsystem code, but probe functions in VMBus drivers doing such + * operations must be prepared for the failure case. + */ static int vmbus_add_dynid(struct hv_driver *drv, guid_t *guid) { struct vmbus_dynid *dynid; - dynid = kzalloc(sizeof(*dynid), GFP_KERNEL); + dynid = kzalloc_obj(*dynid); if (!dynid) return -ENOMEM; @@ -861,7 +902,7 @@ static int vmbus_dma_configure(struct device *child_device) * On x86/x64 coherence is assumed and these calls have no effect. */ hv_setup_dma_ops(child_device, - device_get_dma_attr(hv_dev) == DEV_DMA_COHERENT); + device_get_dma_attr(vmbus_root_device) == DEV_DMA_COHERENT); return 0; } @@ -1015,12 +1056,9 @@ static void vmbus_onmessage_work(struct work_struct *work) kfree(ctx); } -void vmbus_on_msg_dpc(unsigned long data) +static void __vmbus_on_msg_dpc(void *message_page_addr) { - struct hv_per_cpu_context *hv_cpu = (void *)data; - void *page_addr = hv_cpu->synic_message_page; - struct hv_message msg_copy, *msg = (struct hv_message *)page_addr + - VMBUS_MESSAGE_SINT; + struct hv_message msg_copy, *msg; struct vmbus_channel_message_header *hdr; enum vmbus_channel_message_type msgtype; const struct vmbus_channel_message_table_entry *entry; @@ -1028,6 +1066,10 @@ void vmbus_on_msg_dpc(unsigned long data) __u8 payload_size; u32 message_type; + if (!message_page_addr) + return; + msg = (struct hv_message *)message_page_addr + VMBUS_MESSAGE_SINT; + /* * 'enum vmbus_channel_message_type' is supposed to always be 'u32' as * it is being used in 'struct vmbus_channel_message_header' definition @@ -1075,7 +1117,7 @@ void vmbus_on_msg_dpc(unsigned long data) } if (entry->handler_type == VMHT_BLOCKING) { - ctx = kmalloc(struct_size(ctx, msg.payload, payload_size), GFP_ATOMIC); + ctx = kmalloc_flex(*ctx, msg.payload, payload_size, GFP_ATOMIC); if (ctx == NULL) return; @@ -1153,6 +1195,14 @@ msg_handled: vmbus_signal_eom(msg, message_type); } +void vmbus_on_msg_dpc(unsigned long data) +{ + struct hv_per_cpu_context *hv_cpu = (void *)data; + + __vmbus_on_msg_dpc(hv_cpu->hyp_synic_message_page); + __vmbus_on_msg_dpc(hv_cpu->para_synic_message_page); +} + #ifdef CONFIG_PM_SLEEP /* * Fake RESCIND_CHANNEL messages to clean up hv_sock channels by force for @@ -1191,23 +1241,21 @@ static void vmbus_force_channel_rescinded(struct vmbus_channel *channel) #endif /* CONFIG_PM_SLEEP */ /* - * Schedule all channels with events pending + * Schedule all channels with events pending. + * The event page can be directly checked to get the id of + * the channel that has the interrupt pending. */ -static void vmbus_chan_sched(struct hv_per_cpu_context *hv_cpu) +static void vmbus_chan_sched(void *event_page_addr) { unsigned long *recv_int_page; u32 maxbits, relid; + union hv_synic_event_flags *event; - /* - * The event page can be directly checked to get the id of - * the channel that has the interrupt pending. - */ - void *page_addr = hv_cpu->synic_event_page; - union hv_synic_event_flags *event - = (union hv_synic_event_flags *)page_addr + - VMBUS_MESSAGE_SINT; + if (!event_page_addr) + return; + event = (union hv_synic_event_flags *)event_page_addr + VMBUS_MESSAGE_SINT; - maxbits = HV_EVENT_FLAGS_COUNT; + maxbits = READ_ONCE(vmbus_connection.relid_hiwater) + 1; recv_int_page = event->flags; if (unlikely(!recv_int_page)) @@ -1276,29 +1324,84 @@ sched_unlock_rcu: } } -static void vmbus_isr(void) +static void vmbus_message_sched(struct hv_per_cpu_context *hv_cpu, void *message_page_addr) { - struct hv_per_cpu_context *hv_cpu - = this_cpu_ptr(hv_context.cpu_context); - void *page_addr; struct hv_message *msg; - vmbus_chan_sched(hv_cpu); - - page_addr = hv_cpu->synic_message_page; - msg = (struct hv_message *)page_addr + VMBUS_MESSAGE_SINT; + if (!message_page_addr) + return; + msg = (struct hv_message *)message_page_addr + VMBUS_MESSAGE_SINT; /* Check if there are actual msgs to be processed */ if (msg->header.message_type != HVMSG_NONE) { if (msg->header.message_type == HVMSG_TIMER_EXPIRED) { hv_stimer0_isr(); vmbus_signal_eom(msg, HVMSG_TIMER_EXPIRED); - } else + } else { tasklet_schedule(&hv_cpu->msg_dpc); + } } +} - add_interrupt_randomness(vmbus_interrupt); +static void __vmbus_isr(void) +{ + struct hv_per_cpu_context *hv_cpu + = this_cpu_ptr(hv_context.cpu_context); + + vmbus_chan_sched(hv_cpu->hyp_synic_event_page); + vmbus_chan_sched(hv_cpu->para_synic_event_page); + + vmbus_message_sched(hv_cpu, hv_cpu->hyp_synic_message_page); + vmbus_message_sched(hv_cpu, hv_cpu->para_synic_message_page); +} + +static DEFINE_PER_CPU(bool, vmbus_irq_pending); +static DEFINE_PER_CPU(struct task_struct *, vmbus_irqd); + +static void vmbus_irqd_wake(void) +{ + struct task_struct *tsk = __this_cpu_read(vmbus_irqd); + + __this_cpu_write(vmbus_irq_pending, true); + wake_up_process(tsk); +} + +static void vmbus_irqd_setup(unsigned int cpu) +{ + sched_set_fifo(current); +} + +static int vmbus_irqd_should_run(unsigned int cpu) +{ + return __this_cpu_read(vmbus_irq_pending); +} + +static void run_vmbus_irqd(unsigned int cpu) +{ + __this_cpu_write(vmbus_irq_pending, false); + __vmbus_isr(); +} + +static bool vmbus_irq_initialized; + +static struct smp_hotplug_thread vmbus_irq_threads = { + .store = &vmbus_irqd, + .setup = vmbus_irqd_setup, + .thread_should_run = vmbus_irqd_should_run, + .thread_fn = run_vmbus_irqd, + .thread_comm = "vmbus_irq/%u", +}; + +void vmbus_isr(void) +{ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) { + vmbus_irqd_wake(); + } else { + lockdep_hardirq_threaded(); + __vmbus_isr(); + } } +EXPORT_SYMBOL_FOR_MODULES(vmbus_isr, "mshv_vtl"); static irqreturn_t vmbus_percpu_isr(int irq, void *dev_id) { @@ -1313,6 +1416,58 @@ static void vmbus_percpu_work(struct work_struct *work) hv_synic_init(cpu); } +static int vmbus_alloc_synic_and_connect(void) +{ + int ret, cpu; + struct work_struct __percpu *works; + + ret = hv_synic_alloc(); + if (ret < 0) + goto err_alloc; + + works = alloc_percpu(struct work_struct); + if (!works) { + ret = -ENOMEM; + goto err_alloc; + } + + /* + * Initialize the per-cpu interrupt state and stimer state. + * Then connect to the host. + */ + cpus_read_lock(); + for_each_online_cpu(cpu) { + struct work_struct *work = per_cpu_ptr(works, cpu); + + INIT_WORK(work, vmbus_percpu_work); + schedule_work_on(cpu, work); + } + + for_each_online_cpu(cpu) + flush_work(per_cpu_ptr(works, cpu)); + + /* Register the callbacks for possible CPU online/offline'ing */ + ret = cpuhp_setup_state_nocalls_cpuslocked(CPUHP_AP_ONLINE_DYN, "hyperv/vmbus:online", + hv_synic_init, hv_synic_cleanup); + cpus_read_unlock(); + free_percpu(works); + if (ret < 0) + goto err_alloc; + hyperv_cpuhp_online = ret; + + ret = vmbus_connect(); + if (ret) + goto err_connect; + return 0; + +err_connect: + cpuhp_remove_state(hyperv_cpuhp_online); + return -ENODEV; +err_alloc: + hv_synic_free(); + return -ENOMEM; +} + /* * vmbus_bus_init -Main vmbus driver initialization routine. * @@ -1323,8 +1478,7 @@ static void vmbus_percpu_work(struct work_struct *work) */ static int vmbus_bus_init(void) { - int ret, cpu; - struct work_struct __percpu *works; + int ret; ret = hv_init(); if (ret != 0) { @@ -1345,55 +1499,34 @@ static int vmbus_bus_init(void) * the VMbus interrupt handler. */ + if (IS_ENABLED(CONFIG_PREEMPT_RT) && !vmbus_irq_initialized) { + ret = smpboot_register_percpu_thread(&vmbus_irq_threads); + if (ret) + goto err_kthread; + vmbus_irq_initialized = true; + } + if (vmbus_irq == -1) { hv_setup_vmbus_handler(vmbus_isr); } else { - vmbus_evt = alloc_percpu(long); ret = request_percpu_irq(vmbus_irq, vmbus_percpu_isr, - "Hyper-V VMbus", vmbus_evt); + "Hyper-V VMbus", &vmbus_evt); if (ret) { pr_err("Can't request Hyper-V VMbus IRQ %d, Err %d", vmbus_irq, ret); - free_percpu(vmbus_evt); goto err_setup; } } - ret = hv_synic_alloc(); - if (ret) - goto err_alloc; - - works = alloc_percpu(struct work_struct); - if (!works) { - ret = -ENOMEM; - goto err_alloc; - } - /* - * Initialize the per-cpu interrupt state and stimer state. - * Then connect to the host. + * Cache the value as getting it involves a VM exit on x86(_64), and + * doing that on each VP while initializing SynIC's wastes time. */ - cpus_read_lock(); - for_each_online_cpu(cpu) { - struct work_struct *work = per_cpu_ptr(works, cpu); - - INIT_WORK(work, vmbus_percpu_work); - schedule_work_on(cpu, work); - } - - for_each_online_cpu(cpu) - flush_work(per_cpu_ptr(works, cpu)); - - /* Register the callbacks for possible CPU online/offline'ing */ - ret = cpuhp_setup_state_nocalls_cpuslocked(CPUHP_AP_ONLINE_DYN, "hyperv/vmbus:online", - hv_synic_init, hv_synic_cleanup); - cpus_read_unlock(); - free_percpu(works); - if (ret < 0) - goto err_alloc; - hyperv_cpuhp_online = ret; - - ret = vmbus_connect(); + is_confidential = ms_hyperv.confidential_vmbus_available; + if (is_confidential) + pr_info("Establishing connection to the confidential VMBus\n"); + hv_para_set_sint_proxy(!is_confidential); + ret = vmbus_alloc_synic_and_connect(); if (ret) goto err_connect; @@ -1409,16 +1542,16 @@ static int vmbus_bus_init(void) return 0; err_connect: - cpuhp_remove_state(hyperv_cpuhp_online); -err_alloc: - hv_synic_free(); - if (vmbus_irq == -1) { + if (vmbus_irq == -1) hv_remove_vmbus_handler(); - } else { - free_percpu_irq(vmbus_irq, vmbus_evt); - free_percpu(vmbus_evt); - } + else + free_percpu_irq(vmbus_irq, &vmbus_evt); err_setup: + if (IS_ENABLED(CONFIG_PREEMPT_RT) && vmbus_irq_initialized) { + smpboot_unregister_percpu_thread(&vmbus_irq_threads); + vmbus_irq_initialized = false; + } +err_kthread: bus_unregister(&hv_bus); return ret; } @@ -1438,11 +1571,10 @@ int __vmbus_driver_register(struct hv_driver *hv_driver, struct module *owner, c { int ret; - pr_info("registering driver %s\n", hv_driver->name); + if (!hv_vmbus_exists()) + return -ENODEV; - ret = vmbus_exists(); - if (ret < 0) - return ret; + pr_info("registering driver %s\n", hv_driver->name); hv_driver->driver.name = hv_driver->name; hv_driver->driver.owner = owner; @@ -1468,9 +1600,8 @@ EXPORT_SYMBOL_GPL(__vmbus_driver_register); */ void vmbus_driver_unregister(struct hv_driver *hv_driver) { - pr_info("unregistering driver %s\n", hv_driver->name); - - if (!vmbus_exists()) { + if (hv_vmbus_exists()) { + pr_info("unregistering driver %s\n", hv_driver->name); driver_unregister(&hv_driver->driver); vmbus_free_dynids(hv_driver); } @@ -1611,16 +1742,16 @@ static ssize_t target_cpu_show(struct vmbus_channel *channel, char *buf) { return sprintf(buf, "%u\n", channel->target_cpu); } -static ssize_t target_cpu_store(struct vmbus_channel *channel, - const char *buf, size_t count) + +int vmbus_channel_set_cpu(struct vmbus_channel *channel, u32 target_cpu) { - u32 target_cpu, origin_cpu; - ssize_t ret = count; + u32 origin_cpu; + int ret = 0; - if (vmbus_proto_version < VERSION_WIN10_V4_1) - return -EIO; + lockdep_assert_cpus_held(); + lockdep_assert_held(&vmbus_connection.channel_mutex); - if (sscanf(buf, "%uu", &target_cpu) != 1) + if (vmbus_proto_version < VERSION_WIN10_V4_1) return -EIO; /* Validate target_cpu for the cpumask_test_cpu() operation below. */ @@ -1630,22 +1761,17 @@ static ssize_t target_cpu_store(struct vmbus_channel *channel, if (!cpumask_test_cpu(target_cpu, housekeeping_cpumask(HK_TYPE_MANAGED_IRQ))) return -EINVAL; - /* No CPUs should come up or down during this. */ - cpus_read_lock(); - - if (!cpu_online(target_cpu)) { - cpus_read_unlock(); + if (!cpu_online(target_cpu)) return -EINVAL; - } /* - * Synchronizes target_cpu_store() and channel closure: + * Synchronizes vmbus_channel_set_cpu() and channel closure: * * { Initially: state = CHANNEL_OPENED } * * CPU1 CPU2 * - * [target_cpu_store()] [vmbus_disconnect_ring()] + * [vmbus_channel_set_cpu()] [vmbus_disconnect_ring()] * * LOCK channel_mutex LOCK channel_mutex * LOAD r1 = state LOAD r2 = state @@ -1660,7 +1786,6 @@ static ssize_t target_cpu_store(struct vmbus_channel *channel, * Note. The host processes the channel messages "sequentially", in * the order in which they are received on a per-partition basis. */ - mutex_lock(&vmbus_connection.channel_mutex); /* * Hyper-V will ignore MODIFYCHANNEL messages for "non-open" channels; @@ -1668,17 +1793,17 @@ static ssize_t target_cpu_store(struct vmbus_channel *channel, */ if (channel->state != CHANNEL_OPENED_STATE) { ret = -EIO; - goto cpu_store_unlock; + goto end; } origin_cpu = channel->target_cpu; if (target_cpu == origin_cpu) - goto cpu_store_unlock; + goto end; if (vmbus_send_modifychannel(channel, hv_cpu_number_to_vp_number(target_cpu))) { ret = -EIO; - goto cpu_store_unlock; + goto end; } /* @@ -1708,10 +1833,26 @@ static ssize_t target_cpu_store(struct vmbus_channel *channel, origin_cpu, target_cpu); } -cpu_store_unlock: +end: + return ret; +} + +static ssize_t target_cpu_store(struct vmbus_channel *channel, + const char *buf, size_t count) +{ + u32 target_cpu; + ssize_t ret; + + if (sscanf(buf, "%u", &target_cpu) != 1) + return -EIO; + + cpus_read_lock(); + mutex_lock(&vmbus_connection.channel_mutex); + ret = vmbus_channel_set_cpu(channel, target_cpu); mutex_unlock(&vmbus_connection.channel_mutex); cpus_read_unlock(); - return ret; + + return ret ?: count; } static VMBUS_CHAN_ATTR(cpu, 0644, target_cpu_show, target_cpu_store); @@ -1792,6 +1933,33 @@ static ssize_t subchannel_id_show(struct vmbus_channel *channel, } static VMBUS_CHAN_ATTR_RO(subchannel_id); +static int hv_mmap_ring_buffer_wrapper(struct file *filp, struct kobject *kobj, + const struct bin_attribute *attr, + struct vm_area_struct *vma) +{ + struct vmbus_channel *channel = container_of(kobj, struct vmbus_channel, kobj); + struct vm_area_desc desc; + int err; + + /* + * hv_(create|remove)_ring_sysfs implementation ensures that + * mmap_prepare_ring_buffer is not NULL. + */ + compat_set_desc_from_vma(&desc, filp, vma); + err = channel->mmap_prepare_ring_buffer(channel, &desc); + if (err) + return err; + + return __compat_vma_mmap(&desc, vma); +} + +static struct bin_attribute chan_attr_ring_buffer = { + .attr = { + .name = "ring", + .mode = 0600, + }, + .mmap = hv_mmap_ring_buffer_wrapper, +}; static struct attribute *vmbus_chan_attrs[] = { &chan_attr_out_mask.attr, &chan_attr_in_mask.attr, @@ -1811,6 +1979,11 @@ static struct attribute *vmbus_chan_attrs[] = { NULL }; +static const struct bin_attribute *vmbus_chan_bin_attrs[] = { + &chan_attr_ring_buffer, + NULL +}; + /* * Channel-level attribute_group callback function. Returns the permission for * each attribute, and returns 0 if an attribute is not visible. @@ -1831,9 +2004,34 @@ static umode_t vmbus_chan_attr_is_visible(struct kobject *kobj, return attr->mode; } +static umode_t vmbus_chan_bin_attr_is_visible(struct kobject *kobj, + const struct bin_attribute *attr, int idx) +{ + const struct vmbus_channel *channel = + container_of(kobj, struct vmbus_channel, kobj); + + /* Hide ring attribute if channel's ring_sysfs_visible is set to false */ + if (attr == &chan_attr_ring_buffer && !channel->ring_sysfs_visible) + return 0; + + return attr->attr.mode; +} + +static size_t vmbus_chan_bin_size(struct kobject *kobj, + const struct bin_attribute *bin_attr, int a) +{ + const struct vmbus_channel *channel = + container_of(kobj, struct vmbus_channel, kobj); + + return channel->ringbuffer_pagecount << PAGE_SHIFT; +} + static const struct attribute_group vmbus_chan_group = { .attrs = vmbus_chan_attrs, - .is_visible = vmbus_chan_attr_is_visible + .bin_attrs = vmbus_chan_bin_attrs, + .is_visible = vmbus_chan_attr_is_visible, + .is_bin_visible = vmbus_chan_bin_attr_is_visible, + .bin_size = vmbus_chan_bin_size, }; static const struct kobj_type vmbus_chan_ktype = { @@ -1841,6 +2039,64 @@ static const struct kobj_type vmbus_chan_ktype = { .release = vmbus_chan_release, }; +/** + * hv_create_ring_sysfs() - create "ring" sysfs entry corresponding to ring buffers for a channel. + * @channel: Pointer to vmbus_channel structure + * @hv_mmap_prepare_ring_buffer: function pointer for initializing the function to be called on mmap + * channel's "ring" sysfs node, which is for the ring buffer of that channel. + * Function pointer is of below type: + * int (*hv_mmap_prepare_ring_buffer)(struct vmbus_channel *channel, + * struct vm_area_desc *desc)) + * This has a pointer to the channel and a pointer to vm_area_desc, + * used for mmap_prepare, as arguments. + * + * Sysfs node for ring buffer of a channel is created along with other fields, however its + * visibility is disabled by default. Sysfs creation needs to be controlled when the use-case + * is running. + * For example, HV_NIC device is used either by uio_hv_generic or hv_netvsc at any given point of + * time, and "ring" sysfs is needed only when uio_hv_generic is bound to that device. To avoid + * exposing the ring buffer by default, this function is responsible to enable visibility of + * ring for userspace to use. + * Note: Race conditions can happen with userspace and it is not encouraged to create new + * use-cases for this. This was added to maintain backward compatibility, while solving + * one of the race conditions in uio_hv_generic while creating sysfs. See comments with + * vmbus_add_dynid() and vmbus_device_register(). + * + * Returns 0 on success or error code on failure. + */ +int hv_create_ring_sysfs(struct vmbus_channel *channel, + int (*hv_mmap_prepare_ring_buffer)(struct vmbus_channel *channel, + struct vm_area_desc *desc)) +{ + struct kobject *kobj = &channel->kobj; + + channel->mmap_prepare_ring_buffer = hv_mmap_prepare_ring_buffer; + channel->ring_sysfs_visible = true; + + return sysfs_update_group(kobj, &vmbus_chan_group); +} +EXPORT_SYMBOL_GPL(hv_create_ring_sysfs); + +/** + * hv_remove_ring_sysfs() - remove ring sysfs entry corresponding to ring buffers for a channel. + * @channel: Pointer to vmbus_channel structure + * + * Hide "ring" sysfs for a channel by changing its is_visible attribute and updating sysfs group. + * + * Returns 0 on success or error code on failure. + */ +int hv_remove_ring_sysfs(struct vmbus_channel *channel) +{ + struct kobject *kobj = &channel->kobj; + int ret; + + channel->ring_sysfs_visible = false; + ret = sysfs_update_group(kobj, &vmbus_chan_group); + channel->mmap_prepare_ring_buffer = NULL; + return ret; +} +EXPORT_SYMBOL_GPL(hv_remove_ring_sysfs); + /* * vmbus_add_channel_kobj - setup a sub-directory under device/channels */ @@ -1894,7 +2150,7 @@ struct hv_device *vmbus_device_create(const guid_t *type, { struct hv_device *child_device_obj; - child_device_obj = kzalloc(sizeof(struct hv_device), GFP_KERNEL); + child_device_obj = kzalloc_obj(struct hv_device); if (!child_device_obj) { pr_err("Unable to allocate device object for child device\n"); return NULL; @@ -1920,7 +2176,7 @@ int vmbus_device_register(struct hv_device *child_device_obj) &child_device_obj->channel->offermsg.offer.if_instance); child_device_obj->device.bus = &hv_bus; - child_device_obj->device.parent = hv_dev; + child_device_obj->device.parent = vmbus_root_device; child_device_obj->device.release = vmbus_device_release; child_device_obj->device.dma_parms = &child_device_obj->dma_parms; @@ -1938,6 +2194,20 @@ int vmbus_device_register(struct hv_device *child_device_obj) return ret; } + /* + * If device_register() found a driver to assign to the device, the + * driver's probe function has already run at this point. If that + * probe function accesses or operates on the "channels" subdirectory + * in sysfs, those operations will have failed because the "channels" + * subdirectory doesn't exist until the code below runs. Or if the + * probe function creates a /dev entry, a user space program could + * find and open the /dev entry, and then create a race by accessing + * the "channels" subdirectory while the creation steps are in progress + * here. The race can't result in a kernel failure, but the user space + * program may get an error in accessing "channels" or its + * subdirectories. See also comments with vmbus_add_dynid() about a + * related race condition. + */ child_device_obj->channels_kset = kset_create_and_add("channels", NULL, kobj); if (!child_device_obj->channels_kset) { @@ -1948,7 +2218,7 @@ int vmbus_device_register(struct hv_device *child_device_obj) ret = vmbus_add_channel_kobj(child_device_obj, child_device_obj->channel); if (ret) { - pr_err("Unable to register primary channeln"); + pr_err("Unable to register primary channel\n"); goto err_kset_unregister; } hv_debug_add_dev_dir(child_device_obj); @@ -2042,7 +2312,7 @@ static acpi_status vmbus_walk_resources(struct acpi_resource *res, void *ctx) if (end < 0x100000) return AE_OK; - new_res = kzalloc(sizeof(*new_res), GFP_ATOMIC); + new_res = kzalloc_obj(*new_res, GFP_ATOMIC); if (!new_res) return AE_NO_MEMORY; @@ -2120,8 +2390,8 @@ static void __maybe_unused vmbus_reserve_fb(void) if (efi_enabled(EFI_BOOT)) { /* Gen2 VM: get FB base from EFI framebuffer */ if (IS_ENABLED(CONFIG_SYSFB)) { - start = screen_info.lfb_base; - size = max_t(__u32, screen_info.lfb_size, 0x800000); + start = sysfb_primary_display.screen.lfb_base; + size = max_t(__u32, sysfb_primary_display.screen.lfb_size, 0x800000); } } else { /* Gen1 VM: get FB base from PCI */ @@ -2136,8 +2406,8 @@ static void __maybe_unused vmbus_reserve_fb(void) } /* - * Release the PCI device so hyperv_drm or hyperv_fb driver can - * grab it later. + * Release the PCI device so hyperv_drm driver can grab it + * later. */ pci_dev_put(pdev); } @@ -2262,12 +2532,25 @@ void vmbus_free_mmio(resource_size_t start, resource_size_t size) struct resource *iter; mutex_lock(&hyperv_mmio_lock); + + /* + * If all bytes of the MMIO range to be released are within the + * special case fb_mmio shadow region, skip releasing the shadow + * region since no corresponding __request_region() was done + * in vmbus_allocate_mmio(). + */ + if (fb_mmio && start >= fb_mmio->start && + (start + size - 1 <= fb_mmio->end)) + goto skip_shadow_release; + for (iter = hyperv_mmio; iter; iter = iter->sibling) { if ((iter->start >= start + size) || (iter->end <= start)) continue; __release_region(iter, start, size); } + +skip_shadow_release: release_mem_region(start, size); mutex_unlock(&hyperv_mmio_lock); @@ -2282,7 +2565,7 @@ static int vmbus_acpi_add(struct platform_device *pdev) struct acpi_device *ancestor; struct acpi_device *device = ACPI_COMPANION(&pdev->dev); - hv_dev = &device->dev; + vmbus_root_device = &device->dev; /* * Older versions of Hyper-V for ARM64 fail to include the _CCA @@ -2334,6 +2617,32 @@ static int vmbus_acpi_add(struct platform_device *pdev) return 0; } #endif +#ifndef HYPERVISOR_CALLBACK_VECTOR +static int vmbus_set_irq(struct platform_device *pdev) +{ + struct irq_data *data; + int irq; + irq_hw_number_t hwirq; + + irq = platform_get_irq(pdev, 0); + /* platform_get_irq() may not return 0. */ + if (irq < 0) + return irq; + + data = irq_get_irq_data(irq); + if (!data) { + pr_err("No interrupt data for VMBus virq %d\n", irq); + return -ENODEV; + } + hwirq = irqd_to_hwirq(data); + + vmbus_irq = irq; + vmbus_interrupt = hwirq; + pr_debug("VMBus virq %d, hwirq %d\n", vmbus_irq, vmbus_interrupt); + + return 0; +} +#endif static int vmbus_device_add(struct platform_device *pdev) { @@ -2343,16 +2652,21 @@ static int vmbus_device_add(struct platform_device *pdev) struct device_node *np = pdev->dev.of_node; int ret; - hv_dev = &pdev->dev; + vmbus_root_device = &pdev->dev; ret = of_range_parser_init(&parser, np); if (ret) return ret; +#ifndef HYPERVISOR_CALLBACK_VECTOR + ret = vmbus_set_irq(pdev); + if (ret) + return ret; +#endif for_each_of_range(&parser, &range) { struct resource *res; - res = kzalloc(sizeof(*res), GFP_KERNEL); + res = kzalloc_obj(*res); if (!res) { vmbus_mmio_remove(); return -ENOMEM; @@ -2573,7 +2887,6 @@ static struct platform_driver vmbus_platform_driver = { static void hv_kexec_handler(void) { - hv_stimer_global_cleanup(); vmbus_initiate_unload(false); /* Make sure conn_state is set as hv_synic_cleanup checks for it */ mb(); @@ -2592,10 +2905,10 @@ static void hv_crash_handler(struct pt_regs *regs) */ cpu = smp_processor_id(); hv_stimer_cleanup(cpu); - hv_synic_disable_regs(cpu); + hv_hyp_synic_disable_regs(cpu); }; -static int hv_synic_suspend(void) +static int hv_synic_suspend(void *data) { /* * When we reach here, all the non-boot CPUs have been offlined. @@ -2617,14 +2930,14 @@ static int hv_synic_suspend(void) * interrupts-disabled context. */ - hv_synic_disable_regs(0); + hv_hyp_synic_disable_regs(0); return 0; } -static void hv_synic_resume(void) +static void hv_synic_resume(void *data) { - hv_synic_enable_regs(0); + hv_hyp_synic_enable_regs(0); /* * Note: we don't need to call hv_stimer_init(0), because the timer @@ -2634,11 +2947,15 @@ static void hv_synic_resume(void) } /* The callbacks run only on CPU0, with irqs_disabled. */ -static struct syscore_ops hv_synic_syscore_ops = { +static const struct syscore_ops hv_synic_syscore_ops = { .suspend = hv_synic_suspend, .resume = hv_synic_resume, }; +static struct syscore hv_synic_syscore = { + .ops = &hv_synic_syscore_ops, +}; + static int __init hv_acpi_init(void) { int ret; @@ -2646,7 +2963,7 @@ static int __init hv_acpi_init(void) if (!hv_is_hyperv_initialized()) return -ENODEV; - if (hv_root_partition && !hv_nested) + if (hv_root_partition() && !hv_nested) return 0; /* @@ -2656,7 +2973,7 @@ static int __init hv_acpi_init(void) if (ret) return ret; - if (!hv_dev) { + if (!vmbus_root_device) { ret = -ENODEV; goto cleanup; } @@ -2681,13 +2998,13 @@ static int __init hv_acpi_init(void) hv_setup_kexec_handler(hv_kexec_handler); hv_setup_crash_handler(hv_crash_handler); - register_syscore_ops(&hv_synic_syscore_ops); + register_syscore(&hv_synic_syscore); return 0; cleanup: platform_driver_unregister(&vmbus_platform_driver); - hv_dev = NULL; + vmbus_root_device = NULL; return ret; } @@ -2695,18 +3012,20 @@ static void __exit vmbus_exit(void) { int cpu; - unregister_syscore_ops(&hv_synic_syscore_ops); + unregister_syscore(&hv_synic_syscore); hv_remove_kexec_handler(); hv_remove_crash_handler(); vmbus_connection.conn_state = DISCONNECTED; hv_stimer_global_cleanup(); vmbus_disconnect(); - if (vmbus_irq == -1) { + if (vmbus_irq == -1) hv_remove_vmbus_handler(); - } else { - free_percpu_irq(vmbus_irq, vmbus_evt); - free_percpu(vmbus_evt); + else + free_percpu_irq(vmbus_irq, &vmbus_evt); + if (IS_ENABLED(CONFIG_PREEMPT_RT) && vmbus_irq_initialized) { + smpboot_unregister_percpu_thread(&vmbus_irq_threads); + vmbus_irq_initialized = false; } for_each_online_cpu(cpu) { struct hv_per_cpu_context *hv_cpu |
