From 6fef518594bcb7e374f809717281bd02894929f8 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Thu, 25 Apr 2024 15:07:01 -0700 Subject: KVM: x86: Add a capability to configure bus frequency for APIC timer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add KVM_CAP_X86_APIC_BUS_CYCLES_NS capability to configure the APIC bus clock frequency for APIC timer emulation. Allow KVM_ENABLE_CAPABILITY(KVM_CAP_X86_APIC_BUS_CYCLES_NS) to set the frequency in nanoseconds. When using this capability, the user space VMM should configure CPUID leaf 0x15 to advertise the frequency. Vishal reported that the TDX guest kernel expects a 25MHz APIC bus frequency but ends up getting interrupts at a significantly higher rate. The TDX architecture hard-codes the core crystal clock frequency to 25MHz and mandates exposing it via CPUID leaf 0x15. The TDX architecture does not allow the VMM to override the value. In addition, per Intel SDM: "The APIC timer frequency will be the processor’s bus clock or core crystal clock frequency (when TSC/core crystal clock ratio is enumerated in CPUID leaf 0x15) divided by the value specified in the divide configuration register." The resulting 25MHz APIC bus frequency conflicts with the KVM hardcoded APIC bus frequency of 1GHz. The KVM doesn't enumerate CPUID leaf 0x15 to the guest unless the user space VMM sets it using KVM_SET_CPUID. If the CPUID leaf 0x15 is enumerated, the guest kernel uses it as the APIC bus frequency. If not, the guest kernel measures the frequency based on other known timers like the ACPI timer or the legacy PIT. As reported by Vishal the TDX guest kernel expects a 25MHz timer frequency but gets timer interrupt more frequently due to the 1GHz frequency used by KVM. To ensure that the guest doesn't have a conflicting view of the APIC bus frequency, allow the userspace to tell KVM to use the same frequency that TDX mandates instead of the default 1Ghz. Reported-by: Vishal Annapurve Closes: https://lore.kernel.org/lkml/20231006011255.4163884-1-vannapurve@google.com Signed-off-by: Isaku Yamahata Reviewed-by: Rick Edgecombe Co-developed-by: Reinette Chatre Signed-off-by: Reinette Chatre Reviewed-by: Xiaoyao Li Reviewed-by: Yuan Yao Link: https://lore.kernel.org/r/6748a4c12269e756f0c48680da8ccc5367c31ce7.1714081726.git.reinette.chatre@intel.com Signed-off-by: Sean Christopherson --- include/uapi/linux/kvm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index d03842abae57..ec998e6b6555 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -917,6 +917,7 @@ struct kvm_enable_cap { #define KVM_CAP_MEMORY_ATTRIBUTES 233 #define KVM_CAP_GUEST_MEMFD 234 #define KVM_CAP_VM_TYPES 235 +#define KVM_CAP_X86_APIC_BUS_CYCLES_NS 236 struct kvm_irq_routing_irqchip { __u32 irqchip; -- cgit v1.2.3 From 85542adb65ecd7cc0e442e8befef74f2ed07f5f6 Mon Sep 17 00:00:00 2001 From: Thomas Prescher Date: Wed, 8 May 2024 15:25:01 +0200 Subject: KVM: x86: Add KVM_RUN_X86_GUEST_MODE kvm_run flag When a vCPU is interrupted by a signal while running a nested guest, KVM will exit to userspace with L2 state. However, userspace has no way to know whether it sees L1 or L2 state (besides calling KVM_GET_STATS_FD, which does not have a stable ABI). This causes multiple problems: The simplest one is L2 state corruption when userspace marks the sregs as dirty. See this mailing list thread [1] for a complete discussion. Another problem is that if userspace decides to continue by emulating instructions, it will unknowingly emulate with L2 state as if L1 doesn't exist, which can be considered a weird guest escape. Introduce a new flag, KVM_RUN_X86_GUEST_MODE, in the kvm_run data structure, which is set when the vCPU exited while running a nested guest. Also introduce a new capability, KVM_CAP_X86_GUEST_MODE, to advertise the functionality to userspace. [1] https://lore.kernel.org/kvm/20240416123558.212040-1-julian.stecklina@cyberus-technology.de/T/#m280aadcb2e10ae02c191a7dc4ed4b711a74b1f55 Signed-off-by: Thomas Prescher Signed-off-by: Julian Stecklina Link: https://lore.kernel.org/r/20240508132502.184428-1-julian.stecklina@cyberus-technology.de Signed-off-by: Sean Christopherson --- Documentation/virt/kvm/api.rst | 17 +++++++++++++++++ arch/x86/include/uapi/asm/kvm.h | 1 + arch/x86/kvm/x86.c | 3 +++ include/uapi/linux/kvm.h | 1 + 4 files changed, 22 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 67edb84317ef..42d1d9518bf2 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6419,6 +6419,9 @@ affect the device's behavior. Current defined flags:: #define KVM_RUN_X86_SMM (1 << 0) /* x86, set if bus lock detected in VM */ #define KVM_RUN_X86_BUS_LOCK (1 << 1) + /* x86, set if the VCPU is executing a nested (L2) guest */ + #define KVM_RUN_X86_GUEST_MODE (1 << 2) + /* arm64, set for KVM_EXIT_DEBUG */ #define KVM_DEBUG_ARCH_HSR_HIGH_VALID (1 << 0) @@ -8089,6 +8092,20 @@ by KVM_CHECK_EXTENSION. Note: Userspace is responsible for correctly configuring CPUID 0x15, a.k.a. the core crystal clock frequency, if a non-zero CPUID 0x15 is exposed to the guest. +7.36 KVM_CAP_X86_GUEST_MODE +------------------------------ + +:Architectures: x86 +:Returns: Informational only, -EINVAL on direct KVM_ENABLE_CAP. + +The presence of this capability indicates that KVM_RUN will update the +KVM_RUN_X86_GUEST_MODE bit in kvm_run.flags to indicate whether the +vCPU was executing nested guest code when it exited. + +KVM exits with the register state of either the L1 or L2 guest +depending on which executed at the time of an exit. Userspace must +take care to differentiate between these cases. + 8. Other capabilities. ====================== diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 9fae1b73b529..b85671d9c8aa 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -106,6 +106,7 @@ struct kvm_ioapic_state { #define KVM_RUN_X86_SMM (1 << 0) #define KVM_RUN_X86_BUS_LOCK (1 << 1) +#define KVM_RUN_X86_GUEST_MODE (1 << 2) /* for KVM_GET_REGS and KVM_SET_REGS */ struct kvm_regs { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bfe3dba56e24..33e41103fcde 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4704,6 +4704,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES: case KVM_CAP_IRQFD_RESAMPLE: case KVM_CAP_MEMORY_FAULT_INFO: + case KVM_CAP_X86_GUEST_MODE: r = 1; break; case KVM_CAP_X86_APIC_BUS_CYCLES_NS: @@ -10277,6 +10278,8 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu) if (is_smm(vcpu)) kvm_run->flags |= KVM_RUN_X86_SMM; + if (is_guest_mode(vcpu)) + kvm_run->flags |= KVM_RUN_X86_GUEST_MODE; } static void update_cr8_intercept(struct kvm_vcpu *vcpu) diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index ec998e6b6555..a6ac00ec77ad 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -918,6 +918,7 @@ struct kvm_enable_cap { #define KVM_CAP_GUEST_MEMFD 234 #define KVM_CAP_VM_TYPES 235 #define KVM_CAP_X86_APIC_BUS_CYCLES_NS 236 +#define KVM_CAP_X86_GUEST_MODE 237 struct kvm_irq_routing_irqchip { __u32 irqchip; -- cgit v1.2.3